xref: /freebsd/sys/dev/hyperv/hvsock/hv_sock.c (revision 13ec1e3155c7e9bf037b12af186351b7fa9b9450)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Microsoft Corp.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/bus.h>
34 #include <sys/domain.h>
35 #include <sys/lock.h>
36 #include <sys/kernel.h>
37 #include <sys/types.h>
38 #include <sys/malloc.h>
39 #include <sys/module.h>
40 #include <sys/mutex.h>
41 #include <sys/proc.h>
42 #include <sys/protosw.h>
43 #include <sys/socket.h>
44 #include <sys/sysctl.h>
45 #include <sys/sysproto.h>
46 #include <sys/systm.h>
47 #include <sys/sockbuf.h>
48 #include <sys/sx.h>
49 #include <sys/uio.h>
50 
51 #include <net/vnet.h>
52 
53 #include <dev/hyperv/vmbus/vmbus_reg.h>
54 
55 #include "hv_sock.h"
56 
/*
 * Debug verbosity levels.  A message tagged with a given level is printed
 * by HVSOCK_DBG() when hvs_dbg_level is greater than or equal to that level.
 */
#define HVSOCK_DBG_NONE			0x0
#define HVSOCK_DBG_INFO			0x1
#define HVSOCK_DBG_ERR			0x2
#define HVSOCK_DBG_VERBOSE		0x3


/* Sysctl node "hvsock" under _net; parent of the debug level knob below. */
SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket");

/* Current debug level, adjustable through the hvs_dbg_level sysctl. */
static int hvs_dbg_level;
SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level,
    0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose");


/*
 * printf()-style debug output, emitted only when hvs_dbg_level >= level.
 * The arguments are not evaluated when the message is suppressed.
 * Wrapped in do { } while (0) so that it behaves as a single statement.
 */
#define HVSOCK_DBG(level, ...) do {					\
	if (hvs_dbg_level >= (level))					\
		printf(__VA_ARGS__);					\
	} while (0)

/* malloc type used for the allocations made in this file (e.g. the pcb). */
MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures");
76 
/* Domain probe routine; referenced by hv_socket_domain, defined further down. */
static int hvs_dom_probe(void);

/* The MTU is 16KB per host side's design */
#define HVSOCK_MTU_SIZE		(1024 * 16)
/* Cap applied to each chunk that hvs_trans_sosend() hands to hvsock_send_data(). */
#define HVSOCK_SEND_BUF_SZ	(PAGE_SIZE - sizeof(struct vmpipe_proto_header))

/* Size of the per-packet header (struct hvs_pkt_header). */
#define HVSOCK_HEADER_LEN	(sizeof(struct hvs_pkt_header))

/*
 * Length accounted for a packet with 'payload_len' bytes of payload:
 * the header, the payload rounded up to a multiple of 8, and one
 * additional uint64_t.
 */
#define HVSOCK_PKT_LEN(payload_len)	(HVSOCK_HEADER_LEN + \
					 roundup2(payload_len, 8) + \
					 sizeof(uint64_t))


/* Tentative definition so the protosw table can point at the domain. */
static struct domain		hv_socket_domain;
91 
/*
 * HyperV Transport sockets
 *
 * User-request dispatch table: maps each socket-layer operation to the
 * hvs_trans_* handler that implements it.  Send and receive are routed to
 * the custom hvs_trans_sosend()/hvs_trans_soreceive() routines.
 */
static struct pr_usrreqs	hvs_trans_usrreqs = {
	.pru_attach =		hvs_trans_attach,
	.pru_bind =		hvs_trans_bind,
	.pru_listen =		hvs_trans_listen,
	.pru_accept =		hvs_trans_accept,
	.pru_connect =		hvs_trans_connect,
	.pru_peeraddr =		hvs_trans_peeraddr,
	.pru_sockaddr =		hvs_trans_sockaddr,
	.pru_soreceive =	hvs_trans_soreceive,
	.pru_sosend =		hvs_trans_sosend,
	.pru_disconnect =	hvs_trans_disconnect,
	.pru_close =		hvs_trans_close,
	.pru_detach =		hvs_trans_detach,
	.pru_shutdown =		hvs_trans_shutdown,
	.pru_abort =		hvs_trans_abort,
};
111 
/*
 * Definitions of protocols supported in HyperV socket domain
 *
 * A single entry: a connection-required SOCK_STREAM protocol
 * (HYPERV_SOCK_PROTO_TRANS) served by hvs_trans_usrreqs.
 */
static struct protosw		hv_socket_protosw[] = {
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&hv_socket_domain,
	.pr_protocol =		HYPERV_SOCK_PROTO_TRANS,
	.pr_flags =		PR_CONNREQUIRED,
	.pr_usrreqs =		&hvs_trans_usrreqs,
},
};
124 
/*
 * The AF_HYPERV domain.  hvs_dom_probe() decides whether the domain is
 * usable on the running system; the protosw range covers every entry of
 * hv_socket_protosw (the end pointer is one past the last element).
 */
static struct domain		hv_socket_domain = {
	.dom_family =		AF_HYPERV,
	.dom_name =		"hyperv",
	.dom_probe =		hvs_dom_probe,
	.dom_protosw =		hv_socket_protosw,
	.dom_protoswNPROTOSW =	&hv_socket_protosw[nitems(hv_socket_protosw)]
};

/* Register the domain defined above (symbol prefix "hv_socket_"). */
DOMAIN_SET(hv_socket_);
134 
/* Bounds of the 32-bit port space used by the auto-bind scan in connect(). */
#define MAX_PORT			((uint32_t)0xFFFFFFFF)
#define MIN_PORT			((uint32_t)0x0)

/* 00000000-facb-11e6-bd58-64006a7986d3 */
/*
 * Template service GUID.  The first four bytes are the port placeholder
 * (overwritten by set_port_by_srv_id()); the remaining twelve bytes are
 * the fixed part that is_valid_srv_id() compares against.
 */
static const struct hyperv_guid srv_id_template = {
	.hv_guid = {
	    0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11,
	    0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 }
};
144 
/* Forward declarations of helpers defined later in this file. */
static int		hvsock_br_callback(void *, int, void *);
static uint32_t		hvsock_canread_check(struct hvs_pcb *);
static uint32_t		hvsock_canwrite_check(struct hvs_pcb *);
static int		hvsock_send_data(struct vmbus_channel *chan,
    struct uio *uio, uint32_t to_write, struct sockbuf *sb);



/* Globals */
/* Exclusive lock taken by hvs_trans_lock() / released by hvs_trans_unlock(). */
static struct sx		hvs_trans_socks_sx;
/* Mutex held around accesses to the two pcb lists below. */
static struct mtx		hvs_trans_socks_mtx;
/* pcbs linked through bound_next (HVS_LIST_BOUND). */
static LIST_HEAD(, hvs_pcb)	hvs_trans_bound_socks;
/* pcbs linked through connected_next (HVS_LIST_CONNECTED). */
static LIST_HEAD(, hvs_pcb)	hvs_trans_connected_socks;
/* Last local port picked by the auto-bind scan in hvs_trans_connect(). */
static uint32_t			previous_auto_bound_port;
159 
160 static void
161 hvsock_print_guid(struct hyperv_guid *guid)
162 {
163 	unsigned char *p = (unsigned char *)guid;
164 
165 	HVSOCK_DBG(HVSOCK_DBG_INFO,
166 	    "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n",
167 	    *(unsigned int *)p,
168 	    *((unsigned short *) &p[4]),
169 	    *((unsigned short *) &p[6]),
170 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
171 }
172 
173 static bool
174 is_valid_srv_id(const struct hyperv_guid *id)
175 {
176 	return !memcmp(&id->hv_guid[4],
177 	    &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4);
178 }
179 
/*
 * Return the port stored in the leading sizeof(unsigned int) bytes of a
 * service GUID, interpreted in host byte order.
 */
static unsigned int
get_port_by_srv_id(const struct hyperv_guid *srv_id)
{
	unsigned int port;

	/*
	 * Copy the bytes out rather than dereferencing the GUID through an
	 * unsigned int pointer: the GUID is handled as a byte array elsewhere
	 * in this file, so no int alignment should be assumed.
	 */
	memcpy(&port, srv_id, sizeof(port));

	return (port);
}
185 
/*
 * Store 'port' into the leading sizeof(unsigned int) bytes of a service
 * GUID, in host byte order.  The remaining bytes are left untouched.
 */
static void
set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port)
{
	/*
	 * Copy the bytes in rather than storing through an unsigned int
	 * pointer, so the write does not depend on the GUID's alignment.
	 */
	memcpy(srv_id, &port, sizeof(port));
}
191 
192 
193 static void
194 __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list)
195 {
196 	struct hvs_pcb *p = NULL;
197 
198 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
199 
200 	if (!pcb)
201 		return;
202 
203 	if (list & HVS_LIST_BOUND) {
204 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
205 			if  (p == pcb)
206 				LIST_REMOVE(p, bound_next);
207 	}
208 
209 	if (list & HVS_LIST_CONNECTED) {
210 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
211 			if (p == pcb)
212 				LIST_REMOVE(pcb, connected_next);
213 	}
214 }
215 
216 static void
217 __hvs_remove_socket_from_list(struct socket *so, unsigned char list)
218 {
219 	struct hvs_pcb *pcb = so2hvspcb(so);
220 
221 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
222 
223 	__hvs_remove_pcb_from_list(pcb, list);
224 }
225 
226 static void
227 __hvs_insert_socket_on_list(struct socket *so, unsigned char list)
228 {
229 	struct hvs_pcb *pcb = so2hvspcb(so);
230 
231 	if (list & HVS_LIST_BOUND)
232 		LIST_INSERT_HEAD(&hvs_trans_bound_socks,
233 		   pcb, bound_next);
234 
235 	if (list & HVS_LIST_CONNECTED)
236 		LIST_INSERT_HEAD(&hvs_trans_connected_socks,
237 		   pcb, connected_next);
238 }
239 
240 void
241 hvs_remove_socket_from_list(struct socket *so, unsigned char list)
242 {
243 	if (!so || !so->so_pcb) {
244 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
245 		    "%s: socket or so_pcb is null\n", __func__);
246 		return;
247 	}
248 
249 	mtx_lock(&hvs_trans_socks_mtx);
250 	__hvs_remove_socket_from_list(so, list);
251 	mtx_unlock(&hvs_trans_socks_mtx);
252 }
253 
254 static void
255 hvs_insert_socket_on_list(struct socket *so, unsigned char list)
256 {
257 	if (!so || !so->so_pcb) {
258 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
259 		    "%s: socket or so_pcb is null\n", __func__);
260 		return;
261 	}
262 
263 	mtx_lock(&hvs_trans_socks_mtx);
264 	__hvs_insert_socket_on_list(so, list);
265 	mtx_unlock(&hvs_trans_socks_mtx);
266 }
267 
268 static struct socket *
269 __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
270 {
271 	struct hvs_pcb *p = NULL;
272 
273 	if (list & HVS_LIST_BOUND)
274 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
275 			if (p->so != NULL &&
276 			    addr->hvs_port == p->local_addr.hvs_port)
277 				return p->so;
278 
279 	if (list & HVS_LIST_CONNECTED)
280 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
281 			if (p->so != NULL &&
282 			    addr->hvs_port == p->local_addr.hvs_port)
283 				return p->so;
284 
285 	return NULL;
286 }
287 
288 static struct socket *
289 hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
290 {
291 	struct socket *s = NULL;
292 
293 	mtx_lock(&hvs_trans_socks_mtx);
294 	s = __hvs_find_socket_on_list(addr, list);
295 	mtx_unlock(&hvs_trans_socks_mtx);
296 
297 	return s;
298 }
299 
300 static inline void
301 hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port)
302 {
303 	memset(addr, 0, sizeof(*addr));
304 	addr->sa_family = AF_HYPERV;
305 	addr->sa_len = sizeof(*addr);
306 	addr->hvs_port = port;
307 }
308 
/*
 * Initialize '*addr' as an AF_HYPERV address whose port is the one
 * encoded in the service GUID 'svr_id'.
 */
void
hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id)
{
	unsigned int port;

	port = get_port_by_srv_id(svr_id);
	hvs_addr_set(addr, port);
}
314 
315 int
316 hvs_trans_lock(void)
317 {
318 	sx_xlock(&hvs_trans_socks_sx);
319 	return (0);
320 }
321 
322 void
323 hvs_trans_unlock(void)
324 {
325 	sx_xunlock(&hvs_trans_socks_sx);
326 }
327 
328 static int
329 hvs_dom_probe(void)
330 {
331 
332 	/* Don't even give us a chance to attach on non-HyperV. */
333 	if (vm_guest != VM_GUEST_HV)
334 		return (ENXIO);
335 	return (0);
336 }
337 
338 static void
339 hvs_trans_init(void *arg __unused)
340 {
341 
342 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
343 	    "%s: HyperV Socket hvs_trans_init called\n", __func__);
344 
345 	/* Initialize Globals */
346 	previous_auto_bound_port = MAX_PORT;
347 	sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx");
348 	mtx_init(&hvs_trans_socks_mtx,
349 	    "hvs_trans_socks_mtx", NULL, MTX_DEF);
350 	LIST_INIT(&hvs_trans_bound_socks);
351 	LIST_INIT(&hvs_trans_connected_socks);
352 }
353 SYSINIT(hvs_trans_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
354     hvs_trans_init, NULL);
355 
356 /*
357  * Called in two cases:
358  * 1) When user calls socket();
359  * 2) When we accept new incoming conneciton and call sonewconn().
360  */
361 int
362 hvs_trans_attach(struct socket *so, int proto, struct thread *td)
363 {
364 	struct hvs_pcb *pcb = so2hvspcb(so);
365 
366 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
367 	    "%s: HyperV Socket hvs_trans_attach called\n", __func__);
368 
369 	if (so->so_type != SOCK_STREAM)
370 		return (ESOCKTNOSUPPORT);
371 
372 	if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS)
373 		return (EPROTONOSUPPORT);
374 
375 	if (pcb != NULL)
376 		return (EISCONN);
377 	pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO);
378 	if (pcb == NULL)
379 		return (ENOMEM);
380 
381 	pcb->so = so;
382 	so->so_pcb = (void *)pcb;
383 
384 	return (0);
385 }
386 
387 void
388 hvs_trans_detach(struct socket *so)
389 {
390 	struct hvs_pcb *pcb;
391 
392 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
393 	    "%s: HyperV Socket hvs_trans_detach called\n", __func__);
394 
395 	(void) hvs_trans_lock();
396 	pcb = so2hvspcb(so);
397 	if (pcb == NULL) {
398 		hvs_trans_unlock();
399 		return;
400 	}
401 
402 	if (SOLISTENING(so)) {
403 		bzero(pcb, sizeof(*pcb));
404 		free(pcb, M_HVSOCK);
405 	}
406 
407 	so->so_pcb = NULL;
408 
409 	hvs_trans_unlock();
410 }
411 
412 int
413 hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td)
414 {
415 	struct hvs_pcb *pcb = so2hvspcb(so);
416 	struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr;
417 	int error = 0;
418 
419 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
420 	    "%s: HyperV Socket hvs_trans_bind called\n", __func__);
421 
422 	if (sa == NULL) {
423 		return (EINVAL);
424 	}
425 
426 	if (pcb == NULL) {
427 		return (EINVAL);
428 	}
429 
430 	if (sa->sa_family != AF_HYPERV) {
431 		HVSOCK_DBG(HVSOCK_DBG_ERR,
432 		    "%s: Not supported, sa_family is %u\n",
433 		    __func__, sa->sa_family);
434 		return (EAFNOSUPPORT);
435 	}
436 	if (sa->sa_len != sizeof(*sa)) {
437 		HVSOCK_DBG(HVSOCK_DBG_ERR,
438 		    "%s: Not supported, sa_len is %u\n",
439 		    __func__, sa->sa_len);
440 		return (EINVAL);
441 	}
442 
443 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
444 	    "%s: binding port = 0x%x\n", __func__, sa->hvs_port);
445 
446 	mtx_lock(&hvs_trans_socks_mtx);
447 	if (__hvs_find_socket_on_list(sa,
448 	    HVS_LIST_BOUND | HVS_LIST_CONNECTED)) {
449 		error = EADDRINUSE;
450 	} else {
451 		/*
452 		 * The address is available for us to bind.
453 		 * Add socket to the bound list.
454 		 */
455 		hvs_addr_set(&pcb->local_addr, sa->hvs_port);
456 		hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY);
457 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
458 	}
459 	mtx_unlock(&hvs_trans_socks_mtx);
460 
461 	return (error);
462 }
463 
464 int
465 hvs_trans_listen(struct socket *so, int backlog, struct thread *td)
466 {
467 	struct hvs_pcb *pcb = so2hvspcb(so);
468 	struct socket *bound_so;
469 	int error;
470 
471 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
472 	    "%s: HyperV Socket hvs_trans_listen called\n", __func__);
473 
474 	if (pcb == NULL)
475 		return (EINVAL);
476 
477 	/* Check if the address is already bound and it was by us. */
478 	bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND);
479 	if (bound_so == NULL || bound_so != so) {
480 		HVSOCK_DBG(HVSOCK_DBG_ERR,
481 		    "%s: Address not bound or not by us.\n", __func__);
482 		return (EADDRNOTAVAIL);
483 	}
484 
485 	SOCK_LOCK(so);
486 	error = solisten_proto_check(so);
487 	if (error == 0)
488 		solisten_proto(so, backlog);
489 	SOCK_UNLOCK(so);
490 
491 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
492 	    "%s: HyperV Socket listen error = %d\n", __func__, error);
493 	return (error);
494 }
495 
496 int
497 hvs_trans_accept(struct socket *so, struct sockaddr **nam)
498 {
499 	struct hvs_pcb *pcb = so2hvspcb(so);
500 
501 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
502 	    "%s: HyperV Socket hvs_trans_accept called\n", __func__);
503 
504 	if (pcb == NULL)
505 		return (EINVAL);
506 
507 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr,
508 	    M_NOWAIT);
509 
510 	return ((*nam == NULL) ? ENOMEM : 0);
511 }
512 
513 int
514 hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
515 {
516 	struct hvs_pcb *pcb = so2hvspcb(so);
517 	struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam;
518 	bool found_auto_bound_port = false;
519 	int i, error = 0;
520 
521 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
522 	    "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n",
523 	    __func__, raddr->hvs_port);
524 
525 	if (pcb == NULL)
526 		return (EINVAL);
527 
528 	/* Verify the remote address */
529 	if (raddr == NULL)
530 		return (EINVAL);
531 	if (raddr->sa_family != AF_HYPERV)
532 		return (EAFNOSUPPORT);
533 	if (raddr->sa_len != sizeof(*raddr))
534 		return (EINVAL);
535 
536 	mtx_lock(&hvs_trans_socks_mtx);
537 	if (so->so_state &
538 	    (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) {
539 			HVSOCK_DBG(HVSOCK_DBG_ERR,
540 			    "%s: socket connect in progress\n",
541 			    __func__);
542 			error = EINPROGRESS;
543 			goto out;
544 	}
545 
546 	/*
547 	 * Find an available port for us to auto bind the local
548 	 * address.
549 	 */
550 	hvs_addr_set(&pcb->local_addr, 0);
551 
552 	for (i = previous_auto_bound_port - 1;
553 	    i != previous_auto_bound_port; i --) {
554 		if (i == MIN_PORT)
555 			i = MAX_PORT;
556 
557 		pcb->local_addr.hvs_port = i;
558 
559 		if (__hvs_find_socket_on_list(&pcb->local_addr,
560 		    HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) {
561 			found_auto_bound_port = true;
562 			previous_auto_bound_port = i;
563 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
564 			    "%s: found local bound port is %x\n",
565 			    __func__, pcb->local_addr.hvs_port);
566 			break;
567 		}
568 	}
569 
570 	if (found_auto_bound_port == true) {
571 		/* Found available port for auto bound, put on list */
572 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
573 		/* Set VM service ID */
574 		pcb->vm_srv_id = srv_id_template;
575 		set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port);
576 		/* Set host service ID and remote port */
577 		pcb->host_srv_id = srv_id_template;
578 		set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port);
579 		hvs_addr_set(&pcb->remote_addr, raddr->hvs_port);
580 
581 		/* Change the socket state to SS_ISCONNECTING */
582 		soisconnecting(so);
583 	} else {
584 		HVSOCK_DBG(HVSOCK_DBG_ERR,
585 		    "%s: No local port available for auto bound\n",
586 		    __func__);
587 		error = EADDRINUSE;
588 	}
589 
590 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is ");
591 	hvsock_print_guid(&pcb->vm_srv_id);
592 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is ");
593 	hvsock_print_guid(&pcb->host_srv_id);
594 
595 out:
596 	mtx_unlock(&hvs_trans_socks_mtx);
597 
598 	if (found_auto_bound_port == true)
599 		 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id);
600 
601 	return (error);
602 }
603 
604 int
605 hvs_trans_disconnect(struct socket *so)
606 {
607 	struct hvs_pcb *pcb;
608 
609 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
610 	    "%s: HyperV Socket hvs_trans_disconnect called\n", __func__);
611 
612 	(void) hvs_trans_lock();
613 	pcb = so2hvspcb(so);
614 	if (pcb == NULL) {
615 		hvs_trans_unlock();
616 		return (EINVAL);
617 	}
618 
619 	/* If socket is already disconnected, skip this */
620 	if ((so->so_state & SS_ISDISCONNECTED) == 0)
621 		soisdisconnecting(so);
622 
623 	hvs_trans_unlock();
624 
625 	return (0);
626 }
627 
/*
 * Context handed to hvsock_br_callback() through the opaque callback
 * argument of vmbus_chan_recv_peek_call().
 */
struct hvs_callback_arg {
	struct uio *uio;	/* destination of the copy done by the callback */
	struct sockbuf *sb;	/* sockbuf unlocked by the callback before it copies; may be NULL */
};
632 
/*
 * Receive routine for HyperV stream sockets.
 *
 * Data is copied from the channel into 'uio' through
 * vmbus_chan_recv_peek_call(), with hvsock_br_callback() doing the copy.
 * 'paddr', 'mp0' and 'controlp' are not used by this routine.
 *
 * Returns EINVAL for a non-stream socket, a missing pcb, an empty uio or
 * a uio that is not a read request; EOPNOTSUPP for MSG_PEEK; EPIPE when
 * the receive side is shut and the socket is already disconnected; EAGAIN
 * for a non-blocking call that transferred nothing; otherwise the error
 * reported by the I/O lock, the channel read, sbwait() or so_error (an
 * ESHUTDOWN so_error is not returned to the caller), or 0.
 */
int
hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct sockbuf *sb;
	ssize_t orig_resid;
	uint32_t canread, to_read;
	int flags, error = 0;
	struct hvs_callback_arg cbarg;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_soreceive called\n", __func__);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (pcb == NULL)
		return (EINVAL);

	/* MSG_EOR is masked out of the caller's flags. */
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if (flags & MSG_PEEK)
		return (EOPNOTSUPP);

	/* If no space to copy out anything */
	if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ)
		return (EINVAL);

	/* Remembered so that "did we read anything?" can be answered later. */
	orig_resid = uio->uio_resid;

	/* Prevent other readers from entering the socket. */
	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (error) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: soiolock returned error = %d\n", __func__, error);
		return (error);
	}

	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	cbarg.uio = uio;
	cbarg.sb = sb;
	/*
	 * If the socket is closing, there might still be some data
	 * in rx br to read. However we need to make sure
	 * the channel is still open.
	 */
	if ((sb->sb_state & SBS_CANTRCVMORE) &&
	    (so->so_state & SS_ISDISCONNECTED)) {
		/* Other thread already closed the channel */
		error = EPIPE;
		goto out;
	}

	while (true) {
		/* Drain whatever is currently readable, up to the uio space. */
		while (uio->uio_resid > 0 &&
		    (canread = hvsock_canread_check(pcb)) > 0) {
			to_read = MIN(canread, uio->uio_resid);
			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
			    "%s: to_read = %u, skip = %u\n", __func__, to_read,
			    (unsigned int)(sizeof(struct hvs_pkt_header) +
			    pcb->recv_data_off));

			/*
			 * Skip the packet header plus the part of the
			 * payload that was already consumed.
			 */
			error = vmbus_chan_recv_peek_call(pcb->chan, to_read,
			    sizeof(struct hvs_pkt_header) + pcb->recv_data_off,
			    hvsock_br_callback, (void *)&cbarg);
			/*
			 * It is possible socket is disconnected because
			 * we released lock in hvsock_br_callback. So we
			 * need to check the state to make sure it is not
			 * disconnected.
			 */
			if (error || so->so_state & SS_ISDISCONNECTED) {
				break;
			}

			/* Account for the bytes just handed to the caller. */
			pcb->recv_data_len -= to_read;
			pcb->recv_data_off += to_read;
		}

		if (error)
			break;

		/* Abort if socket has reported problems. */
		if (so->so_error) {
			if (so->so_error == ESHUTDOWN &&
			    orig_resid > uio->uio_resid) {
				/*
				 * Although we got a FIN, we also received
				 * some data in this round. Deliver it
				 * to user.
				 */
				error = 0;
			} else {
				if (so->so_error != ESHUTDOWN)
					error = so->so_error;
			}

			break;
		}

		/* Cannot receive more. */
		if (sb->sb_state & SBS_CANTRCVMORE)
			break;

		/* We are done if buffer has been filled */
		if (uio->uio_resid == 0)
			break;

		/* Without MSG_WAITALL a partial read is good enough. */
		if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid)
			break;

		/* Buffer ring is empty and we shall not block */
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			if (orig_resid == uio->uio_resid) {
				/* We have not read anything */
				error = EAGAIN;
			}
			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
			    "%s: non blocked read return, error %d.\n",
			    __func__, error);
			break;
		}

		/*
		 * Wait and block until (more) data comes in.
		 * Note: Drops the sockbuf lock during wait.
		 */
		error = sbwait(sb);

		if (error)
			break;

		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: wake up from sbwait, read available is %u\n",
		    __func__, vmbus_chan_read_available(pcb->chan));
	}

out:
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_RECV_UNLOCK(so);

	/* We received a FIN in this call */
	if (so->so_error == ESHUTDOWN) {
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			/* Send has already closed */
			soisdisconnecting(so);
		} else {
			/* Just close the receive side */
			socantrcvmore(so);
		}
	}

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: returning error = %d, so_error = %d\n",
	    __func__, error, so->so_error);

	return (error);
}
797 
/*
 * Send routine for HyperV stream sockets.
 *
 * Data is taken from 'uio' and written to the channel in chunks through
 * hvsock_send_data(); each chunk is limited by the space reported by
 * hvsock_canwrite_check(), by the remaining uio data and by
 * HVSOCK_SEND_BUF_SZ.  'addr', 'top', 'controlp' and 'td' are not used by
 * this routine.
 *
 * Returns EINVAL for a non-stream socket, a missing pcb, an empty uio or
 * a uio that is not a write request; EPIPE when the send side is shut or
 * so_error is ESHUTDOWN; EWOULDBLOCK for a non-blocking call that could
 * not send anything; otherwise the error from the I/O lock, sbwait() or
 * hvsock_send_data(), or 0.  A partial send returns 0 once the ring has
 * no more room.
 */
int
hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct sockbuf *sb;
	ssize_t orig_resid;
	uint32_t canwrite, to_write;
	int error = 0;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n",
	    __func__, uio->uio_resid);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (pcb == NULL)
		return (EINVAL);

	/* If nothing to send */
	if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE)
		return (EINVAL);

	/* Remembered so that "did we send anything?" can be answered later. */
	orig_resid = uio->uio_resid;

	/* Prevent other writers from entering the socket. */
	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: soiolocak returned error = %d\n", __func__, error);
		return (error);
	}

	sb = &so->so_snd;
	SOCKBUF_LOCK(sb);

	if ((sb->sb_state & SBS_CANTSENDMORE) ||
	    so->so_error == ESHUTDOWN) {
		error = EPIPE;
		goto out;
	}

	while (uio->uio_resid > 0) {
		canwrite = hvsock_canwrite_check(pcb);
		if (canwrite == 0) {
			/* We have sent some data */
			if (orig_resid > uio->uio_resid)
				break;
			/*
			 * We have not sent any data and it is
			 * non-blocked io
			 */
			if (so->so_state & SS_NBIO ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				error = EWOULDBLOCK;
				break;
			} else {
				/*
				 * We are here because there is no space on
				 * send buffer ring. Signal the other side
				 * to read and free more space.
				 * Sleep wait until space available to send
				 * Note: Drops the sockbuf lock during wait.
				 */
				error = sbwait(sb);

				if (error)
					break;

				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
				    "%s: wake up from sbwait, space avail on "
				    "tx ring is %u\n",
				    __func__,
				    vmbus_chan_write_available(pcb->chan));

				continue;
			}
		}
		/* Clamp the chunk to ring space, remaining data and buffer cap. */
		to_write = MIN(canwrite, uio->uio_resid);
		to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ);

		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: canwrite is %u, to_write = %u\n", __func__,
		    canwrite, to_write);
		error = hvsock_send_data(pcb->chan, uio, to_write, sb);

		if (error)
			break;
	}

out:
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_SEND_UNLOCK(so);

	return (error);
}
894 
895 int
896 hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam)
897 {
898 	struct hvs_pcb *pcb = so2hvspcb(so);
899 
900 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
901 	    "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__);
902 
903 	if (pcb == NULL)
904 		return (EINVAL);
905 
906 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT);
907 
908 	return ((*nam == NULL)? ENOMEM : 0);
909 }
910 
911 int
912 hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam)
913 {
914 	struct hvs_pcb *pcb = so2hvspcb(so);
915 
916 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
917 	    "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__);
918 
919 	if (pcb == NULL)
920 		return (EINVAL);
921 
922 	*nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT);
923 
924 	return ((*nam == NULL)? ENOMEM : 0);
925 }
926 
927 void
928 hvs_trans_close(struct socket *so)
929 {
930 	struct hvs_pcb *pcb;
931 
932 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
933 	    "%s: HyperV Socket hvs_trans_close called\n", __func__);
934 
935 	(void) hvs_trans_lock();
936 	pcb = so2hvspcb(so);
937 	if (!pcb) {
938 		hvs_trans_unlock();
939 		return;
940 	}
941 
942 	if (so->so_state & SS_ISCONNECTED) {
943 		/* Send a FIN to peer */
944 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
945 		    "%s: hvs_trans_close sending a FIN to host\n", __func__);
946 		(void) hvsock_send_data(pcb->chan, NULL, 0, NULL);
947 	}
948 
949 	if (so->so_state &
950 	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
951 		soisdisconnected(so);
952 
953 	pcb->chan = NULL;
954 	pcb->so = NULL;
955 
956 	if (SOLISTENING(so)) {
957 		mtx_lock(&hvs_trans_socks_mtx);
958 		/* Remove from bound list */
959 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
960 		mtx_unlock(&hvs_trans_socks_mtx);
961 	}
962 
963 	hvs_trans_unlock();
964 
965 	return;
966 }
967 
968 void
969 hvs_trans_abort(struct socket *so)
970 {
971 	struct hvs_pcb *pcb = so2hvspcb(so);
972 
973 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
974 	    "%s: HyperV Socket hvs_trans_abort called\n", __func__);
975 
976 	(void) hvs_trans_lock();
977 	if (pcb == NULL) {
978 		hvs_trans_unlock();
979 		return;
980 	}
981 
982 	if (SOLISTENING(so)) {
983 		mtx_lock(&hvs_trans_socks_mtx);
984 		/* Remove from bound list */
985 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
986 		mtx_unlock(&hvs_trans_socks_mtx);
987 	}
988 
989 	if (so->so_state & SS_ISCONNECTED) {
990 		(void) sodisconnect(so);
991 	}
992 	hvs_trans_unlock();
993 
994 	return;
995 }
996 
997 int
998 hvs_trans_shutdown(struct socket *so)
999 {
1000 	struct hvs_pcb *pcb = so2hvspcb(so);
1001 	struct sockbuf *sb;
1002 
1003 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1004 	    "%s: HyperV Socket hvs_trans_shutdown called\n", __func__);
1005 
1006 	if (pcb == NULL)
1007 		return (EINVAL);
1008 
1009 	/*
1010 	 * Only get called with the shutdown method is SHUT_WR or
1011 	 * SHUT_RDWR.
1012 	 * When the method is SHUT_RD or SHUT_RDWR, the caller
1013 	 * already set the SBS_CANTRCVMORE on receive side socket
1014 	 * buffer.
1015 	 */
1016 	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1017 		/*
1018 		 * SHUT_WR only case.
1019 		 * Receive side is still open. Just close
1020 		 * the send side.
1021 		 */
1022 		socantsendmore(so);
1023 	} else {
1024 		/* SHUT_RDWR case */
1025 		if (so->so_state & SS_ISCONNECTED) {
1026 			/* Send a FIN to peer */
1027 			sb = &so->so_snd;
1028 			SOCKBUF_LOCK(sb);
1029 			(void) hvsock_send_data(pcb->chan, NULL, 0, sb);
1030 			SOCKBUF_UNLOCK(sb);
1031 
1032 			soisdisconnecting(so);
1033 		}
1034 	}
1035 
1036 	return (0);
1037 }
1038 
1039 /* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is
1040  * <port> (see struct sockaddr_hvs).
1041  *
1042  * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV:
1043  * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-
1044  * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with
1045  * the below sockaddr:
1046  *
1047  * struct SOCKADDR_HV
1048  * {
1049  *    ADDRESS_FAMILY Family;
1050  *    USHORT Reserved;
1051  *    GUID VmId;
1052  *    GUID ServiceId;
1053  * };
1054  * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via
1055  * VMBus, because here it's obvious the host and the VM can easily identify
1056  * each other. Though the VmID is useful on the host, especially in the case
1057  * of Windows container, FreeBSD VM doesn't need it at all.
1058  *
1059  * To be compatible with similar infrastructure in Linux VMs, we have
1060  * to limit the available GUID space of SOCKADDR_HV so that we can create
1061  * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID.
1062  * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
1063  *
1064  ****************************************************************************
1065  * The only valid Service GUIDs, from the perspectives of both the host and *
1066  * FreeBSD VM, that can be connected by the other end, must conform to this *
1067  * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
1068  ****************************************************************************
1069  *
1070  * When we write apps on the host to connect(), the GUID ServiceID is used.
1071  * When we write apps in FreeBSD VM to connect(), we only need to specify the
1072  * port and the driver will form the GUID and use that to request the host.
1073  *
 * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
 * auto-generated remote port for a connect request initiated by the host's
 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
 * FreeBSD guest.
1078  */
1079 
/*
 * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
 * restrict the HyperV socket ring buffer size to six 4K pages. Newer
 * HyperV hosts don't have this limit.
 */
#define HVS_RINGBUF_RCV_SIZE	(PAGE_SIZE * 6)
#define HVS_RINGBUF_SND_SIZE	(PAGE_SIZE * 6)
#define HVS_RINGBUF_MAX_SIZE	(PAGE_SIZE * 64)
1088 
/*
 * Per-device state tying a device to a pcb and a VMBus channel.
 * NOTE(review): presumably the softc of the hvsock device driver; its
 * users are outside this view -- confirm.
 */
struct hvsock_sc {
	device_t		dev;
	struct hvs_pcb		*pcb;
	struct vmbus_channel	*channel;
};
1094 
/*
 * Tell whether the channel holds at least as many readable bytes as a
 * packet with an empty payload occupies (HVSOCK_PKT_LEN(0)).
 */
static bool
hvsock_chan_readable(struct vmbus_channel *chan)
{
	uint32_t avail;

	avail = vmbus_chan_read_available(chan);
	if (avail < HVSOCK_PKT_LEN(0))
		return (false);

	return (true);
}
1102 
1103 static void
1104 hvsock_chan_cb(struct vmbus_channel *chan, void *context)
1105 {
1106 	struct hvs_pcb *pcb = (struct hvs_pcb *) context;
1107 	struct socket *so;
1108 	uint32_t canwrite;
1109 
1110 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1111 	    "%s: host send us a wakeup on rb data, pcb = %p\n",
1112 	    __func__, pcb);
1113 
1114 	/*
1115 	 * Check if the socket is still attached and valid.
1116 	 * Here we know channel is still open. Need to make
1117 	 * sure the socket has not been closed or freed.
1118 	 */
1119 	(void) hvs_trans_lock();
1120 	so = hsvpcb2so(pcb);
1121 
1122 	if (pcb->chan != NULL && so != NULL) {
1123 		/*
1124 		 * Wake up reader if there are data to read.
1125 		 */
1126 		SOCKBUF_LOCK(&(so)->so_rcv);
1127 
1128 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1129 		    "%s: read available = %u\n", __func__,
1130 		    vmbus_chan_read_available(pcb->chan));
1131 
1132 		if (hvsock_chan_readable(pcb->chan))
1133 			sorwakeup_locked(so);
1134 		else
1135 			SOCKBUF_UNLOCK(&(so)->so_rcv);
1136 
1137 		/*
1138 		 * Wake up sender if space becomes available to write.
1139 		 */
1140 		SOCKBUF_LOCK(&(so)->so_snd);
1141 		canwrite = hvsock_canwrite_check(pcb);
1142 
1143 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1144 		    "%s: canwrite = %u\n", __func__, canwrite);
1145 
1146 		if (canwrite > 0) {
1147 			sowwakeup_locked(so);
1148 		} else {
1149 			SOCKBUF_UNLOCK(&(so)->so_snd);
1150 		}
1151 	}
1152 
1153 	hvs_trans_unlock();
1154 
1155 	return;
1156 }
1157 
1158 static int
1159 hvsock_br_callback(void *datap, int cplen, void *cbarg)
1160 {
1161 	struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg;
1162 	struct uio *uio = arg->uio;
1163 	struct sockbuf *sb = arg->sb;
1164 	int error = 0;
1165 
1166 	if (cbarg == NULL || datap == NULL)
1167 		return (EINVAL);
1168 
1169 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1170 	    "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, "
1171 	    "datap = %p\n",
1172 	    __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br",
1173 	    uio->uio_resid, cplen, datap);
1174 
1175 	if (sb)
1176 		SOCKBUF_UNLOCK(sb);
1177 
1178 	error = uiomove(datap, cplen, uio);
1179 
1180 	if (sb)
1181 		SOCKBUF_LOCK(sb);
1182 
1183 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1184 	    "%s: after uiomove, uio_resid = %zd, error = %d\n",
1185 	    __func__, uio->uio_resid, error);
1186 
1187 	return (error);
1188 }
1189 
1190 static int
1191 hvsock_send_data(struct vmbus_channel *chan, struct uio *uio,
1192     uint32_t to_write, struct sockbuf *sb)
1193 {
1194 	struct hvs_pkt_header hvs_pkt;
1195 	int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0;
1196 	uint64_t pad = 0;
1197 	struct iovec iov[3];
1198 	struct hvs_callback_arg cbarg;
1199 
1200 	if (chan == NULL)
1201 		return (ENOTCONN);
1202 
1203 	hlen = sizeof(struct vmbus_chanpkt_hdr);
1204 	hvs_pkthlen = sizeof(struct hvs_pkt_header);
1205 	hvs_pktlen = hvs_pkthlen + to_write;
1206 	pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen);
1207 
1208 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1209 	    "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, "
1210 	    "pad_pktlen = %u, data_len = %u\n",
1211 	    __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write);
1212 
1213 	hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND;
1214 	hvs_pkt.chan_pkt_hdr.cph_flags = 0;
1215 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen);
1216 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen);
1217 	hvs_pkt.chan_pkt_hdr.cph_xactid = 0;
1218 
1219 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1;
1220 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write;
1221 
1222 	cbarg.uio = uio;
1223 	cbarg.sb = sb;
1224 
1225 	if (uio && to_write > 0) {
1226 		iov[0].iov_base = &hvs_pkt;
1227 		iov[0].iov_len = hvs_pkthlen;
1228 		iov[1].iov_base = NULL;
1229 		iov[1].iov_len = to_write;
1230 		iov[2].iov_base = &pad;
1231 		iov[2].iov_len = pad_pktlen - hvs_pktlen;
1232 
1233 		error = vmbus_chan_iov_send(chan, iov, 3,
1234 		    hvsock_br_callback, &cbarg);
1235 	} else {
1236 		if (to_write == 0) {
1237 			iov[0].iov_base = &hvs_pkt;
1238 			iov[0].iov_len = hvs_pkthlen;
1239 			iov[1].iov_base = &pad;
1240 			iov[1].iov_len = pad_pktlen - hvs_pktlen;
1241 			error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL);
1242 		}
1243 	}
1244 
1245 	if (error) {
1246 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1247 		    "%s: error = %d\n", __func__, error);
1248 	}
1249 
1250 	return (error);
1251 }
1252 
1253 /*
1254  * Check if we have data on current ring buffer to read
1255  * or not. If not, advance the ring buffer read index to
1256  * next packet. Update the recev_data_len and recev_data_off
1257  * to new value.
1258  * Return the number of bytes can read.
1259  */
1260 static uint32_t
1261 hvsock_canread_check(struct hvs_pcb *pcb)
1262 {
1263 	uint32_t advance;
1264 	uint32_t tlen, hlen, dlen;
1265 	uint32_t bytes_canread = 0;
1266 	int error;
1267 
1268 	if (pcb == NULL || pcb->chan == NULL) {
1269 		pcb->so->so_error = EIO;
1270 		return (0);
1271 	}
1272 
1273 	/* Still have data not read yet on current packet */
1274 	if (pcb->recv_data_len > 0)
1275 		return (pcb->recv_data_len);
1276 
1277 	if (pcb->rb_init)
1278 		advance =
1279 		    VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1280 	else
1281 		advance = 0;
1282 
1283 	bytes_canread = vmbus_chan_read_available(pcb->chan);
1284 
1285 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1286 	    "%s: bytes_canread on br = %u, advance = %u\n",
1287 	    __func__, bytes_canread, advance);
1288 
1289 	if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) {
1290 		/*
1291 		 * Nothing to read. Need to advance the rindex before
1292 		 * calling sbwait, so host knows to wake us up when data
1293 		 * is available to read on rb.
1294 		 */
1295 		error = vmbus_chan_recv_idxadv(pcb->chan, advance);
1296 		if (error) {
1297 			HVSOCK_DBG(HVSOCK_DBG_ERR,
1298 			    "%s: after calling vmbus_chan_recv_idxadv, "
1299 			    "got error = %d\n",  __func__, error);
1300 			return (0);
1301 		} else {
1302 			pcb->rb_init = false;
1303 			pcb->recv_data_len = 0;
1304 			pcb->recv_data_off = 0;
1305 			bytes_canread = vmbus_chan_read_available(pcb->chan);
1306 
1307 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1308 			    "%s: advanced %u bytes, "
1309 			    " bytes_canread on br now = %u\n",
1310 			    __func__, advance, bytes_canread);
1311 
1312 			if (bytes_canread == 0)
1313 				return (0);
1314 			else
1315 				advance = 0;
1316 		}
1317 	}
1318 
1319 	if (bytes_canread <
1320 	    advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t)))
1321 		return (0);
1322 
1323 	error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt,
1324 	    sizeof(struct hvs_pkt_header), advance);
1325 
1326 	/* Don't have anything to read */
1327 	if (error) {
1328 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1329 		    "%s: after calling vmbus_chan_recv_peek, got error = %d\n",
1330 		    __func__, error);
1331 		return (0);
1332 	}
1333 
1334 	/*
1335 	 * We just read in a new packet header. Do some sanity checks.
1336 	 */
1337 	tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1338 	hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen);
1339 	dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size;
1340 	if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) ||
1341 	    __predict_false(hlen > tlen) ||
1342 	    __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) {
1343 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1344 		    "invalid tlen(%u), hlen(%u) or dlen(%u)\n",
1345 		    tlen, hlen, dlen);
1346 		pcb->so->so_error = EIO;
1347 		return (0);
1348 	}
1349 	if (pcb->rb_init == false)
1350 		pcb->rb_init = true;
1351 
1352 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1353 	    "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n",
1354 	    tlen, hlen, dlen);
1355 
1356 	/* The other side has sent a close FIN */
1357 	if (dlen == 0) {
1358 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1359 		    "%s: Received FIN from other side\n", __func__);
1360 		/* inform the caller by seting so_error to ESHUTDOWN */
1361 		pcb->so->so_error = ESHUTDOWN;
1362 	}
1363 
1364 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1365 	    "%s: canread on receive ring is %u \n", __func__, dlen);
1366 
1367 	pcb->recv_data_len = dlen;
1368 	pcb->recv_data_off = 0;
1369 
1370 	return (pcb->recv_data_len);
1371 }
1372 
1373 static uint32_t
1374 hvsock_canwrite_check(struct hvs_pcb *pcb)
1375 {
1376 	uint32_t writeable;
1377 	uint32_t ret;
1378 
1379 	if (pcb == NULL || pcb->chan == NULL)
1380 		return (0);
1381 
1382 	writeable = vmbus_chan_write_available(pcb->chan);
1383 
1384 	/*
1385 	 * We must always reserve a 0-length-payload packet for the FIN.
1386 	 */
1387 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1388 	    "%s: writeable is %u, should be greater than %ju\n",
1389 	    __func__, writeable,
1390 	    (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)));
1391 
1392 	if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) {
1393 		/*
1394 		 * The Tx ring seems full.
1395 		 */
1396 		return (0);
1397 	}
1398 
1399 	ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0);
1400 
1401 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1402 	    "%s: available size is %u\n", __func__, rounddown2(ret, 8));
1403 
1404 	return (rounddown2(ret, 8));
1405 }
1406 
1407 static void
1408 hvsock_set_chan_pending_send_size(struct vmbus_channel *chan)
1409 {
1410 	vmbus_chan_set_pending_send_size(chan,
1411 	    HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ));
1412 }
1413 
1414 static int
1415 hvsock_open_channel(struct vmbus_channel *chan, struct socket *so)
1416 {
1417 	unsigned int rcvbuf, sndbuf;
1418 	struct hvs_pcb *pcb = so2hvspcb(so);
1419 	int ret;
1420 
1421 	if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) {
1422 		sndbuf = HVS_RINGBUF_SND_SIZE;
1423 		rcvbuf = HVS_RINGBUF_RCV_SIZE;
1424 	} else {
1425 		sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE);
1426 		sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE);
1427 		sndbuf = rounddown2(sndbuf, PAGE_SIZE);
1428 		rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE);
1429 		rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE);
1430 		rcvbuf = rounddown2(rcvbuf, PAGE_SIZE);
1431 	}
1432 
1433 	/*
1434 	 * Can only read whatever user provided size of data
1435 	 * from ring buffer. Turn off batched reading.
1436 	 */
1437 	vmbus_chan_set_readbatch(chan, false);
1438 
1439 	ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0,
1440 	    hvsock_chan_cb, pcb);
1441 
1442 	if (ret != 0) {
1443 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1444 		    "%s: failed to open hvsock channel, sndbuf = %u, "
1445 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1446 	} else {
1447 		HVSOCK_DBG(HVSOCK_DBG_INFO,
1448 		    "%s: hvsock channel opened, sndbuf = %u, i"
1449 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1450 		/*
1451 		 * Se the pending send size so to receive wakeup
1452 		 * signals from host when there is enough space on
1453 		 * rx buffer ring to write.
1454 		 */
1455 		hvsock_set_chan_pending_send_size(chan);
1456 	}
1457 
1458 	return ret;
1459 }
1460 
1461 /*
1462  * Guest is listening passively on the socket. Open channel and
1463  * create a new socket for the conneciton.
1464  */
1465 static void
1466 hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so,
1467     struct hvsock_sc *sc)
1468 {
1469 	struct socket *new_so;
1470 	struct hvs_pcb *new_pcb, *pcb;
1471 	int error;
1472 
1473 	/* Do nothing if socket is not listening */
1474 	if (!SOLISTENING(so)) {
1475 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1476 		    "%s: socket is not a listening one\n", __func__);
1477 		return;
1478 	}
1479 
1480 	/*
1481 	 * Create a new socket. This will call pru_attach to complete
1482 	 * the socket initialization and put the new socket onto
1483 	 * listening socket's sol_incomp list, waiting to be promoted
1484 	 * to sol_comp list.
1485 	 * The new socket created has ref count 0. There is no other
1486 	 * thread that changes the state of this new one at the
1487 	 * moment, so we don't need to hold its lock while opening
1488 	 * channel and filling out its pcb information.
1489 	 */
1490 	new_so = sonewconn(so, 0);
1491 	if (!new_so)
1492 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1493 		    "%s: creating new socket failed\n", __func__);
1494 
1495 	/*
1496 	 * Now open the vmbus channel. If it fails, the socket will be
1497 	 * on the listening socket's sol_incomp queue until it is
1498 	 * replaced and aborted.
1499 	 */
1500 	error = hvsock_open_channel(chan, new_so);
1501 	if (error) {
1502 		new_so->so_error = error;
1503 		return;
1504 	}
1505 
1506 	pcb = so->so_pcb;
1507 	new_pcb = new_so->so_pcb;
1508 
1509 	hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port);
1510 	/* Remote port is unknown to guest in this type of conneciton */
1511 	hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN);
1512 	new_pcb->chan = chan;
1513 	new_pcb->recv_data_len = 0;
1514 	new_pcb->recv_data_off = 0;
1515 	new_pcb->rb_init = false;
1516 
1517 	new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan);
1518 	new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan);
1519 
1520 	hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED);
1521 
1522 	sc->pcb = new_pcb;
1523 
1524 	/*
1525 	 * Change the socket state to SS_ISCONNECTED. This will promote
1526 	 * the socket to sol_comp queue and wake up the thread which
1527 	 * is accepting connection.
1528 	 */
1529 	soisconnected(new_so);
1530 }
1531 
1532 
1533 /*
1534  * Guest is actively connecting to host.
1535  */
1536 static void
1537 hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so)
1538 {
1539 	struct hvs_pcb *pcb;
1540 	int error;
1541 
1542 	error = hvsock_open_channel(chan, so);
1543 	if (error) {
1544 		so->so_error = error;
1545 		return;
1546 	}
1547 
1548 	pcb = so->so_pcb;
1549 	pcb->chan = chan;
1550 	pcb->recv_data_len = 0;
1551 	pcb->recv_data_off = 0;
1552 	pcb->rb_init = false;
1553 
1554 	mtx_lock(&hvs_trans_socks_mtx);
1555 	__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
1556 	__hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED);
1557 	mtx_unlock(&hvs_trans_socks_mtx);
1558 
1559 	/*
1560 	 * Change the socket state to SS_ISCONNECTED. This will wake up
1561 	 * the thread sleeping in connect call.
1562 	 */
1563 	soisconnected(so);
1564 }
1565 
1566 static void
1567 hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc)
1568 {
1569 	struct hyperv_guid *inst_guid, *type_guid;
1570 	bool conn_from_host;
1571 	struct sockaddr_hvs addr;
1572 	struct socket *so;
1573 	struct hvs_pcb *pcb;
1574 
1575 	type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan);
1576 	inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan);
1577 	conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan);
1578 
1579 	HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is ");
1580 	hvsock_print_guid(type_guid);
1581 	HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is ");
1582 	hvsock_print_guid(inst_guid);
1583 	HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n",
1584 	    (conn_from_host == true ) ? "from" : "to");
1585 
1586 	/*
1587 	 * The listening port should be in [0, MAX_LISTEN_PORT]
1588 	 */
1589 	if (!is_valid_srv_id(type_guid))
1590 		return;
1591 
1592 	/*
1593 	 * There should be a bound socket already created no matter
1594 	 * it is a passive or active connection.
1595 	 * For host initiated connection (passive on guest side),
1596 	 * the  type_guid contains the port which guest is bound and
1597 	 * listening.
1598 	 * For the guest initiated connection (active on guest side),
1599 	 * the inst_guid contains the port that guest has auto bound
1600 	 * to.
1601 	 */
1602 	hvs_addr_init(&addr, conn_from_host ? type_guid : inst_guid);
1603 	so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND);
1604 	if (!so) {
1605 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1606 		    "%s: no bound socket found for port %u\n",
1607 		    __func__, addr.hvs_port);
1608 		return;
1609 	}
1610 
1611 	if (conn_from_host) {
1612 		hvsock_open_conn_passive(chan, so, sc);
1613 	} else {
1614 		(void) hvs_trans_lock();
1615 		pcb = so->so_pcb;
1616 		if (pcb && pcb->so) {
1617 			sc->pcb = so2hvspcb(so);
1618 			hvsock_open_conn_active(chan, so);
1619 		} else {
1620 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1621 			    "%s: channel detached before open\n", __func__);
1622 		}
1623 		hvs_trans_unlock();
1624 	}
1625 
1626 }
1627 
1628 static int
1629 hvsock_probe(device_t dev)
1630 {
1631 	struct vmbus_channel *channel = vmbus_get_channel(dev);
1632 
1633 	if (!channel || !vmbus_chan_is_hvs(channel)) {
1634 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1635 		    "hvsock_probe called but not a hvsock channel id %u\n",
1636 		    vmbus_chan_id(channel));
1637 
1638 		return ENXIO;
1639 	} else {
1640 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1641 		    "hvsock_probe got a hvsock channel id %u\n",
1642 		    vmbus_chan_id(channel));
1643 
1644 		return BUS_PROBE_DEFAULT;
1645 	}
1646 }
1647 
1648 static int
1649 hvsock_attach(device_t dev)
1650 {
1651 	struct vmbus_channel *channel = vmbus_get_channel(dev);
1652 	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1653 
1654 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n");
1655 
1656 	hvsock_open_connection(channel, sc);
1657 
1658 	/*
1659 	 * Always return success. On error the host will rescind the device
1660 	 * in 30 seconds and we can do cleanup at that time in
1661 	 * vmbus_chan_msgproc_chrescind().
1662 	 */
1663 	return (0);
1664 }
1665 
1666 static int
1667 hvsock_detach(device_t dev)
1668 {
1669 	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1670 	struct socket *so;
1671 	int retry;
1672 
1673 	if (bootverbose)
1674 		device_printf(dev, "hvsock_detach called.\n");
1675 
1676 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n");
1677 
1678 	if (sc->pcb != NULL) {
1679 		(void) hvs_trans_lock();
1680 
1681 		so = hsvpcb2so(sc->pcb);
1682 		if (so) {
1683 			/* Close the connection */
1684 			if (so->so_state &
1685 			    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
1686 				soisdisconnected(so);
1687 		}
1688 
1689 		mtx_lock(&hvs_trans_socks_mtx);
1690 		__hvs_remove_pcb_from_list(sc->pcb,
1691 		    HVS_LIST_BOUND | HVS_LIST_CONNECTED);
1692 		mtx_unlock(&hvs_trans_socks_mtx);
1693 
1694 		/*
1695 		 * Close channel while no reader and sender are working
1696 		 * on the buffer rings.
1697 		 */
1698 		if (so) {
1699 			retry = 0;
1700 			while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) {
1701 				/*
1702 				 * Someone is reading, rx br is busy
1703 				 */
1704 				soisdisconnected(so);
1705 				DELAY(500);
1706 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1707 				    "waiting for rx reader to exit, "
1708 				    "retry = %d\n", retry++);
1709 			}
1710 			retry = 0;
1711 			while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) {
1712 				/*
1713 				 * Someone is sending, tx br is busy
1714 				 */
1715 				soisdisconnected(so);
1716 				DELAY(500);
1717 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1718 				    "waiting for tx sender to exit, "
1719 				    "retry = %d\n", retry++);
1720 			}
1721 		}
1722 
1723 
1724 		bzero(sc->pcb, sizeof(struct hvs_pcb));
1725 		free(sc->pcb, M_HVSOCK);
1726 		sc->pcb = NULL;
1727 
1728 		if (so) {
1729 			SOCK_IO_RECV_UNLOCK(so);
1730 			SOCK_IO_SEND_UNLOCK(so);
1731 			so->so_pcb = NULL;
1732 		}
1733 
1734 		hvs_trans_unlock();
1735 	}
1736 
1737 	vmbus_chan_close(vmbus_get_channel(dev));
1738 
1739 	return (0);
1740 }
1741 
1742 static device_method_t hvsock_methods[] = {
1743 	/* Device interface */
1744 	DEVMETHOD(device_probe, hvsock_probe),
1745 	DEVMETHOD(device_attach, hvsock_attach),
1746 	DEVMETHOD(device_detach, hvsock_detach),
1747 	DEVMETHOD_END
1748 };
1749 
1750 static driver_t hvsock_driver = {
1751 	"hv_sock",
1752 	hvsock_methods,
1753 	sizeof(struct hvsock_sc)
1754 };
1755 
1756 static devclass_t hvsock_devclass;
1757 
1758 DRIVER_MODULE(hvsock, vmbus, hvsock_driver, hvsock_devclass, NULL, NULL);
1759 MODULE_VERSION(hvsock, 1);
1760 MODULE_DEPEND(hvsock, vmbus, 1, 1, 1);
1761