xref: /freebsd/sys/dev/hyperv/hvsock/hv_sock.c (revision c5405d1c850765d04f74067ebb71f57e9a26b8ea)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2020 Microsoft Corp.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/bus.h>
31 #include <sys/domain.h>
32 #include <sys/lock.h>
33 #include <sys/kernel.h>
34 #include <sys/types.h>
35 #include <sys/malloc.h>
36 #include <sys/module.h>
37 #include <sys/mutex.h>
38 #include <sys/proc.h>
39 #include <sys/protosw.h>
40 #include <sys/socket.h>
41 #include <sys/sysctl.h>
42 #include <sys/sysproto.h>
43 #include <sys/systm.h>
44 #include <sys/sockbuf.h>
45 #include <sys/sx.h>
46 #include <sys/uio.h>
47 
48 #include <net/vnet.h>
49 
50 #include <dev/hyperv/vmbus/vmbus_reg.h>
51 
52 #include "hv_sock.h"
53 
/*
 * Debug verbosity levels.  HVSOCK_DBG() prints a message when the
 * hvs_dbg_level setting is greater than or equal to the message's level.
 */
#define HVSOCK_DBG_NONE			0x0
#define HVSOCK_DBG_INFO			0x1
#define HVSOCK_DBG_ERR			0x2
#define HVSOCK_DBG_VERBOSE		0x3


/* Sysctl node "hvsock" under "net"; parent of the knob below. */
SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket");

/* Current debug level, exposed as the hvs_dbg_level sysctl. */
static int hvs_dbg_level;
SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level,
    0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose");


/*
 * Emit a printf-style message if the configured debug level is at least
 * 'level'.  The arguments are only evaluated when the message is printed.
 */
#define HVSOCK_DBG(level, ...) do {					\
	if (hvs_dbg_level >= (level))					\
		printf(__VA_ARGS__);					\
	} while (0)

/* malloc type used for allocations made by this file (e.g. struct hvs_pcb). */
MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures");
73 
/* Domain probe hook, referenced by hv_socket_domain before its definition. */
static int hvs_dom_probe(void);

/* The MTU is 16KB per host side's design */
#define HVSOCK_MTU_SIZE		(1024 * 16)
/* Upper bound on a single send: one page minus the vmpipe header. */
#define HVSOCK_SEND_BUF_SZ	(PAGE_SIZE - sizeof(struct vmpipe_proto_header))

/* Size of the header that precedes each packet's payload. */
#define HVSOCK_HEADER_LEN	(sizeof(struct hvs_pkt_header))

/*
 * Total length of a packet carrying 'payload_len' bytes: the header, the
 * payload rounded up to a multiple of 8, and one trailing uint64_t.
 */
#define HVSOCK_PKT_LEN(payload_len)	(HVSOCK_HEADER_LEN + \
					 roundup2(payload_len, 8) + \
					 sizeof(uint64_t))
/*
 * HyperV Transport sockets
 *
 * Protocol switch for the single protocol of this domain: a stream
 * protocol that requires a connection (PR_CONNREQUIRED).  Every handler is
 * one of the hvs_trans_*() functions.
 */
static struct protosw hv_socket_protosw = {
	.pr_type =		SOCK_STREAM,
	.pr_protocol =		HYPERV_SOCK_PROTO_TRANS,
	.pr_flags =		PR_CONNREQUIRED,
	.pr_attach =		hvs_trans_attach,
	.pr_bind =		hvs_trans_bind,
	.pr_listen =		hvs_trans_listen,
	.pr_accept =		hvs_trans_accept,
	.pr_connect =		hvs_trans_connect,
	.pr_peeraddr =		hvs_trans_peeraddr,
	.pr_sockaddr =		hvs_trans_sockaddr,
	.pr_soreceive =		hvs_trans_soreceive,
	.pr_sosend =		hvs_trans_sosend,
	.pr_disconnect =	hvs_trans_disconnect,
	.pr_close =		hvs_trans_close,
	.pr_detach =		hvs_trans_detach,
	.pr_shutdown =		hvs_trans_shutdown,
	.pr_abort =		hvs_trans_abort,
};
108 
/*
 * The AF_HYPERV domain.  It carries exactly one protocol switch entry and
 * uses hvs_dom_probe() as its probe hook.
 */
static struct domain		hv_socket_domain = {
	.dom_family =		AF_HYPERV,
	.dom_name =		"hyperv",
	.dom_probe =		hvs_dom_probe,
	.dom_nprotosw =		1,
	.dom_protosw =		{ &hv_socket_protosw },
};

/* Register the domain; the macro argument is the "hv_socket_" name prefix. */
DOMAIN_SET(hv_socket_);
118 
/* Bounds of the 32-bit port space used for automatic local binding. */
#define MAX_PORT			((uint32_t)0xFFFFFFFF)
#define MIN_PORT			((uint32_t)0x0)

/*
 * Template for service GUIDs.  The first four bytes are the port field
 * (see get_port_by_srv_id() / set_port_by_srv_id()); the remaining twelve
 * bytes are fixed and are what is_valid_srv_id() compares against.
 */
/* 00000000-facb-11e6-bd58-64006a7986d3 */
static const struct hyperv_guid srv_id_template = {
	.hv_guid = {
	    0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11,
	    0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 }
};
128 
/* Forward declarations of helpers that are used before their definitions. */
static int		hvsock_br_callback(void *, int, void *);
static uint32_t		hvsock_canread_check(struct hvs_pcb *);
static uint32_t		hvsock_canwrite_check(struct hvs_pcb *);
static int		hvsock_send_data(struct vmbus_channel *chan,
    struct uio *uio, uint32_t to_write, struct sockbuf *sb);



/* Globals */
/* Transport-wide sx lock; taken by hvs_trans_lock(). */
static struct sx		hvs_trans_socks_sx;
/* Mutex held around accesses to the two socket lists below. */
static struct mtx		hvs_trans_socks_mtx;
/* PCBs of bound sockets, linked through bound_next. */
static LIST_HEAD(, hvs_pcb)	hvs_trans_bound_socks;
/* PCBs of connected sockets, linked through connected_next. */
static LIST_HEAD(, hvs_pcb)	hvs_trans_connected_socks;
/* Port most recently chosen by the auto-bind search in hvs_trans_connect(). */
static uint32_t			previous_auto_bound_port;
143 
144 static void
145 hvsock_print_guid(struct hyperv_guid *guid)
146 {
147 	unsigned char *p = (unsigned char *)guid;
148 
149 	HVSOCK_DBG(HVSOCK_DBG_INFO,
150 	    "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n",
151 	    *(unsigned int *)p,
152 	    *((unsigned short *) &p[4]),
153 	    *((unsigned short *) &p[6]),
154 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
155 }
156 
157 static bool
158 is_valid_srv_id(const struct hyperv_guid *id)
159 {
160 	return !memcmp(&id->hv_guid[4],
161 	    &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4);
162 }
163 
/*
 * Return the port stored in the leading bytes of a service GUID.
 *
 * The value is copied out with memcpy() because the GUID is a byte array
 * and may not be aligned for a direct unsigned int load.
 */
static unsigned int
get_port_by_srv_id(const struct hyperv_guid *srv_id)
{
	unsigned int port;

	memcpy(&port, srv_id, sizeof(port));
	return (port);
}
169 
/*
 * Store 'port' into the leading bytes of a service GUID.
 *
 * The value is copied in with memcpy() because the GUID is a byte array
 * and may not be aligned for a direct unsigned int store.
 */
static void
set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port)
{
	memcpy(srv_id, &port, sizeof(port));
}
175 
176 
177 static void
178 __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list)
179 {
180 	struct hvs_pcb *p = NULL;
181 
182 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
183 
184 	if (!pcb)
185 		return;
186 
187 	if (list & HVS_LIST_BOUND) {
188 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
189 			if  (p == pcb)
190 				LIST_REMOVE(p, bound_next);
191 	}
192 
193 	if (list & HVS_LIST_CONNECTED) {
194 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
195 			if (p == pcb)
196 				LIST_REMOVE(pcb, connected_next);
197 	}
198 }
199 
200 static void
201 __hvs_remove_socket_from_list(struct socket *so, unsigned char list)
202 {
203 	struct hvs_pcb *pcb = so2hvspcb(so);
204 
205 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
206 
207 	__hvs_remove_pcb_from_list(pcb, list);
208 }
209 
210 static void
211 __hvs_insert_socket_on_list(struct socket *so, unsigned char list)
212 {
213 	struct hvs_pcb *pcb = so2hvspcb(so);
214 
215 	if (list & HVS_LIST_BOUND)
216 		LIST_INSERT_HEAD(&hvs_trans_bound_socks,
217 		   pcb, bound_next);
218 
219 	if (list & HVS_LIST_CONNECTED)
220 		LIST_INSERT_HEAD(&hvs_trans_connected_socks,
221 		   pcb, connected_next);
222 }
223 
224 void
225 hvs_remove_socket_from_list(struct socket *so, unsigned char list)
226 {
227 	if (!so || !so->so_pcb) {
228 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
229 		    "%s: socket or so_pcb is null\n", __func__);
230 		return;
231 	}
232 
233 	mtx_lock(&hvs_trans_socks_mtx);
234 	__hvs_remove_socket_from_list(so, list);
235 	mtx_unlock(&hvs_trans_socks_mtx);
236 }
237 
238 static void
239 hvs_insert_socket_on_list(struct socket *so, unsigned char list)
240 {
241 	if (!so || !so->so_pcb) {
242 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
243 		    "%s: socket or so_pcb is null\n", __func__);
244 		return;
245 	}
246 
247 	mtx_lock(&hvs_trans_socks_mtx);
248 	__hvs_insert_socket_on_list(so, list);
249 	mtx_unlock(&hvs_trans_socks_mtx);
250 }
251 
252 static struct socket *
253 __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
254 {
255 	struct hvs_pcb *p = NULL;
256 
257 	if (list & HVS_LIST_BOUND)
258 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
259 			if (p->so != NULL &&
260 			    addr->hvs_port == p->local_addr.hvs_port)
261 				return p->so;
262 
263 	if (list & HVS_LIST_CONNECTED)
264 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
265 			if (p->so != NULL &&
266 			    addr->hvs_port == p->local_addr.hvs_port)
267 				return p->so;
268 
269 	return NULL;
270 }
271 
272 static struct socket *
273 hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
274 {
275 	struct socket *s = NULL;
276 
277 	mtx_lock(&hvs_trans_socks_mtx);
278 	s = __hvs_find_socket_on_list(addr, list);
279 	mtx_unlock(&hvs_trans_socks_mtx);
280 
281 	return s;
282 }
283 
284 static inline void
285 hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port)
286 {
287 	memset(addr, 0, sizeof(*addr));
288 	addr->sa_family = AF_HYPERV;
289 	addr->sa_len = sizeof(*addr);
290 	addr->hvs_port = port;
291 }
292 
/*
 * Initialize 'addr' as an AF_HYPERV address whose port is the one encoded
 * in the service GUID 'svr_id'.
 */
void
hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id)
{
	unsigned int port;

	port = get_port_by_srv_id(svr_id);
	hvs_addr_set(addr, port);
}
298 
299 int
300 hvs_trans_lock(void)
301 {
302 	sx_xlock(&hvs_trans_socks_sx);
303 	return (0);
304 }
305 
306 void
307 hvs_trans_unlock(void)
308 {
309 	sx_xunlock(&hvs_trans_socks_sx);
310 }
311 
312 static int
313 hvs_dom_probe(void)
314 {
315 
316 	/* Don't even give us a chance to attach on non-HyperV. */
317 	if (vm_guest != VM_GUEST_HV)
318 		return (ENXIO);
319 	return (0);
320 }
321 
322 static void
323 hvs_trans_init(void *arg __unused)
324 {
325 
326 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
327 	    "%s: HyperV Socket hvs_trans_init called\n", __func__);
328 
329 	/* Initialize Globals */
330 	previous_auto_bound_port = MAX_PORT;
331 	sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx");
332 	mtx_init(&hvs_trans_socks_mtx,
333 	    "hvs_trans_socks_mtx", NULL, MTX_DEF);
334 	LIST_INIT(&hvs_trans_bound_socks);
335 	LIST_INIT(&hvs_trans_connected_socks);
336 }
337 SYSINIT(hvs_trans_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
338     hvs_trans_init, NULL);
339 
340 /*
341  * Called in two cases:
342  * 1) When user calls socket();
343  * 2) When we accept new incoming conneciton and call sonewconn().
344  */
345 int
346 hvs_trans_attach(struct socket *so, int proto, struct thread *td)
347 {
348 	struct hvs_pcb *pcb = so2hvspcb(so);
349 
350 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
351 	    "%s: HyperV Socket hvs_trans_attach called\n", __func__);
352 
353 	if (so->so_type != SOCK_STREAM)
354 		return (ESOCKTNOSUPPORT);
355 
356 	if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS)
357 		return (EPROTONOSUPPORT);
358 
359 	if (pcb != NULL)
360 		return (EISCONN);
361 	pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO);
362 	if (pcb == NULL)
363 		return (ENOMEM);
364 
365 	pcb->so = so;
366 	so->so_pcb = (void *)pcb;
367 
368 	return (0);
369 }
370 
371 void
372 hvs_trans_detach(struct socket *so)
373 {
374 	struct hvs_pcb *pcb;
375 
376 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
377 	    "%s: HyperV Socket hvs_trans_detach called\n", __func__);
378 
379 	(void) hvs_trans_lock();
380 	pcb = so2hvspcb(so);
381 	if (pcb == NULL) {
382 		hvs_trans_unlock();
383 		return;
384 	}
385 
386 	if (SOLISTENING(so)) {
387 		bzero(pcb, sizeof(*pcb));
388 		free(pcb, M_HVSOCK);
389 	}
390 
391 	so->so_pcb = NULL;
392 
393 	hvs_trans_unlock();
394 }
395 
396 int
397 hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td)
398 {
399 	struct hvs_pcb *pcb = so2hvspcb(so);
400 	struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr;
401 	int error = 0;
402 
403 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
404 	    "%s: HyperV Socket hvs_trans_bind called\n", __func__);
405 
406 	if (sa == NULL) {
407 		return (EINVAL);
408 	}
409 
410 	if (pcb == NULL) {
411 		return (EINVAL);
412 	}
413 
414 	if (sa->sa_family != AF_HYPERV) {
415 		HVSOCK_DBG(HVSOCK_DBG_ERR,
416 		    "%s: Not supported, sa_family is %u\n",
417 		    __func__, sa->sa_family);
418 		return (EAFNOSUPPORT);
419 	}
420 	if (sa->sa_len != sizeof(*sa)) {
421 		HVSOCK_DBG(HVSOCK_DBG_ERR,
422 		    "%s: Not supported, sa_len is %u\n",
423 		    __func__, sa->sa_len);
424 		return (EINVAL);
425 	}
426 
427 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
428 	    "%s: binding port = 0x%x\n", __func__, sa->hvs_port);
429 
430 	mtx_lock(&hvs_trans_socks_mtx);
431 	if (__hvs_find_socket_on_list(sa,
432 	    HVS_LIST_BOUND | HVS_LIST_CONNECTED)) {
433 		error = EADDRINUSE;
434 	} else {
435 		/*
436 		 * The address is available for us to bind.
437 		 * Add socket to the bound list.
438 		 */
439 		hvs_addr_set(&pcb->local_addr, sa->hvs_port);
440 		hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY);
441 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
442 	}
443 	mtx_unlock(&hvs_trans_socks_mtx);
444 
445 	return (error);
446 }
447 
448 int
449 hvs_trans_listen(struct socket *so, int backlog, struct thread *td)
450 {
451 	struct hvs_pcb *pcb = so2hvspcb(so);
452 	struct socket *bound_so;
453 	int error;
454 
455 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
456 	    "%s: HyperV Socket hvs_trans_listen called\n", __func__);
457 
458 	if (pcb == NULL)
459 		return (EINVAL);
460 
461 	/* Check if the address is already bound and it was by us. */
462 	bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND);
463 	if (bound_so == NULL || bound_so != so) {
464 		HVSOCK_DBG(HVSOCK_DBG_ERR,
465 		    "%s: Address not bound or not by us.\n", __func__);
466 		return (EADDRNOTAVAIL);
467 	}
468 
469 	SOCK_LOCK(so);
470 	error = solisten_proto_check(so);
471 	if (error == 0)
472 		solisten_proto(so, backlog);
473 	SOCK_UNLOCK(so);
474 
475 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
476 	    "%s: HyperV Socket listen error = %d\n", __func__, error);
477 	return (error);
478 }
479 
480 int
481 hvs_trans_accept(struct socket *so, struct sockaddr **nam)
482 {
483 	struct hvs_pcb *pcb = so2hvspcb(so);
484 
485 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
486 	    "%s: HyperV Socket hvs_trans_accept called\n", __func__);
487 
488 	if (pcb == NULL)
489 		return (EINVAL);
490 
491 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr,
492 	    M_NOWAIT);
493 
494 	return ((*nam == NULL) ? ENOMEM : 0);
495 }
496 
497 int
498 hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
499 {
500 	struct hvs_pcb *pcb = so2hvspcb(so);
501 	struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam;
502 	bool found_auto_bound_port = false;
503 	int i, error = 0;
504 
505 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
506 	    "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n",
507 	    __func__, raddr->hvs_port);
508 
509 	if (pcb == NULL)
510 		return (EINVAL);
511 
512 	/* Verify the remote address */
513 	if (raddr == NULL)
514 		return (EINVAL);
515 	if (raddr->sa_family != AF_HYPERV)
516 		return (EAFNOSUPPORT);
517 	if (raddr->sa_len != sizeof(*raddr))
518 		return (EINVAL);
519 
520 	mtx_lock(&hvs_trans_socks_mtx);
521 	if (so->so_state &
522 	    (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) {
523 			HVSOCK_DBG(HVSOCK_DBG_ERR,
524 			    "%s: socket connect in progress\n",
525 			    __func__);
526 			error = EINPROGRESS;
527 			goto out;
528 	}
529 
530 	/*
531 	 * Find an available port for us to auto bind the local
532 	 * address.
533 	 */
534 	hvs_addr_set(&pcb->local_addr, 0);
535 
536 	for (i = previous_auto_bound_port - 1;
537 	    i != previous_auto_bound_port; i --) {
538 		if (i == MIN_PORT)
539 			i = MAX_PORT;
540 
541 		pcb->local_addr.hvs_port = i;
542 
543 		if (__hvs_find_socket_on_list(&pcb->local_addr,
544 		    HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) {
545 			found_auto_bound_port = true;
546 			previous_auto_bound_port = i;
547 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
548 			    "%s: found local bound port is %x\n",
549 			    __func__, pcb->local_addr.hvs_port);
550 			break;
551 		}
552 	}
553 
554 	if (found_auto_bound_port == true) {
555 		/* Found available port for auto bound, put on list */
556 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
557 		/* Set VM service ID */
558 		pcb->vm_srv_id = srv_id_template;
559 		set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port);
560 		/* Set host service ID and remote port */
561 		pcb->host_srv_id = srv_id_template;
562 		set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port);
563 		hvs_addr_set(&pcb->remote_addr, raddr->hvs_port);
564 
565 		/* Change the socket state to SS_ISCONNECTING */
566 		soisconnecting(so);
567 	} else {
568 		HVSOCK_DBG(HVSOCK_DBG_ERR,
569 		    "%s: No local port available for auto bound\n",
570 		    __func__);
571 		error = EADDRINUSE;
572 	}
573 
574 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is ");
575 	hvsock_print_guid(&pcb->vm_srv_id);
576 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is ");
577 	hvsock_print_guid(&pcb->host_srv_id);
578 
579 out:
580 	mtx_unlock(&hvs_trans_socks_mtx);
581 
582 	if (found_auto_bound_port == true)
583 		 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id);
584 
585 	return (error);
586 }
587 
588 int
589 hvs_trans_disconnect(struct socket *so)
590 {
591 	struct hvs_pcb *pcb;
592 
593 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
594 	    "%s: HyperV Socket hvs_trans_disconnect called\n", __func__);
595 
596 	(void) hvs_trans_lock();
597 	pcb = so2hvspcb(so);
598 	if (pcb == NULL) {
599 		hvs_trans_unlock();
600 		return (EINVAL);
601 	}
602 
603 	/* If socket is already disconnected, skip this */
604 	if ((so->so_state & SS_ISDISCONNECTED) == 0)
605 		soisdisconnecting(so);
606 
607 	hvs_trans_unlock();
608 
609 	return (0);
610 }
611 
/*
 * Argument bundle handed to hvsock_br_callback() through the opaque
 * callback pointer.
 */
struct hvs_callback_arg {
	struct uio *uio;	/* uio passed to uiomove() by the callback */
	struct sockbuf *sb;	/* if non-NULL, unlocked around uiomove() */
};
616 
/*
 * pr_soreceive handler: copy stream data from the channel's receive ring
 * into the caller's uio.
 *
 * Returns 0 or an errno value.  MSG_PEEK is rejected with EOPNOTSUPP; an
 * empty or non-read uio, a non-stream socket or a missing PCB yields
 * EINVAL.  paddr, mp0 and controlp are not used by this implementation.
 * Unless MSG_WAITALL is set, the call returns as soon as some data has
 * been copied.  With nothing available it either fails with EAGAIN
 * (non-blocking) or sleeps in sbwait().
 */
int
hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct sockbuf *sb;
	ssize_t orig_resid;
	uint32_t canread, to_read;
	int flags, error = 0;
	struct hvs_callback_arg cbarg;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_soreceive called\n", __func__);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (pcb == NULL)
		return (EINVAL);

	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if (flags & MSG_PEEK)
		return (EOPNOTSUPP);

	/* If no space to copy out anything */
	if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ)
		return (EINVAL);

	/* Remembered so that "did we copy anything?" can be answered later. */
	orig_resid = uio->uio_resid;

	/* Prevent other readers from entering the socket. */
	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (error) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: soiolock returned error = %d\n", __func__, error);
		return (error);
	}

	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	cbarg.uio = uio;
	cbarg.sb = sb;
	/*
	 * If the socket is closing, there might still be some data
	 * in rx br to read. However we need to make sure
	 * the channel is still open.
	 */
	if ((sb->sb_state & SBS_CANTRCVMORE) &&
	    (so->so_state & SS_ISDISCONNECTED)) {
		/* Other thread already closed the channel */
		error = EPIPE;
		goto out;
	}

	while (true) {
		/* Drain whatever is readable right now into the uio. */
		while (uio->uio_resid > 0 &&
		    (canread = hvsock_canread_check(pcb)) > 0) {
			to_read = MIN(canread, uio->uio_resid);
			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
			    "%s: to_read = %u, skip = %u\n", __func__, to_read,
			    (unsigned int)(sizeof(struct hvs_pkt_header) +
			    pcb->recv_data_off));

			error = vmbus_chan_recv_peek_call(pcb->chan, to_read,
			    sizeof(struct hvs_pkt_header) + pcb->recv_data_off,
			    hvsock_br_callback, (void *)&cbarg);
			/*
			 * It is possible the socket got disconnected because
			 * we released the lock in hvsock_br_callback. So we
			 * need to check the state to make sure it is not
			 * disconnected.
			 */
			if (error || so->so_state & SS_ISDISCONNECTED) {
				break;
			}

			/* Account for the bytes just consumed. */
			pcb->recv_data_len -= to_read;
			pcb->recv_data_off += to_read;
		}

		if (error)
			break;

		/* Abort if socket has reported problems. */
		if (so->so_error) {
			if (so->so_error == ESHUTDOWN &&
			    orig_resid > uio->uio_resid) {
				/*
				 * Although we got a FIN, we also received
				 * some data in this round. Deliver it
				 * to the user.
				 */
				error = 0;
			} else {
				if (so->so_error != ESHUTDOWN)
					error = so->so_error;
			}

			break;
		}

		/* Cannot receive more. */
		if (sb->sb_state & SBS_CANTRCVMORE)
			break;

		/* We are done if buffer has been filled */
		if (uio->uio_resid == 0)
			break;

		/* Without MSG_WAITALL a partial read is good enough. */
		if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid)
			break;

		/* Buffer ring is empty and we shall not block */
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			if (orig_resid == uio->uio_resid) {
				/* We have not read anything */
				error = EAGAIN;
			}
			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
			    "%s: non blocked read return, error %d.\n",
			    __func__, error);
			break;
		}

		/*
		 * Wait and block until (more) data comes in.
		 * Note: Drops the sockbuf lock during wait.
		 */
		error = sbwait(so, SO_RCV);

		if (error)
			break;

		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: wake up from sbwait, read available is %u\n",
		    __func__, vmbus_chan_read_available(pcb->chan));
	}

out:
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_RECV_UNLOCK(so);

	/* We received a FIN in this call */
	if (so->so_error == ESHUTDOWN) {
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			/* Send has already closed */
			soisdisconnecting(so);
		} else {
			/* Just close the receive side */
			socantrcvmore(so);
		}
	}

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: returning error = %d, so_error = %d\n",
	    __func__, error, so->so_error);

	return (error);
}
781 
/*
 * pr_sosend handler: write the caller's uio to the channel's send ring.
 *
 * Returns 0 or an errno value.  An empty or non-write uio, a non-stream
 * socket or a missing PCB yields EINVAL; a send side that can no longer
 * send (or a socket error of ESHUTDOWN) yields EPIPE.  Each pass writes at
 * most HVSOCK_SEND_BUF_SZ bytes.  When the ring has no room the loop stops
 * if some data has already been sent.  Otherwise it either fails with
 * EWOULDBLOCK (non-blocking) or sleeps in sbwait().  addr, top, controlp
 * and td are not used by this implementation.
 */
int
hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct sockbuf *sb;
	ssize_t orig_resid;
	uint32_t canwrite, to_write;
	int error = 0;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n",
	    __func__, uio->uio_resid);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (pcb == NULL)
		return (EINVAL);

	/* If nothing to send */
	if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE)
		return (EINVAL);

	/* Remembered so that "did we send anything?" can be answered later. */
	orig_resid = uio->uio_resid;

	/* Prevent other writers from entering the socket. */
	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: soiolocak returned error = %d\n", __func__, error);
		return (error);
	}

	sb = &so->so_snd;
	SOCKBUF_LOCK(sb);

	if ((sb->sb_state & SBS_CANTSENDMORE) ||
	    so->so_error == ESHUTDOWN) {
		error = EPIPE;
		goto out;
	}

	while (uio->uio_resid > 0) {
		canwrite = hvsock_canwrite_check(pcb);
		if (canwrite == 0) {
			/* We have sent some data */
			if (orig_resid > uio->uio_resid)
				break;
			/*
			 * We have not sent any data and it is
			 * non-blocked io
			 */
			if (so->so_state & SS_NBIO ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				error = EWOULDBLOCK;
				break;
			} else {
				/*
				 * We are here because there is no space on
				 * send buffer ring. Signal the other side
				 * to read and free more space.
				 * Sleep wait until space is available to send.
				 * Note: Drops the sockbuf lock during wait.
				 */
				error = sbwait(so, SO_SND);

				if (error)
					break;

				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
				    "%s: wake up from sbwait, space avail on "
				    "tx ring is %u\n",
				    __func__,
				    vmbus_chan_write_available(pcb->chan));

				continue;
			}
		}
		/* Clamp to ring space, remaining data and the per-send cap. */
		to_write = MIN(canwrite, uio->uio_resid);
		to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ);

		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: canwrite is %u, to_write = %u\n", __func__,
		    canwrite, to_write);
		error = hvsock_send_data(pcb->chan, uio, to_write, sb);

		if (error)
			break;
	}

out:
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_SEND_UNLOCK(so);

	return (error);
}
878 
879 int
880 hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam)
881 {
882 	struct hvs_pcb *pcb = so2hvspcb(so);
883 
884 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
885 	    "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__);
886 
887 	if (pcb == NULL)
888 		return (EINVAL);
889 
890 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT);
891 
892 	return ((*nam == NULL)? ENOMEM : 0);
893 }
894 
895 int
896 hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam)
897 {
898 	struct hvs_pcb *pcb = so2hvspcb(so);
899 
900 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
901 	    "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__);
902 
903 	if (pcb == NULL)
904 		return (EINVAL);
905 
906 	*nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT);
907 
908 	return ((*nam == NULL)? ENOMEM : 0);
909 }
910 
911 void
912 hvs_trans_close(struct socket *so)
913 {
914 	struct hvs_pcb *pcb;
915 
916 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
917 	    "%s: HyperV Socket hvs_trans_close called\n", __func__);
918 
919 	(void) hvs_trans_lock();
920 	pcb = so2hvspcb(so);
921 	if (!pcb) {
922 		hvs_trans_unlock();
923 		return;
924 	}
925 
926 	if (so->so_state & SS_ISCONNECTED) {
927 		/* Send a FIN to peer */
928 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
929 		    "%s: hvs_trans_close sending a FIN to host\n", __func__);
930 		(void) hvsock_send_data(pcb->chan, NULL, 0, NULL);
931 	}
932 
933 	if (so->so_state &
934 	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
935 		soisdisconnected(so);
936 
937 	pcb->chan = NULL;
938 	pcb->so = NULL;
939 
940 	if (SOLISTENING(so)) {
941 		mtx_lock(&hvs_trans_socks_mtx);
942 		/* Remove from bound list */
943 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
944 		mtx_unlock(&hvs_trans_socks_mtx);
945 	}
946 
947 	hvs_trans_unlock();
948 
949 	return;
950 }
951 
952 void
953 hvs_trans_abort(struct socket *so)
954 {
955 	struct hvs_pcb *pcb = so2hvspcb(so);
956 
957 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
958 	    "%s: HyperV Socket hvs_trans_abort called\n", __func__);
959 
960 	(void) hvs_trans_lock();
961 	if (pcb == NULL) {
962 		hvs_trans_unlock();
963 		return;
964 	}
965 
966 	if (SOLISTENING(so)) {
967 		mtx_lock(&hvs_trans_socks_mtx);
968 		/* Remove from bound list */
969 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
970 		mtx_unlock(&hvs_trans_socks_mtx);
971 	}
972 
973 	if (so->so_state & SS_ISCONNECTED) {
974 		(void) sodisconnect(so);
975 	}
976 	hvs_trans_unlock();
977 
978 	return;
979 }
980 
981 int
982 hvs_trans_shutdown(struct socket *so)
983 {
984 	struct hvs_pcb *pcb = so2hvspcb(so);
985 	struct sockbuf *sb;
986 
987 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
988 	    "%s: HyperV Socket hvs_trans_shutdown called\n", __func__);
989 
990 	if (pcb == NULL)
991 		return (EINVAL);
992 
993 	/*
994 	 * Only get called with the shutdown method is SHUT_WR or
995 	 * SHUT_RDWR.
996 	 * When the method is SHUT_RD or SHUT_RDWR, the caller
997 	 * already set the SBS_CANTRCVMORE on receive side socket
998 	 * buffer.
999 	 */
1000 	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1001 		/*
1002 		 * SHUT_WR only case.
1003 		 * Receive side is still open. Just close
1004 		 * the send side.
1005 		 */
1006 		socantsendmore(so);
1007 	} else {
1008 		/* SHUT_RDWR case */
1009 		if (so->so_state & SS_ISCONNECTED) {
1010 			/* Send a FIN to peer */
1011 			sb = &so->so_snd;
1012 			SOCKBUF_LOCK(sb);
1013 			(void) hvsock_send_data(pcb->chan, NULL, 0, sb);
1014 			SOCKBUF_UNLOCK(sb);
1015 
1016 			soisdisconnecting(so);
1017 		}
1018 	}
1019 
1020 	return (0);
1021 }
1022 
1023 /* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is
1024  * <port> (see struct sockaddr_hvs).
1025  *
1026  * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV:
1027  * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-
1028  * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with
1029  * the below sockaddr:
1030  *
1031  * struct SOCKADDR_HV
1032  * {
1033  *    ADDRESS_FAMILY Family;
1034  *    USHORT Reserved;
1035  *    GUID VmId;
1036  *    GUID ServiceId;
1037  * };
1038  * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via
1039  * VMBus, because here it's obvious the host and the VM can easily identify
1040  * each other. Though the VmID is useful on the host, especially in the case
1041  * of Windows container, FreeBSD VM doesn't need it at all.
1042  *
1043  * To be compatible with similar infrastructure in Linux VMs, we have
1044  * to limit the available GUID space of SOCKADDR_HV so that we can create
1045  * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID.
1046  * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
1047  *
1048  ****************************************************************************
1049  * The only valid Service GUIDs, from the perspectives of both the host and *
1050  * FreeBSD VM, that can be connected by the other end, must conform to this *
1051  * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
1052  ****************************************************************************
1053  *
1054  * When we write apps on the host to connect(), the GUID ServiceID is used.
1055  * When we write apps in FreeBSD VM to connect(), we only need to specify the
1056  * port and the driver will form the GUID and use that to request the host.
1057  *
 * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
 * auto-generated remote port for a connect request initiated by the host's
 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
 * FreeBSD guest.
1062  */
1063 
/*
 * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
 * restrict the HyperV socket ring buffer size to six 4K pages. Newer
 * HyperV hosts don't have this limit.
 */
/* Receive and send ring sizes: six pages each. */
#define HVS_RINGBUF_RCV_SIZE	(PAGE_SIZE * 6)
#define HVS_RINGBUF_SND_SIZE	(PAGE_SIZE * 6)
/* Largest ring size: 64 pages. */
#define HVS_RINGBUF_MAX_SIZE	(PAGE_SIZE * 64)
1072 
/*
 * Groups a device handle with a PCB pointer and a VMBus channel pointer.
 * NOTE(review): presumably the per-device softc of the hvsock driver; its
 * users are outside this excerpt -- confirm.
 */
struct hvsock_sc {
	device_t		dev;
	struct hvs_pcb		*pcb;
	struct vmbus_channel	*channel;
};
1078 
/*
 * Return true when the channel has at least one zero-payload packet's
 * worth of bytes (HVSOCK_PKT_LEN(0)) available to read.
 */
static bool
hvsock_chan_readable(struct vmbus_channel *chan)
{

	return (vmbus_chan_read_available(chan) >= HVSOCK_PKT_LEN(0));
}
1086 
1087 static void
1088 hvsock_chan_cb(struct vmbus_channel *chan, void *context)
1089 {
1090 	struct hvs_pcb *pcb = (struct hvs_pcb *) context;
1091 	struct socket *so;
1092 	uint32_t canwrite;
1093 
1094 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1095 	    "%s: host send us a wakeup on rb data, pcb = %p\n",
1096 	    __func__, pcb);
1097 
1098 	/*
1099 	 * Check if the socket is still attached and valid.
1100 	 * Here we know channel is still open. Need to make
1101 	 * sure the socket has not been closed or freed.
1102 	 */
1103 	(void) hvs_trans_lock();
1104 	so = hsvpcb2so(pcb);
1105 
1106 	if (pcb->chan != NULL && so != NULL) {
1107 		/*
1108 		 * Wake up reader if there are data to read.
1109 		 */
1110 		SOCKBUF_LOCK(&(so)->so_rcv);
1111 
1112 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1113 		    "%s: read available = %u\n", __func__,
1114 		    vmbus_chan_read_available(pcb->chan));
1115 
1116 		if (hvsock_chan_readable(pcb->chan))
1117 			sorwakeup_locked(so);
1118 		else
1119 			SOCKBUF_UNLOCK(&(so)->so_rcv);
1120 
1121 		/*
1122 		 * Wake up sender if space becomes available to write.
1123 		 */
1124 		SOCKBUF_LOCK(&(so)->so_snd);
1125 		canwrite = hvsock_canwrite_check(pcb);
1126 
1127 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1128 		    "%s: canwrite = %u\n", __func__, canwrite);
1129 
1130 		if (canwrite > 0) {
1131 			sowwakeup_locked(so);
1132 		} else {
1133 			SOCKBUF_UNLOCK(&(so)->so_snd);
1134 		}
1135 	}
1136 
1137 	hvs_trans_unlock();
1138 
1139 	return;
1140 }
1141 
1142 static int
1143 hvsock_br_callback(void *datap, int cplen, void *cbarg)
1144 {
1145 	struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg;
1146 	struct uio *uio = arg->uio;
1147 	struct sockbuf *sb = arg->sb;
1148 	int error = 0;
1149 
1150 	if (cbarg == NULL || datap == NULL)
1151 		return (EINVAL);
1152 
1153 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1154 	    "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, "
1155 	    "datap = %p\n",
1156 	    __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br",
1157 	    uio->uio_resid, cplen, datap);
1158 
1159 	if (sb)
1160 		SOCKBUF_UNLOCK(sb);
1161 
1162 	error = uiomove(datap, cplen, uio);
1163 
1164 	if (sb)
1165 		SOCKBUF_LOCK(sb);
1166 
1167 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1168 	    "%s: after uiomove, uio_resid = %zd, error = %d\n",
1169 	    __func__, uio->uio_resid, error);
1170 
1171 	return (error);
1172 }
1173 
1174 static int
1175 hvsock_send_data(struct vmbus_channel *chan, struct uio *uio,
1176     uint32_t to_write, struct sockbuf *sb)
1177 {
1178 	struct hvs_pkt_header hvs_pkt;
1179 	int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0;
1180 	uint64_t pad = 0;
1181 	struct iovec iov[3];
1182 	struct hvs_callback_arg cbarg;
1183 
1184 	if (chan == NULL)
1185 		return (ENOTCONN);
1186 
1187 	hlen = sizeof(struct vmbus_chanpkt_hdr);
1188 	hvs_pkthlen = sizeof(struct hvs_pkt_header);
1189 	hvs_pktlen = hvs_pkthlen + to_write;
1190 	pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen);
1191 
1192 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1193 	    "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, "
1194 	    "pad_pktlen = %u, data_len = %u\n",
1195 	    __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write);
1196 
1197 	hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND;
1198 	hvs_pkt.chan_pkt_hdr.cph_flags = 0;
1199 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen);
1200 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen);
1201 	hvs_pkt.chan_pkt_hdr.cph_xactid = 0;
1202 
1203 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1;
1204 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write;
1205 
1206 	cbarg.uio = uio;
1207 	cbarg.sb = sb;
1208 
1209 	if (uio && to_write > 0) {
1210 		iov[0].iov_base = &hvs_pkt;
1211 		iov[0].iov_len = hvs_pkthlen;
1212 		iov[1].iov_base = NULL;
1213 		iov[1].iov_len = to_write;
1214 		iov[2].iov_base = &pad;
1215 		iov[2].iov_len = pad_pktlen - hvs_pktlen;
1216 
1217 		error = vmbus_chan_iov_send(chan, iov, 3,
1218 		    hvsock_br_callback, &cbarg);
1219 	} else {
1220 		if (to_write == 0) {
1221 			iov[0].iov_base = &hvs_pkt;
1222 			iov[0].iov_len = hvs_pkthlen;
1223 			iov[1].iov_base = &pad;
1224 			iov[1].iov_len = pad_pktlen - hvs_pktlen;
1225 			error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL);
1226 		}
1227 	}
1228 
1229 	if (error) {
1230 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1231 		    "%s: error = %d\n", __func__, error);
1232 	}
1233 
1234 	return (error);
1235 }
1236 
1237 /*
1238  * Check if we have data on current ring buffer to read
1239  * or not. If not, advance the ring buffer read index to
1240  * next packet. Update the recev_data_len and recev_data_off
1241  * to new value.
1242  * Return the number of bytes can read.
1243  */
1244 static uint32_t
1245 hvsock_canread_check(struct hvs_pcb *pcb)
1246 {
1247 	uint32_t advance;
1248 	uint32_t tlen, hlen, dlen;
1249 	uint32_t bytes_canread = 0;
1250 	int error;
1251 
1252 	if (pcb == NULL || pcb->chan == NULL) {
1253 		pcb->so->so_error = EIO;
1254 		return (0);
1255 	}
1256 
1257 	/* Still have data not read yet on current packet */
1258 	if (pcb->recv_data_len > 0)
1259 		return (pcb->recv_data_len);
1260 
1261 	if (pcb->rb_init)
1262 		advance =
1263 		    VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1264 	else
1265 		advance = 0;
1266 
1267 	bytes_canread = vmbus_chan_read_available(pcb->chan);
1268 
1269 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1270 	    "%s: bytes_canread on br = %u, advance = %u\n",
1271 	    __func__, bytes_canread, advance);
1272 
1273 	if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) {
1274 		/*
1275 		 * Nothing to read. Need to advance the rindex before
1276 		 * calling sbwait, so host knows to wake us up when data
1277 		 * is available to read on rb.
1278 		 */
1279 		error = vmbus_chan_recv_idxadv(pcb->chan, advance);
1280 		if (error) {
1281 			HVSOCK_DBG(HVSOCK_DBG_ERR,
1282 			    "%s: after calling vmbus_chan_recv_idxadv, "
1283 			    "got error = %d\n",  __func__, error);
1284 			return (0);
1285 		} else {
1286 			pcb->rb_init = false;
1287 			pcb->recv_data_len = 0;
1288 			pcb->recv_data_off = 0;
1289 			bytes_canread = vmbus_chan_read_available(pcb->chan);
1290 
1291 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1292 			    "%s: advanced %u bytes, "
1293 			    " bytes_canread on br now = %u\n",
1294 			    __func__, advance, bytes_canread);
1295 
1296 			if (bytes_canread == 0)
1297 				return (0);
1298 			else
1299 				advance = 0;
1300 		}
1301 	}
1302 
1303 	if (bytes_canread <
1304 	    advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t)))
1305 		return (0);
1306 
1307 	error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt,
1308 	    sizeof(struct hvs_pkt_header), advance);
1309 
1310 	/* Don't have anything to read */
1311 	if (error) {
1312 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1313 		    "%s: after calling vmbus_chan_recv_peek, got error = %d\n",
1314 		    __func__, error);
1315 		return (0);
1316 	}
1317 
1318 	/*
1319 	 * We just read in a new packet header. Do some sanity checks.
1320 	 */
1321 	tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1322 	hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen);
1323 	dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size;
1324 	if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) ||
1325 	    __predict_false(hlen > tlen) ||
1326 	    __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) {
1327 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1328 		    "invalid tlen(%u), hlen(%u) or dlen(%u)\n",
1329 		    tlen, hlen, dlen);
1330 		pcb->so->so_error = EIO;
1331 		return (0);
1332 	}
1333 	if (pcb->rb_init == false)
1334 		pcb->rb_init = true;
1335 
1336 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1337 	    "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n",
1338 	    tlen, hlen, dlen);
1339 
1340 	/* The other side has sent a close FIN */
1341 	if (dlen == 0) {
1342 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1343 		    "%s: Received FIN from other side\n", __func__);
1344 		/* inform the caller by seting so_error to ESHUTDOWN */
1345 		pcb->so->so_error = ESHUTDOWN;
1346 	}
1347 
1348 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1349 	    "%s: canread on receive ring is %u \n", __func__, dlen);
1350 
1351 	pcb->recv_data_len = dlen;
1352 	pcb->recv_data_off = 0;
1353 
1354 	return (pcb->recv_data_len);
1355 }
1356 
1357 static uint32_t
1358 hvsock_canwrite_check(struct hvs_pcb *pcb)
1359 {
1360 	uint32_t writeable;
1361 	uint32_t ret;
1362 
1363 	if (pcb == NULL || pcb->chan == NULL)
1364 		return (0);
1365 
1366 	writeable = vmbus_chan_write_available(pcb->chan);
1367 
1368 	/*
1369 	 * We must always reserve a 0-length-payload packet for the FIN.
1370 	 */
1371 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1372 	    "%s: writeable is %u, should be greater than %ju\n",
1373 	    __func__, writeable,
1374 	    (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)));
1375 
1376 	if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) {
1377 		/*
1378 		 * The Tx ring seems full.
1379 		 */
1380 		return (0);
1381 	}
1382 
1383 	ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0);
1384 
1385 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1386 	    "%s: available size is %u\n", __func__, rounddown2(ret, 8));
1387 
1388 	return (rounddown2(ret, 8));
1389 }
1390 
1391 static void
1392 hvsock_set_chan_pending_send_size(struct vmbus_channel *chan)
1393 {
1394 	vmbus_chan_set_pending_send_size(chan,
1395 	    HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ));
1396 }
1397 
1398 static int
1399 hvsock_open_channel(struct vmbus_channel *chan, struct socket *so)
1400 {
1401 	unsigned int rcvbuf, sndbuf;
1402 	struct hvs_pcb *pcb = so2hvspcb(so);
1403 	int ret;
1404 
1405 	if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) {
1406 		sndbuf = HVS_RINGBUF_SND_SIZE;
1407 		rcvbuf = HVS_RINGBUF_RCV_SIZE;
1408 	} else {
1409 		sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE);
1410 		sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE);
1411 		sndbuf = rounddown2(sndbuf, PAGE_SIZE);
1412 		rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE);
1413 		rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE);
1414 		rcvbuf = rounddown2(rcvbuf, PAGE_SIZE);
1415 	}
1416 
1417 	/*
1418 	 * Can only read whatever user provided size of data
1419 	 * from ring buffer. Turn off batched reading.
1420 	 */
1421 	vmbus_chan_set_readbatch(chan, false);
1422 
1423 	ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0,
1424 	    hvsock_chan_cb, pcb);
1425 
1426 	if (ret != 0) {
1427 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1428 		    "%s: failed to open hvsock channel, sndbuf = %u, "
1429 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1430 	} else {
1431 		HVSOCK_DBG(HVSOCK_DBG_INFO,
1432 		    "%s: hvsock channel opened, sndbuf = %u, i"
1433 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1434 		/*
1435 		 * Se the pending send size so to receive wakeup
1436 		 * signals from host when there is enough space on
1437 		 * rx buffer ring to write.
1438 		 */
1439 		hvsock_set_chan_pending_send_size(chan);
1440 	}
1441 
1442 	return ret;
1443 }
1444 
1445 /*
1446  * Guest is listening passively on the socket. Open channel and
1447  * create a new socket for the conneciton.
1448  */
1449 static void
1450 hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so,
1451     struct hvsock_sc *sc)
1452 {
1453 	struct socket *new_so;
1454 	struct hvs_pcb *new_pcb, *pcb;
1455 	int error;
1456 
1457 	/* Do nothing if socket is not listening */
1458 	if (!SOLISTENING(so)) {
1459 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1460 		    "%s: socket is not a listening one\n", __func__);
1461 		return;
1462 	}
1463 
1464 	/*
1465 	 * Create a new socket. This will call pru_attach to complete
1466 	 * the socket initialization and put the new socket onto
1467 	 * listening socket's sol_incomp list, waiting to be promoted
1468 	 * to sol_comp list.
1469 	 * The new socket created has ref count 0. There is no other
1470 	 * thread that changes the state of this new one at the
1471 	 * moment, so we don't need to hold its lock while opening
1472 	 * channel and filling out its pcb information.
1473 	 */
1474 	new_so = sonewconn(so, 0);
1475 	if (!new_so)
1476 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1477 		    "%s: creating new socket failed\n", __func__);
1478 
1479 	/*
1480 	 * Now open the vmbus channel. If it fails, the socket will be
1481 	 * on the listening socket's sol_incomp queue until it is
1482 	 * replaced and aborted.
1483 	 */
1484 	error = hvsock_open_channel(chan, new_so);
1485 	if (error) {
1486 		new_so->so_error = error;
1487 		return;
1488 	}
1489 
1490 	pcb = so->so_pcb;
1491 	new_pcb = new_so->so_pcb;
1492 
1493 	hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port);
1494 	/* Remote port is unknown to guest in this type of conneciton */
1495 	hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN);
1496 	new_pcb->chan = chan;
1497 	new_pcb->recv_data_len = 0;
1498 	new_pcb->recv_data_off = 0;
1499 	new_pcb->rb_init = false;
1500 
1501 	new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan);
1502 	new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan);
1503 
1504 	hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED);
1505 
1506 	sc->pcb = new_pcb;
1507 
1508 	/*
1509 	 * Change the socket state to SS_ISCONNECTED. This will promote
1510 	 * the socket to sol_comp queue and wake up the thread which
1511 	 * is accepting connection.
1512 	 */
1513 	soisconnected(new_so);
1514 }
1515 
1516 
1517 /*
1518  * Guest is actively connecting to host.
1519  */
1520 static void
1521 hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so)
1522 {
1523 	struct hvs_pcb *pcb;
1524 	int error;
1525 
1526 	error = hvsock_open_channel(chan, so);
1527 	if (error) {
1528 		so->so_error = error;
1529 		return;
1530 	}
1531 
1532 	pcb = so->so_pcb;
1533 	pcb->chan = chan;
1534 	pcb->recv_data_len = 0;
1535 	pcb->recv_data_off = 0;
1536 	pcb->rb_init = false;
1537 
1538 	mtx_lock(&hvs_trans_socks_mtx);
1539 	__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
1540 	__hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED);
1541 	mtx_unlock(&hvs_trans_socks_mtx);
1542 
1543 	/*
1544 	 * Change the socket state to SS_ISCONNECTED. This will wake up
1545 	 * the thread sleeping in connect call.
1546 	 */
1547 	soisconnected(so);
1548 }
1549 
1550 static void
1551 hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc)
1552 {
1553 	struct hyperv_guid *inst_guid, *type_guid;
1554 	bool conn_from_host;
1555 	struct sockaddr_hvs addr;
1556 	struct socket *so;
1557 	struct hvs_pcb *pcb;
1558 
1559 	type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan);
1560 	inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan);
1561 	conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan);
1562 
1563 	HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is ");
1564 	hvsock_print_guid(type_guid);
1565 	HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is ");
1566 	hvsock_print_guid(inst_guid);
1567 	HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n",
1568 	    (conn_from_host == true ) ? "from" : "to");
1569 
1570 	/*
1571 	 * The listening port should be in [0, MAX_LISTEN_PORT]
1572 	 */
1573 	if (!is_valid_srv_id(type_guid))
1574 		return;
1575 
1576 	/*
1577 	 * There should be a bound socket already created no matter
1578 	 * it is a passive or active connection.
1579 	 * For host initiated connection (passive on guest side),
1580 	 * the  type_guid contains the port which guest is bound and
1581 	 * listening.
1582 	 * For the guest initiated connection (active on guest side),
1583 	 * the inst_guid contains the port that guest has auto bound
1584 	 * to.
1585 	 */
1586 	hvs_addr_init(&addr, conn_from_host ? type_guid : inst_guid);
1587 	so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND);
1588 	if (!so) {
1589 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1590 		    "%s: no bound socket found for port %u\n",
1591 		    __func__, addr.hvs_port);
1592 		return;
1593 	}
1594 
1595 	if (conn_from_host) {
1596 		hvsock_open_conn_passive(chan, so, sc);
1597 	} else {
1598 		(void) hvs_trans_lock();
1599 		pcb = so->so_pcb;
1600 		if (pcb && pcb->so) {
1601 			sc->pcb = so2hvspcb(so);
1602 			hvsock_open_conn_active(chan, so);
1603 		} else {
1604 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1605 			    "%s: channel detached before open\n", __func__);
1606 		}
1607 		hvs_trans_unlock();
1608 	}
1609 
1610 }
1611 
1612 static int
1613 hvsock_probe(device_t dev)
1614 {
1615 	struct vmbus_channel *channel = vmbus_get_channel(dev);
1616 
1617 	if (!channel || !vmbus_chan_is_hvs(channel)) {
1618 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1619 		    "hvsock_probe called but not a hvsock channel id %u\n",
1620 		    vmbus_chan_id(channel));
1621 
1622 		return ENXIO;
1623 	} else {
1624 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1625 		    "hvsock_probe got a hvsock channel id %u\n",
1626 		    vmbus_chan_id(channel));
1627 
1628 		return BUS_PROBE_DEFAULT;
1629 	}
1630 }
1631 
1632 static int
1633 hvsock_attach(device_t dev)
1634 {
1635 	struct vmbus_channel *channel = vmbus_get_channel(dev);
1636 	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1637 
1638 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n");
1639 
1640 	hvsock_open_connection(channel, sc);
1641 
1642 	/*
1643 	 * Always return success. On error the host will rescind the device
1644 	 * in 30 seconds and we can do cleanup at that time in
1645 	 * vmbus_chan_msgproc_chrescind().
1646 	 */
1647 	return (0);
1648 }
1649 
1650 static int
1651 hvsock_detach(device_t dev)
1652 {
1653 	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1654 	struct socket *so;
1655 	int retry;
1656 
1657 	if (bootverbose)
1658 		device_printf(dev, "hvsock_detach called.\n");
1659 
1660 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n");
1661 
1662 	if (sc->pcb != NULL) {
1663 		(void) hvs_trans_lock();
1664 
1665 		so = hsvpcb2so(sc->pcb);
1666 		if (so) {
1667 			/* Close the connection */
1668 			if (so->so_state &
1669 			    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
1670 				soisdisconnected(so);
1671 		}
1672 
1673 		mtx_lock(&hvs_trans_socks_mtx);
1674 		__hvs_remove_pcb_from_list(sc->pcb,
1675 		    HVS_LIST_BOUND | HVS_LIST_CONNECTED);
1676 		mtx_unlock(&hvs_trans_socks_mtx);
1677 
1678 		/*
1679 		 * Close channel while no reader and sender are working
1680 		 * on the buffer rings.
1681 		 */
1682 		if (so) {
1683 			retry = 0;
1684 			while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) {
1685 				/*
1686 				 * Someone is reading, rx br is busy
1687 				 */
1688 				soisdisconnected(so);
1689 				DELAY(500);
1690 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1691 				    "waiting for rx reader to exit, "
1692 				    "retry = %d\n", retry++);
1693 			}
1694 			retry = 0;
1695 			while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) {
1696 				/*
1697 				 * Someone is sending, tx br is busy
1698 				 */
1699 				soisdisconnected(so);
1700 				DELAY(500);
1701 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1702 				    "waiting for tx sender to exit, "
1703 				    "retry = %d\n", retry++);
1704 			}
1705 		}
1706 
1707 
1708 		bzero(sc->pcb, sizeof(struct hvs_pcb));
1709 		free(sc->pcb, M_HVSOCK);
1710 		sc->pcb = NULL;
1711 
1712 		if (so) {
1713 			SOCK_IO_RECV_UNLOCK(so);
1714 			SOCK_IO_SEND_UNLOCK(so);
1715 			so->so_pcb = NULL;
1716 		}
1717 
1718 		hvs_trans_unlock();
1719 	}
1720 
1721 	vmbus_chan_close(vmbus_get_channel(dev));
1722 
1723 	return (0);
1724 }
1725 
/*
 * Device method table: binds the probe, attach and detach device methods
 * to this driver's handlers. Referenced from hvsock_driver.
 */
1726 static device_method_t hvsock_methods[] = {
1727 	/* Device interface */
1728 	DEVMETHOD(device_probe, hvsock_probe),
1729 	DEVMETHOD(device_attach, hvsock_attach),
1730 	DEVMETHOD(device_detach, hvsock_detach),
1731 	DEVMETHOD_END
1732 };
1733 
/*
 * Driver description passed to DRIVER_MODULE().
 * NOTE(review): fields are positional; presumably driver name, method
 * table and softc size, in that order - confirm against driver_t.
 */
1734 static driver_t hvsock_driver = {
1735 	"hv_sock",
1736 	hvsock_methods,
1737 	sizeof(struct hvsock_sc)	/* matches the type device_get_softc() is cast to */
1738 };
1739 
/*
 * Module glue: declares the hvsock driver with vmbus named as its bus,
 * gives the module version 1, and declares a dependency on the vmbus
 * module.
 * NOTE(review): the three MODULE_DEPEND numbers are presumably the
 * minimum, preferred and maximum vmbus versions - confirm.
 */
1740 DRIVER_MODULE(hvsock, vmbus, hvsock_driver, NULL, NULL);
1741 MODULE_VERSION(hvsock, 1);
1742 MODULE_DEPEND(hvsock, vmbus, 1, 1, 1);
1743