xref: /freebsd/sys/dev/hyperv/hvsock/hv_sock.c (revision 10aa369afd9946da18ae51b07aeadc3314fba56d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2020 Microsoft Corp.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/bus.h>
34 #include <sys/domain.h>
35 #include <sys/lock.h>
36 #include <sys/kernel.h>
37 #include <sys/types.h>
38 #include <sys/malloc.h>
39 #include <sys/module.h>
40 #include <sys/mutex.h>
41 #include <sys/proc.h>
42 #include <sys/protosw.h>
43 #include <sys/socket.h>
44 #include <sys/sysctl.h>
45 #include <sys/sysproto.h>
46 #include <sys/systm.h>
47 #include <sys/sockbuf.h>
48 #include <sys/sx.h>
49 #include <sys/uio.h>
50 
51 #include <net/vnet.h>
52 
53 #include <dev/hyperv/vmbus/vmbus_reg.h>
54 
55 #include "hv_sock.h"
56 
/*
 * Debug verbosity levels compared against hvs_dbg_level by HVSOCK_DBG().
 * A message tagged with level L is printed when hvs_dbg_level >= L, so with
 * this numeric ordering a setting of 1 prints INFO messages only, 2 adds ERR
 * messages and 3 prints everything.
 */
#define HVSOCK_DBG_NONE			0x0
#define HVSOCK_DBG_INFO			0x1
#define HVSOCK_DBG_ERR			0x2
#define HVSOCK_DBG_VERBOSE		0x3


/* Sysctl subtree net.hvsock. */
SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket");

/* Current debug level, exported as net.hvsock.hvs_dbg_level (CTLFLAG_RWTUN). */
static int hvs_dbg_level;
SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level,
    0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose");


/*
 * printf() the message when the configured debug level is at least 'level'.
 * The arguments are only evaluated when the message is actually printed.
 */
#define HVSOCK_DBG(level, ...) do {					\
	if (hvs_dbg_level >= (level))					\
		printf(__VA_ARGS__);					\
	} while (0)

/* malloc(9) type used for the per-socket control blocks (struct hvs_pcb). */
MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures");
76 
/* Domain probe hook, referenced by hv_socket_domain below. */
static int hvs_dom_probe(void);

/* The MTU is 16KB per host side's design */
#define HVSOCK_MTU_SIZE		(1024 * 16)
/* Upper bound on the payload handed to hvsock_send_data() in one call. */
#define HVSOCK_SEND_BUF_SZ	(PAGE_SIZE - sizeof(struct vmpipe_proto_header))

/* Size of the per-packet header that precedes the payload. */
#define HVSOCK_HEADER_LEN	(sizeof(struct hvs_pkt_header))

/*
 * Total length accounted for a packet carrying 'payload_len' bytes:
 * the packet header, the payload rounded up to a multiple of 8 bytes and
 * one extra uint64_t.
 * NOTE(review): the extra uint64_t is presumably the ring buffer's
 * per-packet trailer -- confirm against the vmbus ring buffer code.
 */
#define HVSOCK_PKT_LEN(payload_len)	(HVSOCK_HEADER_LEN + \
					 roundup2(payload_len, 8) + \
					 sizeof(uint64_t))
88 
/*
 * HyperV Transport sockets
 *
 * Protocol switch for the single SOCK_STREAM protocol of the AF_HYPERV
 * domain.  The protocol supplies its own pr_soreceive/pr_sosend handlers
 * (hvs_trans_soreceive/hvs_trans_sosend) together with the hvs_trans_*
 * handlers for the remaining socket operations listed below.
 */
static struct protosw hv_socket_protosw = {
	.pr_type =		SOCK_STREAM,
	.pr_protocol =		HYPERV_SOCK_PROTO_TRANS,
	.pr_flags =		PR_CONNREQUIRED,
	.pr_attach =		hvs_trans_attach,
	.pr_bind =		hvs_trans_bind,
	.pr_listen =		hvs_trans_listen,
	.pr_accept =		hvs_trans_accept,
	.pr_connect =		hvs_trans_connect,
	.pr_peeraddr =		hvs_trans_peeraddr,
	.pr_sockaddr =		hvs_trans_sockaddr,
	.pr_soreceive =		hvs_trans_soreceive,
	.pr_sosend =		hvs_trans_sosend,
	.pr_disconnect =	hvs_trans_disconnect,
	.pr_close =		hvs_trans_close,
	.pr_detach =		hvs_trans_detach,
	.pr_shutdown =		hvs_trans_shutdown,
	.pr_abort =		hvs_trans_abort,
};

/*
 * The AF_HYPERV domain: one protocol, with hvs_dom_probe() as the probe
 * hook (it returns ENXIO when not running as a Hyper-V guest).
 */
static struct domain		hv_socket_domain = {
	.dom_family =		AF_HYPERV,
	.dom_name =		"hyperv",
	.dom_probe =		hvs_dom_probe,
	.dom_nprotosw =		1,
	.dom_protosw =		{ &hv_socket_protosw },
};

/* NOTE(review): presumably registers hv_socket_domain with the kernel. */
DOMAIN_SET(hv_socket_);
121 
/* Bounds used when scanning for a local port to auto-bind in connect(). */
#define MAX_PORT			((uint32_t)0xFFFFFFFF)
#define MIN_PORT			((uint32_t)0x0)

/*
 * Template service GUID.  The first four bytes are a placeholder that
 * set_port_by_srv_id() overwrites with the port number; the remaining
 * twelve bytes are what is_valid_srv_id() compares against.
 */
/* 00000000-facb-11e6-bd58-64006a7986d3 */
static const struct hyperv_guid srv_id_template = {
	.hv_guid = {
	    0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11,
	    0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 }
};
131 
/* Forward declarations of helpers defined later in this file. */
static int		hvsock_br_callback(void *, int, void *);
static uint32_t		hvsock_canread_check(struct hvs_pcb *);
static uint32_t		hvsock_canwrite_check(struct hvs_pcb *);
static int		hvsock_send_data(struct vmbus_channel *chan,
    struct uio *uio, uint32_t to_write, struct sockbuf *sb);



/* Globals */
/* Exclusive lock taken by hvs_trans_lock()/hvs_trans_unlock(). */
static struct sx		hvs_trans_socks_sx;
/* Mutex held around accesses to the two pcb lists below. */
static struct mtx		hvs_trans_socks_mtx;
/* pcbs linked through bound_next / connected_next respectively. */
static LIST_HEAD(, hvs_pcb)	hvs_trans_bound_socks;
static LIST_HEAD(, hvs_pcb)	hvs_trans_connected_socks;
/* Last port chosen by the auto-bind scan in hvs_trans_connect(). */
static uint32_t			previous_auto_bound_port;
146 
147 static void
148 hvsock_print_guid(struct hyperv_guid *guid)
149 {
150 	unsigned char *p = (unsigned char *)guid;
151 
152 	HVSOCK_DBG(HVSOCK_DBG_INFO,
153 	    "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n",
154 	    *(unsigned int *)p,
155 	    *((unsigned short *) &p[4]),
156 	    *((unsigned short *) &p[6]),
157 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
158 }
159 
160 static bool
161 is_valid_srv_id(const struct hyperv_guid *id)
162 {
163 	return !memcmp(&id->hv_guid[4],
164 	    &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4);
165 }
166 
/*
 * Extract the port number stored in the leading bytes of a service GUID
 * (read as a native unsigned int).
 */
static unsigned int
get_port_by_srv_id(const struct hyperv_guid *srv_id)
{
	const unsigned int *port_field;

	port_field = (const unsigned int *)srv_id;
	return (*port_field);
}
172 
/*
 * Store 'port' into the leading bytes of a service GUID (written as a
 * native unsigned int); the rest of the GUID is left untouched.
 */
static void
set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port)
{
	unsigned int *port_field;

	port_field = (unsigned int *)srv_id;
	*port_field = port;
}
178 
179 
180 static void
181 __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list)
182 {
183 	struct hvs_pcb *p = NULL;
184 
185 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
186 
187 	if (!pcb)
188 		return;
189 
190 	if (list & HVS_LIST_BOUND) {
191 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
192 			if  (p == pcb)
193 				LIST_REMOVE(p, bound_next);
194 	}
195 
196 	if (list & HVS_LIST_CONNECTED) {
197 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
198 			if (p == pcb)
199 				LIST_REMOVE(pcb, connected_next);
200 	}
201 }
202 
203 static void
204 __hvs_remove_socket_from_list(struct socket *so, unsigned char list)
205 {
206 	struct hvs_pcb *pcb = so2hvspcb(so);
207 
208 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
209 
210 	__hvs_remove_pcb_from_list(pcb, list);
211 }
212 
213 static void
214 __hvs_insert_socket_on_list(struct socket *so, unsigned char list)
215 {
216 	struct hvs_pcb *pcb = so2hvspcb(so);
217 
218 	if (list & HVS_LIST_BOUND)
219 		LIST_INSERT_HEAD(&hvs_trans_bound_socks,
220 		   pcb, bound_next);
221 
222 	if (list & HVS_LIST_CONNECTED)
223 		LIST_INSERT_HEAD(&hvs_trans_connected_socks,
224 		   pcb, connected_next);
225 }
226 
227 void
228 hvs_remove_socket_from_list(struct socket *so, unsigned char list)
229 {
230 	if (!so || !so->so_pcb) {
231 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
232 		    "%s: socket or so_pcb is null\n", __func__);
233 		return;
234 	}
235 
236 	mtx_lock(&hvs_trans_socks_mtx);
237 	__hvs_remove_socket_from_list(so, list);
238 	mtx_unlock(&hvs_trans_socks_mtx);
239 }
240 
241 static void
242 hvs_insert_socket_on_list(struct socket *so, unsigned char list)
243 {
244 	if (!so || !so->so_pcb) {
245 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
246 		    "%s: socket or so_pcb is null\n", __func__);
247 		return;
248 	}
249 
250 	mtx_lock(&hvs_trans_socks_mtx);
251 	__hvs_insert_socket_on_list(so, list);
252 	mtx_unlock(&hvs_trans_socks_mtx);
253 }
254 
255 static struct socket *
256 __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
257 {
258 	struct hvs_pcb *p = NULL;
259 
260 	if (list & HVS_LIST_BOUND)
261 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
262 			if (p->so != NULL &&
263 			    addr->hvs_port == p->local_addr.hvs_port)
264 				return p->so;
265 
266 	if (list & HVS_LIST_CONNECTED)
267 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
268 			if (p->so != NULL &&
269 			    addr->hvs_port == p->local_addr.hvs_port)
270 				return p->so;
271 
272 	return NULL;
273 }
274 
275 static struct socket *
276 hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
277 {
278 	struct socket *s = NULL;
279 
280 	mtx_lock(&hvs_trans_socks_mtx);
281 	s = __hvs_find_socket_on_list(addr, list);
282 	mtx_unlock(&hvs_trans_socks_mtx);
283 
284 	return s;
285 }
286 
287 static inline void
288 hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port)
289 {
290 	memset(addr, 0, sizeof(*addr));
291 	addr->sa_family = AF_HYPERV;
292 	addr->sa_len = sizeof(*addr);
293 	addr->hvs_port = port;
294 }
295 
/*
 * Initialize 'addr' as an AF_HYPERV address whose port is the one encoded
 * in the service GUID 'svr_id'.
 */
void
hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id)
{
	unsigned int port;

	port = get_port_by_srv_id(svr_id);
	hvs_addr_set(addr, port);
}
301 
302 int
303 hvs_trans_lock(void)
304 {
305 	sx_xlock(&hvs_trans_socks_sx);
306 	return (0);
307 }
308 
309 void
310 hvs_trans_unlock(void)
311 {
312 	sx_xunlock(&hvs_trans_socks_sx);
313 }
314 
315 static int
316 hvs_dom_probe(void)
317 {
318 
319 	/* Don't even give us a chance to attach on non-HyperV. */
320 	if (vm_guest != VM_GUEST_HV)
321 		return (ENXIO);
322 	return (0);
323 }
324 
325 static void
326 hvs_trans_init(void *arg __unused)
327 {
328 
329 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
330 	    "%s: HyperV Socket hvs_trans_init called\n", __func__);
331 
332 	/* Initialize Globals */
333 	previous_auto_bound_port = MAX_PORT;
334 	sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx");
335 	mtx_init(&hvs_trans_socks_mtx,
336 	    "hvs_trans_socks_mtx", NULL, MTX_DEF);
337 	LIST_INIT(&hvs_trans_bound_socks);
338 	LIST_INIT(&hvs_trans_connected_socks);
339 }
340 SYSINIT(hvs_trans_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
341     hvs_trans_init, NULL);
342 
343 /*
344  * Called in two cases:
345  * 1) When user calls socket();
346  * 2) When we accept new incoming conneciton and call sonewconn().
347  */
348 int
349 hvs_trans_attach(struct socket *so, int proto, struct thread *td)
350 {
351 	struct hvs_pcb *pcb = so2hvspcb(so);
352 
353 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
354 	    "%s: HyperV Socket hvs_trans_attach called\n", __func__);
355 
356 	if (so->so_type != SOCK_STREAM)
357 		return (ESOCKTNOSUPPORT);
358 
359 	if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS)
360 		return (EPROTONOSUPPORT);
361 
362 	if (pcb != NULL)
363 		return (EISCONN);
364 	pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO);
365 	if (pcb == NULL)
366 		return (ENOMEM);
367 
368 	pcb->so = so;
369 	so->so_pcb = (void *)pcb;
370 
371 	return (0);
372 }
373 
374 void
375 hvs_trans_detach(struct socket *so)
376 {
377 	struct hvs_pcb *pcb;
378 
379 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
380 	    "%s: HyperV Socket hvs_trans_detach called\n", __func__);
381 
382 	(void) hvs_trans_lock();
383 	pcb = so2hvspcb(so);
384 	if (pcb == NULL) {
385 		hvs_trans_unlock();
386 		return;
387 	}
388 
389 	if (SOLISTENING(so)) {
390 		bzero(pcb, sizeof(*pcb));
391 		free(pcb, M_HVSOCK);
392 	}
393 
394 	so->so_pcb = NULL;
395 
396 	hvs_trans_unlock();
397 }
398 
399 int
400 hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td)
401 {
402 	struct hvs_pcb *pcb = so2hvspcb(so);
403 	struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr;
404 	int error = 0;
405 
406 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
407 	    "%s: HyperV Socket hvs_trans_bind called\n", __func__);
408 
409 	if (sa == NULL) {
410 		return (EINVAL);
411 	}
412 
413 	if (pcb == NULL) {
414 		return (EINVAL);
415 	}
416 
417 	if (sa->sa_family != AF_HYPERV) {
418 		HVSOCK_DBG(HVSOCK_DBG_ERR,
419 		    "%s: Not supported, sa_family is %u\n",
420 		    __func__, sa->sa_family);
421 		return (EAFNOSUPPORT);
422 	}
423 	if (sa->sa_len != sizeof(*sa)) {
424 		HVSOCK_DBG(HVSOCK_DBG_ERR,
425 		    "%s: Not supported, sa_len is %u\n",
426 		    __func__, sa->sa_len);
427 		return (EINVAL);
428 	}
429 
430 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
431 	    "%s: binding port = 0x%x\n", __func__, sa->hvs_port);
432 
433 	mtx_lock(&hvs_trans_socks_mtx);
434 	if (__hvs_find_socket_on_list(sa,
435 	    HVS_LIST_BOUND | HVS_LIST_CONNECTED)) {
436 		error = EADDRINUSE;
437 	} else {
438 		/*
439 		 * The address is available for us to bind.
440 		 * Add socket to the bound list.
441 		 */
442 		hvs_addr_set(&pcb->local_addr, sa->hvs_port);
443 		hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY);
444 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
445 	}
446 	mtx_unlock(&hvs_trans_socks_mtx);
447 
448 	return (error);
449 }
450 
451 int
452 hvs_trans_listen(struct socket *so, int backlog, struct thread *td)
453 {
454 	struct hvs_pcb *pcb = so2hvspcb(so);
455 	struct socket *bound_so;
456 	int error;
457 
458 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
459 	    "%s: HyperV Socket hvs_trans_listen called\n", __func__);
460 
461 	if (pcb == NULL)
462 		return (EINVAL);
463 
464 	/* Check if the address is already bound and it was by us. */
465 	bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND);
466 	if (bound_so == NULL || bound_so != so) {
467 		HVSOCK_DBG(HVSOCK_DBG_ERR,
468 		    "%s: Address not bound or not by us.\n", __func__);
469 		return (EADDRNOTAVAIL);
470 	}
471 
472 	SOCK_LOCK(so);
473 	error = solisten_proto_check(so);
474 	if (error == 0)
475 		solisten_proto(so, backlog);
476 	SOCK_UNLOCK(so);
477 
478 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
479 	    "%s: HyperV Socket listen error = %d\n", __func__, error);
480 	return (error);
481 }
482 
483 int
484 hvs_trans_accept(struct socket *so, struct sockaddr **nam)
485 {
486 	struct hvs_pcb *pcb = so2hvspcb(so);
487 
488 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
489 	    "%s: HyperV Socket hvs_trans_accept called\n", __func__);
490 
491 	if (pcb == NULL)
492 		return (EINVAL);
493 
494 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr,
495 	    M_NOWAIT);
496 
497 	return ((*nam == NULL) ? ENOMEM : 0);
498 }
499 
500 int
501 hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
502 {
503 	struct hvs_pcb *pcb = so2hvspcb(so);
504 	struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam;
505 	bool found_auto_bound_port = false;
506 	int i, error = 0;
507 
508 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
509 	    "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n",
510 	    __func__, raddr->hvs_port);
511 
512 	if (pcb == NULL)
513 		return (EINVAL);
514 
515 	/* Verify the remote address */
516 	if (raddr == NULL)
517 		return (EINVAL);
518 	if (raddr->sa_family != AF_HYPERV)
519 		return (EAFNOSUPPORT);
520 	if (raddr->sa_len != sizeof(*raddr))
521 		return (EINVAL);
522 
523 	mtx_lock(&hvs_trans_socks_mtx);
524 	if (so->so_state &
525 	    (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) {
526 			HVSOCK_DBG(HVSOCK_DBG_ERR,
527 			    "%s: socket connect in progress\n",
528 			    __func__);
529 			error = EINPROGRESS;
530 			goto out;
531 	}
532 
533 	/*
534 	 * Find an available port for us to auto bind the local
535 	 * address.
536 	 */
537 	hvs_addr_set(&pcb->local_addr, 0);
538 
539 	for (i = previous_auto_bound_port - 1;
540 	    i != previous_auto_bound_port; i --) {
541 		if (i == MIN_PORT)
542 			i = MAX_PORT;
543 
544 		pcb->local_addr.hvs_port = i;
545 
546 		if (__hvs_find_socket_on_list(&pcb->local_addr,
547 		    HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) {
548 			found_auto_bound_port = true;
549 			previous_auto_bound_port = i;
550 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
551 			    "%s: found local bound port is %x\n",
552 			    __func__, pcb->local_addr.hvs_port);
553 			break;
554 		}
555 	}
556 
557 	if (found_auto_bound_port == true) {
558 		/* Found available port for auto bound, put on list */
559 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
560 		/* Set VM service ID */
561 		pcb->vm_srv_id = srv_id_template;
562 		set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port);
563 		/* Set host service ID and remote port */
564 		pcb->host_srv_id = srv_id_template;
565 		set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port);
566 		hvs_addr_set(&pcb->remote_addr, raddr->hvs_port);
567 
568 		/* Change the socket state to SS_ISCONNECTING */
569 		soisconnecting(so);
570 	} else {
571 		HVSOCK_DBG(HVSOCK_DBG_ERR,
572 		    "%s: No local port available for auto bound\n",
573 		    __func__);
574 		error = EADDRINUSE;
575 	}
576 
577 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is ");
578 	hvsock_print_guid(&pcb->vm_srv_id);
579 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is ");
580 	hvsock_print_guid(&pcb->host_srv_id);
581 
582 out:
583 	mtx_unlock(&hvs_trans_socks_mtx);
584 
585 	if (found_auto_bound_port == true)
586 		 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id);
587 
588 	return (error);
589 }
590 
591 int
592 hvs_trans_disconnect(struct socket *so)
593 {
594 	struct hvs_pcb *pcb;
595 
596 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
597 	    "%s: HyperV Socket hvs_trans_disconnect called\n", __func__);
598 
599 	(void) hvs_trans_lock();
600 	pcb = so2hvspcb(so);
601 	if (pcb == NULL) {
602 		hvs_trans_unlock();
603 		return (EINVAL);
604 	}
605 
606 	/* If socket is already disconnected, skip this */
607 	if ((so->so_state & SS_ISDISCONNECTED) == 0)
608 		soisdisconnecting(so);
609 
610 	hvs_trans_unlock();
611 
612 	return (0);
613 }
614 
/*
 * Argument bundle handed to hvsock_br_callback() through the opaque
 * callback pointer.
 */
struct hvs_callback_arg {
	struct uio *uio;	/* Transfer descriptor passed to uiomove(). */
	struct sockbuf *sb;	/* Sockbuf unlocked around uiomove(); may be NULL. */
};
619 
/*
 * pr_soreceive handler: copy received data from the channel's receive ring
 * directly into the caller's uio.
 *
 * Returns EINVAL for a non-stream socket, a missing pcb, an empty uio or a
 * uio that is not a read; EOPNOTSUPP for MSG_PEEK; EPIPE when the socket
 * is already disconnected and cannot receive more; EAGAIN for a
 * non-blocking read that found nothing; otherwise the error reported by
 * the I/O lock, the ring peek call, sbwait() or so->so_error.
 * 'paddr', 'mp0' and 'controlp' are not used.
 *
 * Locking: takes the socket receive I/O lock and the receive sockbuf lock.
 * The sockbuf lock is dropped temporarily inside hvsock_br_callback()
 * (around uiomove) and inside sbwait().
 */
int
hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct sockbuf *sb;
	ssize_t orig_resid;
	uint32_t canread, to_read;
	int flags, error = 0;
	struct hvs_callback_arg cbarg;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_soreceive called\n", __func__);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (pcb == NULL)
		return (EINVAL);

	/* MSG_EOR is masked out of the caller's flags. */
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if (flags & MSG_PEEK)
		return (EOPNOTSUPP);

	/* If no space to copy out anything */
	if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ)
		return (EINVAL);

	/* Remembered to tell later whether this call copied any data. */
	orig_resid = uio->uio_resid;

	/* Prevent other readers from entering the socket. */
	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (error) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: soiolock returned error = %d\n", __func__, error);
		return (error);
	}

	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	cbarg.uio = uio;
	cbarg.sb = sb;
	/*
	 * If the socket is closing, there might still be some data
	 * in rx br to read. However we need to make sure
	 * the channel is still open.
	 */
	if ((sb->sb_state & SBS_CANTRCVMORE) &&
	    (so->so_state & SS_ISDISCONNECTED)) {
		/* Other thread already closed the channel */
		error = EPIPE;
		goto out;
	}

	while (true) {
		/* Drain whatever is readable right now into the uio. */
		while (uio->uio_resid > 0 &&
		    (canread = hvsock_canread_check(pcb)) > 0) {
			to_read = MIN(canread, uio->uio_resid);
			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
			    "%s: to_read = %u, skip = %u\n", __func__, to_read,
			    (unsigned int)(sizeof(struct hvs_pkt_header) +
			    pcb->recv_data_off));

			/*
			 * The skip argument is the packet header plus the
			 * part of the payload already consumed
			 * (recv_data_off).
			 */
			error = vmbus_chan_recv_peek_call(pcb->chan, to_read,
			    sizeof(struct hvs_pkt_header) + pcb->recv_data_off,
			    hvsock_br_callback, (void *)&cbarg);
			/*
			 * It is possible the socket got disconnected because
			 * we released the lock in hvsock_br_callback. So we
			 * need to check the state to make sure it is not
			 * disconnected.
			 */
			if (error || so->so_state & SS_ISDISCONNECTED) {
				break;
			}

			/* Account for the bytes just copied out. */
			pcb->recv_data_len -= to_read;
			pcb->recv_data_off += to_read;
		}

		if (error)
			break;

		/* Abort if socket has reported problems. */
		if (so->so_error) {
			if (so->so_error == ESHUTDOWN &&
			    orig_resid > uio->uio_resid) {
				/*
				 * Although we got a FIN, we also received
				 * some data in this round. Deliver it
				 * to the user.
				 */
				error = 0;
			} else {
				/* ESHUTDOWN itself is not returned as an error. */
				if (so->so_error != ESHUTDOWN)
					error = so->so_error;
			}

			break;
		}

		/* Cannot receive more. */
		if (sb->sb_state & SBS_CANTRCVMORE)
			break;

		/* We are done if buffer has been filled */
		if (uio->uio_resid == 0)
			break;

		/* Without MSG_WAITALL a partial read is good enough. */
		if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid)
			break;

		/* Buffer ring is empty and we shall not block */
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			if (orig_resid == uio->uio_resid) {
				/* We have not read anything */
				error = EAGAIN;
			}
			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
			    "%s: non blocked read return, error %d.\n",
			    __func__, error);
			break;
		}

		/*
		 * Wait and block until (more) data comes in.
		 * Note: Drops the sockbuf lock during wait.
		 */
		error = sbwait(so, SO_RCV);

		if (error)
			break;

		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: wake up from sbwait, read available is %u\n",
		    __func__, vmbus_chan_read_available(pcb->chan));
	}

out:
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_RECV_UNLOCK(so);

	/* We received a FIN in this call */
	if (so->so_error == ESHUTDOWN) {
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			/* Send has already closed */
			soisdisconnecting(so);
		} else {
			/* Just close the receive side */
			socantrcvmore(so);
		}
	}

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: returning error = %d, so_error = %d\n",
	    __func__, error, so->so_error);

	return (error);
}
783 }
784 
/*
 * pr_sosend handler: write the caller's uio to the channel's send ring in
 * chunks of at most HVSOCK_SEND_BUF_SZ bytes.
 *
 * Returns EINVAL for a non-stream socket, a missing pcb, an empty uio or a
 * uio that is not a write; EPIPE when the send side is shut down or
 * so_error is ESHUTDOWN; EWOULDBLOCK for a non-blocking send that could
 * not write anything; otherwise the error reported by the I/O lock,
 * sbwait() or hvsock_send_data().  When the ring fills up after some data
 * was already written the function returns without error (short write).
 * 'addr', 'top', 'controlp' and 'td' are not used.
 *
 * Locking: takes the socket send I/O lock and the send sockbuf lock;
 * sbwait() drops the sockbuf lock while sleeping.
 */
int
hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct sockbuf *sb;
	ssize_t orig_resid;
	uint32_t canwrite, to_write;
	int error = 0;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n",
	    __func__, uio->uio_resid);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (pcb == NULL)
		return (EINVAL);

	/* If nothing to send */
	if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE)
		return (EINVAL);

	/* Remembered to tell later whether this call sent any data. */
	orig_resid = uio->uio_resid;

	/* Prevent other writers from entering the socket. */
	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: soiolocak returned error = %d\n", __func__, error);
		return (error);
	}

	sb = &so->so_snd;
	SOCKBUF_LOCK(sb);

	/* Sending is no longer possible once the send side is shut down. */
	if ((sb->sb_state & SBS_CANTSENDMORE) ||
	    so->so_error == ESHUTDOWN) {
		error = EPIPE;
		goto out;
	}

	while (uio->uio_resid > 0) {
		canwrite = hvsock_canwrite_check(pcb);
		if (canwrite == 0) {
			/* We have sent some data */
			if (orig_resid > uio->uio_resid)
				break;
			/*
			 * We have not sent any data and it is
			 * non-blocked io
			 */
			if (so->so_state & SS_NBIO ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				error = EWOULDBLOCK;
				break;
			} else {
				/*
				 * We are here because there is no space on
				 * send buffer ring. Signal the other side
				 * to read and free more space.
				 * Sleep wait until space available to send
				 * Note: Drops the sockbuf lock during wait.
				 */
				error = sbwait(so, SO_SND);

				if (error)
					break;

				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
				    "%s: wake up from sbwait, space avail on "
				    "tx ring is %u\n",
				    __func__,
				    vmbus_chan_write_available(pcb->chan));

				/* Re-evaluate the writable space. */
				continue;
			}
		}
		/* Clamp the chunk to ring space, remaining data and buffer size. */
		to_write = MIN(canwrite, uio->uio_resid);
		to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ);

		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: canwrite is %u, to_write = %u\n", __func__,
		    canwrite, to_write);
		error = hvsock_send_data(pcb->chan, uio, to_write, sb);

		if (error)
			break;
	}

out:
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_SEND_UNLOCK(so);

	return (error);
}
881 
882 int
883 hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam)
884 {
885 	struct hvs_pcb *pcb = so2hvspcb(so);
886 
887 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
888 	    "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__);
889 
890 	if (pcb == NULL)
891 		return (EINVAL);
892 
893 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT);
894 
895 	return ((*nam == NULL)? ENOMEM : 0);
896 }
897 
898 int
899 hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam)
900 {
901 	struct hvs_pcb *pcb = so2hvspcb(so);
902 
903 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
904 	    "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__);
905 
906 	if (pcb == NULL)
907 		return (EINVAL);
908 
909 	*nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT);
910 
911 	return ((*nam == NULL)? ENOMEM : 0);
912 }
913 
914 void
915 hvs_trans_close(struct socket *so)
916 {
917 	struct hvs_pcb *pcb;
918 
919 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
920 	    "%s: HyperV Socket hvs_trans_close called\n", __func__);
921 
922 	(void) hvs_trans_lock();
923 	pcb = so2hvspcb(so);
924 	if (!pcb) {
925 		hvs_trans_unlock();
926 		return;
927 	}
928 
929 	if (so->so_state & SS_ISCONNECTED) {
930 		/* Send a FIN to peer */
931 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
932 		    "%s: hvs_trans_close sending a FIN to host\n", __func__);
933 		(void) hvsock_send_data(pcb->chan, NULL, 0, NULL);
934 	}
935 
936 	if (so->so_state &
937 	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
938 		soisdisconnected(so);
939 
940 	pcb->chan = NULL;
941 	pcb->so = NULL;
942 
943 	if (SOLISTENING(so)) {
944 		mtx_lock(&hvs_trans_socks_mtx);
945 		/* Remove from bound list */
946 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
947 		mtx_unlock(&hvs_trans_socks_mtx);
948 	}
949 
950 	hvs_trans_unlock();
951 
952 	return;
953 }
954 
955 void
956 hvs_trans_abort(struct socket *so)
957 {
958 	struct hvs_pcb *pcb = so2hvspcb(so);
959 
960 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
961 	    "%s: HyperV Socket hvs_trans_abort called\n", __func__);
962 
963 	(void) hvs_trans_lock();
964 	if (pcb == NULL) {
965 		hvs_trans_unlock();
966 		return;
967 	}
968 
969 	if (SOLISTENING(so)) {
970 		mtx_lock(&hvs_trans_socks_mtx);
971 		/* Remove from bound list */
972 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
973 		mtx_unlock(&hvs_trans_socks_mtx);
974 	}
975 
976 	if (so->so_state & SS_ISCONNECTED) {
977 		(void) sodisconnect(so);
978 	}
979 	hvs_trans_unlock();
980 
981 	return;
982 }
983 
984 int
985 hvs_trans_shutdown(struct socket *so)
986 {
987 	struct hvs_pcb *pcb = so2hvspcb(so);
988 	struct sockbuf *sb;
989 
990 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
991 	    "%s: HyperV Socket hvs_trans_shutdown called\n", __func__);
992 
993 	if (pcb == NULL)
994 		return (EINVAL);
995 
996 	/*
997 	 * Only get called with the shutdown method is SHUT_WR or
998 	 * SHUT_RDWR.
999 	 * When the method is SHUT_RD or SHUT_RDWR, the caller
1000 	 * already set the SBS_CANTRCVMORE on receive side socket
1001 	 * buffer.
1002 	 */
1003 	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1004 		/*
1005 		 * SHUT_WR only case.
1006 		 * Receive side is still open. Just close
1007 		 * the send side.
1008 		 */
1009 		socantsendmore(so);
1010 	} else {
1011 		/* SHUT_RDWR case */
1012 		if (so->so_state & SS_ISCONNECTED) {
1013 			/* Send a FIN to peer */
1014 			sb = &so->so_snd;
1015 			SOCKBUF_LOCK(sb);
1016 			(void) hvsock_send_data(pcb->chan, NULL, 0, sb);
1017 			SOCKBUF_UNLOCK(sb);
1018 
1019 			soisdisconnecting(so);
1020 		}
1021 	}
1022 
1023 	return (0);
1024 }
1025 
1026 /* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is
1027  * <port> (see struct sockaddr_hvs).
1028  *
1029  * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV:
1030  * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-
1031  * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with
1032  * the below sockaddr:
1033  *
1034  * struct SOCKADDR_HV
1035  * {
1036  *    ADDRESS_FAMILY Family;
1037  *    USHORT Reserved;
1038  *    GUID VmId;
1039  *    GUID ServiceId;
1040  * };
1041  * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via
1042  * VMBus, because here it's obvious the host and the VM can easily identify
1043  * each other. Though the VmID is useful on the host, especially in the case
1044  * of Windows container, FreeBSD VM doesn't need it at all.
1045  *
1046  * To be compatible with similar infrastructure in Linux VMs, we have
1047  * to limit the available GUID space of SOCKADDR_HV so that we can create
1048  * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID.
1049  * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
1050  *
1051  ****************************************************************************
1052  * The only valid Service GUIDs, from the perspectives of both the host and *
1053  * FreeBSD VM, that can be connected by the other end, must conform to this *
1054  * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
1055  ****************************************************************************
1056  *
1057  * When we write apps on the host to connect(), the GUID ServiceID is used.
1058  * When we write apps in FreeBSD VM to connect(), we only need to specify the
1059  * port and the driver will form the GUID and use that to request the host.
1060  *
1061  * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
1062  * auto-generated remote port for a connect request initiated by the host's
 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
1064  * FreeBSD guest.
1065  */
1066 
/*
 * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
 * restrict the HyperV socket ring buffer size to six 4K pages. Newer
 * HyperV hosts don't have this limit.
 */
#define HVS_RINGBUF_RCV_SIZE	(PAGE_SIZE * 6)
#define HVS_RINGBUF_SND_SIZE	(PAGE_SIZE * 6)
#define HVS_RINGBUF_MAX_SIZE	(PAGE_SIZE * 64)
1075 
/*
 * NOTE(review): presumably the per-device softc of the hvsock channel
 * driver; its users are outside the visible part of this file -- confirm.
 */
struct hvsock_sc {
	device_t		dev;		/* Device handle. */
	struct hvs_pcb		*pcb;		/* Associated socket control block. */
	struct vmbus_channel	*channel;	/* Associated VMBus channel. */
};
1081 
/*
 * Return true when the channel's receive ring holds at least as many
 * readable bytes as an empty (zero-payload) packet occupies.
 */
static bool
hvsock_chan_readable(struct vmbus_channel *chan)
{
	uint32_t avail;

	avail = vmbus_chan_read_available(chan);
	if (avail >= HVSOCK_PKT_LEN(0))
		return (true);

	return (false);
}
1089 
1090 static void
1091 hvsock_chan_cb(struct vmbus_channel *chan, void *context)
1092 {
1093 	struct hvs_pcb *pcb = (struct hvs_pcb *) context;
1094 	struct socket *so;
1095 	uint32_t canwrite;
1096 
1097 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1098 	    "%s: host send us a wakeup on rb data, pcb = %p\n",
1099 	    __func__, pcb);
1100 
1101 	/*
1102 	 * Check if the socket is still attached and valid.
1103 	 * Here we know channel is still open. Need to make
1104 	 * sure the socket has not been closed or freed.
1105 	 */
1106 	(void) hvs_trans_lock();
1107 	so = hsvpcb2so(pcb);
1108 
1109 	if (pcb->chan != NULL && so != NULL) {
1110 		/*
1111 		 * Wake up reader if there are data to read.
1112 		 */
1113 		SOCKBUF_LOCK(&(so)->so_rcv);
1114 
1115 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1116 		    "%s: read available = %u\n", __func__,
1117 		    vmbus_chan_read_available(pcb->chan));
1118 
1119 		if (hvsock_chan_readable(pcb->chan))
1120 			sorwakeup_locked(so);
1121 		else
1122 			SOCKBUF_UNLOCK(&(so)->so_rcv);
1123 
1124 		/*
1125 		 * Wake up sender if space becomes available to write.
1126 		 */
1127 		SOCKBUF_LOCK(&(so)->so_snd);
1128 		canwrite = hvsock_canwrite_check(pcb);
1129 
1130 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1131 		    "%s: canwrite = %u\n", __func__, canwrite);
1132 
1133 		if (canwrite > 0) {
1134 			sowwakeup_locked(so);
1135 		} else {
1136 			SOCKBUF_UNLOCK(&(so)->so_snd);
1137 		}
1138 	}
1139 
1140 	hvs_trans_unlock();
1141 
1142 	return;
1143 }
1144 
1145 static int
1146 hvsock_br_callback(void *datap, int cplen, void *cbarg)
1147 {
1148 	struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg;
1149 	struct uio *uio = arg->uio;
1150 	struct sockbuf *sb = arg->sb;
1151 	int error = 0;
1152 
1153 	if (cbarg == NULL || datap == NULL)
1154 		return (EINVAL);
1155 
1156 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1157 	    "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, "
1158 	    "datap = %p\n",
1159 	    __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br",
1160 	    uio->uio_resid, cplen, datap);
1161 
1162 	if (sb)
1163 		SOCKBUF_UNLOCK(sb);
1164 
1165 	error = uiomove(datap, cplen, uio);
1166 
1167 	if (sb)
1168 		SOCKBUF_LOCK(sb);
1169 
1170 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1171 	    "%s: after uiomove, uio_resid = %zd, error = %d\n",
1172 	    __func__, uio->uio_resid, error);
1173 
1174 	return (error);
1175 }
1176 
1177 static int
1178 hvsock_send_data(struct vmbus_channel *chan, struct uio *uio,
1179     uint32_t to_write, struct sockbuf *sb)
1180 {
1181 	struct hvs_pkt_header hvs_pkt;
1182 	int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0;
1183 	uint64_t pad = 0;
1184 	struct iovec iov[3];
1185 	struct hvs_callback_arg cbarg;
1186 
1187 	if (chan == NULL)
1188 		return (ENOTCONN);
1189 
1190 	hlen = sizeof(struct vmbus_chanpkt_hdr);
1191 	hvs_pkthlen = sizeof(struct hvs_pkt_header);
1192 	hvs_pktlen = hvs_pkthlen + to_write;
1193 	pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen);
1194 
1195 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1196 	    "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, "
1197 	    "pad_pktlen = %u, data_len = %u\n",
1198 	    __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write);
1199 
1200 	hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND;
1201 	hvs_pkt.chan_pkt_hdr.cph_flags = 0;
1202 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen);
1203 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen);
1204 	hvs_pkt.chan_pkt_hdr.cph_xactid = 0;
1205 
1206 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1;
1207 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write;
1208 
1209 	cbarg.uio = uio;
1210 	cbarg.sb = sb;
1211 
1212 	if (uio && to_write > 0) {
1213 		iov[0].iov_base = &hvs_pkt;
1214 		iov[0].iov_len = hvs_pkthlen;
1215 		iov[1].iov_base = NULL;
1216 		iov[1].iov_len = to_write;
1217 		iov[2].iov_base = &pad;
1218 		iov[2].iov_len = pad_pktlen - hvs_pktlen;
1219 
1220 		error = vmbus_chan_iov_send(chan, iov, 3,
1221 		    hvsock_br_callback, &cbarg);
1222 	} else {
1223 		if (to_write == 0) {
1224 			iov[0].iov_base = &hvs_pkt;
1225 			iov[0].iov_len = hvs_pkthlen;
1226 			iov[1].iov_base = &pad;
1227 			iov[1].iov_len = pad_pktlen - hvs_pktlen;
1228 			error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL);
1229 		}
1230 	}
1231 
1232 	if (error) {
1233 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1234 		    "%s: error = %d\n", __func__, error);
1235 	}
1236 
1237 	return (error);
1238 }
1239 
1240 /*
1241  * Check if we have data on current ring buffer to read
1242  * or not. If not, advance the ring buffer read index to
1243  * next packet. Update the recev_data_len and recev_data_off
1244  * to new value.
1245  * Return the number of bytes can read.
1246  */
1247 static uint32_t
1248 hvsock_canread_check(struct hvs_pcb *pcb)
1249 {
1250 	uint32_t advance;
1251 	uint32_t tlen, hlen, dlen;
1252 	uint32_t bytes_canread = 0;
1253 	int error;
1254 
1255 	if (pcb == NULL || pcb->chan == NULL) {
1256 		pcb->so->so_error = EIO;
1257 		return (0);
1258 	}
1259 
1260 	/* Still have data not read yet on current packet */
1261 	if (pcb->recv_data_len > 0)
1262 		return (pcb->recv_data_len);
1263 
1264 	if (pcb->rb_init)
1265 		advance =
1266 		    VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1267 	else
1268 		advance = 0;
1269 
1270 	bytes_canread = vmbus_chan_read_available(pcb->chan);
1271 
1272 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1273 	    "%s: bytes_canread on br = %u, advance = %u\n",
1274 	    __func__, bytes_canread, advance);
1275 
1276 	if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) {
1277 		/*
1278 		 * Nothing to read. Need to advance the rindex before
1279 		 * calling sbwait, so host knows to wake us up when data
1280 		 * is available to read on rb.
1281 		 */
1282 		error = vmbus_chan_recv_idxadv(pcb->chan, advance);
1283 		if (error) {
1284 			HVSOCK_DBG(HVSOCK_DBG_ERR,
1285 			    "%s: after calling vmbus_chan_recv_idxadv, "
1286 			    "got error = %d\n",  __func__, error);
1287 			return (0);
1288 		} else {
1289 			pcb->rb_init = false;
1290 			pcb->recv_data_len = 0;
1291 			pcb->recv_data_off = 0;
1292 			bytes_canread = vmbus_chan_read_available(pcb->chan);
1293 
1294 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1295 			    "%s: advanced %u bytes, "
1296 			    " bytes_canread on br now = %u\n",
1297 			    __func__, advance, bytes_canread);
1298 
1299 			if (bytes_canread == 0)
1300 				return (0);
1301 			else
1302 				advance = 0;
1303 		}
1304 	}
1305 
1306 	if (bytes_canread <
1307 	    advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t)))
1308 		return (0);
1309 
1310 	error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt,
1311 	    sizeof(struct hvs_pkt_header), advance);
1312 
1313 	/* Don't have anything to read */
1314 	if (error) {
1315 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1316 		    "%s: after calling vmbus_chan_recv_peek, got error = %d\n",
1317 		    __func__, error);
1318 		return (0);
1319 	}
1320 
1321 	/*
1322 	 * We just read in a new packet header. Do some sanity checks.
1323 	 */
1324 	tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1325 	hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen);
1326 	dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size;
1327 	if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) ||
1328 	    __predict_false(hlen > tlen) ||
1329 	    __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) {
1330 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1331 		    "invalid tlen(%u), hlen(%u) or dlen(%u)\n",
1332 		    tlen, hlen, dlen);
1333 		pcb->so->so_error = EIO;
1334 		return (0);
1335 	}
1336 	if (pcb->rb_init == false)
1337 		pcb->rb_init = true;
1338 
1339 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1340 	    "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n",
1341 	    tlen, hlen, dlen);
1342 
1343 	/* The other side has sent a close FIN */
1344 	if (dlen == 0) {
1345 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1346 		    "%s: Received FIN from other side\n", __func__);
1347 		/* inform the caller by seting so_error to ESHUTDOWN */
1348 		pcb->so->so_error = ESHUTDOWN;
1349 	}
1350 
1351 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1352 	    "%s: canread on receive ring is %u \n", __func__, dlen);
1353 
1354 	pcb->recv_data_len = dlen;
1355 	pcb->recv_data_off = 0;
1356 
1357 	return (pcb->recv_data_len);
1358 }
1359 
1360 static uint32_t
1361 hvsock_canwrite_check(struct hvs_pcb *pcb)
1362 {
1363 	uint32_t writeable;
1364 	uint32_t ret;
1365 
1366 	if (pcb == NULL || pcb->chan == NULL)
1367 		return (0);
1368 
1369 	writeable = vmbus_chan_write_available(pcb->chan);
1370 
1371 	/*
1372 	 * We must always reserve a 0-length-payload packet for the FIN.
1373 	 */
1374 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1375 	    "%s: writeable is %u, should be greater than %ju\n",
1376 	    __func__, writeable,
1377 	    (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)));
1378 
1379 	if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) {
1380 		/*
1381 		 * The Tx ring seems full.
1382 		 */
1383 		return (0);
1384 	}
1385 
1386 	ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0);
1387 
1388 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1389 	    "%s: available size is %u\n", __func__, rounddown2(ret, 8));
1390 
1391 	return (rounddown2(ret, 8));
1392 }
1393 
1394 static void
1395 hvsock_set_chan_pending_send_size(struct vmbus_channel *chan)
1396 {
1397 	vmbus_chan_set_pending_send_size(chan,
1398 	    HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ));
1399 }
1400 
1401 static int
1402 hvsock_open_channel(struct vmbus_channel *chan, struct socket *so)
1403 {
1404 	unsigned int rcvbuf, sndbuf;
1405 	struct hvs_pcb *pcb = so2hvspcb(so);
1406 	int ret;
1407 
1408 	if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) {
1409 		sndbuf = HVS_RINGBUF_SND_SIZE;
1410 		rcvbuf = HVS_RINGBUF_RCV_SIZE;
1411 	} else {
1412 		sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE);
1413 		sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE);
1414 		sndbuf = rounddown2(sndbuf, PAGE_SIZE);
1415 		rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE);
1416 		rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE);
1417 		rcvbuf = rounddown2(rcvbuf, PAGE_SIZE);
1418 	}
1419 
1420 	/*
1421 	 * Can only read whatever user provided size of data
1422 	 * from ring buffer. Turn off batched reading.
1423 	 */
1424 	vmbus_chan_set_readbatch(chan, false);
1425 
1426 	ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0,
1427 	    hvsock_chan_cb, pcb);
1428 
1429 	if (ret != 0) {
1430 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1431 		    "%s: failed to open hvsock channel, sndbuf = %u, "
1432 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1433 	} else {
1434 		HVSOCK_DBG(HVSOCK_DBG_INFO,
1435 		    "%s: hvsock channel opened, sndbuf = %u, i"
1436 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1437 		/*
1438 		 * Se the pending send size so to receive wakeup
1439 		 * signals from host when there is enough space on
1440 		 * rx buffer ring to write.
1441 		 */
1442 		hvsock_set_chan_pending_send_size(chan);
1443 	}
1444 
1445 	return ret;
1446 }
1447 
1448 /*
1449  * Guest is listening passively on the socket. Open channel and
1450  * create a new socket for the conneciton.
1451  */
1452 static void
1453 hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so,
1454     struct hvsock_sc *sc)
1455 {
1456 	struct socket *new_so;
1457 	struct hvs_pcb *new_pcb, *pcb;
1458 	int error;
1459 
1460 	/* Do nothing if socket is not listening */
1461 	if (!SOLISTENING(so)) {
1462 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1463 		    "%s: socket is not a listening one\n", __func__);
1464 		return;
1465 	}
1466 
1467 	/*
1468 	 * Create a new socket. This will call pru_attach to complete
1469 	 * the socket initialization and put the new socket onto
1470 	 * listening socket's sol_incomp list, waiting to be promoted
1471 	 * to sol_comp list.
1472 	 * The new socket created has ref count 0. There is no other
1473 	 * thread that changes the state of this new one at the
1474 	 * moment, so we don't need to hold its lock while opening
1475 	 * channel and filling out its pcb information.
1476 	 */
1477 	new_so = sonewconn(so, 0);
1478 	if (!new_so)
1479 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1480 		    "%s: creating new socket failed\n", __func__);
1481 
1482 	/*
1483 	 * Now open the vmbus channel. If it fails, the socket will be
1484 	 * on the listening socket's sol_incomp queue until it is
1485 	 * replaced and aborted.
1486 	 */
1487 	error = hvsock_open_channel(chan, new_so);
1488 	if (error) {
1489 		new_so->so_error = error;
1490 		return;
1491 	}
1492 
1493 	pcb = so->so_pcb;
1494 	new_pcb = new_so->so_pcb;
1495 
1496 	hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port);
1497 	/* Remote port is unknown to guest in this type of conneciton */
1498 	hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN);
1499 	new_pcb->chan = chan;
1500 	new_pcb->recv_data_len = 0;
1501 	new_pcb->recv_data_off = 0;
1502 	new_pcb->rb_init = false;
1503 
1504 	new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan);
1505 	new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan);
1506 
1507 	hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED);
1508 
1509 	sc->pcb = new_pcb;
1510 
1511 	/*
1512 	 * Change the socket state to SS_ISCONNECTED. This will promote
1513 	 * the socket to sol_comp queue and wake up the thread which
1514 	 * is accepting connection.
1515 	 */
1516 	soisconnected(new_so);
1517 }
1518 
1519 
1520 /*
1521  * Guest is actively connecting to host.
1522  */
1523 static void
1524 hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so)
1525 {
1526 	struct hvs_pcb *pcb;
1527 	int error;
1528 
1529 	error = hvsock_open_channel(chan, so);
1530 	if (error) {
1531 		so->so_error = error;
1532 		return;
1533 	}
1534 
1535 	pcb = so->so_pcb;
1536 	pcb->chan = chan;
1537 	pcb->recv_data_len = 0;
1538 	pcb->recv_data_off = 0;
1539 	pcb->rb_init = false;
1540 
1541 	mtx_lock(&hvs_trans_socks_mtx);
1542 	__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
1543 	__hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED);
1544 	mtx_unlock(&hvs_trans_socks_mtx);
1545 
1546 	/*
1547 	 * Change the socket state to SS_ISCONNECTED. This will wake up
1548 	 * the thread sleeping in connect call.
1549 	 */
1550 	soisconnected(so);
1551 }
1552 
1553 static void
1554 hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc)
1555 {
1556 	struct hyperv_guid *inst_guid, *type_guid;
1557 	bool conn_from_host;
1558 	struct sockaddr_hvs addr;
1559 	struct socket *so;
1560 	struct hvs_pcb *pcb;
1561 
1562 	type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan);
1563 	inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan);
1564 	conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan);
1565 
1566 	HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is ");
1567 	hvsock_print_guid(type_guid);
1568 	HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is ");
1569 	hvsock_print_guid(inst_guid);
1570 	HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n",
1571 	    (conn_from_host == true ) ? "from" : "to");
1572 
1573 	/*
1574 	 * The listening port should be in [0, MAX_LISTEN_PORT]
1575 	 */
1576 	if (!is_valid_srv_id(type_guid))
1577 		return;
1578 
1579 	/*
1580 	 * There should be a bound socket already created no matter
1581 	 * it is a passive or active connection.
1582 	 * For host initiated connection (passive on guest side),
1583 	 * the  type_guid contains the port which guest is bound and
1584 	 * listening.
1585 	 * For the guest initiated connection (active on guest side),
1586 	 * the inst_guid contains the port that guest has auto bound
1587 	 * to.
1588 	 */
1589 	hvs_addr_init(&addr, conn_from_host ? type_guid : inst_guid);
1590 	so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND);
1591 	if (!so) {
1592 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1593 		    "%s: no bound socket found for port %u\n",
1594 		    __func__, addr.hvs_port);
1595 		return;
1596 	}
1597 
1598 	if (conn_from_host) {
1599 		hvsock_open_conn_passive(chan, so, sc);
1600 	} else {
1601 		(void) hvs_trans_lock();
1602 		pcb = so->so_pcb;
1603 		if (pcb && pcb->so) {
1604 			sc->pcb = so2hvspcb(so);
1605 			hvsock_open_conn_active(chan, so);
1606 		} else {
1607 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1608 			    "%s: channel detached before open\n", __func__);
1609 		}
1610 		hvs_trans_unlock();
1611 	}
1612 
1613 }
1614 
1615 static int
1616 hvsock_probe(device_t dev)
1617 {
1618 	struct vmbus_channel *channel = vmbus_get_channel(dev);
1619 
1620 	if (!channel || !vmbus_chan_is_hvs(channel)) {
1621 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1622 		    "hvsock_probe called but not a hvsock channel id %u\n",
1623 		    vmbus_chan_id(channel));
1624 
1625 		return ENXIO;
1626 	} else {
1627 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1628 		    "hvsock_probe got a hvsock channel id %u\n",
1629 		    vmbus_chan_id(channel));
1630 
1631 		return BUS_PROBE_DEFAULT;
1632 	}
1633 }
1634 
1635 static int
1636 hvsock_attach(device_t dev)
1637 {
1638 	struct vmbus_channel *channel = vmbus_get_channel(dev);
1639 	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1640 
1641 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n");
1642 
1643 	hvsock_open_connection(channel, sc);
1644 
1645 	/*
1646 	 * Always return success. On error the host will rescind the device
1647 	 * in 30 seconds and we can do cleanup at that time in
1648 	 * vmbus_chan_msgproc_chrescind().
1649 	 */
1650 	return (0);
1651 }
1652 
/*
 * Device detach method. If a pcb is attached to this device, disconnect
 * its socket, take the pcb off the bound/connected lists, wait until no
 * reader or sender holds the socket I/O locks, then free the pcb and
 * detach it from the socket. The vmbus channel is closed in every case.
 *
 * Always returns 0.
 */
static int
hvsock_detach(device_t dev)
{
	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
	struct socket *so;
	int retry;

	if (bootverbose)
		device_printf(dev, "hvsock_detach called.\n");

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n");

	if (sc->pcb != NULL) {
		/* Everything up to the pcb free runs under the transport lock. */
		(void) hvs_trans_lock();

		/* The pcb may already have lost its socket; so can be NULL. */
		so = hsvpcb2so(sc->pcb);
		if (so) {
			/* Close the connection */
			if (so->so_state &
			    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
				soisdisconnected(so);
		}

		/* Remove the pcb from whichever list(s) it is on. */
		mtx_lock(&hvs_trans_socks_mtx);
		__hvs_remove_pcb_from_list(sc->pcb,
		    HVS_LIST_BOUND | HVS_LIST_CONNECTED);
		mtx_unlock(&hvs_trans_socks_mtx);

		/*
		 * Close channel while no reader and sender are working
		 * on the buffer rings.
		 */
		if (so) {
			/*
			 * Poll for the receive I/O lock without sleeping;
			 * each failed attempt re-issues soisdisconnected()
			 * and busy-waits via DELAY(500) before retrying.
			 * `retry' is only used in the debug message.
			 */
			retry = 0;
			while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) {
				/*
				 * Someone is reading, rx br is busy
				 */
				soisdisconnected(so);
				DELAY(500);
				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
				    "waiting for rx reader to exit, "
				    "retry = %d\n", retry++);
			}
			/* Same polling scheme for the send I/O lock. */
			retry = 0;
			while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) {
				/*
				 * Someone is sending, tx br is busy
				 */
				soisdisconnected(so);
				DELAY(500);
				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
				    "waiting for tx sender to exit, "
				    "retry = %d\n", retry++);
			}
		}


		/* Scrub the pcb before freeing it and forget our pointer. */
		bzero(sc->pcb, sizeof(struct hvs_pcb));
		free(sc->pcb, M_HVSOCK);
		sc->pcb = NULL;

		if (so) {
			/*
			 * Release the I/O locks taken above and detach the
			 * (now freed) pcb from the socket.
			 */
			SOCK_IO_RECV_UNLOCK(so);
			SOCK_IO_SEND_UNLOCK(so);
			so->so_pcb = NULL;
		}

		hvs_trans_unlock();
	}

	/* Closed unconditionally, whether or not a pcb was attached. */
	vmbus_chan_close(vmbus_get_channel(dev));

	return (0);
}
1728 
/*
 * Device method table of the hv_sock driver: maps the generic
 * probe/attach/detach device methods to the implementations in this file.
 */
static device_method_t hvsock_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, hvsock_probe),
	DEVMETHOD(device_attach, hvsock_attach),
	DEVMETHOD(device_detach, hvsock_detach),
	/* Table terminator. */
	DEVMETHOD_END
};
1736 
1737 static driver_t hvsock_driver = {
1738 	"hv_sock",
1739 	hvsock_methods,
1740 	sizeof(struct hvsock_sc)
1741 };
1742 
/*
 * Register hvsock_driver as module "hvsock" on the vmbus bus; the two
 * NULLs are presumably the (unused) module event handler and its
 * argument -- see DRIVER_MODULE(9). The module is version 1 and depends
 * on the vmbus module with minimum/preferred/maximum version 1.
 */
DRIVER_MODULE(hvsock, vmbus, hvsock_driver, NULL, NULL);
MODULE_VERSION(hvsock, 1);
MODULE_DEPEND(hvsock, vmbus, 1, 1, 1);
1746