xref: /freebsd/sys/dev/hyperv/hvsock/hv_sock.c (revision 6132212808e8dccedc9e5d85fea4390c2f38059a)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Microsoft Corp.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/bus.h>
34 #include <sys/domain.h>
35 #include <sys/lock.h>
36 #include <sys/kernel.h>
37 #include <sys/types.h>
38 #include <sys/malloc.h>
39 #include <sys/module.h>
40 #include <sys/mutex.h>
41 #include <sys/proc.h>
42 #include <sys/protosw.h>
43 #include <sys/socket.h>
44 #include <sys/sysctl.h>
45 #include <sys/sysproto.h>
46 #include <sys/systm.h>
47 #include <sys/sockbuf.h>
48 #include <sys/sx.h>
49 #include <sys/uio.h>
50 
51 #include <net/vnet.h>
52 
53 #include <dev/hyperv/vmbus/vmbus_reg.h>
54 
55 #include "hv_sock.h"
56 
57 #define HVSOCK_DBG_NONE			0x0
58 #define HVSOCK_DBG_INFO			0x1
59 #define HVSOCK_DBG_ERR			0x2
60 #define HVSOCK_DBG_VERBOSE		0x3
61 
62 
63 SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket");
64 
65 static int hvs_dbg_level;
66 SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level,
67     0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose");
68 
69 
70 #define HVSOCK_DBG(level, ...) do {					\
71 	if (hvs_dbg_level >= (level))					\
72 		printf(__VA_ARGS__);					\
73 	} while (0)
74 
75 MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures");
76 
77 /* The MTU is 16KB per host side's design */
78 #define HVSOCK_MTU_SIZE		(1024 * 16)
79 #define HVSOCK_SEND_BUF_SZ	(PAGE_SIZE - sizeof(struct vmpipe_proto_header))
80 
81 #define HVSOCK_HEADER_LEN	(sizeof(struct hvs_pkt_header))
82 
83 #define HVSOCK_PKT_LEN(payload_len)	(HVSOCK_HEADER_LEN + \
84 					 roundup2(payload_len, 8) + \
85 					 sizeof(uint64_t))
86 
87 
88 static struct domain		hv_socket_domain;
89 
90 /*
91  * HyperV Transport sockets
92  */
93 static struct pr_usrreqs	hvs_trans_usrreqs = {
94 	.pru_attach =		hvs_trans_attach,
95 	.pru_bind =		hvs_trans_bind,
96 	.pru_listen =		hvs_trans_listen,
97 	.pru_accept =		hvs_trans_accept,
98 	.pru_connect =		hvs_trans_connect,
99 	.pru_peeraddr =		hvs_trans_peeraddr,
100 	.pru_sockaddr =		hvs_trans_sockaddr,
101 	.pru_soreceive =	hvs_trans_soreceive,
102 	.pru_sosend =		hvs_trans_sosend,
103 	.pru_disconnect =	hvs_trans_disconnect,
104 	.pru_close =		hvs_trans_close,
105 	.pru_detach =		hvs_trans_detach,
106 	.pru_shutdown =		hvs_trans_shutdown,
107 	.pru_abort =		hvs_trans_abort,
108 };
109 
110 /*
111  * Definitions of protocols supported in HyperV socket domain
112  */
113 static struct protosw		hv_socket_protosw[] = {
114 {
115 	.pr_type =		SOCK_STREAM,
116 	.pr_domain =		&hv_socket_domain,
117 	.pr_protocol =		HYPERV_SOCK_PROTO_TRANS,
118 	.pr_flags =		PR_CONNREQUIRED,
119 	.pr_init =		hvs_trans_init,
120 	.pr_usrreqs =		&hvs_trans_usrreqs,
121 },
122 };
123 
124 static struct domain		hv_socket_domain = {
125 	.dom_family =		AF_HYPERV,
126 	.dom_name =		"hyperv",
127 	.dom_protosw =		hv_socket_protosw,
128 	.dom_protoswNPROTOSW =	&hv_socket_protosw[nitems(hv_socket_protosw)]
129 };
130 
131 VNET_DOMAIN_SET(hv_socket_);
132 
133 #define MAX_PORT			((uint32_t)0xFFFFFFFF)
134 #define MIN_PORT			((uint32_t)0x0)
135 
136 /* 00000000-facb-11e6-bd58-64006a7986d3 */
137 static const struct hyperv_guid srv_id_template = {
138 	.hv_guid = {
139 	    0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11,
140 	    0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 }
141 };
142 
143 static int		hvsock_br_callback(void *, int, void *);
144 static uint32_t		hvsock_canread_check(struct hvs_pcb *);
145 static uint32_t		hvsock_canwrite_check(struct hvs_pcb *);
146 static int		hvsock_send_data(struct vmbus_channel *chan,
147     struct uio *uio, uint32_t to_write, struct sockbuf *sb);
148 
149 
150 
151 /* Globals */
152 static struct sx		hvs_trans_socks_sx;
153 static struct mtx		hvs_trans_socks_mtx;
154 static LIST_HEAD(, hvs_pcb)	hvs_trans_bound_socks;
155 static LIST_HEAD(, hvs_pcb)	hvs_trans_connected_socks;
156 static uint32_t			previous_auto_bound_port;
157 
158 static void
159 hvsock_print_guid(struct hyperv_guid *guid)
160 {
161 	unsigned char *p = (unsigned char *)guid;
162 
163 	HVSOCK_DBG(HVSOCK_DBG_INFO,
164 	    "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n",
165 	    *(unsigned int *)p,
166 	    *((unsigned short *) &p[4]),
167 	    *((unsigned short *) &p[6]),
168 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
169 }
170 
171 static bool
172 is_valid_srv_id(const struct hyperv_guid *id)
173 {
174 	return !memcmp(&id->hv_guid[4],
175 	    &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4);
176 }
177 
/*
 * Return the port stored in the leading sizeof(unsigned int) bytes of a
 * service GUID, read in host byte order.
 */
static unsigned int
get_port_by_srv_id(const struct hyperv_guid *srv_id)
{
	unsigned int port;

	/* memcpy avoids a misaligned/aliasing load through a casted pointer. */
	memcpy(&port, srv_id, sizeof(port));

	return (port);
}
183 
/*
 * Store "port" into the leading sizeof(unsigned int) bytes of a service
 * GUID, in host byte order.  The remaining bytes are left untouched.
 */
static void
set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port)
{
	/* memcpy avoids a misaligned/aliasing store through a casted pointer. */
	memcpy(srv_id, &port, sizeof(port));
}
189 
190 
191 static void
192 __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list)
193 {
194 	struct hvs_pcb *p = NULL;
195 
196 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
197 
198 	if (!pcb)
199 		return;
200 
201 	if (list & HVS_LIST_BOUND) {
202 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
203 			if  (p == pcb)
204 				LIST_REMOVE(p, bound_next);
205 	}
206 
207 	if (list & HVS_LIST_CONNECTED) {
208 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
209 			if (p == pcb)
210 				LIST_REMOVE(pcb, connected_next);
211 	}
212 }
213 
214 static void
215 __hvs_remove_socket_from_list(struct socket *so, unsigned char list)
216 {
217 	struct hvs_pcb *pcb = so2hvspcb(so);
218 
219 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
220 
221 	__hvs_remove_pcb_from_list(pcb, list);
222 }
223 
224 static void
225 __hvs_insert_socket_on_list(struct socket *so, unsigned char list)
226 {
227 	struct hvs_pcb *pcb = so2hvspcb(so);
228 
229 	if (list & HVS_LIST_BOUND)
230 		LIST_INSERT_HEAD(&hvs_trans_bound_socks,
231 		   pcb, bound_next);
232 
233 	if (list & HVS_LIST_CONNECTED)
234 		LIST_INSERT_HEAD(&hvs_trans_connected_socks,
235 		   pcb, connected_next);
236 }
237 
238 void
239 hvs_remove_socket_from_list(struct socket *so, unsigned char list)
240 {
241 	if (!so || !so->so_pcb) {
242 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
243 		    "%s: socket or so_pcb is null\n", __func__);
244 		return;
245 	}
246 
247 	mtx_lock(&hvs_trans_socks_mtx);
248 	__hvs_remove_socket_from_list(so, list);
249 	mtx_unlock(&hvs_trans_socks_mtx);
250 }
251 
252 static void
253 hvs_insert_socket_on_list(struct socket *so, unsigned char list)
254 {
255 	if (!so || !so->so_pcb) {
256 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
257 		    "%s: socket or so_pcb is null\n", __func__);
258 		return;
259 	}
260 
261 	mtx_lock(&hvs_trans_socks_mtx);
262 	__hvs_insert_socket_on_list(so, list);
263 	mtx_unlock(&hvs_trans_socks_mtx);
264 }
265 
266 static struct socket *
267 __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
268 {
269 	struct hvs_pcb *p = NULL;
270 
271 	if (list & HVS_LIST_BOUND)
272 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
273 			if (p->so != NULL &&
274 			    addr->hvs_port == p->local_addr.hvs_port)
275 				return p->so;
276 
277 	if (list & HVS_LIST_CONNECTED)
278 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
279 			if (p->so != NULL &&
280 			    addr->hvs_port == p->local_addr.hvs_port)
281 				return p->so;
282 
283 	return NULL;
284 }
285 
286 static struct socket *
287 hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
288 {
289 	struct socket *s = NULL;
290 
291 	mtx_lock(&hvs_trans_socks_mtx);
292 	s = __hvs_find_socket_on_list(addr, list);
293 	mtx_unlock(&hvs_trans_socks_mtx);
294 
295 	return s;
296 }
297 
298 static inline void
299 hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port)
300 {
301 	memset(addr, 0, sizeof(*addr));
302 	addr->sa_family = AF_HYPERV;
303 	addr->hvs_port = port;
304 }
305 
/*
 * Initialise *addr as an AF_HYPERV address whose port is the one encoded
 * in the service GUID "svr_id".
 */
void
hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id)
{
	unsigned int port;

	port = get_port_by_srv_id(svr_id);
	hvs_addr_set(addr, port);
}
311 
312 int
313 hvs_trans_lock(void)
314 {
315 	sx_xlock(&hvs_trans_socks_sx);
316 	return (0);
317 }
318 
319 void
320 hvs_trans_unlock(void)
321 {
322 	sx_xunlock(&hvs_trans_socks_sx);
323 }
324 
325 void
326 hvs_trans_init(void)
327 {
328 	/* Skip initialization of globals for non-default instances. */
329 	if (!IS_DEFAULT_VNET(curvnet))
330 		return;
331 
332 	if (vm_guest != VM_GUEST_HV)
333 		return;
334 
335 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
336 	    "%s: HyperV Socket hvs_trans_init called\n", __func__);
337 
338 	/* Initialize Globals */
339 	previous_auto_bound_port = MAX_PORT;
340 	sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx");
341 	mtx_init(&hvs_trans_socks_mtx,
342 	    "hvs_trans_socks_mtx", NULL, MTX_DEF);
343 	LIST_INIT(&hvs_trans_bound_socks);
344 	LIST_INIT(&hvs_trans_connected_socks);
345 }
346 
347 /*
348  * Called in two cases:
349  * 1) When user calls socket();
350  * 2) When we accept new incoming conneciton and call sonewconn().
351  */
352 int
353 hvs_trans_attach(struct socket *so, int proto, struct thread *td)
354 {
355 	struct hvs_pcb *pcb = so2hvspcb(so);
356 
357 	if (vm_guest != VM_GUEST_HV)
358 		return (ESOCKTNOSUPPORT);
359 
360 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
361 	    "%s: HyperV Socket hvs_trans_attach called\n", __func__);
362 
363 	if (so->so_type != SOCK_STREAM)
364 		return (ESOCKTNOSUPPORT);
365 
366 	if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS)
367 		return (EPROTONOSUPPORT);
368 
369 	if (pcb != NULL)
370 		return (EISCONN);
371 	pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO);
372 	if (pcb == NULL)
373 		return (ENOMEM);
374 
375 	pcb->so = so;
376 	so->so_pcb = (void *)pcb;
377 
378 	return (0);
379 }
380 
381 void
382 hvs_trans_detach(struct socket *so)
383 {
384 	struct hvs_pcb *pcb;
385 
386 	if (vm_guest != VM_GUEST_HV)
387 		return;
388 
389 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
390 	    "%s: HyperV Socket hvs_trans_detach called\n", __func__);
391 
392 	(void) hvs_trans_lock();
393 	pcb = so2hvspcb(so);
394 	if (pcb == NULL) {
395 		hvs_trans_unlock();
396 		return;
397 	}
398 
399 	if (SOLISTENING(so)) {
400 		bzero(pcb, sizeof(*pcb));
401 		free(pcb, M_HVSOCK);
402 	}
403 
404 	so->so_pcb = NULL;
405 
406 	hvs_trans_unlock();
407 }
408 
409 int
410 hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td)
411 {
412 	struct hvs_pcb *pcb = so2hvspcb(so);
413 	struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr;
414 	int error = 0;
415 
416 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
417 	    "%s: HyperV Socket hvs_trans_bind called\n", __func__);
418 
419 	if (sa == NULL) {
420 		return (EINVAL);
421 	}
422 
423 	if (pcb == NULL) {
424 		return (EINVAL);
425 	}
426 
427 	if (sa->sa_family != AF_HYPERV) {
428 		HVSOCK_DBG(HVSOCK_DBG_ERR,
429 		    "%s: Not supported, sa_family is %u\n",
430 		    __func__, sa->sa_family);
431 		return (EAFNOSUPPORT);
432 	}
433 
434 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
435 	    "%s: binding port = 0x%x\n", __func__, sa->hvs_port);
436 
437 	mtx_lock(&hvs_trans_socks_mtx);
438 	if (__hvs_find_socket_on_list(sa,
439 	    HVS_LIST_BOUND | HVS_LIST_CONNECTED)) {
440 		error = EADDRINUSE;
441 	} else {
442 		/*
443 		 * The address is available for us to bind.
444 		 * Add socket to the bound list.
445 		 */
446 		hvs_addr_set(&pcb->local_addr, sa->hvs_port);
447 		hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY);
448 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
449 	}
450 	mtx_unlock(&hvs_trans_socks_mtx);
451 
452 	return (error);
453 }
454 
455 int
456 hvs_trans_listen(struct socket *so, int backlog, struct thread *td)
457 {
458 	struct hvs_pcb *pcb = so2hvspcb(so);
459 	struct socket *bound_so;
460 	int error;
461 
462 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
463 	    "%s: HyperV Socket hvs_trans_listen called\n", __func__);
464 
465 	if (pcb == NULL)
466 		return (EINVAL);
467 
468 	/* Check if the address is already bound and it was by us. */
469 	bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND);
470 	if (bound_so == NULL || bound_so != so) {
471 		HVSOCK_DBG(HVSOCK_DBG_ERR,
472 		    "%s: Address not bound or not by us.\n", __func__);
473 		return (EADDRNOTAVAIL);
474 	}
475 
476 	SOCK_LOCK(so);
477 	error = solisten_proto_check(so);
478 	if (error == 0)
479 		solisten_proto(so, backlog);
480 	SOCK_UNLOCK(so);
481 
482 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
483 	    "%s: HyperV Socket listen error = %d\n", __func__, error);
484 	return (error);
485 }
486 
487 int
488 hvs_trans_accept(struct socket *so, struct sockaddr **nam)
489 {
490 	struct hvs_pcb *pcb = so2hvspcb(so);
491 
492 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
493 	    "%s: HyperV Socket hvs_trans_accept called\n", __func__);
494 
495 	if (pcb == NULL)
496 		return (EINVAL);
497 
498 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr,
499 	    M_NOWAIT);
500 
501 	return ((*nam == NULL) ? ENOMEM : 0);
502 }
503 
504 int
505 hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
506 {
507 	struct hvs_pcb *pcb = so2hvspcb(so);
508 	struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam;
509 	bool found_auto_bound_port = false;
510 	int i, error = 0;
511 
512 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
513 	    "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n",
514 	    __func__, raddr->hvs_port);
515 
516 	if (pcb == NULL)
517 		return (EINVAL);
518 
519 	/* Verify the remote address */
520 	if (raddr == NULL)
521 		return (EINVAL);
522 	if (raddr->sa_family != AF_HYPERV)
523 		return (EAFNOSUPPORT);
524 
525 	mtx_lock(&hvs_trans_socks_mtx);
526 	if (so->so_state &
527 	    (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) {
528 			HVSOCK_DBG(HVSOCK_DBG_ERR,
529 			    "%s: socket connect in progress\n",
530 			    __func__);
531 			error = EINPROGRESS;
532 			goto out;
533 	}
534 
535 	/*
536 	 * Find an available port for us to auto bind the local
537 	 * address.
538 	 */
539 	hvs_addr_set(&pcb->local_addr, 0);
540 
541 	for (i = previous_auto_bound_port - 1;
542 	    i != previous_auto_bound_port; i --) {
543 		if (i == MIN_PORT)
544 			i = MAX_PORT;
545 
546 		pcb->local_addr.hvs_port = i;
547 
548 		if (__hvs_find_socket_on_list(&pcb->local_addr,
549 		    HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) {
550 			found_auto_bound_port = true;
551 			previous_auto_bound_port = i;
552 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
553 			    "%s: found local bound port is %x\n",
554 			    __func__, pcb->local_addr.hvs_port);
555 			break;
556 		}
557 	}
558 
559 	if (found_auto_bound_port == true) {
560 		/* Found available port for auto bound, put on list */
561 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
562 		/* Set VM service ID */
563 		pcb->vm_srv_id = srv_id_template;
564 		set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port);
565 		/* Set host service ID and remote port */
566 		pcb->host_srv_id = srv_id_template;
567 		set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port);
568 		hvs_addr_set(&pcb->remote_addr, raddr->hvs_port);
569 
570 		/* Change the socket state to SS_ISCONNECTING */
571 		soisconnecting(so);
572 	} else {
573 		HVSOCK_DBG(HVSOCK_DBG_ERR,
574 		    "%s: No local port available for auto bound\n",
575 		    __func__);
576 		error = EADDRINUSE;
577 	}
578 
579 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is ");
580 	hvsock_print_guid(&pcb->vm_srv_id);
581 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is ");
582 	hvsock_print_guid(&pcb->host_srv_id);
583 
584 out:
585 	mtx_unlock(&hvs_trans_socks_mtx);
586 
587 	if (found_auto_bound_port == true)
588 		 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id);
589 
590 	return (error);
591 }
592 
593 int
594 hvs_trans_disconnect(struct socket *so)
595 {
596 	struct hvs_pcb *pcb;
597 
598 	if (vm_guest != VM_GUEST_HV)
599 		return (ESOCKTNOSUPPORT);
600 
601 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
602 	    "%s: HyperV Socket hvs_trans_disconnect called\n", __func__);
603 
604 	(void) hvs_trans_lock();
605 	pcb = so2hvspcb(so);
606 	if (pcb == NULL) {
607 		hvs_trans_unlock();
608 		return (EINVAL);
609 	}
610 
611 	/* If socket is already disconnected, skip this */
612 	if ((so->so_state & SS_ISDISCONNECTED) == 0)
613 		soisdisconnecting(so);
614 
615 	hvs_trans_unlock();
616 
617 	return (0);
618 }
619 
/* sblock() flag: wait for the lock unless the caller passed MSG_DONTWAIT. */
#define SBLOCKWAIT(f)	((((f) & MSG_DONTWAIT) != 0) ? 0 : SBL_WAIT)

/*
 * Context handed to hvsock_br_callback() through the buffer ring peek
 * call in the receive path.
 */
struct hvs_callback_arg {
	struct uio	*uio;	/* user I/O request being filled */
	struct sockbuf	*sb;	/* socket buffer whose lock the caller holds */
};
625 
626 int
627 hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr,
628     struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
629 {
630 	struct hvs_pcb *pcb = so2hvspcb(so);
631 	struct sockbuf *sb;
632 	ssize_t orig_resid;
633 	uint32_t canread, to_read;
634 	int flags, error = 0;
635 	struct hvs_callback_arg cbarg;
636 
637 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
638 	    "%s: HyperV Socket hvs_trans_soreceive called\n", __func__);
639 
640 	if (so->so_type != SOCK_STREAM)
641 		return (EINVAL);
642 	if (pcb == NULL)
643 		return (EINVAL);
644 
645 	if (flagsp != NULL)
646 		flags = *flagsp &~ MSG_EOR;
647 	else
648 		flags = 0;
649 
650 	if (flags & MSG_PEEK)
651 		return (EOPNOTSUPP);
652 
653 	/* If no space to copy out anything */
654 	if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ)
655 		return (EINVAL);
656 
657 	sb = &so->so_rcv;
658 
659 	orig_resid = uio->uio_resid;
660 
661 	/* Prevent other readers from entering the socket. */
662 	error = sblock(sb, SBLOCKWAIT(flags));
663 	if (error) {
664 		HVSOCK_DBG(HVSOCK_DBG_ERR,
665 		    "%s: sblock returned error = %d\n", __func__, error);
666 		return (error);
667 	}
668 
669 	SOCKBUF_LOCK(sb);
670 
671 	cbarg.uio = uio;
672 	cbarg.sb = sb;
673 	/*
674 	 * If the socket is closing, there might still be some data
675 	 * in rx br to read. However we need to make sure
676 	 * the channel is still open.
677 	 */
678 	if ((sb->sb_state & SBS_CANTRCVMORE) &&
679 	    (so->so_state & SS_ISDISCONNECTED)) {
680 		/* Other thread already closed the channel */
681 		error = EPIPE;
682 		goto out;
683 	}
684 
685 	while (true) {
686 		while (uio->uio_resid > 0 &&
687 		    (canread = hvsock_canread_check(pcb)) > 0) {
688 			to_read = MIN(canread, uio->uio_resid);
689 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
690 			    "%s: to_read = %u, skip = %u\n", __func__, to_read,
691 			    (unsigned int)(sizeof(struct hvs_pkt_header) +
692 			    pcb->recv_data_off));
693 
694 			error = vmbus_chan_recv_peek_call(pcb->chan, to_read,
695 			    sizeof(struct hvs_pkt_header) + pcb->recv_data_off,
696 			    hvsock_br_callback, (void *)&cbarg);
697 			/*
698 			 * It is possible socket is disconnected becasue
699 			 * we released lock in hvsock_br_callback. So we
700 			 * need to check the state to make sure it is not
701 			 * disconnected.
702 			 */
703 			if (error || so->so_state & SS_ISDISCONNECTED) {
704 				break;
705 			}
706 
707 			pcb->recv_data_len -= to_read;
708 			pcb->recv_data_off += to_read;
709 		}
710 
711 		if (error)
712 			break;
713 
714 		/* Abort if socket has reported problems. */
715 		if (so->so_error) {
716 			if (so->so_error == ESHUTDOWN &&
717 			    orig_resid > uio->uio_resid) {
718 				/*
719 				 * Although we got a FIN, we also received
720 				 * some data in this round. Delivery it
721 				 * to user.
722 				 */
723 				error = 0;
724 			} else {
725 				if (so->so_error != ESHUTDOWN)
726 					error = so->so_error;
727 			}
728 
729 			break;
730 		}
731 
732 		/* Cannot received more. */
733 		if (sb->sb_state & SBS_CANTRCVMORE)
734 			break;
735 
736 		/* We are done if buffer has been filled */
737 		if (uio->uio_resid == 0)
738 			break;
739 
740 		if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid)
741 			break;
742 
743 		/* Buffer ring is empty and we shall not block */
744 		if ((so->so_state & SS_NBIO) ||
745 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
746 			if (orig_resid == uio->uio_resid) {
747 				/* We have not read anything */
748 				error = EAGAIN;
749 			}
750 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
751 			    "%s: non blocked read return, error %d.\n",
752 			    __func__, error);
753 			break;
754 		}
755 
756 		/*
757 		 * Wait and block until (more) data comes in.
758 		 * Note: Drops the sockbuf lock during wait.
759 		 */
760 		error = sbwait(sb);
761 
762 		if (error)
763 			break;
764 
765 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
766 		    "%s: wake up from sbwait, read available is %u\n",
767 		    __func__, vmbus_chan_read_available(pcb->chan));
768 	}
769 
770 out:
771 	SOCKBUF_UNLOCK(sb);
772 
773 	sbunlock(sb);
774 
775 	/* We recieved a FIN in this call */
776 	if (so->so_error == ESHUTDOWN) {
777 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
778 			/* Send has already closed */
779 			soisdisconnecting(so);
780 		} else {
781 			/* Just close the receive side */
782 			socantrcvmore(so);
783 		}
784 	}
785 
786 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
787 	    "%s: returning error = %d, so_error = %d\n",
788 	    __func__, error, so->so_error);
789 
790 	return (error);
791 }
792 
793 int
794 hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
795     struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td)
796 {
797 	struct hvs_pcb *pcb = so2hvspcb(so);
798 	struct sockbuf *sb;
799 	ssize_t orig_resid;
800 	uint32_t canwrite, to_write;
801 	int error = 0;
802 
803 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
804 	    "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n",
805 	    __func__, uio->uio_resid);
806 
807 	if (so->so_type != SOCK_STREAM)
808 		return (EINVAL);
809 	if (pcb == NULL)
810 		return (EINVAL);
811 
812 	/* If nothing to send */
813 	if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE)
814 		return (EINVAL);
815 
816 	sb = &so->so_snd;
817 
818 	orig_resid = uio->uio_resid;
819 
820 	/* Prevent other writers from entering the socket. */
821 	error = sblock(sb, SBLOCKWAIT(flags));
822 	if (error) {
823 		HVSOCK_DBG(HVSOCK_DBG_ERR,
824 		    "%s: sblock returned error = %d\n", __func__, error);
825 		return (error);
826 	}
827 
828 	SOCKBUF_LOCK(sb);
829 
830 	if ((sb->sb_state & SBS_CANTSENDMORE) ||
831 	    so->so_error == ESHUTDOWN) {
832 		error = EPIPE;
833 		goto out;
834 	}
835 
836 	while (uio->uio_resid > 0) {
837 		canwrite = hvsock_canwrite_check(pcb);
838 		if (canwrite == 0) {
839 			/* We have sent some data */
840 			if (orig_resid > uio->uio_resid)
841 				break;
842 			/*
843 			 * We have not sent any data and it is
844 			 * non-blocked io
845 			 */
846 			if (so->so_state & SS_NBIO ||
847 			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
848 				error = EWOULDBLOCK;
849 				break;
850 			} else {
851 				/*
852 				 * We are here because there is no space on
853 				 * send buffer ring. Signal the other side
854 				 * to read and free more space.
855 				 * Sleep wait until space avaiable to send
856 				 * Note: Drops the sockbuf lock during wait.
857 				 */
858 				error = sbwait(sb);
859 
860 				if (error)
861 					break;
862 
863 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
864 				    "%s: wake up from sbwait, space avail on "
865 				    "tx ring is %u\n",
866 				    __func__,
867 				    vmbus_chan_write_available(pcb->chan));
868 
869 				continue;
870 			}
871 		}
872 		to_write = MIN(canwrite, uio->uio_resid);
873 		to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ);
874 
875 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
876 		    "%s: canwrite is %u, to_write = %u\n", __func__,
877 		    canwrite, to_write);
878 		error = hvsock_send_data(pcb->chan, uio, to_write, sb);
879 
880 		if (error)
881 			break;
882 	}
883 
884 out:
885 	SOCKBUF_UNLOCK(sb);
886 	sbunlock(sb);
887 
888 	return (error);
889 }
890 
891 int
892 hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam)
893 {
894 	struct hvs_pcb *pcb = so2hvspcb(so);
895 
896 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
897 	    "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__);
898 
899 	if (pcb == NULL)
900 		return (EINVAL);
901 
902 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT);
903 
904 	return ((*nam == NULL)? ENOMEM : 0);
905 }
906 
907 int
908 hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam)
909 {
910 	struct hvs_pcb *pcb = so2hvspcb(so);
911 
912 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
913 	    "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__);
914 
915 	if (pcb == NULL)
916 		return (EINVAL);
917 
918 	*nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT);
919 
920 	return ((*nam == NULL)? ENOMEM : 0);
921 }
922 
923 void
924 hvs_trans_close(struct socket *so)
925 {
926 	struct hvs_pcb *pcb;
927 
928 	if (vm_guest != VM_GUEST_HV)
929 		return;
930 
931 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
932 	    "%s: HyperV Socket hvs_trans_close called\n", __func__);
933 
934 	(void) hvs_trans_lock();
935 	pcb = so2hvspcb(so);
936 	if (!pcb) {
937 		hvs_trans_unlock();
938 		return;
939 	}
940 
941 	if (so->so_state & SS_ISCONNECTED) {
942 		/* Send a FIN to peer */
943 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
944 		    "%s: hvs_trans_close sending a FIN to host\n", __func__);
945 		(void) hvsock_send_data(pcb->chan, NULL, 0, NULL);
946 	}
947 
948 	if (so->so_state &
949 	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
950 		soisdisconnected(so);
951 
952 	pcb->chan = NULL;
953 	pcb->so = NULL;
954 
955 	if (SOLISTENING(so)) {
956 		mtx_lock(&hvs_trans_socks_mtx);
957 		/* Remove from bound list */
958 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
959 		mtx_unlock(&hvs_trans_socks_mtx);
960 	}
961 
962 	hvs_trans_unlock();
963 
964 	return;
965 }
966 
967 void
968 hvs_trans_abort(struct socket *so)
969 {
970 	struct hvs_pcb *pcb = so2hvspcb(so);
971 
972 	if (vm_guest != VM_GUEST_HV)
973 		return;
974 
975 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
976 	    "%s: HyperV Socket hvs_trans_abort called\n", __func__);
977 
978 	(void) hvs_trans_lock();
979 	if (pcb == NULL) {
980 		hvs_trans_unlock();
981 		return;
982 	}
983 
984 	if (SOLISTENING(so)) {
985 		mtx_lock(&hvs_trans_socks_mtx);
986 		/* Remove from bound list */
987 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
988 		mtx_unlock(&hvs_trans_socks_mtx);
989 	}
990 
991 	if (so->so_state & SS_ISCONNECTED) {
992 		(void) sodisconnect(so);
993 	}
994 	hvs_trans_unlock();
995 
996 	return;
997 }
998 
999 int
1000 hvs_trans_shutdown(struct socket *so)
1001 {
1002 	struct hvs_pcb *pcb = so2hvspcb(so);
1003 	struct sockbuf *sb;
1004 
1005 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1006 	    "%s: HyperV Socket hvs_trans_shutdown called\n", __func__);
1007 
1008 	if (pcb == NULL)
1009 		return (EINVAL);
1010 
1011 	/*
1012 	 * Only get called with the shutdown method is SHUT_WR or
1013 	 * SHUT_RDWR.
1014 	 * When the method is SHUT_RD or SHUT_RDWR, the caller
1015 	 * already set the SBS_CANTRCVMORE on receive side socket
1016 	 * buffer.
1017 	 */
1018 	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1019 		/*
1020 		 * SHUT_WR only case.
1021 		 * Receive side is still open. Just close
1022 		 * the send side.
1023 		 */
1024 		socantsendmore(so);
1025 	} else {
1026 		/* SHUT_RDWR case */
1027 		if (so->so_state & SS_ISCONNECTED) {
1028 			/* Send a FIN to peer */
1029 			sb = &so->so_snd;
1030 			SOCKBUF_LOCK(sb);
1031 			(void) hvsock_send_data(pcb->chan, NULL, 0, sb);
1032 			SOCKBUF_UNLOCK(sb);
1033 
1034 			soisdisconnecting(so);
1035 		}
1036 	}
1037 
1038 	return (0);
1039 }
1040 
1041 /* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is
1042  * <port> (see struct sockaddr_hvs).
1043  *
1044  * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV:
1045  * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-
1046  * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with
1047  * the below sockaddr:
1048  *
1049  * struct SOCKADDR_HV
1050  * {
1051  *    ADDRESS_FAMILY Family;
1052  *    USHORT Reserved;
1053  *    GUID VmId;
1054  *    GUID ServiceId;
1055  * };
1056  * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via
1057  * VMBus, because here it's obvious the host and the VM can easily identify
1058  * each other. Though the VmID is useful on the host, especially in the case
1059  * of Windows container, FreeBSD VM doesn't need it at all.
1060  *
1061  * To be compatible with similar infrastructure in Linux VMs, we have
1062  * to limit the available GUID space of SOCKADDR_HV so that we can create
1063  * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID.
1064  * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
1065  *
1066  ****************************************************************************
1067  * The only valid Service GUIDs, from the perspectives of both the host and *
1068  * FreeBSD VM, that can be connected by the other end, must conform to this *
1069  * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
1070  ****************************************************************************
1071  *
1072  * When we write apps on the host to connect(), the GUID ServiceID is used.
1073  * When we write apps in FreeBSD VM to connect(), we only need to specify the
1074  * port and the driver will form the GUID and use that to request the host.
1075  *
1076  * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
1077  * auto-generated remote port for a connect request initiated by the host's
 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
1079  * FreeBSD guest.
1080  */
1081 
/*
 * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
 * restrict the HyperV socket ring buffer size to six 4K pages. Newer
 * HyperV hosts don't have this limit.
 */
#define HVS_RINGBUF_RCV_SIZE	(6 * PAGE_SIZE)
#define HVS_RINGBUF_SND_SIZE	(6 * PAGE_SIZE)
#define HVS_RINGBUF_MAX_SIZE	(64 * PAGE_SIZE)
1090 
1091 struct hvsock_sc {
1092 	device_t		dev;
1093 	struct hvs_pcb		*pcb;
1094 	struct vmbus_channel	*channel;
1095 };
1096 
/*
 * Return true when the channel has at least as many readable bytes as a
 * packet with an empty payload occupies (HVSOCK_PKT_LEN(0)).
 */
static bool
hvsock_chan_readable(struct vmbus_channel *chan)
{
	uint32_t avail;

	avail = vmbus_chan_read_available(chan);
	if (avail < HVSOCK_PKT_LEN(0))
		return (false);

	return (true);
}
1104 
1105 static void
1106 hvsock_chan_cb(struct vmbus_channel *chan, void *context)
1107 {
1108 	struct hvs_pcb *pcb = (struct hvs_pcb *) context;
1109 	struct socket *so;
1110 	uint32_t canwrite;
1111 
1112 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1113 	    "%s: host send us a wakeup on rb data, pcb = %p\n",
1114 	    __func__, pcb);
1115 
1116 	/*
1117 	 * Check if the socket is still attached and valid.
1118 	 * Here we know channel is still open. Need to make
1119 	 * sure the socket has not been closed or freed.
1120 	 */
1121 	(void) hvs_trans_lock();
1122 	so = hsvpcb2so(pcb);
1123 
1124 	if (pcb->chan != NULL && so != NULL) {
1125 		/*
1126 		 * Wake up reader if there are data to read.
1127 		 */
1128 		SOCKBUF_LOCK(&(so)->so_rcv);
1129 
1130 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1131 		    "%s: read available = %u\n", __func__,
1132 		    vmbus_chan_read_available(pcb->chan));
1133 
1134 		if (hvsock_chan_readable(pcb->chan))
1135 			sorwakeup_locked(so);
1136 		else
1137 			SOCKBUF_UNLOCK(&(so)->so_rcv);
1138 
1139 		/*
1140 		 * Wake up sender if space becomes available to write.
1141 		 */
1142 		SOCKBUF_LOCK(&(so)->so_snd);
1143 		canwrite = hvsock_canwrite_check(pcb);
1144 
1145 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1146 		    "%s: canwrite = %u\n", __func__, canwrite);
1147 
1148 		if (canwrite > 0) {
1149 			sowwakeup_locked(so);
1150 		} else {
1151 			SOCKBUF_UNLOCK(&(so)->so_snd);
1152 		}
1153 	}
1154 
1155 	hvs_trans_unlock();
1156 
1157 	return;
1158 }
1159 
1160 static int
1161 hvsock_br_callback(void *datap, int cplen, void *cbarg)
1162 {
1163 	struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg;
1164 	struct uio *uio = arg->uio;
1165 	struct sockbuf *sb = arg->sb;
1166 	int error = 0;
1167 
1168 	if (cbarg == NULL || datap == NULL)
1169 		return (EINVAL);
1170 
1171 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1172 	    "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, "
1173 	    "datap = %p\n",
1174 	    __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br",
1175 	    uio->uio_resid, cplen, datap);
1176 
1177 	if (sb)
1178 		SOCKBUF_UNLOCK(sb);
1179 
1180 	error = uiomove(datap, cplen, uio);
1181 
1182 	if (sb)
1183 		SOCKBUF_LOCK(sb);
1184 
1185 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1186 	    "%s: after uiomove, uio_resid = %zd, error = %d\n",
1187 	    __func__, uio->uio_resid, error);
1188 
1189 	return (error);
1190 }
1191 
1192 static int
1193 hvsock_send_data(struct vmbus_channel *chan, struct uio *uio,
1194     uint32_t to_write, struct sockbuf *sb)
1195 {
1196 	struct hvs_pkt_header hvs_pkt;
1197 	int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0;
1198 	uint64_t pad = 0;
1199 	struct iovec iov[3];
1200 	struct hvs_callback_arg cbarg;
1201 
1202 	if (chan == NULL)
1203 		return (ENOTCONN);
1204 
1205 	hlen = sizeof(struct vmbus_chanpkt_hdr);
1206 	hvs_pkthlen = sizeof(struct hvs_pkt_header);
1207 	hvs_pktlen = hvs_pkthlen + to_write;
1208 	pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen);
1209 
1210 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1211 	    "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, "
1212 	    "pad_pktlen = %u, data_len = %u\n",
1213 	    __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write);
1214 
1215 	hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND;
1216 	hvs_pkt.chan_pkt_hdr.cph_flags = 0;
1217 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen);
1218 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen);
1219 	hvs_pkt.chan_pkt_hdr.cph_xactid = 0;
1220 
1221 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1;
1222 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write;
1223 
1224 	cbarg.uio = uio;
1225 	cbarg.sb = sb;
1226 
1227 	if (uio && to_write > 0) {
1228 		iov[0].iov_base = &hvs_pkt;
1229 		iov[0].iov_len = hvs_pkthlen;
1230 		iov[1].iov_base = NULL;
1231 		iov[1].iov_len = to_write;
1232 		iov[2].iov_base = &pad;
1233 		iov[2].iov_len = pad_pktlen - hvs_pktlen;
1234 
1235 		error = vmbus_chan_iov_send(chan, iov, 3,
1236 		    hvsock_br_callback, &cbarg);
1237 	} else {
1238 		if (to_write == 0) {
1239 			iov[0].iov_base = &hvs_pkt;
1240 			iov[0].iov_len = hvs_pkthlen;
1241 			iov[1].iov_base = &pad;
1242 			iov[1].iov_len = pad_pktlen - hvs_pktlen;
1243 			error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL);
1244 		}
1245 	}
1246 
1247 	if (error) {
1248 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1249 		    "%s: error = %d\n", __func__, error);
1250 	}
1251 
1252 	return (error);
1253 }
1254 
1255 /*
1256  * Check if we have data on current ring buffer to read
1257  * or not. If not, advance the ring buffer read index to
1258  * next packet. Update the recev_data_len and recev_data_off
1259  * to new value.
1260  * Return the number of bytes can read.
1261  */
1262 static uint32_t
1263 hvsock_canread_check(struct hvs_pcb *pcb)
1264 {
1265 	uint32_t advance;
1266 	uint32_t tlen, hlen, dlen;
1267 	uint32_t bytes_canread = 0;
1268 	int error;
1269 
1270 	if (pcb == NULL || pcb->chan == NULL) {
1271 		pcb->so->so_error = EIO;
1272 		return (0);
1273 	}
1274 
1275 	/* Still have data not read yet on current packet */
1276 	if (pcb->recv_data_len > 0)
1277 		return (pcb->recv_data_len);
1278 
1279 	if (pcb->rb_init)
1280 		advance =
1281 		    VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1282 	else
1283 		advance = 0;
1284 
1285 	bytes_canread = vmbus_chan_read_available(pcb->chan);
1286 
1287 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1288 	    "%s: bytes_canread on br = %u, advance = %u\n",
1289 	    __func__, bytes_canread, advance);
1290 
1291 	if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) {
1292 		/*
1293 		 * Nothing to read. Need to advance the rindex before
1294 		 * calling sbwait, so host knows to wake us up when data
1295 		 * is available to read on rb.
1296 		 */
1297 		error = vmbus_chan_recv_idxadv(pcb->chan, advance);
1298 		if (error) {
1299 			HVSOCK_DBG(HVSOCK_DBG_ERR,
1300 			    "%s: after calling vmbus_chan_recv_idxadv, "
1301 			    "got error = %d\n",  __func__, error);
1302 			return (0);
1303 		} else {
1304 			pcb->rb_init = false;
1305 			pcb->recv_data_len = 0;
1306 			pcb->recv_data_off = 0;
1307 			bytes_canread = vmbus_chan_read_available(pcb->chan);
1308 
1309 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1310 			    "%s: advanced %u bytes, "
1311 			    " bytes_canread on br now = %u\n",
1312 			    __func__, advance, bytes_canread);
1313 
1314 			if (bytes_canread == 0)
1315 				return (0);
1316 			else
1317 				advance = 0;
1318 		}
1319 	}
1320 
1321 	if (bytes_canread <
1322 	    advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t)))
1323 		return (0);
1324 
1325 	error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt,
1326 	    sizeof(struct hvs_pkt_header), advance);
1327 
1328 	/* Don't have anything to read */
1329 	if (error) {
1330 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1331 		    "%s: after calling vmbus_chan_recv_peek, got error = %d\n",
1332 		    __func__, error);
1333 		return (0);
1334 	}
1335 
1336 	/*
1337 	 * We just read in a new packet header. Do some sanity checks.
1338 	 */
1339 	tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1340 	hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen);
1341 	dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size;
1342 	if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) ||
1343 	    __predict_false(hlen > tlen) ||
1344 	    __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) {
1345 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1346 		    "invalid tlen(%u), hlen(%u) or dlen(%u)\n",
1347 		    tlen, hlen, dlen);
1348 		pcb->so->so_error = EIO;
1349 		return (0);
1350 	}
1351 	if (pcb->rb_init == false)
1352 		pcb->rb_init = true;
1353 
1354 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1355 	    "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n",
1356 	    tlen, hlen, dlen);
1357 
1358 	/* The other side has sent a close FIN */
1359 	if (dlen == 0) {
1360 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1361 		    "%s: Received FIN from other side\n", __func__);
1362 		/* inform the caller by seting so_error to ESHUTDOWN */
1363 		pcb->so->so_error = ESHUTDOWN;
1364 	}
1365 
1366 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1367 	    "%s: canread on receive ring is %u \n", __func__, dlen);
1368 
1369 	pcb->recv_data_len = dlen;
1370 	pcb->recv_data_off = 0;
1371 
1372 	return (pcb->recv_data_len);
1373 }
1374 
1375 static uint32_t
1376 hvsock_canwrite_check(struct hvs_pcb *pcb)
1377 {
1378 	uint32_t writeable;
1379 	uint32_t ret;
1380 
1381 	if (pcb == NULL || pcb->chan == NULL)
1382 		return (0);
1383 
1384 	writeable = vmbus_chan_write_available(pcb->chan);
1385 
1386 	/*
1387 	 * We must always reserve a 0-length-payload packet for the FIN.
1388 	 */
1389 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1390 	    "%s: writeable is %u, should be greater than %ju\n",
1391 	    __func__, writeable,
1392 	    (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)));
1393 
1394 	if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) {
1395 		/*
1396 		 * The Tx ring seems full.
1397 		 */
1398 		return (0);
1399 	}
1400 
1401 	ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0);
1402 
1403 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1404 	    "%s: available size is %u\n", __func__, rounddown2(ret, 8));
1405 
1406 	return (rounddown2(ret, 8));
1407 }
1408 
1409 static void
1410 hvsock_set_chan_pending_send_size(struct vmbus_channel *chan)
1411 {
1412 	vmbus_chan_set_pending_send_size(chan,
1413 	    HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ));
1414 }
1415 
1416 static int
1417 hvsock_open_channel(struct vmbus_channel *chan, struct socket *so)
1418 {
1419 	unsigned int rcvbuf, sndbuf;
1420 	struct hvs_pcb *pcb = so2hvspcb(so);
1421 	int ret;
1422 
1423 	if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) {
1424 		sndbuf = HVS_RINGBUF_SND_SIZE;
1425 		rcvbuf = HVS_RINGBUF_RCV_SIZE;
1426 	} else {
1427 		sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE);
1428 		sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE);
1429 		sndbuf = rounddown2(sndbuf, PAGE_SIZE);
1430 		rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE);
1431 		rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE);
1432 		rcvbuf = rounddown2(rcvbuf, PAGE_SIZE);
1433 	}
1434 
1435 	/*
1436 	 * Can only read whatever user provided size of data
1437 	 * from ring buffer. Turn off batched reading.
1438 	 */
1439 	vmbus_chan_set_readbatch(chan, false);
1440 
1441 	ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0,
1442 	    hvsock_chan_cb, pcb);
1443 
1444 	if (ret != 0) {
1445 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1446 		    "%s: failed to open hvsock channel, sndbuf = %u, "
1447 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1448 	} else {
1449 		HVSOCK_DBG(HVSOCK_DBG_INFO,
1450 		    "%s: hvsock channel opened, sndbuf = %u, i"
1451 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1452 		/*
1453 		 * Se the pending send size so to receive wakeup
1454 		 * signals from host when there is enough space on
1455 		 * rx buffer ring to write.
1456 		 */
1457 		hvsock_set_chan_pending_send_size(chan);
1458 	}
1459 
1460 	return ret;
1461 }
1462 
1463 /*
1464  * Guest is listening passively on the socket. Open channel and
1465  * create a new socket for the conneciton.
1466  */
1467 static void
1468 hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so,
1469     struct hvsock_sc *sc)
1470 {
1471 	struct socket *new_so;
1472 	struct hvs_pcb *new_pcb, *pcb;
1473 	int error;
1474 
1475 	/* Do nothing if socket is not listening */
1476 	if ((so->so_options & SO_ACCEPTCONN) == 0) {
1477 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1478 		    "%s: socket is not a listening one\n", __func__);
1479 		return;
1480 	}
1481 
1482 	/*
1483 	 * Create a new socket. This will call pru_attach to complete
1484 	 * the socket initialization and put the new socket onto
1485 	 * listening socket's sol_incomp list, waiting to be promoted
1486 	 * to sol_comp list.
1487 	 * The new socket created has ref count 0. There is no other
1488 	 * thread that changes the state of this new one at the
1489 	 * moment, so we don't need to hold its lock while opening
1490 	 * channel and filling out its pcb information.
1491 	 */
1492 	new_so = sonewconn(so, 0);
1493 	if (!new_so)
1494 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1495 		    "%s: creating new socket failed\n", __func__);
1496 
1497 	/*
1498 	 * Now open the vmbus channel. If it fails, the socket will be
1499 	 * on the listening socket's sol_incomp queue until it is
1500 	 * replaced and aborted.
1501 	 */
1502 	error = hvsock_open_channel(chan, new_so);
1503 	if (error) {
1504 		new_so->so_error = error;
1505 		return;
1506 	}
1507 
1508 	pcb = so->so_pcb;
1509 	new_pcb = new_so->so_pcb;
1510 
1511 	hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port);
1512 	/* Remote port is unknown to guest in this type of conneciton */
1513 	hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN);
1514 	new_pcb->chan = chan;
1515 	new_pcb->recv_data_len = 0;
1516 	new_pcb->recv_data_off = 0;
1517 	new_pcb->rb_init = false;
1518 
1519 	new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan);
1520 	new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan);
1521 
1522 	hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED);
1523 
1524 	sc->pcb = new_pcb;
1525 
1526 	/*
1527 	 * Change the socket state to SS_ISCONNECTED. This will promote
1528 	 * the socket to sol_comp queue and wake up the thread which
1529 	 * is accepting connection.
1530 	 */
1531 	soisconnected(new_so);
1532 }
1533 
1534 
1535 /*
1536  * Guest is actively connecting to host.
1537  */
1538 static void
1539 hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so)
1540 {
1541 	struct hvs_pcb *pcb;
1542 	int error;
1543 
1544 	error = hvsock_open_channel(chan, so);
1545 	if (error) {
1546 		so->so_error = error;
1547 		return;
1548 	}
1549 
1550 	pcb = so->so_pcb;
1551 	pcb->chan = chan;
1552 	pcb->recv_data_len = 0;
1553 	pcb->recv_data_off = 0;
1554 	pcb->rb_init = false;
1555 
1556 	mtx_lock(&hvs_trans_socks_mtx);
1557 	__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
1558 	__hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED);
1559 	mtx_unlock(&hvs_trans_socks_mtx);
1560 
1561 	/*
1562 	 * Change the socket state to SS_ISCONNECTED. This will wake up
1563 	 * the thread sleeping in connect call.
1564 	 */
1565 	soisconnected(so);
1566 }
1567 
1568 static void
1569 hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc)
1570 {
1571 	struct hyperv_guid *inst_guid, *type_guid;
1572 	bool conn_from_host;
1573 	struct sockaddr_hvs addr;
1574 	struct socket *so;
1575 	struct hvs_pcb *pcb;
1576 
1577 	type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan);
1578 	inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan);
1579 	conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan);
1580 
1581 	HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is ");
1582 	hvsock_print_guid(type_guid);
1583 	HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is ");
1584 	hvsock_print_guid(inst_guid);
1585 	HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n",
1586 	    (conn_from_host == true ) ? "from" : "to");
1587 
1588 	/*
1589 	 * The listening port should be in [0, MAX_LISTEN_PORT]
1590 	 */
1591 	if (!is_valid_srv_id(type_guid))
1592 		return;
1593 
1594 	/*
1595 	 * There should be a bound socket already created no matter
1596 	 * it is a passive or active connection.
1597 	 * For host initiated connection (passive on guest side),
1598 	 * the  type_guid contains the port which guest is bound and
1599 	 * listening.
1600 	 * For the guest initiated connection (active on guest side),
1601 	 * the inst_guid contains the port that guest has auto bound
1602 	 * to.
1603 	 */
1604 	hvs_addr_init(&addr, conn_from_host ? type_guid : inst_guid);
1605 	so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND);
1606 	if (!so) {
1607 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1608 		    "%s: no bound socket found for port %u\n",
1609 		    __func__, addr.hvs_port);
1610 		return;
1611 	}
1612 
1613 	if (conn_from_host) {
1614 		hvsock_open_conn_passive(chan, so, sc);
1615 	} else {
1616 		(void) hvs_trans_lock();
1617 		pcb = so->so_pcb;
1618 		if (pcb && pcb->so) {
1619 			sc->pcb = so2hvspcb(so);
1620 			hvsock_open_conn_active(chan, so);
1621 		} else {
1622 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1623 			    "%s: channel detached before open\n", __func__);
1624 		}
1625 		hvs_trans_unlock();
1626 	}
1627 
1628 }
1629 
1630 static int
1631 hvsock_probe(device_t dev)
1632 {
1633 	struct vmbus_channel *channel = vmbus_get_channel(dev);
1634 
1635 	if (!channel || !vmbus_chan_is_hvs(channel)) {
1636 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1637 		    "hvsock_probe called but not a hvsock channel id %u\n",
1638 		    vmbus_chan_id(channel));
1639 
1640 		return ENXIO;
1641 	} else {
1642 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1643 		    "hvsock_probe got a hvsock channel id %u\n",
1644 		    vmbus_chan_id(channel));
1645 
1646 		return BUS_PROBE_DEFAULT;
1647 	}
1648 }
1649 
1650 static int
1651 hvsock_attach(device_t dev)
1652 {
1653 	struct vmbus_channel *channel = vmbus_get_channel(dev);
1654 	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1655 
1656 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n");
1657 
1658 	hvsock_open_connection(channel, sc);
1659 
1660 	/*
1661 	 * Always return success. On error the host will rescind the device
1662 	 * in 30 seconds and we can do cleanup at that time in
1663 	 * vmbus_chan_msgproc_chrescind().
1664 	 */
1665 	return (0);
1666 }
1667 
1668 static int
1669 hvsock_detach(device_t dev)
1670 {
1671 	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1672 	struct socket *so;
1673 	int error, retry;
1674 
1675 	if (bootverbose)
1676 		device_printf(dev, "hvsock_detach called.\n");
1677 
1678 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n");
1679 
1680 	if (sc->pcb != NULL) {
1681 		(void) hvs_trans_lock();
1682 
1683 		so = hsvpcb2so(sc->pcb);
1684 		if (so) {
1685 			/* Close the connection */
1686 			if (so->so_state &
1687 			    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
1688 				soisdisconnected(so);
1689 		}
1690 
1691 		mtx_lock(&hvs_trans_socks_mtx);
1692 		__hvs_remove_pcb_from_list(sc->pcb,
1693 		    HVS_LIST_BOUND | HVS_LIST_CONNECTED);
1694 		mtx_unlock(&hvs_trans_socks_mtx);
1695 
1696 		/*
1697 		 * Close channel while no reader and sender are working
1698 		 * on the buffer rings.
1699 		 */
1700 		if (so) {
1701 			retry = 0;
1702 			while ((error = sblock(&so->so_rcv, 0)) ==
1703 			    EWOULDBLOCK) {
1704 				/*
1705 				 * Someone is reading, rx br is busy
1706 				 */
1707 				soisdisconnected(so);
1708 				DELAY(500);
1709 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1710 				    "waiting for rx reader to exit, "
1711 				    "retry = %d\n", retry++);
1712 			}
1713 			retry = 0;
1714 			while ((error = sblock(&so->so_snd, 0)) ==
1715 			    EWOULDBLOCK) {
1716 				/*
1717 				 * Someone is sending, tx br is busy
1718 				 */
1719 				soisdisconnected(so);
1720 				DELAY(500);
1721 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1722 				    "waiting for tx sender to exit, "
1723 				    "retry = %d\n", retry++);
1724 			}
1725 		}
1726 
1727 
1728 		bzero(sc->pcb, sizeof(struct hvs_pcb));
1729 		free(sc->pcb, M_HVSOCK);
1730 		sc->pcb = NULL;
1731 
1732 		if (so) {
1733 			sbunlock(&so->so_rcv);
1734 			sbunlock(&so->so_snd);
1735 			so->so_pcb = NULL;
1736 		}
1737 
1738 		hvs_trans_unlock();
1739 	}
1740 
1741 	vmbus_chan_close(vmbus_get_channel(dev));
1742 
1743 	return (0);
1744 }
1745 
1746 static device_method_t hvsock_methods[] = {
1747 	/* Device interface */
1748 	DEVMETHOD(device_probe, hvsock_probe),
1749 	DEVMETHOD(device_attach, hvsock_attach),
1750 	DEVMETHOD(device_detach, hvsock_detach),
1751 	DEVMETHOD_END
1752 };
1753 
1754 static driver_t hvsock_driver = {
1755 	"hv_sock",
1756 	hvsock_methods,
1757 	sizeof(struct hvsock_sc)
1758 };
1759 
1760 static devclass_t hvsock_devclass;
1761 
1762 DRIVER_MODULE(hvsock, vmbus, hvsock_driver, hvsock_devclass, NULL, NULL);
1763 MODULE_VERSION(hvsock, 1);
1764 MODULE_DEPEND(hvsock, vmbus, 1, 1, 1);
1765