xref: /freebsd/sys/dev/hyperv/hvsock/hv_sock.c (revision 3ff01b231dfa83d518854c63e7c9cd1debd1139e)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Microsoft Corp.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/bus.h>
34 #include <sys/domain.h>
35 #include <sys/lock.h>
36 #include <sys/kernel.h>
37 #include <sys/types.h>
38 #include <sys/malloc.h>
39 #include <sys/module.h>
40 #include <sys/mutex.h>
41 #include <sys/proc.h>
42 #include <sys/protosw.h>
43 #include <sys/socket.h>
44 #include <sys/sysctl.h>
45 #include <sys/sysproto.h>
46 #include <sys/systm.h>
47 #include <sys/sockbuf.h>
48 #include <sys/sx.h>
49 #include <sys/uio.h>
50 
51 #include <net/vnet.h>
52 
53 #include <dev/hyperv/vmbus/vmbus_reg.h>
54 
55 #include "hv_sock.h"
56 
57 #define HVSOCK_DBG_NONE			0x0
58 #define HVSOCK_DBG_INFO			0x1
59 #define HVSOCK_DBG_ERR			0x2
60 #define HVSOCK_DBG_VERBOSE		0x3
61 
62 
63 SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket");
64 
65 static int hvs_dbg_level;
66 SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level,
67     0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose");
68 
69 
70 #define HVSOCK_DBG(level, ...) do {					\
71 	if (hvs_dbg_level >= (level))					\
72 		printf(__VA_ARGS__);					\
73 	} while (0)
74 
75 MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures");
76 
77 /* The MTU is 16KB per host side's design */
78 #define HVSOCK_MTU_SIZE		(1024 * 16)
79 #define HVSOCK_SEND_BUF_SZ	(PAGE_SIZE - sizeof(struct vmpipe_proto_header))
80 
81 #define HVSOCK_HEADER_LEN	(sizeof(struct hvs_pkt_header))
82 
83 #define HVSOCK_PKT_LEN(payload_len)	(HVSOCK_HEADER_LEN + \
84 					 roundup2(payload_len, 8) + \
85 					 sizeof(uint64_t))
86 
87 
88 static struct domain		hv_socket_domain;
89 
90 /*
91  * HyperV Transport sockets
92  */
93 static struct pr_usrreqs	hvs_trans_usrreqs = {
94 	.pru_attach =		hvs_trans_attach,
95 	.pru_bind =		hvs_trans_bind,
96 	.pru_listen =		hvs_trans_listen,
97 	.pru_accept =		hvs_trans_accept,
98 	.pru_connect =		hvs_trans_connect,
99 	.pru_peeraddr =		hvs_trans_peeraddr,
100 	.pru_sockaddr =		hvs_trans_sockaddr,
101 	.pru_soreceive =	hvs_trans_soreceive,
102 	.pru_sosend =		hvs_trans_sosend,
103 	.pru_disconnect =	hvs_trans_disconnect,
104 	.pru_close =		hvs_trans_close,
105 	.pru_detach =		hvs_trans_detach,
106 	.pru_shutdown =		hvs_trans_shutdown,
107 	.pru_abort =		hvs_trans_abort,
108 };
109 
110 /*
111  * Definitions of protocols supported in HyperV socket domain
112  */
113 static struct protosw		hv_socket_protosw[] = {
114 {
115 	.pr_type =		SOCK_STREAM,
116 	.pr_domain =		&hv_socket_domain,
117 	.pr_protocol =		HYPERV_SOCK_PROTO_TRANS,
118 	.pr_flags =		PR_CONNREQUIRED,
119 	.pr_init =		hvs_trans_init,
120 	.pr_usrreqs =		&hvs_trans_usrreqs,
121 },
122 };
123 
124 static struct domain		hv_socket_domain = {
125 	.dom_family =		AF_HYPERV,
126 	.dom_name =		"hyperv",
127 	.dom_protosw =		hv_socket_protosw,
128 	.dom_protoswNPROTOSW =	&hv_socket_protosw[nitems(hv_socket_protosw)]
129 };
130 
131 VNET_DOMAIN_SET(hv_socket_);
132 
133 #define MAX_PORT			((uint32_t)0xFFFFFFFF)
134 #define MIN_PORT			((uint32_t)0x0)
135 
136 /* 00000000-facb-11e6-bd58-64006a7986d3 */
137 static const struct hyperv_guid srv_id_template = {
138 	.hv_guid = {
139 	    0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11,
140 	    0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 }
141 };
142 
143 static int		hvsock_br_callback(void *, int, void *);
144 static uint32_t		hvsock_canread_check(struct hvs_pcb *);
145 static uint32_t		hvsock_canwrite_check(struct hvs_pcb *);
146 static int		hvsock_send_data(struct vmbus_channel *chan,
147     struct uio *uio, uint32_t to_write, struct sockbuf *sb);
148 
149 
150 
151 /* Globals */
152 static struct sx		hvs_trans_socks_sx;
153 static struct mtx		hvs_trans_socks_mtx;
154 static LIST_HEAD(, hvs_pcb)	hvs_trans_bound_socks;
155 static LIST_HEAD(, hvs_pcb)	hvs_trans_connected_socks;
156 static uint32_t			previous_auto_bound_port;
157 
158 static void
159 hvsock_print_guid(struct hyperv_guid *guid)
160 {
161 	unsigned char *p = (unsigned char *)guid;
162 
163 	HVSOCK_DBG(HVSOCK_DBG_INFO,
164 	    "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n",
165 	    *(unsigned int *)p,
166 	    *((unsigned short *) &p[4]),
167 	    *((unsigned short *) &p[6]),
168 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
169 }
170 
171 static bool
172 is_valid_srv_id(const struct hyperv_guid *id)
173 {
174 	return !memcmp(&id->hv_guid[4],
175 	    &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4);
176 }
177 
/*
 * Extract the port number stored in the leading bytes of a service GUID.
 *
 * The value is copied out with memcpy() instead of dereferencing a cast
 * pointer: the GUID is byte-array storage, so a direct unsigned int load
 * could be misaligned and would violate the aliasing rules.
 */
static unsigned int
get_port_by_srv_id(const struct hyperv_guid *srv_id)
{
	unsigned int port;

	memcpy(&port, srv_id, sizeof(port));
	return (port);
}
183 
/*
 * Store 'port' into the leading bytes of a service GUID, leaving the rest
 * of the GUID untouched.
 *
 * memcpy() is used instead of a store through a cast pointer so that the
 * byte-array GUID storage is never accessed as a (possibly misaligned)
 * unsigned int object.
 */
static void
set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port)
{
	memcpy(srv_id, &port, sizeof(port));
}
189 
190 
191 static void
192 __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list)
193 {
194 	struct hvs_pcb *p = NULL;
195 
196 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
197 
198 	if (!pcb)
199 		return;
200 
201 	if (list & HVS_LIST_BOUND) {
202 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
203 			if  (p == pcb)
204 				LIST_REMOVE(p, bound_next);
205 	}
206 
207 	if (list & HVS_LIST_CONNECTED) {
208 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
209 			if (p == pcb)
210 				LIST_REMOVE(pcb, connected_next);
211 	}
212 }
213 
214 static void
215 __hvs_remove_socket_from_list(struct socket *so, unsigned char list)
216 {
217 	struct hvs_pcb *pcb = so2hvspcb(so);
218 
219 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
220 
221 	__hvs_remove_pcb_from_list(pcb, list);
222 }
223 
224 static void
225 __hvs_insert_socket_on_list(struct socket *so, unsigned char list)
226 {
227 	struct hvs_pcb *pcb = so2hvspcb(so);
228 
229 	if (list & HVS_LIST_BOUND)
230 		LIST_INSERT_HEAD(&hvs_trans_bound_socks,
231 		   pcb, bound_next);
232 
233 	if (list & HVS_LIST_CONNECTED)
234 		LIST_INSERT_HEAD(&hvs_trans_connected_socks,
235 		   pcb, connected_next);
236 }
237 
238 void
239 hvs_remove_socket_from_list(struct socket *so, unsigned char list)
240 {
241 	if (!so || !so->so_pcb) {
242 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
243 		    "%s: socket or so_pcb is null\n", __func__);
244 		return;
245 	}
246 
247 	mtx_lock(&hvs_trans_socks_mtx);
248 	__hvs_remove_socket_from_list(so, list);
249 	mtx_unlock(&hvs_trans_socks_mtx);
250 }
251 
252 static void
253 hvs_insert_socket_on_list(struct socket *so, unsigned char list)
254 {
255 	if (!so || !so->so_pcb) {
256 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
257 		    "%s: socket or so_pcb is null\n", __func__);
258 		return;
259 	}
260 
261 	mtx_lock(&hvs_trans_socks_mtx);
262 	__hvs_insert_socket_on_list(so, list);
263 	mtx_unlock(&hvs_trans_socks_mtx);
264 }
265 
266 static struct socket *
267 __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
268 {
269 	struct hvs_pcb *p = NULL;
270 
271 	if (list & HVS_LIST_BOUND)
272 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
273 			if (p->so != NULL &&
274 			    addr->hvs_port == p->local_addr.hvs_port)
275 				return p->so;
276 
277 	if (list & HVS_LIST_CONNECTED)
278 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
279 			if (p->so != NULL &&
280 			    addr->hvs_port == p->local_addr.hvs_port)
281 				return p->so;
282 
283 	return NULL;
284 }
285 
286 static struct socket *
287 hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
288 {
289 	struct socket *s = NULL;
290 
291 	mtx_lock(&hvs_trans_socks_mtx);
292 	s = __hvs_find_socket_on_list(addr, list);
293 	mtx_unlock(&hvs_trans_socks_mtx);
294 
295 	return s;
296 }
297 
298 static inline void
299 hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port)
300 {
301 	memset(addr, 0, sizeof(*addr));
302 	addr->sa_family = AF_HYPERV;
303 	addr->sa_len = sizeof(*addr);
304 	addr->hvs_port = port;
305 }
306 
/*
 * Initialize 'addr' from a service GUID: the port is taken from the GUID
 * via get_port_by_srv_id() and the address is then built by hvs_addr_set().
 */
void
hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id)
{
	unsigned int port = get_port_by_srv_id(svr_id);

	hvs_addr_set(addr, port);
}
312 
313 int
314 hvs_trans_lock(void)
315 {
316 	sx_xlock(&hvs_trans_socks_sx);
317 	return (0);
318 }
319 
320 void
321 hvs_trans_unlock(void)
322 {
323 	sx_xunlock(&hvs_trans_socks_sx);
324 }
325 
326 void
327 hvs_trans_init(void)
328 {
329 	/* Skip initialization of globals for non-default instances. */
330 	if (!IS_DEFAULT_VNET(curvnet))
331 		return;
332 
333 	if (vm_guest != VM_GUEST_HV)
334 		return;
335 
336 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
337 	    "%s: HyperV Socket hvs_trans_init called\n", __func__);
338 
339 	/* Initialize Globals */
340 	previous_auto_bound_port = MAX_PORT;
341 	sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx");
342 	mtx_init(&hvs_trans_socks_mtx,
343 	    "hvs_trans_socks_mtx", NULL, MTX_DEF);
344 	LIST_INIT(&hvs_trans_bound_socks);
345 	LIST_INIT(&hvs_trans_connected_socks);
346 }
347 
/*
 * Called in two cases:
 * 1) When user calls socket();
 * 2) When we accept a new incoming connection and call sonewconn().
 */
353 int
354 hvs_trans_attach(struct socket *so, int proto, struct thread *td)
355 {
356 	struct hvs_pcb *pcb = so2hvspcb(so);
357 
358 	if (vm_guest != VM_GUEST_HV)
359 		return (ESOCKTNOSUPPORT);
360 
361 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
362 	    "%s: HyperV Socket hvs_trans_attach called\n", __func__);
363 
364 	if (so->so_type != SOCK_STREAM)
365 		return (ESOCKTNOSUPPORT);
366 
367 	if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS)
368 		return (EPROTONOSUPPORT);
369 
370 	if (pcb != NULL)
371 		return (EISCONN);
372 	pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO);
373 	if (pcb == NULL)
374 		return (ENOMEM);
375 
376 	pcb->so = so;
377 	so->so_pcb = (void *)pcb;
378 
379 	return (0);
380 }
381 
382 void
383 hvs_trans_detach(struct socket *so)
384 {
385 	struct hvs_pcb *pcb;
386 
387 	if (vm_guest != VM_GUEST_HV)
388 		return;
389 
390 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
391 	    "%s: HyperV Socket hvs_trans_detach called\n", __func__);
392 
393 	(void) hvs_trans_lock();
394 	pcb = so2hvspcb(so);
395 	if (pcb == NULL) {
396 		hvs_trans_unlock();
397 		return;
398 	}
399 
400 	if (SOLISTENING(so)) {
401 		bzero(pcb, sizeof(*pcb));
402 		free(pcb, M_HVSOCK);
403 	}
404 
405 	so->so_pcb = NULL;
406 
407 	hvs_trans_unlock();
408 }
409 
410 int
411 hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td)
412 {
413 	struct hvs_pcb *pcb = so2hvspcb(so);
414 	struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr;
415 	int error = 0;
416 
417 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
418 	    "%s: HyperV Socket hvs_trans_bind called\n", __func__);
419 
420 	if (sa == NULL) {
421 		return (EINVAL);
422 	}
423 
424 	if (pcb == NULL) {
425 		return (EINVAL);
426 	}
427 
428 	if (sa->sa_family != AF_HYPERV) {
429 		HVSOCK_DBG(HVSOCK_DBG_ERR,
430 		    "%s: Not supported, sa_family is %u\n",
431 		    __func__, sa->sa_family);
432 		return (EAFNOSUPPORT);
433 	}
434 	if (sa->sa_len != sizeof(*sa)) {
435 		HVSOCK_DBG(HVSOCK_DBG_ERR,
436 		    "%s: Not supported, sa_len is %u\n",
437 		    __func__, sa->sa_len);
438 		return (EINVAL);
439 	}
440 
441 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
442 	    "%s: binding port = 0x%x\n", __func__, sa->hvs_port);
443 
444 	mtx_lock(&hvs_trans_socks_mtx);
445 	if (__hvs_find_socket_on_list(sa,
446 	    HVS_LIST_BOUND | HVS_LIST_CONNECTED)) {
447 		error = EADDRINUSE;
448 	} else {
449 		/*
450 		 * The address is available for us to bind.
451 		 * Add socket to the bound list.
452 		 */
453 		hvs_addr_set(&pcb->local_addr, sa->hvs_port);
454 		hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY);
455 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
456 	}
457 	mtx_unlock(&hvs_trans_socks_mtx);
458 
459 	return (error);
460 }
461 
462 int
463 hvs_trans_listen(struct socket *so, int backlog, struct thread *td)
464 {
465 	struct hvs_pcb *pcb = so2hvspcb(so);
466 	struct socket *bound_so;
467 	int error;
468 
469 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
470 	    "%s: HyperV Socket hvs_trans_listen called\n", __func__);
471 
472 	if (pcb == NULL)
473 		return (EINVAL);
474 
475 	/* Check if the address is already bound and it was by us. */
476 	bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND);
477 	if (bound_so == NULL || bound_so != so) {
478 		HVSOCK_DBG(HVSOCK_DBG_ERR,
479 		    "%s: Address not bound or not by us.\n", __func__);
480 		return (EADDRNOTAVAIL);
481 	}
482 
483 	SOCK_LOCK(so);
484 	error = solisten_proto_check(so);
485 	if (error == 0)
486 		solisten_proto(so, backlog);
487 	SOCK_UNLOCK(so);
488 
489 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
490 	    "%s: HyperV Socket listen error = %d\n", __func__, error);
491 	return (error);
492 }
493 
494 int
495 hvs_trans_accept(struct socket *so, struct sockaddr **nam)
496 {
497 	struct hvs_pcb *pcb = so2hvspcb(so);
498 
499 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
500 	    "%s: HyperV Socket hvs_trans_accept called\n", __func__);
501 
502 	if (pcb == NULL)
503 		return (EINVAL);
504 
505 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr,
506 	    M_NOWAIT);
507 
508 	return ((*nam == NULL) ? ENOMEM : 0);
509 }
510 
511 int
512 hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
513 {
514 	struct hvs_pcb *pcb = so2hvspcb(so);
515 	struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam;
516 	bool found_auto_bound_port = false;
517 	int i, error = 0;
518 
519 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
520 	    "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n",
521 	    __func__, raddr->hvs_port);
522 
523 	if (pcb == NULL)
524 		return (EINVAL);
525 
526 	/* Verify the remote address */
527 	if (raddr == NULL)
528 		return (EINVAL);
529 	if (raddr->sa_family != AF_HYPERV)
530 		return (EAFNOSUPPORT);
531 	if (raddr->sa_len != sizeof(*raddr))
532 		return (EINVAL);
533 
534 	mtx_lock(&hvs_trans_socks_mtx);
535 	if (so->so_state &
536 	    (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) {
537 			HVSOCK_DBG(HVSOCK_DBG_ERR,
538 			    "%s: socket connect in progress\n",
539 			    __func__);
540 			error = EINPROGRESS;
541 			goto out;
542 	}
543 
544 	/*
545 	 * Find an available port for us to auto bind the local
546 	 * address.
547 	 */
548 	hvs_addr_set(&pcb->local_addr, 0);
549 
550 	for (i = previous_auto_bound_port - 1;
551 	    i != previous_auto_bound_port; i --) {
552 		if (i == MIN_PORT)
553 			i = MAX_PORT;
554 
555 		pcb->local_addr.hvs_port = i;
556 
557 		if (__hvs_find_socket_on_list(&pcb->local_addr,
558 		    HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) {
559 			found_auto_bound_port = true;
560 			previous_auto_bound_port = i;
561 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
562 			    "%s: found local bound port is %x\n",
563 			    __func__, pcb->local_addr.hvs_port);
564 			break;
565 		}
566 	}
567 
568 	if (found_auto_bound_port == true) {
569 		/* Found available port for auto bound, put on list */
570 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
571 		/* Set VM service ID */
572 		pcb->vm_srv_id = srv_id_template;
573 		set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port);
574 		/* Set host service ID and remote port */
575 		pcb->host_srv_id = srv_id_template;
576 		set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port);
577 		hvs_addr_set(&pcb->remote_addr, raddr->hvs_port);
578 
579 		/* Change the socket state to SS_ISCONNECTING */
580 		soisconnecting(so);
581 	} else {
582 		HVSOCK_DBG(HVSOCK_DBG_ERR,
583 		    "%s: No local port available for auto bound\n",
584 		    __func__);
585 		error = EADDRINUSE;
586 	}
587 
588 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is ");
589 	hvsock_print_guid(&pcb->vm_srv_id);
590 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is ");
591 	hvsock_print_guid(&pcb->host_srv_id);
592 
593 out:
594 	mtx_unlock(&hvs_trans_socks_mtx);
595 
596 	if (found_auto_bound_port == true)
597 		 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id);
598 
599 	return (error);
600 }
601 
602 int
603 hvs_trans_disconnect(struct socket *so)
604 {
605 	struct hvs_pcb *pcb;
606 
607 	if (vm_guest != VM_GUEST_HV)
608 		return (ESOCKTNOSUPPORT);
609 
610 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
611 	    "%s: HyperV Socket hvs_trans_disconnect called\n", __func__);
612 
613 	(void) hvs_trans_lock();
614 	pcb = so2hvspcb(so);
615 	if (pcb == NULL) {
616 		hvs_trans_unlock();
617 		return (EINVAL);
618 	}
619 
620 	/* If socket is already disconnected, skip this */
621 	if ((so->so_state & SS_ISDISCONNECTED) == 0)
622 		soisdisconnecting(so);
623 
624 	hvs_trans_unlock();
625 
626 	return (0);
627 }
628 
/* Map the MSG_DONTWAIT message flag onto the wait flag passed to sblock(). */
#define SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

/*
 * Context handed to hvsock_br_callback() through
 * vmbus_chan_recv_peek_call().
 */
struct hvs_callback_arg {
	struct uio	*uio;	/* caller's I/O descriptor */
	struct sockbuf	*sb;	/* socket buffer being operated on */
};
634 
/*
 * pru_soreceive handler: copy received stream data from the channel into
 * the caller's uio.
 *
 * so       socket to read from; must be SOCK_STREAM with a PCB attached
 * uio      destination; must be a read request with a non-zero residual
 * flagsp   optional MSG_* flags (MSG_EOR is masked off, MSG_PEEK is
 *          rejected)
 * paddr, mp0, controlp
 *          not used by this function
 *
 * Returns 0 on success (possibly after a short read), or:
 *   EINVAL      wrong socket type, no PCB, empty uio or uio not a read
 *   EOPNOTSUPP  MSG_PEEK was requested
 *   EPIPE       receive side is shut and the socket is disconnected
 *   EAGAIN      non-blocking request and nothing could be read
 *   otherwise   an error from sblock(), sbwait(), the channel read, or
 *               so->so_error (ESHUTDOWN itself is not returned)
 *
 * When so->so_error is ESHUTDOWN on exit, the receive side is closed, or
 * the socket is moved to the disconnecting state if the send side is
 * already closed.
 */
int
hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct sockbuf *sb;
	ssize_t orig_resid;
	uint32_t canread, to_read;
	int flags, error = 0;
	struct hvs_callback_arg cbarg;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_soreceive called\n", __func__);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (pcb == NULL)
		return (EINVAL);

	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if (flags & MSG_PEEK)
		return (EOPNOTSUPP);

	/* If no space to copy out anything */
	if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ)
		return (EINVAL);

	sb = &so->so_rcv;

	/* Remembered to tell later whether any data was delivered. */
	orig_resid = uio->uio_resid;

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: sblock returned error = %d\n", __func__, error);
		return (error);
	}

	SOCKBUF_LOCK(sb);

	cbarg.uio = uio;
	cbarg.sb = sb;
	/*
	 * If the socket is closing, there might still be some data
	 * in rx br to read. However we need to make sure
	 * the channel is still open.
	 */
	if ((sb->sb_state & SBS_CANTRCVMORE) &&
	    (so->so_state & SS_ISDISCONNECTED)) {
		/* Other thread already closed the channel */
		error = EPIPE;
		goto out;
	}

	while (true) {
		/* Drain what is currently readable, as far as uio allows. */
		while (uio->uio_resid > 0 &&
		    (canread = hvsock_canread_check(pcb)) > 0) {
			to_read = MIN(canread, uio->uio_resid);
			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
			    "%s: to_read = %u, skip = %u\n", __func__, to_read,
			    (unsigned int)(sizeof(struct hvs_pkt_header) +
			    pcb->recv_data_off));

			/*
			 * Skip the packet header and the part of the payload
			 * already consumed by earlier reads.
			 */
			error = vmbus_chan_recv_peek_call(pcb->chan, to_read,
			    sizeof(struct hvs_pkt_header) + pcb->recv_data_off,
			    hvsock_br_callback, (void *)&cbarg);
			/*
			 * It is possible the socket is disconnected because
			 * we released the lock in hvsock_br_callback. So we
			 * need to check the state to make sure it is not
			 * disconnected.
			 */
			if (error || so->so_state & SS_ISDISCONNECTED) {
				break;
			}

			/* Account for the bytes just handed to the caller. */
			pcb->recv_data_len -= to_read;
			pcb->recv_data_off += to_read;
		}

		if (error)
			break;

		/* Abort if socket has reported problems. */
		if (so->so_error) {
			if (so->so_error == ESHUTDOWN &&
			    orig_resid > uio->uio_resid) {
				/*
				 * Although we got a FIN, we also received
				 * some data in this round. Deliver it
				 * to the user.
				 */
				error = 0;
			} else {
				if (so->so_error != ESHUTDOWN)
					error = so->so_error;
			}

			break;
		}

		/* Cannot receive more. */
		if (sb->sb_state & SBS_CANTRCVMORE)
			break;

		/* We are done if buffer has been filled */
		if (uio->uio_resid == 0)
			break;

		/* Without MSG_WAITALL a partial read is good enough. */
		if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid)
			break;

		/* Buffer ring is empty and we shall not block */
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			if (orig_resid == uio->uio_resid) {
				/* We have not read anything */
				error = EAGAIN;
			}
			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
			    "%s: non blocked read return, error %d.\n",
			    __func__, error);
			break;
		}

		/*
		 * Wait and block until (more) data comes in.
		 * Note: Drops the sockbuf lock during wait.
		 */
		error = sbwait(sb);

		if (error)
			break;

		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: wake up from sbwait, read available is %u\n",
		    __func__, vmbus_chan_read_available(pcb->chan));
	}

out:
	SOCKBUF_UNLOCK(sb);

	sbunlock(sb);

	/* We received a FIN in this call */
	if (so->so_error == ESHUTDOWN) {
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			/* Send has already closed */
			soisdisconnecting(so);
		} else {
			/* Just close the receive side */
			socantrcvmore(so);
		}
	}

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: returning error = %d, so_error = %d\n",
	    __func__, error, so->so_error);

	return (error);
}
801 
/*
 * pru_sosend handler: write the data described by 'uio' to the channel.
 *
 * so     socket to write to; must be SOCK_STREAM with a PCB attached
 * uio    source; must be a write request with a non-zero residual
 * flags  MSG_* flags; MSG_DONTWAIT / MSG_NBIO select non-blocking behavior
 * addr, top, controlp, td
 *        not used by this function
 *
 * Data is sent in pieces of at most HVSOCK_SEND_BUF_SZ bytes, each limited
 * by the space currently writable on the channel.  If the ring fills up
 * after some data has been sent, the function returns with a short write
 * instead of waiting.
 *
 * Returns 0 on success, or:
 *   EINVAL       wrong socket type, no PCB, empty uio or uio not a write
 *   EPIPE        send side is shut or so_error is ESHUTDOWN
 *   EWOULDBLOCK  non-blocking request and nothing could be sent
 *   otherwise    an error from sblock(), sbwait() or hvsock_send_data()
 */
int
hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct sockbuf *sb;
	ssize_t orig_resid;
	uint32_t canwrite, to_write;
	int error = 0;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n",
	    __func__, uio->uio_resid);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (pcb == NULL)
		return (EINVAL);

	/* If nothing to send */
	if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE)
		return (EINVAL);

	sb = &so->so_snd;

	/* Remembered to tell later whether any data was sent. */
	orig_resid = uio->uio_resid;

	/* Prevent other writers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: sblock returned error = %d\n", __func__, error);
		return (error);
	}

	SOCKBUF_LOCK(sb);

	if ((sb->sb_state & SBS_CANTSENDMORE) ||
	    so->so_error == ESHUTDOWN) {
		error = EPIPE;
		goto out;
	}

	while (uio->uio_resid > 0) {
		canwrite = hvsock_canwrite_check(pcb);
		if (canwrite == 0) {
			/* We have sent some data */
			if (orig_resid > uio->uio_resid)
				break;
			/*
			 * We have not sent any data and it is
			 * non-blocking I/O
			 */
			if (so->so_state & SS_NBIO ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				error = EWOULDBLOCK;
				break;
			} else {
				/*
				 * We are here because there is no space on
				 * the send buffer ring. Signal the other side
				 * to read and free more space.
				 * Sleep until space is available to send.
				 * Note: Drops the sockbuf lock during wait.
				 */
				error = sbwait(sb);

				if (error)
					break;

				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
				    "%s: wake up from sbwait, space avail on "
				    "tx ring is %u\n",
				    __func__,
				    vmbus_chan_write_available(pcb->chan));

				/* Re-evaluate the writable space. */
				continue;
			}
		}
		/* Clamp to writable space, remaining data and chunk size. */
		to_write = MIN(canwrite, uio->uio_resid);
		to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ);

		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: canwrite is %u, to_write = %u\n", __func__,
		    canwrite, to_write);
		error = hvsock_send_data(pcb->chan, uio, to_write, sb);

		if (error)
			break;
	}

out:
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);

	return (error);
}
899 
900 int
901 hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam)
902 {
903 	struct hvs_pcb *pcb = so2hvspcb(so);
904 
905 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
906 	    "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__);
907 
908 	if (pcb == NULL)
909 		return (EINVAL);
910 
911 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT);
912 
913 	return ((*nam == NULL)? ENOMEM : 0);
914 }
915 
916 int
917 hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam)
918 {
919 	struct hvs_pcb *pcb = so2hvspcb(so);
920 
921 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
922 	    "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__);
923 
924 	if (pcb == NULL)
925 		return (EINVAL);
926 
927 	*nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT);
928 
929 	return ((*nam == NULL)? ENOMEM : 0);
930 }
931 
932 void
933 hvs_trans_close(struct socket *so)
934 {
935 	struct hvs_pcb *pcb;
936 
937 	if (vm_guest != VM_GUEST_HV)
938 		return;
939 
940 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
941 	    "%s: HyperV Socket hvs_trans_close called\n", __func__);
942 
943 	(void) hvs_trans_lock();
944 	pcb = so2hvspcb(so);
945 	if (!pcb) {
946 		hvs_trans_unlock();
947 		return;
948 	}
949 
950 	if (so->so_state & SS_ISCONNECTED) {
951 		/* Send a FIN to peer */
952 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
953 		    "%s: hvs_trans_close sending a FIN to host\n", __func__);
954 		(void) hvsock_send_data(pcb->chan, NULL, 0, NULL);
955 	}
956 
957 	if (so->so_state &
958 	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
959 		soisdisconnected(so);
960 
961 	pcb->chan = NULL;
962 	pcb->so = NULL;
963 
964 	if (SOLISTENING(so)) {
965 		mtx_lock(&hvs_trans_socks_mtx);
966 		/* Remove from bound list */
967 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
968 		mtx_unlock(&hvs_trans_socks_mtx);
969 	}
970 
971 	hvs_trans_unlock();
972 
973 	return;
974 }
975 
976 void
977 hvs_trans_abort(struct socket *so)
978 {
979 	struct hvs_pcb *pcb = so2hvspcb(so);
980 
981 	if (vm_guest != VM_GUEST_HV)
982 		return;
983 
984 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
985 	    "%s: HyperV Socket hvs_trans_abort called\n", __func__);
986 
987 	(void) hvs_trans_lock();
988 	if (pcb == NULL) {
989 		hvs_trans_unlock();
990 		return;
991 	}
992 
993 	if (SOLISTENING(so)) {
994 		mtx_lock(&hvs_trans_socks_mtx);
995 		/* Remove from bound list */
996 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
997 		mtx_unlock(&hvs_trans_socks_mtx);
998 	}
999 
1000 	if (so->so_state & SS_ISCONNECTED) {
1001 		(void) sodisconnect(so);
1002 	}
1003 	hvs_trans_unlock();
1004 
1005 	return;
1006 }
1007 
1008 int
1009 hvs_trans_shutdown(struct socket *so)
1010 {
1011 	struct hvs_pcb *pcb = so2hvspcb(so);
1012 	struct sockbuf *sb;
1013 
1014 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1015 	    "%s: HyperV Socket hvs_trans_shutdown called\n", __func__);
1016 
1017 	if (pcb == NULL)
1018 		return (EINVAL);
1019 
1020 	/*
1021 	 * Only get called with the shutdown method is SHUT_WR or
1022 	 * SHUT_RDWR.
1023 	 * When the method is SHUT_RD or SHUT_RDWR, the caller
1024 	 * already set the SBS_CANTRCVMORE on receive side socket
1025 	 * buffer.
1026 	 */
1027 	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1028 		/*
1029 		 * SHUT_WR only case.
1030 		 * Receive side is still open. Just close
1031 		 * the send side.
1032 		 */
1033 		socantsendmore(so);
1034 	} else {
1035 		/* SHUT_RDWR case */
1036 		if (so->so_state & SS_ISCONNECTED) {
1037 			/* Send a FIN to peer */
1038 			sb = &so->so_snd;
1039 			SOCKBUF_LOCK(sb);
1040 			(void) hvsock_send_data(pcb->chan, NULL, 0, sb);
1041 			SOCKBUF_UNLOCK(sb);
1042 
1043 			soisdisconnecting(so);
1044 		}
1045 	}
1046 
1047 	return (0);
1048 }
1049 
1050 /* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is
1051  * <port> (see struct sockaddr_hvs).
1052  *
1053  * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV:
1054  * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-
1055  * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with
1056  * the below sockaddr:
1057  *
1058  * struct SOCKADDR_HV
1059  * {
1060  *    ADDRESS_FAMILY Family;
1061  *    USHORT Reserved;
1062  *    GUID VmId;
1063  *    GUID ServiceId;
1064  * };
1065  * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via
1066  * VMBus, because here it's obvious the host and the VM can easily identify
1067  * each other. Though the VmID is useful on the host, especially in the case
1068  * of Windows container, FreeBSD VM doesn't need it at all.
1069  *
1070  * To be compatible with similar infrastructure in Linux VMs, we have
1071  * to limit the available GUID space of SOCKADDR_HV so that we can create
1072  * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID.
1073  * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
1074  *
1075  ****************************************************************************
1076  * The only valid Service GUIDs, from the perspectives of both the host and *
1077  * FreeBSD VM, that can be connected by the other end, must conform to this *
1078  * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
1079  ****************************************************************************
1080  *
1081  * When we write apps on the host to connect(), the GUID ServiceID is used.
1082  * When we write apps in FreeBSD VM to connect(), we only need to specify the
1083  * port and the driver will form the GUID and use that to request the host.
1084  *
 * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
 * auto-generated remote port for a connect request initiated by the host's
 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
 * FreeBSD guest.
1089  */
1090 
1091 /*
1092  * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
1093  * restricts HyperV socket ring buffer size to six 4K pages. Newer
1094  * HyperV hosts doen't have this limit.
1095  */
1096 #define HVS_RINGBUF_RCV_SIZE	(PAGE_SIZE * 6)
1097 #define HVS_RINGBUF_SND_SIZE	(PAGE_SIZE * 6)
1098 #define HVS_RINGBUF_MAX_SIZE	(PAGE_SIZE * 64)
1099 
1100 struct hvsock_sc {
1101 	device_t		dev;
1102 	struct hvs_pcb		*pcb;
1103 	struct vmbus_channel	*channel;
1104 };
1105 
/*
 * Return true when the channel holds at least as many readable bytes as a
 * packet with an empty payload occupies (HVSOCK_PKT_LEN(0)).
 */
static bool
hvsock_chan_readable(struct vmbus_channel *chan)
{
	const uint32_t avail = vmbus_chan_read_available(chan);

	if (avail >= HVSOCK_PKT_LEN(0))
		return (true);
	return (false);
}
1113 
1114 static void
1115 hvsock_chan_cb(struct vmbus_channel *chan, void *context)
1116 {
1117 	struct hvs_pcb *pcb = (struct hvs_pcb *) context;
1118 	struct socket *so;
1119 	uint32_t canwrite;
1120 
1121 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1122 	    "%s: host send us a wakeup on rb data, pcb = %p\n",
1123 	    __func__, pcb);
1124 
1125 	/*
1126 	 * Check if the socket is still attached and valid.
1127 	 * Here we know channel is still open. Need to make
1128 	 * sure the socket has not been closed or freed.
1129 	 */
1130 	(void) hvs_trans_lock();
1131 	so = hsvpcb2so(pcb);
1132 
1133 	if (pcb->chan != NULL && so != NULL) {
1134 		/*
1135 		 * Wake up reader if there are data to read.
1136 		 */
1137 		SOCKBUF_LOCK(&(so)->so_rcv);
1138 
1139 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1140 		    "%s: read available = %u\n", __func__,
1141 		    vmbus_chan_read_available(pcb->chan));
1142 
1143 		if (hvsock_chan_readable(pcb->chan))
1144 			sorwakeup_locked(so);
1145 		else
1146 			SOCKBUF_UNLOCK(&(so)->so_rcv);
1147 
1148 		/*
1149 		 * Wake up sender if space becomes available to write.
1150 		 */
1151 		SOCKBUF_LOCK(&(so)->so_snd);
1152 		canwrite = hvsock_canwrite_check(pcb);
1153 
1154 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1155 		    "%s: canwrite = %u\n", __func__, canwrite);
1156 
1157 		if (canwrite > 0) {
1158 			sowwakeup_locked(so);
1159 		} else {
1160 			SOCKBUF_UNLOCK(&(so)->so_snd);
1161 		}
1162 	}
1163 
1164 	hvs_trans_unlock();
1165 
1166 	return;
1167 }
1168 
1169 static int
1170 hvsock_br_callback(void *datap, int cplen, void *cbarg)
1171 {
1172 	struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg;
1173 	struct uio *uio = arg->uio;
1174 	struct sockbuf *sb = arg->sb;
1175 	int error = 0;
1176 
1177 	if (cbarg == NULL || datap == NULL)
1178 		return (EINVAL);
1179 
1180 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1181 	    "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, "
1182 	    "datap = %p\n",
1183 	    __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br",
1184 	    uio->uio_resid, cplen, datap);
1185 
1186 	if (sb)
1187 		SOCKBUF_UNLOCK(sb);
1188 
1189 	error = uiomove(datap, cplen, uio);
1190 
1191 	if (sb)
1192 		SOCKBUF_LOCK(sb);
1193 
1194 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1195 	    "%s: after uiomove, uio_resid = %zd, error = %d\n",
1196 	    __func__, uio->uio_resid, error);
1197 
1198 	return (error);
1199 }
1200 
1201 static int
1202 hvsock_send_data(struct vmbus_channel *chan, struct uio *uio,
1203     uint32_t to_write, struct sockbuf *sb)
1204 {
1205 	struct hvs_pkt_header hvs_pkt;
1206 	int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0;
1207 	uint64_t pad = 0;
1208 	struct iovec iov[3];
1209 	struct hvs_callback_arg cbarg;
1210 
1211 	if (chan == NULL)
1212 		return (ENOTCONN);
1213 
1214 	hlen = sizeof(struct vmbus_chanpkt_hdr);
1215 	hvs_pkthlen = sizeof(struct hvs_pkt_header);
1216 	hvs_pktlen = hvs_pkthlen + to_write;
1217 	pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen);
1218 
1219 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1220 	    "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, "
1221 	    "pad_pktlen = %u, data_len = %u\n",
1222 	    __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write);
1223 
1224 	hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND;
1225 	hvs_pkt.chan_pkt_hdr.cph_flags = 0;
1226 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen);
1227 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen);
1228 	hvs_pkt.chan_pkt_hdr.cph_xactid = 0;
1229 
1230 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1;
1231 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write;
1232 
1233 	cbarg.uio = uio;
1234 	cbarg.sb = sb;
1235 
1236 	if (uio && to_write > 0) {
1237 		iov[0].iov_base = &hvs_pkt;
1238 		iov[0].iov_len = hvs_pkthlen;
1239 		iov[1].iov_base = NULL;
1240 		iov[1].iov_len = to_write;
1241 		iov[2].iov_base = &pad;
1242 		iov[2].iov_len = pad_pktlen - hvs_pktlen;
1243 
1244 		error = vmbus_chan_iov_send(chan, iov, 3,
1245 		    hvsock_br_callback, &cbarg);
1246 	} else {
1247 		if (to_write == 0) {
1248 			iov[0].iov_base = &hvs_pkt;
1249 			iov[0].iov_len = hvs_pkthlen;
1250 			iov[1].iov_base = &pad;
1251 			iov[1].iov_len = pad_pktlen - hvs_pktlen;
1252 			error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL);
1253 		}
1254 	}
1255 
1256 	if (error) {
1257 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1258 		    "%s: error = %d\n", __func__, error);
1259 	}
1260 
1261 	return (error);
1262 }
1263 
1264 /*
1265  * Check if we have data on current ring buffer to read
1266  * or not. If not, advance the ring buffer read index to
1267  * next packet. Update the recev_data_len and recev_data_off
1268  * to new value.
1269  * Return the number of bytes can read.
1270  */
1271 static uint32_t
1272 hvsock_canread_check(struct hvs_pcb *pcb)
1273 {
1274 	uint32_t advance;
1275 	uint32_t tlen, hlen, dlen;
1276 	uint32_t bytes_canread = 0;
1277 	int error;
1278 
1279 	if (pcb == NULL || pcb->chan == NULL) {
1280 		pcb->so->so_error = EIO;
1281 		return (0);
1282 	}
1283 
1284 	/* Still have data not read yet on current packet */
1285 	if (pcb->recv_data_len > 0)
1286 		return (pcb->recv_data_len);
1287 
1288 	if (pcb->rb_init)
1289 		advance =
1290 		    VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1291 	else
1292 		advance = 0;
1293 
1294 	bytes_canread = vmbus_chan_read_available(pcb->chan);
1295 
1296 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1297 	    "%s: bytes_canread on br = %u, advance = %u\n",
1298 	    __func__, bytes_canread, advance);
1299 
1300 	if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) {
1301 		/*
1302 		 * Nothing to read. Need to advance the rindex before
1303 		 * calling sbwait, so host knows to wake us up when data
1304 		 * is available to read on rb.
1305 		 */
1306 		error = vmbus_chan_recv_idxadv(pcb->chan, advance);
1307 		if (error) {
1308 			HVSOCK_DBG(HVSOCK_DBG_ERR,
1309 			    "%s: after calling vmbus_chan_recv_idxadv, "
1310 			    "got error = %d\n",  __func__, error);
1311 			return (0);
1312 		} else {
1313 			pcb->rb_init = false;
1314 			pcb->recv_data_len = 0;
1315 			pcb->recv_data_off = 0;
1316 			bytes_canread = vmbus_chan_read_available(pcb->chan);
1317 
1318 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1319 			    "%s: advanced %u bytes, "
1320 			    " bytes_canread on br now = %u\n",
1321 			    __func__, advance, bytes_canread);
1322 
1323 			if (bytes_canread == 0)
1324 				return (0);
1325 			else
1326 				advance = 0;
1327 		}
1328 	}
1329 
1330 	if (bytes_canread <
1331 	    advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t)))
1332 		return (0);
1333 
1334 	error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt,
1335 	    sizeof(struct hvs_pkt_header), advance);
1336 
1337 	/* Don't have anything to read */
1338 	if (error) {
1339 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1340 		    "%s: after calling vmbus_chan_recv_peek, got error = %d\n",
1341 		    __func__, error);
1342 		return (0);
1343 	}
1344 
1345 	/*
1346 	 * We just read in a new packet header. Do some sanity checks.
1347 	 */
1348 	tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1349 	hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen);
1350 	dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size;
1351 	if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) ||
1352 	    __predict_false(hlen > tlen) ||
1353 	    __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) {
1354 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1355 		    "invalid tlen(%u), hlen(%u) or dlen(%u)\n",
1356 		    tlen, hlen, dlen);
1357 		pcb->so->so_error = EIO;
1358 		return (0);
1359 	}
1360 	if (pcb->rb_init == false)
1361 		pcb->rb_init = true;
1362 
1363 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1364 	    "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n",
1365 	    tlen, hlen, dlen);
1366 
1367 	/* The other side has sent a close FIN */
1368 	if (dlen == 0) {
1369 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1370 		    "%s: Received FIN from other side\n", __func__);
1371 		/* inform the caller by seting so_error to ESHUTDOWN */
1372 		pcb->so->so_error = ESHUTDOWN;
1373 	}
1374 
1375 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1376 	    "%s: canread on receive ring is %u \n", __func__, dlen);
1377 
1378 	pcb->recv_data_len = dlen;
1379 	pcb->recv_data_off = 0;
1380 
1381 	return (pcb->recv_data_len);
1382 }
1383 
1384 static uint32_t
1385 hvsock_canwrite_check(struct hvs_pcb *pcb)
1386 {
1387 	uint32_t writeable;
1388 	uint32_t ret;
1389 
1390 	if (pcb == NULL || pcb->chan == NULL)
1391 		return (0);
1392 
1393 	writeable = vmbus_chan_write_available(pcb->chan);
1394 
1395 	/*
1396 	 * We must always reserve a 0-length-payload packet for the FIN.
1397 	 */
1398 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1399 	    "%s: writeable is %u, should be greater than %ju\n",
1400 	    __func__, writeable,
1401 	    (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)));
1402 
1403 	if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) {
1404 		/*
1405 		 * The Tx ring seems full.
1406 		 */
1407 		return (0);
1408 	}
1409 
1410 	ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0);
1411 
1412 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1413 	    "%s: available size is %u\n", __func__, rounddown2(ret, 8));
1414 
1415 	return (rounddown2(ret, 8));
1416 }
1417 
1418 static void
1419 hvsock_set_chan_pending_send_size(struct vmbus_channel *chan)
1420 {
1421 	vmbus_chan_set_pending_send_size(chan,
1422 	    HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ));
1423 }
1424 
1425 static int
1426 hvsock_open_channel(struct vmbus_channel *chan, struct socket *so)
1427 {
1428 	unsigned int rcvbuf, sndbuf;
1429 	struct hvs_pcb *pcb = so2hvspcb(so);
1430 	int ret;
1431 
1432 	if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) {
1433 		sndbuf = HVS_RINGBUF_SND_SIZE;
1434 		rcvbuf = HVS_RINGBUF_RCV_SIZE;
1435 	} else {
1436 		sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE);
1437 		sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE);
1438 		sndbuf = rounddown2(sndbuf, PAGE_SIZE);
1439 		rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE);
1440 		rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE);
1441 		rcvbuf = rounddown2(rcvbuf, PAGE_SIZE);
1442 	}
1443 
1444 	/*
1445 	 * Can only read whatever user provided size of data
1446 	 * from ring buffer. Turn off batched reading.
1447 	 */
1448 	vmbus_chan_set_readbatch(chan, false);
1449 
1450 	ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0,
1451 	    hvsock_chan_cb, pcb);
1452 
1453 	if (ret != 0) {
1454 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1455 		    "%s: failed to open hvsock channel, sndbuf = %u, "
1456 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1457 	} else {
1458 		HVSOCK_DBG(HVSOCK_DBG_INFO,
1459 		    "%s: hvsock channel opened, sndbuf = %u, i"
1460 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1461 		/*
1462 		 * Se the pending send size so to receive wakeup
1463 		 * signals from host when there is enough space on
1464 		 * rx buffer ring to write.
1465 		 */
1466 		hvsock_set_chan_pending_send_size(chan);
1467 	}
1468 
1469 	return ret;
1470 }
1471 
1472 /*
1473  * Guest is listening passively on the socket. Open channel and
1474  * create a new socket for the conneciton.
1475  */
1476 static void
1477 hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so,
1478     struct hvsock_sc *sc)
1479 {
1480 	struct socket *new_so;
1481 	struct hvs_pcb *new_pcb, *pcb;
1482 	int error;
1483 
1484 	/* Do nothing if socket is not listening */
1485 	if ((so->so_options & SO_ACCEPTCONN) == 0) {
1486 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1487 		    "%s: socket is not a listening one\n", __func__);
1488 		return;
1489 	}
1490 
1491 	/*
1492 	 * Create a new socket. This will call pru_attach to complete
1493 	 * the socket initialization and put the new socket onto
1494 	 * listening socket's sol_incomp list, waiting to be promoted
1495 	 * to sol_comp list.
1496 	 * The new socket created has ref count 0. There is no other
1497 	 * thread that changes the state of this new one at the
1498 	 * moment, so we don't need to hold its lock while opening
1499 	 * channel and filling out its pcb information.
1500 	 */
1501 	new_so = sonewconn(so, 0);
1502 	if (!new_so)
1503 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1504 		    "%s: creating new socket failed\n", __func__);
1505 
1506 	/*
1507 	 * Now open the vmbus channel. If it fails, the socket will be
1508 	 * on the listening socket's sol_incomp queue until it is
1509 	 * replaced and aborted.
1510 	 */
1511 	error = hvsock_open_channel(chan, new_so);
1512 	if (error) {
1513 		new_so->so_error = error;
1514 		return;
1515 	}
1516 
1517 	pcb = so->so_pcb;
1518 	new_pcb = new_so->so_pcb;
1519 
1520 	hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port);
1521 	/* Remote port is unknown to guest in this type of conneciton */
1522 	hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN);
1523 	new_pcb->chan = chan;
1524 	new_pcb->recv_data_len = 0;
1525 	new_pcb->recv_data_off = 0;
1526 	new_pcb->rb_init = false;
1527 
1528 	new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan);
1529 	new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan);
1530 
1531 	hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED);
1532 
1533 	sc->pcb = new_pcb;
1534 
1535 	/*
1536 	 * Change the socket state to SS_ISCONNECTED. This will promote
1537 	 * the socket to sol_comp queue and wake up the thread which
1538 	 * is accepting connection.
1539 	 */
1540 	soisconnected(new_so);
1541 }
1542 
1543 
1544 /*
1545  * Guest is actively connecting to host.
1546  */
1547 static void
1548 hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so)
1549 {
1550 	struct hvs_pcb *pcb;
1551 	int error;
1552 
1553 	error = hvsock_open_channel(chan, so);
1554 	if (error) {
1555 		so->so_error = error;
1556 		return;
1557 	}
1558 
1559 	pcb = so->so_pcb;
1560 	pcb->chan = chan;
1561 	pcb->recv_data_len = 0;
1562 	pcb->recv_data_off = 0;
1563 	pcb->rb_init = false;
1564 
1565 	mtx_lock(&hvs_trans_socks_mtx);
1566 	__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
1567 	__hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED);
1568 	mtx_unlock(&hvs_trans_socks_mtx);
1569 
1570 	/*
1571 	 * Change the socket state to SS_ISCONNECTED. This will wake up
1572 	 * the thread sleeping in connect call.
1573 	 */
1574 	soisconnected(so);
1575 }
1576 
1577 static void
1578 hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc)
1579 {
1580 	struct hyperv_guid *inst_guid, *type_guid;
1581 	bool conn_from_host;
1582 	struct sockaddr_hvs addr;
1583 	struct socket *so;
1584 	struct hvs_pcb *pcb;
1585 
1586 	type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan);
1587 	inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan);
1588 	conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan);
1589 
1590 	HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is ");
1591 	hvsock_print_guid(type_guid);
1592 	HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is ");
1593 	hvsock_print_guid(inst_guid);
1594 	HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n",
1595 	    (conn_from_host == true ) ? "from" : "to");
1596 
1597 	/*
1598 	 * The listening port should be in [0, MAX_LISTEN_PORT]
1599 	 */
1600 	if (!is_valid_srv_id(type_guid))
1601 		return;
1602 
1603 	/*
1604 	 * There should be a bound socket already created no matter
1605 	 * it is a passive or active connection.
1606 	 * For host initiated connection (passive on guest side),
1607 	 * the  type_guid contains the port which guest is bound and
1608 	 * listening.
1609 	 * For the guest initiated connection (active on guest side),
1610 	 * the inst_guid contains the port that guest has auto bound
1611 	 * to.
1612 	 */
1613 	hvs_addr_init(&addr, conn_from_host ? type_guid : inst_guid);
1614 	so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND);
1615 	if (!so) {
1616 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1617 		    "%s: no bound socket found for port %u\n",
1618 		    __func__, addr.hvs_port);
1619 		return;
1620 	}
1621 
1622 	if (conn_from_host) {
1623 		hvsock_open_conn_passive(chan, so, sc);
1624 	} else {
1625 		(void) hvs_trans_lock();
1626 		pcb = so->so_pcb;
1627 		if (pcb && pcb->so) {
1628 			sc->pcb = so2hvspcb(so);
1629 			hvsock_open_conn_active(chan, so);
1630 		} else {
1631 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1632 			    "%s: channel detached before open\n", __func__);
1633 		}
1634 		hvs_trans_unlock();
1635 	}
1636 
1637 }
1638 
1639 static int
1640 hvsock_probe(device_t dev)
1641 {
1642 	struct vmbus_channel *channel = vmbus_get_channel(dev);
1643 
1644 	if (!channel || !vmbus_chan_is_hvs(channel)) {
1645 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1646 		    "hvsock_probe called but not a hvsock channel id %u\n",
1647 		    vmbus_chan_id(channel));
1648 
1649 		return ENXIO;
1650 	} else {
1651 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1652 		    "hvsock_probe got a hvsock channel id %u\n",
1653 		    vmbus_chan_id(channel));
1654 
1655 		return BUS_PROBE_DEFAULT;
1656 	}
1657 }
1658 
1659 static int
1660 hvsock_attach(device_t dev)
1661 {
1662 	struct vmbus_channel *channel = vmbus_get_channel(dev);
1663 	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1664 
1665 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n");
1666 
1667 	hvsock_open_connection(channel, sc);
1668 
1669 	/*
1670 	 * Always return success. On error the host will rescind the device
1671 	 * in 30 seconds and we can do cleanup at that time in
1672 	 * vmbus_chan_msgproc_chrescind().
1673 	 */
1674 	return (0);
1675 }
1676 
1677 static int
1678 hvsock_detach(device_t dev)
1679 {
1680 	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1681 	struct socket *so;
1682 	int error, retry;
1683 
1684 	if (bootverbose)
1685 		device_printf(dev, "hvsock_detach called.\n");
1686 
1687 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n");
1688 
1689 	if (sc->pcb != NULL) {
1690 		(void) hvs_trans_lock();
1691 
1692 		so = hsvpcb2so(sc->pcb);
1693 		if (so) {
1694 			/* Close the connection */
1695 			if (so->so_state &
1696 			    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
1697 				soisdisconnected(so);
1698 		}
1699 
1700 		mtx_lock(&hvs_trans_socks_mtx);
1701 		__hvs_remove_pcb_from_list(sc->pcb,
1702 		    HVS_LIST_BOUND | HVS_LIST_CONNECTED);
1703 		mtx_unlock(&hvs_trans_socks_mtx);
1704 
1705 		/*
1706 		 * Close channel while no reader and sender are working
1707 		 * on the buffer rings.
1708 		 */
1709 		if (so) {
1710 			retry = 0;
1711 			while ((error = sblock(&so->so_rcv, 0)) ==
1712 			    EWOULDBLOCK) {
1713 				/*
1714 				 * Someone is reading, rx br is busy
1715 				 */
1716 				soisdisconnected(so);
1717 				DELAY(500);
1718 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1719 				    "waiting for rx reader to exit, "
1720 				    "retry = %d\n", retry++);
1721 			}
1722 			retry = 0;
1723 			while ((error = sblock(&so->so_snd, 0)) ==
1724 			    EWOULDBLOCK) {
1725 				/*
1726 				 * Someone is sending, tx br is busy
1727 				 */
1728 				soisdisconnected(so);
1729 				DELAY(500);
1730 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1731 				    "waiting for tx sender to exit, "
1732 				    "retry = %d\n", retry++);
1733 			}
1734 		}
1735 
1736 
1737 		bzero(sc->pcb, sizeof(struct hvs_pcb));
1738 		free(sc->pcb, M_HVSOCK);
1739 		sc->pcb = NULL;
1740 
1741 		if (so) {
1742 			sbunlock(&so->so_rcv);
1743 			sbunlock(&so->so_snd);
1744 			so->so_pcb = NULL;
1745 		}
1746 
1747 		hvs_trans_unlock();
1748 	}
1749 
1750 	vmbus_chan_close(vmbus_get_channel(dev));
1751 
1752 	return (0);
1753 }
1754 
/*
 * Device method table: wires the probe/attach/detach entry points of this
 * driver to hvsock_probe(), hvsock_attach() and hvsock_detach().
 */
1755 static device_method_t hvsock_methods[] = {
1756 	/* Device interface */
1757 	DEVMETHOD(device_probe, hvsock_probe),
1758 	DEVMETHOD(device_attach, hvsock_attach),
1759 	DEVMETHOD(device_detach, hvsock_detach),
1760 	DEVMETHOD_END
1761 };
1762 
/*
 * Driver description: driver name, method table, and the size of the
 * per-device softc (struct hvsock_sc).
 */
1763 static driver_t hvsock_driver = {
1764 	"hv_sock",
1765 	hvsock_methods,
1766 	sizeof(struct hvsock_sc)
1767 };
1768 
/* Device class storage handed to DRIVER_MODULE() below. */
1769 static devclass_t hvsock_devclass;
1770 
/*
 * Register hvsock_driver as module "hvsock" on the vmbus bus (no module
 * event handler), declare module version 1 and a dependency on vmbus
 * version 1.
 */
1771 DRIVER_MODULE(hvsock, vmbus, hvsock_driver, hvsock_devclass, NULL, NULL);
1772 MODULE_VERSION(hvsock, 1);
1773 MODULE_DEPEND(hvsock, vmbus, 1, 1, 1);
1774