xref: /freebsd/sys/dev/hyperv/hvsock/hv_sock.c (revision 22cf89c938886d14f5796fc49f9f020c23ea8eaf)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2020 Microsoft Corp.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/bus.h>
32 #include <sys/domain.h>
33 #include <sys/lock.h>
34 #include <sys/kernel.h>
35 #include <sys/types.h>
36 #include <sys/malloc.h>
37 #include <sys/module.h>
38 #include <sys/mutex.h>
39 #include <sys/proc.h>
40 #include <sys/protosw.h>
41 #include <sys/socket.h>
42 #include <sys/sysctl.h>
43 #include <sys/sysproto.h>
44 #include <sys/systm.h>
45 #include <sys/sockbuf.h>
46 #include <sys/sx.h>
47 #include <sys/uio.h>
48 
49 #include <net/vnet.h>
50 
51 #include <dev/hyperv/vmbus/vmbus_reg.h>
52 
53 #include "hv_sock.h"
54 
/*
 * Debug verbosity levels.  HVSOCK_DBG() compares the level passed to it
 * against the hvs_dbg_level variable and prints when
 * hvs_dbg_level >= level.
 */
#define HVSOCK_DBG_NONE			0x0
#define HVSOCK_DBG_INFO			0x1
#define HVSOCK_DBG_ERR			0x2
#define HVSOCK_DBG_VERBOSE		0x3


/* Parent sysctl node under which the hvsock knobs are registered. */
SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket");

/* Current debug level; exposed through the sysctl declared just below. */
static int hvs_dbg_level;
SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level,
    0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose");


/*
 * Emit a printf-style message when the configured debug level is at
 * least 'level'.  The variadic arguments sit inside the conditional, so
 * they are only evaluated when the message is actually printed.
 */
#define HVSOCK_DBG(level, ...) do {					\
	if (hvs_dbg_level >= (level))					\
		printf(__VA_ARGS__);					\
	} while (0)
72 
/* Malloc type used in this file for struct hvs_pcb allocations. */
MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures");

/* Domain probe hook; defined further down, referenced by hv_socket_domain. */
static int hvs_dom_probe(void);

/* The MTU is 16KB per host side's design */
#define HVSOCK_MTU_SIZE		(1024 * 16)
/* Upper bound applied to each chunk handed to hvsock_send_data(). */
#define HVSOCK_SEND_BUF_SZ	(PAGE_SIZE - sizeof(struct vmpipe_proto_header))

/* Size of the per-packet header (struct hvs_pkt_header). */
#define HVSOCK_HEADER_LEN	(sizeof(struct hvs_pkt_header))

/*
 * Total length accounted for a packet carrying payload_len bytes:
 * the header, the payload rounded up to a multiple of 8, and one
 * additional 64-bit word.
 */
#define HVSOCK_PKT_LEN(payload_len)	(HVSOCK_HEADER_LEN + \
					 roundup2(payload_len, 8) + \
					 sizeof(uint64_t))
86 
/*
 * HyperV Transport sockets
 *
 * Protocol switch for the single SOCK_STREAM, connection-required
 * protocol of this domain.  Every socket-layer operation listed here is
 * routed to the matching hvs_trans_* handler.
 */
static struct protosw hv_socket_protosw = {
	.pr_type =		SOCK_STREAM,
	.pr_protocol =		HYPERV_SOCK_PROTO_TRANS,
	.pr_flags =		PR_CONNREQUIRED,
	.pr_attach =		hvs_trans_attach,
	.pr_bind =		hvs_trans_bind,
	.pr_listen =		hvs_trans_listen,
	.pr_accept =		hvs_trans_accept,
	.pr_connect =		hvs_trans_connect,
	.pr_peeraddr =		hvs_trans_peeraddr,
	.pr_sockaddr =		hvs_trans_sockaddr,
	.pr_soreceive =		hvs_trans_soreceive,
	.pr_sosend =		hvs_trans_sosend,
	.pr_disconnect =	hvs_trans_disconnect,
	.pr_close =		hvs_trans_close,
	.pr_detach =		hvs_trans_detach,
	.pr_shutdown =		hvs_trans_shutdown,
	.pr_abort =		hvs_trans_abort,
};
109 
/*
 * AF_HYPERV domain descriptor.  It carries exactly one protocol
 * (hv_socket_protosw) and uses hvs_dom_probe() as its probe hook.
 */
static struct domain		hv_socket_domain = {
	.dom_family =		AF_HYPERV,
	.dom_name =		"hyperv",
	.dom_probe =		hvs_dom_probe,
	.dom_nprotosw =		1,
	.dom_protosw =		{ &hv_socket_protosw },
};

/* Register hv_socket_domain with the kernel's domain framework. */
DOMAIN_SET(hv_socket_);
119 
/* Bounds used by the local-port auto-bind scan in hvs_trans_connect(). */
#define MAX_PORT			((uint32_t)0xFFFFFFFF)
#define MIN_PORT			((uint32_t)0x0)

/*
 * Service-ID template.  The first four bytes are the port slot (read and
 * written by get_port_by_srv_id()/set_port_by_srv_id()); the remaining
 * twelve bytes are the fixed suffix that is_valid_srv_id() compares.
 */
/* 00000000-facb-11e6-bd58-64006a7986d3 */
static const struct hyperv_guid srv_id_template = {
	.hv_guid = {
	    0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11,
	    0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 }
};
129 
/* Forward declarations of helpers that are used before their definitions. */
static int		hvsock_br_callback(void *, int, void *);
static uint32_t		hvsock_canread_check(struct hvs_pcb *);
static uint32_t		hvsock_canwrite_check(struct hvs_pcb *);
static int		hvsock_send_data(struct vmbus_channel *chan,
    struct uio *uio, uint32_t to_write, struct sockbuf *sb);



/* Globals */
/* Exclusive lock taken by hvs_trans_lock()/released by hvs_trans_unlock(). */
static struct sx		hvs_trans_socks_sx;
/* Mutex held around accesses to the bound/connected lists below. */
static struct mtx		hvs_trans_socks_mtx;
/* PCBs linked through bound_next. */
static LIST_HEAD(, hvs_pcb)	hvs_trans_bound_socks;
/* PCBs linked through connected_next. */
static LIST_HEAD(, hvs_pcb)	hvs_trans_connected_socks;
/* Last port handed out by the auto-bind scan in hvs_trans_connect(). */
static uint32_t			previous_auto_bound_port;
144 
145 static void
146 hvsock_print_guid(struct hyperv_guid *guid)
147 {
148 	unsigned char *p = (unsigned char *)guid;
149 
150 	HVSOCK_DBG(HVSOCK_DBG_INFO,
151 	    "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n",
152 	    *(unsigned int *)p,
153 	    *((unsigned short *) &p[4]),
154 	    *((unsigned short *) &p[6]),
155 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
156 }
157 
158 static bool
159 is_valid_srv_id(const struct hyperv_guid *id)
160 {
161 	return !memcmp(&id->hv_guid[4],
162 	    &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4);
163 }
164 
/*
 * Extract the port stored in the leading bytes of a service ID.
 *
 * The value is copied out with memcpy() instead of dereferencing the
 * GUID through an 'unsigned int *' cast, so the read does not depend on
 * the GUID being suitably aligned for an integer load.
 *
 * Returns the first sizeof(unsigned int) bytes of *srv_id, interpreted
 * in host byte order.
 */
static unsigned int
get_port_by_srv_id(const struct hyperv_guid *srv_id)
{
	unsigned int port;

	memcpy(&port, srv_id, sizeof(port));
	return (port);
}
170 
/*
 * Store 'port' into the leading bytes of a service ID, leaving the rest
 * of the GUID untouched.
 *
 * The value is written with memcpy() instead of through an
 * 'unsigned int *' cast, so the store does not depend on the GUID being
 * suitably aligned for an integer access.
 */
static void
set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port)
{
	memcpy(srv_id, &port, sizeof(port));
}
176 
177 
178 static void
179 __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list)
180 {
181 	struct hvs_pcb *p = NULL;
182 
183 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
184 
185 	if (!pcb)
186 		return;
187 
188 	if (list & HVS_LIST_BOUND) {
189 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
190 			if  (p == pcb)
191 				LIST_REMOVE(p, bound_next);
192 	}
193 
194 	if (list & HVS_LIST_CONNECTED) {
195 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
196 			if (p == pcb)
197 				LIST_REMOVE(pcb, connected_next);
198 	}
199 }
200 
201 static void
202 __hvs_remove_socket_from_list(struct socket *so, unsigned char list)
203 {
204 	struct hvs_pcb *pcb = so2hvspcb(so);
205 
206 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
207 
208 	__hvs_remove_pcb_from_list(pcb, list);
209 }
210 
211 static void
212 __hvs_insert_socket_on_list(struct socket *so, unsigned char list)
213 {
214 	struct hvs_pcb *pcb = so2hvspcb(so);
215 
216 	if (list & HVS_LIST_BOUND)
217 		LIST_INSERT_HEAD(&hvs_trans_bound_socks,
218 		   pcb, bound_next);
219 
220 	if (list & HVS_LIST_CONNECTED)
221 		LIST_INSERT_HEAD(&hvs_trans_connected_socks,
222 		   pcb, connected_next);
223 }
224 
225 void
226 hvs_remove_socket_from_list(struct socket *so, unsigned char list)
227 {
228 	if (!so || !so->so_pcb) {
229 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
230 		    "%s: socket or so_pcb is null\n", __func__);
231 		return;
232 	}
233 
234 	mtx_lock(&hvs_trans_socks_mtx);
235 	__hvs_remove_socket_from_list(so, list);
236 	mtx_unlock(&hvs_trans_socks_mtx);
237 }
238 
239 static void
240 hvs_insert_socket_on_list(struct socket *so, unsigned char list)
241 {
242 	if (!so || !so->so_pcb) {
243 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
244 		    "%s: socket or so_pcb is null\n", __func__);
245 		return;
246 	}
247 
248 	mtx_lock(&hvs_trans_socks_mtx);
249 	__hvs_insert_socket_on_list(so, list);
250 	mtx_unlock(&hvs_trans_socks_mtx);
251 }
252 
253 static struct socket *
254 __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
255 {
256 	struct hvs_pcb *p = NULL;
257 
258 	if (list & HVS_LIST_BOUND)
259 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
260 			if (p->so != NULL &&
261 			    addr->hvs_port == p->local_addr.hvs_port)
262 				return p->so;
263 
264 	if (list & HVS_LIST_CONNECTED)
265 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
266 			if (p->so != NULL &&
267 			    addr->hvs_port == p->local_addr.hvs_port)
268 				return p->so;
269 
270 	return NULL;
271 }
272 
273 static struct socket *
274 hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
275 {
276 	struct socket *s = NULL;
277 
278 	mtx_lock(&hvs_trans_socks_mtx);
279 	s = __hvs_find_socket_on_list(addr, list);
280 	mtx_unlock(&hvs_trans_socks_mtx);
281 
282 	return s;
283 }
284 
285 static inline void
286 hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port)
287 {
288 	memset(addr, 0, sizeof(*addr));
289 	addr->sa_family = AF_HYPERV;
290 	addr->sa_len = sizeof(*addr);
291 	addr->hvs_port = port;
292 }
293 
/*
 * Initialise *addr as an AF_HYPERV address whose port is the one
 * encoded in the service ID 'svr_id'.
 */
void
hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id)
{
	unsigned int port = get_port_by_srv_id(svr_id);

	hvs_addr_set(addr, port);
}
299 
300 int
301 hvs_trans_lock(void)
302 {
303 	sx_xlock(&hvs_trans_socks_sx);
304 	return (0);
305 }
306 
307 void
308 hvs_trans_unlock(void)
309 {
310 	sx_xunlock(&hvs_trans_socks_sx);
311 }
312 
313 static int
314 hvs_dom_probe(void)
315 {
316 
317 	/* Don't even give us a chance to attach on non-HyperV. */
318 	if (vm_guest != VM_GUEST_HV)
319 		return (ENXIO);
320 	return (0);
321 }
322 
323 static void
324 hvs_trans_init(void *arg __unused)
325 {
326 
327 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
328 	    "%s: HyperV Socket hvs_trans_init called\n", __func__);
329 
330 	/* Initialize Globals */
331 	previous_auto_bound_port = MAX_PORT;
332 	sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx");
333 	mtx_init(&hvs_trans_socks_mtx,
334 	    "hvs_trans_socks_mtx", NULL, MTX_DEF);
335 	LIST_INIT(&hvs_trans_bound_socks);
336 	LIST_INIT(&hvs_trans_connected_socks);
337 }
338 SYSINIT(hvs_trans_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
339     hvs_trans_init, NULL);
340 
341 /*
342  * Called in two cases:
343  * 1) When user calls socket();
344  * 2) When we accept new incoming conneciton and call sonewconn().
345  */
346 int
347 hvs_trans_attach(struct socket *so, int proto, struct thread *td)
348 {
349 	struct hvs_pcb *pcb = so2hvspcb(so);
350 
351 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
352 	    "%s: HyperV Socket hvs_trans_attach called\n", __func__);
353 
354 	if (so->so_type != SOCK_STREAM)
355 		return (ESOCKTNOSUPPORT);
356 
357 	if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS)
358 		return (EPROTONOSUPPORT);
359 
360 	if (pcb != NULL)
361 		return (EISCONN);
362 	pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO);
363 	if (pcb == NULL)
364 		return (ENOMEM);
365 
366 	pcb->so = so;
367 	so->so_pcb = (void *)pcb;
368 
369 	return (0);
370 }
371 
372 void
373 hvs_trans_detach(struct socket *so)
374 {
375 	struct hvs_pcb *pcb;
376 
377 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
378 	    "%s: HyperV Socket hvs_trans_detach called\n", __func__);
379 
380 	(void) hvs_trans_lock();
381 	pcb = so2hvspcb(so);
382 	if (pcb == NULL) {
383 		hvs_trans_unlock();
384 		return;
385 	}
386 
387 	if (SOLISTENING(so)) {
388 		bzero(pcb, sizeof(*pcb));
389 		free(pcb, M_HVSOCK);
390 	}
391 
392 	so->so_pcb = NULL;
393 
394 	hvs_trans_unlock();
395 }
396 
397 int
398 hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td)
399 {
400 	struct hvs_pcb *pcb = so2hvspcb(so);
401 	struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr;
402 	int error = 0;
403 
404 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
405 	    "%s: HyperV Socket hvs_trans_bind called\n", __func__);
406 
407 	if (sa == NULL) {
408 		return (EINVAL);
409 	}
410 
411 	if (pcb == NULL) {
412 		return (EINVAL);
413 	}
414 
415 	if (sa->sa_family != AF_HYPERV) {
416 		HVSOCK_DBG(HVSOCK_DBG_ERR,
417 		    "%s: Not supported, sa_family is %u\n",
418 		    __func__, sa->sa_family);
419 		return (EAFNOSUPPORT);
420 	}
421 	if (sa->sa_len != sizeof(*sa)) {
422 		HVSOCK_DBG(HVSOCK_DBG_ERR,
423 		    "%s: Not supported, sa_len is %u\n",
424 		    __func__, sa->sa_len);
425 		return (EINVAL);
426 	}
427 
428 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
429 	    "%s: binding port = 0x%x\n", __func__, sa->hvs_port);
430 
431 	mtx_lock(&hvs_trans_socks_mtx);
432 	if (__hvs_find_socket_on_list(sa,
433 	    HVS_LIST_BOUND | HVS_LIST_CONNECTED)) {
434 		error = EADDRINUSE;
435 	} else {
436 		/*
437 		 * The address is available for us to bind.
438 		 * Add socket to the bound list.
439 		 */
440 		hvs_addr_set(&pcb->local_addr, sa->hvs_port);
441 		hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY);
442 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
443 	}
444 	mtx_unlock(&hvs_trans_socks_mtx);
445 
446 	return (error);
447 }
448 
449 int
450 hvs_trans_listen(struct socket *so, int backlog, struct thread *td)
451 {
452 	struct hvs_pcb *pcb = so2hvspcb(so);
453 	struct socket *bound_so;
454 	int error;
455 
456 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
457 	    "%s: HyperV Socket hvs_trans_listen called\n", __func__);
458 
459 	if (pcb == NULL)
460 		return (EINVAL);
461 
462 	/* Check if the address is already bound and it was by us. */
463 	bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND);
464 	if (bound_so == NULL || bound_so != so) {
465 		HVSOCK_DBG(HVSOCK_DBG_ERR,
466 		    "%s: Address not bound or not by us.\n", __func__);
467 		return (EADDRNOTAVAIL);
468 	}
469 
470 	SOCK_LOCK(so);
471 	error = solisten_proto_check(so);
472 	if (error == 0)
473 		solisten_proto(so, backlog);
474 	SOCK_UNLOCK(so);
475 
476 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
477 	    "%s: HyperV Socket listen error = %d\n", __func__, error);
478 	return (error);
479 }
480 
481 int
482 hvs_trans_accept(struct socket *so, struct sockaddr **nam)
483 {
484 	struct hvs_pcb *pcb = so2hvspcb(so);
485 
486 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
487 	    "%s: HyperV Socket hvs_trans_accept called\n", __func__);
488 
489 	if (pcb == NULL)
490 		return (EINVAL);
491 
492 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr,
493 	    M_NOWAIT);
494 
495 	return ((*nam == NULL) ? ENOMEM : 0);
496 }
497 
498 int
499 hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
500 {
501 	struct hvs_pcb *pcb = so2hvspcb(so);
502 	struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam;
503 	bool found_auto_bound_port = false;
504 	int i, error = 0;
505 
506 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
507 	    "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n",
508 	    __func__, raddr->hvs_port);
509 
510 	if (pcb == NULL)
511 		return (EINVAL);
512 
513 	/* Verify the remote address */
514 	if (raddr == NULL)
515 		return (EINVAL);
516 	if (raddr->sa_family != AF_HYPERV)
517 		return (EAFNOSUPPORT);
518 	if (raddr->sa_len != sizeof(*raddr))
519 		return (EINVAL);
520 
521 	mtx_lock(&hvs_trans_socks_mtx);
522 	if (so->so_state &
523 	    (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) {
524 			HVSOCK_DBG(HVSOCK_DBG_ERR,
525 			    "%s: socket connect in progress\n",
526 			    __func__);
527 			error = EINPROGRESS;
528 			goto out;
529 	}
530 
531 	/*
532 	 * Find an available port for us to auto bind the local
533 	 * address.
534 	 */
535 	hvs_addr_set(&pcb->local_addr, 0);
536 
537 	for (i = previous_auto_bound_port - 1;
538 	    i != previous_auto_bound_port; i --) {
539 		if (i == MIN_PORT)
540 			i = MAX_PORT;
541 
542 		pcb->local_addr.hvs_port = i;
543 
544 		if (__hvs_find_socket_on_list(&pcb->local_addr,
545 		    HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) {
546 			found_auto_bound_port = true;
547 			previous_auto_bound_port = i;
548 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
549 			    "%s: found local bound port is %x\n",
550 			    __func__, pcb->local_addr.hvs_port);
551 			break;
552 		}
553 	}
554 
555 	if (found_auto_bound_port == true) {
556 		/* Found available port for auto bound, put on list */
557 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
558 		/* Set VM service ID */
559 		pcb->vm_srv_id = srv_id_template;
560 		set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port);
561 		/* Set host service ID and remote port */
562 		pcb->host_srv_id = srv_id_template;
563 		set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port);
564 		hvs_addr_set(&pcb->remote_addr, raddr->hvs_port);
565 
566 		/* Change the socket state to SS_ISCONNECTING */
567 		soisconnecting(so);
568 	} else {
569 		HVSOCK_DBG(HVSOCK_DBG_ERR,
570 		    "%s: No local port available for auto bound\n",
571 		    __func__);
572 		error = EADDRINUSE;
573 	}
574 
575 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is ");
576 	hvsock_print_guid(&pcb->vm_srv_id);
577 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is ");
578 	hvsock_print_guid(&pcb->host_srv_id);
579 
580 out:
581 	mtx_unlock(&hvs_trans_socks_mtx);
582 
583 	if (found_auto_bound_port == true)
584 		 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id);
585 
586 	return (error);
587 }
588 
589 int
590 hvs_trans_disconnect(struct socket *so)
591 {
592 	struct hvs_pcb *pcb;
593 
594 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
595 	    "%s: HyperV Socket hvs_trans_disconnect called\n", __func__);
596 
597 	(void) hvs_trans_lock();
598 	pcb = so2hvspcb(so);
599 	if (pcb == NULL) {
600 		hvs_trans_unlock();
601 		return (EINVAL);
602 	}
603 
604 	/* If socket is already disconnected, skip this */
605 	if ((so->so_state & SS_ISDISCONNECTED) == 0)
606 		soisdisconnecting(so);
607 
608 	hvs_trans_unlock();
609 
610 	return (0);
611 }
612 
/*
 * Context handed to hvsock_br_callback() through its opaque argument.
 */
struct hvs_callback_arg {
	struct uio *uio;	/* passed to uiomove() by the callback */
	struct sockbuf *sb;	/* if non-NULL, unlocked around uiomove() */
};
617 
/*
 * Receive handler: copy data from the channel's receive ring into 'uio'.
 *
 * paddr, mp0 and controlp are not used by this implementation.
 * MSG_PEEK is rejected with EOPNOTSUPP.
 *
 * Returns EINVAL for a non-stream socket, a missing pcb, or a uio that
 * is empty or not a read; EPIPE when the receive side is already shut
 * and the socket is disconnected; EAGAIN for a non-blocking call that
 * read nothing; otherwise an error from the I/O lock, the ring read,
 * sbwait() or so_error (ESHUTDOWN itself is not returned), or 0.
 */
int
hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct sockbuf *sb;
	ssize_t orig_resid;
	uint32_t canread, to_read;
	int flags, error = 0;
	struct hvs_callback_arg cbarg;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_soreceive called\n", __func__);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (pcb == NULL)
		return (EINVAL);

	/* MSG_EOR is masked out of the caller-supplied flags. */
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if (flags & MSG_PEEK)
		return (EOPNOTSUPP);

	/* If no space to copy out anything */
	if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ)
		return (EINVAL);

	/* Remembered so we can tell later whether anything was copied. */
	orig_resid = uio->uio_resid;

	/* Prevent other readers from entering the socket. */
	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (error) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: soiolock returned error = %d\n", __func__, error);
		return (error);
	}

	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	cbarg.uio = uio;
	cbarg.sb = sb;
	/*
	 * If the socket is closing, there might still be some data
	 * in rx br to read. However we need to make sure
	 * the channel is still open.
	 */
	if ((sb->sb_state & SBS_CANTRCVMORE) &&
	    (so->so_state & SS_ISDISCONNECTED)) {
		/* Other thread already closed the channel */
		error = EPIPE;
		goto out;
	}

	while (true) {
		/* Drain whatever is readable right now into the uio. */
		while (uio->uio_resid > 0 &&
		    (canread = hvsock_canread_check(pcb)) > 0) {
			to_read = MIN(canread, uio->uio_resid);
			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
			    "%s: to_read = %u, skip = %u\n", __func__, to_read,
			    (unsigned int)(sizeof(struct hvs_pkt_header) +
			    pcb->recv_data_off));

			error = vmbus_chan_recv_peek_call(pcb->chan, to_read,
			    sizeof(struct hvs_pkt_header) + pcb->recv_data_off,
			    hvsock_br_callback, (void *)&cbarg);
			/*
			 * It is possible socket is disconnected because
			 * we released lock in hvsock_br_callback. So we
			 * need to check the state to make sure it is not
			 * disconnected.
			 */
			if (error || so->so_state & SS_ISDISCONNECTED) {
				break;
			}

			/* Account for the bytes just copied out. */
			pcb->recv_data_len -= to_read;
			pcb->recv_data_off += to_read;
		}

		if (error)
			break;

		/* Abort if socket has reported problems. */
		if (so->so_error) {
			if (so->so_error == ESHUTDOWN &&
			    orig_resid > uio->uio_resid) {
				/*
				 * Although we got a FIN, we also received
				 * some data in this round. Deliver it
				 * to user.
				 */
				error = 0;
			} else {
				if (so->so_error != ESHUTDOWN)
					error = so->so_error;
			}

			break;
		}

		/* Cannot receive more. */
		if (sb->sb_state & SBS_CANTRCVMORE)
			break;

		/* We are done if buffer has been filled */
		if (uio->uio_resid == 0)
			break;

		/* Without MSG_WAITALL a partial read is good enough. */
		if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid)
			break;

		/* Buffer ring is empty and we shall not block */
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			if (orig_resid == uio->uio_resid) {
				/* We have not read anything */
				error = EAGAIN;
			}
			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
			    "%s: non blocked read return, error %d.\n",
			    __func__, error);
			break;
		}

		/*
		 * Wait and block until (more) data comes in.
		 * Note: Drops the sockbuf lock during wait.
		 */
		error = sbwait(so, SO_RCV);

		if (error)
			break;

		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: wake up from sbwait, read available is %u\n",
		    __func__, vmbus_chan_read_available(pcb->chan));
	}

out:
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_RECV_UNLOCK(so);

	/* We received a FIN in this call */
	if (so->so_error == ESHUTDOWN) {
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			/* Send has already closed */
			soisdisconnecting(so);
		} else {
			/* Just close the receive side */
			socantrcvmore(so);
		}
	}

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: returning error = %d, so_error = %d\n",
	    __func__, error, so->so_error);

	return (error);
}
782 
/*
 * Send handler: push the contents of 'uio' to the channel in chunks.
 *
 * addr, top, controlp and td are not used by this implementation.
 * Each chunk is limited by the space reported by
 * hvsock_canwrite_check(), the bytes left in the uio, and
 * HVSOCK_SEND_BUF_SZ.
 *
 * Returns EINVAL for a non-stream socket, a missing pcb, or a uio that
 * is empty or not a write; EPIPE when the send side is shut or so_error
 * is ESHUTDOWN; EWOULDBLOCK for a non-blocking call that could not send
 * anything; otherwise an error from the I/O lock, sbwait() or
 * hvsock_send_data(), or 0.  When the ring fills after some data has
 * been sent, the loop stops and 0 is returned with the uio partially
 * consumed.
 */
int
hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct sockbuf *sb;
	ssize_t orig_resid;
	uint32_t canwrite, to_write;
	int error = 0;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n",
	    __func__, uio->uio_resid);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (pcb == NULL)
		return (EINVAL);

	/* If nothing to send */
	if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE)
		return (EINVAL);

	/* Remembered so we can tell later whether anything was sent. */
	orig_resid = uio->uio_resid;

	/* Prevent other writers from entering the socket. */
	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: soiolocak returned error = %d\n", __func__, error);
		return (error);
	}

	sb = &so->so_snd;
	SOCKBUF_LOCK(sb);

	if ((sb->sb_state & SBS_CANTSENDMORE) ||
	    so->so_error == ESHUTDOWN) {
		error = EPIPE;
		goto out;
	}

	while (uio->uio_resid > 0) {
		canwrite = hvsock_canwrite_check(pcb);
		if (canwrite == 0) {
			/* We have sent some data */
			if (orig_resid > uio->uio_resid)
				break;
			/*
			 * We have not sent any data and it is
			 * non-blocked io
			 */
			if (so->so_state & SS_NBIO ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				error = EWOULDBLOCK;
				break;
			} else {
				/*
				 * We are here because there is no space on
				 * send buffer ring. Signal the other side
				 * to read and free more space.
				 * Sleep wait until space available to send
				 * Note: Drops the sockbuf lock during wait.
				 */
				error = sbwait(so, SO_SND);

				if (error)
					break;

				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
				    "%s: wake up from sbwait, space avail on "
				    "tx ring is %u\n",
				    __func__,
				    vmbus_chan_write_available(pcb->chan));

				/* Re-evaluate the available space. */
				continue;
			}
		}
		/* Clamp the chunk to ring space, uio and send buffer size. */
		to_write = MIN(canwrite, uio->uio_resid);
		to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ);

		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: canwrite is %u, to_write = %u\n", __func__,
		    canwrite, to_write);
		error = hvsock_send_data(pcb->chan, uio, to_write, sb);

		if (error)
			break;
	}

out:
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_SEND_UNLOCK(so);

	return (error);
}
879 
880 int
881 hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam)
882 {
883 	struct hvs_pcb *pcb = so2hvspcb(so);
884 
885 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
886 	    "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__);
887 
888 	if (pcb == NULL)
889 		return (EINVAL);
890 
891 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT);
892 
893 	return ((*nam == NULL)? ENOMEM : 0);
894 }
895 
896 int
897 hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam)
898 {
899 	struct hvs_pcb *pcb = so2hvspcb(so);
900 
901 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
902 	    "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__);
903 
904 	if (pcb == NULL)
905 		return (EINVAL);
906 
907 	*nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT);
908 
909 	return ((*nam == NULL)? ENOMEM : 0);
910 }
911 
912 void
913 hvs_trans_close(struct socket *so)
914 {
915 	struct hvs_pcb *pcb;
916 
917 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
918 	    "%s: HyperV Socket hvs_trans_close called\n", __func__);
919 
920 	(void) hvs_trans_lock();
921 	pcb = so2hvspcb(so);
922 	if (!pcb) {
923 		hvs_trans_unlock();
924 		return;
925 	}
926 
927 	if (so->so_state & SS_ISCONNECTED) {
928 		/* Send a FIN to peer */
929 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
930 		    "%s: hvs_trans_close sending a FIN to host\n", __func__);
931 		(void) hvsock_send_data(pcb->chan, NULL, 0, NULL);
932 	}
933 
934 	if (so->so_state &
935 	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
936 		soisdisconnected(so);
937 
938 	pcb->chan = NULL;
939 	pcb->so = NULL;
940 
941 	if (SOLISTENING(so)) {
942 		mtx_lock(&hvs_trans_socks_mtx);
943 		/* Remove from bound list */
944 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
945 		mtx_unlock(&hvs_trans_socks_mtx);
946 	}
947 
948 	hvs_trans_unlock();
949 
950 	return;
951 }
952 
953 void
954 hvs_trans_abort(struct socket *so)
955 {
956 	struct hvs_pcb *pcb = so2hvspcb(so);
957 
958 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
959 	    "%s: HyperV Socket hvs_trans_abort called\n", __func__);
960 
961 	(void) hvs_trans_lock();
962 	if (pcb == NULL) {
963 		hvs_trans_unlock();
964 		return;
965 	}
966 
967 	if (SOLISTENING(so)) {
968 		mtx_lock(&hvs_trans_socks_mtx);
969 		/* Remove from bound list */
970 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
971 		mtx_unlock(&hvs_trans_socks_mtx);
972 	}
973 
974 	if (so->so_state & SS_ISCONNECTED) {
975 		(void) sodisconnect(so);
976 	}
977 	hvs_trans_unlock();
978 
979 	return;
980 }
981 
982 int
983 hvs_trans_shutdown(struct socket *so)
984 {
985 	struct hvs_pcb *pcb = so2hvspcb(so);
986 	struct sockbuf *sb;
987 
988 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
989 	    "%s: HyperV Socket hvs_trans_shutdown called\n", __func__);
990 
991 	if (pcb == NULL)
992 		return (EINVAL);
993 
994 	/*
995 	 * Only get called with the shutdown method is SHUT_WR or
996 	 * SHUT_RDWR.
997 	 * When the method is SHUT_RD or SHUT_RDWR, the caller
998 	 * already set the SBS_CANTRCVMORE on receive side socket
999 	 * buffer.
1000 	 */
1001 	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1002 		/*
1003 		 * SHUT_WR only case.
1004 		 * Receive side is still open. Just close
1005 		 * the send side.
1006 		 */
1007 		socantsendmore(so);
1008 	} else {
1009 		/* SHUT_RDWR case */
1010 		if (so->so_state & SS_ISCONNECTED) {
1011 			/* Send a FIN to peer */
1012 			sb = &so->so_snd;
1013 			SOCKBUF_LOCK(sb);
1014 			(void) hvsock_send_data(pcb->chan, NULL, 0, sb);
1015 			SOCKBUF_UNLOCK(sb);
1016 
1017 			soisdisconnecting(so);
1018 		}
1019 	}
1020 
1021 	return (0);
1022 }
1023 
1024 /* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is
1025  * <port> (see struct sockaddr_hvs).
1026  *
1027  * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV:
1028  * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-
1029  * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with
1030  * the below sockaddr:
1031  *
1032  * struct SOCKADDR_HV
1033  * {
1034  *    ADDRESS_FAMILY Family;
1035  *    USHORT Reserved;
1036  *    GUID VmId;
1037  *    GUID ServiceId;
1038  * };
1039  * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via
1040  * VMBus, because here it's obvious the host and the VM can easily identify
1041  * each other. Though the VmID is useful on the host, especially in the case
1042  * of Windows container, FreeBSD VM doesn't need it at all.
1043  *
1044  * To be compatible with similar infrastructure in Linux VMs, we have
1045  * to limit the available GUID space of SOCKADDR_HV so that we can create
1046  * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID.
1047  * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
1048  *
1049  ****************************************************************************
1050  * The only valid Service GUIDs, from the perspectives of both the host and *
1051  * FreeBSD VM, that can be connected by the other end, must conform to this *
1052  * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
1053  ****************************************************************************
1054  *
1055  * When we write apps on the host to connect(), the GUID ServiceID is used.
1056  * When we write apps in FreeBSD VM to connect(), we only need to specify the
1057  * port and the driver will form the GUID and use that to request the host.
1058  *
 * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
 * auto-generated remote port for a connect request initiated by the host's
 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
 * FreeBSD guest.
1063  */
1064 
/*
 * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
 * restrict the HyperV socket ring buffer size to six 4K pages. Newer
 * HyperV hosts don't have this limit.
 */
#define HVS_RINGBUF_RCV_SIZE	(PAGE_SIZE * 6)
#define HVS_RINGBUF_SND_SIZE	(PAGE_SIZE * 6)
#define HVS_RINGBUF_MAX_SIZE	(PAGE_SIZE * 64)
1073 
/*
 * Per-device state tying together a device, a pcb and a VMBus channel.
 * NOTE(review): the code that fills and uses this structure is outside
 * this view - confirm field ownership and lifetime there.
 */
struct hvsock_sc {
	device_t		dev;
	struct hvs_pcb		*pcb;
	struct vmbus_channel	*channel;
};
1079 
/*
 * Report whether the channel's receive ring currently holds at least
 * as many bytes as a packet with an empty payload (HVSOCK_PKT_LEN(0)).
 */
static bool
hvsock_chan_readable(struct vmbus_channel *chan)
{

	return (vmbus_chan_read_available(chan) >= HVSOCK_PKT_LEN(0));
}
1087 
1088 static void
1089 hvsock_chan_cb(struct vmbus_channel *chan, void *context)
1090 {
1091 	struct hvs_pcb *pcb = (struct hvs_pcb *) context;
1092 	struct socket *so;
1093 	uint32_t canwrite;
1094 
1095 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1096 	    "%s: host send us a wakeup on rb data, pcb = %p\n",
1097 	    __func__, pcb);
1098 
1099 	/*
1100 	 * Check if the socket is still attached and valid.
1101 	 * Here we know channel is still open. Need to make
1102 	 * sure the socket has not been closed or freed.
1103 	 */
1104 	(void) hvs_trans_lock();
1105 	so = hsvpcb2so(pcb);
1106 
1107 	if (pcb->chan != NULL && so != NULL) {
1108 		/*
1109 		 * Wake up reader if there are data to read.
1110 		 */
1111 		SOCKBUF_LOCK(&(so)->so_rcv);
1112 
1113 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1114 		    "%s: read available = %u\n", __func__,
1115 		    vmbus_chan_read_available(pcb->chan));
1116 
1117 		if (hvsock_chan_readable(pcb->chan))
1118 			sorwakeup_locked(so);
1119 		else
1120 			SOCKBUF_UNLOCK(&(so)->so_rcv);
1121 
1122 		/*
1123 		 * Wake up sender if space becomes available to write.
1124 		 */
1125 		SOCKBUF_LOCK(&(so)->so_snd);
1126 		canwrite = hvsock_canwrite_check(pcb);
1127 
1128 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1129 		    "%s: canwrite = %u\n", __func__, canwrite);
1130 
1131 		if (canwrite > 0) {
1132 			sowwakeup_locked(so);
1133 		} else {
1134 			SOCKBUF_UNLOCK(&(so)->so_snd);
1135 		}
1136 	}
1137 
1138 	hvs_trans_unlock();
1139 
1140 	return;
1141 }
1142 
1143 static int
1144 hvsock_br_callback(void *datap, int cplen, void *cbarg)
1145 {
1146 	struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg;
1147 	struct uio *uio = arg->uio;
1148 	struct sockbuf *sb = arg->sb;
1149 	int error = 0;
1150 
1151 	if (cbarg == NULL || datap == NULL)
1152 		return (EINVAL);
1153 
1154 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1155 	    "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, "
1156 	    "datap = %p\n",
1157 	    __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br",
1158 	    uio->uio_resid, cplen, datap);
1159 
1160 	if (sb)
1161 		SOCKBUF_UNLOCK(sb);
1162 
1163 	error = uiomove(datap, cplen, uio);
1164 
1165 	if (sb)
1166 		SOCKBUF_LOCK(sb);
1167 
1168 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1169 	    "%s: after uiomove, uio_resid = %zd, error = %d\n",
1170 	    __func__, uio->uio_resid, error);
1171 
1172 	return (error);
1173 }
1174 
1175 static int
1176 hvsock_send_data(struct vmbus_channel *chan, struct uio *uio,
1177     uint32_t to_write, struct sockbuf *sb)
1178 {
1179 	struct hvs_pkt_header hvs_pkt;
1180 	int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0;
1181 	uint64_t pad = 0;
1182 	struct iovec iov[3];
1183 	struct hvs_callback_arg cbarg;
1184 
1185 	if (chan == NULL)
1186 		return (ENOTCONN);
1187 
1188 	hlen = sizeof(struct vmbus_chanpkt_hdr);
1189 	hvs_pkthlen = sizeof(struct hvs_pkt_header);
1190 	hvs_pktlen = hvs_pkthlen + to_write;
1191 	pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen);
1192 
1193 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1194 	    "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, "
1195 	    "pad_pktlen = %u, data_len = %u\n",
1196 	    __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write);
1197 
1198 	hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND;
1199 	hvs_pkt.chan_pkt_hdr.cph_flags = 0;
1200 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen);
1201 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen);
1202 	hvs_pkt.chan_pkt_hdr.cph_xactid = 0;
1203 
1204 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1;
1205 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write;
1206 
1207 	cbarg.uio = uio;
1208 	cbarg.sb = sb;
1209 
1210 	if (uio && to_write > 0) {
1211 		iov[0].iov_base = &hvs_pkt;
1212 		iov[0].iov_len = hvs_pkthlen;
1213 		iov[1].iov_base = NULL;
1214 		iov[1].iov_len = to_write;
1215 		iov[2].iov_base = &pad;
1216 		iov[2].iov_len = pad_pktlen - hvs_pktlen;
1217 
1218 		error = vmbus_chan_iov_send(chan, iov, 3,
1219 		    hvsock_br_callback, &cbarg);
1220 	} else {
1221 		if (to_write == 0) {
1222 			iov[0].iov_base = &hvs_pkt;
1223 			iov[0].iov_len = hvs_pkthlen;
1224 			iov[1].iov_base = &pad;
1225 			iov[1].iov_len = pad_pktlen - hvs_pktlen;
1226 			error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL);
1227 		}
1228 	}
1229 
1230 	if (error) {
1231 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1232 		    "%s: error = %d\n", __func__, error);
1233 	}
1234 
1235 	return (error);
1236 }
1237 
1238 /*
1239  * Check if we have data on current ring buffer to read
1240  * or not. If not, advance the ring buffer read index to
1241  * next packet. Update the recev_data_len and recev_data_off
1242  * to new value.
1243  * Return the number of bytes can read.
1244  */
1245 static uint32_t
1246 hvsock_canread_check(struct hvs_pcb *pcb)
1247 {
1248 	uint32_t advance;
1249 	uint32_t tlen, hlen, dlen;
1250 	uint32_t bytes_canread = 0;
1251 	int error;
1252 
1253 	if (pcb == NULL || pcb->chan == NULL) {
1254 		pcb->so->so_error = EIO;
1255 		return (0);
1256 	}
1257 
1258 	/* Still have data not read yet on current packet */
1259 	if (pcb->recv_data_len > 0)
1260 		return (pcb->recv_data_len);
1261 
1262 	if (pcb->rb_init)
1263 		advance =
1264 		    VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1265 	else
1266 		advance = 0;
1267 
1268 	bytes_canread = vmbus_chan_read_available(pcb->chan);
1269 
1270 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1271 	    "%s: bytes_canread on br = %u, advance = %u\n",
1272 	    __func__, bytes_canread, advance);
1273 
1274 	if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) {
1275 		/*
1276 		 * Nothing to read. Need to advance the rindex before
1277 		 * calling sbwait, so host knows to wake us up when data
1278 		 * is available to read on rb.
1279 		 */
1280 		error = vmbus_chan_recv_idxadv(pcb->chan, advance);
1281 		if (error) {
1282 			HVSOCK_DBG(HVSOCK_DBG_ERR,
1283 			    "%s: after calling vmbus_chan_recv_idxadv, "
1284 			    "got error = %d\n",  __func__, error);
1285 			return (0);
1286 		} else {
1287 			pcb->rb_init = false;
1288 			pcb->recv_data_len = 0;
1289 			pcb->recv_data_off = 0;
1290 			bytes_canread = vmbus_chan_read_available(pcb->chan);
1291 
1292 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1293 			    "%s: advanced %u bytes, "
1294 			    " bytes_canread on br now = %u\n",
1295 			    __func__, advance, bytes_canread);
1296 
1297 			if (bytes_canread == 0)
1298 				return (0);
1299 			else
1300 				advance = 0;
1301 		}
1302 	}
1303 
1304 	if (bytes_canread <
1305 	    advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t)))
1306 		return (0);
1307 
1308 	error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt,
1309 	    sizeof(struct hvs_pkt_header), advance);
1310 
1311 	/* Don't have anything to read */
1312 	if (error) {
1313 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1314 		    "%s: after calling vmbus_chan_recv_peek, got error = %d\n",
1315 		    __func__, error);
1316 		return (0);
1317 	}
1318 
1319 	/*
1320 	 * We just read in a new packet header. Do some sanity checks.
1321 	 */
1322 	tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1323 	hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen);
1324 	dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size;
1325 	if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) ||
1326 	    __predict_false(hlen > tlen) ||
1327 	    __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) {
1328 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1329 		    "invalid tlen(%u), hlen(%u) or dlen(%u)\n",
1330 		    tlen, hlen, dlen);
1331 		pcb->so->so_error = EIO;
1332 		return (0);
1333 	}
1334 	if (pcb->rb_init == false)
1335 		pcb->rb_init = true;
1336 
1337 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1338 	    "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n",
1339 	    tlen, hlen, dlen);
1340 
1341 	/* The other side has sent a close FIN */
1342 	if (dlen == 0) {
1343 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1344 		    "%s: Received FIN from other side\n", __func__);
1345 		/* inform the caller by seting so_error to ESHUTDOWN */
1346 		pcb->so->so_error = ESHUTDOWN;
1347 	}
1348 
1349 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1350 	    "%s: canread on receive ring is %u \n", __func__, dlen);
1351 
1352 	pcb->recv_data_len = dlen;
1353 	pcb->recv_data_off = 0;
1354 
1355 	return (pcb->recv_data_len);
1356 }
1357 
1358 static uint32_t
1359 hvsock_canwrite_check(struct hvs_pcb *pcb)
1360 {
1361 	uint32_t writeable;
1362 	uint32_t ret;
1363 
1364 	if (pcb == NULL || pcb->chan == NULL)
1365 		return (0);
1366 
1367 	writeable = vmbus_chan_write_available(pcb->chan);
1368 
1369 	/*
1370 	 * We must always reserve a 0-length-payload packet for the FIN.
1371 	 */
1372 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1373 	    "%s: writeable is %u, should be greater than %ju\n",
1374 	    __func__, writeable,
1375 	    (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)));
1376 
1377 	if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) {
1378 		/*
1379 		 * The Tx ring seems full.
1380 		 */
1381 		return (0);
1382 	}
1383 
1384 	ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0);
1385 
1386 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1387 	    "%s: available size is %u\n", __func__, rounddown2(ret, 8));
1388 
1389 	return (rounddown2(ret, 8));
1390 }
1391 
1392 static void
1393 hvsock_set_chan_pending_send_size(struct vmbus_channel *chan)
1394 {
1395 	vmbus_chan_set_pending_send_size(chan,
1396 	    HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ));
1397 }
1398 
1399 static int
1400 hvsock_open_channel(struct vmbus_channel *chan, struct socket *so)
1401 {
1402 	unsigned int rcvbuf, sndbuf;
1403 	struct hvs_pcb *pcb = so2hvspcb(so);
1404 	int ret;
1405 
1406 	if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) {
1407 		sndbuf = HVS_RINGBUF_SND_SIZE;
1408 		rcvbuf = HVS_RINGBUF_RCV_SIZE;
1409 	} else {
1410 		sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE);
1411 		sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE);
1412 		sndbuf = rounddown2(sndbuf, PAGE_SIZE);
1413 		rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE);
1414 		rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE);
1415 		rcvbuf = rounddown2(rcvbuf, PAGE_SIZE);
1416 	}
1417 
1418 	/*
1419 	 * Can only read whatever user provided size of data
1420 	 * from ring buffer. Turn off batched reading.
1421 	 */
1422 	vmbus_chan_set_readbatch(chan, false);
1423 
1424 	ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0,
1425 	    hvsock_chan_cb, pcb);
1426 
1427 	if (ret != 0) {
1428 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1429 		    "%s: failed to open hvsock channel, sndbuf = %u, "
1430 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1431 	} else {
1432 		HVSOCK_DBG(HVSOCK_DBG_INFO,
1433 		    "%s: hvsock channel opened, sndbuf = %u, i"
1434 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1435 		/*
1436 		 * Se the pending send size so to receive wakeup
1437 		 * signals from host when there is enough space on
1438 		 * rx buffer ring to write.
1439 		 */
1440 		hvsock_set_chan_pending_send_size(chan);
1441 	}
1442 
1443 	return ret;
1444 }
1445 
1446 /*
1447  * Guest is listening passively on the socket. Open channel and
1448  * create a new socket for the conneciton.
1449  */
1450 static void
1451 hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so,
1452     struct hvsock_sc *sc)
1453 {
1454 	struct socket *new_so;
1455 	struct hvs_pcb *new_pcb, *pcb;
1456 	int error;
1457 
1458 	/* Do nothing if socket is not listening */
1459 	if (!SOLISTENING(so)) {
1460 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1461 		    "%s: socket is not a listening one\n", __func__);
1462 		return;
1463 	}
1464 
1465 	/*
1466 	 * Create a new socket. This will call pru_attach to complete
1467 	 * the socket initialization and put the new socket onto
1468 	 * listening socket's sol_incomp list, waiting to be promoted
1469 	 * to sol_comp list.
1470 	 * The new socket created has ref count 0. There is no other
1471 	 * thread that changes the state of this new one at the
1472 	 * moment, so we don't need to hold its lock while opening
1473 	 * channel and filling out its pcb information.
1474 	 */
1475 	new_so = sonewconn(so, 0);
1476 	if (!new_so)
1477 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1478 		    "%s: creating new socket failed\n", __func__);
1479 
1480 	/*
1481 	 * Now open the vmbus channel. If it fails, the socket will be
1482 	 * on the listening socket's sol_incomp queue until it is
1483 	 * replaced and aborted.
1484 	 */
1485 	error = hvsock_open_channel(chan, new_so);
1486 	if (error) {
1487 		new_so->so_error = error;
1488 		return;
1489 	}
1490 
1491 	pcb = so->so_pcb;
1492 	new_pcb = new_so->so_pcb;
1493 
1494 	hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port);
1495 	/* Remote port is unknown to guest in this type of conneciton */
1496 	hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN);
1497 	new_pcb->chan = chan;
1498 	new_pcb->recv_data_len = 0;
1499 	new_pcb->recv_data_off = 0;
1500 	new_pcb->rb_init = false;
1501 
1502 	new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan);
1503 	new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan);
1504 
1505 	hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED);
1506 
1507 	sc->pcb = new_pcb;
1508 
1509 	/*
1510 	 * Change the socket state to SS_ISCONNECTED. This will promote
1511 	 * the socket to sol_comp queue and wake up the thread which
1512 	 * is accepting connection.
1513 	 */
1514 	soisconnected(new_so);
1515 }
1516 
1517 
1518 /*
1519  * Guest is actively connecting to host.
1520  */
1521 static void
1522 hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so)
1523 {
1524 	struct hvs_pcb *pcb;
1525 	int error;
1526 
1527 	error = hvsock_open_channel(chan, so);
1528 	if (error) {
1529 		so->so_error = error;
1530 		return;
1531 	}
1532 
1533 	pcb = so->so_pcb;
1534 	pcb->chan = chan;
1535 	pcb->recv_data_len = 0;
1536 	pcb->recv_data_off = 0;
1537 	pcb->rb_init = false;
1538 
1539 	mtx_lock(&hvs_trans_socks_mtx);
1540 	__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
1541 	__hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED);
1542 	mtx_unlock(&hvs_trans_socks_mtx);
1543 
1544 	/*
1545 	 * Change the socket state to SS_ISCONNECTED. This will wake up
1546 	 * the thread sleeping in connect call.
1547 	 */
1548 	soisconnected(so);
1549 }
1550 
1551 static void
1552 hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc)
1553 {
1554 	struct hyperv_guid *inst_guid, *type_guid;
1555 	bool conn_from_host;
1556 	struct sockaddr_hvs addr;
1557 	struct socket *so;
1558 	struct hvs_pcb *pcb;
1559 
1560 	type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan);
1561 	inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan);
1562 	conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan);
1563 
1564 	HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is ");
1565 	hvsock_print_guid(type_guid);
1566 	HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is ");
1567 	hvsock_print_guid(inst_guid);
1568 	HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n",
1569 	    (conn_from_host == true ) ? "from" : "to");
1570 
1571 	/*
1572 	 * The listening port should be in [0, MAX_LISTEN_PORT]
1573 	 */
1574 	if (!is_valid_srv_id(type_guid))
1575 		return;
1576 
1577 	/*
1578 	 * There should be a bound socket already created no matter
1579 	 * it is a passive or active connection.
1580 	 * For host initiated connection (passive on guest side),
1581 	 * the  type_guid contains the port which guest is bound and
1582 	 * listening.
1583 	 * For the guest initiated connection (active on guest side),
1584 	 * the inst_guid contains the port that guest has auto bound
1585 	 * to.
1586 	 */
1587 	hvs_addr_init(&addr, conn_from_host ? type_guid : inst_guid);
1588 	so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND);
1589 	if (!so) {
1590 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1591 		    "%s: no bound socket found for port %u\n",
1592 		    __func__, addr.hvs_port);
1593 		return;
1594 	}
1595 
1596 	if (conn_from_host) {
1597 		hvsock_open_conn_passive(chan, so, sc);
1598 	} else {
1599 		(void) hvs_trans_lock();
1600 		pcb = so->so_pcb;
1601 		if (pcb && pcb->so) {
1602 			sc->pcb = so2hvspcb(so);
1603 			hvsock_open_conn_active(chan, so);
1604 		} else {
1605 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1606 			    "%s: channel detached before open\n", __func__);
1607 		}
1608 		hvs_trans_unlock();
1609 	}
1610 
1611 }
1612 
1613 static int
1614 hvsock_probe(device_t dev)
1615 {
1616 	struct vmbus_channel *channel = vmbus_get_channel(dev);
1617 
1618 	if (!channel || !vmbus_chan_is_hvs(channel)) {
1619 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1620 		    "hvsock_probe called but not a hvsock channel id %u\n",
1621 		    vmbus_chan_id(channel));
1622 
1623 		return ENXIO;
1624 	} else {
1625 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1626 		    "hvsock_probe got a hvsock channel id %u\n",
1627 		    vmbus_chan_id(channel));
1628 
1629 		return BUS_PROBE_DEFAULT;
1630 	}
1631 }
1632 
1633 static int
1634 hvsock_attach(device_t dev)
1635 {
1636 	struct vmbus_channel *channel = vmbus_get_channel(dev);
1637 	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1638 
1639 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n");
1640 
1641 	hvsock_open_connection(channel, sc);
1642 
1643 	/*
1644 	 * Always return success. On error the host will rescind the device
1645 	 * in 30 seconds and we can do cleanup at that time in
1646 	 * vmbus_chan_msgproc_chrescind().
1647 	 */
1648 	return (0);
1649 }
1650 
1651 static int
1652 hvsock_detach(device_t dev)
1653 {
1654 	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1655 	struct socket *so;
1656 	int retry;
1657 
1658 	if (bootverbose)
1659 		device_printf(dev, "hvsock_detach called.\n");
1660 
1661 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n");
1662 
1663 	if (sc->pcb != NULL) {
1664 		(void) hvs_trans_lock();
1665 
1666 		so = hsvpcb2so(sc->pcb);
1667 		if (so) {
1668 			/* Close the connection */
1669 			if (so->so_state &
1670 			    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
1671 				soisdisconnected(so);
1672 		}
1673 
1674 		mtx_lock(&hvs_trans_socks_mtx);
1675 		__hvs_remove_pcb_from_list(sc->pcb,
1676 		    HVS_LIST_BOUND | HVS_LIST_CONNECTED);
1677 		mtx_unlock(&hvs_trans_socks_mtx);
1678 
1679 		/*
1680 		 * Close channel while no reader and sender are working
1681 		 * on the buffer rings.
1682 		 */
1683 		if (so) {
1684 			retry = 0;
1685 			while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) {
1686 				/*
1687 				 * Someone is reading, rx br is busy
1688 				 */
1689 				soisdisconnected(so);
1690 				DELAY(500);
1691 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1692 				    "waiting for rx reader to exit, "
1693 				    "retry = %d\n", retry++);
1694 			}
1695 			retry = 0;
1696 			while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) {
1697 				/*
1698 				 * Someone is sending, tx br is busy
1699 				 */
1700 				soisdisconnected(so);
1701 				DELAY(500);
1702 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1703 				    "waiting for tx sender to exit, "
1704 				    "retry = %d\n", retry++);
1705 			}
1706 		}
1707 
1708 
1709 		bzero(sc->pcb, sizeof(struct hvs_pcb));
1710 		free(sc->pcb, M_HVSOCK);
1711 		sc->pcb = NULL;
1712 
1713 		if (so) {
1714 			SOCK_IO_RECV_UNLOCK(so);
1715 			SOCK_IO_SEND_UNLOCK(so);
1716 			so->so_pcb = NULL;
1717 		}
1718 
1719 		hvs_trans_unlock();
1720 	}
1721 
1722 	vmbus_chan_close(vmbus_get_channel(dev));
1723 
1724 	return (0);
1725 }
1726 
1727 static device_method_t hvsock_methods[] = {
1728 	/* Device interface */
1729 	DEVMETHOD(device_probe, hvsock_probe),
1730 	DEVMETHOD(device_attach, hvsock_attach),
1731 	DEVMETHOD(device_detach, hvsock_detach),
1732 	DEVMETHOD_END
1733 };
1734 
1735 static driver_t hvsock_driver = {
1736 	"hv_sock",
1737 	hvsock_methods,
1738 	sizeof(struct hvsock_sc)
1739 };
1740 
1741 DRIVER_MODULE(hvsock, vmbus, hvsock_driver, NULL, NULL);
1742 MODULE_VERSION(hvsock, 1);
1743 MODULE_DEPEND(hvsock, vmbus, 1, 1, 1);
1744