1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2020 Microsoft Corp. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/bus.h> 34 #include <sys/domain.h> 35 #include <sys/lock.h> 36 #include <sys/kernel.h> 37 #include <sys/types.h> 38 #include <sys/malloc.h> 39 #include <sys/module.h> 40 #include <sys/mutex.h> 41 #include <sys/proc.h> 42 #include <sys/protosw.h> 43 #include <sys/socket.h> 44 #include <sys/sysctl.h> 45 #include <sys/sysproto.h> 46 #include <sys/systm.h> 47 #include <sys/sockbuf.h> 48 #include <sys/sx.h> 49 #include <sys/uio.h> 50 51 #include <net/vnet.h> 52 53 #include <dev/hyperv/vmbus/vmbus_reg.h> 54 55 #include "hv_sock.h" 56 57 #define HVSOCK_DBG_NONE 0x0 58 #define HVSOCK_DBG_INFO 0x1 59 #define HVSOCK_DBG_ERR 0x2 60 #define HVSOCK_DBG_VERBOSE 0x3 61 62 63 SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket"); 64 65 static int hvs_dbg_level; 66 SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level, 67 0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose"); 68 69 70 #define HVSOCK_DBG(level, ...) 
do { \ 71 if (hvs_dbg_level >= (level)) \ 72 printf(__VA_ARGS__); \ 73 } while (0) 74 75 MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures"); 76 77 /* The MTU is 16KB per host side's design */ 78 #define HVSOCK_MTU_SIZE (1024 * 16) 79 #define HVSOCK_SEND_BUF_SZ (PAGE_SIZE - sizeof(struct vmpipe_proto_header)) 80 81 #define HVSOCK_HEADER_LEN (sizeof(struct hvs_pkt_header)) 82 83 #define HVSOCK_PKT_LEN(payload_len) (HVSOCK_HEADER_LEN + \ 84 roundup2(payload_len, 8) + \ 85 sizeof(uint64_t)) 86 87 88 static struct domain hv_socket_domain; 89 90 /* 91 * HyperV Transport sockets 92 */ 93 static struct pr_usrreqs hvs_trans_usrreqs = { 94 .pru_attach = hvs_trans_attach, 95 .pru_bind = hvs_trans_bind, 96 .pru_listen = hvs_trans_listen, 97 .pru_accept = hvs_trans_accept, 98 .pru_connect = hvs_trans_connect, 99 .pru_peeraddr = hvs_trans_peeraddr, 100 .pru_sockaddr = hvs_trans_sockaddr, 101 .pru_soreceive = hvs_trans_soreceive, 102 .pru_sosend = hvs_trans_sosend, 103 .pru_disconnect = hvs_trans_disconnect, 104 .pru_close = hvs_trans_close, 105 .pru_detach = hvs_trans_detach, 106 .pru_shutdown = hvs_trans_shutdown, 107 .pru_abort = hvs_trans_abort, 108 }; 109 110 /* 111 * Definitions of protocols supported in HyperV socket domain 112 */ 113 static struct protosw hv_socket_protosw[] = { 114 { 115 .pr_type = SOCK_STREAM, 116 .pr_domain = &hv_socket_domain, 117 .pr_protocol = HYPERV_SOCK_PROTO_TRANS, 118 .pr_flags = PR_CONNREQUIRED, 119 .pr_init = hvs_trans_init, 120 .pr_usrreqs = &hvs_trans_usrreqs, 121 }, 122 }; 123 124 static struct domain hv_socket_domain = { 125 .dom_family = AF_HYPERV, 126 .dom_name = "hyperv", 127 .dom_protosw = hv_socket_protosw, 128 .dom_protoswNPROTOSW = &hv_socket_protosw[nitems(hv_socket_protosw)] 129 }; 130 131 VNET_DOMAIN_SET(hv_socket_); 132 133 #define MAX_PORT ((uint32_t)0xFFFFFFFF) 134 #define MIN_PORT ((uint32_t)0x0) 135 136 /* 00000000-facb-11e6-bd58-64006a7986d3 */ 137 static const struct hyperv_guid srv_id_template = 
{ 138 .hv_guid = { 139 0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11, 140 0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 } 141 }; 142 143 static int hvsock_br_callback(void *, int, void *); 144 static uint32_t hvsock_canread_check(struct hvs_pcb *); 145 static uint32_t hvsock_canwrite_check(struct hvs_pcb *); 146 static int hvsock_send_data(struct vmbus_channel *chan, 147 struct uio *uio, uint32_t to_write, struct sockbuf *sb); 148 149 150 151 /* Globals */ 152 static struct sx hvs_trans_socks_sx; 153 static struct mtx hvs_trans_socks_mtx; 154 static LIST_HEAD(, hvs_pcb) hvs_trans_bound_socks; 155 static LIST_HEAD(, hvs_pcb) hvs_trans_connected_socks; 156 static uint32_t previous_auto_bound_port; 157 158 static void 159 hvsock_print_guid(struct hyperv_guid *guid) 160 { 161 unsigned char *p = (unsigned char *)guid; 162 163 HVSOCK_DBG(HVSOCK_DBG_INFO, 164 "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n", 165 *(unsigned int *)p, 166 *((unsigned short *) &p[4]), 167 *((unsigned short *) &p[6]), 168 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 169 } 170 171 static bool 172 is_valid_srv_id(const struct hyperv_guid *id) 173 { 174 return !memcmp(&id->hv_guid[4], 175 &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4); 176 } 177 178 static unsigned int 179 get_port_by_srv_id(const struct hyperv_guid *srv_id) 180 { 181 return *((const unsigned int *)srv_id); 182 } 183 184 static void 185 set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port) 186 { 187 *((unsigned int *)srv_id) = port; 188 } 189 190 191 static void 192 __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list) 193 { 194 struct hvs_pcb *p = NULL; 195 196 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); 197 198 if (!pcb) 199 return; 200 201 if (list & HVS_LIST_BOUND) { 202 LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) 203 if (p == pcb) 204 LIST_REMOVE(p, bound_next); 205 } 206 207 if (list & HVS_LIST_CONNECTED) { 208 LIST_FOREACH(p, 
&hvs_trans_connected_socks, connected_next) 209 if (p == pcb) 210 LIST_REMOVE(pcb, connected_next); 211 } 212 } 213 214 static void 215 __hvs_remove_socket_from_list(struct socket *so, unsigned char list) 216 { 217 struct hvs_pcb *pcb = so2hvspcb(so); 218 219 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); 220 221 __hvs_remove_pcb_from_list(pcb, list); 222 } 223 224 static void 225 __hvs_insert_socket_on_list(struct socket *so, unsigned char list) 226 { 227 struct hvs_pcb *pcb = so2hvspcb(so); 228 229 if (list & HVS_LIST_BOUND) 230 LIST_INSERT_HEAD(&hvs_trans_bound_socks, 231 pcb, bound_next); 232 233 if (list & HVS_LIST_CONNECTED) 234 LIST_INSERT_HEAD(&hvs_trans_connected_socks, 235 pcb, connected_next); 236 } 237 238 void 239 hvs_remove_socket_from_list(struct socket *so, unsigned char list) 240 { 241 if (!so || !so->so_pcb) { 242 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 243 "%s: socket or so_pcb is null\n", __func__); 244 return; 245 } 246 247 mtx_lock(&hvs_trans_socks_mtx); 248 __hvs_remove_socket_from_list(so, list); 249 mtx_unlock(&hvs_trans_socks_mtx); 250 } 251 252 static void 253 hvs_insert_socket_on_list(struct socket *so, unsigned char list) 254 { 255 if (!so || !so->so_pcb) { 256 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 257 "%s: socket or so_pcb is null\n", __func__); 258 return; 259 } 260 261 mtx_lock(&hvs_trans_socks_mtx); 262 __hvs_insert_socket_on_list(so, list); 263 mtx_unlock(&hvs_trans_socks_mtx); 264 } 265 266 static struct socket * 267 __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) 268 { 269 struct hvs_pcb *p = NULL; 270 271 if (list & HVS_LIST_BOUND) 272 LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) 273 if (p->so != NULL && 274 addr->hvs_port == p->local_addr.hvs_port) 275 return p->so; 276 277 if (list & HVS_LIST_CONNECTED) 278 LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) 279 if (p->so != NULL && 280 addr->hvs_port == p->local_addr.hvs_port) 281 return p->so; 282 283 return NULL; 284 } 285 286 
static struct socket * 287 hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) 288 { 289 struct socket *s = NULL; 290 291 mtx_lock(&hvs_trans_socks_mtx); 292 s = __hvs_find_socket_on_list(addr, list); 293 mtx_unlock(&hvs_trans_socks_mtx); 294 295 return s; 296 } 297 298 static inline void 299 hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port) 300 { 301 memset(addr, 0, sizeof(*addr)); 302 addr->sa_family = AF_HYPERV; 303 addr->hvs_port = port; 304 } 305 306 void 307 hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id) 308 { 309 hvs_addr_set(addr, get_port_by_srv_id(svr_id)); 310 } 311 312 int 313 hvs_trans_lock(void) 314 { 315 sx_xlock(&hvs_trans_socks_sx); 316 return (0); 317 } 318 319 void 320 hvs_trans_unlock(void) 321 { 322 sx_xunlock(&hvs_trans_socks_sx); 323 } 324 325 void 326 hvs_trans_init(void) 327 { 328 /* Skip initialization of globals for non-default instances. */ 329 if (!IS_DEFAULT_VNET(curvnet)) 330 return; 331 332 if (vm_guest != VM_GUEST_HV) 333 return; 334 335 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 336 "%s: HyperV Socket hvs_trans_init called\n", __func__); 337 338 /* Initialize Globals */ 339 previous_auto_bound_port = MAX_PORT; 340 sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx"); 341 mtx_init(&hvs_trans_socks_mtx, 342 "hvs_trans_socks_mtx", NULL, MTX_DEF); 343 LIST_INIT(&hvs_trans_bound_socks); 344 LIST_INIT(&hvs_trans_connected_socks); 345 } 346 347 /* 348 * Called in two cases: 349 * 1) When user calls socket(); 350 * 2) When we accept new incoming conneciton and call sonewconn(). 
351 */ 352 int 353 hvs_trans_attach(struct socket *so, int proto, struct thread *td) 354 { 355 struct hvs_pcb *pcb = so2hvspcb(so); 356 357 if (vm_guest != VM_GUEST_HV) 358 return (ESOCKTNOSUPPORT); 359 360 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 361 "%s: HyperV Socket hvs_trans_attach called\n", __func__); 362 363 if (so->so_type != SOCK_STREAM) 364 return (ESOCKTNOSUPPORT); 365 366 if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS) 367 return (EPROTONOSUPPORT); 368 369 if (pcb != NULL) 370 return (EISCONN); 371 pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO); 372 if (pcb == NULL) 373 return (ENOMEM); 374 375 pcb->so = so; 376 so->so_pcb = (void *)pcb; 377 378 return (0); 379 } 380 381 void 382 hvs_trans_detach(struct socket *so) 383 { 384 struct hvs_pcb *pcb; 385 386 if (vm_guest != VM_GUEST_HV) 387 return; 388 389 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 390 "%s: HyperV Socket hvs_trans_detach called\n", __func__); 391 392 (void) hvs_trans_lock(); 393 pcb = so2hvspcb(so); 394 if (pcb == NULL) { 395 hvs_trans_unlock(); 396 return; 397 } 398 399 if (SOLISTENING(so)) { 400 bzero(pcb, sizeof(*pcb)); 401 free(pcb, M_HVSOCK); 402 } 403 404 so->so_pcb = NULL; 405 406 hvs_trans_unlock(); 407 } 408 409 int 410 hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td) 411 { 412 struct hvs_pcb *pcb = so2hvspcb(so); 413 struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr; 414 int error = 0; 415 416 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 417 "%s: HyperV Socket hvs_trans_bind called\n", __func__); 418 419 if (sa == NULL) { 420 return (EINVAL); 421 } 422 423 if (pcb == NULL) { 424 return (EINVAL); 425 } 426 427 if (sa->sa_family != AF_HYPERV) { 428 HVSOCK_DBG(HVSOCK_DBG_ERR, 429 "%s: Not supported, sa_family is %u\n", 430 __func__, sa->sa_family); 431 return (EAFNOSUPPORT); 432 } 433 434 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 435 "%s: binding port = 0x%x\n", __func__, sa->hvs_port); 436 437 mtx_lock(&hvs_trans_socks_mtx); 438 if (__hvs_find_socket_on_list(sa, 439 
HVS_LIST_BOUND | HVS_LIST_CONNECTED)) { 440 error = EADDRINUSE; 441 } else { 442 /* 443 * The address is available for us to bind. 444 * Add socket to the bound list. 445 */ 446 hvs_addr_set(&pcb->local_addr, sa->hvs_port); 447 hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY); 448 __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); 449 } 450 mtx_unlock(&hvs_trans_socks_mtx); 451 452 return (error); 453 } 454 455 int 456 hvs_trans_listen(struct socket *so, int backlog, struct thread *td) 457 { 458 struct hvs_pcb *pcb = so2hvspcb(so); 459 struct socket *bound_so; 460 int error; 461 462 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 463 "%s: HyperV Socket hvs_trans_listen called\n", __func__); 464 465 if (pcb == NULL) 466 return (EINVAL); 467 468 /* Check if the address is already bound and it was by us. */ 469 bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND); 470 if (bound_so == NULL || bound_so != so) { 471 HVSOCK_DBG(HVSOCK_DBG_ERR, 472 "%s: Address not bound or not by us.\n", __func__); 473 return (EADDRNOTAVAIL); 474 } 475 476 SOCK_LOCK(so); 477 error = solisten_proto_check(so); 478 if (error == 0) 479 solisten_proto(so, backlog); 480 SOCK_UNLOCK(so); 481 482 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 483 "%s: HyperV Socket listen error = %d\n", __func__, error); 484 return (error); 485 } 486 487 int 488 hvs_trans_accept(struct socket *so, struct sockaddr **nam) 489 { 490 struct hvs_pcb *pcb = so2hvspcb(so); 491 492 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 493 "%s: HyperV Socket hvs_trans_accept called\n", __func__); 494 495 if (pcb == NULL) 496 return (EINVAL); 497 498 *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, 499 M_NOWAIT); 500 501 return ((*nam == NULL) ? 
ENOMEM : 0); 502 } 503 504 int 505 hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 506 { 507 struct hvs_pcb *pcb = so2hvspcb(so); 508 struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam; 509 bool found_auto_bound_port = false; 510 int i, error = 0; 511 512 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 513 "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n", 514 __func__, raddr->hvs_port); 515 516 if (pcb == NULL) 517 return (EINVAL); 518 519 /* Verify the remote address */ 520 if (raddr == NULL) 521 return (EINVAL); 522 if (raddr->sa_family != AF_HYPERV) 523 return (EAFNOSUPPORT); 524 525 mtx_lock(&hvs_trans_socks_mtx); 526 if (so->so_state & 527 (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) { 528 HVSOCK_DBG(HVSOCK_DBG_ERR, 529 "%s: socket connect in progress\n", 530 __func__); 531 error = EINPROGRESS; 532 goto out; 533 } 534 535 /* 536 * Find an available port for us to auto bind the local 537 * address. 538 */ 539 hvs_addr_set(&pcb->local_addr, 0); 540 541 for (i = previous_auto_bound_port - 1; 542 i != previous_auto_bound_port; i --) { 543 if (i == MIN_PORT) 544 i = MAX_PORT; 545 546 pcb->local_addr.hvs_port = i; 547 548 if (__hvs_find_socket_on_list(&pcb->local_addr, 549 HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) { 550 found_auto_bound_port = true; 551 previous_auto_bound_port = i; 552 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 553 "%s: found local bound port is %x\n", 554 __func__, pcb->local_addr.hvs_port); 555 break; 556 } 557 } 558 559 if (found_auto_bound_port == true) { 560 /* Found available port for auto bound, put on list */ 561 __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); 562 /* Set VM service ID */ 563 pcb->vm_srv_id = srv_id_template; 564 set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port); 565 /* Set host service ID and remote port */ 566 pcb->host_srv_id = srv_id_template; 567 set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port); 568 hvs_addr_set(&pcb->remote_addr, raddr->hvs_port); 569 570 /* Change 
the socket state to SS_ISCONNECTING */ 571 soisconnecting(so); 572 } else { 573 HVSOCK_DBG(HVSOCK_DBG_ERR, 574 "%s: No local port available for auto bound\n", 575 __func__); 576 error = EADDRINUSE; 577 } 578 579 HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is "); 580 hvsock_print_guid(&pcb->vm_srv_id); 581 HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is "); 582 hvsock_print_guid(&pcb->host_srv_id); 583 584 out: 585 mtx_unlock(&hvs_trans_socks_mtx); 586 587 if (found_auto_bound_port == true) 588 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id); 589 590 return (error); 591 } 592 593 int 594 hvs_trans_disconnect(struct socket *so) 595 { 596 struct hvs_pcb *pcb; 597 598 if (vm_guest != VM_GUEST_HV) 599 return (ESOCKTNOSUPPORT); 600 601 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 602 "%s: HyperV Socket hvs_trans_disconnect called\n", __func__); 603 604 (void) hvs_trans_lock(); 605 pcb = so2hvspcb(so); 606 if (pcb == NULL) { 607 hvs_trans_unlock(); 608 return (EINVAL); 609 } 610 611 /* If socket is already disconnected, skip this */ 612 if ((so->so_state & SS_ISDISCONNECTED) == 0) 613 soisdisconnecting(so); 614 615 hvs_trans_unlock(); 616 617 return (0); 618 } 619 620 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 
0 : SBL_WAIT) 621 struct hvs_callback_arg { 622 struct uio *uio; 623 struct sockbuf *sb; 624 }; 625 626 int 627 hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr, 628 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 629 { 630 struct hvs_pcb *pcb = so2hvspcb(so); 631 struct sockbuf *sb; 632 ssize_t orig_resid; 633 uint32_t canread, to_read; 634 int flags, error = 0; 635 struct hvs_callback_arg cbarg; 636 637 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 638 "%s: HyperV Socket hvs_trans_soreceive called\n", __func__); 639 640 if (so->so_type != SOCK_STREAM) 641 return (EINVAL); 642 if (pcb == NULL) 643 return (EINVAL); 644 645 if (flagsp != NULL) 646 flags = *flagsp &~ MSG_EOR; 647 else 648 flags = 0; 649 650 if (flags & MSG_PEEK) 651 return (EOPNOTSUPP); 652 653 /* If no space to copy out anything */ 654 if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ) 655 return (EINVAL); 656 657 sb = &so->so_rcv; 658 659 orig_resid = uio->uio_resid; 660 661 /* Prevent other readers from entering the socket. */ 662 error = sblock(sb, SBLOCKWAIT(flags)); 663 if (error) { 664 HVSOCK_DBG(HVSOCK_DBG_ERR, 665 "%s: sblock returned error = %d\n", __func__, error); 666 return (error); 667 } 668 669 SOCKBUF_LOCK(sb); 670 671 cbarg.uio = uio; 672 cbarg.sb = sb; 673 /* 674 * If the socket is closing, there might still be some data 675 * in rx br to read. However we need to make sure 676 * the channel is still open. 
677 */ 678 if ((sb->sb_state & SBS_CANTRCVMORE) && 679 (so->so_state & SS_ISDISCONNECTED)) { 680 /* Other thread already closed the channel */ 681 error = EPIPE; 682 goto out; 683 } 684 685 while (true) { 686 while (uio->uio_resid > 0 && 687 (canread = hvsock_canread_check(pcb)) > 0) { 688 to_read = MIN(canread, uio->uio_resid); 689 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 690 "%s: to_read = %u, skip = %u\n", __func__, to_read, 691 (unsigned int)(sizeof(struct hvs_pkt_header) + 692 pcb->recv_data_off)); 693 694 error = vmbus_chan_recv_peek_call(pcb->chan, to_read, 695 sizeof(struct hvs_pkt_header) + pcb->recv_data_off, 696 hvsock_br_callback, (void *)&cbarg); 697 /* 698 * It is possible socket is disconnected becasue 699 * we released lock in hvsock_br_callback. So we 700 * need to check the state to make sure it is not 701 * disconnected. 702 */ 703 if (error || so->so_state & SS_ISDISCONNECTED) { 704 break; 705 } 706 707 pcb->recv_data_len -= to_read; 708 pcb->recv_data_off += to_read; 709 } 710 711 if (error) 712 break; 713 714 /* Abort if socket has reported problems. */ 715 if (so->so_error) { 716 if (so->so_error == ESHUTDOWN && 717 orig_resid > uio->uio_resid) { 718 /* 719 * Although we got a FIN, we also received 720 * some data in this round. Delivery it 721 * to user. 722 */ 723 error = 0; 724 } else { 725 if (so->so_error != ESHUTDOWN) 726 error = so->so_error; 727 } 728 729 break; 730 } 731 732 /* Cannot received more. 
*/ 733 if (sb->sb_state & SBS_CANTRCVMORE) 734 break; 735 736 /* We are done if buffer has been filled */ 737 if (uio->uio_resid == 0) 738 break; 739 740 if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid) 741 break; 742 743 /* Buffer ring is empty and we shall not block */ 744 if ((so->so_state & SS_NBIO) || 745 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 746 if (orig_resid == uio->uio_resid) { 747 /* We have not read anything */ 748 error = EAGAIN; 749 } 750 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 751 "%s: non blocked read return, error %d.\n", 752 __func__, error); 753 break; 754 } 755 756 /* 757 * Wait and block until (more) data comes in. 758 * Note: Drops the sockbuf lock during wait. 759 */ 760 error = sbwait(sb); 761 762 if (error) 763 break; 764 765 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 766 "%s: wake up from sbwait, read available is %u\n", 767 __func__, vmbus_chan_read_available(pcb->chan)); 768 } 769 770 out: 771 SOCKBUF_UNLOCK(sb); 772 773 sbunlock(sb); 774 775 /* We recieved a FIN in this call */ 776 if (so->so_error == ESHUTDOWN) { 777 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 778 /* Send has already closed */ 779 soisdisconnecting(so); 780 } else { 781 /* Just close the receive side */ 782 socantrcvmore(so); 783 } 784 } 785 786 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 787 "%s: returning error = %d, so_error = %d\n", 788 __func__, error, so->so_error); 789 790 return (error); 791 } 792 793 int 794 hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 795 struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td) 796 { 797 struct hvs_pcb *pcb = so2hvspcb(so); 798 struct sockbuf *sb; 799 ssize_t orig_resid; 800 uint32_t canwrite, to_write; 801 int error = 0; 802 803 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 804 "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n", 805 __func__, uio->uio_resid); 806 807 if (so->so_type != SOCK_STREAM) 808 return (EINVAL); 809 if (pcb == NULL) 810 return (EINVAL); 811 812 /* If nothing to send */ 813 if 
(uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE) 814 return (EINVAL); 815 816 sb = &so->so_snd; 817 818 orig_resid = uio->uio_resid; 819 820 /* Prevent other writers from entering the socket. */ 821 error = sblock(sb, SBLOCKWAIT(flags)); 822 if (error) { 823 HVSOCK_DBG(HVSOCK_DBG_ERR, 824 "%s: sblock returned error = %d\n", __func__, error); 825 return (error); 826 } 827 828 SOCKBUF_LOCK(sb); 829 830 if ((sb->sb_state & SBS_CANTSENDMORE) || 831 so->so_error == ESHUTDOWN) { 832 error = EPIPE; 833 goto out; 834 } 835 836 while (uio->uio_resid > 0) { 837 canwrite = hvsock_canwrite_check(pcb); 838 if (canwrite == 0) { 839 /* We have sent some data */ 840 if (orig_resid > uio->uio_resid) 841 break; 842 /* 843 * We have not sent any data and it is 844 * non-blocked io 845 */ 846 if (so->so_state & SS_NBIO || 847 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 848 error = EWOULDBLOCK; 849 break; 850 } else { 851 /* 852 * We are here because there is no space on 853 * send buffer ring. Signal the other side 854 * to read and free more space. 855 * Sleep wait until space avaiable to send 856 * Note: Drops the sockbuf lock during wait. 
857 */ 858 error = sbwait(sb); 859 860 if (error) 861 break; 862 863 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 864 "%s: wake up from sbwait, space avail on " 865 "tx ring is %u\n", 866 __func__, 867 vmbus_chan_write_available(pcb->chan)); 868 869 continue; 870 } 871 } 872 to_write = MIN(canwrite, uio->uio_resid); 873 to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ); 874 875 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 876 "%s: canwrite is %u, to_write = %u\n", __func__, 877 canwrite, to_write); 878 error = hvsock_send_data(pcb->chan, uio, to_write, sb); 879 880 if (error) 881 break; 882 } 883 884 out: 885 SOCKBUF_UNLOCK(sb); 886 sbunlock(sb); 887 888 return (error); 889 } 890 891 int 892 hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam) 893 { 894 struct hvs_pcb *pcb = so2hvspcb(so); 895 896 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 897 "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__); 898 899 if (pcb == NULL) 900 return (EINVAL); 901 902 *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT); 903 904 return ((*nam == NULL)? ENOMEM : 0); 905 } 906 907 int 908 hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam) 909 { 910 struct hvs_pcb *pcb = so2hvspcb(so); 911 912 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 913 "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__); 914 915 if (pcb == NULL) 916 return (EINVAL); 917 918 *nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT); 919 920 return ((*nam == NULL)? 
ENOMEM : 0); 921 } 922 923 void 924 hvs_trans_close(struct socket *so) 925 { 926 struct hvs_pcb *pcb; 927 928 if (vm_guest != VM_GUEST_HV) 929 return; 930 931 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 932 "%s: HyperV Socket hvs_trans_close called\n", __func__); 933 934 (void) hvs_trans_lock(); 935 pcb = so2hvspcb(so); 936 if (!pcb) { 937 hvs_trans_unlock(); 938 return; 939 } 940 941 if (so->so_state & SS_ISCONNECTED) { 942 /* Send a FIN to peer */ 943 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 944 "%s: hvs_trans_close sending a FIN to host\n", __func__); 945 (void) hvsock_send_data(pcb->chan, NULL, 0, NULL); 946 } 947 948 if (so->so_state & 949 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 950 soisdisconnected(so); 951 952 pcb->chan = NULL; 953 pcb->so = NULL; 954 955 if (SOLISTENING(so)) { 956 mtx_lock(&hvs_trans_socks_mtx); 957 /* Remove from bound list */ 958 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 959 mtx_unlock(&hvs_trans_socks_mtx); 960 } 961 962 hvs_trans_unlock(); 963 964 return; 965 } 966 967 void 968 hvs_trans_abort(struct socket *so) 969 { 970 struct hvs_pcb *pcb = so2hvspcb(so); 971 972 if (vm_guest != VM_GUEST_HV) 973 return; 974 975 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 976 "%s: HyperV Socket hvs_trans_abort called\n", __func__); 977 978 (void) hvs_trans_lock(); 979 if (pcb == NULL) { 980 hvs_trans_unlock(); 981 return; 982 } 983 984 if (SOLISTENING(so)) { 985 mtx_lock(&hvs_trans_socks_mtx); 986 /* Remove from bound list */ 987 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 988 mtx_unlock(&hvs_trans_socks_mtx); 989 } 990 991 if (so->so_state & SS_ISCONNECTED) { 992 (void) sodisconnect(so); 993 } 994 hvs_trans_unlock(); 995 996 return; 997 } 998 999 int 1000 hvs_trans_shutdown(struct socket *so) 1001 { 1002 struct hvs_pcb *pcb = so2hvspcb(so); 1003 struct sockbuf *sb; 1004 1005 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1006 "%s: HyperV Socket hvs_trans_shutdown called\n", __func__); 1007 1008 if (pcb == NULL) 1009 return (EINVAL); 1010 1011 /* 1012 * Only get called with 
the shutdown method is SHUT_WR or 1013 * SHUT_RDWR. 1014 * When the method is SHUT_RD or SHUT_RDWR, the caller 1015 * already set the SBS_CANTRCVMORE on receive side socket 1016 * buffer. 1017 */ 1018 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 1019 /* 1020 * SHUT_WR only case. 1021 * Receive side is still open. Just close 1022 * the send side. 1023 */ 1024 socantsendmore(so); 1025 } else { 1026 /* SHUT_RDWR case */ 1027 if (so->so_state & SS_ISCONNECTED) { 1028 /* Send a FIN to peer */ 1029 sb = &so->so_snd; 1030 SOCKBUF_LOCK(sb); 1031 (void) hvsock_send_data(pcb->chan, NULL, 0, sb); 1032 SOCKBUF_UNLOCK(sb); 1033 1034 soisdisconnecting(so); 1035 } 1036 } 1037 1038 return (0); 1039 } 1040 1041 /* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is 1042 * <port> (see struct sockaddr_hvs). 1043 * 1044 * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV: 1045 * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user- 1046 * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with 1047 * the below sockaddr: 1048 * 1049 * struct SOCKADDR_HV 1050 * { 1051 * ADDRESS_FAMILY Family; 1052 * USHORT Reserved; 1053 * GUID VmId; 1054 * GUID ServiceId; 1055 * }; 1056 * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via 1057 * VMBus, because here it's obvious the host and the VM can easily identify 1058 * each other. Though the VmID is useful on the host, especially in the case 1059 * of Windows container, FreeBSD VM doesn't need it at all. 1060 * 1061 * To be compatible with similar infrastructure in Linux VMs, we have 1062 * to limit the available GUID space of SOCKADDR_HV so that we can create 1063 * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID. 
1064 * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is: 1065 * 1066 **************************************************************************** 1067 * The only valid Service GUIDs, from the perspectives of both the host and * 1068 * FreeBSD VM, that can be connected by the other end, must conform to this * 1069 * format: <port>-facb-11e6-bd58-64006a7986d3. * 1070 **************************************************************************** 1071 * 1072 * When we write apps on the host to connect(), the GUID ServiceID is used. 1073 * When we write apps in FreeBSD VM to connect(), we only need to specify the 1074 * port and the driver will form the GUID and use that to request the host. 1075 * 1076 * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the 1077 * auto-generated remote port for a connect request initiated by the host's 1078 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the 1079 * FreeBSD guest. 1080 */ 1081 1082 /* 1083 * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before) 1084 * restrict HyperV socket ring buffer size to six 4K pages. Newer 1085 * HyperV hosts don't have this limit. 
1086 */ 1087 #define HVS_RINGBUF_RCV_SIZE (PAGE_SIZE * 6) 1088 #define HVS_RINGBUF_SND_SIZE (PAGE_SIZE * 6) 1089 #define HVS_RINGBUF_MAX_SIZE (PAGE_SIZE * 64) 1090 1091 struct hvsock_sc { 1092 device_t dev; 1093 struct hvs_pcb *pcb; 1094 struct vmbus_channel *channel; 1095 }; 1096 1097 static bool 1098 hvsock_chan_readable(struct vmbus_channel *chan) 1099 { 1100 uint32_t readable = vmbus_chan_read_available(chan); 1101 1102 return (readable >= HVSOCK_PKT_LEN(0)); 1103 } 1104 1105 static void 1106 hvsock_chan_cb(struct vmbus_channel *chan, void *context) 1107 { 1108 struct hvs_pcb *pcb = (struct hvs_pcb *) context; 1109 struct socket *so; 1110 uint32_t canwrite; 1111 1112 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1113 "%s: host send us a wakeup on rb data, pcb = %p\n", 1114 __func__, pcb); 1115 1116 /* 1117 * Check if the socket is still attached and valid. 1118 * Here we know channel is still open. Need to make 1119 * sure the socket has not been closed or freed. 1120 */ 1121 (void) hvs_trans_lock(); 1122 so = hsvpcb2so(pcb); 1123 1124 if (pcb->chan != NULL && so != NULL) { 1125 /* 1126 * Wake up reader if there are data to read. 1127 */ 1128 SOCKBUF_LOCK(&(so)->so_rcv); 1129 1130 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1131 "%s: read available = %u\n", __func__, 1132 vmbus_chan_read_available(pcb->chan)); 1133 1134 if (hvsock_chan_readable(pcb->chan)) 1135 sorwakeup_locked(so); 1136 else 1137 SOCKBUF_UNLOCK(&(so)->so_rcv); 1138 1139 /* 1140 * Wake up sender if space becomes available to write. 
1141 */ 1142 SOCKBUF_LOCK(&(so)->so_snd); 1143 canwrite = hvsock_canwrite_check(pcb); 1144 1145 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1146 "%s: canwrite = %u\n", __func__, canwrite); 1147 1148 if (canwrite > 0) { 1149 sowwakeup_locked(so); 1150 } else { 1151 SOCKBUF_UNLOCK(&(so)->so_snd); 1152 } 1153 } 1154 1155 hvs_trans_unlock(); 1156 1157 return; 1158 } 1159 1160 static int 1161 hvsock_br_callback(void *datap, int cplen, void *cbarg) 1162 { 1163 struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg; 1164 struct uio *uio = arg->uio; 1165 struct sockbuf *sb = arg->sb; 1166 int error = 0; 1167 1168 if (cbarg == NULL || datap == NULL) 1169 return (EINVAL); 1170 1171 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1172 "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, " 1173 "datap = %p\n", 1174 __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br", 1175 uio->uio_resid, cplen, datap); 1176 1177 if (sb) 1178 SOCKBUF_UNLOCK(sb); 1179 1180 error = uiomove(datap, cplen, uio); 1181 1182 if (sb) 1183 SOCKBUF_LOCK(sb); 1184 1185 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1186 "%s: after uiomove, uio_resid = %zd, error = %d\n", 1187 __func__, uio->uio_resid, error); 1188 1189 return (error); 1190 } 1191 1192 static int 1193 hvsock_send_data(struct vmbus_channel *chan, struct uio *uio, 1194 uint32_t to_write, struct sockbuf *sb) 1195 { 1196 struct hvs_pkt_header hvs_pkt; 1197 int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0; 1198 uint64_t pad = 0; 1199 struct iovec iov[3]; 1200 struct hvs_callback_arg cbarg; 1201 1202 if (chan == NULL) 1203 return (ENOTCONN); 1204 1205 hlen = sizeof(struct vmbus_chanpkt_hdr); 1206 hvs_pkthlen = sizeof(struct hvs_pkt_header); 1207 hvs_pktlen = hvs_pkthlen + to_write; 1208 pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen); 1209 1210 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1211 "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, " 1212 "pad_pktlen = %u, data_len = %u\n", 1213 __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write); 1214 1215 
hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND; 1216 hvs_pkt.chan_pkt_hdr.cph_flags = 0; 1217 VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen); 1218 VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen); 1219 hvs_pkt.chan_pkt_hdr.cph_xactid = 0; 1220 1221 hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1; 1222 hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write; 1223 1224 cbarg.uio = uio; 1225 cbarg.sb = sb; 1226 1227 if (uio && to_write > 0) { 1228 iov[0].iov_base = &hvs_pkt; 1229 iov[0].iov_len = hvs_pkthlen; 1230 iov[1].iov_base = NULL; 1231 iov[1].iov_len = to_write; 1232 iov[2].iov_base = &pad; 1233 iov[2].iov_len = pad_pktlen - hvs_pktlen; 1234 1235 error = vmbus_chan_iov_send(chan, iov, 3, 1236 hvsock_br_callback, &cbarg); 1237 } else { 1238 if (to_write == 0) { 1239 iov[0].iov_base = &hvs_pkt; 1240 iov[0].iov_len = hvs_pkthlen; 1241 iov[1].iov_base = &pad; 1242 iov[1].iov_len = pad_pktlen - hvs_pktlen; 1243 error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL); 1244 } 1245 } 1246 1247 if (error) { 1248 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1249 "%s: error = %d\n", __func__, error); 1250 } 1251 1252 return (error); 1253 } 1254 1255 /* 1256 * Check if we have data on current ring buffer to read 1257 * or not. If not, advance the ring buffer read index to 1258 * next packet. Update the recev_data_len and recev_data_off 1259 * to new value. 1260 * Return the number of bytes can read. 
1261 */ 1262 static uint32_t 1263 hvsock_canread_check(struct hvs_pcb *pcb) 1264 { 1265 uint32_t advance; 1266 uint32_t tlen, hlen, dlen; 1267 uint32_t bytes_canread = 0; 1268 int error; 1269 1270 if (pcb == NULL || pcb->chan == NULL) { 1271 pcb->so->so_error = EIO; 1272 return (0); 1273 } 1274 1275 /* Still have data not read yet on current packet */ 1276 if (pcb->recv_data_len > 0) 1277 return (pcb->recv_data_len); 1278 1279 if (pcb->rb_init) 1280 advance = 1281 VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); 1282 else 1283 advance = 0; 1284 1285 bytes_canread = vmbus_chan_read_available(pcb->chan); 1286 1287 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1288 "%s: bytes_canread on br = %u, advance = %u\n", 1289 __func__, bytes_canread, advance); 1290 1291 if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) { 1292 /* 1293 * Nothing to read. Need to advance the rindex before 1294 * calling sbwait, so host knows to wake us up when data 1295 * is available to read on rb. 1296 */ 1297 error = vmbus_chan_recv_idxadv(pcb->chan, advance); 1298 if (error) { 1299 HVSOCK_DBG(HVSOCK_DBG_ERR, 1300 "%s: after calling vmbus_chan_recv_idxadv, " 1301 "got error = %d\n", __func__, error); 1302 return (0); 1303 } else { 1304 pcb->rb_init = false; 1305 pcb->recv_data_len = 0; 1306 pcb->recv_data_off = 0; 1307 bytes_canread = vmbus_chan_read_available(pcb->chan); 1308 1309 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1310 "%s: advanced %u bytes, " 1311 " bytes_canread on br now = %u\n", 1312 __func__, advance, bytes_canread); 1313 1314 if (bytes_canread == 0) 1315 return (0); 1316 else 1317 advance = 0; 1318 } 1319 } 1320 1321 if (bytes_canread < 1322 advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t))) 1323 return (0); 1324 1325 error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt, 1326 sizeof(struct hvs_pkt_header), advance); 1327 1328 /* Don't have anything to read */ 1329 if (error) { 1330 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1331 "%s: after calling vmbus_chan_recv_peek, got 
error = %d\n", 1332 __func__, error); 1333 return (0); 1334 } 1335 1336 /* 1337 * We just read in a new packet header. Do some sanity checks. 1338 */ 1339 tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); 1340 hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen); 1341 dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size; 1342 if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) || 1343 __predict_false(hlen > tlen) || 1344 __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) { 1345 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1346 "invalid tlen(%u), hlen(%u) or dlen(%u)\n", 1347 tlen, hlen, dlen); 1348 pcb->so->so_error = EIO; 1349 return (0); 1350 } 1351 if (pcb->rb_init == false) 1352 pcb->rb_init = true; 1353 1354 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1355 "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n", 1356 tlen, hlen, dlen); 1357 1358 /* The other side has sent a close FIN */ 1359 if (dlen == 0) { 1360 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1361 "%s: Received FIN from other side\n", __func__); 1362 /* inform the caller by seting so_error to ESHUTDOWN */ 1363 pcb->so->so_error = ESHUTDOWN; 1364 } 1365 1366 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1367 "%s: canread on receive ring is %u \n", __func__, dlen); 1368 1369 pcb->recv_data_len = dlen; 1370 pcb->recv_data_off = 0; 1371 1372 return (pcb->recv_data_len); 1373 } 1374 1375 static uint32_t 1376 hvsock_canwrite_check(struct hvs_pcb *pcb) 1377 { 1378 uint32_t writeable; 1379 uint32_t ret; 1380 1381 if (pcb == NULL || pcb->chan == NULL) 1382 return (0); 1383 1384 writeable = vmbus_chan_write_available(pcb->chan); 1385 1386 /* 1387 * We must always reserve a 0-length-payload packet for the FIN. 1388 */ 1389 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1390 "%s: writeable is %u, should be greater than %ju\n", 1391 __func__, writeable, 1392 (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0))); 1393 1394 if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) { 1395 /* 1396 * The Tx ring seems full. 
1397 */ 1398 return (0); 1399 } 1400 1401 ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0); 1402 1403 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1404 "%s: available size is %u\n", __func__, rounddown2(ret, 8)); 1405 1406 return (rounddown2(ret, 8)); 1407 } 1408 1409 static void 1410 hvsock_set_chan_pending_send_size(struct vmbus_channel *chan) 1411 { 1412 vmbus_chan_set_pending_send_size(chan, 1413 HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ)); 1414 } 1415 1416 static int 1417 hvsock_open_channel(struct vmbus_channel *chan, struct socket *so) 1418 { 1419 unsigned int rcvbuf, sndbuf; 1420 struct hvs_pcb *pcb = so2hvspcb(so); 1421 int ret; 1422 1423 if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) { 1424 sndbuf = HVS_RINGBUF_SND_SIZE; 1425 rcvbuf = HVS_RINGBUF_RCV_SIZE; 1426 } else { 1427 sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE); 1428 sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE); 1429 sndbuf = rounddown2(sndbuf, PAGE_SIZE); 1430 rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE); 1431 rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE); 1432 rcvbuf = rounddown2(rcvbuf, PAGE_SIZE); 1433 } 1434 1435 /* 1436 * Can only read whatever user provided size of data 1437 * from ring buffer. Turn off batched reading. 1438 */ 1439 vmbus_chan_set_readbatch(chan, false); 1440 1441 ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0, 1442 hvsock_chan_cb, pcb); 1443 1444 if (ret != 0) { 1445 HVSOCK_DBG(HVSOCK_DBG_ERR, 1446 "%s: failed to open hvsock channel, sndbuf = %u, " 1447 "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); 1448 } else { 1449 HVSOCK_DBG(HVSOCK_DBG_INFO, 1450 "%s: hvsock channel opened, sndbuf = %u, i" 1451 "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); 1452 /* 1453 * Se the pending send size so to receive wakeup 1454 * signals from host when there is enough space on 1455 * rx buffer ring to write. 1456 */ 1457 hvsock_set_chan_pending_send_size(chan); 1458 } 1459 1460 return ret; 1461 } 1462 1463 /* 1464 * Guest is listening passively on the socket. 
Open channel and 1465 * create a new socket for the conneciton. 1466 */ 1467 static void 1468 hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so, 1469 struct hvsock_sc *sc) 1470 { 1471 struct socket *new_so; 1472 struct hvs_pcb *new_pcb, *pcb; 1473 int error; 1474 1475 /* Do nothing if socket is not listening */ 1476 if ((so->so_options & SO_ACCEPTCONN) == 0) { 1477 HVSOCK_DBG(HVSOCK_DBG_ERR, 1478 "%s: socket is not a listening one\n", __func__); 1479 return; 1480 } 1481 1482 /* 1483 * Create a new socket. This will call pru_attach to complete 1484 * the socket initialization and put the new socket onto 1485 * listening socket's sol_incomp list, waiting to be promoted 1486 * to sol_comp list. 1487 * The new socket created has ref count 0. There is no other 1488 * thread that changes the state of this new one at the 1489 * moment, so we don't need to hold its lock while opening 1490 * channel and filling out its pcb information. 1491 */ 1492 new_so = sonewconn(so, 0); 1493 if (!new_so) 1494 HVSOCK_DBG(HVSOCK_DBG_ERR, 1495 "%s: creating new socket failed\n", __func__); 1496 1497 /* 1498 * Now open the vmbus channel. If it fails, the socket will be 1499 * on the listening socket's sol_incomp queue until it is 1500 * replaced and aborted. 
1501 */ 1502 error = hvsock_open_channel(chan, new_so); 1503 if (error) { 1504 new_so->so_error = error; 1505 return; 1506 } 1507 1508 pcb = so->so_pcb; 1509 new_pcb = new_so->so_pcb; 1510 1511 hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port); 1512 /* Remote port is unknown to guest in this type of conneciton */ 1513 hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN); 1514 new_pcb->chan = chan; 1515 new_pcb->recv_data_len = 0; 1516 new_pcb->recv_data_off = 0; 1517 new_pcb->rb_init = false; 1518 1519 new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan); 1520 new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan); 1521 1522 hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED); 1523 1524 sc->pcb = new_pcb; 1525 1526 /* 1527 * Change the socket state to SS_ISCONNECTED. This will promote 1528 * the socket to sol_comp queue and wake up the thread which 1529 * is accepting connection. 1530 */ 1531 soisconnected(new_so); 1532 } 1533 1534 1535 /* 1536 * Guest is actively connecting to host. 1537 */ 1538 static void 1539 hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so) 1540 { 1541 struct hvs_pcb *pcb; 1542 int error; 1543 1544 error = hvsock_open_channel(chan, so); 1545 if (error) { 1546 so->so_error = error; 1547 return; 1548 } 1549 1550 pcb = so->so_pcb; 1551 pcb->chan = chan; 1552 pcb->recv_data_len = 0; 1553 pcb->recv_data_off = 0; 1554 pcb->rb_init = false; 1555 1556 mtx_lock(&hvs_trans_socks_mtx); 1557 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 1558 __hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED); 1559 mtx_unlock(&hvs_trans_socks_mtx); 1560 1561 /* 1562 * Change the socket state to SS_ISCONNECTED. This will wake up 1563 * the thread sleeping in connect call. 
1564 */ 1565 soisconnected(so); 1566 } 1567 1568 static void 1569 hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc) 1570 { 1571 struct hyperv_guid *inst_guid, *type_guid; 1572 bool conn_from_host; 1573 struct sockaddr_hvs addr; 1574 struct socket *so; 1575 struct hvs_pcb *pcb; 1576 1577 type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan); 1578 inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan); 1579 conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan); 1580 1581 HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is "); 1582 hvsock_print_guid(type_guid); 1583 HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is "); 1584 hvsock_print_guid(inst_guid); 1585 HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n", 1586 (conn_from_host == true ) ? "from" : "to"); 1587 1588 /* 1589 * The listening port should be in [0, MAX_LISTEN_PORT] 1590 */ 1591 if (!is_valid_srv_id(type_guid)) 1592 return; 1593 1594 /* 1595 * There should be a bound socket already created no matter 1596 * it is a passive or active connection. 1597 * For host initiated connection (passive on guest side), 1598 * the type_guid contains the port which guest is bound and 1599 * listening. 1600 * For the guest initiated connection (active on guest side), 1601 * the inst_guid contains the port that guest has auto bound 1602 * to. 1603 */ 1604 hvs_addr_init(&addr, conn_from_host ? 
type_guid : inst_guid); 1605 so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND); 1606 if (!so) { 1607 HVSOCK_DBG(HVSOCK_DBG_ERR, 1608 "%s: no bound socket found for port %u\n", 1609 __func__, addr.hvs_port); 1610 return; 1611 } 1612 1613 if (conn_from_host) { 1614 hvsock_open_conn_passive(chan, so, sc); 1615 } else { 1616 (void) hvs_trans_lock(); 1617 pcb = so->so_pcb; 1618 if (pcb && pcb->so) { 1619 sc->pcb = so2hvspcb(so); 1620 hvsock_open_conn_active(chan, so); 1621 } else { 1622 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1623 "%s: channel detached before open\n", __func__); 1624 } 1625 hvs_trans_unlock(); 1626 } 1627 1628 } 1629 1630 static int 1631 hvsock_probe(device_t dev) 1632 { 1633 struct vmbus_channel *channel = vmbus_get_channel(dev); 1634 1635 if (!channel || !vmbus_chan_is_hvs(channel)) { 1636 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1637 "hvsock_probe called but not a hvsock channel id %u\n", 1638 vmbus_chan_id(channel)); 1639 1640 return ENXIO; 1641 } else { 1642 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1643 "hvsock_probe got a hvsock channel id %u\n", 1644 vmbus_chan_id(channel)); 1645 1646 return BUS_PROBE_DEFAULT; 1647 } 1648 } 1649 1650 static int 1651 hvsock_attach(device_t dev) 1652 { 1653 struct vmbus_channel *channel = vmbus_get_channel(dev); 1654 struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); 1655 1656 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n"); 1657 1658 hvsock_open_connection(channel, sc); 1659 1660 /* 1661 * Always return success. On error the host will rescind the device 1662 * in 30 seconds and we can do cleanup at that time in 1663 * vmbus_chan_msgproc_chrescind(). 
1664 */ 1665 return (0); 1666 } 1667 1668 static int 1669 hvsock_detach(device_t dev) 1670 { 1671 struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); 1672 struct socket *so; 1673 int error, retry; 1674 1675 if (bootverbose) 1676 device_printf(dev, "hvsock_detach called.\n"); 1677 1678 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n"); 1679 1680 if (sc->pcb != NULL) { 1681 (void) hvs_trans_lock(); 1682 1683 so = hsvpcb2so(sc->pcb); 1684 if (so) { 1685 /* Close the connection */ 1686 if (so->so_state & 1687 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 1688 soisdisconnected(so); 1689 } 1690 1691 mtx_lock(&hvs_trans_socks_mtx); 1692 __hvs_remove_pcb_from_list(sc->pcb, 1693 HVS_LIST_BOUND | HVS_LIST_CONNECTED); 1694 mtx_unlock(&hvs_trans_socks_mtx); 1695 1696 /* 1697 * Close channel while no reader and sender are working 1698 * on the buffer rings. 1699 */ 1700 if (so) { 1701 retry = 0; 1702 while ((error = sblock(&so->so_rcv, 0)) == 1703 EWOULDBLOCK) { 1704 /* 1705 * Someone is reading, rx br is busy 1706 */ 1707 soisdisconnected(so); 1708 DELAY(500); 1709 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1710 "waiting for rx reader to exit, " 1711 "retry = %d\n", retry++); 1712 } 1713 retry = 0; 1714 while ((error = sblock(&so->so_snd, 0)) == 1715 EWOULDBLOCK) { 1716 /* 1717 * Someone is sending, tx br is busy 1718 */ 1719 soisdisconnected(so); 1720 DELAY(500); 1721 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1722 "waiting for tx sender to exit, " 1723 "retry = %d\n", retry++); 1724 } 1725 } 1726 1727 1728 bzero(sc->pcb, sizeof(struct hvs_pcb)); 1729 free(sc->pcb, M_HVSOCK); 1730 sc->pcb = NULL; 1731 1732 if (so) { 1733 sbunlock(&so->so_rcv); 1734 sbunlock(&so->so_snd); 1735 so->so_pcb = NULL; 1736 } 1737 1738 hvs_trans_unlock(); 1739 } 1740 1741 vmbus_chan_close(vmbus_get_channel(dev)); 1742 1743 return (0); 1744 } 1745 1746 static device_method_t hvsock_methods[] = { 1747 /* Device interface */ 1748 DEVMETHOD(device_probe, hvsock_probe), 1749 DEVMETHOD(device_attach, 
hvsock_attach), 1750 DEVMETHOD(device_detach, hvsock_detach), 1751 DEVMETHOD_END 1752 }; 1753 1754 static driver_t hvsock_driver = { 1755 "hv_sock", 1756 hvsock_methods, 1757 sizeof(struct hvsock_sc) 1758 }; 1759 1760 static devclass_t hvsock_devclass; 1761 1762 DRIVER_MODULE(hvsock, vmbus, hvsock_driver, hvsock_devclass, NULL, NULL); 1763 MODULE_VERSION(hvsock, 1); 1764 MODULE_DEPEND(hvsock, vmbus, 1, 1, 1); 1765