1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2020 Microsoft Corp. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/bus.h> 34 #include <sys/domain.h> 35 #include <sys/lock.h> 36 #include <sys/kernel.h> 37 #include <sys/types.h> 38 #include <sys/malloc.h> 39 #include <sys/module.h> 40 #include <sys/mutex.h> 41 #include <sys/proc.h> 42 #include <sys/protosw.h> 43 #include <sys/socket.h> 44 #include <sys/sysctl.h> 45 #include <sys/sysproto.h> 46 #include <sys/systm.h> 47 #include <sys/sockbuf.h> 48 #include <sys/sx.h> 49 #include <sys/uio.h> 50 51 #include <net/vnet.h> 52 53 #include <dev/hyperv/vmbus/vmbus_reg.h> 54 55 #include "hv_sock.h" 56 57 #define HVSOCK_DBG_NONE 0x0 58 #define HVSOCK_DBG_INFO 0x1 59 #define HVSOCK_DBG_ERR 0x2 60 #define HVSOCK_DBG_VERBOSE 0x3 61 62 63 SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket"); 64 65 static int hvs_dbg_level; 66 SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level, 67 0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose"); 68 69 70 #define HVSOCK_DBG(level, ...) 
do { \ 71 if (hvs_dbg_level >= (level)) \ 72 printf(__VA_ARGS__); \ 73 } while (0) 74 75 MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures"); 76 77 /* The MTU is 16KB per host side's design */ 78 #define HVSOCK_MTU_SIZE (1024 * 16) 79 #define HVSOCK_SEND_BUF_SZ (PAGE_SIZE - sizeof(struct vmpipe_proto_header)) 80 81 #define HVSOCK_HEADER_LEN (sizeof(struct hvs_pkt_header)) 82 83 #define HVSOCK_PKT_LEN(payload_len) (HVSOCK_HEADER_LEN + \ 84 roundup2(payload_len, 8) + \ 85 sizeof(uint64_t)) 86 87 88 static struct domain hv_socket_domain; 89 90 /* 91 * HyperV Transport sockets 92 */ 93 static struct pr_usrreqs hvs_trans_usrreqs = { 94 .pru_attach = hvs_trans_attach, 95 .pru_bind = hvs_trans_bind, 96 .pru_listen = hvs_trans_listen, 97 .pru_accept = hvs_trans_accept, 98 .pru_connect = hvs_trans_connect, 99 .pru_peeraddr = hvs_trans_peeraddr, 100 .pru_sockaddr = hvs_trans_sockaddr, 101 .pru_soreceive = hvs_trans_soreceive, 102 .pru_sosend = hvs_trans_sosend, 103 .pru_disconnect = hvs_trans_disconnect, 104 .pru_close = hvs_trans_close, 105 .pru_detach = hvs_trans_detach, 106 .pru_shutdown = hvs_trans_shutdown, 107 .pru_abort = hvs_trans_abort, 108 }; 109 110 /* 111 * Definitions of protocols supported in HyperV socket domain 112 */ 113 static struct protosw hv_socket_protosw[] = { 114 { 115 .pr_type = SOCK_STREAM, 116 .pr_domain = &hv_socket_domain, 117 .pr_protocol = HYPERV_SOCK_PROTO_TRANS, 118 .pr_flags = PR_CONNREQUIRED, 119 .pr_init = hvs_trans_init, 120 .pr_usrreqs = &hvs_trans_usrreqs, 121 }, 122 }; 123 124 static struct domain hv_socket_domain = { 125 .dom_family = AF_HYPERV, 126 .dom_name = "hyperv", 127 .dom_protosw = hv_socket_protosw, 128 .dom_protoswNPROTOSW = &hv_socket_protosw[nitems(hv_socket_protosw)] 129 }; 130 131 VNET_DOMAIN_SET(hv_socket_); 132 133 #define MAX_PORT ((uint32_t)0xFFFFFFFF) 134 #define MIN_PORT ((uint32_t)0x0) 135 136 /* 00000000-facb-11e6-bd58-64006a7986d3 */ 137 static const struct hyperv_guid srv_id_template = 
{ 138 .hv_guid = { 139 0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11, 140 0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 } 141 }; 142 143 static int hvsock_br_callback(void *, int, void *); 144 static uint32_t hvsock_canread_check(struct hvs_pcb *); 145 static uint32_t hvsock_canwrite_check(struct hvs_pcb *); 146 static int hvsock_send_data(struct vmbus_channel *chan, 147 struct uio *uio, uint32_t to_write, struct sockbuf *sb); 148 149 150 151 /* Globals */ 152 static struct sx hvs_trans_socks_sx; 153 static struct mtx hvs_trans_socks_mtx; 154 static LIST_HEAD(, hvs_pcb) hvs_trans_bound_socks; 155 static LIST_HEAD(, hvs_pcb) hvs_trans_connected_socks; 156 static uint32_t previous_auto_bound_port; 157 158 static void 159 hvsock_print_guid(struct hyperv_guid *guid) 160 { 161 unsigned char *p = (unsigned char *)guid; 162 163 HVSOCK_DBG(HVSOCK_DBG_INFO, 164 "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n", 165 *(unsigned int *)p, 166 *((unsigned short *) &p[4]), 167 *((unsigned short *) &p[6]), 168 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 169 } 170 171 static bool 172 is_valid_srv_id(const struct hyperv_guid *id) 173 { 174 return !memcmp(&id->hv_guid[4], 175 &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4); 176 } 177 178 static unsigned int 179 get_port_by_srv_id(const struct hyperv_guid *srv_id) 180 { 181 return *((const unsigned int *)srv_id); 182 } 183 184 static void 185 set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port) 186 { 187 *((unsigned int *)srv_id) = port; 188 } 189 190 191 static void 192 __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list) 193 { 194 struct hvs_pcb *p = NULL; 195 196 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); 197 198 if (!pcb) 199 return; 200 201 if (list & HVS_LIST_BOUND) { 202 LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) 203 if (p == pcb) 204 LIST_REMOVE(p, bound_next); 205 } 206 207 if (list & HVS_LIST_CONNECTED) { 208 LIST_FOREACH(p, 
&hvs_trans_connected_socks, connected_next) 209 if (p == pcb) 210 LIST_REMOVE(pcb, connected_next); 211 } 212 } 213 214 static void 215 __hvs_remove_socket_from_list(struct socket *so, unsigned char list) 216 { 217 struct hvs_pcb *pcb = so2hvspcb(so); 218 219 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); 220 221 __hvs_remove_pcb_from_list(pcb, list); 222 } 223 224 static void 225 __hvs_insert_socket_on_list(struct socket *so, unsigned char list) 226 { 227 struct hvs_pcb *pcb = so2hvspcb(so); 228 229 if (list & HVS_LIST_BOUND) 230 LIST_INSERT_HEAD(&hvs_trans_bound_socks, 231 pcb, bound_next); 232 233 if (list & HVS_LIST_CONNECTED) 234 LIST_INSERT_HEAD(&hvs_trans_connected_socks, 235 pcb, connected_next); 236 } 237 238 void 239 hvs_remove_socket_from_list(struct socket *so, unsigned char list) 240 { 241 if (!so || !so->so_pcb) { 242 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 243 "%s: socket or so_pcb is null\n", __func__); 244 return; 245 } 246 247 mtx_lock(&hvs_trans_socks_mtx); 248 __hvs_remove_socket_from_list(so, list); 249 mtx_unlock(&hvs_trans_socks_mtx); 250 } 251 252 static void 253 hvs_insert_socket_on_list(struct socket *so, unsigned char list) 254 { 255 if (!so || !so->so_pcb) { 256 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 257 "%s: socket or so_pcb is null\n", __func__); 258 return; 259 } 260 261 mtx_lock(&hvs_trans_socks_mtx); 262 __hvs_insert_socket_on_list(so, list); 263 mtx_unlock(&hvs_trans_socks_mtx); 264 } 265 266 static struct socket * 267 __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) 268 { 269 struct hvs_pcb *p = NULL; 270 271 if (list & HVS_LIST_BOUND) 272 LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) 273 if (p->so != NULL && 274 addr->hvs_port == p->local_addr.hvs_port) 275 return p->so; 276 277 if (list & HVS_LIST_CONNECTED) 278 LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) 279 if (p->so != NULL && 280 addr->hvs_port == p->local_addr.hvs_port) 281 return p->so; 282 283 return NULL; 284 } 285 286 
static struct socket * 287 hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) 288 { 289 struct socket *s = NULL; 290 291 mtx_lock(&hvs_trans_socks_mtx); 292 s = __hvs_find_socket_on_list(addr, list); 293 mtx_unlock(&hvs_trans_socks_mtx); 294 295 return s; 296 } 297 298 static inline void 299 hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port) 300 { 301 memset(addr, 0, sizeof(*addr)); 302 addr->sa_family = AF_HYPERV; 303 addr->sa_len = sizeof(*addr); 304 addr->hvs_port = port; 305 } 306 307 void 308 hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id) 309 { 310 hvs_addr_set(addr, get_port_by_srv_id(svr_id)); 311 } 312 313 int 314 hvs_trans_lock(void) 315 { 316 sx_xlock(&hvs_trans_socks_sx); 317 return (0); 318 } 319 320 void 321 hvs_trans_unlock(void) 322 { 323 sx_xunlock(&hvs_trans_socks_sx); 324 } 325 326 void 327 hvs_trans_init(void) 328 { 329 /* Skip initialization of globals for non-default instances. */ 330 if (!IS_DEFAULT_VNET(curvnet)) 331 return; 332 333 if (vm_guest != VM_GUEST_HV) 334 return; 335 336 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 337 "%s: HyperV Socket hvs_trans_init called\n", __func__); 338 339 /* Initialize Globals */ 340 previous_auto_bound_port = MAX_PORT; 341 sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx"); 342 mtx_init(&hvs_trans_socks_mtx, 343 "hvs_trans_socks_mtx", NULL, MTX_DEF); 344 LIST_INIT(&hvs_trans_bound_socks); 345 LIST_INIT(&hvs_trans_connected_socks); 346 } 347 348 /* 349 * Called in two cases: 350 * 1) When user calls socket(); 351 * 2) When we accept new incoming conneciton and call sonewconn(). 
352 */ 353 int 354 hvs_trans_attach(struct socket *so, int proto, struct thread *td) 355 { 356 struct hvs_pcb *pcb = so2hvspcb(so); 357 358 if (vm_guest != VM_GUEST_HV) 359 return (ESOCKTNOSUPPORT); 360 361 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 362 "%s: HyperV Socket hvs_trans_attach called\n", __func__); 363 364 if (so->so_type != SOCK_STREAM) 365 return (ESOCKTNOSUPPORT); 366 367 if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS) 368 return (EPROTONOSUPPORT); 369 370 if (pcb != NULL) 371 return (EISCONN); 372 pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO); 373 if (pcb == NULL) 374 return (ENOMEM); 375 376 pcb->so = so; 377 so->so_pcb = (void *)pcb; 378 379 return (0); 380 } 381 382 void 383 hvs_trans_detach(struct socket *so) 384 { 385 struct hvs_pcb *pcb; 386 387 if (vm_guest != VM_GUEST_HV) 388 return; 389 390 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 391 "%s: HyperV Socket hvs_trans_detach called\n", __func__); 392 393 (void) hvs_trans_lock(); 394 pcb = so2hvspcb(so); 395 if (pcb == NULL) { 396 hvs_trans_unlock(); 397 return; 398 } 399 400 if (SOLISTENING(so)) { 401 bzero(pcb, sizeof(*pcb)); 402 free(pcb, M_HVSOCK); 403 } 404 405 so->so_pcb = NULL; 406 407 hvs_trans_unlock(); 408 } 409 410 int 411 hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td) 412 { 413 struct hvs_pcb *pcb = so2hvspcb(so); 414 struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr; 415 int error = 0; 416 417 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 418 "%s: HyperV Socket hvs_trans_bind called\n", __func__); 419 420 if (sa == NULL) { 421 return (EINVAL); 422 } 423 424 if (pcb == NULL) { 425 return (EINVAL); 426 } 427 428 if (sa->sa_family != AF_HYPERV) { 429 HVSOCK_DBG(HVSOCK_DBG_ERR, 430 "%s: Not supported, sa_family is %u\n", 431 __func__, sa->sa_family); 432 return (EAFNOSUPPORT); 433 } 434 if (sa->sa_len != sizeof(*sa)) { 435 HVSOCK_DBG(HVSOCK_DBG_ERR, 436 "%s: Not supported, sa_len is %u\n", 437 __func__, sa->sa_len); 438 return (EINVAL); 439 } 440 441 
HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 442 "%s: binding port = 0x%x\n", __func__, sa->hvs_port); 443 444 mtx_lock(&hvs_trans_socks_mtx); 445 if (__hvs_find_socket_on_list(sa, 446 HVS_LIST_BOUND | HVS_LIST_CONNECTED)) { 447 error = EADDRINUSE; 448 } else { 449 /* 450 * The address is available for us to bind. 451 * Add socket to the bound list. 452 */ 453 hvs_addr_set(&pcb->local_addr, sa->hvs_port); 454 hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY); 455 __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); 456 } 457 mtx_unlock(&hvs_trans_socks_mtx); 458 459 return (error); 460 } 461 462 int 463 hvs_trans_listen(struct socket *so, int backlog, struct thread *td) 464 { 465 struct hvs_pcb *pcb = so2hvspcb(so); 466 struct socket *bound_so; 467 int error; 468 469 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 470 "%s: HyperV Socket hvs_trans_listen called\n", __func__); 471 472 if (pcb == NULL) 473 return (EINVAL); 474 475 /* Check if the address is already bound and it was by us. */ 476 bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND); 477 if (bound_so == NULL || bound_so != so) { 478 HVSOCK_DBG(HVSOCK_DBG_ERR, 479 "%s: Address not bound or not by us.\n", __func__); 480 return (EADDRNOTAVAIL); 481 } 482 483 SOCK_LOCK(so); 484 error = solisten_proto_check(so); 485 if (error == 0) 486 solisten_proto(so, backlog); 487 SOCK_UNLOCK(so); 488 489 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 490 "%s: HyperV Socket listen error = %d\n", __func__, error); 491 return (error); 492 } 493 494 int 495 hvs_trans_accept(struct socket *so, struct sockaddr **nam) 496 { 497 struct hvs_pcb *pcb = so2hvspcb(so); 498 499 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 500 "%s: HyperV Socket hvs_trans_accept called\n", __func__); 501 502 if (pcb == NULL) 503 return (EINVAL); 504 505 *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, 506 M_NOWAIT); 507 508 return ((*nam == NULL) ? 
ENOMEM : 0); 509 } 510 511 int 512 hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 513 { 514 struct hvs_pcb *pcb = so2hvspcb(so); 515 struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam; 516 bool found_auto_bound_port = false; 517 int i, error = 0; 518 519 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 520 "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n", 521 __func__, raddr->hvs_port); 522 523 if (pcb == NULL) 524 return (EINVAL); 525 526 /* Verify the remote address */ 527 if (raddr == NULL) 528 return (EINVAL); 529 if (raddr->sa_family != AF_HYPERV) 530 return (EAFNOSUPPORT); 531 if (raddr->sa_len != sizeof(*raddr)) 532 return (EINVAL); 533 534 mtx_lock(&hvs_trans_socks_mtx); 535 if (so->so_state & 536 (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) { 537 HVSOCK_DBG(HVSOCK_DBG_ERR, 538 "%s: socket connect in progress\n", 539 __func__); 540 error = EINPROGRESS; 541 goto out; 542 } 543 544 /* 545 * Find an available port for us to auto bind the local 546 * address. 
547 */ 548 hvs_addr_set(&pcb->local_addr, 0); 549 550 for (i = previous_auto_bound_port - 1; 551 i != previous_auto_bound_port; i --) { 552 if (i == MIN_PORT) 553 i = MAX_PORT; 554 555 pcb->local_addr.hvs_port = i; 556 557 if (__hvs_find_socket_on_list(&pcb->local_addr, 558 HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) { 559 found_auto_bound_port = true; 560 previous_auto_bound_port = i; 561 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 562 "%s: found local bound port is %x\n", 563 __func__, pcb->local_addr.hvs_port); 564 break; 565 } 566 } 567 568 if (found_auto_bound_port == true) { 569 /* Found available port for auto bound, put on list */ 570 __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); 571 /* Set VM service ID */ 572 pcb->vm_srv_id = srv_id_template; 573 set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port); 574 /* Set host service ID and remote port */ 575 pcb->host_srv_id = srv_id_template; 576 set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port); 577 hvs_addr_set(&pcb->remote_addr, raddr->hvs_port); 578 579 /* Change the socket state to SS_ISCONNECTING */ 580 soisconnecting(so); 581 } else { 582 HVSOCK_DBG(HVSOCK_DBG_ERR, 583 "%s: No local port available for auto bound\n", 584 __func__); 585 error = EADDRINUSE; 586 } 587 588 HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is "); 589 hvsock_print_guid(&pcb->vm_srv_id); 590 HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is "); 591 hvsock_print_guid(&pcb->host_srv_id); 592 593 out: 594 mtx_unlock(&hvs_trans_socks_mtx); 595 596 if (found_auto_bound_port == true) 597 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id); 598 599 return (error); 600 } 601 602 int 603 hvs_trans_disconnect(struct socket *so) 604 { 605 struct hvs_pcb *pcb; 606 607 if (vm_guest != VM_GUEST_HV) 608 return (ESOCKTNOSUPPORT); 609 610 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 611 "%s: HyperV Socket hvs_trans_disconnect called\n", __func__); 612 613 (void) hvs_trans_lock(); 614 pcb = so2hvspcb(so); 615 if (pcb == NULL) { 616 hvs_trans_unlock(); 
617 return (EINVAL); 618 } 619 620 /* If socket is already disconnected, skip this */ 621 if ((so->so_state & SS_ISDISCONNECTED) == 0) 622 soisdisconnecting(so); 623 624 hvs_trans_unlock(); 625 626 return (0); 627 } 628 629 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) 630 struct hvs_callback_arg { 631 struct uio *uio; 632 struct sockbuf *sb; 633 }; 634 635 int 636 hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr, 637 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 638 { 639 struct hvs_pcb *pcb = so2hvspcb(so); 640 struct sockbuf *sb; 641 ssize_t orig_resid; 642 uint32_t canread, to_read; 643 int flags, error = 0; 644 struct hvs_callback_arg cbarg; 645 646 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 647 "%s: HyperV Socket hvs_trans_soreceive called\n", __func__); 648 649 if (so->so_type != SOCK_STREAM) 650 return (EINVAL); 651 if (pcb == NULL) 652 return (EINVAL); 653 654 if (flagsp != NULL) 655 flags = *flagsp &~ MSG_EOR; 656 else 657 flags = 0; 658 659 if (flags & MSG_PEEK) 660 return (EOPNOTSUPP); 661 662 /* If no space to copy out anything */ 663 if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ) 664 return (EINVAL); 665 666 sb = &so->so_rcv; 667 668 orig_resid = uio->uio_resid; 669 670 /* Prevent other readers from entering the socket. */ 671 error = sblock(sb, SBLOCKWAIT(flags)); 672 if (error) { 673 HVSOCK_DBG(HVSOCK_DBG_ERR, 674 "%s: sblock returned error = %d\n", __func__, error); 675 return (error); 676 } 677 678 SOCKBUF_LOCK(sb); 679 680 cbarg.uio = uio; 681 cbarg.sb = sb; 682 /* 683 * If the socket is closing, there might still be some data 684 * in rx br to read. However we need to make sure 685 * the channel is still open. 
686 */ 687 if ((sb->sb_state & SBS_CANTRCVMORE) && 688 (so->so_state & SS_ISDISCONNECTED)) { 689 /* Other thread already closed the channel */ 690 error = EPIPE; 691 goto out; 692 } 693 694 while (true) { 695 while (uio->uio_resid > 0 && 696 (canread = hvsock_canread_check(pcb)) > 0) { 697 to_read = MIN(canread, uio->uio_resid); 698 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 699 "%s: to_read = %u, skip = %u\n", __func__, to_read, 700 (unsigned int)(sizeof(struct hvs_pkt_header) + 701 pcb->recv_data_off)); 702 703 error = vmbus_chan_recv_peek_call(pcb->chan, to_read, 704 sizeof(struct hvs_pkt_header) + pcb->recv_data_off, 705 hvsock_br_callback, (void *)&cbarg); 706 /* 707 * It is possible socket is disconnected becasue 708 * we released lock in hvsock_br_callback. So we 709 * need to check the state to make sure it is not 710 * disconnected. 711 */ 712 if (error || so->so_state & SS_ISDISCONNECTED) { 713 break; 714 } 715 716 pcb->recv_data_len -= to_read; 717 pcb->recv_data_off += to_read; 718 } 719 720 if (error) 721 break; 722 723 /* Abort if socket has reported problems. */ 724 if (so->so_error) { 725 if (so->so_error == ESHUTDOWN && 726 orig_resid > uio->uio_resid) { 727 /* 728 * Although we got a FIN, we also received 729 * some data in this round. Delivery it 730 * to user. 731 */ 732 error = 0; 733 } else { 734 if (so->so_error != ESHUTDOWN) 735 error = so->so_error; 736 } 737 738 break; 739 } 740 741 /* Cannot received more. 
*/ 742 if (sb->sb_state & SBS_CANTRCVMORE) 743 break; 744 745 /* We are done if buffer has been filled */ 746 if (uio->uio_resid == 0) 747 break; 748 749 if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid) 750 break; 751 752 /* Buffer ring is empty and we shall not block */ 753 if ((so->so_state & SS_NBIO) || 754 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 755 if (orig_resid == uio->uio_resid) { 756 /* We have not read anything */ 757 error = EAGAIN; 758 } 759 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 760 "%s: non blocked read return, error %d.\n", 761 __func__, error); 762 break; 763 } 764 765 /* 766 * Wait and block until (more) data comes in. 767 * Note: Drops the sockbuf lock during wait. 768 */ 769 error = sbwait(sb); 770 771 if (error) 772 break; 773 774 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 775 "%s: wake up from sbwait, read available is %u\n", 776 __func__, vmbus_chan_read_available(pcb->chan)); 777 } 778 779 out: 780 SOCKBUF_UNLOCK(sb); 781 782 sbunlock(sb); 783 784 /* We recieved a FIN in this call */ 785 if (so->so_error == ESHUTDOWN) { 786 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 787 /* Send has already closed */ 788 soisdisconnecting(so); 789 } else { 790 /* Just close the receive side */ 791 socantrcvmore(so); 792 } 793 } 794 795 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 796 "%s: returning error = %d, so_error = %d\n", 797 __func__, error, so->so_error); 798 799 return (error); 800 } 801 802 int 803 hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 804 struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td) 805 { 806 struct hvs_pcb *pcb = so2hvspcb(so); 807 struct sockbuf *sb; 808 ssize_t orig_resid; 809 uint32_t canwrite, to_write; 810 int error = 0; 811 812 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 813 "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n", 814 __func__, uio->uio_resid); 815 816 if (so->so_type != SOCK_STREAM) 817 return (EINVAL); 818 if (pcb == NULL) 819 return (EINVAL); 820 821 /* If nothing to send */ 822 if 
(uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE) 823 return (EINVAL); 824 825 sb = &so->so_snd; 826 827 orig_resid = uio->uio_resid; 828 829 /* Prevent other writers from entering the socket. */ 830 error = sblock(sb, SBLOCKWAIT(flags)); 831 if (error) { 832 HVSOCK_DBG(HVSOCK_DBG_ERR, 833 "%s: sblock returned error = %d\n", __func__, error); 834 return (error); 835 } 836 837 SOCKBUF_LOCK(sb); 838 839 if ((sb->sb_state & SBS_CANTSENDMORE) || 840 so->so_error == ESHUTDOWN) { 841 error = EPIPE; 842 goto out; 843 } 844 845 while (uio->uio_resid > 0) { 846 canwrite = hvsock_canwrite_check(pcb); 847 if (canwrite == 0) { 848 /* We have sent some data */ 849 if (orig_resid > uio->uio_resid) 850 break; 851 /* 852 * We have not sent any data and it is 853 * non-blocked io 854 */ 855 if (so->so_state & SS_NBIO || 856 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 857 error = EWOULDBLOCK; 858 break; 859 } else { 860 /* 861 * We are here because there is no space on 862 * send buffer ring. Signal the other side 863 * to read and free more space. 864 * Sleep wait until space avaiable to send 865 * Note: Drops the sockbuf lock during wait. 
866 */ 867 error = sbwait(sb); 868 869 if (error) 870 break; 871 872 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 873 "%s: wake up from sbwait, space avail on " 874 "tx ring is %u\n", 875 __func__, 876 vmbus_chan_write_available(pcb->chan)); 877 878 continue; 879 } 880 } 881 to_write = MIN(canwrite, uio->uio_resid); 882 to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ); 883 884 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 885 "%s: canwrite is %u, to_write = %u\n", __func__, 886 canwrite, to_write); 887 error = hvsock_send_data(pcb->chan, uio, to_write, sb); 888 889 if (error) 890 break; 891 } 892 893 out: 894 SOCKBUF_UNLOCK(sb); 895 sbunlock(sb); 896 897 return (error); 898 } 899 900 int 901 hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam) 902 { 903 struct hvs_pcb *pcb = so2hvspcb(so); 904 905 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 906 "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__); 907 908 if (pcb == NULL) 909 return (EINVAL); 910 911 *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT); 912 913 return ((*nam == NULL)? ENOMEM : 0); 914 } 915 916 int 917 hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam) 918 { 919 struct hvs_pcb *pcb = so2hvspcb(so); 920 921 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 922 "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__); 923 924 if (pcb == NULL) 925 return (EINVAL); 926 927 *nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT); 928 929 return ((*nam == NULL)? 
ENOMEM : 0); 930 } 931 932 void 933 hvs_trans_close(struct socket *so) 934 { 935 struct hvs_pcb *pcb; 936 937 if (vm_guest != VM_GUEST_HV) 938 return; 939 940 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 941 "%s: HyperV Socket hvs_trans_close called\n", __func__); 942 943 (void) hvs_trans_lock(); 944 pcb = so2hvspcb(so); 945 if (!pcb) { 946 hvs_trans_unlock(); 947 return; 948 } 949 950 if (so->so_state & SS_ISCONNECTED) { 951 /* Send a FIN to peer */ 952 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 953 "%s: hvs_trans_close sending a FIN to host\n", __func__); 954 (void) hvsock_send_data(pcb->chan, NULL, 0, NULL); 955 } 956 957 if (so->so_state & 958 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 959 soisdisconnected(so); 960 961 pcb->chan = NULL; 962 pcb->so = NULL; 963 964 if (SOLISTENING(so)) { 965 mtx_lock(&hvs_trans_socks_mtx); 966 /* Remove from bound list */ 967 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 968 mtx_unlock(&hvs_trans_socks_mtx); 969 } 970 971 hvs_trans_unlock(); 972 973 return; 974 } 975 976 void 977 hvs_trans_abort(struct socket *so) 978 { 979 struct hvs_pcb *pcb = so2hvspcb(so); 980 981 if (vm_guest != VM_GUEST_HV) 982 return; 983 984 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 985 "%s: HyperV Socket hvs_trans_abort called\n", __func__); 986 987 (void) hvs_trans_lock(); 988 if (pcb == NULL) { 989 hvs_trans_unlock(); 990 return; 991 } 992 993 if (SOLISTENING(so)) { 994 mtx_lock(&hvs_trans_socks_mtx); 995 /* Remove from bound list */ 996 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 997 mtx_unlock(&hvs_trans_socks_mtx); 998 } 999 1000 if (so->so_state & SS_ISCONNECTED) { 1001 (void) sodisconnect(so); 1002 } 1003 hvs_trans_unlock(); 1004 1005 return; 1006 } 1007 1008 int 1009 hvs_trans_shutdown(struct socket *so) 1010 { 1011 struct hvs_pcb *pcb = so2hvspcb(so); 1012 struct sockbuf *sb; 1013 1014 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1015 "%s: HyperV Socket hvs_trans_shutdown called\n", __func__); 1016 1017 if (pcb == NULL) 1018 return (EINVAL); 1019 1020 /* 1021 * Only get 
called with the shutdown method is SHUT_WR or 1022 * SHUT_RDWR. 1023 * When the method is SHUT_RD or SHUT_RDWR, the caller 1024 * already set the SBS_CANTRCVMORE on receive side socket 1025 * buffer. 1026 */ 1027 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 1028 /* 1029 * SHUT_WR only case. 1030 * Receive side is still open. Just close 1031 * the send side. 1032 */ 1033 socantsendmore(so); 1034 } else { 1035 /* SHUT_RDWR case */ 1036 if (so->so_state & SS_ISCONNECTED) { 1037 /* Send a FIN to peer */ 1038 sb = &so->so_snd; 1039 SOCKBUF_LOCK(sb); 1040 (void) hvsock_send_data(pcb->chan, NULL, 0, sb); 1041 SOCKBUF_UNLOCK(sb); 1042 1043 soisdisconnecting(so); 1044 } 1045 } 1046 1047 return (0); 1048 } 1049 1050 /* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is 1051 * <port> (see struct sockaddr_hvs). 1052 * 1053 * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV: 1054 * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user- 1055 * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with 1056 * the below sockaddr: 1057 * 1058 * struct SOCKADDR_HV 1059 * { 1060 * ADDRESS_FAMILY Family; 1061 * USHORT Reserved; 1062 * GUID VmId; 1063 * GUID ServiceId; 1064 * }; 1065 * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via 1066 * VMBus, because here it's obvious the host and the VM can easily identify 1067 * each other. Though the VmID is useful on the host, especially in the case 1068 * of Windows container, FreeBSD VM doesn't need it at all. 1069 * 1070 * To be compatible with similar infrastructure in Linux VMs, we have 1071 * to limit the available GUID space of SOCKADDR_HV so that we can create 1072 * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID. 
 * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
 *
 ****************************************************************************
 * The only valid Service GUIDs, from the perspectives of both the host and *
 * FreeBSD VM, that can be connected by the other end, must conform to this *
 * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
 ****************************************************************************
 *
 * When we write apps on the host to connect(), the GUID ServiceID is used.
 * When we write apps in FreeBSD VM to connect(), we only need to specify the
 * port and the driver will form the GUID and use that to request the host.
 *
 * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
 * auto-generated remote port for a connect request initiated by the host's
 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
 * FreeBSD guest.
 */

/*
 * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
 * restrict HyperV socket ring buffer size to six 4K pages. Newer
 * HyperV hosts don't have this limit.
1095 */ 1096 #define HVS_RINGBUF_RCV_SIZE (PAGE_SIZE * 6) 1097 #define HVS_RINGBUF_SND_SIZE (PAGE_SIZE * 6) 1098 #define HVS_RINGBUF_MAX_SIZE (PAGE_SIZE * 64) 1099 1100 struct hvsock_sc { 1101 device_t dev; 1102 struct hvs_pcb *pcb; 1103 struct vmbus_channel *channel; 1104 }; 1105 1106 static bool 1107 hvsock_chan_readable(struct vmbus_channel *chan) 1108 { 1109 uint32_t readable = vmbus_chan_read_available(chan); 1110 1111 return (readable >= HVSOCK_PKT_LEN(0)); 1112 } 1113 1114 static void 1115 hvsock_chan_cb(struct vmbus_channel *chan, void *context) 1116 { 1117 struct hvs_pcb *pcb = (struct hvs_pcb *) context; 1118 struct socket *so; 1119 uint32_t canwrite; 1120 1121 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1122 "%s: host send us a wakeup on rb data, pcb = %p\n", 1123 __func__, pcb); 1124 1125 /* 1126 * Check if the socket is still attached and valid. 1127 * Here we know channel is still open. Need to make 1128 * sure the socket has not been closed or freed. 1129 */ 1130 (void) hvs_trans_lock(); 1131 so = hsvpcb2so(pcb); 1132 1133 if (pcb->chan != NULL && so != NULL) { 1134 /* 1135 * Wake up reader if there are data to read. 1136 */ 1137 SOCKBUF_LOCK(&(so)->so_rcv); 1138 1139 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1140 "%s: read available = %u\n", __func__, 1141 vmbus_chan_read_available(pcb->chan)); 1142 1143 if (hvsock_chan_readable(pcb->chan)) 1144 sorwakeup_locked(so); 1145 else 1146 SOCKBUF_UNLOCK(&(so)->so_rcv); 1147 1148 /* 1149 * Wake up sender if space becomes available to write. 
1150 */ 1151 SOCKBUF_LOCK(&(so)->so_snd); 1152 canwrite = hvsock_canwrite_check(pcb); 1153 1154 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1155 "%s: canwrite = %u\n", __func__, canwrite); 1156 1157 if (canwrite > 0) { 1158 sowwakeup_locked(so); 1159 } else { 1160 SOCKBUF_UNLOCK(&(so)->so_snd); 1161 } 1162 } 1163 1164 hvs_trans_unlock(); 1165 1166 return; 1167 } 1168 1169 static int 1170 hvsock_br_callback(void *datap, int cplen, void *cbarg) 1171 { 1172 struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg; 1173 struct uio *uio = arg->uio; 1174 struct sockbuf *sb = arg->sb; 1175 int error = 0; 1176 1177 if (cbarg == NULL || datap == NULL) 1178 return (EINVAL); 1179 1180 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1181 "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, " 1182 "datap = %p\n", 1183 __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br", 1184 uio->uio_resid, cplen, datap); 1185 1186 if (sb) 1187 SOCKBUF_UNLOCK(sb); 1188 1189 error = uiomove(datap, cplen, uio); 1190 1191 if (sb) 1192 SOCKBUF_LOCK(sb); 1193 1194 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1195 "%s: after uiomove, uio_resid = %zd, error = %d\n", 1196 __func__, uio->uio_resid, error); 1197 1198 return (error); 1199 } 1200 1201 static int 1202 hvsock_send_data(struct vmbus_channel *chan, struct uio *uio, 1203 uint32_t to_write, struct sockbuf *sb) 1204 { 1205 struct hvs_pkt_header hvs_pkt; 1206 int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0; 1207 uint64_t pad = 0; 1208 struct iovec iov[3]; 1209 struct hvs_callback_arg cbarg; 1210 1211 if (chan == NULL) 1212 return (ENOTCONN); 1213 1214 hlen = sizeof(struct vmbus_chanpkt_hdr); 1215 hvs_pkthlen = sizeof(struct hvs_pkt_header); 1216 hvs_pktlen = hvs_pkthlen + to_write; 1217 pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen); 1218 1219 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1220 "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, " 1221 "pad_pktlen = %u, data_len = %u\n", 1222 __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write); 1223 1224 
hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND; 1225 hvs_pkt.chan_pkt_hdr.cph_flags = 0; 1226 VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen); 1227 VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen); 1228 hvs_pkt.chan_pkt_hdr.cph_xactid = 0; 1229 1230 hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1; 1231 hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write; 1232 1233 cbarg.uio = uio; 1234 cbarg.sb = sb; 1235 1236 if (uio && to_write > 0) { 1237 iov[0].iov_base = &hvs_pkt; 1238 iov[0].iov_len = hvs_pkthlen; 1239 iov[1].iov_base = NULL; 1240 iov[1].iov_len = to_write; 1241 iov[2].iov_base = &pad; 1242 iov[2].iov_len = pad_pktlen - hvs_pktlen; 1243 1244 error = vmbus_chan_iov_send(chan, iov, 3, 1245 hvsock_br_callback, &cbarg); 1246 } else { 1247 if (to_write == 0) { 1248 iov[0].iov_base = &hvs_pkt; 1249 iov[0].iov_len = hvs_pkthlen; 1250 iov[1].iov_base = &pad; 1251 iov[1].iov_len = pad_pktlen - hvs_pktlen; 1252 error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL); 1253 } 1254 } 1255 1256 if (error) { 1257 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1258 "%s: error = %d\n", __func__, error); 1259 } 1260 1261 return (error); 1262 } 1263 1264 /* 1265 * Check if we have data on current ring buffer to read 1266 * or not. If not, advance the ring buffer read index to 1267 * next packet. Update the recev_data_len and recev_data_off 1268 * to new value. 1269 * Return the number of bytes can read. 
1270 */ 1271 static uint32_t 1272 hvsock_canread_check(struct hvs_pcb *pcb) 1273 { 1274 uint32_t advance; 1275 uint32_t tlen, hlen, dlen; 1276 uint32_t bytes_canread = 0; 1277 int error; 1278 1279 if (pcb == NULL || pcb->chan == NULL) { 1280 pcb->so->so_error = EIO; 1281 return (0); 1282 } 1283 1284 /* Still have data not read yet on current packet */ 1285 if (pcb->recv_data_len > 0) 1286 return (pcb->recv_data_len); 1287 1288 if (pcb->rb_init) 1289 advance = 1290 VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); 1291 else 1292 advance = 0; 1293 1294 bytes_canread = vmbus_chan_read_available(pcb->chan); 1295 1296 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1297 "%s: bytes_canread on br = %u, advance = %u\n", 1298 __func__, bytes_canread, advance); 1299 1300 if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) { 1301 /* 1302 * Nothing to read. Need to advance the rindex before 1303 * calling sbwait, so host knows to wake us up when data 1304 * is available to read on rb. 1305 */ 1306 error = vmbus_chan_recv_idxadv(pcb->chan, advance); 1307 if (error) { 1308 HVSOCK_DBG(HVSOCK_DBG_ERR, 1309 "%s: after calling vmbus_chan_recv_idxadv, " 1310 "got error = %d\n", __func__, error); 1311 return (0); 1312 } else { 1313 pcb->rb_init = false; 1314 pcb->recv_data_len = 0; 1315 pcb->recv_data_off = 0; 1316 bytes_canread = vmbus_chan_read_available(pcb->chan); 1317 1318 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1319 "%s: advanced %u bytes, " 1320 " bytes_canread on br now = %u\n", 1321 __func__, advance, bytes_canread); 1322 1323 if (bytes_canread == 0) 1324 return (0); 1325 else 1326 advance = 0; 1327 } 1328 } 1329 1330 if (bytes_canread < 1331 advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t))) 1332 return (0); 1333 1334 error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt, 1335 sizeof(struct hvs_pkt_header), advance); 1336 1337 /* Don't have anything to read */ 1338 if (error) { 1339 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1340 "%s: after calling vmbus_chan_recv_peek, got 
error = %d\n", 1341 __func__, error); 1342 return (0); 1343 } 1344 1345 /* 1346 * We just read in a new packet header. Do some sanity checks. 1347 */ 1348 tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); 1349 hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen); 1350 dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size; 1351 if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) || 1352 __predict_false(hlen > tlen) || 1353 __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) { 1354 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1355 "invalid tlen(%u), hlen(%u) or dlen(%u)\n", 1356 tlen, hlen, dlen); 1357 pcb->so->so_error = EIO; 1358 return (0); 1359 } 1360 if (pcb->rb_init == false) 1361 pcb->rb_init = true; 1362 1363 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1364 "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n", 1365 tlen, hlen, dlen); 1366 1367 /* The other side has sent a close FIN */ 1368 if (dlen == 0) { 1369 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1370 "%s: Received FIN from other side\n", __func__); 1371 /* inform the caller by seting so_error to ESHUTDOWN */ 1372 pcb->so->so_error = ESHUTDOWN; 1373 } 1374 1375 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1376 "%s: canread on receive ring is %u \n", __func__, dlen); 1377 1378 pcb->recv_data_len = dlen; 1379 pcb->recv_data_off = 0; 1380 1381 return (pcb->recv_data_len); 1382 } 1383 1384 static uint32_t 1385 hvsock_canwrite_check(struct hvs_pcb *pcb) 1386 { 1387 uint32_t writeable; 1388 uint32_t ret; 1389 1390 if (pcb == NULL || pcb->chan == NULL) 1391 return (0); 1392 1393 writeable = vmbus_chan_write_available(pcb->chan); 1394 1395 /* 1396 * We must always reserve a 0-length-payload packet for the FIN. 1397 */ 1398 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1399 "%s: writeable is %u, should be greater than %ju\n", 1400 __func__, writeable, 1401 (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0))); 1402 1403 if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) { 1404 /* 1405 * The Tx ring seems full. 
1406 */ 1407 return (0); 1408 } 1409 1410 ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0); 1411 1412 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1413 "%s: available size is %u\n", __func__, rounddown2(ret, 8)); 1414 1415 return (rounddown2(ret, 8)); 1416 } 1417 1418 static void 1419 hvsock_set_chan_pending_send_size(struct vmbus_channel *chan) 1420 { 1421 vmbus_chan_set_pending_send_size(chan, 1422 HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ)); 1423 } 1424 1425 static int 1426 hvsock_open_channel(struct vmbus_channel *chan, struct socket *so) 1427 { 1428 unsigned int rcvbuf, sndbuf; 1429 struct hvs_pcb *pcb = so2hvspcb(so); 1430 int ret; 1431 1432 if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) { 1433 sndbuf = HVS_RINGBUF_SND_SIZE; 1434 rcvbuf = HVS_RINGBUF_RCV_SIZE; 1435 } else { 1436 sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE); 1437 sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE); 1438 sndbuf = rounddown2(sndbuf, PAGE_SIZE); 1439 rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE); 1440 rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE); 1441 rcvbuf = rounddown2(rcvbuf, PAGE_SIZE); 1442 } 1443 1444 /* 1445 * Can only read whatever user provided size of data 1446 * from ring buffer. Turn off batched reading. 1447 */ 1448 vmbus_chan_set_readbatch(chan, false); 1449 1450 ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0, 1451 hvsock_chan_cb, pcb); 1452 1453 if (ret != 0) { 1454 HVSOCK_DBG(HVSOCK_DBG_ERR, 1455 "%s: failed to open hvsock channel, sndbuf = %u, " 1456 "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); 1457 } else { 1458 HVSOCK_DBG(HVSOCK_DBG_INFO, 1459 "%s: hvsock channel opened, sndbuf = %u, i" 1460 "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); 1461 /* 1462 * Se the pending send size so to receive wakeup 1463 * signals from host when there is enough space on 1464 * rx buffer ring to write. 1465 */ 1466 hvsock_set_chan_pending_send_size(chan); 1467 } 1468 1469 return ret; 1470 } 1471 1472 /* 1473 * Guest is listening passively on the socket. 
Open channel and 1474 * create a new socket for the conneciton. 1475 */ 1476 static void 1477 hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so, 1478 struct hvsock_sc *sc) 1479 { 1480 struct socket *new_so; 1481 struct hvs_pcb *new_pcb, *pcb; 1482 int error; 1483 1484 /* Do nothing if socket is not listening */ 1485 if ((so->so_options & SO_ACCEPTCONN) == 0) { 1486 HVSOCK_DBG(HVSOCK_DBG_ERR, 1487 "%s: socket is not a listening one\n", __func__); 1488 return; 1489 } 1490 1491 /* 1492 * Create a new socket. This will call pru_attach to complete 1493 * the socket initialization and put the new socket onto 1494 * listening socket's sol_incomp list, waiting to be promoted 1495 * to sol_comp list. 1496 * The new socket created has ref count 0. There is no other 1497 * thread that changes the state of this new one at the 1498 * moment, so we don't need to hold its lock while opening 1499 * channel and filling out its pcb information. 1500 */ 1501 new_so = sonewconn(so, 0); 1502 if (!new_so) 1503 HVSOCK_DBG(HVSOCK_DBG_ERR, 1504 "%s: creating new socket failed\n", __func__); 1505 1506 /* 1507 * Now open the vmbus channel. If it fails, the socket will be 1508 * on the listening socket's sol_incomp queue until it is 1509 * replaced and aborted. 
1510 */ 1511 error = hvsock_open_channel(chan, new_so); 1512 if (error) { 1513 new_so->so_error = error; 1514 return; 1515 } 1516 1517 pcb = so->so_pcb; 1518 new_pcb = new_so->so_pcb; 1519 1520 hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port); 1521 /* Remote port is unknown to guest in this type of conneciton */ 1522 hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN); 1523 new_pcb->chan = chan; 1524 new_pcb->recv_data_len = 0; 1525 new_pcb->recv_data_off = 0; 1526 new_pcb->rb_init = false; 1527 1528 new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan); 1529 new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan); 1530 1531 hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED); 1532 1533 sc->pcb = new_pcb; 1534 1535 /* 1536 * Change the socket state to SS_ISCONNECTED. This will promote 1537 * the socket to sol_comp queue and wake up the thread which 1538 * is accepting connection. 1539 */ 1540 soisconnected(new_so); 1541 } 1542 1543 1544 /* 1545 * Guest is actively connecting to host. 1546 */ 1547 static void 1548 hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so) 1549 { 1550 struct hvs_pcb *pcb; 1551 int error; 1552 1553 error = hvsock_open_channel(chan, so); 1554 if (error) { 1555 so->so_error = error; 1556 return; 1557 } 1558 1559 pcb = so->so_pcb; 1560 pcb->chan = chan; 1561 pcb->recv_data_len = 0; 1562 pcb->recv_data_off = 0; 1563 pcb->rb_init = false; 1564 1565 mtx_lock(&hvs_trans_socks_mtx); 1566 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 1567 __hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED); 1568 mtx_unlock(&hvs_trans_socks_mtx); 1569 1570 /* 1571 * Change the socket state to SS_ISCONNECTED. This will wake up 1572 * the thread sleeping in connect call. 
1573 */ 1574 soisconnected(so); 1575 } 1576 1577 static void 1578 hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc) 1579 { 1580 struct hyperv_guid *inst_guid, *type_guid; 1581 bool conn_from_host; 1582 struct sockaddr_hvs addr; 1583 struct socket *so; 1584 struct hvs_pcb *pcb; 1585 1586 type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan); 1587 inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan); 1588 conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan); 1589 1590 HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is "); 1591 hvsock_print_guid(type_guid); 1592 HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is "); 1593 hvsock_print_guid(inst_guid); 1594 HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n", 1595 (conn_from_host == true ) ? "from" : "to"); 1596 1597 /* 1598 * The listening port should be in [0, MAX_LISTEN_PORT] 1599 */ 1600 if (!is_valid_srv_id(type_guid)) 1601 return; 1602 1603 /* 1604 * There should be a bound socket already created no matter 1605 * it is a passive or active connection. 1606 * For host initiated connection (passive on guest side), 1607 * the type_guid contains the port which guest is bound and 1608 * listening. 1609 * For the guest initiated connection (active on guest side), 1610 * the inst_guid contains the port that guest has auto bound 1611 * to. 1612 */ 1613 hvs_addr_init(&addr, conn_from_host ? 
type_guid : inst_guid); 1614 so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND); 1615 if (!so) { 1616 HVSOCK_DBG(HVSOCK_DBG_ERR, 1617 "%s: no bound socket found for port %u\n", 1618 __func__, addr.hvs_port); 1619 return; 1620 } 1621 1622 if (conn_from_host) { 1623 hvsock_open_conn_passive(chan, so, sc); 1624 } else { 1625 (void) hvs_trans_lock(); 1626 pcb = so->so_pcb; 1627 if (pcb && pcb->so) { 1628 sc->pcb = so2hvspcb(so); 1629 hvsock_open_conn_active(chan, so); 1630 } else { 1631 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1632 "%s: channel detached before open\n", __func__); 1633 } 1634 hvs_trans_unlock(); 1635 } 1636 1637 } 1638 1639 static int 1640 hvsock_probe(device_t dev) 1641 { 1642 struct vmbus_channel *channel = vmbus_get_channel(dev); 1643 1644 if (!channel || !vmbus_chan_is_hvs(channel)) { 1645 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1646 "hvsock_probe called but not a hvsock channel id %u\n", 1647 vmbus_chan_id(channel)); 1648 1649 return ENXIO; 1650 } else { 1651 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1652 "hvsock_probe got a hvsock channel id %u\n", 1653 vmbus_chan_id(channel)); 1654 1655 return BUS_PROBE_DEFAULT; 1656 } 1657 } 1658 1659 static int 1660 hvsock_attach(device_t dev) 1661 { 1662 struct vmbus_channel *channel = vmbus_get_channel(dev); 1663 struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); 1664 1665 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n"); 1666 1667 hvsock_open_connection(channel, sc); 1668 1669 /* 1670 * Always return success. On error the host will rescind the device 1671 * in 30 seconds and we can do cleanup at that time in 1672 * vmbus_chan_msgproc_chrescind(). 
1673 */ 1674 return (0); 1675 } 1676 1677 static int 1678 hvsock_detach(device_t dev) 1679 { 1680 struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); 1681 struct socket *so; 1682 int error, retry; 1683 1684 if (bootverbose) 1685 device_printf(dev, "hvsock_detach called.\n"); 1686 1687 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n"); 1688 1689 if (sc->pcb != NULL) { 1690 (void) hvs_trans_lock(); 1691 1692 so = hsvpcb2so(sc->pcb); 1693 if (so) { 1694 /* Close the connection */ 1695 if (so->so_state & 1696 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 1697 soisdisconnected(so); 1698 } 1699 1700 mtx_lock(&hvs_trans_socks_mtx); 1701 __hvs_remove_pcb_from_list(sc->pcb, 1702 HVS_LIST_BOUND | HVS_LIST_CONNECTED); 1703 mtx_unlock(&hvs_trans_socks_mtx); 1704 1705 /* 1706 * Close channel while no reader and sender are working 1707 * on the buffer rings. 1708 */ 1709 if (so) { 1710 retry = 0; 1711 while ((error = sblock(&so->so_rcv, 0)) == 1712 EWOULDBLOCK) { 1713 /* 1714 * Someone is reading, rx br is busy 1715 */ 1716 soisdisconnected(so); 1717 DELAY(500); 1718 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1719 "waiting for rx reader to exit, " 1720 "retry = %d\n", retry++); 1721 } 1722 retry = 0; 1723 while ((error = sblock(&so->so_snd, 0)) == 1724 EWOULDBLOCK) { 1725 /* 1726 * Someone is sending, tx br is busy 1727 */ 1728 soisdisconnected(so); 1729 DELAY(500); 1730 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1731 "waiting for tx sender to exit, " 1732 "retry = %d\n", retry++); 1733 } 1734 } 1735 1736 1737 bzero(sc->pcb, sizeof(struct hvs_pcb)); 1738 free(sc->pcb, M_HVSOCK); 1739 sc->pcb = NULL; 1740 1741 if (so) { 1742 sbunlock(&so->so_rcv); 1743 sbunlock(&so->so_snd); 1744 so->so_pcb = NULL; 1745 } 1746 1747 hvs_trans_unlock(); 1748 } 1749 1750 vmbus_chan_close(vmbus_get_channel(dev)); 1751 1752 return (0); 1753 } 1754 1755 static device_method_t hvsock_methods[] = { 1756 /* Device interface */ 1757 DEVMETHOD(device_probe, hvsock_probe), 1758 DEVMETHOD(device_attach, 
hvsock_attach), 1759 DEVMETHOD(device_detach, hvsock_detach), 1760 DEVMETHOD_END 1761 }; 1762 1763 static driver_t hvsock_driver = { 1764 "hv_sock", 1765 hvsock_methods, 1766 sizeof(struct hvsock_sc) 1767 }; 1768 1769 static devclass_t hvsock_devclass; 1770 1771 DRIVER_MODULE(hvsock, vmbus, hvsock_driver, hvsock_devclass, NULL, NULL); 1772 MODULE_VERSION(hvsock, 1); 1773 MODULE_DEPEND(hvsock, vmbus, 1, 1, 1); 1774