1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2020 Microsoft Corp. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/bus.h> 34 #include <sys/domain.h> 35 #include <sys/lock.h> 36 #include <sys/kernel.h> 37 #include <sys/types.h> 38 #include <sys/malloc.h> 39 #include <sys/module.h> 40 #include <sys/mutex.h> 41 #include <sys/proc.h> 42 #include <sys/protosw.h> 43 #include <sys/socket.h> 44 #include <sys/sysctl.h> 45 #include <sys/sysproto.h> 46 #include <sys/systm.h> 47 #include <sys/sockbuf.h> 48 #include <sys/sx.h> 49 #include <sys/uio.h> 50 51 #include <net/vnet.h> 52 53 #include <dev/hyperv/vmbus/vmbus_reg.h> 54 55 #include "hv_sock.h" 56 57 #define HVSOCK_DBG_NONE 0x0 58 #define HVSOCK_DBG_INFO 0x1 59 #define HVSOCK_DBG_ERR 0x2 60 #define HVSOCK_DBG_VERBOSE 0x3 61 62 63 SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket"); 64 65 static int hvs_dbg_level; 66 SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level, 67 0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose"); 68 69 70 #define HVSOCK_DBG(level, ...) 
do { \ 71 if (hvs_dbg_level >= (level)) \ 72 printf(__VA_ARGS__); \ 73 } while (0) 74 75 MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures"); 76 77 static int hvs_dom_probe(void); 78 79 /* The MTU is 16KB per host side's design */ 80 #define HVSOCK_MTU_SIZE (1024 * 16) 81 #define HVSOCK_SEND_BUF_SZ (PAGE_SIZE - sizeof(struct vmpipe_proto_header)) 82 83 #define HVSOCK_HEADER_LEN (sizeof(struct hvs_pkt_header)) 84 85 #define HVSOCK_PKT_LEN(payload_len) (HVSOCK_HEADER_LEN + \ 86 roundup2(payload_len, 8) + \ 87 sizeof(uint64_t)) 88 89 90 static struct domain hv_socket_domain; 91 92 /* 93 * HyperV Transport sockets 94 */ 95 static struct pr_usrreqs hvs_trans_usrreqs = { 96 .pru_attach = hvs_trans_attach, 97 .pru_bind = hvs_trans_bind, 98 .pru_listen = hvs_trans_listen, 99 .pru_accept = hvs_trans_accept, 100 .pru_connect = hvs_trans_connect, 101 .pru_peeraddr = hvs_trans_peeraddr, 102 .pru_sockaddr = hvs_trans_sockaddr, 103 .pru_soreceive = hvs_trans_soreceive, 104 .pru_sosend = hvs_trans_sosend, 105 .pru_disconnect = hvs_trans_disconnect, 106 .pru_close = hvs_trans_close, 107 .pru_detach = hvs_trans_detach, 108 .pru_shutdown = hvs_trans_shutdown, 109 .pru_abort = hvs_trans_abort, 110 }; 111 112 /* 113 * Definitions of protocols supported in HyperV socket domain 114 */ 115 static struct protosw hv_socket_protosw[] = { 116 { 117 .pr_type = SOCK_STREAM, 118 .pr_domain = &hv_socket_domain, 119 .pr_protocol = HYPERV_SOCK_PROTO_TRANS, 120 .pr_flags = PR_CONNREQUIRED, 121 .pr_usrreqs = &hvs_trans_usrreqs, 122 }, 123 }; 124 125 static struct domain hv_socket_domain = { 126 .dom_family = AF_HYPERV, 127 .dom_name = "hyperv", 128 .dom_probe = hvs_dom_probe, 129 .dom_protosw = hv_socket_protosw, 130 .dom_protoswNPROTOSW = &hv_socket_protosw[nitems(hv_socket_protosw)] 131 }; 132 133 DOMAIN_SET(hv_socket_); 134 135 #define MAX_PORT ((uint32_t)0xFFFFFFFF) 136 #define MIN_PORT ((uint32_t)0x0) 137 138 /* 00000000-facb-11e6-bd58-64006a7986d3 */ 139 static const 
struct hyperv_guid srv_id_template = { 140 .hv_guid = { 141 0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11, 142 0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 } 143 }; 144 145 static int hvsock_br_callback(void *, int, void *); 146 static uint32_t hvsock_canread_check(struct hvs_pcb *); 147 static uint32_t hvsock_canwrite_check(struct hvs_pcb *); 148 static int hvsock_send_data(struct vmbus_channel *chan, 149 struct uio *uio, uint32_t to_write, struct sockbuf *sb); 150 151 152 153 /* Globals */ 154 static struct sx hvs_trans_socks_sx; 155 static struct mtx hvs_trans_socks_mtx; 156 static LIST_HEAD(, hvs_pcb) hvs_trans_bound_socks; 157 static LIST_HEAD(, hvs_pcb) hvs_trans_connected_socks; 158 static uint32_t previous_auto_bound_port; 159 160 static void 161 hvsock_print_guid(struct hyperv_guid *guid) 162 { 163 unsigned char *p = (unsigned char *)guid; 164 165 HVSOCK_DBG(HVSOCK_DBG_INFO, 166 "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n", 167 *(unsigned int *)p, 168 *((unsigned short *) &p[4]), 169 *((unsigned short *) &p[6]), 170 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 171 } 172 173 static bool 174 is_valid_srv_id(const struct hyperv_guid *id) 175 { 176 return !memcmp(&id->hv_guid[4], 177 &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4); 178 } 179 180 static unsigned int 181 get_port_by_srv_id(const struct hyperv_guid *srv_id) 182 { 183 return *((const unsigned int *)srv_id); 184 } 185 186 static void 187 set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port) 188 { 189 *((unsigned int *)srv_id) = port; 190 } 191 192 193 static void 194 __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list) 195 { 196 struct hvs_pcb *p = NULL; 197 198 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); 199 200 if (!pcb) 201 return; 202 203 if (list & HVS_LIST_BOUND) { 204 LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) 205 if (p == pcb) 206 LIST_REMOVE(p, bound_next); 207 } 208 209 if (list & 
HVS_LIST_CONNECTED) { 210 LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) 211 if (p == pcb) 212 LIST_REMOVE(pcb, connected_next); 213 } 214 } 215 216 static void 217 __hvs_remove_socket_from_list(struct socket *so, unsigned char list) 218 { 219 struct hvs_pcb *pcb = so2hvspcb(so); 220 221 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); 222 223 __hvs_remove_pcb_from_list(pcb, list); 224 } 225 226 static void 227 __hvs_insert_socket_on_list(struct socket *so, unsigned char list) 228 { 229 struct hvs_pcb *pcb = so2hvspcb(so); 230 231 if (list & HVS_LIST_BOUND) 232 LIST_INSERT_HEAD(&hvs_trans_bound_socks, 233 pcb, bound_next); 234 235 if (list & HVS_LIST_CONNECTED) 236 LIST_INSERT_HEAD(&hvs_trans_connected_socks, 237 pcb, connected_next); 238 } 239 240 void 241 hvs_remove_socket_from_list(struct socket *so, unsigned char list) 242 { 243 if (!so || !so->so_pcb) { 244 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 245 "%s: socket or so_pcb is null\n", __func__); 246 return; 247 } 248 249 mtx_lock(&hvs_trans_socks_mtx); 250 __hvs_remove_socket_from_list(so, list); 251 mtx_unlock(&hvs_trans_socks_mtx); 252 } 253 254 static void 255 hvs_insert_socket_on_list(struct socket *so, unsigned char list) 256 { 257 if (!so || !so->so_pcb) { 258 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 259 "%s: socket or so_pcb is null\n", __func__); 260 return; 261 } 262 263 mtx_lock(&hvs_trans_socks_mtx); 264 __hvs_insert_socket_on_list(so, list); 265 mtx_unlock(&hvs_trans_socks_mtx); 266 } 267 268 static struct socket * 269 __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) 270 { 271 struct hvs_pcb *p = NULL; 272 273 if (list & HVS_LIST_BOUND) 274 LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) 275 if (p->so != NULL && 276 addr->hvs_port == p->local_addr.hvs_port) 277 return p->so; 278 279 if (list & HVS_LIST_CONNECTED) 280 LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) 281 if (p->so != NULL && 282 addr->hvs_port == p->local_addr.hvs_port) 283 return 
p->so; 284 285 return NULL; 286 } 287 288 static struct socket * 289 hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) 290 { 291 struct socket *s = NULL; 292 293 mtx_lock(&hvs_trans_socks_mtx); 294 s = __hvs_find_socket_on_list(addr, list); 295 mtx_unlock(&hvs_trans_socks_mtx); 296 297 return s; 298 } 299 300 static inline void 301 hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port) 302 { 303 memset(addr, 0, sizeof(*addr)); 304 addr->sa_family = AF_HYPERV; 305 addr->sa_len = sizeof(*addr); 306 addr->hvs_port = port; 307 } 308 309 void 310 hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id) 311 { 312 hvs_addr_set(addr, get_port_by_srv_id(svr_id)); 313 } 314 315 int 316 hvs_trans_lock(void) 317 { 318 sx_xlock(&hvs_trans_socks_sx); 319 return (0); 320 } 321 322 void 323 hvs_trans_unlock(void) 324 { 325 sx_xunlock(&hvs_trans_socks_sx); 326 } 327 328 static int 329 hvs_dom_probe(void) 330 { 331 332 /* Don't even give us a chance to attach on non-HyperV. */ 333 if (vm_guest != VM_GUEST_HV) 334 return (ENXIO); 335 return (0); 336 } 337 338 static void 339 hvs_trans_init(void *arg __unused) 340 { 341 342 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 343 "%s: HyperV Socket hvs_trans_init called\n", __func__); 344 345 /* Initialize Globals */ 346 previous_auto_bound_port = MAX_PORT; 347 sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx"); 348 mtx_init(&hvs_trans_socks_mtx, 349 "hvs_trans_socks_mtx", NULL, MTX_DEF); 350 LIST_INIT(&hvs_trans_bound_socks); 351 LIST_INIT(&hvs_trans_connected_socks); 352 } 353 SYSINIT(hvs_trans_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, 354 hvs_trans_init, NULL); 355 356 /* 357 * Called in two cases: 358 * 1) When user calls socket(); 359 * 2) When we accept new incoming conneciton and call sonewconn(). 
360 */ 361 int 362 hvs_trans_attach(struct socket *so, int proto, struct thread *td) 363 { 364 struct hvs_pcb *pcb = so2hvspcb(so); 365 366 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 367 "%s: HyperV Socket hvs_trans_attach called\n", __func__); 368 369 if (so->so_type != SOCK_STREAM) 370 return (ESOCKTNOSUPPORT); 371 372 if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS) 373 return (EPROTONOSUPPORT); 374 375 if (pcb != NULL) 376 return (EISCONN); 377 pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO); 378 if (pcb == NULL) 379 return (ENOMEM); 380 381 pcb->so = so; 382 so->so_pcb = (void *)pcb; 383 384 return (0); 385 } 386 387 void 388 hvs_trans_detach(struct socket *so) 389 { 390 struct hvs_pcb *pcb; 391 392 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 393 "%s: HyperV Socket hvs_trans_detach called\n", __func__); 394 395 (void) hvs_trans_lock(); 396 pcb = so2hvspcb(so); 397 if (pcb == NULL) { 398 hvs_trans_unlock(); 399 return; 400 } 401 402 if (SOLISTENING(so)) { 403 bzero(pcb, sizeof(*pcb)); 404 free(pcb, M_HVSOCK); 405 } 406 407 so->so_pcb = NULL; 408 409 hvs_trans_unlock(); 410 } 411 412 int 413 hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td) 414 { 415 struct hvs_pcb *pcb = so2hvspcb(so); 416 struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr; 417 int error = 0; 418 419 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 420 "%s: HyperV Socket hvs_trans_bind called\n", __func__); 421 422 if (sa == NULL) { 423 return (EINVAL); 424 } 425 426 if (pcb == NULL) { 427 return (EINVAL); 428 } 429 430 if (sa->sa_family != AF_HYPERV) { 431 HVSOCK_DBG(HVSOCK_DBG_ERR, 432 "%s: Not supported, sa_family is %u\n", 433 __func__, sa->sa_family); 434 return (EAFNOSUPPORT); 435 } 436 if (sa->sa_len != sizeof(*sa)) { 437 HVSOCK_DBG(HVSOCK_DBG_ERR, 438 "%s: Not supported, sa_len is %u\n", 439 __func__, sa->sa_len); 440 return (EINVAL); 441 } 442 443 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 444 "%s: binding port = 0x%x\n", __func__, sa->hvs_port); 445 446 mtx_lock(&hvs_trans_socks_mtx); 447 
if (__hvs_find_socket_on_list(sa, 448 HVS_LIST_BOUND | HVS_LIST_CONNECTED)) { 449 error = EADDRINUSE; 450 } else { 451 /* 452 * The address is available for us to bind. 453 * Add socket to the bound list. 454 */ 455 hvs_addr_set(&pcb->local_addr, sa->hvs_port); 456 hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY); 457 __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); 458 } 459 mtx_unlock(&hvs_trans_socks_mtx); 460 461 return (error); 462 } 463 464 int 465 hvs_trans_listen(struct socket *so, int backlog, struct thread *td) 466 { 467 struct hvs_pcb *pcb = so2hvspcb(so); 468 struct socket *bound_so; 469 int error; 470 471 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 472 "%s: HyperV Socket hvs_trans_listen called\n", __func__); 473 474 if (pcb == NULL) 475 return (EINVAL); 476 477 /* Check if the address is already bound and it was by us. */ 478 bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND); 479 if (bound_so == NULL || bound_so != so) { 480 HVSOCK_DBG(HVSOCK_DBG_ERR, 481 "%s: Address not bound or not by us.\n", __func__); 482 return (EADDRNOTAVAIL); 483 } 484 485 SOCK_LOCK(so); 486 error = solisten_proto_check(so); 487 if (error == 0) 488 solisten_proto(so, backlog); 489 SOCK_UNLOCK(so); 490 491 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 492 "%s: HyperV Socket listen error = %d\n", __func__, error); 493 return (error); 494 } 495 496 int 497 hvs_trans_accept(struct socket *so, struct sockaddr **nam) 498 { 499 struct hvs_pcb *pcb = so2hvspcb(so); 500 501 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 502 "%s: HyperV Socket hvs_trans_accept called\n", __func__); 503 504 if (pcb == NULL) 505 return (EINVAL); 506 507 *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, 508 M_NOWAIT); 509 510 return ((*nam == NULL) ? 
ENOMEM : 0); 511 } 512 513 int 514 hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 515 { 516 struct hvs_pcb *pcb = so2hvspcb(so); 517 struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam; 518 bool found_auto_bound_port = false; 519 int i, error = 0; 520 521 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 522 "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n", 523 __func__, raddr->hvs_port); 524 525 if (pcb == NULL) 526 return (EINVAL); 527 528 /* Verify the remote address */ 529 if (raddr == NULL) 530 return (EINVAL); 531 if (raddr->sa_family != AF_HYPERV) 532 return (EAFNOSUPPORT); 533 if (raddr->sa_len != sizeof(*raddr)) 534 return (EINVAL); 535 536 mtx_lock(&hvs_trans_socks_mtx); 537 if (so->so_state & 538 (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) { 539 HVSOCK_DBG(HVSOCK_DBG_ERR, 540 "%s: socket connect in progress\n", 541 __func__); 542 error = EINPROGRESS; 543 goto out; 544 } 545 546 /* 547 * Find an available port for us to auto bind the local 548 * address. 
549 */ 550 hvs_addr_set(&pcb->local_addr, 0); 551 552 for (i = previous_auto_bound_port - 1; 553 i != previous_auto_bound_port; i --) { 554 if (i == MIN_PORT) 555 i = MAX_PORT; 556 557 pcb->local_addr.hvs_port = i; 558 559 if (__hvs_find_socket_on_list(&pcb->local_addr, 560 HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) { 561 found_auto_bound_port = true; 562 previous_auto_bound_port = i; 563 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 564 "%s: found local bound port is %x\n", 565 __func__, pcb->local_addr.hvs_port); 566 break; 567 } 568 } 569 570 if (found_auto_bound_port == true) { 571 /* Found available port for auto bound, put on list */ 572 __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); 573 /* Set VM service ID */ 574 pcb->vm_srv_id = srv_id_template; 575 set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port); 576 /* Set host service ID and remote port */ 577 pcb->host_srv_id = srv_id_template; 578 set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port); 579 hvs_addr_set(&pcb->remote_addr, raddr->hvs_port); 580 581 /* Change the socket state to SS_ISCONNECTING */ 582 soisconnecting(so); 583 } else { 584 HVSOCK_DBG(HVSOCK_DBG_ERR, 585 "%s: No local port available for auto bound\n", 586 __func__); 587 error = EADDRINUSE; 588 } 589 590 HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is "); 591 hvsock_print_guid(&pcb->vm_srv_id); 592 HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is "); 593 hvsock_print_guid(&pcb->host_srv_id); 594 595 out: 596 mtx_unlock(&hvs_trans_socks_mtx); 597 598 if (found_auto_bound_port == true) 599 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id); 600 601 return (error); 602 } 603 604 int 605 hvs_trans_disconnect(struct socket *so) 606 { 607 struct hvs_pcb *pcb; 608 609 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 610 "%s: HyperV Socket hvs_trans_disconnect called\n", __func__); 611 612 (void) hvs_trans_lock(); 613 pcb = so2hvspcb(so); 614 if (pcb == NULL) { 615 hvs_trans_unlock(); 616 return (EINVAL); 617 } 618 619 /* If socket is already 
disconnected, skip this */ 620 if ((so->so_state & SS_ISDISCONNECTED) == 0) 621 soisdisconnecting(so); 622 623 hvs_trans_unlock(); 624 625 return (0); 626 } 627 628 struct hvs_callback_arg { 629 struct uio *uio; 630 struct sockbuf *sb; 631 }; 632 633 int 634 hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr, 635 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 636 { 637 struct hvs_pcb *pcb = so2hvspcb(so); 638 struct sockbuf *sb; 639 ssize_t orig_resid; 640 uint32_t canread, to_read; 641 int flags, error = 0; 642 struct hvs_callback_arg cbarg; 643 644 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 645 "%s: HyperV Socket hvs_trans_soreceive called\n", __func__); 646 647 if (so->so_type != SOCK_STREAM) 648 return (EINVAL); 649 if (pcb == NULL) 650 return (EINVAL); 651 652 if (flagsp != NULL) 653 flags = *flagsp &~ MSG_EOR; 654 else 655 flags = 0; 656 657 if (flags & MSG_PEEK) 658 return (EOPNOTSUPP); 659 660 /* If no space to copy out anything */ 661 if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ) 662 return (EINVAL); 663 664 orig_resid = uio->uio_resid; 665 666 /* Prevent other readers from entering the socket. */ 667 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 668 if (error) { 669 HVSOCK_DBG(HVSOCK_DBG_ERR, 670 "%s: soiolock returned error = %d\n", __func__, error); 671 return (error); 672 } 673 674 sb = &so->so_rcv; 675 SOCKBUF_LOCK(sb); 676 677 cbarg.uio = uio; 678 cbarg.sb = sb; 679 /* 680 * If the socket is closing, there might still be some data 681 * in rx br to read. However we need to make sure 682 * the channel is still open. 
683 */ 684 if ((sb->sb_state & SBS_CANTRCVMORE) && 685 (so->so_state & SS_ISDISCONNECTED)) { 686 /* Other thread already closed the channel */ 687 error = EPIPE; 688 goto out; 689 } 690 691 while (true) { 692 while (uio->uio_resid > 0 && 693 (canread = hvsock_canread_check(pcb)) > 0) { 694 to_read = MIN(canread, uio->uio_resid); 695 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 696 "%s: to_read = %u, skip = %u\n", __func__, to_read, 697 (unsigned int)(sizeof(struct hvs_pkt_header) + 698 pcb->recv_data_off)); 699 700 error = vmbus_chan_recv_peek_call(pcb->chan, to_read, 701 sizeof(struct hvs_pkt_header) + pcb->recv_data_off, 702 hvsock_br_callback, (void *)&cbarg); 703 /* 704 * It is possible socket is disconnected becasue 705 * we released lock in hvsock_br_callback. So we 706 * need to check the state to make sure it is not 707 * disconnected. 708 */ 709 if (error || so->so_state & SS_ISDISCONNECTED) { 710 break; 711 } 712 713 pcb->recv_data_len -= to_read; 714 pcb->recv_data_off += to_read; 715 } 716 717 if (error) 718 break; 719 720 /* Abort if socket has reported problems. */ 721 if (so->so_error) { 722 if (so->so_error == ESHUTDOWN && 723 orig_resid > uio->uio_resid) { 724 /* 725 * Although we got a FIN, we also received 726 * some data in this round. Delivery it 727 * to user. 728 */ 729 error = 0; 730 } else { 731 if (so->so_error != ESHUTDOWN) 732 error = so->so_error; 733 } 734 735 break; 736 } 737 738 /* Cannot received more. 
*/ 739 if (sb->sb_state & SBS_CANTRCVMORE) 740 break; 741 742 /* We are done if buffer has been filled */ 743 if (uio->uio_resid == 0) 744 break; 745 746 if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid) 747 break; 748 749 /* Buffer ring is empty and we shall not block */ 750 if ((so->so_state & SS_NBIO) || 751 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 752 if (orig_resid == uio->uio_resid) { 753 /* We have not read anything */ 754 error = EAGAIN; 755 } 756 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 757 "%s: non blocked read return, error %d.\n", 758 __func__, error); 759 break; 760 } 761 762 /* 763 * Wait and block until (more) data comes in. 764 * Note: Drops the sockbuf lock during wait. 765 */ 766 error = sbwait(so, SO_RCV); 767 768 if (error) 769 break; 770 771 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 772 "%s: wake up from sbwait, read available is %u\n", 773 __func__, vmbus_chan_read_available(pcb->chan)); 774 } 775 776 out: 777 SOCKBUF_UNLOCK(sb); 778 SOCK_IO_RECV_UNLOCK(so); 779 780 /* We recieved a FIN in this call */ 781 if (so->so_error == ESHUTDOWN) { 782 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 783 /* Send has already closed */ 784 soisdisconnecting(so); 785 } else { 786 /* Just close the receive side */ 787 socantrcvmore(so); 788 } 789 } 790 791 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 792 "%s: returning error = %d, so_error = %d\n", 793 __func__, error, so->so_error); 794 795 return (error); 796 } 797 798 int 799 hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 800 struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td) 801 { 802 struct hvs_pcb *pcb = so2hvspcb(so); 803 struct sockbuf *sb; 804 ssize_t orig_resid; 805 uint32_t canwrite, to_write; 806 int error = 0; 807 808 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 809 "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n", 810 __func__, uio->uio_resid); 811 812 if (so->so_type != SOCK_STREAM) 813 return (EINVAL); 814 if (pcb == NULL) 815 return (EINVAL); 816 817 /* If nothing to send */ 
818 if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE) 819 return (EINVAL); 820 821 orig_resid = uio->uio_resid; 822 823 /* Prevent other writers from entering the socket. */ 824 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); 825 if (error) { 826 HVSOCK_DBG(HVSOCK_DBG_ERR, 827 "%s: soiolocak returned error = %d\n", __func__, error); 828 return (error); 829 } 830 831 sb = &so->so_snd; 832 SOCKBUF_LOCK(sb); 833 834 if ((sb->sb_state & SBS_CANTSENDMORE) || 835 so->so_error == ESHUTDOWN) { 836 error = EPIPE; 837 goto out; 838 } 839 840 while (uio->uio_resid > 0) { 841 canwrite = hvsock_canwrite_check(pcb); 842 if (canwrite == 0) { 843 /* We have sent some data */ 844 if (orig_resid > uio->uio_resid) 845 break; 846 /* 847 * We have not sent any data and it is 848 * non-blocked io 849 */ 850 if (so->so_state & SS_NBIO || 851 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 852 error = EWOULDBLOCK; 853 break; 854 } else { 855 /* 856 * We are here because there is no space on 857 * send buffer ring. Signal the other side 858 * to read and free more space. 859 * Sleep wait until space avaiable to send 860 * Note: Drops the sockbuf lock during wait. 
861 */ 862 error = sbwait(so, SO_SND); 863 864 if (error) 865 break; 866 867 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 868 "%s: wake up from sbwait, space avail on " 869 "tx ring is %u\n", 870 __func__, 871 vmbus_chan_write_available(pcb->chan)); 872 873 continue; 874 } 875 } 876 to_write = MIN(canwrite, uio->uio_resid); 877 to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ); 878 879 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 880 "%s: canwrite is %u, to_write = %u\n", __func__, 881 canwrite, to_write); 882 error = hvsock_send_data(pcb->chan, uio, to_write, sb); 883 884 if (error) 885 break; 886 } 887 888 out: 889 SOCKBUF_UNLOCK(sb); 890 SOCK_IO_SEND_UNLOCK(so); 891 892 return (error); 893 } 894 895 int 896 hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam) 897 { 898 struct hvs_pcb *pcb = so2hvspcb(so); 899 900 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 901 "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__); 902 903 if (pcb == NULL) 904 return (EINVAL); 905 906 *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT); 907 908 return ((*nam == NULL)? ENOMEM : 0); 909 } 910 911 int 912 hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam) 913 { 914 struct hvs_pcb *pcb = so2hvspcb(so); 915 916 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 917 "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__); 918 919 if (pcb == NULL) 920 return (EINVAL); 921 922 *nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT); 923 924 return ((*nam == NULL)? 
ENOMEM : 0); 925 } 926 927 void 928 hvs_trans_close(struct socket *so) 929 { 930 struct hvs_pcb *pcb; 931 932 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 933 "%s: HyperV Socket hvs_trans_close called\n", __func__); 934 935 (void) hvs_trans_lock(); 936 pcb = so2hvspcb(so); 937 if (!pcb) { 938 hvs_trans_unlock(); 939 return; 940 } 941 942 if (so->so_state & SS_ISCONNECTED) { 943 /* Send a FIN to peer */ 944 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 945 "%s: hvs_trans_close sending a FIN to host\n", __func__); 946 (void) hvsock_send_data(pcb->chan, NULL, 0, NULL); 947 } 948 949 if (so->so_state & 950 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 951 soisdisconnected(so); 952 953 pcb->chan = NULL; 954 pcb->so = NULL; 955 956 if (SOLISTENING(so)) { 957 mtx_lock(&hvs_trans_socks_mtx); 958 /* Remove from bound list */ 959 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 960 mtx_unlock(&hvs_trans_socks_mtx); 961 } 962 963 hvs_trans_unlock(); 964 965 return; 966 } 967 968 void 969 hvs_trans_abort(struct socket *so) 970 { 971 struct hvs_pcb *pcb = so2hvspcb(so); 972 973 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 974 "%s: HyperV Socket hvs_trans_abort called\n", __func__); 975 976 (void) hvs_trans_lock(); 977 if (pcb == NULL) { 978 hvs_trans_unlock(); 979 return; 980 } 981 982 if (SOLISTENING(so)) { 983 mtx_lock(&hvs_trans_socks_mtx); 984 /* Remove from bound list */ 985 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 986 mtx_unlock(&hvs_trans_socks_mtx); 987 } 988 989 if (so->so_state & SS_ISCONNECTED) { 990 (void) sodisconnect(so); 991 } 992 hvs_trans_unlock(); 993 994 return; 995 } 996 997 int 998 hvs_trans_shutdown(struct socket *so) 999 { 1000 struct hvs_pcb *pcb = so2hvspcb(so); 1001 struct sockbuf *sb; 1002 1003 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1004 "%s: HyperV Socket hvs_trans_shutdown called\n", __func__); 1005 1006 if (pcb == NULL) 1007 return (EINVAL); 1008 1009 /* 1010 * Only get called with the shutdown method is SHUT_WR or 1011 * SHUT_RDWR. 
1012 * When the method is SHUT_RD or SHUT_RDWR, the caller 1013 * already set the SBS_CANTRCVMORE on receive side socket 1014 * buffer. 1015 */ 1016 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 1017 /* 1018 * SHUT_WR only case. 1019 * Receive side is still open. Just close 1020 * the send side. 1021 */ 1022 socantsendmore(so); 1023 } else { 1024 /* SHUT_RDWR case */ 1025 if (so->so_state & SS_ISCONNECTED) { 1026 /* Send a FIN to peer */ 1027 sb = &so->so_snd; 1028 SOCKBUF_LOCK(sb); 1029 (void) hvsock_send_data(pcb->chan, NULL, 0, sb); 1030 SOCKBUF_UNLOCK(sb); 1031 1032 soisdisconnecting(so); 1033 } 1034 } 1035 1036 return (0); 1037 } 1038 1039 /* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is 1040 * <port> (see struct sockaddr_hvs). 1041 * 1042 * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV: 1043 * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user- 1044 * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with 1045 * the below sockaddr: 1046 * 1047 * struct SOCKADDR_HV 1048 * { 1049 * ADDRESS_FAMILY Family; 1050 * USHORT Reserved; 1051 * GUID VmId; 1052 * GUID ServiceId; 1053 * }; 1054 * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via 1055 * VMBus, because here it's obvious the host and the VM can easily identify 1056 * each other. Though the VmID is useful on the host, especially in the case 1057 * of Windows container, FreeBSD VM doesn't need it at all. 1058 * 1059 * To be compatible with similar infrastructure in Linux VMs, we have 1060 * to limit the available GUID space of SOCKADDR_HV so that we can create 1061 * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID. 
 * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
 *
 ****************************************************************************
 * The only valid Service GUIDs, from the perspectives of both the host and *
 * FreeBSD VM, that can be connected by the other end, must conform to this *
 * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
 ****************************************************************************
 *
 * When we write apps on the host to connect(), the GUID ServiceID is used.
 * When we write apps in FreeBSD VM to connect(), we only need to specify the
 * port and the driver will form the GUID and use that to request the host.
 *
 * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
 * auto-generated remote port for a connect request initiated by the host's
 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
 * FreeBSD guest.
 */

/*
 * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
 * restrict the HyperV socket ring buffer size to six 4K pages. Newer
 * HyperV hosts don't have this limit.
1084 */ 1085 #define HVS_RINGBUF_RCV_SIZE (PAGE_SIZE * 6) 1086 #define HVS_RINGBUF_SND_SIZE (PAGE_SIZE * 6) 1087 #define HVS_RINGBUF_MAX_SIZE (PAGE_SIZE * 64) 1088 1089 struct hvsock_sc { 1090 device_t dev; 1091 struct hvs_pcb *pcb; 1092 struct vmbus_channel *channel; 1093 }; 1094 1095 static bool 1096 hvsock_chan_readable(struct vmbus_channel *chan) 1097 { 1098 uint32_t readable = vmbus_chan_read_available(chan); 1099 1100 return (readable >= HVSOCK_PKT_LEN(0)); 1101 } 1102 1103 static void 1104 hvsock_chan_cb(struct vmbus_channel *chan, void *context) 1105 { 1106 struct hvs_pcb *pcb = (struct hvs_pcb *) context; 1107 struct socket *so; 1108 uint32_t canwrite; 1109 1110 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1111 "%s: host send us a wakeup on rb data, pcb = %p\n", 1112 __func__, pcb); 1113 1114 /* 1115 * Check if the socket is still attached and valid. 1116 * Here we know channel is still open. Need to make 1117 * sure the socket has not been closed or freed. 1118 */ 1119 (void) hvs_trans_lock(); 1120 so = hsvpcb2so(pcb); 1121 1122 if (pcb->chan != NULL && so != NULL) { 1123 /* 1124 * Wake up reader if there are data to read. 1125 */ 1126 SOCKBUF_LOCK(&(so)->so_rcv); 1127 1128 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1129 "%s: read available = %u\n", __func__, 1130 vmbus_chan_read_available(pcb->chan)); 1131 1132 if (hvsock_chan_readable(pcb->chan)) 1133 sorwakeup_locked(so); 1134 else 1135 SOCKBUF_UNLOCK(&(so)->so_rcv); 1136 1137 /* 1138 * Wake up sender if space becomes available to write. 
1139 */ 1140 SOCKBUF_LOCK(&(so)->so_snd); 1141 canwrite = hvsock_canwrite_check(pcb); 1142 1143 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1144 "%s: canwrite = %u\n", __func__, canwrite); 1145 1146 if (canwrite > 0) { 1147 sowwakeup_locked(so); 1148 } else { 1149 SOCKBUF_UNLOCK(&(so)->so_snd); 1150 } 1151 } 1152 1153 hvs_trans_unlock(); 1154 1155 return; 1156 } 1157 1158 static int 1159 hvsock_br_callback(void *datap, int cplen, void *cbarg) 1160 { 1161 struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg; 1162 struct uio *uio = arg->uio; 1163 struct sockbuf *sb = arg->sb; 1164 int error = 0; 1165 1166 if (cbarg == NULL || datap == NULL) 1167 return (EINVAL); 1168 1169 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1170 "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, " 1171 "datap = %p\n", 1172 __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br", 1173 uio->uio_resid, cplen, datap); 1174 1175 if (sb) 1176 SOCKBUF_UNLOCK(sb); 1177 1178 error = uiomove(datap, cplen, uio); 1179 1180 if (sb) 1181 SOCKBUF_LOCK(sb); 1182 1183 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1184 "%s: after uiomove, uio_resid = %zd, error = %d\n", 1185 __func__, uio->uio_resid, error); 1186 1187 return (error); 1188 } 1189 1190 static int 1191 hvsock_send_data(struct vmbus_channel *chan, struct uio *uio, 1192 uint32_t to_write, struct sockbuf *sb) 1193 { 1194 struct hvs_pkt_header hvs_pkt; 1195 int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0; 1196 uint64_t pad = 0; 1197 struct iovec iov[3]; 1198 struct hvs_callback_arg cbarg; 1199 1200 if (chan == NULL) 1201 return (ENOTCONN); 1202 1203 hlen = sizeof(struct vmbus_chanpkt_hdr); 1204 hvs_pkthlen = sizeof(struct hvs_pkt_header); 1205 hvs_pktlen = hvs_pkthlen + to_write; 1206 pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen); 1207 1208 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1209 "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, " 1210 "pad_pktlen = %u, data_len = %u\n", 1211 __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write); 1212 1213 
hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND; 1214 hvs_pkt.chan_pkt_hdr.cph_flags = 0; 1215 VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen); 1216 VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen); 1217 hvs_pkt.chan_pkt_hdr.cph_xactid = 0; 1218 1219 hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1; 1220 hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write; 1221 1222 cbarg.uio = uio; 1223 cbarg.sb = sb; 1224 1225 if (uio && to_write > 0) { 1226 iov[0].iov_base = &hvs_pkt; 1227 iov[0].iov_len = hvs_pkthlen; 1228 iov[1].iov_base = NULL; 1229 iov[1].iov_len = to_write; 1230 iov[2].iov_base = &pad; 1231 iov[2].iov_len = pad_pktlen - hvs_pktlen; 1232 1233 error = vmbus_chan_iov_send(chan, iov, 3, 1234 hvsock_br_callback, &cbarg); 1235 } else { 1236 if (to_write == 0) { 1237 iov[0].iov_base = &hvs_pkt; 1238 iov[0].iov_len = hvs_pkthlen; 1239 iov[1].iov_base = &pad; 1240 iov[1].iov_len = pad_pktlen - hvs_pktlen; 1241 error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL); 1242 } 1243 } 1244 1245 if (error) { 1246 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1247 "%s: error = %d\n", __func__, error); 1248 } 1249 1250 return (error); 1251 } 1252 1253 /* 1254 * Check if we have data on current ring buffer to read 1255 * or not. If not, advance the ring buffer read index to 1256 * next packet. Update the recev_data_len and recev_data_off 1257 * to new value. 1258 * Return the number of bytes can read. 
1259 */ 1260 static uint32_t 1261 hvsock_canread_check(struct hvs_pcb *pcb) 1262 { 1263 uint32_t advance; 1264 uint32_t tlen, hlen, dlen; 1265 uint32_t bytes_canread = 0; 1266 int error; 1267 1268 if (pcb == NULL || pcb->chan == NULL) { 1269 pcb->so->so_error = EIO; 1270 return (0); 1271 } 1272 1273 /* Still have data not read yet on current packet */ 1274 if (pcb->recv_data_len > 0) 1275 return (pcb->recv_data_len); 1276 1277 if (pcb->rb_init) 1278 advance = 1279 VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); 1280 else 1281 advance = 0; 1282 1283 bytes_canread = vmbus_chan_read_available(pcb->chan); 1284 1285 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1286 "%s: bytes_canread on br = %u, advance = %u\n", 1287 __func__, bytes_canread, advance); 1288 1289 if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) { 1290 /* 1291 * Nothing to read. Need to advance the rindex before 1292 * calling sbwait, so host knows to wake us up when data 1293 * is available to read on rb. 1294 */ 1295 error = vmbus_chan_recv_idxadv(pcb->chan, advance); 1296 if (error) { 1297 HVSOCK_DBG(HVSOCK_DBG_ERR, 1298 "%s: after calling vmbus_chan_recv_idxadv, " 1299 "got error = %d\n", __func__, error); 1300 return (0); 1301 } else { 1302 pcb->rb_init = false; 1303 pcb->recv_data_len = 0; 1304 pcb->recv_data_off = 0; 1305 bytes_canread = vmbus_chan_read_available(pcb->chan); 1306 1307 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1308 "%s: advanced %u bytes, " 1309 " bytes_canread on br now = %u\n", 1310 __func__, advance, bytes_canread); 1311 1312 if (bytes_canread == 0) 1313 return (0); 1314 else 1315 advance = 0; 1316 } 1317 } 1318 1319 if (bytes_canread < 1320 advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t))) 1321 return (0); 1322 1323 error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt, 1324 sizeof(struct hvs_pkt_header), advance); 1325 1326 /* Don't have anything to read */ 1327 if (error) { 1328 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1329 "%s: after calling vmbus_chan_recv_peek, got 
error = %d\n", 1330 __func__, error); 1331 return (0); 1332 } 1333 1334 /* 1335 * We just read in a new packet header. Do some sanity checks. 1336 */ 1337 tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); 1338 hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen); 1339 dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size; 1340 if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) || 1341 __predict_false(hlen > tlen) || 1342 __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) { 1343 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1344 "invalid tlen(%u), hlen(%u) or dlen(%u)\n", 1345 tlen, hlen, dlen); 1346 pcb->so->so_error = EIO; 1347 return (0); 1348 } 1349 if (pcb->rb_init == false) 1350 pcb->rb_init = true; 1351 1352 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1353 "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n", 1354 tlen, hlen, dlen); 1355 1356 /* The other side has sent a close FIN */ 1357 if (dlen == 0) { 1358 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1359 "%s: Received FIN from other side\n", __func__); 1360 /* inform the caller by seting so_error to ESHUTDOWN */ 1361 pcb->so->so_error = ESHUTDOWN; 1362 } 1363 1364 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1365 "%s: canread on receive ring is %u \n", __func__, dlen); 1366 1367 pcb->recv_data_len = dlen; 1368 pcb->recv_data_off = 0; 1369 1370 return (pcb->recv_data_len); 1371 } 1372 1373 static uint32_t 1374 hvsock_canwrite_check(struct hvs_pcb *pcb) 1375 { 1376 uint32_t writeable; 1377 uint32_t ret; 1378 1379 if (pcb == NULL || pcb->chan == NULL) 1380 return (0); 1381 1382 writeable = vmbus_chan_write_available(pcb->chan); 1383 1384 /* 1385 * We must always reserve a 0-length-payload packet for the FIN. 1386 */ 1387 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1388 "%s: writeable is %u, should be greater than %ju\n", 1389 __func__, writeable, 1390 (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0))); 1391 1392 if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) { 1393 /* 1394 * The Tx ring seems full. 
1395 */ 1396 return (0); 1397 } 1398 1399 ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0); 1400 1401 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1402 "%s: available size is %u\n", __func__, rounddown2(ret, 8)); 1403 1404 return (rounddown2(ret, 8)); 1405 } 1406 1407 static void 1408 hvsock_set_chan_pending_send_size(struct vmbus_channel *chan) 1409 { 1410 vmbus_chan_set_pending_send_size(chan, 1411 HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ)); 1412 } 1413 1414 static int 1415 hvsock_open_channel(struct vmbus_channel *chan, struct socket *so) 1416 { 1417 unsigned int rcvbuf, sndbuf; 1418 struct hvs_pcb *pcb = so2hvspcb(so); 1419 int ret; 1420 1421 if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) { 1422 sndbuf = HVS_RINGBUF_SND_SIZE; 1423 rcvbuf = HVS_RINGBUF_RCV_SIZE; 1424 } else { 1425 sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE); 1426 sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE); 1427 sndbuf = rounddown2(sndbuf, PAGE_SIZE); 1428 rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE); 1429 rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE); 1430 rcvbuf = rounddown2(rcvbuf, PAGE_SIZE); 1431 } 1432 1433 /* 1434 * Can only read whatever user provided size of data 1435 * from ring buffer. Turn off batched reading. 1436 */ 1437 vmbus_chan_set_readbatch(chan, false); 1438 1439 ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0, 1440 hvsock_chan_cb, pcb); 1441 1442 if (ret != 0) { 1443 HVSOCK_DBG(HVSOCK_DBG_ERR, 1444 "%s: failed to open hvsock channel, sndbuf = %u, " 1445 "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); 1446 } else { 1447 HVSOCK_DBG(HVSOCK_DBG_INFO, 1448 "%s: hvsock channel opened, sndbuf = %u, i" 1449 "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); 1450 /* 1451 * Se the pending send size so to receive wakeup 1452 * signals from host when there is enough space on 1453 * rx buffer ring to write. 1454 */ 1455 hvsock_set_chan_pending_send_size(chan); 1456 } 1457 1458 return ret; 1459 } 1460 1461 /* 1462 * Guest is listening passively on the socket. 
Open channel and 1463 * create a new socket for the conneciton. 1464 */ 1465 static void 1466 hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so, 1467 struct hvsock_sc *sc) 1468 { 1469 struct socket *new_so; 1470 struct hvs_pcb *new_pcb, *pcb; 1471 int error; 1472 1473 /* Do nothing if socket is not listening */ 1474 if (!SOLISTENING(so)) { 1475 HVSOCK_DBG(HVSOCK_DBG_ERR, 1476 "%s: socket is not a listening one\n", __func__); 1477 return; 1478 } 1479 1480 /* 1481 * Create a new socket. This will call pru_attach to complete 1482 * the socket initialization and put the new socket onto 1483 * listening socket's sol_incomp list, waiting to be promoted 1484 * to sol_comp list. 1485 * The new socket created has ref count 0. There is no other 1486 * thread that changes the state of this new one at the 1487 * moment, so we don't need to hold its lock while opening 1488 * channel and filling out its pcb information. 1489 */ 1490 new_so = sonewconn(so, 0); 1491 if (!new_so) 1492 HVSOCK_DBG(HVSOCK_DBG_ERR, 1493 "%s: creating new socket failed\n", __func__); 1494 1495 /* 1496 * Now open the vmbus channel. If it fails, the socket will be 1497 * on the listening socket's sol_incomp queue until it is 1498 * replaced and aborted. 
1499 */ 1500 error = hvsock_open_channel(chan, new_so); 1501 if (error) { 1502 new_so->so_error = error; 1503 return; 1504 } 1505 1506 pcb = so->so_pcb; 1507 new_pcb = new_so->so_pcb; 1508 1509 hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port); 1510 /* Remote port is unknown to guest in this type of conneciton */ 1511 hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN); 1512 new_pcb->chan = chan; 1513 new_pcb->recv_data_len = 0; 1514 new_pcb->recv_data_off = 0; 1515 new_pcb->rb_init = false; 1516 1517 new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan); 1518 new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan); 1519 1520 hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED); 1521 1522 sc->pcb = new_pcb; 1523 1524 /* 1525 * Change the socket state to SS_ISCONNECTED. This will promote 1526 * the socket to sol_comp queue and wake up the thread which 1527 * is accepting connection. 1528 */ 1529 soisconnected(new_so); 1530 } 1531 1532 1533 /* 1534 * Guest is actively connecting to host. 1535 */ 1536 static void 1537 hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so) 1538 { 1539 struct hvs_pcb *pcb; 1540 int error; 1541 1542 error = hvsock_open_channel(chan, so); 1543 if (error) { 1544 so->so_error = error; 1545 return; 1546 } 1547 1548 pcb = so->so_pcb; 1549 pcb->chan = chan; 1550 pcb->recv_data_len = 0; 1551 pcb->recv_data_off = 0; 1552 pcb->rb_init = false; 1553 1554 mtx_lock(&hvs_trans_socks_mtx); 1555 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 1556 __hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED); 1557 mtx_unlock(&hvs_trans_socks_mtx); 1558 1559 /* 1560 * Change the socket state to SS_ISCONNECTED. This will wake up 1561 * the thread sleeping in connect call. 
1562 */ 1563 soisconnected(so); 1564 } 1565 1566 static void 1567 hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc) 1568 { 1569 struct hyperv_guid *inst_guid, *type_guid; 1570 bool conn_from_host; 1571 struct sockaddr_hvs addr; 1572 struct socket *so; 1573 struct hvs_pcb *pcb; 1574 1575 type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan); 1576 inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan); 1577 conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan); 1578 1579 HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is "); 1580 hvsock_print_guid(type_guid); 1581 HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is "); 1582 hvsock_print_guid(inst_guid); 1583 HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n", 1584 (conn_from_host == true ) ? "from" : "to"); 1585 1586 /* 1587 * The listening port should be in [0, MAX_LISTEN_PORT] 1588 */ 1589 if (!is_valid_srv_id(type_guid)) 1590 return; 1591 1592 /* 1593 * There should be a bound socket already created no matter 1594 * it is a passive or active connection. 1595 * For host initiated connection (passive on guest side), 1596 * the type_guid contains the port which guest is bound and 1597 * listening. 1598 * For the guest initiated connection (active on guest side), 1599 * the inst_guid contains the port that guest has auto bound 1600 * to. 1601 */ 1602 hvs_addr_init(&addr, conn_from_host ? 
type_guid : inst_guid); 1603 so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND); 1604 if (!so) { 1605 HVSOCK_DBG(HVSOCK_DBG_ERR, 1606 "%s: no bound socket found for port %u\n", 1607 __func__, addr.hvs_port); 1608 return; 1609 } 1610 1611 if (conn_from_host) { 1612 hvsock_open_conn_passive(chan, so, sc); 1613 } else { 1614 (void) hvs_trans_lock(); 1615 pcb = so->so_pcb; 1616 if (pcb && pcb->so) { 1617 sc->pcb = so2hvspcb(so); 1618 hvsock_open_conn_active(chan, so); 1619 } else { 1620 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1621 "%s: channel detached before open\n", __func__); 1622 } 1623 hvs_trans_unlock(); 1624 } 1625 1626 } 1627 1628 static int 1629 hvsock_probe(device_t dev) 1630 { 1631 struct vmbus_channel *channel = vmbus_get_channel(dev); 1632 1633 if (!channel || !vmbus_chan_is_hvs(channel)) { 1634 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1635 "hvsock_probe called but not a hvsock channel id %u\n", 1636 vmbus_chan_id(channel)); 1637 1638 return ENXIO; 1639 } else { 1640 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1641 "hvsock_probe got a hvsock channel id %u\n", 1642 vmbus_chan_id(channel)); 1643 1644 return BUS_PROBE_DEFAULT; 1645 } 1646 } 1647 1648 static int 1649 hvsock_attach(device_t dev) 1650 { 1651 struct vmbus_channel *channel = vmbus_get_channel(dev); 1652 struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); 1653 1654 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n"); 1655 1656 hvsock_open_connection(channel, sc); 1657 1658 /* 1659 * Always return success. On error the host will rescind the device 1660 * in 30 seconds and we can do cleanup at that time in 1661 * vmbus_chan_msgproc_chrescind(). 
1662 */ 1663 return (0); 1664 } 1665 1666 static int 1667 hvsock_detach(device_t dev) 1668 { 1669 struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); 1670 struct socket *so; 1671 int retry; 1672 1673 if (bootverbose) 1674 device_printf(dev, "hvsock_detach called.\n"); 1675 1676 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n"); 1677 1678 if (sc->pcb != NULL) { 1679 (void) hvs_trans_lock(); 1680 1681 so = hsvpcb2so(sc->pcb); 1682 if (so) { 1683 /* Close the connection */ 1684 if (so->so_state & 1685 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 1686 soisdisconnected(so); 1687 } 1688 1689 mtx_lock(&hvs_trans_socks_mtx); 1690 __hvs_remove_pcb_from_list(sc->pcb, 1691 HVS_LIST_BOUND | HVS_LIST_CONNECTED); 1692 mtx_unlock(&hvs_trans_socks_mtx); 1693 1694 /* 1695 * Close channel while no reader and sender are working 1696 * on the buffer rings. 1697 */ 1698 if (so) { 1699 retry = 0; 1700 while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) { 1701 /* 1702 * Someone is reading, rx br is busy 1703 */ 1704 soisdisconnected(so); 1705 DELAY(500); 1706 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1707 "waiting for rx reader to exit, " 1708 "retry = %d\n", retry++); 1709 } 1710 retry = 0; 1711 while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) { 1712 /* 1713 * Someone is sending, tx br is busy 1714 */ 1715 soisdisconnected(so); 1716 DELAY(500); 1717 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1718 "waiting for tx sender to exit, " 1719 "retry = %d\n", retry++); 1720 } 1721 } 1722 1723 1724 bzero(sc->pcb, sizeof(struct hvs_pcb)); 1725 free(sc->pcb, M_HVSOCK); 1726 sc->pcb = NULL; 1727 1728 if (so) { 1729 SOCK_IO_RECV_UNLOCK(so); 1730 SOCK_IO_SEND_UNLOCK(so); 1731 so->so_pcb = NULL; 1732 } 1733 1734 hvs_trans_unlock(); 1735 } 1736 1737 vmbus_chan_close(vmbus_get_channel(dev)); 1738 1739 return (0); 1740 } 1741 1742 static device_method_t hvsock_methods[] = { 1743 /* Device interface */ 1744 DEVMETHOD(device_probe, hvsock_probe), 1745 DEVMETHOD(device_attach, hvsock_attach), 1746 
DEVMETHOD(device_detach, hvsock_detach), 1747 DEVMETHOD_END 1748 }; 1749 1750 static driver_t hvsock_driver = { 1751 "hv_sock", 1752 hvsock_methods, 1753 sizeof(struct hvsock_sc) 1754 }; 1755 1756 DRIVER_MODULE(hvsock, vmbus, hvsock_driver, NULL, NULL); 1757 MODULE_VERSION(hvsock, 1); 1758 MODULE_DEPEND(hvsock, vmbus, 1, 1, 1); 1759