1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2020 Microsoft Corp. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 */ 28 29 #include <sys/param.h> 30 #include <sys/bus.h> 31 #include <sys/domain.h> 32 #include <sys/lock.h> 33 #include <sys/kernel.h> 34 #include <sys/types.h> 35 #include <sys/malloc.h> 36 #include <sys/module.h> 37 #include <sys/mutex.h> 38 #include <sys/proc.h> 39 #include <sys/protosw.h> 40 #include <sys/socket.h> 41 #include <sys/sysctl.h> 42 #include <sys/sysproto.h> 43 #include <sys/systm.h> 44 #include <sys/sockbuf.h> 45 #include <sys/sx.h> 46 #include <sys/uio.h> 47 48 #include <net/vnet.h> 49 50 #include <dev/hyperv/vmbus/vmbus_reg.h> 51 52 #include "hv_sock.h" 53 54 #define HVSOCK_DBG_NONE 0x0 55 #define HVSOCK_DBG_INFO 0x1 56 #define HVSOCK_DBG_ERR 0x2 57 #define HVSOCK_DBG_VERBOSE 0x3 58 59 60 SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket"); 61 62 static int hvs_dbg_level; 63 SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level, 64 0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose"); 65 66 67 #define HVSOCK_DBG(level, ...) 
do { \ 68 if (hvs_dbg_level >= (level)) \ 69 printf(__VA_ARGS__); \ 70 } while (0) 71 72 MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures"); 73 74 static int hvs_dom_probe(void); 75 76 /* The MTU is 16KB per host side's design */ 77 #define HVSOCK_MTU_SIZE (1024 * 16) 78 #define HVSOCK_SEND_BUF_SZ (PAGE_SIZE - sizeof(struct vmpipe_proto_header)) 79 80 #define HVSOCK_HEADER_LEN (sizeof(struct hvs_pkt_header)) 81 82 #define HVSOCK_PKT_LEN(payload_len) (HVSOCK_HEADER_LEN + \ 83 roundup2(payload_len, 8) + \ 84 sizeof(uint64_t)) 85 86 /* 87 * HyperV Transport sockets 88 */ 89 static struct protosw hv_socket_protosw = { 90 .pr_type = SOCK_STREAM, 91 .pr_protocol = HYPERV_SOCK_PROTO_TRANS, 92 .pr_flags = PR_CONNREQUIRED, 93 .pr_attach = hvs_trans_attach, 94 .pr_bind = hvs_trans_bind, 95 .pr_listen = hvs_trans_listen, 96 .pr_accept = hvs_trans_accept, 97 .pr_connect = hvs_trans_connect, 98 .pr_peeraddr = hvs_trans_peeraddr, 99 .pr_sockaddr = hvs_trans_sockaddr, 100 .pr_soreceive = hvs_trans_soreceive, 101 .pr_sosend = hvs_trans_sosend, 102 .pr_disconnect = hvs_trans_disconnect, 103 .pr_close = hvs_trans_close, 104 .pr_detach = hvs_trans_detach, 105 .pr_shutdown = hvs_trans_shutdown, 106 .pr_abort = hvs_trans_abort, 107 }; 108 109 static struct domain hv_socket_domain = { 110 .dom_family = AF_HYPERV, 111 .dom_name = "hyperv", 112 .dom_probe = hvs_dom_probe, 113 .dom_nprotosw = 1, 114 .dom_protosw = { &hv_socket_protosw }, 115 }; 116 117 DOMAIN_SET(hv_socket_); 118 119 #define MAX_PORT ((uint32_t)0xFFFFFFFF) 120 #define MIN_PORT ((uint32_t)0x0) 121 122 /* 00000000-facb-11e6-bd58-64006a7986d3 */ 123 static const struct hyperv_guid srv_id_template = { 124 .hv_guid = { 125 0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11, 126 0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 } 127 }; 128 129 static int hvsock_br_callback(void *, int, void *); 130 static uint32_t hvsock_canread_check(struct hvs_pcb *); 131 static uint32_t hvsock_canwrite_check(struct hvs_pcb 
*); 132 static int hvsock_send_data(struct vmbus_channel *chan, 133 struct uio *uio, uint32_t to_write, struct sockbuf *sb); 134 135 136 137 /* Globals */ 138 static struct sx hvs_trans_socks_sx; 139 static struct mtx hvs_trans_socks_mtx; 140 static LIST_HEAD(, hvs_pcb) hvs_trans_bound_socks; 141 static LIST_HEAD(, hvs_pcb) hvs_trans_connected_socks; 142 static uint32_t previous_auto_bound_port; 143 144 static void 145 hvsock_print_guid(struct hyperv_guid *guid) 146 { 147 unsigned char *p = (unsigned char *)guid; 148 149 HVSOCK_DBG(HVSOCK_DBG_INFO, 150 "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n", 151 *(unsigned int *)p, 152 *((unsigned short *) &p[4]), 153 *((unsigned short *) &p[6]), 154 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 155 } 156 157 static bool 158 is_valid_srv_id(const struct hyperv_guid *id) 159 { 160 return !memcmp(&id->hv_guid[4], 161 &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4); 162 } 163 164 static unsigned int 165 get_port_by_srv_id(const struct hyperv_guid *srv_id) 166 { 167 return *((const unsigned int *)srv_id); 168 } 169 170 static void 171 set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port) 172 { 173 *((unsigned int *)srv_id) = port; 174 } 175 176 177 static void 178 __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list) 179 { 180 struct hvs_pcb *p = NULL; 181 182 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); 183 184 if (!pcb) 185 return; 186 187 if (list & HVS_LIST_BOUND) { 188 LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) 189 if (p == pcb) 190 LIST_REMOVE(p, bound_next); 191 } 192 193 if (list & HVS_LIST_CONNECTED) { 194 LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) 195 if (p == pcb) 196 LIST_REMOVE(pcb, connected_next); 197 } 198 } 199 200 static void 201 __hvs_remove_socket_from_list(struct socket *so, unsigned char list) 202 { 203 struct hvs_pcb *pcb = so2hvspcb(so); 204 205 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", 
__func__, pcb); 206 207 __hvs_remove_pcb_from_list(pcb, list); 208 } 209 210 static void 211 __hvs_insert_socket_on_list(struct socket *so, unsigned char list) 212 { 213 struct hvs_pcb *pcb = so2hvspcb(so); 214 215 if (list & HVS_LIST_BOUND) 216 LIST_INSERT_HEAD(&hvs_trans_bound_socks, 217 pcb, bound_next); 218 219 if (list & HVS_LIST_CONNECTED) 220 LIST_INSERT_HEAD(&hvs_trans_connected_socks, 221 pcb, connected_next); 222 } 223 224 void 225 hvs_remove_socket_from_list(struct socket *so, unsigned char list) 226 { 227 if (!so || !so->so_pcb) { 228 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 229 "%s: socket or so_pcb is null\n", __func__); 230 return; 231 } 232 233 mtx_lock(&hvs_trans_socks_mtx); 234 __hvs_remove_socket_from_list(so, list); 235 mtx_unlock(&hvs_trans_socks_mtx); 236 } 237 238 static void 239 hvs_insert_socket_on_list(struct socket *so, unsigned char list) 240 { 241 if (!so || !so->so_pcb) { 242 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 243 "%s: socket or so_pcb is null\n", __func__); 244 return; 245 } 246 247 mtx_lock(&hvs_trans_socks_mtx); 248 __hvs_insert_socket_on_list(so, list); 249 mtx_unlock(&hvs_trans_socks_mtx); 250 } 251 252 static struct socket * 253 __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) 254 { 255 struct hvs_pcb *p = NULL; 256 257 if (list & HVS_LIST_BOUND) 258 LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) 259 if (p->so != NULL && 260 addr->hvs_port == p->local_addr.hvs_port) 261 return p->so; 262 263 if (list & HVS_LIST_CONNECTED) 264 LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) 265 if (p->so != NULL && 266 addr->hvs_port == p->local_addr.hvs_port) 267 return p->so; 268 269 return NULL; 270 } 271 272 static struct socket * 273 hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) 274 { 275 struct socket *s = NULL; 276 277 mtx_lock(&hvs_trans_socks_mtx); 278 s = __hvs_find_socket_on_list(addr, list); 279 mtx_unlock(&hvs_trans_socks_mtx); 280 281 return s; 282 } 283 284 static inline void 285 
hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port) 286 { 287 memset(addr, 0, sizeof(*addr)); 288 addr->sa_family = AF_HYPERV; 289 addr->sa_len = sizeof(*addr); 290 addr->hvs_port = port; 291 } 292 293 void 294 hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id) 295 { 296 hvs_addr_set(addr, get_port_by_srv_id(svr_id)); 297 } 298 299 int 300 hvs_trans_lock(void) 301 { 302 sx_xlock(&hvs_trans_socks_sx); 303 return (0); 304 } 305 306 void 307 hvs_trans_unlock(void) 308 { 309 sx_xunlock(&hvs_trans_socks_sx); 310 } 311 312 static int 313 hvs_dom_probe(void) 314 { 315 316 /* Don't even give us a chance to attach on non-HyperV. */ 317 if (vm_guest != VM_GUEST_HV) 318 return (ENXIO); 319 return (0); 320 } 321 322 static void 323 hvs_trans_init(void *arg __unused) 324 { 325 326 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 327 "%s: HyperV Socket hvs_trans_init called\n", __func__); 328 329 /* Initialize Globals */ 330 previous_auto_bound_port = MAX_PORT; 331 sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx"); 332 mtx_init(&hvs_trans_socks_mtx, 333 "hvs_trans_socks_mtx", NULL, MTX_DEF); 334 LIST_INIT(&hvs_trans_bound_socks); 335 LIST_INIT(&hvs_trans_connected_socks); 336 } 337 SYSINIT(hvs_trans_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, 338 hvs_trans_init, NULL); 339 340 /* 341 * Called in two cases: 342 * 1) When user calls socket(); 343 * 2) When we accept new incoming conneciton and call sonewconn(). 
344 */ 345 int 346 hvs_trans_attach(struct socket *so, int proto, struct thread *td) 347 { 348 struct hvs_pcb *pcb = so2hvspcb(so); 349 350 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 351 "%s: HyperV Socket hvs_trans_attach called\n", __func__); 352 353 if (so->so_type != SOCK_STREAM) 354 return (ESOCKTNOSUPPORT); 355 356 if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS) 357 return (EPROTONOSUPPORT); 358 359 if (pcb != NULL) 360 return (EISCONN); 361 pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO); 362 if (pcb == NULL) 363 return (ENOMEM); 364 365 pcb->so = so; 366 so->so_pcb = (void *)pcb; 367 368 return (0); 369 } 370 371 void 372 hvs_trans_detach(struct socket *so) 373 { 374 struct hvs_pcb *pcb; 375 376 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 377 "%s: HyperV Socket hvs_trans_detach called\n", __func__); 378 379 (void) hvs_trans_lock(); 380 pcb = so2hvspcb(so); 381 if (pcb == NULL) { 382 hvs_trans_unlock(); 383 return; 384 } 385 386 if (SOLISTENING(so)) { 387 bzero(pcb, sizeof(*pcb)); 388 free(pcb, M_HVSOCK); 389 } 390 391 so->so_pcb = NULL; 392 393 hvs_trans_unlock(); 394 } 395 396 int 397 hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td) 398 { 399 struct hvs_pcb *pcb = so2hvspcb(so); 400 struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr; 401 int error = 0; 402 403 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 404 "%s: HyperV Socket hvs_trans_bind called\n", __func__); 405 406 if (sa == NULL) { 407 return (EINVAL); 408 } 409 410 if (pcb == NULL) { 411 return (EINVAL); 412 } 413 414 if (sa->sa_family != AF_HYPERV) { 415 HVSOCK_DBG(HVSOCK_DBG_ERR, 416 "%s: Not supported, sa_family is %u\n", 417 __func__, sa->sa_family); 418 return (EAFNOSUPPORT); 419 } 420 if (sa->sa_len != sizeof(*sa)) { 421 HVSOCK_DBG(HVSOCK_DBG_ERR, 422 "%s: Not supported, sa_len is %u\n", 423 __func__, sa->sa_len); 424 return (EINVAL); 425 } 426 427 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 428 "%s: binding port = 0x%x\n", __func__, sa->hvs_port); 429 430 mtx_lock(&hvs_trans_socks_mtx); 431 
if (__hvs_find_socket_on_list(sa, 432 HVS_LIST_BOUND | HVS_LIST_CONNECTED)) { 433 error = EADDRINUSE; 434 } else { 435 /* 436 * The address is available for us to bind. 437 * Add socket to the bound list. 438 */ 439 hvs_addr_set(&pcb->local_addr, sa->hvs_port); 440 hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY); 441 __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); 442 } 443 mtx_unlock(&hvs_trans_socks_mtx); 444 445 return (error); 446 } 447 448 int 449 hvs_trans_listen(struct socket *so, int backlog, struct thread *td) 450 { 451 struct hvs_pcb *pcb = so2hvspcb(so); 452 struct socket *bound_so; 453 int error; 454 455 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 456 "%s: HyperV Socket hvs_trans_listen called\n", __func__); 457 458 if (pcb == NULL) 459 return (EINVAL); 460 461 /* Check if the address is already bound and it was by us. */ 462 bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND); 463 if (bound_so == NULL || bound_so != so) { 464 HVSOCK_DBG(HVSOCK_DBG_ERR, 465 "%s: Address not bound or not by us.\n", __func__); 466 return (EADDRNOTAVAIL); 467 } 468 469 SOCK_LOCK(so); 470 error = solisten_proto_check(so); 471 if (error == 0) 472 solisten_proto(so, backlog); 473 SOCK_UNLOCK(so); 474 475 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 476 "%s: HyperV Socket listen error = %d\n", __func__, error); 477 return (error); 478 } 479 480 int 481 hvs_trans_accept(struct socket *so, struct sockaddr **nam) 482 { 483 struct hvs_pcb *pcb = so2hvspcb(so); 484 485 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 486 "%s: HyperV Socket hvs_trans_accept called\n", __func__); 487 488 if (pcb == NULL) 489 return (EINVAL); 490 491 *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, 492 M_NOWAIT); 493 494 return ((*nam == NULL) ? 
ENOMEM : 0); 495 } 496 497 int 498 hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 499 { 500 struct hvs_pcb *pcb = so2hvspcb(so); 501 struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam; 502 bool found_auto_bound_port = false; 503 int i, error = 0; 504 505 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 506 "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n", 507 __func__, raddr->hvs_port); 508 509 if (pcb == NULL) 510 return (EINVAL); 511 512 /* Verify the remote address */ 513 if (raddr == NULL) 514 return (EINVAL); 515 if (raddr->sa_family != AF_HYPERV) 516 return (EAFNOSUPPORT); 517 if (raddr->sa_len != sizeof(*raddr)) 518 return (EINVAL); 519 520 mtx_lock(&hvs_trans_socks_mtx); 521 if (so->so_state & 522 (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) { 523 HVSOCK_DBG(HVSOCK_DBG_ERR, 524 "%s: socket connect in progress\n", 525 __func__); 526 error = EINPROGRESS; 527 goto out; 528 } 529 530 /* 531 * Find an available port for us to auto bind the local 532 * address. 
533 */ 534 hvs_addr_set(&pcb->local_addr, 0); 535 536 for (i = previous_auto_bound_port - 1; 537 i != previous_auto_bound_port; i --) { 538 if (i == MIN_PORT) 539 i = MAX_PORT; 540 541 pcb->local_addr.hvs_port = i; 542 543 if (__hvs_find_socket_on_list(&pcb->local_addr, 544 HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) { 545 found_auto_bound_port = true; 546 previous_auto_bound_port = i; 547 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 548 "%s: found local bound port is %x\n", 549 __func__, pcb->local_addr.hvs_port); 550 break; 551 } 552 } 553 554 if (found_auto_bound_port == true) { 555 /* Found available port for auto bound, put on list */ 556 __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); 557 /* Set VM service ID */ 558 pcb->vm_srv_id = srv_id_template; 559 set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port); 560 /* Set host service ID and remote port */ 561 pcb->host_srv_id = srv_id_template; 562 set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port); 563 hvs_addr_set(&pcb->remote_addr, raddr->hvs_port); 564 565 /* Change the socket state to SS_ISCONNECTING */ 566 soisconnecting(so); 567 } else { 568 HVSOCK_DBG(HVSOCK_DBG_ERR, 569 "%s: No local port available for auto bound\n", 570 __func__); 571 error = EADDRINUSE; 572 } 573 574 HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is "); 575 hvsock_print_guid(&pcb->vm_srv_id); 576 HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is "); 577 hvsock_print_guid(&pcb->host_srv_id); 578 579 out: 580 mtx_unlock(&hvs_trans_socks_mtx); 581 582 if (found_auto_bound_port == true) 583 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id); 584 585 return (error); 586 } 587 588 int 589 hvs_trans_disconnect(struct socket *so) 590 { 591 struct hvs_pcb *pcb; 592 593 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 594 "%s: HyperV Socket hvs_trans_disconnect called\n", __func__); 595 596 (void) hvs_trans_lock(); 597 pcb = so2hvspcb(so); 598 if (pcb == NULL) { 599 hvs_trans_unlock(); 600 return (EINVAL); 601 } 602 603 /* If socket is already 
disconnected, skip this */ 604 if ((so->so_state & SS_ISDISCONNECTED) == 0) 605 soisdisconnecting(so); 606 607 hvs_trans_unlock(); 608 609 return (0); 610 } 611 612 struct hvs_callback_arg { 613 struct uio *uio; 614 struct sockbuf *sb; 615 }; 616 617 int 618 hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr, 619 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 620 { 621 struct hvs_pcb *pcb = so2hvspcb(so); 622 struct sockbuf *sb; 623 ssize_t orig_resid; 624 uint32_t canread, to_read; 625 int flags, error = 0; 626 struct hvs_callback_arg cbarg; 627 628 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 629 "%s: HyperV Socket hvs_trans_soreceive called\n", __func__); 630 631 if (so->so_type != SOCK_STREAM) 632 return (EINVAL); 633 if (pcb == NULL) 634 return (EINVAL); 635 636 if (flagsp != NULL) 637 flags = *flagsp &~ MSG_EOR; 638 else 639 flags = 0; 640 641 if (flags & MSG_PEEK) 642 return (EOPNOTSUPP); 643 644 /* If no space to copy out anything */ 645 if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ) 646 return (EINVAL); 647 648 orig_resid = uio->uio_resid; 649 650 /* Prevent other readers from entering the socket. */ 651 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 652 if (error) { 653 HVSOCK_DBG(HVSOCK_DBG_ERR, 654 "%s: soiolock returned error = %d\n", __func__, error); 655 return (error); 656 } 657 658 sb = &so->so_rcv; 659 SOCKBUF_LOCK(sb); 660 661 cbarg.uio = uio; 662 cbarg.sb = sb; 663 /* 664 * If the socket is closing, there might still be some data 665 * in rx br to read. However we need to make sure 666 * the channel is still open. 
667 */ 668 if ((sb->sb_state & SBS_CANTRCVMORE) && 669 (so->so_state & SS_ISDISCONNECTED)) { 670 /* Other thread already closed the channel */ 671 error = EPIPE; 672 goto out; 673 } 674 675 while (true) { 676 while (uio->uio_resid > 0 && 677 (canread = hvsock_canread_check(pcb)) > 0) { 678 to_read = MIN(canread, uio->uio_resid); 679 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 680 "%s: to_read = %u, skip = %u\n", __func__, to_read, 681 (unsigned int)(sizeof(struct hvs_pkt_header) + 682 pcb->recv_data_off)); 683 684 error = vmbus_chan_recv_peek_call(pcb->chan, to_read, 685 sizeof(struct hvs_pkt_header) + pcb->recv_data_off, 686 hvsock_br_callback, (void *)&cbarg); 687 /* 688 * It is possible socket is disconnected becasue 689 * we released lock in hvsock_br_callback. So we 690 * need to check the state to make sure it is not 691 * disconnected. 692 */ 693 if (error || so->so_state & SS_ISDISCONNECTED) { 694 break; 695 } 696 697 pcb->recv_data_len -= to_read; 698 pcb->recv_data_off += to_read; 699 } 700 701 if (error) 702 break; 703 704 /* Abort if socket has reported problems. */ 705 if (so->so_error) { 706 if (so->so_error == ESHUTDOWN && 707 orig_resid > uio->uio_resid) { 708 /* 709 * Although we got a FIN, we also received 710 * some data in this round. Delivery it 711 * to user. 712 */ 713 error = 0; 714 } else { 715 if (so->so_error != ESHUTDOWN) 716 error = so->so_error; 717 } 718 719 break; 720 } 721 722 /* Cannot received more. 
*/ 723 if (sb->sb_state & SBS_CANTRCVMORE) 724 break; 725 726 /* We are done if buffer has been filled */ 727 if (uio->uio_resid == 0) 728 break; 729 730 if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid) 731 break; 732 733 /* Buffer ring is empty and we shall not block */ 734 if ((so->so_state & SS_NBIO) || 735 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 736 if (orig_resid == uio->uio_resid) { 737 /* We have not read anything */ 738 error = EAGAIN; 739 } 740 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 741 "%s: non blocked read return, error %d.\n", 742 __func__, error); 743 break; 744 } 745 746 /* 747 * Wait and block until (more) data comes in. 748 * Note: Drops the sockbuf lock during wait. 749 */ 750 error = sbwait(so, SO_RCV); 751 752 if (error) 753 break; 754 755 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 756 "%s: wake up from sbwait, read available is %u\n", 757 __func__, vmbus_chan_read_available(pcb->chan)); 758 } 759 760 out: 761 SOCKBUF_UNLOCK(sb); 762 SOCK_IO_RECV_UNLOCK(so); 763 764 /* We recieved a FIN in this call */ 765 if (so->so_error == ESHUTDOWN) { 766 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 767 /* Send has already closed */ 768 soisdisconnecting(so); 769 } else { 770 /* Just close the receive side */ 771 socantrcvmore(so); 772 } 773 } 774 775 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 776 "%s: returning error = %d, so_error = %d\n", 777 __func__, error, so->so_error); 778 779 return (error); 780 } 781 782 int 783 hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 784 struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td) 785 { 786 struct hvs_pcb *pcb = so2hvspcb(so); 787 struct sockbuf *sb; 788 ssize_t orig_resid; 789 uint32_t canwrite, to_write; 790 int error = 0; 791 792 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 793 "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n", 794 __func__, uio->uio_resid); 795 796 if (so->so_type != SOCK_STREAM) 797 return (EINVAL); 798 if (pcb == NULL) 799 return (EINVAL); 800 801 /* If nothing to send */ 
802 if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE) 803 return (EINVAL); 804 805 orig_resid = uio->uio_resid; 806 807 /* Prevent other writers from entering the socket. */ 808 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); 809 if (error) { 810 HVSOCK_DBG(HVSOCK_DBG_ERR, 811 "%s: soiolocak returned error = %d\n", __func__, error); 812 return (error); 813 } 814 815 sb = &so->so_snd; 816 SOCKBUF_LOCK(sb); 817 818 if ((sb->sb_state & SBS_CANTSENDMORE) || 819 so->so_error == ESHUTDOWN) { 820 error = EPIPE; 821 goto out; 822 } 823 824 while (uio->uio_resid > 0) { 825 canwrite = hvsock_canwrite_check(pcb); 826 if (canwrite == 0) { 827 /* We have sent some data */ 828 if (orig_resid > uio->uio_resid) 829 break; 830 /* 831 * We have not sent any data and it is 832 * non-blocked io 833 */ 834 if (so->so_state & SS_NBIO || 835 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 836 error = EWOULDBLOCK; 837 break; 838 } else { 839 /* 840 * We are here because there is no space on 841 * send buffer ring. Signal the other side 842 * to read and free more space. 843 * Sleep wait until space avaiable to send 844 * Note: Drops the sockbuf lock during wait. 
845 */ 846 error = sbwait(so, SO_SND); 847 848 if (error) 849 break; 850 851 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 852 "%s: wake up from sbwait, space avail on " 853 "tx ring is %u\n", 854 __func__, 855 vmbus_chan_write_available(pcb->chan)); 856 857 continue; 858 } 859 } 860 to_write = MIN(canwrite, uio->uio_resid); 861 to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ); 862 863 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 864 "%s: canwrite is %u, to_write = %u\n", __func__, 865 canwrite, to_write); 866 error = hvsock_send_data(pcb->chan, uio, to_write, sb); 867 868 if (error) 869 break; 870 } 871 872 out: 873 SOCKBUF_UNLOCK(sb); 874 SOCK_IO_SEND_UNLOCK(so); 875 876 return (error); 877 } 878 879 int 880 hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam) 881 { 882 struct hvs_pcb *pcb = so2hvspcb(so); 883 884 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 885 "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__); 886 887 if (pcb == NULL) 888 return (EINVAL); 889 890 *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT); 891 892 return ((*nam == NULL)? ENOMEM : 0); 893 } 894 895 int 896 hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam) 897 { 898 struct hvs_pcb *pcb = so2hvspcb(so); 899 900 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 901 "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__); 902 903 if (pcb == NULL) 904 return (EINVAL); 905 906 *nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT); 907 908 return ((*nam == NULL)? 
ENOMEM : 0); 909 } 910 911 void 912 hvs_trans_close(struct socket *so) 913 { 914 struct hvs_pcb *pcb; 915 916 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 917 "%s: HyperV Socket hvs_trans_close called\n", __func__); 918 919 (void) hvs_trans_lock(); 920 pcb = so2hvspcb(so); 921 if (!pcb) { 922 hvs_trans_unlock(); 923 return; 924 } 925 926 if (so->so_state & SS_ISCONNECTED) { 927 /* Send a FIN to peer */ 928 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 929 "%s: hvs_trans_close sending a FIN to host\n", __func__); 930 (void) hvsock_send_data(pcb->chan, NULL, 0, NULL); 931 } 932 933 if (so->so_state & 934 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 935 soisdisconnected(so); 936 937 pcb->chan = NULL; 938 pcb->so = NULL; 939 940 if (SOLISTENING(so)) { 941 mtx_lock(&hvs_trans_socks_mtx); 942 /* Remove from bound list */ 943 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 944 mtx_unlock(&hvs_trans_socks_mtx); 945 } 946 947 hvs_trans_unlock(); 948 949 return; 950 } 951 952 void 953 hvs_trans_abort(struct socket *so) 954 { 955 struct hvs_pcb *pcb = so2hvspcb(so); 956 957 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 958 "%s: HyperV Socket hvs_trans_abort called\n", __func__); 959 960 (void) hvs_trans_lock(); 961 if (pcb == NULL) { 962 hvs_trans_unlock(); 963 return; 964 } 965 966 if (SOLISTENING(so)) { 967 mtx_lock(&hvs_trans_socks_mtx); 968 /* Remove from bound list */ 969 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 970 mtx_unlock(&hvs_trans_socks_mtx); 971 } 972 973 if (so->so_state & SS_ISCONNECTED) { 974 (void) sodisconnect(so); 975 } 976 hvs_trans_unlock(); 977 978 return; 979 } 980 981 int 982 hvs_trans_shutdown(struct socket *so) 983 { 984 struct hvs_pcb *pcb = so2hvspcb(so); 985 struct sockbuf *sb; 986 987 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 988 "%s: HyperV Socket hvs_trans_shutdown called\n", __func__); 989 990 if (pcb == NULL) 991 return (EINVAL); 992 993 /* 994 * Only get called with the shutdown method is SHUT_WR or 995 * SHUT_RDWR. 
996 * When the method is SHUT_RD or SHUT_RDWR, the caller 997 * already set the SBS_CANTRCVMORE on receive side socket 998 * buffer. 999 */ 1000 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 1001 /* 1002 * SHUT_WR only case. 1003 * Receive side is still open. Just close 1004 * the send side. 1005 */ 1006 socantsendmore(so); 1007 } else { 1008 /* SHUT_RDWR case */ 1009 if (so->so_state & SS_ISCONNECTED) { 1010 /* Send a FIN to peer */ 1011 sb = &so->so_snd; 1012 SOCKBUF_LOCK(sb); 1013 (void) hvsock_send_data(pcb->chan, NULL, 0, sb); 1014 SOCKBUF_UNLOCK(sb); 1015 1016 soisdisconnecting(so); 1017 } 1018 } 1019 1020 return (0); 1021 } 1022 1023 /* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is 1024 * <port> (see struct sockaddr_hvs). 1025 * 1026 * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV: 1027 * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user- 1028 * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with 1029 * the below sockaddr: 1030 * 1031 * struct SOCKADDR_HV 1032 * { 1033 * ADDRESS_FAMILY Family; 1034 * USHORT Reserved; 1035 * GUID VmId; 1036 * GUID ServiceId; 1037 * }; 1038 * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via 1039 * VMBus, because here it's obvious the host and the VM can easily identify 1040 * each other. Though the VmID is useful on the host, especially in the case 1041 * of Windows container, FreeBSD VM doesn't need it at all. 1042 * 1043 * To be compatible with similar infrastructure in Linux VMs, we have 1044 * to limit the available GUID space of SOCKADDR_HV so that we can create 1045 * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID. 
 * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
 *
 ****************************************************************************
 * The only valid Service GUIDs, from the perspectives of both the host and *
 * FreeBSD VM, that can be connected by the other end, must conform to this *
 * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
 ****************************************************************************
 *
 * When we write apps on the host to connect(), the GUID ServiceID is used.
 * When we write apps in FreeBSD VM to connect(), we only need to specify the
 * port and the driver will form the GUID and use that to request the host.
 *
 * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
 * auto-generated remote port for a connect request initiated by the host's
 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
 * FreeBSD guest.
 */

/*
 * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
 * restrict HyperV socket ring buffer size to six 4K pages. Newer
 * HyperV hosts don't have this limit.
1068 */ 1069 #define HVS_RINGBUF_RCV_SIZE (PAGE_SIZE * 6) 1070 #define HVS_RINGBUF_SND_SIZE (PAGE_SIZE * 6) 1071 #define HVS_RINGBUF_MAX_SIZE (PAGE_SIZE * 64) 1072 1073 struct hvsock_sc { 1074 device_t dev; 1075 struct hvs_pcb *pcb; 1076 struct vmbus_channel *channel; 1077 }; 1078 1079 static bool 1080 hvsock_chan_readable(struct vmbus_channel *chan) 1081 { 1082 uint32_t readable = vmbus_chan_read_available(chan); 1083 1084 return (readable >= HVSOCK_PKT_LEN(0)); 1085 } 1086 1087 static void 1088 hvsock_chan_cb(struct vmbus_channel *chan, void *context) 1089 { 1090 struct hvs_pcb *pcb = (struct hvs_pcb *) context; 1091 struct socket *so; 1092 uint32_t canwrite; 1093 1094 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1095 "%s: host send us a wakeup on rb data, pcb = %p\n", 1096 __func__, pcb); 1097 1098 /* 1099 * Check if the socket is still attached and valid. 1100 * Here we know channel is still open. Need to make 1101 * sure the socket has not been closed or freed. 1102 */ 1103 (void) hvs_trans_lock(); 1104 so = hsvpcb2so(pcb); 1105 1106 if (pcb->chan != NULL && so != NULL) { 1107 /* 1108 * Wake up reader if there are data to read. 1109 */ 1110 SOCKBUF_LOCK(&(so)->so_rcv); 1111 1112 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1113 "%s: read available = %u\n", __func__, 1114 vmbus_chan_read_available(pcb->chan)); 1115 1116 if (hvsock_chan_readable(pcb->chan)) 1117 sorwakeup_locked(so); 1118 else 1119 SOCKBUF_UNLOCK(&(so)->so_rcv); 1120 1121 /* 1122 * Wake up sender if space becomes available to write. 
1123 */ 1124 SOCKBUF_LOCK(&(so)->so_snd); 1125 canwrite = hvsock_canwrite_check(pcb); 1126 1127 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1128 "%s: canwrite = %u\n", __func__, canwrite); 1129 1130 if (canwrite > 0) { 1131 sowwakeup_locked(so); 1132 } else { 1133 SOCKBUF_UNLOCK(&(so)->so_snd); 1134 } 1135 } 1136 1137 hvs_trans_unlock(); 1138 1139 return; 1140 } 1141 1142 static int 1143 hvsock_br_callback(void *datap, int cplen, void *cbarg) 1144 { 1145 struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg; 1146 struct uio *uio = arg->uio; 1147 struct sockbuf *sb = arg->sb; 1148 int error = 0; 1149 1150 if (cbarg == NULL || datap == NULL) 1151 return (EINVAL); 1152 1153 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1154 "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, " 1155 "datap = %p\n", 1156 __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br", 1157 uio->uio_resid, cplen, datap); 1158 1159 if (sb) 1160 SOCKBUF_UNLOCK(sb); 1161 1162 error = uiomove(datap, cplen, uio); 1163 1164 if (sb) 1165 SOCKBUF_LOCK(sb); 1166 1167 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1168 "%s: after uiomove, uio_resid = %zd, error = %d\n", 1169 __func__, uio->uio_resid, error); 1170 1171 return (error); 1172 } 1173 1174 static int 1175 hvsock_send_data(struct vmbus_channel *chan, struct uio *uio, 1176 uint32_t to_write, struct sockbuf *sb) 1177 { 1178 struct hvs_pkt_header hvs_pkt; 1179 int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0; 1180 uint64_t pad = 0; 1181 struct iovec iov[3]; 1182 struct hvs_callback_arg cbarg; 1183 1184 if (chan == NULL) 1185 return (ENOTCONN); 1186 1187 hlen = sizeof(struct vmbus_chanpkt_hdr); 1188 hvs_pkthlen = sizeof(struct hvs_pkt_header); 1189 hvs_pktlen = hvs_pkthlen + to_write; 1190 pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen); 1191 1192 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1193 "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, " 1194 "pad_pktlen = %u, data_len = %u\n", 1195 __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write); 1196 1197 
hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND; 1198 hvs_pkt.chan_pkt_hdr.cph_flags = 0; 1199 VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen); 1200 VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen); 1201 hvs_pkt.chan_pkt_hdr.cph_xactid = 0; 1202 1203 hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1; 1204 hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write; 1205 1206 cbarg.uio = uio; 1207 cbarg.sb = sb; 1208 1209 if (uio && to_write > 0) { 1210 iov[0].iov_base = &hvs_pkt; 1211 iov[0].iov_len = hvs_pkthlen; 1212 iov[1].iov_base = NULL; 1213 iov[1].iov_len = to_write; 1214 iov[2].iov_base = &pad; 1215 iov[2].iov_len = pad_pktlen - hvs_pktlen; 1216 1217 error = vmbus_chan_iov_send(chan, iov, 3, 1218 hvsock_br_callback, &cbarg); 1219 } else { 1220 if (to_write == 0) { 1221 iov[0].iov_base = &hvs_pkt; 1222 iov[0].iov_len = hvs_pkthlen; 1223 iov[1].iov_base = &pad; 1224 iov[1].iov_len = pad_pktlen - hvs_pktlen; 1225 error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL); 1226 } 1227 } 1228 1229 if (error) { 1230 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1231 "%s: error = %d\n", __func__, error); 1232 } 1233 1234 return (error); 1235 } 1236 1237 /* 1238 * Check if we have data on current ring buffer to read 1239 * or not. If not, advance the ring buffer read index to 1240 * next packet. Update the recev_data_len and recev_data_off 1241 * to new value. 1242 * Return the number of bytes can read. 
1243 */ 1244 static uint32_t 1245 hvsock_canread_check(struct hvs_pcb *pcb) 1246 { 1247 uint32_t advance; 1248 uint32_t tlen, hlen, dlen; 1249 uint32_t bytes_canread = 0; 1250 int error; 1251 1252 if (pcb == NULL || pcb->chan == NULL) { 1253 pcb->so->so_error = EIO; 1254 return (0); 1255 } 1256 1257 /* Still have data not read yet on current packet */ 1258 if (pcb->recv_data_len > 0) 1259 return (pcb->recv_data_len); 1260 1261 if (pcb->rb_init) 1262 advance = 1263 VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); 1264 else 1265 advance = 0; 1266 1267 bytes_canread = vmbus_chan_read_available(pcb->chan); 1268 1269 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1270 "%s: bytes_canread on br = %u, advance = %u\n", 1271 __func__, bytes_canread, advance); 1272 1273 if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) { 1274 /* 1275 * Nothing to read. Need to advance the rindex before 1276 * calling sbwait, so host knows to wake us up when data 1277 * is available to read on rb. 1278 */ 1279 error = vmbus_chan_recv_idxadv(pcb->chan, advance); 1280 if (error) { 1281 HVSOCK_DBG(HVSOCK_DBG_ERR, 1282 "%s: after calling vmbus_chan_recv_idxadv, " 1283 "got error = %d\n", __func__, error); 1284 return (0); 1285 } else { 1286 pcb->rb_init = false; 1287 pcb->recv_data_len = 0; 1288 pcb->recv_data_off = 0; 1289 bytes_canread = vmbus_chan_read_available(pcb->chan); 1290 1291 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1292 "%s: advanced %u bytes, " 1293 " bytes_canread on br now = %u\n", 1294 __func__, advance, bytes_canread); 1295 1296 if (bytes_canread == 0) 1297 return (0); 1298 else 1299 advance = 0; 1300 } 1301 } 1302 1303 if (bytes_canread < 1304 advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t))) 1305 return (0); 1306 1307 error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt, 1308 sizeof(struct hvs_pkt_header), advance); 1309 1310 /* Don't have anything to read */ 1311 if (error) { 1312 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1313 "%s: after calling vmbus_chan_recv_peek, got 
error = %d\n", 1314 __func__, error); 1315 return (0); 1316 } 1317 1318 /* 1319 * We just read in a new packet header. Do some sanity checks. 1320 */ 1321 tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); 1322 hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen); 1323 dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size; 1324 if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) || 1325 __predict_false(hlen > tlen) || 1326 __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) { 1327 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1328 "invalid tlen(%u), hlen(%u) or dlen(%u)\n", 1329 tlen, hlen, dlen); 1330 pcb->so->so_error = EIO; 1331 return (0); 1332 } 1333 if (pcb->rb_init == false) 1334 pcb->rb_init = true; 1335 1336 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1337 "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n", 1338 tlen, hlen, dlen); 1339 1340 /* The other side has sent a close FIN */ 1341 if (dlen == 0) { 1342 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1343 "%s: Received FIN from other side\n", __func__); 1344 /* inform the caller by seting so_error to ESHUTDOWN */ 1345 pcb->so->so_error = ESHUTDOWN; 1346 } 1347 1348 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1349 "%s: canread on receive ring is %u \n", __func__, dlen); 1350 1351 pcb->recv_data_len = dlen; 1352 pcb->recv_data_off = 0; 1353 1354 return (pcb->recv_data_len); 1355 } 1356 1357 static uint32_t 1358 hvsock_canwrite_check(struct hvs_pcb *pcb) 1359 { 1360 uint32_t writeable; 1361 uint32_t ret; 1362 1363 if (pcb == NULL || pcb->chan == NULL) 1364 return (0); 1365 1366 writeable = vmbus_chan_write_available(pcb->chan); 1367 1368 /* 1369 * We must always reserve a 0-length-payload packet for the FIN. 1370 */ 1371 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1372 "%s: writeable is %u, should be greater than %ju\n", 1373 __func__, writeable, 1374 (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0))); 1375 1376 if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) { 1377 /* 1378 * The Tx ring seems full. 
1379 */ 1380 return (0); 1381 } 1382 1383 ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0); 1384 1385 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1386 "%s: available size is %u\n", __func__, rounddown2(ret, 8)); 1387 1388 return (rounddown2(ret, 8)); 1389 } 1390 1391 static void 1392 hvsock_set_chan_pending_send_size(struct vmbus_channel *chan) 1393 { 1394 vmbus_chan_set_pending_send_size(chan, 1395 HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ)); 1396 } 1397 1398 static int 1399 hvsock_open_channel(struct vmbus_channel *chan, struct socket *so) 1400 { 1401 unsigned int rcvbuf, sndbuf; 1402 struct hvs_pcb *pcb = so2hvspcb(so); 1403 int ret; 1404 1405 if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) { 1406 sndbuf = HVS_RINGBUF_SND_SIZE; 1407 rcvbuf = HVS_RINGBUF_RCV_SIZE; 1408 } else { 1409 sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE); 1410 sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE); 1411 sndbuf = rounddown2(sndbuf, PAGE_SIZE); 1412 rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE); 1413 rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE); 1414 rcvbuf = rounddown2(rcvbuf, PAGE_SIZE); 1415 } 1416 1417 /* 1418 * Can only read whatever user provided size of data 1419 * from ring buffer. Turn off batched reading. 1420 */ 1421 vmbus_chan_set_readbatch(chan, false); 1422 1423 ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0, 1424 hvsock_chan_cb, pcb); 1425 1426 if (ret != 0) { 1427 HVSOCK_DBG(HVSOCK_DBG_ERR, 1428 "%s: failed to open hvsock channel, sndbuf = %u, " 1429 "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); 1430 } else { 1431 HVSOCK_DBG(HVSOCK_DBG_INFO, 1432 "%s: hvsock channel opened, sndbuf = %u, i" 1433 "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); 1434 /* 1435 * Se the pending send size so to receive wakeup 1436 * signals from host when there is enough space on 1437 * rx buffer ring to write. 1438 */ 1439 hvsock_set_chan_pending_send_size(chan); 1440 } 1441 1442 return ret; 1443 } 1444 1445 /* 1446 * Guest is listening passively on the socket. 
Open channel and 1447 * create a new socket for the conneciton. 1448 */ 1449 static void 1450 hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so, 1451 struct hvsock_sc *sc) 1452 { 1453 struct socket *new_so; 1454 struct hvs_pcb *new_pcb, *pcb; 1455 int error; 1456 1457 /* Do nothing if socket is not listening */ 1458 if (!SOLISTENING(so)) { 1459 HVSOCK_DBG(HVSOCK_DBG_ERR, 1460 "%s: socket is not a listening one\n", __func__); 1461 return; 1462 } 1463 1464 /* 1465 * Create a new socket. This will call pru_attach to complete 1466 * the socket initialization and put the new socket onto 1467 * listening socket's sol_incomp list, waiting to be promoted 1468 * to sol_comp list. 1469 * The new socket created has ref count 0. There is no other 1470 * thread that changes the state of this new one at the 1471 * moment, so we don't need to hold its lock while opening 1472 * channel and filling out its pcb information. 1473 */ 1474 new_so = sonewconn(so, 0); 1475 if (!new_so) 1476 HVSOCK_DBG(HVSOCK_DBG_ERR, 1477 "%s: creating new socket failed\n", __func__); 1478 1479 /* 1480 * Now open the vmbus channel. If it fails, the socket will be 1481 * on the listening socket's sol_incomp queue until it is 1482 * replaced and aborted. 
1483 */ 1484 error = hvsock_open_channel(chan, new_so); 1485 if (error) { 1486 new_so->so_error = error; 1487 return; 1488 } 1489 1490 pcb = so->so_pcb; 1491 new_pcb = new_so->so_pcb; 1492 1493 hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port); 1494 /* Remote port is unknown to guest in this type of conneciton */ 1495 hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN); 1496 new_pcb->chan = chan; 1497 new_pcb->recv_data_len = 0; 1498 new_pcb->recv_data_off = 0; 1499 new_pcb->rb_init = false; 1500 1501 new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan); 1502 new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan); 1503 1504 hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED); 1505 1506 sc->pcb = new_pcb; 1507 1508 /* 1509 * Change the socket state to SS_ISCONNECTED. This will promote 1510 * the socket to sol_comp queue and wake up the thread which 1511 * is accepting connection. 1512 */ 1513 soisconnected(new_so); 1514 } 1515 1516 1517 /* 1518 * Guest is actively connecting to host. 1519 */ 1520 static void 1521 hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so) 1522 { 1523 struct hvs_pcb *pcb; 1524 int error; 1525 1526 error = hvsock_open_channel(chan, so); 1527 if (error) { 1528 so->so_error = error; 1529 return; 1530 } 1531 1532 pcb = so->so_pcb; 1533 pcb->chan = chan; 1534 pcb->recv_data_len = 0; 1535 pcb->recv_data_off = 0; 1536 pcb->rb_init = false; 1537 1538 mtx_lock(&hvs_trans_socks_mtx); 1539 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 1540 __hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED); 1541 mtx_unlock(&hvs_trans_socks_mtx); 1542 1543 /* 1544 * Change the socket state to SS_ISCONNECTED. This will wake up 1545 * the thread sleeping in connect call. 
1546 */ 1547 soisconnected(so); 1548 } 1549 1550 static void 1551 hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc) 1552 { 1553 struct hyperv_guid *inst_guid, *type_guid; 1554 bool conn_from_host; 1555 struct sockaddr_hvs addr; 1556 struct socket *so; 1557 struct hvs_pcb *pcb; 1558 1559 type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan); 1560 inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan); 1561 conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan); 1562 1563 HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is "); 1564 hvsock_print_guid(type_guid); 1565 HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is "); 1566 hvsock_print_guid(inst_guid); 1567 HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n", 1568 (conn_from_host == true ) ? "from" : "to"); 1569 1570 /* 1571 * The listening port should be in [0, MAX_LISTEN_PORT] 1572 */ 1573 if (!is_valid_srv_id(type_guid)) 1574 return; 1575 1576 /* 1577 * There should be a bound socket already created no matter 1578 * it is a passive or active connection. 1579 * For host initiated connection (passive on guest side), 1580 * the type_guid contains the port which guest is bound and 1581 * listening. 1582 * For the guest initiated connection (active on guest side), 1583 * the inst_guid contains the port that guest has auto bound 1584 * to. 1585 */ 1586 hvs_addr_init(&addr, conn_from_host ? 
type_guid : inst_guid); 1587 so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND); 1588 if (!so) { 1589 HVSOCK_DBG(HVSOCK_DBG_ERR, 1590 "%s: no bound socket found for port %u\n", 1591 __func__, addr.hvs_port); 1592 return; 1593 } 1594 1595 if (conn_from_host) { 1596 hvsock_open_conn_passive(chan, so, sc); 1597 } else { 1598 (void) hvs_trans_lock(); 1599 pcb = so->so_pcb; 1600 if (pcb && pcb->so) { 1601 sc->pcb = so2hvspcb(so); 1602 hvsock_open_conn_active(chan, so); 1603 } else { 1604 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1605 "%s: channel detached before open\n", __func__); 1606 } 1607 hvs_trans_unlock(); 1608 } 1609 1610 } 1611 1612 static int 1613 hvsock_probe(device_t dev) 1614 { 1615 struct vmbus_channel *channel = vmbus_get_channel(dev); 1616 1617 if (!channel || !vmbus_chan_is_hvs(channel)) { 1618 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1619 "hvsock_probe called but not a hvsock channel id %u\n", 1620 vmbus_chan_id(channel)); 1621 1622 return ENXIO; 1623 } else { 1624 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1625 "hvsock_probe got a hvsock channel id %u\n", 1626 vmbus_chan_id(channel)); 1627 1628 return BUS_PROBE_DEFAULT; 1629 } 1630 } 1631 1632 static int 1633 hvsock_attach(device_t dev) 1634 { 1635 struct vmbus_channel *channel = vmbus_get_channel(dev); 1636 struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); 1637 1638 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n"); 1639 1640 hvsock_open_connection(channel, sc); 1641 1642 /* 1643 * Always return success. On error the host will rescind the device 1644 * in 30 seconds and we can do cleanup at that time in 1645 * vmbus_chan_msgproc_chrescind(). 
1646 */ 1647 return (0); 1648 } 1649 1650 static int 1651 hvsock_detach(device_t dev) 1652 { 1653 struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); 1654 struct socket *so; 1655 int retry; 1656 1657 if (bootverbose) 1658 device_printf(dev, "hvsock_detach called.\n"); 1659 1660 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n"); 1661 1662 if (sc->pcb != NULL) { 1663 (void) hvs_trans_lock(); 1664 1665 so = hsvpcb2so(sc->pcb); 1666 if (so) { 1667 /* Close the connection */ 1668 if (so->so_state & 1669 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 1670 soisdisconnected(so); 1671 } 1672 1673 mtx_lock(&hvs_trans_socks_mtx); 1674 __hvs_remove_pcb_from_list(sc->pcb, 1675 HVS_LIST_BOUND | HVS_LIST_CONNECTED); 1676 mtx_unlock(&hvs_trans_socks_mtx); 1677 1678 /* 1679 * Close channel while no reader and sender are working 1680 * on the buffer rings. 1681 */ 1682 if (so) { 1683 retry = 0; 1684 while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) { 1685 /* 1686 * Someone is reading, rx br is busy 1687 */ 1688 soisdisconnected(so); 1689 DELAY(500); 1690 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1691 "waiting for rx reader to exit, " 1692 "retry = %d\n", retry++); 1693 } 1694 retry = 0; 1695 while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) { 1696 /* 1697 * Someone is sending, tx br is busy 1698 */ 1699 soisdisconnected(so); 1700 DELAY(500); 1701 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1702 "waiting for tx sender to exit, " 1703 "retry = %d\n", retry++); 1704 } 1705 } 1706 1707 1708 bzero(sc->pcb, sizeof(struct hvs_pcb)); 1709 free(sc->pcb, M_HVSOCK); 1710 sc->pcb = NULL; 1711 1712 if (so) { 1713 SOCK_IO_RECV_UNLOCK(so); 1714 SOCK_IO_SEND_UNLOCK(so); 1715 so->so_pcb = NULL; 1716 } 1717 1718 hvs_trans_unlock(); 1719 } 1720 1721 vmbus_chan_close(vmbus_get_channel(dev)); 1722 1723 return (0); 1724 } 1725 1726 static device_method_t hvsock_methods[] = { 1727 /* Device interface */ 1728 DEVMETHOD(device_probe, hvsock_probe), 1729 DEVMETHOD(device_attach, hvsock_attach), 1730 
DEVMETHOD(device_detach, hvsock_detach), 1731 DEVMETHOD_END 1732 }; 1733 1734 static driver_t hvsock_driver = { 1735 "hv_sock", 1736 hvsock_methods, 1737 sizeof(struct hvsock_sc) 1738 }; 1739 1740 DRIVER_MODULE(hvsock, vmbus, hvsock_driver, NULL, NULL); 1741 MODULE_VERSION(hvsock, 1); 1742 MODULE_DEPEND(hvsock, vmbus, 1, 1, 1); 1743