1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2020 Microsoft Corp. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/bus.h> 34 #include <sys/domain.h> 35 #include <sys/lock.h> 36 #include <sys/kernel.h> 37 #include <sys/types.h> 38 #include <sys/malloc.h> 39 #include <sys/module.h> 40 #include <sys/mutex.h> 41 #include <sys/proc.h> 42 #include <sys/protosw.h> 43 #include <sys/socket.h> 44 #include <sys/sysctl.h> 45 #include <sys/sysproto.h> 46 #include <sys/systm.h> 47 #include <sys/sockbuf.h> 48 #include <sys/sx.h> 49 #include <sys/uio.h> 50 51 #include <net/vnet.h> 52 53 #include <dev/hyperv/vmbus/vmbus_reg.h> 54 55 #include "hv_sock.h" 56 57 #define HVSOCK_DBG_NONE 0x0 58 #define HVSOCK_DBG_INFO 0x1 59 #define HVSOCK_DBG_ERR 0x2 60 #define HVSOCK_DBG_VERBOSE 0x3 61 62 63 SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket"); 64 65 static int hvs_dbg_level; 66 SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level, 67 0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose"); 68 69 70 #define HVSOCK_DBG(level, ...) 
do { \ 71 if (hvs_dbg_level >= (level)) \ 72 printf(__VA_ARGS__); \ 73 } while (0) 74 75 MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures"); 76 77 static int hvs_dom_probe(void); 78 79 /* The MTU is 16KB per host side's design */ 80 #define HVSOCK_MTU_SIZE (1024 * 16) 81 #define HVSOCK_SEND_BUF_SZ (PAGE_SIZE - sizeof(struct vmpipe_proto_header)) 82 83 #define HVSOCK_HEADER_LEN (sizeof(struct hvs_pkt_header)) 84 85 #define HVSOCK_PKT_LEN(payload_len) (HVSOCK_HEADER_LEN + \ 86 roundup2(payload_len, 8) + \ 87 sizeof(uint64_t)) 88 89 /* 90 * HyperV Transport sockets 91 */ 92 static struct protosw hv_socket_protosw = { 93 .pr_type = SOCK_STREAM, 94 .pr_protocol = HYPERV_SOCK_PROTO_TRANS, 95 .pr_flags = PR_CONNREQUIRED, 96 .pr_attach = hvs_trans_attach, 97 .pr_bind = hvs_trans_bind, 98 .pr_listen = hvs_trans_listen, 99 .pr_accept = hvs_trans_accept, 100 .pr_connect = hvs_trans_connect, 101 .pr_peeraddr = hvs_trans_peeraddr, 102 .pr_sockaddr = hvs_trans_sockaddr, 103 .pr_soreceive = hvs_trans_soreceive, 104 .pr_sosend = hvs_trans_sosend, 105 .pr_disconnect = hvs_trans_disconnect, 106 .pr_close = hvs_trans_close, 107 .pr_detach = hvs_trans_detach, 108 .pr_shutdown = hvs_trans_shutdown, 109 .pr_abort = hvs_trans_abort, 110 }; 111 112 static struct domain hv_socket_domain = { 113 .dom_family = AF_HYPERV, 114 .dom_name = "hyperv", 115 .dom_probe = hvs_dom_probe, 116 .dom_nprotosw = 1, 117 .dom_protosw = { &hv_socket_protosw }, 118 }; 119 120 DOMAIN_SET(hv_socket_); 121 122 #define MAX_PORT ((uint32_t)0xFFFFFFFF) 123 #define MIN_PORT ((uint32_t)0x0) 124 125 /* 00000000-facb-11e6-bd58-64006a7986d3 */ 126 static const struct hyperv_guid srv_id_template = { 127 .hv_guid = { 128 0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11, 129 0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 } 130 }; 131 132 static int hvsock_br_callback(void *, int, void *); 133 static uint32_t hvsock_canread_check(struct hvs_pcb *); 134 static uint32_t hvsock_canwrite_check(struct 
hvs_pcb *); 135 static int hvsock_send_data(struct vmbus_channel *chan, 136 struct uio *uio, uint32_t to_write, struct sockbuf *sb); 137 138 139 140 /* Globals */ 141 static struct sx hvs_trans_socks_sx; 142 static struct mtx hvs_trans_socks_mtx; 143 static LIST_HEAD(, hvs_pcb) hvs_trans_bound_socks; 144 static LIST_HEAD(, hvs_pcb) hvs_trans_connected_socks; 145 static uint32_t previous_auto_bound_port; 146 147 static void 148 hvsock_print_guid(struct hyperv_guid *guid) 149 { 150 unsigned char *p = (unsigned char *)guid; 151 152 HVSOCK_DBG(HVSOCK_DBG_INFO, 153 "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n", 154 *(unsigned int *)p, 155 *((unsigned short *) &p[4]), 156 *((unsigned short *) &p[6]), 157 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 158 } 159 160 static bool 161 is_valid_srv_id(const struct hyperv_guid *id) 162 { 163 return !memcmp(&id->hv_guid[4], 164 &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4); 165 } 166 167 static unsigned int 168 get_port_by_srv_id(const struct hyperv_guid *srv_id) 169 { 170 return *((const unsigned int *)srv_id); 171 } 172 173 static void 174 set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port) 175 { 176 *((unsigned int *)srv_id) = port; 177 } 178 179 180 static void 181 __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list) 182 { 183 struct hvs_pcb *p = NULL; 184 185 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); 186 187 if (!pcb) 188 return; 189 190 if (list & HVS_LIST_BOUND) { 191 LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) 192 if (p == pcb) 193 LIST_REMOVE(p, bound_next); 194 } 195 196 if (list & HVS_LIST_CONNECTED) { 197 LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) 198 if (p == pcb) 199 LIST_REMOVE(pcb, connected_next); 200 } 201 } 202 203 static void 204 __hvs_remove_socket_from_list(struct socket *so, unsigned char list) 205 { 206 struct hvs_pcb *pcb = so2hvspcb(so); 207 208 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is 
%p\n", __func__, pcb); 209 210 __hvs_remove_pcb_from_list(pcb, list); 211 } 212 213 static void 214 __hvs_insert_socket_on_list(struct socket *so, unsigned char list) 215 { 216 struct hvs_pcb *pcb = so2hvspcb(so); 217 218 if (list & HVS_LIST_BOUND) 219 LIST_INSERT_HEAD(&hvs_trans_bound_socks, 220 pcb, bound_next); 221 222 if (list & HVS_LIST_CONNECTED) 223 LIST_INSERT_HEAD(&hvs_trans_connected_socks, 224 pcb, connected_next); 225 } 226 227 void 228 hvs_remove_socket_from_list(struct socket *so, unsigned char list) 229 { 230 if (!so || !so->so_pcb) { 231 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 232 "%s: socket or so_pcb is null\n", __func__); 233 return; 234 } 235 236 mtx_lock(&hvs_trans_socks_mtx); 237 __hvs_remove_socket_from_list(so, list); 238 mtx_unlock(&hvs_trans_socks_mtx); 239 } 240 241 static void 242 hvs_insert_socket_on_list(struct socket *so, unsigned char list) 243 { 244 if (!so || !so->so_pcb) { 245 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 246 "%s: socket or so_pcb is null\n", __func__); 247 return; 248 } 249 250 mtx_lock(&hvs_trans_socks_mtx); 251 __hvs_insert_socket_on_list(so, list); 252 mtx_unlock(&hvs_trans_socks_mtx); 253 } 254 255 static struct socket * 256 __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) 257 { 258 struct hvs_pcb *p = NULL; 259 260 if (list & HVS_LIST_BOUND) 261 LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) 262 if (p->so != NULL && 263 addr->hvs_port == p->local_addr.hvs_port) 264 return p->so; 265 266 if (list & HVS_LIST_CONNECTED) 267 LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) 268 if (p->so != NULL && 269 addr->hvs_port == p->local_addr.hvs_port) 270 return p->so; 271 272 return NULL; 273 } 274 275 static struct socket * 276 hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) 277 { 278 struct socket *s = NULL; 279 280 mtx_lock(&hvs_trans_socks_mtx); 281 s = __hvs_find_socket_on_list(addr, list); 282 mtx_unlock(&hvs_trans_socks_mtx); 283 284 return s; 285 } 286 287 static inline 
void 288 hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port) 289 { 290 memset(addr, 0, sizeof(*addr)); 291 addr->sa_family = AF_HYPERV; 292 addr->sa_len = sizeof(*addr); 293 addr->hvs_port = port; 294 } 295 296 void 297 hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id) 298 { 299 hvs_addr_set(addr, get_port_by_srv_id(svr_id)); 300 } 301 302 int 303 hvs_trans_lock(void) 304 { 305 sx_xlock(&hvs_trans_socks_sx); 306 return (0); 307 } 308 309 void 310 hvs_trans_unlock(void) 311 { 312 sx_xunlock(&hvs_trans_socks_sx); 313 } 314 315 static int 316 hvs_dom_probe(void) 317 { 318 319 /* Don't even give us a chance to attach on non-HyperV. */ 320 if (vm_guest != VM_GUEST_HV) 321 return (ENXIO); 322 return (0); 323 } 324 325 static void 326 hvs_trans_init(void *arg __unused) 327 { 328 329 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 330 "%s: HyperV Socket hvs_trans_init called\n", __func__); 331 332 /* Initialize Globals */ 333 previous_auto_bound_port = MAX_PORT; 334 sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx"); 335 mtx_init(&hvs_trans_socks_mtx, 336 "hvs_trans_socks_mtx", NULL, MTX_DEF); 337 LIST_INIT(&hvs_trans_bound_socks); 338 LIST_INIT(&hvs_trans_connected_socks); 339 } 340 SYSINIT(hvs_trans_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, 341 hvs_trans_init, NULL); 342 343 /* 344 * Called in two cases: 345 * 1) When user calls socket(); 346 * 2) When we accept new incoming conneciton and call sonewconn(). 
347 */ 348 int 349 hvs_trans_attach(struct socket *so, int proto, struct thread *td) 350 { 351 struct hvs_pcb *pcb = so2hvspcb(so); 352 353 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 354 "%s: HyperV Socket hvs_trans_attach called\n", __func__); 355 356 if (so->so_type != SOCK_STREAM) 357 return (ESOCKTNOSUPPORT); 358 359 if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS) 360 return (EPROTONOSUPPORT); 361 362 if (pcb != NULL) 363 return (EISCONN); 364 pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO); 365 if (pcb == NULL) 366 return (ENOMEM); 367 368 pcb->so = so; 369 so->so_pcb = (void *)pcb; 370 371 return (0); 372 } 373 374 void 375 hvs_trans_detach(struct socket *so) 376 { 377 struct hvs_pcb *pcb; 378 379 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 380 "%s: HyperV Socket hvs_trans_detach called\n", __func__); 381 382 (void) hvs_trans_lock(); 383 pcb = so2hvspcb(so); 384 if (pcb == NULL) { 385 hvs_trans_unlock(); 386 return; 387 } 388 389 if (SOLISTENING(so)) { 390 bzero(pcb, sizeof(*pcb)); 391 free(pcb, M_HVSOCK); 392 } 393 394 so->so_pcb = NULL; 395 396 hvs_trans_unlock(); 397 } 398 399 int 400 hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td) 401 { 402 struct hvs_pcb *pcb = so2hvspcb(so); 403 struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr; 404 int error = 0; 405 406 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 407 "%s: HyperV Socket hvs_trans_bind called\n", __func__); 408 409 if (sa == NULL) { 410 return (EINVAL); 411 } 412 413 if (pcb == NULL) { 414 return (EINVAL); 415 } 416 417 if (sa->sa_family != AF_HYPERV) { 418 HVSOCK_DBG(HVSOCK_DBG_ERR, 419 "%s: Not supported, sa_family is %u\n", 420 __func__, sa->sa_family); 421 return (EAFNOSUPPORT); 422 } 423 if (sa->sa_len != sizeof(*sa)) { 424 HVSOCK_DBG(HVSOCK_DBG_ERR, 425 "%s: Not supported, sa_len is %u\n", 426 __func__, sa->sa_len); 427 return (EINVAL); 428 } 429 430 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 431 "%s: binding port = 0x%x\n", __func__, sa->hvs_port); 432 433 mtx_lock(&hvs_trans_socks_mtx); 434 
if (__hvs_find_socket_on_list(sa, 435 HVS_LIST_BOUND | HVS_LIST_CONNECTED)) { 436 error = EADDRINUSE; 437 } else { 438 /* 439 * The address is available for us to bind. 440 * Add socket to the bound list. 441 */ 442 hvs_addr_set(&pcb->local_addr, sa->hvs_port); 443 hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY); 444 __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); 445 } 446 mtx_unlock(&hvs_trans_socks_mtx); 447 448 return (error); 449 } 450 451 int 452 hvs_trans_listen(struct socket *so, int backlog, struct thread *td) 453 { 454 struct hvs_pcb *pcb = so2hvspcb(so); 455 struct socket *bound_so; 456 int error; 457 458 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 459 "%s: HyperV Socket hvs_trans_listen called\n", __func__); 460 461 if (pcb == NULL) 462 return (EINVAL); 463 464 /* Check if the address is already bound and it was by us. */ 465 bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND); 466 if (bound_so == NULL || bound_so != so) { 467 HVSOCK_DBG(HVSOCK_DBG_ERR, 468 "%s: Address not bound or not by us.\n", __func__); 469 return (EADDRNOTAVAIL); 470 } 471 472 SOCK_LOCK(so); 473 error = solisten_proto_check(so); 474 if (error == 0) 475 solisten_proto(so, backlog); 476 SOCK_UNLOCK(so); 477 478 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 479 "%s: HyperV Socket listen error = %d\n", __func__, error); 480 return (error); 481 } 482 483 int 484 hvs_trans_accept(struct socket *so, struct sockaddr **nam) 485 { 486 struct hvs_pcb *pcb = so2hvspcb(so); 487 488 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 489 "%s: HyperV Socket hvs_trans_accept called\n", __func__); 490 491 if (pcb == NULL) 492 return (EINVAL); 493 494 *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, 495 M_NOWAIT); 496 497 return ((*nam == NULL) ? 
ENOMEM : 0); 498 } 499 500 int 501 hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 502 { 503 struct hvs_pcb *pcb = so2hvspcb(so); 504 struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam; 505 bool found_auto_bound_port = false; 506 int i, error = 0; 507 508 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 509 "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n", 510 __func__, raddr->hvs_port); 511 512 if (pcb == NULL) 513 return (EINVAL); 514 515 /* Verify the remote address */ 516 if (raddr == NULL) 517 return (EINVAL); 518 if (raddr->sa_family != AF_HYPERV) 519 return (EAFNOSUPPORT); 520 if (raddr->sa_len != sizeof(*raddr)) 521 return (EINVAL); 522 523 mtx_lock(&hvs_trans_socks_mtx); 524 if (so->so_state & 525 (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) { 526 HVSOCK_DBG(HVSOCK_DBG_ERR, 527 "%s: socket connect in progress\n", 528 __func__); 529 error = EINPROGRESS; 530 goto out; 531 } 532 533 /* 534 * Find an available port for us to auto bind the local 535 * address. 
536 */ 537 hvs_addr_set(&pcb->local_addr, 0); 538 539 for (i = previous_auto_bound_port - 1; 540 i != previous_auto_bound_port; i --) { 541 if (i == MIN_PORT) 542 i = MAX_PORT; 543 544 pcb->local_addr.hvs_port = i; 545 546 if (__hvs_find_socket_on_list(&pcb->local_addr, 547 HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) { 548 found_auto_bound_port = true; 549 previous_auto_bound_port = i; 550 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 551 "%s: found local bound port is %x\n", 552 __func__, pcb->local_addr.hvs_port); 553 break; 554 } 555 } 556 557 if (found_auto_bound_port == true) { 558 /* Found available port for auto bound, put on list */ 559 __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); 560 /* Set VM service ID */ 561 pcb->vm_srv_id = srv_id_template; 562 set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port); 563 /* Set host service ID and remote port */ 564 pcb->host_srv_id = srv_id_template; 565 set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port); 566 hvs_addr_set(&pcb->remote_addr, raddr->hvs_port); 567 568 /* Change the socket state to SS_ISCONNECTING */ 569 soisconnecting(so); 570 } else { 571 HVSOCK_DBG(HVSOCK_DBG_ERR, 572 "%s: No local port available for auto bound\n", 573 __func__); 574 error = EADDRINUSE; 575 } 576 577 HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is "); 578 hvsock_print_guid(&pcb->vm_srv_id); 579 HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is "); 580 hvsock_print_guid(&pcb->host_srv_id); 581 582 out: 583 mtx_unlock(&hvs_trans_socks_mtx); 584 585 if (found_auto_bound_port == true) 586 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id); 587 588 return (error); 589 } 590 591 int 592 hvs_trans_disconnect(struct socket *so) 593 { 594 struct hvs_pcb *pcb; 595 596 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 597 "%s: HyperV Socket hvs_trans_disconnect called\n", __func__); 598 599 (void) hvs_trans_lock(); 600 pcb = so2hvspcb(so); 601 if (pcb == NULL) { 602 hvs_trans_unlock(); 603 return (EINVAL); 604 } 605 606 /* If socket is already 
disconnected, skip this */ 607 if ((so->so_state & SS_ISDISCONNECTED) == 0) 608 soisdisconnecting(so); 609 610 hvs_trans_unlock(); 611 612 return (0); 613 } 614 615 struct hvs_callback_arg { 616 struct uio *uio; 617 struct sockbuf *sb; 618 }; 619 620 int 621 hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr, 622 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 623 { 624 struct hvs_pcb *pcb = so2hvspcb(so); 625 struct sockbuf *sb; 626 ssize_t orig_resid; 627 uint32_t canread, to_read; 628 int flags, error = 0; 629 struct hvs_callback_arg cbarg; 630 631 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 632 "%s: HyperV Socket hvs_trans_soreceive called\n", __func__); 633 634 if (so->so_type != SOCK_STREAM) 635 return (EINVAL); 636 if (pcb == NULL) 637 return (EINVAL); 638 639 if (flagsp != NULL) 640 flags = *flagsp &~ MSG_EOR; 641 else 642 flags = 0; 643 644 if (flags & MSG_PEEK) 645 return (EOPNOTSUPP); 646 647 /* If no space to copy out anything */ 648 if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ) 649 return (EINVAL); 650 651 orig_resid = uio->uio_resid; 652 653 /* Prevent other readers from entering the socket. */ 654 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 655 if (error) { 656 HVSOCK_DBG(HVSOCK_DBG_ERR, 657 "%s: soiolock returned error = %d\n", __func__, error); 658 return (error); 659 } 660 661 sb = &so->so_rcv; 662 SOCKBUF_LOCK(sb); 663 664 cbarg.uio = uio; 665 cbarg.sb = sb; 666 /* 667 * If the socket is closing, there might still be some data 668 * in rx br to read. However we need to make sure 669 * the channel is still open. 
670 */ 671 if ((sb->sb_state & SBS_CANTRCVMORE) && 672 (so->so_state & SS_ISDISCONNECTED)) { 673 /* Other thread already closed the channel */ 674 error = EPIPE; 675 goto out; 676 } 677 678 while (true) { 679 while (uio->uio_resid > 0 && 680 (canread = hvsock_canread_check(pcb)) > 0) { 681 to_read = MIN(canread, uio->uio_resid); 682 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 683 "%s: to_read = %u, skip = %u\n", __func__, to_read, 684 (unsigned int)(sizeof(struct hvs_pkt_header) + 685 pcb->recv_data_off)); 686 687 error = vmbus_chan_recv_peek_call(pcb->chan, to_read, 688 sizeof(struct hvs_pkt_header) + pcb->recv_data_off, 689 hvsock_br_callback, (void *)&cbarg); 690 /* 691 * It is possible socket is disconnected becasue 692 * we released lock in hvsock_br_callback. So we 693 * need to check the state to make sure it is not 694 * disconnected. 695 */ 696 if (error || so->so_state & SS_ISDISCONNECTED) { 697 break; 698 } 699 700 pcb->recv_data_len -= to_read; 701 pcb->recv_data_off += to_read; 702 } 703 704 if (error) 705 break; 706 707 /* Abort if socket has reported problems. */ 708 if (so->so_error) { 709 if (so->so_error == ESHUTDOWN && 710 orig_resid > uio->uio_resid) { 711 /* 712 * Although we got a FIN, we also received 713 * some data in this round. Delivery it 714 * to user. 715 */ 716 error = 0; 717 } else { 718 if (so->so_error != ESHUTDOWN) 719 error = so->so_error; 720 } 721 722 break; 723 } 724 725 /* Cannot received more. 
*/ 726 if (sb->sb_state & SBS_CANTRCVMORE) 727 break; 728 729 /* We are done if buffer has been filled */ 730 if (uio->uio_resid == 0) 731 break; 732 733 if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid) 734 break; 735 736 /* Buffer ring is empty and we shall not block */ 737 if ((so->so_state & SS_NBIO) || 738 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 739 if (orig_resid == uio->uio_resid) { 740 /* We have not read anything */ 741 error = EAGAIN; 742 } 743 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 744 "%s: non blocked read return, error %d.\n", 745 __func__, error); 746 break; 747 } 748 749 /* 750 * Wait and block until (more) data comes in. 751 * Note: Drops the sockbuf lock during wait. 752 */ 753 error = sbwait(so, SO_RCV); 754 755 if (error) 756 break; 757 758 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 759 "%s: wake up from sbwait, read available is %u\n", 760 __func__, vmbus_chan_read_available(pcb->chan)); 761 } 762 763 out: 764 SOCKBUF_UNLOCK(sb); 765 SOCK_IO_RECV_UNLOCK(so); 766 767 /* We recieved a FIN in this call */ 768 if (so->so_error == ESHUTDOWN) { 769 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 770 /* Send has already closed */ 771 soisdisconnecting(so); 772 } else { 773 /* Just close the receive side */ 774 socantrcvmore(so); 775 } 776 } 777 778 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 779 "%s: returning error = %d, so_error = %d\n", 780 __func__, error, so->so_error); 781 782 return (error); 783 } 784 785 int 786 hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 787 struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td) 788 { 789 struct hvs_pcb *pcb = so2hvspcb(so); 790 struct sockbuf *sb; 791 ssize_t orig_resid; 792 uint32_t canwrite, to_write; 793 int error = 0; 794 795 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 796 "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n", 797 __func__, uio->uio_resid); 798 799 if (so->so_type != SOCK_STREAM) 800 return (EINVAL); 801 if (pcb == NULL) 802 return (EINVAL); 803 804 /* If nothing to send */ 
805 if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE) 806 return (EINVAL); 807 808 orig_resid = uio->uio_resid; 809 810 /* Prevent other writers from entering the socket. */ 811 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); 812 if (error) { 813 HVSOCK_DBG(HVSOCK_DBG_ERR, 814 "%s: soiolocak returned error = %d\n", __func__, error); 815 return (error); 816 } 817 818 sb = &so->so_snd; 819 SOCKBUF_LOCK(sb); 820 821 if ((sb->sb_state & SBS_CANTSENDMORE) || 822 so->so_error == ESHUTDOWN) { 823 error = EPIPE; 824 goto out; 825 } 826 827 while (uio->uio_resid > 0) { 828 canwrite = hvsock_canwrite_check(pcb); 829 if (canwrite == 0) { 830 /* We have sent some data */ 831 if (orig_resid > uio->uio_resid) 832 break; 833 /* 834 * We have not sent any data and it is 835 * non-blocked io 836 */ 837 if (so->so_state & SS_NBIO || 838 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 839 error = EWOULDBLOCK; 840 break; 841 } else { 842 /* 843 * We are here because there is no space on 844 * send buffer ring. Signal the other side 845 * to read and free more space. 846 * Sleep wait until space avaiable to send 847 * Note: Drops the sockbuf lock during wait. 
848 */ 849 error = sbwait(so, SO_SND); 850 851 if (error) 852 break; 853 854 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 855 "%s: wake up from sbwait, space avail on " 856 "tx ring is %u\n", 857 __func__, 858 vmbus_chan_write_available(pcb->chan)); 859 860 continue; 861 } 862 } 863 to_write = MIN(canwrite, uio->uio_resid); 864 to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ); 865 866 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 867 "%s: canwrite is %u, to_write = %u\n", __func__, 868 canwrite, to_write); 869 error = hvsock_send_data(pcb->chan, uio, to_write, sb); 870 871 if (error) 872 break; 873 } 874 875 out: 876 SOCKBUF_UNLOCK(sb); 877 SOCK_IO_SEND_UNLOCK(so); 878 879 return (error); 880 } 881 882 int 883 hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam) 884 { 885 struct hvs_pcb *pcb = so2hvspcb(so); 886 887 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 888 "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__); 889 890 if (pcb == NULL) 891 return (EINVAL); 892 893 *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT); 894 895 return ((*nam == NULL)? ENOMEM : 0); 896 } 897 898 int 899 hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam) 900 { 901 struct hvs_pcb *pcb = so2hvspcb(so); 902 903 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 904 "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__); 905 906 if (pcb == NULL) 907 return (EINVAL); 908 909 *nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT); 910 911 return ((*nam == NULL)? 
ENOMEM : 0); 912 } 913 914 void 915 hvs_trans_close(struct socket *so) 916 { 917 struct hvs_pcb *pcb; 918 919 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 920 "%s: HyperV Socket hvs_trans_close called\n", __func__); 921 922 (void) hvs_trans_lock(); 923 pcb = so2hvspcb(so); 924 if (!pcb) { 925 hvs_trans_unlock(); 926 return; 927 } 928 929 if (so->so_state & SS_ISCONNECTED) { 930 /* Send a FIN to peer */ 931 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 932 "%s: hvs_trans_close sending a FIN to host\n", __func__); 933 (void) hvsock_send_data(pcb->chan, NULL, 0, NULL); 934 } 935 936 if (so->so_state & 937 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 938 soisdisconnected(so); 939 940 pcb->chan = NULL; 941 pcb->so = NULL; 942 943 if (SOLISTENING(so)) { 944 mtx_lock(&hvs_trans_socks_mtx); 945 /* Remove from bound list */ 946 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 947 mtx_unlock(&hvs_trans_socks_mtx); 948 } 949 950 hvs_trans_unlock(); 951 952 return; 953 } 954 955 void 956 hvs_trans_abort(struct socket *so) 957 { 958 struct hvs_pcb *pcb = so2hvspcb(so); 959 960 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 961 "%s: HyperV Socket hvs_trans_abort called\n", __func__); 962 963 (void) hvs_trans_lock(); 964 if (pcb == NULL) { 965 hvs_trans_unlock(); 966 return; 967 } 968 969 if (SOLISTENING(so)) { 970 mtx_lock(&hvs_trans_socks_mtx); 971 /* Remove from bound list */ 972 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 973 mtx_unlock(&hvs_trans_socks_mtx); 974 } 975 976 if (so->so_state & SS_ISCONNECTED) { 977 (void) sodisconnect(so); 978 } 979 hvs_trans_unlock(); 980 981 return; 982 } 983 984 int 985 hvs_trans_shutdown(struct socket *so) 986 { 987 struct hvs_pcb *pcb = so2hvspcb(so); 988 struct sockbuf *sb; 989 990 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 991 "%s: HyperV Socket hvs_trans_shutdown called\n", __func__); 992 993 if (pcb == NULL) 994 return (EINVAL); 995 996 /* 997 * Only get called with the shutdown method is SHUT_WR or 998 * SHUT_RDWR. 
999 * When the method is SHUT_RD or SHUT_RDWR, the caller 1000 * already set the SBS_CANTRCVMORE on receive side socket 1001 * buffer. 1002 */ 1003 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 1004 /* 1005 * SHUT_WR only case. 1006 * Receive side is still open. Just close 1007 * the send side. 1008 */ 1009 socantsendmore(so); 1010 } else { 1011 /* SHUT_RDWR case */ 1012 if (so->so_state & SS_ISCONNECTED) { 1013 /* Send a FIN to peer */ 1014 sb = &so->so_snd; 1015 SOCKBUF_LOCK(sb); 1016 (void) hvsock_send_data(pcb->chan, NULL, 0, sb); 1017 SOCKBUF_UNLOCK(sb); 1018 1019 soisdisconnecting(so); 1020 } 1021 } 1022 1023 return (0); 1024 } 1025 1026 /* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is 1027 * <port> (see struct sockaddr_hvs). 1028 * 1029 * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV: 1030 * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user- 1031 * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with 1032 * the below sockaddr: 1033 * 1034 * struct SOCKADDR_HV 1035 * { 1036 * ADDRESS_FAMILY Family; 1037 * USHORT Reserved; 1038 * GUID VmId; 1039 * GUID ServiceId; 1040 * }; 1041 * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via 1042 * VMBus, because here it's obvious the host and the VM can easily identify 1043 * each other. Though the VmID is useful on the host, especially in the case 1044 * of Windows container, FreeBSD VM doesn't need it at all. 1045 * 1046 * To be compatible with similar infrastructure in Linux VMs, we have 1047 * to limit the available GUID space of SOCKADDR_HV so that we can create 1048 * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID. 
 * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
 *
 ****************************************************************************
 * The only valid Service GUIDs, from the perspectives of both the host and *
 * FreeBSD VM, that can be connected by the other end, must conform to this *
 * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
 ****************************************************************************
 *
 * When we write apps on the host to connect(), the GUID ServiceID is used.
 * When we write apps in FreeBSD VM to connect(), we only need to specify the
 * port and the driver will form the GUID and use that to request the host.
 *
 * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
 * auto-generated remote port for a connect request initiated by the host's
 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
 * FreeBSD guest.
 */

/*
 * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
 * restrict HyperV socket ring buffer size to six 4K pages. Newer
 * HyperV hosts don't have this limit.
1071 */ 1072 #define HVS_RINGBUF_RCV_SIZE (PAGE_SIZE * 6) 1073 #define HVS_RINGBUF_SND_SIZE (PAGE_SIZE * 6) 1074 #define HVS_RINGBUF_MAX_SIZE (PAGE_SIZE * 64) 1075 1076 struct hvsock_sc { 1077 device_t dev; 1078 struct hvs_pcb *pcb; 1079 struct vmbus_channel *channel; 1080 }; 1081 1082 static bool 1083 hvsock_chan_readable(struct vmbus_channel *chan) 1084 { 1085 uint32_t readable = vmbus_chan_read_available(chan); 1086 1087 return (readable >= HVSOCK_PKT_LEN(0)); 1088 } 1089 1090 static void 1091 hvsock_chan_cb(struct vmbus_channel *chan, void *context) 1092 { 1093 struct hvs_pcb *pcb = (struct hvs_pcb *) context; 1094 struct socket *so; 1095 uint32_t canwrite; 1096 1097 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1098 "%s: host send us a wakeup on rb data, pcb = %p\n", 1099 __func__, pcb); 1100 1101 /* 1102 * Check if the socket is still attached and valid. 1103 * Here we know channel is still open. Need to make 1104 * sure the socket has not been closed or freed. 1105 */ 1106 (void) hvs_trans_lock(); 1107 so = hsvpcb2so(pcb); 1108 1109 if (pcb->chan != NULL && so != NULL) { 1110 /* 1111 * Wake up reader if there are data to read. 1112 */ 1113 SOCKBUF_LOCK(&(so)->so_rcv); 1114 1115 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1116 "%s: read available = %u\n", __func__, 1117 vmbus_chan_read_available(pcb->chan)); 1118 1119 if (hvsock_chan_readable(pcb->chan)) 1120 sorwakeup_locked(so); 1121 else 1122 SOCKBUF_UNLOCK(&(so)->so_rcv); 1123 1124 /* 1125 * Wake up sender if space becomes available to write. 
1126 */ 1127 SOCKBUF_LOCK(&(so)->so_snd); 1128 canwrite = hvsock_canwrite_check(pcb); 1129 1130 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1131 "%s: canwrite = %u\n", __func__, canwrite); 1132 1133 if (canwrite > 0) { 1134 sowwakeup_locked(so); 1135 } else { 1136 SOCKBUF_UNLOCK(&(so)->so_snd); 1137 } 1138 } 1139 1140 hvs_trans_unlock(); 1141 1142 return; 1143 } 1144 1145 static int 1146 hvsock_br_callback(void *datap, int cplen, void *cbarg) 1147 { 1148 struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg; 1149 struct uio *uio = arg->uio; 1150 struct sockbuf *sb = arg->sb; 1151 int error = 0; 1152 1153 if (cbarg == NULL || datap == NULL) 1154 return (EINVAL); 1155 1156 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1157 "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, " 1158 "datap = %p\n", 1159 __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br", 1160 uio->uio_resid, cplen, datap); 1161 1162 if (sb) 1163 SOCKBUF_UNLOCK(sb); 1164 1165 error = uiomove(datap, cplen, uio); 1166 1167 if (sb) 1168 SOCKBUF_LOCK(sb); 1169 1170 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1171 "%s: after uiomove, uio_resid = %zd, error = %d\n", 1172 __func__, uio->uio_resid, error); 1173 1174 return (error); 1175 } 1176 1177 static int 1178 hvsock_send_data(struct vmbus_channel *chan, struct uio *uio, 1179 uint32_t to_write, struct sockbuf *sb) 1180 { 1181 struct hvs_pkt_header hvs_pkt; 1182 int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0; 1183 uint64_t pad = 0; 1184 struct iovec iov[3]; 1185 struct hvs_callback_arg cbarg; 1186 1187 if (chan == NULL) 1188 return (ENOTCONN); 1189 1190 hlen = sizeof(struct vmbus_chanpkt_hdr); 1191 hvs_pkthlen = sizeof(struct hvs_pkt_header); 1192 hvs_pktlen = hvs_pkthlen + to_write; 1193 pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen); 1194 1195 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1196 "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, " 1197 "pad_pktlen = %u, data_len = %u\n", 1198 __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write); 1199 1200 
hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND; 1201 hvs_pkt.chan_pkt_hdr.cph_flags = 0; 1202 VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen); 1203 VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen); 1204 hvs_pkt.chan_pkt_hdr.cph_xactid = 0; 1205 1206 hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1; 1207 hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write; 1208 1209 cbarg.uio = uio; 1210 cbarg.sb = sb; 1211 1212 if (uio && to_write > 0) { 1213 iov[0].iov_base = &hvs_pkt; 1214 iov[0].iov_len = hvs_pkthlen; 1215 iov[1].iov_base = NULL; 1216 iov[1].iov_len = to_write; 1217 iov[2].iov_base = &pad; 1218 iov[2].iov_len = pad_pktlen - hvs_pktlen; 1219 1220 error = vmbus_chan_iov_send(chan, iov, 3, 1221 hvsock_br_callback, &cbarg); 1222 } else { 1223 if (to_write == 0) { 1224 iov[0].iov_base = &hvs_pkt; 1225 iov[0].iov_len = hvs_pkthlen; 1226 iov[1].iov_base = &pad; 1227 iov[1].iov_len = pad_pktlen - hvs_pktlen; 1228 error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL); 1229 } 1230 } 1231 1232 if (error) { 1233 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1234 "%s: error = %d\n", __func__, error); 1235 } 1236 1237 return (error); 1238 } 1239 1240 /* 1241 * Check if we have data on current ring buffer to read 1242 * or not. If not, advance the ring buffer read index to 1243 * next packet. Update the recev_data_len and recev_data_off 1244 * to new value. 1245 * Return the number of bytes can read. 
1246 */ 1247 static uint32_t 1248 hvsock_canread_check(struct hvs_pcb *pcb) 1249 { 1250 uint32_t advance; 1251 uint32_t tlen, hlen, dlen; 1252 uint32_t bytes_canread = 0; 1253 int error; 1254 1255 if (pcb == NULL || pcb->chan == NULL) { 1256 pcb->so->so_error = EIO; 1257 return (0); 1258 } 1259 1260 /* Still have data not read yet on current packet */ 1261 if (pcb->recv_data_len > 0) 1262 return (pcb->recv_data_len); 1263 1264 if (pcb->rb_init) 1265 advance = 1266 VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); 1267 else 1268 advance = 0; 1269 1270 bytes_canread = vmbus_chan_read_available(pcb->chan); 1271 1272 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1273 "%s: bytes_canread on br = %u, advance = %u\n", 1274 __func__, bytes_canread, advance); 1275 1276 if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) { 1277 /* 1278 * Nothing to read. Need to advance the rindex before 1279 * calling sbwait, so host knows to wake us up when data 1280 * is available to read on rb. 1281 */ 1282 error = vmbus_chan_recv_idxadv(pcb->chan, advance); 1283 if (error) { 1284 HVSOCK_DBG(HVSOCK_DBG_ERR, 1285 "%s: after calling vmbus_chan_recv_idxadv, " 1286 "got error = %d\n", __func__, error); 1287 return (0); 1288 } else { 1289 pcb->rb_init = false; 1290 pcb->recv_data_len = 0; 1291 pcb->recv_data_off = 0; 1292 bytes_canread = vmbus_chan_read_available(pcb->chan); 1293 1294 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1295 "%s: advanced %u bytes, " 1296 " bytes_canread on br now = %u\n", 1297 __func__, advance, bytes_canread); 1298 1299 if (bytes_canread == 0) 1300 return (0); 1301 else 1302 advance = 0; 1303 } 1304 } 1305 1306 if (bytes_canread < 1307 advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t))) 1308 return (0); 1309 1310 error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt, 1311 sizeof(struct hvs_pkt_header), advance); 1312 1313 /* Don't have anything to read */ 1314 if (error) { 1315 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1316 "%s: after calling vmbus_chan_recv_peek, got 
error = %d\n", 1317 __func__, error); 1318 return (0); 1319 } 1320 1321 /* 1322 * We just read in a new packet header. Do some sanity checks. 1323 */ 1324 tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); 1325 hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen); 1326 dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size; 1327 if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) || 1328 __predict_false(hlen > tlen) || 1329 __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) { 1330 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1331 "invalid tlen(%u), hlen(%u) or dlen(%u)\n", 1332 tlen, hlen, dlen); 1333 pcb->so->so_error = EIO; 1334 return (0); 1335 } 1336 if (pcb->rb_init == false) 1337 pcb->rb_init = true; 1338 1339 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1340 "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n", 1341 tlen, hlen, dlen); 1342 1343 /* The other side has sent a close FIN */ 1344 if (dlen == 0) { 1345 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1346 "%s: Received FIN from other side\n", __func__); 1347 /* inform the caller by seting so_error to ESHUTDOWN */ 1348 pcb->so->so_error = ESHUTDOWN; 1349 } 1350 1351 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1352 "%s: canread on receive ring is %u \n", __func__, dlen); 1353 1354 pcb->recv_data_len = dlen; 1355 pcb->recv_data_off = 0; 1356 1357 return (pcb->recv_data_len); 1358 } 1359 1360 static uint32_t 1361 hvsock_canwrite_check(struct hvs_pcb *pcb) 1362 { 1363 uint32_t writeable; 1364 uint32_t ret; 1365 1366 if (pcb == NULL || pcb->chan == NULL) 1367 return (0); 1368 1369 writeable = vmbus_chan_write_available(pcb->chan); 1370 1371 /* 1372 * We must always reserve a 0-length-payload packet for the FIN. 1373 */ 1374 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1375 "%s: writeable is %u, should be greater than %ju\n", 1376 __func__, writeable, 1377 (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0))); 1378 1379 if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) { 1380 /* 1381 * The Tx ring seems full. 
1382 */ 1383 return (0); 1384 } 1385 1386 ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0); 1387 1388 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1389 "%s: available size is %u\n", __func__, rounddown2(ret, 8)); 1390 1391 return (rounddown2(ret, 8)); 1392 } 1393 1394 static void 1395 hvsock_set_chan_pending_send_size(struct vmbus_channel *chan) 1396 { 1397 vmbus_chan_set_pending_send_size(chan, 1398 HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ)); 1399 } 1400 1401 static int 1402 hvsock_open_channel(struct vmbus_channel *chan, struct socket *so) 1403 { 1404 unsigned int rcvbuf, sndbuf; 1405 struct hvs_pcb *pcb = so2hvspcb(so); 1406 int ret; 1407 1408 if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) { 1409 sndbuf = HVS_RINGBUF_SND_SIZE; 1410 rcvbuf = HVS_RINGBUF_RCV_SIZE; 1411 } else { 1412 sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE); 1413 sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE); 1414 sndbuf = rounddown2(sndbuf, PAGE_SIZE); 1415 rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE); 1416 rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE); 1417 rcvbuf = rounddown2(rcvbuf, PAGE_SIZE); 1418 } 1419 1420 /* 1421 * Can only read whatever user provided size of data 1422 * from ring buffer. Turn off batched reading. 1423 */ 1424 vmbus_chan_set_readbatch(chan, false); 1425 1426 ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0, 1427 hvsock_chan_cb, pcb); 1428 1429 if (ret != 0) { 1430 HVSOCK_DBG(HVSOCK_DBG_ERR, 1431 "%s: failed to open hvsock channel, sndbuf = %u, " 1432 "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); 1433 } else { 1434 HVSOCK_DBG(HVSOCK_DBG_INFO, 1435 "%s: hvsock channel opened, sndbuf = %u, i" 1436 "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); 1437 /* 1438 * Se the pending send size so to receive wakeup 1439 * signals from host when there is enough space on 1440 * rx buffer ring to write. 1441 */ 1442 hvsock_set_chan_pending_send_size(chan); 1443 } 1444 1445 return ret; 1446 } 1447 1448 /* 1449 * Guest is listening passively on the socket. 
Open channel and 1450 * create a new socket for the conneciton. 1451 */ 1452 static void 1453 hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so, 1454 struct hvsock_sc *sc) 1455 { 1456 struct socket *new_so; 1457 struct hvs_pcb *new_pcb, *pcb; 1458 int error; 1459 1460 /* Do nothing if socket is not listening */ 1461 if (!SOLISTENING(so)) { 1462 HVSOCK_DBG(HVSOCK_DBG_ERR, 1463 "%s: socket is not a listening one\n", __func__); 1464 return; 1465 } 1466 1467 /* 1468 * Create a new socket. This will call pru_attach to complete 1469 * the socket initialization and put the new socket onto 1470 * listening socket's sol_incomp list, waiting to be promoted 1471 * to sol_comp list. 1472 * The new socket created has ref count 0. There is no other 1473 * thread that changes the state of this new one at the 1474 * moment, so we don't need to hold its lock while opening 1475 * channel and filling out its pcb information. 1476 */ 1477 new_so = sonewconn(so, 0); 1478 if (!new_so) 1479 HVSOCK_DBG(HVSOCK_DBG_ERR, 1480 "%s: creating new socket failed\n", __func__); 1481 1482 /* 1483 * Now open the vmbus channel. If it fails, the socket will be 1484 * on the listening socket's sol_incomp queue until it is 1485 * replaced and aborted. 
1486 */ 1487 error = hvsock_open_channel(chan, new_so); 1488 if (error) { 1489 new_so->so_error = error; 1490 return; 1491 } 1492 1493 pcb = so->so_pcb; 1494 new_pcb = new_so->so_pcb; 1495 1496 hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port); 1497 /* Remote port is unknown to guest in this type of conneciton */ 1498 hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN); 1499 new_pcb->chan = chan; 1500 new_pcb->recv_data_len = 0; 1501 new_pcb->recv_data_off = 0; 1502 new_pcb->rb_init = false; 1503 1504 new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan); 1505 new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan); 1506 1507 hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED); 1508 1509 sc->pcb = new_pcb; 1510 1511 /* 1512 * Change the socket state to SS_ISCONNECTED. This will promote 1513 * the socket to sol_comp queue and wake up the thread which 1514 * is accepting connection. 1515 */ 1516 soisconnected(new_so); 1517 } 1518 1519 1520 /* 1521 * Guest is actively connecting to host. 1522 */ 1523 static void 1524 hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so) 1525 { 1526 struct hvs_pcb *pcb; 1527 int error; 1528 1529 error = hvsock_open_channel(chan, so); 1530 if (error) { 1531 so->so_error = error; 1532 return; 1533 } 1534 1535 pcb = so->so_pcb; 1536 pcb->chan = chan; 1537 pcb->recv_data_len = 0; 1538 pcb->recv_data_off = 0; 1539 pcb->rb_init = false; 1540 1541 mtx_lock(&hvs_trans_socks_mtx); 1542 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 1543 __hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED); 1544 mtx_unlock(&hvs_trans_socks_mtx); 1545 1546 /* 1547 * Change the socket state to SS_ISCONNECTED. This will wake up 1548 * the thread sleeping in connect call. 
1549 */ 1550 soisconnected(so); 1551 } 1552 1553 static void 1554 hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc) 1555 { 1556 struct hyperv_guid *inst_guid, *type_guid; 1557 bool conn_from_host; 1558 struct sockaddr_hvs addr; 1559 struct socket *so; 1560 struct hvs_pcb *pcb; 1561 1562 type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan); 1563 inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan); 1564 conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan); 1565 1566 HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is "); 1567 hvsock_print_guid(type_guid); 1568 HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is "); 1569 hvsock_print_guid(inst_guid); 1570 HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n", 1571 (conn_from_host == true ) ? "from" : "to"); 1572 1573 /* 1574 * The listening port should be in [0, MAX_LISTEN_PORT] 1575 */ 1576 if (!is_valid_srv_id(type_guid)) 1577 return; 1578 1579 /* 1580 * There should be a bound socket already created no matter 1581 * it is a passive or active connection. 1582 * For host initiated connection (passive on guest side), 1583 * the type_guid contains the port which guest is bound and 1584 * listening. 1585 * For the guest initiated connection (active on guest side), 1586 * the inst_guid contains the port that guest has auto bound 1587 * to. 1588 */ 1589 hvs_addr_init(&addr, conn_from_host ? 
type_guid : inst_guid); 1590 so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND); 1591 if (!so) { 1592 HVSOCK_DBG(HVSOCK_DBG_ERR, 1593 "%s: no bound socket found for port %u\n", 1594 __func__, addr.hvs_port); 1595 return; 1596 } 1597 1598 if (conn_from_host) { 1599 hvsock_open_conn_passive(chan, so, sc); 1600 } else { 1601 (void) hvs_trans_lock(); 1602 pcb = so->so_pcb; 1603 if (pcb && pcb->so) { 1604 sc->pcb = so2hvspcb(so); 1605 hvsock_open_conn_active(chan, so); 1606 } else { 1607 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1608 "%s: channel detached before open\n", __func__); 1609 } 1610 hvs_trans_unlock(); 1611 } 1612 1613 } 1614 1615 static int 1616 hvsock_probe(device_t dev) 1617 { 1618 struct vmbus_channel *channel = vmbus_get_channel(dev); 1619 1620 if (!channel || !vmbus_chan_is_hvs(channel)) { 1621 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1622 "hvsock_probe called but not a hvsock channel id %u\n", 1623 vmbus_chan_id(channel)); 1624 1625 return ENXIO; 1626 } else { 1627 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1628 "hvsock_probe got a hvsock channel id %u\n", 1629 vmbus_chan_id(channel)); 1630 1631 return BUS_PROBE_DEFAULT; 1632 } 1633 } 1634 1635 static int 1636 hvsock_attach(device_t dev) 1637 { 1638 struct vmbus_channel *channel = vmbus_get_channel(dev); 1639 struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); 1640 1641 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n"); 1642 1643 hvsock_open_connection(channel, sc); 1644 1645 /* 1646 * Always return success. On error the host will rescind the device 1647 * in 30 seconds and we can do cleanup at that time in 1648 * vmbus_chan_msgproc_chrescind(). 
1649 */ 1650 return (0); 1651 } 1652 1653 static int 1654 hvsock_detach(device_t dev) 1655 { 1656 struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); 1657 struct socket *so; 1658 int retry; 1659 1660 if (bootverbose) 1661 device_printf(dev, "hvsock_detach called.\n"); 1662 1663 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n"); 1664 1665 if (sc->pcb != NULL) { 1666 (void) hvs_trans_lock(); 1667 1668 so = hsvpcb2so(sc->pcb); 1669 if (so) { 1670 /* Close the connection */ 1671 if (so->so_state & 1672 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 1673 soisdisconnected(so); 1674 } 1675 1676 mtx_lock(&hvs_trans_socks_mtx); 1677 __hvs_remove_pcb_from_list(sc->pcb, 1678 HVS_LIST_BOUND | HVS_LIST_CONNECTED); 1679 mtx_unlock(&hvs_trans_socks_mtx); 1680 1681 /* 1682 * Close channel while no reader and sender are working 1683 * on the buffer rings. 1684 */ 1685 if (so) { 1686 retry = 0; 1687 while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) { 1688 /* 1689 * Someone is reading, rx br is busy 1690 */ 1691 soisdisconnected(so); 1692 DELAY(500); 1693 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1694 "waiting for rx reader to exit, " 1695 "retry = %d\n", retry++); 1696 } 1697 retry = 0; 1698 while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) { 1699 /* 1700 * Someone is sending, tx br is busy 1701 */ 1702 soisdisconnected(so); 1703 DELAY(500); 1704 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1705 "waiting for tx sender to exit, " 1706 "retry = %d\n", retry++); 1707 } 1708 } 1709 1710 1711 bzero(sc->pcb, sizeof(struct hvs_pcb)); 1712 free(sc->pcb, M_HVSOCK); 1713 sc->pcb = NULL; 1714 1715 if (so) { 1716 SOCK_IO_RECV_UNLOCK(so); 1717 SOCK_IO_SEND_UNLOCK(so); 1718 so->so_pcb = NULL; 1719 } 1720 1721 hvs_trans_unlock(); 1722 } 1723 1724 vmbus_chan_close(vmbus_get_channel(dev)); 1725 1726 return (0); 1727 } 1728 1729 static device_method_t hvsock_methods[] = { 1730 /* Device interface */ 1731 DEVMETHOD(device_probe, hvsock_probe), 1732 DEVMETHOD(device_attach, hvsock_attach), 1733 
DEVMETHOD(device_detach, hvsock_detach), 1734 DEVMETHOD_END 1735 }; 1736 1737 static driver_t hvsock_driver = { 1738 "hv_sock", 1739 hvsock_methods, 1740 sizeof(struct hvsock_sc) 1741 }; 1742 1743 DRIVER_MODULE(hvsock, vmbus, hvsock_driver, NULL, NULL); 1744 MODULE_VERSION(hvsock, 1); 1745 MODULE_DEPEND(hvsock, vmbus, 1, 1, 1); 1746