/*
 * Copyright (c) 2013  Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2021 Oxide Computer Company
 */

/*
 * viona - VirtIO-Net, Accelerated
 *
 * The purpose of viona is to provide high performance virtio-net devices to
 * bhyve guests.  It does so by sitting directly atop MAC, skipping all of the
 * DLS/DLD stack.
 *
 * --------------------
 * General Architecture
 * --------------------
 *
 * A single viona instance is comprised of a "link" handle and two "rings".
 * After opening the viona device, it must be associated with a MAC network
 * interface and a bhyve (vmm) instance to form its link resource.  This is
 * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
 * passed in to perform the initialization.  With the MAC client opened, and a
 * driver handle to the vmm instance established, the device is ready to be
 * configured by the guest.
 *
 * The userspace portion of bhyve, which interfaces with the PCI device
 * emulation framework, is meant to stay out of the datapath if at all
 * possible.  Configuration changes made via PCI are mapped to actions which
 * will steer the operation of the in-kernel logic.
 *
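 * As an illustration, the link-creation handshake from userspace might look
 * like the following sketch.  This is hypothetical code: the device path,
 * the provenance of linkid/vmfd, and the error handling are assumptions,
 * though the vioc_create_t fields match those consumed by
 * viona_ioc_create() below:
 *
 *    int vna_fd = open("/dev/viona", O_RDWR);
 *    vioc_create_t vc = {
 *        .c_linkid = linkid,    // datalink ID of the MAC interface
 *        .c_vmfd = vmfd,        // open fd for the bhyve vmm instance
 *    };
 *    if (ioctl(vna_fd, VNA_IOC_CREATE, &vc) != 0) {
 *        err(EXIT_FAILURE, "failed to create viona link");
 *    }
 *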
 *
 * -----------
 * Ring Basics
 * -----------
 *
 * Each viona link has two viona_vring_t entities, RX and TX, for handling data
 * transfers to and from the guest.  They represent an interface to the
 * standard virtio ring structures.  When initialized and active, each ring is
 * backed by a kernel worker thread (parented to the bhyve process for the
 * instance) which handles ring events.  The RX worker has the simple task of
 * watching for ring shutdown conditions.  The TX worker does that in addition
 * to processing all requests to transmit data.  Data destined for the guest is
 * delivered directly by MAC to viona_rx() when the ring is active.
 *
 *
 * -----------
 * Ring States
 * -----------
 *
 * The viona_vring_t instances follow a simple path through the possible state
 * values represented in viona_vring_t`vr_state:
 *
 *        +<---------------------------------------------+
 *        |                                               |
 *        V                                               ^
 *  +-----------+  This is the initial state when a link is created or
 *  | VRS_RESET |  when the ring has been explicitly reset.
 *  +-----------+
 *        |                                               ^
 *        |---* ioctl(VNA_IOC_RING_INIT) issued           |
 *        |                                               |
 *        |                                               ^
 *        V
 *  +-----------+  The ring parameters (size, guest physical addresses)
 *  | VRS_SETUP |  have been set and start-up of the ring worker thread
 *  +-----------+  has begun.
 *        |                                               ^
 *        |                                               |
 *        |---* ring worker thread begins execution       |
 *        |                                               |
 *        +--------------------------------------------->+
 *        |      |                                        ^
 *        |      |
 *        |      *  If ring shutdown is requested (by ioctl or impending
 *        |         bhyve process death) while the worker thread is
 *        |         starting, the worker will transition the ring to
 *        |         VRS_RESET and exit.
 *        |                                               ^
 *        |                                               |
 *        |                                               ^
 *        V
 *  +-----------+  The worker thread associated with the ring has started
 *  | VRS_INIT  |  executing.  It has allocated any extra resources needed
 *  +-----------+  for the ring to operate.
 *        |                                               ^
 *        |                                               |
 *        +--------------------------------------------->+
 *        |      |                                        ^
 *        |      |
 *        |      *  If ring shutdown is requested while the worker is
 *        |         waiting in VRS_INIT, it will free any extra resources
 *        |         and transition to VRS_RESET.
 *        |                                               ^
 *        |                                               |
 *        |--* ioctl(VNA_IOC_RING_KICK) issued            |
 *        |                                               ^
 *        V
 *  +-----------+  The worker thread associated with the ring is executing
 *  | VRS_RUN   |  workload specific to that ring.
 *  +-----------+
 *        |                                               ^
 *        |---* ioctl(VNA_IOC_RING_RESET) issued          |
 *        |     (or bhyve process begins exit)            ^
 *        |                                               |
 *  +-----------+  The worker thread associated with the ring is in the
 *  | VRS_STOP  |  process of exiting.  All outstanding TX and RX
 *  +-----------+  requests are allowed to complete, but new requests
 *        |        must be ignored.
 *        |                                               ^
 *        |                                               |
 *        +--------------------------------------------->+
 *
 *
 * While the worker thread is not running, changes to vr_state are only made by
 * viona_ioc_ring_init() under vr_lock.  There, it initializes the ring, starts
 * the worker, and sets the ring state to VRS_SETUP.  Once the worker thread
 * has been started, only it may perform ring state transitions (still under
 * the protection of vr_lock), when requested by outside consumers via
 * vr_state_flags or when the containing bhyve process initiates an exit.
 *
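 * As a hypothetical sketch of the VNA_IOC_RING_INIT step which drives the
 * VRS_RESET -> VRS_SETUP transition above (the structure fields match those
 * consumed by viona_ioc_ring_init() below; the queue size, ring address, and
 * error handling are illustrative):
 *
 *    vioc_ring_init_t ri = {
 *        .ri_index = VIONA_VQ_RX,    // which ring: RX or TX
 *        .ri_qsize = 1024,           // descriptor count for the ring
 *        .ri_qaddr = ring_gpa,       // guest-physical address of the ring
 *    };
 *    if (ioctl(vna_fd, VNA_IOC_RING_INIT, &ri) != 0) {
 *        err(EXIT_FAILURE, "failed to initialize ring");
 *    }
 *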
 *
 * ----------------------------
 * Transmission mblk_t Handling
 * ----------------------------
 *
 * For incoming frames destined for a bhyve guest, the data must first land in
 * a host OS buffer from the physical NIC before it is copied into the awaiting
 * guest buffer(s).  Outbound frames transmitted by the guest are not bound by
 * this limitation and can avoid extra copying before the buffers are accessed
 * directly by the NIC.  When a guest designates buffers to be transmitted,
 * viona translates the guest-physical addresses contained in the ring
 * descriptors to host-virtual addresses via viona_hold_page().  That pointer
 * is wrapped in an mblk_t using a preallocated viona_desb_t for the
 * desballoc().  Doing so increments vr_xfer_outstanding, preventing the ring
 * from being reset (allowing the link to drop its vmm handle to the guest)
 * until all transmit mblks referencing guest memory have been processed.
 * Allocation of the viona_desb_t entries is done during the VRS_INIT stage of
 * the ring worker thread.  The ring size informs that allocation as the
 * number of concurrent transmissions is limited by the number of descriptors
 * in the ring.  This minimizes allocation in the transmit hot-path by
 * acquiring those fixed-size resources during initialization.
 *
 * This optimization depends on the underlying NIC driver freeing the mblks in
 * a timely manner after they have been transmitted by the hardware.  Some
 * drivers have been found to flush TX descriptors only when new transmissions
 * are initiated.  This means that there is no upper bound on the time needed
 * for an mblk to be flushed, which can stall bhyve guests from shutting down,
 * since their memory must be free of viona TX references prior to clean-up.
 *
 * This expectation of deterministic mblk_t processing is likely the reason
 * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
 * loaded will copy transmit data into fresh buffers rather than passing up
 * zero-copy mblks.  It is a hold-over from the original viona sources provided
 * by Pluribus and its continued necessity has not been confirmed.
 *
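 * The core of that zero-copy wrapping is the desballoc(9F) pattern, sketched
 * here in simplified form (the viona_desb_t field names are illustrative and
 * the real TX path lives elsewhere in the driver):
 *
 *    // dp->d_frtn.free_func points at the TX completion routine and
 *    // dp->d_frtn.free_arg back at the viona_desb_t itself.
 *    mblk_t *mp = desballoc((uchar_t *)host_va, len, BPRI_MED,
 *        &dp->d_frtn);
 *
 * When the NIC driver eventually frees the mblk, the frtn_t free function
 * runs, releasing the guest page hold and decrementing vr_xfer_outstanding.
 *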
 *
 * ----------------------------
 * Ring Notification Fast-paths
 * ----------------------------
 *
 * Device operation for viona requires that notifications flow to and from the
 * guest to indicate certain ring conditions.  In order to minimize latency and
 * processing overhead, the notification procedures are kept in-kernel whenever
 * possible.
 *
 * Guest-to-host notifications, when new available descriptors have been placed
 * in the ring, are posted via the 'queue notify' address in the virtio BAR.
 * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to
 * install a callback hook on an ioport address.  Guest exits for accesses to
 * viona-hooked ioport addresses will result in direct calls to notify the
 * appropriate ring worker without a trip to userland.
 *
 * Host-to-guest notifications in the form of interrupts enjoy similar
 * acceleration.  Each viona ring can be configured to send MSI notifications
 * to the guest as virtio conditions dictate.  This in-kernel interrupt
 * configuration is kept synchronized through viona ioctls which are utilized
 * during writes to the associated PCI config registers or MSI-X BAR.
 *
 * For guests which do not utilize MSI-X, viona falls back to the slow path
 * for interrupts: the userspace portion of bhyve will poll(2) the viona
 * handle, receiving notification when ring events necessitate the assertion
 * of an interrupt.
 *
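 * A hypothetical sketch of that slow path (error handling omitted; the
 * POLLRDBAND event and the ioctls match viona_chpoll() and
 * viona_ioc_intr_poll() below):
 *
 *    struct pollfd pfd = { .fd = vna_fd, .events = POLLRDBAND };
 *    vioc_intr_poll_t vip;
 *
 *    (void) poll(&pfd, 1, -1);
 *    (void) ioctl(vna_fd, VNA_IOC_INTR_POLL, &vip);
 *    // For each ring i with vip.vip_status[i] != 0, inject the interrupt
 *    // into the guest, then ioctl(vna_fd, VNA_IOC_RING_INTR_CLR, i).
 *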
 *
 * ---------------
 * Nethook Support
 * ---------------
 *
 * Viona provides four nethook events that consumers (e.g. ipf) can hook into
 * to intercept packets as they go up or down the stack.  Unfortunately, the
 * nethook framework does not understand raw packets, so we can only generate
 * events (in, out) for IPv4 and IPv6 packets.  At driver attach, we register
 * callbacks with the neti (netinfo) module that will be invoked for each
 * netstack already present, as well as for any additional netstack instances
 * created as the system operates.  These callbacks will register/unregister
 * the hooks with the nethook framework for each netstack instance.  This
 * registration occurs prior to creating any viona instances for a given
 * netstack, and the unregistration for a netstack instance occurs after all
 * viona instances on that netstack have been deleted.
 */

#include <sys/conf.h>
#include <sys/file.h>
#include <sys/stat.h>

#include <sys/dlpi.h>

#include "viona_impl.h"


#define	VIONA_NAME		"Virtio Network Accelerator"
#define	VIONA_CTL_MINOR		0
#define	VIONA_CLI_NAME		"viona"		/* MAC client name */


/*
 * Host capabilities.
 */
#define	VIONA_S_HOSTCAPS	(	\
	VIRTIO_NET_F_GUEST_CSUM |	\
	VIRTIO_NET_F_MAC |		\
	VIRTIO_NET_F_GUEST_TSO4 |	\
	VIRTIO_NET_F_MRG_RXBUF |	\
	VIRTIO_NET_F_STATUS |		\
	VIRTIO_F_RING_NOTIFY_ON_EMPTY |	\
	VIRTIO_F_RING_INDIRECT_DESC)

/* MAC_CAPAB_HCKSUM specifics of interest */
#define	VIONA_CAP_HCKSUM_INTEREST	\
	(HCKSUM_INET_PARTIAL |		\
	HCKSUM_INET_FULL_V4 |		\
	HCKSUM_INET_FULL_V6)

static void		*viona_state;
static dev_info_t	*viona_dip;
static id_space_t	*viona_minors;


static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
    void **result);
static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
    cred_t *credp, int *rval);
static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);

static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
static int viona_ioc_delete(viona_soft_state_t *, boolean_t);

static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t);
static int viona_ioc_ring_init(viona_link_t *, void *, int);
static int viona_ioc_ring_reset(viona_link_t *, uint_t);
static int viona_ioc_ring_kick(viona_link_t *, uint_t);
static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);

static struct cb_ops viona_cb_ops = {
	viona_open,
	viona_close,
	nodev,
	nodev,
	nodev,
	nodev,
	nodev,
	viona_ioctl,
	nodev,
	nodev,
	nodev,
	viona_chpoll,
	ddi_prop_op,
	0,
	D_MP | D_NEW | D_HOTPLUG,
	CB_REV,
	nodev,
	nodev
};

static struct dev_ops viona_ops = {
	DEVO_REV,
	0,
	viona_info,
	nulldev,
	nulldev,
	viona_attach,
	viona_detach,
	nodev,
	&viona_cb_ops,
	NULL,
	ddi_power,
	ddi_quiesce_not_needed
};

static struct modldrv modldrv = {
	&mod_driverops,
	VIONA_NAME,
	&viona_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	int ret;

	ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0);
	if (ret != 0) {
		return (ret);
	}

	viona_minors = id_space_create("viona_minors",
	    VIONA_CTL_MINOR + 1, UINT16_MAX);
	viona_rx_init();
	mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);

	ret = mod_install(&modlinkage);
	if (ret != 0) {
		ddi_soft_state_fini(&viona_state);
		id_space_destroy(viona_minors);
		viona_rx_fini();
		mutex_destroy(&viona_force_copy_lock);
	}

	return (ret);
}

int
_fini(void)
{
	int ret;

	ret = mod_remove(&modlinkage);
	if (ret != 0) {
		return (ret);
	}

	ddi_soft_state_fini(&viona_state);
	id_space_destroy(viona_minors);
	viona_rx_fini();
	mutex_destroy(&viona_force_copy_lock);

	return (ret);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/* ARGSUSED */
static int
viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)viona_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

static int
viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		return (DDI_FAILURE);
	}

	viona_neti_attach();

	viona_dip = dip;
	ddi_report_dev(viona_dip);

	return (DDI_SUCCESS);
}

static int
viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	dev_info_t *old_dip = viona_dip;

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	VERIFY(old_dip != NULL);

	viona_neti_detach();
	viona_dip = NULL;
	ddi_remove_minor_node(old_dip, NULL);

	return (DDI_SUCCESS);
}

static int
viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
{
	int minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}
#if 0
	/*
	 * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
	 * Should the check be at open() or ioctl()?
	 */
	if (drv_priv(credp) != 0) {
		return (EPERM);
	}
#endif
	if (getminor(*devp) != VIONA_CTL_MINOR) {
		return (ENXIO);
	}

	minor = id_alloc_nosleep(viona_minors);
	if (minor == -1) {
		/* All minors are busy */
		return (EBUSY);
	}
	if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
		id_free(viona_minors, minor);
		return (ENOMEM);
	}

	ss = ddi_get_soft_state(viona_state, minor);
	mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
	*devp = makedevice(getmajor(*devp), minor);

	return (0);
}

static int
viona_close(dev_t dev, int flag, int otype, cred_t *credp)
{
	int minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}

	minor = getminor(dev);

	ss = ddi_get_soft_state(viona_state, minor);
	if (ss == NULL) {
		return (ENXIO);
	}

	VERIFY0(viona_ioc_delete(ss, B_TRUE));
	VERIFY(!list_link_active(&ss->ss_node));
	ddi_soft_state_free(viona_state, minor);
	id_free(viona_minors, minor);

	return (0);
}

static int
viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
{
	viona_soft_state_t *ss;
	void *dptr = (void *)data;
	int err = 0, val;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_CREATE:
		return (viona_ioc_create(ss, dptr, md, cr));
	case VNA_IOC_DELETE:
		return (viona_ioc_delete(ss, B_FALSE));
	default:
		break;
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed ||
	    vmm_drv_release_reqd(link->l_vm_hold)) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_GET_FEATURES:
		val = VIONA_S_HOSTCAPS | link->l_features_hw;
		if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
			err = EFAULT;
		}
		break;
	case VNA_IOC_SET_FEATURES:
		if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
			err = EFAULT;
			break;
		}
		val &= (VIONA_S_HOSTCAPS | link->l_features_hw);

		if ((val & VIRTIO_NET_F_CSUM) == 0)
			val &= ~VIRTIO_NET_F_HOST_TSO4;

		if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
			val &= ~VIRTIO_NET_F_GUEST_TSO4;

		link->l_features = val;
		break;
	case VNA_IOC_RING_INIT:
		err = viona_ioc_ring_init(link, dptr, md);
		break;
	case VNA_IOC_RING_RESET:
		err = viona_ioc_ring_reset(link, (uint_t)data);
		break;
	case VNA_IOC_RING_KICK:
		err = viona_ioc_ring_kick(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_MSI:
		err = viona_ioc_ring_set_msi(link, dptr, md);
		break;
	case VNA_IOC_RING_INTR_CLR:
		err = viona_ioc_ring_intr_clear(link, (uint_t)data);
		break;
	case VNA_IOC_INTR_POLL:
		err = viona_ioc_intr_poll(link, dptr, md, rv);
		break;
	case VNA_IOC_SET_NOTIFY_IOP:
		if (data < 0 || data > UINT16_MAX) {
			err = EINVAL;
			break;
		}
		err = viona_ioc_set_notify_ioport(link, (uint16_t)data);
		break;
	default:
		err = ENOTTY;
		break;
	}

	mutex_exit(&ss->ss_lock);
	return (err);
}
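
/*
 * For reference, a userspace consumer might drive the feature-negotiation
 * ioctls above roughly as follows (hypothetical sketch; the guest_features
 * variable and error handling are illustrative):
 *
 *    int feat;
 *    (void) ioctl(vna_fd, VNA_IOC_GET_FEATURES, &feat);
 *    feat &= guest_features;    // intersect with guest-offered bits
 *    (void) ioctl(vna_fd, VNA_IOC_SET_FEATURES, &feat);
 */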

static int
viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	viona_soft_state_t *ss;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	*reventsp = 0;
	if ((events & POLLRDBAND) != 0) {
		for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
			if (link->l_vrings[i].vr_intr_enabled != 0) {
				*reventsp |= POLLRDBAND;
				break;
			}
		}
	}
	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
		*phpp = &link->l_pollhead;
	}
	mutex_exit(&ss->ss_lock);

	return (0);
}

static void
viona_get_mac_capab(viona_link_t *link)
{
	mac_handle_t mh = link->l_mh;
	uint32_t cap = 0;
	mac_capab_lso_t lso_cap;

	link->l_features_hw = 0;
	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
		/*
		 * Only report HW checksum ability if the underlying MAC
		 * resource is capable of populating the L4 header.
		 */
		if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
			link->l_features_hw |= VIRTIO_NET_F_CSUM;
		}
		link->l_cap_csum = cap;
	}

	if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
	    mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
		/*
		 * Virtio doesn't allow for negotiating a maximum LSO
		 * packet size.  We have to assume that the guest may
		 * send a maximum length IP packet.  Make sure the
		 * underlying MAC can handle an LSO of this size.
		 */
		if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
		    lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
			link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
	}
}

static int
viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
{
	vioc_create_t kvc;
	viona_link_t *link = NULL;
	char cli_name[MAXNAMELEN];
	int err = 0;
	file_t *fp;
	vmm_hold_t *hold = NULL;
	viona_neti_t *nip = NULL;
	zoneid_t zid;

	ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));

	if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
		return (EFAULT);
	}

	zid = crgetzoneid(cr);
	nip = viona_neti_lookup_by_zid(zid);
	if (nip == NULL) {
		return (EIO);
	}

	if (!nip->vni_nethook.vnh_hooked) {
		viona_neti_rele(nip);
		return (EIO);
	}

	mutex_enter(&ss->ss_lock);
	if (ss->ss_link != NULL) {
		mutex_exit(&ss->ss_lock);
		viona_neti_rele(nip);
		return (EEXIST);
	}

	if ((fp = getf(kvc.c_vmfd)) == NULL) {
		err = EBADF;
		goto bail;
	}
	err = vmm_drv_hold(fp, cr, &hold);
	releasef(kvc.c_vmfd);
	if (err != 0) {
		goto bail;
	}

	link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
	link->l_linkid = kvc.c_linkid;
	link->l_vm_hold = hold;

	err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
	if (err != 0) {
		goto bail;
	}

	viona_get_mac_capab(link);

	(void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME,
	    link->l_linkid);
	err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
	if (err != 0) {
		goto bail;
	}

	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);

	if ((err = viona_rx_set(link)) != 0) {
		viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
		viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
		goto bail;
	}

	link->l_neti = nip;
	ss->ss_link = link;
	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_insert_tail(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	return (0);

bail:
	if (link != NULL) {
		if (link->l_mch != NULL) {
			mac_client_close(link->l_mch, 0);
		}
		if (link->l_mh != NULL) {
			mac_close(link->l_mh);
		}
		kmem_free(link, sizeof (viona_link_t));
	}
	if (hold != NULL) {
		vmm_drv_rele(hold);
	}
	viona_neti_rele(nip);

	mutex_exit(&ss->ss_lock);
	return (err);
}

static int
viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
{
	viona_link_t *link;
	viona_neti_t *nip = NULL;

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL) {
		/* Link destruction already complete */
		mutex_exit(&ss->ss_lock);
		return (0);
	}

	if (link->l_destroyed) {
		/*
		 * Link destruction has been started by another thread, but has
		 * not completed.  This condition should be impossible to
		 * encounter when performing the on-close destroy of the link,
		 * since racing ioctl accessors must necessarily be absent.
		 */
		VERIFY(!on_close);
		mutex_exit(&ss->ss_lock);
		return (EAGAIN);
	}
	/*
	 * The link deletion cannot fail after this point, continuing until
	 * its successful completion is reached.
	 */
	link->l_destroyed = B_TRUE;

	/*
	 * Tear down the IO port hook so it cannot be used to kick any of the
	 * rings which are about to be reset and stopped.
	 */
	VERIFY0(viona_ioc_set_notify_ioport(link, 0));
	mutex_exit(&ss->ss_lock);

	/*
	 * Return the rings to their reset state, ignoring any possible
	 * interruptions from signals.
	 */
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));

	mutex_enter(&ss->ss_lock);
	if (link->l_mch != NULL) {
		/* Unhook the receive callbacks and close out the client */
		viona_rx_clear(link);
		mac_client_close(link->l_mch, 0);
	}
	if (link->l_mh != NULL) {
		mac_close(link->l_mh);
	}
	if (link->l_vm_hold != NULL) {
		vmm_drv_rele(link->l_vm_hold);
		link->l_vm_hold = NULL;
	}

	nip = link->l_neti;
	link->l_neti = NULL;

	viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
	viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
	pollhead_clean(&link->l_pollhead);
	ss->ss_link = NULL;
	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_remove(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	viona_neti_rele(nip);

	kmem_free(link, sizeof (viona_link_t));
	return (0);
}

static int
viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
{
	vioc_ring_init_t kri;
	int err;

	if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
		return (EFAULT);
	}

	err = viona_ring_init(link, kri.ri_index, kri.ri_qsize, kri.ri_qaddr);

	return (err);
}

static int
viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	return (viona_ring_reset(ring, B_TRUE));
}

static int
viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;
	int err;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	mutex_enter(&ring->vr_lock);
	switch (ring->vr_state) {
	case VRS_SETUP:
		/*
		 * An early kick to a ring which is starting its worker thread
		 * is fine.  Once that thread is active, it will process the
		 * start-up request immediately.
		 */
		/* FALLTHROUGH */
	case VRS_INIT:
		ring->vr_state_flags |= VRSF_REQ_START;
		/* FALLTHROUGH */
	case VRS_RUN:
		cv_broadcast(&ring->vr_cv);
		err = 0;
		break;
	default:
		err = EBUSY;
		break;
	}
	mutex_exit(&ring->vr_lock);

	return (err);
}

static int
viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
{
	vioc_ring_msi_t vrm;
	viona_vring_t *ring;

	if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
		return (EFAULT);
	}
	if (vrm.rm_index >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	ring = &link->l_vrings[vrm.rm_index];
	mutex_enter(&ring->vr_lock);
	ring->vr_msi_addr = vrm.rm_addr;
	ring->vr_msi_msg = vrm.rm_msg;
	mutex_exit(&ring->vr_lock);

	return (0);
}
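
/*
 * For context: a legacy virtio guest driver notifies the host by writing the
 * queue index to the 'queue notify' ioport.  In guest terms that is roughly
 * (hypothetical sketch):
 *
 *    outw(notify_ioport, vq_index);
 *
 * With the ioport hook installed, that guest exit arrives directly in
 * viona_notify_iop() below, with the value carrying the queue index.
 */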

static int
viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes,
    uint32_t *val)
{
	viona_link_t *link = (viona_link_t *)arg;
	uint16_t vq = *val;

	if (in) {
		/*
		 * Do not service read (in/ins) requests on this ioport.
		 * Instead, indicate that the handler is not found, causing a
		 * fallback to userspace processing.
		 */
		return (ESRCH);
	}

	if (port != link->l_notify_ioport) {
		return (EINVAL);
	}
	return (viona_ioc_ring_kick(link, vq));
}

static int
viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport)
{
	int err = 0;

	if (link->l_notify_ioport != 0) {
		vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
		link->l_notify_ioport = 0;
	}

	if (ioport != 0) {
		err = vmm_drv_ioport_hook(link->l_vm_hold, ioport,
		    viona_notify_iop, (void *)link, &link->l_notify_cookie);
		if (err == 0) {
			link->l_notify_ioport = ioport;
		}
	}
	return (err);
}

static int
viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	link->l_vrings[idx].vr_intr_enabled = 0;
	return (0);
}

static int
viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
{
	uint_t cnt = 0;
	vioc_intr_poll_t vip;

	for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
		uint_t val = link->l_vrings[i].vr_intr_enabled;

		vip.vip_status[i] = val;
		if (val != 0) {
			cnt++;
		}
	}

	if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
		return (EFAULT);
	}
	*rv = (int)cnt;
	return (0);
}