/*
 * Copyright (c) 2013 Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2023 Oxide Computer Company
 */

/*
 * viona - VirtIO-Net, Accelerated
 *
 * The purpose of viona is to provide high performance virtio-net devices to
 * bhyve guests.  It does so by sitting directly atop MAC, skipping all of the
 * DLS/DLD stack.
 *
 * --------------------
 * General Architecture
 * --------------------
 *
 * A single viona instance comprises a "link" handle and two "rings".  After
 * opening the viona device, it must be associated with a MAC network
 * interface and a bhyve (vmm) instance to form its link resource.  This is
 * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
 * passed in to perform the initialization.  With the MAC client opened, and a
 * driver handle to the vmm instance established, the device is ready to be
 * configured by the guest.
 *
 * The userspace portion of bhyve, which interfaces with the PCI device
 * emulation framework, is meant to stay out of the datapath if at all
 * possible.  Configuration changes made via PCI are mapped to actions which
 * will steer the operation of the in-kernel logic.
 *
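 * As a purely illustrative sketch (not part of this driver), a userspace
 * consumer would establish a link roughly as follows, assuming the
 * vioc_create_t layout consumed by viona_ioc_create() below; 'linkid' and
 * 'vm_fd' are hypothetical variables holding the datalink ID and the fd of
 * the bhyve (vmm) instance:
 *
 *	int vfd = open("/dev/viona", O_RDWR);
 *	vioc_create_t vc = { .c_linkid = linkid, .c_vmfd = vm_fd };
 *	if (ioctl(vfd, VNA_IOC_CREATE, &vc) != 0)
 *		err(EXIT_FAILURE, "VNA_IOC_CREATE");
 *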
 *
 * -----------
 * Ring Basics
 * -----------
 *
 * Each viona link has two viona_vring_t entities, RX and TX, for handling data
 * transfers to and from the guest.  They represent an interface to the
 * standard virtio ring structures.  When initialized and active, each ring is
 * backed by a kernel worker thread (parented to the bhyve process for the
 * instance) which handles ring events.  The RX worker has the simple task of
 * watching for ring shutdown conditions.  The TX worker does that in addition
 * to processing all requests to transmit data.  Data destined for the guest is
 * delivered directly by MAC to viona_rx() when the ring is active.
 *
 *
 * -----------
 * Ring States
 * -----------
 *
 * The viona_vring_t instances follow a simple path through the possible state
 * values represented in viona_vring_t`vr_state:
 *
 *        +<--------------------------------------------+
 *        |                                              |
 *        V                                              ^
 *  +-----------+  This is the initial state when a link is created or
 *  | VRS_RESET |  when the ring has been explicitly reset.
 *  +-----------+
 *        |                                              ^
 *        |---* ioctl(VNA_IOC_RING_INIT) issued          |
 *        |                                              |
 *        |                                              ^
 *        V                                              |
 *  +-----------+  The ring parameters (size, guest physical addresses)
 *  | VRS_SETUP |  have been set and start-up of the ring worker thread
 *  +-----------+  has begun.
 *        |                                              ^
 *        |                                              |
 *        |---* ring worker thread begins execution      |
 *        |                                              |
 *        +-------------------------------------------->+
 *        |     |                                        ^
 *        |     |
 *        |     *  If ring shutdown is requested (by ioctl or impending
 *        |        bhyve process death) while the worker thread is
 *        |        starting, the worker will transition the ring to
 *        |        VRS_RESET and exit.                   ^
 *        |                                              |
 *        |<-------------------------------------------<+
 *        |     |                                        |
 *        |     |                                        ^
 *        |     *  If the ring is requested to pause (but not stop) from
 *        |        the VRS_RUN state, it will return to the VRS_INIT state.
 *        |
 *        |                                              ^
 *        |                                              |
 *        V                                              |
 *  +-----------+  The worker thread associated with the ring has started
 *  | VRS_INIT  |  executing.  It has allocated any extra resources needed
 *  +-----------+  for the ring to operate.
 *        |                                              ^
 *        |                                              |
 *        +-------------------------------------------->+
 *        |     |                                        ^
 *        |     |
 *        |     *  If ring shutdown is requested while the worker is
 *        |        waiting in VRS_INIT, it will free any extra resources
 *        |        and transition to VRS_RESET.
 *        |                                              ^
 *        |                                              |
 *        |--* ioctl(VNA_IOC_RING_KICK) issued           |
 *        |                                              ^
 *        V                                              |
 *  +-----------+  The worker thread associated with the ring is executing
 *  |  VRS_RUN  |  workload specific to that ring.
 *  +-----------+
 *        |                                              ^
 *        |---* ioctl(VNA_IOC_RING_RESET) issued         |
 *        |     (or bhyve process begins exit)           ^
 *        V                                              |
 *  +-----------+  The worker thread associated with the ring is in the
 *  | VRS_STOP  |  process of exiting.  All outstanding TX and RX
 *  +-----------+  requests are allowed to complete, but new requests
 *        |        must be ignored.
 *        |                                              ^
 *        |                                              |
 *        +-------------------------------------------->+
 *
 *
 * While the worker thread is not running, changes to vr_state are only made by
 * viona_ioc_ring_init() under vr_lock.  There, it initializes the ring, starts
 * the worker, and sets the ring state to VRS_SETUP.  Once the worker thread
 * has been started, only it may perform ring state transitions (still under
 * the protection of vr_lock), when requested by outside consumers via
 * vr_state_flags or when the containing bhyve process initiates an exit.
 *
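 * A hedged sketch of the ioctl sequence that drives this state machine from
 * userspace (illustrative only; 'vfd' is a hypothetical viona fd and 'ri' a
 * vioc_ring_init_t as consumed by viona_ioc_ring_init() below; error
 * handling elided):
 *
 *	ioctl(vfd, VNA_IOC_RING_INIT, &ri);
 *		(VRS_RESET -> VRS_SETUP -> VRS_INIT)
 *	ioctl(vfd, VNA_IOC_RING_KICK, ri.ri_index);
 *		(VRS_INIT -> VRS_RUN)
 *	...
 *	ioctl(vfd, VNA_IOC_RING_RESET, ri.ri_index);
 *		(back to VRS_RESET, via VRS_STOP)
 *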
 *
 * ----------------------------
 * Transmission mblk_t Handling
 * ----------------------------
 *
 * For incoming frames destined for a bhyve guest, the data must first land in
 * a host OS buffer from the physical NIC before it is copied into the awaiting
 * guest buffer(s).  Outbound frames transmitted by the guest are not bound by
 * this limitation and can avoid extra copying before the buffers are accessed
 * directly by the NIC.  When a guest designates buffers to be transmitted,
 * viona translates the guest-physical addresses contained in the ring
 * descriptors to host-virtual addresses via viona_hold_page().  That pointer
 * is wrapped in an mblk_t using a preallocated viona_desb_t for the
 * desballoc().  Doing so increments vr_xfer_outstanding, preventing the ring
 * from being reset (allowing the link to drop its vmm handle to the guest)
 * until all transmit mblks referencing guest memory have been processed.
 * Allocation of the viona_desb_t entries is done during the VRS_INIT stage of
 * the ring worker thread.  The ring size informs that allocation, as the
 * number of concurrent transmissions is limited by the number of descriptors
 * in the ring.  This minimizes allocation in the transmit hot-path by
 * acquiring those fixed-size resources during initialization.
 *
 * This optimization depends on the underlying NIC driver freeing the mblks in
 * a timely manner after they have been transmitted by the hardware.  Some
 * drivers have been found to flush TX descriptors only when new transmissions
 * are initiated.  This means that there is no upper bound on the time needed
 * for an mblk to be flushed, which can stall bhyve guests from shutting down,
 * since their memory must be free of viona TX references prior to clean-up.
 *
 * This expectation of deterministic mblk_t processing is likely the reason
 * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
 * loaded will copy transmit data into fresh buffers rather than passing up
 * zero-copy mblks.  It is a hold-over from the original viona sources provided
 * by Pluribus, and its continued necessity has not been confirmed.
 *
 *
 * ----------------------------
 * Ring Notification Fast-paths
 * ----------------------------
 *
 * Device operation for viona requires that notifications flow to and from the
 * guest to indicate certain ring conditions.  In order to minimize latency and
 * processing overhead, the notification procedures are kept in-kernel whenever
 * possible.
 *
 * Guest-to-host notifications, when new available descriptors have been placed
 * in the ring, are posted via the 'queue notify' address in the virtio BAR.
 * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to
 * install a callback hook on an ioport address.  Guest exits for accesses to
 * viona-hooked ioport addresses will result in direct calls to notify the
 * appropriate ring worker without a trip to userland.
 *
 * Host-to-guest notifications in the form of interrupts enjoy similar
 * acceleration.  Each viona ring can be configured to send MSI notifications
 * to the guest as virtio conditions dictate.  This in-kernel interrupt
 * configuration is kept synchronized through viona ioctls which are utilized
 * during writes to the associated PCI config registers or MSI-X BAR.
 *
 * Guests which do not utilize MSI-X cause viona to fall back to the slow path
 * for interrupts: the userspace portion of bhyve will poll(2) the viona
 * handle, receiving notification when ring events necessitate the assertion
 * of an interrupt.
 *
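 * A hedged sketch of that slow path as a userspace loop (illustrative only;
 * 'vfd' is a hypothetical viona fd, deliver_interrupt() is a hypothetical
 * helper, and vioc_intr_poll_t matches the structure filled in by
 * viona_ioc_intr_poll() below):
 *
 *	struct pollfd pfd = { .fd = vfd, .events = POLLRDBAND };
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLRDBAND) != 0) {
 *		vioc_intr_poll_t vip;
 *		(void) ioctl(vfd, VNA_IOC_INTR_POLL, &vip);
 *		for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
 *			if (vip.vip_status[i] != 0) {
 *				deliver_interrupt(i);
 *				(void) ioctl(vfd, VNA_IOC_RING_INTR_CLR, i);
 *			}
 *		}
 *	}
 *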
 *
 * ---------------
 * Nethook Support
 * ---------------
 *
 * Viona provides four nethook events that consumers (e.g. ipf) can hook into
 * to intercept packets as they go up or down the stack.  Unfortunately,
 * the nethook framework does not understand raw packets, so we can only
 * generate events (in, out) for IPv4 and IPv6 packets.  At driver attach,
 * we register callbacks with the neti (netinfo) module that will be invoked
 * for each netstack already present, as well as for any additional netstack
 * instances created as the system operates.  These callbacks will
 * register/unregister the hooks with the nethook framework for each
 * netstack instance.  This registration occurs prior to creating any
 * viona instances for a given netstack, and the unregistration for a netstack
 * instance occurs after all viona instances of the netstack instance have
 * been deleted.
 */

#include <sys/conf.h>
#include <sys/file.h>
#include <sys/stat.h>

#include <sys/dlpi.h>
#include <sys/vlan.h>

#include "viona_impl.h"


#define	VIONA_NAME		"Virtio Network Accelerator"
#define	VIONA_CTL_MINOR		0
#define	VIONA_CLI_NAME		"viona"		/* MAC client name */


/*
 * Host capabilities.
 */
#define	VIONA_S_HOSTCAPS	(	\
	VIRTIO_NET_F_GUEST_CSUM |	\
	VIRTIO_NET_F_MAC |		\
	VIRTIO_NET_F_GUEST_TSO4 |	\
	VIRTIO_NET_F_MRG_RXBUF |	\
	VIRTIO_NET_F_STATUS |		\
	VIRTIO_F_RING_NOTIFY_ON_EMPTY |	\
	VIRTIO_F_RING_INDIRECT_DESC)

/* MAC_CAPAB_HCKSUM specifics of interest */
#define	VIONA_CAP_HCKSUM_INTEREST	\
	(HCKSUM_INET_PARTIAL |		\
	HCKSUM_INET_FULL_V4 |		\
	HCKSUM_INET_FULL_V6)

static void		*viona_state;
static dev_info_t	*viona_dip;
static id_space_t	*viona_minors;


static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
    void **result);
static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
    cred_t *credp, int *rval);
static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);

static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
static int viona_ioc_delete(viona_soft_state_t *, boolean_t);

static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t);
static int viona_ioc_set_promisc(viona_link_t *, viona_promisc_t);
static int viona_ioc_ring_init(viona_link_t *, void *, int);
static int viona_ioc_ring_set_state(viona_link_t *, void *, int);
static int viona_ioc_ring_get_state(viona_link_t *, void *, int);
static int viona_ioc_ring_reset(viona_link_t *, uint_t);
static int viona_ioc_ring_kick(viona_link_t *, uint_t);
static int viona_ioc_ring_pause(viona_link_t *, uint_t);
static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);
static struct cb_ops viona_cb_ops = {
	viona_open,		/* cb_open */
	viona_close,		/* cb_close */
	nodev,			/* cb_strategy */
	nodev,			/* cb_print */
	nodev,			/* cb_dump */
	nodev,			/* cb_read */
	nodev,			/* cb_write */
	viona_ioctl,		/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	viona_chpoll,		/* cb_chpoll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* cb_str */
	D_MP | D_NEW | D_HOTPLUG,	/* cb_flag */
	CB_REV,			/* cb_rev */
	nodev,			/* cb_aread */
	nodev			/* cb_awrite */
};

static struct dev_ops viona_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	viona_info,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	viona_attach,		/* devo_attach */
	viona_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&viona_cb_ops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	ddi_power,		/* devo_power */
	ddi_quiesce_not_needed	/* devo_quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,
	VIONA_NAME,
	&viona_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	int ret;

	ret = ddi_soft_state_init(&viona_state,
	    sizeof (viona_soft_state_t), 0);
	if (ret != 0) {
		return (ret);
	}

	viona_minors = id_space_create("viona_minors",
	    VIONA_CTL_MINOR + 1, UINT16_MAX);
	viona_rx_init();
	mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);

	ret = mod_install(&modlinkage);
	if (ret != 0) {
		ddi_soft_state_fini(&viona_state);
		id_space_destroy(viona_minors);
		viona_rx_fini();
		mutex_destroy(&viona_force_copy_lock);
	}

	return (ret);
}

int
_fini(void)
{
	int ret;

	ret = mod_remove(&modlinkage);
	if (ret != 0) {
		return (ret);
	}

	ddi_soft_state_fini(&viona_state);
	id_space_destroy(viona_minors);
	viona_rx_fini();
	mutex_destroy(&viona_force_copy_lock);

	return (ret);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/* ARGSUSED */
static int
viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)viona_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

static int
viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		return (DDI_FAILURE);
	}

	viona_neti_attach();

	viona_dip = dip;
	ddi_report_dev(viona_dip);

	return (DDI_SUCCESS);
}

static int
viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	dev_info_t *old_dip = viona_dip;

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	VERIFY(old_dip != NULL);

	viona_neti_detach();
	viona_dip = NULL;
	ddi_remove_minor_node(old_dip, NULL);

	return (DDI_SUCCESS);
}
static int
viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
{
	int	minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}
#if 0
	/*
	 * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
	 * Should the check be at open() or ioctl()?
	 */
	if (drv_priv(credp) != 0) {
		return (EPERM);
	}
#endif
	if (getminor(*devp) != VIONA_CTL_MINOR) {
		return (ENXIO);
	}

	minor = id_alloc_nosleep(viona_minors);
	if (minor == -1) {
		/* All minors are busy */
		return (EBUSY);
	}
	if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
		id_free(viona_minors, minor);
		return (ENOMEM);
	}

	ss = ddi_get_soft_state(viona_state, minor);
	mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
	*devp = makedevice(getmajor(*devp), minor);

	return (0);
}

static int
viona_close(dev_t dev, int flag, int otype, cred_t *credp)
{
	int	minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}

	minor = getminor(dev);

	ss = ddi_get_soft_state(viona_state, minor);
	if (ss == NULL) {
		return (ENXIO);
	}

	VERIFY0(viona_ioc_delete(ss, B_TRUE));
	VERIFY(!list_link_active(&ss->ss_node));
	ddi_soft_state_free(viona_state, minor);
	id_free(viona_minors, minor);

	return (0);
}

static int
viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
{
	viona_soft_state_t *ss;
	void *dptr = (void *)data;
	int err = 0, val;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_CREATE:
		return (viona_ioc_create(ss, dptr, md, cr));
	case VNA_IOC_DELETE:
		return (viona_ioc_delete(ss, B_FALSE));
	case VNA_IOC_VERSION:
		*rv = VIONA_CURRENT_INTERFACE_VERSION;
		return (0);
	default:
		break;
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed ||
	    vmm_drv_release_reqd(link->l_vm_hold)) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_GET_FEATURES:
		val = VIONA_S_HOSTCAPS | link->l_features_hw;
		if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
			err = EFAULT;
		}
		break;
	case VNA_IOC_SET_FEATURES:
		if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
			err = EFAULT;
			break;
		}
		val &= (VIONA_S_HOSTCAPS | link->l_features_hw);

		if ((val & VIRTIO_NET_F_CSUM) == 0)
			val &= ~VIRTIO_NET_F_HOST_TSO4;

		if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
			val &= ~VIRTIO_NET_F_GUEST_TSO4;

		link->l_features = val;
		break;
	case VNA_IOC_RING_INIT:
		err = viona_ioc_ring_init(link, dptr, md);
		break;
	case VNA_IOC_RING_RESET:
		err = viona_ioc_ring_reset(link, (uint_t)data);
		break;
	case VNA_IOC_RING_KICK:
		err = viona_ioc_ring_kick(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_MSI:
		err = viona_ioc_ring_set_msi(link, dptr, md);
		break;
	case VNA_IOC_RING_INTR_CLR:
		err = viona_ioc_ring_intr_clear(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_STATE:
		err = viona_ioc_ring_set_state(link, dptr, md);
		break;
	case VNA_IOC_RING_GET_STATE:
		err = viona_ioc_ring_get_state(link, dptr, md);
		break;
	case VNA_IOC_RING_PAUSE:
		err = viona_ioc_ring_pause(link, (uint_t)data);
		break;

	case VNA_IOC_INTR_POLL:
		err = viona_ioc_intr_poll(link, dptr, md, rv);
		break;
	case VNA_IOC_SET_NOTIFY_IOP:
		if (data < 0 || data > UINT16_MAX) {
			err = EINVAL;
			break;
		}
		err = viona_ioc_set_notify_ioport(link, (uint16_t)data);
		break;
	case VNA_IOC_SET_PROMISC:
		err = viona_ioc_set_promisc(link, (viona_promisc_t)data);
		break;
	default:
		err = ENOTTY;
		break;
	}

	mutex_exit(&ss->ss_lock);
	return (err);
}
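
/*
 * Illustrative only: the feature negotiation performed through the ioctls
 * above amounts to a mask-and-writeback from userspace.  A hedged sketch,
 * where 'vfd' and 'guest_feats' are hypothetical variables for the viona fd
 * and the feature bits offered by the guest:
 *
 *	int feats;
 *	(void) ioctl(vfd, VNA_IOC_GET_FEATURES, &feats);
 *	feats &= guest_feats;
 *	(void) ioctl(vfd, VNA_IOC_SET_FEATURES, &feats);
 *
 * Note that VNA_IOC_SET_FEATURES clamps the value against what the host
 * supports and clears TSO bits whose prerequisite checksum features are
 * absent, per the logic above.
 */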
static int
viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	viona_soft_state_t *ss;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	*reventsp = 0;
	if ((events & POLLRDBAND) != 0) {
		for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
			if (link->l_vrings[i].vr_intr_enabled != 0) {
				*reventsp |= POLLRDBAND;
				break;
			}
		}
	}
	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
		*phpp = &link->l_pollhead;
	}
	mutex_exit(&ss->ss_lock);

	return (0);
}

static void
viona_get_mac_capab(viona_link_t *link)
{
	mac_handle_t mh = link->l_mh;
	uint32_t cap = 0;
	mac_capab_lso_t lso_cap;

	link->l_features_hw = 0;
	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
		/*
		 * Only report HW checksum ability if the underlying MAC
		 * resource is capable of populating the L4 header.
		 */
		if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
			link->l_features_hw |= VIRTIO_NET_F_CSUM;
		}
		link->l_cap_csum = cap;
	}

	if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
	    mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
		/*
		 * Virtio doesn't allow for negotiating a maximum LSO
		 * packet size.  We have to assume that the guest may
		 * send a maximum length IP packet.  Make sure the
		 * underlying MAC can handle an LSO of this size.
		 */
		if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
		    lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
			link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
	}
}
static int
viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
{
	vioc_create_t	kvc;
	viona_link_t	*link = NULL;
	char		cli_name[MAXNAMELEN];
	int		err = 0;
	file_t		*fp;
	vmm_hold_t	*hold = NULL;
	viona_neti_t	*nip = NULL;
	zoneid_t	zid;
	mac_diag_t	mac_diag = MAC_DIAG_NONE;

	ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));

	if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
		return (EFAULT);
	}

	zid = crgetzoneid(cr);
	nip = viona_neti_lookup_by_zid(zid);
	if (nip == NULL) {
		return (EIO);
	}

	if (!nip->vni_nethook.vnh_hooked) {
		viona_neti_rele(nip);
		return (EIO);
	}

	mutex_enter(&ss->ss_lock);
	if (ss->ss_link != NULL) {
		mutex_exit(&ss->ss_lock);
		viona_neti_rele(nip);
		return (EEXIST);
	}

	if ((fp = getf(kvc.c_vmfd)) == NULL) {
		err = EBADF;
		goto bail;
	}
	err = vmm_drv_hold(fp, cr, &hold);
	releasef(kvc.c_vmfd);
	if (err != 0) {
		goto bail;
	}

	link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
	link->l_linkid = kvc.c_linkid;
	link->l_vm_hold = hold;

	err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
	if (err != 0) {
		goto bail;
	}

	viona_get_mac_capab(link);

	(void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME,
	    link->l_linkid);
	err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
	if (err != 0) {
		goto bail;
	}

	err = mac_unicast_add(link->l_mch, NULL, MAC_UNICAST_PRIMARY,
	    &link->l_muh, VLAN_ID_NONE, &mac_diag);
	if (err != 0) {
		goto bail;
	}

	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);

	/*
	 * Default to passing up all multicast traffic in addition to
	 * classified unicast.  Guests which have support will change this
	 * if they need to via the virtio net control queue; guests without
	 * support generally still want to see multicast.
	 */
	link->l_promisc = VIONA_PROMISC_MULTI;
	if ((err = viona_rx_set(link, link->l_promisc)) != 0) {
		viona_rx_clear(link);
		viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
		viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
		goto bail;
	}

	link->l_neti = nip;
	ss->ss_link = link;
	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_insert_tail(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	return (0);

bail:
	if (link != NULL) {
		if (link->l_mch != NULL) {
			if (link->l_muh != NULL) {
				VERIFY0(mac_unicast_remove(link->l_mch,
				    link->l_muh));
				link->l_muh = NULL;
			}
			mac_client_close(link->l_mch, 0);
		}
		if (link->l_mh != NULL) {
			mac_close(link->l_mh);
		}
		kmem_free(link, sizeof (viona_link_t));
	}
	if (hold != NULL) {
		vmm_drv_rele(hold);
	}
	viona_neti_rele(nip);

	mutex_exit(&ss->ss_lock);
	return (err);
}
static int
viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
{
	viona_link_t *link;
	viona_neti_t *nip = NULL;

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL) {
		/* Link destruction already complete */
		mutex_exit(&ss->ss_lock);
		return (0);
	}

	if (link->l_destroyed) {
		/*
		 * Link destruction has been started by another thread, but has
		 * not completed.  This condition should be impossible to
		 * encounter when performing the on-close destroy of the link,
		 * since racing ioctl accessors must necessarily be absent.
		 */
		VERIFY(!on_close);
		mutex_exit(&ss->ss_lock);
		return (EAGAIN);
	}
	/*
	 * The link deletion cannot fail after this point, continuing until its
	 * successful completion is reached.
	 */
	link->l_destroyed = B_TRUE;

	/*
	 * Tear down the IO port hook so it cannot be used to kick any of the
	 * rings which are about to be reset and stopped.
	 */
	VERIFY0(viona_ioc_set_notify_ioport(link, 0));
	mutex_exit(&ss->ss_lock);

	/*
	 * Return the rings to their reset state, ignoring any possible
	 * interruptions from signals.
	 */
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));

	mutex_enter(&ss->ss_lock);
	if (link->l_mch != NULL) {
		/* Unhook the receive callbacks and close out the client */
		viona_rx_clear(link);
		if (link->l_muh != NULL) {
			VERIFY0(mac_unicast_remove(link->l_mch, link->l_muh));
			link->l_muh = NULL;
		}
		mac_client_close(link->l_mch, 0);
	}
	if (link->l_mh != NULL) {
		mac_close(link->l_mh);
	}
	if (link->l_vm_hold != NULL) {
		vmm_drv_rele(link->l_vm_hold);
		link->l_vm_hold = NULL;
	}

	nip = link->l_neti;
	link->l_neti = NULL;

	viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
	viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
	pollhead_clean(&link->l_pollhead);
	ss->ss_link = NULL;
	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_remove(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	viona_neti_rele(nip);

	kmem_free(link, sizeof (viona_link_t));
	return (0);
}
static int
viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
{
	vioc_ring_init_t kri;
	int err;

	if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
		return (EFAULT);
	}
	const struct viona_ring_params params = {
		.vrp_pa = kri.ri_qaddr,
		.vrp_size = kri.ri_qsize,
		.vrp_avail_idx = 0,
		.vrp_used_idx = 0,
	};

	err = viona_ring_init(link, kri.ri_index, &params);

	return (err);
}

static int
viona_ioc_ring_set_state(viona_link_t *link, void *udata, int md)
{
	vioc_ring_state_t krs;
	int err;

	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
		return (EFAULT);
	}
	const struct viona_ring_params params = {
		.vrp_pa = krs.vrs_qaddr,
		.vrp_size = krs.vrs_qsize,
		.vrp_avail_idx = krs.vrs_avail_idx,
		.vrp_used_idx = krs.vrs_used_idx,
	};

	err = viona_ring_init(link, krs.vrs_index, &params);

	return (err);
}

static int
viona_ioc_ring_get_state(viona_link_t *link, void *udata, int md)
{
	vioc_ring_state_t krs;

	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
		return (EFAULT);
	}

	struct viona_ring_params params;
	int err = viona_ring_get_state(link, krs.vrs_index, &params);
	if (err != 0) {
		return (err);
	}
	krs.vrs_qsize = params.vrp_size;
	krs.vrs_qaddr = params.vrp_pa;
	krs.vrs_avail_idx = params.vrp_avail_idx;
	krs.vrs_used_idx = params.vrp_used_idx;

	if (ddi_copyout(&krs, udata, sizeof (krs), md) != 0) {
		return (EFAULT);
	}
	return (0);
}

static int
viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	return (viona_ring_reset(ring, B_TRUE));
}
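
/*
 * Illustrative only: the get/set state ioctls above permit a hedged
 * pause/save/resume flow from userspace ('vfd' and 'idx' are hypothetical
 * variables for the viona fd and ring index):
 *
 *	vioc_ring_state_t vrs = { .vrs_index = idx };
 *	(void) ioctl(vfd, VNA_IOC_RING_PAUSE, idx);
 *	(void) ioctl(vfd, VNA_IOC_RING_GET_STATE, &vrs);
 *	... later, the saved queue address, size, and indexes can be
 *	... reinstated with:
 *	(void) ioctl(vfd, VNA_IOC_RING_SET_STATE, &vrs);
 */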
static int
viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;
	int err;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	mutex_enter(&ring->vr_lock);
	switch (ring->vr_state) {
	case VRS_SETUP:
		/*
		 * An early kick to a ring which is starting its worker thread
		 * is fine.  Once that thread is active, it will process the
		 * start-up request immediately.
		 */
		/* FALLTHROUGH */
	case VRS_INIT:
		ring->vr_state_flags |= VRSF_REQ_START;
		/* FALLTHROUGH */
	case VRS_RUN:
		cv_broadcast(&ring->vr_cv);
		err = 0;
		break;
	default:
		err = EBUSY;
		break;
	}
	mutex_exit(&ring->vr_lock);

	return (err);
}

static int
viona_ioc_ring_pause(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	viona_vring_t *ring = &link->l_vrings[idx];
	return (viona_ring_pause(ring));
}

static int
viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
{
	vioc_ring_msi_t vrm;
	viona_vring_t *ring;

	if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
		return (EFAULT);
	}
	if (vrm.rm_index >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	ring = &link->l_vrings[vrm.rm_index];
	mutex_enter(&ring->vr_lock);
	ring->vr_msi_addr = vrm.rm_addr;
	ring->vr_msi_msg = vrm.rm_msg;
	mutex_exit(&ring->vr_lock);

	return (0);
}

static int
viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes,
    uint32_t *val)
{
	viona_link_t *link = (viona_link_t *)arg;

	/*
	 * If the request is a read (in/ins), or directed at a port other than
	 * the one we expect to be registered on, ignore it.
	 */
	if (in || port != link->l_notify_ioport) {
		return (ESRCH);
	}

	/* Let userspace handle notifications for rings other than RX/TX. */
	const uint16_t vq = *val;
	if (vq >= VIONA_VQ_MAX) {
		return (ESRCH);
	}

	viona_vring_t *ring = &link->l_vrings[vq];
	int res = 0;

	mutex_enter(&ring->vr_lock);
	if (ring->vr_state == VRS_RUN) {
		cv_broadcast(&ring->vr_cv);
	} else {
		res = ESRCH;
	}
	mutex_exit(&ring->vr_lock);

	return (res);
}

static int
viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport)
{
	int err = 0;

	if (link->l_notify_ioport != 0) {
		vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
		link->l_notify_ioport = 0;
	}

	if (ioport != 0) {
		err = vmm_drv_ioport_hook(link->l_vm_hold, ioport,
		    viona_notify_iop, (void *)link, &link->l_notify_cookie);
		if (err == 0) {
			link->l_notify_ioport = ioport;
		}
	}
	return (err);
}

static int
viona_ioc_set_promisc(viona_link_t *link, viona_promisc_t mode)
{
	int err;

	if (mode >= VIONA_PROMISC_MAX) {
		return (EINVAL);
	}

	if (mode == link->l_promisc) {
		return (0);
	}

	if ((err = viona_rx_set(link, mode)) != 0) {
		return (err);
	}

	link->l_promisc = mode;
	return (0);
}

static int
viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	link->l_vrings[idx].vr_intr_enabled = 0;
	return (0);
}

static int
viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
{
	uint_t cnt = 0;
	vioc_intr_poll_t vip;

	for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
		uint_t val = link->l_vrings[i].vr_intr_enabled;

		vip.vip_status[i] = val;
		if (val != 0) {
			cnt++;
		}
	}

	if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
		return (EFAULT);
	}
	*rv = (int)cnt;
	return (0);
}
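
/*
 * Illustrative only: wiring up the in-kernel notification fast-paths
 * described at the top of this file comes down to the two ioctls handled
 * above.  A hedged sketch, where 'vfd', 'notify_port', 'msi_addr', and
 * 'msi_msg' are hypothetical variables supplied by the userspace device
 * emulation:
 *
 *	(void) ioctl(vfd, VNA_IOC_SET_NOTIFY_IOP, notify_port);
 *
 *	vioc_ring_msi_t vrm = {
 *		.rm_index = VIONA_VQ_RX,
 *		.rm_addr = msi_addr,
 *		.rm_msg = msi_msg,
 *	};
 *	(void) ioctl(vfd, VNA_IOC_RING_SET_MSI, &vrm);
 */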