1 /* 2 * Copyright (c) 2013 Chris Torek <torek @ torek net> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 /* 27 * This file and its contents are supplied under the terms of the 28 * Common Development and Distribution License ("CDDL"), version 1.0. 29 * You may only use this file in accordance with the terms of version 30 * 1.0 of the CDDL. 31 * 32 * A full copy of the text of the CDDL should have accompanied this 33 * source. A copy of the CDDL is also available via the Internet at 34 * http://www.illumos.org/license/CDDL. 35 * 36 * Copyright 2015 Pluribus Networks Inc. 37 * Copyright 2019 Joyent, Inc. 
 * Copyright 2023 Oxide Computer Company
 */

/*
 * viona - VirtIO-Net, Accelerated
 *
 * The purpose of viona is to provide high performance virtio-net devices to
 * bhyve guests.  It does so by sitting directly atop MAC, skipping all of the
 * DLS/DLD stack.
 *
 * --------------------
 * General Architecture
 * --------------------
 *
 * A single viona instance is comprised of a "link" handle and two "rings".
 * After opening the viona device, it must be associated with a MAC network
 * interface and a bhyve (vmm) instance to form its link resource.  This is
 * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
 * passed in to perform the initialization.  With the MAC client opened, and a
 * driver handle to the vmm instance established, the device is ready to be
 * configured by the guest.
 *
 * The userspace portion of bhyve, which interfaces with the PCI device
 * emulation framework, is meant to stay out of the datapath if at all
 * possible.  Configuration changes made via PCI are mapped to actions which
 * will steer the operation of the in-kernel logic.
 *
 *
 * -----------
 * Ring Basics
 * -----------
 *
 * Each viona link has two viona_vring_t entities, RX and TX, for handling data
 * transfers to and from the guest.  They represent an interface to the
 * standard virtio ring structures.  When initialized and active, each ring is
 * backed by a kernel worker thread (parented to the bhyve process for the
 * instance) which handles ring events.  The RX worker has the simple task of
 * watching for ring shutdown conditions.  The TX worker does that in addition
 * to processing all requests to transmit data.  Data destined for the guest is
 * delivered directly by MAC to viona_rx() when the ring is active.
78 * 79 * 80 * ----------- 81 * Ring States 82 * ----------- 83 * 84 * The viona_vring_t instances follow a simple path through the possible state 85 * values represented in virtio_vring_t`vr_state: 86 * 87 * +<--------------------------------------------+ 88 * | | 89 * V ^ 90 * +-----------+ This is the initial state when a link is created or 91 * | VRS_RESET | when the ring has been explicitly reset. 92 * +-----------+ 93 * | ^ 94 * |---* ioctl(VNA_IOC_RING_INIT) issued | 95 * | | 96 * | ^ 97 * V 98 * +-----------+ The ring parameters (size, guest physical addresses) 99 * | VRS_SETUP | have been set and start-up of the ring worker thread 100 * +-----------+ has begun. 101 * | ^ 102 * | | 103 * |---* ring worker thread begins execution | 104 * | | 105 * +-------------------------------------------->+ 106 * | | ^ 107 * | | 108 * | * If ring shutdown is requested (by ioctl or impending 109 * | bhyve process death) while the worker thread is 110 * | starting, the worker will transition the ring to 111 * | VRS_RESET and exit. 112 * | ^ 113 * | | 114 * |<-------------------------------------------<+ 115 * | | | 116 * | | ^ 117 * | * If ring is requested to pause (but not stop)from the 118 * | VRS_RUN state, it will return to the VRS_INIT state. 119 * | 120 * | ^ 121 * | | 122 * | ^ 123 * V 124 * +-----------+ The worker thread associated with the ring has started 125 * | VRS_INIT | executing. It has allocated any extra resources needed 126 * +-----------+ for the ring to operate. 127 * | ^ 128 * | | 129 * +-------------------------------------------->+ 130 * | | ^ 131 * | | 132 * | * If ring shutdown is requested while the worker is 133 * | waiting in VRS_INIT, it will free any extra resources 134 * | and transition to VRS_RESET. 135 * | ^ 136 * | | 137 * |--* ioctl(VNA_IOC_RING_KICK) issued | 138 * | ^ 139 * V 140 * +-----------+ The worker thread associated with the ring is executing 141 * | VRS_RUN | workload specific to that ring. 
142 * +-----------+ 143 * | ^ 144 * |---* ioctl(VNA_IOC_RING_RESET) issued | 145 * | (or bhyve process begins exit) ^ 146 * | 147 * +-----------+ The worker thread associated with the ring is in the 148 * | VRS_STOP | process of exiting. All outstanding TX and RX 149 * +-----------+ requests are allowed to complete, but new requests 150 * | must be ignored. 151 * | ^ 152 * | | 153 * +-------------------------------------------->+ 154 * 155 * 156 * While the worker thread is not running, changes to vr_state are only made by 157 * viona_ioc_ring_init() under vr_lock. There, it initializes the ring, starts 158 * the worker, and sets the ring state to VRS_SETUP. Once the worker thread 159 * has been started, only it may perform ring state transitions (still under 160 * the protection of vr_lock), when requested by outside consumers via 161 * vr_state_flags or when the containing bhyve process initiates an exit. 162 * 163 * 164 * ---------------------------- 165 * Transmission mblk_t Handling 166 * ---------------------------- 167 * 168 * For incoming frames destined for a bhyve guest, the data must first land in 169 * a host OS buffer from the physical NIC before it is copied into the awaiting 170 * guest buffer(s). Outbound frames transmitted by the guest are not bound by 171 * this limitation and can avoid extra copying before the buffers are accessed 172 * directly by the NIC. When a guest designates buffers to be transmitted, 173 * viona translates the guest-physical addresses contained in the ring 174 * descriptors to host-virtual addresses via viona_hold_page(). That pointer is 175 * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc(). 176 * Doing so increments vr_xfer_outstanding, preventing the ring from being 177 * reset (allowing the link to drop its vmm handle to the guest) until all 178 * transmit mblks referencing guest memory have been processed. 
Allocation of 179 * the viona_desb_t entries is done during the VRS_INIT stage of the ring 180 * worker thread. The ring size informs that allocation as the number of 181 * concurrent transmissions is limited by the number of descriptors in the 182 * ring. This minimizes allocation in the transmit hot-path by acquiring those 183 * fixed-size resources during initialization. 184 * 185 * This optimization depends on the underlying NIC driver freeing the mblks in 186 * a timely manner after they have been transmitted by the hardware. Some 187 * drivers have been found to flush TX descriptors only when new transmissions 188 * are initiated. This means that there is no upper bound to the time needed 189 * for an mblk to be flushed and can stall bhyve guests from shutting down 190 * since their memory must be free of viona TX references prior to clean-up. 191 * 192 * This expectation of deterministic mblk_t processing is likely the reason 193 * behind the notable exception to the zero-copy TX path: systems with 'bnxe' 194 * loaded will copy transmit data into fresh buffers rather than passing up 195 * zero-copy mblks. It is a hold-over from the original viona sources provided 196 * by Pluribus and its continued necessity has not been confirmed. 197 * 198 * 199 * ---------------------------- 200 * Ring Notification Fast-paths 201 * ---------------------------- 202 * 203 * Device operation for viona requires that notifications flow to and from the 204 * guest to indicate certain ring conditions. In order to minimize latency and 205 * processing overhead, the notification procedures are kept in-kernel whenever 206 * possible. 207 * 208 * Guest-to-host notifications, when new available descriptors have been placed 209 * in the ring, are posted via the 'queue notify' address in the virtio BAR. 210 * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to 211 * install a callback hook on an ioport address. 
Guest exits for accesses to 212 * viona-hooked ioport addresses will result in direct calls to notify the 213 * appropriate ring worker without a trip to userland. 214 * 215 * Host-to-guest notifications in the form of interrupts enjoy similar 216 * acceleration. Each viona ring can be configured to send MSI notifications 217 * to the guest as virtio conditions dictate. This in-kernel interrupt 218 * configuration is kept synchronized through viona ioctls which are utilized 219 * during writes to the associated PCI config registers or MSI-X BAR. 220 * 221 * Guests which do not utilize MSI-X will result in viona falling back to the 222 * slow path for interrupts. It will poll(2) the viona handle, receiving 223 * notification when ring events necessitate the assertion of an interrupt. 224 * 225 * 226 * --------------- 227 * Nethook Support 228 * --------------- 229 * 230 * Viona provides four nethook events that consumers (e.g. ipf) can hook into 231 * to intercept packets as they go up or down the stack. Unfortunately, 232 * the nethook framework does not understand raw packets, so we can only 233 * generate events (in, out) for IPv4 and IPv6 packets. At driver attach, 234 * we register callbacks with the neti (netinfo) module that will be invoked 235 * for each netstack already present, as well as for any additional netstack 236 * instances created as the system operates. These callbacks will 237 * register/unregister the hooks with the nethook framework for each 238 * netstack instance. This registration occurs prior to creating any 239 * viona instances for a given netstack, and the unregistration for a netstack 240 * instance occurs after all viona instances of the netstack instance have 241 * been deleted. 
242 */ 243 244 #include <sys/conf.h> 245 #include <sys/file.h> 246 #include <sys/stat.h> 247 248 #include <sys/dlpi.h> 249 #include <sys/vlan.h> 250 251 #include "viona_impl.h" 252 253 254 #define VIONA_NAME "Virtio Network Accelerator" 255 #define VIONA_CTL_MINOR 0 256 #define VIONA_CLI_NAME "viona" /* MAC client name */ 257 258 259 /* 260 * Host capabilities. 261 */ 262 #define VIONA_S_HOSTCAPS ( \ 263 VIRTIO_NET_F_GUEST_CSUM | \ 264 VIRTIO_NET_F_MAC | \ 265 VIRTIO_NET_F_GUEST_TSO4 | \ 266 VIRTIO_NET_F_MRG_RXBUF | \ 267 VIRTIO_NET_F_STATUS | \ 268 VIRTIO_F_RING_NOTIFY_ON_EMPTY | \ 269 VIRTIO_F_RING_INDIRECT_DESC) 270 271 /* MAC_CAPAB_HCKSUM specifics of interest */ 272 #define VIONA_CAP_HCKSUM_INTEREST \ 273 (HCKSUM_INET_PARTIAL | \ 274 HCKSUM_INET_FULL_V4 | \ 275 HCKSUM_INET_FULL_V6) 276 277 static void *viona_state; 278 static dev_info_t *viona_dip; 279 static id_space_t *viona_minors; 280 281 282 static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, 283 void **result); 284 static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 285 static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 286 static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp); 287 static int viona_close(dev_t dev, int flag, int otype, cred_t *credp); 288 static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode, 289 cred_t *credp, int *rval); 290 static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp, 291 struct pollhead **phpp); 292 293 static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *); 294 static int viona_ioc_delete(viona_soft_state_t *, boolean_t); 295 296 static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t); 297 static int viona_ioc_ring_init(viona_link_t *, void *, int); 298 static int viona_ioc_ring_set_state(viona_link_t *, void *, int); 299 static int viona_ioc_ring_get_state(viona_link_t *, void *, int); 300 static int viona_ioc_ring_reset(viona_link_t *, uint_t); 301 
static int viona_ioc_ring_kick(viona_link_t *, uint_t);
static int viona_ioc_ring_pause(viona_link_t *, uint_t);
static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);

/*
 * Character-device entry points.  Only open/close/ioctl/chpoll are
 * implemented; all data movement happens via MAC callbacks and the
 * in-kernel ring workers rather than read(2)/write(2).
 */
static struct cb_ops viona_cb_ops = {
	viona_open,
	viona_close,
	nodev,
	nodev,
	nodev,
	nodev,
	nodev,
	viona_ioctl,
	nodev,
	nodev,
	nodev,
	viona_chpoll,
	ddi_prop_op,
	0,
	D_MP | D_NEW | D_HOTPLUG,
	CB_REV,
	nodev,
	nodev
};

static struct dev_ops viona_ops = {
	DEVO_REV,
	0,
	viona_info,
	nulldev,
	nulldev,
	viona_attach,
	viona_detach,
	nodev,
	&viona_cb_ops,
	NULL,
	ddi_power,
	ddi_quiesce_not_needed
};

static struct modldrv modldrv = {
	&mod_driverops,
	VIONA_NAME,
	&viona_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

/*
 * Module load: initialize the global driver resources (per-open soft state,
 * the minor number space, the RX subsystem, and the force-copy lock) before
 * registering with the module framework.  Everything is unwound if
 * mod_install() fails.
 */
int
_init(void)
{
	int ret;

	ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0);
	if (ret != 0) {
		return (ret);
	}

	/*
	 * Minor 0 is reserved for the control node, so per-open minors are
	 * allocated from VIONA_CTL_MINOR + 1 upward.
	 */
	viona_minors = id_space_create("viona_minors",
	    VIONA_CTL_MINOR + 1, UINT16_MAX);
	viona_rx_init();
	mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);

	ret = mod_install(&modlinkage);
	if (ret != 0) {
		ddi_soft_state_fini(&viona_state);
		id_space_destroy(viona_minors);
		viona_rx_fini();
		mutex_destroy(&viona_force_copy_lock);
	}

	return (ret);
}

/*
 * Module unload: only tear down global resources once mod_remove() has
 * confirmed there are no remaining consumers.
 */
int
_fini(void)
{
	int ret;

	ret = mod_remove(&modlinkage);
	if (ret != 0) {
		return (ret);
	}

	ddi_soft_state_fini(&viona_state);
	id_space_destroy(viona_minors);
	viona_rx_fini();
	mutex_destroy(&viona_force_copy_lock);

	return (ret);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/*
 * getinfo(9E) entry point: viona is a single-instance pseudo device, so all
 * dev_t queries map to the one dip (instance 0).
 */
/* ARGSUSED */
static int
viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)viona_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

/*
 * attach(9E): create the control minor node and register the netstack
 * (nethook) callbacks before advertising the device.
 */
static int
viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		return (DDI_FAILURE);
	}

	viona_neti_attach();

	viona_dip = dip;
	ddi_report_dev(viona_dip);

	return (DDI_SUCCESS);
}

/*
 * detach(9E): undo attach.  The nethook registration is dropped before the
 * minor node is removed.
 */
static int
viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	dev_info_t *old_dip = viona_dip;

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	VERIFY(old_dip != NULL);

	viona_neti_detach();
	viona_dip = NULL;
	ddi_remove_minor_node(old_dip, NULL);

	return (DDI_SUCCESS);
}

/*
 * open(9E): clone-style open on the control minor.  Each successful open
 * allocates a fresh minor with its own soft state; the link itself is not
 * created until VNA_IOC_CREATE is issued on the new minor.
 */
static int
viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
{
	int minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}
#if 0
	/*
	 * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
	 * Should the check be at open() or ioctl()?
	 */
	if (drv_priv(credp) != 0) {
		return (EPERM);
	}
#endif
	if (getminor(*devp) != VIONA_CTL_MINOR) {
		return (ENXIO);
	}

	minor = id_alloc_nosleep(viona_minors);
	if (minor == -1) {
		/* All minors are busy */
		return (EBUSY);
	}
	if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
		id_free(viona_minors, minor);
		return (ENOMEM);
	}

	ss = ddi_get_soft_state(viona_state, minor);
	mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
	*devp = makedevice(getmajor(*devp), minor);

	return (0);
}

/*
 * close(9E): force-delete any link still associated with this open (the
 * VERIFY0 holds because the on-close delete cannot race with ioctls) and
 * release the soft state and minor number.
 */
static int
viona_close(dev_t dev, int flag, int otype, cred_t *credp)
{
	int minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}

	minor = getminor(dev);

	ss = ddi_get_soft_state(viona_state, minor);
	if (ss == NULL) {
		return (ENXIO);
	}

	VERIFY0(viona_ioc_delete(ss, B_TRUE));
	VERIFY(!list_link_active(&ss->ss_node));
	ddi_soft_state_free(viona_state, minor);
	id_free(viona_minors, minor);

	return (0);
}

/*
 * ioctl(9E): CREATE/DELETE/VERSION are handled without an established link.
 * All other commands require a live link and run under ss_lock, bailing out
 * with ENXIO if the link is being destroyed or the underlying vmm instance
 * has requested release of its holds.
 */
static int
viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
{
	viona_soft_state_t *ss;
	void *dptr = (void *)data;
	int err = 0, val;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_CREATE:
		return (viona_ioc_create(ss, dptr, md, cr));
	case VNA_IOC_DELETE:
		return (viona_ioc_delete(ss, B_FALSE));
	case VNA_IOC_VERSION:
		*rv = VIONA_CURRENT_INTERFACE_VERSION;
		return (0);
	default:
		break;
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed ||
	    vmm_drv_release_reqd(link->l_vm_hold)) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_GET_FEATURES:
		val = VIONA_S_HOSTCAPS | link->l_features_hw;
		if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
			err = EFAULT;
		}
		break;
	case VNA_IOC_SET_FEATURES:
		if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
			err = EFAULT;
			break;
		}
		val &= (VIONA_S_HOSTCAPS | link->l_features_hw);

		/*
		 * TSO offloads are only meaningful alongside the
		 * corresponding checksum offload; strip them otherwise.
		 */
		if ((val & VIRTIO_NET_F_CSUM) == 0)
			val &= ~VIRTIO_NET_F_HOST_TSO4;

		if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
			val &= ~VIRTIO_NET_F_GUEST_TSO4;

		link->l_features = val;
		break;
	case VNA_IOC_RING_INIT:
		err = viona_ioc_ring_init(link, dptr, md);
		break;
	case VNA_IOC_RING_RESET:
		err = viona_ioc_ring_reset(link, (uint_t)data);
		break;
	case VNA_IOC_RING_KICK:
		err = viona_ioc_ring_kick(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_MSI:
		err = viona_ioc_ring_set_msi(link, dptr, md);
		break;
	case VNA_IOC_RING_INTR_CLR:
		err = viona_ioc_ring_intr_clear(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_STATE:
		err = viona_ioc_ring_set_state(link, dptr, md);
		break;
	case VNA_IOC_RING_GET_STATE:
		err = viona_ioc_ring_get_state(link, dptr, md);
		break;
	case VNA_IOC_RING_PAUSE:
		err = viona_ioc_ring_pause(link, (uint_t)data);
		break;

	case VNA_IOC_INTR_POLL:
		err = viona_ioc_intr_poll(link, dptr, md, rv);
		break;
	case VNA_IOC_SET_NOTIFY_IOP:
		if (data < 0 || data > UINT16_MAX) {
			err = EINVAL;
			break;
		}
		err = viona_ioc_set_notify_ioport(link, (uint16_t)data);
		break;
	default:
		err = ENOTTY;
		break;
	}

	mutex_exit(&ss->ss_lock);
	return (err);
}

/*
 * chpoll(9E): the slow-path interrupt mechanism.  POLLRDBAND is asserted
 * when any ring has a pending (enabled) interrupt, allowing userspace to
 * poll(2) the handle when MSI-X acceleration is not in use.
 */
static int
viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	viona_soft_state_t *ss;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	*reventsp = 0;
	if ((events & POLLRDBAND) != 0) {
		for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
			if (link->l_vrings[i].vr_intr_enabled != 0) {
				*reventsp |= POLLRDBAND;
				break;
			}
		}
	}
	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
		*phpp = &link->l_pollhead;
	}
	mutex_exit(&ss->ss_lock);

	return (0);
}

/*
 * Query the underlying MAC for checksum and LSO capabilities, translating
 * them into the virtio feature bits recorded in l_features_hw.
 */
static void
viona_get_mac_capab(viona_link_t *link)
{
	mac_handle_t mh = link->l_mh;
	uint32_t cap = 0;
	mac_capab_lso_t lso_cap;

	link->l_features_hw = 0;
	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
		/*
		 * Only report HW checksum ability if the underlying MAC
		 * resource is capable of populating the L4 header.
		 */
		if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
			link->l_features_hw |= VIRTIO_NET_F_CSUM;
		}
		link->l_cap_csum = cap;
	}

	if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
	    mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
		/*
		 * Virtio doesn't allow for negotiating a maximum LSO
		 * packet size. We have to assume that the guest may
		 * send a maximum length IP packet. Make sure the
		 * underlying MAC can handle an LSO of this size.
		 */
		if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
		    lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
			link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
	}
}

/*
 * VNA_IOC_CREATE: assemble the link resource for this open by taking a hold
 * on the vmm instance (via the passed fd), opening a MAC client on the
 * datalink, and allocating the RX/TX rings.  On any failure, the partially
 * constructed link is torn down at the 'bail' label in reverse order of
 * acquisition.
 */
static int
viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
{
	vioc_create_t kvc;
	viona_link_t *link = NULL;
	char cli_name[MAXNAMELEN];
	int err = 0;
	file_t *fp;
	vmm_hold_t *hold = NULL;
	viona_neti_t *nip = NULL;
	zoneid_t zid;
	mac_diag_t mac_diag = MAC_DIAG_NONE;

	ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));

	if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
		return (EFAULT);
	}

	/* Creation is refused until the zone's nethooks are in place. */
	zid = crgetzoneid(cr);
	nip = viona_neti_lookup_by_zid(zid);
	if (nip == NULL) {
		return (EIO);
	}

	if (!nip->vni_nethook.vnh_hooked) {
		viona_neti_rele(nip);
		return (EIO);
	}

	mutex_enter(&ss->ss_lock);
	if (ss->ss_link != NULL) {
		mutex_exit(&ss->ss_lock);
		viona_neti_rele(nip);
		return (EEXIST);
	}

	if ((fp = getf(kvc.c_vmfd)) == NULL) {
		err = EBADF;
		goto bail;
	}
	err = vmm_drv_hold(fp, cr, &hold);
	releasef(kvc.c_vmfd);
	if (err != 0) {
		goto bail;
	}

	link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
	link->l_linkid = kvc.c_linkid;
	link->l_vm_hold = hold;

	err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
	if (err != 0) {
		goto bail;
	}

	viona_get_mac_capab(link);

	(void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME,
	    link->l_linkid);
	err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
	if (err != 0) {
		goto bail;
	}

	err = mac_unicast_add(link->l_mch, NULL, MAC_UNICAST_PRIMARY,
	    &link->l_muh, VLAN_ID_NONE, &mac_diag);
	if (err != 0) {
		goto bail;
	}

	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);

	if ((err = viona_rx_set(link)) != 0) {
		viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
		viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
		goto bail;
	}

	link->l_neti = nip;
	ss->ss_link = link;
	mutex_exit(&ss->ss_lock);

	/* Track this device on the per-netstack list for teardown ordering */
	mutex_enter(&nip->vni_lock);
	list_insert_tail(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	return (0);

bail:
	if (link != NULL) {
		if (link->l_mch != NULL) {
			if (link->l_muh != NULL) {
				VERIFY0(mac_unicast_remove(link->l_mch,
				    link->l_muh));
				link->l_muh = NULL;
			}
			mac_client_close(link->l_mch, 0);
		}
		if (link->l_mh != NULL) {
			mac_close(link->l_mh);
		}
		kmem_free(link, sizeof (viona_link_t));
	}
	if (hold != NULL) {
		vmm_drv_rele(hold);
	}
	viona_neti_rele(nip);

	mutex_exit(&ss->ss_lock);
	return (err);
}

/*
 * VNA_IOC_DELETE (and the on-close variant): destroy the link.  The
 * l_destroyed flag marks the point of no return; after setting it, the
 * rings are reset (dropping ss_lock so ring workers can make progress) and
 * the MAC client, vmm hold, and netstack linkage are released in order.
 */
static int
viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
{
	viona_link_t *link;
	viona_neti_t *nip = NULL;

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL) {
		/* Link destruction already complete */
		mutex_exit(&ss->ss_lock);
		return (0);
	}

	if (link->l_destroyed) {
		/*
		 * Link destruction has been started by another thread, but has
		 * not completed.  This condition should be impossible to
		 * encounter when performing the on-close destroy of the link,
		 * since racing ioctl accessors must necessarily be absent.
		 */
		VERIFY(!on_close);
		mutex_exit(&ss->ss_lock);
		return (EAGAIN);
	}
	/*
	 * The link deletion cannot fail after this point, continuing until its
	 * successful completion is reached.
	 */
	link->l_destroyed = B_TRUE;

	/*
	 * Tear down the IO port hook so it cannot be used to kick any of the
	 * rings which are about to be reset and stopped.
	 */
	VERIFY0(viona_ioc_set_notify_ioport(link, 0));
	mutex_exit(&ss->ss_lock);

	/*
	 * Return the rings to their reset state, ignoring any possible
	 * interruptions from signals.
	 */
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));

	mutex_enter(&ss->ss_lock);
	if (link->l_mch != NULL) {
		/* Unhook the receive callbacks and close out the client */
		viona_rx_clear(link);
		if (link->l_muh != NULL) {
			VERIFY0(mac_unicast_remove(link->l_mch, link->l_muh));
			link->l_muh = NULL;
		}
		mac_client_close(link->l_mch, 0);
	}
	if (link->l_mh != NULL) {
		mac_close(link->l_mh);
	}
	if (link->l_vm_hold != NULL) {
		vmm_drv_rele(link->l_vm_hold);
		link->l_vm_hold = NULL;
	}

	nip = link->l_neti;
	link->l_neti = NULL;

	viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
	viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
	pollhead_clean(&link->l_pollhead);
	ss->ss_link = NULL;
	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_remove(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	viona_neti_rele(nip);

	kmem_free(link, sizeof (viona_link_t));
	return (0);
}

/*
 * VNA_IOC_RING_INIT: initialize a ring from guest-supplied parameters with
 * fresh (zero) avail/used indices.
 */
static int
viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
{
	vioc_ring_init_t kri;
	int err;

	if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
		return (EFAULT);
	}
	const struct viona_ring_params params = {
		.vrp_pa = kri.ri_qaddr,
		.vrp_size = kri.ri_qsize,
		.vrp_avail_idx = 0,
		.vrp_used_idx = 0,
	};

	err = viona_ring_init(link, kri.ri_index, &params);

	return (err);
}

/*
 * VNA_IOC_RING_SET_STATE: like ring-init, but restores explicit avail/used
 * indices as well (e.g. when resuming a previously saved ring).
 */
static int
viona_ioc_ring_set_state(viona_link_t *link, void *udata, int md)
{
	vioc_ring_state_t krs;
	int err;

	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
		return (EFAULT);
	}
	const struct viona_ring_params params = {
		.vrp_pa = krs.vrs_qaddr,
		.vrp_size = krs.vrs_qsize,
		.vrp_avail_idx = krs.vrs_avail_idx,
		.vrp_used_idx = krs.vrs_used_idx,
	};

	err = viona_ring_init(link, krs.vrs_index, &params);

	return (err);
}

/*
 * VNA_IOC_RING_GET_STATE: copy the current ring parameters and indices back
 * out to the caller.
 */
static int
viona_ioc_ring_get_state(viona_link_t *link, void *udata, int md)
{
	vioc_ring_state_t krs;

	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
		return (EFAULT);
	}

	struct viona_ring_params params;
	int err = viona_ring_get_state(link, krs.vrs_index, &params);
	if (err != 0) {
		return (err);
	}
	krs.vrs_qsize = params.vrp_size;
	krs.vrs_qaddr = params.vrp_pa;
	krs.vrs_avail_idx = params.vrp_avail_idx;
	krs.vrs_used_idx = params.vrp_used_idx;

	if (ddi_copyout(&krs, udata, sizeof (krs), md) != 0) {
		return (EFAULT);
	}
	return (0);
}

/* VNA_IOC_RING_RESET: return the indicated ring to the VRS_RESET state. */
static int
viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	return (viona_ring_reset(ring, B_TRUE));
}

/*
 * VNA_IOC_RING_KICK: wake the ring worker.  Only valid while the ring is in
 * (or headed toward) a runnable state; otherwise EBUSY.
 */
static int
viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;
	int err;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	mutex_enter(&ring->vr_lock);
	switch (ring->vr_state) {
	case VRS_SETUP:
		/*
		 * An early kick to a ring which is starting its worker thread
		 * is fine.  Once that thread is active, it will process the
		 * start-up request immediately.
		 */
		/* FALLTHROUGH */
	case VRS_INIT:
		ring->vr_state_flags |= VRSF_REQ_START;
		/* FALLTHROUGH */
	case VRS_RUN:
		cv_broadcast(&ring->vr_cv);
		err = 0;
		break;
	default:
		err = EBUSY;
		break;
	}
	mutex_exit(&ring->vr_lock);

	return (err);
}

/* VNA_IOC_RING_PAUSE: request that the ring worker pause (but not stop). */
static int
viona_ioc_ring_pause(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	viona_vring_t *ring = &link->l_vrings[idx];
	return (viona_ring_pause(ring));
}

/*
 * VNA_IOC_RING_SET_MSI: record the MSI address/message the ring should use
 * for host-to-guest interrupt notification.
 */
static int
viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
{
	vioc_ring_msi_t vrm;
	viona_vring_t *ring;

	if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
		return (EFAULT);
	}
	if (vrm.rm_index >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	ring = &link->l_vrings[vrm.rm_index];
	mutex_enter(&ring->vr_lock);
	ring->vr_msi_addr = vrm.rm_addr;
	ring->vr_msi_msg = vrm.rm_msg;
	mutex_exit(&ring->vr_lock);

	return (0);
}

/*
 * ioport hook callback for guest 'queue notify' writes: kick the worker for
 * the ring index the guest wrote.  Returning ESRCH defers the access to
 * userspace emulation instead of consuming it here.
 */
static int
viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes,
    uint32_t *val)
{
	viona_link_t *link = (viona_link_t *)arg;

	/*
	 * If the request is a read (in/ins), or directed at a port other than
	 * what we expect to be registered on, ignore it.
	 */
	if (in || port != link->l_notify_ioport) {
		return (ESRCH);
	}

	/* Let userspace handle notifications for rings other than RX/TX. */
	const uint16_t vq = *val;
	if (vq >= VIONA_VQ_MAX) {
		return (ESRCH);
	}

	viona_vring_t *ring = &link->l_vrings[vq];
	int res = 0;

	mutex_enter(&ring->vr_lock);
	if (ring->vr_state == VRS_RUN) {
		cv_broadcast(&ring->vr_cv);
	} else {
		res = ESRCH;
	}
	mutex_exit(&ring->vr_lock);

	return (res);
}

/*
 * VNA_IOC_SET_NOTIFY_IOP: (re)install the in-kernel ioport hook for guest
 * queue notifications.  An ioport of 0 simply removes any existing hook.
 */
static int
viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport)
{
	int err = 0;

	if (link->l_notify_ioport != 0) {
		vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
		link->l_notify_ioport = 0;
	}

	if (ioport != 0) {
		err = vmm_drv_ioport_hook(link->l_vm_hold, ioport,
		    viona_notify_iop, (void *)link, &link->l_notify_cookie);
		if (err == 0) {
			link->l_notify_ioport = ioport;
		}
	}
	return (err);
}

/* VNA_IOC_RING_INTR_CLR: acknowledge (clear) a ring's pending interrupt. */
static int
viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	link->l_vrings[idx].vr_intr_enabled = 0;
	return (0);
}

/*
 * VNA_IOC_INTR_POLL: report per-ring pending-interrupt status to userspace,
 * returning the count of rings with interrupts asserted via *rv.
 */
static int
viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
{
	uint_t cnt = 0;
	vioc_intr_poll_t vip;

	for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
		uint_t val = link->l_vrings[i].vr_intr_enabled;

		vip.vip_status[i] = val;
		if (val != 0) {
			cnt++;
		}
	}

	if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
		return (EFAULT);
	}
	*rv = (int)cnt;
	return (0);
}