/*
 * Copyright (c) 2013  Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2024 Oxide Computer Company
 */

/*
 * viona - VirtIO-Net, Accelerated
 *
 * The purpose of viona is to provide high performance virtio-net devices to
 * bhyve guests.  It does so by sitting directly atop MAC, skipping all of the
 * DLS/DLD stack.
 *
 * --------------------
 * General Architecture
 * --------------------
 *
 * A single viona instance is comprised of a "link" handle and two "rings".
 * After opening the viona device, it must be associated with a MAC network
 * interface and a bhyve (vmm) instance to form its link resource.  This is
 * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
 * passed in to perform the initialization.  With the MAC client opened, and
 * a driver handle to the vmm instance established, the device is ready to be
 * configured by the guest.
 *
 * The userspace portion of bhyve, which interfaces with the PCI device
 * emulation framework, is meant to stay out of the datapath if at all
 * possible.  Configuration changes made via PCI are mapped to actions which
 * will steer the operation of the in-kernel logic.
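 *
 * As a concrete illustration, userspace would establish a link roughly as in
 * the following sketch.  This is hypothetical consumer code, not a copy of
 * bhyve's: the device path and surrounding error handling are assumptions,
 * while VNA_IOC_CREATE and the vioc_create_t fields (c_linkid, c_vmfd) match
 * the definitions consumed by viona_ioc_create() below.
 *
 *      int fd = open("/dev/viona", O_RDWR);    // control node path assumed
 *      vioc_create_t vc = {
 *              .c_linkid = linkid,     // datalink ID of the MAC interface
 *              .c_vmfd = vmfd,         // open descriptor for the vmm device
 *      };
 *      if (ioctl(fd, VNA_IOC_CREATE, &vc) != 0) {
 *              err(EXIT_FAILURE, "failed to create viona link");
 *      }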
 *
 *
 * -----------
 * Ring Basics
 * -----------
 *
 * Each viona link has two viona_vring_t entities, RX and TX, for handling
 * data transfers to and from the guest.  They represent an interface to the
 * standard virtio ring structures.  When initialized and active, each ring
 * is backed by a kernel worker thread (parented to the bhyve process for the
 * instance) which handles ring events.  The RX worker has the simple task of
 * watching for ring shutdown conditions.  The TX worker does that in
 * addition to processing all requests to transmit data.  Data destined for
 * the guest is delivered directly by MAC to viona_rx() when the ring is
 * active.
 *
 *
 * -----------
 * Ring States
 * -----------
 *
 * The viona_vring_t instances follow a simple path through the possible
 * state values represented in virtio_vring_t`vr_state:
 *
 *      +<--------------------------------------------+
 *      |                                             |
 *      V                                             ^
 * +-----------+   This is the initial state when a link is created or
 * | VRS_RESET |   when the ring has been explicitly reset.
 * +-----------+
 *      |                                             ^
 *      |---* ioctl(VNA_IOC_RING_INIT) issued         |
 *      |                                             |
 *      |                                             ^
 *      V
 * +-----------+   The ring parameters (size, guest physical addresses)
 * | VRS_SETUP |   have been set and start-up of the ring worker thread
 * +-----------+   has begun.
 *      |                                             ^
 *      |                                             |
 *      |---* ring worker thread begins execution     |
 *      |                                             |
 *      +--------------------------------------------->+
 *      |             |                               ^
 *      |             |
 *      |             *  If ring shutdown is requested (by ioctl or impending
 *      |                bhyve process death) while the worker thread is
 *      |                starting, the worker will transition the ring to
 *      |                VRS_RESET and exit.
 *      |                                             ^
 *      |                                             |
 *      |<--------------------------------------------<+
 *      |             |                               |
 *      |             |                               ^
 *      |             *  If the ring is requested to pause (but not stop)
 *      |                from the VRS_RUN state, it will return to the
 *      |                VRS_INIT state.
 *      |
 *      |                                             ^
 *      |                                             |
 *      |                                             ^
 *      V
 * +-----------+   The worker thread associated with the ring has started
 * | VRS_INIT  |   executing.  It has allocated any extra resources needed
 * +-----------+   for the ring to operate.
 *      |                                             ^
 *      |                                             |
 *      +--------------------------------------------->+
 *      |             |                               ^
 *      |             |
 *      |             *  If ring shutdown is requested while the worker is
 *      |                waiting in VRS_INIT, it will free any extra resources
 *      |                and transition to VRS_RESET.
 *      |                                             ^
 *      |                                             |
 *      |---* ioctl(VNA_IOC_RING_KICK) issued         |
 *      |                                             ^
 *      V
 * +-----------+   The worker thread associated with the ring is executing
 * |  VRS_RUN  |   workload specific to that ring.
 * +-----------+
 *      |                                             ^
 *      |---* ioctl(VNA_IOC_RING_RESET) issued        |
 *      |     (or bhyve process begins exit)          ^
 *      |
 * +-----------+   The worker thread associated with the ring is in the
 * | VRS_STOP  |   process of exiting.  All outstanding TX and RX
 * +-----------+   requests are allowed to complete, but new requests
 *      |          must be ignored.
 *      |                                             ^
 *      |                                             |
 *      +--------------------------------------------->+
 *
 *
 * While the worker thread is not running, changes to vr_state are only made
 * by viona_ioc_ring_init() under vr_lock.  There, it initializes the ring,
 * starts the worker, and sets the ring state to VRS_SETUP.  Once the worker
 * thread has been started, only it may perform ring state transitions (still
 * under the protection of vr_lock), when requested by outside consumers via
 * vr_state_flags or when the containing bhyve process initiates an exit.
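 *
 * Tying these states to the ioctl interface: a hypothetical userspace
 * consumer might walk a ring from VRS_RESET to VRS_RUN as sketched below.
 * The vioc_ring_init_t fields match those consumed by viona_ioc_ring_init()
 * later in this file; the size and guest-physical address values are
 * placeholders.
 *
 *      vioc_ring_init_t ri = {
 *              .ri_index = VIONA_VQ_TX,        // which ring to initialize
 *              .ri_qsize = 1024,               // descriptor count
 *              .ri_qaddr = qpa,                // guest-physical ring address
 *      };
 *      // VRS_RESET -> VRS_SETUP (-> VRS_INIT once the worker is running)
 *      ioctl(fd, VNA_IOC_RING_INIT, &ri);
 *      // Request the VRS_INIT -> VRS_RUN transition
 *      ioctl(fd, VNA_IOC_RING_KICK, VIONA_VQ_TX);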
 *
 *
 * ----------------------------
 * Transmission mblk_t Handling
 * ----------------------------
 *
 * For incoming frames destined for a bhyve guest, the data must first land
 * in a host OS buffer from the physical NIC before it is copied into the
 * awaiting guest buffer(s).  Outbound frames transmitted by the guest are
 * not bound by this limitation and can avoid extra copying before the
 * buffers are accessed directly by the NIC.  When a guest designates buffers
 * to be transmitted, viona translates the guest-physical addresses contained
 * in the ring descriptors to host-virtual addresses via viona_hold_page().
 * That pointer is wrapped in an mblk_t using a preallocated viona_desb_t for
 * the desballoc().  Doing so increments vr_xfer_outstanding, preventing the
 * ring from being reset (allowing the link to drop its vmm handle to the
 * guest) until all transmit mblks referencing guest memory have been
 * processed.  Allocation of the viona_desb_t entries is done during the
 * VRS_INIT stage of the ring worker thread.  The ring size informs that
 * allocation as the number of concurrent transmissions is limited by the
 * number of descriptors in the ring.  This minimizes allocation in the
 * transmit hot-path by acquiring those fixed-size resources during
 * initialization.
 *
 * This optimization depends on the underlying NIC driver freeing the mblks
 * in a timely manner after they have been transmitted by the hardware.  Some
 * drivers have been found to flush TX descriptors only when new
 * transmissions are initiated.  This means that there is no upper bound to
 * the time needed for an mblk to be flushed and can stall bhyve guests from
 * shutting down since their memory must be free of viona TX references prior
 * to clean-up.
 *
 * This expectation of deterministic mblk_t processing is likely the reason
 * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
 * loaded will copy transmit data into fresh buffers rather than passing up
 * zero-copy mblks.  It is a hold-over from the original viona sources
 * provided by Pluribus and its continued necessity has not been confirmed.
 *
 *
 * ----------------------------
 * Ring Notification Fast-paths
 * ----------------------------
 *
 * Device operation for viona requires that notifications flow to and from
 * the guest to indicate certain ring conditions.  In order to minimize
 * latency and processing overhead, the notification procedures are kept
 * in-kernel whenever possible.
 *
 * Guest-to-host notifications, when new available descriptors have been
 * placed in the ring, are posted via the 'queue notify' address in the
 * virtio BAR.  The vmm_drv_ioport_hook() interface was added to bhyve which
 * allows viona to install a callback hook on an ioport address.  Guest exits
 * for accesses to viona-hooked ioport addresses will result in direct calls
 * to notify the appropriate ring worker without a trip to userland.
 *
 * Host-to-guest notifications in the form of interrupts enjoy similar
 * acceleration.  Each viona ring can be configured to send MSI notifications
 * to the guest as virtio conditions dictate.  This in-kernel interrupt
 * configuration is kept synchronized through viona ioctls which are utilized
 * during writes to the associated PCI config registers or MSI-X BAR.
 *
 * Guests which do not utilize MSI-X will result in viona falling back to the
 * slow path for interrupts: userspace will poll(2) the viona handle,
 * receiving notification when ring events necessitate the assertion of an
 * interrupt.
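 *
 * A rough sketch of that interrupt slow path from the userspace side
 * (hypothetical code, but built on the POLLRDBAND semantics and ioctls
 * implemented by viona_chpoll() and viona_ioc_intr_poll() below):
 *
 *      struct pollfd pfd = { .fd = fd, .events = POLLRDBAND };
 *      if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLRDBAND) != 0) {
 *              vioc_intr_poll_t vip;
 *
 *              // Learn which ring(s) currently require an interrupt
 *              ioctl(fd, VNA_IOC_INTR_POLL, &vip);
 *              for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
 *                      if (vip.vip_status[i] == 0)
 *                              continue;
 *                      // ...inject the interrupt into the guest, then
 *                      // clear the pending state before polling again.
 *                      ioctl(fd, VNA_IOC_RING_INTR_CLR, i);
 *              }
 *      }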
 *
 *
 * ---------------
 * Nethook Support
 * ---------------
 *
 * Viona provides four nethook events that consumers (e.g. ipf) can hook
 * into to intercept packets as they go up or down the stack.  Unfortunately,
 * the nethook framework does not understand raw packets, so we can only
 * generate events (in, out) for IPv4 and IPv6 packets.  At driver attach,
 * we register callbacks with the neti (netinfo) module that will be invoked
 * for each netstack already present, as well as for any additional netstack
 * instances created as the system operates.  These callbacks will
 * register/unregister the hooks with the nethook framework for each
 * netstack instance.  This registration occurs prior to creating any
 * viona instances for a given netstack, and the unregistration for a
 * netstack instance occurs after all viona instances of the netstack
 * instance have been deleted.
 *
 * ------------------
 * Metrics/Statistics
 * ------------------
 *
 * During operation, Viona tracks certain metrics as events occur.
 *
 * One class of metrics, known as the "error stats", refers to abnormal
 * conditions in ring processing which are likely the fault of a misbehaving
 * guest.  These are tracked on a per-ring basis, and are not formally
 * exposed to any consumer besides direct memory access through mdb.
 *
 * The other class of metrics tracked for an instance are the "transfer
 * stats", which are the traditional packets/bytes/errors/drops figures.
 * These are counted per-ring, and then aggregated into link-wide values
 * exposed via kstats.  Atomic operations are used to increment those
 * per-ring stats during operation, and then when a ring is stopped, the
 * values are consolidated into the link-wide values (to prevent loss when
 * the ring is zeroed) under the protection of viona_link`l_stats_lock.  When
 * the kstats are being updated, l_stats_lock is held to protect against a
 * racing consolidation, with the existing per-ring values being added in at
 * update time to provide an accurate figure.
 */

#include <sys/conf.h>
#include <sys/file.h>
#include <sys/stat.h>

#include <sys/dlpi.h>
#include <sys/vlan.h>

#include "viona_impl.h"


#define VIONA_NAME              "Virtio Network Accelerator"
#define VIONA_CTL_MINOR         0
#define VIONA_MODULE_NAME       "viona"
#define VIONA_KSTAT_CLASS       "misc"
#define VIONA_KSTAT_NAME        "viona_stat"


/*
 * Host capabilities.
 */
#define VIONA_S_HOSTCAPS (              \
        VIRTIO_NET_F_GUEST_CSUM |       \
        VIRTIO_NET_F_MAC |              \
        VIRTIO_NET_F_GUEST_TSO4 |       \
        VIRTIO_NET_F_MRG_RXBUF |        \
        VIRTIO_NET_F_STATUS |           \
        VIRTIO_F_RING_NOTIFY_ON_EMPTY | \
        VIRTIO_F_RING_INDIRECT_DESC)

/* MAC_CAPAB_HCKSUM specifics of interest */
#define VIONA_CAP_HCKSUM_INTEREST       \
        (HCKSUM_INET_PARTIAL |          \
        HCKSUM_INET_FULL_V4 |           \
        HCKSUM_INET_FULL_V6)

static void *viona_state;
static dev_info_t *viona_dip;
static id_space_t *viona_minors;


static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
    void **result);
static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
    cred_t *credp, int *rval);
static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);

static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
static int viona_ioc_delete(viona_soft_state_t *, boolean_t);

static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t);
static int viona_ioc_set_promisc(viona_link_t *, viona_promisc_t);
static int viona_ioc_ring_init(viona_link_t *, void *, int);
static int viona_ioc_ring_set_state(viona_link_t *, void *, int);
static int viona_ioc_ring_get_state(viona_link_t *, void *, int);
static int viona_ioc_ring_reset(viona_link_t *, uint_t);
static int viona_ioc_ring_kick(viona_link_t *, uint_t);
static int viona_ioc_ring_pause(viona_link_t *, uint_t);
static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);

static struct cb_ops viona_cb_ops = {
        viona_open,
        viona_close,
        nodev,
        nodev,
        nodev,
        nodev,
        nodev,
        viona_ioctl,
        nodev,
        nodev,
        nodev,
        viona_chpoll,
        ddi_prop_op,
        0,
        D_MP | D_NEW | D_HOTPLUG,
        CB_REV,
        nodev,
        nodev
};

static struct dev_ops viona_ops = {
        DEVO_REV,
        0,
        viona_info,
        nulldev,
        nulldev,
        viona_attach,
        viona_detach,
        nodev,
        &viona_cb_ops,
        NULL,
        ddi_power,
        ddi_quiesce_not_needed
};

static struct modldrv modldrv = {
        &mod_driverops,
        VIONA_NAME,
        &viona_ops,
};

static struct modlinkage modlinkage = {
        MODREV_1, &modldrv, NULL
};

int
_init(void)
{
        int ret;

        ret = ddi_soft_state_init(&viona_state,
            sizeof (viona_soft_state_t), 0);
        if (ret != 0) {
                return (ret);
        }

        viona_minors = id_space_create("viona_minors",
            VIONA_CTL_MINOR + 1, UINT16_MAX);
        viona_rx_init();
        mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);

        ret = mod_install(&modlinkage);
        if (ret != 0) {
                ddi_soft_state_fini(&viona_state);
                id_space_destroy(viona_minors);
                viona_rx_fini();
                mutex_destroy(&viona_force_copy_lock);
        }

        return (ret);
}

int
_fini(void)
{
        int ret;

        ret = mod_remove(&modlinkage);
        if (ret != 0) {
                return (ret);
        }

        ddi_soft_state_fini(&viona_state);
        id_space_destroy(viona_minors);
        viona_rx_fini();
        mutex_destroy(&viona_force_copy_lock);

        return (ret);
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

/* ARGSUSED */
static int
viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
        int error;

        switch (cmd) {
        case DDI_INFO_DEVT2DEVINFO:
                *result = (void *)viona_dip;
                error = DDI_SUCCESS;
                break;
        case DDI_INFO_DEVT2INSTANCE:
                *result = (void *)0;
                error = DDI_SUCCESS;
                break;
        default:
                error = DDI_FAILURE;
                break;
        }
        return (error);
}

static int
viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        if (cmd != DDI_ATTACH) {
                return (DDI_FAILURE);
        }

        if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
            DDI_PSEUDO, 0) != DDI_SUCCESS) {
                return (DDI_FAILURE);
        }

        viona_neti_attach();

        viona_dip = dip;
        ddi_report_dev(viona_dip);

        return (DDI_SUCCESS);
}

static int
viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        dev_info_t *old_dip = viona_dip;

        if (cmd != DDI_DETACH) {
                return (DDI_FAILURE);
        }

        VERIFY(old_dip != NULL);

        viona_neti_detach();
        viona_dip = NULL;
        ddi_remove_minor_node(old_dip, NULL);

        return (DDI_SUCCESS);
}

static int
viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
{
        int minor;
        viona_soft_state_t *ss;

        if (otype != OTYP_CHR) {
                return (EINVAL);
        }
#if 0
        /*
         * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
         * Should the check be at open() or ioctl()?
         */
        if (drv_priv(credp) != 0) {
                return (EPERM);
        }
#endif
        if (getminor(*devp) != VIONA_CTL_MINOR) {
                return (ENXIO);
        }

        minor = id_alloc_nosleep(viona_minors);
        if (minor == -1) {
                /* All minors are busy */
                return (EBUSY);
        }
        if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
                id_free(viona_minors, minor);
                return (ENOMEM);
        }

        ss = ddi_get_soft_state(viona_state, minor);
        mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
        ss->ss_minor = minor;
        *devp = makedevice(getmajor(*devp), minor);

        return (0);
}

static int
viona_close(dev_t dev, int flag, int otype, cred_t *credp)
{
        int minor;
        viona_soft_state_t *ss;

        if (otype != OTYP_CHR) {
                return (EINVAL);
        }

        minor = getminor(dev);

        ss = ddi_get_soft_state(viona_state, minor);
        if (ss == NULL) {
                return (ENXIO);
        }

        VERIFY0(viona_ioc_delete(ss, B_TRUE));
        VERIFY(!list_link_active(&ss->ss_node));
        ddi_soft_state_free(viona_state, minor);
        id_free(viona_minors, minor);

        return (0);
}

static int
viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
{
        viona_soft_state_t *ss;
        void *dptr = (void *)data;
        int err = 0, val;
        viona_link_t *link;

        ss = ddi_get_soft_state(viona_state, getminor(dev));
        if (ss == NULL) {
                return (ENXIO);
        }

        switch (cmd) {
        case VNA_IOC_CREATE:
                return (viona_ioc_create(ss, dptr, md, cr));
        case VNA_IOC_DELETE:
                return (viona_ioc_delete(ss, B_FALSE));
        case VNA_IOC_VERSION:
                *rv = VIONA_CURRENT_INTERFACE_VERSION;
                return (0);
        default:
                break;
        }

        mutex_enter(&ss->ss_lock);
        if ((link = ss->ss_link) == NULL || link->l_destroyed ||
            vmm_drv_release_reqd(link->l_vm_hold)) {
                mutex_exit(&ss->ss_lock);
                return (ENXIO);
        }

        switch (cmd) {
        case VNA_IOC_GET_FEATURES:
                val = VIONA_S_HOSTCAPS | link->l_features_hw;
                if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
                        err = EFAULT;
                }
                break;
        case VNA_IOC_SET_FEATURES:
                if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
                        err = EFAULT;
                        break;
                }
                val &= (VIONA_S_HOSTCAPS | link->l_features_hw);

                if ((val & VIRTIO_NET_F_CSUM) == 0)
                        val &= ~VIRTIO_NET_F_HOST_TSO4;

                if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
                        val &= ~VIRTIO_NET_F_GUEST_TSO4;

                link->l_features = val;
                break;
        case VNA_IOC_RING_INIT:
                err = viona_ioc_ring_init(link, dptr, md);
                break;
        case VNA_IOC_RING_RESET:
                err = viona_ioc_ring_reset(link, (uint_t)data);
                break;
        case VNA_IOC_RING_KICK:
                err = viona_ioc_ring_kick(link, (uint_t)data);
                break;
        case VNA_IOC_RING_SET_MSI:
                err = viona_ioc_ring_set_msi(link, dptr, md);
                break;
        case VNA_IOC_RING_INTR_CLR:
                err = viona_ioc_ring_intr_clear(link, (uint_t)data);
                break;
        case VNA_IOC_RING_SET_STATE:
                err = viona_ioc_ring_set_state(link, dptr, md);
                break;
        case VNA_IOC_RING_GET_STATE:
                err = viona_ioc_ring_get_state(link, dptr, md);
                break;
        case VNA_IOC_RING_PAUSE:
                err = viona_ioc_ring_pause(link, (uint_t)data);
                break;

        case VNA_IOC_INTR_POLL:
                err = viona_ioc_intr_poll(link, dptr, md, rv);
                break;
        case VNA_IOC_SET_NOTIFY_IOP:
                if (data < 0 || data > UINT16_MAX) {
                        err = EINVAL;
                        break;
                }
                err = viona_ioc_set_notify_ioport(link, (uint16_t)data);
                break;
        case VNA_IOC_SET_PROMISC:
                err = viona_ioc_set_promisc(link, (viona_promisc_t)data);
                break;
        default:
                err = ENOTTY;
                break;
        }

        mutex_exit(&ss->ss_lock);
        return (err);
}

static int
viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
        viona_soft_state_t *ss;
        viona_link_t *link;

        ss = ddi_get_soft_state(viona_state, getminor(dev));
        if (ss == NULL) {
                return (ENXIO);
        }

        mutex_enter(&ss->ss_lock);
        if ((link = ss->ss_link) == NULL || link->l_destroyed) {
                mutex_exit(&ss->ss_lock);
                return (ENXIO);
        }

        *reventsp = 0;
        if ((events & POLLRDBAND) != 0) {
                for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
                        if (link->l_vrings[i].vr_intr_enabled != 0) {
                                *reventsp |= POLLRDBAND;
                                break;
                        }
                }
        }
        if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
                *phpp = &link->l_pollhead;
        }
        mutex_exit(&ss->ss_lock);

        return (0);
}

static void
viona_get_mac_capab(viona_link_t *link)
{
        mac_handle_t mh = link->l_mh;
        uint32_t cap = 0;
        mac_capab_lso_t lso_cap;

        link->l_features_hw = 0;
        if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
                /*
                 * Only report HW checksum ability if the underlying MAC
                 * resource is capable of populating the L4 header.
                 */
                if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
                        link->l_features_hw |= VIRTIO_NET_F_CSUM;
                }
                link->l_cap_csum = cap;
        }

        if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
            mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
                /*
                 * Virtio doesn't allow for negotiating a maximum LSO
                 * packet size.  We have to assume that the guest may
                 * send a maximum length IP packet.  Make sure the
                 * underlying MAC can handle an LSO of this size.
                 */
                if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
                    lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
                        link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
        }
}

static int
viona_kstat_update(kstat_t *ksp, int rw)
{
        viona_link_t *link = ksp->ks_private;
        viona_kstats_t *vk = ksp->ks_data;

        /*
         * Avoid the potential for mangled values due to a racing
         * consolidation of stats for a ring by performing the kstat update
         * with l_stats_lock held while adding up the central (link) and
         * ring values.
         */
        mutex_enter(&link->l_stats_lock);

        const viona_transfer_stats_t *ring_stats =
            &link->l_vrings[VIONA_VQ_RX].vr_stats;
        const viona_transfer_stats_t *link_stats = &link->l_stats.vls_rx;

        vk->vk_rx_packets.value.ui64 =
            link_stats->vts_packets + ring_stats->vts_packets;
        vk->vk_rx_bytes.value.ui64 =
            link_stats->vts_bytes + ring_stats->vts_bytes;
        vk->vk_rx_errors.value.ui64 =
            link_stats->vts_errors + ring_stats->vts_errors;
        vk->vk_rx_drops.value.ui64 =
            link_stats->vts_drops + ring_stats->vts_drops;

        ring_stats = &link->l_vrings[VIONA_VQ_TX].vr_stats;
        link_stats = &link->l_stats.vls_tx;

        vk->vk_tx_packets.value.ui64 =
            link_stats->vts_packets + ring_stats->vts_packets;
        vk->vk_tx_bytes.value.ui64 =
            link_stats->vts_bytes + ring_stats->vts_bytes;
        vk->vk_tx_errors.value.ui64 =
            link_stats->vts_errors + ring_stats->vts_errors;
        vk->vk_tx_drops.value.ui64 =
            link_stats->vts_drops + ring_stats->vts_drops;

        mutex_exit(&link->l_stats_lock);

        return (0);
}

static int
viona_kstat_init(viona_soft_state_t *ss, const cred_t *cr)
{
        zoneid_t zid = crgetzoneid(cr);
        kstat_t *ksp;

        ASSERT(MUTEX_HELD(&ss->ss_lock));
        ASSERT3P(ss->ss_kstat, ==, NULL);

        ksp = kstat_create_zone(VIONA_MODULE_NAME, ss->ss_minor,
            VIONA_KSTAT_NAME, VIONA_KSTAT_CLASS, KSTAT_TYPE_NAMED,
            sizeof (viona_kstats_t) / sizeof (kstat_named_t), 0, zid);

        if (ksp == NULL) {
                /*
                 * Without detail from kstat_create_zone(), assume that
                 * resource exhaustion is to blame for the failure.
                 */
                return (ENOMEM);
        }
        ss->ss_kstat = ksp;

        /*
         * If this instance is associated with a non-global zone, make its
         * kstats visible from the GZ.
         */
        if (zid != GLOBAL_ZONEID) {
                kstat_zone_add(ss->ss_kstat, GLOBAL_ZONEID);
        }

        viona_kstats_t *vk = ksp->ks_data;

        kstat_named_init(&vk->vk_rx_packets, "rx_packets", KSTAT_DATA_UINT64);
        kstat_named_init(&vk->vk_rx_bytes, "rx_bytes", KSTAT_DATA_UINT64);
        kstat_named_init(&vk->vk_rx_errors, "rx_errors", KSTAT_DATA_UINT64);
        kstat_named_init(&vk->vk_rx_drops, "rx_drops", KSTAT_DATA_UINT64);
        kstat_named_init(&vk->vk_tx_packets, "tx_packets", KSTAT_DATA_UINT64);
        kstat_named_init(&vk->vk_tx_bytes, "tx_bytes", KSTAT_DATA_UINT64);
        kstat_named_init(&vk->vk_tx_errors, "tx_errors", KSTAT_DATA_UINT64);
        kstat_named_init(&vk->vk_tx_drops, "tx_drops", KSTAT_DATA_UINT64);
        ksp->ks_private = ss->ss_link;
        ksp->ks_update = viona_kstat_update;

        kstat_install(ss->ss_kstat);
        return (0);
}

static void
viona_kstat_fini(viona_soft_state_t *ss)
{
        ASSERT(MUTEX_HELD(&ss->ss_lock));

        if (ss->ss_kstat != NULL) {
                kstat_delete(ss->ss_kstat);
                ss->ss_kstat = NULL;
        }
}

static int
viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
{
        vioc_create_t kvc;
        viona_link_t *link = NULL;
        char cli_name[MAXNAMELEN];
        int err = 0;
        file_t *fp;
        vmm_hold_t *hold = NULL;
        viona_neti_t *nip = NULL;
        zoneid_t zid;
        mac_diag_t mac_diag = MAC_DIAG_NONE;
        boolean_t rings_allocd = B_FALSE;

        ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));

        if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
                return (EFAULT);
        }

        zid = crgetzoneid(cr);
        nip = viona_neti_lookup_by_zid(zid);
        if (nip == NULL) {
                return (EIO);
        }

        if (!nip->vni_nethook.vnh_hooked) {
                viona_neti_rele(nip);
                return (EIO);
        }

        mutex_enter(&ss->ss_lock);
        if (ss->ss_link != NULL) {
                mutex_exit(&ss->ss_lock);
                viona_neti_rele(nip);
                return (EEXIST);
        }

        if ((fp = getf(kvc.c_vmfd)) == NULL) {
                err = EBADF;
                goto bail;
        }
        err = vmm_drv_hold(fp, cr, &hold);
        releasef(kvc.c_vmfd);
        if (err != 0) {
                goto bail;
        }

        link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
        link->l_linkid = kvc.c_linkid;
        link->l_vm_hold = hold;

        err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
        if (err != 0) {
                goto bail;
        }

        viona_get_mac_capab(link);

        (void) snprintf(cli_name, sizeof (cli_name), "%s-%d",
            VIONA_MODULE_NAME, link->l_linkid);
        err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
        if (err != 0) {
                goto bail;
        }

        err = mac_unicast_add(link->l_mch, NULL, MAC_UNICAST_PRIMARY,
            &link->l_muh, VLAN_ID_NONE, &mac_diag);
        if (err != 0) {
                goto bail;
        }

        viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
        viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);
        rings_allocd = B_TRUE;

        /*
         * Default to passing up all multicast traffic in addition to
         * classified unicast.  Guests which have support will change this
         * if they need to via the virtio net control queue; guests without
         * support generally still want to see multicast.
         */
        link->l_promisc = VIONA_PROMISC_MULTI;
        if ((err = viona_rx_set(link, link->l_promisc)) != 0) {
                goto bail;
        }

        link->l_neti = nip;
        ss->ss_link = link;

        if ((err = viona_kstat_init(ss, cr)) != 0) {
                goto bail;
        }

        mutex_exit(&ss->ss_lock);

        mutex_enter(&nip->vni_lock);
        list_insert_tail(&nip->vni_dev_list, ss);
        mutex_exit(&nip->vni_lock);

        return (0);

bail:
        if (link != NULL) {
                viona_rx_clear(link);
                if (link->l_mch != NULL) {
                        if (link->l_muh != NULL) {
                                VERIFY0(mac_unicast_remove(link->l_mch,
                                    link->l_muh));
                                link->l_muh = NULL;
                        }
                        mac_client_close(link->l_mch, 0);
                }
                if (link->l_mh != NULL) {
                        mac_close(link->l_mh);
                }
                if (rings_allocd) {
                        viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
                        viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
                }
                kmem_free(link, sizeof (viona_link_t));
                ss->ss_link = NULL;
        }
        if (hold != NULL) {
                vmm_drv_rele(hold);
        }
        viona_neti_rele(nip);

        mutex_exit(&ss->ss_lock);
        return (err);
}

static int
viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
{
        viona_link_t *link;
        viona_neti_t *nip = NULL;

        mutex_enter(&ss->ss_lock);
        if ((link = ss->ss_link) == NULL) {
                /* Link destruction already complete */
                mutex_exit(&ss->ss_lock);
                return (0);
        }

        if (link->l_destroyed) {
                /*
                 * Link destruction has been started by another thread, but
                 * has not completed.  This condition should be impossible
                 * to encounter when performing the on-close destroy of the
                 * link, since racing ioctl accessors must necessarily be
                 * absent.
                 */
                VERIFY(!on_close);
                mutex_exit(&ss->ss_lock);
                return (EAGAIN);
        }
        /*
         * The link deletion cannot fail after this point, continuing until
         * its successful completion is reached.
         */
        link->l_destroyed = B_TRUE;

        /*
         * Tear down the IO port hook so it cannot be used to kick any of
         * the rings which are about to be reset and stopped.
         */
        VERIFY0(viona_ioc_set_notify_ioport(link, 0));
        mutex_exit(&ss->ss_lock);

        /*
         * Return the rings to their reset state, ignoring any possible
         * interruptions from signals.
         */
        VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
        VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));

        mutex_enter(&ss->ss_lock);
        viona_kstat_fini(ss);
        if (link->l_mch != NULL) {
                /* Unhook the receive callbacks and close out the client */
                viona_rx_clear(link);
                if (link->l_muh != NULL) {
                        VERIFY0(mac_unicast_remove(link->l_mch,
                            link->l_muh));
                        link->l_muh = NULL;
                }
                mac_client_close(link->l_mch, 0);
        }
        if (link->l_mh != NULL) {
                mac_close(link->l_mh);
        }
        if (link->l_vm_hold != NULL) {
                vmm_drv_rele(link->l_vm_hold);
                link->l_vm_hold = NULL;
        }

        nip = link->l_neti;
        link->l_neti = NULL;

        viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
        viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
        pollhead_clean(&link->l_pollhead);
        ss->ss_link = NULL;
        mutex_exit(&ss->ss_lock);

        mutex_enter(&nip->vni_lock);
        list_remove(&nip->vni_dev_list, ss);
        mutex_exit(&nip->vni_lock);

        viona_neti_rele(nip);

        kmem_free(link, sizeof (viona_link_t));
        return (0);
}

static int
viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
{
        vioc_ring_init_t kri;
        int err;

        if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
                return (EFAULT);
        }
        const struct viona_ring_params params = {
                .vrp_pa = kri.ri_qaddr,
                .vrp_size = kri.ri_qsize,
                .vrp_avail_idx = 0,
                .vrp_used_idx = 0,
        };

        err = viona_ring_init(link, kri.ri_index, &params);

        return (err);
}

static int
viona_ioc_ring_set_state(viona_link_t *link, void *udata, int md)
{
        vioc_ring_state_t krs;
        int err;

        if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
                return (EFAULT);
        }
        const struct viona_ring_params params = {
                .vrp_pa = krs.vrs_qaddr,
                .vrp_size = krs.vrs_qsize,
                .vrp_avail_idx = krs.vrs_avail_idx,
                .vrp_used_idx = krs.vrs_used_idx,
        };

        err = viona_ring_init(link, krs.vrs_index, &params);

        return (err);
}

static int
viona_ioc_ring_get_state(viona_link_t *link, void *udata, int md)
{
        vioc_ring_state_t krs;

        if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
                return (EFAULT);
        }

        struct viona_ring_params params;
        int err = viona_ring_get_state(link, krs.vrs_index, &params);
        if (err != 0) {
                return (err);
        }
        krs.vrs_qsize = params.vrp_size;
        krs.vrs_qaddr = params.vrp_pa;
        krs.vrs_avail_idx = params.vrp_avail_idx;
        krs.vrs_used_idx = params.vrp_used_idx;

        if (ddi_copyout(&krs, udata, sizeof (krs), md) != 0) {
                return (EFAULT);
        }
        return (0);
}

static int
viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
{
        viona_vring_t *ring;

        if (idx >= VIONA_VQ_MAX) {
                return (EINVAL);
        }
        ring = &link->l_vrings[idx];

        return (viona_ring_reset(ring, B_TRUE));
}

static int
viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
{
        viona_vring_t *ring;
        int err;

        if (idx >= VIONA_VQ_MAX) {
                return (EINVAL);
        }
        ring = &link->l_vrings[idx];

        mutex_enter(&ring->vr_lock);
        switch (ring->vr_state) {
        case VRS_SETUP:
                /*
                 * An early kick to a ring which is starting its worker
                 * thread is fine.
                 * Once that thread is active, it will process the start-up
                 * request immediately.
                 */
                /* FALLTHROUGH */
        case VRS_INIT:
                ring->vr_state_flags |= VRSF_REQ_START;
                /* FALLTHROUGH */
        case VRS_RUN:
                cv_broadcast(&ring->vr_cv);
                err = 0;
                break;
        default:
                err = EBUSY;
                break;
        }
        mutex_exit(&ring->vr_lock);

        return (err);
}

static int
viona_ioc_ring_pause(viona_link_t *link, uint_t idx)
{
        if (idx >= VIONA_VQ_MAX) {
                return (EINVAL);
        }

        viona_vring_t *ring = &link->l_vrings[idx];
        return (viona_ring_pause(ring));
}

static int
viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
{
        vioc_ring_msi_t vrm;
        viona_vring_t *ring;

        if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
                return (EFAULT);
        }
        if (vrm.rm_index >= VIONA_VQ_MAX) {
                return (EINVAL);
        }

        ring = &link->l_vrings[vrm.rm_index];
        mutex_enter(&ring->vr_lock);
        ring->vr_msi_addr = vrm.rm_addr;
        ring->vr_msi_msg = vrm.rm_msg;
        mutex_exit(&ring->vr_lock);

        return (0);
}

static int
viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes,
    uint32_t *val)
{
        viona_link_t *link = (viona_link_t *)arg;

        /*
         * If the request is a read (in/ins), or directed at a port other
         * than what we expect to be registered on, ignore it.
         */
        if (in || port != link->l_notify_ioport) {
                return (ESRCH);
        }

        /* Let userspace handle notifications for rings other than RX/TX. */
        const uint16_t vq = *val;
        if (vq >= VIONA_VQ_MAX) {
                return (ESRCH);
        }

        viona_vring_t *ring = &link->l_vrings[vq];
        int res = 0;

        mutex_enter(&ring->vr_lock);
        if (ring->vr_state == VRS_RUN) {
                cv_broadcast(&ring->vr_cv);
        } else {
                res = ESRCH;
        }
        mutex_exit(&ring->vr_lock);

        return (res);
}

static int
viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport)
{
        int err = 0;

        if (link->l_notify_ioport != 0) {
                vmm_drv_ioport_unhook(link->l_vm_hold,
                    &link->l_notify_cookie);
                link->l_notify_ioport = 0;
        }

        if (ioport != 0) {
                err = vmm_drv_ioport_hook(link->l_vm_hold, ioport,
                    viona_notify_iop, (void *)link, &link->l_notify_cookie);
                if (err == 0) {
                        link->l_notify_ioport = ioport;
                }
        }
        return (err);
}

static int
viona_ioc_set_promisc(viona_link_t *link, viona_promisc_t mode)
{
        int err;

        if (mode >= VIONA_PROMISC_MAX) {
                return (EINVAL);
        }

        if (mode == link->l_promisc) {
                return (0);
        }

        if ((err = viona_rx_set(link, mode)) != 0) {
                return (err);
        }

        link->l_promisc = mode;
        return (0);
}

static int
viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
{
        if (idx >= VIONA_VQ_MAX) {
                return (EINVAL);
        }

        link->l_vrings[idx].vr_intr_enabled = 0;
        return (0);
}

static int
viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
{
        uint_t cnt = 0;
        vioc_intr_poll_t vip;

        for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
                uint_t val = link->l_vrings[i].vr_intr_enabled;

                vip.vip_status[i] = val;
                if (val != 0) {
                        cnt++;
                }
        }

        if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
                return (EFAULT);
        }
        *rv = (int)cnt;
        return (0);
}