/*
 * Copyright (c) 2013 Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2025 Oxide Computer Company
 */

/*
 * viona - VirtIO-Net, Accelerated
 *
 * The purpose of viona is to provide high performance virtio-net devices to
 * bhyve guests.  It does so by sitting directly atop MAC, skipping all of the
 * DLS/DLD stack.
 *
 * --------------------
 * General Architecture
 * --------------------
 *
 * A single viona instance comprises a "link" handle and two "rings".  After
 * the viona device is opened, it must be associated with a MAC network
 * interface and a bhyve (vmm) instance to form its link resource.  This is
 * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
 * passed in to perform the initialization.  With the MAC client opened, and a
 * driver handle to the vmm instance established, the device is ready to be
 * configured by the guest.
 *
 * The userspace portion of bhyve, which interfaces with the PCI device
 * emulation framework, is meant to stay out of the datapath if at all
 * possible.  Configuration changes made via PCI are mapped to actions which
 * will steer the operation of the in-kernel logic.
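 *
 * As a concrete illustration (a hypothetical userspace sketch, not part of
 * this driver; the device path is assumed and error handling is elided),
 * that association would be established roughly as follows:
 *
 *	int fd = open("/dev/viona", O_RDWR);
 *	vioc_create_t vc = {
 *		.c_linkid = linkid,	(datalink ID of the MAC interface)
 *		.c_vmfd = vmfd,		(open descriptor for the vmm instance)
 *	};
 *	ioctl(fd, VNA_IOC_CREATE, &vc);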
 *
 *
 * -----------
 * Ring Basics
 * -----------
 *
 * Each viona link has two viona_vring_t entities, RX and TX, for handling
 * data transfers to and from the guest.  They represent an interface to the
 * standard virtio ring structures.  When initialized and active, each ring
 * is backed by a kernel worker thread (parented to the bhyve process for the
 * instance) which handles ring events.  The RX worker has the simple task of
 * watching for ring shutdown conditions.  The TX worker does that in
 * addition to processing all requests to transmit data.  Data destined for
 * the guest is delivered directly by MAC to viona_rx() when the ring is
 * active.
 *
 *
 * -----------
 * Ring States
 * -----------
 *
 * The viona_vring_t instances follow a simple path through the possible
 * state values represented in viona_vring_t`vr_state:
 *
 *        +<--------------------------------------------+
 *        |                                             |
 *        V                                             ^
 *  +-----------+  This is the initial state when a link is created or
 *  | VRS_RESET |  when the ring has been explicitly reset.
 *  +-----------+
 *        |                                             ^
 *        |---* ioctl(VNA_IOC_RING_INIT) issued         |
 *        |                                             |
 *        |                                             ^
 *        V
 *  +-----------+  The ring parameters (size, guest physical addresses)
 *  | VRS_SETUP |  have been set and start-up of the ring worker thread
 *  +-----------+  has begun.
 *        |                                             ^
 *        |                                             |
 *        |---* ring worker thread begins execution     |
 *        |                                             |
 *        +-------------------------------------------->+
 *        |     |                                       ^
 *        |     |
 *        |     *  If ring shutdown is requested (by ioctl or impending
 *        |        bhyve process death) while the worker thread is
 *        |        starting, the worker will transition the ring to
 *        |        VRS_RESET and exit.
 *        |                                             ^
 *        |                                             |
 *        |<-------------------------------------------<+
 *        |     |                                       |
 *        |     |                                       ^
 *        |     *  If the ring is requested to pause (but not stop) from
 *        |        the VRS_RUN state, it will return to the VRS_INIT state.
 *        |
 *        |                                             ^
 *        |                                             |
 *        |                                             ^
 *        V
 *  +-----------+  The worker thread associated with the ring has started
 *  | VRS_INIT  |  executing.  It has allocated any extra resources needed
 *  +-----------+  for the ring to operate.
 *        |                                             ^
 *        |                                             |
 *        +-------------------------------------------->+
 *        |     |                                       ^
 *        |     |
 *        |     *  If ring shutdown is requested while the worker is
 *        |        waiting in VRS_INIT, it will free any extra resources
 *        |        and transition to VRS_RESET.
 *        |                                             ^
 *        |                                             |
 *        |---* ioctl(VNA_IOC_RING_KICK) issued         |
 *        |                                             ^
 *        V
 *  +-----------+  The worker thread associated with the ring is executing
 *  | VRS_RUN   |  workload specific to that ring.
 *  +-----------+
 *        |                                             ^
 *        |---* ioctl(VNA_IOC_RING_RESET) issued        |
 *        |     (or bhyve process begins exit)          ^
 *        |
 *  +-----------+  The worker thread associated with the ring is in the
 *  | VRS_STOP  |  process of exiting.  All outstanding TX and RX
 *  +-----------+  requests are allowed to complete, but new requests
 *        |        must be ignored.
 *        |                                             ^
 *        |                                             |
 *        +-------------------------------------------->+
 *
 *
 * While the worker thread is not running, changes to vr_state are only made
 * by viona_ioc_ring_init() under vr_lock.  There, it initializes the ring,
 * starts the worker, and sets the ring state to VRS_SETUP.  Once the worker
 * thread has been started, only it may perform ring state transitions (still
 * under the protection of vr_lock), when requested by outside consumers via
 * vr_state_flags or when the containing bhyve process initiates an exit.
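 *
 * Tying these states to their controlling ioctls, a hypothetical userspace
 * consumer might walk a TX ring through its lifecycle roughly as follows (a
 * sketch only; struct contents are abbreviated and error handling is
 * elided):
 *
 *	vioc_ring_init_t ri = {
 *		.ri_index = VIONA_VQ_TX,
 *		.ri_qsize = qsize,
 *		.ri_qaddr = guest_pa,
 *	};
 *	ioctl(fd, VNA_IOC_RING_INIT, &ri);	(VRS_RESET -> VRS_SETUP,
 *						then worker -> VRS_INIT)
 *	ioctl(fd, VNA_IOC_RING_KICK, VIONA_VQ_TX);   (VRS_INIT -> VRS_RUN)
 *	...
 *	ioctl(fd, VNA_IOC_RING_RESET, VIONA_VQ_TX);  (back to VRS_RESET)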
 *
 *
 * ----------------------------
 * Transmission mblk_t Handling
 * ----------------------------
 *
 * For incoming frames destined for a bhyve guest, the data must first land
 * in a host OS buffer from the physical NIC before it is copied into the
 * awaiting guest buffer(s).  Outbound frames transmitted by the guest are
 * not bound by this limitation and can avoid extra copying before the
 * buffers are accessed directly by the NIC.  When a guest designates buffers
 * to be transmitted, viona translates the guest-physical addresses contained
 * in the ring descriptors to host-virtual addresses via viona_hold_page().
 * The resulting pointer is wrapped in an mblk_t using a preallocated
 * viona_desb_t for the desballoc().  Doing so increments
 * vr_xfer_outstanding, preventing the ring from being reset (allowing the
 * link to drop its vmm handle to the guest) until all transmit mblks
 * referencing guest memory have been processed.  Allocation of the
 * viona_desb_t entries is done during the VRS_INIT stage of the ring worker
 * thread.  The ring size informs that allocation, as the number of
 * concurrent transmissions is limited by the number of descriptors in the
 * ring.  This minimizes allocation in the transmit hot-path by acquiring
 * those fixed-size resources during initialization.
 *
 * This optimization depends on the underlying NIC driver freeing the mblks
 * in a timely manner after they have been transmitted by the hardware.  Some
 * drivers have been found to flush TX descriptors only when new
 * transmissions are initiated.  This means that there is no upper bound on
 * the time needed for an mblk to be flushed, which can stall bhyve guests
 * from shutting down, since their memory must be free of viona TX references
 * prior to clean-up.
 *
 * This expectation of deterministic mblk_t processing is likely the reason
 * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
 * loaded will copy transmit data into fresh buffers rather than passing up
 * zero-copy mblks.  It is a hold-over from the original viona sources
 * provided by Pluribus, and its continued necessity has not been confirmed.
 *
 *
 * ----------------------------
 * Ring Notification Fast-paths
 * ----------------------------
 *
 * Device operation for viona requires that notifications flow to and from
 * the guest to indicate certain ring conditions.  In order to minimize
 * latency and processing overhead, the notification procedures are kept
 * in-kernel whenever possible.
 *
 * Guest-to-host notifications, when new available descriptors have been
 * placed in the ring, are posted via the 'queue notify' address in the
 * virtio BAR.  The vmm_drv_ioport_hook() interface was added to bhyve which
 * allows viona to install a callback hook on an ioport address.  Guest exits
 * for accesses to viona-hooked ioport addresses will result in direct calls
 * to notify the appropriate ring worker without a trip to userland.
 *
 * Host-to-guest notifications in the form of interrupts enjoy similar
 * acceleration.  Each viona ring can be configured to send MSI notifications
 * to the guest as virtio conditions dictate.  This in-kernel interrupt
 * configuration is kept synchronized through viona ioctls which are utilized
 * during writes to the associated PCI config registers or MSI-X BAR.
 *
 * For guests which do not utilize MSI-X, viona falls back to a slower
 * notification path: the userspace portion of bhyve will poll(2) the viona
 * handle, receiving notification when ring events necessitate the assertion
 * of an interrupt.
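 *
 * That slow-path interrupt loop in userspace might look roughly like the
 * following (a hypothetical sketch; error handling is elided):
 *
 *	struct pollfd pfd = { .fd = viona_fd, .events = POLLRDBAND };
 *	while (poll(&pfd, 1, -1) > 0) {
 *		vioc_intr_poll_t vip;
 *
 *		ioctl(viona_fd, VNA_IOC_INTR_POLL, &vip);
 *		(assert a legacy interrupt for each ring flagged in
 *		vip.vip_status, acknowledging each via the
 *		VNA_IOC_RING_INTR_CLR ioctl)
 *	}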
 *
 *
 * ---------------
 * Nethook Support
 * ---------------
 *
 * Viona provides four nethook events that consumers (e.g. ipf) can hook
 * into to intercept packets as they go up or down the stack.  Unfortunately,
 * the nethook framework does not understand raw packets, so we can only
 * generate events (in, out) for IPv4 and IPv6 packets.  At driver attach,
 * we register callbacks with the neti (netinfo) module that will be invoked
 * for each netstack already present, as well as for any additional netstack
 * instances created as the system operates.  These callbacks will
 * register/unregister the hooks with the nethook framework for each
 * netstack instance.  This registration occurs prior to creating any
 * viona instances for a given netstack, and the unregistration for a
 * netstack instance occurs after all viona instances of that netstack
 * instance have been deleted.
 *
 * ------------------
 * Metrics/Statistics
 * ------------------
 *
 * During operation, viona tracks certain metrics as events occur.
 *
 * One class of metrics, known as the "error stats", refers to abnormal
 * conditions in ring processing which are likely the fault of a misbehaving
 * guest.  These are tracked on a per-ring basis, and are not formally
 * exposed to any consumer besides direct memory access through mdb.
 *
 * The other class of metrics tracked for an instance are the "transfer
 * stats", which are the traditional packets/bytes/errors/drops figures.
 * These are counted per-ring, and then aggregated into link-wide values
 * exposed via kstats.  Atomic operations are used to increment those
 * per-ring stats during operation, and then when a ring is stopped, the
 * values are consolidated into the link-wide values (to prevent loss when
 * the ring is zeroed) under the protection of viona_link`l_stats_lock.  When
 * the kstats are being updated, l_stats_lock is held to protect against a
 * racing consolidation, with the existing per-ring values being added in at
 * update time to provide an accurate figure.
 */

#include <sys/conf.h>
#include <sys/file.h>
#include <sys/stat.h>

#include <sys/dlpi.h>
#include <sys/vlan.h>

#include "viona_impl.h"


#define	VIONA_NAME		"Virtio Network Accelerator"
#define	VIONA_CTL_MINOR		0
#define	VIONA_MODULE_NAME	"viona"
#define	VIONA_KSTAT_CLASS	"misc"
#define	VIONA_KSTAT_NAME	"viona_stat"


/*
 * Host capabilities.
 */
#define	VIONA_S_HOSTCAPS	(		\
	VIRTIO_NET_F_GUEST_CSUM |		\
	VIRTIO_NET_F_MAC |			\
	VIRTIO_NET_F_GUEST_TSO4 |		\
	VIRTIO_NET_F_MRG_RXBUF |		\
	VIRTIO_NET_F_STATUS |			\
	VIRTIO_F_RING_NOTIFY_ON_EMPTY |		\
	VIRTIO_F_RING_INDIRECT_DESC)

/* MAC_CAPAB_HCKSUM specifics of interest */
#define	VIONA_CAP_HCKSUM_INTEREST	\
	(HCKSUM_INET_PARTIAL |		\
	HCKSUM_INET_FULL_V4 |		\
	HCKSUM_INET_FULL_V6)

static void *viona_state;
static dev_info_t *viona_dip;
static id_space_t *viona_minors;


static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
    void **result);
static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
    cred_t *credp, int *rval);
static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);

static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
static int viona_ioc_delete(viona_soft_state_t *, boolean_t);

static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t);
static int viona_ioc_set_promisc(viona_link_t *, viona_promisc_t);
static int viona_ioc_get_params(viona_link_t *, void *, int);
static int viona_ioc_set_params(viona_link_t *, void *, int);
static int viona_ioc_ring_init(viona_link_t *, void *, int);
static int viona_ioc_ring_set_state(viona_link_t *, void *, int);
static int viona_ioc_ring_get_state(viona_link_t *, void *, int);
static int viona_ioc_ring_reset(viona_link_t *, uint_t);
static int viona_ioc_ring_kick(viona_link_t *, uint_t);
static int viona_ioc_ring_pause(viona_link_t *, uint_t);
static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);

static void viona_params_get_defaults(viona_link_params_t *);

static struct cb_ops viona_cb_ops = {
	viona_open,		/* cb_open */
	viona_close,		/* cb_close */
	nodev,			/* cb_strategy */
	nodev,			/* cb_print */
	nodev,			/* cb_dump */
	nodev,			/* cb_read */
	nodev,			/* cb_write */
	viona_ioctl,		/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	viona_chpoll,		/* cb_chpoll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* cb_str */
	D_MP | D_NEW | D_HOTPLUG,	/* cb_flag */
	CB_REV,			/* cb_rev */
	nodev,			/* cb_aread */
	nodev			/* cb_awrite */
};

static struct dev_ops viona_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	viona_info,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	viona_attach,		/* devo_attach */
	viona_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&viona_cb_ops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	ddi_power,		/* devo_power */
	ddi_quiesce_not_needed	/* devo_quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,
	VIONA_NAME,
	&viona_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	int ret;

	ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0);
	if (ret != 0) {
		return (ret);
	}

	viona_minors = id_space_create("viona_minors",
	    VIONA_CTL_MINOR + 1, UINT16_MAX);
	viona_rx_init();
	mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);

	ret = mod_install(&modlinkage);
	if (ret != 0) {
		ddi_soft_state_fini(&viona_state);
		id_space_destroy(viona_minors);
		viona_rx_fini();
		mutex_destroy(&viona_force_copy_lock);
	}

	return (ret);
}

int
_fini(void)
{
	int ret;

	ret = mod_remove(&modlinkage);
	if (ret != 0) {
		return (ret);
	}

	ddi_soft_state_fini(&viona_state);
	id_space_destroy(viona_minors);
	viona_rx_fini();
	mutex_destroy(&viona_force_copy_lock);

	return (ret);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/* ARGSUSED */
static int
viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)viona_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

static int
viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		return (DDI_FAILURE);
	}

	viona_neti_attach();

	viona_dip = dip;
	ddi_report_dev(viona_dip);

	return (DDI_SUCCESS);
}

static int
viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	dev_info_t *old_dip = viona_dip;

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	VERIFY(old_dip != NULL);

	viona_neti_detach();
	viona_dip = NULL;
	ddi_remove_minor_node(old_dip, NULL);

	return (DDI_SUCCESS);
}

static int
viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
{
	int minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}
#if 0
	/*
	 * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
	 * Should the check be at open() or ioctl()?
	 */
	if (drv_priv(credp) != 0) {
		return (EPERM);
	}
#endif
	if (getminor(*devp) != VIONA_CTL_MINOR) {
		return (ENXIO);
	}

	minor = id_alloc_nosleep(viona_minors);
	if (minor == -1) {
		/* All minors are busy */
		return (EBUSY);
	}
	if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
		id_free(viona_minors, minor);
		return (ENOMEM);
	}

	ss = ddi_get_soft_state(viona_state, minor);
	mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
	ss->ss_minor = minor;
	*devp = makedevice(getmajor(*devp), minor);

	return (0);
}

static int
viona_close(dev_t dev, int flag, int otype, cred_t *credp)
{
	int minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}

	minor = getminor(dev);

	ss = ddi_get_soft_state(viona_state, minor);
	if (ss == NULL) {
		return (ENXIO);
	}

	VERIFY0(viona_ioc_delete(ss, B_TRUE));
	VERIFY(!list_link_active(&ss->ss_node));
	ddi_soft_state_free(viona_state, minor);
	id_free(viona_minors, minor);

	return (0);
}

static int
viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
{
	viona_soft_state_t *ss;
	void *dptr = (void *)data;
	int err = 0, val;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_CREATE:
		return (viona_ioc_create(ss, dptr, md, cr));
	case VNA_IOC_DELETE:
		return (viona_ioc_delete(ss, B_FALSE));
	case VNA_IOC_VERSION:
		*rv = VIONA_CURRENT_INTERFACE_VERSION;
		return (0);
	case VNA_IOC_DEFAULT_PARAMS:
		/*
		 * With a NULL link parameter, viona_ioc_get_params() will emit
		 * the default parameters with the same error-handling behavior
		 * as VNA_IOC_GET_PARAMS.
		 */
		return (viona_ioc_get_params(NULL, dptr, md));
	default:
		break;
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed ||
	    vmm_drv_release_reqd(link->l_vm_hold)) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_GET_FEATURES:
		val = VIONA_S_HOSTCAPS | link->l_features_hw;
		if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
			err = EFAULT;
		}
		break;
	case VNA_IOC_SET_FEATURES:
		if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
			err = EFAULT;
			break;
		}
		val &= (VIONA_S_HOSTCAPS | link->l_features_hw);

		if ((val & VIRTIO_NET_F_CSUM) == 0)
			val &= ~VIRTIO_NET_F_HOST_TSO4;

		if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
			val &= ~VIRTIO_NET_F_GUEST_TSO4;

		link->l_features = val;
		break;
	case VNA_IOC_RING_INIT:
		err = viona_ioc_ring_init(link, dptr, md);
		break;
	case VNA_IOC_RING_RESET:
		err = viona_ioc_ring_reset(link, (uint_t)data);
		break;
	case VNA_IOC_RING_KICK:
		err = viona_ioc_ring_kick(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_MSI:
		err = viona_ioc_ring_set_msi(link, dptr, md);
		break;
	case VNA_IOC_RING_INTR_CLR:
		err = viona_ioc_ring_intr_clear(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_STATE:
		err = viona_ioc_ring_set_state(link, dptr, md);
		break;
	case VNA_IOC_RING_GET_STATE:
		err = viona_ioc_ring_get_state(link, dptr, md);
		break;
	case VNA_IOC_RING_PAUSE:
		err = viona_ioc_ring_pause(link, (uint_t)data);
		break;

	case VNA_IOC_INTR_POLL:
		err = viona_ioc_intr_poll(link, dptr, md, rv);
		break;
	case VNA_IOC_SET_NOTIFY_IOP:
		if (data < 0 || data > UINT16_MAX) {
			err = EINVAL;
			break;
		}
		err = viona_ioc_set_notify_ioport(link, (uint16_t)data);
		break;
	case VNA_IOC_SET_PROMISC:
		err = viona_ioc_set_promisc(link, (viona_promisc_t)data);
		break;
	case VNA_IOC_GET_PARAMS:
		err = viona_ioc_get_params(link, dptr, md);
		break;
	case VNA_IOC_SET_PARAMS:
		err = viona_ioc_set_params(link, dptr, md);
		break;
	case VNA_IOC_GET_MTU:
		*rv = (int)link->l_mtu;
		break;
	case VNA_IOC_SET_MTU:
		if (data < VIONA_MIN_MTU || data > VIONA_MAX_MTU)
			err = EINVAL;
		else
			link->l_mtu = (uint16_t)data;
		break;
	default:
		err = ENOTTY;
		break;
	}

	mutex_exit(&ss->ss_lock);
	return (err);
}

static int
viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	viona_soft_state_t *ss;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	*reventsp = 0;
	if ((events & POLLRDBAND) != 0) {
		for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
			if (link->l_vrings[i].vr_intr_enabled != 0) {
				*reventsp |= POLLRDBAND;
				break;
			}
		}
	}
	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
		*phpp = &link->l_pollhead;
	}
	mutex_exit(&ss->ss_lock);

	return (0);
}

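/*
 * To illustrate the feature negotiation handled by viona_ioctl() above (a
 * hypothetical userspace sketch, not part of this driver; fd and
 * guest_acked_features are placeholders): the device emulation fetches the
 * offered feature set, intersects it with what the guest acknowledged, and
 * writes the result back.
 *
 *	int feat;
 *	ioctl(fd, VNA_IOC_GET_FEATURES, &feat);    (VIONA_S_HOSTCAPS | hw)
 *	feat &= guest_acked_features;
 *	ioctl(fd, VNA_IOC_SET_FEATURES, &feat);
 */
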
static void
viona_get_mac_capab(viona_link_t *link)
{
	mac_handle_t mh = link->l_mh;
	uint32_t cap = 0;
	mac_capab_lso_t lso_cap;

	link->l_features_hw = 0;
	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
		/*
		 * Only report HW checksum ability if the underlying MAC
		 * resource is capable of populating the L4 header.
		 */
		if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
			link->l_features_hw |= VIRTIO_NET_F_CSUM;
		}
		link->l_cap_csum = cap;
	}

	if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
	    mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
		/*
		 * Virtio doesn't allow for negotiating a maximum LSO
		 * packet size.  We have to assume that the guest may
		 * send a maximum length IP packet.  Make sure the
		 * underlying MAC can handle an LSO of this size.
		 */
		if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
		    lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
			link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
	}
}

static int
viona_kstat_update(kstat_t *ksp, int rw)
{
	viona_link_t *link = ksp->ks_private;
	viona_kstats_t *vk = ksp->ks_data;

	/*
	 * Avoid the potential for mangled values due to a racing consolidation
	 * of stats for a ring by performing the kstat update with l_stats_lock
	 * held while adding up the central (link) and ring values.
	 */
	mutex_enter(&link->l_stats_lock);

	const viona_transfer_stats_t *ring_stats =
	    &link->l_vrings[VIONA_VQ_RX].vr_stats;
	const viona_transfer_stats_t *link_stats = &link->l_stats.vls_rx;

	vk->vk_rx_packets.value.ui64 =
	    link_stats->vts_packets + ring_stats->vts_packets;
	vk->vk_rx_bytes.value.ui64 =
	    link_stats->vts_bytes + ring_stats->vts_bytes;
	vk->vk_rx_errors.value.ui64 =
	    link_stats->vts_errors + ring_stats->vts_errors;
	vk->vk_rx_drops.value.ui64 =
	    link_stats->vts_drops + ring_stats->vts_drops;

	ring_stats = &link->l_vrings[VIONA_VQ_TX].vr_stats;
	link_stats = &link->l_stats.vls_tx;

	vk->vk_tx_packets.value.ui64 =
	    link_stats->vts_packets + ring_stats->vts_packets;
	vk->vk_tx_bytes.value.ui64 =
	    link_stats->vts_bytes + ring_stats->vts_bytes;
	vk->vk_tx_errors.value.ui64 =
	    link_stats->vts_errors + ring_stats->vts_errors;
	vk->vk_tx_drops.value.ui64 =
	    link_stats->vts_drops + ring_stats->vts_drops;

	mutex_exit(&link->l_stats_lock);

	return (0);
}

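/*
 * The kstat installed below can be read from userspace with libkstat, e.g.
 * (a hypothetical sketch; error handling elided, 'minor' assumed known):
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "viona", minor, "viona_stat");
 *	(void) kstat_read(kc, ksp, NULL);
 *	(named values: rx_packets, rx_bytes, ... tx_errors, tx_drops)
 */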
static int
viona_kstat_init(viona_soft_state_t *ss, const cred_t *cr)
{
	zoneid_t zid = crgetzoneid(cr);
	kstat_t *ksp;

	ASSERT(MUTEX_HELD(&ss->ss_lock));
	ASSERT3P(ss->ss_kstat, ==, NULL);

	ksp = kstat_create_zone(VIONA_MODULE_NAME, ss->ss_minor,
	    VIONA_KSTAT_NAME, VIONA_KSTAT_CLASS, KSTAT_TYPE_NAMED,
	    sizeof (viona_kstats_t) / sizeof (kstat_named_t), 0, zid);

	if (ksp == NULL) {
		/*
		 * Without detail from kstat_create_zone(), assume that
		 * resource exhaustion is to blame for the failure.
		 */
		return (ENOMEM);
	}
	ss->ss_kstat = ksp;

	/*
	 * If this instance is associated with a non-global zone, make its
	 * kstats visible from the GZ.
	 */
	if (zid != GLOBAL_ZONEID) {
		kstat_zone_add(ss->ss_kstat, GLOBAL_ZONEID);
	}

	viona_kstats_t *vk = ksp->ks_data;

	kstat_named_init(&vk->vk_rx_packets, "rx_packets", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_rx_bytes, "rx_bytes", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_rx_errors, "rx_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_rx_drops, "rx_drops", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_tx_packets, "tx_packets", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_tx_bytes, "tx_bytes", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_tx_errors, "tx_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_tx_drops, "tx_drops", KSTAT_DATA_UINT64);
	ksp->ks_private = ss->ss_link;
	ksp->ks_update = viona_kstat_update;

	kstat_install(ss->ss_kstat);
	return (0);
}

static void
viona_kstat_fini(viona_soft_state_t *ss)
{
	ASSERT(MUTEX_HELD(&ss->ss_lock));

	if (ss->ss_kstat != NULL) {
		kstat_delete(ss->ss_kstat);
		ss->ss_kstat = NULL;
	}
}

static int
viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
{
	vioc_create_t kvc;
	viona_link_t *link = NULL;
	char cli_name[MAXNAMELEN];
	int err = 0;
	file_t *fp;
	vmm_hold_t *hold = NULL;
	viona_neti_t *nip = NULL;
	zoneid_t zid;
	mac_diag_t mac_diag = MAC_DIAG_NONE;
	boolean_t rings_allocd = B_FALSE;

	ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));

	if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
		return (EFAULT);
	}

	zid = crgetzoneid(cr);
	nip = viona_neti_lookup_by_zid(zid);
	if (nip == NULL) {
		return (EIO);
	}

	if (!nip->vni_nethook.vnh_hooked) {
		viona_neti_rele(nip);
		return (EIO);
	}

	mutex_enter(&ss->ss_lock);
	if (ss->ss_link != NULL) {
		mutex_exit(&ss->ss_lock);
		viona_neti_rele(nip);
		return (EEXIST);
	}

	if ((fp = getf(kvc.c_vmfd)) == NULL) {
		err = EBADF;
		goto bail;
	}
	err = vmm_drv_hold(fp, cr, &hold);
	releasef(kvc.c_vmfd);
	if (err != 0) {
		goto bail;
	}

	link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
	link->l_linkid = kvc.c_linkid;
	link->l_vm_hold = hold;
	link->l_mtu = VIONA_DEFAULT_MTU;

	err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
	if (err != 0) {
		goto bail;
	}

	viona_get_mac_capab(link);
	viona_params_get_defaults(&link->l_params);

	(void) snprintf(cli_name, sizeof (cli_name), "%s-%d",
	    VIONA_MODULE_NAME, link->l_linkid);
	err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
	if (err != 0) {
		goto bail;
	}

	err = mac_unicast_add(link->l_mch, NULL, MAC_UNICAST_PRIMARY,
	    &link->l_muh, VLAN_ID_NONE, &mac_diag);
	if (err != 0) {
		goto bail;
	}

	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);
	rings_allocd = B_TRUE;

	/*
	 * Default to passing up all multicast traffic in addition to
	 * classified unicast.  Guests which have support will change this
	 * if they need to via the virtio net control queue; guests without
	 * support generally still want to see multicast.
	 */
	link->l_promisc = VIONA_PROMISC_MULTI;
	if ((err = viona_rx_set(link, link->l_promisc)) != 0) {
		goto bail;
	}

	link->l_neti = nip;
	ss->ss_link = link;

	if ((err = viona_kstat_init(ss, cr)) != 0) {
		goto bail;
	}

	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_insert_tail(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	return (0);

bail:
	if (link != NULL) {
		viona_rx_clear(link);
		if (link->l_mch != NULL) {
			if (link->l_muh != NULL) {
				VERIFY0(mac_unicast_remove(link->l_mch,
				    link->l_muh));
				link->l_muh = NULL;
			}
			mac_client_close(link->l_mch, 0);
		}
		if (link->l_mh != NULL) {
			mac_close(link->l_mh);
		}
		if (rings_allocd) {
			viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
			viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
		}
		kmem_free(link, sizeof (viona_link_t));
		ss->ss_link = NULL;
	}
	if (hold != NULL) {
		vmm_drv_rele(hold);
	}
	viona_neti_rele(nip);

	mutex_exit(&ss->ss_lock);
	return (err);
}

static int
viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
{
	viona_link_t *link;
	viona_neti_t *nip = NULL;

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL) {
		/* Link destruction already complete */
		mutex_exit(&ss->ss_lock);
		return (0);
	}

	if (link->l_destroyed) {
		/*
		 * Link destruction has been started by another thread, but has
		 * not completed.  This condition should be impossible to
		 * encounter when performing the on-close destroy of the link,
		 * since racing ioctl accessors must necessarily be absent.
		 */
		VERIFY(!on_close);
		mutex_exit(&ss->ss_lock);
		return (EAGAIN);
	}
	/*
	 * The link deletion cannot fail after this point, continuing until
	 * its successful completion is reached.
	 */
	link->l_destroyed = B_TRUE;

	/*
	 * Tear down the IO port hook so it cannot be used to kick any of the
	 * rings which are about to be reset and stopped.
	 */
	VERIFY0(viona_ioc_set_notify_ioport(link, 0));
	mutex_exit(&ss->ss_lock);

	/*
	 * Return the rings to their reset state, ignoring any possible
	 * interruptions from signals.
	 */
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));

	mutex_enter(&ss->ss_lock);
	viona_kstat_fini(ss);
	if (link->l_mch != NULL) {
		/* Unhook the receive callbacks and close out the client */
		viona_rx_clear(link);
		if (link->l_muh != NULL) {
			VERIFY0(mac_unicast_remove(link->l_mch, link->l_muh));
			link->l_muh = NULL;
		}
		mac_client_close(link->l_mch, 0);
	}
	if (link->l_mh != NULL) {
		mac_close(link->l_mh);
	}
	if (link->l_vm_hold != NULL) {
		vmm_drv_rele(link->l_vm_hold);
		link->l_vm_hold = NULL;
	}

	nip = link->l_neti;
	link->l_neti = NULL;

	viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
	viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
	pollhead_clean(&link->l_pollhead);
	ss->ss_link = NULL;
	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_remove(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	viona_neti_rele(nip);

	kmem_free(link, sizeof (viona_link_t));
	return (0);
}

static int
viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
{
	vioc_ring_init_t kri;
	int err;

	if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
		return (EFAULT);
	}
	const struct viona_ring_params params = {
		.vrp_pa = kri.ri_qaddr,
		.vrp_size = kri.ri_qsize,
		.vrp_avail_idx = 0,
		.vrp_used_idx = 0,
	};

	err = viona_ring_init(link, kri.ri_index, &params);

	return (err);
}

static int
viona_ioc_ring_set_state(viona_link_t *link, void *udata, int md)
{
	vioc_ring_state_t krs;
	int err;

	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
		return (EFAULT);
	}
	const struct viona_ring_params params = {
		.vrp_pa = krs.vrs_qaddr,
		.vrp_size = krs.vrs_qsize,
		.vrp_avail_idx = krs.vrs_avail_idx,
		.vrp_used_idx = krs.vrs_used_idx,
	};

	err = viona_ring_init(link, krs.vrs_index, &params);

	return (err);
}

static int
viona_ioc_ring_get_state(viona_link_t *link, void *udata, int md)
{
	vioc_ring_state_t krs;

	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
		return (EFAULT);
	}

	struct viona_ring_params params;
	int err = viona_ring_get_state(link, krs.vrs_index, &params);
	if (err != 0) {
		return (err);
	}
	krs.vrs_qsize = params.vrp_size;
	krs.vrs_qaddr = params.vrp_pa;
	krs.vrs_avail_idx = params.vrp_avail_idx;
	krs.vrs_used_idx = params.vrp_used_idx;

	if (ddi_copyout(&krs, udata, sizeof (krs), md) != 0) {
		return (EFAULT);
	}
	return (0);
}

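/*
 * Together, VNA_IOC_RING_PAUSE and the GET_STATE/SET_STATE ioctls above give
 * a consumer a way to quiesce a ring and capture (or later re-establish) its
 * position, e.g. across a suspend/resume of the instance.  A hypothetical
 * sketch of the saving side (error handling elided):
 *
 *	ioctl(fd, VNA_IOC_RING_PAUSE, idx);
 *	vioc_ring_state_t vrs = { .vrs_index = idx };
 *	ioctl(fd, VNA_IOC_RING_GET_STATE, &vrs);
 *	(... later, the restoring side hands vrs to VNA_IOC_RING_SET_STATE)
 */
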
static int
viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	return (viona_ring_reset(ring, B_TRUE));
}

static int
viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;
	int err;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	mutex_enter(&ring->vr_lock);
	switch (ring->vr_state) {
	case VRS_SETUP:
		/*
		 * An early kick to a ring which is starting its worker thread
		 * is fine.  Once that thread is active, it will process the
		 * start-up request immediately.
		 */
		/* FALLTHROUGH */
	case VRS_INIT:
		ring->vr_state_flags |= VRSF_REQ_START;
		/* FALLTHROUGH */
	case VRS_RUN:
		cv_broadcast(&ring->vr_cv);
		err = 0;
		break;
	default:
		err = EBUSY;
		break;
	}
	mutex_exit(&ring->vr_lock);

	return (err);
}

static int
viona_ioc_ring_pause(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	viona_vring_t *ring = &link->l_vrings[idx];
	return (viona_ring_pause(ring));
}

static int
viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
{
	vioc_ring_msi_t vrm;
	viona_vring_t *ring;

	if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
		return (EFAULT);
	}
	if (vrm.rm_index >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	ring = &link->l_vrings[vrm.rm_index];
	mutex_enter(&ring->vr_lock);
	ring->vr_msi_addr = vrm.rm_addr;
	ring->vr_msi_msg = vrm.rm_msg;
	mutex_exit(&ring->vr_lock);

	return (0);
}

static int
viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes,
    uint32_t *val)
{
	viona_link_t *link = (viona_link_t *)arg;

	/*
	 * If the request is a read (in/ins), or directed at a port other than
	 * the one we expect to be registered on, ignore it.
	 */
	if (in || port != link->l_notify_ioport) {
		return (ESRCH);
	}

	/* Let userspace handle notifications for rings other than RX/TX. */
	const uint16_t vq = *val;
	if (vq >= VIONA_VQ_MAX) {
		return (ESRCH);
	}

	viona_vring_t *ring = &link->l_vrings[vq];
	int res = 0;

	mutex_enter(&ring->vr_lock);
	if (ring->vr_state == VRS_RUN) {
		cv_broadcast(&ring->vr_cv);
	} else {
		res = ESRCH;
	}
	mutex_exit(&ring->vr_lock);

	return (res);
}

static int
viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport)
{
	int err = 0;

	if (link->l_notify_ioport != 0) {
		vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
		link->l_notify_ioport = 0;
	}

	if (ioport != 0) {
		err = vmm_drv_ioport_hook(link->l_vm_hold, ioport,
		    viona_notify_iop, (void *)link, &link->l_notify_cookie);
		if (err == 0) {
			link->l_notify_ioport = ioport;
		}
	}
	return (err);
}

static int
viona_ioc_set_promisc(viona_link_t *link, viona_promisc_t mode)
{
	int err;

	if (mode >= VIONA_PROMISC_MAX) {
		return (EINVAL);
	}

	if (mode == link->l_promisc) {
		return (0);
	}

	if ((err = viona_rx_set(link, mode)) != 0) {
		return (err);
	}

	link->l_promisc = mode;
	return (0);
}

#define	PARAM_NM_TX_COPY_DATA	"tx_copy_data"
#define	PARAM_NM_TX_HEADER_PAD	"tx_header_pad"

#define	PARAM_ERR_INVALID_TYPE	"invalid type"
#define	PARAM_ERR_OUT_OF_RANGE	"value out of range"
#define	PARAM_ERR_UNK_KEY	"unknown key"

static nvlist_t *
viona_params_to_nvlist(const viona_link_params_t *vlp)
{
	nvlist_t *nvl = fnvlist_alloc();

	fnvlist_add_boolean_value(nvl, PARAM_NM_TX_COPY_DATA,
	    vlp->vlp_tx_copy_data);
	fnvlist_add_uint16(nvl, PARAM_NM_TX_HEADER_PAD,
	    vlp->vlp_tx_header_pad);

	return (nvl);
}

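/*
 * Userspace hands parameters to VNA_IOC_SET_PARAMS in the same packed-nvlist
 * form; constructing such a payload with libnvpair might look roughly like
 * the following (a hypothetical sketch; error handling elided):
 *
 *	nvlist_t *nvl = fnvlist_alloc();
 *	fnvlist_add_boolean_value(nvl, "tx_copy_data", B_TRUE);
 *	fnvlist_add_uint16(nvl, "tx_header_pad", 0);
 *	size_t sz;
 *	void *buf = fnvlist_pack(nvl, &sz);
 *	(buf and sz then populate vsp_param and vsp_param_sz)
 */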
static nvlist_t *
viona_params_from_nvlist(nvlist_t *nvl, viona_link_params_t *vlp)
{
	nvlist_t *nverr = fnvlist_alloc();
	nvpair_t *nvp = NULL;

	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		const char *name = nvpair_name(nvp);
		const data_type_t dtype = nvpair_type(nvp);

		if (strcmp(name, PARAM_NM_TX_COPY_DATA) == 0) {
			if (dtype == DATA_TYPE_BOOLEAN_VALUE) {
				vlp->vlp_tx_copy_data =
				    fnvpair_value_boolean_value(nvp);
			} else {
				fnvlist_add_string(nverr, name,
				    PARAM_ERR_INVALID_TYPE);
			}
			continue;
		}
		if (strcmp(name, PARAM_NM_TX_HEADER_PAD) == 0) {
			if (dtype == DATA_TYPE_UINT16) {
				uint16_t value = fnvpair_value_uint16(nvp);

				if (value > viona_max_header_pad) {
					fnvlist_add_string(nverr, name,
					    PARAM_ERR_OUT_OF_RANGE);
				} else {
					vlp->vlp_tx_header_pad = value;
				}
			} else {
				fnvlist_add_string(nverr, name,
				    PARAM_ERR_INVALID_TYPE);
			}
			continue;
		}

		/* Reject parameters we do not recognize */
		fnvlist_add_string(nverr, name, PARAM_ERR_UNK_KEY);
	}

	if (!nvlist_empty(nverr)) {
		return (nverr);
	}

	nvlist_free(nverr);
	return (NULL);
}

static void
viona_params_get_defaults(viona_link_params_t *vlp)
{
	vlp->vlp_tx_copy_data = viona_tx_copy_needed();
	vlp->vlp_tx_header_pad = 0;
}

static int
viona_ioc_get_params(viona_link_t *link, void *udata, int md)
{
	vioc_get_params_t vgp;
	int err = 0;

	if (ddi_copyin(udata, &vgp, sizeof (vgp), md) != 0) {
		return (EFAULT);
	}

	nvlist_t *nvl = NULL;
	if (link != NULL) {
		nvl = viona_params_to_nvlist(&link->l_params);
	} else {
		viona_link_params_t vlp = { 0 };

		viona_params_get_defaults(&vlp);
		nvl = viona_params_to_nvlist(&vlp);
	}

	VERIFY(nvl != NULL);

	size_t packed_sz;
	void *packed = fnvlist_pack(nvl, &packed_sz);
	nvlist_free(nvl);

	if (packed_sz > vgp.vgp_param_sz) {
		err = E2BIG;
	}
	/* Communicate size, even if the data will not fit */
	vgp.vgp_param_sz = packed_sz;

	if (err == 0 &&
	    ddi_copyout(packed, vgp.vgp_param, packed_sz, md) != 0) {
		err = EFAULT;
	}
	kmem_free(packed, packed_sz);

	if (ddi_copyout(&vgp, udata, sizeof (vgp), md) != 0) {
		if (err == 0) {
			err = EFAULT;
		}
	}

	return (err);
}

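/*
 * Because vgp_param_sz is updated to the packed size even when the caller's
 * buffer is too small, a consumer can size its buffer in two passes (a
 * hypothetical sketch):
 *
 *	vioc_get_params_t vgp = { .vgp_param = NULL, .vgp_param_sz = 0 };
 *	ioctl(fd, VNA_IOC_GET_PARAMS, &vgp);	(fails with E2BIG)
 *	vgp.vgp_param = malloc(vgp.vgp_param_sz);
 *	ioctl(fd, VNA_IOC_GET_PARAMS, &vgp);
 */
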
static int
viona_ioc_set_params(viona_link_t *link, void *udata, int md)
{
	vioc_set_params_t vsp;
	int err = 0;
	nvlist_t *nverr = NULL;

	if (ddi_copyin(udata, &vsp, sizeof (vsp), md) != 0) {
		return (EFAULT);
	}

	if (vsp.vsp_param_sz > VIONA_MAX_PARAM_NVLIST_SZ) {
		err = E2BIG;
		goto done;
	} else if (vsp.vsp_param_sz == 0) {
		/*
		 * There is no reason to make this ioctl call with no actual
		 * parameters to be changed.
		 */
		err = EINVAL;
		goto done;
	}

	const size_t packed_sz = vsp.vsp_param_sz;
	void *packed = kmem_alloc(packed_sz, KM_SLEEP);
	if (ddi_copyin(vsp.vsp_param, packed, packed_sz, md) != 0) {
		kmem_free(packed, packed_sz);
		err = EFAULT;
		goto done;
	}

	nvlist_t *parsed = NULL;
	if (nvlist_unpack(packed, packed_sz, &parsed, KM_SLEEP) == 0) {
		/* Use the existing parameters as a starting point */
		viona_link_params_t new_params;
		bcopy(&link->l_params, &new_params,
		    sizeof (new_params));

		nverr = viona_params_from_nvlist(parsed, &new_params);
		if (nverr == NULL) {
			/*
			 * Only apply the updated parameters if there
			 * were no errors during parsing.
			 */
			bcopy(&new_params, &link->l_params,
			    sizeof (new_params));
		} else {
			err = EINVAL;
		}

	} else {
		err = EINVAL;
	}
	nvlist_free(parsed);
	kmem_free(packed, packed_sz);

done:
	if (nverr != NULL) {
		size_t err_packed_sz;
		void *err_packed = fnvlist_pack(nverr, &err_packed_sz);

		if (err_packed_sz > vsp.vsp_error_sz) {
			if (err != 0) {
				err = E2BIG;
			}
		} else if (ddi_copyout(err_packed, vsp.vsp_error,
		    err_packed_sz, md) != 0 && err == 0) {
			err = EFAULT;
		}
		vsp.vsp_error_sz = err_packed_sz;

		nvlist_free(nverr);
		kmem_free(err_packed, err_packed_sz);
	} else {
		/*
		 * If there are no detailed per-field errors, it is important
		 * to communicate that absence to userspace.
		 */
		vsp.vsp_error_sz = 0;
	}

	if (ddi_copyout(&vsp, udata, sizeof (vsp), md) != 0 && err == 0) {
		err = EFAULT;
	}

	return (err);
}

static int
viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	link->l_vrings[idx].vr_intr_enabled = 0;
	return (0);
}

static int
viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
{
	uint_t cnt = 0;
	vioc_intr_poll_t vip;

	for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
		uint_t val = link->l_vrings[i].vr_intr_enabled;

		vip.vip_status[i] = val;
		if (val != 0) {
			cnt++;
		}
	}

	if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
		return (EFAULT);
	}
	*rv = (int)cnt;
	return (0);
}