/*
 * Copyright (c) 2013 Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2024 Oxide Computer Company
 */

/*
 * viona - VirtIO-Net, Accelerated
 *
 * The purpose of viona is to provide high performance virtio-net devices to
 * bhyve guests.  It does so by sitting directly atop MAC, skipping all of the
 * DLS/DLD stack.
 *
 * --------------------
 * General Architecture
 * --------------------
 *
 * A single viona instance is composed of a "link" handle and two "rings".
 * After opening the viona device, it must be associated with a MAC network
 * interface and a bhyve (vmm) instance to form its link resource.  This is
 * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
 * passed in to perform the initialization.  With the MAC client opened, and a
 * driver handle to the vmm instance established, the device is ready to be
 * configured by the guest.
 *
 * The userspace portion of bhyve, which interfaces with the PCI device
 * emulation framework, is meant to stay out of the datapath if at all
 * possible.  Configuration changes made via PCI are mapped to actions which
 * will steer the operation of the in-kernel logic.
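 *
 * As an illustrative sketch (the authoritative definitions live in the viona
 * ioctl headers and the userspace consumer; the control node path is assumed
 * here), link creation from userspace resembles:
 *
 *	int ctl = open("/dev/viona", O_RDWR);
 *	vioc_create_t vc = {
 *		.c_linkid = linkid,	(datalink ID, e.g. from libdladm)
 *		.c_vmfd = vmfd,		(open fd for the bhyve vmm instance)
 *	};
 *	(void) ioctl(ctl, VNA_IOC_CREATE, &vc);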
 *
 *
 * -----------
 * Ring Basics
 * -----------
 *
 * Each viona link has two viona_vring_t entities, RX and TX, for handling data
 * transfers to and from the guest.  They represent an interface to the
 * standard virtio ring structures.  When initialized and active, each ring is
 * backed by a kernel worker thread (parented to the bhyve process for the
 * instance) which handles ring events.  The RX worker has the simple task of
 * watching for ring shutdown conditions.  The TX worker does that in addition
 * to processing all requests to transmit data.  Data destined for the guest is
 * delivered directly by MAC to viona_rx() when the ring is active.
 *
 *
 * -----------
 * Ring States
 * -----------
 *
 * The viona_vring_t instances follow a simple path through the possible state
 * values represented in viona_vring_t`vr_state:
 *
 *       +<--------------------------------------------+
 *       |                                              |
 *       V                                              ^
 * +-----------+  This is the initial state when a link is created or
 * | VRS_RESET |  when the ring has been explicitly reset.
 * +-----------+
 *       |                                              ^
 *       |---* ioctl(VNA_IOC_RING_INIT) issued          |
 *       |                                              |
 *       |                                              ^
 *       V
 * +-----------+  The ring parameters (size, guest physical addresses)
 * | VRS_SETUP |  have been set and start-up of the ring worker thread
 * +-----------+  has begun.
 *       |                                              ^
 *       |                                              |
 *       |---* ring worker thread begins execution      |
 *       |                                              |
 *       +-------------------------------------------->+
 *       |                                              |  ^
 *       |                                              |
 *       |      * If ring shutdown is requested (by ioctl or impending
 *       |        bhyve process death) while the worker thread is
 *       |        starting, the worker will transition the ring to
 *       |        VRS_RESET and exit.
 *       |                                              ^
 *       |                                              |
 *       |<-------------------------------------------<+
 *       |                                              |  |
 *       |                                              |  ^
 *       |      * If ring is requested to pause (but not stop) from the
 *       |        VRS_RUN state, it will return to the VRS_INIT state.
 *       |
 *       |                                              ^
 *       |                                              |
 *       |                                              ^
 *       V
 * +-----------+  The worker thread associated with the ring has started
 * | VRS_INIT  |  executing.  It has allocated any extra resources needed
 * +-----------+  for the ring to operate.
 *       |                                              ^
 *       |                                              |
 *       +-------------------------------------------->+
 *       |                                              |  ^
 *       |                                              |
 *       |      * If ring shutdown is requested while the worker is
 *       |        waiting in VRS_INIT, it will free any extra resources
 *       |        and transition to VRS_RESET.
 *       |                                              ^
 *       |                                              |
 *       |--* ioctl(VNA_IOC_RING_KICK) issued           |
 *       |                                              ^
 *       V
 * +-----------+  The worker thread associated with the ring is executing
 * |  VRS_RUN  |  workload specific to that ring.
 * +-----------+
 *       |                                              ^
 *       |---* ioctl(VNA_IOC_RING_RESET) issued         |
 *       |     (or bhyve process begins exit)           ^
 *       |
 * +-----------+  The worker thread associated with the ring is in the
 * | VRS_STOP  |  process of exiting.  All outstanding TX and RX
 * +-----------+  requests are allowed to complete, but new requests
 *       |        must be ignored.
 *       |                                              ^
 *       |                                              |
 *       +-------------------------------------------->+
 *
 *
 * While the worker thread is not running, changes to vr_state are only made by
 * viona_ioc_ring_init() under vr_lock.  There, it initializes the ring, starts
 * the worker, and sets the ring state to VRS_SETUP.  Once the worker thread
 * has been started, only it may perform ring state transitions (still under
 * the protection of vr_lock), when requested by outside consumers via
 * vr_state_flags or when the containing bhyve process initiates an exit.
 *
 *
 * ----------------------------
 * Transmission mblk_t Handling
 * ----------------------------
 *
 * For incoming frames destined for a bhyve guest, the data must first land in
 * a host OS buffer from the physical NIC before it is copied into the awaiting
 * guest buffer(s).  Outbound frames transmitted by the guest are not bound by
 * this limitation and can avoid extra copying before the buffers are accessed
 * directly by the NIC.  When a guest designates buffers to be transmitted,
 * viona translates the guest-physical addresses contained in the ring
 * descriptors to host-virtual addresses via viona_hold_page().  That pointer
 * is wrapped in an mblk_t using a preallocated viona_desb_t for the
 * desballoc().  Doing so increments vr_xfer_outstanding, preventing the ring
 * from being reset (allowing the link to drop its vmm handle to the guest)
 * until all transmit mblks referencing guest memory have been processed.
 * Allocation of the viona_desb_t entries is done during the VRS_INIT stage of
 * the ring worker thread.  The ring size informs that allocation, since the
 * number of concurrent transmissions is limited by the number of descriptors
 * in the ring.  This minimizes allocation in the transmit hot-path by
 * acquiring those fixed-size resources during initialization.
 *
 * This optimization depends on the underlying NIC driver freeing the mblks in
 * a timely manner after they have been transmitted by the hardware.  Some
 * drivers have been found to flush TX descriptors only when new transmissions
 * are initiated.  This means that there is no upper bound on the time needed
 * for an mblk to be flushed, which can stall bhyve guests from shutting down,
 * since their memory must be free of viona TX references prior to clean-up.
 *
 * This expectation of deterministic mblk_t processing is likely the reason
 * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
 * loaded will copy transmit data into fresh buffers rather than passing up
 * zero-copy mblks.  It is a hold-over from the original viona sources provided
 * by Pluribus and its continued necessity has not been confirmed.
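 *
 * A minimal sketch of the wrapping described above (the viona_desb_t field
 * names and the free callback here are hypothetical; the actual
 * implementation lives in the TX path), pinning a guest buffer into an
 * mblk_t, looks roughly like:
 *
 *	frtn_t *frtn = &dp->d_frtn;		(dp: preallocated viona_desb_t)
 *	frtn->free_func = viona_desb_release;	(drops vr_xfer_outstanding)
 *	frtn->free_arg = (caddr_t)dp;
 *	mblk_t *mp = desballoc(va, len, 0, frtn);	(va: held guest page)
 *	atomic_inc_uint(&ring->vr_xfer_outstanding);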
 *
 *
 * ----------------------------
 * Ring Notification Fast-paths
 * ----------------------------
 *
 * Device operation for viona requires that notifications flow to and from the
 * guest to indicate certain ring conditions.  In order to minimize latency and
 * processing overhead, the notification procedures are kept in-kernel whenever
 * possible.
 *
 * Guest-to-host notifications, when new available descriptors have been placed
 * in the ring, are posted via the 'queue notify' address in the virtio BAR.
 * The vmm_drv_ioport_hook() interface was added to bhyve, allowing viona to
 * install a callback hook on an ioport address.  Guest exits for accesses to
 * viona-hooked ioport addresses will result in direct calls to notify the
 * appropriate ring worker, without a trip to userland.
 *
 * Host-to-guest notifications in the form of interrupts enjoy similar
 * acceleration.  Each viona ring can be configured to send MSI notifications
 * to the guest as virtio conditions dictate.  This in-kernel interrupt
 * configuration is kept synchronized through viona ioctls which are utilized
 * during writes to the associated PCI config registers or MSI-X BAR.
 *
 * Guests which do not utilize MSI-X cause viona to fall back to the slow path
 * for interrupts: the bhyve userspace process will poll(2) the viona handle,
 * receiving notification when ring events necessitate the assertion of an
 * interrupt.
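 *
 * A sketch of that slow path (illustrative only; vfd is an assumed viona
 * descriptor and the interrupt injection itself is elided):
 *
 *	struct pollfd pfd = { .fd = vfd, .events = POLLRDBAND };
 *	(void) poll(&pfd, 1, -1);
 *	vioc_intr_poll_t vip;
 *	(void) ioctl(vfd, VNA_IOC_INTR_POLL, &vip);
 *	for each i where vip.vip_status[i] != 0:
 *		... assert the interrupt for ring i ...
 *		(void) ioctl(vfd, VNA_IOC_RING_INTR_CLR, i);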
 *
 *
 * ---------------
 * Nethook Support
 * ---------------
 *
 * Viona provides four nethook events that consumers (e.g. ipf) can hook into
 * to intercept packets as they go up or down the stack.  Unfortunately,
 * the nethook framework does not understand raw packets, so we can only
 * generate events (in, out) for IPv4 and IPv6 packets.  At driver attach,
 * we register callbacks with the neti (netinfo) module that will be invoked
 * for each netstack already present, as well as for any additional netstack
 * instances created as the system operates.  These callbacks will
 * register/unregister the hooks with the nethook framework for each
 * netstack instance.  This registration occurs prior to creating any
 * viona instances for a given netstack, and the unregistration for a netstack
 * instance occurs after all viona instances on that netstack have been
 * deleted.
 *
 * ------------------
 * Metrics/Statistics
 * ------------------
 *
 * During operation, viona tracks certain metrics as events occur.
 *
 * One class of metrics, known as the "error stats", refers to abnormal
 * conditions in ring processing which are likely the fault of a misbehaving
 * guest.  These are tracked on a per-ring basis, and are not formally exposed
 * to any consumer besides direct memory access through mdb.
 *
 * The other class of metrics tracked for an instance are the "transfer stats",
 * which are the traditional packets/bytes/errors/drops figures.  These are
 * counted per-ring, and then aggregated into link-wide values exposed via
 * kstats.  Atomic operations are used to increment those per-ring stats during
 * operation, and then when a ring is stopped, the values are consolidated into
 * the link-wide values (to prevent loss when the ring is zeroed) under the
 * protection of viona_link`l_stats_lock.  When the kstats are being updated,
 * l_stats_lock is held to protect against a racing consolidation, with the
 * existing per-ring values being added in at update time to provide an
 * accurate figure.
 */

#include <sys/conf.h>
#include <sys/file.h>
#include <sys/stat.h>

#include <sys/dlpi.h>
#include <sys/vlan.h>

#include "viona_impl.h"


#define	VIONA_NAME		"Virtio Network Accelerator"
#define	VIONA_CTL_MINOR		0
#define	VIONA_MODULE_NAME	"viona"
#define	VIONA_KSTAT_CLASS	"misc"
#define	VIONA_KSTAT_NAME	"viona_stat"


/*
 * Host capabilities.
 */
#define	VIONA_S_HOSTCAPS	(	\
	VIRTIO_NET_F_GUEST_CSUM |	\
	VIRTIO_NET_F_MAC |		\
	VIRTIO_NET_F_GUEST_TSO4 |	\
	VIRTIO_NET_F_MRG_RXBUF |	\
	VIRTIO_NET_F_STATUS |		\
	VIRTIO_F_RING_NOTIFY_ON_EMPTY |	\
	VIRTIO_F_RING_INDIRECT_DESC)

/* MAC_CAPAB_HCKSUM specifics of interest */
#define	VIONA_CAP_HCKSUM_INTEREST	\
	(HCKSUM_INET_PARTIAL |		\
	HCKSUM_INET_FULL_V4 |		\
	HCKSUM_INET_FULL_V6)

static void *viona_state;
static dev_info_t *viona_dip;
static id_space_t *viona_minors;


static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
    void **result);
static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
    cred_t *credp, int *rval);
static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);

static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
static int viona_ioc_delete(viona_soft_state_t *, boolean_t);

static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t);
static int viona_ioc_set_promisc(viona_link_t *, viona_promisc_t);
static int viona_ioc_get_params(viona_link_t *, void *, int);
static int viona_ioc_set_params(viona_link_t *, void *, int);
static int viona_ioc_ring_init(viona_link_t *, void *, int);
static int viona_ioc_ring_set_state(viona_link_t *, void *, int);
static int viona_ioc_ring_get_state(viona_link_t *, void *, int);
static int viona_ioc_ring_reset(viona_link_t *, uint_t);
static int viona_ioc_ring_kick(viona_link_t *, uint_t);
static int viona_ioc_ring_pause(viona_link_t *, uint_t);
static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);

static void viona_params_get_defaults(viona_link_params_t *);

static struct cb_ops viona_cb_ops = {
	viona_open,
	viona_close,
	nodev,
	nodev,
	nodev,
	nodev,
	nodev,
	viona_ioctl,
	nodev,
	nodev,
	nodev,
	viona_chpoll,
	ddi_prop_op,
	0,
	D_MP | D_NEW | D_HOTPLUG,
	CB_REV,
	nodev,
	nodev
};

static struct dev_ops viona_ops = {
	DEVO_REV,
	0,
	viona_info,
	nulldev,
	nulldev,
	viona_attach,
	viona_detach,
	nodev,
	&viona_cb_ops,
	NULL,
	ddi_power,
	ddi_quiesce_not_needed
};

static struct modldrv modldrv = {
	&mod_driverops,
	VIONA_NAME,
	&viona_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	int ret;

	ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0);
	if (ret != 0) {
		return (ret);
	}

	viona_minors = id_space_create("viona_minors",
	    VIONA_CTL_MINOR + 1, UINT16_MAX);
	viona_rx_init();
	mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);

	ret = mod_install(&modlinkage);
	if (ret != 0) {
		ddi_soft_state_fini(&viona_state);
		id_space_destroy(viona_minors);
		viona_rx_fini();
		mutex_destroy(&viona_force_copy_lock);
	}

	return (ret);
}

int
_fini(void)
{
	int ret;

	ret = mod_remove(&modlinkage);
	if (ret != 0) {
		return (ret);
	}

	ddi_soft_state_fini(&viona_state);
	id_space_destroy(viona_minors);
	viona_rx_fini();
	mutex_destroy(&viona_force_copy_lock);

	return (ret);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/* ARGSUSED */
static int
viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)viona_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

static int
viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		return (DDI_FAILURE);
	}

	viona_neti_attach();

	viona_dip = dip;
	ddi_report_dev(viona_dip);

	return (DDI_SUCCESS);
}

static int
viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	dev_info_t *old_dip = viona_dip;

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	VERIFY(old_dip != NULL);

	viona_neti_detach();
	viona_dip = NULL;
	ddi_remove_minor_node(old_dip, NULL);

	return (DDI_SUCCESS);
}

static int
viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
{
	int minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}
#if 0
	/*
	 * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
	 * Should the check be at open() or ioctl()?
	 */
	if (drv_priv(credp) != 0) {
		return (EPERM);
	}
#endif
	if (getminor(*devp) != VIONA_CTL_MINOR) {
		return (ENXIO);
	}

	minor = id_alloc_nosleep(viona_minors);
	if (minor == -1) {
		/* All minors are busy */
		return (EBUSY);
	}
	if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
		id_free(viona_minors, minor);
		return (ENOMEM);
	}

	ss = ddi_get_soft_state(viona_state, minor);
	mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
	ss->ss_minor = minor;
	*devp = makedevice(getmajor(*devp), minor);

	return (0);
}

static int
viona_close(dev_t dev, int flag, int otype, cred_t *credp)
{
	int minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}

	minor = getminor(dev);

	ss = ddi_get_soft_state(viona_state, minor);
	if (ss == NULL) {
		return (ENXIO);
	}

	VERIFY0(viona_ioc_delete(ss, B_TRUE));
	VERIFY(!list_link_active(&ss->ss_node));
	ddi_soft_state_free(viona_state, minor);
	id_free(viona_minors, minor);

	return (0);
}

static int
viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
{
	viona_soft_state_t *ss;
	void *dptr = (void *)data;
	int err = 0, val;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_CREATE:
		return (viona_ioc_create(ss, dptr, md, cr));
	case VNA_IOC_DELETE:
		return (viona_ioc_delete(ss, B_FALSE));
	case VNA_IOC_VERSION:
		*rv = VIONA_CURRENT_INTERFACE_VERSION;
		return (0);
	case VNA_IOC_DEFAULT_PARAMS:
		/*
		 * With a NULL link parameter, viona_ioc_get_params() will emit
		 * the default parameters with the same error-handling behavior
		 * as VNA_IOC_GET_PARAMS.
		 */
		return (viona_ioc_get_params(NULL, dptr, md));
	default:
		break;
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed ||
	    vmm_drv_release_reqd(link->l_vm_hold)) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_GET_FEATURES:
		val = VIONA_S_HOSTCAPS | link->l_features_hw;
		if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
			err = EFAULT;
		}
		break;
	case VNA_IOC_SET_FEATURES:
		if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
			err = EFAULT;
			break;
		}
		val &= (VIONA_S_HOSTCAPS | link->l_features_hw);

		if ((val & VIRTIO_NET_F_CSUM) == 0)
			val &= ~VIRTIO_NET_F_HOST_TSO4;

		if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
			val &= ~VIRTIO_NET_F_GUEST_TSO4;

		link->l_features = val;
		break;
	case VNA_IOC_RING_INIT:
		err = viona_ioc_ring_init(link, dptr, md);
		break;
	case VNA_IOC_RING_RESET:
		err = viona_ioc_ring_reset(link, (uint_t)data);
		break;
	case VNA_IOC_RING_KICK:
		err = viona_ioc_ring_kick(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_MSI:
		err = viona_ioc_ring_set_msi(link, dptr, md);
		break;
	case VNA_IOC_RING_INTR_CLR:
		err = viona_ioc_ring_intr_clear(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_STATE:
		err = viona_ioc_ring_set_state(link, dptr, md);
		break;
	case VNA_IOC_RING_GET_STATE:
		err = viona_ioc_ring_get_state(link, dptr, md);
		break;
	case VNA_IOC_RING_PAUSE:
		err = viona_ioc_ring_pause(link, (uint_t)data);
		break;

	case VNA_IOC_INTR_POLL:
		err = viona_ioc_intr_poll(link, dptr, md, rv);
		break;
	case VNA_IOC_SET_NOTIFY_IOP:
		if (data < 0 || data > UINT16_MAX) {
			err = EINVAL;
			break;
		}
		err = viona_ioc_set_notify_ioport(link, (uint16_t)data);
		break;
	case VNA_IOC_SET_PROMISC:
		err = viona_ioc_set_promisc(link, (viona_promisc_t)data);
		break;
	case VNA_IOC_GET_PARAMS:
		err = viona_ioc_get_params(link, dptr, md);
		break;
	case VNA_IOC_SET_PARAMS:
		err = viona_ioc_set_params(link, dptr, md);
		break;
	default:
		err = ENOTTY;
		break;
	}

	mutex_exit(&ss->ss_lock);
	return (err);
}

static int
viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	viona_soft_state_t *ss;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	*reventsp = 0;
	if ((events & POLLRDBAND) != 0) {
		for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
			if (link->l_vrings[i].vr_intr_enabled != 0) {
				*reventsp |= POLLRDBAND;
				break;
			}
		}
	}
	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
		*phpp = &link->l_pollhead;
	}
	mutex_exit(&ss->ss_lock);

	return (0);
}

static void
viona_get_mac_capab(viona_link_t *link)
{
	mac_handle_t mh = link->l_mh;
	uint32_t cap = 0;
	mac_capab_lso_t lso_cap;

	link->l_features_hw = 0;
	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
		/*
		 * Only report HW checksum ability if the underlying MAC
		 * resource is capable of populating the L4 header.
		 */
		if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
			link->l_features_hw |= VIRTIO_NET_F_CSUM;
		}
		link->l_cap_csum = cap;
	}

	if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
	    mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
		/*
		 * Virtio doesn't allow for negotiating a maximum LSO
		 * packet size.  We have to assume that the guest may
		 * send a maximum length IP packet.  Make sure the
		 * underlying MAC can handle an LSO of this size.
		 */
		if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
		    lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
			link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
	}
}

static int
viona_kstat_update(kstat_t *ksp, int rw)
{
	viona_link_t *link = ksp->ks_private;
	viona_kstats_t *vk = ksp->ks_data;

	/*
	 * Avoid the potential for mangled values due to a racing consolidation
	 * of stats for a ring by performing the kstat update with l_stats_lock
	 * held while adding up the central (link) and ring values.
	 */
	mutex_enter(&link->l_stats_lock);

	const viona_transfer_stats_t *ring_stats =
	    &link->l_vrings[VIONA_VQ_RX].vr_stats;
	const viona_transfer_stats_t *link_stats = &link->l_stats.vls_rx;

	vk->vk_rx_packets.value.ui64 =
	    link_stats->vts_packets + ring_stats->vts_packets;
	vk->vk_rx_bytes.value.ui64 =
	    link_stats->vts_bytes + ring_stats->vts_bytes;
	vk->vk_rx_errors.value.ui64 =
	    link_stats->vts_errors + ring_stats->vts_errors;
	vk->vk_rx_drops.value.ui64 =
	    link_stats->vts_drops + ring_stats->vts_drops;

	ring_stats = &link->l_vrings[VIONA_VQ_TX].vr_stats;
	link_stats = &link->l_stats.vls_tx;

	vk->vk_tx_packets.value.ui64 =
	    link_stats->vts_packets + ring_stats->vts_packets;
	vk->vk_tx_bytes.value.ui64 =
	    link_stats->vts_bytes + ring_stats->vts_bytes;
	vk->vk_tx_errors.value.ui64 =
	    link_stats->vts_errors + ring_stats->vts_errors;
	vk->vk_tx_drops.value.ui64 =
	    link_stats->vts_drops + ring_stats->vts_drops;

	mutex_exit(&link->l_stats_lock);

	return (0);
}

static int
viona_kstat_init(viona_soft_state_t *ss, const cred_t *cr)
{
	zoneid_t zid = crgetzoneid(cr);
	kstat_t *ksp;

	ASSERT(MUTEX_HELD(&ss->ss_lock));
	ASSERT3P(ss->ss_kstat, ==, NULL);

	ksp = kstat_create_zone(VIONA_MODULE_NAME, ss->ss_minor,
	    VIONA_KSTAT_NAME, VIONA_KSTAT_CLASS, KSTAT_TYPE_NAMED,
	    sizeof (viona_kstats_t) / sizeof (kstat_named_t), 0, zid);

	if (ksp == NULL) {
		/*
		 * Without detail from kstat_create_zone(), assume that
		 * resource exhaustion is to blame for the failure.
		 */
		return (ENOMEM);
	}
	ss->ss_kstat = ksp;

	/*
	 * If this instance is associated with a non-global zone, make its
	 * kstats visible from the GZ.
	 */
	if (zid != GLOBAL_ZONEID) {
		kstat_zone_add(ss->ss_kstat, GLOBAL_ZONEID);
	}

	viona_kstats_t *vk = ksp->ks_data;

	kstat_named_init(&vk->vk_rx_packets, "rx_packets", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_rx_bytes, "rx_bytes", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_rx_errors, "rx_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_rx_drops, "rx_drops", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_tx_packets, "tx_packets", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_tx_bytes, "tx_bytes", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_tx_errors, "tx_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_tx_drops, "tx_drops", KSTAT_DATA_UINT64);
	ksp->ks_private = ss->ss_link;
	ksp->ks_update = viona_kstat_update;

	kstat_install(ss->ss_kstat);
	return (0);
}

static void
viona_kstat_fini(viona_soft_state_t *ss)
{
	ASSERT(MUTEX_HELD(&ss->ss_lock));

	if (ss->ss_kstat != NULL) {
		kstat_delete(ss->ss_kstat);
		ss->ss_kstat = NULL;
	}
}

static int
viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
{
	vioc_create_t kvc;
	viona_link_t *link = NULL;
	char cli_name[MAXNAMELEN];
	int err = 0;
	file_t *fp;
	vmm_hold_t *hold = NULL;
	viona_neti_t *nip = NULL;
	zoneid_t zid;
	mac_diag_t mac_diag = MAC_DIAG_NONE;
	boolean_t rings_allocd = B_FALSE;

	ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));

	if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
		return (EFAULT);
	}

	zid = crgetzoneid(cr);
	nip = viona_neti_lookup_by_zid(zid);
	if (nip == NULL) {
		return (EIO);
	}

	if (!nip->vni_nethook.vnh_hooked) {
		viona_neti_rele(nip);
		return (EIO);
	}

	mutex_enter(&ss->ss_lock);
	if (ss->ss_link != NULL) {
		mutex_exit(&ss->ss_lock);
		viona_neti_rele(nip);
		return (EEXIST);
	}

	if ((fp = getf(kvc.c_vmfd)) == NULL) {
		err = EBADF;
		goto bail;
	}
	err = vmm_drv_hold(fp, cr, &hold);
	releasef(kvc.c_vmfd);
	if (err != 0) {
		goto bail;
	}

	link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
	link->l_linkid = kvc.c_linkid;
	link->l_vm_hold = hold;

	err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
	if (err != 0) {
		goto bail;
	}

	viona_get_mac_capab(link);
	viona_params_get_defaults(&link->l_params);

	(void) snprintf(cli_name, sizeof (cli_name), "%s-%d",
	    VIONA_MODULE_NAME, link->l_linkid);
	err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
	if (err != 0) {
		goto bail;
	}

	err = mac_unicast_add(link->l_mch, NULL, MAC_UNICAST_PRIMARY,
	    &link->l_muh, VLAN_ID_NONE, &mac_diag);
	if (err != 0) {
		goto bail;
	}

	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);
	rings_allocd = B_TRUE;

	/*
	 * Default to passing up all multicast traffic in addition to
	 * classified unicast.  Guests which have support will change this
	 * if they need to via the virtio net control queue; guests without
	 * support generally still want to see multicast.
	 */
	link->l_promisc = VIONA_PROMISC_MULTI;
	if ((err = viona_rx_set(link, link->l_promisc)) != 0) {
		goto bail;
	}

	link->l_neti = nip;
	ss->ss_link = link;

	if ((err = viona_kstat_init(ss, cr)) != 0) {
		goto bail;
	}

	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_insert_tail(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	return (0);

bail:
	if (link != NULL) {
		viona_rx_clear(link);
		if (link->l_mch != NULL) {
			if (link->l_muh != NULL) {
				VERIFY0(mac_unicast_remove(link->l_mch,
				    link->l_muh));
				link->l_muh = NULL;
			}
			mac_client_close(link->l_mch, 0);
		}
		if (link->l_mh != NULL) {
			mac_close(link->l_mh);
		}
		if (rings_allocd) {
			viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
			viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
		}
		kmem_free(link, sizeof (viona_link_t));
		ss->ss_link = NULL;
	}
	if (hold != NULL) {
		vmm_drv_rele(hold);
	}
	viona_neti_rele(nip);

	mutex_exit(&ss->ss_lock);
	return (err);
}

static int
viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
{
	viona_link_t *link;
	viona_neti_t *nip = NULL;

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL) {
		/* Link destruction already complete */
		mutex_exit(&ss->ss_lock);
		return (0);
	}

	if (link->l_destroyed) {
		/*
		 * Link destruction has been started by another thread, but has
		 * not completed.  This condition should be impossible to
		 * encounter when performing the on-close destroy of the link,
		 * since racing ioctl accessors must necessarily be absent.
		 */
		VERIFY(!on_close);
		mutex_exit(&ss->ss_lock);
		return (EAGAIN);
	}
	/*
	 * The link deletion cannot fail after this point, continuing until its
	 * successful completion is reached.
	 */
	link->l_destroyed = B_TRUE;

	/*
	 * Tear down the IO port hook so it cannot be used to kick any of the
	 * rings which are about to be reset and stopped.
	 */
	VERIFY0(viona_ioc_set_notify_ioport(link, 0));
	mutex_exit(&ss->ss_lock);

	/*
	 * Return the rings to their reset state, ignoring any possible
	 * interruptions from signals.
	 */
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));

	mutex_enter(&ss->ss_lock);
	viona_kstat_fini(ss);
	if (link->l_mch != NULL) {
		/* Unhook the receive callbacks and close out the client */
		viona_rx_clear(link);
		if (link->l_muh != NULL) {
			VERIFY0(mac_unicast_remove(link->l_mch, link->l_muh));
			link->l_muh = NULL;
		}
		mac_client_close(link->l_mch, 0);
	}
	if (link->l_mh != NULL) {
		mac_close(link->l_mh);
	}
	if (link->l_vm_hold != NULL) {
		vmm_drv_rele(link->l_vm_hold);
		link->l_vm_hold = NULL;
	}

	nip = link->l_neti;
	link->l_neti = NULL;

	viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
	viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
	pollhead_clean(&link->l_pollhead);
	ss->ss_link = NULL;
	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_remove(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	viona_neti_rele(nip);

	kmem_free(link, sizeof (viona_link_t));
	return (0);
}

static int
viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
{
	vioc_ring_init_t kri;
	int err;

	if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
		return (EFAULT);
	}
	const struct viona_ring_params params = {
		.vrp_pa = kri.ri_qaddr,
		.vrp_size = kri.ri_qsize,
		.vrp_avail_idx = 0,
		.vrp_used_idx = 0,
	};

	err = viona_ring_init(link, kri.ri_index, &params);

	return (err);
}
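
/*
 * Illustrative only: a userspace VMM configuring a freshly-reset ring via
 * VNA_IOC_RING_INIT might do so as below, with the queue size and guest
 * physical address taken from the guest's virtio configuration writes
 * (vfd and queue_gpa are assumed to be in hand):
 *
 *	vioc_ring_init_t ri = {
 *		.ri_index = VIONA_VQ_TX,
 *		.ri_qsize = qsize,
 *		.ri_qaddr = queue_gpa,
 *	};
 *	(void) ioctl(vfd, VNA_IOC_RING_INIT, &ri);
 */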

static int
viona_ioc_ring_set_state(viona_link_t *link, void *udata, int md)
{
	vioc_ring_state_t krs;
	int err;

	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
		return (EFAULT);
	}
	const struct viona_ring_params params = {
		.vrp_pa = krs.vrs_qaddr,
		.vrp_size = krs.vrs_qsize,
		.vrp_avail_idx = krs.vrs_avail_idx,
		.vrp_used_idx = krs.vrs_used_idx,
	};

	err = viona_ring_init(link, krs.vrs_index, &params);

	return (err);
}

static int
viona_ioc_ring_get_state(viona_link_t *link, void *udata, int md)
{
	vioc_ring_state_t krs;

	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
		return (EFAULT);
	}

	struct viona_ring_params params;
	int err = viona_ring_get_state(link, krs.vrs_index, &params);
	if (err != 0) {
		return (err);
	}
	krs.vrs_qsize = params.vrp_size;
	krs.vrs_qaddr = params.vrp_pa;
	krs.vrs_avail_idx = params.vrp_avail_idx;
	krs.vrs_used_idx = params.vrp_used_idx;

	if (ddi_copyout(&krs, udata, sizeof (krs), md) != 0) {
		return (EFAULT);
	}
	return (0);
}

static int
viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	return (viona_ring_reset(ring, B_TRUE));
}

static int
viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;
	int err;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	mutex_enter(&ring->vr_lock);
	switch (ring->vr_state) {
	case VRS_SETUP:
		/*
		 * An early kick to a ring which is starting its worker thread
		 * is fine.  Once that thread is active, it will process the
		 * start-up request immediately.
		 */
		/* FALLTHROUGH */
	case VRS_INIT:
		ring->vr_state_flags |= VRSF_REQ_START;
		/* FALLTHROUGH */
	case VRS_RUN:
		cv_broadcast(&ring->vr_cv);
		err = 0;
		break;
	default:
		err = EBUSY;
		break;
	}
	mutex_exit(&ring->vr_lock);

	return (err);
}

static int
viona_ioc_ring_pause(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	viona_vring_t *ring = &link->l_vrings[idx];
	return (viona_ring_pause(ring));
}

static int
viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
{
	vioc_ring_msi_t vrm;
	viona_vring_t *ring;

	if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
		return (EFAULT);
	}
	if (vrm.rm_index >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	ring = &link->l_vrings[vrm.rm_index];
	mutex_enter(&ring->vr_lock);
	ring->vr_msi_addr = vrm.rm_addr;
	ring->vr_msi_msg = vrm.rm_msg;
	mutex_exit(&ring->vr_lock);

	return (0);
}

static int
viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes,
    uint32_t *val)
{
	viona_link_t *link = (viona_link_t *)arg;

	/*
	 * If the request is a read (in/ins), or directed at a port other than
	 * what we expect to be registered on, ignore it.
	 */
	if (in || port != link->l_notify_ioport) {
		return (ESRCH);
	}

	/* Let userspace handle notifications for rings other than RX/TX. */
	const uint16_t vq = *val;
	if (vq >= VIONA_VQ_MAX) {
		return (ESRCH);
	}

	viona_vring_t *ring = &link->l_vrings[vq];
	int res = 0;

	mutex_enter(&ring->vr_lock);
	if (ring->vr_state == VRS_RUN) {
		cv_broadcast(&ring->vr_cv);
	} else {
		res = ESRCH;
	}
	mutex_exit(&ring->vr_lock);

	return (res);
}

static int
viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport)
{
	int err = 0;

	if (link->l_notify_ioport != 0) {
		vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
		link->l_notify_ioport = 0;
	}

	if (ioport != 0) {
		err = vmm_drv_ioport_hook(link->l_vm_hold, ioport,
		    viona_notify_iop, (void *)link, &link->l_notify_cookie);
		if (err == 0) {
			link->l_notify_ioport = ioport;
		}
	}
	return (err);
}
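
/*
 * Illustrative: when the guest programs the virtio 'queue notify' location,
 * bhyve's userspace would install the in-kernel hook for that ioport via
 *
 *	(void) ioctl(vfd, VNA_IOC_SET_NOTIFY_IOP, notify_ioport);
 *
 * and tear it down again by passing an ioport of 0 (as the link-deletion
 * path above does).
 */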

static int
viona_ioc_set_promisc(viona_link_t *link, viona_promisc_t mode)
{
	int err;

	if (mode >= VIONA_PROMISC_MAX) {
		return (EINVAL);
	}

	if (mode == link->l_promisc) {
		return (0);
	}

	if ((err = viona_rx_set(link, mode)) != 0) {
		return (err);
	}

	link->l_promisc = mode;
	return (0);
}

#define	PARAM_NM_TX_COPY_DATA	"tx_copy_data"
#define	PARAM_NM_TX_HEADER_PAD	"tx_header_pad"

#define	PARAM_ERR_INVALID_TYPE	"invalid type"
#define	PARAM_ERR_OUT_OF_RANGE	"value out of range"
#define	PARAM_ERR_UNK_KEY	"unknown key"

static nvlist_t *
viona_params_to_nvlist(const viona_link_params_t *vlp)
{
	nvlist_t *nvl = fnvlist_alloc();

	fnvlist_add_boolean_value(nvl, PARAM_NM_TX_COPY_DATA,
	    vlp->vlp_tx_copy_data);
	fnvlist_add_uint16(nvl, PARAM_NM_TX_HEADER_PAD,
	    vlp->vlp_tx_header_pad);

	return (nvl);
}

static nvlist_t *
viona_params_from_nvlist(nvlist_t *nvl, viona_link_params_t *vlp)
{
	nvlist_t *nverr = fnvlist_alloc();
	nvpair_t *nvp = NULL;

	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		const char *name = nvpair_name(nvp);
		const data_type_t dtype = nvpair_type(nvp);

		if (strcmp(name, PARAM_NM_TX_COPY_DATA) == 0) {
			if (dtype == DATA_TYPE_BOOLEAN_VALUE) {
				vlp->vlp_tx_copy_data =
				    fnvpair_value_boolean_value(nvp);
			} else {
				fnvlist_add_string(nverr, name,
				    PARAM_ERR_INVALID_TYPE);
			}
			continue;
		}
		if (strcmp(name, PARAM_NM_TX_HEADER_PAD) == 0) {
			if (dtype == DATA_TYPE_UINT16) {
				uint16_t value = fnvpair_value_uint16(nvp);

				if (value > viona_max_header_pad) {
					fnvlist_add_string(nverr, name,
					    PARAM_ERR_OUT_OF_RANGE);
				} else {
					vlp->vlp_tx_header_pad = value;
				}
			} else {
				fnvlist_add_string(nverr, name,
				    PARAM_ERR_INVALID_TYPE);
			}
			continue;
		}

		/* Reject parameters we do not recognize */
		fnvlist_add_string(nverr, name, PARAM_ERR_UNK_KEY);
	}

	if (!nvlist_empty(nverr)) {
		return (nverr);
	}

	nvlist_free(nverr);
	return (NULL);
}

static void
viona_params_get_defaults(viona_link_params_t *vlp)
{
	vlp->vlp_tx_copy_data = viona_tx_copy_needed();
	vlp->vlp_tx_header_pad = 0;
}

static int
viona_ioc_get_params(viona_link_t *link, void *udata, int md)
{
	vioc_get_params_t vgp;
	int err = 0;

	if (ddi_copyin(udata, &vgp, sizeof (vgp), md) != 0) {
		return (EFAULT);
	}

	nvlist_t *nvl = NULL;
	if (link != NULL) {
		nvl = viona_params_to_nvlist(&link->l_params);
	} else {
		viona_link_params_t vlp = { 0 };

		viona_params_get_defaults(&vlp);
		nvl = viona_params_to_nvlist(&vlp);
	}

	VERIFY(nvl != NULL);

	size_t packed_sz;
	void *packed = fnvlist_pack(nvl, &packed_sz);
	nvlist_free(nvl);

	if (packed_sz > vgp.vgp_param_sz) {
		err = E2BIG;
	}
	/* Communicate size, even if the data will not fit */
	vgp.vgp_param_sz = packed_sz;

	if (err == 0 &&
	    ddi_copyout(packed, vgp.vgp_param, packed_sz, md) != 0) {
		err = EFAULT;
	}
	kmem_free(packed, packed_sz);

	/* Only report a copyout failure if no earlier error is pending */
	if (ddi_copyout(&vgp, udata, sizeof (vgp), md) != 0 && err == 0) {
		err = EFAULT;
	}

	return (err);
}
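
/*
 * A sketch (illustrative only) of the size-discovery pattern the above
 * enables: userspace can call with a zero-sized buffer, learn the required
 * size from the E2BIG result (vgp_param_sz is updated regardless), and retry
 * with an adequate allocation:
 *
 *	vioc_get_params_t vgp = { .vgp_param = NULL, .vgp_param_sz = 0 };
 *	if (ioctl(vfd, VNA_IOC_GET_PARAMS, &vgp) != 0 && errno == E2BIG) {
 *		vgp.vgp_param = malloc(vgp.vgp_param_sz);
 *		(void) ioctl(vfd, VNA_IOC_GET_PARAMS, &vgp);
 *	}
 *
 * The packed payload is an nvlist, to be unpacked with nvlist_unpack().
 */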

static int
viona_ioc_set_params(viona_link_t *link, void *udata, int md)
{
	vioc_set_params_t vsp;
	int err = 0;
	nvlist_t *nverr = NULL;

	if (ddi_copyin(udata, &vsp, sizeof (vsp), md) != 0) {
		return (EFAULT);
	}

	if (vsp.vsp_param_sz > VIONA_MAX_PARAM_NVLIST_SZ) {
		err = E2BIG;
		goto done;
	} else if (vsp.vsp_param_sz == 0) {
		/*
		 * There is no reason to make this ioctl call with no actual
		 * parameters to be changed.
		 */
		err = EINVAL;
		goto done;
	}

	const size_t packed_sz = vsp.vsp_param_sz;
	void *packed = kmem_alloc(packed_sz, KM_SLEEP);
	if (ddi_copyin(vsp.vsp_param, packed, packed_sz, md) != 0) {
		kmem_free(packed, packed_sz);
		err = EFAULT;
		goto done;
	}

	nvlist_t *parsed = NULL;
	if (nvlist_unpack(packed, packed_sz, &parsed, KM_SLEEP) == 0) {
		/* Use the existing parameters as a starting point */
		viona_link_params_t new_params;
		bcopy(&link->l_params, &new_params, sizeof (new_params));

		nverr = viona_params_from_nvlist(parsed, &new_params);
		if (nverr == NULL) {
			/*
			 * Only apply the updated parameters if there were no
			 * errors during parsing.
			 */
			bcopy(&new_params, &link->l_params,
			    sizeof (new_params));
		} else {
			err = EINVAL;
		}
	} else {
		err = EINVAL;
	}
	nvlist_free(parsed);
	kmem_free(packed, packed_sz);

done:
	if (nverr != NULL) {
		size_t err_packed_sz;
		void *err_packed = fnvlist_pack(nverr, &err_packed_sz);

		if (err_packed_sz > vsp.vsp_error_sz) {
			if (err != 0) {
				err = E2BIG;
			}
		} else if (ddi_copyout(err_packed, vsp.vsp_error,
		    err_packed_sz, md) != 0 && err == 0) {
			err = EFAULT;
		}
		vsp.vsp_error_sz = err_packed_sz;

		nvlist_free(nverr);
		kmem_free(err_packed, err_packed_sz);
	} else {
		/*
		 * If there are no detailed per-field errors, it is important
		 * to communicate that absence to userspace.
		 */
		vsp.vsp_error_sz = 0;
	}

	if (ddi_copyout(&vsp, udata, sizeof (vsp), md) != 0 && err == 0) {
		err = EFAULT;
	}

	return (err);
}

static int
viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	link->l_vrings[idx].vr_intr_enabled = 0;
	return (0);
}

static int
viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
{
	uint_t cnt = 0;
	vioc_intr_poll_t vip;

	for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
		uint_t val = link->l_vrings[i].vr_intr_enabled;

		vip.vip_status[i] = val;
		if (val != 0) {
			cnt++;
		}
	}

	if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
		return (EFAULT);
	}
	*rv = (int)cnt;
	return (0);
}