1 /*
2 * Copyright (c) 2013 Chris Torek <torek @ torek net>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26 /*
27 * This file and its contents are supplied under the terms of the
28 * Common Development and Distribution License ("CDDL"), version 1.0.
29 * You may only use this file in accordance with the terms of version
30 * 1.0 of the CDDL.
31 *
32 * A full copy of the text of the CDDL should have accompanied this
33 * source. A copy of the CDDL is also available via the Internet at
34 * http://www.illumos.org/license/CDDL.
35 *
36 * Copyright 2015 Pluribus Networks Inc.
37 * Copyright 2019 Joyent, Inc.
38 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
39 * Copyright 2026 Oxide Computer Company
40 */
41
42 /*
43 * viona - VirtIO-Net, Accelerated
44 *
45 * The purpose of viona is to provide high performance virtio-net devices to
46 * bhyve guests. It does so by sitting directly atop MAC, skipping all of the
47 * DLS/DLD stack.
48 *
49 * --------------------
50 * General Architecture
51 * --------------------
52 *
53 * A single viona instance is comprised of a "link" handle and a number of
54 * "ring" pairs. After opening the viona device, it must be associated with a
55 * MAC network interface and a bhyve (vmm) instance to form its link resource.
56 * This is done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd
57 * are passed in to perform the initialization. With the MAC client opened,
58 * and a driver handle to the vmm instance established, the device is ready to
59 * be configured by the guest.
60 *
61 * The userspace portion of bhyve, which interfaces with the PCI device
62 * emulation framework, is meant to stay out of the datapath if at all
63 * possible. Configuration changes made via PCI are mapped to actions which
64 * will steer the operation of the in-kernel logic.
65 *
66 *
67 * -----------
68 * Ring Basics
69 * -----------
70 *
71 * Each viona link has a number of pairs of viona_vring_t entities, each pair
72 * consisting of an RX and TX ring, for handling data transfers to and from the
73 * guest respectively. They represent an interface to the standard virtio ring
74 * structures. When initialized and active, each ring is backed by a kernel
75 * worker thread (parented to the bhyve process for the instance) which handles
76 * ring events. An RX worker has the simple task of watching for ring shutdown
77 * conditions. A TX worker does that in addition to processing all requests to
78 * transmit data. Data destined for the guest is delivered directly by MAC to
79 * viona_rx() when the ring is active.
80 *
81 *
82 * -----------
83 * Ring States
84 * -----------
85 *
86 * The viona_vring_t instances follow a simple path through the possible state
 * values represented in viona_vring_t`vr_state:
88 *
89 * +<--------------------------------------------+
90 * | |
91 * V ^
92 * +-----------+ This is the initial state when a link is created or
93 * | VRS_RESET | when the ring has been explicitly reset.
94 * +-----------+
95 * | ^
96 * |---* ioctl(VNA_IOC_RING_INIT) issued |
97 * | |
98 * | ^
99 * V
100 * +-----------+ The ring parameters (size, guest physical addresses)
101 * | VRS_SETUP | have been set and start-up of the ring worker thread
102 * +-----------+ has begun.
103 * | ^
104 * | |
105 * |---* ring worker thread begins execution |
106 * | |
107 * +-------------------------------------------->+
108 * | | ^
109 * | |
110 * | * If ring shutdown is requested (by ioctl or impending
111 * | bhyve process death) while the worker thread is
112 * | starting, the worker will transition the ring to
113 * | VRS_RESET and exit.
114 * | ^
115 * | |
116 * |<-------------------------------------------<+
117 * | | |
118 * | | ^
119 * | * If ring is requested to pause (but not stop) from the
120 * | VRS_RUN state, it will return to the VRS_INIT state.
121 * |
122 * | ^
123 * | |
124 * | ^
125 * V
126 * +-----------+ The worker thread associated with the ring has started
127 * | VRS_INIT | executing. It has allocated any extra resources needed
128 * +-----------+ for the ring to operate.
129 * | ^
130 * | |
131 * +-------------------------------------------->+
132 * | | ^
133 * | |
134 * | * If ring shutdown is requested while the worker is
135 * | waiting in VRS_INIT, it will free any extra resources
136 * | and transition to VRS_RESET.
137 * | ^
138 * | |
139 * |--* ioctl(VNA_IOC_RING_KICK) issued |
140 * | ^
141 * V
142 * +-----------+ The worker thread associated with the ring is executing
143 * | VRS_RUN | workload specific to that ring.
144 * +-----------+
145 * | ^
146 * |---* ioctl(VNA_IOC_RING_RESET) issued |
147 * | (or bhyve process begins exit) ^
148 * |
149 * +-----------+ The worker thread associated with the ring is in the
150 * | VRS_STOP | process of exiting. All outstanding TX and RX
151 * +-----------+ requests are allowed to complete, but new requests
152 * | must be ignored.
153 * | ^
154 * | |
155 * +-------------------------------------------->+
156 *
157 *
158 * While the worker thread is not running, changes to vr_state are only made by
159 * viona_ioc_ring_init() under vr_lock. There, it initializes the ring, starts
160 * the worker, and sets the ring state to VRS_SETUP. Once the worker thread
161 * has been started, only it may perform ring state transitions (still under
162 * the protection of vr_lock), when requested by outside consumers via
163 * vr_state_flags or when the containing bhyve process initiates an exit.
164 *
165 * Additionally, since all ioctls that affect a ring are mutually exclusive
166 * via a hold on the soft state lock, a ring cannot unexpectedly change state
167 * while this lock is held. This is relied on by the VNA_IOC_SET_PAIRS ioctl to
168 * guarantee that the ring is idle, and remains so, while the number of queue
169 * pairs is being changed.
170 *
171 *
172 * ----------------------------
173 * Multiple Rings (multi-queue)
174 * ----------------------------
175 *
176 * A link starts its life with a single pair of rings (one RX and one TX ring).
177 * The number of pairs can be varied via a call to ioctl(VNA_IOC_SET_PAIRS)
178 * providing all of the existing rings are in the VRS_RESET state. Therefore a
179 * userland consumer may only change the ring count between link creation and
180 * initialising any rings, or after issuing ioctl(VNA_IOC_RING_RESET) on
181 * all rings. The number of active rings cannot be reduced below the number of
182 * rings currently used, see below. The maximum number of rings permitted by
183 * viona (0x100) is lower than that permitted for a network device by the
184 * VirtIO specification (0x8000).
185 *
186 * Separately the number of RX rings that should be used for transmission of
187 * data to the guest can be varied at any time via ioctl(VNA_IOC_SET_USEPAIRS).
188 * The number of pairs to use can never exceed the total number of allocated
189 * pairs.
190 *
191 *
192 * ----------------------------
193 * Transmission mblk_t Handling
194 * ----------------------------
195 *
196 * For incoming frames destined for a bhyve guest, the data must first land in
197 * a host OS buffer from the physical NIC before it is copied into the awaiting
198 * guest buffer(s). Outbound frames transmitted by the guest are not bound by
199 * this limitation and can avoid extra copying before the buffers are accessed
200 * directly by the NIC. When a guest designates buffers to be transmitted,
201 * viona translates the guest-physical addresses contained in the ring
202 * descriptors to host-virtual addresses via viona_hold_page(). That pointer is
203 * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc().
204 * Doing so increments vr_xfer_outstanding, preventing the ring from being
205 * reset (allowing the link to drop its vmm handle to the guest) until all
206 * transmit mblks referencing guest memory have been processed. Allocation of
207 * the viona_desb_t entries is done during the VRS_INIT stage of the ring
208 * worker thread. The ring size informs that allocation as the number of
209 * concurrent transmissions is limited by the number of descriptors in the
210 * ring. This minimizes allocation in the transmit hot-path by acquiring those
211 * fixed-size resources during initialization.
212 *
213 * This optimization depends on the underlying NIC driver freeing the mblks in
214 * a timely manner after they have been transmitted by the hardware. Some
215 * drivers have been found to flush TX descriptors only when new transmissions
216 * are initiated. This means that there is no upper bound to the time needed
217 * for an mblk to be flushed and can stall bhyve guests from shutting down
218 * since their memory must be free of viona TX references prior to clean-up.
219 *
220 * This expectation of deterministic mblk_t processing is likely the reason
221 * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
222 * loaded will copy transmit data into fresh buffers rather than passing up
223 * zero-copy mblks. It is a hold-over from the original viona sources provided
224 * by Pluribus and its continued necessity has not been confirmed.
225 *
226 *
227 * ----------------------------
228 * Ring Notification Fast-paths
229 * ----------------------------
230 *
231 * Device operation for viona requires that notifications flow to and from the
232 * guest to indicate certain ring conditions. In order to minimize latency and
233 * processing overhead, the notification procedures are kept in-kernel whenever
234 * possible.
235 *
236 * Guest-to-host notifications, when new available descriptors have been placed
237 * in the ring, are posted for legacy devices via the 'queue notify' address in
238 * the virtio BAR. For modern devices the notifications are posted to the MMIO
239 * bar that is indicated by the notify PCI capability. The
240 * vmm_drv_ioport_hook() and vmm_drv_mmio_hook() interfaces were added to bhyve
241 * which allows viona to install a callback hook on an ioport, or on an MMIO
242 * address range. Guest exits for accesses to viona-hooked addresses will
243 * result in direct calls to notify the appropriate ring worker without a trip
244 * to userland.
245 *
246 * Host-to-guest notifications in the form of interrupts enjoy similar
247 * acceleration. Each viona ring can be configured to send MSI notifications
248 * to the guest as virtio conditions dictate. This in-kernel interrupt
249 * configuration is kept synchronized through viona ioctls which are utilized
250 * during writes to the associated PCI config registers or MSI-X BAR.
251 *
 * Guests which do not utilize MSI-X will result in viona falling back to the
 * slow path for interrupts.  In that case the userspace portion of bhyve will
 * poll(2) the viona handle, receiving notification when ring events
 * necessitate the assertion of an interrupt.
255 *
256 *
257 * ---------------
258 * Nethook Support
259 * ---------------
260 *
261 * Viona provides four nethook events that consumers (e.g. ipf) can hook into
262 * to intercept packets as they go up or down the stack. Unfortunately,
263 * the nethook framework does not understand raw packets, so we can only
264 * generate events (in, out) for IPv4 and IPv6 packets. At driver attach,
265 * we register callbacks with the neti (netinfo) module that will be invoked
266 * for each netstack already present, as well as for any additional netstack
267 * instances created as the system operates. These callbacks will
268 * register/unregister the hooks with the nethook framework for each
269 * netstack instance. This registration occurs prior to creating any
270 * viona instances for a given netstack, and the unregistration for a netstack
271 * instance occurs after all viona instances of the netstack instance have
272 * been deleted.
273 *
274 * ------------------
275 * Metrics/Statistics
 * ------------------
277 *
278 * During operation, Viona tracks certain metrics as certain events occur.
279 *
280 * One class of metrics, known as the "error stats", refer to abnormal
281 * conditions in ring processing which are likely the fault of a misbehaving
282 * guest. These are tracked on a per-ring basis, and are not formally exposed
283 * to any consumer besides direct memory access through mdb.
284 *
285 * The other class of metrics tracked for an instance are the "transfer stats",
286 * which are the traditional packets/bytes/errors/drops figures. These are
287 * counted per-ring, and then aggregated into link-wide values exposed via
288 * kstats. Atomic operations are used to increment those per-ring stats during
289 * operation, and then when a ring is stopped, the values are consolidated into
290 * the link-wide values (to prevent loss when the ring is zeroed) under the
291 * protection of viona_link`l_stats_lock. When the kstats are being updated,
292 * l_stats_lock is held to protect against a racing consolidation, with the
293 * existing per-ring values being added in at update time to provide an accurate
294 * figure.
295 */
296
297 #include <sys/conf.h>
298 #include <sys/file.h>
299 #include <sys/stat.h>
300
301 #include <sys/dlpi.h>
302 #include <sys/vlan.h>
303
304 #include "viona_impl.h"
305
306
/*
 * Driver identification.  VIONA_NAME is the description handed to the module
 * framework via modldrv.  VIONA_CTL_MINOR is the minor number of the single
 * device node created at attach; per-open minors are allocated starting at
 * VIONA_CTL_MINOR + 1.  The remaining strings name the kstats created for
 * each link in viona_kstat_init().
 */
#define	VIONA_NAME		"Virtio Network Accelerator"
#define	VIONA_CTL_MINOR		0
#define	VIONA_MODULE_NAME	"viona"
#define	VIONA_KSTAT_CLASS	"misc"
#define	VIONA_KSTAT_NAME	"viona_stat"
312
313
/*
 * Host capabilities.
 *
 * These feature bits are offered by viona itself.  They are combined with
 * the hardware-derived bits in viona_link_t`l_features_hw both when features
 * are reported (VNA_IOC_GET_FEATURES) and when the set requested through
 * VNA_IOC_SET_FEATURES is masked down to what is supported.
 */
#define	VIONA_S_HOSTCAPS	(	\
	VIRTIO_NET_F_GUEST_CSUM |	\
	VIRTIO_NET_F_GUEST_TSO4 |	\
	VIRTIO_NET_F_GUEST_TSO6 |	\
	VIRTIO_NET_F_MRG_RXBUF |	\
	VIRTIO_F_RING_NOTIFY_ON_EMPTY |	\
	VIRTIO_F_RING_INDIRECT_DESC)
324
/*
 * MAC_CAPAB_HCKSUM specifics of interest.  viona_get_mac_capab() only
 * advertises VIRTIO_NET_F_CSUM when the MAC reports at least one of these
 * checksum offload flags.
 */
#define	VIONA_CAP_HCKSUM_INTEREST	\
	(HCKSUM_INET_PARTIAL |		\
	HCKSUM_INET_FULL_V4 |		\
	HCKSUM_INET_FULL_V6)
330
/* Soft state anchor; one viona_soft_state_t exists per open minor. */
static void		*viona_state;
/* Device node recorded by viona_attach() and cleared by viona_detach(). */
static dev_info_t	*viona_dip;
/* ID space from which viona_open() allocates per-open minor numbers. */
static id_space_t	*viona_minors;
334
335
/*
 * Forward declarations: driver entry points referenced from the dev_ops and
 * cb_ops tables below.
 */
static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
    void **result);
static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
    cred_t *credp, int *rval);
static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);

/* Link creation and teardown, operating on the soft state directly. */
static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
static int viona_ioc_delete(viona_soft_state_t *, boolean_t);

/* Handlers for the ioctls which operate on an established link. */
static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t);
static int viona_ioc_set_notify_mmio(viona_link_t *, void *, int);
static int viona_ioc_set_promisc(viona_link_t *, viona_promisc_t);
static int viona_ioc_get_params(viona_link_t *, void *, int);
static int viona_ioc_set_params(viona_link_t *, void *, int);
static int viona_ioc_link_setpairs(viona_link_t *, uint16_t);
static int viona_ioc_link_usepairs(viona_link_t *, uint16_t);
static int viona_ioc_ring_init(viona_link_t *, void *, int);
static int viona_ioc_ring_init_modern(viona_link_t *, void *, int);
static int viona_ioc_ring_set_state(viona_link_t *, void *, int);
static int viona_ioc_ring_get_state(viona_link_t *, void *, int);
static int viona_ioc_ring_reset(viona_link_t *, uint_t);
static int viona_ioc_ring_kick(viona_link_t *, uint_t);
static int viona_ioc_ring_pause(viona_link_t *, uint_t);
static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);
static int viona_ioc_intr_poll_mq(viona_link_t *, void *, int, int *);

static void viona_params_get_defaults(viona_link_params_t *);
370
371 static struct cb_ops viona_cb_ops = {
372 viona_open,
373 viona_close,
374 nodev,
375 nodev,
376 nodev,
377 nodev,
378 nodev,
379 viona_ioctl,
380 nodev,
381 nodev,
382 nodev,
383 viona_chpoll,
384 ddi_prop_op,
385 0,
386 D_MP | D_NEW | D_HOTPLUG,
387 CB_REV,
388 nodev,
389 nodev
390 };
391
392 static struct dev_ops viona_ops = {
393 DEVO_REV,
394 0,
395 viona_info,
396 nulldev,
397 nulldev,
398 viona_attach,
399 viona_detach,
400 nodev,
401 &viona_cb_ops,
402 NULL,
403 ddi_power,
404 ddi_quiesce_not_needed
405 };
406
407 static struct modldrv modldrv = {
408 &mod_driverops,
409 VIONA_NAME,
410 &viona_ops,
411 };
412
413 static struct modlinkage modlinkage = {
414 MODREV_1, &modldrv, NULL
415 };
416
417 int
_init(void)418 _init(void)
419 {
420 int ret;
421
422 ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0);
423 if (ret != 0) {
424 return (ret);
425 }
426
427 viona_minors = id_space_create("viona_minors",
428 VIONA_CTL_MINOR + 1, UINT16_MAX);
429 viona_rx_init();
430 mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);
431
432 ret = mod_install(&modlinkage);
433 if (ret != 0) {
434 ddi_soft_state_fini(&viona_state);
435 id_space_destroy(viona_minors);
436 viona_rx_fini();
437 mutex_destroy(&viona_force_copy_lock);
438 }
439
440 return (ret);
441 }
442
443 int
_fini(void)444 _fini(void)
445 {
446 int ret;
447
448 ret = mod_remove(&modlinkage);
449 if (ret != 0) {
450 return (ret);
451 }
452
453 ddi_soft_state_fini(&viona_state);
454 id_space_destroy(viona_minors);
455 viona_rx_fini();
456 mutex_destroy(&viona_force_copy_lock);
457
458 return (ret);
459 }
460
461 int
_info(struct modinfo * modinfop)462 _info(struct modinfo *modinfop)
463 {
464 return (mod_info(&modlinkage, modinfop));
465 }
466
467 /* ARGSUSED */
468 static int
viona_info(dev_info_t * dip,ddi_info_cmd_t cmd,void * arg,void ** result)469 viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
470 {
471 int error;
472
473 switch (cmd) {
474 case DDI_INFO_DEVT2DEVINFO:
475 *result = (void *)viona_dip;
476 error = DDI_SUCCESS;
477 break;
478 case DDI_INFO_DEVT2INSTANCE:
479 *result = (void *)0;
480 error = DDI_SUCCESS;
481 break;
482 default:
483 error = DDI_FAILURE;
484 break;
485 }
486 return (error);
487 }
488
489 static int
viona_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)490 viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
491 {
492 if (cmd != DDI_ATTACH) {
493 return (DDI_FAILURE);
494 }
495
496 if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
497 DDI_PSEUDO, 0) != DDI_SUCCESS) {
498 return (DDI_FAILURE);
499 }
500
501 viona_neti_attach();
502
503 viona_dip = dip;
504 ddi_report_dev(viona_dip);
505
506 return (DDI_SUCCESS);
507 }
508
509 static int
viona_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)510 viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
511 {
512 dev_info_t *old_dip = viona_dip;
513
514 if (cmd != DDI_DETACH) {
515 return (DDI_FAILURE);
516 }
517
518 VERIFY(old_dip != NULL);
519
520 viona_neti_detach();
521 viona_dip = NULL;
522 ddi_remove_minor_node(old_dip, NULL);
523
524 return (DDI_SUCCESS);
525 }
526
527 static int
viona_open(dev_t * devp,int flag,int otype,cred_t * credp)528 viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
529 {
530 int minor;
531 viona_soft_state_t *ss;
532
533 if (otype != OTYP_CHR) {
534 return (EINVAL);
535 }
536 #if 0
537 /*
538 * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
539 * Should the check be at open() or ioctl()?
540 */
541 if (drv_priv(credp) != 0) {
542 return (EPERM);
543 }
544 #endif
545 if (getminor(*devp) != VIONA_CTL_MINOR) {
546 return (ENXIO);
547 }
548
549 minor = id_alloc_nosleep(viona_minors);
550 if (minor == -1) {
551 /* All minors are busy */
552 return (EBUSY);
553 }
554 if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
555 id_free(viona_minors, minor);
556 return (ENOMEM);
557 }
558
559 ss = ddi_get_soft_state(viona_state, minor);
560 mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
561 ss->ss_minor = minor;
562 *devp = makedevice(getmajor(*devp), minor);
563
564 return (0);
565 }
566
567 static int
viona_close(dev_t dev,int flag,int otype,cred_t * credp)568 viona_close(dev_t dev, int flag, int otype, cred_t *credp)
569 {
570 int minor;
571 viona_soft_state_t *ss;
572
573 if (otype != OTYP_CHR) {
574 return (EINVAL);
575 }
576
577 minor = getminor(dev);
578
579 ss = ddi_get_soft_state(viona_state, minor);
580 if (ss == NULL) {
581 return (ENXIO);
582 }
583
584 VERIFY0(viona_ioc_delete(ss, B_TRUE));
585 VERIFY(!list_link_active(&ss->ss_node));
586 ddi_soft_state_free(viona_state, minor);
587 id_free(viona_minors, minor);
588
589 return (0);
590 }
591
592 static int
viona_ioctl(dev_t dev,int cmd,intptr_t data,int md,cred_t * cr,int * rv)593 viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
594 {
595 viona_soft_state_t *ss;
596 void *dptr = (void *)data;
597 int err = 0;
598 uint64_t val;
599 viona_link_t *link;
600
601 ss = ddi_get_soft_state(viona_state, getminor(dev));
602 if (ss == NULL) {
603 return (ENXIO);
604 }
605
606 switch (cmd) {
607 case VNA_IOC_CREATE:
608 return (viona_ioc_create(ss, dptr, md, cr));
609 case VNA_IOC_DELETE:
610 return (viona_ioc_delete(ss, B_FALSE));
611 case VNA_IOC_VERSION:
612 *rv = VIONA_CURRENT_INTERFACE_VERSION;
613 return (0);
614 case VNA_IOC_DEFAULT_PARAMS:
615 /*
616 * With a NULL link parameter, viona_ioc_get_params() will emit
617 * the default parameters with the same error-handling behavior
618 * as VNA_IOC_GET_PARAMS.
619 */
620 return (viona_ioc_get_params(NULL, dptr, md));
621 default:
622 break;
623 }
624
625 mutex_enter(&ss->ss_lock);
626 if ((link = ss->ss_link) == NULL || link->l_destroyed ||
627 vmm_drv_release_reqd(link->l_vm_hold)) {
628 mutex_exit(&ss->ss_lock);
629 return (ENXIO);
630 }
631
632 switch (cmd) {
633 case VNA_IOC_GET_FEATURES:
634 val = VIONA_S_HOSTCAPS | link->l_features_hw;
635 if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
636 err = EFAULT;
637 }
638 break;
639 case VNA_IOC_SET_FEATURES:
640 if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
641 err = EFAULT;
642 break;
643 }
644 link->l_modern = ((val & VIRTIO_F_VERSION_1) != 0);
645 val &= (VIONA_S_HOSTCAPS | link->l_features_hw);
646
647 if ((val & VIRTIO_NET_F_CSUM) == 0) {
648 val &= ~(VIRTIO_NET_F_HOST_TSO4 |
649 VIRTIO_NET_F_HOST_TSO6);
650 }
651
652 if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0) {
653 val &= ~(VIRTIO_NET_F_GUEST_TSO4 |
654 VIRTIO_NET_F_GUEST_TSO6);
655 }
656
657 link->l_features = val;
658 break;
659 case VNA_IOC_GET_PAIRS:
660 *rv = (int)link->l_npairs;
661 break;
662 case VNA_IOC_SET_PAIRS:
663 if (data > UINT16_MAX)
664 err = EINVAL;
665 else
666 err = viona_ioc_link_setpairs(link, (uint16_t)data);
667 break;
668 case VNA_IOC_GET_USEPAIRS:
669 *rv = (int)link->l_usepairs;
670 break;
671 case VNA_IOC_SET_USEPAIRS:
672 if (data > UINT16_MAX)
673 err = EINVAL;
674 else
675 err = viona_ioc_link_usepairs(link, (uint16_t)data);
676 break;
677 case VNA_IOC_RING_INIT:
678 err = viona_ioc_ring_init(link, dptr, md);
679 break;
680 case VNA_IOC_RING_INIT_MODERN:
681 err = viona_ioc_ring_init_modern(link, dptr, md);
682 break;
683 case VNA_IOC_RING_RESET:
684 err = viona_ioc_ring_reset(link, (uint_t)data);
685 break;
686 case VNA_IOC_RING_KICK:
687 err = viona_ioc_ring_kick(link, (uint_t)data);
688 break;
689 case VNA_IOC_RING_SET_MSI:
690 err = viona_ioc_ring_set_msi(link, dptr, md);
691 break;
692 case VNA_IOC_RING_INTR_CLR:
693 err = viona_ioc_ring_intr_clear(link, (uint_t)data);
694 break;
695 case VNA_IOC_RING_SET_STATE:
696 err = viona_ioc_ring_set_state(link, dptr, md);
697 break;
698 case VNA_IOC_RING_GET_STATE:
699 err = viona_ioc_ring_get_state(link, dptr, md);
700 break;
701 case VNA_IOC_RING_PAUSE:
702 err = viona_ioc_ring_pause(link, (uint_t)data);
703 break;
704
705 case VNA_IOC_INTR_POLL:
706 err = viona_ioc_intr_poll(link, dptr, md, rv);
707 break;
708 case VNA_IOC_INTR_POLL_MQ:
709 err = viona_ioc_intr_poll_mq(link, dptr, md, rv);
710 break;
711 case VNA_IOC_SET_NOTIFY_IOP:
712 if (data < 0 || data > UINT16_MAX) {
713 err = EINVAL;
714 break;
715 }
716 err = viona_ioc_set_notify_ioport(link, (uint16_t)data);
717 break;
718 case VNA_IOC_SET_NOTIFY_MMIO:
719 err = viona_ioc_set_notify_mmio(link, dptr, md);
720 break;
721 case VNA_IOC_SET_PROMISC:
722 err = viona_ioc_set_promisc(link, (viona_promisc_t)data);
723 break;
724 case VNA_IOC_GET_PARAMS:
725 err = viona_ioc_get_params(link, dptr, md);
726 break;
727 case VNA_IOC_SET_PARAMS:
728 err = viona_ioc_set_params(link, dptr, md);
729 break;
730 case VNA_IOC_GET_MTU:
731 *rv = (int)link->l_mtu;
732 break;
733 case VNA_IOC_SET_MTU:
734 if (data < VIONA_MIN_MTU || data > VIONA_MAX_MTU)
735 err = EINVAL;
736 else
737 link->l_mtu = (uint16_t)data;
738 break;
739 default:
740 err = ENOTTY;
741 break;
742 }
743
744 mutex_exit(&ss->ss_lock);
745 return (err);
746 }
747
748 static int
viona_chpoll(dev_t dev,short events,int anyyet,short * reventsp,struct pollhead ** phpp)749 viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
750 struct pollhead **phpp)
751 {
752 viona_soft_state_t *ss;
753 viona_link_t *link;
754
755 ss = ddi_get_soft_state(viona_state, getminor(dev));
756 if (ss == NULL) {
757 return (ENXIO);
758 }
759
760 mutex_enter(&ss->ss_lock);
761 if ((link = ss->ss_link) == NULL || link->l_destroyed) {
762 mutex_exit(&ss->ss_lock);
763 return (ENXIO);
764 }
765
766 *reventsp = 0;
767 if ((events & POLLRDBAND) != 0) {
768 for (uint16_t i = 0; i < VIONA_USABLE_RINGS(link); i++) {
769 if (link->l_vrings[i].vr_intr_enabled != 0) {
770 *reventsp |= POLLRDBAND;
771 break;
772 }
773 }
774 }
775 if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
776 *phpp = &link->l_pollhead;
777 }
778 mutex_exit(&ss->ss_lock);
779
780 return (0);
781 }
782
783 static void
viona_get_mac_capab(viona_link_t * link)784 viona_get_mac_capab(viona_link_t *link)
785 {
786 mac_handle_t mh = link->l_mh;
787 uint32_t cap = 0;
788 mac_capab_lso_t lso_cap;
789
790 link->l_features_hw = 0;
791 if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
792 /*
793 * Only report HW checksum ability if the underlying MAC
794 * resource is capable of populating the L4 header.
795 */
796 if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
797 link->l_features_hw |= VIRTIO_NET_F_CSUM;
798 }
799 link->l_cap_csum = cap;
800 }
801
802 if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
803 mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
804 /*
805 * Virtio doesn't allow for negotiating a maximum LSO
806 * packet size. We have to assume that the guest may
807 * send a maximum length IP packet. Make sure the
808 * underlying MAC can handle an LSO of this size.
809 */
810 if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
811 lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET) {
812 link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
813 }
814
815 if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV6) &&
816 lso_cap.lso_basic_tcp_ipv6.lso_max >= IP_MAXPACKET) {
817 link->l_features_hw |= VIRTIO_NET_F_HOST_TSO6;
818 }
819 }
820 }
821
822 static int
viona_kstat_update(kstat_t * ksp,int rw)823 viona_kstat_update(kstat_t *ksp, int rw)
824 {
825 viona_link_t *link = ksp->ks_private;
826 viona_kstats_t *vk = ksp->ks_data;
827
828 /*
829 * Avoid the potential for mangled values due to a racing consolidation
830 * of stats for a ring by performing the kstat update with l_stats_lock
831 * held while adding up the central (link) and ring values.
832 */
833 mutex_enter(&link->l_stats_lock);
834
835 for (uint16_t i = 0; i < VIONA_USABLE_RINGS(link); i++) {
836 const viona_vring_t *ring = &link->l_vrings[i];
837 const viona_transfer_stats_t *ring_stats = &ring->vr_stats;
838 const viona_transfer_stats_t *link_stats;
839
840 if (VIONA_RING_ISRX(ring)) {
841 link_stats = &link->l_stats.vls_rx;
842
843 vk->vk_rx_packets.value.ui64 =
844 link_stats->vts_packets + ring_stats->vts_packets;
845 vk->vk_rx_bytes.value.ui64 =
846 link_stats->vts_bytes + ring_stats->vts_bytes;
847 vk->vk_rx_errors.value.ui64 =
848 link_stats->vts_errors + ring_stats->vts_errors;
849 vk->vk_rx_drops.value.ui64 =
850 link_stats->vts_drops + ring_stats->vts_drops;
851 } else if (VIONA_RING_ISTX(ring)) {
852 link_stats = &link->l_stats.vls_tx;
853
854 vk->vk_tx_packets.value.ui64 =
855 link_stats->vts_packets + ring_stats->vts_packets;
856 vk->vk_tx_bytes.value.ui64 =
857 link_stats->vts_bytes + ring_stats->vts_bytes;
858 vk->vk_tx_errors.value.ui64 =
859 link_stats->vts_errors + ring_stats->vts_errors;
860 vk->vk_tx_drops.value.ui64 =
861 link_stats->vts_drops + ring_stats->vts_drops;
862 }
863 }
864
865 mutex_exit(&link->l_stats_lock);
866
867 return (0);
868 }
869
870 static int
viona_kstat_init(viona_soft_state_t * ss,const cred_t * cr)871 viona_kstat_init(viona_soft_state_t *ss, const cred_t *cr)
872 {
873 zoneid_t zid = crgetzoneid(cr);
874 kstat_t *ksp;
875
876 ASSERT(MUTEX_HELD(&ss->ss_lock));
877 ASSERT3P(ss->ss_kstat, ==, NULL);
878
879 ksp = kstat_create_zone(VIONA_MODULE_NAME, ss->ss_minor,
880 VIONA_KSTAT_NAME, VIONA_KSTAT_CLASS, KSTAT_TYPE_NAMED,
881 sizeof (viona_kstats_t) / sizeof (kstat_named_t), 0, zid);
882
883 if (ksp == NULL) {
884 /*
885 * Without detail from kstat_create_zone(), assume that resource
886 * exhaustion is to blame for the failure.
887 */
888 return (ENOMEM);
889 }
890 ss->ss_kstat = ksp;
891
892 /*
893 * If this instance is associated with a non-global zone, make its
894 * kstats visible from the GZ.
895 */
896 if (zid != GLOBAL_ZONEID) {
897 kstat_zone_add(ss->ss_kstat, GLOBAL_ZONEID);
898 }
899
900 viona_kstats_t *vk = ksp->ks_data;
901
902 kstat_named_init(&vk->vk_rx_packets, "rx_packets", KSTAT_DATA_UINT64);
903 kstat_named_init(&vk->vk_rx_bytes, "rx_bytes", KSTAT_DATA_UINT64);
904 kstat_named_init(&vk->vk_rx_errors, "rx_errors", KSTAT_DATA_UINT64);
905 kstat_named_init(&vk->vk_rx_drops, "rx_drops", KSTAT_DATA_UINT64);
906 kstat_named_init(&vk->vk_tx_packets, "tx_packets", KSTAT_DATA_UINT64);
907 kstat_named_init(&vk->vk_tx_bytes, "tx_bytes", KSTAT_DATA_UINT64);
908 kstat_named_init(&vk->vk_tx_errors, "tx_errors", KSTAT_DATA_UINT64);
909 kstat_named_init(&vk->vk_tx_drops, "tx_drops", KSTAT_DATA_UINT64);
910 ksp->ks_private = ss->ss_link;
911 ksp->ks_update = viona_kstat_update;
912
913 kstat_install(ss->ss_kstat);
914 return (0);
915 }
916
917 static void
viona_kstat_fini(viona_soft_state_t * ss)918 viona_kstat_fini(viona_soft_state_t *ss)
919 {
920 ASSERT(MUTEX_HELD(&ss->ss_lock));
921
922 if (ss->ss_kstat != NULL) {
923 kstat_delete(ss->ss_kstat);
924 ss->ss_kstat = NULL;
925 }
926 }
927
928 static void
viona_link_qfree(viona_link_t * link)929 viona_link_qfree(viona_link_t *link)
930 {
931 if (link->l_vrings == NULL)
932 return;
933
934 for (uint16_t i = 0; i < VIONA_NRINGS(link); i++) {
935 ASSERT3U(link->l_vrings[i].vr_state, ==, VRS_RESET);
936 viona_ring_free(&link->l_vrings[i]);
937 }
938 kmem_free(link->l_vrings, sizeof (viona_vring_t) * VIONA_NRINGS(link));
939 link->l_vrings = NULL;
940 link->l_npairs = link->l_usepairs = 0;
941 }
942
943 static int
viona_link_qalloc(viona_link_t * link,uint16_t pairs)944 viona_link_qalloc(viona_link_t *link, uint16_t pairs)
945 {
946 const uint16_t usepairs = link->l_usepairs;
947
948 ASSERT(MUTEX_HELD(&link->l_ss->ss_lock));
949
950 if (pairs < VIONA_MIN_QPAIR || pairs > VIONA_MAX_QPAIR ||
951 pairs < usepairs) {
952 return (EINVAL);
953 }
954
955 for (uint16_t i = 0; i < VIONA_NRINGS(link); i++) {
956 if (link->l_vrings[i].vr_state != VRS_RESET)
957 return (EBUSY);
958 }
959
960 /*
961 * This is safe as we are holding the ss_lock, have checked that all
962 * of the rings are in the VRS_RESET state and know that the mac RX
963 * callback is not set at this point.
964 */
965 viona_link_qfree(link);
966
967 link->l_npairs = pairs;
968 link->l_usepairs = usepairs;
969 link->l_vrings = kmem_zalloc(
970 sizeof (viona_vring_t) * VIONA_NRINGS(link), KM_SLEEP);
971
972 for (uint16_t i = 0; i < VIONA_NRINGS(link); i++)
973 viona_ring_alloc(link, &link->l_vrings[i]);
974
975 return (0);
976 }
977
978 static int
viona_ioc_create(viona_soft_state_t * ss,void * dptr,int md,cred_t * cr)979 viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
980 {
981 vioc_create_t kvc;
982 viona_link_t *link = NULL;
983 char cli_name[MAXNAMELEN];
984 int err = 0;
985 file_t *fp;
986 vmm_hold_t *hold = NULL;
987 viona_neti_t *nip = NULL;
988 zoneid_t zid;
989 mac_diag_t mac_diag = MAC_DIAG_NONE;
990
991 ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));
992
993 if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
994 return (EFAULT);
995 }
996
997 zid = crgetzoneid(cr);
998 nip = viona_neti_lookup_by_zid(zid);
999 if (nip == NULL) {
1000 return (EIO);
1001 }
1002
1003 if (!nip->vni_nethook.vnh_hooked) {
1004 viona_neti_rele(nip);
1005 return (EIO);
1006 }
1007
1008 mutex_enter(&ss->ss_lock);
1009 if (ss->ss_link != NULL) {
1010 mutex_exit(&ss->ss_lock);
1011 viona_neti_rele(nip);
1012 return (EEXIST);
1013 }
1014
1015 if ((fp = getf(kvc.c_vmfd)) == NULL) {
1016 err = EBADF;
1017 goto bail;
1018 }
1019 err = vmm_drv_hold(fp, cr, &hold);
1020 releasef(kvc.c_vmfd);
1021 if (err != 0) {
1022 goto bail;
1023 }
1024
1025 link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
1026 link->l_ss = ss;
1027 link->l_linkid = kvc.c_linkid;
1028 link->l_vm_hold = hold;
1029 link->l_mtu = VIONA_DEFAULT_MTU;
1030 link->l_notify_mmaddr = NOTIFY_MMADDR_UNSET;
1031
1032 err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
1033 if (err != 0) {
1034 goto bail;
1035 }
1036
1037 viona_get_mac_capab(link);
1038 viona_params_get_defaults(&link->l_params);
1039
1040 (void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_MODULE_NAME,
1041 link->l_linkid);
1042 err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
1043 if (err != 0) {
1044 goto bail;
1045 }
1046
1047 err = mac_unicast_add(link->l_mch, NULL, MAC_UNICAST_PRIMARY,
1048 &link->l_muh, VLAN_ID_NONE, &mac_diag);
1049 if (err != 0) {
1050 goto bail;
1051 }
1052
1053 if (viona_link_qalloc(link, 1) != 0)
1054 goto bail;
1055 link->l_usepairs = 1;
1056
1057 /*
1058 * Default to passing up all multicast traffic in addition to
1059 * classified unicast. Guests which have support will change this
1060 * if they need to via the virtio net control queue; guests without
1061 * support generally still want to see multicast.
1062 */
1063 link->l_promisc = VIONA_PROMISC_MULTI;
1064 if ((err = viona_rx_set(link, link->l_promisc)) != 0) {
1065 goto bail;
1066 }
1067
1068 link->l_neti = nip;
1069 ss->ss_link = link;
1070
1071 if ((err = viona_kstat_init(ss, cr)) != 0) {
1072 goto bail;
1073 }
1074
1075 mutex_exit(&ss->ss_lock);
1076
1077 mutex_enter(&nip->vni_lock);
1078 list_insert_tail(&nip->vni_dev_list, ss);
1079 mutex_exit(&nip->vni_lock);
1080
1081 return (0);
1082
1083 bail:
1084 if (link != NULL) {
1085 viona_rx_clear(link);
1086 if (link->l_mch != NULL) {
1087 if (link->l_muh != NULL) {
1088 VERIFY0(mac_unicast_remove(link->l_mch,
1089 link->l_muh));
1090 link->l_muh = NULL;
1091 }
1092 mac_client_close(link->l_mch, 0);
1093 }
1094 if (link->l_mh != NULL) {
1095 mac_close(link->l_mh);
1096 }
1097 viona_link_qfree(link);
1098 kmem_free(link, sizeof (viona_link_t));
1099 ss->ss_link = NULL;
1100 }
1101 if (hold != NULL) {
1102 vmm_drv_rele(hold);
1103 }
1104 viona_neti_rele(nip);
1105
1106 mutex_exit(&ss->ss_lock);
1107 return (err);
1108 }
1109
1110 static int
viona_ioc_delete(viona_soft_state_t * ss,boolean_t on_close)1111 viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
1112 {
1113 viona_link_t *link;
1114 viona_neti_t *nip = NULL;
1115
1116 mutex_enter(&ss->ss_lock);
1117 if ((link = ss->ss_link) == NULL) {
1118 /* Link destruction already complete */
1119 mutex_exit(&ss->ss_lock);
1120 return (0);
1121 }
1122
1123 if (link->l_destroyed) {
1124 /*
1125 * Link destruction has been started by another thread, but has
1126 * not completed. This condition should be impossible to
1127 * encounter when performing the on-close destroy of the link,
1128 * since racing ioctl accessors must necessarily be absent.
1129 */
1130 VERIFY(!on_close);
1131 mutex_exit(&ss->ss_lock);
1132 return (EAGAIN);
1133 }
1134 /*
1135 * The link deletion cannot fail after this point, continuing until its
1136 * successful completion is reached.
1137 */
1138 link->l_destroyed = B_TRUE;
1139
1140 /*
1141 * Tear down the IO and MMIO port hooks so they cannot be used to kick
1142 * any of the rings which are about to be reset and stopped.
1143 */
1144 VERIFY0(viona_ioc_set_notify_ioport(link, 0));
1145 VERIFY0(viona_ioc_set_notify_mmio(link, NULL, 0));
1146 mutex_exit(&ss->ss_lock);
1147
1148 /*
1149 * Return the rings to their reset state, ignoring any possible
1150 * interruptions from signals.
1151 */
1152 for (uint16_t i = 0; i < VIONA_NRINGS(link); i++)
1153 VERIFY0(viona_ring_reset(&link->l_vrings[i], B_FALSE));
1154
1155 mutex_enter(&ss->ss_lock);
1156 viona_kstat_fini(ss);
1157 if (link->l_mch != NULL) {
1158 /* Unhook the receive callbacks and close out the client */
1159 viona_rx_clear(link);
1160 if (link->l_muh != NULL) {
1161 VERIFY0(mac_unicast_remove(link->l_mch, link->l_muh));
1162 link->l_muh = NULL;
1163 }
1164 mac_client_close(link->l_mch, 0);
1165 }
1166 if (link->l_mh != NULL) {
1167 mac_close(link->l_mh);
1168 }
1169 if (link->l_vm_hold != NULL) {
1170 vmm_drv_rele(link->l_vm_hold);
1171 link->l_vm_hold = NULL;
1172 }
1173
1174 nip = link->l_neti;
1175 link->l_neti = NULL;
1176
1177 viona_link_qfree(link);
1178 pollhead_clean(&link->l_pollhead);
1179 ss->ss_link = NULL;
1180 mutex_exit(&ss->ss_lock);
1181
1182 mutex_enter(&nip->vni_lock);
1183 list_remove(&nip->vni_dev_list, ss);
1184 mutex_exit(&nip->vni_lock);
1185
1186 viona_neti_rele(nip);
1187
1188 kmem_free(link, sizeof (viona_link_t));
1189 return (0);
1190 }
1191
1192 static int
viona_ioc_ring_init(viona_link_t * link,void * udata,int md)1193 viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
1194 {
1195 vioc_ring_init_t kri;
1196 int err;
1197
1198 if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
1199 return (EFAULT);
1200 }
1201
1202 if (!VIONA_RING_VALID(link, kri.ri_index))
1203 return (EINVAL);
1204
1205 struct viona_ring_params params = {
1206 .vrp_pa_desc = kri.ri_qaddr,
1207 .vrp_pa_avail = 0,
1208 .vrp_pa_used = 0,
1209 .vrp_size = kri.ri_qsize,
1210 .vrp_avail_idx = 0,
1211 .vrp_used_idx = 0,
1212 };
1213
1214 if ((err = viona_ring_legacy_addr(¶ms)) != 0)
1215 return (err);
1216
1217 err = viona_ring_init(link, kri.ri_index, ¶ms);
1218
1219 return (err);
1220 }
1221
1222 static int
viona_ioc_ring_init_modern(viona_link_t * link,void * udata,int md)1223 viona_ioc_ring_init_modern(viona_link_t *link, void *udata, int md)
1224 {
1225 vioc_ring_init_modern_t krim;
1226 int err;
1227
1228 if (ddi_copyin(udata, &krim, sizeof (krim), md) != 0) {
1229 return (EFAULT);
1230 }
1231
1232 if (!VIONA_RING_VALID(link, krim.rim_index))
1233 return (EINVAL);
1234
1235 const struct viona_ring_params params = {
1236 .vrp_pa_desc = krim.rim_qaddr_desc,
1237 .vrp_pa_avail = krim.rim_qaddr_avail,
1238 .vrp_pa_used = krim.rim_qaddr_used,
1239 .vrp_size = krim.rim_qsize,
1240 .vrp_avail_idx = 0,
1241 .vrp_used_idx = 0,
1242 };
1243
1244 err = viona_ring_init(link, krim.rim_index, ¶ms);
1245
1246 return (err);
1247 }
1248
1249 static int
viona_ioc_ring_set_state(viona_link_t * link,void * udata,int md)1250 viona_ioc_ring_set_state(viona_link_t *link, void *udata, int md)
1251 {
1252 vioc_ring_state_t krs;
1253 int err;
1254
1255 if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
1256 return (EFAULT);
1257 }
1258 const struct viona_ring_params params = {
1259 .vrp_pa_desc = krs.vrs_qaddr_desc,
1260 .vrp_pa_avail = krs.vrs_qaddr_avail,
1261 .vrp_pa_used = krs.vrs_qaddr_used,
1262 .vrp_size = krs.vrs_qsize,
1263 .vrp_avail_idx = krs.vrs_avail_idx,
1264 .vrp_used_idx = krs.vrs_used_idx,
1265 };
1266
1267 err = viona_ring_init(link, krs.vrs_index, ¶ms);
1268
1269 return (err);
1270 }
1271
1272 static int
viona_ioc_ring_get_state(viona_link_t * link,void * udata,int md)1273 viona_ioc_ring_get_state(viona_link_t *link, void *udata, int md)
1274 {
1275 vioc_ring_state_t krs;
1276
1277 if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
1278 return (EFAULT);
1279 }
1280
1281 struct viona_ring_params params;
1282 int err = viona_ring_get_state(link, krs.vrs_index, ¶ms);
1283 if (err != 0) {
1284 return (err);
1285 }
1286 krs.vrs_qsize = params.vrp_size;
1287 krs.vrs_qaddr_desc = params.vrp_pa_desc;
1288 krs.vrs_qaddr_avail = params.vrp_pa_avail;
1289 krs.vrs_qaddr_used = params.vrp_pa_used;
1290 krs.vrs_avail_idx = params.vrp_avail_idx;
1291 krs.vrs_used_idx = params.vrp_used_idx;
1292
1293 if (ddi_copyout(&krs, udata, sizeof (krs), md) != 0) {
1294 return (EFAULT);
1295 }
1296 return (0);
1297 }
1298
1299 static int
viona_ioc_link_setpairs(viona_link_t * link,uint16_t pairs)1300 viona_ioc_link_setpairs(viona_link_t *link, uint16_t pairs)
1301 {
1302 int err;
1303
1304 /* Unhook the receive callbacks while the rings are being reallocated */
1305 viona_rx_clear(link);
1306 err = viona_link_qalloc(link, pairs);
1307 (void) viona_rx_set(link, link->l_promisc);
1308
1309 return (err);
1310 }
1311
1312 static int
viona_ioc_link_usepairs(viona_link_t * link,uint16_t pairs)1313 viona_ioc_link_usepairs(viona_link_t *link, uint16_t pairs)
1314 {
1315 if (pairs < VIONA_MIN_QPAIR || pairs > link->l_npairs)
1316 return (EINVAL);
1317 link->l_usepairs = pairs;
1318 return (0);
1319 }
1320
1321 static int
viona_ioc_ring_reset(viona_link_t * link,uint_t idx)1322 viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
1323 {
1324 viona_vring_t *ring;
1325
1326 if (!VIONA_RING_VALID(link, idx)) {
1327 return (EINVAL);
1328 }
1329 ring = &link->l_vrings[idx];
1330
1331 return (viona_ring_reset(ring, B_TRUE));
1332 }
1333
1334 static int
viona_ioc_ring_kick(viona_link_t * link,uint_t idx)1335 viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
1336 {
1337 viona_vring_t *ring;
1338 int err;
1339
1340 if (!VIONA_RING_VALID(link, idx)) {
1341 return (EINVAL);
1342 }
1343 ring = &link->l_vrings[idx];
1344
1345 mutex_enter(&ring->vr_lock);
1346 switch (ring->vr_state) {
1347 case VRS_SETUP:
1348 /*
1349 * An early kick to a ring which is starting its worker thread
1350 * is fine. Once that thread is active, it will process the
1351 * start-up request immediately.
1352 */
1353 /* FALLTHROUGH */
1354 case VRS_INIT:
1355 ring->vr_state_flags |= VRSF_REQ_START;
1356 /* FALLTHROUGH */
1357 case VRS_RUN:
1358 cv_broadcast(&ring->vr_cv);
1359 err = 0;
1360 break;
1361 default:
1362 err = EBUSY;
1363 break;
1364 }
1365 mutex_exit(&ring->vr_lock);
1366
1367 return (err);
1368 }
1369
1370 static int
viona_ioc_ring_pause(viona_link_t * link,uint_t idx)1371 viona_ioc_ring_pause(viona_link_t *link, uint_t idx)
1372 {
1373 if (!VIONA_RING_VALID(link, idx)) {
1374 return (EINVAL);
1375 }
1376
1377 viona_vring_t *ring = &link->l_vrings[idx];
1378 return (viona_ring_pause(ring));
1379 }
1380
1381 static int
viona_ioc_ring_set_msi(viona_link_t * link,void * data,int md)1382 viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
1383 {
1384 vioc_ring_msi_t vrm;
1385 viona_vring_t *ring;
1386
1387 if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
1388 return (EFAULT);
1389 }
1390 if (!VIONA_RING_VALID(link, vrm.rm_index)) {
1391 return (EINVAL);
1392 }
1393
1394 ring = &link->l_vrings[vrm.rm_index];
1395 mutex_enter(&ring->vr_lock);
1396 ring->vr_msi_addr = vrm.rm_addr;
1397 ring->vr_msi_msg = vrm.rm_msg;
1398 mutex_exit(&ring->vr_lock);
1399
1400 return (0);
1401 }
1402
1403 static int
viona_notify_iop(void * arg,bool in,uint16_t port,uint8_t bytes,uint32_t * val)1404 viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes,
1405 uint32_t *val)
1406 {
1407 viona_link_t *link = (viona_link_t *)arg;
1408
1409 /*
1410 * If the request is a read (in/ins), or direct at a port other than
1411 * what we expect to be registered on, ignore it.
1412 */
1413 if (in || port != link->l_notify_ioport) {
1414 return (ESRCH);
1415 }
1416
1417 /* Let userspace handle notifications for rings other than RX/TX. */
1418 const uint16_t vq = *val;
1419 if (!VIONA_RING_VALID(link, vq)) {
1420 return (ESRCH);
1421 }
1422
1423 viona_vring_t *ring = &link->l_vrings[vq];
1424 int res = 0;
1425
1426 mutex_enter(&ring->vr_lock);
1427 if (ring->vr_state == VRS_RUN) {
1428 cv_broadcast(&ring->vr_cv);
1429 } else {
1430 res = ESRCH;
1431 }
1432 mutex_exit(&ring->vr_lock);
1433
1434 return (res);
1435 }
1436
1437 static int
viona_ioc_set_notify_ioport(viona_link_t * link,uint16_t ioport)1438 viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport)
1439 {
1440 int err = 0;
1441
1442 if (link->l_notify_ioport != 0) {
1443 vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
1444 link->l_notify_ioport = 0;
1445 }
1446
1447 if (ioport != 0) {
1448 err = vmm_drv_ioport_hook(link->l_vm_hold, ioport,
1449 viona_notify_iop, (void *)link, &link->l_notify_cookie);
1450 if (err == 0) {
1451 link->l_notify_ioport = ioport;
1452 }
1453 }
1454 return (err);
1455 }
1456
1457 static int
viona_notify_mmio(void * arg,bool write,uint64_t address,int bytes,uint64_t * val)1458 viona_notify_mmio(void *arg, bool write, uint64_t address, int bytes,
1459 uint64_t *val)
1460 {
1461 viona_link_t *link = (viona_link_t *)arg;
1462
1463 /*
1464 * We are only interested in writes to this BAR region; kick reads out
1465 * to userspace.
1466 */
1467 if (!write)
1468 return (ESRCH);
1469
1470 const uint16_t vq = *val;
1471
1472 /* Let userspace handle notifications for rings other than RX/TX. */
1473 if (!VIONA_RING_VALID(link, vq))
1474 return (ESRCH);
1475
1476 viona_vring_t *ring = &link->l_vrings[vq];
1477 int res = 0;
1478
1479 mutex_enter(&ring->vr_lock);
1480 if (ring->vr_state == VRS_RUN)
1481 cv_broadcast(&ring->vr_cv);
1482 else
1483 res = ESRCH;
1484 mutex_exit(&ring->vr_lock);
1485
1486 return (res);
1487 }
1488
1489 static int
viona_ioc_set_notify_mmio(viona_link_t * link,void * udata,int md)1490 viona_ioc_set_notify_mmio(viona_link_t *link, void *udata, int md)
1491 {
1492 vioc_notify_mmio_t vim;
1493 int err = 0;
1494
1495 if (link->l_notify_mmaddr != NOTIFY_MMADDR_UNSET) {
1496 int err = vmm_drv_mmio_unhook(link->l_vm_hold,
1497 &link->l_notify_mmcookie);
1498 VERIFY(err == 0 || err == ENOENT);
1499 link->l_notify_mmaddr = NOTIFY_MMADDR_UNSET;
1500 }
1501
1502 if (udata == NULL)
1503 return (0);
1504
1505 if (ddi_copyin(udata, &vim, sizeof (vim), md) != 0)
1506 return (EFAULT);
1507
1508 err = vmm_drv_mmio_hook(link->l_vm_hold, vim.vim_address, vim.vim_size,
1509 viona_notify_mmio, (void *)link, &link->l_notify_mmcookie);
1510 if (err == 0) {
1511 link->l_notify_mmaddr = vim.vim_address;
1512 }
1513
1514 return (err);
1515 }
1516
1517 static int
viona_ioc_set_promisc(viona_link_t * link,viona_promisc_t mode)1518 viona_ioc_set_promisc(viona_link_t *link, viona_promisc_t mode)
1519 {
1520 int err;
1521
1522 if (mode >= VIONA_PROMISC_MAX) {
1523 return (EINVAL);
1524 }
1525
1526 if (mode == link->l_promisc) {
1527 return (0);
1528 }
1529
1530 if ((err = viona_rx_set(link, mode)) != 0) {
1531 return (err);
1532 }
1533
1534 link->l_promisc = mode;
1535 return (0);
1536 }
1537
/* Names of the link parameters as they appear in the parameter nvlist. */
#define	PARAM_NM_TX_COPY_DATA	"tx_copy_data"
#define	PARAM_NM_TX_HEADER_PAD	"tx_header_pad"

/* Per-parameter messages recorded in the error nvlist when parsing fails. */
#define	PARAM_ERR_INVALID_TYPE	"invalid type"
#define	PARAM_ERR_OUT_OF_RANGE	"value out of range"
#define	PARAM_ERR_UNK_KEY	"unknown key"
1544
1545 static nvlist_t *
viona_params_to_nvlist(const viona_link_params_t * vlp)1546 viona_params_to_nvlist(const viona_link_params_t *vlp)
1547 {
1548 nvlist_t *nvl = fnvlist_alloc();
1549
1550 fnvlist_add_boolean_value(nvl, PARAM_NM_TX_COPY_DATA,
1551 vlp->vlp_tx_copy_data);
1552 fnvlist_add_uint16(nvl, PARAM_NM_TX_HEADER_PAD,
1553 vlp->vlp_tx_header_pad);
1554
1555 return (nvl);
1556 }
1557
1558 static nvlist_t *
viona_params_from_nvlist(nvlist_t * nvl,viona_link_params_t * vlp)1559 viona_params_from_nvlist(nvlist_t *nvl, viona_link_params_t *vlp)
1560 {
1561 nvlist_t *nverr = fnvlist_alloc();
1562 nvpair_t *nvp = NULL;
1563
1564 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
1565 const char *name = nvpair_name(nvp);
1566 const data_type_t dtype = nvpair_type(nvp);
1567
1568 if (strcmp(name, PARAM_NM_TX_COPY_DATA) == 0) {
1569 if (dtype == DATA_TYPE_BOOLEAN_VALUE) {
1570 vlp->vlp_tx_copy_data =
1571 fnvpair_value_boolean_value(nvp);
1572 } else {
1573 fnvlist_add_string(nverr, name,
1574 PARAM_ERR_INVALID_TYPE);
1575 }
1576 continue;
1577 }
1578 if (strcmp(name, PARAM_NM_TX_HEADER_PAD) == 0) {
1579 if (dtype == DATA_TYPE_UINT16) {
1580 uint16_t value = fnvpair_value_uint16(nvp);
1581
1582 if (value > viona_max_header_pad) {
1583 fnvlist_add_string(nverr, name,
1584 PARAM_ERR_OUT_OF_RANGE);
1585 } else {
1586 vlp->vlp_tx_header_pad = value;
1587 }
1588 } else {
1589 fnvlist_add_string(nverr, name,
1590 PARAM_ERR_INVALID_TYPE);
1591 }
1592 continue;
1593 }
1594
1595 /* Reject parameters we do not recognize */
1596 fnvlist_add_string(nverr, name, PARAM_ERR_UNK_KEY);
1597 }
1598
1599 if (!nvlist_empty(nverr)) {
1600 return (nverr);
1601 }
1602
1603 nvlist_free(nverr);
1604 return (NULL);
1605 }
1606
1607 static void
viona_params_get_defaults(viona_link_params_t * vlp)1608 viona_params_get_defaults(viona_link_params_t *vlp)
1609 {
1610 vlp->vlp_tx_copy_data = viona_tx_copy_needed();
1611 vlp->vlp_tx_header_pad = 0;
1612 }
1613
1614 static int
viona_ioc_get_params(viona_link_t * link,void * udata,int md)1615 viona_ioc_get_params(viona_link_t *link, void *udata, int md)
1616 {
1617 vioc_get_params_t vgp;
1618 int err = 0;
1619
1620 if (ddi_copyin(udata, &vgp, sizeof (vgp), md) != 0) {
1621 return (EFAULT);
1622 }
1623
1624 nvlist_t *nvl = NULL;
1625 if (link != NULL) {
1626 nvl = viona_params_to_nvlist(&link->l_params);
1627 } else {
1628 viona_link_params_t vlp = { 0 };
1629
1630 viona_params_get_defaults(&vlp);
1631 nvl = viona_params_to_nvlist(&vlp);
1632 }
1633
1634 VERIFY(nvl != NULL);
1635
1636 size_t packed_sz;
1637 void *packed = fnvlist_pack(nvl, &packed_sz);
1638 nvlist_free(nvl);
1639
1640 if (packed_sz > vgp.vgp_param_sz) {
1641 err = E2BIG;
1642 }
1643 /* Communicate size, even if the data will not fit */
1644 vgp.vgp_param_sz = packed_sz;
1645
1646 if (err == 0 &&
1647 ddi_copyout(packed, vgp.vgp_param, packed_sz, md) != 0) {
1648 err = EFAULT;
1649 }
1650 kmem_free(packed, packed_sz);
1651
1652 if (ddi_copyout(&vgp, udata, sizeof (vgp), md) != 0) {
1653 if (err != 0) {
1654 err = EFAULT;
1655 }
1656 }
1657
1658 return (err);
1659 }
1660
1661 static int
viona_ioc_set_params(viona_link_t * link,void * udata,int md)1662 viona_ioc_set_params(viona_link_t *link, void *udata, int md)
1663 {
1664 vioc_set_params_t vsp;
1665 int err = 0;
1666 nvlist_t *nverr = NULL;
1667
1668 if (ddi_copyin(udata, &vsp, sizeof (vsp), md) != 0) {
1669 return (EFAULT);
1670 }
1671
1672 if (vsp.vsp_param_sz > VIONA_MAX_PARAM_NVLIST_SZ) {
1673 err = E2BIG;
1674 goto done;
1675 } else if (vsp.vsp_param_sz == 0) {
1676 /*
1677 * There is no reason to make this ioctl call with no actual
1678 * parameters to be changed.
1679 */
1680 err = EINVAL;
1681 goto done;
1682 }
1683
1684 const size_t packed_sz = vsp.vsp_param_sz;
1685 void *packed = kmem_alloc(packed_sz, KM_SLEEP);
1686 if (ddi_copyin(vsp.vsp_param, packed, packed_sz, md) != 0) {
1687 kmem_free(packed, packed_sz);
1688 err = EFAULT;
1689 goto done;
1690 }
1691
1692 nvlist_t *parsed = NULL;
1693 if (nvlist_unpack(packed, packed_sz, &parsed, KM_SLEEP) == 0) {
1694 /* Use the existing parameters as a starting point */
1695 viona_link_params_t new_params;
1696 bcopy(&link->l_params, &new_params,
1697 sizeof (new_params));
1698
1699 nverr = viona_params_from_nvlist(parsed, &new_params);
1700 if (nverr == NULL) {
1701 /*
1702 * Only apply the updated parameters if there
1703 * were no errors during parsing.
1704 */
1705 bcopy(&new_params, &link->l_params,
1706 sizeof (new_params));
1707 } else {
1708 err = EINVAL;
1709 }
1710
1711 } else {
1712 err = EINVAL;
1713 }
1714 nvlist_free(parsed);
1715 kmem_free(packed, packed_sz);
1716
1717 done:
1718 if (nverr != NULL) {
1719 size_t err_packed_sz;
1720 void *err_packed = fnvlist_pack(nverr, &err_packed_sz);
1721
1722 if (err_packed_sz > vsp.vsp_error_sz) {
1723 if (err != 0) {
1724 err = E2BIG;
1725 }
1726 } else if (ddi_copyout(err_packed, vsp.vsp_error,
1727 err_packed_sz, md) != 0 && err == 0) {
1728 err = EFAULT;
1729 }
1730 vsp.vsp_error_sz = err_packed_sz;
1731
1732 nvlist_free(nverr);
1733 kmem_free(err_packed, err_packed_sz);
1734 } else {
1735 /*
1736 * If there are no detailed per-field errors, it is important to
1737 * communicate that absense to userspace.
1738 */
1739 vsp.vsp_error_sz = 0;
1740 }
1741
1742 if (ddi_copyout(&vsp, udata, sizeof (vsp), md) != 0 && err == 0) {
1743 err = EFAULT;
1744 }
1745
1746 return (err);
1747 }
1748
1749 static int
viona_ioc_ring_intr_clear(viona_link_t * link,uint_t idx)1750 viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
1751 {
1752 if (!VIONA_RING_VALID(link, idx)) {
1753 return (EINVAL);
1754 }
1755
1756 link->l_vrings[idx].vr_intr_enabled = 0;
1757 return (0);
1758 }
1759
1760 static int
viona_ioc_intr_poll(viona_link_t * link,void * udata,int md,int * rv)1761 viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
1762 {
1763 vioc_intr_poll_t vip = { 0 };
1764 uint_t cnt = 0;
1765
1766 for (size_t i = 0;
1767 i < ARRAY_SIZE(vip.vip_status) && i < VIONA_USABLE_RINGS(link);
1768 i++) {
1769 uint_t val = link->l_vrings[i].vr_intr_enabled;
1770
1771 vip.vip_status[i] = val;
1772 if (val != 0)
1773 cnt++;
1774 }
1775
1776 if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0)
1777 return (EFAULT);
1778
1779 *rv = (int)cnt;
1780 return (0);
1781 }
1782
1783 static int
viona_ioc_intr_poll_mq(viona_link_t * link,void * udata,int md,int * rv)1784 viona_ioc_intr_poll_mq(viona_link_t *link, void *udata, int md, int *rv)
1785 {
1786 vioc_intr_poll_mq_t vipm;
1787 uint16_t cnt = 0;
1788 int err = 0;
1789
1790 bzero(&vipm, sizeof (vipm));
1791
1792 if (ddi_copyin(udata, &vipm.vipm_nrings, sizeof (vipm.vipm_nrings),
1793 md) != 0) {
1794 return (EFAULT);
1795 }
1796
1797 if (vipm.vipm_nrings < 1 || vipm.vipm_nrings > VIONA_USABLE_RINGS(link))
1798 return (EINVAL);
1799
1800 for (uint_t i = 0; i < vipm.vipm_nrings; i++) {
1801 if (link->l_vrings[i].vr_intr_enabled) {
1802 VIONA_INTR_SET(&vipm, i);
1803 cnt++;
1804 }
1805 }
1806
1807 if (ddi_copyout(&vipm, udata, sizeof (vipm), md) != 0)
1808 err = EFAULT;
1809 else
1810 *rv = (int)cnt;
1811
1812 return (err);
1813 }
1814