/*
 * Copyright (c) 2013 Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2025 Oxide Computer Company
 */

/*
 * viona - VirtIO-Net, Accelerated
 *
 * The purpose of viona is to provide high performance virtio-net devices to
 * bhyve guests. It does so by sitting directly atop MAC, skipping all of the
 * DLS/DLD stack.
 *
 * --------------------
 * General Architecture
 * --------------------
 *
 * A single viona instance is composed of a "link" handle and two "rings".
 * After opening the viona device, it must be associated with a MAC network
 * interface and a bhyve (vmm) instance to form its link resource. This is
 * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
 * passed in to perform the initialization. With the MAC client opened, and a
 * driver handle to the vmm instance established, the device is ready to be
 * configured by the guest.
 *
 * The userspace portion of bhyve, which interfaces with the PCI device
 * emulation framework, is meant to stay out of the datapath if at all
 * possible. Configuration changes made via PCI are mapped to actions which
 * will steer the operation of the in-kernel logic.
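 *
 * As a rough illustration (this sequence lives in bhyve userspace, not in
 * this file), plumbing a new link might look like the sketch below, where
 * the "/dev/viona" path, linkid, and vmfd values are assumptions standing
 * in for setup done elsewhere:
 *
 *	int ctl = open("/dev/viona", O_RDWR);
 *	vioc_create_t vc = { .c_linkid = linkid, .c_vmfd = vmfd };
 *	if (ioctl(ctl, VNA_IOC_CREATE, &vc) != 0)
 *		err(EXIT_FAILURE, "VNA_IOC_CREATE");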
 *
 *
 * -----------
 * Ring Basics
 * -----------
 *
 * Each viona link has two viona_vring_t entities, RX and TX, for handling data
 * transfers to and from the guest. They represent an interface to the
 * standard virtio ring structures. When initialized and active, each ring is
 * backed by a kernel worker thread (parented to the bhyve process for the
 * instance) which handles ring events. The RX worker has the simple task of
 * watching for ring shutdown conditions. The TX worker does that in addition
 * to processing all requests to transmit data. Data destined for the guest is
 * delivered directly by MAC to viona_rx() when the ring is active.
 *
 *
 * -----------
 * Ring States
 * -----------
 *
 * The viona_vring_t instances follow a simple path through the possible state
 * values represented in viona_vring_t`vr_state:
 *
 *      +<--------------------------------------------+
 *      |                                             |
 *      V                                             ^
 * +-----------+   This is the initial state when a   |
 * | VRS_RESET |   link is created or when the ring   |
 * +-----------+   has been explicitly reset.         |
 *      |                                             ^
 *      |---* ioctl(VNA_IOC_RING_INIT) issued         |
 *      |                                             |
 *      V                                             ^
 * +-----------+   The ring parameters (size, guest   |
 * | VRS_SETUP |   physical addresses) have been set  |
 * +-----------+   and start-up of the ring worker    |
 *      |          thread has begun.                  ^
 *      |                                             |
 *      |---* ring worker thread begins execution     ^
 *      |                                             |
 *      |-------------------------------------------->+
 *      |                                             |
 *      |   * If ring shutdown is requested (by ioctl ^
 *      |     or impending bhyve process death) while |
 *      |     the worker thread is starting, the      |
 *      |     worker will transition the ring to      ^
 *      |     VRS_RESET and exit.                     |
 *      |                                             |
 *      |<-------------------------------------------<+
 *      |                                             |
 *      |   * If the ring is requested to pause (but  ^
 *      |     not stop) from the VRS_RUN state, it    |
 *      |     will return to the VRS_INIT state.      |
 *      |                                             ^
 *      V                                             |
 * +-----------+   The worker thread associated with  |
 * | VRS_INIT  |   the ring has started executing.    ^
 * +-----------+   It has allocated any extra         |
 *      |          resources needed for the ring to   |
 *      |          operate.                           ^
 *      |                                             |
 *      |-------------------------------------------->+
 *      |                                             |
 *      |   * If ring shutdown is requested while the ^
 *      |     worker is waiting in VRS_INIT, it will  |
 *      |     free any extra resources and transition |
 *      |     to VRS_RESET.                           ^
 *      |                                             |
 *      |---* ioctl(VNA_IOC_RING_KICK) issued         ^
 *      |                                             |
 *      V                                             |
 * +-----------+   The worker thread associated with  ^
 * | VRS_RUN   |   the ring is executing workload     |
 * +-----------+   specific to that ring.             ^
 *      |                                             |
 *      |---* ioctl(VNA_IOC_RING_RESET) issued        ^
 *      |     (or bhyve process begins exit)          |
 *      V                                             |
 * +-----------+   The worker thread associated with  ^
 * | VRS_STOP  |   the ring is in the process of      |
 * +-----------+   exiting. All outstanding TX and    ^
 *      |          RX requests are allowed to         |
 *      |          complete, but new requests must    |
 *      |          be ignored.                        ^
 *      |                                             |
 *      +-------------------------------------------->+
 *
 *
 * While the worker thread is not running, changes to vr_state are only made by
 * viona_ioc_ring_init() under vr_lock. There, it initializes the ring, starts
 * the worker, and sets the ring state to VRS_SETUP. Once the worker thread
 * has been started, only it may perform ring state transitions (still under
 * the protection of vr_lock), when requested by outside consumers via
 * vr_state_flags or when the containing bhyve process initiates an exit.
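 *
 * As a minimal sketch of that request mechanism (mirroring what
 * viona_ioc_ring_kick() below actually does), an outside consumer flags its
 * request and wakes the worker rather than changing vr_state itself:
 *
 *	mutex_enter(&ring->vr_lock);
 *	ring->vr_state_flags |= VRSF_REQ_START;
 *	cv_broadcast(&ring->vr_cv);
 *	mutex_exit(&ring->vr_lock);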
 *
 *
 * ----------------------------
 * Transmission mblk_t Handling
 * ----------------------------
 *
 * For incoming frames destined for a bhyve guest, the data must first land in
 * a host OS buffer from the physical NIC before it is copied into the awaiting
 * guest buffer(s). Outbound frames transmitted by the guest are not bound by
 * this limitation and can avoid extra copying before the buffers are accessed
 * directly by the NIC. When a guest designates buffers to be transmitted,
 * viona translates the guest-physical addresses contained in the ring
 * descriptors to host-virtual addresses via viona_hold_page(). That pointer is
 * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc().
 * Doing so increments vr_xfer_outstanding, preventing the ring from being
 * reset (allowing the link to drop its vmm handle to the guest) until all
 * transmit mblks referencing guest memory have been processed. Allocation of
 * the viona_desb_t entries is done during the VRS_INIT stage of the ring
 * worker thread. The ring size informs that allocation as the number of
 * concurrent transmissions is limited by the number of descriptors in the
 * ring. This minimizes allocation in the transmit hot-path by acquiring those
 * fixed-size resources during initialization.
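 *
 * A hedged sketch of that zero-copy wrap (the viona_desb_t member and
 * callback names here are illustrative, not necessarily those used by the
 * actual TX code):
 *
 *	viona_desb_t *dp = ...;			(preallocated at VRS_INIT)
 *	dp->d_frtn.free_func = viona_desb_free;	(runs when the NIC driver
 *	dp->d_frtn.free_arg = (caddr_t)dp;	 finally frees the mblk)
 *	mblk_t *mp = desballoc(host_va, len, 0, &dp->d_frtn);
 *	atomic_inc_uint(&ring->vr_xfer_outstanding);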
 *
 * This optimization depends on the underlying NIC driver freeing the mblks in
 * a timely manner after they have been transmitted by the hardware. Some
 * drivers have been found to flush TX descriptors only when new transmissions
 * are initiated. This means that there is no upper bound on the time needed
 * for an mblk to be flushed, which can stall bhyve guests from shutting down,
 * since their memory must be free of viona TX references prior to clean-up.
 *
 * This expectation of deterministic mblk_t processing is likely the reason
 * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
 * loaded will copy transmit data into fresh buffers rather than passing up
 * zero-copy mblks. It is a hold-over from the original viona sources provided
 * by Pluribus, and its continued necessity has not been confirmed.
 *
 *
 * ----------------------------
 * Ring Notification Fast-paths
 * ----------------------------
 *
 * Device operation for viona requires that notifications flow to and from the
 * guest to indicate certain ring conditions. In order to minimize latency and
 * processing overhead, the notification procedures are kept in-kernel whenever
 * possible.
 *
 * Guest-to-host notifications, when new available descriptors have been placed
 * in the ring, are posted via the 'queue notify' address in the virtio BAR.
 * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to
 * install a callback hook on an ioport address. Guest exits for accesses to
 * viona-hooked ioport addresses will result in direct calls to notify the
 * appropriate ring worker without a trip to userland.
 *
 * Host-to-guest notifications in the form of interrupts enjoy similar
 * acceleration. Each viona ring can be configured to send MSI notifications
 * to the guest as virtio conditions dictate. This in-kernel interrupt
 * configuration is kept synchronized through viona ioctls which are utilized
 * during writes to the associated PCI config registers or MSI-X BAR.
 *
 * Guests which do not utilize MSI-X cause viona to fall back to the slow path
 * for interrupts. It will poll(2) the viona handle, receiving notification
 * when ring events necessitate the assertion of an interrupt.
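 *
 * A hedged userspace sketch of that slow path (the interrupt assertion into
 * the guest is left out, and viona_fd is assumed to be an open viona handle):
 *
 *	struct pollfd pfd = { .fd = viona_fd, .events = POLLRDBAND };
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLRDBAND) != 0) {
 *		vioc_intr_poll_t vip;
 *		(void) ioctl(viona_fd, VNA_IOC_INTR_POLL, &vip);
 *		... assert interrupts for rings flagged in vip.vip_status,
 *		    then clear each via VNA_IOC_RING_INTR_CLR ...
 *	}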
 *
 *
 * ---------------
 * Nethook Support
 * ---------------
 *
 * Viona provides four nethook events that consumers (e.g. ipf) can hook into
 * to intercept packets as they go up or down the stack. Unfortunately,
 * the nethook framework does not understand raw packets, so we can only
 * generate events (in, out) for IPv4 and IPv6 packets. At driver attach,
 * we register callbacks with the neti (netinfo) module that will be invoked
 * for each netstack already present, as well as for any additional netstack
 * instances created as the system operates. These callbacks will
 * register/unregister the hooks with the nethook framework for each
 * netstack instance. This registration occurs prior to creating any
 * viona instances for a given netstack, and the unregistration for a netstack
 * instance occurs after all viona instances of the netstack instance have
 * been deleted.
 *
 * ------------------
 * Metrics/Statistics
 * ------------------
 *
 * During operation, viona tracks certain metrics as events occur.
 *
 * One class of metrics, known as the "error stats", refers to abnormal
 * conditions in ring processing which are likely the fault of a misbehaving
 * guest. These are tracked on a per-ring basis, and are not formally exposed
 * to any consumer besides direct memory access through mdb.
 *
 * The other class of metrics tracked for an instance are the "transfer stats",
 * which are the traditional packets/bytes/errors/drops figures. These are
 * counted per-ring, and then aggregated into link-wide values exposed via
 * kstats. Atomic operations are used to increment those per-ring stats during
 * operation, and then when a ring is stopped, the values are consolidated into
 * the link-wide values (to prevent loss when the ring is zeroed) under the
 * protection of viona_link`l_stats_lock. When the kstats are being updated,
 * l_stats_lock is held to protect against a racing consolidation, with the
 * existing per-ring values being added in at update time to provide an
 * accurate figure.
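 *
 * As a hedged example of consuming those figures, the link-wide values can
 * be read with kstat(8) using the module/class/name constants defined just
 * below:
 *
 *	kstat -m viona -c misc -n viona_stat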
 */

#include <sys/conf.h>
#include <sys/file.h>
#include <sys/stat.h>

#include <sys/dlpi.h>
#include <sys/vlan.h>

#include "viona_impl.h"


#define	VIONA_NAME		"Virtio Network Accelerator"
#define	VIONA_CTL_MINOR		0
#define	VIONA_MODULE_NAME	"viona"
#define	VIONA_KSTAT_CLASS	"misc"
#define	VIONA_KSTAT_NAME	"viona_stat"


/*
 * Host capabilities.
 */
#define	VIONA_S_HOSTCAPS	(	\
	VIRTIO_NET_F_GUEST_CSUM |	\
	VIRTIO_NET_F_MAC |		\
	VIRTIO_NET_F_GUEST_TSO4 |	\
	VIRTIO_NET_F_MRG_RXBUF |	\
	VIRTIO_NET_F_STATUS |		\
	VIRTIO_F_RING_NOTIFY_ON_EMPTY |	\
	VIRTIO_F_RING_INDIRECT_DESC)

/* MAC_CAPAB_HCKSUM specifics of interest */
#define	VIONA_CAP_HCKSUM_INTEREST	\
	(HCKSUM_INET_PARTIAL |		\
	HCKSUM_INET_FULL_V4 |		\
	HCKSUM_INET_FULL_V6)

static void *viona_state;
static dev_info_t *viona_dip;
static id_space_t *viona_minors;


static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
    void **result);
static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
    cred_t *credp, int *rval);
static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);

static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
static int viona_ioc_delete(viona_soft_state_t *, boolean_t);

static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t);
static int viona_ioc_set_promisc(viona_link_t *, viona_promisc_t);
static int viona_ioc_get_params(viona_link_t *, void *, int);
static int viona_ioc_set_params(viona_link_t *, void *, int);
static int viona_ioc_ring_init(viona_link_t *, void *, int);
static int viona_ioc_ring_set_state(viona_link_t *, void *, int);
static int viona_ioc_ring_get_state(viona_link_t *, void *, int);
static int viona_ioc_ring_reset(viona_link_t *, uint_t);
static int viona_ioc_ring_kick(viona_link_t *, uint_t);
static int viona_ioc_ring_pause(viona_link_t *, uint_t);
static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);

static void viona_params_get_defaults(viona_link_params_t *);

static struct cb_ops viona_cb_ops = {
	viona_open,		/* cb_open */
	viona_close,		/* cb_close */
	nodev,			/* cb_strategy */
	nodev,			/* cb_print */
	nodev,			/* cb_dump */
	nodev,			/* cb_read */
	nodev,			/* cb_write */
	viona_ioctl,		/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	viona_chpoll,		/* cb_chpoll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* cb_str */
	D_MP | D_NEW | D_HOTPLUG,	/* cb_flag */
	CB_REV,			/* cb_rev */
	nodev,			/* cb_aread */
	nodev			/* cb_awrite */
};

static struct dev_ops viona_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	viona_info,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	viona_attach,		/* devo_attach */
	viona_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&viona_cb_ops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	ddi_power,		/* devo_power */
	ddi_quiesce_not_needed	/* devo_quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,
	VIONA_NAME,
	&viona_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	int ret;

	ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0);
	if (ret != 0) {
		return (ret);
	}

	viona_minors = id_space_create("viona_minors",
	    VIONA_CTL_MINOR + 1, UINT16_MAX);
	viona_rx_init();
	mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);

	ret = mod_install(&modlinkage);
	if (ret != 0) {
		ddi_soft_state_fini(&viona_state);
		id_space_destroy(viona_minors);
		viona_rx_fini();
		mutex_destroy(&viona_force_copy_lock);
	}

	return (ret);
}

int
_fini(void)
{
	int ret;

	ret = mod_remove(&modlinkage);
	if (ret != 0) {
		return (ret);
	}

	ddi_soft_state_fini(&viona_state);
	id_space_destroy(viona_minors);
	viona_rx_fini();
	mutex_destroy(&viona_force_copy_lock);

	return (ret);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/* ARGSUSED */
static int
viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)viona_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

static int
viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		return (DDI_FAILURE);
	}

	viona_neti_attach();

	viona_dip = dip;
	ddi_report_dev(viona_dip);

	return (DDI_SUCCESS);
}

static int
viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	dev_info_t *old_dip = viona_dip;

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	VERIFY(old_dip != NULL);

	viona_neti_detach();
	viona_dip = NULL;
	ddi_remove_minor_node(old_dip, NULL);

	return (DDI_SUCCESS);
}

static int
viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
{
	int minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}
#if 0
	/*
	 * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
	 * Should the check be at open() or ioctl()?
	 */
	if (drv_priv(credp) != 0) {
		return (EPERM);
	}
#endif
	if (getminor(*devp) != VIONA_CTL_MINOR) {
		return (ENXIO);
	}

	minor = id_alloc_nosleep(viona_minors);
	if (minor == -1) {
		/* All minors are busy */
		return (EBUSY);
	}
	if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
		id_free(viona_minors, minor);
		return (ENOMEM);
	}

	ss = ddi_get_soft_state(viona_state, minor);
	mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
	ss->ss_minor = minor;
	*devp = makedevice(getmajor(*devp), minor);

	return (0);
}

static int
viona_close(dev_t dev, int flag, int otype, cred_t *credp)
{
	int minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}

	minor = getminor(dev);

	ss = ddi_get_soft_state(viona_state, minor);
	if (ss == NULL) {
		return (ENXIO);
	}

	VERIFY0(viona_ioc_delete(ss, B_TRUE));
	VERIFY(!list_link_active(&ss->ss_node));
	ddi_soft_state_free(viona_state, minor);
	id_free(viona_minors, minor);

	return (0);
}

static int
viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
{
	viona_soft_state_t *ss;
	void *dptr = (void *)data;
	int err = 0, val;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_CREATE:
		return (viona_ioc_create(ss, dptr, md, cr));
	case VNA_IOC_DELETE:
		return (viona_ioc_delete(ss, B_FALSE));
	case VNA_IOC_VERSION:
		*rv = VIONA_CURRENT_INTERFACE_VERSION;
		return (0);
	case VNA_IOC_DEFAULT_PARAMS:
		/*
		 * With a NULL link parameter, viona_ioc_get_params() will emit
		 * the default parameters with the same error-handling behavior
		 * as VNA_IOC_GET_PARAMS.
		 */
		return (viona_ioc_get_params(NULL, dptr, md));
	default:
		break;
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed ||
	    vmm_drv_release_reqd(link->l_vm_hold)) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_GET_FEATURES:
		val = VIONA_S_HOSTCAPS | link->l_features_hw;
		if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
			err = EFAULT;
		}
		break;
	case VNA_IOC_SET_FEATURES:
		if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
			err = EFAULT;
			break;
		}
		val &= (VIONA_S_HOSTCAPS | link->l_features_hw);

		if ((val & VIRTIO_NET_F_CSUM) == 0)
			val &= ~VIRTIO_NET_F_HOST_TSO4;

		if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
			val &= ~VIRTIO_NET_F_GUEST_TSO4;

		link->l_features = val;
		break;
	case VNA_IOC_RING_INIT:
		err = viona_ioc_ring_init(link, dptr, md);
		break;
	case VNA_IOC_RING_RESET:
		err = viona_ioc_ring_reset(link, (uint_t)data);
		break;
	case VNA_IOC_RING_KICK:
		err = viona_ioc_ring_kick(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_MSI:
		err = viona_ioc_ring_set_msi(link, dptr, md);
		break;
	case VNA_IOC_RING_INTR_CLR:
		err = viona_ioc_ring_intr_clear(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_STATE:
		err = viona_ioc_ring_set_state(link, dptr, md);
		break;
	case VNA_IOC_RING_GET_STATE:
		err = viona_ioc_ring_get_state(link, dptr, md);
		break;
	case VNA_IOC_RING_PAUSE:
		err = viona_ioc_ring_pause(link, (uint_t)data);
		break;

	case VNA_IOC_INTR_POLL:
		err = viona_ioc_intr_poll(link, dptr, md, rv);
		break;
	case VNA_IOC_SET_NOTIFY_IOP:
		if (data < 0 || data > UINT16_MAX) {
			err = EINVAL;
			break;
		}
		err = viona_ioc_set_notify_ioport(link, (uint16_t)data);
		break;
	case VNA_IOC_SET_PROMISC:
		err = viona_ioc_set_promisc(link, (viona_promisc_t)data);
		break;
	case VNA_IOC_GET_PARAMS:
		err = viona_ioc_get_params(link, dptr, md);
		break;
	case VNA_IOC_SET_PARAMS:
		err = viona_ioc_set_params(link, dptr, md);
		break;
	case VNA_IOC_GET_MTU:
		*rv = (int)link->l_mtu;
		break;
	case VNA_IOC_SET_MTU:
		if (data < VIONA_MIN_MTU || data > VIONA_MAX_MTU)
			err = EINVAL;
		else
			link->l_mtu = (uint16_t)data;
		break;
	default:
		err = ENOTTY;
		break;
	}

	mutex_exit(&ss->ss_lock);
	return (err);
}

static int
viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	viona_soft_state_t *ss;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	*reventsp = 0;
	if ((events & POLLRDBAND) != 0) {
		for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
			if (link->l_vrings[i].vr_intr_enabled != 0) {
				*reventsp |= POLLRDBAND;
				break;
			}
		}
	}
	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
		*phpp = &link->l_pollhead;
	}
	mutex_exit(&ss->ss_lock);

	return (0);
}

static void
viona_get_mac_capab(viona_link_t *link)
{
	mac_handle_t mh = link->l_mh;
	uint32_t cap = 0;
	mac_capab_lso_t lso_cap;

	link->l_features_hw = 0;
	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
		/*
		 * Only report HW checksum ability if the underlying MAC
		 * resource is capable of populating the L4 header.
		 */
		if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
			link->l_features_hw |= VIRTIO_NET_F_CSUM;
		}
		link->l_cap_csum = cap;
	}

	if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
	    mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
		/*
		 * Virtio doesn't allow for negotiating a maximum LSO
		 * packet size. We have to assume that the guest may
		 * send a maximum length IP packet. Make sure the
		 * underlying MAC can handle an LSO of this size.
		 */
		if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
		    lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
			link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
	}
}

static int
viona_kstat_update(kstat_t *ksp, int rw)
{
	viona_link_t *link = ksp->ks_private;
	viona_kstats_t *vk = ksp->ks_data;

	/*
	 * Avoid the potential for mangled values due to a racing consolidation
	 * of stats for a ring by performing the kstat update with l_stats_lock
	 * held while adding up the central (link) and ring values.
	 */
	mutex_enter(&link->l_stats_lock);

	const viona_transfer_stats_t *ring_stats =
	    &link->l_vrings[VIONA_VQ_RX].vr_stats;
	const viona_transfer_stats_t *link_stats = &link->l_stats.vls_rx;

	vk->vk_rx_packets.value.ui64 =
	    link_stats->vts_packets + ring_stats->vts_packets;
	vk->vk_rx_bytes.value.ui64 =
	    link_stats->vts_bytes + ring_stats->vts_bytes;
	vk->vk_rx_errors.value.ui64 =
	    link_stats->vts_errors + ring_stats->vts_errors;
	vk->vk_rx_drops.value.ui64 =
	    link_stats->vts_drops + ring_stats->vts_drops;

	ring_stats = &link->l_vrings[VIONA_VQ_TX].vr_stats;
	link_stats = &link->l_stats.vls_tx;

	vk->vk_tx_packets.value.ui64 =
	    link_stats->vts_packets + ring_stats->vts_packets;
	vk->vk_tx_bytes.value.ui64 =
	    link_stats->vts_bytes + ring_stats->vts_bytes;
	vk->vk_tx_errors.value.ui64 =
	    link_stats->vts_errors + ring_stats->vts_errors;
	vk->vk_tx_drops.value.ui64 =
	    link_stats->vts_drops + ring_stats->vts_drops;

	mutex_exit(&link->l_stats_lock);

	return (0);
}

static int
viona_kstat_init(viona_soft_state_t *ss, const cred_t *cr)
{
	zoneid_t zid = crgetzoneid(cr);
	kstat_t *ksp;

	ASSERT(MUTEX_HELD(&ss->ss_lock));
	ASSERT3P(ss->ss_kstat, ==, NULL);

	ksp = kstat_create_zone(VIONA_MODULE_NAME, ss->ss_minor,
	    VIONA_KSTAT_NAME, VIONA_KSTAT_CLASS, KSTAT_TYPE_NAMED,
	    sizeof (viona_kstats_t) / sizeof (kstat_named_t), 0, zid);

	if (ksp == NULL) {
		/*
		 * Without detail from kstat_create_zone(), assume that resource
		 * exhaustion is to blame for the failure.
		 */
		return (ENOMEM);
	}
	ss->ss_kstat = ksp;

	/*
	 * If this instance is associated with a non-global zone, make its
	 * kstats visible from the GZ.
	 */
	if (zid != GLOBAL_ZONEID) {
		kstat_zone_add(ss->ss_kstat, GLOBAL_ZONEID);
	}

	viona_kstats_t *vk = ksp->ks_data;

	kstat_named_init(&vk->vk_rx_packets, "rx_packets", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_rx_bytes, "rx_bytes", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_rx_errors, "rx_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_rx_drops, "rx_drops", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_tx_packets, "tx_packets", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_tx_bytes, "tx_bytes", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_tx_errors, "tx_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vk->vk_tx_drops, "tx_drops", KSTAT_DATA_UINT64);
	ksp->ks_private = ss->ss_link;
	ksp->ks_update = viona_kstat_update;

	kstat_install(ss->ss_kstat);
	return (0);
}

static void
viona_kstat_fini(viona_soft_state_t *ss)
{
	ASSERT(MUTEX_HELD(&ss->ss_lock));

	if (ss->ss_kstat != NULL) {
		kstat_delete(ss->ss_kstat);
		ss->ss_kstat = NULL;
	}
}

static int
viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
{
	vioc_create_t kvc;
	viona_link_t *link = NULL;
	char cli_name[MAXNAMELEN];
	int err = 0;
	file_t *fp;
	vmm_hold_t *hold = NULL;
	viona_neti_t *nip = NULL;
	zoneid_t zid;
	mac_diag_t mac_diag = MAC_DIAG_NONE;
	boolean_t rings_allocd = B_FALSE;

	ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));

	if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
		return (EFAULT);
	}

	zid = crgetzoneid(cr);
	nip = viona_neti_lookup_by_zid(zid);
	if (nip == NULL) {
		return (EIO);
	}

	if (!nip->vni_nethook.vnh_hooked) {
		viona_neti_rele(nip);
		return (EIO);
	}

	mutex_enter(&ss->ss_lock);
	if (ss->ss_link != NULL) {
		mutex_exit(&ss->ss_lock);
		viona_neti_rele(nip);
		return (EEXIST);
	}

	if ((fp = getf(kvc.c_vmfd)) == NULL) {
		err = EBADF;
		goto bail;
	}
	err = vmm_drv_hold(fp, cr, &hold);
	releasef(kvc.c_vmfd);
	if (err != 0) {
		goto bail;
	}

	link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
	link->l_linkid = kvc.c_linkid;
	link->l_vm_hold = hold;
	link->l_mtu = VIONA_DEFAULT_MTU;

	err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
	if (err != 0) {
		goto bail;
	}

	viona_get_mac_capab(link);
	viona_params_get_defaults(&link->l_params);

	(void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_MODULE_NAME,
	    link->l_linkid);
	err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
	if (err != 0) {
		goto bail;
	}

	err = mac_unicast_add(link->l_mch, NULL, MAC_UNICAST_PRIMARY,
	    &link->l_muh, VLAN_ID_NONE, &mac_diag);
	if (err != 0) {
		goto bail;
	}

	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);
	rings_allocd = B_TRUE;

	/*
	 * Default to passing up all multicast traffic in addition to
	 * classified unicast. Guests which have support will change this
	 * if they need to via the virtio net control queue; guests without
	 * support generally still want to see multicast.
	 */
	link->l_promisc = VIONA_PROMISC_MULTI;
	if ((err = viona_rx_set(link, link->l_promisc)) != 0) {
		goto bail;
	}

	link->l_neti = nip;
	ss->ss_link = link;

	if ((err = viona_kstat_init(ss, cr)) != 0) {
		goto bail;
	}

	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_insert_tail(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	return (0);

bail:
	if (link != NULL) {
		viona_rx_clear(link);
		if (link->l_mch != NULL) {
			if (link->l_muh != NULL) {
				VERIFY0(mac_unicast_remove(link->l_mch,
				    link->l_muh));
				link->l_muh = NULL;
			}
			mac_client_close(link->l_mch, 0);
		}
		if (link->l_mh != NULL) {
			mac_close(link->l_mh);
		}
		if (rings_allocd) {
			viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
			viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
		}
		kmem_free(link, sizeof (viona_link_t));
		ss->ss_link = NULL;
	}
	if (hold != NULL) {
		vmm_drv_rele(hold);
	}
	viona_neti_rele(nip);

	mutex_exit(&ss->ss_lock);
	return (err);
}

static int
viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
{
	viona_link_t *link;
	viona_neti_t *nip = NULL;

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL) {
		/* Link destruction already complete */
		mutex_exit(&ss->ss_lock);
		return (0);
	}

	if (link->l_destroyed) {
		/*
		 * Link destruction has been started by another thread, but has
		 * not completed. This condition should be impossible to
		 * encounter when performing the on-close destroy of the link,
		 * since racing ioctl accessors must necessarily be absent.
		 */
		VERIFY(!on_close);
		mutex_exit(&ss->ss_lock);
		return (EAGAIN);
	}
	/*
	 * The link deletion cannot fail after this point; it will proceed
	 * until its successful completion.
	 */
	link->l_destroyed = B_TRUE;

	/*
	 * Tear down the IO port hook so it cannot be used to kick any of the
	 * rings which are about to be reset and stopped.
	 */
	VERIFY0(viona_ioc_set_notify_ioport(link, 0));
	mutex_exit(&ss->ss_lock);

	/*
	 * Return the rings to their reset state, ignoring any possible
	 * interruptions from signals.
	 */
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));

	mutex_enter(&ss->ss_lock);
	viona_kstat_fini(ss);
	if (link->l_mch != NULL) {
		/* Unhook the receive callbacks and close out the client */
		viona_rx_clear(link);
		if (link->l_muh != NULL) {
			VERIFY0(mac_unicast_remove(link->l_mch, link->l_muh));
			link->l_muh = NULL;
		}
		mac_client_close(link->l_mch, 0);
	}
	if (link->l_mh != NULL) {
		mac_close(link->l_mh);
	}
	if (link->l_vm_hold != NULL) {
		vmm_drv_rele(link->l_vm_hold);
		link->l_vm_hold = NULL;
	}

	nip = link->l_neti;
	link->l_neti = NULL;

	viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
	viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
	pollhead_clean(&link->l_pollhead);
	ss->ss_link = NULL;
	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_remove(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	viona_neti_rele(nip);

	kmem_free(link, sizeof (viona_link_t));
	return (0);
}

static int
viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
{
	vioc_ring_init_t kri;
	int err;

	if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
		return (EFAULT);
	}
	const struct viona_ring_params params = {
		.vrp_pa = kri.ri_qaddr,
		.vrp_size = kri.ri_qsize,
		.vrp_avail_idx = 0,
		.vrp_used_idx = 0,
	};

	err = viona_ring_init(link, kri.ri_index, &params);

	return (err);
}

static int
viona_ioc_ring_set_state(viona_link_t *link, void *udata, int md)
{
	vioc_ring_state_t krs;
	int err;

	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
		return (EFAULT);
	}
	const struct viona_ring_params params = {
		.vrp_pa = krs.vrs_qaddr,
		.vrp_size = krs.vrs_qsize,
		.vrp_avail_idx = krs.vrs_avail_idx,
		.vrp_used_idx = krs.vrs_used_idx,
	};

	err = viona_ring_init(link, krs.vrs_index, &params);

	return (err);
}

static int
viona_ioc_ring_get_state(viona_link_t *link, void *udata, int md)
{
	vioc_ring_state_t krs;

	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
		return (EFAULT);
	}

	struct viona_ring_params params;
	int err = viona_ring_get_state(link, krs.vrs_index, &params);
	if (err != 0) {
		return (err);
	}
	krs.vrs_qsize = params.vrp_size;
	krs.vrs_qaddr = params.vrp_pa;
	krs.vrs_avail_idx = params.vrp_avail_idx;
	krs.vrs_used_idx = params.vrp_used_idx;

	if (ddi_copyout(&krs, udata, sizeof (krs), md) != 0) {
		return (EFAULT);
	}
	return (0);
}

static int
viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	return (viona_ring_reset(ring, B_TRUE));
}

static int
viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;
	int err;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	mutex_enter(&ring->vr_lock);
	switch (ring->vr_state) {
	case VRS_SETUP:
		/*
		 * An early kick to a ring which is starting its worker thread
		 * is fine. Once that thread is active, it will process the
		 * start-up request immediately.
		 */
		/* FALLTHROUGH */
	case VRS_INIT:
		ring->vr_state_flags |= VRSF_REQ_START;
		/* FALLTHROUGH */
	case VRS_RUN:
		cv_broadcast(&ring->vr_cv);
		err = 0;
		break;
	default:
		err = EBUSY;
		break;
	}
	mutex_exit(&ring->vr_lock);

	return (err);
}

static int
viona_ioc_ring_pause(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	viona_vring_t *ring = &link->l_vrings[idx];
	return (viona_ring_pause(ring));
}

static int
viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
{
	vioc_ring_msi_t vrm;
	viona_vring_t *ring;

	if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
		return (EFAULT);
	}
	if (vrm.rm_index >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	ring = &link->l_vrings[vrm.rm_index];
	mutex_enter(&ring->vr_lock);
	ring->vr_msi_addr = vrm.rm_addr;
	ring->vr_msi_msg = vrm.rm_msg;
	mutex_exit(&ring->vr_lock);

	return (0);
}

static int
viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes,
    uint32_t *val)
{
	viona_link_t *link = (viona_link_t *)arg;

	/*
	 * If the request is a read (in/ins), or directed at a port other than
	 * the one we expect to be registered on, ignore it.
	 */
	if (in || port != link->l_notify_ioport) {
		return (ESRCH);
	}

	/* Let userspace handle notifications for rings other than RX/TX. */
	const uint16_t vq = *val;
	if (vq >= VIONA_VQ_MAX) {
		return (ESRCH);
	}

	viona_vring_t *ring = &link->l_vrings[vq];
	int res = 0;

	mutex_enter(&ring->vr_lock);
	if (ring->vr_state == VRS_RUN) {
		cv_broadcast(&ring->vr_cv);
	} else {
		res = ESRCH;
	}
	mutex_exit(&ring->vr_lock);

	return (res);
}

static int
viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport)
{
	int err = 0;

	if (link->l_notify_ioport != 0) {
		vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
		link->l_notify_ioport = 0;
	}

	if (ioport != 0) {
		err = vmm_drv_ioport_hook(link->l_vm_hold, ioport,
		    viona_notify_iop, (void *)link, &link->l_notify_cookie);
		if (err == 0) {
			link->l_notify_ioport = ioport;
		}
	}
	return (err);
}

static int
viona_ioc_set_promisc(viona_link_t *link, viona_promisc_t mode)
{
	int err;

	if (mode >= VIONA_PROMISC_MAX) {
		return (EINVAL);
	}

	if (mode == link->l_promisc) {
		return (0);
	}

	if ((err = viona_rx_set(link, mode)) != 0) {
		return (err);
	}

	link->l_promisc = mode;
	return (0);
}

#define	PARAM_NM_TX_COPY_DATA	"tx_copy_data"
#define	PARAM_NM_TX_HEADER_PAD	"tx_header_pad"

#define	PARAM_ERR_INVALID_TYPE	"invalid type"
#define	PARAM_ERR_OUT_OF_RANGE	"value out of range"
#define	PARAM_ERR_UNK_KEY	"unknown key"

static nvlist_t *
viona_params_to_nvlist(const viona_link_params_t *vlp)
{
	nvlist_t *nvl = fnvlist_alloc();

	fnvlist_add_boolean_value(nvl, PARAM_NM_TX_COPY_DATA,
	    vlp->vlp_tx_copy_data);
	fnvlist_add_uint16(nvl, PARAM_NM_TX_HEADER_PAD,
	    vlp->vlp_tx_header_pad);

	return (nvl);
}

static nvlist_t *
viona_params_from_nvlist(nvlist_t *nvl, viona_link_params_t *vlp)
{
	nvlist_t *nverr = fnvlist_alloc();
	nvpair_t *nvp = NULL;

	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		const char *name = nvpair_name(nvp);
		const data_type_t dtype = nvpair_type(nvp);

		if (strcmp(name, PARAM_NM_TX_COPY_DATA) == 0) {
			if (dtype == DATA_TYPE_BOOLEAN_VALUE) {
				vlp->vlp_tx_copy_data =
				    fnvpair_value_boolean_value(nvp);
			} else {
				fnvlist_add_string(nverr, name,
				    PARAM_ERR_INVALID_TYPE);
			}
			continue;
		}
		if (strcmp(name, PARAM_NM_TX_HEADER_PAD) == 0) {
			if (dtype == DATA_TYPE_UINT16) {
				uint16_t value = fnvpair_value_uint16(nvp);

				if (value > viona_max_header_pad) {
					fnvlist_add_string(nverr, name,
					    PARAM_ERR_OUT_OF_RANGE);
				} else {
					vlp->vlp_tx_header_pad = value;
				}
			} else {
				fnvlist_add_string(nverr, name,
				    PARAM_ERR_INVALID_TYPE);
			}
			continue;
		}

		/* Reject parameters we do not recognize */
		fnvlist_add_string(nverr, name, PARAM_ERR_UNK_KEY);
	}

	if (!nvlist_empty(nverr)) {
		return (nverr);
	}

	nvlist_free(nverr);
	return (NULL);
}

static void
viona_params_get_defaults(viona_link_params_t *vlp)
{
	vlp->vlp_tx_copy_data = viona_tx_copy_needed();
	vlp->vlp_tx_header_pad = 0;
}

static int
viona_ioc_get_params(viona_link_t *link, void *udata, int md)
{
	vioc_get_params_t vgp;
	int err = 0;

	if (ddi_copyin(udata, &vgp, sizeof (vgp), md) != 0) {
		return (EFAULT);
	}

	nvlist_t *nvl = NULL;
	if (link != NULL) {
		nvl = viona_params_to_nvlist(&link->l_params);
	} else {
		viona_link_params_t vlp = { 0 };

		viona_params_get_defaults(&vlp);
		nvl = viona_params_to_nvlist(&vlp);
	}

	VERIFY(nvl != NULL);

	size_t packed_sz;
	void *packed = fnvlist_pack(nvl, &packed_sz);
	nvlist_free(nvl);

	if (packed_sz > vgp.vgp_param_sz) {
		err = E2BIG;
	}
	/* Communicate size, even if the data will not fit */
	vgp.vgp_param_sz = packed_sz;

	if (err == 0 &&
	    ddi_copyout(packed, vgp.vgp_param, packed_sz, md) != 0) {
		err = EFAULT;
	}
	kmem_free(packed, packed_sz);

	if (ddi_copyout(&vgp, udata, sizeof (vgp), md) != 0) {
		/* Preserve a more specific earlier error over EFAULT */
		if (err == 0) {
			err = EFAULT;
		}
	}

	return (err);
}

static int
viona_ioc_set_params(viona_link_t *link, void *udata, int md)
{
	vioc_set_params_t vsp;
	int err = 0;
	nvlist_t *nverr = NULL;

	if (ddi_copyin(udata, &vsp, sizeof (vsp), md) != 0) {
		return (EFAULT);
	}

	if (vsp.vsp_param_sz > VIONA_MAX_PARAM_NVLIST_SZ) {
		err = E2BIG;
		goto done;
	} else if (vsp.vsp_param_sz == 0) {
		/*
		 * There is no reason to make this ioctl call with no actual
		 * parameters to be changed.
		 */
		err = EINVAL;
		goto done;
	}

	const size_t packed_sz = vsp.vsp_param_sz;
	void *packed = kmem_alloc(packed_sz, KM_SLEEP);
	if (ddi_copyin(vsp.vsp_param, packed, packed_sz, md) != 0) {
		kmem_free(packed, packed_sz);
		err = EFAULT;
		goto done;
	}

	nvlist_t *parsed = NULL;
	if (nvlist_unpack(packed, packed_sz, &parsed, KM_SLEEP) == 0) {
		/* Use the existing parameters as a starting point */
		viona_link_params_t new_params;
		bcopy(&link->l_params, &new_params, sizeof (new_params));

		nverr = viona_params_from_nvlist(parsed, &new_params);
		if (nverr == NULL) {
			/*
			 * Only apply the updated parameters if there
			 * were no errors during parsing.
			 */
			bcopy(&new_params, &link->l_params,
			    sizeof (new_params));
		} else {
			err = EINVAL;
		}
	} else {
		err = EINVAL;
	}
	nvlist_free(parsed);
	kmem_free(packed, packed_sz);

done:
	if (nverr != NULL) {
		size_t err_packed_sz;
		void *err_packed = fnvlist_pack(nverr, &err_packed_sz);

		if (err_packed_sz > vsp.vsp_error_sz) {
			if (err != 0) {
				err = E2BIG;
			}
		} else if (ddi_copyout(err_packed, vsp.vsp_error,
		    err_packed_sz, md) != 0 && err == 0) {
			err = EFAULT;
		}
		vsp.vsp_error_sz = err_packed_sz;

		nvlist_free(nverr);
		kmem_free(err_packed, err_packed_sz);
	} else {
		/*
		 * If there are no detailed per-field errors, it is important to
		 * communicate that absence to userspace.
		 */
		vsp.vsp_error_sz = 0;
	}

	if (ddi_copyout(&vsp, udata, sizeof (vsp), md) != 0 && err == 0) {
		err = EFAULT;
	}

	return (err);
}

static int
viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	link->l_vrings[idx].vr_intr_enabled = 0;
	return (0);
}

static int
viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
{
	uint_t cnt = 0;
	vioc_intr_poll_t vip;

	for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
		uint_t val = link->l_vrings[i].vr_intr_enabled;

		vip.vip_status[i] = val;
		if (val != 0) {
			cnt++;
		}
	}

	if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
		return (EFAULT);
	}
	*rv = (int)cnt;
	return (0);
}