xref: /illumos-gate/usr/src/uts/intel/io/viona/viona_main.c (revision 227349345306fb206b7f303ce76099e288136097)
1 /*
2  * Copyright (c) 2013  Chris Torek <torek @ torek net>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 /*
27  * This file and its contents are supplied under the terms of the
28  * Common Development and Distribution License ("CDDL"), version 1.0.
29  * You may only use this file in accordance with the terms of version
30  * 1.0 of the CDDL.
31  *
32  * A full copy of the text of the CDDL should have accompanied this
33  * source.  A copy of the CDDL is also available via the Internet at
34  * http://www.illumos.org/license/CDDL.
35  *
36  * Copyright 2015 Pluribus Networks Inc.
37  * Copyright 2019 Joyent, Inc.
38  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
39  * Copyright 2026 Oxide Computer Company
40  */
41 
42 /*
43  * viona - VirtIO-Net, Accelerated
44  *
45  * The purpose of viona is to provide high performance virtio-net devices to
46  * bhyve guests.  It does so by sitting directly atop MAC, skipping all of the
47  * DLS/DLD stack.
48  *
49  * --------------------
50  * General Architecture
51  * --------------------
52  *
53  * A single viona instance is comprised of a "link" handle and a number of
54  * "ring" pairs. After opening the viona device, it must be associated with a
55  * MAC network interface and a bhyve (vmm) instance to form its link resource.
56  * This is done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd
57  * are passed in to perform the initialization.  With the MAC client opened,
58  * and a driver handle to the vmm instance established, the device is ready to
59  * be configured by the guest.
60  *
61  * The userspace portion of bhyve, which interfaces with the PCI device
62  * emulation framework, is meant to stay out of the datapath if at all
63  * possible.  Configuration changes made via PCI are mapped to actions which
64  * will steer the operation of the in-kernel logic.
65  *
66  *
67  * -----------
68  * Ring Basics
69  * -----------
70  *
71  * Each viona link has a number of pairs of viona_vring_t entities, each pair
72  * consisting of an RX and TX ring, for handling data transfers to and from the
73  * guest respectively. They represent an interface to the standard virtio ring
74  * structures. When initialized and active, each ring is backed by a kernel
75  * worker thread (parented to the bhyve process for the instance) which handles
76  * ring events. An RX worker has the simple task of watching for ring shutdown
77  * conditions. A TX worker does that in addition to processing all requests to
78  * transmit data. Data destined for the guest is delivered directly by MAC to
79  * viona_rx() when the ring is active.
80  *
81  *
82  * -----------
83  * Ring States
84  * -----------
85  *
86  * The viona_vring_t instances follow a simple path through the possible state
 * values represented in viona_vring_t`vr_state:
88  *
89  *        +<--------------------------------------------+
90  *        |						|
91  *        V						^
92  *  +-----------+	This is the initial state when a link is created or
93  *  | VRS_RESET |	when the ring has been explicitly reset.
94  *  +-----------+
95  *        |						^
96  *        |---* ioctl(VNA_IOC_RING_INIT) issued		|
97  *        |						|
98  *        |						^
99  *        V
100  *  +-----------+	The ring parameters (size, guest physical addresses)
101  *  | VRS_SETUP |	have been set and start-up of the ring worker thread
102  *  +-----------+	has begun.
103  *        |						^
104  *        |						|
105  *        |---* ring worker thread begins execution	|
106  *        |						|
107  *        +-------------------------------------------->+
108  *        |	      |					^
109  *        |	      |
110  *        |	      *	If ring shutdown is requested (by ioctl or impending
111  *        |		bhyve process death) while the worker thread is
112  *        |		starting, the worker will transition the ring to
113  *        |		VRS_RESET and exit.
114  *        |						^
115  *        |						|
116  *        |<-------------------------------------------<+
117  *        |	      |					|
118  *        |	      |					^
119  *        |	      *	If ring is requested to pause (but not stop) from the
120  *        |             VRS_RUN state, it will return to the VRS_INIT state.
121  *        |
122  *        |						^
123  *        |						|
124  *        |						^
125  *        V
126  *  +-----------+	The worker thread associated with the ring has started
127  *  | VRS_INIT  |	executing.  It has allocated any extra resources needed
128  *  +-----------+	for the ring to operate.
129  *        |						^
130  *        |						|
131  *        +-------------------------------------------->+
132  *        |	      |					^
133  *        |	      |
134  *        |	      *	If ring shutdown is requested while the worker is
135  *        |		waiting in VRS_INIT, it will free any extra resources
136  *        |		and transition to VRS_RESET.
137  *        |						^
138  *        |						|
139  *        |--* ioctl(VNA_IOC_RING_KICK) issued		|
140  *        |						^
141  *        V
142  *  +-----------+	The worker thread associated with the ring is executing
143  *  | VRS_RUN   |	workload specific to that ring.
144  *  +-----------+
145  *        |						^
146  *        |---* ioctl(VNA_IOC_RING_RESET) issued	|
147  *        |	(or bhyve process begins exit)		^
148  *        |
149  *  +-----------+	The worker thread associated with the ring is in the
150  *  | VRS_STOP  |	process of exiting. All outstanding TX and RX
151  *  +-----------+	requests are allowed to complete, but new requests
152  *        |		must be ignored.
153  *        |						^
154  *        |						|
155  *        +-------------------------------------------->+
156  *
157  *
158  * While the worker thread is not running, changes to vr_state are only made by
159  * viona_ioc_ring_init() under vr_lock.  There, it initializes the ring, starts
160  * the worker, and sets the ring state to VRS_SETUP.  Once the worker thread
161  * has been started, only it may perform ring state transitions (still under
162  * the protection of vr_lock), when requested by outside consumers via
163  * vr_state_flags or when the containing bhyve process initiates an exit.
164  *
165  * Additionally, since all ioctls that affect a ring are mutually exclusive
166  * via a hold on the soft state lock, a ring cannot unexpectedly change state
167  * while this lock is held. This is relied on by the VNA_IOC_SET_PAIRS ioctl to
168  * guarantee that the ring is idle, and remains so, while the number of queue
169  * pairs is being changed.
170  *
171  *
172  * ----------------------------
173  * Multiple Rings (multi-queue)
174  * ----------------------------
175  *
176  * A link starts its life with a single pair of rings (one RX and one TX ring).
177  * The number of pairs can be varied via a call to ioctl(VNA_IOC_SET_PAIRS)
178  * providing all of the existing rings are in the VRS_RESET state. Therefore a
179  * userland consumer may only change the ring count between link creation and
180  * initialising any rings, or after issuing ioctl(VNA_IOC_RING_RESET) on
181  * all rings. The number of active rings cannot be reduced below the number of
182  * rings currently used, see below. The maximum number of rings permitted by
183  * viona (0x100) is lower than that permitted for a network device by the
184  * VirtIO specification (0x8000).
185  *
186  * Separately the number of RX rings that should be used for transmission of
187  * data to the guest can be varied at any time via ioctl(VNA_IOC_SET_USEPAIRS).
188  * The number of pairs to use can never exceed the total number of allocated
189  * pairs.
190  *
191  *
192  * ----------------------------
193  * Transmission mblk_t Handling
194  * ----------------------------
195  *
196  * For incoming frames destined for a bhyve guest, the data must first land in
197  * a host OS buffer from the physical NIC before it is copied into the awaiting
198  * guest buffer(s).  Outbound frames transmitted by the guest are not bound by
199  * this limitation and can avoid extra copying before the buffers are accessed
200  * directly by the NIC.  When a guest designates buffers to be transmitted,
201  * viona translates the guest-physical addresses contained in the ring
202  * descriptors to host-virtual addresses via viona_hold_page().  That pointer is
203  * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc().
204  * Doing so increments vr_xfer_outstanding, preventing the ring from being
205  * reset (allowing the link to drop its vmm handle to the guest) until all
206  * transmit mblks referencing guest memory have been processed.  Allocation of
207  * the viona_desb_t entries is done during the VRS_INIT stage of the ring
208  * worker thread.  The ring size informs that allocation as the number of
209  * concurrent transmissions is limited by the number of descriptors in the
210  * ring.  This minimizes allocation in the transmit hot-path by acquiring those
211  * fixed-size resources during initialization.
212  *
213  * This optimization depends on the underlying NIC driver freeing the mblks in
214  * a timely manner after they have been transmitted by the hardware.  Some
215  * drivers have been found to flush TX descriptors only when new transmissions
216  * are initiated.  This means that there is no upper bound to the time needed
217  * for an mblk to be flushed and can stall bhyve guests from shutting down
218  * since their memory must be free of viona TX references prior to clean-up.
219  *
220  * This expectation of deterministic mblk_t processing is likely the reason
221  * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
222  * loaded will copy transmit data into fresh buffers rather than passing up
223  * zero-copy mblks.  It is a hold-over from the original viona sources provided
224  * by Pluribus and its continued necessity has not been confirmed.
225  *
226  *
227  * ----------------------------
228  * Ring Notification Fast-paths
229  * ----------------------------
230  *
231  * Device operation for viona requires that notifications flow to and from the
232  * guest to indicate certain ring conditions.  In order to minimize latency and
233  * processing overhead, the notification procedures are kept in-kernel whenever
234  * possible.
235  *
236  * Guest-to-host notifications, when new available descriptors have been placed
237  * in the ring, are posted for legacy devices via the 'queue notify' address in
238  * the virtio BAR. For modern devices the notifications are posted to the MMIO
 * BAR that is indicated by the notify PCI capability. The
240  * vmm_drv_ioport_hook() and vmm_drv_mmio_hook() interfaces were added to bhyve
241  * which allows viona to install a callback hook on an ioport, or on an MMIO
242  * address range. Guest exits for accesses to viona-hooked addresses will
243  * result in direct calls to notify the appropriate ring worker without a trip
244  * to userland.
245  *
246  * Host-to-guest notifications in the form of interrupts enjoy similar
247  * acceleration.  Each viona ring can be configured to send MSI notifications
248  * to the guest as virtio conditions dictate.  This in-kernel interrupt
249  * configuration is kept synchronized through viona ioctls which are utilized
250  * during writes to the associated PCI config registers or MSI-X BAR.
251  *
252  * Guests which do not utilize MSI-X will result in viona falling back to the
253  * slow path for interrupts.  It will poll(2) the viona handle, receiving
254  * notification when ring events necessitate the assertion of an interrupt.
255  *
256  *
257  * ---------------
258  * Nethook Support
259  * ---------------
260  *
261  * Viona provides four nethook events that consumers (e.g. ipf) can hook into
262  * to intercept packets as they go up or down the stack.  Unfortunately,
263  * the nethook framework does not understand raw packets, so we can only
264  * generate events (in, out) for IPv4 and IPv6 packets.  At driver attach,
265  * we register callbacks with the neti (netinfo) module that will be invoked
266  * for each netstack already present, as well as for any additional netstack
267  * instances created as the system operates.  These callbacks will
268  * register/unregister the hooks with the nethook framework for each
269  * netstack instance.  This registration occurs prior to creating any
270  * viona instances for a given netstack, and the unregistration for a netstack
271  * instance occurs after all viona instances of the netstack instance have
272  * been deleted.
273  *
274  * ------------------
275  * Metrics/Statistics
 * ------------------
277  *
278  * During operation, Viona tracks certain metrics as certain events occur.
279  *
 * One class of metrics, known as the "error stats", refers to abnormal
281  * conditions in ring processing which are likely the fault of a misbehaving
282  * guest.  These are tracked on a per-ring basis, and are not formally exposed
283  * to any consumer besides direct memory access through mdb.
284  *
285  * The other class of metrics tracked for an instance are the "transfer stats",
286  * which are the traditional packets/bytes/errors/drops figures.  These are
287  * counted per-ring, and then aggregated into link-wide values exposed via
288  * kstats.  Atomic operations are used to increment those per-ring stats during
289  * operation, and then when a ring is stopped, the values are consolidated into
290  * the link-wide values (to prevent loss when the ring is zeroed) under the
291  * protection of viona_link`l_stats_lock.  When the kstats are being updated,
292  * l_stats_lock is held to protect against a racing consolidation, with the
293  * existing per-ring values being added in at update time to provide an accurate
294  * figure.
295  */
296 
297 #include <sys/conf.h>
298 #include <sys/file.h>
299 #include <sys/stat.h>
300 
301 #include <sys/dlpi.h>
302 #include <sys/vlan.h>
303 
304 #include "viona_impl.h"
305 
306 
/* Description string handed to the module framework via modldrv. */
#define	VIONA_NAME		"Virtio Network Accelerator"
/*
 * Minor number of the control node created at attach.  Per-open instance
 * minors are allocated from an ID space which starts just above it.
 */
#define	VIONA_CTL_MINOR		0
/* Module, class and name used when creating the per-instance kstat. */
#define	VIONA_MODULE_NAME	"viona"
#define	VIONA_KSTAT_CLASS	"misc"
#define	VIONA_KSTAT_NAME	"viona_stat"
312 
313 
/*
 * Host capabilities.
 *
 * Feature bits offered regardless of the underlying MAC.  They are OR-ed
 * with the per-link l_features_hw bits both when reporting features
 * (VNA_IOC_GET_FEATURES) and when masking the set requested by the consumer
 * (VNA_IOC_SET_FEATURES).
 */
#define	VIONA_S_HOSTCAPS	(	\
	VIRTIO_NET_F_GUEST_CSUM |	\
	VIRTIO_NET_F_GUEST_TSO4 |	\
	VIRTIO_NET_F_GUEST_TSO6 |	\
	VIRTIO_NET_F_MRG_RXBUF |	\
	VIRTIO_F_RING_NOTIFY_ON_EMPTY |	\
	VIRTIO_F_RING_INDIRECT_DESC)
324 
/*
 * MAC_CAPAB_HCKSUM specifics of interest.
 *
 * If the MAC reports any of these flags, viona_get_mac_capab() marks the link
 * as capable of VIRTIO_NET_F_CSUM.
 */
#define	VIONA_CAP_HCKSUM_INTEREST	\
	(HCKSUM_INET_PARTIAL |		\
	HCKSUM_INET_FULL_V4 |		\
	HCKSUM_INET_FULL_V6)
330 
/* Anchor for the ddi_soft_state_*() routines; one entry per open minor. */
static void		*viona_state;
/* dev_info of the attached instance; set in attach, cleared in detach. */
static dev_info_t	*viona_dip;
/* ID space from which viona_open() allocates per-open minor numbers. */
static id_space_t	*viona_minors;
334 
335 
/* Entry points referenced from viona_ops and viona_cb_ops below. */
static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
    void **result);
static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
    cred_t *credp, int *rval);
static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);

/*
 * Link creation and deletion.  viona_ioctl() dispatches these before it looks
 * up the link; viona_close() also calls viona_ioc_delete() directly.
 */
static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
static int viona_ioc_delete(viona_soft_state_t *, boolean_t);

/*
 * Per-link ioctl handlers.  viona_ioctl() calls these with the soft state
 * lock held.  The one exception is viona_ioc_get_params(), which is also
 * called with a NULL link (and without that lock) to report defaults.
 */
static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t);
static int viona_ioc_set_notify_mmio(viona_link_t *, void *, int);
static int viona_ioc_set_promisc(viona_link_t *, viona_promisc_t);
static int viona_ioc_get_params(viona_link_t *, void *, int);
static int viona_ioc_set_params(viona_link_t *, void *, int);
static int viona_ioc_link_setpairs(viona_link_t *, uint16_t);
static int viona_ioc_link_usepairs(viona_link_t *, uint16_t);
static int viona_ioc_ring_init(viona_link_t *, void *, int);
static int viona_ioc_ring_init_modern(viona_link_t *, void *, int);
static int viona_ioc_ring_set_state(viona_link_t *, void *, int);
static int viona_ioc_ring_get_state(viona_link_t *, void *, int);
static int viona_ioc_ring_reset(viona_link_t *, uint_t);
static int viona_ioc_ring_kick(viona_link_t *, uint_t);
static int viona_ioc_ring_pause(viona_link_t *, uint_t);
static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);
static int viona_ioc_intr_poll_mq(viona_link_t *, void *, int, int *);

/*
 * NOTE(review): presumably fills in the default link parameters; the
 * definition is not in this part of the file.
 */
static void viona_params_get_defaults(viona_link_params_t *);
370 
/*
 * Character device entry points.  The initializer is positional; the slot
 * names noted below follow cb_ops(9S) and should be checked against the
 * header if the structure layout is ever in doubt.  Only open, close, ioctl
 * and chpoll are implemented by viona.
 */
static struct cb_ops viona_cb_ops = {
	viona_open,			/* cb_open */
	viona_close,			/* cb_close */
	nodev,				/* cb_strategy */
	nodev,				/* cb_print */
	nodev,				/* cb_dump */
	nodev,				/* cb_read */
	nodev,				/* cb_write */
	viona_ioctl,			/* cb_ioctl */
	nodev,				/* cb_devmap */
	nodev,				/* cb_mmap */
	nodev,				/* cb_segmap */
	viona_chpoll,			/* cb_chpoll */
	ddi_prop_op,			/* cb_prop_op */
	0,				/* cb_str */
	D_MP | D_NEW | D_HOTPLUG,	/* cb_flag */
	CB_REV,				/* cb_rev */
	nodev,				/* cb_aread */
	nodev				/* cb_awrite */
};
391 
/*
 * Device operations.  The initializer is positional; the slot names noted
 * below follow dev_ops(9S) and should be checked against the header if the
 * structure layout is ever in doubt.
 */
static struct dev_ops viona_ops = {
	DEVO_REV,			/* devo_rev */
	0,				/* devo_refcnt */
	viona_info,			/* devo_getinfo */
	nulldev,			/* devo_identify */
	nulldev,			/* devo_probe */
	viona_attach,			/* devo_attach */
	viona_detach,			/* devo_detach */
	nodev,				/* devo_reset */
	&viona_cb_ops,			/* devo_cb_ops */
	NULL,				/* devo_bus_ops */
	ddi_power,			/* devo_power */
	ddi_quiesce_not_needed		/* devo_quiesce */
};
406 
/* Driver module linkage: ties the description string to viona_ops. */
static struct modldrv modldrv = {
	&mod_driverops,
	VIONA_NAME,
	&viona_ops,
};
412 
/* Module linkage consumed by mod_install(), mod_remove() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};
416 
417 int
_init(void)418 _init(void)
419 {
420 	int ret;
421 
422 	ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0);
423 	if (ret != 0) {
424 		return (ret);
425 	}
426 
427 	viona_minors = id_space_create("viona_minors",
428 	    VIONA_CTL_MINOR + 1, UINT16_MAX);
429 	viona_rx_init();
430 	mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);
431 
432 	ret = mod_install(&modlinkage);
433 	if (ret != 0) {
434 		ddi_soft_state_fini(&viona_state);
435 		id_space_destroy(viona_minors);
436 		viona_rx_fini();
437 		mutex_destroy(&viona_force_copy_lock);
438 	}
439 
440 	return (ret);
441 }
442 
443 int
_fini(void)444 _fini(void)
445 {
446 	int ret;
447 
448 	ret = mod_remove(&modlinkage);
449 	if (ret != 0) {
450 		return (ret);
451 	}
452 
453 	ddi_soft_state_fini(&viona_state);
454 	id_space_destroy(viona_minors);
455 	viona_rx_fini();
456 	mutex_destroy(&viona_force_copy_lock);
457 
458 	return (ret);
459 }
460 
461 int
_info(struct modinfo * modinfop)462 _info(struct modinfo *modinfop)
463 {
464 	return (mod_info(&modlinkage, modinfop));
465 }
466 
467 /* ARGSUSED */
468 static int
viona_info(dev_info_t * dip,ddi_info_cmd_t cmd,void * arg,void ** result)469 viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
470 {
471 	int error;
472 
473 	switch (cmd) {
474 	case DDI_INFO_DEVT2DEVINFO:
475 		*result = (void *)viona_dip;
476 		error = DDI_SUCCESS;
477 		break;
478 	case DDI_INFO_DEVT2INSTANCE:
479 		*result = (void *)0;
480 		error = DDI_SUCCESS;
481 		break;
482 	default:
483 		error = DDI_FAILURE;
484 		break;
485 	}
486 	return (error);
487 }
488 
489 static int
viona_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)490 viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
491 {
492 	if (cmd != DDI_ATTACH) {
493 		return (DDI_FAILURE);
494 	}
495 
496 	if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
497 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
498 		return (DDI_FAILURE);
499 	}
500 
501 	viona_neti_attach();
502 
503 	viona_dip = dip;
504 	ddi_report_dev(viona_dip);
505 
506 	return (DDI_SUCCESS);
507 }
508 
509 static int
viona_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)510 viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
511 {
512 	dev_info_t *old_dip = viona_dip;
513 
514 	if (cmd != DDI_DETACH) {
515 		return (DDI_FAILURE);
516 	}
517 
518 	VERIFY(old_dip != NULL);
519 
520 	viona_neti_detach();
521 	viona_dip = NULL;
522 	ddi_remove_minor_node(old_dip, NULL);
523 
524 	return (DDI_SUCCESS);
525 }
526 
527 static int
viona_open(dev_t * devp,int flag,int otype,cred_t * credp)528 viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
529 {
530 	int	minor;
531 	viona_soft_state_t *ss;
532 
533 	if (otype != OTYP_CHR) {
534 		return (EINVAL);
535 	}
536 #if 0
537 	/*
538 	 * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
539 	 * Should the check be at open() or ioctl()?
540 	 */
541 	if (drv_priv(credp) != 0) {
542 		return (EPERM);
543 	}
544 #endif
545 	if (getminor(*devp) != VIONA_CTL_MINOR) {
546 		return (ENXIO);
547 	}
548 
549 	minor = id_alloc_nosleep(viona_minors);
550 	if (minor == -1) {
551 		/* All minors are busy */
552 		return (EBUSY);
553 	}
554 	if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
555 		id_free(viona_minors, minor);
556 		return (ENOMEM);
557 	}
558 
559 	ss = ddi_get_soft_state(viona_state, minor);
560 	mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
561 	ss->ss_minor = minor;
562 	*devp = makedevice(getmajor(*devp), minor);
563 
564 	return (0);
565 }
566 
567 static int
viona_close(dev_t dev,int flag,int otype,cred_t * credp)568 viona_close(dev_t dev, int flag, int otype, cred_t *credp)
569 {
570 	int			minor;
571 	viona_soft_state_t	*ss;
572 
573 	if (otype != OTYP_CHR) {
574 		return (EINVAL);
575 	}
576 
577 	minor = getminor(dev);
578 
579 	ss = ddi_get_soft_state(viona_state, minor);
580 	if (ss == NULL) {
581 		return (ENXIO);
582 	}
583 
584 	VERIFY0(viona_ioc_delete(ss, B_TRUE));
585 	VERIFY(!list_link_active(&ss->ss_node));
586 	ddi_soft_state_free(viona_state, minor);
587 	id_free(viona_minors, minor);
588 
589 	return (0);
590 }
591 
592 static int
viona_ioctl(dev_t dev,int cmd,intptr_t data,int md,cred_t * cr,int * rv)593 viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
594 {
595 	viona_soft_state_t *ss;
596 	void *dptr = (void *)data;
597 	int err = 0;
598 	uint64_t val;
599 	viona_link_t *link;
600 
601 	ss = ddi_get_soft_state(viona_state, getminor(dev));
602 	if (ss == NULL) {
603 		return (ENXIO);
604 	}
605 
606 	switch (cmd) {
607 	case VNA_IOC_CREATE:
608 		return (viona_ioc_create(ss, dptr, md, cr));
609 	case VNA_IOC_DELETE:
610 		return (viona_ioc_delete(ss, B_FALSE));
611 	case VNA_IOC_VERSION:
612 		*rv = VIONA_CURRENT_INTERFACE_VERSION;
613 		return (0);
614 	case VNA_IOC_DEFAULT_PARAMS:
615 		/*
616 		 * With a NULL link parameter, viona_ioc_get_params() will emit
617 		 * the default parameters with the same error-handling behavior
618 		 * as VNA_IOC_GET_PARAMS.
619 		 */
620 		return (viona_ioc_get_params(NULL, dptr, md));
621 	default:
622 		break;
623 	}
624 
625 	mutex_enter(&ss->ss_lock);
626 	if ((link = ss->ss_link) == NULL || link->l_destroyed ||
627 	    vmm_drv_release_reqd(link->l_vm_hold)) {
628 		mutex_exit(&ss->ss_lock);
629 		return (ENXIO);
630 	}
631 
632 	switch (cmd) {
633 	case VNA_IOC_GET_FEATURES:
634 		val = VIONA_S_HOSTCAPS | link->l_features_hw;
635 		if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
636 			err = EFAULT;
637 		}
638 		break;
639 	case VNA_IOC_SET_FEATURES:
640 		if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
641 			err = EFAULT;
642 			break;
643 		}
644 		link->l_modern = ((val & VIRTIO_F_VERSION_1) != 0);
645 		val &= (VIONA_S_HOSTCAPS | link->l_features_hw);
646 
647 		if ((val & VIRTIO_NET_F_CSUM) == 0) {
648 			val &= ~(VIRTIO_NET_F_HOST_TSO4 |
649 			    VIRTIO_NET_F_HOST_TSO6);
650 		}
651 
652 		if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0) {
653 			val &= ~(VIRTIO_NET_F_GUEST_TSO4 |
654 			    VIRTIO_NET_F_GUEST_TSO6);
655 		}
656 
657 		link->l_features = val;
658 		break;
659 	case VNA_IOC_GET_PAIRS:
660 		*rv = (int)link->l_npairs;
661 		break;
662 	case VNA_IOC_SET_PAIRS:
663 		if (data > UINT16_MAX)
664 			err = EINVAL;
665 		else
666 			err = viona_ioc_link_setpairs(link, (uint16_t)data);
667 		break;
668 	case VNA_IOC_GET_USEPAIRS:
669 		*rv = (int)link->l_usepairs;
670 		break;
671 	case VNA_IOC_SET_USEPAIRS:
672 		if (data > UINT16_MAX)
673 			err = EINVAL;
674 		else
675 			err = viona_ioc_link_usepairs(link, (uint16_t)data);
676 		break;
677 	case VNA_IOC_RING_INIT:
678 		err = viona_ioc_ring_init(link, dptr, md);
679 		break;
680 	case VNA_IOC_RING_INIT_MODERN:
681 		err = viona_ioc_ring_init_modern(link, dptr, md);
682 		break;
683 	case VNA_IOC_RING_RESET:
684 		err = viona_ioc_ring_reset(link, (uint_t)data);
685 		break;
686 	case VNA_IOC_RING_KICK:
687 		err = viona_ioc_ring_kick(link, (uint_t)data);
688 		break;
689 	case VNA_IOC_RING_SET_MSI:
690 		err = viona_ioc_ring_set_msi(link, dptr, md);
691 		break;
692 	case VNA_IOC_RING_INTR_CLR:
693 		err = viona_ioc_ring_intr_clear(link, (uint_t)data);
694 		break;
695 	case VNA_IOC_RING_SET_STATE:
696 		err = viona_ioc_ring_set_state(link, dptr, md);
697 		break;
698 	case VNA_IOC_RING_GET_STATE:
699 		err = viona_ioc_ring_get_state(link, dptr, md);
700 		break;
701 	case VNA_IOC_RING_PAUSE:
702 		err = viona_ioc_ring_pause(link, (uint_t)data);
703 		break;
704 
705 	case VNA_IOC_INTR_POLL:
706 		err = viona_ioc_intr_poll(link, dptr, md, rv);
707 		break;
708 	case VNA_IOC_INTR_POLL_MQ:
709 		err = viona_ioc_intr_poll_mq(link, dptr, md, rv);
710 		break;
711 	case VNA_IOC_SET_NOTIFY_IOP:
712 		if (data < 0 || data > UINT16_MAX) {
713 			err = EINVAL;
714 			break;
715 		}
716 		err = viona_ioc_set_notify_ioport(link, (uint16_t)data);
717 		break;
718 	case VNA_IOC_SET_NOTIFY_MMIO:
719 		err = viona_ioc_set_notify_mmio(link, dptr, md);
720 		break;
721 	case VNA_IOC_SET_PROMISC:
722 		err = viona_ioc_set_promisc(link, (viona_promisc_t)data);
723 		break;
724 	case VNA_IOC_GET_PARAMS:
725 		err = viona_ioc_get_params(link, dptr, md);
726 		break;
727 	case VNA_IOC_SET_PARAMS:
728 		err = viona_ioc_set_params(link, dptr, md);
729 		break;
730 	case VNA_IOC_GET_MTU:
731 		*rv = (int)link->l_mtu;
732 		break;
733 	case VNA_IOC_SET_MTU:
734 		if (data < VIONA_MIN_MTU || data > VIONA_MAX_MTU)
735 			err = EINVAL;
736 		else
737 			link->l_mtu = (uint16_t)data;
738 		break;
739 	default:
740 		err = ENOTTY;
741 		break;
742 	}
743 
744 	mutex_exit(&ss->ss_lock);
745 	return (err);
746 }
747 
748 static int
viona_chpoll(dev_t dev,short events,int anyyet,short * reventsp,struct pollhead ** phpp)749 viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
750     struct pollhead **phpp)
751 {
752 	viona_soft_state_t *ss;
753 	viona_link_t *link;
754 
755 	ss = ddi_get_soft_state(viona_state, getminor(dev));
756 	if (ss == NULL) {
757 		return (ENXIO);
758 	}
759 
760 	mutex_enter(&ss->ss_lock);
761 	if ((link = ss->ss_link) == NULL || link->l_destroyed) {
762 		mutex_exit(&ss->ss_lock);
763 		return (ENXIO);
764 	}
765 
766 	*reventsp = 0;
767 	if ((events & POLLRDBAND) != 0) {
768 		for (uint16_t i = 0; i < VIONA_USABLE_RINGS(link); i++) {
769 			if (link->l_vrings[i].vr_intr_enabled != 0) {
770 				*reventsp |= POLLRDBAND;
771 				break;
772 			}
773 		}
774 	}
775 	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
776 		*phpp = &link->l_pollhead;
777 	}
778 	mutex_exit(&ss->ss_lock);
779 
780 	return (0);
781 }
782 
783 static void
viona_get_mac_capab(viona_link_t * link)784 viona_get_mac_capab(viona_link_t *link)
785 {
786 	mac_handle_t mh = link->l_mh;
787 	uint32_t cap = 0;
788 	mac_capab_lso_t lso_cap;
789 
790 	link->l_features_hw = 0;
791 	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
792 		/*
793 		 * Only report HW checksum ability if the underlying MAC
794 		 * resource is capable of populating the L4 header.
795 		 */
796 		if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
797 			link->l_features_hw |= VIRTIO_NET_F_CSUM;
798 		}
799 		link->l_cap_csum = cap;
800 	}
801 
802 	if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
803 	    mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
804 		/*
805 		 * Virtio doesn't allow for negotiating a maximum LSO
806 		 * packet size. We have to assume that the guest may
807 		 * send a maximum length IP packet. Make sure the
808 		 * underlying MAC can handle an LSO of this size.
809 		 */
810 		if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
811 		    lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET) {
812 			link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
813 		}
814 
815 		if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV6) &&
816 		    lso_cap.lso_basic_tcp_ipv6.lso_max >= IP_MAXPACKET) {
817 			link->l_features_hw |= VIRTIO_NET_F_HOST_TSO6;
818 		}
819 	}
820 }
821 
822 static int
viona_kstat_update(kstat_t * ksp,int rw)823 viona_kstat_update(kstat_t *ksp, int rw)
824 {
825 	viona_link_t *link = ksp->ks_private;
826 	viona_kstats_t *vk = ksp->ks_data;
827 
828 	/*
829 	 * Avoid the potential for mangled values due to a racing consolidation
830 	 * of stats for a ring by performing the kstat update with l_stats_lock
831 	 * held while adding up the central (link) and ring values.
832 	 */
833 	mutex_enter(&link->l_stats_lock);
834 
835 	for (uint16_t i = 0; i < VIONA_USABLE_RINGS(link); i++) {
836 		const viona_vring_t *ring = &link->l_vrings[i];
837 		const viona_transfer_stats_t *ring_stats = &ring->vr_stats;
838 		const viona_transfer_stats_t *link_stats;
839 
840 		if (VIONA_RING_ISRX(ring)) {
841 			link_stats = &link->l_stats.vls_rx;
842 
843 			vk->vk_rx_packets.value.ui64 =
844 			    link_stats->vts_packets + ring_stats->vts_packets;
845 			vk->vk_rx_bytes.value.ui64 =
846 			    link_stats->vts_bytes + ring_stats->vts_bytes;
847 			vk->vk_rx_errors.value.ui64 =
848 			    link_stats->vts_errors + ring_stats->vts_errors;
849 			vk->vk_rx_drops.value.ui64 =
850 			    link_stats->vts_drops + ring_stats->vts_drops;
851 		} else if (VIONA_RING_ISTX(ring)) {
852 			link_stats = &link->l_stats.vls_tx;
853 
854 			vk->vk_tx_packets.value.ui64 =
855 			    link_stats->vts_packets + ring_stats->vts_packets;
856 			vk->vk_tx_bytes.value.ui64 =
857 			    link_stats->vts_bytes + ring_stats->vts_bytes;
858 			vk->vk_tx_errors.value.ui64 =
859 			    link_stats->vts_errors + ring_stats->vts_errors;
860 			vk->vk_tx_drops.value.ui64 =
861 			    link_stats->vts_drops + ring_stats->vts_drops;
862 		}
863 	}
864 
865 	mutex_exit(&link->l_stats_lock);
866 
867 	return (0);
868 }
869 
870 static int
viona_kstat_init(viona_soft_state_t * ss,const cred_t * cr)871 viona_kstat_init(viona_soft_state_t *ss, const cred_t *cr)
872 {
873 	zoneid_t zid = crgetzoneid(cr);
874 	kstat_t *ksp;
875 
876 	ASSERT(MUTEX_HELD(&ss->ss_lock));
877 	ASSERT3P(ss->ss_kstat, ==, NULL);
878 
879 	ksp = kstat_create_zone(VIONA_MODULE_NAME, ss->ss_minor,
880 	    VIONA_KSTAT_NAME, VIONA_KSTAT_CLASS, KSTAT_TYPE_NAMED,
881 	    sizeof (viona_kstats_t) / sizeof (kstat_named_t), 0, zid);
882 
883 	if (ksp == NULL) {
884 		/*
885 		 * Without detail from kstat_create_zone(), assume that resource
886 		 * exhaustion is to blame for the failure.
887 		 */
888 		return (ENOMEM);
889 	}
890 	ss->ss_kstat = ksp;
891 
892 	/*
893 	 * If this instance is associated with a non-global zone, make its
894 	 * kstats visible from the GZ.
895 	 */
896 	if (zid != GLOBAL_ZONEID) {
897 		kstat_zone_add(ss->ss_kstat, GLOBAL_ZONEID);
898 	}
899 
900 	viona_kstats_t *vk = ksp->ks_data;
901 
902 	kstat_named_init(&vk->vk_rx_packets, "rx_packets", KSTAT_DATA_UINT64);
903 	kstat_named_init(&vk->vk_rx_bytes, "rx_bytes", KSTAT_DATA_UINT64);
904 	kstat_named_init(&vk->vk_rx_errors, "rx_errors", KSTAT_DATA_UINT64);
905 	kstat_named_init(&vk->vk_rx_drops, "rx_drops", KSTAT_DATA_UINT64);
906 	kstat_named_init(&vk->vk_tx_packets, "tx_packets", KSTAT_DATA_UINT64);
907 	kstat_named_init(&vk->vk_tx_bytes, "tx_bytes", KSTAT_DATA_UINT64);
908 	kstat_named_init(&vk->vk_tx_errors, "tx_errors", KSTAT_DATA_UINT64);
909 	kstat_named_init(&vk->vk_tx_drops, "tx_drops", KSTAT_DATA_UINT64);
910 	ksp->ks_private = ss->ss_link;
911 	ksp->ks_update = viona_kstat_update;
912 
913 	kstat_install(ss->ss_kstat);
914 	return (0);
915 }
916 
917 static void
viona_kstat_fini(viona_soft_state_t * ss)918 viona_kstat_fini(viona_soft_state_t *ss)
919 {
920 	ASSERT(MUTEX_HELD(&ss->ss_lock));
921 
922 	if (ss->ss_kstat != NULL) {
923 		kstat_delete(ss->ss_kstat);
924 		ss->ss_kstat = NULL;
925 	}
926 }
927 
928 static void
viona_link_qfree(viona_link_t * link)929 viona_link_qfree(viona_link_t *link)
930 {
931 	if (link->l_vrings == NULL)
932 		return;
933 
934 	for (uint16_t i = 0; i < VIONA_NRINGS(link); i++) {
935 		ASSERT3U(link->l_vrings[i].vr_state, ==, VRS_RESET);
936 		viona_ring_free(&link->l_vrings[i]);
937 	}
938 	kmem_free(link->l_vrings, sizeof (viona_vring_t) * VIONA_NRINGS(link));
939 	link->l_vrings = NULL;
940 	link->l_npairs = link->l_usepairs = 0;
941 }
942 
943 static int
viona_link_qalloc(viona_link_t * link,uint16_t pairs)944 viona_link_qalloc(viona_link_t *link, uint16_t pairs)
945 {
946 	const uint16_t usepairs = link->l_usepairs;
947 
948 	ASSERT(MUTEX_HELD(&link->l_ss->ss_lock));
949 
950 	if (pairs < VIONA_MIN_QPAIR || pairs > VIONA_MAX_QPAIR ||
951 	    pairs < usepairs) {
952 		return (EINVAL);
953 	}
954 
955 	for (uint16_t i = 0; i < VIONA_NRINGS(link); i++) {
956 		if (link->l_vrings[i].vr_state != VRS_RESET)
957 			return (EBUSY);
958 	}
959 
960 	/*
961 	 * This is safe as we are holding the ss_lock, have checked that all
962 	 * of the rings are in the VRS_RESET state and know that the mac RX
963 	 * callback is not set at this point.
964 	 */
965 	viona_link_qfree(link);
966 
967 	link->l_npairs = pairs;
968 	link->l_usepairs = usepairs;
969 	link->l_vrings = kmem_zalloc(
970 	    sizeof (viona_vring_t) * VIONA_NRINGS(link), KM_SLEEP);
971 
972 	for (uint16_t i = 0; i < VIONA_NRINGS(link); i++)
973 		viona_ring_alloc(link, &link->l_vrings[i]);
974 
975 	return (0);
976 }
977 
978 static int
viona_ioc_create(viona_soft_state_t * ss,void * dptr,int md,cred_t * cr)979 viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
980 {
981 	vioc_create_t	kvc;
982 	viona_link_t	*link = NULL;
983 	char		cli_name[MAXNAMELEN];
984 	int		err = 0;
985 	file_t		*fp;
986 	vmm_hold_t	*hold = NULL;
987 	viona_neti_t	*nip = NULL;
988 	zoneid_t	zid;
989 	mac_diag_t	mac_diag = MAC_DIAG_NONE;
990 
991 	ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));
992 
993 	if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
994 		return (EFAULT);
995 	}
996 
997 	zid = crgetzoneid(cr);
998 	nip = viona_neti_lookup_by_zid(zid);
999 	if (nip == NULL) {
1000 		return (EIO);
1001 	}
1002 
1003 	if (!nip->vni_nethook.vnh_hooked) {
1004 		viona_neti_rele(nip);
1005 		return (EIO);
1006 	}
1007 
1008 	mutex_enter(&ss->ss_lock);
1009 	if (ss->ss_link != NULL) {
1010 		mutex_exit(&ss->ss_lock);
1011 		viona_neti_rele(nip);
1012 		return (EEXIST);
1013 	}
1014 
1015 	if ((fp = getf(kvc.c_vmfd)) == NULL) {
1016 		err = EBADF;
1017 		goto bail;
1018 	}
1019 	err = vmm_drv_hold(fp, cr, &hold);
1020 	releasef(kvc.c_vmfd);
1021 	if (err != 0) {
1022 		goto bail;
1023 	}
1024 
1025 	link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
1026 	link->l_ss = ss;
1027 	link->l_linkid = kvc.c_linkid;
1028 	link->l_vm_hold = hold;
1029 	link->l_mtu = VIONA_DEFAULT_MTU;
1030 	link->l_notify_mmaddr = NOTIFY_MMADDR_UNSET;
1031 
1032 	err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
1033 	if (err != 0) {
1034 		goto bail;
1035 	}
1036 
1037 	viona_get_mac_capab(link);
1038 	viona_params_get_defaults(&link->l_params);
1039 
1040 	(void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_MODULE_NAME,
1041 	    link->l_linkid);
1042 	err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
1043 	if (err != 0) {
1044 		goto bail;
1045 	}
1046 
1047 	err = mac_unicast_add(link->l_mch, NULL, MAC_UNICAST_PRIMARY,
1048 	    &link->l_muh, VLAN_ID_NONE, &mac_diag);
1049 	if (err != 0) {
1050 		goto bail;
1051 	}
1052 
1053 	if (viona_link_qalloc(link, 1) != 0)
1054 		goto bail;
1055 	link->l_usepairs = 1;
1056 
1057 	/*
1058 	 * Default to passing up all multicast traffic in addition to
1059 	 * classified unicast. Guests which have support will change this
1060 	 * if they need to via the virtio net control queue; guests without
1061 	 * support generally still want to see multicast.
1062 	 */
1063 	link->l_promisc = VIONA_PROMISC_MULTI;
1064 	if ((err = viona_rx_set(link, link->l_promisc)) != 0) {
1065 		goto bail;
1066 	}
1067 
1068 	link->l_neti = nip;
1069 	ss->ss_link = link;
1070 
1071 	if ((err = viona_kstat_init(ss, cr)) != 0) {
1072 		goto bail;
1073 	}
1074 
1075 	mutex_exit(&ss->ss_lock);
1076 
1077 	mutex_enter(&nip->vni_lock);
1078 	list_insert_tail(&nip->vni_dev_list, ss);
1079 	mutex_exit(&nip->vni_lock);
1080 
1081 	return (0);
1082 
1083 bail:
1084 	if (link != NULL) {
1085 		viona_rx_clear(link);
1086 		if (link->l_mch != NULL) {
1087 			if (link->l_muh != NULL) {
1088 				VERIFY0(mac_unicast_remove(link->l_mch,
1089 				    link->l_muh));
1090 				link->l_muh = NULL;
1091 			}
1092 			mac_client_close(link->l_mch, 0);
1093 		}
1094 		if (link->l_mh != NULL) {
1095 			mac_close(link->l_mh);
1096 		}
1097 		viona_link_qfree(link);
1098 		kmem_free(link, sizeof (viona_link_t));
1099 		ss->ss_link = NULL;
1100 	}
1101 	if (hold != NULL) {
1102 		vmm_drv_rele(hold);
1103 	}
1104 	viona_neti_rele(nip);
1105 
1106 	mutex_exit(&ss->ss_lock);
1107 	return (err);
1108 }
1109 
1110 static int
viona_ioc_delete(viona_soft_state_t * ss,boolean_t on_close)1111 viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
1112 {
1113 	viona_link_t *link;
1114 	viona_neti_t *nip = NULL;
1115 
1116 	mutex_enter(&ss->ss_lock);
1117 	if ((link = ss->ss_link) == NULL) {
1118 		/* Link destruction already complete */
1119 		mutex_exit(&ss->ss_lock);
1120 		return (0);
1121 	}
1122 
1123 	if (link->l_destroyed) {
1124 		/*
1125 		 * Link destruction has been started by another thread, but has
1126 		 * not completed.  This condition should be impossible to
1127 		 * encounter when performing the on-close destroy of the link,
1128 		 * since racing ioctl accessors must necessarily be absent.
1129 		 */
1130 		VERIFY(!on_close);
1131 		mutex_exit(&ss->ss_lock);
1132 		return (EAGAIN);
1133 	}
1134 	/*
1135 	 * The link deletion cannot fail after this point, continuing until its
1136 	 * successful completion is reached.
1137 	 */
1138 	link->l_destroyed = B_TRUE;
1139 
1140 	/*
1141 	 * Tear down the IO and MMIO port hooks so they cannot be used to kick
1142 	 * any of the rings which are about to be reset and stopped.
1143 	 */
1144 	VERIFY0(viona_ioc_set_notify_ioport(link, 0));
1145 	VERIFY0(viona_ioc_set_notify_mmio(link, NULL, 0));
1146 	mutex_exit(&ss->ss_lock);
1147 
1148 	/*
1149 	 * Return the rings to their reset state, ignoring any possible
1150 	 * interruptions from signals.
1151 	 */
1152 	for (uint16_t i = 0; i < VIONA_NRINGS(link); i++)
1153 		VERIFY0(viona_ring_reset(&link->l_vrings[i], B_FALSE));
1154 
1155 	mutex_enter(&ss->ss_lock);
1156 	viona_kstat_fini(ss);
1157 	if (link->l_mch != NULL) {
1158 		/* Unhook the receive callbacks and close out the client */
1159 		viona_rx_clear(link);
1160 		if (link->l_muh != NULL) {
1161 			VERIFY0(mac_unicast_remove(link->l_mch, link->l_muh));
1162 			link->l_muh = NULL;
1163 		}
1164 		mac_client_close(link->l_mch, 0);
1165 	}
1166 	if (link->l_mh != NULL) {
1167 		mac_close(link->l_mh);
1168 	}
1169 	if (link->l_vm_hold != NULL) {
1170 		vmm_drv_rele(link->l_vm_hold);
1171 		link->l_vm_hold = NULL;
1172 	}
1173 
1174 	nip = link->l_neti;
1175 	link->l_neti = NULL;
1176 
1177 	viona_link_qfree(link);
1178 	pollhead_clean(&link->l_pollhead);
1179 	ss->ss_link = NULL;
1180 	mutex_exit(&ss->ss_lock);
1181 
1182 	mutex_enter(&nip->vni_lock);
1183 	list_remove(&nip->vni_dev_list, ss);
1184 	mutex_exit(&nip->vni_lock);
1185 
1186 	viona_neti_rele(nip);
1187 
1188 	kmem_free(link, sizeof (viona_link_t));
1189 	return (0);
1190 }
1191 
1192 static int
viona_ioc_ring_init(viona_link_t * link,void * udata,int md)1193 viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
1194 {
1195 	vioc_ring_init_t kri;
1196 	int err;
1197 
1198 	if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
1199 		return (EFAULT);
1200 	}
1201 
1202 	if (!VIONA_RING_VALID(link, kri.ri_index))
1203 		return (EINVAL);
1204 
1205 	struct viona_ring_params params = {
1206 		.vrp_pa_desc = kri.ri_qaddr,
1207 		.vrp_pa_avail = 0,
1208 		.vrp_pa_used = 0,
1209 		.vrp_size = kri.ri_qsize,
1210 		.vrp_avail_idx = 0,
1211 		.vrp_used_idx = 0,
1212 	};
1213 
1214 	if ((err = viona_ring_legacy_addr(&params)) != 0)
1215 		return (err);
1216 
1217 	err = viona_ring_init(link, kri.ri_index, &params);
1218 
1219 	return (err);
1220 }
1221 
1222 static int
viona_ioc_ring_init_modern(viona_link_t * link,void * udata,int md)1223 viona_ioc_ring_init_modern(viona_link_t *link, void *udata, int md)
1224 {
1225 	vioc_ring_init_modern_t krim;
1226 	int err;
1227 
1228 	if (ddi_copyin(udata, &krim, sizeof (krim), md) != 0) {
1229 		return (EFAULT);
1230 	}
1231 
1232 	if (!VIONA_RING_VALID(link, krim.rim_index))
1233 		return (EINVAL);
1234 
1235 	const struct viona_ring_params params = {
1236 		.vrp_pa_desc = krim.rim_qaddr_desc,
1237 		.vrp_pa_avail = krim.rim_qaddr_avail,
1238 		.vrp_pa_used = krim.rim_qaddr_used,
1239 		.vrp_size = krim.rim_qsize,
1240 		.vrp_avail_idx = 0,
1241 		.vrp_used_idx = 0,
1242 	};
1243 
1244 	err = viona_ring_init(link, krim.rim_index, &params);
1245 
1246 	return (err);
1247 }
1248 
1249 static int
viona_ioc_ring_set_state(viona_link_t * link,void * udata,int md)1250 viona_ioc_ring_set_state(viona_link_t *link, void *udata, int md)
1251 {
1252 	vioc_ring_state_t krs;
1253 	int err;
1254 
1255 	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
1256 		return (EFAULT);
1257 	}
1258 	const struct viona_ring_params params = {
1259 		.vrp_pa_desc = krs.vrs_qaddr_desc,
1260 		.vrp_pa_avail = krs.vrs_qaddr_avail,
1261 		.vrp_pa_used = krs.vrs_qaddr_used,
1262 		.vrp_size = krs.vrs_qsize,
1263 		.vrp_avail_idx = krs.vrs_avail_idx,
1264 		.vrp_used_idx = krs.vrs_used_idx,
1265 	};
1266 
1267 	err = viona_ring_init(link, krs.vrs_index, &params);
1268 
1269 	return (err);
1270 }
1271 
1272 static int
viona_ioc_ring_get_state(viona_link_t * link,void * udata,int md)1273 viona_ioc_ring_get_state(viona_link_t *link, void *udata, int md)
1274 {
1275 	vioc_ring_state_t krs;
1276 
1277 	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
1278 		return (EFAULT);
1279 	}
1280 
1281 	struct viona_ring_params params;
1282 	int err = viona_ring_get_state(link, krs.vrs_index, &params);
1283 	if (err != 0) {
1284 		return (err);
1285 	}
1286 	krs.vrs_qsize = params.vrp_size;
1287 	krs.vrs_qaddr_desc = params.vrp_pa_desc;
1288 	krs.vrs_qaddr_avail = params.vrp_pa_avail;
1289 	krs.vrs_qaddr_used = params.vrp_pa_used;
1290 	krs.vrs_avail_idx = params.vrp_avail_idx;
1291 	krs.vrs_used_idx = params.vrp_used_idx;
1292 
1293 	if (ddi_copyout(&krs, udata, sizeof (krs), md) != 0) {
1294 		return (EFAULT);
1295 	}
1296 	return (0);
1297 }
1298 
1299 static int
viona_ioc_link_setpairs(viona_link_t * link,uint16_t pairs)1300 viona_ioc_link_setpairs(viona_link_t *link, uint16_t pairs)
1301 {
1302 	int err;
1303 
1304 	/* Unhook the receive callbacks while the rings are being reallocated */
1305 	viona_rx_clear(link);
1306 	err = viona_link_qalloc(link, pairs);
1307 	(void) viona_rx_set(link, link->l_promisc);
1308 
1309 	return (err);
1310 }
1311 
1312 static int
viona_ioc_link_usepairs(viona_link_t * link,uint16_t pairs)1313 viona_ioc_link_usepairs(viona_link_t *link, uint16_t pairs)
1314 {
1315 	if (pairs < VIONA_MIN_QPAIR || pairs > link->l_npairs)
1316 		return (EINVAL);
1317 	link->l_usepairs = pairs;
1318 	return (0);
1319 }
1320 
1321 static int
viona_ioc_ring_reset(viona_link_t * link,uint_t idx)1322 viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
1323 {
1324 	viona_vring_t *ring;
1325 
1326 	if (!VIONA_RING_VALID(link, idx)) {
1327 		return (EINVAL);
1328 	}
1329 	ring = &link->l_vrings[idx];
1330 
1331 	return (viona_ring_reset(ring, B_TRUE));
1332 }
1333 
1334 static int
viona_ioc_ring_kick(viona_link_t * link,uint_t idx)1335 viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
1336 {
1337 	viona_vring_t *ring;
1338 	int err;
1339 
1340 	if (!VIONA_RING_VALID(link, idx)) {
1341 		return (EINVAL);
1342 	}
1343 	ring = &link->l_vrings[idx];
1344 
1345 	mutex_enter(&ring->vr_lock);
1346 	switch (ring->vr_state) {
1347 	case VRS_SETUP:
1348 		/*
1349 		 * An early kick to a ring which is starting its worker thread
1350 		 * is fine.  Once that thread is active, it will process the
1351 		 * start-up request immediately.
1352 		 */
1353 		/* FALLTHROUGH */
1354 	case VRS_INIT:
1355 		ring->vr_state_flags |= VRSF_REQ_START;
1356 		/* FALLTHROUGH */
1357 	case VRS_RUN:
1358 		cv_broadcast(&ring->vr_cv);
1359 		err = 0;
1360 		break;
1361 	default:
1362 		err = EBUSY;
1363 		break;
1364 	}
1365 	mutex_exit(&ring->vr_lock);
1366 
1367 	return (err);
1368 }
1369 
1370 static int
viona_ioc_ring_pause(viona_link_t * link,uint_t idx)1371 viona_ioc_ring_pause(viona_link_t *link, uint_t idx)
1372 {
1373 	if (!VIONA_RING_VALID(link, idx)) {
1374 		return (EINVAL);
1375 	}
1376 
1377 	viona_vring_t *ring = &link->l_vrings[idx];
1378 	return (viona_ring_pause(ring));
1379 }
1380 
1381 static int
viona_ioc_ring_set_msi(viona_link_t * link,void * data,int md)1382 viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
1383 {
1384 	vioc_ring_msi_t vrm;
1385 	viona_vring_t *ring;
1386 
1387 	if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
1388 		return (EFAULT);
1389 	}
1390 	if (!VIONA_RING_VALID(link, vrm.rm_index)) {
1391 		return (EINVAL);
1392 	}
1393 
1394 	ring = &link->l_vrings[vrm.rm_index];
1395 	mutex_enter(&ring->vr_lock);
1396 	ring->vr_msi_addr = vrm.rm_addr;
1397 	ring->vr_msi_msg = vrm.rm_msg;
1398 	mutex_exit(&ring->vr_lock);
1399 
1400 	return (0);
1401 }
1402 
1403 static int
viona_notify_iop(void * arg,bool in,uint16_t port,uint8_t bytes,uint32_t * val)1404 viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes,
1405     uint32_t *val)
1406 {
1407 	viona_link_t *link = (viona_link_t *)arg;
1408 
1409 	/*
1410 	 * If the request is a read (in/ins), or direct at a port other than
1411 	 * what we expect to be registered on, ignore it.
1412 	 */
1413 	if (in || port != link->l_notify_ioport) {
1414 		return (ESRCH);
1415 	}
1416 
1417 	/* Let userspace handle notifications for rings other than RX/TX. */
1418 	const uint16_t vq = *val;
1419 	if (!VIONA_RING_VALID(link, vq)) {
1420 		return (ESRCH);
1421 	}
1422 
1423 	viona_vring_t *ring = &link->l_vrings[vq];
1424 	int res = 0;
1425 
1426 	mutex_enter(&ring->vr_lock);
1427 	if (ring->vr_state == VRS_RUN) {
1428 		cv_broadcast(&ring->vr_cv);
1429 	} else {
1430 		res = ESRCH;
1431 	}
1432 	mutex_exit(&ring->vr_lock);
1433 
1434 	return (res);
1435 }
1436 
1437 static int
viona_ioc_set_notify_ioport(viona_link_t * link,uint16_t ioport)1438 viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport)
1439 {
1440 	int err = 0;
1441 
1442 	if (link->l_notify_ioport != 0) {
1443 		vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
1444 		link->l_notify_ioport = 0;
1445 	}
1446 
1447 	if (ioport != 0) {
1448 		err = vmm_drv_ioport_hook(link->l_vm_hold, ioport,
1449 		    viona_notify_iop, (void *)link, &link->l_notify_cookie);
1450 		if (err == 0) {
1451 			link->l_notify_ioport = ioport;
1452 		}
1453 	}
1454 	return (err);
1455 }
1456 
1457 static int
viona_notify_mmio(void * arg,bool write,uint64_t address,int bytes,uint64_t * val)1458 viona_notify_mmio(void *arg, bool write, uint64_t address, int bytes,
1459     uint64_t *val)
1460 {
1461 	viona_link_t *link = (viona_link_t *)arg;
1462 
1463 	/*
1464 	 * We are only interested in writes to this BAR region; kick reads out
1465 	 * to userspace.
1466 	 */
1467 	if (!write)
1468 		return (ESRCH);
1469 
1470 	const uint16_t vq = *val;
1471 
1472 	/* Let userspace handle notifications for rings other than RX/TX. */
1473 	if (!VIONA_RING_VALID(link, vq))
1474 		return (ESRCH);
1475 
1476 	viona_vring_t *ring = &link->l_vrings[vq];
1477 	int res = 0;
1478 
1479 	mutex_enter(&ring->vr_lock);
1480 	if (ring->vr_state == VRS_RUN)
1481 		cv_broadcast(&ring->vr_cv);
1482 	else
1483 		res = ESRCH;
1484 	mutex_exit(&ring->vr_lock);
1485 
1486 	return (res);
1487 }
1488 
1489 static int
viona_ioc_set_notify_mmio(viona_link_t * link,void * udata,int md)1490 viona_ioc_set_notify_mmio(viona_link_t *link, void *udata, int md)
1491 {
1492 	vioc_notify_mmio_t vim;
1493 	int err = 0;
1494 
1495 	if (link->l_notify_mmaddr != NOTIFY_MMADDR_UNSET) {
1496 		int err = vmm_drv_mmio_unhook(link->l_vm_hold,
1497 		    &link->l_notify_mmcookie);
1498 		VERIFY(err == 0 || err == ENOENT);
1499 		link->l_notify_mmaddr = NOTIFY_MMADDR_UNSET;
1500 	}
1501 
1502 	if (udata == NULL)
1503 		return (0);
1504 
1505 	if (ddi_copyin(udata, &vim, sizeof (vim), md) != 0)
1506 		return (EFAULT);
1507 
1508 	err = vmm_drv_mmio_hook(link->l_vm_hold, vim.vim_address, vim.vim_size,
1509 	    viona_notify_mmio, (void *)link, &link->l_notify_mmcookie);
1510 	if (err == 0) {
1511 		link->l_notify_mmaddr = vim.vim_address;
1512 	}
1513 
1514 	return (err);
1515 }
1516 
1517 static int
viona_ioc_set_promisc(viona_link_t * link,viona_promisc_t mode)1518 viona_ioc_set_promisc(viona_link_t *link, viona_promisc_t mode)
1519 {
1520 	int err;
1521 
1522 	if (mode >= VIONA_PROMISC_MAX) {
1523 		return (EINVAL);
1524 	}
1525 
1526 	if (mode == link->l_promisc) {
1527 		return (0);
1528 	}
1529 
1530 	if ((err = viona_rx_set(link, mode)) != 0) {
1531 		return (err);
1532 	}
1533 
1534 	link->l_promisc = mode;
1535 	return (0);
1536 }
1537 
/* nvlist keys naming the tunable link parameters */
#define	PARAM_NM_TX_COPY_DATA	"tx_copy_data"
#define	PARAM_NM_TX_HEADER_PAD	"tx_header_pad"

/* Per-key error strings reported when parameter parsing fails */
#define	PARAM_ERR_INVALID_TYPE	"invalid type"
#define	PARAM_ERR_OUT_OF_RANGE	"value out of range"
#define	PARAM_ERR_UNK_KEY	"unknown key"
1544 
1545 static nvlist_t *
viona_params_to_nvlist(const viona_link_params_t * vlp)1546 viona_params_to_nvlist(const viona_link_params_t *vlp)
1547 {
1548 	nvlist_t *nvl = fnvlist_alloc();
1549 
1550 	fnvlist_add_boolean_value(nvl, PARAM_NM_TX_COPY_DATA,
1551 	    vlp->vlp_tx_copy_data);
1552 	fnvlist_add_uint16(nvl, PARAM_NM_TX_HEADER_PAD,
1553 	    vlp->vlp_tx_header_pad);
1554 
1555 	return (nvl);
1556 }
1557 
1558 static nvlist_t *
viona_params_from_nvlist(nvlist_t * nvl,viona_link_params_t * vlp)1559 viona_params_from_nvlist(nvlist_t *nvl, viona_link_params_t *vlp)
1560 {
1561 	nvlist_t *nverr = fnvlist_alloc();
1562 	nvpair_t *nvp = NULL;
1563 
1564 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
1565 		const char *name = nvpair_name(nvp);
1566 		const data_type_t dtype = nvpair_type(nvp);
1567 
1568 		if (strcmp(name, PARAM_NM_TX_COPY_DATA) == 0) {
1569 			if (dtype == DATA_TYPE_BOOLEAN_VALUE) {
1570 				vlp->vlp_tx_copy_data =
1571 				    fnvpair_value_boolean_value(nvp);
1572 			} else {
1573 				fnvlist_add_string(nverr, name,
1574 				    PARAM_ERR_INVALID_TYPE);
1575 			}
1576 			continue;
1577 		}
1578 		if (strcmp(name, PARAM_NM_TX_HEADER_PAD) == 0) {
1579 			if (dtype == DATA_TYPE_UINT16) {
1580 				uint16_t value = fnvpair_value_uint16(nvp);
1581 
1582 				if (value > viona_max_header_pad) {
1583 					fnvlist_add_string(nverr, name,
1584 					    PARAM_ERR_OUT_OF_RANGE);
1585 				} else {
1586 					vlp->vlp_tx_header_pad = value;
1587 				}
1588 			} else {
1589 				fnvlist_add_string(nverr, name,
1590 				    PARAM_ERR_INVALID_TYPE);
1591 			}
1592 			continue;
1593 		}
1594 
1595 		/* Reject parameters we do not recognize */
1596 		fnvlist_add_string(nverr, name, PARAM_ERR_UNK_KEY);
1597 	}
1598 
1599 	if (!nvlist_empty(nverr)) {
1600 		return (nverr);
1601 	}
1602 
1603 	nvlist_free(nverr);
1604 	return (NULL);
1605 }
1606 
1607 static void
viona_params_get_defaults(viona_link_params_t * vlp)1608 viona_params_get_defaults(viona_link_params_t *vlp)
1609 {
1610 	vlp->vlp_tx_copy_data = viona_tx_copy_needed();
1611 	vlp->vlp_tx_header_pad = 0;
1612 }
1613 
1614 static int
viona_ioc_get_params(viona_link_t * link,void * udata,int md)1615 viona_ioc_get_params(viona_link_t *link, void *udata, int md)
1616 {
1617 	vioc_get_params_t vgp;
1618 	int err = 0;
1619 
1620 	if (ddi_copyin(udata, &vgp, sizeof (vgp), md) != 0) {
1621 		return (EFAULT);
1622 	}
1623 
1624 	nvlist_t *nvl = NULL;
1625 	if (link != NULL) {
1626 		nvl = viona_params_to_nvlist(&link->l_params);
1627 	} else {
1628 		viona_link_params_t vlp = { 0 };
1629 
1630 		viona_params_get_defaults(&vlp);
1631 		nvl = viona_params_to_nvlist(&vlp);
1632 	}
1633 
1634 	VERIFY(nvl != NULL);
1635 
1636 	size_t packed_sz;
1637 	void *packed = fnvlist_pack(nvl, &packed_sz);
1638 	nvlist_free(nvl);
1639 
1640 	if (packed_sz > vgp.vgp_param_sz) {
1641 		err = E2BIG;
1642 	}
1643 	/* Communicate size, even if the data will not fit */
1644 	vgp.vgp_param_sz = packed_sz;
1645 
1646 	if (err == 0 &&
1647 	    ddi_copyout(packed, vgp.vgp_param, packed_sz, md) != 0) {
1648 		err = EFAULT;
1649 	}
1650 	kmem_free(packed, packed_sz);
1651 
1652 	if (ddi_copyout(&vgp, udata, sizeof (vgp), md) != 0) {
1653 		if (err != 0) {
1654 			err = EFAULT;
1655 		}
1656 	}
1657 
1658 	return (err);
1659 }
1660 
1661 static int
viona_ioc_set_params(viona_link_t * link,void * udata,int md)1662 viona_ioc_set_params(viona_link_t *link, void *udata, int md)
1663 {
1664 	vioc_set_params_t vsp;
1665 	int err = 0;
1666 	nvlist_t *nverr = NULL;
1667 
1668 	if (ddi_copyin(udata, &vsp, sizeof (vsp), md) != 0) {
1669 		return (EFAULT);
1670 	}
1671 
1672 	if (vsp.vsp_param_sz > VIONA_MAX_PARAM_NVLIST_SZ) {
1673 		err = E2BIG;
1674 		goto done;
1675 	} else if (vsp.vsp_param_sz == 0) {
1676 		/*
1677 		 * There is no reason to make this ioctl call with no actual
1678 		 * parameters to be changed.
1679 		 */
1680 		err = EINVAL;
1681 		goto done;
1682 	}
1683 
1684 	const size_t packed_sz = vsp.vsp_param_sz;
1685 	void *packed = kmem_alloc(packed_sz, KM_SLEEP);
1686 	if (ddi_copyin(vsp.vsp_param, packed, packed_sz, md) != 0) {
1687 		kmem_free(packed, packed_sz);
1688 		err = EFAULT;
1689 		goto done;
1690 	}
1691 
1692 	nvlist_t *parsed = NULL;
1693 	if (nvlist_unpack(packed, packed_sz, &parsed, KM_SLEEP) == 0) {
1694 		/* Use the existing parameters as a starting point */
1695 		viona_link_params_t new_params;
1696 		bcopy(&link->l_params, &new_params,
1697 		    sizeof (new_params));
1698 
1699 		nverr = viona_params_from_nvlist(parsed, &new_params);
1700 		if (nverr == NULL) {
1701 			/*
1702 			 * Only apply the updated parameters if there
1703 			 * were no errors during parsing.
1704 			 */
1705 			bcopy(&new_params, &link->l_params,
1706 			    sizeof (new_params));
1707 		} else {
1708 			err = EINVAL;
1709 		}
1710 
1711 	} else {
1712 		err = EINVAL;
1713 	}
1714 	nvlist_free(parsed);
1715 	kmem_free(packed, packed_sz);
1716 
1717 done:
1718 	if (nverr != NULL) {
1719 		size_t err_packed_sz;
1720 		void *err_packed = fnvlist_pack(nverr, &err_packed_sz);
1721 
1722 		if (err_packed_sz > vsp.vsp_error_sz) {
1723 			if (err != 0) {
1724 				err = E2BIG;
1725 			}
1726 		} else if (ddi_copyout(err_packed, vsp.vsp_error,
1727 		    err_packed_sz, md) != 0 && err == 0) {
1728 			err = EFAULT;
1729 		}
1730 		vsp.vsp_error_sz = err_packed_sz;
1731 
1732 		nvlist_free(nverr);
1733 		kmem_free(err_packed, err_packed_sz);
1734 	} else {
1735 		/*
1736 		 * If there are no detailed per-field errors, it is important to
1737 		 * communicate that absense to userspace.
1738 		 */
1739 		vsp.vsp_error_sz = 0;
1740 	}
1741 
1742 	if (ddi_copyout(&vsp, udata, sizeof (vsp), md) != 0 && err == 0) {
1743 		err = EFAULT;
1744 	}
1745 
1746 	return (err);
1747 }
1748 
1749 static int
viona_ioc_ring_intr_clear(viona_link_t * link,uint_t idx)1750 viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
1751 {
1752 	if (!VIONA_RING_VALID(link, idx)) {
1753 		return (EINVAL);
1754 	}
1755 
1756 	link->l_vrings[idx].vr_intr_enabled = 0;
1757 	return (0);
1758 }
1759 
1760 static int
viona_ioc_intr_poll(viona_link_t * link,void * udata,int md,int * rv)1761 viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
1762 {
1763 	vioc_intr_poll_t vip = { 0 };
1764 	uint_t cnt = 0;
1765 
1766 	for (size_t i = 0;
1767 	    i < ARRAY_SIZE(vip.vip_status) && i < VIONA_USABLE_RINGS(link);
1768 	    i++) {
1769 		uint_t val = link->l_vrings[i].vr_intr_enabled;
1770 
1771 		vip.vip_status[i] = val;
1772 		if (val != 0)
1773 			cnt++;
1774 	}
1775 
1776 	if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0)
1777 		return (EFAULT);
1778 
1779 	*rv = (int)cnt;
1780 	return (0);
1781 }
1782 
1783 static int
viona_ioc_intr_poll_mq(viona_link_t * link,void * udata,int md,int * rv)1784 viona_ioc_intr_poll_mq(viona_link_t *link, void *udata, int md, int *rv)
1785 {
1786 	vioc_intr_poll_mq_t vipm;
1787 	uint16_t cnt = 0;
1788 	int err = 0;
1789 
1790 	bzero(&vipm, sizeof (vipm));
1791 
1792 	if (ddi_copyin(udata, &vipm.vipm_nrings, sizeof (vipm.vipm_nrings),
1793 	    md) != 0) {
1794 		return (EFAULT);
1795 	}
1796 
1797 	if (vipm.vipm_nrings < 1 || vipm.vipm_nrings > VIONA_USABLE_RINGS(link))
1798 		return (EINVAL);
1799 
1800 	for (uint_t i = 0; i < vipm.vipm_nrings; i++) {
1801 		if (link->l_vrings[i].vr_intr_enabled) {
1802 			VIONA_INTR_SET(&vipm, i);
1803 			cnt++;
1804 		}
1805 	}
1806 
1807 	if (ddi_copyout(&vipm, udata, sizeof (vipm), md) != 0)
1808 		err = EFAULT;
1809 	else
1810 		*rv = (int)cnt;
1811 
1812 	return (err);
1813 }
1814