xref: /illumos-gate/usr/src/uts/intel/io/viona/viona_main.c (revision 7037363ac736070a1e853a6841cd42d938f1e4f7)
1 /*
2  * Copyright (c) 2013  Chris Torek <torek @ torek net>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 /*
27  * This file and its contents are supplied under the terms of the
28  * Common Development and Distribution License ("CDDL"), version 1.0.
29  * You may only use this file in accordance with the terms of version
30  * 1.0 of the CDDL.
31  *
32  * A full copy of the text of the CDDL should have accompanied this
33  * source.  A copy of the CDDL is also available via the Internet at
34  * http://www.illumos.org/license/CDDL.
35  *
36  * Copyright 2015 Pluribus Networks Inc.
37  * Copyright 2019 Joyent, Inc.
38  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
39  * Copyright 2025 Oxide Computer Company
40  */
41 
42 /*
43  * viona - VirtIO-Net, Accelerated
44  *
45  * The purpose of viona is to provide high performance virtio-net devices to
46  * bhyve guests.  It does so by sitting directly atop MAC, skipping all of the
47  * DLS/DLD stack.
48  *
49  * --------------------
50  * General Architecture
51  * --------------------
52  *
53  * A single viona instance is comprised of a "link" handle and two "rings".
54  * After opening the viona device, it must be associated with a MAC network
55  * interface and a bhyve (vmm) instance to form its link resource.  This is
56  * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
57  * passed in to perform the initialization.  With the MAC client opened, and a
58  * driver handle to the vmm instance established, the device is ready to be
59  * configured by the guest.
60  *
61  * The userspace portion of bhyve, which interfaces with the PCI device
62  * emulation framework, is meant to stay out of the datapath if at all
63  * possible.  Configuration changes made via PCI are mapped to actions which
64  * will steer the operation of the in-kernel logic.
65  *
66  *
67  * -----------
68  * Ring Basics
69  * -----------
70  *
71  * Each viona link has two viona_vring_t entities, RX and TX, for handling data
72  * transfers to and from the guest.  They represent an interface to the
73  * standard virtio ring structures.  When initialized and active, each ring is
74  * backed by a kernel worker thread (parented to the bhyve process for the
75  * instance) which handles ring events.  The RX worker has the simple task of
76  * watching for ring shutdown conditions.  The TX worker does that in addition
77  * to processing all requests to transmit data.  Data destined for the guest is
78  * delivered directly by MAC to viona_rx() when the ring is active.
79  *
80  *
81  * -----------
82  * Ring States
83  * -----------
84  *
85  * The viona_vring_t instances follow a simple path through the possible state
86  * values represented in virtio_vring_t`vr_state:
87  *
88  *        +<--------------------------------------------+
89  *        |						|
90  *        V						^
91  *  +-----------+	This is the initial state when a link is created or
92  *  | VRS_RESET |	when the ring has been explicitly reset.
93  *  +-----------+
94  *        |						^
95  *        |---* ioctl(VNA_IOC_RING_INIT) issued		|
96  *        |						|
97  *        |						^
98  *        V
99  *  +-----------+	The ring parameters (size, guest physical addresses)
100  *  | VRS_SETUP |	have been set and start-up of the ring worker thread
101  *  +-----------+	has begun.
102  *        |						^
103  *        |						|
104  *        |---* ring worker thread begins execution	|
105  *        |						|
106  *        +-------------------------------------------->+
107  *        |	      |					^
108  *        |	      |
109  *        |	      *	If ring shutdown is requested (by ioctl or impending
110  *        |		bhyve process death) while the worker thread is
111  *        |		starting, the worker will transition the ring to
112  *        |		VRS_RESET and exit.
113  *        |						^
114  *        |						|
115  *        |<-------------------------------------------<+
116  *        |	      |					|
117  *        |	      |					^
 *        |	      *	If ring is requested to pause (but not stop) from the
119  *        |             VRS_RUN state, it will return to the VRS_INIT state.
120  *        |
121  *        |						^
122  *        |						|
123  *        |						^
124  *        V
125  *  +-----------+	The worker thread associated with the ring has started
126  *  | VRS_INIT  |	executing.  It has allocated any extra resources needed
127  *  +-----------+	for the ring to operate.
128  *        |						^
129  *        |						|
130  *        +-------------------------------------------->+
131  *        |	      |					^
132  *        |	      |
133  *        |	      *	If ring shutdown is requested while the worker is
134  *        |		waiting in VRS_INIT, it will free any extra resources
135  *        |		and transition to VRS_RESET.
136  *        |						^
137  *        |						|
138  *        |--* ioctl(VNA_IOC_RING_KICK) issued		|
139  *        |						^
140  *        V
141  *  +-----------+	The worker thread associated with the ring is executing
142  *  | VRS_RUN   |	workload specific to that ring.
143  *  +-----------+
144  *        |						^
145  *        |---* ioctl(VNA_IOC_RING_RESET) issued	|
146  *        |	(or bhyve process begins exit)		^
147  *        |
148  *  +-----------+	The worker thread associated with the ring is in the
149  *  | VRS_STOP  |	process of exiting. All outstanding TX and RX
150  *  +-----------+	requests are allowed to complete, but new requests
151  *        |		must be ignored.
152  *        |						^
153  *        |						|
154  *        +-------------------------------------------->+
155  *
156  *
157  * While the worker thread is not running, changes to vr_state are only made by
158  * viona_ioc_ring_init() under vr_lock.  There, it initializes the ring, starts
159  * the worker, and sets the ring state to VRS_SETUP.  Once the worker thread
160  * has been started, only it may perform ring state transitions (still under
161  * the protection of vr_lock), when requested by outside consumers via
162  * vr_state_flags or when the containing bhyve process initiates an exit.
163  *
164  *
165  * ----------------------------
166  * Transmission mblk_t Handling
167  * ----------------------------
168  *
169  * For incoming frames destined for a bhyve guest, the data must first land in
170  * a host OS buffer from the physical NIC before it is copied into the awaiting
171  * guest buffer(s).  Outbound frames transmitted by the guest are not bound by
172  * this limitation and can avoid extra copying before the buffers are accessed
173  * directly by the NIC.  When a guest designates buffers to be transmitted,
174  * viona translates the guest-physical addresses contained in the ring
175  * descriptors to host-virtual addresses via viona_hold_page().  That pointer is
176  * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc().
177  * Doing so increments vr_xfer_outstanding, preventing the ring from being
178  * reset (allowing the link to drop its vmm handle to the guest) until all
179  * transmit mblks referencing guest memory have been processed.  Allocation of
180  * the viona_desb_t entries is done during the VRS_INIT stage of the ring
181  * worker thread.  The ring size informs that allocation as the number of
182  * concurrent transmissions is limited by the number of descriptors in the
183  * ring.  This minimizes allocation in the transmit hot-path by acquiring those
184  * fixed-size resources during initialization.
185  *
186  * This optimization depends on the underlying NIC driver freeing the mblks in
187  * a timely manner after they have been transmitted by the hardware.  Some
188  * drivers have been found to flush TX descriptors only when new transmissions
189  * are initiated.  This means that there is no upper bound to the time needed
190  * for an mblk to be flushed and can stall bhyve guests from shutting down
191  * since their memory must be free of viona TX references prior to clean-up.
192  *
193  * This expectation of deterministic mblk_t processing is likely the reason
194  * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
195  * loaded will copy transmit data into fresh buffers rather than passing up
196  * zero-copy mblks.  It is a hold-over from the original viona sources provided
197  * by Pluribus and its continued necessity has not been confirmed.
198  *
199  *
200  * ----------------------------
201  * Ring Notification Fast-paths
202  * ----------------------------
203  *
204  * Device operation for viona requires that notifications flow to and from the
205  * guest to indicate certain ring conditions.  In order to minimize latency and
206  * processing overhead, the notification procedures are kept in-kernel whenever
207  * possible.
208  *
209  * Guest-to-host notifications, when new available descriptors have been placed
210  * in the ring, are posted via the 'queue notify' address in the virtio BAR.
211  * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to
212  * install a callback hook on an ioport address.  Guest exits for accesses to
213  * viona-hooked ioport addresses will result in direct calls to notify the
214  * appropriate ring worker without a trip to userland.
215  *
216  * Host-to-guest notifications in the form of interrupts enjoy similar
217  * acceleration.  Each viona ring can be configured to send MSI notifications
218  * to the guest as virtio conditions dictate.  This in-kernel interrupt
219  * configuration is kept synchronized through viona ioctls which are utilized
220  * during writes to the associated PCI config registers or MSI-X BAR.
221  *
222  * Guests which do not utilize MSI-X will result in viona falling back to the
223  * slow path for interrupts.  It will poll(2) the viona handle, receiving
224  * notification when ring events necessitate the assertion of an interrupt.
225  *
226  *
227  * ---------------
228  * Nethook Support
229  * ---------------
230  *
231  * Viona provides four nethook events that consumers (e.g. ipf) can hook into
232  * to intercept packets as they go up or down the stack.  Unfortunately,
233  * the nethook framework does not understand raw packets, so we can only
234  * generate events (in, out) for IPv4 and IPv6 packets.  At driver attach,
235  * we register callbacks with the neti (netinfo) module that will be invoked
236  * for each netstack already present, as well as for any additional netstack
237  * instances created as the system operates.  These callbacks will
238  * register/unregister the hooks with the nethook framework for each
239  * netstack instance.  This registration occurs prior to creating any
240  * viona instances for a given netstack, and the unregistration for a netstack
241  * instance occurs after all viona instances of the netstack instance have
242  * been deleted.
243  *
244  * ------------------
245  * Metrics/Statistics
 * ------------------
247  *
248  * During operation, Viona tracks certain metrics as certain events occur.
249  *
250  * One class of metrics, known as the "error stats", refer to abnormal
251  * conditions in ring processing which are likely the fault of a misbehaving
252  * guest.  These are tracked on a per-ring basis, and are not formally exposed
253  * to any consumer besides direct memory access through mdb.
254  *
255  * The other class of metrics tracked for an instance are the "transfer stats",
256  * which are the traditional packets/bytes/errors/drops figures.  These are
257  * counted per-ring, and then aggregated into link-wide values exposed via
258  * kstats.  Atomic operations are used to increment those per-ring stats during
259  * operation, and then when a ring is stopped, the values are consolidated into
260  * the link-wide values (to prevent loss when the ring is zeroed) under the
261  * protection of viona_link`l_stats_lock.  When the kstats are being updated,
262  * l_stats_lock is held to protect against a racing consolidation, with the
263  * existing per-ring values being added in at update time to provide an accurate
264  * figure.
265  */
266 
267 #include <sys/conf.h>
268 #include <sys/file.h>
269 #include <sys/stat.h>
270 
271 #include <sys/dlpi.h>
272 #include <sys/vlan.h>
273 
274 #include "viona_impl.h"
275 
276 
277 #define	VIONA_NAME		"Virtio Network Accelerator"
278 #define	VIONA_CTL_MINOR		0
279 #define	VIONA_MODULE_NAME	"viona"
280 #define	VIONA_KSTAT_CLASS	"misc"
281 #define	VIONA_KSTAT_NAME	"viona_stat"
282 
283 
284 /*
285  * Host capabilities.
286  */
287 #define	VIONA_S_HOSTCAPS	(	\
288 	VIRTIO_NET_F_GUEST_CSUM |	\
289 	VIRTIO_NET_F_MAC |		\
290 	VIRTIO_NET_F_GUEST_TSO4 |	\
291 	VIRTIO_NET_F_MRG_RXBUF |	\
292 	VIRTIO_NET_F_STATUS |		\
293 	VIRTIO_F_RING_NOTIFY_ON_EMPTY |	\
294 	VIRTIO_F_RING_INDIRECT_DESC)
295 
296 /* MAC_CAPAB_HCKSUM specifics of interest */
297 #define	VIONA_CAP_HCKSUM_INTEREST	\
298 	(HCKSUM_INET_PARTIAL |		\
299 	HCKSUM_INET_FULL_V4 |		\
300 	HCKSUM_INET_FULL_V6)
301 
302 static void		*viona_state;
303 static dev_info_t	*viona_dip;
304 static id_space_t	*viona_minors;
305 
306 
307 static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
308     void **result);
309 static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
310 static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
311 static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
312 static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
313 static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
314     cred_t *credp, int *rval);
315 static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
316     struct pollhead **phpp);
317 
318 static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
319 static int viona_ioc_delete(viona_soft_state_t *, boolean_t);
320 
321 static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t);
322 static int viona_ioc_set_promisc(viona_link_t *, viona_promisc_t);
323 static int viona_ioc_get_params(viona_link_t *, void *, int);
324 static int viona_ioc_set_params(viona_link_t *, void *, int);
325 static int viona_ioc_ring_init(viona_link_t *, void *, int);
326 static int viona_ioc_ring_set_state(viona_link_t *, void *, int);
327 static int viona_ioc_ring_get_state(viona_link_t *, void *, int);
328 static int viona_ioc_ring_reset(viona_link_t *, uint_t);
329 static int viona_ioc_ring_kick(viona_link_t *, uint_t);
330 static int viona_ioc_ring_pause(viona_link_t *, uint_t);
331 static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
332 static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
333 static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);
334 
335 static void viona_params_get_defaults(viona_link_params_t *);
336 
337 static struct cb_ops viona_cb_ops = {
338 	viona_open,
339 	viona_close,
340 	nodev,
341 	nodev,
342 	nodev,
343 	nodev,
344 	nodev,
345 	viona_ioctl,
346 	nodev,
347 	nodev,
348 	nodev,
349 	viona_chpoll,
350 	ddi_prop_op,
351 	0,
352 	D_MP | D_NEW | D_HOTPLUG,
353 	CB_REV,
354 	nodev,
355 	nodev
356 };
357 
358 static struct dev_ops viona_ops = {
359 	DEVO_REV,
360 	0,
361 	viona_info,
362 	nulldev,
363 	nulldev,
364 	viona_attach,
365 	viona_detach,
366 	nodev,
367 	&viona_cb_ops,
368 	NULL,
369 	ddi_power,
370 	ddi_quiesce_not_needed
371 };
372 
373 static struct modldrv modldrv = {
374 	&mod_driverops,
375 	VIONA_NAME,
376 	&viona_ops,
377 };
378 
379 static struct modlinkage modlinkage = {
380 	MODREV_1, &modldrv, NULL
381 };
382 
383 int
_init(void)384 _init(void)
385 {
386 	int ret;
387 
388 	ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0);
389 	if (ret != 0) {
390 		return (ret);
391 	}
392 
393 	viona_minors = id_space_create("viona_minors",
394 	    VIONA_CTL_MINOR + 1, UINT16_MAX);
395 	viona_rx_init();
396 	mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);
397 
398 	ret = mod_install(&modlinkage);
399 	if (ret != 0) {
400 		ddi_soft_state_fini(&viona_state);
401 		id_space_destroy(viona_minors);
402 		viona_rx_fini();
403 		mutex_destroy(&viona_force_copy_lock);
404 	}
405 
406 	return (ret);
407 }
408 
409 int
_fini(void)410 _fini(void)
411 {
412 	int ret;
413 
414 	ret = mod_remove(&modlinkage);
415 	if (ret != 0) {
416 		return (ret);
417 	}
418 
419 	ddi_soft_state_fini(&viona_state);
420 	id_space_destroy(viona_minors);
421 	viona_rx_fini();
422 	mutex_destroy(&viona_force_copy_lock);
423 
424 	return (ret);
425 }
426 
427 int
_info(struct modinfo * modinfop)428 _info(struct modinfo *modinfop)
429 {
430 	return (mod_info(&modlinkage, modinfop));
431 }
432 
433 /* ARGSUSED */
434 static int
viona_info(dev_info_t * dip,ddi_info_cmd_t cmd,void * arg,void ** result)435 viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
436 {
437 	int error;
438 
439 	switch (cmd) {
440 	case DDI_INFO_DEVT2DEVINFO:
441 		*result = (void *)viona_dip;
442 		error = DDI_SUCCESS;
443 		break;
444 	case DDI_INFO_DEVT2INSTANCE:
445 		*result = (void *)0;
446 		error = DDI_SUCCESS;
447 		break;
448 	default:
449 		error = DDI_FAILURE;
450 		break;
451 	}
452 	return (error);
453 }
454 
455 static int
viona_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)456 viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
457 {
458 	if (cmd != DDI_ATTACH) {
459 		return (DDI_FAILURE);
460 	}
461 
462 	if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
463 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
464 		return (DDI_FAILURE);
465 	}
466 
467 	viona_neti_attach();
468 
469 	viona_dip = dip;
470 	ddi_report_dev(viona_dip);
471 
472 	return (DDI_SUCCESS);
473 }
474 
475 static int
viona_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)476 viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
477 {
478 	dev_info_t *old_dip = viona_dip;
479 
480 	if (cmd != DDI_DETACH) {
481 		return (DDI_FAILURE);
482 	}
483 
484 	VERIFY(old_dip != NULL);
485 
486 	viona_neti_detach();
487 	viona_dip = NULL;
488 	ddi_remove_minor_node(old_dip, NULL);
489 
490 	return (DDI_SUCCESS);
491 }
492 
493 static int
viona_open(dev_t * devp,int flag,int otype,cred_t * credp)494 viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
495 {
496 	int	minor;
497 	viona_soft_state_t *ss;
498 
499 	if (otype != OTYP_CHR) {
500 		return (EINVAL);
501 	}
502 #if 0
503 	/*
504 	 * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
505 	 * Should the check be at open() or ioctl()?
506 	 */
507 	if (drv_priv(credp) != 0) {
508 		return (EPERM);
509 	}
510 #endif
511 	if (getminor(*devp) != VIONA_CTL_MINOR) {
512 		return (ENXIO);
513 	}
514 
515 	minor = id_alloc_nosleep(viona_minors);
516 	if (minor == -1) {
517 		/* All minors are busy */
518 		return (EBUSY);
519 	}
520 	if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
521 		id_free(viona_minors, minor);
522 		return (ENOMEM);
523 	}
524 
525 	ss = ddi_get_soft_state(viona_state, minor);
526 	mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
527 	ss->ss_minor = minor;
528 	*devp = makedevice(getmajor(*devp), minor);
529 
530 	return (0);
531 }
532 
533 static int
viona_close(dev_t dev,int flag,int otype,cred_t * credp)534 viona_close(dev_t dev, int flag, int otype, cred_t *credp)
535 {
536 	int			minor;
537 	viona_soft_state_t	*ss;
538 
539 	if (otype != OTYP_CHR) {
540 		return (EINVAL);
541 	}
542 
543 	minor = getminor(dev);
544 
545 	ss = ddi_get_soft_state(viona_state, minor);
546 	if (ss == NULL) {
547 		return (ENXIO);
548 	}
549 
550 	VERIFY0(viona_ioc_delete(ss, B_TRUE));
551 	VERIFY(!list_link_active(&ss->ss_node));
552 	ddi_soft_state_free(viona_state, minor);
553 	id_free(viona_minors, minor);
554 
555 	return (0);
556 }
557 
/*
 * ioctl(9E) entry point.
 *
 * Dispatch is split into two phases: a first switch handles commands which
 * are valid without an established link (create/delete/version/defaults),
 * returning directly.  All remaining commands require a live link, so they
 * run under ss_lock after verifying the link exists, is not being torn
 * down, and that the underlying vmm instance is not demanding release.
 */
static int
viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
{
	viona_soft_state_t *ss;
	void *dptr = (void *)data;
	int err = 0, val;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	/* Commands which do not require (or establish) a link. */
	switch (cmd) {
	case VNA_IOC_CREATE:
		return (viona_ioc_create(ss, dptr, md, cr));
	case VNA_IOC_DELETE:
		return (viona_ioc_delete(ss, B_FALSE));
	case VNA_IOC_VERSION:
		*rv = VIONA_CURRENT_INTERFACE_VERSION;
		return (0);
	case VNA_IOC_DEFAULT_PARAMS:
		/*
		 * With a NULL link parameter, viona_ioc_get_params() will emit
		 * the default parameters with the same error-handling behavior
		 * as VNA_IOC_GET_PARAMS.
		 */
		return (viona_ioc_get_params(NULL, dptr, md));
	default:
		break;
	}

	/*
	 * Everything else needs a usable link.  Bail if none exists, if it
	 * is mid-destruction, or if vmm has requested its hold be released.
	 */
	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed ||
	    vmm_drv_release_reqd(link->l_vm_hold)) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_GET_FEATURES:
		/* Advertised features: static host caps plus HW-derived ones */
		val = VIONA_S_HOSTCAPS | link->l_features_hw;
		if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
			err = EFAULT;
		}
		break;
	case VNA_IOC_SET_FEATURES:
		if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
			err = EFAULT;
			break;
		}
		/* Clamp the request to what was actually offered. */
		val &= (VIONA_S_HOSTCAPS | link->l_features_hw);

		/* TSO is only meaningful with the matching csum feature. */
		if ((val & VIRTIO_NET_F_CSUM) == 0)
			val &= ~VIRTIO_NET_F_HOST_TSO4;

		if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
			val &= ~VIRTIO_NET_F_GUEST_TSO4;

		link->l_features = val;
		break;
	case VNA_IOC_RING_INIT:
		err = viona_ioc_ring_init(link, dptr, md);
		break;
	case VNA_IOC_RING_RESET:
		err = viona_ioc_ring_reset(link, (uint_t)data);
		break;
	case VNA_IOC_RING_KICK:
		err = viona_ioc_ring_kick(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_MSI:
		err = viona_ioc_ring_set_msi(link, dptr, md);
		break;
	case VNA_IOC_RING_INTR_CLR:
		err = viona_ioc_ring_intr_clear(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_STATE:
		err = viona_ioc_ring_set_state(link, dptr, md);
		break;
	case VNA_IOC_RING_GET_STATE:
		err = viona_ioc_ring_get_state(link, dptr, md);
		break;
	case VNA_IOC_RING_PAUSE:
		err = viona_ioc_ring_pause(link, (uint_t)data);
		break;

	case VNA_IOC_INTR_POLL:
		err = viona_ioc_intr_poll(link, dptr, md, rv);
		break;
	case VNA_IOC_SET_NOTIFY_IOP:
		/* The ioport address must fit in 16 bits. */
		if (data < 0 || data > UINT16_MAX) {
			err = EINVAL;
			break;
		}
		err = viona_ioc_set_notify_ioport(link, (uint16_t)data);
		break;
	case VNA_IOC_SET_PROMISC:
		err = viona_ioc_set_promisc(link, (viona_promisc_t)data);
		break;
	case VNA_IOC_GET_PARAMS:
		err = viona_ioc_get_params(link, dptr, md);
		break;
	case VNA_IOC_SET_PARAMS:
		err = viona_ioc_set_params(link, dptr, md);
		break;
	case VNA_IOC_GET_MTU:
		*rv = (int)link->l_mtu;
		break;
	case VNA_IOC_SET_MTU:
		if (data < VIONA_MIN_MTU || data > VIONA_MAX_MTU)
			err = EINVAL;
		else
			link->l_mtu = (uint16_t)data;
		break;
	default:
		err = ENOTTY;
		break;
	}

	mutex_exit(&ss->ss_lock);
	return (err);
}
680 
681 static int
viona_chpoll(dev_t dev,short events,int anyyet,short * reventsp,struct pollhead ** phpp)682 viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
683     struct pollhead **phpp)
684 {
685 	viona_soft_state_t *ss;
686 	viona_link_t *link;
687 
688 	ss = ddi_get_soft_state(viona_state, getminor(dev));
689 	if (ss == NULL) {
690 		return (ENXIO);
691 	}
692 
693 	mutex_enter(&ss->ss_lock);
694 	if ((link = ss->ss_link) == NULL || link->l_destroyed) {
695 		mutex_exit(&ss->ss_lock);
696 		return (ENXIO);
697 	}
698 
699 	*reventsp = 0;
700 	if ((events & POLLRDBAND) != 0) {
701 		for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
702 			if (link->l_vrings[i].vr_intr_enabled != 0) {
703 				*reventsp |= POLLRDBAND;
704 				break;
705 			}
706 		}
707 	}
708 	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
709 		*phpp = &link->l_pollhead;
710 	}
711 	mutex_exit(&ss->ss_lock);
712 
713 	return (0);
714 }
715 
716 static void
viona_get_mac_capab(viona_link_t * link)717 viona_get_mac_capab(viona_link_t *link)
718 {
719 	mac_handle_t mh = link->l_mh;
720 	uint32_t cap = 0;
721 	mac_capab_lso_t lso_cap;
722 
723 	link->l_features_hw = 0;
724 	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
725 		/*
726 		 * Only report HW checksum ability if the underlying MAC
727 		 * resource is capable of populating the L4 header.
728 		 */
729 		if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
730 			link->l_features_hw |= VIRTIO_NET_F_CSUM;
731 		}
732 		link->l_cap_csum = cap;
733 	}
734 
735 	if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
736 	    mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
737 		/*
738 		 * Virtio doesn't allow for negotiating a maximum LSO
739 		 * packet size. We have to assume that the guest may
740 		 * send a maximum length IP packet. Make sure the
741 		 * underlying MAC can handle an LSO of this size.
742 		 */
743 		if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
744 		    lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
745 			link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
746 	}
747 }
748 
749 static int
viona_kstat_update(kstat_t * ksp,int rw)750 viona_kstat_update(kstat_t *ksp, int rw)
751 {
752 	viona_link_t *link = ksp->ks_private;
753 	viona_kstats_t *vk = ksp->ks_data;
754 
755 	/*
756 	 * Avoid the potential for mangled values due to a racing consolidation
757 	 * of stats for a ring by performing the kstat update with l_stats_lock
758 	 * held while adding up the central (link) and ring values.
759 	 */
760 	mutex_enter(&link->l_stats_lock);
761 
762 	const viona_transfer_stats_t *ring_stats =
763 	    &link->l_vrings[VIONA_VQ_RX].vr_stats;
764 	const viona_transfer_stats_t *link_stats = &link->l_stats.vls_rx;
765 
766 	vk->vk_rx_packets.value.ui64 =
767 	    link_stats->vts_packets + ring_stats->vts_packets;
768 	vk->vk_rx_bytes.value.ui64 =
769 	    link_stats->vts_bytes + ring_stats->vts_bytes;
770 	vk->vk_rx_errors.value.ui64 =
771 	    link_stats->vts_errors + ring_stats->vts_errors;
772 	vk->vk_rx_drops.value.ui64 =
773 	    link_stats->vts_drops + ring_stats->vts_drops;
774 
775 	ring_stats = &link->l_vrings[VIONA_VQ_TX].vr_stats;
776 	link_stats = &link->l_stats.vls_tx;
777 
778 	vk->vk_tx_packets.value.ui64 =
779 	    link_stats->vts_packets + ring_stats->vts_packets;
780 	vk->vk_tx_bytes.value.ui64 =
781 	    link_stats->vts_bytes + ring_stats->vts_bytes;
782 	vk->vk_tx_errors.value.ui64 =
783 	    link_stats->vts_errors + ring_stats->vts_errors;
784 	vk->vk_tx_drops.value.ui64 =
785 	    link_stats->vts_drops + ring_stats->vts_drops;
786 
787 	mutex_exit(&link->l_stats_lock);
788 
789 	return (0);
790 }
791 
792 static int
viona_kstat_init(viona_soft_state_t * ss,const cred_t * cr)793 viona_kstat_init(viona_soft_state_t *ss, const cred_t *cr)
794 {
795 	zoneid_t zid = crgetzoneid(cr);
796 	kstat_t *ksp;
797 
798 	ASSERT(MUTEX_HELD(&ss->ss_lock));
799 	ASSERT3P(ss->ss_kstat, ==, NULL);
800 
801 	ksp = kstat_create_zone(VIONA_MODULE_NAME, ss->ss_minor,
802 	    VIONA_KSTAT_NAME, VIONA_KSTAT_CLASS, KSTAT_TYPE_NAMED,
803 	    sizeof (viona_kstats_t) / sizeof (kstat_named_t), 0, zid);
804 
805 	if (ksp == NULL) {
806 		/*
807 		 * Without detail from kstat_create_zone(), assume that resource
808 		 * exhaustion is to blame for the failure.
809 		 */
810 		return (ENOMEM);
811 	}
812 	ss->ss_kstat = ksp;
813 
814 	/*
815 	 * If this instance is associated with a non-global zone, make its
816 	 * kstats visible from the GZ.
817 	 */
818 	if (zid != GLOBAL_ZONEID) {
819 		kstat_zone_add(ss->ss_kstat, GLOBAL_ZONEID);
820 	}
821 
822 	viona_kstats_t *vk = ksp->ks_data;
823 
824 	kstat_named_init(&vk->vk_rx_packets, "rx_packets", KSTAT_DATA_UINT64);
825 	kstat_named_init(&vk->vk_rx_bytes, "rx_bytes", KSTAT_DATA_UINT64);
826 	kstat_named_init(&vk->vk_rx_errors, "rx_errors", KSTAT_DATA_UINT64);
827 	kstat_named_init(&vk->vk_rx_drops, "rx_drops", KSTAT_DATA_UINT64);
828 	kstat_named_init(&vk->vk_tx_packets, "tx_packets", KSTAT_DATA_UINT64);
829 	kstat_named_init(&vk->vk_tx_bytes, "tx_bytes", KSTAT_DATA_UINT64);
830 	kstat_named_init(&vk->vk_tx_errors, "tx_errors", KSTAT_DATA_UINT64);
831 	kstat_named_init(&vk->vk_tx_drops, "tx_drops", KSTAT_DATA_UINT64);
832 	ksp->ks_private = ss->ss_link;
833 	ksp->ks_update = viona_kstat_update;
834 
835 	kstat_install(ss->ss_kstat);
836 	return (0);
837 }
838 
839 static void
viona_kstat_fini(viona_soft_state_t * ss)840 viona_kstat_fini(viona_soft_state_t *ss)
841 {
842 	ASSERT(MUTEX_HELD(&ss->ss_lock));
843 
844 	if (ss->ss_kstat != NULL) {
845 		kstat_delete(ss->ss_kstat);
846 		ss->ss_kstat = NULL;
847 	}
848 }
849 
/*
 * VNA_IOC_CREATE handler: associate this viona instance with a datalink and
 * a vmm instance, forming its link resource.
 *
 * Acquires, in order: a netinfo hold for the caller's zone, a vmm driver
 * hold via the passed-in fd, the MAC handle/client/unicast slot for the
 * datalink, the RX/TX rings, the MAC RX callback, and finally the instance
 * kstat.  Any failure jumps to 'bail', which unwinds whatever subset was
 * established (ss_lock is held from the EEXIST check through to either the
 * success-path exit or the end of 'bail').
 */
static int
viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
{
	vioc_create_t	kvc;
	viona_link_t	*link = NULL;
	char		cli_name[MAXNAMELEN];
	int		err = 0;
	file_t		*fp;
	vmm_hold_t	*hold = NULL;
	viona_neti_t	*nip = NULL;
	zoneid_t	zid;
	mac_diag_t	mac_diag = MAC_DIAG_NONE;
	boolean_t	rings_allocd = B_FALSE;

	ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));

	if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
		return (EFAULT);
	}

	/* A netinfo instance must exist (and be hooked) for this zone. */
	zid = crgetzoneid(cr);
	nip = viona_neti_lookup_by_zid(zid);
	if (nip == NULL) {
		return (EIO);
	}

	if (!nip->vni_nethook.vnh_hooked) {
		viona_neti_rele(nip);
		return (EIO);
	}

	mutex_enter(&ss->ss_lock);
	/* Only one link per instance. */
	if (ss->ss_link != NULL) {
		mutex_exit(&ss->ss_lock);
		viona_neti_rele(nip);
		return (EEXIST);
	}

	/* Establish a vmm hold via the fd passed in by userspace. */
	if ((fp = getf(kvc.c_vmfd)) == NULL) {
		err = EBADF;
		goto bail;
	}
	err = vmm_drv_hold(fp, cr, &hold);
	releasef(kvc.c_vmfd);
	if (err != 0) {
		goto bail;
	}

	link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
	link->l_linkid = kvc.c_linkid;
	link->l_vm_hold = hold;
	link->l_mtu = VIONA_DEFAULT_MTU;

	err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
	if (err != 0) {
		goto bail;
	}

	viona_get_mac_capab(link);
	viona_params_get_defaults(&link->l_params);

	/* MAC client named "viona-<linkid>" for identification purposes. */
	(void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_MODULE_NAME,
	    link->l_linkid);
	err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
	if (err != 0) {
		goto bail;
	}

	err = mac_unicast_add(link->l_mch, NULL, MAC_UNICAST_PRIMARY,
	    &link->l_muh, VLAN_ID_NONE, &mac_diag);
	if (err != 0) {
		goto bail;
	}

	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);
	rings_allocd = B_TRUE;

	/*
	 * Default to passing up all multicast traffic in addition to
	 * classified unicast. Guests which have support will change this
	 * if they need to via the virtio net control queue; guests without
	 * support generally still want to see multicast.
	 */
	link->l_promisc = VIONA_PROMISC_MULTI;
	if ((err = viona_rx_set(link, link->l_promisc)) != 0) {
		goto bail;
	}

	link->l_neti = nip;
	ss->ss_link = link;

	if ((err = viona_kstat_init(ss, cr)) != 0) {
		goto bail;
	}

	mutex_exit(&ss->ss_lock);

	/* Track this instance on the zone's netinfo device list. */
	mutex_enter(&nip->vni_lock);
	list_insert_tail(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	return (0);

bail:
	/* Unwind (in reverse order) whatever was acquired before failing. */
	if (link != NULL) {
		viona_rx_clear(link);
		if (link->l_mch != NULL) {
			if (link->l_muh != NULL) {
				VERIFY0(mac_unicast_remove(link->l_mch,
				    link->l_muh));
				link->l_muh = NULL;
			}
			mac_client_close(link->l_mch, 0);
		}
		if (link->l_mh != NULL) {
			mac_close(link->l_mh);
		}
		if (rings_allocd) {
			viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
			viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
		}
		kmem_free(link, sizeof (viona_link_t));
		ss->ss_link = NULL;
	}
	if (hold != NULL) {
		vmm_drv_rele(hold);
	}
	viona_neti_rele(nip);

	mutex_exit(&ss->ss_lock);
	return (err);
}
983 
/*
 * Tear down the link (if any) associated with this soft state, either via
 * the delete ioctl or the final close of the device (`on_close`).
 *
 * Destruction is serialized through l_destroyed: once set, the teardown
 * cannot fail and proceeds to completion, while a racing ioctl caller that
 * observes the flag gets EAGAIN.  Note that ss_lock is dropped around the
 * ring resets (which may block on worker threads) and reacquired for the
 * remainder of the teardown.
 */
static int
viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
{
	viona_link_t *link;
	viona_neti_t *nip = NULL;

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL) {
		/* Link destruction already complete */
		mutex_exit(&ss->ss_lock);
		return (0);
	}

	if (link->l_destroyed) {
		/*
		 * Link destruction has been started by another thread, but has
		 * not completed.  This condition should be impossible to
		 * encounter when performing the on-close destroy of the link,
		 * since racing ioctl accessors must necessarily be absent.
		 */
		VERIFY(!on_close);
		mutex_exit(&ss->ss_lock);
		return (EAGAIN);
	}
	/*
	 * The link deletion cannot fail after this point, continuing until its
	 * successful completion is reached.
	 */
	link->l_destroyed = B_TRUE;

	/*
	 * Tear down the IO port hook so it cannot be used to kick any of the
	 * rings which are about to be reset and stopped.
	 */
	VERIFY0(viona_ioc_set_notify_ioport(link, 0));
	mutex_exit(&ss->ss_lock);

	/*
	 * Return the rings to their reset state, ignoring any possible
	 * interruptions from signals.
	 */
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));

	mutex_enter(&ss->ss_lock);
	/* Remove kstats before the MAC/vmm resources they describe go away */
	viona_kstat_fini(ss);
	if (link->l_mch != NULL) {
		/* Unhook the receive callbacks and close out the client */
		viona_rx_clear(link);
		if (link->l_muh != NULL) {
			VERIFY0(mac_unicast_remove(link->l_mch, link->l_muh));
			link->l_muh = NULL;
		}
		mac_client_close(link->l_mch, 0);
	}
	if (link->l_mh != NULL) {
		mac_close(link->l_mh);
	}
	if (link->l_vm_hold != NULL) {
		vmm_drv_rele(link->l_vm_hold);
		link->l_vm_hold = NULL;
	}

	nip = link->l_neti;
	link->l_neti = NULL;

	viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
	viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
	pollhead_clean(&link->l_pollhead);
	ss->ss_link = NULL;
	mutex_exit(&ss->ss_lock);

	/* Drop the instance from the per-netstack device list */
	mutex_enter(&nip->vni_lock);
	list_remove(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	viona_neti_rele(nip);

	kmem_free(link, sizeof (viona_link_t));
	return (0);
}
1065 
1066 static int
viona_ioc_ring_init(viona_link_t * link,void * udata,int md)1067 viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
1068 {
1069 	vioc_ring_init_t kri;
1070 	int err;
1071 
1072 	if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
1073 		return (EFAULT);
1074 	}
1075 	const struct viona_ring_params params = {
1076 		.vrp_pa = kri.ri_qaddr,
1077 		.vrp_size = kri.ri_qsize,
1078 		.vrp_avail_idx = 0,
1079 		.vrp_used_idx = 0,
1080 	};
1081 
1082 	err = viona_ring_init(link, kri.ri_index, &params);
1083 
1084 	return (err);
1085 }
1086 
1087 static int
viona_ioc_ring_set_state(viona_link_t * link,void * udata,int md)1088 viona_ioc_ring_set_state(viona_link_t *link, void *udata, int md)
1089 {
1090 	vioc_ring_state_t krs;
1091 	int err;
1092 
1093 	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
1094 		return (EFAULT);
1095 	}
1096 	const struct viona_ring_params params = {
1097 		.vrp_pa = krs.vrs_qaddr,
1098 		.vrp_size = krs.vrs_qsize,
1099 		.vrp_avail_idx = krs.vrs_avail_idx,
1100 		.vrp_used_idx = krs.vrs_used_idx,
1101 	};
1102 
1103 	err = viona_ring_init(link, krs.vrs_index, &params);
1104 
1105 	return (err);
1106 }
1107 
1108 static int
viona_ioc_ring_get_state(viona_link_t * link,void * udata,int md)1109 viona_ioc_ring_get_state(viona_link_t *link, void *udata, int md)
1110 {
1111 	vioc_ring_state_t krs;
1112 
1113 	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
1114 		return (EFAULT);
1115 	}
1116 
1117 	struct viona_ring_params params;
1118 	int err = viona_ring_get_state(link, krs.vrs_index, &params);
1119 	if (err != 0) {
1120 		return (err);
1121 	}
1122 	krs.vrs_qsize = params.vrp_size;
1123 	krs.vrs_qaddr = params.vrp_pa;
1124 	krs.vrs_avail_idx = params.vrp_avail_idx;
1125 	krs.vrs_used_idx = params.vrp_used_idx;
1126 
1127 	if (ddi_copyout(&krs, udata, sizeof (krs), md) != 0) {
1128 		return (EFAULT);
1129 	}
1130 	return (0);
1131 }
1132 
1133 static int
viona_ioc_ring_reset(viona_link_t * link,uint_t idx)1134 viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
1135 {
1136 	viona_vring_t *ring;
1137 
1138 	if (idx >= VIONA_VQ_MAX) {
1139 		return (EINVAL);
1140 	}
1141 	ring = &link->l_vrings[idx];
1142 
1143 	return (viona_ring_reset(ring, B_TRUE));
1144 }
1145 
1146 static int
viona_ioc_ring_kick(viona_link_t * link,uint_t idx)1147 viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
1148 {
1149 	viona_vring_t *ring;
1150 	int err;
1151 
1152 	if (idx >= VIONA_VQ_MAX) {
1153 		return (EINVAL);
1154 	}
1155 	ring = &link->l_vrings[idx];
1156 
1157 	mutex_enter(&ring->vr_lock);
1158 	switch (ring->vr_state) {
1159 	case VRS_SETUP:
1160 		/*
1161 		 * An early kick to a ring which is starting its worker thread
1162 		 * is fine.  Once that thread is active, it will process the
1163 		 * start-up request immediately.
1164 		 */
1165 		/* FALLTHROUGH */
1166 	case VRS_INIT:
1167 		ring->vr_state_flags |= VRSF_REQ_START;
1168 		/* FALLTHROUGH */
1169 	case VRS_RUN:
1170 		cv_broadcast(&ring->vr_cv);
1171 		err = 0;
1172 		break;
1173 	default:
1174 		err = EBUSY;
1175 		break;
1176 	}
1177 	mutex_exit(&ring->vr_lock);
1178 
1179 	return (err);
1180 }
1181 
1182 static int
viona_ioc_ring_pause(viona_link_t * link,uint_t idx)1183 viona_ioc_ring_pause(viona_link_t *link, uint_t idx)
1184 {
1185 	if (idx >= VIONA_VQ_MAX) {
1186 		return (EINVAL);
1187 	}
1188 
1189 	viona_vring_t *ring = &link->l_vrings[idx];
1190 	return (viona_ring_pause(ring));
1191 }
1192 
1193 static int
viona_ioc_ring_set_msi(viona_link_t * link,void * data,int md)1194 viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
1195 {
1196 	vioc_ring_msi_t vrm;
1197 	viona_vring_t *ring;
1198 
1199 	if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
1200 		return (EFAULT);
1201 	}
1202 	if (vrm.rm_index >= VIONA_VQ_MAX) {
1203 		return (EINVAL);
1204 	}
1205 
1206 	ring = &link->l_vrings[vrm.rm_index];
1207 	mutex_enter(&ring->vr_lock);
1208 	ring->vr_msi_addr = vrm.rm_addr;
1209 	ring->vr_msi_msg = vrm.rm_msg;
1210 	mutex_exit(&ring->vr_lock);
1211 
1212 	return (0);
1213 }
1214 
1215 static int
viona_notify_iop(void * arg,bool in,uint16_t port,uint8_t bytes,uint32_t * val)1216 viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes,
1217     uint32_t *val)
1218 {
1219 	viona_link_t *link = (viona_link_t *)arg;
1220 
1221 	/*
1222 	 * If the request is a read (in/ins), or direct at a port other than
1223 	 * what we expect to be registered on, ignore it.
1224 	 */
1225 	if (in || port != link->l_notify_ioport) {
1226 		return (ESRCH);
1227 	}
1228 
1229 	/* Let userspace handle notifications for rings other than RX/TX. */
1230 	const uint16_t vq = *val;
1231 	if (vq >= VIONA_VQ_MAX) {
1232 		return (ESRCH);
1233 	}
1234 
1235 	viona_vring_t *ring = &link->l_vrings[vq];
1236 	int res = 0;
1237 
1238 	mutex_enter(&ring->vr_lock);
1239 	if (ring->vr_state == VRS_RUN) {
1240 		cv_broadcast(&ring->vr_cv);
1241 	} else {
1242 		res = ESRCH;
1243 	}
1244 	mutex_exit(&ring->vr_lock);
1245 
1246 	return (res);
1247 }
1248 
1249 static int
viona_ioc_set_notify_ioport(viona_link_t * link,uint16_t ioport)1250 viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport)
1251 {
1252 	int err = 0;
1253 
1254 	if (link->l_notify_ioport != 0) {
1255 		vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
1256 		link->l_notify_ioport = 0;
1257 	}
1258 
1259 	if (ioport != 0) {
1260 		err = vmm_drv_ioport_hook(link->l_vm_hold, ioport,
1261 		    viona_notify_iop, (void *)link, &link->l_notify_cookie);
1262 		if (err == 0) {
1263 			link->l_notify_ioport = ioport;
1264 		}
1265 	}
1266 	return (err);
1267 }
1268 
1269 static int
viona_ioc_set_promisc(viona_link_t * link,viona_promisc_t mode)1270 viona_ioc_set_promisc(viona_link_t *link, viona_promisc_t mode)
1271 {
1272 	int err;
1273 
1274 	if (mode >= VIONA_PROMISC_MAX) {
1275 		return (EINVAL);
1276 	}
1277 
1278 	if (mode == link->l_promisc) {
1279 		return (0);
1280 	}
1281 
1282 	if ((err = viona_rx_set(link, mode)) != 0) {
1283 		return (err);
1284 	}
1285 
1286 	link->l_promisc = mode;
1287 	return (0);
1288 }
1289 
1290 #define	PARAM_NM_TX_COPY_DATA	"tx_copy_data"
1291 #define	PARAM_NM_TX_HEADER_PAD	"tx_header_pad"
1292 
1293 #define	PARAM_ERR_INVALID_TYPE	"invalid type"
1294 #define	PARAM_ERR_OUT_OF_RANGE	"value out of range"
1295 #define	PARAM_ERR_UNK_KEY	"unknown key"
1296 
1297 static nvlist_t *
viona_params_to_nvlist(const viona_link_params_t * vlp)1298 viona_params_to_nvlist(const viona_link_params_t *vlp)
1299 {
1300 	nvlist_t *nvl = fnvlist_alloc();
1301 
1302 	fnvlist_add_boolean_value(nvl, PARAM_NM_TX_COPY_DATA,
1303 	    vlp->vlp_tx_copy_data);
1304 	fnvlist_add_uint16(nvl, PARAM_NM_TX_HEADER_PAD,
1305 	    vlp->vlp_tx_header_pad);
1306 
1307 	return (nvl);
1308 }
1309 
1310 static nvlist_t *
viona_params_from_nvlist(nvlist_t * nvl,viona_link_params_t * vlp)1311 viona_params_from_nvlist(nvlist_t *nvl, viona_link_params_t *vlp)
1312 {
1313 	nvlist_t *nverr = fnvlist_alloc();
1314 	nvpair_t *nvp = NULL;
1315 
1316 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
1317 		const char *name = nvpair_name(nvp);
1318 		const data_type_t dtype = nvpair_type(nvp);
1319 
1320 		if (strcmp(name, PARAM_NM_TX_COPY_DATA) == 0) {
1321 			if (dtype == DATA_TYPE_BOOLEAN_VALUE) {
1322 				vlp->vlp_tx_copy_data =
1323 				    fnvpair_value_boolean_value(nvp);
1324 			} else {
1325 				fnvlist_add_string(nverr, name,
1326 				    PARAM_ERR_INVALID_TYPE);
1327 			}
1328 			continue;
1329 		}
1330 		if (strcmp(name, PARAM_NM_TX_HEADER_PAD) == 0) {
1331 			if (dtype == DATA_TYPE_UINT16) {
1332 				uint16_t value = fnvpair_value_uint16(nvp);
1333 
1334 				if (value > viona_max_header_pad) {
1335 					fnvlist_add_string(nverr, name,
1336 					    PARAM_ERR_OUT_OF_RANGE);
1337 				} else {
1338 					vlp->vlp_tx_header_pad = value;
1339 				}
1340 			} else {
1341 				fnvlist_add_string(nverr, name,
1342 				    PARAM_ERR_INVALID_TYPE);
1343 			}
1344 			continue;
1345 		}
1346 
1347 		/* Reject parameters we do not recognize */
1348 		fnvlist_add_string(nverr, name, PARAM_ERR_UNK_KEY);
1349 	}
1350 
1351 	if (!nvlist_empty(nverr)) {
1352 		return (nverr);
1353 	}
1354 
1355 	nvlist_free(nverr);
1356 	return (NULL);
1357 }
1358 
1359 static void
viona_params_get_defaults(viona_link_params_t * vlp)1360 viona_params_get_defaults(viona_link_params_t *vlp)
1361 {
1362 	vlp->vlp_tx_copy_data = viona_tx_copy_needed();
1363 	vlp->vlp_tx_header_pad = 0;
1364 }
1365 
1366 static int
viona_ioc_get_params(viona_link_t * link,void * udata,int md)1367 viona_ioc_get_params(viona_link_t *link, void *udata, int md)
1368 {
1369 	vioc_get_params_t vgp;
1370 	int err = 0;
1371 
1372 	if (ddi_copyin(udata, &vgp, sizeof (vgp), md) != 0) {
1373 		return (EFAULT);
1374 	}
1375 
1376 	nvlist_t *nvl = NULL;
1377 	if (link != NULL) {
1378 		nvl = viona_params_to_nvlist(&link->l_params);
1379 	} else {
1380 		viona_link_params_t vlp = { 0 };
1381 
1382 		viona_params_get_defaults(&vlp);
1383 		nvl = viona_params_to_nvlist(&vlp);
1384 	}
1385 
1386 	VERIFY(nvl != NULL);
1387 
1388 	size_t packed_sz;
1389 	void *packed = fnvlist_pack(nvl, &packed_sz);
1390 	nvlist_free(nvl);
1391 
1392 	if (packed_sz > vgp.vgp_param_sz) {
1393 		err = E2BIG;
1394 	}
1395 	/* Communicate size, even if the data will not fit */
1396 	vgp.vgp_param_sz = packed_sz;
1397 
1398 	if (err == 0 &&
1399 	    ddi_copyout(packed, vgp.vgp_param, packed_sz, md) != 0) {
1400 		err = EFAULT;
1401 	}
1402 	kmem_free(packed, packed_sz);
1403 
1404 	if (ddi_copyout(&vgp, udata, sizeof (vgp), md) != 0) {
1405 		if (err != 0) {
1406 			err = EFAULT;
1407 		}
1408 	}
1409 
1410 	return (err);
1411 }
1412 
/*
 * Handler for parameter updates: copy in a packed nvlist of parameter
 * changes, validate and apply them atomically to the link (all-or-nothing).
 * Per-field validation errors are packed into an error nvlist which is
 * copied back out to the buffer described by vsp_error/vsp_error_sz.
 */
static int
viona_ioc_set_params(viona_link_t *link, void *udata, int md)
{
	vioc_set_params_t vsp;
	int err = 0;
	nvlist_t *nverr = NULL;

	if (ddi_copyin(udata, &vsp, sizeof (vsp), md) != 0) {
		return (EFAULT);
	}

	/* Bound the size of the packed nvlist we are willing to unpack */
	if (vsp.vsp_param_sz > VIONA_MAX_PARAM_NVLIST_SZ) {
		err = E2BIG;
		goto done;
	} else if (vsp.vsp_param_sz == 0) {
		/*
		 * There is no reason to make this ioctl call with no actual
		 * parameters to be changed.
		 */
		err = EINVAL;
		goto done;
	}

	const size_t packed_sz = vsp.vsp_param_sz;
	void *packed = kmem_alloc(packed_sz, KM_SLEEP);
	if (ddi_copyin(vsp.vsp_param, packed, packed_sz, md) != 0) {
		kmem_free(packed, packed_sz);
		err = EFAULT;
		goto done;
	}

	nvlist_t *parsed = NULL;
	if (nvlist_unpack(packed, packed_sz, &parsed, KM_SLEEP) == 0) {
		/* Use the existing parameters as a starting point */
		viona_link_params_t new_params;
		bcopy(&link->l_params, &new_params,
		    sizeof (new_params));

		nverr = viona_params_from_nvlist(parsed, &new_params);
		if (nverr == NULL) {
			/*
			 * Only apply the updated parameters if there
			 * were no errors during parsing.
			 */
			bcopy(&new_params, &link->l_params,
			    sizeof (new_params));
		} else {
			err = EINVAL;
		}

	} else {
		err = EINVAL;
	}
	nvlist_free(parsed);
	kmem_free(packed, packed_sz);

done:
	if (nverr != NULL) {
		size_t err_packed_sz;
		void *err_packed = fnvlist_pack(nverr, &err_packed_sz);

		if (err_packed_sz > vsp.vsp_error_sz) {
			/*
			 * NOTE(review): err is always EINVAL when nverr is
			 * non-NULL, so this condition always holds here and
			 * replaces EINVAL with E2BIG; confirm whether the
			 * `err == 0` idiom used below was intended instead.
			 */
			if (err != 0) {
				err = E2BIG;
			}
		} else if (ddi_copyout(err_packed, vsp.vsp_error,
		    err_packed_sz, md) != 0 && err == 0) {
			err = EFAULT;
		}
		vsp.vsp_error_sz = err_packed_sz;

		nvlist_free(nverr);
		kmem_free(err_packed, err_packed_sz);
	} else {
		/*
		 * If there are no detailed per-field errors, it is important to
		 * communicate that absence to userspace.
		 */
		vsp.vsp_error_sz = 0;
	}

	/* Copy the (possibly updated) sizes back out to the caller */
	if (ddi_copyout(&vsp, udata, sizeof (vsp), md) != 0 && err == 0) {
		err = EFAULT;
	}

	return (err);
}
1500 
1501 static int
viona_ioc_ring_intr_clear(viona_link_t * link,uint_t idx)1502 viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
1503 {
1504 	if (idx >= VIONA_VQ_MAX) {
1505 		return (EINVAL);
1506 	}
1507 
1508 	link->l_vrings[idx].vr_intr_enabled = 0;
1509 	return (0);
1510 }
1511 
1512 static int
viona_ioc_intr_poll(viona_link_t * link,void * udata,int md,int * rv)1513 viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
1514 {
1515 	uint_t cnt = 0;
1516 	vioc_intr_poll_t vip;
1517 
1518 	for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
1519 		uint_t val = link->l_vrings[i].vr_intr_enabled;
1520 
1521 		vip.vip_status[i] = val;
1522 		if (val != 0) {
1523 			cnt++;
1524 		}
1525 	}
1526 
1527 	if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
1528 		return (EFAULT);
1529 	}
1530 	*rv = (int)cnt;
1531 	return (0);
1532 }
1533