/*
 * Copyright (c) 2013  Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2023 Oxide Computer Company
 */

/*
 * viona - VirtIO-Net, Accelerated
 *
 * The purpose of viona is to provide high performance virtio-net devices to
 * bhyve guests.  It does so by sitting directly atop MAC, skipping all of the
 * DLS/DLD stack.
 *
 * --------------------
 * General Architecture
 * --------------------
 *
 * A single viona instance is comprised of a "link" handle and two "rings".
 * After opening the viona device, it must be associated with a MAC network
 * interface and a bhyve (vmm) instance to form its link resource.  This is
 * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
 * passed in to perform the initialization.  With the MAC client opened, and a
 * driver handle to the vmm instance established, the device is ready to be
 * configured by the guest.
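 *
 * A minimal sketch of that sequence from userspace, assuming the device
 * node is exposed as /dev/viona (c_linkid is the datalink ID of the MAC
 * interface and c_vmfd an open fd to the vmm instance, matching the
 * vioc_create_t fields consumed by viona_ioc_create() below):
 *
 *	int fd = open("/dev/viona", O_RDWR);
 *	vioc_create_t vc = { .c_linkid = linkid, .c_vmfd = vmfd };
 *
 *	if (fd < 0 || ioctl(fd, VNA_IOC_CREATE, &vc) != 0)
 *		err(EXIT_FAILURE, "failed to create viona link");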
 *
 * The userspace portion of bhyve, which interfaces with the PCI device
 * emulation framework, is meant to stay out of the datapath if at all
 * possible.  Configuration changes made via PCI are mapped to actions which
 * will steer the operation of the in-kernel logic.
 *
 *
 * -----------
 * Ring Basics
 * -----------
 *
 * Each viona link has two viona_vring_t entities, RX and TX, for handling data
 * transfers to and from the guest.  They represent an interface to the
 * standard virtio ring structures.  When initialized and active, each ring is
 * backed by a kernel worker thread (parented to the bhyve process for the
 * instance) which handles ring events.  The RX worker has the simple task of
 * watching for ring shutdown conditions.  The TX worker does that in addition
 * to processing all requests to transmit data.  Data destined for the guest is
 * delivered directly by MAC to viona_rx() when the ring is active.
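 *
 * Conceptually, that RX hand-off is a standard MAC client callback
 * registration along these lines (a sketch only: the real registration in
 * viona_rx_set() also handles the multicast/promiscuous modes described
 * later, so the exact calls differ):
 *
 *	mac_rx_set(link->l_mch, viona_rx, link);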
 *
 *
 * -----------
 * Ring States
 * -----------
 *
 * The viona_vring_t instances follow a simple path through the possible state
 * values represented in viona_vring_t`vr_state:
 *
 *        +<--------------------------------------------+
 *        |                                             |
 *        V                                             ^
 *  +-----------+       This is the initial state when a link is created or
 *  | VRS_RESET |       when the ring has been explicitly reset.
 *  +-----------+
 *        |                                             ^
 *        |---* ioctl(VNA_IOC_RING_INIT) issued         |
 *        |                                             |
 *        |                                             ^
 *        V
 *  +-----------+       The ring parameters (size, guest physical addresses)
 *  | VRS_SETUP |       have been set and start-up of the ring worker thread
 *  +-----------+       has begun.
 *        |                                             ^
 *        |                                             |
 *        |---* ring worker thread begins execution     |
 *        |                                             |
 *        +-------------------------------------------->+
 *        |           |                                 ^
 *        |           |
 *        |           * If ring shutdown is requested (by ioctl or impending
 *        |             bhyve process death) while the worker thread is
 *        |             starting, the worker will transition the ring to
 *        |             VRS_RESET and exit.
 *        |                                             ^
 *        |                                             |
 *        |<-------------------------------------------<+
 *        |           |                                 |
 *        |           |                                 ^
 *        |           * If ring is requested to pause (but not stop) from
 *        |             the VRS_RUN state, it will return to the VRS_INIT
 *        |             state.
 *        |
 *        |                                             ^
 *        |                                             |
 *        |                                             ^
 *        V
 *  +-----------+       The worker thread associated with the ring has started
 *  | VRS_INIT  |       executing.  It has allocated any extra resources needed
 *  +-----------+       for the ring to operate.
 *        |                                             ^
 *        |                                             |
 *        +-------------------------------------------->+
 *        |           |                                 ^
 *        |           |
 *        |           * If ring shutdown is requested while the worker is
 *        |             waiting in VRS_INIT, it will free any extra resources
 *        |             and transition to VRS_RESET.
 *        |                                             ^
 *        |                                             |
 *        |--* ioctl(VNA_IOC_RING_KICK) issued          |
 *        |                                             ^
 *        V
 *  +-----------+       The worker thread associated with the ring is executing
 *  | VRS_RUN   |       workload specific to that ring.
 *  +-----------+
 *        |                                             ^
 *        |---* ioctl(VNA_IOC_RING_RESET) issued        |
 *        |     (or bhyve process begins exit)          ^
 *        |
 *  +-----------+       The worker thread associated with the ring is in the
 *  | VRS_STOP  |       process of exiting. All outstanding TX and RX
 *  +-----------+       requests are allowed to complete, but new requests
 *        |             must be ignored.
 *        |                                             ^
 *        |                                             |
 *        +-------------------------------------------->+
 *
 *
 * While the worker thread is not running, changes to vr_state are only made by
 * viona_ioc_ring_init() under vr_lock.  There, it initializes the ring, starts
 * the worker, and sets the ring state to VRS_SETUP.  Once the worker thread
 * has been started, only it may perform ring state transitions (still under
 * the protection of vr_lock), when requested by outside consumers via
 * vr_state_flags or when the containing bhyve process initiates an exit.
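 *
 * The request pattern used by those outside consumers mirrors
 * viona_ioc_ring_kick() below: post a request flag and wake the worker,
 * which then performs the actual VRS_* transition itself.  For example:
 *
 *	mutex_enter(&ring->vr_lock);
 *	ring->vr_state_flags |= VRSF_REQ_START;
 *	cv_broadcast(&ring->vr_cv);
 *	mutex_exit(&ring->vr_lock);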
 *
 *
 * ----------------------------
 * Transmission mblk_t Handling
 * ----------------------------
 *
 * For incoming frames destined for a bhyve guest, the data must first land in
 * a host OS buffer from the physical NIC before it is copied into the awaiting
 * guest buffer(s).  Outbound frames transmitted by the guest are not bound by
 * this limitation and can avoid extra copying before the buffers are accessed
 * directly by the NIC.  When a guest designates buffers to be transmitted,
 * viona translates the guest-physical addresses contained in the ring
 * descriptors to host-virtual addresses via viona_hold_page().  That pointer
 * is wrapped in an mblk_t using a preallocated viona_desb_t for the
 * desballoc().  Doing so increments vr_xfer_outstanding, preventing the ring
 * from being reset (allowing the link to drop its vmm handle to the guest)
 * until all transmit mblks referencing guest memory have been processed.
 * Allocation of the viona_desb_t entries is done during the VRS_INIT stage of
 * the ring worker thread.  The ring size informs that allocation as the number
 * of concurrent transmissions is limited by the number of descriptors in the
 * ring.  This minimizes allocation in the transmit hot-path by acquiring those
 * fixed-size resources during initialization.
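 *
 * The wrapping step amounts to a desballoc() over the guest buffer with a
 * free routine which drops the outstanding-transfer count.  A sketch, where
 * the d_frtn field and viona_desb_release() callback are assumptions for
 * illustration (the real logic lives in the TX code, not this file):
 *
 *	dp->d_frtn.free_func = viona_desb_release;
 *	dp->d_frtn.free_arg = (caddr_t)dp;
 *	mblk_t *mp = desballoc((uchar_t *)host_va, len, 0, &dp->d_frtn);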
 *
 * This optimization depends on the underlying NIC driver freeing the mblks in
 * a timely manner after they have been transmitted by the hardware.  Some
 * drivers have been found to flush TX descriptors only when new transmissions
 * are initiated.  This means that there is no upper bound on the time needed
 * for an mblk to be flushed, which can stall bhyve guests from shutting down,
 * since their memory must be free of viona TX references prior to clean-up.
 *
 * This expectation of deterministic mblk_t processing is likely the reason
 * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
 * loaded will copy transmit data into fresh buffers rather than passing up
 * zero-copy mblks.  It is a hold-over from the original viona sources provided
 * by Pluribus and its continued necessity has not been confirmed.
 *
 *
 * ----------------------------
 * Ring Notification Fast-paths
 * ----------------------------
 *
 * Device operation for viona requires that notifications flow to and from the
 * guest to indicate certain ring conditions.  In order to minimize latency and
 * processing overhead, the notification procedures are kept in-kernel whenever
 * possible.
 *
 * Guest-to-host notifications, when new available descriptors have been placed
 * in the ring, are posted via the 'queue notify' address in the virtio BAR.
 * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to
 * install a callback hook on an ioport address.  Guest exits for accesses to
 * viona-hooked ioport addresses will result in direct calls to notify the
 * appropriate ring worker without a trip to userland.
 *
 * Host-to-guest notifications in the form of interrupts enjoy similar
 * acceleration.  Each viona ring can be configured to send MSI notifications
 * to the guest as virtio conditions dictate.  This in-kernel interrupt
 * configuration is kept synchronized through viona ioctls which are utilized
 * during writes to the associated PCI config registers or MSI-X BAR.
 *
 * For guests which do not utilize MSI-X, viona falls back to the slow path
 * for interrupts: userspace will poll(2) the viona handle, receiving
 * notification when ring events necessitate the assertion of an interrupt.
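 *
 * A sketch of that slow path from the userspace side, assuming viona_fd is
 * the open link descriptor (POLLRDBAND and the ioctls below match the
 * chpoll and ioctl handlers in this file):
 *
 *	struct pollfd pfd = { .fd = viona_fd, .events = POLLRDBAND };
 *	vioc_intr_poll_t vip;
 *
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLRDBAND) != 0 &&
 *	    ioctl(viona_fd, VNA_IOC_INTR_POLL, &vip) >= 0) {
 *		... assert an interrupt for each ring with
 *		vip.vip_status[i] set, then acknowledge it via
 *		ioctl(viona_fd, VNA_IOC_RING_INTR_CLR, i) ...
 *	}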
 *
 *
 * ---------------
 * Nethook Support
 * ---------------
 *
 * Viona provides four nethook events that consumers (e.g. ipf) can hook into
 * to intercept packets as they go up or down the stack.  Unfortunately,
 * the nethook framework does not understand raw packets, so we can only
 * generate events (in, out) for IPv4 and IPv6 packets.  At driver attach,
 * we register callbacks with the neti (netinfo) module that will be invoked
 * for each netstack already present, as well as for any additional netstack
 * instances created as the system operates.  These callbacks will
 * register/unregister the hooks with the nethook framework for each
 * netstack instance.  This registration occurs prior to creating any
 * viona instances for a given netstack, and the unregistration for a netstack
 * instance occurs after all viona instances of the netstack instance have
 * been deleted.
 */

#include <sys/conf.h>
#include <sys/file.h>
#include <sys/stat.h>

#include <sys/dlpi.h>
#include <sys/vlan.h>

#include "viona_impl.h"


#define	VIONA_NAME		"Virtio Network Accelerator"
#define	VIONA_CTL_MINOR		0
#define	VIONA_CLI_NAME		"viona"		/* MAC client name */


/*
 * Host capabilities.
 */
#define	VIONA_S_HOSTCAPS	(	\
	VIRTIO_NET_F_GUEST_CSUM |	\
	VIRTIO_NET_F_MAC |		\
	VIRTIO_NET_F_GUEST_TSO4 |	\
	VIRTIO_NET_F_MRG_RXBUF |	\
	VIRTIO_NET_F_STATUS |		\
	VIRTIO_F_RING_NOTIFY_ON_EMPTY |	\
	VIRTIO_F_RING_INDIRECT_DESC)

/* MAC_CAPAB_HCKSUM specifics of interest */
#define	VIONA_CAP_HCKSUM_INTEREST	\
	(HCKSUM_INET_PARTIAL |		\
	HCKSUM_INET_FULL_V4 |		\
	HCKSUM_INET_FULL_V6)

static void		*viona_state;
static dev_info_t	*viona_dip;
static id_space_t	*viona_minors;


static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
    void **result);
static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
    cred_t *credp, int *rval);
static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);

static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
static int viona_ioc_delete(viona_soft_state_t *, boolean_t);

static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t);
static int viona_ioc_set_promisc(viona_link_t *, viona_promisc_t);
static int viona_ioc_ring_init(viona_link_t *, void *, int);
static int viona_ioc_ring_set_state(viona_link_t *, void *, int);
static int viona_ioc_ring_get_state(viona_link_t *, void *, int);
static int viona_ioc_ring_reset(viona_link_t *, uint_t);
static int viona_ioc_ring_kick(viona_link_t *, uint_t);
static int viona_ioc_ring_pause(viona_link_t *, uint_t);
static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);

static struct cb_ops viona_cb_ops = {
	viona_open,		/* cb_open */
	viona_close,		/* cb_close */
	nodev,			/* cb_strategy */
	nodev,			/* cb_print */
	nodev,			/* cb_dump */
	nodev,			/* cb_read */
	nodev,			/* cb_write */
	viona_ioctl,		/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	viona_chpoll,		/* cb_chpoll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* cb_str */
	D_MP | D_NEW | D_HOTPLUG,	/* cb_flag */
	CB_REV,			/* cb_rev */
	nodev,			/* cb_aread */
	nodev			/* cb_awrite */
};

static struct dev_ops viona_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	viona_info,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	viona_attach,		/* devo_attach */
	viona_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&viona_cb_ops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	ddi_power,		/* devo_power */
	ddi_quiesce_not_needed	/* devo_quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,		/* drv_modops */
	VIONA_NAME,		/* drv_linkinfo */
	&viona_ops,		/* drv_dev_ops */
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	int ret;

	ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t),
	    0);
	if (ret != 0) {
		return (ret);
	}

	viona_minors = id_space_create("viona_minors",
	    VIONA_CTL_MINOR + 1, UINT16_MAX);
	viona_rx_init();
	mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);

	ret = mod_install(&modlinkage);
	if (ret != 0) {
		ddi_soft_state_fini(&viona_state);
		id_space_destroy(viona_minors);
		viona_rx_fini();
		mutex_destroy(&viona_force_copy_lock);
	}

	return (ret);
}

int
_fini(void)
{
	int ret;

	ret = mod_remove(&modlinkage);
	if (ret != 0) {
		return (ret);
	}

	ddi_soft_state_fini(&viona_state);
	id_space_destroy(viona_minors);
	viona_rx_fini();
	mutex_destroy(&viona_force_copy_lock);

	return (ret);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/* ARGSUSED */
static int
viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)viona_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

static int
viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		return (DDI_FAILURE);
	}

	viona_neti_attach();

	viona_dip = dip;
	ddi_report_dev(viona_dip);

	return (DDI_SUCCESS);
}

static int
viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	dev_info_t *old_dip = viona_dip;

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	VERIFY(old_dip != NULL);

	viona_neti_detach();
	viona_dip = NULL;
	ddi_remove_minor_node(old_dip, NULL);

	return (DDI_SUCCESS);
}

static int
viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
{
	int	minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}
#if 0
	/*
	 * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
	 * Should the check be at open() or ioctl()?
	 */
	if (drv_priv(credp) != 0) {
		return (EPERM);
	}
#endif
	if (getminor(*devp) != VIONA_CTL_MINOR) {
		return (ENXIO);
	}

	minor = id_alloc_nosleep(viona_minors);
	if (minor == -1) {
		/* All minors are busy */
		return (EBUSY);
	}
	if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
		id_free(viona_minors, minor);
		return (ENOMEM);
	}

	ss = ddi_get_soft_state(viona_state, minor);
	mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
	*devp = makedevice(getmajor(*devp), minor);

	return (0);
}

static int
viona_close(dev_t dev, int flag, int otype, cred_t *credp)
{
	int			minor;
	viona_soft_state_t	*ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}

	minor = getminor(dev);

	ss = ddi_get_soft_state(viona_state, minor);
	if (ss == NULL) {
		return (ENXIO);
	}

	VERIFY0(viona_ioc_delete(ss, B_TRUE));
	VERIFY(!list_link_active(&ss->ss_node));
	ddi_soft_state_free(viona_state, minor);
	id_free(viona_minors, minor);

	return (0);
}

static int
viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
{
	viona_soft_state_t *ss;
	void *dptr = (void *)data;
	int err = 0, val;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_CREATE:
		return (viona_ioc_create(ss, dptr, md, cr));
	case VNA_IOC_DELETE:
		return (viona_ioc_delete(ss, B_FALSE));
	case VNA_IOC_VERSION:
		*rv = VIONA_CURRENT_INTERFACE_VERSION;
		return (0);
	default:
		break;
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed ||
	    vmm_drv_release_reqd(link->l_vm_hold)) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_GET_FEATURES:
		val = VIONA_S_HOSTCAPS | link->l_features_hw;
		if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
			err = EFAULT;
		}
		break;
	case VNA_IOC_SET_FEATURES:
		if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
			err = EFAULT;
			break;
		}
		val &= (VIONA_S_HOSTCAPS | link->l_features_hw);

		if ((val & VIRTIO_NET_F_CSUM) == 0)
			val &= ~VIRTIO_NET_F_HOST_TSO4;

		if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
			val &= ~VIRTIO_NET_F_GUEST_TSO4;

		link->l_features = val;
		break;
	case VNA_IOC_RING_INIT:
		err = viona_ioc_ring_init(link, dptr, md);
		break;
	case VNA_IOC_RING_RESET:
		err = viona_ioc_ring_reset(link, (uint_t)data);
		break;
	case VNA_IOC_RING_KICK:
		err = viona_ioc_ring_kick(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_MSI:
		err = viona_ioc_ring_set_msi(link, dptr, md);
		break;
	case VNA_IOC_RING_INTR_CLR:
		err = viona_ioc_ring_intr_clear(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_STATE:
		err = viona_ioc_ring_set_state(link, dptr, md);
		break;
	case VNA_IOC_RING_GET_STATE:
		err = viona_ioc_ring_get_state(link, dptr, md);
		break;
	case VNA_IOC_RING_PAUSE:
		err = viona_ioc_ring_pause(link, (uint_t)data);
		break;

	case VNA_IOC_INTR_POLL:
		err = viona_ioc_intr_poll(link, dptr, md, rv);
		break;
	case VNA_IOC_SET_NOTIFY_IOP:
		if (data < 0 || data > UINT16_MAX) {
			err = EINVAL;
			break;
		}
		err = viona_ioc_set_notify_ioport(link, (uint16_t)data);
		break;
	case VNA_IOC_SET_PROMISC:
		err = viona_ioc_set_promisc(link, (viona_promisc_t)data);
		break;
	default:
		err = ENOTTY;
		break;
	}

	mutex_exit(&ss->ss_lock);
	return (err);
}

static int
viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	viona_soft_state_t *ss;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	*reventsp = 0;
	if ((events & POLLRDBAND) != 0) {
		for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
			if (link->l_vrings[i].vr_intr_enabled != 0) {
				*reventsp |= POLLRDBAND;
				break;
			}
		}
	}
	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
		*phpp = &link->l_pollhead;
	}
	mutex_exit(&ss->ss_lock);

	return (0);
}

static void
viona_get_mac_capab(viona_link_t *link)
{
	mac_handle_t mh = link->l_mh;
	uint32_t cap = 0;
	mac_capab_lso_t lso_cap;

	link->l_features_hw = 0;
	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
		/*
		 * Only report HW checksum ability if the underlying MAC
		 * resource is capable of populating the L4 header.
		 */
		if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
			link->l_features_hw |= VIRTIO_NET_F_CSUM;
		}
		link->l_cap_csum = cap;
	}

	if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
	    mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
		/*
		 * Virtio doesn't allow for negotiating a maximum LSO
		 * packet size. We have to assume that the guest may
		 * send a maximum length IP packet. Make sure the
		 * underlying MAC can handle an LSO of this size.
		 */
		if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
		    lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
			link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
	}
}

static int
viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
{
	vioc_create_t	kvc;
	viona_link_t	*link = NULL;
	char		cli_name[MAXNAMELEN];
	int		err = 0;
	file_t		*fp;
	vmm_hold_t	*hold = NULL;
	viona_neti_t	*nip = NULL;
	zoneid_t	zid;
	mac_diag_t	mac_diag = MAC_DIAG_NONE;

	ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));

	if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
		return (EFAULT);
	}

	zid = crgetzoneid(cr);
	nip = viona_neti_lookup_by_zid(zid);
	if (nip == NULL) {
		return (EIO);
	}

	if (!nip->vni_nethook.vnh_hooked) {
		viona_neti_rele(nip);
		return (EIO);
	}

	mutex_enter(&ss->ss_lock);
	if (ss->ss_link != NULL) {
		mutex_exit(&ss->ss_lock);
		viona_neti_rele(nip);
		return (EEXIST);
	}

	if ((fp = getf(kvc.c_vmfd)) == NULL) {
		err = EBADF;
		goto bail;
	}
	err = vmm_drv_hold(fp, cr, &hold);
	releasef(kvc.c_vmfd);
	if (err != 0) {
		goto bail;
	}

	link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
	link->l_linkid = kvc.c_linkid;
	link->l_vm_hold = hold;

	err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
	if (err != 0) {
		goto bail;
	}

	viona_get_mac_capab(link);

	(void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME,
	    link->l_linkid);
	err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
	if (err != 0) {
		goto bail;
	}

	err = mac_unicast_add(link->l_mch, NULL, MAC_UNICAST_PRIMARY,
	    &link->l_muh, VLAN_ID_NONE, &mac_diag);
	if (err != 0) {
		goto bail;
	}

	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);

	/*
	 * Default to passing up all multicast traffic in addition to
	 * classified unicast. Guests which have support will change this
	 * if they need to via the virtio net control queue; guests without
	 * support generally still want to see multicast.
	 */
	link->l_promisc = VIONA_PROMISC_MULTI;
	if ((err = viona_rx_set(link, link->l_promisc)) != 0) {
		viona_rx_clear(link);
		viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
		viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
		goto bail;
	}

	link->l_neti = nip;
	ss->ss_link = link;
	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_insert_tail(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	return (0);

bail:
	if (link != NULL) {
		if (link->l_mch != NULL) {
			if (link->l_muh != NULL) {
				VERIFY0(mac_unicast_remove(link->l_mch,
				    link->l_muh));
				link->l_muh = NULL;
			}
			mac_client_close(link->l_mch, 0);
		}
		if (link->l_mh != NULL) {
			mac_close(link->l_mh);
		}
		kmem_free(link, sizeof (viona_link_t));
	}
	if (hold != NULL) {
		vmm_drv_rele(hold);
	}
	viona_neti_rele(nip);

	mutex_exit(&ss->ss_lock);
	return (err);
}

static int
viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
{
	viona_link_t *link;
	viona_neti_t *nip = NULL;

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL) {
		/* Link destruction already complete */
		mutex_exit(&ss->ss_lock);
		return (0);
	}

	if (link->l_destroyed) {
		/*
		 * Link destruction has been started by another thread, but has
		 * not completed.  This condition should be impossible to
		 * encounter when performing the on-close destroy of the link,
		 * since racing ioctl accessors must necessarily be absent.
		 */
		VERIFY(!on_close);
		mutex_exit(&ss->ss_lock);
		return (EAGAIN);
	}
	/*
	 * The link deletion cannot fail after this point, continuing until its
	 * successful completion is reached.
	 */
	link->l_destroyed = B_TRUE;

	/*
	 * Tear down the IO port hook so it cannot be used to kick any of the
	 * rings which are about to be reset and stopped.
	 */
	VERIFY0(viona_ioc_set_notify_ioport(link, 0));
	mutex_exit(&ss->ss_lock);

	/*
	 * Return the rings to their reset state, ignoring any possible
	 * interruptions from signals.
	 */
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));

	mutex_enter(&ss->ss_lock);
	if (link->l_mch != NULL) {
		/* Unhook the receive callbacks and close out the client */
		viona_rx_clear(link);
		if (link->l_muh != NULL) {
			VERIFY0(mac_unicast_remove(link->l_mch, link->l_muh));
			link->l_muh = NULL;
		}
		mac_client_close(link->l_mch, 0);
	}
	if (link->l_mh != NULL) {
		mac_close(link->l_mh);
	}
	if (link->l_vm_hold != NULL) {
		vmm_drv_rele(link->l_vm_hold);
		link->l_vm_hold = NULL;
	}

	nip = link->l_neti;
	link->l_neti = NULL;

	viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
	viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
	pollhead_clean(&link->l_pollhead);
	ss->ss_link = NULL;
	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_remove(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	viona_neti_rele(nip);

	kmem_free(link, sizeof (viona_link_t));
	return (0);
}

static int
viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
{
	vioc_ring_init_t kri;
	int err;

	if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
		return (EFAULT);
	}
	const struct viona_ring_params params = {
		.vrp_pa = kri.ri_qaddr,
		.vrp_size = kri.ri_qsize,
		.vrp_avail_idx = 0,
		.vrp_used_idx = 0,
	};

	err = viona_ring_init(link, kri.ri_index, &params);

	return (err);
}

static int
viona_ioc_ring_set_state(viona_link_t *link, void *udata, int md)
{
	vioc_ring_state_t krs;
	int err;

	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
		return (EFAULT);
	}
	const struct viona_ring_params params = {
		.vrp_pa = krs.vrs_qaddr,
		.vrp_size = krs.vrs_qsize,
		.vrp_avail_idx = krs.vrs_avail_idx,
		.vrp_used_idx = krs.vrs_used_idx,
	};

	err = viona_ring_init(link, krs.vrs_index, &params);

	return (err);
}

static int
viona_ioc_ring_get_state(viona_link_t *link, void *udata, int md)
{
	vioc_ring_state_t krs;

	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
		return (EFAULT);
	}

	struct viona_ring_params params;
	int err = viona_ring_get_state(link, krs.vrs_index, &params);
	if (err != 0) {
		return (err);
	}
	krs.vrs_qsize = params.vrp_size;
	krs.vrs_qaddr = params.vrp_pa;
	krs.vrs_avail_idx = params.vrp_avail_idx;
	krs.vrs_used_idx = params.vrp_used_idx;

	if (ddi_copyout(&krs, udata, sizeof (krs), md) != 0) {
		return (EFAULT);
	}
	return (0);
}

static int
viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	return (viona_ring_reset(ring, B_TRUE));
}

static int
viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;
	int err;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	mutex_enter(&ring->vr_lock);
	switch (ring->vr_state) {
	case VRS_SETUP:
		/*
		 * An early kick to a ring which is starting its worker thread
		 * is fine.  Once that thread is active, it will process the
		 * start-up request immediately.
		 */
		/* FALLTHROUGH */
	case VRS_INIT:
		ring->vr_state_flags |= VRSF_REQ_START;
		/* FALLTHROUGH */
	case VRS_RUN:
		cv_broadcast(&ring->vr_cv);
		err = 0;
		break;
	default:
		err = EBUSY;
		break;
	}
	mutex_exit(&ring->vr_lock);

	return (err);
}

static int
viona_ioc_ring_pause(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	viona_vring_t *ring = &link->l_vrings[idx];
	return (viona_ring_pause(ring));
}

static int
viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
{
	vioc_ring_msi_t vrm;
	viona_vring_t *ring;

	if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
		return (EFAULT);
	}
	if (vrm.rm_index >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	ring = &link->l_vrings[vrm.rm_index];
	mutex_enter(&ring->vr_lock);
	ring->vr_msi_addr = vrm.rm_addr;
	ring->vr_msi_msg = vrm.rm_msg;
	mutex_exit(&ring->vr_lock);

	return (0);
}

static int
viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes,
    uint32_t *val)
{
	viona_link_t *link = (viona_link_t *)arg;

	/*
	 * If the request is a read (in/ins), or directed at a port other than
	 * what we expect to be registered on, ignore it.
	 */
	if (in || port != link->l_notify_ioport) {
		return (ESRCH);
	}

	/* Let userspace handle notifications for rings other than RX/TX. */
	const uint16_t vq = *val;
	if (vq >= VIONA_VQ_MAX) {
		return (ESRCH);
	}

	viona_vring_t *ring = &link->l_vrings[vq];
	int res = 0;

	mutex_enter(&ring->vr_lock);
	if (ring->vr_state == VRS_RUN) {
		cv_broadcast(&ring->vr_cv);
	} else {
		res = ESRCH;
	}
	mutex_exit(&ring->vr_lock);

	return (res);
}

static int
viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport)
{
	int err = 0;

	if (link->l_notify_ioport != 0) {
		vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
		link->l_notify_ioport = 0;
	}

	if (ioport != 0) {
		err = vmm_drv_ioport_hook(link->l_vm_hold, ioport,
		    viona_notify_iop, (void *)link, &link->l_notify_cookie);
		if (err == 0) {
			link->l_notify_ioport = ioport;
		}
	}
	return (err);
}
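
/*
 * Usage sketch (hypothetical caller): userspace wires up the queue-notify
 * fast-path when the guest programs the virtio BAR by passing the decoded
 * port number directly as the ioctl argument; a port of 0 tears the hook
 * down, as viona_ioc_delete() does above.
 *
 *	(void) ioctl(viona_fd, VNA_IOC_SET_NOTIFY_IOP, notify_port);
 */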

static int
viona_ioc_set_promisc(viona_link_t *link, viona_promisc_t mode)
{
	int err;

	if (mode >= VIONA_PROMISC_MAX) {
		return (EINVAL);
	}

	if (mode == link->l_promisc) {
		return (0);
	}

	if ((err = viona_rx_set(link, mode)) != 0) {
		return (err);
	}

	link->l_promisc = mode;
	return (0);
}

static int
viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	link->l_vrings[idx].vr_intr_enabled = 0;
	return (0);
}

static int
viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
{
	uint_t cnt = 0;
	vioc_intr_poll_t vip;

	for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
		uint_t val = link->l_vrings[i].vr_intr_enabled;

		vip.vip_status[i] = val;
		if (val != 0) {
			cnt++;
		}
	}

	if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
		return (EFAULT);
	}
	*rv = (int)cnt;
	return (0);
}
1157