xref: /illumos-gate/usr/src/uts/intel/io/viona/viona_main.c (revision badf94ff3599fab15963f6c532929e9bc411757a)
1 /*
2  * Copyright (c) 2013  Chris Torek <torek @ torek net>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 /*
27  * This file and its contents are supplied under the terms of the
28  * Common Development and Distribution License ("CDDL"), version 1.0.
29  * You may only use this file in accordance with the terms of version
30  * 1.0 of the CDDL.
31  *
32  * A full copy of the text of the CDDL should have accompanied this
33  * source.  A copy of the CDDL is also available via the Internet at
34  * http://www.illumos.org/license/CDDL.
35  *
36  * Copyright 2015 Pluribus Networks Inc.
37  * Copyright 2019 Joyent, Inc.
38  * Copyright 2021 Oxide Computer Company
39  */
40 
41 /*
42  * viona - VirtIO-Net, Accelerated
43  *
44  * The purpose of viona is to provide high performance virtio-net devices to
45  * bhyve guests.  It does so by sitting directly atop MAC, skipping all of the
46  * DLS/DLD stack.
47  *
48  * --------------------
49  * General Architecture
50  * --------------------
51  *
52  * A single viona instance is comprised of a "link" handle and two "rings".
53  * After opening the viona device, it must be associated with a MAC network
54  * interface and a bhyve (vmm) instance to form its link resource.  This is
55  * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
56  * passed in to perform the initialization.  With the MAC client opened, and a
57  * driver handle to the vmm instance established, the device is ready to be
58  * configured by the guest.
59  *
60  * The userspace portion of bhyve, which interfaces with the PCI device
61  * emulation framework, is meant to stay out of the datapath if at all
62  * possible.  Configuration changes made via PCI are mapped to actions which
63  * will steer the operation of the in-kernel logic.
64  *
65  *
66  * -----------
67  * Ring Basics
68  * -----------
69  *
70  * Each viona link has two viona_vring_t entities, RX and TX, for handling data
71  * transfers to and from the guest.  They represent an interface to the
72  * standard virtio ring structures.  When initialized and active, each ring is
73  * backed by a kernel worker thread (parented to the bhyve process for the
74  * instance) which handles ring events.  The RX worker has the simple task of
75  * watching for ring shutdown conditions.  The TX worker does that in addition
76  * to processing all requests to transmit data.  Data destined for the guest is
77  * delivered directly by MAC to viona_rx() when the ring is active.
78  *
79  *
80  * -----------
81  * Ring States
82  * -----------
83  *
84  * The viona_vring_t instances follow a simple path through the possible state
85  * values represented in viona_vring_t`vr_state:
86  *
87  *        +<--------------------------------------------+
88  *        |						|
89  *        V						^
90  *  +-----------+	This is the initial state when a link is created or
91  *  | VRS_RESET |	when the ring has been explicitly reset.
92  *  +-----------+
93  *        |						^
94  *        |---* ioctl(VNA_IOC_RING_INIT) issued		|
95  *        |						|
96  *        |						^
97  *        V
98  *  +-----------+	The ring parameters (size, guest physical addresses)
99  *  | VRS_SETUP |	have been set and start-up of the ring worker thread
100  *  +-----------+	has begun.
101  *        |						^
102  *        |						|
103  *        |---* ring worker thread begins execution	|
104  *        |						|
105  *        +-------------------------------------------->+
106  *        |	      |					^
107  *        |	      |
108  *        |	      *	If ring shutdown is requested (by ioctl or impending
109  *        |		bhyve process death) while the worker thread is
110  *        |		starting, the worker will transition the ring to
111  *        |		VRS_RESET and exit.
112  *        |						^
113  *        |						|
114  *        |						^
115  *        V
116  *  +-----------+	The worker thread associated with the ring has started
117  *  | VRS_INIT  |	executing.  It has allocated any extra resources needed
118  *  +-----------+	for the ring to operate.
119  *        |						^
120  *        |						|
121  *        +-------------------------------------------->+
122  *        |	      |					^
123  *        |	      |
124  *        |	      *	If ring shutdown is requested while the worker is
125  *        |		waiting in VRS_INIT, it will free any extra resources
126  *        |		and transition to VRS_RESET.
127  *        |						^
128  *        |						|
129  *        |--* ioctl(VNA_IOC_RING_KICK) issued		|
130  *        |						^
131  *        V
132  *  +-----------+	The worker thread associated with the ring is executing
133  *  | VRS_RUN   |	workload specific to that ring.
134  *  +-----------+
135  *        |						^
136  *        |---* ioctl(VNA_IOC_RING_RESET) issued	|
137  *        |	(or bhyve process begins exit)		^
138  *        |
139  *  +-----------+	The worker thread associated with the ring is in the
140  *  | VRS_STOP  |	process of exiting. All outstanding TX and RX
141  *  +-----------+	requests are allowed to complete, but new requests
142  *        |		must be ignored.
143  *        |						^
144  *        |						|
145  *        +-------------------------------------------->+
146  *
147  *
148  * While the worker thread is not running, changes to vr_state are only made by
149  * viona_ioc_ring_init() under vr_lock.  There, it initializes the ring, starts
150  * the worker, and sets the ring state to VRS_SETUP.  Once the worker thread
151  * has been started, only it may perform ring state transitions (still under
152  * the protection of vr_lock), when requested by outside consumers via
153  * vr_state_flags or when the containing bhyve process initiates an exit.
154  *
155  *
156  * ----------------------------
157  * Transmission mblk_t Handling
158  * ----------------------------
159  *
160  * For incoming frames destined for a bhyve guest, the data must first land in
161  * a host OS buffer from the physical NIC before it is copied into the awaiting
162  * guest buffer(s).  Outbound frames transmitted by the guest are not bound by
163  * this limitation and can avoid extra copying before the buffers are accessed
164  * directly by the NIC.  When a guest designates buffers to be transmitted,
165  * viona translates the guest-physical addresses contained in the ring
166  * descriptors to host-virtual addresses via viona_hold_page().  That pointer is
167  * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc().
168  * Doing so increments vr_xfer_outstanding, preventing the ring from being
169  * reset (allowing the link to drop its vmm handle to the guest) until all
170  * transmit mblks referencing guest memory have been processed.  Allocation of
171  * the viona_desb_t entries is done during the VRS_INIT stage of the ring
172  * worker thread.  The ring size informs that allocation as the number of
173  * concurrent transmissions is limited by the number of descriptors in the
174  * ring.  This minimizes allocation in the transmit hot-path by acquiring those
175  * fixed-size resources during initialization.
176  *
177  * This optimization depends on the underlying NIC driver freeing the mblks in
178  * a timely manner after they have been transmitted by the hardware.  Some
179  * drivers have been found to flush TX descriptors only when new transmissions
180  * are initiated.  This means that there is no upper bound to the time needed
181  * for an mblk to be flushed and can stall bhyve guests from shutting down
182  * since their memory must be free of viona TX references prior to clean-up.
183  *
184  * This expectation of deterministic mblk_t processing is likely the reason
185  * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
186  * loaded will copy transmit data into fresh buffers rather than passing up
187  * zero-copy mblks.  It is a hold-over from the original viona sources provided
188  * by Pluribus and its continued necessity has not been confirmed.
189  *
190  *
191  * ----------------------------
192  * Ring Notification Fast-paths
193  * ----------------------------
194  *
195  * Device operation for viona requires that notifications flow to and from the
196  * guest to indicate certain ring conditions.  In order to minimize latency and
197  * processing overhead, the notification procedures are kept in-kernel whenever
198  * possible.
199  *
200  * Guest-to-host notifications, when new available descriptors have been placed
201  * in the ring, are posted via the 'queue notify' address in the virtio BAR.
202  * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to
203  * install a callback hook on an ioport address.  Guest exits for accesses to
204  * viona-hooked ioport addresses will result in direct calls to notify the
205  * appropriate ring worker without a trip to userland.
206  *
207  * Host-to-guest notifications in the form of interrupts enjoy similar
208  * acceleration.  Each viona ring can be configured to send MSI notifications
209  * to the guest as virtio conditions dictate.  This in-kernel interrupt
210  * configuration is kept synchronized through viona ioctls which are utilized
211  * during writes to the associated PCI config registers or MSI-X BAR.
212  *
213  * Guests which do not utilize MSI-X cause viona to fall back to the slow path
214  * for interrupts.  Userspace bhyve will poll(2) the viona handle, receiving
215  * notification when ring events necessitate the assertion of an interrupt.
216  *
217  *
218  * ---------------
219  * Nethook Support
220  * ---------------
221  *
222  * Viona provides four nethook events that consumers (e.g. ipf) can hook into
223  * to intercept packets as they go up or down the stack.  Unfortunately,
224  * the nethook framework does not understand raw packets, so we can only
225  * generate events (in, out) for IPv4 and IPv6 packets.  At driver attach,
226  * we register callbacks with the neti (netinfo) module that will be invoked
227  * for each netstack already present, as well as for any additional netstack
228  * instances created as the system operates.  These callbacks will
229  * register/unregister the hooks with the nethook framework for each
230  * netstack instance.  This registration occurs prior to creating any
231  * viona instances for a given netstack, and the unregistration for a netstack
232  * instance occurs after all viona instances of the netstack instance have
233  * been deleted.
234  */
235 
236 #include <sys/conf.h>
237 #include <sys/file.h>
238 #include <sys/stat.h>
239 
240 #include <sys/dlpi.h>
241 
242 #include "viona_impl.h"
243 
244 
/* Description string handed to the module framework via modldrv. */
#define	VIONA_NAME		"Virtio Network Accelerator"

/* Minor number of the single control node created at attach time. */
#define	VIONA_CTL_MINOR		0

/* MAC client name prefix; the datalink ID is appended when a link is made. */
#define	VIONA_CLI_NAME		"viona"
248 
249 
/*
 * Host capabilities.
 *
 * Feature bits which are always offered, regardless of what the underlying
 * MAC resource can do.  They are combined with the per-link l_features_hw
 * bits when features are reported to, or accepted from, the consumer.
 */
#define	VIONA_S_HOSTCAPS	(		\
	VIRTIO_F_RING_NOTIFY_ON_EMPTY |		\
	VIRTIO_F_RING_INDIRECT_DESC |		\
	VIRTIO_NET_F_GUEST_CSUM |		\
	VIRTIO_NET_F_GUEST_TSO4 |		\
	VIRTIO_NET_F_MAC |			\
	VIRTIO_NET_F_MRG_RXBUF |		\
	VIRTIO_NET_F_STATUS)
261 
/*
 * MAC_CAPAB_HCKSUM specifics of interest: if the MAC reports any of these,
 * checksum offload is advertised as a hardware-backed feature.
 */
#define	VIONA_CAP_HCKSUM_INTEREST	(	\
	HCKSUM_INET_FULL_V6 |			\
	HCKSUM_INET_FULL_V4 |			\
	HCKSUM_INET_PARTIAL)
267 
268 static void		*viona_state;
269 static dev_info_t	*viona_dip;
270 static id_space_t	*viona_minors;
271 
272 
273 static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
274     void **result);
275 static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
276 static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
277 static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
278 static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
279 static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
280     cred_t *credp, int *rval);
281 static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
282     struct pollhead **phpp);
283 
284 static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
285 static int viona_ioc_delete(viona_soft_state_t *, boolean_t);
286 
287 static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t);
288 static int viona_ioc_ring_init(viona_link_t *, void *, int);
289 static int viona_ioc_ring_reset(viona_link_t *, uint_t);
290 static int viona_ioc_ring_kick(viona_link_t *, uint_t);
291 static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
292 static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
293 static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);
294 
295 static struct cb_ops viona_cb_ops = {
296 	viona_open,
297 	viona_close,
298 	nodev,
299 	nodev,
300 	nodev,
301 	nodev,
302 	nodev,
303 	viona_ioctl,
304 	nodev,
305 	nodev,
306 	nodev,
307 	viona_chpoll,
308 	ddi_prop_op,
309 	0,
310 	D_MP | D_NEW | D_HOTPLUG,
311 	CB_REV,
312 	nodev,
313 	nodev
314 };
315 
316 static struct dev_ops viona_ops = {
317 	DEVO_REV,
318 	0,
319 	viona_info,
320 	nulldev,
321 	nulldev,
322 	viona_attach,
323 	viona_detach,
324 	nodev,
325 	&viona_cb_ops,
326 	NULL,
327 	ddi_power,
328 	ddi_quiesce_not_needed
329 };
330 
331 static struct modldrv modldrv = {
332 	&mod_driverops,
333 	VIONA_NAME,
334 	&viona_ops,
335 };
336 
337 static struct modlinkage modlinkage = {
338 	MODREV_1, &modldrv, NULL
339 };
340 
341 int
342 _init(void)
343 {
344 	int ret;
345 
346 	ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0);
347 	if (ret != 0) {
348 		return (ret);
349 	}
350 
351 	viona_minors = id_space_create("viona_minors",
352 	    VIONA_CTL_MINOR + 1, UINT16_MAX);
353 	viona_rx_init();
354 	mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);
355 
356 	ret = mod_install(&modlinkage);
357 	if (ret != 0) {
358 		ddi_soft_state_fini(&viona_state);
359 		id_space_destroy(viona_minors);
360 		viona_rx_fini();
361 		mutex_destroy(&viona_force_copy_lock);
362 	}
363 
364 	return (ret);
365 }
366 
367 int
368 _fini(void)
369 {
370 	int ret;
371 
372 	ret = mod_remove(&modlinkage);
373 	if (ret != 0) {
374 		return (ret);
375 	}
376 
377 	ddi_soft_state_fini(&viona_state);
378 	id_space_destroy(viona_minors);
379 	viona_rx_fini();
380 	mutex_destroy(&viona_force_copy_lock);
381 
382 	return (ret);
383 }
384 
385 int
386 _info(struct modinfo *modinfop)
387 {
388 	return (mod_info(&modlinkage, modinfop));
389 }
390 
391 /* ARGSUSED */
392 static int
393 viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
394 {
395 	int error;
396 
397 	switch (cmd) {
398 	case DDI_INFO_DEVT2DEVINFO:
399 		*result = (void *)viona_dip;
400 		error = DDI_SUCCESS;
401 		break;
402 	case DDI_INFO_DEVT2INSTANCE:
403 		*result = (void *)0;
404 		error = DDI_SUCCESS;
405 		break;
406 	default:
407 		error = DDI_FAILURE;
408 		break;
409 	}
410 	return (error);
411 }
412 
413 static int
414 viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
415 {
416 	if (cmd != DDI_ATTACH) {
417 		return (DDI_FAILURE);
418 	}
419 
420 	if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
421 	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
422 		return (DDI_FAILURE);
423 	}
424 
425 	viona_neti_attach();
426 
427 	viona_dip = dip;
428 	ddi_report_dev(viona_dip);
429 
430 	return (DDI_SUCCESS);
431 }
432 
433 static int
434 viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
435 {
436 	dev_info_t *old_dip = viona_dip;
437 
438 	if (cmd != DDI_DETACH) {
439 		return (DDI_FAILURE);
440 	}
441 
442 	VERIFY(old_dip != NULL);
443 
444 	viona_neti_detach();
445 	viona_dip = NULL;
446 	ddi_remove_minor_node(old_dip, NULL);
447 
448 	return (DDI_SUCCESS);
449 }
450 
451 static int
452 viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
453 {
454 	int	minor;
455 	viona_soft_state_t *ss;
456 
457 	if (otype != OTYP_CHR) {
458 		return (EINVAL);
459 	}
460 #if 0
461 	/*
462 	 * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
463 	 * Should the check be at open() or ioctl()?
464 	 */
465 	if (drv_priv(credp) != 0) {
466 		return (EPERM);
467 	}
468 #endif
469 	if (getminor(*devp) != VIONA_CTL_MINOR) {
470 		return (ENXIO);
471 	}
472 
473 	minor = id_alloc_nosleep(viona_minors);
474 	if (minor == -1) {
475 		/* All minors are busy */
476 		return (EBUSY);
477 	}
478 	if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
479 		id_free(viona_minors, minor);
480 		return (ENOMEM);
481 	}
482 
483 	ss = ddi_get_soft_state(viona_state, minor);
484 	mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
485 	*devp = makedevice(getmajor(*devp), minor);
486 
487 	return (0);
488 }
489 
490 static int
491 viona_close(dev_t dev, int flag, int otype, cred_t *credp)
492 {
493 	int			minor;
494 	viona_soft_state_t	*ss;
495 
496 	if (otype != OTYP_CHR) {
497 		return (EINVAL);
498 	}
499 
500 	minor = getminor(dev);
501 
502 	ss = ddi_get_soft_state(viona_state, minor);
503 	if (ss == NULL) {
504 		return (ENXIO);
505 	}
506 
507 	VERIFY0(viona_ioc_delete(ss, B_TRUE));
508 	VERIFY(!list_link_active(&ss->ss_node));
509 	ddi_soft_state_free(viona_state, minor);
510 	id_free(viona_minors, minor);
511 
512 	return (0);
513 }
514 
515 static int
516 viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
517 {
518 	viona_soft_state_t *ss;
519 	void *dptr = (void *)data;
520 	int err = 0, val;
521 	viona_link_t *link;
522 
523 	ss = ddi_get_soft_state(viona_state, getminor(dev));
524 	if (ss == NULL) {
525 		return (ENXIO);
526 	}
527 
528 	switch (cmd) {
529 	case VNA_IOC_CREATE:
530 		return (viona_ioc_create(ss, dptr, md, cr));
531 	case VNA_IOC_DELETE:
532 		return (viona_ioc_delete(ss, B_FALSE));
533 	default:
534 		break;
535 	}
536 
537 	mutex_enter(&ss->ss_lock);
538 	if ((link = ss->ss_link) == NULL || link->l_destroyed ||
539 	    vmm_drv_release_reqd(link->l_vm_hold)) {
540 		mutex_exit(&ss->ss_lock);
541 		return (ENXIO);
542 	}
543 
544 	switch (cmd) {
545 	case VNA_IOC_GET_FEATURES:
546 		val = VIONA_S_HOSTCAPS | link->l_features_hw;
547 		if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
548 			err = EFAULT;
549 		}
550 		break;
551 	case VNA_IOC_SET_FEATURES:
552 		if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
553 			err = EFAULT;
554 			break;
555 		}
556 		val &= (VIONA_S_HOSTCAPS | link->l_features_hw);
557 
558 		if ((val & VIRTIO_NET_F_CSUM) == 0)
559 			val &= ~VIRTIO_NET_F_HOST_TSO4;
560 
561 		if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
562 			val &= ~VIRTIO_NET_F_GUEST_TSO4;
563 
564 		link->l_features = val;
565 		break;
566 	case VNA_IOC_RING_INIT:
567 		err = viona_ioc_ring_init(link, dptr, md);
568 		break;
569 	case VNA_IOC_RING_RESET:
570 		err = viona_ioc_ring_reset(link, (uint_t)data);
571 		break;
572 	case VNA_IOC_RING_KICK:
573 		err = viona_ioc_ring_kick(link, (uint_t)data);
574 		break;
575 	case VNA_IOC_RING_SET_MSI:
576 		err = viona_ioc_ring_set_msi(link, dptr, md);
577 		break;
578 	case VNA_IOC_RING_INTR_CLR:
579 		err = viona_ioc_ring_intr_clear(link, (uint_t)data);
580 		break;
581 	case VNA_IOC_INTR_POLL:
582 		err = viona_ioc_intr_poll(link, dptr, md, rv);
583 		break;
584 	case VNA_IOC_SET_NOTIFY_IOP:
585 		if (data < 0 || data > UINT16_MAX) {
586 			err = EINVAL;
587 			break;
588 		}
589 		err = viona_ioc_set_notify_ioport(link, (uint16_t)data);
590 		break;
591 	default:
592 		err = ENOTTY;
593 		break;
594 	}
595 
596 	mutex_exit(&ss->ss_lock);
597 	return (err);
598 }
599 
600 static int
601 viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
602     struct pollhead **phpp)
603 {
604 	viona_soft_state_t *ss;
605 	viona_link_t *link;
606 
607 	ss = ddi_get_soft_state(viona_state, getminor(dev));
608 	if (ss == NULL) {
609 		return (ENXIO);
610 	}
611 
612 	mutex_enter(&ss->ss_lock);
613 	if ((link = ss->ss_link) == NULL || link->l_destroyed) {
614 		mutex_exit(&ss->ss_lock);
615 		return (ENXIO);
616 	}
617 
618 	*reventsp = 0;
619 	if ((events & POLLRDBAND) != 0) {
620 		for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
621 			if (link->l_vrings[i].vr_intr_enabled != 0) {
622 				*reventsp |= POLLRDBAND;
623 				break;
624 			}
625 		}
626 	}
627 	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
628 		*phpp = &link->l_pollhead;
629 	}
630 	mutex_exit(&ss->ss_lock);
631 
632 	return (0);
633 }
634 
635 static void
636 viona_get_mac_capab(viona_link_t *link)
637 {
638 	mac_handle_t mh = link->l_mh;
639 	uint32_t cap = 0;
640 	mac_capab_lso_t lso_cap;
641 
642 	link->l_features_hw = 0;
643 	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
644 		/*
645 		 * Only report HW checksum ability if the underlying MAC
646 		 * resource is capable of populating the L4 header.
647 		 */
648 		if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
649 			link->l_features_hw |= VIRTIO_NET_F_CSUM;
650 		}
651 		link->l_cap_csum = cap;
652 	}
653 
654 	if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
655 	    mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
656 		/*
657 		 * Virtio doesn't allow for negotiating a maximum LSO
658 		 * packet size. We have to assume that the guest may
659 		 * send a maximum length IP packet. Make sure the
660 		 * underlying MAC can handle an LSO of this size.
661 		 */
662 		if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
663 		    lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
664 			link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
665 	}
666 }
667 
668 static int
669 viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
670 {
671 	vioc_create_t	kvc;
672 	viona_link_t	*link = NULL;
673 	char		cli_name[MAXNAMELEN];
674 	int		err = 0;
675 	file_t		*fp;
676 	vmm_hold_t	*hold = NULL;
677 	viona_neti_t	*nip = NULL;
678 	zoneid_t	zid;
679 
680 	ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));
681 
682 	if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
683 		return (EFAULT);
684 	}
685 
686 	zid = crgetzoneid(cr);
687 	nip = viona_neti_lookup_by_zid(zid);
688 	if (nip == NULL) {
689 		return (EIO);
690 	}
691 
692 	if (!nip->vni_nethook.vnh_hooked) {
693 		viona_neti_rele(nip);
694 		return (EIO);
695 	}
696 
697 	mutex_enter(&ss->ss_lock);
698 	if (ss->ss_link != NULL) {
699 		mutex_exit(&ss->ss_lock);
700 		viona_neti_rele(nip);
701 		return (EEXIST);
702 	}
703 
704 	if ((fp = getf(kvc.c_vmfd)) == NULL) {
705 		err = EBADF;
706 		goto bail;
707 	}
708 	err = vmm_drv_hold(fp, cr, &hold);
709 	releasef(kvc.c_vmfd);
710 	if (err != 0) {
711 		goto bail;
712 	}
713 
714 	link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
715 	link->l_linkid = kvc.c_linkid;
716 	link->l_vm_hold = hold;
717 
718 	err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
719 	if (err != 0) {
720 		goto bail;
721 	}
722 
723 	viona_get_mac_capab(link);
724 
725 	(void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME,
726 	    link->l_linkid);
727 	err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
728 	if (err != 0) {
729 		goto bail;
730 	}
731 
732 	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
733 	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);
734 
735 	if ((err = viona_rx_set(link)) != 0) {
736 		viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
737 		viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
738 		goto bail;
739 	}
740 
741 	link->l_neti = nip;
742 	ss->ss_link = link;
743 	mutex_exit(&ss->ss_lock);
744 
745 	mutex_enter(&nip->vni_lock);
746 	list_insert_tail(&nip->vni_dev_list, ss);
747 	mutex_exit(&nip->vni_lock);
748 
749 	return (0);
750 
751 bail:
752 	if (link != NULL) {
753 		if (link->l_mch != NULL) {
754 			mac_client_close(link->l_mch, 0);
755 		}
756 		if (link->l_mh != NULL) {
757 			mac_close(link->l_mh);
758 		}
759 		kmem_free(link, sizeof (viona_link_t));
760 	}
761 	if (hold != NULL) {
762 		vmm_drv_rele(hold);
763 	}
764 	viona_neti_rele(nip);
765 
766 	mutex_exit(&ss->ss_lock);
767 	return (err);
768 }
769 
770 static int
771 viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
772 {
773 	viona_link_t *link;
774 	viona_neti_t *nip = NULL;
775 
776 	mutex_enter(&ss->ss_lock);
777 	if ((link = ss->ss_link) == NULL) {
778 		/* Link destruction already complete */
779 		mutex_exit(&ss->ss_lock);
780 		return (0);
781 	}
782 
783 	if (link->l_destroyed) {
784 		/*
785 		 * Link destruction has been started by another thread, but has
786 		 * not completed.  This condition should be impossible to
787 		 * encounter when performing the on-close destroy of the link,
788 		 * since racing ioctl accessors must necessarily be absent.
789 		 */
790 		VERIFY(!on_close);
791 		mutex_exit(&ss->ss_lock);
792 		return (EAGAIN);
793 	}
794 	/*
795 	 * The link deletion cannot fail after this point, continuing until its
796 	 * successful completion is reached.
797 	 */
798 	link->l_destroyed = B_TRUE;
799 
800 	/*
801 	 * Tear down the IO port hook so it cannot be used to kick any of the
802 	 * rings which are about to be reset and stopped.
803 	 */
804 	VERIFY0(viona_ioc_set_notify_ioport(link, 0));
805 	mutex_exit(&ss->ss_lock);
806 
807 	/*
808 	 * Return the rings to their reset state, ignoring any possible
809 	 * interruptions from signals.
810 	 */
811 	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
812 	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));
813 
814 	mutex_enter(&ss->ss_lock);
815 	if (link->l_mch != NULL) {
816 		/* Unhook the receive callbacks and close out the client */
817 		viona_rx_clear(link);
818 		mac_client_close(link->l_mch, 0);
819 	}
820 	if (link->l_mh != NULL) {
821 		mac_close(link->l_mh);
822 	}
823 	if (link->l_vm_hold != NULL) {
824 		vmm_drv_rele(link->l_vm_hold);
825 		link->l_vm_hold = NULL;
826 	}
827 
828 	nip = link->l_neti;
829 	link->l_neti = NULL;
830 
831 	viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
832 	viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
833 	pollhead_clean(&link->l_pollhead);
834 	ss->ss_link = NULL;
835 	mutex_exit(&ss->ss_lock);
836 
837 	mutex_enter(&nip->vni_lock);
838 	list_remove(&nip->vni_dev_list, ss);
839 	mutex_exit(&nip->vni_lock);
840 
841 	viona_neti_rele(nip);
842 
843 	kmem_free(link, sizeof (viona_link_t));
844 	return (0);
845 }
846 
847 static int
848 viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
849 {
850 	vioc_ring_init_t kri;
851 	int err;
852 
853 	if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
854 		return (EFAULT);
855 	}
856 
857 	err = viona_ring_init(link, kri.ri_index, kri.ri_qsize, kri.ri_qaddr);
858 
859 	return (err);
860 }
861 
862 static int
863 viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
864 {
865 	viona_vring_t *ring;
866 
867 	if (idx >= VIONA_VQ_MAX) {
868 		return (EINVAL);
869 	}
870 	ring = &link->l_vrings[idx];
871 
872 	return (viona_ring_reset(ring, B_TRUE));
873 }
874 
875 static int
876 viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
877 {
878 	viona_vring_t *ring;
879 	int err;
880 
881 	if (idx >= VIONA_VQ_MAX) {
882 		return (EINVAL);
883 	}
884 	ring = &link->l_vrings[idx];
885 
886 	mutex_enter(&ring->vr_lock);
887 	switch (ring->vr_state) {
888 	case VRS_SETUP:
889 		/*
890 		 * An early kick to a ring which is starting its worker thread
891 		 * is fine.  Once that thread is active, it will process the
892 		 * start-up request immediately.
893 		 */
894 		/* FALLTHROUGH */
895 	case VRS_INIT:
896 		ring->vr_state_flags |= VRSF_REQ_START;
897 		/* FALLTHROUGH */
898 	case VRS_RUN:
899 		cv_broadcast(&ring->vr_cv);
900 		err = 0;
901 		break;
902 	default:
903 		err = EBUSY;
904 		break;
905 	}
906 	mutex_exit(&ring->vr_lock);
907 
908 	return (err);
909 }
910 
911 static int
912 viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
913 {
914 	vioc_ring_msi_t vrm;
915 	viona_vring_t *ring;
916 
917 	if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
918 		return (EFAULT);
919 	}
920 	if (vrm.rm_index >= VIONA_VQ_MAX) {
921 		return (EINVAL);
922 	}
923 
924 	ring = &link->l_vrings[vrm.rm_index];
925 	mutex_enter(&ring->vr_lock);
926 	ring->vr_msi_addr = vrm.rm_addr;
927 	ring->vr_msi_msg = vrm.rm_msg;
928 	mutex_exit(&ring->vr_lock);
929 
930 	return (0);
931 }
932 
933 static int
934 viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes,
935     uint32_t *val)
936 {
937 	viona_link_t *link = (viona_link_t *)arg;
938 	uint16_t vq = *val;
939 
940 	if (in) {
941 		/*
942 		 * Do not service read (in/ins) requests on this ioport.
943 		 * Instead, indicate that the handler is not found, causing a
944 		 * fallback to userspace processing.
945 		 */
946 		return (ESRCH);
947 	}
948 
949 	if (port != link->l_notify_ioport) {
950 		return (EINVAL);
951 	}
952 	return (viona_ioc_ring_kick(link, vq));
953 }
954 
955 static int
956 viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport)
957 {
958 	int err = 0;
959 
960 	if (link->l_notify_ioport != 0) {
961 		vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
962 		link->l_notify_ioport = 0;
963 	}
964 
965 	if (ioport != 0) {
966 		err = vmm_drv_ioport_hook(link->l_vm_hold, ioport,
967 		    viona_notify_iop, (void *)link, &link->l_notify_cookie);
968 		if (err == 0) {
969 			link->l_notify_ioport = ioport;
970 		}
971 	}
972 	return (err);
973 }
974 
975 static int
976 viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
977 {
978 	if (idx >= VIONA_VQ_MAX) {
979 		return (EINVAL);
980 	}
981 
982 	link->l_vrings[idx].vr_intr_enabled = 0;
983 	return (0);
984 }
985 
986 static int
987 viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
988 {
989 	uint_t cnt = 0;
990 	vioc_intr_poll_t vip;
991 
992 	for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
993 		uint_t val = link->l_vrings[i].vr_intr_enabled;
994 
995 		vip.vip_status[i] = val;
996 		if (val != 0) {
997 			cnt++;
998 		}
999 	}
1000 
1001 	if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
1002 		return (EFAULT);
1003 	}
1004 	*rv = (int)cnt;
1005 	return (0);
1006 }
1007