xref: /illumos-gate/usr/src/uts/common/io/vioif/vioif.c (revision c686756220120076a07be0dcce54be698101a3d1)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2013 Nexenta Inc.  All rights reserved.
14  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
15  * Copyright 2019 Joyent, Inc.
16  * Copyright 2019 Joshua M. Clulow <josh@sysmgr.org>
17  */
18 
19 /* Based on the NetBSD virtio driver by Minoura Makoto. */
20 /*
21  * Copyright (c) 2010 Minoura Makoto.
22  * All rights reserved.
23  *
24  * Redistribution and use in source and binary forms, with or without
25  * modification, are permitted provided that the following conditions
26  * are met:
27  * 1. Redistributions of source code must retain the above copyright
28  *    notice, this list of conditions and the following disclaimer.
29  * 2. Redistributions in binary form must reproduce the above copyright
30  *    notice, this list of conditions and the following disclaimer in the
31  *    documentation and/or other materials provided with the distribution.
32  *
33  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
34  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
35  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
36  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
37  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
38  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
39  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
40  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
41  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
42  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43  */
44 
45 /*
46  * VIRTIO NETWORK DRIVER
47  */
48 
49 #include <sys/types.h>
50 #include <sys/errno.h>
51 #include <sys/param.h>
52 #include <sys/stropts.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/kmem.h>
56 #include <sys/conf.h>
57 #include <sys/devops.h>
58 #include <sys/ksynch.h>
59 #include <sys/stat.h>
60 #include <sys/modctl.h>
61 #include <sys/debug.h>
62 #include <sys/pci.h>
63 #include <sys/ethernet.h>
64 #include <sys/vlan.h>
65 #include <sys/sysmacros.h>
66 #include <sys/smbios.h>
67 
68 #include <sys/dlpi.h>
69 #include <sys/taskq.h>
70 
71 #include <sys/pattr.h>
72 #include <sys/strsun.h>
73 
74 #include <sys/random.h>
75 #include <sys/containerof.h>
76 #include <sys/stream.h>
77 #include <inet/tcp.h>
78 
79 #include <sys/mac.h>
80 #include <sys/mac_provider.h>
81 #include <sys/mac_ether.h>
82 
83 #include "virtio.h"
84 #include "vioif.h"
85 
86 
87 static int vioif_quiesce(dev_info_t *);
88 static int vioif_attach(dev_info_t *, ddi_attach_cmd_t);
89 static int vioif_detach(dev_info_t *, ddi_detach_cmd_t);
90 static boolean_t vioif_has_feature(vioif_t *, uint32_t);
91 static void vioif_reclaim_restart(vioif_t *);
92 static int vioif_m_stat(void *, uint_t, uint64_t *);
93 static void vioif_m_stop(void *);
94 static int vioif_m_start(void *);
95 static int vioif_m_multicst(void *, boolean_t, const uint8_t *);
96 static int vioif_m_setpromisc(void *, boolean_t);
97 static int vioif_m_unicst(void *, const uint8_t *);
98 static mblk_t *vioif_m_tx(void *, mblk_t *);
99 static int vioif_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
100     const void *);
101 static int vioif_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
102 static void vioif_m_propinfo(void *, const char *, mac_prop_id_t,
103     mac_prop_info_handle_t);
104 static boolean_t vioif_m_getcapab(void *, mac_capab_t, void *);
105 static uint_t vioif_add_rx(vioif_t *);
106 
107 
108 static struct cb_ops vioif_cb_ops = {
109 	.cb_rev =			CB_REV,
110 	.cb_flag =			D_MP | D_NEW,
111 
112 	.cb_open =			nulldev,
113 	.cb_close =			nulldev,
114 	.cb_strategy =			nodev,
115 	.cb_print =			nodev,
116 	.cb_dump =			nodev,
117 	.cb_read =			nodev,
118 	.cb_write =			nodev,
119 	.cb_ioctl =			nodev,
120 	.cb_devmap =			nodev,
121 	.cb_mmap =			nodev,
122 	.cb_segmap =			nodev,
123 	.cb_chpoll =			nochpoll,
124 	.cb_prop_op =			ddi_prop_op,
125 	.cb_str =			NULL,
126 	.cb_aread =			nodev,
127 	.cb_awrite =			nodev,
128 };
129 
130 static struct dev_ops vioif_dev_ops = {
131 	.devo_rev =			DEVO_REV,
132 	.devo_refcnt =			0,
133 
134 	.devo_attach =			vioif_attach,
135 	.devo_detach =			vioif_detach,
136 	.devo_quiesce =			vioif_quiesce,
137 
138 	.devo_cb_ops =			&vioif_cb_ops,
139 
140 	.devo_getinfo =			NULL,
141 	.devo_identify =		nulldev,
142 	.devo_probe =			nulldev,
143 	.devo_reset =			nodev,
144 	.devo_bus_ops =			NULL,
145 	.devo_power =			NULL,
146 };
147 
148 static struct modldrv vioif_modldrv = {
149 	.drv_modops =			&mod_driverops,
150 	.drv_linkinfo =			"VIRTIO network driver",
151 	.drv_dev_ops =			&vioif_dev_ops
152 };
153 
154 static struct modlinkage vioif_modlinkage = {
155 	.ml_rev =			MODREV_1,
156 	.ml_linkage =			{ &vioif_modldrv, NULL }
157 };
158 
159 static mac_callbacks_t vioif_mac_callbacks = {
160 	.mc_getstat =			vioif_m_stat,
161 	.mc_start =			vioif_m_start,
162 	.mc_stop =			vioif_m_stop,
163 	.mc_setpromisc =		vioif_m_setpromisc,
164 	.mc_multicst =			vioif_m_multicst,
165 	.mc_unicst =			vioif_m_unicst,
166 	.mc_tx =			vioif_m_tx,
167 
168 	.mc_callbacks =			(MC_GETCAPAB | MC_SETPROP |
169 					    MC_GETPROP | MC_PROPINFO),
170 	.mc_getcapab =			vioif_m_getcapab,
171 	.mc_setprop =			vioif_m_setprop,
172 	.mc_getprop =			vioif_m_getprop,
173 	.mc_propinfo =			vioif_m_propinfo,
174 };
175 
176 static const uchar_t vioif_broadcast[ETHERADDRL] = {
177 	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
178 };
179 
180 /*
181  * Interval for the periodic TX reclaim.
182  */
183 uint_t vioif_reclaim_ms = 200;
184 
185 /*
186  * Allow the operator to override the kinds of interrupts we'll use for
187  * vioif.  The default of -1 applies the driver's own heuristics; it can be
188  * set to 0 (no restriction) or to a DDI_INTR_TYPE_* mask in /etc/system.
189  */
190 int vioif_allowed_int_types = -1;
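/*
 * Both tunables above can be adjusted from /etc/system.  As a purely
 * illustrative sketch (not recommended values), the following lines would
 * restrict vioif to fixed interrupts (DDI_INTR_TYPE_FIXED is 0x1) and
 * shorten the reclaim interval:
 *
 *	set vioif:vioif_allowed_int_types = 0x1
 *	set vioif:vioif_reclaim_ms = 100
 */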
191 
192 /*
193  * DMA attribute template for transmit and receive buffers.  The SGL entry
194  * count will be modified before using the template.  Note that these
195  * allocations are aligned so that VIOIF_HEADER_SKIP places the IP header in
196  * received frames at the correct offset for the networking stack.
197  */
198 ddi_dma_attr_t vioif_dma_attr_bufs = {
199 	.dma_attr_version =		DMA_ATTR_V0,
200 	.dma_attr_addr_lo =		0x0000000000000000,
201 	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
202 	.dma_attr_count_max =		0x00000000FFFFFFFF,
203 	.dma_attr_align =		VIOIF_HEADER_ALIGN,
204 	.dma_attr_burstsizes =		1,
205 	.dma_attr_minxfer =		1,
206 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
207 	.dma_attr_seg =			0x00000000FFFFFFFF,
208 	.dma_attr_sgllen =		0,
209 	.dma_attr_granular =		1,
210 	.dma_attr_flags =		0
211 };
212 
213 /*
214  * DMA attributes for mapping larger transmit buffers from the networking
215  * stack.  The requirements are quite loose, but note that the SGL entry length
216  * field is 32-bit.
217  */
218 ddi_dma_attr_t vioif_dma_attr_external = {
219 	.dma_attr_version =		DMA_ATTR_V0,
220 	.dma_attr_addr_lo =		0x0000000000000000,
221 	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
222 	.dma_attr_count_max =		0x00000000FFFFFFFF,
223 	.dma_attr_align =		1,
224 	.dma_attr_burstsizes =		1,
225 	.dma_attr_minxfer =		1,
226 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
227 	.dma_attr_seg =			0x00000000FFFFFFFF,
228 	.dma_attr_sgllen =		VIOIF_MAX_SEGS - 1,
229 	.dma_attr_granular =		1,
230 	.dma_attr_flags =		0
231 };
232 
233 
234 /*
235  * VIRTIO NET MAC PROPERTIES
236  */
237 #define	VIOIF_MACPROP_TXCOPY_THRESH	"_txcopy_thresh"
238 #define	VIOIF_MACPROP_TXCOPY_THRESH_DEF	300
239 #define	VIOIF_MACPROP_TXCOPY_THRESH_MAX	640
240 
241 #define	VIOIF_MACPROP_RXCOPY_THRESH	"_rxcopy_thresh"
242 #define	VIOIF_MACPROP_RXCOPY_THRESH_DEF	300
243 #define	VIOIF_MACPROP_RXCOPY_THRESH_MAX	640
244 
245 static char *vioif_priv_props[] = {
246 	VIOIF_MACPROP_TXCOPY_THRESH,
247 	VIOIF_MACPROP_RXCOPY_THRESH,
248 	NULL
249 };
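/*
 * The copy thresholds above are exposed as private link properties and can
 * be read or adjusted at runtime with dladm(1M); for example (the link name
 * "vioif0" is illustrative only):
 *
 *	dladm show-linkprop -p _txcopy_thresh vioif0
 *	dladm set-linkprop -p _rxcopy_thresh=256 vioif0
 */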
250 
251 
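/*
 * The following helpers manage the preallocated transmit and receive buffer
 * free lists.  Buffers are removed from and returned to the lists under
 * vif_mutex, with vif_ntxbufs_alloc and vif_nrxbufs_alloc tracking how many
 * are currently outstanding.
 */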
252 static vioif_txbuf_t *
253 vioif_txbuf_alloc(vioif_t *vif)
254 {
255 	vioif_txbuf_t *tb;
256 
257 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
258 
259 	if ((tb = list_remove_head(&vif->vif_txbufs)) != NULL) {
260 		vif->vif_ntxbufs_alloc++;
261 	}
262 
263 	return (tb);
264 }
265 
266 static void
267 vioif_txbuf_free(vioif_t *vif, vioif_txbuf_t *tb)
268 {
269 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
270 
271 	VERIFY3U(vif->vif_ntxbufs_alloc, >, 0);
272 	vif->vif_ntxbufs_alloc--;
273 
274 	virtio_chain_clear(tb->tb_chain);
275 	list_insert_head(&vif->vif_txbufs, tb);
276 }
277 
278 static vioif_rxbuf_t *
279 vioif_rxbuf_alloc(vioif_t *vif)
280 {
281 	vioif_rxbuf_t *rb;
282 
283 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
284 
285 	if ((rb = list_remove_head(&vif->vif_rxbufs)) != NULL) {
286 		vif->vif_nrxbufs_alloc++;
287 	}
288 
289 	return (rb);
290 }
291 
292 static void
293 vioif_rxbuf_free(vioif_t *vif, vioif_rxbuf_t *rb)
294 {
295 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
296 
297 	VERIFY3U(vif->vif_nrxbufs_alloc, >, 0);
298 	vif->vif_nrxbufs_alloc--;
299 
300 	virtio_chain_clear(rb->rb_chain);
301 	list_insert_head(&vif->vif_rxbufs, rb);
302 }
303 
304 static void
305 vioif_rx_free_callback(caddr_t free_arg)
306 {
307 	vioif_rxbuf_t *rb = (vioif_rxbuf_t *)free_arg;
308 	vioif_t *vif = rb->rb_vioif;
309 
310 	mutex_enter(&vif->vif_mutex);
311 
312 	/*
313 	 * Return this receive buffer to the free list.
314 	 */
315 	vioif_rxbuf_free(vif, rb);
316 
317 	VERIFY3U(vif->vif_nrxbufs_onloan, >, 0);
318 	vif->vif_nrxbufs_onloan--;
319 
320 	/*
321 	 * Attempt to replenish the receive queue with at least the buffer we
322 	 * just freed.  There isn't a great way to deal with failure here, but
323 	 * because we only loan out at most half of the buffers, some should
324 	 * always remain available even if this fails.
325 	 */
326 	(void) vioif_add_rx(vif);
327 
328 	mutex_exit(&vif->vif_mutex);
329 }
330 
331 static void
332 vioif_free_bufs(vioif_t *vif)
333 {
334 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
335 
336 	VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);
337 	for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
338 		vioif_txbuf_t *tb = &vif->vif_txbufs_mem[i];
339 
340 		/*
341 		 * Ensure that this txbuf is now in the free list:
342 		 */
343 		VERIFY(list_link_active(&tb->tb_link));
344 		list_remove(&vif->vif_txbufs, tb);
345 
346 		/*
347 		 * We should not have an mblk chain at this point.
348 		 */
349 		VERIFY3P(tb->tb_mp, ==, NULL);
350 
351 		if (tb->tb_dma != NULL) {
352 			virtio_dma_free(tb->tb_dma);
353 			tb->tb_dma = NULL;
354 		}
355 
356 		if (tb->tb_chain != NULL) {
357 			virtio_chain_free(tb->tb_chain);
358 			tb->tb_chain = NULL;
359 		}
360 
361 		if (tb->tb_dmaext != NULL) {
362 			for (uint_t j = 0; j < tb->tb_dmaext_capacity; j++) {
363 				if (tb->tb_dmaext[j] != NULL) {
364 					virtio_dma_free(
365 					    tb->tb_dmaext[j]);
366 					tb->tb_dmaext[j] = NULL;
367 				}
368 			}
369 
370 			kmem_free(tb->tb_dmaext,
371 			    sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity);
372 			tb->tb_dmaext = NULL;
373 			tb->tb_dmaext_capacity = 0;
374 		}
375 	}
376 	VERIFY(list_is_empty(&vif->vif_txbufs));
377 	if (vif->vif_txbufs_mem != NULL) {
378 		kmem_free(vif->vif_txbufs_mem,
379 		    sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity);
380 		vif->vif_txbufs_mem = NULL;
381 		vif->vif_txbufs_capacity = 0;
382 	}
383 
384 	VERIFY3U(vif->vif_nrxbufs_alloc, ==, 0);
385 	for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
386 		vioif_rxbuf_t *rb = &vif->vif_rxbufs_mem[i];
387 
388 		/*
389 		 * Ensure that this rxbuf is now in the free list:
390 		 */
391 		VERIFY(list_link_active(&rb->rb_link));
392 		list_remove(&vif->vif_rxbufs, rb);
393 
394 		if (rb->rb_dma != NULL) {
395 			virtio_dma_free(rb->rb_dma);
396 			rb->rb_dma = NULL;
397 		}
398 
399 		if (rb->rb_chain != NULL) {
400 			virtio_chain_free(rb->rb_chain);
401 			rb->rb_chain = NULL;
402 		}
403 	}
404 	VERIFY(list_is_empty(&vif->vif_rxbufs));
405 	if (vif->vif_rxbufs_mem != NULL) {
406 		kmem_free(vif->vif_rxbufs_mem,
407 		    sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity);
408 		vif->vif_rxbufs_mem = NULL;
409 		vif->vif_rxbufs_capacity = 0;
410 	}
411 }
412 
413 static int
414 vioif_alloc_bufs(vioif_t *vif)
415 {
416 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
417 
418 	/*
419 	 * Allocate one contiguous chunk of memory for each of the transmit
420 	 * and receive buffer tracking object arrays.  If a ring is unusually
421 	 * small, we'll reduce our target buffer count accordingly.
422 	 */
423 	vif->vif_txbufs_capacity = MIN(VIRTIO_NET_TX_BUFS,
424 	    virtio_queue_size(vif->vif_tx_vq));
425 	vif->vif_txbufs_mem = kmem_zalloc(
426 	    sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity, KM_SLEEP);
427 	list_create(&vif->vif_txbufs, sizeof (vioif_txbuf_t),
428 	    offsetof(vioif_txbuf_t, tb_link));
429 
430 	vif->vif_rxbufs_capacity = MIN(VIRTIO_NET_RX_BUFS,
431 	    virtio_queue_size(vif->vif_rx_vq));
432 	vif->vif_rxbufs_mem = kmem_zalloc(
433 	    sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity, KM_SLEEP);
434 	list_create(&vif->vif_rxbufs, sizeof (vioif_rxbuf_t),
435 	    offsetof(vioif_rxbuf_t, rb_link));
436 
437 	/*
438 	 * Do not loan more than half of our allocated receive buffers into
439 	 * the networking stack.
440 	 */
441 	vif->vif_nrxbufs_onloan_max = vif->vif_rxbufs_capacity / 2;
442 
443 	/*
444 	 * Put everything in the free list straight away in order to simplify
445 	 * the use of vioif_free_bufs() for cleanup on allocation failure.
446 	 */
447 	for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
448 		list_insert_tail(&vif->vif_txbufs, &vif->vif_txbufs_mem[i]);
449 	}
450 	for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
451 		list_insert_tail(&vif->vif_rxbufs, &vif->vif_rxbufs_mem[i]);
452 	}
453 
454 	/*
455 	 * Start from the DMA attribute template common to both transmit and
456 	 * receive buffers.  The SGL entry count will be modified for each
457 	 * buffer type.
458 	 */
459 	ddi_dma_attr_t attr = vioif_dma_attr_bufs;
460 
461 	/*
462 	 * The transmit inline buffer is small (less than a page), so it's
463 	 * reasonable to request a single cookie.
464 	 */
465 	attr.dma_attr_sgllen = 1;
466 
467 	for (vioif_txbuf_t *tb = list_head(&vif->vif_txbufs); tb != NULL;
468 	    tb = list_next(&vif->vif_txbufs, tb)) {
469 		if ((tb->tb_dma = virtio_dma_alloc(vif->vif_virtio,
470 		    VIOIF_TX_INLINE_SIZE, &attr,
471 		    DDI_DMA_STREAMING | DDI_DMA_WRITE, KM_SLEEP)) == NULL) {
472 			goto fail;
473 		}
474 		VERIFY3U(virtio_dma_ncookies(tb->tb_dma), ==, 1);
475 
476 		if ((tb->tb_chain = virtio_chain_alloc(vif->vif_tx_vq,
477 		    KM_SLEEP)) == NULL) {
478 			goto fail;
479 		}
480 		virtio_chain_data_set(tb->tb_chain, tb);
481 
482 		tb->tb_dmaext_capacity = VIOIF_MAX_SEGS - 1;
483 		tb->tb_dmaext = kmem_zalloc(
484 		    sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity,
485 		    KM_SLEEP);
486 	}
487 
488 	/*
489 	 * The receive buffers are larger, and we can tolerate a large number
490 	 * of segments.  Adjust the SGL entry count, setting aside one segment
491 	 * for the virtio net header.
492 	 */
493 	attr.dma_attr_sgllen = VIOIF_MAX_SEGS - 1;
494 
495 	for (vioif_rxbuf_t *rb = list_head(&vif->vif_rxbufs); rb != NULL;
496 	    rb = list_next(&vif->vif_rxbufs, rb)) {
497 		if ((rb->rb_dma = virtio_dma_alloc(vif->vif_virtio,
498 		    VIOIF_RX_BUF_SIZE, &attr, DDI_DMA_STREAMING | DDI_DMA_READ,
499 		    KM_SLEEP)) == NULL) {
500 			goto fail;
501 		}
502 
503 		if ((rb->rb_chain = virtio_chain_alloc(vif->vif_rx_vq,
504 		    KM_SLEEP)) == NULL) {
505 			goto fail;
506 		}
507 		virtio_chain_data_set(rb->rb_chain, rb);
508 
509 		/*
510 		 * Ensure that the first cookie is sufficient to cover the
511 		 * header skip region plus one byte.
512 		 */
513 		VERIFY3U(virtio_dma_cookie_size(rb->rb_dma, 0), >=,
514 		    VIOIF_HEADER_SKIP + 1);
515 
516 		/*
517 		 * Ensure that the frame data begins at a location with a
518 		 * correctly aligned IP header.
519 		 */
520 		VERIFY3U((uintptr_t)virtio_dma_va(rb->rb_dma,
521 		    VIOIF_HEADER_SKIP) % 4, ==, 2);
522 
523 		rb->rb_vioif = vif;
524 		rb->rb_frtn.free_func = vioif_rx_free_callback;
525 		rb->rb_frtn.free_arg = (caddr_t)rb;
526 	}
527 
528 	return (0);
529 
530 fail:
531 	vioif_free_bufs(vif);
532 	return (ENOMEM);
533 }
534 
535 static int
536 vioif_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr)
537 {
538 	/*
539 	 * Even though we currently do not have support for programming
540 	 * multicast filters, or even enabling promiscuous mode, we return
541 	 * success here to avoid the networking stack falling back to link
542 	 * layer broadcast for multicast traffic.  Some hypervisors already
543 	 * pass received multicast frames onto the guest, so at least on those
544 	 * systems multicast will work as expected anyway.
545 	 */
546 	return (0);
547 }
548 
549 static int
550 vioif_m_setpromisc(void *arg, boolean_t on)
551 {
552 	/*
553 	 * Even though we cannot currently enable promiscuous mode, we return
554 	 * success here to allow tools like snoop(1M) to continue to function.
555 	 */
556 	return (0);
557 }
558 
559 static int
560 vioif_m_unicst(void *arg, const uint8_t *mac)
561 {
562 	return (ENOTSUP);
563 }
564 
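/*
 * Load as many free receive buffers as possible into the RX virtqueue.  Each
 * chain begins with a separate descriptor covering the virtio net header,
 * followed by the remaining cookies of the buffer.  Returns the number of
 * buffers added.  The caller must hold vif_mutex.
 */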
565 static uint_t
566 vioif_add_rx(vioif_t *vif)
567 {
568 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
569 
570 	if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
571 		/*
572 		 * If the NIC is not running, do not give the device any
573 		 * receive buffers.
574 		 */
575 		return (0);
576 	}
577 
578 	uint_t num_added = 0;
579 
580 	vioif_rxbuf_t *rb;
581 	while ((rb = vioif_rxbuf_alloc(vif)) != NULL) {
582 		/*
583 		 * For legacy devices, and those that have not negotiated
584 		 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a
585 		 * separate descriptor entry from the rest of the buffer.
586 		 */
587 		if (virtio_chain_append(rb->rb_chain,
588 		    virtio_dma_cookie_pa(rb->rb_dma, 0),
589 		    sizeof (struct virtio_net_hdr),
590 		    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
591 			goto fail;
592 		}
593 
594 		for (uint_t n = 0; n < virtio_dma_ncookies(rb->rb_dma); n++) {
595 			uint64_t pa = virtio_dma_cookie_pa(rb->rb_dma, n);
596 			size_t sz = virtio_dma_cookie_size(rb->rb_dma, n);
597 
598 			if (n == 0) {
599 				pa += VIOIF_HEADER_SKIP;
600 				VERIFY3U(sz, >, VIOIF_HEADER_SKIP);
601 				sz -= VIOIF_HEADER_SKIP;
602 			}
603 
604 			if (virtio_chain_append(rb->rb_chain, pa, sz,
605 			    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
606 				goto fail;
607 			}
608 		}
609 
610 		virtio_chain_submit(rb->rb_chain, B_FALSE);
611 		num_added++;
612 		continue;
613 
614 fail:
615 		vioif_rxbuf_free(vif, rb);
616 		vif->vif_norecvbuf++;
617 		break;
618 	}
619 
620 	if (num_added > 0) {
621 		virtio_queue_flush(vif->vif_rx_vq);
622 	}
623 
624 	return (num_added);
625 }
626 
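/*
 * Drain completed chains from the RX virtqueue and pass the frames up to MAC.
 * Small frames (or frames arriving while too many buffers are already on
 * loan) are copied into freshly allocated mblks; larger frames are loaned
 * upstream via desballoc().  Returns the number of chains processed.  The
 * caller must hold vif_mutex, which is briefly dropped around allocation and
 * mac_rx().
 */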
627 static uint_t
628 vioif_process_rx(vioif_t *vif)
629 {
630 	virtio_chain_t *vic;
631 	mblk_t *mphead = NULL, *lastmp = NULL, *mp;
632 	uint_t num_processed = 0;
633 
634 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
635 
636 	while ((vic = virtio_queue_poll(vif->vif_rx_vq)) != NULL) {
637 		/*
638 		 * We have to use the chain received length here, as the device
639 		 * does not tell us the received frame length any other way.
640 		 * In a limited survey of hypervisors, virtio network devices
641 		 * appear to provide the right value here.
642 		 */
643 		size_t len = virtio_chain_received_length(vic);
644 		vioif_rxbuf_t *rb = virtio_chain_data(vic);
645 
646 		virtio_dma_sync(rb->rb_dma, DDI_DMA_SYNC_FORCPU);
647 
648 		/*
649 		 * If the NIC is not running, discard any received frames.
650 		 */
651 		if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
652 			vioif_rxbuf_free(vif, rb);
653 			continue;
654 		}
655 
656 		if (len < sizeof (struct virtio_net_hdr)) {
657 			vif->vif_rxfail_chain_undersize++;
658 			vif->vif_ierrors++;
659 			vioif_rxbuf_free(vif, rb);
660 			continue;
661 		}
662 		len -= sizeof (struct virtio_net_hdr);
663 
664 		/*
665 		 * We copy small packets that happen to fit into a single
666 		 * cookie and reuse the buffers. For bigger ones, we loan
667 		 * the buffers upstream.
668 		 */
669 		if (len < vif->vif_rxcopy_thresh ||
670 		    vif->vif_nrxbufs_onloan >= vif->vif_nrxbufs_onloan_max) {
671 			mutex_exit(&vif->vif_mutex);
672 			if ((mp = allocb(len, 0)) == NULL) {
673 				mutex_enter(&vif->vif_mutex);
674 				vif->vif_norecvbuf++;
675 				vif->vif_ierrors++;
676 
677 				vioif_rxbuf_free(vif, rb);
678 				continue;
679 			}
680 
681 			bcopy(virtio_dma_va(rb->rb_dma, VIOIF_HEADER_SKIP),
682 			    mp->b_rptr, len);
683 			mp->b_wptr = mp->b_rptr + len;
684 
685 			/*
686 			 * As the packet contents were copied rather than
687 			 * loaned, we can return the receive buffer resources
688 			 * to the free list.
689 			 */
690 			mutex_enter(&vif->vif_mutex);
691 			vioif_rxbuf_free(vif, rb);
692 
693 		} else {
694 			mutex_exit(&vif->vif_mutex);
695 			if ((mp = desballoc(virtio_dma_va(rb->rb_dma,
696 			    VIOIF_HEADER_SKIP), len, 0,
697 			    &rb->rb_frtn)) == NULL) {
698 				mutex_enter(&vif->vif_mutex);
699 				vif->vif_norecvbuf++;
700 				vif->vif_ierrors++;
701 
702 				vioif_rxbuf_free(vif, rb);
703 				continue;
704 			}
705 			mp->b_wptr = mp->b_rptr + len;
706 
707 			mutex_enter(&vif->vif_mutex);
708 			vif->vif_nrxbufs_onloan++;
709 		}
710 
711 		/*
712 		 * virtio-net does not tell us if this packet is multicast
713 		 * or broadcast, so we have to check it.
714 		 */
715 		if (mp->b_rptr[0] & 0x1) {
716 			if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0)
717 				vif->vif_multircv++;
718 			else
719 				vif->vif_brdcstrcv++;
720 		}
721 
722 		vif->vif_rbytes += len;
723 		vif->vif_ipackets++;
724 
725 		if (lastmp == NULL) {
726 			mphead = mp;
727 		} else {
728 			lastmp->b_next = mp;
729 		}
730 		lastmp = mp;
731 		num_processed++;
732 	}
733 
734 	if (mphead != NULL) {
735 		if (vif->vif_runstate == VIOIF_RUNSTATE_RUNNING) {
736 			mutex_exit(&vif->vif_mutex);
737 			mac_rx(vif->vif_mac_handle, NULL, mphead);
738 			mutex_enter(&vif->vif_mutex);
739 		} else {
740 			/*
741 			 * The NIC was disabled part way through our execution,
742 			 * so free the messages we allocated.
743 			 */
744 			freemsgchain(mphead);
745 		}
746 	}
747 
748 	return (num_processed);
749 }
750 
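/*
 * Reclaim transmit chains that the host has finished with: unbind any
 * external DMA mappings, free the associated mblk, and return each txbuf to
 * the free list.  If transmission was previously corked for want of
 * descriptors, the TX interrupt is disabled again and MAC is notified that
 * it may resume.  Returns the number of chains reclaimed.  The caller must
 * not hold vif_mutex.
 */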
751 static uint_t
752 vioif_reclaim_used_tx(vioif_t *vif)
753 {
754 	virtio_chain_t *vic;
755 	uint_t num_reclaimed = 0;
756 
757 	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
758 
759 	while ((vic = virtio_queue_poll(vif->vif_tx_vq)) != NULL) {
760 		vioif_txbuf_t *tb = virtio_chain_data(vic);
761 
762 		if (tb->tb_mp != NULL) {
763 			/*
764 			 * Unbind the external mapping.
765 			 */
766 			for (uint_t i = 0; i < tb->tb_dmaext_capacity; i++) {
767 				if (tb->tb_dmaext[i] == NULL) {
768 					continue;
769 				}
770 
771 				virtio_dma_unbind(tb->tb_dmaext[i]);
772 			}
773 
774 			freemsg(tb->tb_mp);
775 			tb->tb_mp = NULL;
776 		}
777 
778 		/*
779 		 * Return this transmit buffer to the free list for reuse.
780 		 */
781 		mutex_enter(&vif->vif_mutex);
782 		vioif_txbuf_free(vif, tb);
783 		mutex_exit(&vif->vif_mutex);
784 
785 		num_reclaimed++;
786 	}
787 
788 	/* Return ring to transmitting state if descriptors were reclaimed. */
789 	if (num_reclaimed > 0) {
790 		boolean_t do_update = B_FALSE;
791 
792 		mutex_enter(&vif->vif_mutex);
793 		vif->vif_stat_tx_reclaim += num_reclaimed;
794 		if (vif->vif_tx_corked) {
795 			/*
796 			 * TX was corked on a lack of available descriptors.
797 			 * That dire state has passed, so the TX interrupt can
798 			 * be disabled and MAC can be notified that
799 			 * transmission is possible again.
800 			 */
801 			vif->vif_tx_corked = B_FALSE;
802 			virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
803 			do_update = B_TRUE;
804 		}
805 
806 		if (do_update) {
807 			mac_tx_update(vif->vif_mac_handle);
808 		}
809 		mutex_exit(&vif->vif_mutex);
810 	}
811 
812 	return (num_reclaimed);
813 }
814 
815 static void
816 vioif_reclaim_periodic(void *arg)
817 {
818 	vioif_t *vif = arg;
819 	uint_t num_reclaimed;
820 
821 	num_reclaimed = vioif_reclaim_used_tx(vif);
822 
823 	mutex_enter(&vif->vif_mutex);
824 	vif->vif_tx_reclaim_tid = 0;
825 	/*
826 	 * If used descriptors were reclaimed or TX descriptors appear to be
827 	 * outstanding, the ring is considered active and periodic reclamation
828 	 * is necessary for now.
829 	 */
830 	if (num_reclaimed != 0 || virtio_queue_nactive(vif->vif_tx_vq) != 0) {
831 		/* Do not reschedule if the ring is being drained. */
832 		if (!vif->vif_tx_drain) {
833 			vioif_reclaim_restart(vif);
834 		}
835 	}
836 	mutex_exit(&vif->vif_mutex);
837 }
838 
839 static void
840 vioif_reclaim_restart(vioif_t *vif)
841 {
842 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
843 	VERIFY(!vif->vif_tx_drain);
844 
845 	if (vif->vif_tx_reclaim_tid == 0) {
846 		vif->vif_tx_reclaim_tid = timeout(vioif_reclaim_periodic, vif,
847 		    MSEC_TO_TICK_ROUNDUP(vioif_reclaim_ms));
848 	}
849 }
850 
851 static void
852 vioif_tx_drain(vioif_t *vif)
853 {
854 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
855 	VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPING);
856 
857 	vif->vif_tx_drain = B_TRUE;
858 	/* Put a stop to the periodic reclaim if it is running */
859 	if (vif->vif_tx_reclaim_tid != 0) {
860 		timeout_id_t tid = vif->vif_tx_reclaim_tid;
861 
862 		/*
863 		 * With vif_tx_drain set, there is no risk that a racing
864 		 * vioif_reclaim_periodic() call will reschedule itself.
865 		 *
866 		 * Being part of the mc_stop hook also guarantees that
867 		 * vioif_m_tx() will not be called to restart it.
868 		 */
869 		vif->vif_tx_reclaim_tid = 0;
870 		mutex_exit(&vif->vif_mutex);
871 		(void) untimeout(tid);
872 		mutex_enter(&vif->vif_mutex);
873 	}
874 	virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
875 
876 	/*
877 	 * Wait for all of the TX descriptors to be processed by the host so
878 	 * they can be reclaimed.
879 	 */
880 	while (vif->vif_ntxbufs_alloc > 0) {
881 		mutex_exit(&vif->vif_mutex);
882 		(void) vioif_reclaim_used_tx(vif);
883 		delay(5);
884 		mutex_enter(&vif->vif_mutex);
885 	}
886 	VERIFY(!vif->vif_tx_corked);
887 	VERIFY3U(vif->vif_tx_reclaim_tid, ==, 0);
888 	VERIFY3U(virtio_queue_nactive(vif->vif_tx_vq), ==, 0);
889 }
890 
891 static int
892 vioif_tx_inline(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
893 {
894 	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
895 
896 	VERIFY3U(msg_size, <=, virtio_dma_size(tb->tb_dma) - VIOIF_HEADER_SKIP);
897 
898 	/*
899 	 * Copy the message into the inline buffer and then free the message.
900 	 */
901 	mcopymsg(mp, virtio_dma_va(tb->tb_dma, VIOIF_HEADER_SKIP));
902 
903 	if (virtio_chain_append(tb->tb_chain,
904 	    virtio_dma_cookie_pa(tb->tb_dma, 0) + VIOIF_HEADER_SKIP,
905 	    msg_size, VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
906 		return (DDI_FAILURE);
907 	}
908 
909 	return (DDI_SUCCESS);
910 }
911 
912 static int
913 vioif_tx_external(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
914 {
915 	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
916 
917 	mblk_t *nmp = mp;
918 	tb->tb_ndmaext = 0;
919 
920 	while (nmp != NULL) {
921 		size_t len;
922 
923 		if ((len = MBLKL(nmp)) == 0) {
924 			/*
925 			 * Skip any zero-length entries in the chain.
926 			 */
927 			nmp = nmp->b_cont;
928 			continue;
929 		}
930 
931 		if (tb->tb_ndmaext >= tb->tb_dmaext_capacity) {
932 			mutex_enter(&vif->vif_mutex);
933 			vif->vif_txfail_indirect_limit++;
934 			vif->vif_notxbuf++;
935 			mutex_exit(&vif->vif_mutex);
936 			goto fail;
937 		}
938 
939 		if (tb->tb_dmaext[tb->tb_ndmaext] == NULL) {
940 			/*
941 			 * Allocate a DMA handle for this slot.
942 			 */
943 			if ((tb->tb_dmaext[tb->tb_ndmaext] =
944 			    virtio_dma_alloc_nomem(vif->vif_virtio,
945 			    &vioif_dma_attr_external, KM_SLEEP)) == NULL) {
946 				mutex_enter(&vif->vif_mutex);
947 				vif->vif_notxbuf++;
948 				mutex_exit(&vif->vif_mutex);
949 				goto fail;
950 			}
951 		}
952 		virtio_dma_t *extdma = tb->tb_dmaext[tb->tb_ndmaext++];
953 
954 		if (virtio_dma_bind(extdma, nmp->b_rptr, len,
955 		    DDI_DMA_WRITE | DDI_DMA_STREAMING, KM_SLEEP) !=
956 		    DDI_SUCCESS) {
957 			mutex_enter(&vif->vif_mutex);
958 			vif->vif_txfail_dma_bind++;
959 			mutex_exit(&vif->vif_mutex);
960 			goto fail;
961 		}
962 
963 		for (uint_t n = 0; n < virtio_dma_ncookies(extdma); n++) {
964 			uint64_t pa = virtio_dma_cookie_pa(extdma, n);
965 			size_t sz = virtio_dma_cookie_size(extdma, n);
966 
967 			if (virtio_chain_append(tb->tb_chain, pa, sz,
968 			    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
969 				mutex_enter(&vif->vif_mutex);
970 				vif->vif_txfail_indirect_limit++;
971 				vif->vif_notxbuf++;
972 				mutex_exit(&vif->vif_mutex);
973 				goto fail;
974 			}
975 		}
976 
977 		nmp = nmp->b_cont;
978 	}
979 
980 	/*
981 	 * We need to keep the message around until we reclaim the buffer
982 	 * from the device; only then can it be freed.
983 	 */
984 	tb->tb_mp = mp;
985 
986 	return (DDI_SUCCESS);
987 
988 fail:
989 	for (uint_t n = 0; n < tb->tb_ndmaext; n++) {
990 		if (tb->tb_dmaext[n] != NULL) {
991 			virtio_dma_unbind(tb->tb_dmaext[n]);
992 		}
993 	}
994 	tb->tb_ndmaext = 0;
995 
996 	freemsg(mp);
997 
998 	return (DDI_FAILURE);
999 }
1000 
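/*
 * Attempt to transmit a single message: construct the virtio net header
 * (including any checksum offload and LSO fields), then either copy the
 * frame into the preallocated inline buffer or map it with external DMA
 * handles, and submit the chain to the TX virtqueue.  Returns B_TRUE if the
 * message was consumed (transmitted or freed), or B_FALSE if it was not
 * consumed and the caller should retry it later.
 */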
1001 static boolean_t
1002 vioif_send(vioif_t *vif, mblk_t *mp)
1003 {
1004 	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
1005 
1006 	vioif_txbuf_t *tb = NULL;
1007 	struct virtio_net_hdr *vnh = NULL;
1008 	size_t msg_size = 0;
1009 	uint32_t csum_start;
1010 	uint32_t csum_stuff;
1011 	uint32_t csum_flags;
1012 	uint32_t lso_flags;
1013 	uint32_t lso_mss;
1014 	mblk_t *nmp;
1015 	int ret;
1016 	boolean_t lso_required = B_FALSE;
1017 	struct ether_header *ether = (void *)mp->b_rptr;
1018 
1019 	for (nmp = mp; nmp; nmp = nmp->b_cont)
1020 		msg_size += MBLKL(nmp);
1021 
1022 	if (vif->vif_tx_tso4 || vif->vif_tx_tso6) {
1023 		mac_lso_get(mp, &lso_mss, &lso_flags);
1024 		lso_required = (lso_flags & HW_LSO) != 0;
1025 	}
1026 
1027 	mutex_enter(&vif->vif_mutex);
1028 	if ((tb = vioif_txbuf_alloc(vif)) == NULL) {
1029 		vif->vif_notxbuf++;
1030 		goto fail;
1031 	}
1032 	mutex_exit(&vif->vif_mutex);
1033 
1034 	/*
1035 	 * Use the inline buffer for the virtio net header.  Zero the portion
1036 	 * of our DMA allocation prior to the packet data.
1037 	 */
1038 	vnh = virtio_dma_va(tb->tb_dma, 0);
1039 	bzero(vnh, VIOIF_HEADER_SKIP);
1040 
1041 	/*
1042 	 * For legacy devices, and those that have not negotiated
1043 	 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a separate
1044 	 * descriptor entry from the rest of the buffer.
1045 	 */
1046 	if (virtio_chain_append(tb->tb_chain,
1047 	    virtio_dma_cookie_pa(tb->tb_dma, 0), sizeof (struct virtio_net_hdr),
1048 	    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
1049 		mutex_enter(&vif->vif_mutex);
1050 		vif->vif_notxbuf++;
1051 		goto fail;
1052 	}
1053 
1054 	mac_hcksum_get(mp, &csum_start, &csum_stuff, NULL, NULL, &csum_flags);
1055 
1056 	/*
1057 	 * The networking stack wants us to finish the TCP/UDP checksum.
1058 	 */
1059 	if (csum_flags & HCK_PARTIALCKSUM) {
1060 		int eth_hsize;
1061 
1062 		/*
1063 		 * Did we ask for it?
1064 		 */
1065 		ASSERT(vif->vif_tx_csum);
1066 
1067 		/*
1068 		 * We only asked for partial csum packets.
1069 		 */
1070 		ASSERT(!(csum_flags & HCK_IPV4_HDRCKSUM));
1071 		ASSERT(!(csum_flags & HCK_FULLCKSUM));
1072 
1073 		if (ether->ether_type == htons(ETHERTYPE_VLAN)) {
1074 			eth_hsize = sizeof (struct ether_vlan_header);
1075 		} else {
1076 			eth_hsize = sizeof (struct ether_header);
1077 		}
1078 
1079 		vnh->vnh_flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1080 		vnh->vnh_csum_start = eth_hsize + csum_start;
1081 		vnh->vnh_csum_offset = csum_stuff - csum_start;
1082 	}
1083 
1084 	/*
1085 	 * Setup LSO fields if required.
1086 	 */
1087 	if (lso_required) {
1088 		mac_ether_offload_flags_t needed;
1089 		mac_ether_offload_info_t meo;
1090 		uint32_t cksum;
1091 		size_t len;
1092 		mblk_t *pullmp = NULL;
1093 		tcpha_t *tcpha;
1094 
1095 		if (mac_ether_offload_info(mp, &meo) != 0) {
1096 			goto fail;
1097 		}
1098 
1099 		needed = MEOI_L2INFO_SET | MEOI_L3INFO_SET | MEOI_L4INFO_SET;
1100 		if ((meo.meoi_flags & needed) != needed) {
1101 			goto fail;
1102 		}
1103 
1104 		if (meo.meoi_l4proto != IPPROTO_TCP) {
1105 			goto fail;
1106 		}
1107 
1108 		if (meo.meoi_l3proto == ETHERTYPE_IP && vif->vif_tx_tso4) {
1109 			vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1110 		} else if (meo.meoi_l3proto == ETHERTYPE_IPV6 &&
1111 		    vif->vif_tx_tso6) {
1112 			vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1113 		} else {
1114 			goto fail;
1115 		}
1116 
1117 		/*
1118 		 * The TCP stack does not include the length in the TCP
1119 		 * pseudo-header when it is performing LSO, since hardware
1120 		 * generally asks for it to be removed (it will change anyway).
1121 		 * Unfortunately, for virtio we actually need it, so we have to
1122 		 * walk the headers, calculate the actual length, and fix up
1123 		 * the checksum here.  Because the virtio spec also cares about
1124 		 * the ECN flag, doing this at least means we have that
1125 		 * information available as well.
1126 		 */
1127 		if (MBLKL(mp) < vnh->vnh_hdr_len) {
1128 			pullmp = msgpullup(mp, vnh->vnh_hdr_len);
1129 			if (pullmp == NULL)
1130 				goto fail;
1131 			tcpha = (tcpha_t *)(pullmp->b_rptr + meo.meoi_l2hlen +
1132 			    meo.meoi_l3hlen);
1133 		} else {
1134 			tcpha = (tcpha_t *)(mp->b_rptr + meo.meoi_l2hlen +
1135 			    meo.meoi_l3hlen);
1136 		}
1137 
1138 		len = meo.meoi_len - meo.meoi_l2hlen - meo.meoi_l3hlen;
1139 		cksum = ntohs(tcpha->tha_sum) + len;
1140 		cksum = (cksum >> 16) + (cksum & 0xffff);
1141 		cksum = (cksum >> 16) + (cksum & 0xffff);
1142 		tcpha->tha_sum = htons(cksum);
1143 
1144 		if (tcpha->tha_flags & TH_CWR) {
1145 			vnh->vnh_gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1146 		}
1147 		vnh->vnh_gso_size = (uint16_t)lso_mss;
1148 		vnh->vnh_hdr_len = meo.meoi_l2hlen + meo.meoi_l3hlen +
1149 		    meo.meoi_l4hlen;
1150 
1151 		freemsg(pullmp);
1152 	}
1153 
1154 	/*
1155 	 * The device does not maintain its own statistics about broadcast or
1156 	 * multicast packets, so we have to check the destination address
1157 	 * ourselves.
1158 	 */
1159 	if ((ether->ether_dhost.ether_addr_octet[0] & 0x01) != 0) {
1160 		mutex_enter(&vif->vif_mutex);
1161 		if (ether_cmp(&ether->ether_dhost, vioif_broadcast) == 0) {
1162 			vif->vif_brdcstxmt++;
1163 		} else {
1164 			vif->vif_multixmt++;
1165 		}
1166 		mutex_exit(&vif->vif_mutex);
1167 	}
1168 
1169 	/*
1170 	 * For small packets, copy into the preallocated inline buffer rather
1171 	 * than incur the overhead of mapping.  Note that in both cases
1172 	 * ownership of "mp" passes to the called function.
1173 	 */
1174 	if (msg_size < vif->vif_txcopy_thresh) {
1175 		ret = vioif_tx_inline(vif, tb, mp, msg_size);
1176 	} else {
1177 		ret = vioif_tx_external(vif, tb, mp, msg_size);
1178 	}
1179 	mp = NULL;
1180 
1181 	mutex_enter(&vif->vif_mutex);
1182 
1183 	if (ret != DDI_SUCCESS) {
1184 		goto fail;
1185 	}
1186 
1187 	vif->vif_opackets++;
1188 	vif->vif_obytes += msg_size;
1189 	mutex_exit(&vif->vif_mutex);
1190 
1191 	virtio_dma_sync(tb->tb_dma, DDI_DMA_SYNC_FORDEV);
1192 	virtio_chain_submit(tb->tb_chain, B_TRUE);
1193 
1194 	return (B_TRUE);
1195 
1196 fail:
1197 	vif->vif_oerrors++;
1198 	if (tb != NULL) {
1199 		vioif_txbuf_free(vif, tb);
1200 	}
1201 	mutex_exit(&vif->vif_mutex);
1202 
1203 	return (mp == NULL);
1204 }
1205 
1206 static mblk_t *
1207 vioif_m_tx(void *arg, mblk_t *mp)
1208 {
1209 	vioif_t *vif = arg;
1210 	mblk_t *nmp;
1211 
1212 	/*
1213 	 * Prior to attempting to send any more frames, do a reclaim to pick up
1214 	 * any descriptors which have been processed by the host.
1215 	 */
1216 	if (virtio_queue_nactive(vif->vif_tx_vq) != 0) {
1217 		(void) vioif_reclaim_used_tx(vif);
1218 	}
1219 
1220 	while (mp != NULL) {
1221 		nmp = mp->b_next;
1222 		mp->b_next = NULL;
1223 
1224 		if (!vioif_send(vif, mp)) {
1225 			/*
1226 			 * If there are no descriptors available, try to
1227 			 * reclaim some, allowing a retry of the send if some
1228 			 * are found.
1229 			 */
1230 			mp->b_next = nmp;
1231 			if (vioif_reclaim_used_tx(vif) != 0) {
1232 				continue;
1233 			}
1234 
1235 			/*
1236 			 * Otherwise, enable the TX ring interrupt so that as
1237 			 * soon as a descriptor becomes available, transmission
1238 			 * can begin again.  For safety, make sure the periodic
1239 			 * reclaim is running as well.
1240 			 */
1241 			mutex_enter(&vif->vif_mutex);
1242 			vif->vif_tx_corked = B_TRUE;
1243 			virtio_queue_no_interrupt(vif->vif_tx_vq, B_FALSE);
1244 			vioif_reclaim_restart(vif);
1245 			mutex_exit(&vif->vif_mutex);
1246 			return (mp);
1247 		}
1248 		mp = nmp;
1249 	}
1250 
1251 	/* Ensure the periodic reclaim has been started. */
1252 	mutex_enter(&vif->vif_mutex);
1253 	vioif_reclaim_restart(vif);
1254 	mutex_exit(&vif->vif_mutex);
1255 
1256 	return (NULL);
1257 }
1258 
1259 static int
1260 vioif_m_start(void *arg)
1261 {
1262 	vioif_t *vif = arg;
1263 
1264 	mutex_enter(&vif->vif_mutex);
1265 
1266 	VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPED);
1267 	vif->vif_runstate = VIOIF_RUNSTATE_RUNNING;
1268 
1269 	mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);
1270 
1271 	virtio_queue_no_interrupt(vif->vif_rx_vq, B_FALSE);
1272 
1273 	/*
1274 	 * Starting interrupts on the TX virtqueue is unnecessary at this time.
1275 	 * Descriptor reclamation is handled during transmit, via a periodic
1276 	 * timer, and when resources are tight, via the then-enabled interrupt.
1277 	 */
1278 	vif->vif_tx_drain = B_FALSE;
1279 
1280 	/*
1281 	 * Add as many receive buffers as we can to the receive queue.  If we
1282 	 * cannot add any, it may be because we have stopped and started again
1283 	 * and the descriptors are all in the queue already.
1284 	 */
1285 	(void) vioif_add_rx(vif);
1286 
1287 	mutex_exit(&vif->vif_mutex);
1288 	return (DDI_SUCCESS);
1289 }
1290 
1291 static void
1292 vioif_m_stop(void *arg)
1293 {
1294 	vioif_t *vif = arg;
1295 
1296 	mutex_enter(&vif->vif_mutex);
1297 
1298 	VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_RUNNING);
1299 	vif->vif_runstate = VIOIF_RUNSTATE_STOPPING;
1300 
1301 	/* Ensure all TX descriptors have been processed and reclaimed */
1302 	vioif_tx_drain(vif);
1303 
1304 	virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);
1305 
1306 	vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
1307 	mutex_exit(&vif->vif_mutex);
1308 }
1309 
1310 static int
1311 vioif_m_stat(void *arg, uint_t stat, uint64_t *val)
1312 {
1313 	vioif_t *vif = arg;
1314 
1315 	switch (stat) {
1316 	case MAC_STAT_IERRORS:
1317 		*val = vif->vif_ierrors;
1318 		break;
1319 	case MAC_STAT_OERRORS:
1320 		*val = vif->vif_oerrors;
1321 		break;
1322 	case MAC_STAT_MULTIRCV:
1323 		*val = vif->vif_multircv;
1324 		break;
1325 	case MAC_STAT_BRDCSTRCV:
1326 		*val = vif->vif_brdcstrcv;
1327 		break;
1328 	case MAC_STAT_MULTIXMT:
1329 		*val = vif->vif_multixmt;
1330 		break;
1331 	case MAC_STAT_BRDCSTXMT:
1332 		*val = vif->vif_brdcstxmt;
1333 		break;
1334 	case MAC_STAT_IPACKETS:
1335 		*val = vif->vif_ipackets;
1336 		break;
1337 	case MAC_STAT_RBYTES:
1338 		*val = vif->vif_rbytes;
1339 		break;
1340 	case MAC_STAT_OPACKETS:
1341 		*val = vif->vif_opackets;
1342 		break;
1343 	case MAC_STAT_OBYTES:
1344 		*val = vif->vif_obytes;
1345 		break;
1346 	case MAC_STAT_NORCVBUF:
1347 		*val = vif->vif_norecvbuf;
1348 		break;
1349 	case MAC_STAT_NOXMTBUF:
1350 		*val = vif->vif_notxbuf;
1351 		break;
1352 	case MAC_STAT_IFSPEED:
1353 		/* always 1 Gbit */
1354 		*val = 1000000000ULL;
1355 		break;
1356 	case ETHER_STAT_LINK_DUPLEX:
1357 		/* virtual device, always full-duplex */
1358 		*val = LINK_DUPLEX_FULL;
1359 		break;
1360 
1361 	default:
1362 		return (ENOTSUP);
1363 	}
1364 
1365 	return (DDI_SUCCESS);
1366 }
1367 
1368 static int
1369 vioif_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1370     uint_t pr_valsize, const void *pr_val)
1371 {
1372 	vioif_t *vif = arg;
1373 
1374 	switch (pr_num) {
1375 	case MAC_PROP_MTU: {
1376 		int r;
1377 		uint32_t mtu;
1378 		if (pr_valsize < sizeof (mtu)) {
1379 			return (EOVERFLOW);
1380 		}
1381 		bcopy(pr_val, &mtu, sizeof (mtu));
1382 
1383 		if (mtu < ETHERMIN || mtu > vif->vif_mtu_max) {
1384 			return (EINVAL);
1385 		}
1386 
1387 		mutex_enter(&vif->vif_mutex);
1388 		if ((r = mac_maxsdu_update(vif->vif_mac_handle, mtu)) == 0) {
1389 			vif->vif_mtu = mtu;
1390 		}
1391 		mutex_exit(&vif->vif_mutex);
1392 
1393 		return (r);
1394 	}
1395 
1396 	case MAC_PROP_PRIVATE: {
1397 		long max, result;
1398 		uint_t *resp;
1399 		char *endptr;
1400 
1401 		if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1402 			max = VIOIF_MACPROP_TXCOPY_THRESH_MAX;
1403 			resp = &vif->vif_txcopy_thresh;
1404 		} else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1405 			max = VIOIF_MACPROP_RXCOPY_THRESH_MAX;
1406 			resp = &vif->vif_rxcopy_thresh;
1407 		} else {
1408 			return (ENOTSUP);
1409 		}
1410 
1411 		if (pr_val == NULL) {
1412 			return (EINVAL);
1413 		}
1414 
1415 		if (ddi_strtol(pr_val, &endptr, 10, &result) != 0 ||
1416 		    *endptr != '\0' || result < 0 || result > max) {
1417 			return (EINVAL);
1418 		}
1419 
1420 		mutex_enter(&vif->vif_mutex);
1421 		*resp = result;
1422 		mutex_exit(&vif->vif_mutex);
1423 
1424 		return (0);
1425 	}
1426 
1427 	default:
1428 		return (ENOTSUP);
1429 	}
1430 }
1431 
1432 static int
1433 vioif_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1434     uint_t pr_valsize, void *pr_val)
1435 {
1436 	vioif_t *vif = arg;
1437 
1438 	switch (pr_num) {
1439 	case MAC_PROP_PRIVATE: {
1440 		uint_t value;
1441 
1442 		if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1443 			value = vif->vif_txcopy_thresh;
1444 		} else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1445 			value = vif->vif_rxcopy_thresh;
1446 		} else {
1447 			return (ENOTSUP);
1448 		}
1449 
1450 		if (snprintf(pr_val, pr_valsize, "%u", value) >= pr_valsize) {
1451 			return (EOVERFLOW);
1452 		}
1453 
1454 		return (0);
1455 	}
1456 
1457 	default:
1458 		return (ENOTSUP);
1459 	}
1460 }
1461 
1462 static void
1463 vioif_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1464     mac_prop_info_handle_t prh)
1465 {
1466 	vioif_t *vif = arg;
1467 	char valstr[64];
1468 	int value;
1469 
1470 	switch (pr_num) {
1471 	case MAC_PROP_MTU:
1472 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1473 		mac_prop_info_set_range_uint32(prh, ETHERMIN, vif->vif_mtu_max);
1474 		return;
1475 
1476 	case MAC_PROP_PRIVATE:
1477 		if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1478 			value = VIOIF_MACPROP_TXCOPY_THRESH_DEF;
1479 		} else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1480 			value = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
1481 		} else {
1482 			/*
1483 			 * We do not recognise this private property name.
1484 			 */
1485 			return;
1486 		}
1487 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1488 		(void) snprintf(valstr, sizeof (valstr), "%d", value);
1489 		mac_prop_info_set_default_str(prh, valstr);
1490 		return;
1491 
1492 	default:
1493 		return;
1494 	}
1495 }
1496 
1497 static boolean_t
1498 vioif_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1499 {
1500 	vioif_t *vif = arg;
1501 
1502 	switch (cap) {
1503 	case MAC_CAPAB_HCKSUM: {
1504 		if (!vif->vif_tx_csum) {
1505 			return (B_FALSE);
1506 		}
1507 
1508 		*(uint32_t *)cap_data = HCKSUM_INET_PARTIAL;
1509 
1510 		return (B_TRUE);
1511 	}
1512 
1513 	case MAC_CAPAB_LSO: {
1514 		if (!vif->vif_tx_tso4) {
1515 			return (B_FALSE);
1516 		}
1517 
1518 		mac_capab_lso_t *lso = cap_data;
1519 		lso->lso_flags = LSO_TX_BASIC_TCP_IPV4 | LSO_TX_BASIC_TCP_IPV6;
1520 		lso->lso_basic_tcp_ipv4.lso_max = VIOIF_RX_DATA_SIZE;
1521 		lso->lso_basic_tcp_ipv6.lso_max = VIOIF_RX_DATA_SIZE;
1522 
1523 		return (B_TRUE);
1524 	}
1525 
1526 	default:
1527 		return (B_FALSE);
1528 	}
1529 }
1530 
1531 static boolean_t
1532 vioif_has_feature(vioif_t *vif, uint32_t feature)
1533 {
1534 	return (virtio_feature_present(vif->vif_virtio, feature));
1535 }
1536 
1537 /*
1538  * Read the primary MAC address from the device if one is provided.  If not,
1539  * generate a random locally administered MAC address and write it back to the
1540  * device.
1541  */
1542 static void
1543 vioif_get_mac(vioif_t *vif)
1544 {
1545 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
1546 
1547 	if (vioif_has_feature(vif, VIRTIO_NET_F_MAC)) {
1548 		for (uint_t i = 0; i < ETHERADDRL; i++) {
1549 			vif->vif_mac[i] = virtio_dev_get8(vif->vif_virtio,
1550 			    VIRTIO_NET_CONFIG_MAC + i);
1551 		}
1552 		vif->vif_mac_from_host = 1;
1553 
1554 		return;
1555 	}
1556 
1557 	/* Get a few random bytes */
1558 	(void) random_get_pseudo_bytes(vif->vif_mac, ETHERADDRL);
1559 	/* Make sure it's a unicast MAC */
1560 	vif->vif_mac[0] &= ~1;
1561 	/* Set the "locally administered" bit */
1562 	vif->vif_mac[1] |= 2;
1563 
1564 	/*
1565 	 * Write the random MAC address back to the device.
1566 	 */
1567 	for (uint_t i = 0; i < ETHERADDRL; i++) {
1568 		virtio_dev_put8(vif->vif_virtio, VIRTIO_NET_CONFIG_MAC + i,
1569 		    vif->vif_mac[i]);
1570 	}
1571 	vif->vif_mac_from_host = 0;
1572 
1573 	dev_err(vif->vif_dip, CE_NOTE, "!Generated a random MAC address: "
1574 	    "%02x:%02x:%02x:%02x:%02x:%02x",
1575 	    (uint_t)vif->vif_mac[0], (uint_t)vif->vif_mac[1],
1576 	    (uint_t)vif->vif_mac[2], (uint_t)vif->vif_mac[3],
1577 	    (uint_t)vif->vif_mac[4], (uint_t)vif->vif_mac[5]);
1578 }
1579 
1580 /*
1581  * Virtqueue interrupt handlers
1582  */
1583 static uint_t
1584 vioif_rx_handler(caddr_t arg0, caddr_t arg1)
1585 {
1586 	vioif_t *vif = (vioif_t *)arg0;
1587 
1588 	mutex_enter(&vif->vif_mutex);
1589 	(void) vioif_process_rx(vif);
1590 
1591 	/*
1592 	 * Attempt to replenish the receive queue.  If we cannot add any
1593 	 * descriptors here, it may be because all of the recently received
1594 	 * packets were loaned up to the networking stack.
1595 	 */
1596 	(void) vioif_add_rx(vif);
1597 	mutex_exit(&vif->vif_mutex);
1598 
1599 	return (DDI_INTR_CLAIMED);
1600 }
1601 
1602 static uint_t
1603 vioif_tx_handler(caddr_t arg0, caddr_t arg1)
1604 {
1605 	vioif_t *vif = (vioif_t *)arg0;
1606 
1607 	/*
1608 	 * The TX interrupt could race with other reclamation activity, so
1609 	 * interpreting the return value is unimportant.
1610 	 */
1611 	(void) vioif_reclaim_used_tx(vif);
1612 
1613 	return (DDI_INTR_CLAIMED);
1614 }
1615 
1616 static void
1617 vioif_check_features(vioif_t *vif)
1618 {
1619 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
1620 
1621 	vif->vif_tx_csum = 0;
1622 	vif->vif_tx_tso4 = 0;
1623 	vif->vif_tx_tso6 = 0;
1624 
1625 	if (vioif_has_feature(vif, VIRTIO_NET_F_CSUM)) {
1626 		/*
1627 		 * The host will accept packets with partial checksums from us.
1628 		 */
1629 		vif->vif_tx_csum = 1;
1630 
1631 		/*
1632 		 * The legacy GSO feature represents the combination of
1633 		 * HOST_TSO4, HOST_TSO6, and HOST_ECN.
1634 		 */
1635 		boolean_t gso = vioif_has_feature(vif, VIRTIO_NET_F_GSO);
1636 		boolean_t tso4 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO4);
1637 		boolean_t tso6 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO6);
1638 		boolean_t ecn = vioif_has_feature(vif, VIRTIO_NET_F_HOST_ECN);
1639 
1640 		/*
1641 		 * Explicit congestion notification (ECN) is configured
1642 		 * globally; see "tcp_ecn_permitted".  As we cannot currently
1643 		 * request that the stack disable ECN on a per interface basis,
1644 		 * we require the device to support the combination of
1645 		 * segmentation offload and ECN.
1646 		 */
1647 		if (gso) {
1648 			vif->vif_tx_tso4 = 1;
1649 			vif->vif_tx_tso6 = 1;
1650 		}
1651 		if (tso4 && ecn) {
1652 			vif->vif_tx_tso4 = 1;
1653 		}
1654 		if (tso6 && ecn) {
1655 			vif->vif_tx_tso6 = 1;
1656 		}
1657 	}
1658 }
1659 
1660 static int
1661 vioif_select_interrupt_types(void)
1662 {
1663 	id_t id;
1664 	smbios_system_t sys;
1665 	smbios_info_t info;
1666 
1667 	if (vioif_allowed_int_types != -1) {
1668 		/*
1669 		 * If this value was tuned via /etc/system or the debugger,
1670 		 * use the provided value directly.
1671 		 */
1672 		return (vioif_allowed_int_types);
1673 	}
1674 
1675 	if ((id = smbios_info_system(ksmbios, &sys)) == SMB_ERR ||
1676 	    smbios_info_common(ksmbios, id, &info) == SMB_ERR) {
1677 		/*
1678 		 * The system may not have valid SMBIOS data, so ignore a
1679 		 * failure here.
1680 		 */
1681 		return (0);
1682 	}
1683 
1684 	if (strcmp(info.smbi_manufacturer, "Google") == 0 &&
1685 	    strcmp(info.smbi_product, "Google Compute Engine") == 0) {
1686 		/*
1687 		 * An undiagnosed issue with the Google Compute Engine (GCE)
1688 		 * hypervisor exists.  In this environment, no RX interrupts
1689 		 * are received if MSI-X handlers are installed.  This does not
1690 		 * appear to be true for the Virtio SCSI driver.  Fixed
1691 		 * interrupts do appear to work, so we fall back for now:
1692 		 */
1693 		return (DDI_INTR_TYPE_FIXED);
1694 	}
1695 
1696 	return (0);
1697 }
1698 
1699 static int
1700 vioif_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1701 {
1702 	int ret;
1703 	vioif_t *vif;
1704 	virtio_t *vio;
1705 	mac_register_t *macp = NULL;
1706 
1707 	if (cmd != DDI_ATTACH) {
1708 		return (DDI_FAILURE);
1709 	}
1710 
1711 	if ((vio = virtio_init(dip, VIRTIO_NET_WANTED_FEATURES, B_TRUE)) ==
1712 	    NULL) {
1713 		return (DDI_FAILURE);
1714 	}
1715 
1716 	vif = kmem_zalloc(sizeof (*vif), KM_SLEEP);
1717 	vif->vif_dip = dip;
1718 	vif->vif_virtio = vio;
1719 	vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
1720 	ddi_set_driver_private(dip, vif);
1721 
1722 	if ((vif->vif_rx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_RX,
1723 	    "rx", vioif_rx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL ||
1724 	    (vif->vif_tx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_TX,
1725 	    "tx", vioif_tx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL) {
1726 		goto fail;
1727 	}
1728 
1729 	if (virtio_init_complete(vio, vioif_select_interrupt_types()) !=
1730 	    DDI_SUCCESS) {
1731 		dev_err(dip, CE_WARN, "failed to complete Virtio init");
1732 		goto fail;
1733 	}
1734 
1735 	virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);
1736 	virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
1737 
1738 	mutex_init(&vif->vif_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
1739 	mutex_enter(&vif->vif_mutex);
1740 
1741 	vioif_get_mac(vif);
1742 
1743 	vif->vif_rxcopy_thresh = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
1744 	vif->vif_txcopy_thresh = VIOIF_MACPROP_TXCOPY_THRESH_DEF;
1745 
1746 	if (vioif_has_feature(vif, VIRTIO_NET_F_MTU)) {
1747 		vif->vif_mtu_max = virtio_dev_get16(vio, VIRTIO_NET_CONFIG_MTU);
1748 	} else {
1749 		vif->vif_mtu_max = ETHERMTU;
1750 	}
1751 
1752 	vif->vif_mtu = ETHERMTU;
1753 	if (vif->vif_mtu > vif->vif_mtu_max) {
1754 		vif->vif_mtu = vif->vif_mtu_max;
1755 	}
1756 
1757 	vioif_check_features(vif);
1758 
1759 	if (vioif_alloc_bufs(vif) != 0) {
1760 		mutex_exit(&vif->vif_mutex);
1761 		dev_err(dip, CE_WARN, "failed to allocate memory");
1762 		goto fail;
1763 	}
1764 
1765 	mutex_exit(&vif->vif_mutex);
1766 
1767 	if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
1768 		dev_err(dip, CE_WARN, "failed to enable interrupts");
1769 		goto fail;
1770 	}
1771 
1772 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
1773 		dev_err(dip, CE_WARN, "failed to allocate a mac_register");
1774 		goto fail;
1775 	}
1776 
1777 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1778 	macp->m_driver = vif;
1779 	macp->m_dip = dip;
1780 	macp->m_src_addr = vif->vif_mac;
1781 	macp->m_callbacks = &vioif_mac_callbacks;
1782 	macp->m_min_sdu = 0;
1783 	macp->m_max_sdu = vif->vif_mtu;
1784 	macp->m_margin = VLAN_TAGSZ;
1785 	macp->m_priv_props = vioif_priv_props;
1786 
1787 	if ((ret = mac_register(macp, &vif->vif_mac_handle)) != 0) {
1788 		dev_err(dip, CE_WARN, "mac_register() failed (%d)", ret);
1789 		goto fail;
1790 	}
1791 	mac_free(macp);
1792 
1793 	mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);
1794 
1795 	return (DDI_SUCCESS);
1796 
1797 fail:
1798 	vioif_free_bufs(vif);
1799 	if (macp != NULL) {
1800 		mac_free(macp);
1801 	}
1802 	(void) virtio_fini(vio, B_TRUE);
1803 	kmem_free(vif, sizeof (*vif));
1804 	return (DDI_FAILURE);
1805 }
1806 
1807 static int
1808 vioif_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1809 {
1810 	int r;
1811 	vioif_t *vif;
1812 
1813 	if (cmd != DDI_DETACH) {
1814 		return (DDI_FAILURE);
1815 	}
1816 
1817 	if ((vif = ddi_get_driver_private(dip)) == NULL) {
1818 		return (DDI_FAILURE);
1819 	}
1820 
1821 	mutex_enter(&vif->vif_mutex);
1822 	if (vif->vif_runstate != VIOIF_RUNSTATE_STOPPED) {
1823 		dev_err(dip, CE_WARN, "!NIC still running, cannot detach");
1824 		mutex_exit(&vif->vif_mutex);
1825 		return (DDI_FAILURE);
1826 	}
1827 
1828 	/*
1829 	 * There should be no outstanding transmit buffers once the NIC is
1830 	 * completely stopped.
1831 	 */
1832 	VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);
1833 
1834 	/*
1835 	 * Though we cannot claw back all of the receive buffers until we reset
1836 	 * the device, we must ensure all those loaned to MAC have been
1837 	 * returned before calling mac_unregister().
1838 	 */
1839 	if (vif->vif_nrxbufs_onloan > 0) {
1840 		dev_err(dip, CE_WARN, "!%u receive buffers still loaned, "
1841 		    "cannot detach", vif->vif_nrxbufs_onloan);
1842 		mutex_exit(&vif->vif_mutex);
1843 		return (DDI_FAILURE);
1844 	}
1845 
1846 	if ((r = mac_unregister(vif->vif_mac_handle)) != 0) {
1847 		dev_err(dip, CE_WARN, "!MAC unregister failed (%d)", r);
1848 		return (DDI_FAILURE);
1849 	}
1850 
1851 	/*
1852 	 * Shut down the device so that we can recover any previously
1853 	 * submitted receive buffers.
1854 	 */
1855 	virtio_shutdown(vif->vif_virtio);
1856 	for (;;) {
1857 		virtio_chain_t *vic;
1858 
1859 		if ((vic = virtio_queue_evacuate(vif->vif_rx_vq)) == NULL) {
1860 			break;
1861 		}
1862 
1863 		vioif_rxbuf_t *rb = virtio_chain_data(vic);
1864 		vioif_rxbuf_free(vif, rb);
1865 	}
1866 
1867 	/*
1868 	 * vioif_free_bufs() must be called before virtio_fini()
1869 	 * as it uses virtio_chain_free() which itself depends on some
1870 	 * virtio data structures still being around.
1871 	 */
1872 	vioif_free_bufs(vif);
1873 	(void) virtio_fini(vif->vif_virtio, B_FALSE);
1874 
1875 	mutex_exit(&vif->vif_mutex);
1876 	mutex_destroy(&vif->vif_mutex);
1877 
1878 	kmem_free(vif, sizeof (*vif));
1879 
1880 	return (DDI_SUCCESS);
1881 }
1882 
1883 static int
1884 vioif_quiesce(dev_info_t *dip)
1885 {
1886 	vioif_t *vif;
1887 
1888 	if ((vif = ddi_get_driver_private(dip)) == NULL)
1889 		return (DDI_FAILURE);
1890 
1891 	return (virtio_quiesce(vif->vif_virtio));
1892 }
1893 
1894 int
1895 _init(void)
1896 {
1897 	int ret;
1898 
1899 	mac_init_ops(&vioif_dev_ops, "vioif");
1900 
1901 	if ((ret = mod_install(&vioif_modlinkage)) != DDI_SUCCESS) {
1902 		mac_fini_ops(&vioif_dev_ops);
1903 	}
1904 
1905 	return (ret);
1906 }
1907 
1908 int
1909 _fini(void)
1910 {
1911 	int ret;
1912 
1913 	if ((ret = mod_remove(&vioif_modlinkage)) == DDI_SUCCESS) {
1914 		mac_fini_ops(&vioif_dev_ops);
1915 	}
1916 
1917 	return (ret);
1918 }
1919 
1920 int
1921 _info(struct modinfo *modinfop)
1922 {
1923 	return (mod_info(&vioif_modlinkage, modinfop));
1924 }
1925