xref: /illumos-gate/usr/src/uts/common/io/vioif/vioif.c (revision 9b9d39d2a32ff806d2431dbcc50968ef1e6d46b2)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2013 Nexenta Inc.  All rights reserved.
14  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
15  * Copyright 2021 Joyent, Inc.
16  * Copyright 2019 Joshua M. Clulow <josh@sysmgr.org>
17  */
18 
19 /* Based on the NetBSD virtio driver by Minoura Makoto. */
20 /*
21  * Copyright (c) 2010 Minoura Makoto.
22  * All rights reserved.
23  *
24  * Redistribution and use in source and binary forms, with or without
25  * modification, are permitted provided that the following conditions
26  * are met:
27  * 1. Redistributions of source code must retain the above copyright
28  *    notice, this list of conditions and the following disclaimer.
29  * 2. Redistributions in binary form must reproduce the above copyright
30  *    notice, this list of conditions and the following disclaimer in the
31  *    documentation and/or other materials provided with the distribution.
32  *
33  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
34  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
35  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
36  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
37  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
38  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
39  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
40  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
41  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
42  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43  */
44 
45 /*
46  * VIRTIO NETWORK DRIVER
47  */
48 
49 #include <sys/types.h>
50 #include <sys/errno.h>
51 #include <sys/param.h>
52 #include <sys/stropts.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/kmem.h>
56 #include <sys/conf.h>
57 #include <sys/devops.h>
58 #include <sys/ksynch.h>
59 #include <sys/stat.h>
60 #include <sys/modctl.h>
61 #include <sys/debug.h>
62 #include <sys/pci.h>
63 #include <sys/ethernet.h>
64 #include <sys/vlan.h>
65 #include <sys/sysmacros.h>
66 #include <sys/smbios.h>
67 
68 #include <sys/dlpi.h>
69 #include <sys/taskq.h>
70 
71 #include <sys/pattr.h>
72 #include <sys/strsun.h>
73 
74 #include <sys/random.h>
75 #include <sys/containerof.h>
76 #include <sys/stream.h>
77 #include <inet/tcp.h>
78 
79 #include <sys/mac.h>
80 #include <sys/mac_provider.h>
81 #include <sys/mac_ether.h>
82 
83 #include "virtio.h"
84 #include "vioif.h"
85 
86 /*
87  * While most hypervisors support the control queue, older versions of bhyve
88  * on illumos did not. To allow the historic behaviour of the illumos vioif
89  * driver, the following tuneable causes us to pretend that the request always
90  * succeeds if the underlying virtual device does not have support.
91  */
92 int vioif_fake_promisc_success = 1;
93 
94 static int vioif_quiesce(dev_info_t *);
95 static int vioif_attach(dev_info_t *, ddi_attach_cmd_t);
96 static int vioif_detach(dev_info_t *, ddi_detach_cmd_t);
97 static boolean_t vioif_has_feature(vioif_t *, uint32_t);
98 static void vioif_reclaim_restart(vioif_t *);
99 static int vioif_m_stat(void *, uint_t, uint64_t *);
100 static void vioif_m_stop(void *);
101 static int vioif_m_start(void *);
102 static int vioif_m_multicst(void *, boolean_t, const uint8_t *);
103 static int vioif_m_setpromisc(void *, boolean_t);
104 static int vioif_m_unicst(void *, const uint8_t *);
105 static mblk_t *vioif_m_tx(void *, mblk_t *);
106 static int vioif_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
107     const void *);
108 static int vioif_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
109 static void vioif_m_propinfo(void *, const char *, mac_prop_id_t,
110     mac_prop_info_handle_t);
111 static boolean_t vioif_m_getcapab(void *, mac_capab_t, void *);
112 static uint_t vioif_add_rx(vioif_t *);
113 
114 
115 static struct cb_ops vioif_cb_ops = {
116 	.cb_rev =			CB_REV,
117 	.cb_flag =			D_MP | D_NEW,
118 
119 	.cb_open =			nulldev,
120 	.cb_close =			nulldev,
121 	.cb_strategy =			nodev,
122 	.cb_print =			nodev,
123 	.cb_dump =			nodev,
124 	.cb_read =			nodev,
125 	.cb_write =			nodev,
126 	.cb_ioctl =			nodev,
127 	.cb_devmap =			nodev,
128 	.cb_mmap =			nodev,
129 	.cb_segmap =			nodev,
130 	.cb_chpoll =			nochpoll,
131 	.cb_prop_op =			ddi_prop_op,
132 	.cb_str =			NULL,
133 	.cb_aread =			nodev,
134 	.cb_awrite =			nodev,
135 };
136 
137 static struct dev_ops vioif_dev_ops = {
138 	.devo_rev =			DEVO_REV,
139 	.devo_refcnt =			0,
140 
141 	.devo_attach =			vioif_attach,
142 	.devo_detach =			vioif_detach,
143 	.devo_quiesce =			vioif_quiesce,
144 
145 	.devo_cb_ops =			&vioif_cb_ops,
146 
147 	.devo_getinfo =			NULL,
148 	.devo_identify =		nulldev,
149 	.devo_probe =			nulldev,
150 	.devo_reset =			nodev,
151 	.devo_bus_ops =			NULL,
152 	.devo_power =			NULL,
153 };
154 
155 static struct modldrv vioif_modldrv = {
156 	.drv_modops =			&mod_driverops,
157 	.drv_linkinfo =			"VIRTIO network driver",
158 	.drv_dev_ops =			&vioif_dev_ops
159 };
160 
161 static struct modlinkage vioif_modlinkage = {
162 	.ml_rev =			MODREV_1,
163 	.ml_linkage =			{ &vioif_modldrv, NULL }
164 };
165 
166 static mac_callbacks_t vioif_mac_callbacks = {
167 	.mc_getstat =			vioif_m_stat,
168 	.mc_start =			vioif_m_start,
169 	.mc_stop =			vioif_m_stop,
170 	.mc_setpromisc =		vioif_m_setpromisc,
171 	.mc_multicst =			vioif_m_multicst,
172 	.mc_unicst =			vioif_m_unicst,
173 	.mc_tx =			vioif_m_tx,
174 
175 	.mc_callbacks =			(MC_GETCAPAB | MC_SETPROP |
176 					    MC_GETPROP | MC_PROPINFO),
177 	.mc_getcapab =			vioif_m_getcapab,
178 	.mc_setprop =			vioif_m_setprop,
179 	.mc_getprop =			vioif_m_getprop,
180 	.mc_propinfo =			vioif_m_propinfo,
181 };
182 
183 static const uchar_t vioif_broadcast[ETHERADDRL] = {
184 	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
185 };
186 
187 /*
188  * Interval, in milliseconds, for the periodic TX reclaim.
189  */
190 uint_t vioif_reclaim_ms = 200;
191 
192 /*
193  * Allow the operator to override the kinds of interrupts we'll use for
194  * vioif.  This value defaults to -1 so that it can be overridden to 0 in
195  * /etc/system.
196  */
197 int vioif_allowed_int_types = -1;
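/*
 * For example, a line such as the following in /etc/system would restrict
 * the driver to fixed interrupts (DDI_INTR_TYPE_FIXED is 0x1); the value
 * shown here is illustrative rather than a shipped default:
 *
 *	set vioif:vioif_allowed_int_types = 0x1
 */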
198 
199 /*
200  * DMA attribute template for transmit and receive buffers.  The SGL entry
201  * count will be modified before using the template.  Note that these
202  * allocations are aligned so that VIOIF_HEADER_SKIP places the IP header in
203  * received frames at the correct offset for the networking stack.
204  */
205 ddi_dma_attr_t vioif_dma_attr_bufs = {
206 	.dma_attr_version =		DMA_ATTR_V0,
207 	.dma_attr_addr_lo =		0x0000000000000000,
208 	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
209 	.dma_attr_count_max =		0x00000000FFFFFFFF,
210 	.dma_attr_align =		VIOIF_HEADER_ALIGN,
211 	.dma_attr_burstsizes =		1,
212 	.dma_attr_minxfer =		1,
213 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
214 	.dma_attr_seg =			0x00000000FFFFFFFF,
215 	.dma_attr_sgllen =		0,
216 	.dma_attr_granular =		1,
217 	.dma_attr_flags =		0
218 };
219 
220 /*
221  * DMA attributes for mapping larger transmit buffers from the networking
222  * stack.  The requirements are quite loose, but note that the SGL entry length
223  * field is 32-bit.
224  */
225 ddi_dma_attr_t vioif_dma_attr_external = {
226 	.dma_attr_version =		DMA_ATTR_V0,
227 	.dma_attr_addr_lo =		0x0000000000000000,
228 	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
229 	.dma_attr_count_max =		0x00000000FFFFFFFF,
230 	.dma_attr_align =		1,
231 	.dma_attr_burstsizes =		1,
232 	.dma_attr_minxfer =		1,
233 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
234 	.dma_attr_seg =			0x00000000FFFFFFFF,
235 	.dma_attr_sgllen =		VIOIF_MAX_SEGS - 1,
236 	.dma_attr_granular =		1,
237 	.dma_attr_flags =		0
238 };
239 
240 
241 /*
242  * VIRTIO NET MAC PROPERTIES
243  */
244 #define	VIOIF_MACPROP_TXCOPY_THRESH	"_txcopy_thresh"
245 #define	VIOIF_MACPROP_TXCOPY_THRESH_DEF	300
246 #define	VIOIF_MACPROP_TXCOPY_THRESH_MAX	640
247 
248 #define	VIOIF_MACPROP_RXCOPY_THRESH	"_rxcopy_thresh"
249 #define	VIOIF_MACPROP_RXCOPY_THRESH_DEF	300
250 #define	VIOIF_MACPROP_RXCOPY_THRESH_MAX	640
251 
252 static char *vioif_priv_props[] = {
253 	VIOIF_MACPROP_TXCOPY_THRESH,
254 	VIOIF_MACPROP_RXCOPY_THRESH,
255 	NULL
256 };
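/*
 * These private properties are visible to dladm(8) and can be inspected or
 * tuned at runtime, for example (the link name "vioif0" is illustrative):
 *
 *	dladm show-linkprop -p _txcopy_thresh vioif0
 *	dladm set-linkprop -p _rxcopy_thresh=512 vioif0
 */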
257 
258 
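/*
 * Buffer free list helpers.  Transmit, receive, and control buffers are all
 * preallocated and kept on per-device free lists; the "alloc" routines below
 * take a buffer off the head of the appropriate list (returning NULL when
 * the list is empty), and the "free" routines clear the descriptor chain and
 * return the buffer to the list.  The allocation counters are used later to
 * verify that every buffer has been returned before teardown.
 */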
259 static vioif_txbuf_t *
260 vioif_txbuf_alloc(vioif_t *vif)
261 {
262 	vioif_txbuf_t *tb;
263 
264 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
265 
266 	if ((tb = list_remove_head(&vif->vif_txbufs)) != NULL) {
267 		vif->vif_ntxbufs_alloc++;
268 	}
269 
270 	return (tb);
271 }
272 
273 static void
274 vioif_txbuf_free(vioif_t *vif, vioif_txbuf_t *tb)
275 {
276 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
277 
278 	VERIFY3U(vif->vif_ntxbufs_alloc, >, 0);
279 	vif->vif_ntxbufs_alloc--;
280 
281 	virtio_chain_clear(tb->tb_chain);
282 	list_insert_head(&vif->vif_txbufs, tb);
283 }
284 
285 static vioif_rxbuf_t *
286 vioif_rxbuf_alloc(vioif_t *vif)
287 {
288 	vioif_rxbuf_t *rb;
289 
290 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
291 
292 	if ((rb = list_remove_head(&vif->vif_rxbufs)) != NULL) {
293 		vif->vif_nrxbufs_alloc++;
294 	}
295 
296 	return (rb);
297 }
298 
299 static void
300 vioif_rxbuf_free(vioif_t *vif, vioif_rxbuf_t *rb)
301 {
302 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
303 
304 	VERIFY3U(vif->vif_nrxbufs_alloc, >, 0);
305 	vif->vif_nrxbufs_alloc--;
306 
307 	virtio_chain_clear(rb->rb_chain);
308 	list_insert_head(&vif->vif_rxbufs, rb);
309 }
310 
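/*
 * Free routine (see desballoc(9F)) for receive buffers that have been loaned
 * up to the networking stack.  It is invoked when the upper layers free the
 * corresponding mblk, and returns the buffer to our free list before trying
 * to replenish the receive queue with it.
 */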
311 static void
312 vioif_rx_free_callback(caddr_t free_arg)
313 {
314 	vioif_rxbuf_t *rb = (vioif_rxbuf_t *)free_arg;
315 	vioif_t *vif = rb->rb_vioif;
316 
317 	mutex_enter(&vif->vif_mutex);
318 
319 	/*
320 	 * Return this receive buffer to the free list.
321 	 */
322 	vioif_rxbuf_free(vif, rb);
323 
324 	VERIFY3U(vif->vif_nrxbufs_onloan, >, 0);
325 	vif->vif_nrxbufs_onloan--;
326 
327 	/*
328 	 * Attempt to replenish the receive queue with at least the buffer we
329 	 * just freed.  There is no great way to deal with failure here, but
330 	 * because we only loan out at most half of the buffers, some should
331 	 * always remain available even if this attempt fails.
332 	 */
333 	(void) vioif_add_rx(vif);
334 
335 	mutex_exit(&vif->vif_mutex);
336 }
337 
338 static vioif_ctrlbuf_t *
339 vioif_ctrlbuf_alloc(vioif_t *vif)
340 {
341 	vioif_ctrlbuf_t *cb;
342 
343 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
344 
345 	if ((cb = list_remove_head(&vif->vif_ctrlbufs)) != NULL) {
346 		vif->vif_nctrlbufs_alloc++;
347 	}
348 
349 	return (cb);
350 }
351 
352 static void
353 vioif_ctrlbuf_free(vioif_t *vif, vioif_ctrlbuf_t *cb)
354 {
355 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
356 
357 	VERIFY3U(vif->vif_nctrlbufs_alloc, >, 0);
358 	vif->vif_nctrlbufs_alloc--;
359 
360 	virtio_chain_clear(cb->cb_chain);
361 	list_insert_head(&vif->vif_ctrlbufs, cb);
362 }
363 
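/*
 * Release all DMA memory and descriptor chains associated with the transmit,
 * receive, and control buffers.  This is written to cope with a partially
 * constructed set of buffers, so it also serves as the error cleanup path
 * for vioif_alloc_bufs().
 */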
364 static void
365 vioif_free_bufs(vioif_t *vif)
366 {
367 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
368 
369 	VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);
370 	for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
371 		vioif_txbuf_t *tb = &vif->vif_txbufs_mem[i];
372 
373 		/*
374 		 * Ensure that this txbuf is now in the free list:
375 		 */
376 		VERIFY(list_link_active(&tb->tb_link));
377 		list_remove(&vif->vif_txbufs, tb);
378 
379 		/*
380 		 * We should not have an mblk chain at this point.
381 		 */
382 		VERIFY3P(tb->tb_mp, ==, NULL);
383 
384 		if (tb->tb_dma != NULL) {
385 			virtio_dma_free(tb->tb_dma);
386 			tb->tb_dma = NULL;
387 		}
388 
389 		if (tb->tb_chain != NULL) {
390 			virtio_chain_free(tb->tb_chain);
391 			tb->tb_chain = NULL;
392 		}
393 
394 		if (tb->tb_dmaext != NULL) {
395 			for (uint_t j = 0; j < tb->tb_dmaext_capacity; j++) {
396 				if (tb->tb_dmaext[j] != NULL) {
397 					virtio_dma_free(
398 					    tb->tb_dmaext[j]);
399 					tb->tb_dmaext[j] = NULL;
400 				}
401 			}
402 
403 			kmem_free(tb->tb_dmaext,
404 			    sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity);
405 			tb->tb_dmaext = NULL;
406 			tb->tb_dmaext_capacity = 0;
407 		}
408 	}
409 	VERIFY(list_is_empty(&vif->vif_txbufs));
410 	if (vif->vif_txbufs_mem != NULL) {
411 		kmem_free(vif->vif_txbufs_mem,
412 		    sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity);
413 		vif->vif_txbufs_mem = NULL;
414 		vif->vif_txbufs_capacity = 0;
415 	}
416 
417 	VERIFY3U(vif->vif_nrxbufs_alloc, ==, 0);
418 	for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
419 		vioif_rxbuf_t *rb = &vif->vif_rxbufs_mem[i];
420 
421 		/*
422 		 * Ensure that this rxbuf is now in the free list:
423 		 */
424 		VERIFY(list_link_active(&rb->rb_link));
425 		list_remove(&vif->vif_rxbufs, rb);
426 
427 		if (rb->rb_dma != NULL) {
428 			virtio_dma_free(rb->rb_dma);
429 			rb->rb_dma = NULL;
430 		}
431 
432 		if (rb->rb_chain != NULL) {
433 			virtio_chain_free(rb->rb_chain);
434 			rb->rb_chain = NULL;
435 		}
436 	}
437 	VERIFY(list_is_empty(&vif->vif_rxbufs));
438 	if (vif->vif_rxbufs_mem != NULL) {
439 		kmem_free(vif->vif_rxbufs_mem,
440 		    sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity);
441 		vif->vif_rxbufs_mem = NULL;
442 		vif->vif_rxbufs_capacity = 0;
443 	}
444 
445 	if (vif->vif_has_ctrlq) {
446 		VERIFY3U(vif->vif_nctrlbufs_alloc, ==, 0);
447 		for (uint_t i = 0; i < vif->vif_ctrlbufs_capacity; i++) {
448 			vioif_ctrlbuf_t *cb = &vif->vif_ctrlbufs_mem[i];
449 
450 			/*
451 			 * Ensure that this ctrlbuf is now in the free list
452 			 */
453 			VERIFY(list_link_active(&cb->cb_link));
454 			list_remove(&vif->vif_ctrlbufs, cb);
455 
456 			if (cb->cb_dma != NULL) {
457 				virtio_dma_free(cb->cb_dma);
458 				cb->cb_dma = NULL;
459 			}
460 
461 			if (cb->cb_chain != NULL) {
462 				virtio_chain_free(cb->cb_chain);
463 				cb->cb_chain = NULL;
464 			}
465 		}
466 		VERIFY(list_is_empty(&vif->vif_ctrlbufs));
467 		if (vif->vif_ctrlbufs_mem != NULL) {
468 			kmem_free(vif->vif_ctrlbufs_mem,
469 			    sizeof (vioif_ctrlbuf_t) *
470 			    vif->vif_ctrlbufs_capacity);
471 			vif->vif_ctrlbufs_mem = NULL;
472 			vif->vif_ctrlbufs_capacity = 0;
473 		}
474 	}
475 }
476 
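/*
 * Allocate the buffer tracking structures, DMA memory, and descriptor chains
 * for the transmit and receive rings, and for the control queue when it has
 * been negotiated.  Returns 0 on success or ENOMEM after tearing down any
 * partial allocations.
 */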
477 static int
478 vioif_alloc_bufs(vioif_t *vif)
479 {
480 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
481 
482 	/*
483 	 * Allocate one contiguous chunk of memory for the transmit and receive
484 	 * buffer tracking objects.  If the ring is unusually small, we'll
485 	 * reduce our target buffer count accordingly.
486 	 */
487 	vif->vif_txbufs_capacity = MIN(VIRTIO_NET_TX_BUFS,
488 	    virtio_queue_size(vif->vif_tx_vq));
489 	vif->vif_txbufs_mem = kmem_zalloc(
490 	    sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity, KM_SLEEP);
491 	list_create(&vif->vif_txbufs, sizeof (vioif_txbuf_t),
492 	    offsetof(vioif_txbuf_t, tb_link));
493 
494 	vif->vif_rxbufs_capacity = MIN(VIRTIO_NET_RX_BUFS,
495 	    virtio_queue_size(vif->vif_rx_vq));
496 	vif->vif_rxbufs_mem = kmem_zalloc(
497 	    sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity, KM_SLEEP);
498 	list_create(&vif->vif_rxbufs, sizeof (vioif_rxbuf_t),
499 	    offsetof(vioif_rxbuf_t, rb_link));
500 
501 	if (vif->vif_has_ctrlq) {
502 		vif->vif_ctrlbufs_capacity = MIN(VIRTIO_NET_CTRL_BUFS,
503 		    virtio_queue_size(vif->vif_ctrl_vq));
504 		vif->vif_ctrlbufs_mem = kmem_zalloc(
505 		    sizeof (vioif_ctrlbuf_t) * vif->vif_ctrlbufs_capacity,
506 		    KM_SLEEP);
507 	}
508 	list_create(&vif->vif_ctrlbufs, sizeof (vioif_ctrlbuf_t),
509 	    offsetof(vioif_ctrlbuf_t, cb_link));
510 
511 	/*
512 	 * Do not loan more than half of our allocated receive buffers into
513 	 * the networking stack.
514 	 */
515 	vif->vif_nrxbufs_onloan_max = vif->vif_rxbufs_capacity / 2;
516 
517 	/*
518 	 * Put everything in the free list straight away in order to simplify
519 	 * the use of vioif_free_bufs() for cleanup on allocation failure.
520 	 */
521 	for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
522 		list_insert_tail(&vif->vif_txbufs, &vif->vif_txbufs_mem[i]);
523 	}
524 	for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
525 		list_insert_tail(&vif->vif_rxbufs, &vif->vif_rxbufs_mem[i]);
526 	}
527 	for (uint_t i = 0; i < vif->vif_ctrlbufs_capacity; i++) {
528 		list_insert_tail(&vif->vif_ctrlbufs, &vif->vif_ctrlbufs_mem[i]);
529 	}
530 
531 	/*
532 	 * Start from the DMA attribute template common to both transmit and
533 	 * receive buffers.  The SGL entry count will be modified for each
534 	 * buffer type.
535 	 */
536 	ddi_dma_attr_t attr = vioif_dma_attr_bufs;
537 
538 	/*
539 	 * The transmit inline buffer is small (less than a page), so it's
540 	 * reasonable to request a single cookie.
541 	 */
542 	attr.dma_attr_sgllen = 1;
543 
544 	for (vioif_txbuf_t *tb = list_head(&vif->vif_txbufs); tb != NULL;
545 	    tb = list_next(&vif->vif_txbufs, tb)) {
546 		if ((tb->tb_dma = virtio_dma_alloc(vif->vif_virtio,
547 		    VIOIF_TX_INLINE_SIZE, &attr,
548 		    DDI_DMA_STREAMING | DDI_DMA_WRITE, KM_SLEEP)) == NULL) {
549 			goto fail;
550 		}
551 		VERIFY3U(virtio_dma_ncookies(tb->tb_dma), ==, 1);
552 
553 		if ((tb->tb_chain = virtio_chain_alloc(vif->vif_tx_vq,
554 		    KM_SLEEP)) == NULL) {
555 			goto fail;
556 		}
557 		virtio_chain_data_set(tb->tb_chain, tb);
558 
559 		tb->tb_dmaext_capacity = VIOIF_MAX_SEGS - 1;
560 		tb->tb_dmaext = kmem_zalloc(
561 		    sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity,
562 		    KM_SLEEP);
563 	}
564 
565 	/*
566 	 * Control queue buffers are also small (less than a page), so we'll
567 	 * also request a single cookie for them.
568 	 */
569 	for (vioif_ctrlbuf_t *cb = list_head(&vif->vif_ctrlbufs); cb != NULL;
570 	    cb = list_next(&vif->vif_ctrlbufs, cb)) {
571 		if ((cb->cb_dma = virtio_dma_alloc(vif->vif_virtio,
572 		    VIOIF_CTRL_SIZE, &attr,
573 		    DDI_DMA_STREAMING | DDI_DMA_RDWR, KM_SLEEP)) == NULL) {
574 			goto fail;
575 		}
576 		VERIFY3U(virtio_dma_ncookies(cb->cb_dma), ==, 1);
577 
578 		if ((cb->cb_chain = virtio_chain_alloc(vif->vif_ctrl_vq,
579 		    KM_SLEEP)) == NULL) {
580 			goto fail;
581 		}
582 		virtio_chain_data_set(cb->cb_chain, cb);
583 	}
584 
585 	/*
586 	 * The receive buffers are larger, and we can tolerate a large number
587 	 * of segments.  Adjust the SGL entry count, setting aside one segment
588 	 * for the virtio net header.
589 	 */
590 	attr.dma_attr_sgllen = VIOIF_MAX_SEGS - 1;
591 
592 	for (vioif_rxbuf_t *rb = list_head(&vif->vif_rxbufs); rb != NULL;
593 	    rb = list_next(&vif->vif_rxbufs, rb)) {
594 		if ((rb->rb_dma = virtio_dma_alloc(vif->vif_virtio,
595 		    VIOIF_RX_BUF_SIZE, &attr, DDI_DMA_STREAMING | DDI_DMA_READ,
596 		    KM_SLEEP)) == NULL) {
597 			goto fail;
598 		}
599 
600 		if ((rb->rb_chain = virtio_chain_alloc(vif->vif_rx_vq,
601 		    KM_SLEEP)) == NULL) {
602 			goto fail;
603 		}
604 		virtio_chain_data_set(rb->rb_chain, rb);
605 
606 		/*
607 		 * Ensure that the first cookie is sufficient to cover the
608 		 * header skip region plus one byte.
609 		 */
610 		VERIFY3U(virtio_dma_cookie_size(rb->rb_dma, 0), >=,
611 		    VIOIF_HEADER_SKIP + 1);
612 
613 		/*
614 		 * Ensure that the frame data begins at a location with a
615 		 * correctly aligned IP header.
616 		 */
617 		VERIFY3U((uintptr_t)virtio_dma_va(rb->rb_dma,
618 		    VIOIF_HEADER_SKIP) % 4, ==, 2);
619 
620 		rb->rb_vioif = vif;
621 		rb->rb_frtn.free_func = vioif_rx_free_callback;
622 		rb->rb_frtn.free_arg = (caddr_t)rb;
623 	}
624 
625 	return (0);
626 
627 fail:
628 	vioif_free_bufs(vif);
629 	return (ENOMEM);
630 }
631 
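/*
 * Send a single command on the control virtqueue and busy-wait for the
 * device to acknowledge it.  The caller provides the command class and code
 * along with a small amount of class-specific data; the routine builds the
 * three-descriptor chain described below (header, data, ack byte) and
 * returns DDI_SUCCESS only if the device reports VIRTIO_NET_CQ_OK.
 */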
632 static int
633 vioif_ctrlq_req(vioif_t *vif, uint8_t class, uint8_t cmd, void *data,
634     size_t datalen)
635 {
636 	vioif_ctrlbuf_t *cb = NULL;
637 	virtio_chain_t *vic = NULL;
638 	uint8_t *p = NULL;
639 	uint64_t pa = 0;
640 	uint8_t *ackp = NULL;
641 	struct virtio_net_ctrlq_hdr hdr = {
642 		.vnch_class = class,
643 		.vnch_command = cmd,
644 	};
645 	const size_t hdrlen = sizeof (hdr);
646 	const size_t acklen = 1; /* the ack is always 1 byte */
647 	size_t totlen = hdrlen + datalen + acklen;
648 	int r = DDI_SUCCESS;
649 
650 	/*
651 	 * We shouldn't be called unless the ctrlq feature has been
652 	 * negotiated with the host.
653 	 */
654 	VERIFY(vif->vif_has_ctrlq);
655 
656 	mutex_enter(&vif->vif_mutex);
657 	cb = vioif_ctrlbuf_alloc(vif);
658 	if (cb == NULL) {
659 		vif->vif_noctrlbuf++;
660 		mutex_exit(&vif->vif_mutex);
661 		r = DDI_FAILURE;
662 		goto done;
663 	}
664 	mutex_exit(&vif->vif_mutex);
665 
666 	if (totlen > virtio_dma_size(cb->cb_dma)) {
667 		vif->vif_ctrlbuf_toosmall++;
668 		r = DDI_FAILURE;
669 		goto done;
670 	}
671 
672 	/*
673 	 * Clear the entire buffer.  This is not strictly necessary, but it
674 	 * can be useful when troubleshooting, and it ensures that no stale
675 	 * data from an earlier request lingers in the buffer.
676 	 */
677 	p = virtio_dma_va(cb->cb_dma, 0);
678 	bzero(p, virtio_dma_size(cb->cb_dma));
679 
680 	/*
681 	 * We currently do not support VIRTIO_F_ANY_LAYOUT. That means,
682 	 * that we must put the header, the data, and the ack in their
683 	 * own respective descriptors. Since all the currently supported
684 	 * control queue commands take _very_ small amounts of data, we
685 	 * use a single DMA buffer for all of it, but use 3 descriptors to
686 	 * reference (respectively) the header, the data, and the ack byte
687 	 * within that memory to adhere to the virtio spec.
688 	 *
689 	 * If we add support for control queue features such as custom
690 	 * MAC filtering tables, which might require larger amounts of
691 	 * memory, we likely will want to add more sophistication here
692 	 * and optionally use additional allocated memory to hold that
693 	 * data instead of a fixed size buffer.
694 	 *
695 	 * Copy the header.
696 	 */
697 	bcopy(&hdr, p, sizeof (hdr));
698 	pa = virtio_dma_cookie_pa(cb->cb_dma, 0);
699 	if ((r = virtio_chain_append(cb->cb_chain,
700 	    pa, hdrlen, VIRTIO_DIR_DEVICE_READS)) != DDI_SUCCESS) {
701 		goto done;
702 	}
703 
704 	/*
705 	 * Copy the request data
706 	 */
707 	p = virtio_dma_va(cb->cb_dma, hdrlen);
708 	bcopy(data, p, datalen);
709 	if ((r = virtio_chain_append(cb->cb_chain,
710 	    pa + hdrlen, datalen, VIRTIO_DIR_DEVICE_READS)) != DDI_SUCCESS) {
711 		goto done;
712 	}
713 
714 	/*
715 	 * We already cleared the buffer, so don't need to copy out a 0 for
716 	 * the ack byte. Just add a descriptor for that spot.
717 	 */
718 	ackp = virtio_dma_va(cb->cb_dma, hdrlen + datalen);
719 	if ((r = virtio_chain_append(cb->cb_chain,
720 	    pa + hdrlen + datalen, acklen,
721 	    VIRTIO_DIR_DEVICE_WRITES)) != DDI_SUCCESS) {
722 		goto done;
723 	}
724 
725 	virtio_dma_sync(cb->cb_dma, DDI_DMA_SYNC_FORDEV);
726 	virtio_chain_submit(cb->cb_chain, B_TRUE);
727 
728 	/*
729 	 * Spin waiting for response.
730 	 */
731 	mutex_enter(&vif->vif_mutex);
732 	while ((vic = virtio_queue_poll(vif->vif_ctrl_vq)) == NULL) {
733 		mutex_exit(&vif->vif_mutex);
734 		delay(drv_usectohz(1000));
735 		mutex_enter(&vif->vif_mutex);
736 	}
737 
738 	virtio_dma_sync(cb->cb_dma, DDI_DMA_SYNC_FORCPU);
739 	VERIFY3P(virtio_chain_data(vic), ==, cb);
740 	mutex_exit(&vif->vif_mutex);
741 
742 	if (*ackp != VIRTIO_NET_CQ_OK) {
743 		r = DDI_FAILURE;
744 	}
745 
746 done:
747 	mutex_enter(&vif->vif_mutex);
748 	if (cb != NULL)
749 		vioif_ctrlbuf_free(vif, cb);
750 	mutex_exit(&vif->vif_mutex);
751 	return (r);
752 }
753 
754 static int
755 vioif_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr)
756 {
757 	/*
758 	 * Even though we currently do not have support for programming
759 	 * multicast filters, or even enabling promiscuous mode, we return
760 	 * success here to avoid the networking stack falling back to link
761 	 * layer broadcast for multicast traffic.  Some hypervisors already
762 	 * pass received multicast frames onto the guest, so at least on those
763 	 * systems multicast will work as expected anyway.
764 	 */
765 	return (0);
766 }
767 
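/*
 * MAC callback to enable or disable promiscuous reception.  The request is
 * issued to the device over the control queue; if VIRTIO_NET_F_CTRL_RX was
 * not negotiated, we either pretend the request succeeded (see
 * vioif_fake_promisc_success above) or return ENOTSUP.
 */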
768 static int
769 vioif_m_setpromisc(void *arg, boolean_t on)
770 {
771 	vioif_t *vif = arg;
772 	uint8_t val = on ? 1 : 0;
773 
774 	if (!vif->vif_has_ctrlq_rx) {
775 		if (vioif_fake_promisc_success)
776 			return (0);
777 
778 		return (ENOTSUP);
779 	}
780 
781 	return (vioif_ctrlq_req(vif, VIRTIO_NET_CTRL_RX,
782 	    VIRTIO_NET_CTRL_RX_PROMISC, &val, sizeof (val)));
783 }
784 
785 static int
786 vioif_m_unicst(void *arg, const uint8_t *mac)
787 {
788 	return (ENOTSUP);
789 }
790 
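/*
 * Offer free receive buffers to the device, returning the number of buffers
 * that were successfully added to the RX virtqueue.  The caller must hold
 * vif_mutex.
 */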
791 static uint_t
792 vioif_add_rx(vioif_t *vif)
793 {
794 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
795 
796 	if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
797 		/*
798 		 * If the NIC is not running, do not give the device any
799 		 * receive buffers.
800 		 */
801 		return (0);
802 	}
803 
804 	uint_t num_added = 0;
805 
806 	vioif_rxbuf_t *rb;
807 	while ((rb = vioif_rxbuf_alloc(vif)) != NULL) {
808 		/*
809 		 * For legacy devices, and those that have not negotiated
810 		 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a
811 		 * separate descriptor entry to the rest of the buffer.
812 		 */
813 		if (virtio_chain_append(rb->rb_chain,
814 		    virtio_dma_cookie_pa(rb->rb_dma, 0),
815 		    sizeof (struct virtio_net_hdr),
816 		    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
817 			goto fail;
818 		}
819 
820 		for (uint_t n = 0; n < virtio_dma_ncookies(rb->rb_dma); n++) {
821 			uint64_t pa = virtio_dma_cookie_pa(rb->rb_dma, n);
822 			size_t sz = virtio_dma_cookie_size(rb->rb_dma, n);
823 
824 			if (n == 0) {
825 				pa += VIOIF_HEADER_SKIP;
826 				VERIFY3U(sz, >, VIOIF_HEADER_SKIP);
827 				sz -= VIOIF_HEADER_SKIP;
828 			}
829 
830 			if (virtio_chain_append(rb->rb_chain, pa, sz,
831 			    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
832 				goto fail;
833 			}
834 		}
835 
836 		virtio_chain_submit(rb->rb_chain, B_FALSE);
837 		num_added++;
838 		continue;
839 
840 fail:
841 		vioif_rxbuf_free(vif, rb);
842 		vif->vif_norecvbuf++;
843 		break;
844 	}
845 
846 	if (num_added > 0) {
847 		virtio_queue_flush(vif->vif_rx_vq);
848 	}
849 
850 	return (num_added);
851 }
852 
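/*
 * Pull completed chains off the RX virtqueue and turn each one into an mblk,
 * copying small frames and loaning larger buffers up to the stack (subject
 * to the loan limit).  The assembled chain is delivered to MAC and the
 * number of chains processed is returned.  The caller must hold vif_mutex;
 * it is dropped and reacquired around allocation and the upcall into MAC.
 */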
853 static uint_t
854 vioif_process_rx(vioif_t *vif)
855 {
856 	virtio_chain_t *vic;
857 	mblk_t *mphead = NULL, *lastmp = NULL, *mp;
858 	uint_t num_processed = 0;
859 
860 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
861 
862 	while ((vic = virtio_queue_poll(vif->vif_rx_vq)) != NULL) {
863 		/*
864 		 * We have to use the chain received length here, as the device
865 		 * does not tell us the received frame length any other way.
866 		 * In a limited survey of hypervisors, virtio network devices
867 		 * appear to provide the right value here.
868 		 */
869 		size_t len = virtio_chain_received_length(vic);
870 		vioif_rxbuf_t *rb = virtio_chain_data(vic);
871 
872 		virtio_dma_sync(rb->rb_dma, DDI_DMA_SYNC_FORCPU);
873 
874 		/*
875 		 * If the NIC is not running, discard any received frames.
876 		 */
877 		if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
878 			vioif_rxbuf_free(vif, rb);
879 			continue;
880 		}
881 
882 		if (len < sizeof (struct virtio_net_hdr)) {
883 			vif->vif_rxfail_chain_undersize++;
884 			vif->vif_ierrors++;
885 			vioif_rxbuf_free(vif, rb);
886 			continue;
887 		}
888 		len -= sizeof (struct virtio_net_hdr);
889 
890 		/*
891 		 * We copy small packets that happen to fit into a single
892 		 * cookie and reuse the buffers. For bigger ones, we loan
893 		 * the buffers upstream.
894 		 */
895 		if (len < vif->vif_rxcopy_thresh ||
896 		    vif->vif_nrxbufs_onloan >= vif->vif_nrxbufs_onloan_max) {
897 			mutex_exit(&vif->vif_mutex);
898 			if ((mp = allocb(len, 0)) == NULL) {
899 				mutex_enter(&vif->vif_mutex);
900 				vif->vif_norecvbuf++;
901 				vif->vif_ierrors++;
902 
903 				vioif_rxbuf_free(vif, rb);
904 				continue;
905 			}
906 
907 			bcopy(virtio_dma_va(rb->rb_dma, VIOIF_HEADER_SKIP),
908 			    mp->b_rptr, len);
909 			mp->b_wptr = mp->b_rptr + len;
910 
911 			/*
912 			 * As the packet contents was copied rather than
913 			 * loaned, we can return the receive buffer resources
914 			 * to the free list.
915 			 */
916 			mutex_enter(&vif->vif_mutex);
917 			vioif_rxbuf_free(vif, rb);
918 
919 		} else {
920 			mutex_exit(&vif->vif_mutex);
921 			if ((mp = desballoc(virtio_dma_va(rb->rb_dma,
922 			    VIOIF_HEADER_SKIP), len, 0,
923 			    &rb->rb_frtn)) == NULL) {
924 				mutex_enter(&vif->vif_mutex);
925 				vif->vif_norecvbuf++;
926 				vif->vif_ierrors++;
927 
928 				vioif_rxbuf_free(vif, rb);
929 				continue;
930 			}
931 			mp->b_wptr = mp->b_rptr + len;
932 
933 			mutex_enter(&vif->vif_mutex);
934 			vif->vif_nrxbufs_onloan++;
935 		}
936 
937 		/*
938 		 * virtio-net does not tell us if this packet is multicast
939 		 * or broadcast, so we have to check it.
940 		 */
941 		if (mp->b_rptr[0] & 0x1) {
942 			if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0)
943 				vif->vif_multircv++;
944 			else
945 				vif->vif_brdcstrcv++;
946 		}
947 
948 		vif->vif_rbytes += len;
949 		vif->vif_ipackets++;
950 
951 		if (lastmp == NULL) {
952 			mphead = mp;
953 		} else {
954 			lastmp->b_next = mp;
955 		}
956 		lastmp = mp;
957 		num_processed++;
958 	}
959 
960 	if (mphead != NULL) {
961 		if (vif->vif_runstate == VIOIF_RUNSTATE_RUNNING) {
962 			mutex_exit(&vif->vif_mutex);
963 			mac_rx(vif->vif_mac_handle, NULL, mphead);
964 			mutex_enter(&vif->vif_mutex);
965 		} else {
966 			/*
967 			 * The NIC was disabled part way through our execution,
968 			 * so free the messages we allocated.
969 			 */
970 			freemsgchain(mphead);
971 		}
972 	}
973 
974 	return (num_processed);
975 }
976 
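/*
 * Reclaim transmit descriptors that the device has finished with: unbind any
 * externally mapped mblk chains, free the messages, and return the transmit
 * buffers to the free list.  If transmission had been corked waiting for
 * descriptors, disable the TX interrupt and tell MAC it may resume.  Called
 * without vif_mutex held.
 */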
977 static uint_t
978 vioif_reclaim_used_tx(vioif_t *vif)
979 {
980 	virtio_chain_t *vic;
981 	uint_t num_reclaimed = 0;
982 
983 	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
984 
985 	while ((vic = virtio_queue_poll(vif->vif_tx_vq)) != NULL) {
986 		vioif_txbuf_t *tb = virtio_chain_data(vic);
987 
988 		if (tb->tb_mp != NULL) {
989 			/*
990 			 * Unbind the external mapping.
991 			 */
992 			for (uint_t i = 0; i < tb->tb_dmaext_capacity; i++) {
993 				if (tb->tb_dmaext[i] == NULL) {
994 					continue;
995 				}
996 
997 				virtio_dma_unbind(tb->tb_dmaext[i]);
998 			}
999 
1000 			freemsg(tb->tb_mp);
1001 			tb->tb_mp = NULL;
1002 		}
1003 
1004 		/*
1005 		 * Return this transmit buffer to the free list for reuse.
1006 		 */
1007 		mutex_enter(&vif->vif_mutex);
1008 		vioif_txbuf_free(vif, tb);
1009 		mutex_exit(&vif->vif_mutex);
1010 
1011 		num_reclaimed++;
1012 	}
1013 
1014 	/* Return ring to transmitting state if descriptors were reclaimed. */
1015 	if (num_reclaimed > 0) {
1016 		boolean_t do_update = B_FALSE;
1017 
1018 		mutex_enter(&vif->vif_mutex);
1019 		vif->vif_stat_tx_reclaim += num_reclaimed;
1020 		if (vif->vif_tx_corked) {
1021 			/*
1022 			 * TX was corked on a lack of available descriptors.
1023 			 * That dire state has passed so the TX interrupt can
1024 			 * be disabled and MAC can be notified that
1025 			 * transmission is possible again.
1026 			 */
1027 			vif->vif_tx_corked = B_FALSE;
1028 			virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
1029 			do_update = B_TRUE;
1030 		}
1031 
1032 		mutex_exit(&vif->vif_mutex);
1033 		if (do_update) {
1034 			mac_tx_update(vif->vif_mac_handle);
1035 		}
1036 	}
1037 
1038 	return (num_reclaimed);
1039 }
1040 
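/*
 * timeout(9F) callback providing the periodic TX reclaim.  It reschedules
 * itself for as long as descriptors were reclaimed or remain outstanding,
 * unless the ring is being drained for mc_stop.
 */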
1041 static void
1042 vioif_reclaim_periodic(void *arg)
1043 {
1044 	vioif_t *vif = arg;
1045 	uint_t num_reclaimed;
1046 
1047 	num_reclaimed = vioif_reclaim_used_tx(vif);
1048 
1049 	mutex_enter(&vif->vif_mutex);
1050 	vif->vif_tx_reclaim_tid = 0;
1051 	/*
1052 	 * If used descriptors were reclaimed or TX descriptors appear to be
1053 	 * outstanding, the ring is considered active and periodic reclamation
1054 	 * is necessary for now.
1055 	 */
1056 	if (num_reclaimed != 0 || virtio_queue_nactive(vif->vif_tx_vq) != 0) {
1057 		/* Do not reschedule if the ring is being drained. */
1058 		if (!vif->vif_tx_drain) {
1059 			vioif_reclaim_restart(vif);
1060 		}
1061 	}
1062 	mutex_exit(&vif->vif_mutex);
1063 }
1064 
1065 static void
1066 vioif_reclaim_restart(vioif_t *vif)
1067 {
1068 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
1069 	VERIFY(!vif->vif_tx_drain);
1070 
1071 	if (vif->vif_tx_reclaim_tid == 0) {
1072 		vif->vif_tx_reclaim_tid = timeout(vioif_reclaim_periodic, vif,
1073 		    MSEC_TO_TICK_ROUNDUP(vioif_reclaim_ms));
1074 	}
1075 }
1076 
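/*
 * Quiesce the transmit side during mc_stop: cancel any pending periodic
 * reclaim, disable the TX interrupt, and wait until every outstanding
 * transmit buffer has been processed by the host and reclaimed.
 */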
1077 static void
1078 vioif_tx_drain(vioif_t *vif)
1079 {
1080 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
1081 	VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPING);
1082 
1083 	vif->vif_tx_drain = B_TRUE;
1084 	/* Put a stop to the periodic reclaim if it is running */
1085 	if (vif->vif_tx_reclaim_tid != 0) {
1086 		timeout_id_t tid = vif->vif_tx_reclaim_tid;
1087 
1088 		/*
1089 		 * With vif_tx_drain set, there is no risk that a racing
1090 		 * vioif_reclaim_periodic() call will reschedule itself.
1091 		 *
1092 		 * Being part of the mc_stop hook also guarantees that
1093 		 * vioif_m_tx() will not be called to restart it.
1094 		 */
1095 		vif->vif_tx_reclaim_tid = 0;
1096 		mutex_exit(&vif->vif_mutex);
1097 		(void) untimeout(tid);
1098 		mutex_enter(&vif->vif_mutex);
1099 	}
1100 	virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
1101 
1102 	/*
1103 	 * Wait for all of the TX descriptors to be processed by the host so
1104 	 * they can be reclaimed.
1105 	 */
1106 	while (vif->vif_ntxbufs_alloc > 0) {
1107 		mutex_exit(&vif->vif_mutex);
1108 		(void) vioif_reclaim_used_tx(vif);
1109 		delay(5);
1110 		mutex_enter(&vif->vif_mutex);
1111 	}
1112 	VERIFY(!vif->vif_tx_corked);
1113 	VERIFY3U(vif->vif_tx_reclaim_tid, ==, 0);
1114 	VERIFY3U(virtio_queue_nactive(vif->vif_tx_vq), ==, 0);
1115 }
1116 
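/*
 * The two transmit helpers below attach the frame data to a transmit buffer:
 * vioif_tx_inline() copies the whole message into the preallocated inline
 * DMA buffer, while vioif_tx_external() binds the mblk chain for DMA and
 * appends a descriptor for every cookie.  In both cases the caller gives up
 * ownership of the message.
 */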
1117 static int
1118 vioif_tx_inline(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
1119 {
1120 	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
1121 
1122 	VERIFY3U(msg_size, <=, virtio_dma_size(tb->tb_dma) - VIOIF_HEADER_SKIP);
1123 
1124 	/*
1125 	 * Copy the message into the inline buffer and then free the message.
1126 	 */
1127 	mcopymsg(mp, virtio_dma_va(tb->tb_dma, VIOIF_HEADER_SKIP));
1128 
1129 	if (virtio_chain_append(tb->tb_chain,
1130 	    virtio_dma_cookie_pa(tb->tb_dma, 0) + VIOIF_HEADER_SKIP,
1131 	    msg_size, VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
1132 		return (DDI_FAILURE);
1133 	}
1134 
1135 	return (DDI_SUCCESS);
1136 }
1137 
1138 static int
1139 vioif_tx_external(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
1140 {
1141 	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
1142 
1143 	mblk_t *nmp = mp;
1144 	tb->tb_ndmaext = 0;
1145 
1146 	while (nmp != NULL) {
1147 		size_t len;
1148 
1149 		if ((len = MBLKL(nmp)) == 0) {
1150 			/*
1151 			 * Skip any zero-length entries in the chain.
1152 			 */
1153 			nmp = nmp->b_cont;
1154 			continue;
1155 		}
1156 
1157 		if (tb->tb_ndmaext >= tb->tb_dmaext_capacity) {
1158 			mutex_enter(&vif->vif_mutex);
1159 			vif->vif_txfail_indirect_limit++;
1160 			vif->vif_notxbuf++;
1161 			mutex_exit(&vif->vif_mutex);
1162 			goto fail;
1163 		}
1164 
1165 		if (tb->tb_dmaext[tb->tb_ndmaext] == NULL) {
1166 			/*
1167 			 * Allocate a DMA handle for this slot.
1168 			 */
1169 			if ((tb->tb_dmaext[tb->tb_ndmaext] =
1170 			    virtio_dma_alloc_nomem(vif->vif_virtio,
1171 			    &vioif_dma_attr_external, KM_SLEEP)) == NULL) {
1172 				mutex_enter(&vif->vif_mutex);
1173 				vif->vif_notxbuf++;
1174 				mutex_exit(&vif->vif_mutex);
1175 				goto fail;
1176 			}
1177 		}
1178 		virtio_dma_t *extdma = tb->tb_dmaext[tb->tb_ndmaext++];
1179 
1180 		if (virtio_dma_bind(extdma, nmp->b_rptr, len,
1181 		    DDI_DMA_WRITE | DDI_DMA_STREAMING, KM_SLEEP) !=
1182 		    DDI_SUCCESS) {
1183 			mutex_enter(&vif->vif_mutex);
1184 			vif->vif_txfail_dma_bind++;
1185 			mutex_exit(&vif->vif_mutex);
1186 			goto fail;
1187 		}
1188 
1189 		for (uint_t n = 0; n < virtio_dma_ncookies(extdma); n++) {
1190 			uint64_t pa = virtio_dma_cookie_pa(extdma, n);
1191 			size_t sz = virtio_dma_cookie_size(extdma, n);
1192 
1193 			if (virtio_chain_append(tb->tb_chain, pa, sz,
1194 			    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
1195 				mutex_enter(&vif->vif_mutex);
1196 				vif->vif_txfail_indirect_limit++;
1197 				vif->vif_notxbuf++;
1198 				mutex_exit(&vif->vif_mutex);
1199 				goto fail;
1200 			}
1201 		}
1202 
1203 		nmp = nmp->b_cont;
1204 	}
1205 
1206 	/*
1207 	 * We need to keep the message around until we reclaim the buffer from
1208 	 * the device before freeing it.
1209 	 */
1210 	tb->tb_mp = mp;
1211 
1212 	return (DDI_SUCCESS);
1213 
1214 fail:
1215 	for (uint_t n = 0; n < tb->tb_ndmaext; n++) {
1216 		if (tb->tb_dmaext[n] != NULL) {
1217 			virtio_dma_unbind(tb->tb_dmaext[n]);
1218 		}
1219 	}
1220 	tb->tb_ndmaext = 0;
1221 
1222 	freemsg(mp);
1223 
1224 	return (DDI_FAILURE);
1225 }
1226 
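/*
 * Prepare a single frame for transmission: fill in the virtio net header
 * (including checksum offload and LSO details when requested by the stack),
 * update broadcast/multicast statistics, and hand the frame to the inline or
 * external transmit helper based on the copy threshold.  Returns B_TRUE when
 * the mblk has been consumed, or B_FALSE if the caller still owns it and may
 * retry the send later.
 */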
1227 static boolean_t
1228 vioif_send(vioif_t *vif, mblk_t *mp)
1229 {
1230 	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
1231 
1232 	vioif_txbuf_t *tb = NULL;
1233 	struct virtio_net_hdr *vnh = NULL;
1234 	size_t msg_size = 0;
1235 	uint32_t csum_start;
1236 	uint32_t csum_stuff;
1237 	uint32_t csum_flags;
1238 	uint32_t lso_flags;
1239 	uint32_t lso_mss;
1240 	mblk_t *nmp;
1241 	int ret;
1242 	boolean_t lso_required = B_FALSE;
1243 	struct ether_header *ether = (void *)mp->b_rptr;
1244 
1245 	for (nmp = mp; nmp; nmp = nmp->b_cont)
1246 		msg_size += MBLKL(nmp);
1247 
1248 	if (vif->vif_tx_tso4 || vif->vif_tx_tso6) {
1249 		mac_lso_get(mp, &lso_mss, &lso_flags);
1250 		lso_required = (lso_flags & HW_LSO) != 0;
1251 	}
1252 
1253 	mutex_enter(&vif->vif_mutex);
1254 	if ((tb = vioif_txbuf_alloc(vif)) == NULL) {
1255 		vif->vif_notxbuf++;
1256 		goto fail;
1257 	}
1258 	mutex_exit(&vif->vif_mutex);
1259 
1260 	/*
1261 	 * Use the inline buffer for the virtio net header.  Zero the portion
1262 	 * of our DMA allocation prior to the packet data.
1263 	 */
1264 	vnh = virtio_dma_va(tb->tb_dma, 0);
1265 	bzero(vnh, VIOIF_HEADER_SKIP);
1266 
1267 	/*
1268 	 * For legacy devices, and those that have not negotiated
1269 	 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a separate
1270 	 * descriptor entry to the rest of the buffer.
1271 	 */
1272 	if (virtio_chain_append(tb->tb_chain,
1273 	    virtio_dma_cookie_pa(tb->tb_dma, 0), sizeof (struct virtio_net_hdr),
1274 	    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
1275 		mutex_enter(&vif->vif_mutex);
1276 		vif->vif_notxbuf++;
1277 		goto fail;
1278 	}
1279 
1280 	mac_hcksum_get(mp, &csum_start, &csum_stuff, NULL, NULL, &csum_flags);
1281 
1282 	/*
1283 	 * They want us to do the TCP/UDP csum calculation.
1284 	 */
1285 	if (csum_flags & HCK_PARTIALCKSUM) {
1286 		int eth_hsize;
1287 
1288 		/*
1289 		 * Did we ask for it?
1290 		 */
1291 		ASSERT(vif->vif_tx_csum);
1292 
1293 		/*
1294 		 * We only asked for partial csum packets.
1295 		 */
1296 		ASSERT(!(csum_flags & HCK_IPV4_HDRCKSUM));
1297 		ASSERT(!(csum_flags & HCK_FULLCKSUM));
1298 
1299 		if (ether->ether_type == htons(ETHERTYPE_VLAN)) {
1300 			eth_hsize = sizeof (struct ether_vlan_header);
1301 		} else {
1302 			eth_hsize = sizeof (struct ether_header);
1303 		}
1304 
1305 		vnh->vnh_flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1306 		vnh->vnh_csum_start = eth_hsize + csum_start;
1307 		vnh->vnh_csum_offset = csum_stuff - csum_start;
1308 	}
1309 
1310 	/*
1311 	 * Setup LSO fields if required.
1312 	 */
1313 	if (lso_required) {
1314 		mac_ether_offload_flags_t needed;
1315 		mac_ether_offload_info_t meo;
1316 		uint32_t cksum;
1317 		size_t len;
1318 		mblk_t *pullmp = NULL;
1319 		tcpha_t *tcpha;
1320 
1321 		if (mac_ether_offload_info(mp, &meo) != 0) {
1322 			goto fail;
1323 		}
1324 
1325 		needed = MEOI_L2INFO_SET | MEOI_L3INFO_SET | MEOI_L4INFO_SET;
1326 		if ((meo.meoi_flags & needed) != needed) {
1327 			goto fail;
1328 		}
1329 
1330 		if (meo.meoi_l4proto != IPPROTO_TCP) {
1331 			goto fail;
1332 		}
1333 
1334 		if (meo.meoi_l3proto == ETHERTYPE_IP && vif->vif_tx_tso4) {
1335 			vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1336 		} else if (meo.meoi_l3proto == ETHERTYPE_IPV6 &&
1337 		    vif->vif_tx_tso6) {
1338 			vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1339 		} else {
1340 			goto fail;
1341 		}
1342 
1343 		/*
1344 		 * The TCP stack does not include the length in the TCP
1345 		 * pseudo-header when it is performing LSO since hardware
1346 		 * generally asks for it to be removed (as it'll change).
1347 		 * Unfortunately, for virtio, we actually need it. This means we
1348 		 * need to go through and calculate the actual length and fix
1349 		 * things up. Because the virtio spec cares about the ECN flag
1350 		 * and indicating that, at least this means we'll have that
1351 		 * available as well.
1352 		 */
1353 		vnh->vnh_gso_size = (uint16_t)lso_mss;
1354 		vnh->vnh_hdr_len = meo.meoi_l2hlen + meo.meoi_l3hlen +
1355 		    meo.meoi_l4hlen;
1356 		if (MBLKL(mp) < vnh->vnh_hdr_len) {
1357 			pullmp = msgpullup(mp, vnh->vnh_hdr_len);
1358 			if (pullmp == NULL)
1359 				goto fail;
1360 			tcpha = (tcpha_t *)(pullmp->b_rptr + meo.meoi_l2hlen +
1361 			    meo.meoi_l3hlen);
1362 		} else {
1363 			tcpha = (tcpha_t *)(mp->b_rptr + meo.meoi_l2hlen +
1364 			    meo.meoi_l3hlen);
1365 		}
1366 
1367 		len = meo.meoi_len - meo.meoi_l2hlen - meo.meoi_l3hlen;
1368 		cksum = ntohs(tcpha->tha_sum) + len;
1369 		cksum = (cksum >> 16) + (cksum & 0xffff);
1370 		cksum = (cksum >> 16) + (cksum & 0xffff);
1371 		tcpha->tha_sum = htons(cksum);
1372 
1373 		if (tcpha->tha_flags & TH_CWR) {
1374 			vnh->vnh_gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1375 		}
1376 
1377 		freemsg(pullmp);
1378 	}
1379 
1380 	/*
1381 	 * The device does not maintain its own statistics about broadcast or
1382 	 * multicast packets, so we have to check the destination address
1383 	 * ourselves.
1384 	 */
1385 	if ((ether->ether_dhost.ether_addr_octet[0] & 0x01) != 0) {
1386 		mutex_enter(&vif->vif_mutex);
1387 		if (ether_cmp(&ether->ether_dhost, vioif_broadcast) == 0) {
1388 			vif->vif_brdcstxmt++;
1389 		} else {
1390 			vif->vif_multixmt++;
1391 		}
1392 		mutex_exit(&vif->vif_mutex);
1393 	}
1394 
1395 	/*
1396 	 * For small packets, copy into the preallocated inline buffer rather
1397 	 * than incur the overhead of mapping.  Note that both of these
1398 	 * functions consume "mp"; the caller must not use it after they return.
1399 	 */
1400 	if (msg_size < vif->vif_txcopy_thresh) {
1401 		ret = vioif_tx_inline(vif, tb, mp, msg_size);
1402 	} else {
1403 		ret = vioif_tx_external(vif, tb, mp, msg_size);
1404 	}
1405 	mp = NULL;
1406 
1407 	mutex_enter(&vif->vif_mutex);
1408 
1409 	if (ret != DDI_SUCCESS) {
1410 		goto fail;
1411 	}
1412 
1413 	vif->vif_opackets++;
1414 	vif->vif_obytes += msg_size;
1415 	mutex_exit(&vif->vif_mutex);
1416 
1417 	virtio_dma_sync(tb->tb_dma, DDI_DMA_SYNC_FORDEV);
1418 	virtio_chain_submit(tb->tb_chain, B_TRUE);
1419 
1420 	return (B_TRUE);
1421 
1422 fail:
1423 	vif->vif_oerrors++;
1424 	if (tb != NULL) {
1425 		vioif_txbuf_free(vif, tb);
1426 	}
1427 	mutex_exit(&vif->vif_mutex);
1428 
1429 	return (mp == NULL);
1430 }
1431 
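/*
 * MAC transmit entry point.  Frames are sent from the chain one at a time;
 * if we run out of descriptors, the unsent remainder is handed back to MAC,
 * the TX interrupt is enabled, and transmission stays corked until a reclaim
 * allows mac_tx_update() to be called.
 */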
1432 static mblk_t *
1433 vioif_m_tx(void *arg, mblk_t *mp)
1434 {
1435 	vioif_t *vif = arg;
1436 	mblk_t *nmp;
1437 
1438 	/*
1439 	 * Prior to attempting to send any more frames, do a reclaim to pick up
1440 	 * any descriptors which have been processed by the host.
1441 	 */
1442 	if (virtio_queue_nactive(vif->vif_tx_vq) != 0) {
1443 		(void) vioif_reclaim_used_tx(vif);
1444 	}
1445 
1446 	while (mp != NULL) {
1447 		nmp = mp->b_next;
1448 		mp->b_next = NULL;
1449 
1450 		if (!vioif_send(vif, mp)) {
1451 			/*
1452 			 * If there are no descriptors available, try to
1453 			 * reclaim some, allowing a retry of the send if some
1454 			 * are found.
1455 			 */
1456 			mp->b_next = nmp;
1457 			if (vioif_reclaim_used_tx(vif) != 0) {
1458 				continue;
1459 			}
1460 
1461 			/*
1462 			 * Otherwise, enable the TX ring interrupt so that as
1463 			 * soon as a descriptor becomes available, transmission
1464 			 * can begin again.  For safety, make sure the periodic
1465 			 * reclaim is running as well.
1466 			 */
1467 			mutex_enter(&vif->vif_mutex);
1468 			vif->vif_tx_corked = B_TRUE;
1469 			virtio_queue_no_interrupt(vif->vif_tx_vq, B_FALSE);
1470 			vioif_reclaim_restart(vif);
1471 			mutex_exit(&vif->vif_mutex);
1472 			return (mp);
1473 		}
1474 		mp = nmp;
1475 	}
1476 
1477 	/* Ensure the periodic reclaim has been started. */
1478 	mutex_enter(&vif->vif_mutex);
1479 	vioif_reclaim_restart(vif);
1480 	mutex_exit(&vif->vif_mutex);
1481 
1482 	return (NULL);
1483 }
1484 
1485 static int
1486 vioif_m_start(void *arg)
1487 {
1488 	vioif_t *vif = arg;
1489 
1490 	mutex_enter(&vif->vif_mutex);
1491 
1492 	VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPED);
1493 	vif->vif_runstate = VIOIF_RUNSTATE_RUNNING;
1494 
1495 	mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);
1496 
1497 	virtio_queue_no_interrupt(vif->vif_rx_vq, B_FALSE);
1498 
1499 	/*
1500 	 * Starting interrupts on the TX virtqueue is unnecessary at this time.
1501 	 * Descriptor reclamation is handled during transmit, via a periodic
1502 	 * timer, and when resources are tight, via the then-enabled interrupt.
1503 	 */
1504 	vif->vif_tx_drain = B_FALSE;
1505 
1506 	/*
1507 	 * Add as many receive buffers as we can to the receive queue.  If we
1508 	 * cannot add any, it may be because we have stopped and started again
1509 	 * and the descriptors are all in the queue already.
1510 	 */
1511 	(void) vioif_add_rx(vif);
1512 
1513 	mutex_exit(&vif->vif_mutex);
1514 	return (DDI_SUCCESS);
1515 }
1516 
1517 static void
1518 vioif_m_stop(void *arg)
1519 {
1520 	vioif_t *vif = arg;
1521 
1522 	mutex_enter(&vif->vif_mutex);
1523 
1524 	VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_RUNNING);
1525 	vif->vif_runstate = VIOIF_RUNSTATE_STOPPING;
1526 
1527 	/* Ensure all TX descriptors have been processed and reclaimed */
1528 	vioif_tx_drain(vif);
1529 
1530 	virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);
1531 
1532 	vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
1533 	mutex_exit(&vif->vif_mutex);
1534 }
1535 
1536 static int
1537 vioif_m_stat(void *arg, uint_t stat, uint64_t *val)
1538 {
1539 	vioif_t *vif = arg;
1540 
1541 	switch (stat) {
1542 	case MAC_STAT_IERRORS:
1543 		*val = vif->vif_ierrors;
1544 		break;
1545 	case MAC_STAT_OERRORS:
1546 		*val = vif->vif_oerrors;
1547 		break;
1548 	case MAC_STAT_MULTIRCV:
1549 		*val = vif->vif_multircv;
1550 		break;
1551 	case MAC_STAT_BRDCSTRCV:
1552 		*val = vif->vif_brdcstrcv;
1553 		break;
1554 	case MAC_STAT_MULTIXMT:
1555 		*val = vif->vif_multixmt;
1556 		break;
1557 	case MAC_STAT_BRDCSTXMT:
1558 		*val = vif->vif_brdcstxmt;
1559 		break;
1560 	case MAC_STAT_IPACKETS:
1561 		*val = vif->vif_ipackets;
1562 		break;
1563 	case MAC_STAT_RBYTES:
1564 		*val = vif->vif_rbytes;
1565 		break;
1566 	case MAC_STAT_OPACKETS:
1567 		*val = vif->vif_opackets;
1568 		break;
1569 	case MAC_STAT_OBYTES:
1570 		*val = vif->vif_obytes;
1571 		break;
1572 	case MAC_STAT_NORCVBUF:
1573 		*val = vif->vif_norecvbuf;
1574 		break;
1575 	case MAC_STAT_NOXMTBUF:
1576 		*val = vif->vif_notxbuf;
1577 		break;
1578 	case MAC_STAT_IFSPEED:
1579 		/* always 1 Gbit */
1580 		*val = 1000000000ULL;
1581 		break;
1582 	case ETHER_STAT_LINK_DUPLEX:
1583 		/* virtual device, always full-duplex */
1584 		*val = LINK_DUPLEX_FULL;
1585 		break;
1586 
1587 	default:
1588 		return (ENOTSUP);
1589 	}
1590 
1591 	return (DDI_SUCCESS);
1592 }
1593 
1594 static int
1595 vioif_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1596     uint_t pr_valsize, const void *pr_val)
1597 {
1598 	vioif_t *vif = arg;
1599 
1600 	switch (pr_num) {
1601 	case MAC_PROP_MTU: {
1602 		int r;
1603 		uint32_t mtu;
1604 		if (pr_valsize < sizeof (mtu)) {
1605 			return (EOVERFLOW);
1606 		}
1607 		bcopy(pr_val, &mtu, sizeof (mtu));
1608 
1609 		if (mtu < ETHERMIN || mtu > vif->vif_mtu_max) {
1610 			return (EINVAL);
1611 		}
1612 
1613 		mutex_enter(&vif->vif_mutex);
1614 		if ((r = mac_maxsdu_update(vif->vif_mac_handle, mtu)) == 0) {
1615 			vif->vif_mtu = mtu;
1616 		}
1617 		mutex_exit(&vif->vif_mutex);
1618 
1619 		return (r);
1620 	}
1621 
1622 	case MAC_PROP_PRIVATE: {
1623 		long max, result;
1624 		uint_t *resp;
1625 		char *endptr;
1626 
1627 		if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1628 			max = VIOIF_MACPROP_TXCOPY_THRESH_MAX;
1629 			resp = &vif->vif_txcopy_thresh;
1630 		} else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1631 			max = VIOIF_MACPROP_RXCOPY_THRESH_MAX;
1632 			resp = &vif->vif_rxcopy_thresh;
1633 		} else {
1634 			return (ENOTSUP);
1635 		}
1636 
1637 		if (pr_val == NULL) {
1638 			return (EINVAL);
1639 		}
1640 
1641 		if (ddi_strtol(pr_val, &endptr, 10, &result) != 0 ||
1642 		    *endptr != '\0' || result < 0 || result > max) {
1643 			return (EINVAL);
1644 		}
1645 
1646 		mutex_enter(&vif->vif_mutex);
1647 		*resp = result;
1648 		mutex_exit(&vif->vif_mutex);
1649 
1650 		return (0);
1651 	}
1652 
1653 	default:
1654 		return (ENOTSUP);
1655 	}
1656 }
1657 
1658 static int
1659 vioif_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1660     uint_t pr_valsize, void *pr_val)
1661 {
1662 	vioif_t *vif = arg;
1663 
1664 	switch (pr_num) {
1665 	case MAC_PROP_PRIVATE: {
1666 		uint_t value;
1667 
1668 		if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1669 			value = vif->vif_txcopy_thresh;
1670 		} else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1671 			value = vif->vif_rxcopy_thresh;
1672 		} else {
1673 			return (ENOTSUP);
1674 		}
1675 
1676 		if (snprintf(pr_val, pr_valsize, "%u", value) >= pr_valsize) {
1677 			return (EOVERFLOW);
1678 		}
1679 
1680 		return (0);
1681 	}
1682 
1683 	default:
1684 		return (ENOTSUP);
1685 	}
1686 }
1687 
1688 static void
1689 vioif_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1690     mac_prop_info_handle_t prh)
1691 {
1692 	vioif_t *vif = arg;
1693 	char valstr[64];
1694 	int value;
1695 
1696 	switch (pr_num) {
1697 	case MAC_PROP_MTU:
1698 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1699 		mac_prop_info_set_range_uint32(prh, ETHERMIN, vif->vif_mtu_max);
1700 		return;
1701 
1702 	case MAC_PROP_PRIVATE:
1703 		if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1704 			value = VIOIF_MACPROP_TXCOPY_THRESH_DEF;
1705 		} else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1706 			value = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
1707 		} else {
1708 			/*
1709 			 * We do not recognise this private property name.
1710 			 */
1711 			return;
1712 		}
1713 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1714 		(void) snprintf(valstr, sizeof (valstr), "%d", value);
1715 		mac_prop_info_set_default_str(prh, valstr);
1716 		return;
1717 
1718 	default:
1719 		return;
1720 	}
1721 }
1722 
1723 static boolean_t
1724 vioif_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1725 {
1726 	vioif_t *vif = arg;
1727 
1728 	switch (cap) {
1729 	case MAC_CAPAB_HCKSUM: {
1730 		if (!vif->vif_tx_csum) {
1731 			return (B_FALSE);
1732 		}
1733 
1734 		*(uint32_t *)cap_data = HCKSUM_INET_PARTIAL;
1735 
1736 		return (B_TRUE);
1737 	}
1738 
1739 	case MAC_CAPAB_LSO: {
1740 		if (!vif->vif_tx_tso4) {
1741 			return (B_FALSE);
1742 		}
1743 
1744 		mac_capab_lso_t *lso = cap_data;
1745 		lso->lso_flags = LSO_TX_BASIC_TCP_IPV4 | LSO_TX_BASIC_TCP_IPV6;
1746 		lso->lso_basic_tcp_ipv4.lso_max = VIOIF_RX_DATA_SIZE;
1747 		lso->lso_basic_tcp_ipv6.lso_max = VIOIF_RX_DATA_SIZE;
1748 
1749 		return (B_TRUE);
1750 	}
1751 
1752 	default:
1753 		return (B_FALSE);
1754 	}
1755 }
1756 
1757 static boolean_t
1758 vioif_has_feature(vioif_t *vif, uint32_t feature)
1759 {
1760 	return (virtio_feature_present(vif->vif_virtio, feature));
1761 }
1762 
1763 /*
1764  * Read the primary MAC address from the device if one is provided.  If not,
1765  * generate a random locally administered MAC address and write it back to the
1766  * device.
1767  */
1768 static void
1769 vioif_get_mac(vioif_t *vif)
1770 {
1771 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
1772 
1773 	if (vioif_has_feature(vif, VIRTIO_NET_F_MAC)) {
1774 		for (uint_t i = 0; i < ETHERADDRL; i++) {
1775 			vif->vif_mac[i] = virtio_dev_get8(vif->vif_virtio,
1776 			    VIRTIO_NET_CONFIG_MAC + i);
1777 		}
1778 		vif->vif_mac_from_host = 1;
1779 
1780 		return;
1781 	}
1782 
1783 	/* Get a few random bytes */
1784 	(void) random_get_pseudo_bytes(vif->vif_mac, ETHERADDRL);
1785 	/* Make sure it's a unicast MAC */
1786 	vif->vif_mac[0] &= ~1;
1787 	/* Set the "locally administered" bit */
1788 	vif->vif_mac[0] |= 2;
1789 
1790 	/*
1791 	 * Write the random MAC address back to the device.
1792 	 */
1793 	for (uint_t i = 0; i < ETHERADDRL; i++) {
1794 		virtio_dev_put8(vif->vif_virtio, VIRTIO_NET_CONFIG_MAC + i,
1795 		    vif->vif_mac[i]);
1796 	}
1797 	vif->vif_mac_from_host = 0;
1798 
1799 	dev_err(vif->vif_dip, CE_NOTE, "!Generated a random MAC address: "
1800 	    "%02x:%02x:%02x:%02x:%02x:%02x",
1801 	    (uint_t)vif->vif_mac[0], (uint_t)vif->vif_mac[1],
1802 	    (uint_t)vif->vif_mac[2], (uint_t)vif->vif_mac[3],
1803 	    (uint_t)vif->vif_mac[4], (uint_t)vif->vif_mac[5]);
1804 }
1805 
1806 /*
1807  * Virtqueue interrupt handlers
1808  */
1809 static uint_t
1810 vioif_rx_handler(caddr_t arg0, caddr_t arg1)
1811 {
1812 	vioif_t *vif = (vioif_t *)arg0;
1813 
1814 	mutex_enter(&vif->vif_mutex);
1815 	(void) vioif_process_rx(vif);
1816 
1817 	/*
1818 	 * Attempt to replenish the receive queue.  If we cannot add any
1819 	 * descriptors here, it may be because all of the recently received
1820 	 * packets were loaned up to the networking stack.
1821 	 */
1822 	(void) vioif_add_rx(vif);
1823 	mutex_exit(&vif->vif_mutex);
1824 
1825 	return (DDI_INTR_CLAIMED);
1826 }
1827 
1828 static uint_t
1829 vioif_tx_handler(caddr_t arg0, caddr_t arg1)
1830 {
1831 	vioif_t *vif = (vioif_t *)arg0;
1832 
1833 	/*
1834 	 * The TX interrupt could race with other reclamation activity, so
1835 	 * interpreting the return value is unimportant.
1836 	 */
1837 	(void) vioif_reclaim_used_tx(vif);
1838 
1839 	return (DDI_INTR_CLAIMED);
1840 }
1841 
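/*
 * Record which negotiated device features we will actually use: transmit
 * checksum offload, TSO over IPv4 and IPv6 (which additionally requires ECN
 * support unless the legacy GSO feature is present), and the control queue.
 */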
1842 static void
1843 vioif_check_features(vioif_t *vif)
1844 {
1845 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
1846 
1847 	vif->vif_tx_csum = 0;
1848 	vif->vif_tx_tso4 = 0;
1849 	vif->vif_tx_tso6 = 0;
1850 
1851 	if (vioif_has_feature(vif, VIRTIO_NET_F_CSUM)) {
1852 		/*
1853 		 * The host will accept packets with partial checksums from us.
1854 		 */
1855 		vif->vif_tx_csum = 1;
1856 
1857 		/*
1858 		 * The legacy GSO feature represents the combination of
1859 		 * HOST_TSO4, HOST_TSO6, and HOST_ECN.
1860 		 */
1861 		boolean_t gso = vioif_has_feature(vif, VIRTIO_NET_F_GSO);
1862 		boolean_t tso4 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO4);
1863 		boolean_t tso6 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO6);
1864 		boolean_t ecn = vioif_has_feature(vif, VIRTIO_NET_F_HOST_ECN);
1865 
1866 		/*
1867 		 * Explicit congestion notification (ECN) is configured
1868 		 * globally; see "tcp_ecn_permitted".  As we cannot currently
1869 		 * request that the stack disable ECN on a per interface basis,
1870 		 * we require the device to support the combination of
1871 		 * segmentation offload and ECN support.
1872 		 */
1873 		if (gso) {
1874 			vif->vif_tx_tso4 = 1;
1875 			vif->vif_tx_tso6 = 1;
1876 		}
1877 		if (tso4 && ecn) {
1878 			vif->vif_tx_tso4 = 1;
1879 		}
1880 		if (tso6 && ecn) {
1881 			vif->vif_tx_tso6 = 1;
1882 		}
1883 	}
1884 
1885 	if (vioif_has_feature(vif, VIRTIO_NET_F_CTRL_VQ)) {
1886 		vif->vif_has_ctrlq = 1;
1887 
1888 		/*
1889 		 * The VIRTIO_NET_F_CTRL_VQ feature must be enabled if there's
1890 		 * any chance of the VIRTIO_NET_F_CTRL_RX being enabled.
1891 		 */
1892 		if (vioif_has_feature(vif, VIRTIO_NET_F_CTRL_RX))
1893 			vif->vif_has_ctrlq_rx = 1;
1894 	}
1895 }
1896 
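/*
 * Decide which interrupt types the virtio framework may use for this device.
 * An operator-supplied vioif_allowed_int_types value wins; otherwise we
 * allow any type, except on Google Compute Engine where MSI-X receive
 * interrupts have been observed not to fire and fixed interrupts are used
 * instead.
 */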
1897 static int
1898 vioif_select_interrupt_types(void)
1899 {
1900 	id_t id;
1901 	smbios_system_t sys;
1902 	smbios_info_t info;
1903 
1904 	if (vioif_allowed_int_types != -1) {
1905 		/*
1906 		 * If this value was tuned via /etc/system or the debugger,
1907 		 * use the provided value directly.
1908 		 */
1909 		return (vioif_allowed_int_types);
1910 	}
1911 
1912 	if (ksmbios == NULL ||
1913 	    (id = smbios_info_system(ksmbios, &sys)) == SMB_ERR ||
1914 	    smbios_info_common(ksmbios, id, &info) == SMB_ERR) {
1915 		/*
1916 		 * The system may not have valid SMBIOS data, so ignore a
1917 		 * failure here.
1918 		 */
1919 		return (VIRTIO_ANY_INTR_TYPE);
1920 	}
1921 
1922 	if (strcmp(info.smbi_manufacturer, "Google") == 0 &&
1923 	    strcmp(info.smbi_product, "Google Compute Engine") == 0) {
1924 		/*
1925 		 * There is an undiagnosed issue with the Google Compute Engine
1926 		 * (GCE) hypervisor: no RX interrupts are received there when
1927 		 * MSI-X handlers are installed, although the Virtio SCSI
1928 		 * driver does not appear to be affected.  Fixed interrupts do
1929 		 * work, so fall back to them for now.
1930 		 */
1931 		return (DDI_INTR_TYPE_FIXED);
1932 	}
1933 
1934 	return (VIRTIO_ANY_INTR_TYPE);
1935 }
1936 
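/*
 * attach(9E) entry point: initialize the virtio framework, set up the RX, TX
 * and (optional) control virtqueues, allocate buffers, and register the
 * device with MAC.
 */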
1937 static int
1938 vioif_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1939 {
1940 	int ret;
1941 	vioif_t *vif;
1942 	virtio_t *vio;
1943 	mac_register_t *macp = NULL;
1944 
1945 	if (cmd != DDI_ATTACH) {
1946 		return (DDI_FAILURE);
1947 	}
1948 
1949 	if ((vio = virtio_init(dip, VIRTIO_NET_WANTED_FEATURES, B_TRUE)) ==
1950 	    NULL) {
1951 		return (DDI_FAILURE);
1952 	}
1953 
1954 	vif = kmem_zalloc(sizeof (*vif), KM_SLEEP);
1955 	vif->vif_dip = dip;
1956 	vif->vif_virtio = vio;
1957 	vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
1958 	ddi_set_driver_private(dip, vif);
1959 
1960 	if ((vif->vif_rx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_RX,
1961 	    "rx", vioif_rx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL ||
1962 	    (vif->vif_tx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_TX,
1963 	    "tx", vioif_tx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL) {
1964 		goto fail;
1965 	}
1966 
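	/*
	 * The control queue is optional; only allocate it if the device has
	 * offered VIRTIO_NET_F_CTRL_VQ.
	 */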
1967 	if (vioif_has_feature(vif, VIRTIO_NET_F_CTRL_VQ) &&
1968 	    (vif->vif_ctrl_vq = virtio_queue_alloc(vio,
1969 	    VIRTIO_NET_VIRTQ_CONTROL, "ctrlq", NULL, vif,
1970 	    B_FALSE, VIOIF_MAX_SEGS)) == NULL) {
1971 		goto fail;
1972 	}
1973 
1974 	if (virtio_init_complete(vio, vioif_select_interrupt_types()) !=
1975 	    DDI_SUCCESS) {
1976 		dev_err(dip, CE_WARN, "failed to complete Virtio init");
1977 		goto fail;
1978 	}
1979 
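	/*
	 * Start with interrupts masked on all of the virtqueues.
	 */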
1980 	virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);
1981 	virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
1982 	if (vif->vif_ctrl_vq != NULL)
1983 		virtio_queue_no_interrupt(vif->vif_ctrl_vq, B_TRUE);
1984 
1985 	mutex_init(&vif->vif_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
1986 	mutex_enter(&vif->vif_mutex);
1987 
1988 	vioif_get_mac(vif);
1989 
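	/*
	 * Establish the default copy thresholds used when deciding whether to
	 * copy a frame or bind it for DMA; these may be adjusted later
	 * through the driver's private MAC properties.
	 */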
1990 	vif->vif_rxcopy_thresh = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
1991 	vif->vif_txcopy_thresh = VIOIF_MACPROP_TXCOPY_THRESH_DEF;
1992 
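	/*
	 * If the device advertises a maximum MTU, honour it; otherwise assume
	 * the standard Ethernet maximum.  Start with an MTU of ETHERMTU,
	 * clamped to that maximum.
	 */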
1993 	if (vioif_has_feature(vif, VIRTIO_NET_F_MTU)) {
1994 		vif->vif_mtu_max = virtio_dev_get16(vio, VIRTIO_NET_CONFIG_MTU);
1995 	} else {
1996 		vif->vif_mtu_max = ETHERMTU;
1997 	}
1998 
1999 	vif->vif_mtu = ETHERMTU;
2000 	if (vif->vif_mtu > vif->vif_mtu_max) {
2001 		vif->vif_mtu = vif->vif_mtu_max;
2002 	}
2003 
2004 	vioif_check_features(vif);
2005 
2006 	if (vioif_alloc_bufs(vif) != 0) {
2007 		mutex_exit(&vif->vif_mutex);
2008 		dev_err(dip, CE_WARN, "failed to allocate memory");
2009 		goto fail;
2010 	}
2011 
2012 	mutex_exit(&vif->vif_mutex);
2013 
2014 	if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
2015 		dev_err(dip, CE_WARN, "failed to enable interrupts");
2016 		goto fail;
2017 	}
2018 
2019 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2020 		dev_err(dip, CE_WARN, "failed to allocate a mac_register");
2021 		goto fail;
2022 	}
2023 
2024 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
2025 	macp->m_driver = vif;
2026 	macp->m_dip = dip;
2027 	macp->m_src_addr = vif->vif_mac;
2028 	macp->m_callbacks = &vioif_mac_callbacks;
2029 	macp->m_min_sdu = 0;
2030 	macp->m_max_sdu = vif->vif_mtu;
2031 	macp->m_margin = VLAN_TAGSZ;
2032 	macp->m_priv_props = vioif_priv_props;
2033 
2034 	if ((ret = mac_register(macp, &vif->vif_mac_handle)) != 0) {
2035 		dev_err(dip, CE_WARN, "mac_register() failed (%d)", ret);
2036 		goto fail;
2037 	}
2038 	mac_free(macp);
2039 
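	/*
	 * The driver does not currently track device link state, so simply
	 * report the link as up once registration with MAC is complete.
	 */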
2040 	mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);
2041 
2042 	return (DDI_SUCCESS);
2043 
2044 fail:
2045 	vioif_free_bufs(vif);
2046 	if (macp != NULL) {
2047 		mac_free(macp);
2048 	}
2049 	(void) virtio_fini(vio, B_TRUE);
2050 	kmem_free(vif, sizeof (*vif));
2051 	return (DDI_FAILURE);
2052 }
2053 
2054 static int
2055 vioif_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2056 {
2057 	int r;
2058 	vioif_t *vif;
2059 
2060 	if (cmd != DDI_DETACH) {
2061 		return (DDI_FAILURE);
2062 	}
2063 
2064 	if ((vif = ddi_get_driver_private(dip)) == NULL) {
2065 		return (DDI_FAILURE);
2066 	}
2067 
2068 	mutex_enter(&vif->vif_mutex);
2069 	if (vif->vif_runstate != VIOIF_RUNSTATE_STOPPED) {
2070 		dev_err(dip, CE_WARN, "!NIC still running, cannot detach");
2071 		mutex_exit(&vif->vif_mutex);
2072 		return (DDI_FAILURE);
2073 	}
2074 
2075 	/*
2076 	 * There should be no outstanding transmit buffers once the NIC is
2077 	 * completely stopped.
2078 	 */
2079 	VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);
2080 
2081 	/*
2082 	 * Though we cannot claw back all of the receive buffers until we reset
2083 	 * the device, we must ensure all those loaned to MAC have been
2084 	 * returned before calling mac_unregister().
2085 	 */
2086 	if (vif->vif_nrxbufs_onloan > 0) {
2087 		dev_err(dip, CE_WARN, "!%u receive buffers still loaned, "
2088 		    "cannot detach", vif->vif_nrxbufs_onloan);
2089 		mutex_exit(&vif->vif_mutex);
2090 		return (DDI_FAILURE);
2091 	}
2092 
2093 	if ((r = mac_unregister(vif->vif_mac_handle)) != 0) {
2094 		dev_err(dip, CE_WARN, "!MAC unregister failed (%d)", r);
		mutex_exit(&vif->vif_mutex);
2095 		return (DDI_FAILURE);
2096 	}
2097 
2098 	/*
2099 	 * Shut down the device so that we can recover any previously
2100 	 * submitted receive buffers.
2101 	 */
2102 	virtio_shutdown(vif->vif_virtio);
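	/*
	 * With the device shut down, walk the receive queue and free any
	 * buffers that were still posted to it.
	 */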
2103 	for (;;) {
2104 		virtio_chain_t *vic;
2105 
2106 		if ((vic = virtio_queue_evacuate(vif->vif_rx_vq)) == NULL) {
2107 			break;
2108 		}
2109 
2110 		vioif_rxbuf_t *rb = virtio_chain_data(vic);
2111 		vioif_rxbuf_free(vif, rb);
2112 	}
2113 
2114 	/*
2115 	 * vioif_free_bufs() must be called before virtio_fini(), as it uses
2116 	 * virtio_chain_free(), which itself depends on virtio data
2117 	 * structures that virtio_fini() tears down.
2118 	 */
2119 	vioif_free_bufs(vif);
2120 	(void) virtio_fini(vif->vif_virtio, B_FALSE);
2121 
2122 	mutex_exit(&vif->vif_mutex);
2123 	mutex_destroy(&vif->vif_mutex);
2124 
2125 	kmem_free(vif, sizeof (*vif));
2126 
2127 	return (DDI_SUCCESS);
2128 }
2129 
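/*
 * quiesce(9E) entry point, used for fast reboot; defer to the common virtio
 * code to silence the device.
 */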
2130 static int
2131 vioif_quiesce(dev_info_t *dip)
2132 {
2133 	vioif_t *vif;
2134 
2135 	if ((vif = ddi_get_driver_private(dip)) == NULL)
2136 		return (DDI_FAILURE);
2137 
2138 	return (virtio_quiesce(vif->vif_virtio));
2139 }
2140 
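/*
 * Loadable module entry points.
 */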
2141 int
2142 _init(void)
2143 {
2144 	int ret;
2145 
2146 	mac_init_ops(&vioif_dev_ops, "vioif");
2147 
2148 	if ((ret = mod_install(&vioif_modlinkage)) != DDI_SUCCESS) {
2149 		mac_fini_ops(&vioif_dev_ops);
2150 	}
2151 
2152 	return (ret);
2153 }
2154 
2155 int
2156 _fini(void)
2157 {
2158 	int ret;
2159 
2160 	if ((ret = mod_remove(&vioif_modlinkage)) == DDI_SUCCESS) {
2161 		mac_fini_ops(&vioif_dev_ops);
2162 	}
2163 
2164 	return (ret);
2165 }
2166 
2167 int
2168 _info(struct modinfo *modinfop)
2169 {
2170 	return (mod_info(&vioif_modlinkage, modinfop));
2171 }
2172