xref: /illumos-gate/usr/src/uts/common/io/vioif/vioif.c (revision 8222814ef8560ee0ba222eca8ca5acffc6cd0e44)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2013 Nexenta Inc.  All rights reserved.
14  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
15  * Copyright 2021 Joyent, Inc.
16  * Copyright 2019 Joshua M. Clulow <josh@sysmgr.org>
17  */
18 
19 /* Based on the NetBSD virtio driver by Minoura Makoto. */
20 /*
21  * Copyright (c) 2010 Minoura Makoto.
22  * All rights reserved.
23  *
24  * Redistribution and use in source and binary forms, with or without
25  * modification, are permitted provided that the following conditions
26  * are met:
27  * 1. Redistributions of source code must retain the above copyright
28  *    notice, this list of conditions and the following disclaimer.
29  * 2. Redistributions in binary form must reproduce the above copyright
30  *    notice, this list of conditions and the following disclaimer in the
31  *    documentation and/or other materials provided with the distribution.
32  *
33  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
34  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
35  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
36  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
37  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
38  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
39  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
40  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
41  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
42  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43  */
44 
45 /*
46  * VIRTIO NETWORK DRIVER
47  */
48 
49 #include <sys/types.h>
50 #include <sys/errno.h>
51 #include <sys/param.h>
52 #include <sys/stropts.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/kmem.h>
56 #include <sys/conf.h>
57 #include <sys/devops.h>
58 #include <sys/ksynch.h>
59 #include <sys/stat.h>
60 #include <sys/modctl.h>
61 #include <sys/debug.h>
62 #include <sys/pci.h>
63 #include <sys/ethernet.h>
64 #include <sys/vlan.h>
65 #include <sys/sysmacros.h>
66 #include <sys/smbios.h>
67 
68 #include <sys/dlpi.h>
69 #include <sys/taskq.h>
70 
71 #include <sys/pattr.h>
72 #include <sys/strsun.h>
73 
74 #include <sys/random.h>
75 #include <sys/containerof.h>
76 #include <sys/stream.h>
77 #include <inet/tcp.h>
78 
79 #include <sys/mac.h>
80 #include <sys/mac_provider.h>
81 #include <sys/mac_ether.h>
82 
83 #include "virtio.h"
84 #include "vioif.h"
85 
86 
87 static int vioif_quiesce(dev_info_t *);
88 static int vioif_attach(dev_info_t *, ddi_attach_cmd_t);
89 static int vioif_detach(dev_info_t *, ddi_detach_cmd_t);
90 static boolean_t vioif_has_feature(vioif_t *, uint32_t);
91 static void vioif_reclaim_restart(vioif_t *);
92 static int vioif_m_stat(void *, uint_t, uint64_t *);
93 static void vioif_m_stop(void *);
94 static int vioif_m_start(void *);
95 static int vioif_m_multicst(void *, boolean_t, const uint8_t *);
96 static int vioif_m_setpromisc(void *, boolean_t);
97 static int vioif_m_unicst(void *, const uint8_t *);
98 static mblk_t *vioif_m_tx(void *, mblk_t *);
99 static int vioif_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
100     const void *);
101 static int vioif_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
102 static void vioif_m_propinfo(void *, const char *, mac_prop_id_t,
103     mac_prop_info_handle_t);
104 static boolean_t vioif_m_getcapab(void *, mac_capab_t, void *);
105 static uint_t vioif_add_rx(vioif_t *);
106 
107 
108 static struct cb_ops vioif_cb_ops = {
109 	.cb_rev =			CB_REV,
110 	.cb_flag =			D_MP | D_NEW,
111 
112 	.cb_open =			nulldev,
113 	.cb_close =			nulldev,
114 	.cb_strategy =			nodev,
115 	.cb_print =			nodev,
116 	.cb_dump =			nodev,
117 	.cb_read =			nodev,
118 	.cb_write =			nodev,
119 	.cb_ioctl =			nodev,
120 	.cb_devmap =			nodev,
121 	.cb_mmap =			nodev,
122 	.cb_segmap =			nodev,
123 	.cb_chpoll =			nochpoll,
124 	.cb_prop_op =			ddi_prop_op,
125 	.cb_str =			NULL,
126 	.cb_aread =			nodev,
127 	.cb_awrite =			nodev,
128 };
129 
130 static struct dev_ops vioif_dev_ops = {
131 	.devo_rev =			DEVO_REV,
132 	.devo_refcnt =			0,
133 
134 	.devo_attach =			vioif_attach,
135 	.devo_detach =			vioif_detach,
136 	.devo_quiesce =			vioif_quiesce,
137 
138 	.devo_cb_ops =			&vioif_cb_ops,
139 
140 	.devo_getinfo =			NULL,
141 	.devo_identify =		nulldev,
142 	.devo_probe =			nulldev,
143 	.devo_reset =			nodev,
144 	.devo_bus_ops =			NULL,
145 	.devo_power =			NULL,
146 };
147 
148 static struct modldrv vioif_modldrv = {
149 	.drv_modops =			&mod_driverops,
150 	.drv_linkinfo =			"VIRTIO network driver",
151 	.drv_dev_ops =			&vioif_dev_ops
152 };
153 
154 static struct modlinkage vioif_modlinkage = {
155 	.ml_rev =			MODREV_1,
156 	.ml_linkage =			{ &vioif_modldrv, NULL }
157 };
158 
159 static mac_callbacks_t vioif_mac_callbacks = {
160 	.mc_getstat =			vioif_m_stat,
161 	.mc_start =			vioif_m_start,
162 	.mc_stop =			vioif_m_stop,
163 	.mc_setpromisc =		vioif_m_setpromisc,
164 	.mc_multicst =			vioif_m_multicst,
165 	.mc_unicst =			vioif_m_unicst,
166 	.mc_tx =			vioif_m_tx,
167 
168 	.mc_callbacks =			(MC_GETCAPAB | MC_SETPROP |
169 					    MC_GETPROP | MC_PROPINFO),
170 	.mc_getcapab =			vioif_m_getcapab,
171 	.mc_setprop =			vioif_m_setprop,
172 	.mc_getprop =			vioif_m_getprop,
173 	.mc_propinfo =			vioif_m_propinfo,
174 };
175 
176 static const uchar_t vioif_broadcast[ETHERADDRL] = {
177 	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
178 };
179 
180 /*
181  * Interval for the periodic TX reclaim.
182  */
183 uint_t vioif_reclaim_ms = 200;
184 
/*
 * Operator override for the kinds of interrupts vioif may use.  The default
 * is -1 rather than 0 so that an explicit override to 0 in /etc/system can
 * be told apart from "not set".
 */
int vioif_allowed_int_types = -1;
191 
192 /*
193  * DMA attribute template for transmit and receive buffers.  The SGL entry
194  * count will be modified before using the template.  Note that these
195  * allocations are aligned so that VIOIF_HEADER_SKIP places the IP header in
196  * received frames at the correct offset for the networking stack.
197  */
198 ddi_dma_attr_t vioif_dma_attr_bufs = {
199 	.dma_attr_version =		DMA_ATTR_V0,
200 	.dma_attr_addr_lo =		0x0000000000000000,
201 	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
202 	.dma_attr_count_max =		0x00000000FFFFFFFF,
203 	.dma_attr_align =		VIOIF_HEADER_ALIGN,
204 	.dma_attr_burstsizes =		1,
205 	.dma_attr_minxfer =		1,
206 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
207 	.dma_attr_seg =			0x00000000FFFFFFFF,
208 	.dma_attr_sgllen =		0,
209 	.dma_attr_granular =		1,
210 	.dma_attr_flags =		0
211 };
212 
213 /*
214  * DMA attributes for mapping larger transmit buffers from the networking
215  * stack.  The requirements are quite loose, but note that the SGL entry length
216  * field is 32-bit.
217  */
218 ddi_dma_attr_t vioif_dma_attr_external = {
219 	.dma_attr_version =		DMA_ATTR_V0,
220 	.dma_attr_addr_lo =		0x0000000000000000,
221 	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
222 	.dma_attr_count_max =		0x00000000FFFFFFFF,
223 	.dma_attr_align =		1,
224 	.dma_attr_burstsizes =		1,
225 	.dma_attr_minxfer =		1,
226 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
227 	.dma_attr_seg =			0x00000000FFFFFFFF,
228 	.dma_attr_sgllen =		VIOIF_MAX_SEGS - 1,
229 	.dma_attr_granular =		1,
230 	.dma_attr_flags =		0
231 };
232 
233 
234 /*
235  * VIRTIO NET MAC PROPERTIES
236  */
237 #define	VIOIF_MACPROP_TXCOPY_THRESH	"_txcopy_thresh"
238 #define	VIOIF_MACPROP_TXCOPY_THRESH_DEF	300
239 #define	VIOIF_MACPROP_TXCOPY_THRESH_MAX	640
240 
241 #define	VIOIF_MACPROP_RXCOPY_THRESH	"_rxcopy_thresh"
242 #define	VIOIF_MACPROP_RXCOPY_THRESH_DEF	300
243 #define	VIOIF_MACPROP_RXCOPY_THRESH_MAX	640
244 
245 static char *vioif_priv_props[] = {
246 	VIOIF_MACPROP_TXCOPY_THRESH,
247 	VIOIF_MACPROP_RXCOPY_THRESH,
248 	NULL
249 };
250 
251 
252 static vioif_txbuf_t *
253 vioif_txbuf_alloc(vioif_t *vif)
254 {
255 	vioif_txbuf_t *tb;
256 
257 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
258 
259 	if ((tb = list_remove_head(&vif->vif_txbufs)) != NULL) {
260 		vif->vif_ntxbufs_alloc++;
261 	}
262 
263 	return (tb);
264 }
265 
266 static void
267 vioif_txbuf_free(vioif_t *vif, vioif_txbuf_t *tb)
268 {
269 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
270 
271 	VERIFY3U(vif->vif_ntxbufs_alloc, >, 0);
272 	vif->vif_ntxbufs_alloc--;
273 
274 	virtio_chain_clear(tb->tb_chain);
275 	list_insert_head(&vif->vif_txbufs, tb);
276 }
277 
278 static vioif_rxbuf_t *
279 vioif_rxbuf_alloc(vioif_t *vif)
280 {
281 	vioif_rxbuf_t *rb;
282 
283 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
284 
285 	if ((rb = list_remove_head(&vif->vif_rxbufs)) != NULL) {
286 		vif->vif_nrxbufs_alloc++;
287 	}
288 
289 	return (rb);
290 }
291 
292 static void
293 vioif_rxbuf_free(vioif_t *vif, vioif_rxbuf_t *rb)
294 {
295 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
296 
297 	VERIFY3U(vif->vif_nrxbufs_alloc, >, 0);
298 	vif->vif_nrxbufs_alloc--;
299 
300 	virtio_chain_clear(rb->rb_chain);
301 	list_insert_head(&vif->vif_rxbufs, rb);
302 }
303 
304 static void
305 vioif_rx_free_callback(caddr_t free_arg)
306 {
307 	vioif_rxbuf_t *rb = (vioif_rxbuf_t *)free_arg;
308 	vioif_t *vif = rb->rb_vioif;
309 
310 	mutex_enter(&vif->vif_mutex);
311 
312 	/*
313 	 * Return this receive buffer to the free list.
314 	 */
315 	vioif_rxbuf_free(vif, rb);
316 
317 	VERIFY3U(vif->vif_nrxbufs_onloan, >, 0);
318 	vif->vif_nrxbufs_onloan--;
319 
320 	/*
321 	 * Attempt to replenish the receive queue with at least the buffer we
322 	 * just freed.  There isn't a great way to deal with failure here,
323 	 * though because we'll only loan at most half of the buffers there
324 	 * should always be at least some available even if this fails.
325 	 */
326 	(void) vioif_add_rx(vif);
327 
328 	mutex_exit(&vif->vif_mutex);
329 }
330 
331 static vioif_ctrlbuf_t *
332 vioif_ctrlbuf_alloc(vioif_t *vif)
333 {
334 	vioif_ctrlbuf_t *cb;
335 
336 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
337 
338 	if ((cb = list_remove_head(&vif->vif_ctrlbufs)) != NULL) {
339 		vif->vif_nctrlbufs_alloc++;
340 	}
341 
342 	return (cb);
343 }
344 
345 static void
346 vioif_ctrlbuf_free(vioif_t *vif, vioif_ctrlbuf_t *cb)
347 {
348 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
349 
350 	VERIFY3U(vif->vif_nctrlbufs_alloc, >, 0);
351 	vif->vif_nctrlbufs_alloc--;
352 
353 	virtio_chain_clear(cb->cb_chain);
354 	list_insert_head(&vif->vif_ctrlbufs, cb);
355 }
356 
357 static void
358 vioif_free_bufs(vioif_t *vif)
359 {
360 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
361 
362 	VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);
363 	for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
364 		vioif_txbuf_t *tb = &vif->vif_txbufs_mem[i];
365 
366 		/*
367 		 * Ensure that this txbuf is now in the free list:
368 		 */
369 		VERIFY(list_link_active(&tb->tb_link));
370 		list_remove(&vif->vif_txbufs, tb);
371 
372 		/*
373 		 * We should not have an mblk chain at this point.
374 		 */
375 		VERIFY3P(tb->tb_mp, ==, NULL);
376 
377 		if (tb->tb_dma != NULL) {
378 			virtio_dma_free(tb->tb_dma);
379 			tb->tb_dma = NULL;
380 		}
381 
382 		if (tb->tb_chain != NULL) {
383 			virtio_chain_free(tb->tb_chain);
384 			tb->tb_chain = NULL;
385 		}
386 
387 		if (tb->tb_dmaext != NULL) {
388 			for (uint_t j = 0; j < tb->tb_dmaext_capacity; j++) {
389 				if (tb->tb_dmaext[j] != NULL) {
390 					virtio_dma_free(
391 					    tb->tb_dmaext[j]);
392 					tb->tb_dmaext[j] = NULL;
393 				}
394 			}
395 
396 			kmem_free(tb->tb_dmaext,
397 			    sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity);
398 			tb->tb_dmaext = NULL;
399 			tb->tb_dmaext_capacity = 0;
400 		}
401 	}
402 	VERIFY(list_is_empty(&vif->vif_txbufs));
403 	if (vif->vif_txbufs_mem != NULL) {
404 		kmem_free(vif->vif_txbufs_mem,
405 		    sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity);
406 		vif->vif_txbufs_mem = NULL;
407 		vif->vif_txbufs_capacity = 0;
408 	}
409 
410 	VERIFY3U(vif->vif_nrxbufs_alloc, ==, 0);
411 	for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
412 		vioif_rxbuf_t *rb = &vif->vif_rxbufs_mem[i];
413 
414 		/*
415 		 * Ensure that this rxbuf is now in the free list:
416 		 */
417 		VERIFY(list_link_active(&rb->rb_link));
418 		list_remove(&vif->vif_rxbufs, rb);
419 
420 		if (rb->rb_dma != NULL) {
421 			virtio_dma_free(rb->rb_dma);
422 			rb->rb_dma = NULL;
423 		}
424 
425 		if (rb->rb_chain != NULL) {
426 			virtio_chain_free(rb->rb_chain);
427 			rb->rb_chain = NULL;
428 		}
429 	}
430 	VERIFY(list_is_empty(&vif->vif_rxbufs));
431 	if (vif->vif_rxbufs_mem != NULL) {
432 		kmem_free(vif->vif_rxbufs_mem,
433 		    sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity);
434 		vif->vif_rxbufs_mem = NULL;
435 		vif->vif_rxbufs_capacity = 0;
436 	}
437 
438 	if (vif->vif_has_ctrlq) {
439 		VERIFY3U(vif->vif_nctrlbufs_alloc, ==, 0);
440 		for (uint_t i = 0; i < vif->vif_ctrlbufs_capacity; i++) {
441 			vioif_ctrlbuf_t *cb = &vif->vif_ctrlbufs_mem[i];
442 
443 			/*
444 			 * Ensure that this ctrlbuf is now in the free list
445 			 */
446 			VERIFY(list_link_active(&cb->cb_link));
447 			list_remove(&vif->vif_ctrlbufs, cb);
448 
449 			if (cb->cb_dma != NULL) {
450 				virtio_dma_free(cb->cb_dma);
451 				cb->cb_dma = NULL;
452 			}
453 
454 			if (cb->cb_chain != NULL) {
455 				virtio_chain_free(cb->cb_chain);
456 				cb->cb_chain = NULL;
457 			}
458 		}
459 		VERIFY(list_is_empty(&vif->vif_ctrlbufs));
460 		if (vif->vif_ctrlbufs_mem != NULL) {
461 			kmem_free(vif->vif_ctrlbufs_mem,
462 			    sizeof (vioif_ctrlbuf_t) *
463 			    vif->vif_ctrlbufs_capacity);
464 			vif->vif_ctrlbufs_mem = NULL;
465 			vif->vif_ctrlbufs_capacity = 0;
466 		}
467 	}
468 }
469 
470 static int
471 vioif_alloc_bufs(vioif_t *vif)
472 {
473 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
474 
475 	/*
476 	 * Allocate one contiguous chunk of memory for the transmit and receive
477 	 * buffer tracking objects.  If the ring is unusually small, we'll
478 	 * reduce our target buffer count accordingly.
479 	 */
480 	vif->vif_txbufs_capacity = MIN(VIRTIO_NET_TX_BUFS,
481 	    virtio_queue_size(vif->vif_tx_vq));
482 	vif->vif_txbufs_mem = kmem_zalloc(
483 	    sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity, KM_SLEEP);
484 	list_create(&vif->vif_txbufs, sizeof (vioif_txbuf_t),
485 	    offsetof(vioif_txbuf_t, tb_link));
486 
487 	vif->vif_rxbufs_capacity = MIN(VIRTIO_NET_RX_BUFS,
488 	    virtio_queue_size(vif->vif_rx_vq));
489 	vif->vif_rxbufs_mem = kmem_zalloc(
490 	    sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity, KM_SLEEP);
491 	list_create(&vif->vif_rxbufs, sizeof (vioif_rxbuf_t),
492 	    offsetof(vioif_rxbuf_t, rb_link));
493 
494 	if (vif->vif_has_ctrlq) {
495 		vif->vif_ctrlbufs_capacity = MIN(VIRTIO_NET_CTRL_BUFS,
496 		    virtio_queue_size(vif->vif_ctrl_vq));
497 		vif->vif_ctrlbufs_mem = kmem_zalloc(
498 		    sizeof (vioif_ctrlbuf_t) * vif->vif_ctrlbufs_capacity,
499 		    KM_SLEEP);
500 	}
501 	list_create(&vif->vif_ctrlbufs, sizeof (vioif_ctrlbuf_t),
502 	    offsetof(vioif_ctrlbuf_t, cb_link));
503 
504 	/*
505 	 * Do not loan more than half of our allocated receive buffers into
506 	 * the networking stack.
507 	 */
508 	vif->vif_nrxbufs_onloan_max = vif->vif_rxbufs_capacity / 2;
509 
510 	/*
511 	 * Put everything in the free list straight away in order to simplify
512 	 * the use of vioif_free_bufs() for cleanup on allocation failure.
513 	 */
514 	for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
515 		list_insert_tail(&vif->vif_txbufs, &vif->vif_txbufs_mem[i]);
516 	}
517 	for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
518 		list_insert_tail(&vif->vif_rxbufs, &vif->vif_rxbufs_mem[i]);
519 	}
520 	for (uint_t i = 0; i < vif->vif_ctrlbufs_capacity; i++) {
521 		list_insert_tail(&vif->vif_ctrlbufs, &vif->vif_ctrlbufs_mem[i]);
522 	}
523 
524 	/*
525 	 * Start from the DMA attribute template common to both transmit and
526 	 * receive buffers.  The SGL entry count will be modified for each
527 	 * buffer type.
528 	 */
529 	ddi_dma_attr_t attr = vioif_dma_attr_bufs;
530 
531 	/*
532 	 * The transmit inline buffer is small (less than a page), so it's
533 	 * reasonable to request a single cookie.
534 	 */
535 	attr.dma_attr_sgllen = 1;
536 
537 	for (vioif_txbuf_t *tb = list_head(&vif->vif_txbufs); tb != NULL;
538 	    tb = list_next(&vif->vif_txbufs, tb)) {
539 		if ((tb->tb_dma = virtio_dma_alloc(vif->vif_virtio,
540 		    VIOIF_TX_INLINE_SIZE, &attr,
541 		    DDI_DMA_STREAMING | DDI_DMA_WRITE, KM_SLEEP)) == NULL) {
542 			goto fail;
543 		}
544 		VERIFY3U(virtio_dma_ncookies(tb->tb_dma), ==, 1);
545 
546 		if ((tb->tb_chain = virtio_chain_alloc(vif->vif_tx_vq,
547 		    KM_SLEEP)) == NULL) {
548 			goto fail;
549 		}
550 		virtio_chain_data_set(tb->tb_chain, tb);
551 
552 		tb->tb_dmaext_capacity = VIOIF_MAX_SEGS - 1;
553 		tb->tb_dmaext = kmem_zalloc(
554 		    sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity,
555 		    KM_SLEEP);
556 	}
557 
558 	/*
559 	 * Control queue buffers are also small (less than a page), so we'll
560 	 * also request a single cookie for them.
561 	 */
562 	for (vioif_ctrlbuf_t *cb = list_head(&vif->vif_ctrlbufs); cb != NULL;
563 	    cb = list_next(&vif->vif_ctrlbufs, cb)) {
564 		if ((cb->cb_dma = virtio_dma_alloc(vif->vif_virtio,
565 		    VIOIF_CTRL_SIZE, &attr,
566 		    DDI_DMA_STREAMING | DDI_DMA_RDWR, KM_SLEEP)) == NULL) {
567 			goto fail;
568 		}
569 		VERIFY3U(virtio_dma_ncookies(cb->cb_dma), ==, 1);
570 
571 		if ((cb->cb_chain = virtio_chain_alloc(vif->vif_ctrl_vq,
572 		    KM_SLEEP)) == NULL) {
573 			goto fail;
574 		}
575 		virtio_chain_data_set(cb->cb_chain, cb);
576 	}
577 
578 	/*
579 	 * The receive buffers are larger, and we can tolerate a large number
580 	 * of segments.  Adjust the SGL entry count, setting aside one segment
581 	 * for the virtio net header.
582 	 */
583 	attr.dma_attr_sgllen = VIOIF_MAX_SEGS - 1;
584 
585 	for (vioif_rxbuf_t *rb = list_head(&vif->vif_rxbufs); rb != NULL;
586 	    rb = list_next(&vif->vif_rxbufs, rb)) {
587 		if ((rb->rb_dma = virtio_dma_alloc(vif->vif_virtio,
588 		    VIOIF_RX_BUF_SIZE, &attr, DDI_DMA_STREAMING | DDI_DMA_READ,
589 		    KM_SLEEP)) == NULL) {
590 			goto fail;
591 		}
592 
593 		if ((rb->rb_chain = virtio_chain_alloc(vif->vif_rx_vq,
594 		    KM_SLEEP)) == NULL) {
595 			goto fail;
596 		}
597 		virtio_chain_data_set(rb->rb_chain, rb);
598 
599 		/*
600 		 * Ensure that the first cookie is sufficient to cover the
601 		 * header skip region plus one byte.
602 		 */
603 		VERIFY3U(virtio_dma_cookie_size(rb->rb_dma, 0), >=,
604 		    VIOIF_HEADER_SKIP + 1);
605 
606 		/*
607 		 * Ensure that the frame data begins at a location with a
608 		 * correctly aligned IP header.
609 		 */
610 		VERIFY3U((uintptr_t)virtio_dma_va(rb->rb_dma,
611 		    VIOIF_HEADER_SKIP) % 4, ==, 2);
612 
613 		rb->rb_vioif = vif;
614 		rb->rb_frtn.free_func = vioif_rx_free_callback;
615 		rb->rb_frtn.free_arg = (caddr_t)rb;
616 	}
617 
618 	return (0);
619 
620 fail:
621 	vioif_free_bufs(vif);
622 	return (ENOMEM);
623 }
624 
625 static int
626 vioif_ctrlq_req(vioif_t *vif, uint8_t class, uint8_t cmd, void *data,
627     size_t datalen)
628 {
629 	vioif_ctrlbuf_t *cb = NULL;
630 	virtio_chain_t *vic = NULL;
631 	uint8_t *p = NULL;
632 	uint64_t pa = 0;
633 	uint8_t *ackp = NULL;
634 	struct virtio_net_ctrlq_hdr hdr = {
635 		.vnch_class = class,
636 		.vnch_command = cmd,
637 	};
638 	const size_t hdrlen = sizeof (hdr);
639 	const size_t acklen = 1; /* the ack is always 1 byte */
640 	size_t totlen = hdrlen + datalen + acklen;
641 	int r = DDI_SUCCESS;
642 
643 	/*
644 	 * We shouldn't be called unless the ctrlq feature has been
645 	 * negotiated with the host
646 	 */
647 	VERIFY(vif->vif_has_ctrlq);
648 
649 	mutex_enter(&vif->vif_mutex);
650 	cb = vioif_ctrlbuf_alloc(vif);
651 	if (cb == NULL) {
652 		vif->vif_noctrlbuf++;
653 		mutex_exit(&vif->vif_mutex);
654 		r = DDI_FAILURE;
655 		goto done;
656 	}
657 	mutex_exit(&vif->vif_mutex);
658 
659 	if (totlen > virtio_dma_size(cb->cb_dma)) {
660 		vif->vif_ctrlbuf_toosmall++;
661 		r = DDI_FAILURE;
662 		goto done;
663 	}
664 
665 	/*
666 	 * Clear the entire buffer. Technically not necessary, but useful
667 	 * if trying to troubleshoot an issue, and probably not a bad idea
668 	 * to not let any old data linger.
669 	 */
670 	p = virtio_dma_va(cb->cb_dma, 0);
671 	bzero(p, virtio_dma_size(cb->cb_dma));
672 
673 	/*
674 	 * We currently do not support VIRTIO_F_ANY_LAYOUT. That means,
675 	 * that we must put the header, the data, and the ack in their
676 	 * own respective descriptors. Since all the currently supported
677 	 * control queue commands take _very_ small amounts of data, we
678 	 * use a single DMA buffer for all of it, but use 3 descriptors to
679 	 * reference (respectively) the header, the data, and the ack byte
680 	 * within that memory to adhere to the virtio spec.
681 	 *
682 	 * If we add support for control queue features such as custom
683 	 * MAC filtering tables, which might require larger amounts of
684 	 * memory, we likely will want to add more sophistication here
685 	 * and optionally use additional allocated memory to hold that
686 	 * data instead of a fixed size buffer.
687 	 *
688 	 * Copy the header.
689 	 */
690 	bcopy(&hdr, p, sizeof (hdr));
691 	pa = virtio_dma_cookie_pa(cb->cb_dma, 0);
692 	if ((r = virtio_chain_append(cb->cb_chain,
693 	    pa, hdrlen, VIRTIO_DIR_DEVICE_READS)) != DDI_SUCCESS) {
694 		goto done;
695 	}
696 
697 	/*
698 	 * Copy the request data
699 	 */
700 	p = virtio_dma_va(cb->cb_dma, hdrlen);
701 	bcopy(data, p, datalen);
702 	if ((r = virtio_chain_append(cb->cb_chain,
703 	    pa + hdrlen, datalen, VIRTIO_DIR_DEVICE_READS)) != DDI_SUCCESS) {
704 		goto done;
705 	}
706 
707 	/*
708 	 * We already cleared the buffer, so don't need to copy out a 0 for
709 	 * the ack byte. Just add a descriptor for that spot.
710 	 */
711 	ackp = virtio_dma_va(cb->cb_dma, hdrlen + datalen);
712 	if ((r = virtio_chain_append(cb->cb_chain,
713 	    pa + hdrlen + datalen, acklen,
714 	    VIRTIO_DIR_DEVICE_WRITES)) != DDI_SUCCESS) {
715 		goto done;
716 	}
717 
718 	virtio_dma_sync(cb->cb_dma, DDI_DMA_SYNC_FORDEV);
719 	virtio_chain_submit(cb->cb_chain, B_TRUE);
720 
721 	/*
722 	 * Spin waiting for response.
723 	 */
724 	mutex_enter(&vif->vif_mutex);
725 	while ((vic = virtio_queue_poll(vif->vif_ctrl_vq)) == NULL) {
726 		mutex_exit(&vif->vif_mutex);
727 		delay(drv_usectohz(1000));
728 		mutex_enter(&vif->vif_mutex);
729 	}
730 
731 	virtio_dma_sync(cb->cb_dma, DDI_DMA_SYNC_FORCPU);
732 	VERIFY3P(virtio_chain_data(vic), ==, cb);
733 	mutex_exit(&vif->vif_mutex);
734 
735 	if (*ackp != VIRTIO_NET_CQ_OK) {
736 		r = DDI_FAILURE;
737 	}
738 
739 done:
740 	mutex_enter(&vif->vif_mutex);
741 	vioif_ctrlbuf_free(vif, cb);
742 	mutex_exit(&vif->vif_mutex);
743 
744 	return (r);
745 }
746 
747 static int
748 vioif_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr)
749 {
750 	/*
751 	 * Even though we currently do not have support for programming
752 	 * multicast filters, or even enabling promiscuous mode, we return
753 	 * success here to avoid the networking stack falling back to link
754 	 * layer broadcast for multicast traffic.  Some hypervisors already
755 	 * pass received multicast frames onto the guest, so at least on those
756 	 * systems multicast will work as expected anyway.
757 	 */
758 	return (0);
759 }
760 
761 static int
762 vioif_m_setpromisc(void *arg, boolean_t on)
763 {
764 	vioif_t *vif = arg;
765 	uint8_t val = on ? 1 : 0;
766 
767 	if (!vif->vif_has_ctrlq_rx) {
768 		/*
769 		 * While most hypervisors support the control queue, bhyve
770 		 * (or more specifically viona) on illumos currently does not.
771 		 *
772 		 * Until that support is added to viona, we pretend
773 		 * the request always succeeds to match the historic behavior
774 		 * of the illumos vioif driver. Once that support has been
775 		 * added to viona, we should do the correct thing and return
776 		 * ENOTSUP
777 		 */
778 		return (0);
779 	}
780 
781 	return (vioif_ctrlq_req(vif, VIRTIO_NET_CTRL_RX,
782 	    VIRTIO_NET_CTRL_RX_PROMISC, &val, sizeof (val)));
783 }
784 
785 static int
786 vioif_m_unicst(void *arg, const uint8_t *mac)
787 {
788 	return (ENOTSUP);
789 }
790 
791 static uint_t
792 vioif_add_rx(vioif_t *vif)
793 {
794 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
795 
796 	if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
797 		/*
798 		 * If the NIC is not running, do not give the device any
799 		 * receive buffers.
800 		 */
801 		return (0);
802 	}
803 
804 	uint_t num_added = 0;
805 
806 	vioif_rxbuf_t *rb;
807 	while ((rb = vioif_rxbuf_alloc(vif)) != NULL) {
808 		/*
809 		 * For legacy devices, and those that have not negotiated
810 		 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a
811 		 * separate descriptor entry to the rest of the buffer.
812 		 */
813 		if (virtio_chain_append(rb->rb_chain,
814 		    virtio_dma_cookie_pa(rb->rb_dma, 0),
815 		    sizeof (struct virtio_net_hdr),
816 		    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
817 			goto fail;
818 		}
819 
820 		for (uint_t n = 0; n < virtio_dma_ncookies(rb->rb_dma); n++) {
821 			uint64_t pa = virtio_dma_cookie_pa(rb->rb_dma, n);
822 			size_t sz = virtio_dma_cookie_size(rb->rb_dma, n);
823 
824 			if (n == 0) {
825 				pa += VIOIF_HEADER_SKIP;
826 				VERIFY3U(sz, >, VIOIF_HEADER_SKIP);
827 				sz -= VIOIF_HEADER_SKIP;
828 			}
829 
830 			if (virtio_chain_append(rb->rb_chain, pa, sz,
831 			    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
832 				goto fail;
833 			}
834 		}
835 
836 		virtio_chain_submit(rb->rb_chain, B_FALSE);
837 		num_added++;
838 		continue;
839 
840 fail:
841 		vioif_rxbuf_free(vif, rb);
842 		vif->vif_norecvbuf++;
843 		break;
844 	}
845 
846 	if (num_added > 0) {
847 		virtio_queue_flush(vif->vif_rx_vq);
848 	}
849 
850 	return (num_added);
851 }
852 
853 static uint_t
854 vioif_process_rx(vioif_t *vif)
855 {
856 	virtio_chain_t *vic;
857 	mblk_t *mphead = NULL, *lastmp = NULL, *mp;
858 	uint_t num_processed = 0;
859 
860 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
861 
862 	while ((vic = virtio_queue_poll(vif->vif_rx_vq)) != NULL) {
863 		/*
864 		 * We have to use the chain received length here, as the device
865 		 * does not tell us the received frame length any other way.
866 		 * In a limited survey of hypervisors, virtio network devices
867 		 * appear to provide the right value here.
868 		 */
869 		size_t len = virtio_chain_received_length(vic);
870 		vioif_rxbuf_t *rb = virtio_chain_data(vic);
871 
872 		virtio_dma_sync(rb->rb_dma, DDI_DMA_SYNC_FORCPU);
873 
874 		/*
875 		 * If the NIC is not running, discard any received frames.
876 		 */
877 		if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
878 			vioif_rxbuf_free(vif, rb);
879 			continue;
880 		}
881 
882 		if (len < sizeof (struct virtio_net_hdr)) {
883 			vif->vif_rxfail_chain_undersize++;
884 			vif->vif_ierrors++;
885 			vioif_rxbuf_free(vif, rb);
886 			continue;
887 		}
888 		len -= sizeof (struct virtio_net_hdr);
889 
890 		/*
891 		 * We copy small packets that happen to fit into a single
892 		 * cookie and reuse the buffers. For bigger ones, we loan
893 		 * the buffers upstream.
894 		 */
895 		if (len < vif->vif_rxcopy_thresh ||
896 		    vif->vif_nrxbufs_onloan >= vif->vif_nrxbufs_onloan_max) {
897 			mutex_exit(&vif->vif_mutex);
898 			if ((mp = allocb(len, 0)) == NULL) {
899 				mutex_enter(&vif->vif_mutex);
900 				vif->vif_norecvbuf++;
901 				vif->vif_ierrors++;
902 
903 				vioif_rxbuf_free(vif, rb);
904 				continue;
905 			}
906 
907 			bcopy(virtio_dma_va(rb->rb_dma, VIOIF_HEADER_SKIP),
908 			    mp->b_rptr, len);
909 			mp->b_wptr = mp->b_rptr + len;
910 
911 			/*
912 			 * As the packet contents was copied rather than
913 			 * loaned, we can return the receive buffer resources
914 			 * to the free list.
915 			 */
916 			mutex_enter(&vif->vif_mutex);
917 			vioif_rxbuf_free(vif, rb);
918 
919 		} else {
920 			mutex_exit(&vif->vif_mutex);
921 			if ((mp = desballoc(virtio_dma_va(rb->rb_dma,
922 			    VIOIF_HEADER_SKIP), len, 0,
923 			    &rb->rb_frtn)) == NULL) {
924 				mutex_enter(&vif->vif_mutex);
925 				vif->vif_norecvbuf++;
926 				vif->vif_ierrors++;
927 
928 				vioif_rxbuf_free(vif, rb);
929 				continue;
930 			}
931 			mp->b_wptr = mp->b_rptr + len;
932 
933 			mutex_enter(&vif->vif_mutex);
934 			vif->vif_nrxbufs_onloan++;
935 		}
936 
937 		/*
938 		 * virtio-net does not tell us if this packet is multicast
939 		 * or broadcast, so we have to check it.
940 		 */
941 		if (mp->b_rptr[0] & 0x1) {
942 			if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0)
943 				vif->vif_multircv++;
944 			else
945 				vif->vif_brdcstrcv++;
946 		}
947 
948 		vif->vif_rbytes += len;
949 		vif->vif_ipackets++;
950 
951 		if (lastmp == NULL) {
952 			mphead = mp;
953 		} else {
954 			lastmp->b_next = mp;
955 		}
956 		lastmp = mp;
957 		num_processed++;
958 	}
959 
960 	if (mphead != NULL) {
961 		if (vif->vif_runstate == VIOIF_RUNSTATE_RUNNING) {
962 			mutex_exit(&vif->vif_mutex);
963 			mac_rx(vif->vif_mac_handle, NULL, mphead);
964 			mutex_enter(&vif->vif_mutex);
965 		} else {
966 			/*
967 			 * The NIC was disabled part way through our execution,
968 			 * so free the messages we allocated.
969 			 */
970 			freemsgchain(mphead);
971 		}
972 	}
973 
974 	return (num_processed);
975 }
976 
977 static uint_t
978 vioif_reclaim_used_tx(vioif_t *vif)
979 {
980 	virtio_chain_t *vic;
981 	uint_t num_reclaimed = 0;
982 
983 	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
984 
985 	while ((vic = virtio_queue_poll(vif->vif_tx_vq)) != NULL) {
986 		vioif_txbuf_t *tb = virtio_chain_data(vic);
987 
988 		if (tb->tb_mp != NULL) {
989 			/*
990 			 * Unbind the external mapping.
991 			 */
992 			for (uint_t i = 0; i < tb->tb_dmaext_capacity; i++) {
993 				if (tb->tb_dmaext[i] == NULL) {
994 					continue;
995 				}
996 
997 				virtio_dma_unbind(tb->tb_dmaext[i]);
998 			}
999 
1000 			freemsg(tb->tb_mp);
1001 			tb->tb_mp = NULL;
1002 		}
1003 
1004 		/*
1005 		 * Return this transmit buffer to the free list for reuse.
1006 		 */
1007 		mutex_enter(&vif->vif_mutex);
1008 		vioif_txbuf_free(vif, tb);
1009 		mutex_exit(&vif->vif_mutex);
1010 
1011 		num_reclaimed++;
1012 	}
1013 
1014 	/* Return ring to transmitting state if descriptors were reclaimed. */
1015 	if (num_reclaimed > 0) {
1016 		boolean_t do_update = B_FALSE;
1017 
1018 		mutex_enter(&vif->vif_mutex);
1019 		vif->vif_stat_tx_reclaim += num_reclaimed;
1020 		if (vif->vif_tx_corked) {
1021 			/*
1022 			 * TX was corked on a lack of available descriptors.
1023 			 * That dire state has passed so the TX interrupt can
1024 			 * be disabled and MAC can be notified that
1025 			 * transmission is possible again.
1026 			 */
1027 			vif->vif_tx_corked = B_FALSE;
1028 			virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
1029 			do_update = B_TRUE;
1030 		}
1031 
1032 		if (do_update) {
1033 			mac_tx_update(vif->vif_mac_handle);
1034 		}
1035 		mutex_exit(&vif->vif_mutex);
1036 	}
1037 
1038 	return (num_reclaimed);
1039 }
1040 
1041 static void
1042 vioif_reclaim_periodic(void *arg)
1043 {
1044 	vioif_t *vif = arg;
1045 	uint_t num_reclaimed;
1046 
1047 	num_reclaimed = vioif_reclaim_used_tx(vif);
1048 
1049 	mutex_enter(&vif->vif_mutex);
1050 	vif->vif_tx_reclaim_tid = 0;
1051 	/*
1052 	 * If used descriptors were reclaimed or TX descriptors appear to be
1053 	 * outstanding, the ring is considered active and periodic reclamation
1054 	 * is necessary for now.
1055 	 */
1056 	if (num_reclaimed != 0 || virtio_queue_nactive(vif->vif_tx_vq) != 0) {
1057 		/* Do not reschedule if the ring is being drained. */
1058 		if (!vif->vif_tx_drain) {
1059 			vioif_reclaim_restart(vif);
1060 		}
1061 	}
1062 	mutex_exit(&vif->vif_mutex);
1063 }
1064 
1065 static void
1066 vioif_reclaim_restart(vioif_t *vif)
1067 {
1068 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
1069 	VERIFY(!vif->vif_tx_drain);
1070 
1071 	if (vif->vif_tx_reclaim_tid == 0) {
1072 		vif->vif_tx_reclaim_tid = timeout(vioif_reclaim_periodic, vif,
1073 		    MSEC_TO_TICK_ROUNDUP(vioif_reclaim_ms));
1074 	}
1075 }
1076 
/*
 * Stop periodic transmit reclamation and wait until every transmit buffer
 * has been returned to the free list.  Must be called with vif_mutex held
 * while the run state is VIOIF_RUNSTATE_STOPPING; the mutex is dropped and
 * reacquired internally, and is held again on return.
 */
static void
vioif_tx_drain(vioif_t *vif)
{
	VERIFY(MUTEX_HELD(&vif->vif_mutex));
	VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPING);

	vif->vif_tx_drain = B_TRUE;
	/* Put a stop to the periodic reclaim if it is running */
	if (vif->vif_tx_reclaim_tid != 0) {
		timeout_id_t tid = vif->vif_tx_reclaim_tid;

		/*
		 * With vif_tx_drain set, there is no risk that a racing
		 * vioif_reclaim_periodic() call will reschedule itself.
		 *
		 * Being part of the mc_stop hook also guarantees that
		 * vioif_m_tx() will not be called to restart it.
		 */
		vif->vif_tx_reclaim_tid = 0;
		/*
		 * vioif_reclaim_periodic() acquires vif_mutex, so the mutex
		 * is released around the untimeout() call.
		 */
		mutex_exit(&vif->vif_mutex);
		(void) untimeout(tid);
		mutex_enter(&vif->vif_mutex);
	}
	/* Reclamation below is done by polling; disable the TX interrupt. */
	virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);

	/*
	 * Wait for all of the TX descriptors to be processed by the host so
	 * they can be reclaimed.
	 */
	while (vif->vif_ntxbufs_alloc > 0) {
		/*
		 * vioif_reclaim_used_tx() requires that vif_mutex is not
		 * held, so drop it for the reclaim and the pause.
		 */
		mutex_exit(&vif->vif_mutex);
		(void) vioif_reclaim_used_tx(vif);
		delay(5);
		mutex_enter(&vif->vif_mutex);
	}
	/* Sanity-check the post-drain state. */
	VERIFY(!vif->vif_tx_corked);
	VERIFY3U(vif->vif_tx_reclaim_tid, ==, 0);
	VERIFY3U(virtio_queue_nactive(vif->vif_tx_vq), ==, 0);
}
1116 
1117 static int
1118 vioif_tx_inline(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
1119 {
1120 	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
1121 
1122 	VERIFY3U(msg_size, <=, virtio_dma_size(tb->tb_dma) - VIOIF_HEADER_SKIP);
1123 
1124 	/*
1125 	 * Copy the message into the inline buffer and then free the message.
1126 	 */
1127 	mcopymsg(mp, virtio_dma_va(tb->tb_dma, VIOIF_HEADER_SKIP));
1128 
1129 	if (virtio_chain_append(tb->tb_chain,
1130 	    virtio_dma_cookie_pa(tb->tb_dma, 0) + VIOIF_HEADER_SKIP,
1131 	    msg_size, VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
1132 		return (DDI_FAILURE);
1133 	}
1134 
1135 	return (DDI_SUCCESS);
1136 }
1137 
/*
 * Transmit path for larger frames: bind each non-empty mblk in the message
 * for DMA and append every resulting cookie to the descriptor chain, so the
 * payload is not copied.
 *
 * On success the message is stored in tb->tb_mp so that it stays allocated
 * until the buffer is reclaimed, and DDI_SUCCESS is returned.  On failure all
 * bindings made so far are undone, the message is freed, and DDI_FAILURE is
 * returned.  vif_mutex must not be held; it is taken briefly to bump
 * failure counters.  The msg_size argument is not used here.
 */
static int
vioif_tx_external(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
{
	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));

	mblk_t *nmp = mp;
	tb->tb_ndmaext = 0;

	while (nmp != NULL) {
		size_t len;

		if ((len = MBLKL(nmp)) == 0) {
			/*
			 * Skip any zero-length entries in the chain.
			 */
			nmp = nmp->b_cont;
			continue;
		}

		/* Every non-empty mblk consumes one external DMA slot. */
		if (tb->tb_ndmaext >= tb->tb_dmaext_capacity) {
			mutex_enter(&vif->vif_mutex);
			vif->vif_txfail_indirect_limit++;
			vif->vif_notxbuf++;
			mutex_exit(&vif->vif_mutex);
			goto fail;
		}

		if (tb->tb_dmaext[tb->tb_ndmaext] == NULL) {
			/*
			 * Allocate a DMA handle for this slot.
			 */
			if ((tb->tb_dmaext[tb->tb_ndmaext] =
			    virtio_dma_alloc_nomem(vif->vif_virtio,
			    &vioif_dma_attr_external, KM_SLEEP)) == NULL) {
				mutex_enter(&vif->vif_mutex);
				vif->vif_notxbuf++;
				mutex_exit(&vif->vif_mutex);
				goto fail;
			}
		}
		/*
		 * The slot is counted as in use before the bind is attempted,
		 * so the failure path below also visits this handle.
		 */
		virtio_dma_t *extdma = tb->tb_dmaext[tb->tb_ndmaext++];

		if (virtio_dma_bind(extdma, nmp->b_rptr, len,
		    DDI_DMA_WRITE | DDI_DMA_STREAMING, KM_SLEEP) !=
		    DDI_SUCCESS) {
			mutex_enter(&vif->vif_mutex);
			vif->vif_txfail_dma_bind++;
			mutex_exit(&vif->vif_mutex);
			goto fail;
		}

		/* One descriptor entry per DMA cookie of this binding. */
		for (uint_t n = 0; n < virtio_dma_ncookies(extdma); n++) {
			uint64_t pa = virtio_dma_cookie_pa(extdma, n);
			size_t sz = virtio_dma_cookie_size(extdma, n);

			if (virtio_chain_append(tb->tb_chain, pa, sz,
			    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
				mutex_enter(&vif->vif_mutex);
				vif->vif_txfail_indirect_limit++;
				vif->vif_notxbuf++;
				mutex_exit(&vif->vif_mutex);
				goto fail;
			}
		}

		nmp = nmp->b_cont;
	}

	/*
	 * We need to keep the message around until we reclaim the buffer from
	 * the device before freeing it.
	 */
	tb->tb_mp = mp;

	return (DDI_SUCCESS);

fail:
	/*
	 * Undo the bindings for every slot used by this attempt.
	 * NOTE(review): this may include a handle whose bind just failed;
	 * presumably virtio_dma_unbind() tolerates an unbound handle --
	 * confirm against its implementation.
	 */
	for (uint_t n = 0; n < tb->tb_ndmaext; n++) {
		if (tb->tb_dmaext[n] != NULL) {
			virtio_dma_unbind(tb->tb_dmaext[n]);
		}
	}
	tb->tb_ndmaext = 0;

	freemsg(mp);

	return (DDI_FAILURE);
}
1226 
1227 static boolean_t
1228 vioif_send(vioif_t *vif, mblk_t *mp)
1229 {
1230 	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
1231 
1232 	vioif_txbuf_t *tb = NULL;
1233 	struct virtio_net_hdr *vnh = NULL;
1234 	size_t msg_size = 0;
1235 	uint32_t csum_start;
1236 	uint32_t csum_stuff;
1237 	uint32_t csum_flags;
1238 	uint32_t lso_flags;
1239 	uint32_t lso_mss;
1240 	mblk_t *nmp;
1241 	int ret;
1242 	boolean_t lso_required = B_FALSE;
1243 	struct ether_header *ether = (void *)mp->b_rptr;
1244 
1245 	for (nmp = mp; nmp; nmp = nmp->b_cont)
1246 		msg_size += MBLKL(nmp);
1247 
1248 	if (vif->vif_tx_tso4 || vif->vif_tx_tso6) {
1249 		mac_lso_get(mp, &lso_mss, &lso_flags);
1250 		lso_required = (lso_flags & HW_LSO) != 0;
1251 	}
1252 
1253 	mutex_enter(&vif->vif_mutex);
1254 	if ((tb = vioif_txbuf_alloc(vif)) == NULL) {
1255 		vif->vif_notxbuf++;
1256 		goto fail;
1257 	}
1258 	mutex_exit(&vif->vif_mutex);
1259 
1260 	/*
1261 	 * Use the inline buffer for the virtio net header.  Zero the portion
1262 	 * of our DMA allocation prior to the packet data.
1263 	 */
1264 	vnh = virtio_dma_va(tb->tb_dma, 0);
1265 	bzero(vnh, VIOIF_HEADER_SKIP);
1266 
1267 	/*
1268 	 * For legacy devices, and those that have not negotiated
1269 	 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a separate
1270 	 * descriptor entry to the rest of the buffer.
1271 	 */
1272 	if (virtio_chain_append(tb->tb_chain,
1273 	    virtio_dma_cookie_pa(tb->tb_dma, 0), sizeof (struct virtio_net_hdr),
1274 	    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
1275 		mutex_enter(&vif->vif_mutex);
1276 		vif->vif_notxbuf++;
1277 		goto fail;
1278 	}
1279 
1280 	mac_hcksum_get(mp, &csum_start, &csum_stuff, NULL, NULL, &csum_flags);
1281 
1282 	/*
1283 	 * They want us to do the TCP/UDP csum calculation.
1284 	 */
1285 	if (csum_flags & HCK_PARTIALCKSUM) {
1286 		int eth_hsize;
1287 
1288 		/*
1289 		 * Did we ask for it?
1290 		 */
1291 		ASSERT(vif->vif_tx_csum);
1292 
1293 		/*
1294 		 * We only asked for partial csum packets.
1295 		 */
1296 		ASSERT(!(csum_flags & HCK_IPV4_HDRCKSUM));
1297 		ASSERT(!(csum_flags & HCK_FULLCKSUM));
1298 
1299 		if (ether->ether_type == htons(ETHERTYPE_VLAN)) {
1300 			eth_hsize = sizeof (struct ether_vlan_header);
1301 		} else {
1302 			eth_hsize = sizeof (struct ether_header);
1303 		}
1304 
1305 		vnh->vnh_flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1306 		vnh->vnh_csum_start = eth_hsize + csum_start;
1307 		vnh->vnh_csum_offset = csum_stuff - csum_start;
1308 	}
1309 
1310 	/*
1311 	 * Setup LSO fields if required.
1312 	 */
1313 	if (lso_required) {
1314 		mac_ether_offload_flags_t needed;
1315 		mac_ether_offload_info_t meo;
1316 		uint32_t cksum;
1317 		size_t len;
1318 		mblk_t *pullmp = NULL;
1319 		tcpha_t *tcpha;
1320 
1321 		if (mac_ether_offload_info(mp, &meo) != 0) {
1322 			goto fail;
1323 		}
1324 
1325 		needed = MEOI_L2INFO_SET | MEOI_L3INFO_SET | MEOI_L4INFO_SET;
1326 		if ((meo.meoi_flags & needed) != needed) {
1327 			goto fail;
1328 		}
1329 
1330 		if (meo.meoi_l4proto != IPPROTO_TCP) {
1331 			goto fail;
1332 		}
1333 
1334 		if (meo.meoi_l3proto == ETHERTYPE_IP && vif->vif_tx_tso4) {
1335 			vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1336 		} else if (meo.meoi_l3proto == ETHERTYPE_IPV6 &&
1337 		    vif->vif_tx_tso6) {
1338 			vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1339 		} else {
1340 			goto fail;
1341 		}
1342 
1343 		/*
1344 		 * The TCP stack does not include the length in the TCP
1345 		 * pseudo-header when it is performing LSO since hardware
1346 		 * generally asks for it to be removed (as it'll change).
1347 		 * Unfortunately, for virtio, we actually need it. This means we
1348 		 * need to go through and calculate the actual length and fix
1349 		 * things up. Because the virtio spec cares about the ECN flag
1350 		 * and indicating that, at least this means we'll have that
1351 		 * available as well.
1352 		 */
1353 		if (MBLKL(mp) < vnh->vnh_hdr_len) {
1354 			pullmp = msgpullup(mp, vnh->vnh_hdr_len);
1355 			if (pullmp == NULL)
1356 				goto fail;
1357 			tcpha = (tcpha_t *)(pullmp->b_rptr + meo.meoi_l2hlen +
1358 			    meo.meoi_l3hlen);
1359 		} else {
1360 			tcpha = (tcpha_t *)(mp->b_rptr + meo.meoi_l2hlen +
1361 			    meo.meoi_l3hlen);
1362 		}
1363 
1364 		len = meo.meoi_len - meo.meoi_l2hlen - meo.meoi_l3hlen;
1365 		cksum = ntohs(tcpha->tha_sum) + len;
1366 		cksum = (cksum >> 16) + (cksum & 0xffff);
1367 		cksum = (cksum >> 16) + (cksum & 0xffff);
1368 		tcpha->tha_sum = htons(cksum);
1369 
1370 		if (tcpha->tha_flags & TH_CWR) {
1371 			vnh->vnh_gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1372 		}
1373 		vnh->vnh_gso_size = (uint16_t)lso_mss;
1374 		vnh->vnh_hdr_len = meo.meoi_l2hlen + meo.meoi_l3hlen +
1375 		    meo.meoi_l4hlen;
1376 
1377 		freemsg(pullmp);
1378 	}
1379 
1380 	/*
1381 	 * The device does not maintain its own statistics about broadcast or
1382 	 * multicast packets, so we have to check the destination address
1383 	 * ourselves.
1384 	 */
1385 	if ((ether->ether_dhost.ether_addr_octet[0] & 0x01) != 0) {
1386 		mutex_enter(&vif->vif_mutex);
1387 		if (ether_cmp(&ether->ether_dhost, vioif_broadcast) == 0) {
1388 			vif->vif_brdcstxmt++;
1389 		} else {
1390 			vif->vif_multixmt++;
1391 		}
1392 		mutex_exit(&vif->vif_mutex);
1393 	}
1394 
1395 	/*
1396 	 * For small packets, copy into the preallocated inline buffer rather
1397 	 * than incur the overhead of mapping.  Note that both of these
1398 	 * functions ensure that "mp" is freed before returning.
1399 	 */
1400 	if (msg_size < vif->vif_txcopy_thresh) {
1401 		ret = vioif_tx_inline(vif, tb, mp, msg_size);
1402 	} else {
1403 		ret = vioif_tx_external(vif, tb, mp, msg_size);
1404 	}
1405 	mp = NULL;
1406 
1407 	mutex_enter(&vif->vif_mutex);
1408 
1409 	if (ret != DDI_SUCCESS) {
1410 		goto fail;
1411 	}
1412 
1413 	vif->vif_opackets++;
1414 	vif->vif_obytes += msg_size;
1415 	mutex_exit(&vif->vif_mutex);
1416 
1417 	virtio_dma_sync(tb->tb_dma, DDI_DMA_SYNC_FORDEV);
1418 	virtio_chain_submit(tb->tb_chain, B_TRUE);
1419 
1420 	return (B_TRUE);
1421 
1422 fail:
1423 	vif->vif_oerrors++;
1424 	if (tb != NULL) {
1425 		vioif_txbuf_free(vif, tb);
1426 	}
1427 	mutex_exit(&vif->vif_mutex);
1428 
1429 	return (mp == NULL);
1430 }
1431 
/*
 * Transmit a chain of frames linked through b_next.
 *
 * Returns NULL when every frame was consumed.  If a frame could not be sent
 * and nothing could be reclaimed to make room, transmission is marked as
 * corked, the TX interrupt is enabled, and the unsent remainder of the chain
 * (starting with the frame that failed) is returned to the caller.
 */
static mblk_t *
vioif_m_tx(void *arg, mblk_t *mp)
{
	vioif_t *vif = arg;
	mblk_t *nmp;

	/*
	 * Prior to attempting to send any more frames, do a reclaim to pick up
	 * any descriptors which have been processed by the host.
	 */
	if (virtio_queue_nactive(vif->vif_tx_vq) != 0) {
		(void) vioif_reclaim_used_tx(vif);
	}

	while (mp != NULL) {
		/* Detach the frame from the chain while it is being sent. */
		nmp = mp->b_next;
		mp->b_next = NULL;

		if (!vioif_send(vif, mp)) {
			/*
			 * If there are no descriptors available, try to
			 * reclaim some, allowing a retry of the send if some
			 * are found.
			 */
			/* Relink first so the chain is whole either way. */
			mp->b_next = nmp;
			if (vioif_reclaim_used_tx(vif) != 0) {
				continue;
			}

			/*
			 * Otherwise, enable the TX ring interrupt so that as
			 * soon as a descriptor becomes available, transmission
			 * can begin again.  For safety, make sure the periodic
			 * reclaim is running as well.
			 */
			mutex_enter(&vif->vif_mutex);
			vif->vif_tx_corked = B_TRUE;
			virtio_queue_no_interrupt(vif->vif_tx_vq, B_FALSE);
			vioif_reclaim_restart(vif);
			mutex_exit(&vif->vif_mutex);
			return (mp);
		}
		mp = nmp;
	}

	/* Ensure the periodic reclaim has been started. */
	mutex_enter(&vif->vif_mutex);
	vioif_reclaim_restart(vif);
	mutex_exit(&vif->vif_mutex);

	return (NULL);
}
1484 
/*
 * Bring the interface up (presumably registered as the MAC start callback;
 * the callback table is not visible here).  Moves the run state from
 * STOPPED to RUNNING, reports the link as up, enables receive interrupts
 * and posts receive buffers.  Always returns DDI_SUCCESS.
 */
static int
vioif_m_start(void *arg)
{
	vioif_t *vif = arg;

	mutex_enter(&vif->vif_mutex);

	VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPED);
	vif->vif_runstate = VIOIF_RUNSTATE_RUNNING;

	mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);

	virtio_queue_no_interrupt(vif->vif_rx_vq, B_FALSE);

	/*
	 * Starting interrupts on the TX virtqueue is unnecessary at this time.
	 * Descriptor reclamation is handled during transmit, via a periodic
	 * timer, and when resources are tight, via the then-enabled interrupt.
	 */
	/* Clear the drain flag so the periodic reclaim may be armed again. */
	vif->vif_tx_drain = B_FALSE;

	/*
	 * Add as many receive buffers as we can to the receive queue.  If we
	 * cannot add any, it may be because we have stopped and started again
	 * and the descriptors are all in the queue already.
	 */
	(void) vioif_add_rx(vif);

	mutex_exit(&vif->vif_mutex);
	return (DDI_SUCCESS);
}
1516 
/*
 * Take the interface down.  Moves the run state from RUNNING through
 * STOPPING to STOPPED: all outstanding transmit buffers are drained first,
 * then receive interrupts are disabled.
 */
static void
vioif_m_stop(void *arg)
{
	vioif_t *vif = arg;

	mutex_enter(&vif->vif_mutex);

	VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_RUNNING);
	vif->vif_runstate = VIOIF_RUNSTATE_STOPPING;

	/* Ensure all TX descriptors have been processed and reclaimed */
	/* (vioif_tx_drain() drops and reacquires vif_mutex while waiting.) */
	vioif_tx_drain(vif);

	virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);

	vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
	mutex_exit(&vif->vif_mutex);
}
1535 
1536 static int
1537 vioif_m_stat(void *arg, uint_t stat, uint64_t *val)
1538 {
1539 	vioif_t *vif = arg;
1540 
1541 	switch (stat) {
1542 	case MAC_STAT_IERRORS:
1543 		*val = vif->vif_ierrors;
1544 		break;
1545 	case MAC_STAT_OERRORS:
1546 		*val = vif->vif_oerrors;
1547 		break;
1548 	case MAC_STAT_MULTIRCV:
1549 		*val = vif->vif_multircv;
1550 		break;
1551 	case MAC_STAT_BRDCSTRCV:
1552 		*val = vif->vif_brdcstrcv;
1553 		break;
1554 	case MAC_STAT_MULTIXMT:
1555 		*val = vif->vif_multixmt;
1556 		break;
1557 	case MAC_STAT_BRDCSTXMT:
1558 		*val = vif->vif_brdcstxmt;
1559 		break;
1560 	case MAC_STAT_IPACKETS:
1561 		*val = vif->vif_ipackets;
1562 		break;
1563 	case MAC_STAT_RBYTES:
1564 		*val = vif->vif_rbytes;
1565 		break;
1566 	case MAC_STAT_OPACKETS:
1567 		*val = vif->vif_opackets;
1568 		break;
1569 	case MAC_STAT_OBYTES:
1570 		*val = vif->vif_obytes;
1571 		break;
1572 	case MAC_STAT_NORCVBUF:
1573 		*val = vif->vif_norecvbuf;
1574 		break;
1575 	case MAC_STAT_NOXMTBUF:
1576 		*val = vif->vif_notxbuf;
1577 		break;
1578 	case MAC_STAT_IFSPEED:
1579 		/* always 1 Gbit */
1580 		*val = 1000000000ULL;
1581 		break;
1582 	case ETHER_STAT_LINK_DUPLEX:
1583 		/* virtual device, always full-duplex */
1584 		*val = LINK_DUPLEX_FULL;
1585 		break;
1586 
1587 	default:
1588 		return (ENOTSUP);
1589 	}
1590 
1591 	return (DDI_SUCCESS);
1592 }
1593 
1594 static int
1595 vioif_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1596     uint_t pr_valsize, const void *pr_val)
1597 {
1598 	vioif_t *vif = arg;
1599 
1600 	switch (pr_num) {
1601 	case MAC_PROP_MTU: {
1602 		int r;
1603 		uint32_t mtu;
1604 		if (pr_valsize < sizeof (mtu)) {
1605 			return (EOVERFLOW);
1606 		}
1607 		bcopy(pr_val, &mtu, sizeof (mtu));
1608 
1609 		if (mtu < ETHERMIN || mtu > vif->vif_mtu_max) {
1610 			return (EINVAL);
1611 		}
1612 
1613 		mutex_enter(&vif->vif_mutex);
1614 		if ((r = mac_maxsdu_update(vif->vif_mac_handle, mtu)) == 0) {
1615 			vif->vif_mtu = mtu;
1616 		}
1617 		mutex_exit(&vif->vif_mutex);
1618 
1619 		return (r);
1620 	}
1621 
1622 	case MAC_PROP_PRIVATE: {
1623 		long max, result;
1624 		uint_t *resp;
1625 		char *endptr;
1626 
1627 		if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1628 			max = VIOIF_MACPROP_TXCOPY_THRESH_MAX;
1629 			resp = &vif->vif_txcopy_thresh;
1630 		} else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1631 			max = VIOIF_MACPROP_RXCOPY_THRESH_MAX;
1632 			resp = &vif->vif_rxcopy_thresh;
1633 		} else {
1634 			return (ENOTSUP);
1635 		}
1636 
1637 		if (pr_val == NULL) {
1638 			return (EINVAL);
1639 		}
1640 
1641 		if (ddi_strtol(pr_val, &endptr, 10, &result) != 0 ||
1642 		    *endptr != '\0' || result < 0 || result > max) {
1643 			return (EINVAL);
1644 		}
1645 
1646 		mutex_enter(&vif->vif_mutex);
1647 		*resp = result;
1648 		mutex_exit(&vif->vif_mutex);
1649 
1650 		return (0);
1651 	}
1652 
1653 	default:
1654 		return (ENOTSUP);
1655 	}
1656 }
1657 
1658 static int
1659 vioif_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1660     uint_t pr_valsize, void *pr_val)
1661 {
1662 	vioif_t *vif = arg;
1663 
1664 	switch (pr_num) {
1665 	case MAC_PROP_PRIVATE: {
1666 		uint_t value;
1667 
1668 		if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1669 			value = vif->vif_txcopy_thresh;
1670 		} else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1671 			value = vif->vif_rxcopy_thresh;
1672 		} else {
1673 			return (ENOTSUP);
1674 		}
1675 
1676 		if (snprintf(pr_val, pr_valsize, "%u", value) >= pr_valsize) {
1677 			return (EOVERFLOW);
1678 		}
1679 
1680 		return (0);
1681 	}
1682 
1683 	default:
1684 		return (ENOTSUP);
1685 	}
1686 }
1687 
1688 static void
1689 vioif_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1690     mac_prop_info_handle_t prh)
1691 {
1692 	vioif_t *vif = arg;
1693 	char valstr[64];
1694 	int value;
1695 
1696 	switch (pr_num) {
1697 	case MAC_PROP_MTU:
1698 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1699 		mac_prop_info_set_range_uint32(prh, ETHERMIN, vif->vif_mtu_max);
1700 		return;
1701 
1702 	case MAC_PROP_PRIVATE:
1703 		if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1704 			value = VIOIF_MACPROP_TXCOPY_THRESH_DEF;
1705 		} else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1706 			value = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
1707 		} else {
1708 			/*
1709 			 * We do not recognise this private property name.
1710 			 */
1711 			return;
1712 		}
1713 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1714 		(void) snprintf(valstr, sizeof (valstr), "%d", value);
1715 		mac_prop_info_set_default_str(prh, valstr);
1716 		return;
1717 
1718 	default:
1719 		return;
1720 	}
1721 }
1722 
1723 static boolean_t
1724 vioif_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1725 {
1726 	vioif_t *vif = arg;
1727 
1728 	switch (cap) {
1729 	case MAC_CAPAB_HCKSUM: {
1730 		if (!vif->vif_tx_csum) {
1731 			return (B_FALSE);
1732 		}
1733 
1734 		*(uint32_t *)cap_data = HCKSUM_INET_PARTIAL;
1735 
1736 		return (B_TRUE);
1737 	}
1738 
1739 	case MAC_CAPAB_LSO: {
1740 		if (!vif->vif_tx_tso4) {
1741 			return (B_FALSE);
1742 		}
1743 
1744 		mac_capab_lso_t *lso = cap_data;
1745 		lso->lso_flags = LSO_TX_BASIC_TCP_IPV4 | LSO_TX_BASIC_TCP_IPV6;
1746 		lso->lso_basic_tcp_ipv4.lso_max = VIOIF_RX_DATA_SIZE;
1747 		lso->lso_basic_tcp_ipv6.lso_max = VIOIF_RX_DATA_SIZE;
1748 
1749 		return (B_TRUE);
1750 	}
1751 
1752 	default:
1753 		return (B_FALSE);
1754 	}
1755 }
1756 
1757 static boolean_t
1758 vioif_has_feature(vioif_t *vif, uint32_t feature)
1759 {
1760 	return (virtio_feature_present(vif->vif_virtio, feature));
1761 }
1762 
1763 /*
1764  * Read the primary MAC address from the device if one is provided.  If not,
1765  * generate a random locally administered MAC address and write it back to the
1766  * device.
1767  */
1768 static void
1769 vioif_get_mac(vioif_t *vif)
1770 {
1771 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
1772 
1773 	if (vioif_has_feature(vif, VIRTIO_NET_F_MAC)) {
1774 		for (uint_t i = 0; i < ETHERADDRL; i++) {
1775 			vif->vif_mac[i] = virtio_dev_get8(vif->vif_virtio,
1776 			    VIRTIO_NET_CONFIG_MAC + i);
1777 		}
1778 		vif->vif_mac_from_host = 1;
1779 
1780 		return;
1781 	}
1782 
1783 	/* Get a few random bytes */
1784 	(void) random_get_pseudo_bytes(vif->vif_mac, ETHERADDRL);
1785 	/* Make sure it's a unicast MAC */
1786 	vif->vif_mac[0] &= ~1;
1787 	/* Set the "locally administered" bit */
1788 	vif->vif_mac[1] |= 2;
1789 
1790 	/*
1791 	 * Write the random MAC address back to the device.
1792 	 */
1793 	for (uint_t i = 0; i < ETHERADDRL; i++) {
1794 		virtio_dev_put8(vif->vif_virtio, VIRTIO_NET_CONFIG_MAC + i,
1795 		    vif->vif_mac[i]);
1796 	}
1797 	vif->vif_mac_from_host = 0;
1798 
1799 	dev_err(vif->vif_dip, CE_NOTE, "!Generated a random MAC address: "
1800 	    "%02x:%02x:%02x:%02x:%02x:%02x",
1801 	    (uint_t)vif->vif_mac[0], (uint_t)vif->vif_mac[1],
1802 	    (uint_t)vif->vif_mac[2], (uint_t)vif->vif_mac[3],
1803 	    (uint_t)vif->vif_mac[4], (uint_t)vif->vif_mac[5]);
1804 }
1805 
1806 /*
1807  * Virtqueue interrupt handlers
1808  */
1809 static uint_t
1810 vioif_rx_handler(caddr_t arg0, caddr_t arg1)
1811 {
1812 	vioif_t *vif = (vioif_t *)arg0;
1813 
1814 	mutex_enter(&vif->vif_mutex);
1815 	(void) vioif_process_rx(vif);
1816 
1817 	/*
1818 	 * Attempt to replenish the receive queue.  If we cannot add any
1819 	 * descriptors here, it may be because all of the recently received
1820 	 * packets were loaned up to the networking stack.
1821 	 */
1822 	(void) vioif_add_rx(vif);
1823 	mutex_exit(&vif->vif_mutex);
1824 
1825 	return (DDI_INTR_CLAIMED);
1826 }
1827 
1828 static uint_t
1829 vioif_tx_handler(caddr_t arg0, caddr_t arg1)
1830 {
1831 	vioif_t *vif = (vioif_t *)arg0;
1832 
1833 	/*
1834 	 * The TX interrupt could race with other reclamation activity, so
1835 	 * interpreting the return value is unimportant.
1836 	 */
1837 	(void) vioif_reclaim_used_tx(vif);
1838 
1839 	return (DDI_INTR_CLAIMED);
1840 }
1841 
1842 static void
1843 vioif_check_features(vioif_t *vif)
1844 {
1845 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
1846 
1847 	vif->vif_tx_csum = 0;
1848 	vif->vif_tx_tso4 = 0;
1849 	vif->vif_tx_tso6 = 0;
1850 
1851 	if (vioif_has_feature(vif, VIRTIO_NET_F_CSUM)) {
1852 		/*
1853 		 * The host will accept packets with partial checksums from us.
1854 		 */
1855 		vif->vif_tx_csum = 1;
1856 
1857 		/*
1858 		 * The legacy GSO feature represents the combination of
1859 		 * HOST_TSO4, HOST_TSO6, and HOST_ECN.
1860 		 */
1861 		boolean_t gso = vioif_has_feature(vif, VIRTIO_NET_F_GSO);
1862 		boolean_t tso4 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO4);
1863 		boolean_t tso6 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO6);
1864 		boolean_t ecn = vioif_has_feature(vif, VIRTIO_NET_F_HOST_ECN);
1865 
1866 		/*
1867 		 * Explicit congestion notification (ECN) is configured
1868 		 * globally; see "tcp_ecn_permitted".  As we cannot currently
1869 		 * request that the stack disable ECN on a per interface basis,
1870 		 * we require the device to support the combination of
1871 		 * segmentation offload and ECN support.
1872 		 */
1873 		if (gso) {
1874 			vif->vif_tx_tso4 = 1;
1875 			vif->vif_tx_tso6 = 1;
1876 		}
1877 		if (tso4 && ecn) {
1878 			vif->vif_tx_tso4 = 1;
1879 		}
1880 		if (tso6 && ecn) {
1881 			vif->vif_tx_tso6 = 1;
1882 		}
1883 	}
1884 
1885 	if (vioif_has_feature(vif, VIRTIO_NET_F_CTRL_VQ)) {
1886 		vif->vif_has_ctrlq = 1;
1887 
1888 		/*
1889 		 * The VIRTIO_NET_F_CTRL_VQ feature must be enabled if there's
1890 		 * any chance of the VIRTIO_NET_F_CTRL_RX being enabled.
1891 		 */
1892 		if (vioif_has_feature(vif, VIRTIO_NET_F_CTRL_RX))
1893 			vif->vif_has_ctrlq_rx = 1;
1894 	}
1895 }
1896 
1897 static int
1898 vioif_select_interrupt_types(void)
1899 {
1900 	id_t id;
1901 	smbios_system_t sys;
1902 	smbios_info_t info;
1903 
1904 	if (vioif_allowed_int_types != -1) {
1905 		/*
1906 		 * If this value was tuned via /etc/system or the debugger,
1907 		 * use the provided value directly.
1908 		 */
1909 		return (vioif_allowed_int_types);
1910 	}
1911 
1912 	if ((id = smbios_info_system(ksmbios, &sys)) == SMB_ERR ||
1913 	    smbios_info_common(ksmbios, id, &info) == SMB_ERR) {
1914 		/*
1915 		 * The system may not have valid SMBIOS data, so ignore a
1916 		 * failure here.
1917 		 */
1918 		return (0);
1919 	}
1920 
1921 	if (strcmp(info.smbi_manufacturer, "Google") == 0 &&
1922 	    strcmp(info.smbi_product, "Google Compute Engine") == 0) {
1923 		/*
1924 		 * An undiagnosed issue with the Google Compute Engine (GCE)
1925 		 * hypervisor exists.  In this environment, no RX interrupts
1926 		 * are received if MSI-X handlers are installed.  This does not
1927 		 * appear to be true for the Virtio SCSI driver.  Fixed
1928 		 * interrupts do appear to work, so we fall back for now:
1929 		 */
1930 		return (DDI_INTR_TYPE_FIXED);
1931 	}
1932 
1933 	return (0);
1934 }
1935 
/*
 * attach(9E) entry point.  Only DDI_ATTACH is supported.
 *
 * Initialises the virtio framework for the device, allocates the RX, TX and
 * (if the feature is present) control virtqueues, reads or generates the MAC
 * address, sizes the MTU, allocates buffers, enables interrupts and finally
 * registers with MAC.  Returns DDI_SUCCESS, or DDI_FAILURE after releasing
 * what was set up.
 *
 * NOTE(review): the "fail" path does not call mutex_destroy() on vif_mutex
 * (which may or may not have been initialised by then) and leaves the driver
 * private pointer set to the freed soft state -- confirm whether either
 * needs handling.
 */
static int
vioif_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int ret;
	vioif_t *vif;
	virtio_t *vio;
	mac_register_t *macp = NULL;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	if ((vio = virtio_init(dip, VIRTIO_NET_WANTED_FEATURES, B_TRUE)) ==
	    NULL) {
		return (DDI_FAILURE);
	}

	vif = kmem_zalloc(sizeof (*vif), KM_SLEEP);
	vif->vif_dip = dip;
	vif->vif_virtio = vio;
	vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
	ddi_set_driver_private(dip, vif);

	if ((vif->vif_rx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_RX,
	    "rx", vioif_rx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL ||
	    (vif->vif_tx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_TX,
	    "tx", vioif_tx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL) {
		goto fail;
	}

	/* The control queue has no interrupt handler (NULL). */
	if (vioif_has_feature(vif, VIRTIO_NET_F_CTRL_VQ) &&
	    (vif->vif_ctrl_vq = virtio_queue_alloc(vio,
	    VIRTIO_NET_VIRTQ_CONTROL, "ctrlq", NULL, vif,
	    B_FALSE, VIOIF_MAX_SEGS)) == NULL) {
		goto fail;
	}

	if (virtio_init_complete(vio, vioif_select_interrupt_types()) !=
	    DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "failed to complete Virtio init");
		goto fail;
	}

	/*
	 * Start with interrupts disabled on every queue; vioif_m_start()
	 * enables the RX interrupt and vioif_m_tx() enables the TX interrupt
	 * only when transmission is corked.
	 */
	virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);
	virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
	if (vif->vif_ctrl_vq != NULL)
		virtio_queue_no_interrupt(vif->vif_ctrl_vq, B_TRUE);

	/* The mutex is also taken from the interrupt handlers. */
	mutex_init(&vif->vif_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
	mutex_enter(&vif->vif_mutex);

	vioif_get_mac(vif);

	vif->vif_rxcopy_thresh = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
	vif->vif_txcopy_thresh = VIOIF_MACPROP_TXCOPY_THRESH_DEF;

	/* Use the device's MTU limit if it has one, else standard Ethernet. */
	if (vioif_has_feature(vif, VIRTIO_NET_F_MTU)) {
		vif->vif_mtu_max = virtio_dev_get16(vio, VIRTIO_NET_CONFIG_MTU);
	} else {
		vif->vif_mtu_max = ETHERMTU;
	}

	/* The initial MTU is ETHERMTU, clamped to the device maximum. */
	vif->vif_mtu = ETHERMTU;
	if (vif->vif_mtu > vif->vif_mtu_max) {
		vif->vif_mtu = vif->vif_mtu_max;
	}

	vioif_check_features(vif);

	if (vioif_alloc_bufs(vif) != 0) {
		mutex_exit(&vif->vif_mutex);
		dev_err(dip, CE_WARN, "failed to allocate memory");
		goto fail;
	}

	mutex_exit(&vif->vif_mutex);

	if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "failed to enable interrupts");
		goto fail;
	}

	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
		dev_err(dip, CE_WARN, "failed to allocate a mac_register");
		goto fail;
	}

	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
	macp->m_driver = vif;
	macp->m_dip = dip;
	macp->m_src_addr = vif->vif_mac;
	macp->m_callbacks = &vioif_mac_callbacks;
	macp->m_min_sdu = 0;
	macp->m_max_sdu = vif->vif_mtu;
	macp->m_margin = VLAN_TAGSZ;
	macp->m_priv_props = vioif_priv_props;

	if ((ret = mac_register(macp, &vif->vif_mac_handle)) != 0) {
		dev_err(dip, CE_WARN, "mac_register() failed (%d)", ret);
		goto fail;
	}
	/* The registration template is not needed once registered. */
	mac_free(macp);

	mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);

	return (DDI_SUCCESS);

fail:
	/*
	 * NOTE(review): this is reached both before vioif_alloc_bufs() has
	 * run and without vif_mutex held, whereas vioif_detach() calls
	 * vioif_free_bufs() with the mutex held -- confirm that
	 * vioif_free_bufs() tolerates both situations.
	 */
	vioif_free_bufs(vif);
	if (macp != NULL) {
		mac_free(macp);
	}
	(void) virtio_fini(vio, B_TRUE);
	kmem_free(vif, sizeof (*vif));
	return (DDI_FAILURE);
}
2052 
2053 static int
2054 vioif_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2055 {
2056 	int r;
2057 	vioif_t *vif;
2058 
2059 	if (cmd != DDI_DETACH) {
2060 		return (DDI_FAILURE);
2061 	}
2062 
2063 	if ((vif = ddi_get_driver_private(dip)) == NULL) {
2064 		return (DDI_FAILURE);
2065 	}
2066 
2067 	mutex_enter(&vif->vif_mutex);
2068 	if (vif->vif_runstate != VIOIF_RUNSTATE_STOPPED) {
2069 		dev_err(dip, CE_WARN, "!NIC still running, cannot detach");
2070 		mutex_exit(&vif->vif_mutex);
2071 		return (DDI_FAILURE);
2072 	}
2073 
2074 	/*
2075 	 * There should be no outstanding transmit buffers once the NIC is
2076 	 * completely stopped.
2077 	 */
2078 	VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);
2079 
2080 	/*
2081 	 * Though we cannot claw back all of the receive buffers until we reset
2082 	 * the device, we must ensure all those loaned to MAC have been
2083 	 * returned before calling mac_unregister().
2084 	 */
2085 	if (vif->vif_nrxbufs_onloan > 0) {
2086 		dev_err(dip, CE_WARN, "!%u receive buffers still loaned, "
2087 		    "cannot detach", vif->vif_nrxbufs_onloan);
2088 		mutex_exit(&vif->vif_mutex);
2089 		return (DDI_FAILURE);
2090 	}
2091 
2092 	if ((r = mac_unregister(vif->vif_mac_handle)) != 0) {
2093 		dev_err(dip, CE_WARN, "!MAC unregister failed (%d)", r);
2094 		return (DDI_FAILURE);
2095 	}
2096 
2097 	/*
2098 	 * Shut down the device so that we can recover any previously
2099 	 * submitted receive buffers.
2100 	 */
2101 	virtio_shutdown(vif->vif_virtio);
2102 	for (;;) {
2103 		virtio_chain_t *vic;
2104 
2105 		if ((vic = virtio_queue_evacuate(vif->vif_rx_vq)) == NULL) {
2106 			break;
2107 		}
2108 
2109 		vioif_rxbuf_t *rb = virtio_chain_data(vic);
2110 		vioif_rxbuf_free(vif, rb);
2111 	}
2112 
2113 	/*
2114 	 * vioif_free_bufs() must be called before virtio_fini()
2115 	 * as it uses virtio_chain_free() which itself depends on some
2116 	 * virtio data structures still being around.
2117 	 */
2118 	vioif_free_bufs(vif);
2119 	(void) virtio_fini(vif->vif_virtio, B_FALSE);
2120 
2121 	mutex_exit(&vif->vif_mutex);
2122 	mutex_destroy(&vif->vif_mutex);
2123 
2124 	kmem_free(vif, sizeof (*vif));
2125 
2126 	return (DDI_SUCCESS);
2127 }
2128 
2129 static int
2130 vioif_quiesce(dev_info_t *dip)
2131 {
2132 	vioif_t *vif;
2133 
2134 	if ((vif = ddi_get_driver_private(dip)) == NULL)
2135 		return (DDI_FAILURE);
2136 
2137 	return (virtio_quiesce(vif->vif_virtio));
2138 }
2139 
2140 int
2141 _init(void)
2142 {
2143 	int ret;
2144 
2145 	mac_init_ops(&vioif_dev_ops, "vioif");
2146 
2147 	if ((ret = mod_install(&vioif_modlinkage)) != DDI_SUCCESS) {
2148 		mac_fini_ops(&vioif_dev_ops);
2149 	}
2150 
2151 	return (ret);
2152 }
2153 
2154 int
2155 _fini(void)
2156 {
2157 	int ret;
2158 
2159 	if ((ret = mod_remove(&vioif_modlinkage)) == DDI_SUCCESS) {
2160 		mac_fini_ops(&vioif_dev_ops);
2161 	}
2162 
2163 	return (ret);
2164 }
2165 
2166 int
2167 _info(struct modinfo *modinfop)
2168 {
2169 	return (mod_info(&vioif_modlinkage, modinfop));
2170 }
2171