xref: /illumos-gate/usr/src/uts/common/io/vioif/vioif.c (revision b77a2dc4455ca028e52fdf96385a530a2d168316)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2013 Nexenta Inc.  All rights reserved.
14  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
15  * Copyright 2019 Joyent, Inc.
16  */
17 
18 /* Based on the NetBSD virtio driver by Minoura Makoto. */
19 /*
20  * Copyright (c) 2010 Minoura Makoto.
21  * All rights reserved.
22  *
23  * Redistribution and use in source and binary forms, with or without
24  * modification, are permitted provided that the following conditions
25  * are met:
26  * 1. Redistributions of source code must retain the above copyright
27  *    notice, this list of conditions and the following disclaimer.
28  * 2. Redistributions in binary form must reproduce the above copyright
29  *    notice, this list of conditions and the following disclaimer in the
30  *    documentation and/or other materials provided with the distribution.
31  *
32  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
33  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
34  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
35  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
36  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
38  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
39  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
40  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
41  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42  */
43 
44 /*
45  * VIRTIO NETWORK DRIVER
46  */
47 
48 #include <sys/types.h>
49 #include <sys/errno.h>
50 #include <sys/param.h>
51 #include <sys/stropts.h>
52 #include <sys/stream.h>
53 #include <sys/strsubr.h>
54 #include <sys/kmem.h>
55 #include <sys/conf.h>
56 #include <sys/devops.h>
57 #include <sys/ksynch.h>
58 #include <sys/stat.h>
59 #include <sys/modctl.h>
60 #include <sys/debug.h>
61 #include <sys/pci.h>
62 #include <sys/ethernet.h>
63 #include <sys/vlan.h>
64 #include <sys/sysmacros.h>
65 
66 #include <sys/dlpi.h>
67 #include <sys/taskq.h>
68 
69 #include <sys/pattr.h>
70 #include <sys/strsun.h>
71 
72 #include <sys/random.h>
73 #include <sys/containerof.h>
74 #include <sys/stream.h>
75 
76 #include <sys/mac.h>
77 #include <sys/mac_provider.h>
78 #include <sys/mac_ether.h>
79 
80 #include "virtio.h"
81 #include "vioif.h"
82 
83 
84 static int vioif_quiesce(dev_info_t *);
85 static int vioif_attach(dev_info_t *, ddi_attach_cmd_t);
86 static int vioif_detach(dev_info_t *, ddi_detach_cmd_t);
87 static boolean_t vioif_has_feature(vioif_t *, uint32_t);
88 static void vioif_reclaim_restart(vioif_t *);
89 static int vioif_m_stat(void *, uint_t, uint64_t *);
90 static void vioif_m_stop(void *);
91 static int vioif_m_start(void *);
92 static int vioif_m_multicst(void *, boolean_t, const uint8_t *);
93 static int vioif_m_setpromisc(void *, boolean_t);
94 static int vioif_m_unicst(void *, const uint8_t *);
95 static mblk_t *vioif_m_tx(void *, mblk_t *);
96 static int vioif_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
97     const void *);
98 static int vioif_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
99 static void vioif_m_propinfo(void *, const char *, mac_prop_id_t,
100     mac_prop_info_handle_t);
101 static boolean_t vioif_m_getcapab(void *, mac_capab_t, void *);
102 static uint_t vioif_add_rx(vioif_t *);
103 
104 
105 static struct cb_ops vioif_cb_ops = {
106 	.cb_rev =			CB_REV,
107 	.cb_flag =			D_MP | D_NEW,
108 
109 	.cb_open =			nulldev,
110 	.cb_close =			nulldev,
111 	.cb_strategy =			nodev,
112 	.cb_print =			nodev,
113 	.cb_dump =			nodev,
114 	.cb_read =			nodev,
115 	.cb_write =			nodev,
116 	.cb_ioctl =			nodev,
117 	.cb_devmap =			nodev,
118 	.cb_mmap =			nodev,
119 	.cb_segmap =			nodev,
120 	.cb_chpoll =			nochpoll,
121 	.cb_prop_op =			ddi_prop_op,
122 	.cb_str =			NULL,
123 	.cb_aread =			nodev,
124 	.cb_awrite =			nodev,
125 };
126 
127 static struct dev_ops vioif_dev_ops = {
128 	.devo_rev =			DEVO_REV,
129 	.devo_refcnt =			0,
130 
131 	.devo_attach =			vioif_attach,
132 	.devo_detach =			vioif_detach,
133 	.devo_quiesce =			vioif_quiesce,
134 
135 	.devo_cb_ops =			&vioif_cb_ops,
136 
137 	.devo_getinfo =			NULL,
138 	.devo_identify =		nulldev,
139 	.devo_probe =			nulldev,
140 	.devo_reset =			nodev,
141 	.devo_bus_ops =			NULL,
142 	.devo_power =			NULL,
143 };
144 
145 static struct modldrv vioif_modldrv = {
146 	.drv_modops =			&mod_driverops,
147 	.drv_linkinfo =			"VIRTIO network driver",
148 	.drv_dev_ops =			&vioif_dev_ops
149 };
150 
151 static struct modlinkage vioif_modlinkage = {
152 	.ml_rev =			MODREV_1,
153 	.ml_linkage =			{ &vioif_modldrv, NULL }
154 };
155 
156 static mac_callbacks_t vioif_mac_callbacks = {
157 	.mc_getstat =			vioif_m_stat,
158 	.mc_start =			vioif_m_start,
159 	.mc_stop =			vioif_m_stop,
160 	.mc_setpromisc =		vioif_m_setpromisc,
161 	.mc_multicst =			vioif_m_multicst,
162 	.mc_unicst =			vioif_m_unicst,
163 	.mc_tx =			vioif_m_tx,
164 
165 	.mc_callbacks =			(MC_GETCAPAB | MC_SETPROP |
166 					    MC_GETPROP | MC_PROPINFO),
167 	.mc_getcapab =			vioif_m_getcapab,
168 	.mc_setprop =			vioif_m_setprop,
169 	.mc_getprop =			vioif_m_getprop,
170 	.mc_propinfo =			vioif_m_propinfo,
171 };
172 
173 static const uchar_t vioif_broadcast[ETHERADDRL] = {
174 	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
175 };
176 
177 /*
178  * Interval for the periodic TX reclaim.
179  */
180 uint_t vioif_reclaim_ms = 200;
181 
182 /*
183  * DMA attribute template for transmit and receive buffers.  The SGL entry
184  * count will be modified before using the template.  Note that these
185  * allocations are aligned so that VIOIF_HEADER_SKIP places the IP header in
186  * received frames at the correct offset for the networking stack.
187  */
188 ddi_dma_attr_t vioif_dma_attr_bufs = {
189 	.dma_attr_version =		DMA_ATTR_V0,
190 	.dma_attr_addr_lo =		0x0000000000000000,
191 	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
192 	.dma_attr_count_max =		0x00000000FFFFFFFF,
193 	.dma_attr_align =		VIOIF_HEADER_ALIGN,
194 	.dma_attr_burstsizes =		1,
195 	.dma_attr_minxfer =		1,
196 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
197 	.dma_attr_seg =			0x00000000FFFFFFFF,
198 	.dma_attr_sgllen =		0,
199 	.dma_attr_granular =		1,
200 	.dma_attr_flags =		0
201 };
202 
203 /*
204  * DMA attributes for mapping larger transmit buffers from the networking
205  * stack.  The requirements are quite loose, but note that the SGL entry length
206  * field is 32-bit.
207  */
208 ddi_dma_attr_t vioif_dma_attr_external = {
209 	.dma_attr_version =		DMA_ATTR_V0,
210 	.dma_attr_addr_lo =		0x0000000000000000,
211 	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
212 	.dma_attr_count_max =		0x00000000FFFFFFFF,
213 	.dma_attr_align =		1,
214 	.dma_attr_burstsizes =		1,
215 	.dma_attr_minxfer =		1,
216 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
217 	.dma_attr_seg =			0x00000000FFFFFFFF,
218 	.dma_attr_sgllen =		VIOIF_MAX_SEGS - 1,
219 	.dma_attr_granular =		1,
220 	.dma_attr_flags =		0
221 };
222 
223 
224 /*
225  * VIRTIO NET MAC PROPERTIES
226  */
227 #define	VIOIF_MACPROP_TXCOPY_THRESH	"_txcopy_thresh"
228 #define	VIOIF_MACPROP_TXCOPY_THRESH_DEF	300
229 #define	VIOIF_MACPROP_TXCOPY_THRESH_MAX	640
230 
231 #define	VIOIF_MACPROP_RXCOPY_THRESH	"_rxcopy_thresh"
232 #define	VIOIF_MACPROP_RXCOPY_THRESH_DEF	300
233 #define	VIOIF_MACPROP_RXCOPY_THRESH_MAX	640
234 
235 static char *vioif_priv_props[] = {
236 	VIOIF_MACPROP_TXCOPY_THRESH,
237 	VIOIF_MACPROP_RXCOPY_THRESH,
238 	NULL
239 };
240 
241 
242 static vioif_txbuf_t *
243 vioif_txbuf_alloc(vioif_t *vif)
244 {
245 	vioif_txbuf_t *tb;
246 
247 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
248 
249 	if ((tb = list_remove_head(&vif->vif_txbufs)) != NULL) {
250 		vif->vif_ntxbufs_alloc++;
251 	}
252 
253 	return (tb);
254 }
255 
256 static void
257 vioif_txbuf_free(vioif_t *vif, vioif_txbuf_t *tb)
258 {
259 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
260 
261 	VERIFY3U(vif->vif_ntxbufs_alloc, >, 0);
262 	vif->vif_ntxbufs_alloc--;
263 
264 	virtio_chain_clear(tb->tb_chain);
265 	list_insert_head(&vif->vif_txbufs, tb);
266 }
267 
268 static vioif_rxbuf_t *
269 vioif_rxbuf_alloc(vioif_t *vif)
270 {
271 	vioif_rxbuf_t *rb;
272 
273 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
274 
275 	if ((rb = list_remove_head(&vif->vif_rxbufs)) != NULL) {
276 		vif->vif_nrxbufs_alloc++;
277 	}
278 
279 	return (rb);
280 }
281 
282 static void
283 vioif_rxbuf_free(vioif_t *vif, vioif_rxbuf_t *rb)
284 {
285 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
286 
287 	VERIFY3U(vif->vif_nrxbufs_alloc, >, 0);
288 	vif->vif_nrxbufs_alloc--;
289 
290 	virtio_chain_clear(rb->rb_chain);
291 	list_insert_head(&vif->vif_rxbufs, rb);
292 }
293 
294 static void
295 vioif_rx_free_callback(caddr_t free_arg)
296 {
297 	vioif_rxbuf_t *rb = (vioif_rxbuf_t *)free_arg;
298 	vioif_t *vif = rb->rb_vioif;
299 
300 	mutex_enter(&vif->vif_mutex);
301 
302 	/*
303 	 * Return this receive buffer to the free list.
304 	 */
305 	vioif_rxbuf_free(vif, rb);
306 
307 	VERIFY3U(vif->vif_nrxbufs_onloan, >, 0);
308 	vif->vif_nrxbufs_onloan--;
309 
310 	/*
311 	 * Attempt to replenish the receive queue with at least the buffer we
312 	 * just freed.  There isn't a great way to deal with failure here,
313 	 * though because we'll only loan at most half of the buffers there
314 	 * should always be at least some available even if this fails.
315 	 */
316 	(void) vioif_add_rx(vif);
317 
318 	mutex_exit(&vif->vif_mutex);
319 }
320 
321 static void
322 vioif_free_bufs(vioif_t *vif)
323 {
324 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
325 
326 	VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);
327 	for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
328 		vioif_txbuf_t *tb = &vif->vif_txbufs_mem[i];
329 
330 		/*
331 		 * Ensure that this txbuf is now in the free list:
332 		 */
333 		VERIFY(list_link_active(&tb->tb_link));
334 		list_remove(&vif->vif_txbufs, tb);
335 
336 		/*
337 		 * We should not have an mblk chain at this point.
338 		 */
339 		VERIFY3P(tb->tb_mp, ==, NULL);
340 
341 		if (tb->tb_dma != NULL) {
342 			virtio_dma_free(tb->tb_dma);
343 			tb->tb_dma = NULL;
344 		}
345 
346 		if (tb->tb_chain != NULL) {
347 			virtio_chain_free(tb->tb_chain);
348 			tb->tb_chain = NULL;
349 		}
350 
351 		if (tb->tb_dmaext != NULL) {
352 			for (uint_t j = 0; j < tb->tb_dmaext_capacity; j++) {
353 				if (tb->tb_dmaext[j] != NULL) {
354 					virtio_dma_free(
355 					    tb->tb_dmaext[j]);
356 					tb->tb_dmaext[j] = NULL;
357 				}
358 			}
359 
360 			kmem_free(tb->tb_dmaext,
361 			    sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity);
362 			tb->tb_dmaext = NULL;
363 			tb->tb_dmaext_capacity = 0;
364 		}
365 	}
366 	VERIFY(list_is_empty(&vif->vif_txbufs));
367 	if (vif->vif_txbufs_mem != NULL) {
368 		kmem_free(vif->vif_txbufs_mem,
369 		    sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity);
370 		vif->vif_txbufs_mem = NULL;
371 		vif->vif_txbufs_capacity = 0;
372 	}
373 
374 	VERIFY3U(vif->vif_nrxbufs_alloc, ==, 0);
375 	for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
376 		vioif_rxbuf_t *rb = &vif->vif_rxbufs_mem[i];
377 
378 		/*
379 		 * Ensure that this rxbuf is now in the free list:
380 		 */
381 		VERIFY(list_link_active(&rb->rb_link));
382 		list_remove(&vif->vif_rxbufs, rb);
383 
384 		if (rb->rb_dma != NULL) {
385 			virtio_dma_free(rb->rb_dma);
386 			rb->rb_dma = NULL;
387 		}
388 
389 		if (rb->rb_chain != NULL) {
390 			virtio_chain_free(rb->rb_chain);
391 			rb->rb_chain = NULL;
392 		}
393 	}
394 	VERIFY(list_is_empty(&vif->vif_rxbufs));
395 	if (vif->vif_rxbufs_mem != NULL) {
396 		kmem_free(vif->vif_rxbufs_mem,
397 		    sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity);
398 		vif->vif_rxbufs_mem = NULL;
399 		vif->vif_rxbufs_capacity = 0;
400 	}
401 }
402 
403 static int
404 vioif_alloc_bufs(vioif_t *vif)
405 {
406 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
407 
408 	/*
409 	 * Allocate one contiguous chunk of memory for the transmit and receive
410 	 * buffer tracking objects.  If the ring is unusually small, we'll
411 	 * reduce our target buffer count accordingly.
412 	 */
413 	vif->vif_txbufs_capacity = MIN(VIRTIO_NET_TX_BUFS,
414 	    virtio_queue_size(vif->vif_tx_vq));
415 	vif->vif_txbufs_mem = kmem_zalloc(
416 	    sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity, KM_SLEEP);
417 	list_create(&vif->vif_txbufs, sizeof (vioif_txbuf_t),
418 	    offsetof(vioif_txbuf_t, tb_link));
419 
420 	vif->vif_rxbufs_capacity = MIN(VIRTIO_NET_RX_BUFS,
421 	    virtio_queue_size(vif->vif_rx_vq));
422 	vif->vif_rxbufs_mem = kmem_zalloc(
423 	    sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity, KM_SLEEP);
424 	list_create(&vif->vif_rxbufs, sizeof (vioif_rxbuf_t),
425 	    offsetof(vioif_rxbuf_t, rb_link));
426 
427 	/*
428 	 * Do not loan more than half of our allocated receive buffers into
429 	 * the networking stack.
430 	 */
431 	vif->vif_nrxbufs_onloan_max = vif->vif_rxbufs_capacity / 2;
432 
433 	/*
434 	 * Put everything in the free list straight away in order to simplify
435 	 * the use of vioif_free_bufs() for cleanup on allocation failure.
436 	 */
437 	for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
438 		list_insert_tail(&vif->vif_txbufs, &vif->vif_txbufs_mem[i]);
439 	}
440 	for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
441 		list_insert_tail(&vif->vif_rxbufs, &vif->vif_rxbufs_mem[i]);
442 	}
443 
444 	/*
445 	 * Start from the DMA attribute template common to both transmit and
446 	 * receive buffers.  The SGL entry count will be modified for each
447 	 * buffer type.
448 	 */
449 	ddi_dma_attr_t attr = vioif_dma_attr_bufs;
450 
451 	/*
452 	 * The transmit inline buffer is small (less than a page), so it's
453 	 * reasonable to request a single cookie.
454 	 */
455 	attr.dma_attr_sgllen = 1;
456 
457 	for (vioif_txbuf_t *tb = list_head(&vif->vif_txbufs); tb != NULL;
458 	    tb = list_next(&vif->vif_txbufs, tb)) {
459 		if ((tb->tb_dma = virtio_dma_alloc(vif->vif_virtio,
460 		    VIOIF_TX_INLINE_SIZE, &attr,
461 		    DDI_DMA_STREAMING | DDI_DMA_WRITE, KM_SLEEP)) == NULL) {
462 			goto fail;
463 		}
464 		VERIFY3U(virtio_dma_ncookies(tb->tb_dma), ==, 1);
465 
466 		if ((tb->tb_chain = virtio_chain_alloc(vif->vif_tx_vq,
467 		    KM_SLEEP)) == NULL) {
468 			goto fail;
469 		}
470 		virtio_chain_data_set(tb->tb_chain, tb);
471 
472 		tb->tb_dmaext_capacity = VIOIF_MAX_SEGS - 1;
473 		tb->tb_dmaext = kmem_zalloc(
474 		    sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity,
475 		    KM_SLEEP);
476 	}
477 
478 	/*
479 	 * The receive buffers are larger, and we can tolerate a large number
480 	 * of segments.  Adjust the SGL entry count, setting aside one segment
481 	 * for the virtio net header.
482 	 */
483 	attr.dma_attr_sgllen = VIOIF_MAX_SEGS - 1;
484 
485 	for (vioif_rxbuf_t *rb = list_head(&vif->vif_rxbufs); rb != NULL;
486 	    rb = list_next(&vif->vif_rxbufs, rb)) {
487 		if ((rb->rb_dma = virtio_dma_alloc(vif->vif_virtio,
488 		    VIOIF_RX_BUF_SIZE, &attr, DDI_DMA_STREAMING | DDI_DMA_READ,
489 		    KM_SLEEP)) == NULL) {
490 			goto fail;
491 		}
492 
493 		if ((rb->rb_chain = virtio_chain_alloc(vif->vif_rx_vq,
494 		    KM_SLEEP)) == NULL) {
495 			goto fail;
496 		}
497 		virtio_chain_data_set(rb->rb_chain, rb);
498 
499 		/*
500 		 * Ensure that the first cookie is sufficient to cover the
501 		 * header skip region plus one byte.
502 		 */
503 		VERIFY3U(virtio_dma_cookie_size(rb->rb_dma, 0), >=,
504 		    VIOIF_HEADER_SKIP + 1);
505 
506 		/*
507 		 * Ensure that the frame data begins at a location with a
508 		 * correctly aligned IP header.
509 		 */
510 		VERIFY3U((uintptr_t)virtio_dma_va(rb->rb_dma,
511 		    VIOIF_HEADER_SKIP) % 4, ==, 2);
512 
513 		rb->rb_vioif = vif;
514 		rb->rb_frtn.free_func = vioif_rx_free_callback;
515 		rb->rb_frtn.free_arg = (caddr_t)rb;
516 	}
517 
518 	return (0);
519 
520 fail:
521 	vioif_free_bufs(vif);
522 	return (ENOMEM);
523 }
524 
525 static int
526 vioif_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr)
527 {
528 	/*
529 	 * Even though we currently do not have support for programming
530 	 * multicast filters, or even enabling promiscuous mode, we return
531 	 * success here to avoid the networking stack falling back to link
532 	 * layer broadcast for multicast traffic.  Some hypervisors already
533 	 * pass received multicast frames onto the guest, so at least on those
534 	 * systems multicast will work as expected anyway.
535 	 */
536 	return (0);
537 }
538 
539 static int
540 vioif_m_setpromisc(void *arg, boolean_t on)
541 {
542 	/*
543 	 * Even though we cannot currently enable promiscuous mode, we return
544 	 * success here to allow tools like snoop(1M) to continue to function.
545 	 */
546 	return (0);
547 }
548 
549 static int
550 vioif_m_unicst(void *arg, const uint8_t *mac)
551 {
552 	return (ENOTSUP);
553 }
554 
555 static uint_t
556 vioif_add_rx(vioif_t *vif)
557 {
558 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
559 
560 	if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
561 		/*
562 		 * If the NIC is not running, do not give the device any
563 		 * receive buffers.
564 		 */
565 		return (0);
566 	}
567 
568 	uint_t num_added = 0;
569 
570 	vioif_rxbuf_t *rb;
571 	while ((rb = vioif_rxbuf_alloc(vif)) != NULL) {
572 		/*
573 		 * For legacy devices, and those that have not negotiated
574 		 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a
575 		 * separate descriptor entry to the rest of the buffer.
576 		 */
577 		if (virtio_chain_append(rb->rb_chain,
578 		    virtio_dma_cookie_pa(rb->rb_dma, 0),
579 		    sizeof (struct virtio_net_hdr),
580 		    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
581 			goto fail;
582 		}
583 
584 		for (uint_t n = 0; n < virtio_dma_ncookies(rb->rb_dma); n++) {
585 			uint64_t pa = virtio_dma_cookie_pa(rb->rb_dma, n);
586 			size_t sz = virtio_dma_cookie_size(rb->rb_dma, n);
587 
588 			if (n == 0) {
589 				pa += VIOIF_HEADER_SKIP;
590 				VERIFY3U(sz, >, VIOIF_HEADER_SKIP);
591 				sz -= VIOIF_HEADER_SKIP;
592 			}
593 
594 			if (virtio_chain_append(rb->rb_chain, pa, sz,
595 			    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
596 				goto fail;
597 			}
598 		}
599 
600 		virtio_chain_submit(rb->rb_chain, B_FALSE);
601 		num_added++;
602 		continue;
603 
604 fail:
605 		vioif_rxbuf_free(vif, rb);
606 		vif->vif_norecvbuf++;
607 		break;
608 	}
609 
610 	if (num_added > 0) {
611 		virtio_queue_flush(vif->vif_rx_vq);
612 	}
613 
614 	return (num_added);
615 }
616 
617 static uint_t
618 vioif_process_rx(vioif_t *vif)
619 {
620 	virtio_chain_t *vic;
621 	mblk_t *mphead = NULL, *lastmp = NULL, *mp;
622 	uint_t num_processed = 0;
623 
624 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
625 
626 	while ((vic = virtio_queue_poll(vif->vif_rx_vq)) != NULL) {
627 		/*
628 		 * We have to use the chain received length here, as the device
629 		 * does not tell us the received frame length any other way.
630 		 * In a limited survey of hypervisors, virtio network devices
631 		 * appear to provide the right value here.
632 		 */
633 		size_t len = virtio_chain_received_length(vic);
634 		vioif_rxbuf_t *rb = virtio_chain_data(vic);
635 
636 		virtio_dma_sync(rb->rb_dma, DDI_DMA_SYNC_FORCPU);
637 
638 		/*
639 		 * If the NIC is not running, discard any received frames.
640 		 */
641 		if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
642 			vioif_rxbuf_free(vif, rb);
643 			continue;
644 		}
645 
646 		if (len < sizeof (struct virtio_net_hdr)) {
647 			vif->vif_rxfail_chain_undersize++;
648 			vif->vif_ierrors++;
649 			vioif_rxbuf_free(vif, rb);
650 			continue;
651 		}
652 		len -= sizeof (struct virtio_net_hdr);
653 
654 		/*
655 		 * We copy small packets that happen to fit into a single
656 		 * cookie and reuse the buffers. For bigger ones, we loan
657 		 * the buffers upstream.
658 		 */
659 		if (len < vif->vif_rxcopy_thresh ||
660 		    vif->vif_nrxbufs_onloan >= vif->vif_nrxbufs_onloan_max) {
661 			mutex_exit(&vif->vif_mutex);
662 			if ((mp = allocb(len, 0)) == NULL) {
663 				mutex_enter(&vif->vif_mutex);
664 				vif->vif_norecvbuf++;
665 				vif->vif_ierrors++;
666 
667 				vioif_rxbuf_free(vif, rb);
668 				continue;
669 			}
670 
671 			bcopy(virtio_dma_va(rb->rb_dma, VIOIF_HEADER_SKIP),
672 			    mp->b_rptr, len);
673 			mp->b_wptr = mp->b_rptr + len;
674 
675 			/*
676 			 * As the packet contents was copied rather than
677 			 * loaned, we can return the receive buffer resources
678 			 * to the free list.
679 			 */
680 			mutex_enter(&vif->vif_mutex);
681 			vioif_rxbuf_free(vif, rb);
682 
683 		} else {
684 			mutex_exit(&vif->vif_mutex);
685 			if ((mp = desballoc(virtio_dma_va(rb->rb_dma,
686 			    VIOIF_HEADER_SKIP), len, 0,
687 			    &rb->rb_frtn)) == NULL) {
688 				mutex_enter(&vif->vif_mutex);
689 				vif->vif_norecvbuf++;
690 				vif->vif_ierrors++;
691 
692 				vioif_rxbuf_free(vif, rb);
693 				continue;
694 			}
695 			mp->b_wptr = mp->b_rptr + len;
696 
697 			mutex_enter(&vif->vif_mutex);
698 			vif->vif_nrxbufs_onloan++;
699 		}
700 
701 		/*
702 		 * virtio-net does not tell us if this packet is multicast
703 		 * or broadcast, so we have to check it.
704 		 */
705 		if (mp->b_rptr[0] & 0x1) {
706 			if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0)
707 				vif->vif_multircv++;
708 			else
709 				vif->vif_brdcstrcv++;
710 		}
711 
712 		vif->vif_rbytes += len;
713 		vif->vif_ipackets++;
714 
715 		if (lastmp == NULL) {
716 			mphead = mp;
717 		} else {
718 			lastmp->b_next = mp;
719 		}
720 		lastmp = mp;
721 		num_processed++;
722 	}
723 
724 	if (mphead != NULL) {
725 		if (vif->vif_runstate == VIOIF_RUNSTATE_RUNNING) {
726 			mutex_exit(&vif->vif_mutex);
727 			mac_rx(vif->vif_mac_handle, NULL, mphead);
728 			mutex_enter(&vif->vif_mutex);
729 		} else {
730 			/*
731 			 * The NIC was disabled part way through our execution,
732 			 * so free the messages we allocated.
733 			 */
734 			freemsgchain(mphead);
735 		}
736 	}
737 
738 	return (num_processed);
739 }
740 
741 static uint_t
742 vioif_reclaim_used_tx(vioif_t *vif)
743 {
744 	virtio_chain_t *vic;
745 	uint_t num_reclaimed = 0;
746 
747 	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
748 
749 	while ((vic = virtio_queue_poll(vif->vif_tx_vq)) != NULL) {
750 		vioif_txbuf_t *tb = virtio_chain_data(vic);
751 
752 		if (tb->tb_mp != NULL) {
753 			/*
754 			 * Unbind the external mapping.
755 			 */
756 			for (uint_t i = 0; i < tb->tb_dmaext_capacity; i++) {
757 				if (tb->tb_dmaext[i] == NULL) {
758 					continue;
759 				}
760 
761 				virtio_dma_unbind(tb->tb_dmaext[i]);
762 			}
763 
764 			freemsg(tb->tb_mp);
765 			tb->tb_mp = NULL;
766 		}
767 
768 		/*
769 		 * Return this transmit buffer to the free list for reuse.
770 		 */
771 		mutex_enter(&vif->vif_mutex);
772 		vioif_txbuf_free(vif, tb);
773 		mutex_exit(&vif->vif_mutex);
774 
775 		num_reclaimed++;
776 	}
777 
778 	/* Return ring to transmitting state if descriptors were reclaimed. */
779 	if (num_reclaimed > 0) {
780 		boolean_t do_update = B_FALSE;
781 
782 		mutex_enter(&vif->vif_mutex);
783 		vif->vif_stat_tx_reclaim += num_reclaimed;
784 		if (vif->vif_tx_corked) {
785 			/*
786 			 * TX was corked on a lack of available descriptors.
787 			 * That dire state has passed so the TX interrupt can
788 			 * be disabled and MAC can be notified that
789 			 * transmission is possible again.
790 			 */
791 			vif->vif_tx_corked = B_FALSE;
792 			virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
793 			do_update = B_TRUE;
794 		}
795 
796 		if (do_update) {
797 			mac_tx_update(vif->vif_mac_handle);
798 		}
799 		mutex_exit(&vif->vif_mutex);
800 	}
801 
802 	return (num_reclaimed);
803 }
804 
805 static void
806 vioif_reclaim_periodic(void *arg)
807 {
808 	vioif_t *vif = arg;
809 	uint_t num_reclaimed;
810 
811 	num_reclaimed = vioif_reclaim_used_tx(vif);
812 
813 	mutex_enter(&vif->vif_mutex);
814 	vif->vif_tx_reclaim_tid = 0;
815 	/*
816 	 * If used descriptors were reclaimed or TX descriptors appear to be
817 	 * outstanding, the ring is considered active and periodic reclamation
818 	 * is necessary for now.
819 	 */
820 	if (num_reclaimed != 0 || virtio_queue_nactive(vif->vif_tx_vq) != 0) {
821 		/* Do not reschedule if the ring is being drained. */
822 		if (!vif->vif_tx_drain) {
823 			vioif_reclaim_restart(vif);
824 		}
825 	}
826 	mutex_exit(&vif->vif_mutex);
827 }
828 
829 static void
830 vioif_reclaim_restart(vioif_t *vif)
831 {
832 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
833 	VERIFY(!vif->vif_tx_drain);
834 
835 	if (vif->vif_tx_reclaim_tid == 0) {
836 		vif->vif_tx_reclaim_tid = timeout(vioif_reclaim_periodic, vif,
837 		    MSEC_TO_TICK_ROUNDUP(vioif_reclaim_ms));
838 	}
839 }
840 
841 static void
842 vioif_tx_drain(vioif_t *vif)
843 {
844 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
845 	VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPING);
846 
847 	vif->vif_tx_drain = B_TRUE;
848 	/* Put a stop to the periodic reclaim if it is running */
849 	if (vif->vif_tx_reclaim_tid != 0) {
850 		timeout_id_t tid = vif->vif_tx_reclaim_tid;
851 
852 		/*
853 		 * With vif_tx_drain set, there is no risk that a racing
854 		 * vioif_reclaim_periodic() call will reschedule itself.
855 		 *
856 		 * Being part of the mc_stop hook also guarantees that
857 		 * vioif_m_tx() will not be called to restart it.
858 		 */
859 		vif->vif_tx_reclaim_tid = 0;
860 		mutex_exit(&vif->vif_mutex);
861 		(void) untimeout(tid);
862 		mutex_enter(&vif->vif_mutex);
863 	}
864 	virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
865 
866 	/*
867 	 * Wait for all of the TX descriptors to be processed by the host so
868 	 * they can be reclaimed.
869 	 */
870 	while (vif->vif_ntxbufs_alloc > 0) {
871 		mutex_exit(&vif->vif_mutex);
872 		(void) vioif_reclaim_used_tx(vif);
873 		delay(5);
874 		mutex_enter(&vif->vif_mutex);
875 	}
876 	VERIFY(!vif->vif_tx_corked);
877 	VERIFY3U(vif->vif_tx_reclaim_tid, ==, 0);
878 	VERIFY3U(virtio_queue_nactive(vif->vif_tx_vq), ==, 0);
879 }
880 
881 static int
882 vioif_tx_inline(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
883 {
884 	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
885 
886 	VERIFY3U(msg_size, <=, virtio_dma_size(tb->tb_dma) - VIOIF_HEADER_SKIP);
887 
888 	/*
889 	 * Copy the message into the inline buffer and then free the message.
890 	 */
891 	mcopymsg(mp, virtio_dma_va(tb->tb_dma, VIOIF_HEADER_SKIP));
892 
893 	if (virtio_chain_append(tb->tb_chain,
894 	    virtio_dma_cookie_pa(tb->tb_dma, 0) + VIOIF_HEADER_SKIP,
895 	    msg_size, VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
896 		return (DDI_FAILURE);
897 	}
898 
899 	return (DDI_SUCCESS);
900 }
901 
902 static int
903 vioif_tx_external(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
904 {
905 	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
906 
907 	mblk_t *nmp = mp;
908 	tb->tb_ndmaext = 0;
909 
910 	while (nmp != NULL) {
911 		size_t len;
912 
913 		if ((len = MBLKL(nmp)) == 0) {
914 			/*
915 			 * Skip any zero-length entries in the chain.
916 			 */
917 			nmp = nmp->b_cont;
918 			continue;
919 		}
920 
921 		if (tb->tb_ndmaext >= tb->tb_dmaext_capacity) {
922 			mutex_enter(&vif->vif_mutex);
923 			vif->vif_txfail_indirect_limit++;
924 			vif->vif_notxbuf++;
925 			mutex_exit(&vif->vif_mutex);
926 			goto fail;
927 		}
928 
929 		if (tb->tb_dmaext[tb->tb_ndmaext] == NULL) {
930 			/*
931 			 * Allocate a DMA handle for this slot.
932 			 */
933 			if ((tb->tb_dmaext[tb->tb_ndmaext] =
934 			    virtio_dma_alloc_nomem(vif->vif_virtio,
935 			    &vioif_dma_attr_external, KM_SLEEP)) == NULL) {
936 				mutex_enter(&vif->vif_mutex);
937 				vif->vif_notxbuf++;
938 				mutex_exit(&vif->vif_mutex);
939 				goto fail;
940 			}
941 		}
942 		virtio_dma_t *extdma = tb->tb_dmaext[tb->tb_ndmaext++];
943 
944 		if (virtio_dma_bind(extdma, nmp->b_rptr, len,
945 		    DDI_DMA_WRITE | DDI_DMA_STREAMING, KM_SLEEP) !=
946 		    DDI_SUCCESS) {
947 			mutex_enter(&vif->vif_mutex);
948 			vif->vif_txfail_dma_bind++;
949 			mutex_exit(&vif->vif_mutex);
950 			goto fail;
951 		}
952 
953 		for (uint_t n = 0; n < virtio_dma_ncookies(extdma); n++) {
954 			uint64_t pa = virtio_dma_cookie_pa(extdma, n);
955 			size_t sz = virtio_dma_cookie_size(extdma, n);
956 
957 			if (virtio_chain_append(tb->tb_chain, pa, sz,
958 			    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
959 				mutex_enter(&vif->vif_mutex);
960 				vif->vif_txfail_indirect_limit++;
961 				vif->vif_notxbuf++;
962 				mutex_exit(&vif->vif_mutex);
963 				goto fail;
964 			}
965 		}
966 
967 		nmp = nmp->b_cont;
968 	}
969 
970 	/*
971 	 * We need to keep the message around until we reclaim the buffer from
972 	 * the device before freeing it.
973 	 */
974 	tb->tb_mp = mp;
975 
976 	return (DDI_SUCCESS);
977 
978 fail:
979 	for (uint_t n = 0; n < tb->tb_ndmaext; n++) {
980 		if (tb->tb_dmaext[n] != NULL) {
981 			virtio_dma_unbind(tb->tb_dmaext[n]);
982 		}
983 	}
984 	tb->tb_ndmaext = 0;
985 
986 	freemsg(mp);
987 
988 	return (DDI_FAILURE);
989 }
990 
991 static boolean_t
992 vioif_send(vioif_t *vif, mblk_t *mp)
993 {
994 	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
995 
996 	vioif_txbuf_t *tb = NULL;
997 	struct virtio_net_hdr *vnh = NULL;
998 	size_t msg_size = 0;
999 	uint32_t csum_start;
1000 	uint32_t csum_stuff;
1001 	uint32_t csum_flags;
1002 	uint32_t lso_flags;
1003 	uint32_t lso_mss;
1004 	mblk_t *nmp;
1005 	int ret;
1006 	boolean_t lso_required = B_FALSE;
1007 	struct ether_header *ether = (void *)mp->b_rptr;
1008 
1009 	for (nmp = mp; nmp; nmp = nmp->b_cont)
1010 		msg_size += MBLKL(nmp);
1011 
1012 	if (vif->vif_tx_tso4) {
1013 		mac_lso_get(mp, &lso_mss, &lso_flags);
1014 		lso_required = (lso_flags & HW_LSO) != 0;
1015 	}
1016 
1017 	mutex_enter(&vif->vif_mutex);
1018 	if ((tb = vioif_txbuf_alloc(vif)) == NULL) {
1019 		vif->vif_notxbuf++;
1020 		goto fail;
1021 	}
1022 	mutex_exit(&vif->vif_mutex);
1023 
1024 	/*
1025 	 * Use the inline buffer for the virtio net header.  Zero the portion
1026 	 * of our DMA allocation prior to the packet data.
1027 	 */
1028 	vnh = virtio_dma_va(tb->tb_dma, 0);
1029 	bzero(vnh, VIOIF_HEADER_SKIP);
1030 
1031 	/*
1032 	 * For legacy devices, and those that have not negotiated
1033 	 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a separate
1034 	 * descriptor entry to the rest of the buffer.
1035 	 */
1036 	if (virtio_chain_append(tb->tb_chain,
1037 	    virtio_dma_cookie_pa(tb->tb_dma, 0), sizeof (struct virtio_net_hdr),
1038 	    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
1039 		mutex_enter(&vif->vif_mutex);
1040 		vif->vif_notxbuf++;
1041 		goto fail;
1042 	}
1043 
1044 	mac_hcksum_get(mp, &csum_start, &csum_stuff, NULL, NULL, &csum_flags);
1045 
1046 	/*
1047 	 * They want us to do the TCP/UDP csum calculation.
1048 	 */
1049 	if (csum_flags & HCK_PARTIALCKSUM) {
1050 		int eth_hsize;
1051 
1052 		/*
1053 		 * Did we ask for it?
1054 		 */
1055 		ASSERT(vif->vif_tx_csum);
1056 
1057 		/*
1058 		 * We only asked for partial csum packets.
1059 		 */
1060 		ASSERT(!(csum_flags & HCK_IPV4_HDRCKSUM));
1061 		ASSERT(!(csum_flags & HCK_FULLCKSUM));
1062 
1063 		if (ether->ether_type == htons(ETHERTYPE_VLAN)) {
1064 			eth_hsize = sizeof (struct ether_vlan_header);
1065 		} else {
1066 			eth_hsize = sizeof (struct ether_header);
1067 		}
1068 
1069 		vnh->vnh_flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1070 		vnh->vnh_csum_start = eth_hsize + csum_start;
1071 		vnh->vnh_csum_offset = csum_stuff - csum_start;
1072 	}
1073 
1074 	/*
1075 	 * Setup LSO fields if required.
1076 	 */
1077 	if (lso_required) {
1078 		vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1079 		vnh->vnh_gso_size = (uint16_t)lso_mss;
1080 	}
1081 
1082 	/*
1083 	 * The device does not maintain its own statistics about broadcast or
1084 	 * multicast packets, so we have to check the destination address
1085 	 * ourselves.
1086 	 */
1087 	if ((ether->ether_dhost.ether_addr_octet[0] & 0x01) != 0) {
1088 		mutex_enter(&vif->vif_mutex);
1089 		if (ether_cmp(&ether->ether_dhost, vioif_broadcast) == 0) {
1090 			vif->vif_brdcstxmt++;
1091 		} else {
1092 			vif->vif_multixmt++;
1093 		}
1094 		mutex_exit(&vif->vif_mutex);
1095 	}
1096 
1097 	/*
1098 	 * For small packets, copy into the preallocated inline buffer rather
1099 	 * than incur the overhead of mapping.  Note that both of these
1100 	 * functions ensure that "mp" is freed before returning.
1101 	 */
1102 	if (msg_size < vif->vif_txcopy_thresh) {
1103 		ret = vioif_tx_inline(vif, tb, mp, msg_size);
1104 	} else {
1105 		ret = vioif_tx_external(vif, tb, mp, msg_size);
1106 	}
1107 	mp = NULL;
1108 
1109 	mutex_enter(&vif->vif_mutex);
1110 
1111 	if (ret != DDI_SUCCESS) {
1112 		goto fail;
1113 	}
1114 
1115 	vif->vif_opackets++;
1116 	vif->vif_obytes += msg_size;
1117 	mutex_exit(&vif->vif_mutex);
1118 
1119 	virtio_dma_sync(tb->tb_dma, DDI_DMA_SYNC_FORDEV);
1120 	virtio_chain_submit(tb->tb_chain, B_TRUE);
1121 
1122 	return (B_TRUE);
1123 
1124 fail:
1125 	vif->vif_oerrors++;
1126 	if (tb != NULL) {
1127 		vioif_txbuf_free(vif, tb);
1128 	}
1129 	mutex_exit(&vif->vif_mutex);
1130 
1131 	return (mp == NULL);
1132 }
1133 
1134 static mblk_t *
1135 vioif_m_tx(void *arg, mblk_t *mp)
1136 {
1137 	vioif_t *vif = arg;
1138 	mblk_t *nmp;
1139 
1140 	/*
1141 	 * Prior to attempting to send any more frames, do a reclaim to pick up
1142 	 * any descriptors which have been processed by the host.
1143 	 */
1144 	if (virtio_queue_nactive(vif->vif_tx_vq) != 0) {
1145 		(void) vioif_reclaim_used_tx(vif);
1146 	}
1147 
1148 	while (mp != NULL) {
1149 		nmp = mp->b_next;
1150 		mp->b_next = NULL;
1151 
1152 		if (!vioif_send(vif, mp)) {
1153 			/*
1154 			 * If there are no descriptors available, try to
1155 			 * reclaim some, allowing a retry of the send if some
1156 			 * are found.
1157 			 */
1158 			mp->b_next = nmp;
1159 			if (vioif_reclaim_used_tx(vif) != 0) {
1160 				continue;
1161 			}
1162 
1163 			/*
1164 			 * Otherwise, enable the TX ring interrupt so that as
1165 			 * soon as a descriptor becomes available, transmission
1166 			 * can begin again.  For safety, make sure the periodic
1167 			 * reclaim is running as well.
1168 			 */
1169 			mutex_enter(&vif->vif_mutex);
1170 			vif->vif_tx_corked = B_TRUE;
1171 			virtio_queue_no_interrupt(vif->vif_tx_vq, B_FALSE);
1172 			vioif_reclaim_restart(vif);
1173 			mutex_exit(&vif->vif_mutex);
1174 			return (mp);
1175 		}
1176 		mp = nmp;
1177 	}
1178 
1179 	/* Ensure the periodic reclaim has been started. */
1180 	mutex_enter(&vif->vif_mutex);
1181 	vioif_reclaim_restart(vif);
1182 	mutex_exit(&vif->vif_mutex);
1183 
1184 	return (NULL);
1185 }
1186 
1187 static int
1188 vioif_m_start(void *arg)
1189 {
1190 	vioif_t *vif = arg;
1191 
1192 	mutex_enter(&vif->vif_mutex);
1193 
1194 	VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPED);
1195 	vif->vif_runstate = VIOIF_RUNSTATE_RUNNING;
1196 
1197 	mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);
1198 
1199 	virtio_queue_no_interrupt(vif->vif_rx_vq, B_FALSE);
1200 
1201 	/*
1202 	 * Starting interrupts on the TX virtqueue is unnecessary at this time.
1203 	 * Descriptor reclamation is handling during transmit, via a periodic
1204 	 * timer, and when resources are tight, via the then-enabled interrupt.
1205 	 */
1206 	vif->vif_tx_drain = B_FALSE;
1207 
1208 	/*
1209 	 * Add as many receive buffers as we can to the receive queue.  If we
1210 	 * cannot add any, it may be because we have stopped and started again
1211 	 * and the descriptors are all in the queue already.
1212 	 */
1213 	(void) vioif_add_rx(vif);
1214 
1215 	mutex_exit(&vif->vif_mutex);
1216 	return (DDI_SUCCESS);
1217 }
1218 
1219 static void
1220 vioif_m_stop(void *arg)
1221 {
1222 	vioif_t *vif = arg;
1223 
1224 	mutex_enter(&vif->vif_mutex);
1225 
1226 	VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_RUNNING);
1227 	vif->vif_runstate = VIOIF_RUNSTATE_STOPPING;
1228 
1229 	/* Ensure all TX descriptors have been processed and reclaimed */
1230 	vioif_tx_drain(vif);
1231 
1232 	virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);
1233 
1234 	vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
1235 	mutex_exit(&vif->vif_mutex);
1236 }
1237 
1238 static int
1239 vioif_m_stat(void *arg, uint_t stat, uint64_t *val)
1240 {
1241 	vioif_t *vif = arg;
1242 
1243 	switch (stat) {
1244 	case MAC_STAT_IERRORS:
1245 		*val = vif->vif_ierrors;
1246 		break;
1247 	case MAC_STAT_OERRORS:
1248 		*val = vif->vif_oerrors;
1249 		break;
1250 	case MAC_STAT_MULTIRCV:
1251 		*val = vif->vif_multircv;
1252 		break;
1253 	case MAC_STAT_BRDCSTRCV:
1254 		*val = vif->vif_brdcstrcv;
1255 		break;
1256 	case MAC_STAT_MULTIXMT:
1257 		*val = vif->vif_multixmt;
1258 		break;
1259 	case MAC_STAT_BRDCSTXMT:
1260 		*val = vif->vif_brdcstxmt;
1261 		break;
1262 	case MAC_STAT_IPACKETS:
1263 		*val = vif->vif_ipackets;
1264 		break;
1265 	case MAC_STAT_RBYTES:
1266 		*val = vif->vif_rbytes;
1267 		break;
1268 	case MAC_STAT_OPACKETS:
1269 		*val = vif->vif_opackets;
1270 		break;
1271 	case MAC_STAT_OBYTES:
1272 		*val = vif->vif_obytes;
1273 		break;
1274 	case MAC_STAT_NORCVBUF:
1275 		*val = vif->vif_norecvbuf;
1276 		break;
1277 	case MAC_STAT_NOXMTBUF:
1278 		*val = vif->vif_notxbuf;
1279 		break;
1280 	case MAC_STAT_IFSPEED:
1281 		/* always 1 Gbit */
1282 		*val = 1000000000ULL;
1283 		break;
1284 	case ETHER_STAT_LINK_DUPLEX:
1285 		/* virtual device, always full-duplex */
1286 		*val = LINK_DUPLEX_FULL;
1287 		break;
1288 
1289 	default:
1290 		return (ENOTSUP);
1291 	}
1292 
1293 	return (DDI_SUCCESS);
1294 }
1295 
1296 static int
1297 vioif_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1298     uint_t pr_valsize, const void *pr_val)
1299 {
1300 	vioif_t *vif = arg;
1301 
1302 	switch (pr_num) {
1303 	case MAC_PROP_MTU: {
1304 		int r;
1305 		uint32_t mtu;
1306 		if (pr_valsize < sizeof (mtu)) {
1307 			return (EOVERFLOW);
1308 		}
1309 		bcopy(pr_val, &mtu, sizeof (mtu));
1310 
1311 		if (mtu < ETHERMIN || mtu > vif->vif_mtu_max) {
1312 			return (EINVAL);
1313 		}
1314 
1315 		mutex_enter(&vif->vif_mutex);
1316 		if ((r = mac_maxsdu_update(vif->vif_mac_handle, mtu)) == 0) {
1317 			vif->vif_mtu = mtu;
1318 		}
1319 		mutex_exit(&vif->vif_mutex);
1320 
1321 		return (r);
1322 	}
1323 
1324 	case MAC_PROP_PRIVATE: {
1325 		long max, result;
1326 		uint_t *resp;
1327 		char *endptr;
1328 
1329 		if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1330 			max = VIOIF_MACPROP_TXCOPY_THRESH_MAX;
1331 			resp = &vif->vif_txcopy_thresh;
1332 		} else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1333 			max = VIOIF_MACPROP_RXCOPY_THRESH_MAX;
1334 			resp = &vif->vif_rxcopy_thresh;
1335 		} else {
1336 			return (ENOTSUP);
1337 		}
1338 
1339 		if (pr_val == NULL) {
1340 			return (EINVAL);
1341 		}
1342 
1343 		if (ddi_strtol(pr_val, &endptr, 10, &result) != 0 ||
1344 		    *endptr != '\0' || result < 0 || result > max) {
1345 			return (EINVAL);
1346 		}
1347 
1348 		mutex_enter(&vif->vif_mutex);
1349 		*resp = result;
1350 		mutex_exit(&vif->vif_mutex);
1351 
1352 		return (0);
1353 	}
1354 
1355 	default:
1356 		return (ENOTSUP);
1357 	}
1358 }
1359 
1360 static int
1361 vioif_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1362     uint_t pr_valsize, void *pr_val)
1363 {
1364 	vioif_t *vif = arg;
1365 
1366 	switch (pr_num) {
1367 	case MAC_PROP_PRIVATE: {
1368 		uint_t value;
1369 
1370 		if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1371 			value = vif->vif_txcopy_thresh;
1372 		} else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1373 			value = vif->vif_rxcopy_thresh;
1374 		} else {
1375 			return (ENOTSUP);
1376 		}
1377 
1378 		if (snprintf(pr_val, pr_valsize, "%u", value) >= pr_valsize) {
1379 			return (EOVERFLOW);
1380 		}
1381 
1382 		return (0);
1383 	}
1384 
1385 	default:
1386 		return (ENOTSUP);
1387 	}
1388 }
1389 
1390 static void
1391 vioif_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1392     mac_prop_info_handle_t prh)
1393 {
1394 	vioif_t *vif = arg;
1395 	char valstr[64];
1396 	int value;
1397 
1398 	switch (pr_num) {
1399 	case MAC_PROP_MTU:
1400 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1401 		mac_prop_info_set_range_uint32(prh, ETHERMIN, vif->vif_mtu_max);
1402 		return;
1403 
1404 	case MAC_PROP_PRIVATE:
1405 		if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1406 			value = VIOIF_MACPROP_TXCOPY_THRESH_DEF;
1407 		} else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1408 			value = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
1409 		} else {
1410 			/*
1411 			 * We do not recognise this private property name.
1412 			 */
1413 			return;
1414 		}
1415 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1416 		(void) snprintf(valstr, sizeof (valstr), "%d", value);
1417 		mac_prop_info_set_default_str(prh, valstr);
1418 		return;
1419 
1420 	default:
1421 		return;
1422 	}
1423 }
1424 
1425 static boolean_t
1426 vioif_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1427 {
1428 	vioif_t *vif = arg;
1429 
1430 	switch (cap) {
1431 	case MAC_CAPAB_HCKSUM: {
1432 		if (!vif->vif_tx_csum) {
1433 			return (B_FALSE);
1434 		}
1435 
1436 		*(uint32_t *)cap_data = HCKSUM_INET_PARTIAL;
1437 
1438 		return (B_TRUE);
1439 	}
1440 
1441 	case MAC_CAPAB_LSO: {
1442 		if (!vif->vif_tx_tso4) {
1443 			return (B_FALSE);
1444 		}
1445 
1446 		mac_capab_lso_t *lso = cap_data;
1447 		lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
1448 		lso->lso_basic_tcp_ipv4.lso_max = VIOIF_RX_DATA_SIZE;
1449 
1450 		return (B_TRUE);
1451 	}
1452 
1453 	default:
1454 		return (B_FALSE);
1455 	}
1456 }
1457 
1458 static boolean_t
1459 vioif_has_feature(vioif_t *vif, uint32_t feature)
1460 {
1461 	return (virtio_feature_present(vif->vif_virtio, feature));
1462 }
1463 
1464 /*
1465  * Read the primary MAC address from the device if one is provided.  If not,
1466  * generate a random locally administered MAC address and write it back to the
1467  * device.
1468  */
1469 static void
1470 vioif_get_mac(vioif_t *vif)
1471 {
1472 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
1473 
1474 	if (vioif_has_feature(vif, VIRTIO_NET_F_MAC)) {
1475 		for (uint_t i = 0; i < ETHERADDRL; i++) {
1476 			vif->vif_mac[i] = virtio_dev_get8(vif->vif_virtio,
1477 			    VIRTIO_NET_CONFIG_MAC + i);
1478 		}
1479 		vif->vif_mac_from_host = 1;
1480 
1481 		return;
1482 	}
1483 
1484 	/* Get a few random bytes */
1485 	(void) random_get_pseudo_bytes(vif->vif_mac, ETHERADDRL);
1486 	/* Make sure it's a unicast MAC */
1487 	vif->vif_mac[0] &= ~1;
1488 	/* Set the "locally administered" bit */
1489 	vif->vif_mac[1] |= 2;
1490 
1491 	/*
1492 	 * Write the random MAC address back to the device.
1493 	 */
1494 	for (uint_t i = 0; i < ETHERADDRL; i++) {
1495 		virtio_dev_put8(vif->vif_virtio, VIRTIO_NET_CONFIG_MAC + i,
1496 		    vif->vif_mac[i]);
1497 	}
1498 	vif->vif_mac_from_host = 0;
1499 
1500 	dev_err(vif->vif_dip, CE_NOTE, "!Generated a random MAC address: "
1501 	    "%02x:%02x:%02x:%02x:%02x:%02x",
1502 	    (uint_t)vif->vif_mac[0], (uint_t)vif->vif_mac[1],
1503 	    (uint_t)vif->vif_mac[2], (uint_t)vif->vif_mac[3],
1504 	    (uint_t)vif->vif_mac[4], (uint_t)vif->vif_mac[5]);
1505 }
1506 
1507 /*
1508  * Virtqueue interrupt handlers
1509  */
1510 static uint_t
1511 vioif_rx_handler(caddr_t arg0, caddr_t arg1)
1512 {
1513 	vioif_t *vif = (vioif_t *)arg0;
1514 
1515 	mutex_enter(&vif->vif_mutex);
1516 	(void) vioif_process_rx(vif);
1517 
1518 	/*
1519 	 * Attempt to replenish the receive queue.  If we cannot add any
1520 	 * descriptors here, it may be because all of the recently received
1521 	 * packets were loaned up to the networking stack.
1522 	 */
1523 	(void) vioif_add_rx(vif);
1524 	mutex_exit(&vif->vif_mutex);
1525 
1526 	return (DDI_INTR_CLAIMED);
1527 }
1528 
1529 static uint_t
1530 vioif_tx_handler(caddr_t arg0, caddr_t arg1)
1531 {
1532 	vioif_t *vif = (vioif_t *)arg0;
1533 
1534 	/*
1535 	 * The TX interrupt could race with other reclamation activity, so
1536 	 * interpreting the return value is unimportant.
1537 	 */
1538 	(void) vioif_reclaim_used_tx(vif);
1539 
1540 	return (DDI_INTR_CLAIMED);
1541 }
1542 
1543 static void
1544 vioif_check_features(vioif_t *vif)
1545 {
1546 	VERIFY(MUTEX_HELD(&vif->vif_mutex));
1547 
1548 	vif->vif_tx_csum = 0;
1549 	vif->vif_tx_tso4 = 0;
1550 
1551 	if (vioif_has_feature(vif, VIRTIO_NET_F_CSUM)) {
1552 		/*
1553 		 * The host will accept packets with partial checksums from us.
1554 		 */
1555 		vif->vif_tx_csum = 1;
1556 
1557 		/*
1558 		 * The legacy GSO feature represents the combination of
1559 		 * HOST_TSO4, HOST_TSO6, and HOST_ECN.
1560 		 */
1561 		boolean_t gso = vioif_has_feature(vif, VIRTIO_NET_F_GSO);
1562 		boolean_t tso4 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO4);
1563 		boolean_t ecn = vioif_has_feature(vif, VIRTIO_NET_F_HOST_ECN);
1564 
1565 		/*
1566 		 * Explicit congestion notification (ECN) is configured
1567 		 * globally; see "tcp_ecn_permitted".  As we cannot currently
1568 		 * request that the stack disable ECN on a per interface basis,
1569 		 * we require the device to support the combination of
1570 		 * segmentation offload and ECN support.
1571 		 */
1572 		if (gso || (tso4 && ecn)) {
1573 			vif->vif_tx_tso4 = 1;
1574 		}
1575 	}
1576 }
1577 
1578 static int
1579 vioif_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1580 {
1581 	int ret;
1582 	vioif_t *vif;
1583 	virtio_t *vio;
1584 	mac_register_t *macp = NULL;
1585 
1586 	if (cmd != DDI_ATTACH) {
1587 		return (DDI_FAILURE);
1588 	}
1589 
1590 	if ((vio = virtio_init(dip, VIRTIO_NET_WANTED_FEATURES, B_TRUE)) ==
1591 	    NULL) {
1592 		return (DDI_FAILURE);
1593 	}
1594 
1595 	vif = kmem_zalloc(sizeof (*vif), KM_SLEEP);
1596 	vif->vif_dip = dip;
1597 	vif->vif_virtio = vio;
1598 	vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
1599 	ddi_set_driver_private(dip, vif);
1600 
1601 	if ((vif->vif_rx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_RX,
1602 	    "rx", vioif_rx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL ||
1603 	    (vif->vif_tx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_TX,
1604 	    "tx", vioif_tx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL) {
1605 		goto fail;
1606 	}
1607 
1608 	if (virtio_init_complete(vio, 0) != DDI_SUCCESS) {
1609 		dev_err(dip, CE_WARN, "failed to complete Virtio init");
1610 		goto fail;
1611 	}
1612 
1613 	virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);
1614 	virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
1615 
1616 	mutex_init(&vif->vif_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
1617 	mutex_enter(&vif->vif_mutex);
1618 
1619 	vioif_get_mac(vif);
1620 
1621 	vif->vif_rxcopy_thresh = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
1622 	vif->vif_txcopy_thresh = VIOIF_MACPROP_TXCOPY_THRESH_DEF;
1623 
1624 	if (vioif_has_feature(vif, VIRTIO_NET_F_MTU)) {
1625 		vif->vif_mtu_max = virtio_dev_get16(vio, VIRTIO_NET_CONFIG_MTU);
1626 	} else {
1627 		vif->vif_mtu_max = ETHERMTU;
1628 	}
1629 
1630 	vif->vif_mtu = ETHERMTU;
1631 	if (vif->vif_mtu > vif->vif_mtu_max) {
1632 		vif->vif_mtu = vif->vif_mtu_max;
1633 	}
1634 
1635 	vioif_check_features(vif);
1636 
1637 	if (vioif_alloc_bufs(vif) != 0) {
1638 		mutex_exit(&vif->vif_mutex);
1639 		dev_err(dip, CE_WARN, "failed to allocate memory");
1640 		goto fail;
1641 	}
1642 
1643 	mutex_exit(&vif->vif_mutex);
1644 
1645 	if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
1646 		dev_err(dip, CE_WARN, "failed to enable interrupts");
1647 		goto fail;
1648 	}
1649 
1650 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
1651 		dev_err(dip, CE_WARN, "failed to allocate a mac_register");
1652 		goto fail;
1653 	}
1654 
1655 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1656 	macp->m_driver = vif;
1657 	macp->m_dip = dip;
1658 	macp->m_src_addr = vif->vif_mac;
1659 	macp->m_callbacks = &vioif_mac_callbacks;
1660 	macp->m_min_sdu = 0;
1661 	macp->m_max_sdu = vif->vif_mtu;
1662 	macp->m_margin = VLAN_TAGSZ;
1663 	macp->m_priv_props = vioif_priv_props;
1664 
1665 	if ((ret = mac_register(macp, &vif->vif_mac_handle)) != 0) {
1666 		dev_err(dip, CE_WARN, "mac_register() failed (%d)", ret);
1667 		goto fail;
1668 	}
1669 	mac_free(macp);
1670 
1671 	mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);
1672 
1673 	return (DDI_SUCCESS);
1674 
1675 fail:
1676 	vioif_free_bufs(vif);
1677 	if (macp != NULL) {
1678 		mac_free(macp);
1679 	}
1680 	(void) virtio_fini(vio, B_TRUE);
1681 	kmem_free(vif, sizeof (*vif));
1682 	return (DDI_FAILURE);
1683 }
1684 
1685 static int
1686 vioif_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1687 {
1688 	int r;
1689 	vioif_t *vif;
1690 
1691 	if (cmd != DDI_DETACH) {
1692 		return (DDI_FAILURE);
1693 	}
1694 
1695 	if ((vif = ddi_get_driver_private(dip)) == NULL) {
1696 		return (DDI_FAILURE);
1697 	}
1698 
1699 	mutex_enter(&vif->vif_mutex);
1700 	if (vif->vif_runstate != VIOIF_RUNSTATE_STOPPED) {
1701 		dev_err(dip, CE_WARN, "!NIC still running, cannot detach");
1702 		mutex_exit(&vif->vif_mutex);
1703 		return (DDI_FAILURE);
1704 	}
1705 
1706 	/*
1707 	 * There should be no outstanding transmit buffers once the NIC is
1708 	 * completely stopped.
1709 	 */
1710 	VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);
1711 
1712 	/*
1713 	 * Though we cannot claw back all of the receive buffers until we reset
1714 	 * the device, we must ensure all those loaned to MAC have been
1715 	 * returned before calling mac_unregister().
1716 	 */
1717 	if (vif->vif_nrxbufs_onloan > 0) {
1718 		dev_err(dip, CE_WARN, "!%u receive buffers still loaned, "
1719 		    "cannot detach", vif->vif_nrxbufs_onloan);
1720 		mutex_exit(&vif->vif_mutex);
1721 		return (DDI_FAILURE);
1722 	}
1723 
1724 	if ((r = mac_unregister(vif->vif_mac_handle)) != 0) {
1725 		dev_err(dip, CE_WARN, "!MAC unregister failed (%d)", r);
1726 		return (DDI_FAILURE);
1727 	}
1728 	mac_free(vif->vif_macp);
1729 
1730 	/*
1731 	 * Shut down the device so that we can recover any previously
1732 	 * submitted receive buffers.
1733 	 */
1734 	virtio_shutdown(vif->vif_virtio);
1735 	for (;;) {
1736 		virtio_chain_t *vic;
1737 
1738 		if ((vic = virtio_queue_evacuate(vif->vif_rx_vq)) == NULL) {
1739 			break;
1740 		}
1741 
1742 		vioif_rxbuf_t *rb = virtio_chain_data(vic);
1743 		vioif_rxbuf_free(vif, rb);
1744 	}
1745 
1746 	(void) virtio_fini(vif->vif_virtio, B_FALSE);
1747 
1748 	vioif_free_bufs(vif);
1749 
1750 	mutex_exit(&vif->vif_mutex);
1751 	mutex_destroy(&vif->vif_mutex);
1752 
1753 	kmem_free(vif, sizeof (*vif));
1754 
1755 	return (DDI_SUCCESS);
1756 }
1757 
1758 static int
1759 vioif_quiesce(dev_info_t *dip)
1760 {
1761 	vioif_t *vif;
1762 
1763 	if ((vif = ddi_get_driver_private(dip)) == NULL)
1764 		return (DDI_FAILURE);
1765 
1766 	return (virtio_quiesce(vif->vif_virtio));
1767 }
1768 
1769 int
1770 _init(void)
1771 {
1772 	int ret;
1773 
1774 	mac_init_ops(&vioif_dev_ops, "vioif");
1775 
1776 	if ((ret = mod_install(&vioif_modlinkage)) != DDI_SUCCESS) {
1777 		mac_fini_ops(&vioif_dev_ops);
1778 	}
1779 
1780 	return (ret);
1781 }
1782 
1783 int
1784 _fini(void)
1785 {
1786 	int ret;
1787 
1788 	if ((ret = mod_remove(&vioif_modlinkage)) == DDI_SUCCESS) {
1789 		mac_fini_ops(&vioif_dev_ops);
1790 	}
1791 
1792 	return (ret);
1793 }
1794 
1795 int
1796 _info(struct modinfo *modinfop)
1797 {
1798 	return (mod_info(&vioif_modlinkage, modinfop));
1799 }
1800