xref: /illumos-gate/usr/src/uts/common/io/vioif/vioif.c (revision 67d74cc3e7c9d9461311136a0b2069813a3fd927)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2013 Nexenta Inc.  All rights reserved.
14  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
15  * Copyright 2019 Joyent, Inc.
16  */
17 
18 /* Based on the NetBSD virtio driver by Minoura Makoto. */
19 /*
20  * Copyright (c) 2010 Minoura Makoto.
21  * All rights reserved.
22  *
23  * Redistribution and use in source and binary forms, with or without
24  * modification, are permitted provided that the following conditions
25  * are met:
26  * 1. Redistributions of source code must retain the above copyright
27  *    notice, this list of conditions and the following disclaimer.
28  * 2. Redistributions in binary form must reproduce the above copyright
29  *    notice, this list of conditions and the following disclaimer in the
30  *    documentation and/or other materials provided with the distribution.
31  *
32  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
33  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
34  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
35  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
36  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
38  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
39  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
40  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
41  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/errno.h>
46 #include <sys/param.h>
47 #include <sys/stropts.h>
48 #include <sys/stream.h>
49 #include <sys/strsubr.h>
50 #include <sys/kmem.h>
51 #include <sys/conf.h>
52 #include <sys/devops.h>
53 #include <sys/ksynch.h>
54 #include <sys/stat.h>
55 #include <sys/modctl.h>
56 #include <sys/debug.h>
57 #include <sys/pci.h>
58 #include <sys/ethernet.h>
59 #include <sys/vlan.h>
60 
61 #include <sys/dlpi.h>
62 #include <sys/taskq.h>
63 
64 #include <sys/pattr.h>
65 #include <sys/strsun.h>
66 
67 #include <sys/random.h>
68 #include <sys/containerof.h>
69 #include <sys/stream.h>
70 
71 #include <sys/mac.h>
72 #include <sys/mac_provider.h>
73 #include <sys/mac_ether.h>
74 
75 #include "virtiovar.h"
76 #include "virtioreg.h"
77 
/*
 * Configuration registers: offsets handed to the virtio_read_device_config_*()
 * accessors (see vioif_link_state() for the STATUS register).
 */
#define	VIRTIO_NET_CONFIG_MAC		0 /* 8bit x 6byte */
#define	VIRTIO_NET_CONFIG_STATUS	6 /* 16bit */

/*
 * Feature bits.  These are tested against sc_virtio.sc_features, e.g. in
 * vioif_link_state().  Bits 2-4 have no definition in this file.
 */
#define	VIRTIO_NET_F_CSUM	(1 << 0) /* Host handles pkts w/ partial csum */
#define	VIRTIO_NET_F_GUEST_CSUM	(1 << 1) /* Guest handles pkts w/ part csum */
#define	VIRTIO_NET_F_MAC	(1 << 5) /* Host has given MAC address. */
#define	VIRTIO_NET_F_GSO	(1 << 6) /* Host handles pkts w/ any GSO type */
#define	VIRTIO_NET_F_GUEST_TSO4	(1 << 7) /* Guest can handle TSOv4 in. */
#define	VIRTIO_NET_F_GUEST_TSO6	(1 << 8) /* Guest can handle TSOv6 in. */
#define	VIRTIO_NET_F_GUEST_ECN	(1 << 9) /* Guest can handle TSO[6] w/ ECN in */
#define	VIRTIO_NET_F_GUEST_UFO	(1 << 10) /* Guest can handle UFO in. */
#define	VIRTIO_NET_F_HOST_TSO4	(1 << 11) /* Host can handle TSOv4 in. */
#define	VIRTIO_NET_F_HOST_TSO6	(1 << 12) /* Host can handle TSOv6 in. */
#define	VIRTIO_NET_F_HOST_ECN	(1 << 13) /* Host can handle TSO[6] w/ ECN in */
#define	VIRTIO_NET_F_HOST_UFO	(1 << 14) /* Host can handle UFO in. */
#define	VIRTIO_NET_F_MRG_RXBUF	(1 << 15) /* Host can merge receive buffers. */
#define	VIRTIO_NET_F_STATUS	(1 << 16) /* Config.status available */
#define	VIRTIO_NET_F_CTRL_VQ	(1 << 17) /* Control channel available */
#define	VIRTIO_NET_F_CTRL_RX	(1 << 18) /* Control channel RX mode support */
#define	VIRTIO_NET_F_CTRL_VLAN	(1 << 19) /* Control channel VLAN filtering */
#define	VIRTIO_NET_F_CTRL_RX_EXTRA (1 << 20) /* Extra RX mode control support */
101 
/*
 * Bit-name table for the feature word.  The string starts with "\020" (16)
 * and is followed by one entry per feature bit: a single character whose
 * value is the 1-based bit position (written as an octal escape) and then
 * the name.  For example "\6MAC" names bit 5, i.e. VIRTIO_NET_F_MAC, and
 * "\25CTRL_RX_EXTRA" names bit 20.
 *
 * NOTE(review): this looks like the layout consumed by the cmn_err(9F) "%b"
 * conversion; the consumer is not part of this excerpt -- confirm.
 */
#define	VIRTIO_NET_FEATURE_BITS \
	"\020" \
	"\1CSUM" \
	"\2GUEST_CSUM" \
	"\6MAC" \
	"\7GSO" \
	"\10GUEST_TSO4" \
	"\11GUEST_TSO6" \
	"\12GUEST_ECN" \
	"\13GUEST_UFO" \
	"\14HOST_TSO4" \
	"\15HOST_TSO6" \
	"\16HOST_ECN" \
	"\17HOST_UFO" \
	"\20MRG_RXBUF" \
	"\21STATUS" \
	"\22CTRL_VQ" \
	"\23CTRL_RX" \
	"\24CTRL_VLAN" \
	"\25CTRL_RX_EXTRA"
122 
/*
 * Status: bit tested in the 16-bit VIRTIO_NET_CONFIG_STATUS register by
 * vioif_link_state() to decide between LINK_STATE_UP and LINK_STATE_DOWN.
 */
#define	VIRTIO_NET_S_LINK_UP	1
125 
#pragma pack(1)
/*
 * Packet header structure.
 *
 * This header sits at the start of every rx buffer (the rx path skips
 * sizeof (struct virtio_net_hdr) bytes to reach the frame) and at the start
 * of each tx inline buffer.  It is declared under pack(1), so the layout is
 * exactly the ten bytes listed here with no padding.
 */
struct virtio_net_hdr {
	uint8_t		flags;		/* VIRTIO_NET_HDR_F_* */
	uint8_t		gso_type;	/* VIRTIO_NET_HDR_GSO_* */
	uint16_t	hdr_len;
	uint16_t	gso_size;
	uint16_t	csum_start;
	uint16_t	csum_offset;
};
#pragma pack()
137 
/*
 * Values for the flags and gso_type members of struct virtio_net_hdr.
 * VIRTIO_NET_HDR_GSO_ECN is a modifier bit OR'ed into one of the other
 * gso_type values rather than a type of its own.
 */
#define	VIRTIO_NET_HDR_F_NEEDS_CSUM	1 /* flags */
#define	VIRTIO_NET_HDR_GSO_NONE		0 /* gso_type */
#define	VIRTIO_NET_HDR_GSO_TCPV4	1 /* gso_type */
#define	VIRTIO_NET_HDR_GSO_UDP		3 /* gso_type */
#define	VIRTIO_NET_HDR_GSO_TCPV6	4 /* gso_type */
#define	VIRTIO_NET_HDR_GSO_ECN		0x80 /* gso_type, |'ed */
144 
145 
/*
 * Control virtqueue
 *
 * Two-byte command header; packed so that it occupies exactly two bytes.
 * The VIRTIO_NET_CTRL_* values defined after this structure presumably fill
 * these two members (class first, then command) -- verify against the
 * control queue code, which is not part of this excerpt.
 */
#pragma pack(1)
struct virtio_net_ctrl_cmd {
	uint8_t	class;
	uint8_t	command;
};
#pragma pack()
153 
/*
 * Control queue constants, grouped three ways (RX mode, MAC table, VLAN).
 * NOTE(review): within each group the first macro presumably is the "class"
 * byte of struct virtio_net_ctrl_cmd and the remaining ones are "command"
 * values for that class; no user of these macros is visible here -- confirm.
 */
#define	VIRTIO_NET_CTRL_RX		0
#define	VIRTIO_NET_CTRL_RX_PROMISC	0
#define	VIRTIO_NET_CTRL_RX_ALLMULTI	1

#define	VIRTIO_NET_CTRL_MAC		1
#define	VIRTIO_NET_CTRL_MAC_TABLE_SET	0

#define	VIRTIO_NET_CTRL_VLAN		2
#define	VIRTIO_NET_CTRL_VLAN_ADD	0
#define	VIRTIO_NET_CTRL_VLAN_DEL	1
164 
/*
 * Control queue payload layouts.  All four are declared under pack(1) so
 * that no padding is inserted between or after their members.
 */
#pragma pack(1)
/* One-byte acknowledgement. */
struct virtio_net_ctrl_status {
	uint8_t	ack;
};

/* One-byte on/off switch. */
struct virtio_net_ctrl_rx {
	uint8_t	onoff;
};

/*
 * Variable-length MAC table: a count followed by that many ETHERADDRL-byte
 * addresses stored in a flexible array member.
 */
struct virtio_net_ctrl_mac_tbl {
	uint32_t nentries;
	uint8_t macs[][ETHERADDRL];
};

/* 16-bit VLAN identifier. */
struct virtio_net_ctrl_vlan {
	uint16_t id;
};
#pragma pack()
183 
/*
 * Forward declarations of the entry points named in the
 * DDI_DEFINE_STREAM_OPS() invocation that follows.
 */
static int vioif_quiesce(dev_info_t *);
static int vioif_attach(dev_info_t *, ddi_attach_cmd_t);
static int vioif_detach(dev_info_t *, ddi_detach_cmd_t);
187 
/*
 * Device operations.  The argument labels follow the parameter order of
 * DDI_DEFINE_STREAM_OPS() in <sys/devops.h>: name, identify, probe, attach,
 * detach, reset, getinfo, flag, stream_tab, quiesce.
 */
DDI_DEFINE_STREAM_OPS(vioif_ops,
    nulldev,		/* identify */
    nulldev,		/* probe */
    vioif_attach,	/* attach */
    vioif_detach,	/* detach */
    nodev,		/* reset */
    NULL,		/* getinfo */
    D_MP,		/* flag */
    NULL,		/* stream_tab */
    vioif_quiesce	/* quiesce */);
198 
199 static char vioif_ident[] = "VirtIO ethernet driver";
200 
201 /* Standard Module linkage initialization for a Streams driver */
202 extern struct mod_ops mod_driverops;
203 
204 static struct modldrv modldrv = {
205 	&mod_driverops,		/* Type of module.  This one is a driver */
206 	vioif_ident,		/* short description */
207 	&vioif_ops		/* driver specific ops */
208 };
209 
210 static struct modlinkage modlinkage = {
211 	MODREV_1,
212 	{
213 		(void *)&modldrv,
214 		NULL,
215 	},
216 };
217 
/*
 * Interval for the periodic TX reclaim, in milliseconds.  It is converted to
 * clock ticks with MSEC_TO_TICK_ROUNDUP() each time vioif_reclaim_restart()
 * arms the timeout.
 */
uint_t vioif_reclaim_ms = 200;
220 
221 ddi_device_acc_attr_t vioif_attr = {
222 	DDI_DEVICE_ATTR_V0,
223 	DDI_NEVERSWAP_ACC,	/* virtio is always native byte order */
224 	DDI_STORECACHING_OK_ACC,
225 	DDI_DEFAULT_ACC
226 };
227 
/*
 * A mapping represents a binding for a single buffer that is contiguous in the
 * virtual address space.
 *
 * The rx constructor and the tx inline setup fill every member; the lazily
 * allocated tx external mappings only carry a DMA handle in vbm_dmah.
 */
struct vioif_buf_mapping {
	caddr_t			vbm_buf;	/* from ddi_dma_mem_alloc() */
	ddi_dma_handle_t	vbm_dmah;	/* DMA handle */
	ddi_acc_handle_t	vbm_acch;	/* access handle for vbm_buf */
	ddi_dma_cookie_t	vbm_dmac;	/* first cookie of the binding */
	unsigned int		vbm_ncookies;	/* cookie count of the binding */
};
239 
/*
 * Rx buffers can be loaned upstream, so the code has
 * to allocate them dynamically.
 *
 * Instances come from sc_rxbuf_cache and are set up by vioif_rx_construct().
 */
struct vioif_rx_buf {
	/* Owning softc; vioif_rx_free() uses it to find the cache. */
	struct vioif_softc	*rb_sc;
	/* Free routine (vioif_rx_free) handed to desballoc() when loaning. */
	frtn_t			rb_frtn;

	/* DMA memory holding the virtio_net_hdr followed by the frame. */
	struct vioif_buf_mapping rb_mapping;
};
250 
/*
 * Tx buffers have two mapping types. One, "inline", is pre-allocated and is
 * used to hold the virtio_net_header. Small packets also get copied there, as
 * it's faster than mapping them. Bigger packets get mapped using the "external"
 * mapping array. An array is used, because a packet may consist of multiple
 * fragments, so each fragment gets bound to an entry. According to my
 * observations, the number of fragments does not exceed 2, but just in case,
 * a bigger, up to VIOIF_INDIRECT_MAX - 1 array is allocated. To save resources,
 * the dma handles are allocated lazily in the tx path.
 */
struct vioif_tx_buf {
	/*
	 * Message kept alive while external mappings reference it; it is
	 * unbound and freed in vioif_reclaim_used_tx().  NULL otherwise.
	 */
	mblk_t			*tb_mp;

	/* inline buffer */
	struct vioif_buf_mapping tb_inline_mapping;

	/* External buffers */
	struct vioif_buf_mapping *tb_external_mapping;
	/* Number of external mappings unbound at reclaim time. */
	unsigned int		tb_external_num;
};
271 
/*
 * Per-instance driver state: the virtio core state, the MAC handle, the three
 * virtqueues, the tx reclaim machinery, the buffer arrays and the statistics.
 */
struct vioif_softc {
	dev_info_t		*sc_dev; /* mirrors virtio_softc->sc_dev */
	struct virtio_softc	sc_virtio;

	mac_handle_t sc_mac_handle;
	mac_register_t *sc_macp;

	struct virtqueue	*sc_rx_vq;
	struct virtqueue	*sc_tx_vq;
	struct virtqueue	*sc_ctrl_vq;

	/*
	 * TX virtqueue management resources.  sc_tx_lock is held around
	 * updates of sc_tx_corked, sc_tx_drain and sc_tx_reclaim_tid in the
	 * reclaim and drain routines.
	 */
	kmutex_t		sc_tx_lock;
	boolean_t		sc_tx_corked;
	boolean_t		sc_tx_drain;
	timeout_id_t		sc_tx_reclaim_tid;

	/* Feature bits. */
	unsigned int		sc_rx_csum:1;
	unsigned int		sc_tx_csum:1;
	unsigned int		sc_tx_tso4:1;

	/*
	 * For debugging, it is useful to know whether the MAC address we
	 * are using came from the host (via VIRTIO_NET_CONFIG_MAC) or
	 * was otherwise generated or set from within the guest.
	 */
	unsigned int		sc_mac_from_host:1;

	int			sc_mtu;
	uint8_t			sc_mac[ETHERADDRL];
	/*
	 * For rx buffers, we keep a pointer array, because the buffers
	 * can be loaned upstream, and we have to repopulate the array with
	 * new members.
	 */
	struct vioif_rx_buf	**sc_rxbufs;

	/*
	 * For tx, we just allocate an array of buffers. The packet can
	 * either be copied into the inline buffer, or the external mapping
	 * could be used to map the packet
	 */
	struct vioif_tx_buf	*sc_txbufs;

	kstat_t			*sc_intrstat;
	/*
	 * We "loan" rx buffers upstream and reuse them after they are
	 * freed. This lets us avoid allocations in the hot path.
	 * sc_rxloan counts the buffers currently on loan and is only
	 * updated with atomic operations.
	 */
	kmem_cache_t		*sc_rxbuf_cache;
	ulong_t			sc_rxloan;

	/* Copying small packets turns out to be faster than mapping them. */
	unsigned long		sc_rxcopy_thresh;
	unsigned long		sc_txcopy_thresh;

	/*
	 * Statistics visible through mac:
	 */
	uint64_t		sc_ipackets;
	uint64_t		sc_opackets;
	uint64_t		sc_rbytes;
	uint64_t		sc_obytes;
	uint64_t		sc_brdcstxmt;
	uint64_t		sc_brdcstrcv;
	uint64_t		sc_multixmt;
	uint64_t		sc_multircv;
	uint64_t		sc_norecvbuf;
	uint64_t		sc_notxbuf;
	uint64_t		sc_ierrors;
	uint64_t		sc_oerrors;

	/*
	 * Internal debugging statistics:
	 */
	uint64_t		sc_rxfail_dma_handle;
	uint64_t		sc_rxfail_dma_buffer;
	uint64_t		sc_rxfail_dma_bind;
	uint64_t		sc_rxfail_chain_undersize;
	uint64_t		sc_rxfail_no_descriptors;
	uint64_t		sc_txfail_dma_handle;
	uint64_t		sc_txfail_dma_bind;
	uint64_t		sc_txfail_indirect_limit;
};
357 
/* Size of the ethernet header that precedes the payload of every frame. */
#define	ETHER_HEADER_LEN		sizeof (struct ether_header)

/*
 * MAX_PAYLOAD is the largest frame handled: MTU + the ethernet header.
 * MAX_MTU is therefore MAX_PAYLOAD less the header; DEFAULT_MTU is the
 * standard ETHERMTU.
 */
#define	MAX_PAYLOAD	65535
#define	MAX_MTU		(MAX_PAYLOAD - ETHER_HEADER_LEN)
#define	DEFAULT_MTU	ETHERMTU
364 
/*
 * Yeah, we spend 8M per device. Turns out, there is no point
 * being smart and using merged rx buffers (VIRTIO_NET_F_MRG_RXBUF),
 * because vhost does not support them, and we expect to be used with
 * vhost in production environment.
 */
/*
 * The buffer keeps both the packet data and the virtio_net_header, so the
 * header size is already part of VIOIF_RX_SIZE.
 */
#define	VIOIF_RX_SIZE (MAX_PAYLOAD + sizeof (struct virtio_net_hdr))
373 
/*
 * We win a bit on header alignment, but the host wins a lot
 * more on moving aligned buffers. Might need more thought.
 * With the value 0 the loaned rx data starts directly after the
 * virtio_net_hdr.
 */
#define	VIOIF_IP_ALIGN 0

/*
 * Maximum number of indirect descriptors, somewhat arbitrary.  One of them is
 * taken by the virtio_net_hdr on the tx path, which is why the mapped-buffer
 * DMA attributes allow VIOIF_INDIRECT_MAX - 1 cookies.
 */
#define	VIOIF_INDIRECT_MAX 128

/*
 * We pre-allocate a reasonably large buffer to copy small packets
 * there. Bigger packets are mapped, packets with multiple
 * cookies are mapped as indirect buffers.
 */
#define	VIOIF_TX_INLINE_SIZE 2048

/*
 * Native queue size for all queues.
 * NOTE(review): 0 presumably asks the virtio layer to use the size advertised
 * by the device; the queue allocation is not part of this excerpt -- confirm.
 */
#define	VIOIF_RX_QLEN 0
#define	VIOIF_TX_QLEN 0
#define	VIOIF_CTRL_QLEN 0
394 
/*
 * The ethernet broadcast address.  vioif_process_rx() compares the
 * destination of group-addressed frames against it to tell broadcast from
 * multicast.
 */
static uchar_t vioif_broadcast[ETHERADDRL] = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
398 
/*
 * NOTE(review): presumably upper bounds for the sc_txcopy_thresh and
 * sc_rxcopy_thresh tunables; the code that applies them is not part of this
 * excerpt -- confirm.
 */
#define	VIOIF_TX_THRESH_MAX	640
#define	VIOIF_RX_THRESH_MAX	640

/* NOTE(review): presumably the size of the rx kmem cache name -- confirm. */
#define	CACHE_NAME_SIZE	32

/* Names of the driver-private properties. */
static char vioif_txcopy_thresh[] =
	"vioif_txcopy_thresh";
static char vioif_rxcopy_thresh[] =
	"vioif_rxcopy_thresh";

/* NULL-terminated list of the private property names above. */
static char *vioif_priv_props[] = {
	vioif_txcopy_thresh,
	vioif_rxcopy_thresh,
	NULL
};
414 
/*
 * Forward declaration: vioif_reclaim_periodic() calls this function before
 * its definition appears.
 */
static void vioif_reclaim_restart(struct vioif_softc *);
416 
417 /* Add up to ddi? */
418 static ddi_dma_cookie_t *
419 vioif_dma_curr_cookie(ddi_dma_handle_t dmah)
420 {
421 	ddi_dma_impl_t *dmah_impl = (void *) dmah;
422 	ASSERT(dmah_impl->dmai_cookie);
423 	return (dmah_impl->dmai_cookie);
424 }
425 
426 static void
427 vioif_dma_reset_cookie(ddi_dma_handle_t dmah, ddi_dma_cookie_t *dmac)
428 {
429 	ddi_dma_impl_t *dmah_impl = (void *) dmah;
430 	dmah_impl->dmai_cookie = dmac;
431 }
432 
433 static link_state_t
434 vioif_link_state(struct vioif_softc *sc)
435 {
436 	if (sc->sc_virtio.sc_features & VIRTIO_NET_F_STATUS) {
437 		if (virtio_read_device_config_2(&sc->sc_virtio,
438 		    VIRTIO_NET_CONFIG_STATUS) & VIRTIO_NET_S_LINK_UP) {
439 			return (LINK_STATE_UP);
440 		} else {
441 			return (LINK_STATE_DOWN);
442 		}
443 	}
444 
445 	return (LINK_STATE_UP);
446 }
447 
448 static ddi_dma_attr_t vioif_inline_buf_dma_attr = {
449 	DMA_ATTR_V0,		/* Version number */
450 	0,			/* low address */
451 	0xFFFFFFFFFFFFFFFF,	/* high address */
452 	0xFFFFFFFF,		/* counter register max */
453 	1,			/* page alignment */
454 	1,			/* burst sizes: 1 - 32 */
455 	1,			/* minimum transfer size */
456 	0xFFFFFFFF,		/* max transfer size */
457 	0xFFFFFFFFFFFFFFF,	/* address register max */
458 	1,			/* scatter-gather capacity */
459 	1,			/* device operates on bytes */
460 	0,			/* attr flag: set to 0 */
461 };
462 
463 static ddi_dma_attr_t vioif_mapped_buf_dma_attr = {
464 	DMA_ATTR_V0,		/* Version number */
465 	0,			/* low address */
466 	0xFFFFFFFFFFFFFFFF,	/* high address */
467 	0xFFFFFFFF,		/* counter register max */
468 	1,			/* page alignment */
469 	1,			/* burst sizes: 1 - 32 */
470 	1,			/* minimum transfer size */
471 	0xFFFFFFFF,		/* max transfer size */
472 	0xFFFFFFFFFFFFFFF,	/* address register max */
473 
474 	/* One entry is used for the virtio_net_hdr on the tx path */
475 	VIOIF_INDIRECT_MAX - 1,	/* scatter-gather capacity */
476 	1,			/* device operates on bytes */
477 	0,			/* attr flag: set to 0 */
478 };
479 
480 static ddi_device_acc_attr_t vioif_bufattr = {
481 	DDI_DEVICE_ATTR_V0,
482 	DDI_NEVERSWAP_ACC,
483 	DDI_STORECACHING_OK_ACC,
484 	DDI_DEFAULT_ACC
485 };
486 
487 static void
488 vioif_rx_free(caddr_t free_arg)
489 {
490 	struct vioif_rx_buf *buf = (void *) free_arg;
491 	struct vioif_softc *sc = buf->rb_sc;
492 
493 	kmem_cache_free(sc->sc_rxbuf_cache, buf);
494 	atomic_dec_ulong(&sc->sc_rxloan);
495 }
496 
497 static int
498 vioif_rx_construct(void *buffer, void *user_arg, int kmflags)
499 {
500 	_NOTE(ARGUNUSED(kmflags));
501 	struct vioif_softc *sc = user_arg;
502 	struct vioif_rx_buf *buf = buffer;
503 	size_t len;
504 
505 	if (ddi_dma_alloc_handle(sc->sc_dev, &vioif_mapped_buf_dma_attr,
506 	    DDI_DMA_SLEEP, NULL, &buf->rb_mapping.vbm_dmah)) {
507 		sc->sc_rxfail_dma_handle++;
508 		goto exit_handle;
509 	}
510 
511 	if (ddi_dma_mem_alloc(buf->rb_mapping.vbm_dmah,
512 	    VIOIF_RX_SIZE + sizeof (struct virtio_net_hdr),
513 	    &vioif_bufattr, DDI_DMA_STREAMING, DDI_DMA_SLEEP,
514 	    NULL, &buf->rb_mapping.vbm_buf, &len, &buf->rb_mapping.vbm_acch)) {
515 		sc->sc_rxfail_dma_buffer++;
516 		goto exit_alloc;
517 	}
518 	ASSERT(len >= VIOIF_RX_SIZE);
519 
520 	if (ddi_dma_addr_bind_handle(buf->rb_mapping.vbm_dmah, NULL,
521 	    buf->rb_mapping.vbm_buf, len, DDI_DMA_READ | DDI_DMA_STREAMING,
522 	    DDI_DMA_SLEEP, NULL, &buf->rb_mapping.vbm_dmac,
523 	    &buf->rb_mapping.vbm_ncookies)) {
524 		sc->sc_rxfail_dma_bind++;
525 		goto exit_bind;
526 	}
527 
528 	ASSERT(buf->rb_mapping.vbm_ncookies <= VIOIF_INDIRECT_MAX);
529 
530 	buf->rb_sc = sc;
531 	buf->rb_frtn.free_arg = (void *) buf;
532 	buf->rb_frtn.free_func = vioif_rx_free;
533 
534 	return (0);
535 exit_bind:
536 	ddi_dma_mem_free(&buf->rb_mapping.vbm_acch);
537 exit_alloc:
538 	ddi_dma_free_handle(&buf->rb_mapping.vbm_dmah);
539 exit_handle:
540 
541 	return (ENOMEM);
542 }
543 
544 static void
545 vioif_rx_destruct(void *buffer, void *user_arg)
546 {
547 	_NOTE(ARGUNUSED(user_arg));
548 	struct vioif_rx_buf *buf = buffer;
549 
550 	ASSERT(buf->rb_mapping.vbm_acch);
551 	ASSERT(buf->rb_mapping.vbm_acch);
552 
553 	(void) ddi_dma_unbind_handle(buf->rb_mapping.vbm_dmah);
554 	ddi_dma_mem_free(&buf->rb_mapping.vbm_acch);
555 	ddi_dma_free_handle(&buf->rb_mapping.vbm_dmah);
556 }
557 
558 static void
559 vioif_free_mems(struct vioif_softc *sc)
560 {
561 	int i;
562 
563 	for (i = 0; i < sc->sc_tx_vq->vq_num; i++) {
564 		struct vioif_tx_buf *buf = &sc->sc_txbufs[i];
565 		int j;
566 
567 		/* Tear down the internal mapping. */
568 
569 		ASSERT(buf->tb_inline_mapping.vbm_acch);
570 		ASSERT(buf->tb_inline_mapping.vbm_dmah);
571 
572 		(void) ddi_dma_unbind_handle(buf->tb_inline_mapping.vbm_dmah);
573 		ddi_dma_mem_free(&buf->tb_inline_mapping.vbm_acch);
574 		ddi_dma_free_handle(&buf->tb_inline_mapping.vbm_dmah);
575 
576 		/* We should not see any in-flight buffers at this point. */
577 		ASSERT(!buf->tb_mp);
578 
579 		/* Free all the dma hdnales we allocated lazily. */
580 		for (j = 0; buf->tb_external_mapping[j].vbm_dmah; j++)
581 			ddi_dma_free_handle(
582 			    &buf->tb_external_mapping[j].vbm_dmah);
583 		/* Free the external mapping array. */
584 		kmem_free(buf->tb_external_mapping,
585 		    sizeof (struct vioif_tx_buf) * VIOIF_INDIRECT_MAX - 1);
586 	}
587 
588 	kmem_free(sc->sc_txbufs, sizeof (struct vioif_tx_buf) *
589 	    sc->sc_tx_vq->vq_num);
590 
591 	for (i = 0; i < sc->sc_rx_vq->vq_num; i++) {
592 		struct vioif_rx_buf *buf = sc->sc_rxbufs[i];
593 
594 		if (buf)
595 			kmem_cache_free(sc->sc_rxbuf_cache, buf);
596 	}
597 	kmem_free(sc->sc_rxbufs, sizeof (struct vioif_rx_buf *) *
598 	    sc->sc_rx_vq->vq_num);
599 }
600 
601 static int
602 vioif_alloc_mems(struct vioif_softc *sc)
603 {
604 	int i, txqsize, rxqsize;
605 	size_t len;
606 	unsigned int nsegments;
607 
608 	txqsize = sc->sc_tx_vq->vq_num;
609 	rxqsize = sc->sc_rx_vq->vq_num;
610 
611 	sc->sc_txbufs = kmem_zalloc(sizeof (struct vioif_tx_buf) * txqsize,
612 	    KM_SLEEP);
613 	if (sc->sc_txbufs == NULL) {
614 		dev_err(sc->sc_dev, CE_WARN,
615 		    "Failed to allocate the tx buffers array");
616 		goto exit_txalloc;
617 	}
618 
619 	/*
620 	 * We don't allocate the rx vioif_bufs, just the pointers, as
621 	 * rx vioif_bufs can be loaned upstream, and we don't know the
622 	 * total number we need.
623 	 */
624 	sc->sc_rxbufs = kmem_zalloc(sizeof (struct vioif_rx_buf *) * rxqsize,
625 	    KM_SLEEP);
626 	if (sc->sc_rxbufs == NULL) {
627 		dev_err(sc->sc_dev, CE_WARN,
628 		    "Failed to allocate the rx buffers pointer array");
629 		goto exit_rxalloc;
630 	}
631 
632 	for (i = 0; i < txqsize; i++) {
633 		struct vioif_tx_buf *buf = &sc->sc_txbufs[i];
634 
635 		/* Allocate and bind an inline mapping. */
636 
637 		if (ddi_dma_alloc_handle(sc->sc_dev,
638 		    &vioif_inline_buf_dma_attr,
639 		    DDI_DMA_SLEEP, NULL, &buf->tb_inline_mapping.vbm_dmah)) {
640 
641 			dev_err(sc->sc_dev, CE_WARN,
642 			    "Can't allocate dma handle for tx buffer %d", i);
643 			goto exit_tx;
644 		}
645 
646 		if (ddi_dma_mem_alloc(buf->tb_inline_mapping.vbm_dmah,
647 		    VIOIF_TX_INLINE_SIZE, &vioif_bufattr, DDI_DMA_STREAMING,
648 		    DDI_DMA_SLEEP, NULL, &buf->tb_inline_mapping.vbm_buf,
649 		    &len, &buf->tb_inline_mapping.vbm_acch)) {
650 
651 			dev_err(sc->sc_dev, CE_WARN,
652 			    "Can't allocate tx buffer %d", i);
653 			goto exit_tx;
654 		}
655 		ASSERT(len >= VIOIF_TX_INLINE_SIZE);
656 
657 		if (ddi_dma_addr_bind_handle(buf->tb_inline_mapping.vbm_dmah,
658 		    NULL, buf->tb_inline_mapping.vbm_buf, len,
659 		    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL,
660 		    &buf->tb_inline_mapping.vbm_dmac, &nsegments)) {
661 
662 			dev_err(sc->sc_dev, CE_WARN,
663 			    "Can't bind tx buffer %d", i);
664 			goto exit_tx;
665 		}
666 
667 		/* We asked for a single segment */
668 		ASSERT(nsegments == 1);
669 
670 		/*
671 		 * We allow up to VIOIF_INDIRECT_MAX - 1 external mappings.
672 		 * In reality, I don't expect more then 2-3 used, but who
673 		 * knows.
674 		 */
675 		buf->tb_external_mapping = kmem_zalloc(
676 		    sizeof (struct vioif_tx_buf) * VIOIF_INDIRECT_MAX - 1,
677 		    KM_SLEEP);
678 
679 		/*
680 		 * The external mapping's dma handles are allocate lazily,
681 		 * as we don't expect most of them to be used..
682 		 */
683 	}
684 
685 	return (0);
686 
687 exit_tx:
688 	for (i = 0; i < txqsize; i++) {
689 		struct vioif_tx_buf *buf = &sc->sc_txbufs[i];
690 
691 		if (buf->tb_inline_mapping.vbm_dmah)
692 			(void) ddi_dma_unbind_handle(
693 			    buf->tb_inline_mapping.vbm_dmah);
694 
695 		if (buf->tb_inline_mapping.vbm_acch)
696 			ddi_dma_mem_free(
697 			    &buf->tb_inline_mapping.vbm_acch);
698 
699 		if (buf->tb_inline_mapping.vbm_dmah)
700 			ddi_dma_free_handle(
701 			    &buf->tb_inline_mapping.vbm_dmah);
702 
703 		if (buf->tb_external_mapping)
704 			kmem_free(buf->tb_external_mapping,
705 			    sizeof (struct vioif_tx_buf) *
706 			    VIOIF_INDIRECT_MAX - 1);
707 	}
708 
709 	kmem_free(sc->sc_rxbufs, sizeof (struct vioif_rx_buf) * rxqsize);
710 
711 exit_rxalloc:
712 	kmem_free(sc->sc_txbufs, sizeof (struct vioif_tx_buf) * txqsize);
713 exit_txalloc:
714 	return (ENOMEM);
715 }
716 
717 /* ARGSUSED */
718 static int
719 vioif_multicst(void *arg, boolean_t add, const uint8_t *macaddr)
720 {
721 	return (DDI_SUCCESS);
722 }
723 
724 /* ARGSUSED */
725 static int
726 vioif_promisc(void *arg, boolean_t on)
727 {
728 	return (DDI_SUCCESS);
729 }
730 
731 /* ARGSUSED */
732 static int
733 vioif_unicst(void *arg, const uint8_t *macaddr)
734 {
735 	return (DDI_FAILURE);
736 }
737 
738 static uint_t
739 vioif_add_rx(struct vioif_softc *sc, int kmflag)
740 {
741 	uint_t num_added = 0;
742 	struct vq_entry *ve;
743 
744 	while ((ve = vq_alloc_entry(sc->sc_rx_vq)) != NULL) {
745 		struct vioif_rx_buf *buf = sc->sc_rxbufs[ve->qe_index];
746 
747 		if (buf == NULL) {
748 			/* First run, allocate the buffer. */
749 			buf = kmem_cache_alloc(sc->sc_rxbuf_cache, kmflag);
750 			sc->sc_rxbufs[ve->qe_index] = buf;
751 		}
752 
753 		/* Still nothing? Bye. */
754 		if (buf == NULL) {
755 			sc->sc_norecvbuf++;
756 			vq_free_entry(sc->sc_rx_vq, ve);
757 			break;
758 		}
759 
760 		ASSERT(buf->rb_mapping.vbm_ncookies >= 1);
761 
762 		/*
763 		 * For an unknown reason, the virtio_net_hdr must be placed
764 		 * as a separate virtio queue entry.
765 		 */
766 		virtio_ve_add_indirect_buf(ve,
767 		    buf->rb_mapping.vbm_dmac.dmac_laddress,
768 		    sizeof (struct virtio_net_hdr), B_FALSE);
769 
770 		/* Add the rest of the first cookie. */
771 		virtio_ve_add_indirect_buf(ve,
772 		    buf->rb_mapping.vbm_dmac.dmac_laddress +
773 		    sizeof (struct virtio_net_hdr),
774 		    buf->rb_mapping.vbm_dmac.dmac_size -
775 		    sizeof (struct virtio_net_hdr), B_FALSE);
776 
777 		/*
778 		 * If the buffer consists of a single cookie (unlikely for a
779 		 * 64-k buffer), we are done. Otherwise, add the rest of the
780 		 * cookies using indirect entries.
781 		 */
782 		if (buf->rb_mapping.vbm_ncookies > 1) {
783 			ddi_dma_cookie_t *first_extra_dmac;
784 			ddi_dma_cookie_t dmac;
785 			first_extra_dmac =
786 			    vioif_dma_curr_cookie(buf->rb_mapping.vbm_dmah);
787 
788 			ddi_dma_nextcookie(buf->rb_mapping.vbm_dmah, &dmac);
789 			virtio_ve_add_cookie(ve, buf->rb_mapping.vbm_dmah,
790 			    dmac, buf->rb_mapping.vbm_ncookies - 1, B_FALSE);
791 			vioif_dma_reset_cookie(buf->rb_mapping.vbm_dmah,
792 			    first_extra_dmac);
793 		}
794 
795 		virtio_push_chain(ve, B_FALSE);
796 		num_added++;
797 	}
798 
799 	return (num_added);
800 }
801 
802 static uint_t
803 vioif_populate_rx(struct vioif_softc *sc, int kmflag)
804 {
805 	uint_t num_added = vioif_add_rx(sc, kmflag);
806 
807 	if (num_added > 0)
808 		virtio_sync_vq(sc->sc_rx_vq);
809 
810 	return (num_added);
811 }
812 
813 static uint_t
814 vioif_process_rx(struct vioif_softc *sc)
815 {
816 	struct vq_entry *ve;
817 	struct vioif_rx_buf *buf;
818 	mblk_t *mphead = NULL, *lastmp = NULL, *mp;
819 	uint32_t len;
820 	uint_t num_processed = 0;
821 
822 	while ((ve = virtio_pull_chain(sc->sc_rx_vq, &len))) {
823 
824 		buf = sc->sc_rxbufs[ve->qe_index];
825 		ASSERT(buf);
826 
827 		if (len < sizeof (struct virtio_net_hdr)) {
828 			sc->sc_rxfail_chain_undersize++;
829 			sc->sc_ierrors++;
830 			virtio_free_chain(ve);
831 			continue;
832 		}
833 
834 		len -= sizeof (struct virtio_net_hdr);
835 		/*
836 		 * We copy small packets that happen to fit into a single
837 		 * cookie and reuse the buffers. For bigger ones, we loan
838 		 * the buffers upstream.
839 		 */
840 		if (len < sc->sc_rxcopy_thresh) {
841 			mp = allocb(len, 0);
842 			if (mp == NULL) {
843 				sc->sc_norecvbuf++;
844 				sc->sc_ierrors++;
845 
846 				virtio_free_chain(ve);
847 				break;
848 			}
849 
850 			bcopy((char *)buf->rb_mapping.vbm_buf +
851 			    sizeof (struct virtio_net_hdr), mp->b_rptr, len);
852 			mp->b_wptr = mp->b_rptr + len;
853 
854 		} else {
855 			mp = desballoc((unsigned char *)
856 			    buf->rb_mapping.vbm_buf +
857 			    sizeof (struct virtio_net_hdr) +
858 			    VIOIF_IP_ALIGN, len, 0, &buf->rb_frtn);
859 			if (mp == NULL) {
860 				sc->sc_norecvbuf++;
861 				sc->sc_ierrors++;
862 
863 				virtio_free_chain(ve);
864 				break;
865 			}
866 			mp->b_wptr = mp->b_rptr + len;
867 
868 			atomic_inc_ulong(&sc->sc_rxloan);
869 			/*
870 			 * Buffer loaned, we will have to allocate a new one
871 			 * for this slot.
872 			 */
873 			sc->sc_rxbufs[ve->qe_index] = NULL;
874 		}
875 
876 		/*
877 		 * virtio-net does not tell us if this packet is multicast
878 		 * or broadcast, so we have to check it.
879 		 */
880 		if (mp->b_rptr[0] & 0x1) {
881 			if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0)
882 				sc->sc_multircv++;
883 			else
884 				sc->sc_brdcstrcv++;
885 		}
886 
887 		sc->sc_rbytes += len;
888 		sc->sc_ipackets++;
889 
890 		virtio_free_chain(ve);
891 
892 		if (lastmp == NULL) {
893 			mphead = mp;
894 		} else {
895 			lastmp->b_next = mp;
896 		}
897 		lastmp = mp;
898 		num_processed++;
899 	}
900 
901 	if (mphead != NULL) {
902 		mac_rx(sc->sc_mac_handle, NULL, mphead);
903 	}
904 
905 	return (num_processed);
906 }
907 
908 static uint_t
909 vioif_reclaim_used_tx(struct vioif_softc *sc)
910 {
911 	struct vq_entry *ve;
912 	uint32_t len;
913 	uint_t num_reclaimed = 0;
914 
915 	while ((ve = virtio_pull_chain(sc->sc_tx_vq, &len))) {
916 		struct vioif_tx_buf *buf;
917 		mblk_t *mp;
918 
919 		/* We don't chain descriptors for tx, so don't expect any. */
920 		ASSERT(!ve->qe_next);
921 
922 		buf = &sc->sc_txbufs[ve->qe_index];
923 		mp = buf->tb_mp;
924 		buf->tb_mp = NULL;
925 
926 		if (mp != NULL) {
927 			for (uint_t i = 0; i < buf->tb_external_num; i++) {
928 				(void) ddi_dma_unbind_handle(
929 				    buf->tb_external_mapping[i].vbm_dmah);
930 			}
931 		}
932 
933 		virtio_free_chain(ve);
934 
935 		/* External mapping used, mp was not freed in vioif_send() */
936 		if (mp != NULL)
937 			freemsg(mp);
938 		num_reclaimed++;
939 	}
940 
941 	/* Return ring to transmitting state if descriptors were reclaimed. */
942 	if (num_reclaimed > 0) {
943 		boolean_t do_update = B_FALSE;
944 
945 		mutex_enter(&sc->sc_tx_lock);
946 		if (sc->sc_tx_corked) {
947 			/*
948 			 * TX was corked on a lack of available descriptors.
949 			 * That dire state has passed so the TX interrupt can
950 			 * be disabled and MAC can be notified that
951 			 * transmission is possible again.
952 			 */
953 			sc->sc_tx_corked = B_FALSE;
954 			virtio_stop_vq_intr(sc->sc_tx_vq);
955 			do_update = B_TRUE;
956 		}
957 		mutex_exit(&sc->sc_tx_lock);
958 
959 		/* Notify MAC outside the above lock */
960 		if (do_update) {
961 			mac_tx_update(sc->sc_mac_handle);
962 		}
963 	}
964 
965 	return (num_reclaimed);
966 }
967 
968 static void
969 vioif_reclaim_periodic(void *arg)
970 {
971 	struct vioif_softc *sc = arg;
972 	uint_t num_reclaimed;
973 
974 	num_reclaimed = vioif_reclaim_used_tx(sc);
975 
976 	mutex_enter(&sc->sc_tx_lock);
977 	sc->sc_tx_reclaim_tid = 0;
978 	/*
979 	 * If used descriptors were reclaimed or TX descriptors appear to be
980 	 * outstanding, the ring is considered active and periodic reclamation
981 	 * is necessary for now.
982 	 */
983 	if (num_reclaimed != 0 || vq_num_used(sc->sc_tx_vq) != 0) {
984 		/* Do not reschedule if the ring is being drained. */
985 		if (!sc->sc_tx_drain) {
986 			vioif_reclaim_restart(sc);
987 		}
988 	}
989 	mutex_exit(&sc->sc_tx_lock);
990 }
991 
992 static void
993 vioif_reclaim_restart(struct vioif_softc *sc)
994 {
995 	ASSERT(MUTEX_HELD(&sc->sc_tx_lock));
996 	ASSERT(!sc->sc_tx_drain);
997 
998 	if (sc->sc_tx_reclaim_tid == 0) {
999 		sc->sc_tx_reclaim_tid = timeout(vioif_reclaim_periodic, sc,
1000 		    MSEC_TO_TICK_ROUNDUP(vioif_reclaim_ms));
1001 	}
1002 }
1003 
1004 static void
1005 vioif_tx_drain(struct vioif_softc *sc)
1006 {
1007 	mutex_enter(&sc->sc_tx_lock);
1008 	sc->sc_tx_drain = B_TRUE;
1009 	/* Put a stop to the periodic reclaim if it is running */
1010 	if (sc->sc_tx_reclaim_tid != 0) {
1011 		timeout_id_t tid = sc->sc_tx_reclaim_tid;
1012 
1013 		/*
1014 		 * With sc_tx_drain set, there is no risk that a racing
1015 		 * vioif_reclaim_periodic() call will reschedule itself.
1016 		 *
1017 		 * Being part of the mc_stop hook also guarantees that
1018 		 * vioif_tx() will not be called to restart it.
1019 		 */
1020 		sc->sc_tx_reclaim_tid = 0;
1021 		mutex_exit(&sc->sc_tx_lock);
1022 		(void) untimeout(tid);
1023 		mutex_enter(&sc->sc_tx_lock);
1024 	}
1025 	virtio_stop_vq_intr(sc->sc_tx_vq);
1026 	mutex_exit(&sc->sc_tx_lock);
1027 
1028 	/*
1029 	 * Wait for all of the TX descriptors to be processed by the host so
1030 	 * they can be reclaimed.
1031 	 */
1032 	while (vq_num_used(sc->sc_tx_vq) != 0) {
1033 		(void) vioif_reclaim_used_tx(sc);
1034 		delay(5);
1035 	}
1036 
1037 	VERIFY(!sc->sc_tx_corked);
1038 	VERIFY3U(sc->sc_tx_reclaim_tid, ==, 0);
1039 	VERIFY3U(vq_num_used(sc->sc_tx_vq), ==, 0);
1040 }
1041 
1042 /* sc will be used to update stat counters. */
1043 /* ARGSUSED */
1044 static inline void
1045 vioif_tx_inline(struct vioif_softc *sc, struct vq_entry *ve, mblk_t *mp,
1046     size_t msg_size)
1047 {
1048 	struct vioif_tx_buf *buf;
1049 	buf = &sc->sc_txbufs[ve->qe_index];
1050 
1051 	ASSERT(buf);
1052 
1053 	/* Frees mp */
1054 	mcopymsg(mp, buf->tb_inline_mapping.vbm_buf +
1055 	    sizeof (struct virtio_net_hdr));
1056 
1057 	virtio_ve_add_indirect_buf(ve,
1058 	    buf->tb_inline_mapping.vbm_dmac.dmac_laddress +
1059 	    sizeof (struct virtio_net_hdr), msg_size, B_TRUE);
1060 }
1061 
1062 static inline int
1063 vioif_tx_lazy_handle_alloc(struct vioif_softc *sc, struct vioif_tx_buf *buf,
1064     int i)
1065 {
1066 	int ret = DDI_SUCCESS;
1067 
1068 	if (!buf->tb_external_mapping[i].vbm_dmah) {
1069 		ret = ddi_dma_alloc_handle(sc->sc_dev,
1070 		    &vioif_mapped_buf_dma_attr, DDI_DMA_SLEEP, NULL,
1071 		    &buf->tb_external_mapping[i].vbm_dmah);
1072 		if (ret != DDI_SUCCESS) {
1073 			sc->sc_txfail_dma_handle++;
1074 		}
1075 	}
1076 
1077 	return (ret);
1078 }
1079 
1080 static inline int
1081 vioif_tx_external(struct vioif_softc *sc, struct vq_entry *ve, mblk_t *mp,
1082     size_t msg_size)
1083 {
1084 	_NOTE(ARGUNUSED(msg_size));
1085 
1086 	struct vioif_tx_buf *buf;
1087 	mblk_t *nmp;
1088 	int i, j;
1089 	int ret = DDI_SUCCESS;
1090 
1091 	buf = &sc->sc_txbufs[ve->qe_index];
1092 
1093 	ASSERT(buf);
1094 
1095 	buf->tb_external_num = 0;
1096 	i = 0;
1097 	nmp = mp;
1098 
1099 	while (nmp) {
1100 		size_t len;
1101 		ddi_dma_cookie_t dmac;
1102 		unsigned int ncookies;
1103 
1104 		len = MBLKL(nmp);
1105 		/*
1106 		 * For some reason, the network stack can
1107 		 * actually send us zero-length fragments.
1108 		 */
1109 		if (len == 0) {
1110 			nmp = nmp->b_cont;
1111 			continue;
1112 		}
1113 
1114 		ret = vioif_tx_lazy_handle_alloc(sc, buf, i);
1115 		if (ret != DDI_SUCCESS) {
1116 			sc->sc_notxbuf++;
1117 			sc->sc_oerrors++;
1118 			goto exit_lazy_alloc;
1119 		}
1120 		ret = ddi_dma_addr_bind_handle(
1121 		    buf->tb_external_mapping[i].vbm_dmah, NULL,
1122 		    (caddr_t)nmp->b_rptr, len,
1123 		    DDI_DMA_WRITE | DDI_DMA_STREAMING,
1124 		    DDI_DMA_SLEEP, NULL, &dmac, &ncookies);
1125 
1126 		if (ret != DDI_SUCCESS) {
1127 			sc->sc_txfail_dma_bind++;
1128 			sc->sc_oerrors++;
1129 			goto exit_bind;
1130 		}
1131 
1132 		/* Check if we still fit into the indirect table. */
1133 		if (virtio_ve_indirect_available(ve) < ncookies) {
1134 			sc->sc_txfail_indirect_limit++;
1135 			sc->sc_notxbuf++;
1136 			sc->sc_oerrors++;
1137 
1138 			ret = DDI_FAILURE;
1139 			goto exit_limit;
1140 		}
1141 
1142 		virtio_ve_add_cookie(ve, buf->tb_external_mapping[i].vbm_dmah,
1143 		    dmac, ncookies, B_TRUE);
1144 
1145 		nmp = nmp->b_cont;
1146 		i++;
1147 	}
1148 
1149 	buf->tb_external_num = i;
1150 	/* Save the mp to free it when the packet is sent. */
1151 	buf->tb_mp = mp;
1152 
1153 	return (DDI_SUCCESS);
1154 
1155 exit_limit:
1156 exit_bind:
1157 exit_lazy_alloc:
1158 
1159 	for (j = 0; j < i; j++) {
1160 		(void) ddi_dma_unbind_handle(
1161 		    buf->tb_external_mapping[j].vbm_dmah);
1162 	}
1163 
1164 	return (ret);
1165 }
1166 
1167 static boolean_t
1168 vioif_send(struct vioif_softc *sc, mblk_t *mp)
1169 {
1170 	struct vq_entry *ve;
1171 	struct vioif_tx_buf *buf;
1172 	struct virtio_net_hdr *net_header = NULL;
1173 	size_t msg_size = 0;
1174 	uint32_t csum_start;
1175 	uint32_t csum_stuff;
1176 	uint32_t csum_flags;
1177 	uint32_t lso_flags;
1178 	uint32_t lso_mss;
1179 	mblk_t *nmp;
1180 	int ret;
1181 	boolean_t lso_required = B_FALSE;
1182 
1183 	for (nmp = mp; nmp; nmp = nmp->b_cont)
1184 		msg_size += MBLKL(nmp);
1185 
1186 	if (sc->sc_tx_tso4) {
1187 		mac_lso_get(mp, &lso_mss, &lso_flags);
1188 		lso_required = (lso_flags & HW_LSO);
1189 	}
1190 
1191 	ve = vq_alloc_entry(sc->sc_tx_vq);
1192 
1193 	if (ve == NULL) {
1194 		sc->sc_notxbuf++;
1195 		/* Out of free descriptors - try later. */
1196 		return (B_FALSE);
1197 	}
1198 	buf = &sc->sc_txbufs[ve->qe_index];
1199 
1200 	/* Use the inline buffer of the first entry for the virtio_net_hdr. */
1201 	(void) memset(buf->tb_inline_mapping.vbm_buf, 0,
1202 	    sizeof (struct virtio_net_hdr));
1203 
1204 	net_header = (struct virtio_net_hdr *)buf->tb_inline_mapping.vbm_buf;
1205 
1206 	mac_hcksum_get(mp, &csum_start, &csum_stuff, NULL,
1207 	    NULL, &csum_flags);
1208 
1209 	/* They want us to do the TCP/UDP csum calculation. */
1210 	if (csum_flags & HCK_PARTIALCKSUM) {
1211 		struct ether_header *eth_header;
1212 		int eth_hsize;
1213 
1214 		/* Did we ask for it? */
1215 		ASSERT(sc->sc_tx_csum);
1216 
1217 		/* We only asked for partial csum packets. */
1218 		ASSERT(!(csum_flags & HCK_IPV4_HDRCKSUM));
1219 		ASSERT(!(csum_flags & HCK_FULLCKSUM));
1220 
1221 		eth_header = (void *) mp->b_rptr;
1222 		if (eth_header->ether_type == htons(ETHERTYPE_VLAN)) {
1223 			eth_hsize = sizeof (struct ether_vlan_header);
1224 		} else {
1225 			eth_hsize = sizeof (struct ether_header);
1226 		}
1227 		net_header->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1228 		net_header->csum_start = eth_hsize + csum_start;
1229 		net_header->csum_offset = csum_stuff - csum_start;
1230 	}
1231 
1232 	/* setup LSO fields if required */
1233 	if (lso_required) {
1234 		net_header->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1235 		net_header->gso_size = (uint16_t)lso_mss;
1236 	}
1237 
1238 	virtio_ve_add_indirect_buf(ve,
1239 	    buf->tb_inline_mapping.vbm_dmac.dmac_laddress,
1240 	    sizeof (struct virtio_net_hdr), B_TRUE);
1241 
1242 	/* meanwhile update the statistic */
1243 	if (mp->b_rptr[0] & 0x1) {
1244 		if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0)
1245 			sc->sc_multixmt++;
1246 		else
1247 			sc->sc_brdcstxmt++;
1248 	}
1249 
1250 	/*
1251 	 * We copy small packets into the inline buffer. The bigger ones
1252 	 * get mapped using the mapped buffer.
1253 	 */
1254 	if (msg_size < sc->sc_txcopy_thresh) {
1255 		vioif_tx_inline(sc, ve, mp, msg_size);
1256 	} else {
1257 		/* statistic gets updated by vioif_tx_external when fail */
1258 		ret = vioif_tx_external(sc, ve, mp, msg_size);
1259 		if (ret != DDI_SUCCESS)
1260 			goto exit_tx_external;
1261 	}
1262 
1263 	virtio_push_chain(ve, B_TRUE);
1264 
1265 	sc->sc_opackets++;
1266 	sc->sc_obytes += msg_size;
1267 
1268 	return (B_TRUE);
1269 
1270 exit_tx_external:
1271 
1272 	vq_free_entry(sc->sc_tx_vq, ve);
1273 	/*
1274 	 * vioif_tx_external can fail when the buffer does not fit into the
1275 	 * indirect descriptor table. Free the mp. I don't expect this ever
1276 	 * to happen.
1277 	 */
1278 	freemsg(mp);
1279 
1280 	return (B_TRUE);
1281 }
1282 
1283 static mblk_t *
1284 vioif_tx(void *arg, mblk_t *mp)
1285 {
1286 	struct vioif_softc *sc = arg;
1287 	mblk_t *nmp;
1288 
1289 	/*
1290 	 * Prior to attempting to send any more frames, do a reclaim to pick up
1291 	 * any descriptors which have been processed by the host.
1292 	 */
1293 	if (vq_num_used(sc->sc_tx_vq) != 0) {
1294 		(void) vioif_reclaim_used_tx(sc);
1295 	}
1296 
1297 	while (mp != NULL) {
1298 		nmp = mp->b_next;
1299 		mp->b_next = NULL;
1300 
1301 		if (!vioif_send(sc, mp)) {
1302 			/*
1303 			 * If there are no descriptors available, try to
1304 			 * reclaim some, allowing a retry of the send if some
1305 			 * are found.
1306 			 */
1307 			mp->b_next = nmp;
1308 			if (vioif_reclaim_used_tx(sc) != 0) {
1309 				continue;
1310 			}
1311 
1312 			/*
1313 			 * Otherwise, enable the TX ring interrupt so that as
1314 			 * soon as a descriptor becomes available, transmission
1315 			 * can begin again.  For safety, make sure the periodic
1316 			 * reclaim is running as well.
1317 			 */
1318 			mutex_enter(&sc->sc_tx_lock);
1319 			sc->sc_tx_corked = B_TRUE;
1320 			virtio_start_vq_intr(sc->sc_tx_vq);
1321 			vioif_reclaim_restart(sc);
1322 			mutex_exit(&sc->sc_tx_lock);
1323 			return (mp);
1324 		}
1325 		mp = nmp;
1326 	}
1327 
1328 	/* Ensure the periodic reclaim has been started. */
1329 	mutex_enter(&sc->sc_tx_lock);
1330 	vioif_reclaim_restart(sc);
1331 	mutex_exit(&sc->sc_tx_lock);
1332 
1333 	return (NULL);
1334 }
1335 
1336 static int
1337 vioif_start(void *arg)
1338 {
1339 	struct vioif_softc *sc = arg;
1340 	struct vq_entry *ve;
1341 	uint32_t len;
1342 
1343 	mac_link_update(sc->sc_mac_handle, vioif_link_state(sc));
1344 
1345 	virtio_start_vq_intr(sc->sc_rx_vq);
1346 
1347 	/*
1348 	 * Starting interrupts on the TX virtqueue is unnecessary at this time.
1349 	 * Descriptor reclamation is handling during transmit, via a periodic
1350 	 * timer, and when resources are tight, via the then-enabled interrupt.
1351 	 */
1352 	sc->sc_tx_drain = B_FALSE;
1353 
1354 	/*
1355 	 * Clear any data that arrived early on the receive queue and populate
1356 	 * it with free buffers that the device can use moving forward.
1357 	 */
1358 	while ((ve = virtio_pull_chain(sc->sc_rx_vq, &len)) != NULL) {
1359 		virtio_free_chain(ve);
1360 	}
1361 	(void) vioif_populate_rx(sc, KM_SLEEP);
1362 
1363 	return (DDI_SUCCESS);
1364 }
1365 
1366 static void
1367 vioif_stop(void *arg)
1368 {
1369 	struct vioif_softc *sc = arg;
1370 
1371 	/* Ensure all TX descriptors have been processed and reclaimed */
1372 	vioif_tx_drain(sc);
1373 
1374 	virtio_stop_vq_intr(sc->sc_rx_vq);
1375 }
1376 
1377 static int
1378 vioif_stat(void *arg, uint_t stat, uint64_t *val)
1379 {
1380 	struct vioif_softc *sc = arg;
1381 
1382 	switch (stat) {
1383 	case MAC_STAT_IERRORS:
1384 		*val = sc->sc_ierrors;
1385 		break;
1386 	case MAC_STAT_OERRORS:
1387 		*val = sc->sc_oerrors;
1388 		break;
1389 	case MAC_STAT_MULTIRCV:
1390 		*val = sc->sc_multircv;
1391 		break;
1392 	case MAC_STAT_BRDCSTRCV:
1393 		*val = sc->sc_brdcstrcv;
1394 		break;
1395 	case MAC_STAT_MULTIXMT:
1396 		*val = sc->sc_multixmt;
1397 		break;
1398 	case MAC_STAT_BRDCSTXMT:
1399 		*val = sc->sc_brdcstxmt;
1400 		break;
1401 	case MAC_STAT_IPACKETS:
1402 		*val = sc->sc_ipackets;
1403 		break;
1404 	case MAC_STAT_RBYTES:
1405 		*val = sc->sc_rbytes;
1406 		break;
1407 	case MAC_STAT_OPACKETS:
1408 		*val = sc->sc_opackets;
1409 		break;
1410 	case MAC_STAT_OBYTES:
1411 		*val = sc->sc_obytes;
1412 		break;
1413 	case MAC_STAT_NORCVBUF:
1414 		*val = sc->sc_norecvbuf;
1415 		break;
1416 	case MAC_STAT_NOXMTBUF:
1417 		*val = sc->sc_notxbuf;
1418 		break;
1419 	case MAC_STAT_IFSPEED:
1420 		/* always 1 Gbit */
1421 		*val = 1000000000ULL;
1422 		break;
1423 	case ETHER_STAT_LINK_DUPLEX:
1424 		/* virtual device, always full-duplex */
1425 		*val = LINK_DUPLEX_FULL;
1426 		break;
1427 
1428 	default:
1429 		return (ENOTSUP);
1430 	}
1431 
1432 	return (DDI_SUCCESS);
1433 }
1434 
1435 static int
1436 vioif_set_prop_private(struct vioif_softc *sc, const char *pr_name,
1437     uint_t pr_valsize, const void *pr_val)
1438 {
1439 	_NOTE(ARGUNUSED(pr_valsize));
1440 
1441 	long result;
1442 
1443 	if (strcmp(pr_name, vioif_txcopy_thresh) == 0) {
1444 
1445 		if (pr_val == NULL)
1446 			return (EINVAL);
1447 
1448 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
1449 
1450 		if (result < 0 || result > VIOIF_TX_THRESH_MAX)
1451 			return (EINVAL);
1452 		sc->sc_txcopy_thresh = result;
1453 	}
1454 	if (strcmp(pr_name, vioif_rxcopy_thresh) == 0) {
1455 
1456 		if (pr_val == NULL)
1457 			return (EINVAL);
1458 
1459 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
1460 
1461 		if (result < 0 || result > VIOIF_RX_THRESH_MAX)
1462 			return (EINVAL);
1463 		sc->sc_rxcopy_thresh = result;
1464 	}
1465 	return (0);
1466 }
1467 
1468 static int
1469 vioif_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1470     uint_t pr_valsize, const void *pr_val)
1471 {
1472 	struct vioif_softc *sc = arg;
1473 	const uint32_t *new_mtu;
1474 	int err;
1475 
1476 	switch (pr_num) {
1477 	case MAC_PROP_MTU:
1478 		new_mtu = pr_val;
1479 
1480 		if (*new_mtu > MAX_MTU) {
1481 			return (EINVAL);
1482 		}
1483 
1484 		err = mac_maxsdu_update(sc->sc_mac_handle, *new_mtu);
1485 		if (err) {
1486 			return (err);
1487 		}
1488 		break;
1489 	case MAC_PROP_PRIVATE:
1490 		err = vioif_set_prop_private(sc, pr_name,
1491 		    pr_valsize, pr_val);
1492 		if (err)
1493 			return (err);
1494 		break;
1495 	default:
1496 		return (ENOTSUP);
1497 	}
1498 
1499 	return (0);
1500 }
1501 
1502 static int
1503 vioif_get_prop_private(struct vioif_softc *sc, const char *pr_name,
1504     uint_t pr_valsize, void *pr_val)
1505 {
1506 	int err = ENOTSUP;
1507 	int value;
1508 
1509 	if (strcmp(pr_name, vioif_txcopy_thresh) == 0) {
1510 
1511 		value = sc->sc_txcopy_thresh;
1512 		err = 0;
1513 		goto done;
1514 	}
1515 	if (strcmp(pr_name, vioif_rxcopy_thresh) == 0) {
1516 
1517 		value = sc->sc_rxcopy_thresh;
1518 		err = 0;
1519 		goto done;
1520 	}
1521 done:
1522 	if (err == 0) {
1523 		(void) snprintf(pr_val, pr_valsize, "%d", value);
1524 	}
1525 	return (err);
1526 }
1527 
1528 static int
1529 vioif_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1530     uint_t pr_valsize, void *pr_val)
1531 {
1532 	struct vioif_softc *sc = arg;
1533 	int err = ENOTSUP;
1534 
1535 	switch (pr_num) {
1536 	case MAC_PROP_PRIVATE:
1537 		err = vioif_get_prop_private(sc, pr_name,
1538 		    pr_valsize, pr_val);
1539 		break;
1540 	default:
1541 		break;
1542 	}
1543 	return (err);
1544 }
1545 
1546 static void
1547 vioif_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1548     mac_prop_info_handle_t prh)
1549 {
1550 	struct vioif_softc *sc = arg;
1551 	char valstr[64];
1552 	int value;
1553 
1554 	switch (pr_num) {
1555 	case MAC_PROP_MTU:
1556 		mac_prop_info_set_range_uint32(prh, ETHERMIN, MAX_MTU);
1557 		break;
1558 
1559 	case MAC_PROP_PRIVATE:
1560 		bzero(valstr, sizeof (valstr));
1561 		if (strcmp(pr_name, vioif_txcopy_thresh) == 0) {
1562 			value = sc->sc_txcopy_thresh;
1563 		} else if (strcmp(pr_name, vioif_rxcopy_thresh) == 0) {
1564 			value = sc->sc_rxcopy_thresh;
1565 		} else {
1566 			return;
1567 		}
1568 		(void) snprintf(valstr, sizeof (valstr), "%d", value);
1569 		break;
1570 
1571 	default:
1572 		break;
1573 	}
1574 }
1575 
1576 static boolean_t
1577 vioif_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1578 {
1579 	struct vioif_softc *sc = arg;
1580 
1581 	switch (cap) {
1582 	case MAC_CAPAB_HCKSUM:
1583 		if (sc->sc_tx_csum) {
1584 			uint32_t *txflags = cap_data;
1585 
1586 			*txflags = HCKSUM_INET_PARTIAL;
1587 			return (B_TRUE);
1588 		}
1589 		return (B_FALSE);
1590 	case MAC_CAPAB_LSO:
1591 		if (sc->sc_tx_tso4) {
1592 			mac_capab_lso_t *cap_lso = cap_data;
1593 
1594 			cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
1595 			cap_lso->lso_basic_tcp_ipv4.lso_max = MAX_MTU;
1596 			return (B_TRUE);
1597 		}
1598 		return (B_FALSE);
1599 	default:
1600 		break;
1601 	}
1602 	return (B_FALSE);
1603 }
1604 
1605 static mac_callbacks_t vioif_m_callbacks = {
1606 	.mc_callbacks	= (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO),
1607 	.mc_getstat	= vioif_stat,
1608 	.mc_start	= vioif_start,
1609 	.mc_stop	= vioif_stop,
1610 	.mc_setpromisc	= vioif_promisc,
1611 	.mc_multicst	= vioif_multicst,
1612 	.mc_unicst	= vioif_unicst,
1613 	.mc_tx		= vioif_tx,
1614 	/* Optional callbacks */
1615 	.mc_reserved	= NULL,		/* reserved */
1616 	.mc_ioctl	= NULL,		/* mc_ioctl */
1617 	.mc_getcapab	= vioif_getcapab,		/* mc_getcapab */
1618 	.mc_open	= NULL,		/* mc_open */
1619 	.mc_close	= NULL,		/* mc_close */
1620 	.mc_setprop	= vioif_setprop,
1621 	.mc_getprop	= vioif_getprop,
1622 	.mc_propinfo	= vioif_propinfo,
1623 };
1624 
1625 static void
1626 vioif_show_features(struct vioif_softc *sc, const char *prefix,
1627     uint32_t features)
1628 {
1629 	char buf[512];
1630 	char *bufp = buf;
1631 	char *bufend = buf + sizeof (buf);
1632 
1633 	/* LINTED E_PTRDIFF_OVERFLOW */
1634 	bufp += snprintf(bufp, bufend - bufp, prefix);
1635 	/* LINTED E_PTRDIFF_OVERFLOW */
1636 	bufp += virtio_show_features(features, bufp, bufend - bufp);
1637 	*bufp = '\0';
1638 
1639 	/* Using '!' to only CE_NOTE this to the system log. */
1640 	dev_err(sc->sc_dev, CE_NOTE, "!%s Vioif (%b)", buf, features,
1641 	    VIRTIO_NET_FEATURE_BITS);
1642 }
1643 
1644 /*
1645  * Find out which features are supported by the device and
1646  * choose which ones we wish to use.
1647  */
1648 static int
1649 vioif_dev_features(struct vioif_softc *sc)
1650 {
1651 	uint32_t host_features;
1652 
1653 	host_features = virtio_negotiate_features(&sc->sc_virtio,
1654 	    VIRTIO_NET_F_CSUM |
1655 	    VIRTIO_NET_F_HOST_TSO4 |
1656 	    VIRTIO_NET_F_HOST_ECN |
1657 	    VIRTIO_NET_F_MAC |
1658 	    VIRTIO_NET_F_STATUS |
1659 	    VIRTIO_F_RING_INDIRECT_DESC);
1660 
1661 	vioif_show_features(sc, "Host features: ", host_features);
1662 	vioif_show_features(sc, "Negotiated features: ",
1663 	    sc->sc_virtio.sc_features);
1664 
1665 	if (!(sc->sc_virtio.sc_features & VIRTIO_F_RING_INDIRECT_DESC)) {
1666 		dev_err(sc->sc_dev, CE_WARN,
1667 		    "Host does not support RING_INDIRECT_DESC. Cannot attach.");
1668 		return (DDI_FAILURE);
1669 	}
1670 
1671 	return (DDI_SUCCESS);
1672 }
1673 
1674 static boolean_t
1675 vioif_has_feature(struct vioif_softc *sc, uint32_t feature)
1676 {
1677 	return (virtio_has_feature(&sc->sc_virtio, feature));
1678 }
1679 
1680 static void
1681 vioif_set_mac(struct vioif_softc *sc)
1682 {
1683 	int i;
1684 
1685 	for (i = 0; i < ETHERADDRL; i++) {
1686 		virtio_write_device_config_1(&sc->sc_virtio,
1687 		    VIRTIO_NET_CONFIG_MAC + i, sc->sc_mac[i]);
1688 	}
1689 	sc->sc_mac_from_host = 0;
1690 }
1691 
1692 /* Get the mac address out of the hardware, or make up one. */
1693 static void
1694 vioif_get_mac(struct vioif_softc *sc)
1695 {
1696 	int i;
1697 	if (sc->sc_virtio.sc_features & VIRTIO_NET_F_MAC) {
1698 		for (i = 0; i < ETHERADDRL; i++) {
1699 			sc->sc_mac[i] = virtio_read_device_config_1(
1700 			    &sc->sc_virtio,
1701 			    VIRTIO_NET_CONFIG_MAC + i);
1702 		}
1703 		sc->sc_mac_from_host = 1;
1704 	} else {
1705 		/* Get a few random bytes */
1706 		(void) random_get_pseudo_bytes(sc->sc_mac, ETHERADDRL);
1707 		/* Make sure it's a unicast MAC */
1708 		sc->sc_mac[0] &= ~1;
1709 		/* Set the "locally administered" bit */
1710 		sc->sc_mac[1] |= 2;
1711 
1712 		vioif_set_mac(sc);
1713 
1714 		dev_err(sc->sc_dev, CE_NOTE,
1715 		    "!Generated a random MAC address: %s",
1716 		    ether_sprintf((struct ether_addr *)sc->sc_mac));
1717 	}
1718 }
1719 
1720 /*
1721  * Virtqueue interrupt handlers
1722  */
1723 /* ARGSUSED */
1724 static uint_t
1725 vioif_rx_handler(caddr_t arg1, caddr_t arg2)
1726 {
1727 	struct virtio_softc *vsc = (void *) arg1;
1728 	struct vioif_softc *sc = __containerof(vsc,
1729 	    struct vioif_softc, sc_virtio);
1730 
1731 	/*
1732 	 * The return values of these functions are not needed but they make
1733 	 * debugging interrupts simpler because you can use them to detect when
1734 	 * stuff was processed and repopulated in this handler.
1735 	 */
1736 	(void) vioif_process_rx(sc);
1737 	(void) vioif_populate_rx(sc, KM_NOSLEEP);
1738 
1739 	return (DDI_INTR_CLAIMED);
1740 }
1741 
1742 /* ARGSUSED */
1743 static uint_t
1744 vioif_tx_handler(caddr_t arg1, caddr_t arg2)
1745 {
1746 	struct virtio_softc *vsc = (void *)arg1;
1747 	struct vioif_softc *sc = __containerof(vsc,
1748 	    struct vioif_softc, sc_virtio);
1749 
1750 	/*
1751 	 * The TX interrupt could race with other reclamation activity, so
1752 	 * interpreting the return value is unimportant.
1753 	 */
1754 	(void) vioif_reclaim_used_tx(sc);
1755 
1756 	return (DDI_INTR_CLAIMED);
1757 }
1758 
1759 static int
1760 vioif_register_ints(struct vioif_softc *sc)
1761 {
1762 	int ret;
1763 
1764 	struct virtio_int_handler vioif_vq_h[] = {
1765 		{ vioif_rx_handler },
1766 		{ vioif_tx_handler },
1767 		{ NULL }
1768 	};
1769 
1770 	ret = virtio_register_ints(&sc->sc_virtio, NULL, vioif_vq_h);
1771 
1772 	return (ret);
1773 }
1774 
1775 
1776 static void
1777 vioif_check_features(struct vioif_softc *sc)
1778 {
1779 	if (vioif_has_feature(sc, VIRTIO_NET_F_CSUM)) {
1780 		/* The GSO/GRO featured depend on CSUM, check them here. */
1781 		sc->sc_tx_csum = 1;
1782 		sc->sc_rx_csum = 1;
1783 
1784 		if (!vioif_has_feature(sc, VIRTIO_NET_F_GUEST_CSUM)) {
1785 			sc->sc_rx_csum = 0;
1786 		}
1787 		dev_err(sc->sc_dev, CE_NOTE, "!Csum enabled.");
1788 
1789 		if (vioif_has_feature(sc, VIRTIO_NET_F_HOST_TSO4)) {
1790 
1791 			sc->sc_tx_tso4 = 1;
1792 			/*
1793 			 * We don't seem to have a way to ask the system
1794 			 * not to send us LSO packets with Explicit
1795 			 * Congestion Notification bit set, so we require
1796 			 * the device to support it in order to do
1797 			 * LSO.
1798 			 */
1799 			if (!vioif_has_feature(sc, VIRTIO_NET_F_HOST_ECN)) {
1800 				dev_err(sc->sc_dev, CE_NOTE,
1801 				    "!TSO4 supported, but not ECN. "
1802 				    "Not using LSO.");
1803 				sc->sc_tx_tso4 = 0;
1804 			} else {
1805 				dev_err(sc->sc_dev, CE_NOTE, "!LSO enabled");
1806 			}
1807 		}
1808 	}
1809 }
1810 
1811 static int
1812 vioif_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
1813 {
1814 	int ret, instance;
1815 	struct vioif_softc *sc;
1816 	struct virtio_softc *vsc;
1817 	mac_register_t *macp;
1818 	char cache_name[CACHE_NAME_SIZE];
1819 
1820 	instance = ddi_get_instance(devinfo);
1821 
1822 	switch (cmd) {
1823 	case DDI_ATTACH:
1824 		break;
1825 
1826 	case DDI_RESUME:
1827 	case DDI_PM_RESUME:
1828 		/* We do not support suspend/resume for vioif. */
1829 		goto exit;
1830 
1831 	default:
1832 		goto exit;
1833 	}
1834 
1835 	sc = kmem_zalloc(sizeof (struct vioif_softc), KM_SLEEP);
1836 	ddi_set_driver_private(devinfo, sc);
1837 
1838 	vsc = &sc->sc_virtio;
1839 
1840 	/* Duplicate for less typing */
1841 	sc->sc_dev = devinfo;
1842 	vsc->sc_dev = devinfo;
1843 
1844 	/*
1845 	 * Initialize interrupt kstat.
1846 	 */
1847 	sc->sc_intrstat = kstat_create("vioif", instance, "intr", "controller",
1848 	    KSTAT_TYPE_INTR, 1, 0);
1849 	if (sc->sc_intrstat == NULL) {
1850 		dev_err(devinfo, CE_WARN, "kstat_create failed");
1851 		goto exit_intrstat;
1852 	}
1853 	kstat_install(sc->sc_intrstat);
1854 
1855 	/* map BAR 0 */
1856 	ret = ddi_regs_map_setup(devinfo, 1,
1857 	    (caddr_t *)&sc->sc_virtio.sc_io_addr,
1858 	    0, 0, &vioif_attr, &sc->sc_virtio.sc_ioh);
1859 	if (ret != DDI_SUCCESS) {
1860 		dev_err(devinfo, CE_WARN, "unable to map bar 0: %d", ret);
1861 		goto exit_map;
1862 	}
1863 
1864 	virtio_device_reset(&sc->sc_virtio);
1865 	virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_ACK);
1866 	virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_DRIVER);
1867 
1868 	ret = vioif_dev_features(sc);
1869 	if (ret)
1870 		goto exit_features;
1871 
1872 	vsc->sc_nvqs = vioif_has_feature(sc, VIRTIO_NET_F_CTRL_VQ) ? 3 : 2;
1873 
1874 	(void) snprintf(cache_name, CACHE_NAME_SIZE, "vioif%d_rx", instance);
1875 	sc->sc_rxbuf_cache = kmem_cache_create(cache_name,
1876 	    sizeof (struct vioif_rx_buf), 0, vioif_rx_construct,
1877 	    vioif_rx_destruct, NULL, sc, NULL, KM_SLEEP);
1878 	if (sc->sc_rxbuf_cache == NULL) {
1879 		dev_err(sc->sc_dev, CE_WARN, "Can't allocate the buffer cache");
1880 		goto exit_cache;
1881 	}
1882 
1883 	ret = vioif_register_ints(sc);
1884 	if (ret) {
1885 		dev_err(sc->sc_dev, CE_WARN,
1886 		    "Failed to allocate interrupt(s)!");
1887 		goto exit_ints;
1888 	}
1889 
1890 	/*
1891 	 * Register layout determined, can now access the
1892 	 * device-specific bits
1893 	 */
1894 	vioif_get_mac(sc);
1895 
1896 	sc->sc_rx_vq = virtio_alloc_vq(&sc->sc_virtio, 0,
1897 	    VIOIF_RX_QLEN, VIOIF_INDIRECT_MAX, "rx");
1898 	if (!sc->sc_rx_vq)
1899 		goto exit_alloc1;
1900 	virtio_stop_vq_intr(sc->sc_rx_vq);
1901 
1902 	sc->sc_tx_vq = virtio_alloc_vq(&sc->sc_virtio, 1,
1903 	    VIOIF_TX_QLEN, VIOIF_INDIRECT_MAX, "tx");
1904 	if (!sc->sc_tx_vq)
1905 		goto exit_alloc2;
1906 	virtio_stop_vq_intr(sc->sc_tx_vq);
1907 
1908 	mutex_init(&sc->sc_tx_lock, NULL, MUTEX_DRIVER,
1909 	    DDI_INTR_PRI(sc->sc_virtio.sc_intr_prio));
1910 
1911 	if (vioif_has_feature(sc, VIRTIO_NET_F_CTRL_VQ)) {
1912 		sc->sc_ctrl_vq = virtio_alloc_vq(&sc->sc_virtio, 2,
1913 		    VIOIF_CTRL_QLEN, 0, "ctrl");
1914 		if (!sc->sc_ctrl_vq) {
1915 			goto exit_alloc3;
1916 		}
1917 		virtio_stop_vq_intr(sc->sc_ctrl_vq);
1918 	}
1919 
1920 	virtio_set_status(&sc->sc_virtio,
1921 	    VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK);
1922 
1923 	sc->sc_rxloan = 0;
1924 
1925 	/* set some reasonable-small default values */
1926 	sc->sc_rxcopy_thresh = 300;
1927 	sc->sc_txcopy_thresh = 300;
1928 	sc->sc_mtu = ETHERMTU;
1929 
1930 	vioif_check_features(sc);
1931 
1932 	if (vioif_alloc_mems(sc) != 0)
1933 		goto exit_alloc_mems;
1934 
1935 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
1936 		dev_err(devinfo, CE_WARN, "Failed to allocate a mac_register");
1937 		goto exit_macalloc;
1938 	}
1939 
1940 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1941 	macp->m_driver = sc;
1942 	macp->m_dip = devinfo;
1943 	macp->m_src_addr = sc->sc_mac;
1944 	macp->m_callbacks = &vioif_m_callbacks;
1945 	macp->m_min_sdu = 0;
1946 	macp->m_max_sdu = sc->sc_mtu;
1947 	macp->m_margin = VLAN_TAGSZ;
1948 	macp->m_priv_props = vioif_priv_props;
1949 
1950 	sc->sc_macp = macp;
1951 
1952 	/* Pre-fill the rx ring. */
1953 	(void) vioif_populate_rx(sc, KM_SLEEP);
1954 
1955 	ret = mac_register(macp, &sc->sc_mac_handle);
1956 	if (ret != 0) {
1957 		dev_err(devinfo, CE_WARN, "vioif_attach: "
1958 		    "mac_register() failed, ret=%d", ret);
1959 		goto exit_register;
1960 	}
1961 
1962 	ret = virtio_enable_ints(&sc->sc_virtio);
1963 	if (ret) {
1964 		dev_err(devinfo, CE_WARN, "Failed to enable interrupts");
1965 		goto exit_enable_ints;
1966 	}
1967 
1968 	mac_link_update(sc->sc_mac_handle, LINK_STATE_UP);
1969 	return (DDI_SUCCESS);
1970 
1971 exit_enable_ints:
1972 	(void) mac_unregister(sc->sc_mac_handle);
1973 exit_register:
1974 	mac_free(macp);
1975 exit_macalloc:
1976 	vioif_free_mems(sc);
1977 exit_alloc_mems:
1978 	virtio_release_ints(&sc->sc_virtio);
1979 	if (sc->sc_ctrl_vq)
1980 		virtio_free_vq(sc->sc_ctrl_vq);
1981 exit_alloc3:
1982 	virtio_free_vq(sc->sc_tx_vq);
1983 exit_alloc2:
1984 	virtio_free_vq(sc->sc_rx_vq);
1985 exit_alloc1:
1986 exit_ints:
1987 	kmem_cache_destroy(sc->sc_rxbuf_cache);
1988 exit_cache:
1989 exit_features:
1990 	virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_FAILED);
1991 	ddi_regs_map_free(&sc->sc_virtio.sc_ioh);
1992 exit_intrstat:
1993 exit_map:
1994 	kstat_delete(sc->sc_intrstat);
1995 	kmem_free(sc, sizeof (struct vioif_softc));
1996 exit:
1997 	return (DDI_FAILURE);
1998 }
1999 
2000 static int
2001 vioif_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
2002 {
2003 	struct vioif_softc *sc;
2004 
2005 	if ((sc = ddi_get_driver_private(devinfo)) == NULL)
2006 		return (DDI_FAILURE);
2007 
2008 	switch (cmd) {
2009 	case DDI_DETACH:
2010 		break;
2011 
2012 	case DDI_PM_SUSPEND:
2013 		/* We do not support suspend/resume for vioif. */
2014 		return (DDI_FAILURE);
2015 
2016 	default:
2017 		return (DDI_FAILURE);
2018 	}
2019 
2020 	if (sc->sc_rxloan > 0) {
2021 		dev_err(devinfo, CE_WARN, "!Some rx buffers are still upstream,"
2022 		    " not detaching.");
2023 		return (DDI_FAILURE);
2024 	}
2025 
2026 	virtio_stop_vq_intr(sc->sc_rx_vq);
2027 	virtio_stop_vq_intr(sc->sc_tx_vq);
2028 
2029 	virtio_release_ints(&sc->sc_virtio);
2030 
2031 	if (mac_unregister(sc->sc_mac_handle)) {
2032 		return (DDI_FAILURE);
2033 	}
2034 
2035 	mac_free(sc->sc_macp);
2036 
2037 	vioif_free_mems(sc);
2038 	virtio_free_vq(sc->sc_rx_vq);
2039 	virtio_free_vq(sc->sc_tx_vq);
2040 
2041 	virtio_device_reset(&sc->sc_virtio);
2042 
2043 	ddi_regs_map_free(&sc->sc_virtio.sc_ioh);
2044 
2045 	kmem_cache_destroy(sc->sc_rxbuf_cache);
2046 	kstat_delete(sc->sc_intrstat);
2047 	kmem_free(sc, sizeof (struct vioif_softc));
2048 
2049 	return (DDI_SUCCESS);
2050 }
2051 
2052 static int
2053 vioif_quiesce(dev_info_t *devinfo)
2054 {
2055 	struct vioif_softc *sc;
2056 
2057 	if ((sc = ddi_get_driver_private(devinfo)) == NULL)
2058 		return (DDI_FAILURE);
2059 
2060 	virtio_stop_vq_intr(sc->sc_rx_vq);
2061 	virtio_stop_vq_intr(sc->sc_tx_vq);
2062 	virtio_device_reset(&sc->sc_virtio);
2063 
2064 	return (DDI_SUCCESS);
2065 }
2066 
2067 int
2068 _init(void)
2069 {
2070 	int ret = 0;
2071 
2072 	mac_init_ops(&vioif_ops, "vioif");
2073 
2074 	ret = mod_install(&modlinkage);
2075 	if (ret != DDI_SUCCESS) {
2076 		mac_fini_ops(&vioif_ops);
2077 		return (ret);
2078 	}
2079 
2080 	return (0);
2081 }
2082 
2083 int
2084 _fini(void)
2085 {
2086 	int ret;
2087 
2088 	ret = mod_remove(&modlinkage);
2089 	if (ret == DDI_SUCCESS) {
2090 		mac_fini_ops(&vioif_ops);
2091 	}
2092 
2093 	return (ret);
2094 }
2095 
2096 int
2097 _info(struct modinfo *pModinfo)
2098 {
2099 	return (mod_info(&modlinkage, pModinfo));
2100 }
2101