xref: /illumos-gate/usr/src/uts/common/xen/io/xnf.c (revision 8c69cc8fbe729fa7b091e901c4b50508ccc6bb33)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
29  */
30 
31 /*
32  *
33  * Copyright (c) 2004 Christian Limpach.
34  * All rights reserved.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. This section intentionally left blank.
45  * 4. The name of the author may not be used to endorse or promote products
46  *    derived from this software without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
49  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
50  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
51  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
52  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
53  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
54  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
55  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
56  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
57  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
58  */
59 /*
60  * Section 3 of the above license was updated in response to bug 6379571.
61  */
62 
63 /*
64  * xnf.c - GLDv3 network driver for domU.
65  */
66 
67 /*
68  * This driver uses four per-instance locks:
69  *
70  * xnf_gref_lock:
71  *
72  *    Protects access to the grant reference list stored in
73  *    xnf_gref_head. Grant references should be acquired and released
74  *    using gref_get() and gref_put() respectively.
75  *
76  * xnf_schedlock:
77  *
78  *    Protects:
79  *    xnf_need_sched - used to record that a previous transmit attempt
80  *       failed (and consequently it will be necessary to call
81  *       mac_tx_update() when transmit resources are available).
82  *    xnf_pending_multicast - the number of multicast requests that
83  *       have been submitted to the backend for which we have not
84  *       processed responses.
85  *
86  * xnf_txlock:
87  *
88  *    Protects the transmit ring (xnf_tx_ring) and associated
89  *    structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head).
90  *
91  * xnf_rxlock:
92  *
93  *    Protects the receive ring (xnf_rx_ring) and associated
94  *    structures (notably xnf_rx_pkt_info).
95  *
96  * If driver-global state that affects both the transmit and receive
97  * rings is manipulated, both xnf_txlock and xnf_rxlock should be
98  * held, in that order.
99  *
100  * xnf_schedlock is acquired both whilst holding xnf_txlock and
101  * without. It should always be acquired after xnf_txlock if both are
102  * held.
103  *
104  * Notes:
105  * - atomic_add_64() is used to manipulate counters where we require
106  *   accuracy. For counters intended only for observation by humans,
107  *   post increment/decrement are used instead.
108  */
109 
110 #include <sys/types.h>
111 #include <sys/errno.h>
112 #include <sys/param.h>
113 #include <sys/sysmacros.h>
114 #include <sys/systm.h>
115 #include <sys/stream.h>
116 #include <sys/strsubr.h>
117 #include <sys/strsun.h>
118 #include <sys/conf.h>
119 #include <sys/ddi.h>
120 #include <sys/devops.h>
121 #include <sys/sunddi.h>
122 #include <sys/sunndi.h>
123 #include <sys/dlpi.h>
124 #include <sys/ethernet.h>
125 #include <sys/strsun.h>
126 #include <sys/pattr.h>
127 #include <inet/ip.h>
128 #include <inet/ip_impl.h>
129 #include <inet/tcp.h>
130 #include <netinet/udp.h>
131 #include <sys/gld.h>
132 #include <sys/modctl.h>
133 #include <sys/mac_provider.h>
134 #include <sys/mac_ether.h>
135 #include <sys/bootinfo.h>
136 #include <sys/mach_mmu.h>
137 #ifdef	XPV_HVM_DRIVER
138 #include <sys/xpv_support.h>
139 #include <sys/hypervisor.h>
140 #else
141 #include <sys/hypervisor.h>
142 #include <sys/evtchn_impl.h>
143 #include <sys/balloon_impl.h>
144 #endif
145 #include <xen/public/io/netif.h>
146 #include <sys/gnttab.h>
147 #include <xen/sys/xendev.h>
148 #include <sys/sdt.h>
149 #include <sys/note.h>
150 #include <sys/debug.h>
151 
152 #include <io/xnf.h>
153 
/*
 * Debug support is compiled in for DEBUG and lint builds only.
 */
#if defined(DEBUG) || defined(__lint)
#define	XNF_DEBUG
#endif

#ifdef XNF_DEBUG
/* Bitmask of XNF_DEBUG_* flags selecting which debug output is printed. */
int xnf_debug = 0;
/* NOTE(review): presumably an instance pointer kept for debugger use. */
xnf_t *xnf_debug_instance = NULL;
#endif
162 
/*
 * On a 32 bit PAE system physical and machine addresses are larger
 * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 * argument, and so addresses above 4G are truncated before ddi_btop()
 * gets to see them.  To avoid this, code the shift operation here.
 *
 * Converts a byte address to a page number by shifting right by
 * PAGESHIFT; the result has the (promoted) type of `addr'.
 */
#define	xnf_btop(addr)	((addr) >> PAGESHIFT)
170 
/*
 * The parameters below should only be changed in /etc/system, never in mdb.
 */

/*
 * Should we use the multicast control feature if the backend provides
 * it?
 */
boolean_t xnf_multicast_control = B_TRUE;

/*
 * Should we allow scatter-gather for tx if backend allows it?
 */
boolean_t xnf_enable_tx_sg = B_TRUE;

/*
 * Should we allow scatter-gather for rx if backend allows it?
 */
boolean_t xnf_enable_rx_sg = B_TRUE;

/*
 * Should we allow lso for tx sends if backend allows it?
 * Requires xnf_enable_tx_sg to be also set to TRUE.
 */
boolean_t xnf_enable_lso = B_TRUE;

/*
 * Should we allow lro on rx if backend supports it?
 * Requires xnf_enable_rx_sg to be also set to TRUE.
 *
 * !! WARNING !!
 * LRO is not yet supported in the OS so this should be left as FALSE.
 * !! WARNING !!
 */
boolean_t xnf_enable_lro = B_FALSE;

/*
 * Received packets below this size (in bytes) are copied to a new
 * streams buffer rather than being desballoc'ed.
 *
 * This value is chosen to accommodate traffic where there are a large
 * number of small packets. For data showing a typical distribution,
 * see:
 *
 * Sinha07a:
 *	Rishi Sinha, Christos Papadopoulos, and John
 *	Heidemann. Internet Packet Size Distributions: Some
 *	Observations. Technical Report ISI-TR-2007-643,
 *	USC/Information Sciences Institute, May, 2007. Originally
 *	released October 2005 as web page
 *	http://netweb.usc.edu/~sinha/pkt-sizes/.
 *	<http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
 */
size_t xnf_rx_copy_limit = 64;
225 
/*
 * Sentinel values marking "no grant handle", "no grant reference" and
 * "no transmit id" respectively.
 */
#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define	INVALID_GRANT_REF	((grant_ref_t)-1)
#define	INVALID_TX_ID		((uint16_t)-1)

/* Map transmit id `id' to its xnf_txid_t entry in instance `p'. */
#define	TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
/* A transmit id is valid if it is not the sentinel and indexes the ring. */
#define	TX_ID_VALID(i) \
	(((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))

/*
 * calculate how many pages are spanned by an mblk fragment
 *
 * An empty fragment (MBLKL() == 0) spans no pages. Otherwise the count
 * is the page number of the last data byte (b_wptr - 1) minus the page
 * number of the first (b_rptr), plus one. The argument is parenthesized
 * at every use so that any pointer expression may be passed; note that
 * it is evaluated more than once.
 */
#define	xnf_mblk_pages(mp)	(MBLKL(mp) == 0 ? 0 : \
    xnf_btop((uintptr_t)(mp)->b_wptr - 1) - \
    xnf_btop((uintptr_t)(mp)->b_rptr) + 1)
239 
/* Required system entry points (passed to DDI_DEFINE_STREAM_OPS below) */
static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);

/*
 * Required driver entry points for Nemo. Apart from xnf_intr(), which is
 * the interrupt handler, these are wired into xnf_callbacks below.
 */
static int	xnf_start(void *);
static void	xnf_stop(void *);
static int	xnf_set_mac_addr(void *, const uint8_t *);
static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
static int	xnf_set_promiscuous(void *, boolean_t);
static mblk_t	*xnf_send(void *, mblk_t *);
static uint_t	xnf_intr(caddr_t);
static int	xnf_stat(void *, uint_t, uint64_t *);
static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
static int xnf_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
static int xnf_setprop(void *, const char *, mac_prop_id_t, uint_t,
    const void *);
static void xnf_propinfo(void *, const char *, mac_prop_id_t,
    mac_prop_info_handle_t);

/* Driver private functions */
static int xnf_alloc_dma_resources(xnf_t *);
static void xnf_release_dma_resources(xnf_t *);
static void xnf_release_mblks(xnf_t *);

/* Data buffer (xnf_buf_t) cache constructor/destructor and accessors. */
static int xnf_buf_constructor(void *, void *, int);
static void xnf_buf_destructor(void *, void *);
static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
#pragma inline(xnf_buf_get)
static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
#pragma inline(xnf_buf_put)
static void xnf_buf_refresh(xnf_buf_t *);
#pragma inline(xnf_buf_refresh)
static void xnf_buf_recycle(xnf_buf_t *);

/* Transmit buffer (xnf_txbuf_t) cache constructor/destructor. */
static int xnf_tx_buf_constructor(void *, void *, int);
static void xnf_tx_buf_destructor(void *, void *);

/* Grant reference acquire/release; both take xnf_gref_lock internally. */
static grant_ref_t xnf_gref_get(xnf_t *);
#pragma inline(xnf_gref_get)
static void xnf_gref_put(xnf_t *, grant_ref_t);
#pragma inline(xnf_gref_put)

/* Transmit id acquire/release; callers must hold xnf_txlock. */
static xnf_txid_t *xnf_txid_get(xnf_t *);
#pragma inline(xnf_txid_get)
static void xnf_txid_put(xnf_t *, xnf_txid_t *);
#pragma inline(xnf_txid_put)

static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
static int xnf_tx_clean_ring(xnf_t  *);
static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static boolean_t xnf_kstat_init(xnf_t *);
static void xnf_rx_collect(xnf_t *);
294 
/*
 * Optional MAC callbacks implemented by this driver: capability query
 * and the property interfaces.
 */
#define	XNF_CALLBACK_FLAGS	(MC_GETCAPAB | MC_PROPERTIES)

/*
 * MAC callback vector; installed as m_callbacks on the mac_register_t
 * built in xnf_attach().
 */
static mac_callbacks_t xnf_callbacks = {
	.mc_callbacks = XNF_CALLBACK_FLAGS,
	.mc_getstat = xnf_stat,
	.mc_start = xnf_start,
	.mc_stop = xnf_stop,
	.mc_setpromisc = xnf_set_promiscuous,
	.mc_multicst = xnf_set_multicast,
	.mc_unicst = xnf_set_mac_addr,
	.mc_tx = xnf_send,
	.mc_getcapab = xnf_getcapab,
	.mc_setprop = xnf_setprop,
	.mc_getprop = xnf_getprop,
	.mc_propinfo = xnf_propinfo,
};
311 
/*
 * DMA attributes for network ring buffer: no address restriction,
 * MMU_PAGESIZE alignment and a scatter-gather list length of one.
 */
static ddi_dma_attr_t ringbuf_dma_attr = {
	.dma_attr_version = DMA_ATTR_V0,
	.dma_attr_addr_lo = 0,
	.dma_attr_addr_hi = 0xffffffffffffffffULL,
	.dma_attr_count_max = 0x7fffffff,
	.dma_attr_align = MMU_PAGESIZE,	/* page aligned */
	.dma_attr_burstsizes = 0x7ff,
	.dma_attr_minxfer = 1,
	.dma_attr_maxxfer = 0xffffffffU,
	.dma_attr_seg = 0xffffffffffffffffULL,
	.dma_attr_sgllen = 1,		/* a single segment */
	.dma_attr_granular = 1,
	.dma_attr_flags = 0
};
327 
/*
 * DMA attributes for receive data: page-aligned allocations described
 * by a single segment, with the count limited to MMU_PAGEOFFSET.
 */
static ddi_dma_attr_t rx_buf_dma_attr = {
	.dma_attr_version = DMA_ATTR_V0,
	.dma_attr_addr_lo = 0,
	.dma_attr_addr_hi = 0xffffffffffffffffULL,
	.dma_attr_count_max = MMU_PAGEOFFSET,
	.dma_attr_align = MMU_PAGESIZE, /* allocation alignment */
	.dma_attr_burstsizes = 0x7ff,
	.dma_attr_minxfer = 1,
	.dma_attr_maxxfer = 0xffffffffU,
	.dma_attr_seg = 0xffffffffffffffffULL,
	.dma_attr_sgllen = 1,		/* a single segment */
	.dma_attr_granular = 1,
	.dma_attr_flags = 0
};
343 
/*
 * DMA attributes for transmit data: byte alignment is accepted, but
 * segments are bounded by XEN_DATA_BOUNDARY and at most
 * XEN_MAX_TX_DATA_PAGES segments may be produced per binding.
 */
static ddi_dma_attr_t tx_buf_dma_attr = {
	.dma_attr_version = DMA_ATTR_V0,
	.dma_attr_addr_lo = 0,
	.dma_attr_addr_hi = 0xffffffffffffffffULL,
	.dma_attr_count_max = MMU_PAGEOFFSET,
	.dma_attr_align = 1,		/* no alignment requirement */
	.dma_attr_burstsizes = 0x7ff,
	.dma_attr_minxfer = 1,
	.dma_attr_maxxfer = 0xffffffffU,
	.dma_attr_seg = XEN_DATA_BOUNDARY - 1, /* segment boundary */
	.dma_attr_sgllen = XEN_MAX_TX_DATA_PAGES, /* max number of segments */
	.dma_attr_granular = 1,
	.dma_attr_flags = 0
};
359 
/*
 * DMA access attributes for registers and descriptors: little-endian
 * structure access with strict ordering.
 */
static ddi_device_acc_attr_t accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
	DDI_STRICTORDER_ACC
};
366 
/*
 * DMA access attributes for data: NOT to be byte swapped. Packet
 * payload is accessed as-is, with strict ordering.
 */
static ddi_device_acc_attr_t data_accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};
373 
/*
 * Device operations: only attach and detach are provided by the driver;
 * quiesce is explicitly not supported.
 */
DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
    nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);

/* Loadable driver module description. */
static struct modldrv xnf_modldrv = {
	&mod_driverops,
	"Virtual Ethernet driver",
	&xnf_dev_ops
};

/* Module linkage handed to mod_install() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, &xnf_modldrv, NULL
};
386 
387 int
388 _init(void)
389 {
390 	int r;
391 
392 	mac_init_ops(&xnf_dev_ops, "xnf");
393 	r = mod_install(&modlinkage);
394 	if (r != DDI_SUCCESS)
395 		mac_fini_ops(&xnf_dev_ops);
396 
397 	return (r);
398 }
399 
400 int
401 _fini(void)
402 {
403 	return (EBUSY); /* XXPV should be removable */
404 }
405 
406 int
407 _info(struct modinfo *modinfop)
408 {
409 	return (mod_info(&modlinkage, modinfop));
410 }
411 
412 /*
413  * Acquire a grant reference.
414  */
415 static grant_ref_t
416 xnf_gref_get(xnf_t *xnfp)
417 {
418 	grant_ref_t gref;
419 
420 	mutex_enter(&xnfp->xnf_gref_lock);
421 
422 	do {
423 		gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head);
424 
425 	} while ((gref == INVALID_GRANT_REF) &&
426 	    (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0));
427 
428 	mutex_exit(&xnfp->xnf_gref_lock);
429 
430 	if (gref == INVALID_GRANT_REF) {
431 		xnfp->xnf_stat_gref_failure++;
432 	} else {
433 		atomic_inc_64(&xnfp->xnf_stat_gref_outstanding);
434 		if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak)
435 			xnfp->xnf_stat_gref_peak =
436 			    xnfp->xnf_stat_gref_outstanding;
437 	}
438 
439 	return (gref);
440 }
441 
442 /*
443  * Release a grant reference.
444  */
445 static void
446 xnf_gref_put(xnf_t *xnfp, grant_ref_t gref)
447 {
448 	ASSERT(gref != INVALID_GRANT_REF);
449 
450 	mutex_enter(&xnfp->xnf_gref_lock);
451 	gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref);
452 	mutex_exit(&xnfp->xnf_gref_lock);
453 
454 	atomic_dec_64(&xnfp->xnf_stat_gref_outstanding);
455 }
456 
457 /*
458  * Acquire a transmit id.
459  */
460 static xnf_txid_t *
461 xnf_txid_get(xnf_t *xnfp)
462 {
463 	xnf_txid_t *tidp;
464 
465 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
466 
467 	if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID)
468 		return (NULL);
469 
470 	ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head));
471 
472 	tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head);
473 	xnfp->xnf_tx_pkt_id_head = tidp->next;
474 	tidp->next = INVALID_TX_ID;
475 
476 	ASSERT(tidp->txbuf == NULL);
477 
478 	return (tidp);
479 }
480 
481 /*
482  * Release a transmit id.
483  */
484 static void
485 xnf_txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
486 {
487 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
488 	ASSERT(TX_ID_VALID(tidp->id));
489 	ASSERT(tidp->next == INVALID_TX_ID);
490 
491 	tidp->txbuf = NULL;
492 	tidp->next = xnfp->xnf_tx_pkt_id_head;
493 	xnfp->xnf_tx_pkt_id_head = tidp->id;
494 }
495 
496 static void
497 xnf_data_txbuf_free(xnf_t *xnfp, xnf_txbuf_t *txp)
498 {
499 	ASSERT3U(txp->tx_type, ==, TX_DATA);
500 
501 	/*
502 	 * We are either using a lookaside buffer or we are mapping existing
503 	 * buffers.
504 	 */
505 	if (txp->tx_bdesc != NULL) {
506 		ASSERT(!txp->tx_handle_bound);
507 		xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
508 	} else {
509 		if (txp->tx_txreq.gref != INVALID_GRANT_REF) {
510 			if (gnttab_query_foreign_access(txp->tx_txreq.gref) !=
511 			    0) {
512 				cmn_err(CE_PANIC, "tx grant %d still in use by "
513 				    "backend domain", txp->tx_txreq.gref);
514 			}
515 			(void) gnttab_end_foreign_access_ref(
516 			    txp->tx_txreq.gref, 1);
517 			xnf_gref_put(xnfp, txp->tx_txreq.gref);
518 		}
519 
520 		if (txp->tx_handle_bound)
521 			(void) ddi_dma_unbind_handle(txp->tx_dma_handle);
522 	}
523 
524 	if (txp->tx_mp != NULL)
525 		freemsg(txp->tx_mp);
526 
527 	if (txp->tx_prev != NULL) {
528 		ASSERT3P(txp->tx_prev->tx_next, ==, txp);
529 		txp->tx_prev->tx_next = NULL;
530 	}
531 
532 	if (txp->tx_txreq.id != INVALID_TX_ID) {
533 		/*
534 		 * This should be only possible when resuming from a suspend.
535 		 */
536 		ASSERT(!xnfp->xnf_connected);
537 		xnf_txid_put(xnfp, TX_ID_TO_TXID(xnfp, txp->tx_txreq.id));
538 		txp->tx_txreq.id = INVALID_TX_ID;
539 	}
540 
541 	kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
542 }
543 
544 static void
545 xnf_data_txbuf_free_chain(xnf_t *xnfp, xnf_txbuf_t *txp)
546 {
547 	if (txp == NULL)
548 		return;
549 
550 	while (txp->tx_next != NULL)
551 		txp = txp->tx_next;
552 
553 	/*
554 	 * We free the chain in reverse order so that grants can be released
555 	 * for all dma chunks before unbinding the dma handles. The mblk is
556 	 * freed last, after all its fragments' dma handles are unbound.
557 	 */
558 	xnf_txbuf_t *prev;
559 	for (; txp != NULL; txp = prev) {
560 		prev = txp->tx_prev;
561 		xnf_data_txbuf_free(xnfp, txp);
562 	}
563 }
564 
565 static xnf_txbuf_t *
566 xnf_data_txbuf_alloc(xnf_t *xnfp)
567 {
568 	xnf_txbuf_t *txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
569 	txp->tx_type = TX_DATA;
570 	txp->tx_next = NULL;
571 	txp->tx_prev = NULL;
572 	txp->tx_head = txp;
573 	txp->tx_frags_to_ack = 0;
574 	txp->tx_mp = NULL;
575 	txp->tx_bdesc = NULL;
576 	txp->tx_handle_bound = B_FALSE;
577 	txp->tx_txreq.gref = INVALID_GRANT_REF;
578 	txp->tx_txreq.id = INVALID_TX_ID;
579 
580 	return (txp);
581 }
582 
583 /*
584  * Get `wanted' slots in the transmit ring, waiting for at least that
585  * number if `wait' is B_TRUE. Force the ring to be cleaned by setting
586  * `wanted' to zero.
587  *
588  * Return the number of slots available.
589  */
590 static int
591 xnf_tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
592 {
593 	int slotsfree;
594 	boolean_t forced_clean = (wanted == 0);
595 
596 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
597 
598 	/* LINTED: constant in conditional context */
599 	while (B_TRUE) {
600 		slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring);
601 
602 		if ((slotsfree < wanted) || forced_clean)
603 			slotsfree = xnf_tx_clean_ring(xnfp);
604 
605 		/*
606 		 * If there are more than we need free, tell other
607 		 * people to come looking again. We hold txlock, so we
608 		 * are able to take our slots before anyone else runs.
609 		 */
610 		if (slotsfree > wanted)
611 			cv_broadcast(&xnfp->xnf_cv_tx_slots);
612 
613 		if (slotsfree >= wanted)
614 			break;
615 
616 		if (!wait)
617 			break;
618 
619 		cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock);
620 	}
621 
622 	ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring)));
623 
624 	return (slotsfree);
625 }
626 
627 static int
628 xnf_setup_rings(xnf_t *xnfp)
629 {
630 	domid_t			oeid;
631 	struct xenbus_device	*xsd;
632 	RING_IDX		i;
633 	int			err;
634 	xnf_txid_t		*tidp;
635 	xnf_buf_t **bdescp;
636 
637 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
638 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
639 
640 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
641 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
642 
643 	err = gnttab_grant_foreign_access(oeid,
644 	    xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
645 	if (err <= 0) {
646 		err = -err;
647 		xenbus_dev_error(xsd, err, "granting access to tx ring page");
648 		goto out;
649 	}
650 	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
651 
652 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
653 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
654 
655 	err = gnttab_grant_foreign_access(oeid,
656 	    xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
657 	if (err <= 0) {
658 		err = -err;
659 		xenbus_dev_error(xsd, err, "granting access to rx ring page");
660 		goto out;
661 	}
662 	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
663 
664 	mutex_enter(&xnfp->xnf_txlock);
665 
666 	/*
667 	 * We first cleanup the TX ring in case we are doing a resume.
668 	 * Note that this can lose packets, but we expect to stagger on.
669 	 */
670 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. */
671 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
672 	    i < NET_TX_RING_SIZE;
673 	    i++, tidp++) {
674 		xnf_txbuf_t *txp = tidp->txbuf;
675 		if (txp == NULL)
676 			continue;
677 
678 		switch (txp->tx_type) {
679 		case TX_DATA:
680 			/*
681 			 * txid_put() will be called for each txbuf's txid in
682 			 * the chain which will result in clearing tidp->txbuf.
683 			 */
684 			xnf_data_txbuf_free_chain(xnfp, txp);
685 
686 			break;
687 
688 		case TX_MCAST_REQ:
689 			txp->tx_type = TX_MCAST_RSP;
690 			txp->tx_status = NETIF_RSP_DROPPED;
691 			cv_broadcast(&xnfp->xnf_cv_multicast);
692 
693 			/*
694 			 * The request consumed two slots in the ring,
695 			 * yet only a single xnf_txid_t is used. Step
696 			 * over the empty slot.
697 			 */
698 			i++;
699 			ASSERT3U(i, <, NET_TX_RING_SIZE);
700 			break;
701 
702 		case TX_MCAST_RSP:
703 			break;
704 		}
705 	}
706 
707 	/*
708 	 * Now purge old list and add each txid to the new free list.
709 	 */
710 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. */
711 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
712 	    i < NET_TX_RING_SIZE;
713 	    i++, tidp++) {
714 		tidp->id = i;
715 		ASSERT3P(tidp->txbuf, ==, NULL);
716 		tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
717 		xnf_txid_put(xnfp, tidp);
718 	}
719 
720 	/* LINTED: constant in conditional context */
721 	SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
722 	/* LINTED: constant in conditional context */
723 	FRONT_RING_INIT(&xnfp->xnf_tx_ring,
724 	    xnfp->xnf_tx_ring.sring, PAGESIZE);
725 
726 	mutex_exit(&xnfp->xnf_txlock);
727 
728 	mutex_enter(&xnfp->xnf_rxlock);
729 
730 	/*
731 	 * Clean out any buffers currently posted to the receive ring
732 	 * before we reset it.
733 	 */
734 	for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0];
735 	    i < NET_RX_RING_SIZE;
736 	    i++, bdescp++) {
737 		if (*bdescp != NULL) {
738 			xnf_buf_put(xnfp, *bdescp, B_FALSE);
739 			*bdescp = NULL;
740 		}
741 	}
742 
743 	/* LINTED: constant in conditional context */
744 	SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
745 	/* LINTED: constant in conditional context */
746 	FRONT_RING_INIT(&xnfp->xnf_rx_ring,
747 	    xnfp->xnf_rx_ring.sring, PAGESIZE);
748 
749 	/*
750 	 * Fill the ring with buffers.
751 	 */
752 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
753 		xnf_buf_t *bdesc;
754 
755 		bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE);
756 		VERIFY(bdesc != NULL);
757 		xnf_rxbuf_hang(xnfp, bdesc);
758 	}
759 
760 	/* LINTED: constant in conditional context */
761 	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
762 
763 	mutex_exit(&xnfp->xnf_rxlock);
764 
765 	return (0);
766 
767 out:
768 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
769 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
770 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
771 
772 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
773 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
774 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
775 
776 	return (err);
777 }
778 
779 /*
780  * Connect driver to back end, called to set up communication with
781  * back end driver both initially and on resume after restore/migrate.
782  */
783 void
784 xnf_be_connect(xnf_t *xnfp)
785 {
786 	const char	*message;
787 	xenbus_transaction_t xbt;
788 	struct		xenbus_device *xsd;
789 	char		*xsname;
790 	int		err;
791 
792 	ASSERT(!xnfp->xnf_connected);
793 
794 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
795 	xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
796 
797 	err = xnf_setup_rings(xnfp);
798 	if (err != 0) {
799 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
800 		xenbus_dev_error(xsd, err, "setting up ring");
801 		return;
802 	}
803 
804 again:
805 	err = xenbus_transaction_start(&xbt);
806 	if (err != 0) {
807 		xenbus_dev_error(xsd, EIO, "starting transaction");
808 		return;
809 	}
810 
811 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
812 	    xnfp->xnf_tx_ring_ref);
813 	if (err != 0) {
814 		message = "writing tx ring-ref";
815 		goto abort_transaction;
816 	}
817 
818 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
819 	    xnfp->xnf_rx_ring_ref);
820 	if (err != 0) {
821 		message = "writing rx ring-ref";
822 		goto abort_transaction;
823 	}
824 
825 	err = xenbus_printf(xbt, xsname, "event-channel", "%u",
826 	    xnfp->xnf_evtchn);
827 	if (err != 0) {
828 		message = "writing event-channel";
829 		goto abort_transaction;
830 	}
831 
832 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
833 	if (err != 0) {
834 		message = "writing feature-rx-notify";
835 		goto abort_transaction;
836 	}
837 
838 	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1);
839 	if (err != 0) {
840 		message = "writing request-rx-copy";
841 		goto abort_transaction;
842 	}
843 
844 	if (xnfp->xnf_be_mcast_control) {
845 		err = xenbus_printf(xbt, xsname, "request-multicast-control",
846 		    "%d", 1);
847 		if (err != 0) {
848 			message = "writing request-multicast-control";
849 			goto abort_transaction;
850 		}
851 	}
852 
853 	/*
854 	 * Tell backend if we support scatter-gather lists on the rx side.
855 	 */
856 	err = xenbus_printf(xbt, xsname, "feature-sg", "%d",
857 	    xnf_enable_rx_sg ? 1 : 0);
858 	if (err != 0) {
859 		message = "writing feature-sg";
860 		goto abort_transaction;
861 	}
862 
863 	/*
864 	 * Tell backend if we support LRO for IPv4. Scatter-gather on rx is
865 	 * a prerequisite.
866 	 */
867 	err = xenbus_printf(xbt, xsname, "feature-gso-tcpv4", "%d",
868 	    (xnf_enable_rx_sg && xnf_enable_lro) ? 1 : 0);
869 	if (err != 0) {
870 		message = "writing feature-gso-tcpv4";
871 		goto abort_transaction;
872 	}
873 
874 	err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
875 	if (err != 0) {
876 		message = "switching state to XenbusStateConnected";
877 		goto abort_transaction;
878 	}
879 
880 	err = xenbus_transaction_end(xbt, 0);
881 	if (err != 0) {
882 		if (err == EAGAIN)
883 			goto again;
884 		xenbus_dev_error(xsd, err, "completing transaction");
885 	}
886 
887 	return;
888 
889 abort_transaction:
890 	(void) xenbus_transaction_end(xbt, 1);
891 	xenbus_dev_error(xsd, err, "%s", message);
892 }
893 
894 /*
895  * Read configuration information from xenstore.
896  */
897 void
898 xnf_read_config(xnf_t *xnfp)
899 {
900 	int err, be_cap;
901 	char mac[ETHERADDRL * 3];
902 	char *oename = xvdi_get_oename(xnfp->xnf_devinfo);
903 
904 	err = xenbus_scanf(XBT_NULL, oename, "mac",
905 	    "%s", (char *)&mac[0]);
906 	if (err != 0) {
907 		/*
908 		 * bad: we're supposed to be set up with a proper mac
909 		 * addr. at this point
910 		 */
911 		cmn_err(CE_WARN, "%s%d: no mac address",
912 		    ddi_driver_name(xnfp->xnf_devinfo),
913 		    ddi_get_instance(xnfp->xnf_devinfo));
914 			return;
915 	}
916 	if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
917 		err = ENOENT;
918 		xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
919 		    "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
920 		return;
921 	}
922 
923 	err = xenbus_scanf(XBT_NULL, oename,
924 	    "feature-rx-copy", "%d", &be_cap);
925 	/*
926 	 * If we fail to read the store we assume that the key is
927 	 * absent, implying an older domain at the far end.  Older
928 	 * domains cannot do HV copy.
929 	 */
930 	if (err != 0)
931 		be_cap = 0;
932 	xnfp->xnf_be_rx_copy = (be_cap != 0);
933 
934 	err = xenbus_scanf(XBT_NULL, oename,
935 	    "feature-multicast-control", "%d", &be_cap);
936 	/*
937 	 * If we fail to read the store we assume that the key is
938 	 * absent, implying an older domain at the far end.  Older
939 	 * domains do not support multicast control.
940 	 */
941 	if (err != 0)
942 		be_cap = 0;
943 	xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
944 
945 	/*
946 	 * See if back-end supports scatter-gather for transmits. If not,
947 	 * we will not support LSO and limit the mtu to 1500.
948 	 */
949 	err = xenbus_scanf(XBT_NULL, oename, "feature-sg", "%d", &be_cap);
950 	if (err != 0) {
951 		be_cap = 0;
952 		dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
953 		    "'feature-sg' from backend driver");
954 	}
955 	if (be_cap == 0) {
956 		dev_err(xnfp->xnf_devinfo, CE_WARN, "scatter-gather is not "
957 		    "supported for transmits in the backend driver. LSO is "
958 		    "disabled and MTU is restricted to 1500 bytes.");
959 	}
960 	xnfp->xnf_be_tx_sg = (be_cap != 0) && xnf_enable_tx_sg;
961 
962 	if (xnfp->xnf_be_tx_sg) {
963 		/*
964 		 * Check if LSO is supported. Currently we only check for
965 		 * IPv4 as Illumos doesn't support LSO for IPv6.
966 		 */
967 		err = xenbus_scanf(XBT_NULL, oename, "feature-gso-tcpv4", "%d",
968 		    &be_cap);
969 		if (err != 0) {
970 			be_cap = 0;
971 			dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
972 			    "'feature-gso-tcpv4' from backend driver");
973 		}
974 		if (be_cap == 0) {
975 			dev_err(xnfp->xnf_devinfo, CE_WARN, "LSO is not "
976 			    "supported by the backend driver. Performance "
977 			    "will be affected.");
978 		}
979 		xnfp->xnf_be_lso = (be_cap != 0) && xnf_enable_lso;
980 	}
981 }
982 
983 /*
984  *  attach(9E) -- Attach a device to the system
985  */
986 static int
987 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
988 {
989 	mac_register_t *macp;
990 	xnf_t *xnfp;
991 	int err;
992 	char cachename[32];
993 
994 #ifdef XNF_DEBUG
995 	if (xnf_debug & XNF_DEBUG_DDI)
996 		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
997 		    (void *)devinfo);
998 #endif
999 
1000 	switch (cmd) {
1001 	case DDI_RESUME:
1002 		xnfp = ddi_get_driver_private(devinfo);
1003 		xnfp->xnf_gen++;
1004 
1005 		(void) xvdi_resume(devinfo);
1006 		(void) xvdi_alloc_evtchn(devinfo);
1007 		xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
1008 #ifdef XPV_HVM_DRIVER
1009 		ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
1010 		    xnfp);
1011 #else
1012 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
1013 		    (caddr_t)xnfp);
1014 #endif
1015 		return (DDI_SUCCESS);
1016 
1017 	case DDI_ATTACH:
1018 		break;
1019 
1020 	default:
1021 		return (DDI_FAILURE);
1022 	}
1023 
1024 	/*
1025 	 *  Allocate gld_mac_info_t and xnf_instance structures
1026 	 */
1027 	macp = mac_alloc(MAC_VERSION);
1028 	if (macp == NULL)
1029 		return (DDI_FAILURE);
1030 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
1031 
1032 	xnfp->xnf_tx_pkt_id =
1033 	    kmem_zalloc(sizeof (xnf_txid_t) * NET_TX_RING_SIZE, KM_SLEEP);
1034 
1035 	xnfp->xnf_rx_pkt_info =
1036 	    kmem_zalloc(sizeof (xnf_buf_t *) * NET_RX_RING_SIZE, KM_SLEEP);
1037 
1038 	macp->m_dip = devinfo;
1039 	macp->m_driver = xnfp;
1040 	xnfp->xnf_devinfo = devinfo;
1041 
1042 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1043 	macp->m_src_addr = xnfp->xnf_mac_addr;
1044 	macp->m_callbacks = &xnf_callbacks;
1045 	macp->m_min_sdu = 0;
1046 	xnfp->xnf_mtu = ETHERMTU;
1047 	macp->m_max_sdu = xnfp->xnf_mtu;
1048 
1049 	xnfp->xnf_running = B_FALSE;
1050 	xnfp->xnf_connected = B_FALSE;
1051 	xnfp->xnf_be_rx_copy = B_FALSE;
1052 	xnfp->xnf_be_mcast_control = B_FALSE;
1053 	xnfp->xnf_need_sched = B_FALSE;
1054 
1055 	xnfp->xnf_rx_head = NULL;
1056 	xnfp->xnf_rx_tail = NULL;
1057 	xnfp->xnf_rx_new_buffers_posted = B_FALSE;
1058 
1059 #ifdef XPV_HVM_DRIVER
1060 	/*
1061 	 * Report our version to dom0.
1062 	 */
1063 	if (xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
1064 	    HVMPV_XNF_VERS))
1065 		cmn_err(CE_WARN, "xnf: couldn't write version\n");
1066 #endif
1067 
1068 	/*
1069 	 * Get the iblock cookie with which to initialize the mutexes.
1070 	 */
1071 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
1072 	    != DDI_SUCCESS)
1073 		goto failure;
1074 
1075 	mutex_init(&xnfp->xnf_txlock,
1076 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1077 	mutex_init(&xnfp->xnf_rxlock,
1078 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1079 	mutex_init(&xnfp->xnf_schedlock,
1080 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1081 	mutex_init(&xnfp->xnf_gref_lock,
1082 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1083 
1084 	cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL);
1085 	cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL);
1086 	cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL);
1087 
1088 	(void) sprintf(cachename, "xnf_buf_cache_%d",
1089 	    ddi_get_instance(devinfo));
1090 	xnfp->xnf_buf_cache = kmem_cache_create(cachename,
1091 	    sizeof (xnf_buf_t), 0,
1092 	    xnf_buf_constructor, xnf_buf_destructor,
1093 	    NULL, xnfp, NULL, 0);
1094 	if (xnfp->xnf_buf_cache == NULL)
1095 		goto failure_0;
1096 
1097 	(void) sprintf(cachename, "xnf_tx_buf_cache_%d",
1098 	    ddi_get_instance(devinfo));
1099 	xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename,
1100 	    sizeof (xnf_txbuf_t), 0,
1101 	    xnf_tx_buf_constructor, xnf_tx_buf_destructor,
1102 	    NULL, xnfp, NULL, 0);
1103 	if (xnfp->xnf_tx_buf_cache == NULL)
1104 		goto failure_1;
1105 
1106 	xnfp->xnf_gref_head = INVALID_GRANT_REF;
1107 
1108 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
1109 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
1110 		    "driver data structures",
1111 		    ddi_get_instance(xnfp->xnf_devinfo));
1112 		goto failure_2;
1113 	}
1114 
1115 	xnfp->xnf_rx_ring.sring->rsp_event =
1116 	    xnfp->xnf_tx_ring.sring->rsp_event = 1;
1117 
1118 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
1119 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
1120 
1121 	/* set driver private pointer now */
1122 	ddi_set_driver_private(devinfo, xnfp);
1123 
1124 	if (!xnf_kstat_init(xnfp))
1125 		goto failure_3;
1126 
1127 	/*
1128 	 * Allocate an event channel, add the interrupt handler and
1129 	 * bind it to the event channel.
1130 	 */
1131 	(void) xvdi_alloc_evtchn(devinfo);
1132 	xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
1133 #ifdef XPV_HVM_DRIVER
1134 	ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
1135 #else
1136 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
1137 #endif
1138 
1139 	err = mac_register(macp, &xnfp->xnf_mh);
1140 	mac_free(macp);
1141 	macp = NULL;
1142 	if (err != 0)
1143 		goto failure_4;
1144 
1145 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
1146 	    != DDI_SUCCESS)
1147 		goto failure_5;
1148 
1149 #ifdef XPV_HVM_DRIVER
1150 	/*
1151 	 * In the HVM case, this driver essentially replaces a driver for
1152 	 * a 'real' PCI NIC. Without the "model" property set to
1153 	 * "Ethernet controller", like the PCI code does, netbooting does
1154 	 * not work correctly, as strplumb_get_netdev_path() will not find
1155 	 * this interface.
1156 	 */
1157 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
1158 	    "Ethernet controller");
1159 #endif
1160 
1161 #ifdef XNF_DEBUG
1162 	if (xnf_debug_instance == NULL)
1163 		xnf_debug_instance = xnfp;
1164 #endif
1165 
1166 	return (DDI_SUCCESS);
1167 
1168 failure_5:
1169 	(void) mac_unregister(xnfp->xnf_mh);
1170 
1171 failure_4:
1172 #ifdef XPV_HVM_DRIVER
1173 	ec_unbind_evtchn(xnfp->xnf_evtchn);
1174 	xvdi_free_evtchn(devinfo);
1175 #else
1176 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1177 #endif
1178 	xnfp->xnf_evtchn = INVALID_EVTCHN;
1179 	kstat_delete(xnfp->xnf_kstat_aux);
1180 
1181 failure_3:
1182 	xnf_release_dma_resources(xnfp);
1183 
1184 failure_2:
1185 	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1186 
1187 failure_1:
1188 	kmem_cache_destroy(xnfp->xnf_buf_cache);
1189 
1190 failure_0:
1191 	cv_destroy(&xnfp->xnf_cv_tx_slots);
1192 	cv_destroy(&xnfp->xnf_cv_multicast);
1193 	cv_destroy(&xnfp->xnf_cv_state);
1194 
1195 	mutex_destroy(&xnfp->xnf_gref_lock);
1196 	mutex_destroy(&xnfp->xnf_schedlock);
1197 	mutex_destroy(&xnfp->xnf_rxlock);
1198 	mutex_destroy(&xnfp->xnf_txlock);
1199 
1200 failure:
1201 	kmem_free(xnfp, sizeof (*xnfp));
1202 	if (macp != NULL)
1203 		mac_free(macp);
1204 
1205 	return (DDI_FAILURE);
1206 }
1207 
1208 /*  detach(9E) -- Detach a device from the system */
1209 static int
1210 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
1211 {
1212 	xnf_t *xnfp;		/* Our private device info */
1213 
1214 #ifdef XNF_DEBUG
1215 	if (xnf_debug & XNF_DEBUG_DDI)
1216 		printf("xnf_detach(0x%p)\n", (void *)devinfo);
1217 #endif
1218 
1219 	xnfp = ddi_get_driver_private(devinfo);
1220 
1221 	switch (cmd) {
1222 	case DDI_SUSPEND:
1223 #ifdef XPV_HVM_DRIVER
1224 		ec_unbind_evtchn(xnfp->xnf_evtchn);
1225 		xvdi_free_evtchn(devinfo);
1226 #else
1227 		ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1228 #endif
1229 
1230 		xvdi_suspend(devinfo);
1231 
1232 		mutex_enter(&xnfp->xnf_rxlock);
1233 		mutex_enter(&xnfp->xnf_txlock);
1234 
1235 		xnfp->xnf_evtchn = INVALID_EVTCHN;
1236 		xnfp->xnf_connected = B_FALSE;
1237 		mutex_exit(&xnfp->xnf_txlock);
1238 		mutex_exit(&xnfp->xnf_rxlock);
1239 
1240 		/* claim link to be down after disconnect */
1241 		mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
1242 		return (DDI_SUCCESS);
1243 
1244 	case DDI_DETACH:
1245 		break;
1246 
1247 	default:
1248 		return (DDI_FAILURE);
1249 	}
1250 
1251 	if (xnfp->xnf_connected)
1252 		return (DDI_FAILURE);
1253 
1254 	/*
1255 	 * Cannot detach if we have xnf_buf_t outstanding.
1256 	 */
1257 	if (xnfp->xnf_stat_buf_allocated > 0)
1258 		return (DDI_FAILURE);
1259 
1260 	if (mac_unregister(xnfp->xnf_mh) != 0)
1261 		return (DDI_FAILURE);
1262 
1263 	kstat_delete(xnfp->xnf_kstat_aux);
1264 
1265 	/* Stop the receiver */
1266 	xnf_stop(xnfp);
1267 
1268 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
1269 
1270 	/* Remove the interrupt */
1271 #ifdef XPV_HVM_DRIVER
1272 	ec_unbind_evtchn(xnfp->xnf_evtchn);
1273 	xvdi_free_evtchn(devinfo);
1274 #else
1275 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1276 #endif
1277 
1278 	/* Release any pending xmit mblks */
1279 	xnf_release_mblks(xnfp);
1280 
1281 	/* Release all DMA resources */
1282 	xnf_release_dma_resources(xnfp);
1283 
1284 	cv_destroy(&xnfp->xnf_cv_tx_slots);
1285 	cv_destroy(&xnfp->xnf_cv_multicast);
1286 	cv_destroy(&xnfp->xnf_cv_state);
1287 
1288 	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1289 	kmem_cache_destroy(xnfp->xnf_buf_cache);
1290 
1291 	mutex_destroy(&xnfp->xnf_gref_lock);
1292 	mutex_destroy(&xnfp->xnf_schedlock);
1293 	mutex_destroy(&xnfp->xnf_rxlock);
1294 	mutex_destroy(&xnfp->xnf_txlock);
1295 
1296 	kmem_free(xnfp, sizeof (*xnfp));
1297 
1298 	return (DDI_SUCCESS);
1299 }
1300 
1301 /*
1302  *  xnf_set_mac_addr() -- set the physical network address on the board.
1303  */
1304 static int
1305 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
1306 {
1307 	_NOTE(ARGUNUSED(arg, macaddr));
1308 
1309 	/*
1310 	 * We can't set our macaddr.
1311 	 */
1312 	return (ENOTSUP);
1313 }
1314 
1315 /*
1316  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
1317  *
1318  *  Program the hardware to enable/disable the multicast address
1319  *  in "mca".  Enable if "add" is true, disable if false.
1320  */
1321 static int
1322 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
1323 {
1324 	xnf_t *xnfp = arg;
1325 	xnf_txbuf_t *txp;
1326 	int n_slots;
1327 	RING_IDX slot;
1328 	xnf_txid_t *tidp;
1329 	netif_tx_request_t *txrp;
1330 	struct netif_extra_info *erp;
1331 	boolean_t notify, result;
1332 
1333 	/*
1334 	 * If the backend does not support multicast control then we
1335 	 * must assume that the right packets will just arrive.
1336 	 */
1337 	if (!xnfp->xnf_be_mcast_control)
1338 		return (0);
1339 
1340 	txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
1341 
1342 	mutex_enter(&xnfp->xnf_txlock);
1343 
1344 	/*
1345 	 * If we're not yet connected then claim success. This is
1346 	 * acceptable because we refresh the entire set of multicast
1347 	 * addresses when we get connected.
1348 	 *
1349 	 * We can't wait around here because the MAC layer expects
1350 	 * this to be a non-blocking operation - waiting ends up
1351 	 * causing a deadlock during resume.
1352 	 */
1353 	if (!xnfp->xnf_connected) {
1354 		mutex_exit(&xnfp->xnf_txlock);
1355 		return (0);
1356 	}
1357 
1358 	/*
1359 	 * 1. Acquire two slots in the ring.
1360 	 * 2. Fill in the slots.
1361 	 * 3. Request notification when the operation is done.
1362 	 * 4. Kick the peer.
1363 	 * 5. Wait for the response via xnf_tx_clean_ring().
1364 	 */
1365 
1366 	n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE);
1367 	ASSERT(n_slots >= 2);
1368 
1369 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1370 	tidp = xnf_txid_get(xnfp);
1371 	VERIFY(tidp != NULL);
1372 
1373 	txp->tx_type = TX_MCAST_REQ;
1374 	txp->tx_slot = slot;
1375 
1376 	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1377 	erp = (struct netif_extra_info *)
1378 	    RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1);
1379 
1380 	txrp->gref = 0;
1381 	txrp->size = 0;
1382 	txrp->offset = 0;
1383 	/* Set tx_txreq.id to appease xnf_tx_clean_ring(). */
1384 	txrp->id = txp->tx_txreq.id = tidp->id;
1385 	txrp->flags = NETTXF_extra_info;
1386 
1387 	erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
1388 	    XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
1389 	bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL);
1390 
1391 	tidp->txbuf = txp;
1392 
1393 	xnfp->xnf_tx_ring.req_prod_pvt = slot + 2;
1394 
1395 	mutex_enter(&xnfp->xnf_schedlock);
1396 	xnfp->xnf_pending_multicast++;
1397 	mutex_exit(&xnfp->xnf_schedlock);
1398 
1399 	/* LINTED: constant in conditional context */
1400 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1401 	    notify);
1402 	if (notify)
1403 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1404 
1405 	while (txp->tx_type == TX_MCAST_REQ)
1406 		cv_wait(&xnfp->xnf_cv_multicast, &xnfp->xnf_txlock);
1407 
1408 	ASSERT3U(txp->tx_type, ==, TX_MCAST_RSP);
1409 
1410 	mutex_enter(&xnfp->xnf_schedlock);
1411 	xnfp->xnf_pending_multicast--;
1412 	mutex_exit(&xnfp->xnf_schedlock);
1413 
1414 	result = (txp->tx_status == NETIF_RSP_OKAY);
1415 
1416 	xnf_txid_put(xnfp, tidp);
1417 
1418 	mutex_exit(&xnfp->xnf_txlock);
1419 
1420 	kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1421 
1422 	return (result ? 0 : 1);
1423 }
1424 
1425 /*
1426  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
1427  *
1428  *  Program the hardware to enable/disable promiscuous mode.
1429  */
1430 static int
1431 xnf_set_promiscuous(void *arg, boolean_t on)
1432 {
1433 	_NOTE(ARGUNUSED(arg, on));
1434 
1435 	/*
1436 	 * We can't really do this, but we pretend that we can in
1437 	 * order that snoop will work.
1438 	 */
1439 	return (0);
1440 }
1441 
1442 /*
1443  * Clean buffers that we have responses for from the transmit ring.
1444  */
1445 static int
1446 xnf_tx_clean_ring(xnf_t *xnfp)
1447 {
1448 	boolean_t work_to_do;
1449 
1450 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1451 
1452 loop:
1453 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
1454 		RING_IDX cons, prod, i;
1455 
1456 		cons = xnfp->xnf_tx_ring.rsp_cons;
1457 		prod = xnfp->xnf_tx_ring.sring->rsp_prod;
1458 		membar_consumer();
1459 		/*
1460 		 * Clean tx requests from ring that we have responses
1461 		 * for.
1462 		 */
1463 		DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod);
1464 		for (i = cons; i != prod; i++) {
1465 			netif_tx_response_t *trp;
1466 			xnf_txid_t *tidp;
1467 			xnf_txbuf_t *txp;
1468 
1469 			trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
1470 			/*
1471 			 * if this slot was occupied by netif_extra_info_t,
1472 			 * then the response will be NETIF_RSP_NULL. In this
1473 			 * case there are no resources to clean up.
1474 			 */
1475 			if (trp->status == NETIF_RSP_NULL)
1476 				continue;
1477 
1478 			ASSERT(TX_ID_VALID(trp->id));
1479 
1480 			tidp = TX_ID_TO_TXID(xnfp, trp->id);
1481 			ASSERT3U(tidp->id, ==, trp->id);
1482 			ASSERT3U(tidp->next, ==, INVALID_TX_ID);
1483 
1484 			txp = tidp->txbuf;
1485 			ASSERT(txp != NULL);
1486 			ASSERT3U(txp->tx_txreq.id, ==, trp->id);
1487 
1488 			switch (txp->tx_type) {
1489 			case TX_DATA:
1490 				/*
1491 				 * We must put the txid for each response we
1492 				 * acknowledge to make sure that we never have
1493 				 * more free slots than txids. Because of this
1494 				 * we do it here instead of waiting for it to
1495 				 * be done in xnf_data_txbuf_free_chain().
1496 				 */
1497 				xnf_txid_put(xnfp, tidp);
1498 				txp->tx_txreq.id = INVALID_TX_ID;
1499 				ASSERT3S(txp->tx_head->tx_frags_to_ack, >, 0);
1500 				txp->tx_head->tx_frags_to_ack--;
1501 
1502 				/*
1503 				 * We clean the whole chain once we got a
1504 				 * response for each fragment.
1505 				 */
1506 				if (txp->tx_head->tx_frags_to_ack == 0)
1507 					xnf_data_txbuf_free_chain(xnfp, txp);
1508 
1509 				break;
1510 
1511 			case TX_MCAST_REQ:
1512 				txp->tx_type = TX_MCAST_RSP;
1513 				txp->tx_status = trp->status;
1514 				cv_broadcast(&xnfp->xnf_cv_multicast);
1515 
1516 				break;
1517 
1518 			default:
1519 				cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
1520 				    "invalid xnf_txbuf_t type: %d",
1521 				    txp->tx_type);
1522 				break;
1523 			}
1524 		}
1525 		/*
1526 		 * Record the last response we dealt with so that we
1527 		 * know where to start next time around.
1528 		 */
1529 		xnfp->xnf_tx_ring.rsp_cons = prod;
1530 		membar_enter();
1531 	}
1532 
1533 	/* LINTED: constant in conditional context */
1534 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
1535 	if (work_to_do)
1536 		goto loop;
1537 
1538 	return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
1539 }
1540 
1541 /*
1542  * Allocate and fill in a look-aside buffer for the packet `mp'. Used
1543  * to ensure that the packet is physically contiguous and contained
1544  * within a single page.
1545  */
1546 static xnf_buf_t *
1547 xnf_tx_get_lookaside(xnf_t *xnfp, mblk_t *mp, size_t *plen)
1548 {
1549 	xnf_buf_t *bd;
1550 	caddr_t bp;
1551 
1552 	bd = xnf_buf_get(xnfp, KM_SLEEP, B_TRUE);
1553 	if (bd == NULL)
1554 		return (NULL);
1555 
1556 	bp = bd->buf;
1557 	while (mp != NULL) {
1558 		size_t len = MBLKL(mp);
1559 
1560 		bcopy(mp->b_rptr, bp, len);
1561 		bp += len;
1562 
1563 		mp = mp->b_cont;
1564 	}
1565 
1566 	*plen = bp - bd->buf;
1567 	ASSERT3U(*plen, <=, PAGESIZE);
1568 
1569 	xnfp->xnf_stat_tx_lookaside++;
1570 
1571 	return (bd);
1572 }
1573 
1574 /*
1575  * Insert the pseudo-header checksum into the packet.
1576  * Assumes packet is IPv4, TCP/UDP since we only advertised support for
1577  * HCKSUM_INET_FULL_V4.
1578  */
1579 int
1580 xnf_pseudo_cksum(mblk_t *mp)
1581 {
1582 	struct ether_header *ehp;
1583 	uint16_t sap, iplen, *stuff;
1584 	uint32_t cksum;
1585 	size_t len;
1586 	ipha_t *ipha;
1587 	ipaddr_t src, dst;
1588 	uchar_t *ptr;
1589 
1590 	ptr = mp->b_rptr;
1591 	len = MBLKL(mp);
1592 
1593 	/* Each header must fit completely in an mblk. */
1594 	ASSERT3U(len, >=, sizeof (*ehp));
1595 
1596 	ehp = (struct ether_header *)ptr;
1597 
1598 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
1599 		struct ether_vlan_header *evhp;
1600 		ASSERT3U(len, >=, sizeof (*evhp));
1601 		evhp = (struct ether_vlan_header *)ptr;
1602 		sap = ntohs(evhp->ether_type);
1603 		ptr += sizeof (*evhp);
1604 		len -= sizeof (*evhp);
1605 	} else {
1606 		sap = ntohs(ehp->ether_type);
1607 		ptr += sizeof (*ehp);
1608 		len -= sizeof (*ehp);
1609 	}
1610 
1611 	ASSERT3U(sap, ==, ETHERTYPE_IP);
1612 
1613 	/*
1614 	 * Ethernet and IP headers may be in different mblks.
1615 	 */
1616 	ASSERT3P(ptr, <=, mp->b_wptr);
1617 	if (ptr == mp->b_wptr) {
1618 		mp = mp->b_cont;
1619 		ptr = mp->b_rptr;
1620 		len = MBLKL(mp);
1621 	}
1622 
1623 	ASSERT3U(len, >=, sizeof (ipha_t));
1624 	ipha = (ipha_t *)ptr;
1625 
1626 	/*
1627 	 * We assume the IP header has no options. (This is enforced in
1628 	 * ire_send_wire_v4() -- search for IXAF_NO_HW_CKSUM).
1629 	 */
1630 	ASSERT3U(IPH_HDR_LENGTH(ipha), ==, IP_SIMPLE_HDR_LENGTH);
1631 	iplen = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
1632 
1633 	ptr += IP_SIMPLE_HDR_LENGTH;
1634 	len -= IP_SIMPLE_HDR_LENGTH;
1635 
1636 	/*
1637 	 * IP and L4 headers may be in different mblks.
1638 	 */
1639 	ASSERT3P(ptr, <=, mp->b_wptr);
1640 	if (ptr == mp->b_wptr) {
1641 		mp = mp->b_cont;
1642 		ptr = mp->b_rptr;
1643 		len = MBLKL(mp);
1644 	}
1645 
1646 	switch (ipha->ipha_protocol) {
1647 	case IPPROTO_TCP:
1648 		ASSERT3U(len, >=, sizeof (tcph_t));
1649 		stuff = (uint16_t *)(ptr + TCP_CHECKSUM_OFFSET);
1650 		cksum = IP_TCP_CSUM_COMP;
1651 		break;
1652 	case IPPROTO_UDP:
1653 		ASSERT3U(len, >=, sizeof (struct udphdr));
1654 		stuff = (uint16_t *)(ptr + UDP_CHECKSUM_OFFSET);
1655 		cksum = IP_UDP_CSUM_COMP;
1656 		break;
1657 	default:
1658 		cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
1659 		    ipha->ipha_protocol);
1660 		return (EINVAL);
1661 	}
1662 
1663 	src = ipha->ipha_src;
1664 	dst = ipha->ipha_dst;
1665 
1666 	cksum += (dst >> 16) + (dst & 0xFFFF);
1667 	cksum += (src >> 16) + (src & 0xFFFF);
1668 	cksum += htons(iplen);
1669 
1670 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1671 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1672 
1673 	ASSERT(cksum <= 0xFFFF);
1674 
1675 	*stuff = (uint16_t)(cksum ? cksum : ~cksum);
1676 
1677 	return (0);
1678 }
1679 
1680 /*
1681  * Push a packet into the transmit ring.
1682  *
1683  * Note: the format of a tx packet that spans multiple slots is similar to
1684  * what is described in xnf_rx_one_packet().
1685  */
1686 static void
1687 xnf_tx_push_packet(xnf_t *xnfp, xnf_txbuf_t *head)
1688 {
1689 	int nslots = 0;
1690 	int extras = 0;
1691 	RING_IDX slot;
1692 	boolean_t notify;
1693 
1694 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1695 	ASSERT(xnfp->xnf_running);
1696 
1697 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1698 
1699 	/*
1700 	 * The caller has already checked that we have enough slots to proceed.
1701 	 */
1702 	for (xnf_txbuf_t *txp = head; txp != NULL; txp = txp->tx_next) {
1703 		xnf_txid_t *tidp;
1704 		netif_tx_request_t *txrp;
1705 
1706 		tidp = xnf_txid_get(xnfp);
1707 		VERIFY(tidp != NULL);
1708 		txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1709 
1710 		txp->tx_slot = slot;
1711 		txp->tx_txreq.id = tidp->id;
1712 		*txrp = txp->tx_txreq;
1713 
1714 		tidp->txbuf = txp;
1715 		slot++;
1716 		nslots++;
1717 
1718 		/*
1719 		 * When present, LSO info is placed in a slot after the first
1720 		 * data segment, and doesn't require a txid.
1721 		 */
1722 		if (txp->tx_txreq.flags & NETTXF_extra_info) {
1723 			netif_extra_info_t *extra;
1724 			ASSERT3U(nslots, ==, 1);
1725 
1726 			extra = (netif_extra_info_t *)
1727 			    RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1728 			*extra = txp->tx_extra;
1729 			slot++;
1730 			nslots++;
1731 			extras = 1;
1732 		}
1733 	}
1734 
1735 	ASSERT3U(nslots, <=, XEN_MAX_SLOTS_PER_TX);
1736 
1737 	/*
1738 	 * Store the number of data fragments.
1739 	 */
1740 	head->tx_frags_to_ack = nslots - extras;
1741 
1742 	xnfp->xnf_tx_ring.req_prod_pvt = slot;
1743 
1744 	/*
1745 	 * Tell the peer that we sent something, if it cares.
1746 	 */
1747 	/* LINTED: constant in conditional context */
1748 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, notify);
1749 	if (notify)
1750 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1751 }
1752 
1753 static xnf_txbuf_t *
1754 xnf_mblk_copy(xnf_t *xnfp, mblk_t *mp)
1755 {
1756 	xnf_txbuf_t *txp = xnf_data_txbuf_alloc(xnfp);
1757 	size_t length;
1758 
1759 	txp->tx_bdesc = xnf_tx_get_lookaside(xnfp, mp, &length);
1760 	if (txp->tx_bdesc == NULL) {
1761 		xnf_data_txbuf_free(xnfp, txp);
1762 		return (NULL);
1763 	}
1764 	txp->tx_mfn = txp->tx_bdesc->buf_mfn;
1765 	txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
1766 	txp->tx_txreq.size = length;
1767 	txp->tx_txreq.offset = (uintptr_t)txp->tx_bdesc->buf & PAGEOFFSET;
1768 	txp->tx_txreq.flags = 0;
1769 
1770 	return (txp);
1771 }
1772 
1773 static xnf_txbuf_t *
1774 xnf_mblk_map(xnf_t *xnfp, mblk_t *mp, int *countp)
1775 {
1776 	xnf_txbuf_t *head = NULL;
1777 	xnf_txbuf_t *tail = NULL;
1778 	domid_t oeid;
1779 	int nsegs = 0;
1780 
1781 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1782 
1783 	for (mblk_t *ml = mp; ml != NULL; ml = ml->b_cont) {
1784 		ddi_dma_handle_t dma_handle;
1785 		ddi_dma_cookie_t dma_cookie;
1786 		uint_t ncookies;
1787 		xnf_txbuf_t *txp;
1788 
1789 		if (MBLKL(ml) == 0)
1790 			continue;
1791 
1792 		txp = xnf_data_txbuf_alloc(xnfp);
1793 
1794 		if (head == NULL) {
1795 			head = txp;
1796 		} else {
1797 			ASSERT(tail != NULL);
1798 			TXBUF_SETNEXT(tail, txp);
1799 			txp->tx_head = head;
1800 		}
1801 
1802 		/*
1803 		 * The necessary segmentation rules (e.g. not crossing a page
1804 		 * boundary) are enforced by the dma attributes of the handle.
1805 		 */
1806 		dma_handle = txp->tx_dma_handle;
1807 		int ret = ddi_dma_addr_bind_handle(dma_handle,
1808 		    NULL, (char *)ml->b_rptr, MBLKL(ml),
1809 		    DDI_DMA_WRITE | DDI_DMA_STREAMING,
1810 		    DDI_DMA_DONTWAIT, 0, &dma_cookie,
1811 		    &ncookies);
1812 		if (ret != DDI_DMA_MAPPED) {
1813 			if (ret != DDI_DMA_NORESOURCES) {
1814 				dev_err(xnfp->xnf_devinfo, CE_WARN,
1815 				    "ddi_dma_addr_bind_handle() failed "
1816 				    "[dma_error=%d]", ret);
1817 			}
1818 			goto error;
1819 		}
1820 		txp->tx_handle_bound = B_TRUE;
1821 
1822 		ASSERT(ncookies > 0);
1823 		for (int i = 0; i < ncookies; i++) {
1824 			if (nsegs == XEN_MAX_TX_DATA_PAGES) {
1825 				dev_err(xnfp->xnf_devinfo, CE_WARN,
1826 				    "xnf_dmamap_alloc() failed: "
1827 				    "too many segments");
1828 				goto error;
1829 			}
1830 			if (i > 0) {
1831 				txp = xnf_data_txbuf_alloc(xnfp);
1832 				ASSERT(tail != NULL);
1833 				TXBUF_SETNEXT(tail, txp);
1834 				txp->tx_head = head;
1835 			}
1836 
1837 			txp->tx_mfn =
1838 			    xnf_btop(pa_to_ma(dma_cookie.dmac_laddress));
1839 			txp->tx_txreq.gref = xnf_gref_get(xnfp);
1840 			if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
1841 				dev_err(xnfp->xnf_devinfo, CE_WARN,
1842 				    "xnf_dmamap_alloc() failed: "
1843 				    "invalid grant ref");
1844 				goto error;
1845 			}
1846 			gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
1847 			    oeid, txp->tx_mfn, 1);
1848 			txp->tx_txreq.offset =
1849 			    dma_cookie.dmac_laddress & PAGEOFFSET;
1850 			txp->tx_txreq.size = dma_cookie.dmac_size;
1851 			txp->tx_txreq.flags = 0;
1852 
1853 			ddi_dma_nextcookie(dma_handle, &dma_cookie);
1854 			nsegs++;
1855 
1856 			if (tail != NULL)
1857 				tail->tx_txreq.flags = NETTXF_more_data;
1858 			tail = txp;
1859 		}
1860 	}
1861 
1862 	*countp = nsegs;
1863 	return (head);
1864 
1865 error:
1866 	xnf_data_txbuf_free_chain(xnfp, head);
1867 	return (NULL);
1868 }
1869 
1870 static void
1871 xnf_tx_setup_offload(xnf_t *xnfp, xnf_txbuf_t *head,
1872     uint32_t cksum_flags, uint32_t lso_flags, uint32_t mss)
1873 {
1874 	if (lso_flags != 0) {
1875 		ASSERT3U(lso_flags, ==, HW_LSO);
1876 		ASSERT3P(head->tx_bdesc, ==, NULL);
1877 
1878 		head->tx_txreq.flags |= NETTXF_extra_info;
1879 		netif_extra_info_t *extra = &head->tx_extra;
1880 		extra->type = XEN_NETIF_EXTRA_TYPE_GSO;
1881 		extra->flags = 0;
1882 		extra->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
1883 		extra->u.gso.size = mss;
1884 		extra->u.gso.features = 0;
1885 		extra->u.gso.pad = 0;
1886 	} else if (cksum_flags != 0) {
1887 		ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1888 		/*
1889 		 * If the local protocol stack requests checksum
1890 		 * offload we set the 'checksum blank' flag,
1891 		 * indicating to the peer that we need the checksum
1892 		 * calculated for us.
1893 		 *
1894 		 * We _don't_ set the validated flag, because we haven't
1895 		 * validated that the data and the checksum match.
1896 		 *
1897 		 * Note: we already called xnf_pseudo_cksum() in
1898 		 * xnf_send(), so we just set the txreq flag here.
1899 		 */
1900 		head->tx_txreq.flags |= NETTXF_csum_blank;
1901 		xnfp->xnf_stat_tx_cksum_deferred++;
1902 	}
1903 }
1904 
1905 /*
1906  * Send packet mp. Called by the MAC framework.
1907  */
1908 static mblk_t *
1909 xnf_send(void *arg, mblk_t *mp)
1910 {
1911 	xnf_t *xnfp = arg;
1912 	xnf_txbuf_t *head;
1913 	mblk_t *ml;
1914 	int length;
1915 	int pages, chunks, slots, slots_free;
1916 	uint32_t cksum_flags, lso_flags, mss;
1917 	boolean_t pulledup = B_FALSE;
1918 	boolean_t force_copy = B_FALSE;
1919 
1920 	ASSERT3P(mp->b_next, ==, NULL);
1921 
1922 	mutex_enter(&xnfp->xnf_txlock);
1923 
1924 	/*
1925 	 * Wait until we are connected to the backend.
1926 	 */
1927 	while (!xnfp->xnf_connected)
1928 		cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
1929 
1930 	/*
1931 	 * To simplify logic and be in sync with the rescheduling mechanism,
1932 	 * we require the maximum amount of slots that could be used by a
1933 	 * transaction to be free before proceeding. The only downside of doing
1934 	 * this is that it slightly reduces the effective size of the ring.
1935 	 */
1936 	slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE);
1937 	if (slots_free < XEN_MAX_SLOTS_PER_TX) {
1938 		/*
1939 		 * We need to ask for a re-schedule later as the ring is full.
1940 		 */
1941 		mutex_enter(&xnfp->xnf_schedlock);
1942 		xnfp->xnf_need_sched = B_TRUE;
1943 		mutex_exit(&xnfp->xnf_schedlock);
1944 
1945 		xnfp->xnf_stat_tx_defer++;
1946 		mutex_exit(&xnfp->xnf_txlock);
1947 		return (mp);
1948 	}
1949 
1950 	/*
1951 	 * Get hw offload parameters.
1952 	 * This must be done before pulling up the mp as those parameters
1953 	 * are not copied over.
1954 	 */
1955 	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &cksum_flags);
1956 	mac_lso_get(mp, &mss, &lso_flags);
1957 
1958 	/*
1959 	 * XXX: fix MAC framework so that we can advertise support for
1960 	 * partial checksum for IPv4 only. This way we won't need to calculate
1961 	 * the pseudo header checksum ourselves.
1962 	 */
1963 	if (cksum_flags != 0) {
1964 		ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1965 		(void) xnf_pseudo_cksum(mp);
1966 	}
1967 
1968 pulledup:
1969 	for (ml = mp, pages = 0, chunks = 0, length = 0; ml != NULL;
1970 	    ml = ml->b_cont, chunks++) {
1971 		pages += xnf_mblk_pages(ml);
1972 		length += MBLKL(ml);
1973 	}
1974 	DTRACE_PROBE3(packet, int, length, int, chunks, int, pages);
1975 	DTRACE_PROBE3(lso, int, length, uint32_t, lso_flags, uint32_t, mss);
1976 
1977 	/*
1978 	 * If the ethernet header crosses a page boundary the packet
1979 	 * will be dropped by the backend. In practice it seems like
1980 	 * this happens fairly rarely so we'll do nothing unless the
1981 	 * packet is small enough to fit in a look-aside buffer.
1982 	 */
1983 	if (((uintptr_t)mp->b_rptr & PAGEOFFSET) +
1984 	    sizeof (struct ether_header) > PAGESIZE) {
1985 		xnfp->xnf_stat_tx_eth_hdr_split++;
1986 		if (length <= PAGESIZE)
1987 			force_copy = B_TRUE;
1988 	}
1989 
1990 	if (force_copy || (pages > 1 && !xnfp->xnf_be_tx_sg)) {
1991 		/*
1992 		 * If the packet spans several pages and scatter-gather is not
1993 		 * supported then use a look-aside buffer.
1994 		 */
1995 		ASSERT3U(length, <=, PAGESIZE);
1996 		head = xnf_mblk_copy(xnfp, mp);
1997 		if (head == NULL) {
1998 			dev_err(xnfp->xnf_devinfo, CE_WARN,
1999 			    "xnf_mblk_copy() failed");
2000 			goto drop;
2001 		}
2002 	} else {
2003 		/*
2004 		 * There's a limit for how many pages can be passed to the
2005 		 * backend. If we pass that limit, the packet will be dropped
2006 		 * and some backend implementations (e.g. Linux) could even
2007 		 * offline the interface.
2008 		 */
2009 		if (pages > XEN_MAX_TX_DATA_PAGES) {
2010 			if (pulledup) {
2011 				dev_err(xnfp->xnf_devinfo, CE_WARN,
2012 				    "too many pages, even after pullup: %d.",
2013 				    pages);
2014 				goto drop;
2015 			}
2016 
2017 			/*
2018 			 * Defragment packet if it spans too many pages.
2019 			 */
2020 			mblk_t *newmp = msgpullup(mp, -1);
2021 			freemsg(mp);
2022 			mp = newmp;
2023 			xnfp->xnf_stat_tx_pullup++;
2024 			pulledup = B_TRUE;
2025 			goto pulledup;
2026 		}
2027 
2028 		head = xnf_mblk_map(xnfp, mp, &slots);
2029 		if (head == NULL)
2030 			goto drop;
2031 
2032 		IMPLY(slots > 1, xnfp->xnf_be_tx_sg);
2033 	}
2034 
2035 	/*
2036 	 * Set tx_mp so that mblk is freed when the txbuf chain is freed.
2037 	 */
2038 	head->tx_mp = mp;
2039 
2040 	xnf_tx_setup_offload(xnfp, head, cksum_flags, lso_flags, mss);
2041 
2042 	/*
2043 	 * The first request must store the total length of the packet.
2044 	 */
2045 	head->tx_txreq.size = length;
2046 
2047 	/*
2048 	 * Push the packet we have prepared into the ring.
2049 	 */
2050 	xnf_tx_push_packet(xnfp, head);
2051 	xnfp->xnf_stat_opackets++;
2052 	xnfp->xnf_stat_obytes += length;
2053 
2054 	mutex_exit(&xnfp->xnf_txlock);
2055 	return (NULL);
2056 
2057 drop:
2058 	freemsg(mp);
2059 	xnfp->xnf_stat_tx_drop++;
2060 	mutex_exit(&xnfp->xnf_txlock);
2061 	return (NULL);
2062 }
2063 
2064 /*
2065  * Notification of RX packets. Currently no TX-complete interrupt is
2066  * used, as we clean the TX ring lazily.
2067  */
2068 static uint_t
2069 xnf_intr(caddr_t arg)
2070 {
2071 	xnf_t *xnfp = (xnf_t *)arg;
2072 	mblk_t *mp;
2073 	boolean_t need_sched, clean_ring;
2074 
2075 	mutex_enter(&xnfp->xnf_rxlock);
2076 
2077 	/*
2078 	 * Interrupts before we are connected are spurious.
2079 	 */
2080 	if (!xnfp->xnf_connected) {
2081 		mutex_exit(&xnfp->xnf_rxlock);
2082 		xnfp->xnf_stat_unclaimed_interrupts++;
2083 		return (DDI_INTR_UNCLAIMED);
2084 	}
2085 
2086 	/*
2087 	 * Receive side processing.
2088 	 */
2089 	do {
2090 		/*
2091 		 * Collect buffers from the ring.
2092 		 */
2093 		xnf_rx_collect(xnfp);
2094 
2095 		/*
2096 		 * Interrupt me when the next receive buffer is consumed.
2097 		 */
2098 		xnfp->xnf_rx_ring.sring->rsp_event =
2099 		    xnfp->xnf_rx_ring.rsp_cons + 1;
2100 		xen_mb();
2101 
2102 	} while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring));
2103 
2104 	if (xnfp->xnf_rx_new_buffers_posted) {
2105 		boolean_t notify;
2106 
2107 		/*
2108 		 * Indicate to the peer that we have re-filled the
2109 		 * receive ring, if it cares.
2110 		 */
2111 		/* LINTED: constant in conditional context */
2112 		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
2113 		if (notify)
2114 			ec_notify_via_evtchn(xnfp->xnf_evtchn);
2115 		xnfp->xnf_rx_new_buffers_posted = B_FALSE;
2116 	}
2117 
2118 	mp = xnfp->xnf_rx_head;
2119 	xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL;
2120 
2121 	xnfp->xnf_stat_interrupts++;
2122 	mutex_exit(&xnfp->xnf_rxlock);
2123 
2124 	if (mp != NULL)
2125 		mac_rx(xnfp->xnf_mh, NULL, mp);
2126 
2127 	/*
2128 	 * Transmit side processing.
2129 	 *
2130 	 * If a previous transmit attempt failed or we have pending
2131 	 * multicast requests, clean the ring.
2132 	 *
2133 	 * If we previously stalled transmission and cleaning produces
2134 	 * some free slots, tell upstream to attempt sending again.
2135 	 *
2136 	 * The odd style is to avoid acquiring xnf_txlock unless we
2137 	 * will actually look inside the tx machinery.
2138 	 */
2139 	mutex_enter(&xnfp->xnf_schedlock);
2140 	need_sched = xnfp->xnf_need_sched;
2141 	clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0);
2142 	mutex_exit(&xnfp->xnf_schedlock);
2143 
2144 	if (clean_ring) {
2145 		int free_slots;
2146 
2147 		mutex_enter(&xnfp->xnf_txlock);
2148 		free_slots = xnf_tx_slots_get(xnfp, 0, B_FALSE);
2149 
2150 		if (need_sched && (free_slots >= XEN_MAX_SLOTS_PER_TX)) {
2151 			mutex_enter(&xnfp->xnf_schedlock);
2152 			xnfp->xnf_need_sched = B_FALSE;
2153 			mutex_exit(&xnfp->xnf_schedlock);
2154 
2155 			mac_tx_update(xnfp->xnf_mh);
2156 		}
2157 		mutex_exit(&xnfp->xnf_txlock);
2158 	}
2159 
2160 	return (DDI_INTR_CLAIMED);
2161 }
2162 
2163 /*
2164  *  xnf_start() -- start the board receiving and enable interrupts.
2165  */
2166 static int
2167 xnf_start(void *arg)
2168 {
2169 	xnf_t *xnfp = arg;
2170 
2171 #ifdef XNF_DEBUG
2172 	if (xnf_debug & XNF_DEBUG_TRACE)
2173 		printf("xnf%d start(0x%p)\n",
2174 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
2175 #endif
2176 
2177 	mutex_enter(&xnfp->xnf_rxlock);
2178 	mutex_enter(&xnfp->xnf_txlock);
2179 
2180 	/* Accept packets from above. */
2181 	xnfp->xnf_running = B_TRUE;
2182 
2183 	mutex_exit(&xnfp->xnf_txlock);
2184 	mutex_exit(&xnfp->xnf_rxlock);
2185 
2186 	return (0);
2187 }
2188 
2189 /* xnf_stop() - disable hardware */
2190 static void
2191 xnf_stop(void *arg)
2192 {
2193 	xnf_t *xnfp = arg;
2194 
2195 #ifdef XNF_DEBUG
2196 	if (xnf_debug & XNF_DEBUG_TRACE)
2197 		printf("xnf%d stop(0x%p)\n",
2198 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
2199 #endif
2200 
2201 	mutex_enter(&xnfp->xnf_rxlock);
2202 	mutex_enter(&xnfp->xnf_txlock);
2203 
2204 	xnfp->xnf_running = B_FALSE;
2205 
2206 	mutex_exit(&xnfp->xnf_txlock);
2207 	mutex_exit(&xnfp->xnf_rxlock);
2208 }
2209 
2210 /*
2211  * Hang buffer `bdesc' on the RX ring.
2212  */
2213 static void
2214 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
2215 {
2216 	netif_rx_request_t *reqp;
2217 	RING_IDX hang_ix;
2218 
2219 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2220 
2221 	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
2222 	    xnfp->xnf_rx_ring.req_prod_pvt);
2223 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
2224 	ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL);
2225 
2226 	reqp->id = bdesc->id = hang_ix;
2227 	reqp->gref = bdesc->grant_ref;
2228 
2229 	xnfp->xnf_rx_pkt_info[hang_ix] = bdesc;
2230 	xnfp->xnf_rx_ring.req_prod_pvt++;
2231 
2232 	xnfp->xnf_rx_new_buffers_posted = B_TRUE;
2233 }
2234 
2235 /*
2236  * Receive an entire packet from the ring, starting from slot *consp.
2237  * prod indicates the slot of the latest response.
2238  * On return, *consp will point to the head of the next packet.
2239  *
2240  * Note: If slot prod was reached before we could gather a full packet, we will
2241  * drop the partial packet; this would most likely indicate a bug in either
2242  * the front-end or the back-end driver.
2243  *
2244  * An rx packet can consist of several fragments and thus span multiple slots.
2245  * Each fragment can contain up to 4k of data.
2246  *
2247  * A typical 9000 MTU packet with look like this:
2248  * +------+---------------------+-------------------+-----------------------+
2249  * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
2250  * +------+---------------------+-------------------+-----------------------+
2251  * | 1    | netif_rx_response_t | 1st data fragment | more_data             |
2252  * +------+---------------------+-------------------+-----------------------+
2253  * | 2    | netif_rx_response_t | 2nd data fragment | more_data             |
2254  * +------+---------------------+-------------------+-----------------------+
2255  * | 3    | netif_rx_response_t | 3rd data fragment | [none]                |
2256  * +------+---------------------+-------------------+-----------------------+
2257  *
2258  * Fragments are chained by setting NETRXF_more_data in the previous
2259  * response's flags. If there are additional flags, such as
2260  * NETRXF_data_validated or NETRXF_extra_info, those should be set on the
2261  * first fragment.
2262  *
2263  * Sometimes extra info can be present. If so, it will follow the first
2264  * fragment, and NETRXF_extra_info flag will be set on the first response.
2265  * If LRO is set on a packet, it will be stored in the extra info. Conforming
2266  * to the spec, extra info can also be chained, but must all be present right
2267  * after the first fragment.
2268  *
2269  * Example of a packet with 2 extra infos:
2270  * +------+---------------------+-------------------+-----------------------+
2271  * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
2272  * +------+---------------------+-------------------+-----------------------+
2273  * | 1    | netif_rx_response_t | 1st data fragment | extra_info, more_data |
2274  * +------+---------------------+-------------------+-----------------------+
2275  * | 2    | netif_extra_info_t  | 1st extra info    | EXTRA_FLAG_MORE       |
2276  * +------+---------------------+-------------------+-----------------------+
2277  * | 3    | netif_extra_info_t  | 2nd extra info    | [none]                |
2278  * +------+---------------------+-------------------+-----------------------+
2279  * | 4    | netif_rx_response_t | 2nd data fragment | more_data             |
2280  * +------+---------------------+-------------------+-----------------------+
2281  * | 5    | netif_rx_response_t | 3rd data fragment | more_data             |
2282  * +------+---------------------+-------------------+-----------------------+
2283  * | 6    | netif_rx_response_t | 4th data fragment | [none]                |
2284  * +------+---------------------+-------------------+-----------------------+
2285  *
2286  * In practice, the only extra we expect is for LRO, but only if we advertise
2287  * that we support it to the backend (xnf_enable_lro == TRUE).
2288  */
2289 static int
2290 xnf_rx_one_packet(xnf_t *xnfp, RING_IDX prod, RING_IDX *consp, mblk_t **mpp)
2291 {
2292 	mblk_t *head = NULL;
2293 	mblk_t *tail = NULL;
2294 	mblk_t *mp;
2295 	int error = 0;
2296 	RING_IDX cons = *consp;
2297 	netif_extra_info_t lro;
2298 	boolean_t is_lro = B_FALSE;
2299 	boolean_t is_extra = B_FALSE;
2300 
2301 	netif_rx_response_t rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2302 
2303 	boolean_t hwcsum = (rsp.flags & NETRXF_data_validated) != 0;
2304 	boolean_t more_data = (rsp.flags & NETRXF_more_data) != 0;
2305 	boolean_t more_extra = (rsp.flags & NETRXF_extra_info) != 0;
2306 
2307 	IMPLY(more_data, xnf_enable_rx_sg);
2308 
2309 	while (cons != prod) {
2310 		xnf_buf_t *bdesc;
2311 		int len, off;
2312 		int rxidx = cons & (NET_RX_RING_SIZE - 1);
2313 
2314 		bdesc = xnfp->xnf_rx_pkt_info[rxidx];
2315 		xnfp->xnf_rx_pkt_info[rxidx] = NULL;
2316 
2317 		if (is_extra) {
2318 			netif_extra_info_t *extra = (netif_extra_info_t *)&rsp;
2319 			/*
2320 			 * The only extra we expect is for LRO, and it should
2321 			 * only be present once.
2322 			 */
2323 			if (extra->type == XEN_NETIF_EXTRA_TYPE_GSO &&
2324 			    !is_lro) {
2325 				ASSERT(xnf_enable_lro);
2326 				lro = *extra;
2327 				is_lro = B_TRUE;
2328 				DTRACE_PROBE1(lro, netif_extra_info_t *, &lro);
2329 			} else {
2330 				dev_err(xnfp->xnf_devinfo, CE_WARN, "rx packet "
2331 				    "contains unexpected extra info of type %d",
2332 				    extra->type);
2333 				error = EINVAL;
2334 			}
2335 			more_extra =
2336 			    (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE) != 0;
2337 
2338 			goto hang_buf;
2339 		}
2340 
2341 		ASSERT3U(bdesc->id, ==, rsp.id);
2342 
2343 		/*
2344 		 * status stores packet length when >= 0, or errors when < 0.
2345 		 */
2346 		len = rsp.status;
2347 		off = rsp.offset;
2348 		more_data = (rsp.flags & NETRXF_more_data) != 0;
2349 
2350 		/*
2351 		 * sanity checks.
2352 		 */
2353 		if (!xnfp->xnf_running) {
2354 			error = EBUSY;
2355 		} else if (len <= 0) {
2356 			xnfp->xnf_stat_errrx++;
2357 
2358 			switch (len) {
2359 			case 0:
2360 				xnfp->xnf_stat_runt++;
2361 				break;
2362 			case NETIF_RSP_ERROR:
2363 				xnfp->xnf_stat_mac_rcv_error++;
2364 				break;
2365 			case NETIF_RSP_DROPPED:
2366 				xnfp->xnf_stat_norxbuf++;
2367 				break;
2368 			}
2369 			error = EINVAL;
2370 		} else if (bdesc->grant_ref == INVALID_GRANT_REF) {
2371 			dev_err(xnfp->xnf_devinfo, CE_WARN,
2372 			    "Bad rx grant reference, rsp id %d", rsp.id);
2373 			error = EINVAL;
2374 		} else if ((off + len) > PAGESIZE) {
2375 			dev_err(xnfp->xnf_devinfo, CE_WARN, "Rx packet crosses "
2376 			    "page boundary (offset %d, length %d)", off, len);
2377 			error = EINVAL;
2378 		}
2379 
2380 		if (error != 0) {
2381 			/*
2382 			 * If an error has been detected, we do not attempt
2383 			 * to read the data but we still need to replace
2384 			 * the rx bufs.
2385 			 */
2386 			goto hang_buf;
2387 		}
2388 
2389 		xnf_buf_t *nbuf = NULL;
2390 
2391 		/*
2392 		 * If the packet is below a pre-determined size we will
2393 		 * copy data out of the buf rather than replace it.
2394 		 */
2395 		if (len > xnf_rx_copy_limit)
2396 			nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
2397 
2398 		if (nbuf != NULL) {
2399 			mp = desballoc((unsigned char *)bdesc->buf,
2400 			    bdesc->len, 0, &bdesc->free_rtn);
2401 
2402 			if (mp == NULL) {
2403 				xnfp->xnf_stat_rx_desballoc_fail++;
2404 				xnfp->xnf_stat_norxbuf++;
2405 				error = ENOMEM;
2406 				/*
2407 				 * we free the buf we just allocated as we
2408 				 * will re-hang the old buf.
2409 				 */
2410 				xnf_buf_put(xnfp, nbuf, B_FALSE);
2411 				goto hang_buf;
2412 			}
2413 
2414 			mp->b_rptr = mp->b_rptr + off;
2415 			mp->b_wptr = mp->b_rptr + len;
2416 
2417 			/*
2418 			 * Release the grant as the backend doesn't need to
2419 			 * access this buffer anymore and grants are scarce.
2420 			 */
2421 			(void) gnttab_end_foreign_access_ref(bdesc->grant_ref,
2422 			    0);
2423 			xnf_gref_put(xnfp, bdesc->grant_ref);
2424 			bdesc->grant_ref = INVALID_GRANT_REF;
2425 
2426 			bdesc = nbuf;
2427 		} else {
2428 			/*
2429 			 * We failed to allocate a new buf or decided to reuse
2430 			 * the old one. In either case we copy the data off it
2431 			 * and put it back into the ring.
2432 			 */
2433 			mp = allocb(len, 0);
2434 			if (mp == NULL) {
2435 				xnfp->xnf_stat_rx_allocb_fail++;
2436 				xnfp->xnf_stat_norxbuf++;
2437 				error = ENOMEM;
2438 				goto hang_buf;
2439 			}
2440 			bcopy(bdesc->buf + off, mp->b_wptr, len);
2441 			mp->b_wptr += len;
2442 		}
2443 
2444 		if (head == NULL)
2445 			head = mp;
2446 		else
2447 			tail->b_cont = mp;
2448 		tail = mp;
2449 
2450 hang_buf:
2451 		/*
2452 		 * No matter what happens, for each response we need to hang
2453 		 * a new buf on the rx ring. Put either the old one, or a new
2454 		 * one if the old one is borrowed by the kernel via desballoc().
2455 		 */
2456 		xnf_rxbuf_hang(xnfp, bdesc);
2457 		cons++;
2458 
2459 		/* next response is an extra */
2460 		is_extra = more_extra;
2461 
2462 		if (!more_data && !more_extra)
2463 			break;
2464 
2465 		/*
2466 		 * Note that since requests and responses are union'd on the
2467 		 * same ring, we copy the response to a local variable instead
2468 		 * of keeping a pointer. Otherwise xnf_rxbuf_hang() would have
2469 		 * overwritten contents of rsp.
2470 		 */
2471 		rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2472 	}
2473 
2474 	/*
2475 	 * Check that we do not get stuck in a loop.
2476 	 */
2477 	ASSERT3U(*consp, !=, cons);
2478 	*consp = cons;
2479 
2480 	/*
2481 	 * We ran out of responses but the flags indicate there is more data.
2482 	 */
2483 	if (more_data) {
2484 		dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments.");
2485 		error = EINVAL;
2486 	}
2487 	if (more_extra) {
2488 		dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments "
2489 		    "(extras).");
2490 		error = EINVAL;
2491 	}
2492 
2493 	/*
2494 	 * An error means the packet must be dropped. If we have already formed
2495 	 * a partial packet, then discard it.
2496 	 */
2497 	if (error != 0) {
2498 		if (head != NULL)
2499 			freemsg(head);
2500 		xnfp->xnf_stat_rx_drop++;
2501 		return (error);
2502 	}
2503 
2504 	ASSERT(head != NULL);
2505 
2506 	if (hwcsum) {
2507 		/*
2508 		 * If the peer says that the data has been validated then we
2509 		 * declare that the full checksum has been verified.
2510 		 *
2511 		 * We don't look at the "checksum blank" flag, and hence could
2512 		 * have a packet here that we are asserting is good with
2513 		 * a blank checksum.
2514 		 */
2515 		mac_hcksum_set(head, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
2516 		xnfp->xnf_stat_rx_cksum_no_need++;
2517 	}
2518 
2519 	/* XXX: set lro info for packet once LRO is supported in OS. */
2520 
2521 	*mpp = head;
2522 
2523 	return (0);
2524 }
2525 
2526 /*
2527  * Collect packets from the RX ring, storing them in `xnfp' for later use.
2528  */
2529 static void
2530 xnf_rx_collect(xnf_t *xnfp)
2531 {
2532 	RING_IDX prod;
2533 
2534 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2535 
2536 	prod = xnfp->xnf_rx_ring.sring->rsp_prod;
2537 	/*
2538 	 * Ensure we see queued responses up to 'prod'.
2539 	 */
2540 	membar_consumer();
2541 
2542 	while (xnfp->xnf_rx_ring.rsp_cons != prod) {
2543 		mblk_t *mp;
2544 
2545 		/*
2546 		 * Collect a packet.
2547 		 * rsp_cons is updated inside xnf_rx_one_packet().
2548 		 */
2549 		int error = xnf_rx_one_packet(xnfp, prod,
2550 		    &xnfp->xnf_rx_ring.rsp_cons, &mp);
2551 		if (error == 0) {
2552 			xnfp->xnf_stat_ipackets++;
2553 			xnfp->xnf_stat_rbytes += xmsgsize(mp);
2554 
2555 			/*
2556 			 * Append the mblk to the rx list.
2557 			 */
2558 			if (xnfp->xnf_rx_head == NULL) {
2559 				ASSERT3P(xnfp->xnf_rx_tail, ==, NULL);
2560 				xnfp->xnf_rx_head = mp;
2561 			} else {
2562 				ASSERT(xnfp->xnf_rx_tail != NULL);
2563 				xnfp->xnf_rx_tail->b_next = mp;
2564 			}
2565 			xnfp->xnf_rx_tail = mp;
2566 		}
2567 	}
2568 }
2569 
2570 /*
2571  *  xnf_alloc_dma_resources() -- initialize the drivers structures
2572  */
2573 static int
2574 xnf_alloc_dma_resources(xnf_t *xnfp)
2575 {
2576 	dev_info_t 		*devinfo = xnfp->xnf_devinfo;
2577 	size_t			len;
2578 	ddi_dma_cookie_t	dma_cookie;
2579 	uint_t			ncookies;
2580 	int			rc;
2581 	caddr_t			rptr;
2582 
2583 	/*
2584 	 * The code below allocates all the DMA data structures that
2585 	 * need to be released when the driver is detached.
2586 	 *
2587 	 * Allocate page for the transmit descriptor ring.
2588 	 */
2589 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2590 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
2591 		goto alloc_error;
2592 
2593 	if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
2594 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2595 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2596 	    &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
2597 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2598 		xnfp->xnf_tx_ring_dma_handle = NULL;
2599 		goto alloc_error;
2600 	}
2601 
2602 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
2603 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2604 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2605 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2606 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2607 		xnfp->xnf_tx_ring_dma_handle = NULL;
2608 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2609 		if (rc == DDI_DMA_NORESOURCES)
2610 			goto alloc_error;
2611 		else
2612 			goto error;
2613 	}
2614 
2615 	ASSERT(ncookies == 1);
2616 	bzero(rptr, PAGESIZE);
2617 	/* LINTED: constant in conditional context */
2618 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
2619 	/* LINTED: constant in conditional context */
2620 	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
2621 	xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
2622 
2623 	/*
2624 	 * Allocate page for the receive descriptor ring.
2625 	 */
2626 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2627 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
2628 		goto alloc_error;
2629 
2630 	if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
2631 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2632 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2633 	    &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
2634 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2635 		xnfp->xnf_rx_ring_dma_handle = NULL;
2636 		goto alloc_error;
2637 	}
2638 
2639 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
2640 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2641 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2642 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2643 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2644 		xnfp->xnf_rx_ring_dma_handle = NULL;
2645 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2646 		if (rc == DDI_DMA_NORESOURCES)
2647 			goto alloc_error;
2648 		else
2649 			goto error;
2650 	}
2651 
2652 	ASSERT(ncookies == 1);
2653 	bzero(rptr, PAGESIZE);
2654 	/* LINTED: constant in conditional context */
2655 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
2656 	/* LINTED: constant in conditional context */
2657 	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
2658 	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
2659 
2660 	return (DDI_SUCCESS);
2661 
2662 alloc_error:
2663 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
2664 	    ddi_get_instance(xnfp->xnf_devinfo));
2665 error:
2666 	xnf_release_dma_resources(xnfp);
2667 	return (DDI_FAILURE);
2668 }
2669 
2670 /*
2671  * Release all DMA resources in the opposite order from acquisition
2672  */
2673 static void
2674 xnf_release_dma_resources(xnf_t *xnfp)
2675 {
2676 	int i;
2677 
2678 	/*
2679 	 * Free receive buffers which are currently associated with
2680 	 * descriptors.
2681 	 */
2682 	mutex_enter(&xnfp->xnf_rxlock);
2683 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
2684 		xnf_buf_t *bp;
2685 
2686 		if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL)
2687 			continue;
2688 		xnfp->xnf_rx_pkt_info[i] = NULL;
2689 		xnf_buf_put(xnfp, bp, B_FALSE);
2690 	}
2691 	mutex_exit(&xnfp->xnf_rxlock);
2692 
2693 	/* Free the receive ring buffer. */
2694 	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2695 		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2696 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2697 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2698 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2699 	}
2700 	/* Free the transmit ring buffer. */
2701 	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2702 		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2703 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2704 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2705 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2706 	}
2707 
2708 }
2709 
2710 /*
2711  * Release any packets and associated structures used by the TX ring.
2712  */
2713 static void
2714 xnf_release_mblks(xnf_t *xnfp)
2715 {
2716 	RING_IDX i;
2717 	xnf_txid_t *tidp;
2718 
2719 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
2720 	    i < NET_TX_RING_SIZE;
2721 	    i++, tidp++) {
2722 		xnf_txbuf_t *txp = tidp->txbuf;
2723 
2724 		if (txp != NULL) {
2725 			ASSERT(txp->tx_mp != NULL);
2726 			freemsg(txp->tx_mp);
2727 
2728 			xnf_txid_put(xnfp, tidp);
2729 			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
2730 		}
2731 	}
2732 }
2733 
2734 static int
2735 xnf_buf_constructor(void *buf, void *arg, int kmflag)
2736 {
2737 	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2738 	xnf_buf_t *bdesc = buf;
2739 	xnf_t *xnfp = arg;
2740 	ddi_dma_cookie_t dma_cookie;
2741 	uint_t ncookies;
2742 	size_t len;
2743 
2744 	if (kmflag & KM_NOSLEEP)
2745 		ddiflags = DDI_DMA_DONTWAIT;
2746 
2747 	/* Allocate a DMA access handle for the buffer. */
2748 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buf_dma_attr,
2749 	    ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2750 		goto failure;
2751 
2752 	/* Allocate DMA-able memory for buffer. */
2753 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2754 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0,
2755 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2756 		goto failure_1;
2757 
2758 	/* Bind to virtual address of buffer to get physical address. */
2759 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2760 	    bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
2761 	    ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2762 		goto failure_2;
2763 	ASSERT(ncookies == 1);
2764 
2765 	bdesc->free_rtn.free_func = xnf_buf_recycle;
2766 	bdesc->free_rtn.free_arg = (caddr_t)bdesc;
2767 	bdesc->xnfp = xnfp;
2768 	bdesc->buf_phys = dma_cookie.dmac_laddress;
2769 	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2770 	bdesc->len = dma_cookie.dmac_size;
2771 	bdesc->grant_ref = INVALID_GRANT_REF;
2772 	bdesc->gen = xnfp->xnf_gen;
2773 
2774 	atomic_inc_64(&xnfp->xnf_stat_buf_allocated);
2775 
2776 	return (0);
2777 
2778 failure_2:
2779 	ddi_dma_mem_free(&bdesc->acc_handle);
2780 
2781 failure_1:
2782 	ddi_dma_free_handle(&bdesc->dma_handle);
2783 
2784 failure:
2785 
2786 	ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2787 	return (-1);
2788 }
2789 
2790 static void
2791 xnf_buf_destructor(void *buf, void *arg)
2792 {
2793 	xnf_buf_t *bdesc = buf;
2794 	xnf_t *xnfp = arg;
2795 
2796 	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
2797 	ddi_dma_mem_free(&bdesc->acc_handle);
2798 	ddi_dma_free_handle(&bdesc->dma_handle);
2799 
2800 	atomic_dec_64(&xnfp->xnf_stat_buf_allocated);
2801 }
2802 
2803 static xnf_buf_t *
2804 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
2805 {
2806 	grant_ref_t gref;
2807 	xnf_buf_t *bufp;
2808 
2809 	/*
2810 	 * Usually grant references are more scarce than memory, so we
2811 	 * attempt to acquire a grant reference first.
2812 	 */
2813 	gref = xnf_gref_get(xnfp);
2814 	if (gref == INVALID_GRANT_REF)
2815 		return (NULL);
2816 
2817 	bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
2818 	if (bufp == NULL) {
2819 		xnf_gref_put(xnfp, gref);
2820 		return (NULL);
2821 	}
2822 
2823 	ASSERT3U(bufp->grant_ref, ==, INVALID_GRANT_REF);
2824 
2825 	bufp->grant_ref = gref;
2826 
2827 	if (bufp->gen != xnfp->xnf_gen)
2828 		xnf_buf_refresh(bufp);
2829 
2830 	gnttab_grant_foreign_access_ref(bufp->grant_ref,
2831 	    xvdi_get_oeid(bufp->xnfp->xnf_devinfo),
2832 	    bufp->buf_mfn, readonly ? 1 : 0);
2833 
2834 	atomic_inc_64(&xnfp->xnf_stat_buf_outstanding);
2835 
2836 	return (bufp);
2837 }
2838 
2839 static void
2840 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
2841 {
2842 	if (bufp->grant_ref != INVALID_GRANT_REF) {
2843 		(void) gnttab_end_foreign_access_ref(
2844 		    bufp->grant_ref, readonly ? 1 : 0);
2845 		xnf_gref_put(xnfp, bufp->grant_ref);
2846 		bufp->grant_ref = INVALID_GRANT_REF;
2847 	}
2848 
2849 	kmem_cache_free(xnfp->xnf_buf_cache, bufp);
2850 
2851 	atomic_dec_64(&xnfp->xnf_stat_buf_outstanding);
2852 }
2853 
2854 /*
2855  * Refresh any cached data about a buffer after resume.
2856  */
2857 static void
2858 xnf_buf_refresh(xnf_buf_t *bdesc)
2859 {
2860 	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2861 	bdesc->gen = bdesc->xnfp->xnf_gen;
2862 }
2863 
2864 /*
2865  * Streams `freeb' routine for `xnf_buf_t' when used as transmit
2866  * look-aside buffers.
2867  */
2868 static void
2869 xnf_buf_recycle(xnf_buf_t *bdesc)
2870 {
2871 	xnf_t *xnfp = bdesc->xnfp;
2872 
2873 	xnf_buf_put(xnfp, bdesc, B_TRUE);
2874 }
2875 
2876 static int
2877 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
2878 {
2879 	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2880 	xnf_txbuf_t *txp = buf;
2881 	xnf_t *xnfp = arg;
2882 
2883 	if (kmflag & KM_NOSLEEP)
2884 		ddiflags = DDI_DMA_DONTWAIT;
2885 
2886 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buf_dma_attr,
2887 	    ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) {
2888 		ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2889 		return (-1);
2890 	}
2891 
2892 	return (0);
2893 }
2894 
2895 static void
2896 xnf_tx_buf_destructor(void *buf, void *arg)
2897 {
2898 	_NOTE(ARGUNUSED(arg));
2899 	xnf_txbuf_t *txp = buf;
2900 
2901 	ddi_dma_free_handle(&txp->tx_dma_handle);
2902 }
2903 
2904 /*
2905  * Statistics.
2906  */
2907 static char *xnf_aux_statistics[] = {
2908 	"tx_cksum_deferred",
2909 	"rx_cksum_no_need",
2910 	"interrupts",
2911 	"unclaimed_interrupts",
2912 	"tx_pullup",
2913 	"tx_lookaside",
2914 	"tx_drop",
2915 	"tx_eth_hdr_split",
2916 	"buf_allocated",
2917 	"buf_outstanding",
2918 	"gref_outstanding",
2919 	"gref_failure",
2920 	"gref_peak",
2921 	"rx_allocb_fail",
2922 	"rx_desballoc_fail",
2923 };
2924 
2925 static int
2926 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2927 {
2928 	xnf_t *xnfp;
2929 	kstat_named_t *knp;
2930 
2931 	if (flag != KSTAT_READ)
2932 		return (EACCES);
2933 
2934 	xnfp = ksp->ks_private;
2935 	knp = ksp->ks_data;
2936 
2937 	/*
2938 	 * Assignment order must match that of the names in
2939 	 * xnf_aux_statistics.
2940 	 */
2941 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2942 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2943 
2944 	(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2945 	(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2946 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2947 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_lookaside;
2948 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_drop;
2949 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_eth_hdr_split;
2950 
2951 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
2952 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
2953 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding;
2954 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_failure;
2955 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_peak;
2956 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail;
2957 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail;
2958 
2959 	return (0);
2960 }
2961 
2962 static boolean_t
2963 xnf_kstat_init(xnf_t *xnfp)
2964 {
2965 	int nstat = sizeof (xnf_aux_statistics) /
2966 	    sizeof (xnf_aux_statistics[0]);
2967 	char **cp = xnf_aux_statistics;
2968 	kstat_named_t *knp;
2969 
2970 	/*
2971 	 * Create and initialise kstats.
2972 	 */
2973 	if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2974 	    ddi_get_instance(xnfp->xnf_devinfo),
2975 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
2976 	    nstat, 0)) == NULL)
2977 		return (B_FALSE);
2978 
2979 	xnfp->xnf_kstat_aux->ks_private = xnfp;
2980 	xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2981 
2982 	knp = xnfp->xnf_kstat_aux->ks_data;
2983 	while (nstat > 0) {
2984 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2985 
2986 		knp++;
2987 		cp++;
2988 		nstat--;
2989 	}
2990 
2991 	kstat_install(xnfp->xnf_kstat_aux);
2992 
2993 	return (B_TRUE);
2994 }
2995 
2996 static int
2997 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2998 {
2999 	xnf_t *xnfp = arg;
3000 
3001 	mutex_enter(&xnfp->xnf_rxlock);
3002 	mutex_enter(&xnfp->xnf_txlock);
3003 
3004 #define	mac_stat(q, r)				\
3005 	case (MAC_STAT_##q):			\
3006 		*val = xnfp->xnf_stat_##r;	\
3007 		break
3008 
3009 #define	ether_stat(q, r)			\
3010 	case (ETHER_STAT_##q):			\
3011 		*val = xnfp->xnf_stat_##r;	\
3012 		break
3013 
3014 	switch (stat) {
3015 
3016 	mac_stat(IPACKETS, ipackets);
3017 	mac_stat(OPACKETS, opackets);
3018 	mac_stat(RBYTES, rbytes);
3019 	mac_stat(OBYTES, obytes);
3020 	mac_stat(NORCVBUF, norxbuf);
3021 	mac_stat(IERRORS, errrx);
3022 	mac_stat(NOXMTBUF, tx_defer);
3023 
3024 	ether_stat(MACRCV_ERRORS, mac_rcv_error);
3025 	ether_stat(TOOSHORT_ERRORS, runt);
3026 
3027 	/* always claim to be in full duplex mode */
3028 	case ETHER_STAT_LINK_DUPLEX:
3029 		*val = LINK_DUPLEX_FULL;
3030 		break;
3031 
3032 	/* always claim to be at 1Gb/s link speed */
3033 	case MAC_STAT_IFSPEED:
3034 		*val = 1000000000ull;
3035 		break;
3036 
3037 	default:
3038 		mutex_exit(&xnfp->xnf_txlock);
3039 		mutex_exit(&xnfp->xnf_rxlock);
3040 
3041 		return (ENOTSUP);
3042 	}
3043 
3044 #undef mac_stat
3045 #undef ether_stat
3046 
3047 	mutex_exit(&xnfp->xnf_txlock);
3048 	mutex_exit(&xnfp->xnf_rxlock);
3049 
3050 	return (0);
3051 }
3052 
3053 static int
3054 xnf_change_mtu(xnf_t *xnfp, uint32_t mtu)
3055 {
3056 	if (mtu > ETHERMTU) {
3057 		if (!xnf_enable_tx_sg) {
3058 			dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3059 			    "because scatter-gather is disabled for transmit "
3060 			    "in driver settings", ETHERMTU);
3061 			return (EINVAL);
3062 		} else if (!xnf_enable_rx_sg) {
3063 			dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3064 			    "because scatter-gather is disabled for receive "
3065 			    "in driver settings", ETHERMTU);
3066 			return (EINVAL);
3067 		} else if (!xnfp->xnf_be_tx_sg) {
3068 			dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3069 			    "because backend doesn't support scatter-gather",
3070 			    ETHERMTU);
3071 			return (EINVAL);
3072 		}
3073 		if (mtu > XNF_MAXPKT)
3074 			return (EINVAL);
3075 	}
3076 	int error = mac_maxsdu_update(xnfp->xnf_mh, mtu);
3077 	if (error == 0)
3078 		xnfp->xnf_mtu = mtu;
3079 
3080 	return (error);
3081 }
3082 
3083 /*ARGSUSED*/
3084 static int
3085 xnf_getprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3086     uint_t prop_val_size, void *prop_val)
3087 {
3088 	xnf_t *xnfp = data;
3089 
3090 	switch (prop_id) {
3091 	case MAC_PROP_MTU:
3092 		ASSERT(prop_val_size >= sizeof (uint32_t));
3093 		bcopy(&xnfp->xnf_mtu, prop_val, sizeof (uint32_t));
3094 		break;
3095 	default:
3096 		return (ENOTSUP);
3097 	}
3098 	return (0);
3099 }
3100 
3101 /*ARGSUSED*/
3102 static int
3103 xnf_setprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3104     uint_t prop_val_size, const void *prop_val)
3105 {
3106 	xnf_t *xnfp = data;
3107 	uint32_t new_mtu;
3108 	int error;
3109 
3110 	switch (prop_id) {
3111 	case MAC_PROP_MTU:
3112 		ASSERT(prop_val_size >= sizeof (uint32_t));
3113 		bcopy(prop_val, &new_mtu, sizeof (new_mtu));
3114 		error = xnf_change_mtu(xnfp, new_mtu);
3115 		break;
3116 	default:
3117 		return (ENOTSUP);
3118 	}
3119 
3120 	return (error);
3121 }
3122 
3123 /*ARGSUSED*/
3124 static void
3125 xnf_propinfo(void *data, const char *prop_name, mac_prop_id_t prop_id,
3126     mac_prop_info_handle_t prop_handle)
3127 {
3128 	switch (prop_id) {
3129 	case MAC_PROP_MTU:
3130 		mac_prop_info_set_range_uint32(prop_handle, 0, XNF_MAXPKT);
3131 		break;
3132 	default:
3133 		break;
3134 	}
3135 }
3136 
3137 static boolean_t
3138 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
3139 {
3140 	xnf_t *xnfp = arg;
3141 
3142 	switch (cap) {
3143 	case MAC_CAPAB_HCKSUM: {
3144 		uint32_t *capab = cap_data;
3145 
3146 		/*
3147 		 * Whilst the flag used to communicate with the IO
3148 		 * domain is called "NETTXF_csum_blank", the checksum
3149 		 * in the packet must contain the pseudo-header
3150 		 * checksum and not zero.
3151 		 *
3152 		 * To help out the IO domain, we might use
3153 		 * HCKSUM_INET_PARTIAL. Unfortunately our stack will
3154 		 * then use checksum offload for IPv6 packets, which
3155 		 * the IO domain can't handle.
3156 		 *
3157 		 * As a result, we declare outselves capable of
3158 		 * HCKSUM_INET_FULL_V4. This means that we receive
3159 		 * IPv4 packets from the stack with a blank checksum
3160 		 * field and must insert the pseudo-header checksum
3161 		 * before passing the packet to the IO domain.
3162 		 */
3163 		*capab = HCKSUM_INET_FULL_V4;
3164 
3165 		/*
3166 		 * TODO: query the "feature-ipv6-csum-offload" capability.
3167 		 * If enabled, that could allow us to use HCKSUM_INET_PARTIAL.
3168 		 */
3169 
3170 		break;
3171 	}
3172 	case MAC_CAPAB_LSO: {
3173 		if (!xnfp->xnf_be_lso)
3174 			return (B_FALSE);
3175 
3176 		mac_capab_lso_t *lso = cap_data;
3177 		lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
3178 		lso->lso_basic_tcp_ipv4.lso_max = IP_MAXPACKET;
3179 		break;
3180 	}
3181 	default:
3182 		return (B_FALSE);
3183 	}
3184 
3185 	return (B_TRUE);
3186 }
3187 
3188 /*
3189  * The state of the peer has changed - react accordingly.
3190  */
3191 static void
3192 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
3193     void *arg, void *impl_data)
3194 {
3195 	_NOTE(ARGUNUSED(id, arg));
3196 	xnf_t *xnfp = ddi_get_driver_private(dip);
3197 	XenbusState new_state = *(XenbusState *)impl_data;
3198 
3199 	ASSERT(xnfp != NULL);
3200 
3201 	switch (new_state) {
3202 	case XenbusStateUnknown:
3203 	case XenbusStateInitialising:
3204 	case XenbusStateInitialised:
3205 	case XenbusStateClosing:
3206 	case XenbusStateClosed:
3207 	case XenbusStateReconfiguring:
3208 	case XenbusStateReconfigured:
3209 		break;
3210 
3211 	case XenbusStateInitWait:
3212 		xnf_read_config(xnfp);
3213 
3214 		if (!xnfp->xnf_be_rx_copy) {
3215 			cmn_err(CE_WARN,
3216 			    "The xnf driver requires a dom0 that "
3217 			    "supports 'feature-rx-copy'.");
3218 			(void) xvdi_switch_state(xnfp->xnf_devinfo,
3219 			    XBT_NULL, XenbusStateClosed);
3220 			break;
3221 		}
3222 
3223 		/*
3224 		 * Connect to the backend.
3225 		 */
3226 		xnf_be_connect(xnfp);
3227 
3228 		/*
3229 		 * Our MAC address as discovered by xnf_read_config().
3230 		 */
3231 		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
3232 
3233 		/*
3234 		 * We do not know if some features such as LSO are supported
3235 		 * until we connect to the backend. We request the MAC layer
3236 		 * to poll our capabilities again.
3237 		 */
3238 		mac_capab_update(xnfp->xnf_mh);
3239 
3240 		break;
3241 
3242 	case XenbusStateConnected:
3243 		mutex_enter(&xnfp->xnf_rxlock);
3244 		mutex_enter(&xnfp->xnf_txlock);
3245 
3246 		xnfp->xnf_connected = B_TRUE;
3247 		/*
3248 		 * Wake up any threads waiting to send data to
3249 		 * backend.
3250 		 */
3251 		cv_broadcast(&xnfp->xnf_cv_state);
3252 
3253 		mutex_exit(&xnfp->xnf_txlock);
3254 		mutex_exit(&xnfp->xnf_rxlock);
3255 
3256 		/*
3257 		 * Kick the peer in case it missed any transmits
3258 		 * request in the TX ring.
3259 		 */
3260 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
3261 
3262 		/*
3263 		 * There may already be completed receive requests in
3264 		 * the ring sent by backend after it gets connected
3265 		 * but before we see its state change here, so we call
3266 		 * xnf_intr() to handle them, if any.
3267 		 */
3268 		(void) xnf_intr((caddr_t)xnfp);
3269 
3270 		/*
3271 		 * Mark the link up now that we are connected.
3272 		 */
3273 		mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
3274 
3275 		/*
3276 		 * Tell the backend about the multicast addresses in
3277 		 * which we are interested.
3278 		 */
3279 		mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
3280 
3281 		break;
3282 
3283 	default:
3284 		break;
3285 	}
3286 }
3287