xref: /illumos-gate/usr/src/uts/common/xen/io/xnf.c (revision a92282e44f968185a6bba094d1e5fece2da819cf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
29  * Copyright 2020 RackTop Systems, Inc.
30  */
31 
32 /*
33  *
34  * Copyright (c) 2004 Christian Limpach.
35  * All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. This section intentionally left blank.
46  * 4. The name of the author may not be used to endorse or promote products
47  *    derived from this software without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
50  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
51  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
52  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
53  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
54  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
55  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
56  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
57  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
58  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
59  */
60 /*
61  * Section 3 of the above license was updated in response to bug 6379571.
62  */
63 
64 /*
65  * xnf.c - GLDv3 network driver for domU.
66  */
67 
68 /*
69  * This driver uses four per-instance locks:
70  *
71  * xnf_gref_lock:
72  *
73  *    Protects access to the grant reference list stored in
74  *    xnf_gref_head. Grant references should be acquired and released
75  *    using gref_get() and gref_put() respectively.
76  *
77  * xnf_schedlock:
78  *
79  *    Protects:
80  *    xnf_need_sched - used to record that a previous transmit attempt
81  *       failed (and consequently it will be necessary to call
82  *       mac_tx_update() when transmit resources are available).
83  *    xnf_pending_multicast - the number of multicast requests that
84  *       have been submitted to the backend for which we have not
85  *       processed responses.
86  *
87  * xnf_txlock:
88  *
89  *    Protects the transmit ring (xnf_tx_ring) and associated
90  *    structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head).
91  *
92  * xnf_rxlock:
93  *
94  *    Protects the receive ring (xnf_rx_ring) and associated
95  *    structures (notably xnf_rx_pkt_info).
96  *
97  * If driver-global state that affects both the transmit and receive
98  * rings is manipulated, both xnf_txlock and xnf_rxlock should be
99  * held, in that order.
100  *
101  * xnf_schedlock is acquired both whilst holding xnf_txlock and
102  * without. It should always be acquired after xnf_txlock if both are
103  * held.
104  *
105  * Notes:
106  * - atomic_add_64() is used to manipulate counters where we require
107  *   accuracy. For counters intended only for observation by humans,
108  *   post increment/decrement are used instead.
109  */
110 
111 #include <sys/types.h>
112 #include <sys/errno.h>
113 #include <sys/param.h>
114 #include <sys/sysmacros.h>
115 #include <sys/systm.h>
116 #include <sys/stream.h>
117 #include <sys/strsubr.h>
118 #include <sys/strsun.h>
119 #include <sys/conf.h>
120 #include <sys/ddi.h>
121 #include <sys/devops.h>
122 #include <sys/sunddi.h>
123 #include <sys/sunndi.h>
124 #include <sys/dlpi.h>
125 #include <sys/ethernet.h>
126 #include <sys/strsun.h>
127 #include <sys/pattr.h>
128 #include <inet/ip.h>
129 #include <inet/ip_impl.h>
130 #include <inet/tcp.h>
131 #include <netinet/udp.h>
132 #include <sys/gld.h>
133 #include <sys/modctl.h>
134 #include <sys/mac_provider.h>
135 #include <sys/mac_ether.h>
136 #include <sys/bootinfo.h>
137 #include <sys/mach_mmu.h>
138 #ifdef	XPV_HVM_DRIVER
139 #include <sys/xpv_support.h>
140 #include <sys/hypervisor.h>
141 #else
142 #include <sys/hypervisor.h>
143 #include <sys/evtchn_impl.h>
144 #include <sys/balloon_impl.h>
145 #endif
146 #include <xen/public/io/netif.h>
147 #include <sys/gnttab.h>
148 #include <xen/sys/xendev.h>
149 #include <sys/sdt.h>
150 #include <sys/note.h>
151 #include <sys/debug.h>
152 
153 #include <io/xnf.h>
154 
/*
 * Convert a byte address to a page number.
 *
 * On a 32 bit PAE system physical and machine addresses are wider
 * than 32 bits, but ddi_btop() on such systems takes an unsigned long
 * argument, so an address above 4G would be truncated before
 * ddi_btop() ever saw it.  Coding the shift here avoids that.
 */
#define	xnf_btop(addr)	((addr) >> PAGESHIFT)
162 
163 /*
164  * The parameters below should only be changed in /etc/system, never in mdb.
165  */
166 
167 /*
168  * Should we use the multicast control feature if the backend provides
169  * it?
170  */
171 boolean_t xnf_multicast_control = B_TRUE;
172 
173 /*
174  * Should we allow scatter-gather for tx if backend allows it?
175  */
176 boolean_t xnf_enable_tx_sg = B_TRUE;
177 
178 /*
179  * Should we allow scatter-gather for rx if backend allows it?
180  */
181 boolean_t xnf_enable_rx_sg = B_TRUE;
182 
183 /*
184  * Should we allow lso for tx sends if backend allows it?
185  * Requires xnf_enable_tx_sg to be also set to TRUE.
186  */
187 boolean_t xnf_enable_lso = B_TRUE;
188 
189 /*
190  * Should we allow lro on rx if backend supports it?
191  * Requires xnf_enable_rx_sg to be also set to TRUE.
192  *
193  * !! WARNING !!
194  * LRO is not yet supported in the OS so this should be left as FALSE.
195  * !! WARNING !!
196  */
197 boolean_t xnf_enable_lro = B_FALSE;
198 
199 /*
200  * Received packets below this size are copied to a new streams buffer
201  * rather than being desballoc'ed.
202  *
203  * This value is chosen to accommodate traffic where there are a large
204  * number of small packets. For data showing a typical distribution,
205  * see:
206  *
207  * Sinha07a:
208  *	Rishi Sinha, Christos Papadopoulos, and John
209  *	Heidemann. Internet Packet Size Distributions: Some
210  *	Observations. Technical Report ISI-TR-2007-643,
211  *	USC/Information Sciences Institute, May, 2007. Orignally
212  *	released October 2005 as web page
213  *	http://netweb.usc.edu/~sinha/pkt-sizes/.
214  *	<http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
215  */
216 size_t xnf_rx_copy_limit = 64;
217 
/* Sentinels meaning "no grant handle", "no grant reference", "no tx id". */
#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define	INVALID_GRANT_REF	((grant_ref_t)-1)
#define	INVALID_TX_ID		((uint16_t)-1)

/* Address of the xnf_tx_pkt_id[] entry for transmit id `id' of `p'. */
#define	TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
/* True when `i' is not the sentinel and is below NET_TX_RING_SIZE. */
#define	TX_ID_VALID(i) \
	(((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))

/*
 * Calculate how many pages are spanned by an mblk fragment; a
 * fragment of length zero spans none.
 *
 * The argument is parenthesised at each use so that any pointer
 * expression (for example `&blk') may be passed.  Note that `mp' is
 * evaluated more than once, so it must not have side effects.
 */
#define	xnf_mblk_pages(mp)	(MBLKL(mp) == 0 ? 0 : \
    xnf_btop((uintptr_t)(mp)->b_wptr - 1) - \
    xnf_btop((uintptr_t)(mp)->b_rptr) + 1)
231 
232 /* Required system entry points */
233 static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
234 static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);
235 
236 /* Required driver entry points for Nemo */
237 static int	xnf_start(void *);
238 static void	xnf_stop(void *);
239 static int	xnf_set_mac_addr(void *, const uint8_t *);
240 static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
241 static int	xnf_set_promiscuous(void *, boolean_t);
242 static mblk_t	*xnf_send(void *, mblk_t *);
243 static uint_t	xnf_intr(caddr_t);
244 static int	xnf_stat(void *, uint_t, uint64_t *);
245 static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
246 static int xnf_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
247 static int xnf_setprop(void *, const char *, mac_prop_id_t, uint_t,
248     const void *);
249 static void xnf_propinfo(void *, const char *, mac_prop_id_t,
250     mac_prop_info_handle_t);
251 
252 /* Driver private functions */
253 static int xnf_alloc_dma_resources(xnf_t *);
254 static void xnf_release_dma_resources(xnf_t *);
255 static void xnf_release_mblks(xnf_t *);
256 
257 static int xnf_buf_constructor(void *, void *, int);
258 static void xnf_buf_destructor(void *, void *);
259 static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
260 #pragma inline(xnf_buf_get)
261 static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
262 #pragma inline(xnf_buf_put)
263 static void xnf_buf_refresh(xnf_buf_t *);
264 #pragma inline(xnf_buf_refresh)
265 static void xnf_buf_recycle(xnf_buf_t *);
266 
267 static int xnf_tx_buf_constructor(void *, void *, int);
268 static void xnf_tx_buf_destructor(void *, void *);
269 
270 static grant_ref_t xnf_gref_get(xnf_t *);
271 #pragma inline(xnf_gref_get)
272 static void xnf_gref_put(xnf_t *, grant_ref_t);
273 #pragma inline(xnf_gref_put)
274 
275 static xnf_txid_t *xnf_txid_get(xnf_t *);
276 #pragma inline(xnf_txid_get)
277 static void xnf_txid_put(xnf_t *, xnf_txid_t *);
278 #pragma inline(xnf_txid_put)
279 
280 static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
281 static int xnf_tx_clean_ring(xnf_t  *);
282 static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
283     void *, void *);
284 static boolean_t xnf_kstat_init(xnf_t *);
285 static void xnf_rx_collect(xnf_t *);
286 
287 #define	XNF_CALLBACK_FLAGS	(MC_GETCAPAB | MC_PROPERTIES)
288 
289 static mac_callbacks_t xnf_callbacks = {
290 	.mc_callbacks = XNF_CALLBACK_FLAGS,
291 	.mc_getstat = xnf_stat,
292 	.mc_start = xnf_start,
293 	.mc_stop = xnf_stop,
294 	.mc_setpromisc = xnf_set_promiscuous,
295 	.mc_multicst = xnf_set_multicast,
296 	.mc_unicst = xnf_set_mac_addr,
297 	.mc_tx = xnf_send,
298 	.mc_getcapab = xnf_getcapab,
299 	.mc_setprop = xnf_setprop,
300 	.mc_getprop = xnf_getprop,
301 	.mc_propinfo = xnf_propinfo,
302 };
303 
304 /* DMA attributes for network ring buffer */
305 static ddi_dma_attr_t ringbuf_dma_attr = {
306 	.dma_attr_version = DMA_ATTR_V0,
307 	.dma_attr_addr_lo = 0,
308 	.dma_attr_addr_hi = 0xffffffffffffffffULL,
309 	.dma_attr_count_max = 0x7fffffff,
310 	.dma_attr_align = MMU_PAGESIZE,
311 	.dma_attr_burstsizes = 0x7ff,
312 	.dma_attr_minxfer = 1,
313 	.dma_attr_maxxfer = 0xffffffffU,
314 	.dma_attr_seg = 0xffffffffffffffffULL,
315 	.dma_attr_sgllen = 1,
316 	.dma_attr_granular = 1,
317 	.dma_attr_flags = 0
318 };
319 
320 /* DMA attributes for receive data */
321 static ddi_dma_attr_t rx_buf_dma_attr = {
322 	.dma_attr_version = DMA_ATTR_V0,
323 	.dma_attr_addr_lo = 0,
324 	.dma_attr_addr_hi = 0xffffffffffffffffULL,
325 	.dma_attr_count_max = MMU_PAGEOFFSET,
326 	.dma_attr_align = MMU_PAGESIZE, /* allocation alignment */
327 	.dma_attr_burstsizes = 0x7ff,
328 	.dma_attr_minxfer = 1,
329 	.dma_attr_maxxfer = 0xffffffffU,
330 	.dma_attr_seg = 0xffffffffffffffffULL,
331 	.dma_attr_sgllen = 1,
332 	.dma_attr_granular = 1,
333 	.dma_attr_flags = 0
334 };
335 
336 /* DMA attributes for transmit data */
337 static ddi_dma_attr_t tx_buf_dma_attr = {
338 	.dma_attr_version = DMA_ATTR_V0,
339 	.dma_attr_addr_lo = 0,
340 	.dma_attr_addr_hi = 0xffffffffffffffffULL,
341 	.dma_attr_count_max = MMU_PAGEOFFSET,
342 	.dma_attr_align = 1,
343 	.dma_attr_burstsizes = 0x7ff,
344 	.dma_attr_minxfer = 1,
345 	.dma_attr_maxxfer = 0xffffffffU,
346 	.dma_attr_seg = XEN_DATA_BOUNDARY - 1, /* segment boundary */
347 	.dma_attr_sgllen = XEN_MAX_TX_DATA_PAGES, /* max number of segments */
348 	.dma_attr_granular = 1,
349 	.dma_attr_flags = 0
350 };
351 
352 /* DMA access attributes for registers and descriptors */
353 static ddi_device_acc_attr_t accattr = {
354 	DDI_DEVICE_ATTR_V0,
355 	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
356 	DDI_STRICTORDER_ACC
357 };
358 
359 /* DMA access attributes for data: NOT to be byte swapped. */
360 static ddi_device_acc_attr_t data_accattr = {
361 	DDI_DEVICE_ATTR_V0,
362 	DDI_NEVERSWAP_ACC,
363 	DDI_STRICTORDER_ACC
364 };
365 
366 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
367     nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);
368 
369 static struct modldrv xnf_modldrv = {
370 	&mod_driverops,
371 	"Virtual Ethernet driver",
372 	&xnf_dev_ops
373 };
374 
375 static struct modlinkage modlinkage = {
376 	MODREV_1, &xnf_modldrv, NULL
377 };
378 
379 int
380 _init(void)
381 {
382 	int r;
383 
384 	mac_init_ops(&xnf_dev_ops, "xnf");
385 	r = mod_install(&modlinkage);
386 	if (r != DDI_SUCCESS)
387 		mac_fini_ops(&xnf_dev_ops);
388 
389 	return (r);
390 }
391 
/*
 * Module unload entry point.  Unloading is not implemented, so the
 * request is always refused with EBUSY.
 */
int
_fini(void)
{
	/* XXPV should be removable */
	return (EBUSY);
}
397 
398 int
399 _info(struct modinfo *modinfop)
400 {
401 	return (mod_info(&modlinkage, modinfop));
402 }
403 
404 /*
405  * Acquire a grant reference.
406  */
407 static grant_ref_t
408 xnf_gref_get(xnf_t *xnfp)
409 {
410 	grant_ref_t gref;
411 
412 	mutex_enter(&xnfp->xnf_gref_lock);
413 
414 	do {
415 		gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head);
416 
417 	} while ((gref == INVALID_GRANT_REF) &&
418 	    (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0));
419 
420 	mutex_exit(&xnfp->xnf_gref_lock);
421 
422 	if (gref == INVALID_GRANT_REF) {
423 		xnfp->xnf_stat_gref_failure++;
424 	} else {
425 		atomic_inc_64(&xnfp->xnf_stat_gref_outstanding);
426 		if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak)
427 			xnfp->xnf_stat_gref_peak =
428 			    xnfp->xnf_stat_gref_outstanding;
429 	}
430 
431 	return (gref);
432 }
433 
434 /*
435  * Release a grant reference.
436  */
437 static void
438 xnf_gref_put(xnf_t *xnfp, grant_ref_t gref)
439 {
440 	ASSERT(gref != INVALID_GRANT_REF);
441 
442 	mutex_enter(&xnfp->xnf_gref_lock);
443 	gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref);
444 	mutex_exit(&xnfp->xnf_gref_lock);
445 
446 	atomic_dec_64(&xnfp->xnf_stat_gref_outstanding);
447 }
448 
449 /*
450  * Acquire a transmit id.
451  */
452 static xnf_txid_t *
453 xnf_txid_get(xnf_t *xnfp)
454 {
455 	xnf_txid_t *tidp;
456 
457 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
458 
459 	if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID)
460 		return (NULL);
461 
462 	ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head));
463 
464 	tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head);
465 	xnfp->xnf_tx_pkt_id_head = tidp->next;
466 	tidp->next = INVALID_TX_ID;
467 
468 	ASSERT(tidp->txbuf == NULL);
469 
470 	return (tidp);
471 }
472 
473 /*
474  * Release a transmit id.
475  */
476 static void
477 xnf_txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
478 {
479 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
480 	ASSERT(TX_ID_VALID(tidp->id));
481 	ASSERT(tidp->next == INVALID_TX_ID);
482 
483 	tidp->txbuf = NULL;
484 	tidp->next = xnfp->xnf_tx_pkt_id_head;
485 	xnfp->xnf_tx_pkt_id_head = tidp->id;
486 }
487 
488 static void
489 xnf_data_txbuf_free(xnf_t *xnfp, xnf_txbuf_t *txp)
490 {
491 	ASSERT3U(txp->tx_type, ==, TX_DATA);
492 
493 	/*
494 	 * We are either using a lookaside buffer or we are mapping existing
495 	 * buffers.
496 	 */
497 	if (txp->tx_bdesc != NULL) {
498 		ASSERT(!txp->tx_handle_bound);
499 		xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
500 	} else {
501 		if (txp->tx_txreq.gref != INVALID_GRANT_REF) {
502 			if (gnttab_query_foreign_access(txp->tx_txreq.gref) !=
503 			    0) {
504 				cmn_err(CE_PANIC, "tx grant %d still in use by "
505 				    "backend domain", txp->tx_txreq.gref);
506 			}
507 			(void) gnttab_end_foreign_access_ref(
508 			    txp->tx_txreq.gref, 1);
509 			xnf_gref_put(xnfp, txp->tx_txreq.gref);
510 		}
511 
512 		if (txp->tx_handle_bound)
513 			(void) ddi_dma_unbind_handle(txp->tx_dma_handle);
514 	}
515 
516 	if (txp->tx_mp != NULL)
517 		freemsg(txp->tx_mp);
518 
519 	if (txp->tx_prev != NULL) {
520 		ASSERT3P(txp->tx_prev->tx_next, ==, txp);
521 		txp->tx_prev->tx_next = NULL;
522 	}
523 
524 	if (txp->tx_txreq.id != INVALID_TX_ID) {
525 		/*
526 		 * This should be only possible when resuming from a suspend.
527 		 */
528 		ASSERT(!xnfp->xnf_connected);
529 		xnf_txid_put(xnfp, TX_ID_TO_TXID(xnfp, txp->tx_txreq.id));
530 		txp->tx_txreq.id = INVALID_TX_ID;
531 	}
532 
533 	kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
534 }
535 
536 static void
537 xnf_data_txbuf_free_chain(xnf_t *xnfp, xnf_txbuf_t *txp)
538 {
539 	if (txp == NULL)
540 		return;
541 
542 	while (txp->tx_next != NULL)
543 		txp = txp->tx_next;
544 
545 	/*
546 	 * We free the chain in reverse order so that grants can be released
547 	 * for all dma chunks before unbinding the dma handles. The mblk is
548 	 * freed last, after all its fragments' dma handles are unbound.
549 	 */
550 	xnf_txbuf_t *prev;
551 	for (; txp != NULL; txp = prev) {
552 		prev = txp->tx_prev;
553 		xnf_data_txbuf_free(xnfp, txp);
554 	}
555 }
556 
557 static xnf_txbuf_t *
558 xnf_data_txbuf_alloc(xnf_t *xnfp, int flag)
559 {
560 	xnf_txbuf_t *txp;
561 
562 	if ((txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, flag)) == NULL) {
563 		return (NULL);
564 	}
565 
566 	txp->tx_type = TX_DATA;
567 	txp->tx_next = NULL;
568 	txp->tx_prev = NULL;
569 	txp->tx_head = txp;
570 	txp->tx_frags_to_ack = 0;
571 	txp->tx_mp = NULL;
572 	txp->tx_bdesc = NULL;
573 	txp->tx_handle_bound = B_FALSE;
574 	txp->tx_txreq.gref = INVALID_GRANT_REF;
575 	txp->tx_txreq.id = INVALID_TX_ID;
576 
577 	return (txp);
578 }
579 
580 /*
581  * Get `wanted' slots in the transmit ring, waiting for at least that
582  * number if `wait' is B_TRUE. Force the ring to be cleaned by setting
583  * `wanted' to zero.
584  *
585  * Return the number of slots available.
586  */
587 static int
588 xnf_tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
589 {
590 	int slotsfree;
591 	boolean_t forced_clean = (wanted == 0);
592 
593 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
594 
595 	/* LINTED: constant in conditional context */
596 	while (B_TRUE) {
597 		slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring);
598 
599 		if ((slotsfree < wanted) || forced_clean)
600 			slotsfree = xnf_tx_clean_ring(xnfp);
601 
602 		/*
603 		 * If there are more than we need free, tell other
604 		 * people to come looking again. We hold txlock, so we
605 		 * are able to take our slots before anyone else runs.
606 		 */
607 		if (slotsfree > wanted)
608 			cv_broadcast(&xnfp->xnf_cv_tx_slots);
609 
610 		if (slotsfree >= wanted)
611 			break;
612 
613 		if (!wait)
614 			break;
615 
616 		cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock);
617 	}
618 
619 	ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring)));
620 
621 	return (slotsfree);
622 }
623 
624 static int
625 xnf_setup_rings(xnf_t *xnfp)
626 {
627 	domid_t			oeid;
628 	struct xenbus_device	*xsd;
629 	RING_IDX		i;
630 	int			err;
631 	xnf_txid_t		*tidp;
632 	xnf_buf_t **bdescp;
633 
634 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
635 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
636 
637 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
638 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
639 
640 	err = gnttab_grant_foreign_access(oeid,
641 	    xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
642 	if (err <= 0) {
643 		err = -err;
644 		xenbus_dev_error(xsd, err, "granting access to tx ring page");
645 		goto out;
646 	}
647 	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
648 
649 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
650 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
651 
652 	err = gnttab_grant_foreign_access(oeid,
653 	    xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
654 	if (err <= 0) {
655 		err = -err;
656 		xenbus_dev_error(xsd, err, "granting access to rx ring page");
657 		goto out;
658 	}
659 	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
660 
661 	mutex_enter(&xnfp->xnf_txlock);
662 
663 	/*
664 	 * We first cleanup the TX ring in case we are doing a resume.
665 	 * Note that this can lose packets, but we expect to stagger on.
666 	 */
667 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. */
668 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
669 	    i < NET_TX_RING_SIZE;
670 	    i++, tidp++) {
671 		xnf_txbuf_t *txp = tidp->txbuf;
672 		if (txp == NULL)
673 			continue;
674 
675 		switch (txp->tx_type) {
676 		case TX_DATA:
677 			/*
678 			 * txid_put() will be called for each txbuf's txid in
679 			 * the chain which will result in clearing tidp->txbuf.
680 			 */
681 			xnf_data_txbuf_free_chain(xnfp, txp);
682 
683 			break;
684 
685 		case TX_MCAST_REQ:
686 			txp->tx_type = TX_MCAST_RSP;
687 			txp->tx_status = NETIF_RSP_DROPPED;
688 			cv_broadcast(&xnfp->xnf_cv_multicast);
689 
690 			/*
691 			 * The request consumed two slots in the ring,
692 			 * yet only a single xnf_txid_t is used. Step
693 			 * over the empty slot.
694 			 */
695 			i++;
696 			ASSERT3U(i, <, NET_TX_RING_SIZE);
697 			break;
698 
699 		case TX_MCAST_RSP:
700 			break;
701 		}
702 	}
703 
704 	/*
705 	 * Now purge old list and add each txid to the new free list.
706 	 */
707 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. */
708 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
709 	    i < NET_TX_RING_SIZE;
710 	    i++, tidp++) {
711 		tidp->id = i;
712 		ASSERT3P(tidp->txbuf, ==, NULL);
713 		tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
714 		xnf_txid_put(xnfp, tidp);
715 	}
716 
717 	/* LINTED: constant in conditional context */
718 	SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
719 	/* LINTED: constant in conditional context */
720 	FRONT_RING_INIT(&xnfp->xnf_tx_ring,
721 	    xnfp->xnf_tx_ring.sring, PAGESIZE);
722 
723 	mutex_exit(&xnfp->xnf_txlock);
724 
725 	mutex_enter(&xnfp->xnf_rxlock);
726 
727 	/*
728 	 * Clean out any buffers currently posted to the receive ring
729 	 * before we reset it.
730 	 */
731 	for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0];
732 	    i < NET_RX_RING_SIZE;
733 	    i++, bdescp++) {
734 		if (*bdescp != NULL) {
735 			xnf_buf_put(xnfp, *bdescp, B_FALSE);
736 			*bdescp = NULL;
737 		}
738 	}
739 
740 	/* LINTED: constant in conditional context */
741 	SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
742 	/* LINTED: constant in conditional context */
743 	FRONT_RING_INIT(&xnfp->xnf_rx_ring,
744 	    xnfp->xnf_rx_ring.sring, PAGESIZE);
745 
746 	/*
747 	 * Fill the ring with buffers.
748 	 */
749 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
750 		xnf_buf_t *bdesc;
751 
752 		bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE);
753 		VERIFY(bdesc != NULL);
754 		xnf_rxbuf_hang(xnfp, bdesc);
755 	}
756 
757 	/* LINTED: constant in conditional context */
758 	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
759 
760 	mutex_exit(&xnfp->xnf_rxlock);
761 
762 	return (0);
763 
764 out:
765 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
766 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
767 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
768 
769 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
770 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
771 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
772 
773 	return (err);
774 }
775 
776 /*
777  * Connect driver to back end, called to set up communication with
778  * back end driver both initially and on resume after restore/migrate.
779  */
780 void
781 xnf_be_connect(xnf_t *xnfp)
782 {
783 	const char	*message;
784 	xenbus_transaction_t xbt;
785 	struct		xenbus_device *xsd;
786 	char		*xsname;
787 	int		err;
788 
789 	ASSERT(!xnfp->xnf_connected);
790 
791 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
792 	xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
793 
794 	err = xnf_setup_rings(xnfp);
795 	if (err != 0) {
796 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
797 		xenbus_dev_error(xsd, err, "setting up ring");
798 		return;
799 	}
800 
801 again:
802 	err = xenbus_transaction_start(&xbt);
803 	if (err != 0) {
804 		xenbus_dev_error(xsd, EIO, "starting transaction");
805 		return;
806 	}
807 
808 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
809 	    xnfp->xnf_tx_ring_ref);
810 	if (err != 0) {
811 		message = "writing tx ring-ref";
812 		goto abort_transaction;
813 	}
814 
815 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
816 	    xnfp->xnf_rx_ring_ref);
817 	if (err != 0) {
818 		message = "writing rx ring-ref";
819 		goto abort_transaction;
820 	}
821 
822 	err = xenbus_printf(xbt, xsname, "event-channel", "%u",
823 	    xnfp->xnf_evtchn);
824 	if (err != 0) {
825 		message = "writing event-channel";
826 		goto abort_transaction;
827 	}
828 
829 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
830 	if (err != 0) {
831 		message = "writing feature-rx-notify";
832 		goto abort_transaction;
833 	}
834 
835 	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1);
836 	if (err != 0) {
837 		message = "writing request-rx-copy";
838 		goto abort_transaction;
839 	}
840 
841 	if (xnfp->xnf_be_mcast_control) {
842 		err = xenbus_printf(xbt, xsname, "request-multicast-control",
843 		    "%d", 1);
844 		if (err != 0) {
845 			message = "writing request-multicast-control";
846 			goto abort_transaction;
847 		}
848 	}
849 
850 	/*
851 	 * Tell backend if we support scatter-gather lists on the rx side.
852 	 */
853 	err = xenbus_printf(xbt, xsname, "feature-sg", "%d",
854 	    xnf_enable_rx_sg ? 1 : 0);
855 	if (err != 0) {
856 		message = "writing feature-sg";
857 		goto abort_transaction;
858 	}
859 
860 	/*
861 	 * Tell backend if we support LRO for IPv4. Scatter-gather on rx is
862 	 * a prerequisite.
863 	 */
864 	err = xenbus_printf(xbt, xsname, "feature-gso-tcpv4", "%d",
865 	    (xnf_enable_rx_sg && xnf_enable_lro) ? 1 : 0);
866 	if (err != 0) {
867 		message = "writing feature-gso-tcpv4";
868 		goto abort_transaction;
869 	}
870 
871 	err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
872 	if (err != 0) {
873 		message = "switching state to XenbusStateConnected";
874 		goto abort_transaction;
875 	}
876 
877 	err = xenbus_transaction_end(xbt, 0);
878 	if (err != 0) {
879 		if (err == EAGAIN)
880 			goto again;
881 		xenbus_dev_error(xsd, err, "completing transaction");
882 	}
883 
884 	return;
885 
886 abort_transaction:
887 	(void) xenbus_transaction_end(xbt, 1);
888 	xenbus_dev_error(xsd, err, "%s", message);
889 }
890 
891 /*
892  * Read configuration information from xenstore.
893  */
894 void
895 xnf_read_config(xnf_t *xnfp)
896 {
897 	int err, be_cap;
898 	char mac[ETHERADDRL * 3];
899 	char *oename = xvdi_get_oename(xnfp->xnf_devinfo);
900 
901 	err = xenbus_scanf(XBT_NULL, oename, "mac",
902 	    "%s", (char *)&mac[0]);
903 	if (err != 0) {
904 		/*
905 		 * bad: we're supposed to be set up with a proper mac
906 		 * addr. at this point
907 		 */
908 		cmn_err(CE_WARN, "%s%d: no mac address",
909 		    ddi_driver_name(xnfp->xnf_devinfo),
910 		    ddi_get_instance(xnfp->xnf_devinfo));
911 			return;
912 	}
913 	if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
914 		err = ENOENT;
915 		xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
916 		    "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
917 		return;
918 	}
919 
920 	err = xenbus_scanf(XBT_NULL, oename,
921 	    "feature-rx-copy", "%d", &be_cap);
922 	/*
923 	 * If we fail to read the store we assume that the key is
924 	 * absent, implying an older domain at the far end.  Older
925 	 * domains cannot do HV copy.
926 	 */
927 	if (err != 0)
928 		be_cap = 0;
929 	xnfp->xnf_be_rx_copy = (be_cap != 0);
930 
931 	err = xenbus_scanf(XBT_NULL, oename,
932 	    "feature-multicast-control", "%d", &be_cap);
933 	/*
934 	 * If we fail to read the store we assume that the key is
935 	 * absent, implying an older domain at the far end.  Older
936 	 * domains do not support multicast control.
937 	 */
938 	if (err != 0)
939 		be_cap = 0;
940 	xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
941 
942 	/*
943 	 * See if back-end supports scatter-gather for transmits. If not,
944 	 * we will not support LSO and limit the mtu to 1500.
945 	 */
946 	err = xenbus_scanf(XBT_NULL, oename, "feature-sg", "%d", &be_cap);
947 	if (err != 0) {
948 		be_cap = 0;
949 		dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
950 		    "'feature-sg' from backend driver");
951 	}
952 	if (be_cap == 0) {
953 		dev_err(xnfp->xnf_devinfo, CE_WARN, "scatter-gather is not "
954 		    "supported for transmits in the backend driver. LSO is "
955 		    "disabled and MTU is restricted to 1500 bytes.");
956 	}
957 	xnfp->xnf_be_tx_sg = (be_cap != 0) && xnf_enable_tx_sg;
958 
959 	if (xnfp->xnf_be_tx_sg) {
960 		/*
961 		 * Check if LSO is supported. Currently we only check for
962 		 * IPv4 as Illumos doesn't support LSO for IPv6.
963 		 */
964 		err = xenbus_scanf(XBT_NULL, oename, "feature-gso-tcpv4", "%d",
965 		    &be_cap);
966 		if (err != 0) {
967 			be_cap = 0;
968 			dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
969 			    "'feature-gso-tcpv4' from backend driver");
970 		}
971 		if (be_cap == 0) {
972 			dev_err(xnfp->xnf_devinfo, CE_WARN, "LSO is not "
973 			    "supported by the backend driver. Performance "
974 			    "will be affected.");
975 		}
976 		xnfp->xnf_be_lso = (be_cap != 0) && xnf_enable_lso;
977 	}
978 }
979 
980 /*
981  *  attach(9E) -- Attach a device to the system
982  */
983 static int
984 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
985 {
986 	mac_register_t *macp;
987 	xnf_t *xnfp;
988 	int err;
989 	char cachename[32];
990 
991 	switch (cmd) {
992 	case DDI_RESUME:
993 		xnfp = ddi_get_driver_private(devinfo);
994 		xnfp->xnf_gen++;
995 
996 		(void) xvdi_resume(devinfo);
997 		(void) xvdi_alloc_evtchn(devinfo);
998 		xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
999 #ifdef XPV_HVM_DRIVER
1000 		ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
1001 		    xnfp);
1002 #else
1003 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
1004 		    (caddr_t)xnfp);
1005 #endif
1006 		return (DDI_SUCCESS);
1007 
1008 	case DDI_ATTACH:
1009 		break;
1010 
1011 	default:
1012 		return (DDI_FAILURE);
1013 	}
1014 
1015 	/*
1016 	 *  Allocate gld_mac_info_t and xnf_instance structures
1017 	 */
1018 	macp = mac_alloc(MAC_VERSION);
1019 	if (macp == NULL)
1020 		return (DDI_FAILURE);
1021 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
1022 
1023 	xnfp->xnf_tx_pkt_id =
1024 	    kmem_zalloc(sizeof (xnf_txid_t) * NET_TX_RING_SIZE, KM_SLEEP);
1025 
1026 	xnfp->xnf_rx_pkt_info =
1027 	    kmem_zalloc(sizeof (xnf_buf_t *) * NET_RX_RING_SIZE, KM_SLEEP);
1028 
1029 	macp->m_dip = devinfo;
1030 	macp->m_driver = xnfp;
1031 	xnfp->xnf_devinfo = devinfo;
1032 
1033 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1034 	macp->m_src_addr = xnfp->xnf_mac_addr;
1035 	macp->m_callbacks = &xnf_callbacks;
1036 	macp->m_min_sdu = 0;
1037 	xnfp->xnf_mtu = ETHERMTU;
1038 	macp->m_max_sdu = xnfp->xnf_mtu;
1039 
1040 	xnfp->xnf_running = B_FALSE;
1041 	xnfp->xnf_connected = B_FALSE;
1042 	xnfp->xnf_be_rx_copy = B_FALSE;
1043 	xnfp->xnf_be_mcast_control = B_FALSE;
1044 	xnfp->xnf_need_sched = B_FALSE;
1045 
1046 	xnfp->xnf_rx_head = NULL;
1047 	xnfp->xnf_rx_tail = NULL;
1048 	xnfp->xnf_rx_new_buffers_posted = B_FALSE;
1049 
1050 #ifdef XPV_HVM_DRIVER
1051 	/* Report our version to dom0 */
1052 	(void) xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
1053 	    HVMPV_XNF_VERS);
1054 #endif
1055 
1056 	/*
1057 	 * Get the iblock cookie with which to initialize the mutexes.
1058 	 */
1059 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
1060 	    != DDI_SUCCESS)
1061 		goto failure;
1062 
1063 	mutex_init(&xnfp->xnf_txlock,
1064 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1065 	mutex_init(&xnfp->xnf_rxlock,
1066 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1067 	mutex_init(&xnfp->xnf_schedlock,
1068 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1069 	mutex_init(&xnfp->xnf_gref_lock,
1070 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1071 
1072 	cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL);
1073 	cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL);
1074 	cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL);
1075 
1076 	(void) sprintf(cachename, "xnf_buf_cache_%d",
1077 	    ddi_get_instance(devinfo));
1078 	xnfp->xnf_buf_cache = kmem_cache_create(cachename,
1079 	    sizeof (xnf_buf_t), 0,
1080 	    xnf_buf_constructor, xnf_buf_destructor,
1081 	    NULL, xnfp, NULL, 0);
1082 	if (xnfp->xnf_buf_cache == NULL)
1083 		goto failure_0;
1084 
1085 	(void) sprintf(cachename, "xnf_tx_buf_cache_%d",
1086 	    ddi_get_instance(devinfo));
1087 	xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename,
1088 	    sizeof (xnf_txbuf_t), 0,
1089 	    xnf_tx_buf_constructor, xnf_tx_buf_destructor,
1090 	    NULL, xnfp, NULL, 0);
1091 	if (xnfp->xnf_tx_buf_cache == NULL)
1092 		goto failure_1;
1093 
1094 	xnfp->xnf_gref_head = INVALID_GRANT_REF;
1095 
1096 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
1097 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
1098 		    "driver data structures",
1099 		    ddi_get_instance(xnfp->xnf_devinfo));
1100 		goto failure_2;
1101 	}
1102 
1103 	xnfp->xnf_rx_ring.sring->rsp_event =
1104 	    xnfp->xnf_tx_ring.sring->rsp_event = 1;
1105 
1106 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
1107 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
1108 
1109 	/* set driver private pointer now */
1110 	ddi_set_driver_private(devinfo, xnfp);
1111 
1112 	if (!xnf_kstat_init(xnfp))
1113 		goto failure_3;
1114 
1115 	/*
1116 	 * Allocate an event channel, add the interrupt handler and
1117 	 * bind it to the event channel.
1118 	 */
1119 	(void) xvdi_alloc_evtchn(devinfo);
1120 	xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
1121 #ifdef XPV_HVM_DRIVER
1122 	ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
1123 #else
1124 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
1125 #endif
1126 
1127 	err = mac_register(macp, &xnfp->xnf_mh);
1128 	mac_free(macp);
1129 	macp = NULL;
1130 	if (err != 0)
1131 		goto failure_4;
1132 
1133 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
1134 	    != DDI_SUCCESS)
1135 		goto failure_5;
1136 
1137 #ifdef XPV_HVM_DRIVER
1138 	/*
1139 	 * In the HVM case, this driver essentially replaces a driver for
1140 	 * a 'real' PCI NIC. Without the "model" property set to
1141 	 * "Ethernet controller", like the PCI code does, netbooting does
1142 	 * not work correctly, as strplumb_get_netdev_path() will not find
1143 	 * this interface.
1144 	 */
1145 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
1146 	    "Ethernet controller");
1147 #endif
1148 
1149 	return (DDI_SUCCESS);
1150 
1151 failure_5:
1152 	(void) mac_unregister(xnfp->xnf_mh);
1153 
1154 failure_4:
1155 #ifdef XPV_HVM_DRIVER
1156 	ec_unbind_evtchn(xnfp->xnf_evtchn);
1157 	xvdi_free_evtchn(devinfo);
1158 #else
1159 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1160 #endif
1161 	xnfp->xnf_evtchn = INVALID_EVTCHN;
1162 	kstat_delete(xnfp->xnf_kstat_aux);
1163 
1164 failure_3:
1165 	xnf_release_dma_resources(xnfp);
1166 
1167 failure_2:
1168 	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1169 
1170 failure_1:
1171 	kmem_cache_destroy(xnfp->xnf_buf_cache);
1172 
1173 failure_0:
1174 	cv_destroy(&xnfp->xnf_cv_tx_slots);
1175 	cv_destroy(&xnfp->xnf_cv_multicast);
1176 	cv_destroy(&xnfp->xnf_cv_state);
1177 
1178 	mutex_destroy(&xnfp->xnf_gref_lock);
1179 	mutex_destroy(&xnfp->xnf_schedlock);
1180 	mutex_destroy(&xnfp->xnf_rxlock);
1181 	mutex_destroy(&xnfp->xnf_txlock);
1182 
1183 failure:
1184 	kmem_free(xnfp, sizeof (*xnfp));
1185 	if (macp != NULL)
1186 		mac_free(macp);
1187 
1188 	return (DDI_FAILURE);
1189 }
1190 
1191 /*  detach(9E) -- Detach a device from the system */
1192 static int
1193 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
1194 {
1195 	xnf_t *xnfp;		/* Our private device info */
1196 
1197 	xnfp = ddi_get_driver_private(devinfo);
1198 
1199 	switch (cmd) {
1200 	case DDI_SUSPEND:
1201 #ifdef XPV_HVM_DRIVER
1202 		ec_unbind_evtchn(xnfp->xnf_evtchn);
1203 		xvdi_free_evtchn(devinfo);
1204 #else
1205 		ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1206 #endif
1207 
1208 		xvdi_suspend(devinfo);
1209 
1210 		mutex_enter(&xnfp->xnf_rxlock);
1211 		mutex_enter(&xnfp->xnf_txlock);
1212 
1213 		xnfp->xnf_evtchn = INVALID_EVTCHN;
1214 		xnfp->xnf_connected = B_FALSE;
1215 		mutex_exit(&xnfp->xnf_txlock);
1216 		mutex_exit(&xnfp->xnf_rxlock);
1217 
1218 		/* claim link to be down after disconnect */
1219 		mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
1220 		return (DDI_SUCCESS);
1221 
1222 	case DDI_DETACH:
1223 		break;
1224 
1225 	default:
1226 		return (DDI_FAILURE);
1227 	}
1228 
1229 	if (xnfp->xnf_connected)
1230 		return (DDI_FAILURE);
1231 
1232 	/*
1233 	 * Cannot detach if we have xnf_buf_t outstanding.
1234 	 */
1235 	if (xnfp->xnf_stat_buf_allocated > 0)
1236 		return (DDI_FAILURE);
1237 
1238 	if (mac_unregister(xnfp->xnf_mh) != 0)
1239 		return (DDI_FAILURE);
1240 
1241 	kstat_delete(xnfp->xnf_kstat_aux);
1242 
1243 	/* Stop the receiver */
1244 	xnf_stop(xnfp);
1245 
1246 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
1247 
1248 	/* Remove the interrupt */
1249 #ifdef XPV_HVM_DRIVER
1250 	ec_unbind_evtchn(xnfp->xnf_evtchn);
1251 	xvdi_free_evtchn(devinfo);
1252 #else
1253 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1254 #endif
1255 
1256 	/* Release any pending xmit mblks */
1257 	xnf_release_mblks(xnfp);
1258 
1259 	/* Release all DMA resources */
1260 	xnf_release_dma_resources(xnfp);
1261 
1262 	cv_destroy(&xnfp->xnf_cv_tx_slots);
1263 	cv_destroy(&xnfp->xnf_cv_multicast);
1264 	cv_destroy(&xnfp->xnf_cv_state);
1265 
1266 	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1267 	kmem_cache_destroy(xnfp->xnf_buf_cache);
1268 
1269 	mutex_destroy(&xnfp->xnf_gref_lock);
1270 	mutex_destroy(&xnfp->xnf_schedlock);
1271 	mutex_destroy(&xnfp->xnf_rxlock);
1272 	mutex_destroy(&xnfp->xnf_txlock);
1273 
1274 	kmem_free(xnfp, sizeof (*xnfp));
1275 
1276 	return (DDI_SUCCESS);
1277 }
1278 
1279 /*
1280  *  xnf_set_mac_addr() -- set the physical network address on the board.
1281  */
1282 static int
1283 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
1284 {
1285 	_NOTE(ARGUNUSED(arg, macaddr));
1286 
1287 	/*
1288 	 * We can't set our macaddr.
1289 	 */
1290 	return (ENOTSUP);
1291 }
1292 
1293 /*
1294  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
1295  *
1296  *  Program the hardware to enable/disable the multicast address
1297  *  in "mca".  Enable if "add" is true, disable if false.
1298  */
1299 static int
1300 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
1301 {
1302 	xnf_t *xnfp = arg;
1303 	xnf_txbuf_t *txp;
1304 	int n_slots;
1305 	RING_IDX slot;
1306 	xnf_txid_t *tidp;
1307 	netif_tx_request_t *txrp;
1308 	struct netif_extra_info *erp;
1309 	boolean_t notify, result;
1310 
1311 	/*
1312 	 * If the backend does not support multicast control then we
1313 	 * must assume that the right packets will just arrive.
1314 	 */
1315 	if (!xnfp->xnf_be_mcast_control)
1316 		return (0);
1317 
1318 	txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
1319 
1320 	mutex_enter(&xnfp->xnf_txlock);
1321 
1322 	/*
1323 	 * If we're not yet connected then claim success. This is
1324 	 * acceptable because we refresh the entire set of multicast
1325 	 * addresses when we get connected.
1326 	 *
1327 	 * We can't wait around here because the MAC layer expects
1328 	 * this to be a non-blocking operation - waiting ends up
1329 	 * causing a deadlock during resume.
1330 	 */
1331 	if (!xnfp->xnf_connected) {
1332 		mutex_exit(&xnfp->xnf_txlock);
1333 		return (0);
1334 	}
1335 
1336 	/*
1337 	 * 1. Acquire two slots in the ring.
1338 	 * 2. Fill in the slots.
1339 	 * 3. Request notification when the operation is done.
1340 	 * 4. Kick the peer.
1341 	 * 5. Wait for the response via xnf_tx_clean_ring().
1342 	 */
1343 
1344 	n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE);
1345 	ASSERT(n_slots >= 2);
1346 
1347 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1348 	tidp = xnf_txid_get(xnfp);
1349 	VERIFY(tidp != NULL);
1350 
1351 	txp->tx_type = TX_MCAST_REQ;
1352 	txp->tx_slot = slot;
1353 
1354 	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1355 	erp = (struct netif_extra_info *)
1356 	    RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1);
1357 
1358 	txrp->gref = 0;
1359 	txrp->size = 0;
1360 	txrp->offset = 0;
1361 	/* Set tx_txreq.id to appease xnf_tx_clean_ring(). */
1362 	txrp->id = txp->tx_txreq.id = tidp->id;
1363 	txrp->flags = NETTXF_extra_info;
1364 
1365 	erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
1366 	    XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
1367 	bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL);
1368 
1369 	tidp->txbuf = txp;
1370 
1371 	xnfp->xnf_tx_ring.req_prod_pvt = slot + 2;
1372 
1373 	mutex_enter(&xnfp->xnf_schedlock);
1374 	xnfp->xnf_pending_multicast++;
1375 	mutex_exit(&xnfp->xnf_schedlock);
1376 
1377 	/* LINTED: constant in conditional context */
1378 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1379 	    notify);
1380 	if (notify)
1381 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1382 
1383 	while (txp->tx_type == TX_MCAST_REQ)
1384 		cv_wait(&xnfp->xnf_cv_multicast, &xnfp->xnf_txlock);
1385 
1386 	ASSERT3U(txp->tx_type, ==, TX_MCAST_RSP);
1387 
1388 	mutex_enter(&xnfp->xnf_schedlock);
1389 	xnfp->xnf_pending_multicast--;
1390 	mutex_exit(&xnfp->xnf_schedlock);
1391 
1392 	result = (txp->tx_status == NETIF_RSP_OKAY);
1393 
1394 	xnf_txid_put(xnfp, tidp);
1395 
1396 	mutex_exit(&xnfp->xnf_txlock);
1397 
1398 	kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1399 
1400 	return (result ? 0 : 1);
1401 }
1402 
1403 /*
1404  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
1405  *
1406  *  Program the hardware to enable/disable promiscuous mode.
1407  */
1408 static int
1409 xnf_set_promiscuous(void *arg, boolean_t on)
1410 {
1411 	_NOTE(ARGUNUSED(arg, on));
1412 
1413 	/*
1414 	 * We can't really do this, but we pretend that we can in
1415 	 * order that snoop will work.
1416 	 */
1417 	return (0);
1418 }
1419 
1420 /*
1421  * Clean buffers that we have responses for from the transmit ring.
1422  */
1423 static int
1424 xnf_tx_clean_ring(xnf_t *xnfp)
1425 {
1426 	boolean_t work_to_do;
1427 
1428 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1429 
1430 loop:
1431 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
1432 		RING_IDX cons, prod, i;
1433 
1434 		cons = xnfp->xnf_tx_ring.rsp_cons;
1435 		prod = xnfp->xnf_tx_ring.sring->rsp_prod;
1436 		membar_consumer();
1437 		/*
1438 		 * Clean tx requests from ring that we have responses
1439 		 * for.
1440 		 */
1441 		DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod);
1442 		for (i = cons; i != prod; i++) {
1443 			netif_tx_response_t *trp;
1444 			xnf_txid_t *tidp;
1445 			xnf_txbuf_t *txp;
1446 
1447 			trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
1448 			/*
1449 			 * if this slot was occupied by netif_extra_info_t,
1450 			 * then the response will be NETIF_RSP_NULL. In this
1451 			 * case there are no resources to clean up.
1452 			 */
1453 			if (trp->status == NETIF_RSP_NULL)
1454 				continue;
1455 
1456 			ASSERT(TX_ID_VALID(trp->id));
1457 
1458 			tidp = TX_ID_TO_TXID(xnfp, trp->id);
1459 			ASSERT3U(tidp->id, ==, trp->id);
1460 			ASSERT3U(tidp->next, ==, INVALID_TX_ID);
1461 
1462 			txp = tidp->txbuf;
1463 			ASSERT(txp != NULL);
1464 			ASSERT3U(txp->tx_txreq.id, ==, trp->id);
1465 
1466 			switch (txp->tx_type) {
1467 			case TX_DATA:
1468 				/*
1469 				 * We must put the txid for each response we
1470 				 * acknowledge to make sure that we never have
1471 				 * more free slots than txids. Because of this
1472 				 * we do it here instead of waiting for it to
1473 				 * be done in xnf_data_txbuf_free_chain().
1474 				 */
1475 				xnf_txid_put(xnfp, tidp);
1476 				txp->tx_txreq.id = INVALID_TX_ID;
1477 				ASSERT3S(txp->tx_head->tx_frags_to_ack, >, 0);
1478 				txp->tx_head->tx_frags_to_ack--;
1479 
1480 				/*
1481 				 * We clean the whole chain once we got a
1482 				 * response for each fragment.
1483 				 */
1484 				if (txp->tx_head->tx_frags_to_ack == 0)
1485 					xnf_data_txbuf_free_chain(xnfp, txp);
1486 
1487 				break;
1488 
1489 			case TX_MCAST_REQ:
1490 				txp->tx_type = TX_MCAST_RSP;
1491 				txp->tx_status = trp->status;
1492 				cv_broadcast(&xnfp->xnf_cv_multicast);
1493 
1494 				break;
1495 
1496 			default:
1497 				cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
1498 				    "invalid xnf_txbuf_t type: %d",
1499 				    txp->tx_type);
1500 				break;
1501 			}
1502 		}
1503 		/*
1504 		 * Record the last response we dealt with so that we
1505 		 * know where to start next time around.
1506 		 */
1507 		xnfp->xnf_tx_ring.rsp_cons = prod;
1508 		membar_enter();
1509 	}
1510 
1511 	/* LINTED: constant in conditional context */
1512 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
1513 	if (work_to_do)
1514 		goto loop;
1515 
1516 	return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
1517 }
1518 
1519 /*
1520  * Allocate and fill in a look-aside buffer for the packet `mp'. Used
1521  * to ensure that the packet is physically contiguous and contained
1522  * within a single page.
1523  */
1524 static xnf_buf_t *
1525 xnf_tx_get_lookaside(xnf_t *xnfp, mblk_t *mp, size_t *plen)
1526 {
1527 	xnf_buf_t *bd;
1528 	caddr_t bp;
1529 
1530 	if ((bd = xnf_buf_get(xnfp, KM_NOSLEEP, B_TRUE)) == NULL) {
1531 		return (NULL);
1532 	}
1533 
1534 	bp = bd->buf;
1535 	while (mp != NULL) {
1536 		size_t len = MBLKL(mp);
1537 
1538 		bcopy(mp->b_rptr, bp, len);
1539 		bp += len;
1540 
1541 		mp = mp->b_cont;
1542 	}
1543 
1544 	*plen = bp - bd->buf;
1545 	ASSERT3U(*plen, <=, PAGESIZE);
1546 
1547 	xnfp->xnf_stat_tx_lookaside++;
1548 
1549 	return (bd);
1550 }
1551 
1552 /*
1553  * Insert the pseudo-header checksum into the packet.
1554  * Assumes packet is IPv4, TCP/UDP since we only advertised support for
1555  * HCKSUM_INET_FULL_V4.
1556  */
1557 int
1558 xnf_pseudo_cksum(mblk_t *mp)
1559 {
1560 	struct ether_header *ehp;
1561 	uint16_t sap, iplen, *stuff;
1562 	uint32_t cksum;
1563 	size_t len;
1564 	ipha_t *ipha;
1565 	ipaddr_t src, dst;
1566 	uchar_t *ptr;
1567 
1568 	ptr = mp->b_rptr;
1569 	len = MBLKL(mp);
1570 
1571 	/* Each header must fit completely in an mblk. */
1572 	ASSERT3U(len, >=, sizeof (*ehp));
1573 
1574 	ehp = (struct ether_header *)ptr;
1575 
1576 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
1577 		struct ether_vlan_header *evhp;
1578 		ASSERT3U(len, >=, sizeof (*evhp));
1579 		evhp = (struct ether_vlan_header *)ptr;
1580 		sap = ntohs(evhp->ether_type);
1581 		ptr += sizeof (*evhp);
1582 		len -= sizeof (*evhp);
1583 	} else {
1584 		sap = ntohs(ehp->ether_type);
1585 		ptr += sizeof (*ehp);
1586 		len -= sizeof (*ehp);
1587 	}
1588 
1589 	ASSERT3U(sap, ==, ETHERTYPE_IP);
1590 
1591 	/*
1592 	 * Ethernet and IP headers may be in different mblks.
1593 	 */
1594 	ASSERT3P(ptr, <=, mp->b_wptr);
1595 	if (ptr == mp->b_wptr) {
1596 		mp = mp->b_cont;
1597 		ptr = mp->b_rptr;
1598 		len = MBLKL(mp);
1599 	}
1600 
1601 	ASSERT3U(len, >=, sizeof (ipha_t));
1602 	ipha = (ipha_t *)ptr;
1603 
1604 	/*
1605 	 * We assume the IP header has no options. (This is enforced in
1606 	 * ire_send_wire_v4() -- search for IXAF_NO_HW_CKSUM).
1607 	 */
1608 	ASSERT3U(IPH_HDR_LENGTH(ipha), ==, IP_SIMPLE_HDR_LENGTH);
1609 	iplen = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
1610 
1611 	ptr += IP_SIMPLE_HDR_LENGTH;
1612 	len -= IP_SIMPLE_HDR_LENGTH;
1613 
1614 	/*
1615 	 * IP and L4 headers may be in different mblks.
1616 	 */
1617 	ASSERT3P(ptr, <=, mp->b_wptr);
1618 	if (ptr == mp->b_wptr) {
1619 		mp = mp->b_cont;
1620 		ptr = mp->b_rptr;
1621 		len = MBLKL(mp);
1622 	}
1623 
1624 	switch (ipha->ipha_protocol) {
1625 	case IPPROTO_TCP:
1626 		ASSERT3U(len, >=, sizeof (tcph_t));
1627 		stuff = (uint16_t *)(ptr + TCP_CHECKSUM_OFFSET);
1628 		cksum = IP_TCP_CSUM_COMP;
1629 		break;
1630 	case IPPROTO_UDP:
1631 		ASSERT3U(len, >=, sizeof (struct udphdr));
1632 		stuff = (uint16_t *)(ptr + UDP_CHECKSUM_OFFSET);
1633 		cksum = IP_UDP_CSUM_COMP;
1634 		break;
1635 	default:
1636 		cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
1637 		    ipha->ipha_protocol);
1638 		return (EINVAL);
1639 	}
1640 
1641 	src = ipha->ipha_src;
1642 	dst = ipha->ipha_dst;
1643 
1644 	cksum += (dst >> 16) + (dst & 0xFFFF);
1645 	cksum += (src >> 16) + (src & 0xFFFF);
1646 	cksum += htons(iplen);
1647 
1648 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1649 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1650 
1651 	ASSERT(cksum <= 0xFFFF);
1652 
1653 	*stuff = (uint16_t)(cksum ? cksum : ~cksum);
1654 
1655 	return (0);
1656 }
1657 
1658 /*
1659  * Push a packet into the transmit ring.
1660  *
1661  * Note: the format of a tx packet that spans multiple slots is similar to
1662  * what is described in xnf_rx_one_packet().
1663  */
1664 static void
1665 xnf_tx_push_packet(xnf_t *xnfp, xnf_txbuf_t *head)
1666 {
1667 	int nslots = 0;
1668 	int extras = 0;
1669 	RING_IDX slot;
1670 	boolean_t notify;
1671 
1672 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1673 	ASSERT(xnfp->xnf_running);
1674 
1675 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1676 
1677 	/*
1678 	 * The caller has already checked that we have enough slots to proceed.
1679 	 */
1680 	for (xnf_txbuf_t *txp = head; txp != NULL; txp = txp->tx_next) {
1681 		xnf_txid_t *tidp;
1682 		netif_tx_request_t *txrp;
1683 
1684 		tidp = xnf_txid_get(xnfp);
1685 		VERIFY(tidp != NULL);
1686 		txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1687 
1688 		txp->tx_slot = slot;
1689 		txp->tx_txreq.id = tidp->id;
1690 		*txrp = txp->tx_txreq;
1691 
1692 		tidp->txbuf = txp;
1693 		slot++;
1694 		nslots++;
1695 
1696 		/*
1697 		 * When present, LSO info is placed in a slot after the first
1698 		 * data segment, and doesn't require a txid.
1699 		 */
1700 		if (txp->tx_txreq.flags & NETTXF_extra_info) {
1701 			netif_extra_info_t *extra;
1702 			ASSERT3U(nslots, ==, 1);
1703 
1704 			extra = (netif_extra_info_t *)
1705 			    RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1706 			*extra = txp->tx_extra;
1707 			slot++;
1708 			nslots++;
1709 			extras = 1;
1710 		}
1711 	}
1712 
1713 	ASSERT3U(nslots, <=, XEN_MAX_SLOTS_PER_TX);
1714 
1715 	/*
1716 	 * Store the number of data fragments.
1717 	 */
1718 	head->tx_frags_to_ack = nslots - extras;
1719 
1720 	xnfp->xnf_tx_ring.req_prod_pvt = slot;
1721 
1722 	/*
1723 	 * Tell the peer that we sent something, if it cares.
1724 	 */
1725 	/* LINTED: constant in conditional context */
1726 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, notify);
1727 	if (notify)
1728 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1729 }
1730 
1731 static xnf_txbuf_t *
1732 xnf_mblk_copy(xnf_t *xnfp, mblk_t *mp)
1733 {
1734 	xnf_txbuf_t *txp;
1735 	size_t length;
1736 
1737 	if ((txp = xnf_data_txbuf_alloc(xnfp, KM_NOSLEEP)) == NULL) {
1738 		return (NULL);
1739 	}
1740 
1741 	txp->tx_bdesc = xnf_tx_get_lookaside(xnfp, mp, &length);
1742 	if (txp->tx_bdesc == NULL) {
1743 		xnf_data_txbuf_free(xnfp, txp);
1744 		return (NULL);
1745 	}
1746 	txp->tx_mfn = txp->tx_bdesc->buf_mfn;
1747 	txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
1748 	txp->tx_txreq.size = length;
1749 	txp->tx_txreq.offset = (uintptr_t)txp->tx_bdesc->buf & PAGEOFFSET;
1750 	txp->tx_txreq.flags = 0;
1751 
1752 	return (txp);
1753 }
1754 
1755 static xnf_txbuf_t *
1756 xnf_mblk_map(xnf_t *xnfp, mblk_t *mp, int *countp)
1757 {
1758 	xnf_txbuf_t *head = NULL;
1759 	xnf_txbuf_t *tail = NULL;
1760 	domid_t oeid;
1761 	int nsegs = 0;
1762 
1763 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1764 
1765 	for (mblk_t *ml = mp; ml != NULL; ml = ml->b_cont) {
1766 		ddi_dma_handle_t dma_handle;
1767 		const ddi_dma_cookie_t *dma_cookie, *dma_cookie_prev;
1768 		xnf_txbuf_t *txp;
1769 
1770 		if (MBLKL(ml) == 0)
1771 			continue;
1772 
1773 		if ((txp = xnf_data_txbuf_alloc(xnfp, KM_NOSLEEP)) == NULL) {
1774 			goto error;
1775 		}
1776 
1777 		if (head == NULL) {
1778 			head = txp;
1779 		} else {
1780 			ASSERT(tail != NULL);
1781 			TXBUF_SETNEXT(tail, txp);
1782 			txp->tx_head = head;
1783 		}
1784 
1785 		/*
1786 		 * The necessary segmentation rules (e.g. not crossing a page
1787 		 * boundary) are enforced by the dma attributes of the handle.
1788 		 */
1789 		dma_handle = txp->tx_dma_handle;
1790 		int ret = ddi_dma_addr_bind_handle(dma_handle,
1791 		    NULL, (char *)ml->b_rptr, MBLKL(ml),
1792 		    DDI_DMA_WRITE | DDI_DMA_STREAMING,
1793 		    DDI_DMA_DONTWAIT, 0, NULL, NULL);
1794 		if (ret != DDI_DMA_MAPPED) {
1795 			if (ret != DDI_DMA_NORESOURCES) {
1796 				dev_err(xnfp->xnf_devinfo, CE_WARN,
1797 				    "ddi_dma_addr_bind_handle() failed "
1798 				    "[dma_error=%d]", ret);
1799 			}
1800 			goto error;
1801 		}
1802 		txp->tx_handle_bound = B_TRUE;
1803 
1804 		dma_cookie_prev = NULL;
1805 		while ((dma_cookie = ddi_dma_cookie_iter(dma_handle,
1806 		    dma_cookie_prev)) != NULL) {
1807 			if (nsegs == XEN_MAX_TX_DATA_PAGES) {
1808 				dev_err(xnfp->xnf_devinfo, CE_WARN,
1809 				    "xnf_dmamap_alloc() failed: "
1810 				    "too many segments");
1811 				goto error;
1812 			}
1813 			if (dma_cookie_prev != NULL) {
1814 				if ((txp = xnf_data_txbuf_alloc(xnfp,
1815 				    KM_NOSLEEP)) == NULL) {
1816 					goto error;
1817 				}
1818 				ASSERT(tail != NULL);
1819 				TXBUF_SETNEXT(tail, txp);
1820 				txp->tx_head = head;
1821 			}
1822 
1823 			txp->tx_mfn =
1824 			    xnf_btop(pa_to_ma(dma_cookie->dmac_laddress));
1825 			txp->tx_txreq.gref = xnf_gref_get(xnfp);
1826 			if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
1827 				dev_err(xnfp->xnf_devinfo, CE_WARN,
1828 				    "xnf_dmamap_alloc() failed: "
1829 				    "invalid grant ref");
1830 				goto error;
1831 			}
1832 			gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
1833 			    oeid, txp->tx_mfn, 1);
1834 			txp->tx_txreq.offset =
1835 			    dma_cookie->dmac_laddress & PAGEOFFSET;
1836 			txp->tx_txreq.size = dma_cookie->dmac_size;
1837 			txp->tx_txreq.flags = 0;
1838 
1839 			nsegs++;
1840 
1841 			if (tail != NULL)
1842 				tail->tx_txreq.flags = NETTXF_more_data;
1843 			tail = txp;
1844 
1845 			dma_cookie_prev = dma_cookie;
1846 		}
1847 	}
1848 
1849 	*countp = nsegs;
1850 	return (head);
1851 
1852 error:
1853 	xnf_data_txbuf_free_chain(xnfp, head);
1854 	return (NULL);
1855 }
1856 
1857 static void
1858 xnf_tx_setup_offload(xnf_t *xnfp, xnf_txbuf_t *head,
1859     uint32_t cksum_flags, uint32_t lso_flags, uint32_t mss)
1860 {
1861 	if (lso_flags != 0) {
1862 		ASSERT3U(lso_flags, ==, HW_LSO);
1863 		ASSERT3P(head->tx_bdesc, ==, NULL);
1864 
1865 		head->tx_txreq.flags |= NETTXF_extra_info;
1866 		netif_extra_info_t *extra = &head->tx_extra;
1867 		extra->type = XEN_NETIF_EXTRA_TYPE_GSO;
1868 		extra->flags = 0;
1869 		extra->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
1870 		extra->u.gso.size = mss;
1871 		extra->u.gso.features = 0;
1872 		extra->u.gso.pad = 0;
1873 	} else if (cksum_flags != 0) {
1874 		ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1875 		/*
1876 		 * If the local protocol stack requests checksum
1877 		 * offload we set the 'checksum blank' flag,
1878 		 * indicating to the peer that we need the checksum
1879 		 * calculated for us.
1880 		 *
1881 		 * We _don't_ set the validated flag, because we haven't
1882 		 * validated that the data and the checksum match.
1883 		 *
1884 		 * Note: we already called xnf_pseudo_cksum() in
1885 		 * xnf_send(), so we just set the txreq flag here.
1886 		 */
1887 		head->tx_txreq.flags |= NETTXF_csum_blank;
1888 		xnfp->xnf_stat_tx_cksum_deferred++;
1889 	}
1890 }
1891 
1892 /*
1893  * Send packet mp. Called by the MAC framework.
1894  */
1895 static mblk_t *
1896 xnf_send(void *arg, mblk_t *mp)
1897 {
1898 	xnf_t *xnfp = arg;
1899 	xnf_txbuf_t *head;
1900 	mblk_t *ml;
1901 	int length;
1902 	int pages, chunks, slots, slots_free;
1903 	uint32_t cksum_flags, lso_flags, mss;
1904 	boolean_t pulledup = B_FALSE;
1905 	boolean_t force_copy = B_FALSE;
1906 
1907 	ASSERT3P(mp->b_next, ==, NULL);
1908 
1909 	mutex_enter(&xnfp->xnf_txlock);
1910 
1911 	/*
1912 	 * Wait until we are connected to the backend.
1913 	 */
1914 	while (!xnfp->xnf_connected)
1915 		cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
1916 
1917 	/*
1918 	 * To simplify logic and be in sync with the rescheduling mechanism,
1919 	 * we require the maximum amount of slots that could be used by a
1920 	 * transaction to be free before proceeding. The only downside of doing
1921 	 * this is that it slightly reduces the effective size of the ring.
1922 	 */
1923 	slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE);
1924 	if (slots_free < XEN_MAX_SLOTS_PER_TX) {
1925 		/*
1926 		 * We need to ask for a re-schedule later as the ring is full.
1927 		 */
1928 		mutex_enter(&xnfp->xnf_schedlock);
1929 		xnfp->xnf_need_sched = B_TRUE;
1930 		mutex_exit(&xnfp->xnf_schedlock);
1931 
1932 		xnfp->xnf_stat_tx_defer++;
1933 		mutex_exit(&xnfp->xnf_txlock);
1934 		return (mp);
1935 	}
1936 
1937 	/*
1938 	 * Get hw offload parameters.
1939 	 * This must be done before pulling up the mp as those parameters
1940 	 * are not copied over.
1941 	 */
1942 	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &cksum_flags);
1943 	mac_lso_get(mp, &mss, &lso_flags);
1944 
1945 	/*
1946 	 * XXX: fix MAC framework so that we can advertise support for
1947 	 * partial checksum for IPv4 only. This way we won't need to calculate
1948 	 * the pseudo header checksum ourselves.
1949 	 */
1950 	if (cksum_flags != 0) {
1951 		ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1952 		(void) xnf_pseudo_cksum(mp);
1953 	}
1954 
1955 pulledup:
1956 	for (ml = mp, pages = 0, chunks = 0, length = 0; ml != NULL;
1957 	    ml = ml->b_cont, chunks++) {
1958 		pages += xnf_mblk_pages(ml);
1959 		length += MBLKL(ml);
1960 	}
1961 	DTRACE_PROBE3(packet, int, length, int, chunks, int, pages);
1962 	DTRACE_PROBE3(lso, int, length, uint32_t, lso_flags, uint32_t, mss);
1963 
1964 	/*
1965 	 * If the ethernet header crosses a page boundary the packet
1966 	 * will be dropped by the backend. In practice it seems like
1967 	 * this happens fairly rarely so we'll do nothing unless the
1968 	 * packet is small enough to fit in a look-aside buffer.
1969 	 */
1970 	if (((uintptr_t)mp->b_rptr & PAGEOFFSET) +
1971 	    sizeof (struct ether_header) > PAGESIZE) {
1972 		xnfp->xnf_stat_tx_eth_hdr_split++;
1973 		if (length <= PAGESIZE)
1974 			force_copy = B_TRUE;
1975 	}
1976 
1977 	if (force_copy || (pages > 1 && !xnfp->xnf_be_tx_sg)) {
1978 		/*
1979 		 * If the packet spans several pages and scatter-gather is not
1980 		 * supported then use a look-aside buffer.
1981 		 */
1982 		ASSERT3U(length, <=, PAGESIZE);
1983 		head = xnf_mblk_copy(xnfp, mp);
1984 		if (head == NULL) {
1985 			dev_err(xnfp->xnf_devinfo, CE_WARN,
1986 			    "xnf_mblk_copy() failed");
1987 			goto drop;
1988 		}
1989 	} else {
1990 		/*
1991 		 * There's a limit for how many pages can be passed to the
1992 		 * backend. If we pass that limit, the packet will be dropped
1993 		 * and some backend implementations (e.g. Linux) could even
1994 		 * offline the interface.
1995 		 */
1996 		if (pages > XEN_MAX_TX_DATA_PAGES) {
1997 			if (pulledup) {
1998 				dev_err(xnfp->xnf_devinfo, CE_WARN,
1999 				    "too many pages, even after pullup: %d.",
2000 				    pages);
2001 				goto drop;
2002 			}
2003 
2004 			/*
2005 			 * Defragment packet if it spans too many pages.
2006 			 */
2007 			mblk_t *newmp = msgpullup(mp, -1);
2008 			if (newmp == NULL) {
2009 				dev_err(xnfp->xnf_devinfo, CE_WARN,
2010 				    "msgpullup() failed");
2011 				goto drop;
2012 			}
2013 
2014 			freemsg(mp);
2015 			mp = newmp;
2016 			xnfp->xnf_stat_tx_pullup++;
2017 			pulledup = B_TRUE;
2018 			goto pulledup;
2019 		}
2020 
2021 		head = xnf_mblk_map(xnfp, mp, &slots);
2022 		if (head == NULL)
2023 			goto drop;
2024 
2025 		IMPLY(slots > 1, xnfp->xnf_be_tx_sg);
2026 	}
2027 
2028 	/*
2029 	 * Set tx_mp so that mblk is freed when the txbuf chain is freed.
2030 	 */
2031 	head->tx_mp = mp;
2032 
2033 	xnf_tx_setup_offload(xnfp, head, cksum_flags, lso_flags, mss);
2034 
2035 	/*
2036 	 * The first request must store the total length of the packet.
2037 	 */
2038 	head->tx_txreq.size = length;
2039 
2040 	/*
2041 	 * Push the packet we have prepared into the ring.
2042 	 */
2043 	xnf_tx_push_packet(xnfp, head);
2044 	xnfp->xnf_stat_opackets++;
2045 	xnfp->xnf_stat_obytes += length;
2046 
2047 	mutex_exit(&xnfp->xnf_txlock);
2048 	return (NULL);
2049 
2050 drop:
2051 	freemsg(mp);
2052 	xnfp->xnf_stat_tx_drop++;
2053 	mutex_exit(&xnfp->xnf_txlock);
2054 	return (NULL);
2055 }
2056 
2057 /*
2058  * Notification of RX packets. Currently no TX-complete interrupt is
2059  * used, as we clean the TX ring lazily.
2060  */
2061 static uint_t
2062 xnf_intr(caddr_t arg)
2063 {
2064 	xnf_t *xnfp = (xnf_t *)arg;
2065 	mblk_t *mp;
2066 	boolean_t need_sched, clean_ring;
2067 
2068 	mutex_enter(&xnfp->xnf_rxlock);
2069 
2070 	/*
2071 	 * Interrupts before we are connected are spurious.
2072 	 */
2073 	if (!xnfp->xnf_connected) {
2074 		mutex_exit(&xnfp->xnf_rxlock);
2075 		xnfp->xnf_stat_unclaimed_interrupts++;
2076 		return (DDI_INTR_UNCLAIMED);
2077 	}
2078 
2079 	/*
2080 	 * Receive side processing.
2081 	 */
2082 	do {
2083 		/*
2084 		 * Collect buffers from the ring.
2085 		 */
2086 		xnf_rx_collect(xnfp);
2087 
2088 		/*
2089 		 * Interrupt me when the next receive buffer is consumed.
2090 		 */
2091 		xnfp->xnf_rx_ring.sring->rsp_event =
2092 		    xnfp->xnf_rx_ring.rsp_cons + 1;
2093 		xen_mb();
2094 
2095 	} while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring));
2096 
2097 	if (xnfp->xnf_rx_new_buffers_posted) {
2098 		boolean_t notify;
2099 
2100 		/*
2101 		 * Indicate to the peer that we have re-filled the
2102 		 * receive ring, if it cares.
2103 		 */
2104 		/* LINTED: constant in conditional context */
2105 		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
2106 		if (notify)
2107 			ec_notify_via_evtchn(xnfp->xnf_evtchn);
2108 		xnfp->xnf_rx_new_buffers_posted = B_FALSE;
2109 	}
2110 
2111 	mp = xnfp->xnf_rx_head;
2112 	xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL;
2113 
2114 	xnfp->xnf_stat_interrupts++;
2115 	mutex_exit(&xnfp->xnf_rxlock);
2116 
2117 	if (mp != NULL)
2118 		mac_rx(xnfp->xnf_mh, NULL, mp);
2119 
2120 	/*
2121 	 * Transmit side processing.
2122 	 *
2123 	 * If a previous transmit attempt failed or we have pending
2124 	 * multicast requests, clean the ring.
2125 	 *
2126 	 * If we previously stalled transmission and cleaning produces
2127 	 * some free slots, tell upstream to attempt sending again.
2128 	 *
2129 	 * The odd style is to avoid acquiring xnf_txlock unless we
2130 	 * will actually look inside the tx machinery.
2131 	 */
2132 	mutex_enter(&xnfp->xnf_schedlock);
2133 	need_sched = xnfp->xnf_need_sched;
2134 	clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0);
2135 	mutex_exit(&xnfp->xnf_schedlock);
2136 
2137 	if (clean_ring) {
2138 		int free_slots;
2139 
2140 		mutex_enter(&xnfp->xnf_txlock);
2141 		free_slots = xnf_tx_slots_get(xnfp, 0, B_FALSE);
2142 
2143 		if (need_sched && (free_slots >= XEN_MAX_SLOTS_PER_TX)) {
2144 			mutex_enter(&xnfp->xnf_schedlock);
2145 			xnfp->xnf_need_sched = B_FALSE;
2146 			mutex_exit(&xnfp->xnf_schedlock);
2147 
2148 			mac_tx_update(xnfp->xnf_mh);
2149 		}
2150 		mutex_exit(&xnfp->xnf_txlock);
2151 	}
2152 
2153 	return (DDI_INTR_CLAIMED);
2154 }
2155 
2156 /*
2157  *  xnf_start() -- start the board receiving and enable interrupts.
2158  */
2159 static int
2160 xnf_start(void *arg)
2161 {
2162 	xnf_t *xnfp = arg;
2163 
2164 	mutex_enter(&xnfp->xnf_rxlock);
2165 	mutex_enter(&xnfp->xnf_txlock);
2166 
2167 	/* Accept packets from above. */
2168 	xnfp->xnf_running = B_TRUE;
2169 
2170 	mutex_exit(&xnfp->xnf_txlock);
2171 	mutex_exit(&xnfp->xnf_rxlock);
2172 
2173 	return (0);
2174 }
2175 
2176 /* xnf_stop() - disable hardware */
2177 static void
2178 xnf_stop(void *arg)
2179 {
2180 	xnf_t *xnfp = arg;
2181 
2182 	mutex_enter(&xnfp->xnf_rxlock);
2183 	mutex_enter(&xnfp->xnf_txlock);
2184 
2185 	xnfp->xnf_running = B_FALSE;
2186 
2187 	mutex_exit(&xnfp->xnf_txlock);
2188 	mutex_exit(&xnfp->xnf_rxlock);
2189 }
2190 
2191 /*
2192  * Hang buffer `bdesc' on the RX ring.
2193  */
2194 static void
2195 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
2196 {
2197 	netif_rx_request_t *reqp;
2198 	RING_IDX hang_ix;
2199 
2200 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2201 
2202 	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
2203 	    xnfp->xnf_rx_ring.req_prod_pvt);
2204 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
2205 	ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL);
2206 
2207 	reqp->id = bdesc->id = hang_ix;
2208 	reqp->gref = bdesc->grant_ref;
2209 
2210 	xnfp->xnf_rx_pkt_info[hang_ix] = bdesc;
2211 	xnfp->xnf_rx_ring.req_prod_pvt++;
2212 
2213 	xnfp->xnf_rx_new_buffers_posted = B_TRUE;
2214 }
2215 
2216 /*
2217  * Receive an entire packet from the ring, starting from slot *consp.
2218  * prod indicates the slot of the latest response.
2219  * On return, *consp will point to the head of the next packet.
2220  *
2221  * Note: If slot prod was reached before we could gather a full packet, we will
2222  * drop the partial packet; this would most likely indicate a bug in either
2223  * the front-end or the back-end driver.
2224  *
2225  * An rx packet can consist of several fragments and thus span multiple slots.
2226  * Each fragment can contain up to 4k of data.
2227  *
2228  * A typical 9000 MTU packet will look like this:
2229  * +------+---------------------+-------------------+-----------------------+
2230  * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
2231  * +------+---------------------+-------------------+-----------------------+
2232  * | 1    | netif_rx_response_t | 1st data fragment | more_data             |
2233  * +------+---------------------+-------------------+-----------------------+
2234  * | 2    | netif_rx_response_t | 2nd data fragment | more_data             |
2235  * +------+---------------------+-------------------+-----------------------+
2236  * | 3    | netif_rx_response_t | 3rd data fragment | [none]                |
2237  * +------+---------------------+-------------------+-----------------------+
2238  *
2239  * Fragments are chained by setting NETRXF_more_data in the previous
2240  * response's flags. If there are additional flags, such as
2241  * NETRXF_data_validated or NETRXF_extra_info, those should be set on the
2242  * first fragment.
2243  *
2244  * Sometimes extra info can be present. If so, it will follow the first
2245  * fragment, and NETRXF_extra_info flag will be set on the first response.
2246  * If LRO is set on a packet, it will be stored in the extra info. Conforming
2247  * to the spec, extra info can also be chained, but must all be present right
2248  * after the first fragment.
2249  *
2250  * Example of a packet with 2 extra infos:
2251  * +------+---------------------+-------------------+-----------------------+
2252  * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
2253  * +------+---------------------+-------------------+-----------------------+
2254  * | 1    | netif_rx_response_t | 1st data fragment | extra_info, more_data |
2255  * +------+---------------------+-------------------+-----------------------+
2256  * | 2    | netif_extra_info_t  | 1st extra info    | EXTRA_FLAG_MORE       |
2257  * +------+---------------------+-------------------+-----------------------+
2258  * | 3    | netif_extra_info_t  | 2nd extra info    | [none]                |
2259  * +------+---------------------+-------------------+-----------------------+
2260  * | 4    | netif_rx_response_t | 2nd data fragment | more_data             |
2261  * +------+---------------------+-------------------+-----------------------+
2262  * | 5    | netif_rx_response_t | 3rd data fragment | more_data             |
2263  * +------+---------------------+-------------------+-----------------------+
2264  * | 6    | netif_rx_response_t | 4th data fragment | [none]                |
2265  * +------+---------------------+-------------------+-----------------------+
2266  *
2267  * In practice, the only extra we expect is for LRO, but only if we advertise
2268  * that we support it to the backend (xnf_enable_lro == TRUE).
2269  */
/*
 * Arguments:
 *   xnfp  - driver instance whose rx ring is being drained.
 *   prod  - response producer index; processing stops when the consumer
 *           index reaches it.
 *   consp - in: index of the packet's first response; out: index of the
 *           first response following this packet.
 *   mpp   - out: mblk chain holding the packet; written only on success.
 *
 * Returns 0 on success. On failure the return value is EBUSY (interface not
 * running), EINVAL (bad or truncated response sequence) or ENOMEM (mblk
 * allocation failed); any partial chain is freed and xnf_stat_rx_drop is
 * incremented. Every slot consumed has a buffer re-posted through
 * xnf_rxbuf_hang(), which asserts that xnf_rxlock is held.
 */
2270 static int
2271 xnf_rx_one_packet(xnf_t *xnfp, RING_IDX prod, RING_IDX *consp, mblk_t **mpp)
2272 {
2273 	mblk_t *head = NULL;
2274 	mblk_t *tail = NULL;
2275 	mblk_t *mp;
2276 	int error = 0;
2277 	RING_IDX cons = *consp;
2278 	netif_extra_info_t lro;
2279 	boolean_t is_lro = B_FALSE;
2280 	boolean_t is_extra = B_FALSE;
2281 
2282 	netif_rx_response_t rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2283 
	/*
	 * hwcsum is taken from the first response only. more_data and
	 * more_extra are refreshed as later slots are examined.
	 */
2284 	boolean_t hwcsum = (rsp.flags & NETRXF_data_validated) != 0;
2285 	boolean_t more_data = (rsp.flags & NETRXF_more_data) != 0;
2286 	boolean_t more_extra = (rsp.flags & NETRXF_extra_info) != 0;
2287 
2288 	IMPLY(more_data, xnf_enable_rx_sg);
2289 
2290 	while (cons != prod) {
2291 		xnf_buf_t *bdesc;
2292 		int len, off;
2293 		int rxidx = cons & (NET_RX_RING_SIZE - 1);
2294 
		/*
		 * Detach the buffer posted in this slot; it (or its
		 * replacement) is re-posted at hang_buf below.
		 */
2295 		bdesc = xnfp->xnf_rx_pkt_info[rxidx];
2296 		xnfp->xnf_rx_pkt_info[rxidx] = NULL;
2297 
2298 		if (is_extra) {
			/* Reinterpret the copied slot as an extra-info record. */
2299 			netif_extra_info_t *extra = (netif_extra_info_t *)&rsp;
2300 			/*
2301 			 * The only extra we expect is for LRO, and it should
2302 			 * only be present once.
2303 			 */
2304 			if (extra->type == XEN_NETIF_EXTRA_TYPE_GSO &&
2305 			    !is_lro) {
2306 				ASSERT(xnf_enable_lro);
2307 				lro = *extra;
2308 				is_lro = B_TRUE;
2309 				DTRACE_PROBE1(lro, netif_extra_info_t *, &lro);
2310 			} else {
2311 				dev_err(xnfp->xnf_devinfo, CE_WARN, "rx packet "
2312 				    "contains unexpected extra info of type %d",
2313 				    extra->type);
2314 				error = EINVAL;
2315 			}
2316 			more_extra =
2317 			    (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE) != 0;
2318 
			/* The slot's buffer carries no data; re-post it as is. */
2319 			goto hang_buf;
2320 		}
2321 
2322 		ASSERT3U(bdesc->id, ==, rsp.id);
2323 
2324 		/*
2325 		 * status stores packet length when >= 0, or errors when < 0.
2326 		 */
2327 		len = rsp.status;
2328 		off = rsp.offset;
2329 		more_data = (rsp.flags & NETRXF_more_data) != 0;
2330 
2331 		/*
2332 		 * sanity checks.
2333 		 */
2334 		if (!xnfp->xnf_running) {
2335 			error = EBUSY;
2336 		} else if (len <= 0) {
2337 			xnfp->xnf_stat_errrx++;
2338 
2339 			switch (len) {
2340 			case 0:
2341 				xnfp->xnf_stat_runt++;
2342 				break;
2343 			case NETIF_RSP_ERROR:
2344 				xnfp->xnf_stat_mac_rcv_error++;
2345 				break;
2346 			case NETIF_RSP_DROPPED:
2347 				xnfp->xnf_stat_norxbuf++;
2348 				break;
2349 			}
2350 			error = EINVAL;
2351 		} else if (bdesc->grant_ref == INVALID_GRANT_REF) {
2352 			dev_err(xnfp->xnf_devinfo, CE_WARN,
2353 			    "Bad rx grant reference, rsp id %d", rsp.id);
2354 			error = EINVAL;
2355 		} else if ((off + len) > PAGESIZE) {
2356 			dev_err(xnfp->xnf_devinfo, CE_WARN, "Rx packet crosses "
2357 			    "page boundary (offset %d, length %d)", off, len);
2358 			error = EINVAL;
2359 		}
2360 
2361 		if (error != 0) {
2362 			/*
2363 			 * If an error has been detected, we do not attempt
2364 			 * to read the data but we still need to replace
2365 			 * the rx bufs.
			 *
			 * error is never reset inside the loop, so after the
			 * first failure every remaining slot of this packet
			 * is only recycled.
2366 			 */
2367 			goto hang_buf;
2368 		}
2369 
2370 		xnf_buf_t *nbuf = NULL;
2371 
2372 		/*
2373 		 * If the packet is below a pre-determined size we will
2374 		 * copy data out of the buf rather than replace it.
2375 		 */
2376 		if (len > xnf_rx_copy_limit)
2377 			nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
2378 
2379 		if (nbuf != NULL) {
			/*
			 * Loan the existing buffer upstream: the mblk points
			 * directly at bdesc->buf and bdesc->free_rtn is passed
			 * as its free routine. nbuf takes its place on the ring.
			 */
2380 			mp = desballoc((unsigned char *)bdesc->buf,
2381 			    bdesc->len, 0, &bdesc->free_rtn);
2382 
2383 			if (mp == NULL) {
2384 				xnfp->xnf_stat_rx_desballoc_fail++;
2385 				xnfp->xnf_stat_norxbuf++;
2386 				error = ENOMEM;
2387 				/*
2388 				 * we free the buf we just allocated as we
2389 				 * will re-hang the old buf.
2390 				 */
2391 				xnf_buf_put(xnfp, nbuf, B_FALSE);
2392 				goto hang_buf;
2393 			}
2394 
			/* Expose only the [off, off + len) window of the page. */
2395 			mp->b_rptr = mp->b_rptr + off;
2396 			mp->b_wptr = mp->b_rptr + len;
2397 
2398 			/*
2399 			 * Release the grant as the backend doesn't need to
2400 			 * access this buffer anymore and grants are scarce.
2401 			 */
2402 			(void) gnttab_end_foreign_access_ref(bdesc->grant_ref,
2403 			    0);
2404 			xnf_gref_put(xnfp, bdesc->grant_ref);
2405 			bdesc->grant_ref = INVALID_GRANT_REF;
2406 
			/* From here on bdesc names the replacement buffer. */
2407 			bdesc = nbuf;
2408 		} else {
2409 			/*
2410 			 * We failed to allocate a new buf or decided to reuse
2411 			 * the old one. In either case we copy the data off it
2412 			 * and put it back into the ring.
2413 			 */
2414 			mp = allocb(len, 0);
2415 			if (mp == NULL) {
2416 				xnfp->xnf_stat_rx_allocb_fail++;
2417 				xnfp->xnf_stat_norxbuf++;
2418 				error = ENOMEM;
2419 				goto hang_buf;
2420 			}
2421 			bcopy(bdesc->buf + off, mp->b_wptr, len);
2422 			mp->b_wptr += len;
2423 		}
2424 
		/* Append this fragment to the b_cont chain of the packet. */
2425 		if (head == NULL)
2426 			head = mp;
2427 		else
2428 			tail->b_cont = mp;
2429 		tail = mp;
2430 
2431 hang_buf:
2432 		/*
2433 		 * No matter what happens, for each response we need to hang
2434 		 * a new buf on the rx ring. Put either the old one, or a new
2435 		 * one if the old one is borrowed by the kernel via desballoc().
2436 		 */
2437 		xnf_rxbuf_hang(xnfp, bdesc);
2438 		cons++;
2439 
2440 		/* next response is an extra */
2441 		is_extra = more_extra;
2442 
2443 		if (!more_data && !more_extra)
2444 			break;
2445 
2446 		/*
2447 		 * Note that since requests and responses are union'd on the
2448 		 * same ring, we copy the response to a local variable instead
2449 		 * of keeping a pointer. Otherwise xnf_rxbuf_hang() would have
2450 		 * overwritten contents of rsp.
2451 		 */
2452 		rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2453 	}
2454 
2455 	/*
2456 	 * Check that we do not get stuck in a loop.
2457 	 */
2458 	ASSERT3U(*consp, !=, cons);
2459 	*consp = cons;
2460 
2461 	/*
2462 	 * We ran out of responses but the flags indicate there is more data.
	 * (The loop only leaves these flags set when it stopped because cons
	 * reached prod.)
2463 	 */
2464 	if (more_data) {
2465 		dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments.");
2466 		error = EINVAL;
2467 	}
2468 	if (more_extra) {
2469 		dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments "
2470 		    "(extras).");
2471 		error = EINVAL;
2472 	}
2473 
2474 	/*
2475 	 * An error means the packet must be dropped. If we have already formed
2476 	 * a partial packet, then discard it.
2477 	 */
2478 	if (error != 0) {
2479 		if (head != NULL)
2480 			freemsg(head);
2481 		xnfp->xnf_stat_rx_drop++;
2482 		return (error);
2483 	}
2484 
2485 	ASSERT(head != NULL);
2486 
2487 	if (hwcsum) {
2488 		/*
2489 		 * If the peer says that the data has been validated then we
2490 		 * declare that the full checksum has been verified.
2491 		 *
2492 		 * We don't look at the "checksum blank" flag, and hence could
2493 		 * have a packet here that we are asserting is good with
2494 		 * a blank checksum.
2495 		 */
2496 		mac_hcksum_set(head, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
2497 		xnfp->xnf_stat_rx_cksum_no_need++;
2498 	}
2499 
2500 	/* XXX: set lro info for packet once LRO is supported in OS. */
2501 
2502 	*mpp = head;
2503 
2504 	return (0);
2505 }
2506 
2507 /*
2508  * Collect packets from the RX ring, storing them in `xnfp' for later use.
2509  */
2510 static void
2511 xnf_rx_collect(xnf_t *xnfp)
2512 {
2513 	RING_IDX prod;
2514 
2515 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2516 
2517 	prod = xnfp->xnf_rx_ring.sring->rsp_prod;
2518 	/*
2519 	 * Ensure we see queued responses up to 'prod'.
2520 	 */
2521 	membar_consumer();
2522 
2523 	while (xnfp->xnf_rx_ring.rsp_cons != prod) {
2524 		mblk_t *mp;
2525 
2526 		/*
2527 		 * Collect a packet.
2528 		 * rsp_cons is updated inside xnf_rx_one_packet().
2529 		 */
2530 		int error = xnf_rx_one_packet(xnfp, prod,
2531 		    &xnfp->xnf_rx_ring.rsp_cons, &mp);
2532 		if (error == 0) {
2533 			xnfp->xnf_stat_ipackets++;
2534 			xnfp->xnf_stat_rbytes += xmsgsize(mp);
2535 
2536 			/*
2537 			 * Append the mblk to the rx list.
2538 			 */
2539 			if (xnfp->xnf_rx_head == NULL) {
2540 				ASSERT3P(xnfp->xnf_rx_tail, ==, NULL);
2541 				xnfp->xnf_rx_head = mp;
2542 			} else {
2543 				ASSERT(xnfp->xnf_rx_tail != NULL);
2544 				xnfp->xnf_rx_tail->b_next = mp;
2545 			}
2546 			xnfp->xnf_rx_tail = mp;
2547 		}
2548 	}
2549 }
2550 
2551 /*
2552  *  xnf_alloc_dma_resources() -- initialize the drivers structures
2553  */
2554 static int
2555 xnf_alloc_dma_resources(xnf_t *xnfp)
2556 {
2557 	dev_info_t		*devinfo = xnfp->xnf_devinfo;
2558 	size_t			len;
2559 	ddi_dma_cookie_t	dma_cookie;
2560 	uint_t			ncookies;
2561 	int			rc;
2562 	caddr_t			rptr;
2563 
2564 	/*
2565 	 * The code below allocates all the DMA data structures that
2566 	 * need to be released when the driver is detached.
2567 	 *
2568 	 * Allocate page for the transmit descriptor ring.
2569 	 */
2570 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2571 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
2572 		goto alloc_error;
2573 
2574 	if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
2575 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2576 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2577 	    &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
2578 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2579 		xnfp->xnf_tx_ring_dma_handle = NULL;
2580 		goto alloc_error;
2581 	}
2582 
2583 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
2584 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2585 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2586 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2587 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2588 		xnfp->xnf_tx_ring_dma_handle = NULL;
2589 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2590 		if (rc == DDI_DMA_NORESOURCES)
2591 			goto alloc_error;
2592 		else
2593 			goto error;
2594 	}
2595 
2596 	ASSERT(ncookies == 1);
2597 	bzero(rptr, PAGESIZE);
2598 	/* LINTED: constant in conditional context */
2599 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
2600 	/* LINTED: constant in conditional context */
2601 	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
2602 	xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
2603 
2604 	/*
2605 	 * Allocate page for the receive descriptor ring.
2606 	 */
2607 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2608 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
2609 		goto alloc_error;
2610 
2611 	if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
2612 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2613 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2614 	    &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
2615 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2616 		xnfp->xnf_rx_ring_dma_handle = NULL;
2617 		goto alloc_error;
2618 	}
2619 
2620 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
2621 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2622 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2623 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2624 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2625 		xnfp->xnf_rx_ring_dma_handle = NULL;
2626 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2627 		if (rc == DDI_DMA_NORESOURCES)
2628 			goto alloc_error;
2629 		else
2630 			goto error;
2631 	}
2632 
2633 	ASSERT(ncookies == 1);
2634 	bzero(rptr, PAGESIZE);
2635 	/* LINTED: constant in conditional context */
2636 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
2637 	/* LINTED: constant in conditional context */
2638 	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
2639 	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
2640 
2641 	return (DDI_SUCCESS);
2642 
2643 alloc_error:
2644 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
2645 	    ddi_get_instance(xnfp->xnf_devinfo));
2646 error:
2647 	xnf_release_dma_resources(xnfp);
2648 	return (DDI_FAILURE);
2649 }
2650 
2651 /*
2652  * Release all DMA resources in the opposite order from acquisition
2653  */
2654 static void
2655 xnf_release_dma_resources(xnf_t *xnfp)
2656 {
2657 	int i;
2658 
2659 	/*
2660 	 * Free receive buffers which are currently associated with
2661 	 * descriptors.
2662 	 */
2663 	mutex_enter(&xnfp->xnf_rxlock);
2664 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
2665 		xnf_buf_t *bp;
2666 
2667 		if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL)
2668 			continue;
2669 		xnfp->xnf_rx_pkt_info[i] = NULL;
2670 		xnf_buf_put(xnfp, bp, B_FALSE);
2671 	}
2672 	mutex_exit(&xnfp->xnf_rxlock);
2673 
2674 	/* Free the receive ring buffer. */
2675 	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2676 		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2677 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2678 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2679 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2680 	}
2681 	/* Free the transmit ring buffer. */
2682 	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2683 		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2684 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2685 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2686 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2687 	}
2688 
2689 }
2690 
2691 /*
2692  * Release any packets and associated structures used by the TX ring.
2693  */
2694 static void
2695 xnf_release_mblks(xnf_t *xnfp)
2696 {
2697 	RING_IDX i;
2698 	xnf_txid_t *tidp;
2699 
2700 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
2701 	    i < NET_TX_RING_SIZE;
2702 	    i++, tidp++) {
2703 		xnf_txbuf_t *txp = tidp->txbuf;
2704 
2705 		if (txp != NULL) {
2706 			ASSERT(txp->tx_mp != NULL);
2707 			freemsg(txp->tx_mp);
2708 
2709 			xnf_txid_put(xnfp, tidp);
2710 			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
2711 		}
2712 	}
2713 }
2714 
2715 static int
2716 xnf_buf_constructor(void *buf, void *arg, int kmflag)
2717 {
2718 	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2719 	xnf_buf_t *bdesc = buf;
2720 	xnf_t *xnfp = arg;
2721 	ddi_dma_cookie_t dma_cookie;
2722 	uint_t ncookies;
2723 	size_t len;
2724 
2725 	if (kmflag & KM_NOSLEEP)
2726 		ddiflags = DDI_DMA_DONTWAIT;
2727 
2728 	/* Allocate a DMA access handle for the buffer. */
2729 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buf_dma_attr,
2730 	    ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2731 		goto failure;
2732 
2733 	/* Allocate DMA-able memory for buffer. */
2734 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2735 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0,
2736 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2737 		goto failure_1;
2738 
2739 	/* Bind to virtual address of buffer to get physical address. */
2740 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2741 	    bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
2742 	    ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2743 		goto failure_2;
2744 	ASSERT(ncookies == 1);
2745 
2746 	bdesc->free_rtn.free_func = xnf_buf_recycle;
2747 	bdesc->free_rtn.free_arg = (caddr_t)bdesc;
2748 	bdesc->xnfp = xnfp;
2749 	bdesc->buf_phys = dma_cookie.dmac_laddress;
2750 	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2751 	bdesc->len = dma_cookie.dmac_size;
2752 	bdesc->grant_ref = INVALID_GRANT_REF;
2753 	bdesc->gen = xnfp->xnf_gen;
2754 
2755 	atomic_inc_64(&xnfp->xnf_stat_buf_allocated);
2756 
2757 	return (0);
2758 
2759 failure_2:
2760 	ddi_dma_mem_free(&bdesc->acc_handle);
2761 
2762 failure_1:
2763 	ddi_dma_free_handle(&bdesc->dma_handle);
2764 
2765 failure:
2766 
2767 	ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2768 	return (-1);
2769 }
2770 
2771 static void
2772 xnf_buf_destructor(void *buf, void *arg)
2773 {
2774 	xnf_buf_t *bdesc = buf;
2775 	xnf_t *xnfp = arg;
2776 
2777 	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
2778 	ddi_dma_mem_free(&bdesc->acc_handle);
2779 	ddi_dma_free_handle(&bdesc->dma_handle);
2780 
2781 	atomic_dec_64(&xnfp->xnf_stat_buf_allocated);
2782 }
2783 
2784 static xnf_buf_t *
2785 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
2786 {
2787 	grant_ref_t gref;
2788 	xnf_buf_t *bufp;
2789 
2790 	/*
2791 	 * Usually grant references are more scarce than memory, so we
2792 	 * attempt to acquire a grant reference first.
2793 	 */
2794 	gref = xnf_gref_get(xnfp);
2795 	if (gref == INVALID_GRANT_REF)
2796 		return (NULL);
2797 
2798 	bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
2799 	if (bufp == NULL) {
2800 		xnf_gref_put(xnfp, gref);
2801 		return (NULL);
2802 	}
2803 
2804 	ASSERT3U(bufp->grant_ref, ==, INVALID_GRANT_REF);
2805 
2806 	bufp->grant_ref = gref;
2807 
2808 	if (bufp->gen != xnfp->xnf_gen)
2809 		xnf_buf_refresh(bufp);
2810 
2811 	gnttab_grant_foreign_access_ref(bufp->grant_ref,
2812 	    xvdi_get_oeid(bufp->xnfp->xnf_devinfo),
2813 	    bufp->buf_mfn, readonly ? 1 : 0);
2814 
2815 	atomic_inc_64(&xnfp->xnf_stat_buf_outstanding);
2816 
2817 	return (bufp);
2818 }
2819 
2820 static void
2821 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
2822 {
2823 	if (bufp->grant_ref != INVALID_GRANT_REF) {
2824 		(void) gnttab_end_foreign_access_ref(
2825 		    bufp->grant_ref, readonly ? 1 : 0);
2826 		xnf_gref_put(xnfp, bufp->grant_ref);
2827 		bufp->grant_ref = INVALID_GRANT_REF;
2828 	}
2829 
2830 	kmem_cache_free(xnfp->xnf_buf_cache, bufp);
2831 
2832 	atomic_dec_64(&xnfp->xnf_stat_buf_outstanding);
2833 }
2834 
2835 /*
2836  * Refresh any cached data about a buffer after resume.
2837  */
2838 static void
2839 xnf_buf_refresh(xnf_buf_t *bdesc)
2840 {
2841 	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2842 	bdesc->gen = bdesc->xnfp->xnf_gen;
2843 }
2844 
2845 /*
2846  * Streams `freeb' routine for `xnf_buf_t' when used as transmit
2847  * look-aside buffers.
2848  */
2849 static void
2850 xnf_buf_recycle(xnf_buf_t *bdesc)
2851 {
2852 	xnf_t *xnfp = bdesc->xnfp;
2853 
2854 	xnf_buf_put(xnfp, bdesc, B_TRUE);
2855 }
2856 
2857 static int
2858 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
2859 {
2860 	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2861 	xnf_txbuf_t *txp = buf;
2862 	xnf_t *xnfp = arg;
2863 
2864 	if (kmflag & KM_NOSLEEP)
2865 		ddiflags = DDI_DMA_DONTWAIT;
2866 
2867 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buf_dma_attr,
2868 	    ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) {
2869 		ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2870 		return (-1);
2871 	}
2872 
2873 	return (0);
2874 }
2875 
2876 static void
2877 xnf_tx_buf_destructor(void *buf, void *arg)
2878 {
2879 	_NOTE(ARGUNUSED(arg));
2880 	xnf_txbuf_t *txp = buf;
2881 
2882 	ddi_dma_free_handle(&txp->tx_dma_handle);
2883 }
2884 
/*
 * Names of the auxiliary (driver-private) named kstats.
 *
 * xnf_kstat_aux_update() fills in the values positionally, so the order
 * here must be kept in step with the order of its assignments.
 */
static char *xnf_aux_statistics[] = {
	/* checksum offload */
	"tx_cksum_deferred",
	"rx_cksum_no_need",

	/* interrupts and transmit path */
	"interrupts",
	"unclaimed_interrupts",
	"tx_pullup",
	"tx_lookaside",
	"tx_drop",
	"tx_eth_hdr_split",

	/* buffers, grant references and receive allocation failures */
	"buf_allocated",
	"buf_outstanding",
	"gref_outstanding",
	"gref_failure",
	"gref_peak",
	"rx_allocb_fail",
	"rx_desballoc_fail",
};
2905 
2906 static int
2907 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2908 {
2909 	xnf_t *xnfp;
2910 	kstat_named_t *knp;
2911 
2912 	if (flag != KSTAT_READ)
2913 		return (EACCES);
2914 
2915 	xnfp = ksp->ks_private;
2916 	knp = ksp->ks_data;
2917 
2918 	/*
2919 	 * Assignment order must match that of the names in
2920 	 * xnf_aux_statistics.
2921 	 */
2922 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2923 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2924 
2925 	(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2926 	(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2927 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2928 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_lookaside;
2929 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_drop;
2930 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_eth_hdr_split;
2931 
2932 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
2933 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
2934 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding;
2935 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_failure;
2936 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_peak;
2937 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail;
2938 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail;
2939 
2940 	return (0);
2941 }
2942 
2943 static boolean_t
2944 xnf_kstat_init(xnf_t *xnfp)
2945 {
2946 	int nstat = sizeof (xnf_aux_statistics) /
2947 	    sizeof (xnf_aux_statistics[0]);
2948 	char **cp = xnf_aux_statistics;
2949 	kstat_named_t *knp;
2950 
2951 	/*
2952 	 * Create and initialise kstats.
2953 	 */
2954 	if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2955 	    ddi_get_instance(xnfp->xnf_devinfo),
2956 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
2957 	    nstat, 0)) == NULL)
2958 		return (B_FALSE);
2959 
2960 	xnfp->xnf_kstat_aux->ks_private = xnfp;
2961 	xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2962 
2963 	knp = xnfp->xnf_kstat_aux->ks_data;
2964 	while (nstat > 0) {
2965 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2966 
2967 		knp++;
2968 		cp++;
2969 		nstat--;
2970 	}
2971 
2972 	kstat_install(xnfp->xnf_kstat_aux);
2973 
2974 	return (B_TRUE);
2975 }
2976 
2977 static int
2978 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2979 {
2980 	xnf_t *xnfp = arg;
2981 
2982 	mutex_enter(&xnfp->xnf_rxlock);
2983 	mutex_enter(&xnfp->xnf_txlock);
2984 
2985 #define	mac_stat(q, r)				\
2986 	case (MAC_STAT_##q):			\
2987 		*val = xnfp->xnf_stat_##r;	\
2988 		break
2989 
2990 #define	ether_stat(q, r)			\
2991 	case (ETHER_STAT_##q):			\
2992 		*val = xnfp->xnf_stat_##r;	\
2993 		break
2994 
2995 	switch (stat) {
2996 
2997 	mac_stat(IPACKETS, ipackets);
2998 	mac_stat(OPACKETS, opackets);
2999 	mac_stat(RBYTES, rbytes);
3000 	mac_stat(OBYTES, obytes);
3001 	mac_stat(NORCVBUF, norxbuf);
3002 	mac_stat(IERRORS, errrx);
3003 	mac_stat(NOXMTBUF, tx_defer);
3004 
3005 	ether_stat(MACRCV_ERRORS, mac_rcv_error);
3006 	ether_stat(TOOSHORT_ERRORS, runt);
3007 
3008 	/* always claim to be in full duplex mode */
3009 	case ETHER_STAT_LINK_DUPLEX:
3010 		*val = LINK_DUPLEX_FULL;
3011 		break;
3012 
3013 	/* always claim to be at 1Gb/s link speed */
3014 	case MAC_STAT_IFSPEED:
3015 		*val = 1000000000ull;
3016 		break;
3017 
3018 	default:
3019 		mutex_exit(&xnfp->xnf_txlock);
3020 		mutex_exit(&xnfp->xnf_rxlock);
3021 
3022 		return (ENOTSUP);
3023 	}
3024 
3025 #undef mac_stat
3026 #undef ether_stat
3027 
3028 	mutex_exit(&xnfp->xnf_txlock);
3029 	mutex_exit(&xnfp->xnf_rxlock);
3030 
3031 	return (0);
3032 }
3033 
3034 static int
3035 xnf_change_mtu(xnf_t *xnfp, uint32_t mtu)
3036 {
3037 	if (mtu > ETHERMTU) {
3038 		if (!xnf_enable_tx_sg) {
3039 			dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3040 			    "because scatter-gather is disabled for transmit "
3041 			    "in driver settings", ETHERMTU);
3042 			return (EINVAL);
3043 		} else if (!xnf_enable_rx_sg) {
3044 			dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3045 			    "because scatter-gather is disabled for receive "
3046 			    "in driver settings", ETHERMTU);
3047 			return (EINVAL);
3048 		} else if (!xnfp->xnf_be_tx_sg) {
3049 			dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3050 			    "because backend doesn't support scatter-gather",
3051 			    ETHERMTU);
3052 			return (EINVAL);
3053 		}
3054 		if (mtu > XNF_MAXPKT)
3055 			return (EINVAL);
3056 	}
3057 	int error = mac_maxsdu_update(xnfp->xnf_mh, mtu);
3058 	if (error == 0)
3059 		xnfp->xnf_mtu = mtu;
3060 
3061 	return (error);
3062 }
3063 
3064 /*ARGSUSED*/
3065 static int
3066 xnf_getprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3067     uint_t prop_val_size, void *prop_val)
3068 {
3069 	xnf_t *xnfp = data;
3070 
3071 	switch (prop_id) {
3072 	case MAC_PROP_MTU:
3073 		ASSERT(prop_val_size >= sizeof (uint32_t));
3074 		bcopy(&xnfp->xnf_mtu, prop_val, sizeof (uint32_t));
3075 		break;
3076 	default:
3077 		return (ENOTSUP);
3078 	}
3079 	return (0);
3080 }
3081 
3082 /*ARGSUSED*/
3083 static int
3084 xnf_setprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3085     uint_t prop_val_size, const void *prop_val)
3086 {
3087 	xnf_t *xnfp = data;
3088 	uint32_t new_mtu;
3089 	int error;
3090 
3091 	switch (prop_id) {
3092 	case MAC_PROP_MTU:
3093 		ASSERT(prop_val_size >= sizeof (uint32_t));
3094 		bcopy(prop_val, &new_mtu, sizeof (new_mtu));
3095 		error = xnf_change_mtu(xnfp, new_mtu);
3096 		break;
3097 	default:
3098 		return (ENOTSUP);
3099 	}
3100 
3101 	return (error);
3102 }
3103 
3104 /*ARGSUSED*/
3105 static void
3106 xnf_propinfo(void *data, const char *prop_name, mac_prop_id_t prop_id,
3107     mac_prop_info_handle_t prop_handle)
3108 {
3109 	switch (prop_id) {
3110 	case MAC_PROP_MTU:
3111 		mac_prop_info_set_range_uint32(prop_handle, 0, XNF_MAXPKT);
3112 		break;
3113 	default:
3114 		break;
3115 	}
3116 }
3117 
3118 static boolean_t
3119 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
3120 {
3121 	xnf_t *xnfp = arg;
3122 
3123 	switch (cap) {
3124 	case MAC_CAPAB_HCKSUM: {
3125 		uint32_t *capab = cap_data;
3126 
3127 		/*
3128 		 * Whilst the flag used to communicate with the IO
3129 		 * domain is called "NETTXF_csum_blank", the checksum
3130 		 * in the packet must contain the pseudo-header
3131 		 * checksum and not zero.
3132 		 *
3133 		 * To help out the IO domain, we might use
3134 		 * HCKSUM_INET_PARTIAL. Unfortunately our stack will
3135 		 * then use checksum offload for IPv6 packets, which
3136 		 * the IO domain can't handle.
3137 		 *
3138 		 * As a result, we declare outselves capable of
3139 		 * HCKSUM_INET_FULL_V4. This means that we receive
3140 		 * IPv4 packets from the stack with a blank checksum
3141 		 * field and must insert the pseudo-header checksum
3142 		 * before passing the packet to the IO domain.
3143 		 */
3144 		*capab = HCKSUM_INET_FULL_V4;
3145 
3146 		/*
3147 		 * TODO: query the "feature-ipv6-csum-offload" capability.
3148 		 * If enabled, that could allow us to use HCKSUM_INET_PARTIAL.
3149 		 */
3150 
3151 		break;
3152 	}
3153 	case MAC_CAPAB_LSO: {
3154 		if (!xnfp->xnf_be_lso)
3155 			return (B_FALSE);
3156 
3157 		mac_capab_lso_t *lso = cap_data;
3158 		lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
3159 		lso->lso_basic_tcp_ipv4.lso_max = IP_MAXPACKET;
3160 		break;
3161 	}
3162 	default:
3163 		return (B_FALSE);
3164 	}
3165 
3166 	return (B_TRUE);
3167 }
3168 
3169 /*
3170  * The state of the peer has changed - react accordingly.
3171  */
3172 static void
3173 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
3174     void *arg, void *impl_data)
3175 {
3176 	_NOTE(ARGUNUSED(id, arg));
3177 	xnf_t *xnfp = ddi_get_driver_private(dip);
3178 	XenbusState new_state = *(XenbusState *)impl_data;
3179 
3180 	ASSERT(xnfp != NULL);
3181 
3182 	switch (new_state) {
3183 	case XenbusStateUnknown:
3184 	case XenbusStateInitialising:
3185 	case XenbusStateInitialised:
3186 	case XenbusStateClosing:
3187 	case XenbusStateClosed:
3188 	case XenbusStateReconfiguring:
3189 	case XenbusStateReconfigured:
3190 		break;
3191 
3192 	case XenbusStateInitWait:
3193 		xnf_read_config(xnfp);
3194 
3195 		if (!xnfp->xnf_be_rx_copy) {
3196 			cmn_err(CE_WARN,
3197 			    "The xnf driver requires a dom0 that "
3198 			    "supports 'feature-rx-copy'.");
3199 			(void) xvdi_switch_state(xnfp->xnf_devinfo,
3200 			    XBT_NULL, XenbusStateClosed);
3201 			break;
3202 		}
3203 
3204 		/*
3205 		 * Connect to the backend.
3206 		 */
3207 		xnf_be_connect(xnfp);
3208 
3209 		/*
3210 		 * Our MAC address as discovered by xnf_read_config().
3211 		 */
3212 		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
3213 
3214 		/*
3215 		 * We do not know if some features such as LSO are supported
3216 		 * until we connect to the backend. We request the MAC layer
3217 		 * to poll our capabilities again.
3218 		 */
3219 		mac_capab_update(xnfp->xnf_mh);
3220 
3221 		break;
3222 
3223 	case XenbusStateConnected:
3224 		mutex_enter(&xnfp->xnf_rxlock);
3225 		mutex_enter(&xnfp->xnf_txlock);
3226 
3227 		xnfp->xnf_connected = B_TRUE;
3228 		/*
3229 		 * Wake up any threads waiting to send data to
3230 		 * backend.
3231 		 */
3232 		cv_broadcast(&xnfp->xnf_cv_state);
3233 
3234 		mutex_exit(&xnfp->xnf_txlock);
3235 		mutex_exit(&xnfp->xnf_rxlock);
3236 
3237 		/*
3238 		 * Kick the peer in case it missed any transmits
3239 		 * request in the TX ring.
3240 		 */
3241 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
3242 
3243 		/*
3244 		 * There may already be completed receive requests in
3245 		 * the ring sent by backend after it gets connected
3246 		 * but before we see its state change here, so we call
3247 		 * xnf_intr() to handle them, if any.
3248 		 */
3249 		(void) xnf_intr((caddr_t)xnfp);
3250 
3251 		/*
3252 		 * Mark the link up now that we are connected.
3253 		 */
3254 		mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
3255 
3256 		/*
3257 		 * Tell the backend about the multicast addresses in
3258 		 * which we are interested.
3259 		 */
3260 		mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
3261 
3262 		break;
3263 
3264 	default:
3265 		break;
3266 	}
3267 }
3268