xref: /titanic_50/usr/src/uts/common/xen/io/xnf.c (revision 5895e34b428f1f8306218e01067b97c05f75d238)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  *
29  * Copyright (c) 2004 Christian Limpach.
30  * All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  * 1. Redistributions of source code must retain the above copyright
36  *    notice, this list of conditions and the following disclaimer.
37  * 2. Redistributions in binary form must reproduce the above copyright
38  *    notice, this list of conditions and the following disclaimer in the
39  *    documentation and/or other materials provided with the distribution.
40  * 3. This section intentionally left blank.
41  * 4. The name of the author may not be used to endorse or promote products
42  *    derived from this software without specific prior written permission.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 /*
56  * Section 3 of the above license was updated in response to bug 6379571.
57  */
58 
59 /*
60  * xnf.c - GLDv3 network driver for domU.
61  */
62 
63 /*
64  * This driver uses four per-instance locks:
65  *
66  * xnf_gref_lock:
67  *
68  *    Protects access to the grant reference list stored in
69  *    xnf_gref_head. Grant references should be acquired and released
70  *    using gref_get() and gref_put() respectively.
71  *
72  * xnf_schedlock:
73  *
74  *    Protects:
75  *    xnf_need_sched - used to record that a previous transmit attempt
76  *       failed (and consequently it will be necessary to call
77  *       mac_tx_update() when transmit resources are available).
78  *    xnf_pending_multicast - the number of multicast requests that
79  *       have been submitted to the backend for which we have not
80  *       processed responses.
81  *
82  * xnf_txlock:
83  *
84  *    Protects the transmit ring (xnf_tx_ring) and associated
85  *    structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head).
86  *
87  * xnf_rxlock:
88  *
89  *    Protects the receive ring (xnf_rx_ring) and associated
90  *    structures (notably xnf_rx_pkt_info).
91  *
92  * If driver-global state that affects both the transmit and receive
93  * rings is manipulated, both xnf_txlock and xnf_rxlock should be
94  * held, in that order.
95  *
96  * xnf_schedlock is acquired both whilst holding xnf_txlock and
97  * without. It should always be acquired after xnf_txlock if both are
98  * held.
99  *
100  * Notes:
101  * - atomic_add_64() is used to manipulate counters where we require
102  *   accuracy. For counters intended only for observation by humans,
103  *   post increment/decrement are used instead.
104  */
105 
106 #include <sys/types.h>
107 #include <sys/errno.h>
108 #include <sys/param.h>
109 #include <sys/sysmacros.h>
110 #include <sys/systm.h>
111 #include <sys/stream.h>
112 #include <sys/strsubr.h>
113 #include <sys/strsun.h>
114 #include <sys/conf.h>
115 #include <sys/ddi.h>
116 #include <sys/devops.h>
117 #include <sys/sunddi.h>
118 #include <sys/sunndi.h>
119 #include <sys/dlpi.h>
120 #include <sys/ethernet.h>
121 #include <sys/strsun.h>
122 #include <sys/pattr.h>
123 #include <inet/ip.h>
124 #include <inet/ip_impl.h>
125 #include <sys/gld.h>
126 #include <sys/modctl.h>
127 #include <sys/mac_provider.h>
128 #include <sys/mac_ether.h>
129 #include <sys/bootinfo.h>
130 #include <sys/mach_mmu.h>
131 #ifdef	XPV_HVM_DRIVER
132 #include <sys/xpv_support.h>
133 #include <sys/hypervisor.h>
134 #else
135 #include <sys/hypervisor.h>
136 #include <sys/evtchn_impl.h>
137 #include <sys/balloon_impl.h>
138 #endif
139 #include <xen/public/io/netif.h>
140 #include <sys/gnttab.h>
141 #include <xen/sys/xendev.h>
142 #include <sys/sdt.h>
143 #include <sys/note.h>
144 #include <sys/debug.h>
145 
146 #include <io/xnf.h>
147 
/*
 * XNF_DEBUG is defined for DEBUG and lint builds only; it gates the
 * debug variables below and the debug printf()s in attach/detach.
 */
#if defined(DEBUG) || defined(__lint)
#define	XNF_DEBUG
#endif

#ifdef XNF_DEBUG
/* Bitmask of enabled debug output; tested against XNF_DEBUG_DDI. */
int xnf_debug = 0;
/* Set by xnf_attach() to the first instance attached while it is NULL. */
xnf_t *xnf_debug_instance = NULL;
#endif
156 
157 /*
158  * On a 32 bit PAE system physical and machine addresses are larger
 * than 32 bits.  ddi_btop() on such systems takes an unsigned long
160  * argument, and so addresses above 4G are truncated before ddi_btop()
161  * gets to see them.  To avoid this, code the shift operation here.
162  */
163 #define	xnf_btop(addr)	((addr) >> PAGESHIFT)
164 
/*
 * NOTE(review): not referenced in the visible part of this file;
 * presumably the maximum number of fragments used for a transmitted
 * packet -- confirm against the transmit path.
 */
unsigned int	xnf_max_tx_frags = 1;
166 
167 /*
168  * Should we use the multicast control feature if the backend provides
169  * it?
170  */
171 boolean_t xnf_multicast_control = B_TRUE;
172 
173 /*
174  * Received packets below this size are copied to a new streams buffer
175  * rather than being desballoc'ed.
176  *
177  * This value is chosen to accommodate traffic where there are a large
178  * number of small packets. For data showing a typical distribution,
179  * see:
180  *
181  * Sinha07a:
182  *	Rishi Sinha, Christos Papadopoulos, and John
183  *	Heidemann. Internet Packet Size Distributions: Some
184  *	Observations. Technical Report ISI-TR-2007-643,
 *	USC/Information Sciences Institute, May, 2007. Originally
186  *	released October 2005 as web page
187  *	http://netweb.usc.edu/~sinha/pkt-sizes/.
188  *	<http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
189  */
190 size_t xnf_rx_copy_limit = 64;
191 
/*
 * Sentinel values: -1 converted to the relevant type, used to mean
 * "no grant handle", "no grant reference" and "no transmit id".
 */
#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define	INVALID_GRANT_REF	((grant_ref_t)-1)
#define	INVALID_TX_ID		((uint16_t)-1)

/*
 * TX_ID_TO_TXID: address of entry `id' in the xnf_tx_pkt_id array of
 * instance `p'.  No range check is made; see TX_ID_VALID.
 *
 * TX_ID_VALID: true when `i' is neither INVALID_TX_ID nor outside the
 * range [0, NET_TX_RING_SIZE).  The argument is evaluated twice.
 */
#define	TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
#define	TX_ID_VALID(i) (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
198 
199 /* Required system entry points */
200 static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
201 static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);
202 
203 /* Required driver entry points for Nemo */
204 static int	xnf_start(void *);
205 static void	xnf_stop(void *);
206 static int	xnf_set_mac_addr(void *, const uint8_t *);
207 static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
208 static int	xnf_set_promiscuous(void *, boolean_t);
209 static mblk_t	*xnf_send(void *, mblk_t *);
210 static uint_t	xnf_intr(caddr_t);
211 static int	xnf_stat(void *, uint_t, uint64_t *);
212 static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
213 
214 /* Driver private functions */
215 static int xnf_alloc_dma_resources(xnf_t *);
216 static void xnf_release_dma_resources(xnf_t *);
217 static void xnf_release_mblks(xnf_t *);
218 
219 static int xnf_buf_constructor(void *, void *, int);
220 static void xnf_buf_destructor(void *, void *);
221 static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
222 #pragma inline(xnf_buf_get)
223 static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
224 #pragma inline(xnf_buf_put)
225 static void xnf_buf_refresh(xnf_buf_t *);
226 #pragma inline(xnf_buf_refresh)
227 static void xnf_buf_recycle(xnf_buf_t *);
228 
229 static int xnf_tx_buf_constructor(void *, void *, int);
230 static void xnf_tx_buf_destructor(void *, void *);
231 
232 static grant_ref_t gref_get(xnf_t *);
233 #pragma inline(gref_get)
234 static void gref_put(xnf_t *, grant_ref_t);
235 #pragma inline(gref_put)
236 
237 static xnf_txid_t *txid_get(xnf_t *);
238 #pragma inline(txid_get)
239 static void txid_put(xnf_t *, xnf_txid_t *);
240 #pragma inline(txid_put)
241 
242 void xnf_send_driver_status(int, int);
243 static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
244 static int xnf_tx_clean_ring(xnf_t  *);
245 static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
246     void *, void *);
247 static boolean_t xnf_kstat_init(xnf_t *);
248 static void xnf_rx_collect(xnf_t *);
249 
/*
 * Callback vector handed to the MAC layer through macp->m_callbacks in
 * xnf_attach().  The initializer is positional, so the order of the
 * entries must match the layout of mac_callbacks_t.
 *
 * NOTE(review): the first entry looks like the flag word announcing
 * which optional callbacks are supplied (here only xnf_getcapab), and
 * the NULL entry an optional callback that is not implemented --
 * confirm against the mac_callbacks_t definition.
 */
static mac_callbacks_t xnf_callbacks = {
	MC_GETCAPAB,
	xnf_stat,
	xnf_start,
	xnf_stop,
	xnf_set_promiscuous,
	xnf_set_multicast,
	xnf_set_mac_addr,
	xnf_send,
	NULL,
	xnf_getcapab
};
262 
/*
 * DMA attributes for network ring buffer.
 *
 * Page aligned (MMU_PAGESIZE) and restricted to a single segment, with
 * no address restriction.  The values are currently identical to those
 * of buf_dma_attr below.
 */
static ddi_dma_attr_t ringbuf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
278 
/*
 * DMA attributes for transmit and receive data.
 *
 * Page aligned (MMU_PAGESIZE) and restricted to a single segment, with
 * no address restriction.  The values are currently identical to those
 * of ringbuf_dma_attr above.
 */
static ddi_dma_attr_t buf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
294 
/*
 * DMA access attributes for registers and descriptors: little-endian
 * structure access with strict ordering.
 */
static ddi_device_acc_attr_t accattr = {
	DDI_DEVICE_ATTR_V0,	/* version of this structure */
	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
	DDI_STRICTORDER_ACC	/* accesses must not be reordered */
};
301 
/*
 * DMA access attributes for data: NOT to be byte swapped, strictly
 * ordered.
 */
static ddi_device_acc_attr_t data_accattr = {
	DDI_DEVICE_ATTR_V0,	/* version of this structure */
	DDI_NEVERSWAP_ACC,	/* packet data is passed through as-is */
	DDI_STRICTORDER_ACC	/* accesses must not be reordered */
};
308 
/*
 * Defines xnf_dev_ops, the device operations for this driver, with
 * xnf_attach() and xnf_detach() as its attach/detach entry points and
 * quiesce declared as unsupported (ddi_quiesce_not_supported).
 * xnf_dev_ops is completed for MAC use by mac_init_ops() in _init().
 */
DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
    nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);
311 
/*
 * Driver linkage: identifies this module as a device driver
 * (mod_driverops), gives its description string and points at
 * xnf_dev_ops.
 */
static struct modldrv xnf_modldrv = {
	&mod_driverops,
	"Virtual Ethernet driver",
	&xnf_dev_ops
};
317 
/*
 * Module linkage passed to mod_install() in _init() and to mod_info()
 * in _info(); it contains the single driver linkage xnf_modldrv.
 */
static struct modlinkage modlinkage = {
	MODREV_1, &xnf_modldrv, NULL
};
321 
322 int
323 _init(void)
324 {
325 	int r;
326 
327 	mac_init_ops(&xnf_dev_ops, "xnf");
328 	r = mod_install(&modlinkage);
329 	if (r != DDI_SUCCESS)
330 		mac_fini_ops(&xnf_dev_ops);
331 
332 	return (r);
333 }
334 
/*
 * Module unload entry point.  Unloading is always refused with EBUSY.
 */
int
_fini(void)
{
	/* XXPV should be removable */
	return (EBUSY);
}
340 
341 int
342 _info(struct modinfo *modinfop)
343 {
344 	return (mod_info(&modlinkage, modinfop));
345 }
346 
347 /*
348  * Acquire a grant reference.
349  */
350 static grant_ref_t
351 gref_get(xnf_t *xnfp)
352 {
353 	grant_ref_t gref;
354 
355 	mutex_enter(&xnfp->xnf_gref_lock);
356 
357 	do {
358 		gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head);
359 
360 	} while ((gref == INVALID_GRANT_REF) &&
361 	    (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0));
362 
363 	mutex_exit(&xnfp->xnf_gref_lock);
364 
365 	if (gref == INVALID_GRANT_REF) {
366 		xnfp->xnf_stat_gref_failure++;
367 	} else {
368 		atomic_add_64(&xnfp->xnf_stat_gref_outstanding, 1);
369 		if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak)
370 			xnfp->xnf_stat_gref_peak =
371 			    xnfp->xnf_stat_gref_outstanding;
372 	}
373 
374 	return (gref);
375 }
376 
377 /*
378  * Release a grant reference.
379  */
380 static void
381 gref_put(xnf_t *xnfp, grant_ref_t gref)
382 {
383 	ASSERT(gref != INVALID_GRANT_REF);
384 
385 	mutex_enter(&xnfp->xnf_gref_lock);
386 	gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref);
387 	mutex_exit(&xnfp->xnf_gref_lock);
388 
389 	atomic_add_64(&xnfp->xnf_stat_gref_outstanding, -1);
390 }
391 
392 /*
393  * Acquire a transmit id.
394  */
395 static xnf_txid_t *
396 txid_get(xnf_t *xnfp)
397 {
398 	xnf_txid_t *tidp;
399 
400 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
401 
402 	if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID)
403 		return (NULL);
404 
405 	ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head));
406 
407 	tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head);
408 	xnfp->xnf_tx_pkt_id_head = tidp->next;
409 	tidp->next = INVALID_TX_ID;
410 
411 	ASSERT(tidp->txbuf == NULL);
412 
413 	return (tidp);
414 }
415 
416 /*
417  * Release a transmit id.
418  */
419 static void
420 txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
421 {
422 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
423 	ASSERT(TX_ID_VALID(tidp->id));
424 	ASSERT(tidp->next == INVALID_TX_ID);
425 
426 	tidp->txbuf = NULL;
427 	tidp->next = xnfp->xnf_tx_pkt_id_head;
428 	xnfp->xnf_tx_pkt_id_head = tidp->id;
429 }
430 
431 /*
432  * Get `wanted' slots in the transmit ring, waiting for at least that
433  * number if `wait' is B_TRUE. Force the ring to be cleaned by setting
434  * `wanted' to zero.
435  *
436  * Return the number of slots available.
437  */
438 static int
439 tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
440 {
441 	int slotsfree;
442 	boolean_t forced_clean = (wanted == 0);
443 
444 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
445 
446 	/* LINTED: constant in conditional context */
447 	while (B_TRUE) {
448 		slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring);
449 
450 		if ((slotsfree < wanted) || forced_clean)
451 			slotsfree = xnf_tx_clean_ring(xnfp);
452 
453 		/*
454 		 * If there are more than we need free, tell other
455 		 * people to come looking again. We hold txlock, so we
456 		 * are able to take our slots before anyone else runs.
457 		 */
458 		if (slotsfree > wanted)
459 			cv_broadcast(&xnfp->xnf_cv_tx_slots);
460 
461 		if (slotsfree >= wanted)
462 			break;
463 
464 		if (!wait)
465 			break;
466 
467 		cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock);
468 	}
469 
470 	ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring)));
471 
472 	return (slotsfree);
473 }
474 
475 static int
476 xnf_setup_rings(xnf_t *xnfp)
477 {
478 	domid_t			oeid;
479 	struct xenbus_device	*xsd;
480 	RING_IDX		i;
481 	int			err;
482 	xnf_txid_t		*tidp;
483 	xnf_buf_t **bdescp;
484 
485 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
486 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
487 
488 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
489 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
490 
491 	err = gnttab_grant_foreign_access(oeid,
492 	    xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
493 	if (err <= 0) {
494 		err = -err;
495 		xenbus_dev_error(xsd, err, "granting access to tx ring page");
496 		goto out;
497 	}
498 	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
499 
500 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
501 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
502 
503 	err = gnttab_grant_foreign_access(oeid,
504 	    xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
505 	if (err <= 0) {
506 		err = -err;
507 		xenbus_dev_error(xsd, err, "granting access to rx ring page");
508 		goto out;
509 	}
510 	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
511 
512 	mutex_enter(&xnfp->xnf_txlock);
513 
514 	/*
515 	 * Setup/cleanup the TX ring.  Note that this can lose packets
516 	 * after a resume, but we expect to stagger on.
517 	 */
518 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. */
519 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
520 	    i < NET_TX_RING_SIZE;
521 	    i++, tidp++) {
522 		xnf_txbuf_t *txp;
523 
524 		tidp->id = i;
525 
526 		txp = tidp->txbuf;
527 		if (txp == NULL) {
528 			tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
529 			txid_put(xnfp, tidp);
530 			continue;
531 		}
532 
533 		ASSERT(txp->tx_txreq.gref != INVALID_GRANT_REF);
534 		ASSERT(txp->tx_mp != NULL);
535 
536 		switch (txp->tx_type) {
537 		case TX_DATA:
538 			VERIFY(gnttab_query_foreign_access(txp->tx_txreq.gref)
539 			    == 0);
540 
541 			if (txp->tx_bdesc == NULL) {
542 				(void) gnttab_end_foreign_access_ref(
543 				    txp->tx_txreq.gref, 1);
544 				gref_put(xnfp, txp->tx_txreq.gref);
545 				(void) ddi_dma_unbind_handle(
546 				    txp->tx_dma_handle);
547 			} else {
548 				xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
549 			}
550 
551 			freemsg(txp->tx_mp);
552 			txid_put(xnfp, tidp);
553 			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
554 
555 			break;
556 
557 		case TX_MCAST_REQ:
558 			txp->tx_type = TX_MCAST_RSP;
559 			txp->tx_status = NETIF_RSP_DROPPED;
560 			cv_broadcast(&xnfp->xnf_cv_multicast);
561 
562 			/*
563 			 * The request consumed two slots in the ring,
564 			 * yet only a single xnf_txid_t is used. Step
565 			 * over the empty slot.
566 			 */
567 			i++;
568 			ASSERT(i < NET_TX_RING_SIZE);
569 
570 			break;
571 
572 		case TX_MCAST_RSP:
573 			break;
574 		}
575 	}
576 
577 	/* LINTED: constant in conditional context */
578 	SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
579 	/* LINTED: constant in conditional context */
580 	FRONT_RING_INIT(&xnfp->xnf_tx_ring,
581 	    xnfp->xnf_tx_ring.sring, PAGESIZE);
582 
583 	mutex_exit(&xnfp->xnf_txlock);
584 
585 	mutex_enter(&xnfp->xnf_rxlock);
586 
587 	/*
588 	 * Clean out any buffers currently posted to the receive ring
589 	 * before we reset it.
590 	 */
591 	for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0];
592 	    i < NET_RX_RING_SIZE;
593 	    i++, bdescp++) {
594 		if (*bdescp != NULL) {
595 			xnf_buf_put(xnfp, *bdescp, B_FALSE);
596 			*bdescp = NULL;
597 		}
598 	}
599 
600 	/* LINTED: constant in conditional context */
601 	SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
602 	/* LINTED: constant in conditional context */
603 	FRONT_RING_INIT(&xnfp->xnf_rx_ring,
604 	    xnfp->xnf_rx_ring.sring, PAGESIZE);
605 
606 	/*
607 	 * Fill the ring with buffers.
608 	 */
609 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
610 		xnf_buf_t *bdesc;
611 
612 		bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE);
613 		VERIFY(bdesc != NULL);
614 		xnf_rxbuf_hang(xnfp, bdesc);
615 	}
616 
617 	/* LINTED: constant in conditional context */
618 	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
619 
620 	mutex_exit(&xnfp->xnf_rxlock);
621 
622 	return (0);
623 
624 out:
625 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
626 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
627 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
628 
629 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
630 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
631 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
632 
633 	return (err);
634 }
635 
636 /*
637  * Connect driver to back end, called to set up communication with
638  * back end driver both initially and on resume after restore/migrate.
639  */
640 void
641 xnf_be_connect(xnf_t *xnfp)
642 {
643 	const char	*message;
644 	xenbus_transaction_t xbt;
645 	struct		xenbus_device *xsd;
646 	char		*xsname;
647 	int		err;
648 
649 	ASSERT(!xnfp->xnf_connected);
650 
651 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
652 	xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
653 
654 	err = xnf_setup_rings(xnfp);
655 	if (err != 0) {
656 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
657 		xenbus_dev_error(xsd, err, "setting up ring");
658 		return;
659 	}
660 
661 again:
662 	err = xenbus_transaction_start(&xbt);
663 	if (err != 0) {
664 		xenbus_dev_error(xsd, EIO, "starting transaction");
665 		return;
666 	}
667 
668 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
669 	    xnfp->xnf_tx_ring_ref);
670 	if (err != 0) {
671 		message = "writing tx ring-ref";
672 		goto abort_transaction;
673 	}
674 
675 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
676 	    xnfp->xnf_rx_ring_ref);
677 	if (err != 0) {
678 		message = "writing rx ring-ref";
679 		goto abort_transaction;
680 	}
681 
682 	err = xenbus_printf(xbt, xsname, "event-channel", "%u",
683 	    xnfp->xnf_evtchn);
684 	if (err != 0) {
685 		message = "writing event-channel";
686 		goto abort_transaction;
687 	}
688 
689 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
690 	if (err != 0) {
691 		message = "writing feature-rx-notify";
692 		goto abort_transaction;
693 	}
694 
695 	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1);
696 	if (err != 0) {
697 		message = "writing request-rx-copy";
698 		goto abort_transaction;
699 	}
700 
701 	if (xnfp->xnf_be_mcast_control) {
702 		err = xenbus_printf(xbt, xsname, "request-multicast-control",
703 		    "%d", 1);
704 		if (err != 0) {
705 			message = "writing request-multicast-control";
706 			goto abort_transaction;
707 		}
708 	}
709 
710 	err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
711 	if (err != 0) {
712 		message = "switching state to XenbusStateConnected";
713 		goto abort_transaction;
714 	}
715 
716 	err = xenbus_transaction_end(xbt, 0);
717 	if (err != 0) {
718 		if (err == EAGAIN)
719 			goto again;
720 		xenbus_dev_error(xsd, err, "completing transaction");
721 	}
722 
723 	return;
724 
725 abort_transaction:
726 	(void) xenbus_transaction_end(xbt, 1);
727 	xenbus_dev_error(xsd, err, "%s", message);
728 }
729 
730 /*
731  * Read configuration information from xenstore.
732  */
733 void
734 xnf_read_config(xnf_t *xnfp)
735 {
736 	int err, be_cap;
737 	char mac[ETHERADDRL * 3];
738 	char *oename = xvdi_get_oename(xnfp->xnf_devinfo);
739 
740 	err = xenbus_scanf(XBT_NULL, oename, "mac",
741 	    "%s", (char *)&mac[0]);
742 	if (err != 0) {
743 		/*
744 		 * bad: we're supposed to be set up with a proper mac
745 		 * addr. at this point
746 		 */
747 		cmn_err(CE_WARN, "%s%d: no mac address",
748 		    ddi_driver_name(xnfp->xnf_devinfo),
749 		    ddi_get_instance(xnfp->xnf_devinfo));
750 			return;
751 	}
752 	if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
753 		err = ENOENT;
754 		xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
755 		    "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
756 		return;
757 	}
758 
759 	err = xenbus_scanf(XBT_NULL, oename,
760 	    "feature-rx-copy", "%d", &be_cap);
761 	/*
762 	 * If we fail to read the store we assume that the key is
763 	 * absent, implying an older domain at the far end.  Older
764 	 * domains cannot do HV copy.
765 	 */
766 	if (err != 0)
767 		be_cap = 0;
768 	xnfp->xnf_be_rx_copy = (be_cap != 0);
769 
770 	err = xenbus_scanf(XBT_NULL, oename,
771 	    "feature-multicast-control", "%d", &be_cap);
772 	/*
773 	 * If we fail to read the store we assume that the key is
774 	 * absent, implying an older domain at the far end.  Older
775 	 * domains do not support multicast control.
776 	 */
777 	if (err != 0)
778 		be_cap = 0;
779 	xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
780 }
781 
782 /*
783  *  attach(9E) -- Attach a device to the system
784  */
785 static int
786 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
787 {
788 	mac_register_t *macp;
789 	xnf_t *xnfp;
790 	int err;
791 	char cachename[32];
792 
793 #ifdef XNF_DEBUG
794 	if (xnf_debug & XNF_DEBUG_DDI)
795 		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
796 		    (void *)devinfo);
797 #endif
798 
799 	switch (cmd) {
800 	case DDI_RESUME:
801 		xnfp = ddi_get_driver_private(devinfo);
802 		xnfp->xnf_gen++;
803 
804 		(void) xvdi_resume(devinfo);
805 		(void) xvdi_alloc_evtchn(devinfo);
806 		xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
807 #ifdef XPV_HVM_DRIVER
808 		ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
809 		    xnfp);
810 #else
811 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
812 		    (caddr_t)xnfp);
813 #endif
814 		return (DDI_SUCCESS);
815 
816 	case DDI_ATTACH:
817 		break;
818 
819 	default:
820 		return (DDI_FAILURE);
821 	}
822 
823 	/*
824 	 *  Allocate gld_mac_info_t and xnf_instance structures
825 	 */
826 	macp = mac_alloc(MAC_VERSION);
827 	if (macp == NULL)
828 		return (DDI_FAILURE);
829 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
830 
831 	macp->m_dip = devinfo;
832 	macp->m_driver = xnfp;
833 	xnfp->xnf_devinfo = devinfo;
834 
835 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
836 	macp->m_src_addr = xnfp->xnf_mac_addr;
837 	macp->m_callbacks = &xnf_callbacks;
838 	macp->m_min_sdu = 0;
839 	macp->m_max_sdu = XNF_MAXPKT;
840 
841 	xnfp->xnf_running = B_FALSE;
842 	xnfp->xnf_connected = B_FALSE;
843 	xnfp->xnf_be_rx_copy = B_FALSE;
844 	xnfp->xnf_be_mcast_control = B_FALSE;
845 	xnfp->xnf_need_sched = B_FALSE;
846 
847 	xnfp->xnf_rx_head = NULL;
848 	xnfp->xnf_rx_tail = NULL;
849 	xnfp->xnf_rx_new_buffers_posted = B_FALSE;
850 
851 #ifdef XPV_HVM_DRIVER
852 	/*
853 	 * Report our version to dom0.
854 	 */
855 	if (xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
856 	    HVMPV_XNF_VERS))
857 		cmn_err(CE_WARN, "xnf: couldn't write version\n");
858 #endif
859 
860 	/*
861 	 * Get the iblock cookie with which to initialize the mutexes.
862 	 */
863 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
864 	    != DDI_SUCCESS)
865 		goto failure;
866 
867 	mutex_init(&xnfp->xnf_txlock,
868 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
869 	mutex_init(&xnfp->xnf_rxlock,
870 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
871 	mutex_init(&xnfp->xnf_schedlock,
872 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
873 	mutex_init(&xnfp->xnf_gref_lock,
874 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
875 
876 	cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL);
877 	cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL);
878 	cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL);
879 
880 	(void) sprintf(cachename, "xnf_buf_cache_%d",
881 	    ddi_get_instance(devinfo));
882 	xnfp->xnf_buf_cache = kmem_cache_create(cachename,
883 	    sizeof (xnf_buf_t), 0,
884 	    xnf_buf_constructor, xnf_buf_destructor,
885 	    NULL, xnfp, NULL, 0);
886 	if (xnfp->xnf_buf_cache == NULL)
887 		goto failure_0;
888 
889 	(void) sprintf(cachename, "xnf_tx_buf_cache_%d",
890 	    ddi_get_instance(devinfo));
891 	xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename,
892 	    sizeof (xnf_txbuf_t), 0,
893 	    xnf_tx_buf_constructor, xnf_tx_buf_destructor,
894 	    NULL, xnfp, NULL, 0);
895 	if (xnfp->xnf_tx_buf_cache == NULL)
896 		goto failure_1;
897 
898 	xnfp->xnf_gref_head = INVALID_GRANT_REF;
899 
900 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
901 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
902 		    "driver data structures",
903 		    ddi_get_instance(xnfp->xnf_devinfo));
904 		goto failure_2;
905 	}
906 
907 	xnfp->xnf_rx_ring.sring->rsp_event =
908 	    xnfp->xnf_tx_ring.sring->rsp_event = 1;
909 
910 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
911 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
912 
913 	/* set driver private pointer now */
914 	ddi_set_driver_private(devinfo, xnfp);
915 
916 	if (!xnf_kstat_init(xnfp))
917 		goto failure_3;
918 
919 	/*
920 	 * Allocate an event channel, add the interrupt handler and
921 	 * bind it to the event channel.
922 	 */
923 	(void) xvdi_alloc_evtchn(devinfo);
924 	xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
925 #ifdef XPV_HVM_DRIVER
926 	ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
927 #else
928 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
929 #endif
930 
931 	err = mac_register(macp, &xnfp->xnf_mh);
932 	mac_free(macp);
933 	macp = NULL;
934 	if (err != 0)
935 		goto failure_4;
936 
937 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
938 	    != DDI_SUCCESS)
939 		goto failure_5;
940 
941 #ifdef XPV_HVM_DRIVER
942 	/*
943 	 * In the HVM case, this driver essentially replaces a driver for
944 	 * a 'real' PCI NIC. Without the "model" property set to
945 	 * "Ethernet controller", like the PCI code does, netbooting does
946 	 * not work correctly, as strplumb_get_netdev_path() will not find
947 	 * this interface.
948 	 */
949 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
950 	    "Ethernet controller");
951 #endif
952 
953 #ifdef XNF_DEBUG
954 	if (xnf_debug_instance == NULL)
955 		xnf_debug_instance = xnfp;
956 #endif
957 
958 	return (DDI_SUCCESS);
959 
960 failure_5:
961 	mac_unregister(xnfp->xnf_mh);
962 
963 failure_4:
964 #ifdef XPV_HVM_DRIVER
965 	ec_unbind_evtchn(xnfp->xnf_evtchn);
966 	xvdi_free_evtchn(devinfo);
967 #else
968 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
969 #endif
970 	xnfp->xnf_evtchn = INVALID_EVTCHN;
971 	kstat_delete(xnfp->xnf_kstat_aux);
972 
973 failure_3:
974 	xnf_release_dma_resources(xnfp);
975 
976 failure_2:
977 	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
978 
979 failure_1:
980 	kmem_cache_destroy(xnfp->xnf_buf_cache);
981 
982 failure_0:
983 	cv_destroy(&xnfp->xnf_cv_tx_slots);
984 	cv_destroy(&xnfp->xnf_cv_multicast);
985 	cv_destroy(&xnfp->xnf_cv_state);
986 
987 	mutex_destroy(&xnfp->xnf_gref_lock);
988 	mutex_destroy(&xnfp->xnf_schedlock);
989 	mutex_destroy(&xnfp->xnf_rxlock);
990 	mutex_destroy(&xnfp->xnf_txlock);
991 
992 failure:
993 	kmem_free(xnfp, sizeof (*xnfp));
994 	if (macp != NULL)
995 		mac_free(macp);
996 
997 	return (DDI_FAILURE);
998 }
999 
1000 /*  detach(9E) -- Detach a device from the system */
1001 static int
1002 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
1003 {
1004 	xnf_t *xnfp;		/* Our private device info */
1005 
1006 #ifdef XNF_DEBUG
1007 	if (xnf_debug & XNF_DEBUG_DDI)
1008 		printf("xnf_detach(0x%p)\n", (void *)devinfo);
1009 #endif
1010 
1011 	xnfp = ddi_get_driver_private(devinfo);
1012 
1013 	switch (cmd) {
1014 	case DDI_SUSPEND:
1015 #ifdef XPV_HVM_DRIVER
1016 		ec_unbind_evtchn(xnfp->xnf_evtchn);
1017 		xvdi_free_evtchn(devinfo);
1018 #else
1019 		ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1020 #endif
1021 
1022 		xvdi_suspend(devinfo);
1023 
1024 		mutex_enter(&xnfp->xnf_rxlock);
1025 		mutex_enter(&xnfp->xnf_txlock);
1026 
1027 		xnfp->xnf_evtchn = INVALID_EVTCHN;
1028 		xnfp->xnf_connected = B_FALSE;
1029 		mutex_exit(&xnfp->xnf_txlock);
1030 		mutex_exit(&xnfp->xnf_rxlock);
1031 
1032 		/* claim link to be down after disconnect */
1033 		mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
1034 		return (DDI_SUCCESS);
1035 
1036 	case DDI_DETACH:
1037 		break;
1038 
1039 	default:
1040 		return (DDI_FAILURE);
1041 	}
1042 
1043 	if (xnfp->xnf_connected)
1044 		return (DDI_FAILURE);
1045 
1046 	/*
1047 	 * Cannot detach if we have xnf_buf_t outstanding.
1048 	 */
1049 	if (xnfp->xnf_stat_buf_allocated > 0)
1050 		return (DDI_FAILURE);
1051 
1052 	if (mac_unregister(xnfp->xnf_mh) != 0)
1053 		return (DDI_FAILURE);
1054 
1055 	kstat_delete(xnfp->xnf_kstat_aux);
1056 
1057 	/* Stop the receiver */
1058 	xnf_stop(xnfp);
1059 
1060 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
1061 
1062 	/* Remove the interrupt */
1063 #ifdef XPV_HVM_DRIVER
1064 	ec_unbind_evtchn(xnfp->xnf_evtchn);
1065 	xvdi_free_evtchn(devinfo);
1066 #else
1067 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1068 #endif
1069 
1070 	/* Release any pending xmit mblks */
1071 	xnf_release_mblks(xnfp);
1072 
1073 	/* Release all DMA resources */
1074 	xnf_release_dma_resources(xnfp);
1075 
1076 	cv_destroy(&xnfp->xnf_cv_tx_slots);
1077 	cv_destroy(&xnfp->xnf_cv_multicast);
1078 	cv_destroy(&xnfp->xnf_cv_state);
1079 
1080 	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1081 	kmem_cache_destroy(xnfp->xnf_buf_cache);
1082 
1083 	mutex_destroy(&xnfp->xnf_gref_lock);
1084 	mutex_destroy(&xnfp->xnf_schedlock);
1085 	mutex_destroy(&xnfp->xnf_rxlock);
1086 	mutex_destroy(&xnfp->xnf_txlock);
1087 
1088 	kmem_free(xnfp, sizeof (*xnfp));
1089 
1090 	return (DDI_SUCCESS);
1091 }
1092 
1093 /*
1094  *  xnf_set_mac_addr() -- set the physical network address on the board.
1095  */
1096 static int
1097 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
1098 {
1099 	_NOTE(ARGUNUSED(arg, macaddr));
1100 
1101 	/*
1102 	 * We can't set our macaddr.
1103 	 */
1104 	return (ENOTSUP);
1105 }
1106 
1107 /*
1108  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
1109  *
1110  *  Program the hardware to enable/disable the multicast address
1111  *  in "mca".  Enable if "add" is true, disable if false.
1112  */
1113 static int
1114 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
1115 {
1116 	xnf_t *xnfp = arg;
1117 	xnf_txbuf_t *txp;
1118 	int n_slots;
1119 	RING_IDX slot;
1120 	xnf_txid_t *tidp;
1121 	netif_tx_request_t *txrp;
1122 	struct netif_extra_info *erp;
1123 	boolean_t notify, result;
1124 
1125 	/*
1126 	 * If the backend does not support multicast control then we
1127 	 * must assume that the right packets will just arrive.
1128 	 */
1129 	if (!xnfp->xnf_be_mcast_control)
1130 		return (0);
1131 
1132 	txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
1133 	if (txp == NULL)
1134 		return (1);
1135 
1136 	mutex_enter(&xnfp->xnf_txlock);
1137 
1138 	/*
1139 	 * If we're not yet connected then claim success. This is
1140 	 * acceptable because we refresh the entire set of multicast
1141 	 * addresses when we get connected.
1142 	 *
1143 	 * We can't wait around here because the MAC layer expects
1144 	 * this to be a non-blocking operation - waiting ends up
1145 	 * causing a deadlock during resume.
1146 	 */
1147 	if (!xnfp->xnf_connected) {
1148 		mutex_exit(&xnfp->xnf_txlock);
1149 		return (0);
1150 	}
1151 
1152 	/*
1153 	 * 1. Acquire two slots in the ring.
1154 	 * 2. Fill in the slots.
1155 	 * 3. Request notification when the operation is done.
1156 	 * 4. Kick the peer.
1157 	 * 5. Wait for the response via xnf_tx_clean_ring().
1158 	 */
1159 
1160 	n_slots = tx_slots_get(xnfp, 2, B_TRUE);
1161 	ASSERT(n_slots >= 2);
1162 
1163 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1164 	tidp = txid_get(xnfp);
1165 	VERIFY(tidp != NULL);
1166 
1167 	txp->tx_type = TX_MCAST_REQ;
1168 	txp->tx_slot = slot;
1169 
1170 	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1171 	erp = (struct netif_extra_info *)
1172 	    RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1);
1173 
1174 	txrp->gref = 0;
1175 	txrp->size = 0;
1176 	txrp->offset = 0;
1177 	/* Set tx_txreq.id to appease xnf_tx_clean_ring(). */
1178 	txrp->id = txp->tx_txreq.id = tidp->id;
1179 	txrp->flags = NETTXF_extra_info;
1180 
1181 	erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
1182 	    XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
1183 	bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL);
1184 
1185 	tidp->txbuf = txp;
1186 
1187 	xnfp->xnf_tx_ring.req_prod_pvt = slot + 2;
1188 
1189 	mutex_enter(&xnfp->xnf_schedlock);
1190 	xnfp->xnf_pending_multicast++;
1191 	mutex_exit(&xnfp->xnf_schedlock);
1192 
1193 	/* LINTED: constant in conditional context */
1194 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1195 	    notify);
1196 	if (notify)
1197 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1198 
1199 	while (txp->tx_type == TX_MCAST_REQ)
1200 		cv_wait(&xnfp->xnf_cv_multicast,
1201 		    &xnfp->xnf_txlock);
1202 
1203 	ASSERT(txp->tx_type == TX_MCAST_RSP);
1204 
1205 	mutex_enter(&xnfp->xnf_schedlock);
1206 	xnfp->xnf_pending_multicast--;
1207 	mutex_exit(&xnfp->xnf_schedlock);
1208 
1209 	result = (txp->tx_status == NETIF_RSP_OKAY);
1210 
1211 	txid_put(xnfp, tidp);
1212 
1213 	mutex_exit(&xnfp->xnf_txlock);
1214 
1215 	kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1216 
1217 	return (result ? 0 : 1);
1218 }
1219 
1220 /*
1221  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
1222  *
1223  *  Program the hardware to enable/disable promiscuous mode.
1224  */
1225 static int
1226 xnf_set_promiscuous(void *arg, boolean_t on)
1227 {
1228 	_NOTE(ARGUNUSED(arg, on));
1229 
1230 	/*
1231 	 * We can't really do this, but we pretend that we can in
1232 	 * order that snoop will work.
1233 	 */
1234 	return (0);
1235 }
1236 
/*
 * Clean buffers that we have responses for from the transmit ring.
 *
 * The caller must hold xnf_txlock.  Each response is matched to its
 * xnf_txbuf_t through the transmit id and handled according to the
 * buffer type.  Returns the value of RING_FREE_REQUESTS() for the
 * transmit ring once no further responses are pending.
 */
static int
xnf_tx_clean_ring(xnf_t *xnfp)
{
	boolean_t work_to_do;

	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));

loop:
	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
		RING_IDX cons, prod, i;

		/*
		 * Snapshot the range of responses to process; the
		 * barrier orders the read of rsp_prod before the reads
		 * of the responses themselves.
		 */
		cons = xnfp->xnf_tx_ring.rsp_cons;
		prod = xnfp->xnf_tx_ring.sring->rsp_prod;
		membar_consumer();
		/*
		 * Clean tx requests from ring that we have responses
		 * for.
		 */
		DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod);
		for (i = cons; i != prod; i++) {
			netif_tx_response_t *trp;
			xnf_txid_t *tidp;
			xnf_txbuf_t *txp;

			trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
			ASSERT(TX_ID_VALID(trp->id));

			/* Map the response id back to our bookkeeping. */
			tidp = TX_ID_TO_TXID(xnfp, trp->id);
			ASSERT(tidp->id == trp->id);
			ASSERT(tidp->next == INVALID_TX_ID);

			txp = tidp->txbuf;
			ASSERT(txp != NULL);
			ASSERT(txp->tx_txreq.id == trp->id);

			switch (txp->tx_type) {
			case TX_DATA:
				/*
				 * A data packet: the peer must no
				 * longer be using the grant.
				 */
				if (gnttab_query_foreign_access(
				    txp->tx_txreq.gref) != 0)
					cmn_err(CE_PANIC,
					    "tx grant %d still in use by "
					    "backend domain",
					    txp->tx_txreq.gref);

				if (txp->tx_bdesc == NULL) {
					/*
					 * No look-aside buffer was used:
					 * end the grant, return the
					 * reference and unbind the DMA
					 * handle.
					 */
					(void) gnttab_end_foreign_access_ref(
					    txp->tx_txreq.gref, 1);
					gref_put(xnfp, txp->tx_txreq.gref);
					(void) ddi_dma_unbind_handle(
					    txp->tx_dma_handle);
				} else {
					/* Return the look-aside buffer. */
					xnf_buf_put(xnfp, txp->tx_bdesc,
					    B_TRUE);
				}

				freemsg(txp->tx_mp);
				txid_put(xnfp, tidp);
				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);

				break;

			case TX_MCAST_REQ:
				/*
				 * A multicast request: record the
				 * status and wake the thread waiting
				 * on xnf_cv_multicast.  The id and
				 * buffer are not released here.
				 */
				txp->tx_type = TX_MCAST_RSP;
				txp->tx_status = trp->status;
				cv_broadcast(&xnfp->xnf_cv_multicast);

				break;

			case TX_MCAST_RSP:
				/* Already marked as answered; nothing to do. */
				break;

			default:
				cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
				    "invalid xnf_txbuf_t type: %d",
				    txp->tx_type);
				break;
			}
		}
		/*
		 * Record the last response we dealt with so that we
		 * know where to start next time around.
		 */
		xnfp->xnf_tx_ring.rsp_cons = prod;
		membar_enter();
	}

	/*
	 * Re-check for responses that arrived while we were working;
	 * if there are any, go round again.
	 */
	/* LINTED: constant in conditional context */
	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
	if (work_to_do)
		goto loop;

	return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
}
1333 
1334 /*
1335  * Allocate and fill in a look-aside buffer for the packet `mp'. Used
1336  * to ensure that the packet is physically contiguous and contained
1337  * within a single page.
1338  */
1339 static xnf_buf_t *
1340 xnf_tx_pullup(xnf_t *xnfp, mblk_t *mp)
1341 {
1342 	xnf_buf_t *bd;
1343 	caddr_t bp;
1344 
1345 	bd = xnf_buf_get(xnfp, KM_SLEEP, B_TRUE);
1346 	if (bd == NULL)
1347 		return (NULL);
1348 
1349 	bp = bd->buf;
1350 	while (mp != NULL) {
1351 		size_t len = MBLKL(mp);
1352 
1353 		bcopy(mp->b_rptr, bp, len);
1354 		bp += len;
1355 
1356 		mp = mp->b_cont;
1357 	}
1358 
1359 	ASSERT((bp - bd->buf) <= PAGESIZE);
1360 
1361 	xnfp->xnf_stat_tx_pullup++;
1362 
1363 	return (bd);
1364 }
1365 
1366 /*
1367  * Insert the pseudo-header checksum into the packet `buf'.
1368  */
1369 void
1370 xnf_pseudo_cksum(caddr_t buf, int length)
1371 {
1372 	struct ether_header *ehp;
1373 	uint16_t sap, len, *stuff;
1374 	uint32_t cksum;
1375 	size_t offset;
1376 	ipha_t *ipha;
1377 	ipaddr_t src, dst;
1378 
1379 	ASSERT(length >= sizeof (*ehp));
1380 	ehp = (struct ether_header *)buf;
1381 
1382 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
1383 		struct ether_vlan_header *evhp;
1384 
1385 		ASSERT(length >= sizeof (*evhp));
1386 		evhp = (struct ether_vlan_header *)buf;
1387 		sap = ntohs(evhp->ether_type);
1388 		offset = sizeof (*evhp);
1389 	} else {
1390 		sap = ntohs(ehp->ether_type);
1391 		offset = sizeof (*ehp);
1392 	}
1393 
1394 	ASSERT(sap == ETHERTYPE_IP);
1395 
1396 	/* Packet should have been pulled up by the caller. */
1397 	if ((offset + sizeof (ipha_t)) > length) {
1398 		cmn_err(CE_WARN, "xnf_pseudo_cksum: no room for checksum");
1399 		return;
1400 	}
1401 
1402 	ipha = (ipha_t *)(buf + offset);
1403 
1404 	ASSERT(IPH_HDR_LENGTH(ipha) == IP_SIMPLE_HDR_LENGTH);
1405 
1406 	len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
1407 
1408 	switch (ipha->ipha_protocol) {
1409 	case IPPROTO_TCP:
1410 		stuff = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
1411 		cksum = IP_TCP_CSUM_COMP;
1412 		break;
1413 	case IPPROTO_UDP:
1414 		stuff = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
1415 		cksum = IP_UDP_CSUM_COMP;
1416 		break;
1417 	default:
1418 		cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
1419 		    ipha->ipha_protocol);
1420 		return;
1421 	}
1422 
1423 	src = ipha->ipha_src;
1424 	dst = ipha->ipha_dst;
1425 
1426 	cksum += (dst >> 16) + (dst & 0xFFFF);
1427 	cksum += (src >> 16) + (src & 0xFFFF);
1428 	cksum += htons(len);
1429 
1430 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1431 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1432 
1433 	ASSERT(cksum <= 0xFFFF);
1434 
1435 	*stuff = (uint16_t)(cksum ? cksum : ~cksum);
1436 }
1437 
1438 /*
1439  * Push a list of prepared packets (`txp') into the transmit ring.
1440  */
1441 static xnf_txbuf_t *
1442 tx_push_packets(xnf_t *xnfp, xnf_txbuf_t *txp)
1443 {
1444 	int slots_free;
1445 	RING_IDX slot;
1446 	boolean_t notify;
1447 
1448 	mutex_enter(&xnfp->xnf_txlock);
1449 
1450 	ASSERT(xnfp->xnf_running);
1451 
1452 	/*
1453 	 * Wait until we are connected to the backend.
1454 	 */
1455 	while (!xnfp->xnf_connected)
1456 		cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
1457 
1458 	slots_free = tx_slots_get(xnfp, 1, B_FALSE);
1459 	DTRACE_PROBE1(xnf_send_slotsfree, int, slots_free);
1460 
1461 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1462 
1463 	while ((txp != NULL) && (slots_free > 0)) {
1464 		xnf_txid_t *tidp;
1465 		netif_tx_request_t *txrp;
1466 
1467 		tidp = txid_get(xnfp);
1468 		VERIFY(tidp != NULL);
1469 
1470 		txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1471 
1472 		txp->tx_slot = slot;
1473 		txp->tx_txreq.id = tidp->id;
1474 		*txrp = txp->tx_txreq;
1475 
1476 		tidp->txbuf = txp;
1477 
1478 		xnfp->xnf_stat_opackets++;
1479 		xnfp->xnf_stat_obytes += txp->tx_txreq.size;
1480 
1481 		txp = txp->tx_next;
1482 		slots_free--;
1483 		slot++;
1484 
1485 	}
1486 
1487 	xnfp->xnf_tx_ring.req_prod_pvt = slot;
1488 
1489 	/*
1490 	 * Tell the peer that we sent something, if it cares.
1491 	 */
1492 	/* LINTED: constant in conditional context */
1493 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1494 	    notify);
1495 	if (notify)
1496 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1497 
1498 	mutex_exit(&xnfp->xnf_txlock);
1499 
1500 	return (txp);
1501 }
1502 
1503 /*
1504  * Send the chain of packets `mp'. Called by the MAC framework.
1505  */
1506 static mblk_t *
1507 xnf_send(void *arg, mblk_t *mp)
1508 {
1509 	xnf_t *xnfp = arg;
1510 	domid_t oeid;
1511 	xnf_txbuf_t *head, *tail;
1512 	mblk_t *ml;
1513 	int prepared;
1514 
1515 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1516 
1517 	/*
1518 	 * Prepare packets for transmission.
1519 	 */
1520 	head = tail = NULL;
1521 	prepared = 0;
1522 	while (mp != NULL) {
1523 		xnf_txbuf_t *txp;
1524 		int n_chunks, length;
1525 		boolean_t page_oops;
1526 		uint32_t pflags;
1527 
1528 		for (ml = mp, n_chunks = length = 0, page_oops = B_FALSE;
1529 		    ml != NULL;
1530 		    ml = ml->b_cont, n_chunks++) {
1531 
1532 			/*
1533 			 * Test if this buffer includes a page
1534 			 * boundary. The test assumes that the range
1535 			 * b_rptr...b_wptr can include only a single
1536 			 * boundary.
1537 			 */
1538 			if (xnf_btop((size_t)ml->b_rptr) !=
1539 			    xnf_btop((size_t)ml->b_wptr)) {
1540 				xnfp->xnf_stat_tx_pagebndry++;
1541 				page_oops = B_TRUE;
1542 			}
1543 
1544 			length += MBLKL(ml);
1545 		}
1546 		DTRACE_PROBE1(xnf_send_b_cont, int, n_chunks);
1547 
1548 		/*
1549 		 * Make sure packet isn't too large.
1550 		 */
1551 		if (length > XNF_FRAMESIZE) {
1552 			cmn_err(CE_WARN,
1553 			    "xnf%d: oversized packet (%d bytes) dropped",
1554 			    ddi_get_instance(xnfp->xnf_devinfo), length);
1555 			freemsg(mp);
1556 			continue;
1557 		}
1558 
1559 		txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
1560 		if (txp == NULL)
1561 			break;
1562 
1563 		txp->tx_type = TX_DATA;
1564 
1565 		if ((n_chunks > xnf_max_tx_frags) || page_oops) {
1566 			/*
1567 			 * Loan a side buffer rather than the mblk
1568 			 * itself.
1569 			 */
1570 			txp->tx_bdesc = xnf_tx_pullup(xnfp, mp);
1571 			if (txp->tx_bdesc == NULL) {
1572 				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1573 				break;
1574 			}
1575 
1576 			txp->tx_bufp = txp->tx_bdesc->buf;
1577 			txp->tx_mfn = txp->tx_bdesc->buf_mfn;
1578 			txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
1579 
1580 		} else {
1581 			int rc;
1582 			ddi_dma_cookie_t dma_cookie;
1583 			uint_t ncookies;
1584 
1585 			rc = ddi_dma_addr_bind_handle(txp->tx_dma_handle,
1586 			    NULL, (char *)mp->b_rptr, length,
1587 			    DDI_DMA_WRITE | DDI_DMA_STREAMING,
1588 			    DDI_DMA_DONTWAIT, 0, &dma_cookie,
1589 			    &ncookies);
1590 			if (rc != DDI_DMA_MAPPED) {
1591 				ASSERT(rc != DDI_DMA_INUSE);
1592 				ASSERT(rc != DDI_DMA_PARTIAL_MAP);
1593 
1594 #ifdef XNF_DEBUG
1595 				if (rc != DDI_DMA_NORESOURCES)
1596 					cmn_err(CE_WARN,
1597 					    "xnf%d: bind_handle failed (%x)",
1598 					    ddi_get_instance(xnfp->xnf_devinfo),
1599 					    rc);
1600 #endif
1601 				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1602 				break;
1603 			}
1604 			ASSERT(ncookies == 1);
1605 
1606 			txp->tx_bdesc = NULL;
1607 			txp->tx_bufp = (caddr_t)mp->b_rptr;
1608 			txp->tx_mfn =
1609 			    xnf_btop(pa_to_ma(dma_cookie.dmac_laddress));
1610 			txp->tx_txreq.gref = gref_get(xnfp);
1611 			if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
1612 				(void) ddi_dma_unbind_handle(
1613 				    txp->tx_dma_handle);
1614 				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1615 				break;
1616 			}
1617 			gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
1618 			    oeid, txp->tx_mfn, 1);
1619 		}
1620 
1621 		txp->tx_next = NULL;
1622 		txp->tx_mp = mp;
1623 		txp->tx_txreq.size = length;
1624 		txp->tx_txreq.offset = (uintptr_t)txp->tx_bufp & PAGEOFFSET;
1625 		txp->tx_txreq.flags = 0;
1626 		hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL,
1627 		    &pflags);
1628 		if (pflags != 0) {
1629 			/*
1630 			 * If the local protocol stack requests checksum
1631 			 * offload we set the 'checksum blank' flag,
1632 			 * indicating to the peer that we need the checksum
1633 			 * calculated for us.
1634 			 *
1635 			 * We _don't_ set the validated flag, because we haven't
1636 			 * validated that the data and the checksum match.
1637 			 */
1638 			xnf_pseudo_cksum(txp->tx_bufp, length);
1639 			txp->tx_txreq.flags |= NETTXF_csum_blank;
1640 
1641 			xnfp->xnf_stat_tx_cksum_deferred++;
1642 		}
1643 
1644 		if (head == NULL) {
1645 			ASSERT(tail == NULL);
1646 
1647 			head = txp;
1648 		} else {
1649 			ASSERT(tail != NULL);
1650 
1651 			tail->tx_next = txp;
1652 		}
1653 		tail = txp;
1654 
1655 		mp = mp->b_next;
1656 		prepared++;
1657 
1658 		/*
1659 		 * There is no point in preparing more than
1660 		 * NET_TX_RING_SIZE, as we won't be able to push them
1661 		 * into the ring in one go and would hence have to
1662 		 * un-prepare the extra.
1663 		 */
1664 		if (prepared == NET_TX_RING_SIZE)
1665 			break;
1666 	}
1667 
1668 	DTRACE_PROBE1(xnf_send_prepared, int, prepared);
1669 
1670 	if (mp != NULL) {
1671 #ifdef XNF_DEBUG
1672 		int notprepared = 0;
1673 		mblk_t *l = mp;
1674 
1675 		while (l != NULL) {
1676 			notprepared++;
1677 			l = l->b_next;
1678 		}
1679 
1680 		DTRACE_PROBE1(xnf_send_notprepared, int, notprepared);
1681 #else /* !XNF_DEBUG */
1682 		DTRACE_PROBE1(xnf_send_notprepared, int, -1);
1683 #endif /* XNF_DEBUG */
1684 	}
1685 
1686 	/*
1687 	 * Push the packets we have prepared into the ring. They may
1688 	 * not all go.
1689 	 */
1690 	if (head != NULL)
1691 		head = tx_push_packets(xnfp, head);
1692 
1693 	/*
1694 	 * If some packets that we prepared were not sent, unprepare
1695 	 * them and add them back to the head of those we didn't
1696 	 * prepare.
1697 	 */
1698 	{
1699 		xnf_txbuf_t *loop;
1700 		mblk_t *mp_head, *mp_tail;
1701 		int unprepared = 0;
1702 
1703 		mp_head = mp_tail = NULL;
1704 		loop = head;
1705 
1706 		while (loop != NULL) {
1707 			xnf_txbuf_t *next = loop->tx_next;
1708 
1709 			if (loop->tx_bdesc == NULL) {
1710 				(void) gnttab_end_foreign_access_ref(
1711 				    loop->tx_txreq.gref, 1);
1712 				gref_put(xnfp, loop->tx_txreq.gref);
1713 				(void) ddi_dma_unbind_handle(
1714 				    loop->tx_dma_handle);
1715 			} else {
1716 				xnf_buf_put(xnfp, loop->tx_bdesc, B_TRUE);
1717 			}
1718 
1719 			ASSERT(loop->tx_mp != NULL);
1720 			if (mp_head == NULL)
1721 				mp_head = loop->tx_mp;
1722 			mp_tail = loop->tx_mp;
1723 
1724 			kmem_cache_free(xnfp->xnf_tx_buf_cache, loop);
1725 			loop = next;
1726 			unprepared++;
1727 		}
1728 
1729 		if (mp_tail == NULL) {
1730 			ASSERT(mp_head == NULL);
1731 		} else {
1732 			ASSERT(mp_head != NULL);
1733 
1734 			mp_tail->b_next = mp;
1735 			mp = mp_head;
1736 		}
1737 
1738 		DTRACE_PROBE1(xnf_send_unprepared, int, unprepared);
1739 	}
1740 
1741 	/*
1742 	 * If any mblks are left then we have deferred for some reason
1743 	 * and need to ask for a re-schedule later. This is typically
1744 	 * due to the ring filling.
1745 	 */
1746 	if (mp != NULL) {
1747 		mutex_enter(&xnfp->xnf_schedlock);
1748 		xnfp->xnf_need_sched = B_TRUE;
1749 		mutex_exit(&xnfp->xnf_schedlock);
1750 
1751 		xnfp->xnf_stat_tx_defer++;
1752 	}
1753 
1754 	return (mp);
1755 }
1756 
/*
 * Notification of RX packets. Currently no TX-complete interrupt is
 * used, as we clean the TX ring lazily.
 *
 * Collects received packets and passes them to the MAC layer, then, if
 * a transmit was previously deferred or a multicast request is
 * pending, cleans the transmit ring.
 *
 * Returns DDI_INTR_UNCLAIMED if the instance is not connected,
 * DDI_INTR_CLAIMED otherwise.
 */
static uint_t
xnf_intr(caddr_t arg)
{
	xnf_t *xnfp = (xnf_t *)arg;
	mblk_t *mp;
	boolean_t need_sched, clean_ring;

	mutex_enter(&xnfp->xnf_rxlock);

	/*
	 * Interrupts before we are connected are spurious.
	 */
	if (!xnfp->xnf_connected) {
		mutex_exit(&xnfp->xnf_rxlock);
		xnfp->xnf_stat_unclaimed_interrupts++;
		return (DDI_INTR_UNCLAIMED);
	}

	/*
	 * Receive side processing.
	 */
	do {
		/*
		 * Collect buffers from the ring.
		 */
		xnf_rx_collect(xnfp);

		/*
		 * Interrupt me when the next receive buffer is consumed.
		 */
		xnfp->xnf_rx_ring.sring->rsp_event =
		    xnfp->xnf_rx_ring.rsp_cons + 1;
		xen_mb();

		/*
		 * Responses may have arrived between collecting and
		 * setting rsp_event, hence the re-check below.
		 */
	} while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring));

	if (xnfp->xnf_rx_new_buffers_posted) {
		boolean_t notify;

		/*
		 * Indicate to the peer that we have re-filled the
		 * receive ring, if it cares.
		 */
		/* LINTED: constant in conditional context */
		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
		if (notify)
			ec_notify_via_evtchn(xnfp->xnf_evtchn);
		xnfp->xnf_rx_new_buffers_posted = B_FALSE;
	}

	/*
	 * Take ownership of the collected packets so that they can be
	 * passed up after xnf_rxlock has been dropped.
	 */
	mp = xnfp->xnf_rx_head;
	xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL;

	xnfp->xnf_stat_interrupts++;
	mutex_exit(&xnfp->xnf_rxlock);

	if (mp != NULL)
		mac_rx(xnfp->xnf_mh, NULL, mp);

	/*
	 * Transmit side processing.
	 *
	 * If a previous transmit attempt failed or we have pending
	 * multicast requests, clean the ring.
	 *
	 * If we previously stalled transmission and cleaning produces
	 * some free slots, tell upstream to attempt sending again.
	 *
	 * The odd style is to avoid acquiring xnf_txlock unless we
	 * will actually look inside the tx machinery.
	 */
	mutex_enter(&xnfp->xnf_schedlock);
	need_sched = xnfp->xnf_need_sched;
	clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0);
	mutex_exit(&xnfp->xnf_schedlock);

	if (clean_ring) {
		int free_slots;

		mutex_enter(&xnfp->xnf_txlock);
		/* Asking for zero slots with no waiting. */
		free_slots = tx_slots_get(xnfp, 0, B_FALSE);

		if (need_sched && (free_slots > 0)) {
			mutex_enter(&xnfp->xnf_schedlock);
			xnfp->xnf_need_sched = B_FALSE;
			mutex_exit(&xnfp->xnf_schedlock);

			mac_tx_update(xnfp->xnf_mh);
		}
		mutex_exit(&xnfp->xnf_txlock);
	}

	return (DDI_INTR_CLAIMED);
}
1855 
1856 /*
1857  *  xnf_start() -- start the board receiving and enable interrupts.
1858  */
1859 static int
1860 xnf_start(void *arg)
1861 {
1862 	xnf_t *xnfp = arg;
1863 
1864 #ifdef XNF_DEBUG
1865 	if (xnf_debug & XNF_DEBUG_TRACE)
1866 		printf("xnf%d start(0x%p)\n",
1867 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1868 #endif
1869 
1870 	mutex_enter(&xnfp->xnf_rxlock);
1871 	mutex_enter(&xnfp->xnf_txlock);
1872 
1873 	/* Accept packets from above. */
1874 	xnfp->xnf_running = B_TRUE;
1875 
1876 	mutex_exit(&xnfp->xnf_txlock);
1877 	mutex_exit(&xnfp->xnf_rxlock);
1878 
1879 	return (0);
1880 }
1881 
1882 /* xnf_stop() - disable hardware */
1883 static void
1884 xnf_stop(void *arg)
1885 {
1886 	xnf_t *xnfp = arg;
1887 
1888 #ifdef XNF_DEBUG
1889 	if (xnf_debug & XNF_DEBUG_TRACE)
1890 		printf("xnf%d stop(0x%p)\n",
1891 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1892 #endif
1893 
1894 	mutex_enter(&xnfp->xnf_rxlock);
1895 	mutex_enter(&xnfp->xnf_txlock);
1896 
1897 	xnfp->xnf_running = B_FALSE;
1898 
1899 	mutex_exit(&xnfp->xnf_txlock);
1900 	mutex_exit(&xnfp->xnf_rxlock);
1901 }
1902 
1903 /*
1904  * Hang buffer `bdesc' on the RX ring.
1905  */
1906 static void
1907 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
1908 {
1909 	netif_rx_request_t *reqp;
1910 	RING_IDX hang_ix;
1911 
1912 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
1913 
1914 	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
1915 	    xnfp->xnf_rx_ring.req_prod_pvt);
1916 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
1917 	ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL);
1918 
1919 	reqp->id = bdesc->id = hang_ix;
1920 	reqp->gref = bdesc->grant_ref;
1921 
1922 	xnfp->xnf_rx_pkt_info[hang_ix] = bdesc;
1923 	xnfp->xnf_rx_ring.req_prod_pvt++;
1924 
1925 	xnfp->xnf_rx_new_buffers_posted = B_TRUE;
1926 }
1927 
1928 /*
1929  * Collect packets from the RX ring, storing them in `xnfp' for later
1930  * use.
1931  */
1932 static void
1933 xnf_rx_collect(xnf_t *xnfp)
1934 {
1935 	mblk_t *head, *tail;
1936 
1937 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
1938 
1939 	/*
1940 	 * Loop over unconsumed responses:
1941 	 * 1. get a response
1942 	 * 2. take corresponding buffer off recv. ring
1943 	 * 3. indicate this by setting slot to NULL
1944 	 * 4. create a new message and
1945 	 * 5. copy data in, adjust ptr
1946 	 */
1947 
1948 	head = tail = NULL;
1949 
1950 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1951 		netif_rx_response_t *rxpkt;
1952 		xnf_buf_t *bdesc;
1953 		ssize_t len;
1954 		size_t off;
1955 		mblk_t *mp = NULL;
1956 		boolean_t hwcsum = B_FALSE;
1957 		grant_ref_t ref;
1958 
1959 		/* 1. */
1960 		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
1961 		    xnfp->xnf_rx_ring.rsp_cons);
1962 
1963 		DTRACE_PROBE4(xnf_rx_got_rsp, int, (int)rxpkt->id,
1964 		    int, (int)rxpkt->offset,
1965 		    int, (int)rxpkt->flags,
1966 		    int, (int)rxpkt->status);
1967 
1968 		/*
1969 		 * 2.
1970 		 */
1971 		bdesc = xnfp->xnf_rx_pkt_info[rxpkt->id];
1972 
1973 		/*
1974 		 * 3.
1975 		 */
1976 		xnfp->xnf_rx_pkt_info[rxpkt->id] = NULL;
1977 		ASSERT(bdesc->id == rxpkt->id);
1978 
1979 		ref = bdesc->grant_ref;
1980 		off = rxpkt->offset;
1981 		len = rxpkt->status;
1982 
1983 		if (!xnfp->xnf_running) {
1984 			DTRACE_PROBE4(xnf_rx_not_running,
1985 			    int, rxpkt->status,
1986 			    char *, bdesc->buf, int, rxpkt->offset,
1987 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1988 
1989 			xnfp->xnf_stat_drop++;
1990 
1991 		} else if (len <= 0) {
1992 			DTRACE_PROBE4(xnf_rx_pkt_status_negative,
1993 			    int, rxpkt->status,
1994 			    char *, bdesc->buf, int, rxpkt->offset,
1995 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1996 
1997 			xnfp->xnf_stat_errrx++;
1998 
1999 			switch (len) {
2000 			case 0:
2001 				xnfp->xnf_stat_runt++;
2002 				break;
2003 			case NETIF_RSP_ERROR:
2004 				xnfp->xnf_stat_mac_rcv_error++;
2005 				break;
2006 			case NETIF_RSP_DROPPED:
2007 				xnfp->xnf_stat_norxbuf++;
2008 				break;
2009 			}
2010 
2011 		} else if (bdesc->grant_ref == INVALID_GRANT_REF) {
2012 			cmn_err(CE_WARN, "Bad rx grant reference %d "
2013 			    "from domain %d", ref,
2014 			    xvdi_get_oeid(xnfp->xnf_devinfo));
2015 
2016 		} else if ((off + len) > PAGESIZE) {
2017 			cmn_err(CE_WARN, "Rx packet overflows page "
2018 			    "(offset %ld, length %ld) from domain %d",
2019 			    off, len, xvdi_get_oeid(xnfp->xnf_devinfo));
2020 		} else {
2021 			xnf_buf_t *nbuf = NULL;
2022 
2023 			DTRACE_PROBE4(xnf_rx_packet, int, len,
2024 			    char *, bdesc->buf, int, off,
2025 			    char *, ((char *)bdesc->buf) + off);
2026 
2027 			ASSERT(off + len <= PAGEOFFSET);
2028 
2029 			if (rxpkt->flags & NETRXF_data_validated)
2030 				hwcsum = B_TRUE;
2031 
2032 			/*
2033 			 * If the packet is below a pre-determined
2034 			 * size we will copy data out rather than
2035 			 * replace it.
2036 			 */
2037 			if (len > xnf_rx_copy_limit)
2038 				nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
2039 
2040 			/*
2041 			 * If we have a replacement buffer, attempt to
2042 			 * wrap the existing one with an mblk_t in
2043 			 * order that the upper layers of the stack
2044 			 * might use it directly.
2045 			 */
2046 			if (nbuf != NULL) {
2047 				mp = desballoc((unsigned char *)bdesc->buf,
2048 				    bdesc->len, 0, &bdesc->free_rtn);
2049 				if (mp == NULL) {
2050 					xnfp->xnf_stat_rx_desballoc_fail++;
2051 					xnfp->xnf_stat_norxbuf++;
2052 
2053 					xnf_buf_put(xnfp, nbuf, B_FALSE);
2054 					nbuf = NULL;
2055 				} else {
2056 					mp->b_rptr = mp->b_rptr + off;
2057 					mp->b_wptr = mp->b_rptr + len;
2058 
2059 					/*
2060 					 * Release the grant reference
2061 					 * associated with this buffer
2062 					 * - they are scarce and the
2063 					 * upper layers of the stack
2064 					 * don't need it.
2065 					 */
2066 					(void) gnttab_end_foreign_access_ref(
2067 					    bdesc->grant_ref, 0);
2068 					gref_put(xnfp, bdesc->grant_ref);
2069 					bdesc->grant_ref = INVALID_GRANT_REF;
2070 
2071 					bdesc = nbuf;
2072 				}
2073 			}
2074 
2075 			if (nbuf == NULL) {
2076 				/*
2077 				 * No replacement buffer allocated -
2078 				 * attempt to copy the data out and
2079 				 * re-hang the existing buffer.
2080 				 */
2081 
2082 				/* 4. */
2083 				mp = allocb(len, BPRI_MED);
2084 				if (mp == NULL) {
2085 					xnfp->xnf_stat_rx_allocb_fail++;
2086 					xnfp->xnf_stat_norxbuf++;
2087 				} else {
2088 					/* 5. */
2089 					bcopy(bdesc->buf + off, mp->b_wptr,
2090 					    len);
2091 					mp->b_wptr += len;
2092 				}
2093 			}
2094 		}
2095 
2096 		/* Re-hang the buffer. */
2097 		xnf_rxbuf_hang(xnfp, bdesc);
2098 
2099 		if (mp != NULL) {
2100 			if (hwcsum) {
2101 				/*
2102 				 * If the peer says that the data has
2103 				 * been validated then we declare that
2104 				 * the full checksum has been
2105 				 * verified.
2106 				 *
2107 				 * We don't look at the "checksum
2108 				 * blank" flag, and hence could have a
2109 				 * packet here that we are asserting
2110 				 * is good with a blank checksum.
2111 				 *
2112 				 * The hardware checksum offload
2113 				 * specification says that we must
2114 				 * provide the actual checksum as well
2115 				 * as an assertion that it is valid,
2116 				 * but the protocol stack doesn't
2117 				 * actually use it and some other
2118 				 * drivers don't bother, so we don't.
2119 				 * If it was necessary we could grovel
2120 				 * in the packet to find it.
2121 				 */
2122 				(void) hcksum_assoc(mp, NULL,
2123 				    NULL, 0, 0, 0, 0,
2124 				    HCK_FULLCKSUM |
2125 				    HCK_FULLCKSUM_OK, 0);
2126 				xnfp->xnf_stat_rx_cksum_no_need++;
2127 			}
2128 			if (head == NULL) {
2129 				ASSERT(tail == NULL);
2130 
2131 				head = mp;
2132 			} else {
2133 				ASSERT(tail != NULL);
2134 
2135 				tail->b_next = mp;
2136 			}
2137 			tail = mp;
2138 
2139 			ASSERT(mp->b_next == NULL);
2140 
2141 			xnfp->xnf_stat_ipackets++;
2142 			xnfp->xnf_stat_rbytes += len;
2143 		}
2144 
2145 		xnfp->xnf_rx_ring.rsp_cons++;
2146 	}
2147 
2148 	/*
2149 	 * Store the mblks we have collected.
2150 	 */
2151 	if (head != NULL) {
2152 		ASSERT(tail != NULL);
2153 
2154 		if (xnfp->xnf_rx_head == NULL) {
2155 			ASSERT(xnfp->xnf_rx_tail == NULL);
2156 
2157 			xnfp->xnf_rx_head = head;
2158 		} else {
2159 			ASSERT(xnfp->xnf_rx_tail != NULL);
2160 
2161 			xnfp->xnf_rx_tail->b_next = head;
2162 		}
2163 		xnfp->xnf_rx_tail = tail;
2164 	}
2165 }
2166 
2167 /*
2168  *  xnf_alloc_dma_resources() -- initialize the drivers structures
2169  */
2170 static int
2171 xnf_alloc_dma_resources(xnf_t *xnfp)
2172 {
2173 	dev_info_t 		*devinfo = xnfp->xnf_devinfo;
2174 	size_t			len;
2175 	ddi_dma_cookie_t	dma_cookie;
2176 	uint_t			ncookies;
2177 	int			rc;
2178 	caddr_t			rptr;
2179 
2180 	/*
2181 	 * The code below allocates all the DMA data structures that
2182 	 * need to be released when the driver is detached.
2183 	 *
2184 	 * Allocate page for the transmit descriptor ring.
2185 	 */
2186 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2187 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
2188 		goto alloc_error;
2189 
2190 	if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
2191 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2192 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2193 	    &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
2194 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2195 		xnfp->xnf_tx_ring_dma_handle = NULL;
2196 		goto alloc_error;
2197 	}
2198 
2199 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
2200 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2201 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2202 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2203 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2204 		xnfp->xnf_tx_ring_dma_handle = NULL;
2205 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2206 		if (rc == DDI_DMA_NORESOURCES)
2207 			goto alloc_error;
2208 		else
2209 			goto error;
2210 	}
2211 
2212 	ASSERT(ncookies == 1);
2213 	bzero(rptr, PAGESIZE);
2214 	/* LINTED: constant in conditional context */
2215 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
2216 	/* LINTED: constant in conditional context */
2217 	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
2218 	xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
2219 
2220 	/*
2221 	 * Allocate page for the receive descriptor ring.
2222 	 */
2223 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2224 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
2225 		goto alloc_error;
2226 
2227 	if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
2228 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2229 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2230 	    &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
2231 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2232 		xnfp->xnf_rx_ring_dma_handle = NULL;
2233 		goto alloc_error;
2234 	}
2235 
2236 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
2237 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2238 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2239 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2240 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2241 		xnfp->xnf_rx_ring_dma_handle = NULL;
2242 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2243 		if (rc == DDI_DMA_NORESOURCES)
2244 			goto alloc_error;
2245 		else
2246 			goto error;
2247 	}
2248 
2249 	ASSERT(ncookies == 1);
2250 	bzero(rptr, PAGESIZE);
2251 	/* LINTED: constant in conditional context */
2252 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
2253 	/* LINTED: constant in conditional context */
2254 	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
2255 	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
2256 
2257 	return (DDI_SUCCESS);
2258 
2259 alloc_error:
2260 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
2261 	    ddi_get_instance(xnfp->xnf_devinfo));
2262 error:
2263 	xnf_release_dma_resources(xnfp);
2264 	return (DDI_FAILURE);
2265 }
2266 
2267 /*
2268  * Release all DMA resources in the opposite order from acquisition
2269  */
2270 static void
2271 xnf_release_dma_resources(xnf_t *xnfp)
2272 {
2273 	int i;
2274 
2275 	/*
2276 	 * Free receive buffers which are currently associated with
2277 	 * descriptors.
2278 	 */
2279 	mutex_enter(&xnfp->xnf_rxlock);
2280 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
2281 		xnf_buf_t *bp;
2282 
2283 		if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL)
2284 			continue;
2285 		xnfp->xnf_rx_pkt_info[i] = NULL;
2286 		xnf_buf_put(xnfp, bp, B_FALSE);
2287 	}
2288 	mutex_exit(&xnfp->xnf_rxlock);
2289 
2290 	/* Free the receive ring buffer. */
2291 	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2292 		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2293 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2294 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2295 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2296 	}
2297 	/* Free the transmit ring buffer. */
2298 	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2299 		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2300 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2301 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2302 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2303 	}
2304 
2305 }
2306 
2307 /*
2308  * Release any packets and associated structures used by the TX ring.
2309  */
2310 static void
2311 xnf_release_mblks(xnf_t *xnfp)
2312 {
2313 	RING_IDX i;
2314 	xnf_txid_t *tidp;
2315 
2316 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
2317 	    i < NET_TX_RING_SIZE;
2318 	    i++, tidp++) {
2319 		xnf_txbuf_t *txp = tidp->txbuf;
2320 
2321 		if (txp != NULL) {
2322 			ASSERT(txp->tx_mp != NULL);
2323 			freemsg(txp->tx_mp);
2324 
2325 			txid_put(xnfp, tidp);
2326 			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
2327 		}
2328 	}
2329 }
2330 
2331 static int
2332 xnf_buf_constructor(void *buf, void *arg, int kmflag)
2333 {
2334 	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2335 	xnf_buf_t *bdesc = buf;
2336 	xnf_t *xnfp = arg;
2337 	ddi_dma_cookie_t dma_cookie;
2338 	uint_t ncookies;
2339 	size_t len;
2340 
2341 	if (kmflag & KM_NOSLEEP)
2342 		ddiflags = DDI_DMA_DONTWAIT;
2343 
2344 	/* Allocate a DMA access handle for the buffer. */
2345 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
2346 	    ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2347 		goto failure;
2348 
2349 	/* Allocate DMA-able memory for buffer. */
2350 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2351 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0,
2352 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2353 		goto failure_1;
2354 
2355 	/* Bind to virtual address of buffer to get physical address. */
2356 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2357 	    bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
2358 	    ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2359 		goto failure_2;
2360 	ASSERT(ncookies == 1);
2361 
2362 	bdesc->free_rtn.free_func = xnf_buf_recycle;
2363 	bdesc->free_rtn.free_arg = (caddr_t)bdesc;
2364 	bdesc->xnfp = xnfp;
2365 	bdesc->buf_phys = dma_cookie.dmac_laddress;
2366 	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2367 	bdesc->len = dma_cookie.dmac_size;
2368 	bdesc->grant_ref = INVALID_GRANT_REF;
2369 	bdesc->gen = xnfp->xnf_gen;
2370 
2371 	atomic_add_64(&xnfp->xnf_stat_buf_allocated, 1);
2372 
2373 	return (0);
2374 
2375 failure_2:
2376 	ddi_dma_mem_free(&bdesc->acc_handle);
2377 
2378 failure_1:
2379 	ddi_dma_free_handle(&bdesc->dma_handle);
2380 
2381 failure:
2382 
2383 	return (-1);
2384 }
2385 
2386 static void
2387 xnf_buf_destructor(void *buf, void *arg)
2388 {
2389 	xnf_buf_t *bdesc = buf;
2390 	xnf_t *xnfp = arg;
2391 
2392 	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
2393 	ddi_dma_mem_free(&bdesc->acc_handle);
2394 	ddi_dma_free_handle(&bdesc->dma_handle);
2395 
2396 	atomic_add_64(&xnfp->xnf_stat_buf_allocated, -1);
2397 }
2398 
2399 static xnf_buf_t *
2400 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
2401 {
2402 	grant_ref_t gref;
2403 	xnf_buf_t *bufp;
2404 
2405 	/*
2406 	 * Usually grant references are more scarce than memory, so we
2407 	 * attempt to acquire a grant reference first.
2408 	 */
2409 	gref = gref_get(xnfp);
2410 	if (gref == INVALID_GRANT_REF)
2411 		return (NULL);
2412 
2413 	bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
2414 	if (bufp == NULL) {
2415 		gref_put(xnfp, gref);
2416 		return (NULL);
2417 	}
2418 
2419 	ASSERT(bufp->grant_ref == INVALID_GRANT_REF);
2420 
2421 	bufp->grant_ref = gref;
2422 
2423 	if (bufp->gen != xnfp->xnf_gen)
2424 		xnf_buf_refresh(bufp);
2425 
2426 	gnttab_grant_foreign_access_ref(bufp->grant_ref,
2427 	    xvdi_get_oeid(bufp->xnfp->xnf_devinfo),
2428 	    bufp->buf_mfn, readonly ? 1 : 0);
2429 
2430 	atomic_add_64(&xnfp->xnf_stat_buf_outstanding, 1);
2431 
2432 	return (bufp);
2433 }
2434 
2435 static void
2436 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
2437 {
2438 	if (bufp->grant_ref != INVALID_GRANT_REF) {
2439 		(void) gnttab_end_foreign_access_ref(
2440 		    bufp->grant_ref, readonly ? 1 : 0);
2441 		gref_put(xnfp, bufp->grant_ref);
2442 		bufp->grant_ref = INVALID_GRANT_REF;
2443 	}
2444 
2445 	kmem_cache_free(xnfp->xnf_buf_cache, bufp);
2446 
2447 	atomic_add_64(&xnfp->xnf_stat_buf_outstanding, -1);
2448 }
2449 
/*
 * Refresh any cached data about a buffer after resume.
 *
 * Recomputes the buffer's machine frame number from its physical
 * address and then stamps the buffer with the owning instance's
 * current generation, so that a later comparison of the two generation
 * values sees the buffer as up to date.
 */
static void
xnf_buf_refresh(xnf_buf_t *bdesc)
{
	/* Re-derive the frame number first ... */
	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
	/* ... and only then mark the cached data as current. */
	bdesc->gen = bdesc->xnfp->xnf_gen;
}
2459 
2460 /*
2461  * Streams `freeb' routine for `xnf_buf_t' when used as transmit
2462  * look-aside buffers.
2463  */
2464 static void
2465 xnf_buf_recycle(xnf_buf_t *bdesc)
2466 {
2467 	xnf_t *xnfp = bdesc->xnfp;
2468 
2469 	xnf_buf_put(xnfp, bdesc, B_TRUE);
2470 }
2471 
2472 static int
2473 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
2474 {
2475 	_NOTE(ARGUNUSED(kmflag));
2476 	xnf_txbuf_t *txp = buf;
2477 	xnf_t *xnfp = arg;
2478 
2479 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
2480 	    0, 0, &txp->tx_dma_handle) != DDI_SUCCESS)
2481 		return (-1);
2482 
2483 	return (0);
2484 }
2485 
2486 static void
2487 xnf_tx_buf_destructor(void *buf, void *arg)
2488 {
2489 	_NOTE(ARGUNUSED(arg));
2490 	xnf_txbuf_t *txp = buf;
2491 
2492 	ddi_dma_free_handle(&txp->tx_dma_handle);
2493 }
2494 
/*
 * Statistics.
 *
 * Names of the driver's auxiliary named kstats.  xnf_kstat_init()
 * creates one kstat entry per name, in this order, and
 * xnf_kstat_aux_update() stores the values positionally, so the order
 * here must match the order of the assignments in that function.
 */
static char *xnf_aux_statistics[] = {
	"tx_cksum_deferred",
	"rx_cksum_no_need",
	"interrupts",
	"unclaimed_interrupts",
	"tx_pullup",
	"tx_pagebndry",
	"tx_attempt",
	"buf_allocated",
	"buf_outstanding",
	"gref_outstanding",
	"gref_failure",
	"gref_peak",
	"rx_allocb_fail",
	"rx_desballoc_fail",
};
2514 
2515 static int
2516 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2517 {
2518 	xnf_t *xnfp;
2519 	kstat_named_t *knp;
2520 
2521 	if (flag != KSTAT_READ)
2522 		return (EACCES);
2523 
2524 	xnfp = ksp->ks_private;
2525 	knp = ksp->ks_data;
2526 
2527 	/*
2528 	 * Assignment order must match that of the names in
2529 	 * xnf_aux_statistics.
2530 	 */
2531 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2532 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2533 
2534 	(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2535 	(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2536 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2537 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry;
2538 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt;
2539 
2540 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
2541 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
2542 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding;
2543 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_failure;
2544 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_peak;
2545 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail;
2546 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail;
2547 
2548 	return (0);
2549 }
2550 
2551 static boolean_t
2552 xnf_kstat_init(xnf_t *xnfp)
2553 {
2554 	int nstat = sizeof (xnf_aux_statistics) /
2555 	    sizeof (xnf_aux_statistics[0]);
2556 	char **cp = xnf_aux_statistics;
2557 	kstat_named_t *knp;
2558 
2559 	/*
2560 	 * Create and initialise kstats.
2561 	 */
2562 	if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2563 	    ddi_get_instance(xnfp->xnf_devinfo),
2564 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
2565 	    nstat, 0)) == NULL)
2566 		return (B_FALSE);
2567 
2568 	xnfp->xnf_kstat_aux->ks_private = xnfp;
2569 	xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2570 
2571 	knp = xnfp->xnf_kstat_aux->ks_data;
2572 	while (nstat > 0) {
2573 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2574 
2575 		knp++;
2576 		cp++;
2577 		nstat--;
2578 	}
2579 
2580 	kstat_install(xnfp->xnf_kstat_aux);
2581 
2582 	return (B_TRUE);
2583 }
2584 
2585 static int
2586 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2587 {
2588 	xnf_t *xnfp = arg;
2589 
2590 	mutex_enter(&xnfp->xnf_rxlock);
2591 	mutex_enter(&xnfp->xnf_txlock);
2592 
2593 #define	mac_stat(q, r)				\
2594 	case (MAC_STAT_##q):			\
2595 		*val = xnfp->xnf_stat_##r;	\
2596 		break
2597 
2598 #define	ether_stat(q, r)			\
2599 	case (ETHER_STAT_##q):			\
2600 		*val = xnfp->xnf_stat_##r;	\
2601 		break
2602 
2603 	switch (stat) {
2604 
2605 	mac_stat(IPACKETS, ipackets);
2606 	mac_stat(OPACKETS, opackets);
2607 	mac_stat(RBYTES, rbytes);
2608 	mac_stat(OBYTES, obytes);
2609 	mac_stat(NORCVBUF, norxbuf);
2610 	mac_stat(IERRORS, errrx);
2611 	mac_stat(NOXMTBUF, tx_defer);
2612 
2613 	ether_stat(MACRCV_ERRORS, mac_rcv_error);
2614 	ether_stat(TOOSHORT_ERRORS, runt);
2615 
2616 	/* always claim to be in full duplex mode */
2617 	case ETHER_STAT_LINK_DUPLEX:
2618 		*val = LINK_DUPLEX_FULL;
2619 		break;
2620 
2621 	/* always claim to be at 1Gb/s link speed */
2622 	case MAC_STAT_IFSPEED:
2623 		*val = 1000000000ull;
2624 		break;
2625 
2626 	default:
2627 		mutex_exit(&xnfp->xnf_txlock);
2628 		mutex_exit(&xnfp->xnf_rxlock);
2629 
2630 		return (ENOTSUP);
2631 	}
2632 
2633 #undef mac_stat
2634 #undef ether_stat
2635 
2636 	mutex_exit(&xnfp->xnf_txlock);
2637 	mutex_exit(&xnfp->xnf_rxlock);
2638 
2639 	return (0);
2640 }
2641 
2642 static boolean_t
2643 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
2644 {
2645 	_NOTE(ARGUNUSED(arg));
2646 
2647 	switch (cap) {
2648 	case MAC_CAPAB_HCKSUM: {
2649 		uint32_t *capab = cap_data;
2650 
2651 		/*
2652 		 * Whilst the flag used to communicate with the IO
2653 		 * domain is called "NETTXF_csum_blank", the checksum
2654 		 * in the packet must contain the pseudo-header
2655 		 * checksum and not zero.
2656 		 *
2657 		 * To help out the IO domain, we might use
2658 		 * HCKSUM_INET_PARTIAL. Unfortunately our stack will
2659 		 * then use checksum offload for IPv6 packets, which
2660 		 * the IO domain can't handle.
2661 		 *
2662 		 * As a result, we declare outselves capable of
2663 		 * HCKSUM_INET_FULL_V4. This means that we receive
2664 		 * IPv4 packets from the stack with a blank checksum
2665 		 * field and must insert the pseudo-header checksum
2666 		 * before passing the packet to the IO domain.
2667 		 */
2668 		*capab = HCKSUM_INET_FULL_V4;
2669 		break;
2670 	}
2671 	default:
2672 		return (B_FALSE);
2673 	}
2674 
2675 	return (B_TRUE);
2676 }
2677 
2678 /*
2679  * The state of the peer has changed - react accordingly.
2680  */
2681 static void
2682 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
2683     void *arg, void *impl_data)
2684 {
2685 	_NOTE(ARGUNUSED(id, arg));
2686 	xnf_t *xnfp = ddi_get_driver_private(dip);
2687 	XenbusState new_state = *(XenbusState *)impl_data;
2688 
2689 	ASSERT(xnfp != NULL);
2690 
2691 	switch (new_state) {
2692 	case XenbusStateUnknown:
2693 	case XenbusStateInitialising:
2694 	case XenbusStateInitialised:
2695 	case XenbusStateClosing:
2696 	case XenbusStateClosed:
2697 	case XenbusStateReconfiguring:
2698 	case XenbusStateReconfigured:
2699 		break;
2700 
2701 	case XenbusStateInitWait:
2702 		xnf_read_config(xnfp);
2703 
2704 		if (!xnfp->xnf_be_rx_copy) {
2705 			cmn_err(CE_WARN,
2706 			    "The xnf driver requires a dom0 that "
2707 			    "supports 'feature-rx-copy'.");
2708 			(void) xvdi_switch_state(xnfp->xnf_devinfo,
2709 			    XBT_NULL, XenbusStateClosed);
2710 			break;
2711 		}
2712 
2713 		/*
2714 		 * Connect to the backend.
2715 		 */
2716 		xnf_be_connect(xnfp);
2717 
2718 		/*
2719 		 * Our MAC address as discovered by xnf_read_config().
2720 		 */
2721 		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
2722 
2723 		break;
2724 
2725 	case XenbusStateConnected:
2726 		mutex_enter(&xnfp->xnf_rxlock);
2727 		mutex_enter(&xnfp->xnf_txlock);
2728 
2729 		xnfp->xnf_connected = B_TRUE;
2730 		/*
2731 		 * Wake up any threads waiting to send data to
2732 		 * backend.
2733 		 */
2734 		cv_broadcast(&xnfp->xnf_cv_state);
2735 
2736 		mutex_exit(&xnfp->xnf_txlock);
2737 		mutex_exit(&xnfp->xnf_rxlock);
2738 
2739 		/*
2740 		 * Kick the peer in case it missed any transmits
2741 		 * request in the TX ring.
2742 		 */
2743 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
2744 
2745 		/*
2746 		 * There may already be completed receive requests in
2747 		 * the ring sent by backend after it gets connected
2748 		 * but before we see its state change here, so we call
2749 		 * xnf_intr() to handle them, if any.
2750 		 */
2751 		(void) xnf_intr((caddr_t)xnfp);
2752 
2753 		/*
2754 		 * Mark the link up now that we are connected.
2755 		 */
2756 		mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
2757 
2758 		/*
2759 		 * Tell the backend about the multicast addresses in
2760 		 * which we are interested.
2761 		 */
2762 		mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
2763 
2764 		break;
2765 
2766 	default:
2767 		break;
2768 	}
2769 }
2770