xref: /titanic_50/usr/src/uts/common/xen/io/xnf.c (revision 89621fe174cf95ae903df6ceab605bf24d696ac3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  *
29  * Copyright (c) 2004 Christian Limpach.
30  * All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  * 1. Redistributions of source code must retain the above copyright
36  *    notice, this list of conditions and the following disclaimer.
37  * 2. Redistributions in binary form must reproduce the above copyright
38  *    notice, this list of conditions and the following disclaimer in the
39  *    documentation and/or other materials provided with the distribution.
40  * 3. This section intentionally left blank.
41  * 4. The name of the author may not be used to endorse or promote products
42  *    derived from this software without specific prior written permission.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 /*
56  * Section 3 of the above license was updated in response to bug 6379571.
57  */
58 
59 /*
60  * xnf.c - GLDv3 network driver for domU.
61  */
62 
63 /*
64  * This driver uses four per-instance locks:
65  *
66  * xnf_gref_lock:
67  *
68  *    Protects access to the grant reference list stored in
69  *    xnf_gref_head. Grant references should be acquired and released
70  *    using gref_get() and gref_put() respectively.
71  *
72  * xnf_schedlock:
73  *
74  *    Protects:
75  *    xnf_need_sched - used to record that a previous transmit attempt
76  *       failed (and consequently it will be necessary to call
77  *       mac_tx_update() when transmit resources are available).
78  *    xnf_pending_multicast - the number of multicast requests that
79  *       have been submitted to the backend for which we have not
80  *       processed responses.
81  *
82  * xnf_txlock:
83  *
84  *    Protects the transmit ring (xnf_tx_ring) and associated
85  *    structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head).
86  *
87  * xnf_rxlock:
88  *
89  *    Protects the receive ring (xnf_rx_ring) and associated
90  *    structures (notably xnf_rx_pkt_info).
91  *
92  * If driver-global state that affects both the transmit and receive
93  * rings is manipulated, both xnf_txlock and xnf_rxlock should be
94  * held, in that order.
95  *
96  * xnf_schedlock is acquired both whilst holding xnf_txlock and
97  * without. It should always be acquired after xnf_txlock if both are
98  * held.
99  *
100  * Notes:
101  * - atomic_add_64() is used to manipulate counters where we require
102  *   accuracy. For counters intended only for observation by humans,
103  *   post increment/decrement are used instead.
104  */
105 
106 #include <sys/types.h>
107 #include <sys/errno.h>
108 #include <sys/param.h>
109 #include <sys/sysmacros.h>
110 #include <sys/systm.h>
111 #include <sys/stream.h>
112 #include <sys/strsubr.h>
113 #include <sys/strsun.h>
114 #include <sys/conf.h>
115 #include <sys/ddi.h>
116 #include <sys/devops.h>
117 #include <sys/sunddi.h>
118 #include <sys/sunndi.h>
119 #include <sys/dlpi.h>
120 #include <sys/ethernet.h>
121 #include <sys/strsun.h>
122 #include <sys/pattr.h>
123 #include <inet/ip.h>
124 #include <inet/ip_impl.h>
125 #include <sys/gld.h>
126 #include <sys/modctl.h>
127 #include <sys/mac_provider.h>
128 #include <sys/mac_ether.h>
129 #include <sys/bootinfo.h>
130 #include <sys/mach_mmu.h>
131 #ifdef	XPV_HVM_DRIVER
132 #include <sys/xpv_support.h>
133 #include <sys/hypervisor.h>
134 #else
135 #include <sys/hypervisor.h>
136 #include <sys/evtchn_impl.h>
137 #include <sys/balloon_impl.h>
138 #endif
139 #include <xen/public/io/netif.h>
140 #include <sys/gnttab.h>
141 #include <xen/sys/xendev.h>
142 #include <sys/sdt.h>
143 #include <sys/note.h>
144 #include <sys/debug.h>
145 
146 #include <io/xnf.h>
147 
148 #if defined(DEBUG) || defined(__lint)
149 #define	XNF_DEBUG
150 #endif
151 
152 #ifdef XNF_DEBUG
/*
 * Bitmask of XNF_DEBUG_* flags; xnf_attach() and xnf_detach() test
 * XNF_DEBUG_DDI in it before printing trace messages.
 */
int xnf_debug = 0;
/* Set by xnf_attach() to the first instance that attaches successfully. */
xnf_t *xnf_debug_instance = NULL;
155 #endif
156 
/*
 * On a 32 bit PAE system physical and machine addresses are larger
 * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 * argument, and so addresses above 4G are truncated before ddi_btop()
 * gets to see them.  To avoid this, code the shift operation here.
 */
#define	xnf_btop(addr)	((addr) >> PAGESHIFT)
164 
/*
 * NOTE(review): not referenced in the part of the file reviewed here;
 * presumably the maximum number of fragments accepted for a single
 * transmitted packet -- confirm against the transmit path.
 */
unsigned int	xnf_max_tx_frags = 1;
166 
/*
 * Should we use the multicast control feature if the backend provides
 * it?  xnf_read_config() combines this with the backend's advertised
 * "feature-multicast-control" key; both must be true for the feature
 * to be enabled on an instance.
 */
boolean_t xnf_multicast_control = B_TRUE;
172 
/*
 * Received packets below this size are copied to a new streams buffer
 * rather than being desballoc'ed.
 *
 * This value is chosen to accommodate traffic where there are a large
 * number of small packets. For data showing a typical distribution,
 * see:
 *
 * Sinha07a:
 *	Rishi Sinha, Christos Papadopoulos, and John
 *	Heidemann. Internet Packet Size Distributions: Some
 *	Observations. Technical Report ISI-TR-2007-643,
 *	USC/Information Sciences Institute, May, 2007. Originally
 *	released October 2005 as web page
 *	http://netweb.usc.edu/~sinha/pkt-sizes/.
 *	<http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
 */
size_t xnf_rx_copy_limit = 64;
191 
/* Sentinels: all-ones values standing for "no handle / reference / id". */
#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define	INVALID_GRANT_REF	((grant_ref_t)-1)
#define	INVALID_TX_ID		((uint16_t)-1)

/* Address of the xnf_tx_pkt_id[] entry of instance `p' selected by `id'. */
#define	TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
/* True when `i' is not the sentinel and is below NET_TX_RING_SIZE. */
#define	TX_ID_VALID(i) (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
198 
199 /* Required system entry points */
200 static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
201 static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);
202 
203 /* Required driver entry points for Nemo */
204 static int	xnf_start(void *);
205 static void	xnf_stop(void *);
206 static int	xnf_set_mac_addr(void *, const uint8_t *);
207 static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
208 static int	xnf_set_promiscuous(void *, boolean_t);
209 static mblk_t	*xnf_send(void *, mblk_t *);
210 static uint_t	xnf_intr(caddr_t);
211 static int	xnf_stat(void *, uint_t, uint64_t *);
212 static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
213 
214 /* Driver private functions */
215 static int xnf_alloc_dma_resources(xnf_t *);
216 static void xnf_release_dma_resources(xnf_t *);
217 static void xnf_release_mblks(xnf_t *);
218 
219 static int xnf_buf_constructor(void *, void *, int);
220 static void xnf_buf_destructor(void *, void *);
221 static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
222 #pragma inline(xnf_buf_get)
223 static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
224 #pragma inline(xnf_buf_put)
225 static void xnf_buf_refresh(xnf_buf_t *);
226 #pragma inline(xnf_buf_refresh)
227 static void xnf_buf_recycle(xnf_buf_t *);
228 
229 static int xnf_tx_buf_constructor(void *, void *, int);
230 static void xnf_tx_buf_destructor(void *, void *);
231 
232 static grant_ref_t gref_get(xnf_t *);
233 #pragma inline(gref_get)
234 static void gref_put(xnf_t *, grant_ref_t);
235 #pragma inline(gref_put)
236 
237 static xnf_txid_t *txid_get(xnf_t *);
238 #pragma inline(txid_get)
239 static void txid_put(xnf_t *, xnf_txid_t *);
240 #pragma inline(txid_put)
241 
242 void xnf_send_driver_status(int, int);
243 static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
244 static int xnf_tx_clean_ring(xnf_t  *);
245 static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
246     void *, void *);
247 static boolean_t xnf_kstat_init(xnf_t *);
248 static void xnf_rx_collect(xnf_t *);
249 
/*
 * Callback vector passed to the MAC framework through m_callbacks in
 * xnf_attach().  The first member is a flags word; MC_GETCAPAB presumably
 * announces that the optional xnf_getcapab entry at the end is supplied.
 * NOTE(review): the two NULL slots are callbacks this driver does not
 * provide -- confirm which ones against the mac_callbacks_t definition.
 */
static mac_callbacks_t xnf_callbacks = {
	MC_GETCAPAB,
	xnf_stat,
	xnf_start,
	xnf_stop,
	xnf_set_promiscuous,
	xnf_set_multicast,
	xnf_set_mac_addr,
	xnf_send,
	NULL,
	NULL,
	xnf_getcapab
};
263 
/*
 * DMA attributes for network ring buffer: any address is acceptable, the
 * memory must be MMU_PAGESIZE aligned and must bind to a single segment.
 */
static ddi_dma_attr_t ringbuf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
279 
/*
 * DMA attributes for transmit and receive data: any address is acceptable,
 * the memory must be MMU_PAGESIZE aligned and must bind to a single segment.
 */
static ddi_dma_attr_t buf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
295 
/*
 * DMA access attributes for registers and descriptors: little-endian
 * structure access with strict ordering.
 */
static ddi_device_acc_attr_t accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
	DDI_STRICTORDER_ACC
};
302 
/*
 * DMA access attributes for data: NOT to be byte swapped, accessed with
 * strict ordering.
 */
static ddi_device_acc_attr_t data_accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};
309 
/*
 * Defines xnf_dev_ops, the device operations vector referenced by
 * xnf_modldrv and by _init().  xnf_attach() and xnf_detach() are the only
 * driver-specific entry points supplied; quiesce is declared unsupported.
 */
DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
    nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);
312 
/* Driver linkage: module type, description string and the dev_ops vector. */
static struct modldrv xnf_modldrv = {
	&mod_driverops,
	"Virtual Ethernet driver",
	&xnf_dev_ops
};
318 
/* Module linkage handed to mod_install() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, &xnf_modldrv, NULL
};
322 
323 int
324 _init(void)
325 {
326 	int r;
327 
328 	mac_init_ops(&xnf_dev_ops, "xnf");
329 	r = mod_install(&modlinkage);
330 	if (r != DDI_SUCCESS)
331 		mac_fini_ops(&xnf_dev_ops);
332 
333 	return (r);
334 }
335 
/*
 * Module unload entry point.  Unconditionally refuses by returning EBUSY;
 * no teardown is attempted.
 */
int
_fini(void)
{
	return (EBUSY); /* XXPV should be removable */
}
341 
342 int
343 _info(struct modinfo *modinfop)
344 {
345 	return (mod_info(&modlinkage, modinfop));
346 }
347 
348 /*
349  * Acquire a grant reference.
350  */
351 static grant_ref_t
352 gref_get(xnf_t *xnfp)
353 {
354 	grant_ref_t gref;
355 
356 	mutex_enter(&xnfp->xnf_gref_lock);
357 
358 	do {
359 		gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head);
360 
361 	} while ((gref == INVALID_GRANT_REF) &&
362 	    (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0));
363 
364 	mutex_exit(&xnfp->xnf_gref_lock);
365 
366 	if (gref == INVALID_GRANT_REF) {
367 		xnfp->xnf_stat_gref_failure++;
368 	} else {
369 		atomic_inc_64(&xnfp->xnf_stat_gref_outstanding);
370 		if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak)
371 			xnfp->xnf_stat_gref_peak =
372 			    xnfp->xnf_stat_gref_outstanding;
373 	}
374 
375 	return (gref);
376 }
377 
378 /*
379  * Release a grant reference.
380  */
381 static void
382 gref_put(xnf_t *xnfp, grant_ref_t gref)
383 {
384 	ASSERT(gref != INVALID_GRANT_REF);
385 
386 	mutex_enter(&xnfp->xnf_gref_lock);
387 	gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref);
388 	mutex_exit(&xnfp->xnf_gref_lock);
389 
390 	atomic_dec_64(&xnfp->xnf_stat_gref_outstanding);
391 }
392 
393 /*
394  * Acquire a transmit id.
395  */
396 static xnf_txid_t *
397 txid_get(xnf_t *xnfp)
398 {
399 	xnf_txid_t *tidp;
400 
401 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
402 
403 	if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID)
404 		return (NULL);
405 
406 	ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head));
407 
408 	tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head);
409 	xnfp->xnf_tx_pkt_id_head = tidp->next;
410 	tidp->next = INVALID_TX_ID;
411 
412 	ASSERT(tidp->txbuf == NULL);
413 
414 	return (tidp);
415 }
416 
417 /*
418  * Release a transmit id.
419  */
420 static void
421 txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
422 {
423 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
424 	ASSERT(TX_ID_VALID(tidp->id));
425 	ASSERT(tidp->next == INVALID_TX_ID);
426 
427 	tidp->txbuf = NULL;
428 	tidp->next = xnfp->xnf_tx_pkt_id_head;
429 	xnfp->xnf_tx_pkt_id_head = tidp->id;
430 }
431 
432 /*
433  * Get `wanted' slots in the transmit ring, waiting for at least that
434  * number if `wait' is B_TRUE. Force the ring to be cleaned by setting
435  * `wanted' to zero.
436  *
437  * Return the number of slots available.
438  */
439 static int
440 tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
441 {
442 	int slotsfree;
443 	boolean_t forced_clean = (wanted == 0);
444 
445 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
446 
447 	/* LINTED: constant in conditional context */
448 	while (B_TRUE) {
449 		slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring);
450 
451 		if ((slotsfree < wanted) || forced_clean)
452 			slotsfree = xnf_tx_clean_ring(xnfp);
453 
454 		/*
455 		 * If there are more than we need free, tell other
456 		 * people to come looking again. We hold txlock, so we
457 		 * are able to take our slots before anyone else runs.
458 		 */
459 		if (slotsfree > wanted)
460 			cv_broadcast(&xnfp->xnf_cv_tx_slots);
461 
462 		if (slotsfree >= wanted)
463 			break;
464 
465 		if (!wait)
466 			break;
467 
468 		cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock);
469 	}
470 
471 	ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring)));
472 
473 	return (slotsfree);
474 }
475 
476 static int
477 xnf_setup_rings(xnf_t *xnfp)
478 {
479 	domid_t			oeid;
480 	struct xenbus_device	*xsd;
481 	RING_IDX		i;
482 	int			err;
483 	xnf_txid_t		*tidp;
484 	xnf_buf_t **bdescp;
485 
486 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
487 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
488 
489 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
490 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
491 
492 	err = gnttab_grant_foreign_access(oeid,
493 	    xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
494 	if (err <= 0) {
495 		err = -err;
496 		xenbus_dev_error(xsd, err, "granting access to tx ring page");
497 		goto out;
498 	}
499 	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
500 
501 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
502 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
503 
504 	err = gnttab_grant_foreign_access(oeid,
505 	    xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
506 	if (err <= 0) {
507 		err = -err;
508 		xenbus_dev_error(xsd, err, "granting access to rx ring page");
509 		goto out;
510 	}
511 	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
512 
513 	mutex_enter(&xnfp->xnf_txlock);
514 
515 	/*
516 	 * Setup/cleanup the TX ring.  Note that this can lose packets
517 	 * after a resume, but we expect to stagger on.
518 	 */
519 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. */
520 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
521 	    i < NET_TX_RING_SIZE;
522 	    i++, tidp++) {
523 		xnf_txbuf_t *txp;
524 
525 		tidp->id = i;
526 
527 		txp = tidp->txbuf;
528 		if (txp == NULL) {
529 			tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
530 			txid_put(xnfp, tidp);
531 			continue;
532 		}
533 
534 		ASSERT(txp->tx_txreq.gref != INVALID_GRANT_REF);
535 		ASSERT(txp->tx_mp != NULL);
536 
537 		switch (txp->tx_type) {
538 		case TX_DATA:
539 			VERIFY(gnttab_query_foreign_access(txp->tx_txreq.gref)
540 			    == 0);
541 
542 			if (txp->tx_bdesc == NULL) {
543 				(void) gnttab_end_foreign_access_ref(
544 				    txp->tx_txreq.gref, 1);
545 				gref_put(xnfp, txp->tx_txreq.gref);
546 				(void) ddi_dma_unbind_handle(
547 				    txp->tx_dma_handle);
548 			} else {
549 				xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
550 			}
551 
552 			freemsg(txp->tx_mp);
553 			txid_put(xnfp, tidp);
554 			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
555 
556 			break;
557 
558 		case TX_MCAST_REQ:
559 			txp->tx_type = TX_MCAST_RSP;
560 			txp->tx_status = NETIF_RSP_DROPPED;
561 			cv_broadcast(&xnfp->xnf_cv_multicast);
562 
563 			/*
564 			 * The request consumed two slots in the ring,
565 			 * yet only a single xnf_txid_t is used. Step
566 			 * over the empty slot.
567 			 */
568 			i++;
569 			ASSERT(i < NET_TX_RING_SIZE);
570 
571 			break;
572 
573 		case TX_MCAST_RSP:
574 			break;
575 		}
576 	}
577 
578 	/* LINTED: constant in conditional context */
579 	SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
580 	/* LINTED: constant in conditional context */
581 	FRONT_RING_INIT(&xnfp->xnf_tx_ring,
582 	    xnfp->xnf_tx_ring.sring, PAGESIZE);
583 
584 	mutex_exit(&xnfp->xnf_txlock);
585 
586 	mutex_enter(&xnfp->xnf_rxlock);
587 
588 	/*
589 	 * Clean out any buffers currently posted to the receive ring
590 	 * before we reset it.
591 	 */
592 	for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0];
593 	    i < NET_RX_RING_SIZE;
594 	    i++, bdescp++) {
595 		if (*bdescp != NULL) {
596 			xnf_buf_put(xnfp, *bdescp, B_FALSE);
597 			*bdescp = NULL;
598 		}
599 	}
600 
601 	/* LINTED: constant in conditional context */
602 	SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
603 	/* LINTED: constant in conditional context */
604 	FRONT_RING_INIT(&xnfp->xnf_rx_ring,
605 	    xnfp->xnf_rx_ring.sring, PAGESIZE);
606 
607 	/*
608 	 * Fill the ring with buffers.
609 	 */
610 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
611 		xnf_buf_t *bdesc;
612 
613 		bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE);
614 		VERIFY(bdesc != NULL);
615 		xnf_rxbuf_hang(xnfp, bdesc);
616 	}
617 
618 	/* LINTED: constant in conditional context */
619 	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
620 
621 	mutex_exit(&xnfp->xnf_rxlock);
622 
623 	return (0);
624 
625 out:
626 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
627 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
628 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
629 
630 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
631 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
632 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
633 
634 	return (err);
635 }
636 
637 /*
638  * Connect driver to back end, called to set up communication with
639  * back end driver both initially and on resume after restore/migrate.
640  */
641 void
642 xnf_be_connect(xnf_t *xnfp)
643 {
644 	const char	*message;
645 	xenbus_transaction_t xbt;
646 	struct		xenbus_device *xsd;
647 	char		*xsname;
648 	int		err;
649 
650 	ASSERT(!xnfp->xnf_connected);
651 
652 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
653 	xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
654 
655 	err = xnf_setup_rings(xnfp);
656 	if (err != 0) {
657 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
658 		xenbus_dev_error(xsd, err, "setting up ring");
659 		return;
660 	}
661 
662 again:
663 	err = xenbus_transaction_start(&xbt);
664 	if (err != 0) {
665 		xenbus_dev_error(xsd, EIO, "starting transaction");
666 		return;
667 	}
668 
669 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
670 	    xnfp->xnf_tx_ring_ref);
671 	if (err != 0) {
672 		message = "writing tx ring-ref";
673 		goto abort_transaction;
674 	}
675 
676 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
677 	    xnfp->xnf_rx_ring_ref);
678 	if (err != 0) {
679 		message = "writing rx ring-ref";
680 		goto abort_transaction;
681 	}
682 
683 	err = xenbus_printf(xbt, xsname, "event-channel", "%u",
684 	    xnfp->xnf_evtchn);
685 	if (err != 0) {
686 		message = "writing event-channel";
687 		goto abort_transaction;
688 	}
689 
690 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
691 	if (err != 0) {
692 		message = "writing feature-rx-notify";
693 		goto abort_transaction;
694 	}
695 
696 	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1);
697 	if (err != 0) {
698 		message = "writing request-rx-copy";
699 		goto abort_transaction;
700 	}
701 
702 	if (xnfp->xnf_be_mcast_control) {
703 		err = xenbus_printf(xbt, xsname, "request-multicast-control",
704 		    "%d", 1);
705 		if (err != 0) {
706 			message = "writing request-multicast-control";
707 			goto abort_transaction;
708 		}
709 	}
710 
711 	err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
712 	if (err != 0) {
713 		message = "switching state to XenbusStateConnected";
714 		goto abort_transaction;
715 	}
716 
717 	err = xenbus_transaction_end(xbt, 0);
718 	if (err != 0) {
719 		if (err == EAGAIN)
720 			goto again;
721 		xenbus_dev_error(xsd, err, "completing transaction");
722 	}
723 
724 	return;
725 
726 abort_transaction:
727 	(void) xenbus_transaction_end(xbt, 1);
728 	xenbus_dev_error(xsd, err, "%s", message);
729 }
730 
731 /*
732  * Read configuration information from xenstore.
733  */
734 void
735 xnf_read_config(xnf_t *xnfp)
736 {
737 	int err, be_cap;
738 	char mac[ETHERADDRL * 3];
739 	char *oename = xvdi_get_oename(xnfp->xnf_devinfo);
740 
741 	err = xenbus_scanf(XBT_NULL, oename, "mac",
742 	    "%s", (char *)&mac[0]);
743 	if (err != 0) {
744 		/*
745 		 * bad: we're supposed to be set up with a proper mac
746 		 * addr. at this point
747 		 */
748 		cmn_err(CE_WARN, "%s%d: no mac address",
749 		    ddi_driver_name(xnfp->xnf_devinfo),
750 		    ddi_get_instance(xnfp->xnf_devinfo));
751 			return;
752 	}
753 	if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
754 		err = ENOENT;
755 		xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
756 		    "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
757 		return;
758 	}
759 
760 	err = xenbus_scanf(XBT_NULL, oename,
761 	    "feature-rx-copy", "%d", &be_cap);
762 	/*
763 	 * If we fail to read the store we assume that the key is
764 	 * absent, implying an older domain at the far end.  Older
765 	 * domains cannot do HV copy.
766 	 */
767 	if (err != 0)
768 		be_cap = 0;
769 	xnfp->xnf_be_rx_copy = (be_cap != 0);
770 
771 	err = xenbus_scanf(XBT_NULL, oename,
772 	    "feature-multicast-control", "%d", &be_cap);
773 	/*
774 	 * If we fail to read the store we assume that the key is
775 	 * absent, implying an older domain at the far end.  Older
776 	 * domains do not support multicast control.
777 	 */
778 	if (err != 0)
779 		be_cap = 0;
780 	xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
781 }
782 
783 /*
784  *  attach(9E) -- Attach a device to the system
785  */
786 static int
787 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
788 {
789 	mac_register_t *macp;
790 	xnf_t *xnfp;
791 	int err;
792 	char cachename[32];
793 
794 #ifdef XNF_DEBUG
795 	if (xnf_debug & XNF_DEBUG_DDI)
796 		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
797 		    (void *)devinfo);
798 #endif
799 
800 	switch (cmd) {
801 	case DDI_RESUME:
802 		xnfp = ddi_get_driver_private(devinfo);
803 		xnfp->xnf_gen++;
804 
805 		(void) xvdi_resume(devinfo);
806 		(void) xvdi_alloc_evtchn(devinfo);
807 		xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
808 #ifdef XPV_HVM_DRIVER
809 		ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
810 		    xnfp);
811 #else
812 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
813 		    (caddr_t)xnfp);
814 #endif
815 		return (DDI_SUCCESS);
816 
817 	case DDI_ATTACH:
818 		break;
819 
820 	default:
821 		return (DDI_FAILURE);
822 	}
823 
824 	/*
825 	 *  Allocate gld_mac_info_t and xnf_instance structures
826 	 */
827 	macp = mac_alloc(MAC_VERSION);
828 	if (macp == NULL)
829 		return (DDI_FAILURE);
830 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
831 
832 	macp->m_dip = devinfo;
833 	macp->m_driver = xnfp;
834 	xnfp->xnf_devinfo = devinfo;
835 
836 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
837 	macp->m_src_addr = xnfp->xnf_mac_addr;
838 	macp->m_callbacks = &xnf_callbacks;
839 	macp->m_min_sdu = 0;
840 	macp->m_max_sdu = XNF_MAXPKT;
841 
842 	xnfp->xnf_running = B_FALSE;
843 	xnfp->xnf_connected = B_FALSE;
844 	xnfp->xnf_be_rx_copy = B_FALSE;
845 	xnfp->xnf_be_mcast_control = B_FALSE;
846 	xnfp->xnf_need_sched = B_FALSE;
847 
848 	xnfp->xnf_rx_head = NULL;
849 	xnfp->xnf_rx_tail = NULL;
850 	xnfp->xnf_rx_new_buffers_posted = B_FALSE;
851 
852 #ifdef XPV_HVM_DRIVER
853 	/*
854 	 * Report our version to dom0.
855 	 */
856 	if (xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
857 	    HVMPV_XNF_VERS))
858 		cmn_err(CE_WARN, "xnf: couldn't write version\n");
859 #endif
860 
861 	/*
862 	 * Get the iblock cookie with which to initialize the mutexes.
863 	 */
864 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
865 	    != DDI_SUCCESS)
866 		goto failure;
867 
868 	mutex_init(&xnfp->xnf_txlock,
869 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
870 	mutex_init(&xnfp->xnf_rxlock,
871 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
872 	mutex_init(&xnfp->xnf_schedlock,
873 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
874 	mutex_init(&xnfp->xnf_gref_lock,
875 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
876 
877 	cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL);
878 	cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL);
879 	cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL);
880 
881 	(void) sprintf(cachename, "xnf_buf_cache_%d",
882 	    ddi_get_instance(devinfo));
883 	xnfp->xnf_buf_cache = kmem_cache_create(cachename,
884 	    sizeof (xnf_buf_t), 0,
885 	    xnf_buf_constructor, xnf_buf_destructor,
886 	    NULL, xnfp, NULL, 0);
887 	if (xnfp->xnf_buf_cache == NULL)
888 		goto failure_0;
889 
890 	(void) sprintf(cachename, "xnf_tx_buf_cache_%d",
891 	    ddi_get_instance(devinfo));
892 	xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename,
893 	    sizeof (xnf_txbuf_t), 0,
894 	    xnf_tx_buf_constructor, xnf_tx_buf_destructor,
895 	    NULL, xnfp, NULL, 0);
896 	if (xnfp->xnf_tx_buf_cache == NULL)
897 		goto failure_1;
898 
899 	xnfp->xnf_gref_head = INVALID_GRANT_REF;
900 
901 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
902 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
903 		    "driver data structures",
904 		    ddi_get_instance(xnfp->xnf_devinfo));
905 		goto failure_2;
906 	}
907 
908 	xnfp->xnf_rx_ring.sring->rsp_event =
909 	    xnfp->xnf_tx_ring.sring->rsp_event = 1;
910 
911 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
912 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
913 
914 	/* set driver private pointer now */
915 	ddi_set_driver_private(devinfo, xnfp);
916 
917 	if (!xnf_kstat_init(xnfp))
918 		goto failure_3;
919 
920 	/*
921 	 * Allocate an event channel, add the interrupt handler and
922 	 * bind it to the event channel.
923 	 */
924 	(void) xvdi_alloc_evtchn(devinfo);
925 	xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
926 #ifdef XPV_HVM_DRIVER
927 	ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
928 #else
929 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
930 #endif
931 
932 	err = mac_register(macp, &xnfp->xnf_mh);
933 	mac_free(macp);
934 	macp = NULL;
935 	if (err != 0)
936 		goto failure_4;
937 
938 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
939 	    != DDI_SUCCESS)
940 		goto failure_5;
941 
942 #ifdef XPV_HVM_DRIVER
943 	/*
944 	 * In the HVM case, this driver essentially replaces a driver for
945 	 * a 'real' PCI NIC. Without the "model" property set to
946 	 * "Ethernet controller", like the PCI code does, netbooting does
947 	 * not work correctly, as strplumb_get_netdev_path() will not find
948 	 * this interface.
949 	 */
950 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
951 	    "Ethernet controller");
952 #endif
953 
954 #ifdef XNF_DEBUG
955 	if (xnf_debug_instance == NULL)
956 		xnf_debug_instance = xnfp;
957 #endif
958 
959 	return (DDI_SUCCESS);
960 
961 failure_5:
962 	(void) mac_unregister(xnfp->xnf_mh);
963 
964 failure_4:
965 #ifdef XPV_HVM_DRIVER
966 	ec_unbind_evtchn(xnfp->xnf_evtchn);
967 	xvdi_free_evtchn(devinfo);
968 #else
969 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
970 #endif
971 	xnfp->xnf_evtchn = INVALID_EVTCHN;
972 	kstat_delete(xnfp->xnf_kstat_aux);
973 
974 failure_3:
975 	xnf_release_dma_resources(xnfp);
976 
977 failure_2:
978 	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
979 
980 failure_1:
981 	kmem_cache_destroy(xnfp->xnf_buf_cache);
982 
983 failure_0:
984 	cv_destroy(&xnfp->xnf_cv_tx_slots);
985 	cv_destroy(&xnfp->xnf_cv_multicast);
986 	cv_destroy(&xnfp->xnf_cv_state);
987 
988 	mutex_destroy(&xnfp->xnf_gref_lock);
989 	mutex_destroy(&xnfp->xnf_schedlock);
990 	mutex_destroy(&xnfp->xnf_rxlock);
991 	mutex_destroy(&xnfp->xnf_txlock);
992 
993 failure:
994 	kmem_free(xnfp, sizeof (*xnfp));
995 	if (macp != NULL)
996 		mac_free(macp);
997 
998 	return (DDI_FAILURE);
999 }
1000 
/*
 *  detach(9E) -- Detach a device from the system
 *
 *  DDI_SUSPEND: removes the interrupt, suspends the xvdi layer, marks the
 *  instance disconnected and reports the link as down.  The instance
 *  itself is kept for a later DDI_RESUME.
 *
 *  DDI_DETACH: refused while the instance is connected, while any
 *  xnf_buf_t is still allocated, or if the MAC layer will not unregister
 *  it.  Otherwise every resource owned by the instance is released and
 *  the xnf_t is freed.
 *
 *  Returns DDI_SUCCESS or DDI_FAILURE.
 */
static int
xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
{
	xnf_t *xnfp;		/* Our private device info */

#ifdef XNF_DEBUG
	if (xnf_debug & XNF_DEBUG_DDI)
		printf("xnf_detach(0x%p)\n", (void *)devinfo);
#endif

	xnfp = ddi_get_driver_private(devinfo);

	switch (cmd) {
	case DDI_SUSPEND:
#ifdef XPV_HVM_DRIVER
		ec_unbind_evtchn(xnfp->xnf_evtchn);
		xvdi_free_evtchn(devinfo);
#else
		ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
#endif

		xvdi_suspend(devinfo);

		/*
		 * NOTE(review): rxlock is taken before txlock here, whereas
		 * the locking notes at the top of this file say txlock
		 * should be taken first when both are held.  Confirm which
		 * order the rest of the driver actually uses before
		 * changing either.
		 */
		mutex_enter(&xnfp->xnf_rxlock);
		mutex_enter(&xnfp->xnf_txlock);

		xnfp->xnf_evtchn = INVALID_EVTCHN;
		xnfp->xnf_connected = B_FALSE;
		mutex_exit(&xnfp->xnf_txlock);
		mutex_exit(&xnfp->xnf_rxlock);

		/* claim link to be down after disconnect */
		mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
		return (DDI_SUCCESS);

	case DDI_DETACH:
		break;

	default:
		return (DDI_FAILURE);
	}

	if (xnfp->xnf_connected)
		return (DDI_FAILURE);

	/*
	 * Cannot detach if we have xnf_buf_t outstanding.
	 */
	if (xnfp->xnf_stat_buf_allocated > 0)
		return (DDI_FAILURE);

	/* Past this point the detach cannot fail. */
	if (mac_unregister(xnfp->xnf_mh) != 0)
		return (DDI_FAILURE);

	kstat_delete(xnfp->xnf_kstat_aux);

	/* Stop the receiver */
	xnf_stop(xnfp);

	xvdi_remove_event_handler(devinfo, XS_OE_STATE);

	/* Remove the interrupt */
#ifdef XPV_HVM_DRIVER
	ec_unbind_evtchn(xnfp->xnf_evtchn);
	xvdi_free_evtchn(devinfo);
#else
	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
#endif

	/* Release any pending xmit mblks */
	xnf_release_mblks(xnfp);

	/* Release all DMA resources */
	xnf_release_dma_resources(xnfp);

	cv_destroy(&xnfp->xnf_cv_tx_slots);
	cv_destroy(&xnfp->xnf_cv_multicast);
	cv_destroy(&xnfp->xnf_cv_state);

	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
	kmem_cache_destroy(xnfp->xnf_buf_cache);

	mutex_destroy(&xnfp->xnf_gref_lock);
	mutex_destroy(&xnfp->xnf_schedlock);
	mutex_destroy(&xnfp->xnf_rxlock);
	mutex_destroy(&xnfp->xnf_txlock);

	kmem_free(xnfp, sizeof (*xnfp));

	return (DDI_SUCCESS);
}
1093 
1094 /*
1095  *  xnf_set_mac_addr() -- set the physical network address on the board.
1096  */
1097 static int
1098 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
1099 {
1100 	_NOTE(ARGUNUSED(arg, macaddr));
1101 
1102 	/*
1103 	 * We can't set our macaddr.
1104 	 */
1105 	return (ENOTSUP);
1106 }
1107 
1108 /*
1109  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
1110  *
1111  *  Program the hardware to enable/disable the multicast address
1112  *  in "mca".  Enable if "add" is true, disable if false.
1113  */
1114 static int
1115 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
1116 {
1117 	xnf_t *xnfp = arg;
1118 	xnf_txbuf_t *txp;
1119 	int n_slots;
1120 	RING_IDX slot;
1121 	xnf_txid_t *tidp;
1122 	netif_tx_request_t *txrp;
1123 	struct netif_extra_info *erp;
1124 	boolean_t notify, result;
1125 
1126 	/*
1127 	 * If the backend does not support multicast control then we
1128 	 * must assume that the right packets will just arrive.
1129 	 */
1130 	if (!xnfp->xnf_be_mcast_control)
1131 		return (0);
1132 
1133 	txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
1134 
1135 	mutex_enter(&xnfp->xnf_txlock);
1136 
1137 	/*
1138 	 * If we're not yet connected then claim success. This is
1139 	 * acceptable because we refresh the entire set of multicast
1140 	 * addresses when we get connected.
1141 	 *
1142 	 * We can't wait around here because the MAC layer expects
1143 	 * this to be a non-blocking operation - waiting ends up
1144 	 * causing a deadlock during resume.
1145 	 */
1146 	if (!xnfp->xnf_connected) {
1147 		mutex_exit(&xnfp->xnf_txlock);
1148 		return (0);
1149 	}
1150 
1151 	/*
1152 	 * 1. Acquire two slots in the ring.
1153 	 * 2. Fill in the slots.
1154 	 * 3. Request notification when the operation is done.
1155 	 * 4. Kick the peer.
1156 	 * 5. Wait for the response via xnf_tx_clean_ring().
1157 	 */
1158 
1159 	n_slots = tx_slots_get(xnfp, 2, B_TRUE);
1160 	ASSERT(n_slots >= 2);
1161 
1162 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1163 	tidp = txid_get(xnfp);
1164 	VERIFY(tidp != NULL);
1165 
1166 	txp->tx_type = TX_MCAST_REQ;
1167 	txp->tx_slot = slot;
1168 
1169 	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1170 	erp = (struct netif_extra_info *)
1171 	    RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1);
1172 
1173 	txrp->gref = 0;
1174 	txrp->size = 0;
1175 	txrp->offset = 0;
1176 	/* Set tx_txreq.id to appease xnf_tx_clean_ring(). */
1177 	txrp->id = txp->tx_txreq.id = tidp->id;
1178 	txrp->flags = NETTXF_extra_info;
1179 
1180 	erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
1181 	    XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
1182 	bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL);
1183 
1184 	tidp->txbuf = txp;
1185 
1186 	xnfp->xnf_tx_ring.req_prod_pvt = slot + 2;
1187 
1188 	mutex_enter(&xnfp->xnf_schedlock);
1189 	xnfp->xnf_pending_multicast++;
1190 	mutex_exit(&xnfp->xnf_schedlock);
1191 
1192 	/* LINTED: constant in conditional context */
1193 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1194 	    notify);
1195 	if (notify)
1196 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1197 
1198 	while (txp->tx_type == TX_MCAST_REQ)
1199 		cv_wait(&xnfp->xnf_cv_multicast,
1200 		    &xnfp->xnf_txlock);
1201 
1202 	ASSERT(txp->tx_type == TX_MCAST_RSP);
1203 
1204 	mutex_enter(&xnfp->xnf_schedlock);
1205 	xnfp->xnf_pending_multicast--;
1206 	mutex_exit(&xnfp->xnf_schedlock);
1207 
1208 	result = (txp->tx_status == NETIF_RSP_OKAY);
1209 
1210 	txid_put(xnfp, tidp);
1211 
1212 	mutex_exit(&xnfp->xnf_txlock);
1213 
1214 	kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1215 
1216 	return (result ? 0 : 1);
1217 }
1218 
1219 /*
1220  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
1221  *
1222  *  Program the hardware to enable/disable promiscuous mode.
1223  */
1224 static int
1225 xnf_set_promiscuous(void *arg, boolean_t on)
1226 {
1227 	_NOTE(ARGUNUSED(arg, on));
1228 
1229 	/*
1230 	 * We can't really do this, but we pretend that we can in
1231 	 * order that snoop will work.
1232 	 */
1233 	return (0);
1234 }
1235 
1236 /*
1237  * Clean buffers that we have responses for from the transmit ring.
1238  */
1239 static int
1240 xnf_tx_clean_ring(xnf_t *xnfp)
1241 {
1242 	boolean_t work_to_do;
1243 
1244 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1245 
1246 loop:
1247 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
1248 		RING_IDX cons, prod, i;
1249 
1250 		cons = xnfp->xnf_tx_ring.rsp_cons;
1251 		prod = xnfp->xnf_tx_ring.sring->rsp_prod;
1252 		membar_consumer();
1253 		/*
1254 		 * Clean tx requests from ring that we have responses
1255 		 * for.
1256 		 */
1257 		DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod);
1258 		for (i = cons; i != prod; i++) {
1259 			netif_tx_response_t *trp;
1260 			xnf_txid_t *tidp;
1261 			xnf_txbuf_t *txp;
1262 
1263 			trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
1264 			ASSERT(TX_ID_VALID(trp->id));
1265 
1266 			tidp = TX_ID_TO_TXID(xnfp, trp->id);
1267 			ASSERT(tidp->id == trp->id);
1268 			ASSERT(tidp->next == INVALID_TX_ID);
1269 
1270 			txp = tidp->txbuf;
1271 			ASSERT(txp != NULL);
1272 			ASSERT(txp->tx_txreq.id == trp->id);
1273 
1274 			switch (txp->tx_type) {
1275 			case TX_DATA:
1276 				if (gnttab_query_foreign_access(
1277 				    txp->tx_txreq.gref) != 0)
1278 					cmn_err(CE_PANIC,
1279 					    "tx grant %d still in use by "
1280 					    "backend domain",
1281 					    txp->tx_txreq.gref);
1282 
1283 				if (txp->tx_bdesc == NULL) {
1284 					(void) gnttab_end_foreign_access_ref(
1285 					    txp->tx_txreq.gref, 1);
1286 					gref_put(xnfp, txp->tx_txreq.gref);
1287 					(void) ddi_dma_unbind_handle(
1288 					    txp->tx_dma_handle);
1289 				} else {
1290 					xnf_buf_put(xnfp, txp->tx_bdesc,
1291 					    B_TRUE);
1292 				}
1293 
1294 				freemsg(txp->tx_mp);
1295 				txid_put(xnfp, tidp);
1296 				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1297 
1298 				break;
1299 
1300 			case TX_MCAST_REQ:
1301 				txp->tx_type = TX_MCAST_RSP;
1302 				txp->tx_status = trp->status;
1303 				cv_broadcast(&xnfp->xnf_cv_multicast);
1304 
1305 				break;
1306 
1307 			case TX_MCAST_RSP:
1308 				break;
1309 
1310 			default:
1311 				cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
1312 				    "invalid xnf_txbuf_t type: %d",
1313 				    txp->tx_type);
1314 				break;
1315 			}
1316 		}
1317 		/*
1318 		 * Record the last response we dealt with so that we
1319 		 * know where to start next time around.
1320 		 */
1321 		xnfp->xnf_tx_ring.rsp_cons = prod;
1322 		membar_enter();
1323 	}
1324 
1325 	/* LINTED: constant in conditional context */
1326 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
1327 	if (work_to_do)
1328 		goto loop;
1329 
1330 	return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
1331 }
1332 
1333 /*
1334  * Allocate and fill in a look-aside buffer for the packet `mp'. Used
1335  * to ensure that the packet is physically contiguous and contained
1336  * within a single page.
1337  */
1338 static xnf_buf_t *
1339 xnf_tx_pullup(xnf_t *xnfp, mblk_t *mp)
1340 {
1341 	xnf_buf_t *bd;
1342 	caddr_t bp;
1343 
1344 	bd = xnf_buf_get(xnfp, KM_SLEEP, B_TRUE);
1345 	if (bd == NULL)
1346 		return (NULL);
1347 
1348 	bp = bd->buf;
1349 	while (mp != NULL) {
1350 		size_t len = MBLKL(mp);
1351 
1352 		bcopy(mp->b_rptr, bp, len);
1353 		bp += len;
1354 
1355 		mp = mp->b_cont;
1356 	}
1357 
1358 	ASSERT((bp - bd->buf) <= PAGESIZE);
1359 
1360 	xnfp->xnf_stat_tx_pullup++;
1361 
1362 	return (bd);
1363 }
1364 
1365 /*
1366  * Insert the pseudo-header checksum into the packet `buf'.
1367  */
1368 void
1369 xnf_pseudo_cksum(caddr_t buf, int length)
1370 {
1371 	struct ether_header *ehp;
1372 	uint16_t sap, len, *stuff;
1373 	uint32_t cksum;
1374 	size_t offset;
1375 	ipha_t *ipha;
1376 	ipaddr_t src, dst;
1377 
1378 	ASSERT(length >= sizeof (*ehp));
1379 	ehp = (struct ether_header *)buf;
1380 
1381 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
1382 		struct ether_vlan_header *evhp;
1383 
1384 		ASSERT(length >= sizeof (*evhp));
1385 		evhp = (struct ether_vlan_header *)buf;
1386 		sap = ntohs(evhp->ether_type);
1387 		offset = sizeof (*evhp);
1388 	} else {
1389 		sap = ntohs(ehp->ether_type);
1390 		offset = sizeof (*ehp);
1391 	}
1392 
1393 	ASSERT(sap == ETHERTYPE_IP);
1394 
1395 	/* Packet should have been pulled up by the caller. */
1396 	if ((offset + sizeof (ipha_t)) > length) {
1397 		cmn_err(CE_WARN, "xnf_pseudo_cksum: no room for checksum");
1398 		return;
1399 	}
1400 
1401 	ipha = (ipha_t *)(buf + offset);
1402 
1403 	ASSERT(IPH_HDR_LENGTH(ipha) == IP_SIMPLE_HDR_LENGTH);
1404 
1405 	len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
1406 
1407 	switch (ipha->ipha_protocol) {
1408 	case IPPROTO_TCP:
1409 		stuff = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
1410 		cksum = IP_TCP_CSUM_COMP;
1411 		break;
1412 	case IPPROTO_UDP:
1413 		stuff = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
1414 		cksum = IP_UDP_CSUM_COMP;
1415 		break;
1416 	default:
1417 		cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
1418 		    ipha->ipha_protocol);
1419 		return;
1420 	}
1421 
1422 	src = ipha->ipha_src;
1423 	dst = ipha->ipha_dst;
1424 
1425 	cksum += (dst >> 16) + (dst & 0xFFFF);
1426 	cksum += (src >> 16) + (src & 0xFFFF);
1427 	cksum += htons(len);
1428 
1429 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1430 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1431 
1432 	ASSERT(cksum <= 0xFFFF);
1433 
1434 	*stuff = (uint16_t)(cksum ? cksum : ~cksum);
1435 }
1436 
1437 /*
1438  * Push a list of prepared packets (`txp') into the transmit ring.
1439  */
1440 static xnf_txbuf_t *
1441 tx_push_packets(xnf_t *xnfp, xnf_txbuf_t *txp)
1442 {
1443 	int slots_free;
1444 	RING_IDX slot;
1445 	boolean_t notify;
1446 
1447 	mutex_enter(&xnfp->xnf_txlock);
1448 
1449 	ASSERT(xnfp->xnf_running);
1450 
1451 	/*
1452 	 * Wait until we are connected to the backend.
1453 	 */
1454 	while (!xnfp->xnf_connected)
1455 		cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
1456 
1457 	slots_free = tx_slots_get(xnfp, 1, B_FALSE);
1458 	DTRACE_PROBE1(xnf_send_slotsfree, int, slots_free);
1459 
1460 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1461 
1462 	while ((txp != NULL) && (slots_free > 0)) {
1463 		xnf_txid_t *tidp;
1464 		netif_tx_request_t *txrp;
1465 
1466 		tidp = txid_get(xnfp);
1467 		VERIFY(tidp != NULL);
1468 
1469 		txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1470 
1471 		txp->tx_slot = slot;
1472 		txp->tx_txreq.id = tidp->id;
1473 		*txrp = txp->tx_txreq;
1474 
1475 		tidp->txbuf = txp;
1476 
1477 		xnfp->xnf_stat_opackets++;
1478 		xnfp->xnf_stat_obytes += txp->tx_txreq.size;
1479 
1480 		txp = txp->tx_next;
1481 		slots_free--;
1482 		slot++;
1483 
1484 	}
1485 
1486 	xnfp->xnf_tx_ring.req_prod_pvt = slot;
1487 
1488 	/*
1489 	 * Tell the peer that we sent something, if it cares.
1490 	 */
1491 	/* LINTED: constant in conditional context */
1492 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1493 	    notify);
1494 	if (notify)
1495 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1496 
1497 	mutex_exit(&xnfp->xnf_txlock);
1498 
1499 	return (txp);
1500 }
1501 
1502 /*
1503  * Send the chain of packets `mp'. Called by the MAC framework.
1504  */
1505 static mblk_t *
1506 xnf_send(void *arg, mblk_t *mp)
1507 {
1508 	xnf_t *xnfp = arg;
1509 	domid_t oeid;
1510 	xnf_txbuf_t *head, *tail;
1511 	mblk_t *ml;
1512 	int prepared;
1513 
1514 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1515 
1516 	/*
1517 	 * Prepare packets for transmission.
1518 	 */
1519 	head = tail = NULL;
1520 	prepared = 0;
1521 	while (mp != NULL) {
1522 		xnf_txbuf_t *txp;
1523 		int n_chunks, length;
1524 		boolean_t page_oops;
1525 		uint32_t pflags;
1526 
1527 		for (ml = mp, n_chunks = length = 0, page_oops = B_FALSE;
1528 		    ml != NULL;
1529 		    ml = ml->b_cont, n_chunks++) {
1530 
1531 			/*
1532 			 * Test if this buffer includes a page
1533 			 * boundary. The test assumes that the range
1534 			 * b_rptr...b_wptr can include only a single
1535 			 * boundary.
1536 			 */
1537 			if (xnf_btop((size_t)ml->b_rptr) !=
1538 			    xnf_btop((size_t)ml->b_wptr)) {
1539 				xnfp->xnf_stat_tx_pagebndry++;
1540 				page_oops = B_TRUE;
1541 			}
1542 
1543 			length += MBLKL(ml);
1544 		}
1545 		DTRACE_PROBE1(xnf_send_b_cont, int, n_chunks);
1546 
1547 		/*
1548 		 * Make sure packet isn't too large.
1549 		 */
1550 		if (length > XNF_FRAMESIZE) {
1551 			cmn_err(CE_WARN,
1552 			    "xnf%d: oversized packet (%d bytes) dropped",
1553 			    ddi_get_instance(xnfp->xnf_devinfo), length);
1554 			freemsg(mp);
1555 			continue;
1556 		}
1557 
1558 		txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
1559 
1560 		txp->tx_type = TX_DATA;
1561 
1562 		if ((n_chunks > xnf_max_tx_frags) || page_oops) {
1563 			/*
1564 			 * Loan a side buffer rather than the mblk
1565 			 * itself.
1566 			 */
1567 			txp->tx_bdesc = xnf_tx_pullup(xnfp, mp);
1568 			if (txp->tx_bdesc == NULL) {
1569 				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1570 				break;
1571 			}
1572 
1573 			txp->tx_bufp = txp->tx_bdesc->buf;
1574 			txp->tx_mfn = txp->tx_bdesc->buf_mfn;
1575 			txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
1576 
1577 		} else {
1578 			int rc;
1579 			ddi_dma_cookie_t dma_cookie;
1580 			uint_t ncookies;
1581 
1582 			rc = ddi_dma_addr_bind_handle(txp->tx_dma_handle,
1583 			    NULL, (char *)mp->b_rptr, length,
1584 			    DDI_DMA_WRITE | DDI_DMA_STREAMING,
1585 			    DDI_DMA_DONTWAIT, 0, &dma_cookie,
1586 			    &ncookies);
1587 			if (rc != DDI_DMA_MAPPED) {
1588 				ASSERT(rc != DDI_DMA_INUSE);
1589 				ASSERT(rc != DDI_DMA_PARTIAL_MAP);
1590 
1591 #ifdef XNF_DEBUG
1592 				if (rc != DDI_DMA_NORESOURCES)
1593 					cmn_err(CE_WARN,
1594 					    "xnf%d: bind_handle failed (%x)",
1595 					    ddi_get_instance(xnfp->xnf_devinfo),
1596 					    rc);
1597 #endif
1598 				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1599 				break;
1600 			}
1601 			ASSERT(ncookies == 1);
1602 
1603 			txp->tx_bdesc = NULL;
1604 			txp->tx_bufp = (caddr_t)mp->b_rptr;
1605 			txp->tx_mfn =
1606 			    xnf_btop(pa_to_ma(dma_cookie.dmac_laddress));
1607 			txp->tx_txreq.gref = gref_get(xnfp);
1608 			if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
1609 				(void) ddi_dma_unbind_handle(
1610 				    txp->tx_dma_handle);
1611 				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1612 				break;
1613 			}
1614 			gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
1615 			    oeid, txp->tx_mfn, 1);
1616 		}
1617 
1618 		txp->tx_next = NULL;
1619 		txp->tx_mp = mp;
1620 		txp->tx_txreq.size = length;
1621 		txp->tx_txreq.offset = (uintptr_t)txp->tx_bufp & PAGEOFFSET;
1622 		txp->tx_txreq.flags = 0;
1623 		mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &pflags);
1624 		if (pflags != 0) {
1625 			/*
1626 			 * If the local protocol stack requests checksum
1627 			 * offload we set the 'checksum blank' flag,
1628 			 * indicating to the peer that we need the checksum
1629 			 * calculated for us.
1630 			 *
1631 			 * We _don't_ set the validated flag, because we haven't
1632 			 * validated that the data and the checksum match.
1633 			 */
1634 			xnf_pseudo_cksum(txp->tx_bufp, length);
1635 			txp->tx_txreq.flags |= NETTXF_csum_blank;
1636 
1637 			xnfp->xnf_stat_tx_cksum_deferred++;
1638 		}
1639 
1640 		if (head == NULL) {
1641 			ASSERT(tail == NULL);
1642 
1643 			head = txp;
1644 		} else {
1645 			ASSERT(tail != NULL);
1646 
1647 			tail->tx_next = txp;
1648 		}
1649 		tail = txp;
1650 
1651 		mp = mp->b_next;
1652 		prepared++;
1653 
1654 		/*
1655 		 * There is no point in preparing more than
1656 		 * NET_TX_RING_SIZE, as we won't be able to push them
1657 		 * into the ring in one go and would hence have to
1658 		 * un-prepare the extra.
1659 		 */
1660 		if (prepared == NET_TX_RING_SIZE)
1661 			break;
1662 	}
1663 
1664 	DTRACE_PROBE1(xnf_send_prepared, int, prepared);
1665 
1666 	if (mp != NULL) {
1667 #ifdef XNF_DEBUG
1668 		int notprepared = 0;
1669 		mblk_t *l = mp;
1670 
1671 		while (l != NULL) {
1672 			notprepared++;
1673 			l = l->b_next;
1674 		}
1675 
1676 		DTRACE_PROBE1(xnf_send_notprepared, int, notprepared);
1677 #else /* !XNF_DEBUG */
1678 		DTRACE_PROBE1(xnf_send_notprepared, int, -1);
1679 #endif /* XNF_DEBUG */
1680 	}
1681 
1682 	/*
1683 	 * Push the packets we have prepared into the ring. They may
1684 	 * not all go.
1685 	 */
1686 	if (head != NULL)
1687 		head = tx_push_packets(xnfp, head);
1688 
1689 	/*
1690 	 * If some packets that we prepared were not sent, unprepare
1691 	 * them and add them back to the head of those we didn't
1692 	 * prepare.
1693 	 */
1694 	{
1695 		xnf_txbuf_t *loop;
1696 		mblk_t *mp_head, *mp_tail;
1697 		int unprepared = 0;
1698 
1699 		mp_head = mp_tail = NULL;
1700 		loop = head;
1701 
1702 		while (loop != NULL) {
1703 			xnf_txbuf_t *next = loop->tx_next;
1704 
1705 			if (loop->tx_bdesc == NULL) {
1706 				(void) gnttab_end_foreign_access_ref(
1707 				    loop->tx_txreq.gref, 1);
1708 				gref_put(xnfp, loop->tx_txreq.gref);
1709 				(void) ddi_dma_unbind_handle(
1710 				    loop->tx_dma_handle);
1711 			} else {
1712 				xnf_buf_put(xnfp, loop->tx_bdesc, B_TRUE);
1713 			}
1714 
1715 			ASSERT(loop->tx_mp != NULL);
1716 			if (mp_head == NULL)
1717 				mp_head = loop->tx_mp;
1718 			mp_tail = loop->tx_mp;
1719 
1720 			kmem_cache_free(xnfp->xnf_tx_buf_cache, loop);
1721 			loop = next;
1722 			unprepared++;
1723 		}
1724 
1725 		if (mp_tail == NULL) {
1726 			ASSERT(mp_head == NULL);
1727 		} else {
1728 			ASSERT(mp_head != NULL);
1729 
1730 			mp_tail->b_next = mp;
1731 			mp = mp_head;
1732 		}
1733 
1734 		DTRACE_PROBE1(xnf_send_unprepared, int, unprepared);
1735 	}
1736 
1737 	/*
1738 	 * If any mblks are left then we have deferred for some reason
1739 	 * and need to ask for a re-schedule later. This is typically
1740 	 * due to the ring filling.
1741 	 */
1742 	if (mp != NULL) {
1743 		mutex_enter(&xnfp->xnf_schedlock);
1744 		xnfp->xnf_need_sched = B_TRUE;
1745 		mutex_exit(&xnfp->xnf_schedlock);
1746 
1747 		xnfp->xnf_stat_tx_defer++;
1748 	}
1749 
1750 	return (mp);
1751 }
1752 
1753 /*
1754  * Notification of RX packets. Currently no TX-complete interrupt is
1755  * used, as we clean the TX ring lazily.
1756  */
1757 static uint_t
1758 xnf_intr(caddr_t arg)
1759 {
1760 	xnf_t *xnfp = (xnf_t *)arg;
1761 	mblk_t *mp;
1762 	boolean_t need_sched, clean_ring;
1763 
1764 	mutex_enter(&xnfp->xnf_rxlock);
1765 
1766 	/*
1767 	 * Interrupts before we are connected are spurious.
1768 	 */
1769 	if (!xnfp->xnf_connected) {
1770 		mutex_exit(&xnfp->xnf_rxlock);
1771 		xnfp->xnf_stat_unclaimed_interrupts++;
1772 		return (DDI_INTR_UNCLAIMED);
1773 	}
1774 
1775 	/*
1776 	 * Receive side processing.
1777 	 */
1778 	do {
1779 		/*
1780 		 * Collect buffers from the ring.
1781 		 */
1782 		xnf_rx_collect(xnfp);
1783 
1784 		/*
1785 		 * Interrupt me when the next receive buffer is consumed.
1786 		 */
1787 		xnfp->xnf_rx_ring.sring->rsp_event =
1788 		    xnfp->xnf_rx_ring.rsp_cons + 1;
1789 		xen_mb();
1790 
1791 	} while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring));
1792 
1793 	if (xnfp->xnf_rx_new_buffers_posted) {
1794 		boolean_t notify;
1795 
1796 		/*
1797 		 * Indicate to the peer that we have re-filled the
1798 		 * receive ring, if it cares.
1799 		 */
1800 		/* LINTED: constant in conditional context */
1801 		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
1802 		if (notify)
1803 			ec_notify_via_evtchn(xnfp->xnf_evtchn);
1804 		xnfp->xnf_rx_new_buffers_posted = B_FALSE;
1805 	}
1806 
1807 	mp = xnfp->xnf_rx_head;
1808 	xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL;
1809 
1810 	xnfp->xnf_stat_interrupts++;
1811 	mutex_exit(&xnfp->xnf_rxlock);
1812 
1813 	if (mp != NULL)
1814 		mac_rx(xnfp->xnf_mh, NULL, mp);
1815 
1816 	/*
1817 	 * Transmit side processing.
1818 	 *
1819 	 * If a previous transmit attempt failed or we have pending
1820 	 * multicast requests, clean the ring.
1821 	 *
1822 	 * If we previously stalled transmission and cleaning produces
1823 	 * some free slots, tell upstream to attempt sending again.
1824 	 *
1825 	 * The odd style is to avoid acquiring xnf_txlock unless we
1826 	 * will actually look inside the tx machinery.
1827 	 */
1828 	mutex_enter(&xnfp->xnf_schedlock);
1829 	need_sched = xnfp->xnf_need_sched;
1830 	clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0);
1831 	mutex_exit(&xnfp->xnf_schedlock);
1832 
1833 	if (clean_ring) {
1834 		int free_slots;
1835 
1836 		mutex_enter(&xnfp->xnf_txlock);
1837 		free_slots = tx_slots_get(xnfp, 0, B_FALSE);
1838 
1839 		if (need_sched && (free_slots > 0)) {
1840 			mutex_enter(&xnfp->xnf_schedlock);
1841 			xnfp->xnf_need_sched = B_FALSE;
1842 			mutex_exit(&xnfp->xnf_schedlock);
1843 
1844 			mac_tx_update(xnfp->xnf_mh);
1845 		}
1846 		mutex_exit(&xnfp->xnf_txlock);
1847 	}
1848 
1849 	return (DDI_INTR_CLAIMED);
1850 }
1851 
1852 /*
1853  *  xnf_start() -- start the board receiving and enable interrupts.
1854  */
1855 static int
1856 xnf_start(void *arg)
1857 {
1858 	xnf_t *xnfp = arg;
1859 
1860 #ifdef XNF_DEBUG
1861 	if (xnf_debug & XNF_DEBUG_TRACE)
1862 		printf("xnf%d start(0x%p)\n",
1863 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1864 #endif
1865 
1866 	mutex_enter(&xnfp->xnf_rxlock);
1867 	mutex_enter(&xnfp->xnf_txlock);
1868 
1869 	/* Accept packets from above. */
1870 	xnfp->xnf_running = B_TRUE;
1871 
1872 	mutex_exit(&xnfp->xnf_txlock);
1873 	mutex_exit(&xnfp->xnf_rxlock);
1874 
1875 	return (0);
1876 }
1877 
1878 /* xnf_stop() - disable hardware */
1879 static void
1880 xnf_stop(void *arg)
1881 {
1882 	xnf_t *xnfp = arg;
1883 
1884 #ifdef XNF_DEBUG
1885 	if (xnf_debug & XNF_DEBUG_TRACE)
1886 		printf("xnf%d stop(0x%p)\n",
1887 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1888 #endif
1889 
1890 	mutex_enter(&xnfp->xnf_rxlock);
1891 	mutex_enter(&xnfp->xnf_txlock);
1892 
1893 	xnfp->xnf_running = B_FALSE;
1894 
1895 	mutex_exit(&xnfp->xnf_txlock);
1896 	mutex_exit(&xnfp->xnf_rxlock);
1897 }
1898 
1899 /*
1900  * Hang buffer `bdesc' on the RX ring.
1901  */
1902 static void
1903 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
1904 {
1905 	netif_rx_request_t *reqp;
1906 	RING_IDX hang_ix;
1907 
1908 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
1909 
1910 	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
1911 	    xnfp->xnf_rx_ring.req_prod_pvt);
1912 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
1913 	ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL);
1914 
1915 	reqp->id = bdesc->id = hang_ix;
1916 	reqp->gref = bdesc->grant_ref;
1917 
1918 	xnfp->xnf_rx_pkt_info[hang_ix] = bdesc;
1919 	xnfp->xnf_rx_ring.req_prod_pvt++;
1920 
1921 	xnfp->xnf_rx_new_buffers_posted = B_TRUE;
1922 }
1923 
1924 /*
1925  * Collect packets from the RX ring, storing them in `xnfp' for later
1926  * use.
1927  */
1928 static void
1929 xnf_rx_collect(xnf_t *xnfp)
1930 {
1931 	mblk_t *head, *tail;
1932 
1933 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
1934 
1935 	/*
1936 	 * Loop over unconsumed responses:
1937 	 * 1. get a response
1938 	 * 2. take corresponding buffer off recv. ring
1939 	 * 3. indicate this by setting slot to NULL
1940 	 * 4. create a new message and
1941 	 * 5. copy data in, adjust ptr
1942 	 */
1943 
1944 	head = tail = NULL;
1945 
1946 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1947 		netif_rx_response_t *rxpkt;
1948 		xnf_buf_t *bdesc;
1949 		ssize_t len;
1950 		size_t off;
1951 		mblk_t *mp = NULL;
1952 		boolean_t hwcsum = B_FALSE;
1953 		grant_ref_t ref;
1954 
1955 		/* 1. */
1956 		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
1957 		    xnfp->xnf_rx_ring.rsp_cons);
1958 
1959 		DTRACE_PROBE4(xnf_rx_got_rsp, int, (int)rxpkt->id,
1960 		    int, (int)rxpkt->offset,
1961 		    int, (int)rxpkt->flags,
1962 		    int, (int)rxpkt->status);
1963 
1964 		/*
1965 		 * 2.
1966 		 */
1967 		bdesc = xnfp->xnf_rx_pkt_info[rxpkt->id];
1968 
1969 		/*
1970 		 * 3.
1971 		 */
1972 		xnfp->xnf_rx_pkt_info[rxpkt->id] = NULL;
1973 		ASSERT(bdesc->id == rxpkt->id);
1974 
1975 		ref = bdesc->grant_ref;
1976 		off = rxpkt->offset;
1977 		len = rxpkt->status;
1978 
1979 		if (!xnfp->xnf_running) {
1980 			DTRACE_PROBE4(xnf_rx_not_running,
1981 			    int, rxpkt->status,
1982 			    char *, bdesc->buf, int, rxpkt->offset,
1983 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1984 
1985 			xnfp->xnf_stat_drop++;
1986 
1987 		} else if (len <= 0) {
1988 			DTRACE_PROBE4(xnf_rx_pkt_status_negative,
1989 			    int, rxpkt->status,
1990 			    char *, bdesc->buf, int, rxpkt->offset,
1991 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1992 
1993 			xnfp->xnf_stat_errrx++;
1994 
1995 			switch (len) {
1996 			case 0:
1997 				xnfp->xnf_stat_runt++;
1998 				break;
1999 			case NETIF_RSP_ERROR:
2000 				xnfp->xnf_stat_mac_rcv_error++;
2001 				break;
2002 			case NETIF_RSP_DROPPED:
2003 				xnfp->xnf_stat_norxbuf++;
2004 				break;
2005 			}
2006 
2007 		} else if (bdesc->grant_ref == INVALID_GRANT_REF) {
2008 			cmn_err(CE_WARN, "Bad rx grant reference %d "
2009 			    "from domain %d", ref,
2010 			    xvdi_get_oeid(xnfp->xnf_devinfo));
2011 
2012 		} else if ((off + len) > PAGESIZE) {
2013 			cmn_err(CE_WARN, "Rx packet overflows page "
2014 			    "(offset %ld, length %ld) from domain %d",
2015 			    off, len, xvdi_get_oeid(xnfp->xnf_devinfo));
2016 		} else {
2017 			xnf_buf_t *nbuf = NULL;
2018 
2019 			DTRACE_PROBE4(xnf_rx_packet, int, len,
2020 			    char *, bdesc->buf, int, off,
2021 			    char *, ((char *)bdesc->buf) + off);
2022 
2023 			ASSERT(off + len <= PAGEOFFSET);
2024 
2025 			if (rxpkt->flags & NETRXF_data_validated)
2026 				hwcsum = B_TRUE;
2027 
2028 			/*
2029 			 * If the packet is below a pre-determined
2030 			 * size we will copy data out rather than
2031 			 * replace it.
2032 			 */
2033 			if (len > xnf_rx_copy_limit)
2034 				nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
2035 
2036 			/*
2037 			 * If we have a replacement buffer, attempt to
2038 			 * wrap the existing one with an mblk_t in
2039 			 * order that the upper layers of the stack
2040 			 * might use it directly.
2041 			 */
2042 			if (nbuf != NULL) {
2043 				mp = desballoc((unsigned char *)bdesc->buf,
2044 				    bdesc->len, 0, &bdesc->free_rtn);
2045 				if (mp == NULL) {
2046 					xnfp->xnf_stat_rx_desballoc_fail++;
2047 					xnfp->xnf_stat_norxbuf++;
2048 
2049 					xnf_buf_put(xnfp, nbuf, B_FALSE);
2050 					nbuf = NULL;
2051 				} else {
2052 					mp->b_rptr = mp->b_rptr + off;
2053 					mp->b_wptr = mp->b_rptr + len;
2054 
2055 					/*
2056 					 * Release the grant reference
2057 					 * associated with this buffer
2058 					 * - they are scarce and the
2059 					 * upper layers of the stack
2060 					 * don't need it.
2061 					 */
2062 					(void) gnttab_end_foreign_access_ref(
2063 					    bdesc->grant_ref, 0);
2064 					gref_put(xnfp, bdesc->grant_ref);
2065 					bdesc->grant_ref = INVALID_GRANT_REF;
2066 
2067 					bdesc = nbuf;
2068 				}
2069 			}
2070 
2071 			if (nbuf == NULL) {
2072 				/*
2073 				 * No replacement buffer allocated -
2074 				 * attempt to copy the data out and
2075 				 * re-hang the existing buffer.
2076 				 */
2077 
2078 				/* 4. */
2079 				mp = allocb(len, BPRI_MED);
2080 				if (mp == NULL) {
2081 					xnfp->xnf_stat_rx_allocb_fail++;
2082 					xnfp->xnf_stat_norxbuf++;
2083 				} else {
2084 					/* 5. */
2085 					bcopy(bdesc->buf + off, mp->b_wptr,
2086 					    len);
2087 					mp->b_wptr += len;
2088 				}
2089 			}
2090 		}
2091 
2092 		/* Re-hang the buffer. */
2093 		xnf_rxbuf_hang(xnfp, bdesc);
2094 
2095 		if (mp != NULL) {
2096 			if (hwcsum) {
2097 				/*
2098 				 * If the peer says that the data has
2099 				 * been validated then we declare that
2100 				 * the full checksum has been
2101 				 * verified.
2102 				 *
2103 				 * We don't look at the "checksum
2104 				 * blank" flag, and hence could have a
2105 				 * packet here that we are asserting
2106 				 * is good with a blank checksum.
2107 				 */
2108 				mac_hcksum_set(mp, 0, 0, 0, 0,
2109 				    HCK_FULLCKSUM_OK);
2110 				xnfp->xnf_stat_rx_cksum_no_need++;
2111 			}
2112 			if (head == NULL) {
2113 				ASSERT(tail == NULL);
2114 
2115 				head = mp;
2116 			} else {
2117 				ASSERT(tail != NULL);
2118 
2119 				tail->b_next = mp;
2120 			}
2121 			tail = mp;
2122 
2123 			ASSERT(mp->b_next == NULL);
2124 
2125 			xnfp->xnf_stat_ipackets++;
2126 			xnfp->xnf_stat_rbytes += len;
2127 		}
2128 
2129 		xnfp->xnf_rx_ring.rsp_cons++;
2130 	}
2131 
2132 	/*
2133 	 * Store the mblks we have collected.
2134 	 */
2135 	if (head != NULL) {
2136 		ASSERT(tail != NULL);
2137 
2138 		if (xnfp->xnf_rx_head == NULL) {
2139 			ASSERT(xnfp->xnf_rx_tail == NULL);
2140 
2141 			xnfp->xnf_rx_head = head;
2142 		} else {
2143 			ASSERT(xnfp->xnf_rx_tail != NULL);
2144 
2145 			xnfp->xnf_rx_tail->b_next = head;
2146 		}
2147 		xnfp->xnf_rx_tail = tail;
2148 	}
2149 }
2150 
2151 /*
2152  *  xnf_alloc_dma_resources() -- initialize the drivers structures
2153  */
2154 static int
2155 xnf_alloc_dma_resources(xnf_t *xnfp)
2156 {
2157 	dev_info_t 		*devinfo = xnfp->xnf_devinfo;
2158 	size_t			len;
2159 	ddi_dma_cookie_t	dma_cookie;
2160 	uint_t			ncookies;
2161 	int			rc;
2162 	caddr_t			rptr;
2163 
2164 	/*
2165 	 * The code below allocates all the DMA data structures that
2166 	 * need to be released when the driver is detached.
2167 	 *
2168 	 * Allocate page for the transmit descriptor ring.
2169 	 */
2170 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2171 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
2172 		goto alloc_error;
2173 
2174 	if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
2175 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2176 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2177 	    &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
2178 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2179 		xnfp->xnf_tx_ring_dma_handle = NULL;
2180 		goto alloc_error;
2181 	}
2182 
2183 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
2184 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2185 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2186 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2187 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2188 		xnfp->xnf_tx_ring_dma_handle = NULL;
2189 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2190 		if (rc == DDI_DMA_NORESOURCES)
2191 			goto alloc_error;
2192 		else
2193 			goto error;
2194 	}
2195 
2196 	ASSERT(ncookies == 1);
2197 	bzero(rptr, PAGESIZE);
2198 	/* LINTED: constant in conditional context */
2199 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
2200 	/* LINTED: constant in conditional context */
2201 	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
2202 	xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
2203 
2204 	/*
2205 	 * Allocate page for the receive descriptor ring.
2206 	 */
2207 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2208 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
2209 		goto alloc_error;
2210 
2211 	if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
2212 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2213 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2214 	    &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
2215 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2216 		xnfp->xnf_rx_ring_dma_handle = NULL;
2217 		goto alloc_error;
2218 	}
2219 
2220 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
2221 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2222 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2223 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2224 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2225 		xnfp->xnf_rx_ring_dma_handle = NULL;
2226 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2227 		if (rc == DDI_DMA_NORESOURCES)
2228 			goto alloc_error;
2229 		else
2230 			goto error;
2231 	}
2232 
2233 	ASSERT(ncookies == 1);
2234 	bzero(rptr, PAGESIZE);
2235 	/* LINTED: constant in conditional context */
2236 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
2237 	/* LINTED: constant in conditional context */
2238 	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
2239 	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
2240 
2241 	return (DDI_SUCCESS);
2242 
2243 alloc_error:
2244 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
2245 	    ddi_get_instance(xnfp->xnf_devinfo));
2246 error:
2247 	xnf_release_dma_resources(xnfp);
2248 	return (DDI_FAILURE);
2249 }
2250 
2251 /*
2252  * Release all DMA resources in the opposite order from acquisition
2253  */
2254 static void
2255 xnf_release_dma_resources(xnf_t *xnfp)
2256 {
2257 	int i;
2258 
2259 	/*
2260 	 * Free receive buffers which are currently associated with
2261 	 * descriptors.
2262 	 */
2263 	mutex_enter(&xnfp->xnf_rxlock);
2264 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
2265 		xnf_buf_t *bp;
2266 
2267 		if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL)
2268 			continue;
2269 		xnfp->xnf_rx_pkt_info[i] = NULL;
2270 		xnf_buf_put(xnfp, bp, B_FALSE);
2271 	}
2272 	mutex_exit(&xnfp->xnf_rxlock);
2273 
2274 	/* Free the receive ring buffer. */
2275 	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2276 		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2277 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2278 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2279 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2280 	}
2281 	/* Free the transmit ring buffer. */
2282 	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2283 		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2284 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2285 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2286 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2287 	}
2288 
2289 }
2290 
2291 /*
2292  * Release any packets and associated structures used by the TX ring.
2293  */
2294 static void
2295 xnf_release_mblks(xnf_t *xnfp)
2296 {
2297 	RING_IDX i;
2298 	xnf_txid_t *tidp;
2299 
2300 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
2301 	    i < NET_TX_RING_SIZE;
2302 	    i++, tidp++) {
2303 		xnf_txbuf_t *txp = tidp->txbuf;
2304 
2305 		if (txp != NULL) {
2306 			ASSERT(txp->tx_mp != NULL);
2307 			freemsg(txp->tx_mp);
2308 
2309 			txid_put(xnfp, tidp);
2310 			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
2311 		}
2312 	}
2313 }
2314 
2315 static int
2316 xnf_buf_constructor(void *buf, void *arg, int kmflag)
2317 {
2318 	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2319 	xnf_buf_t *bdesc = buf;
2320 	xnf_t *xnfp = arg;
2321 	ddi_dma_cookie_t dma_cookie;
2322 	uint_t ncookies;
2323 	size_t len;
2324 
2325 	if (kmflag & KM_NOSLEEP)
2326 		ddiflags = DDI_DMA_DONTWAIT;
2327 
2328 	/* Allocate a DMA access handle for the buffer. */
2329 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
2330 	    ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2331 		goto failure;
2332 
2333 	/* Allocate DMA-able memory for buffer. */
2334 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2335 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0,
2336 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2337 		goto failure_1;
2338 
2339 	/* Bind to virtual address of buffer to get physical address. */
2340 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2341 	    bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
2342 	    ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2343 		goto failure_2;
2344 	ASSERT(ncookies == 1);
2345 
2346 	bdesc->free_rtn.free_func = xnf_buf_recycle;
2347 	bdesc->free_rtn.free_arg = (caddr_t)bdesc;
2348 	bdesc->xnfp = xnfp;
2349 	bdesc->buf_phys = dma_cookie.dmac_laddress;
2350 	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2351 	bdesc->len = dma_cookie.dmac_size;
2352 	bdesc->grant_ref = INVALID_GRANT_REF;
2353 	bdesc->gen = xnfp->xnf_gen;
2354 
2355 	atomic_inc_64(&xnfp->xnf_stat_buf_allocated);
2356 
2357 	return (0);
2358 
2359 failure_2:
2360 	ddi_dma_mem_free(&bdesc->acc_handle);
2361 
2362 failure_1:
2363 	ddi_dma_free_handle(&bdesc->dma_handle);
2364 
2365 failure:
2366 
2367 	ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2368 	return (-1);
2369 }
2370 
2371 static void
2372 xnf_buf_destructor(void *buf, void *arg)
2373 {
2374 	xnf_buf_t *bdesc = buf;
2375 	xnf_t *xnfp = arg;
2376 
2377 	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
2378 	ddi_dma_mem_free(&bdesc->acc_handle);
2379 	ddi_dma_free_handle(&bdesc->dma_handle);
2380 
2381 	atomic_dec_64(&xnfp->xnf_stat_buf_allocated);
2382 }
2383 
2384 static xnf_buf_t *
2385 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
2386 {
2387 	grant_ref_t gref;
2388 	xnf_buf_t *bufp;
2389 
2390 	/*
2391 	 * Usually grant references are more scarce than memory, so we
2392 	 * attempt to acquire a grant reference first.
2393 	 */
2394 	gref = gref_get(xnfp);
2395 	if (gref == INVALID_GRANT_REF)
2396 		return (NULL);
2397 
2398 	bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
2399 	if (bufp == NULL) {
2400 		gref_put(xnfp, gref);
2401 		return (NULL);
2402 	}
2403 
2404 	ASSERT(bufp->grant_ref == INVALID_GRANT_REF);
2405 
2406 	bufp->grant_ref = gref;
2407 
2408 	if (bufp->gen != xnfp->xnf_gen)
2409 		xnf_buf_refresh(bufp);
2410 
2411 	gnttab_grant_foreign_access_ref(bufp->grant_ref,
2412 	    xvdi_get_oeid(bufp->xnfp->xnf_devinfo),
2413 	    bufp->buf_mfn, readonly ? 1 : 0);
2414 
2415 	atomic_inc_64(&xnfp->xnf_stat_buf_outstanding);
2416 
2417 	return (bufp);
2418 }
2419 
2420 static void
2421 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
2422 {
2423 	if (bufp->grant_ref != INVALID_GRANT_REF) {
2424 		(void) gnttab_end_foreign_access_ref(
2425 		    bufp->grant_ref, readonly ? 1 : 0);
2426 		gref_put(xnfp, bufp->grant_ref);
2427 		bufp->grant_ref = INVALID_GRANT_REF;
2428 	}
2429 
2430 	kmem_cache_free(xnfp->xnf_buf_cache, bufp);
2431 
2432 	atomic_dec_64(&xnfp->xnf_stat_buf_outstanding);
2433 }
2434 
2435 /*
2436  * Refresh any cached data about a buffer after resume.
2437  */
2438 static void
2439 xnf_buf_refresh(xnf_buf_t *bdesc)
2440 {
2441 	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2442 	bdesc->gen = bdesc->xnfp->xnf_gen;
2443 }
2444 
2445 /*
2446  * Streams `freeb' routine for `xnf_buf_t' when used as transmit
2447  * look-aside buffers.
2448  */
2449 static void
2450 xnf_buf_recycle(xnf_buf_t *bdesc)
2451 {
2452 	xnf_t *xnfp = bdesc->xnfp;
2453 
2454 	xnf_buf_put(xnfp, bdesc, B_TRUE);
2455 }
2456 
2457 static int
2458 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
2459 {
2460 	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2461 	xnf_txbuf_t *txp = buf;
2462 	xnf_t *xnfp = arg;
2463 
2464 	if (kmflag & KM_NOSLEEP)
2465 		ddiflags = DDI_DMA_DONTWAIT;
2466 
2467 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
2468 	    ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) {
2469 		ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2470 		return (-1);
2471 	}
2472 
2473 	return (0);
2474 }
2475 
2476 static void
2477 xnf_tx_buf_destructor(void *buf, void *arg)
2478 {
2479 	_NOTE(ARGUNUSED(arg));
2480 	xnf_txbuf_t *txp = buf;
2481 
2482 	ddi_dma_free_handle(&txp->tx_dma_handle);
2483 }
2484 
/*
 * Statistics.
 *
 * Names of the auxiliary named kstats, in the order in which they are
 * initialised by xnf_kstat_init() and filled in by
 * xnf_kstat_aux_update().  The order here and in the update routine
 * must stay in step.
 */
static char *xnf_aux_statistics[] = {
	/* Checksum offload. */
	"tx_cksum_deferred",
	"rx_cksum_no_need",

	/* Interrupts and transmit path. */
	"interrupts",
	"unclaimed_interrupts",
	"tx_pullup",
	"tx_pagebndry",
	"tx_attempt",

	/* Buffers, grant references and receive allocation failures. */
	"buf_allocated",
	"buf_outstanding",
	"gref_outstanding",
	"gref_failure",
	"gref_peak",
	"rx_allocb_fail",
	"rx_desballoc_fail",
};
2504 
2505 static int
2506 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2507 {
2508 	xnf_t *xnfp;
2509 	kstat_named_t *knp;
2510 
2511 	if (flag != KSTAT_READ)
2512 		return (EACCES);
2513 
2514 	xnfp = ksp->ks_private;
2515 	knp = ksp->ks_data;
2516 
2517 	/*
2518 	 * Assignment order must match that of the names in
2519 	 * xnf_aux_statistics.
2520 	 */
2521 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2522 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2523 
2524 	(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2525 	(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2526 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2527 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry;
2528 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt;
2529 
2530 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
2531 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
2532 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding;
2533 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_failure;
2534 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_peak;
2535 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail;
2536 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail;
2537 
2538 	return (0);
2539 }
2540 
2541 static boolean_t
2542 xnf_kstat_init(xnf_t *xnfp)
2543 {
2544 	int nstat = sizeof (xnf_aux_statistics) /
2545 	    sizeof (xnf_aux_statistics[0]);
2546 	char **cp = xnf_aux_statistics;
2547 	kstat_named_t *knp;
2548 
2549 	/*
2550 	 * Create and initialise kstats.
2551 	 */
2552 	if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2553 	    ddi_get_instance(xnfp->xnf_devinfo),
2554 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
2555 	    nstat, 0)) == NULL)
2556 		return (B_FALSE);
2557 
2558 	xnfp->xnf_kstat_aux->ks_private = xnfp;
2559 	xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2560 
2561 	knp = xnfp->xnf_kstat_aux->ks_data;
2562 	while (nstat > 0) {
2563 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2564 
2565 		knp++;
2566 		cp++;
2567 		nstat--;
2568 	}
2569 
2570 	kstat_install(xnfp->xnf_kstat_aux);
2571 
2572 	return (B_TRUE);
2573 }
2574 
2575 static int
2576 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2577 {
2578 	xnf_t *xnfp = arg;
2579 
2580 	mutex_enter(&xnfp->xnf_rxlock);
2581 	mutex_enter(&xnfp->xnf_txlock);
2582 
2583 #define	mac_stat(q, r)				\
2584 	case (MAC_STAT_##q):			\
2585 		*val = xnfp->xnf_stat_##r;	\
2586 		break
2587 
2588 #define	ether_stat(q, r)			\
2589 	case (ETHER_STAT_##q):			\
2590 		*val = xnfp->xnf_stat_##r;	\
2591 		break
2592 
2593 	switch (stat) {
2594 
2595 	mac_stat(IPACKETS, ipackets);
2596 	mac_stat(OPACKETS, opackets);
2597 	mac_stat(RBYTES, rbytes);
2598 	mac_stat(OBYTES, obytes);
2599 	mac_stat(NORCVBUF, norxbuf);
2600 	mac_stat(IERRORS, errrx);
2601 	mac_stat(NOXMTBUF, tx_defer);
2602 
2603 	ether_stat(MACRCV_ERRORS, mac_rcv_error);
2604 	ether_stat(TOOSHORT_ERRORS, runt);
2605 
2606 	/* always claim to be in full duplex mode */
2607 	case ETHER_STAT_LINK_DUPLEX:
2608 		*val = LINK_DUPLEX_FULL;
2609 		break;
2610 
2611 	/* always claim to be at 1Gb/s link speed */
2612 	case MAC_STAT_IFSPEED:
2613 		*val = 1000000000ull;
2614 		break;
2615 
2616 	default:
2617 		mutex_exit(&xnfp->xnf_txlock);
2618 		mutex_exit(&xnfp->xnf_rxlock);
2619 
2620 		return (ENOTSUP);
2621 	}
2622 
2623 #undef mac_stat
2624 #undef ether_stat
2625 
2626 	mutex_exit(&xnfp->xnf_txlock);
2627 	mutex_exit(&xnfp->xnf_rxlock);
2628 
2629 	return (0);
2630 }
2631 
2632 static boolean_t
2633 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
2634 {
2635 	_NOTE(ARGUNUSED(arg));
2636 
2637 	switch (cap) {
2638 	case MAC_CAPAB_HCKSUM: {
2639 		uint32_t *capab = cap_data;
2640 
2641 		/*
2642 		 * Whilst the flag used to communicate with the IO
2643 		 * domain is called "NETTXF_csum_blank", the checksum
2644 		 * in the packet must contain the pseudo-header
2645 		 * checksum and not zero.
2646 		 *
2647 		 * To help out the IO domain, we might use
2648 		 * HCKSUM_INET_PARTIAL. Unfortunately our stack will
2649 		 * then use checksum offload for IPv6 packets, which
2650 		 * the IO domain can't handle.
2651 		 *
2652 		 * As a result, we declare outselves capable of
2653 		 * HCKSUM_INET_FULL_V4. This means that we receive
2654 		 * IPv4 packets from the stack with a blank checksum
2655 		 * field and must insert the pseudo-header checksum
2656 		 * before passing the packet to the IO domain.
2657 		 */
2658 		*capab = HCKSUM_INET_FULL_V4;
2659 		break;
2660 	}
2661 	default:
2662 		return (B_FALSE);
2663 	}
2664 
2665 	return (B_TRUE);
2666 }
2667 
2668 /*
2669  * The state of the peer has changed - react accordingly.
2670  */
2671 static void
2672 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
2673     void *arg, void *impl_data)
2674 {
2675 	_NOTE(ARGUNUSED(id, arg));
2676 	xnf_t *xnfp = ddi_get_driver_private(dip);
2677 	XenbusState new_state = *(XenbusState *)impl_data;
2678 
2679 	ASSERT(xnfp != NULL);
2680 
2681 	switch (new_state) {
2682 	case XenbusStateUnknown:
2683 	case XenbusStateInitialising:
2684 	case XenbusStateInitialised:
2685 	case XenbusStateClosing:
2686 	case XenbusStateClosed:
2687 	case XenbusStateReconfiguring:
2688 	case XenbusStateReconfigured:
2689 		break;
2690 
2691 	case XenbusStateInitWait:
2692 		xnf_read_config(xnfp);
2693 
2694 		if (!xnfp->xnf_be_rx_copy) {
2695 			cmn_err(CE_WARN,
2696 			    "The xnf driver requires a dom0 that "
2697 			    "supports 'feature-rx-copy'.");
2698 			(void) xvdi_switch_state(xnfp->xnf_devinfo,
2699 			    XBT_NULL, XenbusStateClosed);
2700 			break;
2701 		}
2702 
2703 		/*
2704 		 * Connect to the backend.
2705 		 */
2706 		xnf_be_connect(xnfp);
2707 
2708 		/*
2709 		 * Our MAC address as discovered by xnf_read_config().
2710 		 */
2711 		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
2712 
2713 		break;
2714 
2715 	case XenbusStateConnected:
2716 		mutex_enter(&xnfp->xnf_rxlock);
2717 		mutex_enter(&xnfp->xnf_txlock);
2718 
2719 		xnfp->xnf_connected = B_TRUE;
2720 		/*
2721 		 * Wake up any threads waiting to send data to
2722 		 * backend.
2723 		 */
2724 		cv_broadcast(&xnfp->xnf_cv_state);
2725 
2726 		mutex_exit(&xnfp->xnf_txlock);
2727 		mutex_exit(&xnfp->xnf_rxlock);
2728 
2729 		/*
2730 		 * Kick the peer in case it missed any transmits
2731 		 * request in the TX ring.
2732 		 */
2733 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
2734 
2735 		/*
2736 		 * There may already be completed receive requests in
2737 		 * the ring sent by backend after it gets connected
2738 		 * but before we see its state change here, so we call
2739 		 * xnf_intr() to handle them, if any.
2740 		 */
2741 		(void) xnf_intr((caddr_t)xnfp);
2742 
2743 		/*
2744 		 * Mark the link up now that we are connected.
2745 		 */
2746 		mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
2747 
2748 		/*
2749 		 * Tell the backend about the multicast addresses in
2750 		 * which we are interested.
2751 		 */
2752 		mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
2753 
2754 		break;
2755 
2756 	default:
2757 		break;
2758 	}
2759 }
2760