xref: /illumos-gate/usr/src/uts/common/xen/io/xnf.c (revision 843e19887f64dde75055cf8842fc4db2171eff45)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  *
31  * Copyright (c) 2004 Christian Limpach.
32  * All rights reserved.
33  *
34  * Redistribution and use in source and binary forms, with or without
35  * modification, are permitted provided that the following conditions
36  * are met:
37  * 1. Redistributions of source code must retain the above copyright
38  *    notice, this list of conditions and the following disclaimer.
39  * 2. Redistributions in binary form must reproduce the above copyright
40  *    notice, this list of conditions and the following disclaimer in the
41  *    documentation and/or other materials provided with the distribution.
42  * 3. This section intentionally left blank.
43  * 4. The name of the author may not be used to endorse or promote products
44  *    derived from this software without specific prior written permission.
45  *
46  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
47  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
48  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
49  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
50  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
51  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
52  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
53  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
54  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
55  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
56  */
57 /*
58  * Section 3 of the above license was updated in response to bug 6379571.
59  */
60 
61 /*
62  * xnf.c - Nemo-based network driver for domU
63  */
64 
65 #include <sys/types.h>
66 #include <sys/hypervisor.h>
67 #include <sys/debug.h>
68 #include <sys/errno.h>
69 #include <sys/param.h>
70 #include <sys/sysmacros.h>
71 #include <sys/systm.h>
72 #include <sys/stropts.h>
73 #include <sys/stream.h>
74 #include <sys/strsubr.h>
75 #include <sys/kmem.h>
76 #include <sys/conf.h>
77 #include <sys/ddi.h>
78 #include <sys/devops.h>
79 #include <sys/sunddi.h>
80 #include <sys/sunndi.h>
81 #include <sys/ksynch.h>
82 #include <sys/dlpi.h>
83 #include <sys/ethernet.h>
84 #include <sys/strsun.h>
85 #include <sys/pattr.h>
86 #include <inet/common.h>
87 #include <inet/ip.h>
88 #include <sys/stat.h>
89 #include <sys/modctl.h>
90 #include <sys/mac.h>
91 #include <sys/mac_ether.h>
92 #include <sys/atomic.h>
93 #include <sys/errno.h>
94 #include <sys/machsystm.h>
95 #include <sys/bootconf.h>
96 #include <sys/bootsvcs.h>
97 #include <sys/bootinfo.h>
98 #include <sys/promif.h>
99 #include <sys/archsystm.h>
100 #include <sys/gnttab.h>
101 #include <sys/mach_mmu.h>
102 #include <xen/public/memory.h>
103 
104 #include "xnf.h"
105 
106 #include <sys/evtchn_impl.h>
107 #include <sys/balloon_impl.h>
108 #include <xen/sys/xendev.h>
109 
/*
 * Declarations and module linkage.
 */

/* Human-readable driver description used in the module linkage. */
#define	IDENT	"Virtual Ethernet driver"

/*
 * DEBUG and lint builds compile in the XNF_DEBUG tracing code; which
 * traces are actually printed is selected at run time by the bits set
 * in xnfdebug.
 */
#if defined(DEBUG) || defined(__lint)
#define	XNF_DEBUG
int	xnfdebug = 0;
#endif

/*
 * Convert a byte address into a page number.
 *
 * On a 32-bit PAE system physical and machine addresses are wider than
 * 32 bits.  ddi_btop() on such systems takes an unsigned long argument,
 * so an address above 4G would be truncated before ddi_btop() ever saw
 * it.  To avoid that, the shift is coded here instead.
 */
#define	xnf_btop(addr)	((addr) >> PAGESHIFT)
128 
129 boolean_t	xnf_cksum_offload = B_TRUE;
130 /*
131  * Should pages used for transmit be readonly for the peer?
132  */
133 boolean_t	xnf_tx_pages_readonly = B_FALSE;
134 /*
135  * Packets under this size are bcopied instead of using desballoc.
136  * Choose a value > XNF_FRAMESIZE (1514) to force the receive path to
137  * always copy.
138  */
139 unsigned int	xnf_rx_bcopy_thresh = 64;
140 
141 unsigned int	xnf_max_tx_frags = 1;
142 
143 /* Required system entry points */
144 static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
145 static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);
146 
147 /* Required driver entry points for Nemo */
148 static int	xnf_start(void *);
149 static void	xnf_stop(void *);
150 static int	xnf_set_mac_addr(void *, const uint8_t *);
151 static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
152 static int	xnf_set_promiscuous(void *, boolean_t);
153 static mblk_t	*xnf_send(void *, mblk_t *);
154 static uint_t	xnf_intr(caddr_t);
155 static int	xnf_stat(void *, uint_t, uint64_t *);
156 static void	xnf_blank(void *, time_t, uint_t);
157 static void	xnf_resources(void *);
158 static void	xnf_ioctl(void *, queue_t *, mblk_t *);
159 static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
160 
161 /* Driver private functions */
162 static int xnf_alloc_dma_resources(xnf_t *);
163 static void xnf_release_dma_resources(xnf_t *);
164 static mblk_t *xnf_process_recv(xnf_t *);
165 static void xnf_rcv_complete(struct xnf_buffer_desc *);
166 static void xnf_release_mblks(xnf_t *);
167 static struct xnf_buffer_desc *xnf_alloc_xmit_buffer(xnf_t *);
168 static struct xnf_buffer_desc *xnf_alloc_buffer(xnf_t *);
169 static struct xnf_buffer_desc *xnf_get_xmit_buffer(xnf_t *);
170 static struct xnf_buffer_desc *xnf_get_buffer(xnf_t *);
171 static void xnf_free_buffer(struct xnf_buffer_desc *);
172 static void xnf_free_xmit_buffer(struct xnf_buffer_desc *);
173 void xnf_send_driver_status(int, int);
174 static void rx_buffer_hang(xnf_t *, struct xnf_buffer_desc *);
175 static int xnf_clean_tx_ring(xnf_t  *);
176 static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
177     void *, void *);
178 
179 /*
180  * XXPV dme: remove MC_IOCTL?
181  */
182 static mac_callbacks_t xnf_callbacks = {
183 	MC_RESOURCES | MC_IOCTL | MC_GETCAPAB,
184 	xnf_stat,
185 	xnf_start,
186 	xnf_stop,
187 	xnf_set_promiscuous,
188 	xnf_set_multicast,
189 	xnf_set_mac_addr,
190 	xnf_send,
191 	xnf_resources,
192 	xnf_ioctl,
193 	xnf_getcapab
194 };
195 
196 #define	GRANT_INVALID_REF	0
197 int xnf_recv_bufs_lowat = 4 * NET_RX_RING_SIZE;
198 int xnf_recv_bufs_hiwat = 8 * NET_RX_RING_SIZE; /* default max */
199 
200 /* DMA attributes for network ring buffer */
201 static ddi_dma_attr_t ringbuf_dma_attr = {
202 	DMA_ATTR_V0,		/* version of this structure */
203 	0,			/* lowest usable address */
204 	0xffffffffffffffffULL,	/* highest usable address */
205 	0x7fffffff,		/* maximum DMAable byte count */
206 	MMU_PAGESIZE,		/* alignment in bytes */
207 	0x7ff,			/* bitmap of burst sizes */
208 	1,			/* minimum transfer */
209 	0xffffffffU,		/* maximum transfer */
210 	0xffffffffffffffffULL,	/* maximum segment length */
211 	1,			/* maximum number of segments */
212 	1,			/* granularity */
213 	0,			/* flags (reserved) */
214 };
215 
216 /* DMA attributes for transmit data */
217 static ddi_dma_attr_t tx_buffer_dma_attr = {
218 	DMA_ATTR_V0,		/* version of this structure */
219 	0,			/* lowest usable address */
220 	0xffffffffffffffffULL,	/* highest usable address */
221 	0x7fffffff,		/* maximum DMAable byte count */
222 	MMU_PAGESIZE,		/* alignment in bytes */
223 	0x7ff,			/* bitmap of burst sizes */
224 	1,			/* minimum transfer */
225 	0xffffffffU,		/* maximum transfer */
226 	0xffffffffffffffffULL,	/* maximum segment length */
227 	1,			/* maximum number of segments */
228 	1,			/* granularity */
229 	0,			/* flags (reserved) */
230 };
231 
232 /* DMA attributes for a receive buffer */
233 static ddi_dma_attr_t rx_buffer_dma_attr = {
234 	DMA_ATTR_V0,		/* version of this structure */
235 	0,			/* lowest usable address */
236 	0xffffffffffffffffULL,	/* highest usable address */
237 	0x7fffffff,		/* maximum DMAable byte count */
238 	MMU_PAGESIZE,		/* alignment in bytes */
239 	0x7ff,			/* bitmap of burst sizes */
240 	1,			/* minimum transfer */
241 	0xffffffffU,		/* maximum transfer */
242 	0xffffffffffffffffULL,	/* maximum segment length */
243 	1,			/* maximum number of segments */
244 	1,			/* granularity */
245 	0,			/* flags (reserved) */
246 };
247 
248 /* DMA access attributes for registers and descriptors */
249 static ddi_device_acc_attr_t accattr = {
250 	DDI_DEVICE_ATTR_V0,
251 	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
252 	DDI_STRICTORDER_ACC
253 };
254 
255 /* DMA access attributes for data: NOT to be byte swapped. */
256 static ddi_device_acc_attr_t data_accattr = {
257 	DDI_DEVICE_ATTR_V0,
258 	DDI_NEVERSWAP_ACC,
259 	DDI_STRICTORDER_ACC
260 };
261 
262 unsigned char xnf_broadcastaddr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
263 int xnf_diagnose = 0; /* Patchable global for diagnostic purposes */
264 
265 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
266     nodev, NULL, D_MP, NULL);
267 
268 static struct modldrv xnf_modldrv = {
269 	&mod_driverops,		/* Type of module.  This one is a driver */
270 	IDENT " %I%",		/* short description */
271 	&xnf_dev_ops		/* driver specific ops */
272 };
273 
274 static struct modlinkage modlinkage = {
275 	MODREV_1, &xnf_modldrv, NULL
276 };
277 
278 int
279 _init(void)
280 {
281 	int r;
282 
283 	mac_init_ops(&xnf_dev_ops, "xnf");
284 	r = mod_install(&modlinkage);
285 	if (r != DDI_SUCCESS)
286 		mac_fini_ops(&xnf_dev_ops);
287 
288 	return (r);
289 }
290 
291 int
292 _fini(void)
293 {
294 	return (EBUSY); /* XXPV dme: should be removable */
295 }
296 
297 int
298 _info(struct modinfo *modinfop)
299 {
300 	return (mod_info(&modlinkage, modinfop));
301 }
302 
/*
 * Statistics.
 *
 * Names of the auxiliary named kstats, one per entry.  The order here
 * must match the order in which xnf_kstat_aux_update() stores values.
 */
/* XXPV: most of these names need re-"nice"ing */
static char *xnf_aux_statistics[] = {
	"tx_cksum_deferred",
	"rx_cksum_no_need",
	"intr",
	"xmit_pullup",
	"xmit_pagebndry",
	"xmit_attempt",
	"rx_no_ringbuf",
	"mac_rcv_error",
	"runt",
};
318 
319 static int
320 xnf_kstat_aux_update(kstat_t *ksp, int flag)
321 {
322 	xnf_t *xnfp;
323 	kstat_named_t *knp;
324 
325 	if (flag != KSTAT_READ)
326 		return (EACCES);
327 
328 	xnfp = ksp->ks_private;
329 	knp = ksp->ks_data;
330 
331 	/*
332 	 * Assignment order should match that of the names in
333 	 * xnf_aux_statistics.
334 	 */
335 	(knp++)->value.ui64 = xnfp->stat_tx_cksum_deferred;
336 	(knp++)->value.ui64 = xnfp->stat_rx_cksum_no_need;
337 
338 	(knp++)->value.ui64 = xnfp->stat_intr;
339 	(knp++)->value.ui64 = xnfp->stat_xmit_pullup;
340 	(knp++)->value.ui64 = xnfp->stat_xmit_pagebndry;
341 	(knp++)->value.ui64 = xnfp->stat_xmit_attempt;
342 	(knp++)->value.ui64 = xnfp->stat_rx_no_ringbuf;
343 	(knp++)->value.ui64 = xnfp->stat_mac_rcv_error;
344 	(knp++)->value.ui64 = xnfp->stat_runt;
345 
346 	return (0);
347 }
348 
349 static boolean_t
350 xnf_kstat_init(xnf_t *xnfp)
351 {
352 	int nstat = sizeof (xnf_aux_statistics) /
353 	    sizeof (xnf_aux_statistics[0]);
354 	char **cp = xnf_aux_statistics;
355 	kstat_named_t *knp;
356 
357 	/*
358 	 * Create and initialise kstats.
359 	 */
360 	if ((xnfp->kstat_aux = kstat_create("xnf",
361 	    ddi_get_instance(xnfp->devinfo),
362 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
363 	    nstat, 0)) == NULL)
364 		return (B_FALSE);
365 
366 	xnfp->kstat_aux->ks_private = xnfp;
367 	xnfp->kstat_aux->ks_update = xnf_kstat_aux_update;
368 
369 	knp = xnfp->kstat_aux->ks_data;
370 	while (nstat > 0) {
371 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
372 
373 		knp++;
374 		cp++;
375 		nstat--;
376 	}
377 
378 	kstat_install(xnfp->kstat_aux);
379 
380 	return (B_TRUE);
381 }
382 
383 static int
384 xnf_setup_rings(xnf_t *xnfp)
385 {
386 	int			ix, err;
387 	RING_IDX		i;
388 	struct xnf_buffer_desc *bdesc, *rbp;
389 	struct xenbus_device *xsd;
390 	domid_t oeid;
391 
392 	oeid = xvdi_get_oeid(xnfp->devinfo);
393 	xsd = xvdi_get_xsd(xnfp->devinfo);
394 
395 	if (xnfp->tx_ring_ref != GRANT_INVALID_REF)
396 		gnttab_end_foreign_access(xnfp->tx_ring_ref, 0, 0);
397 
398 	err = gnttab_grant_foreign_access(oeid,
399 	    xnf_btop(pa_to_ma(xnfp->tx_ring_phys_addr)), 0);
400 	if (err <= 0) {
401 		err = -err;
402 		xenbus_dev_error(xsd, err, "granting access to tx ring page");
403 		goto out;
404 	}
405 	xnfp->tx_ring_ref = (grant_ref_t)err;
406 
407 	if (xnfp->rx_ring_ref != GRANT_INVALID_REF)
408 		gnttab_end_foreign_access(xnfp->rx_ring_ref, 0, 0);
409 
410 	err = gnttab_grant_foreign_access(oeid,
411 	    xnf_btop(pa_to_ma(xnfp->rx_ring_phys_addr)), 0);
412 	if (err <= 0) {
413 		err = -err;
414 		xenbus_dev_error(xsd, err, "granting access to rx ring page");
415 		goto out;
416 	}
417 	xnfp->rx_ring_ref = (grant_ref_t)err;
418 
419 
420 	mutex_enter(&xnfp->intrlock);
421 
422 	/*
423 	 * Cleanup the TX ring.  We just clean up any valid tx_pktinfo structs
424 	 * and reset the ring.  Note that this can lose packets after a resume,
425 	 * but we expect to stagger on.
426 	 */
427 	mutex_enter(&xnfp->txlock);
428 
429 	for (i = 0; i < xnfp->n_xmits; i++) {
430 		struct tx_pktinfo *txp = &xnfp->tx_pkt_info[i];
431 
432 		txp->id = i + 1;
433 
434 		if (txp->grant_ref == GRANT_INVALID_REF) {
435 			ASSERT(txp->mp == NULL);
436 			ASSERT(txp->bdesc == NULL);
437 			continue;
438 		}
439 
440 		if (gnttab_query_foreign_access(txp->grant_ref) != 0)
441 			panic("tx grant still in use by backend domain");
442 
443 		freemsg(txp->mp);
444 		txp->mp = NULL;
445 
446 		(void) ddi_dma_unbind_handle(txp->dma_handle);
447 
448 		if (txp->bdesc != NULL) {
449 			xnf_free_xmit_buffer(txp->bdesc);
450 			txp->bdesc = NULL;
451 		}
452 
453 		(void) gnttab_end_foreign_access_ref(txp->grant_ref,
454 		    xnfp->tx_pages_readonly);
455 		gnttab_release_grant_reference(&xnfp->gref_tx_head,
456 		    txp->grant_ref);
457 		txp->grant_ref = GRANT_INVALID_REF;
458 	}
459 
460 	xnfp->tx_pkt_id_list = 0;
461 	xnfp->tx_ring.rsp_cons = 0;
462 	xnfp->tx_ring.sring->req_prod = 0;
463 	xnfp->tx_ring.sring->rsp_prod = 0;
464 	xnfp->tx_ring.sring->rsp_event = 1;
465 
466 	mutex_exit(&xnfp->txlock);
467 
468 	/*
469 	 * Rebuild the RX ring.  We have to rebuild the RX ring because some of
470 	 * our pages are currently flipped out so we can't just free the RX
471 	 * buffers.  Reclaim any unprocessed recv buffers, they won't be
472 	 * useable anyway since the mfn's they refer to are no longer valid.
473 	 * Grant the backend domain access to each hung rx buffer.
474 	 */
475 	i = xnfp->rx_ring.rsp_cons;
476 	while (i++ != xnfp->rx_ring.sring->req_prod) {
477 		volatile netif_rx_request_t	*rxrp;
478 
479 		rxrp = RING_GET_REQUEST(&xnfp->rx_ring, i);
480 		ix = rxrp - RING_GET_REQUEST(&xnfp->rx_ring, 0);
481 		rbp = xnfp->rxpkt_bufptr[ix];
482 		if (rbp != NULL) {
483 			ASSERT(rbp->grant_ref != GRANT_INVALID_REF);
484 			gnttab_grant_foreign_transfer_ref(rbp->grant_ref,
485 			    oeid);
486 			rxrp->id = ix;
487 			rxrp->gref = rbp->grant_ref;
488 		}
489 	}
490 	/*
491 	 * Reset the ring pointers to initial state.
492 	 * Hang buffers for any empty ring slots.
493 	 */
494 	xnfp->rx_ring.rsp_cons = 0;
495 	xnfp->rx_ring.sring->req_prod = 0;
496 	xnfp->rx_ring.sring->rsp_prod = 0;
497 	xnfp->rx_ring.sring->rsp_event = 1;
498 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
499 		xnfp->rx_ring.req_prod_pvt = i;
500 		if (xnfp->rxpkt_bufptr[i] != NULL)
501 			continue;
502 		if ((bdesc = xnf_get_buffer(xnfp)) == NULL)
503 			break;
504 		rx_buffer_hang(xnfp, bdesc);
505 	}
506 	xnfp->rx_ring.req_prod_pvt = i;
507 	/* LINTED: constant in conditional context */
508 	RING_PUSH_REQUESTS(&xnfp->rx_ring);
509 
510 	mutex_exit(&xnfp->intrlock);
511 
512 	return (0);
513 
514 out:
515 	if (xnfp->tx_ring_ref != GRANT_INVALID_REF)
516 		gnttab_end_foreign_access(xnfp->tx_ring_ref, 0, 0);
517 	xnfp->tx_ring_ref = GRANT_INVALID_REF;
518 
519 	if (xnfp->rx_ring_ref != GRANT_INVALID_REF)
520 		gnttab_end_foreign_access(xnfp->rx_ring_ref, 0, 0);
521 	xnfp->rx_ring_ref = GRANT_INVALID_REF;
522 
523 	return (err);
524 }
525 
526 /*
527  * Connect driver to back end, called to set up communication with
528  * back end driver both initially and on resume after restore/migrate.
529  */
530 void
531 xnf_be_connect(xnf_t *xnfp)
532 {
533 	char		mac[ETHERADDRL * 3];
534 	const char	*message;
535 	xenbus_transaction_t xbt;
536 	struct xenbus_device *xsd;
537 	char		*xsname;
538 	int		err, be_no_cksum_offload;
539 
540 	ASSERT(!xnfp->connected);
541 
542 	xsd = xvdi_get_xsd(xnfp->devinfo);
543 	xsname = xvdi_get_xsname(xnfp->devinfo);
544 
545 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->devinfo), "mac",
546 	    "%s", (char *)&mac[0]);
547 	if (err != 0) {
548 		/*
549 		 * bad: we're supposed to be set up with a proper mac
550 		 * addr. at this point
551 		 */
552 		cmn_err(CE_WARN, "%s%d: no mac address",
553 		    ddi_driver_name(xnfp->devinfo),
554 		    ddi_get_instance(xnfp->devinfo));
555 			return;
556 	}
557 
558 	if (ether_aton(mac, xnfp->mac_addr) != ETHERADDRL) {
559 		err = ENOENT;
560 		xenbus_dev_error(xsd, ENOENT, "parsing %s/mac", xsname);
561 		return;
562 	}
563 
564 	err = xnf_setup_rings(xnfp);
565 	if (err != 0) {
566 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
567 		xenbus_dev_error(xsd, err, "setting up ring");
568 		return;
569 	}
570 
571 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->devinfo),
572 	    "feature-no-csum-offload", "%d", &be_no_cksum_offload);
573 	/*
574 	 * If we fail to read the store we assume that the key is
575 	 * absent, implying an older domain at the far end.  Older
576 	 * domains always support checksum offload.
577 	 */
578 	if (err != 0)
579 		be_no_cksum_offload = 0;
580 	/*
581 	 * If the far end cannot do checksum offload or we do not wish
582 	 * to do it, disable it.
583 	 */
584 	if ((be_no_cksum_offload == 1) || !xnfp->cksum_offload)
585 		xnfp->cksum_offload = B_FALSE;
586 
587 again:
588 	err = xenbus_transaction_start(&xbt);
589 	if (err != 0) {
590 		xenbus_dev_error(xsd, EIO, "starting transaction");
591 		return;
592 	}
593 
594 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
595 	    xnfp->tx_ring_ref);
596 	if (err != 0) {
597 		message = "writing tx ring-ref";
598 		goto abort_transaction;
599 	}
600 
601 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
602 	    xnfp->rx_ring_ref);
603 	if (err != 0) {
604 		message = "writing rx ring-ref";
605 		goto abort_transaction;
606 	}
607 
608 	err = xenbus_printf(xbt, xsname, "event-channel", "%u", xnfp->evtchn);
609 	if (err != 0) {
610 		message = "writing event-channel";
611 		goto abort_transaction;
612 	}
613 
614 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
615 	if (err != 0) {
616 		message = "writing feature-rx-notify";
617 		goto abort_transaction;
618 	}
619 
620 	if (!xnfp->tx_pages_readonly) {
621 		err = xenbus_printf(xbt, xsname, "feature-tx-writable",
622 		    "%d", 1);
623 		if (err != 0) {
624 			message = "writing feature-tx-writable";
625 			goto abort_transaction;
626 		}
627 	}
628 
629 	err = xenbus_printf(xbt, xsname, "feature-no-csum-offload", "%d",
630 	    xnfp->cksum_offload ? 0 : 1);
631 	if (err != 0) {
632 		message = "writing feature-no-csum-offload";
633 		goto abort_transaction;
634 	}
635 
636 	err = xenbus_printf(xbt, xsname, "state", "%d", XenbusStateConnected);
637 	if (err != 0) {
638 		message = "writing frontend XenbusStateConnected";
639 		goto abort_transaction;
640 	}
641 
642 	err = xenbus_transaction_end(xbt, 0);
643 	if (err != 0) {
644 		if (err == EAGAIN)
645 			goto again;
646 		xenbus_dev_error(xsd, err, "completing transaction");
647 	}
648 
649 	return;
650 
651 abort_transaction:
652 	(void) xenbus_transaction_end(xbt, 1);
653 	xenbus_dev_error(xsd, err, "%s", message);
654 }
655 
656 /*
657  *  attach(9E) -- Attach a device to the system
658  *
659  *  Called once for each board successfully probed.
660  */
661 static int
662 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
663 {
664 	mac_register_t *macp;
665 	xnf_t *xnfp;
666 	int err;
667 
668 #ifdef XNF_DEBUG
669 	if (xnfdebug & XNF_DEBUG_DDI)
670 		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
671 		    (void *)devinfo);
672 #endif
673 
674 	switch (cmd) {
675 	case DDI_RESUME:
676 		xnfp = ddi_get_driver_private(devinfo);
677 
678 		(void) xvdi_resume(devinfo);
679 		(void) xvdi_alloc_evtchn(devinfo);
680 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
681 		    (caddr_t)xnfp);
682 		xnfp->evtchn = xvdi_get_evtchn(devinfo);
683 		xnf_be_connect(xnfp);
684 		/*
685 		 * Our MAC address didn't necessarily change, but
686 		 * given that we may be resuming this OS instance
687 		 * on a different machine (or on the same one and got a
688 		 * different MAC address because we didn't specify one of
689 		 * our own), it's useful to claim that
690 		 * it changed in order that IP send out a
691 		 * gratuitous ARP.
692 		 */
693 		mac_unicst_update(xnfp->mh, xnfp->mac_addr);
694 		return (DDI_SUCCESS);
695 
696 	case DDI_ATTACH:
697 		break;
698 
699 	default:
700 		return (DDI_FAILURE);
701 	}
702 
703 	/*
704 	 *  Allocate gld_mac_info_t and xnf_instance structures
705 	 */
706 	macp = mac_alloc(MAC_VERSION);
707 	if (macp == NULL)
708 		return (DDI_FAILURE);
709 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
710 
711 	macp->m_dip = devinfo;
712 	macp->m_driver = xnfp;
713 	xnfp->devinfo = devinfo;
714 
715 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
716 	macp->m_src_addr = xnfp->mac_addr;
717 	macp->m_callbacks = &xnf_callbacks;
718 	macp->m_min_sdu = 0;
719 	macp->m_max_sdu = XNF_MAXPKT;
720 
721 	xnfp->running = B_FALSE;
722 	xnfp->connected = B_FALSE;
723 	xnfp->cksum_offload = xnf_cksum_offload;
724 	xnfp->tx_pages_readonly = xnf_tx_pages_readonly;
725 
726 	/*
727 	 * Get the iblock cookie with which to initialize the mutexes.
728 	 */
729 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->icookie)
730 	    != DDI_SUCCESS)
731 		goto failure;
732 	/*
733 	 * Driver locking strategy: the txlock protects all paths
734 	 * through the driver, except the interrupt thread.
735 	 * If the interrupt thread needs to do something which could
736 	 * affect the operation of any other part of the driver,
737 	 * it needs to acquire the txlock mutex.
738 	 */
739 	mutex_init(&xnfp->tx_buf_mutex,
740 	    NULL, MUTEX_DRIVER, xnfp->icookie);
741 	mutex_init(&xnfp->rx_buf_mutex,
742 	    NULL, MUTEX_DRIVER, xnfp->icookie);
743 	mutex_init(&xnfp->txlock,
744 	    NULL, MUTEX_DRIVER, xnfp->icookie);
745 	mutex_init(&xnfp->intrlock,
746 	    NULL, MUTEX_DRIVER, xnfp->icookie);
747 	cv_init(&xnfp->cv, NULL, CV_DEFAULT, NULL);
748 
749 	if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
750 	    &xnfp->gref_tx_head) < 0) {
751 		cmn_err(CE_WARN, "xnf%d: can't alloc tx grant refs",
752 		    ddi_get_instance(xnfp->devinfo));
753 		goto late_failure;
754 	}
755 	if (gnttab_alloc_grant_references(NET_RX_RING_SIZE,
756 	    &xnfp->gref_rx_head) < 0) {
757 		cmn_err(CE_WARN, "xnf%d: can't alloc rx grant refs",
758 		    ddi_get_instance(xnfp->devinfo));
759 		goto late_failure;
760 	}
761 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
762 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
763 		    "driver data structures", ddi_get_instance(xnfp->devinfo));
764 		goto late_failure;
765 	}
766 
767 	xnfp->rx_ring.sring->rsp_event = xnfp->tx_ring.sring->rsp_event = 1;
768 
769 	xnfp->tx_ring_ref = GRANT_INVALID_REF;
770 	xnfp->rx_ring_ref = GRANT_INVALID_REF;
771 
772 	/* set driver private pointer now */
773 	ddi_set_driver_private(devinfo, xnfp);
774 
775 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change)
776 	    != DDI_SUCCESS)
777 		goto late_failure;
778 
779 	if (!xnf_kstat_init(xnfp))
780 		goto very_late_failure;
781 
782 	/*
783 	 * Allocate an event channel, add the interrupt handler and
784 	 * bind it to the event channel.
785 	 */
786 	(void) xvdi_alloc_evtchn(devinfo);
787 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
788 	xnfp->evtchn = xvdi_get_evtchn(devinfo);
789 
790 	/*
791 	 * connect to the backend
792 	 */
793 	xnf_be_connect(xnfp);
794 
795 	err = mac_register(macp, &xnfp->mh);
796 	mac_free(macp);
797 	macp = NULL;
798 	if (err != 0)
799 		goto very_very_late_failure;
800 
801 	return (DDI_SUCCESS);
802 
803 very_very_late_failure:
804 	kstat_delete(xnfp->kstat_aux);
805 
806 very_late_failure:
807 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
808 	ddi_remove_intr(devinfo, 0, xnfp->icookie);
809 	xnfp->evtchn = INVALID_EVTCHN;
810 
811 late_failure:
812 	xnf_release_dma_resources(xnfp);
813 	cv_destroy(&xnfp->cv);
814 	mutex_destroy(&xnfp->rx_buf_mutex);
815 	mutex_destroy(&xnfp->txlock);
816 	mutex_destroy(&xnfp->intrlock);
817 
818 failure:
819 	kmem_free(xnfp, sizeof (*xnfp));
820 	if (macp != NULL)
821 		mac_free(macp);
822 
823 	(void) xvdi_switch_state(devinfo, XBT_NULL, XenbusStateClosed);
824 
825 	return (DDI_FAILURE);
826 }
827 
828 /*  detach(9E) -- Detach a device from the system */
829 static int
830 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
831 {
832 	xnf_t *xnfp;		/* Our private device info */
833 	int i;
834 
835 #ifdef XNF_DEBUG
836 	if (xnfdebug & XNF_DEBUG_DDI)
837 		printf("xnf_detach(0x%p)\n", (void *)devinfo);
838 #endif
839 
840 	xnfp = ddi_get_driver_private(devinfo);
841 
842 	switch (cmd) {
843 	case DDI_SUSPEND:
844 		ddi_remove_intr(devinfo, 0, xnfp->icookie);
845 
846 		xvdi_suspend(devinfo);
847 
848 		mutex_enter(&xnfp->intrlock);
849 		mutex_enter(&xnfp->txlock);
850 
851 		xnfp->evtchn = INVALID_EVTCHN;
852 		xnfp->connected = B_FALSE;
853 		mutex_exit(&xnfp->txlock);
854 		mutex_exit(&xnfp->intrlock);
855 		return (DDI_SUCCESS);
856 
857 	case DDI_DETACH:
858 		break;
859 
860 	default:
861 		return (DDI_FAILURE);
862 	}
863 
864 	if (xnfp->connected)
865 		return (DDI_FAILURE);
866 
867 	/* Wait for receive buffers to be returned; give up after 5 seconds */
868 	i = 50;
869 
870 	mutex_enter(&xnfp->rx_buf_mutex);
871 	while (xnfp->rx_bufs_outstanding > 0) {
872 		mutex_exit(&xnfp->rx_buf_mutex);
873 		delay(drv_usectohz(100000));
874 		if (--i == 0) {
875 			cmn_err(CE_WARN,
876 			    "xnf%d: never reclaimed all the "
877 			    "receive buffers.  Still have %d "
878 			    "buffers outstanding.",
879 			    ddi_get_instance(xnfp->devinfo),
880 			    xnfp->rx_bufs_outstanding);
881 			return (DDI_FAILURE);
882 		}
883 		mutex_enter(&xnfp->rx_buf_mutex);
884 	}
885 	mutex_exit(&xnfp->rx_buf_mutex);
886 
887 	kstat_delete(xnfp->kstat_aux);
888 
889 	if (mac_unregister(xnfp->mh) != 0)
890 		return (DDI_FAILURE);
891 
892 	/* Stop the receiver */
893 	xnf_stop(xnfp);
894 
895 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
896 
897 	/* Remove the interrupt */
898 	ddi_remove_intr(devinfo, 0, xnfp->icookie);
899 
900 	/* Release any pending xmit mblks */
901 	xnf_release_mblks(xnfp);
902 
903 	/* Release all DMA resources */
904 	xnf_release_dma_resources(xnfp);
905 
906 	cv_destroy(&xnfp->cv);
907 	mutex_destroy(&xnfp->rx_buf_mutex);
908 	mutex_destroy(&xnfp->txlock);
909 	mutex_destroy(&xnfp->intrlock);
910 
911 	kmem_free(xnfp, sizeof (*xnfp));
912 
913 	return (DDI_SUCCESS);
914 }
915 
916 /*
917  *  xnf_set_mac_addr() -- set the physical network address on the board.
918  */
919 /*ARGSUSED*/
920 static int
921 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
922 {
923 	xnf_t *xnfp = arg;
924 
925 #ifdef XNF_DEBUG
926 	if (xnfdebug & XNF_DEBUG_TRACE)
927 		printf("xnf%d: set_mac_addr(0x%p): "
928 		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
929 		    ddi_get_instance(xnfp->devinfo),
930 		    (void *)xnfp, macaddr[0], macaddr[1], macaddr[2],
931 		    macaddr[3], macaddr[4], macaddr[5]);
932 #endif
933 	/*
934 	 * We can't set our macaddr.
935 	 *
936 	 * XXPV dme: Why not?
937 	 */
938 	return (ENOTSUP);
939 }
940 
941 /*
942  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
943  *
944  *  Program the hardware to enable/disable the multicast address
945  *  in "mcast".  Enable if "add" is true, disable if false.
946  */
947 /*ARGSUSED*/
948 static int
949 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
950 {
951 	xnf_t *xnfp = arg;
952 
953 #ifdef XNF_DEBUG
954 	if (xnfdebug & XNF_DEBUG_TRACE)
955 		printf("xnf%d set_multicast(0x%p): "
956 		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
957 		    ddi_get_instance(xnfp->devinfo),
958 		    (void *)xnfp, mca[0], mca[1], mca[2],
959 		    mca[3], mca[4], mca[5]);
960 #endif
961 
962 	/*
963 	 * XXPV dme: Ideally we'd relay the address to the backend for
964 	 * enabling.  The protocol doesn't support that (interesting
965 	 * extension), so we simply succeed and hope that the relevant
966 	 * packets are going to arrive.
967 	 *
968 	 * If protocol support is added for enable/disable then we'll
969 	 * need to keep a list of those in use and re-add on resume.
970 	 */
971 	return (0);
972 }
973 
974 /*
975  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
976  *
977  *  Program the hardware to enable/disable promiscuous mode.
978  */
979 /*ARGSUSED*/
980 static int
981 xnf_set_promiscuous(void *arg, boolean_t on)
982 {
983 	xnf_t *xnfp = arg;
984 
985 #ifdef XNF_DEBUG
986 	if (xnfdebug & XNF_DEBUG_TRACE)
987 		printf("xnf%d set_promiscuous(0x%p, %x)\n",
988 		    ddi_get_instance(xnfp->devinfo),
989 		    (void *)xnfp, on);
990 #endif
991 	/*
992 	 * We can't really do this, but we pretend that we can in
993 	 * order that snoop will work.
994 	 */
995 	return (0);
996 }
997 
998 /*
999  * Clean buffers that we have responses for from the transmit ring.
1000  */
1001 static int
1002 xnf_clean_tx_ring(xnf_t *xnfp)
1003 {
1004 	RING_IDX		next_resp, i;
1005 	struct tx_pktinfo	*reap;
1006 	int			id;
1007 	grant_ref_t		ref;
1008 
1009 	ASSERT(MUTEX_HELD(&xnfp->txlock));
1010 
1011 	do {
1012 		/*
1013 		 * index of next transmission ack
1014 		 */
1015 		next_resp = xnfp->tx_ring.sring->rsp_prod;
1016 		membar_consumer();
1017 		/*
1018 		 * Clean tx packets from ring that we have responses for
1019 		 */
1020 		for (i = xnfp->tx_ring.rsp_cons; i != next_resp; i++) {
1021 			id = RING_GET_RESPONSE(&xnfp->tx_ring, i)->id;
1022 			reap = &xnfp->tx_pkt_info[id];
1023 			ref = reap->grant_ref;
1024 			/*
1025 			 * Return id to free list
1026 			 */
1027 			reap->id = xnfp->tx_pkt_id_list;
1028 			xnfp->tx_pkt_id_list = id;
1029 			if (gnttab_query_foreign_access(ref) != 0)
1030 				panic("tx grant still in use"
1031 				    "by backend domain");
1032 			(void) ddi_dma_unbind_handle(reap->dma_handle);
1033 			(void) gnttab_end_foreign_access_ref(ref,
1034 			    xnfp->tx_pages_readonly);
1035 			gnttab_release_grant_reference(&xnfp->gref_tx_head,
1036 			    ref);
1037 			freemsg(reap->mp);
1038 			reap->mp = NULL;
1039 			reap->grant_ref = GRANT_INVALID_REF;
1040 			if (reap->bdesc != NULL)
1041 				xnf_free_xmit_buffer(reap->bdesc);
1042 			reap->bdesc = NULL;
1043 		}
1044 		xnfp->tx_ring.rsp_cons = next_resp;
1045 		membar_enter();
1046 	} while (next_resp != xnfp->tx_ring.sring->rsp_prod);
1047 	return (NET_TX_RING_SIZE - (xnfp->tx_ring.sring->req_prod - next_resp));
1048 }
1049 
1050 /*
1051  * If we need to pull up data from either a packet that crosses a page
1052  * boundary or consisting of multiple mblks, do it here.  We allocate
1053  * a page aligned buffer and copy the data into it.  The header for the
1054  * allocated buffer is returned. (which is also allocated here)
1055  */
1056 static struct xnf_buffer_desc *
1057 xnf_pullupmsg(xnf_t *xnfp, mblk_t *mp)
1058 {
1059 	struct xnf_buffer_desc	*bdesc;
1060 	mblk_t			*mptr;
1061 	caddr_t			bp;
1062 	int			len;
1063 
1064 	/*
1065 	 * get a xmit buffer from the xmit buffer pool
1066 	 */
1067 	mutex_enter(&xnfp->rx_buf_mutex);
1068 	bdesc = xnf_get_xmit_buffer(xnfp);
1069 	mutex_exit(&xnfp->rx_buf_mutex);
1070 	if (bdesc == NULL)
1071 		return (bdesc);
1072 	/*
1073 	 * Copy the data into the buffer
1074 	 */
1075 	xnfp->stat_xmit_pullup++;
1076 	bp = bdesc->buf;
1077 	for (mptr = mp; mptr != NULL; mptr = mptr->b_cont) {
1078 		len = mptr->b_wptr - mptr->b_rptr;
1079 		bcopy(mptr->b_rptr, bp, len);
1080 		bp += len;
1081 	}
1082 	return (bdesc);
1083 }
1084 
1085 /*
1086  *  xnf_send_one() -- send a packet
1087  *
1088  *  Called when a packet is ready to be transmitted. A pointer to an
1089  *  M_DATA message that contains the packet is passed to this routine.
1090  *  At least the complete LLC header is contained in the message's
1091  *  first message block, and the remainder of the packet is contained
1092  *  within additional M_DATA message blocks linked to the first
1093  *  message block.
1094  *
1095  */
1096 static boolean_t
1097 xnf_send_one(xnf_t *xnfp, mblk_t *mp)
1098 {
1099 	struct xnf_buffer_desc	*xmitbuf;
1100 	struct tx_pktinfo	*txp_info;
1101 	mblk_t			*mptr;
1102 	ddi_dma_cookie_t	dma_cookie;
1103 	RING_IDX		slot, txs_out;
1104 	int			length = 0, i, pktlen = 0, rc, tx_id;
1105 	int			tx_ring_freespace, page_oops;
1106 	uint_t			ncookies;
1107 	volatile netif_tx_request_t	*txrp;
1108 	caddr_t			bufaddr;
1109 	grant_ref_t		ref;
1110 	unsigned long		mfn;
1111 	uint32_t		pflags;
1112 	domid_t			oeid;
1113 
1114 #ifdef XNF_DEBUG
1115 	if (xnfdebug & XNF_DEBUG_SEND)
1116 		printf("xnf%d send(0x%p, 0x%p)\n",
1117 		    ddi_get_instance(xnfp->devinfo),
1118 		    (void *)xnfp, (void *)mp);
1119 #endif
1120 
1121 	ASSERT(mp != NULL);
1122 	ASSERT(mp->b_next == NULL);
1123 	ASSERT(MUTEX_HELD(&xnfp->txlock));
1124 
1125 	tx_ring_freespace = xnf_clean_tx_ring(xnfp);
1126 	ASSERT(tx_ring_freespace >= 0);
1127 
1128 	oeid = xvdi_get_oeid(xnfp->devinfo);
1129 	xnfp->stat_xmit_attempt++;
1130 	/*
1131 	 * If there are no xmit ring slots available, return.
1132 	 */
1133 	if (tx_ring_freespace == 0) {
1134 		xnfp->stat_xmit_defer++;
1135 		return (B_FALSE);	/* Send should be retried */
1136 	}
1137 
1138 	slot = xnfp->tx_ring.sring->req_prod;
1139 	/* Count the number of mblks in message and compute packet size */
1140 	for (i = 0, mptr = mp; mptr != NULL; mptr = mptr->b_cont, i++)
1141 		pktlen += (mptr->b_wptr - mptr->b_rptr);
1142 
1143 	/* Make sure packet isn't too large */
1144 	if (pktlen > XNF_FRAMESIZE) {
1145 		cmn_err(CE_WARN, "xnf%d: large packet %d bytes",
1146 		    ddi_get_instance(xnfp->devinfo), pktlen);
1147 		freemsg(mp);
1148 		return (B_FALSE);
1149 	}
1150 
1151 	/*
1152 	 * Test if we cross a page boundary with our buffer
1153 	 */
1154 	page_oops = (i == 1) &&
1155 	    (xnf_btop((size_t)mp->b_rptr) !=
1156 	    xnf_btop((size_t)(mp->b_rptr + pktlen)));
1157 	/*
1158 	 * XXPV - unfortunately, the Xen virtual net device currently
1159 	 * doesn't support multiple packet frags, so this will always
1160 	 * end up doing the pullup if we got more than one packet.
1161 	 */
1162 	if (i > xnf_max_tx_frags || page_oops) {
1163 		if (page_oops)
1164 			xnfp->stat_xmit_pagebndry++;
1165 		if ((xmitbuf = xnf_pullupmsg(xnfp, mp)) == NULL) {
1166 			/* could not allocate resources? */
1167 #ifdef XNF_DEBUG
1168 			cmn_err(CE_WARN, "xnf%d: pullupmsg failed",
1169 			    ddi_get_instance(xnfp->devinfo));
1170 #endif
1171 			xnfp->stat_xmit_defer++;
1172 			return (B_FALSE);	/* Retry send */
1173 		}
1174 		bufaddr = xmitbuf->buf;
1175 	} else {
1176 		xmitbuf = NULL;
1177 		bufaddr = (caddr_t)mp->b_rptr;
1178 	}
1179 
1180 	/* set up data descriptor */
1181 	length = pktlen;
1182 
1183 	/*
1184 	 * Get packet id from free list
1185 	 */
1186 	tx_id = xnfp->tx_pkt_id_list;
1187 	ASSERT(tx_id < NET_TX_RING_SIZE);
1188 	txp_info = &xnfp->tx_pkt_info[tx_id];
1189 	xnfp->tx_pkt_id_list = txp_info->id;
1190 	txp_info->id = tx_id;
1191 
1192 	/* Prepare for DMA mapping of tx buffer(s) */
1193 	rc = ddi_dma_addr_bind_handle(txp_info->dma_handle,
1194 	    NULL, bufaddr, length, DDI_DMA_WRITE | DDI_DMA_STREAMING,
1195 	    DDI_DMA_DONTWAIT, 0, &dma_cookie, &ncookies);
1196 	if (rc != DDI_DMA_MAPPED) {
1197 		ASSERT(rc != DDI_DMA_INUSE);
1198 		ASSERT(rc != DDI_DMA_PARTIAL_MAP);
1199 		/*
1200 		 *  Return id to free list
1201 		 */
1202 		txp_info->id = xnfp->tx_pkt_id_list;
1203 		xnfp->tx_pkt_id_list = tx_id;
1204 		if (rc == DDI_DMA_NORESOURCES) {
1205 			xnfp->stat_xmit_defer++;
1206 			return (B_FALSE); /* Retry later */
1207 		}
1208 #ifdef XNF_DEBUG
1209 		cmn_err(CE_WARN, "xnf%d: bind_handle failed (%x)",
1210 		    ddi_get_instance(xnfp->devinfo), rc);
1211 #endif
1212 		return (B_FALSE);
1213 	}
1214 
1215 	ASSERT(ncookies == 1);
1216 	ref = gnttab_claim_grant_reference(&xnfp->gref_tx_head);
1217 	ASSERT((signed short)ref >= 0);
1218 	mfn = xnf_btop(pa_to_ma((paddr_t)dma_cookie.dmac_laddress));
1219 	gnttab_grant_foreign_access_ref(ref, oeid, mfn,
1220 	    xnfp->tx_pages_readonly);
1221 	txp_info->grant_ref = ref;
1222 	txrp = RING_GET_REQUEST(&xnfp->tx_ring, slot);
1223 	txrp->gref = ref;
1224 	txrp->size = dma_cookie.dmac_size;
1225 	txrp->offset = (uintptr_t)bufaddr & PAGEOFFSET;
1226 	txrp->id = tx_id;
1227 	txrp->flags = 0;
1228 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &pflags);
1229 	if (pflags != 0) {
1230 		ASSERT(xnfp->cksum_offload);
1231 		/*
1232 		 * If the local protocol stack requests checksum
1233 		 * offload we set the 'checksum blank' flag,
1234 		 * indicating to the peer that we need the checksum
1235 		 * calculated for us.
1236 		 *
1237 		 * We _don't_ set the validated flag, because we haven't
1238 		 * validated that the data and the checksum match.
1239 		 */
1240 		txrp->flags |= NETTXF_csum_blank;
1241 		xnfp->stat_tx_cksum_deferred++;
1242 	}
1243 	membar_producer();
1244 	xnfp->tx_ring.sring->req_prod = slot + 1;
1245 
1246 	txp_info->mp = mp;
1247 	txp_info->bdesc = xmitbuf;
1248 
1249 	txs_out = xnfp->tx_ring.sring->req_prod - xnfp->tx_ring.sring->rsp_prod;
1250 	if (xnfp->tx_ring.sring->req_prod - xnfp->tx_ring.rsp_cons <
1251 	    XNF_TX_FREE_THRESH) {
1252 		/*
1253 		 * The ring is getting full; Set up this packet
1254 		 * to cause an interrupt.
1255 		 */
1256 		xnfp->tx_ring.sring->rsp_event =
1257 		    xnfp->tx_ring.sring->rsp_prod + txs_out;
1258 	}
1259 
1260 	xnfp->stat_opackets++;
1261 	xnfp->stat_obytes += pktlen;
1262 
1263 	return (B_TRUE);	/* successful transmit attempt */
1264 }
1265 
1266 mblk_t *
1267 xnf_send(void *arg, mblk_t *mp)
1268 {
1269 	xnf_t *xnfp = arg;
1270 	mblk_t *next;
1271 	boolean_t sent_something = B_FALSE;
1272 
1273 	mutex_enter(&xnfp->txlock);
1274 
1275 	/*
1276 	 * Transmission attempts should be impossible without having
1277 	 * previously called xnf_start().
1278 	 */
1279 	ASSERT(xnfp->running);
1280 
1281 	/*
1282 	 * Wait for getting connected to the backend
1283 	 */
1284 	while (!xnfp->connected) {
1285 		cv_wait(&xnfp->cv, &xnfp->txlock);
1286 	}
1287 
1288 	while (mp != NULL) {
1289 		next = mp->b_next;
1290 		mp->b_next = NULL;
1291 
1292 		if (!xnf_send_one(xnfp, mp)) {
1293 			mp->b_next = next;
1294 			break;
1295 		}
1296 
1297 		mp = next;
1298 		sent_something = B_TRUE;
1299 	}
1300 
1301 	if (sent_something)
1302 		ec_notify_via_evtchn(xnfp->evtchn);
1303 
1304 	mutex_exit(&xnfp->txlock);
1305 
1306 	return (mp);
1307 }
1308 
1309 /*
1310  *  xnf_intr() -- ring interrupt service routine
1311  */
1312 static uint_t
1313 xnf_intr(caddr_t arg)
1314 {
1315 	xnf_t *xnfp = (xnf_t *)arg;
1316 	int tx_ring_space;
1317 
1318 	mutex_enter(&xnfp->intrlock);
1319 
1320 	/*
1321 	 * If not connected to the peer or not started by the upper
1322 	 * layers we cannot usefully handle interrupts.
1323 	 */
1324 	if (!(xnfp->connected && xnfp->running)) {
1325 		mutex_exit(&xnfp->intrlock);
1326 		return (DDI_INTR_UNCLAIMED);
1327 	}
1328 
1329 #ifdef XNF_DEBUG
1330 	if (xnfdebug & XNF_DEBUG_INT)
1331 		printf("xnf%d intr(0x%p)\n",
1332 		    ddi_get_instance(xnfp->devinfo), (void *)xnfp);
1333 #endif
1334 	if (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->rx_ring)) {
1335 		mblk_t *mp;
1336 
1337 		if ((mp = xnf_process_recv(xnfp)) != NULL)
1338 			mac_rx(xnfp->mh, xnfp->rx_handle, mp);
1339 	}
1340 
1341 	/*
1342 	 * Is tx ring nearly full?
1343 	 */
1344 #define	inuse(r) ((r).sring->req_prod - (r).rsp_cons)
1345 
1346 	if ((NET_TX_RING_SIZE - inuse(xnfp->tx_ring)) < XNF_TX_FREE_THRESH) {
1347 		/*
1348 		 * Yes, clean it and try to start any blocked xmit
1349 		 * streams.
1350 		 */
1351 		mutex_enter(&xnfp->txlock);
1352 		tx_ring_space = xnf_clean_tx_ring(xnfp);
1353 		mutex_exit(&xnfp->txlock);
1354 		if (tx_ring_space > XNF_TX_FREE_THRESH) {
1355 			mutex_exit(&xnfp->intrlock);
1356 			mac_tx_update(xnfp->mh);
1357 			mutex_enter(&xnfp->intrlock);
1358 		} else {
1359 			/*
1360 			 * Schedule another tx interrupt when we have
1361 			 * sent enough packets to cross the threshold.
1362 			 */
1363 			xnfp->tx_ring.sring->rsp_event =
1364 			    xnfp->tx_ring.sring->rsp_prod +
1365 			    XNF_TX_FREE_THRESH - tx_ring_space + 1;
1366 		}
1367 	}
1368 #undef inuse
1369 
1370 	xnfp->stat_intr++;
1371 	mutex_exit(&xnfp->intrlock);
1372 	return (DDI_INTR_CLAIMED); /* indicate that the interrupt was for us */
1373 }
1374 
1375 /*
1376  *  xnf_start() -- start the board receiving and enable interrupts.
1377  */
1378 static int
1379 xnf_start(void *arg)
1380 {
1381 	xnf_t *xnfp = arg;
1382 
1383 #ifdef XNF_DEBUG
1384 	if (xnfdebug & XNF_DEBUG_TRACE)
1385 		printf("xnf%d start(0x%p)\n",
1386 		    ddi_get_instance(xnfp->devinfo), (void *)xnfp);
1387 #endif
1388 
1389 	mutex_enter(&xnfp->intrlock);
1390 	mutex_enter(&xnfp->txlock);
1391 
1392 	/* Accept packets from above. */
1393 	xnfp->running = B_TRUE;
1394 
1395 	mutex_exit(&xnfp->txlock);
1396 	mutex_exit(&xnfp->intrlock);
1397 
1398 	return (0);
1399 }
1400 
1401 /* xnf_stop() - disable hardware */
1402 static void
1403 xnf_stop(void *arg)
1404 {
1405 	xnf_t *xnfp = arg;
1406 
1407 #ifdef XNF_DEBUG
1408 	if (xnfdebug & XNF_DEBUG_TRACE)
1409 		printf("xnf%d stop(0x%p)\n",
1410 		    ddi_get_instance(xnfp->devinfo), (void *)xnfp);
1411 #endif
1412 
1413 	mutex_enter(&xnfp->intrlock);
1414 	mutex_enter(&xnfp->txlock);
1415 
1416 	xnfp->running = B_FALSE;
1417 
1418 	mutex_exit(&xnfp->txlock);
1419 	mutex_exit(&xnfp->intrlock);
1420 }
1421 
1422 /*
1423  * Driver private functions follow
1424  */
1425 
1426 /*
1427  * Hang buffer on rx ring
1428  */
1429 static void
1430 rx_buffer_hang(xnf_t *xnfp, struct xnf_buffer_desc *bdesc)
1431 {
1432 	volatile netif_rx_request_t	*reqp;
1433 	RING_IDX	hang_ix;
1434 	grant_ref_t ref;
1435 	domid_t oeid;
1436 
1437 	oeid = xvdi_get_oeid(xnfp->devinfo);
1438 
1439 	ASSERT(MUTEX_HELD(&xnfp->intrlock));
1440 	reqp = RING_GET_REQUEST(&xnfp->rx_ring, xnfp->rx_ring.req_prod_pvt);
1441 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->rx_ring, 0));
1442 	ASSERT(xnfp->rxpkt_bufptr[hang_ix] == NULL);
1443 	if (bdesc->grant_ref == GRANT_INVALID_REF) {
1444 		ref = gnttab_claim_grant_reference(&xnfp->gref_rx_head);
1445 		ASSERT((signed short)ref >= 0);
1446 		bdesc->grant_ref = ref;
1447 		gnttab_grant_foreign_transfer_ref(ref, oeid);
1448 	}
1449 	reqp->id = hang_ix;
1450 	reqp->gref = bdesc->grant_ref;
1451 	bdesc->id = hang_ix;
1452 	xnfp->rxpkt_bufptr[hang_ix] = bdesc;
1453 	membar_producer();
1454 	xnfp->rx_ring.req_prod_pvt++;
1455 }
1456 
1457 
1458 /* Process all queued received packets */
1459 static mblk_t *
1460 xnf_process_recv(xnf_t *xnfp)
1461 {
1462 	volatile netif_rx_response_t *rxpkt;
1463 	mblk_t *mp, *head, *tail;
1464 	struct xnf_buffer_desc *bdesc;
1465 	extern mblk_t *desballoc(unsigned char *, size_t, uint_t, frtn_t *);
1466 	boolean_t hwcsum = B_FALSE, notify, work_to_do;
1467 	size_t len;
1468 	pfn_t pfn;
1469 	long cnt;
1470 
1471 	head = tail = NULL;
1472 loop:
1473 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->rx_ring)) {
1474 
1475 		rxpkt = RING_GET_RESPONSE(&xnfp->rx_ring,
1476 		    xnfp->rx_ring.rsp_cons);
1477 
1478 		/*
1479 		 * Take buffer off of receive ring
1480 		 */
1481 		hwcsum = B_FALSE;
1482 		bdesc = xnfp->rxpkt_bufptr[rxpkt->id];
1483 		xnfp->rxpkt_bufptr[rxpkt->id] = NULL;
1484 		ASSERT(bdesc->id == rxpkt->id);
1485 		if (rxpkt->status <= 0) {
1486 			mp = NULL;
1487 			xnfp->stat_errrcv++;
1488 			if (rxpkt->status == 0)
1489 				xnfp->stat_runt++;
1490 			if (rxpkt->status == NETIF_RSP_ERROR)
1491 				xnfp->stat_mac_rcv_error++;
1492 			if (rxpkt->status == NETIF_RSP_DROPPED)
1493 				xnfp->stat_norcvbuf++;
1494 			/*
1495 			 * re-hang the buffer
1496 			 */
1497 			rx_buffer_hang(xnfp, bdesc);
1498 		} else {
1499 			grant_ref_t ref =  bdesc->grant_ref;
1500 			struct xnf_buffer_desc *new_bdesc;
1501 			unsigned long off = rxpkt->offset;
1502 			unsigned long mfn;
1503 
1504 			len = rxpkt->status;
1505 			ASSERT(off + len <= PAGEOFFSET);
1506 			if (ref == GRANT_INVALID_REF) {
1507 				mp = NULL;
1508 				new_bdesc = bdesc;
1509 				cmn_err(CE_WARN, "Bad rx grant reference %d "
1510 				    "from dom %d", ref,
1511 				    xvdi_get_oeid(xnfp->devinfo));
1512 				goto luckless;
1513 			}
1514 			bdesc->grant_ref = GRANT_INVALID_REF;
1515 			mfn = gnttab_end_foreign_transfer_ref(ref);
1516 			ASSERT(mfn != MFN_INVALID);
1517 			ASSERT(hat_getpfnum(kas.a_hat, bdesc->buf) ==
1518 			    PFN_INVALID);
1519 			gnttab_release_grant_reference(&xnfp->gref_rx_head,
1520 			    ref);
1521 			reassign_pfn(xnf_btop(bdesc->buf_phys), mfn);
1522 			hat_devload(kas.a_hat, bdesc->buf, PAGESIZE,
1523 			    xnf_btop(bdesc->buf_phys),
1524 			    PROT_READ | PROT_WRITE, HAT_LOAD);
1525 			balloon_drv_added(1);
1526 			if (rxpkt->flags & NETRXF_data_validated)
1527 				hwcsum = B_TRUE;
1528 			if (len <= xnf_rx_bcopy_thresh) {
1529 				/*
1530 				 * For small buffers, just copy the data
1531 				 * and send the copy upstream.
1532 				 */
1533 				new_bdesc = NULL;
1534 			} else {
1535 				/*
1536 				 * We send a pointer to this data upstream;
1537 				 * we need a new buffer to replace this one.
1538 				 */
1539 				mutex_enter(&xnfp->rx_buf_mutex);
1540 				new_bdesc = xnf_get_buffer(xnfp);
1541 				if (new_bdesc != NULL) {
1542 					xnfp->rx_bufs_outstanding++;
1543 				} else {
1544 					xnfp->stat_rx_no_ringbuf++;
1545 				}
1546 				mutex_exit(&xnfp->rx_buf_mutex);
1547 			}
1548 
1549 			if (new_bdesc == NULL) {
1550 				/*
1551 				 * Don't have a new ring buffer; bcopy the data
1552 				 * from the buffer, and preserve the
1553 				 * original buffer
1554 				 */
1555 				if ((mp = allocb(len, BPRI_MED)) == NULL) {
1556 					/*
1557 					 * Could't get buffer to copy to,
1558 					 * drop this data, and re-hang
1559 					 * the buffer on the ring.
1560 					 */
1561 					xnfp->stat_norcvbuf++;
1562 				} else {
1563 					bcopy(bdesc->buf + off, mp->b_wptr,
1564 					    len);
1565 				}
1566 				/*
1567 				 * Give the buffer page back to xen
1568 				 */
1569 				pfn = xnf_btop(bdesc->buf_phys);
1570 				cnt = balloon_free_pages(1, &mfn, bdesc->buf,
1571 				    &pfn);
1572 				if (cnt != 1) {
1573 					cmn_err(CE_WARN, "unable to give a "
1574 					    "page back to the hypervisor\n");
1575 				}
1576 				new_bdesc = bdesc;
1577 			} else {
1578 				if ((mp = desballoc((unsigned char *)bdesc->buf,
1579 				    off + len, 0, (frtn_t *)bdesc)) == NULL) {
1580 					/*
1581 					 * Couldn't get mblk to pass recv data
1582 					 * up with, free the old ring buffer
1583 					 */
1584 					xnfp->stat_norcvbuf++;
1585 					xnf_rcv_complete(bdesc);
1586 					goto luckless;
1587 				}
1588 				(void) ddi_dma_sync(bdesc->dma_handle,
1589 				    0, 0, DDI_DMA_SYNC_FORCPU);
1590 
1591 				mp->b_wptr += off;
1592 				mp->b_rptr += off;
1593 			}
1594 luckless:
1595 			if (mp)
1596 				mp->b_wptr += len;
1597 			/* re-hang old or hang new buffer */
1598 			rx_buffer_hang(xnfp, new_bdesc);
1599 		}
1600 		if (mp) {
1601 			if (hwcsum) {
1602 				/*
1603 				 * If the peer says that the data has
1604 				 * been validated then we declare that
1605 				 * the full checksum has been
1606 				 * verified.
1607 				 *
1608 				 * We don't look at the "checksum
1609 				 * blank" flag, and hence could have a
1610 				 * packet here that we are asserting
1611 				 * is good with a blank checksum.
1612 				 *
1613 				 * The hardware checksum offload
1614 				 * specification says that we must
1615 				 * provide the actual checksum as well
1616 				 * as an assertion that it is valid,
1617 				 * but the protocol stack doesn't
1618 				 * actually use it and some other
1619 				 * drivers don't bother, so we don't.
1620 				 * If it was necessary we could grovel
1621 				 * in the packet to find it.
1622 				 */
1623 
1624 				(void) hcksum_assoc(mp, NULL,
1625 				    NULL, 0, 0, 0, 0,
1626 				    HCK_FULLCKSUM |
1627 				    HCK_FULLCKSUM_OK,
1628 				    0);
1629 				xnfp->stat_rx_cksum_no_need++;
1630 			}
1631 			if (head == NULL) {
1632 				head = tail = mp;
1633 			} else {
1634 				tail->b_next = mp;
1635 				tail = mp;
1636 			}
1637 
1638 			ASSERT(mp->b_next == NULL);
1639 
1640 			xnfp->stat_ipackets++;
1641 			xnfp->stat_rbytes += len;
1642 		}
1643 
1644 		xnfp->rx_ring.rsp_cons++;
1645 	}
1646 
1647 	/*
1648 	 * Has more data come in since we started?
1649 	 */
1650 	/* LINTED: constant in conditional context */
1651 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->rx_ring, work_to_do);
1652 	if (work_to_do)
1653 		goto loop;
1654 
1655 	/*
1656 	 * Indicate to the backend that we have re-filled the receive
1657 	 * ring.
1658 	 */
1659 	/* LINTED: constant in conditional context */
1660 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->rx_ring, notify);
1661 	if (notify)
1662 		ec_notify_via_evtchn(xnfp->evtchn);
1663 
1664 	return (head);
1665 }
1666 
1667 /* Called when the upper layers free a message we passed upstream */
1668 static void
1669 xnf_rcv_complete(struct xnf_buffer_desc *bdesc)
1670 {
1671 	xnf_t *xnfp = bdesc->xnfp;
1672 	pfn_t pfn;
1673 	long cnt;
1674 
1675 	/* One less outstanding receive buffer */
1676 	mutex_enter(&xnfp->rx_buf_mutex);
1677 	--xnfp->rx_bufs_outstanding;
1678 	/*
1679 	 * Return buffer to the free list, unless the free list is getting
1680 	 * too large.  XXX - this threshold may need tuning.
1681 	 */
1682 	if (xnfp->rx_descs_free < xnf_recv_bufs_lowat) {
1683 		/*
1684 		 * Unmap the page, and hand the machine page back
1685 		 * to xen so it can be re-used as a backend net buffer.
1686 		 */
1687 		pfn = xnf_btop(bdesc->buf_phys);
1688 		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
1689 		if (cnt != 1) {
1690 			cmn_err(CE_WARN, "unable to give a page back to the "
1691 			    "hypervisor\n");
1692 		}
1693 
1694 		bdesc->next = xnfp->free_list;
1695 		xnfp->free_list = bdesc;
1696 		xnfp->rx_descs_free++;
1697 		mutex_exit(&xnfp->rx_buf_mutex);
1698 	} else {
1699 		/*
1700 		 * We can return everything here since we have a free buffer
1701 		 * that we have not given the backing page for back to xen.
1702 		 */
1703 		--xnfp->recv_buffer_count;
1704 		mutex_exit(&xnfp->rx_buf_mutex);
1705 		(void) ddi_dma_unbind_handle(bdesc->dma_handle);
1706 		ddi_dma_mem_free(&bdesc->acc_handle);
1707 		ddi_dma_free_handle(&bdesc->dma_handle);
1708 		kmem_free(bdesc, sizeof (*bdesc));
1709 	}
1710 }
1711 
1712 /*
1713  *  xnf_alloc_dma_resources() -- initialize the drivers structures
1714  */
1715 static int
1716 xnf_alloc_dma_resources(xnf_t *xnfp)
1717 {
1718 	dev_info_t 		*devinfo = xnfp->devinfo;
1719 	int			i;
1720 	size_t			len;
1721 	ddi_dma_cookie_t	dma_cookie;
1722 	uint_t			ncookies;
1723 	struct xnf_buffer_desc	*bdesc;
1724 	int			rc;
1725 	caddr_t			rptr;
1726 
1727 	xnfp->n_recvs = NET_RX_RING_SIZE;
1728 	xnfp->max_recv_bufs = xnf_recv_bufs_hiwat;
1729 
1730 	xnfp->n_xmits = NET_TX_RING_SIZE;
1731 
1732 	/*
1733 	 * The code below allocates all the DMA data structures that
1734 	 * need to be released when the driver is detached.
1735 	 *
1736 	 * First allocate handles for mapping (virtual address) pointers to
1737 	 * transmit data buffers to physical addresses
1738 	 */
1739 	for (i = 0; i < xnfp->n_xmits; i++) {
1740 		if ((rc = ddi_dma_alloc_handle(devinfo,
1741 		    &tx_buffer_dma_attr, DDI_DMA_SLEEP, 0,
1742 		    &xnfp->tx_pkt_info[i].dma_handle)) != DDI_SUCCESS)
1743 			return (DDI_FAILURE);
1744 	}
1745 
1746 	/*
1747 	 * Allocate page for the transmit descriptor ring.
1748 	 */
1749 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
1750 	    DDI_DMA_SLEEP, 0, &xnfp->tx_ring_dma_handle) != DDI_SUCCESS)
1751 		goto alloc_error;
1752 
1753 	if (ddi_dma_mem_alloc(xnfp->tx_ring_dma_handle,
1754 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
1755 	    DDI_DMA_SLEEP, 0, &rptr, &len,
1756 	    &xnfp->tx_ring_dma_acchandle) != DDI_SUCCESS) {
1757 		ddi_dma_free_handle(&xnfp->tx_ring_dma_handle);
1758 		xnfp->tx_ring_dma_handle = NULL;
1759 		goto alloc_error;
1760 	}
1761 
1762 	if ((rc = ddi_dma_addr_bind_handle(xnfp->tx_ring_dma_handle, NULL,
1763 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
1764 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
1765 		ddi_dma_mem_free(&xnfp->tx_ring_dma_acchandle);
1766 		ddi_dma_free_handle(&xnfp->tx_ring_dma_handle);
1767 		xnfp->tx_ring_dma_handle = NULL;
1768 		xnfp->tx_ring_dma_acchandle = NULL;
1769 		if (rc == DDI_DMA_NORESOURCES)
1770 			goto alloc_error;
1771 		else
1772 			goto error;
1773 	}
1774 
1775 	ASSERT(ncookies == 1);
1776 	bzero(rptr, PAGESIZE);
1777 	/* LINTED: constant in conditional context */
1778 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
1779 	/* LINTED: constant in conditional context */
1780 	FRONT_RING_INIT(&xnfp->tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
1781 	xnfp->tx_ring_phys_addr = dma_cookie.dmac_laddress;
1782 
1783 	/*
1784 	 * Allocate page for the receive descriptor ring.
1785 	 */
1786 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
1787 	    DDI_DMA_SLEEP, 0, &xnfp->rx_ring_dma_handle) != DDI_SUCCESS)
1788 		goto alloc_error;
1789 
1790 	if (ddi_dma_mem_alloc(xnfp->rx_ring_dma_handle,
1791 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
1792 	    DDI_DMA_SLEEP, 0, &rptr, &len,
1793 	    &xnfp->rx_ring_dma_acchandle) != DDI_SUCCESS) {
1794 		ddi_dma_free_handle(&xnfp->rx_ring_dma_handle);
1795 		xnfp->rx_ring_dma_handle = NULL;
1796 		goto alloc_error;
1797 	}
1798 
1799 	if ((rc = ddi_dma_addr_bind_handle(xnfp->rx_ring_dma_handle, NULL,
1800 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
1801 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
1802 		ddi_dma_mem_free(&xnfp->rx_ring_dma_acchandle);
1803 		ddi_dma_free_handle(&xnfp->rx_ring_dma_handle);
1804 		xnfp->rx_ring_dma_handle = NULL;
1805 		xnfp->rx_ring_dma_acchandle = NULL;
1806 		if (rc == DDI_DMA_NORESOURCES)
1807 			goto alloc_error;
1808 		else
1809 			goto error;
1810 	}
1811 
1812 	ASSERT(ncookies == 1);
1813 	bzero(rptr, PAGESIZE);
1814 	/* LINTED: constant in conditional context */
1815 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
1816 	/* LINTED: constant in conditional context */
1817 	FRONT_RING_INIT(&xnfp->rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
1818 	xnfp->rx_ring_phys_addr = dma_cookie.dmac_laddress;
1819 
1820 	/*
1821 	 * Preallocate receive buffers for each receive descriptor.
1822 	 */
1823 
1824 	/* Set up the "free list" of receive buffer descriptors */
1825 	for (i = 0; i < xnfp->n_recvs; i++) {
1826 		if ((bdesc = xnf_alloc_buffer(xnfp)) == NULL)
1827 			goto alloc_error;
1828 		bdesc->next = xnfp->free_list;
1829 		xnfp->free_list = bdesc;
1830 	}
1831 
1832 	return (DDI_SUCCESS);
1833 
1834 alloc_error:
1835 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
1836 	    ddi_get_instance(xnfp->devinfo));
1837 error:
1838 	xnf_release_dma_resources(xnfp);
1839 	return (DDI_FAILURE);
1840 }
1841 
1842 /*
1843  * Release all DMA resources in the opposite order from acquisition
1844  * Should not be called until all outstanding esballoc buffers
1845  * have been returned.
1846  */
1847 static void
1848 xnf_release_dma_resources(xnf_t *xnfp)
1849 {
1850 	int i;
1851 
1852 	/*
1853 	 * Free receive buffers which are currently associated with
1854 	 * descriptors
1855 	 */
1856 	for (i = 0; i < xnfp->n_recvs; i++) {
1857 		struct xnf_buffer_desc *bp;
1858 
1859 		if ((bp = xnfp->rxpkt_bufptr[i]) == NULL)
1860 			continue;
1861 		xnf_free_buffer(bp);
1862 		xnfp->rxpkt_bufptr[i] = NULL;
1863 	}
1864 
1865 	/* Free the receive ring buffer */
1866 	if (xnfp->rx_ring_dma_acchandle != NULL) {
1867 		(void) ddi_dma_unbind_handle(xnfp->rx_ring_dma_handle);
1868 		ddi_dma_mem_free(&xnfp->rx_ring_dma_acchandle);
1869 		ddi_dma_free_handle(&xnfp->rx_ring_dma_handle);
1870 		xnfp->rx_ring_dma_acchandle = NULL;
1871 	}
1872 	/* Free the transmit ring buffer */
1873 	if (xnfp->tx_ring_dma_acchandle != NULL) {
1874 		(void) ddi_dma_unbind_handle(xnfp->tx_ring_dma_handle);
1875 		ddi_dma_mem_free(&xnfp->tx_ring_dma_acchandle);
1876 		ddi_dma_free_handle(&xnfp->tx_ring_dma_handle);
1877 		xnfp->tx_ring_dma_acchandle = NULL;
1878 	}
1879 }
1880 
1881 static void
1882 xnf_release_mblks(xnf_t *xnfp)
1883 {
1884 	int	i;
1885 
1886 	for (i = 0; i < xnfp->n_xmits; i++) {
1887 		if (xnfp->tx_pkt_info[i].mp == NULL)
1888 			continue;
1889 		freemsg(xnfp->tx_pkt_info[i].mp);
1890 		xnfp->tx_pkt_info[i].mp = NULL;
1891 		(void) ddi_dma_unbind_handle(xnfp->tx_pkt_info[i].dma_handle);
1892 	}
1893 }
1894 
1895 /*
1896  * Remove a xmit buffer descriptor from the head of the free list and return
1897  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
1898  * Called with the tx_buf_mutex held.
1899  */
1900 static struct xnf_buffer_desc *
1901 xnf_get_xmit_buffer(xnf_t *xnfp)
1902 {
1903 	struct xnf_buffer_desc *bdesc;
1904 
1905 	bdesc = xnfp->xmit_free_list;
1906 	if (bdesc != NULL) {
1907 		xnfp->xmit_free_list = bdesc->next;
1908 	} else {
1909 		bdesc = xnf_alloc_xmit_buffer(xnfp);
1910 	}
1911 	return (bdesc);
1912 }
1913 
1914 /*
1915  * Remove a buffer descriptor from the head of the free list and return
1916  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
1917  * Called with the rx_buf_mutex held.
1918  */
1919 static struct xnf_buffer_desc *
1920 xnf_get_buffer(xnf_t *xnfp)
1921 {
1922 	struct xnf_buffer_desc *bdesc;
1923 
1924 	bdesc = xnfp->free_list;
1925 	if (bdesc != NULL) {
1926 		xnfp->free_list = bdesc->next;
1927 		xnfp->rx_descs_free--;
1928 	} else {
1929 		bdesc = xnf_alloc_buffer(xnfp);
1930 	}
1931 	return (bdesc);
1932 }
1933 
1934 /*
1935  * Free a xmit buffer back to the xmit free list
1936  */
1937 static void
1938 xnf_free_xmit_buffer(struct xnf_buffer_desc *bp)
1939 {
1940 	xnf_t *xnfp = bp->xnfp;
1941 
1942 	mutex_enter(&xnfp->tx_buf_mutex);
1943 	bp->next = xnfp->xmit_free_list;
1944 	xnfp->xmit_free_list = bp;
1945 	mutex_exit(&xnfp->tx_buf_mutex);
1946 }
1947 
1948 /*
1949  * Put a buffer descriptor onto the head of the free list.
1950  * We can't really free these buffers back to the kernel
1951  * since we have given away their backing page to be used
1952  * by the back end net driver.
1953  */
1954 static void
1955 xnf_free_buffer(struct xnf_buffer_desc *bp)
1956 {
1957 	xnf_t *xnfp = bp->xnfp;
1958 
1959 	mutex_enter(&xnfp->rx_buf_mutex);
1960 	bp->next = xnfp->free_list;
1961 	xnfp->free_list = bp;
1962 	xnfp->rx_descs_free++;
1963 	mutex_exit(&xnfp->rx_buf_mutex);
1964 }
1965 
1966 /*
1967  * Allocate a DMA-able xmit buffer, including a structure to
1968  * keep track of the buffer.  Called with tx_buf_mutex held.
1969  */
1970 static struct xnf_buffer_desc *
1971 xnf_alloc_xmit_buffer(xnf_t *xnfp)
1972 {
1973 	struct xnf_buffer_desc *bdesc;
1974 	size_t len;
1975 
1976 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
1977 		return (NULL);
1978 
1979 	/* allocate a DMA access handle for receive buffer */
1980 	if (ddi_dma_alloc_handle(xnfp->devinfo, &tx_buffer_dma_attr,
1981 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
1982 		goto failure;
1983 
1984 	/* Allocate DMA-able memory for transmit buffer */
1985 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
1986 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
1987 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
1988 		goto late_failure;
1989 
1990 	bdesc->xnfp = xnfp;
1991 	xnfp->xmit_buffer_count++;
1992 
1993 	return (bdesc);
1994 
1995 late_failure:
1996 	ddi_dma_free_handle(&bdesc->dma_handle);
1997 
1998 failure:
1999 	kmem_free(bdesc, sizeof (*bdesc));
2000 	return (NULL);
2001 }
2002 
2003 /*
2004  * Allocate a DMA-able receive buffer, including a structure to
2005  * keep track of the buffer.  Called with rx_buf_mutex held.
2006  */
2007 static struct xnf_buffer_desc *
2008 xnf_alloc_buffer(xnf_t *xnfp)
2009 {
2010 	struct			xnf_buffer_desc *bdesc;
2011 	size_t			len;
2012 	uint_t			ncookies;
2013 	ddi_dma_cookie_t	dma_cookie;
2014 	long			cnt;
2015 	pfn_t			pfn;
2016 
2017 	if (xnfp->recv_buffer_count >= xnfp->max_recv_bufs)
2018 		return (NULL);
2019 
2020 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
2021 		return (NULL);
2022 
2023 	/* allocate a DMA access handle for receive buffer */
2024 	if (ddi_dma_alloc_handle(xnfp->devinfo, &rx_buffer_dma_attr,
2025 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2026 		goto failure;
2027 
2028 	/* Allocate DMA-able memory for receive buffer */
2029 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2030 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
2031 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2032 		goto late_failure;
2033 
2034 	/* bind to virtual address of buffer to get physical address */
2035 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2036 	    bdesc->buf, PAGESIZE, DDI_DMA_READ | DDI_DMA_STREAMING,
2037 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2038 		goto late_late_failure;
2039 
2040 	bdesc->buf_phys = dma_cookie.dmac_laddress;
2041 	bdesc->xnfp = xnfp;
2042 	bdesc->free_rtn.free_func = xnf_rcv_complete;
2043 	bdesc->free_rtn.free_arg = (char *)bdesc;
2044 	bdesc->grant_ref = GRANT_INVALID_REF;
2045 	ASSERT(ncookies == 1);
2046 
2047 	xnfp->recv_buffer_count++;
2048 	/*
2049 	 * Unmap the page, and hand the machine page back
2050 	 * to xen so it can be used as a backend net buffer.
2051 	 */
2052 	pfn = xnf_btop(bdesc->buf_phys);
2053 	cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
2054 	if (cnt != 1) {
2055 		cmn_err(CE_WARN, "unable to give a page back to the "
2056 		    "hypervisor\n");
2057 	}
2058 
2059 	return (bdesc);
2060 
2061 late_late_failure:
2062 	ddi_dma_mem_free(&bdesc->acc_handle);
2063 
2064 late_failure:
2065 	ddi_dma_free_handle(&bdesc->dma_handle);
2066 
2067 failure:
2068 	kmem_free(bdesc, sizeof (*bdesc));
2069 	return (NULL);
2070 }
2071 
2072 static int
2073 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2074 {
2075 	xnf_t *xnfp = arg;
2076 
2077 	mutex_enter(&xnfp->intrlock);
2078 	mutex_enter(&xnfp->txlock);
2079 
2080 #define	map_stat(q, r)				\
2081 	case (MAC_STAT_##q):			\
2082 		*val = xnfp->stat_##r;		\
2083 		break
2084 
2085 	switch (stat) {
2086 
2087 	map_stat(IPACKETS, ipackets);
2088 	map_stat(OPACKETS, opackets);
2089 	map_stat(RBYTES, rbytes);
2090 	map_stat(OBYTES, obytes);
2091 	map_stat(NORCVBUF, norcvbuf);
2092 	map_stat(IERRORS, errrcv);
2093 	map_stat(NOXMTBUF, xmit_defer);
2094 
2095 	default:
2096 		mutex_exit(&xnfp->txlock);
2097 		mutex_exit(&xnfp->intrlock);
2098 
2099 		return (ENOTSUP);
2100 	}
2101 
2102 #undef map_stat
2103 
2104 	mutex_exit(&xnfp->txlock);
2105 	mutex_exit(&xnfp->intrlock);
2106 
2107 	return (0);
2108 }
2109 
2110 /*ARGSUSED*/
2111 static void
2112 xnf_blank(void *arg, time_t ticks, uint_t count)
2113 {
2114 	/*
2115 	 * XXPV dme: blanking is not currently implemented.
2116 	 *
2117 	 * It's not obvious how to use the 'ticks' argument here.
2118 	 *
2119 	 * 'Count' might be used as an indicator of how to set
2120 	 * rsp_event when posting receive buffers to the rx_ring.  It
2121 	 * would replace the code at the tail of xnf_process_recv()
2122 	 * that simply indicates that the next completed packet should
2123 	 * cause an interrupt.
2124 	 */
2125 }
2126 
2127 static void
2128 xnf_resources(void *arg)
2129 {
2130 	xnf_t *xnfp = arg;
2131 	mac_rx_fifo_t mrf;
2132 
2133 	mrf.mrf_type = MAC_RX_FIFO;
2134 	mrf.mrf_blank = xnf_blank;
2135 	mrf.mrf_arg = (void *)xnfp;
2136 	mrf.mrf_normal_blank_time = 128;	/* XXPV dme: see xnf_blank() */
2137 	mrf.mrf_normal_pkt_count = 8;		/* XXPV dme: see xnf_blank() */
2138 
2139 	xnfp->rx_handle = mac_resource_add(xnfp->mh,
2140 	    (mac_resource_t *)&mrf);
2141 }
2142 
2143 /*ARGSUSED*/
2144 static void
2145 xnf_ioctl(void *arg, queue_t *q, mblk_t *mp)
2146 {
2147 	miocnak(q, mp, 0, EINVAL);
2148 }
2149 
2150 static boolean_t
2151 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
2152 {
2153 	xnf_t *xnfp = arg;
2154 
2155 	switch (cap) {
2156 	case MAC_CAPAB_HCKSUM: {
2157 		uint32_t *capab = cap_data;
2158 
2159 		if (xnfp->cksum_offload)
2160 			*capab = HCKSUM_INET_FULL_V4;
2161 		else
2162 			*capab = 0;
2163 		break;
2164 	}
2165 
2166 	case MAC_CAPAB_POLL:
2167 		/* Just return B_TRUE. */
2168 		break;
2169 
2170 	default:
2171 		return (B_FALSE);
2172 	}
2173 
2174 	return (B_TRUE);
2175 }
2176 
2177 /*ARGSUSED*/
2178 static void
2179 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
2180     void *arg, void *impl_data)
2181 {
2182 	xnf_t *xnfp = ddi_get_driver_private(dip);
2183 	XenbusState new_state = *(XenbusState *)impl_data;
2184 
2185 	ASSERT(xnfp != NULL);
2186 
2187 	switch (new_state) {
2188 	case XenbusStateConnected:
2189 		mutex_enter(&xnfp->intrlock);
2190 		mutex_enter(&xnfp->txlock);
2191 
2192 		xnfp->connected = B_TRUE;
2193 		cv_broadcast(&xnfp->cv);
2194 
2195 		mutex_exit(&xnfp->txlock);
2196 		mutex_exit(&xnfp->intrlock);
2197 
2198 		ec_notify_via_evtchn(xnfp->evtchn);
2199 		break;
2200 
2201 	default:
2202 		break;
2203 	}
2204 }
2205