xref: /titanic_50/usr/src/uts/common/xen/io/xnf.c (revision a856bf0569d60e1d5715fdbd2cfbf389c2a720d7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  *
31  * Copyright (c) 2004 Christian Limpach.
32  * All rights reserved.
33  *
34  * Redistribution and use in source and binary forms, with or without
35  * modification, are permitted provided that the following conditions
36  * are met:
37  * 1. Redistributions of source code must retain the above copyright
38  *    notice, this list of conditions and the following disclaimer.
39  * 2. Redistributions in binary form must reproduce the above copyright
40  *    notice, this list of conditions and the following disclaimer in the
41  *    documentation and/or other materials provided with the distribution.
42  * 3. This section intentionally left blank.
43  * 4. The name of the author may not be used to endorse or promote products
44  *    derived from this software without specific prior written permission.
45  *
46  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
47  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
48  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
49  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
50  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
51  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
52  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
53  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
54  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
55  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
56  */
57 /*
58  * Section 3 of the above license was updated in response to bug 6379571.
59  */
60 
61 /*
62  * xnf.c - Nemo-based network driver for domU
63  */
64 
65 #include <sys/types.h>
66 #include <sys/hypervisor.h>
67 #include <sys/debug.h>
68 #include <sys/errno.h>
69 #include <sys/param.h>
70 #include <sys/sysmacros.h>
71 #include <sys/systm.h>
72 #include <sys/stropts.h>
73 #include <sys/stream.h>
74 #include <sys/strsubr.h>
75 #include <sys/kmem.h>
76 #include <sys/conf.h>
77 #include <sys/ddi.h>
78 #include <sys/devops.h>
79 #include <sys/sunddi.h>
80 #include <sys/sunndi.h>
81 #include <sys/ksynch.h>
82 #include <sys/dlpi.h>
83 #include <sys/ethernet.h>
84 #include <sys/strsun.h>
85 #include <sys/pattr.h>
86 #include <inet/common.h>
87 #include <inet/ip.h>
88 #include <sys/stat.h>
89 #include <sys/modctl.h>
90 #include <sys/mac.h>
91 #include <sys/mac_ether.h>
92 #include <sys/atomic.h>
93 #include <sys/errno.h>
94 #include <sys/machsystm.h>
95 #include <sys/bootconf.h>
96 #include <sys/bootsvcs.h>
97 #include <sys/bootinfo.h>
98 #include <sys/promif.h>
99 #include <sys/archsystm.h>
100 #include <sys/gnttab.h>
101 #include <sys/mach_mmu.h>
102 #include <xen/public/memory.h>
103 
104 #include "xnf.h"
105 
106 #include <sys/evtchn_impl.h>
107 #include <sys/balloon_impl.h>
108 #include <xen/sys/xendev.h>
109 
/*
 *  Declarations and Module Linkage
 */

/* Driver description; prefix of the modldrv short description below. */
#define	IDENT	"Virtual Ethernet driver"

/*
 * Debug tracing is only compiled in for DEBUG or lint builds.  xnfdebug
 * is a patchable bitmask that is tested against XNF_DEBUG_* flags (for
 * example XNF_DEBUG_DDI and XNF_DEBUG_TRACE) before each debug printf.
 */
#if defined(DEBUG) || defined(__lint)
#define	XNF_DEBUG
int	xnfdebug = 0;
#endif

/*
 * On a 32 bit PAE system physical and machine addresses are larger
 * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 * argument, and so addresses above 4G are truncated before ddi_btop()
 * gets to see them.  To avoid this, code the shift operation here.
 */
#define	xnf_btop(addr)	((addr) >> PAGESHIFT)

/*
 * Default for the per-instance cksum_offload flag, copied in
 * xnf_attach().  xnf_be_connect() clears the per-instance flag when the
 * backend reports "feature-no-csum-offload".
 */
boolean_t	xnf_cksum_offload = B_TRUE;
/*
 * Should pages used for transmit be readonly for the peer?
 * Copied into the per-instance tx_pages_readonly flag in xnf_attach().
 */
boolean_t	xnf_tx_pages_readonly = B_FALSE;
/*
 * Packets under this size are bcopied instead of using desballoc.
 * Choose a value > XNF_FRAMESIZE (1514) to force the receive path to
 * always copy.
 */
unsigned int	xnf_rx_bcopy_thresh = 64;

/*
 * NOTE(review): presumably the maximum number of fragments used for a
 * single transmitted packet; its use is not visible in the part of the
 * file reviewed -- confirm.
 */
unsigned int	xnf_max_tx_frags = 1;
142 
/*
 * Forward declarations.  Everything is file-local except
 * xnf_send_driver_status().
 */

/* Required system entry points */
static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);

/* Required driver entry points for Nemo */
static int	xnf_start(void *);
static void	xnf_stop(void *);
static int	xnf_set_mac_addr(void *, const uint8_t *);
static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
static int	xnf_set_promiscuous(void *, boolean_t);
static mblk_t	*xnf_send(void *, mblk_t *);
static uint_t	xnf_intr(caddr_t);
static int	xnf_stat(void *, uint_t, uint64_t *);
static void	xnf_blank(void *, time_t, uint_t);
static void	xnf_resources(void *);
static void	xnf_ioctl(void *, queue_t *, mblk_t *);
static boolean_t xnf_getcapab(void *, mac_capab_t, void *);

/* Driver private functions */
static int xnf_alloc_dma_resources(xnf_t *);
static void xnf_release_dma_resources(xnf_t *);
static mblk_t *xnf_process_recv(xnf_t *);
static void xnf_rcv_complete(struct xnf_buffer_desc *);
static void xnf_release_mblks(xnf_t *);
static struct xnf_buffer_desc *xnf_alloc_xmit_buffer(xnf_t *);
static struct xnf_buffer_desc *xnf_alloc_buffer(xnf_t *);
static struct xnf_buffer_desc *xnf_get_xmit_buffer(xnf_t *);
static struct xnf_buffer_desc *xnf_get_buffer(xnf_t *);
static void xnf_free_buffer(struct xnf_buffer_desc *);
static void xnf_free_xmit_buffer(struct xnf_buffer_desc *);
/* NOTE(review): not static; presumably called from outside -- confirm. */
void xnf_send_driver_status(int, int);
static void rx_buffer_hang(xnf_t *, struct xnf_buffer_desc *);
static int xnf_clean_tx_ring(xnf_t  *);
/* Event handler registered for XS_OE_STATE in xnf_attach(). */
static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
178 
/*
 * Entry points handed to the MAC layer through macp->m_callbacks in
 * xnf_attach().  The initializer is positional; the first member is
 * presumably the set of optional callbacks supplied (resources, ioctl
 * and getcapab here) -- confirm against mac_callbacks_t.
 *
 * XXPV dme: remove MC_IOCTL?
 */
static mac_callbacks_t xnf_callbacks = {
	MC_RESOURCES | MC_IOCTL | MC_GETCAPAB,
	xnf_stat,
	xnf_start,
	xnf_stop,
	xnf_set_promiscuous,
	xnf_set_multicast,
	xnf_set_mac_addr,
	xnf_send,
	xnf_resources,
	xnf_ioctl,
	xnf_getcapab
};

/* Grant reference value used in this file to mean "no grant held". */
#define	GRANT_INVALID_REF	0
/*
 * Receive buffer watermarks, expressed in multiples of the rx ring size.
 * NOTE(review): their consumers are not visible in the part of the file
 * reviewed; presumably they bound the receive buffer pool -- confirm.
 */
int xnf_recv_bufs_lowat = 4 * NET_RX_RING_SIZE;
int xnf_recv_bufs_hiwat = 8 * NET_RX_RING_SIZE; /* default max */
199 
/*
 * The three ddi_dma_attr_t structures below currently hold identical
 * values: any address is acceptable, the memory must be page aligned
 * (MMU_PAGESIZE) and must bind to exactly one segment.
 */

/* DMA attributes for network ring buffer */
static ddi_dma_attr_t ringbuf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};

/* DMA attributes for transmit data */
static ddi_dma_attr_t tx_buffer_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};

/* DMA attributes for a receive buffer */
static ddi_dma_attr_t rx_buffer_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};

/* DMA access attributes for registers and descriptors */
static ddi_device_acc_attr_t accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
	DDI_STRICTORDER_ACC
};

/* DMA access attributes for data: NOT to be byte swapped. */
static ddi_device_acc_attr_t data_accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};
261 
/* The all-ones Ethernet broadcast address. */
unsigned char xnf_broadcastaddr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
int xnf_diagnose = 0; /* Patchable global for diagnostic purposes */

/*
 * Device operations.  Only attach and detach are driver specific; the
 * other slots are filled with nulldev, nodev or NULL.  _init() passes
 * this structure to mac_init_ops().
 */
DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
    nodev, NULL, D_MP, NULL);

static struct modldrv xnf_modldrv = {
	&mod_driverops,		/* Type of module.  This one is a driver */
	IDENT " %I%",		/* short description */
	&xnf_dev_ops		/* driver specific ops */
};

/* Linkage structure used by mod_install() in _init() and by _info(). */
static struct modlinkage modlinkage = {
	MODREV_1, &xnf_modldrv, NULL
};
277 
278 int
279 _init(void)
280 {
281 	int r;
282 
283 	mac_init_ops(&xnf_dev_ops, "xnf");
284 	r = mod_install(&modlinkage);
285 	if (r != DDI_SUCCESS)
286 		mac_fini_ops(&xnf_dev_ops);
287 
288 	return (r);
289 }
290 
/*
 * Module unload entry point.  Unloading is refused unconditionally by
 * returning EBUSY.
 */
int
_fini(void)
{
	return (EBUSY); /* XXPV dme: should be removable */
}
296 
/*
 * Module information entry point: reports on this module's linkage
 * through mod_info() and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
302 
/*
 * Statistics.
 *
 * Names of the driver-private ("aux_statistics") named kstats.  The
 * order here must match the order in which xnf_kstat_aux_update()
 * stores the values.
 */
/* XXPV: most of these names need re-"nice"ing */
static char *xnf_aux_statistics[] = {
	"tx_cksum_deferred",
	"rx_cksum_no_need",
	"intr",
	"xmit_pullup",
	"xmit_pagebndry",
	"xmit_attempt",
	"rx_no_ringbuf",
	"mac_rcv_error",
	"runt",
};
318 
319 static int
320 xnf_kstat_aux_update(kstat_t *ksp, int flag)
321 {
322 	xnf_t *xnfp;
323 	kstat_named_t *knp;
324 
325 	if (flag != KSTAT_READ)
326 		return (EACCES);
327 
328 	xnfp = ksp->ks_private;
329 	knp = ksp->ks_data;
330 
331 	/*
332 	 * Assignment order should match that of the names in
333 	 * xnf_aux_statistics.
334 	 */
335 	(knp++)->value.ui64 = xnfp->stat_tx_cksum_deferred;
336 	(knp++)->value.ui64 = xnfp->stat_rx_cksum_no_need;
337 
338 	(knp++)->value.ui64 = xnfp->stat_intr;
339 	(knp++)->value.ui64 = xnfp->stat_xmit_pullup;
340 	(knp++)->value.ui64 = xnfp->stat_xmit_pagebndry;
341 	(knp++)->value.ui64 = xnfp->stat_xmit_attempt;
342 	(knp++)->value.ui64 = xnfp->stat_rx_no_ringbuf;
343 	(knp++)->value.ui64 = xnfp->stat_mac_rcv_error;
344 	(knp++)->value.ui64 = xnfp->stat_runt;
345 
346 	return (0);
347 }
348 
349 static boolean_t
350 xnf_kstat_init(xnf_t *xnfp)
351 {
352 	int nstat = sizeof (xnf_aux_statistics) /
353 	    sizeof (xnf_aux_statistics[0]);
354 	char **cp = xnf_aux_statistics;
355 	kstat_named_t *knp;
356 
357 	/*
358 	 * Create and initialise kstats.
359 	 */
360 	if ((xnfp->kstat_aux = kstat_create("xnf",
361 	    ddi_get_instance(xnfp->devinfo),
362 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
363 	    nstat, 0)) == NULL)
364 		return (B_FALSE);
365 
366 	xnfp->kstat_aux->ks_private = xnfp;
367 	xnfp->kstat_aux->ks_update = xnf_kstat_aux_update;
368 
369 	knp = xnfp->kstat_aux->ks_data;
370 	while (nstat > 0) {
371 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
372 
373 		knp++;
374 		cp++;
375 		nstat--;
376 	}
377 
378 	kstat_install(xnfp->kstat_aux);
379 
380 	return (B_TRUE);
381 }
382 
383 static int
384 xnf_setup_rings(xnf_t *xnfp)
385 {
386 	int			ix, err;
387 	RING_IDX		i;
388 	struct xnf_buffer_desc *bdesc, *rbp;
389 	struct xenbus_device *xsd;
390 	domid_t oeid;
391 
392 	oeid = xvdi_get_oeid(xnfp->devinfo);
393 	xsd = xvdi_get_xsd(xnfp->devinfo);
394 
395 	if (xnfp->tx_ring_ref != GRANT_INVALID_REF)
396 		gnttab_end_foreign_access(xnfp->tx_ring_ref, 0, 0);
397 
398 	err = gnttab_grant_foreign_access(oeid,
399 	    xnf_btop(pa_to_ma(xnfp->tx_ring_phys_addr)), 0);
400 	if (err <= 0) {
401 		err = -err;
402 		xenbus_dev_error(xsd, err, "granting access to tx ring page");
403 		goto out;
404 	}
405 	xnfp->tx_ring_ref = (grant_ref_t)err;
406 
407 	if (xnfp->rx_ring_ref != GRANT_INVALID_REF)
408 		gnttab_end_foreign_access(xnfp->rx_ring_ref, 0, 0);
409 
410 	err = gnttab_grant_foreign_access(oeid,
411 	    xnf_btop(pa_to_ma(xnfp->rx_ring_phys_addr)), 0);
412 	if (err <= 0) {
413 		err = -err;
414 		xenbus_dev_error(xsd, err, "granting access to rx ring page");
415 		goto out;
416 	}
417 	xnfp->rx_ring_ref = (grant_ref_t)err;
418 
419 
420 	mutex_enter(&xnfp->intrlock);
421 
422 	/*
423 	 * Cleanup the TX ring.  We just clean up any valid tx_pktinfo structs
424 	 * and reset the ring.  Note that this can lose packets after a resume,
425 	 * but we expect to stagger on.
426 	 */
427 	mutex_enter(&xnfp->txlock);
428 
429 	for (i = 0; i < xnfp->n_xmits; i++) {
430 		struct tx_pktinfo *txp = &xnfp->tx_pkt_info[i];
431 
432 		txp->id = i + 1;
433 
434 		if (txp->grant_ref == GRANT_INVALID_REF) {
435 			ASSERT(txp->mp == NULL);
436 			ASSERT(txp->bdesc == NULL);
437 			continue;
438 		}
439 
440 		if (gnttab_query_foreign_access(txp->grant_ref) != 0)
441 			panic("tx grant still in use by backend domain");
442 
443 		freemsg(txp->mp);
444 		txp->mp = NULL;
445 
446 		(void) ddi_dma_unbind_handle(txp->dma_handle);
447 
448 		if (txp->bdesc != NULL) {
449 			xnf_free_xmit_buffer(txp->bdesc);
450 			txp->bdesc = NULL;
451 		}
452 
453 		(void) gnttab_end_foreign_access_ref(txp->grant_ref,
454 		    xnfp->tx_pages_readonly);
455 		gnttab_release_grant_reference(&xnfp->gref_tx_head,
456 		    txp->grant_ref);
457 		txp->grant_ref = GRANT_INVALID_REF;
458 	}
459 
460 	xnfp->tx_pkt_id_list = 0;
461 	xnfp->tx_ring.rsp_cons = 0;
462 	xnfp->tx_ring.sring->req_prod = 0;
463 	xnfp->tx_ring.sring->rsp_prod = 0;
464 	xnfp->tx_ring.sring->rsp_event = 1;
465 
466 	mutex_exit(&xnfp->txlock);
467 
468 	/*
469 	 * Rebuild the RX ring.  We have to rebuild the RX ring because some of
470 	 * our pages are currently flipped out so we can't just free the RX
471 	 * buffers.  Reclaim any unprocessed recv buffers, they won't be
472 	 * useable anyway since the mfn's they refer to are no longer valid.
473 	 * Grant the backend domain access to each hung rx buffer.
474 	 */
475 	i = xnfp->rx_ring.rsp_cons;
476 	while (i++ != xnfp->rx_ring.sring->req_prod) {
477 		volatile netif_rx_request_t	*rxrp;
478 
479 		rxrp = RING_GET_REQUEST(&xnfp->rx_ring, i);
480 		ix = rxrp - RING_GET_REQUEST(&xnfp->rx_ring, 0);
481 		rbp = xnfp->rxpkt_bufptr[ix];
482 		if (rbp != NULL) {
483 			ASSERT(rbp->grant_ref != GRANT_INVALID_REF);
484 			gnttab_grant_foreign_transfer_ref(rbp->grant_ref,
485 			    oeid);
486 			rxrp->id = ix;
487 			rxrp->gref = rbp->grant_ref;
488 		}
489 	}
490 	/*
491 	 * Reset the ring pointers to initial state.
492 	 * Hang buffers for any empty ring slots.
493 	 */
494 	xnfp->rx_ring.rsp_cons = 0;
495 	xnfp->rx_ring.sring->req_prod = 0;
496 	xnfp->rx_ring.sring->rsp_prod = 0;
497 	xnfp->rx_ring.sring->rsp_event = 1;
498 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
499 		xnfp->rx_ring.req_prod_pvt = i;
500 		if (xnfp->rxpkt_bufptr[i] != NULL)
501 			continue;
502 		if ((bdesc = xnf_get_buffer(xnfp)) == NULL)
503 			break;
504 		rx_buffer_hang(xnfp, bdesc);
505 	}
506 	xnfp->rx_ring.req_prod_pvt = i;
507 	/* LINTED: constant in conditional context */
508 	RING_PUSH_REQUESTS(&xnfp->rx_ring);
509 
510 	mutex_exit(&xnfp->intrlock);
511 
512 	return (0);
513 
514 out:
515 	if (xnfp->tx_ring_ref != GRANT_INVALID_REF)
516 		gnttab_end_foreign_access(xnfp->tx_ring_ref, 0, 0);
517 	xnfp->tx_ring_ref = GRANT_INVALID_REF;
518 
519 	if (xnfp->rx_ring_ref != GRANT_INVALID_REF)
520 		gnttab_end_foreign_access(xnfp->rx_ring_ref, 0, 0);
521 	xnfp->rx_ring_ref = GRANT_INVALID_REF;
522 
523 	return (err);
524 }
525 
526 /*
527  * Connect driver to back end, called to set up communication with
528  * back end driver both initially and on resume after restore/migrate.
529  */
530 void
531 xnf_be_connect(xnf_t *xnfp)
532 {
533 	char		mac[ETHERADDRL * 3];
534 	const char	*message;
535 	xenbus_transaction_t xbt;
536 	struct xenbus_device *xsd;
537 	char		*xsname;
538 	int		err, be_no_cksum_offload;
539 
540 	ASSERT(!xnfp->connected);
541 
542 	xsd = xvdi_get_xsd(xnfp->devinfo);
543 	xsname = xvdi_get_xsname(xnfp->devinfo);
544 
545 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->devinfo), "mac",
546 	    "%s", (char *)&mac[0]);
547 	if (err != 0) {
548 		/*
549 		 * bad: we're supposed to be set up with a proper mac
550 		 * addr. at this point
551 		 */
552 		cmn_err(CE_WARN, "%s%d: no mac address",
553 		    ddi_driver_name(xnfp->devinfo),
554 		    ddi_get_instance(xnfp->devinfo));
555 			return;
556 	}
557 
558 	if (ether_aton(mac, xnfp->mac_addr) != ETHERADDRL) {
559 		err = ENOENT;
560 		xenbus_dev_error(xsd, ENOENT, "parsing %s/mac", xsname);
561 		return;
562 	}
563 
564 	err = xnf_setup_rings(xnfp);
565 	if (err != 0) {
566 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
567 		xenbus_dev_error(xsd, err, "setting up ring");
568 		return;
569 	}
570 
571 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->devinfo),
572 	    "feature-no-csum-offload", "%d", &be_no_cksum_offload);
573 	/*
574 	 * If we fail to read the store we assume that the key is
575 	 * absent, implying an older domain at the far end.  Older
576 	 * domains always support checksum offload.
577 	 */
578 	if (err != 0)
579 		be_no_cksum_offload = 0;
580 	/*
581 	 * If the far end cannot do checksum offload or we do not wish
582 	 * to do it, disable it.
583 	 */
584 	if ((be_no_cksum_offload == 1) || !xnfp->cksum_offload)
585 		xnfp->cksum_offload = B_FALSE;
586 
587 again:
588 	err = xenbus_transaction_start(&xbt);
589 	if (err != 0) {
590 		xenbus_dev_error(xsd, EIO, "starting transaction");
591 		return;
592 	}
593 
594 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
595 	    xnfp->tx_ring_ref);
596 	if (err != 0) {
597 		message = "writing tx ring-ref";
598 		goto abort_transaction;
599 	}
600 
601 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
602 	    xnfp->rx_ring_ref);
603 	if (err != 0) {
604 		message = "writing rx ring-ref";
605 		goto abort_transaction;
606 	}
607 
608 	err = xenbus_printf(xbt, xsname, "event-channel", "%u", xnfp->evtchn);
609 	if (err != 0) {
610 		message = "writing event-channel";
611 		goto abort_transaction;
612 	}
613 
614 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
615 	if (err != 0) {
616 		message = "writing feature-rx-notify";
617 		goto abort_transaction;
618 	}
619 
620 	if (!xnfp->tx_pages_readonly) {
621 		err = xenbus_printf(xbt, xsname, "feature-tx-writable",
622 		    "%d", 1);
623 		if (err != 0) {
624 			message = "writing feature-tx-writable";
625 			goto abort_transaction;
626 		}
627 	}
628 
629 	err = xenbus_printf(xbt, xsname, "feature-no-csum-offload", "%d",
630 	    xnfp->cksum_offload ? 0 : 1);
631 	if (err != 0) {
632 		message = "writing feature-no-csum-offload";
633 		goto abort_transaction;
634 	}
635 
636 	err = xenbus_printf(xbt, xsname, "state", "%d", XenbusStateConnected);
637 	if (err != 0) {
638 		message = "writing frontend XenbusStateConnected";
639 		goto abort_transaction;
640 	}
641 
642 	err = xenbus_transaction_end(xbt, 0);
643 	if (err != 0) {
644 		if (err == EAGAIN)
645 			goto again;
646 		xenbus_dev_error(xsd, err, "completing transaction");
647 	}
648 
649 	return;
650 
651 abort_transaction:
652 	(void) xenbus_transaction_end(xbt, 1);
653 	xenbus_dev_error(xsd, err, "%s", message);
654 }
655 
656 /*
657  *  attach(9E) -- Attach a device to the system
658  *
659  *  Called once for each board successfully probed.
660  */
661 static int
662 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
663 {
664 	mac_register_t *macp;
665 	xnf_t *xnfp;
666 	int err;
667 
668 #ifdef XNF_DEBUG
669 	if (xnfdebug & XNF_DEBUG_DDI)
670 		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
671 		    (void *)devinfo);
672 #endif
673 
674 	switch (cmd) {
675 	case DDI_RESUME:
676 		xnfp = ddi_get_driver_private(devinfo);
677 
678 		(void) xvdi_resume(devinfo);
679 		(void) xvdi_alloc_evtchn(devinfo);
680 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
681 		    (caddr_t)xnfp);
682 		xnfp->evtchn = xvdi_get_evtchn(devinfo);
683 		xnf_be_connect(xnfp);
684 		/*
685 		 * Our MAC address didn't necessarily change, but
686 		 * given that we may be resuming this OS instance
687 		 * on a different machine (or on the same one and got a
688 		 * different MAC address because we didn't specify one of
689 		 * our own), it's useful to claim that
690 		 * it changed in order that IP send out a
691 		 * gratuitous ARP.
692 		 */
693 		mac_unicst_update(xnfp->mh, xnfp->mac_addr);
694 		return (DDI_SUCCESS);
695 
696 	case DDI_ATTACH:
697 		break;
698 
699 	default:
700 		return (DDI_FAILURE);
701 	}
702 
703 	/*
704 	 *  Allocate gld_mac_info_t and xnf_instance structures
705 	 */
706 	macp = mac_alloc(MAC_VERSION);
707 	if (macp == NULL)
708 		return (DDI_FAILURE);
709 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
710 
711 	macp->m_dip = devinfo;
712 	macp->m_driver = xnfp;
713 	xnfp->devinfo = devinfo;
714 
715 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
716 	macp->m_src_addr = xnfp->mac_addr;
717 	macp->m_callbacks = &xnf_callbacks;
718 	macp->m_min_sdu = 0;
719 	macp->m_max_sdu = XNF_MAXPKT;
720 
721 	xnfp->running = B_FALSE;
722 	xnfp->connected = B_FALSE;
723 	xnfp->cksum_offload = xnf_cksum_offload;
724 	xnfp->tx_pages_readonly = xnf_tx_pages_readonly;
725 
726 	/*
727 	 * Get the iblock cookie with which to initialize the mutexes.
728 	 */
729 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->icookie)
730 	    != DDI_SUCCESS)
731 		goto failure;
732 	/*
733 	 * Driver locking strategy: the txlock protects all paths
734 	 * through the driver, except the interrupt thread.
735 	 * If the interrupt thread needs to do something which could
736 	 * affect the operation of any other part of the driver,
737 	 * it needs to acquire the txlock mutex.
738 	 */
739 	mutex_init(&xnfp->tx_buf_mutex,
740 	    NULL, MUTEX_DRIVER, xnfp->icookie);
741 	mutex_init(&xnfp->rx_buf_mutex,
742 	    NULL, MUTEX_DRIVER, xnfp->icookie);
743 	mutex_init(&xnfp->txlock,
744 	    NULL, MUTEX_DRIVER, xnfp->icookie);
745 	mutex_init(&xnfp->intrlock,
746 	    NULL, MUTEX_DRIVER, xnfp->icookie);
747 	cv_init(&xnfp->cv, NULL, CV_DEFAULT, NULL);
748 
749 	if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
750 	    &xnfp->gref_tx_head) < 0) {
751 		cmn_err(CE_WARN, "xnf%d: can't alloc tx grant refs",
752 		    ddi_get_instance(xnfp->devinfo));
753 		goto late_failure;
754 	}
755 	if (gnttab_alloc_grant_references(NET_RX_RING_SIZE,
756 	    &xnfp->gref_rx_head) < 0) {
757 		cmn_err(CE_WARN, "xnf%d: can't alloc rx grant refs",
758 		    ddi_get_instance(xnfp->devinfo));
759 		goto late_failure;
760 	}
761 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
762 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
763 		    "driver data structures", ddi_get_instance(xnfp->devinfo));
764 		goto late_failure;
765 	}
766 
767 	xnfp->rx_ring.sring->rsp_event = xnfp->tx_ring.sring->rsp_event = 1;
768 
769 	xnfp->tx_ring_ref = GRANT_INVALID_REF;
770 	xnfp->rx_ring_ref = GRANT_INVALID_REF;
771 
772 	/* set driver private pointer now */
773 	ddi_set_driver_private(devinfo, xnfp);
774 
775 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change)
776 	    != DDI_SUCCESS)
777 		goto late_failure;
778 
779 	if (!xnf_kstat_init(xnfp))
780 		goto very_late_failure;
781 
782 	/*
783 	 * Allocate an event channel, add the interrupt handler and
784 	 * bind it to the event channel.
785 	 */
786 	(void) xvdi_alloc_evtchn(devinfo);
787 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
788 	xnfp->evtchn = xvdi_get_evtchn(devinfo);
789 
790 	/*
791 	 * connect to the backend
792 	 */
793 	xnf_be_connect(xnfp);
794 
795 	err = mac_register(macp, &xnfp->mh);
796 	mac_free(macp);
797 	macp = NULL;
798 	if (err != 0)
799 		goto very_very_late_failure;
800 
801 	return (DDI_SUCCESS);
802 
803 very_very_late_failure:
804 	kstat_delete(xnfp->kstat_aux);
805 
806 very_late_failure:
807 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
808 	ddi_remove_intr(devinfo, 0, xnfp->icookie);
809 	xnfp->evtchn = INVALID_EVTCHN;
810 
811 late_failure:
812 	xnf_release_dma_resources(xnfp);
813 	cv_destroy(&xnfp->cv);
814 	mutex_destroy(&xnfp->rx_buf_mutex);
815 	mutex_destroy(&xnfp->txlock);
816 	mutex_destroy(&xnfp->intrlock);
817 
818 failure:
819 	kmem_free(xnfp, sizeof (*xnfp));
820 	if (macp != NULL)
821 		mac_free(macp);
822 
823 	return (DDI_FAILURE);
824 }
825 
826 /*  detach(9E) -- Detach a device from the system */
827 static int
828 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
829 {
830 	xnf_t *xnfp;		/* Our private device info */
831 	int i;
832 
833 #ifdef XNF_DEBUG
834 	if (xnfdebug & XNF_DEBUG_DDI)
835 		printf("xnf_detach(0x%p)\n", (void *)devinfo);
836 #endif
837 
838 	xnfp = ddi_get_driver_private(devinfo);
839 
840 	switch (cmd) {
841 	case DDI_SUSPEND:
842 		ddi_remove_intr(devinfo, 0, xnfp->icookie);
843 
844 		xvdi_suspend(devinfo);
845 
846 		mutex_enter(&xnfp->intrlock);
847 		mutex_enter(&xnfp->txlock);
848 
849 		xnfp->evtchn = INVALID_EVTCHN;
850 		xnfp->connected = B_FALSE;
851 		mutex_exit(&xnfp->txlock);
852 		mutex_exit(&xnfp->intrlock);
853 		return (DDI_SUCCESS);
854 
855 	case DDI_DETACH:
856 		break;
857 
858 	default:
859 		return (DDI_FAILURE);
860 	}
861 
862 	if (xnfp->connected)
863 		return (DDI_FAILURE);
864 
865 	/* Wait for receive buffers to be returned; give up after 5 seconds */
866 	i = 50;
867 
868 	mutex_enter(&xnfp->rx_buf_mutex);
869 	while (xnfp->rx_bufs_outstanding > 0) {
870 		mutex_exit(&xnfp->rx_buf_mutex);
871 		delay(drv_usectohz(100000));
872 		if (--i == 0) {
873 			cmn_err(CE_WARN,
874 			    "xnf%d: never reclaimed all the "
875 			    "receive buffers.  Still have %d "
876 			    "buffers outstanding.",
877 			    ddi_get_instance(xnfp->devinfo),
878 			    xnfp->rx_bufs_outstanding);
879 			return (DDI_FAILURE);
880 		}
881 		mutex_enter(&xnfp->rx_buf_mutex);
882 	}
883 	mutex_exit(&xnfp->rx_buf_mutex);
884 
885 	kstat_delete(xnfp->kstat_aux);
886 
887 	if (mac_unregister(xnfp->mh) != 0)
888 		return (DDI_FAILURE);
889 
890 	/* Stop the receiver */
891 	xnf_stop(xnfp);
892 
893 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
894 
895 	/* Remove the interrupt */
896 	ddi_remove_intr(devinfo, 0, xnfp->icookie);
897 
898 	/* Release any pending xmit mblks */
899 	xnf_release_mblks(xnfp);
900 
901 	/* Release all DMA resources */
902 	xnf_release_dma_resources(xnfp);
903 
904 	cv_destroy(&xnfp->cv);
905 	mutex_destroy(&xnfp->rx_buf_mutex);
906 	mutex_destroy(&xnfp->txlock);
907 	mutex_destroy(&xnfp->intrlock);
908 
909 	kmem_free(xnfp, sizeof (*xnfp));
910 
911 	return (DDI_SUCCESS);
912 }
913 
914 /*
915  *  xnf_set_mac_addr() -- set the physical network address on the board.
916  */
917 /*ARGSUSED*/
918 static int
919 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
920 {
921 	xnf_t *xnfp = arg;
922 
923 #ifdef XNF_DEBUG
924 	if (xnfdebug & XNF_DEBUG_TRACE)
925 		printf("xnf%d: set_mac_addr(0x%p): "
926 		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
927 		    ddi_get_instance(xnfp->devinfo),
928 		    (void *)xnfp, macaddr[0], macaddr[1], macaddr[2],
929 		    macaddr[3], macaddr[4], macaddr[5]);
930 #endif
931 	/*
932 	 * We can't set our macaddr.
933 	 *
934 	 * XXPV dme: Why not?
935 	 */
936 	return (ENOTSUP);
937 }
938 
939 /*
940  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
941  *
942  *  Program the hardware to enable/disable the multicast address
943  *  in "mcast".  Enable if "add" is true, disable if false.
944  */
945 /*ARGSUSED*/
946 static int
947 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
948 {
949 	xnf_t *xnfp = arg;
950 
951 #ifdef XNF_DEBUG
952 	if (xnfdebug & XNF_DEBUG_TRACE)
953 		printf("xnf%d set_multicast(0x%p): "
954 		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
955 		    ddi_get_instance(xnfp->devinfo),
956 		    (void *)xnfp, mca[0], mca[1], mca[2],
957 		    mca[3], mca[4], mca[5]);
958 #endif
959 
960 	/*
961 	 * XXPV dme: Ideally we'd relay the address to the backend for
962 	 * enabling.  The protocol doesn't support that (interesting
963 	 * extension), so we simply succeed and hope that the relevant
964 	 * packets are going to arrive.
965 	 *
966 	 * If protocol support is added for enable/disable then we'll
967 	 * need to keep a list of those in use and re-add on resume.
968 	 */
969 	return (0);
970 }
971 
972 /*
973  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
974  *
975  *  Program the hardware to enable/disable promiscuous mode.
976  */
977 /*ARGSUSED*/
978 static int
979 xnf_set_promiscuous(void *arg, boolean_t on)
980 {
981 	xnf_t *xnfp = arg;
982 
983 #ifdef XNF_DEBUG
984 	if (xnfdebug & XNF_DEBUG_TRACE)
985 		printf("xnf%d set_promiscuous(0x%p, %x)\n",
986 		    ddi_get_instance(xnfp->devinfo),
987 		    (void *)xnfp, on);
988 #endif
989 	/*
990 	 * We can't really do this, but we pretend that we can in
991 	 * order that snoop will work.
992 	 */
993 	return (0);
994 }
995 
996 /*
997  * Clean buffers that we have responses for from the transmit ring.
998  */
999 static int
1000 xnf_clean_tx_ring(xnf_t *xnfp)
1001 {
1002 	RING_IDX		next_resp, i;
1003 	struct tx_pktinfo	*reap;
1004 	int			id;
1005 	grant_ref_t		ref;
1006 
1007 	ASSERT(MUTEX_HELD(&xnfp->txlock));
1008 
1009 	do {
1010 		/*
1011 		 * index of next transmission ack
1012 		 */
1013 		next_resp = xnfp->tx_ring.sring->rsp_prod;
1014 		membar_consumer();
1015 		/*
1016 		 * Clean tx packets from ring that we have responses for
1017 		 */
1018 		for (i = xnfp->tx_ring.rsp_cons; i != next_resp; i++) {
1019 			id = RING_GET_RESPONSE(&xnfp->tx_ring, i)->id;
1020 			reap = &xnfp->tx_pkt_info[id];
1021 			ref = reap->grant_ref;
1022 			/*
1023 			 * Return id to free list
1024 			 */
1025 			reap->id = xnfp->tx_pkt_id_list;
1026 			xnfp->tx_pkt_id_list = id;
1027 			if (gnttab_query_foreign_access(ref) != 0)
1028 				panic("tx grant still in use"
1029 				    "by backend domain");
1030 			(void) ddi_dma_unbind_handle(reap->dma_handle);
1031 			(void) gnttab_end_foreign_access_ref(ref,
1032 			    xnfp->tx_pages_readonly);
1033 			gnttab_release_grant_reference(&xnfp->gref_tx_head,
1034 			    ref);
1035 			freemsg(reap->mp);
1036 			reap->mp = NULL;
1037 			reap->grant_ref = GRANT_INVALID_REF;
1038 			if (reap->bdesc != NULL)
1039 				xnf_free_xmit_buffer(reap->bdesc);
1040 			reap->bdesc = NULL;
1041 		}
1042 		xnfp->tx_ring.rsp_cons = next_resp;
1043 		membar_enter();
1044 	} while (next_resp != xnfp->tx_ring.sring->rsp_prod);
1045 	return (NET_TX_RING_SIZE - (xnfp->tx_ring.sring->req_prod - next_resp));
1046 }
1047 
1048 /*
1049  * If we need to pull up data from either a packet that crosses a page
1050  * boundary or consisting of multiple mblks, do it here.  We allocate
1051  * a page aligned buffer and copy the data into it.  The header for the
1052  * allocated buffer is returned. (which is also allocated here)
1053  */
1054 static struct xnf_buffer_desc *
1055 xnf_pullupmsg(xnf_t *xnfp, mblk_t *mp)
1056 {
1057 	struct xnf_buffer_desc	*bdesc;
1058 	mblk_t			*mptr;
1059 	caddr_t			bp;
1060 	int			len;
1061 
1062 	/*
1063 	 * get a xmit buffer from the xmit buffer pool
1064 	 */
1065 	mutex_enter(&xnfp->rx_buf_mutex);
1066 	bdesc = xnf_get_xmit_buffer(xnfp);
1067 	mutex_exit(&xnfp->rx_buf_mutex);
1068 	if (bdesc == NULL)
1069 		return (bdesc);
1070 	/*
1071 	 * Copy the data into the buffer
1072 	 */
1073 	xnfp->stat_xmit_pullup++;
1074 	bp = bdesc->buf;
1075 	for (mptr = mp; mptr != NULL; mptr = mptr->b_cont) {
1076 		len = mptr->b_wptr - mptr->b_rptr;
1077 		bcopy(mptr->b_rptr, bp, len);
1078 		bp += len;
1079 	}
1080 	return (bdesc);
1081 }
1082 
1083 /*
1084  *  xnf_send_one() -- send a packet
1085  *
1086  *  Called when a packet is ready to be transmitted. A pointer to an
1087  *  M_DATA message that contains the packet is passed to this routine.
1088  *  At least the complete LLC header is contained in the message's
1089  *  first message block, and the remainder of the packet is contained
1090  *  within additional M_DATA message blocks linked to the first
1091  *  message block.
1092  *
1093  */
1094 static boolean_t
1095 xnf_send_one(xnf_t *xnfp, mblk_t *mp)
1096 {
1097 	struct xnf_buffer_desc	*xmitbuf;
1098 	struct tx_pktinfo	*txp_info;
1099 	mblk_t			*mptr;
1100 	ddi_dma_cookie_t	dma_cookie;
1101 	RING_IDX		slot, txs_out;
1102 	int			length = 0, i, pktlen = 0, rc, tx_id;
1103 	int			tx_ring_freespace, page_oops;
1104 	uint_t			ncookies;
1105 	volatile netif_tx_request_t	*txrp;
1106 	caddr_t			bufaddr;
1107 	grant_ref_t		ref;
1108 	unsigned long		mfn;
1109 	uint32_t		pflags;
1110 	domid_t			oeid;
1111 
1112 #ifdef XNF_DEBUG
1113 	if (xnfdebug & XNF_DEBUG_SEND)
1114 		printf("xnf%d send(0x%p, 0x%p)\n",
1115 		    ddi_get_instance(xnfp->devinfo),
1116 		    (void *)xnfp, (void *)mp);
1117 #endif
1118 
1119 	ASSERT(mp != NULL);
1120 	ASSERT(mp->b_next == NULL);
1121 	ASSERT(MUTEX_HELD(&xnfp->txlock));
1122 
1123 	tx_ring_freespace = xnf_clean_tx_ring(xnfp);
1124 	ASSERT(tx_ring_freespace >= 0);
1125 
1126 	oeid = xvdi_get_oeid(xnfp->devinfo);
1127 	xnfp->stat_xmit_attempt++;
1128 	/*
1129 	 * If there are no xmit ring slots available, return.
1130 	 */
1131 	if (tx_ring_freespace == 0) {
1132 		xnfp->stat_xmit_defer++;
1133 		return (B_FALSE);	/* Send should be retried */
1134 	}
1135 
1136 	slot = xnfp->tx_ring.sring->req_prod;
1137 	/* Count the number of mblks in message and compute packet size */
1138 	for (i = 0, mptr = mp; mptr != NULL; mptr = mptr->b_cont, i++)
1139 		pktlen += (mptr->b_wptr - mptr->b_rptr);
1140 
1141 	/* Make sure packet isn't too large */
1142 	if (pktlen > XNF_FRAMESIZE) {
1143 		cmn_err(CE_WARN, "xnf%d: large packet %d bytes",
1144 		    ddi_get_instance(xnfp->devinfo), pktlen);
1145 		freemsg(mp);
1146 		return (B_FALSE);
1147 	}
1148 
1149 	/*
1150 	 * Test if we cross a page boundary with our buffer
1151 	 */
1152 	page_oops = (i == 1) &&
1153 	    (xnf_btop((size_t)mp->b_rptr) !=
1154 	    xnf_btop((size_t)(mp->b_rptr + pktlen)));
1155 	/*
1156 	 * XXPV - unfortunately, the Xen virtual net device currently
1157 	 * doesn't support multiple packet frags, so this will always
1158 	 * end up doing the pullup if we got more than one packet.
1159 	 */
1160 	if (i > xnf_max_tx_frags || page_oops) {
1161 		if (page_oops)
1162 			xnfp->stat_xmit_pagebndry++;
1163 		if ((xmitbuf = xnf_pullupmsg(xnfp, mp)) == NULL) {
1164 			/* could not allocate resources? */
1165 #ifdef XNF_DEBUG
1166 			cmn_err(CE_WARN, "xnf%d: pullupmsg failed",
1167 			    ddi_get_instance(xnfp->devinfo));
1168 #endif
1169 			xnfp->stat_xmit_defer++;
1170 			return (B_FALSE);	/* Retry send */
1171 		}
1172 		bufaddr = xmitbuf->buf;
1173 	} else {
1174 		xmitbuf = NULL;
1175 		bufaddr = (caddr_t)mp->b_rptr;
1176 	}
1177 
1178 	/* set up data descriptor */
1179 	length = pktlen;
1180 
1181 	/*
1182 	 * Get packet id from free list
1183 	 */
1184 	tx_id = xnfp->tx_pkt_id_list;
1185 	ASSERT(tx_id < NET_TX_RING_SIZE);
1186 	txp_info = &xnfp->tx_pkt_info[tx_id];
1187 	xnfp->tx_pkt_id_list = txp_info->id;
1188 	txp_info->id = tx_id;
1189 
1190 	/* Prepare for DMA mapping of tx buffer(s) */
1191 	rc = ddi_dma_addr_bind_handle(txp_info->dma_handle,
1192 	    NULL, bufaddr, length, DDI_DMA_WRITE | DDI_DMA_STREAMING,
1193 	    DDI_DMA_DONTWAIT, 0, &dma_cookie, &ncookies);
1194 	if (rc != DDI_DMA_MAPPED) {
1195 		ASSERT(rc != DDI_DMA_INUSE);
1196 		ASSERT(rc != DDI_DMA_PARTIAL_MAP);
1197 		/*
1198 		 *  Return id to free list
1199 		 */
1200 		txp_info->id = xnfp->tx_pkt_id_list;
1201 		xnfp->tx_pkt_id_list = tx_id;
1202 		if (rc == DDI_DMA_NORESOURCES) {
1203 			xnfp->stat_xmit_defer++;
1204 			return (B_FALSE); /* Retry later */
1205 		}
1206 #ifdef XNF_DEBUG
1207 		cmn_err(CE_WARN, "xnf%d: bind_handle failed (%x)",
1208 		    ddi_get_instance(xnfp->devinfo), rc);
1209 #endif
1210 		return (B_FALSE);
1211 	}
1212 
1213 	ASSERT(ncookies == 1);
1214 	ref = gnttab_claim_grant_reference(&xnfp->gref_tx_head);
1215 	ASSERT((signed short)ref >= 0);
1216 	mfn = xnf_btop(pa_to_ma((paddr_t)dma_cookie.dmac_laddress));
1217 	gnttab_grant_foreign_access_ref(ref, oeid, mfn,
1218 	    xnfp->tx_pages_readonly);
1219 	txp_info->grant_ref = ref;
1220 	txrp = RING_GET_REQUEST(&xnfp->tx_ring, slot);
1221 	txrp->gref = ref;
1222 	txrp->size = dma_cookie.dmac_size;
1223 	txrp->offset = (uintptr_t)bufaddr & PAGEOFFSET;
1224 	txrp->id = tx_id;
1225 	txrp->flags = 0;
1226 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &pflags);
1227 	if (pflags != 0) {
1228 		ASSERT(xnfp->cksum_offload);
1229 		/*
1230 		 * If the local protocol stack requests checksum
1231 		 * offload we set the 'checksum blank' flag,
1232 		 * indicating to the peer that we need the checksum
1233 		 * calculated for us.
1234 		 *
1235 		 * We _don't_ set the validated flag, because we haven't
1236 		 * validated that the data and the checksum match.
1237 		 */
1238 		txrp->flags |= NETTXF_csum_blank;
1239 		xnfp->stat_tx_cksum_deferred++;
1240 	}
1241 	membar_producer();
1242 	xnfp->tx_ring.sring->req_prod = slot + 1;
1243 
1244 	txp_info->mp = mp;
1245 	txp_info->bdesc = xmitbuf;
1246 
1247 	txs_out = xnfp->tx_ring.sring->req_prod - xnfp->tx_ring.sring->rsp_prod;
1248 	if (xnfp->tx_ring.sring->req_prod - xnfp->tx_ring.rsp_cons <
1249 	    XNF_TX_FREE_THRESH) {
1250 		/*
1251 		 * The ring is getting full; Set up this packet
1252 		 * to cause an interrupt.
1253 		 */
1254 		xnfp->tx_ring.sring->rsp_event =
1255 		    xnfp->tx_ring.sring->rsp_prod + txs_out;
1256 	}
1257 
1258 	xnfp->stat_opackets++;
1259 	xnfp->stat_obytes += pktlen;
1260 
1261 	return (B_TRUE);	/* successful transmit attempt */
1262 }
1263 
1264 mblk_t *
1265 xnf_send(void *arg, mblk_t *mp)
1266 {
1267 	xnf_t *xnfp = arg;
1268 	mblk_t *next;
1269 	boolean_t sent_something = B_FALSE;
1270 
1271 	mutex_enter(&xnfp->txlock);
1272 
1273 	/*
1274 	 * Transmission attempts should be impossible without having
1275 	 * previously called xnf_start().
1276 	 */
1277 	ASSERT(xnfp->running);
1278 
1279 	/*
1280 	 * Wait for getting connected to the backend
1281 	 */
1282 	while (!xnfp->connected) {
1283 		cv_wait(&xnfp->cv, &xnfp->txlock);
1284 	}
1285 
1286 	while (mp != NULL) {
1287 		next = mp->b_next;
1288 		mp->b_next = NULL;
1289 
1290 		if (!xnf_send_one(xnfp, mp)) {
1291 			mp->b_next = next;
1292 			break;
1293 		}
1294 
1295 		mp = next;
1296 		sent_something = B_TRUE;
1297 	}
1298 
1299 	if (sent_something)
1300 		ec_notify_via_evtchn(xnfp->evtchn);
1301 
1302 	mutex_exit(&xnfp->txlock);
1303 
1304 	return (mp);
1305 }
1306 
1307 /*
1308  *  xnf_intr() -- ring interrupt service routine
1309  */
1310 static uint_t
1311 xnf_intr(caddr_t arg)
1312 {
1313 	xnf_t *xnfp = (xnf_t *)arg;
1314 	int tx_ring_space;
1315 
1316 	mutex_enter(&xnfp->intrlock);
1317 
1318 	/*
1319 	 * If not connected to the peer or not started by the upper
1320 	 * layers we cannot usefully handle interrupts.
1321 	 */
1322 	if (!(xnfp->connected && xnfp->running)) {
1323 		mutex_exit(&xnfp->intrlock);
1324 		return (DDI_INTR_UNCLAIMED);
1325 	}
1326 
1327 #ifdef XNF_DEBUG
1328 	if (xnfdebug & XNF_DEBUG_INT)
1329 		printf("xnf%d intr(0x%p)\n",
1330 		    ddi_get_instance(xnfp->devinfo), (void *)xnfp);
1331 #endif
1332 	if (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->rx_ring)) {
1333 		mblk_t *mp;
1334 
1335 		if ((mp = xnf_process_recv(xnfp)) != NULL)
1336 			mac_rx(xnfp->mh, xnfp->rx_handle, mp);
1337 	}
1338 
1339 	/*
1340 	 * Is tx ring nearly full?
1341 	 */
1342 #define	inuse(r) ((r).sring->req_prod - (r).rsp_cons)
1343 
1344 	if ((NET_TX_RING_SIZE - inuse(xnfp->tx_ring)) < XNF_TX_FREE_THRESH) {
1345 		/*
1346 		 * Yes, clean it and try to start any blocked xmit
1347 		 * streams.
1348 		 */
1349 		mutex_enter(&xnfp->txlock);
1350 		tx_ring_space = xnf_clean_tx_ring(xnfp);
1351 		mutex_exit(&xnfp->txlock);
1352 		if (tx_ring_space > XNF_TX_FREE_THRESH) {
1353 			mutex_exit(&xnfp->intrlock);
1354 			mac_tx_update(xnfp->mh);
1355 			mutex_enter(&xnfp->intrlock);
1356 		} else {
1357 			/*
1358 			 * Schedule another tx interrupt when we have
1359 			 * sent enough packets to cross the threshold.
1360 			 */
1361 			xnfp->tx_ring.sring->rsp_event =
1362 			    xnfp->tx_ring.sring->rsp_prod +
1363 			    XNF_TX_FREE_THRESH - tx_ring_space + 1;
1364 		}
1365 	}
1366 #undef inuse
1367 
1368 	xnfp->stat_intr++;
1369 	mutex_exit(&xnfp->intrlock);
1370 	return (DDI_INTR_CLAIMED); /* indicate that the interrupt was for us */
1371 }
1372 
1373 /*
1374  *  xnf_start() -- start the board receiving and enable interrupts.
1375  */
1376 static int
1377 xnf_start(void *arg)
1378 {
1379 	xnf_t *xnfp = arg;
1380 
1381 #ifdef XNF_DEBUG
1382 	if (xnfdebug & XNF_DEBUG_TRACE)
1383 		printf("xnf%d start(0x%p)\n",
1384 		    ddi_get_instance(xnfp->devinfo), (void *)xnfp);
1385 #endif
1386 
1387 	mutex_enter(&xnfp->intrlock);
1388 	mutex_enter(&xnfp->txlock);
1389 
1390 	/* Accept packets from above. */
1391 	xnfp->running = B_TRUE;
1392 
1393 	mutex_exit(&xnfp->txlock);
1394 	mutex_exit(&xnfp->intrlock);
1395 
1396 	return (0);
1397 }
1398 
1399 /* xnf_stop() - disable hardware */
1400 static void
1401 xnf_stop(void *arg)
1402 {
1403 	xnf_t *xnfp = arg;
1404 
1405 #ifdef XNF_DEBUG
1406 	if (xnfdebug & XNF_DEBUG_TRACE)
1407 		printf("xnf%d stop(0x%p)\n",
1408 		    ddi_get_instance(xnfp->devinfo), (void *)xnfp);
1409 #endif
1410 
1411 	mutex_enter(&xnfp->intrlock);
1412 	mutex_enter(&xnfp->txlock);
1413 
1414 	xnfp->running = B_FALSE;
1415 
1416 	mutex_exit(&xnfp->txlock);
1417 	mutex_exit(&xnfp->intrlock);
1418 }
1419 
1420 /*
1421  * Driver private functions follow
1422  */
1423 
1424 /*
1425  * Hang buffer on rx ring
1426  */
1427 static void
1428 rx_buffer_hang(xnf_t *xnfp, struct xnf_buffer_desc *bdesc)
1429 {
1430 	volatile netif_rx_request_t	*reqp;
1431 	RING_IDX	hang_ix;
1432 	grant_ref_t ref;
1433 	domid_t oeid;
1434 
1435 	oeid = xvdi_get_oeid(xnfp->devinfo);
1436 
1437 	ASSERT(MUTEX_HELD(&xnfp->intrlock));
1438 	reqp = RING_GET_REQUEST(&xnfp->rx_ring, xnfp->rx_ring.req_prod_pvt);
1439 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->rx_ring, 0));
1440 	ASSERT(xnfp->rxpkt_bufptr[hang_ix] == NULL);
1441 	if (bdesc->grant_ref == GRANT_INVALID_REF) {
1442 		ref = gnttab_claim_grant_reference(&xnfp->gref_rx_head);
1443 		ASSERT((signed short)ref >= 0);
1444 		bdesc->grant_ref = ref;
1445 		gnttab_grant_foreign_transfer_ref(ref, oeid);
1446 	}
1447 	reqp->id = hang_ix;
1448 	reqp->gref = bdesc->grant_ref;
1449 	bdesc->id = hang_ix;
1450 	xnfp->rxpkt_bufptr[hang_ix] = bdesc;
1451 	membar_producer();
1452 	xnfp->rx_ring.req_prod_pvt++;
1453 }
1454 
1455 
1456 /* Process all queued received packets */
1457 static mblk_t *
1458 xnf_process_recv(xnf_t *xnfp)
1459 {
1460 	volatile netif_rx_response_t *rxpkt;
1461 	mblk_t *mp, *head, *tail;
1462 	struct xnf_buffer_desc *bdesc;
1463 	extern mblk_t *desballoc(unsigned char *, size_t, uint_t, frtn_t *);
1464 	boolean_t hwcsum = B_FALSE, notify, work_to_do;
1465 	size_t len;
1466 	pfn_t pfn;
1467 	long cnt;
1468 
1469 	head = tail = NULL;
1470 loop:
1471 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->rx_ring)) {
1472 
1473 		rxpkt = RING_GET_RESPONSE(&xnfp->rx_ring,
1474 		    xnfp->rx_ring.rsp_cons);
1475 
1476 		/*
1477 		 * Take buffer off of receive ring
1478 		 */
1479 		hwcsum = B_FALSE;
1480 		bdesc = xnfp->rxpkt_bufptr[rxpkt->id];
1481 		xnfp->rxpkt_bufptr[rxpkt->id] = NULL;
1482 		ASSERT(bdesc->id == rxpkt->id);
1483 		if (rxpkt->status <= 0) {
1484 			mp = NULL;
1485 			xnfp->stat_errrcv++;
1486 			if (rxpkt->status == 0)
1487 				xnfp->stat_runt++;
1488 			if (rxpkt->status == NETIF_RSP_ERROR)
1489 				xnfp->stat_mac_rcv_error++;
1490 			if (rxpkt->status == NETIF_RSP_DROPPED)
1491 				xnfp->stat_norcvbuf++;
1492 			/*
1493 			 * re-hang the buffer
1494 			 */
1495 			rx_buffer_hang(xnfp, bdesc);
1496 		} else {
1497 			grant_ref_t ref =  bdesc->grant_ref;
1498 			struct xnf_buffer_desc *new_bdesc;
1499 			unsigned long off = rxpkt->offset;
1500 			unsigned long mfn;
1501 
1502 			len = rxpkt->status;
1503 			ASSERT(off + len <= PAGEOFFSET);
1504 			if (ref == GRANT_INVALID_REF) {
1505 				mp = NULL;
1506 				new_bdesc = bdesc;
1507 				cmn_err(CE_WARN, "Bad rx grant reference %d "
1508 				    "from dom %d", ref,
1509 				    xvdi_get_oeid(xnfp->devinfo));
1510 				goto luckless;
1511 			}
1512 			bdesc->grant_ref = GRANT_INVALID_REF;
1513 			mfn = gnttab_end_foreign_transfer_ref(ref);
1514 			ASSERT(mfn != MFN_INVALID);
1515 			ASSERT(hat_getpfnum(kas.a_hat, bdesc->buf) ==
1516 			    PFN_INVALID);
1517 			gnttab_release_grant_reference(&xnfp->gref_rx_head,
1518 			    ref);
1519 			reassign_pfn(xnf_btop(bdesc->buf_phys), mfn);
1520 			hat_devload(kas.a_hat, bdesc->buf, PAGESIZE,
1521 			    xnf_btop(bdesc->buf_phys),
1522 			    PROT_READ | PROT_WRITE, HAT_LOAD);
1523 			balloon_drv_added(1);
1524 			if (rxpkt->flags & NETRXF_data_validated)
1525 				hwcsum = B_TRUE;
1526 			if (len <= xnf_rx_bcopy_thresh) {
1527 				/*
1528 				 * For small buffers, just copy the data
1529 				 * and send the copy upstream.
1530 				 */
1531 				new_bdesc = NULL;
1532 			} else {
1533 				/*
1534 				 * We send a pointer to this data upstream;
1535 				 * we need a new buffer to replace this one.
1536 				 */
1537 				mutex_enter(&xnfp->rx_buf_mutex);
1538 				new_bdesc = xnf_get_buffer(xnfp);
1539 				if (new_bdesc != NULL) {
1540 					xnfp->rx_bufs_outstanding++;
1541 				} else {
1542 					xnfp->stat_rx_no_ringbuf++;
1543 				}
1544 				mutex_exit(&xnfp->rx_buf_mutex);
1545 			}
1546 
1547 			if (new_bdesc == NULL) {
1548 				/*
1549 				 * Don't have a new ring buffer; bcopy the data
1550 				 * from the buffer, and preserve the
1551 				 * original buffer
1552 				 */
1553 				if ((mp = allocb(len, BPRI_MED)) == NULL) {
1554 					/*
1555 					 * Could't get buffer to copy to,
1556 					 * drop this data, and re-hang
1557 					 * the buffer on the ring.
1558 					 */
1559 					xnfp->stat_norcvbuf++;
1560 				} else {
1561 					bcopy(bdesc->buf + off, mp->b_wptr,
1562 					    len);
1563 				}
1564 				/*
1565 				 * Give the buffer page back to xen
1566 				 */
1567 				pfn = xnf_btop(bdesc->buf_phys);
1568 				cnt = balloon_free_pages(1, &mfn, bdesc->buf,
1569 				    &pfn);
1570 				if (cnt != 1) {
1571 					cmn_err(CE_WARN, "unable to give a "
1572 					    "page back to the hypervisor\n");
1573 				}
1574 				new_bdesc = bdesc;
1575 			} else {
1576 				if ((mp = desballoc((unsigned char *)bdesc->buf,
1577 				    off + len, 0, (frtn_t *)bdesc)) == NULL) {
1578 					/*
1579 					 * Couldn't get mblk to pass recv data
1580 					 * up with, free the old ring buffer
1581 					 */
1582 					xnfp->stat_norcvbuf++;
1583 					xnf_rcv_complete(bdesc);
1584 					goto luckless;
1585 				}
1586 				(void) ddi_dma_sync(bdesc->dma_handle,
1587 				    0, 0, DDI_DMA_SYNC_FORCPU);
1588 
1589 				mp->b_wptr += off;
1590 				mp->b_rptr += off;
1591 			}
1592 luckless:
1593 			if (mp)
1594 				mp->b_wptr += len;
1595 			/* re-hang old or hang new buffer */
1596 			rx_buffer_hang(xnfp, new_bdesc);
1597 		}
1598 		if (mp) {
1599 			if (hwcsum) {
1600 				/*
1601 				 * If the peer says that the data has
1602 				 * been validated then we declare that
1603 				 * the full checksum has been
1604 				 * verified.
1605 				 *
1606 				 * We don't look at the "checksum
1607 				 * blank" flag, and hence could have a
1608 				 * packet here that we are asserting
1609 				 * is good with a blank checksum.
1610 				 *
1611 				 * The hardware checksum offload
1612 				 * specification says that we must
1613 				 * provide the actual checksum as well
1614 				 * as an assertion that it is valid,
1615 				 * but the protocol stack doesn't
1616 				 * actually use it and some other
1617 				 * drivers don't bother, so we don't.
1618 				 * If it was necessary we could grovel
1619 				 * in the packet to find it.
1620 				 */
1621 
1622 				(void) hcksum_assoc(mp, NULL,
1623 				    NULL, 0, 0, 0, 0,
1624 				    HCK_FULLCKSUM |
1625 				    HCK_FULLCKSUM_OK,
1626 				    0);
1627 				xnfp->stat_rx_cksum_no_need++;
1628 			}
1629 			if (head == NULL) {
1630 				head = tail = mp;
1631 			} else {
1632 				tail->b_next = mp;
1633 				tail = mp;
1634 			}
1635 
1636 			ASSERT(mp->b_next == NULL);
1637 
1638 			xnfp->stat_ipackets++;
1639 			xnfp->stat_rbytes += len;
1640 		}
1641 
1642 		xnfp->rx_ring.rsp_cons++;
1643 	}
1644 
1645 	/*
1646 	 * Has more data come in since we started?
1647 	 */
1648 	/* LINTED: constant in conditional context */
1649 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->rx_ring, work_to_do);
1650 	if (work_to_do)
1651 		goto loop;
1652 
1653 	/*
1654 	 * Indicate to the backend that we have re-filled the receive
1655 	 * ring.
1656 	 */
1657 	/* LINTED: constant in conditional context */
1658 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->rx_ring, notify);
1659 	if (notify)
1660 		ec_notify_via_evtchn(xnfp->evtchn);
1661 
1662 	return (head);
1663 }
1664 
1665 /* Called when the upper layers free a message we passed upstream */
1666 static void
1667 xnf_rcv_complete(struct xnf_buffer_desc *bdesc)
1668 {
1669 	xnf_t *xnfp = bdesc->xnfp;
1670 	pfn_t pfn;
1671 	long cnt;
1672 
1673 	/* One less outstanding receive buffer */
1674 	mutex_enter(&xnfp->rx_buf_mutex);
1675 	--xnfp->rx_bufs_outstanding;
1676 	/*
1677 	 * Return buffer to the free list, unless the free list is getting
1678 	 * too large.  XXX - this threshold may need tuning.
1679 	 */
1680 	if (xnfp->rx_descs_free < xnf_recv_bufs_lowat) {
1681 		/*
1682 		 * Unmap the page, and hand the machine page back
1683 		 * to xen so it can be re-used as a backend net buffer.
1684 		 */
1685 		pfn = xnf_btop(bdesc->buf_phys);
1686 		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
1687 		if (cnt != 1) {
1688 			cmn_err(CE_WARN, "unable to give a page back to the "
1689 			    "hypervisor\n");
1690 		}
1691 
1692 		bdesc->next = xnfp->free_list;
1693 		xnfp->free_list = bdesc;
1694 		xnfp->rx_descs_free++;
1695 		mutex_exit(&xnfp->rx_buf_mutex);
1696 	} else {
1697 		/*
1698 		 * We can return everything here since we have a free buffer
1699 		 * that we have not given the backing page for back to xen.
1700 		 */
1701 		--xnfp->recv_buffer_count;
1702 		mutex_exit(&xnfp->rx_buf_mutex);
1703 		(void) ddi_dma_unbind_handle(bdesc->dma_handle);
1704 		ddi_dma_mem_free(&bdesc->acc_handle);
1705 		ddi_dma_free_handle(&bdesc->dma_handle);
1706 		kmem_free(bdesc, sizeof (*bdesc));
1707 	}
1708 }
1709 
1710 /*
1711  *  xnf_alloc_dma_resources() -- initialize the drivers structures
1712  */
1713 static int
1714 xnf_alloc_dma_resources(xnf_t *xnfp)
1715 {
1716 	dev_info_t 		*devinfo = xnfp->devinfo;
1717 	int			i;
1718 	size_t			len;
1719 	ddi_dma_cookie_t	dma_cookie;
1720 	uint_t			ncookies;
1721 	struct xnf_buffer_desc	*bdesc;
1722 	int			rc;
1723 	caddr_t			rptr;
1724 
1725 	xnfp->n_recvs = NET_RX_RING_SIZE;
1726 	xnfp->max_recv_bufs = xnf_recv_bufs_hiwat;
1727 
1728 	xnfp->n_xmits = NET_TX_RING_SIZE;
1729 
1730 	/*
1731 	 * The code below allocates all the DMA data structures that
1732 	 * need to be released when the driver is detached.
1733 	 *
1734 	 * First allocate handles for mapping (virtual address) pointers to
1735 	 * transmit data buffers to physical addresses
1736 	 */
1737 	for (i = 0; i < xnfp->n_xmits; i++) {
1738 		if ((rc = ddi_dma_alloc_handle(devinfo,
1739 		    &tx_buffer_dma_attr, DDI_DMA_SLEEP, 0,
1740 		    &xnfp->tx_pkt_info[i].dma_handle)) != DDI_SUCCESS)
1741 			return (DDI_FAILURE);
1742 	}
1743 
1744 	/*
1745 	 * Allocate page for the transmit descriptor ring.
1746 	 */
1747 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
1748 	    DDI_DMA_SLEEP, 0, &xnfp->tx_ring_dma_handle) != DDI_SUCCESS)
1749 		goto alloc_error;
1750 
1751 	if (ddi_dma_mem_alloc(xnfp->tx_ring_dma_handle,
1752 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
1753 	    DDI_DMA_SLEEP, 0, &rptr, &len,
1754 	    &xnfp->tx_ring_dma_acchandle) != DDI_SUCCESS) {
1755 		ddi_dma_free_handle(&xnfp->tx_ring_dma_handle);
1756 		xnfp->tx_ring_dma_handle = NULL;
1757 		goto alloc_error;
1758 	}
1759 
1760 	if ((rc = ddi_dma_addr_bind_handle(xnfp->tx_ring_dma_handle, NULL,
1761 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
1762 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
1763 		ddi_dma_mem_free(&xnfp->tx_ring_dma_acchandle);
1764 		ddi_dma_free_handle(&xnfp->tx_ring_dma_handle);
1765 		xnfp->tx_ring_dma_handle = NULL;
1766 		xnfp->tx_ring_dma_acchandle = NULL;
1767 		if (rc == DDI_DMA_NORESOURCES)
1768 			goto alloc_error;
1769 		else
1770 			goto error;
1771 	}
1772 
1773 	ASSERT(ncookies == 1);
1774 	bzero(rptr, PAGESIZE);
1775 	/* LINTED: constant in conditional context */
1776 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
1777 	/* LINTED: constant in conditional context */
1778 	FRONT_RING_INIT(&xnfp->tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
1779 	xnfp->tx_ring_phys_addr = dma_cookie.dmac_laddress;
1780 
1781 	/*
1782 	 * Allocate page for the receive descriptor ring.
1783 	 */
1784 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
1785 	    DDI_DMA_SLEEP, 0, &xnfp->rx_ring_dma_handle) != DDI_SUCCESS)
1786 		goto alloc_error;
1787 
1788 	if (ddi_dma_mem_alloc(xnfp->rx_ring_dma_handle,
1789 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
1790 	    DDI_DMA_SLEEP, 0, &rptr, &len,
1791 	    &xnfp->rx_ring_dma_acchandle) != DDI_SUCCESS) {
1792 		ddi_dma_free_handle(&xnfp->rx_ring_dma_handle);
1793 		xnfp->rx_ring_dma_handle = NULL;
1794 		goto alloc_error;
1795 	}
1796 
1797 	if ((rc = ddi_dma_addr_bind_handle(xnfp->rx_ring_dma_handle, NULL,
1798 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
1799 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
1800 		ddi_dma_mem_free(&xnfp->rx_ring_dma_acchandle);
1801 		ddi_dma_free_handle(&xnfp->rx_ring_dma_handle);
1802 		xnfp->rx_ring_dma_handle = NULL;
1803 		xnfp->rx_ring_dma_acchandle = NULL;
1804 		if (rc == DDI_DMA_NORESOURCES)
1805 			goto alloc_error;
1806 		else
1807 			goto error;
1808 	}
1809 
1810 	ASSERT(ncookies == 1);
1811 	bzero(rptr, PAGESIZE);
1812 	/* LINTED: constant in conditional context */
1813 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
1814 	/* LINTED: constant in conditional context */
1815 	FRONT_RING_INIT(&xnfp->rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
1816 	xnfp->rx_ring_phys_addr = dma_cookie.dmac_laddress;
1817 
1818 	/*
1819 	 * Preallocate receive buffers for each receive descriptor.
1820 	 */
1821 
1822 	/* Set up the "free list" of receive buffer descriptors */
1823 	for (i = 0; i < xnfp->n_recvs; i++) {
1824 		if ((bdesc = xnf_alloc_buffer(xnfp)) == NULL)
1825 			goto alloc_error;
1826 		bdesc->next = xnfp->free_list;
1827 		xnfp->free_list = bdesc;
1828 	}
1829 
1830 	return (DDI_SUCCESS);
1831 
1832 alloc_error:
1833 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
1834 	    ddi_get_instance(xnfp->devinfo));
1835 error:
1836 	xnf_release_dma_resources(xnfp);
1837 	return (DDI_FAILURE);
1838 }
1839 
1840 /*
1841  * Release all DMA resources in the opposite order from acquisition
1842  * Should not be called until all outstanding esballoc buffers
1843  * have been returned.
1844  */
1845 static void
1846 xnf_release_dma_resources(xnf_t *xnfp)
1847 {
1848 	int i;
1849 
1850 	/*
1851 	 * Free receive buffers which are currently associated with
1852 	 * descriptors
1853 	 */
1854 	for (i = 0; i < xnfp->n_recvs; i++) {
1855 		struct xnf_buffer_desc *bp;
1856 
1857 		if ((bp = xnfp->rxpkt_bufptr[i]) == NULL)
1858 			continue;
1859 		xnf_free_buffer(bp);
1860 		xnfp->rxpkt_bufptr[i] = NULL;
1861 	}
1862 
1863 	/* Free the receive ring buffer */
1864 	if (xnfp->rx_ring_dma_acchandle != NULL) {
1865 		(void) ddi_dma_unbind_handle(xnfp->rx_ring_dma_handle);
1866 		ddi_dma_mem_free(&xnfp->rx_ring_dma_acchandle);
1867 		ddi_dma_free_handle(&xnfp->rx_ring_dma_handle);
1868 		xnfp->rx_ring_dma_acchandle = NULL;
1869 	}
1870 	/* Free the transmit ring buffer */
1871 	if (xnfp->tx_ring_dma_acchandle != NULL) {
1872 		(void) ddi_dma_unbind_handle(xnfp->tx_ring_dma_handle);
1873 		ddi_dma_mem_free(&xnfp->tx_ring_dma_acchandle);
1874 		ddi_dma_free_handle(&xnfp->tx_ring_dma_handle);
1875 		xnfp->tx_ring_dma_acchandle = NULL;
1876 	}
1877 }
1878 
1879 static void
1880 xnf_release_mblks(xnf_t *xnfp)
1881 {
1882 	int	i;
1883 
1884 	for (i = 0; i < xnfp->n_xmits; i++) {
1885 		if (xnfp->tx_pkt_info[i].mp == NULL)
1886 			continue;
1887 		freemsg(xnfp->tx_pkt_info[i].mp);
1888 		xnfp->tx_pkt_info[i].mp = NULL;
1889 		(void) ddi_dma_unbind_handle(xnfp->tx_pkt_info[i].dma_handle);
1890 	}
1891 }
1892 
1893 /*
1894  * Remove a xmit buffer descriptor from the head of the free list and return
1895  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
1896  * Called with the tx_buf_mutex held.
1897  */
1898 static struct xnf_buffer_desc *
1899 xnf_get_xmit_buffer(xnf_t *xnfp)
1900 {
1901 	struct xnf_buffer_desc *bdesc;
1902 
1903 	bdesc = xnfp->xmit_free_list;
1904 	if (bdesc != NULL) {
1905 		xnfp->xmit_free_list = bdesc->next;
1906 	} else {
1907 		bdesc = xnf_alloc_xmit_buffer(xnfp);
1908 	}
1909 	return (bdesc);
1910 }
1911 
1912 /*
1913  * Remove a buffer descriptor from the head of the free list and return
1914  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
1915  * Called with the rx_buf_mutex held.
1916  */
1917 static struct xnf_buffer_desc *
1918 xnf_get_buffer(xnf_t *xnfp)
1919 {
1920 	struct xnf_buffer_desc *bdesc;
1921 
1922 	bdesc = xnfp->free_list;
1923 	if (bdesc != NULL) {
1924 		xnfp->free_list = bdesc->next;
1925 		xnfp->rx_descs_free--;
1926 	} else {
1927 		bdesc = xnf_alloc_buffer(xnfp);
1928 	}
1929 	return (bdesc);
1930 }
1931 
1932 /*
1933  * Free a xmit buffer back to the xmit free list
1934  */
1935 static void
1936 xnf_free_xmit_buffer(struct xnf_buffer_desc *bp)
1937 {
1938 	xnf_t *xnfp = bp->xnfp;
1939 
1940 	mutex_enter(&xnfp->tx_buf_mutex);
1941 	bp->next = xnfp->xmit_free_list;
1942 	xnfp->xmit_free_list = bp;
1943 	mutex_exit(&xnfp->tx_buf_mutex);
1944 }
1945 
1946 /*
1947  * Put a buffer descriptor onto the head of the free list.
1948  * We can't really free these buffers back to the kernel
1949  * since we have given away their backing page to be used
1950  * by the back end net driver.
1951  */
1952 static void
1953 xnf_free_buffer(struct xnf_buffer_desc *bp)
1954 {
1955 	xnf_t *xnfp = bp->xnfp;
1956 
1957 	mutex_enter(&xnfp->rx_buf_mutex);
1958 	bp->next = xnfp->free_list;
1959 	xnfp->free_list = bp;
1960 	xnfp->rx_descs_free++;
1961 	mutex_exit(&xnfp->rx_buf_mutex);
1962 }
1963 
1964 /*
1965  * Allocate a DMA-able xmit buffer, including a structure to
1966  * keep track of the buffer.  Called with tx_buf_mutex held.
1967  */
1968 static struct xnf_buffer_desc *
1969 xnf_alloc_xmit_buffer(xnf_t *xnfp)
1970 {
1971 	struct xnf_buffer_desc *bdesc;
1972 	size_t len;
1973 
1974 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
1975 		return (NULL);
1976 
1977 	/* allocate a DMA access handle for receive buffer */
1978 	if (ddi_dma_alloc_handle(xnfp->devinfo, &tx_buffer_dma_attr,
1979 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
1980 		goto failure;
1981 
1982 	/* Allocate DMA-able memory for transmit buffer */
1983 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
1984 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
1985 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
1986 		goto late_failure;
1987 
1988 	bdesc->xnfp = xnfp;
1989 	xnfp->xmit_buffer_count++;
1990 
1991 	return (bdesc);
1992 
1993 late_failure:
1994 	ddi_dma_free_handle(&bdesc->dma_handle);
1995 
1996 failure:
1997 	kmem_free(bdesc, sizeof (*bdesc));
1998 	return (NULL);
1999 }
2000 
2001 /*
2002  * Allocate a DMA-able receive buffer, including a structure to
2003  * keep track of the buffer.  Called with rx_buf_mutex held.
2004  */
2005 static struct xnf_buffer_desc *
2006 xnf_alloc_buffer(xnf_t *xnfp)
2007 {
2008 	struct			xnf_buffer_desc *bdesc;
2009 	size_t			len;
2010 	uint_t			ncookies;
2011 	ddi_dma_cookie_t	dma_cookie;
2012 	long			cnt;
2013 	pfn_t			pfn;
2014 
2015 	if (xnfp->recv_buffer_count >= xnfp->max_recv_bufs)
2016 		return (NULL);
2017 
2018 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
2019 		return (NULL);
2020 
2021 	/* allocate a DMA access handle for receive buffer */
2022 	if (ddi_dma_alloc_handle(xnfp->devinfo, &rx_buffer_dma_attr,
2023 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2024 		goto failure;
2025 
2026 	/* Allocate DMA-able memory for receive buffer */
2027 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2028 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
2029 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2030 		goto late_failure;
2031 
2032 	/* bind to virtual address of buffer to get physical address */
2033 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2034 	    bdesc->buf, PAGESIZE, DDI_DMA_READ | DDI_DMA_STREAMING,
2035 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2036 		goto late_late_failure;
2037 
2038 	bdesc->buf_phys = dma_cookie.dmac_laddress;
2039 	bdesc->xnfp = xnfp;
2040 	bdesc->free_rtn.free_func = xnf_rcv_complete;
2041 	bdesc->free_rtn.free_arg = (char *)bdesc;
2042 	bdesc->grant_ref = GRANT_INVALID_REF;
2043 	ASSERT(ncookies == 1);
2044 
2045 	xnfp->recv_buffer_count++;
2046 	/*
2047 	 * Unmap the page, and hand the machine page back
2048 	 * to xen so it can be used as a backend net buffer.
2049 	 */
2050 	pfn = xnf_btop(bdesc->buf_phys);
2051 	cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
2052 	if (cnt != 1) {
2053 		cmn_err(CE_WARN, "unable to give a page back to the "
2054 		    "hypervisor\n");
2055 	}
2056 
2057 	return (bdesc);
2058 
2059 late_late_failure:
2060 	ddi_dma_mem_free(&bdesc->acc_handle);
2061 
2062 late_failure:
2063 	ddi_dma_free_handle(&bdesc->dma_handle);
2064 
2065 failure:
2066 	kmem_free(bdesc, sizeof (*bdesc));
2067 	return (NULL);
2068 }
2069 
2070 static int
2071 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2072 {
2073 	xnf_t *xnfp = arg;
2074 
2075 	mutex_enter(&xnfp->intrlock);
2076 	mutex_enter(&xnfp->txlock);
2077 
2078 #define	map_stat(q, r)				\
2079 	case (MAC_STAT_##q):			\
2080 		*val = xnfp->stat_##r;		\
2081 		break
2082 
2083 	switch (stat) {
2084 
2085 	map_stat(IPACKETS, ipackets);
2086 	map_stat(OPACKETS, opackets);
2087 	map_stat(RBYTES, rbytes);
2088 	map_stat(OBYTES, obytes);
2089 	map_stat(NORCVBUF, norcvbuf);
2090 	map_stat(IERRORS, errrcv);
2091 	map_stat(NOXMTBUF, xmit_defer);
2092 
2093 	default:
2094 		mutex_exit(&xnfp->txlock);
2095 		mutex_exit(&xnfp->intrlock);
2096 
2097 		return (ENOTSUP);
2098 	}
2099 
2100 #undef map_stat
2101 
2102 	mutex_exit(&xnfp->txlock);
2103 	mutex_exit(&xnfp->intrlock);
2104 
2105 	return (0);
2106 }
2107 
2108 /*ARGSUSED*/
2109 static void
2110 xnf_blank(void *arg, time_t ticks, uint_t count)
2111 {
2112 	/*
2113 	 * XXPV dme: blanking is not currently implemented.
2114 	 *
2115 	 * It's not obvious how to use the 'ticks' argument here.
2116 	 *
2117 	 * 'Count' might be used as an indicator of how to set
2118 	 * rsp_event when posting receive buffers to the rx_ring.  It
2119 	 * would replace the code at the tail of xnf_process_recv()
2120 	 * that simply indicates that the next completed packet should
2121 	 * cause an interrupt.
2122 	 */
2123 }
2124 
2125 static void
2126 xnf_resources(void *arg)
2127 {
2128 	xnf_t *xnfp = arg;
2129 	mac_rx_fifo_t mrf;
2130 
2131 	mrf.mrf_type = MAC_RX_FIFO;
2132 	mrf.mrf_blank = xnf_blank;
2133 	mrf.mrf_arg = (void *)xnfp;
2134 	mrf.mrf_normal_blank_time = 128;	/* XXPV dme: see xnf_blank() */
2135 	mrf.mrf_normal_pkt_count = 8;		/* XXPV dme: see xnf_blank() */
2136 
2137 	xnfp->rx_handle = mac_resource_add(xnfp->mh,
2138 	    (mac_resource_t *)&mrf);
2139 }
2140 
2141 /*ARGSUSED*/
2142 static void
2143 xnf_ioctl(void *arg, queue_t *q, mblk_t *mp)
2144 {
2145 	miocnak(q, mp, 0, EINVAL);
2146 }
2147 
2148 static boolean_t
2149 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
2150 {
2151 	xnf_t *xnfp = arg;
2152 
2153 	switch (cap) {
2154 	case MAC_CAPAB_HCKSUM: {
2155 		uint32_t *capab = cap_data;
2156 
2157 		/*
2158 		 * We declare ourselves capable of HCKSUM_INET_PARTIAL
2159 		 * in order that the protocol stack insert the
2160 		 * pseudo-header checksum in packets that it passes
2161 		 * down to us.
2162 		 *
2163 		 * Whilst the flag used to communicate with dom0 is
2164 		 * called "NETTXF_csum_blank", the checksum in the
2165 		 * packet must contain the pseudo-header checksum and
2166 		 * not zero. (In fact, a Solaris dom0 is happy to deal
2167 		 * with a checksum of zero, but a Linux dom0 is not.)
2168 		 */
2169 		if (xnfp->cksum_offload)
2170 			*capab = HCKSUM_INET_PARTIAL;
2171 		else
2172 			*capab = 0;
2173 		break;
2174 	}
2175 
2176 	case MAC_CAPAB_POLL:
2177 		/* Just return B_TRUE. */
2178 		break;
2179 
2180 	default:
2181 		return (B_FALSE);
2182 	}
2183 
2184 	return (B_TRUE);
2185 }
2186 
2187 /*ARGSUSED*/
2188 static void
2189 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
2190     void *arg, void *impl_data)
2191 {
2192 	xnf_t *xnfp = ddi_get_driver_private(dip);
2193 	XenbusState new_state = *(XenbusState *)impl_data;
2194 
2195 	ASSERT(xnfp != NULL);
2196 
2197 	switch (new_state) {
2198 	case XenbusStateConnected:
2199 		mutex_enter(&xnfp->intrlock);
2200 		mutex_enter(&xnfp->txlock);
2201 
2202 		xnfp->connected = B_TRUE;
2203 		cv_broadcast(&xnfp->cv);
2204 
2205 		mutex_exit(&xnfp->txlock);
2206 		mutex_exit(&xnfp->intrlock);
2207 
2208 		ec_notify_via_evtchn(xnfp->evtchn);
2209 		break;
2210 
2211 	default:
2212 		break;
2213 	}
2214 }
2215