xref: /titanic_51/usr/src/uts/common/xen/io/xnf.c (revision 3605ad6f0044065c54582650a845c977b81e1c3e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  *
31  * Copyright (c) 2004 Christian Limpach.
32  * All rights reserved.
33  *
34  * Redistribution and use in source and binary forms, with or without
35  * modification, are permitted provided that the following conditions
36  * are met:
37  * 1. Redistributions of source code must retain the above copyright
38  *    notice, this list of conditions and the following disclaimer.
39  * 2. Redistributions in binary form must reproduce the above copyright
40  *    notice, this list of conditions and the following disclaimer in the
41  *    documentation and/or other materials provided with the distribution.
42  * 3. This section intentionally left blank.
43  * 4. The name of the author may not be used to endorse or promote products
44  *    derived from this software without specific prior written permission.
45  *
46  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
47  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
48  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
49  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
50  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
51  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
52  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
53  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
54  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
55  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
56  */
57 /*
58  * Section 3 of the above license was updated in response to bug 6379571.
59  */
60 
61 /*
62  * xnf.c - Nemo-based network driver for domU
63  */
64 
65 #include <sys/types.h>
66 #include <sys/errno.h>
67 #include <sys/param.h>
68 #include <sys/sysmacros.h>
69 #include <sys/systm.h>
70 #include <sys/stream.h>
71 #include <sys/strsubr.h>
72 #include <sys/conf.h>
73 #include <sys/ddi.h>
74 #include <sys/devops.h>
75 #include <sys/sunddi.h>
76 #include <sys/sunndi.h>
77 #include <sys/dlpi.h>
78 #include <sys/ethernet.h>
79 #include <sys/strsun.h>
80 #include <sys/pattr.h>
81 #include <inet/ip.h>
82 #include <sys/modctl.h>
83 #include <sys/mac.h>
84 #include <sys/mac_ether.h>
85 #include <sys/bootinfo.h>
86 #include <sys/mach_mmu.h>
87 #ifdef	XPV_HVM_DRIVER
88 #include <sys/xpv_support.h>
89 #include <sys/hypervisor.h>
90 #else
91 #include <sys/hypervisor.h>
92 #include <sys/evtchn_impl.h>
93 #include <sys/balloon_impl.h>
94 #endif
95 #include <xen/public/io/netif.h>
96 #include <sys/gnttab.h>
97 #include <xen/sys/xendev.h>
98 #include <sys/sdt.h>
99 
100 #include <io/xnf.h>
101 
102 
103 /*
104  *  Declarations and Module Linkage
105  */
106 
/* Description text spliced into the module's short description string. */
107 #define	IDENT	"Virtual Ethernet driver"
108 
/*
 * XNF_DEBUG, and the xnfdebug bit mask that selects which trace output
 * is printed, exist only in DEBUG or lint builds.
 */
109 #if defined(DEBUG) || defined(__lint)
110 #define	XNF_DEBUG
111 int	xnfdebug = 0;
112 #endif
113 
114 /*
115  * On a 32 bit PAE system physical and machine addresses are larger
116  * than 32 bits.  ddi_btop() on such systems takes an unsigned long
117  * argument, and so addresses above 4G are truncated before ddi_btop()
118  * gets to see them.  To avoid this, code the shift operation here.
119  */
120 #define	xnf_btop(addr)	((addr) >> PAGESHIFT)
121 
/*
 * Default checksum offload setting.  Copied into each instance at attach
 * time; an instance clears its copy if the backend advertises
 * "feature-no-csum-offload".
 */
122 boolean_t	xnf_cksum_offload = B_TRUE;
123 
124 /* Default value for hypervisor-based copy operations */
125 boolean_t	xnf_rx_hvcopy = B_TRUE;
126 
127 /*
128  * Should pages used for transmit be readonly for the peer?
129  */
130 boolean_t	xnf_tx_pages_readonly = B_FALSE;
131 /*
132  * Packets under this size are bcopied instead of using desballoc.
133  * Choose a value > XNF_FRAMESIZE (1514) to force the receive path to
134  * always copy.
135  */
136 unsigned int	xnf_rx_bcopy_thresh = 64;
137 
/*
 * NOTE(review): presumably the maximum number of fragments used for one
 * transmitted packet; it is not referenced in this part of the file --
 * confirm against the transmit path.
 */
138 unsigned int	xnf_max_tx_frags = 1;
139 
140 /* Required system entry points */
141 static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
142 static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);
143 
144 /* Required driver entry points for Nemo */
145 static int	xnf_start(void *);
146 static void	xnf_stop(void *);
147 static int	xnf_set_mac_addr(void *, const uint8_t *);
148 static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
149 static int	xnf_set_promiscuous(void *, boolean_t);
150 static mblk_t	*xnf_send(void *, mblk_t *);
151 static uint_t	xnf_intr(caddr_t);
152 static int	xnf_stat(void *, uint_t, uint64_t *);
153 static void	xnf_blank(void *, time_t, uint_t);
154 static void	xnf_resources(void *);
155 static void	xnf_ioctl(void *, queue_t *, mblk_t *);
156 static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
157 
158 /* Driver private functions */
159 static int xnf_alloc_dma_resources(xnf_t *);
160 static void xnf_release_dma_resources(xnf_t *);
161 static mblk_t *xnf_process_recv(xnf_t *);
162 static void xnf_rcv_complete(struct xnf_buffer_desc *);
163 static void xnf_release_mblks(xnf_t *);
164 static struct xnf_buffer_desc *xnf_alloc_tx_buffer(xnf_t *);
165 static struct xnf_buffer_desc *xnf_alloc_buffer(xnf_t *);
166 static struct xnf_buffer_desc *xnf_get_tx_buffer(xnf_t *);
167 static struct xnf_buffer_desc *xnf_get_buffer(xnf_t *);
168 static void xnf_free_buffer(struct xnf_buffer_desc *);
169 static void xnf_free_tx_buffer(struct xnf_buffer_desc *);
170 void xnf_send_driver_status(int, int);
171 static void rx_buffer_hang(xnf_t *, struct xnf_buffer_desc *);
172 static int xnf_clean_tx_ring(xnf_t  *);
173 static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
174     void *, void *);
175 static mblk_t *xnf_process_hvcopy_recv(xnf_t *xnfp);
176 static boolean_t xnf_hvcopy_peer_status(dev_info_t *devinfo);
177 static boolean_t xnf_kstat_init(xnf_t *xnfp);
178 
/*
 * Callback vector handed to the MAC layer; xnf_attach() stores its
 * address in the mac_register_t (m_callbacks) before mac_register().
 * The leading flag word names the resources, ioctl and getcapab
 * callbacks, which are the last three entries of the initializer.
 * NOTE(review): the initializer is positional, so it must follow the
 * member order of mac_callbacks_t in <sys/mac.h> -- confirm when that
 * structure changes.
 */
179 /*
180  * XXPV dme: remove MC_IOCTL?
181  */
182 static mac_callbacks_t xnf_callbacks = {
183 	MC_RESOURCES | MC_IOCTL | MC_GETCAPAB,
184 	xnf_stat,
185 	xnf_start,
186 	xnf_stop,
187 	xnf_set_promiscuous,
188 	xnf_set_multicast,
189 	xnf_set_mac_addr,
190 	xnf_send,
191 	xnf_resources,
192 	xnf_ioctl,
193 	xnf_getcapab
194 };
195 
/*
 * Value stored in a grant reference field to mean "no grant held"; the
 * ring and per-packet grant references are compared against it.
 */
196 #define	GRANT_INVALID_REF	0
/*
 * NOTE(review): these look like low and high water marks for the pool
 * of receive buffers, expressed as multiples of the rx ring size; their
 * use is not visible in this part of the file -- confirm.
 */
197 const int xnf_rx_bufs_lowat = 4 * NET_RX_RING_SIZE;
198 const int xnf_rx_bufs_hiwat = 8 * NET_RX_RING_SIZE; /* default max */
199 
/*
 * The three ddi_dma_attr_t templates below currently hold identical
 * values: page-aligned, a single segment/cookie, no address restriction.
 */
200 /* DMA attributes for network ring buffer */
201 static ddi_dma_attr_t ringbuf_dma_attr = {
202 	DMA_ATTR_V0,		/* version of this structure */
203 	0,			/* lowest usable address */
204 	0xffffffffffffffffULL,	/* highest usable address */
205 	0x7fffffff,		/* maximum DMAable byte count */
206 	MMU_PAGESIZE,		/* alignment in bytes */
207 	0x7ff,			/* bitmap of burst sizes */
208 	1,			/* minimum transfer */
209 	0xffffffffU,		/* maximum transfer */
210 	0xffffffffffffffffULL,	/* maximum segment length */
211 	1,			/* maximum number of segments */
212 	1,			/* granularity */
213 	0,			/* flags (reserved) */
214 };
215 
216 /* DMA attributes for transmit data */
217 static ddi_dma_attr_t tx_buffer_dma_attr = {
218 	DMA_ATTR_V0,		/* version of this structure */
219 	0,			/* lowest usable address */
220 	0xffffffffffffffffULL,	/* highest usable address */
221 	0x7fffffff,		/* maximum DMAable byte count */
222 	MMU_PAGESIZE,		/* alignment in bytes */
223 	0x7ff,			/* bitmap of burst sizes */
224 	1,			/* minimum transfer */
225 	0xffffffffU,		/* maximum transfer */
226 	0xffffffffffffffffULL,	/* maximum segment length */
227 	1,			/* maximum number of segments */
228 	1,			/* granularity */
229 	0,			/* flags (reserved) */
230 };
231 
232 /* DMA attributes for a receive buffer */
233 static ddi_dma_attr_t rx_buffer_dma_attr = {
234 	DMA_ATTR_V0,		/* version of this structure */
235 	0,			/* lowest usable address */
236 	0xffffffffffffffffULL,	/* highest usable address */
237 	0x7fffffff,		/* maximum DMAable byte count */
238 	MMU_PAGESIZE,		/* alignment in bytes */
239 	0x7ff,			/* bitmap of burst sizes */
240 	1,			/* minimum transfer */
241 	0xffffffffU,		/* maximum transfer */
242 	0xffffffffffffffffULL,	/* maximum segment length */
243 	1,			/* maximum number of segments */
244 	1,			/* granularity */
245 	0,			/* flags (reserved) */
246 };
247 
248 /* DMA access attributes for registers and descriptors */
249 static ddi_device_acc_attr_t accattr = {
250 	DDI_DEVICE_ATTR_V0,
251 	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
252 	DDI_STRICTORDER_ACC
253 };
254 
255 /* DMA access attributes for data: NOT to be byte swapped. */
256 static ddi_device_acc_attr_t data_accattr = {
257 	DDI_DEVICE_ATTR_V0,
258 	DDI_NEVERSWAP_ACC,
259 	DDI_STRICTORDER_ACC
260 };
261 
/* The all-ones Ethernet broadcast address. */
262 unsigned char xnf_broadcastaddr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
263 int xnf_diagnose = 0; /* Patchable global for diagnostic purposes */
264 
/*
 * Device operations for the driver: attach and detach are the only
 * driver-supplied entry points; _init() passes this structure to
 * mac_init_ops() before installing the module.
 */
265 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
266     nodev, NULL, D_MP, NULL);
267 
268 static struct modldrv xnf_modldrv = {
269 	&mod_driverops,		/* Type of module.  This one is a driver */
270 	IDENT " %I%",		/* short description */
271 	&xnf_dev_ops		/* driver specific ops */
272 };
273 
/* Module linkage passed to mod_install() and mod_info(). */
274 static struct modlinkage modlinkage = {
275 	MODREV_1, &xnf_modldrv, NULL
276 };
277 
278 int
279 _init(void)
280 {
281 	int r;
282 
283 	mac_init_ops(&xnf_dev_ops, "xnf");
284 	r = mod_install(&modlinkage);
285 	if (r != DDI_SUCCESS)
286 		mac_fini_ops(&xnf_dev_ops);
287 
288 	return (r);
289 }
290 
/*
 * Module unload entry point.  Unloading is always refused with EBUSY.
 */
int
_fini(void)
{
	/* XXPV dme: should be removable */
	return (EBUSY);
}
296 
297 int
298 _info(struct modinfo *modinfop)
299 {
300 	return (mod_info(&modlinkage, modinfop));
301 }
302 
303 static int
304 xnf_setup_rings(xnf_t *xnfp)
305 {
306 	int			ix, err;
307 	RING_IDX		i;
308 	struct xnf_buffer_desc	*bdesc, *rbp;
309 	struct xenbus_device	*xsd;
310 	domid_t			oeid;
311 
312 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
313 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
314 
315 	if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF)
316 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
317 
318 	err = gnttab_grant_foreign_access(oeid,
319 	    xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
320 	if (err <= 0) {
321 		err = -err;
322 		xenbus_dev_error(xsd, err, "granting access to tx ring page");
323 		goto out;
324 	}
325 	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
326 
327 	if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF)
328 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
329 
330 	err = gnttab_grant_foreign_access(oeid,
331 	    xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
332 	if (err <= 0) {
333 		err = -err;
334 		xenbus_dev_error(xsd, err, "granting access to rx ring page");
335 		goto out;
336 	}
337 	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
338 
339 
340 	mutex_enter(&xnfp->xnf_intrlock);
341 
342 	/*
343 	 * Cleanup the TX ring.  We just clean up any valid tx_pktinfo structs
344 	 * and reset the ring.  Note that this can lose packets after a resume,
345 	 * but we expect to stagger on.
346 	 */
347 	mutex_enter(&xnfp->xnf_txlock);
348 
349 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
350 		struct tx_pktinfo *txp = &xnfp->xnf_tx_pkt_info[i];
351 
352 		txp->id = i + 1;
353 
354 		if (txp->grant_ref == GRANT_INVALID_REF) {
355 			ASSERT(txp->mp == NULL);
356 			ASSERT(txp->bdesc == NULL);
357 			continue;
358 		}
359 
360 		if (gnttab_query_foreign_access(txp->grant_ref) != 0)
361 			panic("tx grant still in use by backend domain");
362 
363 		freemsg(txp->mp);
364 		txp->mp = NULL;
365 
366 		(void) ddi_dma_unbind_handle(txp->dma_handle);
367 
368 		if (txp->bdesc != NULL) {
369 			xnf_free_tx_buffer(txp->bdesc);
370 			txp->bdesc = NULL;
371 		}
372 
373 		(void) gnttab_end_foreign_access_ref(txp->grant_ref,
374 		    xnfp->xnf_tx_pages_readonly);
375 		gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head,
376 		    txp->grant_ref);
377 		txp->grant_ref = GRANT_INVALID_REF;
378 	}
379 
380 	xnfp->xnf_tx_pkt_id_list = 0;
381 	xnfp->xnf_tx_ring.rsp_cons = 0;
382 	xnfp->xnf_tx_ring.sring->req_prod = 0;
383 	xnfp->xnf_tx_ring.sring->rsp_prod = 0;
384 	xnfp->xnf_tx_ring.sring->rsp_event = 1;
385 
386 	mutex_exit(&xnfp->xnf_txlock);
387 
388 	/*
389 	 * Rebuild the RX ring.  We have to rebuild the RX ring because some of
390 	 * our pages are currently flipped out/granted so we can't just free
391 	 * the RX buffers.  Reclaim any unprocessed recv buffers, they won't be
392 	 * useable anyway since the mfn's they refer to are no longer valid.
393 	 * Grant the backend domain access to each hung rx buffer.
394 	 */
395 	i = xnfp->xnf_rx_ring.rsp_cons;
396 	while (i++ != xnfp->xnf_rx_ring.sring->req_prod) {
397 		volatile netif_rx_request_t	*rxrp;
398 
399 		rxrp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, i);
400 		ix = rxrp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0);
401 		rbp = xnfp->xnf_rxpkt_bufptr[ix];
402 		if (rbp != NULL) {
403 			grant_ref_t	ref = rbp->grant_ref;
404 
405 			ASSERT(ref != GRANT_INVALID_REF);
406 			if (xnfp->xnf_rx_hvcopy) {
407 				pfn_t pfn = xnf_btop(rbp->buf_phys);
408 				mfn_t mfn = pfn_to_mfn(pfn);
409 
410 				gnttab_grant_foreign_access_ref(ref, oeid,
411 				    mfn, 0);
412 			} else {
413 				gnttab_grant_foreign_transfer_ref(ref, oeid);
414 			}
415 			rxrp->id = ix;
416 			rxrp->gref = ref;
417 		}
418 	}
419 
420 	/*
421 	 * Reset the ring pointers to initial state.
422 	 * Hang buffers for any empty ring slots.
423 	 */
424 	xnfp->xnf_rx_ring.rsp_cons = 0;
425 	xnfp->xnf_rx_ring.sring->req_prod = 0;
426 	xnfp->xnf_rx_ring.sring->rsp_prod = 0;
427 	xnfp->xnf_rx_ring.sring->rsp_event = 1;
428 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
429 		xnfp->xnf_rx_ring.req_prod_pvt = i;
430 		if (xnfp->xnf_rxpkt_bufptr[i] != NULL)
431 			continue;
432 		if ((bdesc = xnf_get_buffer(xnfp)) == NULL)
433 			break;
434 		rx_buffer_hang(xnfp, bdesc);
435 	}
436 	xnfp->xnf_rx_ring.req_prod_pvt = i;
437 	/* LINTED: constant in conditional context */
438 	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
439 
440 	mutex_exit(&xnfp->xnf_intrlock);
441 
442 	return (0);
443 
444 out:
445 	if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF)
446 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
447 	xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF;
448 
449 	if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF)
450 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
451 	xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF;
452 
453 	return (err);
454 }
455 
456 
457 /* Called when the upper layers free a message we passed upstream */
458 static void
459 xnf_copy_rcv_complete(struct xnf_buffer_desc *bdesc)
460 {
461 	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
462 	ddi_dma_mem_free(&bdesc->acc_handle);
463 	ddi_dma_free_handle(&bdesc->dma_handle);
464 	kmem_free(bdesc, sizeof (*bdesc));
465 }
466 
467 
468 /*
469  * Connect driver to back end, called to set up communication with
470  * back end driver both initially and on resume after restore/migrate.
471  */
472 void
473 xnf_be_connect(xnf_t *xnfp)
474 {
475 	char		mac[ETHERADDRL * 3];
476 	const char	*message;
477 	xenbus_transaction_t xbt;
478 	struct		xenbus_device *xsd;
479 	char		*xsname;
480 	int		err, be_no_cksum_offload;
481 
482 	ASSERT(!xnfp->xnf_connected);
483 
484 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
485 	xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
486 
487 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo), "mac",
488 	    "%s", (char *)&mac[0]);
489 	if (err != 0) {
490 		/*
491 		 * bad: we're supposed to be set up with a proper mac
492 		 * addr. at this point
493 		 */
494 		cmn_err(CE_WARN, "%s%d: no mac address",
495 		    ddi_driver_name(xnfp->xnf_devinfo),
496 		    ddi_get_instance(xnfp->xnf_devinfo));
497 			return;
498 	}
499 
500 	if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
501 		err = ENOENT;
502 		xenbus_dev_error(xsd, ENOENT, "parsing %s/mac", xsname);
503 		return;
504 	}
505 
506 	err = xnf_setup_rings(xnfp);
507 	if (err != 0) {
508 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
509 		xenbus_dev_error(xsd, err, "setting up ring");
510 		return;
511 	}
512 
513 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo),
514 	    "feature-no-csum-offload", "%d", &be_no_cksum_offload);
515 	/*
516 	 * If we fail to read the store we assume that the key is
517 	 * absent, implying an older domain at the far end.  Older
518 	 * domains always support checksum offload.
519 	 */
520 	if (err != 0)
521 		be_no_cksum_offload = 0;
522 	/*
523 	 * If the far end cannot do checksum offload or we do not wish
524 	 * to do it, disable it.
525 	 */
526 	if ((be_no_cksum_offload == 1) || !xnfp->xnf_cksum_offload)
527 		xnfp->xnf_cksum_offload = B_FALSE;
528 
529 again:
530 	err = xenbus_transaction_start(&xbt);
531 	if (err != 0) {
532 		xenbus_dev_error(xsd, EIO, "starting transaction");
533 		return;
534 	}
535 
536 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
537 	    xnfp->xnf_tx_ring_ref);
538 	if (err != 0) {
539 		message = "writing tx ring-ref";
540 		goto abort_transaction;
541 	}
542 
543 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
544 	    xnfp->xnf_rx_ring_ref);
545 	if (err != 0) {
546 		message = "writing rx ring-ref";
547 		goto abort_transaction;
548 	}
549 
550 	err = xenbus_printf(xbt, xsname, "event-channel", "%u",
551 	    xnfp->xnf_evtchn);
552 	if (err != 0) {
553 		message = "writing event-channel";
554 		goto abort_transaction;
555 	}
556 
557 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
558 	if (err != 0) {
559 		message = "writing feature-rx-notify";
560 		goto abort_transaction;
561 	}
562 
563 	if (!xnfp->xnf_tx_pages_readonly) {
564 		err = xenbus_printf(xbt, xsname, "feature-tx-writable",
565 		    "%d", 1);
566 		if (err != 0) {
567 			message = "writing feature-tx-writable";
568 			goto abort_transaction;
569 		}
570 	}
571 
572 	err = xenbus_printf(xbt, xsname, "feature-no-csum-offload", "%d",
573 	    xnfp->xnf_cksum_offload ? 0 : 1);
574 	if (err != 0) {
575 		message = "writing feature-no-csum-offload";
576 		goto abort_transaction;
577 	}
578 	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d",
579 	    xnfp->xnf_rx_hvcopy ? 1 : 0);
580 	if (err != 0) {
581 		message = "writing request-rx-copy";
582 		goto abort_transaction;
583 	}
584 
585 	err = xenbus_printf(xbt, xsname, "state", "%d", XenbusStateConnected);
586 	if (err != 0) {
587 		message = "writing frontend XenbusStateConnected";
588 		goto abort_transaction;
589 	}
590 
591 	err = xenbus_transaction_end(xbt, 0);
592 	if (err != 0) {
593 		if (err == EAGAIN)
594 			goto again;
595 		xenbus_dev_error(xsd, err, "completing transaction");
596 	}
597 
598 	return;
599 
600 abort_transaction:
601 	(void) xenbus_transaction_end(xbt, 1);
602 	xenbus_dev_error(xsd, err, "%s", message);
603 }
604 
605 /*
606  *  attach(9E) -- Attach a device to the system
607  *
608  *  Called once for each board successfully probed.
609  */
610 static int
611 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
612 {
613 	mac_register_t *macp;
614 	xnf_t *xnfp;
615 	int err;
616 
617 #ifdef XNF_DEBUG
618 	if (xnfdebug & XNF_DEBUG_DDI)
619 		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
620 		    (void *)devinfo);
621 #endif
622 
623 	switch (cmd) {
624 	case DDI_RESUME:
625 		xnfp = ddi_get_driver_private(devinfo);
626 
627 		(void) xvdi_resume(devinfo);
628 		(void) xvdi_alloc_evtchn(devinfo);
629 		xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
630 #ifdef XPV_HVM_DRIVER
631 		ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
632 		    xnfp);
633 #else
634 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
635 		    (caddr_t)xnfp);
636 #endif
637 		xnf_be_connect(xnfp);
638 		/*
639 		 * Our MAC address may have changed if we're resuming:
640 		 * - on a different host
641 		 * - on the same one and got a different MAC address
642 		 *   because we didn't specify one of our own.
643 		 * so it's useful to claim that it changed in order that
644 		 * IP send out a gratuitous ARP.
645 		 */
646 		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
647 		return (DDI_SUCCESS);
648 
649 	case DDI_ATTACH:
650 		break;
651 
652 	default:
653 		return (DDI_FAILURE);
654 	}
655 
656 	/*
657 	 *  Allocate gld_mac_info_t and xnf_instance structures
658 	 */
659 	macp = mac_alloc(MAC_VERSION);
660 	if (macp == NULL)
661 		return (DDI_FAILURE);
662 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
663 
664 	macp->m_dip = devinfo;
665 	macp->m_driver = xnfp;
666 	xnfp->xnf_devinfo = devinfo;
667 
668 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
669 	macp->m_src_addr = xnfp->xnf_mac_addr;
670 	macp->m_callbacks = &xnf_callbacks;
671 	macp->m_min_sdu = 0;
672 	macp->m_max_sdu = XNF_MAXPKT;
673 
674 	xnfp->xnf_running = B_FALSE;
675 	xnfp->xnf_connected = B_FALSE;
676 	xnfp->xnf_cksum_offload = xnf_cksum_offload;
677 	xnfp->xnf_tx_pages_readonly = xnf_tx_pages_readonly;
678 
679 	xnfp->xnf_rx_hvcopy = xnf_hvcopy_peer_status(devinfo) && xnf_rx_hvcopy;
680 #ifdef XPV_HVM_DRIVER
681 	if (!xnfp->xnf_rx_hvcopy) {
682 		cmn_err(CE_WARN, "The xnf driver requires a dom0 that "
683 		    "supports 'feature-rx-copy'");
684 		goto failure;
685 	}
686 #endif
687 
688 	/*
689 	 * Get the iblock cookie with which to initialize the mutexes.
690 	 */
691 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
692 	    != DDI_SUCCESS)
693 		goto failure;
694 	/*
695 	 * Driver locking strategy: the txlock protects all paths
696 	 * through the driver, except the interrupt thread.
697 	 * If the interrupt thread needs to do something which could
698 	 * affect the operation of any other part of the driver,
699 	 * it needs to acquire the txlock mutex.
700 	 */
701 	mutex_init(&xnfp->xnf_tx_buf_mutex,
702 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
703 	mutex_init(&xnfp->xnf_rx_buf_mutex,
704 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
705 	mutex_init(&xnfp->xnf_txlock,
706 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
707 	mutex_init(&xnfp->xnf_intrlock,
708 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
709 	cv_init(&xnfp->xnf_cv, NULL, CV_DEFAULT, NULL);
710 
711 	if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
712 	    &xnfp->xnf_gref_tx_head) < 0) {
713 		cmn_err(CE_WARN, "xnf%d: can't alloc tx grant refs",
714 		    ddi_get_instance(xnfp->xnf_devinfo));
715 		goto failure_1;
716 	}
717 	if (gnttab_alloc_grant_references(NET_RX_RING_SIZE,
718 	    &xnfp->xnf_gref_rx_head) < 0) {
719 		cmn_err(CE_WARN, "xnf%d: can't alloc rx grant refs",
720 		    ddi_get_instance(xnfp->xnf_devinfo));
721 		goto failure_1;
722 	}
723 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
724 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
725 		    "driver data structures",
726 		    ddi_get_instance(xnfp->xnf_devinfo));
727 		goto failure_1;
728 	}
729 
730 	xnfp->xnf_rx_ring.sring->rsp_event =
731 	    xnfp->xnf_tx_ring.sring->rsp_event = 1;
732 
733 	xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF;
734 	xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF;
735 
736 	/* set driver private pointer now */
737 	ddi_set_driver_private(devinfo, xnfp);
738 
739 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change)
740 	    != DDI_SUCCESS)
741 		goto failure_1;
742 
743 	if (!xnf_kstat_init(xnfp))
744 		goto failure_2;
745 
746 	/*
747 	 * Allocate an event channel, add the interrupt handler and
748 	 * bind it to the event channel.
749 	 */
750 	(void) xvdi_alloc_evtchn(devinfo);
751 	xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
752 #ifdef XPV_HVM_DRIVER
753 	ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
754 #else
755 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
756 #endif
757 
758 	/*
759 	 * connect to the backend
760 	 */
761 	xnf_be_connect(xnfp);
762 
763 	err = mac_register(macp, &xnfp->xnf_mh);
764 	mac_free(macp);
765 	macp = NULL;
766 	if (err != 0)
767 		goto failure_3;
768 
769 	return (DDI_SUCCESS);
770 
771 failure_3:
772 	kstat_delete(xnfp->xnf_kstat_aux);
773 
774 failure_2:
775 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
776 #ifdef XPV_HVM_DRIVER
777 	ec_unbind_evtchn(xnfp->xnf_evtchn);
778 #else
779 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
780 #endif
781 	xnfp->xnf_evtchn = INVALID_EVTCHN;
782 
783 failure_1:
784 	xnf_release_dma_resources(xnfp);
785 	cv_destroy(&xnfp->xnf_cv);
786 	mutex_destroy(&xnfp->xnf_rx_buf_mutex);
787 	mutex_destroy(&xnfp->xnf_txlock);
788 	mutex_destroy(&xnfp->xnf_intrlock);
789 
790 failure:
791 	kmem_free(xnfp, sizeof (*xnfp));
792 	if (macp != NULL)
793 		mac_free(macp);
794 
795 	return (DDI_FAILURE);
796 }
797 
798 /*  detach(9E) -- Detach a device from the system */
799 static int
800 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
801 {
802 	xnf_t *xnfp;		/* Our private device info */
803 	int i;
804 
805 #ifdef XNF_DEBUG
806 	if (xnfdebug & XNF_DEBUG_DDI)
807 		printf("xnf_detach(0x%p)\n", (void *)devinfo);
808 #endif
809 
810 	xnfp = ddi_get_driver_private(devinfo);
811 
812 	switch (cmd) {
813 	case DDI_SUSPEND:
814 #ifdef XPV_HVM_DRIVER
815 		ec_unbind_evtchn(xnfp->xnf_evtchn);
816 #else
817 		ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
818 #endif
819 
820 		xvdi_suspend(devinfo);
821 
822 		mutex_enter(&xnfp->xnf_intrlock);
823 		mutex_enter(&xnfp->xnf_txlock);
824 
825 		xnfp->xnf_evtchn = INVALID_EVTCHN;
826 		xnfp->xnf_connected = B_FALSE;
827 		mutex_exit(&xnfp->xnf_txlock);
828 		mutex_exit(&xnfp->xnf_intrlock);
829 		return (DDI_SUCCESS);
830 
831 	case DDI_DETACH:
832 		break;
833 
834 	default:
835 		return (DDI_FAILURE);
836 	}
837 
838 	if (xnfp->xnf_connected)
839 		return (DDI_FAILURE);
840 
841 	/* Wait for receive buffers to be returned; give up after 5 seconds */
842 	i = 50;
843 
844 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
845 	while (xnfp->xnf_rx_bufs_outstanding > 0) {
846 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
847 		delay(drv_usectohz(100000));
848 		if (--i == 0) {
849 			cmn_err(CE_WARN,
850 			    "xnf%d: never reclaimed all the "
851 			    "receive buffers.  Still have %d "
852 			    "buffers outstanding.",
853 			    ddi_get_instance(xnfp->xnf_devinfo),
854 			    xnfp->xnf_rx_bufs_outstanding);
855 			return (DDI_FAILURE);
856 		}
857 		mutex_enter(&xnfp->xnf_rx_buf_mutex);
858 	}
859 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
860 
861 	kstat_delete(xnfp->xnf_kstat_aux);
862 
863 	if (mac_unregister(xnfp->xnf_mh) != 0)
864 		return (DDI_FAILURE);
865 
866 	/* Stop the receiver */
867 	xnf_stop(xnfp);
868 
869 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
870 
871 	/* Remove the interrupt */
872 #ifdef XPV_HVM_DRIVER
873 	ec_unbind_evtchn(xnfp->xnf_evtchn);
874 #else
875 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
876 #endif
877 
878 	/* Release any pending xmit mblks */
879 	xnf_release_mblks(xnfp);
880 
881 	/* Release all DMA resources */
882 	xnf_release_dma_resources(xnfp);
883 
884 	cv_destroy(&xnfp->xnf_cv);
885 	mutex_destroy(&xnfp->xnf_rx_buf_mutex);
886 	mutex_destroy(&xnfp->xnf_txlock);
887 	mutex_destroy(&xnfp->xnf_intrlock);
888 
889 	kmem_free(xnfp, sizeof (*xnfp));
890 
891 	return (DDI_SUCCESS);
892 }
893 
894 /*
895  *  xnf_set_mac_addr() -- set the physical network address on the board.
896  */
897 /*ARGSUSED*/
898 static int
899 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
900 {
901 	xnf_t *xnfp = arg;
902 
903 #ifdef XNF_DEBUG
904 	if (xnfdebug & XNF_DEBUG_TRACE)
905 		printf("xnf%d: set_mac_addr(0x%p): "
906 		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
907 		    ddi_get_instance(xnfp->xnf_devinfo),
908 		    (void *)xnfp, macaddr[0], macaddr[1], macaddr[2],
909 		    macaddr[3], macaddr[4], macaddr[5]);
910 #endif
911 	/*
912 	 * We can't set our macaddr.
913 	 *
914 	 * XXPV dme: Why not?
915 	 */
916 	return (ENOTSUP);
917 }
918 
919 /*
920  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
921  *
922  *  Program the hardware to enable/disable the multicast address
923  *  in "mcast".  Enable if "add" is true, disable if false.
924  */
925 /*ARGSUSED*/
926 static int
927 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
928 {
929 	xnf_t *xnfp = arg;
930 
931 #ifdef XNF_DEBUG
932 	if (xnfdebug & XNF_DEBUG_TRACE)
933 		printf("xnf%d set_multicast(0x%p): "
934 		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
935 		    ddi_get_instance(xnfp->xnf_devinfo),
936 		    (void *)xnfp, mca[0], mca[1], mca[2],
937 		    mca[3], mca[4], mca[5]);
938 #endif
939 
940 	/*
941 	 * XXPV dme: Ideally we'd relay the address to the backend for
942 	 * enabling.  The protocol doesn't support that (interesting
943 	 * extension), so we simply succeed and hope that the relevant
944 	 * packets are going to arrive.
945 	 *
946 	 * If protocol support is added for enable/disable then we'll
947 	 * need to keep a list of those in use and re-add on resume.
948 	 */
949 	return (0);
950 }
951 
952 /*
953  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
954  *
955  *  Program the hardware to enable/disable promiscuous mode.
956  */
957 /*ARGSUSED*/
958 static int
959 xnf_set_promiscuous(void *arg, boolean_t on)
960 {
961 	xnf_t *xnfp = arg;
962 
963 #ifdef XNF_DEBUG
964 	if (xnfdebug & XNF_DEBUG_TRACE)
965 		printf("xnf%d set_promiscuous(0x%p, %x)\n",
966 		    ddi_get_instance(xnfp->xnf_devinfo),
967 		    (void *)xnfp, on);
968 #endif
969 	/*
970 	 * We can't really do this, but we pretend that we can in
971 	 * order that snoop will work.
972 	 */
973 	return (0);
974 }
975 
976 /*
977  * Clean buffers that we have responses for from the transmit ring.
978  */
979 static int
980 xnf_clean_tx_ring(xnf_t *xnfp)
981 {
982 	RING_IDX		next_resp, i;
983 	struct tx_pktinfo	*reap;
984 	int			id;
985 	grant_ref_t		ref;
986 
987 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
988 
989 	do {
990 		/*
991 		 * index of next transmission ack
992 		 */
993 		next_resp = xnfp->xnf_tx_ring.sring->rsp_prod;
994 		membar_consumer();
995 		/*
996 		 * Clean tx packets from ring that we have responses for
997 		 */
998 		for (i = xnfp->xnf_tx_ring.rsp_cons; i != next_resp; i++) {
999 			id = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i)->id;
1000 			reap = &xnfp->xnf_tx_pkt_info[id];
1001 			ref = reap->grant_ref;
1002 			/*
1003 			 * Return id to free list
1004 			 */
1005 			reap->id = xnfp->xnf_tx_pkt_id_list;
1006 			xnfp->xnf_tx_pkt_id_list = id;
1007 			if (gnttab_query_foreign_access(ref) != 0)
1008 				panic("tx grant still in use "
1009 				    "by backend domain");
1010 			(void) ddi_dma_unbind_handle(reap->dma_handle);
1011 			(void) gnttab_end_foreign_access_ref(ref,
1012 			    xnfp->xnf_tx_pages_readonly);
1013 			gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head,
1014 			    ref);
1015 			freemsg(reap->mp);
1016 			reap->mp = NULL;
1017 			reap->grant_ref = GRANT_INVALID_REF;
1018 			if (reap->bdesc != NULL)
1019 				xnf_free_tx_buffer(reap->bdesc);
1020 			reap->bdesc = NULL;
1021 		}
1022 		xnfp->xnf_tx_ring.rsp_cons = next_resp;
1023 		membar_enter();
1024 	} while (next_resp != xnfp->xnf_tx_ring.sring->rsp_prod);
1025 	return (NET_TX_RING_SIZE - (xnfp->xnf_tx_ring.sring->req_prod -
1026 	    next_resp));
1027 }
1028 
1029 /*
1030  * If we need to pull up data from either a packet that crosses a page
1031  * boundary or consisting of multiple mblks, do it here.  We allocate
1032  * a page aligned buffer and copy the data into it.  The header for the
1033  * allocated buffer is returned. (which is also allocated here)
1034  */
1035 static struct xnf_buffer_desc *
1036 xnf_pullupmsg(xnf_t *xnfp, mblk_t *mp)
1037 {
1038 	struct xnf_buffer_desc	*bdesc;
1039 	mblk_t			*mptr;
1040 	caddr_t			bp;
1041 	int			len;
1042 
1043 	/*
1044 	 * get a xmit buffer from the xmit buffer pool
1045 	 */
1046 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
1047 	bdesc = xnf_get_tx_buffer(xnfp);
1048 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
1049 	if (bdesc == NULL)
1050 		return (bdesc);
1051 	/*
1052 	 * Copy the data into the buffer
1053 	 */
1054 	xnfp->xnf_stat_tx_pullup++;
1055 	bp = bdesc->buf;
1056 	for (mptr = mp; mptr != NULL; mptr = mptr->b_cont) {
1057 		len = mptr->b_wptr - mptr->b_rptr;
1058 		bcopy(mptr->b_rptr, bp, len);
1059 		bp += len;
1060 	}
1061 	return (bdesc);
1062 }
1063 
1064 /*
1065  *  xnf_send_one() -- send a packet
1066  *
1067  *  Called when a packet is ready to be transmitted. A pointer to an
1068  *  M_DATA message that contains the packet is passed to this routine.
1069  *  At least the complete LLC header is contained in the message's
1070  *  first message block, and the remainder of the packet is contained
1071  *  within additional M_DATA message blocks linked to the first
1072  *  message block.
1073  *
1074  */
1075 static boolean_t
1076 xnf_send_one(xnf_t *xnfp, mblk_t *mp)
1077 {
1078 	struct xnf_buffer_desc	*xmitbuf;
1079 	struct tx_pktinfo	*txp_info;
1080 	mblk_t			*mptr;
1081 	ddi_dma_cookie_t	dma_cookie;
1082 	RING_IDX		slot, txs_out;
1083 	int			length = 0, i, pktlen = 0, rc, tx_id;
1084 	int			tx_ring_freespace, page_oops;
1085 	uint_t			ncookies;
1086 	volatile netif_tx_request_t	*txrp;
1087 	caddr_t			bufaddr;
1088 	grant_ref_t		ref;
1089 	unsigned long		mfn;
1090 	uint32_t		pflags;
1091 	domid_t			oeid;
1092 
1093 #ifdef XNF_DEBUG
1094 	if (xnfdebug & XNF_DEBUG_SEND)
1095 		printf("xnf%d send(0x%p, 0x%p)\n",
1096 		    ddi_get_instance(xnfp->xnf_devinfo),
1097 		    (void *)xnfp, (void *)mp);
1098 #endif
1099 
1100 	ASSERT(mp != NULL);
1101 	ASSERT(mp->b_next == NULL);
1102 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1103 
1104 	tx_ring_freespace = xnf_clean_tx_ring(xnfp);
1105 	ASSERT(tx_ring_freespace >= 0);
1106 
1107 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1108 	xnfp->xnf_stat_tx_attempt++;
1109 	/*
1110 	 * If there are no xmit ring slots available, return.
1111 	 */
1112 	if (tx_ring_freespace == 0) {
1113 		xnfp->xnf_stat_tx_defer++;
1114 		return (B_FALSE);	/* Send should be retried */
1115 	}
1116 
1117 	slot = xnfp->xnf_tx_ring.sring->req_prod;
1118 	/* Count the number of mblks in message and compute packet size */
1119 	for (i = 0, mptr = mp; mptr != NULL; mptr = mptr->b_cont, i++)
1120 		pktlen += (mptr->b_wptr - mptr->b_rptr);
1121 
1122 	/* Make sure packet isn't too large */
1123 	if (pktlen > XNF_FRAMESIZE) {
1124 		cmn_err(CE_WARN, "xnf%d: large packet %d bytes",
1125 		    ddi_get_instance(xnfp->xnf_devinfo), pktlen);
1126 		freemsg(mp);
1127 		return (B_FALSE);
1128 	}
1129 
1130 	/*
1131 	 * Test if we cross a page boundary with our buffer
1132 	 */
1133 	page_oops = (i == 1) &&
1134 	    (xnf_btop((size_t)mp->b_rptr) !=
1135 	    xnf_btop((size_t)(mp->b_rptr + pktlen)));
1136 	/*
1137 	 * XXPV - unfortunately, the Xen virtual net device currently
1138 	 * doesn't support multiple packet frags, so this will always
1139 	 * end up doing the pullup if we got more than one packet.
1140 	 */
1141 	if (i > xnf_max_tx_frags || page_oops) {
1142 		if (page_oops)
1143 			xnfp->xnf_stat_tx_pagebndry++;
1144 		if ((xmitbuf = xnf_pullupmsg(xnfp, mp)) == NULL) {
1145 			/* could not allocate resources? */
1146 #ifdef XNF_DEBUG
1147 			cmn_err(CE_WARN, "xnf%d: pullupmsg failed",
1148 			    ddi_get_instance(xnfp->xnf_devinfo));
1149 #endif
1150 			xnfp->xnf_stat_tx_defer++;
1151 			return (B_FALSE);	/* Retry send */
1152 		}
1153 		bufaddr = xmitbuf->buf;
1154 	} else {
1155 		xmitbuf = NULL;
1156 		bufaddr = (caddr_t)mp->b_rptr;
1157 	}
1158 
1159 	/* set up data descriptor */
1160 	length = pktlen;
1161 
1162 	/*
1163 	 * Get packet id from free list
1164 	 */
1165 	tx_id = xnfp->xnf_tx_pkt_id_list;
1166 	ASSERT(tx_id < NET_TX_RING_SIZE);
1167 	txp_info = &xnfp->xnf_tx_pkt_info[tx_id];
1168 	xnfp->xnf_tx_pkt_id_list = txp_info->id;
1169 	txp_info->id = tx_id;
1170 
1171 	/* Prepare for DMA mapping of tx buffer(s) */
1172 	rc = ddi_dma_addr_bind_handle(txp_info->dma_handle,
1173 	    NULL, bufaddr, length, DDI_DMA_WRITE | DDI_DMA_STREAMING,
1174 	    DDI_DMA_DONTWAIT, 0, &dma_cookie, &ncookies);
1175 	if (rc != DDI_DMA_MAPPED) {
1176 		ASSERT(rc != DDI_DMA_INUSE);
1177 		ASSERT(rc != DDI_DMA_PARTIAL_MAP);
1178 		/*
1179 		 *  Return id to free list
1180 		 */
1181 		txp_info->id = xnfp->xnf_tx_pkt_id_list;
1182 		xnfp->xnf_tx_pkt_id_list = tx_id;
1183 		if (rc == DDI_DMA_NORESOURCES) {
1184 			xnfp->xnf_stat_tx_defer++;
1185 			return (B_FALSE); /* Retry later */
1186 		}
1187 #ifdef XNF_DEBUG
1188 		cmn_err(CE_WARN, "xnf%d: bind_handle failed (%x)",
1189 		    ddi_get_instance(xnfp->xnf_devinfo), rc);
1190 #endif
1191 		return (B_FALSE);
1192 	}
1193 
1194 	ASSERT(ncookies == 1);
1195 	ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_tx_head);
1196 	ASSERT((signed short)ref >= 0);
1197 	mfn = xnf_btop(pa_to_ma((paddr_t)dma_cookie.dmac_laddress));
1198 	gnttab_grant_foreign_access_ref(ref, oeid, mfn,
1199 	    xnfp->xnf_tx_pages_readonly);
1200 	txp_info->grant_ref = ref;
1201 	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1202 	txrp->gref = ref;
1203 	txrp->size = dma_cookie.dmac_size;
1204 	txrp->offset = (uintptr_t)bufaddr & PAGEOFFSET;
1205 	txrp->id = tx_id;
1206 	txrp->flags = 0;
1207 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &pflags);
1208 	if (pflags != 0) {
1209 		ASSERT(xnfp->xnf_cksum_offload);
1210 		/*
1211 		 * If the local protocol stack requests checksum
1212 		 * offload we set the 'checksum blank' flag,
1213 		 * indicating to the peer that we need the checksum
1214 		 * calculated for us.
1215 		 *
1216 		 * We _don't_ set the validated flag, because we haven't
1217 		 * validated that the data and the checksum match.
1218 		 */
1219 		txrp->flags |= NETTXF_csum_blank;
1220 		xnfp->xnf_stat_tx_cksum_deferred++;
1221 	}
1222 	membar_producer();
1223 	xnfp->xnf_tx_ring.sring->req_prod = slot + 1;
1224 
1225 	txp_info->mp = mp;
1226 	txp_info->bdesc = xmitbuf;
1227 
1228 	txs_out = xnfp->xnf_tx_ring.sring->req_prod -
1229 	    xnfp->xnf_tx_ring.sring->rsp_prod;
1230 	if (xnfp->xnf_tx_ring.sring->req_prod - xnfp->xnf_tx_ring.rsp_cons <
1231 	    XNF_TX_FREE_THRESH) {
1232 		/*
1233 		 * The ring is getting full; Set up this packet
1234 		 * to cause an interrupt.
1235 		 */
1236 		xnfp->xnf_tx_ring.sring->rsp_event =
1237 		    xnfp->xnf_tx_ring.sring->rsp_prod + txs_out;
1238 	}
1239 
1240 	xnfp->xnf_stat_opackets++;
1241 	xnfp->xnf_stat_obytes += pktlen;
1242 
1243 	return (B_TRUE);	/* successful transmit attempt */
1244 }
1245 
1246 mblk_t *
1247 xnf_send(void *arg, mblk_t *mp)
1248 {
1249 	xnf_t *xnfp = arg;
1250 	mblk_t *next;
1251 	boolean_t sent_something = B_FALSE;
1252 
1253 	mutex_enter(&xnfp->xnf_txlock);
1254 
1255 	/*
1256 	 * Transmission attempts should be impossible without having
1257 	 * previously called xnf_start().
1258 	 */
1259 	ASSERT(xnfp->xnf_running);
1260 
1261 	/*
1262 	 * Wait for getting connected to the backend
1263 	 */
1264 	while (!xnfp->xnf_connected) {
1265 		cv_wait(&xnfp->xnf_cv, &xnfp->xnf_txlock);
1266 	}
1267 
1268 	while (mp != NULL) {
1269 		next = mp->b_next;
1270 		mp->b_next = NULL;
1271 
1272 		if (!xnf_send_one(xnfp, mp)) {
1273 			mp->b_next = next;
1274 			break;
1275 		}
1276 
1277 		mp = next;
1278 		sent_something = B_TRUE;
1279 	}
1280 
1281 	if (sent_something)
1282 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1283 
1284 	mutex_exit(&xnfp->xnf_txlock);
1285 
1286 	return (mp);
1287 }
1288 
1289 /*
1290  *  xnf_intr() -- ring interrupt service routine
1291  */
1292 static uint_t
1293 xnf_intr(caddr_t arg)
1294 {
1295 	xnf_t *xnfp = (xnf_t *)arg;
1296 	int tx_ring_space;
1297 
1298 	mutex_enter(&xnfp->xnf_intrlock);
1299 
1300 	/*
1301 	 * If not connected to the peer or not started by the upper
1302 	 * layers we cannot usefully handle interrupts.
1303 	 */
1304 	if (!(xnfp->xnf_connected && xnfp->xnf_running)) {
1305 		mutex_exit(&xnfp->xnf_intrlock);
1306 		xnfp->xnf_stat_unclaimed_interrupts++;
1307 		return (DDI_INTR_UNCLAIMED);
1308 	}
1309 
1310 #ifdef XNF_DEBUG
1311 	if (xnfdebug & XNF_DEBUG_INT)
1312 		printf("xnf%d intr(0x%p)\n",
1313 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1314 #endif
1315 	if (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1316 		mblk_t *mp;
1317 
1318 		if (xnfp->xnf_rx_hvcopy)
1319 			mp = xnf_process_hvcopy_recv(xnfp);
1320 		else
1321 			mp = xnf_process_recv(xnfp);
1322 
1323 		if (mp != NULL)
1324 			mac_rx(xnfp->xnf_mh, xnfp->xnf_rx_handle, mp);
1325 	}
1326 
1327 	/*
1328 	 * Is tx ring nearly full?
1329 	 */
1330 #define	inuse(r) ((r).sring->req_prod - (r).rsp_cons)
1331 
1332 	if ((NET_TX_RING_SIZE - inuse(xnfp->xnf_tx_ring)) <
1333 	    XNF_TX_FREE_THRESH) {
1334 		/*
1335 		 * Yes, clean it and try to start any blocked xmit
1336 		 * streams.
1337 		 */
1338 		mutex_enter(&xnfp->xnf_txlock);
1339 		tx_ring_space = xnf_clean_tx_ring(xnfp);
1340 		mutex_exit(&xnfp->xnf_txlock);
1341 		if (tx_ring_space > XNF_TX_FREE_THRESH) {
1342 			mutex_exit(&xnfp->xnf_intrlock);
1343 			mac_tx_update(xnfp->xnf_mh);
1344 			mutex_enter(&xnfp->xnf_intrlock);
1345 		} else {
1346 			/*
1347 			 * Schedule another tx interrupt when we have
1348 			 * sent enough packets to cross the threshold.
1349 			 */
1350 			xnfp->xnf_tx_ring.sring->rsp_event =
1351 			    xnfp->xnf_tx_ring.sring->rsp_prod +
1352 			    XNF_TX_FREE_THRESH - tx_ring_space + 1;
1353 		}
1354 	}
1355 #undef inuse
1356 
1357 	xnfp->xnf_stat_interrupts++;
1358 	mutex_exit(&xnfp->xnf_intrlock);
1359 	return (DDI_INTR_CLAIMED); /* indicate that the interrupt was for us */
1360 }
1361 
1362 /*
1363  *  xnf_start() -- start the board receiving and enable interrupts.
1364  */
1365 static int
1366 xnf_start(void *arg)
1367 {
1368 	xnf_t *xnfp = arg;
1369 
1370 #ifdef XNF_DEBUG
1371 	if (xnfdebug & XNF_DEBUG_TRACE)
1372 		printf("xnf%d start(0x%p)\n",
1373 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1374 #endif
1375 
1376 	mutex_enter(&xnfp->xnf_intrlock);
1377 	mutex_enter(&xnfp->xnf_txlock);
1378 
1379 	/* Accept packets from above. */
1380 	xnfp->xnf_running = B_TRUE;
1381 
1382 	mutex_exit(&xnfp->xnf_txlock);
1383 	mutex_exit(&xnfp->xnf_intrlock);
1384 
1385 	return (0);
1386 }
1387 
1388 /* xnf_stop() - disable hardware */
1389 static void
1390 xnf_stop(void *arg)
1391 {
1392 	xnf_t *xnfp = arg;
1393 
1394 #ifdef XNF_DEBUG
1395 	if (xnfdebug & XNF_DEBUG_TRACE)
1396 		printf("xnf%d stop(0x%p)\n",
1397 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1398 #endif
1399 
1400 	mutex_enter(&xnfp->xnf_intrlock);
1401 	mutex_enter(&xnfp->xnf_txlock);
1402 
1403 	xnfp->xnf_running = B_FALSE;
1404 
1405 	mutex_exit(&xnfp->xnf_txlock);
1406 	mutex_exit(&xnfp->xnf_intrlock);
1407 }
1408 
1409 /*
1410  * Driver private functions follow
1411  */
1412 
1413 /*
1414  * Hang buffer on rx ring
1415  */
1416 static void
1417 rx_buffer_hang(xnf_t *xnfp, struct xnf_buffer_desc *bdesc)
1418 {
1419 	volatile netif_rx_request_t	*reqp;
1420 	RING_IDX			hang_ix;
1421 	grant_ref_t			ref;
1422 	domid_t				oeid;
1423 
1424 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1425 
1426 	ASSERT(MUTEX_HELD(&xnfp->xnf_intrlock));
1427 	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
1428 	    xnfp->xnf_rx_ring.req_prod_pvt);
1429 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
1430 	ASSERT(xnfp->xnf_rxpkt_bufptr[hang_ix] == NULL);
1431 	if (bdesc->grant_ref == GRANT_INVALID_REF) {
1432 		ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_rx_head);
1433 		ASSERT((signed short)ref >= 0);
1434 		bdesc->grant_ref = ref;
1435 		if (xnfp->xnf_rx_hvcopy) {
1436 			pfn_t pfn = xnf_btop(bdesc->buf_phys);
1437 			mfn_t mfn = pfn_to_mfn(pfn);
1438 
1439 			gnttab_grant_foreign_access_ref(ref, oeid, mfn, 0);
1440 		} else {
1441 			gnttab_grant_foreign_transfer_ref(ref, oeid);
1442 		}
1443 	}
1444 	reqp->id = hang_ix;
1445 	reqp->gref = bdesc->grant_ref;
1446 	bdesc->id = hang_ix;
1447 	xnfp->xnf_rxpkt_bufptr[hang_ix] = bdesc;
1448 	membar_producer();
1449 	xnfp->xnf_rx_ring.req_prod_pvt++;
1450 }
1451 
/*
 * Process all queued received packets when the receive buffers are
 * shared with the peer by access grant (xnf_rx_hvcopy set).
 *
 * Every good packet is copied out of its receive buffer into a newly
 * allocated mblk; the receive buffer itself is always put straight
 * back on the ring by rx_buffer_hang().
 *
 * rx_buffer_hang() asserts that xnf_intrlock is held, so the caller
 * must hold it.
 *
 * Returns the received packets as a chain linked through b_next, or
 * NULL if nothing could be passed up.
 */
static mblk_t *
xnf_process_hvcopy_recv(xnf_t *xnfp)
{
	netif_rx_response_t *rxpkt;
	mblk_t		*mp, *head, *tail;
	struct		xnf_buffer_desc *bdesc;
	boolean_t	hwcsum = B_FALSE, notify, work_to_do;
	size_t 		len;

	/*
	 * in loop over unconsumed responses, we do:
	 * 1. get a response
	 * 2. take corresponding buffer off recv. ring
	 * 3. indicate this by setting slot to NULL
	 * 4. create a new message and
	 * 5. copy data in, adjust ptr
	 *
	 * outside loop:
	 * 7. make sure no more data has arrived; kick HV
	 *
	 * (There is no step 6.)
	 */

	head = tail = NULL;

loop:
	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {

		/* 1. */
		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
		    xnfp->xnf_rx_ring.rsp_cons);

		DTRACE_PROBE4(got_PKT, int, (int)rxpkt->id, int,
		    (int)rxpkt->offset,
		    int, (int)rxpkt->flags, int, (int)rxpkt->status);

		/*
		 * 2.
		 * Take buffer off of receive ring
		 */
		hwcsum = B_FALSE;
		bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id];
		/* 3 */
		xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL;
		ASSERT(bdesc->id == rxpkt->id);
		if (rxpkt->status <= 0) {
			/*
			 * Zero or negative status: nothing is passed
			 * up, only the error counters are updated.
			 */
			DTRACE_PROBE4(pkt_status_negative, int, rxpkt->status,
			    char *, bdesc->buf, int, rxpkt->offset,
			    char *, ((char *)bdesc->buf) + rxpkt->offset);
			mp = NULL;
			xnfp->xnf_stat_errrx++;
			if (rxpkt->status == 0)
				xnfp->xnf_stat_runt++;
			if (rxpkt->status == NETIF_RSP_ERROR)
				xnfp->xnf_stat_mac_rcv_error++;
			if (rxpkt->status == NETIF_RSP_DROPPED)
				xnfp->xnf_stat_norxbuf++;
			/*
			 * re-hang the buffer
			 */
			rx_buffer_hang(xnfp, bdesc);
		} else {
			grant_ref_t		ref =  bdesc->grant_ref;
			struct xnf_buffer_desc	*new_bdesc;
			unsigned long		off = rxpkt->offset;

			DTRACE_PROBE4(pkt_status_ok, int, rxpkt->status,
			    char *, bdesc->buf, int, rxpkt->offset,
			    char *, ((char *)bdesc->buf) + rxpkt->offset);
			/* A positive status is the length of the packet. */
			len = rxpkt->status;
			ASSERT(off + len <= PAGEOFFSET);
			if (ref == GRANT_INVALID_REF) {
				/*
				 * The buffer has no grant: warn, drop
				 * the packet and re-hang the buffer.
				 */
				mp = NULL;
				new_bdesc = bdesc;
				cmn_err(CE_WARN, "Bad rx grant reference %d "
				    "from dom %d", ref,
				    xvdi_get_oeid(xnfp->xnf_devinfo));
				goto luckless;
			}
			/*
			 * Release ref which we'll be re-claiming in
			 * rx_buffer_hang().
			 */
			bdesc->grant_ref = GRANT_INVALID_REF;
			(void) gnttab_end_foreign_access_ref(ref, 0);
			gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head,
			    ref);
			if (rxpkt->flags & NETRXF_data_validated)
				hwcsum = B_TRUE;

			/*
			 * XXPV for the initial implementation of HVcopy,
			 * create a new msg and copy in the data
			 */
			/* 4. */
			if ((mp = allocb(len, BPRI_MED)) == NULL) {
				/*
				 * Couldn't get buffer to copy to,
				 * drop this data, and re-hang
				 * the buffer on the ring.
				 */
				xnfp->xnf_stat_norxbuf++;
				DTRACE_PROBE(alloc_nix);
			} else {
				/* 5. */
				DTRACE_PROBE(alloc_ok);
				bcopy(bdesc->buf + off, mp->b_wptr,
				    len);
				mp->b_wptr += len;
			}
			/* In this mode the same buffer is always reused. */
			new_bdesc = bdesc;
luckless:

			/* Re-hang old or hang new buffer. */
			rx_buffer_hang(xnfp, new_bdesc);
		}
		if (mp) {
			if (hwcsum) {
				/*
				 * See comments in xnf_process_recv().
				 */

				(void) hcksum_assoc(mp, NULL,
				    NULL, 0, 0, 0, 0,
				    HCK_FULLCKSUM |
				    HCK_FULLCKSUM_OK,
				    0);
				xnfp->xnf_stat_rx_cksum_no_need++;
			}
			/* Append the packet to the chain being built. */
			if (head == NULL) {
				head = tail = mp;
			} else {
				tail->b_next = mp;
				tail = mp;
			}

			ASSERT(mp->b_next == NULL);

			xnfp->xnf_stat_ipackets++;
			xnfp->xnf_stat_rbytes += len;
		}

		/* This response is done with, whatever its outcome. */
		xnfp->xnf_rx_ring.rsp_cons++;

		xnfp->xnf_stat_hvcopy_packet_processed++;
	}

	/* 7. */
	/*
	 * Has more data come in since we started?
	 */
	/* LINTED: constant in conditional context */
	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do);
	if (work_to_do)
		goto loop;

	/*
	 * Indicate to the backend that we have re-filled the receive
	 * ring.
	 */
	/* LINTED: constant in conditional context */
	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
	if (notify)
		ec_notify_via_evtchn(xnfp->xnf_evtchn);

	return (head);
}
1617 
/*
 * Process all queued received packets when receive buffers are handed
 * to the peer by page transfer grant (the non-hvcopy path chosen by
 * xnf_intr()).
 *
 * For each good response the transfer is completed and the machine
 * page it yields is installed behind the buffer.  Then either
 *  - the data is copied into a freshly allocated mblk, the page is
 *    given back through balloon_free_pages() and the same buffer is
 *    re-hung (packets of at most xnf_rx_bcopy_thresh bytes, or when no
 *    replacement buffer is available); or
 *  - the buffer itself is wrapped in an mblk by desballoc() and a
 *    replacement buffer is hung on the ring instead.
 *
 * rx_buffer_hang() asserts that xnf_intrlock is held, so the caller
 * must hold it.
 *
 * Returns the received packets as a chain linked through b_next, or
 * NULL if nothing could be passed up.
 */
static mblk_t *
xnf_process_recv(xnf_t *xnfp)
{
	volatile netif_rx_response_t *rxpkt;
	mblk_t *mp, *head, *tail;
	struct xnf_buffer_desc *bdesc;
	extern mblk_t *desballoc(unsigned char *, size_t, uint_t, frtn_t *);
	boolean_t hwcsum = B_FALSE, notify, work_to_do;
	size_t len;
	pfn_t pfn;
	long cnt;

	head = tail = NULL;
loop:
	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {

		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
		    xnfp->xnf_rx_ring.rsp_cons);

		/*
		 * Take buffer off of receive ring
		 */
		hwcsum = B_FALSE;
		bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id];
		xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL;
		ASSERT(bdesc->id == rxpkt->id);
		if (rxpkt->status <= 0) {
			/*
			 * Zero or negative status: nothing is passed
			 * up, only the error counters are updated.
			 */
			mp = NULL;
			xnfp->xnf_stat_errrx++;
			if (rxpkt->status == 0)
				xnfp->xnf_stat_runt++;
			if (rxpkt->status == NETIF_RSP_ERROR)
				xnfp->xnf_stat_mac_rcv_error++;
			if (rxpkt->status == NETIF_RSP_DROPPED)
				xnfp->xnf_stat_norxbuf++;
			/*
			 * re-hang the buffer
			 */
			rx_buffer_hang(xnfp, bdesc);
		} else {
			grant_ref_t ref =  bdesc->grant_ref;
			struct xnf_buffer_desc *new_bdesc;
			unsigned long off = rxpkt->offset;
			unsigned long mfn;

			/* A positive status is the length of the packet. */
			len = rxpkt->status;
			ASSERT(off + len <= PAGEOFFSET);
			if (ref == GRANT_INVALID_REF) {
				/*
				 * The buffer has no grant: warn, drop
				 * the packet and re-hang the buffer.
				 */
				mp = NULL;
				new_bdesc = bdesc;
				cmn_err(CE_WARN, "Bad rx grant reference %d "
				    "from dom %d", ref,
				    xvdi_get_oeid(xnfp->xnf_devinfo));
				goto luckless;
			}
			/*
			 * Complete the transfer; it yields the machine
			 * frame that now carries the packet data.
			 */
			bdesc->grant_ref = GRANT_INVALID_REF;
			mfn = gnttab_end_foreign_transfer_ref(ref);
			ASSERT(mfn != MFN_INVALID);
			ASSERT(hat_getpfnum(kas.a_hat, bdesc->buf) ==
			    PFN_INVALID);

			gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head,
			    ref);
			/*
			 * Attach the received frame to the buffer's pfn
			 * and map the buffer's kernel address onto it.
			 * NOTE(review): balloon_drv_added() presumably
			 * tells the balloon code about the extra page
			 * -- confirm.
			 */
			reassign_pfn(xnf_btop(bdesc->buf_phys), mfn);
			hat_devload(kas.a_hat, bdesc->buf, PAGESIZE,
			    xnf_btop(bdesc->buf_phys),
			    PROT_READ | PROT_WRITE, HAT_LOAD);
			balloon_drv_added(1);

			if (rxpkt->flags & NETRXF_data_validated)
				hwcsum = B_TRUE;
			if (len <= xnf_rx_bcopy_thresh) {
				/*
				 * For small buffers, just copy the data
				 * and send the copy upstream.
				 */
				new_bdesc = NULL;
			} else {
				/*
				 * We send a pointer to this data upstream;
				 * we need a new buffer to replace this one.
				 */
				mutex_enter(&xnfp->xnf_rx_buf_mutex);
				new_bdesc = xnf_get_buffer(xnfp);
				if (new_bdesc != NULL) {
					xnfp->xnf_rx_bufs_outstanding++;
				} else {
					xnfp->xnf_stat_rx_no_ringbuf++;
				}
				mutex_exit(&xnfp->xnf_rx_buf_mutex);
			}

			if (new_bdesc == NULL) {
				/*
				 * Don't have a new ring buffer; bcopy the data
				 * from the buffer, and preserve the
				 * original buffer
				 */
				if ((mp = allocb(len, BPRI_MED)) == NULL) {
					/*
					 * Couldn't get buffer to copy to,
					 * drop this data, and re-hang
					 * the buffer on the ring.
					 */
					xnfp->xnf_stat_norxbuf++;
				} else {
					/* b_wptr is advanced at luckless. */
					bcopy(bdesc->buf + off, mp->b_wptr,
					    len);
				}
				/*
				 * Give the buffer page back to xen
				 */
				pfn = xnf_btop(bdesc->buf_phys);
				cnt = balloon_free_pages(1, &mfn, bdesc->buf,
				    &pfn);
				if (cnt != 1) {
					cmn_err(CE_WARN, "unable to give a "
					    "page back to the hypervisor\n");
				}
				new_bdesc = bdesc;
			} else {
				/*
				 * Loan the buffer upstream.  The descriptor
				 * is passed as the frtn_t argument of
				 * desballoc().
				 */
				if ((mp = desballoc((unsigned char *)bdesc->buf,
				    off + len, 0, (frtn_t *)bdesc)) == NULL) {
					/*
					 * Couldn't get mblk to pass recv data
					 * up with, free the old ring buffer
					 */
					xnfp->xnf_stat_norxbuf++;
					xnf_rcv_complete(bdesc);
					goto luckless;
				}
				(void) ddi_dma_sync(bdesc->dma_handle,
				    0, 0, DDI_DMA_SYNC_FORCPU);

				/* Skip to where the packet data starts. */
				mp->b_wptr += off;
				mp->b_rptr += off;
			}
luckless:
			/* Common to both paths: account for the data. */
			if (mp)
				mp->b_wptr += len;
			/* re-hang old or hang new buffer */
			rx_buffer_hang(xnfp, new_bdesc);
		}
		if (mp) {
			if (hwcsum) {
				/*
				 * If the peer says that the data has
				 * been validated then we declare that
				 * the full checksum has been
				 * verified.
				 *
				 * We don't look at the "checksum
				 * blank" flag, and hence could have a
				 * packet here that we are asserting
				 * is good with a blank checksum.
				 *
				 * The hardware checksum offload
				 * specification says that we must
				 * provide the actual checksum as well
				 * as an assertion that it is valid,
				 * but the protocol stack doesn't
				 * actually use it and some other
				 * drivers don't bother, so we don't.
				 * If it was necessary we could grovel
				 * in the packet to find it.
				 */

				(void) hcksum_assoc(mp, NULL,
				    NULL, 0, 0, 0, 0,
				    HCK_FULLCKSUM |
				    HCK_FULLCKSUM_OK,
				    0);
				xnfp->xnf_stat_rx_cksum_no_need++;
			}
			/* Append the packet to the chain being built. */
			if (head == NULL) {
				head = tail = mp;
			} else {
				tail->b_next = mp;
				tail = mp;
			}

			ASSERT(mp->b_next == NULL);

			xnfp->xnf_stat_ipackets++;
			xnfp->xnf_stat_rbytes += len;
		}

		/* This response is done with, whatever its outcome. */
		xnfp->xnf_rx_ring.rsp_cons++;
	}

	/*
	 * Has more data come in since we started?
	 */
	/* LINTED: constant in conditional context */
	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do);
	if (work_to_do)
		goto loop;

	/*
	 * Indicate to the backend that we have re-filled the receive
	 * ring.
	 */
	/* LINTED: constant in conditional context */
	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
	if (notify)
		ec_notify_via_evtchn(xnfp->xnf_evtchn);

	return (head);
}
1828 
1829 /* Called when the upper layers free a message we passed upstream */
1830 static void
1831 xnf_rcv_complete(struct xnf_buffer_desc *bdesc)
1832 {
1833 	xnf_t *xnfp = bdesc->xnfp;
1834 	pfn_t pfn;
1835 	long cnt;
1836 
1837 	/* One less outstanding receive buffer */
1838 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
1839 	--xnfp->xnf_rx_bufs_outstanding;
1840 	/*
1841 	 * Return buffer to the free list, unless the free list is getting
1842 	 * too large.  XXPV - this threshold may need tuning.
1843 	 */
1844 	if (xnfp->xnf_rx_descs_free < xnf_rx_bufs_lowat) {
1845 		/*
1846 		 * Unmap the page, and hand the machine page back
1847 		 * to xen so it can be re-used as a backend net buffer.
1848 		 */
1849 		pfn = xnf_btop(bdesc->buf_phys);
1850 		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
1851 		if (cnt != 1) {
1852 			cmn_err(CE_WARN, "unable to give a page back to the "
1853 			    "hypervisor\n");
1854 		}
1855 
1856 		bdesc->next = xnfp->xnf_free_list;
1857 		xnfp->xnf_free_list = bdesc;
1858 		xnfp->xnf_rx_descs_free++;
1859 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
1860 	} else {
1861 		/*
1862 		 * We can return everything here since we have a free buffer
1863 		 * that we have not given the backing page for back to xen.
1864 		 */
1865 		--xnfp->xnf_rx_buffer_count;
1866 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
1867 		(void) ddi_dma_unbind_handle(bdesc->dma_handle);
1868 		ddi_dma_mem_free(&bdesc->acc_handle);
1869 		ddi_dma_free_handle(&bdesc->dma_handle);
1870 		kmem_free(bdesc, sizeof (*bdesc));
1871 	}
1872 }
1873 
1874 /*
1875  *  xnf_alloc_dma_resources() -- initialize the drivers structures
1876  */
1877 static int
1878 xnf_alloc_dma_resources(xnf_t *xnfp)
1879 {
1880 	dev_info_t 		*devinfo = xnfp->xnf_devinfo;
1881 	int			i;
1882 	size_t			len;
1883 	ddi_dma_cookie_t	dma_cookie;
1884 	uint_t			ncookies;
1885 	struct xnf_buffer_desc	*bdesc;
1886 	int			rc;
1887 	caddr_t			rptr;
1888 
1889 	xnfp->xnf_n_rx = NET_RX_RING_SIZE;
1890 	xnfp->xnf_max_rx_bufs = xnf_rx_bufs_hiwat;
1891 
1892 	xnfp->xnf_n_tx = NET_TX_RING_SIZE;
1893 
1894 	/*
1895 	 * The code below allocates all the DMA data structures that
1896 	 * need to be released when the driver is detached.
1897 	 *
1898 	 * First allocate handles for mapping (virtual address) pointers to
1899 	 * transmit data buffers to physical addresses
1900 	 */
1901 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
1902 		if ((rc = ddi_dma_alloc_handle(devinfo,
1903 		    &tx_buffer_dma_attr, DDI_DMA_SLEEP, 0,
1904 		    &xnfp->xnf_tx_pkt_info[i].dma_handle)) != DDI_SUCCESS)
1905 			return (DDI_FAILURE);
1906 	}
1907 
1908 	/*
1909 	 * Allocate page for the transmit descriptor ring.
1910 	 */
1911 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
1912 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
1913 		goto alloc_error;
1914 
1915 	if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
1916 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
1917 	    DDI_DMA_SLEEP, 0, &rptr, &len,
1918 	    &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
1919 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
1920 		xnfp->xnf_tx_ring_dma_handle = NULL;
1921 		goto alloc_error;
1922 	}
1923 
1924 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
1925 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
1926 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
1927 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
1928 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
1929 		xnfp->xnf_tx_ring_dma_handle = NULL;
1930 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
1931 		if (rc == DDI_DMA_NORESOURCES)
1932 			goto alloc_error;
1933 		else
1934 			goto error;
1935 	}
1936 
1937 	ASSERT(ncookies == 1);
1938 	bzero(rptr, PAGESIZE);
1939 	/* LINTED: constant in conditional context */
1940 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
1941 	/* LINTED: constant in conditional context */
1942 	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
1943 	xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
1944 
1945 	/*
1946 	 * Allocate page for the receive descriptor ring.
1947 	 */
1948 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
1949 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
1950 		goto alloc_error;
1951 
1952 	if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
1953 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
1954 	    DDI_DMA_SLEEP, 0, &rptr, &len,
1955 	    &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
1956 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
1957 		xnfp->xnf_rx_ring_dma_handle = NULL;
1958 		goto alloc_error;
1959 	}
1960 
1961 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
1962 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
1963 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
1964 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
1965 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
1966 		xnfp->xnf_rx_ring_dma_handle = NULL;
1967 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
1968 		if (rc == DDI_DMA_NORESOURCES)
1969 			goto alloc_error;
1970 		else
1971 			goto error;
1972 	}
1973 
1974 	ASSERT(ncookies == 1);
1975 	bzero(rptr, PAGESIZE);
1976 	/* LINTED: constant in conditional context */
1977 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
1978 	/* LINTED: constant in conditional context */
1979 	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
1980 	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
1981 
1982 	/*
1983 	 * Preallocate receive buffers for each receive descriptor.
1984 	 */
1985 
1986 	/* Set up the "free list" of receive buffer descriptors */
1987 	for (i = 0; i < xnfp->xnf_n_rx; i++) {
1988 		if ((bdesc = xnf_alloc_buffer(xnfp)) == NULL)
1989 			goto alloc_error;
1990 		bdesc->next = xnfp->xnf_free_list;
1991 		xnfp->xnf_free_list = bdesc;
1992 	}
1993 
1994 	return (DDI_SUCCESS);
1995 
1996 alloc_error:
1997 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
1998 	    ddi_get_instance(xnfp->xnf_devinfo));
1999 error:
2000 	xnf_release_dma_resources(xnfp);
2001 	return (DDI_FAILURE);
2002 }
2003 
2004 /*
2005  * Release all DMA resources in the opposite order from acquisition
2006  * Should not be called until all outstanding esballoc buffers
2007  * have been returned.
2008  */
2009 static void
2010 xnf_release_dma_resources(xnf_t *xnfp)
2011 {
2012 	int i;
2013 
2014 	/*
2015 	 * Free receive buffers which are currently associated with
2016 	 * descriptors
2017 	 */
2018 	for (i = 0; i < xnfp->xnf_n_rx; i++) {
2019 		struct xnf_buffer_desc *bp;
2020 
2021 		if ((bp = xnfp->xnf_rxpkt_bufptr[i]) == NULL)
2022 			continue;
2023 		xnf_free_buffer(bp);
2024 		xnfp->xnf_rxpkt_bufptr[i] = NULL;
2025 	}
2026 
2027 	/* Free the receive ring buffer */
2028 	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2029 		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2030 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2031 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2032 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2033 	}
2034 	/* Free the transmit ring buffer */
2035 	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2036 		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2037 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2038 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2039 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2040 	}
2041 }
2042 
2043 static void
2044 xnf_release_mblks(xnf_t *xnfp)
2045 {
2046 	int	i;
2047 
2048 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
2049 		if (xnfp->xnf_tx_pkt_info[i].mp == NULL)
2050 			continue;
2051 		freemsg(xnfp->xnf_tx_pkt_info[i].mp);
2052 		xnfp->xnf_tx_pkt_info[i].mp = NULL;
2053 		(void) ddi_dma_unbind_handle(
2054 		    xnfp->xnf_tx_pkt_info[i].dma_handle);
2055 	}
2056 }
2057 
2058 /*
2059  * Remove a xmit buffer descriptor from the head of the free list and return
2060  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
2061  * Called with the tx_buf_mutex held.
2062  */
2063 static struct xnf_buffer_desc *
2064 xnf_get_tx_buffer(xnf_t *xnfp)
2065 {
2066 	struct xnf_buffer_desc *bdesc;
2067 
2068 	bdesc = xnfp->xnf_tx_free_list;
2069 	if (bdesc != NULL) {
2070 		xnfp->xnf_tx_free_list = bdesc->next;
2071 	} else {
2072 		bdesc = xnf_alloc_tx_buffer(xnfp);
2073 	}
2074 	return (bdesc);
2075 }
2076 
2077 /*
2078  * Remove a buffer descriptor from the head of the free list and return
2079  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
2080  * Called with the rx_buf_mutex held.
2081  */
2082 static struct xnf_buffer_desc *
2083 xnf_get_buffer(xnf_t *xnfp)
2084 {
2085 	struct xnf_buffer_desc *bdesc;
2086 
2087 	bdesc = xnfp->xnf_free_list;
2088 	if (bdesc != NULL) {
2089 		xnfp->xnf_free_list = bdesc->next;
2090 		xnfp->xnf_rx_descs_free--;
2091 	} else {
2092 		bdesc = xnf_alloc_buffer(xnfp);
2093 	}
2094 	return (bdesc);
2095 }
2096 
2097 /*
2098  * Free a xmit buffer back to the xmit free list
2099  */
2100 static void
2101 xnf_free_tx_buffer(struct xnf_buffer_desc *bp)
2102 {
2103 	xnf_t *xnfp = bp->xnfp;
2104 
2105 	mutex_enter(&xnfp->xnf_tx_buf_mutex);
2106 	bp->next = xnfp->xnf_tx_free_list;
2107 	xnfp->xnf_tx_free_list = bp;
2108 	mutex_exit(&xnfp->xnf_tx_buf_mutex);
2109 }
2110 
2111 /*
2112  * Put a buffer descriptor onto the head of the free list.
2113  * for page-flip:
2114  * We can't really free these buffers back to the kernel
2115  * since we have given away their backing page to be used
2116  * by the back end net driver.
2117  * for hvcopy:
2118  * release all the memory
2119  */
2120 static void
2121 xnf_free_buffer(struct xnf_buffer_desc *bdesc)
2122 {
2123 	xnf_t *xnfp = bdesc->xnfp;
2124 
2125 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
2126 	if (xnfp->xnf_rx_hvcopy) {
2127 		if (ddi_dma_unbind_handle(bdesc->dma_handle) != DDI_SUCCESS)
2128 			goto out;
2129 		ddi_dma_mem_free(&bdesc->acc_handle);
2130 		ddi_dma_free_handle(&bdesc->dma_handle);
2131 		kmem_free(bdesc, sizeof (*bdesc));
2132 		xnfp->xnf_rx_buffer_count--;
2133 	} else {
2134 		bdesc->next = xnfp->xnf_free_list;
2135 		xnfp->xnf_free_list = bdesc;
2136 		xnfp->xnf_rx_descs_free++;
2137 	}
2138 out:
2139 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
2140 }
2141 
2142 /*
2143  * Allocate a DMA-able xmit buffer, including a structure to
2144  * keep track of the buffer.  Called with tx_buf_mutex held.
2145  */
2146 static struct xnf_buffer_desc *
2147 xnf_alloc_tx_buffer(xnf_t *xnfp)
2148 {
2149 	struct xnf_buffer_desc *bdesc;
2150 	size_t len;
2151 
2152 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
2153 		return (NULL);
2154 
2155 	/* allocate a DMA access handle for receive buffer */
2156 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buffer_dma_attr,
2157 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2158 		goto failure;
2159 
2160 	/* Allocate DMA-able memory for transmit buffer */
2161 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2162 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
2163 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2164 		goto failure_1;
2165 
2166 	bdesc->xnfp = xnfp;
2167 	xnfp->xnf_tx_buffer_count++;
2168 
2169 	return (bdesc);
2170 
2171 failure_1:
2172 	ddi_dma_free_handle(&bdesc->dma_handle);
2173 
2174 failure:
2175 	kmem_free(bdesc, sizeof (*bdesc));
2176 	return (NULL);
2177 }
2178 
2179 /*
2180  * Allocate a DMA-able receive buffer, including a structure to
2181  * keep track of the buffer.  Called with rx_buf_mutex held.
2182  */
2183 static struct xnf_buffer_desc *
2184 xnf_alloc_buffer(xnf_t *xnfp)
2185 {
2186 	struct			xnf_buffer_desc *bdesc;
2187 	size_t			len;
2188 	uint_t			ncookies;
2189 	ddi_dma_cookie_t	dma_cookie;
2190 	long			cnt;
2191 	pfn_t			pfn;
2192 
2193 	if (xnfp->xnf_rx_buffer_count >= xnfp->xnf_max_rx_bufs)
2194 		return (NULL);
2195 
2196 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
2197 		return (NULL);
2198 
2199 	/* allocate a DMA access handle for receive buffer */
2200 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buffer_dma_attr,
2201 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2202 		goto failure;
2203 
2204 	/* Allocate DMA-able memory for receive buffer */
2205 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2206 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
2207 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2208 		goto failure_1;
2209 
2210 	/* bind to virtual address of buffer to get physical address */
2211 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2212 	    bdesc->buf, PAGESIZE, DDI_DMA_READ | DDI_DMA_STREAMING,
2213 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2214 		goto failure_2;
2215 
2216 	bdesc->buf_phys = dma_cookie.dmac_laddress;
2217 	bdesc->xnfp = xnfp;
2218 	if (xnfp->xnf_rx_hvcopy) {
2219 		bdesc->free_rtn.free_func = xnf_copy_rcv_complete;
2220 	} else {
2221 		bdesc->free_rtn.free_func = xnf_rcv_complete;
2222 	}
2223 	bdesc->free_rtn.free_arg = (char *)bdesc;
2224 	bdesc->grant_ref = GRANT_INVALID_REF;
2225 	ASSERT(ncookies == 1);
2226 
2227 	xnfp->xnf_rx_buffer_count++;
2228 
2229 	if (!xnfp->xnf_rx_hvcopy) {
2230 		/*
2231 		 * Unmap the page, and hand the machine page back
2232 		 * to xen so it can be used as a backend net buffer.
2233 		 */
2234 		pfn = xnf_btop(bdesc->buf_phys);
2235 		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
2236 		if (cnt != 1) {
2237 			cmn_err(CE_WARN, "unable to give a page back to the "
2238 			    "hypervisor\n");
2239 		}
2240 	}
2241 
2242 	return (bdesc);
2243 
2244 failure_2:
2245 	ddi_dma_mem_free(&bdesc->acc_handle);
2246 
2247 failure_1:
2248 	ddi_dma_free_handle(&bdesc->dma_handle);
2249 
2250 failure:
2251 	kmem_free(bdesc, sizeof (*bdesc));
2252 	return (NULL);
2253 }
2254 
/*
 * Statistics.
 *
 * Names of the driver-private ("aux") named kstats.  xnf_kstat_init()
 * sizes the kstat from this array and names its entries in this order;
 * xnf_kstat_aux_update() fills in the values positionally, so the two
 * must list the statistics in the same order.
 */
static char *xnf_aux_statistics[] = {
	"tx_cksum_deferred",
	"rx_cksum_no_need",
	"interrupts",
	"unclaimed_interrupts",
	"tx_pullup",
	"tx_pagebndry",
	"tx_attempt",
	"rx_no_ringbuf",
	"hvcopy_packet_processed",
};
2269 
2270 static int
2271 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2272 {
2273 	xnf_t *xnfp;
2274 	kstat_named_t *knp;
2275 
2276 	if (flag != KSTAT_READ)
2277 		return (EACCES);
2278 
2279 	xnfp = ksp->ks_private;
2280 	knp = ksp->ks_data;
2281 
2282 	/*
2283 	 * Assignment order must match that of the names in
2284 	 * xnf_aux_statistics.
2285 	 */
2286 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2287 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2288 
2289 	(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2290 	(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2291 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2292 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry;
2293 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt;
2294 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_no_ringbuf;
2295 
2296 	(knp++)->value.ui64 = xnfp->xnf_stat_hvcopy_packet_processed;
2297 
2298 	return (0);
2299 }
2300 
2301 static boolean_t
2302 xnf_kstat_init(xnf_t *xnfp)
2303 {
2304 	int nstat = sizeof (xnf_aux_statistics) /
2305 	    sizeof (xnf_aux_statistics[0]);
2306 	char **cp = xnf_aux_statistics;
2307 	kstat_named_t *knp;
2308 
2309 	/*
2310 	 * Create and initialise kstats.
2311 	 */
2312 	if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2313 	    ddi_get_instance(xnfp->xnf_devinfo),
2314 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
2315 	    nstat, 0)) == NULL)
2316 		return (B_FALSE);
2317 
2318 	xnfp->xnf_kstat_aux->ks_private = xnfp;
2319 	xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2320 
2321 	knp = xnfp->xnf_kstat_aux->ks_data;
2322 	while (nstat > 0) {
2323 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2324 
2325 		knp++;
2326 		cp++;
2327 		nstat--;
2328 	}
2329 
2330 	kstat_install(xnfp->xnf_kstat_aux);
2331 
2332 	return (B_TRUE);
2333 }
2334 
2335 static int
2336 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2337 {
2338 	xnf_t *xnfp = arg;
2339 
2340 	mutex_enter(&xnfp->xnf_intrlock);
2341 	mutex_enter(&xnfp->xnf_txlock);
2342 
2343 #define	mac_stat(q, r)				\
2344 	case (MAC_STAT_##q):			\
2345 		*val = xnfp->xnf_stat_##r;	\
2346 		break
2347 
2348 #define	ether_stat(q, r)			\
2349 	case (ETHER_STAT_##q):			\
2350 		*val = xnfp->xnf_stat_##r;	\
2351 		break
2352 
2353 	switch (stat) {
2354 
2355 	mac_stat(IPACKETS, ipackets);
2356 	mac_stat(OPACKETS, opackets);
2357 	mac_stat(RBYTES, rbytes);
2358 	mac_stat(OBYTES, obytes);
2359 	mac_stat(NORCVBUF, norxbuf);
2360 	mac_stat(IERRORS, errrx);
2361 	mac_stat(NOXMTBUF, tx_defer);
2362 
2363 	ether_stat(MACRCV_ERRORS, mac_rcv_error);
2364 	ether_stat(TOOSHORT_ERRORS, runt);
2365 
2366 	default:
2367 		mutex_exit(&xnfp->xnf_txlock);
2368 		mutex_exit(&xnfp->xnf_intrlock);
2369 
2370 		return (ENOTSUP);
2371 	}
2372 
2373 #undef mac_stat
2374 #undef ether_stat
2375 
2376 	mutex_exit(&xnfp->xnf_txlock);
2377 	mutex_exit(&xnfp->xnf_intrlock);
2378 
2379 	return (0);
2380 }
2381 
/*
 * Interrupt blanking callback; xnf_resources() installs it as the
 * mrf_blank member of the rx FIFO resource.  It currently does nothing
 * and ignores all of its arguments.
 */
/*ARGSUSED*/
static void
xnf_blank(void *arg, time_t ticks, uint_t count)
{
	/*
	 * XXPV dme: blanking is not currently implemented.
	 *
	 * It's not obvious how to use the 'ticks' argument here.
	 *
	 * 'Count' might be used as an indicator of how to set
	 * rsp_event when posting receive buffers to the rx_ring.  It
	 * would replace the code at the tail of xnf_process_recv()
	 * that simply indicates that the next completed packet should
	 * cause an interrupt.
	 */
}
2398 
2399 static void
2400 xnf_resources(void *arg)
2401 {
2402 	xnf_t *xnfp = arg;
2403 	mac_rx_fifo_t mrf;
2404 
2405 	mrf.mrf_type = MAC_RX_FIFO;
2406 	mrf.mrf_blank = xnf_blank;
2407 	mrf.mrf_arg = (void *)xnfp;
2408 	mrf.mrf_normal_blank_time = 128;	/* XXPV dme: see xnf_blank() */
2409 	mrf.mrf_normal_pkt_count = 8;		/* XXPV dme: see xnf_blank() */
2410 
2411 	xnfp->xnf_rx_handle = mac_resource_add(xnfp->xnf_mh,
2412 	    (mac_resource_t *)&mrf);
2413 }
2414 
/*
 * ioctl entry point.  No ioctls are handled: every request is answered
 * through miocnak() with EINVAL.
 */
/*ARGSUSED*/
static void
xnf_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
	miocnak(q, mp, 0, EINVAL);
}
2421 
2422 static boolean_t
2423 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
2424 {
2425 	xnf_t *xnfp = arg;
2426 
2427 	switch (cap) {
2428 	case MAC_CAPAB_HCKSUM: {
2429 		uint32_t *capab = cap_data;
2430 
2431 		/*
2432 		 * We declare ourselves capable of HCKSUM_INET_PARTIAL
2433 		 * in order that the protocol stack insert the
2434 		 * pseudo-header checksum in packets that it passes
2435 		 * down to us.
2436 		 *
2437 		 * Whilst the flag used to communicate with dom0 is
2438 		 * called "NETTXF_csum_blank", the checksum in the
2439 		 * packet must contain the pseudo-header checksum and
2440 		 * not zero. (In fact, a Solaris dom0 is happy to deal
2441 		 * with a checksum of zero, but a Linux dom0 is not.)
2442 		 */
2443 		if (xnfp->xnf_cksum_offload)
2444 			*capab = HCKSUM_INET_PARTIAL;
2445 		else
2446 			*capab = 0;
2447 		break;
2448 	}
2449 
2450 	case MAC_CAPAB_POLL:
2451 		/* Just return B_TRUE. */
2452 		break;
2453 
2454 	default:
2455 		return (B_FALSE);
2456 	}
2457 
2458 	return (B_TRUE);
2459 }
2460 
2461 /*ARGSUSED*/
2462 static void
2463 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
2464     void *arg, void *impl_data)
2465 {
2466 	xnf_t *xnfp = ddi_get_driver_private(dip);
2467 	XenbusState new_state = *(XenbusState *)impl_data;
2468 
2469 	ASSERT(xnfp != NULL);
2470 
2471 	switch (new_state) {
2472 	case XenbusStateConnected:
2473 		mutex_enter(&xnfp->xnf_intrlock);
2474 		mutex_enter(&xnfp->xnf_txlock);
2475 
2476 		xnfp->xnf_connected = B_TRUE;
2477 		cv_broadcast(&xnfp->xnf_cv);
2478 
2479 		mutex_exit(&xnfp->xnf_txlock);
2480 		mutex_exit(&xnfp->xnf_intrlock);
2481 
2482 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
2483 		break;
2484 
2485 	default:
2486 		break;
2487 	}
2488 }
2489 
2490 /*
2491  * Check whether backend is capable of and willing to talk
2492  * to us via hypervisor copy, as opposed to page flip.
2493  */
2494 static boolean_t
2495 xnf_hvcopy_peer_status(dev_info_t *devinfo)
2496 {
2497 	int	be_rx_copy;
2498 	int	err;
2499 
2500 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(devinfo),
2501 	    "feature-rx-copy", "%d", &be_rx_copy);
2502 	/*
2503 	 * If we fail to read the store we assume that the key is
2504 	 * absent, implying an older domain at the far end.  Older
2505 	 * domains cannot do HV copy (we assume ..).
2506 	 */
2507 	if (err != 0)
2508 		be_rx_copy = 0;
2509 
2510 	return (be_rx_copy?B_TRUE:B_FALSE);
2511 }
2512