xref: /titanic_52/usr/src/uts/common/xen/io/xnf.c (revision 5f149bca52352f45598e5563debe72ce04bd7a21)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  *
31  * Copyright (c) 2004 Christian Limpach.
32  * All rights reserved.
33  *
34  * Redistribution and use in source and binary forms, with or without
35  * modification, are permitted provided that the following conditions
36  * are met:
37  * 1. Redistributions of source code must retain the above copyright
38  *    notice, this list of conditions and the following disclaimer.
39  * 2. Redistributions in binary form must reproduce the above copyright
40  *    notice, this list of conditions and the following disclaimer in the
41  *    documentation and/or other materials provided with the distribution.
42  * 3. This section intentionally left blank.
43  * 4. The name of the author may not be used to endorse or promote products
44  *    derived from this software without specific prior written permission.
45  *
46  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
47  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
48  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
49  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
50  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
51  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
52  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
53  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
54  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
55  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
56  */
57 /*
58  * Section 3 of the above license was updated in response to bug 6379571.
59  */
60 
61 /*
62  * xnf.c - Nemo-based network driver for domU
63  */
64 
65 #include <sys/types.h>
66 #include <sys/errno.h>
67 #include <sys/param.h>
68 #include <sys/sysmacros.h>
69 #include <sys/systm.h>
70 #include <sys/stream.h>
71 #include <sys/strsubr.h>
72 #include <sys/conf.h>
73 #include <sys/ddi.h>
74 #include <sys/devops.h>
75 #include <sys/sunddi.h>
76 #include <sys/sunndi.h>
77 #include <sys/dlpi.h>
78 #include <sys/ethernet.h>
79 #include <sys/strsun.h>
80 #include <sys/pattr.h>
81 #include <inet/ip.h>
82 #include <sys/modctl.h>
83 #include <sys/mac.h>
84 #include <sys/mac_ether.h>
85 #include <sys/bootinfo.h>
86 #include <sys/mach_mmu.h>
87 #ifdef	XPV_HVM_DRIVER
88 #include <sys/xpv_support.h>
89 #include <sys/hypervisor.h>
90 #else
91 #include <sys/hypervisor.h>
92 #include <sys/evtchn_impl.h>
93 #include <sys/balloon_impl.h>
94 #endif
95 #include <xen/public/io/netif.h>
96 #include <sys/gnttab.h>
97 #include <xen/sys/xendev.h>
98 #include <sys/sdt.h>
99 
100 #include <io/xnf.h>
101 
102 
103 /*
104  *  Declarations and Module Linkage
105  */
106 
/* Module description string; used as the short description in xnf_modldrv. */
#define	IDENT	"Virtual Ethernet driver"

/*
 * Debug support is compiled in for DEBUG and lint builds only.  xnfdebug
 * is a bitmask that is tested against XNF_DEBUG_* flags (for example
 * XNF_DEBUG_DDI and XNF_DEBUG_TRACE) to select which messages are printed.
 */
#if defined(DEBUG) || defined(__lint)
#define	XNF_DEBUG
int	xnfdebug = 0;
#endif

/*
 * On a 32 bit PAE system physical and machine addresses are larger
 * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 * argument, and so addresses above 4G are truncated before ddi_btop()
 * gets to see them.  To avoid this, code the shift operation here.
 */
#define	xnf_btop(addr)	((addr) >> PAGESHIFT)

/*
 * Default for checksum offload.  Copied into each instance at attach time;
 * xnf_be_connect() clears the per-instance copy if the backend advertises
 * "feature-no-csum-offload".
 */
boolean_t	xnf_cksum_offload = B_TRUE;

/*
 * Default value for hypervisor-based copy operations.  At attach time the
 * per-instance setting is this value ANDed with xnf_hvcopy_peer_status().
 */
boolean_t	xnf_rx_hvcopy = B_TRUE;

/*
 * Should pages used for transmit be readonly for the peer?
 */
boolean_t	xnf_tx_pages_readonly = B_FALSE;
/*
 * Packets under this size are bcopied instead of using desballoc.
 * Choose a value > XNF_FRAMESIZE (1514) to force the receive path to
 * always copy.
 */
unsigned int	xnf_rx_bcopy_thresh = 64;

/*
 * NOTE(review): presumably the maximum number of fragments used to
 * transmit one packet; its use is not visible in this part of the file.
 */
unsigned int	xnf_max_tx_frags = 1;
139 
/* Required system entry points */
static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);

/*
 * Required driver entry points for Nemo.  Most of these are installed in
 * the xnf_callbacks table; xnf_intr is installed as the interrupt handler
 * by xnf_attach().
 */
static int	xnf_start(void *);
static void	xnf_stop(void *);
static int	xnf_set_mac_addr(void *, const uint8_t *);
static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
static int	xnf_set_promiscuous(void *, boolean_t);
static mblk_t	*xnf_send(void *, mblk_t *);
static uint_t	xnf_intr(caddr_t);
static int	xnf_stat(void *, uint_t, uint64_t *);
static void	xnf_blank(void *, time_t, uint_t);
static void	xnf_resources(void *);
static void	xnf_ioctl(void *, queue_t *, mblk_t *);
static boolean_t xnf_getcapab(void *, mac_capab_t, void *);

/*
 * Driver private functions.
 *
 * NOTE(review): xnf_send_driver_status() is the only prototype here that
 * is not static; confirm whether it is meant to be visible outside this
 * file.
 */
static int xnf_alloc_dma_resources(xnf_t *);
static void xnf_release_dma_resources(xnf_t *);
static mblk_t *xnf_process_recv(xnf_t *);
static void xnf_rcv_complete(struct xnf_buffer_desc *);
static void xnf_release_mblks(xnf_t *);
static struct xnf_buffer_desc *xnf_alloc_tx_buffer(xnf_t *);
static struct xnf_buffer_desc *xnf_alloc_buffer(xnf_t *);
static struct xnf_buffer_desc *xnf_get_tx_buffer(xnf_t *);
static struct xnf_buffer_desc *xnf_get_buffer(xnf_t *);
static void xnf_free_buffer(struct xnf_buffer_desc *);
static void xnf_free_tx_buffer(struct xnf_buffer_desc *);
void xnf_send_driver_status(int, int);
static void rx_buffer_hang(xnf_t *, struct xnf_buffer_desc *);
static int xnf_clean_tx_ring(xnf_t  *);
static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static mblk_t *xnf_process_hvcopy_recv(xnf_t *xnfp);
static boolean_t xnf_hvcopy_peer_status(dev_info_t *devinfo);
static boolean_t xnf_kstat_init(xnf_t *xnfp);
178 
/*
 * MAC (Nemo) callback table handed to the framework through
 * macp->m_callbacks in xnf_attach().  The first member is a mask naming
 * the optional callbacks that are supplied (resources, ioctl and
 * getcapab); the remaining members are the entry points themselves, in
 * the positional order the structure requires.
 *
 * XXPV dme: remove MC_IOCTL?
 */
static mac_callbacks_t xnf_callbacks = {
	MC_RESOURCES | MC_IOCTL | MC_GETCAPAB,
	xnf_stat,
	xnf_start,
	xnf_stop,
	xnf_set_promiscuous,
	xnf_set_multicast,
	xnf_set_mac_addr,
	xnf_send,
	xnf_resources,
	xnf_ioctl,
	xnf_getcapab
};
195 
/*
 * Sentinel stored in a grant reference field when no grant is held; the
 * ring and per-packet references are compared against it before being
 * ended or released.
 */
#define	GRANT_INVALID_REF	0
/*
 * Receive buffer watermarks, expressed as multiples of the rx ring size.
 * NOTE(review): presumably the low and high limits on the number of
 * allocated receive buffers; their use is not visible in this part of
 * the file.
 */
const int xnf_rx_bufs_lowat = 4 * NET_RX_RING_SIZE;
const int xnf_rx_bufs_hiwat = 8 * NET_RX_RING_SIZE; /* default max */
199 
/*
 * DMA attributes for network ring buffer.
 *
 * No address restriction, page (MMU_PAGESIZE) alignment and at most one
 * segment.
 */
static ddi_dma_attr_t ringbuf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
215 
/*
 * DMA attributes for transmit data.
 *
 * Same values as the ring buffer attributes: page aligned, at most one
 * segment, no address restriction.
 */
static ddi_dma_attr_t tx_buffer_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
231 
/*
 * DMA attributes for a receive buffer.
 *
 * Same values as the ring buffer attributes: page aligned, at most one
 * segment, no address restriction.
 */
static ddi_dma_attr_t rx_buffer_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
247 
/*
 * DMA access attributes for registers and descriptors: little-endian
 * structure access with strict ordering.
 */
static ddi_device_acc_attr_t accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
	DDI_STRICTORDER_ACC
};

/*
 * DMA access attributes for data: NOT to be byte swapped.  Packet payload
 * is accessed as-is, with strict ordering.
 */
static ddi_device_acc_attr_t data_accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};
261 
/* The Ethernet broadcast address (all ones). */
unsigned char xnf_broadcastaddr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
int xnf_diagnose = 0; /* Patchable global for diagnostic purposes */

/*
 * Device operations for this driver: only attach and detach are supplied
 * by the driver itself.  _init() passes this structure to mac_init_ops().
 */
DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
    nodev, NULL, D_MP, NULL);

/* Loadable module linkage: describes this module as a device driver. */
static struct modldrv xnf_modldrv = {
	&mod_driverops,		/* Type of module.  This one is a driver */
	IDENT " %I%",		/* short description */
	&xnf_dev_ops		/* driver specific ops */
};

/* Linkage structure passed to mod_install() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, &xnf_modldrv, NULL
};
277 
278 int
279 _init(void)
280 {
281 	int r;
282 
283 	mac_init_ops(&xnf_dev_ops, "xnf");
284 	r = mod_install(&modlinkage);
285 	if (r != DDI_SUCCESS)
286 		mac_fini_ops(&xnf_dev_ops);
287 
288 	return (r);
289 }
290 
/*
 * Module unload entry point.  Unconditionally returns EBUSY, so an unload
 * request is always refused.
 */
int
_fini(void)
{
	return (EBUSY); /* XXPV dme: should be removable */
}
296 
/*
 * Module information entry point: fills in *modinfop via mod_info() and
 * returns its result.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
302 
303 static int
304 xnf_setup_rings(xnf_t *xnfp)
305 {
306 	int			ix, err;
307 	RING_IDX		i;
308 	struct xnf_buffer_desc	*bdesc, *rbp;
309 	struct xenbus_device	*xsd;
310 	domid_t			oeid;
311 
312 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
313 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
314 
315 	if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF)
316 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
317 
318 	err = gnttab_grant_foreign_access(oeid,
319 	    xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
320 	if (err <= 0) {
321 		err = -err;
322 		xenbus_dev_error(xsd, err, "granting access to tx ring page");
323 		goto out;
324 	}
325 	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
326 
327 	if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF)
328 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
329 
330 	err = gnttab_grant_foreign_access(oeid,
331 	    xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
332 	if (err <= 0) {
333 		err = -err;
334 		xenbus_dev_error(xsd, err, "granting access to rx ring page");
335 		goto out;
336 	}
337 	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
338 
339 
340 	mutex_enter(&xnfp->xnf_intrlock);
341 
342 	/*
343 	 * Cleanup the TX ring.  We just clean up any valid tx_pktinfo structs
344 	 * and reset the ring.  Note that this can lose packets after a resume,
345 	 * but we expect to stagger on.
346 	 */
347 	mutex_enter(&xnfp->xnf_txlock);
348 
349 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
350 		struct tx_pktinfo *txp = &xnfp->xnf_tx_pkt_info[i];
351 
352 		txp->id = i + 1;
353 
354 		if (txp->grant_ref == GRANT_INVALID_REF) {
355 			ASSERT(txp->mp == NULL);
356 			ASSERT(txp->bdesc == NULL);
357 			continue;
358 		}
359 
360 		if (gnttab_query_foreign_access(txp->grant_ref) != 0)
361 			panic("tx grant still in use by backend domain");
362 
363 		freemsg(txp->mp);
364 		txp->mp = NULL;
365 
366 		(void) ddi_dma_unbind_handle(txp->dma_handle);
367 
368 		if (txp->bdesc != NULL) {
369 			xnf_free_tx_buffer(txp->bdesc);
370 			txp->bdesc = NULL;
371 		}
372 
373 		(void) gnttab_end_foreign_access_ref(txp->grant_ref,
374 		    xnfp->xnf_tx_pages_readonly);
375 		gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head,
376 		    txp->grant_ref);
377 		txp->grant_ref = GRANT_INVALID_REF;
378 	}
379 
380 	xnfp->xnf_tx_pkt_id_list = 0;
381 	xnfp->xnf_tx_ring.rsp_cons = 0;
382 	xnfp->xnf_tx_ring.req_prod_pvt = 0;
383 	xnfp->xnf_tx_ring.sring->req_prod = 0;
384 	xnfp->xnf_tx_ring.sring->rsp_prod = 0;
385 	xnfp->xnf_tx_ring.sring->rsp_event = 1;
386 
387 	mutex_exit(&xnfp->xnf_txlock);
388 
389 	/*
390 	 * Rebuild the RX ring.  We have to rebuild the RX ring because some of
391 	 * our pages are currently flipped out/granted so we can't just free
392 	 * the RX buffers.  Reclaim any unprocessed recv buffers, they won't be
393 	 * useable anyway since the mfn's they refer to are no longer valid.
394 	 * Grant the backend domain access to each hung rx buffer.
395 	 */
396 	i = xnfp->xnf_rx_ring.rsp_cons;
397 	while (i++ != xnfp->xnf_rx_ring.sring->req_prod) {
398 		volatile netif_rx_request_t	*rxrp;
399 
400 		rxrp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, i);
401 		ix = rxrp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0);
402 		rbp = xnfp->xnf_rxpkt_bufptr[ix];
403 		if (rbp != NULL) {
404 			grant_ref_t	ref = rbp->grant_ref;
405 
406 			ASSERT(ref != GRANT_INVALID_REF);
407 			if (xnfp->xnf_rx_hvcopy) {
408 				pfn_t pfn = xnf_btop(rbp->buf_phys);
409 				mfn_t mfn = pfn_to_mfn(pfn);
410 
411 				gnttab_grant_foreign_access_ref(ref, oeid,
412 				    mfn, 0);
413 			} else {
414 				gnttab_grant_foreign_transfer_ref(ref, oeid);
415 			}
416 			rxrp->id = ix;
417 			rxrp->gref = ref;
418 		}
419 	}
420 
421 	/*
422 	 * Reset the ring pointers to initial state.
423 	 * Hang buffers for any empty ring slots.
424 	 */
425 	xnfp->xnf_rx_ring.rsp_cons = 0;
426 	xnfp->xnf_rx_ring.req_prod_pvt = 0;
427 	xnfp->xnf_rx_ring.sring->req_prod = 0;
428 	xnfp->xnf_rx_ring.sring->rsp_prod = 0;
429 	xnfp->xnf_rx_ring.sring->rsp_event = 1;
430 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
431 		xnfp->xnf_rx_ring.req_prod_pvt = i;
432 		if (xnfp->xnf_rxpkt_bufptr[i] != NULL)
433 			continue;
434 		if ((bdesc = xnf_get_buffer(xnfp)) == NULL)
435 			break;
436 		rx_buffer_hang(xnfp, bdesc);
437 	}
438 	xnfp->xnf_rx_ring.req_prod_pvt = i;
439 	/* LINTED: constant in conditional context */
440 	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
441 
442 	mutex_exit(&xnfp->xnf_intrlock);
443 
444 	return (0);
445 
446 out:
447 	if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF)
448 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
449 	xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF;
450 
451 	if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF)
452 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
453 	xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF;
454 
455 	return (err);
456 }
457 
458 
/*
 * Called when the upper layers free a message we passed upstream.
 *
 * Tears down the buffer descriptor completely: the DMA handle is unbound,
 * the DMA memory is freed, the DMA handle itself is freed and finally the
 * descriptor structure is released.  The descriptor must not be used
 * after this call.
 */
static void
xnf_copy_rcv_complete(struct xnf_buffer_desc *bdesc)
{
	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
	ddi_dma_mem_free(&bdesc->acc_handle);
	ddi_dma_free_handle(&bdesc->dma_handle);
	kmem_free(bdesc, sizeof (*bdesc));
}
468 
469 
470 /*
471  * Connect driver to back end, called to set up communication with
472  * back end driver both initially and on resume after restore/migrate.
473  */
474 void
475 xnf_be_connect(xnf_t *xnfp)
476 {
477 	char		mac[ETHERADDRL * 3];
478 	const char	*message;
479 	xenbus_transaction_t xbt;
480 	struct		xenbus_device *xsd;
481 	char		*xsname;
482 	int		err, be_no_cksum_offload;
483 
484 	ASSERT(!xnfp->xnf_connected);
485 
486 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
487 	xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
488 
489 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo), "mac",
490 	    "%s", (char *)&mac[0]);
491 	if (err != 0) {
492 		/*
493 		 * bad: we're supposed to be set up with a proper mac
494 		 * addr. at this point
495 		 */
496 		cmn_err(CE_WARN, "%s%d: no mac address",
497 		    ddi_driver_name(xnfp->xnf_devinfo),
498 		    ddi_get_instance(xnfp->xnf_devinfo));
499 			return;
500 	}
501 
502 	if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
503 		err = ENOENT;
504 		xenbus_dev_error(xsd, ENOENT, "parsing %s/mac", xsname);
505 		return;
506 	}
507 
508 	err = xnf_setup_rings(xnfp);
509 	if (err != 0) {
510 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
511 		xenbus_dev_error(xsd, err, "setting up ring");
512 		return;
513 	}
514 
515 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo),
516 	    "feature-no-csum-offload", "%d", &be_no_cksum_offload);
517 	/*
518 	 * If we fail to read the store we assume that the key is
519 	 * absent, implying an older domain at the far end.  Older
520 	 * domains always support checksum offload.
521 	 */
522 	if (err != 0)
523 		be_no_cksum_offload = 0;
524 	/*
525 	 * If the far end cannot do checksum offload or we do not wish
526 	 * to do it, disable it.
527 	 */
528 	if ((be_no_cksum_offload == 1) || !xnfp->xnf_cksum_offload)
529 		xnfp->xnf_cksum_offload = B_FALSE;
530 
531 again:
532 	err = xenbus_transaction_start(&xbt);
533 	if (err != 0) {
534 		xenbus_dev_error(xsd, EIO, "starting transaction");
535 		return;
536 	}
537 
538 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
539 	    xnfp->xnf_tx_ring_ref);
540 	if (err != 0) {
541 		message = "writing tx ring-ref";
542 		goto abort_transaction;
543 	}
544 
545 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
546 	    xnfp->xnf_rx_ring_ref);
547 	if (err != 0) {
548 		message = "writing rx ring-ref";
549 		goto abort_transaction;
550 	}
551 
552 	err = xenbus_printf(xbt, xsname, "event-channel", "%u",
553 	    xnfp->xnf_evtchn);
554 	if (err != 0) {
555 		message = "writing event-channel";
556 		goto abort_transaction;
557 	}
558 
559 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
560 	if (err != 0) {
561 		message = "writing feature-rx-notify";
562 		goto abort_transaction;
563 	}
564 
565 	if (!xnfp->xnf_tx_pages_readonly) {
566 		err = xenbus_printf(xbt, xsname, "feature-tx-writable",
567 		    "%d", 1);
568 		if (err != 0) {
569 			message = "writing feature-tx-writable";
570 			goto abort_transaction;
571 		}
572 	}
573 
574 	err = xenbus_printf(xbt, xsname, "feature-no-csum-offload", "%d",
575 	    xnfp->xnf_cksum_offload ? 0 : 1);
576 	if (err != 0) {
577 		message = "writing feature-no-csum-offload";
578 		goto abort_transaction;
579 	}
580 	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d",
581 	    xnfp->xnf_rx_hvcopy ? 1 : 0);
582 	if (err != 0) {
583 		message = "writing request-rx-copy";
584 		goto abort_transaction;
585 	}
586 
587 	err = xenbus_printf(xbt, xsname, "state", "%d", XenbusStateConnected);
588 	if (err != 0) {
589 		message = "writing frontend XenbusStateConnected";
590 		goto abort_transaction;
591 	}
592 
593 	err = xenbus_transaction_end(xbt, 0);
594 	if (err != 0) {
595 		if (err == EAGAIN)
596 			goto again;
597 		xenbus_dev_error(xsd, err, "completing transaction");
598 	}
599 
600 	return;
601 
602 abort_transaction:
603 	(void) xenbus_transaction_end(xbt, 1);
604 	xenbus_dev_error(xsd, err, "%s", message);
605 }
606 
607 /*
608  *  attach(9E) -- Attach a device to the system
609  *
610  *  Called once for each board successfully probed.
611  */
612 static int
613 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
614 {
615 	mac_register_t *macp;
616 	xnf_t *xnfp;
617 	int err;
618 
619 #ifdef XNF_DEBUG
620 	if (xnfdebug & XNF_DEBUG_DDI)
621 		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
622 		    (void *)devinfo);
623 #endif
624 
625 	switch (cmd) {
626 	case DDI_RESUME:
627 		xnfp = ddi_get_driver_private(devinfo);
628 
629 		(void) xvdi_resume(devinfo);
630 		(void) xvdi_alloc_evtchn(devinfo);
631 		xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
632 #ifdef XPV_HVM_DRIVER
633 		ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
634 		    xnfp);
635 #else
636 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
637 		    (caddr_t)xnfp);
638 #endif
639 		xnf_be_connect(xnfp);
640 		/*
641 		 * Our MAC address may have changed if we're resuming:
642 		 * - on a different host
643 		 * - on the same one and got a different MAC address
644 		 *   because we didn't specify one of our own.
645 		 * so it's useful to claim that it changed in order that
646 		 * IP send out a gratuitous ARP.
647 		 */
648 		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
649 		return (DDI_SUCCESS);
650 
651 	case DDI_ATTACH:
652 		break;
653 
654 	default:
655 		return (DDI_FAILURE);
656 	}
657 
658 	/*
659 	 *  Allocate gld_mac_info_t and xnf_instance structures
660 	 */
661 	macp = mac_alloc(MAC_VERSION);
662 	if (macp == NULL)
663 		return (DDI_FAILURE);
664 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
665 
666 	macp->m_dip = devinfo;
667 	macp->m_driver = xnfp;
668 	xnfp->xnf_devinfo = devinfo;
669 
670 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
671 	macp->m_src_addr = xnfp->xnf_mac_addr;
672 	macp->m_callbacks = &xnf_callbacks;
673 	macp->m_min_sdu = 0;
674 	macp->m_max_sdu = XNF_MAXPKT;
675 
676 	xnfp->xnf_running = B_FALSE;
677 	xnfp->xnf_connected = B_FALSE;
678 	xnfp->xnf_cksum_offload = xnf_cksum_offload;
679 	xnfp->xnf_tx_pages_readonly = xnf_tx_pages_readonly;
680 
681 	xnfp->xnf_rx_hvcopy = xnf_hvcopy_peer_status(devinfo) && xnf_rx_hvcopy;
682 #ifdef XPV_HVM_DRIVER
683 	if (!xnfp->xnf_rx_hvcopy) {
684 		cmn_err(CE_WARN, "The xnf driver requires a dom0 that "
685 		    "supports 'feature-rx-copy'");
686 		goto failure;
687 	}
688 #endif
689 
690 	/*
691 	 * Get the iblock cookie with which to initialize the mutexes.
692 	 */
693 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
694 	    != DDI_SUCCESS)
695 		goto failure;
696 	/*
697 	 * Driver locking strategy: the txlock protects all paths
698 	 * through the driver, except the interrupt thread.
699 	 * If the interrupt thread needs to do something which could
700 	 * affect the operation of any other part of the driver,
701 	 * it needs to acquire the txlock mutex.
702 	 */
703 	mutex_init(&xnfp->xnf_tx_buf_mutex,
704 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
705 	mutex_init(&xnfp->xnf_rx_buf_mutex,
706 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
707 	mutex_init(&xnfp->xnf_txlock,
708 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
709 	mutex_init(&xnfp->xnf_intrlock,
710 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
711 	cv_init(&xnfp->xnf_cv, NULL, CV_DEFAULT, NULL);
712 
713 	if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
714 	    &xnfp->xnf_gref_tx_head) < 0) {
715 		cmn_err(CE_WARN, "xnf%d: can't alloc tx grant refs",
716 		    ddi_get_instance(xnfp->xnf_devinfo));
717 		goto failure_1;
718 	}
719 	if (gnttab_alloc_grant_references(NET_RX_RING_SIZE,
720 	    &xnfp->xnf_gref_rx_head) < 0) {
721 		cmn_err(CE_WARN, "xnf%d: can't alloc rx grant refs",
722 		    ddi_get_instance(xnfp->xnf_devinfo));
723 		goto failure_1;
724 	}
725 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
726 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
727 		    "driver data structures",
728 		    ddi_get_instance(xnfp->xnf_devinfo));
729 		goto failure_1;
730 	}
731 
732 	xnfp->xnf_rx_ring.sring->rsp_event =
733 	    xnfp->xnf_tx_ring.sring->rsp_event = 1;
734 
735 	xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF;
736 	xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF;
737 
738 	/* set driver private pointer now */
739 	ddi_set_driver_private(devinfo, xnfp);
740 
741 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change)
742 	    != DDI_SUCCESS)
743 		goto failure_1;
744 
745 	if (!xnf_kstat_init(xnfp))
746 		goto failure_2;
747 
748 	/*
749 	 * Allocate an event channel, add the interrupt handler and
750 	 * bind it to the event channel.
751 	 */
752 	(void) xvdi_alloc_evtchn(devinfo);
753 	xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
754 #ifdef XPV_HVM_DRIVER
755 	ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
756 #else
757 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
758 #endif
759 
760 	/*
761 	 * connect to the backend
762 	 */
763 	xnf_be_connect(xnfp);
764 
765 	err = mac_register(macp, &xnfp->xnf_mh);
766 	mac_free(macp);
767 	macp = NULL;
768 	if (err != 0)
769 		goto failure_3;
770 
771 	return (DDI_SUCCESS);
772 
773 failure_3:
774 	kstat_delete(xnfp->xnf_kstat_aux);
775 
776 failure_2:
777 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
778 #ifdef XPV_HVM_DRIVER
779 	ec_unbind_evtchn(xnfp->xnf_evtchn);
780 #else
781 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
782 #endif
783 	xnfp->xnf_evtchn = INVALID_EVTCHN;
784 
785 failure_1:
786 	xnf_release_dma_resources(xnfp);
787 	cv_destroy(&xnfp->xnf_cv);
788 	mutex_destroy(&xnfp->xnf_rx_buf_mutex);
789 	mutex_destroy(&xnfp->xnf_txlock);
790 	mutex_destroy(&xnfp->xnf_intrlock);
791 
792 failure:
793 	kmem_free(xnfp, sizeof (*xnfp));
794 	if (macp != NULL)
795 		mac_free(macp);
796 
797 	return (DDI_FAILURE);
798 }
799 
800 /*  detach(9E) -- Detach a device from the system */
801 static int
802 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
803 {
804 	xnf_t *xnfp;		/* Our private device info */
805 	int i;
806 
807 #ifdef XNF_DEBUG
808 	if (xnfdebug & XNF_DEBUG_DDI)
809 		printf("xnf_detach(0x%p)\n", (void *)devinfo);
810 #endif
811 
812 	xnfp = ddi_get_driver_private(devinfo);
813 
814 	switch (cmd) {
815 	case DDI_SUSPEND:
816 #ifdef XPV_HVM_DRIVER
817 		ec_unbind_evtchn(xnfp->xnf_evtchn);
818 #else
819 		ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
820 #endif
821 
822 		xvdi_suspend(devinfo);
823 
824 		mutex_enter(&xnfp->xnf_intrlock);
825 		mutex_enter(&xnfp->xnf_txlock);
826 
827 		xnfp->xnf_evtchn = INVALID_EVTCHN;
828 		xnfp->xnf_connected = B_FALSE;
829 		mutex_exit(&xnfp->xnf_txlock);
830 		mutex_exit(&xnfp->xnf_intrlock);
831 		return (DDI_SUCCESS);
832 
833 	case DDI_DETACH:
834 		break;
835 
836 	default:
837 		return (DDI_FAILURE);
838 	}
839 
840 	if (xnfp->xnf_connected)
841 		return (DDI_FAILURE);
842 
843 	/* Wait for receive buffers to be returned; give up after 5 seconds */
844 	i = 50;
845 
846 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
847 	while (xnfp->xnf_rx_bufs_outstanding > 0) {
848 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
849 		delay(drv_usectohz(100000));
850 		if (--i == 0) {
851 			cmn_err(CE_WARN,
852 			    "xnf%d: never reclaimed all the "
853 			    "receive buffers.  Still have %d "
854 			    "buffers outstanding.",
855 			    ddi_get_instance(xnfp->xnf_devinfo),
856 			    xnfp->xnf_rx_bufs_outstanding);
857 			return (DDI_FAILURE);
858 		}
859 		mutex_enter(&xnfp->xnf_rx_buf_mutex);
860 	}
861 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
862 
863 	kstat_delete(xnfp->xnf_kstat_aux);
864 
865 	if (mac_unregister(xnfp->xnf_mh) != 0)
866 		return (DDI_FAILURE);
867 
868 	/* Stop the receiver */
869 	xnf_stop(xnfp);
870 
871 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
872 
873 	/* Remove the interrupt */
874 #ifdef XPV_HVM_DRIVER
875 	ec_unbind_evtchn(xnfp->xnf_evtchn);
876 #else
877 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
878 #endif
879 
880 	/* Release any pending xmit mblks */
881 	xnf_release_mblks(xnfp);
882 
883 	/* Release all DMA resources */
884 	xnf_release_dma_resources(xnfp);
885 
886 	cv_destroy(&xnfp->xnf_cv);
887 	mutex_destroy(&xnfp->xnf_rx_buf_mutex);
888 	mutex_destroy(&xnfp->xnf_txlock);
889 	mutex_destroy(&xnfp->xnf_intrlock);
890 
891 	kmem_free(xnfp, sizeof (*xnfp));
892 
893 	return (DDI_SUCCESS);
894 }
895 
896 /*
897  *  xnf_set_mac_addr() -- set the physical network address on the board.
898  */
899 /*ARGSUSED*/
900 static int
901 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
902 {
903 	xnf_t *xnfp = arg;
904 
905 #ifdef XNF_DEBUG
906 	if (xnfdebug & XNF_DEBUG_TRACE)
907 		printf("xnf%d: set_mac_addr(0x%p): "
908 		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
909 		    ddi_get_instance(xnfp->xnf_devinfo),
910 		    (void *)xnfp, macaddr[0], macaddr[1], macaddr[2],
911 		    macaddr[3], macaddr[4], macaddr[5]);
912 #endif
913 	/*
914 	 * We can't set our macaddr.
915 	 *
916 	 * XXPV dme: Why not?
917 	 */
918 	return (ENOTSUP);
919 }
920 
921 /*
922  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
923  *
924  *  Program the hardware to enable/disable the multicast address
925  *  in "mcast".  Enable if "add" is true, disable if false.
926  */
927 /*ARGSUSED*/
928 static int
929 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
930 {
931 	xnf_t *xnfp = arg;
932 
933 #ifdef XNF_DEBUG
934 	if (xnfdebug & XNF_DEBUG_TRACE)
935 		printf("xnf%d set_multicast(0x%p): "
936 		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
937 		    ddi_get_instance(xnfp->xnf_devinfo),
938 		    (void *)xnfp, mca[0], mca[1], mca[2],
939 		    mca[3], mca[4], mca[5]);
940 #endif
941 
942 	/*
943 	 * XXPV dme: Ideally we'd relay the address to the backend for
944 	 * enabling.  The protocol doesn't support that (interesting
945 	 * extension), so we simply succeed and hope that the relevant
946 	 * packets are going to arrive.
947 	 *
948 	 * If protocol support is added for enable/disable then we'll
949 	 * need to keep a list of those in use and re-add on resume.
950 	 */
951 	return (0);
952 }
953 
954 /*
955  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
956  *
957  *  Program the hardware to enable/disable promiscuous mode.
958  */
959 /*ARGSUSED*/
960 static int
961 xnf_set_promiscuous(void *arg, boolean_t on)
962 {
963 	xnf_t *xnfp = arg;
964 
965 #ifdef XNF_DEBUG
966 	if (xnfdebug & XNF_DEBUG_TRACE)
967 		printf("xnf%d set_promiscuous(0x%p, %x)\n",
968 		    ddi_get_instance(xnfp->xnf_devinfo),
969 		    (void *)xnfp, on);
970 #endif
971 	/*
972 	 * We can't really do this, but we pretend that we can in
973 	 * order that snoop will work.
974 	 */
975 	return (0);
976 }
977 
/*
 * Clean buffers that we have responses for from the transmit ring.
 *
 * For every response on the tx ring the matching tx_pktinfo entry is
 * returned to the free id list and everything attached to it is released:
 * the DMA binding, the grant reference, the message and, if one was used,
 * the transmit buffer.
 *
 * The caller must hold xnf_txlock (asserted).  Returns the number of free
 * request slots in the tx ring, as given by RING_FREE_REQUESTS().
 */
static int
xnf_clean_tx_ring(xnf_t *xnfp)
{
	RING_IDX		next_resp, i;
	struct tx_pktinfo	*reap;
	int			id;
	grant_ref_t		ref;

	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));

	/*
	 * The outer loop re-tests the ring after each batch; presumably
	 * this picks up responses posted while the batch was processed.
	 */
	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
		/*
		 * index of next transmission ack
		 */
		next_resp = xnfp->xnf_tx_ring.sring->rsp_prod;
		/* Read rsp_prod before reading the responses themselves. */
		membar_consumer();
		/*
		 * Clean tx packets from ring that we have responses for
		 */
		for (i = xnfp->xnf_tx_ring.rsp_cons; i != next_resp; i++) {
			/*
			 * NOTE(review): id is read from the shared ring and
			 * is used as an index without a range check; confirm
			 * that the peer is trusted to supply valid ids.
			 */
			id = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i)->id;
			reap = &xnfp->xnf_tx_pkt_info[id];
			ref = reap->grant_ref;
			/*
			 * Return id to free list
			 */
			reap->id = xnfp->xnf_tx_pkt_id_list;
			xnfp->xnf_tx_pkt_id_list = id;
			/* A response means the backend must be done with it. */
			if (gnttab_query_foreign_access(ref) != 0)
				panic("tx grant still in use "
				    "by backend domain");
			(void) ddi_dma_unbind_handle(reap->dma_handle);
			(void) gnttab_end_foreign_access_ref(ref,
			    xnfp->xnf_tx_pages_readonly);
			gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head,
			    ref);
			freemsg(reap->mp);
			reap->mp = NULL;
			reap->grant_ref = GRANT_INVALID_REF;
			/* bdesc is set only when a transmit buffer was used. */
			if (reap->bdesc != NULL)
				xnf_free_tx_buffer(reap->bdesc);
			reap->bdesc = NULL;
		}
		/* Record how far we have consumed, then re-test the ring. */
		xnfp->xnf_tx_ring.rsp_cons = next_resp;
		membar_enter();
	}

	return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
}
1030 
1031 /*
1032  * If we need to pull up data from either a packet that crosses a page
1033  * boundary or consisting of multiple mblks, do it here.  We allocate
1034  * a page aligned buffer and copy the data into it.  The header for the
1035  * allocated buffer is returned. (which is also allocated here)
1036  */
1037 static struct xnf_buffer_desc *
1038 xnf_pullupmsg(xnf_t *xnfp, mblk_t *mp)
1039 {
1040 	struct xnf_buffer_desc	*bdesc;
1041 	mblk_t			*mptr;
1042 	caddr_t			bp;
1043 	int			len;
1044 
1045 	/*
1046 	 * get a xmit buffer from the xmit buffer pool
1047 	 */
1048 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
1049 	bdesc = xnf_get_tx_buffer(xnfp);
1050 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
1051 	if (bdesc == NULL)
1052 		return (bdesc);
1053 	/*
1054 	 * Copy the data into the buffer
1055 	 */
1056 	xnfp->xnf_stat_tx_pullup++;
1057 	bp = bdesc->buf;
1058 	for (mptr = mp; mptr != NULL; mptr = mptr->b_cont) {
1059 		len = mptr->b_wptr - mptr->b_rptr;
1060 		bcopy(mptr->b_rptr, bp, len);
1061 		bp += len;
1062 	}
1063 	return (bdesc);
1064 }
1065 
1066 /*
1067  *  xnf_send_one() -- send a packet
1068  *
1069  *  Called when a packet is ready to be transmitted. A pointer to an
1070  *  M_DATA message that contains the packet is passed to this routine.
1071  *  At least the complete LLC header is contained in the message's
1072  *  first message block, and the remainder of the packet is contained
1073  *  within additional M_DATA message blocks linked to the first
1074  *  message block.
1075  *
1076  */
1077 static boolean_t
1078 xnf_send_one(xnf_t *xnfp, mblk_t *mp)
1079 {
1080 	struct xnf_buffer_desc	*xmitbuf;
1081 	struct tx_pktinfo	*txp_info;
1082 	mblk_t			*mptr;
1083 	ddi_dma_cookie_t	dma_cookie;
1084 	RING_IDX		slot;
1085 	int			length = 0, i, pktlen = 0, rc, tx_id;
1086 	int			tx_ring_freespace, page_oops;
1087 	uint_t			ncookies;
1088 	volatile netif_tx_request_t	*txrp;
1089 	caddr_t			bufaddr;
1090 	grant_ref_t		ref;
1091 	unsigned long		mfn;
1092 	uint32_t		pflags;
1093 	domid_t			oeid;
1094 
1095 #ifdef XNF_DEBUG
1096 	if (xnfdebug & XNF_DEBUG_SEND)
1097 		printf("xnf%d send(0x%p, 0x%p)\n",
1098 		    ddi_get_instance(xnfp->xnf_devinfo),
1099 		    (void *)xnfp, (void *)mp);
1100 #endif
1101 
1102 	ASSERT(mp != NULL);
1103 	ASSERT(mp->b_next == NULL);
1104 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1105 
1106 	tx_ring_freespace = xnf_clean_tx_ring(xnfp);
1107 	ASSERT(tx_ring_freespace >= 0);
1108 
1109 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1110 	xnfp->xnf_stat_tx_attempt++;
1111 	/*
1112 	 * If there are no xmit ring slots available, return.
1113 	 */
1114 	if (tx_ring_freespace == 0) {
1115 		xnfp->xnf_stat_tx_defer++;
1116 		return (B_FALSE);	/* Send should be retried */
1117 	}
1118 
1119 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1120 	/* Count the number of mblks in message and compute packet size */
1121 	for (i = 0, mptr = mp; mptr != NULL; mptr = mptr->b_cont, i++)
1122 		pktlen += (mptr->b_wptr - mptr->b_rptr);
1123 
1124 	/* Make sure packet isn't too large */
1125 	if (pktlen > XNF_FRAMESIZE) {
1126 		cmn_err(CE_WARN, "xnf%d: large packet %d bytes",
1127 		    ddi_get_instance(xnfp->xnf_devinfo), pktlen);
1128 		freemsg(mp);
1129 		return (B_FALSE);
1130 	}
1131 
1132 	/*
1133 	 * Test if we cross a page boundary with our buffer
1134 	 */
1135 	page_oops = (i == 1) &&
1136 	    (xnf_btop((size_t)mp->b_rptr) !=
1137 	    xnf_btop((size_t)(mp->b_rptr + pktlen)));
1138 	/*
1139 	 * XXPV - unfortunately, the Xen virtual net device currently
1140 	 * doesn't support multiple packet frags, so this will always
1141 	 * end up doing the pullup if we got more than one packet.
1142 	 */
1143 	if (i > xnf_max_tx_frags || page_oops) {
1144 		if (page_oops)
1145 			xnfp->xnf_stat_tx_pagebndry++;
1146 		if ((xmitbuf = xnf_pullupmsg(xnfp, mp)) == NULL) {
1147 			/* could not allocate resources? */
1148 #ifdef XNF_DEBUG
1149 			cmn_err(CE_WARN, "xnf%d: pullupmsg failed",
1150 			    ddi_get_instance(xnfp->xnf_devinfo));
1151 #endif
1152 			xnfp->xnf_stat_tx_defer++;
1153 			return (B_FALSE);	/* Retry send */
1154 		}
1155 		bufaddr = xmitbuf->buf;
1156 	} else {
1157 		xmitbuf = NULL;
1158 		bufaddr = (caddr_t)mp->b_rptr;
1159 	}
1160 
1161 	/* set up data descriptor */
1162 	length = pktlen;
1163 
1164 	/*
1165 	 * Get packet id from free list
1166 	 */
1167 	tx_id = xnfp->xnf_tx_pkt_id_list;
1168 	ASSERT(tx_id < NET_TX_RING_SIZE);
1169 	txp_info = &xnfp->xnf_tx_pkt_info[tx_id];
1170 	xnfp->xnf_tx_pkt_id_list = txp_info->id;
1171 	txp_info->id = tx_id;
1172 
1173 	/* Prepare for DMA mapping of tx buffer(s) */
1174 	rc = ddi_dma_addr_bind_handle(txp_info->dma_handle,
1175 	    NULL, bufaddr, length, DDI_DMA_WRITE | DDI_DMA_STREAMING,
1176 	    DDI_DMA_DONTWAIT, 0, &dma_cookie, &ncookies);
1177 	if (rc != DDI_DMA_MAPPED) {
1178 		ASSERT(rc != DDI_DMA_INUSE);
1179 		ASSERT(rc != DDI_DMA_PARTIAL_MAP);
1180 		/*
1181 		 *  Return id to free list
1182 		 */
1183 		txp_info->id = xnfp->xnf_tx_pkt_id_list;
1184 		xnfp->xnf_tx_pkt_id_list = tx_id;
1185 		if (rc == DDI_DMA_NORESOURCES) {
1186 			xnfp->xnf_stat_tx_defer++;
1187 			return (B_FALSE); /* Retry later */
1188 		}
1189 #ifdef XNF_DEBUG
1190 		cmn_err(CE_WARN, "xnf%d: bind_handle failed (%x)",
1191 		    ddi_get_instance(xnfp->xnf_devinfo), rc);
1192 #endif
1193 		return (B_FALSE);
1194 	}
1195 
1196 	ASSERT(ncookies == 1);
1197 	ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_tx_head);
1198 	ASSERT((signed short)ref >= 0);
1199 	mfn = xnf_btop(pa_to_ma((paddr_t)dma_cookie.dmac_laddress));
1200 	gnttab_grant_foreign_access_ref(ref, oeid, mfn,
1201 	    xnfp->xnf_tx_pages_readonly);
1202 	txp_info->grant_ref = ref;
1203 	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1204 	txrp->gref = ref;
1205 	txrp->size = dma_cookie.dmac_size;
1206 	txrp->offset = (uintptr_t)bufaddr & PAGEOFFSET;
1207 	txrp->id = tx_id;
1208 	txrp->flags = 0;
1209 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &pflags);
1210 	if (pflags != 0) {
1211 		ASSERT(xnfp->xnf_cksum_offload);
1212 		/*
1213 		 * If the local protocol stack requests checksum
1214 		 * offload we set the 'checksum blank' flag,
1215 		 * indicating to the peer that we need the checksum
1216 		 * calculated for us.
1217 		 *
1218 		 * We _don't_ set the validated flag, because we haven't
1219 		 * validated that the data and the checksum match.
1220 		 */
1221 		txrp->flags |= NETTXF_csum_blank;
1222 		xnfp->xnf_stat_tx_cksum_deferred++;
1223 	}
1224 	membar_producer();
1225 	xnfp->xnf_tx_ring.req_prod_pvt = slot + 1;
1226 
1227 	txp_info->mp = mp;
1228 	txp_info->bdesc = xmitbuf;
1229 
1230 	xnfp->xnf_stat_opackets++;
1231 	xnfp->xnf_stat_obytes += pktlen;
1232 
1233 	return (B_TRUE);	/* successful transmit attempt */
1234 }
1235 
1236 mblk_t *
1237 xnf_send(void *arg, mblk_t *mp)
1238 {
1239 	xnf_t *xnfp = arg;
1240 	mblk_t *next;
1241 	boolean_t sent_something = B_FALSE;
1242 
1243 	mutex_enter(&xnfp->xnf_txlock);
1244 
1245 	/*
1246 	 * Transmission attempts should be impossible without having
1247 	 * previously called xnf_start().
1248 	 */
1249 	ASSERT(xnfp->xnf_running);
1250 
1251 	/*
1252 	 * Wait for getting connected to the backend
1253 	 */
1254 	while (!xnfp->xnf_connected) {
1255 		cv_wait(&xnfp->xnf_cv, &xnfp->xnf_txlock);
1256 	}
1257 
1258 	while (mp != NULL) {
1259 		next = mp->b_next;
1260 		mp->b_next = NULL;
1261 
1262 		if (!xnf_send_one(xnfp, mp)) {
1263 			mp->b_next = next;
1264 			break;
1265 		}
1266 
1267 		mp = next;
1268 		sent_something = B_TRUE;
1269 	}
1270 
1271 	if (sent_something) {
1272 		boolean_t notify;
1273 
1274 		/* LINTED: constant in conditional context */
1275 		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1276 		    notify);
1277 		if (notify)
1278 			ec_notify_via_evtchn(xnfp->xnf_evtchn);
1279 	}
1280 
1281 	mutex_exit(&xnfp->xnf_txlock);
1282 
1283 	return (mp);
1284 }
1285 
1286 /*
1287  *  xnf_intr() -- ring interrupt service routine
1288  */
1289 static uint_t
1290 xnf_intr(caddr_t arg)
1291 {
1292 	xnf_t *xnfp = (xnf_t *)arg;
1293 	int tx_ring_space;
1294 
1295 	mutex_enter(&xnfp->xnf_intrlock);
1296 
1297 	/*
1298 	 * If not connected to the peer or not started by the upper
1299 	 * layers we cannot usefully handle interrupts.
1300 	 */
1301 	if (!(xnfp->xnf_connected && xnfp->xnf_running)) {
1302 		mutex_exit(&xnfp->xnf_intrlock);
1303 		xnfp->xnf_stat_unclaimed_interrupts++;
1304 		return (DDI_INTR_UNCLAIMED);
1305 	}
1306 
1307 #ifdef XNF_DEBUG
1308 	if (xnfdebug & XNF_DEBUG_INT)
1309 		printf("xnf%d intr(0x%p)\n",
1310 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1311 #endif
1312 	if (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1313 		mblk_t *mp;
1314 
1315 		if (xnfp->xnf_rx_hvcopy)
1316 			mp = xnf_process_hvcopy_recv(xnfp);
1317 		else
1318 			mp = xnf_process_recv(xnfp);
1319 
1320 		if (mp != NULL)
1321 			mac_rx(xnfp->xnf_mh, xnfp->xnf_rx_handle, mp);
1322 	}
1323 
1324 	/*
1325 	 * Clean tx ring and try to start any blocked xmit streams if
1326 	 * there is now some space.
1327 	 */
1328 	mutex_enter(&xnfp->xnf_txlock);
1329 	tx_ring_space = xnf_clean_tx_ring(xnfp);
1330 	mutex_exit(&xnfp->xnf_txlock);
1331 	if (tx_ring_space > XNF_TX_FREE_THRESH) {
1332 		mutex_exit(&xnfp->xnf_intrlock);
1333 		mac_tx_update(xnfp->xnf_mh);
1334 		mutex_enter(&xnfp->xnf_intrlock);
1335 	}
1336 
1337 	xnfp->xnf_stat_interrupts++;
1338 	mutex_exit(&xnfp->xnf_intrlock);
1339 	return (DDI_INTR_CLAIMED); /* indicate that the interrupt was for us */
1340 }
1341 
1342 /*
1343  *  xnf_start() -- start the board receiving and enable interrupts.
1344  */
1345 static int
1346 xnf_start(void *arg)
1347 {
1348 	xnf_t *xnfp = arg;
1349 
1350 #ifdef XNF_DEBUG
1351 	if (xnfdebug & XNF_DEBUG_TRACE)
1352 		printf("xnf%d start(0x%p)\n",
1353 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1354 #endif
1355 
1356 	mutex_enter(&xnfp->xnf_intrlock);
1357 	mutex_enter(&xnfp->xnf_txlock);
1358 
1359 	/* Accept packets from above. */
1360 	xnfp->xnf_running = B_TRUE;
1361 
1362 	mutex_exit(&xnfp->xnf_txlock);
1363 	mutex_exit(&xnfp->xnf_intrlock);
1364 
1365 	return (0);
1366 }
1367 
1368 /* xnf_stop() - disable hardware */
1369 static void
1370 xnf_stop(void *arg)
1371 {
1372 	xnf_t *xnfp = arg;
1373 
1374 #ifdef XNF_DEBUG
1375 	if (xnfdebug & XNF_DEBUG_TRACE)
1376 		printf("xnf%d stop(0x%p)\n",
1377 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1378 #endif
1379 
1380 	mutex_enter(&xnfp->xnf_intrlock);
1381 	mutex_enter(&xnfp->xnf_txlock);
1382 
1383 	xnfp->xnf_running = B_FALSE;
1384 
1385 	mutex_exit(&xnfp->xnf_txlock);
1386 	mutex_exit(&xnfp->xnf_intrlock);
1387 }
1388 
1389 /*
1390  * Driver private functions follow
1391  */
1392 
1393 /*
1394  * Hang buffer on rx ring
1395  */
1396 static void
1397 rx_buffer_hang(xnf_t *xnfp, struct xnf_buffer_desc *bdesc)
1398 {
1399 	volatile netif_rx_request_t	*reqp;
1400 	RING_IDX			hang_ix;
1401 	grant_ref_t			ref;
1402 	domid_t				oeid;
1403 
1404 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1405 
1406 	ASSERT(MUTEX_HELD(&xnfp->xnf_intrlock));
1407 	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
1408 	    xnfp->xnf_rx_ring.req_prod_pvt);
1409 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
1410 	ASSERT(xnfp->xnf_rxpkt_bufptr[hang_ix] == NULL);
1411 	if (bdesc->grant_ref == GRANT_INVALID_REF) {
1412 		ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_rx_head);
1413 		ASSERT((signed short)ref >= 0);
1414 		bdesc->grant_ref = ref;
1415 		if (xnfp->xnf_rx_hvcopy) {
1416 			pfn_t pfn = xnf_btop(bdesc->buf_phys);
1417 			mfn_t mfn = pfn_to_mfn(pfn);
1418 
1419 			gnttab_grant_foreign_access_ref(ref, oeid, mfn, 0);
1420 		} else {
1421 			gnttab_grant_foreign_transfer_ref(ref, oeid);
1422 		}
1423 	}
1424 	reqp->id = hang_ix;
1425 	reqp->gref = bdesc->grant_ref;
1426 	bdesc->id = hang_ix;
1427 	xnfp->xnf_rxpkt_bufptr[hang_ix] = bdesc;
1428 	membar_producer();
1429 	xnfp->xnf_rx_ring.req_prod_pvt++;
1430 }
1431 
/*
 * Receive processing for hvcopy mode.
 *
 * Consumes every pending response on the rx ring.  For a response
 * with a positive status the payload is copied out of the posted
 * buffer into a newly allocated mblk, and the same buffer is put back
 * on the ring with rx_buffer_hang().  A response with status <= 0 is
 * counted as an error and its buffer is re-hung as is.
 *
 * Returns the packets produced, chained through b_next, or NULL if
 * there are none.  rx_buffer_hang() asserts that xnf_intrlock is held,
 * so the caller must hold it.
 */
1432 static mblk_t *
1433 xnf_process_hvcopy_recv(xnf_t *xnfp)
1434 {
1435 	netif_rx_response_t *rxpkt;
1436 	mblk_t		*mp, *head, *tail;
1437 	struct		xnf_buffer_desc *bdesc;
1438 	boolean_t	hwcsum = B_FALSE, notify, work_to_do;
1439 	size_t 		len;
1440 
1441 	/*
1442 	 * in loop over unconsumed responses, we do:
1443 	 * 1. get a response
1444 	 * 2. take corresponding buffer off recv. ring
1445 	 * 3. indicate this by setting slot to NULL
1446 	 * 4. create a new message and
1447 	 * 5. copy data in, adjust ptr
1448 	 *
1449 	 * outside loop:
1450 	 * 7. make sure no more data has arrived; kick HV
1451 	 */
1452 
1453 	head = tail = NULL;
1454 
1455 loop:
1456 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1457 
1458 		/* 1. */
1459 		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
1460 		    xnfp->xnf_rx_ring.rsp_cons);
1461 
1462 		DTRACE_PROBE4(got_PKT, int, (int)rxpkt->id, int,
1463 		    (int)rxpkt->offset,
1464 		    int, (int)rxpkt->flags, int, (int)rxpkt->status);
1465 
1466 		/*
1467 		 * 2.
1468 		 * Take buffer off of receive ring
1469 		 */
1470 		hwcsum = B_FALSE;
1471 		bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id];
1472 		/* 3 */
1473 		xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL;
1474 		ASSERT(bdesc->id == rxpkt->id);
		/* A status <= 0 carries no data; only the counters change. */
1475 		if (rxpkt->status <= 0) {
1476 			DTRACE_PROBE4(pkt_status_negative, int, rxpkt->status,
1477 			    char *, bdesc->buf, int, rxpkt->offset,
1478 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1479 			mp = NULL;
1480 			xnfp->xnf_stat_errrx++;
1481 			if (rxpkt->status == 0)
1482 				xnfp->xnf_stat_runt++;
1483 			if (rxpkt->status == NETIF_RSP_ERROR)
1484 				xnfp->xnf_stat_mac_rcv_error++;
1485 			if (rxpkt->status == NETIF_RSP_DROPPED)
1486 				xnfp->xnf_stat_norxbuf++;
1487 			/*
1488 			 * re-hang the buffer
1489 			 */
1490 			rx_buffer_hang(xnfp, bdesc);
1491 		} else {
1492 			grant_ref_t		ref =  bdesc->grant_ref;
1493 			struct xnf_buffer_desc	*new_bdesc;
1494 			unsigned long		off = rxpkt->offset;
1495 
1496 			DTRACE_PROBE4(pkt_status_ok, int, rxpkt->status,
1497 			    char *, bdesc->buf, int, rxpkt->offset,
1498 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
			/* A positive status is the length of the data. */
1499 			len = rxpkt->status;
			/*
			 * NOTE(review): the bound checked is PAGEOFFSET,
			 * not PAGESIZE -- confirm that this is intended.
			 */
1500 			ASSERT(off + len <= PAGEOFFSET);
1501 			if (ref == GRANT_INVALID_REF) {
1502 				mp = NULL;
1503 				new_bdesc = bdesc;
1504 				cmn_err(CE_WARN, "Bad rx grant reference %d "
1505 				    "from dom %d", ref,
1506 				    xvdi_get_oeid(xnfp->xnf_devinfo));
1507 				goto luckless;
1508 			}
1509 			/*
1510 			 * Release ref which we'll be re-claiming in
1511 			 * rx_buffer_hang().
1512 			 */
1513 			bdesc->grant_ref = GRANT_INVALID_REF;
1514 			(void) gnttab_end_foreign_access_ref(ref, 0);
1515 			gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head,
1516 			    ref);
1517 			if (rxpkt->flags & NETRXF_data_validated)
1518 				hwcsum = B_TRUE;
1519 
1520 			/*
1521 			 * XXPV for the initial implementation of HVcopy,
1522 			 * create a new msg and copy in the data
1523 			 */
1524 			/* 4. */
1525 			if ((mp = allocb(len, BPRI_MED)) == NULL) {
1526 				/*
1527 				 * Couldn't get buffer to copy to,
1528 				 * drop this data, and re-hang
1529 				 * the buffer on the ring.
1530 				 */
1531 				xnfp->xnf_stat_norxbuf++;
1532 				DTRACE_PROBE(alloc_nix);
1533 			} else {
1534 				/* 5. */
1535 				DTRACE_PROBE(alloc_ok);
1536 				bcopy(bdesc->buf + off, mp->b_wptr,
1537 				    len);
1538 				mp->b_wptr += len;
1539 			}
			/* The data was copied, so the same buffer is reused. */
1540 			new_bdesc = bdesc;
1541 luckless:
1542 
1543 			/* Re-hang old or hang new buffer. */
1544 			rx_buffer_hang(xnfp, new_bdesc);
1545 		}
		/* Append the packet, if there is one, to the result chain. */
1546 		if (mp) {
1547 			if (hwcsum) {
1548 				/*
1549 				 * See comments in xnf_process_recv().
1550 				 */
1551 
1552 				(void) hcksum_assoc(mp, NULL,
1553 				    NULL, 0, 0, 0, 0,
1554 				    HCK_FULLCKSUM |
1555 				    HCK_FULLCKSUM_OK,
1556 				    0);
1557 				xnfp->xnf_stat_rx_cksum_no_need++;
1558 			}
1559 			if (head == NULL) {
1560 				head = tail = mp;
1561 			} else {
1562 				tail->b_next = mp;
1563 				tail = mp;
1564 			}
1565 
1566 			ASSERT(mp->b_next == NULL);
1567 
1568 			xnfp->xnf_stat_ipackets++;
1569 			xnfp->xnf_stat_rbytes += len;
1570 		}
1571 
1572 		xnfp->xnf_rx_ring.rsp_cons++;
1573 
1574 		xnfp->xnf_stat_hvcopy_packet_processed++;
1575 	}
1576 
1577 	/* 7. */
1578 	/*
1579 	 * Has more data come in since we started?
1580 	 */
1581 	/* LINTED: constant in conditional context */
1582 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do);
1583 	if (work_to_do)
1584 		goto loop;
1585 
1586 	/*
1587 	 * Indicate to the backend that we have re-filled the receive
1588 	 * ring.
1589 	 */
1590 	/* LINTED: constant in conditional context */
1591 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
1592 	if (notify)
1593 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1594 
1595 	return (head);
1596 }
1597 
/*
 * Process all queued received packets.  This is the receive routine
 * xnf_intr() uses when xnf_rx_hvcopy is not set.
 *
 * For each response with a positive status the grant transfer is
 * ended and the received machine page is mapped at bdesc->buf.  Then
 * either
 *  - the data is copied into a new mblk (packets of at most
 *    xnf_rx_bcopy_thresh bytes, or when no replacement buffer could
 *    be obtained), the page is given back through balloon_free_pages()
 *    and the same buffer is re-hung, or
 *  - the buffer itself is passed upstream via desballoc() and a
 *    replacement buffer is hung on the ring in its place.
 * Responses with status <= 0 are counted as errors and their buffer
 * is re-hung as is.
 *
 * Returns the packets produced, chained through b_next, or NULL if
 * there are none.  rx_buffer_hang() asserts that xnf_intrlock is held,
 * so the caller must hold it.
 */
1599 static mblk_t *
1600 xnf_process_recv(xnf_t *xnfp)
1601 {
1602 	volatile netif_rx_response_t *rxpkt;
1603 	mblk_t *mp, *head, *tail;
1604 	struct xnf_buffer_desc *bdesc;
1605 	extern mblk_t *desballoc(unsigned char *, size_t, uint_t, frtn_t *);
1606 	boolean_t hwcsum = B_FALSE, notify, work_to_do;
1607 	size_t len;
1608 	pfn_t pfn;
1609 	long cnt;
1610 
1611 	head = tail = NULL;
1612 loop:
1613 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1614 
1615 		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
1616 		    xnfp->xnf_rx_ring.rsp_cons);
1617 
1618 		/*
1619 		 * Take buffer off of receive ring
1620 		 */
1621 		hwcsum = B_FALSE;
1622 		bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id];
1623 		xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL;
1624 		ASSERT(bdesc->id == rxpkt->id);
		/* A status <= 0 carries no data; only the counters change. */
1625 		if (rxpkt->status <= 0) {
1626 			mp = NULL;
1627 			xnfp->xnf_stat_errrx++;
1628 			if (rxpkt->status == 0)
1629 				xnfp->xnf_stat_runt++;
1630 			if (rxpkt->status == NETIF_RSP_ERROR)
1631 				xnfp->xnf_stat_mac_rcv_error++;
1632 			if (rxpkt->status == NETIF_RSP_DROPPED)
1633 				xnfp->xnf_stat_norxbuf++;
1634 			/*
1635 			 * re-hang the buffer
1636 			 */
1637 			rx_buffer_hang(xnfp, bdesc);
1638 		} else {
1639 			grant_ref_t ref =  bdesc->grant_ref;
1640 			struct xnf_buffer_desc *new_bdesc;
1641 			unsigned long off = rxpkt->offset;
1642 			unsigned long mfn;
1643 
			/* A positive status is the length of the data. */
1644 			len = rxpkt->status;
			/*
			 * NOTE(review): the bound checked is PAGEOFFSET,
			 * not PAGESIZE -- confirm that this is intended.
			 */
1645 			ASSERT(off + len <= PAGEOFFSET);
1646 			if (ref == GRANT_INVALID_REF) {
1647 				mp = NULL;
1648 				new_bdesc = bdesc;
1649 				cmn_err(CE_WARN, "Bad rx grant reference %d "
1650 				    "from dom %d", ref,
1651 				    xvdi_get_oeid(xnfp->xnf_devinfo));
1652 				goto luckless;
1653 			}
			/*
			 * End the transfer grant; this yields the machine
			 * frame that was handed to us.
			 */
1654 			bdesc->grant_ref = GRANT_INVALID_REF;
1655 			mfn = gnttab_end_foreign_transfer_ref(ref);
1656 			ASSERT(mfn != MFN_INVALID);
1657 			ASSERT(hat_getpfnum(kas.a_hat, bdesc->buf) ==
1658 			    PFN_INVALID);
1659 
1660 			gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head,
1661 			    ref);
			/*
			 * Attach the received frame to the buffer's pfn
			 * and map it at the buffer's kernel address.
			 */
1662 			reassign_pfn(xnf_btop(bdesc->buf_phys), mfn);
1663 			hat_devload(kas.a_hat, bdesc->buf, PAGESIZE,
1664 			    xnf_btop(bdesc->buf_phys),
1665 			    PROT_READ | PROT_WRITE, HAT_LOAD);
1666 			balloon_drv_added(1);
1667 
1668 			if (rxpkt->flags & NETRXF_data_validated)
1669 				hwcsum = B_TRUE;
1670 			if (len <= xnf_rx_bcopy_thresh) {
1671 				/*
1672 				 * For small buffers, just copy the data
1673 				 * and send the copy upstream.
1674 				 */
1675 				new_bdesc = NULL;
1676 			} else {
1677 				/*
1678 				 * We send a pointer to this data upstream;
1679 				 * we need a new buffer to replace this one.
1680 				 */
1681 				mutex_enter(&xnfp->xnf_rx_buf_mutex);
1682 				new_bdesc = xnf_get_buffer(xnfp);
1683 				if (new_bdesc != NULL) {
1684 					xnfp->xnf_rx_bufs_outstanding++;
1685 				} else {
1686 					xnfp->xnf_stat_rx_no_ringbuf++;
1687 				}
1688 				mutex_exit(&xnfp->xnf_rx_buf_mutex);
1689 			}
1690 
1691 			if (new_bdesc == NULL) {
1692 				/*
1693 				 * Don't have a new ring buffer; bcopy the data
1694 				 * from the buffer, and preserve the
1695 				 * original buffer
1696 				 */
1697 				if ((mp = allocb(len, BPRI_MED)) == NULL) {
1698 					/*
1699 					 * Couldn't get buffer to copy to,
1700 					 * drop this data, and re-hang
1701 					 * the buffer on the ring.
1702 					 */
1703 					xnfp->xnf_stat_norxbuf++;
1704 				} else {
					/* b_wptr is advanced at "luckless". */
1705 					bcopy(bdesc->buf + off, mp->b_wptr,
1706 					    len);
1707 				}
1708 				/*
1709 				 * Give the buffer page back to xen
1710 				 */
1711 				pfn = xnf_btop(bdesc->buf_phys);
1712 				cnt = balloon_free_pages(1, &mfn, bdesc->buf,
1713 				    &pfn);
1714 				if (cnt != 1) {
1715 					cmn_err(CE_WARN, "unable to give a "
1716 					    "page back to the hypervisor\n");
1717 				}
1718 				new_bdesc = bdesc;
1719 			} else {
				/*
				 * Wrap the buffer itself in an mblk; bdesc is
				 * passed as the free-routine descriptor.
				 */
1720 				if ((mp = desballoc((unsigned char *)bdesc->buf,
1721 				    off + len, 0, (frtn_t *)bdesc)) == NULL) {
1722 					/*
1723 					 * Couldn't get mblk to pass recv data
1724 					 * up with, free the old ring buffer
1725 					 */
1726 					xnfp->xnf_stat_norxbuf++;
1727 					xnf_rcv_complete(bdesc);
1728 					goto luckless;
1729 				}
1730 				(void) ddi_dma_sync(bdesc->dma_handle,
1731 				    0, 0, DDI_DMA_SYNC_FORCPU);
1732 
				/* Skip to where the data starts in the page. */
1733 				mp->b_wptr += off;
1734 				mp->b_rptr += off;
1735 			}
1736 luckless:
			/* Both data paths leave b_wptr at the data start. */
1737 			if (mp)
1738 				mp->b_wptr += len;
1739 			/* re-hang old or hang new buffer */
1740 			rx_buffer_hang(xnfp, new_bdesc);
1741 		}
		/* Append the packet, if there is one, to the result chain. */
1742 		if (mp) {
1743 			if (hwcsum) {
1744 				/*
1745 				 * If the peer says that the data has
1746 				 * been validated then we declare that
1747 				 * the full checksum has been
1748 				 * verified.
1749 				 *
1750 				 * We don't look at the "checksum
1751 				 * blank" flag, and hence could have a
1752 				 * packet here that we are asserting
1753 				 * is good with a blank checksum.
1754 				 *
1755 				 * The hardware checksum offload
1756 				 * specification says that we must
1757 				 * provide the actual checksum as well
1758 				 * as an assertion that it is valid,
1759 				 * but the protocol stack doesn't
1760 				 * actually use it and some other
1761 				 * drivers don't bother, so we don't.
1762 				 * If it was necessary we could grovel
1763 				 * in the packet to find it.
1764 				 */
1765 
1766 				(void) hcksum_assoc(mp, NULL,
1767 				    NULL, 0, 0, 0, 0,
1768 				    HCK_FULLCKSUM |
1769 				    HCK_FULLCKSUM_OK,
1770 				    0);
1771 				xnfp->xnf_stat_rx_cksum_no_need++;
1772 			}
1773 			if (head == NULL) {
1774 				head = tail = mp;
1775 			} else {
1776 				tail->b_next = mp;
1777 				tail = mp;
1778 			}
1779 
1780 			ASSERT(mp->b_next == NULL);
1781 
1782 			xnfp->xnf_stat_ipackets++;
1783 			xnfp->xnf_stat_rbytes += len;
1784 		}
1785 
1786 		xnfp->xnf_rx_ring.rsp_cons++;
1787 	}
1788 
1789 	/*
1790 	 * Has more data come in since we started?
1791 	 */
1792 	/* LINTED: constant in conditional context */
1793 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do);
1794 	if (work_to_do)
1795 		goto loop;
1796 
1797 	/*
1798 	 * Indicate to the backend that we have re-filled the receive
1799 	 * ring.
1800 	 */
1801 	/* LINTED: constant in conditional context */
1802 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
1803 	if (notify)
1804 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1805 
1806 	return (head);
1807 }
1808 
1809 /* Called when the upper layers free a message we passed upstream */
1810 static void
1811 xnf_rcv_complete(struct xnf_buffer_desc *bdesc)
1812 {
1813 	xnf_t *xnfp = bdesc->xnfp;
1814 	pfn_t pfn;
1815 	long cnt;
1816 
1817 	/* One less outstanding receive buffer */
1818 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
1819 	--xnfp->xnf_rx_bufs_outstanding;
1820 	/*
1821 	 * Return buffer to the free list, unless the free list is getting
1822 	 * too large.  XXPV - this threshold may need tuning.
1823 	 */
1824 	if (xnfp->xnf_rx_descs_free < xnf_rx_bufs_lowat) {
1825 		/*
1826 		 * Unmap the page, and hand the machine page back
1827 		 * to xen so it can be re-used as a backend net buffer.
1828 		 */
1829 		pfn = xnf_btop(bdesc->buf_phys);
1830 		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
1831 		if (cnt != 1) {
1832 			cmn_err(CE_WARN, "unable to give a page back to the "
1833 			    "hypervisor\n");
1834 		}
1835 
1836 		bdesc->next = xnfp->xnf_free_list;
1837 		xnfp->xnf_free_list = bdesc;
1838 		xnfp->xnf_rx_descs_free++;
1839 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
1840 	} else {
1841 		/*
1842 		 * We can return everything here since we have a free buffer
1843 		 * that we have not given the backing page for back to xen.
1844 		 */
1845 		--xnfp->xnf_rx_buffer_count;
1846 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
1847 		(void) ddi_dma_unbind_handle(bdesc->dma_handle);
1848 		ddi_dma_mem_free(&bdesc->acc_handle);
1849 		ddi_dma_free_handle(&bdesc->dma_handle);
1850 		kmem_free(bdesc, sizeof (*bdesc));
1851 	}
1852 }
1853 
1854 /*
1855  *  xnf_alloc_dma_resources() -- initialize the drivers structures
1856  *
1856  *  Allocates, in order: one DMA handle per transmit packet slot, a
1856  *  page each for the transmit and the receive descriptor rings
1856  *  (allocated, bound and initialized as shared/front rings), and one
1856  *  receive buffer per receive ring entry, which is put on the free
1856  *  list.
1856  *
1856  *  Returns DDI_SUCCESS, or DDI_FAILURE if any allocation fails.  On
1856  *  the failure paths that go through the labels at the end,
1856  *  xnf_release_dma_resources() is called before returning.
1856  */
1857 static int
1858 xnf_alloc_dma_resources(xnf_t *xnfp)
1859 {
1860 	dev_info_t 		*devinfo = xnfp->xnf_devinfo;
1861 	int			i;
1862 	size_t			len;
1863 	ddi_dma_cookie_t	dma_cookie;
1864 	uint_t			ncookies;
1865 	struct xnf_buffer_desc	*bdesc;
1866 	int			rc;
1867 	caddr_t			rptr;
1868 
1869 	xnfp->xnf_n_rx = NET_RX_RING_SIZE;
1870 	xnfp->xnf_max_rx_bufs = xnf_rx_bufs_hiwat;
1871 
1872 	xnfp->xnf_n_tx = NET_TX_RING_SIZE;
1873 
1874 	/*
1875 	 * The code below allocates all the DMA data structures that
1876 	 * need to be released when the driver is detached.
1877 	 *
1878 	 * First allocate handles for mapping (virtual address) pointers to
1879 	 * transmit data buffers to physical addresses
1880 	 */
	/*
	 * NOTE(review): the early return below does not free the handles
	 * allocated by earlier iterations, and xnf_release_dma_resources()
	 * as visible here does not free them either -- confirm that the
	 * caller cleans them up.
	 */
1881 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
1882 		if ((rc = ddi_dma_alloc_handle(devinfo,
1883 		    &tx_buffer_dma_attr, DDI_DMA_SLEEP, 0,
1884 		    &xnfp->xnf_tx_pkt_info[i].dma_handle)) != DDI_SUCCESS)
1885 			return (DDI_FAILURE);
1886 	}
1887 
1888 	/*
1889 	 * Allocate page for the transmit descriptor ring.
1890 	 */
1891 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
1892 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
1893 		goto alloc_error;
1894 
1895 	if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
1896 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
1897 	    DDI_DMA_SLEEP, 0, &rptr, &len,
1898 	    &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
1899 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
1900 		xnfp->xnf_tx_ring_dma_handle = NULL;
1901 		goto alloc_error;
1902 	}
1903 
	/*
	 * On a bind failure the ring memory and handle are undone right
	 * here and the acchandle is cleared, so the common release code
	 * skips this ring.
	 */
1904 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
1905 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
1906 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
1907 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
1908 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
1909 		xnfp->xnf_tx_ring_dma_handle = NULL;
1910 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
1911 		if (rc == DDI_DMA_NORESOURCES)
1912 			goto alloc_error;
1913 		else
1914 			goto error;
1915 	}
1916 
	/* Zero the page, then set up the shared and the front ring in it. */
1917 	ASSERT(ncookies == 1);
1918 	bzero(rptr, PAGESIZE);
1919 	/* LINTED: constant in conditional context */
1920 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
1921 	/* LINTED: constant in conditional context */
1922 	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
1923 	xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
1924 
1925 	/*
1926 	 * Allocate page for the receive descriptor ring.
1927 	 */
1928 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
1929 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
1930 		goto alloc_error;
1931 
1932 	if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
1933 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
1934 	    DDI_DMA_SLEEP, 0, &rptr, &len,
1935 	    &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
1936 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
1937 		xnfp->xnf_rx_ring_dma_handle = NULL;
1938 		goto alloc_error;
1939 	}
1940 
	/* Same local undo as for the transmit ring above. */
1941 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
1942 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
1943 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
1944 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
1945 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
1946 		xnfp->xnf_rx_ring_dma_handle = NULL;
1947 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
1948 		if (rc == DDI_DMA_NORESOURCES)
1949 			goto alloc_error;
1950 		else
1951 			goto error;
1952 	}
1953 
1954 	ASSERT(ncookies == 1);
1955 	bzero(rptr, PAGESIZE);
1956 	/* LINTED: constant in conditional context */
1957 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
1958 	/* LINTED: constant in conditional context */
1959 	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
1960 	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
1961 
1962 	/*
1963 	 * Preallocate receive buffers for each receive descriptor.
1964 	 */
1965 
1966 	/* Set up the "free list" of receive buffer descriptors */
1967 	for (i = 0; i < xnfp->xnf_n_rx; i++) {
1968 		if ((bdesc = xnf_alloc_buffer(xnfp)) == NULL)
1969 			goto alloc_error;
1970 		bdesc->next = xnfp->xnf_free_list;
1971 		xnfp->xnf_free_list = bdesc;
1972 	}
1973 
1974 	return (DDI_SUCCESS);
1975 
	/* alloc_error warns about the shortage, then falls into error. */
1976 alloc_error:
1977 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
1978 	    ddi_get_instance(xnfp->xnf_devinfo));
1979 error:
1980 	xnf_release_dma_resources(xnfp);
1981 	return (DDI_FAILURE);
1982 }
1983 
1984 /*
1985  * Release all DMA resources in the opposite order from acquisition
1986  * Should not be called until all outstanding esballoc buffers
1987  * have been returned.
1988  */
1989 static void
1990 xnf_release_dma_resources(xnf_t *xnfp)
1991 {
1992 	int i;
1993 
1994 	/*
1995 	 * Free receive buffers which are currently associated with
1996 	 * descriptors
1997 	 */
1998 	for (i = 0; i < xnfp->xnf_n_rx; i++) {
1999 		struct xnf_buffer_desc *bp;
2000 
2001 		if ((bp = xnfp->xnf_rxpkt_bufptr[i]) == NULL)
2002 			continue;
2003 		xnf_free_buffer(bp);
2004 		xnfp->xnf_rxpkt_bufptr[i] = NULL;
2005 	}
2006 
2007 	/* Free the receive ring buffer */
2008 	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2009 		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2010 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2011 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2012 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2013 	}
2014 	/* Free the transmit ring buffer */
2015 	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2016 		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2017 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2018 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2019 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2020 	}
2021 }
2022 
2023 static void
2024 xnf_release_mblks(xnf_t *xnfp)
2025 {
2026 	int	i;
2027 
2028 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
2029 		if (xnfp->xnf_tx_pkt_info[i].mp == NULL)
2030 			continue;
2031 		freemsg(xnfp->xnf_tx_pkt_info[i].mp);
2032 		xnfp->xnf_tx_pkt_info[i].mp = NULL;
2033 		(void) ddi_dma_unbind_handle(
2034 		    xnfp->xnf_tx_pkt_info[i].dma_handle);
2035 	}
2036 }
2037 
2038 /*
2039  * Remove a xmit buffer descriptor from the head of the free list and return
2040  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
2041  * Called with the tx_buf_mutex held.
2042  */
2043 static struct xnf_buffer_desc *
2044 xnf_get_tx_buffer(xnf_t *xnfp)
2045 {
2046 	struct xnf_buffer_desc *bdesc;
2047 
2048 	bdesc = xnfp->xnf_tx_free_list;
2049 	if (bdesc != NULL) {
2050 		xnfp->xnf_tx_free_list = bdesc->next;
2051 	} else {
2052 		bdesc = xnf_alloc_tx_buffer(xnfp);
2053 	}
2054 	return (bdesc);
2055 }
2056 
2057 /*
2058  * Remove a buffer descriptor from the head of the free list and return
2059  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
2060  * Called with the rx_buf_mutex held.
2061  */
2062 static struct xnf_buffer_desc *
2063 xnf_get_buffer(xnf_t *xnfp)
2064 {
2065 	struct xnf_buffer_desc *bdesc;
2066 
2067 	bdesc = xnfp->xnf_free_list;
2068 	if (bdesc != NULL) {
2069 		xnfp->xnf_free_list = bdesc->next;
2070 		xnfp->xnf_rx_descs_free--;
2071 	} else {
2072 		bdesc = xnf_alloc_buffer(xnfp);
2073 	}
2074 	return (bdesc);
2075 }
2076 
2077 /*
2078  * Free a xmit buffer back to the xmit free list
2079  */
2080 static void
2081 xnf_free_tx_buffer(struct xnf_buffer_desc *bp)
2082 {
2083 	xnf_t *xnfp = bp->xnfp;
2084 
2085 	mutex_enter(&xnfp->xnf_tx_buf_mutex);
2086 	bp->next = xnfp->xnf_tx_free_list;
2087 	xnfp->xnf_tx_free_list = bp;
2088 	mutex_exit(&xnfp->xnf_tx_buf_mutex);
2089 }
2090 
2091 /*
2092  * Put a buffer descriptor onto the head of the free list.
2093  * for page-flip:
2094  * We can't really free these buffers back to the kernel
2095  * since we have given away their backing page to be used
2096  * by the back end net driver.
2097  * for hvcopy:
2098  * release all the memory
2099  */
2100 static void
2101 xnf_free_buffer(struct xnf_buffer_desc *bdesc)
2102 {
2103 	xnf_t *xnfp = bdesc->xnfp;
2104 
2105 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
2106 	if (xnfp->xnf_rx_hvcopy) {
2107 		if (ddi_dma_unbind_handle(bdesc->dma_handle) != DDI_SUCCESS)
2108 			goto out;
2109 		ddi_dma_mem_free(&bdesc->acc_handle);
2110 		ddi_dma_free_handle(&bdesc->dma_handle);
2111 		kmem_free(bdesc, sizeof (*bdesc));
2112 		xnfp->xnf_rx_buffer_count--;
2113 	} else {
2114 		bdesc->next = xnfp->xnf_free_list;
2115 		xnfp->xnf_free_list = bdesc;
2116 		xnfp->xnf_rx_descs_free++;
2117 	}
2118 out:
2119 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
2120 }
2121 
2122 /*
2123  * Allocate a DMA-able xmit buffer, including a structure to
2124  * keep track of the buffer.  Called with tx_buf_mutex held.
2125  */
2126 static struct xnf_buffer_desc *
2127 xnf_alloc_tx_buffer(xnf_t *xnfp)
2128 {
2129 	struct xnf_buffer_desc *bdesc;
2130 	size_t len;
2131 
2132 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
2133 		return (NULL);
2134 
2135 	/* allocate a DMA access handle for receive buffer */
2136 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buffer_dma_attr,
2137 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2138 		goto failure;
2139 
2140 	/* Allocate DMA-able memory for transmit buffer */
2141 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2142 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
2143 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2144 		goto failure_1;
2145 
2146 	bdesc->xnfp = xnfp;
2147 	xnfp->xnf_tx_buffer_count++;
2148 
2149 	return (bdesc);
2150 
2151 failure_1:
2152 	ddi_dma_free_handle(&bdesc->dma_handle);
2153 
2154 failure:
2155 	kmem_free(bdesc, sizeof (*bdesc));
2156 	return (NULL);
2157 }
2158 
2159 /*
2160  * Allocate a DMA-able receive buffer, including a structure to
2161  * keep track of the buffer.  Called with rx_buf_mutex held.
2162  */
2163 static struct xnf_buffer_desc *
2164 xnf_alloc_buffer(xnf_t *xnfp)
2165 {
2166 	struct			xnf_buffer_desc *bdesc;
2167 	size_t			len;
2168 	uint_t			ncookies;
2169 	ddi_dma_cookie_t	dma_cookie;
2170 	long			cnt;
2171 	pfn_t			pfn;
2172 
2173 	if (xnfp->xnf_rx_buffer_count >= xnfp->xnf_max_rx_bufs)
2174 		return (NULL);
2175 
2176 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
2177 		return (NULL);
2178 
2179 	/* allocate a DMA access handle for receive buffer */
2180 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buffer_dma_attr,
2181 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2182 		goto failure;
2183 
2184 	/* Allocate DMA-able memory for receive buffer */
2185 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2186 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
2187 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2188 		goto failure_1;
2189 
2190 	/* bind to virtual address of buffer to get physical address */
2191 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2192 	    bdesc->buf, PAGESIZE, DDI_DMA_READ | DDI_DMA_STREAMING,
2193 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2194 		goto failure_2;
2195 
2196 	bdesc->buf_phys = dma_cookie.dmac_laddress;
2197 	bdesc->xnfp = xnfp;
2198 	if (xnfp->xnf_rx_hvcopy) {
2199 		bdesc->free_rtn.free_func = xnf_copy_rcv_complete;
2200 	} else {
2201 		bdesc->free_rtn.free_func = xnf_rcv_complete;
2202 	}
2203 	bdesc->free_rtn.free_arg = (char *)bdesc;
2204 	bdesc->grant_ref = GRANT_INVALID_REF;
2205 	ASSERT(ncookies == 1);
2206 
2207 	xnfp->xnf_rx_buffer_count++;
2208 
2209 	if (!xnfp->xnf_rx_hvcopy) {
2210 		/*
2211 		 * Unmap the page, and hand the machine page back
2212 		 * to xen so it can be used as a backend net buffer.
2213 		 */
2214 		pfn = xnf_btop(bdesc->buf_phys);
2215 		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
2216 		if (cnt != 1) {
2217 			cmn_err(CE_WARN, "unable to give a page back to the "
2218 			    "hypervisor\n");
2219 		}
2220 	}
2221 
2222 	return (bdesc);
2223 
2224 failure_2:
2225 	ddi_dma_mem_free(&bdesc->acc_handle);
2226 
2227 failure_1:
2228 	ddi_dma_free_handle(&bdesc->dma_handle);
2229 
2230 failure:
2231 	kmem_free(bdesc, sizeof (*bdesc));
2232 	return (NULL);
2233 }
2234 
/*
 * Names of the driver's auxiliary named kstats.
 *
 * The position of each name is significant: xnf_kstat_aux_update()
 * stores values in exactly this order, so the two must be changed
 * together.
 */
static char *xnf_aux_statistics[] = {
	/* checksum offload */
	"tx_cksum_deferred",		/* 0 */
	"rx_cksum_no_need",		/* 1 */

	/* interrupt and ring activity */
	"interrupts",			/* 2 */
	"unclaimed_interrupts",		/* 3 */
	"tx_pullup",			/* 4 */
	"tx_pagebndry",			/* 5 */
	"tx_attempt",			/* 6 */
	"rx_no_ringbuf",		/* 7 */

	/* hypervisor copy */
	"hvcopy_packet_processed",	/* 8 */
};
2249 
2250 static int
2251 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2252 {
2253 	xnf_t *xnfp;
2254 	kstat_named_t *knp;
2255 
2256 	if (flag != KSTAT_READ)
2257 		return (EACCES);
2258 
2259 	xnfp = ksp->ks_private;
2260 	knp = ksp->ks_data;
2261 
2262 	/*
2263 	 * Assignment order must match that of the names in
2264 	 * xnf_aux_statistics.
2265 	 */
2266 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2267 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2268 
2269 	(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2270 	(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2271 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2272 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry;
2273 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt;
2274 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_no_ringbuf;
2275 
2276 	(knp++)->value.ui64 = xnfp->xnf_stat_hvcopy_packet_processed;
2277 
2278 	return (0);
2279 }
2280 
2281 static boolean_t
2282 xnf_kstat_init(xnf_t *xnfp)
2283 {
2284 	int nstat = sizeof (xnf_aux_statistics) /
2285 	    sizeof (xnf_aux_statistics[0]);
2286 	char **cp = xnf_aux_statistics;
2287 	kstat_named_t *knp;
2288 
2289 	/*
2290 	 * Create and initialise kstats.
2291 	 */
2292 	if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2293 	    ddi_get_instance(xnfp->xnf_devinfo),
2294 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
2295 	    nstat, 0)) == NULL)
2296 		return (B_FALSE);
2297 
2298 	xnfp->xnf_kstat_aux->ks_private = xnfp;
2299 	xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2300 
2301 	knp = xnfp->xnf_kstat_aux->ks_data;
2302 	while (nstat > 0) {
2303 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2304 
2305 		knp++;
2306 		cp++;
2307 		nstat--;
2308 	}
2309 
2310 	kstat_install(xnfp->xnf_kstat_aux);
2311 
2312 	return (B_TRUE);
2313 }
2314 
2315 static int
2316 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2317 {
2318 	xnf_t *xnfp = arg;
2319 
2320 	mutex_enter(&xnfp->xnf_intrlock);
2321 	mutex_enter(&xnfp->xnf_txlock);
2322 
2323 #define	mac_stat(q, r)				\
2324 	case (MAC_STAT_##q):			\
2325 		*val = xnfp->xnf_stat_##r;	\
2326 		break
2327 
2328 #define	ether_stat(q, r)			\
2329 	case (ETHER_STAT_##q):			\
2330 		*val = xnfp->xnf_stat_##r;	\
2331 		break
2332 
2333 	switch (stat) {
2334 
2335 	mac_stat(IPACKETS, ipackets);
2336 	mac_stat(OPACKETS, opackets);
2337 	mac_stat(RBYTES, rbytes);
2338 	mac_stat(OBYTES, obytes);
2339 	mac_stat(NORCVBUF, norxbuf);
2340 	mac_stat(IERRORS, errrx);
2341 	mac_stat(NOXMTBUF, tx_defer);
2342 
2343 	ether_stat(MACRCV_ERRORS, mac_rcv_error);
2344 	ether_stat(TOOSHORT_ERRORS, runt);
2345 
2346 	default:
2347 		mutex_exit(&xnfp->xnf_txlock);
2348 		mutex_exit(&xnfp->xnf_intrlock);
2349 
2350 		return (ENOTSUP);
2351 	}
2352 
2353 #undef mac_stat
2354 #undef ether_stat
2355 
2356 	mutex_exit(&xnfp->xnf_txlock);
2357 	mutex_exit(&xnfp->xnf_intrlock);
2358 
2359 	return (0);
2360 }
2361 
2362 /*ARGSUSED*/
2363 static void
2364 xnf_blank(void *arg, time_t ticks, uint_t count)
2365 {
2366 	/*
2367 	 * XXPV dme: blanking is not currently implemented.
2368 	 *
2369 	 * It's not obvious how to use the 'ticks' argument here.
2370 	 *
2371 	 * 'Count' might be used as an indicator of how to set
2372 	 * rsp_event when posting receive buffers to the rx_ring.  It
2373 	 * would replace the code at the tail of xnf_process_recv()
2374 	 * that simply indicates that the next completed packet should
2375 	 * cause an interrupt.
2376 	 */
2377 }
2378 
2379 static void
2380 xnf_resources(void *arg)
2381 {
2382 	xnf_t *xnfp = arg;
2383 	mac_rx_fifo_t mrf;
2384 
2385 	mrf.mrf_type = MAC_RX_FIFO;
2386 	mrf.mrf_blank = xnf_blank;
2387 	mrf.mrf_arg = (void *)xnfp;
2388 	mrf.mrf_normal_blank_time = 128;	/* XXPV dme: see xnf_blank() */
2389 	mrf.mrf_normal_pkt_count = 8;		/* XXPV dme: see xnf_blank() */
2390 
2391 	xnfp->xnf_rx_handle = mac_resource_add(xnfp->xnf_mh,
2392 	    (mac_resource_t *)&mrf);
2393 }
2394 
2395 /*ARGSUSED*/
2396 static void
2397 xnf_ioctl(void *arg, queue_t *q, mblk_t *mp)
2398 {
2399 	miocnak(q, mp, 0, EINVAL);
2400 }
2401 
2402 static boolean_t
2403 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
2404 {
2405 	xnf_t *xnfp = arg;
2406 
2407 	switch (cap) {
2408 	case MAC_CAPAB_HCKSUM: {
2409 		uint32_t *capab = cap_data;
2410 
2411 		/*
2412 		 * We declare ourselves capable of HCKSUM_INET_PARTIAL
2413 		 * in order that the protocol stack insert the
2414 		 * pseudo-header checksum in packets that it passes
2415 		 * down to us.
2416 		 *
2417 		 * Whilst the flag used to communicate with dom0 is
2418 		 * called "NETTXF_csum_blank", the checksum in the
2419 		 * packet must contain the pseudo-header checksum and
2420 		 * not zero. (In fact, a Solaris dom0 is happy to deal
2421 		 * with a checksum of zero, but a Linux dom0 is not.)
2422 		 */
2423 		if (xnfp->xnf_cksum_offload)
2424 			*capab = HCKSUM_INET_PARTIAL;
2425 		else
2426 			*capab = 0;
2427 		break;
2428 	}
2429 
2430 	case MAC_CAPAB_POLL:
2431 		/* Just return B_TRUE. */
2432 		break;
2433 
2434 	default:
2435 		return (B_FALSE);
2436 	}
2437 
2438 	return (B_TRUE);
2439 }
2440 
2441 /*ARGSUSED*/
2442 static void
2443 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
2444     void *arg, void *impl_data)
2445 {
2446 	xnf_t *xnfp = ddi_get_driver_private(dip);
2447 	XenbusState new_state = *(XenbusState *)impl_data;
2448 
2449 	ASSERT(xnfp != NULL);
2450 
2451 	switch (new_state) {
2452 	case XenbusStateConnected:
2453 		mutex_enter(&xnfp->xnf_intrlock);
2454 		mutex_enter(&xnfp->xnf_txlock);
2455 
2456 		xnfp->xnf_connected = B_TRUE;
2457 		cv_broadcast(&xnfp->xnf_cv);
2458 
2459 		mutex_exit(&xnfp->xnf_txlock);
2460 		mutex_exit(&xnfp->xnf_intrlock);
2461 
2462 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
2463 		break;
2464 
2465 	default:
2466 		break;
2467 	}
2468 }
2469 
2470 /*
2471  * Check whether backend is capable of and willing to talk
2472  * to us via hypervisor copy, as opposed to page flip.
2473  */
2474 static boolean_t
2475 xnf_hvcopy_peer_status(dev_info_t *devinfo)
2476 {
2477 	int	be_rx_copy;
2478 	int	err;
2479 
2480 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(devinfo),
2481 	    "feature-rx-copy", "%d", &be_rx_copy);
2482 	/*
2483 	 * If we fail to read the store we assume that the key is
2484 	 * absent, implying an older domain at the far end.  Older
2485 	 * domains cannot do HV copy (we assume ..).
2486 	 */
2487 	if (err != 0)
2488 		be_rx_copy = 0;
2489 
2490 	return (be_rx_copy?B_TRUE:B_FALSE);
2491 }
2492