xref: /titanic_41/usr/src/uts/common/xen/io/xnf.c (revision 5b81b7ca9b4fd49f704a5f492832ec658207bb98)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  *
29  * Copyright (c) 2004 Christian Limpach.
30  * All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  * 1. Redistributions of source code must retain the above copyright
36  *    notice, this list of conditions and the following disclaimer.
37  * 2. Redistributions in binary form must reproduce the above copyright
38  *    notice, this list of conditions and the following disclaimer in the
39  *    documentation and/or other materials provided with the distribution.
40  * 3. This section intentionally left blank.
41  * 4. The name of the author may not be used to endorse or promote products
42  *    derived from this software without specific prior written permission.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 /*
56  * Section 3 of the above license was updated in response to bug 6379571.
57  */
58 
59 /*
60  * xnf.c - Nemo-based network driver for domU
61  */
62 
63 #include <sys/types.h>
64 #include <sys/errno.h>
65 #include <sys/param.h>
66 #include <sys/sysmacros.h>
67 #include <sys/systm.h>
68 #include <sys/stream.h>
69 #include <sys/strsubr.h>
70 #include <sys/conf.h>
71 #include <sys/ddi.h>
72 #include <sys/devops.h>
73 #include <sys/sunddi.h>
74 #include <sys/sunndi.h>
75 #include <sys/dlpi.h>
76 #include <sys/ethernet.h>
77 #include <sys/strsun.h>
78 #include <sys/pattr.h>
79 #include <inet/ip.h>
80 #include <inet/ip_impl.h>
81 #include <sys/gld.h>
82 #include <sys/modctl.h>
83 #include <sys/mac_provider.h>
84 #include <sys/mac_ether.h>
85 #include <sys/bootinfo.h>
86 #include <sys/mach_mmu.h>
87 #ifdef	XPV_HVM_DRIVER
88 #include <sys/xpv_support.h>
89 #include <sys/hypervisor.h>
90 #else
91 #include <sys/hypervisor.h>
92 #include <sys/evtchn_impl.h>
93 #include <sys/balloon_impl.h>
94 #endif
95 #include <xen/public/io/netif.h>
96 #include <sys/gnttab.h>
97 #include <xen/sys/xendev.h>
98 #include <sys/sdt.h>
99 
100 #include <io/xnf.h>
101 
102 
103 /*
104  *  Declarations and Module Linkage
105  */
106 
/*
 * DEBUG and lint builds define XNF_DEBUG, which compiles in the tracing
 * printf()s found in the entry points below.  Each of those is further
 * gated at run time by a bit in xnfdebug (XNF_DEBUG_DDI, XNF_DEBUG_TRACE).
 */
#if defined(DEBUG) || defined(__lint)
#define	XNF_DEBUG
int	xnfdebug = 0;	/* run-time trace mask; 0 means no tracing */
#endif
111 
/*
 * On a 32 bit PAE system physical and machine addresses are larger
 * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 * argument, and so addresses above 4G are truncated before ddi_btop()
 * gets to see them.  To avoid this, code the shift operation here:
 * the shift is performed in the (wider) type of the argument itself.
 */
#define	xnf_btop(addr)	((addr) >> PAGESHIFT)
119 
/*
 * Default for checksum offload.  Copied into each instance at attach
 * time; xnf_read_config() then turns it off for the instance if the
 * backend advertises "feature-no-csum-offload".
 */
boolean_t	xnf_cksum_offload = B_TRUE;

/*
 * Default value for hypervisor-based copy operations.  An instance only
 * uses hypervisor copy if this is set AND xnf_hvcopy_peer_status()
 * reports that the peer supports it (see xnf_attach()).
 */
boolean_t	xnf_rx_hvcopy = B_TRUE;

/*
 * Should pages used for transmit be readonly for the peer?
 * Copied into each instance at attach time.
 */
boolean_t	xnf_tx_pages_readonly = B_FALSE;
/*
 * Packets under this size are bcopied instead of using desballoc.
 * Choose a value > XNF_FRAMESIZE (1514) to force the receive path to
 * always copy.
 */
unsigned int	xnf_rx_bcopy_thresh = 64;

/*
 * NOTE(review): not referenced in the part of the file visible here;
 * presumably the maximum number of fragments used for one transmitted
 * packet -- confirm against the transmit path.
 */
unsigned int	xnf_max_tx_frags = 1;
137 
/* Required system entry points (wired up through xnf_dev_ops below) */
static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);

/*
 * Required driver entry points for Nemo.  All of these except xnf_intr()
 * are installed in the xnf_callbacks table below; xnf_intr() is instead
 * registered as the interrupt/event-channel handler in xnf_attach().
 */
static int	xnf_start(void *);
static void	xnf_stop(void *);
static int	xnf_set_mac_addr(void *, const uint8_t *);
static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
static int	xnf_set_promiscuous(void *, boolean_t);
static mblk_t	*xnf_send(void *, mblk_t *);
static uint_t	xnf_intr(caddr_t);
static int	xnf_stat(void *, uint_t, uint64_t *);
static void	xnf_ioctl(void *, queue_t *, mblk_t *);
static boolean_t xnf_getcapab(void *, mac_capab_t, void *);

/*
 * Driver private functions.  Note that xnf_send_driver_status() is the
 * only one declared without "static", i.e. it has external linkage.
 */
static int xnf_alloc_dma_resources(xnf_t *);
static void xnf_release_dma_resources(xnf_t *);
static mblk_t *xnf_process_recv(xnf_t *);
static void xnf_rcv_complete(struct xnf_buffer_desc *);
static void xnf_release_mblks(xnf_t *);
static struct xnf_buffer_desc *xnf_alloc_tx_buffer(xnf_t *);
static struct xnf_buffer_desc *xnf_alloc_buffer(xnf_t *);
static struct xnf_buffer_desc *xnf_get_tx_buffer(xnf_t *);
static struct xnf_buffer_desc *xnf_get_buffer(xnf_t *);
static void xnf_free_buffer(struct xnf_buffer_desc *);
static void xnf_free_tx_buffer(struct xnf_buffer_desc *);
void xnf_send_driver_status(int, int);
static void rx_buffer_hang(xnf_t *, struct xnf_buffer_desc *);
static int xnf_clean_tx_ring(xnf_t  *);
static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static mblk_t *xnf_process_hvcopy_recv(xnf_t *xnfp);
static boolean_t xnf_hvcopy_peer_status(dev_info_t *devinfo);
static boolean_t xnf_kstat_init(xnf_t *xnfp);
174 
/*
 * Callback table handed to the MAC layer via macp->m_callbacks in
 * xnf_attach().  The first member is a flag word; the flags set here
 * (MC_IOCTL, MC_GETCAPAB) correspond to the xnf_ioctl and xnf_getcapab
 * entries at the end of the table.
 *
 * XXPV dme: remove MC_IOCTL?
 */
static mac_callbacks_t xnf_callbacks = {
	MC_IOCTL | MC_GETCAPAB,
	xnf_stat,
	xnf_start,
	xnf_stop,
	xnf_set_promiscuous,
	xnf_set_multicast,
	xnf_set_mac_addr,
	xnf_send,
	xnf_ioctl,
	xnf_getcapab
};
190 
/*
 * Grant reference value this driver uses to mean "no grant".  Fields
 * such as xnf_tx_ring_ref and tx_pktinfo.grant_ref are compared against
 * it to decide whether a grant needs to be torn down.
 */
#define	GRANT_INVALID_REF	0
/* Receive buffer low/high water marks, as multiples of the rx ring size */
const int xnf_rx_bufs_lowat = 4 * NET_RX_RING_SIZE;
const int xnf_rx_bufs_hiwat = 8 * NET_RX_RING_SIZE; /* default max */
194 
/*
 * DMA attributes for network ring buffer: page aligned, any address,
 * and restricted to a single segment.
 */
static ddi_dma_attr_t ringbuf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
210 
/*
 * DMA attributes for transmit data: page aligned, any address, and
 * restricted to a single segment (same values as ringbuf_dma_attr).
 */
static ddi_dma_attr_t tx_buffer_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
226 
/*
 * DMA attributes for a receive buffer: page aligned, any address, and
 * restricted to a single segment (same values as ringbuf_dma_attr).
 */
static ddi_dma_attr_t rx_buffer_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
242 
/*
 * DMA access attributes for registers and descriptors: structured
 * little-endian access with strict ordering.
 */
static ddi_device_acc_attr_t accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
	DDI_STRICTORDER_ACC
};
249 
/*
 * DMA access attributes for data: NOT to be byte swapped (packet
 * payload is opaque to the driver), strict ordering.
 */
static ddi_device_acc_attr_t data_accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};
256 
/* The all-ones Ethernet broadcast address */
unsigned char xnf_broadcastaddr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
int xnf_diagnose = 0; /* Patchable global for diagnostic purposes */
259 
/*
 * Device operations: only attach and detach are driver-specific; quiesce
 * is explicitly declared as not supported.
 */
DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
    nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);

/* Loadable-driver linkage: ties the description string to xnf_dev_ops */
static struct modldrv xnf_modldrv = {
	&mod_driverops,
	"Virtual Ethernet driver",
	&xnf_dev_ops
};

/* Module linkage passed to mod_install()/mod_info() */
static struct modlinkage modlinkage = {
	MODREV_1, &xnf_modldrv, NULL
};
272 
273 int
274 _init(void)
275 {
276 	int r;
277 
278 	mac_init_ops(&xnf_dev_ops, "xnf");
279 	r = mod_install(&modlinkage);
280 	if (r != DDI_SUCCESS)
281 		mac_fini_ops(&xnf_dev_ops);
282 
283 	return (r);
284 }
285 
/*
 * Module unload entry point.  Unloading is not supported: this always
 * returns EBUSY without attempting to remove the module.
 */
int
_fini(void)
{
	return (EBUSY); /* XXPV dme: should be removable */
}
291 
/*
 * Module information entry point: hands the request straight to
 * mod_info() with this module's linkage and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
297 
298 static int
299 xnf_setup_rings(xnf_t *xnfp)
300 {
301 	int			ix, err;
302 	RING_IDX		i;
303 	struct xnf_buffer_desc	*bdesc, *rbp;
304 	struct xenbus_device	*xsd;
305 	domid_t			oeid;
306 
307 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
308 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
309 
310 	if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF)
311 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
312 
313 	err = gnttab_grant_foreign_access(oeid,
314 	    xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
315 	if (err <= 0) {
316 		err = -err;
317 		xenbus_dev_error(xsd, err, "granting access to tx ring page");
318 		goto out;
319 	}
320 	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
321 
322 	if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF)
323 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
324 
325 	err = gnttab_grant_foreign_access(oeid,
326 	    xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
327 	if (err <= 0) {
328 		err = -err;
329 		xenbus_dev_error(xsd, err, "granting access to rx ring page");
330 		goto out;
331 	}
332 	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
333 
334 
335 	mutex_enter(&xnfp->xnf_intrlock);
336 
337 	/*
338 	 * Cleanup the TX ring.  We just clean up any valid tx_pktinfo structs
339 	 * and reset the ring.  Note that this can lose packets after a resume,
340 	 * but we expect to stagger on.
341 	 */
342 	mutex_enter(&xnfp->xnf_txlock);
343 
344 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
345 		struct tx_pktinfo *txp = &xnfp->xnf_tx_pkt_info[i];
346 
347 		txp->id = i + 1;
348 
349 		if (txp->grant_ref == GRANT_INVALID_REF) {
350 			ASSERT(txp->mp == NULL);
351 			ASSERT(txp->bdesc == NULL);
352 			continue;
353 		}
354 
355 		if (gnttab_query_foreign_access(txp->grant_ref) != 0)
356 			panic("tx grant still in use by backend domain");
357 
358 		freemsg(txp->mp);
359 		txp->mp = NULL;
360 
361 		(void) ddi_dma_unbind_handle(txp->dma_handle);
362 
363 		if (txp->bdesc != NULL) {
364 			xnf_free_tx_buffer(txp->bdesc);
365 			txp->bdesc = NULL;
366 		}
367 
368 		(void) gnttab_end_foreign_access_ref(txp->grant_ref,
369 		    xnfp->xnf_tx_pages_readonly);
370 		gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head,
371 		    txp->grant_ref);
372 		txp->grant_ref = GRANT_INVALID_REF;
373 	}
374 
375 	xnfp->xnf_tx_pkt_id_list = 0;
376 	xnfp->xnf_tx_ring.rsp_cons = 0;
377 	xnfp->xnf_tx_ring.req_prod_pvt = 0;
378 
379 	/* LINTED: constant in conditional context */
380 	SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
381 
382 	mutex_exit(&xnfp->xnf_txlock);
383 
384 	/*
385 	 * Rebuild the RX ring.  We have to rebuild the RX ring because some of
386 	 * our pages are currently flipped out/granted so we can't just free
387 	 * the RX buffers.  Reclaim any unprocessed recv buffers, they won't be
388 	 * useable anyway since the mfn's they refer to are no longer valid.
389 	 * Grant the backend domain access to each hung rx buffer.
390 	 */
391 	i = xnfp->xnf_rx_ring.rsp_cons;
392 	while (i++ != xnfp->xnf_rx_ring.sring->req_prod) {
393 		volatile netif_rx_request_t	*rxrp;
394 
395 		rxrp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, i);
396 		ix = rxrp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0);
397 		rbp = xnfp->xnf_rxpkt_bufptr[ix];
398 		if (rbp != NULL) {
399 			grant_ref_t	ref = rbp->grant_ref;
400 
401 			ASSERT(ref != GRANT_INVALID_REF);
402 			if (xnfp->xnf_rx_hvcopy) {
403 				pfn_t pfn = xnf_btop(rbp->buf_phys);
404 				mfn_t mfn = pfn_to_mfn(pfn);
405 
406 				gnttab_grant_foreign_access_ref(ref, oeid,
407 				    mfn, 0);
408 			} else {
409 				gnttab_grant_foreign_transfer_ref(ref,
410 				    oeid, 0);
411 			}
412 			rxrp->id = ix;
413 			rxrp->gref = ref;
414 		}
415 	}
416 
417 	/*
418 	 * Reset the ring pointers to initial state.
419 	 * Hang buffers for any empty ring slots.
420 	 */
421 	xnfp->xnf_rx_ring.rsp_cons = 0;
422 	xnfp->xnf_rx_ring.req_prod_pvt = 0;
423 
424 	/* LINTED: constant in conditional context */
425 	SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
426 
427 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
428 		xnfp->xnf_rx_ring.req_prod_pvt = i;
429 		if (xnfp->xnf_rxpkt_bufptr[i] != NULL)
430 			continue;
431 		if ((bdesc = xnf_get_buffer(xnfp)) == NULL)
432 			break;
433 		rx_buffer_hang(xnfp, bdesc);
434 	}
435 	xnfp->xnf_rx_ring.req_prod_pvt = i;
436 	/* LINTED: constant in conditional context */
437 	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
438 
439 	mutex_exit(&xnfp->xnf_intrlock);
440 
441 	return (0);
442 
443 out:
444 	if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF)
445 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
446 	xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF;
447 
448 	if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF)
449 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
450 	xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF;
451 
452 	return (err);
453 }
454 
455 
/*
 * Called when the upper layers free a message we passed upstream.
 *
 * Tears down everything associated with the buffer descriptor, in
 * dependency order: unbind the DMA handle, free the DMA memory, free the
 * DMA handle itself, and finally free the descriptor.  bdesc must not be
 * used by the caller afterwards.
 */
static void
xnf_copy_rcv_complete(struct xnf_buffer_desc *bdesc)
{
	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
	ddi_dma_mem_free(&bdesc->acc_handle);
	ddi_dma_free_handle(&bdesc->dma_handle);
	kmem_free(bdesc, sizeof (*bdesc));
}
465 
466 
467 /*
468  * Connect driver to back end, called to set up communication with
469  * back end driver both initially and on resume after restore/migrate.
470  */
471 void
472 xnf_be_connect(xnf_t *xnfp)
473 {
474 	const char	*message;
475 	xenbus_transaction_t xbt;
476 	struct		xenbus_device *xsd;
477 	char		*xsname;
478 	int		err;
479 
480 	ASSERT(!xnfp->xnf_connected);
481 
482 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
483 	xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
484 
485 	err = xnf_setup_rings(xnfp);
486 	if (err != 0) {
487 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
488 		xenbus_dev_error(xsd, err, "setting up ring");
489 		return;
490 	}
491 
492 again:
493 	err = xenbus_transaction_start(&xbt);
494 	if (err != 0) {
495 		xenbus_dev_error(xsd, EIO, "starting transaction");
496 		return;
497 	}
498 
499 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
500 	    xnfp->xnf_tx_ring_ref);
501 	if (err != 0) {
502 		message = "writing tx ring-ref";
503 		goto abort_transaction;
504 	}
505 
506 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
507 	    xnfp->xnf_rx_ring_ref);
508 	if (err != 0) {
509 		message = "writing rx ring-ref";
510 		goto abort_transaction;
511 	}
512 
513 	err = xenbus_printf(xbt, xsname, "event-channel", "%u",
514 	    xnfp->xnf_evtchn);
515 	if (err != 0) {
516 		message = "writing event-channel";
517 		goto abort_transaction;
518 	}
519 
520 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
521 	if (err != 0) {
522 		message = "writing feature-rx-notify";
523 		goto abort_transaction;
524 	}
525 
526 	if (!xnfp->xnf_tx_pages_readonly) {
527 		err = xenbus_printf(xbt, xsname, "feature-tx-writable",
528 		    "%d", 1);
529 		if (err != 0) {
530 			message = "writing feature-tx-writable";
531 			goto abort_transaction;
532 		}
533 	}
534 
535 	err = xenbus_printf(xbt, xsname, "feature-no-csum-offload", "%d",
536 	    xnfp->xnf_cksum_offload ? 0 : 1);
537 	if (err != 0) {
538 		message = "writing feature-no-csum-offload";
539 		goto abort_transaction;
540 	}
541 	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d",
542 	    xnfp->xnf_rx_hvcopy ? 1 : 0);
543 	if (err != 0) {
544 		message = "writing request-rx-copy";
545 		goto abort_transaction;
546 	}
547 
548 	err = xenbus_printf(xbt, xsname, "state", "%d", XenbusStateConnected);
549 	if (err != 0) {
550 		message = "writing frontend XenbusStateConnected";
551 		goto abort_transaction;
552 	}
553 
554 	err = xenbus_transaction_end(xbt, 0);
555 	if (err != 0) {
556 		if (err == EAGAIN)
557 			goto again;
558 		xenbus_dev_error(xsd, err, "completing transaction");
559 	}
560 
561 	return;
562 
563 abort_transaction:
564 	(void) xenbus_transaction_end(xbt, 1);
565 	xenbus_dev_error(xsd, err, "%s", message);
566 }
567 
568 /*
569  * Read config info from xenstore
570  */
571 void
572 xnf_read_config(xnf_t *xnfp)
573 {
574 	char		mac[ETHERADDRL * 3];
575 	int		err, be_no_cksum_offload;
576 
577 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo), "mac",
578 	    "%s", (char *)&mac[0]);
579 	if (err != 0) {
580 		/*
581 		 * bad: we're supposed to be set up with a proper mac
582 		 * addr. at this point
583 		 */
584 		cmn_err(CE_WARN, "%s%d: no mac address",
585 		    ddi_driver_name(xnfp->xnf_devinfo),
586 		    ddi_get_instance(xnfp->xnf_devinfo));
587 			return;
588 	}
589 	if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
590 		err = ENOENT;
591 		xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
592 		    "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
593 		return;
594 	}
595 
596 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo),
597 	    "feature-no-csum-offload", "%d", &be_no_cksum_offload);
598 	/*
599 	 * If we fail to read the store we assume that the key is
600 	 * absent, implying an older domain at the far end.  Older
601 	 * domains always support checksum offload.
602 	 */
603 	if (err != 0)
604 		be_no_cksum_offload = 0;
605 	/*
606 	 * If the far end cannot do checksum offload or we do not wish
607 	 * to do it, disable it.
608 	 */
609 	if ((be_no_cksum_offload == 1) || !xnfp->xnf_cksum_offload)
610 		xnfp->xnf_cksum_offload = B_FALSE;
611 }
612 
613 /*
614  *  attach(9E) -- Attach a device to the system
615  *
616  *  Called once for each board successfully probed.
617  */
618 static int
619 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
620 {
621 	mac_register_t *macp;
622 	xnf_t *xnfp;
623 	int err;
624 
625 #ifdef XNF_DEBUG
626 	if (xnfdebug & XNF_DEBUG_DDI)
627 		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
628 		    (void *)devinfo);
629 #endif
630 
631 	switch (cmd) {
632 	case DDI_RESUME:
633 		xnfp = ddi_get_driver_private(devinfo);
634 
635 		(void) xvdi_resume(devinfo);
636 		(void) xvdi_alloc_evtchn(devinfo);
637 		xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
638 #ifdef XPV_HVM_DRIVER
639 		ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
640 		    xnfp);
641 #else
642 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
643 		    (caddr_t)xnfp);
644 #endif
645 		xnf_be_connect(xnfp);
646 		/*
647 		 * Our MAC address may have changed if we're resuming:
648 		 * - on a different host
649 		 * - on the same one and got a different MAC address
650 		 *   because we didn't specify one of our own.
651 		 * so it's useful to claim that it changed in order that
652 		 * IP send out a gratuitous ARP.
653 		 */
654 		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
655 		return (DDI_SUCCESS);
656 
657 	case DDI_ATTACH:
658 		break;
659 
660 	default:
661 		return (DDI_FAILURE);
662 	}
663 
664 	/*
665 	 *  Allocate gld_mac_info_t and xnf_instance structures
666 	 */
667 	macp = mac_alloc(MAC_VERSION);
668 	if (macp == NULL)
669 		return (DDI_FAILURE);
670 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
671 
672 	macp->m_dip = devinfo;
673 	macp->m_driver = xnfp;
674 	xnfp->xnf_devinfo = devinfo;
675 
676 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
677 	macp->m_src_addr = xnfp->xnf_mac_addr;
678 	macp->m_callbacks = &xnf_callbacks;
679 	macp->m_min_sdu = 0;
680 	macp->m_max_sdu = XNF_MAXPKT;
681 
682 	xnfp->xnf_running = B_FALSE;
683 	xnfp->xnf_connected = B_FALSE;
684 	xnfp->xnf_cksum_offload = xnf_cksum_offload;
685 	xnfp->xnf_tx_pages_readonly = xnf_tx_pages_readonly;
686 	xnfp->xnf_need_sched = B_FALSE;
687 
688 	xnfp->xnf_rx_hvcopy = xnf_hvcopy_peer_status(devinfo) && xnf_rx_hvcopy;
689 #ifdef XPV_HVM_DRIVER
690 	/*
691 	 * Report our version to dom0.
692 	 */
693 	if (xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
694 	    HVMPV_XNF_VERS))
695 		cmn_err(CE_WARN, "xnf: couldn't write version\n");
696 
697 	if (!xnfp->xnf_rx_hvcopy) {
698 		cmn_err(CE_WARN, "The xnf driver requires a dom0 that "
699 		    "supports 'feature-rx-copy'");
700 		goto failure;
701 	}
702 #endif
703 
704 	/*
705 	 * Get the iblock cookie with which to initialize the mutexes.
706 	 */
707 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
708 	    != DDI_SUCCESS)
709 		goto failure;
710 	/*
711 	 * Driver locking strategy: the txlock protects all paths
712 	 * through the driver, except the interrupt thread.
713 	 * If the interrupt thread needs to do something which could
714 	 * affect the operation of any other part of the driver,
715 	 * it needs to acquire the txlock mutex.
716 	 */
717 	mutex_init(&xnfp->xnf_tx_buf_mutex,
718 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
719 	mutex_init(&xnfp->xnf_rx_buf_mutex,
720 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
721 	mutex_init(&xnfp->xnf_txlock,
722 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
723 	mutex_init(&xnfp->xnf_intrlock,
724 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
725 	cv_init(&xnfp->xnf_cv, NULL, CV_DEFAULT, NULL);
726 
727 	xnfp->xnf_gref_tx_head = (grant_ref_t)-1;
728 	xnfp->xnf_gref_rx_head = (grant_ref_t)-1;
729 	if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
730 	    &xnfp->xnf_gref_tx_head) < 0) {
731 		cmn_err(CE_WARN, "xnf%d: can't alloc tx grant refs",
732 		    ddi_get_instance(xnfp->xnf_devinfo));
733 		goto failure_1;
734 	}
735 	if (gnttab_alloc_grant_references(NET_RX_RING_SIZE,
736 	    &xnfp->xnf_gref_rx_head) < 0) {
737 		cmn_err(CE_WARN, "xnf%d: can't alloc rx grant refs",
738 		    ddi_get_instance(xnfp->xnf_devinfo));
739 		goto failure_1;
740 	}
741 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
742 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
743 		    "driver data structures",
744 		    ddi_get_instance(xnfp->xnf_devinfo));
745 		goto failure_1;
746 	}
747 
748 	xnfp->xnf_rx_ring.sring->rsp_event =
749 	    xnfp->xnf_tx_ring.sring->rsp_event = 1;
750 
751 	xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF;
752 	xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF;
753 
754 	/* set driver private pointer now */
755 	ddi_set_driver_private(devinfo, xnfp);
756 
757 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
758 	    != DDI_SUCCESS)
759 		goto failure_1;
760 
761 	if (!xnf_kstat_init(xnfp))
762 		goto failure_2;
763 
764 	/*
765 	 * Allocate an event channel, add the interrupt handler and
766 	 * bind it to the event channel.
767 	 */
768 	(void) xvdi_alloc_evtchn(devinfo);
769 	xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
770 #ifdef XPV_HVM_DRIVER
771 	ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
772 #else
773 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
774 #endif
775 
776 	xnf_read_config(xnfp);
777 	err = mac_register(macp, &xnfp->xnf_mh);
778 	mac_free(macp);
779 	macp = NULL;
780 	if (err != 0)
781 		goto failure_3;
782 
783 #ifdef XPV_HVM_DRIVER
784 	/*
785 	 * In the HVM case, this driver essentially replaces a driver for
786 	 * a 'real' PCI NIC. Without the "model" property set to
787 	 * "Ethernet controller", like the PCI code does, netbooting does
788 	 * not work correctly, as strplumb_get_netdev_path() will not find
789 	 * this interface.
790 	 */
791 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
792 	    "Ethernet controller");
793 #endif
794 
795 	/*
796 	 * connect to the backend
797 	 */
798 	xnf_be_connect(xnfp);
799 
800 	return (DDI_SUCCESS);
801 
802 failure_3:
803 	kstat_delete(xnfp->xnf_kstat_aux);
804 #ifdef XPV_HVM_DRIVER
805 	ec_unbind_evtchn(xnfp->xnf_evtchn);
806 	xvdi_free_evtchn(devinfo);
807 #else
808 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
809 #endif
810 	xnfp->xnf_evtchn = INVALID_EVTCHN;
811 
812 failure_2:
813 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
814 
815 failure_1:
816 	if (xnfp->xnf_gref_tx_head != (grant_ref_t)-1)
817 		gnttab_free_grant_references(xnfp->xnf_gref_tx_head);
818 	if (xnfp->xnf_gref_rx_head != (grant_ref_t)-1)
819 		gnttab_free_grant_references(xnfp->xnf_gref_rx_head);
820 	xnf_release_dma_resources(xnfp);
821 	cv_destroy(&xnfp->xnf_cv);
822 	mutex_destroy(&xnfp->xnf_rx_buf_mutex);
823 	mutex_destroy(&xnfp->xnf_txlock);
824 	mutex_destroy(&xnfp->xnf_intrlock);
825 
826 failure:
827 	kmem_free(xnfp, sizeof (*xnfp));
828 	if (macp != NULL)
829 		mac_free(macp);
830 
831 	return (DDI_FAILURE);
832 }
833 
834 /*  detach(9E) -- Detach a device from the system */
835 static int
836 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
837 {
838 	xnf_t *xnfp;		/* Our private device info */
839 	int i;
840 
841 #ifdef XNF_DEBUG
842 	if (xnfdebug & XNF_DEBUG_DDI)
843 		printf("xnf_detach(0x%p)\n", (void *)devinfo);
844 #endif
845 
846 	xnfp = ddi_get_driver_private(devinfo);
847 
848 	switch (cmd) {
849 	case DDI_SUSPEND:
850 #ifdef XPV_HVM_DRIVER
851 		ec_unbind_evtchn(xnfp->xnf_evtchn);
852 		xvdi_free_evtchn(devinfo);
853 #else
854 		ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
855 #endif
856 
857 		xvdi_suspend(devinfo);
858 
859 		mutex_enter(&xnfp->xnf_intrlock);
860 		mutex_enter(&xnfp->xnf_txlock);
861 
862 		xnfp->xnf_evtchn = INVALID_EVTCHN;
863 		xnfp->xnf_connected = B_FALSE;
864 		mutex_exit(&xnfp->xnf_txlock);
865 		mutex_exit(&xnfp->xnf_intrlock);
866 
867 		/* claim link to be down after disconnect */
868 		mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
869 		return (DDI_SUCCESS);
870 
871 	case DDI_DETACH:
872 		break;
873 
874 	default:
875 		return (DDI_FAILURE);
876 	}
877 
878 	if (xnfp->xnf_connected)
879 		return (DDI_FAILURE);
880 
881 	/* Wait for receive buffers to be returned; give up after 5 seconds */
882 	i = 50;
883 
884 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
885 	while (xnfp->xnf_rx_bufs_outstanding > 0) {
886 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
887 		delay(drv_usectohz(100000));
888 		if (--i == 0) {
889 			cmn_err(CE_WARN,
890 			    "xnf%d: never reclaimed all the "
891 			    "receive buffers.  Still have %d "
892 			    "buffers outstanding.",
893 			    ddi_get_instance(xnfp->xnf_devinfo),
894 			    xnfp->xnf_rx_bufs_outstanding);
895 			return (DDI_FAILURE);
896 		}
897 		mutex_enter(&xnfp->xnf_rx_buf_mutex);
898 	}
899 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
900 
901 	if (mac_unregister(xnfp->xnf_mh) != 0)
902 		return (DDI_FAILURE);
903 
904 	kstat_delete(xnfp->xnf_kstat_aux);
905 
906 	/* Stop the receiver */
907 	xnf_stop(xnfp);
908 
909 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
910 
911 	/* Remove the interrupt */
912 #ifdef XPV_HVM_DRIVER
913 	ec_unbind_evtchn(xnfp->xnf_evtchn);
914 	xvdi_free_evtchn(devinfo);
915 #else
916 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
917 #endif
918 
919 	/* Release any pending xmit mblks */
920 	xnf_release_mblks(xnfp);
921 
922 	/* Release all DMA resources */
923 	xnf_release_dma_resources(xnfp);
924 
925 	cv_destroy(&xnfp->xnf_cv);
926 	mutex_destroy(&xnfp->xnf_rx_buf_mutex);
927 	mutex_destroy(&xnfp->xnf_txlock);
928 	mutex_destroy(&xnfp->xnf_intrlock);
929 
930 	kmem_free(xnfp, sizeof (*xnfp));
931 
932 	return (DDI_SUCCESS);
933 }
934 
935 /*
936  *  xnf_set_mac_addr() -- set the physical network address on the board.
937  */
938 /*ARGSUSED*/
939 static int
940 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
941 {
942 	xnf_t *xnfp = arg;
943 
944 #ifdef XNF_DEBUG
945 	if (xnfdebug & XNF_DEBUG_TRACE)
946 		printf("xnf%d: set_mac_addr(0x%p): "
947 		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
948 		    ddi_get_instance(xnfp->xnf_devinfo),
949 		    (void *)xnfp, macaddr[0], macaddr[1], macaddr[2],
950 		    macaddr[3], macaddr[4], macaddr[5]);
951 #endif
952 	/*
953 	 * We can't set our macaddr.
954 	 *
955 	 * XXPV dme: Why not?
956 	 */
957 	return (ENOTSUP);
958 }
959 
960 /*
961  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
962  *
963  *  Program the hardware to enable/disable the multicast address
964  *  in "mcast".  Enable if "add" is true, disable if false.
965  */
966 /*ARGSUSED*/
967 static int
968 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
969 {
970 	xnf_t *xnfp = arg;
971 
972 #ifdef XNF_DEBUG
973 	if (xnfdebug & XNF_DEBUG_TRACE)
974 		printf("xnf%d set_multicast(0x%p): "
975 		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
976 		    ddi_get_instance(xnfp->xnf_devinfo),
977 		    (void *)xnfp, mca[0], mca[1], mca[2],
978 		    mca[3], mca[4], mca[5]);
979 #endif
980 
981 	/*
982 	 * XXPV dme: Ideally we'd relay the address to the backend for
983 	 * enabling.  The protocol doesn't support that (interesting
984 	 * extension), so we simply succeed and hope that the relevant
985 	 * packets are going to arrive.
986 	 *
987 	 * If protocol support is added for enable/disable then we'll
988 	 * need to keep a list of those in use and re-add on resume.
989 	 */
990 	return (0);
991 }
992 
993 /*
994  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
995  *
996  *  Program the hardware to enable/disable promiscuous mode.
997  */
998 /*ARGSUSED*/
999 static int
1000 xnf_set_promiscuous(void *arg, boolean_t on)
1001 {
1002 	xnf_t *xnfp = arg;
1003 
1004 #ifdef XNF_DEBUG
1005 	if (xnfdebug & XNF_DEBUG_TRACE)
1006 		printf("xnf%d set_promiscuous(0x%p, %x)\n",
1007 		    ddi_get_instance(xnfp->xnf_devinfo),
1008 		    (void *)xnfp, on);
1009 #endif
1010 	/*
1011 	 * We can't really do this, but we pretend that we can in
1012 	 * order that snoop will work.
1013 	 */
1014 	return (0);
1015 }
1016 
1017 /*
1018  * Clean buffers that we have responses for from the transmit ring.
1019  */
1020 static int
1021 xnf_clean_tx_ring(xnf_t *xnfp)
1022 {
1023 	RING_IDX		next_resp, i;
1024 	struct tx_pktinfo	*reap;
1025 	int			id;
1026 	grant_ref_t		ref;
1027 	boolean_t		work_to_do;
1028 
1029 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1030 
1031 loop:
1032 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
1033 		/*
1034 		 * index of next transmission ack
1035 		 */
1036 		next_resp = xnfp->xnf_tx_ring.sring->rsp_prod;
1037 		membar_consumer();
1038 		/*
1039 		 * Clean tx packets from ring that we have responses for
1040 		 */
1041 		for (i = xnfp->xnf_tx_ring.rsp_cons; i != next_resp; i++) {
1042 			id = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i)->id;
1043 			reap = &xnfp->xnf_tx_pkt_info[id];
1044 			ref = reap->grant_ref;
1045 			/*
1046 			 * Return id to free list
1047 			 */
1048 			reap->id = xnfp->xnf_tx_pkt_id_list;
1049 			xnfp->xnf_tx_pkt_id_list = id;
1050 			if (gnttab_query_foreign_access(ref) != 0)
1051 				panic("tx grant still in use "
1052 				    "by backend domain");
1053 			(void) ddi_dma_unbind_handle(reap->dma_handle);
1054 			(void) gnttab_end_foreign_access_ref(ref,
1055 			    xnfp->xnf_tx_pages_readonly);
1056 			gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head,
1057 			    ref);
1058 			freemsg(reap->mp);
1059 			reap->mp = NULL;
1060 			reap->grant_ref = GRANT_INVALID_REF;
1061 			if (reap->bdesc != NULL)
1062 				xnf_free_tx_buffer(reap->bdesc);
1063 			reap->bdesc = NULL;
1064 		}
1065 		xnfp->xnf_tx_ring.rsp_cons = next_resp;
1066 		membar_enter();
1067 	}
1068 
1069 	/* LINTED: constant in conditional context */
1070 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
1071 	if (work_to_do)
1072 		goto loop;
1073 
1074 	return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
1075 }
1076 
1077 /*
1078  * If we need to pull up data from either a packet that crosses a page
1079  * boundary or consisting of multiple mblks, do it here.  We allocate
1080  * a page aligned buffer and copy the data into it.  The header for the
1081  * allocated buffer is returned. (which is also allocated here)
1082  */
1083 static struct xnf_buffer_desc *
1084 xnf_pullupmsg(xnf_t *xnfp, mblk_t *mp)
1085 {
1086 	struct xnf_buffer_desc	*bdesc;
1087 	mblk_t			*mptr;
1088 	caddr_t			bp;
1089 	int			len;
1090 
1091 	/*
1092 	 * get a xmit buffer from the xmit buffer pool
1093 	 */
1094 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
1095 	bdesc = xnf_get_tx_buffer(xnfp);
1096 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
1097 	if (bdesc == NULL)
1098 		return (bdesc);
1099 	/*
1100 	 * Copy the data into the buffer
1101 	 */
1102 	xnfp->xnf_stat_tx_pullup++;
1103 	bp = bdesc->buf;
1104 	for (mptr = mp; mptr != NULL; mptr = mptr->b_cont) {
1105 		len = mptr->b_wptr - mptr->b_rptr;
1106 		bcopy(mptr->b_rptr, bp, len);
1107 		bp += len;
1108 	}
1109 	return (bdesc);
1110 }
1111 
1112 void
1113 xnf_pseudo_cksum(caddr_t buf, int length)
1114 {
1115 	struct ether_header *ehp;
1116 	uint16_t sap, len, *stuff;
1117 	uint32_t cksum;
1118 	size_t offset;
1119 	ipha_t *ipha;
1120 	ipaddr_t src, dst;
1121 
1122 	ASSERT(length >= sizeof (*ehp));
1123 	ehp = (struct ether_header *)buf;
1124 
1125 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
1126 		struct ether_vlan_header *evhp;
1127 
1128 		ASSERT(length >= sizeof (*evhp));
1129 		evhp = (struct ether_vlan_header *)buf;
1130 		sap = ntohs(evhp->ether_type);
1131 		offset = sizeof (*evhp);
1132 	} else {
1133 		sap = ntohs(ehp->ether_type);
1134 		offset = sizeof (*ehp);
1135 	}
1136 
1137 	ASSERT(sap == ETHERTYPE_IP);
1138 
1139 	/* Packet should have been pulled up by the caller. */
1140 	if ((offset + sizeof (ipha_t)) > length) {
1141 		cmn_err(CE_WARN, "xnf_pseudo_cksum: no room for checksum");
1142 		return;
1143 	}
1144 
1145 	ipha = (ipha_t *)(buf + offset);
1146 
1147 	ASSERT(IPH_HDR_LENGTH(ipha) == IP_SIMPLE_HDR_LENGTH);
1148 
1149 	len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
1150 
1151 	switch (ipha->ipha_protocol) {
1152 	case IPPROTO_TCP:
1153 		stuff = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
1154 		cksum = IP_TCP_CSUM_COMP;
1155 		break;
1156 	case IPPROTO_UDP:
1157 		stuff = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
1158 		cksum = IP_UDP_CSUM_COMP;
1159 		break;
1160 	default:
1161 		cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
1162 		    ipha->ipha_protocol);
1163 		return;
1164 	}
1165 
1166 	src = ipha->ipha_src;
1167 	dst = ipha->ipha_dst;
1168 
1169 	cksum += (dst >> 16) + (dst & 0xFFFF);
1170 	cksum += (src >> 16) + (src & 0xFFFF);
1171 	cksum += htons(len);
1172 
1173 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1174 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1175 
1176 	ASSERT(cksum <= 0xFFFF);
1177 
1178 	*stuff = (uint16_t)(cksum ? cksum : ~cksum);
1179 }
1180 
1181 /*
1182  *  xnf_send_one() -- send a packet
1183  *
1184  *  Called when a packet is ready to be transmitted. A pointer to an
1185  *  M_DATA message that contains the packet is passed to this routine.
1186  *  At least the complete LLC header is contained in the message's
1187  *  first message block, and the remainder of the packet is contained
1188  *  within additional M_DATA message blocks linked to the first
1189  *  message block.
1190  *
1191  */
1192 static boolean_t
1193 xnf_send_one(xnf_t *xnfp, mblk_t *mp)
1194 {
1195 	struct xnf_buffer_desc	*xmitbuf;
1196 	struct tx_pktinfo	*txp_info;
1197 	mblk_t			*mptr;
1198 	ddi_dma_cookie_t	dma_cookie;
1199 	RING_IDX		slot;
1200 	int			length = 0, i, pktlen = 0, rc, tx_id;
1201 	int			tx_ring_freespace, page_oops;
1202 	uint_t			ncookies;
1203 	volatile netif_tx_request_t	*txrp;
1204 	caddr_t			bufaddr;
1205 	grant_ref_t		ref;
1206 	unsigned long		mfn;
1207 	uint32_t		pflags;
1208 	domid_t			oeid;
1209 
1210 #ifdef XNF_DEBUG
1211 	if (xnfdebug & XNF_DEBUG_SEND)
1212 		printf("xnf%d send(0x%p, 0x%p)\n",
1213 		    ddi_get_instance(xnfp->xnf_devinfo),
1214 		    (void *)xnfp, (void *)mp);
1215 #endif
1216 
1217 	ASSERT(mp != NULL);
1218 	ASSERT(mp->b_next == NULL);
1219 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1220 
1221 	tx_ring_freespace = xnf_clean_tx_ring(xnfp);
1222 	ASSERT(tx_ring_freespace >= 0);
1223 
1224 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1225 	xnfp->xnf_stat_tx_attempt++;
1226 	/*
1227 	 * If there are no xmit ring slots available, return.
1228 	 */
1229 	if (tx_ring_freespace == 0) {
1230 		xnfp->xnf_stat_tx_defer++;
1231 		return (B_FALSE);	/* Send should be retried */
1232 	}
1233 
1234 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1235 	/* Count the number of mblks in message and compute packet size */
1236 	for (i = 0, mptr = mp; mptr != NULL; mptr = mptr->b_cont, i++)
1237 		pktlen += (mptr->b_wptr - mptr->b_rptr);
1238 
1239 	/* Make sure packet isn't too large */
1240 	if (pktlen > XNF_FRAMESIZE) {
1241 		cmn_err(CE_WARN, "xnf%d: oversized packet (%d bytes) dropped",
1242 		    ddi_get_instance(xnfp->xnf_devinfo), pktlen);
1243 		freemsg(mp);
1244 		return (B_TRUE);
1245 	}
1246 
1247 	/*
1248 	 * Test if we cross a page boundary with our buffer
1249 	 */
1250 	page_oops = (i == 1) &&
1251 	    (xnf_btop((size_t)mp->b_rptr) !=
1252 	    xnf_btop((size_t)(mp->b_rptr + pktlen)));
1253 	/*
1254 	 * XXPV - unfortunately, the Xen virtual net device currently
1255 	 * doesn't support multiple packet frags, so this will always
1256 	 * end up doing the pullup if we got more than one packet.
1257 	 */
1258 	if (i > xnf_max_tx_frags || page_oops) {
1259 		if (page_oops)
1260 			xnfp->xnf_stat_tx_pagebndry++;
1261 		if ((xmitbuf = xnf_pullupmsg(xnfp, mp)) == NULL) {
1262 			/* could not allocate resources? */
1263 #ifdef XNF_DEBUG
1264 			cmn_err(CE_WARN, "xnf%d: pullupmsg failed",
1265 			    ddi_get_instance(xnfp->xnf_devinfo));
1266 #endif
1267 			xnfp->xnf_stat_tx_defer++;
1268 			return (B_FALSE);	/* Retry send */
1269 		}
1270 		bufaddr = xmitbuf->buf;
1271 	} else {
1272 		xmitbuf = NULL;
1273 		bufaddr = (caddr_t)mp->b_rptr;
1274 	}
1275 
1276 	/* set up data descriptor */
1277 	length = pktlen;
1278 
1279 	/*
1280 	 * Get packet id from free list
1281 	 */
1282 	tx_id = xnfp->xnf_tx_pkt_id_list;
1283 	ASSERT(tx_id < NET_TX_RING_SIZE);
1284 	txp_info = &xnfp->xnf_tx_pkt_info[tx_id];
1285 	xnfp->xnf_tx_pkt_id_list = txp_info->id;
1286 	txp_info->id = tx_id;
1287 
1288 	/* Prepare for DMA mapping of tx buffer(s) */
1289 	rc = ddi_dma_addr_bind_handle(txp_info->dma_handle,
1290 	    NULL, bufaddr, length, DDI_DMA_WRITE | DDI_DMA_STREAMING,
1291 	    DDI_DMA_DONTWAIT, 0, &dma_cookie, &ncookies);
1292 	if (rc != DDI_DMA_MAPPED) {
1293 		ASSERT(rc != DDI_DMA_INUSE);
1294 		ASSERT(rc != DDI_DMA_PARTIAL_MAP);
1295 		/*
1296 		 *  Return id to free list
1297 		 */
1298 		txp_info->id = xnfp->xnf_tx_pkt_id_list;
1299 		xnfp->xnf_tx_pkt_id_list = tx_id;
1300 		if (rc == DDI_DMA_NORESOURCES) {
1301 			xnfp->xnf_stat_tx_defer++;
1302 			return (B_FALSE); /* Retry later */
1303 		}
1304 #ifdef XNF_DEBUG
1305 		cmn_err(CE_WARN, "xnf%d: bind_handle failed (%x)",
1306 		    ddi_get_instance(xnfp->xnf_devinfo), rc);
1307 #endif
1308 		return (B_FALSE);
1309 	}
1310 
1311 	ASSERT(ncookies == 1);
1312 	ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_tx_head);
1313 	ASSERT((signed short)ref >= 0);
1314 	mfn = xnf_btop(pa_to_ma((paddr_t)dma_cookie.dmac_laddress));
1315 	gnttab_grant_foreign_access_ref(ref, oeid, mfn,
1316 	    xnfp->xnf_tx_pages_readonly);
1317 	txp_info->grant_ref = ref;
1318 	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1319 	txrp->gref = ref;
1320 	txrp->size = dma_cookie.dmac_size;
1321 	txrp->offset = (uintptr_t)bufaddr & PAGEOFFSET;
1322 	txrp->id = tx_id;
1323 	txrp->flags = 0;
1324 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &pflags);
1325 	if (pflags != 0) {
1326 		ASSERT(xnfp->xnf_cksum_offload);
1327 		/*
1328 		 * If the local protocol stack requests checksum
1329 		 * offload we set the 'checksum blank' flag,
1330 		 * indicating to the peer that we need the checksum
1331 		 * calculated for us.
1332 		 *
1333 		 * We _don't_ set the validated flag, because we haven't
1334 		 * validated that the data and the checksum match.
1335 		 */
1336 		xnf_pseudo_cksum(bufaddr, length);
1337 		txrp->flags |= NETTXF_csum_blank;
1338 		xnfp->xnf_stat_tx_cksum_deferred++;
1339 	}
1340 	membar_producer();
1341 	xnfp->xnf_tx_ring.req_prod_pvt = slot + 1;
1342 
1343 	txp_info->mp = mp;
1344 	txp_info->bdesc = xmitbuf;
1345 
1346 	xnfp->xnf_stat_opackets++;
1347 	xnfp->xnf_stat_obytes += pktlen;
1348 
1349 	return (B_TRUE);	/* successful transmit attempt */
1350 }
1351 
1352 mblk_t *
1353 xnf_send(void *arg, mblk_t *mp)
1354 {
1355 	xnf_t *xnfp = arg;
1356 	mblk_t *next;
1357 	boolean_t sent_something = B_FALSE;
1358 
1359 	mutex_enter(&xnfp->xnf_txlock);
1360 
1361 	/*
1362 	 * Transmission attempts should be impossible without having
1363 	 * previously called xnf_start().
1364 	 */
1365 	ASSERT(xnfp->xnf_running);
1366 
1367 	/*
1368 	 * Wait for getting connected to the backend
1369 	 */
1370 	while (!xnfp->xnf_connected) {
1371 		cv_wait(&xnfp->xnf_cv, &xnfp->xnf_txlock);
1372 	}
1373 
1374 	while (mp != NULL) {
1375 		next = mp->b_next;
1376 		mp->b_next = NULL;
1377 
1378 		if (!xnf_send_one(xnfp, mp)) {
1379 			mp->b_next = next;
1380 			break;
1381 		}
1382 
1383 		mp = next;
1384 		sent_something = B_TRUE;
1385 	}
1386 
1387 	if (sent_something) {
1388 		boolean_t notify;
1389 
1390 		/* LINTED: constant in conditional context */
1391 		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1392 		    notify);
1393 		if (notify)
1394 			ec_notify_via_evtchn(xnfp->xnf_evtchn);
1395 	}
1396 
1397 	if (mp != NULL)
1398 		xnfp->xnf_need_sched = B_TRUE;
1399 
1400 	mutex_exit(&xnfp->xnf_txlock);
1401 
1402 	return (mp);
1403 }
1404 
1405 /*
1406  *  xnf_intr() -- ring interrupt service routine
1407  */
1408 static uint_t
1409 xnf_intr(caddr_t arg)
1410 {
1411 	xnf_t *xnfp = (xnf_t *)arg;
1412 	boolean_t sched = B_FALSE;
1413 
1414 	mutex_enter(&xnfp->xnf_intrlock);
1415 
1416 	/* spurious intr */
1417 	if (!xnfp->xnf_connected) {
1418 		mutex_exit(&xnfp->xnf_intrlock);
1419 		xnfp->xnf_stat_unclaimed_interrupts++;
1420 		return (DDI_INTR_UNCLAIMED);
1421 	}
1422 
1423 #ifdef XNF_DEBUG
1424 	if (xnfdebug & XNF_DEBUG_INT)
1425 		printf("xnf%d intr(0x%p)\n",
1426 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1427 #endif
1428 	if (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1429 		mblk_t *mp;
1430 
1431 		if (xnfp->xnf_rx_hvcopy)
1432 			mp = xnf_process_hvcopy_recv(xnfp);
1433 		else
1434 			mp = xnf_process_recv(xnfp);
1435 
1436 		if (mp != NULL)
1437 			mac_rx(xnfp->xnf_mh, NULL, mp);
1438 	}
1439 
1440 	xnfp->xnf_stat_interrupts++;
1441 	mutex_exit(&xnfp->xnf_intrlock);
1442 
1443 	/*
1444 	 * Clean tx ring and try to start any blocked xmit streams if
1445 	 * there is now some space.
1446 	 */
1447 	mutex_enter(&xnfp->xnf_txlock);
1448 	if (xnf_clean_tx_ring(xnfp) > 0) {
1449 		sched = xnfp->xnf_need_sched;
1450 		xnfp->xnf_need_sched = B_FALSE;
1451 	}
1452 	mutex_exit(&xnfp->xnf_txlock);
1453 
1454 	if (sched)
1455 		mac_tx_update(xnfp->xnf_mh);
1456 
1457 	return (DDI_INTR_CLAIMED);
1458 }
1459 
1460 /*
1461  *  xnf_start() -- start the board receiving and enable interrupts.
1462  */
1463 static int
1464 xnf_start(void *arg)
1465 {
1466 	xnf_t *xnfp = arg;
1467 
1468 #ifdef XNF_DEBUG
1469 	if (xnfdebug & XNF_DEBUG_TRACE)
1470 		printf("xnf%d start(0x%p)\n",
1471 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1472 #endif
1473 
1474 	mutex_enter(&xnfp->xnf_intrlock);
1475 	mutex_enter(&xnfp->xnf_txlock);
1476 
1477 	/* Accept packets from above. */
1478 	xnfp->xnf_running = B_TRUE;
1479 
1480 	mutex_exit(&xnfp->xnf_txlock);
1481 	mutex_exit(&xnfp->xnf_intrlock);
1482 
1483 	return (0);
1484 }
1485 
1486 /* xnf_stop() - disable hardware */
1487 static void
1488 xnf_stop(void *arg)
1489 {
1490 	xnf_t *xnfp = arg;
1491 
1492 #ifdef XNF_DEBUG
1493 	if (xnfdebug & XNF_DEBUG_TRACE)
1494 		printf("xnf%d stop(0x%p)\n",
1495 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1496 #endif
1497 
1498 	mutex_enter(&xnfp->xnf_intrlock);
1499 	mutex_enter(&xnfp->xnf_txlock);
1500 
1501 	xnfp->xnf_running = B_FALSE;
1502 
1503 	mutex_exit(&xnfp->xnf_txlock);
1504 	mutex_exit(&xnfp->xnf_intrlock);
1505 }
1506 
1507 /*
1508  * Driver private functions follow
1509  */
1510 
1511 /*
1512  * Hang buffer on rx ring
1513  */
1514 static void
1515 rx_buffer_hang(xnf_t *xnfp, struct xnf_buffer_desc *bdesc)
1516 {
1517 	volatile netif_rx_request_t	*reqp;
1518 	RING_IDX			hang_ix;
1519 	grant_ref_t			ref;
1520 	domid_t				oeid;
1521 
1522 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1523 
1524 	ASSERT(MUTEX_HELD(&xnfp->xnf_intrlock));
1525 	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
1526 	    xnfp->xnf_rx_ring.req_prod_pvt);
1527 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
1528 	ASSERT(xnfp->xnf_rxpkt_bufptr[hang_ix] == NULL);
1529 	if (bdesc->grant_ref == GRANT_INVALID_REF) {
1530 		ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_rx_head);
1531 		ASSERT((signed short)ref >= 0);
1532 		bdesc->grant_ref = ref;
1533 		if (xnfp->xnf_rx_hvcopy) {
1534 			pfn_t pfn = xnf_btop(bdesc->buf_phys);
1535 			mfn_t mfn = pfn_to_mfn(pfn);
1536 
1537 			gnttab_grant_foreign_access_ref(ref, oeid, mfn, 0);
1538 		} else {
1539 			gnttab_grant_foreign_transfer_ref(ref, oeid, 0);
1540 		}
1541 	}
1542 	reqp->id = hang_ix;
1543 	reqp->gref = bdesc->grant_ref;
1544 	bdesc->id = hang_ix;
1545 	xnfp->xnf_rxpkt_bufptr[hang_ix] = bdesc;
1546 	membar_producer();
1547 	xnfp->xnf_rx_ring.req_prod_pvt++;
1548 }
1549 
1550 static mblk_t *
1551 xnf_process_hvcopy_recv(xnf_t *xnfp)
1552 {
1553 	netif_rx_response_t *rxpkt;
1554 	mblk_t		*mp, *head, *tail;
1555 	struct		xnf_buffer_desc *bdesc;
1556 	boolean_t	hwcsum = B_FALSE, notify, work_to_do;
1557 	size_t 		len;
1558 
1559 	/*
1560 	 * in loop over unconsumed responses, we do:
1561 	 * 1. get a response
1562 	 * 2. take corresponding buffer off recv. ring
1563 	 * 3. indicate this by setting slot to NULL
1564 	 * 4. create a new message and
1565 	 * 5. copy data in, adjust ptr
1566 	 *
1567 	 * outside loop:
1568 	 * 7. make sure no more data has arrived; kick HV
1569 	 */
1570 
1571 	head = tail = NULL;
1572 
1573 loop:
1574 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1575 
1576 		/* 1. */
1577 		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
1578 		    xnfp->xnf_rx_ring.rsp_cons);
1579 
1580 		DTRACE_PROBE4(got_PKT, int, (int)rxpkt->id, int,
1581 		    (int)rxpkt->offset,
1582 		    int, (int)rxpkt->flags, int, (int)rxpkt->status);
1583 
1584 		/*
1585 		 * 2.
1586 		 * Take buffer off of receive ring
1587 		 */
1588 		hwcsum = B_FALSE;
1589 		bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id];
1590 		/* 3 */
1591 		xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL;
1592 		ASSERT(bdesc->id == rxpkt->id);
1593 		mp = NULL;
1594 		if (!xnfp->xnf_running) {
1595 			DTRACE_PROBE4(pkt_dropped, int, rxpkt->status,
1596 			    char *, bdesc->buf, int, rxpkt->offset,
1597 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1598 			xnfp->xnf_stat_drop++;
1599 			/*
1600 			 * re-hang the buffer
1601 			 */
1602 			rx_buffer_hang(xnfp, bdesc);
1603 		} else if (rxpkt->status <= 0) {
1604 			DTRACE_PROBE4(pkt_status_negative, int, rxpkt->status,
1605 			    char *, bdesc->buf, int, rxpkt->offset,
1606 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1607 			xnfp->xnf_stat_errrx++;
1608 			if (rxpkt->status == 0)
1609 				xnfp->xnf_stat_runt++;
1610 			if (rxpkt->status == NETIF_RSP_ERROR)
1611 				xnfp->xnf_stat_mac_rcv_error++;
1612 			if (rxpkt->status == NETIF_RSP_DROPPED)
1613 				xnfp->xnf_stat_norxbuf++;
1614 			/*
1615 			 * re-hang the buffer
1616 			 */
1617 			rx_buffer_hang(xnfp, bdesc);
1618 		} else {
1619 			grant_ref_t		ref =  bdesc->grant_ref;
1620 			struct xnf_buffer_desc	*new_bdesc;
1621 			unsigned long		off = rxpkt->offset;
1622 
1623 			DTRACE_PROBE4(pkt_status_ok, int, rxpkt->status,
1624 			    char *, bdesc->buf, int, rxpkt->offset,
1625 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1626 			len = rxpkt->status;
1627 			ASSERT(off + len <= PAGEOFFSET);
1628 			if (ref == GRANT_INVALID_REF) {
1629 				mp = NULL;
1630 				new_bdesc = bdesc;
1631 				cmn_err(CE_WARN, "Bad rx grant reference %d "
1632 				    "from dom %d", ref,
1633 				    xvdi_get_oeid(xnfp->xnf_devinfo));
1634 				goto luckless;
1635 			}
1636 			/*
1637 			 * Release ref which we'll be re-claiming in
1638 			 * rx_buffer_hang().
1639 			 */
1640 			bdesc->grant_ref = GRANT_INVALID_REF;
1641 			(void) gnttab_end_foreign_access_ref(ref, 0);
1642 			gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head,
1643 			    ref);
1644 			if (rxpkt->flags & NETRXF_data_validated)
1645 				hwcsum = B_TRUE;
1646 
1647 			/*
1648 			 * XXPV for the initial implementation of HVcopy,
1649 			 * create a new msg and copy in the data
1650 			 */
1651 			/* 4. */
1652 			if ((mp = allocb(len, BPRI_MED)) == NULL) {
1653 				/*
1654 				 * Couldn't get buffer to copy to,
1655 				 * drop this data, and re-hang
1656 				 * the buffer on the ring.
1657 				 */
1658 				xnfp->xnf_stat_norxbuf++;
1659 				DTRACE_PROBE(alloc_nix);
1660 			} else {
1661 				/* 5. */
1662 				DTRACE_PROBE(alloc_ok);
1663 				bcopy(bdesc->buf + off, mp->b_wptr,
1664 				    len);
1665 				mp->b_wptr += len;
1666 			}
1667 			new_bdesc = bdesc;
1668 luckless:
1669 
1670 			/* Re-hang old or hang new buffer. */
1671 			rx_buffer_hang(xnfp, new_bdesc);
1672 		}
1673 		if (mp) {
1674 			if (hwcsum) {
1675 				/*
1676 				 * See comments in xnf_process_recv().
1677 				 */
1678 
1679 				(void) hcksum_assoc(mp, NULL,
1680 				    NULL, 0, 0, 0, 0,
1681 				    HCK_FULLCKSUM |
1682 				    HCK_FULLCKSUM_OK,
1683 				    0);
1684 				xnfp->xnf_stat_rx_cksum_no_need++;
1685 			}
1686 			if (head == NULL) {
1687 				head = tail = mp;
1688 			} else {
1689 				tail->b_next = mp;
1690 				tail = mp;
1691 			}
1692 
1693 			ASSERT(mp->b_next == NULL);
1694 
1695 			xnfp->xnf_stat_ipackets++;
1696 			xnfp->xnf_stat_rbytes += len;
1697 		}
1698 
1699 		xnfp->xnf_rx_ring.rsp_cons++;
1700 
1701 		xnfp->xnf_stat_hvcopy_packet_processed++;
1702 	}
1703 
1704 	/* 7. */
1705 	/*
1706 	 * Has more data come in since we started?
1707 	 */
1708 	/* LINTED: constant in conditional context */
1709 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do);
1710 	if (work_to_do)
1711 		goto loop;
1712 
1713 	/*
1714 	 * Indicate to the backend that we have re-filled the receive
1715 	 * ring.
1716 	 */
1717 	/* LINTED: constant in conditional context */
1718 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
1719 	if (notify)
1720 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1721 
1722 	return (head);
1723 }
1724 
1725 /* Process all queued received packets */
1726 static mblk_t *
1727 xnf_process_recv(xnf_t *xnfp)
1728 {
1729 	volatile netif_rx_response_t *rxpkt;
1730 	mblk_t *mp, *head, *tail;
1731 	struct xnf_buffer_desc *bdesc;
1732 	extern mblk_t *desballoc(unsigned char *, size_t, uint_t, frtn_t *);
1733 	boolean_t hwcsum = B_FALSE, notify, work_to_do;
1734 	size_t len;
1735 	pfn_t pfn;
1736 	long cnt;
1737 
1738 	head = tail = NULL;
1739 loop:
1740 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1741 
1742 		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
1743 		    xnfp->xnf_rx_ring.rsp_cons);
1744 
1745 		/*
1746 		 * Take buffer off of receive ring
1747 		 */
1748 		hwcsum = B_FALSE;
1749 		bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id];
1750 		xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL;
1751 		ASSERT(bdesc->id == rxpkt->id);
1752 		mp = NULL;
1753 		if (!xnfp->xnf_running) {
1754 			xnfp->xnf_stat_drop++;
1755 			/*
1756 			 * re-hang the buffer
1757 			 */
1758 			rx_buffer_hang(xnfp, bdesc);
1759 		} else if (rxpkt->status <= 0) {
1760 			xnfp->xnf_stat_errrx++;
1761 			if (rxpkt->status == 0)
1762 				xnfp->xnf_stat_runt++;
1763 			if (rxpkt->status == NETIF_RSP_ERROR)
1764 				xnfp->xnf_stat_mac_rcv_error++;
1765 			if (rxpkt->status == NETIF_RSP_DROPPED)
1766 				xnfp->xnf_stat_norxbuf++;
1767 			/*
1768 			 * re-hang the buffer
1769 			 */
1770 			rx_buffer_hang(xnfp, bdesc);
1771 		} else {
1772 			grant_ref_t ref =  bdesc->grant_ref;
1773 			struct xnf_buffer_desc *new_bdesc;
1774 			unsigned long off = rxpkt->offset;
1775 			unsigned long mfn;
1776 
1777 			len = rxpkt->status;
1778 			ASSERT(off + len <= PAGEOFFSET);
1779 			if (ref == GRANT_INVALID_REF) {
1780 				mp = NULL;
1781 				new_bdesc = bdesc;
1782 				cmn_err(CE_WARN, "Bad rx grant reference %d "
1783 				    "from dom %d", ref,
1784 				    xvdi_get_oeid(xnfp->xnf_devinfo));
1785 				goto luckless;
1786 			}
1787 			bdesc->grant_ref = GRANT_INVALID_REF;
1788 			mfn = gnttab_end_foreign_transfer_ref(ref);
1789 			ASSERT(mfn != MFN_INVALID);
1790 			ASSERT(hat_getpfnum(kas.a_hat, bdesc->buf) ==
1791 			    PFN_INVALID);
1792 
1793 			gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head,
1794 			    ref);
1795 			reassign_pfn(xnf_btop(bdesc->buf_phys), mfn);
1796 			hat_devload(kas.a_hat, bdesc->buf, PAGESIZE,
1797 			    xnf_btop(bdesc->buf_phys),
1798 			    PROT_READ | PROT_WRITE, HAT_LOAD);
1799 			balloon_drv_added(1);
1800 
1801 			if (rxpkt->flags & NETRXF_data_validated)
1802 				hwcsum = B_TRUE;
1803 			if (len <= xnf_rx_bcopy_thresh) {
1804 				/*
1805 				 * For small buffers, just copy the data
1806 				 * and send the copy upstream.
1807 				 */
1808 				new_bdesc = NULL;
1809 			} else {
1810 				/*
1811 				 * We send a pointer to this data upstream;
1812 				 * we need a new buffer to replace this one.
1813 				 */
1814 				mutex_enter(&xnfp->xnf_rx_buf_mutex);
1815 				new_bdesc = xnf_get_buffer(xnfp);
1816 				if (new_bdesc != NULL) {
1817 					xnfp->xnf_rx_bufs_outstanding++;
1818 				} else {
1819 					xnfp->xnf_stat_rx_no_ringbuf++;
1820 				}
1821 				mutex_exit(&xnfp->xnf_rx_buf_mutex);
1822 			}
1823 
1824 			if (new_bdesc == NULL) {
1825 				/*
1826 				 * Don't have a new ring buffer; bcopy the data
1827 				 * from the buffer, and preserve the
1828 				 * original buffer
1829 				 */
1830 				if ((mp = allocb(len, BPRI_MED)) == NULL) {
1831 					/*
1832 					 * Could't get buffer to copy to,
1833 					 * drop this data, and re-hang
1834 					 * the buffer on the ring.
1835 					 */
1836 					xnfp->xnf_stat_norxbuf++;
1837 				} else {
1838 					bcopy(bdesc->buf + off, mp->b_wptr,
1839 					    len);
1840 				}
1841 				/*
1842 				 * Give the buffer page back to xen
1843 				 */
1844 				pfn = xnf_btop(bdesc->buf_phys);
1845 				cnt = balloon_free_pages(1, &mfn, bdesc->buf,
1846 				    &pfn);
1847 				if (cnt != 1) {
1848 					cmn_err(CE_WARN, "unable to give a "
1849 					    "page back to the hypervisor\n");
1850 				}
1851 				new_bdesc = bdesc;
1852 			} else {
1853 				if ((mp = desballoc((unsigned char *)bdesc->buf,
1854 				    off + len, 0, (frtn_t *)bdesc)) == NULL) {
1855 					/*
1856 					 * Couldn't get mblk to pass recv data
1857 					 * up with, free the old ring buffer
1858 					 */
1859 					xnfp->xnf_stat_norxbuf++;
1860 					xnf_rcv_complete(bdesc);
1861 					goto luckless;
1862 				}
1863 				(void) ddi_dma_sync(bdesc->dma_handle,
1864 				    0, 0, DDI_DMA_SYNC_FORCPU);
1865 
1866 				mp->b_wptr += off;
1867 				mp->b_rptr += off;
1868 			}
1869 luckless:
1870 			if (mp)
1871 				mp->b_wptr += len;
1872 			/* re-hang old or hang new buffer */
1873 			rx_buffer_hang(xnfp, new_bdesc);
1874 		}
1875 		if (mp) {
1876 			if (hwcsum) {
1877 				/*
1878 				 * If the peer says that the data has
1879 				 * been validated then we declare that
1880 				 * the full checksum has been
1881 				 * verified.
1882 				 *
1883 				 * We don't look at the "checksum
1884 				 * blank" flag, and hence could have a
1885 				 * packet here that we are asserting
1886 				 * is good with a blank checksum.
1887 				 *
1888 				 * The hardware checksum offload
1889 				 * specification says that we must
1890 				 * provide the actual checksum as well
1891 				 * as an assertion that it is valid,
1892 				 * but the protocol stack doesn't
1893 				 * actually use it and some other
1894 				 * drivers don't bother, so we don't.
1895 				 * If it was necessary we could grovel
1896 				 * in the packet to find it.
1897 				 */
1898 
1899 				(void) hcksum_assoc(mp, NULL,
1900 				    NULL, 0, 0, 0, 0,
1901 				    HCK_FULLCKSUM |
1902 				    HCK_FULLCKSUM_OK,
1903 				    0);
1904 				xnfp->xnf_stat_rx_cksum_no_need++;
1905 			}
1906 			if (head == NULL) {
1907 				head = tail = mp;
1908 			} else {
1909 				tail->b_next = mp;
1910 				tail = mp;
1911 			}
1912 
1913 			ASSERT(mp->b_next == NULL);
1914 
1915 			xnfp->xnf_stat_ipackets++;
1916 			xnfp->xnf_stat_rbytes += len;
1917 		}
1918 
1919 		xnfp->xnf_rx_ring.rsp_cons++;
1920 	}
1921 
1922 	/*
1923 	 * Has more data come in since we started?
1924 	 */
1925 	/* LINTED: constant in conditional context */
1926 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do);
1927 	if (work_to_do)
1928 		goto loop;
1929 
1930 	/*
1931 	 * Indicate to the backend that we have re-filled the receive
1932 	 * ring.
1933 	 */
1934 	/* LINTED: constant in conditional context */
1935 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
1936 	if (notify)
1937 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1938 
1939 	return (head);
1940 }
1941 
1942 /* Called when the upper layers free a message we passed upstream */
1943 static void
1944 xnf_rcv_complete(struct xnf_buffer_desc *bdesc)
1945 {
1946 	xnf_t *xnfp = bdesc->xnfp;
1947 	pfn_t pfn;
1948 	long cnt;
1949 
1950 	/* One less outstanding receive buffer */
1951 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
1952 	--xnfp->xnf_rx_bufs_outstanding;
1953 	/*
1954 	 * Return buffer to the free list, unless the free list is getting
1955 	 * too large.  XXPV - this threshold may need tuning.
1956 	 */
1957 	if (xnfp->xnf_rx_descs_free < xnf_rx_bufs_lowat) {
1958 		/*
1959 		 * Unmap the page, and hand the machine page back
1960 		 * to xen so it can be re-used as a backend net buffer.
1961 		 */
1962 		pfn = xnf_btop(bdesc->buf_phys);
1963 		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
1964 		if (cnt != 1) {
1965 			cmn_err(CE_WARN, "unable to give a page back to the "
1966 			    "hypervisor\n");
1967 		}
1968 
1969 		bdesc->next = xnfp->xnf_free_list;
1970 		xnfp->xnf_free_list = bdesc;
1971 		xnfp->xnf_rx_descs_free++;
1972 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
1973 	} else {
1974 		/*
1975 		 * We can return everything here since we have a free buffer
1976 		 * that we have not given the backing page for back to xen.
1977 		 */
1978 		--xnfp->xnf_rx_buffer_count;
1979 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
1980 		(void) ddi_dma_unbind_handle(bdesc->dma_handle);
1981 		ddi_dma_mem_free(&bdesc->acc_handle);
1982 		ddi_dma_free_handle(&bdesc->dma_handle);
1983 		kmem_free(bdesc, sizeof (*bdesc));
1984 	}
1985 }
1986 
1987 /*
1988  *  xnf_alloc_dma_resources() -- initialize the drivers structures
1989  */
1990 static int
1991 xnf_alloc_dma_resources(xnf_t *xnfp)
1992 {
1993 	dev_info_t 		*devinfo = xnfp->xnf_devinfo;
1994 	int			i;
1995 	size_t			len;
1996 	ddi_dma_cookie_t	dma_cookie;
1997 	uint_t			ncookies;
1998 	struct xnf_buffer_desc	*bdesc;
1999 	int			rc;
2000 	caddr_t			rptr;
2001 
2002 	xnfp->xnf_n_rx = NET_RX_RING_SIZE;
2003 	xnfp->xnf_max_rx_bufs = xnf_rx_bufs_hiwat;
2004 
2005 	xnfp->xnf_n_tx = NET_TX_RING_SIZE;
2006 
2007 	/*
2008 	 * The code below allocates all the DMA data structures that
2009 	 * need to be released when the driver is detached.
2010 	 *
2011 	 * First allocate handles for mapping (virtual address) pointers to
2012 	 * transmit data buffers to physical addresses
2013 	 */
2014 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
2015 		if ((rc = ddi_dma_alloc_handle(devinfo,
2016 		    &tx_buffer_dma_attr, DDI_DMA_SLEEP, 0,
2017 		    &xnfp->xnf_tx_pkt_info[i].dma_handle)) != DDI_SUCCESS)
2018 			return (DDI_FAILURE);
2019 	}
2020 
2021 	/*
2022 	 * Allocate page for the transmit descriptor ring.
2023 	 */
2024 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2025 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
2026 		goto alloc_error;
2027 
2028 	if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
2029 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2030 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2031 	    &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
2032 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2033 		xnfp->xnf_tx_ring_dma_handle = NULL;
2034 		goto alloc_error;
2035 	}
2036 
2037 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
2038 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2039 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2040 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2041 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2042 		xnfp->xnf_tx_ring_dma_handle = NULL;
2043 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2044 		if (rc == DDI_DMA_NORESOURCES)
2045 			goto alloc_error;
2046 		else
2047 			goto error;
2048 	}
2049 
2050 	ASSERT(ncookies == 1);
2051 	bzero(rptr, PAGESIZE);
2052 	/* LINTED: constant in conditional context */
2053 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
2054 	/* LINTED: constant in conditional context */
2055 	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
2056 	xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
2057 
2058 	/*
2059 	 * Allocate page for the receive descriptor ring.
2060 	 */
2061 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2062 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
2063 		goto alloc_error;
2064 
2065 	if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
2066 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2067 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2068 	    &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
2069 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2070 		xnfp->xnf_rx_ring_dma_handle = NULL;
2071 		goto alloc_error;
2072 	}
2073 
2074 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
2075 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2076 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2077 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2078 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2079 		xnfp->xnf_rx_ring_dma_handle = NULL;
2080 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2081 		if (rc == DDI_DMA_NORESOURCES)
2082 			goto alloc_error;
2083 		else
2084 			goto error;
2085 	}
2086 
2087 	ASSERT(ncookies == 1);
2088 	bzero(rptr, PAGESIZE);
2089 	/* LINTED: constant in conditional context */
2090 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
2091 	/* LINTED: constant in conditional context */
2092 	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
2093 	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
2094 
2095 	/*
2096 	 * Preallocate receive buffers for each receive descriptor.
2097 	 */
2098 
2099 	/* Set up the "free list" of receive buffer descriptors */
2100 	for (i = 0; i < xnfp->xnf_n_rx; i++) {
2101 		if ((bdesc = xnf_alloc_buffer(xnfp)) == NULL)
2102 			goto alloc_error;
2103 		bdesc->next = xnfp->xnf_free_list;
2104 		xnfp->xnf_free_list = bdesc;
2105 	}
2106 
2107 	return (DDI_SUCCESS);
2108 
2109 alloc_error:
2110 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
2111 	    ddi_get_instance(xnfp->xnf_devinfo));
2112 error:
2113 	xnf_release_dma_resources(xnfp);
2114 	return (DDI_FAILURE);
2115 }
2116 
2117 /*
2118  * Release all DMA resources in the opposite order from acquisition
2119  * Should not be called until all outstanding esballoc buffers
2120  * have been returned.
2121  */
2122 static void
2123 xnf_release_dma_resources(xnf_t *xnfp)
2124 {
2125 	int i;
2126 
2127 	/*
2128 	 * Free receive buffers which are currently associated with
2129 	 * descriptors
2130 	 */
2131 	for (i = 0; i < xnfp->xnf_n_rx; i++) {
2132 		struct xnf_buffer_desc *bp;
2133 
2134 		if ((bp = xnfp->xnf_rxpkt_bufptr[i]) == NULL)
2135 			continue;
2136 		xnf_free_buffer(bp);
2137 		xnfp->xnf_rxpkt_bufptr[i] = NULL;
2138 	}
2139 
2140 	/* Free the receive ring buffer */
2141 	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2142 		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2143 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2144 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2145 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2146 	}
2147 	/* Free the transmit ring buffer */
2148 	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2149 		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2150 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2151 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2152 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2153 	}
2154 
2155 	/*
2156 	 * Free handles for mapping (virtual address) pointers to
2157 	 * transmit data buffers to physical addresses
2158 	 */
2159 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
2160 		if (xnfp->xnf_tx_pkt_info[i].dma_handle != NULL) {
2161 			ddi_dma_free_handle(
2162 			    &xnfp->xnf_tx_pkt_info[i].dma_handle);
2163 		}
2164 	}
2165 
2166 }
2167 
2168 static void
2169 xnf_release_mblks(xnf_t *xnfp)
2170 {
2171 	int	i;
2172 
2173 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
2174 		if (xnfp->xnf_tx_pkt_info[i].mp == NULL)
2175 			continue;
2176 		freemsg(xnfp->xnf_tx_pkt_info[i].mp);
2177 		xnfp->xnf_tx_pkt_info[i].mp = NULL;
2178 		(void) ddi_dma_unbind_handle(
2179 		    xnfp->xnf_tx_pkt_info[i].dma_handle);
2180 	}
2181 }
2182 
2183 /*
2184  * Remove a xmit buffer descriptor from the head of the free list and return
2185  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
2186  * Called with the tx_buf_mutex held.
2187  */
2188 static struct xnf_buffer_desc *
2189 xnf_get_tx_buffer(xnf_t *xnfp)
2190 {
2191 	struct xnf_buffer_desc *bdesc;
2192 
2193 	bdesc = xnfp->xnf_tx_free_list;
2194 	if (bdesc != NULL) {
2195 		xnfp->xnf_tx_free_list = bdesc->next;
2196 	} else {
2197 		bdesc = xnf_alloc_tx_buffer(xnfp);
2198 	}
2199 	return (bdesc);
2200 }
2201 
2202 /*
2203  * Remove a buffer descriptor from the head of the free list and return
2204  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
2205  * Called with the rx_buf_mutex held.
2206  */
2207 static struct xnf_buffer_desc *
2208 xnf_get_buffer(xnf_t *xnfp)
2209 {
2210 	struct xnf_buffer_desc *bdesc;
2211 
2212 	bdesc = xnfp->xnf_free_list;
2213 	if (bdesc != NULL) {
2214 		xnfp->xnf_free_list = bdesc->next;
2215 		xnfp->xnf_rx_descs_free--;
2216 	} else {
2217 		bdesc = xnf_alloc_buffer(xnfp);
2218 	}
2219 	return (bdesc);
2220 }
2221 
2222 /*
2223  * Free a xmit buffer back to the xmit free list
2224  */
2225 static void
2226 xnf_free_tx_buffer(struct xnf_buffer_desc *bp)
2227 {
2228 	xnf_t *xnfp = bp->xnfp;
2229 
2230 	mutex_enter(&xnfp->xnf_tx_buf_mutex);
2231 	bp->next = xnfp->xnf_tx_free_list;
2232 	xnfp->xnf_tx_free_list = bp;
2233 	mutex_exit(&xnfp->xnf_tx_buf_mutex);
2234 }
2235 
2236 /*
2237  * Put a buffer descriptor onto the head of the free list.
2238  * for page-flip:
2239  * We can't really free these buffers back to the kernel
2240  * since we have given away their backing page to be used
2241  * by the back end net driver.
2242  * for hvcopy:
2243  * release all the memory
2244  */
2245 static void
2246 xnf_free_buffer(struct xnf_buffer_desc *bdesc)
2247 {
2248 	xnf_t *xnfp = bdesc->xnfp;
2249 
2250 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
2251 	if (xnfp->xnf_rx_hvcopy) {
2252 		if (ddi_dma_unbind_handle(bdesc->dma_handle) != DDI_SUCCESS)
2253 			goto out;
2254 		ddi_dma_mem_free(&bdesc->acc_handle);
2255 		ddi_dma_free_handle(&bdesc->dma_handle);
2256 		kmem_free(bdesc, sizeof (*bdesc));
2257 		xnfp->xnf_rx_buffer_count--;
2258 	} else {
2259 		bdesc->next = xnfp->xnf_free_list;
2260 		xnfp->xnf_free_list = bdesc;
2261 		xnfp->xnf_rx_descs_free++;
2262 	}
2263 out:
2264 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
2265 }
2266 
2267 /*
2268  * Allocate a DMA-able xmit buffer, including a structure to
2269  * keep track of the buffer.  Called with tx_buf_mutex held.
2270  */
2271 static struct xnf_buffer_desc *
2272 xnf_alloc_tx_buffer(xnf_t *xnfp)
2273 {
2274 	struct xnf_buffer_desc *bdesc;
2275 	size_t len;
2276 
2277 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
2278 		return (NULL);
2279 
2280 	/* allocate a DMA access handle for receive buffer */
2281 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buffer_dma_attr,
2282 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2283 		goto failure;
2284 
2285 	/* Allocate DMA-able memory for transmit buffer */
2286 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2287 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
2288 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2289 		goto failure_1;
2290 
2291 	bdesc->xnfp = xnfp;
2292 	xnfp->xnf_tx_buffer_count++;
2293 
2294 	return (bdesc);
2295 
2296 failure_1:
2297 	ddi_dma_free_handle(&bdesc->dma_handle);
2298 
2299 failure:
2300 	kmem_free(bdesc, sizeof (*bdesc));
2301 	return (NULL);
2302 }
2303 
2304 /*
2305  * Allocate a DMA-able receive buffer, including a structure to
2306  * keep track of the buffer.  Called with rx_buf_mutex held.
2307  */
2308 static struct xnf_buffer_desc *
2309 xnf_alloc_buffer(xnf_t *xnfp)
2310 {
2311 	struct			xnf_buffer_desc *bdesc;
2312 	size_t			len;
2313 	uint_t			ncookies;
2314 	ddi_dma_cookie_t	dma_cookie;
2315 	long			cnt;
2316 	pfn_t			pfn;
2317 
2318 	if (xnfp->xnf_rx_buffer_count >= xnfp->xnf_max_rx_bufs)
2319 		return (NULL);
2320 
2321 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
2322 		return (NULL);
2323 
2324 	/* allocate a DMA access handle for receive buffer */
2325 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buffer_dma_attr,
2326 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2327 		goto failure;
2328 
2329 	/* Allocate DMA-able memory for receive buffer */
2330 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2331 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
2332 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2333 		goto failure_1;
2334 
2335 	/* bind to virtual address of buffer to get physical address */
2336 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2337 	    bdesc->buf, PAGESIZE, DDI_DMA_READ | DDI_DMA_STREAMING,
2338 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2339 		goto failure_2;
2340 
2341 	bdesc->buf_phys = dma_cookie.dmac_laddress;
2342 	bdesc->xnfp = xnfp;
2343 	if (xnfp->xnf_rx_hvcopy) {
2344 		bdesc->free_rtn.free_func = xnf_copy_rcv_complete;
2345 	} else {
2346 		bdesc->free_rtn.free_func = xnf_rcv_complete;
2347 	}
2348 	bdesc->free_rtn.free_arg = (char *)bdesc;
2349 	bdesc->grant_ref = GRANT_INVALID_REF;
2350 	ASSERT(ncookies == 1);
2351 
2352 	xnfp->xnf_rx_buffer_count++;
2353 
2354 	if (!xnfp->xnf_rx_hvcopy) {
2355 		/*
2356 		 * Unmap the page, and hand the machine page back
2357 		 * to xen so it can be used as a backend net buffer.
2358 		 */
2359 		pfn = xnf_btop(bdesc->buf_phys);
2360 		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
2361 		if (cnt != 1) {
2362 			cmn_err(CE_WARN, "unable to give a page back to the "
2363 			    "hypervisor\n");
2364 		}
2365 	}
2366 
2367 	return (bdesc);
2368 
2369 failure_2:
2370 	ddi_dma_mem_free(&bdesc->acc_handle);
2371 
2372 failure_1:
2373 	ddi_dma_free_handle(&bdesc->dma_handle);
2374 
2375 failure:
2376 	kmem_free(bdesc, sizeof (*bdesc));
2377 	return (NULL);
2378 }
2379 
/*
 * Statistics.
 *
 * Names of the driver-private ("aux_statistics") named kstats.
 * xnf_kstat_init() creates one named kstat entry per string, in this
 * order, and xnf_kstat_aux_update() fills the values in by position,
 * so the order here must match the assignment order there.
 */
static char *xnf_aux_statistics[] = {
	"tx_cksum_deferred",
	"rx_cksum_no_need",
	"interrupts",
	"unclaimed_interrupts",
	"tx_pullup",
	"tx_pagebndry",
	"tx_attempt",
	"rx_no_ringbuf",
	"hvcopy_packet_processed",
};
2394 
2395 static int
2396 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2397 {
2398 	xnf_t *xnfp;
2399 	kstat_named_t *knp;
2400 
2401 	if (flag != KSTAT_READ)
2402 		return (EACCES);
2403 
2404 	xnfp = ksp->ks_private;
2405 	knp = ksp->ks_data;
2406 
2407 	/*
2408 	 * Assignment order must match that of the names in
2409 	 * xnf_aux_statistics.
2410 	 */
2411 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2412 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2413 
2414 	(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2415 	(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2416 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2417 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry;
2418 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt;
2419 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_no_ringbuf;
2420 
2421 	(knp++)->value.ui64 = xnfp->xnf_stat_hvcopy_packet_processed;
2422 
2423 	return (0);
2424 }
2425 
2426 static boolean_t
2427 xnf_kstat_init(xnf_t *xnfp)
2428 {
2429 	int nstat = sizeof (xnf_aux_statistics) /
2430 	    sizeof (xnf_aux_statistics[0]);
2431 	char **cp = xnf_aux_statistics;
2432 	kstat_named_t *knp;
2433 
2434 	/*
2435 	 * Create and initialise kstats.
2436 	 */
2437 	if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2438 	    ddi_get_instance(xnfp->xnf_devinfo),
2439 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
2440 	    nstat, 0)) == NULL)
2441 		return (B_FALSE);
2442 
2443 	xnfp->xnf_kstat_aux->ks_private = xnfp;
2444 	xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2445 
2446 	knp = xnfp->xnf_kstat_aux->ks_data;
2447 	while (nstat > 0) {
2448 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2449 
2450 		knp++;
2451 		cp++;
2452 		nstat--;
2453 	}
2454 
2455 	kstat_install(xnfp->xnf_kstat_aux);
2456 
2457 	return (B_TRUE);
2458 }
2459 
2460 static int
2461 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2462 {
2463 	xnf_t *xnfp = arg;
2464 
2465 	mutex_enter(&xnfp->xnf_intrlock);
2466 	mutex_enter(&xnfp->xnf_txlock);
2467 
2468 #define	mac_stat(q, r)				\
2469 	case (MAC_STAT_##q):			\
2470 		*val = xnfp->xnf_stat_##r;	\
2471 		break
2472 
2473 #define	ether_stat(q, r)			\
2474 	case (ETHER_STAT_##q):			\
2475 		*val = xnfp->xnf_stat_##r;	\
2476 		break
2477 
2478 	switch (stat) {
2479 
2480 	mac_stat(IPACKETS, ipackets);
2481 	mac_stat(OPACKETS, opackets);
2482 	mac_stat(RBYTES, rbytes);
2483 	mac_stat(OBYTES, obytes);
2484 	mac_stat(NORCVBUF, norxbuf);
2485 	mac_stat(IERRORS, errrx);
2486 	mac_stat(NOXMTBUF, tx_defer);
2487 
2488 	ether_stat(MACRCV_ERRORS, mac_rcv_error);
2489 	ether_stat(TOOSHORT_ERRORS, runt);
2490 
2491 	/* always claim to be in full duplex mode */
2492 	case ETHER_STAT_LINK_DUPLEX:
2493 		*val = LINK_DUPLEX_FULL;
2494 		break;
2495 
2496 	/* always claim to be at 1Gb/s link speed */
2497 	case MAC_STAT_IFSPEED:
2498 		*val = 1000000000ull;
2499 		break;
2500 
2501 	default:
2502 		mutex_exit(&xnfp->xnf_txlock);
2503 		mutex_exit(&xnfp->xnf_intrlock);
2504 
2505 		return (ENOTSUP);
2506 	}
2507 
2508 #undef mac_stat
2509 #undef ether_stat
2510 
2511 	mutex_exit(&xnfp->xnf_txlock);
2512 	mutex_exit(&xnfp->xnf_intrlock);
2513 
2514 	return (0);
2515 }
2516 
/*
 * ioctl entry point.  No ioctls are handled here: every request is
 * answered on queue `q' with a negative acknowledgement carrying
 * EINVAL.  `arg' is unused.
 */
/*ARGSUSED*/
static void
xnf_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
	miocnak(q, mp, 0, EINVAL);
}
2523 
2524 static boolean_t
2525 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
2526 {
2527 	xnf_t *xnfp = arg;
2528 
2529 	switch (cap) {
2530 	case MAC_CAPAB_HCKSUM: {
2531 		uint32_t *capab = cap_data;
2532 
2533 		/*
2534 		 * Whilst the flag used to communicate with the IO
2535 		 * domain is called "NETTXF_csum_blank", the checksum
2536 		 * in the packet must contain the pseudo-header
2537 		 * checksum and not zero.
2538 		 *
2539 		 * To help out the IO domain, we might use
2540 		 * HCKSUM_INET_PARTIAL. Unfortunately our stack will
2541 		 * then use checksum offload for IPv6 packets, which
2542 		 * the IO domain can't handle.
2543 		 *
2544 		 * As a result, we declare outselves capable of
2545 		 * HCKSUM_INET_FULL_V4. This means that we receive
2546 		 * IPv4 packets from the stack with a blank checksum
2547 		 * field and must insert the pseudo-header checksum
2548 		 * before passing the packet to the IO domain.
2549 		 */
2550 		if (xnfp->xnf_cksum_offload)
2551 			*capab = HCKSUM_INET_FULL_V4;
2552 		else
2553 			*capab = 0;
2554 		break;
2555 	}
2556 	default:
2557 		return (B_FALSE);
2558 	}
2559 
2560 	return (B_TRUE);
2561 }
2562 
2563 /*ARGSUSED*/
2564 static void
2565 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
2566     void *arg, void *impl_data)
2567 {
2568 	xnf_t *xnfp = ddi_get_driver_private(dip);
2569 	XenbusState new_state = *(XenbusState *)impl_data;
2570 
2571 	ASSERT(xnfp != NULL);
2572 
2573 	switch (new_state) {
2574 	case XenbusStateConnected:
2575 		mutex_enter(&xnfp->xnf_intrlock);
2576 		mutex_enter(&xnfp->xnf_txlock);
2577 
2578 		xnfp->xnf_connected = B_TRUE;
2579 		/*
2580 		 * wake up threads wanting to send data to backend,
2581 		 * but got blocked due to backend is not ready
2582 		 */
2583 		cv_broadcast(&xnfp->xnf_cv);
2584 
2585 		mutex_exit(&xnfp->xnf_txlock);
2586 		mutex_exit(&xnfp->xnf_intrlock);
2587 
2588 		/*
2589 		 * kick backend in case it missed any tx request
2590 		 * in the TX ring buffer
2591 		 */
2592 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
2593 
2594 		/*
2595 		 * there maybe already queued rx data in the RX ring
2596 		 * sent by backend after it gets connected but before
2597 		 * we see its state change here, so we call our intr
2598 		 * handling routine to handle them, if any
2599 		 */
2600 		(void) xnf_intr((caddr_t)xnfp);
2601 
2602 		/* mark as link up after get connected */
2603 		mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
2604 
2605 		break;
2606 
2607 	default:
2608 		break;
2609 	}
2610 }
2611 
2612 /*
2613  * Check whether backend is capable of and willing to talk
2614  * to us via hypervisor copy, as opposed to page flip.
2615  */
2616 static boolean_t
2617 xnf_hvcopy_peer_status(dev_info_t *devinfo)
2618 {
2619 	int	be_rx_copy;
2620 	int	err;
2621 
2622 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(devinfo),
2623 	    "feature-rx-copy", "%d", &be_rx_copy);
2624 	/*
2625 	 * If we fail to read the store we assume that the key is
2626 	 * absent, implying an older domain at the far end.  Older
2627 	 * domains cannot do HV copy (we assume ..).
2628 	 */
2629 	if (err != 0)
2630 		be_rx_copy = 0;
2631 
2632 	return (be_rx_copy?B_TRUE:B_FALSE);
2633 }
2634