xref: /titanic_44/usr/src/uts/common/xen/io/xnf.c (revision 72612f86fafbe2510a166b48e158c9031e0dd63b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  *
29  * Copyright (c) 2004 Christian Limpach.
30  * All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  * 1. Redistributions of source code must retain the above copyright
36  *    notice, this list of conditions and the following disclaimer.
37  * 2. Redistributions in binary form must reproduce the above copyright
38  *    notice, this list of conditions and the following disclaimer in the
39  *    documentation and/or other materials provided with the distribution.
40  * 3. This section intentionally left blank.
41  * 4. The name of the author may not be used to endorse or promote products
42  *    derived from this software without specific prior written permission.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 /*
56  * Section 3 of the above license was updated in response to bug 6379571.
57  */
58 
59 /*
60  * xnf.c - Nemo-based network driver for domU
61  */
62 
63 #include <sys/types.h>
64 #include <sys/errno.h>
65 #include <sys/param.h>
66 #include <sys/sysmacros.h>
67 #include <sys/systm.h>
68 #include <sys/stream.h>
69 #include <sys/strsubr.h>
70 #include <sys/conf.h>
71 #include <sys/ddi.h>
72 #include <sys/devops.h>
73 #include <sys/sunddi.h>
74 #include <sys/sunndi.h>
75 #include <sys/dlpi.h>
76 #include <sys/ethernet.h>
77 #include <sys/strsun.h>
78 #include <sys/pattr.h>
79 #include <inet/ip.h>
80 #include <inet/ip_impl.h>
81 #include <sys/gld.h>
82 #include <sys/modctl.h>
83 #include <sys/mac.h>
84 #include <sys/mac_ether.h>
85 #include <sys/bootinfo.h>
86 #include <sys/mach_mmu.h>
87 #ifdef	XPV_HVM_DRIVER
88 #include <sys/xpv_support.h>
89 #include <sys/hypervisor.h>
90 #else
91 #include <sys/hypervisor.h>
92 #include <sys/evtchn_impl.h>
93 #include <sys/balloon_impl.h>
94 #endif
95 #include <xen/public/io/netif.h>
96 #include <sys/gnttab.h>
97 #include <xen/sys/xendev.h>
98 #include <sys/sdt.h>
99 
100 #include <io/xnf.h>
101 
102 
103 /*
104  *  Declarations and Module Linkage
105  */
106 
/* Human-readable driver description (the same text appears in xnf_modldrv). */
#define	IDENT	"Virtual Ethernet driver"

/*
 * Debug support is compiled in only for DEBUG or lint builds.  xnfdebug
 * is a bit mask; the debug printf()s in this file test it against the
 * XNF_DEBUG_* flags.
 */
#if defined(DEBUG) || defined(__lint)
#define	XNF_DEBUG
int	xnfdebug = 0;
#endif

/*
 * On a 32 bit PAE system physical and machine addresses are larger
 * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 * argument, and so addresses above 4G are truncated before ddi_btop()
 * gets to see them.  To avoid this, code the shift operation here.
 */
#define	xnf_btop(addr)	((addr) >> PAGESHIFT)

/*
 * Global default for checksum offload; xnf_attach() copies it into each
 * instance's xnf_cksum_offload.
 */
boolean_t	xnf_cksum_offload = B_TRUE;

/*
 * Default value for hypervisor-based copy operations.  xnf_attach() only
 * enables it for an instance when xnf_hvcopy_peer_status() is also true.
 */
boolean_t	xnf_rx_hvcopy = B_TRUE;

/*
 * Should pages used for transmit be readonly for the peer?
 * xnf_attach() copies this into each instance's xnf_tx_pages_readonly.
 */
boolean_t	xnf_tx_pages_readonly = B_FALSE;
/*
 * Packets under this size are bcopied instead of using desballoc.
 * Choose a value > XNF_FRAMESIZE (1514) to force the receive path to
 * always copy.
 */
unsigned int	xnf_rx_bcopy_thresh = 64;

/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * maximum number of fragments per transmitted packet -- confirm.
 */
unsigned int	xnf_max_tx_frags = 1;
139 
/* Required system entry points */
static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);

/*
 * Required driver entry points for Nemo.  Most of these are installed in
 * xnf_callbacks below; xnf_intr is registered as the interrupt handler
 * in xnf_attach().
 */
static int	xnf_start(void *);
static void	xnf_stop(void *);
static int	xnf_set_mac_addr(void *, const uint8_t *);
static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
static int	xnf_set_promiscuous(void *, boolean_t);
static mblk_t	*xnf_send(void *, mblk_t *);
static uint_t	xnf_intr(caddr_t);
static int	xnf_stat(void *, uint_t, uint64_t *);
static void	xnf_blank(void *, time_t, uint_t);
static void	xnf_resources(void *);
static void	xnf_ioctl(void *, queue_t *, mblk_t *);
static boolean_t xnf_getcapab(void *, mac_capab_t, void *);

/*
 * Driver private functions.
 * NOTE(review): xnf_send_driver_status() is the only one not declared
 * static -- confirm whether external linkage is intended.
 */
static int xnf_alloc_dma_resources(xnf_t *);
static void xnf_release_dma_resources(xnf_t *);
static mblk_t *xnf_process_recv(xnf_t *);
static void xnf_rcv_complete(struct xnf_buffer_desc *);
static void xnf_release_mblks(xnf_t *);
static struct xnf_buffer_desc *xnf_alloc_tx_buffer(xnf_t *);
static struct xnf_buffer_desc *xnf_alloc_buffer(xnf_t *);
static struct xnf_buffer_desc *xnf_get_tx_buffer(xnf_t *);
static struct xnf_buffer_desc *xnf_get_buffer(xnf_t *);
static void xnf_free_buffer(struct xnf_buffer_desc *);
static void xnf_free_tx_buffer(struct xnf_buffer_desc *);
void xnf_send_driver_status(int, int);
static void rx_buffer_hang(xnf_t *, struct xnf_buffer_desc *);
static int xnf_clean_tx_ring(xnf_t  *);
static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static mblk_t *xnf_process_hvcopy_recv(xnf_t *xnfp);
static boolean_t xnf_hvcopy_peer_status(dev_info_t *devinfo);
static boolean_t xnf_kstat_init(xnf_t *xnfp);
178 
/*
 * Callback vector handed to the MAC layer through macp->m_callbacks in
 * xnf_attach().  The first member is a flag word; presumably it
 * advertises the optional callbacks supplied at the end of the vector
 * (resources, ioctl, getcapab) -- confirm against mac_callbacks_t.
 *
 * XXPV dme: remove MC_IOCTL?
 */
static mac_callbacks_t xnf_callbacks = {
	MC_RESOURCES | MC_IOCTL | MC_GETCAPAB,
	xnf_stat,
	xnf_start,
	xnf_stop,
	xnf_set_promiscuous,
	xnf_set_multicast,
	xnf_set_mac_addr,
	xnf_send,
	xnf_resources,
	xnf_ioctl,
	xnf_getcapab
};
195 
/* Grant reference value used throughout this file to mean "no grant". */
#define	GRANT_INVALID_REF	0
/*
 * Receive buffer watermarks, expressed as multiples of the RX ring size.
 * NOTE(review): not referenced in this part of the file; presumably they
 * bound the size of the receive buffer pool -- confirm in the allocator.
 */
const int xnf_rx_bufs_lowat = 4 * NET_RX_RING_SIZE;
const int xnf_rx_bufs_hiwat = 8 * NET_RX_RING_SIZE; /* default max */
199 
/*
 * The three DMA attribute sets below currently hold identical values:
 * page-aligned, a single segment, and no address restriction.
 */

/* DMA attributes for network ring buffer */
static ddi_dma_attr_t ringbuf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};

/* DMA attributes for transmit data */
static ddi_dma_attr_t tx_buffer_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};

/* DMA attributes for a receive buffer */
static ddi_dma_attr_t rx_buffer_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
247 
/*
 * DMA access attributes for registers and descriptors: little-endian
 * structure access with strict ordering.
 */
static ddi_device_acc_attr_t accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
	DDI_STRICTORDER_ACC
};

/*
 * DMA access attributes for data: NOT to be byte swapped.  Ordering is
 * strict, as for accattr.
 */
static ddi_device_acc_attr_t data_accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};
261 
/* Ethernet broadcast address (all bits set). */
unsigned char xnf_broadcastaddr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
int xnf_diagnose = 0; /* Patchable global for diagnostic purposes */

/*
 * Device operations vector; xnf_attach/xnf_detach are the only driver
 * supplied routines.  _init() passes this vector to mac_init_ops().
 */
DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
    nodev, NULL, D_MP, NULL);

/* Driver linkage: ties the description string to xnf_dev_ops. */
static struct modldrv xnf_modldrv = {
	&mod_driverops,
	"Virtual Ethernet driver",
	&xnf_dev_ops
};

/* Module linkage consumed by mod_install() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, &xnf_modldrv, NULL
};
277 
278 int
279 _init(void)
280 {
281 	int r;
282 
283 	mac_init_ops(&xnf_dev_ops, "xnf");
284 	r = mod_install(&modlinkage);
285 	if (r != DDI_SUCCESS)
286 		mac_fini_ops(&xnf_dev_ops);
287 
288 	return (r);
289 }
290 
/*
 * Module unload entry point.  Unloading is not supported, so every
 * request is refused with EBUSY.
 */
int
_fini(void)
{
	/* XXPV dme: should be removable */
	return (EBUSY);
}
296 
297 int
298 _info(struct modinfo *modinfop)
299 {
300 	return (mod_info(&modlinkage, modinfop));
301 }
302 
/*
 * (Re)initialise the shared transmit and receive rings.
 *
 * Grants the other-end domain access to the TX and RX ring pages (ending
 * any previous grant first), then, with xnf_intrlock held, reclaims every
 * transmit slot that still holds a grant, resets both rings and hangs
 * receive buffers on the empty RX slots.
 *
 * Returns 0 on success.  If granting a ring page fails, both ring grants
 * are dropped and the negated return of gnttab_grant_foreign_access() is
 * returned.
 */
static int
xnf_setup_rings(xnf_t *xnfp)
{
	int			ix, err;
	RING_IDX		i;
	struct xnf_buffer_desc	*bdesc, *rbp;
	struct xenbus_device	*xsd;
	domid_t			oeid;

	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);

	/* Drop a grant left over from an earlier connection, if any. */
	if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF)
		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);

	/*
	 * NOTE(review): a return of exactly 0 takes the error path below but
	 * leaves err == 0, so this function would then return 0 (success)
	 * with the ring ref invalid -- confirm whether 0 can be returned.
	 */
	err = gnttab_grant_foreign_access(oeid,
	    xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
	if (err <= 0) {
		err = -err;
		xenbus_dev_error(xsd, err, "granting access to tx ring page");
		goto out;
	}
	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;

	if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF)
		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);

	err = gnttab_grant_foreign_access(oeid,
	    xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
	if (err <= 0) {
		err = -err;
		xenbus_dev_error(xsd, err, "granting access to rx ring page");
		goto out;
	}
	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;


	mutex_enter(&xnfp->xnf_intrlock);

	/*
	 * Cleanup the TX ring.  We just clean up any valid tx_pktinfo structs
	 * and reset the ring.  Note that this can lose packets after a resume,
	 * but we expect to stagger on.
	 */
	mutex_enter(&xnfp->xnf_txlock);

	for (i = 0; i < xnfp->xnf_n_tx; i++) {
		struct tx_pktinfo *txp = &xnfp->xnf_tx_pkt_info[i];

		/*
		 * Chain each entry to the next index; together with the head
		 * being reset to 0 below this rebuilds the list that
		 * xnf_clean_tx_ring() pushes freed ids onto.
		 */
		txp->id = i + 1;

		/* Slot not in flight: nothing to reclaim. */
		if (txp->grant_ref == GRANT_INVALID_REF) {
			ASSERT(txp->mp == NULL);
			ASSERT(txp->bdesc == NULL);
			continue;
		}

		if (gnttab_query_foreign_access(txp->grant_ref) != 0)
			panic("tx grant still in use by backend domain");

		freemsg(txp->mp);
		txp->mp = NULL;

		(void) ddi_dma_unbind_handle(txp->dma_handle);

		if (txp->bdesc != NULL) {
			xnf_free_tx_buffer(txp->bdesc);
			txp->bdesc = NULL;
		}

		/* Revoke the grant and return the reference to the tx pool. */
		(void) gnttab_end_foreign_access_ref(txp->grant_ref,
		    xnfp->xnf_tx_pages_readonly);
		gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head,
		    txp->grant_ref);
		txp->grant_ref = GRANT_INVALID_REF;
	}

	xnfp->xnf_tx_pkt_id_list = 0;
	xnfp->xnf_tx_ring.rsp_cons = 0;
	xnfp->xnf_tx_ring.req_prod_pvt = 0;

	/* LINTED: constant in conditional context */
	SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);

	mutex_exit(&xnfp->xnf_txlock);

	/*
	 * Rebuild the RX ring.  We have to rebuild the RX ring because some of
	 * our pages are currently flipped out/granted so we can't just free
	 * the RX buffers.  Reclaim any unprocessed recv buffers, they won't be
	 * useable anyway since the mfn's they refer to are no longer valid.
	 * Grant the backend domain access to each hung rx buffer.
	 *
	 * NOTE(review): i is incremented before it is used as the ring index,
	 * so the slot at rsp_cons is skipped and the slot at req_prod is
	 * visited -- confirm this is intended.
	 */
	i = xnfp->xnf_rx_ring.rsp_cons;
	while (i++ != xnfp->xnf_rx_ring.sring->req_prod) {
		volatile netif_rx_request_t	*rxrp;

		rxrp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, i);
		/* Slot number, derived from the request's offset in the ring. */
		ix = rxrp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0);
		rbp = xnfp->xnf_rxpkt_bufptr[ix];
		if (rbp != NULL) {
			grant_ref_t	ref = rbp->grant_ref;

			ASSERT(ref != GRANT_INVALID_REF);
			if (xnfp->xnf_rx_hvcopy) {
				pfn_t pfn = xnf_btop(rbp->buf_phys);
				mfn_t mfn = pfn_to_mfn(pfn);

				gnttab_grant_foreign_access_ref(ref, oeid,
				    mfn, 0);
			} else {
				gnttab_grant_foreign_transfer_ref(ref,
				    oeid, 0);
			}
			rxrp->id = ix;
			rxrp->gref = ref;
		}
	}

	/*
	 * Reset the ring pointers to initial state.
	 * Hang buffers for any empty ring slots.
	 */
	xnfp->xnf_rx_ring.rsp_cons = 0;
	xnfp->xnf_rx_ring.req_prod_pvt = 0;

	/* LINTED: constant in conditional context */
	SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);

	for (i = 0; i < NET_RX_RING_SIZE; i++) {
		/* Set before the rx_buffer_hang() call below. */
		xnfp->xnf_rx_ring.req_prod_pvt = i;
		if (xnfp->xnf_rxpkt_bufptr[i] != NULL)
			continue;
		/* Out of buffers: publish only the slots filled so far. */
		if ((bdesc = xnf_get_buffer(xnfp)) == NULL)
			break;
		rx_buffer_hang(xnfp, bdesc);
	}
	xnfp->xnf_rx_ring.req_prod_pvt = i;
	/* LINTED: constant in conditional context */
	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);

	mutex_exit(&xnfp->xnf_intrlock);

	return (0);

out:
	/* Granting failed: revoke whichever ring grants are still in place. */
	if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF)
		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
	xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF;

	if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF)
		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
	xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF;

	return (err);
}
459 
460 
/*
 * Called when the upper layers free a message we passed upstream.
 *
 * Tears down the buffer descriptor completely: the DMA handle is unbound,
 * the DMA memory and the handle are freed, and finally the descriptor
 * itself is released.
 */
static void
xnf_copy_rcv_complete(struct xnf_buffer_desc *bdesc)
{
	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
	ddi_dma_mem_free(&bdesc->acc_handle);
	ddi_dma_free_handle(&bdesc->dma_handle);
	kmem_free(bdesc, sizeof (*bdesc));
}
470 
471 
472 /*
473  * Connect driver to back end, called to set up communication with
474  * back end driver both initially and on resume after restore/migrate.
475  */
476 void
477 xnf_be_connect(xnf_t *xnfp)
478 {
479 	const char	*message;
480 	xenbus_transaction_t xbt;
481 	struct		xenbus_device *xsd;
482 	char		*xsname;
483 	int		err;
484 
485 	ASSERT(!xnfp->xnf_connected);
486 
487 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
488 	xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
489 
490 	err = xnf_setup_rings(xnfp);
491 	if (err != 0) {
492 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
493 		xenbus_dev_error(xsd, err, "setting up ring");
494 		return;
495 	}
496 
497 again:
498 	err = xenbus_transaction_start(&xbt);
499 	if (err != 0) {
500 		xenbus_dev_error(xsd, EIO, "starting transaction");
501 		return;
502 	}
503 
504 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
505 	    xnfp->xnf_tx_ring_ref);
506 	if (err != 0) {
507 		message = "writing tx ring-ref";
508 		goto abort_transaction;
509 	}
510 
511 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
512 	    xnfp->xnf_rx_ring_ref);
513 	if (err != 0) {
514 		message = "writing rx ring-ref";
515 		goto abort_transaction;
516 	}
517 
518 	err = xenbus_printf(xbt, xsname, "event-channel", "%u",
519 	    xnfp->xnf_evtchn);
520 	if (err != 0) {
521 		message = "writing event-channel";
522 		goto abort_transaction;
523 	}
524 
525 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
526 	if (err != 0) {
527 		message = "writing feature-rx-notify";
528 		goto abort_transaction;
529 	}
530 
531 	if (!xnfp->xnf_tx_pages_readonly) {
532 		err = xenbus_printf(xbt, xsname, "feature-tx-writable",
533 		    "%d", 1);
534 		if (err != 0) {
535 			message = "writing feature-tx-writable";
536 			goto abort_transaction;
537 		}
538 	}
539 
540 	err = xenbus_printf(xbt, xsname, "feature-no-csum-offload", "%d",
541 	    xnfp->xnf_cksum_offload ? 0 : 1);
542 	if (err != 0) {
543 		message = "writing feature-no-csum-offload";
544 		goto abort_transaction;
545 	}
546 	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d",
547 	    xnfp->xnf_rx_hvcopy ? 1 : 0);
548 	if (err != 0) {
549 		message = "writing request-rx-copy";
550 		goto abort_transaction;
551 	}
552 
553 	err = xenbus_printf(xbt, xsname, "state", "%d", XenbusStateConnected);
554 	if (err != 0) {
555 		message = "writing frontend XenbusStateConnected";
556 		goto abort_transaction;
557 	}
558 
559 	err = xenbus_transaction_end(xbt, 0);
560 	if (err != 0) {
561 		if (err == EAGAIN)
562 			goto again;
563 		xenbus_dev_error(xsd, err, "completing transaction");
564 	}
565 
566 	return;
567 
568 abort_transaction:
569 	(void) xenbus_transaction_end(xbt, 1);
570 	xenbus_dev_error(xsd, err, "%s", message);
571 }
572 
573 /*
574  * Read config info from xenstore
575  */
576 void
577 xnf_read_config(xnf_t *xnfp)
578 {
579 	char		mac[ETHERADDRL * 3];
580 	int		err, be_no_cksum_offload;
581 
582 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo), "mac",
583 	    "%s", (char *)&mac[0]);
584 	if (err != 0) {
585 		/*
586 		 * bad: we're supposed to be set up with a proper mac
587 		 * addr. at this point
588 		 */
589 		cmn_err(CE_WARN, "%s%d: no mac address",
590 		    ddi_driver_name(xnfp->xnf_devinfo),
591 		    ddi_get_instance(xnfp->xnf_devinfo));
592 			return;
593 	}
594 	if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
595 		err = ENOENT;
596 		xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
597 		    "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
598 		return;
599 	}
600 
601 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo),
602 	    "feature-no-csum-offload", "%d", &be_no_cksum_offload);
603 	/*
604 	 * If we fail to read the store we assume that the key is
605 	 * absent, implying an older domain at the far end.  Older
606 	 * domains always support checksum offload.
607 	 */
608 	if (err != 0)
609 		be_no_cksum_offload = 0;
610 	/*
611 	 * If the far end cannot do checksum offload or we do not wish
612 	 * to do it, disable it.
613 	 */
614 	if ((be_no_cksum_offload == 1) || !xnfp->xnf_cksum_offload)
615 		xnfp->xnf_cksum_offload = B_FALSE;
616 }
617 
618 /*
619  *  attach(9E) -- Attach a device to the system
620  *
621  *  Called once for each board successfully probed.
622  */
623 static int
624 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
625 {
626 	mac_register_t *macp;
627 	xnf_t *xnfp;
628 	int err;
629 
630 #ifdef XNF_DEBUG
631 	if (xnfdebug & XNF_DEBUG_DDI)
632 		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
633 		    (void *)devinfo);
634 #endif
635 
636 	switch (cmd) {
637 	case DDI_RESUME:
638 		xnfp = ddi_get_driver_private(devinfo);
639 
640 		(void) xvdi_resume(devinfo);
641 		(void) xvdi_alloc_evtchn(devinfo);
642 		xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
643 #ifdef XPV_HVM_DRIVER
644 		ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
645 		    xnfp);
646 #else
647 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
648 		    (caddr_t)xnfp);
649 #endif
650 		xnf_be_connect(xnfp);
651 		/*
652 		 * Our MAC address may have changed if we're resuming:
653 		 * - on a different host
654 		 * - on the same one and got a different MAC address
655 		 *   because we didn't specify one of our own.
656 		 * so it's useful to claim that it changed in order that
657 		 * IP send out a gratuitous ARP.
658 		 */
659 		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
660 		return (DDI_SUCCESS);
661 
662 	case DDI_ATTACH:
663 		break;
664 
665 	default:
666 		return (DDI_FAILURE);
667 	}
668 
669 	/*
670 	 *  Allocate gld_mac_info_t and xnf_instance structures
671 	 */
672 	macp = mac_alloc(MAC_VERSION);
673 	if (macp == NULL)
674 		return (DDI_FAILURE);
675 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
676 
677 	macp->m_dip = devinfo;
678 	macp->m_driver = xnfp;
679 	xnfp->xnf_devinfo = devinfo;
680 
681 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
682 	macp->m_src_addr = xnfp->xnf_mac_addr;
683 	macp->m_callbacks = &xnf_callbacks;
684 	macp->m_min_sdu = 0;
685 	macp->m_max_sdu = XNF_MAXPKT;
686 
687 	xnfp->xnf_running = B_FALSE;
688 	xnfp->xnf_connected = B_FALSE;
689 	xnfp->xnf_cksum_offload = xnf_cksum_offload;
690 	xnfp->xnf_tx_pages_readonly = xnf_tx_pages_readonly;
691 
692 	xnfp->xnf_rx_hvcopy = xnf_hvcopy_peer_status(devinfo) && xnf_rx_hvcopy;
693 #ifdef XPV_HVM_DRIVER
694 	/*
695 	 * Report our version to dom0.
696 	 */
697 	if (xenbus_printf(XBT_NULL, "hvmpv/xnf", "version", "%d",
698 	    HVMPV_XNF_VERS))
699 		cmn_err(CE_WARN, "xnf: couldn't write version\n");
700 
701 	if (!xnfp->xnf_rx_hvcopy) {
702 		cmn_err(CE_WARN, "The xnf driver requires a dom0 that "
703 		    "supports 'feature-rx-copy'");
704 		goto failure;
705 	}
706 #endif
707 
708 	/*
709 	 * Get the iblock cookie with which to initialize the mutexes.
710 	 */
711 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
712 	    != DDI_SUCCESS)
713 		goto failure;
714 	/*
715 	 * Driver locking strategy: the txlock protects all paths
716 	 * through the driver, except the interrupt thread.
717 	 * If the interrupt thread needs to do something which could
718 	 * affect the operation of any other part of the driver,
719 	 * it needs to acquire the txlock mutex.
720 	 */
721 	mutex_init(&xnfp->xnf_tx_buf_mutex,
722 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
723 	mutex_init(&xnfp->xnf_rx_buf_mutex,
724 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
725 	mutex_init(&xnfp->xnf_txlock,
726 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
727 	mutex_init(&xnfp->xnf_intrlock,
728 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
729 	cv_init(&xnfp->xnf_cv, NULL, CV_DEFAULT, NULL);
730 
731 	xnfp->xnf_gref_tx_head = (grant_ref_t)-1;
732 	xnfp->xnf_gref_rx_head = (grant_ref_t)-1;
733 	if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
734 	    &xnfp->xnf_gref_tx_head) < 0) {
735 		cmn_err(CE_WARN, "xnf%d: can't alloc tx grant refs",
736 		    ddi_get_instance(xnfp->xnf_devinfo));
737 		goto failure_1;
738 	}
739 	if (gnttab_alloc_grant_references(NET_RX_RING_SIZE,
740 	    &xnfp->xnf_gref_rx_head) < 0) {
741 		cmn_err(CE_WARN, "xnf%d: can't alloc rx grant refs",
742 		    ddi_get_instance(xnfp->xnf_devinfo));
743 		goto failure_1;
744 	}
745 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
746 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
747 		    "driver data structures",
748 		    ddi_get_instance(xnfp->xnf_devinfo));
749 		goto failure_1;
750 	}
751 
752 	xnfp->xnf_rx_ring.sring->rsp_event =
753 	    xnfp->xnf_tx_ring.sring->rsp_event = 1;
754 
755 	xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF;
756 	xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF;
757 
758 	/* set driver private pointer now */
759 	ddi_set_driver_private(devinfo, xnfp);
760 
761 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change)
762 	    != DDI_SUCCESS)
763 		goto failure_1;
764 
765 	if (!xnf_kstat_init(xnfp))
766 		goto failure_2;
767 
768 	/*
769 	 * Allocate an event channel, add the interrupt handler and
770 	 * bind it to the event channel.
771 	 */
772 	(void) xvdi_alloc_evtchn(devinfo);
773 	xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
774 #ifdef XPV_HVM_DRIVER
775 	ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
776 #else
777 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
778 #endif
779 
780 	xnf_read_config(xnfp);
781 	err = mac_register(macp, &xnfp->xnf_mh);
782 	mac_free(macp);
783 	macp = NULL;
784 	if (err != 0)
785 		goto failure_3;
786 
787 #ifdef XPV_HVM_DRIVER
788 	/*
789 	 * In the HVM case, this driver essentially replaces a driver for
790 	 * a 'real' PCI NIC. Without the "model" property set to
791 	 * "Ethernet controller", like the PCI code does, netbooting does
792 	 * not work correctly, as strplumb_get_netdev_path() will not find
793 	 * this interface.
794 	 */
795 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
796 	    "Ethernet controller");
797 #endif
798 
799 	/*
800 	 * connect to the backend
801 	 */
802 	xnf_be_connect(xnfp);
803 
804 	return (DDI_SUCCESS);
805 
806 failure_3:
807 	kstat_delete(xnfp->xnf_kstat_aux);
808 #ifdef XPV_HVM_DRIVER
809 	ec_unbind_evtchn(xnfp->xnf_evtchn);
810 	xvdi_free_evtchn(devinfo);
811 #else
812 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
813 #endif
814 	xnfp->xnf_evtchn = INVALID_EVTCHN;
815 
816 failure_2:
817 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
818 
819 failure_1:
820 	if (xnfp->xnf_gref_tx_head != (grant_ref_t)-1)
821 		gnttab_free_grant_references(xnfp->xnf_gref_tx_head);
822 	if (xnfp->xnf_gref_rx_head != (grant_ref_t)-1)
823 		gnttab_free_grant_references(xnfp->xnf_gref_rx_head);
824 	xnf_release_dma_resources(xnfp);
825 	cv_destroy(&xnfp->xnf_cv);
826 	mutex_destroy(&xnfp->xnf_rx_buf_mutex);
827 	mutex_destroy(&xnfp->xnf_txlock);
828 	mutex_destroy(&xnfp->xnf_intrlock);
829 
830 failure:
831 	kmem_free(xnfp, sizeof (*xnfp));
832 	if (macp != NULL)
833 		mac_free(macp);
834 
835 	return (DDI_FAILURE);
836 }
837 
838 /*  detach(9E) -- Detach a device from the system */
839 static int
840 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
841 {
842 	xnf_t *xnfp;		/* Our private device info */
843 	int i;
844 
845 #ifdef XNF_DEBUG
846 	if (xnfdebug & XNF_DEBUG_DDI)
847 		printf("xnf_detach(0x%p)\n", (void *)devinfo);
848 #endif
849 
850 	xnfp = ddi_get_driver_private(devinfo);
851 
852 	switch (cmd) {
853 	case DDI_SUSPEND:
854 #ifdef XPV_HVM_DRIVER
855 		ec_unbind_evtchn(xnfp->xnf_evtchn);
856 		xvdi_free_evtchn(devinfo);
857 #else
858 		ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
859 #endif
860 
861 		xvdi_suspend(devinfo);
862 
863 		mutex_enter(&xnfp->xnf_intrlock);
864 		mutex_enter(&xnfp->xnf_txlock);
865 
866 		xnfp->xnf_evtchn = INVALID_EVTCHN;
867 		xnfp->xnf_connected = B_FALSE;
868 		mutex_exit(&xnfp->xnf_txlock);
869 		mutex_exit(&xnfp->xnf_intrlock);
870 		return (DDI_SUCCESS);
871 
872 	case DDI_DETACH:
873 		break;
874 
875 	default:
876 		return (DDI_FAILURE);
877 	}
878 
879 	if (xnfp->xnf_connected)
880 		return (DDI_FAILURE);
881 
882 	/* Wait for receive buffers to be returned; give up after 5 seconds */
883 	i = 50;
884 
885 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
886 	while (xnfp->xnf_rx_bufs_outstanding > 0) {
887 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
888 		delay(drv_usectohz(100000));
889 		if (--i == 0) {
890 			cmn_err(CE_WARN,
891 			    "xnf%d: never reclaimed all the "
892 			    "receive buffers.  Still have %d "
893 			    "buffers outstanding.",
894 			    ddi_get_instance(xnfp->xnf_devinfo),
895 			    xnfp->xnf_rx_bufs_outstanding);
896 			return (DDI_FAILURE);
897 		}
898 		mutex_enter(&xnfp->xnf_rx_buf_mutex);
899 	}
900 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
901 
902 	if (mac_unregister(xnfp->xnf_mh) != 0)
903 		return (DDI_FAILURE);
904 
905 	kstat_delete(xnfp->xnf_kstat_aux);
906 
907 	/* Stop the receiver */
908 	xnf_stop(xnfp);
909 
910 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
911 
912 	/* Remove the interrupt */
913 #ifdef XPV_HVM_DRIVER
914 	ec_unbind_evtchn(xnfp->xnf_evtchn);
915 	xvdi_free_evtchn(devinfo);
916 #else
917 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
918 #endif
919 
920 	/* Release any pending xmit mblks */
921 	xnf_release_mblks(xnfp);
922 
923 	/* Release all DMA resources */
924 	xnf_release_dma_resources(xnfp);
925 
926 	cv_destroy(&xnfp->xnf_cv);
927 	mutex_destroy(&xnfp->xnf_rx_buf_mutex);
928 	mutex_destroy(&xnfp->xnf_txlock);
929 	mutex_destroy(&xnfp->xnf_intrlock);
930 
931 	kmem_free(xnfp, sizeof (*xnfp));
932 
933 	return (DDI_SUCCESS);
934 }
935 
936 /*
937  *  xnf_set_mac_addr() -- set the physical network address on the board.
938  */
939 /*ARGSUSED*/
940 static int
941 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
942 {
943 	xnf_t *xnfp = arg;
944 
945 #ifdef XNF_DEBUG
946 	if (xnfdebug & XNF_DEBUG_TRACE)
947 		printf("xnf%d: set_mac_addr(0x%p): "
948 		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
949 		    ddi_get_instance(xnfp->xnf_devinfo),
950 		    (void *)xnfp, macaddr[0], macaddr[1], macaddr[2],
951 		    macaddr[3], macaddr[4], macaddr[5]);
952 #endif
953 	/*
954 	 * We can't set our macaddr.
955 	 *
956 	 * XXPV dme: Why not?
957 	 */
958 	return (ENOTSUP);
959 }
960 
961 /*
962  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
963  *
964  *  Program the hardware to enable/disable the multicast address
965  *  in "mcast".  Enable if "add" is true, disable if false.
966  */
967 /*ARGSUSED*/
968 static int
969 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
970 {
971 	xnf_t *xnfp = arg;
972 
973 #ifdef XNF_DEBUG
974 	if (xnfdebug & XNF_DEBUG_TRACE)
975 		printf("xnf%d set_multicast(0x%p): "
976 		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
977 		    ddi_get_instance(xnfp->xnf_devinfo),
978 		    (void *)xnfp, mca[0], mca[1], mca[2],
979 		    mca[3], mca[4], mca[5]);
980 #endif
981 
982 	/*
983 	 * XXPV dme: Ideally we'd relay the address to the backend for
984 	 * enabling.  The protocol doesn't support that (interesting
985 	 * extension), so we simply succeed and hope that the relevant
986 	 * packets are going to arrive.
987 	 *
988 	 * If protocol support is added for enable/disable then we'll
989 	 * need to keep a list of those in use and re-add on resume.
990 	 */
991 	return (0);
992 }
993 
994 /*
995  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
996  *
997  *  Program the hardware to enable/disable promiscuous mode.
998  */
999 /*ARGSUSED*/
1000 static int
1001 xnf_set_promiscuous(void *arg, boolean_t on)
1002 {
1003 	xnf_t *xnfp = arg;
1004 
1005 #ifdef XNF_DEBUG
1006 	if (xnfdebug & XNF_DEBUG_TRACE)
1007 		printf("xnf%d set_promiscuous(0x%p, %x)\n",
1008 		    ddi_get_instance(xnfp->xnf_devinfo),
1009 		    (void *)xnfp, on);
1010 #endif
1011 	/*
1012 	 * We can't really do this, but we pretend that we can in
1013 	 * order that snoop will work.
1014 	 */
1015 	return (0);
1016 }
1017 
/*
 * Clean buffers that we have responses for from the transmit ring.
 *
 * For every unconsumed response the matching tx_pktinfo entry is pushed
 * back onto the free id list, its DMA binding and grant are released and
 * its message (and bounce buffer, if any) is freed.  The caller must hold
 * xnf_txlock.
 *
 * Returns the value of RING_FREE_REQUESTS() for the TX ring once no
 * responses remain.
 */
static int
xnf_clean_tx_ring(xnf_t *xnfp)
{
	RING_IDX		next_resp, i;
	struct tx_pktinfo	*reap;
	int			id;
	grant_ref_t		ref;
	boolean_t		work_to_do;

	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));

loop:
	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
		/*
		 * index of next transmission ack
		 */
		next_resp = xnfp->xnf_tx_ring.sring->rsp_prod;
		/* Barrier between reading rsp_prod and reading responses. */
		membar_consumer();
		/*
		 * Clean tx packets from ring that we have responses for
		 */
		for (i = xnfp->xnf_tx_ring.rsp_cons; i != next_resp; i++) {
			/*
			 * NOTE(review): the id comes from the shared ring and
			 * indexes xnf_tx_pkt_info without a range check --
			 * confirm the peer is trusted to supply valid ids.
			 */
			id = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i)->id;
			reap = &xnfp->xnf_tx_pkt_info[id];
			ref = reap->grant_ref;
			/*
			 * Return id to free list
			 */
			reap->id = xnfp->xnf_tx_pkt_id_list;
			xnfp->xnf_tx_pkt_id_list = id;
			if (gnttab_query_foreign_access(ref) != 0)
				panic("tx grant still in use "
				    "by backend domain");
			(void) ddi_dma_unbind_handle(reap->dma_handle);
			/* Revoke the grant and return it to the tx pool. */
			(void) gnttab_end_foreign_access_ref(ref,
			    xnfp->xnf_tx_pages_readonly);
			gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head,
			    ref);
			freemsg(reap->mp);
			reap->mp = NULL;
			reap->grant_ref = GRANT_INVALID_REF;
			if (reap->bdesc != NULL)
				xnf_free_tx_buffer(reap->bdesc);
			reap->bdesc = NULL;
		}
		xnfp->xnf_tx_ring.rsp_cons = next_resp;
		membar_enter();
	}

	/* Re-check for responses that arrived while we were cleaning. */
	/* LINTED: constant in conditional context */
	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
	if (work_to_do)
		goto loop;

	return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
}
1077 
1078 /*
1079  * If we need to pull up data from either a packet that crosses a page
1080  * boundary or consisting of multiple mblks, do it here.  We allocate
1081  * a page aligned buffer and copy the data into it.  The header for the
1082  * allocated buffer is returned. (which is also allocated here)
1083  */
1084 static struct xnf_buffer_desc *
1085 xnf_pullupmsg(xnf_t *xnfp, mblk_t *mp)
1086 {
1087 	struct xnf_buffer_desc	*bdesc;
1088 	mblk_t			*mptr;
1089 	caddr_t			bp;
1090 	int			len;
1091 
1092 	/*
1093 	 * get a xmit buffer from the xmit buffer pool
1094 	 */
1095 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
1096 	bdesc = xnf_get_tx_buffer(xnfp);
1097 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
1098 	if (bdesc == NULL)
1099 		return (bdesc);
1100 	/*
1101 	 * Copy the data into the buffer
1102 	 */
1103 	xnfp->xnf_stat_tx_pullup++;
1104 	bp = bdesc->buf;
1105 	for (mptr = mp; mptr != NULL; mptr = mptr->b_cont) {
1106 		len = mptr->b_wptr - mptr->b_rptr;
1107 		bcopy(mptr->b_rptr, bp, len);
1108 		bp += len;
1109 	}
1110 	return (bdesc);
1111 }
1112 
1113 void
1114 xnf_pseudo_cksum(caddr_t buf, int length)
1115 {
1116 	struct ether_header *ehp;
1117 	uint16_t sap, len, *stuff;
1118 	uint32_t cksum;
1119 	size_t offset;
1120 	ipha_t *ipha;
1121 	ipaddr_t src, dst;
1122 
1123 	ASSERT(length >= sizeof (*ehp));
1124 	ehp = (struct ether_header *)buf;
1125 
1126 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
1127 		struct ether_vlan_header *evhp;
1128 
1129 		ASSERT(length >= sizeof (*evhp));
1130 		evhp = (struct ether_vlan_header *)buf;
1131 		sap = ntohs(evhp->ether_type);
1132 		offset = sizeof (*evhp);
1133 	} else {
1134 		sap = ntohs(ehp->ether_type);
1135 		offset = sizeof (*ehp);
1136 	}
1137 
1138 	ASSERT(sap == ETHERTYPE_IP);
1139 
1140 	/* Packet should have been pulled up by the caller. */
1141 	if ((offset + sizeof (ipha_t)) > length) {
1142 		cmn_err(CE_WARN, "xnf_pseudo_cksum: no room for checksum");
1143 		return;
1144 	}
1145 
1146 	ipha = (ipha_t *)(buf + offset);
1147 
1148 	ASSERT(IPH_HDR_LENGTH(ipha) == IP_SIMPLE_HDR_LENGTH);
1149 
1150 	len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
1151 
1152 	switch (ipha->ipha_protocol) {
1153 	case IPPROTO_TCP:
1154 		stuff = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
1155 		cksum = IP_TCP_CSUM_COMP;
1156 		break;
1157 	case IPPROTO_UDP:
1158 		stuff = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
1159 		cksum = IP_UDP_CSUM_COMP;
1160 		break;
1161 	default:
1162 		cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
1163 		    ipha->ipha_protocol);
1164 		return;
1165 	}
1166 
1167 	src = ipha->ipha_src;
1168 	dst = ipha->ipha_dst;
1169 
1170 	cksum += (dst >> 16) + (dst & 0xFFFF);
1171 	cksum += (src >> 16) + (src & 0xFFFF);
1172 	cksum += htons(len);
1173 
1174 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1175 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1176 
1177 	ASSERT(cksum <= 0xFFFF);
1178 
1179 	*stuff = (uint16_t)(cksum ? cksum : ~cksum);
1180 }
1181 
1182 /*
1183  *  xnf_send_one() -- send a packet
1184  *
1185  *  Called when a packet is ready to be transmitted. A pointer to an
1186  *  M_DATA message that contains the packet is passed to this routine.
1187  *  At least the complete LLC header is contained in the message's
1188  *  first message block, and the remainder of the packet is contained
1189  *  within additional M_DATA message blocks linked to the first
1190  *  message block.
1191  *
1192  */
1193 static boolean_t
1194 xnf_send_one(xnf_t *xnfp, mblk_t *mp)
1195 {
1196 	struct xnf_buffer_desc	*xmitbuf;
1197 	struct tx_pktinfo	*txp_info;
1198 	mblk_t			*mptr;
1199 	ddi_dma_cookie_t	dma_cookie;
1200 	RING_IDX		slot;
1201 	int			length = 0, i, pktlen = 0, rc, tx_id;
1202 	int			tx_ring_freespace, page_oops;
1203 	uint_t			ncookies;
1204 	volatile netif_tx_request_t	*txrp;
1205 	caddr_t			bufaddr;
1206 	grant_ref_t		ref;
1207 	unsigned long		mfn;
1208 	uint32_t		pflags;
1209 	domid_t			oeid;
1210 
1211 #ifdef XNF_DEBUG
1212 	if (xnfdebug & XNF_DEBUG_SEND)
1213 		printf("xnf%d send(0x%p, 0x%p)\n",
1214 		    ddi_get_instance(xnfp->xnf_devinfo),
1215 		    (void *)xnfp, (void *)mp);
1216 #endif
1217 
1218 	ASSERT(mp != NULL);
1219 	ASSERT(mp->b_next == NULL);
1220 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1221 
1222 	tx_ring_freespace = xnf_clean_tx_ring(xnfp);
1223 	ASSERT(tx_ring_freespace >= 0);
1224 
1225 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1226 	xnfp->xnf_stat_tx_attempt++;
1227 	/*
1228 	 * If there are no xmit ring slots available, return.
1229 	 */
1230 	if (tx_ring_freespace == 0) {
1231 		xnfp->xnf_stat_tx_defer++;
1232 		return (B_FALSE);	/* Send should be retried */
1233 	}
1234 
1235 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1236 	/* Count the number of mblks in message and compute packet size */
1237 	for (i = 0, mptr = mp; mptr != NULL; mptr = mptr->b_cont, i++)
1238 		pktlen += (mptr->b_wptr - mptr->b_rptr);
1239 
1240 	/* Make sure packet isn't too large */
1241 	if (pktlen > XNF_FRAMESIZE) {
1242 		cmn_err(CE_WARN, "xnf%d: oversized packet (%d bytes) dropped",
1243 		    ddi_get_instance(xnfp->xnf_devinfo), pktlen);
1244 		freemsg(mp);
1245 		return (B_TRUE);
1246 	}
1247 
1248 	/*
1249 	 * Test if we cross a page boundary with our buffer
1250 	 */
1251 	page_oops = (i == 1) &&
1252 	    (xnf_btop((size_t)mp->b_rptr) !=
1253 	    xnf_btop((size_t)(mp->b_rptr + pktlen)));
1254 	/*
1255 	 * XXPV - unfortunately, the Xen virtual net device currently
1256 	 * doesn't support multiple packet frags, so this will always
1257 	 * end up doing the pullup if we got more than one packet.
1258 	 */
1259 	if (i > xnf_max_tx_frags || page_oops) {
1260 		if (page_oops)
1261 			xnfp->xnf_stat_tx_pagebndry++;
1262 		if ((xmitbuf = xnf_pullupmsg(xnfp, mp)) == NULL) {
1263 			/* could not allocate resources? */
1264 #ifdef XNF_DEBUG
1265 			cmn_err(CE_WARN, "xnf%d: pullupmsg failed",
1266 			    ddi_get_instance(xnfp->xnf_devinfo));
1267 #endif
1268 			xnfp->xnf_stat_tx_defer++;
1269 			return (B_FALSE);	/* Retry send */
1270 		}
1271 		bufaddr = xmitbuf->buf;
1272 	} else {
1273 		xmitbuf = NULL;
1274 		bufaddr = (caddr_t)mp->b_rptr;
1275 	}
1276 
1277 	/* set up data descriptor */
1278 	length = pktlen;
1279 
1280 	/*
1281 	 * Get packet id from free list
1282 	 */
1283 	tx_id = xnfp->xnf_tx_pkt_id_list;
1284 	ASSERT(tx_id < NET_TX_RING_SIZE);
1285 	txp_info = &xnfp->xnf_tx_pkt_info[tx_id];
1286 	xnfp->xnf_tx_pkt_id_list = txp_info->id;
1287 	txp_info->id = tx_id;
1288 
1289 	/* Prepare for DMA mapping of tx buffer(s) */
1290 	rc = ddi_dma_addr_bind_handle(txp_info->dma_handle,
1291 	    NULL, bufaddr, length, DDI_DMA_WRITE | DDI_DMA_STREAMING,
1292 	    DDI_DMA_DONTWAIT, 0, &dma_cookie, &ncookies);
1293 	if (rc != DDI_DMA_MAPPED) {
1294 		ASSERT(rc != DDI_DMA_INUSE);
1295 		ASSERT(rc != DDI_DMA_PARTIAL_MAP);
1296 		/*
1297 		 *  Return id to free list
1298 		 */
1299 		txp_info->id = xnfp->xnf_tx_pkt_id_list;
1300 		xnfp->xnf_tx_pkt_id_list = tx_id;
1301 		if (rc == DDI_DMA_NORESOURCES) {
1302 			xnfp->xnf_stat_tx_defer++;
1303 			return (B_FALSE); /* Retry later */
1304 		}
1305 #ifdef XNF_DEBUG
1306 		cmn_err(CE_WARN, "xnf%d: bind_handle failed (%x)",
1307 		    ddi_get_instance(xnfp->xnf_devinfo), rc);
1308 #endif
1309 		return (B_FALSE);
1310 	}
1311 
1312 	ASSERT(ncookies == 1);
1313 	ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_tx_head);
1314 	ASSERT((signed short)ref >= 0);
1315 	mfn = xnf_btop(pa_to_ma((paddr_t)dma_cookie.dmac_laddress));
1316 	gnttab_grant_foreign_access_ref(ref, oeid, mfn,
1317 	    xnfp->xnf_tx_pages_readonly);
1318 	txp_info->grant_ref = ref;
1319 	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1320 	txrp->gref = ref;
1321 	txrp->size = dma_cookie.dmac_size;
1322 	txrp->offset = (uintptr_t)bufaddr & PAGEOFFSET;
1323 	txrp->id = tx_id;
1324 	txrp->flags = 0;
1325 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &pflags);
1326 	if (pflags != 0) {
1327 		ASSERT(xnfp->xnf_cksum_offload);
1328 		/*
1329 		 * If the local protocol stack requests checksum
1330 		 * offload we set the 'checksum blank' flag,
1331 		 * indicating to the peer that we need the checksum
1332 		 * calculated for us.
1333 		 *
1334 		 * We _don't_ set the validated flag, because we haven't
1335 		 * validated that the data and the checksum match.
1336 		 */
1337 		xnf_pseudo_cksum(bufaddr, length);
1338 		txrp->flags |= NETTXF_csum_blank;
1339 		xnfp->xnf_stat_tx_cksum_deferred++;
1340 	}
1341 	membar_producer();
1342 	xnfp->xnf_tx_ring.req_prod_pvt = slot + 1;
1343 
1344 	txp_info->mp = mp;
1345 	txp_info->bdesc = xmitbuf;
1346 
1347 	xnfp->xnf_stat_opackets++;
1348 	xnfp->xnf_stat_obytes += pktlen;
1349 
1350 	return (B_TRUE);	/* successful transmit attempt */
1351 }
1352 
1353 mblk_t *
1354 xnf_send(void *arg, mblk_t *mp)
1355 {
1356 	xnf_t *xnfp = arg;
1357 	mblk_t *next;
1358 	boolean_t sent_something = B_FALSE;
1359 
1360 	mutex_enter(&xnfp->xnf_txlock);
1361 
1362 	/*
1363 	 * Transmission attempts should be impossible without having
1364 	 * previously called xnf_start().
1365 	 */
1366 	ASSERT(xnfp->xnf_running);
1367 
1368 	/*
1369 	 * Wait for getting connected to the backend
1370 	 */
1371 	while (!xnfp->xnf_connected) {
1372 		cv_wait(&xnfp->xnf_cv, &xnfp->xnf_txlock);
1373 	}
1374 
1375 	while (mp != NULL) {
1376 		next = mp->b_next;
1377 		mp->b_next = NULL;
1378 
1379 		if (!xnf_send_one(xnfp, mp)) {
1380 			mp->b_next = next;
1381 			break;
1382 		}
1383 
1384 		mp = next;
1385 		sent_something = B_TRUE;
1386 	}
1387 
1388 	if (sent_something) {
1389 		boolean_t notify;
1390 
1391 		/* LINTED: constant in conditional context */
1392 		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1393 		    notify);
1394 		if (notify)
1395 			ec_notify_via_evtchn(xnfp->xnf_evtchn);
1396 	}
1397 
1398 	mutex_exit(&xnfp->xnf_txlock);
1399 
1400 	return (mp);
1401 }
1402 
1403 /*
1404  *  xnf_intr() -- ring interrupt service routine
1405  */
1406 static uint_t
1407 xnf_intr(caddr_t arg)
1408 {
1409 	xnf_t *xnfp = (xnf_t *)arg;
1410 	int tx_ring_space;
1411 
1412 	mutex_enter(&xnfp->xnf_intrlock);
1413 
1414 	/* spurious intr */
1415 	if (!xnfp->xnf_connected) {
1416 		mutex_exit(&xnfp->xnf_intrlock);
1417 		xnfp->xnf_stat_unclaimed_interrupts++;
1418 		return (DDI_INTR_UNCLAIMED);
1419 	}
1420 
1421 #ifdef XNF_DEBUG
1422 	if (xnfdebug & XNF_DEBUG_INT)
1423 		printf("xnf%d intr(0x%p)\n",
1424 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1425 #endif
1426 	if (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1427 		mblk_t *mp;
1428 
1429 		if (xnfp->xnf_rx_hvcopy)
1430 			mp = xnf_process_hvcopy_recv(xnfp);
1431 		else
1432 			mp = xnf_process_recv(xnfp);
1433 
1434 		if (mp != NULL)
1435 			mac_rx(xnfp->xnf_mh, xnfp->xnf_rx_handle, mp);
1436 	}
1437 
1438 	/*
1439 	 * Clean tx ring and try to start any blocked xmit streams if
1440 	 * there is now some space.
1441 	 */
1442 	mutex_enter(&xnfp->xnf_txlock);
1443 	tx_ring_space = xnf_clean_tx_ring(xnfp);
1444 	mutex_exit(&xnfp->xnf_txlock);
1445 	if (tx_ring_space > XNF_TX_FREE_THRESH) {
1446 		mutex_exit(&xnfp->xnf_intrlock);
1447 		mac_tx_update(xnfp->xnf_mh);
1448 		mutex_enter(&xnfp->xnf_intrlock);
1449 	}
1450 
1451 	xnfp->xnf_stat_interrupts++;
1452 	mutex_exit(&xnfp->xnf_intrlock);
1453 	return (DDI_INTR_CLAIMED); /* indicate that the interrupt was for us */
1454 }
1455 
1456 /*
1457  *  xnf_start() -- start the board receiving and enable interrupts.
1458  */
1459 static int
1460 xnf_start(void *arg)
1461 {
1462 	xnf_t *xnfp = arg;
1463 
1464 #ifdef XNF_DEBUG
1465 	if (xnfdebug & XNF_DEBUG_TRACE)
1466 		printf("xnf%d start(0x%p)\n",
1467 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1468 #endif
1469 
1470 	mutex_enter(&xnfp->xnf_intrlock);
1471 	mutex_enter(&xnfp->xnf_txlock);
1472 
1473 	/* Accept packets from above. */
1474 	xnfp->xnf_running = B_TRUE;
1475 
1476 	mutex_exit(&xnfp->xnf_txlock);
1477 	mutex_exit(&xnfp->xnf_intrlock);
1478 
1479 	return (0);
1480 }
1481 
1482 /* xnf_stop() - disable hardware */
1483 static void
1484 xnf_stop(void *arg)
1485 {
1486 	xnf_t *xnfp = arg;
1487 
1488 #ifdef XNF_DEBUG
1489 	if (xnfdebug & XNF_DEBUG_TRACE)
1490 		printf("xnf%d stop(0x%p)\n",
1491 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1492 #endif
1493 
1494 	mutex_enter(&xnfp->xnf_intrlock);
1495 	mutex_enter(&xnfp->xnf_txlock);
1496 
1497 	xnfp->xnf_running = B_FALSE;
1498 
1499 	mutex_exit(&xnfp->xnf_txlock);
1500 	mutex_exit(&xnfp->xnf_intrlock);
1501 }
1502 
1503 /*
1504  * Driver private functions follow
1505  */
1506 
1507 /*
1508  * Hang buffer on rx ring
1509  */
1510 static void
1511 rx_buffer_hang(xnf_t *xnfp, struct xnf_buffer_desc *bdesc)
1512 {
1513 	volatile netif_rx_request_t	*reqp;
1514 	RING_IDX			hang_ix;
1515 	grant_ref_t			ref;
1516 	domid_t				oeid;
1517 
1518 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1519 
1520 	ASSERT(MUTEX_HELD(&xnfp->xnf_intrlock));
1521 	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
1522 	    xnfp->xnf_rx_ring.req_prod_pvt);
1523 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
1524 	ASSERT(xnfp->xnf_rxpkt_bufptr[hang_ix] == NULL);
1525 	if (bdesc->grant_ref == GRANT_INVALID_REF) {
1526 		ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_rx_head);
1527 		ASSERT((signed short)ref >= 0);
1528 		bdesc->grant_ref = ref;
1529 		if (xnfp->xnf_rx_hvcopy) {
1530 			pfn_t pfn = xnf_btop(bdesc->buf_phys);
1531 			mfn_t mfn = pfn_to_mfn(pfn);
1532 
1533 			gnttab_grant_foreign_access_ref(ref, oeid, mfn, 0);
1534 		} else {
1535 			gnttab_grant_foreign_transfer_ref(ref, oeid, 0);
1536 		}
1537 	}
1538 	reqp->id = hang_ix;
1539 	reqp->gref = bdesc->grant_ref;
1540 	bdesc->id = hang_ix;
1541 	xnfp->xnf_rxpkt_bufptr[hang_ix] = bdesc;
1542 	membar_producer();
1543 	xnfp->xnf_rx_ring.req_prod_pvt++;
1544 }
1545 
1546 static mblk_t *
1547 xnf_process_hvcopy_recv(xnf_t *xnfp)
1548 {
1549 	netif_rx_response_t *rxpkt;
1550 	mblk_t		*mp, *head, *tail;
1551 	struct		xnf_buffer_desc *bdesc;
1552 	boolean_t	hwcsum = B_FALSE, notify, work_to_do;
1553 	size_t 		len;
1554 
1555 	/*
1556 	 * in loop over unconsumed responses, we do:
1557 	 * 1. get a response
1558 	 * 2. take corresponding buffer off recv. ring
1559 	 * 3. indicate this by setting slot to NULL
1560 	 * 4. create a new message and
1561 	 * 5. copy data in, adjust ptr
1562 	 *
1563 	 * outside loop:
1564 	 * 7. make sure no more data has arrived; kick HV
1565 	 */
1566 
1567 	head = tail = NULL;
1568 
1569 loop:
1570 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1571 
1572 		/* 1. */
1573 		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
1574 		    xnfp->xnf_rx_ring.rsp_cons);
1575 
1576 		DTRACE_PROBE4(got_PKT, int, (int)rxpkt->id, int,
1577 		    (int)rxpkt->offset,
1578 		    int, (int)rxpkt->flags, int, (int)rxpkt->status);
1579 
1580 		/*
1581 		 * 2.
1582 		 * Take buffer off of receive ring
1583 		 */
1584 		hwcsum = B_FALSE;
1585 		bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id];
1586 		/* 3 */
1587 		xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL;
1588 		ASSERT(bdesc->id == rxpkt->id);
1589 		mp = NULL;
1590 		if (!xnfp->xnf_running) {
1591 			DTRACE_PROBE4(pkt_dropped, int, rxpkt->status,
1592 			    char *, bdesc->buf, int, rxpkt->offset,
1593 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1594 			xnfp->xnf_stat_drop++;
1595 			/*
1596 			 * re-hang the buffer
1597 			 */
1598 			rx_buffer_hang(xnfp, bdesc);
1599 		} else if (rxpkt->status <= 0) {
1600 			DTRACE_PROBE4(pkt_status_negative, int, rxpkt->status,
1601 			    char *, bdesc->buf, int, rxpkt->offset,
1602 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1603 			xnfp->xnf_stat_errrx++;
1604 			if (rxpkt->status == 0)
1605 				xnfp->xnf_stat_runt++;
1606 			if (rxpkt->status == NETIF_RSP_ERROR)
1607 				xnfp->xnf_stat_mac_rcv_error++;
1608 			if (rxpkt->status == NETIF_RSP_DROPPED)
1609 				xnfp->xnf_stat_norxbuf++;
1610 			/*
1611 			 * re-hang the buffer
1612 			 */
1613 			rx_buffer_hang(xnfp, bdesc);
1614 		} else {
1615 			grant_ref_t		ref =  bdesc->grant_ref;
1616 			struct xnf_buffer_desc	*new_bdesc;
1617 			unsigned long		off = rxpkt->offset;
1618 
1619 			DTRACE_PROBE4(pkt_status_ok, int, rxpkt->status,
1620 			    char *, bdesc->buf, int, rxpkt->offset,
1621 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1622 			len = rxpkt->status;
1623 			ASSERT(off + len <= PAGEOFFSET);
1624 			if (ref == GRANT_INVALID_REF) {
1625 				mp = NULL;
1626 				new_bdesc = bdesc;
1627 				cmn_err(CE_WARN, "Bad rx grant reference %d "
1628 				    "from dom %d", ref,
1629 				    xvdi_get_oeid(xnfp->xnf_devinfo));
1630 				goto luckless;
1631 			}
1632 			/*
1633 			 * Release ref which we'll be re-claiming in
1634 			 * rx_buffer_hang().
1635 			 */
1636 			bdesc->grant_ref = GRANT_INVALID_REF;
1637 			(void) gnttab_end_foreign_access_ref(ref, 0);
1638 			gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head,
1639 			    ref);
1640 			if (rxpkt->flags & NETRXF_data_validated)
1641 				hwcsum = B_TRUE;
1642 
1643 			/*
1644 			 * XXPV for the initial implementation of HVcopy,
1645 			 * create a new msg and copy in the data
1646 			 */
1647 			/* 4. */
1648 			if ((mp = allocb(len, BPRI_MED)) == NULL) {
1649 				/*
1650 				 * Couldn't get buffer to copy to,
1651 				 * drop this data, and re-hang
1652 				 * the buffer on the ring.
1653 				 */
1654 				xnfp->xnf_stat_norxbuf++;
1655 				DTRACE_PROBE(alloc_nix);
1656 			} else {
1657 				/* 5. */
1658 				DTRACE_PROBE(alloc_ok);
1659 				bcopy(bdesc->buf + off, mp->b_wptr,
1660 				    len);
1661 				mp->b_wptr += len;
1662 			}
1663 			new_bdesc = bdesc;
1664 luckless:
1665 
1666 			/* Re-hang old or hang new buffer. */
1667 			rx_buffer_hang(xnfp, new_bdesc);
1668 		}
1669 		if (mp) {
1670 			if (hwcsum) {
1671 				/*
1672 				 * See comments in xnf_process_recv().
1673 				 */
1674 
1675 				(void) hcksum_assoc(mp, NULL,
1676 				    NULL, 0, 0, 0, 0,
1677 				    HCK_FULLCKSUM |
1678 				    HCK_FULLCKSUM_OK,
1679 				    0);
1680 				xnfp->xnf_stat_rx_cksum_no_need++;
1681 			}
1682 			if (head == NULL) {
1683 				head = tail = mp;
1684 			} else {
1685 				tail->b_next = mp;
1686 				tail = mp;
1687 			}
1688 
1689 			ASSERT(mp->b_next == NULL);
1690 
1691 			xnfp->xnf_stat_ipackets++;
1692 			xnfp->xnf_stat_rbytes += len;
1693 		}
1694 
1695 		xnfp->xnf_rx_ring.rsp_cons++;
1696 
1697 		xnfp->xnf_stat_hvcopy_packet_processed++;
1698 	}
1699 
1700 	/* 7. */
1701 	/*
1702 	 * Has more data come in since we started?
1703 	 */
1704 	/* LINTED: constant in conditional context */
1705 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do);
1706 	if (work_to_do)
1707 		goto loop;
1708 
1709 	/*
1710 	 * Indicate to the backend that we have re-filled the receive
1711 	 * ring.
1712 	 */
1713 	/* LINTED: constant in conditional context */
1714 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
1715 	if (notify)
1716 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1717 
1718 	return (head);
1719 }
1720 
1721 /* Process all queued received packets */
1722 static mblk_t *
1723 xnf_process_recv(xnf_t *xnfp)
1724 {
1725 	volatile netif_rx_response_t *rxpkt;
1726 	mblk_t *mp, *head, *tail;
1727 	struct xnf_buffer_desc *bdesc;
1728 	extern mblk_t *desballoc(unsigned char *, size_t, uint_t, frtn_t *);
1729 	boolean_t hwcsum = B_FALSE, notify, work_to_do;
1730 	size_t len;
1731 	pfn_t pfn;
1732 	long cnt;
1733 
1734 	head = tail = NULL;
1735 loop:
1736 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1737 
1738 		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
1739 		    xnfp->xnf_rx_ring.rsp_cons);
1740 
1741 		/*
1742 		 * Take buffer off of receive ring
1743 		 */
1744 		hwcsum = B_FALSE;
1745 		bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id];
1746 		xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL;
1747 		ASSERT(bdesc->id == rxpkt->id);
1748 		mp = NULL;
1749 		if (!xnfp->xnf_running) {
1750 			xnfp->xnf_stat_drop++;
1751 			/*
1752 			 * re-hang the buffer
1753 			 */
1754 			rx_buffer_hang(xnfp, bdesc);
1755 		} else if (rxpkt->status <= 0) {
1756 			xnfp->xnf_stat_errrx++;
1757 			if (rxpkt->status == 0)
1758 				xnfp->xnf_stat_runt++;
1759 			if (rxpkt->status == NETIF_RSP_ERROR)
1760 				xnfp->xnf_stat_mac_rcv_error++;
1761 			if (rxpkt->status == NETIF_RSP_DROPPED)
1762 				xnfp->xnf_stat_norxbuf++;
1763 			/*
1764 			 * re-hang the buffer
1765 			 */
1766 			rx_buffer_hang(xnfp, bdesc);
1767 		} else {
1768 			grant_ref_t ref =  bdesc->grant_ref;
1769 			struct xnf_buffer_desc *new_bdesc;
1770 			unsigned long off = rxpkt->offset;
1771 			unsigned long mfn;
1772 
1773 			len = rxpkt->status;
1774 			ASSERT(off + len <= PAGEOFFSET);
1775 			if (ref == GRANT_INVALID_REF) {
1776 				mp = NULL;
1777 				new_bdesc = bdesc;
1778 				cmn_err(CE_WARN, "Bad rx grant reference %d "
1779 				    "from dom %d", ref,
1780 				    xvdi_get_oeid(xnfp->xnf_devinfo));
1781 				goto luckless;
1782 			}
1783 			bdesc->grant_ref = GRANT_INVALID_REF;
1784 			mfn = gnttab_end_foreign_transfer_ref(ref);
1785 			ASSERT(mfn != MFN_INVALID);
1786 			ASSERT(hat_getpfnum(kas.a_hat, bdesc->buf) ==
1787 			    PFN_INVALID);
1788 
1789 			gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head,
1790 			    ref);
1791 			reassign_pfn(xnf_btop(bdesc->buf_phys), mfn);
1792 			hat_devload(kas.a_hat, bdesc->buf, PAGESIZE,
1793 			    xnf_btop(bdesc->buf_phys),
1794 			    PROT_READ | PROT_WRITE, HAT_LOAD);
1795 			balloon_drv_added(1);
1796 
1797 			if (rxpkt->flags & NETRXF_data_validated)
1798 				hwcsum = B_TRUE;
1799 			if (len <= xnf_rx_bcopy_thresh) {
1800 				/*
1801 				 * For small buffers, just copy the data
1802 				 * and send the copy upstream.
1803 				 */
1804 				new_bdesc = NULL;
1805 			} else {
1806 				/*
1807 				 * We send a pointer to this data upstream;
1808 				 * we need a new buffer to replace this one.
1809 				 */
1810 				mutex_enter(&xnfp->xnf_rx_buf_mutex);
1811 				new_bdesc = xnf_get_buffer(xnfp);
1812 				if (new_bdesc != NULL) {
1813 					xnfp->xnf_rx_bufs_outstanding++;
1814 				} else {
1815 					xnfp->xnf_stat_rx_no_ringbuf++;
1816 				}
1817 				mutex_exit(&xnfp->xnf_rx_buf_mutex);
1818 			}
1819 
1820 			if (new_bdesc == NULL) {
1821 				/*
1822 				 * Don't have a new ring buffer; bcopy the data
1823 				 * from the buffer, and preserve the
1824 				 * original buffer
1825 				 */
1826 				if ((mp = allocb(len, BPRI_MED)) == NULL) {
1827 					/*
1828 					 * Could't get buffer to copy to,
1829 					 * drop this data, and re-hang
1830 					 * the buffer on the ring.
1831 					 */
1832 					xnfp->xnf_stat_norxbuf++;
1833 				} else {
1834 					bcopy(bdesc->buf + off, mp->b_wptr,
1835 					    len);
1836 				}
1837 				/*
1838 				 * Give the buffer page back to xen
1839 				 */
1840 				pfn = xnf_btop(bdesc->buf_phys);
1841 				cnt = balloon_free_pages(1, &mfn, bdesc->buf,
1842 				    &pfn);
1843 				if (cnt != 1) {
1844 					cmn_err(CE_WARN, "unable to give a "
1845 					    "page back to the hypervisor\n");
1846 				}
1847 				new_bdesc = bdesc;
1848 			} else {
1849 				if ((mp = desballoc((unsigned char *)bdesc->buf,
1850 				    off + len, 0, (frtn_t *)bdesc)) == NULL) {
1851 					/*
1852 					 * Couldn't get mblk to pass recv data
1853 					 * up with, free the old ring buffer
1854 					 */
1855 					xnfp->xnf_stat_norxbuf++;
1856 					xnf_rcv_complete(bdesc);
1857 					goto luckless;
1858 				}
1859 				(void) ddi_dma_sync(bdesc->dma_handle,
1860 				    0, 0, DDI_DMA_SYNC_FORCPU);
1861 
1862 				mp->b_wptr += off;
1863 				mp->b_rptr += off;
1864 			}
1865 luckless:
1866 			if (mp)
1867 				mp->b_wptr += len;
1868 			/* re-hang old or hang new buffer */
1869 			rx_buffer_hang(xnfp, new_bdesc);
1870 		}
1871 		if (mp) {
1872 			if (hwcsum) {
1873 				/*
1874 				 * If the peer says that the data has
1875 				 * been validated then we declare that
1876 				 * the full checksum has been
1877 				 * verified.
1878 				 *
1879 				 * We don't look at the "checksum
1880 				 * blank" flag, and hence could have a
1881 				 * packet here that we are asserting
1882 				 * is good with a blank checksum.
1883 				 *
1884 				 * The hardware checksum offload
1885 				 * specification says that we must
1886 				 * provide the actual checksum as well
1887 				 * as an assertion that it is valid,
1888 				 * but the protocol stack doesn't
1889 				 * actually use it and some other
1890 				 * drivers don't bother, so we don't.
1891 				 * If it was necessary we could grovel
1892 				 * in the packet to find it.
1893 				 */
1894 
1895 				(void) hcksum_assoc(mp, NULL,
1896 				    NULL, 0, 0, 0, 0,
1897 				    HCK_FULLCKSUM |
1898 				    HCK_FULLCKSUM_OK,
1899 				    0);
1900 				xnfp->xnf_stat_rx_cksum_no_need++;
1901 			}
1902 			if (head == NULL) {
1903 				head = tail = mp;
1904 			} else {
1905 				tail->b_next = mp;
1906 				tail = mp;
1907 			}
1908 
1909 			ASSERT(mp->b_next == NULL);
1910 
1911 			xnfp->xnf_stat_ipackets++;
1912 			xnfp->xnf_stat_rbytes += len;
1913 		}
1914 
1915 		xnfp->xnf_rx_ring.rsp_cons++;
1916 	}
1917 
1918 	/*
1919 	 * Has more data come in since we started?
1920 	 */
1921 	/* LINTED: constant in conditional context */
1922 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do);
1923 	if (work_to_do)
1924 		goto loop;
1925 
1926 	/*
1927 	 * Indicate to the backend that we have re-filled the receive
1928 	 * ring.
1929 	 */
1930 	/* LINTED: constant in conditional context */
1931 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
1932 	if (notify)
1933 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1934 
1935 	return (head);
1936 }
1937 
1938 /* Called when the upper layers free a message we passed upstream */
1939 static void
1940 xnf_rcv_complete(struct xnf_buffer_desc *bdesc)
1941 {
1942 	xnf_t *xnfp = bdesc->xnfp;
1943 	pfn_t pfn;
1944 	long cnt;
1945 
1946 	/* One less outstanding receive buffer */
1947 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
1948 	--xnfp->xnf_rx_bufs_outstanding;
1949 	/*
1950 	 * Return buffer to the free list, unless the free list is getting
1951 	 * too large.  XXPV - this threshold may need tuning.
1952 	 */
1953 	if (xnfp->xnf_rx_descs_free < xnf_rx_bufs_lowat) {
1954 		/*
1955 		 * Unmap the page, and hand the machine page back
1956 		 * to xen so it can be re-used as a backend net buffer.
1957 		 */
1958 		pfn = xnf_btop(bdesc->buf_phys);
1959 		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
1960 		if (cnt != 1) {
1961 			cmn_err(CE_WARN, "unable to give a page back to the "
1962 			    "hypervisor\n");
1963 		}
1964 
1965 		bdesc->next = xnfp->xnf_free_list;
1966 		xnfp->xnf_free_list = bdesc;
1967 		xnfp->xnf_rx_descs_free++;
1968 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
1969 	} else {
1970 		/*
1971 		 * We can return everything here since we have a free buffer
1972 		 * that we have not given the backing page for back to xen.
1973 		 */
1974 		--xnfp->xnf_rx_buffer_count;
1975 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
1976 		(void) ddi_dma_unbind_handle(bdesc->dma_handle);
1977 		ddi_dma_mem_free(&bdesc->acc_handle);
1978 		ddi_dma_free_handle(&bdesc->dma_handle);
1979 		kmem_free(bdesc, sizeof (*bdesc));
1980 	}
1981 }
1982 
1983 /*
1984  *  xnf_alloc_dma_resources() -- initialize the drivers structures
1985  */
1986 static int
1987 xnf_alloc_dma_resources(xnf_t *xnfp)
1988 {
1989 	dev_info_t 		*devinfo = xnfp->xnf_devinfo;
1990 	int			i;
1991 	size_t			len;
1992 	ddi_dma_cookie_t	dma_cookie;
1993 	uint_t			ncookies;
1994 	struct xnf_buffer_desc	*bdesc;
1995 	int			rc;
1996 	caddr_t			rptr;
1997 
1998 	xnfp->xnf_n_rx = NET_RX_RING_SIZE;
1999 	xnfp->xnf_max_rx_bufs = xnf_rx_bufs_hiwat;
2000 
2001 	xnfp->xnf_n_tx = NET_TX_RING_SIZE;
2002 
2003 	/*
2004 	 * The code below allocates all the DMA data structures that
2005 	 * need to be released when the driver is detached.
2006 	 *
2007 	 * First allocate handles for mapping (virtual address) pointers to
2008 	 * transmit data buffers to physical addresses
2009 	 */
2010 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
2011 		if ((rc = ddi_dma_alloc_handle(devinfo,
2012 		    &tx_buffer_dma_attr, DDI_DMA_SLEEP, 0,
2013 		    &xnfp->xnf_tx_pkt_info[i].dma_handle)) != DDI_SUCCESS)
2014 			return (DDI_FAILURE);
2015 	}
2016 
2017 	/*
2018 	 * Allocate page for the transmit descriptor ring.
2019 	 */
2020 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2021 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
2022 		goto alloc_error;
2023 
2024 	if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
2025 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2026 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2027 	    &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
2028 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2029 		xnfp->xnf_tx_ring_dma_handle = NULL;
2030 		goto alloc_error;
2031 	}
2032 
2033 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
2034 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2035 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2036 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2037 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2038 		xnfp->xnf_tx_ring_dma_handle = NULL;
2039 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2040 		if (rc == DDI_DMA_NORESOURCES)
2041 			goto alloc_error;
2042 		else
2043 			goto error;
2044 	}
2045 
2046 	ASSERT(ncookies == 1);
2047 	bzero(rptr, PAGESIZE);
2048 	/* LINTED: constant in conditional context */
2049 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
2050 	/* LINTED: constant in conditional context */
2051 	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
2052 	xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
2053 
2054 	/*
2055 	 * Allocate page for the receive descriptor ring.
2056 	 */
2057 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2058 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
2059 		goto alloc_error;
2060 
2061 	if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
2062 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2063 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2064 	    &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
2065 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2066 		xnfp->xnf_rx_ring_dma_handle = NULL;
2067 		goto alloc_error;
2068 	}
2069 
2070 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
2071 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2072 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2073 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2074 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2075 		xnfp->xnf_rx_ring_dma_handle = NULL;
2076 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2077 		if (rc == DDI_DMA_NORESOURCES)
2078 			goto alloc_error;
2079 		else
2080 			goto error;
2081 	}
2082 
2083 	ASSERT(ncookies == 1);
2084 	bzero(rptr, PAGESIZE);
2085 	/* LINTED: constant in conditional context */
2086 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
2087 	/* LINTED: constant in conditional context */
2088 	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
2089 	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
2090 
2091 	/*
2092 	 * Preallocate receive buffers for each receive descriptor.
2093 	 */
2094 
2095 	/* Set up the "free list" of receive buffer descriptors */
2096 	for (i = 0; i < xnfp->xnf_n_rx; i++) {
2097 		if ((bdesc = xnf_alloc_buffer(xnfp)) == NULL)
2098 			goto alloc_error;
2099 		bdesc->next = xnfp->xnf_free_list;
2100 		xnfp->xnf_free_list = bdesc;
2101 	}
2102 
2103 	return (DDI_SUCCESS);
2104 
2105 alloc_error:
2106 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
2107 	    ddi_get_instance(xnfp->xnf_devinfo));
2108 error:
2109 	xnf_release_dma_resources(xnfp);
2110 	return (DDI_FAILURE);
2111 }
2112 
2113 /*
2114  * Release all DMA resources in the opposite order from acquisition
2115  * Should not be called until all outstanding esballoc buffers
2116  * have been returned.
2117  */
2118 static void
2119 xnf_release_dma_resources(xnf_t *xnfp)
2120 {
2121 	int i;
2122 
2123 	/*
2124 	 * Free receive buffers which are currently associated with
2125 	 * descriptors
2126 	 */
2127 	for (i = 0; i < xnfp->xnf_n_rx; i++) {
2128 		struct xnf_buffer_desc *bp;
2129 
2130 		if ((bp = xnfp->xnf_rxpkt_bufptr[i]) == NULL)
2131 			continue;
2132 		xnf_free_buffer(bp);
2133 		xnfp->xnf_rxpkt_bufptr[i] = NULL;
2134 	}
2135 
2136 	/* Free the receive ring buffer */
2137 	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2138 		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2139 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2140 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2141 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2142 	}
2143 	/* Free the transmit ring buffer */
2144 	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2145 		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2146 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2147 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2148 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2149 	}
2150 
2151 	/*
2152 	 * Free handles for mapping (virtual address) pointers to
2153 	 * transmit data buffers to physical addresses
2154 	 */
2155 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
2156 		if (xnfp->xnf_tx_pkt_info[i].dma_handle != NULL) {
2157 			ddi_dma_free_handle(
2158 			    &xnfp->xnf_tx_pkt_info[i].dma_handle);
2159 		}
2160 	}
2161 
2162 }
2163 
2164 static void
2165 xnf_release_mblks(xnf_t *xnfp)
2166 {
2167 	int	i;
2168 
2169 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
2170 		if (xnfp->xnf_tx_pkt_info[i].mp == NULL)
2171 			continue;
2172 		freemsg(xnfp->xnf_tx_pkt_info[i].mp);
2173 		xnfp->xnf_tx_pkt_info[i].mp = NULL;
2174 		(void) ddi_dma_unbind_handle(
2175 		    xnfp->xnf_tx_pkt_info[i].dma_handle);
2176 	}
2177 }
2178 
2179 /*
2180  * Remove a xmit buffer descriptor from the head of the free list and return
2181  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
2182  * Called with the tx_buf_mutex held.
2183  */
2184 static struct xnf_buffer_desc *
2185 xnf_get_tx_buffer(xnf_t *xnfp)
2186 {
2187 	struct xnf_buffer_desc *bdesc;
2188 
2189 	bdesc = xnfp->xnf_tx_free_list;
2190 	if (bdesc != NULL) {
2191 		xnfp->xnf_tx_free_list = bdesc->next;
2192 	} else {
2193 		bdesc = xnf_alloc_tx_buffer(xnfp);
2194 	}
2195 	return (bdesc);
2196 }
2197 
2198 /*
2199  * Remove a buffer descriptor from the head of the free list and return
2200  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
2201  * Called with the rx_buf_mutex held.
2202  */
2203 static struct xnf_buffer_desc *
2204 xnf_get_buffer(xnf_t *xnfp)
2205 {
2206 	struct xnf_buffer_desc *bdesc;
2207 
2208 	bdesc = xnfp->xnf_free_list;
2209 	if (bdesc != NULL) {
2210 		xnfp->xnf_free_list = bdesc->next;
2211 		xnfp->xnf_rx_descs_free--;
2212 	} else {
2213 		bdesc = xnf_alloc_buffer(xnfp);
2214 	}
2215 	return (bdesc);
2216 }
2217 
2218 /*
2219  * Free a xmit buffer back to the xmit free list
2220  */
2221 static void
2222 xnf_free_tx_buffer(struct xnf_buffer_desc *bp)
2223 {
2224 	xnf_t *xnfp = bp->xnfp;
2225 
2226 	mutex_enter(&xnfp->xnf_tx_buf_mutex);
2227 	bp->next = xnfp->xnf_tx_free_list;
2228 	xnfp->xnf_tx_free_list = bp;
2229 	mutex_exit(&xnfp->xnf_tx_buf_mutex);
2230 }
2231 
2232 /*
2233  * Put a buffer descriptor onto the head of the free list.
2234  * for page-flip:
2235  * We can't really free these buffers back to the kernel
2236  * since we have given away their backing page to be used
2237  * by the back end net driver.
2238  * for hvcopy:
2239  * release all the memory
2240  */
2241 static void
2242 xnf_free_buffer(struct xnf_buffer_desc *bdesc)
2243 {
2244 	xnf_t *xnfp = bdesc->xnfp;
2245 
2246 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
2247 	if (xnfp->xnf_rx_hvcopy) {
2248 		if (ddi_dma_unbind_handle(bdesc->dma_handle) != DDI_SUCCESS)
2249 			goto out;
2250 		ddi_dma_mem_free(&bdesc->acc_handle);
2251 		ddi_dma_free_handle(&bdesc->dma_handle);
2252 		kmem_free(bdesc, sizeof (*bdesc));
2253 		xnfp->xnf_rx_buffer_count--;
2254 	} else {
2255 		bdesc->next = xnfp->xnf_free_list;
2256 		xnfp->xnf_free_list = bdesc;
2257 		xnfp->xnf_rx_descs_free++;
2258 	}
2259 out:
2260 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
2261 }
2262 
2263 /*
2264  * Allocate a DMA-able xmit buffer, including a structure to
2265  * keep track of the buffer.  Called with tx_buf_mutex held.
2266  */
2267 static struct xnf_buffer_desc *
2268 xnf_alloc_tx_buffer(xnf_t *xnfp)
2269 {
2270 	struct xnf_buffer_desc *bdesc;
2271 	size_t len;
2272 
2273 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
2274 		return (NULL);
2275 
2276 	/* allocate a DMA access handle for receive buffer */
2277 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buffer_dma_attr,
2278 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2279 		goto failure;
2280 
2281 	/* Allocate DMA-able memory for transmit buffer */
2282 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2283 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
2284 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2285 		goto failure_1;
2286 
2287 	bdesc->xnfp = xnfp;
2288 	xnfp->xnf_tx_buffer_count++;
2289 
2290 	return (bdesc);
2291 
2292 failure_1:
2293 	ddi_dma_free_handle(&bdesc->dma_handle);
2294 
2295 failure:
2296 	kmem_free(bdesc, sizeof (*bdesc));
2297 	return (NULL);
2298 }
2299 
2300 /*
2301  * Allocate a DMA-able receive buffer, including a structure to
2302  * keep track of the buffer.  Called with rx_buf_mutex held.
2303  */
2304 static struct xnf_buffer_desc *
2305 xnf_alloc_buffer(xnf_t *xnfp)
2306 {
2307 	struct			xnf_buffer_desc *bdesc;
2308 	size_t			len;
2309 	uint_t			ncookies;
2310 	ddi_dma_cookie_t	dma_cookie;
2311 	long			cnt;
2312 	pfn_t			pfn;
2313 
2314 	if (xnfp->xnf_rx_buffer_count >= xnfp->xnf_max_rx_bufs)
2315 		return (NULL);
2316 
2317 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
2318 		return (NULL);
2319 
2320 	/* allocate a DMA access handle for receive buffer */
2321 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buffer_dma_attr,
2322 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2323 		goto failure;
2324 
2325 	/* Allocate DMA-able memory for receive buffer */
2326 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2327 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
2328 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2329 		goto failure_1;
2330 
2331 	/* bind to virtual address of buffer to get physical address */
2332 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2333 	    bdesc->buf, PAGESIZE, DDI_DMA_READ | DDI_DMA_STREAMING,
2334 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2335 		goto failure_2;
2336 
2337 	bdesc->buf_phys = dma_cookie.dmac_laddress;
2338 	bdesc->xnfp = xnfp;
2339 	if (xnfp->xnf_rx_hvcopy) {
2340 		bdesc->free_rtn.free_func = xnf_copy_rcv_complete;
2341 	} else {
2342 		bdesc->free_rtn.free_func = xnf_rcv_complete;
2343 	}
2344 	bdesc->free_rtn.free_arg = (char *)bdesc;
2345 	bdesc->grant_ref = GRANT_INVALID_REF;
2346 	ASSERT(ncookies == 1);
2347 
2348 	xnfp->xnf_rx_buffer_count++;
2349 
2350 	if (!xnfp->xnf_rx_hvcopy) {
2351 		/*
2352 		 * Unmap the page, and hand the machine page back
2353 		 * to xen so it can be used as a backend net buffer.
2354 		 */
2355 		pfn = xnf_btop(bdesc->buf_phys);
2356 		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
2357 		if (cnt != 1) {
2358 			cmn_err(CE_WARN, "unable to give a page back to the "
2359 			    "hypervisor\n");
2360 		}
2361 	}
2362 
2363 	return (bdesc);
2364 
2365 failure_2:
2366 	ddi_dma_mem_free(&bdesc->acc_handle);
2367 
2368 failure_1:
2369 	ddi_dma_free_handle(&bdesc->dma_handle);
2370 
2371 failure:
2372 	kmem_free(bdesc, sizeof (*bdesc));
2373 	return (NULL);
2374 }
2375 
/*
 * Names of the auxiliary (driver-private) named kstats.
 *
 * The order here is significant: xnf_kstat_aux_update() stores the
 * counters positionally, so entry N must name the N'th value that
 * routine writes.
 */
static char *xnf_aux_statistics[] = {
	/* checksum offload */
	"tx_cksum_deferred",
	"rx_cksum_no_need",

	/* interrupt and transmit/receive path counters */
	"interrupts",
	"unclaimed_interrupts",
	"tx_pullup",
	"tx_pagebndry",
	"tx_attempt",
	"rx_no_ringbuf",

	/* hypervisor copy */
	"hvcopy_packet_processed",
};
2390 
2391 static int
2392 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2393 {
2394 	xnf_t *xnfp;
2395 	kstat_named_t *knp;
2396 
2397 	if (flag != KSTAT_READ)
2398 		return (EACCES);
2399 
2400 	xnfp = ksp->ks_private;
2401 	knp = ksp->ks_data;
2402 
2403 	/*
2404 	 * Assignment order must match that of the names in
2405 	 * xnf_aux_statistics.
2406 	 */
2407 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2408 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2409 
2410 	(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2411 	(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2412 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2413 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry;
2414 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt;
2415 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_no_ringbuf;
2416 
2417 	(knp++)->value.ui64 = xnfp->xnf_stat_hvcopy_packet_processed;
2418 
2419 	return (0);
2420 }
2421 
2422 static boolean_t
2423 xnf_kstat_init(xnf_t *xnfp)
2424 {
2425 	int nstat = sizeof (xnf_aux_statistics) /
2426 	    sizeof (xnf_aux_statistics[0]);
2427 	char **cp = xnf_aux_statistics;
2428 	kstat_named_t *knp;
2429 
2430 	/*
2431 	 * Create and initialise kstats.
2432 	 */
2433 	if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2434 	    ddi_get_instance(xnfp->xnf_devinfo),
2435 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
2436 	    nstat, 0)) == NULL)
2437 		return (B_FALSE);
2438 
2439 	xnfp->xnf_kstat_aux->ks_private = xnfp;
2440 	xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2441 
2442 	knp = xnfp->xnf_kstat_aux->ks_data;
2443 	while (nstat > 0) {
2444 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2445 
2446 		knp++;
2447 		cp++;
2448 		nstat--;
2449 	}
2450 
2451 	kstat_install(xnfp->xnf_kstat_aux);
2452 
2453 	return (B_TRUE);
2454 }
2455 
2456 static int
2457 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2458 {
2459 	xnf_t *xnfp = arg;
2460 
2461 	mutex_enter(&xnfp->xnf_intrlock);
2462 	mutex_enter(&xnfp->xnf_txlock);
2463 
2464 #define	mac_stat(q, r)				\
2465 	case (MAC_STAT_##q):			\
2466 		*val = xnfp->xnf_stat_##r;	\
2467 		break
2468 
2469 #define	ether_stat(q, r)			\
2470 	case (ETHER_STAT_##q):			\
2471 		*val = xnfp->xnf_stat_##r;	\
2472 		break
2473 
2474 	switch (stat) {
2475 
2476 	mac_stat(IPACKETS, ipackets);
2477 	mac_stat(OPACKETS, opackets);
2478 	mac_stat(RBYTES, rbytes);
2479 	mac_stat(OBYTES, obytes);
2480 	mac_stat(NORCVBUF, norxbuf);
2481 	mac_stat(IERRORS, errrx);
2482 	mac_stat(NOXMTBUF, tx_defer);
2483 
2484 	ether_stat(MACRCV_ERRORS, mac_rcv_error);
2485 	ether_stat(TOOSHORT_ERRORS, runt);
2486 
2487 	default:
2488 		mutex_exit(&xnfp->xnf_txlock);
2489 		mutex_exit(&xnfp->xnf_intrlock);
2490 
2491 		return (ENOTSUP);
2492 	}
2493 
2494 #undef mac_stat
2495 #undef ether_stat
2496 
2497 	mutex_exit(&xnfp->xnf_txlock);
2498 	mutex_exit(&xnfp->xnf_intrlock);
2499 
2500 	return (0);
2501 }
2502 
2503 /*ARGSUSED*/
2504 static void
2505 xnf_blank(void *arg, time_t ticks, uint_t count)
2506 {
2507 	/*
2508 	 * XXPV dme: blanking is not currently implemented.
2509 	 *
2510 	 * It's not obvious how to use the 'ticks' argument here.
2511 	 *
2512 	 * 'Count' might be used as an indicator of how to set
2513 	 * rsp_event when posting receive buffers to the rx_ring.  It
2514 	 * would replace the code at the tail of xnf_process_recv()
2515 	 * that simply indicates that the next completed packet should
2516 	 * cause an interrupt.
2517 	 */
2518 }
2519 
2520 static void
2521 xnf_resources(void *arg)
2522 {
2523 	xnf_t *xnfp = arg;
2524 	mac_rx_fifo_t mrf;
2525 
2526 	mrf.mrf_type = MAC_RX_FIFO;
2527 	mrf.mrf_blank = xnf_blank;
2528 	mrf.mrf_arg = (void *)xnfp;
2529 	mrf.mrf_normal_blank_time = 128;	/* XXPV dme: see xnf_blank() */
2530 	mrf.mrf_normal_pkt_count = 8;		/* XXPV dme: see xnf_blank() */
2531 
2532 	xnfp->xnf_rx_handle = mac_resource_add(xnfp->xnf_mh,
2533 	    (mac_resource_t *)&mrf);
2534 }
2535 
2536 /*ARGSUSED*/
2537 static void
2538 xnf_ioctl(void *arg, queue_t *q, mblk_t *mp)
2539 {
2540 	miocnak(q, mp, 0, EINVAL);
2541 }
2542 
2543 static boolean_t
2544 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
2545 {
2546 	xnf_t *xnfp = arg;
2547 
2548 	switch (cap) {
2549 	case MAC_CAPAB_HCKSUM: {
2550 		uint32_t *capab = cap_data;
2551 
2552 		/*
2553 		 * Whilst the flag used to communicate with the IO
2554 		 * domain is called "NETTXF_csum_blank", the checksum
2555 		 * in the packet must contain the pseudo-header
2556 		 * checksum and not zero.
2557 		 *
2558 		 * To help out the IO domain, we might use
2559 		 * HCKSUM_INET_PARTIAL. Unfortunately our stack will
2560 		 * then use checksum offload for IPv6 packets, which
2561 		 * the IO domain can't handle.
2562 		 *
2563 		 * As a result, we declare outselves capable of
2564 		 * HCKSUM_INET_FULL_V4. This means that we receive
2565 		 * IPv4 packets from the stack with a blank checksum
2566 		 * field and must insert the pseudo-header checksum
2567 		 * before passing the packet to the IO domain.
2568 		 */
2569 		if (xnfp->xnf_cksum_offload)
2570 			*capab = HCKSUM_INET_FULL_V4;
2571 		else
2572 			*capab = 0;
2573 		break;
2574 	}
2575 
2576 	case MAC_CAPAB_POLL:
2577 		/* Just return B_TRUE. */
2578 		break;
2579 
2580 	default:
2581 		return (B_FALSE);
2582 	}
2583 
2584 	return (B_TRUE);
2585 }
2586 
2587 /*ARGSUSED*/
2588 static void
2589 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
2590     void *arg, void *impl_data)
2591 {
2592 	xnf_t *xnfp = ddi_get_driver_private(dip);
2593 	XenbusState new_state = *(XenbusState *)impl_data;
2594 
2595 	ASSERT(xnfp != NULL);
2596 
2597 	switch (new_state) {
2598 	case XenbusStateConnected:
2599 		mutex_enter(&xnfp->xnf_intrlock);
2600 		mutex_enter(&xnfp->xnf_txlock);
2601 
2602 		xnfp->xnf_connected = B_TRUE;
2603 		/*
2604 		 * wake up threads wanting to send data to backend,
2605 		 * but got blocked due to backend is not ready
2606 		 */
2607 		cv_broadcast(&xnfp->xnf_cv);
2608 
2609 		mutex_exit(&xnfp->xnf_txlock);
2610 		mutex_exit(&xnfp->xnf_intrlock);
2611 
2612 		/*
2613 		 * kick backend in case it missed any tx request
2614 		 * in the TX ring buffer
2615 		 */
2616 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
2617 
2618 		/*
2619 		 * there maybe already queued rx data in the RX ring
2620 		 * sent by backend after it gets connected but before
2621 		 * we see its state change here, so we call our intr
2622 		 * handling routine to handle them, if any
2623 		 */
2624 		(void) xnf_intr((caddr_t)xnfp);
2625 
2626 		break;
2627 
2628 	default:
2629 		break;
2630 	}
2631 }
2632 
2633 /*
2634  * Check whether backend is capable of and willing to talk
2635  * to us via hypervisor copy, as opposed to page flip.
2636  */
2637 static boolean_t
2638 xnf_hvcopy_peer_status(dev_info_t *devinfo)
2639 {
2640 	int	be_rx_copy;
2641 	int	err;
2642 
2643 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(devinfo),
2644 	    "feature-rx-copy", "%d", &be_rx_copy);
2645 	/*
2646 	 * If we fail to read the store we assume that the key is
2647 	 * absent, implying an older domain at the far end.  Older
2648 	 * domains cannot do HV copy (we assume ..).
2649 	 */
2650 	if (err != 0)
2651 		be_rx_copy = 0;
2652 
2653 	return (be_rx_copy?B_TRUE:B_FALSE);
2654 }
2655