xref: /illumos-gate/usr/src/uts/common/xen/io/xnf.c (revision 0ebf3797ed9aceba2a3b361cf14badb82ac13478)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  *
31  * Copyright (c) 2004 Christian Limpach.
32  * All rights reserved.
33  *
34  * Redistribution and use in source and binary forms, with or without
35  * modification, are permitted provided that the following conditions
36  * are met:
37  * 1. Redistributions of source code must retain the above copyright
38  *    notice, this list of conditions and the following disclaimer.
39  * 2. Redistributions in binary form must reproduce the above copyright
40  *    notice, this list of conditions and the following disclaimer in the
41  *    documentation and/or other materials provided with the distribution.
42  * 3. This section intentionally left blank.
43  * 4. The name of the author may not be used to endorse or promote products
44  *    derived from this software without specific prior written permission.
45  *
46  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
47  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
48  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
49  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
50  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
51  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
52  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
53  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
54  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
55  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
56  */
57 /*
58  * Section 3 of the above license was updated in response to bug 6379571.
59  */
60 
61 /*
62  * xnf.c - Nemo-based network driver for domU
63  */
64 
65 #include <sys/types.h>
66 #include <sys/errno.h>
67 #include <sys/param.h>
68 #include <sys/sysmacros.h>
69 #include <sys/systm.h>
70 #include <sys/stream.h>
71 #include <sys/strsubr.h>
72 #include <sys/conf.h>
73 #include <sys/ddi.h>
74 #include <sys/devops.h>
75 #include <sys/sunddi.h>
76 #include <sys/sunndi.h>
77 #include <sys/dlpi.h>
78 #include <sys/ethernet.h>
79 #include <sys/strsun.h>
80 #include <sys/pattr.h>
81 #include <inet/ip.h>
82 #include <sys/modctl.h>
83 #include <sys/mac.h>
84 #include <sys/mac_ether.h>
85 #include <sys/bootinfo.h>
86 #include <sys/mach_mmu.h>
87 #ifdef	XPV_HVM_DRIVER
88 #include <sys/xpv_support.h>
89 #include <sys/hypervisor.h>
90 #else
91 #include <sys/hypervisor.h>
92 #include <sys/evtchn_impl.h>
93 #include <sys/balloon_impl.h>
94 #endif
95 #include <xen/public/io/netif.h>
96 #include <sys/gnttab.h>
97 #include <xen/sys/xendev.h>
98 #include <sys/sdt.h>
99 
100 #include <io/xnf.h>
101 
102 
103 /*
104  *  Declarations and Module Linkage
105  */
106 
/* Human-readable driver description; used to build the modldrv linkinfo. */
#define	IDENT	"Virtual Ethernet driver"

/*
 * Debug tracing is compiled in only for DEBUG and lint builds.  xnfdebug
 * is a patchable bitmask that is tested against the XNF_DEBUG_* flags
 * before each debug printf() in this file.
 */
#if defined(DEBUG) || defined(__lint)
#define	XNF_DEBUG
int	xnfdebug = 0;
#endif

/*
 * On a 32 bit PAE system physical and machine addresses are larger
 * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 * argument, and so addresses above 4G are truncated before ddi_btop()
 * gets to see them.  To avoid this, code the shift operation here.
 */
#define	xnf_btop(addr)	((addr) >> PAGESHIFT)

/*
 * Global default for checksum offload.  It is copied into each instance
 * at attach time and may later be cleared per instance by
 * xnf_read_config().
 */
boolean_t	xnf_cksum_offload = B_TRUE;

/*
 * Default value for hypervisor-based copy operations.  An instance only
 * uses hypervisor copy when this is set and xnf_hvcopy_peer_status()
 * also returns true at attach time.
 */
boolean_t	xnf_rx_hvcopy = B_TRUE;

/*
 * Should pages used for transmit be readonly for the peer?
 * Copied into each instance at attach time.
 */
boolean_t	xnf_tx_pages_readonly = B_FALSE;
/*
 * Packets under this size are bcopied instead of using desballoc.
 * Choose a value > XNF_FRAMESIZE (1514) to force the receive path to
 * always copy.
 */
unsigned int	xnf_rx_bcopy_thresh = 64;

/*
 * NOTE(review): presumably the maximum number of fragments used for one
 * transmitted packet; it is not referenced in this part of the file --
 * confirm against the transmit path.
 */
unsigned int	xnf_max_tx_frags = 1;
139 
/* Required system entry points (wired in via DDI_DEFINE_STREAM_OPS below) */
static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);

/*
 * Required driver entry points for Nemo.  Apart from xnf_intr() and
 * xnf_blank(), these are the functions listed in xnf_callbacks.
 */
static int	xnf_start(void *);
static void	xnf_stop(void *);
static int	xnf_set_mac_addr(void *, const uint8_t *);
static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
static int	xnf_set_promiscuous(void *, boolean_t);
static mblk_t	*xnf_send(void *, mblk_t *);
static uint_t	xnf_intr(caddr_t);
static int	xnf_stat(void *, uint_t, uint64_t *);
static void	xnf_blank(void *, time_t, uint_t);
static void	xnf_resources(void *);
static void	xnf_ioctl(void *, queue_t *, mblk_t *);
static boolean_t xnf_getcapab(void *, mac_capab_t, void *);

/*
 * Driver private functions.  Note that xnf_send_driver_status() is the
 * only one declared here without "static".
 */
static int xnf_alloc_dma_resources(xnf_t *);
static void xnf_release_dma_resources(xnf_t *);
static mblk_t *xnf_process_recv(xnf_t *);
static void xnf_rcv_complete(struct xnf_buffer_desc *);
static void xnf_release_mblks(xnf_t *);
static struct xnf_buffer_desc *xnf_alloc_tx_buffer(xnf_t *);
static struct xnf_buffer_desc *xnf_alloc_buffer(xnf_t *);
static struct xnf_buffer_desc *xnf_get_tx_buffer(xnf_t *);
static struct xnf_buffer_desc *xnf_get_buffer(xnf_t *);
static void xnf_free_buffer(struct xnf_buffer_desc *);
static void xnf_free_tx_buffer(struct xnf_buffer_desc *);
void xnf_send_driver_status(int, int);
static void rx_buffer_hang(xnf_t *, struct xnf_buffer_desc *);
static int xnf_clean_tx_ring(xnf_t  *);
static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static mblk_t *xnf_process_hvcopy_recv(xnf_t *xnfp);
static boolean_t xnf_hvcopy_peer_status(dev_info_t *devinfo);
static boolean_t xnf_kstat_init(xnf_t *xnfp);
178 
/*
 * Callback vector handed to the MAC layer through macp->m_callbacks in
 * xnf_attach().  The leading flag word presumably advertises which of the
 * trailing optional entries (xnf_resources, xnf_ioctl, xnf_getcapab) are
 * supplied -- confirm against the mac_callbacks_t definition.
 *
 * XXPV dme: remove MC_IOCTL?
 */
static mac_callbacks_t xnf_callbacks = {
	MC_RESOURCES | MC_IOCTL | MC_GETCAPAB,
	xnf_stat,
	xnf_start,
	xnf_stop,
	xnf_set_promiscuous,
	xnf_set_multicast,
	xnf_set_mac_addr,
	xnf_send,
	xnf_resources,
	xnf_ioctl,
	xnf_getcapab
};
195 
/*
 * Grant reference value used throughout this file to mean "no grant
 * currently held" (ring refs and per-packet tx refs are compared with it).
 */
#define	GRANT_INVALID_REF	0
/* Receive buffer low and high water marks, as multiples of the ring size. */
const int xnf_rx_bufs_lowat = 4 * NET_RX_RING_SIZE;
const int xnf_rx_bufs_hiwat = 8 * NET_RX_RING_SIZE; /* default max */
199 
/*
 * DMA attributes for network ring buffer.
 *
 * No address restriction, page aligned, and limited to a single segment
 * (no scatter/gather list).
 */
static ddi_dma_attr_t ringbuf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
215 
/*
 * DMA attributes for transmit data.
 *
 * No address restriction, page aligned, and limited to a single segment,
 * so each binding made with these attributes yields exactly one cookie.
 */
static ddi_dma_attr_t tx_buffer_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
231 
/*
 * DMA attributes for a receive buffer.
 *
 * No address restriction, page aligned, and limited to a single segment.
 */
static ddi_dma_attr_t rx_buffer_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
247 
/*
 * DMA access attributes for registers and descriptors: little-endian
 * structure access with strict ordering.
 */
static ddi_device_acc_attr_t accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
	DDI_STRICTORDER_ACC
};
254 
/*
 * DMA access attributes for data: NOT to be byte swapped.
 * Accesses are strictly ordered, as for accattr.
 */
static ddi_device_acc_attr_t data_accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};
261 
/* The all-ones Ethernet broadcast address. */
unsigned char xnf_broadcastaddr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
int xnf_diagnose = 0; /* Patchable global for diagnostic purposes */

/*
 * Device operations for this driver; xnf_attach() and xnf_detach() are
 * supplied as the attach and detach entry points.  _init() passes this
 * structure to mac_init_ops() before installing the module.
 */
DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
    nodev, NULL, D_MP, NULL);

/* Driver linkage structure referenced from modlinkage below. */
static struct modldrv xnf_modldrv = {
	&mod_driverops,		/* Type of module.  This one is a driver */
	IDENT " %I%",		/* short description */
	&xnf_dev_ops		/* driver specific ops */
};

/* Module linkage passed to mod_install() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, &xnf_modldrv, NULL
};
277 
278 int
279 _init(void)
280 {
281 	int r;
282 
283 	mac_init_ops(&xnf_dev_ops, "xnf");
284 	r = mod_install(&modlinkage);
285 	if (r != DDI_SUCCESS)
286 		mac_fini_ops(&xnf_dev_ops);
287 
288 	return (r);
289 }
290 
/*
 * Module unload entry point.  Unloading is not supported: the request is
 * always refused with EBUSY and the module is never removed here.
 */
int
_fini(void)
{
	return (EBUSY); /* XXPV dme: should be removable */
}
296 
297 int
298 _info(struct modinfo *modinfop)
299 {
300 	return (mod_info(&modlinkage, modinfop));
301 }
302 
/*
 * (Re)initialise the shared transmit and receive rings and grant the
 * backend domain access to the two ring pages.  Called from
 * xnf_be_connect().
 *
 * In order, this routine:
 *  - ends any existing grant on each ring page and grants it afresh;
 *  - under xnf_intrlock and xnf_txlock, discards every transmit packet
 *    that still holds a grant reference and zeroes the tx ring indices;
 *  - under xnf_intrlock, re-grants the receive buffers that were posted
 *    to the backend, zeroes the rx ring indices, hangs buffers on empty
 *    slots and pushes the requests.
 *
 * Returns 0 on success.  If granting either ring page fails, the error is
 * reported with xenbus_dev_error(), both ring references are reset to
 * GRANT_INVALID_REF and the (negated) grant error is returned.
 */
static int
xnf_setup_rings(xnf_t *xnfp)
{
	int			ix, err;
	RING_IDX		i;
	struct xnf_buffer_desc	*bdesc, *rbp;
	struct xenbus_device	*xsd;
	domid_t			oeid;

	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);

	/* Drop any grant left over from a previous connection. */
	if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF)
		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);

	/*
	 * Grant the backend access to the tx ring page.  A positive return
	 * is the new grant reference.
	 *
	 * NOTE(review): a return of exactly 0 takes the error path below but
	 * leaves err == 0, so the function would return 0 (success) after
	 * resetting both ring refs -- confirm that 0 cannot be returned.
	 */
	err = gnttab_grant_foreign_access(oeid,
	    xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
	if (err <= 0) {
		err = -err;
		xenbus_dev_error(xsd, err, "granting access to tx ring page");
		goto out;
	}
	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;

	/* Same again for the rx ring page. */
	if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF)
		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);

	err = gnttab_grant_foreign_access(oeid,
	    xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
	if (err <= 0) {
		err = -err;
		xenbus_dev_error(xsd, err, "granting access to rx ring page");
		goto out;
	}
	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;


	/* xnf_intrlock is held for the rest of the ring reconstruction. */
	mutex_enter(&xnfp->xnf_intrlock);

	/*
	 * Cleanup the TX ring.  We just clean up any valid tx_pktinfo structs
	 * and reset the ring.  Note that this can lose packets after a resume,
	 * but we expect to stagger on.
	 */
	mutex_enter(&xnfp->xnf_txlock);

	for (i = 0; i < xnfp->xnf_n_tx; i++) {
		struct tx_pktinfo *txp = &xnfp->xnf_tx_pkt_info[i];

		/* Rebuild the free id list: each entry links to the next. */
		txp->id = i + 1;

		/* Entries without a grant reference hold no packet. */
		if (txp->grant_ref == GRANT_INVALID_REF) {
			ASSERT(txp->mp == NULL);
			ASSERT(txp->bdesc == NULL);
			continue;
		}

		if (gnttab_query_foreign_access(txp->grant_ref) != 0)
			panic("tx grant still in use by backend domain");

		/* Discard the packet and everything that was bound to it. */
		freemsg(txp->mp);
		txp->mp = NULL;

		(void) ddi_dma_unbind_handle(txp->dma_handle);

		if (txp->bdesc != NULL) {
			xnf_free_tx_buffer(txp->bdesc);
			txp->bdesc = NULL;
		}

		(void) gnttab_end_foreign_access_ref(txp->grant_ref,
		    xnfp->xnf_tx_pages_readonly);
		gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head,
		    txp->grant_ref);
		txp->grant_ref = GRANT_INVALID_REF;
	}

	/* The free id list starts at entry 0; the ring restarts empty. */
	xnfp->xnf_tx_pkt_id_list = 0;
	xnfp->xnf_tx_ring.rsp_cons = 0;
	xnfp->xnf_tx_ring.req_prod_pvt = 0;
	xnfp->xnf_tx_ring.sring->req_prod = 0;
	xnfp->xnf_tx_ring.sring->rsp_prod = 0;
	xnfp->xnf_tx_ring.sring->rsp_event = 1;

	mutex_exit(&xnfp->xnf_txlock);

	/*
	 * Rebuild the RX ring.  We have to rebuild the RX ring because some of
	 * our pages are currently flipped out/granted so we can't just free
	 * the RX buffers.  Reclaim any unprocessed recv buffers, they won't be
	 * useable anyway since the mfn's they refer to are no longer valid.
	 * Grant the backend domain access to each hung rx buffer.
	 *
	 * NOTE(review): i is incremented by the loop test before it is used
	 * as the request index, so the slots visited are rsp_cons + 1 through
	 * req_prod -- confirm that this offset is intended.
	 */
	i = xnfp->xnf_rx_ring.rsp_cons;
	while (i++ != xnfp->xnf_rx_ring.sring->req_prod) {
		volatile netif_rx_request_t	*rxrp;

		rxrp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, i);
		/* Slot number = offset of this request from request 0. */
		ix = rxrp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0);
		rbp = xnfp->xnf_rxpkt_bufptr[ix];
		if (rbp != NULL) {
			grant_ref_t	ref = rbp->grant_ref;

			ASSERT(ref != GRANT_INVALID_REF);
			if (xnfp->xnf_rx_hvcopy) {
				/* Copy mode: grant access to the buffer. */
				pfn_t pfn = xnf_btop(rbp->buf_phys);
				mfn_t mfn = pfn_to_mfn(pfn);

				gnttab_grant_foreign_access_ref(ref, oeid,
				    mfn, 0);
			} else {
				/* Otherwise set up a page transfer grant. */
				gnttab_grant_foreign_transfer_ref(ref,
				    oeid, 0);
			}
			rxrp->id = ix;
			rxrp->gref = ref;
		}
	}

	/*
	 * Reset the ring pointers to initial state.
	 * Hang buffers for any empty ring slots.
	 */
	xnfp->xnf_rx_ring.rsp_cons = 0;
	xnfp->xnf_rx_ring.req_prod_pvt = 0;
	xnfp->xnf_rx_ring.sring->req_prod = 0;
	xnfp->xnf_rx_ring.sring->rsp_prod = 0;
	xnfp->xnf_rx_ring.sring->rsp_event = 1;
	for (i = 0; i < NET_RX_RING_SIZE; i++) {
		/* Set the private producer index before each hang. */
		xnfp->xnf_rx_ring.req_prod_pvt = i;
		if (xnfp->xnf_rxpkt_bufptr[i] != NULL)
			continue;
		/* Stop early if no more receive buffers are available. */
		if ((bdesc = xnf_get_buffer(xnfp)) == NULL)
			break;
		rx_buffer_hang(xnfp, bdesc);
	}
	/* i is the number of slots populated; publish them to the backend. */
	xnfp->xnf_rx_ring.req_prod_pvt = i;
	/* LINTED: constant in conditional context */
	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);

	mutex_exit(&xnfp->xnf_intrlock);

	return (0);

out:
	/* Failure: revoke whichever ring grants exist and forget them. */
	if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF)
		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
	xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF;

	if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF)
		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
	xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF;

	return (err);
}
458 
459 
/*
 * Called when the upper layers free a message we passed upstream.
 *
 * Tears down the buffer descriptor completely: the DMA binding is
 * released first, then the DMA memory, then the DMA handle, and finally
 * the descriptor itself is freed.  bdesc must not be used afterwards.
 */
static void
xnf_copy_rcv_complete(struct xnf_buffer_desc *bdesc)
{
	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
	ddi_dma_mem_free(&bdesc->acc_handle);
	ddi_dma_free_handle(&bdesc->dma_handle);
	kmem_free(bdesc, sizeof (*bdesc));
}
469 
470 
471 /*
472  * Connect driver to back end, called to set up communication with
473  * back end driver both initially and on resume after restore/migrate.
474  */
475 void
476 xnf_be_connect(xnf_t *xnfp)
477 {
478 	const char	*message;
479 	xenbus_transaction_t xbt;
480 	struct		xenbus_device *xsd;
481 	char		*xsname;
482 	int		err;
483 
484 	ASSERT(!xnfp->xnf_connected);
485 
486 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
487 	xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
488 
489 	err = xnf_setup_rings(xnfp);
490 	if (err != 0) {
491 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
492 		xenbus_dev_error(xsd, err, "setting up ring");
493 		return;
494 	}
495 
496 again:
497 	err = xenbus_transaction_start(&xbt);
498 	if (err != 0) {
499 		xenbus_dev_error(xsd, EIO, "starting transaction");
500 		return;
501 	}
502 
503 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
504 	    xnfp->xnf_tx_ring_ref);
505 	if (err != 0) {
506 		message = "writing tx ring-ref";
507 		goto abort_transaction;
508 	}
509 
510 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
511 	    xnfp->xnf_rx_ring_ref);
512 	if (err != 0) {
513 		message = "writing rx ring-ref";
514 		goto abort_transaction;
515 	}
516 
517 	err = xenbus_printf(xbt, xsname, "event-channel", "%u",
518 	    xnfp->xnf_evtchn);
519 	if (err != 0) {
520 		message = "writing event-channel";
521 		goto abort_transaction;
522 	}
523 
524 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
525 	if (err != 0) {
526 		message = "writing feature-rx-notify";
527 		goto abort_transaction;
528 	}
529 
530 	if (!xnfp->xnf_tx_pages_readonly) {
531 		err = xenbus_printf(xbt, xsname, "feature-tx-writable",
532 		    "%d", 1);
533 		if (err != 0) {
534 			message = "writing feature-tx-writable";
535 			goto abort_transaction;
536 		}
537 	}
538 
539 	err = xenbus_printf(xbt, xsname, "feature-no-csum-offload", "%d",
540 	    xnfp->xnf_cksum_offload ? 0 : 1);
541 	if (err != 0) {
542 		message = "writing feature-no-csum-offload";
543 		goto abort_transaction;
544 	}
545 	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d",
546 	    xnfp->xnf_rx_hvcopy ? 1 : 0);
547 	if (err != 0) {
548 		message = "writing request-rx-copy";
549 		goto abort_transaction;
550 	}
551 
552 	err = xenbus_printf(xbt, xsname, "state", "%d", XenbusStateConnected);
553 	if (err != 0) {
554 		message = "writing frontend XenbusStateConnected";
555 		goto abort_transaction;
556 	}
557 
558 	err = xenbus_transaction_end(xbt, 0);
559 	if (err != 0) {
560 		if (err == EAGAIN)
561 			goto again;
562 		xenbus_dev_error(xsd, err, "completing transaction");
563 	}
564 
565 	return;
566 
567 abort_transaction:
568 	(void) xenbus_transaction_end(xbt, 1);
569 	xenbus_dev_error(xsd, err, "%s", message);
570 }
571 
572 /*
573  * Read config info from xenstore
574  */
575 void
576 xnf_read_config(xnf_t *xnfp)
577 {
578 	char		mac[ETHERADDRL * 3];
579 	int		err, be_no_cksum_offload;
580 
581 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo), "mac",
582 	    "%s", (char *)&mac[0]);
583 	if (err != 0) {
584 		/*
585 		 * bad: we're supposed to be set up with a proper mac
586 		 * addr. at this point
587 		 */
588 		cmn_err(CE_WARN, "%s%d: no mac address",
589 		    ddi_driver_name(xnfp->xnf_devinfo),
590 		    ddi_get_instance(xnfp->xnf_devinfo));
591 			return;
592 	}
593 	if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
594 		err = ENOENT;
595 		xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
596 		    "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
597 		return;
598 	}
599 
600 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo),
601 	    "feature-no-csum-offload", "%d", &be_no_cksum_offload);
602 	/*
603 	 * If we fail to read the store we assume that the key is
604 	 * absent, implying an older domain at the far end.  Older
605 	 * domains always support checksum offload.
606 	 */
607 	if (err != 0)
608 		be_no_cksum_offload = 0;
609 	/*
610 	 * If the far end cannot do checksum offload or we do not wish
611 	 * to do it, disable it.
612 	 */
613 	if ((be_no_cksum_offload == 1) || !xnfp->xnf_cksum_offload)
614 		xnfp->xnf_cksum_offload = B_FALSE;
615 }
616 
617 /*
618  *  attach(9E) -- Attach a device to the system
619  *
620  *  Called once for each board successfully probed.
621  */
622 static int
623 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
624 {
625 	mac_register_t *macp;
626 	xnf_t *xnfp;
627 	int err;
628 
629 #ifdef XNF_DEBUG
630 	if (xnfdebug & XNF_DEBUG_DDI)
631 		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
632 		    (void *)devinfo);
633 #endif
634 
635 	switch (cmd) {
636 	case DDI_RESUME:
637 		xnfp = ddi_get_driver_private(devinfo);
638 
639 		(void) xvdi_resume(devinfo);
640 		(void) xvdi_alloc_evtchn(devinfo);
641 		xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
642 #ifdef XPV_HVM_DRIVER
643 		ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
644 		    xnfp);
645 #else
646 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
647 		    (caddr_t)xnfp);
648 #endif
649 		xnf_be_connect(xnfp);
650 		/*
651 		 * Our MAC address may have changed if we're resuming:
652 		 * - on a different host
653 		 * - on the same one and got a different MAC address
654 		 *   because we didn't specify one of our own.
655 		 * so it's useful to claim that it changed in order that
656 		 * IP send out a gratuitous ARP.
657 		 */
658 		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
659 		return (DDI_SUCCESS);
660 
661 	case DDI_ATTACH:
662 		break;
663 
664 	default:
665 		return (DDI_FAILURE);
666 	}
667 
668 	/*
669 	 *  Allocate gld_mac_info_t and xnf_instance structures
670 	 */
671 	macp = mac_alloc(MAC_VERSION);
672 	if (macp == NULL)
673 		return (DDI_FAILURE);
674 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
675 
676 	macp->m_dip = devinfo;
677 	macp->m_driver = xnfp;
678 	xnfp->xnf_devinfo = devinfo;
679 
680 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
681 	macp->m_src_addr = xnfp->xnf_mac_addr;
682 	macp->m_callbacks = &xnf_callbacks;
683 	macp->m_min_sdu = 0;
684 	macp->m_max_sdu = XNF_MAXPKT;
685 
686 	xnfp->xnf_running = B_FALSE;
687 	xnfp->xnf_connected = B_FALSE;
688 	xnfp->xnf_cksum_offload = xnf_cksum_offload;
689 	xnfp->xnf_tx_pages_readonly = xnf_tx_pages_readonly;
690 
691 	xnfp->xnf_rx_hvcopy = xnf_hvcopy_peer_status(devinfo) && xnf_rx_hvcopy;
692 #ifdef XPV_HVM_DRIVER
693 	/*
694 	 * Report our version to dom0.
695 	 */
696 	if (xenbus_printf(XBT_NULL, "hvmpv/xnf", "version", "%d",
697 	    HVMPV_XNF_VERS))
698 		cmn_err(CE_WARN, "xnf: couldn't write version\n");
699 
700 	if (!xnfp->xnf_rx_hvcopy) {
701 		cmn_err(CE_WARN, "The xnf driver requires a dom0 that "
702 		    "supports 'feature-rx-copy'");
703 		goto failure;
704 	}
705 #endif
706 
707 	/*
708 	 * Get the iblock cookie with which to initialize the mutexes.
709 	 */
710 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
711 	    != DDI_SUCCESS)
712 		goto failure;
713 	/*
714 	 * Driver locking strategy: the txlock protects all paths
715 	 * through the driver, except the interrupt thread.
716 	 * If the interrupt thread needs to do something which could
717 	 * affect the operation of any other part of the driver,
718 	 * it needs to acquire the txlock mutex.
719 	 */
720 	mutex_init(&xnfp->xnf_tx_buf_mutex,
721 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
722 	mutex_init(&xnfp->xnf_rx_buf_mutex,
723 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
724 	mutex_init(&xnfp->xnf_txlock,
725 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
726 	mutex_init(&xnfp->xnf_intrlock,
727 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
728 	cv_init(&xnfp->xnf_cv, NULL, CV_DEFAULT, NULL);
729 
730 	xnfp->xnf_gref_tx_head = (grant_ref_t)-1;
731 	xnfp->xnf_gref_rx_head = (grant_ref_t)-1;
732 	if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
733 	    &xnfp->xnf_gref_tx_head) < 0) {
734 		cmn_err(CE_WARN, "xnf%d: can't alloc tx grant refs",
735 		    ddi_get_instance(xnfp->xnf_devinfo));
736 		goto failure_1;
737 	}
738 	if (gnttab_alloc_grant_references(NET_RX_RING_SIZE,
739 	    &xnfp->xnf_gref_rx_head) < 0) {
740 		cmn_err(CE_WARN, "xnf%d: can't alloc rx grant refs",
741 		    ddi_get_instance(xnfp->xnf_devinfo));
742 		goto failure_1;
743 	}
744 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
745 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
746 		    "driver data structures",
747 		    ddi_get_instance(xnfp->xnf_devinfo));
748 		goto failure_1;
749 	}
750 
751 	xnfp->xnf_rx_ring.sring->rsp_event =
752 	    xnfp->xnf_tx_ring.sring->rsp_event = 1;
753 
754 	xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF;
755 	xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF;
756 
757 	/* set driver private pointer now */
758 	ddi_set_driver_private(devinfo, xnfp);
759 
760 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change)
761 	    != DDI_SUCCESS)
762 		goto failure_1;
763 
764 	if (!xnf_kstat_init(xnfp))
765 		goto failure_2;
766 
767 	/*
768 	 * Allocate an event channel, add the interrupt handler and
769 	 * bind it to the event channel.
770 	 */
771 	(void) xvdi_alloc_evtchn(devinfo);
772 	xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
773 #ifdef XPV_HVM_DRIVER
774 	ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
775 #else
776 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
777 #endif
778 
779 	xnf_read_config(xnfp);
780 	err = mac_register(macp, &xnfp->xnf_mh);
781 	mac_free(macp);
782 	macp = NULL;
783 	if (err != 0)
784 		goto failure_3;
785 
786 #ifdef XPV_HVM_DRIVER
787 	/*
788 	 * In the HVM case, this driver essentially replaces a driver for
789 	 * a 'real' PCI NIC. Without the "model" property set to
790 	 * "Ethernet controller", like the PCI code does, netbooting does
791 	 * not work correctly, as strplumb_get_netdev_path() will not find
792 	 * this interface.
793 	 */
794 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
795 	    "Ethernet controller");
796 #endif
797 
798 	/*
799 	 * connect to the backend
800 	 */
801 	xnf_be_connect(xnfp);
802 
803 	return (DDI_SUCCESS);
804 
805 failure_3:
806 	kstat_delete(xnfp->xnf_kstat_aux);
807 #ifdef XPV_HVM_DRIVER
808 	ec_unbind_evtchn(xnfp->xnf_evtchn);
809 	xvdi_free_evtchn(devinfo);
810 #else
811 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
812 #endif
813 	xnfp->xnf_evtchn = INVALID_EVTCHN;
814 
815 failure_2:
816 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
817 
818 failure_1:
819 	if (xnfp->xnf_gref_tx_head != (grant_ref_t)-1)
820 		gnttab_free_grant_references(xnfp->xnf_gref_tx_head);
821 	if (xnfp->xnf_gref_rx_head != (grant_ref_t)-1)
822 		gnttab_free_grant_references(xnfp->xnf_gref_rx_head);
823 	xnf_release_dma_resources(xnfp);
824 	cv_destroy(&xnfp->xnf_cv);
825 	mutex_destroy(&xnfp->xnf_rx_buf_mutex);
826 	mutex_destroy(&xnfp->xnf_txlock);
827 	mutex_destroy(&xnfp->xnf_intrlock);
828 
829 failure:
830 	kmem_free(xnfp, sizeof (*xnfp));
831 	if (macp != NULL)
832 		mac_free(macp);
833 
834 	return (DDI_FAILURE);
835 }
836 
837 /*  detach(9E) -- Detach a device from the system */
838 static int
839 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
840 {
841 	xnf_t *xnfp;		/* Our private device info */
842 	int i;
843 
844 #ifdef XNF_DEBUG
845 	if (xnfdebug & XNF_DEBUG_DDI)
846 		printf("xnf_detach(0x%p)\n", (void *)devinfo);
847 #endif
848 
849 	xnfp = ddi_get_driver_private(devinfo);
850 
851 	switch (cmd) {
852 	case DDI_SUSPEND:
853 #ifdef XPV_HVM_DRIVER
854 		ec_unbind_evtchn(xnfp->xnf_evtchn);
855 		xvdi_free_evtchn(devinfo);
856 #else
857 		ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
858 #endif
859 
860 		xvdi_suspend(devinfo);
861 
862 		mutex_enter(&xnfp->xnf_intrlock);
863 		mutex_enter(&xnfp->xnf_txlock);
864 
865 		xnfp->xnf_evtchn = INVALID_EVTCHN;
866 		xnfp->xnf_connected = B_FALSE;
867 		mutex_exit(&xnfp->xnf_txlock);
868 		mutex_exit(&xnfp->xnf_intrlock);
869 		return (DDI_SUCCESS);
870 
871 	case DDI_DETACH:
872 		break;
873 
874 	default:
875 		return (DDI_FAILURE);
876 	}
877 
878 	if (xnfp->xnf_connected)
879 		return (DDI_FAILURE);
880 
881 	/* Wait for receive buffers to be returned; give up after 5 seconds */
882 	i = 50;
883 
884 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
885 	while (xnfp->xnf_rx_bufs_outstanding > 0) {
886 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
887 		delay(drv_usectohz(100000));
888 		if (--i == 0) {
889 			cmn_err(CE_WARN,
890 			    "xnf%d: never reclaimed all the "
891 			    "receive buffers.  Still have %d "
892 			    "buffers outstanding.",
893 			    ddi_get_instance(xnfp->xnf_devinfo),
894 			    xnfp->xnf_rx_bufs_outstanding);
895 			return (DDI_FAILURE);
896 		}
897 		mutex_enter(&xnfp->xnf_rx_buf_mutex);
898 	}
899 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
900 
901 	if (mac_unregister(xnfp->xnf_mh) != 0)
902 		return (DDI_FAILURE);
903 
904 	kstat_delete(xnfp->xnf_kstat_aux);
905 
906 	/* Stop the receiver */
907 	xnf_stop(xnfp);
908 
909 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
910 
911 	/* Remove the interrupt */
912 #ifdef XPV_HVM_DRIVER
913 	ec_unbind_evtchn(xnfp->xnf_evtchn);
914 	xvdi_free_evtchn(devinfo);
915 #else
916 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
917 #endif
918 
919 	/* Release any pending xmit mblks */
920 	xnf_release_mblks(xnfp);
921 
922 	/* Release all DMA resources */
923 	xnf_release_dma_resources(xnfp);
924 
925 	cv_destroy(&xnfp->xnf_cv);
926 	mutex_destroy(&xnfp->xnf_rx_buf_mutex);
927 	mutex_destroy(&xnfp->xnf_txlock);
928 	mutex_destroy(&xnfp->xnf_intrlock);
929 
930 	kmem_free(xnfp, sizeof (*xnfp));
931 
932 	return (DDI_SUCCESS);
933 }
934 
935 /*
936  *  xnf_set_mac_addr() -- set the physical network address on the board.
937  */
938 /*ARGSUSED*/
939 static int
940 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
941 {
942 	xnf_t *xnfp = arg;
943 
944 #ifdef XNF_DEBUG
945 	if (xnfdebug & XNF_DEBUG_TRACE)
946 		printf("xnf%d: set_mac_addr(0x%p): "
947 		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
948 		    ddi_get_instance(xnfp->xnf_devinfo),
949 		    (void *)xnfp, macaddr[0], macaddr[1], macaddr[2],
950 		    macaddr[3], macaddr[4], macaddr[5]);
951 #endif
952 	/*
953 	 * We can't set our macaddr.
954 	 *
955 	 * XXPV dme: Why not?
956 	 */
957 	return (ENOTSUP);
958 }
959 
960 /*
961  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
962  *
963  *  Program the hardware to enable/disable the multicast address
964  *  in "mcast".  Enable if "add" is true, disable if false.
965  */
966 /*ARGSUSED*/
967 static int
968 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
969 {
970 	xnf_t *xnfp = arg;
971 
972 #ifdef XNF_DEBUG
973 	if (xnfdebug & XNF_DEBUG_TRACE)
974 		printf("xnf%d set_multicast(0x%p): "
975 		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
976 		    ddi_get_instance(xnfp->xnf_devinfo),
977 		    (void *)xnfp, mca[0], mca[1], mca[2],
978 		    mca[3], mca[4], mca[5]);
979 #endif
980 
981 	/*
982 	 * XXPV dme: Ideally we'd relay the address to the backend for
983 	 * enabling.  The protocol doesn't support that (interesting
984 	 * extension), so we simply succeed and hope that the relevant
985 	 * packets are going to arrive.
986 	 *
987 	 * If protocol support is added for enable/disable then we'll
988 	 * need to keep a list of those in use and re-add on resume.
989 	 */
990 	return (0);
991 }
992 
993 /*
994  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
995  *
996  *  Program the hardware to enable/disable promiscuous mode.
997  */
998 /*ARGSUSED*/
999 static int
1000 xnf_set_promiscuous(void *arg, boolean_t on)
1001 {
1002 	xnf_t *xnfp = arg;
1003 
1004 #ifdef XNF_DEBUG
1005 	if (xnfdebug & XNF_DEBUG_TRACE)
1006 		printf("xnf%d set_promiscuous(0x%p, %x)\n",
1007 		    ddi_get_instance(xnfp->xnf_devinfo),
1008 		    (void *)xnfp, on);
1009 #endif
1010 	/*
1011 	 * We can't really do this, but we pretend that we can in
1012 	 * order that snoop will work.
1013 	 */
1014 	return (0);
1015 }
1016 
/*
 * Clean buffers that we have responses for from the transmit ring.
 *
 * For every response the backend has produced, the matching tx_pktinfo
 * entry is returned to the free id list and its DMA binding, grant
 * reference, message and bounce buffer (if any) are released.  The scan
 * is repeated until the ring's final check finds no further responses.
 *
 * The caller must hold xnf_txlock.  Returns the number of free request
 * slots in the transmit ring.
 */
static int
xnf_clean_tx_ring(xnf_t *xnfp)
{
	RING_IDX		next_resp, i;
	struct tx_pktinfo	*reap;
	int			id;
	grant_ref_t		ref;
	boolean_t		work_to_do;

	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));

loop:
	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
		/*
		 * index of next transmission ack
		 */
		next_resp = xnfp->xnf_tx_ring.sring->rsp_prod;
		/* Read rsp_prod before reading any response it covers. */
		membar_consumer();
		/*
		 * Clean tx packets from ring that we have responses for
		 */
		for (i = xnfp->xnf_tx_ring.rsp_cons; i != next_resp; i++) {
			/*
			 * NOTE(review): the id comes from the shared ring
			 * and is used as an array index without a range
			 * check -- confirm the backend is trusted here.
			 */
			id = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i)->id;
			reap = &xnfp->xnf_tx_pkt_info[id];
			ref = reap->grant_ref;
			/*
			 * Return id to free list
			 */
			reap->id = xnfp->xnf_tx_pkt_id_list;
			xnfp->xnf_tx_pkt_id_list = id;
			/* The backend must be finished with the page. */
			if (gnttab_query_foreign_access(ref) != 0)
				panic("tx grant still in use "
				    "by backend domain");
			(void) ddi_dma_unbind_handle(reap->dma_handle);
			(void) gnttab_end_foreign_access_ref(ref,
			    xnfp->xnf_tx_pages_readonly);
			gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head,
			    ref);
			freemsg(reap->mp);
			reap->mp = NULL;
			reap->grant_ref = GRANT_INVALID_REF;
			/* bdesc is only set when a tx buffer was used. */
			if (reap->bdesc != NULL)
				xnf_free_tx_buffer(reap->bdesc);
			reap->bdesc = NULL;
		}
		/* Everything up to next_resp has now been consumed. */
		xnfp->xnf_tx_ring.rsp_cons = next_resp;
		membar_enter();
	}

	/*
	 * Re-check for responses after the scan; work_to_do is set by the
	 * macro when more have arrived and the scan must be repeated.
	 */
	/* LINTED: constant in conditional context */
	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
	if (work_to_do)
		goto loop;

	return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
}
1076 
1077 /*
1078  * If we need to pull up data from either a packet that crosses a page
1079  * boundary or consisting of multiple mblks, do it here.  We allocate
1080  * a page aligned buffer and copy the data into it.  The header for the
1081  * allocated buffer is returned. (which is also allocated here)
1082  */
1083 static struct xnf_buffer_desc *
1084 xnf_pullupmsg(xnf_t *xnfp, mblk_t *mp)
1085 {
1086 	struct xnf_buffer_desc	*bdesc;
1087 	mblk_t			*mptr;
1088 	caddr_t			bp;
1089 	int			len;
1090 
1091 	/*
1092 	 * get a xmit buffer from the xmit buffer pool
1093 	 */
1094 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
1095 	bdesc = xnf_get_tx_buffer(xnfp);
1096 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
1097 	if (bdesc == NULL)
1098 		return (bdesc);
1099 	/*
1100 	 * Copy the data into the buffer
1101 	 */
1102 	xnfp->xnf_stat_tx_pullup++;
1103 	bp = bdesc->buf;
1104 	for (mptr = mp; mptr != NULL; mptr = mptr->b_cont) {
1105 		len = mptr->b_wptr - mptr->b_rptr;
1106 		bcopy(mptr->b_rptr, bp, len);
1107 		bp += len;
1108 	}
1109 	return (bdesc);
1110 }
1111 
1112 /*
1113  *  xnf_send_one() -- send a packet
1114  *
1115  *  Called when a packet is ready to be transmitted. A pointer to an
1116  *  M_DATA message that contains the packet is passed to this routine.
1117  *  At least the complete LLC header is contained in the message's
1118  *  first message block, and the remainder of the packet is contained
1119  *  within additional M_DATA message blocks linked to the first
1120  *  message block.
1121  *
1122  */
1123 static boolean_t
1124 xnf_send_one(xnf_t *xnfp, mblk_t *mp)
1125 {
1126 	struct xnf_buffer_desc	*xmitbuf;
1127 	struct tx_pktinfo	*txp_info;
1128 	mblk_t			*mptr;
1129 	ddi_dma_cookie_t	dma_cookie;
1130 	RING_IDX		slot;
1131 	int			length = 0, i, pktlen = 0, rc, tx_id;
1132 	int			tx_ring_freespace, page_oops;
1133 	uint_t			ncookies;
1134 	volatile netif_tx_request_t	*txrp;
1135 	caddr_t			bufaddr;
1136 	grant_ref_t		ref;
1137 	unsigned long		mfn;
1138 	uint32_t		pflags;
1139 	domid_t			oeid;
1140 
1141 #ifdef XNF_DEBUG
1142 	if (xnfdebug & XNF_DEBUG_SEND)
1143 		printf("xnf%d send(0x%p, 0x%p)\n",
1144 		    ddi_get_instance(xnfp->xnf_devinfo),
1145 		    (void *)xnfp, (void *)mp);
1146 #endif
1147 
1148 	ASSERT(mp != NULL);
1149 	ASSERT(mp->b_next == NULL);
1150 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1151 
1152 	tx_ring_freespace = xnf_clean_tx_ring(xnfp);
1153 	ASSERT(tx_ring_freespace >= 0);
1154 
1155 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1156 	xnfp->xnf_stat_tx_attempt++;
1157 	/*
1158 	 * If there are no xmit ring slots available, return.
1159 	 */
1160 	if (tx_ring_freespace == 0) {
1161 		xnfp->xnf_stat_tx_defer++;
1162 		return (B_FALSE);	/* Send should be retried */
1163 	}
1164 
1165 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1166 	/* Count the number of mblks in message and compute packet size */
1167 	for (i = 0, mptr = mp; mptr != NULL; mptr = mptr->b_cont, i++)
1168 		pktlen += (mptr->b_wptr - mptr->b_rptr);
1169 
1170 	/* Make sure packet isn't too large */
1171 	if (pktlen > XNF_FRAMESIZE) {
1172 		cmn_err(CE_WARN, "xnf%d: oversized packet (%d bytes) dropped",
1173 		    ddi_get_instance(xnfp->xnf_devinfo), pktlen);
1174 		freemsg(mp);
1175 		return (B_TRUE);
1176 	}
1177 
1178 	/*
1179 	 * Test if we cross a page boundary with our buffer
1180 	 */
1181 	page_oops = (i == 1) &&
1182 	    (xnf_btop((size_t)mp->b_rptr) !=
1183 	    xnf_btop((size_t)(mp->b_rptr + pktlen)));
1184 	/*
1185 	 * XXPV - unfortunately, the Xen virtual net device currently
1186 	 * doesn't support multiple packet frags, so this will always
1187 	 * end up doing the pullup if we got more than one packet.
1188 	 */
1189 	if (i > xnf_max_tx_frags || page_oops) {
1190 		if (page_oops)
1191 			xnfp->xnf_stat_tx_pagebndry++;
1192 		if ((xmitbuf = xnf_pullupmsg(xnfp, mp)) == NULL) {
1193 			/* could not allocate resources? */
1194 #ifdef XNF_DEBUG
1195 			cmn_err(CE_WARN, "xnf%d: pullupmsg failed",
1196 			    ddi_get_instance(xnfp->xnf_devinfo));
1197 #endif
1198 			xnfp->xnf_stat_tx_defer++;
1199 			return (B_FALSE);	/* Retry send */
1200 		}
1201 		bufaddr = xmitbuf->buf;
1202 	} else {
1203 		xmitbuf = NULL;
1204 		bufaddr = (caddr_t)mp->b_rptr;
1205 	}
1206 
1207 	/* set up data descriptor */
1208 	length = pktlen;
1209 
1210 	/*
1211 	 * Get packet id from free list
1212 	 */
1213 	tx_id = xnfp->xnf_tx_pkt_id_list;
1214 	ASSERT(tx_id < NET_TX_RING_SIZE);
1215 	txp_info = &xnfp->xnf_tx_pkt_info[tx_id];
1216 	xnfp->xnf_tx_pkt_id_list = txp_info->id;
1217 	txp_info->id = tx_id;
1218 
1219 	/* Prepare for DMA mapping of tx buffer(s) */
1220 	rc = ddi_dma_addr_bind_handle(txp_info->dma_handle,
1221 	    NULL, bufaddr, length, DDI_DMA_WRITE | DDI_DMA_STREAMING,
1222 	    DDI_DMA_DONTWAIT, 0, &dma_cookie, &ncookies);
1223 	if (rc != DDI_DMA_MAPPED) {
1224 		ASSERT(rc != DDI_DMA_INUSE);
1225 		ASSERT(rc != DDI_DMA_PARTIAL_MAP);
1226 		/*
1227 		 *  Return id to free list
1228 		 */
1229 		txp_info->id = xnfp->xnf_tx_pkt_id_list;
1230 		xnfp->xnf_tx_pkt_id_list = tx_id;
1231 		if (rc == DDI_DMA_NORESOURCES) {
1232 			xnfp->xnf_stat_tx_defer++;
1233 			return (B_FALSE); /* Retry later */
1234 		}
1235 #ifdef XNF_DEBUG
1236 		cmn_err(CE_WARN, "xnf%d: bind_handle failed (%x)",
1237 		    ddi_get_instance(xnfp->xnf_devinfo), rc);
1238 #endif
1239 		return (B_FALSE);
1240 	}
1241 
1242 	ASSERT(ncookies == 1);
1243 	ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_tx_head);
1244 	ASSERT((signed short)ref >= 0);
1245 	mfn = xnf_btop(pa_to_ma((paddr_t)dma_cookie.dmac_laddress));
1246 	gnttab_grant_foreign_access_ref(ref, oeid, mfn,
1247 	    xnfp->xnf_tx_pages_readonly);
1248 	txp_info->grant_ref = ref;
1249 	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1250 	txrp->gref = ref;
1251 	txrp->size = dma_cookie.dmac_size;
1252 	txrp->offset = (uintptr_t)bufaddr & PAGEOFFSET;
1253 	txrp->id = tx_id;
1254 	txrp->flags = 0;
1255 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &pflags);
1256 	if (pflags != 0) {
1257 		ASSERT(xnfp->xnf_cksum_offload);
1258 		/*
1259 		 * If the local protocol stack requests checksum
1260 		 * offload we set the 'checksum blank' flag,
1261 		 * indicating to the peer that we need the checksum
1262 		 * calculated for us.
1263 		 *
1264 		 * We _don't_ set the validated flag, because we haven't
1265 		 * validated that the data and the checksum match.
1266 		 */
1267 		txrp->flags |= NETTXF_csum_blank;
1268 		xnfp->xnf_stat_tx_cksum_deferred++;
1269 	}
1270 	membar_producer();
1271 	xnfp->xnf_tx_ring.req_prod_pvt = slot + 1;
1272 
1273 	txp_info->mp = mp;
1274 	txp_info->bdesc = xmitbuf;
1275 
1276 	xnfp->xnf_stat_opackets++;
1277 	xnfp->xnf_stat_obytes += pktlen;
1278 
1279 	return (B_TRUE);	/* successful transmit attempt */
1280 }
1281 
1282 mblk_t *
1283 xnf_send(void *arg, mblk_t *mp)
1284 {
1285 	xnf_t *xnfp = arg;
1286 	mblk_t *next;
1287 	boolean_t sent_something = B_FALSE;
1288 
1289 	mutex_enter(&xnfp->xnf_txlock);
1290 
1291 	/*
1292 	 * Transmission attempts should be impossible without having
1293 	 * previously called xnf_start().
1294 	 */
1295 	ASSERT(xnfp->xnf_running);
1296 
1297 	/*
1298 	 * Wait for getting connected to the backend
1299 	 */
1300 	while (!xnfp->xnf_connected) {
1301 		cv_wait(&xnfp->xnf_cv, &xnfp->xnf_txlock);
1302 	}
1303 
1304 	while (mp != NULL) {
1305 		next = mp->b_next;
1306 		mp->b_next = NULL;
1307 
1308 		if (!xnf_send_one(xnfp, mp)) {
1309 			mp->b_next = next;
1310 			break;
1311 		}
1312 
1313 		mp = next;
1314 		sent_something = B_TRUE;
1315 	}
1316 
1317 	if (sent_something) {
1318 		boolean_t notify;
1319 
1320 		/* LINTED: constant in conditional context */
1321 		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1322 		    notify);
1323 		if (notify)
1324 			ec_notify_via_evtchn(xnfp->xnf_evtchn);
1325 	}
1326 
1327 	mutex_exit(&xnfp->xnf_txlock);
1328 
1329 	return (mp);
1330 }
1331 
1332 /*
1333  *  xnf_intr() -- ring interrupt service routine
1334  */
1335 static uint_t
1336 xnf_intr(caddr_t arg)
1337 {
1338 	xnf_t *xnfp = (xnf_t *)arg;
1339 	int tx_ring_space;
1340 
1341 	mutex_enter(&xnfp->xnf_intrlock);
1342 
1343 	/* spurious intr */
1344 	if (!xnfp->xnf_connected) {
1345 		mutex_exit(&xnfp->xnf_intrlock);
1346 		xnfp->xnf_stat_unclaimed_interrupts++;
1347 		return (DDI_INTR_UNCLAIMED);
1348 	}
1349 
1350 #ifdef XNF_DEBUG
1351 	if (xnfdebug & XNF_DEBUG_INT)
1352 		printf("xnf%d intr(0x%p)\n",
1353 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1354 #endif
1355 	if (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1356 		mblk_t *mp;
1357 
1358 		if (xnfp->xnf_rx_hvcopy)
1359 			mp = xnf_process_hvcopy_recv(xnfp);
1360 		else
1361 			mp = xnf_process_recv(xnfp);
1362 
1363 		if (mp != NULL)
1364 			mac_rx(xnfp->xnf_mh, xnfp->xnf_rx_handle, mp);
1365 	}
1366 
1367 	/*
1368 	 * Clean tx ring and try to start any blocked xmit streams if
1369 	 * there is now some space.
1370 	 */
1371 	mutex_enter(&xnfp->xnf_txlock);
1372 	tx_ring_space = xnf_clean_tx_ring(xnfp);
1373 	mutex_exit(&xnfp->xnf_txlock);
1374 	if (tx_ring_space > XNF_TX_FREE_THRESH) {
1375 		mutex_exit(&xnfp->xnf_intrlock);
1376 		mac_tx_update(xnfp->xnf_mh);
1377 		mutex_enter(&xnfp->xnf_intrlock);
1378 	}
1379 
1380 	xnfp->xnf_stat_interrupts++;
1381 	mutex_exit(&xnfp->xnf_intrlock);
1382 	return (DDI_INTR_CLAIMED); /* indicate that the interrupt was for us */
1383 }
1384 
1385 /*
1386  *  xnf_start() -- start the board receiving and enable interrupts.
1387  */
1388 static int
1389 xnf_start(void *arg)
1390 {
1391 	xnf_t *xnfp = arg;
1392 
1393 #ifdef XNF_DEBUG
1394 	if (xnfdebug & XNF_DEBUG_TRACE)
1395 		printf("xnf%d start(0x%p)\n",
1396 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1397 #endif
1398 
1399 	mutex_enter(&xnfp->xnf_intrlock);
1400 	mutex_enter(&xnfp->xnf_txlock);
1401 
1402 	/* Accept packets from above. */
1403 	xnfp->xnf_running = B_TRUE;
1404 
1405 	mutex_exit(&xnfp->xnf_txlock);
1406 	mutex_exit(&xnfp->xnf_intrlock);
1407 
1408 	return (0);
1409 }
1410 
1411 /* xnf_stop() - disable hardware */
1412 static void
1413 xnf_stop(void *arg)
1414 {
1415 	xnf_t *xnfp = arg;
1416 
1417 #ifdef XNF_DEBUG
1418 	if (xnfdebug & XNF_DEBUG_TRACE)
1419 		printf("xnf%d stop(0x%p)\n",
1420 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1421 #endif
1422 
1423 	mutex_enter(&xnfp->xnf_intrlock);
1424 	mutex_enter(&xnfp->xnf_txlock);
1425 
1426 	xnfp->xnf_running = B_FALSE;
1427 
1428 	mutex_exit(&xnfp->xnf_txlock);
1429 	mutex_exit(&xnfp->xnf_intrlock);
1430 }
1431 
1432 /*
1433  * Driver private functions follow
1434  */
1435 
1436 /*
1437  * Hang buffer on rx ring
1438  */
1439 static void
1440 rx_buffer_hang(xnf_t *xnfp, struct xnf_buffer_desc *bdesc)
1441 {
1442 	volatile netif_rx_request_t	*reqp;
1443 	RING_IDX			hang_ix;
1444 	grant_ref_t			ref;
1445 	domid_t				oeid;
1446 
1447 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1448 
1449 	ASSERT(MUTEX_HELD(&xnfp->xnf_intrlock));
1450 	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
1451 	    xnfp->xnf_rx_ring.req_prod_pvt);
1452 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
1453 	ASSERT(xnfp->xnf_rxpkt_bufptr[hang_ix] == NULL);
1454 	if (bdesc->grant_ref == GRANT_INVALID_REF) {
1455 		ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_rx_head);
1456 		ASSERT((signed short)ref >= 0);
1457 		bdesc->grant_ref = ref;
1458 		if (xnfp->xnf_rx_hvcopy) {
1459 			pfn_t pfn = xnf_btop(bdesc->buf_phys);
1460 			mfn_t mfn = pfn_to_mfn(pfn);
1461 
1462 			gnttab_grant_foreign_access_ref(ref, oeid, mfn, 0);
1463 		} else {
1464 			gnttab_grant_foreign_transfer_ref(ref, oeid, 0);
1465 		}
1466 	}
1467 	reqp->id = hang_ix;
1468 	reqp->gref = bdesc->grant_ref;
1469 	bdesc->id = hang_ix;
1470 	xnfp->xnf_rxpkt_bufptr[hang_ix] = bdesc;
1471 	membar_producer();
1472 	xnfp->xnf_rx_ring.req_prod_pvt++;
1473 }
1474 
1475 static mblk_t *
1476 xnf_process_hvcopy_recv(xnf_t *xnfp)
1477 {
1478 	netif_rx_response_t *rxpkt;
1479 	mblk_t		*mp, *head, *tail;
1480 	struct		xnf_buffer_desc *bdesc;
1481 	boolean_t	hwcsum = B_FALSE, notify, work_to_do;
1482 	size_t 		len;
1483 
1484 	/*
1485 	 * in loop over unconsumed responses, we do:
1486 	 * 1. get a response
1487 	 * 2. take corresponding buffer off recv. ring
1488 	 * 3. indicate this by setting slot to NULL
1489 	 * 4. create a new message and
1490 	 * 5. copy data in, adjust ptr
1491 	 *
1492 	 * outside loop:
1493 	 * 7. make sure no more data has arrived; kick HV
1494 	 */
1495 
1496 	head = tail = NULL;
1497 
1498 loop:
1499 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1500 
1501 		/* 1. */
1502 		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
1503 		    xnfp->xnf_rx_ring.rsp_cons);
1504 
1505 		DTRACE_PROBE4(got_PKT, int, (int)rxpkt->id, int,
1506 		    (int)rxpkt->offset,
1507 		    int, (int)rxpkt->flags, int, (int)rxpkt->status);
1508 
1509 		/*
1510 		 * 2.
1511 		 * Take buffer off of receive ring
1512 		 */
1513 		hwcsum = B_FALSE;
1514 		bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id];
1515 		/* 3 */
1516 		xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL;
1517 		ASSERT(bdesc->id == rxpkt->id);
1518 		mp = NULL;
1519 		if (!xnfp->xnf_running) {
1520 			DTRACE_PROBE4(pkt_dropped, int, rxpkt->status,
1521 			    char *, bdesc->buf, int, rxpkt->offset,
1522 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1523 			xnfp->xnf_stat_drop++;
1524 			/*
1525 			 * re-hang the buffer
1526 			 */
1527 			rx_buffer_hang(xnfp, bdesc);
1528 		} else if (rxpkt->status <= 0) {
1529 			DTRACE_PROBE4(pkt_status_negative, int, rxpkt->status,
1530 			    char *, bdesc->buf, int, rxpkt->offset,
1531 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1532 			xnfp->xnf_stat_errrx++;
1533 			if (rxpkt->status == 0)
1534 				xnfp->xnf_stat_runt++;
1535 			if (rxpkt->status == NETIF_RSP_ERROR)
1536 				xnfp->xnf_stat_mac_rcv_error++;
1537 			if (rxpkt->status == NETIF_RSP_DROPPED)
1538 				xnfp->xnf_stat_norxbuf++;
1539 			/*
1540 			 * re-hang the buffer
1541 			 */
1542 			rx_buffer_hang(xnfp, bdesc);
1543 		} else {
1544 			grant_ref_t		ref =  bdesc->grant_ref;
1545 			struct xnf_buffer_desc	*new_bdesc;
1546 			unsigned long		off = rxpkt->offset;
1547 
1548 			DTRACE_PROBE4(pkt_status_ok, int, rxpkt->status,
1549 			    char *, bdesc->buf, int, rxpkt->offset,
1550 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1551 			len = rxpkt->status;
1552 			ASSERT(off + len <= PAGEOFFSET);
1553 			if (ref == GRANT_INVALID_REF) {
1554 				mp = NULL;
1555 				new_bdesc = bdesc;
1556 				cmn_err(CE_WARN, "Bad rx grant reference %d "
1557 				    "from dom %d", ref,
1558 				    xvdi_get_oeid(xnfp->xnf_devinfo));
1559 				goto luckless;
1560 			}
1561 			/*
1562 			 * Release ref which we'll be re-claiming in
1563 			 * rx_buffer_hang().
1564 			 */
1565 			bdesc->grant_ref = GRANT_INVALID_REF;
1566 			(void) gnttab_end_foreign_access_ref(ref, 0);
1567 			gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head,
1568 			    ref);
1569 			if (rxpkt->flags & NETRXF_data_validated)
1570 				hwcsum = B_TRUE;
1571 
1572 			/*
1573 			 * XXPV for the initial implementation of HVcopy,
1574 			 * create a new msg and copy in the data
1575 			 */
1576 			/* 4. */
1577 			if ((mp = allocb(len, BPRI_MED)) == NULL) {
1578 				/*
1579 				 * Couldn't get buffer to copy to,
1580 				 * drop this data, and re-hang
1581 				 * the buffer on the ring.
1582 				 */
1583 				xnfp->xnf_stat_norxbuf++;
1584 				DTRACE_PROBE(alloc_nix);
1585 			} else {
1586 				/* 5. */
1587 				DTRACE_PROBE(alloc_ok);
1588 				bcopy(bdesc->buf + off, mp->b_wptr,
1589 				    len);
1590 				mp->b_wptr += len;
1591 			}
1592 			new_bdesc = bdesc;
1593 luckless:
1594 
1595 			/* Re-hang old or hang new buffer. */
1596 			rx_buffer_hang(xnfp, new_bdesc);
1597 		}
1598 		if (mp) {
1599 			if (hwcsum) {
1600 				/*
1601 				 * See comments in xnf_process_recv().
1602 				 */
1603 
1604 				(void) hcksum_assoc(mp, NULL,
1605 				    NULL, 0, 0, 0, 0,
1606 				    HCK_FULLCKSUM |
1607 				    HCK_FULLCKSUM_OK,
1608 				    0);
1609 				xnfp->xnf_stat_rx_cksum_no_need++;
1610 			}
1611 			if (head == NULL) {
1612 				head = tail = mp;
1613 			} else {
1614 				tail->b_next = mp;
1615 				tail = mp;
1616 			}
1617 
1618 			ASSERT(mp->b_next == NULL);
1619 
1620 			xnfp->xnf_stat_ipackets++;
1621 			xnfp->xnf_stat_rbytes += len;
1622 		}
1623 
1624 		xnfp->xnf_rx_ring.rsp_cons++;
1625 
1626 		xnfp->xnf_stat_hvcopy_packet_processed++;
1627 	}
1628 
1629 	/* 7. */
1630 	/*
1631 	 * Has more data come in since we started?
1632 	 */
1633 	/* LINTED: constant in conditional context */
1634 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do);
1635 	if (work_to_do)
1636 		goto loop;
1637 
1638 	/*
1639 	 * Indicate to the backend that we have re-filled the receive
1640 	 * ring.
1641 	 */
1642 	/* LINTED: constant in conditional context */
1643 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
1644 	if (notify)
1645 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1646 
1647 	return (head);
1648 }
1649 
1650 /* Process all queued received packets */
1651 static mblk_t *
1652 xnf_process_recv(xnf_t *xnfp)
1653 {
1654 	volatile netif_rx_response_t *rxpkt;
1655 	mblk_t *mp, *head, *tail;
1656 	struct xnf_buffer_desc *bdesc;
1657 	extern mblk_t *desballoc(unsigned char *, size_t, uint_t, frtn_t *);
1658 	boolean_t hwcsum = B_FALSE, notify, work_to_do;
1659 	size_t len;
1660 	pfn_t pfn;
1661 	long cnt;
1662 
1663 	head = tail = NULL;
1664 loop:
1665 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1666 
1667 		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
1668 		    xnfp->xnf_rx_ring.rsp_cons);
1669 
1670 		/*
1671 		 * Take buffer off of receive ring
1672 		 */
1673 		hwcsum = B_FALSE;
1674 		bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id];
1675 		xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL;
1676 		ASSERT(bdesc->id == rxpkt->id);
1677 		mp = NULL;
1678 		if (!xnfp->xnf_running) {
1679 			xnfp->xnf_stat_drop++;
1680 			/*
1681 			 * re-hang the buffer
1682 			 */
1683 			rx_buffer_hang(xnfp, bdesc);
1684 		} else if (rxpkt->status <= 0) {
1685 			xnfp->xnf_stat_errrx++;
1686 			if (rxpkt->status == 0)
1687 				xnfp->xnf_stat_runt++;
1688 			if (rxpkt->status == NETIF_RSP_ERROR)
1689 				xnfp->xnf_stat_mac_rcv_error++;
1690 			if (rxpkt->status == NETIF_RSP_DROPPED)
1691 				xnfp->xnf_stat_norxbuf++;
1692 			/*
1693 			 * re-hang the buffer
1694 			 */
1695 			rx_buffer_hang(xnfp, bdesc);
1696 		} else {
1697 			grant_ref_t ref =  bdesc->grant_ref;
1698 			struct xnf_buffer_desc *new_bdesc;
1699 			unsigned long off = rxpkt->offset;
1700 			unsigned long mfn;
1701 
1702 			len = rxpkt->status;
1703 			ASSERT(off + len <= PAGEOFFSET);
1704 			if (ref == GRANT_INVALID_REF) {
1705 				mp = NULL;
1706 				new_bdesc = bdesc;
1707 				cmn_err(CE_WARN, "Bad rx grant reference %d "
1708 				    "from dom %d", ref,
1709 				    xvdi_get_oeid(xnfp->xnf_devinfo));
1710 				goto luckless;
1711 			}
1712 			bdesc->grant_ref = GRANT_INVALID_REF;
1713 			mfn = gnttab_end_foreign_transfer_ref(ref);
1714 			ASSERT(mfn != MFN_INVALID);
1715 			ASSERT(hat_getpfnum(kas.a_hat, bdesc->buf) ==
1716 			    PFN_INVALID);
1717 
1718 			gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head,
1719 			    ref);
1720 			reassign_pfn(xnf_btop(bdesc->buf_phys), mfn);
1721 			hat_devload(kas.a_hat, bdesc->buf, PAGESIZE,
1722 			    xnf_btop(bdesc->buf_phys),
1723 			    PROT_READ | PROT_WRITE, HAT_LOAD);
1724 			balloon_drv_added(1);
1725 
1726 			if (rxpkt->flags & NETRXF_data_validated)
1727 				hwcsum = B_TRUE;
1728 			if (len <= xnf_rx_bcopy_thresh) {
1729 				/*
1730 				 * For small buffers, just copy the data
1731 				 * and send the copy upstream.
1732 				 */
1733 				new_bdesc = NULL;
1734 			} else {
1735 				/*
1736 				 * We send a pointer to this data upstream;
1737 				 * we need a new buffer to replace this one.
1738 				 */
1739 				mutex_enter(&xnfp->xnf_rx_buf_mutex);
1740 				new_bdesc = xnf_get_buffer(xnfp);
1741 				if (new_bdesc != NULL) {
1742 					xnfp->xnf_rx_bufs_outstanding++;
1743 				} else {
1744 					xnfp->xnf_stat_rx_no_ringbuf++;
1745 				}
1746 				mutex_exit(&xnfp->xnf_rx_buf_mutex);
1747 			}
1748 
1749 			if (new_bdesc == NULL) {
1750 				/*
1751 				 * Don't have a new ring buffer; bcopy the data
1752 				 * from the buffer, and preserve the
1753 				 * original buffer
1754 				 */
1755 				if ((mp = allocb(len, BPRI_MED)) == NULL) {
1756 					/*
1757 					 * Could't get buffer to copy to,
1758 					 * drop this data, and re-hang
1759 					 * the buffer on the ring.
1760 					 */
1761 					xnfp->xnf_stat_norxbuf++;
1762 				} else {
1763 					bcopy(bdesc->buf + off, mp->b_wptr,
1764 					    len);
1765 				}
1766 				/*
1767 				 * Give the buffer page back to xen
1768 				 */
1769 				pfn = xnf_btop(bdesc->buf_phys);
1770 				cnt = balloon_free_pages(1, &mfn, bdesc->buf,
1771 				    &pfn);
1772 				if (cnt != 1) {
1773 					cmn_err(CE_WARN, "unable to give a "
1774 					    "page back to the hypervisor\n");
1775 				}
1776 				new_bdesc = bdesc;
1777 			} else {
1778 				if ((mp = desballoc((unsigned char *)bdesc->buf,
1779 				    off + len, 0, (frtn_t *)bdesc)) == NULL) {
1780 					/*
1781 					 * Couldn't get mblk to pass recv data
1782 					 * up with, free the old ring buffer
1783 					 */
1784 					xnfp->xnf_stat_norxbuf++;
1785 					xnf_rcv_complete(bdesc);
1786 					goto luckless;
1787 				}
1788 				(void) ddi_dma_sync(bdesc->dma_handle,
1789 				    0, 0, DDI_DMA_SYNC_FORCPU);
1790 
1791 				mp->b_wptr += off;
1792 				mp->b_rptr += off;
1793 			}
1794 luckless:
1795 			if (mp)
1796 				mp->b_wptr += len;
1797 			/* re-hang old or hang new buffer */
1798 			rx_buffer_hang(xnfp, new_bdesc);
1799 		}
1800 		if (mp) {
1801 			if (hwcsum) {
1802 				/*
1803 				 * If the peer says that the data has
1804 				 * been validated then we declare that
1805 				 * the full checksum has been
1806 				 * verified.
1807 				 *
1808 				 * We don't look at the "checksum
1809 				 * blank" flag, and hence could have a
1810 				 * packet here that we are asserting
1811 				 * is good with a blank checksum.
1812 				 *
1813 				 * The hardware checksum offload
1814 				 * specification says that we must
1815 				 * provide the actual checksum as well
1816 				 * as an assertion that it is valid,
1817 				 * but the protocol stack doesn't
1818 				 * actually use it and some other
1819 				 * drivers don't bother, so we don't.
1820 				 * If it was necessary we could grovel
1821 				 * in the packet to find it.
1822 				 */
1823 
1824 				(void) hcksum_assoc(mp, NULL,
1825 				    NULL, 0, 0, 0, 0,
1826 				    HCK_FULLCKSUM |
1827 				    HCK_FULLCKSUM_OK,
1828 				    0);
1829 				xnfp->xnf_stat_rx_cksum_no_need++;
1830 			}
1831 			if (head == NULL) {
1832 				head = tail = mp;
1833 			} else {
1834 				tail->b_next = mp;
1835 				tail = mp;
1836 			}
1837 
1838 			ASSERT(mp->b_next == NULL);
1839 
1840 			xnfp->xnf_stat_ipackets++;
1841 			xnfp->xnf_stat_rbytes += len;
1842 		}
1843 
1844 		xnfp->xnf_rx_ring.rsp_cons++;
1845 	}
1846 
1847 	/*
1848 	 * Has more data come in since we started?
1849 	 */
1850 	/* LINTED: constant in conditional context */
1851 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do);
1852 	if (work_to_do)
1853 		goto loop;
1854 
1855 	/*
1856 	 * Indicate to the backend that we have re-filled the receive
1857 	 * ring.
1858 	 */
1859 	/* LINTED: constant in conditional context */
1860 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
1861 	if (notify)
1862 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1863 
1864 	return (head);
1865 }
1866 
1867 /* Called when the upper layers free a message we passed upstream */
1868 static void
1869 xnf_rcv_complete(struct xnf_buffer_desc *bdesc)
1870 {
1871 	xnf_t *xnfp = bdesc->xnfp;
1872 	pfn_t pfn;
1873 	long cnt;
1874 
1875 	/* One less outstanding receive buffer */
1876 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
1877 	--xnfp->xnf_rx_bufs_outstanding;
1878 	/*
1879 	 * Return buffer to the free list, unless the free list is getting
1880 	 * too large.  XXPV - this threshold may need tuning.
1881 	 */
1882 	if (xnfp->xnf_rx_descs_free < xnf_rx_bufs_lowat) {
1883 		/*
1884 		 * Unmap the page, and hand the machine page back
1885 		 * to xen so it can be re-used as a backend net buffer.
1886 		 */
1887 		pfn = xnf_btop(bdesc->buf_phys);
1888 		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
1889 		if (cnt != 1) {
1890 			cmn_err(CE_WARN, "unable to give a page back to the "
1891 			    "hypervisor\n");
1892 		}
1893 
1894 		bdesc->next = xnfp->xnf_free_list;
1895 		xnfp->xnf_free_list = bdesc;
1896 		xnfp->xnf_rx_descs_free++;
1897 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
1898 	} else {
1899 		/*
1900 		 * We can return everything here since we have a free buffer
1901 		 * that we have not given the backing page for back to xen.
1902 		 */
1903 		--xnfp->xnf_rx_buffer_count;
1904 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
1905 		(void) ddi_dma_unbind_handle(bdesc->dma_handle);
1906 		ddi_dma_mem_free(&bdesc->acc_handle);
1907 		ddi_dma_free_handle(&bdesc->dma_handle);
1908 		kmem_free(bdesc, sizeof (*bdesc));
1909 	}
1910 }
1911 
1912 /*
1913  *  xnf_alloc_dma_resources() -- initialize the drivers structures
1914  */
1915 static int
1916 xnf_alloc_dma_resources(xnf_t *xnfp)
1917 {
1918 	dev_info_t 		*devinfo = xnfp->xnf_devinfo;
1919 	int			i;
1920 	size_t			len;
1921 	ddi_dma_cookie_t	dma_cookie;
1922 	uint_t			ncookies;
1923 	struct xnf_buffer_desc	*bdesc;
1924 	int			rc;
1925 	caddr_t			rptr;
1926 
1927 	xnfp->xnf_n_rx = NET_RX_RING_SIZE;
1928 	xnfp->xnf_max_rx_bufs = xnf_rx_bufs_hiwat;
1929 
1930 	xnfp->xnf_n_tx = NET_TX_RING_SIZE;
1931 
1932 	/*
1933 	 * The code below allocates all the DMA data structures that
1934 	 * need to be released when the driver is detached.
1935 	 *
1936 	 * First allocate handles for mapping (virtual address) pointers to
1937 	 * transmit data buffers to physical addresses
1938 	 */
1939 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
1940 		if ((rc = ddi_dma_alloc_handle(devinfo,
1941 		    &tx_buffer_dma_attr, DDI_DMA_SLEEP, 0,
1942 		    &xnfp->xnf_tx_pkt_info[i].dma_handle)) != DDI_SUCCESS)
1943 			return (DDI_FAILURE);
1944 	}
1945 
1946 	/*
1947 	 * Allocate page for the transmit descriptor ring.
1948 	 */
1949 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
1950 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
1951 		goto alloc_error;
1952 
1953 	if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
1954 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
1955 	    DDI_DMA_SLEEP, 0, &rptr, &len,
1956 	    &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
1957 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
1958 		xnfp->xnf_tx_ring_dma_handle = NULL;
1959 		goto alloc_error;
1960 	}
1961 
1962 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
1963 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
1964 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
1965 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
1966 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
1967 		xnfp->xnf_tx_ring_dma_handle = NULL;
1968 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
1969 		if (rc == DDI_DMA_NORESOURCES)
1970 			goto alloc_error;
1971 		else
1972 			goto error;
1973 	}
1974 
1975 	ASSERT(ncookies == 1);
1976 	bzero(rptr, PAGESIZE);
1977 	/* LINTED: constant in conditional context */
1978 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
1979 	/* LINTED: constant in conditional context */
1980 	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
1981 	xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
1982 
1983 	/*
1984 	 * Allocate page for the receive descriptor ring.
1985 	 */
1986 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
1987 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
1988 		goto alloc_error;
1989 
1990 	if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
1991 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
1992 	    DDI_DMA_SLEEP, 0, &rptr, &len,
1993 	    &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
1994 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
1995 		xnfp->xnf_rx_ring_dma_handle = NULL;
1996 		goto alloc_error;
1997 	}
1998 
1999 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
2000 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2001 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2002 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2003 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2004 		xnfp->xnf_rx_ring_dma_handle = NULL;
2005 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2006 		if (rc == DDI_DMA_NORESOURCES)
2007 			goto alloc_error;
2008 		else
2009 			goto error;
2010 	}
2011 
2012 	ASSERT(ncookies == 1);
2013 	bzero(rptr, PAGESIZE);
2014 	/* LINTED: constant in conditional context */
2015 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
2016 	/* LINTED: constant in conditional context */
2017 	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
2018 	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
2019 
2020 	/*
2021 	 * Preallocate receive buffers for each receive descriptor.
2022 	 */
2023 
2024 	/* Set up the "free list" of receive buffer descriptors */
2025 	for (i = 0; i < xnfp->xnf_n_rx; i++) {
2026 		if ((bdesc = xnf_alloc_buffer(xnfp)) == NULL)
2027 			goto alloc_error;
2028 		bdesc->next = xnfp->xnf_free_list;
2029 		xnfp->xnf_free_list = bdesc;
2030 	}
2031 
2032 	return (DDI_SUCCESS);
2033 
2034 alloc_error:
2035 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
2036 	    ddi_get_instance(xnfp->xnf_devinfo));
2037 error:
2038 	xnf_release_dma_resources(xnfp);
2039 	return (DDI_FAILURE);
2040 }
2041 
2042 /*
2043  * Release all DMA resources in the opposite order from acquisition
2044  * Should not be called until all outstanding esballoc buffers
2045  * have been returned.
2046  */
2047 static void
2048 xnf_release_dma_resources(xnf_t *xnfp)
2049 {
2050 	int i;
2051 
2052 	/*
2053 	 * Free receive buffers which are currently associated with
2054 	 * descriptors
2055 	 */
2056 	for (i = 0; i < xnfp->xnf_n_rx; i++) {
2057 		struct xnf_buffer_desc *bp;
2058 
2059 		if ((bp = xnfp->xnf_rxpkt_bufptr[i]) == NULL)
2060 			continue;
2061 		xnf_free_buffer(bp);
2062 		xnfp->xnf_rxpkt_bufptr[i] = NULL;
2063 	}
2064 
2065 	/* Free the receive ring buffer */
2066 	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2067 		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2068 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2069 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2070 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2071 	}
2072 	/* Free the transmit ring buffer */
2073 	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2074 		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2075 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2076 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2077 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2078 	}
2079 
2080 	/*
2081 	 * Free handles for mapping (virtual address) pointers to
2082 	 * transmit data buffers to physical addresses
2083 	 */
2084 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
2085 		if (xnfp->xnf_tx_pkt_info[i].dma_handle != NULL) {
2086 			ddi_dma_free_handle(
2087 			    &xnfp->xnf_tx_pkt_info[i].dma_handle);
2088 		}
2089 	}
2090 
2091 }
2092 
2093 static void
2094 xnf_release_mblks(xnf_t *xnfp)
2095 {
2096 	int	i;
2097 
2098 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
2099 		if (xnfp->xnf_tx_pkt_info[i].mp == NULL)
2100 			continue;
2101 		freemsg(xnfp->xnf_tx_pkt_info[i].mp);
2102 		xnfp->xnf_tx_pkt_info[i].mp = NULL;
2103 		(void) ddi_dma_unbind_handle(
2104 		    xnfp->xnf_tx_pkt_info[i].dma_handle);
2105 	}
2106 }
2107 
2108 /*
2109  * Remove a xmit buffer descriptor from the head of the free list and return
2110  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
2111  * Called with the tx_buf_mutex held.
2112  */
2113 static struct xnf_buffer_desc *
2114 xnf_get_tx_buffer(xnf_t *xnfp)
2115 {
2116 	struct xnf_buffer_desc *bdesc;
2117 
2118 	bdesc = xnfp->xnf_tx_free_list;
2119 	if (bdesc != NULL) {
2120 		xnfp->xnf_tx_free_list = bdesc->next;
2121 	} else {
2122 		bdesc = xnf_alloc_tx_buffer(xnfp);
2123 	}
2124 	return (bdesc);
2125 }
2126 
2127 /*
2128  * Remove a buffer descriptor from the head of the free list and return
2129  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
2130  * Called with the rx_buf_mutex held.
2131  */
2132 static struct xnf_buffer_desc *
2133 xnf_get_buffer(xnf_t *xnfp)
2134 {
2135 	struct xnf_buffer_desc *bdesc;
2136 
2137 	bdesc = xnfp->xnf_free_list;
2138 	if (bdesc != NULL) {
2139 		xnfp->xnf_free_list = bdesc->next;
2140 		xnfp->xnf_rx_descs_free--;
2141 	} else {
2142 		bdesc = xnf_alloc_buffer(xnfp);
2143 	}
2144 	return (bdesc);
2145 }
2146 
2147 /*
2148  * Free a xmit buffer back to the xmit free list
2149  */
2150 static void
2151 xnf_free_tx_buffer(struct xnf_buffer_desc *bp)
2152 {
2153 	xnf_t *xnfp = bp->xnfp;
2154 
2155 	mutex_enter(&xnfp->xnf_tx_buf_mutex);
2156 	bp->next = xnfp->xnf_tx_free_list;
2157 	xnfp->xnf_tx_free_list = bp;
2158 	mutex_exit(&xnfp->xnf_tx_buf_mutex);
2159 }
2160 
2161 /*
2162  * Put a buffer descriptor onto the head of the free list.
2163  * for page-flip:
2164  * We can't really free these buffers back to the kernel
2165  * since we have given away their backing page to be used
2166  * by the back end net driver.
2167  * for hvcopy:
2168  * release all the memory
2169  */
2170 static void
2171 xnf_free_buffer(struct xnf_buffer_desc *bdesc)
2172 {
2173 	xnf_t *xnfp = bdesc->xnfp;
2174 
2175 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
2176 	if (xnfp->xnf_rx_hvcopy) {
2177 		if (ddi_dma_unbind_handle(bdesc->dma_handle) != DDI_SUCCESS)
2178 			goto out;
2179 		ddi_dma_mem_free(&bdesc->acc_handle);
2180 		ddi_dma_free_handle(&bdesc->dma_handle);
2181 		kmem_free(bdesc, sizeof (*bdesc));
2182 		xnfp->xnf_rx_buffer_count--;
2183 	} else {
2184 		bdesc->next = xnfp->xnf_free_list;
2185 		xnfp->xnf_free_list = bdesc;
2186 		xnfp->xnf_rx_descs_free++;
2187 	}
2188 out:
2189 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
2190 }
2191 
2192 /*
2193  * Allocate a DMA-able xmit buffer, including a structure to
2194  * keep track of the buffer.  Called with tx_buf_mutex held.
2195  */
2196 static struct xnf_buffer_desc *
2197 xnf_alloc_tx_buffer(xnf_t *xnfp)
2198 {
2199 	struct xnf_buffer_desc *bdesc;
2200 	size_t len;
2201 
2202 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
2203 		return (NULL);
2204 
2205 	/* allocate a DMA access handle for receive buffer */
2206 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buffer_dma_attr,
2207 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2208 		goto failure;
2209 
2210 	/* Allocate DMA-able memory for transmit buffer */
2211 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2212 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
2213 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2214 		goto failure_1;
2215 
2216 	bdesc->xnfp = xnfp;
2217 	xnfp->xnf_tx_buffer_count++;
2218 
2219 	return (bdesc);
2220 
2221 failure_1:
2222 	ddi_dma_free_handle(&bdesc->dma_handle);
2223 
2224 failure:
2225 	kmem_free(bdesc, sizeof (*bdesc));
2226 	return (NULL);
2227 }
2228 
2229 /*
2230  * Allocate a DMA-able receive buffer, including a structure to
2231  * keep track of the buffer.  Called with rx_buf_mutex held.
2232  */
2233 static struct xnf_buffer_desc *
2234 xnf_alloc_buffer(xnf_t *xnfp)
2235 {
2236 	struct			xnf_buffer_desc *bdesc;
2237 	size_t			len;
2238 	uint_t			ncookies;
2239 	ddi_dma_cookie_t	dma_cookie;
2240 	long			cnt;
2241 	pfn_t			pfn;
2242 
2243 	if (xnfp->xnf_rx_buffer_count >= xnfp->xnf_max_rx_bufs)
2244 		return (NULL);
2245 
2246 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
2247 		return (NULL);
2248 
2249 	/* allocate a DMA access handle for receive buffer */
2250 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buffer_dma_attr,
2251 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2252 		goto failure;
2253 
2254 	/* Allocate DMA-able memory for receive buffer */
2255 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2256 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
2257 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2258 		goto failure_1;
2259 
2260 	/* bind to virtual address of buffer to get physical address */
2261 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2262 	    bdesc->buf, PAGESIZE, DDI_DMA_READ | DDI_DMA_STREAMING,
2263 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2264 		goto failure_2;
2265 
2266 	bdesc->buf_phys = dma_cookie.dmac_laddress;
2267 	bdesc->xnfp = xnfp;
2268 	if (xnfp->xnf_rx_hvcopy) {
2269 		bdesc->free_rtn.free_func = xnf_copy_rcv_complete;
2270 	} else {
2271 		bdesc->free_rtn.free_func = xnf_rcv_complete;
2272 	}
2273 	bdesc->free_rtn.free_arg = (char *)bdesc;
2274 	bdesc->grant_ref = GRANT_INVALID_REF;
2275 	ASSERT(ncookies == 1);
2276 
2277 	xnfp->xnf_rx_buffer_count++;
2278 
2279 	if (!xnfp->xnf_rx_hvcopy) {
2280 		/*
2281 		 * Unmap the page, and hand the machine page back
2282 		 * to xen so it can be used as a backend net buffer.
2283 		 */
2284 		pfn = xnf_btop(bdesc->buf_phys);
2285 		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
2286 		if (cnt != 1) {
2287 			cmn_err(CE_WARN, "unable to give a page back to the "
2288 			    "hypervisor\n");
2289 		}
2290 	}
2291 
2292 	return (bdesc);
2293 
2294 failure_2:
2295 	ddi_dma_mem_free(&bdesc->acc_handle);
2296 
2297 failure_1:
2298 	ddi_dma_free_handle(&bdesc->dma_handle);
2299 
2300 failure:
2301 	kmem_free(bdesc, sizeof (*bdesc));
2302 	return (NULL);
2303 }
2304 
/*
 * Statistics.
 *
 * Names of the driver-private ("aux") named kstats.  The position of
 * each name matters: xnf_kstat_aux_update() stores the values in this
 * same order, so a name must never be inserted or moved here without
 * making the matching change there.
 */
static char *xnf_aux_statistics[] = {
	"tx_cksum_deferred",		/* 0 */
	"rx_cksum_no_need",		/* 1 */
	"interrupts",			/* 2 */
	"unclaimed_interrupts",		/* 3 */
	"tx_pullup",			/* 4 */
	"tx_pagebndry",			/* 5 */
	"tx_attempt",			/* 6 */
	"rx_no_ringbuf",		/* 7 */
	"hvcopy_packet_processed",	/* 8 */
};
2319 
2320 static int
2321 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2322 {
2323 	xnf_t *xnfp;
2324 	kstat_named_t *knp;
2325 
2326 	if (flag != KSTAT_READ)
2327 		return (EACCES);
2328 
2329 	xnfp = ksp->ks_private;
2330 	knp = ksp->ks_data;
2331 
2332 	/*
2333 	 * Assignment order must match that of the names in
2334 	 * xnf_aux_statistics.
2335 	 */
2336 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2337 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2338 
2339 	(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2340 	(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2341 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2342 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry;
2343 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt;
2344 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_no_ringbuf;
2345 
2346 	(knp++)->value.ui64 = xnfp->xnf_stat_hvcopy_packet_processed;
2347 
2348 	return (0);
2349 }
2350 
2351 static boolean_t
2352 xnf_kstat_init(xnf_t *xnfp)
2353 {
2354 	int nstat = sizeof (xnf_aux_statistics) /
2355 	    sizeof (xnf_aux_statistics[0]);
2356 	char **cp = xnf_aux_statistics;
2357 	kstat_named_t *knp;
2358 
2359 	/*
2360 	 * Create and initialise kstats.
2361 	 */
2362 	if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2363 	    ddi_get_instance(xnfp->xnf_devinfo),
2364 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
2365 	    nstat, 0)) == NULL)
2366 		return (B_FALSE);
2367 
2368 	xnfp->xnf_kstat_aux->ks_private = xnfp;
2369 	xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2370 
2371 	knp = xnfp->xnf_kstat_aux->ks_data;
2372 	while (nstat > 0) {
2373 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2374 
2375 		knp++;
2376 		cp++;
2377 		nstat--;
2378 	}
2379 
2380 	kstat_install(xnfp->xnf_kstat_aux);
2381 
2382 	return (B_TRUE);
2383 }
2384 
2385 static int
2386 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2387 {
2388 	xnf_t *xnfp = arg;
2389 
2390 	mutex_enter(&xnfp->xnf_intrlock);
2391 	mutex_enter(&xnfp->xnf_txlock);
2392 
2393 #define	mac_stat(q, r)				\
2394 	case (MAC_STAT_##q):			\
2395 		*val = xnfp->xnf_stat_##r;	\
2396 		break
2397 
2398 #define	ether_stat(q, r)			\
2399 	case (ETHER_STAT_##q):			\
2400 		*val = xnfp->xnf_stat_##r;	\
2401 		break
2402 
2403 	switch (stat) {
2404 
2405 	mac_stat(IPACKETS, ipackets);
2406 	mac_stat(OPACKETS, opackets);
2407 	mac_stat(RBYTES, rbytes);
2408 	mac_stat(OBYTES, obytes);
2409 	mac_stat(NORCVBUF, norxbuf);
2410 	mac_stat(IERRORS, errrx);
2411 	mac_stat(NOXMTBUF, tx_defer);
2412 
2413 	ether_stat(MACRCV_ERRORS, mac_rcv_error);
2414 	ether_stat(TOOSHORT_ERRORS, runt);
2415 
2416 	default:
2417 		mutex_exit(&xnfp->xnf_txlock);
2418 		mutex_exit(&xnfp->xnf_intrlock);
2419 
2420 		return (ENOTSUP);
2421 	}
2422 
2423 #undef mac_stat
2424 #undef ether_stat
2425 
2426 	mutex_exit(&xnfp->xnf_txlock);
2427 	mutex_exit(&xnfp->xnf_intrlock);
2428 
2429 	return (0);
2430 }
2431 
2432 /*ARGSUSED*/
2433 static void
2434 xnf_blank(void *arg, time_t ticks, uint_t count)
2435 {
2436 	/*
2437 	 * XXPV dme: blanking is not currently implemented.
2438 	 *
2439 	 * It's not obvious how to use the 'ticks' argument here.
2440 	 *
2441 	 * 'Count' might be used as an indicator of how to set
2442 	 * rsp_event when posting receive buffers to the rx_ring.  It
2443 	 * would replace the code at the tail of xnf_process_recv()
2444 	 * that simply indicates that the next completed packet should
2445 	 * cause an interrupt.
2446 	 */
2447 }
2448 
2449 static void
2450 xnf_resources(void *arg)
2451 {
2452 	xnf_t *xnfp = arg;
2453 	mac_rx_fifo_t mrf;
2454 
2455 	mrf.mrf_type = MAC_RX_FIFO;
2456 	mrf.mrf_blank = xnf_blank;
2457 	mrf.mrf_arg = (void *)xnfp;
2458 	mrf.mrf_normal_blank_time = 128;	/* XXPV dme: see xnf_blank() */
2459 	mrf.mrf_normal_pkt_count = 8;		/* XXPV dme: see xnf_blank() */
2460 
2461 	xnfp->xnf_rx_handle = mac_resource_add(xnfp->xnf_mh,
2462 	    (mac_resource_t *)&mrf);
2463 }
2464 
2465 /*ARGSUSED*/
2466 static void
2467 xnf_ioctl(void *arg, queue_t *q, mblk_t *mp)
2468 {
2469 	miocnak(q, mp, 0, EINVAL);
2470 }
2471 
2472 static boolean_t
2473 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
2474 {
2475 	xnf_t *xnfp = arg;
2476 
2477 	switch (cap) {
2478 	case MAC_CAPAB_HCKSUM: {
2479 		uint32_t *capab = cap_data;
2480 
2481 		/*
2482 		 * We declare ourselves capable of HCKSUM_INET_PARTIAL
2483 		 * in order that the protocol stack insert the
2484 		 * pseudo-header checksum in packets that it passes
2485 		 * down to us.
2486 		 *
2487 		 * Whilst the flag used to communicate with dom0 is
2488 		 * called "NETTXF_csum_blank", the checksum in the
2489 		 * packet must contain the pseudo-header checksum and
2490 		 * not zero. (In fact, a Solaris dom0 is happy to deal
2491 		 * with a checksum of zero, but a Linux dom0 is not.)
2492 		 */
2493 		if (xnfp->xnf_cksum_offload)
2494 			*capab = HCKSUM_INET_PARTIAL;
2495 		else
2496 			*capab = 0;
2497 		break;
2498 	}
2499 
2500 	case MAC_CAPAB_POLL:
2501 		/* Just return B_TRUE. */
2502 		break;
2503 
2504 	default:
2505 		return (B_FALSE);
2506 	}
2507 
2508 	return (B_TRUE);
2509 }
2510 
2511 /*ARGSUSED*/
2512 static void
2513 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
2514     void *arg, void *impl_data)
2515 {
2516 	xnf_t *xnfp = ddi_get_driver_private(dip);
2517 	XenbusState new_state = *(XenbusState *)impl_data;
2518 
2519 	ASSERT(xnfp != NULL);
2520 
2521 	switch (new_state) {
2522 	case XenbusStateConnected:
2523 		mutex_enter(&xnfp->xnf_intrlock);
2524 		mutex_enter(&xnfp->xnf_txlock);
2525 
2526 		xnfp->xnf_connected = B_TRUE;
2527 		/*
2528 		 * wake up threads wanting to send data to backend,
2529 		 * but got blocked due to backend is not ready
2530 		 */
2531 		cv_broadcast(&xnfp->xnf_cv);
2532 
2533 		mutex_exit(&xnfp->xnf_txlock);
2534 		mutex_exit(&xnfp->xnf_intrlock);
2535 
2536 		/*
2537 		 * kick backend in case it missed any tx request
2538 		 * in the TX ring buffer
2539 		 */
2540 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
2541 
2542 		/*
2543 		 * there maybe already queued rx data in the RX ring
2544 		 * sent by backend after it gets connected but before
2545 		 * we see its state change here, so we call our intr
2546 		 * handling routine to handle them, if any
2547 		 */
2548 		(void) xnf_intr((caddr_t)xnfp);
2549 
2550 		break;
2551 
2552 	default:
2553 		break;
2554 	}
2555 }
2556 
2557 /*
2558  * Check whether backend is capable of and willing to talk
2559  * to us via hypervisor copy, as opposed to page flip.
2560  */
2561 static boolean_t
2562 xnf_hvcopy_peer_status(dev_info_t *devinfo)
2563 {
2564 	int	be_rx_copy;
2565 	int	err;
2566 
2567 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(devinfo),
2568 	    "feature-rx-copy", "%d", &be_rx_copy);
2569 	/*
2570 	 * If we fail to read the store we assume that the key is
2571 	 * absent, implying an older domain at the far end.  Older
2572 	 * domains cannot do HV copy (we assume ..).
2573 	 */
2574 	if (err != 0)
2575 		be_rx_copy = 0;
2576 
2577 	return (be_rx_copy?B_TRUE:B_FALSE);
2578 }
2579