xref: /titanic_41/usr/src/uts/common/xen/io/xnf.c (revision 54c529d43d4363891fd7381edde56d2cafbff593)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  *
29  * Copyright (c) 2004 Christian Limpach.
30  * All rights reserved.
31  *
32  * Redistribution and use in source and binary forms, with or without
33  * modification, are permitted provided that the following conditions
34  * are met:
35  * 1. Redistributions of source code must retain the above copyright
36  *    notice, this list of conditions and the following disclaimer.
37  * 2. Redistributions in binary form must reproduce the above copyright
38  *    notice, this list of conditions and the following disclaimer in the
39  *    documentation and/or other materials provided with the distribution.
40  * 3. This section intentionally left blank.
41  * 4. The name of the author may not be used to endorse or promote products
42  *    derived from this software without specific prior written permission.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 /*
56  * Section 3 of the above license was updated in response to bug 6379571.
57  */
58 
59 /*
60  * xnf.c - Nemo-based network driver for domU
61  */
62 
63 #include <sys/types.h>
64 #include <sys/errno.h>
65 #include <sys/param.h>
66 #include <sys/sysmacros.h>
67 #include <sys/systm.h>
68 #include <sys/stream.h>
69 #include <sys/strsubr.h>
70 #include <sys/conf.h>
71 #include <sys/ddi.h>
72 #include <sys/devops.h>
73 #include <sys/sunddi.h>
74 #include <sys/sunndi.h>
75 #include <sys/dlpi.h>
76 #include <sys/ethernet.h>
77 #include <sys/strsun.h>
78 #include <sys/pattr.h>
79 #include <inet/ip.h>
80 #include <inet/ip_impl.h>
81 #include <sys/gld.h>
82 #include <sys/modctl.h>
83 #include <sys/mac.h>
84 #include <sys/mac_ether.h>
85 #include <sys/bootinfo.h>
86 #include <sys/mach_mmu.h>
87 #ifdef	XPV_HVM_DRIVER
88 #include <sys/xpv_support.h>
89 #include <sys/hypervisor.h>
90 #else
91 #include <sys/hypervisor.h>
92 #include <sys/evtchn_impl.h>
93 #include <sys/balloon_impl.h>
94 #endif
95 #include <xen/public/io/netif.h>
96 #include <sys/gnttab.h>
97 #include <xen/sys/xendev.h>
98 #include <sys/sdt.h>
99 
100 #include <io/xnf.h>
101 
102 
103 /*
104  *  Declarations and Module Linkage
105  */
106 
/*
 * Debug support.  XNF_DEBUG is only defined for DEBUG and lint builds.
 * xnfdebug is a bit mask (this file tests it against XNF_DEBUG_DDI and
 * XNF_DEBUG_TRACE) that selects which diagnostic printf()s are emitted;
 * it defaults to 0, so nothing is printed unless it is changed.
 */
#if defined(DEBUG) || defined(__lint)
#define	XNF_DEBUG
int	xnfdebug = 0;
#endif
111 
/*
 * On a 32 bit PAE system physical and machine addresses are larger
 * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 * argument, and so addresses above 4G are truncated before ddi_btop()
 * gets to see them.  To avoid this, code the shift operation here.
 *
 * The shift is performed in the type of the argument, so callers must
 * pass a value that is already wide enough to hold the full address.
 */
#define	xnf_btop(addr)	((addr) >> PAGESHIFT)
119 
/*
 * Global default for checksum offload.  xnf_attach() copies it into the
 * per-instance xnf_cksum_offload field; xnf_read_config() may then turn
 * the per-instance value off if the backend publishes
 * "feature-no-csum-offload".
 */
boolean_t	xnf_cksum_offload = B_TRUE;

/*
 * Default value for hypervisor-based copy operations.  xnf_attach()
 * combines it with xnf_hvcopy_peer_status() to set the per-instance
 * xnf_rx_hvcopy field.
 */
boolean_t	xnf_rx_hvcopy = B_TRUE;

/*
 * Should pages used for transmit be readonly for the peer?
 * Copied into the per-instance state by xnf_attach().
 */
boolean_t	xnf_tx_pages_readonly = B_FALSE;

/*
 * Packets under this size are bcopied instead of using desballoc.
 * Choose a value > XNF_FRAMESIZE (1514) to force the receive path to
 * always copy.
 */
unsigned int	xnf_rx_bcopy_thresh = 64;

/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * maximum number of fragments used for one transmitted packet -- confirm
 * against the transmit path.
 */
unsigned int	xnf_max_tx_frags = 1;
137 
138 /* Required system entry points */
139 static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
140 static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);
141 
142 /* Required driver entry points for Nemo */
143 static int	xnf_start(void *);
144 static void	xnf_stop(void *);
145 static int	xnf_set_mac_addr(void *, const uint8_t *);
146 static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
147 static int	xnf_set_promiscuous(void *, boolean_t);
148 static mblk_t	*xnf_send(void *, mblk_t *);
149 static uint_t	xnf_intr(caddr_t);
150 static int	xnf_stat(void *, uint_t, uint64_t *);
151 static void	xnf_blank(void *, time_t, uint_t);
152 static void	xnf_resources(void *);
153 static void	xnf_ioctl(void *, queue_t *, mblk_t *);
154 static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
155 
156 /* Driver private functions */
157 static int xnf_alloc_dma_resources(xnf_t *);
158 static void xnf_release_dma_resources(xnf_t *);
159 static mblk_t *xnf_process_recv(xnf_t *);
160 static void xnf_rcv_complete(struct xnf_buffer_desc *);
161 static void xnf_release_mblks(xnf_t *);
162 static struct xnf_buffer_desc *xnf_alloc_tx_buffer(xnf_t *);
163 static struct xnf_buffer_desc *xnf_alloc_buffer(xnf_t *);
164 static struct xnf_buffer_desc *xnf_get_tx_buffer(xnf_t *);
165 static struct xnf_buffer_desc *xnf_get_buffer(xnf_t *);
166 static void xnf_free_buffer(struct xnf_buffer_desc *);
167 static void xnf_free_tx_buffer(struct xnf_buffer_desc *);
168 void xnf_send_driver_status(int, int);
169 static void rx_buffer_hang(xnf_t *, struct xnf_buffer_desc *);
170 static int xnf_clean_tx_ring(xnf_t  *);
171 static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
172     void *, void *);
173 static mblk_t *xnf_process_hvcopy_recv(xnf_t *xnfp);
174 static boolean_t xnf_hvcopy_peer_status(dev_info_t *devinfo);
175 static boolean_t xnf_kstat_init(xnf_t *xnfp);
176 
/*
 * Callback vector handed to the MAC layer through macp->m_callbacks in
 * xnf_attach().  The first member is a flag word.
 *
 * NOTE(review): presumably MC_RESOURCES, MC_IOCTL and MC_GETCAPAB
 * announce that the last three entries (xnf_resources, xnf_ioctl,
 * xnf_getcapab) are supplied -- confirm against mac_callbacks_t.
 *
 * XXPV dme: remove MC_IOCTL?
 */
static mac_callbacks_t xnf_callbacks = {
	MC_RESOURCES | MC_IOCTL | MC_GETCAPAB,
	xnf_stat,
	xnf_start,
	xnf_stop,
	xnf_set_promiscuous,
	xnf_set_multicast,
	xnf_set_mac_addr,
	xnf_send,
	xnf_resources,
	xnf_ioctl,
	xnf_getcapab
};
193 
/*
 * Grant reference value meaning "no grant".  Ring and per-packet grant
 * references in this file are compared against it and reset to it once
 * the grant has been released.
 */
#define	GRANT_INVALID_REF	0

/*
 * Low and high water marks for receive buffers, expressed as multiples
 * of the receive ring size.
 * NOTE(review): the consumers are outside this part of the file; confirm
 * exactly which buffer count is compared against these values.
 */
const int xnf_rx_bufs_lowat = 4 * NET_RX_RING_SIZE;
const int xnf_rx_bufs_hiwat = 8 * NET_RX_RING_SIZE; /* default max */
197 
/*
 * DMA attribute templates.
 *
 * The three templates below carry identical values: no restriction on
 * the address range (0 .. 2^64-1), MMU_PAGESIZE alignment and a
 * scatter/gather list limited to a single segment.  They are separate
 * objects, one per kind of memory (ring pages, transmit data, receive
 * buffers).
 */

/* DMA attributes for network ring buffer */
static ddi_dma_attr_t ringbuf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};

/* DMA attributes for transmit data */
static ddi_dma_attr_t tx_buffer_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};

/* DMA attributes for a receive buffer */
static ddi_dma_attr_t rx_buffer_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
245 
/*
 * DMA access attributes for registers and descriptors: little-endian
 * structure access with strict ordering.
 */
static ddi_device_acc_attr_t accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
	DDI_STRICTORDER_ACC
};

/*
 * DMA access attributes for data: NOT to be byte swapped.  Ordering is
 * strict, as for the descriptor attributes above.
 */
static ddi_device_acc_attr_t data_accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};
259 
/* Six bytes of 0xff, i.e. the Ethernet broadcast address. */
unsigned char xnf_broadcastaddr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
int xnf_diagnose = 0; /* Patchable global for diagnostic purposes */

/*
 * Device operations.  Only attach and detach are real entry points
 * (xnf_attach/xnf_detach); the other slots are filled with
 * nulldev/nodev/NULL and quiesce is declared as not supported.
 */
DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
    nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);

/* Loadable driver description, referencing the dev_ops above. */
static struct modldrv xnf_modldrv = {
	&mod_driverops,
	"Virtual Ethernet driver",
	&xnf_dev_ops
};

/* Module linkage passed to mod_install() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, &xnf_modldrv, NULL
};
275 
276 int
277 _init(void)
278 {
279 	int r;
280 
281 	mac_init_ops(&xnf_dev_ops, "xnf");
282 	r = mod_install(&modlinkage);
283 	if (r != DDI_SUCCESS)
284 		mac_fini_ops(&xnf_dev_ops);
285 
286 	return (r);
287 }
288 
/*
 * Module unload entry point.  Unloading is not supported at present, so
 * every request is refused with EBUSY.
 */
int
_fini(void)
{
	/* XXPV dme: should be removable */
	const int rc = EBUSY;

	return (rc);
}
294 
295 int
296 _info(struct modinfo *modinfop)
297 {
298 	return (mod_info(&modlinkage, modinfop));
299 }
300 
301 static int
302 xnf_setup_rings(xnf_t *xnfp)
303 {
304 	int			ix, err;
305 	RING_IDX		i;
306 	struct xnf_buffer_desc	*bdesc, *rbp;
307 	struct xenbus_device	*xsd;
308 	domid_t			oeid;
309 
310 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
311 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
312 
313 	if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF)
314 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
315 
316 	err = gnttab_grant_foreign_access(oeid,
317 	    xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
318 	if (err <= 0) {
319 		err = -err;
320 		xenbus_dev_error(xsd, err, "granting access to tx ring page");
321 		goto out;
322 	}
323 	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
324 
325 	if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF)
326 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
327 
328 	err = gnttab_grant_foreign_access(oeid,
329 	    xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
330 	if (err <= 0) {
331 		err = -err;
332 		xenbus_dev_error(xsd, err, "granting access to rx ring page");
333 		goto out;
334 	}
335 	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
336 
337 
338 	mutex_enter(&xnfp->xnf_intrlock);
339 
340 	/*
341 	 * Cleanup the TX ring.  We just clean up any valid tx_pktinfo structs
342 	 * and reset the ring.  Note that this can lose packets after a resume,
343 	 * but we expect to stagger on.
344 	 */
345 	mutex_enter(&xnfp->xnf_txlock);
346 
347 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
348 		struct tx_pktinfo *txp = &xnfp->xnf_tx_pkt_info[i];
349 
350 		txp->id = i + 1;
351 
352 		if (txp->grant_ref == GRANT_INVALID_REF) {
353 			ASSERT(txp->mp == NULL);
354 			ASSERT(txp->bdesc == NULL);
355 			continue;
356 		}
357 
358 		if (gnttab_query_foreign_access(txp->grant_ref) != 0)
359 			panic("tx grant still in use by backend domain");
360 
361 		freemsg(txp->mp);
362 		txp->mp = NULL;
363 
364 		(void) ddi_dma_unbind_handle(txp->dma_handle);
365 
366 		if (txp->bdesc != NULL) {
367 			xnf_free_tx_buffer(txp->bdesc);
368 			txp->bdesc = NULL;
369 		}
370 
371 		(void) gnttab_end_foreign_access_ref(txp->grant_ref,
372 		    xnfp->xnf_tx_pages_readonly);
373 		gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head,
374 		    txp->grant_ref);
375 		txp->grant_ref = GRANT_INVALID_REF;
376 	}
377 
378 	xnfp->xnf_tx_pkt_id_list = 0;
379 	xnfp->xnf_tx_ring.rsp_cons = 0;
380 	xnfp->xnf_tx_ring.req_prod_pvt = 0;
381 
382 	/* LINTED: constant in conditional context */
383 	SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
384 
385 	mutex_exit(&xnfp->xnf_txlock);
386 
387 	/*
388 	 * Rebuild the RX ring.  We have to rebuild the RX ring because some of
389 	 * our pages are currently flipped out/granted so we can't just free
390 	 * the RX buffers.  Reclaim any unprocessed recv buffers, they won't be
391 	 * useable anyway since the mfn's they refer to are no longer valid.
392 	 * Grant the backend domain access to each hung rx buffer.
393 	 */
394 	i = xnfp->xnf_rx_ring.rsp_cons;
395 	while (i++ != xnfp->xnf_rx_ring.sring->req_prod) {
396 		volatile netif_rx_request_t	*rxrp;
397 
398 		rxrp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, i);
399 		ix = rxrp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0);
400 		rbp = xnfp->xnf_rxpkt_bufptr[ix];
401 		if (rbp != NULL) {
402 			grant_ref_t	ref = rbp->grant_ref;
403 
404 			ASSERT(ref != GRANT_INVALID_REF);
405 			if (xnfp->xnf_rx_hvcopy) {
406 				pfn_t pfn = xnf_btop(rbp->buf_phys);
407 				mfn_t mfn = pfn_to_mfn(pfn);
408 
409 				gnttab_grant_foreign_access_ref(ref, oeid,
410 				    mfn, 0);
411 			} else {
412 				gnttab_grant_foreign_transfer_ref(ref,
413 				    oeid, 0);
414 			}
415 			rxrp->id = ix;
416 			rxrp->gref = ref;
417 		}
418 	}
419 
420 	/*
421 	 * Reset the ring pointers to initial state.
422 	 * Hang buffers for any empty ring slots.
423 	 */
424 	xnfp->xnf_rx_ring.rsp_cons = 0;
425 	xnfp->xnf_rx_ring.req_prod_pvt = 0;
426 
427 	/* LINTED: constant in conditional context */
428 	SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
429 
430 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
431 		xnfp->xnf_rx_ring.req_prod_pvt = i;
432 		if (xnfp->xnf_rxpkt_bufptr[i] != NULL)
433 			continue;
434 		if ((bdesc = xnf_get_buffer(xnfp)) == NULL)
435 			break;
436 		rx_buffer_hang(xnfp, bdesc);
437 	}
438 	xnfp->xnf_rx_ring.req_prod_pvt = i;
439 	/* LINTED: constant in conditional context */
440 	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
441 
442 	mutex_exit(&xnfp->xnf_intrlock);
443 
444 	return (0);
445 
446 out:
447 	if (xnfp->xnf_tx_ring_ref != GRANT_INVALID_REF)
448 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
449 	xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF;
450 
451 	if (xnfp->xnf_rx_ring_ref != GRANT_INVALID_REF)
452 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
453 	xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF;
454 
455 	return (err);
456 }
457 
458 
459 /* Called when the upper layers free a message we passed upstream */
460 static void
461 xnf_copy_rcv_complete(struct xnf_buffer_desc *bdesc)
462 {
463 	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
464 	ddi_dma_mem_free(&bdesc->acc_handle);
465 	ddi_dma_free_handle(&bdesc->dma_handle);
466 	kmem_free(bdesc, sizeof (*bdesc));
467 }
468 
469 
470 /*
471  * Connect driver to back end, called to set up communication with
472  * back end driver both initially and on resume after restore/migrate.
473  */
474 void
475 xnf_be_connect(xnf_t *xnfp)
476 {
477 	const char	*message;
478 	xenbus_transaction_t xbt;
479 	struct		xenbus_device *xsd;
480 	char		*xsname;
481 	int		err;
482 
483 	ASSERT(!xnfp->xnf_connected);
484 
485 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
486 	xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
487 
488 	err = xnf_setup_rings(xnfp);
489 	if (err != 0) {
490 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
491 		xenbus_dev_error(xsd, err, "setting up ring");
492 		return;
493 	}
494 
495 again:
496 	err = xenbus_transaction_start(&xbt);
497 	if (err != 0) {
498 		xenbus_dev_error(xsd, EIO, "starting transaction");
499 		return;
500 	}
501 
502 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
503 	    xnfp->xnf_tx_ring_ref);
504 	if (err != 0) {
505 		message = "writing tx ring-ref";
506 		goto abort_transaction;
507 	}
508 
509 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
510 	    xnfp->xnf_rx_ring_ref);
511 	if (err != 0) {
512 		message = "writing rx ring-ref";
513 		goto abort_transaction;
514 	}
515 
516 	err = xenbus_printf(xbt, xsname, "event-channel", "%u",
517 	    xnfp->xnf_evtchn);
518 	if (err != 0) {
519 		message = "writing event-channel";
520 		goto abort_transaction;
521 	}
522 
523 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
524 	if (err != 0) {
525 		message = "writing feature-rx-notify";
526 		goto abort_transaction;
527 	}
528 
529 	if (!xnfp->xnf_tx_pages_readonly) {
530 		err = xenbus_printf(xbt, xsname, "feature-tx-writable",
531 		    "%d", 1);
532 		if (err != 0) {
533 			message = "writing feature-tx-writable";
534 			goto abort_transaction;
535 		}
536 	}
537 
538 	err = xenbus_printf(xbt, xsname, "feature-no-csum-offload", "%d",
539 	    xnfp->xnf_cksum_offload ? 0 : 1);
540 	if (err != 0) {
541 		message = "writing feature-no-csum-offload";
542 		goto abort_transaction;
543 	}
544 	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d",
545 	    xnfp->xnf_rx_hvcopy ? 1 : 0);
546 	if (err != 0) {
547 		message = "writing request-rx-copy";
548 		goto abort_transaction;
549 	}
550 
551 	err = xenbus_printf(xbt, xsname, "state", "%d", XenbusStateConnected);
552 	if (err != 0) {
553 		message = "writing frontend XenbusStateConnected";
554 		goto abort_transaction;
555 	}
556 
557 	err = xenbus_transaction_end(xbt, 0);
558 	if (err != 0) {
559 		if (err == EAGAIN)
560 			goto again;
561 		xenbus_dev_error(xsd, err, "completing transaction");
562 	}
563 
564 	return;
565 
566 abort_transaction:
567 	(void) xenbus_transaction_end(xbt, 1);
568 	xenbus_dev_error(xsd, err, "%s", message);
569 }
570 
571 /*
572  * Read config info from xenstore
573  */
574 void
575 xnf_read_config(xnf_t *xnfp)
576 {
577 	char		mac[ETHERADDRL * 3];
578 	int		err, be_no_cksum_offload;
579 
580 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo), "mac",
581 	    "%s", (char *)&mac[0]);
582 	if (err != 0) {
583 		/*
584 		 * bad: we're supposed to be set up with a proper mac
585 		 * addr. at this point
586 		 */
587 		cmn_err(CE_WARN, "%s%d: no mac address",
588 		    ddi_driver_name(xnfp->xnf_devinfo),
589 		    ddi_get_instance(xnfp->xnf_devinfo));
590 			return;
591 	}
592 	if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
593 		err = ENOENT;
594 		xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
595 		    "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
596 		return;
597 	}
598 
599 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(xnfp->xnf_devinfo),
600 	    "feature-no-csum-offload", "%d", &be_no_cksum_offload);
601 	/*
602 	 * If we fail to read the store we assume that the key is
603 	 * absent, implying an older domain at the far end.  Older
604 	 * domains always support checksum offload.
605 	 */
606 	if (err != 0)
607 		be_no_cksum_offload = 0;
608 	/*
609 	 * If the far end cannot do checksum offload or we do not wish
610 	 * to do it, disable it.
611 	 */
612 	if ((be_no_cksum_offload == 1) || !xnfp->xnf_cksum_offload)
613 		xnfp->xnf_cksum_offload = B_FALSE;
614 }
615 
616 /*
617  *  attach(9E) -- Attach a device to the system
618  *
619  *  Called once for each board successfully probed.
620  */
621 static int
622 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
623 {
624 	mac_register_t *macp;
625 	xnf_t *xnfp;
626 	int err;
627 
628 #ifdef XNF_DEBUG
629 	if (xnfdebug & XNF_DEBUG_DDI)
630 		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
631 		    (void *)devinfo);
632 #endif
633 
634 	switch (cmd) {
635 	case DDI_RESUME:
636 		xnfp = ddi_get_driver_private(devinfo);
637 
638 		(void) xvdi_resume(devinfo);
639 		(void) xvdi_alloc_evtchn(devinfo);
640 		xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
641 #ifdef XPV_HVM_DRIVER
642 		ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
643 		    xnfp);
644 #else
645 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
646 		    (caddr_t)xnfp);
647 #endif
648 		xnf_be_connect(xnfp);
649 		/*
650 		 * Our MAC address may have changed if we're resuming:
651 		 * - on a different host
652 		 * - on the same one and got a different MAC address
653 		 *   because we didn't specify one of our own.
654 		 * so it's useful to claim that it changed in order that
655 		 * IP send out a gratuitous ARP.
656 		 */
657 		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
658 		return (DDI_SUCCESS);
659 
660 	case DDI_ATTACH:
661 		break;
662 
663 	default:
664 		return (DDI_FAILURE);
665 	}
666 
667 	/*
668 	 *  Allocate gld_mac_info_t and xnf_instance structures
669 	 */
670 	macp = mac_alloc(MAC_VERSION);
671 	if (macp == NULL)
672 		return (DDI_FAILURE);
673 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
674 
675 	macp->m_dip = devinfo;
676 	macp->m_driver = xnfp;
677 	xnfp->xnf_devinfo = devinfo;
678 
679 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
680 	macp->m_src_addr = xnfp->xnf_mac_addr;
681 	macp->m_callbacks = &xnf_callbacks;
682 	macp->m_min_sdu = 0;
683 	macp->m_max_sdu = XNF_MAXPKT;
684 
685 	xnfp->xnf_running = B_FALSE;
686 	xnfp->xnf_connected = B_FALSE;
687 	xnfp->xnf_cksum_offload = xnf_cksum_offload;
688 	xnfp->xnf_tx_pages_readonly = xnf_tx_pages_readonly;
689 	xnfp->xnf_need_sched = B_FALSE;
690 
691 	xnfp->xnf_rx_hvcopy = xnf_hvcopy_peer_status(devinfo) && xnf_rx_hvcopy;
692 #ifdef XPV_HVM_DRIVER
693 	/*
694 	 * Report our version to dom0.
695 	 */
696 	if (xenbus_printf(XBT_NULL, "hvmpv/xnf", "version", "%d",
697 	    HVMPV_XNF_VERS))
698 		cmn_err(CE_WARN, "xnf: couldn't write version\n");
699 
700 	if (!xnfp->xnf_rx_hvcopy) {
701 		cmn_err(CE_WARN, "The xnf driver requires a dom0 that "
702 		    "supports 'feature-rx-copy'");
703 		goto failure;
704 	}
705 #endif
706 
707 	/*
708 	 * Get the iblock cookie with which to initialize the mutexes.
709 	 */
710 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
711 	    != DDI_SUCCESS)
712 		goto failure;
713 	/*
714 	 * Driver locking strategy: the txlock protects all paths
715 	 * through the driver, except the interrupt thread.
716 	 * If the interrupt thread needs to do something which could
717 	 * affect the operation of any other part of the driver,
718 	 * it needs to acquire the txlock mutex.
719 	 */
720 	mutex_init(&xnfp->xnf_tx_buf_mutex,
721 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
722 	mutex_init(&xnfp->xnf_rx_buf_mutex,
723 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
724 	mutex_init(&xnfp->xnf_txlock,
725 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
726 	mutex_init(&xnfp->xnf_intrlock,
727 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
728 	cv_init(&xnfp->xnf_cv, NULL, CV_DEFAULT, NULL);
729 
730 	xnfp->xnf_gref_tx_head = (grant_ref_t)-1;
731 	xnfp->xnf_gref_rx_head = (grant_ref_t)-1;
732 	if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
733 	    &xnfp->xnf_gref_tx_head) < 0) {
734 		cmn_err(CE_WARN, "xnf%d: can't alloc tx grant refs",
735 		    ddi_get_instance(xnfp->xnf_devinfo));
736 		goto failure_1;
737 	}
738 	if (gnttab_alloc_grant_references(NET_RX_RING_SIZE,
739 	    &xnfp->xnf_gref_rx_head) < 0) {
740 		cmn_err(CE_WARN, "xnf%d: can't alloc rx grant refs",
741 		    ddi_get_instance(xnfp->xnf_devinfo));
742 		goto failure_1;
743 	}
744 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
745 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
746 		    "driver data structures",
747 		    ddi_get_instance(xnfp->xnf_devinfo));
748 		goto failure_1;
749 	}
750 
751 	xnfp->xnf_rx_ring.sring->rsp_event =
752 	    xnfp->xnf_tx_ring.sring->rsp_event = 1;
753 
754 	xnfp->xnf_tx_ring_ref = GRANT_INVALID_REF;
755 	xnfp->xnf_rx_ring_ref = GRANT_INVALID_REF;
756 
757 	/* set driver private pointer now */
758 	ddi_set_driver_private(devinfo, xnfp);
759 
760 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
761 	    != DDI_SUCCESS)
762 		goto failure_1;
763 
764 	if (!xnf_kstat_init(xnfp))
765 		goto failure_2;
766 
767 	/*
768 	 * Allocate an event channel, add the interrupt handler and
769 	 * bind it to the event channel.
770 	 */
771 	(void) xvdi_alloc_evtchn(devinfo);
772 	xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
773 #ifdef XPV_HVM_DRIVER
774 	ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
775 #else
776 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
777 #endif
778 
779 	xnf_read_config(xnfp);
780 	err = mac_register(macp, &xnfp->xnf_mh);
781 	mac_free(macp);
782 	macp = NULL;
783 	if (err != 0)
784 		goto failure_3;
785 
786 #ifdef XPV_HVM_DRIVER
787 	/*
788 	 * In the HVM case, this driver essentially replaces a driver for
789 	 * a 'real' PCI NIC. Without the "model" property set to
790 	 * "Ethernet controller", like the PCI code does, netbooting does
791 	 * not work correctly, as strplumb_get_netdev_path() will not find
792 	 * this interface.
793 	 */
794 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
795 	    "Ethernet controller");
796 #endif
797 
798 	/*
799 	 * connect to the backend
800 	 */
801 	xnf_be_connect(xnfp);
802 
803 	return (DDI_SUCCESS);
804 
805 failure_3:
806 	kstat_delete(xnfp->xnf_kstat_aux);
807 #ifdef XPV_HVM_DRIVER
808 	ec_unbind_evtchn(xnfp->xnf_evtchn);
809 	xvdi_free_evtchn(devinfo);
810 #else
811 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
812 #endif
813 	xnfp->xnf_evtchn = INVALID_EVTCHN;
814 
815 failure_2:
816 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
817 
818 failure_1:
819 	if (xnfp->xnf_gref_tx_head != (grant_ref_t)-1)
820 		gnttab_free_grant_references(xnfp->xnf_gref_tx_head);
821 	if (xnfp->xnf_gref_rx_head != (grant_ref_t)-1)
822 		gnttab_free_grant_references(xnfp->xnf_gref_rx_head);
823 	xnf_release_dma_resources(xnfp);
824 	cv_destroy(&xnfp->xnf_cv);
825 	mutex_destroy(&xnfp->xnf_rx_buf_mutex);
826 	mutex_destroy(&xnfp->xnf_txlock);
827 	mutex_destroy(&xnfp->xnf_intrlock);
828 
829 failure:
830 	kmem_free(xnfp, sizeof (*xnfp));
831 	if (macp != NULL)
832 		mac_free(macp);
833 
834 	return (DDI_FAILURE);
835 }
836 
837 /*  detach(9E) -- Detach a device from the system */
838 static int
839 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
840 {
841 	xnf_t *xnfp;		/* Our private device info */
842 	int i;
843 
844 #ifdef XNF_DEBUG
845 	if (xnfdebug & XNF_DEBUG_DDI)
846 		printf("xnf_detach(0x%p)\n", (void *)devinfo);
847 #endif
848 
849 	xnfp = ddi_get_driver_private(devinfo);
850 
851 	switch (cmd) {
852 	case DDI_SUSPEND:
853 #ifdef XPV_HVM_DRIVER
854 		ec_unbind_evtchn(xnfp->xnf_evtchn);
855 		xvdi_free_evtchn(devinfo);
856 #else
857 		ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
858 #endif
859 
860 		xvdi_suspend(devinfo);
861 
862 		mutex_enter(&xnfp->xnf_intrlock);
863 		mutex_enter(&xnfp->xnf_txlock);
864 
865 		xnfp->xnf_evtchn = INVALID_EVTCHN;
866 		xnfp->xnf_connected = B_FALSE;
867 		mutex_exit(&xnfp->xnf_txlock);
868 		mutex_exit(&xnfp->xnf_intrlock);
869 
870 		/* claim link to be down after disconnect */
871 		mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
872 		return (DDI_SUCCESS);
873 
874 	case DDI_DETACH:
875 		break;
876 
877 	default:
878 		return (DDI_FAILURE);
879 	}
880 
881 	if (xnfp->xnf_connected)
882 		return (DDI_FAILURE);
883 
884 	/* Wait for receive buffers to be returned; give up after 5 seconds */
885 	i = 50;
886 
887 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
888 	while (xnfp->xnf_rx_bufs_outstanding > 0) {
889 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
890 		delay(drv_usectohz(100000));
891 		if (--i == 0) {
892 			cmn_err(CE_WARN,
893 			    "xnf%d: never reclaimed all the "
894 			    "receive buffers.  Still have %d "
895 			    "buffers outstanding.",
896 			    ddi_get_instance(xnfp->xnf_devinfo),
897 			    xnfp->xnf_rx_bufs_outstanding);
898 			return (DDI_FAILURE);
899 		}
900 		mutex_enter(&xnfp->xnf_rx_buf_mutex);
901 	}
902 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
903 
904 	if (mac_unregister(xnfp->xnf_mh) != 0)
905 		return (DDI_FAILURE);
906 
907 	kstat_delete(xnfp->xnf_kstat_aux);
908 
909 	/* Stop the receiver */
910 	xnf_stop(xnfp);
911 
912 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
913 
914 	/* Remove the interrupt */
915 #ifdef XPV_HVM_DRIVER
916 	ec_unbind_evtchn(xnfp->xnf_evtchn);
917 	xvdi_free_evtchn(devinfo);
918 #else
919 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
920 #endif
921 
922 	/* Release any pending xmit mblks */
923 	xnf_release_mblks(xnfp);
924 
925 	/* Release all DMA resources */
926 	xnf_release_dma_resources(xnfp);
927 
928 	cv_destroy(&xnfp->xnf_cv);
929 	mutex_destroy(&xnfp->xnf_rx_buf_mutex);
930 	mutex_destroy(&xnfp->xnf_txlock);
931 	mutex_destroy(&xnfp->xnf_intrlock);
932 
933 	kmem_free(xnfp, sizeof (*xnfp));
934 
935 	return (DDI_SUCCESS);
936 }
937 
/*
 *  xnf_set_mac_addr() -- set the physical network address on the board.
 *
 *  Changing the address is not supported.  In debug builds the request
 *  is traced when XNF_DEBUG_TRACE is set in xnfdebug; the function then
 *  always returns ENOTSUP.
 */
/*ARGSUSED*/
static int
xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
{
#ifdef XNF_DEBUG
	xnf_t *xnfp = arg;

	if (xnfdebug & XNF_DEBUG_TRACE) {
		printf("xnf%d: set_mac_addr(0x%p): "
		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
		    ddi_get_instance(xnfp->xnf_devinfo),
		    (void *)xnfp, macaddr[0], macaddr[1], macaddr[2],
		    macaddr[3], macaddr[4], macaddr[5]);
	}
#endif

	/*
	 * We can't set our macaddr.
	 *
	 * XXPV dme: Why not?
	 */
	return (ENOTSUP);
}
962 
963 /*
964  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
965  *
966  *  Program the hardware to enable/disable the multicast address
967  *  in "mcast".  Enable if "add" is true, disable if false.
968  */
969 /*ARGSUSED*/
970 static int
971 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
972 {
973 	xnf_t *xnfp = arg;
974 
975 #ifdef XNF_DEBUG
976 	if (xnfdebug & XNF_DEBUG_TRACE)
977 		printf("xnf%d set_multicast(0x%p): "
978 		    "%02x:%02x:%02x:%02x:%02x:%02x\n",
979 		    ddi_get_instance(xnfp->xnf_devinfo),
980 		    (void *)xnfp, mca[0], mca[1], mca[2],
981 		    mca[3], mca[4], mca[5]);
982 #endif
983 
984 	/*
985 	 * XXPV dme: Ideally we'd relay the address to the backend for
986 	 * enabling.  The protocol doesn't support that (interesting
987 	 * extension), so we simply succeed and hope that the relevant
988 	 * packets are going to arrive.
989 	 *
990 	 * If protocol support is added for enable/disable then we'll
991 	 * need to keep a list of those in use and re-add on resume.
992 	 */
993 	return (0);
994 }
995 
996 /*
997  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
998  *
999  *  Program the hardware to enable/disable promiscuous mode.
1000  */
1001 /*ARGSUSED*/
1002 static int
1003 xnf_set_promiscuous(void *arg, boolean_t on)
1004 {
1005 	xnf_t *xnfp = arg;
1006 
1007 #ifdef XNF_DEBUG
1008 	if (xnfdebug & XNF_DEBUG_TRACE)
1009 		printf("xnf%d set_promiscuous(0x%p, %x)\n",
1010 		    ddi_get_instance(xnfp->xnf_devinfo),
1011 		    (void *)xnfp, on);
1012 #endif
1013 	/*
1014 	 * We can't really do this, but we pretend that we can in
1015 	 * order that snoop will work.
1016 	 */
1017 	return (0);
1018 }
1019 
1020 /*
1021  * Clean buffers that we have responses for from the transmit ring.
1022  */
1023 static int
1024 xnf_clean_tx_ring(xnf_t *xnfp)
1025 {
1026 	RING_IDX		next_resp, i;
1027 	struct tx_pktinfo	*reap;
1028 	int			id;
1029 	grant_ref_t		ref;
1030 	boolean_t		work_to_do;
1031 
1032 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1033 
1034 loop:
1035 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
1036 		/*
1037 		 * index of next transmission ack
1038 		 */
1039 		next_resp = xnfp->xnf_tx_ring.sring->rsp_prod;
1040 		membar_consumer();
1041 		/*
1042 		 * Clean tx packets from ring that we have responses for
1043 		 */
1044 		for (i = xnfp->xnf_tx_ring.rsp_cons; i != next_resp; i++) {
1045 			id = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i)->id;
1046 			reap = &xnfp->xnf_tx_pkt_info[id];
1047 			ref = reap->grant_ref;
1048 			/*
1049 			 * Return id to free list
1050 			 */
1051 			reap->id = xnfp->xnf_tx_pkt_id_list;
1052 			xnfp->xnf_tx_pkt_id_list = id;
1053 			if (gnttab_query_foreign_access(ref) != 0)
1054 				panic("tx grant still in use "
1055 				    "by backend domain");
1056 			(void) ddi_dma_unbind_handle(reap->dma_handle);
1057 			(void) gnttab_end_foreign_access_ref(ref,
1058 			    xnfp->xnf_tx_pages_readonly);
1059 			gnttab_release_grant_reference(&xnfp->xnf_gref_tx_head,
1060 			    ref);
1061 			freemsg(reap->mp);
1062 			reap->mp = NULL;
1063 			reap->grant_ref = GRANT_INVALID_REF;
1064 			if (reap->bdesc != NULL)
1065 				xnf_free_tx_buffer(reap->bdesc);
1066 			reap->bdesc = NULL;
1067 		}
1068 		xnfp->xnf_tx_ring.rsp_cons = next_resp;
1069 		membar_enter();
1070 	}
1071 
1072 	/* LINTED: constant in conditional context */
1073 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
1074 	if (work_to_do)
1075 		goto loop;
1076 
1077 	return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
1078 }
1079 
1080 /*
1081  * If we need to pull up data from either a packet that crosses a page
1082  * boundary or consisting of multiple mblks, do it here.  We allocate
1083  * a page aligned buffer and copy the data into it.  The header for the
1084  * allocated buffer is returned. (which is also allocated here)
1085  */
1086 static struct xnf_buffer_desc *
1087 xnf_pullupmsg(xnf_t *xnfp, mblk_t *mp)
1088 {
1089 	struct xnf_buffer_desc	*bdesc;
1090 	mblk_t			*mptr;
1091 	caddr_t			bp;
1092 	int			len;
1093 
1094 	/*
1095 	 * get a xmit buffer from the xmit buffer pool
1096 	 */
1097 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
1098 	bdesc = xnf_get_tx_buffer(xnfp);
1099 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
1100 	if (bdesc == NULL)
1101 		return (bdesc);
1102 	/*
1103 	 * Copy the data into the buffer
1104 	 */
1105 	xnfp->xnf_stat_tx_pullup++;
1106 	bp = bdesc->buf;
1107 	for (mptr = mp; mptr != NULL; mptr = mptr->b_cont) {
1108 		len = mptr->b_wptr - mptr->b_rptr;
1109 		bcopy(mptr->b_rptr, bp, len);
1110 		bp += len;
1111 	}
1112 	return (bdesc);
1113 }
1114 
1115 void
1116 xnf_pseudo_cksum(caddr_t buf, int length)
1117 {
1118 	struct ether_header *ehp;
1119 	uint16_t sap, len, *stuff;
1120 	uint32_t cksum;
1121 	size_t offset;
1122 	ipha_t *ipha;
1123 	ipaddr_t src, dst;
1124 
1125 	ASSERT(length >= sizeof (*ehp));
1126 	ehp = (struct ether_header *)buf;
1127 
1128 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
1129 		struct ether_vlan_header *evhp;
1130 
1131 		ASSERT(length >= sizeof (*evhp));
1132 		evhp = (struct ether_vlan_header *)buf;
1133 		sap = ntohs(evhp->ether_type);
1134 		offset = sizeof (*evhp);
1135 	} else {
1136 		sap = ntohs(ehp->ether_type);
1137 		offset = sizeof (*ehp);
1138 	}
1139 
1140 	ASSERT(sap == ETHERTYPE_IP);
1141 
1142 	/* Packet should have been pulled up by the caller. */
1143 	if ((offset + sizeof (ipha_t)) > length) {
1144 		cmn_err(CE_WARN, "xnf_pseudo_cksum: no room for checksum");
1145 		return;
1146 	}
1147 
1148 	ipha = (ipha_t *)(buf + offset);
1149 
1150 	ASSERT(IPH_HDR_LENGTH(ipha) == IP_SIMPLE_HDR_LENGTH);
1151 
1152 	len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
1153 
1154 	switch (ipha->ipha_protocol) {
1155 	case IPPROTO_TCP:
1156 		stuff = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
1157 		cksum = IP_TCP_CSUM_COMP;
1158 		break;
1159 	case IPPROTO_UDP:
1160 		stuff = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
1161 		cksum = IP_UDP_CSUM_COMP;
1162 		break;
1163 	default:
1164 		cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
1165 		    ipha->ipha_protocol);
1166 		return;
1167 	}
1168 
1169 	src = ipha->ipha_src;
1170 	dst = ipha->ipha_dst;
1171 
1172 	cksum += (dst >> 16) + (dst & 0xFFFF);
1173 	cksum += (src >> 16) + (src & 0xFFFF);
1174 	cksum += htons(len);
1175 
1176 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1177 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1178 
1179 	ASSERT(cksum <= 0xFFFF);
1180 
1181 	*stuff = (uint16_t)(cksum ? cksum : ~cksum);
1182 }
1183 
1184 /*
1185  *  xnf_send_one() -- send a packet
1186  *
1187  *  Called when a packet is ready to be transmitted. A pointer to an
1188  *  M_DATA message that contains the packet is passed to this routine.
1189  *  At least the complete LLC header is contained in the message's
1190  *  first message block, and the remainder of the packet is contained
1191  *  within additional M_DATA message blocks linked to the first
1192  *  message block.
1193  *
1194  */
1195 static boolean_t
1196 xnf_send_one(xnf_t *xnfp, mblk_t *mp)
1197 {
1198 	struct xnf_buffer_desc	*xmitbuf;
1199 	struct tx_pktinfo	*txp_info;
1200 	mblk_t			*mptr;
1201 	ddi_dma_cookie_t	dma_cookie;
1202 	RING_IDX		slot;
1203 	int			length = 0, i, pktlen = 0, rc, tx_id;
1204 	int			tx_ring_freespace, page_oops;
1205 	uint_t			ncookies;
1206 	volatile netif_tx_request_t	*txrp;
1207 	caddr_t			bufaddr;
1208 	grant_ref_t		ref;
1209 	unsigned long		mfn;
1210 	uint32_t		pflags;
1211 	domid_t			oeid;
1212 
1213 #ifdef XNF_DEBUG
1214 	if (xnfdebug & XNF_DEBUG_SEND)
1215 		printf("xnf%d send(0x%p, 0x%p)\n",
1216 		    ddi_get_instance(xnfp->xnf_devinfo),
1217 		    (void *)xnfp, (void *)mp);
1218 #endif
1219 
1220 	ASSERT(mp != NULL);
1221 	ASSERT(mp->b_next == NULL);
1222 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1223 
1224 	tx_ring_freespace = xnf_clean_tx_ring(xnfp);
1225 	ASSERT(tx_ring_freespace >= 0);
1226 
1227 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1228 	xnfp->xnf_stat_tx_attempt++;
1229 	/*
1230 	 * If there are no xmit ring slots available, return.
1231 	 */
1232 	if (tx_ring_freespace == 0) {
1233 		xnfp->xnf_stat_tx_defer++;
1234 		return (B_FALSE);	/* Send should be retried */
1235 	}
1236 
1237 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1238 	/* Count the number of mblks in message and compute packet size */
1239 	for (i = 0, mptr = mp; mptr != NULL; mptr = mptr->b_cont, i++)
1240 		pktlen += (mptr->b_wptr - mptr->b_rptr);
1241 
1242 	/* Make sure packet isn't too large */
1243 	if (pktlen > XNF_FRAMESIZE) {
1244 		cmn_err(CE_WARN, "xnf%d: oversized packet (%d bytes) dropped",
1245 		    ddi_get_instance(xnfp->xnf_devinfo), pktlen);
1246 		freemsg(mp);
1247 		return (B_TRUE);
1248 	}
1249 
1250 	/*
1251 	 * Test if we cross a page boundary with our buffer
1252 	 */
1253 	page_oops = (i == 1) &&
1254 	    (xnf_btop((size_t)mp->b_rptr) !=
1255 	    xnf_btop((size_t)(mp->b_rptr + pktlen)));
1256 	/*
1257 	 * XXPV - unfortunately, the Xen virtual net device currently
1258 	 * doesn't support multiple packet frags, so this will always
1259 	 * end up doing the pullup if we got more than one packet.
1260 	 */
1261 	if (i > xnf_max_tx_frags || page_oops) {
1262 		if (page_oops)
1263 			xnfp->xnf_stat_tx_pagebndry++;
1264 		if ((xmitbuf = xnf_pullupmsg(xnfp, mp)) == NULL) {
1265 			/* could not allocate resources? */
1266 #ifdef XNF_DEBUG
1267 			cmn_err(CE_WARN, "xnf%d: pullupmsg failed",
1268 			    ddi_get_instance(xnfp->xnf_devinfo));
1269 #endif
1270 			xnfp->xnf_stat_tx_defer++;
1271 			return (B_FALSE);	/* Retry send */
1272 		}
1273 		bufaddr = xmitbuf->buf;
1274 	} else {
1275 		xmitbuf = NULL;
1276 		bufaddr = (caddr_t)mp->b_rptr;
1277 	}
1278 
1279 	/* set up data descriptor */
1280 	length = pktlen;
1281 
1282 	/*
1283 	 * Get packet id from free list
1284 	 */
1285 	tx_id = xnfp->xnf_tx_pkt_id_list;
1286 	ASSERT(tx_id < NET_TX_RING_SIZE);
1287 	txp_info = &xnfp->xnf_tx_pkt_info[tx_id];
1288 	xnfp->xnf_tx_pkt_id_list = txp_info->id;
1289 	txp_info->id = tx_id;
1290 
1291 	/* Prepare for DMA mapping of tx buffer(s) */
1292 	rc = ddi_dma_addr_bind_handle(txp_info->dma_handle,
1293 	    NULL, bufaddr, length, DDI_DMA_WRITE | DDI_DMA_STREAMING,
1294 	    DDI_DMA_DONTWAIT, 0, &dma_cookie, &ncookies);
1295 	if (rc != DDI_DMA_MAPPED) {
1296 		ASSERT(rc != DDI_DMA_INUSE);
1297 		ASSERT(rc != DDI_DMA_PARTIAL_MAP);
1298 		/*
1299 		 *  Return id to free list
1300 		 */
1301 		txp_info->id = xnfp->xnf_tx_pkt_id_list;
1302 		xnfp->xnf_tx_pkt_id_list = tx_id;
1303 		if (rc == DDI_DMA_NORESOURCES) {
1304 			xnfp->xnf_stat_tx_defer++;
1305 			return (B_FALSE); /* Retry later */
1306 		}
1307 #ifdef XNF_DEBUG
1308 		cmn_err(CE_WARN, "xnf%d: bind_handle failed (%x)",
1309 		    ddi_get_instance(xnfp->xnf_devinfo), rc);
1310 #endif
1311 		return (B_FALSE);
1312 	}
1313 
1314 	ASSERT(ncookies == 1);
1315 	ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_tx_head);
1316 	ASSERT((signed short)ref >= 0);
1317 	mfn = xnf_btop(pa_to_ma((paddr_t)dma_cookie.dmac_laddress));
1318 	gnttab_grant_foreign_access_ref(ref, oeid, mfn,
1319 	    xnfp->xnf_tx_pages_readonly);
1320 	txp_info->grant_ref = ref;
1321 	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1322 	txrp->gref = ref;
1323 	txrp->size = dma_cookie.dmac_size;
1324 	txrp->offset = (uintptr_t)bufaddr & PAGEOFFSET;
1325 	txrp->id = tx_id;
1326 	txrp->flags = 0;
1327 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &pflags);
1328 	if (pflags != 0) {
1329 		ASSERT(xnfp->xnf_cksum_offload);
1330 		/*
1331 		 * If the local protocol stack requests checksum
1332 		 * offload we set the 'checksum blank' flag,
1333 		 * indicating to the peer that we need the checksum
1334 		 * calculated for us.
1335 		 *
1336 		 * We _don't_ set the validated flag, because we haven't
1337 		 * validated that the data and the checksum match.
1338 		 */
1339 		xnf_pseudo_cksum(bufaddr, length);
1340 		txrp->flags |= NETTXF_csum_blank;
1341 		xnfp->xnf_stat_tx_cksum_deferred++;
1342 	}
1343 	membar_producer();
1344 	xnfp->xnf_tx_ring.req_prod_pvt = slot + 1;
1345 
1346 	txp_info->mp = mp;
1347 	txp_info->bdesc = xmitbuf;
1348 
1349 	xnfp->xnf_stat_opackets++;
1350 	xnfp->xnf_stat_obytes += pktlen;
1351 
1352 	return (B_TRUE);	/* successful transmit attempt */
1353 }
1354 
1355 mblk_t *
1356 xnf_send(void *arg, mblk_t *mp)
1357 {
1358 	xnf_t *xnfp = arg;
1359 	mblk_t *next;
1360 	boolean_t sent_something = B_FALSE;
1361 
1362 	mutex_enter(&xnfp->xnf_txlock);
1363 
1364 	/*
1365 	 * Transmission attempts should be impossible without having
1366 	 * previously called xnf_start().
1367 	 */
1368 	ASSERT(xnfp->xnf_running);
1369 
1370 	/*
1371 	 * Wait for getting connected to the backend
1372 	 */
1373 	while (!xnfp->xnf_connected) {
1374 		cv_wait(&xnfp->xnf_cv, &xnfp->xnf_txlock);
1375 	}
1376 
1377 	while (mp != NULL) {
1378 		next = mp->b_next;
1379 		mp->b_next = NULL;
1380 
1381 		if (!xnf_send_one(xnfp, mp)) {
1382 			mp->b_next = next;
1383 			break;
1384 		}
1385 
1386 		mp = next;
1387 		sent_something = B_TRUE;
1388 	}
1389 
1390 	if (sent_something) {
1391 		boolean_t notify;
1392 
1393 		/* LINTED: constant in conditional context */
1394 		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1395 		    notify);
1396 		if (notify)
1397 			ec_notify_via_evtchn(xnfp->xnf_evtchn);
1398 	}
1399 
1400 	xnfp->xnf_need_sched = !sent_something;
1401 
1402 	mutex_exit(&xnfp->xnf_txlock);
1403 
1404 	return (mp);
1405 }
1406 
1407 /*
1408  *  xnf_intr() -- ring interrupt service routine
1409  */
1410 static uint_t
1411 xnf_intr(caddr_t arg)
1412 {
1413 	xnf_t *xnfp = (xnf_t *)arg;
1414 	boolean_t sched = B_FALSE;
1415 
1416 	mutex_enter(&xnfp->xnf_intrlock);
1417 
1418 	/* spurious intr */
1419 	if (!xnfp->xnf_connected) {
1420 		mutex_exit(&xnfp->xnf_intrlock);
1421 		xnfp->xnf_stat_unclaimed_interrupts++;
1422 		return (DDI_INTR_UNCLAIMED);
1423 	}
1424 
1425 #ifdef XNF_DEBUG
1426 	if (xnfdebug & XNF_DEBUG_INT)
1427 		printf("xnf%d intr(0x%p)\n",
1428 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1429 #endif
1430 	if (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1431 		mblk_t *mp;
1432 
1433 		if (xnfp->xnf_rx_hvcopy)
1434 			mp = xnf_process_hvcopy_recv(xnfp);
1435 		else
1436 			mp = xnf_process_recv(xnfp);
1437 
1438 		if (mp != NULL)
1439 			mac_rx(xnfp->xnf_mh, xnfp->xnf_rx_handle, mp);
1440 	}
1441 
1442 	xnfp->xnf_stat_interrupts++;
1443 	mutex_exit(&xnfp->xnf_intrlock);
1444 
1445 	/*
1446 	 * Clean tx ring and try to start any blocked xmit streams if
1447 	 * there is now some space.
1448 	 */
1449 	mutex_enter(&xnfp->xnf_txlock);
1450 	if (xnf_clean_tx_ring(xnfp) > 0) {
1451 		sched = xnfp->xnf_need_sched;
1452 		xnfp->xnf_need_sched = B_FALSE;
1453 	}
1454 	mutex_exit(&xnfp->xnf_txlock);
1455 
1456 	if (sched)
1457 		mac_tx_update(xnfp->xnf_mh);
1458 
1459 	return (DDI_INTR_CLAIMED);
1460 }
1461 
1462 /*
1463  *  xnf_start() -- start the board receiving and enable interrupts.
1464  */
1465 static int
1466 xnf_start(void *arg)
1467 {
1468 	xnf_t *xnfp = arg;
1469 
1470 #ifdef XNF_DEBUG
1471 	if (xnfdebug & XNF_DEBUG_TRACE)
1472 		printf("xnf%d start(0x%p)\n",
1473 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1474 #endif
1475 
1476 	mutex_enter(&xnfp->xnf_intrlock);
1477 	mutex_enter(&xnfp->xnf_txlock);
1478 
1479 	/* Accept packets from above. */
1480 	xnfp->xnf_running = B_TRUE;
1481 
1482 	mutex_exit(&xnfp->xnf_txlock);
1483 	mutex_exit(&xnfp->xnf_intrlock);
1484 
1485 	return (0);
1486 }
1487 
1488 /* xnf_stop() - disable hardware */
1489 static void
1490 xnf_stop(void *arg)
1491 {
1492 	xnf_t *xnfp = arg;
1493 
1494 #ifdef XNF_DEBUG
1495 	if (xnfdebug & XNF_DEBUG_TRACE)
1496 		printf("xnf%d stop(0x%p)\n",
1497 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
1498 #endif
1499 
1500 	mutex_enter(&xnfp->xnf_intrlock);
1501 	mutex_enter(&xnfp->xnf_txlock);
1502 
1503 	xnfp->xnf_running = B_FALSE;
1504 
1505 	mutex_exit(&xnfp->xnf_txlock);
1506 	mutex_exit(&xnfp->xnf_intrlock);
1507 }
1508 
1509 /*
1510  * Driver private functions follow
1511  */
1512 
1513 /*
1514  * Hang buffer on rx ring
1515  */
1516 static void
1517 rx_buffer_hang(xnf_t *xnfp, struct xnf_buffer_desc *bdesc)
1518 {
1519 	volatile netif_rx_request_t	*reqp;
1520 	RING_IDX			hang_ix;
1521 	grant_ref_t			ref;
1522 	domid_t				oeid;
1523 
1524 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1525 
1526 	ASSERT(MUTEX_HELD(&xnfp->xnf_intrlock));
1527 	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
1528 	    xnfp->xnf_rx_ring.req_prod_pvt);
1529 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
1530 	ASSERT(xnfp->xnf_rxpkt_bufptr[hang_ix] == NULL);
1531 	if (bdesc->grant_ref == GRANT_INVALID_REF) {
1532 		ref = gnttab_claim_grant_reference(&xnfp->xnf_gref_rx_head);
1533 		ASSERT((signed short)ref >= 0);
1534 		bdesc->grant_ref = ref;
1535 		if (xnfp->xnf_rx_hvcopy) {
1536 			pfn_t pfn = xnf_btop(bdesc->buf_phys);
1537 			mfn_t mfn = pfn_to_mfn(pfn);
1538 
1539 			gnttab_grant_foreign_access_ref(ref, oeid, mfn, 0);
1540 		} else {
1541 			gnttab_grant_foreign_transfer_ref(ref, oeid, 0);
1542 		}
1543 	}
1544 	reqp->id = hang_ix;
1545 	reqp->gref = bdesc->grant_ref;
1546 	bdesc->id = hang_ix;
1547 	xnfp->xnf_rxpkt_bufptr[hang_ix] = bdesc;
1548 	membar_producer();
1549 	xnfp->xnf_rx_ring.req_prod_pvt++;
1550 }
1551 
1552 static mblk_t *
1553 xnf_process_hvcopy_recv(xnf_t *xnfp)
1554 {
1555 	netif_rx_response_t *rxpkt;
1556 	mblk_t		*mp, *head, *tail;
1557 	struct		xnf_buffer_desc *bdesc;
1558 	boolean_t	hwcsum = B_FALSE, notify, work_to_do;
1559 	size_t 		len;
1560 
1561 	/*
1562 	 * in loop over unconsumed responses, we do:
1563 	 * 1. get a response
1564 	 * 2. take corresponding buffer off recv. ring
1565 	 * 3. indicate this by setting slot to NULL
1566 	 * 4. create a new message and
1567 	 * 5. copy data in, adjust ptr
1568 	 *
1569 	 * outside loop:
1570 	 * 7. make sure no more data has arrived; kick HV
1571 	 */
1572 
1573 	head = tail = NULL;
1574 
1575 loop:
1576 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1577 
1578 		/* 1. */
1579 		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
1580 		    xnfp->xnf_rx_ring.rsp_cons);
1581 
1582 		DTRACE_PROBE4(got_PKT, int, (int)rxpkt->id, int,
1583 		    (int)rxpkt->offset,
1584 		    int, (int)rxpkt->flags, int, (int)rxpkt->status);
1585 
1586 		/*
1587 		 * 2.
1588 		 * Take buffer off of receive ring
1589 		 */
1590 		hwcsum = B_FALSE;
1591 		bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id];
1592 		/* 3 */
1593 		xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL;
1594 		ASSERT(bdesc->id == rxpkt->id);
1595 		mp = NULL;
1596 		if (!xnfp->xnf_running) {
1597 			DTRACE_PROBE4(pkt_dropped, int, rxpkt->status,
1598 			    char *, bdesc->buf, int, rxpkt->offset,
1599 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1600 			xnfp->xnf_stat_drop++;
1601 			/*
1602 			 * re-hang the buffer
1603 			 */
1604 			rx_buffer_hang(xnfp, bdesc);
1605 		} else if (rxpkt->status <= 0) {
1606 			DTRACE_PROBE4(pkt_status_negative, int, rxpkt->status,
1607 			    char *, bdesc->buf, int, rxpkt->offset,
1608 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1609 			xnfp->xnf_stat_errrx++;
1610 			if (rxpkt->status == 0)
1611 				xnfp->xnf_stat_runt++;
1612 			if (rxpkt->status == NETIF_RSP_ERROR)
1613 				xnfp->xnf_stat_mac_rcv_error++;
1614 			if (rxpkt->status == NETIF_RSP_DROPPED)
1615 				xnfp->xnf_stat_norxbuf++;
1616 			/*
1617 			 * re-hang the buffer
1618 			 */
1619 			rx_buffer_hang(xnfp, bdesc);
1620 		} else {
1621 			grant_ref_t		ref =  bdesc->grant_ref;
1622 			struct xnf_buffer_desc	*new_bdesc;
1623 			unsigned long		off = rxpkt->offset;
1624 
1625 			DTRACE_PROBE4(pkt_status_ok, int, rxpkt->status,
1626 			    char *, bdesc->buf, int, rxpkt->offset,
1627 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
1628 			len = rxpkt->status;
1629 			ASSERT(off + len <= PAGEOFFSET);
1630 			if (ref == GRANT_INVALID_REF) {
1631 				mp = NULL;
1632 				new_bdesc = bdesc;
1633 				cmn_err(CE_WARN, "Bad rx grant reference %d "
1634 				    "from dom %d", ref,
1635 				    xvdi_get_oeid(xnfp->xnf_devinfo));
1636 				goto luckless;
1637 			}
1638 			/*
1639 			 * Release ref which we'll be re-claiming in
1640 			 * rx_buffer_hang().
1641 			 */
1642 			bdesc->grant_ref = GRANT_INVALID_REF;
1643 			(void) gnttab_end_foreign_access_ref(ref, 0);
1644 			gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head,
1645 			    ref);
1646 			if (rxpkt->flags & NETRXF_data_validated)
1647 				hwcsum = B_TRUE;
1648 
1649 			/*
1650 			 * XXPV for the initial implementation of HVcopy,
1651 			 * create a new msg and copy in the data
1652 			 */
1653 			/* 4. */
1654 			if ((mp = allocb(len, BPRI_MED)) == NULL) {
1655 				/*
1656 				 * Couldn't get buffer to copy to,
1657 				 * drop this data, and re-hang
1658 				 * the buffer on the ring.
1659 				 */
1660 				xnfp->xnf_stat_norxbuf++;
1661 				DTRACE_PROBE(alloc_nix);
1662 			} else {
1663 				/* 5. */
1664 				DTRACE_PROBE(alloc_ok);
1665 				bcopy(bdesc->buf + off, mp->b_wptr,
1666 				    len);
1667 				mp->b_wptr += len;
1668 			}
1669 			new_bdesc = bdesc;
1670 luckless:
1671 
1672 			/* Re-hang old or hang new buffer. */
1673 			rx_buffer_hang(xnfp, new_bdesc);
1674 		}
1675 		if (mp) {
1676 			if (hwcsum) {
1677 				/*
1678 				 * See comments in xnf_process_recv().
1679 				 */
1680 
1681 				(void) hcksum_assoc(mp, NULL,
1682 				    NULL, 0, 0, 0, 0,
1683 				    HCK_FULLCKSUM |
1684 				    HCK_FULLCKSUM_OK,
1685 				    0);
1686 				xnfp->xnf_stat_rx_cksum_no_need++;
1687 			}
1688 			if (head == NULL) {
1689 				head = tail = mp;
1690 			} else {
1691 				tail->b_next = mp;
1692 				tail = mp;
1693 			}
1694 
1695 			ASSERT(mp->b_next == NULL);
1696 
1697 			xnfp->xnf_stat_ipackets++;
1698 			xnfp->xnf_stat_rbytes += len;
1699 		}
1700 
1701 		xnfp->xnf_rx_ring.rsp_cons++;
1702 
1703 		xnfp->xnf_stat_hvcopy_packet_processed++;
1704 	}
1705 
1706 	/* 7. */
1707 	/*
1708 	 * Has more data come in since we started?
1709 	 */
1710 	/* LINTED: constant in conditional context */
1711 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do);
1712 	if (work_to_do)
1713 		goto loop;
1714 
1715 	/*
1716 	 * Indicate to the backend that we have re-filled the receive
1717 	 * ring.
1718 	 */
1719 	/* LINTED: constant in conditional context */
1720 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
1721 	if (notify)
1722 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1723 
1724 	return (head);
1725 }
1726 
1727 /* Process all queued received packets */
1728 static mblk_t *
1729 xnf_process_recv(xnf_t *xnfp)
1730 {
1731 	volatile netif_rx_response_t *rxpkt;
1732 	mblk_t *mp, *head, *tail;
1733 	struct xnf_buffer_desc *bdesc;
1734 	extern mblk_t *desballoc(unsigned char *, size_t, uint_t, frtn_t *);
1735 	boolean_t hwcsum = B_FALSE, notify, work_to_do;
1736 	size_t len;
1737 	pfn_t pfn;
1738 	long cnt;
1739 
1740 	head = tail = NULL;
1741 loop:
1742 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
1743 
1744 		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
1745 		    xnfp->xnf_rx_ring.rsp_cons);
1746 
1747 		/*
1748 		 * Take buffer off of receive ring
1749 		 */
1750 		hwcsum = B_FALSE;
1751 		bdesc = xnfp->xnf_rxpkt_bufptr[rxpkt->id];
1752 		xnfp->xnf_rxpkt_bufptr[rxpkt->id] = NULL;
1753 		ASSERT(bdesc->id == rxpkt->id);
1754 		mp = NULL;
1755 		if (!xnfp->xnf_running) {
1756 			xnfp->xnf_stat_drop++;
1757 			/*
1758 			 * re-hang the buffer
1759 			 */
1760 			rx_buffer_hang(xnfp, bdesc);
1761 		} else if (rxpkt->status <= 0) {
1762 			xnfp->xnf_stat_errrx++;
1763 			if (rxpkt->status == 0)
1764 				xnfp->xnf_stat_runt++;
1765 			if (rxpkt->status == NETIF_RSP_ERROR)
1766 				xnfp->xnf_stat_mac_rcv_error++;
1767 			if (rxpkt->status == NETIF_RSP_DROPPED)
1768 				xnfp->xnf_stat_norxbuf++;
1769 			/*
1770 			 * re-hang the buffer
1771 			 */
1772 			rx_buffer_hang(xnfp, bdesc);
1773 		} else {
1774 			grant_ref_t ref =  bdesc->grant_ref;
1775 			struct xnf_buffer_desc *new_bdesc;
1776 			unsigned long off = rxpkt->offset;
1777 			unsigned long mfn;
1778 
1779 			len = rxpkt->status;
1780 			ASSERT(off + len <= PAGEOFFSET);
1781 			if (ref == GRANT_INVALID_REF) {
1782 				mp = NULL;
1783 				new_bdesc = bdesc;
1784 				cmn_err(CE_WARN, "Bad rx grant reference %d "
1785 				    "from dom %d", ref,
1786 				    xvdi_get_oeid(xnfp->xnf_devinfo));
1787 				goto luckless;
1788 			}
1789 			bdesc->grant_ref = GRANT_INVALID_REF;
1790 			mfn = gnttab_end_foreign_transfer_ref(ref);
1791 			ASSERT(mfn != MFN_INVALID);
1792 			ASSERT(hat_getpfnum(kas.a_hat, bdesc->buf) ==
1793 			    PFN_INVALID);
1794 
1795 			gnttab_release_grant_reference(&xnfp->xnf_gref_rx_head,
1796 			    ref);
1797 			reassign_pfn(xnf_btop(bdesc->buf_phys), mfn);
1798 			hat_devload(kas.a_hat, bdesc->buf, PAGESIZE,
1799 			    xnf_btop(bdesc->buf_phys),
1800 			    PROT_READ | PROT_WRITE, HAT_LOAD);
1801 			balloon_drv_added(1);
1802 
1803 			if (rxpkt->flags & NETRXF_data_validated)
1804 				hwcsum = B_TRUE;
1805 			if (len <= xnf_rx_bcopy_thresh) {
1806 				/*
1807 				 * For small buffers, just copy the data
1808 				 * and send the copy upstream.
1809 				 */
1810 				new_bdesc = NULL;
1811 			} else {
1812 				/*
1813 				 * We send a pointer to this data upstream;
1814 				 * we need a new buffer to replace this one.
1815 				 */
1816 				mutex_enter(&xnfp->xnf_rx_buf_mutex);
1817 				new_bdesc = xnf_get_buffer(xnfp);
1818 				if (new_bdesc != NULL) {
1819 					xnfp->xnf_rx_bufs_outstanding++;
1820 				} else {
1821 					xnfp->xnf_stat_rx_no_ringbuf++;
1822 				}
1823 				mutex_exit(&xnfp->xnf_rx_buf_mutex);
1824 			}
1825 
1826 			if (new_bdesc == NULL) {
1827 				/*
1828 				 * Don't have a new ring buffer; bcopy the data
1829 				 * from the buffer, and preserve the
1830 				 * original buffer
1831 				 */
1832 				if ((mp = allocb(len, BPRI_MED)) == NULL) {
1833 					/*
1834 					 * Could't get buffer to copy to,
1835 					 * drop this data, and re-hang
1836 					 * the buffer on the ring.
1837 					 */
1838 					xnfp->xnf_stat_norxbuf++;
1839 				} else {
1840 					bcopy(bdesc->buf + off, mp->b_wptr,
1841 					    len);
1842 				}
1843 				/*
1844 				 * Give the buffer page back to xen
1845 				 */
1846 				pfn = xnf_btop(bdesc->buf_phys);
1847 				cnt = balloon_free_pages(1, &mfn, bdesc->buf,
1848 				    &pfn);
1849 				if (cnt != 1) {
1850 					cmn_err(CE_WARN, "unable to give a "
1851 					    "page back to the hypervisor\n");
1852 				}
1853 				new_bdesc = bdesc;
1854 			} else {
1855 				if ((mp = desballoc((unsigned char *)bdesc->buf,
1856 				    off + len, 0, (frtn_t *)bdesc)) == NULL) {
1857 					/*
1858 					 * Couldn't get mblk to pass recv data
1859 					 * up with, free the old ring buffer
1860 					 */
1861 					xnfp->xnf_stat_norxbuf++;
1862 					xnf_rcv_complete(bdesc);
1863 					goto luckless;
1864 				}
1865 				(void) ddi_dma_sync(bdesc->dma_handle,
1866 				    0, 0, DDI_DMA_SYNC_FORCPU);
1867 
1868 				mp->b_wptr += off;
1869 				mp->b_rptr += off;
1870 			}
1871 luckless:
1872 			if (mp)
1873 				mp->b_wptr += len;
1874 			/* re-hang old or hang new buffer */
1875 			rx_buffer_hang(xnfp, new_bdesc);
1876 		}
1877 		if (mp) {
1878 			if (hwcsum) {
1879 				/*
1880 				 * If the peer says that the data has
1881 				 * been validated then we declare that
1882 				 * the full checksum has been
1883 				 * verified.
1884 				 *
1885 				 * We don't look at the "checksum
1886 				 * blank" flag, and hence could have a
1887 				 * packet here that we are asserting
1888 				 * is good with a blank checksum.
1889 				 *
1890 				 * The hardware checksum offload
1891 				 * specification says that we must
1892 				 * provide the actual checksum as well
1893 				 * as an assertion that it is valid,
1894 				 * but the protocol stack doesn't
1895 				 * actually use it and some other
1896 				 * drivers don't bother, so we don't.
1897 				 * If it was necessary we could grovel
1898 				 * in the packet to find it.
1899 				 */
1900 
1901 				(void) hcksum_assoc(mp, NULL,
1902 				    NULL, 0, 0, 0, 0,
1903 				    HCK_FULLCKSUM |
1904 				    HCK_FULLCKSUM_OK,
1905 				    0);
1906 				xnfp->xnf_stat_rx_cksum_no_need++;
1907 			}
1908 			if (head == NULL) {
1909 				head = tail = mp;
1910 			} else {
1911 				tail->b_next = mp;
1912 				tail = mp;
1913 			}
1914 
1915 			ASSERT(mp->b_next == NULL);
1916 
1917 			xnfp->xnf_stat_ipackets++;
1918 			xnfp->xnf_stat_rbytes += len;
1919 		}
1920 
1921 		xnfp->xnf_rx_ring.rsp_cons++;
1922 	}
1923 
1924 	/*
1925 	 * Has more data come in since we started?
1926 	 */
1927 	/* LINTED: constant in conditional context */
1928 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_rx_ring, work_to_do);
1929 	if (work_to_do)
1930 		goto loop;
1931 
1932 	/*
1933 	 * Indicate to the backend that we have re-filled the receive
1934 	 * ring.
1935 	 */
1936 	/* LINTED: constant in conditional context */
1937 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
1938 	if (notify)
1939 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1940 
1941 	return (head);
1942 }
1943 
1944 /* Called when the upper layers free a message we passed upstream */
1945 static void
1946 xnf_rcv_complete(struct xnf_buffer_desc *bdesc)
1947 {
1948 	xnf_t *xnfp = bdesc->xnfp;
1949 	pfn_t pfn;
1950 	long cnt;
1951 
1952 	/* One less outstanding receive buffer */
1953 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
1954 	--xnfp->xnf_rx_bufs_outstanding;
1955 	/*
1956 	 * Return buffer to the free list, unless the free list is getting
1957 	 * too large.  XXPV - this threshold may need tuning.
1958 	 */
1959 	if (xnfp->xnf_rx_descs_free < xnf_rx_bufs_lowat) {
1960 		/*
1961 		 * Unmap the page, and hand the machine page back
1962 		 * to xen so it can be re-used as a backend net buffer.
1963 		 */
1964 		pfn = xnf_btop(bdesc->buf_phys);
1965 		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
1966 		if (cnt != 1) {
1967 			cmn_err(CE_WARN, "unable to give a page back to the "
1968 			    "hypervisor\n");
1969 		}
1970 
1971 		bdesc->next = xnfp->xnf_free_list;
1972 		xnfp->xnf_free_list = bdesc;
1973 		xnfp->xnf_rx_descs_free++;
1974 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
1975 	} else {
1976 		/*
1977 		 * We can return everything here since we have a free buffer
1978 		 * that we have not given the backing page for back to xen.
1979 		 */
1980 		--xnfp->xnf_rx_buffer_count;
1981 		mutex_exit(&xnfp->xnf_rx_buf_mutex);
1982 		(void) ddi_dma_unbind_handle(bdesc->dma_handle);
1983 		ddi_dma_mem_free(&bdesc->acc_handle);
1984 		ddi_dma_free_handle(&bdesc->dma_handle);
1985 		kmem_free(bdesc, sizeof (*bdesc));
1986 	}
1987 }
1988 
1989 /*
1990  *  xnf_alloc_dma_resources() -- initialize the drivers structures
1991  */
1992 static int
1993 xnf_alloc_dma_resources(xnf_t *xnfp)
1994 {
1995 	dev_info_t 		*devinfo = xnfp->xnf_devinfo;
1996 	int			i;
1997 	size_t			len;
1998 	ddi_dma_cookie_t	dma_cookie;
1999 	uint_t			ncookies;
2000 	struct xnf_buffer_desc	*bdesc;
2001 	int			rc;
2002 	caddr_t			rptr;
2003 
2004 	xnfp->xnf_n_rx = NET_RX_RING_SIZE;
2005 	xnfp->xnf_max_rx_bufs = xnf_rx_bufs_hiwat;
2006 
2007 	xnfp->xnf_n_tx = NET_TX_RING_SIZE;
2008 
2009 	/*
2010 	 * The code below allocates all the DMA data structures that
2011 	 * need to be released when the driver is detached.
2012 	 *
2013 	 * First allocate handles for mapping (virtual address) pointers to
2014 	 * transmit data buffers to physical addresses
2015 	 */
2016 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
2017 		if ((rc = ddi_dma_alloc_handle(devinfo,
2018 		    &tx_buffer_dma_attr, DDI_DMA_SLEEP, 0,
2019 		    &xnfp->xnf_tx_pkt_info[i].dma_handle)) != DDI_SUCCESS)
2020 			return (DDI_FAILURE);
2021 	}
2022 
2023 	/*
2024 	 * Allocate page for the transmit descriptor ring.
2025 	 */
2026 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2027 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
2028 		goto alloc_error;
2029 
2030 	if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
2031 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2032 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2033 	    &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
2034 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2035 		xnfp->xnf_tx_ring_dma_handle = NULL;
2036 		goto alloc_error;
2037 	}
2038 
2039 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
2040 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2041 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2042 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2043 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2044 		xnfp->xnf_tx_ring_dma_handle = NULL;
2045 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2046 		if (rc == DDI_DMA_NORESOURCES)
2047 			goto alloc_error;
2048 		else
2049 			goto error;
2050 	}
2051 
2052 	ASSERT(ncookies == 1);
2053 	bzero(rptr, PAGESIZE);
2054 	/* LINTED: constant in conditional context */
2055 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
2056 	/* LINTED: constant in conditional context */
2057 	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
2058 	xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
2059 
2060 	/*
2061 	 * Allocate page for the receive descriptor ring.
2062 	 */
2063 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2064 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
2065 		goto alloc_error;
2066 
2067 	if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
2068 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2069 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2070 	    &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
2071 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2072 		xnfp->xnf_rx_ring_dma_handle = NULL;
2073 		goto alloc_error;
2074 	}
2075 
2076 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
2077 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2078 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2079 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2080 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2081 		xnfp->xnf_rx_ring_dma_handle = NULL;
2082 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2083 		if (rc == DDI_DMA_NORESOURCES)
2084 			goto alloc_error;
2085 		else
2086 			goto error;
2087 	}
2088 
2089 	ASSERT(ncookies == 1);
2090 	bzero(rptr, PAGESIZE);
2091 	/* LINTED: constant in conditional context */
2092 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
2093 	/* LINTED: constant in conditional context */
2094 	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
2095 	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
2096 
2097 	/*
2098 	 * Preallocate receive buffers for each receive descriptor.
2099 	 */
2100 
2101 	/* Set up the "free list" of receive buffer descriptors */
2102 	for (i = 0; i < xnfp->xnf_n_rx; i++) {
2103 		if ((bdesc = xnf_alloc_buffer(xnfp)) == NULL)
2104 			goto alloc_error;
2105 		bdesc->next = xnfp->xnf_free_list;
2106 		xnfp->xnf_free_list = bdesc;
2107 	}
2108 
2109 	return (DDI_SUCCESS);
2110 
2111 alloc_error:
2112 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
2113 	    ddi_get_instance(xnfp->xnf_devinfo));
2114 error:
2115 	xnf_release_dma_resources(xnfp);
2116 	return (DDI_FAILURE);
2117 }
2118 
2119 /*
2120  * Release all DMA resources in the opposite order from acquisition
2121  * Should not be called until all outstanding esballoc buffers
2122  * have been returned.
2123  */
2124 static void
2125 xnf_release_dma_resources(xnf_t *xnfp)
2126 {
2127 	int i;
2128 
2129 	/*
2130 	 * Free receive buffers which are currently associated with
2131 	 * descriptors
2132 	 */
2133 	for (i = 0; i < xnfp->xnf_n_rx; i++) {
2134 		struct xnf_buffer_desc *bp;
2135 
2136 		if ((bp = xnfp->xnf_rxpkt_bufptr[i]) == NULL)
2137 			continue;
2138 		xnf_free_buffer(bp);
2139 		xnfp->xnf_rxpkt_bufptr[i] = NULL;
2140 	}
2141 
2142 	/* Free the receive ring buffer */
2143 	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2144 		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2145 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2146 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2147 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2148 	}
2149 	/* Free the transmit ring buffer */
2150 	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2151 		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2152 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2153 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2154 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2155 	}
2156 
2157 	/*
2158 	 * Free handles for mapping (virtual address) pointers to
2159 	 * transmit data buffers to physical addresses
2160 	 */
2161 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
2162 		if (xnfp->xnf_tx_pkt_info[i].dma_handle != NULL) {
2163 			ddi_dma_free_handle(
2164 			    &xnfp->xnf_tx_pkt_info[i].dma_handle);
2165 		}
2166 	}
2167 
2168 }
2169 
2170 static void
2171 xnf_release_mblks(xnf_t *xnfp)
2172 {
2173 	int	i;
2174 
2175 	for (i = 0; i < xnfp->xnf_n_tx; i++) {
2176 		if (xnfp->xnf_tx_pkt_info[i].mp == NULL)
2177 			continue;
2178 		freemsg(xnfp->xnf_tx_pkt_info[i].mp);
2179 		xnfp->xnf_tx_pkt_info[i].mp = NULL;
2180 		(void) ddi_dma_unbind_handle(
2181 		    xnfp->xnf_tx_pkt_info[i].dma_handle);
2182 	}
2183 }
2184 
2185 /*
2186  * Remove a xmit buffer descriptor from the head of the free list and return
2187  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
2188  * Called with the tx_buf_mutex held.
2189  */
2190 static struct xnf_buffer_desc *
2191 xnf_get_tx_buffer(xnf_t *xnfp)
2192 {
2193 	struct xnf_buffer_desc *bdesc;
2194 
2195 	bdesc = xnfp->xnf_tx_free_list;
2196 	if (bdesc != NULL) {
2197 		xnfp->xnf_tx_free_list = bdesc->next;
2198 	} else {
2199 		bdesc = xnf_alloc_tx_buffer(xnfp);
2200 	}
2201 	return (bdesc);
2202 }
2203 
2204 /*
2205  * Remove a buffer descriptor from the head of the free list and return
2206  * a pointer to it.  If no buffers on list, attempt to allocate a new one.
2207  * Called with the rx_buf_mutex held.
2208  */
2209 static struct xnf_buffer_desc *
2210 xnf_get_buffer(xnf_t *xnfp)
2211 {
2212 	struct xnf_buffer_desc *bdesc;
2213 
2214 	bdesc = xnfp->xnf_free_list;
2215 	if (bdesc != NULL) {
2216 		xnfp->xnf_free_list = bdesc->next;
2217 		xnfp->xnf_rx_descs_free--;
2218 	} else {
2219 		bdesc = xnf_alloc_buffer(xnfp);
2220 	}
2221 	return (bdesc);
2222 }
2223 
2224 /*
2225  * Free a xmit buffer back to the xmit free list
2226  */
2227 static void
2228 xnf_free_tx_buffer(struct xnf_buffer_desc *bp)
2229 {
2230 	xnf_t *xnfp = bp->xnfp;
2231 
2232 	mutex_enter(&xnfp->xnf_tx_buf_mutex);
2233 	bp->next = xnfp->xnf_tx_free_list;
2234 	xnfp->xnf_tx_free_list = bp;
2235 	mutex_exit(&xnfp->xnf_tx_buf_mutex);
2236 }
2237 
2238 /*
2239  * Put a buffer descriptor onto the head of the free list.
2240  * for page-flip:
2241  * We can't really free these buffers back to the kernel
2242  * since we have given away their backing page to be used
2243  * by the back end net driver.
2244  * for hvcopy:
2245  * release all the memory
2246  */
2247 static void
2248 xnf_free_buffer(struct xnf_buffer_desc *bdesc)
2249 {
2250 	xnf_t *xnfp = bdesc->xnfp;
2251 
2252 	mutex_enter(&xnfp->xnf_rx_buf_mutex);
2253 	if (xnfp->xnf_rx_hvcopy) {
2254 		if (ddi_dma_unbind_handle(bdesc->dma_handle) != DDI_SUCCESS)
2255 			goto out;
2256 		ddi_dma_mem_free(&bdesc->acc_handle);
2257 		ddi_dma_free_handle(&bdesc->dma_handle);
2258 		kmem_free(bdesc, sizeof (*bdesc));
2259 		xnfp->xnf_rx_buffer_count--;
2260 	} else {
2261 		bdesc->next = xnfp->xnf_free_list;
2262 		xnfp->xnf_free_list = bdesc;
2263 		xnfp->xnf_rx_descs_free++;
2264 	}
2265 out:
2266 	mutex_exit(&xnfp->xnf_rx_buf_mutex);
2267 }
2268 
2269 /*
2270  * Allocate a DMA-able xmit buffer, including a structure to
2271  * keep track of the buffer.  Called with tx_buf_mutex held.
2272  */
2273 static struct xnf_buffer_desc *
2274 xnf_alloc_tx_buffer(xnf_t *xnfp)
2275 {
2276 	struct xnf_buffer_desc *bdesc;
2277 	size_t len;
2278 
2279 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
2280 		return (NULL);
2281 
2282 	/* allocate a DMA access handle for receive buffer */
2283 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buffer_dma_attr,
2284 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2285 		goto failure;
2286 
2287 	/* Allocate DMA-able memory for transmit buffer */
2288 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2289 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
2290 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2291 		goto failure_1;
2292 
2293 	bdesc->xnfp = xnfp;
2294 	xnfp->xnf_tx_buffer_count++;
2295 
2296 	return (bdesc);
2297 
2298 failure_1:
2299 	ddi_dma_free_handle(&bdesc->dma_handle);
2300 
2301 failure:
2302 	kmem_free(bdesc, sizeof (*bdesc));
2303 	return (NULL);
2304 }
2305 
2306 /*
2307  * Allocate a DMA-able receive buffer, including a structure to
2308  * keep track of the buffer.  Called with rx_buf_mutex held.
2309  */
2310 static struct xnf_buffer_desc *
2311 xnf_alloc_buffer(xnf_t *xnfp)
2312 {
2313 	struct			xnf_buffer_desc *bdesc;
2314 	size_t			len;
2315 	uint_t			ncookies;
2316 	ddi_dma_cookie_t	dma_cookie;
2317 	long			cnt;
2318 	pfn_t			pfn;
2319 
2320 	if (xnfp->xnf_rx_buffer_count >= xnfp->xnf_max_rx_bufs)
2321 		return (NULL);
2322 
2323 	if ((bdesc = kmem_zalloc(sizeof (*bdesc), KM_NOSLEEP)) == NULL)
2324 		return (NULL);
2325 
2326 	/* allocate a DMA access handle for receive buffer */
2327 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buffer_dma_attr,
2328 	    0, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2329 		goto failure;
2330 
2331 	/* Allocate DMA-able memory for receive buffer */
2332 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2333 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, 0, 0,
2334 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2335 		goto failure_1;
2336 
2337 	/* bind to virtual address of buffer to get physical address */
2338 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2339 	    bdesc->buf, PAGESIZE, DDI_DMA_READ | DDI_DMA_STREAMING,
2340 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2341 		goto failure_2;
2342 
2343 	bdesc->buf_phys = dma_cookie.dmac_laddress;
2344 	bdesc->xnfp = xnfp;
2345 	if (xnfp->xnf_rx_hvcopy) {
2346 		bdesc->free_rtn.free_func = xnf_copy_rcv_complete;
2347 	} else {
2348 		bdesc->free_rtn.free_func = xnf_rcv_complete;
2349 	}
2350 	bdesc->free_rtn.free_arg = (char *)bdesc;
2351 	bdesc->grant_ref = GRANT_INVALID_REF;
2352 	ASSERT(ncookies == 1);
2353 
2354 	xnfp->xnf_rx_buffer_count++;
2355 
2356 	if (!xnfp->xnf_rx_hvcopy) {
2357 		/*
2358 		 * Unmap the page, and hand the machine page back
2359 		 * to xen so it can be used as a backend net buffer.
2360 		 */
2361 		pfn = xnf_btop(bdesc->buf_phys);
2362 		cnt = balloon_free_pages(1, NULL, bdesc->buf, &pfn);
2363 		if (cnt != 1) {
2364 			cmn_err(CE_WARN, "unable to give a page back to the "
2365 			    "hypervisor\n");
2366 		}
2367 	}
2368 
2369 	return (bdesc);
2370 
2371 failure_2:
2372 	ddi_dma_mem_free(&bdesc->acc_handle);
2373 
2374 failure_1:
2375 	ddi_dma_free_handle(&bdesc->dma_handle);
2376 
2377 failure:
2378 	kmem_free(bdesc, sizeof (*bdesc));
2379 	return (NULL);
2380 }
2381 
/*
 * Statistics.
 *
 * Names of the driver-private ("aux") named kstats.  The position of
 * a name in this table is the position of its value in the kstat data
 * area, so xnf_kstat_aux_update() must store the counters in exactly
 * this order.  Each entry is annotated with the soft-state counter it
 * reports.
 */
static char *xnf_aux_statistics[] = {
	"tx_cksum_deferred",		/* xnf_stat_tx_cksum_deferred */
	"rx_cksum_no_need",		/* xnf_stat_rx_cksum_no_need */
	"interrupts",			/* xnf_stat_interrupts */
	"unclaimed_interrupts",		/* xnf_stat_unclaimed_interrupts */
	"tx_pullup",			/* xnf_stat_tx_pullup */
	"tx_pagebndry",			/* xnf_stat_tx_pagebndry */
	"tx_attempt",			/* xnf_stat_tx_attempt */
	"rx_no_ringbuf",		/* xnf_stat_rx_no_ringbuf */
	"hvcopy_packet_processed",	/* xnf_stat_hvcopy_packet_processed */
};
2396 
2397 static int
2398 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2399 {
2400 	xnf_t *xnfp;
2401 	kstat_named_t *knp;
2402 
2403 	if (flag != KSTAT_READ)
2404 		return (EACCES);
2405 
2406 	xnfp = ksp->ks_private;
2407 	knp = ksp->ks_data;
2408 
2409 	/*
2410 	 * Assignment order must match that of the names in
2411 	 * xnf_aux_statistics.
2412 	 */
2413 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2414 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2415 
2416 	(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2417 	(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2418 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2419 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry;
2420 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt;
2421 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_no_ringbuf;
2422 
2423 	(knp++)->value.ui64 = xnfp->xnf_stat_hvcopy_packet_processed;
2424 
2425 	return (0);
2426 }
2427 
2428 static boolean_t
2429 xnf_kstat_init(xnf_t *xnfp)
2430 {
2431 	int nstat = sizeof (xnf_aux_statistics) /
2432 	    sizeof (xnf_aux_statistics[0]);
2433 	char **cp = xnf_aux_statistics;
2434 	kstat_named_t *knp;
2435 
2436 	/*
2437 	 * Create and initialise kstats.
2438 	 */
2439 	if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2440 	    ddi_get_instance(xnfp->xnf_devinfo),
2441 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
2442 	    nstat, 0)) == NULL)
2443 		return (B_FALSE);
2444 
2445 	xnfp->xnf_kstat_aux->ks_private = xnfp;
2446 	xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2447 
2448 	knp = xnfp->xnf_kstat_aux->ks_data;
2449 	while (nstat > 0) {
2450 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2451 
2452 		knp++;
2453 		cp++;
2454 		nstat--;
2455 	}
2456 
2457 	kstat_install(xnfp->xnf_kstat_aux);
2458 
2459 	return (B_TRUE);
2460 }
2461 
2462 static int
2463 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2464 {
2465 	xnf_t *xnfp = arg;
2466 
2467 	mutex_enter(&xnfp->xnf_intrlock);
2468 	mutex_enter(&xnfp->xnf_txlock);
2469 
2470 #define	mac_stat(q, r)				\
2471 	case (MAC_STAT_##q):			\
2472 		*val = xnfp->xnf_stat_##r;	\
2473 		break
2474 
2475 #define	ether_stat(q, r)			\
2476 	case (ETHER_STAT_##q):			\
2477 		*val = xnfp->xnf_stat_##r;	\
2478 		break
2479 
2480 	switch (stat) {
2481 
2482 	mac_stat(IPACKETS, ipackets);
2483 	mac_stat(OPACKETS, opackets);
2484 	mac_stat(RBYTES, rbytes);
2485 	mac_stat(OBYTES, obytes);
2486 	mac_stat(NORCVBUF, norxbuf);
2487 	mac_stat(IERRORS, errrx);
2488 	mac_stat(NOXMTBUF, tx_defer);
2489 
2490 	ether_stat(MACRCV_ERRORS, mac_rcv_error);
2491 	ether_stat(TOOSHORT_ERRORS, runt);
2492 
2493 	/* always claim to be in full duplex mode */
2494 	case ETHER_STAT_LINK_DUPLEX:
2495 		*val = LINK_DUPLEX_FULL;
2496 		break;
2497 
2498 	/* always claim to be at 1Gb/s link speed */
2499 	case MAC_STAT_IFSPEED:
2500 		*val = 1000000000ull;
2501 		break;
2502 
2503 	default:
2504 		mutex_exit(&xnfp->xnf_txlock);
2505 		mutex_exit(&xnfp->xnf_intrlock);
2506 
2507 		return (ENOTSUP);
2508 	}
2509 
2510 #undef mac_stat
2511 #undef ether_stat
2512 
2513 	mutex_exit(&xnfp->xnf_txlock);
2514 	mutex_exit(&xnfp->xnf_intrlock);
2515 
2516 	return (0);
2517 }
2518 
2519 /*ARGSUSED*/
2520 static void
2521 xnf_blank(void *arg, time_t ticks, uint_t count)
2522 {
2523 	/*
2524 	 * XXPV dme: blanking is not currently implemented.
2525 	 *
2526 	 * It's not obvious how to use the 'ticks' argument here.
2527 	 *
2528 	 * 'Count' might be used as an indicator of how to set
2529 	 * rsp_event when posting receive buffers to the rx_ring.  It
2530 	 * would replace the code at the tail of xnf_process_recv()
2531 	 * that simply indicates that the next completed packet should
2532 	 * cause an interrupt.
2533 	 */
2534 }
2535 
2536 static void
2537 xnf_resources(void *arg)
2538 {
2539 	xnf_t *xnfp = arg;
2540 	mac_rx_fifo_t mrf;
2541 
2542 	mrf.mrf_type = MAC_RX_FIFO;
2543 	mrf.mrf_blank = xnf_blank;
2544 	mrf.mrf_arg = (void *)xnfp;
2545 	mrf.mrf_normal_blank_time = 128;	/* XXPV dme: see xnf_blank() */
2546 	mrf.mrf_normal_pkt_count = 8;		/* XXPV dme: see xnf_blank() */
2547 
2548 	xnfp->xnf_rx_handle = mac_resource_add(xnfp->xnf_mh,
2549 	    (mac_resource_t *)&mrf);
2550 }
2551 
2552 /*ARGSUSED*/
2553 static void
2554 xnf_ioctl(void *arg, queue_t *q, mblk_t *mp)
2555 {
2556 	miocnak(q, mp, 0, EINVAL);
2557 }
2558 
2559 static boolean_t
2560 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
2561 {
2562 	xnf_t *xnfp = arg;
2563 
2564 	switch (cap) {
2565 	case MAC_CAPAB_HCKSUM: {
2566 		uint32_t *capab = cap_data;
2567 
2568 		/*
2569 		 * Whilst the flag used to communicate with the IO
2570 		 * domain is called "NETTXF_csum_blank", the checksum
2571 		 * in the packet must contain the pseudo-header
2572 		 * checksum and not zero.
2573 		 *
2574 		 * To help out the IO domain, we might use
2575 		 * HCKSUM_INET_PARTIAL. Unfortunately our stack will
2576 		 * then use checksum offload for IPv6 packets, which
2577 		 * the IO domain can't handle.
2578 		 *
2579 		 * As a result, we declare outselves capable of
2580 		 * HCKSUM_INET_FULL_V4. This means that we receive
2581 		 * IPv4 packets from the stack with a blank checksum
2582 		 * field and must insert the pseudo-header checksum
2583 		 * before passing the packet to the IO domain.
2584 		 */
2585 		if (xnfp->xnf_cksum_offload)
2586 			*capab = HCKSUM_INET_FULL_V4;
2587 		else
2588 			*capab = 0;
2589 		break;
2590 	}
2591 
2592 	case MAC_CAPAB_POLL:
2593 		/* Just return B_TRUE. */
2594 		break;
2595 
2596 	default:
2597 		return (B_FALSE);
2598 	}
2599 
2600 	return (B_TRUE);
2601 }
2602 
2603 /*ARGSUSED*/
2604 static void
2605 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
2606     void *arg, void *impl_data)
2607 {
2608 	xnf_t *xnfp = ddi_get_driver_private(dip);
2609 	XenbusState new_state = *(XenbusState *)impl_data;
2610 
2611 	ASSERT(xnfp != NULL);
2612 
2613 	switch (new_state) {
2614 	case XenbusStateConnected:
2615 		mutex_enter(&xnfp->xnf_intrlock);
2616 		mutex_enter(&xnfp->xnf_txlock);
2617 
2618 		xnfp->xnf_connected = B_TRUE;
2619 		/*
2620 		 * wake up threads wanting to send data to backend,
2621 		 * but got blocked due to backend is not ready
2622 		 */
2623 		cv_broadcast(&xnfp->xnf_cv);
2624 
2625 		mutex_exit(&xnfp->xnf_txlock);
2626 		mutex_exit(&xnfp->xnf_intrlock);
2627 
2628 		/*
2629 		 * kick backend in case it missed any tx request
2630 		 * in the TX ring buffer
2631 		 */
2632 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
2633 
2634 		/*
2635 		 * there maybe already queued rx data in the RX ring
2636 		 * sent by backend after it gets connected but before
2637 		 * we see its state change here, so we call our intr
2638 		 * handling routine to handle them, if any
2639 		 */
2640 		(void) xnf_intr((caddr_t)xnfp);
2641 
2642 		/* mark as link up after get connected */
2643 		mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
2644 
2645 		break;
2646 
2647 	default:
2648 		break;
2649 	}
2650 }
2651 
2652 /*
2653  * Check whether backend is capable of and willing to talk
2654  * to us via hypervisor copy, as opposed to page flip.
2655  */
2656 static boolean_t
2657 xnf_hvcopy_peer_status(dev_info_t *devinfo)
2658 {
2659 	int	be_rx_copy;
2660 	int	err;
2661 
2662 	err = xenbus_scanf(XBT_NULL, xvdi_get_oename(devinfo),
2663 	    "feature-rx-copy", "%d", &be_rx_copy);
2664 	/*
2665 	 * If we fail to read the store we assume that the key is
2666 	 * absent, implying an older domain at the far end.  Older
2667 	 * domains cannot do HV copy (we assume ..).
2668 	 */
2669 	if (err != 0)
2670 		be_rx_copy = 0;
2671 
2672 	return (be_rx_copy?B_TRUE:B_FALSE);
2673 }
2674