xref: /illumos-gate/usr/src/uts/common/xen/io/xnb.c (revision 8380b3cc879a715dff53a0564cd5b1c4bf9ade62)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifdef DEBUG
28 #define	XNB_DEBUG 1
29 #endif /* DEBUG */
30 
31 #include "xnb.h"
32 
33 #include <sys/sunddi.h>
34 #include <sys/sunndi.h>
35 #include <sys/modctl.h>
36 #include <sys/conf.h>
37 #include <sys/mac.h>
38 #include <sys/dlpi.h>
39 #include <sys/strsubr.h>
40 #include <sys/strsun.h>
41 #include <sys/types.h>
42 #include <sys/pattr.h>
43 #include <vm/seg_kmem.h>
44 #include <vm/hat_i86.h>
45 #include <xen/sys/xenbus_impl.h>
46 #include <xen/sys/xendev.h>
47 #include <sys/balloon_impl.h>
48 #include <sys/evtchn_impl.h>
49 #include <sys/gnttab.h>
50 #include <vm/vm_dep.h>
51 
52 #include <sys/gld.h>
53 #include <inet/ip.h>
54 #include <inet/ip_impl.h>
55 #include <sys/vnic_impl.h> /* blech. */
56 
57 /*
58  * The terms "transmit" and "receive" are used in their traditional
59  * sense here - packets from other parts of this system are
60  * "transmitted" to the peer domain and those originating from the
61  * peer are "received".
62  *
63  * In some cases this can be confusing, because various data
64  * structures are shared with the domU driver, which has the opposite
65  * view of what constitutes "transmit" and "receive".  In naming the
66  * shared structures the domU driver always wins.
67  */
68 
69 /*
70  * XXPV dme: things to do, as well as various things indicated
71  * throughout the source:
72  * - copy avoidance outbound.
73  * - copy avoidance inbound.
74  * - transfer credit limiting.
75  * - MAC address based filtering.
76  */
77 
78 /*
79  * Linux expects to have some headroom in received buffers.  The Linux
80  * frontend driver (netfront) checks to see if the headroom is
81  * available and will re-allocate the buffer to make room if
82  * necessary.  To avoid this we add TX_BUFFER_HEADROOM bytes of
83  * headroom to each packet we pass to the peer.
84  */
85 #define	TX_BUFFER_HEADROOM	16
86 
/*
 * Should we attempt to defer checksum calculation?
 *
 * Copied into each instance by xnb_attach(), which also uses the
 * per-instance copy to publish "feature-no-csum-offload" in xenstore.
 */
static boolean_t	xnb_cksum_offload = B_TRUE;
/*
 * When receiving packets from a guest, should they be copied
 * or used as-is (esballoc)?
 *
 * Copied into each instance by xnb_attach().
 */
static boolean_t	xnb_rx_always_copy = B_TRUE;
96 
/*
 * Forward declarations.  The static routines are defined elsewhere in
 * this file.
 */

/*
 * Ring connection management and the event handlers that xnb_attach()
 * registers for XS_OE_STATE and XS_HP_STATE.
 */
static boolean_t	xnb_connect_rings(dev_info_t *);
static void		xnb_disconnect_rings(dev_info_t *);
static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);

/*
 * Handling of xnb_rxbuf_t buffers and of packets arriving from the
 * peer ("receive" in this driver's terminology).
 */
static int	xnb_rxbuf_constructor(void *, void *, int);
static void	xnb_rxbuf_destructor(void *, void *);
static xnb_rxbuf_t *xnb_rxbuf_get(xnb_t *, int);
static void	xnb_rxbuf_put(xnb_t *, xnb_rxbuf_t *);
static void	xnb_rx_notify_peer(xnb_t *);
static void	xnb_rx_complete(xnb_rxbuf_t *);
static void	xnb_rx_mark_complete(xnb_t *, RING_IDX, int16_t);
static void 	xnb_rx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *,
    xnb_rxbuf_t *);
static void	xnb_rx_perform_pending_unmop(xnb_t *);
/* Not static: hands a chain of packets to the peer. */
mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);
115 
/*
 * Low and high water marks, expressed relative to the TX ring size:
 * one quarter of NET_TX_RING_SIZE, and NET_TX_RING_SIZE less one
 * quarter, respectively.
 * NOTE(review): presumably these bound the batching of pending unmap
 * operations (see xnb_rx_schedule_unmop()) - confirm against the users.
 */
int		xnb_unmop_lowwat = NET_TX_RING_SIZE >> 2;
int		xnb_unmop_hiwat = NET_TX_RING_SIZE - (NET_TX_RING_SIZE >> 2);


/*
 * Global settings that xnb_attach() advertises to the peer through
 * xenstore: xnb_hv_copy as "feature-rx-copy" and
 * xnb_explicit_pageflip_set as "feature-rx-flip".
 */
boolean_t	xnb_hv_copy = B_TRUE;
boolean_t	xnb_explicit_pageflip_set = B_FALSE;

/* XXPV dme: are these really invalid? */
#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define	INVALID_GRANT_REF	((grant_ref_t)-1)

/* NOTE(review): presumably the kmem cache for xnb_rxbuf_t - confirm. */
static kmem_cache_t *xnb_rxbuf_cachep;
/* Held by xnb_alloc_page() around its shared, static batch of pages. */
static kmutex_t	xnb_alloc_page_lock;
129 
/*
 * Statistics.
 *
 * Names of the entries in the "aux_statistics" named kstat.
 * xnb_ks_init() creates one kstat entry per element of this array and
 * xnb_ks_aux_update() fills the values in positionally, so the order
 * here must match the order of the assignments in that function.
 */
static char *aux_statistics[] = {
	"tx_cksum_deferred",
	"rx_cksum_no_need",
	"tx_rsp_notok",
	"tx_notify_deferred",
	"tx_notify_sent",
	"rx_notify_deferred",
	"rx_notify_sent",
	"tx_too_early",
	"rx_too_early",
	"rx_allocb_failed",
	"tx_allocb_failed",
	"tx_foreign_page",
	"mac_full",
	"spurious_intr",
	"allocation_success",
	"allocation_failure",
	"small_allocation_success",
	"small_allocation_failure",
	"other_allocation_failure",
	"tx_pageboundary_crossed",
	"tx_cpoparea_grown",
	"csum_hardware",
	"csum_software",
};
158 
159 static int
160 xnb_ks_aux_update(kstat_t *ksp, int flag)
161 {
162 	xnb_t *xnbp;
163 	kstat_named_t *knp;
164 
165 	if (flag != KSTAT_READ)
166 		return (EACCES);
167 
168 	xnbp = ksp->ks_private;
169 	knp = ksp->ks_data;
170 
171 	/*
172 	 * Assignment order should match that of the names in
173 	 * aux_statistics.
174 	 */
175 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_deferred;
176 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_no_need;
177 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_rsp_notok;
178 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
179 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
180 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
181 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
182 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
183 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
184 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
185 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
186 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_foreign_page;
187 	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
188 	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
189 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
190 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
191 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
192 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
193 	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
194 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_pagebndry_crossed;
195 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cpoparea_grown;
196 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
197 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
198 
199 	return (0);
200 }
201 
202 static boolean_t
203 xnb_ks_init(xnb_t *xnbp)
204 {
205 	int nstat = sizeof (aux_statistics) /
206 	    sizeof (aux_statistics[0]);
207 	char **cp = aux_statistics;
208 	kstat_named_t *knp;
209 
210 	/*
211 	 * Create and initialise kstats.
212 	 */
213 	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
214 	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
215 	    KSTAT_TYPE_NAMED, nstat, 0);
216 	if (xnbp->xnb_kstat_aux == NULL)
217 		return (B_FALSE);
218 
219 	xnbp->xnb_kstat_aux->ks_private = xnbp;
220 	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
221 
222 	knp = xnbp->xnb_kstat_aux->ks_data;
223 	while (nstat > 0) {
224 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
225 
226 		knp++;
227 		cp++;
228 		nstat--;
229 	}
230 
231 	kstat_install(xnbp->xnb_kstat_aux);
232 
233 	return (B_TRUE);
234 }
235 
/*
 * Delete the "aux_statistics" kstat created by xnb_ks_init().
 * The pointer is passed to kstat_delete() unchecked, so this is only
 * called once xnb_ks_init() has succeeded.
 */
static void
xnb_ks_free(xnb_t *xnbp)
{
	kstat_delete(xnbp->xnb_kstat_aux);
}
241 
/*
 * Software checksum calculation and insertion for an arbitrary packet.
 *
 * xnbp is unused.  Returns whatever vnic_fix_cksum() returns for mp.
 */
/*ARGSUSED*/
static mblk_t *
xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
{
	/*
	 * XXPV dme: shouldn't rely on vnic_fix_cksum(), not least
	 * because it doesn't cover all of the interesting cases :-(
	 */
	/*
	 * Tag the message with HCK_FULLCKSUM before handing it over.
	 * NOTE(review): presumably vnic_fix_cksum() keys off this flag
	 * to decide that a checksum must be computed - confirm.  The
	 * result of hcksum_assoc() is deliberately ignored.
	 */
	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
	    HCK_FULLCKSUM, KM_NOSLEEP);

	return (vnic_fix_cksum(mp));
}
258 
259 mblk_t *
260 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
261 {
262 	struct ether_header *ehp;
263 	uint16_t sap;
264 	uint32_t offset;
265 	ipha_t *ipha;
266 
267 	ASSERT(mp->b_next == NULL);
268 
269 	/*
270 	 * Check that the packet is contained in a single mblk.  In
271 	 * the "from peer" path this is true today, but will change
272 	 * when scatter gather support is added.  In the "to peer"
273 	 * path we cannot be sure, but in most cases it will be true
274 	 * (in the xnbo case the packet has come from a MAC device
275 	 * which is unlikely to split packets).
276 	 */
277 	if (mp->b_cont != NULL)
278 		goto software;
279 
280 	/*
281 	 * If the MAC has no hardware capability don't do any further
282 	 * checking.
283 	 */
284 	if (capab == 0)
285 		goto software;
286 
287 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
288 	ehp = (struct ether_header *)mp->b_rptr;
289 
290 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
291 		struct ether_vlan_header *evhp;
292 
293 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
294 		evhp = (struct ether_vlan_header *)mp->b_rptr;
295 		sap = ntohs(evhp->ether_type);
296 		offset = sizeof (struct ether_vlan_header);
297 	} else {
298 		sap = ntohs(ehp->ether_type);
299 		offset = sizeof (struct ether_header);
300 	}
301 
302 	/*
303 	 * We only attempt to do IPv4 packets in hardware.
304 	 */
305 	if (sap != ETHERTYPE_IP)
306 		goto software;
307 
308 	/*
309 	 * We know that this is an IPv4 packet.
310 	 */
311 	ipha = (ipha_t *)(mp->b_rptr + offset);
312 
313 	switch (ipha->ipha_protocol) {
314 	case IPPROTO_TCP:
315 	case IPPROTO_UDP: {
316 		uint32_t start, length, stuff, cksum;
317 		uint16_t *stuffp;
318 
319 		/*
320 		 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
321 		 * can use full IPv4 and partial checksum offload.
322 		 */
323 		if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
324 			break;
325 
326 		start = IP_SIMPLE_HDR_LENGTH;
327 		length = ntohs(ipha->ipha_length);
328 		if (ipha->ipha_protocol == IPPROTO_TCP) {
329 			stuff = start + TCP_CHECKSUM_OFFSET;
330 			cksum = IP_TCP_CSUM_COMP;
331 		} else {
332 			stuff = start + UDP_CHECKSUM_OFFSET;
333 			cksum = IP_UDP_CSUM_COMP;
334 		}
335 		stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);
336 
337 		if (capab & HCKSUM_INET_FULL_V4) {
338 			/*
339 			 * Some devices require that the checksum
340 			 * field of the packet is zero for full
341 			 * offload.
342 			 */
343 			*stuffp = 0;
344 
345 			(void) hcksum_assoc(mp, NULL, NULL,
346 			    0, 0, 0, 0,
347 			    HCK_FULLCKSUM, KM_NOSLEEP);
348 
349 			xnbp->xnb_stat_csum_hardware++;
350 
351 			return (mp);
352 		}
353 
354 		if (capab & HCKSUM_INET_PARTIAL) {
355 			if (*stuffp == 0) {
356 				ipaddr_t src, dst;
357 
358 				/*
359 				 * Older Solaris guests don't insert
360 				 * the pseudo-header checksum, so we
361 				 * calculate it here.
362 				 */
363 				src = ipha->ipha_src;
364 				dst = ipha->ipha_dst;
365 
366 				cksum += (dst >> 16) + (dst & 0xFFFF);
367 				cksum += (src >> 16) + (src & 0xFFFF);
368 				cksum += length - IP_SIMPLE_HDR_LENGTH;
369 
370 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
371 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
372 
373 				ASSERT(cksum <= 0xFFFF);
374 
375 				*stuffp = (uint16_t)(cksum ? cksum : ~cksum);
376 			}
377 
378 			(void) hcksum_assoc(mp, NULL, NULL,
379 			    start, stuff, length, 0,
380 			    HCK_PARTIALCKSUM, KM_NOSLEEP);
381 
382 			xnbp->xnb_stat_csum_hardware++;
383 
384 			return (mp);
385 		}
386 
387 		/* NOTREACHED */
388 		break;
389 	}
390 
391 	default:
392 		/* Use software. */
393 		break;
394 	}
395 
396 software:
397 	/*
398 	 * We are not able to use any offload so do the whole thing in
399 	 * software.
400 	 */
401 	xnbp->xnb_stat_csum_software++;
402 
403 	return (xnb_software_csum(xnbp, mp));
404 }
405 
406 int
407 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
408 {
409 	xnb_t *xnbp;
410 	char *xsname, mac[ETHERADDRL * 3];
411 
412 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
413 
414 	xnbp->xnb_flavour = flavour;
415 	xnbp->xnb_flavour_data = flavour_data;
416 	xnbp->xnb_devinfo = dip;
417 	xnbp->xnb_evtchn = INVALID_EVTCHN;
418 	xnbp->xnb_irq = B_FALSE;
419 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
420 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
421 	xnbp->xnb_cksum_offload = xnb_cksum_offload;
422 	xnbp->xnb_connected = B_FALSE;
423 	xnbp->xnb_hotplugged = B_FALSE;
424 	xnbp->xnb_detachable = B_FALSE;
425 	xnbp->xnb_peer = xvdi_get_oeid(dip);
426 	xnbp->xnb_rx_pages_writable = B_FALSE;
427 	xnbp->xnb_rx_always_copy = xnb_rx_always_copy;
428 
429 	xnbp->xnb_rx_buf_count = 0;
430 	xnbp->xnb_rx_unmop_count = 0;
431 
432 	xnbp->xnb_hv_copy = B_FALSE;
433 
434 	xnbp->xnb_tx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
435 	ASSERT(xnbp->xnb_tx_va != NULL);
436 
437 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
438 	    != DDI_SUCCESS)
439 		goto failure;
440 
441 	/* allocated on demand, when/if we enter xnb_copy_to_peer() */
442 	xnbp->xnb_tx_cpop = NULL;
443 	xnbp->xnb_cpop_sz = 0;
444 
445 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
446 	    xnbp->xnb_icookie);
447 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
448 	    xnbp->xnb_icookie);
449 
450 	/* set driver private pointer now */
451 	ddi_set_driver_private(dip, xnbp);
452 
453 	if (!xnb_ks_init(xnbp))
454 		goto failure_1;
455 
456 	/*
457 	 * Receive notification of changes in the state of the
458 	 * driver in the guest domain.
459 	 */
460 	if (xvdi_add_event_handler(dip, XS_OE_STATE,
461 	    xnb_oe_state_change) != DDI_SUCCESS)
462 		goto failure_2;
463 
464 	/*
465 	 * Receive notification of hotplug events.
466 	 */
467 	if (xvdi_add_event_handler(dip, XS_HP_STATE,
468 	    xnb_hp_state_change) != DDI_SUCCESS)
469 		goto failure_2;
470 
471 	xsname = xvdi_get_xsname(dip);
472 
473 	if (xenbus_printf(XBT_NULL, xsname,
474 	    "feature-no-csum-offload", "%d",
475 	    xnbp->xnb_cksum_offload ? 0 : 1) != 0)
476 		goto failure_3;
477 
478 	/*
479 	 * Use global xnb_hv_copy to export this feature. This means that
480 	 * we have to decide what to do before starting up a guest domain
481 	 */
482 	if (xenbus_printf(XBT_NULL, xsname,
483 	    "feature-rx-copy", "%d", xnb_hv_copy ? 1 : 0) != 0)
484 		goto failure_3;
485 	/*
486 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
487 	 * in addition to "feature-rx-copy" being 1. It seems strange
488 	 * to use four possible states to describe a binary decision,
489 	 * but we might as well play nice.
490 	 */
491 	if (xenbus_printf(XBT_NULL, xsname,
492 	    "feature-rx-flip", "%d", xnb_explicit_pageflip_set ? 1 : 0) != 0)
493 		goto failure_3;
494 
495 	if (xenbus_scanf(XBT_NULL, xsname,
496 	    "mac", "%s", mac) != 0) {
497 		cmn_err(CE_WARN, "xnb_attach: "
498 		    "cannot read mac address from %s",
499 		    xsname);
500 		goto failure_3;
501 	}
502 
503 	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
504 		cmn_err(CE_WARN,
505 		    "xnb_attach: cannot parse mac address %s",
506 		    mac);
507 		goto failure_3;
508 	}
509 
510 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
511 	(void) xvdi_post_event(dip, XEN_HP_ADD);
512 
513 	return (DDI_SUCCESS);
514 
515 failure_3:
516 	xvdi_remove_event_handler(dip, NULL);
517 
518 failure_2:
519 	xnb_ks_free(xnbp);
520 
521 failure_1:
522 	mutex_destroy(&xnbp->xnb_rx_lock);
523 	mutex_destroy(&xnbp->xnb_tx_lock);
524 
525 failure:
526 	vmem_free(heap_arena, xnbp->xnb_tx_va, PAGESIZE);
527 	kmem_free(xnbp, sizeof (*xnbp));
528 	return (DDI_FAILURE);
529 }
530 
/*
 * Release everything that xnb_attach() set up for this instance.
 *
 * The instance must no longer be connected to its peer and must have
 * no receive buffers outstanding; both conditions are asserted.
 */
/*ARGSUSED*/
void
xnb_detach(dev_info_t *dip)
{
	xnb_t *xnbp = ddi_get_driver_private(dip);

	ASSERT(xnbp != NULL);
	ASSERT(!xnbp->xnb_connected);
	ASSERT(xnbp->xnb_rx_buf_count == 0);

	xnb_disconnect_rings(dip);

	/* A NULL name removes all handlers registered in xnb_attach(). */
	xvdi_remove_event_handler(dip, NULL);

	xnb_ks_free(xnbp);

	ddi_set_driver_private(dip, NULL);

	mutex_destroy(&xnbp->xnb_tx_lock);
	mutex_destroy(&xnbp->xnb_rx_lock);

	/* The copy descriptor array only exists if it was ever grown. */
	if (xnbp->xnb_cpop_sz > 0)
		kmem_free(xnbp->xnb_tx_cpop, sizeof (*xnbp->xnb_tx_cpop)
		    * xnbp->xnb_cpop_sz);

	ASSERT(xnbp->xnb_tx_va != NULL);
	vmem_free(heap_arena, xnbp->xnb_tx_va, PAGESIZE);

	kmem_free(xnbp, sizeof (*xnbp));
}
561 
562 
563 static mfn_t
564 xnb_alloc_page(xnb_t *xnbp)
565 {
566 #define	WARNING_RATE_LIMIT 100
567 #define	BATCH_SIZE 256
568 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
569 	static int nth = BATCH_SIZE;
570 	mfn_t mfn;
571 
572 	mutex_enter(&xnb_alloc_page_lock);
573 	if (nth == BATCH_SIZE) {
574 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
575 			xnbp->xnb_stat_allocation_failure++;
576 			mutex_exit(&xnb_alloc_page_lock);
577 
578 			/*
579 			 * Try for a single page in low memory situations.
580 			 */
581 			if (balloon_alloc_pages(1, &mfn) != 1) {
582 				if ((xnbp->xnb_stat_small_allocation_failure++
583 				    % WARNING_RATE_LIMIT) == 0)
584 					cmn_err(CE_WARN, "xnb_alloc_page: "
585 					    "Cannot allocate memory to "
586 					    "transfer packets to peer.");
587 				return (0);
588 			} else {
589 				xnbp->xnb_stat_small_allocation_success++;
590 				return (mfn);
591 			}
592 		}
593 
594 		nth = 0;
595 		xnbp->xnb_stat_allocation_success++;
596 	}
597 
598 	mfn = mfns[nth++];
599 	mutex_exit(&xnb_alloc_page_lock);
600 
601 	ASSERT(mfn != 0);
602 
603 	return (mfn);
604 #undef BATCH_SIZE
605 #undef WARNING_RATE_LIMIT
606 }
607 
608 /*ARGSUSED*/
609 static void
610 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
611 {
612 	int r;
613 	pfn_t pfn;
614 
615 	pfn = xen_assign_pfn(mfn);
616 	pfnzero(pfn, 0, PAGESIZE);
617 	xen_release_pfn(pfn);
618 
619 	/*
620 	 * This happens only in the error path, so batching is
621 	 * not worth the complication.
622 	 */
623 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
624 		cmn_err(CE_WARN, "free_page: cannot decrease memory "
625 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
626 		    r, mfn);
627 	}
628 }
629 
/*
 * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but
 * using local variables.
 *
 * The expansion refers to 'loop' and 'prod', which must be variables
 * in scope at the point of use (the caller's working copies of the
 * request consumer and response producer indices).  It evaluates to
 * the smaller of (req_prod - loop) and (RING_SIZE - (loop - prod)),
 * so it is non-zero while there is a request left to process.
 * The argument _r is evaluated more than once.
 */
#define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
	((((_r)->sring->req_prod - loop) <		\
		(RING_SIZE(_r) - (loop - prod))) ?	\
	    ((_r)->sring->req_prod - loop) :		\
	    (RING_SIZE(_r) - (loop - prod)))
639 
640 mblk_t *
641 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
642 {
643 	mblk_t *free = mp, *prev = NULL;
644 	size_t len;
645 	gnttab_transfer_t *gop;
646 	boolean_t notify;
647 	RING_IDX loop, prod, end;
648 
649 	/*
650 	 * For each packet the sequence of operations is:
651 	 *
652 	 * 1. get a new page from the hypervisor.
653 	 * 2. get a request slot from the ring.
654 	 * 3. copy the data into the new page.
655 	 * 4. transfer the page to the peer.
656 	 * 5. update the request slot.
657 	 * 6. kick the peer.
658 	 * 7. free mp.
659 	 *
660 	 * In order to reduce the number of hypercalls, we prepare
661 	 * several packets for the peer and perform a single hypercall
662 	 * to transfer them.
663 	 */
664 
665 	mutex_enter(&xnbp->xnb_tx_lock);
666 
667 	/*
668 	 * If we are not connected to the peer or have not yet
669 	 * finished hotplug it is too early to pass packets to the
670 	 * peer.
671 	 */
672 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
673 		mutex_exit(&xnbp->xnb_tx_lock);
674 		DTRACE_PROBE(flip_tx_too_early);
675 		xnbp->xnb_stat_tx_too_early++;
676 		return (mp);
677 	}
678 
679 	loop = xnbp->xnb_rx_ring.req_cons;
680 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
681 	gop = xnbp->xnb_tx_top;
682 
683 	while ((mp != NULL) &&
684 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
685 
686 		mfn_t mfn;
687 		pfn_t pfn;
688 		netif_rx_request_t *rxreq;
689 		netif_rx_response_t *rxresp;
690 		char *valoop;
691 		size_t offset;
692 		mblk_t *ml;
693 		uint16_t cksum_flags;
694 
695 		/* 1 */
696 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
697 			xnbp->xnb_stat_xmit_defer++;
698 			break;
699 		}
700 
701 		/* 2 */
702 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
703 
704 #ifdef XNB_DEBUG
705 		if (!(rxreq->id < NET_RX_RING_SIZE))
706 			cmn_err(CE_PANIC, "xnb_to_peer: "
707 			    "id %d out of range in request 0x%p",
708 			    rxreq->id, (void *)rxreq);
709 #endif /* XNB_DEBUG */
710 
711 		/* Assign a pfn and map the new page at the allocated va. */
712 		pfn = xen_assign_pfn(mfn);
713 		hat_devload(kas.a_hat, xnbp->xnb_tx_va, PAGESIZE,
714 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
715 
716 		offset = TX_BUFFER_HEADROOM;
717 
718 		/* 3 */
719 		len = 0;
720 		valoop = xnbp->xnb_tx_va + offset;
721 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
722 			size_t chunk = ml->b_wptr - ml->b_rptr;
723 
724 			bcopy(ml->b_rptr, valoop, chunk);
725 			valoop += chunk;
726 			len += chunk;
727 		}
728 
729 		ASSERT(len + offset < PAGESIZE);
730 
731 		/* Release the pfn. */
732 		hat_unload(kas.a_hat, xnbp->xnb_tx_va, PAGESIZE,
733 		    HAT_UNLOAD_UNMAP);
734 		xen_release_pfn(pfn);
735 
736 		/* 4 */
737 		gop->mfn = mfn;
738 		gop->domid = xnbp->xnb_peer;
739 		gop->ref = rxreq->gref;
740 
741 		/* 5.1 */
742 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
743 		rxresp->offset = offset;
744 		rxresp->flags = 0;
745 
746 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
747 		if (cksum_flags != 0)
748 			xnbp->xnb_stat_tx_cksum_deferred++;
749 		rxresp->flags |= cksum_flags;
750 
751 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
752 		rxresp->status = len;
753 
754 		loop++;
755 		prod++;
756 		gop++;
757 		prev = mp;
758 		mp = mp->b_next;
759 	}
760 
761 	/*
762 	 * Did we actually do anything?
763 	 */
764 	if (loop == xnbp->xnb_rx_ring.req_cons) {
765 		mutex_exit(&xnbp->xnb_tx_lock);
766 		return (mp);
767 	}
768 
769 	end = loop;
770 
771 	/*
772 	 * Unlink the end of the 'done' list from the remainder.
773 	 */
774 	ASSERT(prev != NULL);
775 	prev->b_next = NULL;
776 
777 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_tx_top,
778 	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
779 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
780 	}
781 
782 	loop = xnbp->xnb_rx_ring.req_cons;
783 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
784 	gop = xnbp->xnb_tx_top;
785 
786 	while (loop < end) {
787 		int16_t status = NETIF_RSP_OKAY;
788 
789 		if (gop->status != 0) {
790 			status = NETIF_RSP_ERROR;
791 
792 			/*
793 			 * If the status is anything other than
794 			 * GNTST_bad_page then we don't own the page
795 			 * any more, so don't try to give it back.
796 			 */
797 			if (gop->status != GNTST_bad_page)
798 				gop->mfn = 0;
799 		} else {
800 			/* The page is no longer ours. */
801 			gop->mfn = 0;
802 		}
803 
804 		if (gop->mfn != 0)
805 			/*
806 			 * Give back the page, as we won't be using
807 			 * it.
808 			 */
809 			xnb_free_page(xnbp, gop->mfn);
810 		else
811 			/*
812 			 * We gave away a page, update our accounting
813 			 * now.
814 			 */
815 			balloon_drv_subtracted(1);
816 
817 		/* 5.2 */
818 		if (status != NETIF_RSP_OKAY) {
819 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
820 			    status;
821 		} else {
822 			xnbp->xnb_stat_opackets++;
823 			xnbp->xnb_stat_obytes += len;
824 		}
825 
826 		loop++;
827 		prod++;
828 		gop++;
829 	}
830 
831 	xnbp->xnb_rx_ring.req_cons = loop;
832 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
833 
834 	/* 6 */
835 	/* LINTED: constant in conditional context */
836 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
837 	if (notify) {
838 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
839 		xnbp->xnb_stat_tx_notify_sent++;
840 	} else {
841 		xnbp->xnb_stat_tx_notify_deferred++;
842 	}
843 
844 	if (mp != NULL)
845 		xnbp->xnb_stat_xmit_defer++;
846 
847 	mutex_exit(&xnbp->xnb_tx_lock);
848 
849 	/* Free mblk_t's that we consumed. */
850 	freemsgchain(free);
851 
852 	return (mp);
853 }
854 
855 /* helper functions for xnb_copy_to_peer */
856 
857 /*
858  * Grow the array of copy operation descriptors.
859  * Returns a pointer to the next available entry.
860  */
861 gnttab_copy_t *
862 grow_cpop_area(xnb_t *xnbp, gnttab_copy_t *o_cpop)
863 {
864 	/*
865 	 * o_cpop (arg.1) is a ptr to the area we would like to copy
866 	 * something into but cannot, because we haven't alloc'ed it
867 	 * yet, or NULL.
868 	 * old_cpop and new_cpop (local) are pointers to old/new
869 	 * versions of xnbp->xnb_tx_cpop.
870 	 */
871 	gnttab_copy_t	*new_cpop, *old_cpop, *ret_cpop;
872 	size_t		newcount;
873 
874 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
875 
876 	old_cpop = xnbp->xnb_tx_cpop;
877 	/*
878 	 * o_cpop is a pointer into the array pointed to by old_cpop;
879 	 * it would be an error for exactly one of these pointers to be NULL.
880 	 * We shouldn't call this function if xnb_tx_cpop has already
881 	 * been allocated, but we're starting to fill it from the beginning
882 	 * again.
883 	 */
884 	ASSERT((o_cpop == NULL && old_cpop == NULL) ||
885 	    (o_cpop != NULL && old_cpop != NULL && o_cpop != old_cpop));
886 
887 	newcount = xnbp->xnb_cpop_sz + CPOP_DEFCNT;
888 
889 	new_cpop = kmem_alloc(sizeof (*new_cpop) * newcount, KM_NOSLEEP);
890 	if (new_cpop == NULL) {
891 		xnbp->xnb_stat_other_allocation_failure++;
892 		return (NULL);
893 	}
894 
895 	if (o_cpop != NULL) {
896 		size_t	 offset = (o_cpop - old_cpop);
897 
898 		/* we only need to move the parts in use ... */
899 		(void) memmove(new_cpop, old_cpop, xnbp->xnb_cpop_sz *
900 		    (sizeof (*old_cpop)));
901 
902 		kmem_free(old_cpop, xnbp->xnb_cpop_sz * sizeof (*old_cpop));
903 
904 		ret_cpop = new_cpop + offset;
905 	} else {
906 		ret_cpop = new_cpop;
907 	}
908 
909 	xnbp->xnb_tx_cpop = new_cpop;
910 	xnbp->xnb_cpop_sz = newcount;
911 
912 	xnbp->xnb_stat_tx_cpoparea_grown++;
913 
914 	return (ret_cpop);
915 }
916 
917 /*
918  * Check whether an address is on a page that's foreign to this domain.
919  */
920 static boolean_t
921 is_foreign(void *addr)
922 {
923 	pfn_t	pfn = hat_getpfnum(kas.a_hat, addr);
924 
925 	return (pfn & PFN_IS_FOREIGN_MFN ? B_TRUE : B_FALSE);
926 }
927 
928 /*
929  * Insert a newly allocated mblk into a chain, replacing the old one.
930  */
931 static mblk_t *
932 replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
933 {
934 	uint32_t	start, stuff, end, value, flags;
935 	mblk_t		*new_mp;
936 
937 	new_mp = copyb(mp);
938 	if (new_mp == NULL)
939 		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message"
940 		    "for %p, len %lu", (void *) mp, len);
941 
942 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
943 	(void) hcksum_assoc(new_mp, NULL, NULL, start, stuff, end, value,
944 	    flags, KM_NOSLEEP);
945 
946 	new_mp->b_next = mp->b_next;
947 	new_mp->b_prev = mp->b_prev;
948 	new_mp->b_cont = mp->b_cont;
949 
950 	/* Make sure we only overwrite pointers to the mblk being replaced. */
951 	if (mp_prev != NULL && mp_prev->b_next == mp)
952 		mp_prev->b_next = new_mp;
953 
954 	if (ml_prev != NULL && ml_prev->b_cont == mp)
955 		ml_prev->b_cont = new_mp;
956 
957 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
958 	freemsg(mp);
959 
960 	return (new_mp);
961 }
962 
963 /*
964  * Set all the fields in a gnttab_copy_t.
965  */
966 static void
967 setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
968     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
969 {
970 	ASSERT(xnbp != NULL && gp != NULL);
971 
972 	gp->source.offset = s_off;
973 	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
974 	gp->source.domid = DOMID_SELF;
975 
976 	gp->len = (uint16_t)len;
977 	gp->flags = GNTCOPY_dest_gref;
978 	gp->status = 0;
979 
980 	gp->dest.u.ref = d_ref;
981 	gp->dest.offset = d_off;
982 	gp->dest.domid = xnbp->xnb_peer;
983 }
984 
985 mblk_t *
986 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
987 {
988 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
989 	mblk_t		*ml, *ml_prev;
990 	gnttab_copy_t	*gop_cp;
991 	boolean_t	notify;
992 	RING_IDX	loop, prod;
993 	int		i;
994 
995 	if (!xnbp->xnb_hv_copy)
996 		return (xnb_to_peer(xnbp, mp));
997 
998 	/*
999 	 * For each packet the sequence of operations is:
1000 	 *
1001 	 *  1. get a request slot from the ring.
1002 	 *  2. set up data for hypercall (see NOTE below)
1003 	 *  3. have the hypervisore copy the data
1004 	 *  4. update the request slot.
1005 	 *  5. kick the peer.
1006 	 *
1007 	 * NOTE ad 2.
1008 	 *  In order to reduce the number of hypercalls, we prepare
1009 	 *  several packets (mp->b_cont != NULL) for the peer and
1010 	 *  perform a single hypercall to transfer them.
1011 	 *  We also have to set up a seperate copy operation for
1012 	 *  every page.
1013 	 *
1014 	 * If we have more than one message (mp->b_next != NULL),
1015 	 * we do this whole dance repeatedly.
1016 	 */
1017 
1018 	mutex_enter(&xnbp->xnb_tx_lock);
1019 
1020 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
1021 		mutex_exit(&xnbp->xnb_tx_lock);
1022 		DTRACE_PROBE(copy_tx_too_early);
1023 		xnbp->xnb_stat_tx_too_early++;
1024 		return (mp);
1025 	}
1026 
1027 	loop = xnbp->xnb_rx_ring.req_cons;
1028 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
1029 
1030 	while ((mp != NULL) &&
1031 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
1032 		netif_rx_request_t	*rxreq;
1033 		netif_rx_response_t	*rxresp;
1034 		size_t			offset, d_offset;
1035 		size_t			len;
1036 		uint16_t		cksum_flags;
1037 		int16_t			status = NETIF_RSP_OKAY;
1038 		int			item_count;
1039 
1040 		/* 1 */
1041 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
1042 
1043 #ifdef XNB_DEBUG
1044 		if (!(rxreq->id < NET_RX_RING_SIZE))
1045 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
1046 			    "id %d out of range in request 0x%p",
1047 			    rxreq->id, (void *)rxreq);
1048 #endif /* XNB_DEBUG */
1049 
1050 		/* 2 */
1051 		d_offset = offset = TX_BUFFER_HEADROOM;
1052 		len = 0;
1053 		item_count = 0;
1054 
1055 		gop_cp = xnbp->xnb_tx_cpop;
1056 
1057 		/*
1058 		 * We walk the b_cont pointers and set up a gop_cp
1059 		 * structure for every page in every data block we have.
1060 		 */
1061 		/* 2a */
1062 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
1063 			size_t	chunk = ml->b_wptr - ml->b_rptr;
1064 			uchar_t	*r_tmp,	*rpt_align;
1065 			size_t	r_offset;
1066 
1067 			/*
1068 			 * If we get an mblk on a page that doesn't belong to
1069 			 * this domain, get a new mblk to replace the old one.
1070 			 */
1071 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
1072 				mblk_t *ml_new = replace_msg(ml, chunk,
1073 				    mp_prev, ml_prev);
1074 
1075 				/* We can still use old ml, but not *ml! */
1076 				if (free == ml)
1077 					free = ml_new;
1078 				if (mp == ml)
1079 					mp = ml_new;
1080 				ml = ml_new;
1081 
1082 				xnbp->xnb_stat_tx_foreign_page++;
1083 			}
1084 
1085 			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
1086 			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
1087 			r_tmp = ml->b_rptr;
1088 
1089 			if (d_offset + chunk > PAGESIZE)
1090 				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
1091 				    "(svd: %p), ml %p,rpt_alg. %p, d_offset "
1092 				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
1093 				    (void *)mp, (void *)saved_mp, (void *)ml,
1094 				    (void *)rpt_align,
1095 				    d_offset, chunk, (int)PAGESIZE);
1096 
1097 			while (chunk > 0) {
1098 				size_t part_len;
1099 
1100 				item_count++;
1101 				if (item_count > xnbp->xnb_cpop_sz) {
1102 					gop_cp = grow_cpop_area(xnbp, gop_cp);
1103 					if (gop_cp == NULL)
1104 						goto failure;
1105 				}
1106 				/*
1107 				 * If our mblk crosses a page boundary, we need
1108 				 * to do a seperate copy for every page.
1109 				 */
1110 				if (r_offset + chunk > PAGESIZE) {
1111 					part_len = PAGESIZE - r_offset;
1112 
1113 					DTRACE_PROBE3(mblk_page_crossed,
1114 					    (mblk_t *), ml, int, chunk, int,
1115 					    (int)r_offset);
1116 
1117 					xnbp->xnb_stat_tx_pagebndry_crossed++;
1118 				} else {
1119 					part_len = chunk;
1120 				}
1121 
1122 				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
1123 				    d_offset, part_len, rxreq->gref);
1124 
1125 				chunk -= part_len;
1126 
1127 				len += part_len;
1128 				d_offset += part_len;
1129 				r_tmp += part_len;
1130 				/*
1131 				 * The 2nd, 3rd ... last copies will always
1132 				 * start at r_tmp, therefore r_offset is 0.
1133 				 */
1134 				r_offset = 0;
1135 				gop_cp++;
1136 			}
1137 			ml_prev = ml;
1138 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
1139 			    chunk, int, len, int, item_count);
1140 		}
1141 		/* 3 */
1142 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_tx_cpop,
1143 		    item_count) != 0) {
1144 			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
1145 			DTRACE_PROBE(HV_granttableopfailed);
1146 		}
1147 
1148 		/* 4 */
1149 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
1150 		rxresp->offset = offset;
1151 
1152 		rxresp->flags = 0;
1153 
1154 		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
1155 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1156 		    (int)rxresp->status);
1157 
1158 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
1159 		if (cksum_flags != 0)
1160 			xnbp->xnb_stat_tx_cksum_deferred++;
1161 		rxresp->flags |= cksum_flags;
1162 
1163 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
1164 		rxresp->status = len;
1165 
1166 		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
1167 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1168 		    (int)rxresp->status);
1169 
1170 		for (i = 0; i < item_count; i++) {
1171 			if (xnbp->xnb_tx_cpop[i].status != 0) {
1172 				DTRACE_PROBE2(cpop__status__nonnull, int,
1173 				    (int)xnbp->xnb_tx_cpop[i].status,
1174 				    int, i);
1175 				status = NETIF_RSP_ERROR;
1176 			}
1177 		}
1178 
1179 		/* 5.2 */
1180 		if (status != NETIF_RSP_OKAY) {
1181 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
1182 			    status;
1183 			xnbp->xnb_stat_tx_rsp_notok++;
1184 		} else {
1185 			xnbp->xnb_stat_opackets++;
1186 			xnbp->xnb_stat_obytes += len;
1187 		}
1188 
1189 		loop++;
1190 		prod++;
1191 		mp_prev = mp;
1192 		mp = mp->b_next;
1193 	}
1194 failure:
1195 	/*
1196 	 * Did we actually do anything?
1197 	 */
1198 	if (loop == xnbp->xnb_rx_ring.req_cons) {
1199 		mutex_exit(&xnbp->xnb_tx_lock);
1200 		return (mp);
1201 	}
1202 
1203 	/*
1204 	 * Unlink the end of the 'done' list from the remainder.
1205 	 */
1206 	ASSERT(mp_prev != NULL);
1207 	mp_prev->b_next = NULL;
1208 
1209 	xnbp->xnb_rx_ring.req_cons = loop;
1210 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
1211 
1212 	/* 6 */
1213 	/* LINTED: constant in conditional context */
1214 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
1215 	if (notify) {
1216 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1217 		xnbp->xnb_stat_tx_notify_sent++;
1218 	} else {
1219 		xnbp->xnb_stat_tx_notify_deferred++;
1220 	}
1221 
1222 	if (mp != NULL)
1223 		xnbp->xnb_stat_xmit_defer++;
1224 
1225 	mutex_exit(&xnbp->xnb_tx_lock);
1226 
1227 	/* Free mblk_t structs we have consumed. */
1228 	freemsgchain(free);
1229 
1230 	return (mp);
1231 }
1232 
1233 /*ARGSUSED*/
1234 static int
1235 xnb_rxbuf_constructor(void *buf, void *arg, int kmflag)
1236 {
1237 	xnb_rxbuf_t *rxp = buf;
1238 
1239 	bzero(rxp, sizeof (*rxp));
1240 
1241 	rxp->xr_free_rtn.free_func = xnb_rx_complete;
1242 	rxp->xr_free_rtn.free_arg = (caddr_t)rxp;
1243 
1244 	rxp->xr_mop.host_addr =
1245 	    (uint64_t)(uintptr_t)vmem_alloc(heap_arena, PAGESIZE,
1246 	    ((kmflag & KM_NOSLEEP) == KM_NOSLEEP) ?
1247 	    VM_NOSLEEP : VM_SLEEP);
1248 
1249 	if (rxp->xr_mop.host_addr == NULL) {
1250 		cmn_err(CE_WARN, "xnb_rxbuf_constructor: "
1251 		    "cannot get address space");
1252 		return (-1);
1253 	}
1254 
1255 	/*
1256 	 * Have the hat ensure that page table exists for the VA.
1257 	 */
1258 	hat_prepare_mapping(kas.a_hat,
1259 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
1260 
1261 	return (0);
1262 }
1263 
1264 /*ARGSUSED*/
1265 static void
1266 xnb_rxbuf_destructor(void *buf, void *arg)
1267 {
1268 	xnb_rxbuf_t *rxp = buf;
1269 
1270 	ASSERT(rxp->xr_mop.host_addr != NULL);
1271 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
1272 
1273 	hat_release_mapping(kas.a_hat,
1274 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
1275 	vmem_free(heap_arena,
1276 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr, PAGESIZE);
1277 }
1278 
1279 static void
1280 xnb_rx_notify_peer(xnb_t *xnbp)
1281 {
1282 	boolean_t notify;
1283 
1284 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1285 
1286 	/* LINTED: constant in conditional context */
1287 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
1288 	if (notify) {
1289 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1290 		xnbp->xnb_stat_rx_notify_sent++;
1291 	} else {
1292 		xnbp->xnb_stat_rx_notify_deferred++;
1293 	}
1294 }
1295 
1296 static void
1297 xnb_rx_complete(xnb_rxbuf_t *rxp)
1298 {
1299 	xnb_t *xnbp = rxp->xr_xnbp;
1300 
1301 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
1302 
1303 	mutex_enter(&xnbp->xnb_rx_lock);
1304 	xnb_rx_schedule_unmop(xnbp, &rxp->xr_mop, rxp);
1305 	mutex_exit(&xnbp->xnb_rx_lock);
1306 }
1307 
1308 static void
1309 xnb_rx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
1310 {
1311 	RING_IDX i;
1312 	netif_tx_response_t *txresp;
1313 
1314 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1315 
1316 	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
1317 
1318 	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
1319 	txresp->id = id;
1320 	txresp->status = status;
1321 
1322 	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
1323 
1324 	/*
1325 	 * Note that we don't push the change to the peer here - that
1326 	 * is the callers responsibility.
1327 	 */
1328 }
1329 
1330 static void
1331 xnb_rx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop,
1332     xnb_rxbuf_t *rxp)
1333 {
1334 	gnttab_unmap_grant_ref_t	*unmop;
1335 	int				u_count;
1336 	int				reqs_on_ring;
1337 
1338 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1339 	ASSERT(xnbp->xnb_rx_unmop_count < NET_TX_RING_SIZE);
1340 
1341 	u_count = xnbp->xnb_rx_unmop_count++;
1342 
1343 	/* Cache data for the time when we actually unmap grant refs */
1344 	xnbp->xnb_rx_unmop_rxp[u_count] = rxp;
1345 
1346 	unmop = &xnbp->xnb_rx_unmop[u_count];
1347 	unmop->host_addr = mop->host_addr;
1348 	unmop->dev_bus_addr = mop->dev_bus_addr;
1349 	unmop->handle = mop->handle;
1350 
1351 	/*
1352 	 * We cannot check the ring once we're disconnected from it. Batching
1353 	 * doesn't seem to be a useful optimisation in this case either,
1354 	 * so we directly call into the actual unmap function.
1355 	 */
1356 	if (xnbp->xnb_connected) {
1357 		reqs_on_ring = RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_tx_ring);
1358 
1359 		/*
1360 		 * By tuning xnb_unmop_hiwat to N, we can emulate "N per batch"
1361 		 * or (with N == 1) "immediate unmop" behaviour.
1362 		 * The "> xnb_unmop_lowwat" is a guard against ring exhaustion.
1363 		 */
1364 		if (xnbp->xnb_rx_unmop_count < xnb_unmop_hiwat &&
1365 		    reqs_on_ring > xnb_unmop_lowwat)
1366 			return;
1367 	}
1368 
1369 	xnb_rx_perform_pending_unmop(xnbp);
1370 }
1371 
1372 /*
1373  * Here we perform the actual unmapping of the data that was
1374  * accumulated in xnb_rx_schedule_unmop().
1375  * Note that it is the caller's responsibility to make sure that
1376  * there's actually something there to unmop.
1377  */
1378 static void
1379 xnb_rx_perform_pending_unmop(xnb_t *xnbp)
1380 {
1381 	RING_IDX loop;
1382 #ifdef XNB_DEBUG
1383 	gnttab_unmap_grant_ref_t *unmop;
1384 #endif /* XNB_DEBUG */
1385 
1386 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1387 	ASSERT(xnbp->xnb_rx_unmop_count > 0);
1388 
1389 	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1390 	    xnbp->xnb_rx_unmop, xnbp->xnb_rx_unmop_count) < 0) {
1391 		cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
1392 		    "unmap grant operation failed, "
1393 		    "%d pages lost", xnbp->xnb_rx_unmop_count);
1394 	}
1395 
1396 #ifdef XNB_DEBUG
1397 	for (loop = 0, unmop = xnbp->xnb_rx_unmop;
1398 	    loop < xnbp->xnb_rx_unmop_count;
1399 	    loop++, unmop++) {
1400 		if (unmop->status != 0) {
1401 			cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
1402 			    "unmap grant reference failed (%d)",
1403 			    unmop->status);
1404 		}
1405 	}
1406 #endif /* XNB_DEBUG */
1407 
1408 	for (loop = 0; loop < xnbp->xnb_rx_unmop_count; loop++) {
1409 		xnb_rxbuf_t	*rxp = xnbp->xnb_rx_unmop_rxp[loop];
1410 
1411 		if (rxp == NULL)
1412 			cmn_err(CE_PANIC,
1413 			    "xnb_rx_perform_pending_unmop: "
1414 			    "unexpected NULL rxp (loop %d; count %d)!",
1415 			    loop, xnbp->xnb_rx_unmop_count);
1416 
1417 		if (xnbp->xnb_connected)
1418 			xnb_rx_mark_complete(xnbp, rxp->xr_id, rxp->xr_status);
1419 		xnb_rxbuf_put(xnbp, rxp);
1420 	}
1421 	if (xnbp->xnb_connected)
1422 		xnb_rx_notify_peer(xnbp);
1423 
1424 	xnbp->xnb_rx_unmop_count = 0;
1425 
1426 #ifdef XNB_DEBUG
1427 	bzero(xnbp->xnb_rx_unmop, sizeof (xnbp->xnb_rx_unmop));
1428 	bzero(xnbp->xnb_rx_unmop_rxp, sizeof (xnbp->xnb_rx_unmop_rxp));
1429 #endif /* XNB_DEBUG */
1430 }
1431 
1432 static xnb_rxbuf_t *
1433 xnb_rxbuf_get(xnb_t *xnbp, int flags)
1434 {
1435 	xnb_rxbuf_t *rxp;
1436 
1437 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1438 
1439 	rxp = kmem_cache_alloc(xnb_rxbuf_cachep, flags);
1440 	if (rxp != NULL) {
1441 		ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
1442 		rxp->xr_flags |= XNB_RXBUF_INUSE;
1443 
1444 		rxp->xr_xnbp = xnbp;
1445 		rxp->xr_mop.dom = xnbp->xnb_peer;
1446 
1447 		rxp->xr_mop.flags = GNTMAP_host_map;
1448 		if (!xnbp->xnb_rx_pages_writable)
1449 			rxp->xr_mop.flags |= GNTMAP_readonly;
1450 
1451 		xnbp->xnb_rx_buf_count++;
1452 	}
1453 
1454 	return (rxp);
1455 }
1456 
1457 static void
1458 xnb_rxbuf_put(xnb_t *xnbp, xnb_rxbuf_t *rxp)
1459 {
1460 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1461 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
1462 
1463 	rxp->xr_flags &= ~XNB_RXBUF_INUSE;
1464 	xnbp->xnb_rx_buf_count--;
1465 
1466 	kmem_cache_free(xnb_rxbuf_cachep, rxp);
1467 }
1468 
1469 static mblk_t *
1470 xnb_recv(xnb_t *xnbp)
1471 {
1472 	RING_IDX start, end, loop;
1473 	gnttab_map_grant_ref_t *mop;
1474 	xnb_rxbuf_t **rxpp;
1475 	netif_tx_request_t *txreq;
1476 	boolean_t work_to_do;
1477 	mblk_t *head, *tail;
1478 	/*
1479 	 * If the peer granted a read-only mapping to the page then we
1480 	 * must copy the data, as the local protocol stack (should the
1481 	 * packet be destined for this host) will modify the packet
1482 	 * 'in place'.
1483 	 */
1484 	boolean_t copy = xnbp->xnb_rx_always_copy ||
1485 	    !xnbp->xnb_rx_pages_writable;
1486 
1487 	/*
1488 	 * For each individual request, the sequence of actions is:
1489 	 *
1490 	 * 1. get the request.
1491 	 * 2. map the page based on the grant ref.
1492 	 * 3. allocate an mblk, copy the data to it.
1493 	 * 4. release the grant.
1494 	 * 5. update the ring.
1495 	 * 6. pass the packet upward.
1496 	 * 7. kick the peer.
1497 	 *
1498 	 * In fact, we try to perform the grant operations in batches,
1499 	 * so there are two loops.
1500 	 */
1501 
1502 	head = tail = NULL;
1503 around:
1504 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1505 
1506 	/* LINTED: constant in conditional context */
1507 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
1508 	if (!work_to_do) {
1509 finished:
1510 		return (head);
1511 	}
1512 
1513 	start = xnbp->xnb_tx_ring.req_cons;
1514 	end = xnbp->xnb_tx_ring.sring->req_prod;
1515 
1516 	for (loop = start, mop = xnbp->xnb_rx_mop, rxpp = xnbp->xnb_rx_bufp;
1517 	    loop != end;
1518 	    loop++, mop++, rxpp++) {
1519 		xnb_rxbuf_t *rxp;
1520 
1521 		rxp = xnb_rxbuf_get(xnbp, KM_NOSLEEP);
1522 		if (rxp == NULL)
1523 			break;
1524 
1525 		ASSERT(xnbp->xnb_rx_pages_writable ||
1526 		    ((rxp->xr_mop.flags & GNTMAP_readonly)
1527 		    == GNTMAP_readonly));
1528 
1529 		rxp->xr_mop.ref =
1530 		    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop)->gref;
1531 
1532 		*mop = rxp->xr_mop;
1533 		*rxpp = rxp;
1534 	}
1535 
1536 	if ((loop - start) == 0)
1537 		goto finished;
1538 
1539 	end = loop;
1540 
1541 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1542 	    xnbp->xnb_rx_mop, end - start) != 0) {
1543 
1544 		cmn_err(CE_WARN, "xnb_recv: map grant operation failed");
1545 
1546 		loop = start;
1547 		rxpp = xnbp->xnb_rx_bufp;
1548 
1549 		while (loop != end) {
1550 			xnb_rxbuf_put(xnbp, *rxpp);
1551 
1552 			loop++;
1553 			rxpp++;
1554 		}
1555 
1556 		goto finished;
1557 	}
1558 
1559 	for (loop = start, mop = xnbp->xnb_rx_mop, rxpp = xnbp->xnb_rx_bufp;
1560 	    loop != end;
1561 	    loop++, mop++, rxpp++) {
1562 		mblk_t *mp = NULL;
1563 		int16_t status = NETIF_RSP_OKAY;
1564 		xnb_rxbuf_t *rxp = *rxpp;
1565 
1566 		if (mop->status != 0) {
1567 			cmn_err(CE_WARN, "xnb_recv: "
1568 			    "failed to map buffer: %d",
1569 			    mop->status);
1570 			status = NETIF_RSP_ERROR;
1571 		}
1572 
1573 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1574 
1575 		if (status == NETIF_RSP_OKAY) {
1576 			if (copy) {
1577 				mp = allocb(txreq->size, BPRI_MED);
1578 				if (mp == NULL) {
1579 					status = NETIF_RSP_ERROR;
1580 					xnbp->xnb_stat_rx_allocb_failed++;
1581 				} else {
1582 					bcopy((caddr_t)(uintptr_t)
1583 					    mop->host_addr + txreq->offset,
1584 					    mp->b_wptr, txreq->size);
1585 					mp->b_wptr += txreq->size;
1586 				}
1587 			} else {
1588 				mp = desballoc((uchar_t *)(uintptr_t)
1589 				    mop->host_addr + txreq->offset,
1590 				    txreq->size, 0, &rxp->xr_free_rtn);
1591 				if (mp == NULL) {
1592 					status = NETIF_RSP_ERROR;
1593 					xnbp->xnb_stat_rx_allocb_failed++;
1594 				} else {
1595 					rxp->xr_id = txreq->id;
1596 					rxp->xr_status = status;
1597 					rxp->xr_mop = *mop;
1598 
1599 					mp->b_wptr += txreq->size;
1600 				}
1601 			}
1602 
1603 			/*
1604 			 * If we have a buffer and there are checksum
1605 			 * flags, process them appropriately.
1606 			 */
1607 			if ((mp != NULL) &&
1608 			    ((txreq->flags &
1609 			    (NETTXF_csum_blank | NETTXF_data_validated))
1610 			    != 0)) {
1611 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
1612 				    mp, txreq->flags);
1613 				xnbp->xnb_stat_rx_cksum_no_need++;
1614 			}
1615 		}
1616 
1617 		if (copy || (mp == NULL)) {
1618 			rxp->xr_status = status;
1619 			rxp->xr_id = txreq->id;
1620 			xnb_rx_schedule_unmop(xnbp, mop, rxp);
1621 		}
1622 
1623 		if (mp != NULL) {
1624 			xnbp->xnb_stat_ipackets++;
1625 			xnbp->xnb_stat_rbytes += txreq->size;
1626 
1627 			mp->b_next = NULL;
1628 			if (head == NULL) {
1629 				ASSERT(tail == NULL);
1630 				head = mp;
1631 			} else {
1632 				ASSERT(tail != NULL);
1633 				tail->b_next = mp;
1634 			}
1635 			tail = mp;
1636 		}
1637 	}
1638 
1639 	xnbp->xnb_tx_ring.req_cons = loop;
1640 
1641 	goto around;
1642 	/* NOTREACHED */
1643 }
1644 
1645 /*
1646  *  intr() -- ring interrupt service routine
1647  */
1648 static uint_t
1649 xnb_intr(caddr_t arg)
1650 {
1651 	xnb_t *xnbp = (xnb_t *)arg;
1652 	mblk_t *mp;
1653 
1654 	xnbp->xnb_stat_intr++;
1655 
1656 	mutex_enter(&xnbp->xnb_rx_lock);
1657 
1658 	ASSERT(xnbp->xnb_connected);
1659 
1660 	mp = xnb_recv(xnbp);
1661 
1662 	mutex_exit(&xnbp->xnb_rx_lock);
1663 
1664 	if (!xnbp->xnb_hotplugged) {
1665 		xnbp->xnb_stat_rx_too_early++;
1666 		goto fail;
1667 	}
1668 	if (mp == NULL) {
1669 		xnbp->xnb_stat_spurious_intr++;
1670 		goto fail;
1671 	}
1672 
1673 	xnbp->xnb_flavour->xf_recv(xnbp, mp);
1674 
1675 	return (DDI_INTR_CLAIMED);
1676 
1677 fail:
1678 	freemsgchain(mp);
1679 	return (DDI_INTR_CLAIMED);
1680 }
1681 
1682 static boolean_t
1683 xnb_connect_rings(dev_info_t *dip)
1684 {
1685 	xnb_t *xnbp = ddi_get_driver_private(dip);
1686 	char *oename;
1687 	struct gnttab_map_grant_ref map_op;
1688 	evtchn_port_t evtchn;
1689 	int i;
1690 
1691 	/*
1692 	 * Cannot attempt to connect the rings if already connected.
1693 	 */
1694 	ASSERT(!xnbp->xnb_connected);
1695 
1696 	oename = xvdi_get_oename(dip);
1697 
1698 	if (xenbus_gather(XBT_NULL, oename,
1699 	    "event-channel", "%u", &evtchn,
1700 	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
1701 	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
1702 	    NULL) != 0) {
1703 		cmn_err(CE_WARN, "xnb_connect_rings: "
1704 		    "cannot read other-end details from %s",
1705 		    oename);
1706 		goto fail;
1707 	}
1708 
1709 	if (xenbus_scanf(XBT_NULL, oename,
1710 	    "feature-tx-writable", "%d", &i) != 0)
1711 		i = 0;
1712 	if (i != 0)
1713 		xnbp->xnb_rx_pages_writable = B_TRUE;
1714 
1715 	if (xenbus_scanf(XBT_NULL, oename,
1716 	    "feature-no-csum-offload", "%d", &i) != 0)
1717 		i = 0;
1718 	if ((i == 1) || !xnbp->xnb_cksum_offload)
1719 		xnbp->xnb_cksum_offload = B_FALSE;
1720 
1721 	/* Check whether our peer knows and requests hypervisor copy */
1722 	if (xenbus_scanf(XBT_NULL, oename, "request-rx-copy", "%d", &i)
1723 	    != 0)
1724 		i = 0;
1725 	if (i != 0)
1726 		xnbp->xnb_hv_copy = B_TRUE;
1727 
1728 	/*
1729 	 * 1. allocate a vaddr for the tx page, one for the rx page.
1730 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1731 	 *    into the allocated vaddr (one for tx, one for rx).
1732 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1733 	 *    bound to this domain.
1734 	 * 4. associate the event channel with an interrupt.
1735 	 * 5. declare ourselves connected.
1736 	 * 6. enable the interrupt.
1737 	 */
1738 
1739 	/* 1.tx */
1740 	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1741 	    0, 0, 0, 0, VM_SLEEP);
1742 	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
1743 
1744 	/* 2.tx */
1745 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
1746 	map_op.flags = GNTMAP_host_map;
1747 	map_op.ref = xnbp->xnb_tx_ring_ref;
1748 	map_op.dom = xnbp->xnb_peer;
1749 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1750 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1751 	    &map_op, 1) != 0 || map_op.status != 0) {
1752 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1753 		goto fail;
1754 	}
1755 	xnbp->xnb_tx_ring_handle = map_op.handle;
1756 
1757 	/* LINTED: constant in conditional context */
1758 	BACK_RING_INIT(&xnbp->xnb_tx_ring,
1759 	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1760 
1761 	/* 1.rx */
1762 	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1763 	    0, 0, 0, 0, VM_SLEEP);
1764 	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
1765 
1766 	/* 2.rx */
1767 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
1768 	map_op.flags = GNTMAP_host_map;
1769 	map_op.ref = xnbp->xnb_rx_ring_ref;
1770 	map_op.dom = xnbp->xnb_peer;
1771 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1772 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1773 	    &map_op, 1) != 0 || map_op.status != 0) {
1774 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1775 		goto fail;
1776 	}
1777 	xnbp->xnb_rx_ring_handle = map_op.handle;
1778 
1779 	/* LINTED: constant in conditional context */
1780 	BACK_RING_INIT(&xnbp->xnb_rx_ring,
1781 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
1782 
1783 	/* 3 */
1784 	if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS) {
1785 		cmn_err(CE_WARN, "xnb_connect_rings: "
1786 		    "cannot bind event channel %d", xnbp->xnb_evtchn);
1787 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1788 		goto fail;
1789 	}
1790 	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
1791 
1792 	/*
1793 	 * It would be good to set the state to XenbusStateConnected
1794 	 * here as well, but then what if ddi_add_intr() failed?
1795 	 * Changing the state in the store will be noticed by the peer
1796 	 * and cannot be "taken back".
1797 	 */
1798 	mutex_enter(&xnbp->xnb_tx_lock);
1799 	mutex_enter(&xnbp->xnb_rx_lock);
1800 
1801 	/* 5.1 */
1802 	xnbp->xnb_connected = B_TRUE;
1803 
1804 	mutex_exit(&xnbp->xnb_rx_lock);
1805 	mutex_exit(&xnbp->xnb_tx_lock);
1806 
1807 	/* 4, 6 */
1808 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1809 	    != DDI_SUCCESS) {
1810 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1811 		goto fail;
1812 	}
1813 	xnbp->xnb_irq = B_TRUE;
1814 
1815 	/* 5.2 */
1816 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1817 
1818 	return (B_TRUE);
1819 
1820 fail:
1821 	mutex_enter(&xnbp->xnb_tx_lock);
1822 	mutex_enter(&xnbp->xnb_rx_lock);
1823 
1824 	xnbp->xnb_connected = B_FALSE;
1825 	mutex_exit(&xnbp->xnb_rx_lock);
1826 	mutex_exit(&xnbp->xnb_tx_lock);
1827 
1828 	return (B_FALSE);
1829 }
1830 
1831 static void
1832 xnb_disconnect_rings(dev_info_t *dip)
1833 {
1834 	xnb_t *xnbp = ddi_get_driver_private(dip);
1835 
1836 	if (xnbp->xnb_irq) {
1837 		ddi_remove_intr(dip, 0, NULL);
1838 		xnbp->xnb_irq = B_FALSE;
1839 	}
1840 
1841 	if (xnbp->xnb_rx_unmop_count > 0)
1842 		xnb_rx_perform_pending_unmop(xnbp);
1843 
1844 	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
1845 		xvdi_free_evtchn(dip);
1846 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1847 	}
1848 
1849 	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
1850 		struct gnttab_unmap_grant_ref unmap_op;
1851 
1852 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1853 		    xnbp->xnb_rx_ring_addr;
1854 		unmap_op.dev_bus_addr = 0;
1855 		unmap_op.handle = xnbp->xnb_rx_ring_handle;
1856 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1857 		    &unmap_op, 1) != 0)
1858 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1859 			    "cannot unmap rx-ring page (%d)",
1860 			    unmap_op.status);
1861 
1862 		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
1863 	}
1864 
1865 	if (xnbp->xnb_rx_ring_addr != NULL) {
1866 		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1867 		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
1868 		xnbp->xnb_rx_ring_addr = NULL;
1869 	}
1870 
1871 	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
1872 		struct gnttab_unmap_grant_ref unmap_op;
1873 
1874 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1875 		    xnbp->xnb_tx_ring_addr;
1876 		unmap_op.dev_bus_addr = 0;
1877 		unmap_op.handle = xnbp->xnb_tx_ring_handle;
1878 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1879 		    &unmap_op, 1) != 0)
1880 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1881 			    "cannot unmap tx-ring page (%d)",
1882 			    unmap_op.status);
1883 
1884 		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
1885 	}
1886 
1887 	if (xnbp->xnb_tx_ring_addr != NULL) {
1888 		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1889 		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
1890 		xnbp->xnb_tx_ring_addr = NULL;
1891 	}
1892 }
1893 
1894 /*ARGSUSED*/
1895 static void
1896 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1897     void *arg, void *impl_data)
1898 {
1899 	xnb_t *xnbp = ddi_get_driver_private(dip);
1900 	XenbusState new_state = *(XenbusState *)impl_data;
1901 
1902 	ASSERT(xnbp != NULL);
1903 
1904 	switch (new_state) {
1905 	case XenbusStateConnected:
1906 		/* spurious state change */
1907 		if (xnbp->xnb_connected)
1908 			return;
1909 
1910 		if (xnb_connect_rings(dip)) {
1911 			xnbp->xnb_flavour->xf_peer_connected(xnbp);
1912 		} else {
1913 			xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1914 			xnb_disconnect_rings(dip);
1915 			(void) xvdi_switch_state(dip, XBT_NULL,
1916 			    XenbusStateClosed);
1917 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1918 		}
1919 
1920 		/*
1921 		 * Now that we've attempted to connect it's reasonable
1922 		 * to allow an attempt to detach.
1923 		 */
1924 		xnbp->xnb_detachable = B_TRUE;
1925 
1926 		break;
1927 
1928 	case XenbusStateClosing:
1929 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1930 
1931 		break;
1932 
1933 	case XenbusStateClosed:
1934 		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1935 
1936 		mutex_enter(&xnbp->xnb_tx_lock);
1937 		mutex_enter(&xnbp->xnb_rx_lock);
1938 
1939 		xnb_disconnect_rings(dip);
1940 		xnbp->xnb_connected = B_FALSE;
1941 
1942 		mutex_exit(&xnbp->xnb_rx_lock);
1943 		mutex_exit(&xnbp->xnb_tx_lock);
1944 
1945 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1946 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1947 		/*
1948 		 * In all likelyhood this is already set (in the above
1949 		 * case), but if the peer never attempted to connect
1950 		 * and the domain is destroyed we get here without
1951 		 * having been through the case above, so we set it to
1952 		 * be sure.
1953 		 */
1954 		xnbp->xnb_detachable = B_TRUE;
1955 
1956 		break;
1957 
1958 	default:
1959 		break;
1960 	}
1961 }
1962 
1963 /*ARGSUSED*/
1964 static void
1965 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1966     void *arg, void *impl_data)
1967 {
1968 	xnb_t *xnbp = ddi_get_driver_private(dip);
1969 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
1970 	boolean_t success;
1971 
1972 	ASSERT(xnbp != NULL);
1973 
1974 	switch (state) {
1975 	case Connected:
1976 
1977 		/* spurious hotplug event */
1978 		if (xnbp->xnb_hotplugged)
1979 			return;
1980 
1981 		success = xnbp->xnb_flavour->xf_hotplug_connected(xnbp);
1982 
1983 		mutex_enter(&xnbp->xnb_tx_lock);
1984 		mutex_enter(&xnbp->xnb_rx_lock);
1985 
1986 		xnbp->xnb_hotplugged = success;
1987 
1988 		mutex_exit(&xnbp->xnb_rx_lock);
1989 		mutex_exit(&xnbp->xnb_tx_lock);
1990 		break;
1991 
1992 	default:
1993 		break;
1994 	}
1995 }
1996 
1997 static struct modldrv modldrv = {
1998 	&mod_miscops, "xnb",
1999 };
2000 
2001 static struct modlinkage modlinkage = {
2002 	MODREV_1, &modldrv, NULL
2003 };
2004 
2005 int
2006 _init(void)
2007 {
2008 	int i;
2009 
2010 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
2011 
2012 	xnb_rxbuf_cachep = kmem_cache_create("xnb_rxbuf_cachep",
2013 	    sizeof (xnb_rxbuf_t), 0, xnb_rxbuf_constructor,
2014 	    xnb_rxbuf_destructor, NULL, NULL, NULL, 0);
2015 	ASSERT(xnb_rxbuf_cachep != NULL);
2016 
2017 	i = mod_install(&modlinkage);
2018 	if (i != DDI_SUCCESS) {
2019 		kmem_cache_destroy(xnb_rxbuf_cachep);
2020 		mutex_destroy(&xnb_alloc_page_lock);
2021 	}
2022 	return (i);
2023 }
2024 
2025 int
2026 _info(struct modinfo *modinfop)
2027 {
2028 	return (mod_info(&modlinkage, modinfop));
2029 }
2030 
2031 int
2032 _fini(void)
2033 {
2034 	int i;
2035 
2036 	i = mod_remove(&modlinkage);
2037 	if (i == DDI_SUCCESS) {
2038 		kmem_cache_destroy(xnb_rxbuf_cachep);
2039 		mutex_destroy(&xnb_alloc_page_lock);
2040 	}
2041 	return (i);
2042 }
2043