xref: /titanic_51/usr/src/uts/common/xen/io/xnb.c (revision 29493bd8e037cbaea9095b34172305abb589cb6b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #ifdef DEBUG
30 #define	XNB_DEBUG 1
31 #endif /* DEBUG */
32 
33 #include "xnb.h"
34 
35 #include <sys/sunddi.h>
36 #include <sys/sunndi.h>
37 #include <sys/modctl.h>
38 #include <sys/conf.h>
39 #include <sys/mac.h>
40 #include <sys/dlpi.h>
41 #include <sys/strsubr.h>
42 #include <sys/strsun.h>
43 #include <sys/types.h>
44 #include <sys/pattr.h>
45 #include <vm/seg_kmem.h>
46 #include <vm/hat_i86.h>
47 #include <xen/sys/xenbus_impl.h>
48 #include <xen/sys/xendev.h>
49 #include <sys/balloon_impl.h>
50 #include <sys/evtchn_impl.h>
51 #include <sys/gnttab.h>
52 #include <vm/vm_dep.h>
53 
54 #include <sys/gld.h>
55 #include <inet/ip.h>
56 #include <inet/ip_impl.h>
57 #include <sys/vnic_impl.h> /* blech. */
58 
59 /*
60  * The terms "transmit" and "receive" are used in their traditional
61  * sense here - packets from other parts of this system are
62  * "transmitted" to the peer domain and those originating from the
63  * peer are "received".
64  *
65  * In some cases this can be confusing, because various data
66  * structures are shared with the domU driver, which has the opposite
67  * view of what constitutes "transmit" and "receive".  In naming the
68  * shared structures the domU driver always wins.
69  */
70 
71 /*
72  * XXPV dme: things to do, as well as various things indicated
73  * throughout the source:
74  * - copy avoidance outbound.
75  * - copy avoidance inbound.
76  * - transfer credit limiting.
77  * - MAC address based filtering.
78  */
79 
80 /*
81  * Linux expects to have some headroom in received buffers.  The Linux
82  * frontend driver (netfront) checks to see if the headroom is
83  * available and will re-allocate the buffer to make room if
84  * necessary.  To avoid this we add TX_BUFFER_HEADROOM bytes of
85  * headroom to each packet we pass to the peer.
86  */
87 #define	TX_BUFFER_HEADROOM	16
88 
89 static boolean_t	xnb_cksum_offload = B_TRUE;
90 
91 static boolean_t	xnb_connect_rings(dev_info_t *);
92 static void		xnb_disconnect_rings(dev_info_t *);
93 static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
94     void *, void *);
95 static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
96     void *, void *);
97 
98 static int	xnb_rxbuf_constructor(void *, void *, int);
99 static void	xnb_rxbuf_destructor(void *, void *);
100 static xnb_rxbuf_t *xnb_rxbuf_get(xnb_t *, int);
101 static void	xnb_rxbuf_put(xnb_t *, xnb_rxbuf_t *);
102 static void	xnb_rx_notify_peer(xnb_t *);
103 static void	xnb_rx_complete(xnb_rxbuf_t *);
104 static void	xnb_rx_mark_complete(xnb_t *, RING_IDX, int16_t);
105 static void 	xnb_rx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *,
106     xnb_rxbuf_t *);
107 static void	xnb_rx_perform_pending_unmop(xnb_t *);
108 mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);
109 
110 int		xnb_unmop_lowwat = NET_TX_RING_SIZE >> 2;
111 int		xnb_unmop_hiwat = NET_TX_RING_SIZE - (NET_TX_RING_SIZE >> 2);
112 
113 
114 boolean_t	xnb_hv_copy = B_TRUE;
115 boolean_t	xnb_explicit_pageflip_set = B_FALSE;
116 
117 #ifdef XNB_DEBUG
118 #define	NR_GRANT_ENTRIES \
119 	(NR_GRANT_FRAMES * PAGESIZE / sizeof (grant_entry_t))
120 #endif /* XNB_DEBUG */
121 
122 /* XXPV dme: are these really invalid? */
123 #define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
124 #define	INVALID_GRANT_REF	((grant_ref_t)-1)
125 
126 static kmem_cache_t *xnb_rxbuf_cachep;
127 static kmutex_t	xnb_alloc_page_lock;
128 
129 /*
130  * Statistics.
131  */
132 static char *aux_statistics[] = {
133 	"tx_cksum_deferred",
134 	"rx_cksum_no_need",
135 	"tx_notify_deferred",
136 	"tx_notify_sent",
137 	"rx_notify_deferred",
138 	"rx_notify_sent",
139 	"tx_too_early",
140 	"rx_too_early",
141 	"rx_allocb_failed",
142 	"tx_allocb_failed",
143 	"tx_foreign_page",
144 	"mac_full",
145 	"spurious_intr",
146 	"allocation_success",
147 	"allocation_failure",
148 	"small_allocation_success",
149 	"small_allocation_failure",
150 	"other_allocation_failure",
151 	"tx_pageboundary_crossed",
152 	"tx_cpoparea_grown",
153 	"csum_hardware",
154 	"csum_software",
155 };
156 
157 static int
158 xnb_ks_aux_update(kstat_t *ksp, int flag)
159 {
160 	xnb_t *xnbp;
161 	kstat_named_t *knp;
162 
163 	if (flag != KSTAT_READ)
164 		return (EACCES);
165 
166 	xnbp = ksp->ks_private;
167 	knp = ksp->ks_data;
168 
169 	/*
170 	 * Assignment order should match that of the names in
171 	 * aux_statistics.
172 	 */
173 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_deferred;
174 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_no_need;
175 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
176 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
177 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
178 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
179 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
180 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
181 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
182 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
183 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_foreign_page;
184 	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
185 	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
186 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
187 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
188 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
189 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
190 	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
191 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_pagebndry_crossed;
192 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cpoparea_grown;
193 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
194 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
195 
196 	return (0);
197 }
198 
199 static boolean_t
200 xnb_ks_init(xnb_t *xnbp)
201 {
202 	int nstat = sizeof (aux_statistics) /
203 	    sizeof (aux_statistics[0]);
204 	char **cp = aux_statistics;
205 	kstat_named_t *knp;
206 
207 	/*
208 	 * Create and initialise kstats.
209 	 */
210 	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
211 	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
212 	    KSTAT_TYPE_NAMED, nstat, 0);
213 	if (xnbp->xnb_kstat_aux == NULL)
214 		return (B_FALSE);
215 
216 	xnbp->xnb_kstat_aux->ks_private = xnbp;
217 	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
218 
219 	knp = xnbp->xnb_kstat_aux->ks_data;
220 	while (nstat > 0) {
221 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
222 
223 		knp++;
224 		cp++;
225 		nstat--;
226 	}
227 
228 	kstat_install(xnbp->xnb_kstat_aux);
229 
230 	return (B_TRUE);
231 }
232 
233 static void
234 xnb_ks_free(xnb_t *xnbp)
235 {
236 	kstat_delete(xnbp->xnb_kstat_aux);
237 }
238 
239 /*
240  * Software checksum calculation and insertion for an arbitrary packet.
241  */
242 /*ARGSUSED*/
243 static mblk_t *
244 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
245 {
246 	/*
247 	 * XXPV dme: shouldn't rely on vnic_fix_cksum(), not least
248 	 * because it doesn't cover all of the interesting cases :-(
249 	 */
250 	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
251 	    HCK_FULLCKSUM, KM_NOSLEEP);
252 
253 	return (vnic_fix_cksum(mp));
254 }
255 
256 mblk_t *
257 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
258 {
259 	struct ether_header *ehp;
260 	uint16_t sap;
261 	uint32_t offset;
262 	ipha_t *ipha;
263 
264 	ASSERT(mp->b_next == NULL);
265 
266 	/*
267 	 * Check that the packet is contained in a single mblk.  In
268 	 * the "from peer" path this is true today, but will change
269 	 * when scatter gather support is added.  In the "to peer"
270 	 * path we cannot be sure, but in most cases it will be true
271 	 * (in the xnbo case the packet has come from a MAC device
272 	 * which is unlikely to split packets).
273 	 */
274 	if (mp->b_cont != NULL)
275 		goto software;
276 
277 	/*
278 	 * If the MAC has no hardware capability don't do any further
279 	 * checking.
280 	 */
281 	if (capab == 0)
282 		goto software;
283 
284 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
285 	ehp = (struct ether_header *)mp->b_rptr;
286 
287 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
288 		struct ether_vlan_header *evhp;
289 
290 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
291 		evhp = (struct ether_vlan_header *)mp->b_rptr;
292 		sap = ntohs(evhp->ether_type);
293 		offset = sizeof (struct ether_vlan_header);
294 	} else {
295 		sap = ntohs(ehp->ether_type);
296 		offset = sizeof (struct ether_header);
297 	}
298 
299 	/*
300 	 * We only attempt to do IPv4 packets in hardware.
301 	 */
302 	if (sap != ETHERTYPE_IP)
303 		goto software;
304 
305 	/*
306 	 * We know that this is an IPv4 packet.
307 	 */
308 	ipha = (ipha_t *)(mp->b_rptr + offset);
309 
310 	switch (ipha->ipha_protocol) {
311 	case IPPROTO_TCP:
312 	case IPPROTO_UDP:
313 		/*
314 		 * This is a TCP/IPv4 or UDP/IPv4 packet.
315 		 *
316 		 * If the capabilities indicate that full checksum
317 		 * offload is available, use it.
318 		 */
319 		if ((capab & HCKSUM_INET_FULL_V4) != 0) {
320 			(void) hcksum_assoc(mp, NULL, NULL,
321 			    0, 0, 0, 0,
322 			    HCK_FULLCKSUM, KM_NOSLEEP);
323 
324 			xnbp->xnb_stat_csum_hardware++;
325 
326 			return (mp);
327 		}
328 
329 		/*
330 		 * XXPV dme: If the capabilities indicate that partial
331 		 * checksum offload is available, we should use it.
332 		 */
333 
334 		break;
335 
336 	default:
337 		/* Use software. */
338 		break;
339 	}
340 
341 software:
342 	/*
343 	 * We are not able to use any offload so do the whole thing in
344 	 * software.
345 	 */
346 	xnbp->xnb_stat_csum_software++;
347 
348 	return (xnb_software_csum(xnbp, mp));
349 }
350 
351 int
352 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
353 {
354 	xnb_t *xnbp;
355 	char *xsname, mac[ETHERADDRL * 3];
356 
357 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
358 
359 	xnbp->xnb_flavour = flavour;
360 	xnbp->xnb_flavour_data = flavour_data;
361 	xnbp->xnb_devinfo = dip;
362 	xnbp->xnb_evtchn = INVALID_EVTCHN;
363 	xnbp->xnb_irq = B_FALSE;
364 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
365 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
366 	xnbp->xnb_cksum_offload = xnb_cksum_offload;
367 	xnbp->xnb_connected = B_FALSE;
368 	xnbp->xnb_hotplugged = B_FALSE;
369 	xnbp->xnb_detachable = B_FALSE;
370 	xnbp->xnb_peer = xvdi_get_oeid(dip);
371 	xnbp->xnb_rx_pages_writable = B_FALSE;
372 
373 	xnbp->xnb_rx_buf_count = 0;
374 	xnbp->xnb_rx_unmop_count = 0;
375 
376 	xnbp->xnb_hv_copy = B_FALSE;
377 
378 	xnbp->xnb_tx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
379 	ASSERT(xnbp->xnb_tx_va != NULL);
380 
381 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
382 	    != DDI_SUCCESS)
383 		goto failure;
384 
385 	/* allocated on demand, when/if we enter xnb_copy_to_peer() */
386 	xnbp->xnb_tx_cpop = NULL;
387 	xnbp->xnb_cpop_sz = 0;
388 
389 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
390 	    xnbp->xnb_icookie);
391 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
392 	    xnbp->xnb_icookie);
393 
394 	/* set driver private pointer now */
395 	ddi_set_driver_private(dip, xnbp);
396 
397 	if (!xnb_ks_init(xnbp))
398 		goto failure_1;
399 
400 	/*
401 	 * Receive notification of changes in the state of the
402 	 * driver in the guest domain.
403 	 */
404 	if (xvdi_add_event_handler(dip, XS_OE_STATE,
405 	    xnb_oe_state_change) != DDI_SUCCESS)
406 		goto failure_2;
407 
408 	/*
409 	 * Receive notification of hotplug events.
410 	 */
411 	if (xvdi_add_event_handler(dip, XS_HP_STATE,
412 	    xnb_hp_state_change) != DDI_SUCCESS)
413 		goto failure_2;
414 
415 	xsname = xvdi_get_xsname(dip);
416 
417 	if (xenbus_printf(XBT_NULL, xsname,
418 	    "feature-no-csum-offload", "%d",
419 	    xnbp->xnb_cksum_offload ? 0 : 1) != 0)
420 		goto failure_3;
421 
422 	/*
423 	 * Use global xnb_hv_copy to export this feature. This means that
424 	 * we have to decide what to do before starting up a guest domain
425 	 */
426 	if (xenbus_printf(XBT_NULL, xsname,
427 	    "feature-rx-copy", "%d", xnb_hv_copy ? 1 : 0) != 0)
428 		goto failure_3;
429 	/*
430 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
431 	 * in addition to "feature-rx-copy" being 1. It seems strange
432 	 * to use four possible states to describe a binary decision,
433 	 * but we might as well play nice.
434 	 */
435 	if (xenbus_printf(XBT_NULL, xsname,
436 	    "feature-rx-flip", "%d", xnb_explicit_pageflip_set ? 1 : 0) != 0)
437 		goto failure_3;
438 
439 	if (xenbus_scanf(XBT_NULL, xsname,
440 	    "mac", "%s", mac) != 0) {
441 		cmn_err(CE_WARN, "xnb_attach: "
442 		    "cannot read mac address from %s",
443 		    xsname);
444 		goto failure_3;
445 	}
446 
447 	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
448 		cmn_err(CE_WARN,
449 		    "xnb_attach: cannot parse mac address %s",
450 		    mac);
451 		goto failure_3;
452 	}
453 
454 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
455 	(void) xvdi_post_event(dip, XEN_HP_ADD);
456 
457 	return (DDI_SUCCESS);
458 
459 failure_3:
460 	xvdi_remove_event_handler(dip, NULL);
461 
462 failure_2:
463 	xnb_ks_free(xnbp);
464 
465 failure_1:
466 	mutex_destroy(&xnbp->xnb_rx_lock);
467 	mutex_destroy(&xnbp->xnb_tx_lock);
468 
469 failure:
470 	vmem_free(heap_arena, xnbp->xnb_tx_va, PAGESIZE);
471 	kmem_free(xnbp, sizeof (*xnbp));
472 	return (DDI_FAILURE);
473 }
474 
475 /*ARGSUSED*/
476 void
477 xnb_detach(dev_info_t *dip)
478 {
479 	xnb_t *xnbp = ddi_get_driver_private(dip);
480 
481 	ASSERT(xnbp != NULL);
482 	ASSERT(!xnbp->xnb_connected);
483 	ASSERT(xnbp->xnb_rx_buf_count == 0);
484 
485 	xnb_disconnect_rings(dip);
486 
487 	xvdi_remove_event_handler(dip, NULL);
488 
489 	xnb_ks_free(xnbp);
490 
491 	ddi_set_driver_private(dip, NULL);
492 
493 	mutex_destroy(&xnbp->xnb_tx_lock);
494 	mutex_destroy(&xnbp->xnb_rx_lock);
495 
496 	if (xnbp->xnb_cpop_sz > 0)
497 		kmem_free(xnbp->xnb_tx_cpop, sizeof (*xnbp->xnb_tx_cpop)
498 		    * xnbp->xnb_cpop_sz);
499 
500 	ASSERT(xnbp->xnb_tx_va != NULL);
501 	vmem_free(heap_arena, xnbp->xnb_tx_va, PAGESIZE);
502 
503 	kmem_free(xnbp, sizeof (*xnbp));
504 }
505 
506 
507 static mfn_t
508 xnb_alloc_page(xnb_t *xnbp)
509 {
510 #define	WARNING_RATE_LIMIT 100
511 #define	BATCH_SIZE 256
512 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
513 	static int nth = BATCH_SIZE;
514 	mfn_t mfn;
515 
516 	mutex_enter(&xnb_alloc_page_lock);
517 	if (nth == BATCH_SIZE) {
518 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
519 			xnbp->xnb_stat_allocation_failure++;
520 			mutex_exit(&xnb_alloc_page_lock);
521 
522 			/*
523 			 * Try for a single page in low memory situations.
524 			 */
525 			if (balloon_alloc_pages(1, &mfn) != 1) {
526 				if ((xnbp->xnb_stat_small_allocation_failure++
527 				    % WARNING_RATE_LIMIT) == 0)
528 					cmn_err(CE_WARN, "xnb_alloc_page: "
529 					    "Cannot allocate memory to "
530 					    "transfer packets to peer.");
531 				return (0);
532 			} else {
533 				xnbp->xnb_stat_small_allocation_success++;
534 				return (mfn);
535 			}
536 		}
537 
538 		nth = 0;
539 		xnbp->xnb_stat_allocation_success++;
540 	}
541 
542 	mfn = mfns[nth++];
543 	mutex_exit(&xnb_alloc_page_lock);
544 
545 	ASSERT(mfn != 0);
546 
547 	return (mfn);
548 #undef BATCH_SIZE
549 #undef WARNING_RATE_LIMIT
550 }
551 
552 /*ARGSUSED*/
553 static void
554 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
555 {
556 	int r;
557 	pfn_t pfn;
558 
559 	pfn = xen_assign_pfn(mfn);
560 	pfnzero(pfn, 0, PAGESIZE);
561 	xen_release_pfn(pfn);
562 
563 	/*
564 	 * This happens only in the error path, so batching is
565 	 * not worth the complication.
566 	 */
567 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
568 		cmn_err(CE_WARN, "free_page: cannot decrease memory "
569 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
570 		    r, mfn);
571 	}
572 }
573 
/*
 * Like RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring), but evaluated
 * against the local variables 'loop' and 'prod', which must be in scope
 * wherever the macro is used.  Yields the smaller of
 * (req_prod - loop) and (RING_SIZE - (loop - prod)); non-zero means
 * another request may be processed.
 */
#define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)			\
	(((RING_SIZE(_r) - (loop - prod)) <=			\
	    ((_r)->sring->req_prod - loop)) ?			\
	    (RING_SIZE(_r) - (loop - prod)) :			\
	    ((_r)->sring->req_prod - loop))
583 
584 mblk_t *
585 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
586 {
587 	mblk_t *free = mp, *prev = NULL;
588 	size_t len;
589 	gnttab_transfer_t *gop;
590 	boolean_t notify;
591 	RING_IDX loop, prod, end;
592 
593 	/*
594 	 * For each packet the sequence of operations is:
595 	 *
596 	 * 1. get a new page from the hypervisor.
597 	 * 2. get a request slot from the ring.
598 	 * 3. copy the data into the new page.
599 	 * 4. transfer the page to the peer.
600 	 * 5. update the request slot.
601 	 * 6. kick the peer.
602 	 * 7. free mp.
603 	 *
604 	 * In order to reduce the number of hypercalls, we prepare
605 	 * several packets for the peer and perform a single hypercall
606 	 * to transfer them.
607 	 */
608 
609 	mutex_enter(&xnbp->xnb_tx_lock);
610 
611 	/*
612 	 * If we are not connected to the peer or have not yet
613 	 * finished hotplug it is too early to pass packets to the
614 	 * peer.
615 	 */
616 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
617 		mutex_exit(&xnbp->xnb_tx_lock);
618 		DTRACE_PROBE(flip_tx_too_early);
619 		xnbp->xnb_stat_tx_too_early++;
620 		return (mp);
621 	}
622 
623 	loop = xnbp->xnb_rx_ring.req_cons;
624 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
625 	gop = xnbp->xnb_tx_top;
626 
627 	while ((mp != NULL) &&
628 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
629 
630 		mfn_t mfn;
631 		pfn_t pfn;
632 		netif_rx_request_t *rxreq;
633 		netif_rx_response_t *rxresp;
634 		char *valoop;
635 		size_t offset;
636 		mblk_t *ml;
637 		uint16_t cksum_flags;
638 
639 		/* 1 */
640 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
641 			xnbp->xnb_stat_xmit_defer++;
642 			break;
643 		}
644 
645 		/* 2 */
646 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
647 
648 #ifdef XNB_DEBUG
649 		if (!(rxreq->id < NET_RX_RING_SIZE))
650 			cmn_err(CE_PANIC, "xnb_to_peer: "
651 			    "id %d out of range in request 0x%p",
652 			    rxreq->id, (void *)rxreq);
653 		if (rxreq->gref >= NR_GRANT_ENTRIES)
654 			cmn_err(CE_PANIC, "xnb_to_peer: "
655 			    "grant ref %d out of range in request 0x%p",
656 			    rxreq->gref, (void *)rxreq);
657 #endif /* XNB_DEBUG */
658 
659 		/* Assign a pfn and map the new page at the allocated va. */
660 		pfn = xen_assign_pfn(mfn);
661 		hat_devload(kas.a_hat, xnbp->xnb_tx_va, PAGESIZE,
662 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
663 
664 		offset = TX_BUFFER_HEADROOM;
665 
666 		/* 3 */
667 		len = 0;
668 		valoop = xnbp->xnb_tx_va + offset;
669 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
670 			size_t chunk = ml->b_wptr - ml->b_rptr;
671 
672 			bcopy(ml->b_rptr, valoop, chunk);
673 			valoop += chunk;
674 			len += chunk;
675 		}
676 
677 		ASSERT(len + offset < PAGESIZE);
678 
679 		/* Release the pfn. */
680 		hat_unload(kas.a_hat, xnbp->xnb_tx_va, PAGESIZE,
681 		    HAT_UNLOAD_UNMAP);
682 		xen_release_pfn(pfn);
683 
684 		/* 4 */
685 		gop->mfn = mfn;
686 		gop->domid = xnbp->xnb_peer;
687 		gop->ref = rxreq->gref;
688 
689 		/* 5.1 */
690 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
691 		rxresp->offset = offset;
692 		rxresp->flags = 0;
693 
694 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
695 		if (cksum_flags != 0)
696 			xnbp->xnb_stat_tx_cksum_deferred++;
697 		rxresp->flags |= cksum_flags;
698 
699 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
700 		rxresp->status = len;
701 
702 		loop++;
703 		prod++;
704 		gop++;
705 		prev = mp;
706 		mp = mp->b_next;
707 	}
708 
709 	/*
710 	 * Did we actually do anything?
711 	 */
712 	if (loop == xnbp->xnb_rx_ring.req_cons) {
713 		mutex_exit(&xnbp->xnb_tx_lock);
714 		return (mp);
715 	}
716 
717 	end = loop;
718 
719 	/*
720 	 * Unlink the end of the 'done' list from the remainder.
721 	 */
722 	ASSERT(prev != NULL);
723 	prev->b_next = NULL;
724 
725 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_tx_top,
726 	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
727 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
728 	}
729 
730 	loop = xnbp->xnb_rx_ring.req_cons;
731 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
732 	gop = xnbp->xnb_tx_top;
733 
734 	while (loop < end) {
735 		int16_t status = NETIF_RSP_OKAY;
736 
737 		if (gop->status != 0) {
738 			status = NETIF_RSP_ERROR;
739 
740 			/*
741 			 * If the status is anything other than
742 			 * GNTST_bad_page then we don't own the page
743 			 * any more, so don't try to give it back.
744 			 */
745 			if (gop->status != GNTST_bad_page)
746 				gop->mfn = 0;
747 		} else {
748 			/* The page is no longer ours. */
749 			gop->mfn = 0;
750 		}
751 
752 		if (gop->mfn != 0)
753 			/*
754 			 * Give back the page, as we won't be using
755 			 * it.
756 			 */
757 			xnb_free_page(xnbp, gop->mfn);
758 		else
759 			/*
760 			 * We gave away a page, update our accounting
761 			 * now.
762 			 */
763 			balloon_drv_subtracted(1);
764 
765 		/* 5.2 */
766 		if (status != NETIF_RSP_OKAY) {
767 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
768 			    status;
769 		} else {
770 			xnbp->xnb_stat_opackets++;
771 			xnbp->xnb_stat_obytes += len;
772 		}
773 
774 		loop++;
775 		prod++;
776 		gop++;
777 	}
778 
779 	xnbp->xnb_rx_ring.req_cons = loop;
780 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
781 
782 	/* 6 */
783 	/* LINTED: constant in conditional context */
784 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
785 	if (notify) {
786 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
787 		xnbp->xnb_stat_tx_notify_sent++;
788 	} else {
789 		xnbp->xnb_stat_tx_notify_deferred++;
790 	}
791 
792 	if (mp != NULL)
793 		xnbp->xnb_stat_xmit_defer++;
794 
795 	mutex_exit(&xnbp->xnb_tx_lock);
796 
797 	/* Free mblk_t's that we consumed. */
798 	freemsgchain(free);
799 
800 	return (mp);
801 }
802 
803 /* helper functions for xnb_copy_to_peer */
804 
805 /*
806  * Grow the array of copy operation descriptors.
807  * Returns a pointer to the next available entry.
808  */
809 gnttab_copy_t *
810 grow_cpop_area(xnb_t *xnbp, gnttab_copy_t *o_cpop)
811 {
812 	/*
813 	 * o_cpop (arg.1) is a ptr to the area we would like to copy
814 	 * something into but cannot, because we haven't alloc'ed it
815 	 * yet, or NULL.
816 	 * old_cpop and new_cpop (local) are pointers to old/new
817 	 * versions of xnbp->xnb_tx_cpop.
818 	 */
819 	gnttab_copy_t	*new_cpop, *old_cpop, *ret_cpop;
820 	size_t		newcount;
821 
822 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
823 
824 	old_cpop = xnbp->xnb_tx_cpop;
825 	/*
826 	 * o_cpop is a pointer into the array pointed to by old_cpop;
827 	 * it would be an error for exactly one of these pointers to be NULL.
828 	 * We shouldn't call this function if xnb_tx_cpop has already
829 	 * been allocated, but we're starting to fill it from the beginning
830 	 * again.
831 	 */
832 	ASSERT((o_cpop == NULL && old_cpop == NULL) ||
833 	    (o_cpop != NULL && old_cpop != NULL && o_cpop != old_cpop));
834 
835 	newcount = xnbp->xnb_cpop_sz + CPOP_DEFCNT;
836 
837 	new_cpop = kmem_alloc(sizeof (*new_cpop) * newcount, KM_NOSLEEP);
838 	if (new_cpop == NULL) {
839 		xnbp->xnb_stat_other_allocation_failure++;
840 		return (NULL);
841 	}
842 
843 	if (o_cpop != NULL) {
844 		size_t	 offset = (o_cpop - old_cpop);
845 
846 		/* we only need to move the parts in use ... */
847 		(void) memmove(new_cpop, old_cpop, xnbp->xnb_cpop_sz *
848 		    (sizeof (*old_cpop)));
849 
850 		kmem_free(old_cpop, xnbp->xnb_cpop_sz * sizeof (*old_cpop));
851 
852 		ret_cpop = new_cpop + offset;
853 	} else {
854 		ret_cpop = new_cpop;
855 	}
856 
857 	xnbp->xnb_tx_cpop = new_cpop;
858 	xnbp->xnb_cpop_sz = newcount;
859 
860 	xnbp->xnb_stat_tx_cpoparea_grown++;
861 
862 	return (ret_cpop);
863 }
864 
865 /*
866  * Check whether an address is on a page that's foreign to this domain.
867  */
868 static boolean_t
869 is_foreign(void *addr)
870 {
871 	pfn_t	pfn = hat_getpfnum(kas.a_hat, addr);
872 
873 	return (pfn & PFN_IS_FOREIGN_MFN ? B_TRUE : B_FALSE);
874 }
875 
876 /*
877  * Insert a newly allocated mblk into a chain, replacing the old one.
878  */
879 static mblk_t *
880 replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
881 {
882 	uint32_t	start, stuff, end, value, flags;
883 	mblk_t		*new_mp;
884 
885 	new_mp = copyb(mp);
886 	if (new_mp == NULL)
887 		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message"
888 		    "for %p, len %lu", (void *) mp, len);
889 
890 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
891 	(void) hcksum_assoc(new_mp, NULL, NULL, start, stuff, end, value,
892 	    flags, KM_NOSLEEP);
893 
894 	new_mp->b_next = mp->b_next;
895 	new_mp->b_prev = mp->b_prev;
896 	new_mp->b_cont = mp->b_cont;
897 
898 	/* Make sure we only overwrite pointers to the mblk being replaced. */
899 	if (mp_prev != NULL && mp_prev->b_next == mp)
900 		mp_prev->b_next = new_mp;
901 
902 	if (ml_prev != NULL && ml_prev->b_cont == mp)
903 		ml_prev->b_cont = new_mp;
904 
905 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
906 	freemsg(mp);
907 
908 	return (new_mp);
909 }
910 
911 /*
912  * Set all the fields in a gnttab_copy_t.
913  */
914 static void
915 setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
916     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
917 {
918 	ASSERT(xnbp != NULL && gp != NULL);
919 
920 	gp->source.offset = s_off;
921 	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
922 	gp->source.domid = DOMID_SELF;
923 
924 	gp->len = (uint16_t)len;
925 	gp->flags = GNTCOPY_dest_gref;
926 	gp->status = 0;
927 
928 	gp->dest.u.ref = d_ref;
929 	gp->dest.offset = d_off;
930 	gp->dest.domid = xnbp->xnb_peer;
931 }
932 
933 mblk_t *
934 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
935 {
936 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
937 	mblk_t		*ml, *ml_prev;
938 	gnttab_copy_t	*gop_cp;
939 	boolean_t	notify;
940 	RING_IDX	loop, prod;
941 	int		i;
942 
943 	if (!xnbp->xnb_hv_copy)
944 		return (xnb_to_peer(xnbp, mp));
945 
946 	/*
947 	 * For each packet the sequence of operations is:
948 	 *
949 	 *  1. get a request slot from the ring.
950 	 *  2. set up data for hypercall (see NOTE below)
951 	 *  3. have the hypervisor copy the data
952 	 *  4. update the request slot.
953 	 *  5. kick the peer.
954 	 *
955 	 * NOTE ad 2.
956 	 *  In order to reduce the number of hypercalls, we prepare
957 	 *  several packets (mp->b_cont != NULL) for the peer and
958 	 *  perform a single hypercall to transfer them.
959 	 *  We also have to set up a separate copy operation for
960 	 *  every page.
961 	 *
962 	 * If we have more than one message (mp->b_next != NULL),
963 	 * we do this whole dance repeatedly.
964 	 */
965 
966 	mutex_enter(&xnbp->xnb_tx_lock);
967 
968 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
969 		mutex_exit(&xnbp->xnb_tx_lock);
970 		DTRACE_PROBE(copy_tx_too_early);
971 		xnbp->xnb_stat_tx_too_early++;
972 		return (mp);
973 	}
974 
975 	loop = xnbp->xnb_rx_ring.req_cons;
976 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
977 
978 	while ((mp != NULL) &&
979 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
980 		netif_rx_request_t	*rxreq;
981 		netif_rx_response_t	*rxresp;
982 		size_t			offset, d_offset;
983 		size_t			len;
984 		uint16_t		cksum_flags;
985 		int16_t			status = NETIF_RSP_OKAY;
986 		int			item_count;
987 
988 		/* 1 */
989 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
990 
991 #ifdef XNB_DEBUG
992 		if (!(rxreq->id < NET_RX_RING_SIZE))
993 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
994 			    "id %d out of range in request 0x%p",
995 			    rxreq->id, (void *)rxreq);
996 		if (rxreq->gref >= NR_GRANT_ENTRIES)
997 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
998 			    "grant ref %d out of range in request 0x%p",
999 			    rxreq->gref, (void *)rxreq);
1000 #endif /* XNB_DEBUG */
1001 
1002 		/* 2 */
1003 		d_offset = offset = TX_BUFFER_HEADROOM;
1004 		len = 0;
1005 		item_count = 0;
1006 
1007 		gop_cp = xnbp->xnb_tx_cpop;
1008 
1009 		/*
1010 		 * We walk the b_cont pointers and set up a gop_cp
1011 		 * structure for every page in every data block we have.
1012 		 */
1013 		/* 2a */
1014 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
1015 			size_t	chunk = ml->b_wptr - ml->b_rptr;
1016 			uchar_t	*r_tmp,	*rpt_align;
1017 			size_t	r_offset;
1018 
1019 			/*
1020 			 * If we get an mblk on a page that doesn't belong to
1021 			 * this domain, get a new mblk to replace the old one.
1022 			 */
1023 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
1024 				mblk_t *ml_new = replace_msg(ml, chunk,
1025 				    mp_prev, ml_prev);
1026 
1027 				/* We can still use old ml, but not *ml! */
1028 				if (free == ml)
1029 					free = ml_new;
1030 				if (mp == ml)
1031 					mp = ml_new;
1032 				ml = ml_new;
1033 
1034 				xnbp->xnb_stat_tx_foreign_page++;
1035 			}
1036 
1037 			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
1038 			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
1039 			r_tmp = ml->b_rptr;
1040 
1041 			if (d_offset + chunk > PAGESIZE)
1042 				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
1043 				    "(svd: %p), ml %p,rpt_alg. %p, d_offset "
1044 				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
1045 				    (void *)mp, (void *)saved_mp, (void *)ml,
1046 				    (void *)rpt_align,
1047 				    d_offset, chunk, (int)PAGESIZE);
1048 
1049 			while (chunk > 0) {
1050 				size_t part_len;
1051 
1052 				item_count++;
1053 				if (item_count > xnbp->xnb_cpop_sz) {
1054 					gop_cp = grow_cpop_area(xnbp, gop_cp);
1055 					if (gop_cp == NULL)
1056 						goto failure;
1057 				}
1058 				/*
1059 				 * If our mblk crosses a page boundary, we need
1060 				 * to do a separate copy for every page.
1061 				 */
1062 				if (r_offset + chunk > PAGESIZE) {
1063 					part_len = PAGESIZE - r_offset;
1064 
1065 					DTRACE_PROBE3(mblk_page_crossed,
1066 					    (mblk_t *), ml, int, chunk, int,
1067 					    (int)r_offset);
1068 
1069 					xnbp->xnb_stat_tx_pagebndry_crossed++;
1070 				} else {
1071 					part_len = chunk;
1072 				}
1073 
1074 				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
1075 				    d_offset, part_len, rxreq->gref);
1076 
1077 				chunk -= part_len;
1078 
1079 				len += part_len;
1080 				d_offset += part_len;
1081 				r_tmp += part_len;
1082 				/*
1083 				 * The 2nd, 3rd ... last copies will always
1084 				 * start at r_tmp, therefore r_offset is 0.
1085 				 */
1086 				r_offset = 0;
1087 				gop_cp++;
1088 			}
1089 			ml_prev = ml;
1090 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
1091 			    chunk, int, len, int, item_count);
1092 		}
1093 		/* 3 */
1094 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_tx_cpop,
1095 		    item_count) != 0) {
1096 			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
1097 			DTRACE_PROBE(HV_granttableopfailed);
1098 		}
1099 
1100 		/* 4 */
1101 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
1102 		rxresp->offset = offset;
1103 
1104 		rxresp->flags = 0;
1105 
1106 		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
1107 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1108 		    (int)rxresp->status);
1109 
1110 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
1111 		if (cksum_flags != 0)
1112 			xnbp->xnb_stat_tx_cksum_deferred++;
1113 		rxresp->flags |= cksum_flags;
1114 
1115 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
1116 		rxresp->status = len;
1117 
1118 		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
1119 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1120 		    (int)rxresp->status);
1121 
1122 		for (i = 0; i < item_count; i++) {
1123 			if (xnbp->xnb_tx_cpop[i].status != 0) {
1124 				DTRACE_PROBE2(cpop__status__nonnull, int,
1125 				    (int)xnbp->xnb_tx_cpop[i].status,
1126 				    int, i);
1127 				status = NETIF_RSP_ERROR;
1128 			}
1129 		}
1130 
1131 		/* 5.2 */
1132 		if (status != NETIF_RSP_OKAY) {
1133 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
1134 			    status;
1135 		} else {
1136 			xnbp->xnb_stat_opackets++;
1137 			xnbp->xnb_stat_obytes += len;
1138 		}
1139 
1140 		loop++;
1141 		prod++;
1142 		mp_prev = mp;
1143 		mp = mp->b_next;
1144 	}
1145 failure:
1146 	/*
1147 	 * Did we actually do anything?
1148 	 */
1149 	if (loop == xnbp->xnb_rx_ring.req_cons) {
1150 		mutex_exit(&xnbp->xnb_tx_lock);
1151 		return (mp);
1152 	}
1153 
1154 	/*
1155 	 * Unlink the end of the 'done' list from the remainder.
1156 	 */
1157 	ASSERT(mp_prev != NULL);
1158 	mp_prev->b_next = NULL;
1159 
1160 	xnbp->xnb_rx_ring.req_cons = loop;
1161 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
1162 
1163 	/* 6 */
1164 	/* LINTED: constant in conditional context */
1165 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
1166 	if (notify) {
1167 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1168 		xnbp->xnb_stat_tx_notify_sent++;
1169 	} else {
1170 		xnbp->xnb_stat_tx_notify_deferred++;
1171 	}
1172 
1173 	if (mp != NULL)
1174 		xnbp->xnb_stat_xmit_defer++;
1175 
1176 	mutex_exit(&xnbp->xnb_tx_lock);
1177 
1178 	/* Free mblk_t structs we have consumed. */
1179 	freemsgchain(free);
1180 
1181 	return (mp);
1182 }
1183 
1184 /*ARGSUSED*/
1185 static int
1186 xnb_rxbuf_constructor(void *buf, void *arg, int kmflag)
1187 {
1188 	xnb_rxbuf_t *rxp = buf;
1189 
1190 	bzero(rxp, sizeof (*rxp));
1191 
1192 	rxp->xr_free_rtn.free_func = xnb_rx_complete;
1193 	rxp->xr_free_rtn.free_arg = (caddr_t)rxp;
1194 
1195 	rxp->xr_mop.host_addr =
1196 	    (uint64_t)(uintptr_t)vmem_alloc(heap_arena, PAGESIZE,
1197 	    ((kmflag & KM_NOSLEEP) == KM_NOSLEEP) ?
1198 	    VM_NOSLEEP : VM_SLEEP);
1199 
1200 	if (rxp->xr_mop.host_addr == NULL) {
1201 		cmn_err(CE_WARN, "xnb_rxbuf_constructor: "
1202 		    "cannot get address space");
1203 		return (-1);
1204 	}
1205 
1206 	/*
1207 	 * Have the hat ensure that page table exists for the VA.
1208 	 */
1209 	hat_prepare_mapping(kas.a_hat,
1210 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
1211 
1212 	return (0);
1213 }
1214 
1215 /*ARGSUSED*/
1216 static void
1217 xnb_rxbuf_destructor(void *buf, void *arg)
1218 {
1219 	xnb_rxbuf_t *rxp = buf;
1220 
1221 	ASSERT(rxp->xr_mop.host_addr != NULL);
1222 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
1223 
1224 	hat_release_mapping(kas.a_hat,
1225 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
1226 	vmem_free(heap_arena,
1227 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr, PAGESIZE);
1228 }
1229 
1230 static void
1231 xnb_rx_notify_peer(xnb_t *xnbp)
1232 {
1233 	boolean_t notify;
1234 
1235 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1236 
1237 	/* LINTED: constant in conditional context */
1238 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
1239 	if (notify) {
1240 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1241 		xnbp->xnb_stat_rx_notify_sent++;
1242 	} else {
1243 		xnbp->xnb_stat_rx_notify_deferred++;
1244 	}
1245 }
1246 
1247 static void
1248 xnb_rx_complete(xnb_rxbuf_t *rxp)
1249 {
1250 	xnb_t *xnbp = rxp->xr_xnbp;
1251 
1252 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
1253 
1254 	mutex_enter(&xnbp->xnb_rx_lock);
1255 	xnb_rx_schedule_unmop(xnbp, &rxp->xr_mop, rxp);
1256 	mutex_exit(&xnbp->xnb_rx_lock);
1257 }
1258 
1259 static void
1260 xnb_rx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
1261 {
1262 	RING_IDX i;
1263 	netif_tx_response_t *txresp;
1264 
1265 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1266 
1267 	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
1268 
1269 	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
1270 	txresp->id = id;
1271 	txresp->status = status;
1272 
1273 	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
1274 
1275 	/*
1276 	 * Note that we don't push the change to the peer here - that
1277 	 * is the callers responsibility.
1278 	 */
1279 }
1280 
1281 static void
1282 xnb_rx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop,
1283     xnb_rxbuf_t *rxp)
1284 {
1285 	gnttab_unmap_grant_ref_t	*unmop;
1286 	int				u_count;
1287 	int				reqs_on_ring;
1288 
1289 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1290 	ASSERT(xnbp->xnb_rx_unmop_count < NET_TX_RING_SIZE);
1291 
1292 	u_count = xnbp->xnb_rx_unmop_count++;
1293 
1294 	/* Cache data for the time when we actually unmap grant refs */
1295 	xnbp->xnb_rx_unmop_rxp[u_count] = rxp;
1296 
1297 	unmop = &xnbp->xnb_rx_unmop[u_count];
1298 	unmop->host_addr = mop->host_addr;
1299 	unmop->dev_bus_addr = mop->dev_bus_addr;
1300 	unmop->handle = mop->handle;
1301 
1302 	/*
1303 	 * We cannot check the ring once we're disconnected from it. Batching
1304 	 * doesn't seem to be a useful optimisation in this case either,
1305 	 * so we directly call into the actual unmap function.
1306 	 */
1307 	if (xnbp->xnb_connected) {
1308 		reqs_on_ring = RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring);
1309 
1310 		/*
1311 		 * By tuning xnb_unmop_hiwat to N, we can emulate "N per batch"
1312 		 * or (with N == 1) "immediate unmop" behaviour.
1313 		 * The "> xnb_unmop_lowwat" is a guard against ring exhaustion.
1314 		 */
1315 		if (xnbp->xnb_rx_unmop_count < xnb_unmop_hiwat &&
1316 		    reqs_on_ring > xnb_unmop_lowwat)
1317 			return;
1318 	}
1319 
1320 	xnb_rx_perform_pending_unmop(xnbp);
1321 }
1322 
1323 /*
1324  * Here we perform the actual unmapping of the data that was
1325  * accumulated in xnb_rx_schedule_unmop().
1326  * Note that it is the caller's responsibility to make sure that
1327  * there's actually something there to unmop.
1328  */
1329 static void
1330 xnb_rx_perform_pending_unmop(xnb_t *xnbp)
1331 {
1332 	RING_IDX loop;
1333 #ifdef XNB_DEBUG
1334 	gnttab_unmap_grant_ref_t *unmop;
1335 #endif /* XNB_DEBUG */
1336 
1337 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1338 	ASSERT(xnbp->xnb_rx_unmop_count > 0);
1339 
1340 	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1341 	    xnbp->xnb_rx_unmop, xnbp->xnb_rx_unmop_count) < 0) {
1342 		cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
1343 		    "unmap grant operation failed, "
1344 		    "%d pages lost", xnbp->xnb_rx_unmop_count);
1345 	}
1346 
1347 #ifdef XNB_DEBUG
1348 	for (loop = 0, unmop = xnbp->xnb_rx_unmop;
1349 	    loop < xnbp->xnb_rx_unmop_count;
1350 	    loop++, unmop++) {
1351 		if (unmop->status != 0) {
1352 			cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
1353 			    "unmap grant reference failed (%d)",
1354 			    unmop->status);
1355 		}
1356 	}
1357 #endif /* XNB_DEBUG */
1358 
1359 	for (loop = 0; loop < xnbp->xnb_rx_unmop_count; loop++) {
1360 		xnb_rxbuf_t	*rxp = xnbp->xnb_rx_unmop_rxp[loop];
1361 
1362 		if (rxp == NULL)
1363 			cmn_err(CE_PANIC,
1364 			    "xnb_rx_perform_pending_unmop: "
1365 			    "unexpected NULL rxp (loop %d; count %d)!",
1366 			    loop, xnbp->xnb_rx_unmop_count);
1367 
1368 		if (xnbp->xnb_connected)
1369 			xnb_rx_mark_complete(xnbp, rxp->xr_id, rxp->xr_status);
1370 		xnb_rxbuf_put(xnbp, rxp);
1371 	}
1372 	if (xnbp->xnb_connected)
1373 		xnb_rx_notify_peer(xnbp);
1374 
1375 	xnbp->xnb_rx_unmop_count = 0;
1376 
1377 #ifdef XNB_DEBUG
1378 	bzero(xnbp->xnb_rx_unmop, sizeof (xnbp->xnb_rx_unmop));
1379 	bzero(xnbp->xnb_rx_unmop_rxp, sizeof (xnbp->xnb_rx_unmop_rxp));
1380 #endif /* XNB_DEBUG */
1381 }
1382 
1383 static xnb_rxbuf_t *
1384 xnb_rxbuf_get(xnb_t *xnbp, int flags)
1385 {
1386 	xnb_rxbuf_t *rxp;
1387 
1388 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1389 
1390 	rxp = kmem_cache_alloc(xnb_rxbuf_cachep, flags);
1391 	if (rxp != NULL) {
1392 		ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
1393 		rxp->xr_flags |= XNB_RXBUF_INUSE;
1394 
1395 		rxp->xr_xnbp = xnbp;
1396 		rxp->xr_mop.dom = xnbp->xnb_peer;
1397 
1398 		rxp->xr_mop.flags = GNTMAP_host_map;
1399 		if (!xnbp->xnb_rx_pages_writable)
1400 			rxp->xr_mop.flags |= GNTMAP_readonly;
1401 
1402 		xnbp->xnb_rx_buf_count++;
1403 	}
1404 
1405 	return (rxp);
1406 }
1407 
1408 static void
1409 xnb_rxbuf_put(xnb_t *xnbp, xnb_rxbuf_t *rxp)
1410 {
1411 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1412 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
1413 
1414 	rxp->xr_flags &= ~XNB_RXBUF_INUSE;
1415 	xnbp->xnb_rx_buf_count--;
1416 
1417 	kmem_cache_free(xnb_rxbuf_cachep, rxp);
1418 }
1419 
1420 static mblk_t *
1421 xnb_recv(xnb_t *xnbp)
1422 {
1423 	RING_IDX start, end, loop;
1424 	gnttab_map_grant_ref_t *mop;
1425 	xnb_rxbuf_t **rxpp;
1426 	netif_tx_request_t *txreq;
1427 	boolean_t work_to_do;
1428 	mblk_t *head, *tail;
1429 	/*
1430 	 * If the peer granted a read-only mapping to the page then we
1431 	 * must copy the data, as the local protocol stack (should the
1432 	 * packet be destined for this host) will modify the packet
1433 	 * 'in place'.
1434 	 */
1435 	boolean_t copy = !xnbp->xnb_rx_pages_writable;
1436 
1437 	/*
1438 	 * For each individual request, the sequence of actions is:
1439 	 *
1440 	 * 1. get the request.
1441 	 * 2. map the page based on the grant ref.
1442 	 * 3. allocate an mblk, copy the data to it.
1443 	 * 4. release the grant.
1444 	 * 5. update the ring.
1445 	 * 6. pass the packet upward.
1446 	 * 7. kick the peer.
1447 	 *
1448 	 * In fact, we try to perform the grant operations in batches,
1449 	 * so there are two loops.
1450 	 */
1451 
1452 	head = tail = NULL;
1453 around:
1454 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1455 
1456 	/* LINTED: constant in conditional context */
1457 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
1458 	if (!work_to_do) {
1459 finished:
1460 		return (head);
1461 	}
1462 
1463 	start = xnbp->xnb_tx_ring.req_cons;
1464 	end = xnbp->xnb_tx_ring.sring->req_prod;
1465 
1466 	for (loop = start, mop = xnbp->xnb_rx_mop, rxpp = xnbp->xnb_rx_bufp;
1467 	    loop != end;
1468 	    loop++, mop++, rxpp++) {
1469 		xnb_rxbuf_t *rxp;
1470 
1471 		rxp = xnb_rxbuf_get(xnbp, KM_NOSLEEP);
1472 		if (rxp == NULL)
1473 			break;
1474 
1475 		ASSERT(xnbp->xnb_rx_pages_writable ||
1476 		    ((rxp->xr_mop.flags & GNTMAP_readonly)
1477 		    == GNTMAP_readonly));
1478 
1479 		rxp->xr_mop.ref =
1480 		    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop)->gref;
1481 
1482 		ASSERT(rxp->xr_mop.ref < NR_GRANT_ENTRIES);
1483 
1484 		*mop = rxp->xr_mop;
1485 		*rxpp = rxp;
1486 	}
1487 
1488 	if ((loop - start) == 0)
1489 		goto finished;
1490 
1491 	end = loop;
1492 
1493 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1494 	    xnbp->xnb_rx_mop, end - start) != 0) {
1495 
1496 		cmn_err(CE_WARN, "xnb_recv: map grant operation failed");
1497 
1498 		loop = start;
1499 		rxpp = xnbp->xnb_rx_bufp;
1500 
1501 		while (loop != end) {
1502 			xnb_rxbuf_put(xnbp, *rxpp);
1503 
1504 			loop++;
1505 			rxpp++;
1506 		}
1507 
1508 		goto finished;
1509 	}
1510 
1511 	for (loop = start, mop = xnbp->xnb_rx_mop, rxpp = xnbp->xnb_rx_bufp;
1512 	    loop != end;
1513 	    loop++, mop++, rxpp++) {
1514 		mblk_t *mp = NULL;
1515 		int16_t status = NETIF_RSP_OKAY;
1516 		xnb_rxbuf_t *rxp = *rxpp;
1517 
1518 		if (mop->status != 0) {
1519 			cmn_err(CE_WARN, "xnb_recv: "
1520 			    "failed to map buffer: %d",
1521 			    mop->status);
1522 			status = NETIF_RSP_ERROR;
1523 		}
1524 
1525 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1526 
1527 		if (status == NETIF_RSP_OKAY) {
1528 			if (copy) {
1529 				mp = allocb(txreq->size, BPRI_MED);
1530 				if (mp == NULL) {
1531 					status = NETIF_RSP_ERROR;
1532 					xnbp->xnb_stat_rx_allocb_failed++;
1533 				} else {
1534 					bcopy((caddr_t)(uintptr_t)
1535 					    mop->host_addr + txreq->offset,
1536 					    mp->b_wptr, txreq->size);
1537 					mp->b_wptr += txreq->size;
1538 				}
1539 			} else {
1540 				mp = desballoc((uchar_t *)(uintptr_t)
1541 				    mop->host_addr + txreq->offset,
1542 				    txreq->size, 0, &rxp->xr_free_rtn);
1543 				if (mp == NULL) {
1544 					status = NETIF_RSP_ERROR;
1545 					xnbp->xnb_stat_rx_allocb_failed++;
1546 				} else {
1547 					rxp->xr_id = txreq->id;
1548 					rxp->xr_status = status;
1549 					rxp->xr_mop = *mop;
1550 
1551 					mp->b_wptr += txreq->size;
1552 				}
1553 			}
1554 
1555 			/*
1556 			 * If we have a buffer and there are checksum
1557 			 * flags, process them appropriately.
1558 			 */
1559 			if ((mp != NULL) &&
1560 			    ((txreq->flags &
1561 			    (NETTXF_csum_blank | NETTXF_data_validated))
1562 			    != 0)) {
1563 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
1564 				    mp, txreq->flags);
1565 				xnbp->xnb_stat_rx_cksum_no_need++;
1566 			}
1567 		}
1568 
1569 		if (copy || (mp == NULL)) {
1570 			rxp->xr_status = status;
1571 			rxp->xr_id = txreq->id;
1572 			xnb_rx_schedule_unmop(xnbp, mop, rxp);
1573 		}
1574 
1575 		if (mp != NULL) {
1576 			xnbp->xnb_stat_ipackets++;
1577 			xnbp->xnb_stat_rbytes += txreq->size;
1578 
1579 			mp->b_next = NULL;
1580 			if (head == NULL) {
1581 				ASSERT(tail == NULL);
1582 				head = mp;
1583 			} else {
1584 				ASSERT(tail != NULL);
1585 				tail->b_next = mp;
1586 			}
1587 			tail = mp;
1588 		}
1589 	}
1590 
1591 	xnbp->xnb_tx_ring.req_cons = loop;
1592 
1593 	goto around;
1594 	/* NOTREACHED */
1595 }
1596 
1597 /*
1598  *  intr() -- ring interrupt service routine
1599  */
1600 static uint_t
1601 xnb_intr(caddr_t arg)
1602 {
1603 	xnb_t *xnbp = (xnb_t *)arg;
1604 	mblk_t *mp;
1605 
1606 	xnbp->xnb_stat_intr++;
1607 
1608 	mutex_enter(&xnbp->xnb_rx_lock);
1609 
1610 	ASSERT(xnbp->xnb_connected);
1611 
1612 	mp = xnb_recv(xnbp);
1613 
1614 	mutex_exit(&xnbp->xnb_rx_lock);
1615 
1616 	if (!xnbp->xnb_hotplugged) {
1617 		xnbp->xnb_stat_rx_too_early++;
1618 		goto fail;
1619 	}
1620 	if (mp == NULL) {
1621 		xnbp->xnb_stat_spurious_intr++;
1622 		goto fail;
1623 	}
1624 
1625 	xnbp->xnb_flavour->xf_recv(xnbp, mp);
1626 
1627 	return (DDI_INTR_CLAIMED);
1628 
1629 fail:
1630 	freemsgchain(mp);
1631 	return (DDI_INTR_CLAIMED);
1632 }
1633 
1634 static boolean_t
1635 xnb_connect_rings(dev_info_t *dip)
1636 {
1637 	xnb_t *xnbp = ddi_get_driver_private(dip);
1638 	char *oename;
1639 	struct gnttab_map_grant_ref map_op;
1640 	evtchn_port_t evtchn;
1641 	int i;
1642 
1643 	/*
1644 	 * Cannot attempt to connect the rings if already connected.
1645 	 */
1646 	ASSERT(!xnbp->xnb_connected);
1647 
1648 	oename = xvdi_get_oename(dip);
1649 
1650 	if (xenbus_gather(XBT_NULL, oename,
1651 	    "event-channel", "%u", &evtchn,
1652 	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
1653 	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
1654 	    NULL) != 0) {
1655 		cmn_err(CE_WARN, "xnb_connect_rings: "
1656 		    "cannot read other-end details from %s",
1657 		    oename);
1658 		goto fail;
1659 	}
1660 
1661 	if (xenbus_scanf(XBT_NULL, oename,
1662 	    "feature-tx-writable", "%d", &i) != 0)
1663 		i = 0;
1664 	if (i != 0)
1665 		xnbp->xnb_rx_pages_writable = B_TRUE;
1666 
1667 	if (xenbus_scanf(XBT_NULL, oename,
1668 	    "feature-no-csum-offload", "%d", &i) != 0)
1669 		i = 0;
1670 	if ((i == 1) || !xnbp->xnb_cksum_offload)
1671 		xnbp->xnb_cksum_offload = B_FALSE;
1672 
1673 	/* Check whether our peer knows and requests hypervisor copy */
1674 	if (xenbus_scanf(XBT_NULL, oename, "request-rx-copy", "%d", &i)
1675 	    != 0)
1676 		i = 0;
1677 	if (i != 0)
1678 		xnbp->xnb_hv_copy = B_TRUE;
1679 
1680 	/*
1681 	 * 1. allocate a vaddr for the tx page, one for the rx page.
1682 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1683 	 *    into the allocated vaddr (one for tx, one for rx).
1684 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1685 	 *    bound to this domain.
1686 	 * 4. associate the event channel with an interrupt.
1687 	 * 5. declare ourselves connected.
1688 	 * 6. enable the interrupt.
1689 	 */
1690 
1691 	/* 1.tx */
1692 	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1693 	    0, 0, 0, 0, VM_SLEEP);
1694 	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
1695 
1696 	/* 2.tx */
1697 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
1698 	map_op.flags = GNTMAP_host_map;
1699 	map_op.ref = xnbp->xnb_tx_ring_ref;
1700 	map_op.dom = xnbp->xnb_peer;
1701 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1702 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1703 	    &map_op, 1) != 0 || map_op.status != 0) {
1704 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1705 		goto fail;
1706 	}
1707 	xnbp->xnb_tx_ring_handle = map_op.handle;
1708 
1709 	/* LINTED: constant in conditional context */
1710 	BACK_RING_INIT(&xnbp->xnb_tx_ring,
1711 	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1712 
1713 	/* 1.rx */
1714 	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1715 	    0, 0, 0, 0, VM_SLEEP);
1716 	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
1717 
1718 	/* 2.rx */
1719 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
1720 	map_op.flags = GNTMAP_host_map;
1721 	map_op.ref = xnbp->xnb_rx_ring_ref;
1722 	map_op.dom = xnbp->xnb_peer;
1723 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1724 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1725 	    &map_op, 1) != 0 || map_op.status != 0) {
1726 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1727 		goto fail;
1728 	}
1729 	xnbp->xnb_rx_ring_handle = map_op.handle;
1730 
1731 	/* LINTED: constant in conditional context */
1732 	BACK_RING_INIT(&xnbp->xnb_rx_ring,
1733 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
1734 
1735 	/* 3 */
1736 	if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS) {
1737 		cmn_err(CE_WARN, "xnb_connect_rings: "
1738 		    "cannot bind event channel %d", xnbp->xnb_evtchn);
1739 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1740 		goto fail;
1741 	}
1742 	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
1743 
1744 	/*
1745 	 * It would be good to set the state to XenbusStateConnected
1746 	 * here as well, but then what if ddi_add_intr() failed?
1747 	 * Changing the state in the store will be noticed by the peer
1748 	 * and cannot be "taken back".
1749 	 */
1750 	mutex_enter(&xnbp->xnb_tx_lock);
1751 	mutex_enter(&xnbp->xnb_rx_lock);
1752 
1753 	/* 5.1 */
1754 	xnbp->xnb_connected = B_TRUE;
1755 
1756 	mutex_exit(&xnbp->xnb_rx_lock);
1757 	mutex_exit(&xnbp->xnb_tx_lock);
1758 
1759 	/* 4, 6 */
1760 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1761 	    != DDI_SUCCESS) {
1762 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1763 		goto fail;
1764 	}
1765 	xnbp->xnb_irq = B_TRUE;
1766 
1767 	/* 5.2 */
1768 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1769 
1770 	return (B_TRUE);
1771 
1772 fail:
1773 	mutex_enter(&xnbp->xnb_tx_lock);
1774 	mutex_enter(&xnbp->xnb_rx_lock);
1775 
1776 	xnbp->xnb_connected = B_FALSE;
1777 	mutex_exit(&xnbp->xnb_rx_lock);
1778 	mutex_exit(&xnbp->xnb_tx_lock);
1779 
1780 	return (B_FALSE);
1781 }
1782 
1783 static void
1784 xnb_disconnect_rings(dev_info_t *dip)
1785 {
1786 	xnb_t *xnbp = ddi_get_driver_private(dip);
1787 
1788 	if (xnbp->xnb_irq) {
1789 		ddi_remove_intr(dip, 0, NULL);
1790 		xnbp->xnb_irq = B_FALSE;
1791 	}
1792 
1793 	if (xnbp->xnb_rx_unmop_count > 0)
1794 		xnb_rx_perform_pending_unmop(xnbp);
1795 
1796 	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
1797 		xvdi_free_evtchn(dip);
1798 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1799 	}
1800 
1801 	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
1802 		struct gnttab_unmap_grant_ref unmap_op;
1803 
1804 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1805 		    xnbp->xnb_rx_ring_addr;
1806 		unmap_op.dev_bus_addr = 0;
1807 		unmap_op.handle = xnbp->xnb_rx_ring_handle;
1808 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1809 		    &unmap_op, 1) != 0)
1810 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1811 			    "cannot unmap rx-ring page (%d)",
1812 			    unmap_op.status);
1813 
1814 		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
1815 	}
1816 
1817 	if (xnbp->xnb_rx_ring_addr != NULL) {
1818 		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1819 		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
1820 		xnbp->xnb_rx_ring_addr = NULL;
1821 	}
1822 
1823 	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
1824 		struct gnttab_unmap_grant_ref unmap_op;
1825 
1826 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1827 		    xnbp->xnb_tx_ring_addr;
1828 		unmap_op.dev_bus_addr = 0;
1829 		unmap_op.handle = xnbp->xnb_tx_ring_handle;
1830 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1831 		    &unmap_op, 1) != 0)
1832 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1833 			    "cannot unmap tx-ring page (%d)",
1834 			    unmap_op.status);
1835 
1836 		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
1837 	}
1838 
1839 	if (xnbp->xnb_tx_ring_addr != NULL) {
1840 		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1841 		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
1842 		xnbp->xnb_tx_ring_addr = NULL;
1843 	}
1844 }
1845 
1846 /*ARGSUSED*/
1847 static void
1848 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1849     void *arg, void *impl_data)
1850 {
1851 	xnb_t *xnbp = ddi_get_driver_private(dip);
1852 	XenbusState new_state = *(XenbusState *)impl_data;
1853 
1854 	ASSERT(xnbp != NULL);
1855 
1856 	switch (new_state) {
1857 	case XenbusStateConnected:
1858 		if (xnb_connect_rings(dip)) {
1859 			xnbp->xnb_flavour->xf_peer_connected(xnbp);
1860 		} else {
1861 			xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1862 			xnb_disconnect_rings(dip);
1863 			(void) xvdi_switch_state(dip, XBT_NULL,
1864 			    XenbusStateClosed);
1865 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1866 		}
1867 
1868 		/*
1869 		 * Now that we've attempted to connect it's reasonable
1870 		 * to allow an attempt to detach.
1871 		 */
1872 		xnbp->xnb_detachable = B_TRUE;
1873 
1874 		break;
1875 
1876 	case XenbusStateClosing:
1877 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1878 
1879 		break;
1880 
1881 	case XenbusStateClosed:
1882 		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1883 
1884 		mutex_enter(&xnbp->xnb_tx_lock);
1885 		mutex_enter(&xnbp->xnb_rx_lock);
1886 
1887 		xnb_disconnect_rings(dip);
1888 		xnbp->xnb_connected = B_FALSE;
1889 
1890 		mutex_exit(&xnbp->xnb_rx_lock);
1891 		mutex_exit(&xnbp->xnb_tx_lock);
1892 
1893 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1894 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1895 		/*
1896 		 * In all likelyhood this is already set (in the above
1897 		 * case), but if the peer never attempted to connect
1898 		 * and the domain is destroyed we get here without
1899 		 * having been through the case above, so we set it to
1900 		 * be sure.
1901 		 */
1902 		xnbp->xnb_detachable = B_TRUE;
1903 
1904 		break;
1905 
1906 	default:
1907 		break;
1908 	}
1909 }
1910 
1911 /*ARGSUSED*/
1912 static void
1913 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1914     void *arg, void *impl_data)
1915 {
1916 	xnb_t *xnbp = ddi_get_driver_private(dip);
1917 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
1918 	boolean_t success;
1919 
1920 	ASSERT(xnbp != NULL);
1921 
1922 	switch (state) {
1923 	case Connected:
1924 
1925 		success = xnbp->xnb_flavour->xf_hotplug_connected(xnbp);
1926 
1927 		mutex_enter(&xnbp->xnb_tx_lock);
1928 		mutex_enter(&xnbp->xnb_rx_lock);
1929 
1930 		xnbp->xnb_hotplugged = success;
1931 
1932 		mutex_exit(&xnbp->xnb_rx_lock);
1933 		mutex_exit(&xnbp->xnb_tx_lock);
1934 		break;
1935 
1936 	default:
1937 		break;
1938 	}
1939 }
1940 
1941 static struct modldrv modldrv = {
1942 	&mod_miscops, "xnb module %I%",
1943 };
1944 
1945 static struct modlinkage modlinkage = {
1946 	MODREV_1, &modldrv, NULL
1947 };
1948 
1949 int
1950 _init(void)
1951 {
1952 	int i;
1953 
1954 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
1955 
1956 	xnb_rxbuf_cachep = kmem_cache_create("xnb_rxbuf_cachep",
1957 	    sizeof (xnb_rxbuf_t), 0, xnb_rxbuf_constructor,
1958 	    xnb_rxbuf_destructor, NULL, NULL, NULL, 0);
1959 	ASSERT(xnb_rxbuf_cachep != NULL);
1960 
1961 	i = mod_install(&modlinkage);
1962 	if (i != DDI_SUCCESS) {
1963 		kmem_cache_destroy(xnb_rxbuf_cachep);
1964 		mutex_destroy(&xnb_alloc_page_lock);
1965 	}
1966 	return (i);
1967 }
1968 
1969 int
1970 _info(struct modinfo *modinfop)
1971 {
1972 	return (mod_info(&modlinkage, modinfop));
1973 }
1974 
1975 int
1976 _fini(void)
1977 {
1978 	int i;
1979 
1980 	i = mod_remove(&modlinkage);
1981 	if (i == DDI_SUCCESS) {
1982 		kmem_cache_destroy(xnb_rxbuf_cachep);
1983 		mutex_destroy(&xnb_alloc_page_lock);
1984 	}
1985 	return (i);
1986 }
1987