xref: /illumos-gate/usr/src/uts/common/xen/io/xnb.c (revision 06fb6a368cb1af862cff62b9a1fd89171e9ac63a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #ifdef DEBUG
30 #define	XNB_DEBUG 1
31 #endif /* DEBUG */
32 
33 #include "xnb.h"
34 
35 #include <sys/sunddi.h>
36 #include <sys/sunndi.h>
37 #include <sys/modctl.h>
38 #include <sys/conf.h>
39 #include <sys/mac.h>
40 #include <sys/dlpi.h>
41 #include <sys/strsubr.h>
42 #include <sys/strsun.h>
43 #include <sys/types.h>
44 #include <sys/pattr.h>
45 #include <vm/seg_kmem.h>
46 #include <vm/hat_i86.h>
47 #include <xen/sys/xenbus_impl.h>
48 #include <xen/sys/xendev.h>
49 #include <sys/balloon_impl.h>
50 #include <sys/evtchn_impl.h>
51 #include <sys/gnttab.h>
52 #include <vm/vm_dep.h>
53 
54 #include <sys/gld.h>
55 #include <inet/ip.h>
56 #include <inet/ip_impl.h>
57 #include <sys/vnic_impl.h> /* blech. */
58 
59 /*
60  * The terms "transmit" and "receive" are used in their traditional
61  * sense here - packets from other parts of this system are
62  * "transmitted" to the peer domain and those originating from the
63  * peer are "received".
64  *
65  * In some cases this can be confusing, because various data
66  * structures are shared with the domU driver, which has the opposite
67  * view of what constitutes "transmit" and "receive".  In naming the
68  * shared structures the domU driver always wins.
69  */
70 
71 /*
72  * XXPV dme: things to do, as well as various things indicated
73  * throughout the source:
74  * - copy avoidance outbound.
75  * - copy avoidance inbound.
76  * - transfer credit limiting.
77  * - MAC address based filtering.
78  */
79 
80 /*
81  * Linux expects to have some headroom in received buffers.  The Linux
82  * frontend driver (netfront) checks to see if the headroom is
83  * available and will re-allocate the buffer to make room if
84  * necessary.  To avoid this we add TX_BUFFER_HEADROOM bytes of
85  * headroom to each packet we pass to the peer.
86  */
87 #define	TX_BUFFER_HEADROOM	16
88 
/*
 * Default for the per-instance checksum offload setting.  xnb_attach()
 * copies this into each xnb_t and advertises the inverse to the peer
 * as "feature-no-csum-offload".
 */
static boolean_t	xnb_cksum_offload = B_TRUE;

/*
 * Forward declarations: ring connection management and the xenbus
 * event handlers registered by xnb_attach().
 */
static boolean_t	xnb_connect_rings(dev_info_t *);
static void		xnb_disconnect_rings(dev_info_t *);
static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);

/*
 * Forward declarations: buffer management and completion handling for
 * the "from peer" path, plus the hypervisor-copy "to peer" routine.
 */
static int	xnb_rxbuf_constructor(void *, void *, int);
static void	xnb_rxbuf_destructor(void *, void *);
static xnb_rxbuf_t *xnb_rxbuf_get(xnb_t *, int);
static void	xnb_rxbuf_put(xnb_t *, xnb_rxbuf_t *);
static void	xnb_rx_notify_peer(xnb_t *);
static void	xnb_rx_complete(xnb_rxbuf_t *);
static void	xnb_rx_mark_complete(xnb_t *, RING_IDX, int16_t);
static void 	xnb_rx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *,
    xnb_rxbuf_t *);
static void	xnb_rx_perform_pending_unmop(xnb_t *);
mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);

/*
 * Low and high watermarks, expressed as one quarter and three quarters
 * of NET_TX_RING_SIZE.
 * NOTE(review): presumably these bound the number of pending unmap
 * operations; the consumers are not in this part of the file - confirm.
 */
int		xnb_unmop_lowwat = NET_TX_RING_SIZE >> 2;
int		xnb_unmop_hiwat = NET_TX_RING_SIZE - (NET_TX_RING_SIZE >> 2);


/*
 * Global feature switches.  xnb_attach() publishes xnb_hv_copy as
 * "feature-rx-copy" and xnb_explicit_pageflip_set as "feature-rx-flip",
 * so they have to be set before a guest domain is started.
 */
boolean_t	xnb_hv_copy = B_TRUE;
boolean_t	xnb_explicit_pageflip_set = B_FALSE;

/* XXPV dme: are these really invalid? */
#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define	INVALID_GRANT_REF	((grant_ref_t)-1)

/*
 * xnb_alloc_page_lock protects the batch of pages cached inside
 * xnb_alloc_page(), which is shared by all instances.
 * NOTE(review): xnb_rxbuf_cachep is presumably the kmem cache behind
 * xnb_rxbuf_get()/xnb_rxbuf_put(); its users are not visible here.
 */
static kmem_cache_t *xnb_rxbuf_cachep;
static kmutex_t	xnb_alloc_page_lock;
123 
/*
 * Statistics.
 *
 * Names of the auxiliary named kstats exported by each instance.  One
 * kstat_named_t is created per entry by xnb_ks_init(), and
 * xnb_ks_aux_update() fills the values in positionally, so the order
 * of this table and the order of the assignments in that function
 * must be kept in step.
 */
static char *aux_statistics[] = {
	"tx_cksum_deferred",
	"rx_cksum_no_need",
	"tx_rsp_notok",
	"tx_notify_deferred",
	"tx_notify_sent",
	"rx_notify_deferred",
	"rx_notify_sent",
	"tx_too_early",
	"rx_too_early",
	"rx_allocb_failed",
	"tx_allocb_failed",
	"tx_foreign_page",
	"mac_full",
	"spurious_intr",
	"allocation_success",
	"allocation_failure",
	"small_allocation_success",
	"small_allocation_failure",
	"other_allocation_failure",
	"tx_pageboundary_crossed",
	"tx_cpoparea_grown",
	"csum_hardware",
	"csum_software",
};
152 
153 static int
154 xnb_ks_aux_update(kstat_t *ksp, int flag)
155 {
156 	xnb_t *xnbp;
157 	kstat_named_t *knp;
158 
159 	if (flag != KSTAT_READ)
160 		return (EACCES);
161 
162 	xnbp = ksp->ks_private;
163 	knp = ksp->ks_data;
164 
165 	/*
166 	 * Assignment order should match that of the names in
167 	 * aux_statistics.
168 	 */
169 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_deferred;
170 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_no_need;
171 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_rsp_notok;
172 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
173 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
174 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
175 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
176 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
177 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
178 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
179 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
180 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_foreign_page;
181 	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
182 	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
183 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
184 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
185 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
186 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
187 	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
188 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_pagebndry_crossed;
189 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cpoparea_grown;
190 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
191 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
192 
193 	return (0);
194 }
195 
196 static boolean_t
197 xnb_ks_init(xnb_t *xnbp)
198 {
199 	int nstat = sizeof (aux_statistics) /
200 	    sizeof (aux_statistics[0]);
201 	char **cp = aux_statistics;
202 	kstat_named_t *knp;
203 
204 	/*
205 	 * Create and initialise kstats.
206 	 */
207 	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
208 	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
209 	    KSTAT_TYPE_NAMED, nstat, 0);
210 	if (xnbp->xnb_kstat_aux == NULL)
211 		return (B_FALSE);
212 
213 	xnbp->xnb_kstat_aux->ks_private = xnbp;
214 	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
215 
216 	knp = xnbp->xnb_kstat_aux->ks_data;
217 	while (nstat > 0) {
218 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
219 
220 		knp++;
221 		cp++;
222 		nstat--;
223 	}
224 
225 	kstat_install(xnbp->xnb_kstat_aux);
226 
227 	return (B_TRUE);
228 }
229 
230 static void
231 xnb_ks_free(xnb_t *xnbp)
232 {
233 	kstat_delete(xnbp->xnb_kstat_aux);
234 }
235 
236 /*
237  * Software checksum calculation and insertion for an arbitrary packet.
238  */
239 /*ARGSUSED*/
240 static mblk_t *
241 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
242 {
243 	/*
244 	 * XXPV dme: shouldn't rely on vnic_fix_cksum(), not least
245 	 * because it doesn't cover all of the interesting cases :-(
246 	 */
247 	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
248 	    HCK_FULLCKSUM, KM_NOSLEEP);
249 
250 	return (vnic_fix_cksum(mp));
251 }
252 
253 mblk_t *
254 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
255 {
256 	struct ether_header *ehp;
257 	uint16_t sap;
258 	uint32_t offset;
259 	ipha_t *ipha;
260 
261 	ASSERT(mp->b_next == NULL);
262 
263 	/*
264 	 * Check that the packet is contained in a single mblk.  In
265 	 * the "from peer" path this is true today, but will change
266 	 * when scatter gather support is added.  In the "to peer"
267 	 * path we cannot be sure, but in most cases it will be true
268 	 * (in the xnbo case the packet has come from a MAC device
269 	 * which is unlikely to split packets).
270 	 */
271 	if (mp->b_cont != NULL)
272 		goto software;
273 
274 	/*
275 	 * If the MAC has no hardware capability don't do any further
276 	 * checking.
277 	 */
278 	if (capab == 0)
279 		goto software;
280 
281 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
282 	ehp = (struct ether_header *)mp->b_rptr;
283 
284 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
285 		struct ether_vlan_header *evhp;
286 
287 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
288 		evhp = (struct ether_vlan_header *)mp->b_rptr;
289 		sap = ntohs(evhp->ether_type);
290 		offset = sizeof (struct ether_vlan_header);
291 	} else {
292 		sap = ntohs(ehp->ether_type);
293 		offset = sizeof (struct ether_header);
294 	}
295 
296 	/*
297 	 * We only attempt to do IPv4 packets in hardware.
298 	 */
299 	if (sap != ETHERTYPE_IP)
300 		goto software;
301 
302 	/*
303 	 * We know that this is an IPv4 packet.
304 	 */
305 	ipha = (ipha_t *)(mp->b_rptr + offset);
306 
307 	switch (ipha->ipha_protocol) {
308 	case IPPROTO_TCP:
309 	case IPPROTO_UDP:
310 		/*
311 		 * This is a TCP/IPv4 or UDP/IPv4 packet.
312 		 *
313 		 * If the capabilities indicate that full checksum
314 		 * offload is available, use it.
315 		 */
316 		if ((capab & HCKSUM_INET_FULL_V4) != 0) {
317 			(void) hcksum_assoc(mp, NULL, NULL,
318 			    0, 0, 0, 0,
319 			    HCK_FULLCKSUM, KM_NOSLEEP);
320 
321 			xnbp->xnb_stat_csum_hardware++;
322 
323 			return (mp);
324 		}
325 
326 		/*
327 		 * XXPV dme: If the capabilities indicate that partial
328 		 * checksum offload is available, we should use it.
329 		 */
330 
331 		break;
332 
333 	default:
334 		/* Use software. */
335 		break;
336 	}
337 
338 software:
339 	/*
340 	 * We are not able to use any offload so do the whole thing in
341 	 * software.
342 	 */
343 	xnbp->xnb_stat_csum_software++;
344 
345 	return (xnb_software_csum(xnbp, mp));
346 }
347 
348 int
349 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
350 {
351 	xnb_t *xnbp;
352 	char *xsname, mac[ETHERADDRL * 3];
353 
354 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
355 
356 	xnbp->xnb_flavour = flavour;
357 	xnbp->xnb_flavour_data = flavour_data;
358 	xnbp->xnb_devinfo = dip;
359 	xnbp->xnb_evtchn = INVALID_EVTCHN;
360 	xnbp->xnb_irq = B_FALSE;
361 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
362 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
363 	xnbp->xnb_cksum_offload = xnb_cksum_offload;
364 	xnbp->xnb_connected = B_FALSE;
365 	xnbp->xnb_hotplugged = B_FALSE;
366 	xnbp->xnb_detachable = B_FALSE;
367 	xnbp->xnb_peer = xvdi_get_oeid(dip);
368 	xnbp->xnb_rx_pages_writable = B_FALSE;
369 
370 	xnbp->xnb_rx_buf_count = 0;
371 	xnbp->xnb_rx_unmop_count = 0;
372 
373 	xnbp->xnb_hv_copy = B_FALSE;
374 
375 	xnbp->xnb_tx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
376 	ASSERT(xnbp->xnb_tx_va != NULL);
377 
378 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
379 	    != DDI_SUCCESS)
380 		goto failure;
381 
382 	/* allocated on demand, when/if we enter xnb_copy_to_peer() */
383 	xnbp->xnb_tx_cpop = NULL;
384 	xnbp->xnb_cpop_sz = 0;
385 
386 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
387 	    xnbp->xnb_icookie);
388 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
389 	    xnbp->xnb_icookie);
390 
391 	/* set driver private pointer now */
392 	ddi_set_driver_private(dip, xnbp);
393 
394 	if (!xnb_ks_init(xnbp))
395 		goto failure_1;
396 
397 	/*
398 	 * Receive notification of changes in the state of the
399 	 * driver in the guest domain.
400 	 */
401 	if (xvdi_add_event_handler(dip, XS_OE_STATE,
402 	    xnb_oe_state_change) != DDI_SUCCESS)
403 		goto failure_2;
404 
405 	/*
406 	 * Receive notification of hotplug events.
407 	 */
408 	if (xvdi_add_event_handler(dip, XS_HP_STATE,
409 	    xnb_hp_state_change) != DDI_SUCCESS)
410 		goto failure_2;
411 
412 	xsname = xvdi_get_xsname(dip);
413 
414 	if (xenbus_printf(XBT_NULL, xsname,
415 	    "feature-no-csum-offload", "%d",
416 	    xnbp->xnb_cksum_offload ? 0 : 1) != 0)
417 		goto failure_3;
418 
419 	/*
420 	 * Use global xnb_hv_copy to export this feature. This means that
421 	 * we have to decide what to do before starting up a guest domain
422 	 */
423 	if (xenbus_printf(XBT_NULL, xsname,
424 	    "feature-rx-copy", "%d", xnb_hv_copy ? 1 : 0) != 0)
425 		goto failure_3;
426 	/*
427 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
428 	 * in addition to "feature-rx-copy" being 1. It seems strange
429 	 * to use four possible states to describe a binary decision,
430 	 * but we might as well play nice.
431 	 */
432 	if (xenbus_printf(XBT_NULL, xsname,
433 	    "feature-rx-flip", "%d", xnb_explicit_pageflip_set ? 1 : 0) != 0)
434 		goto failure_3;
435 
436 	if (xenbus_scanf(XBT_NULL, xsname,
437 	    "mac", "%s", mac) != 0) {
438 		cmn_err(CE_WARN, "xnb_attach: "
439 		    "cannot read mac address from %s",
440 		    xsname);
441 		goto failure_3;
442 	}
443 
444 	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
445 		cmn_err(CE_WARN,
446 		    "xnb_attach: cannot parse mac address %s",
447 		    mac);
448 		goto failure_3;
449 	}
450 
451 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
452 	(void) xvdi_post_event(dip, XEN_HP_ADD);
453 
454 	return (DDI_SUCCESS);
455 
456 failure_3:
457 	xvdi_remove_event_handler(dip, NULL);
458 
459 failure_2:
460 	xnb_ks_free(xnbp);
461 
462 failure_1:
463 	mutex_destroy(&xnbp->xnb_rx_lock);
464 	mutex_destroy(&xnbp->xnb_tx_lock);
465 
466 failure:
467 	vmem_free(heap_arena, xnbp->xnb_tx_va, PAGESIZE);
468 	kmem_free(xnbp, sizeof (*xnbp));
469 	return (DDI_FAILURE);
470 }
471 
472 /*ARGSUSED*/
473 void
474 xnb_detach(dev_info_t *dip)
475 {
476 	xnb_t *xnbp = ddi_get_driver_private(dip);
477 
478 	ASSERT(xnbp != NULL);
479 	ASSERT(!xnbp->xnb_connected);
480 	ASSERT(xnbp->xnb_rx_buf_count == 0);
481 
482 	xnb_disconnect_rings(dip);
483 
484 	xvdi_remove_event_handler(dip, NULL);
485 
486 	xnb_ks_free(xnbp);
487 
488 	ddi_set_driver_private(dip, NULL);
489 
490 	mutex_destroy(&xnbp->xnb_tx_lock);
491 	mutex_destroy(&xnbp->xnb_rx_lock);
492 
493 	if (xnbp->xnb_cpop_sz > 0)
494 		kmem_free(xnbp->xnb_tx_cpop, sizeof (*xnbp->xnb_tx_cpop)
495 		    * xnbp->xnb_cpop_sz);
496 
497 	ASSERT(xnbp->xnb_tx_va != NULL);
498 	vmem_free(heap_arena, xnbp->xnb_tx_va, PAGESIZE);
499 
500 	kmem_free(xnbp, sizeof (*xnbp));
501 }
502 
503 
504 static mfn_t
505 xnb_alloc_page(xnb_t *xnbp)
506 {
507 #define	WARNING_RATE_LIMIT 100
508 #define	BATCH_SIZE 256
509 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
510 	static int nth = BATCH_SIZE;
511 	mfn_t mfn;
512 
513 	mutex_enter(&xnb_alloc_page_lock);
514 	if (nth == BATCH_SIZE) {
515 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
516 			xnbp->xnb_stat_allocation_failure++;
517 			mutex_exit(&xnb_alloc_page_lock);
518 
519 			/*
520 			 * Try for a single page in low memory situations.
521 			 */
522 			if (balloon_alloc_pages(1, &mfn) != 1) {
523 				if ((xnbp->xnb_stat_small_allocation_failure++
524 				    % WARNING_RATE_LIMIT) == 0)
525 					cmn_err(CE_WARN, "xnb_alloc_page: "
526 					    "Cannot allocate memory to "
527 					    "transfer packets to peer.");
528 				return (0);
529 			} else {
530 				xnbp->xnb_stat_small_allocation_success++;
531 				return (mfn);
532 			}
533 		}
534 
535 		nth = 0;
536 		xnbp->xnb_stat_allocation_success++;
537 	}
538 
539 	mfn = mfns[nth++];
540 	mutex_exit(&xnb_alloc_page_lock);
541 
542 	ASSERT(mfn != 0);
543 
544 	return (mfn);
545 #undef BATCH_SIZE
546 #undef WARNING_RATE_LIMIT
547 }
548 
549 /*ARGSUSED*/
550 static void
551 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
552 {
553 	int r;
554 	pfn_t pfn;
555 
556 	pfn = xen_assign_pfn(mfn);
557 	pfnzero(pfn, 0, PAGESIZE);
558 	xen_release_pfn(pfn);
559 
560 	/*
561 	 * This happens only in the error path, so batching is
562 	 * not worth the complication.
563 	 */
564 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
565 		cmn_err(CE_WARN, "free_page: cannot decrease memory "
566 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
567 		    r, mfn);
568 	}
569 }
570 
/*
 * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but
 * using local variables.
 *
 * The expansion refers to two variables that are NOT arguments and
 * must be in scope where the macro is used: `loop', the caller's
 * request consumer index, and `prod', the caller's private response
 * producer index.
 *
 * The value is the smaller of (req_prod - loop) and
 * (RING_SIZE(_r) - (loop - prod)), so it is non-zero exactly when both
 * of those quantities are non-zero.
 */
#define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
	((((_r)->sring->req_prod - loop) <		\
		(RING_SIZE(_r) - (loop - prod))) ?	\
	    ((_r)->sring->req_prod - loop) :		\
	    (RING_SIZE(_r) - (loop - prod)))
580 
581 mblk_t *
582 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
583 {
584 	mblk_t *free = mp, *prev = NULL;
585 	size_t len;
586 	gnttab_transfer_t *gop;
587 	boolean_t notify;
588 	RING_IDX loop, prod, end;
589 
590 	/*
591 	 * For each packet the sequence of operations is:
592 	 *
593 	 * 1. get a new page from the hypervisor.
594 	 * 2. get a request slot from the ring.
595 	 * 3. copy the data into the new page.
596 	 * 4. transfer the page to the peer.
597 	 * 5. update the request slot.
598 	 * 6. kick the peer.
599 	 * 7. free mp.
600 	 *
601 	 * In order to reduce the number of hypercalls, we prepare
602 	 * several packets for the peer and perform a single hypercall
603 	 * to transfer them.
604 	 */
605 
606 	mutex_enter(&xnbp->xnb_tx_lock);
607 
608 	/*
609 	 * If we are not connected to the peer or have not yet
610 	 * finished hotplug it is too early to pass packets to the
611 	 * peer.
612 	 */
613 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
614 		mutex_exit(&xnbp->xnb_tx_lock);
615 		DTRACE_PROBE(flip_tx_too_early);
616 		xnbp->xnb_stat_tx_too_early++;
617 		return (mp);
618 	}
619 
620 	loop = xnbp->xnb_rx_ring.req_cons;
621 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
622 	gop = xnbp->xnb_tx_top;
623 
624 	while ((mp != NULL) &&
625 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
626 
627 		mfn_t mfn;
628 		pfn_t pfn;
629 		netif_rx_request_t *rxreq;
630 		netif_rx_response_t *rxresp;
631 		char *valoop;
632 		size_t offset;
633 		mblk_t *ml;
634 		uint16_t cksum_flags;
635 
636 		/* 1 */
637 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
638 			xnbp->xnb_stat_xmit_defer++;
639 			break;
640 		}
641 
642 		/* 2 */
643 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
644 
645 #ifdef XNB_DEBUG
646 		if (!(rxreq->id < NET_RX_RING_SIZE))
647 			cmn_err(CE_PANIC, "xnb_to_peer: "
648 			    "id %d out of range in request 0x%p",
649 			    rxreq->id, (void *)rxreq);
650 #endif /* XNB_DEBUG */
651 
652 		/* Assign a pfn and map the new page at the allocated va. */
653 		pfn = xen_assign_pfn(mfn);
654 		hat_devload(kas.a_hat, xnbp->xnb_tx_va, PAGESIZE,
655 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
656 
657 		offset = TX_BUFFER_HEADROOM;
658 
659 		/* 3 */
660 		len = 0;
661 		valoop = xnbp->xnb_tx_va + offset;
662 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
663 			size_t chunk = ml->b_wptr - ml->b_rptr;
664 
665 			bcopy(ml->b_rptr, valoop, chunk);
666 			valoop += chunk;
667 			len += chunk;
668 		}
669 
670 		ASSERT(len + offset < PAGESIZE);
671 
672 		/* Release the pfn. */
673 		hat_unload(kas.a_hat, xnbp->xnb_tx_va, PAGESIZE,
674 		    HAT_UNLOAD_UNMAP);
675 		xen_release_pfn(pfn);
676 
677 		/* 4 */
678 		gop->mfn = mfn;
679 		gop->domid = xnbp->xnb_peer;
680 		gop->ref = rxreq->gref;
681 
682 		/* 5.1 */
683 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
684 		rxresp->offset = offset;
685 		rxresp->flags = 0;
686 
687 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
688 		if (cksum_flags != 0)
689 			xnbp->xnb_stat_tx_cksum_deferred++;
690 		rxresp->flags |= cksum_flags;
691 
692 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
693 		rxresp->status = len;
694 
695 		loop++;
696 		prod++;
697 		gop++;
698 		prev = mp;
699 		mp = mp->b_next;
700 	}
701 
702 	/*
703 	 * Did we actually do anything?
704 	 */
705 	if (loop == xnbp->xnb_rx_ring.req_cons) {
706 		mutex_exit(&xnbp->xnb_tx_lock);
707 		return (mp);
708 	}
709 
710 	end = loop;
711 
712 	/*
713 	 * Unlink the end of the 'done' list from the remainder.
714 	 */
715 	ASSERT(prev != NULL);
716 	prev->b_next = NULL;
717 
718 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_tx_top,
719 	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
720 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
721 	}
722 
723 	loop = xnbp->xnb_rx_ring.req_cons;
724 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
725 	gop = xnbp->xnb_tx_top;
726 
727 	while (loop < end) {
728 		int16_t status = NETIF_RSP_OKAY;
729 
730 		if (gop->status != 0) {
731 			status = NETIF_RSP_ERROR;
732 
733 			/*
734 			 * If the status is anything other than
735 			 * GNTST_bad_page then we don't own the page
736 			 * any more, so don't try to give it back.
737 			 */
738 			if (gop->status != GNTST_bad_page)
739 				gop->mfn = 0;
740 		} else {
741 			/* The page is no longer ours. */
742 			gop->mfn = 0;
743 		}
744 
745 		if (gop->mfn != 0)
746 			/*
747 			 * Give back the page, as we won't be using
748 			 * it.
749 			 */
750 			xnb_free_page(xnbp, gop->mfn);
751 		else
752 			/*
753 			 * We gave away a page, update our accounting
754 			 * now.
755 			 */
756 			balloon_drv_subtracted(1);
757 
758 		/* 5.2 */
759 		if (status != NETIF_RSP_OKAY) {
760 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
761 			    status;
762 		} else {
763 			xnbp->xnb_stat_opackets++;
764 			xnbp->xnb_stat_obytes += len;
765 		}
766 
767 		loop++;
768 		prod++;
769 		gop++;
770 	}
771 
772 	xnbp->xnb_rx_ring.req_cons = loop;
773 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
774 
775 	/* 6 */
776 	/* LINTED: constant in conditional context */
777 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
778 	if (notify) {
779 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
780 		xnbp->xnb_stat_tx_notify_sent++;
781 	} else {
782 		xnbp->xnb_stat_tx_notify_deferred++;
783 	}
784 
785 	if (mp != NULL)
786 		xnbp->xnb_stat_xmit_defer++;
787 
788 	mutex_exit(&xnbp->xnb_tx_lock);
789 
790 	/* Free mblk_t's that we consumed. */
791 	freemsgchain(free);
792 
793 	return (mp);
794 }
795 
796 /* helper functions for xnb_copy_to_peer */
797 
798 /*
799  * Grow the array of copy operation descriptors.
800  * Returns a pointer to the next available entry.
801  */
802 gnttab_copy_t *
803 grow_cpop_area(xnb_t *xnbp, gnttab_copy_t *o_cpop)
804 {
805 	/*
806 	 * o_cpop (arg.1) is a ptr to the area we would like to copy
807 	 * something into but cannot, because we haven't alloc'ed it
808 	 * yet, or NULL.
809 	 * old_cpop and new_cpop (local) are pointers to old/new
810 	 * versions of xnbp->xnb_tx_cpop.
811 	 */
812 	gnttab_copy_t	*new_cpop, *old_cpop, *ret_cpop;
813 	size_t		newcount;
814 
815 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
816 
817 	old_cpop = xnbp->xnb_tx_cpop;
818 	/*
819 	 * o_cpop is a pointer into the array pointed to by old_cpop;
820 	 * it would be an error for exactly one of these pointers to be NULL.
821 	 * We shouldn't call this function if xnb_tx_cpop has already
822 	 * been allocated, but we're starting to fill it from the beginning
823 	 * again.
824 	 */
825 	ASSERT((o_cpop == NULL && old_cpop == NULL) ||
826 	    (o_cpop != NULL && old_cpop != NULL && o_cpop != old_cpop));
827 
828 	newcount = xnbp->xnb_cpop_sz + CPOP_DEFCNT;
829 
830 	new_cpop = kmem_alloc(sizeof (*new_cpop) * newcount, KM_NOSLEEP);
831 	if (new_cpop == NULL) {
832 		xnbp->xnb_stat_other_allocation_failure++;
833 		return (NULL);
834 	}
835 
836 	if (o_cpop != NULL) {
837 		size_t	 offset = (o_cpop - old_cpop);
838 
839 		/* we only need to move the parts in use ... */
840 		(void) memmove(new_cpop, old_cpop, xnbp->xnb_cpop_sz *
841 		    (sizeof (*old_cpop)));
842 
843 		kmem_free(old_cpop, xnbp->xnb_cpop_sz * sizeof (*old_cpop));
844 
845 		ret_cpop = new_cpop + offset;
846 	} else {
847 		ret_cpop = new_cpop;
848 	}
849 
850 	xnbp->xnb_tx_cpop = new_cpop;
851 	xnbp->xnb_cpop_sz = newcount;
852 
853 	xnbp->xnb_stat_tx_cpoparea_grown++;
854 
855 	return (ret_cpop);
856 }
857 
858 /*
859  * Check whether an address is on a page that's foreign to this domain.
860  */
861 static boolean_t
862 is_foreign(void *addr)
863 {
864 	pfn_t	pfn = hat_getpfnum(kas.a_hat, addr);
865 
866 	return (pfn & PFN_IS_FOREIGN_MFN ? B_TRUE : B_FALSE);
867 }
868 
869 /*
870  * Insert a newly allocated mblk into a chain, replacing the old one.
871  */
872 static mblk_t *
873 replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
874 {
875 	uint32_t	start, stuff, end, value, flags;
876 	mblk_t		*new_mp;
877 
878 	new_mp = copyb(mp);
879 	if (new_mp == NULL)
880 		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message"
881 		    "for %p, len %lu", (void *) mp, len);
882 
883 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
884 	(void) hcksum_assoc(new_mp, NULL, NULL, start, stuff, end, value,
885 	    flags, KM_NOSLEEP);
886 
887 	new_mp->b_next = mp->b_next;
888 	new_mp->b_prev = mp->b_prev;
889 	new_mp->b_cont = mp->b_cont;
890 
891 	/* Make sure we only overwrite pointers to the mblk being replaced. */
892 	if (mp_prev != NULL && mp_prev->b_next == mp)
893 		mp_prev->b_next = new_mp;
894 
895 	if (ml_prev != NULL && ml_prev->b_cont == mp)
896 		ml_prev->b_cont = new_mp;
897 
898 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
899 	freemsg(mp);
900 
901 	return (new_mp);
902 }
903 
904 /*
905  * Set all the fields in a gnttab_copy_t.
906  */
907 static void
908 setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
909     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
910 {
911 	ASSERT(xnbp != NULL && gp != NULL);
912 
913 	gp->source.offset = s_off;
914 	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
915 	gp->source.domid = DOMID_SELF;
916 
917 	gp->len = (uint16_t)len;
918 	gp->flags = GNTCOPY_dest_gref;
919 	gp->status = 0;
920 
921 	gp->dest.u.ref = d_ref;
922 	gp->dest.offset = d_off;
923 	gp->dest.domid = xnbp->xnb_peer;
924 }
925 
926 mblk_t *
927 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
928 {
929 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
930 	mblk_t		*ml, *ml_prev;
931 	gnttab_copy_t	*gop_cp;
932 	boolean_t	notify;
933 	RING_IDX	loop, prod;
934 	int		i;
935 
936 	if (!xnbp->xnb_hv_copy)
937 		return (xnb_to_peer(xnbp, mp));
938 
939 	/*
940 	 * For each packet the sequence of operations is:
941 	 *
942 	 *  1. get a request slot from the ring.
943 	 *  2. set up data for hypercall (see NOTE below)
944 	 *  3. have the hypervisore copy the data
945 	 *  4. update the request slot.
946 	 *  5. kick the peer.
947 	 *
948 	 * NOTE ad 2.
949 	 *  In order to reduce the number of hypercalls, we prepare
950 	 *  several packets (mp->b_cont != NULL) for the peer and
951 	 *  perform a single hypercall to transfer them.
	 *  We also have to set up a separate copy operation for
953 	 *  every page.
954 	 *
955 	 * If we have more than one message (mp->b_next != NULL),
956 	 * we do this whole dance repeatedly.
957 	 */
958 
959 	mutex_enter(&xnbp->xnb_tx_lock);
960 
961 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
962 		mutex_exit(&xnbp->xnb_tx_lock);
963 		DTRACE_PROBE(copy_tx_too_early);
964 		xnbp->xnb_stat_tx_too_early++;
965 		return (mp);
966 	}
967 
968 	loop = xnbp->xnb_rx_ring.req_cons;
969 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
970 
971 	while ((mp != NULL) &&
972 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
973 		netif_rx_request_t	*rxreq;
974 		netif_rx_response_t	*rxresp;
975 		size_t			offset, d_offset;
976 		size_t			len;
977 		uint16_t		cksum_flags;
978 		int16_t			status = NETIF_RSP_OKAY;
979 		int			item_count;
980 
981 		/* 1 */
982 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
983 
984 #ifdef XNB_DEBUG
985 		if (!(rxreq->id < NET_RX_RING_SIZE))
986 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
987 			    "id %d out of range in request 0x%p",
988 			    rxreq->id, (void *)rxreq);
989 #endif /* XNB_DEBUG */
990 
991 		/* 2 */
992 		d_offset = offset = TX_BUFFER_HEADROOM;
993 		len = 0;
994 		item_count = 0;
995 
996 		gop_cp = xnbp->xnb_tx_cpop;
997 
998 		/*
999 		 * We walk the b_cont pointers and set up a gop_cp
1000 		 * structure for every page in every data block we have.
1001 		 */
1002 		/* 2a */
1003 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
1004 			size_t	chunk = ml->b_wptr - ml->b_rptr;
1005 			uchar_t	*r_tmp,	*rpt_align;
1006 			size_t	r_offset;
1007 
1008 			/*
1009 			 * If we get an mblk on a page that doesn't belong to
1010 			 * this domain, get a new mblk to replace the old one.
1011 			 */
1012 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
1013 				mblk_t *ml_new = replace_msg(ml, chunk,
1014 				    mp_prev, ml_prev);
1015 
1016 				/* We can still use old ml, but not *ml! */
1017 				if (free == ml)
1018 					free = ml_new;
1019 				if (mp == ml)
1020 					mp = ml_new;
1021 				ml = ml_new;
1022 
1023 				xnbp->xnb_stat_tx_foreign_page++;
1024 			}
1025 
1026 			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
1027 			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
1028 			r_tmp = ml->b_rptr;
1029 
1030 			if (d_offset + chunk > PAGESIZE)
1031 				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
1032 				    "(svd: %p), ml %p,rpt_alg. %p, d_offset "
1033 				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
1034 				    (void *)mp, (void *)saved_mp, (void *)ml,
1035 				    (void *)rpt_align,
1036 				    d_offset, chunk, (int)PAGESIZE);
1037 
1038 			while (chunk > 0) {
1039 				size_t part_len;
1040 
1041 				item_count++;
1042 				if (item_count > xnbp->xnb_cpop_sz) {
1043 					gop_cp = grow_cpop_area(xnbp, gop_cp);
1044 					if (gop_cp == NULL)
1045 						goto failure;
1046 				}
1047 				/*
1048 				 * If our mblk crosses a page boundary, we need
				 * to do a separate copy for every page.
1050 				 */
1051 				if (r_offset + chunk > PAGESIZE) {
1052 					part_len = PAGESIZE - r_offset;
1053 
1054 					DTRACE_PROBE3(mblk_page_crossed,
1055 					    (mblk_t *), ml, int, chunk, int,
1056 					    (int)r_offset);
1057 
1058 					xnbp->xnb_stat_tx_pagebndry_crossed++;
1059 				} else {
1060 					part_len = chunk;
1061 				}
1062 
1063 				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
1064 				    d_offset, part_len, rxreq->gref);
1065 
1066 				chunk -= part_len;
1067 
1068 				len += part_len;
1069 				d_offset += part_len;
1070 				r_tmp += part_len;
1071 				/*
1072 				 * The 2nd, 3rd ... last copies will always
1073 				 * start at r_tmp, therefore r_offset is 0.
1074 				 */
1075 				r_offset = 0;
1076 				gop_cp++;
1077 			}
1078 			ml_prev = ml;
1079 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
1080 			    chunk, int, len, int, item_count);
1081 		}
1082 		/* 3 */
1083 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_tx_cpop,
1084 		    item_count) != 0) {
1085 			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
1086 			DTRACE_PROBE(HV_granttableopfailed);
1087 		}
1088 
1089 		/* 4 */
1090 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
1091 		rxresp->offset = offset;
1092 
1093 		rxresp->flags = 0;
1094 
1095 		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
1096 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1097 		    (int)rxresp->status);
1098 
1099 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
1100 		if (cksum_flags != 0)
1101 			xnbp->xnb_stat_tx_cksum_deferred++;
1102 		rxresp->flags |= cksum_flags;
1103 
1104 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
1105 		rxresp->status = len;
1106 
1107 		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
1108 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1109 		    (int)rxresp->status);
1110 
1111 		for (i = 0; i < item_count; i++) {
1112 			if (xnbp->xnb_tx_cpop[i].status != 0) {
1113 				DTRACE_PROBE2(cpop__status__nonnull, int,
1114 				    (int)xnbp->xnb_tx_cpop[i].status,
1115 				    int, i);
1116 				status = NETIF_RSP_ERROR;
1117 			}
1118 		}
1119 
1120 		/* 5.2 */
1121 		if (status != NETIF_RSP_OKAY) {
1122 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
1123 			    status;
1124 			xnbp->xnb_stat_tx_rsp_notok++;
1125 		} else {
1126 			xnbp->xnb_stat_opackets++;
1127 			xnbp->xnb_stat_obytes += len;
1128 		}
1129 
1130 		loop++;
1131 		prod++;
1132 		mp_prev = mp;
1133 		mp = mp->b_next;
1134 	}
1135 failure:
1136 	/*
1137 	 * Did we actually do anything?
1138 	 */
1139 	if (loop == xnbp->xnb_rx_ring.req_cons) {
1140 		mutex_exit(&xnbp->xnb_tx_lock);
1141 		return (mp);
1142 	}
1143 
1144 	/*
1145 	 * Unlink the end of the 'done' list from the remainder.
1146 	 */
1147 	ASSERT(mp_prev != NULL);
1148 	mp_prev->b_next = NULL;
1149 
1150 	xnbp->xnb_rx_ring.req_cons = loop;
1151 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
1152 
1153 	/* 6 */
1154 	/* LINTED: constant in conditional context */
1155 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
1156 	if (notify) {
1157 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1158 		xnbp->xnb_stat_tx_notify_sent++;
1159 	} else {
1160 		xnbp->xnb_stat_tx_notify_deferred++;
1161 	}
1162 
1163 	if (mp != NULL)
1164 		xnbp->xnb_stat_xmit_defer++;
1165 
1166 	mutex_exit(&xnbp->xnb_tx_lock);
1167 
1168 	/* Free mblk_t structs we have consumed. */
1169 	freemsgchain(free);
1170 
1171 	return (mp);
1172 }
1173 
1174 /*ARGSUSED*/
1175 static int
1176 xnb_rxbuf_constructor(void *buf, void *arg, int kmflag)
1177 {
1178 	xnb_rxbuf_t *rxp = buf;
1179 
1180 	bzero(rxp, sizeof (*rxp));
1181 
1182 	rxp->xr_free_rtn.free_func = xnb_rx_complete;
1183 	rxp->xr_free_rtn.free_arg = (caddr_t)rxp;
1184 
1185 	rxp->xr_mop.host_addr =
1186 	    (uint64_t)(uintptr_t)vmem_alloc(heap_arena, PAGESIZE,
1187 	    ((kmflag & KM_NOSLEEP) == KM_NOSLEEP) ?
1188 	    VM_NOSLEEP : VM_SLEEP);
1189 
1190 	if (rxp->xr_mop.host_addr == NULL) {
1191 		cmn_err(CE_WARN, "xnb_rxbuf_constructor: "
1192 		    "cannot get address space");
1193 		return (-1);
1194 	}
1195 
1196 	/*
1197 	 * Have the hat ensure that page table exists for the VA.
1198 	 */
1199 	hat_prepare_mapping(kas.a_hat,
1200 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
1201 
1202 	return (0);
1203 }
1204 
1205 /*ARGSUSED*/
1206 static void
1207 xnb_rxbuf_destructor(void *buf, void *arg)
1208 {
1209 	xnb_rxbuf_t *rxp = buf;
1210 
1211 	ASSERT(rxp->xr_mop.host_addr != NULL);
1212 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
1213 
1214 	hat_release_mapping(kas.a_hat,
1215 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
1216 	vmem_free(heap_arena,
1217 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr, PAGESIZE);
1218 }
1219 
1220 static void
1221 xnb_rx_notify_peer(xnb_t *xnbp)
1222 {
1223 	boolean_t notify;
1224 
1225 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1226 
1227 	/* LINTED: constant in conditional context */
1228 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
1229 	if (notify) {
1230 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1231 		xnbp->xnb_stat_rx_notify_sent++;
1232 	} else {
1233 		xnbp->xnb_stat_rx_notify_deferred++;
1234 	}
1235 }
1236 
1237 static void
1238 xnb_rx_complete(xnb_rxbuf_t *rxp)
1239 {
1240 	xnb_t *xnbp = rxp->xr_xnbp;
1241 
1242 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
1243 
1244 	mutex_enter(&xnbp->xnb_rx_lock);
1245 	xnb_rx_schedule_unmop(xnbp, &rxp->xr_mop, rxp);
1246 	mutex_exit(&xnbp->xnb_rx_lock);
1247 }
1248 
1249 static void
1250 xnb_rx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
1251 {
1252 	RING_IDX i;
1253 	netif_tx_response_t *txresp;
1254 
1255 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1256 
1257 	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
1258 
1259 	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
1260 	txresp->id = id;
1261 	txresp->status = status;
1262 
1263 	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
1264 
1265 	/*
1266 	 * Note that we don't push the change to the peer here - that
1267 	 * is the callers responsibility.
1268 	 */
1269 }
1270 
1271 static void
1272 xnb_rx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop,
1273     xnb_rxbuf_t *rxp)
1274 {
1275 	gnttab_unmap_grant_ref_t	*unmop;
1276 	int				u_count;
1277 	int				reqs_on_ring;
1278 
1279 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1280 	ASSERT(xnbp->xnb_rx_unmop_count < NET_TX_RING_SIZE);
1281 
1282 	u_count = xnbp->xnb_rx_unmop_count++;
1283 
1284 	/* Cache data for the time when we actually unmap grant refs */
1285 	xnbp->xnb_rx_unmop_rxp[u_count] = rxp;
1286 
1287 	unmop = &xnbp->xnb_rx_unmop[u_count];
1288 	unmop->host_addr = mop->host_addr;
1289 	unmop->dev_bus_addr = mop->dev_bus_addr;
1290 	unmop->handle = mop->handle;
1291 
1292 	/*
1293 	 * We cannot check the ring once we're disconnected from it. Batching
1294 	 * doesn't seem to be a useful optimisation in this case either,
1295 	 * so we directly call into the actual unmap function.
1296 	 */
1297 	if (xnbp->xnb_connected) {
1298 		reqs_on_ring = RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring);
1299 
1300 		/*
1301 		 * By tuning xnb_unmop_hiwat to N, we can emulate "N per batch"
1302 		 * or (with N == 1) "immediate unmop" behaviour.
1303 		 * The "> xnb_unmop_lowwat" is a guard against ring exhaustion.
1304 		 */
1305 		if (xnbp->xnb_rx_unmop_count < xnb_unmop_hiwat &&
1306 		    reqs_on_ring > xnb_unmop_lowwat)
1307 			return;
1308 	}
1309 
1310 	xnb_rx_perform_pending_unmop(xnbp);
1311 }
1312 
1313 /*
1314  * Here we perform the actual unmapping of the data that was
1315  * accumulated in xnb_rx_schedule_unmop().
1316  * Note that it is the caller's responsibility to make sure that
1317  * there's actually something there to unmop.
1318  */
1319 static void
1320 xnb_rx_perform_pending_unmop(xnb_t *xnbp)
1321 {
1322 	RING_IDX loop;
1323 #ifdef XNB_DEBUG
1324 	gnttab_unmap_grant_ref_t *unmop;
1325 #endif /* XNB_DEBUG */
1326 
1327 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1328 	ASSERT(xnbp->xnb_rx_unmop_count > 0);
1329 
1330 	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1331 	    xnbp->xnb_rx_unmop, xnbp->xnb_rx_unmop_count) < 0) {
1332 		cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
1333 		    "unmap grant operation failed, "
1334 		    "%d pages lost", xnbp->xnb_rx_unmop_count);
1335 	}
1336 
1337 #ifdef XNB_DEBUG
1338 	for (loop = 0, unmop = xnbp->xnb_rx_unmop;
1339 	    loop < xnbp->xnb_rx_unmop_count;
1340 	    loop++, unmop++) {
1341 		if (unmop->status != 0) {
1342 			cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
1343 			    "unmap grant reference failed (%d)",
1344 			    unmop->status);
1345 		}
1346 	}
1347 #endif /* XNB_DEBUG */
1348 
1349 	for (loop = 0; loop < xnbp->xnb_rx_unmop_count; loop++) {
1350 		xnb_rxbuf_t	*rxp = xnbp->xnb_rx_unmop_rxp[loop];
1351 
1352 		if (rxp == NULL)
1353 			cmn_err(CE_PANIC,
1354 			    "xnb_rx_perform_pending_unmop: "
1355 			    "unexpected NULL rxp (loop %d; count %d)!",
1356 			    loop, xnbp->xnb_rx_unmop_count);
1357 
1358 		if (xnbp->xnb_connected)
1359 			xnb_rx_mark_complete(xnbp, rxp->xr_id, rxp->xr_status);
1360 		xnb_rxbuf_put(xnbp, rxp);
1361 	}
1362 	if (xnbp->xnb_connected)
1363 		xnb_rx_notify_peer(xnbp);
1364 
1365 	xnbp->xnb_rx_unmop_count = 0;
1366 
1367 #ifdef XNB_DEBUG
1368 	bzero(xnbp->xnb_rx_unmop, sizeof (xnbp->xnb_rx_unmop));
1369 	bzero(xnbp->xnb_rx_unmop_rxp, sizeof (xnbp->xnb_rx_unmop_rxp));
1370 #endif /* XNB_DEBUG */
1371 }
1372 
1373 static xnb_rxbuf_t *
1374 xnb_rxbuf_get(xnb_t *xnbp, int flags)
1375 {
1376 	xnb_rxbuf_t *rxp;
1377 
1378 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1379 
1380 	rxp = kmem_cache_alloc(xnb_rxbuf_cachep, flags);
1381 	if (rxp != NULL) {
1382 		ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
1383 		rxp->xr_flags |= XNB_RXBUF_INUSE;
1384 
1385 		rxp->xr_xnbp = xnbp;
1386 		rxp->xr_mop.dom = xnbp->xnb_peer;
1387 
1388 		rxp->xr_mop.flags = GNTMAP_host_map;
1389 		if (!xnbp->xnb_rx_pages_writable)
1390 			rxp->xr_mop.flags |= GNTMAP_readonly;
1391 
1392 		xnbp->xnb_rx_buf_count++;
1393 	}
1394 
1395 	return (rxp);
1396 }
1397 
1398 static void
1399 xnb_rxbuf_put(xnb_t *xnbp, xnb_rxbuf_t *rxp)
1400 {
1401 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1402 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
1403 
1404 	rxp->xr_flags &= ~XNB_RXBUF_INUSE;
1405 	xnbp->xnb_rx_buf_count--;
1406 
1407 	kmem_cache_free(xnb_rxbuf_cachep, rxp);
1408 }
1409 
1410 static mblk_t *
1411 xnb_recv(xnb_t *xnbp)
1412 {
1413 	RING_IDX start, end, loop;
1414 	gnttab_map_grant_ref_t *mop;
1415 	xnb_rxbuf_t **rxpp;
1416 	netif_tx_request_t *txreq;
1417 	boolean_t work_to_do;
1418 	mblk_t *head, *tail;
1419 	/*
1420 	 * If the peer granted a read-only mapping to the page then we
1421 	 * must copy the data, as the local protocol stack (should the
1422 	 * packet be destined for this host) will modify the packet
1423 	 * 'in place'.
1424 	 */
1425 	boolean_t copy = !xnbp->xnb_rx_pages_writable;
1426 
1427 	/*
1428 	 * For each individual request, the sequence of actions is:
1429 	 *
1430 	 * 1. get the request.
1431 	 * 2. map the page based on the grant ref.
1432 	 * 3. allocate an mblk, copy the data to it.
1433 	 * 4. release the grant.
1434 	 * 5. update the ring.
1435 	 * 6. pass the packet upward.
1436 	 * 7. kick the peer.
1437 	 *
1438 	 * In fact, we try to perform the grant operations in batches,
1439 	 * so there are two loops.
1440 	 */
1441 
1442 	head = tail = NULL;
1443 around:
1444 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1445 
1446 	/* LINTED: constant in conditional context */
1447 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
1448 	if (!work_to_do) {
1449 finished:
1450 		return (head);
1451 	}
1452 
1453 	start = xnbp->xnb_tx_ring.req_cons;
1454 	end = xnbp->xnb_tx_ring.sring->req_prod;
1455 
1456 	for (loop = start, mop = xnbp->xnb_rx_mop, rxpp = xnbp->xnb_rx_bufp;
1457 	    loop != end;
1458 	    loop++, mop++, rxpp++) {
1459 		xnb_rxbuf_t *rxp;
1460 
1461 		rxp = xnb_rxbuf_get(xnbp, KM_NOSLEEP);
1462 		if (rxp == NULL)
1463 			break;
1464 
1465 		ASSERT(xnbp->xnb_rx_pages_writable ||
1466 		    ((rxp->xr_mop.flags & GNTMAP_readonly)
1467 		    == GNTMAP_readonly));
1468 
1469 		rxp->xr_mop.ref =
1470 		    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop)->gref;
1471 
1472 		*mop = rxp->xr_mop;
1473 		*rxpp = rxp;
1474 	}
1475 
1476 	if ((loop - start) == 0)
1477 		goto finished;
1478 
1479 	end = loop;
1480 
1481 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1482 	    xnbp->xnb_rx_mop, end - start) != 0) {
1483 
1484 		cmn_err(CE_WARN, "xnb_recv: map grant operation failed");
1485 
1486 		loop = start;
1487 		rxpp = xnbp->xnb_rx_bufp;
1488 
1489 		while (loop != end) {
1490 			xnb_rxbuf_put(xnbp, *rxpp);
1491 
1492 			loop++;
1493 			rxpp++;
1494 		}
1495 
1496 		goto finished;
1497 	}
1498 
1499 	for (loop = start, mop = xnbp->xnb_rx_mop, rxpp = xnbp->xnb_rx_bufp;
1500 	    loop != end;
1501 	    loop++, mop++, rxpp++) {
1502 		mblk_t *mp = NULL;
1503 		int16_t status = NETIF_RSP_OKAY;
1504 		xnb_rxbuf_t *rxp = *rxpp;
1505 
1506 		if (mop->status != 0) {
1507 			cmn_err(CE_WARN, "xnb_recv: "
1508 			    "failed to map buffer: %d",
1509 			    mop->status);
1510 			status = NETIF_RSP_ERROR;
1511 		}
1512 
1513 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1514 
1515 		if (status == NETIF_RSP_OKAY) {
1516 			if (copy) {
1517 				mp = allocb(txreq->size, BPRI_MED);
1518 				if (mp == NULL) {
1519 					status = NETIF_RSP_ERROR;
1520 					xnbp->xnb_stat_rx_allocb_failed++;
1521 				} else {
1522 					bcopy((caddr_t)(uintptr_t)
1523 					    mop->host_addr + txreq->offset,
1524 					    mp->b_wptr, txreq->size);
1525 					mp->b_wptr += txreq->size;
1526 				}
1527 			} else {
1528 				mp = desballoc((uchar_t *)(uintptr_t)
1529 				    mop->host_addr + txreq->offset,
1530 				    txreq->size, 0, &rxp->xr_free_rtn);
1531 				if (mp == NULL) {
1532 					status = NETIF_RSP_ERROR;
1533 					xnbp->xnb_stat_rx_allocb_failed++;
1534 				} else {
1535 					rxp->xr_id = txreq->id;
1536 					rxp->xr_status = status;
1537 					rxp->xr_mop = *mop;
1538 
1539 					mp->b_wptr += txreq->size;
1540 				}
1541 			}
1542 
1543 			/*
1544 			 * If we have a buffer and there are checksum
1545 			 * flags, process them appropriately.
1546 			 */
1547 			if ((mp != NULL) &&
1548 			    ((txreq->flags &
1549 			    (NETTXF_csum_blank | NETTXF_data_validated))
1550 			    != 0)) {
1551 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
1552 				    mp, txreq->flags);
1553 				xnbp->xnb_stat_rx_cksum_no_need++;
1554 			}
1555 		}
1556 
1557 		if (copy || (mp == NULL)) {
1558 			rxp->xr_status = status;
1559 			rxp->xr_id = txreq->id;
1560 			xnb_rx_schedule_unmop(xnbp, mop, rxp);
1561 		}
1562 
1563 		if (mp != NULL) {
1564 			xnbp->xnb_stat_ipackets++;
1565 			xnbp->xnb_stat_rbytes += txreq->size;
1566 
1567 			mp->b_next = NULL;
1568 			if (head == NULL) {
1569 				ASSERT(tail == NULL);
1570 				head = mp;
1571 			} else {
1572 				ASSERT(tail != NULL);
1573 				tail->b_next = mp;
1574 			}
1575 			tail = mp;
1576 		}
1577 	}
1578 
1579 	xnbp->xnb_tx_ring.req_cons = loop;
1580 
1581 	goto around;
1582 	/* NOTREACHED */
1583 }
1584 
1585 /*
1586  *  intr() -- ring interrupt service routine
1587  */
1588 static uint_t
1589 xnb_intr(caddr_t arg)
1590 {
1591 	xnb_t *xnbp = (xnb_t *)arg;
1592 	mblk_t *mp;
1593 
1594 	xnbp->xnb_stat_intr++;
1595 
1596 	mutex_enter(&xnbp->xnb_rx_lock);
1597 
1598 	ASSERT(xnbp->xnb_connected);
1599 
1600 	mp = xnb_recv(xnbp);
1601 
1602 	mutex_exit(&xnbp->xnb_rx_lock);
1603 
1604 	if (!xnbp->xnb_hotplugged) {
1605 		xnbp->xnb_stat_rx_too_early++;
1606 		goto fail;
1607 	}
1608 	if (mp == NULL) {
1609 		xnbp->xnb_stat_spurious_intr++;
1610 		goto fail;
1611 	}
1612 
1613 	xnbp->xnb_flavour->xf_recv(xnbp, mp);
1614 
1615 	return (DDI_INTR_CLAIMED);
1616 
1617 fail:
1618 	freemsgchain(mp);
1619 	return (DDI_INTR_CLAIMED);
1620 }
1621 
1622 static boolean_t
1623 xnb_connect_rings(dev_info_t *dip)
1624 {
1625 	xnb_t *xnbp = ddi_get_driver_private(dip);
1626 	char *oename;
1627 	struct gnttab_map_grant_ref map_op;
1628 	evtchn_port_t evtchn;
1629 	int i;
1630 
1631 	/*
1632 	 * Cannot attempt to connect the rings if already connected.
1633 	 */
1634 	ASSERT(!xnbp->xnb_connected);
1635 
1636 	oename = xvdi_get_oename(dip);
1637 
1638 	if (xenbus_gather(XBT_NULL, oename,
1639 	    "event-channel", "%u", &evtchn,
1640 	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
1641 	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
1642 	    NULL) != 0) {
1643 		cmn_err(CE_WARN, "xnb_connect_rings: "
1644 		    "cannot read other-end details from %s",
1645 		    oename);
1646 		goto fail;
1647 	}
1648 
1649 	if (xenbus_scanf(XBT_NULL, oename,
1650 	    "feature-tx-writable", "%d", &i) != 0)
1651 		i = 0;
1652 	if (i != 0)
1653 		xnbp->xnb_rx_pages_writable = B_TRUE;
1654 
1655 	if (xenbus_scanf(XBT_NULL, oename,
1656 	    "feature-no-csum-offload", "%d", &i) != 0)
1657 		i = 0;
1658 	if ((i == 1) || !xnbp->xnb_cksum_offload)
1659 		xnbp->xnb_cksum_offload = B_FALSE;
1660 
1661 	/* Check whether our peer knows and requests hypervisor copy */
1662 	if (xenbus_scanf(XBT_NULL, oename, "request-rx-copy", "%d", &i)
1663 	    != 0)
1664 		i = 0;
1665 	if (i != 0)
1666 		xnbp->xnb_hv_copy = B_TRUE;
1667 
1668 	/*
1669 	 * 1. allocate a vaddr for the tx page, one for the rx page.
1670 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1671 	 *    into the allocated vaddr (one for tx, one for rx).
1672 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1673 	 *    bound to this domain.
1674 	 * 4. associate the event channel with an interrupt.
1675 	 * 5. declare ourselves connected.
1676 	 * 6. enable the interrupt.
1677 	 */
1678 
1679 	/* 1.tx */
1680 	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1681 	    0, 0, 0, 0, VM_SLEEP);
1682 	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
1683 
1684 	/* 2.tx */
1685 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
1686 	map_op.flags = GNTMAP_host_map;
1687 	map_op.ref = xnbp->xnb_tx_ring_ref;
1688 	map_op.dom = xnbp->xnb_peer;
1689 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1690 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1691 	    &map_op, 1) != 0 || map_op.status != 0) {
1692 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1693 		goto fail;
1694 	}
1695 	xnbp->xnb_tx_ring_handle = map_op.handle;
1696 
1697 	/* LINTED: constant in conditional context */
1698 	BACK_RING_INIT(&xnbp->xnb_tx_ring,
1699 	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1700 
1701 	/* 1.rx */
1702 	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1703 	    0, 0, 0, 0, VM_SLEEP);
1704 	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
1705 
1706 	/* 2.rx */
1707 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
1708 	map_op.flags = GNTMAP_host_map;
1709 	map_op.ref = xnbp->xnb_rx_ring_ref;
1710 	map_op.dom = xnbp->xnb_peer;
1711 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1712 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1713 	    &map_op, 1) != 0 || map_op.status != 0) {
1714 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1715 		goto fail;
1716 	}
1717 	xnbp->xnb_rx_ring_handle = map_op.handle;
1718 
1719 	/* LINTED: constant in conditional context */
1720 	BACK_RING_INIT(&xnbp->xnb_rx_ring,
1721 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
1722 
1723 	/* 3 */
1724 	if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS) {
1725 		cmn_err(CE_WARN, "xnb_connect_rings: "
1726 		    "cannot bind event channel %d", xnbp->xnb_evtchn);
1727 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1728 		goto fail;
1729 	}
1730 	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
1731 
1732 	/*
1733 	 * It would be good to set the state to XenbusStateConnected
1734 	 * here as well, but then what if ddi_add_intr() failed?
1735 	 * Changing the state in the store will be noticed by the peer
1736 	 * and cannot be "taken back".
1737 	 */
1738 	mutex_enter(&xnbp->xnb_tx_lock);
1739 	mutex_enter(&xnbp->xnb_rx_lock);
1740 
1741 	/* 5.1 */
1742 	xnbp->xnb_connected = B_TRUE;
1743 
1744 	mutex_exit(&xnbp->xnb_rx_lock);
1745 	mutex_exit(&xnbp->xnb_tx_lock);
1746 
1747 	/* 4, 6 */
1748 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1749 	    != DDI_SUCCESS) {
1750 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1751 		goto fail;
1752 	}
1753 	xnbp->xnb_irq = B_TRUE;
1754 
1755 	/* 5.2 */
1756 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1757 
1758 	return (B_TRUE);
1759 
1760 fail:
1761 	mutex_enter(&xnbp->xnb_tx_lock);
1762 	mutex_enter(&xnbp->xnb_rx_lock);
1763 
1764 	xnbp->xnb_connected = B_FALSE;
1765 	mutex_exit(&xnbp->xnb_rx_lock);
1766 	mutex_exit(&xnbp->xnb_tx_lock);
1767 
1768 	return (B_FALSE);
1769 }
1770 
1771 static void
1772 xnb_disconnect_rings(dev_info_t *dip)
1773 {
1774 	xnb_t *xnbp = ddi_get_driver_private(dip);
1775 
1776 	if (xnbp->xnb_irq) {
1777 		ddi_remove_intr(dip, 0, NULL);
1778 		xnbp->xnb_irq = B_FALSE;
1779 	}
1780 
1781 	if (xnbp->xnb_rx_unmop_count > 0)
1782 		xnb_rx_perform_pending_unmop(xnbp);
1783 
1784 	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
1785 		xvdi_free_evtchn(dip);
1786 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1787 	}
1788 
1789 	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
1790 		struct gnttab_unmap_grant_ref unmap_op;
1791 
1792 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1793 		    xnbp->xnb_rx_ring_addr;
1794 		unmap_op.dev_bus_addr = 0;
1795 		unmap_op.handle = xnbp->xnb_rx_ring_handle;
1796 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1797 		    &unmap_op, 1) != 0)
1798 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1799 			    "cannot unmap rx-ring page (%d)",
1800 			    unmap_op.status);
1801 
1802 		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
1803 	}
1804 
1805 	if (xnbp->xnb_rx_ring_addr != NULL) {
1806 		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1807 		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
1808 		xnbp->xnb_rx_ring_addr = NULL;
1809 	}
1810 
1811 	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
1812 		struct gnttab_unmap_grant_ref unmap_op;
1813 
1814 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1815 		    xnbp->xnb_tx_ring_addr;
1816 		unmap_op.dev_bus_addr = 0;
1817 		unmap_op.handle = xnbp->xnb_tx_ring_handle;
1818 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1819 		    &unmap_op, 1) != 0)
1820 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1821 			    "cannot unmap tx-ring page (%d)",
1822 			    unmap_op.status);
1823 
1824 		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
1825 	}
1826 
1827 	if (xnbp->xnb_tx_ring_addr != NULL) {
1828 		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1829 		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
1830 		xnbp->xnb_tx_ring_addr = NULL;
1831 	}
1832 }
1833 
1834 /*ARGSUSED*/
1835 static void
1836 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1837     void *arg, void *impl_data)
1838 {
1839 	xnb_t *xnbp = ddi_get_driver_private(dip);
1840 	XenbusState new_state = *(XenbusState *)impl_data;
1841 
1842 	ASSERT(xnbp != NULL);
1843 
1844 	switch (new_state) {
1845 	case XenbusStateConnected:
1846 		if (xnb_connect_rings(dip)) {
1847 			xnbp->xnb_flavour->xf_peer_connected(xnbp);
1848 		} else {
1849 			xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1850 			xnb_disconnect_rings(dip);
1851 			(void) xvdi_switch_state(dip, XBT_NULL,
1852 			    XenbusStateClosed);
1853 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1854 		}
1855 
1856 		/*
1857 		 * Now that we've attempted to connect it's reasonable
1858 		 * to allow an attempt to detach.
1859 		 */
1860 		xnbp->xnb_detachable = B_TRUE;
1861 
1862 		break;
1863 
1864 	case XenbusStateClosing:
1865 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1866 
1867 		break;
1868 
1869 	case XenbusStateClosed:
1870 		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1871 
1872 		mutex_enter(&xnbp->xnb_tx_lock);
1873 		mutex_enter(&xnbp->xnb_rx_lock);
1874 
1875 		xnb_disconnect_rings(dip);
1876 		xnbp->xnb_connected = B_FALSE;
1877 
1878 		mutex_exit(&xnbp->xnb_rx_lock);
1879 		mutex_exit(&xnbp->xnb_tx_lock);
1880 
1881 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1882 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1883 		/*
1884 		 * In all likelyhood this is already set (in the above
1885 		 * case), but if the peer never attempted to connect
1886 		 * and the domain is destroyed we get here without
1887 		 * having been through the case above, so we set it to
1888 		 * be sure.
1889 		 */
1890 		xnbp->xnb_detachable = B_TRUE;
1891 
1892 		break;
1893 
1894 	default:
1895 		break;
1896 	}
1897 }
1898 
1899 /*ARGSUSED*/
1900 static void
1901 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1902     void *arg, void *impl_data)
1903 {
1904 	xnb_t *xnbp = ddi_get_driver_private(dip);
1905 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
1906 	boolean_t success;
1907 
1908 	ASSERT(xnbp != NULL);
1909 
1910 	switch (state) {
1911 	case Connected:
1912 
1913 		success = xnbp->xnb_flavour->xf_hotplug_connected(xnbp);
1914 
1915 		mutex_enter(&xnbp->xnb_tx_lock);
1916 		mutex_enter(&xnbp->xnb_rx_lock);
1917 
1918 		xnbp->xnb_hotplugged = success;
1919 
1920 		mutex_exit(&xnbp->xnb_rx_lock);
1921 		mutex_exit(&xnbp->xnb_tx_lock);
1922 		break;
1923 
1924 	default:
1925 		break;
1926 	}
1927 }
1928 
/*
 * Module linkage.  The module is described with mod_miscops; the
 * "%I%" in the description string is presumably an SCCS keyword, like
 * those in the #pragma ident at the top of the file.
 */
static struct modldrv modldrv = {
	&mod_miscops, "xnb module %I%",
};

/* Handed to mod_install(), mod_info() and mod_remove() below. */
static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};
1936 
1937 int
1938 _init(void)
1939 {
1940 	int i;
1941 
1942 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
1943 
1944 	xnb_rxbuf_cachep = kmem_cache_create("xnb_rxbuf_cachep",
1945 	    sizeof (xnb_rxbuf_t), 0, xnb_rxbuf_constructor,
1946 	    xnb_rxbuf_destructor, NULL, NULL, NULL, 0);
1947 	ASSERT(xnb_rxbuf_cachep != NULL);
1948 
1949 	i = mod_install(&modlinkage);
1950 	if (i != DDI_SUCCESS) {
1951 		kmem_cache_destroy(xnb_rxbuf_cachep);
1952 		mutex_destroy(&xnb_alloc_page_lock);
1953 	}
1954 	return (i);
1955 }
1956 
1957 int
1958 _info(struct modinfo *modinfop)
1959 {
1960 	return (mod_info(&modlinkage, modinfop));
1961 }
1962 
1963 int
1964 _fini(void)
1965 {
1966 	int i;
1967 
1968 	i = mod_remove(&modlinkage);
1969 	if (i == DDI_SUCCESS) {
1970 		kmem_cache_destroy(xnb_rxbuf_cachep);
1971 		mutex_destroy(&xnb_alloc_page_lock);
1972 	}
1973 	return (i);
1974 }
1975