xref: /illumos-gate/usr/src/uts/common/xen/io/xnb.c (revision c386eb9c22c7c00fc48a982f238576e16b113bda)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifdef DEBUG
28 #define	XNB_DEBUG 1
29 #endif /* DEBUG */
30 
31 #include "xnb.h"
32 
33 #include <sys/sunddi.h>
34 #include <sys/sunndi.h>
35 #include <sys/modctl.h>
36 #include <sys/conf.h>
37 #include <sys/mac.h>
38 #include <sys/mac_impl.h> /* XXXXBOW - remove, included for mac_fix_cksum() */
39 #include <sys/dlpi.h>
40 #include <sys/strsubr.h>
41 #include <sys/strsun.h>
42 #include <sys/types.h>
43 #include <sys/pattr.h>
44 #include <vm/seg_kmem.h>
45 #include <vm/hat_i86.h>
46 #include <xen/sys/xenbus_impl.h>
47 #include <xen/sys/xendev.h>
48 #include <sys/balloon_impl.h>
49 #include <sys/evtchn_impl.h>
50 #include <sys/gnttab.h>
51 #include <vm/vm_dep.h>
52 
53 #include <sys/gld.h>
54 #include <inet/ip.h>
55 #include <inet/ip_impl.h>
56 #include <sys/vnic_impl.h> /* blech. */
57 
58 /*
59  * The terms "transmit" and "receive" are used in alignment with domU,
60  * which means that packets originating from the peer domU are "transmitted"
61  * to other parts of the system and packets are "received" from them.
62  */
63 
64 /*
65  * XXPV dme: things to do, as well as various things indicated
66  * throughout the source:
67  * - copy avoidance outbound.
68  * - copy avoidance inbound.
69  * - transfer credit limiting.
70  * - MAC address based filtering.
71  */
72 
73 /*
74  * Should we attempt to defer checksum calculation?
75  */
76 static boolean_t	xnb_cksum_offload = B_TRUE;
77 /*
78  * When receiving packets from a guest, should they be copied
79  * or used as-is (esballoc)?
80  */
81 static boolean_t	xnb_tx_always_copy = B_TRUE;
82 
83 static boolean_t	xnb_connect_rings(dev_info_t *);
84 static void		xnb_disconnect_rings(dev_info_t *);
85 static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
86     void *, void *);
87 static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
88     void *, void *);
89 
90 static int	xnb_txbuf_constructor(void *, void *, int);
91 static void	xnb_txbuf_destructor(void *, void *);
92 static xnb_txbuf_t *xnb_txbuf_get(xnb_t *, int);
93 static void	xnb_txbuf_put(xnb_t *, xnb_txbuf_t *);
94 static void	xnb_tx_notify_peer(xnb_t *);
95 static void	xnb_tx_complete(xnb_txbuf_t *);
96 static void	xnb_tx_mark_complete(xnb_t *, RING_IDX, int16_t);
97 static void 	xnb_tx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *,
98     xnb_txbuf_t *);
99 static void	xnb_tx_perform_pending_unmop(xnb_t *);
100 mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);
101 
102 int		xnb_unmop_lowwat = NET_TX_RING_SIZE >> 2;
103 int		xnb_unmop_hiwat = NET_TX_RING_SIZE - (NET_TX_RING_SIZE >> 2);
104 
105 
106 boolean_t	xnb_hv_copy = B_TRUE;
107 boolean_t	xnb_explicit_pageflip_set = B_FALSE;
108 
109 /* XXPV dme: are these really invalid? */
110 #define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
111 #define	INVALID_GRANT_REF	((grant_ref_t)-1)
112 
113 static kmem_cache_t *xnb_txbuf_cachep;
114 static kmutex_t	xnb_alloc_page_lock;
115 
/*
 * Names of the auxiliary statistics exported through the
 * "aux_statistics" kstat.  xnb_ks_aux_update() stores the values in
 * this same order, so the two must be kept in step.
 */
static char *aux_statistics[] = {
	"rx_cksum_deferred", "tx_cksum_no_need",
	"rx_rsp_notok",
	"tx_notify_deferred", "tx_notify_sent",
	"rx_notify_deferred", "rx_notify_sent",
	"tx_too_early", "rx_too_early",
	"rx_allocb_failed", "tx_allocb_failed",
	"rx_foreign_page",
	"mac_full",
	"spurious_intr",
	"allocation_success", "allocation_failure",
	"small_allocation_success", "small_allocation_failure",
	"other_allocation_failure",
	"rx_pageboundary_crossed",
	"rx_cpoparea_grown",
	"csum_hardware", "csum_software",
};
144 
145 static int
146 xnb_ks_aux_update(kstat_t *ksp, int flag)
147 {
148 	xnb_t *xnbp;
149 	kstat_named_t *knp;
150 
151 	if (flag != KSTAT_READ)
152 		return (EACCES);
153 
154 	xnbp = ksp->ks_private;
155 	knp = ksp->ks_data;
156 
157 	/*
158 	 * Assignment order should match that of the names in
159 	 * aux_statistics.
160 	 */
161 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_deferred;
162 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_no_need;
163 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_rsp_notok;
164 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
165 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
166 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
167 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
168 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
169 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
170 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
171 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
172 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_foreign_page;
173 	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
174 	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
175 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
176 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
177 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
178 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
179 	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
180 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_pagebndry_crossed;
181 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cpoparea_grown;
182 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
183 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
184 
185 	return (0);
186 }
187 
188 static boolean_t
189 xnb_ks_init(xnb_t *xnbp)
190 {
191 	int nstat = sizeof (aux_statistics) /
192 	    sizeof (aux_statistics[0]);
193 	char **cp = aux_statistics;
194 	kstat_named_t *knp;
195 
196 	/*
197 	 * Create and initialise kstats.
198 	 */
199 	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
200 	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
201 	    KSTAT_TYPE_NAMED, nstat, 0);
202 	if (xnbp->xnb_kstat_aux == NULL)
203 		return (B_FALSE);
204 
205 	xnbp->xnb_kstat_aux->ks_private = xnbp;
206 	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
207 
208 	knp = xnbp->xnb_kstat_aux->ks_data;
209 	while (nstat > 0) {
210 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
211 
212 		knp++;
213 		cp++;
214 		nstat--;
215 	}
216 
217 	kstat_install(xnbp->xnb_kstat_aux);
218 
219 	return (B_TRUE);
220 }
221 
222 static void
223 xnb_ks_free(xnb_t *xnbp)
224 {
225 	kstat_delete(xnbp->xnb_kstat_aux);
226 }
227 
228 /*
229  * Software checksum calculation and insertion for an arbitrary packet.
230  */
231 /*ARGSUSED*/
232 static mblk_t *
233 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
234 {
235 	/*
236 	 * XXPV dme: shouldn't rely on vnic_fix_cksum(), not least
237 	 * because it doesn't cover all of the interesting cases :-(
238 	 */
239 	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
240 	    HCK_FULLCKSUM, KM_NOSLEEP);
241 
242 	return (mac_fix_cksum(mp));
243 }
244 
245 mblk_t *
246 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
247 {
248 	struct ether_header *ehp;
249 	uint16_t sap;
250 	uint32_t offset;
251 	ipha_t *ipha;
252 
253 	ASSERT(mp->b_next == NULL);
254 
255 	/*
256 	 * Check that the packet is contained in a single mblk.  In
257 	 * the "from peer" path this is true today, but will change
258 	 * when scatter gather support is added.  In the "to peer"
259 	 * path we cannot be sure, but in most cases it will be true
260 	 * (in the xnbo case the packet has come from a MAC device
261 	 * which is unlikely to split packets).
262 	 */
263 	if (mp->b_cont != NULL)
264 		goto software;
265 
266 	/*
267 	 * If the MAC has no hardware capability don't do any further
268 	 * checking.
269 	 */
270 	if (capab == 0)
271 		goto software;
272 
273 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
274 	ehp = (struct ether_header *)mp->b_rptr;
275 
276 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
277 		struct ether_vlan_header *evhp;
278 
279 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
280 		evhp = (struct ether_vlan_header *)mp->b_rptr;
281 		sap = ntohs(evhp->ether_type);
282 		offset = sizeof (struct ether_vlan_header);
283 	} else {
284 		sap = ntohs(ehp->ether_type);
285 		offset = sizeof (struct ether_header);
286 	}
287 
288 	/*
289 	 * We only attempt to do IPv4 packets in hardware.
290 	 */
291 	if (sap != ETHERTYPE_IP)
292 		goto software;
293 
294 	/*
295 	 * We know that this is an IPv4 packet.
296 	 */
297 	ipha = (ipha_t *)(mp->b_rptr + offset);
298 
299 	switch (ipha->ipha_protocol) {
300 	case IPPROTO_TCP:
301 	case IPPROTO_UDP: {
302 		uint32_t start, length, stuff, cksum;
303 		uint16_t *stuffp;
304 
305 		/*
306 		 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
307 		 * can use full IPv4 and partial checksum offload.
308 		 */
309 		if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
310 			break;
311 
312 		start = IP_SIMPLE_HDR_LENGTH;
313 		length = ntohs(ipha->ipha_length);
314 		if (ipha->ipha_protocol == IPPROTO_TCP) {
315 			stuff = start + TCP_CHECKSUM_OFFSET;
316 			cksum = IP_TCP_CSUM_COMP;
317 		} else {
318 			stuff = start + UDP_CHECKSUM_OFFSET;
319 			cksum = IP_UDP_CSUM_COMP;
320 		}
321 		stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);
322 
323 		if (capab & HCKSUM_INET_FULL_V4) {
324 			/*
325 			 * Some devices require that the checksum
326 			 * field of the packet is zero for full
327 			 * offload.
328 			 */
329 			*stuffp = 0;
330 
331 			(void) hcksum_assoc(mp, NULL, NULL,
332 			    0, 0, 0, 0,
333 			    HCK_FULLCKSUM, KM_NOSLEEP);
334 
335 			xnbp->xnb_stat_csum_hardware++;
336 
337 			return (mp);
338 		}
339 
340 		if (capab & HCKSUM_INET_PARTIAL) {
341 			if (*stuffp == 0) {
342 				ipaddr_t src, dst;
343 
344 				/*
345 				 * Older Solaris guests don't insert
346 				 * the pseudo-header checksum, so we
347 				 * calculate it here.
348 				 */
349 				src = ipha->ipha_src;
350 				dst = ipha->ipha_dst;
351 
352 				cksum += (dst >> 16) + (dst & 0xFFFF);
353 				cksum += (src >> 16) + (src & 0xFFFF);
354 				cksum += length - IP_SIMPLE_HDR_LENGTH;
355 
356 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
357 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
358 
359 				ASSERT(cksum <= 0xFFFF);
360 
361 				*stuffp = (uint16_t)(cksum ? cksum : ~cksum);
362 			}
363 
364 			(void) hcksum_assoc(mp, NULL, NULL,
365 			    start, stuff, length, 0,
366 			    HCK_PARTIALCKSUM, KM_NOSLEEP);
367 
368 			xnbp->xnb_stat_csum_hardware++;
369 
370 			return (mp);
371 		}
372 
373 		/* NOTREACHED */
374 		break;
375 	}
376 
377 	default:
378 		/* Use software. */
379 		break;
380 	}
381 
382 software:
383 	/*
384 	 * We are not able to use any offload so do the whole thing in
385 	 * software.
386 	 */
387 	xnbp->xnb_stat_csum_software++;
388 
389 	return (xnb_software_csum(xnbp, mp));
390 }
391 
392 int
393 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
394 {
395 	xnb_t *xnbp;
396 	char *xsname, mac[ETHERADDRL * 3];
397 
398 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
399 
400 	xnbp->xnb_flavour = flavour;
401 	xnbp->xnb_flavour_data = flavour_data;
402 	xnbp->xnb_devinfo = dip;
403 	xnbp->xnb_evtchn = INVALID_EVTCHN;
404 	xnbp->xnb_irq = B_FALSE;
405 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
406 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
407 	xnbp->xnb_cksum_offload = xnb_cksum_offload;
408 	xnbp->xnb_connected = B_FALSE;
409 	xnbp->xnb_hotplugged = B_FALSE;
410 	xnbp->xnb_detachable = B_FALSE;
411 	xnbp->xnb_peer = xvdi_get_oeid(dip);
412 	xnbp->xnb_tx_pages_writable = B_FALSE;
413 	xnbp->xnb_tx_always_copy = xnb_tx_always_copy;
414 
415 	xnbp->xnb_tx_buf_count = 0;
416 	xnbp->xnb_tx_unmop_count = 0;
417 
418 	xnbp->xnb_hv_copy = B_FALSE;
419 
420 	xnbp->xnb_rx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
421 	ASSERT(xnbp->xnb_rx_va != NULL);
422 
423 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
424 	    != DDI_SUCCESS)
425 		goto failure;
426 
427 	/* allocated on demand, when/if we enter xnb_copy_to_peer() */
428 	xnbp->xnb_rx_cpop = NULL;
429 	xnbp->xnb_cpop_sz = 0;
430 
431 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
432 	    xnbp->xnb_icookie);
433 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
434 	    xnbp->xnb_icookie);
435 
436 	/* set driver private pointer now */
437 	ddi_set_driver_private(dip, xnbp);
438 
439 	if (!xnb_ks_init(xnbp))
440 		goto failure_1;
441 
442 	/*
443 	 * Receive notification of changes in the state of the
444 	 * driver in the guest domain.
445 	 */
446 	if (xvdi_add_event_handler(dip, XS_OE_STATE, xnb_oe_state_change,
447 	    NULL) != DDI_SUCCESS)
448 		goto failure_2;
449 
450 	/*
451 	 * Receive notification of hotplug events.
452 	 */
453 	if (xvdi_add_event_handler(dip, XS_HP_STATE, xnb_hp_state_change,
454 	    NULL) != DDI_SUCCESS)
455 		goto failure_2;
456 
457 	xsname = xvdi_get_xsname(dip);
458 
459 	if (xenbus_printf(XBT_NULL, xsname,
460 	    "feature-no-csum-offload", "%d",
461 	    xnbp->xnb_cksum_offload ? 0 : 1) != 0)
462 		goto failure_3;
463 
464 	/*
465 	 * Use global xnb_hv_copy to export this feature. This means that
466 	 * we have to decide what to do before starting up a guest domain
467 	 */
468 	if (xenbus_printf(XBT_NULL, xsname,
469 	    "feature-rx-copy", "%d", xnb_hv_copy ? 1 : 0) != 0)
470 		goto failure_3;
471 	/*
472 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
473 	 * in addition to "feature-rx-copy" being 1. It seems strange
474 	 * to use four possible states to describe a binary decision,
475 	 * but we might as well play nice.
476 	 */
477 	if (xenbus_printf(XBT_NULL, xsname,
478 	    "feature-rx-flip", "%d", xnb_explicit_pageflip_set ? 1 : 0) != 0)
479 		goto failure_3;
480 
481 	if (xenbus_scanf(XBT_NULL, xsname,
482 	    "mac", "%s", mac) != 0) {
483 		cmn_err(CE_WARN, "xnb_attach: "
484 		    "cannot read mac address from %s",
485 		    xsname);
486 		goto failure_3;
487 	}
488 
489 	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
490 		cmn_err(CE_WARN,
491 		    "xnb_attach: cannot parse mac address %s",
492 		    mac);
493 		goto failure_3;
494 	}
495 
496 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
497 	(void) xvdi_post_event(dip, XEN_HP_ADD);
498 
499 	return (DDI_SUCCESS);
500 
501 failure_3:
502 	xvdi_remove_event_handler(dip, NULL);
503 
504 failure_2:
505 	xnb_ks_free(xnbp);
506 
507 failure_1:
508 	mutex_destroy(&xnbp->xnb_rx_lock);
509 	mutex_destroy(&xnbp->xnb_tx_lock);
510 
511 failure:
512 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
513 	kmem_free(xnbp, sizeof (*xnbp));
514 	return (DDI_FAILURE);
515 }
516 
517 /*ARGSUSED*/
518 void
519 xnb_detach(dev_info_t *dip)
520 {
521 	xnb_t *xnbp = ddi_get_driver_private(dip);
522 
523 	ASSERT(xnbp != NULL);
524 	ASSERT(!xnbp->xnb_connected);
525 	ASSERT(xnbp->xnb_tx_buf_count == 0);
526 
527 	xnb_disconnect_rings(dip);
528 
529 	xvdi_remove_event_handler(dip, NULL);
530 
531 	xnb_ks_free(xnbp);
532 
533 	ddi_set_driver_private(dip, NULL);
534 
535 	mutex_destroy(&xnbp->xnb_tx_lock);
536 	mutex_destroy(&xnbp->xnb_rx_lock);
537 
538 	if (xnbp->xnb_cpop_sz > 0)
539 		kmem_free(xnbp->xnb_rx_cpop, sizeof (*xnbp->xnb_rx_cpop)
540 		    * xnbp->xnb_cpop_sz);
541 
542 	ASSERT(xnbp->xnb_rx_va != NULL);
543 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
544 
545 	kmem_free(xnbp, sizeof (*xnbp));
546 }
547 
548 
549 static mfn_t
550 xnb_alloc_page(xnb_t *xnbp)
551 {
552 #define	WARNING_RATE_LIMIT 100
553 #define	BATCH_SIZE 256
554 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
555 	static int nth = BATCH_SIZE;
556 	mfn_t mfn;
557 
558 	mutex_enter(&xnb_alloc_page_lock);
559 	if (nth == BATCH_SIZE) {
560 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
561 			xnbp->xnb_stat_allocation_failure++;
562 			mutex_exit(&xnb_alloc_page_lock);
563 
564 			/*
565 			 * Try for a single page in low memory situations.
566 			 */
567 			if (balloon_alloc_pages(1, &mfn) != 1) {
568 				if ((xnbp->xnb_stat_small_allocation_failure++
569 				    % WARNING_RATE_LIMIT) == 0)
570 					cmn_err(CE_WARN, "xnb_alloc_page: "
571 					    "Cannot allocate memory to "
572 					    "transfer packets to peer.");
573 				return (0);
574 			} else {
575 				xnbp->xnb_stat_small_allocation_success++;
576 				return (mfn);
577 			}
578 		}
579 
580 		nth = 0;
581 		xnbp->xnb_stat_allocation_success++;
582 	}
583 
584 	mfn = mfns[nth++];
585 	mutex_exit(&xnb_alloc_page_lock);
586 
587 	ASSERT(mfn != 0);
588 
589 	return (mfn);
590 #undef BATCH_SIZE
591 #undef WARNING_RATE_LIMIT
592 }
593 
594 /*ARGSUSED*/
595 static void
596 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
597 {
598 	int r;
599 	pfn_t pfn;
600 
601 	pfn = xen_assign_pfn(mfn);
602 	pfnzero(pfn, 0, PAGESIZE);
603 	xen_release_pfn(pfn);
604 
605 	/*
606 	 * This happens only in the error path, so batching is
607 	 * not worth the complication.
608 	 */
609 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
610 		cmn_err(CE_WARN, "free_page: cannot decrease memory "
611 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
612 		    r, mfn);
613 	}
614 }
615 
/*
 * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but
 * using local variables: the caller must have `loop' (request index)
 * and `prod' (response index) in scope.  Evaluates to the smaller of
 * the number of requests posted beyond `loop' and the space left in
 * the ring, so it is non-zero while there is a request to consume.
 */
#define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)				\
	((((_r)->sring->req_prod - loop) <				\
	    (RING_SIZE(_r) - (loop - prod)))				\
	    ? ((_r)->sring->req_prod - loop)				\
	    : (RING_SIZE(_r) - (loop - prod)))
625 
626 mblk_t *
627 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
628 {
629 	mblk_t *free = mp, *prev = NULL;
630 	size_t len;
631 	gnttab_transfer_t *gop;
632 	boolean_t notify;
633 	RING_IDX loop, prod, end;
634 
635 	/*
636 	 * For each packet the sequence of operations is:
637 	 *
638 	 * 1. get a new page from the hypervisor.
639 	 * 2. get a request slot from the ring.
640 	 * 3. copy the data into the new page.
641 	 * 4. transfer the page to the peer.
642 	 * 5. update the request slot.
643 	 * 6. kick the peer.
644 	 * 7. free mp.
645 	 *
646 	 * In order to reduce the number of hypercalls, we prepare
647 	 * several packets for the peer and perform a single hypercall
648 	 * to transfer them.
649 	 */
650 
651 	mutex_enter(&xnbp->xnb_rx_lock);
652 
653 	/*
654 	 * If we are not connected to the peer or have not yet
655 	 * finished hotplug it is too early to pass packets to the
656 	 * peer.
657 	 */
658 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
659 		mutex_exit(&xnbp->xnb_rx_lock);
660 		DTRACE_PROBE(flip_rx_too_early);
661 		xnbp->xnb_stat_rx_too_early++;
662 		return (mp);
663 	}
664 
665 	loop = xnbp->xnb_rx_ring.req_cons;
666 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
667 	gop = xnbp->xnb_rx_top;
668 
669 	while ((mp != NULL) &&
670 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
671 
672 		mfn_t mfn;
673 		pfn_t pfn;
674 		netif_rx_request_t *rxreq;
675 		netif_rx_response_t *rxresp;
676 		char *valoop;
677 		mblk_t *ml;
678 		uint16_t cksum_flags;
679 
680 		/* 1 */
681 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
682 			xnbp->xnb_stat_rx_defer++;
683 			break;
684 		}
685 
686 		/* 2 */
687 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
688 
689 #ifdef XNB_DEBUG
690 		if (!(rxreq->id < NET_RX_RING_SIZE))
691 			cmn_err(CE_PANIC, "xnb_to_peer: "
692 			    "id %d out of range in request 0x%p",
693 			    rxreq->id, (void *)rxreq);
694 #endif /* XNB_DEBUG */
695 
696 		/* Assign a pfn and map the new page at the allocated va. */
697 		pfn = xen_assign_pfn(mfn);
698 		hat_devload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
699 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
700 
701 		/* 3 */
702 		len = 0;
703 		valoop = xnbp->xnb_rx_va;
704 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
705 			size_t chunk = ml->b_wptr - ml->b_rptr;
706 
707 			bcopy(ml->b_rptr, valoop, chunk);
708 			valoop += chunk;
709 			len += chunk;
710 		}
711 
712 		ASSERT(len < PAGESIZE);
713 
714 		/* Release the pfn. */
715 		hat_unload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
716 		    HAT_UNLOAD_UNMAP);
717 		xen_release_pfn(pfn);
718 
719 		/* 4 */
720 		gop->mfn = mfn;
721 		gop->domid = xnbp->xnb_peer;
722 		gop->ref = rxreq->gref;
723 
724 		/* 5.1 */
725 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
726 		rxresp->offset = 0;
727 		rxresp->flags = 0;
728 
729 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
730 		if (cksum_flags != 0)
731 			xnbp->xnb_stat_rx_cksum_deferred++;
732 		rxresp->flags |= cksum_flags;
733 
734 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
735 		rxresp->status = len;
736 
737 		loop++;
738 		prod++;
739 		gop++;
740 		prev = mp;
741 		mp = mp->b_next;
742 	}
743 
744 	/*
745 	 * Did we actually do anything?
746 	 */
747 	if (loop == xnbp->xnb_rx_ring.req_cons) {
748 		mutex_exit(&xnbp->xnb_rx_lock);
749 		return (mp);
750 	}
751 
752 	end = loop;
753 
754 	/*
755 	 * Unlink the end of the 'done' list from the remainder.
756 	 */
757 	ASSERT(prev != NULL);
758 	prev->b_next = NULL;
759 
760 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_rx_top,
761 	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
762 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
763 	}
764 
765 	loop = xnbp->xnb_rx_ring.req_cons;
766 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
767 	gop = xnbp->xnb_rx_top;
768 
769 	while (loop < end) {
770 		int16_t status = NETIF_RSP_OKAY;
771 
772 		if (gop->status != 0) {
773 			status = NETIF_RSP_ERROR;
774 
775 			/*
776 			 * If the status is anything other than
777 			 * GNTST_bad_page then we don't own the page
778 			 * any more, so don't try to give it back.
779 			 */
780 			if (gop->status != GNTST_bad_page)
781 				gop->mfn = 0;
782 		} else {
783 			/* The page is no longer ours. */
784 			gop->mfn = 0;
785 		}
786 
787 		if (gop->mfn != 0)
788 			/*
789 			 * Give back the page, as we won't be using
790 			 * it.
791 			 */
792 			xnb_free_page(xnbp, gop->mfn);
793 		else
794 			/*
795 			 * We gave away a page, update our accounting
796 			 * now.
797 			 */
798 			balloon_drv_subtracted(1);
799 
800 		/* 5.2 */
801 		if (status != NETIF_RSP_OKAY) {
802 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
803 			    status;
804 		} else {
805 			xnbp->xnb_stat_ipackets++;
806 			xnbp->xnb_stat_rbytes += len;
807 		}
808 
809 		loop++;
810 		prod++;
811 		gop++;
812 	}
813 
814 	xnbp->xnb_rx_ring.req_cons = loop;
815 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
816 
817 	/* 6 */
818 	/* LINTED: constant in conditional context */
819 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
820 	if (notify) {
821 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
822 		xnbp->xnb_stat_rx_notify_sent++;
823 	} else {
824 		xnbp->xnb_stat_rx_notify_deferred++;
825 	}
826 
827 	if (mp != NULL)
828 		xnbp->xnb_stat_rx_defer++;
829 
830 	mutex_exit(&xnbp->xnb_rx_lock);
831 
832 	/* Free mblk_t's that we consumed. */
833 	freemsgchain(free);
834 
835 	return (mp);
836 }
837 
838 /* helper functions for xnb_copy_to_peer */
839 
840 /*
841  * Grow the array of copy operation descriptors.
842  * Returns a pointer to the next available entry.
843  */
844 gnttab_copy_t *
845 grow_cpop_area(xnb_t *xnbp, gnttab_copy_t *o_cpop)
846 {
847 	/*
848 	 * o_cpop (arg.1) is a ptr to the area we would like to copy
849 	 * something into but cannot, because we haven't alloc'ed it
850 	 * yet, or NULL.
851 	 * old_cpop and new_cpop (local) are pointers to old/new
852 	 * versions of xnbp->xnb_rx_cpop.
853 	 */
854 	gnttab_copy_t	*new_cpop, *old_cpop, *ret_cpop;
855 	size_t		newcount;
856 
857 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
858 
859 	old_cpop = xnbp->xnb_rx_cpop;
860 	/*
861 	 * o_cpop is a pointer into the array pointed to by old_cpop;
862 	 * it would be an error for exactly one of these pointers to be NULL.
863 	 * We shouldn't call this function if xnb_rx_cpop has already
864 	 * been allocated, but we're starting to fill it from the beginning
865 	 * again.
866 	 */
867 	ASSERT((o_cpop == NULL && old_cpop == NULL) ||
868 	    (o_cpop != NULL && old_cpop != NULL && o_cpop != old_cpop));
869 
870 	newcount = xnbp->xnb_cpop_sz + CPOP_DEFCNT;
871 
872 	new_cpop = kmem_alloc(sizeof (*new_cpop) * newcount, KM_NOSLEEP);
873 	if (new_cpop == NULL) {
874 		xnbp->xnb_stat_other_allocation_failure++;
875 		return (NULL);
876 	}
877 
878 	if (o_cpop != NULL) {
879 		size_t	 offset = (o_cpop - old_cpop);
880 
881 		/* we only need to move the parts in use ... */
882 		(void) memmove(new_cpop, old_cpop, xnbp->xnb_cpop_sz *
883 		    (sizeof (*old_cpop)));
884 
885 		kmem_free(old_cpop, xnbp->xnb_cpop_sz * sizeof (*old_cpop));
886 
887 		ret_cpop = new_cpop + offset;
888 	} else {
889 		ret_cpop = new_cpop;
890 	}
891 
892 	xnbp->xnb_rx_cpop = new_cpop;
893 	xnbp->xnb_cpop_sz = newcount;
894 
895 	xnbp->xnb_stat_rx_cpoparea_grown++;
896 
897 	return (ret_cpop);
898 }
899 
900 /*
901  * Check whether an address is on a page that's foreign to this domain.
902  */
903 static boolean_t
904 is_foreign(void *addr)
905 {
906 	pfn_t	pfn = hat_getpfnum(kas.a_hat, addr);
907 
908 	return (pfn & PFN_IS_FOREIGN_MFN ? B_TRUE : B_FALSE);
909 }
910 
911 /*
912  * Insert a newly allocated mblk into a chain, replacing the old one.
913  */
914 static mblk_t *
915 replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
916 {
917 	uint32_t	start, stuff, end, value, flags;
918 	mblk_t		*new_mp;
919 
920 	new_mp = copyb(mp);
921 	if (new_mp == NULL)
922 		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message"
923 		    "for %p, len %lu", (void *) mp, len);
924 
925 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
926 	(void) hcksum_assoc(new_mp, NULL, NULL, start, stuff, end, value,
927 	    flags, KM_NOSLEEP);
928 
929 	new_mp->b_next = mp->b_next;
930 	new_mp->b_prev = mp->b_prev;
931 	new_mp->b_cont = mp->b_cont;
932 
933 	/* Make sure we only overwrite pointers to the mblk being replaced. */
934 	if (mp_prev != NULL && mp_prev->b_next == mp)
935 		mp_prev->b_next = new_mp;
936 
937 	if (ml_prev != NULL && ml_prev->b_cont == mp)
938 		ml_prev->b_cont = new_mp;
939 
940 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
941 	freemsg(mp);
942 
943 	return (new_mp);
944 }
945 
946 /*
947  * Set all the fields in a gnttab_copy_t.
948  */
949 static void
950 setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
951     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
952 {
953 	ASSERT(xnbp != NULL && gp != NULL);
954 
955 	gp->source.offset = s_off;
956 	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
957 	gp->source.domid = DOMID_SELF;
958 
959 	gp->len = (uint16_t)len;
960 	gp->flags = GNTCOPY_dest_gref;
961 	gp->status = 0;
962 
963 	gp->dest.u.ref = d_ref;
964 	gp->dest.offset = d_off;
965 	gp->dest.domid = xnbp->xnb_peer;
966 }
967 
968 mblk_t *
969 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
970 {
971 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
972 	mblk_t		*ml, *ml_prev;
973 	gnttab_copy_t	*gop_cp;
974 	boolean_t	notify;
975 	RING_IDX	loop, prod;
976 	int		i;
977 
978 	if (!xnbp->xnb_hv_copy)
979 		return (xnb_to_peer(xnbp, mp));
980 
981 	/*
982 	 * For each packet the sequence of operations is:
983 	 *
984 	 *  1. get a request slot from the ring.
985 	 *  2. set up data for hypercall (see NOTE below)
986 	 *  3. have the hypervisore copy the data
987 	 *  4. update the request slot.
988 	 *  5. kick the peer.
989 	 *
990 	 * NOTE ad 2.
991 	 *  In order to reduce the number of hypercalls, we prepare
992 	 *  several packets (mp->b_cont != NULL) for the peer and
993 	 *  perform a single hypercall to transfer them.
994 	 *  We also have to set up a seperate copy operation for
995 	 *  every page.
996 	 *
997 	 * If we have more than one message (mp->b_next != NULL),
998 	 * we do this whole dance repeatedly.
999 	 */
1000 
1001 	mutex_enter(&xnbp->xnb_rx_lock);
1002 
1003 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
1004 		mutex_exit(&xnbp->xnb_rx_lock);
1005 		DTRACE_PROBE(copy_rx_too_early);
1006 		xnbp->xnb_stat_rx_too_early++;
1007 		return (mp);
1008 	}
1009 
1010 	loop = xnbp->xnb_rx_ring.req_cons;
1011 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
1012 
1013 	while ((mp != NULL) &&
1014 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
1015 		netif_rx_request_t	*rxreq;
1016 		netif_rx_response_t	*rxresp;
1017 		size_t			d_offset;
1018 		size_t			len;
1019 		uint16_t		cksum_flags;
1020 		int16_t			status = NETIF_RSP_OKAY;
1021 		int			item_count;
1022 
1023 		/* 1 */
1024 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
1025 
1026 #ifdef XNB_DEBUG
1027 		if (!(rxreq->id < NET_RX_RING_SIZE))
1028 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
1029 			    "id %d out of range in request 0x%p",
1030 			    rxreq->id, (void *)rxreq);
1031 #endif /* XNB_DEBUG */
1032 
1033 		/* 2 */
1034 		d_offset = 0;
1035 		len = 0;
1036 		item_count = 0;
1037 
1038 		gop_cp = xnbp->xnb_rx_cpop;
1039 
1040 		/*
1041 		 * We walk the b_cont pointers and set up a gop_cp
1042 		 * structure for every page in every data block we have.
1043 		 */
1044 		/* 2a */
1045 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
1046 			size_t	chunk = ml->b_wptr - ml->b_rptr;
1047 			uchar_t	*r_tmp,	*rpt_align;
1048 			size_t	r_offset;
1049 
1050 			/*
1051 			 * If we get an mblk on a page that doesn't belong to
1052 			 * this domain, get a new mblk to replace the old one.
1053 			 */
1054 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
1055 				mblk_t *ml_new = replace_msg(ml, chunk,
1056 				    mp_prev, ml_prev);
1057 
1058 				/* We can still use old ml, but not *ml! */
1059 				if (free == ml)
1060 					free = ml_new;
1061 				if (mp == ml)
1062 					mp = ml_new;
1063 				ml = ml_new;
1064 
1065 				xnbp->xnb_stat_rx_foreign_page++;
1066 			}
1067 
1068 			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
1069 			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
1070 			r_tmp = ml->b_rptr;
1071 
1072 			if (d_offset + chunk > PAGESIZE)
1073 				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
1074 				    "(svd: %p), ml %p,rpt_alg. %p, d_offset "
1075 				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
1076 				    (void *)mp, (void *)saved_mp, (void *)ml,
1077 				    (void *)rpt_align,
1078 				    d_offset, chunk, (int)PAGESIZE);
1079 
1080 			while (chunk > 0) {
1081 				size_t part_len;
1082 
1083 				item_count++;
1084 				if (item_count > xnbp->xnb_cpop_sz) {
1085 					gop_cp = grow_cpop_area(xnbp, gop_cp);
1086 					if (gop_cp == NULL)
1087 						goto failure;
1088 				}
1089 				/*
1090 				 * If our mblk crosses a page boundary, we need
1091 				 * to do a seperate copy for every page.
1092 				 */
1093 				if (r_offset + chunk > PAGESIZE) {
1094 					part_len = PAGESIZE - r_offset;
1095 
1096 					DTRACE_PROBE3(mblk_page_crossed,
1097 					    (mblk_t *), ml, int, chunk, int,
1098 					    (int)r_offset);
1099 
1100 					xnbp->xnb_stat_rx_pagebndry_crossed++;
1101 				} else {
1102 					part_len = chunk;
1103 				}
1104 
1105 				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
1106 				    d_offset, part_len, rxreq->gref);
1107 
1108 				chunk -= part_len;
1109 
1110 				len += part_len;
1111 				d_offset += part_len;
1112 				r_tmp += part_len;
1113 				/*
1114 				 * The 2nd, 3rd ... last copies will always
1115 				 * start at r_tmp, therefore r_offset is 0.
1116 				 */
1117 				r_offset = 0;
1118 				gop_cp++;
1119 			}
1120 			ml_prev = ml;
1121 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
1122 			    chunk, int, len, int, item_count);
1123 		}
1124 		/* 3 */
1125 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_rx_cpop,
1126 		    item_count) != 0) {
1127 			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
1128 			DTRACE_PROBE(HV_granttableopfailed);
1129 		}
1130 
1131 		/* 4 */
1132 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
1133 		rxresp->offset = 0;
1134 
1135 		rxresp->flags = 0;
1136 
1137 		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
1138 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1139 		    (int)rxresp->status);
1140 
1141 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
1142 		if (cksum_flags != 0)
1143 			xnbp->xnb_stat_rx_cksum_deferred++;
1144 		rxresp->flags |= cksum_flags;
1145 
1146 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
1147 		rxresp->status = len;
1148 
1149 		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
1150 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1151 		    (int)rxresp->status);
1152 
1153 		for (i = 0; i < item_count; i++) {
1154 			if (xnbp->xnb_rx_cpop[i].status != 0) {
1155 				DTRACE_PROBE2(cpop__status__nonnull, int,
1156 				    (int)xnbp->xnb_rx_cpop[i].status,
1157 				    int, i);
1158 				status = NETIF_RSP_ERROR;
1159 			}
1160 		}
1161 
1162 		/* 5.2 */
1163 		if (status != NETIF_RSP_OKAY) {
1164 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
1165 			    status;
1166 			xnbp->xnb_stat_rx_rsp_notok++;
1167 		} else {
1168 			xnbp->xnb_stat_ipackets++;
1169 			xnbp->xnb_stat_rbytes += len;
1170 		}
1171 
1172 		loop++;
1173 		prod++;
1174 		mp_prev = mp;
1175 		mp = mp->b_next;
1176 	}
1177 failure:
1178 	/*
1179 	 * Did we actually do anything?
1180 	 */
1181 	if (loop == xnbp->xnb_rx_ring.req_cons) {
1182 		mutex_exit(&xnbp->xnb_rx_lock);
1183 		return (mp);
1184 	}
1185 
1186 	/*
1187 	 * Unlink the end of the 'done' list from the remainder.
1188 	 */
1189 	ASSERT(mp_prev != NULL);
1190 	mp_prev->b_next = NULL;
1191 
1192 	xnbp->xnb_rx_ring.req_cons = loop;
1193 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
1194 
1195 	/* 6 */
1196 	/* LINTED: constant in conditional context */
1197 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
1198 	if (notify) {
1199 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1200 		xnbp->xnb_stat_rx_notify_sent++;
1201 	} else {
1202 		xnbp->xnb_stat_rx_notify_deferred++;
1203 	}
1204 
1205 	if (mp != NULL)
1206 		xnbp->xnb_stat_rx_defer++;
1207 
1208 	mutex_exit(&xnbp->xnb_rx_lock);
1209 
1210 	/* Free mblk_t structs we have consumed. */
1211 	freemsgchain(free);
1212 
1213 	return (mp);
1214 }
1215 
1216 /*ARGSUSED*/
1217 static int
1218 xnb_txbuf_constructor(void *buf, void *arg, int kmflag)
1219 {
1220 	xnb_txbuf_t *txp = buf;
1221 
1222 	bzero(txp, sizeof (*txp));
1223 
1224 	txp->xt_free_rtn.free_func = xnb_tx_complete;
1225 	txp->xt_free_rtn.free_arg = (caddr_t)txp;
1226 
1227 	txp->xt_mop.host_addr =
1228 	    (uint64_t)(uintptr_t)vmem_alloc(heap_arena, PAGESIZE,
1229 	    ((kmflag & KM_NOSLEEP) == KM_NOSLEEP) ?
1230 	    VM_NOSLEEP : VM_SLEEP);
1231 
1232 	if (txp->xt_mop.host_addr == NULL) {
1233 		cmn_err(CE_WARN, "xnb_txbuf_constructor: "
1234 		    "cannot get address space");
1235 		return (-1);
1236 	}
1237 
1238 	/*
1239 	 * Have the hat ensure that page table exists for the VA.
1240 	 */
1241 	hat_prepare_mapping(kas.a_hat,
1242 	    (caddr_t)(uintptr_t)txp->xt_mop.host_addr, NULL);
1243 
1244 	return (0);
1245 }
1246 
1247 /*ARGSUSED*/
1248 static void
1249 xnb_txbuf_destructor(void *buf, void *arg)
1250 {
1251 	xnb_txbuf_t *txp = buf;
1252 
1253 	ASSERT(txp->xt_mop.host_addr != NULL);
1254 	ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == 0);
1255 
1256 	hat_release_mapping(kas.a_hat,
1257 	    (caddr_t)(uintptr_t)txp->xt_mop.host_addr);
1258 	vmem_free(heap_arena,
1259 	    (caddr_t)(uintptr_t)txp->xt_mop.host_addr, PAGESIZE);
1260 }
1261 
1262 static void
1263 xnb_tx_notify_peer(xnb_t *xnbp)
1264 {
1265 	boolean_t notify;
1266 
1267 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1268 
1269 	/* LINTED: constant in conditional context */
1270 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
1271 	if (notify) {
1272 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1273 		xnbp->xnb_stat_tx_notify_sent++;
1274 	} else {
1275 		xnbp->xnb_stat_tx_notify_deferred++;
1276 	}
1277 }
1278 
1279 static void
1280 xnb_tx_complete(xnb_txbuf_t *txp)
1281 {
1282 	xnb_t *xnbp = txp->xt_xnbp;
1283 
1284 	ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == XNB_TXBUF_INUSE);
1285 
1286 	mutex_enter(&xnbp->xnb_tx_lock);
1287 	xnb_tx_schedule_unmop(xnbp, &txp->xt_mop, txp);
1288 	mutex_exit(&xnbp->xnb_tx_lock);
1289 }
1290 
1291 static void
1292 xnb_tx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
1293 {
1294 	RING_IDX i;
1295 	netif_tx_response_t *txresp;
1296 
1297 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1298 
1299 	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
1300 
1301 	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
1302 	txresp->id = id;
1303 	txresp->status = status;
1304 
1305 	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
1306 
1307 	/*
1308 	 * Note that we don't push the change to the peer here - that
1309 	 * is the callers responsibility.
1310 	 */
1311 }
1312 
1313 static void
1314 xnb_tx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop,
1315     xnb_txbuf_t *txp)
1316 {
1317 	gnttab_unmap_grant_ref_t	*unmop;
1318 	int				u_count;
1319 	int				reqs_on_ring;
1320 
1321 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1322 	ASSERT(xnbp->xnb_tx_unmop_count < NET_TX_RING_SIZE);
1323 
1324 	u_count = xnbp->xnb_tx_unmop_count++;
1325 
1326 	/* Cache data for the time when we actually unmap grant refs */
1327 	xnbp->xnb_tx_unmop_txp[u_count] = txp;
1328 
1329 	unmop = &xnbp->xnb_tx_unmop[u_count];
1330 	unmop->host_addr = mop->host_addr;
1331 	unmop->dev_bus_addr = mop->dev_bus_addr;
1332 	unmop->handle = mop->handle;
1333 
1334 	/*
1335 	 * We cannot check the ring once we're disconnected from it. Batching
1336 	 * doesn't seem to be a useful optimisation in this case either,
1337 	 * so we directly call into the actual unmap function.
1338 	 */
1339 	if (xnbp->xnb_connected) {
1340 		reqs_on_ring = RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_tx_ring);
1341 
1342 		/*
1343 		 * By tuning xnb_unmop_hiwat to N, we can emulate "N per batch"
1344 		 * or (with N == 1) "immediate unmop" behaviour.
1345 		 * The "> xnb_unmop_lowwat" is a guard against ring exhaustion.
1346 		 */
1347 		if (xnbp->xnb_tx_unmop_count < xnb_unmop_hiwat &&
1348 		    reqs_on_ring > xnb_unmop_lowwat)
1349 			return;
1350 	}
1351 
1352 	xnb_tx_perform_pending_unmop(xnbp);
1353 }
1354 
1355 /*
1356  * Here we perform the actual unmapping of the data that was
1357  * accumulated in xnb_tx_schedule_unmop().
1358  * Note that it is the caller's responsibility to make sure that
1359  * there's actually something there to unmop.
1360  */
1361 static void
1362 xnb_tx_perform_pending_unmop(xnb_t *xnbp)
1363 {
1364 	RING_IDX loop;
1365 #ifdef XNB_DEBUG
1366 	gnttab_unmap_grant_ref_t *unmop;
1367 #endif /* XNB_DEBUG */
1368 
1369 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1370 	ASSERT(xnbp->xnb_tx_unmop_count > 0);
1371 
1372 	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1373 	    xnbp->xnb_tx_unmop, xnbp->xnb_tx_unmop_count) < 0) {
1374 		cmn_err(CE_WARN, "xnb_tx_perform_pending_unmop: "
1375 		    "unmap grant operation failed, "
1376 		    "%d pages lost", xnbp->xnb_tx_unmop_count);
1377 	}
1378 
1379 #ifdef XNB_DEBUG
1380 	for (loop = 0, unmop = xnbp->xnb_tx_unmop;
1381 	    loop < xnbp->xnb_tx_unmop_count;
1382 	    loop++, unmop++) {
1383 		if (unmop->status != 0) {
1384 			cmn_err(CE_WARN, "xnb_tx_perform_pending_unmop: "
1385 			    "unmap grant reference failed (%d)",
1386 			    unmop->status);
1387 		}
1388 	}
1389 #endif /* XNB_DEBUG */
1390 
1391 	for (loop = 0; loop < xnbp->xnb_tx_unmop_count; loop++) {
1392 		xnb_txbuf_t	*txp = xnbp->xnb_tx_unmop_txp[loop];
1393 
1394 		if (txp == NULL)
1395 			cmn_err(CE_PANIC,
1396 			    "xnb_tx_perform_pending_unmop: "
1397 			    "unexpected NULL txp (loop %d; count %d)!",
1398 			    loop, xnbp->xnb_tx_unmop_count);
1399 
1400 		if (xnbp->xnb_connected)
1401 			xnb_tx_mark_complete(xnbp, txp->xt_id, txp->xt_status);
1402 		xnb_txbuf_put(xnbp, txp);
1403 	}
1404 	if (xnbp->xnb_connected)
1405 		xnb_tx_notify_peer(xnbp);
1406 
1407 	xnbp->xnb_tx_unmop_count = 0;
1408 
1409 #ifdef XNB_DEBUG
1410 	bzero(xnbp->xnb_tx_unmop, sizeof (xnbp->xnb_tx_unmop));
1411 	bzero(xnbp->xnb_tx_unmop_txp, sizeof (xnbp->xnb_tx_unmop_txp));
1412 #endif /* XNB_DEBUG */
1413 }
1414 
1415 static xnb_txbuf_t *
1416 xnb_txbuf_get(xnb_t *xnbp, int flags)
1417 {
1418 	xnb_txbuf_t *txp;
1419 
1420 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1421 
1422 	txp = kmem_cache_alloc(xnb_txbuf_cachep, flags);
1423 	if (txp != NULL) {
1424 		ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == 0);
1425 		txp->xt_flags |= XNB_TXBUF_INUSE;
1426 
1427 		txp->xt_xnbp = xnbp;
1428 		txp->xt_mop.dom = xnbp->xnb_peer;
1429 
1430 		txp->xt_mop.flags = GNTMAP_host_map;
1431 		if (!xnbp->xnb_tx_pages_writable)
1432 			txp->xt_mop.flags |= GNTMAP_readonly;
1433 
1434 		xnbp->xnb_tx_buf_count++;
1435 	}
1436 
1437 	return (txp);
1438 }
1439 
1440 static void
1441 xnb_txbuf_put(xnb_t *xnbp, xnb_txbuf_t *txp)
1442 {
1443 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1444 	ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == XNB_TXBUF_INUSE);
1445 
1446 	txp->xt_flags &= ~XNB_TXBUF_INUSE;
1447 	xnbp->xnb_tx_buf_count--;
1448 
1449 	kmem_cache_free(xnb_txbuf_cachep, txp);
1450 }
1451 
1452 static mblk_t *
1453 xnb_from_peer(xnb_t *xnbp)
1454 {
1455 	RING_IDX start, end, loop;
1456 	gnttab_map_grant_ref_t *mop;
1457 	xnb_txbuf_t **txpp;
1458 	netif_tx_request_t *txreq;
1459 	boolean_t work_to_do;
1460 	mblk_t *head, *tail;
1461 	/*
1462 	 * If the peer granted a read-only mapping to the page then we
1463 	 * must copy the data, as the local protocol stack (should the
1464 	 * packet be destined for this host) will modify the packet
1465 	 * 'in place'.
1466 	 */
1467 	boolean_t copy = xnbp->xnb_tx_always_copy ||
1468 	    !xnbp->xnb_tx_pages_writable;
1469 
1470 	/*
1471 	 * For each individual request, the sequence of actions is:
1472 	 *
1473 	 * 1. get the request.
1474 	 * 2. map the page based on the grant ref.
1475 	 * 3. allocate an mblk, copy the data to it.
1476 	 * 4. release the grant.
1477 	 * 5. update the ring.
1478 	 * 6. pass the packet upward.
1479 	 * 7. kick the peer.
1480 	 *
1481 	 * In fact, we try to perform the grant operations in batches,
1482 	 * so there are two loops.
1483 	 */
1484 
1485 	head = tail = NULL;
1486 around:
1487 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1488 
1489 	/* LINTED: constant in conditional context */
1490 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
1491 	if (!work_to_do) {
1492 finished:
1493 		return (head);
1494 	}
1495 
1496 	start = xnbp->xnb_tx_ring.req_cons;
1497 	end = xnbp->xnb_tx_ring.sring->req_prod;
1498 
1499 	if ((end - start) > NET_TX_RING_SIZE) {
1500 		/*
1501 		 * This usually indicates that the frontend driver is
1502 		 * misbehaving, as it's not possible to have more than
1503 		 * NET_TX_RING_SIZE ring elements in play at any one
1504 		 * time.
1505 		 *
1506 		 * We reset the ring pointers to the state declared by
1507 		 * the frontend and try to carry on.
1508 		 */
1509 		cmn_err(CE_WARN, "xnb_from_peer: domain %d tried to give us %u "
1510 		    "items in the ring, resetting and trying to recover.",
1511 		    xnbp->xnb_peer, (end - start));
1512 
1513 		/* LINTED: constant in conditional context */
1514 		BACK_RING_ATTACH(&xnbp->xnb_tx_ring,
1515 		    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1516 
1517 		goto around;
1518 	}
1519 
1520 	for (loop = start, mop = xnbp->xnb_tx_mop, txpp = xnbp->xnb_tx_bufp;
1521 	    loop != end;
1522 	    loop++, mop++, txpp++) {
1523 		xnb_txbuf_t *txp;
1524 
1525 		txp = xnb_txbuf_get(xnbp, KM_NOSLEEP);
1526 		if (txp == NULL)
1527 			break;
1528 
1529 		ASSERT(xnbp->xnb_tx_pages_writable ||
1530 		    ((txp->xt_mop.flags & GNTMAP_readonly)
1531 		    == GNTMAP_readonly));
1532 
1533 		txp->xt_mop.ref =
1534 		    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop)->gref;
1535 
1536 		*mop = txp->xt_mop;
1537 		*txpp = txp;
1538 	}
1539 
1540 	if ((loop - start) == 0)
1541 		goto finished;
1542 
1543 	end = loop;
1544 
1545 	if (xen_map_gref(GNTTABOP_map_grant_ref, xnbp->xnb_tx_mop,
1546 	    end - start, B_FALSE) != 0) {
1547 
1548 		cmn_err(CE_WARN, "xnb_from_peer: map grant operation failed");
1549 
1550 		loop = start;
1551 		txpp = xnbp->xnb_tx_bufp;
1552 
1553 		while (loop != end) {
1554 			xnb_txbuf_put(xnbp, *txpp);
1555 
1556 			loop++;
1557 			txpp++;
1558 		}
1559 
1560 		goto finished;
1561 	}
1562 
1563 	for (loop = start, mop = xnbp->xnb_tx_mop, txpp = xnbp->xnb_tx_bufp;
1564 	    loop != end;
1565 	    loop++, mop++, txpp++) {
1566 		mblk_t *mp = NULL;
1567 		int16_t status = NETIF_RSP_OKAY;
1568 		xnb_txbuf_t *txp = *txpp;
1569 
1570 		if (mop->status != 0) {
1571 			cmn_err(CE_WARN, "xnb_from_peer: "
1572 			    "failed to map buffer: %d",
1573 			    mop->status);
1574 			status = NETIF_RSP_ERROR;
1575 		}
1576 
1577 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1578 
1579 		if (status == NETIF_RSP_OKAY) {
1580 			if (copy) {
1581 				mp = allocb(txreq->size, BPRI_MED);
1582 				if (mp == NULL) {
1583 					status = NETIF_RSP_ERROR;
1584 					xnbp->xnb_stat_tx_allocb_failed++;
1585 				} else {
1586 					bcopy((caddr_t)(uintptr_t)
1587 					    mop->host_addr + txreq->offset,
1588 					    mp->b_wptr, txreq->size);
1589 					mp->b_wptr += txreq->size;
1590 				}
1591 			} else {
1592 				mp = desballoc((uchar_t *)(uintptr_t)
1593 				    mop->host_addr + txreq->offset,
1594 				    txreq->size, 0, &txp->xt_free_rtn);
1595 				if (mp == NULL) {
1596 					status = NETIF_RSP_ERROR;
1597 					xnbp->xnb_stat_tx_allocb_failed++;
1598 				} else {
1599 					txp->xt_id = txreq->id;
1600 					txp->xt_status = status;
1601 					txp->xt_mop = *mop;
1602 
1603 					mp->b_wptr += txreq->size;
1604 				}
1605 			}
1606 
1607 			/*
1608 			 * If we have a buffer and there are checksum
1609 			 * flags, process them appropriately.
1610 			 */
1611 			if ((mp != NULL) &&
1612 			    ((txreq->flags &
1613 			    (NETTXF_csum_blank | NETTXF_data_validated))
1614 			    != 0)) {
1615 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
1616 				    mp, txreq->flags);
1617 				xnbp->xnb_stat_tx_cksum_no_need++;
1618 			}
1619 		}
1620 
1621 		if (copy || (mp == NULL)) {
1622 			txp->xt_status = status;
1623 			txp->xt_id = txreq->id;
1624 			xnb_tx_schedule_unmop(xnbp, mop, txp);
1625 		}
1626 
1627 		if (mp != NULL) {
1628 			xnbp->xnb_stat_opackets++;
1629 			xnbp->xnb_stat_obytes += txreq->size;
1630 
1631 			mp->b_next = NULL;
1632 			if (head == NULL) {
1633 				ASSERT(tail == NULL);
1634 				head = mp;
1635 			} else {
1636 				ASSERT(tail != NULL);
1637 				tail->b_next = mp;
1638 			}
1639 			tail = mp;
1640 		}
1641 	}
1642 
1643 	xnbp->xnb_tx_ring.req_cons = loop;
1644 
1645 	goto around;
1646 	/* NOTREACHED */
1647 }
1648 
1649 /*
1650  *  intr() -- ring interrupt service routine
1651  */
1652 static uint_t
1653 xnb_intr(caddr_t arg)
1654 {
1655 	xnb_t *xnbp = (xnb_t *)arg;
1656 	mblk_t *mp;
1657 
1658 	xnbp->xnb_stat_intr++;
1659 
1660 	mutex_enter(&xnbp->xnb_tx_lock);
1661 
1662 	ASSERT(xnbp->xnb_connected);
1663 
1664 	mp = xnb_from_peer(xnbp);
1665 
1666 	mutex_exit(&xnbp->xnb_tx_lock);
1667 
1668 	if (!xnbp->xnb_hotplugged) {
1669 		xnbp->xnb_stat_tx_too_early++;
1670 		goto fail;
1671 	}
1672 	if (mp == NULL) {
1673 		xnbp->xnb_stat_spurious_intr++;
1674 		goto fail;
1675 	}
1676 
1677 	xnbp->xnb_flavour->xf_from_peer(xnbp, mp);
1678 
1679 	return (DDI_INTR_CLAIMED);
1680 
1681 fail:
1682 	freemsgchain(mp);
1683 	return (DDI_INTR_CLAIMED);
1684 }
1685 
1686 static boolean_t
1687 xnb_connect_rings(dev_info_t *dip)
1688 {
1689 	xnb_t *xnbp = ddi_get_driver_private(dip);
1690 	char *oename;
1691 	struct gnttab_map_grant_ref map_op;
1692 	evtchn_port_t evtchn;
1693 	int i;
1694 
1695 	/*
1696 	 * Cannot attempt to connect the rings if already connected.
1697 	 */
1698 	ASSERT(!xnbp->xnb_connected);
1699 
1700 	oename = xvdi_get_oename(dip);
1701 
1702 	if (xenbus_gather(XBT_NULL, oename,
1703 	    "event-channel", "%u", &evtchn,
1704 	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
1705 	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
1706 	    NULL) != 0) {
1707 		cmn_err(CE_WARN, "xnb_connect_rings: "
1708 		    "cannot read other-end details from %s",
1709 		    oename);
1710 		goto fail;
1711 	}
1712 
1713 	if (xenbus_scanf(XBT_NULL, oename,
1714 	    "feature-tx-writable", "%d", &i) != 0)
1715 		i = 0;
1716 	if (i != 0)
1717 		xnbp->xnb_tx_pages_writable = B_TRUE;
1718 
1719 	if (xenbus_scanf(XBT_NULL, oename,
1720 	    "feature-no-csum-offload", "%d", &i) != 0)
1721 		i = 0;
1722 	if ((i == 1) || !xnbp->xnb_cksum_offload)
1723 		xnbp->xnb_cksum_offload = B_FALSE;
1724 
1725 	/* Check whether our peer knows and requests hypervisor copy */
1726 	if (xenbus_scanf(XBT_NULL, oename, "request-rx-copy", "%d", &i)
1727 	    != 0)
1728 		i = 0;
1729 	if (i != 0)
1730 		xnbp->xnb_hv_copy = B_TRUE;
1731 
1732 	/*
1733 	 * 1. allocate a vaddr for the tx page, one for the rx page.
1734 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1735 	 *    into the allocated vaddr (one for tx, one for rx).
1736 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1737 	 *    bound to this domain.
1738 	 * 4. associate the event channel with an interrupt.
1739 	 * 5. declare ourselves connected.
1740 	 * 6. enable the interrupt.
1741 	 */
1742 
1743 	/* 1.tx */
1744 	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1745 	    0, 0, 0, 0, VM_SLEEP);
1746 	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
1747 
1748 	/* 2.tx */
1749 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
1750 	map_op.flags = GNTMAP_host_map;
1751 	map_op.ref = xnbp->xnb_tx_ring_ref;
1752 	map_op.dom = xnbp->xnb_peer;
1753 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr, NULL);
1754 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1755 	    map_op.status != 0) {
1756 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1757 		goto fail;
1758 	}
1759 	xnbp->xnb_tx_ring_handle = map_op.handle;
1760 
1761 	/* LINTED: constant in conditional context */
1762 	BACK_RING_INIT(&xnbp->xnb_tx_ring,
1763 	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1764 
1765 	/* 1.rx */
1766 	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1767 	    0, 0, 0, 0, VM_SLEEP);
1768 	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
1769 
1770 	/* 2.rx */
1771 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
1772 	map_op.flags = GNTMAP_host_map;
1773 	map_op.ref = xnbp->xnb_rx_ring_ref;
1774 	map_op.dom = xnbp->xnb_peer;
1775 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr, NULL);
1776 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1777 	    map_op.status != 0) {
1778 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1779 		goto fail;
1780 	}
1781 	xnbp->xnb_rx_ring_handle = map_op.handle;
1782 
1783 	/* LINTED: constant in conditional context */
1784 	BACK_RING_INIT(&xnbp->xnb_rx_ring,
1785 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
1786 
1787 	/* 3 */
1788 	if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS) {
1789 		cmn_err(CE_WARN, "xnb_connect_rings: "
1790 		    "cannot bind event channel %d", xnbp->xnb_evtchn);
1791 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1792 		goto fail;
1793 	}
1794 	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
1795 
1796 	/*
1797 	 * It would be good to set the state to XenbusStateConnected
1798 	 * here as well, but then what if ddi_add_intr() failed?
1799 	 * Changing the state in the store will be noticed by the peer
1800 	 * and cannot be "taken back".
1801 	 */
1802 	mutex_enter(&xnbp->xnb_tx_lock);
1803 	mutex_enter(&xnbp->xnb_rx_lock);
1804 
1805 	/* 5.1 */
1806 	xnbp->xnb_connected = B_TRUE;
1807 
1808 	mutex_exit(&xnbp->xnb_rx_lock);
1809 	mutex_exit(&xnbp->xnb_tx_lock);
1810 
1811 	/* 4, 6 */
1812 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1813 	    != DDI_SUCCESS) {
1814 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1815 		goto fail;
1816 	}
1817 	xnbp->xnb_irq = B_TRUE;
1818 
1819 	/* 5.2 */
1820 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1821 
1822 	return (B_TRUE);
1823 
1824 fail:
1825 	mutex_enter(&xnbp->xnb_tx_lock);
1826 	mutex_enter(&xnbp->xnb_rx_lock);
1827 
1828 	xnbp->xnb_connected = B_FALSE;
1829 	mutex_exit(&xnbp->xnb_rx_lock);
1830 	mutex_exit(&xnbp->xnb_tx_lock);
1831 
1832 	return (B_FALSE);
1833 }
1834 
1835 static void
1836 xnb_disconnect_rings(dev_info_t *dip)
1837 {
1838 	xnb_t *xnbp = ddi_get_driver_private(dip);
1839 
1840 	if (xnbp->xnb_irq) {
1841 		ddi_remove_intr(dip, 0, NULL);
1842 		xnbp->xnb_irq = B_FALSE;
1843 	}
1844 
1845 	if (xnbp->xnb_tx_unmop_count > 0)
1846 		xnb_tx_perform_pending_unmop(xnbp);
1847 
1848 	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
1849 		xvdi_free_evtchn(dip);
1850 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1851 	}
1852 
1853 	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
1854 		struct gnttab_unmap_grant_ref unmap_op;
1855 
1856 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1857 		    xnbp->xnb_rx_ring_addr;
1858 		unmap_op.dev_bus_addr = 0;
1859 		unmap_op.handle = xnbp->xnb_rx_ring_handle;
1860 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1861 		    &unmap_op, 1) != 0)
1862 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1863 			    "cannot unmap rx-ring page (%d)",
1864 			    unmap_op.status);
1865 
1866 		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
1867 	}
1868 
1869 	if (xnbp->xnb_rx_ring_addr != NULL) {
1870 		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1871 		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
1872 		xnbp->xnb_rx_ring_addr = NULL;
1873 	}
1874 
1875 	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
1876 		struct gnttab_unmap_grant_ref unmap_op;
1877 
1878 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1879 		    xnbp->xnb_tx_ring_addr;
1880 		unmap_op.dev_bus_addr = 0;
1881 		unmap_op.handle = xnbp->xnb_tx_ring_handle;
1882 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1883 		    &unmap_op, 1) != 0)
1884 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1885 			    "cannot unmap tx-ring page (%d)",
1886 			    unmap_op.status);
1887 
1888 		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
1889 	}
1890 
1891 	if (xnbp->xnb_tx_ring_addr != NULL) {
1892 		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1893 		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
1894 		xnbp->xnb_tx_ring_addr = NULL;
1895 	}
1896 }
1897 
1898 /*ARGSUSED*/
1899 static void
1900 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1901     void *arg, void *impl_data)
1902 {
1903 	xnb_t *xnbp = ddi_get_driver_private(dip);
1904 	XenbusState new_state = *(XenbusState *)impl_data;
1905 
1906 	ASSERT(xnbp != NULL);
1907 
1908 	switch (new_state) {
1909 	case XenbusStateConnected:
1910 		/* spurious state change */
1911 		if (xnbp->xnb_connected)
1912 			return;
1913 
1914 		if (xnb_connect_rings(dip)) {
1915 			xnbp->xnb_flavour->xf_peer_connected(xnbp);
1916 		} else {
1917 			xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1918 			xnb_disconnect_rings(dip);
1919 			(void) xvdi_switch_state(dip, XBT_NULL,
1920 			    XenbusStateClosed);
1921 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1922 		}
1923 
1924 		/*
1925 		 * Now that we've attempted to connect it's reasonable
1926 		 * to allow an attempt to detach.
1927 		 */
1928 		xnbp->xnb_detachable = B_TRUE;
1929 
1930 		break;
1931 
1932 	case XenbusStateClosing:
1933 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1934 
1935 		break;
1936 
1937 	case XenbusStateClosed:
1938 		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1939 
1940 		mutex_enter(&xnbp->xnb_tx_lock);
1941 		mutex_enter(&xnbp->xnb_rx_lock);
1942 
1943 		xnb_disconnect_rings(dip);
1944 		xnbp->xnb_connected = B_FALSE;
1945 
1946 		mutex_exit(&xnbp->xnb_rx_lock);
1947 		mutex_exit(&xnbp->xnb_tx_lock);
1948 
1949 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1950 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1951 		/*
1952 		 * In all likelyhood this is already set (in the above
1953 		 * case), but if the peer never attempted to connect
1954 		 * and the domain is destroyed we get here without
1955 		 * having been through the case above, so we set it to
1956 		 * be sure.
1957 		 */
1958 		xnbp->xnb_detachable = B_TRUE;
1959 
1960 		break;
1961 
1962 	default:
1963 		break;
1964 	}
1965 }
1966 
1967 /*ARGSUSED*/
1968 static void
1969 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1970     void *arg, void *impl_data)
1971 {
1972 	xnb_t *xnbp = ddi_get_driver_private(dip);
1973 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
1974 	boolean_t success;
1975 
1976 	ASSERT(xnbp != NULL);
1977 
1978 	switch (state) {
1979 	case Connected:
1980 
1981 		/* spurious hotplug event */
1982 		if (xnbp->xnb_hotplugged)
1983 			return;
1984 
1985 		success = xnbp->xnb_flavour->xf_hotplug_connected(xnbp);
1986 
1987 		mutex_enter(&xnbp->xnb_tx_lock);
1988 		mutex_enter(&xnbp->xnb_rx_lock);
1989 
1990 		xnbp->xnb_hotplugged = success;
1991 
1992 		mutex_exit(&xnbp->xnb_rx_lock);
1993 		mutex_exit(&xnbp->xnb_tx_lock);
1994 		break;
1995 
1996 	default:
1997 		break;
1998 	}
1999 }
2000 
2001 static struct modldrv modldrv = {
2002 	&mod_miscops, "xnb",
2003 };
2004 
2005 static struct modlinkage modlinkage = {
2006 	MODREV_1, &modldrv, NULL
2007 };
2008 
2009 int
2010 _init(void)
2011 {
2012 	int i;
2013 
2014 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
2015 
2016 	xnb_txbuf_cachep = kmem_cache_create("xnb_txbuf_cachep",
2017 	    sizeof (xnb_txbuf_t), 0, xnb_txbuf_constructor,
2018 	    xnb_txbuf_destructor, NULL, NULL, NULL, 0);
2019 	ASSERT(xnb_txbuf_cachep != NULL);
2020 
2021 	i = mod_install(&modlinkage);
2022 	if (i != DDI_SUCCESS) {
2023 		kmem_cache_destroy(xnb_txbuf_cachep);
2024 		mutex_destroy(&xnb_alloc_page_lock);
2025 	}
2026 	return (i);
2027 }
2028 
2029 int
2030 _info(struct modinfo *modinfop)
2031 {
2032 	return (mod_info(&modlinkage, modinfop));
2033 }
2034 
2035 int
2036 _fini(void)
2037 {
2038 	int i;
2039 
2040 	i = mod_remove(&modlinkage);
2041 	if (i == DDI_SUCCESS) {
2042 		kmem_cache_destroy(xnb_txbuf_cachep);
2043 		mutex_destroy(&xnb_alloc_page_lock);
2044 	}
2045 	return (i);
2046 }
2047