xref: /illumos-gate/usr/src/uts/common/xen/io/xnb.c (revision 2f0fcb93196badcdd803715656c809058d9f3114)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifdef DEBUG
28 #define	XNB_DEBUG 1
29 #endif /* DEBUG */
30 
31 #include "xnb.h"
32 
33 #include <sys/sunddi.h>
34 #include <sys/sunndi.h>
35 #include <sys/modctl.h>
36 #include <sys/conf.h>
37 #include <sys/mac.h>
38 #include <sys/dlpi.h>
39 #include <sys/strsubr.h>
40 #include <sys/strsun.h>
41 #include <sys/types.h>
42 #include <sys/pattr.h>
43 #include <vm/seg_kmem.h>
44 #include <vm/hat_i86.h>
45 #include <xen/sys/xenbus_impl.h>
46 #include <xen/sys/xendev.h>
47 #include <sys/balloon_impl.h>
48 #include <sys/evtchn_impl.h>
49 #include <sys/gnttab.h>
50 #include <vm/vm_dep.h>
51 
52 #include <sys/gld.h>
53 #include <inet/ip.h>
54 #include <inet/ip_impl.h>
55 #include <sys/vnic_impl.h> /* blech. */
56 
57 /*
58  * The terms "transmit" and "receive" are used in alignment with domU,
59  * which means that packets originating from the peer domU are "transmitted"
60  * to other parts of the system and packets are "received" from them.
61  */
62 
63 /*
64  * XXPV dme: things to do, as well as various things indicated
65  * throughout the source:
66  * - copy avoidance outbound.
67  * - copy avoidance inbound.
68  * - transfer credit limiting.
69  * - MAC address based filtering.
70  */
71 
72 /*
73  * Linux expects to have some headroom in received buffers.  The Linux
74  * frontend driver (netfront) checks to see if the headroom is
75  * available and will re-allocate the buffer to make room if
76  * necessary.  To avoid this we add RX_BUFFER_HEADROOM bytes of
77  * headroom to each packet we pass to the peer.
78  */
79 #define	RX_BUFFER_HEADROOM	16
80 
/*
 * Should we attempt to defer checksum calculation?
 *
 * Copied into each instance (xnb_cksum_offload) by xnb_attach(), which
 * also publishes the inverse to the peer as "feature-no-csum-offload".
 */
static boolean_t	xnb_cksum_offload = B_TRUE;
/*
 * When receiving packets from a guest, should they be copied
 * or used as-is (esballoc)?
 *
 * Copied into each instance (xnb_tx_always_copy) by xnb_attach().
 */
static boolean_t	xnb_tx_always_copy = B_TRUE;

/*
 * Forward declarations for functions defined later in this file.
 */
static boolean_t	xnb_connect_rings(dev_info_t *);
static void		xnb_disconnect_rings(dev_info_t *);
static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);

static int	xnb_txbuf_constructor(void *, void *, int);
static void	xnb_txbuf_destructor(void *, void *);
static xnb_txbuf_t *xnb_txbuf_get(xnb_t *, int);
static void	xnb_txbuf_put(xnb_t *, xnb_txbuf_t *);
static void	xnb_tx_notify_peer(xnb_t *);
static void	xnb_tx_complete(xnb_txbuf_t *);
static void	xnb_tx_mark_complete(xnb_t *, RING_IDX, int16_t);
static void 	xnb_tx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *,
    xnb_txbuf_t *);
static void	xnb_tx_perform_pending_unmop(xnb_t *);
mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);

/*
 * Low and high watermarks: one quarter and three quarters of
 * NET_TX_RING_SIZE respectively.
 * NOTE(review): presumably thresholds on the number of pending unmap
 * operations (see xnb_tx_schedule_unmop()); confirm against their users.
 */
int		xnb_unmop_lowwat = NET_TX_RING_SIZE >> 2;
int		xnb_unmop_hiwat = NET_TX_RING_SIZE - (NET_TX_RING_SIZE >> 2);


/*
 * xnb_hv_copy is advertised to the peer as "feature-rx-copy" and
 * xnb_explicit_pageflip_set as "feature-rx-flip" by xnb_attach().
 */
boolean_t	xnb_hv_copy = B_TRUE;
boolean_t	xnb_explicit_pageflip_set = B_FALSE;

/* XXPV dme: are these really invalid? */
#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define	INVALID_GRANT_REF	((grant_ref_t)-1)

/*
 * NOTE(review): xnb_txbuf_cachep is presumably the kmem cache backing
 * xnb_txbuf_t allocations; its setup is not visible in this part of the
 * file.
 */
static kmem_cache_t *xnb_txbuf_cachep;
/* Protects the shared batch of pre-allocated pages in xnb_alloc_page(). */
static kmutex_t	xnb_alloc_page_lock;
123 
/*
 * Statistics.
 *
 * Names of the auxiliary named kstats exported by each instance.
 * xnb_ks_init() creates one kstat_named_t per entry, in this order, and
 * xnb_ks_aux_update() fills the values in positionally, so the order
 * here must be kept in step with the assignments in that function.
 */
static char *aux_statistics[] = {
	"rx_cksum_deferred",
	"tx_cksum_no_need",
	"rx_rsp_notok",
	"tx_notify_deferred",
	"tx_notify_sent",
	"rx_notify_deferred",
	"rx_notify_sent",
	"tx_too_early",
	"rx_too_early",
	"rx_allocb_failed",
	"tx_allocb_failed",
	"rx_foreign_page",
	"mac_full",
	"spurious_intr",
	"allocation_success",
	"allocation_failure",
	"small_allocation_success",
	"small_allocation_failure",
	"other_allocation_failure",
	"rx_pageboundary_crossed",
	"rx_cpoparea_grown",
	"csum_hardware",
	"csum_software",
};
152 
153 static int
154 xnb_ks_aux_update(kstat_t *ksp, int flag)
155 {
156 	xnb_t *xnbp;
157 	kstat_named_t *knp;
158 
159 	if (flag != KSTAT_READ)
160 		return (EACCES);
161 
162 	xnbp = ksp->ks_private;
163 	knp = ksp->ks_data;
164 
165 	/*
166 	 * Assignment order should match that of the names in
167 	 * aux_statistics.
168 	 */
169 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_deferred;
170 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_no_need;
171 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_rsp_notok;
172 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
173 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
174 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
175 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
176 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
177 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
178 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
179 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
180 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_foreign_page;
181 	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
182 	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
183 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
184 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
185 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
186 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
187 	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
188 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_pagebndry_crossed;
189 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cpoparea_grown;
190 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
191 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
192 
193 	return (0);
194 }
195 
196 static boolean_t
197 xnb_ks_init(xnb_t *xnbp)
198 {
199 	int nstat = sizeof (aux_statistics) /
200 	    sizeof (aux_statistics[0]);
201 	char **cp = aux_statistics;
202 	kstat_named_t *knp;
203 
204 	/*
205 	 * Create and initialise kstats.
206 	 */
207 	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
208 	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
209 	    KSTAT_TYPE_NAMED, nstat, 0);
210 	if (xnbp->xnb_kstat_aux == NULL)
211 		return (B_FALSE);
212 
213 	xnbp->xnb_kstat_aux->ks_private = xnbp;
214 	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
215 
216 	knp = xnbp->xnb_kstat_aux->ks_data;
217 	while (nstat > 0) {
218 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
219 
220 		knp++;
221 		cp++;
222 		nstat--;
223 	}
224 
225 	kstat_install(xnbp->xnb_kstat_aux);
226 
227 	return (B_TRUE);
228 }
229 
230 static void
231 xnb_ks_free(xnb_t *xnbp)
232 {
233 	kstat_delete(xnbp->xnb_kstat_aux);
234 }
235 
236 /*
237  * Software checksum calculation and insertion for an arbitrary packet.
238  */
239 /*ARGSUSED*/
240 static mblk_t *
241 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
242 {
243 	/*
244 	 * XXPV dme: shouldn't rely on vnic_fix_cksum(), not least
245 	 * because it doesn't cover all of the interesting cases :-(
246 	 */
247 	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
248 	    HCK_FULLCKSUM, KM_NOSLEEP);
249 
250 	return (vnic_fix_cksum(mp));
251 }
252 
253 mblk_t *
254 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
255 {
256 	struct ether_header *ehp;
257 	uint16_t sap;
258 	uint32_t offset;
259 	ipha_t *ipha;
260 
261 	ASSERT(mp->b_next == NULL);
262 
263 	/*
264 	 * Check that the packet is contained in a single mblk.  In
265 	 * the "from peer" path this is true today, but will change
266 	 * when scatter gather support is added.  In the "to peer"
267 	 * path we cannot be sure, but in most cases it will be true
268 	 * (in the xnbo case the packet has come from a MAC device
269 	 * which is unlikely to split packets).
270 	 */
271 	if (mp->b_cont != NULL)
272 		goto software;
273 
274 	/*
275 	 * If the MAC has no hardware capability don't do any further
276 	 * checking.
277 	 */
278 	if (capab == 0)
279 		goto software;
280 
281 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
282 	ehp = (struct ether_header *)mp->b_rptr;
283 
284 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
285 		struct ether_vlan_header *evhp;
286 
287 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
288 		evhp = (struct ether_vlan_header *)mp->b_rptr;
289 		sap = ntohs(evhp->ether_type);
290 		offset = sizeof (struct ether_vlan_header);
291 	} else {
292 		sap = ntohs(ehp->ether_type);
293 		offset = sizeof (struct ether_header);
294 	}
295 
296 	/*
297 	 * We only attempt to do IPv4 packets in hardware.
298 	 */
299 	if (sap != ETHERTYPE_IP)
300 		goto software;
301 
302 	/*
303 	 * We know that this is an IPv4 packet.
304 	 */
305 	ipha = (ipha_t *)(mp->b_rptr + offset);
306 
307 	switch (ipha->ipha_protocol) {
308 	case IPPROTO_TCP:
309 	case IPPROTO_UDP: {
310 		uint32_t start, length, stuff, cksum;
311 		uint16_t *stuffp;
312 
313 		/*
314 		 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
315 		 * can use full IPv4 and partial checksum offload.
316 		 */
317 		if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
318 			break;
319 
320 		start = IP_SIMPLE_HDR_LENGTH;
321 		length = ntohs(ipha->ipha_length);
322 		if (ipha->ipha_protocol == IPPROTO_TCP) {
323 			stuff = start + TCP_CHECKSUM_OFFSET;
324 			cksum = IP_TCP_CSUM_COMP;
325 		} else {
326 			stuff = start + UDP_CHECKSUM_OFFSET;
327 			cksum = IP_UDP_CSUM_COMP;
328 		}
329 		stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);
330 
331 		if (capab & HCKSUM_INET_FULL_V4) {
332 			/*
333 			 * Some devices require that the checksum
334 			 * field of the packet is zero for full
335 			 * offload.
336 			 */
337 			*stuffp = 0;
338 
339 			(void) hcksum_assoc(mp, NULL, NULL,
340 			    0, 0, 0, 0,
341 			    HCK_FULLCKSUM, KM_NOSLEEP);
342 
343 			xnbp->xnb_stat_csum_hardware++;
344 
345 			return (mp);
346 		}
347 
348 		if (capab & HCKSUM_INET_PARTIAL) {
349 			if (*stuffp == 0) {
350 				ipaddr_t src, dst;
351 
352 				/*
353 				 * Older Solaris guests don't insert
354 				 * the pseudo-header checksum, so we
355 				 * calculate it here.
356 				 */
357 				src = ipha->ipha_src;
358 				dst = ipha->ipha_dst;
359 
360 				cksum += (dst >> 16) + (dst & 0xFFFF);
361 				cksum += (src >> 16) + (src & 0xFFFF);
362 				cksum += length - IP_SIMPLE_HDR_LENGTH;
363 
364 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
365 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
366 
367 				ASSERT(cksum <= 0xFFFF);
368 
369 				*stuffp = (uint16_t)(cksum ? cksum : ~cksum);
370 			}
371 
372 			(void) hcksum_assoc(mp, NULL, NULL,
373 			    start, stuff, length, 0,
374 			    HCK_PARTIALCKSUM, KM_NOSLEEP);
375 
376 			xnbp->xnb_stat_csum_hardware++;
377 
378 			return (mp);
379 		}
380 
381 		/* NOTREACHED */
382 		break;
383 	}
384 
385 	default:
386 		/* Use software. */
387 		break;
388 	}
389 
390 software:
391 	/*
392 	 * We are not able to use any offload so do the whole thing in
393 	 * software.
394 	 */
395 	xnbp->xnb_stat_csum_software++;
396 
397 	return (xnb_software_csum(xnbp, mp));
398 }
399 
400 int
401 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
402 {
403 	xnb_t *xnbp;
404 	char *xsname, mac[ETHERADDRL * 3];
405 
406 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
407 
408 	xnbp->xnb_flavour = flavour;
409 	xnbp->xnb_flavour_data = flavour_data;
410 	xnbp->xnb_devinfo = dip;
411 	xnbp->xnb_evtchn = INVALID_EVTCHN;
412 	xnbp->xnb_irq = B_FALSE;
413 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
414 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
415 	xnbp->xnb_cksum_offload = xnb_cksum_offload;
416 	xnbp->xnb_connected = B_FALSE;
417 	xnbp->xnb_hotplugged = B_FALSE;
418 	xnbp->xnb_detachable = B_FALSE;
419 	xnbp->xnb_peer = xvdi_get_oeid(dip);
420 	xnbp->xnb_tx_pages_writable = B_FALSE;
421 	xnbp->xnb_tx_always_copy = xnb_tx_always_copy;
422 
423 	xnbp->xnb_tx_buf_count = 0;
424 	xnbp->xnb_tx_unmop_count = 0;
425 
426 	xnbp->xnb_hv_copy = B_FALSE;
427 
428 	xnbp->xnb_rx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
429 	ASSERT(xnbp->xnb_rx_va != NULL);
430 
431 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
432 	    != DDI_SUCCESS)
433 		goto failure;
434 
435 	/* allocated on demand, when/if we enter xnb_copy_to_peer() */
436 	xnbp->xnb_rx_cpop = NULL;
437 	xnbp->xnb_cpop_sz = 0;
438 
439 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
440 	    xnbp->xnb_icookie);
441 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
442 	    xnbp->xnb_icookie);
443 
444 	/* set driver private pointer now */
445 	ddi_set_driver_private(dip, xnbp);
446 
447 	if (!xnb_ks_init(xnbp))
448 		goto failure_1;
449 
450 	/*
451 	 * Receive notification of changes in the state of the
452 	 * driver in the guest domain.
453 	 */
454 	if (xvdi_add_event_handler(dip, XS_OE_STATE,
455 	    xnb_oe_state_change) != DDI_SUCCESS)
456 		goto failure_2;
457 
458 	/*
459 	 * Receive notification of hotplug events.
460 	 */
461 	if (xvdi_add_event_handler(dip, XS_HP_STATE,
462 	    xnb_hp_state_change) != DDI_SUCCESS)
463 		goto failure_2;
464 
465 	xsname = xvdi_get_xsname(dip);
466 
467 	if (xenbus_printf(XBT_NULL, xsname,
468 	    "feature-no-csum-offload", "%d",
469 	    xnbp->xnb_cksum_offload ? 0 : 1) != 0)
470 		goto failure_3;
471 
472 	/*
473 	 * Use global xnb_hv_copy to export this feature. This means that
474 	 * we have to decide what to do before starting up a guest domain
475 	 */
476 	if (xenbus_printf(XBT_NULL, xsname,
477 	    "feature-rx-copy", "%d", xnb_hv_copy ? 1 : 0) != 0)
478 		goto failure_3;
479 	/*
480 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
481 	 * in addition to "feature-rx-copy" being 1. It seems strange
482 	 * to use four possible states to describe a binary decision,
483 	 * but we might as well play nice.
484 	 */
485 	if (xenbus_printf(XBT_NULL, xsname,
486 	    "feature-rx-flip", "%d", xnb_explicit_pageflip_set ? 1 : 0) != 0)
487 		goto failure_3;
488 
489 	if (xenbus_scanf(XBT_NULL, xsname,
490 	    "mac", "%s", mac) != 0) {
491 		cmn_err(CE_WARN, "xnb_attach: "
492 		    "cannot read mac address from %s",
493 		    xsname);
494 		goto failure_3;
495 	}
496 
497 	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
498 		cmn_err(CE_WARN,
499 		    "xnb_attach: cannot parse mac address %s",
500 		    mac);
501 		goto failure_3;
502 	}
503 
504 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
505 	(void) xvdi_post_event(dip, XEN_HP_ADD);
506 
507 	return (DDI_SUCCESS);
508 
509 failure_3:
510 	xvdi_remove_event_handler(dip, NULL);
511 
512 failure_2:
513 	xnb_ks_free(xnbp);
514 
515 failure_1:
516 	mutex_destroy(&xnbp->xnb_rx_lock);
517 	mutex_destroy(&xnbp->xnb_tx_lock);
518 
519 failure:
520 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
521 	kmem_free(xnbp, sizeof (*xnbp));
522 	return (DDI_FAILURE);
523 }
524 
525 /*ARGSUSED*/
526 void
527 xnb_detach(dev_info_t *dip)
528 {
529 	xnb_t *xnbp = ddi_get_driver_private(dip);
530 
531 	ASSERT(xnbp != NULL);
532 	ASSERT(!xnbp->xnb_connected);
533 	ASSERT(xnbp->xnb_tx_buf_count == 0);
534 
535 	xnb_disconnect_rings(dip);
536 
537 	xvdi_remove_event_handler(dip, NULL);
538 
539 	xnb_ks_free(xnbp);
540 
541 	ddi_set_driver_private(dip, NULL);
542 
543 	mutex_destroy(&xnbp->xnb_tx_lock);
544 	mutex_destroy(&xnbp->xnb_rx_lock);
545 
546 	if (xnbp->xnb_cpop_sz > 0)
547 		kmem_free(xnbp->xnb_rx_cpop, sizeof (*xnbp->xnb_rx_cpop)
548 		    * xnbp->xnb_cpop_sz);
549 
550 	ASSERT(xnbp->xnb_rx_va != NULL);
551 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
552 
553 	kmem_free(xnbp, sizeof (*xnbp));
554 }
555 
556 
557 static mfn_t
558 xnb_alloc_page(xnb_t *xnbp)
559 {
560 #define	WARNING_RATE_LIMIT 100
561 #define	BATCH_SIZE 256
562 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
563 	static int nth = BATCH_SIZE;
564 	mfn_t mfn;
565 
566 	mutex_enter(&xnb_alloc_page_lock);
567 	if (nth == BATCH_SIZE) {
568 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
569 			xnbp->xnb_stat_allocation_failure++;
570 			mutex_exit(&xnb_alloc_page_lock);
571 
572 			/*
573 			 * Try for a single page in low memory situations.
574 			 */
575 			if (balloon_alloc_pages(1, &mfn) != 1) {
576 				if ((xnbp->xnb_stat_small_allocation_failure++
577 				    % WARNING_RATE_LIMIT) == 0)
578 					cmn_err(CE_WARN, "xnb_alloc_page: "
579 					    "Cannot allocate memory to "
580 					    "transfer packets to peer.");
581 				return (0);
582 			} else {
583 				xnbp->xnb_stat_small_allocation_success++;
584 				return (mfn);
585 			}
586 		}
587 
588 		nth = 0;
589 		xnbp->xnb_stat_allocation_success++;
590 	}
591 
592 	mfn = mfns[nth++];
593 	mutex_exit(&xnb_alloc_page_lock);
594 
595 	ASSERT(mfn != 0);
596 
597 	return (mfn);
598 #undef BATCH_SIZE
599 #undef WARNING_RATE_LIMIT
600 }
601 
602 /*ARGSUSED*/
603 static void
604 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
605 {
606 	int r;
607 	pfn_t pfn;
608 
609 	pfn = xen_assign_pfn(mfn);
610 	pfnzero(pfn, 0, PAGESIZE);
611 	xen_release_pfn(pfn);
612 
613 	/*
614 	 * This happens only in the error path, so batching is
615 	 * not worth the complication.
616 	 */
617 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
618 		cmn_err(CE_WARN, "free_page: cannot decrease memory "
619 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
620 		    r, mfn);
621 	}
622 }
623 
/*
 * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but
 * using local variables.
 *
 * The expansion refers to two variables that must be in scope at the
 * point of use: 'loop' (the caller's working copy of the request
 * consumer index) and 'prod' (its working copy of the private response
 * producer index).  It evaluates to the smaller of
 *
 *	sring->req_prod - loop		requests posted but not yet taken
 *	RING_SIZE - (loop - prod)	ring size less requests taken but
 *					not yet answered
 *
 * and is therefore non-zero exactly when another request may be
 * consumed.
 */
#define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
	((((_r)->sring->req_prod - loop) <		\
		(RING_SIZE(_r) - (loop - prod))) ?	\
	    ((_r)->sring->req_prod - loop) :		\
	    (RING_SIZE(_r) - (loop - prod)))
633 
634 mblk_t *
635 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
636 {
637 	mblk_t *free = mp, *prev = NULL;
638 	size_t len;
639 	gnttab_transfer_t *gop;
640 	boolean_t notify;
641 	RING_IDX loop, prod, end;
642 
643 	/*
644 	 * For each packet the sequence of operations is:
645 	 *
646 	 * 1. get a new page from the hypervisor.
647 	 * 2. get a request slot from the ring.
648 	 * 3. copy the data into the new page.
649 	 * 4. transfer the page to the peer.
650 	 * 5. update the request slot.
651 	 * 6. kick the peer.
652 	 * 7. free mp.
653 	 *
654 	 * In order to reduce the number of hypercalls, we prepare
655 	 * several packets for the peer and perform a single hypercall
656 	 * to transfer them.
657 	 */
658 
659 	mutex_enter(&xnbp->xnb_rx_lock);
660 
661 	/*
662 	 * If we are not connected to the peer or have not yet
663 	 * finished hotplug it is too early to pass packets to the
664 	 * peer.
665 	 */
666 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
667 		mutex_exit(&xnbp->xnb_rx_lock);
668 		DTRACE_PROBE(flip_rx_too_early);
669 		xnbp->xnb_stat_rx_too_early++;
670 		return (mp);
671 	}
672 
673 	loop = xnbp->xnb_rx_ring.req_cons;
674 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
675 	gop = xnbp->xnb_rx_top;
676 
677 	while ((mp != NULL) &&
678 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
679 
680 		mfn_t mfn;
681 		pfn_t pfn;
682 		netif_rx_request_t *rxreq;
683 		netif_rx_response_t *rxresp;
684 		char *valoop;
685 		size_t offset;
686 		mblk_t *ml;
687 		uint16_t cksum_flags;
688 
689 		/* 1 */
690 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
691 			xnbp->xnb_stat_rx_defer++;
692 			break;
693 		}
694 
695 		/* 2 */
696 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
697 
698 #ifdef XNB_DEBUG
699 		if (!(rxreq->id < NET_RX_RING_SIZE))
700 			cmn_err(CE_PANIC, "xnb_to_peer: "
701 			    "id %d out of range in request 0x%p",
702 			    rxreq->id, (void *)rxreq);
703 #endif /* XNB_DEBUG */
704 
705 		/* Assign a pfn and map the new page at the allocated va. */
706 		pfn = xen_assign_pfn(mfn);
707 		hat_devload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
708 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
709 
710 		offset = RX_BUFFER_HEADROOM;
711 
712 		/* 3 */
713 		len = 0;
714 		valoop = xnbp->xnb_rx_va + offset;
715 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
716 			size_t chunk = ml->b_wptr - ml->b_rptr;
717 
718 			bcopy(ml->b_rptr, valoop, chunk);
719 			valoop += chunk;
720 			len += chunk;
721 		}
722 
723 		ASSERT(len + offset < PAGESIZE);
724 
725 		/* Release the pfn. */
726 		hat_unload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
727 		    HAT_UNLOAD_UNMAP);
728 		xen_release_pfn(pfn);
729 
730 		/* 4 */
731 		gop->mfn = mfn;
732 		gop->domid = xnbp->xnb_peer;
733 		gop->ref = rxreq->gref;
734 
735 		/* 5.1 */
736 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
737 		rxresp->offset = offset;
738 		rxresp->flags = 0;
739 
740 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
741 		if (cksum_flags != 0)
742 			xnbp->xnb_stat_rx_cksum_deferred++;
743 		rxresp->flags |= cksum_flags;
744 
745 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
746 		rxresp->status = len;
747 
748 		loop++;
749 		prod++;
750 		gop++;
751 		prev = mp;
752 		mp = mp->b_next;
753 	}
754 
755 	/*
756 	 * Did we actually do anything?
757 	 */
758 	if (loop == xnbp->xnb_rx_ring.req_cons) {
759 		mutex_exit(&xnbp->xnb_rx_lock);
760 		return (mp);
761 	}
762 
763 	end = loop;
764 
765 	/*
766 	 * Unlink the end of the 'done' list from the remainder.
767 	 */
768 	ASSERT(prev != NULL);
769 	prev->b_next = NULL;
770 
771 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_rx_top,
772 	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
773 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
774 	}
775 
776 	loop = xnbp->xnb_rx_ring.req_cons;
777 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
778 	gop = xnbp->xnb_rx_top;
779 
780 	while (loop < end) {
781 		int16_t status = NETIF_RSP_OKAY;
782 
783 		if (gop->status != 0) {
784 			status = NETIF_RSP_ERROR;
785 
786 			/*
787 			 * If the status is anything other than
788 			 * GNTST_bad_page then we don't own the page
789 			 * any more, so don't try to give it back.
790 			 */
791 			if (gop->status != GNTST_bad_page)
792 				gop->mfn = 0;
793 		} else {
794 			/* The page is no longer ours. */
795 			gop->mfn = 0;
796 		}
797 
798 		if (gop->mfn != 0)
799 			/*
800 			 * Give back the page, as we won't be using
801 			 * it.
802 			 */
803 			xnb_free_page(xnbp, gop->mfn);
804 		else
805 			/*
806 			 * We gave away a page, update our accounting
807 			 * now.
808 			 */
809 			balloon_drv_subtracted(1);
810 
811 		/* 5.2 */
812 		if (status != NETIF_RSP_OKAY) {
813 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
814 			    status;
815 		} else {
816 			xnbp->xnb_stat_ipackets++;
817 			xnbp->xnb_stat_rbytes += len;
818 		}
819 
820 		loop++;
821 		prod++;
822 		gop++;
823 	}
824 
825 	xnbp->xnb_rx_ring.req_cons = loop;
826 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
827 
828 	/* 6 */
829 	/* LINTED: constant in conditional context */
830 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
831 	if (notify) {
832 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
833 		xnbp->xnb_stat_rx_notify_sent++;
834 	} else {
835 		xnbp->xnb_stat_rx_notify_deferred++;
836 	}
837 
838 	if (mp != NULL)
839 		xnbp->xnb_stat_rx_defer++;
840 
841 	mutex_exit(&xnbp->xnb_rx_lock);
842 
843 	/* Free mblk_t's that we consumed. */
844 	freemsgchain(free);
845 
846 	return (mp);
847 }
848 
849 /* helper functions for xnb_copy_to_peer */
850 
851 /*
852  * Grow the array of copy operation descriptors.
853  * Returns a pointer to the next available entry.
854  */
855 gnttab_copy_t *
856 grow_cpop_area(xnb_t *xnbp, gnttab_copy_t *o_cpop)
857 {
858 	/*
859 	 * o_cpop (arg.1) is a ptr to the area we would like to copy
860 	 * something into but cannot, because we haven't alloc'ed it
861 	 * yet, or NULL.
862 	 * old_cpop and new_cpop (local) are pointers to old/new
863 	 * versions of xnbp->xnb_rx_cpop.
864 	 */
865 	gnttab_copy_t	*new_cpop, *old_cpop, *ret_cpop;
866 	size_t		newcount;
867 
868 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
869 
870 	old_cpop = xnbp->xnb_rx_cpop;
871 	/*
872 	 * o_cpop is a pointer into the array pointed to by old_cpop;
873 	 * it would be an error for exactly one of these pointers to be NULL.
874 	 * We shouldn't call this function if xnb_rx_cpop has already
875 	 * been allocated, but we're starting to fill it from the beginning
876 	 * again.
877 	 */
878 	ASSERT((o_cpop == NULL && old_cpop == NULL) ||
879 	    (o_cpop != NULL && old_cpop != NULL && o_cpop != old_cpop));
880 
881 	newcount = xnbp->xnb_cpop_sz + CPOP_DEFCNT;
882 
883 	new_cpop = kmem_alloc(sizeof (*new_cpop) * newcount, KM_NOSLEEP);
884 	if (new_cpop == NULL) {
885 		xnbp->xnb_stat_other_allocation_failure++;
886 		return (NULL);
887 	}
888 
889 	if (o_cpop != NULL) {
890 		size_t	 offset = (o_cpop - old_cpop);
891 
892 		/* we only need to move the parts in use ... */
893 		(void) memmove(new_cpop, old_cpop, xnbp->xnb_cpop_sz *
894 		    (sizeof (*old_cpop)));
895 
896 		kmem_free(old_cpop, xnbp->xnb_cpop_sz * sizeof (*old_cpop));
897 
898 		ret_cpop = new_cpop + offset;
899 	} else {
900 		ret_cpop = new_cpop;
901 	}
902 
903 	xnbp->xnb_rx_cpop = new_cpop;
904 	xnbp->xnb_cpop_sz = newcount;
905 
906 	xnbp->xnb_stat_rx_cpoparea_grown++;
907 
908 	return (ret_cpop);
909 }
910 
911 /*
912  * Check whether an address is on a page that's foreign to this domain.
913  */
914 static boolean_t
915 is_foreign(void *addr)
916 {
917 	pfn_t	pfn = hat_getpfnum(kas.a_hat, addr);
918 
919 	return (pfn & PFN_IS_FOREIGN_MFN ? B_TRUE : B_FALSE);
920 }
921 
922 /*
923  * Insert a newly allocated mblk into a chain, replacing the old one.
924  */
925 static mblk_t *
926 replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
927 {
928 	uint32_t	start, stuff, end, value, flags;
929 	mblk_t		*new_mp;
930 
931 	new_mp = copyb(mp);
932 	if (new_mp == NULL)
933 		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message"
934 		    "for %p, len %lu", (void *) mp, len);
935 
936 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
937 	(void) hcksum_assoc(new_mp, NULL, NULL, start, stuff, end, value,
938 	    flags, KM_NOSLEEP);
939 
940 	new_mp->b_next = mp->b_next;
941 	new_mp->b_prev = mp->b_prev;
942 	new_mp->b_cont = mp->b_cont;
943 
944 	/* Make sure we only overwrite pointers to the mblk being replaced. */
945 	if (mp_prev != NULL && mp_prev->b_next == mp)
946 		mp_prev->b_next = new_mp;
947 
948 	if (ml_prev != NULL && ml_prev->b_cont == mp)
949 		ml_prev->b_cont = new_mp;
950 
951 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
952 	freemsg(mp);
953 
954 	return (new_mp);
955 }
956 
957 /*
958  * Set all the fields in a gnttab_copy_t.
959  */
960 static void
961 setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
962     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
963 {
964 	ASSERT(xnbp != NULL && gp != NULL);
965 
966 	gp->source.offset = s_off;
967 	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
968 	gp->source.domid = DOMID_SELF;
969 
970 	gp->len = (uint16_t)len;
971 	gp->flags = GNTCOPY_dest_gref;
972 	gp->status = 0;
973 
974 	gp->dest.u.ref = d_ref;
975 	gp->dest.offset = d_off;
976 	gp->dest.domid = xnbp->xnb_peer;
977 }
978 
979 mblk_t *
980 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
981 {
982 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
983 	mblk_t		*ml, *ml_prev;
984 	gnttab_copy_t	*gop_cp;
985 	boolean_t	notify;
986 	RING_IDX	loop, prod;
987 	int		i;
988 
989 	if (!xnbp->xnb_hv_copy)
990 		return (xnb_to_peer(xnbp, mp));
991 
992 	/*
993 	 * For each packet the sequence of operations is:
994 	 *
995 	 *  1. get a request slot from the ring.
996 	 *  2. set up data for hypercall (see NOTE below)
997 	 *  3. have the hypervisore copy the data
	 *  3. have the hypervisor copy the data
999 	 *  5. kick the peer.
1000 	 *
1001 	 * NOTE ad 2.
1002 	 *  In order to reduce the number of hypercalls, we prepare
1003 	 *  several packets (mp->b_cont != NULL) for the peer and
1004 	 *  perform a single hypercall to transfer them.
	 *  We also have to set up a separate copy operation for
1006 	 *  every page.
1007 	 *
1008 	 * If we have more than one message (mp->b_next != NULL),
1009 	 * we do this whole dance repeatedly.
1010 	 */
1011 
1012 	mutex_enter(&xnbp->xnb_rx_lock);
1013 
1014 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
1015 		mutex_exit(&xnbp->xnb_rx_lock);
1016 		DTRACE_PROBE(copy_rx_too_early);
1017 		xnbp->xnb_stat_rx_too_early++;
1018 		return (mp);
1019 	}
1020 
1021 	loop = xnbp->xnb_rx_ring.req_cons;
1022 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
1023 
1024 	while ((mp != NULL) &&
1025 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
1026 		netif_rx_request_t	*rxreq;
1027 		netif_rx_response_t	*rxresp;
1028 		size_t			offset, d_offset;
1029 		size_t			len;
1030 		uint16_t		cksum_flags;
1031 		int16_t			status = NETIF_RSP_OKAY;
1032 		int			item_count;
1033 
1034 		/* 1 */
1035 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
1036 
1037 #ifdef XNB_DEBUG
1038 		if (!(rxreq->id < NET_RX_RING_SIZE))
1039 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
1040 			    "id %d out of range in request 0x%p",
1041 			    rxreq->id, (void *)rxreq);
1042 #endif /* XNB_DEBUG */
1043 
1044 		/* 2 */
1045 		d_offset = offset = RX_BUFFER_HEADROOM;
1046 		len = 0;
1047 		item_count = 0;
1048 
1049 		gop_cp = xnbp->xnb_rx_cpop;
1050 
1051 		/*
1052 		 * We walk the b_cont pointers and set up a gop_cp
1053 		 * structure for every page in every data block we have.
1054 		 */
1055 		/* 2a */
1056 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
1057 			size_t	chunk = ml->b_wptr - ml->b_rptr;
1058 			uchar_t	*r_tmp,	*rpt_align;
1059 			size_t	r_offset;
1060 
1061 			/*
1062 			 * If we get an mblk on a page that doesn't belong to
1063 			 * this domain, get a new mblk to replace the old one.
1064 			 */
1065 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
1066 				mblk_t *ml_new = replace_msg(ml, chunk,
1067 				    mp_prev, ml_prev);
1068 
1069 				/* We can still use old ml, but not *ml! */
1070 				if (free == ml)
1071 					free = ml_new;
1072 				if (mp == ml)
1073 					mp = ml_new;
1074 				ml = ml_new;
1075 
1076 				xnbp->xnb_stat_rx_foreign_page++;
1077 			}
1078 
1079 			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
1080 			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
1081 			r_tmp = ml->b_rptr;
1082 
1083 			if (d_offset + chunk > PAGESIZE)
1084 				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
1085 				    "(svd: %p), ml %p,rpt_alg. %p, d_offset "
1086 				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
1087 				    (void *)mp, (void *)saved_mp, (void *)ml,
1088 				    (void *)rpt_align,
1089 				    d_offset, chunk, (int)PAGESIZE);
1090 
1091 			while (chunk > 0) {
1092 				size_t part_len;
1093 
1094 				item_count++;
1095 				if (item_count > xnbp->xnb_cpop_sz) {
1096 					gop_cp = grow_cpop_area(xnbp, gop_cp);
1097 					if (gop_cp == NULL)
1098 						goto failure;
1099 				}
1100 				/*
1101 				 * If our mblk crosses a page boundary, we need
				 * to do a separate copy for every page.
1103 				 */
1104 				if (r_offset + chunk > PAGESIZE) {
1105 					part_len = PAGESIZE - r_offset;
1106 
1107 					DTRACE_PROBE3(mblk_page_crossed,
1108 					    (mblk_t *), ml, int, chunk, int,
1109 					    (int)r_offset);
1110 
1111 					xnbp->xnb_stat_rx_pagebndry_crossed++;
1112 				} else {
1113 					part_len = chunk;
1114 				}
1115 
1116 				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
1117 				    d_offset, part_len, rxreq->gref);
1118 
1119 				chunk -= part_len;
1120 
1121 				len += part_len;
1122 				d_offset += part_len;
1123 				r_tmp += part_len;
1124 				/*
1125 				 * The 2nd, 3rd ... last copies will always
1126 				 * start at r_tmp, therefore r_offset is 0.
1127 				 */
1128 				r_offset = 0;
1129 				gop_cp++;
1130 			}
1131 			ml_prev = ml;
1132 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
1133 			    chunk, int, len, int, item_count);
1134 		}
1135 		/* 3 */
1136 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_rx_cpop,
1137 		    item_count) != 0) {
1138 			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
1139 			DTRACE_PROBE(HV_granttableopfailed);
1140 		}
1141 
1142 		/* 4 */
1143 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
1144 		rxresp->offset = offset;
1145 
1146 		rxresp->flags = 0;
1147 
1148 		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
1149 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1150 		    (int)rxresp->status);
1151 
1152 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
1153 		if (cksum_flags != 0)
1154 			xnbp->xnb_stat_rx_cksum_deferred++;
1155 		rxresp->flags |= cksum_flags;
1156 
1157 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
1158 		rxresp->status = len;
1159 
1160 		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
1161 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1162 		    (int)rxresp->status);
1163 
1164 		for (i = 0; i < item_count; i++) {
1165 			if (xnbp->xnb_rx_cpop[i].status != 0) {
1166 				DTRACE_PROBE2(cpop__status__nonnull, int,
1167 				    (int)xnbp->xnb_rx_cpop[i].status,
1168 				    int, i);
1169 				status = NETIF_RSP_ERROR;
1170 			}
1171 		}
1172 
1173 		/* 5.2 */
1174 		if (status != NETIF_RSP_OKAY) {
1175 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
1176 			    status;
1177 			xnbp->xnb_stat_rx_rsp_notok++;
1178 		} else {
1179 			xnbp->xnb_stat_ipackets++;
1180 			xnbp->xnb_stat_rbytes += len;
1181 		}
1182 
1183 		loop++;
1184 		prod++;
1185 		mp_prev = mp;
1186 		mp = mp->b_next;
1187 	}
1188 failure:
1189 	/*
1190 	 * Did we actually do anything?
1191 	 */
1192 	if (loop == xnbp->xnb_rx_ring.req_cons) {
1193 		mutex_exit(&xnbp->xnb_rx_lock);
1194 		return (mp);
1195 	}
1196 
1197 	/*
1198 	 * Unlink the end of the 'done' list from the remainder.
1199 	 */
1200 	ASSERT(mp_prev != NULL);
1201 	mp_prev->b_next = NULL;
1202 
1203 	xnbp->xnb_rx_ring.req_cons = loop;
1204 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
1205 
1206 	/* 6 */
1207 	/* LINTED: constant in conditional context */
1208 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
1209 	if (notify) {
1210 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1211 		xnbp->xnb_stat_rx_notify_sent++;
1212 	} else {
1213 		xnbp->xnb_stat_rx_notify_deferred++;
1214 	}
1215 
1216 	if (mp != NULL)
1217 		xnbp->xnb_stat_rx_defer++;
1218 
1219 	mutex_exit(&xnbp->xnb_rx_lock);
1220 
1221 	/* Free mblk_t structs we have consumed. */
1222 	freemsgchain(free);
1223 
1224 	return (mp);
1225 }
1226 
1227 /*ARGSUSED*/
1228 static int
1229 xnb_txbuf_constructor(void *buf, void *arg, int kmflag)
1230 {
1231 	xnb_txbuf_t *txp = buf;
1232 
1233 	bzero(txp, sizeof (*txp));
1234 
1235 	txp->xt_free_rtn.free_func = xnb_tx_complete;
1236 	txp->xt_free_rtn.free_arg = (caddr_t)txp;
1237 
1238 	txp->xt_mop.host_addr =
1239 	    (uint64_t)(uintptr_t)vmem_alloc(heap_arena, PAGESIZE,
1240 	    ((kmflag & KM_NOSLEEP) == KM_NOSLEEP) ?
1241 	    VM_NOSLEEP : VM_SLEEP);
1242 
1243 	if (txp->xt_mop.host_addr == NULL) {
1244 		cmn_err(CE_WARN, "xnb_txbuf_constructor: "
1245 		    "cannot get address space");
1246 		return (-1);
1247 	}
1248 
1249 	/*
1250 	 * Have the hat ensure that page table exists for the VA.
1251 	 */
1252 	hat_prepare_mapping(kas.a_hat,
1253 	    (caddr_t)(uintptr_t)txp->xt_mop.host_addr);
1254 
1255 	return (0);
1256 }
1257 
1258 /*ARGSUSED*/
1259 static void
1260 xnb_txbuf_destructor(void *buf, void *arg)
1261 {
1262 	xnb_txbuf_t *txp = buf;
1263 
1264 	ASSERT(txp->xt_mop.host_addr != NULL);
1265 	ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == 0);
1266 
1267 	hat_release_mapping(kas.a_hat,
1268 	    (caddr_t)(uintptr_t)txp->xt_mop.host_addr);
1269 	vmem_free(heap_arena,
1270 	    (caddr_t)(uintptr_t)txp->xt_mop.host_addr, PAGESIZE);
1271 }
1272 
1273 static void
1274 xnb_tx_notify_peer(xnb_t *xnbp)
1275 {
1276 	boolean_t notify;
1277 
1278 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1279 
1280 	/* LINTED: constant in conditional context */
1281 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
1282 	if (notify) {
1283 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1284 		xnbp->xnb_stat_tx_notify_sent++;
1285 	} else {
1286 		xnbp->xnb_stat_tx_notify_deferred++;
1287 	}
1288 }
1289 
1290 static void
1291 xnb_tx_complete(xnb_txbuf_t *txp)
1292 {
1293 	xnb_t *xnbp = txp->xt_xnbp;
1294 
1295 	ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == XNB_TXBUF_INUSE);
1296 
1297 	mutex_enter(&xnbp->xnb_tx_lock);
1298 	xnb_tx_schedule_unmop(xnbp, &txp->xt_mop, txp);
1299 	mutex_exit(&xnbp->xnb_tx_lock);
1300 }
1301 
1302 static void
1303 xnb_tx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
1304 {
1305 	RING_IDX i;
1306 	netif_tx_response_t *txresp;
1307 
1308 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1309 
1310 	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
1311 
1312 	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
1313 	txresp->id = id;
1314 	txresp->status = status;
1315 
1316 	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
1317 
1318 	/*
1319 	 * Note that we don't push the change to the peer here - that
1320 	 * is the callers responsibility.
1321 	 */
1322 }
1323 
1324 static void
1325 xnb_tx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop,
1326     xnb_txbuf_t *txp)
1327 {
1328 	gnttab_unmap_grant_ref_t	*unmop;
1329 	int				u_count;
1330 	int				reqs_on_ring;
1331 
1332 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1333 	ASSERT(xnbp->xnb_tx_unmop_count < NET_TX_RING_SIZE);
1334 
1335 	u_count = xnbp->xnb_tx_unmop_count++;
1336 
1337 	/* Cache data for the time when we actually unmap grant refs */
1338 	xnbp->xnb_tx_unmop_txp[u_count] = txp;
1339 
1340 	unmop = &xnbp->xnb_tx_unmop[u_count];
1341 	unmop->host_addr = mop->host_addr;
1342 	unmop->dev_bus_addr = mop->dev_bus_addr;
1343 	unmop->handle = mop->handle;
1344 
1345 	/*
1346 	 * We cannot check the ring once we're disconnected from it. Batching
1347 	 * doesn't seem to be a useful optimisation in this case either,
1348 	 * so we directly call into the actual unmap function.
1349 	 */
1350 	if (xnbp->xnb_connected) {
1351 		reqs_on_ring = RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_tx_ring);
1352 
1353 		/*
1354 		 * By tuning xnb_unmop_hiwat to N, we can emulate "N per batch"
1355 		 * or (with N == 1) "immediate unmop" behaviour.
1356 		 * The "> xnb_unmop_lowwat" is a guard against ring exhaustion.
1357 		 */
1358 		if (xnbp->xnb_tx_unmop_count < xnb_unmop_hiwat &&
1359 		    reqs_on_ring > xnb_unmop_lowwat)
1360 			return;
1361 	}
1362 
1363 	xnb_tx_perform_pending_unmop(xnbp);
1364 }
1365 
1366 /*
1367  * Here we perform the actual unmapping of the data that was
1368  * accumulated in xnb_tx_schedule_unmop().
1369  * Note that it is the caller's responsibility to make sure that
1370  * there's actually something there to unmop.
1371  */
1372 static void
1373 xnb_tx_perform_pending_unmop(xnb_t *xnbp)
1374 {
1375 	RING_IDX loop;
1376 #ifdef XNB_DEBUG
1377 	gnttab_unmap_grant_ref_t *unmop;
1378 #endif /* XNB_DEBUG */
1379 
1380 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1381 	ASSERT(xnbp->xnb_tx_unmop_count > 0);
1382 
1383 	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1384 	    xnbp->xnb_tx_unmop, xnbp->xnb_tx_unmop_count) < 0) {
1385 		cmn_err(CE_WARN, "xnb_tx_perform_pending_unmop: "
1386 		    "unmap grant operation failed, "
1387 		    "%d pages lost", xnbp->xnb_tx_unmop_count);
1388 	}
1389 
1390 #ifdef XNB_DEBUG
1391 	for (loop = 0, unmop = xnbp->xnb_tx_unmop;
1392 	    loop < xnbp->xnb_tx_unmop_count;
1393 	    loop++, unmop++) {
1394 		if (unmop->status != 0) {
1395 			cmn_err(CE_WARN, "xnb_tx_perform_pending_unmop: "
1396 			    "unmap grant reference failed (%d)",
1397 			    unmop->status);
1398 		}
1399 	}
1400 #endif /* XNB_DEBUG */
1401 
1402 	for (loop = 0; loop < xnbp->xnb_tx_unmop_count; loop++) {
1403 		xnb_txbuf_t	*txp = xnbp->xnb_tx_unmop_txp[loop];
1404 
1405 		if (txp == NULL)
1406 			cmn_err(CE_PANIC,
1407 			    "xnb_tx_perform_pending_unmop: "
1408 			    "unexpected NULL txp (loop %d; count %d)!",
1409 			    loop, xnbp->xnb_tx_unmop_count);
1410 
1411 		if (xnbp->xnb_connected)
1412 			xnb_tx_mark_complete(xnbp, txp->xt_id, txp->xt_status);
1413 		xnb_txbuf_put(xnbp, txp);
1414 	}
1415 	if (xnbp->xnb_connected)
1416 		xnb_tx_notify_peer(xnbp);
1417 
1418 	xnbp->xnb_tx_unmop_count = 0;
1419 
1420 #ifdef XNB_DEBUG
1421 	bzero(xnbp->xnb_tx_unmop, sizeof (xnbp->xnb_tx_unmop));
1422 	bzero(xnbp->xnb_tx_unmop_txp, sizeof (xnbp->xnb_tx_unmop_txp));
1423 #endif /* XNB_DEBUG */
1424 }
1425 
1426 static xnb_txbuf_t *
1427 xnb_txbuf_get(xnb_t *xnbp, int flags)
1428 {
1429 	xnb_txbuf_t *txp;
1430 
1431 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1432 
1433 	txp = kmem_cache_alloc(xnb_txbuf_cachep, flags);
1434 	if (txp != NULL) {
1435 		ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == 0);
1436 		txp->xt_flags |= XNB_TXBUF_INUSE;
1437 
1438 		txp->xt_xnbp = xnbp;
1439 		txp->xt_mop.dom = xnbp->xnb_peer;
1440 
1441 		txp->xt_mop.flags = GNTMAP_host_map;
1442 		if (!xnbp->xnb_tx_pages_writable)
1443 			txp->xt_mop.flags |= GNTMAP_readonly;
1444 
1445 		xnbp->xnb_tx_buf_count++;
1446 	}
1447 
1448 	return (txp);
1449 }
1450 
1451 static void
1452 xnb_txbuf_put(xnb_t *xnbp, xnb_txbuf_t *txp)
1453 {
1454 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1455 	ASSERT((txp->xt_flags & XNB_TXBUF_INUSE) == XNB_TXBUF_INUSE);
1456 
1457 	txp->xt_flags &= ~XNB_TXBUF_INUSE;
1458 	xnbp->xnb_tx_buf_count--;
1459 
1460 	kmem_cache_free(xnb_txbuf_cachep, txp);
1461 }
1462 
1463 static mblk_t *
1464 xnb_from_peer(xnb_t *xnbp)
1465 {
1466 	RING_IDX start, end, loop;
1467 	gnttab_map_grant_ref_t *mop;
1468 	xnb_txbuf_t **txpp;
1469 	netif_tx_request_t *txreq;
1470 	boolean_t work_to_do;
1471 	mblk_t *head, *tail;
1472 	/*
1473 	 * If the peer granted a read-only mapping to the page then we
1474 	 * must copy the data, as the local protocol stack (should the
1475 	 * packet be destined for this host) will modify the packet
1476 	 * 'in place'.
1477 	 */
1478 	boolean_t copy = xnbp->xnb_tx_always_copy ||
1479 	    !xnbp->xnb_tx_pages_writable;
1480 
1481 	/*
1482 	 * For each individual request, the sequence of actions is:
1483 	 *
1484 	 * 1. get the request.
1485 	 * 2. map the page based on the grant ref.
1486 	 * 3. allocate an mblk, copy the data to it.
1487 	 * 4. release the grant.
1488 	 * 5. update the ring.
1489 	 * 6. pass the packet upward.
1490 	 * 7. kick the peer.
1491 	 *
1492 	 * In fact, we try to perform the grant operations in batches,
1493 	 * so there are two loops.
1494 	 */
1495 
1496 	head = tail = NULL;
1497 around:
1498 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1499 
1500 	/* LINTED: constant in conditional context */
1501 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
1502 	if (!work_to_do) {
1503 finished:
1504 		return (head);
1505 	}
1506 
1507 	start = xnbp->xnb_tx_ring.req_cons;
1508 	end = xnbp->xnb_tx_ring.sring->req_prod;
1509 
1510 	if ((end - start) > NET_TX_RING_SIZE) {
1511 		/*
1512 		 * This usually indicates that the frontend driver is
1513 		 * misbehaving, as it's not possible to have more than
1514 		 * NET_TX_RING_SIZE ring elements in play at any one
1515 		 * time.
1516 		 *
1517 		 * We reset the ring pointers to the state declared by
1518 		 * the frontend and try to carry on.
1519 		 */
1520 		cmn_err(CE_WARN, "xnb_from_peer: domain %d tried to give us %u "
1521 		    "items in the ring, resetting and trying to recover.",
1522 		    xnbp->xnb_peer, (end - start));
1523 
1524 		/* LINTED: constant in conditional context */
1525 		BACK_RING_ATTACH(&xnbp->xnb_tx_ring,
1526 		    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1527 
1528 		goto around;
1529 	}
1530 
1531 	for (loop = start, mop = xnbp->xnb_tx_mop, txpp = xnbp->xnb_tx_bufp;
1532 	    loop != end;
1533 	    loop++, mop++, txpp++) {
1534 		xnb_txbuf_t *txp;
1535 
1536 		txp = xnb_txbuf_get(xnbp, KM_NOSLEEP);
1537 		if (txp == NULL)
1538 			break;
1539 
1540 		ASSERT(xnbp->xnb_tx_pages_writable ||
1541 		    ((txp->xt_mop.flags & GNTMAP_readonly)
1542 		    == GNTMAP_readonly));
1543 
1544 		txp->xt_mop.ref =
1545 		    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop)->gref;
1546 
1547 		*mop = txp->xt_mop;
1548 		*txpp = txp;
1549 	}
1550 
1551 	if ((loop - start) == 0)
1552 		goto finished;
1553 
1554 	end = loop;
1555 
1556 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1557 	    xnbp->xnb_tx_mop, end - start) != 0) {
1558 
1559 		cmn_err(CE_WARN, "xnb_from_peer: map grant operation failed");
1560 
1561 		loop = start;
1562 		txpp = xnbp->xnb_tx_bufp;
1563 
1564 		while (loop != end) {
1565 			xnb_txbuf_put(xnbp, *txpp);
1566 
1567 			loop++;
1568 			txpp++;
1569 		}
1570 
1571 		goto finished;
1572 	}
1573 
1574 	for (loop = start, mop = xnbp->xnb_tx_mop, txpp = xnbp->xnb_tx_bufp;
1575 	    loop != end;
1576 	    loop++, mop++, txpp++) {
1577 		mblk_t *mp = NULL;
1578 		int16_t status = NETIF_RSP_OKAY;
1579 		xnb_txbuf_t *txp = *txpp;
1580 
1581 		if (mop->status != 0) {
1582 			cmn_err(CE_WARN, "xnb_from_peer: "
1583 			    "failed to map buffer: %d",
1584 			    mop->status);
1585 			status = NETIF_RSP_ERROR;
1586 		}
1587 
1588 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1589 
1590 		if (status == NETIF_RSP_OKAY) {
1591 			if (copy) {
1592 				mp = allocb(txreq->size, BPRI_MED);
1593 				if (mp == NULL) {
1594 					status = NETIF_RSP_ERROR;
1595 					xnbp->xnb_stat_tx_allocb_failed++;
1596 				} else {
1597 					bcopy((caddr_t)(uintptr_t)
1598 					    mop->host_addr + txreq->offset,
1599 					    mp->b_wptr, txreq->size);
1600 					mp->b_wptr += txreq->size;
1601 				}
1602 			} else {
1603 				mp = desballoc((uchar_t *)(uintptr_t)
1604 				    mop->host_addr + txreq->offset,
1605 				    txreq->size, 0, &txp->xt_free_rtn);
1606 				if (mp == NULL) {
1607 					status = NETIF_RSP_ERROR;
1608 					xnbp->xnb_stat_tx_allocb_failed++;
1609 				} else {
1610 					txp->xt_id = txreq->id;
1611 					txp->xt_status = status;
1612 					txp->xt_mop = *mop;
1613 
1614 					mp->b_wptr += txreq->size;
1615 				}
1616 			}
1617 
1618 			/*
1619 			 * If we have a buffer and there are checksum
1620 			 * flags, process them appropriately.
1621 			 */
1622 			if ((mp != NULL) &&
1623 			    ((txreq->flags &
1624 			    (NETTXF_csum_blank | NETTXF_data_validated))
1625 			    != 0)) {
1626 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
1627 				    mp, txreq->flags);
1628 				xnbp->xnb_stat_tx_cksum_no_need++;
1629 			}
1630 		}
1631 
1632 		if (copy || (mp == NULL)) {
1633 			txp->xt_status = status;
1634 			txp->xt_id = txreq->id;
1635 			xnb_tx_schedule_unmop(xnbp, mop, txp);
1636 		}
1637 
1638 		if (mp != NULL) {
1639 			xnbp->xnb_stat_opackets++;
1640 			xnbp->xnb_stat_obytes += txreq->size;
1641 
1642 			mp->b_next = NULL;
1643 			if (head == NULL) {
1644 				ASSERT(tail == NULL);
1645 				head = mp;
1646 			} else {
1647 				ASSERT(tail != NULL);
1648 				tail->b_next = mp;
1649 			}
1650 			tail = mp;
1651 		}
1652 	}
1653 
1654 	xnbp->xnb_tx_ring.req_cons = loop;
1655 
1656 	goto around;
1657 	/* NOTREACHED */
1658 }
1659 
1660 /*
1661  *  intr() -- ring interrupt service routine
1662  */
1663 static uint_t
1664 xnb_intr(caddr_t arg)
1665 {
1666 	xnb_t *xnbp = (xnb_t *)arg;
1667 	mblk_t *mp;
1668 
1669 	xnbp->xnb_stat_intr++;
1670 
1671 	mutex_enter(&xnbp->xnb_tx_lock);
1672 
1673 	ASSERT(xnbp->xnb_connected);
1674 
1675 	mp = xnb_from_peer(xnbp);
1676 
1677 	mutex_exit(&xnbp->xnb_tx_lock);
1678 
1679 	if (!xnbp->xnb_hotplugged) {
1680 		xnbp->xnb_stat_tx_too_early++;
1681 		goto fail;
1682 	}
1683 	if (mp == NULL) {
1684 		xnbp->xnb_stat_spurious_intr++;
1685 		goto fail;
1686 	}
1687 
1688 	xnbp->xnb_flavour->xf_from_peer(xnbp, mp);
1689 
1690 	return (DDI_INTR_CLAIMED);
1691 
1692 fail:
1693 	freemsgchain(mp);
1694 	return (DDI_INTR_CLAIMED);
1695 }
1696 
1697 static boolean_t
1698 xnb_connect_rings(dev_info_t *dip)
1699 {
1700 	xnb_t *xnbp = ddi_get_driver_private(dip);
1701 	char *oename;
1702 	struct gnttab_map_grant_ref map_op;
1703 	evtchn_port_t evtchn;
1704 	int i;
1705 
1706 	/*
1707 	 * Cannot attempt to connect the rings if already connected.
1708 	 */
1709 	ASSERT(!xnbp->xnb_connected);
1710 
1711 	oename = xvdi_get_oename(dip);
1712 
1713 	if (xenbus_gather(XBT_NULL, oename,
1714 	    "event-channel", "%u", &evtchn,
1715 	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
1716 	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
1717 	    NULL) != 0) {
1718 		cmn_err(CE_WARN, "xnb_connect_rings: "
1719 		    "cannot read other-end details from %s",
1720 		    oename);
1721 		goto fail;
1722 	}
1723 
1724 	if (xenbus_scanf(XBT_NULL, oename,
1725 	    "feature-tx-writable", "%d", &i) != 0)
1726 		i = 0;
1727 	if (i != 0)
1728 		xnbp->xnb_tx_pages_writable = B_TRUE;
1729 
1730 	if (xenbus_scanf(XBT_NULL, oename,
1731 	    "feature-no-csum-offload", "%d", &i) != 0)
1732 		i = 0;
1733 	if ((i == 1) || !xnbp->xnb_cksum_offload)
1734 		xnbp->xnb_cksum_offload = B_FALSE;
1735 
1736 	/* Check whether our peer knows and requests hypervisor copy */
1737 	if (xenbus_scanf(XBT_NULL, oename, "request-rx-copy", "%d", &i)
1738 	    != 0)
1739 		i = 0;
1740 	if (i != 0)
1741 		xnbp->xnb_hv_copy = B_TRUE;
1742 
1743 	/*
1744 	 * 1. allocate a vaddr for the tx page, one for the rx page.
1745 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1746 	 *    into the allocated vaddr (one for tx, one for rx).
1747 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1748 	 *    bound to this domain.
1749 	 * 4. associate the event channel with an interrupt.
1750 	 * 5. declare ourselves connected.
1751 	 * 6. enable the interrupt.
1752 	 */
1753 
1754 	/* 1.tx */
1755 	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1756 	    0, 0, 0, 0, VM_SLEEP);
1757 	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
1758 
1759 	/* 2.tx */
1760 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
1761 	map_op.flags = GNTMAP_host_map;
1762 	map_op.ref = xnbp->xnb_tx_ring_ref;
1763 	map_op.dom = xnbp->xnb_peer;
1764 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1765 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1766 	    &map_op, 1) != 0 || map_op.status != 0) {
1767 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1768 		goto fail;
1769 	}
1770 	xnbp->xnb_tx_ring_handle = map_op.handle;
1771 
1772 	/* LINTED: constant in conditional context */
1773 	BACK_RING_INIT(&xnbp->xnb_tx_ring,
1774 	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1775 
1776 	/* 1.rx */
1777 	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1778 	    0, 0, 0, 0, VM_SLEEP);
1779 	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
1780 
1781 	/* 2.rx */
1782 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
1783 	map_op.flags = GNTMAP_host_map;
1784 	map_op.ref = xnbp->xnb_rx_ring_ref;
1785 	map_op.dom = xnbp->xnb_peer;
1786 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1787 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1788 	    &map_op, 1) != 0 || map_op.status != 0) {
1789 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1790 		goto fail;
1791 	}
1792 	xnbp->xnb_rx_ring_handle = map_op.handle;
1793 
1794 	/* LINTED: constant in conditional context */
1795 	BACK_RING_INIT(&xnbp->xnb_rx_ring,
1796 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
1797 
1798 	/* 3 */
1799 	if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS) {
1800 		cmn_err(CE_WARN, "xnb_connect_rings: "
1801 		    "cannot bind event channel %d", xnbp->xnb_evtchn);
1802 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1803 		goto fail;
1804 	}
1805 	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
1806 
1807 	/*
1808 	 * It would be good to set the state to XenbusStateConnected
1809 	 * here as well, but then what if ddi_add_intr() failed?
1810 	 * Changing the state in the store will be noticed by the peer
1811 	 * and cannot be "taken back".
1812 	 */
1813 	mutex_enter(&xnbp->xnb_tx_lock);
1814 	mutex_enter(&xnbp->xnb_rx_lock);
1815 
1816 	/* 5.1 */
1817 	xnbp->xnb_connected = B_TRUE;
1818 
1819 	mutex_exit(&xnbp->xnb_rx_lock);
1820 	mutex_exit(&xnbp->xnb_tx_lock);
1821 
1822 	/* 4, 6 */
1823 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1824 	    != DDI_SUCCESS) {
1825 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1826 		goto fail;
1827 	}
1828 	xnbp->xnb_irq = B_TRUE;
1829 
1830 	/* 5.2 */
1831 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1832 
1833 	return (B_TRUE);
1834 
1835 fail:
1836 	mutex_enter(&xnbp->xnb_tx_lock);
1837 	mutex_enter(&xnbp->xnb_rx_lock);
1838 
1839 	xnbp->xnb_connected = B_FALSE;
1840 	mutex_exit(&xnbp->xnb_rx_lock);
1841 	mutex_exit(&xnbp->xnb_tx_lock);
1842 
1843 	return (B_FALSE);
1844 }
1845 
1846 static void
1847 xnb_disconnect_rings(dev_info_t *dip)
1848 {
1849 	xnb_t *xnbp = ddi_get_driver_private(dip);
1850 
1851 	if (xnbp->xnb_irq) {
1852 		ddi_remove_intr(dip, 0, NULL);
1853 		xnbp->xnb_irq = B_FALSE;
1854 	}
1855 
1856 	if (xnbp->xnb_tx_unmop_count > 0)
1857 		xnb_tx_perform_pending_unmop(xnbp);
1858 
1859 	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
1860 		xvdi_free_evtchn(dip);
1861 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1862 	}
1863 
1864 	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
1865 		struct gnttab_unmap_grant_ref unmap_op;
1866 
1867 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1868 		    xnbp->xnb_rx_ring_addr;
1869 		unmap_op.dev_bus_addr = 0;
1870 		unmap_op.handle = xnbp->xnb_rx_ring_handle;
1871 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1872 		    &unmap_op, 1) != 0)
1873 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1874 			    "cannot unmap rx-ring page (%d)",
1875 			    unmap_op.status);
1876 
1877 		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
1878 	}
1879 
1880 	if (xnbp->xnb_rx_ring_addr != NULL) {
1881 		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1882 		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
1883 		xnbp->xnb_rx_ring_addr = NULL;
1884 	}
1885 
1886 	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
1887 		struct gnttab_unmap_grant_ref unmap_op;
1888 
1889 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1890 		    xnbp->xnb_tx_ring_addr;
1891 		unmap_op.dev_bus_addr = 0;
1892 		unmap_op.handle = xnbp->xnb_tx_ring_handle;
1893 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1894 		    &unmap_op, 1) != 0)
1895 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1896 			    "cannot unmap tx-ring page (%d)",
1897 			    unmap_op.status);
1898 
1899 		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
1900 	}
1901 
1902 	if (xnbp->xnb_tx_ring_addr != NULL) {
1903 		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1904 		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
1905 		xnbp->xnb_tx_ring_addr = NULL;
1906 	}
1907 }
1908 
1909 /*ARGSUSED*/
1910 static void
1911 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1912     void *arg, void *impl_data)
1913 {
1914 	xnb_t *xnbp = ddi_get_driver_private(dip);
1915 	XenbusState new_state = *(XenbusState *)impl_data;
1916 
1917 	ASSERT(xnbp != NULL);
1918 
1919 	switch (new_state) {
1920 	case XenbusStateConnected:
1921 		/* spurious state change */
1922 		if (xnbp->xnb_connected)
1923 			return;
1924 
1925 		if (xnb_connect_rings(dip)) {
1926 			xnbp->xnb_flavour->xf_peer_connected(xnbp);
1927 		} else {
1928 			xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1929 			xnb_disconnect_rings(dip);
1930 			(void) xvdi_switch_state(dip, XBT_NULL,
1931 			    XenbusStateClosed);
1932 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1933 		}
1934 
1935 		/*
1936 		 * Now that we've attempted to connect it's reasonable
1937 		 * to allow an attempt to detach.
1938 		 */
1939 		xnbp->xnb_detachable = B_TRUE;
1940 
1941 		break;
1942 
1943 	case XenbusStateClosing:
1944 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1945 
1946 		break;
1947 
1948 	case XenbusStateClosed:
1949 		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1950 
1951 		mutex_enter(&xnbp->xnb_tx_lock);
1952 		mutex_enter(&xnbp->xnb_rx_lock);
1953 
1954 		xnb_disconnect_rings(dip);
1955 		xnbp->xnb_connected = B_FALSE;
1956 
1957 		mutex_exit(&xnbp->xnb_rx_lock);
1958 		mutex_exit(&xnbp->xnb_tx_lock);
1959 
1960 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1961 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1962 		/*
1963 		 * In all likelyhood this is already set (in the above
1964 		 * case), but if the peer never attempted to connect
1965 		 * and the domain is destroyed we get here without
1966 		 * having been through the case above, so we set it to
1967 		 * be sure.
1968 		 */
1969 		xnbp->xnb_detachable = B_TRUE;
1970 
1971 		break;
1972 
1973 	default:
1974 		break;
1975 	}
1976 }
1977 
1978 /*ARGSUSED*/
1979 static void
1980 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1981     void *arg, void *impl_data)
1982 {
1983 	xnb_t *xnbp = ddi_get_driver_private(dip);
1984 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
1985 	boolean_t success;
1986 
1987 	ASSERT(xnbp != NULL);
1988 
1989 	switch (state) {
1990 	case Connected:
1991 
1992 		/* spurious hotplug event */
1993 		if (xnbp->xnb_hotplugged)
1994 			return;
1995 
1996 		success = xnbp->xnb_flavour->xf_hotplug_connected(xnbp);
1997 
1998 		mutex_enter(&xnbp->xnb_tx_lock);
1999 		mutex_enter(&xnbp->xnb_rx_lock);
2000 
2001 		xnbp->xnb_hotplugged = success;
2002 
2003 		mutex_exit(&xnbp->xnb_rx_lock);
2004 		mutex_exit(&xnbp->xnb_tx_lock);
2005 		break;
2006 
2007 	default:
2008 		break;
2009 	}
2010 }
2011 
/*
 * Module description: registered through mod_miscops under the name "xnb".
 */
static struct modldrv modldrv = {
	&mod_miscops, "xnb",
};

/*
 * Linkage structure passed to mod_install(), mod_info() and mod_remove()
 * by _init(), _info() and _fini().
 */
static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};
2019 
2020 int
2021 _init(void)
2022 {
2023 	int i;
2024 
2025 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
2026 
2027 	xnb_txbuf_cachep = kmem_cache_create("xnb_txbuf_cachep",
2028 	    sizeof (xnb_txbuf_t), 0, xnb_txbuf_constructor,
2029 	    xnb_txbuf_destructor, NULL, NULL, NULL, 0);
2030 	ASSERT(xnb_txbuf_cachep != NULL);
2031 
2032 	i = mod_install(&modlinkage);
2033 	if (i != DDI_SUCCESS) {
2034 		kmem_cache_destroy(xnb_txbuf_cachep);
2035 		mutex_destroy(&xnb_alloc_page_lock);
2036 	}
2037 	return (i);
2038 }
2039 
2040 int
2041 _info(struct modinfo *modinfop)
2042 {
2043 	return (mod_info(&modlinkage, modinfop));
2044 }
2045 
2046 int
2047 _fini(void)
2048 {
2049 	int i;
2050 
2051 	i = mod_remove(&modlinkage);
2052 	if (i == DDI_SUCCESS) {
2053 		kmem_cache_destroy(xnb_txbuf_cachep);
2054 		mutex_destroy(&xnb_alloc_page_lock);
2055 	}
2056 	return (i);
2057 }
2058