xref: /illumos-gate/usr/src/uts/common/xen/io/xnb.c (revision 74e7dc986c89efca1f2e4451c7a572e05e4a6e4f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifdef DEBUG
28 #define	XNB_DEBUG 1
29 #endif /* DEBUG */
30 
31 #include "xnb.h"
32 
33 #include <sys/sunddi.h>
34 #include <sys/sunndi.h>
35 #include <sys/modctl.h>
36 #include <sys/conf.h>
37 #include <sys/mac.h>
38 #include <sys/dlpi.h>
39 #include <sys/strsubr.h>
40 #include <sys/strsun.h>
41 #include <sys/types.h>
42 #include <sys/pattr.h>
43 #include <vm/seg_kmem.h>
44 #include <vm/hat_i86.h>
45 #include <xen/sys/xenbus_impl.h>
46 #include <xen/sys/xendev.h>
47 #include <sys/balloon_impl.h>
48 #include <sys/evtchn_impl.h>
49 #include <sys/gnttab.h>
50 #include <vm/vm_dep.h>
51 
52 #include <sys/gld.h>
53 #include <inet/ip.h>
54 #include <inet/ip_impl.h>
55 #include <sys/vnic_impl.h> /* blech. */
56 
57 /*
58  * The terms "transmit" and "receive" are used in their traditional
59  * sense here - packets from other parts of this system are
60  * "transmitted" to the peer domain and those originating from the
61  * peer are "received".
62  *
63  * In some cases this can be confusing, because various data
64  * structures are shared with the domU driver, which has the opposite
65  * view of what constitutes "transmit" and "receive".  In naming the
66  * shared structures the domU driver always wins.
67  */
68 
69 /*
70  * XXPV dme: things to do, as well as various things indicated
71  * throughout the source:
72  * - copy avoidance outbound.
73  * - copy avoidance inbound.
74  * - transfer credit limiting.
75  * - MAC address based filtering.
76  */
77 
78 /*
79  * Linux expects to have some headroom in received buffers.  The Linux
80  * frontend driver (netfront) checks to see if the headroom is
81  * available and will re-allocate the buffer to make room if
82  * necessary.  To avoid this we add TX_BUFFER_HEADROOM bytes of
83  * headroom to each packet we pass to the peer.
84  */
85 #define	TX_BUFFER_HEADROOM	16
86 
87 static boolean_t	xnb_cksum_offload = B_TRUE;
88 
89 static boolean_t	xnb_connect_rings(dev_info_t *);
90 static void		xnb_disconnect_rings(dev_info_t *);
91 static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
92     void *, void *);
93 static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
94     void *, void *);
95 
96 static int	xnb_rxbuf_constructor(void *, void *, int);
97 static void	xnb_rxbuf_destructor(void *, void *);
98 static xnb_rxbuf_t *xnb_rxbuf_get(xnb_t *, int);
99 static void	xnb_rxbuf_put(xnb_t *, xnb_rxbuf_t *);
100 static void	xnb_rx_notify_peer(xnb_t *);
101 static void	xnb_rx_complete(xnb_rxbuf_t *);
102 static void	xnb_rx_mark_complete(xnb_t *, RING_IDX, int16_t);
103 static void 	xnb_rx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *,
104     xnb_rxbuf_t *);
105 static void	xnb_rx_perform_pending_unmop(xnb_t *);
106 mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);
107 
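/*
 * Watermarks controlling when grant-unmap operations queued by
 * xnb_rx_schedule_unmop() are flushed by xnb_rx_perform_pending_unmop().
 */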
108 int		xnb_unmop_lowwat = NET_TX_RING_SIZE >> 2;
109 int		xnb_unmop_hiwat = NET_TX_RING_SIZE - (NET_TX_RING_SIZE >> 2);
110 
111 
112 boolean_t	xnb_hv_copy = B_TRUE;
113 boolean_t	xnb_explicit_pageflip_set = B_FALSE;
114 
115 /* XXPV dme: are these really invalid? */
116 #define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
117 #define	INVALID_GRANT_REF	((grant_ref_t)-1)
118 
119 static kmem_cache_t *xnb_rxbuf_cachep;
120 static kmutex_t	xnb_alloc_page_lock;
121 
122 /*
123  * Statistics.
124  */
125 static char *aux_statistics[] = {
126 	"tx_cksum_deferred",
127 	"rx_cksum_no_need",
128 	"tx_rsp_notok",
129 	"tx_notify_deferred",
130 	"tx_notify_sent",
131 	"rx_notify_deferred",
132 	"rx_notify_sent",
133 	"tx_too_early",
134 	"rx_too_early",
135 	"rx_allocb_failed",
136 	"tx_allocb_failed",
137 	"tx_foreign_page",
138 	"mac_full",
139 	"spurious_intr",
140 	"allocation_success",
141 	"allocation_failure",
142 	"small_allocation_success",
143 	"small_allocation_failure",
144 	"other_allocation_failure",
145 	"tx_pageboundary_crossed",
146 	"tx_cpoparea_grown",
147 	"csum_hardware",
148 	"csum_software",
149 };
150 
151 static int
152 xnb_ks_aux_update(kstat_t *ksp, int flag)
153 {
154 	xnb_t *xnbp;
155 	kstat_named_t *knp;
156 
157 	if (flag != KSTAT_READ)
158 		return (EACCES);
159 
160 	xnbp = ksp->ks_private;
161 	knp = ksp->ks_data;
162 
163 	/*
164 	 * Assignment order should match that of the names in
165 	 * aux_statistics.
166 	 */
167 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_deferred;
168 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_no_need;
169 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_rsp_notok;
170 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
171 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
172 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
173 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
174 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
175 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
176 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
177 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
178 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_foreign_page;
179 	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
180 	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
181 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
182 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
183 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
184 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
185 	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
186 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_pagebndry_crossed;
187 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cpoparea_grown;
188 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
189 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
190 
191 	return (0);
192 }
193 
194 static boolean_t
195 xnb_ks_init(xnb_t *xnbp)
196 {
197 	int nstat = sizeof (aux_statistics) /
198 	    sizeof (aux_statistics[0]);
199 	char **cp = aux_statistics;
200 	kstat_named_t *knp;
201 
202 	/*
203 	 * Create and initialise kstats.
204 	 */
205 	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
206 	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
207 	    KSTAT_TYPE_NAMED, nstat, 0);
208 	if (xnbp->xnb_kstat_aux == NULL)
209 		return (B_FALSE);
210 
211 	xnbp->xnb_kstat_aux->ks_private = xnbp;
212 	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
213 
214 	knp = xnbp->xnb_kstat_aux->ks_data;
215 	while (nstat > 0) {
216 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
217 
218 		knp++;
219 		cp++;
220 		nstat--;
221 	}
222 
223 	kstat_install(xnbp->xnb_kstat_aux);
224 
225 	return (B_TRUE);
226 }
227 
228 static void
229 xnb_ks_free(xnb_t *xnbp)
230 {
231 	kstat_delete(xnbp->xnb_kstat_aux);
232 }
233 
234 /*
235  * Software checksum calculation and insertion for an arbitrary packet.
236  */
237 /*ARGSUSED*/
238 static mblk_t *
239 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
240 {
241 	/*
242 	 * XXPV dme: shouldn't rely on vnic_fix_cksum(), not least
243 	 * because it doesn't cover all of the interesting cases :-(
244 	 */
245 	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
246 	    HCK_FULLCKSUM, KM_NOSLEEP);
247 
248 	return (vnic_fix_cksum(mp));
249 }
250 
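/*
 * Decide how the checksum of a packet bound for the MAC layer should be
 * handled: use full or partial hardware offload where the capability
 * flags in `capab' allow it, otherwise fall back to xnb_software_csum().
 */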
251 mblk_t *
252 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
253 {
254 	struct ether_header *ehp;
255 	uint16_t sap;
256 	uint32_t offset;
257 	ipha_t *ipha;
258 
259 	ASSERT(mp->b_next == NULL);
260 
261 	/*
262 	 * Check that the packet is contained in a single mblk.  In
263 	 * the "from peer" path this is true today, but will change
264 	 * when scatter gather support is added.  In the "to peer"
265 	 * path we cannot be sure, but in most cases it will be true
266 	 * (in the xnbo case the packet has come from a MAC device
267 	 * which is unlikely to split packets).
268 	 */
269 	if (mp->b_cont != NULL)
270 		goto software;
271 
272 	/*
273 	 * If the MAC has no hardware capability don't do any further
274 	 * checking.
275 	 */
276 	if (capab == 0)
277 		goto software;
278 
279 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
280 	ehp = (struct ether_header *)mp->b_rptr;
281 
282 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
283 		struct ether_vlan_header *evhp;
284 
285 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
286 		evhp = (struct ether_vlan_header *)mp->b_rptr;
287 		sap = ntohs(evhp->ether_type);
288 		offset = sizeof (struct ether_vlan_header);
289 	} else {
290 		sap = ntohs(ehp->ether_type);
291 		offset = sizeof (struct ether_header);
292 	}
293 
294 	/*
295 	 * We only attempt to do IPv4 packets in hardware.
296 	 */
297 	if (sap != ETHERTYPE_IP)
298 		goto software;
299 
300 	/*
301 	 * We know that this is an IPv4 packet.
302 	 */
303 	ipha = (ipha_t *)(mp->b_rptr + offset);
304 
305 	switch (ipha->ipha_protocol) {
306 	case IPPROTO_TCP:
307 	case IPPROTO_UDP: {
308 		uint32_t start, length, stuff, cksum;
309 		uint16_t *stuffp;
310 
311 		/*
312 		 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
313 		 * can use full IPv4 and partial checksum offload.
314 		 */
315 		if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
316 			break;
317 
318 		start = IP_SIMPLE_HDR_LENGTH;
319 		length = ntohs(ipha->ipha_length);
320 		if (ipha->ipha_protocol == IPPROTO_TCP) {
321 			stuff = start + TCP_CHECKSUM_OFFSET;
322 			cksum = IP_TCP_CSUM_COMP;
323 		} else {
324 			stuff = start + UDP_CHECKSUM_OFFSET;
325 			cksum = IP_UDP_CSUM_COMP;
326 		}
327 		stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);
328 
329 		if (capab & HCKSUM_INET_FULL_V4) {
330 			/*
331 			 * Some devices require that the checksum
332 			 * field of the packet is zero for full
333 			 * offload.
334 			 */
335 			*stuffp = 0;
336 
337 			(void) hcksum_assoc(mp, NULL, NULL,
338 			    0, 0, 0, 0,
339 			    HCK_FULLCKSUM, KM_NOSLEEP);
340 
341 			xnbp->xnb_stat_csum_hardware++;
342 
343 			return (mp);
344 		}
345 
346 		if (capab & HCKSUM_INET_PARTIAL) {
347 			if (*stuffp == 0) {
348 				ipaddr_t src, dst;
349 
350 				/*
351 				 * Older Solaris guests don't insert
352 				 * the pseudo-header checksum, so we
353 				 * calculate it here.
354 				 */
355 				src = ipha->ipha_src;
356 				dst = ipha->ipha_dst;
357 
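				/*
				 * Complete the pseudo-header sum started
				 * above (IP_TCP_CSUM_COMP or
				 * IP_UDP_CSUM_COMP) by adding the source
				 * and destination addresses and the
				 * TCP/UDP length (IP length less the
				 * simple IP header), then fold any
				 * carries back into 16 bits.
				 */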
358 				cksum += (dst >> 16) + (dst & 0xFFFF);
359 				cksum += (src >> 16) + (src & 0xFFFF);
360 				cksum += length - IP_SIMPLE_HDR_LENGTH;
361 
362 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
363 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
364 
365 				ASSERT(cksum <= 0xFFFF);
366 
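				/*
				 * If the folded sum is zero, store its
				 * ones'-complement equivalent (0xFFFF)
				 * instead, presumably so that the
				 * "*stuffp == 0" test above still means
				 * "pseudo-header checksum not present".
				 */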
367 				*stuffp = (uint16_t)(cksum ? cksum : ~cksum);
368 			}
369 
370 			(void) hcksum_assoc(mp, NULL, NULL,
371 			    start, stuff, length, 0,
372 			    HCK_PARTIALCKSUM, KM_NOSLEEP);
373 
374 			xnbp->xnb_stat_csum_hardware++;
375 
376 			return (mp);
377 		}
378 
379 		/* NOTREACHED */
380 		break;
381 	}
382 
383 	default:
384 		/* Use software. */
385 		break;
386 	}
387 
388 software:
389 	/*
390 	 * We are not able to use any offload so do the whole thing in
391 	 * software.
392 	 */
393 	xnbp->xnb_stat_csum_software++;
394 
395 	return (xnb_software_csum(xnbp, mp));
396 }
397 
398 int
399 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
400 {
401 	xnb_t *xnbp;
402 	char *xsname, mac[ETHERADDRL * 3];
403 
404 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
405 
406 	xnbp->xnb_flavour = flavour;
407 	xnbp->xnb_flavour_data = flavour_data;
408 	xnbp->xnb_devinfo = dip;
409 	xnbp->xnb_evtchn = INVALID_EVTCHN;
410 	xnbp->xnb_irq = B_FALSE;
411 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
412 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
413 	xnbp->xnb_cksum_offload = xnb_cksum_offload;
414 	xnbp->xnb_connected = B_FALSE;
415 	xnbp->xnb_hotplugged = B_FALSE;
416 	xnbp->xnb_detachable = B_FALSE;
417 	xnbp->xnb_peer = xvdi_get_oeid(dip);
418 	xnbp->xnb_rx_pages_writable = B_FALSE;
419 
420 	xnbp->xnb_rx_buf_count = 0;
421 	xnbp->xnb_rx_unmop_count = 0;
422 
423 	xnbp->xnb_hv_copy = B_FALSE;
424 
425 	xnbp->xnb_tx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
426 	ASSERT(xnbp->xnb_tx_va != NULL);
427 
428 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
429 	    != DDI_SUCCESS)
430 		goto failure;
431 
432 	/* allocated on demand, when/if we enter xnb_copy_to_peer() */
433 	xnbp->xnb_tx_cpop = NULL;
434 	xnbp->xnb_cpop_sz = 0;
435 
436 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
437 	    xnbp->xnb_icookie);
438 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
439 	    xnbp->xnb_icookie);
440 
441 	/* set driver private pointer now */
442 	ddi_set_driver_private(dip, xnbp);
443 
444 	if (!xnb_ks_init(xnbp))
445 		goto failure_1;
446 
447 	/*
448 	 * Receive notification of changes in the state of the
449 	 * driver in the guest domain.
450 	 */
451 	if (xvdi_add_event_handler(dip, XS_OE_STATE,
452 	    xnb_oe_state_change) != DDI_SUCCESS)
453 		goto failure_2;
454 
455 	/*
456 	 * Receive notification of hotplug events.
457 	 */
458 	if (xvdi_add_event_handler(dip, XS_HP_STATE,
459 	    xnb_hp_state_change) != DDI_SUCCESS)
460 		goto failure_2;
461 
462 	xsname = xvdi_get_xsname(dip);
463 
464 	if (xenbus_printf(XBT_NULL, xsname,
465 	    "feature-no-csum-offload", "%d",
466 	    xnbp->xnb_cksum_offload ? 0 : 1) != 0)
467 		goto failure_3;
468 
469 	/*
470 	 * Use global xnb_hv_copy to export this feature. This means that
471 	 * we have to decide what to do before starting up a guest domain
472 	 */
473 	if (xenbus_printf(XBT_NULL, xsname,
474 	    "feature-rx-copy", "%d", xnb_hv_copy ? 1 : 0) != 0)
475 		goto failure_3;
476 	/*
477 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
478 	 * in addition to "feature-rx-copy" being 1. It seems strange
479 	 * to use four possible states to describe a binary decision,
480 	 * but we might as well play nice.
481 	 */
482 	if (xenbus_printf(XBT_NULL, xsname,
483 	    "feature-rx-flip", "%d", xnb_explicit_pageflip_set ? 1 : 0) != 0)
484 		goto failure_3;
485 
486 	if (xenbus_scanf(XBT_NULL, xsname,
487 	    "mac", "%s", mac) != 0) {
488 		cmn_err(CE_WARN, "xnb_attach: "
489 		    "cannot read mac address from %s",
490 		    xsname);
491 		goto failure_3;
492 	}
493 
494 	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
495 		cmn_err(CE_WARN,
496 		    "xnb_attach: cannot parse mac address %s",
497 		    mac);
498 		goto failure_3;
499 	}
500 
501 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
502 	(void) xvdi_post_event(dip, XEN_HP_ADD);
503 
504 	return (DDI_SUCCESS);
505 
506 failure_3:
507 	xvdi_remove_event_handler(dip, NULL);
508 
509 failure_2:
510 	xnb_ks_free(xnbp);
511 
512 failure_1:
513 	mutex_destroy(&xnbp->xnb_rx_lock);
514 	mutex_destroy(&xnbp->xnb_tx_lock);
515 
516 failure:
517 	vmem_free(heap_arena, xnbp->xnb_tx_va, PAGESIZE);
518 	kmem_free(xnbp, sizeof (*xnbp));
519 	return (DDI_FAILURE);
520 }
521 
522 /*ARGSUSED*/
523 void
524 xnb_detach(dev_info_t *dip)
525 {
526 	xnb_t *xnbp = ddi_get_driver_private(dip);
527 
528 	ASSERT(xnbp != NULL);
529 	ASSERT(!xnbp->xnb_connected);
530 	ASSERT(xnbp->xnb_rx_buf_count == 0);
531 
532 	xnb_disconnect_rings(dip);
533 
534 	xvdi_remove_event_handler(dip, NULL);
535 
536 	xnb_ks_free(xnbp);
537 
538 	ddi_set_driver_private(dip, NULL);
539 
540 	mutex_destroy(&xnbp->xnb_tx_lock);
541 	mutex_destroy(&xnbp->xnb_rx_lock);
542 
543 	if (xnbp->xnb_cpop_sz > 0)
544 		kmem_free(xnbp->xnb_tx_cpop, sizeof (*xnbp->xnb_tx_cpop)
545 		    * xnbp->xnb_cpop_sz);
546 
547 	ASSERT(xnbp->xnb_tx_va != NULL);
548 	vmem_free(heap_arena, xnbp->xnb_tx_va, PAGESIZE);
549 
550 	kmem_free(xnbp, sizeof (*xnbp));
551 }
552 
553 
554 static mfn_t
555 xnb_alloc_page(xnb_t *xnbp)
556 {
557 #define	WARNING_RATE_LIMIT 100
558 #define	BATCH_SIZE 256
559 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
560 	static int nth = BATCH_SIZE;
561 	mfn_t mfn;
562 
563 	mutex_enter(&xnb_alloc_page_lock);
564 	if (nth == BATCH_SIZE) {
565 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
566 			xnbp->xnb_stat_allocation_failure++;
567 			mutex_exit(&xnb_alloc_page_lock);
568 
569 			/*
570 			 * Try for a single page in low memory situations.
571 			 */
572 			if (balloon_alloc_pages(1, &mfn) != 1) {
573 				if ((xnbp->xnb_stat_small_allocation_failure++
574 				    % WARNING_RATE_LIMIT) == 0)
575 					cmn_err(CE_WARN, "xnb_alloc_page: "
576 					    "Cannot allocate memory to "
577 					    "transfer packets to peer.");
578 				return (0);
579 			} else {
580 				xnbp->xnb_stat_small_allocation_success++;
581 				return (mfn);
582 			}
583 		}
584 
585 		nth = 0;
586 		xnbp->xnb_stat_allocation_success++;
587 	}
588 
589 	mfn = mfns[nth++];
590 	mutex_exit(&xnb_alloc_page_lock);
591 
592 	ASSERT(mfn != 0);
593 
594 	return (mfn);
595 #undef BATCH_SIZE
596 #undef WARNING_RATE_LIMIT
597 }
598 
599 /*ARGSUSED*/
600 static void
601 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
602 {
603 	int r;
604 	pfn_t pfn;
605 
606 	pfn = xen_assign_pfn(mfn);
607 	pfnzero(pfn, 0, PAGESIZE);
608 	xen_release_pfn(pfn);
609 
610 	/*
611 	 * This happens only in the error path, so batching is
612 	 * not worth the complication.
613 	 */
614 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
615 		cmn_err(CE_WARN, "xnb_free_page: cannot decrease memory "
616 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
617 		    r, mfn);
618 	}
619 }
620 
621 /*
622  * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but
623  * using local variables.
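 * 'loop' stands in for req_cons and 'prod' for rsp_prod_pvt; the
 * result is the smaller of the requests the peer has posted beyond
 * 'loop' and the room still available to queue responses given 'prod'.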
624  */
625 #define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
626 	((((_r)->sring->req_prod - loop) <		\
627 		(RING_SIZE(_r) - (loop - prod))) ?	\
628 	    ((_r)->sring->req_prod - loop) :		\
629 	    (RING_SIZE(_r) - (loop - prod)))
630 
631 mblk_t *
632 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
633 {
634 	mblk_t *free = mp, *prev = NULL;
635 	size_t len;
636 	gnttab_transfer_t *gop;
637 	boolean_t notify;
638 	RING_IDX loop, prod, end;
639 
640 	/*
641 	 * For each packet the sequence of operations is:
642 	 *
643 	 * 1. get a new page from the hypervisor.
644 	 * 2. get a request slot from the ring.
645 	 * 3. copy the data into the new page.
646 	 * 4. transfer the page to the peer.
647 	 * 5. update the request slot.
648 	 * 6. kick the peer.
649 	 * 7. free mp.
650 	 *
651 	 * In order to reduce the number of hypercalls, we prepare
652 	 * several packets for the peer and perform a single hypercall
653 	 * to transfer them.
654 	 */
655 
656 	mutex_enter(&xnbp->xnb_tx_lock);
657 
658 	/*
659 	 * If we are not connected to the peer or have not yet
660 	 * finished hotplug it is too early to pass packets to the
661 	 * peer.
662 	 */
663 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
664 		mutex_exit(&xnbp->xnb_tx_lock);
665 		DTRACE_PROBE(flip_tx_too_early);
666 		xnbp->xnb_stat_tx_too_early++;
667 		return (mp);
668 	}
669 
670 	loop = xnbp->xnb_rx_ring.req_cons;
671 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
672 	gop = xnbp->xnb_tx_top;
673 
674 	while ((mp != NULL) &&
675 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
676 
677 		mfn_t mfn;
678 		pfn_t pfn;
679 		netif_rx_request_t *rxreq;
680 		netif_rx_response_t *rxresp;
681 		char *valoop;
682 		size_t offset;
683 		mblk_t *ml;
684 		uint16_t cksum_flags;
685 
686 		/* 1 */
687 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
688 			xnbp->xnb_stat_xmit_defer++;
689 			break;
690 		}
691 
692 		/* 2 */
693 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
694 
695 #ifdef XNB_DEBUG
696 		if (!(rxreq->id < NET_RX_RING_SIZE))
697 			cmn_err(CE_PANIC, "xnb_to_peer: "
698 			    "id %d out of range in request 0x%p",
699 			    rxreq->id, (void *)rxreq);
700 #endif /* XNB_DEBUG */
701 
702 		/* Assign a pfn and map the new page at the allocated va. */
703 		pfn = xen_assign_pfn(mfn);
704 		hat_devload(kas.a_hat, xnbp->xnb_tx_va, PAGESIZE,
705 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
706 
707 		offset = TX_BUFFER_HEADROOM;
708 
709 		/* 3 */
710 		len = 0;
711 		valoop = xnbp->xnb_tx_va + offset;
712 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
713 			size_t chunk = ml->b_wptr - ml->b_rptr;
714 
715 			bcopy(ml->b_rptr, valoop, chunk);
716 			valoop += chunk;
717 			len += chunk;
718 		}
719 
720 		ASSERT(len + offset < PAGESIZE);
721 
722 		/* Release the pfn. */
723 		hat_unload(kas.a_hat, xnbp->xnb_tx_va, PAGESIZE,
724 		    HAT_UNLOAD_UNMAP);
725 		xen_release_pfn(pfn);
726 
727 		/* 4 */
728 		gop->mfn = mfn;
729 		gop->domid = xnbp->xnb_peer;
730 		gop->ref = rxreq->gref;
731 
732 		/* 5.1 */
733 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
734 		rxresp->offset = offset;
735 		rxresp->flags = 0;
736 
737 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
738 		if (cksum_flags != 0)
739 			xnbp->xnb_stat_tx_cksum_deferred++;
740 		rxresp->flags |= cksum_flags;
741 
742 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
743 		rxresp->status = len;
744 
745 		loop++;
746 		prod++;
747 		gop++;
748 		prev = mp;
749 		mp = mp->b_next;
750 	}
751 
752 	/*
753 	 * Did we actually do anything?
754 	 */
755 	if (loop == xnbp->xnb_rx_ring.req_cons) {
756 		mutex_exit(&xnbp->xnb_tx_lock);
757 		return (mp);
758 	}
759 
760 	end = loop;
761 
762 	/*
763 	 * Unlink the end of the 'done' list from the remainder.
764 	 */
765 	ASSERT(prev != NULL);
766 	prev->b_next = NULL;
767 
768 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_tx_top,
769 	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
770 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
771 	}
772 
773 	loop = xnbp->xnb_rx_ring.req_cons;
774 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
775 	gop = xnbp->xnb_tx_top;
776 
777 	while (loop < end) {
778 		int16_t status = NETIF_RSP_OKAY;
779 
780 		if (gop->status != 0) {
781 			status = NETIF_RSP_ERROR;
782 
783 			/*
784 			 * If the status is anything other than
785 			 * GNTST_bad_page then we don't own the page
786 			 * any more, so don't try to give it back.
787 			 */
788 			if (gop->status != GNTST_bad_page)
789 				gop->mfn = 0;
790 		} else {
791 			/* The page is no longer ours. */
792 			gop->mfn = 0;
793 		}
794 
795 		if (gop->mfn != 0)
796 			/*
797 			 * Give back the page, as we won't be using
798 			 * it.
799 			 */
800 			xnb_free_page(xnbp, gop->mfn);
801 		else
802 			/*
803 			 * We gave away a page, update our accounting
804 			 * now.
805 			 */
806 			balloon_drv_subtracted(1);
807 
808 		/* 5.2 */
809 		if (status != NETIF_RSP_OKAY) {
810 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
811 			    status;
812 		} else {
813 			xnbp->xnb_stat_opackets++;
814 			xnbp->xnb_stat_obytes += len;
815 		}
816 
817 		loop++;
818 		prod++;
819 		gop++;
820 	}
821 
822 	xnbp->xnb_rx_ring.req_cons = loop;
823 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
824 
825 	/* 6 */
826 	/* LINTED: constant in conditional context */
827 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
828 	if (notify) {
829 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
830 		xnbp->xnb_stat_tx_notify_sent++;
831 	} else {
832 		xnbp->xnb_stat_tx_notify_deferred++;
833 	}
834 
835 	if (mp != NULL)
836 		xnbp->xnb_stat_xmit_defer++;
837 
838 	mutex_exit(&xnbp->xnb_tx_lock);
839 
840 	/* Free mblk_t's that we consumed. */
841 	freemsgchain(free);
842 
843 	return (mp);
844 }
845 
846 /* helper functions for xnb_copy_to_peer */
847 
848 /*
849  * Grow the array of copy operation descriptors.
850  * Returns a pointer to the next available entry.
851  */
852 gnttab_copy_t *
853 grow_cpop_area(xnb_t *xnbp, gnttab_copy_t *o_cpop)
854 {
855 	/*
856 	 * o_cpop (arg 1) is a pointer to the entry we would like to use
857 	 * next but cannot, either because the area is full or because it
858 	 * has not yet been allocated (in which case o_cpop is NULL).
859 	 * old_cpop and new_cpop (local) are pointers to old/new
860 	 * versions of xnbp->xnb_tx_cpop.
861 	 */
862 	gnttab_copy_t	*new_cpop, *old_cpop, *ret_cpop;
863 	size_t		newcount;
864 
865 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
866 
867 	old_cpop = xnbp->xnb_tx_cpop;
868 	/*
869 	 * o_cpop is a pointer into the array pointed to by old_cpop;
870 	 * it would be an error for exactly one of these pointers to be NULL.
871 	 * We shouldn't call this function if xnb_tx_cpop has already
872 	 * been allocated, but we're starting to fill it from the beginning
873 	 * again.
874 	 */
875 	ASSERT((o_cpop == NULL && old_cpop == NULL) ||
876 	    (o_cpop != NULL && old_cpop != NULL && o_cpop != old_cpop));
877 
878 	newcount = xnbp->xnb_cpop_sz + CPOP_DEFCNT;
879 
880 	new_cpop = kmem_alloc(sizeof (*new_cpop) * newcount, KM_NOSLEEP);
881 	if (new_cpop == NULL) {
882 		xnbp->xnb_stat_other_allocation_failure++;
883 		return (NULL);
884 	}
885 
886 	if (o_cpop != NULL) {
887 		size_t	 offset = (o_cpop - old_cpop);
888 
889 		/* we only need to move the parts in use ... */
890 		(void) memmove(new_cpop, old_cpop, xnbp->xnb_cpop_sz *
891 		    (sizeof (*old_cpop)));
892 
893 		kmem_free(old_cpop, xnbp->xnb_cpop_sz * sizeof (*old_cpop));
894 
895 		ret_cpop = new_cpop + offset;
896 	} else {
897 		ret_cpop = new_cpop;
898 	}
899 
900 	xnbp->xnb_tx_cpop = new_cpop;
901 	xnbp->xnb_cpop_sz = newcount;
902 
903 	xnbp->xnb_stat_tx_cpoparea_grown++;
904 
905 	return (ret_cpop);
906 }
907 
908 /*
909  * Check whether an address is on a page that's foreign to this domain.
910  */
911 static boolean_t
912 is_foreign(void *addr)
913 {
914 	pfn_t	pfn = hat_getpfnum(kas.a_hat, addr);
915 
916 	return (pfn & PFN_IS_FOREIGN_MFN ? B_TRUE : B_FALSE);
917 }
918 
919 /*
920  * Insert a newly allocated mblk into a chain, replacing the old one.
921  */
922 static mblk_t *
923 replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
924 {
925 	uint32_t	start, stuff, end, value, flags;
926 	mblk_t		*new_mp;
927 
928 	new_mp = copyb(mp);
929 	if (new_mp == NULL)
930 		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message "
931 		    "for %p, len %lu", (void *) mp, len);
932 
933 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
934 	(void) hcksum_assoc(new_mp, NULL, NULL, start, stuff, end, value,
935 	    flags, KM_NOSLEEP);
936 
937 	new_mp->b_next = mp->b_next;
938 	new_mp->b_prev = mp->b_prev;
939 	new_mp->b_cont = mp->b_cont;
940 
941 	/* Make sure we only overwrite pointers to the mblk being replaced. */
942 	if (mp_prev != NULL && mp_prev->b_next == mp)
943 		mp_prev->b_next = new_mp;
944 
945 	if (ml_prev != NULL && ml_prev->b_cont == mp)
946 		ml_prev->b_cont = new_mp;
947 
948 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
949 	freemsg(mp);
950 
951 	return (new_mp);
952 }
953 
954 /*
955  * Set all the fields in a gnttab_copy_t.
956  */
957 static void
958 setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
959     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
960 {
961 	ASSERT(xnbp != NULL && gp != NULL);
962 
963 	gp->source.offset = s_off;
964 	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
965 	gp->source.domid = DOMID_SELF;
966 
967 	gp->len = (uint16_t)len;
968 	gp->flags = GNTCOPY_dest_gref;
969 	gp->status = 0;
970 
971 	gp->dest.u.ref = d_ref;
972 	gp->dest.offset = d_off;
973 	gp->dest.domid = xnbp->xnb_peer;
974 }
975 
976 mblk_t *
977 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
978 {
979 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
980 	mblk_t		*ml, *ml_prev;
981 	gnttab_copy_t	*gop_cp;
982 	boolean_t	notify;
983 	RING_IDX	loop, prod;
984 	int		i;
985 
986 	if (!xnbp->xnb_hv_copy)
987 		return (xnb_to_peer(xnbp, mp));
988 
989 	/*
990 	 * For each packet the sequence of operations is:
991 	 *
992 	 *  1. get a request slot from the ring.
993 	 *  2. set up data for hypercall (see NOTE below)
994 	 *  3. have the hypervisore copy the data
995 	 *  4. update the request slot.
996 	 *  5. kick the peer.
997 	 *
998 	 * NOTE ad 2.
999 	 *  In order to reduce the number of hypercalls, we prepare
1000 	 *  all the fragments of a packet (mp->b_cont != NULL) and
1001 	 *  perform a single hypercall to copy them to the peer.
1002 	 *  We also have to set up a separate copy operation for
1003 	 *  every page.
1004 	 *
1005 	 * If we have more than one message (mp->b_next != NULL),
1006 	 * we do this whole dance repeatedly.
1007 	 */
1008 
1009 	mutex_enter(&xnbp->xnb_tx_lock);
1010 
1011 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
1012 		mutex_exit(&xnbp->xnb_tx_lock);
1013 		DTRACE_PROBE(copy_tx_too_early);
1014 		xnbp->xnb_stat_tx_too_early++;
1015 		return (mp);
1016 	}
1017 
1018 	loop = xnbp->xnb_rx_ring.req_cons;
1019 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
1020 
1021 	while ((mp != NULL) &&
1022 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
1023 		netif_rx_request_t	*rxreq;
1024 		netif_rx_response_t	*rxresp;
1025 		size_t			offset, d_offset;
1026 		size_t			len;
1027 		uint16_t		cksum_flags;
1028 		int16_t			status = NETIF_RSP_OKAY;
1029 		int			item_count;
1030 
1031 		/* 1 */
1032 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
1033 
1034 #ifdef XNB_DEBUG
1035 		if (!(rxreq->id < NET_RX_RING_SIZE))
1036 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
1037 			    "id %d out of range in request 0x%p",
1038 			    rxreq->id, (void *)rxreq);
1039 #endif /* XNB_DEBUG */
1040 
1041 		/* 2 */
1042 		d_offset = offset = TX_BUFFER_HEADROOM;
1043 		len = 0;
1044 		item_count = 0;
1045 
1046 		gop_cp = xnbp->xnb_tx_cpop;
1047 
1048 		/*
1049 		 * We walk the b_cont pointers and set up a gop_cp
1050 		 * structure for every page in every data block we have.
1051 		 */
1052 		/* 2a */
1053 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
1054 			size_t	chunk = ml->b_wptr - ml->b_rptr;
1055 			uchar_t	*r_tmp,	*rpt_align;
1056 			size_t	r_offset;
1057 
1058 			/*
1059 			 * If we get an mblk on a page that doesn't belong to
1060 			 * this domain, get a new mblk to replace the old one.
1061 			 */
1062 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
1063 				mblk_t *ml_new = replace_msg(ml, chunk,
1064 				    mp_prev, ml_prev);
1065 
1066 				/* We can still use old ml, but not *ml! */
1067 				if (free == ml)
1068 					free = ml_new;
1069 				if (mp == ml)
1070 					mp = ml_new;
1071 				ml = ml_new;
1072 
1073 				xnbp->xnb_stat_tx_foreign_page++;
1074 			}
1075 
1076 			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
1077 			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
1078 			r_tmp = ml->b_rptr;
1079 
1080 			if (d_offset + chunk > PAGESIZE)
1081 				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
1082 				    "(svd: %p), ml %p, rpt_align %p, d_offset "
1083 				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
1084 				    (void *)mp, (void *)saved_mp, (void *)ml,
1085 				    (void *)rpt_align,
1086 				    d_offset, chunk, (int)PAGESIZE);
1087 
1088 			while (chunk > 0) {
1089 				size_t part_len;
1090 
1091 				item_count++;
1092 				if (item_count > xnbp->xnb_cpop_sz) {
1093 					gop_cp = grow_cpop_area(xnbp, gop_cp);
1094 					if (gop_cp == NULL)
1095 						goto failure;
1096 				}
1097 				/*
1098 				 * If our mblk crosses a page boundary, we need
1099 				 * to do a separate copy for every page.
1100 				 */
1101 				if (r_offset + chunk > PAGESIZE) {
1102 					part_len = PAGESIZE - r_offset;
1103 
1104 					DTRACE_PROBE3(mblk_page_crossed,
1105 					    (mblk_t *), ml, int, chunk, int,
1106 					    (int)r_offset);
1107 
1108 					xnbp->xnb_stat_tx_pagebndry_crossed++;
1109 				} else {
1110 					part_len = chunk;
1111 				}
1112 
1113 				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
1114 				    d_offset, part_len, rxreq->gref);
1115 
1116 				chunk -= part_len;
1117 
1118 				len += part_len;
1119 				d_offset += part_len;
1120 				r_tmp += part_len;
1121 				/*
1122 				 * The 2nd, 3rd ... last copies will always
1123 				 * start at r_tmp, therefore r_offset is 0.
1124 				 */
1125 				r_offset = 0;
1126 				gop_cp++;
1127 			}
1128 			ml_prev = ml;
1129 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
1130 			    chunk, int, len, int, item_count);
1131 		}
1132 		/* 3 */
1133 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_tx_cpop,
1134 		    item_count) != 0) {
1135 			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
1136 			DTRACE_PROBE(HV_granttableopfailed);
1137 		}
1138 
1139 		/* 4 */
1140 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
1141 		rxresp->offset = offset;
1142 
1143 		rxresp->flags = 0;
1144 
1145 		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
1146 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1147 		    (int)rxresp->status);
1148 
1149 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
1150 		if (cksum_flags != 0)
1151 			xnbp->xnb_stat_tx_cksum_deferred++;
1152 		rxresp->flags |= cksum_flags;
1153 
1154 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
1155 		rxresp->status = len;
1156 
1157 		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
1158 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1159 		    (int)rxresp->status);
1160 
1161 		for (i = 0; i < item_count; i++) {
1162 			if (xnbp->xnb_tx_cpop[i].status != 0) {
1163 				DTRACE_PROBE2(cpop__status__nonnull, int,
1164 				    (int)xnbp->xnb_tx_cpop[i].status,
1165 				    int, i);
1166 				status = NETIF_RSP_ERROR;
1167 			}
1168 		}
1169 
1170 		/* 5.2 */
1171 		if (status != NETIF_RSP_OKAY) {
1172 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
1173 			    status;
1174 			xnbp->xnb_stat_tx_rsp_notok++;
1175 		} else {
1176 			xnbp->xnb_stat_opackets++;
1177 			xnbp->xnb_stat_obytes += len;
1178 		}
1179 
1180 		loop++;
1181 		prod++;
1182 		mp_prev = mp;
1183 		mp = mp->b_next;
1184 	}
1185 failure:
1186 	/*
1187 	 * Did we actually do anything?
1188 	 */
1189 	if (loop == xnbp->xnb_rx_ring.req_cons) {
1190 		mutex_exit(&xnbp->xnb_tx_lock);
1191 		return (mp);
1192 	}
1193 
1194 	/*
1195 	 * Unlink the end of the 'done' list from the remainder.
1196 	 */
1197 	ASSERT(mp_prev != NULL);
1198 	mp_prev->b_next = NULL;
1199 
1200 	xnbp->xnb_rx_ring.req_cons = loop;
1201 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
1202 
1203 	/* 6 */
1204 	/* LINTED: constant in conditional context */
1205 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
1206 	if (notify) {
1207 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1208 		xnbp->xnb_stat_tx_notify_sent++;
1209 	} else {
1210 		xnbp->xnb_stat_tx_notify_deferred++;
1211 	}
1212 
1213 	if (mp != NULL)
1214 		xnbp->xnb_stat_xmit_defer++;
1215 
1216 	mutex_exit(&xnbp->xnb_tx_lock);
1217 
1218 	/* Free mblk_t structs we have consumed. */
1219 	freemsgchain(free);
1220 
1221 	return (mp);
1222 }
1223 
1224 /*ARGSUSED*/
1225 static int
1226 xnb_rxbuf_constructor(void *buf, void *arg, int kmflag)
1227 {
1228 	xnb_rxbuf_t *rxp = buf;
1229 
1230 	bzero(rxp, sizeof (*rxp));
1231 
1232 	rxp->xr_free_rtn.free_func = xnb_rx_complete;
1233 	rxp->xr_free_rtn.free_arg = (caddr_t)rxp;
1234 
1235 	rxp->xr_mop.host_addr =
1236 	    (uint64_t)(uintptr_t)vmem_alloc(heap_arena, PAGESIZE,
1237 	    ((kmflag & KM_NOSLEEP) == KM_NOSLEEP) ?
1238 	    VM_NOSLEEP : VM_SLEEP);
1239 
1240 	if (rxp->xr_mop.host_addr == NULL) {
1241 		cmn_err(CE_WARN, "xnb_rxbuf_constructor: "
1242 		    "cannot get address space");
1243 		return (-1);
1244 	}
1245 
1246 	/*
1247 	 * Have the hat ensure that page table exists for the VA.
1248 	 */
1249 	hat_prepare_mapping(kas.a_hat,
1250 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
1251 
1252 	return (0);
1253 }
1254 
1255 /*ARGSUSED*/
1256 static void
1257 xnb_rxbuf_destructor(void *buf, void *arg)
1258 {
1259 	xnb_rxbuf_t *rxp = buf;
1260 
1261 	ASSERT(rxp->xr_mop.host_addr != NULL);
1262 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
1263 
1264 	hat_release_mapping(kas.a_hat,
1265 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
1266 	vmem_free(heap_arena,
1267 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr, PAGESIZE);
1268 }
1269 
1270 static void
1271 xnb_rx_notify_peer(xnb_t *xnbp)
1272 {
1273 	boolean_t notify;
1274 
1275 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1276 
1277 	/* LINTED: constant in conditional context */
1278 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
1279 	if (notify) {
1280 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1281 		xnbp->xnb_stat_rx_notify_sent++;
1282 	} else {
1283 		xnbp->xnb_stat_rx_notify_deferred++;
1284 	}
1285 }
1286 
1287 static void
1288 xnb_rx_complete(xnb_rxbuf_t *rxp)
1289 {
1290 	xnb_t *xnbp = rxp->xr_xnbp;
1291 
1292 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
1293 
1294 	mutex_enter(&xnbp->xnb_rx_lock);
1295 	xnb_rx_schedule_unmop(xnbp, &rxp->xr_mop, rxp);
1296 	mutex_exit(&xnbp->xnb_rx_lock);
1297 }
1298 
1299 static void
1300 xnb_rx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
1301 {
1302 	RING_IDX i;
1303 	netif_tx_response_t *txresp;
1304 
1305 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1306 
1307 	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
1308 
1309 	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
1310 	txresp->id = id;
1311 	txresp->status = status;
1312 
1313 	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
1314 
1315 	/*
1316 	 * Note that we don't push the change to the peer here - that
1317 	 * is the callers responsibility.
1318 	 * is the caller's responsibility.
1319 }
1320 
1321 static void
1322 xnb_rx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop,
1323     xnb_rxbuf_t *rxp)
1324 {
1325 	gnttab_unmap_grant_ref_t	*unmop;
1326 	int				u_count;
1327 	int				reqs_on_ring;
1328 
1329 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1330 	ASSERT(xnbp->xnb_rx_unmop_count < NET_TX_RING_SIZE);
1331 
1332 	u_count = xnbp->xnb_rx_unmop_count++;
1333 
1334 	/* Cache data for the time when we actually unmap grant refs */
1335 	xnbp->xnb_rx_unmop_rxp[u_count] = rxp;
1336 
1337 	unmop = &xnbp->xnb_rx_unmop[u_count];
1338 	unmop->host_addr = mop->host_addr;
1339 	unmop->dev_bus_addr = mop->dev_bus_addr;
1340 	unmop->handle = mop->handle;
1341 
1342 	/*
1343 	 * We cannot check the ring once we're disconnected from it. Batching
1344 	 * doesn't seem to be a useful optimisation in this case either,
1345 	 * so we directly call into the actual unmap function.
1346 	 */
1347 	if (xnbp->xnb_connected) {
1348 		reqs_on_ring = RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_tx_ring);
1349 
1350 		/*
1351 		 * By tuning xnb_unmop_hiwat to N, we can emulate "N per batch"
1352 		 * or (with N == 1) "immediate unmop" behaviour.
1353 		 * The "> xnb_unmop_lowwat" is a guard against ring exhaustion.
1354 		 */
1355 		if (xnbp->xnb_rx_unmop_count < xnb_unmop_hiwat &&
1356 		    reqs_on_ring > xnb_unmop_lowwat)
1357 			return;
1358 	}
1359 
1360 	xnb_rx_perform_pending_unmop(xnbp);
1361 }
1362 
1363 /*
1364  * Here we perform the actual unmapping of the data that was
1365  * accumulated in xnb_rx_schedule_unmop().
1366  * Note that it is the caller's responsibility to make sure that
1367  * there's actually something there to unmop.
1368  */
1369 static void
1370 xnb_rx_perform_pending_unmop(xnb_t *xnbp)
1371 {
1372 	RING_IDX loop;
1373 #ifdef XNB_DEBUG
1374 	gnttab_unmap_grant_ref_t *unmop;
1375 #endif /* XNB_DEBUG */
1376 
1377 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1378 	ASSERT(xnbp->xnb_rx_unmop_count > 0);
1379 
1380 	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1381 	    xnbp->xnb_rx_unmop, xnbp->xnb_rx_unmop_count) < 0) {
1382 		cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
1383 		    "unmap grant operation failed, "
1384 		    "%d pages lost", xnbp->xnb_rx_unmop_count);
1385 	}
1386 
1387 #ifdef XNB_DEBUG
1388 	for (loop = 0, unmop = xnbp->xnb_rx_unmop;
1389 	    loop < xnbp->xnb_rx_unmop_count;
1390 	    loop++, unmop++) {
1391 		if (unmop->status != 0) {
1392 			cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
1393 			    "unmap grant reference failed (%d)",
1394 			    unmop->status);
1395 		}
1396 	}
1397 #endif /* XNB_DEBUG */
1398 
1399 	for (loop = 0; loop < xnbp->xnb_rx_unmop_count; loop++) {
1400 		xnb_rxbuf_t	*rxp = xnbp->xnb_rx_unmop_rxp[loop];
1401 
1402 		if (rxp == NULL)
1403 			cmn_err(CE_PANIC,
1404 			    "xnb_rx_perform_pending_unmop: "
1405 			    "unexpected NULL rxp (loop %d; count %d)!",
1406 			    loop, xnbp->xnb_rx_unmop_count);
1407 
1408 		if (xnbp->xnb_connected)
1409 			xnb_rx_mark_complete(xnbp, rxp->xr_id, rxp->xr_status);
1410 		xnb_rxbuf_put(xnbp, rxp);
1411 	}
1412 	if (xnbp->xnb_connected)
1413 		xnb_rx_notify_peer(xnbp);
1414 
1415 	xnbp->xnb_rx_unmop_count = 0;
1416 
1417 #ifdef XNB_DEBUG
1418 	bzero(xnbp->xnb_rx_unmop, sizeof (xnbp->xnb_rx_unmop));
1419 	bzero(xnbp->xnb_rx_unmop_rxp, sizeof (xnbp->xnb_rx_unmop_rxp));
1420 #endif /* XNB_DEBUG */
1421 }
1422 
1423 static xnb_rxbuf_t *
1424 xnb_rxbuf_get(xnb_t *xnbp, int flags)
1425 {
1426 	xnb_rxbuf_t *rxp;
1427 
1428 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1429 
1430 	rxp = kmem_cache_alloc(xnb_rxbuf_cachep, flags);
1431 	if (rxp != NULL) {
1432 		ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
1433 		rxp->xr_flags |= XNB_RXBUF_INUSE;
1434 
1435 		rxp->xr_xnbp = xnbp;
1436 		rxp->xr_mop.dom = xnbp->xnb_peer;
1437 
1438 		rxp->xr_mop.flags = GNTMAP_host_map;
1439 		if (!xnbp->xnb_rx_pages_writable)
1440 			rxp->xr_mop.flags |= GNTMAP_readonly;
1441 
1442 		xnbp->xnb_rx_buf_count++;
1443 	}
1444 
1445 	return (rxp);
1446 }
1447 
1448 static void
1449 xnb_rxbuf_put(xnb_t *xnbp, xnb_rxbuf_t *rxp)
1450 {
1451 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1452 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
1453 
1454 	rxp->xr_flags &= ~XNB_RXBUF_INUSE;
1455 	xnbp->xnb_rx_buf_count--;
1456 
1457 	kmem_cache_free(xnb_rxbuf_cachep, rxp);
1458 }
1459 
1460 static mblk_t *
1461 xnb_recv(xnb_t *xnbp)
1462 {
1463 	RING_IDX start, end, loop;
1464 	gnttab_map_grant_ref_t *mop;
1465 	xnb_rxbuf_t **rxpp;
1466 	netif_tx_request_t *txreq;
1467 	boolean_t work_to_do;
1468 	mblk_t *head, *tail;
1469 	/*
1470 	 * If the peer granted a read-only mapping to the page then we
1471 	 * must copy the data, as the local protocol stack (should the
1472 	 * packet be destined for this host) will modify the packet
1473 	 * 'in place'.
1474 	 */
1475 	boolean_t copy = !xnbp->xnb_rx_pages_writable;
1476 
1477 	/*
1478 	 * For each individual request, the sequence of actions is:
1479 	 *
1480 	 * 1. get the request.
1481 	 * 2. map the page based on the grant ref.
1482 	 * 3. allocate an mblk, copy the data to it.
1483 	 * 4. release the grant.
1484 	 * 5. update the ring.
1485 	 * 6. pass the packet upward.
1486 	 * 7. kick the peer.
1487 	 *
1488 	 * In fact, we try to perform the grant operations in batches,
1489 	 * so there are two loops.
1490 	 */
1491 
1492 	head = tail = NULL;
1493 around:
1494 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1495 
1496 	/* LINTED: constant in conditional context */
1497 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
1498 	if (!work_to_do) {
1499 finished:
1500 		return (head);
1501 	}
1502 
1503 	start = xnbp->xnb_tx_ring.req_cons;
1504 	end = xnbp->xnb_tx_ring.sring->req_prod;
1505 
1506 	for (loop = start, mop = xnbp->xnb_rx_mop, rxpp = xnbp->xnb_rx_bufp;
1507 	    loop != end;
1508 	    loop++, mop++, rxpp++) {
1509 		xnb_rxbuf_t *rxp;
1510 
1511 		rxp = xnb_rxbuf_get(xnbp, KM_NOSLEEP);
1512 		if (rxp == NULL)
1513 			break;
1514 
1515 		ASSERT(xnbp->xnb_rx_pages_writable ||
1516 		    ((rxp->xr_mop.flags & GNTMAP_readonly)
1517 		    == GNTMAP_readonly));
1518 
1519 		rxp->xr_mop.ref =
1520 		    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop)->gref;
1521 
1522 		*mop = rxp->xr_mop;
1523 		*rxpp = rxp;
1524 	}
1525 
1526 	if ((loop - start) == 0)
1527 		goto finished;
1528 
1529 	end = loop;
1530 
1531 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1532 	    xnbp->xnb_rx_mop, end - start) != 0) {
1533 
1534 		cmn_err(CE_WARN, "xnb_recv: map grant operation failed");
1535 
1536 		loop = start;
1537 		rxpp = xnbp->xnb_rx_bufp;
1538 
1539 		while (loop != end) {
1540 			xnb_rxbuf_put(xnbp, *rxpp);
1541 
1542 			loop++;
1543 			rxpp++;
1544 		}
1545 
1546 		goto finished;
1547 	}
1548 
1549 	for (loop = start, mop = xnbp->xnb_rx_mop, rxpp = xnbp->xnb_rx_bufp;
1550 	    loop != end;
1551 	    loop++, mop++, rxpp++) {
1552 		mblk_t *mp = NULL;
1553 		int16_t status = NETIF_RSP_OKAY;
1554 		xnb_rxbuf_t *rxp = *rxpp;
1555 
1556 		if (mop->status != 0) {
1557 			cmn_err(CE_WARN, "xnb_recv: "
1558 			    "failed to map buffer: %d",
1559 			    mop->status);
1560 			status = NETIF_RSP_ERROR;
1561 		}
1562 
1563 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1564 
1565 		if (status == NETIF_RSP_OKAY) {
1566 			if (copy) {
1567 				mp = allocb(txreq->size, BPRI_MED);
1568 				if (mp == NULL) {
1569 					status = NETIF_RSP_ERROR;
1570 					xnbp->xnb_stat_rx_allocb_failed++;
1571 				} else {
1572 					bcopy((caddr_t)(uintptr_t)
1573 					    mop->host_addr + txreq->offset,
1574 					    mp->b_wptr, txreq->size);
1575 					mp->b_wptr += txreq->size;
1576 				}
1577 			} else {
1578 				mp = desballoc((uchar_t *)(uintptr_t)
1579 				    mop->host_addr + txreq->offset,
1580 				    txreq->size, 0, &rxp->xr_free_rtn);
1581 				if (mp == NULL) {
1582 					status = NETIF_RSP_ERROR;
1583 					xnbp->xnb_stat_rx_allocb_failed++;
1584 				} else {
1585 					rxp->xr_id = txreq->id;
1586 					rxp->xr_status = status;
1587 					rxp->xr_mop = *mop;
1588 
1589 					mp->b_wptr += txreq->size;
1590 				}
1591 			}
1592 
1593 			/*
1594 			 * If we have a buffer and there are checksum
1595 			 * flags, process them appropriately.
1596 			 */
1597 			if ((mp != NULL) &&
1598 			    ((txreq->flags &
1599 			    (NETTXF_csum_blank | NETTXF_data_validated))
1600 			    != 0)) {
1601 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
1602 				    mp, txreq->flags);
1603 				xnbp->xnb_stat_rx_cksum_no_need++;
1604 			}
1605 		}
1606 
1607 		if (copy || (mp == NULL)) {
1608 			rxp->xr_status = status;
1609 			rxp->xr_id = txreq->id;
1610 			xnb_rx_schedule_unmop(xnbp, mop, rxp);
1611 		}
1612 
1613 		if (mp != NULL) {
1614 			xnbp->xnb_stat_ipackets++;
1615 			xnbp->xnb_stat_rbytes += txreq->size;
1616 
1617 			mp->b_next = NULL;
1618 			if (head == NULL) {
1619 				ASSERT(tail == NULL);
1620 				head = mp;
1621 			} else {
1622 				ASSERT(tail != NULL);
1623 				tail->b_next = mp;
1624 			}
1625 			tail = mp;
1626 		}
1627 	}
1628 
1629 	xnbp->xnb_tx_ring.req_cons = loop;
1630 
1631 	goto around;
1632 	/* NOTREACHED */
1633 }
1634 
1635 /*
1636  *  intr() -- ring interrupt service routine
1637  */
1638 static uint_t
1639 xnb_intr(caddr_t arg)
1640 {
1641 	xnb_t *xnbp = (xnb_t *)arg;
1642 	mblk_t *mp;
1643 
1644 	xnbp->xnb_stat_intr++;
1645 
1646 	mutex_enter(&xnbp->xnb_rx_lock);
1647 
1648 	ASSERT(xnbp->xnb_connected);
1649 
1650 	mp = xnb_recv(xnbp);
1651 
1652 	mutex_exit(&xnbp->xnb_rx_lock);
1653 
1654 	if (!xnbp->xnb_hotplugged) {
1655 		xnbp->xnb_stat_rx_too_early++;
1656 		goto fail;
1657 	}
1658 	if (mp == NULL) {
1659 		xnbp->xnb_stat_spurious_intr++;
1660 		goto fail;
1661 	}
1662 
1663 	xnbp->xnb_flavour->xf_recv(xnbp, mp);
1664 
1665 	return (DDI_INTR_CLAIMED);
1666 
1667 fail:
1668 	freemsgchain(mp);
1669 	return (DDI_INTR_CLAIMED);
1670 }
1671 
1672 static boolean_t
1673 xnb_connect_rings(dev_info_t *dip)
1674 {
1675 	xnb_t *xnbp = ddi_get_driver_private(dip);
1676 	char *oename;
1677 	struct gnttab_map_grant_ref map_op;
1678 	evtchn_port_t evtchn;
1679 	int i;
1680 
1681 	/*
1682 	 * Cannot attempt to connect the rings if already connected.
1683 	 */
1684 	ASSERT(!xnbp->xnb_connected);
1685 
1686 	oename = xvdi_get_oename(dip);
1687 
1688 	if (xenbus_gather(XBT_NULL, oename,
1689 	    "event-channel", "%u", &evtchn,
1690 	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
1691 	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
1692 	    NULL) != 0) {
1693 		cmn_err(CE_WARN, "xnb_connect_rings: "
1694 		    "cannot read other-end details from %s",
1695 		    oename);
1696 		goto fail;
1697 	}
1698 
1699 	if (xenbus_scanf(XBT_NULL, oename,
1700 	    "feature-tx-writable", "%d", &i) != 0)
1701 		i = 0;
1702 	if (i != 0)
1703 		xnbp->xnb_rx_pages_writable = B_TRUE;
1704 
1705 	if (xenbus_scanf(XBT_NULL, oename,
1706 	    "feature-no-csum-offload", "%d", &i) != 0)
1707 		i = 0;
1708 	if ((i == 1) || !xnbp->xnb_cksum_offload)
1709 		xnbp->xnb_cksum_offload = B_FALSE;
1710 
1711 	/* Check whether our peer knows and requests hypervisor copy */
1712 	if (xenbus_scanf(XBT_NULL, oename, "request-rx-copy", "%d", &i)
1713 	    != 0)
1714 		i = 0;
1715 	if (i != 0)
1716 		xnbp->xnb_hv_copy = B_TRUE;
1717 
1718 	/*
1719 	 * 1. allocate a vaddr for the tx page, one for the rx page.
1720 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1721 	 *    into the allocated vaddr (one for tx, one for rx).
1722 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1723 	 *    bound to this domain.
1724 	 * 4. associate the event channel with an interrupt.
1725 	 * 5. declare ourselves connected.
1726 	 * 6. enable the interrupt.
1727 	 */
1728 
1729 	/* 1.tx */
1730 	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1731 	    0, 0, 0, 0, VM_SLEEP);
1732 	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
1733 
1734 	/* 2.tx */
1735 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
1736 	map_op.flags = GNTMAP_host_map;
1737 	map_op.ref = xnbp->xnb_tx_ring_ref;
1738 	map_op.dom = xnbp->xnb_peer;
1739 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1740 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1741 	    &map_op, 1) != 0 || map_op.status != 0) {
1742 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1743 		goto fail;
1744 	}
1745 	xnbp->xnb_tx_ring_handle = map_op.handle;
1746 
1747 	/* LINTED: constant in conditional context */
1748 	BACK_RING_INIT(&xnbp->xnb_tx_ring,
1749 	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1750 
1751 	/* 1.rx */
1752 	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1753 	    0, 0, 0, 0, VM_SLEEP);
1754 	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
1755 
1756 	/* 2.rx */
1757 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
1758 	map_op.flags = GNTMAP_host_map;
1759 	map_op.ref = xnbp->xnb_rx_ring_ref;
1760 	map_op.dom = xnbp->xnb_peer;
1761 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1762 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1763 	    &map_op, 1) != 0 || map_op.status != 0) {
1764 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1765 		goto fail;
1766 	}
1767 	xnbp->xnb_rx_ring_handle = map_op.handle;
1768 
1769 	/* LINTED: constant in conditional context */
1770 	BACK_RING_INIT(&xnbp->xnb_rx_ring,
1771 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
1772 
1773 	/* 3 */
1774 	if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS) {
1775 		cmn_err(CE_WARN, "xnb_connect_rings: "
1776 		    "cannot bind event channel %d", evtchn);
1777 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1778 		goto fail;
1779 	}
1780 	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
1781 
1782 	/*
1783 	 * It would be good to set the state to XenbusStateConnected
1784 	 * here as well, but then what if ddi_add_intr() failed?
1785 	 * Changing the state in the store will be noticed by the peer
1786 	 * and cannot be "taken back".
1787 	 */
1788 	mutex_enter(&xnbp->xnb_tx_lock);
1789 	mutex_enter(&xnbp->xnb_rx_lock);
1790 
1791 	/* 5.1 */
1792 	xnbp->xnb_connected = B_TRUE;
1793 
1794 	mutex_exit(&xnbp->xnb_rx_lock);
1795 	mutex_exit(&xnbp->xnb_tx_lock);
1796 
1797 	/* 4, 6 */
1798 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1799 	    != DDI_SUCCESS) {
1800 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1801 		goto fail;
1802 	}
1803 	xnbp->xnb_irq = B_TRUE;
1804 
1805 	/* 5.2 */
1806 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1807 
1808 	return (B_TRUE);
1809 
1810 fail:
1811 	mutex_enter(&xnbp->xnb_tx_lock);
1812 	mutex_enter(&xnbp->xnb_rx_lock);
1813 
1814 	xnbp->xnb_connected = B_FALSE;
1815 	mutex_exit(&xnbp->xnb_rx_lock);
1816 	mutex_exit(&xnbp->xnb_tx_lock);
1817 
1818 	return (B_FALSE);
1819 }
1820 
1821 static void
1822 xnb_disconnect_rings(dev_info_t *dip)
1823 {
1824 	xnb_t *xnbp = ddi_get_driver_private(dip);
1825 
1826 	if (xnbp->xnb_irq) {
1827 		ddi_remove_intr(dip, 0, NULL);
1828 		xnbp->xnb_irq = B_FALSE;
1829 	}
1830 
1831 	if (xnbp->xnb_rx_unmop_count > 0)
1832 		xnb_rx_perform_pending_unmop(xnbp);
1833 
1834 	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
1835 		xvdi_free_evtchn(dip);
1836 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1837 	}
1838 
1839 	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
1840 		struct gnttab_unmap_grant_ref unmap_op;
1841 
1842 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1843 		    xnbp->xnb_rx_ring_addr;
1844 		unmap_op.dev_bus_addr = 0;
1845 		unmap_op.handle = xnbp->xnb_rx_ring_handle;
1846 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1847 		    &unmap_op, 1) != 0)
1848 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1849 			    "cannot unmap rx-ring page (%d)",
1850 			    unmap_op.status);
1851 
1852 		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
1853 	}
1854 
1855 	if (xnbp->xnb_rx_ring_addr != NULL) {
1856 		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1857 		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
1858 		xnbp->xnb_rx_ring_addr = NULL;
1859 	}
1860 
1861 	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
1862 		struct gnttab_unmap_grant_ref unmap_op;
1863 
1864 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1865 		    xnbp->xnb_tx_ring_addr;
1866 		unmap_op.dev_bus_addr = 0;
1867 		unmap_op.handle = xnbp->xnb_tx_ring_handle;
1868 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1869 		    &unmap_op, 1) != 0)
1870 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1871 			    "cannot unmap tx-ring page (%d)",
1872 			    unmap_op.status);
1873 
1874 		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
1875 	}
1876 
1877 	if (xnbp->xnb_tx_ring_addr != NULL) {
1878 		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1879 		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
1880 		xnbp->xnb_tx_ring_addr = NULL;
1881 	}
1882 }
1883 
1884 /*ARGSUSED*/
1885 static void
1886 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1887     void *arg, void *impl_data)
1888 {
1889 	xnb_t *xnbp = ddi_get_driver_private(dip);
1890 	XenbusState new_state = *(XenbusState *)impl_data;
1891 
1892 	ASSERT(xnbp != NULL);
1893 
1894 	switch (new_state) {
1895 	case XenbusStateConnected:
1896 		/* spurious state change */
1897 		if (xnbp->xnb_connected)
1898 			return;
1899 
1900 		if (xnb_connect_rings(dip)) {
1901 			xnbp->xnb_flavour->xf_peer_connected(xnbp);
1902 		} else {
1903 			xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1904 			xnb_disconnect_rings(dip);
1905 			(void) xvdi_switch_state(dip, XBT_NULL,
1906 			    XenbusStateClosed);
1907 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1908 		}
1909 
1910 		/*
1911 		 * Now that we've attempted to connect it's reasonable
1912 		 * to allow an attempt to detach.
1913 		 */
1914 		xnbp->xnb_detachable = B_TRUE;
1915 
1916 		break;
1917 
1918 	case XenbusStateClosing:
1919 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1920 
1921 		break;
1922 
1923 	case XenbusStateClosed:
1924 		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1925 
1926 		mutex_enter(&xnbp->xnb_tx_lock);
1927 		mutex_enter(&xnbp->xnb_rx_lock);
1928 
1929 		xnb_disconnect_rings(dip);
1930 		xnbp->xnb_connected = B_FALSE;
1931 
1932 		mutex_exit(&xnbp->xnb_rx_lock);
1933 		mutex_exit(&xnbp->xnb_tx_lock);
1934 
1935 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1936 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1937 		/*
1938 		 * In all likelihood this is already set (in the above
1939 		 * case), but if the peer never attempted to connect
1940 		 * and the domain is destroyed we get here without
1941 		 * having been through the case above, so we set it to
1942 		 * be sure.
1943 		 */
1944 		xnbp->xnb_detachable = B_TRUE;
1945 
1946 		break;
1947 
1948 	default:
1949 		break;
1950 	}
1951 }
1952 
1953 /*ARGSUSED*/
1954 static void
1955 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1956     void *arg, void *impl_data)
1957 {
1958 	xnb_t *xnbp = ddi_get_driver_private(dip);
1959 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
1960 	boolean_t success;
1961 
1962 	ASSERT(xnbp != NULL);
1963 
1964 	switch (state) {
1965 	case Connected:
1966 
1967 		/* spurious hotplug event */
1968 		if (xnbp->xnb_hotplugged)
1969 			return;
1970 
1971 		success = xnbp->xnb_flavour->xf_hotplug_connected(xnbp);
1972 
1973 		mutex_enter(&xnbp->xnb_tx_lock);
1974 		mutex_enter(&xnbp->xnb_rx_lock);
1975 
1976 		xnbp->xnb_hotplugged = success;
1977 
1978 		mutex_exit(&xnbp->xnb_rx_lock);
1979 		mutex_exit(&xnbp->xnb_tx_lock);
1980 		break;
1981 
1982 	default:
1983 		break;
1984 	}
1985 }
1986 
1987 static struct modldrv modldrv = {
1988 	&mod_miscops, "xnb",
1989 };
1990 
1991 static struct modlinkage modlinkage = {
1992 	MODREV_1, &modldrv, NULL
1993 };
1994 
1995 int
1996 _init(void)
1997 {
1998 	int i;
1999 
2000 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
2001 
2002 	xnb_rxbuf_cachep = kmem_cache_create("xnb_rxbuf_cachep",
2003 	    sizeof (xnb_rxbuf_t), 0, xnb_rxbuf_constructor,
2004 	    xnb_rxbuf_destructor, NULL, NULL, NULL, 0);
2005 	ASSERT(xnb_rxbuf_cachep != NULL);
2006 
2007 	i = mod_install(&modlinkage);
2008 	if (i != DDI_SUCCESS) {
2009 		kmem_cache_destroy(xnb_rxbuf_cachep);
2010 		mutex_destroy(&xnb_alloc_page_lock);
2011 	}
2012 	return (i);
2013 }
2014 
2015 int
2016 _info(struct modinfo *modinfop)
2017 {
2018 	return (mod_info(&modlinkage, modinfop));
2019 }
2020 
2021 int
2022 _fini(void)
2023 {
2024 	int i;
2025 
2026 	i = mod_remove(&modlinkage);
2027 	if (i == DDI_SUCCESS) {
2028 		kmem_cache_destroy(xnb_rxbuf_cachep);
2029 		mutex_destroy(&xnb_alloc_page_lock);
2030 	}
2031 	return (i);
2032 }
2033