xref: /titanic_51/usr/src/uts/common/xen/io/xnb.c (revision a3c4695861e3f0a8d3706f77ccd53683cca48d67)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #ifdef DEBUG
30 #define	XNB_DEBUG 1
31 #endif /* DEBUG */
32 
33 #include "xnb.h"
34 
35 #include <sys/sunddi.h>
36 #include <sys/sunndi.h>
37 #include <sys/modctl.h>
38 #include <sys/conf.h>
39 #include <sys/mac.h>
40 #include <sys/dlpi.h>
41 #include <sys/strsubr.h>
42 #include <sys/strsun.h>
43 #include <sys/types.h>
44 #include <sys/pattr.h>
45 #include <vm/seg_kmem.h>
46 #include <vm/hat_i86.h>
47 #include <xen/sys/xenbus_impl.h>
48 #include <xen/sys/xendev.h>
49 #include <sys/balloon_impl.h>
50 #include <sys/evtchn_impl.h>
51 #include <sys/gnttab.h>
52 #include <vm/vm_dep.h>
53 
54 #include <sys/gld.h>
55 #include <inet/ip.h>
56 #include <inet/ip_impl.h>
57 #include <sys/vnic_impl.h> /* blech. */
58 
59 /*
60  * The terms "transmit" and "receive" are used in their traditional
61  * sense here - packets from other parts of this system are
62  * "transmitted" to the peer domain and those originating from the
63  * peer are "received".
64  *
65  * In some cases this can be confusing, because various data
66  * structures are shared with the domU driver, which has the opposite
67  * view of what constitutes "transmit" and "receive".  In naming the
68  * shared structures the domU driver always wins.
69  */
70 
71 /*
72  * XXPV dme: things to do, as well as various things indicated
73  * throughout the source:
74  * - copy avoidance outbound.
75  * - copy avoidance inbound.
76  * - transfer credit limiting.
77  * - MAC address based filtering.
78  */
79 
80 /*
81  * Linux expects to have some headroom in received buffers.  The Linux
82  * frontend driver (netfront) checks to see if the headroom is
83  * available and will re-allocate the buffer to make room if
84  * necessary.  To avoid this we add TX_BUFFER_HEADROOM bytes of
85  * headroom to each packet we pass to the peer.
86  */
87 #define	TX_BUFFER_HEADROOM	16
88 
89 static boolean_t	xnb_cksum_offload = B_TRUE;
90 
91 static boolean_t	xnb_connect_rings(dev_info_t *);
92 static void		xnb_disconnect_rings(dev_info_t *);
93 static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
94     void *, void *);
95 static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
96     void *, void *);
97 
98 static int	xnb_rxbuf_constructor(void *, void *, int);
99 static void	xnb_rxbuf_destructor(void *, void *);
100 static xnb_rxbuf_t *xnb_rxbuf_get(xnb_t *, int);
101 static void	xnb_rxbuf_put(xnb_t *, xnb_rxbuf_t *);
102 static void	xnb_rx_notify_peer(xnb_t *);
103 static void	xnb_rx_complete(xnb_rxbuf_t *);
104 static void	xnb_rx_mark_complete(xnb_t *, RING_IDX, int16_t);
105 static void 	xnb_rx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *,
106     xnb_rxbuf_t *);
107 static void	xnb_rx_perform_pending_unmop(xnb_t *);
108 mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);
109 
110 int		xnb_unmop_lowwat = NET_TX_RING_SIZE >> 2;
111 int		xnb_unmop_hiwat = NET_TX_RING_SIZE - (NET_TX_RING_SIZE >> 2);
112 
113 
114 boolean_t	xnb_hv_copy = B_TRUE;
115 boolean_t	xnb_explicit_pageflip_set = B_FALSE;
116 
117 #ifdef XNB_DEBUG
118 #define	NR_GRANT_ENTRIES \
119 	(NR_GRANT_FRAMES * PAGESIZE / sizeof (grant_entry_t))
120 #endif /* XNB_DEBUG */
121 
122 /* XXPV dme: are these really invalid? */
123 #define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
124 #define	INVALID_GRANT_REF	((grant_ref_t)-1)
125 
126 static kmem_cache_t *xnb_rxbuf_cachep;
127 static kmutex_t	xnb_alloc_page_lock;
128 
129 /*
130  * Statistics.
131  */
132 static char *aux_statistics[] = {
133 	"tx_cksum_deferred",
134 	"rx_cksum_no_need",
135 	"tx_rsp_notok",
136 	"tx_notify_deferred",
137 	"tx_notify_sent",
138 	"rx_notify_deferred",
139 	"rx_notify_sent",
140 	"tx_too_early",
141 	"rx_too_early",
142 	"rx_allocb_failed",
143 	"tx_allocb_failed",
144 	"tx_foreign_page",
145 	"mac_full",
146 	"spurious_intr",
147 	"allocation_success",
148 	"allocation_failure",
149 	"small_allocation_success",
150 	"small_allocation_failure",
151 	"other_allocation_failure",
152 	"tx_pageboundary_crossed",
153 	"tx_cpoparea_grown",
154 	"csum_hardware",
155 	"csum_software",
156 };
157 
158 static int
159 xnb_ks_aux_update(kstat_t *ksp, int flag)
160 {
161 	xnb_t *xnbp;
162 	kstat_named_t *knp;
163 
164 	if (flag != KSTAT_READ)
165 		return (EACCES);
166 
167 	xnbp = ksp->ks_private;
168 	knp = ksp->ks_data;
169 
170 	/*
171 	 * Assignment order should match that of the names in
172 	 * aux_statistics.
173 	 */
174 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_deferred;
175 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_no_need;
176 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_rsp_notok;
177 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
178 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
179 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
180 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
181 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
182 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
183 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
184 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
185 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_foreign_page;
186 	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
187 	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
188 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
189 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
190 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
191 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
192 	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
193 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_pagebndry_crossed;
194 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cpoparea_grown;
195 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
196 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
197 
198 	return (0);
199 }
200 
201 static boolean_t
202 xnb_ks_init(xnb_t *xnbp)
203 {
204 	int nstat = sizeof (aux_statistics) /
205 	    sizeof (aux_statistics[0]);
206 	char **cp = aux_statistics;
207 	kstat_named_t *knp;
208 
209 	/*
210 	 * Create and initialise kstats.
211 	 */
212 	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
213 	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
214 	    KSTAT_TYPE_NAMED, nstat, 0);
215 	if (xnbp->xnb_kstat_aux == NULL)
216 		return (B_FALSE);
217 
218 	xnbp->xnb_kstat_aux->ks_private = xnbp;
219 	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
220 
221 	knp = xnbp->xnb_kstat_aux->ks_data;
222 	while (nstat > 0) {
223 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
224 
225 		knp++;
226 		cp++;
227 		nstat--;
228 	}
229 
230 	kstat_install(xnbp->xnb_kstat_aux);
231 
232 	return (B_TRUE);
233 }
234 
235 static void
236 xnb_ks_free(xnb_t *xnbp)
237 {
238 	kstat_delete(xnbp->xnb_kstat_aux);
239 }
240 
241 /*
242  * Software checksum calculation and insertion for an arbitrary packet.
243  */
244 /*ARGSUSED*/
245 static mblk_t *
246 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
247 {
248 	/*
249 	 * XXPV dme: shouldn't rely on vnic_fix_cksum(), not least
250 	 * because it doesn't cover all of the interesting cases :-(
251 	 */
252 	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
253 	    HCK_FULLCKSUM, KM_NOSLEEP);
254 
255 	return (vnic_fix_cksum(mp));
256 }
257 
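/*
 * Decide how the checksum for a packet should be provided: if the MAC
 * capabilities in 'capab' include full IPv4 hardware checksum offload
 * and the packet is a single-mblk TCP or UDP over IPv4 packet, mark it
 * to have the checksum computed in hardware; otherwise fall back to
 * computing it in software.
 */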
258 mblk_t *
259 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
260 {
261 	struct ether_header *ehp;
262 	uint16_t sap;
263 	uint32_t offset;
264 	ipha_t *ipha;
265 
266 	ASSERT(mp->b_next == NULL);
267 
268 	/*
269 	 * Check that the packet is contained in a single mblk.  In
270 	 * the "from peer" path this is true today, but will change
271 	 * when scatter gather support is added.  In the "to peer"
272 	 * path we cannot be sure, but in most cases it will be true
273 	 * (in the xnbo case the packet has come from a MAC device
274 	 * which is unlikely to split packets).
275 	 */
276 	if (mp->b_cont != NULL)
277 		goto software;
278 
279 	/*
280 	 * If the MAC has no hardware capability don't do any further
281 	 * checking.
282 	 */
283 	if (capab == 0)
284 		goto software;
285 
286 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
287 	ehp = (struct ether_header *)mp->b_rptr;
288 
289 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
290 		struct ether_vlan_header *evhp;
291 
292 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
293 		evhp = (struct ether_vlan_header *)mp->b_rptr;
294 		sap = ntohs(evhp->ether_type);
295 		offset = sizeof (struct ether_vlan_header);
296 	} else {
297 		sap = ntohs(ehp->ether_type);
298 		offset = sizeof (struct ether_header);
299 	}
300 
301 	/*
302 	 * We only attempt to do IPv4 packets in hardware.
303 	 */
304 	if (sap != ETHERTYPE_IP)
305 		goto software;
306 
307 	/*
308 	 * We know that this is an IPv4 packet.
309 	 */
310 	ipha = (ipha_t *)(mp->b_rptr + offset);
311 
312 	switch (ipha->ipha_protocol) {
313 	case IPPROTO_TCP:
314 	case IPPROTO_UDP:
315 		/*
316 		 * This is a TCP/IPv4 or UDP/IPv4 packet.
317 		 *
318 		 * If the capabilities indicate that full checksum
319 		 * offload is available, use it.
320 		 */
321 		if ((capab & HCKSUM_INET_FULL_V4) != 0) {
322 			(void) hcksum_assoc(mp, NULL, NULL,
323 			    0, 0, 0, 0,
324 			    HCK_FULLCKSUM, KM_NOSLEEP);
325 
326 			xnbp->xnb_stat_csum_hardware++;
327 
328 			return (mp);
329 		}
330 
331 		/*
332 		 * XXPV dme: If the capabilities indicate that partial
333 		 * checksum offload is available, we should use it.
334 		 */
335 
336 		break;
337 
338 	default:
339 		/* Use software. */
340 		break;
341 	}
342 
343 software:
344 	/*
345 	 * We are not able to use any offload so do the whole thing in
346 	 * software.
347 	 */
348 	xnbp->xnb_stat_csum_software++;
349 
350 	return (xnb_software_csum(xnbp, mp));
351 }
352 
353 int
354 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
355 {
356 	xnb_t *xnbp;
357 	char *xsname, mac[ETHERADDRL * 3];
358 
359 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
360 
361 	xnbp->xnb_flavour = flavour;
362 	xnbp->xnb_flavour_data = flavour_data;
363 	xnbp->xnb_devinfo = dip;
364 	xnbp->xnb_evtchn = INVALID_EVTCHN;
365 	xnbp->xnb_irq = B_FALSE;
366 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
367 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
368 	xnbp->xnb_cksum_offload = xnb_cksum_offload;
369 	xnbp->xnb_connected = B_FALSE;
370 	xnbp->xnb_hotplugged = B_FALSE;
371 	xnbp->xnb_detachable = B_FALSE;
372 	xnbp->xnb_peer = xvdi_get_oeid(dip);
373 	xnbp->xnb_rx_pages_writable = B_FALSE;
374 
375 	xnbp->xnb_rx_buf_count = 0;
376 	xnbp->xnb_rx_unmop_count = 0;
377 
378 	xnbp->xnb_hv_copy = B_FALSE;
379 
380 	xnbp->xnb_tx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
381 	ASSERT(xnbp->xnb_tx_va != NULL);
382 
383 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
384 	    != DDI_SUCCESS)
385 		goto failure;
386 
387 	/* allocated on demand, when/if we enter xnb_copy_to_peer() */
388 	xnbp->xnb_tx_cpop = NULL;
389 	xnbp->xnb_cpop_sz = 0;
390 
391 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
392 	    xnbp->xnb_icookie);
393 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
394 	    xnbp->xnb_icookie);
395 
396 	/* set driver private pointer now */
397 	ddi_set_driver_private(dip, xnbp);
398 
399 	if (!xnb_ks_init(xnbp))
400 		goto failure_1;
401 
402 	/*
403 	 * Receive notification of changes in the state of the
404 	 * driver in the guest domain.
405 	 */
406 	if (xvdi_add_event_handler(dip, XS_OE_STATE,
407 	    xnb_oe_state_change) != DDI_SUCCESS)
408 		goto failure_2;
409 
410 	/*
411 	 * Receive notification of hotplug events.
412 	 */
413 	if (xvdi_add_event_handler(dip, XS_HP_STATE,
414 	    xnb_hp_state_change) != DDI_SUCCESS)
415 		goto failure_2;
416 
417 	xsname = xvdi_get_xsname(dip);
418 
419 	if (xenbus_printf(XBT_NULL, xsname,
420 	    "feature-no-csum-offload", "%d",
421 	    xnbp->xnb_cksum_offload ? 0 : 1) != 0)
422 		goto failure_3;
423 
424 	/*
425 	 * Use global xnb_hv_copy to export this feature. This means that
426 	 * we have to decide what to do before starting up a guest domain
427 	 * we have to decide what to do before starting up a guest domain.
428 	if (xenbus_printf(XBT_NULL, xsname,
429 	    "feature-rx-copy", "%d", xnb_hv_copy ? 1 : 0) != 0)
430 		goto failure_3;
431 	/*
432 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
433 	 * in addition to "feature-rx-copy" being 1. It seems strange
434 	 * to use four possible states to describe a binary decision,
435 	 * but we might as well play nice.
436 	 */
437 	if (xenbus_printf(XBT_NULL, xsname,
438 	    "feature-rx-flip", "%d", xnb_explicit_pageflip_set ? 1 : 0) != 0)
439 		goto failure_3;
440 
441 	if (xenbus_scanf(XBT_NULL, xsname,
442 	    "mac", "%s", mac) != 0) {
443 		cmn_err(CE_WARN, "xnb_attach: "
444 		    "cannot read mac address from %s",
445 		    xsname);
446 		goto failure_3;
447 	}
448 
449 	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
450 		cmn_err(CE_WARN,
451 		    "xnb_attach: cannot parse mac address %s",
452 		    mac);
453 		goto failure_3;
454 	}
455 
456 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
457 	(void) xvdi_post_event(dip, XEN_HP_ADD);
458 
459 	return (DDI_SUCCESS);
460 
461 failure_3:
462 	xvdi_remove_event_handler(dip, NULL);
463 
464 failure_2:
465 	xnb_ks_free(xnbp);
466 
467 failure_1:
468 	mutex_destroy(&xnbp->xnb_rx_lock);
469 	mutex_destroy(&xnbp->xnb_tx_lock);
470 
471 failure:
472 	vmem_free(heap_arena, xnbp->xnb_tx_va, PAGESIZE);
473 	kmem_free(xnbp, sizeof (*xnbp));
474 	return (DDI_FAILURE);
475 }
476 
477 /*ARGSUSED*/
478 void
479 xnb_detach(dev_info_t *dip)
480 {
481 	xnb_t *xnbp = ddi_get_driver_private(dip);
482 
483 	ASSERT(xnbp != NULL);
484 	ASSERT(!xnbp->xnb_connected);
485 	ASSERT(xnbp->xnb_rx_buf_count == 0);
486 
487 	xnb_disconnect_rings(dip);
488 
489 	xvdi_remove_event_handler(dip, NULL);
490 
491 	xnb_ks_free(xnbp);
492 
493 	ddi_set_driver_private(dip, NULL);
494 
495 	mutex_destroy(&xnbp->xnb_tx_lock);
496 	mutex_destroy(&xnbp->xnb_rx_lock);
497 
498 	if (xnbp->xnb_cpop_sz > 0)
499 		kmem_free(xnbp->xnb_tx_cpop, sizeof (*xnbp->xnb_tx_cpop)
500 		    * xnbp->xnb_cpop_sz);
501 
502 	ASSERT(xnbp->xnb_tx_va != NULL);
503 	vmem_free(heap_arena, xnbp->xnb_tx_va, PAGESIZE);
504 
505 	kmem_free(xnbp, sizeof (*xnbp));
506 }
507 
508 
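/*
 * Allocate a machine frame (mfn) to be transferred to the peer.
 * Frames are taken from the balloon in batches of BATCH_SIZE to reduce
 * the number of hypercalls; the batch (and the index into it) is
 * shared by all instances and protected by xnb_alloc_page_lock.
 * Returns 0 if no frame could be allocated.
 */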
509 static mfn_t
510 xnb_alloc_page(xnb_t *xnbp)
511 {
512 #define	WARNING_RATE_LIMIT 100
513 #define	BATCH_SIZE 256
514 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
515 	static int nth = BATCH_SIZE;
516 	mfn_t mfn;
517 
518 	mutex_enter(&xnb_alloc_page_lock);
519 	if (nth == BATCH_SIZE) {
520 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
521 			xnbp->xnb_stat_allocation_failure++;
522 			mutex_exit(&xnb_alloc_page_lock);
523 
524 			/*
525 			 * Try for a single page in low memory situations.
526 			 */
527 			if (balloon_alloc_pages(1, &mfn) != 1) {
528 				if ((xnbp->xnb_stat_small_allocation_failure++
529 				    % WARNING_RATE_LIMIT) == 0)
530 					cmn_err(CE_WARN, "xnb_alloc_page: "
531 					    "Cannot allocate memory to "
532 					    "transfer packets to peer.");
533 				return (0);
534 			} else {
535 				xnbp->xnb_stat_small_allocation_success++;
536 				return (mfn);
537 			}
538 		}
539 
540 		nth = 0;
541 		xnbp->xnb_stat_allocation_success++;
542 	}
543 
544 	mfn = mfns[nth++];
545 	mutex_exit(&xnb_alloc_page_lock);
546 
547 	ASSERT(mfn != 0);
548 
549 	return (mfn);
550 #undef BATCH_SIZE
551 #undef WARNING_RATE_LIMIT
552 }
553 
554 /*ARGSUSED*/
555 static void
556 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
557 {
558 	int r;
559 	pfn_t pfn;
560 
561 	pfn = xen_assign_pfn(mfn);
562 	pfnzero(pfn, 0, PAGESIZE);
563 	xen_release_pfn(pfn);
564 
565 	/*
566 	 * This happens only in the error path, so batching is
567 	 * not worth the complication.
568 	 */
569 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
570 		cmn_err(CE_WARN, "xnb_free_page: cannot decrease memory "
571 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
572 		    r, mfn);
573 	}
574 }
575 
576 /*
577  * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but
578  * using local variables.
579  */
580 #define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
581 	((((_r)->sring->req_prod - loop) <		\
582 		(RING_SIZE(_r) - (loop - prod))) ?	\
583 	    ((_r)->sring->req_prod - loop) :		\
584 	    (RING_SIZE(_r) - (loop - prod)))
585 
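/*
 * Pass a chain of packets to the peer using page flipping (grant
 * transfer).  Consumed packets are freed; any that could not be passed
 * on (ring full, no free pages or not yet connected) are returned to
 * the caller.
 */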
586 mblk_t *
587 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
588 {
589 	mblk_t *free = mp, *prev = NULL;
590 	size_t len;
591 	gnttab_transfer_t *gop;
592 	boolean_t notify;
593 	RING_IDX loop, prod, end;
594 
595 	/*
596 	 * For each packet the sequence of operations is:
597 	 *
598 	 * 1. get a new page from the hypervisor.
599 	 * 2. get a request slot from the ring.
600 	 * 3. copy the data into the new page.
601 	 * 4. transfer the page to the peer.
602 	 * 5. update the request slot.
603 	 * 6. kick the peer.
604 	 * 7. free mp.
605 	 *
606 	 * In order to reduce the number of hypercalls, we prepare
607 	 * several packets for the peer and perform a single hypercall
608 	 * to transfer them.
609 	 */
610 
611 	mutex_enter(&xnbp->xnb_tx_lock);
612 
613 	/*
614 	 * If we are not connected to the peer or have not yet
615 	 * finished hotplug it is too early to pass packets to the
616 	 * peer.
617 	 */
618 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
619 		mutex_exit(&xnbp->xnb_tx_lock);
620 		DTRACE_PROBE(flip_tx_too_early);
621 		xnbp->xnb_stat_tx_too_early++;
622 		return (mp);
623 	}
624 
625 	loop = xnbp->xnb_rx_ring.req_cons;
626 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
627 	gop = xnbp->xnb_tx_top;
628 
629 	while ((mp != NULL) &&
630 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
631 
632 		mfn_t mfn;
633 		pfn_t pfn;
634 		netif_rx_request_t *rxreq;
635 		netif_rx_response_t *rxresp;
636 		char *valoop;
637 		size_t offset;
638 		mblk_t *ml;
639 		uint16_t cksum_flags;
640 
641 		/* 1 */
642 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
643 			xnbp->xnb_stat_xmit_defer++;
644 			break;
645 		}
646 
647 		/* 2 */
648 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
649 
650 #ifdef XNB_DEBUG
651 		if (!(rxreq->id < NET_RX_RING_SIZE))
652 			cmn_err(CE_PANIC, "xnb_to_peer: "
653 			    "id %d out of range in request 0x%p",
654 			    rxreq->id, (void *)rxreq);
655 		if (rxreq->gref >= NR_GRANT_ENTRIES)
656 			cmn_err(CE_PANIC, "xnb_to_peer: "
657 			    "grant ref %d out of range in request 0x%p",
658 			    rxreq->gref, (void *)rxreq);
659 #endif /* XNB_DEBUG */
660 
661 		/* Assign a pfn and map the new page at the allocated va. */
662 		pfn = xen_assign_pfn(mfn);
663 		hat_devload(kas.a_hat, xnbp->xnb_tx_va, PAGESIZE,
664 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
665 
666 		offset = TX_BUFFER_HEADROOM;
667 
668 		/* 3 */
669 		len = 0;
670 		valoop = xnbp->xnb_tx_va + offset;
671 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
672 			size_t chunk = ml->b_wptr - ml->b_rptr;
673 
674 			bcopy(ml->b_rptr, valoop, chunk);
675 			valoop += chunk;
676 			len += chunk;
677 		}
678 
679 		ASSERT(len + offset < PAGESIZE);
680 
681 		/* Release the pfn. */
682 		hat_unload(kas.a_hat, xnbp->xnb_tx_va, PAGESIZE,
683 		    HAT_UNLOAD_UNMAP);
684 		xen_release_pfn(pfn);
685 
686 		/* 4 */
687 		gop->mfn = mfn;
688 		gop->domid = xnbp->xnb_peer;
689 		gop->ref = rxreq->gref;
690 
691 		/* 5.1 */
692 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
693 		rxresp->offset = offset;
694 		rxresp->flags = 0;
695 
696 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
697 		if (cksum_flags != 0)
698 			xnbp->xnb_stat_tx_cksum_deferred++;
699 		rxresp->flags |= cksum_flags;
700 
701 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
702 		rxresp->status = len;
703 
704 		loop++;
705 		prod++;
706 		gop++;
707 		prev = mp;
708 		mp = mp->b_next;
709 	}
710 
711 	/*
712 	 * Did we actually do anything?
713 	 */
714 	if (loop == xnbp->xnb_rx_ring.req_cons) {
715 		mutex_exit(&xnbp->xnb_tx_lock);
716 		return (mp);
717 	}
718 
719 	end = loop;
720 
721 	/*
722 	 * Unlink the end of the 'done' list from the remainder.
723 	 */
724 	ASSERT(prev != NULL);
725 	prev->b_next = NULL;
726 
727 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_tx_top,
728 	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
729 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
730 	}
731 
732 	loop = xnbp->xnb_rx_ring.req_cons;
733 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
734 	gop = xnbp->xnb_tx_top;
735 
736 	while (loop < end) {
737 		int16_t status = NETIF_RSP_OKAY;
738 
739 		if (gop->status != 0) {
740 			status = NETIF_RSP_ERROR;
741 
742 			/*
743 			 * If the status is anything other than
744 			 * GNTST_bad_page then we don't own the page
745 			 * any more, so don't try to give it back.
746 			 */
747 			if (gop->status != GNTST_bad_page)
748 				gop->mfn = 0;
749 		} else {
750 			/* The page is no longer ours. */
751 			gop->mfn = 0;
752 		}
753 
754 		if (gop->mfn != 0)
755 			/*
756 			 * Give back the page, as we won't be using
757 			 * it.
758 			 */
759 			xnb_free_page(xnbp, gop->mfn);
760 		else
761 			/*
762 			 * We gave away a page, update our accounting
763 			 * now.
764 			 */
765 			balloon_drv_subtracted(1);
766 
767 		/* 5.2 */
768 		if (status != NETIF_RSP_OKAY) {
769 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
770 			    status;
771 		} else {
772 			xnbp->xnb_stat_opackets++;
773 			xnbp->xnb_stat_obytes += len;
774 		}
775 
776 		loop++;
777 		prod++;
778 		gop++;
779 	}
780 
781 	xnbp->xnb_rx_ring.req_cons = loop;
782 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
783 
784 	/* 6 */
785 	/* LINTED: constant in conditional context */
786 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
787 	if (notify) {
788 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
789 		xnbp->xnb_stat_tx_notify_sent++;
790 	} else {
791 		xnbp->xnb_stat_tx_notify_deferred++;
792 	}
793 
794 	if (mp != NULL)
795 		xnbp->xnb_stat_xmit_defer++;
796 
797 	mutex_exit(&xnbp->xnb_tx_lock);
798 
799 	/* Free mblk_t's that we consumed. */
800 	freemsgchain(free);
801 
802 	return (mp);
803 }
804 
805 /* helper functions for xnb_copy_to_peer */
806 
807 /*
808  * Grow the array of copy operation descriptors.
809  * Returns a pointer to the next available entry.
810  */
811 gnttab_copy_t *
812 grow_cpop_area(xnb_t *xnbp, gnttab_copy_t *o_cpop)
813 {
814 	/*
815 	 * o_cpop (arg.1) is a ptr to the area we would like to copy
816 	 * something into but cannot, because we haven't alloc'ed it
817 	 * yet, or NULL.
818 	 * old_cpop and new_cpop (local) are pointers to old/new
819 	 * versions of xnbp->xnb_tx_cpop.
820 	 */
821 	gnttab_copy_t	*new_cpop, *old_cpop, *ret_cpop;
822 	size_t		newcount;
823 
824 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
825 
826 	old_cpop = xnbp->xnb_tx_cpop;
827 	/*
828 	 * o_cpop is a pointer into the array pointed to by old_cpop;
829 	 * it would be an error for exactly one of these pointers to be NULL.
830 	 * We also shouldn't be called if xnb_tx_cpop has already been
831 	 * allocated and we are merely starting to fill it again from the
832 	 * beginning (in which case o_cpop would equal old_cpop).
833 	 */
834 	ASSERT((o_cpop == NULL && old_cpop == NULL) ||
835 	    (o_cpop != NULL && old_cpop != NULL && o_cpop != old_cpop));
836 
837 	newcount = xnbp->xnb_cpop_sz + CPOP_DEFCNT;
838 
839 	new_cpop = kmem_alloc(sizeof (*new_cpop) * newcount, KM_NOSLEEP);
840 	if (new_cpop == NULL) {
841 		xnbp->xnb_stat_other_allocation_failure++;
842 		return (NULL);
843 	}
844 
845 	if (o_cpop != NULL) {
846 		size_t	 offset = (o_cpop - old_cpop);
847 
848 		/* we only need to move the parts in use ... */
849 		(void) memmove(new_cpop, old_cpop, xnbp->xnb_cpop_sz *
850 		    (sizeof (*old_cpop)));
851 
852 		kmem_free(old_cpop, xnbp->xnb_cpop_sz * sizeof (*old_cpop));
853 
854 		ret_cpop = new_cpop + offset;
855 	} else {
856 		ret_cpop = new_cpop;
857 	}
858 
859 	xnbp->xnb_tx_cpop = new_cpop;
860 	xnbp->xnb_cpop_sz = newcount;
861 
862 	xnbp->xnb_stat_tx_cpoparea_grown++;
863 
864 	return (ret_cpop);
865 }
866 
867 /*
868  * Check whether an address is on a page that's foreign to this domain.
869  */
870 static boolean_t
871 is_foreign(void *addr)
872 {
873 	pfn_t	pfn = hat_getpfnum(kas.a_hat, addr);
874 
875 	return (pfn & PFN_IS_FOREIGN_MFN ? B_TRUE : B_FALSE);
876 }
877 
878 /*
879  * Insert a newly allocated mblk into a chain, replacing the old one.
880  */
881 static mblk_t *
882 replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
883 {
884 	uint32_t	start, stuff, end, value, flags;
885 	mblk_t		*new_mp;
886 
887 	new_mp = copyb(mp);
888 	if (new_mp == NULL)
889 		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message"
890 		    " for %p, len %lu", (void *) mp, len);
891 
892 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
893 	(void) hcksum_assoc(new_mp, NULL, NULL, start, stuff, end, value,
894 	    flags, KM_NOSLEEP);
895 
896 	new_mp->b_next = mp->b_next;
897 	new_mp->b_prev = mp->b_prev;
898 	new_mp->b_cont = mp->b_cont;
899 
900 	/* Make sure we only overwrite pointers to the mblk being replaced. */
901 	if (mp_prev != NULL && mp_prev->b_next == mp)
902 		mp_prev->b_next = new_mp;
903 
904 	if (ml_prev != NULL && ml_prev->b_cont == mp)
905 		ml_prev->b_cont = new_mp;
906 
907 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
908 	freemsg(mp);
909 
910 	return (new_mp);
911 }
912 
913 /*
914  * Set all the fields in a gnttab_copy_t.
915  */
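/*
 * The source of the copy is this domain's memory at rptr + s_off
 * (named by machine frame); the destination is the peer page named by
 * grant reference d_ref, at offset d_off.
 */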
916 static void
917 setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
918     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
919 {
920 	ASSERT(xnbp != NULL && gp != NULL);
921 
922 	gp->source.offset = s_off;
923 	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
924 	gp->source.domid = DOMID_SELF;
925 
926 	gp->len = (uint16_t)len;
927 	gp->flags = GNTCOPY_dest_gref;
928 	gp->status = 0;
929 
930 	gp->dest.u.ref = d_ref;
931 	gp->dest.offset = d_off;
932 	gp->dest.domid = xnbp->xnb_peer;
933 }
934 
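/*
 * Pass a chain of packets to the peer using hypervisor copy
 * (GNTTABOP_copy) rather than page flipping; used when the peer has
 * requested copy mode (xnbp->xnb_hv_copy), otherwise we fall back to
 * xnb_to_peer().  As above, consumed packets are freed and the
 * unconsumed remainder is returned.
 */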
935 mblk_t *
936 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
937 {
938 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
939 	mblk_t		*ml, *ml_prev;
940 	gnttab_copy_t	*gop_cp;
941 	boolean_t	notify;
942 	RING_IDX	loop, prod;
943 	int		i;
944 
945 	if (!xnbp->xnb_hv_copy)
946 		return (xnb_to_peer(xnbp, mp));
947 
948 	/*
949 	 * For each packet the sequence of operations is:
950 	 *
951 	 *  1. get a request slot from the ring.
952 	 *  2. set up data for hypercall (see NOTE below)
953 	 *  3. have the hypervisor copy the data
954 	 *  4. update the request slot.
955 	 *  5. kick the peer.
956 	 *
957 	 * NOTE ad 2.
958 	 *  In order to reduce the number of hypercalls, we prepare
959 	 *  all the fragments of a packet (mp->b_cont != NULL) for the peer
960 	 *  and perform a single hypercall to copy them.
961 	 *  We also have to set up a separate copy operation for
962 	 *  every page.
963 	 *
964 	 * If we have more than one message (mp->b_next != NULL),
965 	 * we do this whole dance repeatedly.
966 	 */
967 
968 	mutex_enter(&xnbp->xnb_tx_lock);
969 
970 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
971 		mutex_exit(&xnbp->xnb_tx_lock);
972 		DTRACE_PROBE(copy_tx_too_early);
973 		xnbp->xnb_stat_tx_too_early++;
974 		return (mp);
975 	}
976 
977 	loop = xnbp->xnb_rx_ring.req_cons;
978 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
979 
980 	while ((mp != NULL) &&
981 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
982 		netif_rx_request_t	*rxreq;
983 		netif_rx_response_t	*rxresp;
984 		size_t			offset, d_offset;
985 		size_t			len;
986 		uint16_t		cksum_flags;
987 		int16_t			status = NETIF_RSP_OKAY;
988 		int			item_count;
989 
990 		/* 1 */
991 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
992 
993 #ifdef XNB_DEBUG
994 		if (!(rxreq->id < NET_RX_RING_SIZE))
995 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
996 			    "id %d out of range in request 0x%p",
997 			    rxreq->id, (void *)rxreq);
998 		if (rxreq->gref >= NR_GRANT_ENTRIES)
999 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
1000 			    "grant ref %d out of range in request 0x%p",
1001 			    rxreq->gref, (void *)rxreq);
1002 #endif /* XNB_DEBUG */
1003 
1004 		/* 2 */
1005 		d_offset = offset = TX_BUFFER_HEADROOM;
1006 		len = 0;
1007 		item_count = 0;
1008 
1009 		gop_cp = xnbp->xnb_tx_cpop;
1010 
1011 		/*
1012 		 * We walk the b_cont pointers and set up a gop_cp
1013 		 * structure for every page in every data block we have.
1014 		 */
1015 		/* 2a */
1016 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
1017 			size_t	chunk = ml->b_wptr - ml->b_rptr;
1018 			uchar_t	*r_tmp,	*rpt_align;
1019 			size_t	r_offset;
1020 
1021 			/*
1022 			 * If we get an mblk on a page that doesn't belong to
1023 			 * this domain, get a new mblk to replace the old one.
1024 			 */
1025 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
1026 				mblk_t *ml_new = replace_msg(ml, chunk,
1027 				    mp_prev, ml_prev);
1028 
1029 				/* We can still use old ml, but not *ml! */
1030 				if (free == ml)
1031 					free = ml_new;
1032 				if (mp == ml)
1033 					mp = ml_new;
1034 				ml = ml_new;
1035 
1036 				xnbp->xnb_stat_tx_foreign_page++;
1037 			}
1038 
1039 			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
1040 			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
1041 			r_tmp = ml->b_rptr;
1042 
1043 			if (d_offset + chunk > PAGESIZE)
1044 				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
1045 				    "(svd: %p), ml %p, rpt_alg. %p, d_offset "
1046 				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
1047 				    (void *)mp, (void *)saved_mp, (void *)ml,
1048 				    (void *)rpt_align,
1049 				    d_offset, chunk, (int)PAGESIZE);
1050 
1051 			while (chunk > 0) {
1052 				size_t part_len;
1053 
1054 				item_count++;
1055 				if (item_count > xnbp->xnb_cpop_sz) {
1056 					gop_cp = grow_cpop_area(xnbp, gop_cp);
1057 					if (gop_cp == NULL)
1058 						goto failure;
1059 				}
1060 				/*
1061 				 * If our mblk crosses a page boundary, we need
1062 				 * to do a separate copy for every page.
1063 				 */
1064 				if (r_offset + chunk > PAGESIZE) {
1065 					part_len = PAGESIZE - r_offset;
1066 
1067 					DTRACE_PROBE3(mblk_page_crossed,
1068 					    (mblk_t *), ml, int, chunk, int,
1069 					    (int)r_offset);
1070 
1071 					xnbp->xnb_stat_tx_pagebndry_crossed++;
1072 				} else {
1073 					part_len = chunk;
1074 				}
1075 
1076 				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
1077 				    d_offset, part_len, rxreq->gref);
1078 
1079 				chunk -= part_len;
1080 
1081 				len += part_len;
1082 				d_offset += part_len;
1083 				r_tmp += part_len;
1084 				/*
1085 				 * The 2nd, 3rd ... last copies will always
1086 				 * start at r_tmp, therefore r_offset is 0.
1087 				 */
1088 				r_offset = 0;
1089 				gop_cp++;
1090 			}
1091 			ml_prev = ml;
1092 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
1093 			    chunk, int, len, int, item_count);
1094 		}
1095 		/* 3 */
1096 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_tx_cpop,
1097 		    item_count) != 0) {
1098 			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
1099 			DTRACE_PROBE(HV_granttableopfailed);
1100 		}
1101 
1102 		/* 4 */
1103 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
1104 		rxresp->offset = offset;
1105 
1106 		rxresp->flags = 0;
1107 
1108 		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
1109 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1110 		    (int)rxresp->status);
1111 
1112 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
1113 		if (cksum_flags != 0)
1114 			xnbp->xnb_stat_tx_cksum_deferred++;
1115 		rxresp->flags |= cksum_flags;
1116 
1117 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
1118 		rxresp->status = len;
1119 
1120 		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
1121 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1122 		    (int)rxresp->status);
1123 
1124 		for (i = 0; i < item_count; i++) {
1125 			if (xnbp->xnb_tx_cpop[i].status != 0) {
1126 				DTRACE_PROBE2(cpop__status__nonnull, int,
1127 				    (int)xnbp->xnb_tx_cpop[i].status,
1128 				    int, i);
1129 				status = NETIF_RSP_ERROR;
1130 			}
1131 		}
1132 
1133 		/* 5.2 */
1134 		if (status != NETIF_RSP_OKAY) {
1135 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
1136 			    status;
1137 			xnbp->xnb_stat_tx_rsp_notok++;
1138 		} else {
1139 			xnbp->xnb_stat_opackets++;
1140 			xnbp->xnb_stat_obytes += len;
1141 		}
1142 
1143 		loop++;
1144 		prod++;
1145 		mp_prev = mp;
1146 		mp = mp->b_next;
1147 	}
1148 failure:
1149 	/*
1150 	 * Did we actually do anything?
1151 	 */
1152 	if (loop == xnbp->xnb_rx_ring.req_cons) {
1153 		mutex_exit(&xnbp->xnb_tx_lock);
1154 		return (mp);
1155 	}
1156 
1157 	/*
1158 	 * Unlink the end of the 'done' list from the remainder.
1159 	 */
1160 	ASSERT(mp_prev != NULL);
1161 	mp_prev->b_next = NULL;
1162 
1163 	xnbp->xnb_rx_ring.req_cons = loop;
1164 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
1165 
1166 	/* 6 */
1167 	/* LINTED: constant in conditional context */
1168 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
1169 	if (notify) {
1170 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1171 		xnbp->xnb_stat_tx_notify_sent++;
1172 	} else {
1173 		xnbp->xnb_stat_tx_notify_deferred++;
1174 	}
1175 
1176 	if (mp != NULL)
1177 		xnbp->xnb_stat_xmit_defer++;
1178 
1179 	mutex_exit(&xnbp->xnb_tx_lock);
1180 
1181 	/* Free mblk_t structs we have consumed. */
1182 	freemsgchain(free);
1183 
1184 	return (mp);
1185 }
1186 
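/*
 * kmem cache constructor for receive buffers: reserve a page of kernel
 * virtual address space (and make sure the hat has a page table for
 * it) so that a peer-granted page can later be mapped there.
 */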
1187 /*ARGSUSED*/
1188 static int
1189 xnb_rxbuf_constructor(void *buf, void *arg, int kmflag)
1190 {
1191 	xnb_rxbuf_t *rxp = buf;
1192 
1193 	bzero(rxp, sizeof (*rxp));
1194 
1195 	rxp->xr_free_rtn.free_func = xnb_rx_complete;
1196 	rxp->xr_free_rtn.free_arg = (caddr_t)rxp;
1197 
1198 	rxp->xr_mop.host_addr =
1199 	    (uint64_t)(uintptr_t)vmem_alloc(heap_arena, PAGESIZE,
1200 	    ((kmflag & KM_NOSLEEP) == KM_NOSLEEP) ?
1201 	    VM_NOSLEEP : VM_SLEEP);
1202 
1203 	if (rxp->xr_mop.host_addr == NULL) {
1204 		cmn_err(CE_WARN, "xnb_rxbuf_constructor: "
1205 		    "cannot get address space");
1206 		return (-1);
1207 	}
1208 
1209 	/*
1210 	 * Have the hat ensure that page table exists for the VA.
1211 	 */
1212 	hat_prepare_mapping(kas.a_hat,
1213 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
1214 
1215 	return (0);
1216 }
1217 
1218 /*ARGSUSED*/
1219 static void
1220 xnb_rxbuf_destructor(void *buf, void *arg)
1221 {
1222 	xnb_rxbuf_t *rxp = buf;
1223 
1224 	ASSERT(rxp->xr_mop.host_addr != NULL);
1225 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
1226 
1227 	hat_release_mapping(kas.a_hat,
1228 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
1229 	vmem_free(heap_arena,
1230 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr, PAGESIZE);
1231 }
1232 
1233 static void
1234 xnb_rx_notify_peer(xnb_t *xnbp)
1235 {
1236 	boolean_t notify;
1237 
1238 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1239 
1240 	/* LINTED: constant in conditional context */
1241 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
1242 	if (notify) {
1243 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1244 		xnbp->xnb_stat_rx_notify_sent++;
1245 	} else {
1246 		xnbp->xnb_stat_rx_notify_deferred++;
1247 	}
1248 }
1249 
1250 static void
1251 xnb_rx_complete(xnb_rxbuf_t *rxp)
1252 {
1253 	xnb_t *xnbp = rxp->xr_xnbp;
1254 
1255 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
1256 
1257 	mutex_enter(&xnbp->xnb_rx_lock);
1258 	xnb_rx_schedule_unmop(xnbp, &rxp->xr_mop, rxp);
1259 	mutex_exit(&xnbp->xnb_rx_lock);
1260 }
1261 
1262 static void
1263 xnb_rx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
1264 {
1265 	RING_IDX i;
1266 	netif_tx_response_t *txresp;
1267 
1268 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1269 
1270 	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
1271 
1272 	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
1273 	txresp->id = id;
1274 	txresp->status = status;
1275 
1276 	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
1277 
1278 	/*
1279 	 * Note that we don't push the change to the peer here - that
1280 	 * is the caller's responsibility.
1281 	 */
1282 }
1283 
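/*
 * Queue a grant-unmap operation for a receive buffer that is no longer
 * in use.  Unmaps are batched and flushed by
 * xnb_rx_perform_pending_unmop() according to the watermarks checked
 * below, or immediately if we are no longer connected to the peer.
 */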
1284 static void
1285 xnb_rx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop,
1286     xnb_rxbuf_t *rxp)
1287 {
1288 	gnttab_unmap_grant_ref_t	*unmop;
1289 	int				u_count;
1290 	int				reqs_on_ring;
1291 
1292 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1293 	ASSERT(xnbp->xnb_rx_unmop_count < NET_TX_RING_SIZE);
1294 
1295 	u_count = xnbp->xnb_rx_unmop_count++;
1296 
1297 	/* Cache data for the time when we actually unmap grant refs */
1298 	xnbp->xnb_rx_unmop_rxp[u_count] = rxp;
1299 
1300 	unmop = &xnbp->xnb_rx_unmop[u_count];
1301 	unmop->host_addr = mop->host_addr;
1302 	unmop->dev_bus_addr = mop->dev_bus_addr;
1303 	unmop->handle = mop->handle;
1304 
1305 	/*
1306 	 * We cannot check the ring once we're disconnected from it. Batching
1307 	 * doesn't seem to be a useful optimisation in this case either,
1308 	 * so we directly call into the actual unmap function.
1309 	 */
1310 	if (xnbp->xnb_connected) {
1311 		reqs_on_ring = RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring);
1312 
1313 		/*
1314 		 * By tuning xnb_unmop_hiwat to N, we can emulate "N per batch"
1315 		 * or (with N == 1) "immediate unmop" behaviour.
1316 		 * The "> xnb_unmop_lowwat" is a guard against ring exhaustion.
1317 		 */
1318 		if (xnbp->xnb_rx_unmop_count < xnb_unmop_hiwat &&
1319 		    reqs_on_ring > xnb_unmop_lowwat)
1320 			return;
1321 	}
1322 
1323 	xnb_rx_perform_pending_unmop(xnbp);
1324 }
1325 
1326 /*
1327  * Here we perform the actual unmapping of the data that was
1328  * accumulated in xnb_rx_schedule_unmop().
1329  * Note that it is the caller's responsibility to make sure that
1330  * there's actually something there to unmop.
1331  */
1332 static void
1333 xnb_rx_perform_pending_unmop(xnb_t *xnbp)
1334 {
1335 	RING_IDX loop;
1336 #ifdef XNB_DEBUG
1337 	gnttab_unmap_grant_ref_t *unmop;
1338 #endif /* XNB_DEBUG */
1339 
1340 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1341 	ASSERT(xnbp->xnb_rx_unmop_count > 0);
1342 
1343 	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1344 	    xnbp->xnb_rx_unmop, xnbp->xnb_rx_unmop_count) < 0) {
1345 		cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
1346 		    "unmap grant operation failed, "
1347 		    "%d pages lost", xnbp->xnb_rx_unmop_count);
1348 	}
1349 
1350 #ifdef XNB_DEBUG
1351 	for (loop = 0, unmop = xnbp->xnb_rx_unmop;
1352 	    loop < xnbp->xnb_rx_unmop_count;
1353 	    loop++, unmop++) {
1354 		if (unmop->status != 0) {
1355 			cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
1356 			    "unmap grant reference failed (%d)",
1357 			    unmop->status);
1358 		}
1359 	}
1360 #endif /* XNB_DEBUG */
1361 
1362 	for (loop = 0; loop < xnbp->xnb_rx_unmop_count; loop++) {
1363 		xnb_rxbuf_t	*rxp = xnbp->xnb_rx_unmop_rxp[loop];
1364 
1365 		if (rxp == NULL)
1366 			cmn_err(CE_PANIC,
1367 			    "xnb_rx_perform_pending_unmop: "
1368 			    "unexpected NULL rxp (loop %d; count %d)!",
1369 			    loop, xnbp->xnb_rx_unmop_count);
1370 
1371 		if (xnbp->xnb_connected)
1372 			xnb_rx_mark_complete(xnbp, rxp->xr_id, rxp->xr_status);
1373 		xnb_rxbuf_put(xnbp, rxp);
1374 	}
1375 	if (xnbp->xnb_connected)
1376 		xnb_rx_notify_peer(xnbp);
1377 
1378 	xnbp->xnb_rx_unmop_count = 0;
1379 
1380 #ifdef XNB_DEBUG
1381 	bzero(xnbp->xnb_rx_unmop, sizeof (xnbp->xnb_rx_unmop));
1382 	bzero(xnbp->xnb_rx_unmop_rxp, sizeof (xnbp->xnb_rx_unmop_rxp));
1383 #endif /* XNB_DEBUG */
1384 }
1385 
1386 static xnb_rxbuf_t *
1387 xnb_rxbuf_get(xnb_t *xnbp, int flags)
1388 {
1389 	xnb_rxbuf_t *rxp;
1390 
1391 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1392 
1393 	rxp = kmem_cache_alloc(xnb_rxbuf_cachep, flags);
1394 	if (rxp != NULL) {
1395 		ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
1396 		rxp->xr_flags |= XNB_RXBUF_INUSE;
1397 
1398 		rxp->xr_xnbp = xnbp;
1399 		rxp->xr_mop.dom = xnbp->xnb_peer;
1400 
1401 		rxp->xr_mop.flags = GNTMAP_host_map;
1402 		if (!xnbp->xnb_rx_pages_writable)
1403 			rxp->xr_mop.flags |= GNTMAP_readonly;
1404 
1405 		xnbp->xnb_rx_buf_count++;
1406 	}
1407 
1408 	return (rxp);
1409 }
1410 
1411 static void
1412 xnb_rxbuf_put(xnb_t *xnbp, xnb_rxbuf_t *rxp)
1413 {
1414 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1415 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
1416 
1417 	rxp->xr_flags &= ~XNB_RXBUF_INUSE;
1418 	xnbp->xnb_rx_buf_count--;
1419 
1420 	kmem_cache_free(xnb_rxbuf_cachep, rxp);
1421 }
1422 
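/*
 * Pull packets sent by the peer from the shared ring, map the granted
 * pages and wrap (or copy) the data into mblks.  Returns a b_next
 * chain of packets ready to be handed to the flavour's receive
 * routine.
 */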
1423 static mblk_t *
1424 xnb_recv(xnb_t *xnbp)
1425 {
1426 	RING_IDX start, end, loop;
1427 	gnttab_map_grant_ref_t *mop;
1428 	xnb_rxbuf_t **rxpp;
1429 	netif_tx_request_t *txreq;
1430 	boolean_t work_to_do;
1431 	mblk_t *head, *tail;
1432 	/*
1433 	 * If the peer granted a read-only mapping to the page then we
1434 	 * must copy the data, as the local protocol stack (should the
1435 	 * packet be destined for this host) will modify the packet
1436 	 * 'in place'.
1437 	 */
1438 	boolean_t copy = !xnbp->xnb_rx_pages_writable;
1439 
1440 	/*
1441 	 * For each individual request, the sequence of actions is:
1442 	 *
1443 	 * 1. get the request.
1444 	 * 2. map the page based on the grant ref.
1445 	 * 3. allocate an mblk, copy the data to it.
1446 	 * 4. release the grant.
1447 	 * 5. update the ring.
1448 	 * 6. pass the packet upward.
1449 	 * 7. kick the peer.
1450 	 *
1451 	 * In fact, we try to perform the grant operations in batches,
1452 	 * so there are two loops.
1453 	 */
1454 
1455 	head = tail = NULL;
1456 around:
1457 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
1458 
1459 	/* LINTED: constant in conditional context */
1460 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
1461 	if (!work_to_do) {
1462 finished:
1463 		return (head);
1464 	}
1465 
1466 	start = xnbp->xnb_tx_ring.req_cons;
1467 	end = xnbp->xnb_tx_ring.sring->req_prod;
1468 
1469 	for (loop = start, mop = xnbp->xnb_rx_mop, rxpp = xnbp->xnb_rx_bufp;
1470 	    loop != end;
1471 	    loop++, mop++, rxpp++) {
1472 		xnb_rxbuf_t *rxp;
1473 
1474 		rxp = xnb_rxbuf_get(xnbp, KM_NOSLEEP);
1475 		if (rxp == NULL)
1476 			break;
1477 
1478 		ASSERT(xnbp->xnb_rx_pages_writable ||
1479 		    ((rxp->xr_mop.flags & GNTMAP_readonly)
1480 		    == GNTMAP_readonly));
1481 
1482 		rxp->xr_mop.ref =
1483 		    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop)->gref;
1484 
1485 		ASSERT(rxp->xr_mop.ref < NR_GRANT_ENTRIES);
1486 
1487 		*mop = rxp->xr_mop;
1488 		*rxpp = rxp;
1489 	}
1490 
1491 	if ((loop - start) == 0)
1492 		goto finished;
1493 
1494 	end = loop;
1495 
1496 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1497 	    xnbp->xnb_rx_mop, end - start) != 0) {
1498 
1499 		cmn_err(CE_WARN, "xnb_recv: map grant operation failed");
1500 
1501 		loop = start;
1502 		rxpp = xnbp->xnb_rx_bufp;
1503 
1504 		while (loop != end) {
1505 			xnb_rxbuf_put(xnbp, *rxpp);
1506 
1507 			loop++;
1508 			rxpp++;
1509 		}
1510 
1511 		goto finished;
1512 	}
1513 
1514 	for (loop = start, mop = xnbp->xnb_rx_mop, rxpp = xnbp->xnb_rx_bufp;
1515 	    loop != end;
1516 	    loop++, mop++, rxpp++) {
1517 		mblk_t *mp = NULL;
1518 		int16_t status = NETIF_RSP_OKAY;
1519 		xnb_rxbuf_t *rxp = *rxpp;
1520 
1521 		if (mop->status != 0) {
1522 			cmn_err(CE_WARN, "xnb_recv: "
1523 			    "failed to map buffer: %d",
1524 			    mop->status);
1525 			status = NETIF_RSP_ERROR;
1526 		}
1527 
1528 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1529 
1530 		if (status == NETIF_RSP_OKAY) {
1531 			if (copy) {
1532 				mp = allocb(txreq->size, BPRI_MED);
1533 				if (mp == NULL) {
1534 					status = NETIF_RSP_ERROR;
1535 					xnbp->xnb_stat_rx_allocb_failed++;
1536 				} else {
1537 					bcopy((caddr_t)(uintptr_t)
1538 					    mop->host_addr + txreq->offset,
1539 					    mp->b_wptr, txreq->size);
1540 					mp->b_wptr += txreq->size;
1541 				}
1542 			} else {
1543 				mp = desballoc((uchar_t *)(uintptr_t)
1544 				    mop->host_addr + txreq->offset,
1545 				    txreq->size, 0, &rxp->xr_free_rtn);
1546 				if (mp == NULL) {
1547 					status = NETIF_RSP_ERROR;
1548 					xnbp->xnb_stat_rx_allocb_failed++;
1549 				} else {
1550 					rxp->xr_id = txreq->id;
1551 					rxp->xr_status = status;
1552 					rxp->xr_mop = *mop;
1553 
1554 					mp->b_wptr += txreq->size;
1555 				}
1556 			}
1557 
1558 			/*
1559 			 * If we have a buffer and there are checksum
1560 			 * flags, process them appropriately.
1561 			 */
1562 			if ((mp != NULL) &&
1563 			    ((txreq->flags &
1564 			    (NETTXF_csum_blank | NETTXF_data_validated))
1565 			    != 0)) {
1566 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
1567 				    mp, txreq->flags);
1568 				xnbp->xnb_stat_rx_cksum_no_need++;
1569 			}
1570 		}
1571 
1572 		if (copy || (mp == NULL)) {
1573 			rxp->xr_status = status;
1574 			rxp->xr_id = txreq->id;
1575 			xnb_rx_schedule_unmop(xnbp, mop, rxp);
1576 		}
1577 
1578 		if (mp != NULL) {
1579 			xnbp->xnb_stat_ipackets++;
1580 			xnbp->xnb_stat_rbytes += txreq->size;
1581 
1582 			mp->b_next = NULL;
1583 			if (head == NULL) {
1584 				ASSERT(tail == NULL);
1585 				head = mp;
1586 			} else {
1587 				ASSERT(tail != NULL);
1588 				tail->b_next = mp;
1589 			}
1590 			tail = mp;
1591 		}
1592 	}
1593 
1594 	xnbp->xnb_tx_ring.req_cons = loop;
1595 
1596 	goto around;
1597 	/* NOTREACHED */
1598 }
1599 
1600 /*
1601  *  intr() -- ring interrupt service routine
1602  */
1603 static uint_t
1604 xnb_intr(caddr_t arg)
1605 {
1606 	xnb_t *xnbp = (xnb_t *)arg;
1607 	mblk_t *mp;
1608 
1609 	xnbp->xnb_stat_intr++;
1610 
1611 	mutex_enter(&xnbp->xnb_rx_lock);
1612 
1613 	ASSERT(xnbp->xnb_connected);
1614 
1615 	mp = xnb_recv(xnbp);
1616 
1617 	mutex_exit(&xnbp->xnb_rx_lock);
1618 
1619 	if (!xnbp->xnb_hotplugged) {
1620 		xnbp->xnb_stat_rx_too_early++;
1621 		goto fail;
1622 	}
1623 	if (mp == NULL) {
1624 		xnbp->xnb_stat_spurious_intr++;
1625 		goto fail;
1626 	}
1627 
1628 	xnbp->xnb_flavour->xf_recv(xnbp, mp);
1629 
1630 	return (DDI_INTR_CLAIMED);
1631 
1632 fail:
1633 	freemsgchain(mp);
1634 	return (DDI_INTR_CLAIMED);
1635 }
1636 
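/*
 * Connect to the peer: read its ring references and event channel from
 * xenstore, map the shared TX and RX rings, bind the event channel and
 * wire it to xnb_intr() and, if all of that succeeds, advertise
 * ourselves as Connected.
 */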
1637 static boolean_t
1638 xnb_connect_rings(dev_info_t *dip)
1639 {
1640 	xnb_t *xnbp = ddi_get_driver_private(dip);
1641 	char *oename;
1642 	struct gnttab_map_grant_ref map_op;
1643 	evtchn_port_t evtchn;
1644 	int i;
1645 
1646 	/*
1647 	 * Cannot attempt to connect the rings if already connected.
1648 	 */
1649 	ASSERT(!xnbp->xnb_connected);
1650 
1651 	oename = xvdi_get_oename(dip);
1652 
1653 	if (xenbus_gather(XBT_NULL, oename,
1654 	    "event-channel", "%u", &evtchn,
1655 	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
1656 	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
1657 	    NULL) != 0) {
1658 		cmn_err(CE_WARN, "xnb_connect_rings: "
1659 		    "cannot read other-end details from %s",
1660 		    oename);
1661 		goto fail;
1662 	}
1663 
1664 	if (xenbus_scanf(XBT_NULL, oename,
1665 	    "feature-tx-writable", "%d", &i) != 0)
1666 		i = 0;
1667 	if (i != 0)
1668 		xnbp->xnb_rx_pages_writable = B_TRUE;
1669 
1670 	if (xenbus_scanf(XBT_NULL, oename,
1671 	    "feature-no-csum-offload", "%d", &i) != 0)
1672 		i = 0;
1673 	if ((i == 1) || !xnbp->xnb_cksum_offload)
1674 		xnbp->xnb_cksum_offload = B_FALSE;
1675 
1676 	/* Check whether our peer knows and requests hypervisor copy */
1677 	if (xenbus_scanf(XBT_NULL, oename, "request-rx-copy", "%d", &i)
1678 	    != 0)
1679 		i = 0;
1680 	if (i != 0)
1681 		xnbp->xnb_hv_copy = B_TRUE;
1682 
1683 	/*
1684 	 * 1. allocate a vaddr for the tx page, one for the rx page.
1685 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1686 	 *    into the allocated vaddr (one for tx, one for rx).
1687 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1688 	 *    bound to this domain.
1689 	 * 4. associate the event channel with an interrupt.
1690 	 * 5. declare ourselves connected.
1691 	 * 6. enable the interrupt.
1692 	 */
1693 
1694 	/* 1.tx */
1695 	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1696 	    0, 0, 0, 0, VM_SLEEP);
1697 	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
1698 
1699 	/* 2.tx */
1700 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
1701 	map_op.flags = GNTMAP_host_map;
1702 	map_op.ref = xnbp->xnb_tx_ring_ref;
1703 	map_op.dom = xnbp->xnb_peer;
1704 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1705 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1706 	    &map_op, 1) != 0 || map_op.status != 0) {
1707 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1708 		goto fail;
1709 	}
1710 	xnbp->xnb_tx_ring_handle = map_op.handle;
1711 
1712 	/* LINTED: constant in conditional context */
1713 	BACK_RING_INIT(&xnbp->xnb_tx_ring,
1714 	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1715 
1716 	/* 1.rx */
1717 	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1718 	    0, 0, 0, 0, VM_SLEEP);
1719 	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
1720 
1721 	/* 2.rx */
1722 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
1723 	map_op.flags = GNTMAP_host_map;
1724 	map_op.ref = xnbp->xnb_rx_ring_ref;
1725 	map_op.dom = xnbp->xnb_peer;
1726 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1727 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1728 	    &map_op, 1) != 0 || map_op.status != 0) {
1729 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1730 		goto fail;
1731 	}
1732 	xnbp->xnb_rx_ring_handle = map_op.handle;
1733 
1734 	/* LINTED: constant in conditional context */
1735 	BACK_RING_INIT(&xnbp->xnb_rx_ring,
1736 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
1737 
1738 	/* 3 */
1739 	if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS) {
1740 		cmn_err(CE_WARN, "xnb_connect_rings: "
1741 		    "cannot bind event channel %d", evtchn);
1742 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1743 		goto fail;
1744 	}
1745 	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
1746 
1747 	/*
1748 	 * It would be good to set the state to XenbusStateConnected
1749 	 * here as well, but then what if ddi_add_intr() failed?
1750 	 * Changing the state in the store will be noticed by the peer
1751 	 * and cannot be "taken back".
1752 	 */
1753 	mutex_enter(&xnbp->xnb_tx_lock);
1754 	mutex_enter(&xnbp->xnb_rx_lock);
1755 
1756 	/* 5.1 */
1757 	xnbp->xnb_connected = B_TRUE;
1758 
1759 	mutex_exit(&xnbp->xnb_rx_lock);
1760 	mutex_exit(&xnbp->xnb_tx_lock);
1761 
1762 	/* 4, 6 */
1763 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1764 	    != DDI_SUCCESS) {
1765 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1766 		goto fail;
1767 	}
1768 	xnbp->xnb_irq = B_TRUE;
1769 
1770 	/* 5.2 */
1771 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1772 
1773 	return (B_TRUE);
1774 
1775 fail:
1776 	mutex_enter(&xnbp->xnb_tx_lock);
1777 	mutex_enter(&xnbp->xnb_rx_lock);
1778 
1779 	xnbp->xnb_connected = B_FALSE;
1780 	mutex_exit(&xnbp->xnb_rx_lock);
1781 	mutex_exit(&xnbp->xnb_tx_lock);
1782 
1783 	return (B_FALSE);
1784 }
1785 
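/*
 * Undo the work of xnb_connect_rings(): remove the interrupt, flush
 * any pending unmap operations, release the event channel and unmap
 * and free both ring pages.
 */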
1786 static void
1787 xnb_disconnect_rings(dev_info_t *dip)
1788 {
1789 	xnb_t *xnbp = ddi_get_driver_private(dip);
1790 
1791 	if (xnbp->xnb_irq) {
1792 		ddi_remove_intr(dip, 0, NULL);
1793 		xnbp->xnb_irq = B_FALSE;
1794 	}
1795 
1796 	if (xnbp->xnb_rx_unmop_count > 0)
1797 		xnb_rx_perform_pending_unmop(xnbp);
1798 
1799 	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
1800 		xvdi_free_evtchn(dip);
1801 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1802 	}
1803 
1804 	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
1805 		struct gnttab_unmap_grant_ref unmap_op;
1806 
1807 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1808 		    xnbp->xnb_rx_ring_addr;
1809 		unmap_op.dev_bus_addr = 0;
1810 		unmap_op.handle = xnbp->xnb_rx_ring_handle;
1811 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1812 		    &unmap_op, 1) != 0)
1813 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1814 			    "cannot unmap rx-ring page (%d)",
1815 			    unmap_op.status);
1816 
1817 		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
1818 	}
1819 
1820 	if (xnbp->xnb_rx_ring_addr != NULL) {
1821 		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1822 		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
1823 		xnbp->xnb_rx_ring_addr = NULL;
1824 	}
1825 
1826 	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
1827 		struct gnttab_unmap_grant_ref unmap_op;
1828 
1829 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1830 		    xnbp->xnb_tx_ring_addr;
1831 		unmap_op.dev_bus_addr = 0;
1832 		unmap_op.handle = xnbp->xnb_tx_ring_handle;
1833 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1834 		    &unmap_op, 1) != 0)
1835 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1836 			    "cannot unmap tx-ring page (%d)",
1837 			    unmap_op.status);
1838 
1839 		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
1840 	}
1841 
1842 	if (xnbp->xnb_tx_ring_addr != NULL) {
1843 		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1844 		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
1845 		xnbp->xnb_tx_ring_addr = NULL;
1846 	}
1847 }
1848 
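/*
 * Respond to a change in the state of the peer ("other end") driver as
 * seen in xenstore: connect the rings when it reports Connected and
 * tear everything down when it closes.
 */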
1849 /*ARGSUSED*/
1850 static void
1851 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1852     void *arg, void *impl_data)
1853 {
1854 	xnb_t *xnbp = ddi_get_driver_private(dip);
1855 	XenbusState new_state = *(XenbusState *)impl_data;
1856 
1857 	ASSERT(xnbp != NULL);
1858 
1859 	switch (new_state) {
1860 	case XenbusStateConnected:
1861 		if (xnb_connect_rings(dip)) {
1862 			xnbp->xnb_flavour->xf_peer_connected(xnbp);
1863 		} else {
1864 			xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1865 			xnb_disconnect_rings(dip);
1866 			(void) xvdi_switch_state(dip, XBT_NULL,
1867 			    XenbusStateClosed);
1868 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1869 		}
1870 
1871 		/*
1872 		 * Now that we've attempted to connect it's reasonable
1873 		 * to allow an attempt to detach.
1874 		 */
1875 		xnbp->xnb_detachable = B_TRUE;
1876 
1877 		break;
1878 
1879 	case XenbusStateClosing:
1880 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1881 
1882 		break;
1883 
1884 	case XenbusStateClosed:
1885 		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1886 
1887 		mutex_enter(&xnbp->xnb_tx_lock);
1888 		mutex_enter(&xnbp->xnb_rx_lock);
1889 
1890 		xnb_disconnect_rings(dip);
1891 		xnbp->xnb_connected = B_FALSE;
1892 
1893 		mutex_exit(&xnbp->xnb_rx_lock);
1894 		mutex_exit(&xnbp->xnb_tx_lock);
1895 
1896 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1897 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1898 		/*
1899 		 * In all likelihood this is already set (in the above
1900 		 * case), but if the peer never attempted to connect
1901 		 * and the domain is destroyed we get here without
1902 		 * having been through the case above, so we set it to
1903 		 * be sure.
1904 		 */
1905 		xnbp->xnb_detachable = B_TRUE;
1906 
1907 		break;
1908 
1909 	default:
1910 		break;
1911 	}
1912 }
1913 
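/*
 * Respond to completion of the hotplug scripts: record whether the
 * flavour's hotplug processing succeeded.  Packets are not exchanged
 * with the peer until we are both connected and hotplugged.
 */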
1914 /*ARGSUSED*/
1915 static void
1916 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1917     void *arg, void *impl_data)
1918 {
1919 	xnb_t *xnbp = ddi_get_driver_private(dip);
1920 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
1921 	boolean_t success;
1922 
1923 	ASSERT(xnbp != NULL);
1924 
1925 	switch (state) {
1926 	case Connected:
1927 
1928 		success = xnbp->xnb_flavour->xf_hotplug_connected(xnbp);
1929 
1930 		mutex_enter(&xnbp->xnb_tx_lock);
1931 		mutex_enter(&xnbp->xnb_rx_lock);
1932 
1933 		xnbp->xnb_hotplugged = success;
1934 
1935 		mutex_exit(&xnbp->xnb_rx_lock);
1936 		mutex_exit(&xnbp->xnb_tx_lock);
1937 		break;
1938 
1939 	default:
1940 		break;
1941 	}
1942 }
1943 
1944 static struct modldrv modldrv = {
1945 	&mod_miscops, "xnb module %I%",
1946 };
1947 
1948 static struct modlinkage modlinkage = {
1949 	MODREV_1, &modldrv, NULL
1950 };
1951 
1952 int
1953 _init(void)
1954 {
1955 	int i;
1956 
1957 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
1958 
1959 	xnb_rxbuf_cachep = kmem_cache_create("xnb_rxbuf_cachep",
1960 	    sizeof (xnb_rxbuf_t), 0, xnb_rxbuf_constructor,
1961 	    xnb_rxbuf_destructor, NULL, NULL, NULL, 0);
1962 	ASSERT(xnb_rxbuf_cachep != NULL);
1963 
1964 	i = mod_install(&modlinkage);
1965 	if (i != DDI_SUCCESS) {
1966 		kmem_cache_destroy(xnb_rxbuf_cachep);
1967 		mutex_destroy(&xnb_alloc_page_lock);
1968 	}
1969 	return (i);
1970 }
1971 
1972 int
1973 _info(struct modinfo *modinfop)
1974 {
1975 	return (mod_info(&modlinkage, modinfop));
1976 }
1977 
1978 int
1979 _fini(void)
1980 {
1981 	int i;
1982 
1983 	i = mod_remove(&modlinkage);
1984 	if (i == DDI_SUCCESS) {
1985 		kmem_cache_destroy(xnb_rxbuf_cachep);
1986 		mutex_destroy(&xnb_alloc_page_lock);
1987 	}
1988 	return (i);
1989 }
1990