xref: /titanic_44/usr/src/uts/common/xen/io/xnb.c (revision f3324781c875e2f9865c291e43f86ee710b0c145)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #ifdef DEBUG
30 #define	XNB_DEBUG 1
31 #endif /* DEBUG */
32 
33 #include "xnb.h"
34 
35 #include <sys/sunddi.h>
36 #include <sys/sunndi.h>
37 #include <sys/modctl.h>
38 #include <sys/conf.h>
39 #include <sys/mac.h>
40 #include <sys/dlpi.h>
41 #include <sys/strsubr.h>
42 #include <sys/strsun.h>
43 #include <sys/pattr.h>
44 #include <vm/seg_kmem.h>
45 #include <vm/hat_i86.h>
46 #include <xen/sys/xenbus_impl.h>
47 #include <xen/sys/xendev.h>
48 #include <sys/balloon_impl.h>
49 #include <sys/evtchn_impl.h>
50 #include <sys/gnttab.h>
51 
52 #include <sys/gld.h>
53 #include <inet/ip.h>
54 #include <inet/ip_impl.h>
55 #include <sys/vnic_impl.h> /* blech. */
56 
57 /*
58  * The terms "transmit" and "receive" are used in their traditional
59  * sense here - packets from other parts of this system are
60  * "transmitted" to the peer domain and those originating from the
61  * peer are "received".
62  *
63  * In some cases this can be confusing, because various data
64  * structures are shared with the domU driver, which has the opposite
65  * view of what constitutes "transmit" and "receive".  In naming the
66  * shared structures the domU driver always wins.
67  */
68 
69 /*
70  * XXPV dme: things to do, as well as various things indicated
71  * throughout the source:
72  * - copy avoidance outbound.
73  * - copy avoidance inbound.
74  * - transfer credit limiting.
75  * - MAC address based filtering.
76  */
77 
78 /*
79  * Linux expects to have some headroom in received buffers.  The Linux
80  * frontend driver (netfront) checks to see if the headroom is
81  * available and will re-allocate the buffer to make room if
82  * necessary.  To avoid this we add TX_BUFFER_HEADROOM bytes of
83  * headroom to each packet we pass to the peer.
84  */
85 #define	TX_BUFFER_HEADROOM	16
86 
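/*
 * Tunable: when B_TRUE, advertise checksum offload to the peer by
 * writing "feature-no-csum-offload" as 0 in xenbus (see xnb_attach());
 * each instance's x_cksum_offload flag is seeded from this value.
 */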
87 static boolean_t	xnb_cksum_offload = B_TRUE;
88 
89 static boolean_t	xnb_connect_rings(dev_info_t *);
90 static void		xnb_disconnect_rings(dev_info_t *);
91 static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
92     void *, void *);
93 static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
94     void *, void *);
95 
96 static int	xnb_rxbuf_constructor(void *, void *, int);
97 static void	xnb_rxbuf_destructor(void *, void *);
98 static xnb_rxbuf_t *xnb_rxbuf_get(xnb_t *, int);
99 static void	xnb_rxbuf_put(xnb_t *, xnb_rxbuf_t *);
100 static void	xnb_rx_notify_peer(xnb_t *);
101 static void	xnb_rx_complete(xnb_rxbuf_t *);
102 static void	xnb_rx_mark_complete(xnb_t *, RING_IDX, int16_t);
103 static void	xnb_rx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *);
104 static void	xnb_rx_perform_pending_unmop(xnb_t *);
105 
106 #ifdef XNB_DEBUG
107 #define	NR_GRANT_ENTRIES \
108 	(NR_GRANT_FRAMES * PAGESIZE / sizeof (grant_entry_t))
109 #endif /* XNB_DEBUG */
110 
111 /* XXPV dme: are these really invalid? */
112 #define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
113 #define	INVALID_GRANT_REF	((grant_ref_t)-1)
114 
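/*
 * xnb_rxbuf_cachep caches receive buffer descriptors (see
 * xnb_rxbuf_constructor() below).  xnb_alloc_page_lock serialises
 * access to the static batch of ballooned pages in xnb_alloc_page(),
 * which is shared across all instances.
 */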
115 static kmem_cache_t *xnb_rxbuf_cachep;
116 static kmutex_t	xnb_alloc_page_lock;
117 
118 /*
119  * Statistics.
120  */
121 static char *aux_statistics[] = {
122 	"tx_cksum_deferred",
123 	"rx_cksum_no_need",
124 	"tx_notify_deferred",
125 	"tx_notify_sent",
126 	"rx_notify_deferred",
127 	"rx_notify_sent",
128 	"tx_too_early",
129 	"rx_too_early",
130 	"rx_allocb_failed",
131 	"mac_full",
132 	"spurious_intr",
133 	"allocation_success",
134 	"allocation_failure",
135 	"small_allocation_success",
136 	"small_allocation_failure",
137 	"csum_hardware",
138 	"csum_software",
139 };
140 
141 static int
142 xnb_ks_aux_update(kstat_t *ksp, int flag)
143 {
144 	xnb_t *xnbp;
145 	kstat_named_t *knp;
146 
147 	if (flag != KSTAT_READ)
148 		return (EACCES);
149 
150 	xnbp = ksp->ks_private;
151 	knp = ksp->ks_data;
152 
153 	/*
154 	 * Assignment order should match that of the names in
155 	 * aux_statistics.
156 	 */
157 	(knp++)->value.ui64 = xnbp->x_stat_tx_cksum_deferred;
158 	(knp++)->value.ui64 = xnbp->x_stat_rx_cksum_no_need;
159 	(knp++)->value.ui64 = xnbp->x_stat_tx_notify_deferred;
160 	(knp++)->value.ui64 = xnbp->x_stat_tx_notify_sent;
161 	(knp++)->value.ui64 = xnbp->x_stat_rx_notify_deferred;
162 	(knp++)->value.ui64 = xnbp->x_stat_rx_notify_sent;
163 	(knp++)->value.ui64 = xnbp->x_stat_tx_too_early;
164 	(knp++)->value.ui64 = xnbp->x_stat_rx_too_early;
165 	(knp++)->value.ui64 = xnbp->x_stat_rx_allocb_failed;
166 	(knp++)->value.ui64 = xnbp->x_stat_mac_full;
167 	(knp++)->value.ui64 = xnbp->x_stat_spurious_intr;
168 	(knp++)->value.ui64 = xnbp->x_stat_allocation_success;
169 	(knp++)->value.ui64 = xnbp->x_stat_allocation_failure;
170 	(knp++)->value.ui64 = xnbp->x_stat_small_allocation_success;
171 	(knp++)->value.ui64 = xnbp->x_stat_small_allocation_failure;
172 	(knp++)->value.ui64 = xnbp->x_stat_csum_hardware;
173 	(knp++)->value.ui64 = xnbp->x_stat_csum_software;
174 
175 	return (0);
176 }
177 
178 static boolean_t
179 xnb_ks_init(xnb_t *xnbp)
180 {
181 	int nstat = sizeof (aux_statistics) /
182 	    sizeof (aux_statistics[0]);
183 	char **cp = aux_statistics;
184 	kstat_named_t *knp;
185 
186 	/*
187 	 * Create and initialise kstats.
188 	 */
189 	xnbp->x_kstat_aux = kstat_create(ddi_driver_name(xnbp->x_devinfo),
190 	    ddi_get_instance(xnbp->x_devinfo), "aux_statistics", "net",
191 	    KSTAT_TYPE_NAMED, nstat, 0);
192 	if (xnbp->x_kstat_aux == NULL)
193 		return (B_FALSE);
194 
195 	xnbp->x_kstat_aux->ks_private = xnbp;
196 	xnbp->x_kstat_aux->ks_update = xnb_ks_aux_update;
197 
198 	knp = xnbp->x_kstat_aux->ks_data;
199 	while (nstat > 0) {
200 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
201 
202 		knp++;
203 		cp++;
204 		nstat--;
205 	}
206 
207 	kstat_install(xnbp->x_kstat_aux);
208 
209 	return (B_TRUE);
210 }
211 
212 static void
213 xnb_ks_free(xnb_t *xnbp)
214 {
215 	kstat_delete(xnbp->x_kstat_aux);
216 }
217 
218 /*
219  * Software checksum calculation and insertion for an arbitrary packet.
220  */
221 /*ARGSUSED*/
222 static mblk_t *
223 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
224 {
225 	/*
226 	 * XXPV dme: shouldn't rely on vnic_fix_cksum(), not least
227 	 * because it doesn't cover all of the interesting cases :-(
228 	 */
229 	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
230 	    HCK_FULLCKSUM, KM_NOSLEEP);
231 
232 	return (vnic_fix_cksum(mp));
233 }
234 
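/*
 * Resolve the checksum requirements of an outbound packet against the
 * MAC's hardware capabilities (capab): single-mblk TCP/UDP over IPv4
 * packets are tagged for full hardware checksum where available;
 * everything else falls back to xnb_software_csum().
 */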
235 mblk_t *
236 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
237 {
238 	struct ether_header *ehp;
239 	uint16_t sap;
240 	uint32_t offset;
241 	ipha_t *ipha;
242 
243 	ASSERT(mp->b_next == NULL);
244 
245 	/*
246 	 * Check that the packet is contained in a single mblk.  In
247 	 * the "from peer" path this is true today, but will change
248 	 * when scatter gather support is added.  In the "to peer"
249 	 * path we cannot be sure, but in most cases it will be true
250 	 * (in the xnbo case the packet has come from a MAC device
251 	 * which is unlikely to split packets).
252 	 */
253 	if (mp->b_cont != NULL)
254 		goto software;
255 
256 	/*
257 	 * If the MAC has no hardware capability don't do any further
258 	 * checking.
259 	 */
260 	if (capab == 0)
261 		goto software;
262 
263 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
264 	ehp = (struct ether_header *)mp->b_rptr;
265 
266 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
267 		struct ether_vlan_header *evhp;
268 
269 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
270 		evhp = (struct ether_vlan_header *)mp->b_rptr;
271 		sap = ntohs(evhp->ether_type);
272 		offset = sizeof (struct ether_vlan_header);
273 	} else {
274 		sap = ntohs(ehp->ether_type);
275 		offset = sizeof (struct ether_header);
276 	}
277 
278 	/*
279 	 * We only attempt to do IPv4 packets in hardware.
280 	 */
281 	if (sap != ETHERTYPE_IP)
282 		goto software;
283 
284 	/*
285 	 * We know that this is an IPv4 packet.
286 	 */
287 	ipha = (ipha_t *)(mp->b_rptr + offset);
288 
289 	switch (ipha->ipha_protocol) {
290 	case IPPROTO_TCP:
291 	case IPPROTO_UDP:
292 		/*
293 		 * This is a TCP/IPv4 or UDP/IPv4 packet.
294 		 *
295 		 * If the capabilities indicate that full checksum
296 		 * offload is available, use it.
297 		 */
298 		if ((capab & HCKSUM_INET_FULL_V4) != 0) {
299 			(void) hcksum_assoc(mp, NULL, NULL,
300 			    0, 0, 0, 0,
301 			    HCK_FULLCKSUM, KM_NOSLEEP);
302 
303 			xnbp->x_stat_csum_hardware++;
304 
305 			return (mp);
306 		}
307 
308 		/*
309 		 * XXPV dme: If the capabilities indicate that partial
310 		 * checksum offload is available, we should use it.
311 		 */
312 
313 		break;
314 
315 	default:
316 		/* Use software. */
317 		break;
318 	}
319 
320 software:
321 	/*
322 	 * We are not able to use any offload so do the whole thing in
323 	 * software.
324 	 */
325 	xnbp->x_stat_csum_software++;
326 
327 	return (xnb_software_csum(xnbp, mp));
328 }
329 
330 int
331 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
332 {
333 	xnb_t *xnbp;
334 	char *xsname, mac[ETHERADDRL * 3];
335 
336 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
337 
338 	xnbp->x_flavour = flavour;
339 	xnbp->x_flavour_data = flavour_data;
340 	xnbp->x_devinfo = dip;
341 	xnbp->x_evtchn = INVALID_EVTCHN;
342 	xnbp->x_irq = B_FALSE;
343 	xnbp->x_tx_ring_handle = INVALID_GRANT_HANDLE;
344 	xnbp->x_rx_ring_handle = INVALID_GRANT_HANDLE;
345 	xnbp->x_cksum_offload = xnb_cksum_offload;
346 	xnbp->x_connected = B_FALSE;
347 	xnbp->x_hotplugged = B_FALSE;
348 	xnbp->x_detachable = B_FALSE;
349 	xnbp->x_peer = xvdi_get_oeid(dip);
350 	xnbp->x_rx_pages_writable = B_FALSE;
351 
352 	xnbp->x_rx_buf_count = 0;
353 	xnbp->x_rx_unmop_count = 0;
354 
355 	xnbp->x_tx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
356 	ASSERT(xnbp->x_tx_va != NULL);
357 
358 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->x_icookie)
359 	    != DDI_SUCCESS)
360 		goto failure;
361 
362 	mutex_init(&xnbp->x_tx_lock, NULL, MUTEX_DRIVER, xnbp->x_icookie);
363 	mutex_init(&xnbp->x_rx_lock, NULL, MUTEX_DRIVER, xnbp->x_icookie);
364 
365 	/* set driver private pointer now */
366 	ddi_set_driver_private(dip, xnbp);
367 
368 	if (!xnb_ks_init(xnbp))
369 		goto late_failure;
370 
371 	/*
372 	 * Receive notification of changes in the state of the
373 	 * driver in the guest domain.
374 	 */
375 	if (xvdi_add_event_handler(dip, XS_OE_STATE,
376 	    xnb_oe_state_change) != DDI_SUCCESS)
377 		goto very_late_failure;
378 
379 	/*
380 	 * Receive notification of hotplug events.
381 	 */
382 	if (xvdi_add_event_handler(dip, XS_HP_STATE,
383 	    xnb_hp_state_change) != DDI_SUCCESS)
384 		goto very_late_failure;
385 
386 	xsname = xvdi_get_xsname(dip);
387 
388 	if (xenbus_printf(XBT_NULL, xsname,
389 	    "feature-no-csum-offload", "%d",
390 	    xnbp->x_cksum_offload ? 0 : 1) != 0)
391 		goto very_very_late_failure;
392 
393 	if (xenbus_scanf(XBT_NULL, xsname,
394 	    "mac", "%s", mac) != 0) {
395 		cmn_err(CE_WARN, "xnb_attach: "
396 		    "cannot read mac address from %s",
397 		    xsname);
398 		goto very_very_late_failure;
399 	}
400 
401 	if (ether_aton(mac, xnbp->x_mac_addr) != ETHERADDRL) {
402 		cmn_err(CE_WARN,
403 		    "xnb_attach: cannot parse mac address %s",
404 		    mac);
405 		goto very_very_late_failure;
406 	}
407 
408 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
409 	(void) xvdi_post_event(dip, XEN_HP_ADD);
410 
411 	return (DDI_SUCCESS);
412 
413 very_very_late_failure: /* not that the naming is getting silly or anything */
414 	xvdi_remove_event_handler(dip, NULL);
415 
416 very_late_failure:
417 	xnb_ks_free(xnbp);
418 
419 late_failure:
420 	mutex_destroy(&xnbp->x_rx_lock);
421 	mutex_destroy(&xnbp->x_tx_lock);
422 
423 failure:
424 	vmem_free(heap_arena, xnbp->x_tx_va, PAGESIZE);
425 	kmem_free(xnbp, sizeof (*xnbp));
426 	return (DDI_FAILURE);
427 }
428 
429 /*ARGSUSED*/
430 void
431 xnb_detach(dev_info_t *dip)
432 {
433 	xnb_t *xnbp = ddi_get_driver_private(dip);
434 
435 	ASSERT(xnbp != NULL);
436 	ASSERT(!xnbp->x_connected);
437 	ASSERT(xnbp->x_rx_buf_count == 0);
438 
439 	xnb_disconnect_rings(dip);
440 
441 	xvdi_remove_event_handler(dip, NULL);
442 
443 	xnb_ks_free(xnbp);
444 
445 	ddi_set_driver_private(dip, NULL);
446 
447 	mutex_destroy(&xnbp->x_tx_lock);
448 	mutex_destroy(&xnbp->x_rx_lock);
449 
450 	ASSERT(xnbp->x_tx_va != NULL);
451 	vmem_free(heap_arena, xnbp->x_tx_va, PAGESIZE);
452 
453 	kmem_free(xnbp, sizeof (*xnbp));
454 }
455 
456 
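/*
 * Allocate a page from the hypervisor balloon for transfer to the
 * peer.  Pages are taken in batches of BATCH_SIZE to amortise the
 * cost of the balloon hypercall; warnings when the single-page
 * fallback also fails are rate limited (one in WARNING_RATE_LIMIT).
 */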
457 static mfn_t
458 xnb_alloc_page(xnb_t *xnbp)
459 {
460 #define	WARNING_RATE_LIMIT 100
461 #define	BATCH_SIZE 256
462 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
463 	static int nth = BATCH_SIZE;
464 	mfn_t mfn;
465 
466 	mutex_enter(&xnb_alloc_page_lock);
467 	if (nth == BATCH_SIZE) {
468 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
469 			xnbp->x_stat_allocation_failure++;
470 			mutex_exit(&xnb_alloc_page_lock);
471 
472 			/*
473 			 * Try for a single page in low memory situations.
474 			 */
475 			if (balloon_alloc_pages(1, &mfn) != 1) {
476 				xnbp->x_stat_small_allocation_failure++;
477 				if ((xnbp->x_stat_small_allocation_failure
478 				    % WARNING_RATE_LIMIT) == 0) {
479 					cmn_err(CE_WARN, "xnb_alloc_page: "
480 					    "Cannot allocate memory to "
481 					    "transfer packets to peer.");
482 				}
483 				return (0);
484 			} else {
485 				xnbp->x_stat_small_allocation_success++;
486 				return (mfn);
487 			}
488 		}
489 
490 		nth = 0;
491 		xnbp->x_stat_allocation_success++;
492 	}
493 
494 	mfn = mfns[nth++];
495 	mutex_exit(&xnb_alloc_page_lock);
496 
497 	ASSERT(mfn != 0);
498 
499 	return (mfn);
500 #undef BATCH_SIZE
501 #undef WARNING_RATE_LIMIT
502 }
503 
504 /*ARGSUSED*/
505 static void
506 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
507 {
508 	int r;
509 
510 	/*
511 	 * This happens only in the error path, so batching is
512 	 * not worth the complication.
513 	 */
514 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
515 		cmn_err(CE_WARN, "xnb_free_page: cannot decrease memory "
516 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
517 		    r, mfn);
518 	}
519 }
520 
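/*
 * Pass a chain of packets to the peer using page transfer, returning
 * any packets that could not be consumed; presumably called from the
 * flavour-specific transmit path (the callers are outside this file).
 */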
521 mblk_t *
522 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
523 {
524 	mblk_t *free = mp, *prev = NULL;
525 	size_t len;
526 	gnttab_transfer_t *gop;
527 	boolean_t notify;
528 	RING_IDX loop, prod, end;
529 
530 	/*
531 	 * For each packet the sequence of operations is:
532 	 *
533 	 * 1. get a new page from the hypervisor.
534 	 * 2. get a request slot from the ring.
535 	 * 3. copy the data into the new page.
536 	 * 4. transfer the page to the peer.
537 	 * 5. update the request slot.
538 	 * 6. kick the peer.
539 	 * 7. free mp.
540 	 *
541 	 * In order to reduce the number of hypercalls, we prepare
542 	 * several packets for the peer and perform a single hypercall
543 	 * to transfer them.
544 	 */
545 
546 	mutex_enter(&xnbp->x_tx_lock);
547 
548 	/*
549 	 * If we are not connected to the peer or have not yet
550 	 * finished hotplug it is too early to pass packets to the
551 	 * peer.
552 	 */
553 	if (!(xnbp->x_connected && xnbp->x_hotplugged)) {
554 		mutex_exit(&xnbp->x_tx_lock);
555 		xnbp->x_stat_tx_too_early++;
556 		return (mp);
557 	}
558 
559 	loop = xnbp->x_rx_ring.req_cons;
560 	prod = xnbp->x_rx_ring.rsp_prod_pvt;
561 	gop = xnbp->x_tx_top;
562 
563 	/*
564 	 * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->x_rx_ring) but
565 	 * using local variables.
566 	 */
567 #define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
568 	((((_r)->sring->req_prod - loop) <		\
569 		(RING_SIZE(_r) - (loop - prod))) ?	\
570 	    ((_r)->sring->req_prod - loop) :		\
571 	    (RING_SIZE(_r) - (loop - prod)))
572 
573 	while ((mp != NULL) &&
574 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->x_rx_ring)) {
575 
576 		mfn_t mfn;
577 		pfn_t pfn;
578 		netif_rx_request_t *rxreq;
579 		netif_rx_response_t *rxresp;
580 		char *valoop;
581 		size_t offset;
582 		mblk_t *ml;
583 		uint16_t cksum_flags;
584 
585 		/* 1 */
586 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
587 			xnbp->x_stat_xmit_defer++;
588 			break;
589 		}
590 
591 		/* 2 */
592 		rxreq = RING_GET_REQUEST(&xnbp->x_rx_ring, loop);
593 
594 #ifdef XNB_DEBUG
595 		if (!(rxreq->id < NET_RX_RING_SIZE))
596 			cmn_err(CE_PANIC, "xnb_to_peer: "
597 			    "id %d out of range in request 0x%p",
598 			    rxreq->id, (void *)rxreq);
599 		if (rxreq->gref >= NR_GRANT_ENTRIES)
600 			cmn_err(CE_PANIC, "xnb_to_peer: "
601 			    "grant ref %d out of range in request 0x%p",
602 			    rxreq->gref, (void *)rxreq);
603 #endif /* XNB_DEBUG */
604 
605 		/* Assign a pfn and map the new page at the allocated va. */
606 		pfn = xen_assign_pfn(mfn);
607 		hat_devload(kas.a_hat, xnbp->x_tx_va, PAGESIZE,
608 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
609 
610 		offset = TX_BUFFER_HEADROOM;
611 
612 		/* 3 */
613 		len = 0;
614 		valoop = xnbp->x_tx_va + offset;
615 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
616 			size_t chunk = ml->b_wptr - ml->b_rptr;
617 
618 			bcopy(ml->b_rptr, valoop, chunk);
619 			valoop += chunk;
620 			len += chunk;
621 		}
622 
623 		ASSERT(len + offset < PAGESIZE);
624 
625 		/* Release the pfn. */
626 		hat_unload(kas.a_hat, xnbp->x_tx_va, PAGESIZE,
627 		    HAT_UNLOAD_UNMAP);
628 		xen_release_pfn(pfn);
629 
630 		/* 4 */
631 		gop->mfn = mfn;
632 		gop->domid = xnbp->x_peer;
633 		gop->ref = rxreq->gref;
634 
635 		/* 5.1 */
636 		rxresp = RING_GET_RESPONSE(&xnbp->x_rx_ring, prod);
637 		rxresp->offset = offset;
638 		rxresp->flags = 0;
639 
640 		cksum_flags = xnbp->x_flavour->xf_cksum_to_peer(xnbp, mp);
641 		if (cksum_flags != 0)
642 			xnbp->x_stat_tx_cksum_deferred++;
643 		rxresp->flags |= cksum_flags;
644 
645 		rxresp->id = RING_GET_REQUEST(&xnbp->x_rx_ring, prod)->id;
646 		rxresp->status = len;
647 
648 		loop++;
649 		prod++;
650 		gop++;
651 		prev = mp;
652 		mp = mp->b_next;
653 	}
654 
655 	/*
656 	 * Did we actually do anything?
657 	 */
658 	if (loop == xnbp->x_rx_ring.req_cons) {
659 		mutex_exit(&xnbp->x_tx_lock);
660 		return (mp);
661 	}
662 
663 	end = loop;
664 
665 	/*
666 	 * Unlink the end of the 'done' list from the remainder.
667 	 */
668 	ASSERT(prev != NULL);
669 	prev->b_next = NULL;
670 
671 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->x_tx_top,
672 	    loop - xnbp->x_rx_ring.req_cons) != 0) {
673 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
674 	}
675 
676 	loop = xnbp->x_rx_ring.req_cons;
677 	prod = xnbp->x_rx_ring.rsp_prod_pvt;
678 	gop = xnbp->x_tx_top;
679 
680 	while (loop < end) {
681 		int16_t status = NETIF_RSP_OKAY;
682 
683 		if (gop->status != 0) {
684 			status = NETIF_RSP_ERROR;
685 
686 			/*
687 			 * If the status is anything other than
688 			 * GNTST_bad_page then we don't own the page
689 			 * any more, so don't try to give it back.
690 			 */
691 			if (gop->status != GNTST_bad_page)
692 				gop->mfn = 0;
693 		} else {
694 			/* The page is no longer ours. */
695 			gop->mfn = 0;
696 		}
697 
698 		if (gop->mfn != 0)
699 			/*
700 			 * Give back the page, as we won't be using
701 			 * it.
702 			 */
703 			xnb_free_page(xnbp, gop->mfn);
704 		else
705 			/*
706 			 * We gave away a page, update our accounting
707 			 * now.
708 			 */
709 			balloon_drv_subtracted(1);
710 
711 		/* 5.2 */
712 		if (status != NETIF_RSP_OKAY) {
713 			RING_GET_RESPONSE(&xnbp->x_rx_ring, prod)->status =
714 			    status;
715 		} else {
716 			xnbp->x_stat_opackets++;
			/*
			 * The per-packet length was recorded in the
			 * response status by the loop above; 'len' here
			 * holds only the last packet's length.
			 */
717 			xnbp->x_stat_obytes +=
			    RING_GET_RESPONSE(&xnbp->x_rx_ring, prod)->status;
718 		}
719 
720 		loop++;
721 		prod++;
722 		gop++;
723 	}
724 
725 	xnbp->x_rx_ring.req_cons = loop;
726 	xnbp->x_rx_ring.rsp_prod_pvt = prod;
727 
728 	/* 6 */
729 	/*LINTED: constant in conditional context*/
730 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->x_rx_ring, notify);
731 	if (notify) {
732 		ec_notify_via_evtchn(xnbp->x_evtchn);
733 		xnbp->x_stat_tx_notify_sent++;
734 	} else {
735 		xnbp->x_stat_tx_notify_deferred++;
736 	}
737 
738 	if (mp != NULL)
739 		xnbp->x_stat_xmit_defer++;
740 
741 	mutex_exit(&xnbp->x_tx_lock);
742 
743 	/* Free mblk_t's that we consumed. */
744 	freemsgchain(free);
745 
746 	return (mp);
747 }
748 
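/*
 * kmem cache constructor for receive buffers: reserve a page of kernel
 * virtual address space and have the hat prepare a mapping for it, so
 * that a page granted by the peer can later be mapped at this address.
 */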
749 /*ARGSUSED*/
750 static int
751 xnb_rxbuf_constructor(void *buf, void *arg, int kmflag)
752 {
753 	xnb_rxbuf_t *rxp = buf;
754 
755 	bzero(rxp, sizeof (*rxp));
756 
757 	rxp->xr_free_rtn.free_func = xnb_rx_complete;
758 	rxp->xr_free_rtn.free_arg = (caddr_t)rxp;
759 
760 	rxp->xr_mop.host_addr =
761 	    (uint64_t)(uintptr_t)vmem_alloc(heap_arena, PAGESIZE,
762 	    ((kmflag & KM_NOSLEEP) == KM_NOSLEEP) ?
763 	    VM_NOSLEEP : VM_SLEEP);
764 
765 	if (rxp->xr_mop.host_addr == NULL) {
766 		cmn_err(CE_WARN, "xnb_rxbuf_constructor: "
767 		    "cannot get address space");
768 		return (-1);
769 	}
770 
771 	/*
772 	 * Have the hat ensure that page table exists for the VA.
773 	 */
774 	hat_prepare_mapping(kas.a_hat,
775 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
776 
777 	return (0);
778 }
779 
780 /*ARGSUSED*/
781 static void
782 xnb_rxbuf_destructor(void *buf, void *arg)
783 {
784 	xnb_rxbuf_t *rxp = buf;
785 
786 	ASSERT(rxp->xr_mop.host_addr != NULL);
787 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
788 
789 	hat_release_mapping(kas.a_hat,
790 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
791 	vmem_free(heap_arena,
792 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr, PAGESIZE);
793 }
794 
795 static void
796 xnb_rx_notify_peer(xnb_t *xnbp)
797 {
798 	boolean_t notify;
799 
800 	ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
801 
802 	/*LINTED: constant in conditional context*/
803 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->x_tx_ring, notify);
804 	if (notify) {
805 		ec_notify_via_evtchn(xnbp->x_evtchn);
806 		xnbp->x_stat_rx_notify_sent++;
807 	} else {
808 		xnbp->x_stat_rx_notify_deferred++;
809 	}
810 }
811 
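/*
 * desballoc() free routine for loaned (non-copy) receive buffers:
 * invoked when the upper layers free the mblk, it unmaps the granted
 * page, completes the corresponding ring entry (if still connected)
 * and returns the buffer to the cache.
 */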
812 static void
813 xnb_rx_complete(xnb_rxbuf_t *rxp)
814 {
815 	xnb_t *xnbp = rxp->xr_xnbp;
816 
817 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
818 
819 	mutex_enter(&xnbp->x_rx_lock);
820 
821 	xnb_rx_schedule_unmop(xnbp, &rxp->xr_mop);
822 	xnb_rx_perform_pending_unmop(xnbp);
823 
824 	if (xnbp->x_connected) {
825 		xnb_rx_mark_complete(xnbp, rxp->xr_id, rxp->xr_status);
826 		xnb_rx_notify_peer(xnbp);
827 	}
828 
829 	xnb_rxbuf_put(xnbp, rxp);
830 
831 	mutex_exit(&xnbp->x_rx_lock);
832 }
833 
834 static void
835 xnb_rx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
836 {
837 	RING_IDX i;
838 	netif_tx_response_t *txresp;
839 
840 	ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
841 
842 	i = xnbp->x_tx_ring.rsp_prod_pvt;
843 
844 	txresp = RING_GET_RESPONSE(&xnbp->x_tx_ring, i);
845 	txresp->id = id;
846 	txresp->status = status;
847 
848 	xnbp->x_tx_ring.rsp_prod_pvt = i + 1;
849 
850 	/*
851 	 * Note that we don't push the change to the peer here - that
852 	 * is the caller's responsibility.
853 	 */
854 }
855 
856 /*
857  * XXPV dme: currently pending unmap operations are stored on a
858  * per-instance basis.  Should they be per-driver?  The locking would
859  * have to change (obviously), but there might be an improvement from
860  * batching more together.  Right now they are all 'done' either at
861  * the tail of each receive operation (copy case) or on each
862  * completion (non-copy case).  Should that be changed to some
863  * interval (watermark?) to improve the chance of batching?
864  */
865 static void
866 xnb_rx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop)
867 {
868 	gnttab_unmap_grant_ref_t *unmop;
869 
870 	ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
871 	ASSERT(xnbp->x_rx_unmop_count <= NET_TX_RING_SIZE);
872 
873 	unmop = &xnbp->x_rx_unmop[xnbp->x_rx_unmop_count];
874 	xnbp->x_rx_unmop_count++;
875 
876 	unmop->host_addr = mop->host_addr;
877 	unmop->dev_bus_addr = mop->dev_bus_addr;
878 	unmop->handle = mop->handle;
879 
880 #ifdef XNB_DEBUG
881 	if (xnbp->x_rx_unmop_count <= NET_TX_RING_SIZE)
882 		ASSERT(xnbp->x_rx_unmop[xnbp->x_rx_unmop_count].host_addr
883 		    == NULL);
884 #endif /* XNB_DEBUG */
885 
886 }
887 
888 static void
889 xnb_rx_perform_pending_unmop(xnb_t *xnbp)
890 {
891 #ifdef XNB_DEBUG
892 	RING_IDX loop;
893 	gnttab_unmap_grant_ref_t *unmop;
894 #endif /* XNB_DEBUG */
895 
896 	ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
897 
898 	if (xnbp->x_rx_unmop_count == 0)
899 		return;
900 
901 	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
902 	    xnbp->x_rx_unmop, xnbp->x_rx_unmop_count) < 0) {
903 		cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
904 		    "unmap grant operation failed, "
905 		    "%d pages lost", xnbp->x_rx_unmop_count);
906 	}
907 
908 #ifdef XNB_DEBUG
909 	for (loop = 0, unmop = xnbp->x_rx_unmop;
910 	    loop < xnbp->x_rx_unmop_count;
911 	    loop++, unmop++) {
912 		if (unmop->status != 0) {
913 			cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
914 			    "unmap grant reference failed (%d)",
915 			    unmop->status);
916 		}
917 	}
918 #endif /* XNB_DEBUG */
919 
920 	xnbp->x_rx_unmop_count = 0;
921 
922 #ifdef XNB_DEBUG
923 	bzero(xnbp->x_rx_unmop, sizeof (xnbp->x_rx_unmop));
924 #endif /* XNB_DEBUG */
925 }
926 
927 static xnb_rxbuf_t *
928 xnb_rxbuf_get(xnb_t *xnbp, int flags)
929 {
930 	xnb_rxbuf_t *rxp;
931 
932 	ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
933 
934 	rxp = kmem_cache_alloc(xnb_rxbuf_cachep, flags);
935 	if (rxp != NULL) {
936 		ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
937 		rxp->xr_flags |= XNB_RXBUF_INUSE;
938 
939 		rxp->xr_xnbp = xnbp;
940 		rxp->xr_mop.dom = xnbp->x_peer;
941 
942 		rxp->xr_mop.flags = GNTMAP_host_map;
943 		if (!xnbp->x_rx_pages_writable)
944 			rxp->xr_mop.flags |= GNTMAP_readonly;
945 
946 		xnbp->x_rx_buf_count++;
947 	}
948 
949 	return (rxp);
950 }
951 
952 static void
953 xnb_rxbuf_put(xnb_t *xnbp, xnb_rxbuf_t *rxp)
954 {
955 	ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
956 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
957 
958 	rxp->xr_flags &= ~XNB_RXBUF_INUSE;
959 	xnbp->x_rx_buf_count--;
960 
961 	kmem_cache_free(xnb_rxbuf_cachep, rxp);
962 }
963 
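/*
 * Consume requests from the peer's tx ring ("receive" path): map the
 * granted pages, build an mblk chain (copying the data unless the peer
 * granted writable mappings) and return it to the caller.  Ring entries
 * are completed here in the copy case, or later via xnb_rx_complete()
 * when loaned buffers are freed.
 */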
964 static mblk_t *
965 xnb_recv(xnb_t *xnbp)
966 {
967 	RING_IDX start, end, loop;
968 	gnttab_map_grant_ref_t *mop;
969 	xnb_rxbuf_t **rxpp;
970 	netif_tx_request_t *txreq;
971 	boolean_t work_to_do;
972 	mblk_t *head, *tail;
973 	/*
974 	 * If the peer granted a read-only mapping to the page then we
975 	 * must copy the data, as the local protocol stack (should the
976 	 * packet be destined for this host) will modify the packet
977 	 * 'in place'.
978 	 */
979 	boolean_t copy = !xnbp->x_rx_pages_writable;
980 
981 	/*
982 	 * For each individual request, the sequence of actions is:
983 	 *
984 	 * 1. get the request.
985 	 * 2. map the page based on the grant ref.
986 	 * 3. allocate an mblk, copy the data to it.
987 	 * 4. release the grant.
988 	 * 5. update the ring.
989 	 * 6. pass the packet upward.
990 	 * 7. kick the peer.
991 	 *
992 	 * In fact, we try to perform the grant operations in batches,
993 	 * so there are two loops.
994 	 */
995 
996 	head = tail = NULL;
997 around:
998 	ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
999 
1000 	/*LINTED: constant in conditional context*/
1001 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->x_tx_ring, work_to_do);
1002 	if (!work_to_do) {
1003 finished:
1004 		xnb_rx_notify_peer(xnbp);
1005 
1006 		return (head);
1007 	}
1008 
1009 	start = xnbp->x_tx_ring.req_cons;
1010 	end = xnbp->x_tx_ring.sring->req_prod;
1011 
1012 	for (loop = start, mop = xnbp->x_rx_mop, rxpp = xnbp->x_rx_bufp;
1013 	    loop != end;
1014 	    loop++, mop++, rxpp++) {
1015 		xnb_rxbuf_t *rxp;
1016 
1017 		rxp = xnb_rxbuf_get(xnbp, KM_NOSLEEP);
1018 		if (rxp == NULL)
1019 			break;
1020 
1021 		ASSERT(xnbp->x_rx_pages_writable ||
1022 		    ((rxp->xr_mop.flags & GNTMAP_readonly)
1023 		    == GNTMAP_readonly));
1024 
1025 		rxp->xr_mop.ref =
1026 		    RING_GET_REQUEST(&xnbp->x_tx_ring, loop)->gref;
1027 
1028 		ASSERT(rxp->xr_mop.ref < NR_GRANT_ENTRIES);
1029 
1030 		*mop = rxp->xr_mop;
1031 		*rxpp = rxp;
1032 	}
1033 
1034 	if ((loop - start) == 0)
1035 		goto finished;
1036 
1037 	end = loop;
1038 
1039 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1040 	    xnbp->x_rx_mop, end - start) != 0) {
1041 
1042 		cmn_err(CE_WARN, "xnb_recv: map grant operation failed");
1043 
1044 		loop = start;
1045 		rxpp = xnbp->x_rx_bufp;
1046 
1047 		while (loop != end) {
1048 			xnb_rxbuf_put(xnbp, *rxpp);
1049 
1050 			loop++;
1051 			rxpp++;
1052 		}
1053 
1054 		goto finished;
1055 	}
1056 
1057 	for (loop = start, mop = xnbp->x_rx_mop, rxpp = xnbp->x_rx_bufp;
1058 	    loop != end;
1059 	    loop++, mop++, rxpp++) {
1060 		mblk_t *mp = NULL;
1061 		int16_t status = NETIF_RSP_OKAY;
1062 		xnb_rxbuf_t *rxp = *rxpp;
1063 
1064 		if (mop->status != 0) {
1065 			cmn_err(CE_WARN, "xnb_recv: "
1066 			    "failed to map buffer: %d",
1067 			    mop->status);
1068 			status = NETIF_RSP_ERROR;
1069 		}
1070 
1071 		txreq = RING_GET_REQUEST(&xnbp->x_tx_ring, loop);
1072 
1073 		if (status == NETIF_RSP_OKAY) {
1074 			if (copy) {
1075 				mp = allocb(txreq->size, BPRI_MED);
1076 				if (mp == NULL) {
1077 					status = NETIF_RSP_ERROR;
1078 					xnbp->x_stat_rx_allocb_failed++;
1079 				} else {
1080 					bcopy((caddr_t)(uintptr_t)
1081 					    mop->host_addr + txreq->offset,
1082 					    mp->b_wptr, txreq->size);
1083 					mp->b_wptr += txreq->size;
1084 				}
1085 			} else {
1086 				mp = desballoc((unsigned char *)(uintptr_t)
1087 				    mop->host_addr + txreq->offset,
1088 				    txreq->size, 0, &rxp->xr_free_rtn);
1089 				if (mp == NULL) {
1090 					status = NETIF_RSP_ERROR;
1091 					xnbp->x_stat_rx_allocb_failed++;
1092 				} else {
1093 					rxp->xr_id = txreq->id;
1094 					rxp->xr_status = status;
1095 					rxp->xr_mop = *mop;
1096 
1097 					mp->b_wptr += txreq->size;
1098 				}
1099 			}
1100 
1101 			/*
1102 			 * If we have a buffer and there are checksum
1103 			 * flags, process them appropriately.
1104 			 */
1105 			if ((mp != NULL) &&
1106 			    ((txreq->flags &
1107 			    (NETTXF_csum_blank | NETTXF_data_validated))
1108 			    != 0)) {
1109 				mp = xnbp->x_flavour->xf_cksum_from_peer(xnbp,
1110 				    mp, txreq->flags);
1111 				xnbp->x_stat_rx_cksum_no_need++;
1112 			}
1113 		}
1114 
1115 		if (copy || (mp == NULL)) {
1116 			xnb_rx_mark_complete(xnbp, txreq->id, status);
1117 			xnb_rx_schedule_unmop(xnbp, mop);
1118 		}
1119 
1120 		if (mp != NULL) {
1121 			xnbp->x_stat_ipackets++;
1122 			xnbp->x_stat_rbytes += txreq->size;
1123 
1124 			mp->b_next = NULL;
1125 			if (head == NULL) {
1126 				ASSERT(tail == NULL);
1127 				head = mp;
1128 			} else {
1129 				ASSERT(tail != NULL);
1130 				tail->b_next = mp;
1131 			}
1132 			tail = mp;
1133 		}
1134 	}
1135 
1136 	/*
1137 	 * This has to be here rather than in the 'finished' code
1138 	 * because we can only handle NET_TX_RING_SIZE pending unmap
1139 	 * operations, which may be exceeded by multiple trips around
1140 	 * the receive loop during heavy load (one trip around the
1141 	 * loop cannot generate more than NET_TX_RING_SIZE unmap
1142 	 * operations).
1143 	 */
1144 	xnb_rx_perform_pending_unmop(xnbp);
1145 	if (copy) {
1146 		for (loop = start, rxpp = xnbp->x_rx_bufp;
1147 		    loop != end;
1148 		    loop++, rxpp++)
1149 			xnb_rxbuf_put(xnbp, *rxpp);
1150 	}
1151 
1152 	xnbp->x_tx_ring.req_cons = loop;
1153 
1154 	goto around;
1155 	/* NOTREACHED */
1156 }
1157 
1158 /*
1159  *  intr() -- ring interrupt service routine
1160  */
1161 static uint_t
1162 xnb_intr(caddr_t arg)
1163 {
1164 	xnb_t *xnbp = (xnb_t *)arg;
1165 	mblk_t *mp;
1166 
1167 	xnbp->x_stat_intr++;
1168 
1169 	mutex_enter(&xnbp->x_rx_lock);
1170 
1171 	ASSERT(xnbp->x_connected);
1172 
1173 	mp = xnb_recv(xnbp);
1174 
1175 	mutex_exit(&xnbp->x_rx_lock);
1176 
1177 	if (!xnbp->x_hotplugged) {
1178 		xnbp->x_stat_rx_too_early++;
1179 		goto fail;
1180 	}
1181 	if (mp == NULL) {
1182 		xnbp->x_stat_spurious_intr++;
1183 		goto fail;
1184 	}
1185 
1186 	xnbp->x_flavour->xf_recv(xnbp, mp);
1187 
1188 	return (DDI_INTR_CLAIMED);
1189 
1190 fail:
1191 	freemsgchain(mp);
1192 	return (DDI_INTR_CLAIMED);
1193 }
1194 
1195 static boolean_t
1196 xnb_connect_rings(dev_info_t *dip)
1197 {
1198 	xnb_t *xnbp = ddi_get_driver_private(dip);
1199 	char *oename;
1200 	struct gnttab_map_grant_ref map_op;
1201 	evtchn_port_t evtchn;
1202 	int i;
1203 
1204 	/*
1205 	 * Cannot attempt to connect the rings if already connected.
1206 	 */
1207 	ASSERT(!xnbp->x_connected);
1208 
1209 	oename = xvdi_get_oename(dip);
1210 
1211 	if (xenbus_gather(XBT_NULL, oename,
1212 	    "event-channel", "%u", &evtchn,
1213 	    "tx-ring-ref", "%lu", &xnbp->x_tx_ring_ref,
1214 	    "rx-ring-ref", "%lu", &xnbp->x_rx_ring_ref,
1215 	    NULL) != 0) {
1216 		cmn_err(CE_WARN, "xnb_connect_rings: "
1217 		    "cannot read other-end details from %s",
1218 		    oename);
1219 		goto fail;
1220 	}
1221 
1222 	if (xenbus_scanf(XBT_NULL, oename,
1223 	    "feature-tx-writable", "%d", &i) != 0)
1224 		i = 0;
1225 	if (i != 0)
1226 		xnbp->x_rx_pages_writable = B_TRUE;
1227 
1228 	if (xenbus_scanf(XBT_NULL, oename,
1229 	    "feature-no-csum-offload", "%d", &i) != 0)
1230 		i = 0;
1231 	if ((i == 1) || !xnbp->x_cksum_offload)
1232 		xnbp->x_cksum_offload = B_FALSE;
1233 
1234 	/*
1235 	 * 1. allocate a vaddr for the tx page, one for the rx page.
1236 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1237 	 *    into the allocated vaddr (one for tx, one for rx).
1238 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1239 	 *    bound to this domain.
1240 	 * 4. associate the event channel with an interrupt.
1241 	 * 5. declare ourselves connected.
1242 	 * 6. enable the interrupt.
1243 	 */
1244 
1245 	/* 1.tx */
1246 	xnbp->x_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1247 	    0, 0, 0, 0, VM_SLEEP);
1248 	ASSERT(xnbp->x_tx_ring_addr != NULL);
1249 
1250 	/* 2.tx */
1251 	map_op.host_addr = (uint64_t)((long)xnbp->x_tx_ring_addr);
1252 	map_op.flags = GNTMAP_host_map;
1253 	map_op.ref = xnbp->x_tx_ring_ref;
1254 	map_op.dom = xnbp->x_peer;
1255 	hat_prepare_mapping(kas.a_hat, xnbp->x_tx_ring_addr);
1256 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1257 	    &map_op, 1) != 0 || map_op.status != 0) {
1258 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1259 		goto fail;
1260 	}
1261 	xnbp->x_tx_ring_handle = map_op.handle;
1262 
1263 	/*LINTED: constant in conditional context*/
1264 	BACK_RING_INIT(&xnbp->x_tx_ring,
1265 	    (netif_tx_sring_t *)xnbp->x_tx_ring_addr, PAGESIZE);
1266 
1267 	/* 1.rx */
1268 	xnbp->x_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1269 	    0, 0, 0, 0, VM_SLEEP);
1270 	ASSERT(xnbp->x_rx_ring_addr != NULL);
1271 
1272 	/* 2.rx */
1273 	map_op.host_addr = (uint64_t)((long)xnbp->x_rx_ring_addr);
1274 	map_op.flags = GNTMAP_host_map;
1275 	map_op.ref = xnbp->x_rx_ring_ref;
1276 	map_op.dom = xnbp->x_peer;
1277 	hat_prepare_mapping(kas.a_hat, xnbp->x_rx_ring_addr);
1278 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1279 	    &map_op, 1) != 0 || map_op.status != 0) {
1280 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1281 		goto fail;
1282 	}
1283 	xnbp->x_rx_ring_handle = map_op.handle;
1284 
1285 	/*LINTED: constant in conditional context*/
1286 	BACK_RING_INIT(&xnbp->x_rx_ring,
1287 	    (netif_rx_sring_t *)xnbp->x_rx_ring_addr, PAGESIZE);
1288 
1289 	/* 3 */
1290 	if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS) {
1291 		cmn_err(CE_WARN, "xnb_connect_rings: "
1292 		    "cannot bind event channel %d", evtchn);
1293 		xnbp->x_evtchn = INVALID_EVTCHN;
1294 		goto fail;
1295 	}
1296 	xnbp->x_evtchn = xvdi_get_evtchn(dip);
1297 
1298 	/*
1299 	 * It would be good to set the state to XenbusStateConnected
1300 	 * here as well, but then what if ddi_add_intr() failed?
1301 	 * Changing the state in the store will be noticed by the peer
1302 	 * and cannot be "taken back".
1303 	 */
1304 	mutex_enter(&xnbp->x_tx_lock);
1305 	mutex_enter(&xnbp->x_rx_lock);
1306 
1307 	/* 5.1 */
1308 	xnbp->x_connected = B_TRUE;
1309 
1310 	mutex_exit(&xnbp->x_rx_lock);
1311 	mutex_exit(&xnbp->x_tx_lock);
1312 
1313 	/* 4, 6 */
1314 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1315 	    != DDI_SUCCESS) {
1316 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1317 		goto fail;
1318 	}
1319 	xnbp->x_irq = B_TRUE;
1320 
1321 	/* 5.2 */
1322 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1323 
1324 	return (B_TRUE);
1325 
1326 fail:
1327 	mutex_enter(&xnbp->x_tx_lock);
1328 	mutex_enter(&xnbp->x_rx_lock);
1329 
1330 	xnbp->x_connected = B_FALSE;
1331 
1332 	mutex_exit(&xnbp->x_rx_lock);
1333 	mutex_exit(&xnbp->x_tx_lock);
1334 
1335 	return (B_FALSE);
1336 }
1337 
1338 static void
1339 xnb_disconnect_rings(dev_info_t *dip)
1340 {
1341 	xnb_t *xnbp = ddi_get_driver_private(dip);
1342 
1343 	if (xnbp->x_irq) {
1344 		ddi_remove_intr(dip, 0, NULL);
1345 		xnbp->x_irq = B_FALSE;
1346 	}
1347 
1348 	if (xnbp->x_evtchn != INVALID_EVTCHN) {
1349 		xvdi_free_evtchn(dip);
1350 		xnbp->x_evtchn = INVALID_EVTCHN;
1351 	}
1352 
1353 	if (xnbp->x_rx_ring_handle != INVALID_GRANT_HANDLE) {
1354 		struct gnttab_unmap_grant_ref unmap_op;
1355 
1356 		unmap_op.host_addr = (uint64_t)(uintptr_t)xnbp->x_rx_ring_addr;
1357 		unmap_op.dev_bus_addr = 0;
1358 		unmap_op.handle = xnbp->x_rx_ring_handle;
1359 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1360 		    &unmap_op, 1) != 0)
1361 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1362 			    "cannot unmap rx-ring page (%d)",
1363 			    unmap_op.status);
1364 
1365 		xnbp->x_rx_ring_handle = INVALID_GRANT_HANDLE;
1366 	}
1367 
1368 	if (xnbp->x_rx_ring_addr != NULL) {
1369 		hat_release_mapping(kas.a_hat, xnbp->x_rx_ring_addr);
1370 		vmem_free(heap_arena, xnbp->x_rx_ring_addr, PAGESIZE);
1371 		xnbp->x_rx_ring_addr = NULL;
1372 	}
1373 
1374 	if (xnbp->x_tx_ring_handle != INVALID_GRANT_HANDLE) {
1375 		struct gnttab_unmap_grant_ref unmap_op;
1376 
1377 		unmap_op.host_addr = (uint64_t)(uintptr_t)xnbp->x_tx_ring_addr;
1378 		unmap_op.dev_bus_addr = 0;
1379 		unmap_op.handle = xnbp->x_tx_ring_handle;
1380 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1381 		    &unmap_op, 1) != 0)
1382 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1383 			    "cannot unmap tx-ring page (%d)",
1384 			    unmap_op.status);
1385 
1386 		xnbp->x_tx_ring_handle = INVALID_GRANT_HANDLE;
1387 	}
1388 
1389 	if (xnbp->x_tx_ring_addr != NULL) {
1390 		hat_release_mapping(kas.a_hat, xnbp->x_tx_ring_addr);
1391 		vmem_free(heap_arena, xnbp->x_tx_ring_addr, PAGESIZE);
1392 		xnbp->x_tx_ring_addr = NULL;
1393 	}
1394 }
1395 
1396 /*ARGSUSED*/
1397 static void
1398 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1399     void *arg, void *impl_data)
1400 {
1401 	xnb_t *xnbp = ddi_get_driver_private(dip);
1402 	XenbusState new_state = *(XenbusState *)impl_data;
1403 
1404 	ASSERT(xnbp != NULL);
1405 
1406 	switch (new_state) {
1407 	case XenbusStateConnected:
1408 		if (xnb_connect_rings(dip)) {
1409 			xnbp->x_flavour->xf_peer_connected(xnbp);
1410 		} else {
1411 			xnbp->x_flavour->xf_peer_disconnected(xnbp);
1412 			xnb_disconnect_rings(dip);
1413 			(void) xvdi_switch_state(dip, XBT_NULL,
1414 			    XenbusStateClosed);
1415 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1416 		}
1417 
1418 		/*
1419 		 * Now that we've attempted to connect it's reasonable
1420 		 * to allow an attempt to detach.
1421 		 */
1422 		xnbp->x_detachable = B_TRUE;
1423 
1424 		break;
1425 
1426 	case XenbusStateClosing:
1427 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1428 
1429 		break;
1430 
1431 	case XenbusStateClosed:
1432 		xnbp->x_flavour->xf_peer_disconnected(xnbp);
1433 
1434 		mutex_enter(&xnbp->x_tx_lock);
1435 		mutex_enter(&xnbp->x_rx_lock);
1436 
1437 		xnb_disconnect_rings(dip);
1438 		xnbp->x_connected = B_FALSE;
1439 
1440 		mutex_exit(&xnbp->x_rx_lock);
1441 		mutex_exit(&xnbp->x_tx_lock);
1442 
1443 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1444 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1445 		/*
1446 		 * In all likelihood this is already set (in the above
1447 		 * case), but if the peer never attempted to connect
1448 		 * and the domain is destroyed we get here without
1449 		 * having been through the case above, so we set it to
1450 		 * be sure.
1451 		 */
1452 		xnbp->x_detachable = B_TRUE;
1453 
1454 		break;
1455 
1456 	default:
1457 		break;
1458 	}
1459 }
1460 
1461 /*ARGSUSED*/
1462 static void
1463 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1464     void *arg, void *impl_data)
1465 {
1466 	xnb_t *xnbp = ddi_get_driver_private(dip);
1467 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
1468 	boolean_t success;
1469 
1470 	ASSERT(xnbp != NULL);
1471 
1472 	switch (state) {
1473 	case Connected:
1474 
1475 		success = xnbp->x_flavour->xf_hotplug_connected(xnbp);
1476 
1477 		mutex_enter(&xnbp->x_tx_lock);
1478 		mutex_enter(&xnbp->x_rx_lock);
1479 
1480 		xnbp->x_hotplugged = success;
1481 
1482 		mutex_exit(&xnbp->x_rx_lock);
1483 		mutex_exit(&xnbp->x_tx_lock);
1484 		break;
1485 
1486 	default:
1487 		break;
1488 	}
1489 }
1490 
1491 static struct modldrv modldrv = {
1492 	&mod_miscops, "xnb module %I%",
1493 };
1494 
1495 static struct modlinkage modlinkage = {
1496 	MODREV_1, &modldrv, NULL
1497 };
1498 
1499 int
1500 _init(void)
1501 {
1502 	int i;
1503 
1504 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
1505 
1506 	xnb_rxbuf_cachep = kmem_cache_create("xnb_rxbuf_cachep",
1507 	    sizeof (xnb_rxbuf_t), 0, xnb_rxbuf_constructor,
1508 	    xnb_rxbuf_destructor, NULL, NULL, NULL, 0);
1509 	ASSERT(xnb_rxbuf_cachep != NULL);
1510 
1511 	i = mod_install(&modlinkage);
1512 	if (i != DDI_SUCCESS) {
1513 		kmem_cache_destroy(xnb_rxbuf_cachep);
1514 		mutex_destroy(&xnb_alloc_page_lock);
1515 	}
1516 	return (i);
1517 }
1518 
1519 int
1520 _info(struct modinfo *modinfop)
1521 {
1522 	return (mod_info(&modlinkage, modinfop));
1523 }
1524 
1525 int
1526 _fini(void)
1527 {
1528 	int i;
1529 
1530 	i = mod_remove(&modlinkage);
1531 	if (i == DDI_SUCCESS) {
1532 		kmem_cache_destroy(xnb_rxbuf_cachep);
1533 		mutex_destroy(&xnb_alloc_page_lock);
1534 	}
1535 	return (i);
1536 }
1537