xref: /titanic_51/usr/src/uts/common/xen/io/xnb.c (revision 1af98250c8b03bdc43d8ac3aac6390221d75b92e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #ifdef DEBUG
30 #define	XNB_DEBUG 1
31 #endif /* DEBUG */
32 
33 #include "xnb.h"
34 
35 #include <sys/sunddi.h>
36 #include <sys/sunndi.h>
37 #include <sys/modctl.h>
38 #include <sys/conf.h>
39 #include <sys/mac.h>
40 #include <sys/dlpi.h>
41 #include <sys/strsubr.h>
42 #include <sys/strsun.h>
43 #include <sys/pattr.h>
44 #include <vm/seg_kmem.h>
45 #include <vm/hat_i86.h>
46 #include <xen/sys/xenbus_impl.h>
47 #include <xen/sys/xendev.h>
48 #include <sys/balloon_impl.h>
49 #include <sys/evtchn_impl.h>
50 #include <sys/gnttab.h>
51 #include <vm/vm_dep.h>
52 
53 #include <sys/gld.h>
54 #include <inet/ip.h>
55 #include <inet/ip_impl.h>
56 #include <sys/vnic_impl.h> /* blech. */
57 
58 /*
59  * The terms "transmit" and "receive" are used in their traditional
60  * sense here - packets from other parts of this system are
61  * "transmitted" to the peer domain and those originating from the
62  * peer are "received".
63  *
64  * In some cases this can be confusing, because various data
65  * structures are shared with the domU driver, which has the opposite
66  * view of what constitutes "transmit" and "receive".  In naming the
67  * shared structures the domU driver always wins.
68  */
69 
70 /*
71  * XXPV dme: things to do, as well as various things indicated
72  * throughout the source:
73  * - copy avoidance outbound.
74  * - copy avoidance inbound.
75  * - transfer credit limiting.
76  * - MAC address based filtering.
77  */
78 
79 /*
80  * Linux expects to have some headroom in received buffers.  The Linux
81  * frontend driver (netfront) checks to see if the headroom is
82  * available and will re-allocate the buffer to make room if
83  * necessary.  To avoid this we add TX_BUFFER_HEADROOM bytes of
84  * headroom to each packet we pass to the peer.
85  */
86 #define	TX_BUFFER_HEADROOM	16
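/*
 * The headroom is applied in xnb_to_peer(): packet data is copied into
 * each transferred page starting at offset TX_BUFFER_HEADROOM.
 */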
87 
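/*
 * Tunable: whether checksum offload is advertised to the peer.  The
 * value is copied into each instance at attach time and reflected to
 * the frontend via the "feature-no-csum-offload" xenstore node.
 */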
88 static boolean_t	xnb_cksum_offload = B_TRUE;
89 
90 static boolean_t	xnb_connect_rings(dev_info_t *);
91 static void		xnb_disconnect_rings(dev_info_t *);
92 static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
93     void *, void *);
94 static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
95     void *, void *);
96 
97 static int	xnb_rxbuf_constructor(void *, void *, int);
98 static void	xnb_rxbuf_destructor(void *, void *);
99 static xnb_rxbuf_t *xnb_rxbuf_get(xnb_t *, int);
100 static void	xnb_rxbuf_put(xnb_t *, xnb_rxbuf_t *);
101 static void	xnb_rx_notify_peer(xnb_t *);
102 static void	xnb_rx_complete(xnb_rxbuf_t *);
103 static void	xnb_rx_mark_complete(xnb_t *, RING_IDX, int16_t);
104 static void	xnb_rx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *);
105 static void	xnb_rx_perform_pending_unmop(xnb_t *);
106 
107 #ifdef XNB_DEBUG
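/*
 * Total number of entries in the grant table; used only for DEBUG
 * sanity checks on grant references supplied by the peer.
 */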
108 #define	NR_GRANT_ENTRIES \
109 	(NR_GRANT_FRAMES * PAGESIZE / sizeof (grant_entry_t))
110 #endif /* XNB_DEBUG */
111 
112 /* XXPV dme: are these really invalid? */
113 #define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
114 #define	INVALID_GRANT_REF	((grant_ref_t)-1)
115 
116 static kmem_cache_t *xnb_rxbuf_cachep;
117 static kmutex_t	xnb_alloc_page_lock;
118 
119 /*
120  * Statistics.
121  */
122 static char *aux_statistics[] = {
123 	"tx_cksum_deferred",
124 	"rx_cksum_no_need",
125 	"tx_notify_deferred",
126 	"tx_notify_sent",
127 	"rx_notify_deferred",
128 	"rx_notify_sent",
129 	"tx_too_early",
130 	"rx_too_early",
131 	"rx_allocb_failed",
132 	"mac_full",
133 	"spurious_intr",
134 	"allocation_success",
135 	"allocation_failure",
136 	"small_allocation_success",
137 	"small_allocation_failure",
138 	"csum_hardware",
139 	"csum_software",
140 };
141 
142 static int
143 xnb_ks_aux_update(kstat_t *ksp, int flag)
144 {
145 	xnb_t *xnbp;
146 	kstat_named_t *knp;
147 
148 	if (flag != KSTAT_READ)
149 		return (EACCES);
150 
151 	xnbp = ksp->ks_private;
152 	knp = ksp->ks_data;
153 
154 	/*
155 	 * Assignment order should match that of the names in
156 	 * aux_statistics.
157 	 */
158 	(knp++)->value.ui64 = xnbp->x_stat_tx_cksum_deferred;
159 	(knp++)->value.ui64 = xnbp->x_stat_rx_cksum_no_need;
160 	(knp++)->value.ui64 = xnbp->x_stat_tx_notify_deferred;
161 	(knp++)->value.ui64 = xnbp->x_stat_tx_notify_sent;
162 	(knp++)->value.ui64 = xnbp->x_stat_rx_notify_deferred;
163 	(knp++)->value.ui64 = xnbp->x_stat_rx_notify_sent;
164 	(knp++)->value.ui64 = xnbp->x_stat_tx_too_early;
165 	(knp++)->value.ui64 = xnbp->x_stat_rx_too_early;
166 	(knp++)->value.ui64 = xnbp->x_stat_rx_allocb_failed;
167 	(knp++)->value.ui64 = xnbp->x_stat_mac_full;
168 	(knp++)->value.ui64 = xnbp->x_stat_spurious_intr;
169 	(knp++)->value.ui64 = xnbp->x_stat_allocation_success;
170 	(knp++)->value.ui64 = xnbp->x_stat_allocation_failure;
171 	(knp++)->value.ui64 = xnbp->x_stat_small_allocation_success;
172 	(knp++)->value.ui64 = xnbp->x_stat_small_allocation_failure;
173 	(knp++)->value.ui64 = xnbp->x_stat_csum_hardware;
174 	(knp++)->value.ui64 = xnbp->x_stat_csum_software;
175 
176 	return (0);
177 }
178 
179 static boolean_t
180 xnb_ks_init(xnb_t *xnbp)
181 {
182 	int nstat = sizeof (aux_statistics) /
183 	    sizeof (aux_statistics[0]);
184 	char **cp = aux_statistics;
185 	kstat_named_t *knp;
186 
187 	/*
188 	 * Create and initialise kstats.
189 	 */
190 	xnbp->x_kstat_aux = kstat_create(ddi_driver_name(xnbp->x_devinfo),
191 	    ddi_get_instance(xnbp->x_devinfo), "aux_statistics", "net",
192 	    KSTAT_TYPE_NAMED, nstat, 0);
193 	if (xnbp->x_kstat_aux == NULL)
194 		return (B_FALSE);
195 
196 	xnbp->x_kstat_aux->ks_private = xnbp;
197 	xnbp->x_kstat_aux->ks_update = xnb_ks_aux_update;
198 
199 	knp = xnbp->x_kstat_aux->ks_data;
200 	while (nstat > 0) {
201 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
202 
203 		knp++;
204 		cp++;
205 		nstat--;
206 	}
207 
208 	kstat_install(xnbp->x_kstat_aux);
209 
210 	return (B_TRUE);
211 }
212 
213 static void
214 xnb_ks_free(xnb_t *xnbp)
215 {
216 	kstat_delete(xnbp->x_kstat_aux);
217 }
218 
219 /*
220  * Software checksum calculation and insertion for an arbitrary packet.
221  */
222 /*ARGSUSED*/
223 static mblk_t *
224 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
225 {
226 	/*
227 	 * XXPV dme: shouldn't rely on vnic_fix_cksum(), not least
228 	 * because it doesn't cover all of the interesting cases :-(
229 	 */
230 	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
231 	    HCK_FULLCKSUM, KM_NOSLEEP);
232 
233 	return (vnic_fix_cksum(mp));
234 }
235 
236 mblk_t *
237 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
238 {
239 	struct ether_header *ehp;
240 	uint16_t sap;
241 	uint32_t offset;
242 	ipha_t *ipha;
243 
244 	ASSERT(mp->b_next == NULL);
245 
246 	/*
247 	 * Check that the packet is contained in a single mblk.  In
248 	 * the "from peer" path this is true today, but will change
249 	 * when scatter gather support is added.  In the "to peer"
250 	 * path we cannot be sure, but in most cases it will be true
251 	 * (in the xnbo case the packet has come from a MAC device
252 	 * which is unlikely to split packets).
253 	 */
254 	if (mp->b_cont != NULL)
255 		goto software;
256 
257 	/*
258 	 * If the MAC has no hardware capability don't do any further
259 	 * checking.
260 	 */
261 	if (capab == 0)
262 		goto software;
263 
264 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
265 	ehp = (struct ether_header *)mp->b_rptr;
266 
267 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
268 		struct ether_vlan_header *evhp;
269 
270 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
271 		evhp = (struct ether_vlan_header *)mp->b_rptr;
272 		sap = ntohs(evhp->ether_type);
273 		offset = sizeof (struct ether_vlan_header);
274 	} else {
275 		sap = ntohs(ehp->ether_type);
276 		offset = sizeof (struct ether_header);
277 	}
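	/*
	 * At this point 'offset' is the offset of the IP header from
	 * b_rptr: 14 bytes (sizeof (struct ether_header)) for an
	 * untagged frame, 18 bytes (sizeof (struct ether_vlan_header))
	 * for a VLAN-tagged frame.
	 */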
278 
279 	/*
280 	 * We only attempt to do IPv4 packets in hardware.
281 	 */
282 	if (sap != ETHERTYPE_IP)
283 		goto software;
284 
285 	/*
286 	 * We know that this is an IPv4 packet.
287 	 */
288 	ipha = (ipha_t *)(mp->b_rptr + offset);
289 
290 	switch (ipha->ipha_protocol) {
291 	case IPPROTO_TCP:
292 	case IPPROTO_UDP:
293 		/*
294 		 * This is a TCP/IPv4 or UDP/IPv4 packet.
295 		 *
296 		 * If the capabilities indicate that full checksum
297 		 * offload is available, use it.
298 		 */
299 		if ((capab & HCKSUM_INET_FULL_V4) != 0) {
300 			(void) hcksum_assoc(mp, NULL, NULL,
301 			    0, 0, 0, 0,
302 			    HCK_FULLCKSUM, KM_NOSLEEP);
303 
304 			xnbp->x_stat_csum_hardware++;
305 
306 			return (mp);
307 		}
308 
309 		/*
310 		 * XXPV dme: If the capabilities indicate that partial
311 		 * checksum offload is available, we should use it.
312 		 */
313 
314 		break;
315 
316 	default:
317 		/* Use software. */
318 		break;
319 	}
320 
321 software:
322 	/*
323 	 * We are not able to use any offload so do the whole thing in
324 	 * software.
325 	 */
326 	xnbp->x_stat_csum_software++;
327 
328 	return (xnb_software_csum(xnbp, mp));
329 }
330 
331 int
332 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
333 {
334 	xnb_t *xnbp;
335 	char *xsname, mac[ETHERADDRL * 3];
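	/*
	 * ETHERADDRL * 3 bytes is sufficient for the colon-separated
	 * MAC address string read from xenstore ("xx:xx:xx:xx:xx:xx"
	 * is 17 characters plus the terminating NUL).
	 */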
336 
337 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
338 
339 	xnbp->x_flavour = flavour;
340 	xnbp->x_flavour_data = flavour_data;
341 	xnbp->x_devinfo = dip;
342 	xnbp->x_evtchn = INVALID_EVTCHN;
343 	xnbp->x_irq = B_FALSE;
344 	xnbp->x_tx_ring_handle = INVALID_GRANT_HANDLE;
345 	xnbp->x_rx_ring_handle = INVALID_GRANT_HANDLE;
346 	xnbp->x_cksum_offload = xnb_cksum_offload;
347 	xnbp->x_connected = B_FALSE;
348 	xnbp->x_hotplugged = B_FALSE;
349 	xnbp->x_detachable = B_FALSE;
350 	xnbp->x_peer = xvdi_get_oeid(dip);
351 	xnbp->x_rx_pages_writable = B_FALSE;
352 
353 	xnbp->x_rx_buf_count = 0;
354 	xnbp->x_rx_unmop_count = 0;
355 
356 	xnbp->x_tx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
357 	ASSERT(xnbp->x_tx_va != NULL);
358 
359 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->x_icookie)
360 	    != DDI_SUCCESS)
361 		goto failure;
362 
363 	mutex_init(&xnbp->x_tx_lock, NULL, MUTEX_DRIVER, xnbp->x_icookie);
364 	mutex_init(&xnbp->x_rx_lock, NULL, MUTEX_DRIVER, xnbp->x_icookie);
365 
366 	/* set driver private pointer now */
367 	ddi_set_driver_private(dip, xnbp);
368 
369 	if (!xnb_ks_init(xnbp))
370 		goto late_failure;
371 
372 	/*
373 	 * Receive notification of changes in the state of the
374 	 * driver in the guest domain.
375 	 */
376 	if (xvdi_add_event_handler(dip, XS_OE_STATE,
377 	    xnb_oe_state_change) != DDI_SUCCESS)
378 		goto very_late_failure;
379 
380 	/*
381 	 * Receive notification of hotplug events.
382 	 */
383 	if (xvdi_add_event_handler(dip, XS_HP_STATE,
384 	    xnb_hp_state_change) != DDI_SUCCESS)
385 		goto very_late_failure;
386 
387 	xsname = xvdi_get_xsname(dip);
388 
389 	if (xenbus_printf(XBT_NULL, xsname,
390 	    "feature-no-csum-offload", "%d",
391 	    xnbp->x_cksum_offload ? 0 : 1) != 0)
392 		goto very_very_late_failure;
393 
394 	if (xenbus_scanf(XBT_NULL, xsname,
395 	    "mac", "%s", mac) != 0) {
396 		cmn_err(CE_WARN, "xnb_attach: "
397 		    "cannot read mac address from %s",
398 		    xsname);
399 		goto very_very_late_failure;
400 	}
401 
402 	if (ether_aton(mac, xnbp->x_mac_addr) != ETHERADDRL) {
403 		cmn_err(CE_WARN,
404 		    "xnb_attach: cannot parse mac address %s",
405 		    mac);
406 		goto very_very_late_failure;
407 	}
408 
409 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
410 	(void) xvdi_post_event(dip, XEN_HP_ADD);
411 
412 	return (DDI_SUCCESS);
413 
414 very_very_late_failure: /* not that the naming is getting silly or anything */
415 	xvdi_remove_event_handler(dip, NULL);
416 
417 very_late_failure:
418 	xnb_ks_free(xnbp);
419 
420 late_failure:
421 	mutex_destroy(&xnbp->x_rx_lock);
422 	mutex_destroy(&xnbp->x_tx_lock);
423 
424 failure:
425 	vmem_free(heap_arena, xnbp->x_tx_va, PAGESIZE);
426 	kmem_free(xnbp, sizeof (*xnbp));
427 	return (DDI_FAILURE);
428 }
429 
430 /*ARGSUSED*/
431 void
432 xnb_detach(dev_info_t *dip)
433 {
434 	xnb_t *xnbp = ddi_get_driver_private(dip);
435 
436 	ASSERT(xnbp != NULL);
437 	ASSERT(!xnbp->x_connected);
438 	ASSERT(xnbp->x_rx_buf_count == 0);
439 
440 	xnb_disconnect_rings(dip);
441 
442 	xvdi_remove_event_handler(dip, NULL);
443 
444 	xnb_ks_free(xnbp);
445 
446 	ddi_set_driver_private(dip, NULL);
447 
448 	mutex_destroy(&xnbp->x_tx_lock);
449 	mutex_destroy(&xnbp->x_rx_lock);
450 
451 	ASSERT(xnbp->x_tx_va != NULL);
452 	vmem_free(heap_arena, xnbp->x_tx_va, PAGESIZE);
453 
454 	kmem_free(xnbp, sizeof (*xnbp));
455 }
456 
457 
458 static mfn_t
459 xnb_alloc_page(xnb_t *xnbp)
460 {
461 #define	WARNING_RATE_LIMIT 100
462 #define	BATCH_SIZE 256
463 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
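	/*
	 * 'nth' indexes the next unused mfn in the batch; when it
	 * reaches BATCH_SIZE the cache is exhausted and must be
	 * refilled from the balloon.
	 */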
464 	static int nth = BATCH_SIZE;
465 	mfn_t mfn;
466 
467 	mutex_enter(&xnb_alloc_page_lock);
468 	if (nth == BATCH_SIZE) {
469 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
470 			xnbp->x_stat_allocation_failure++;
471 			mutex_exit(&xnb_alloc_page_lock);
472 
473 			/*
474 			 * Try for a single page in low memory situations.
475 			 */
476 			if (balloon_alloc_pages(1, &mfn) != 1) {
477 				xnbp->x_stat_small_allocation_failure++;
478 				if ((xnbp->x_stat_small_allocation_failure
479 				    % WARNING_RATE_LIMIT) == 0) {
480 					cmn_err(CE_WARN, "xnb_alloc_page: "
481 					    "Cannot allocate memory to "
482 					    "transfer packets to peer.");
483 				}
484 				return (0);
485 			} else {
486 				xnbp->x_stat_small_allocation_success++;
487 				return (mfn);
488 			}
489 		}
490 
491 		nth = 0;
492 		xnbp->x_stat_allocation_success++;
493 	}
494 
495 	mfn = mfns[nth++];
496 	mutex_exit(&xnb_alloc_page_lock);
497 
498 	ASSERT(mfn != 0);
499 
500 	return (mfn);
501 #undef BATCH_SIZE
502 #undef WARNING_RATE_LIMIT
503 }
504 
505 /*ARGSUSED*/
506 static void
507 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
508 {
509 	int r;
510 	pfn_t pfn;
511 
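	/*
	 * Temporarily assign a pfn so that the page can be mapped and
	 * zeroed before it is handed back to the hypervisor.
	 */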
512 	pfn = xen_assign_pfn(mfn);
513 	pfnzero(pfn, 0, PAGESIZE);
514 	xen_release_pfn(pfn);
515 
516 	/*
517 	 * This happens only in the error path, so batching is
518 	 * not worth the complication.
519 	 */
520 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
521 		cmn_err(CE_WARN, "xnb_free_page: cannot decrease memory "
522 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
523 		    r, mfn);
524 	}
525 }
526 
527 mblk_t *
528 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
529 {
530 	mblk_t *free = mp, *prev = NULL;
531 	size_t len;
532 	gnttab_transfer_t *gop;
533 	boolean_t notify;
534 	RING_IDX loop, prod, end;
535 
536 	/*
537 	 * For each packet the sequence of operations is:
538 	 *
539 	 * 1. get a new page from the hypervisor.
540 	 * 2. get a request slot from the ring.
541 	 * 3. copy the data into the new page.
542 	 * 4. transfer the page to the peer.
543 	 * 5. update the request slot.
544 	 * 6. kick the peer.
545 	 * 7. free mp.
546 	 *
547 	 * In order to reduce the number of hypercalls, we prepare
548 	 * several packets for the peer and perform a single hypercall
549 	 * to transfer them.
550 	 */
551 
552 	mutex_enter(&xnbp->x_tx_lock);
553 
554 	/*
555 	 * If we are not connected to the peer or have not yet
556 	 * finished hotplug it is too early to pass packets to the
557 	 * peer.
558 	 */
559 	if (!(xnbp->x_connected && xnbp->x_hotplugged)) {
560 		mutex_exit(&xnbp->x_tx_lock);
561 		xnbp->x_stat_tx_too_early++;
562 		return (mp);
563 	}
564 
565 	loop = xnbp->x_rx_ring.req_cons;
566 	prod = xnbp->x_rx_ring.rsp_prod_pvt;
567 	gop = xnbp->x_tx_top;
568 
569 	/*
570 	 * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->x_rx_ring) but
571 	 * using local variables.
572 	 */
573 #define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
574 	((((_r)->sring->req_prod - loop) <		\
575 		(RING_SIZE(_r) - (loop - prod))) ?	\
576 	    ((_r)->sring->req_prod - loop) :		\
577 	    (RING_SIZE(_r) - (loop - prod)))
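	/*
	 * Evaluates to the number of unconsumed requests, capped at
	 * RING_SIZE minus the number of requests already consumed but
	 * not yet responded to (loop - prod), so that the response
	 * ring cannot overflow.
	 */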
578 
579 	while ((mp != NULL) &&
580 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->x_rx_ring)) {
581 
582 		mfn_t mfn;
583 		pfn_t pfn;
584 		netif_rx_request_t *rxreq;
585 		netif_rx_response_t *rxresp;
586 		char *valoop;
587 		size_t offset;
588 		mblk_t *ml;
589 		uint16_t cksum_flags;
590 
591 		/* 1 */
592 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
593 			xnbp->x_stat_xmit_defer++;
594 			break;
595 		}
596 
597 		/* 2 */
598 		rxreq = RING_GET_REQUEST(&xnbp->x_rx_ring, loop);
599 
600 #ifdef XNB_DEBUG
601 		if (!(rxreq->id < NET_RX_RING_SIZE))
602 			cmn_err(CE_PANIC, "xnb_to_peer: "
603 			    "id %d out of range in request 0x%p",
604 			    rxreq->id, (void *)rxreq);
605 		if (rxreq->gref >= NR_GRANT_ENTRIES)
606 			cmn_err(CE_PANIC, "xnb_to_peer: "
607 			    "grant ref %d out of range in request 0x%p",
608 			    rxreq->gref, (void *)rxreq);
609 #endif /* XNB_DEBUG */
610 
611 		/* Assign a pfn and map the new page at the allocated va. */
612 		pfn = xen_assign_pfn(mfn);
613 		hat_devload(kas.a_hat, xnbp->x_tx_va, PAGESIZE,
614 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
615 
616 		offset = TX_BUFFER_HEADROOM;
617 
618 		/* 3 */
619 		len = 0;
620 		valoop = xnbp->x_tx_va + offset;
621 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
622 			size_t chunk = ml->b_wptr - ml->b_rptr;
623 
624 			bcopy(ml->b_rptr, valoop, chunk);
625 			valoop += chunk;
626 			len += chunk;
627 		}
628 
629 		ASSERT(len + offset < PAGESIZE);
630 
631 		/* Release the pfn. */
632 		hat_unload(kas.a_hat, xnbp->x_tx_va, PAGESIZE,
633 		    HAT_UNLOAD_UNMAP);
634 		xen_release_pfn(pfn);
635 
636 		/* 4 */
637 		gop->mfn = mfn;
638 		gop->domid = xnbp->x_peer;
639 		gop->ref = rxreq->gref;
640 
641 		/* 5.1 */
642 		rxresp = RING_GET_RESPONSE(&xnbp->x_rx_ring, prod);
643 		rxresp->offset = offset;
644 		rxresp->flags = 0;
645 
646 		cksum_flags = xnbp->x_flavour->xf_cksum_to_peer(xnbp, mp);
647 		if (cksum_flags != 0)
648 			xnbp->x_stat_tx_cksum_deferred++;
649 		rxresp->flags |= cksum_flags;
650 
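		/*
		 * Requests and responses share the same slots in the
		 * ring, so pick up the id from the request at 'prod'
		 * before the response overwrites it.
		 */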
651 		rxresp->id = RING_GET_REQUEST(&xnbp->x_rx_ring, prod)->id;
652 		rxresp->status = len;
653 
654 		loop++;
655 		prod++;
656 		gop++;
657 		prev = mp;
658 		mp = mp->b_next;
659 	}
660 
661 	/*
662 	 * Did we actually do anything?
663 	 */
664 	if (loop == xnbp->x_rx_ring.req_cons) {
665 		mutex_exit(&xnbp->x_tx_lock);
666 		return (mp);
667 	}
668 
669 	end = loop;
670 
671 	/*
672 	 * Unlink the end of the 'done' list from the remainder.
673 	 */
674 	ASSERT(prev != NULL);
675 	prev->b_next = NULL;
676 
677 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->x_tx_top,
678 	    loop - xnbp->x_rx_ring.req_cons) != 0) {
679 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
680 	}
681 
682 	loop = xnbp->x_rx_ring.req_cons;
683 	prod = xnbp->x_rx_ring.rsp_prod_pvt;
684 	gop = xnbp->x_tx_top;
685 
686 	while (loop < end) {
687 		int16_t status = NETIF_RSP_OKAY;
688 
689 		if (gop->status != 0) {
690 			status = NETIF_RSP_ERROR;
691 
692 			/*
693 			 * If the status is anything other than
694 			 * GNTST_bad_page then we don't own the page
695 			 * any more, so don't try to give it back.
696 			 */
697 			if (gop->status != GNTST_bad_page)
698 				gop->mfn = 0;
699 		} else {
700 			/* The page is no longer ours. */
701 			gop->mfn = 0;
702 		}
703 
704 		if (gop->mfn != 0)
705 			/*
706 			 * Give back the page, as we won't be using
707 			 * it.
708 			 */
709 			xnb_free_page(xnbp, gop->mfn);
710 		else
711 			/*
712 			 * We gave away a page, update our accounting
713 			 * now.
714 			 */
715 			balloon_drv_subtracted(1);
716 
717 		/* 5.2 */
718 		if (status != NETIF_RSP_OKAY) {
719 			RING_GET_RESPONSE(&xnbp->x_rx_ring, prod)->status =
720 			    status;
721 		} else {
722 			xnbp->x_stat_opackets++;
723 			xnbp->x_stat_obytes += len;
724 		}
725 
726 		loop++;
727 		prod++;
728 		gop++;
729 	}
730 
731 	xnbp->x_rx_ring.req_cons = loop;
732 	xnbp->x_rx_ring.rsp_prod_pvt = prod;
733 
734 	/* 6 */
735 	/*LINTED: constant in conditional context*/
736 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->x_rx_ring, notify);
737 	if (notify) {
738 		ec_notify_via_evtchn(xnbp->x_evtchn);
739 		xnbp->x_stat_tx_notify_sent++;
740 	} else {
741 		xnbp->x_stat_tx_notify_deferred++;
742 	}
743 
744 	if (mp != NULL)
745 		xnbp->x_stat_xmit_defer++;
746 
747 	mutex_exit(&xnbp->x_tx_lock);
748 
749 	/* Free mblk_t's that we consumed. */
750 	freemsgchain(free);
751 
752 	return (mp);
753 }
754 
755 /*ARGSUSED*/
756 static int
757 xnb_rxbuf_constructor(void *buf, void *arg, int kmflag)
758 {
759 	xnb_rxbuf_t *rxp = buf;
760 
761 	bzero(rxp, sizeof (*rxp));
762 
763 	rxp->xr_free_rtn.free_func = xnb_rx_complete;
764 	rxp->xr_free_rtn.free_arg = (caddr_t)rxp;
765 
766 	rxp->xr_mop.host_addr =
767 	    (uint64_t)(uintptr_t)vmem_alloc(heap_arena, PAGESIZE,
768 	    ((kmflag & KM_NOSLEEP) == KM_NOSLEEP) ?
769 	    VM_NOSLEEP : VM_SLEEP);
770 
771 	if (rxp->xr_mop.host_addr == NULL) {
772 		cmn_err(CE_WARN, "xnb_rxbuf_constructor: "
773 		    "cannot get address space");
774 		return (-1);
775 	}
776 
777 	/*
778 	 * Have the hat ensure that a page table exists for the VA.
779 	 */
780 	hat_prepare_mapping(kas.a_hat,
781 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
782 
783 	return (0);
784 }
785 
786 /*ARGSUSED*/
787 static void
788 xnb_rxbuf_destructor(void *buf, void *arg)
789 {
790 	xnb_rxbuf_t *rxp = buf;
791 
792 	ASSERT(rxp->xr_mop.host_addr != NULL);
793 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
794 
795 	hat_release_mapping(kas.a_hat,
796 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
797 	vmem_free(heap_arena,
798 	    (caddr_t)(uintptr_t)rxp->xr_mop.host_addr, PAGESIZE);
799 }
800 
801 static void
802 xnb_rx_notify_peer(xnb_t *xnbp)
803 {
804 	boolean_t notify;
805 
806 	ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
807 
808 	/*LINTED: constant in conditional context*/
809 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->x_tx_ring, notify);
810 	if (notify) {
811 		ec_notify_via_evtchn(xnbp->x_evtchn);
812 		xnbp->x_stat_rx_notify_sent++;
813 	} else {
814 		xnbp->x_stat_rx_notify_deferred++;
815 	}
816 }
817 
818 static void
819 xnb_rx_complete(xnb_rxbuf_t *rxp)
820 {
821 	xnb_t *xnbp = rxp->xr_xnbp;
822 
823 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
824 
825 	mutex_enter(&xnbp->x_rx_lock);
826 
827 	xnb_rx_schedule_unmop(xnbp, &rxp->xr_mop);
828 	xnb_rx_perform_pending_unmop(xnbp);
829 
830 	if (xnbp->x_connected) {
831 		xnb_rx_mark_complete(xnbp, rxp->xr_id, rxp->xr_status);
832 		xnb_rx_notify_peer(xnbp);
833 	}
834 
835 	xnb_rxbuf_put(xnbp, rxp);
836 
837 	mutex_exit(&xnbp->x_rx_lock);
838 }
839 
840 static void
841 xnb_rx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
842 {
843 	RING_IDX i;
844 	netif_tx_response_t *txresp;
845 
846 	ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
847 
848 	i = xnbp->x_tx_ring.rsp_prod_pvt;
849 
850 	txresp = RING_GET_RESPONSE(&xnbp->x_tx_ring, i);
851 	txresp->id = id;
852 	txresp->status = status;
853 
854 	xnbp->x_tx_ring.rsp_prod_pvt = i + 1;
855 
856 	/*
857 	 * Note that we don't push the change to the peer here - that
858 	 * is the caller's responsibility.
859 	 */
860 }
861 
862 /*
863  * XXPV dme: currently pending unmap operations are stored on a
864  * per-instance basis.  Should they be per-driver?  The locking would
865  * have to change (obviously), but there might be an improvement from
866  * batching more together.  Right now they are all 'done' either at
867  * the tail of each receive operation (copy case) or on each
868  * completion (non-copy case).  Should that be changed to some
869  * interval (watermark?) to improve the chance of batching?
870  */
871 static void
872 xnb_rx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop)
873 {
874 	gnttab_unmap_grant_ref_t *unmop;
875 
876 	ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
877 	ASSERT(xnbp->x_rx_unmop_count < NET_TX_RING_SIZE);
878 
879 	unmop = &xnbp->x_rx_unmop[xnbp->x_rx_unmop_count];
880 	xnbp->x_rx_unmop_count++;
881 
882 	unmop->host_addr = mop->host_addr;
883 	unmop->dev_bus_addr = mop->dev_bus_addr;
884 	unmop->handle = mop->handle;
885 
886 #ifdef XNB_DEBUG
887 	if (xnbp->x_rx_unmop_count < NET_TX_RING_SIZE)
888 		ASSERT(xnbp->x_rx_unmop[xnbp->x_rx_unmop_count].host_addr
889 		    == NULL);
890 #endif /* XNB_DEBUG */
891 
892 }
893 
894 static void
895 xnb_rx_perform_pending_unmop(xnb_t *xnbp)
896 {
897 #ifdef XNB_DEBUG
898 	RING_IDX loop;
899 	gnttab_unmap_grant_ref_t *unmop;
900 #endif /* XNB_DEBUG */
901 
902 	ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
903 
904 	if (xnbp->x_rx_unmop_count == 0)
905 		return;
906 
907 	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
908 	    xnbp->x_rx_unmop, xnbp->x_rx_unmop_count) < 0) {
909 		cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
910 		    "unmap grant operation failed, "
911 		    "%d pages lost", xnbp->x_rx_unmop_count);
912 	}
913 
914 #ifdef XNB_DEBUG
915 	for (loop = 0, unmop = xnbp->x_rx_unmop;
916 	    loop < xnbp->x_rx_unmop_count;
917 	    loop++, unmop++) {
918 		if (unmop->status != 0) {
919 			cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
920 			    "unmap grant reference failed (%d)",
921 			    unmop->status);
922 		}
923 	}
924 #endif /* XNB_DEBUG */
925 
926 	xnbp->x_rx_unmop_count = 0;
927 
928 #ifdef XNB_DEBUG
929 	bzero(xnbp->x_rx_unmop, sizeof (xnbp->x_rx_unmop));
930 #endif /* XNB_DEBUG */
931 }
932 
933 static xnb_rxbuf_t *
934 xnb_rxbuf_get(xnb_t *xnbp, int flags)
935 {
936 	xnb_rxbuf_t *rxp;
937 
938 	ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
939 
940 	rxp = kmem_cache_alloc(xnb_rxbuf_cachep, flags);
941 	if (rxp != NULL) {
942 		ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
943 		rxp->xr_flags |= XNB_RXBUF_INUSE;
944 
945 		rxp->xr_xnbp = xnbp;
946 		rxp->xr_mop.dom = xnbp->x_peer;
947 
948 		rxp->xr_mop.flags = GNTMAP_host_map;
949 		if (!xnbp->x_rx_pages_writable)
950 			rxp->xr_mop.flags |= GNTMAP_readonly;
951 
952 		xnbp->x_rx_buf_count++;
953 	}
954 
955 	return (rxp);
956 }
957 
958 static void
959 xnb_rxbuf_put(xnb_t *xnbp, xnb_rxbuf_t *rxp)
960 {
961 	ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
962 	ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);
963 
964 	rxp->xr_flags &= ~XNB_RXBUF_INUSE;
965 	xnbp->x_rx_buf_count--;
966 
967 	kmem_cache_free(xnb_rxbuf_cachep, rxp);
968 }
969 
970 static mblk_t *
971 xnb_recv(xnb_t *xnbp)
972 {
973 	RING_IDX start, end, loop;
974 	gnttab_map_grant_ref_t *mop;
975 	xnb_rxbuf_t **rxpp;
976 	netif_tx_request_t *txreq;
977 	boolean_t work_to_do;
978 	mblk_t *head, *tail;
979 	/*
980 	 * If the peer granted a read-only mapping to the page then we
981 	 * must copy the data, as the local protocol stack (should the
982 	 * packet be destined for this host) will modify the packet
983 	 * 'in place'.
984 	 */
985 	boolean_t copy = !xnbp->x_rx_pages_writable;
986 
987 	/*
988 	 * For each individual request, the sequence of actions is:
989 	 *
990 	 * 1. get the request.
991 	 * 2. map the page based on the grant ref.
992 	 * 3. allocate an mblk, copy the data to it.
993 	 * 4. release the grant.
994 	 * 5. update the ring.
995 	 * 6. pass the packet upward.
996 	 * 7. kick the peer.
997 	 *
998 	 * In fact, we try to perform the grant operations in batches,
999 	 * so there are two loops.
1000 	 */
1001 
1002 	head = tail = NULL;
1003 around:
1004 	ASSERT(MUTEX_HELD(&xnbp->x_rx_lock));
1005 
1006 	/*LINTED: constant in conditional context*/
1007 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->x_tx_ring, work_to_do);
1008 	if (!work_to_do) {
1009 finished:
1010 		xnb_rx_notify_peer(xnbp);
1011 
1012 		return (head);
1013 	}
1014 
1015 	start = xnbp->x_tx_ring.req_cons;
1016 	end = xnbp->x_tx_ring.sring->req_prod;
1017 
1018 	for (loop = start, mop = xnbp->x_rx_mop, rxpp = xnbp->x_rx_bufp;
1019 	    loop != end;
1020 	    loop++, mop++, rxpp++) {
1021 		xnb_rxbuf_t *rxp;
1022 
1023 		rxp = xnb_rxbuf_get(xnbp, KM_NOSLEEP);
1024 		if (rxp == NULL)
1025 			break;
1026 
1027 		ASSERT(xnbp->x_rx_pages_writable ||
1028 		    ((rxp->xr_mop.flags & GNTMAP_readonly)
1029 		    == GNTMAP_readonly));
1030 
1031 		rxp->xr_mop.ref =
1032 		    RING_GET_REQUEST(&xnbp->x_tx_ring, loop)->gref;
1033 
1034 		ASSERT(rxp->xr_mop.ref < NR_GRANT_ENTRIES);
1035 
1036 		*mop = rxp->xr_mop;
1037 		*rxpp = rxp;
1038 	}
1039 
1040 	if ((loop - start) == 0)
1041 		goto finished;
1042 
1043 	end = loop;
1044 
1045 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1046 	    xnbp->x_rx_mop, end - start) != 0) {
1047 
1048 		cmn_err(CE_WARN, "xnb_recv: map grant operation failed");
1049 
1050 		loop = start;
1051 		rxpp = xnbp->x_rx_bufp;
1052 
1053 		while (loop != end) {
1054 			xnb_rxbuf_put(xnbp, *rxpp);
1055 
1056 			loop++;
1057 			rxpp++;
1058 		}
1059 
1060 		goto finished;
1061 	}
1062 
1063 	for (loop = start, mop = xnbp->x_rx_mop, rxpp = xnbp->x_rx_bufp;
1064 	    loop != end;
1065 	    loop++, mop++, rxpp++) {
1066 		mblk_t *mp = NULL;
1067 		int16_t status = NETIF_RSP_OKAY;
1068 		xnb_rxbuf_t *rxp = *rxpp;
1069 
1070 		if (mop->status != 0) {
1071 			cmn_err(CE_WARN, "xnb_recv: "
1072 			    "failed to map buffer: %d",
1073 			    mop->status);
1074 			status = NETIF_RSP_ERROR;
1075 		}
1076 
1077 		txreq = RING_GET_REQUEST(&xnbp->x_tx_ring, loop);
1078 
1079 		if (status == NETIF_RSP_OKAY) {
1080 			if (copy) {
1081 				mp = allocb(txreq->size, BPRI_MED);
1082 				if (mp == NULL) {
1083 					status = NETIF_RSP_ERROR;
1084 					xnbp->x_stat_rx_allocb_failed++;
1085 				} else {
1086 					bcopy((caddr_t)(uintptr_t)
1087 					    mop->host_addr + txreq->offset,
1088 					    mp->b_wptr, txreq->size);
1089 					mp->b_wptr += txreq->size;
1090 				}
1091 			} else {
1092 				mp = desballoc((unsigned char *)(uintptr_t)
1093 				    mop->host_addr + txreq->offset,
1094 				    txreq->size, 0, &rxp->xr_free_rtn);
1095 				if (mp == NULL) {
1096 					status = NETIF_RSP_ERROR;
1097 					xnbp->x_stat_rx_allocb_failed++;
1098 				} else {
1099 					rxp->xr_id = txreq->id;
1100 					rxp->xr_status = status;
1101 					rxp->xr_mop = *mop;
1102 
1103 					mp->b_wptr += txreq->size;
1104 				}
1105 			}
1106 
1107 			/*
1108 			 * If we have a buffer and there are checksum
1109 			 * flags, process them appropriately.
1110 			 */
1111 			if ((mp != NULL) &&
1112 			    ((txreq->flags &
1113 			    (NETTXF_csum_blank | NETTXF_data_validated))
1114 			    != 0)) {
1115 				mp = xnbp->x_flavour->xf_cksum_from_peer(xnbp,
1116 				    mp, txreq->flags);
1117 				xnbp->x_stat_rx_cksum_no_need++;
1118 			}
1119 		}
1120 
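		/*
		 * In the copy case (or on failure) the grant mapping
		 * is no longer needed, so complete the request now.
		 * In the zero-copy case completion is deferred until
		 * the mblk's free routine (xnb_rx_complete) runs.
		 */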
1121 		if (copy || (mp == NULL)) {
1122 			xnb_rx_mark_complete(xnbp, txreq->id, status);
1123 			xnb_rx_schedule_unmop(xnbp, mop);
1124 		}
1125 
1126 		if (mp != NULL) {
1127 			xnbp->x_stat_ipackets++;
1128 			xnbp->x_stat_rbytes += txreq->size;
1129 
1130 			mp->b_next = NULL;
1131 			if (head == NULL) {
1132 				ASSERT(tail == NULL);
1133 				head = mp;
1134 			} else {
1135 				ASSERT(tail != NULL);
1136 				tail->b_next = mp;
1137 			}
1138 			tail = mp;
1139 		}
1140 	}
1141 
1142 	/*
1143 	 * This has to be here rather than in the 'finished' code
1144 	 * because we can only handle NET_TX_RING_SIZE pending unmap
1145 	 * operations, which may be exceeded by multiple trips around
1146 	 * the receive loop during heavy load (one trip around the
1147 	 * loop cannot generate more than NET_TX_RING_SIZE unmap
1148 	 * operations).
1149 	 */
1150 	xnb_rx_perform_pending_unmop(xnbp);
1151 	if (copy) {
1152 		for (loop = start, rxpp = xnbp->x_rx_bufp;
1153 		    loop != end;
1154 		    loop++, rxpp++)
1155 			xnb_rxbuf_put(xnbp, *rxpp);
1156 	}
1157 
1158 	xnbp->x_tx_ring.req_cons = loop;
1159 
1160 	goto around;
1161 	/* NOTREACHED */
1162 }
1163 
1164 /*
1165  *  intr() -- ring interrupt service routine
1166  */
1167 static uint_t
1168 xnb_intr(caddr_t arg)
1169 {
1170 	xnb_t *xnbp = (xnb_t *)arg;
1171 	mblk_t *mp;
1172 
1173 	xnbp->x_stat_intr++;
1174 
1175 	mutex_enter(&xnbp->x_rx_lock);
1176 
1177 	ASSERT(xnbp->x_connected);
1178 
1179 	mp = xnb_recv(xnbp);
1180 
1181 	mutex_exit(&xnbp->x_rx_lock);
1182 
1183 	if (!xnbp->x_hotplugged) {
1184 		xnbp->x_stat_rx_too_early++;
1185 		goto fail;
1186 	}
1187 	if (mp == NULL) {
1188 		xnbp->x_stat_spurious_intr++;
1189 		goto fail;
1190 	}
1191 
1192 	xnbp->x_flavour->xf_recv(xnbp, mp);
1193 
1194 	return (DDI_INTR_CLAIMED);
1195 
1196 fail:
1197 	freemsgchain(mp);
1198 	return (DDI_INTR_CLAIMED);
1199 }
1200 
1201 static boolean_t
1202 xnb_connect_rings(dev_info_t *dip)
1203 {
1204 	xnb_t *xnbp = ddi_get_driver_private(dip);
1205 	char *oename;
1206 	struct gnttab_map_grant_ref map_op;
1207 	evtchn_port_t evtchn;
1208 	int i;
1209 
1210 	/*
1211 	 * Cannot attempt to connect the rings if already connected.
1212 	 */
1213 	ASSERT(!xnbp->x_connected);
1214 
1215 	oename = xvdi_get_oename(dip);
1216 
1217 	if (xenbus_gather(XBT_NULL, oename,
1218 	    "event-channel", "%u", &evtchn,
1219 	    "tx-ring-ref", "%lu", &xnbp->x_tx_ring_ref,
1220 	    "rx-ring-ref", "%lu", &xnbp->x_rx_ring_ref,
1221 	    NULL) != 0) {
1222 		cmn_err(CE_WARN, "xnb_connect_rings: "
1223 		    "cannot read other-end details from %s",
1224 		    oename);
1225 		goto fail;
1226 	}
1227 
1228 	if (xenbus_scanf(XBT_NULL, oename,
1229 	    "feature-tx-writable", "%d", &i) != 0)
1230 		i = 0;
1231 	if (i != 0)
1232 		xnbp->x_rx_pages_writable = B_TRUE;
1233 
1234 	if (xenbus_scanf(XBT_NULL, oename,
1235 	    "feature-no-csum-offload", "%d", &i) != 0)
1236 		i = 0;
1237 	if ((i == 1) || !xnbp->x_cksum_offload)
1238 		xnbp->x_cksum_offload = B_FALSE;
1239 
1240 	/*
1241 	 * 1. allocate a vaddr for the tx page, one for the rx page.
1242 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1243 	 *    into the allocated vaddr (one for tx, one for rx).
1244 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1245 	 *    bound to this domain.
1246 	 * 4. associate the event channel with an interrupt.
1247 	 * 5. declare ourselves connected.
1248 	 * 6. enable the interrupt.
1249 	 */
1250 
1251 	/* 1.tx */
1252 	xnbp->x_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1253 	    0, 0, 0, 0, VM_SLEEP);
1254 	ASSERT(xnbp->x_tx_ring_addr != NULL);
1255 
1256 	/* 2.tx */
1257 	map_op.host_addr = (uint64_t)((long)xnbp->x_tx_ring_addr);
1258 	map_op.flags = GNTMAP_host_map;
1259 	map_op.ref = xnbp->x_tx_ring_ref;
1260 	map_op.dom = xnbp->x_peer;
1261 	hat_prepare_mapping(kas.a_hat, xnbp->x_tx_ring_addr);
1262 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1263 	    &map_op, 1) != 0 || map_op.status != 0) {
1264 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1265 		goto fail;
1266 	}
1267 	xnbp->x_tx_ring_handle = map_op.handle;
1268 
1269 	/*LINTED: constant in conditional context*/
1270 	BACK_RING_INIT(&xnbp->x_tx_ring,
1271 	    (netif_tx_sring_t *)xnbp->x_tx_ring_addr, PAGESIZE);
1272 
1273 	/* 1.rx */
1274 	xnbp->x_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1275 	    0, 0, 0, 0, VM_SLEEP);
1276 	ASSERT(xnbp->x_rx_ring_addr != NULL);
1277 
1278 	/* 2.rx */
1279 	map_op.host_addr = (uint64_t)((long)xnbp->x_rx_ring_addr);
1280 	map_op.flags = GNTMAP_host_map;
1281 	map_op.ref = xnbp->x_rx_ring_ref;
1282 	map_op.dom = xnbp->x_peer;
1283 	hat_prepare_mapping(kas.a_hat, xnbp->x_rx_ring_addr);
1284 	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1285 	    &map_op, 1) != 0 || map_op.status != 0) {
1286 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1287 		goto fail;
1288 	}
1289 	xnbp->x_rx_ring_handle = map_op.handle;
1290 
1291 	/*LINTED: constant in conditional context*/
1292 	BACK_RING_INIT(&xnbp->x_rx_ring,
1293 	    (netif_rx_sring_t *)xnbp->x_rx_ring_addr, PAGESIZE);
1294 
1295 	/* 3 */
1296 	if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS) {
1297 		cmn_err(CE_WARN, "xnb_connect_rings: "
1298 		    "cannot bind event channel %d", evtchn);
1299 		xnbp->x_evtchn = INVALID_EVTCHN;
1300 		goto fail;
1301 	}
1302 	xnbp->x_evtchn = xvdi_get_evtchn(dip);
1303 
1304 	/*
1305 	 * It would be good to set the state to XenbusStateConnected
1306 	 * here as well, but then what if ddi_add_intr() failed?
1307 	 * Changing the state in the store will be noticed by the peer
1308 	 * and cannot be "taken back".
1309 	 */
1310 	mutex_enter(&xnbp->x_tx_lock);
1311 	mutex_enter(&xnbp->x_rx_lock);
1312 
1313 	/* 5.1 */
1314 	xnbp->x_connected = B_TRUE;
1315 
1316 	mutex_exit(&xnbp->x_rx_lock);
1317 	mutex_exit(&xnbp->x_tx_lock);
1318 
1319 	/* 4, 6 */
1320 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1321 	    != DDI_SUCCESS) {
1322 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1323 		goto fail;
1324 	}
1325 	xnbp->x_irq = B_TRUE;
1326 
1327 	/* 5.2 */
1328 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1329 
1330 	return (B_TRUE);
1331 
1332 fail:
1333 	mutex_enter(&xnbp->x_tx_lock);
1334 	mutex_enter(&xnbp->x_rx_lock);
1335 
1336 	xnbp->x_connected = B_FALSE;
1337 
1338 	mutex_exit(&xnbp->x_rx_lock);
1339 	mutex_exit(&xnbp->x_tx_lock);
1340 
1341 	return (B_FALSE);
1342 }
1343 
1344 static void
1345 xnb_disconnect_rings(dev_info_t *dip)
1346 {
1347 	xnb_t *xnbp = ddi_get_driver_private(dip);
1348 
1349 	if (xnbp->x_irq) {
1350 		ddi_remove_intr(dip, 0, NULL);
1351 		xnbp->x_irq = B_FALSE;
1352 	}
1353 
1354 	if (xnbp->x_evtchn != INVALID_EVTCHN) {
1355 		xvdi_free_evtchn(dip);
1356 		xnbp->x_evtchn = INVALID_EVTCHN;
1357 	}
1358 
1359 	if (xnbp->x_rx_ring_handle != INVALID_GRANT_HANDLE) {
1360 		struct gnttab_unmap_grant_ref unmap_op;
1361 
1362 		unmap_op.host_addr = (uint64_t)(uintptr_t)xnbp->x_rx_ring_addr;
1363 		unmap_op.dev_bus_addr = 0;
1364 		unmap_op.handle = xnbp->x_rx_ring_handle;
1365 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1366 		    &unmap_op, 1) != 0)
1367 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1368 			    "cannot unmap rx-ring page (%d)",
1369 			    unmap_op.status);
1370 
1371 		xnbp->x_rx_ring_handle = INVALID_GRANT_HANDLE;
1372 	}
1373 
1374 	if (xnbp->x_rx_ring_addr != NULL) {
1375 		hat_release_mapping(kas.a_hat, xnbp->x_rx_ring_addr);
1376 		vmem_free(heap_arena, xnbp->x_rx_ring_addr, PAGESIZE);
1377 		xnbp->x_rx_ring_addr = NULL;
1378 	}
1379 
1380 	if (xnbp->x_tx_ring_handle != INVALID_GRANT_HANDLE) {
1381 		struct gnttab_unmap_grant_ref unmap_op;
1382 
1383 		unmap_op.host_addr = (uint64_t)(uintptr_t)xnbp->x_tx_ring_addr;
1384 		unmap_op.dev_bus_addr = 0;
1385 		unmap_op.handle = xnbp->x_tx_ring_handle;
1386 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1387 		    &unmap_op, 1) != 0)
1388 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1389 			    "cannot unmap tx-ring page (%d)",
1390 			    unmap_op.status);
1391 
1392 		xnbp->x_tx_ring_handle = INVALID_GRANT_HANDLE;
1393 	}
1394 
1395 	if (xnbp->x_tx_ring_addr != NULL) {
1396 		hat_release_mapping(kas.a_hat, xnbp->x_tx_ring_addr);
1397 		vmem_free(heap_arena, xnbp->x_tx_ring_addr, PAGESIZE);
1398 		xnbp->x_tx_ring_addr = NULL;
1399 	}
1400 }
1401 
1402 /*ARGSUSED*/
1403 static void
1404 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1405     void *arg, void *impl_data)
1406 {
1407 	xnb_t *xnbp = ddi_get_driver_private(dip);
1408 	XenbusState new_state = *(XenbusState *)impl_data;
1409 
1410 	ASSERT(xnbp != NULL);
1411 
1412 	switch (new_state) {
1413 	case XenbusStateConnected:
1414 		if (xnb_connect_rings(dip)) {
1415 			xnbp->x_flavour->xf_peer_connected(xnbp);
1416 		} else {
1417 			xnbp->x_flavour->xf_peer_disconnected(xnbp);
1418 			xnb_disconnect_rings(dip);
1419 			(void) xvdi_switch_state(dip, XBT_NULL,
1420 			    XenbusStateClosed);
1421 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1422 		}
1423 
1424 		/*
1425 		 * Now that we've attempted to connect, it's reasonable
1426 		 * to allow an attempt to detach.
1427 		 */
1428 		xnbp->x_detachable = B_TRUE;
1429 
1430 		break;
1431 
1432 	case XenbusStateClosing:
1433 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1434 
1435 		break;
1436 
1437 	case XenbusStateClosed:
1438 		xnbp->x_flavour->xf_peer_disconnected(xnbp);
1439 
1440 		mutex_enter(&xnbp->x_tx_lock);
1441 		mutex_enter(&xnbp->x_rx_lock);
1442 
1443 		xnb_disconnect_rings(dip);
1444 		xnbp->x_connected = B_FALSE;
1445 
1446 		mutex_exit(&xnbp->x_rx_lock);
1447 		mutex_exit(&xnbp->x_tx_lock);
1448 
1449 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1450 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1451 		/*
1452 		 * In all likelihood this is already set (in the above
1453 		 * case), but if the peer never attempted to connect
1454 		 * and the domain is destroyed we get here without
1455 		 * having been through the case above, so we set it to
1456 		 * be sure.
1457 		 */
1458 		xnbp->x_detachable = B_TRUE;
1459 
1460 		break;
1461 
1462 	default:
1463 		break;
1464 	}
1465 }
1466 
1467 /*ARGSUSED*/
1468 static void
1469 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1470     void *arg, void *impl_data)
1471 {
1472 	xnb_t *xnbp = ddi_get_driver_private(dip);
1473 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
1474 	boolean_t success;
1475 
1476 	ASSERT(xnbp != NULL);
1477 
1478 	switch (state) {
1479 	case Connected:
1480 
1481 		success = xnbp->x_flavour->xf_hotplug_connected(xnbp);
1482 
1483 		mutex_enter(&xnbp->x_tx_lock);
1484 		mutex_enter(&xnbp->x_rx_lock);
1485 
1486 		xnbp->x_hotplugged = success;
1487 
1488 		mutex_exit(&xnbp->x_rx_lock);
1489 		mutex_exit(&xnbp->x_tx_lock);
1490 		break;
1491 
1492 	default:
1493 		break;
1494 	}
1495 }
1496 
1497 static struct modldrv modldrv = {
1498 	&mod_miscops, "xnb module %I%",
1499 };
1500 
1501 static struct modlinkage modlinkage = {
1502 	MODREV_1, &modldrv, NULL
1503 };
1504 
1505 int
1506 _init(void)
1507 {
1508 	int i;
1509 
1510 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
1511 
1512 	xnb_rxbuf_cachep = kmem_cache_create("xnb_rxbuf_cachep",
1513 	    sizeof (xnb_rxbuf_t), 0, xnb_rxbuf_constructor,
1514 	    xnb_rxbuf_destructor, NULL, NULL, NULL, 0);
1515 	ASSERT(xnb_rxbuf_cachep != NULL);
1516 
1517 	i = mod_install(&modlinkage);
1518 	if (i != DDI_SUCCESS) {
1519 		kmem_cache_destroy(xnb_rxbuf_cachep);
1520 		mutex_destroy(&xnb_alloc_page_lock);
1521 	}
1522 	return (i);
1523 }
1524 
1525 int
1526 _info(struct modinfo *modinfop)
1527 {
1528 	return (mod_info(&modlinkage, modinfop));
1529 }
1530 
1531 int
1532 _fini(void)
1533 {
1534 	int i;
1535 
1536 	i = mod_remove(&modlinkage);
1537 	if (i == DDI_SUCCESS) {
1538 		kmem_cache_destroy(xnb_rxbuf_cachep);
1539 		mutex_destroy(&xnb_alloc_page_lock);
1540 	}
1541 	return (i);
1542 }
1543