xref: /illumos-gate/usr/src/uts/common/xen/io/xnb.c (revision 15fa1d047e03d3f123546d72f130c5ce4b278eba)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifdef DEBUG
28 #define	XNB_DEBUG 1
29 #endif /* DEBUG */
30 
31 #include "xnb.h"
32 
33 #include <sys/sunddi.h>
34 #include <sys/sunndi.h>
35 #include <sys/modctl.h>
36 #include <sys/conf.h>
37 #include <sys/mac.h>
38 #include <sys/mac_impl.h> /* For mac_fix_cksum(). */
39 #include <sys/dlpi.h>
40 #include <sys/strsubr.h>
41 #include <sys/strsun.h>
42 #include <sys/types.h>
43 #include <sys/pattr.h>
44 #include <vm/seg_kmem.h>
45 #include <vm/hat_i86.h>
46 #include <xen/sys/xenbus_impl.h>
47 #include <xen/sys/xendev.h>
48 #include <sys/balloon_impl.h>
49 #include <sys/evtchn_impl.h>
50 #include <sys/gnttab.h>
51 #include <vm/vm_dep.h>
52 #include <sys/note.h>
53 #include <sys/gld.h>
54 #include <inet/ip.h>
55 #include <inet/ip_impl.h>
56 
57 /*
58  * The terms "transmit" and "receive" are used in alignment with domU,
59  * which means that packets originating from the peer domU are "transmitted"
60  * to other parts of the system and packets are "received" from them.
61  */
62 
63 /*
64  * Should we allow guests to manipulate multicast group membership?
65  */
66 static boolean_t	xnb_multicast_control = B_TRUE;
67 
68 static boolean_t	xnb_connect_rings(dev_info_t *);
69 static void		xnb_disconnect_rings(dev_info_t *);
70 static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
71     void *, void *);
72 static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
73     void *, void *);
74 
75 static int	xnb_txbuf_constructor(void *, void *, int);
76 static void	xnb_txbuf_destructor(void *, void *);
77 static void	xnb_tx_notify_peer(xnb_t *, boolean_t);
78 static void	xnb_tx_mark_complete(xnb_t *, RING_IDX, int16_t);
79 
80 mblk_t		*xnb_to_peer(xnb_t *, mblk_t *);
81 mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);
82 
83 static void		setup_gop(xnb_t *, gnttab_copy_t *, uchar_t *,
84     size_t, size_t, size_t, grant_ref_t);
85 #pragma inline(setup_gop)
86 static boolean_t	is_foreign(void *);
87 #pragma inline(is_foreign)
88 
89 #define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
90 #define	INVALID_GRANT_REF	((grant_ref_t)-1)
91 
92 static kmutex_t	xnb_alloc_page_lock;
93 
94 /*
95  * On a 32 bit PAE system physical and machine addresses are larger
96  * than 32 bits.  ddi_btop() on such systems take an unsigned long
97  * argument, and so addresses above 4G are truncated before ddi_btop()
98  * gets to see them.  To avoid this, code the shift operation here.
99  */
100 #define	xnb_btop(addr)	((addr) >> PAGESHIFT)
101 
102 /* DMA attributes for transmit and receive data */
103 static ddi_dma_attr_t buf_dma_attr = {
104 	DMA_ATTR_V0,		/* version of this structure */
105 	0,			/* lowest usable address */
106 	0xffffffffffffffffULL,	/* highest usable address */
107 	0x7fffffff,		/* maximum DMAable byte count */
108 	MMU_PAGESIZE,		/* alignment in bytes */
109 	0x7ff,			/* bitmap of burst sizes */
110 	1,			/* minimum transfer */
111 	0xffffffffU,		/* maximum transfer */
112 	0xffffffffffffffffULL,	/* maximum segment length */
113 	1,			/* maximum number of segments */
114 	1,			/* granularity */
115 	0,			/* flags (reserved) */
116 };
117 
118 /* DMA access attributes for data: NOT to be byte swapped. */
119 static ddi_device_acc_attr_t data_accattr = {
120 	DDI_DEVICE_ATTR_V0,
121 	DDI_NEVERSWAP_ACC,
122 	DDI_STRICTORDER_ACC
123 };
124 
125 /*
126  * Statistics.
127  */
128 static const char * const aux_statistics[] = {
129 	"rx_cksum_deferred",
130 	"tx_cksum_no_need",
131 	"rx_rsp_notok",
132 	"tx_notify_deferred",
133 	"tx_notify_sent",
134 	"rx_notify_deferred",
135 	"rx_notify_sent",
136 	"tx_too_early",
137 	"rx_too_early",
138 	"rx_allocb_failed",
139 	"tx_allocb_failed",
140 	"rx_foreign_page",
141 	"mac_full",
142 	"spurious_intr",
143 	"allocation_success",
144 	"allocation_failure",
145 	"small_allocation_success",
146 	"small_allocation_failure",
147 	"other_allocation_failure",
148 	"rx_pageboundary_crossed",
149 	"rx_cpoparea_grown",
150 	"csum_hardware",
151 	"csum_software",
152 	"tx_overflow_page",
153 	"tx_unexpected_flags",
154 };
155 
/*
 * kstat update callback for the "aux_statistics" kstat.
 *
 * Copies the per-instance counters out of the xnb_t into the named
 * kstat entries.  Only KSTAT_READ is supported; writes are rejected
 * with EACCES.
 */
static int
xnb_ks_aux_update(kstat_t *ksp, int flag)
{
	xnb_t *xnbp;
	kstat_named_t *knp;

	if (flag != KSTAT_READ)
		return (EACCES);

	xnbp = ksp->ks_private;
	knp = ksp->ks_data;

	/*
	 * Assignment order should match that of the names in
	 * aux_statistics.
	 */
	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_deferred;
	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_no_need;
	(knp++)->value.ui64 = xnbp->xnb_stat_rx_rsp_notok;
	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
	(knp++)->value.ui64 = xnbp->xnb_stat_rx_foreign_page;
	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
	(knp++)->value.ui64 = xnbp->xnb_stat_rx_pagebndry_crossed;
	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cpoparea_grown;
	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
	(knp++)->value.ui64 = xnbp->xnb_stat_tx_overflow_page;
	(knp++)->value.ui64 = xnbp->xnb_stat_tx_unexpected_flags;

	return (0);
}
200 
201 static boolean_t
202 xnb_ks_init(xnb_t *xnbp)
203 {
204 	int nstat = sizeof (aux_statistics) /
205 	    sizeof (aux_statistics[0]);
206 	const char * const *cp = aux_statistics;
207 	kstat_named_t *knp;
208 
209 	/*
210 	 * Create and initialise kstats.
211 	 */
212 	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
213 	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
214 	    KSTAT_TYPE_NAMED, nstat, 0);
215 	if (xnbp->xnb_kstat_aux == NULL)
216 		return (B_FALSE);
217 
218 	xnbp->xnb_kstat_aux->ks_private = xnbp;
219 	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
220 
221 	knp = xnbp->xnb_kstat_aux->ks_data;
222 	while (nstat > 0) {
223 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
224 
225 		knp++;
226 		cp++;
227 		nstat--;
228 	}
229 
230 	kstat_install(xnbp->xnb_kstat_aux);
231 
232 	return (B_TRUE);
233 }
234 
/*
 * Tear down the kstats created by xnb_ks_init().
 */
static void
xnb_ks_free(xnb_t *xnbp)
{
	kstat_delete(xnbp->xnb_kstat_aux);
}
240 
241 /*
242  * Calculate and insert the transport checksum for an arbitrary packet.
243  */
244 static mblk_t *
245 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
246 {
247 	_NOTE(ARGUNUSED(xnbp));
248 
249 	/*
250 	 * XXPV dme: shouldn't rely on mac_fix_cksum(), not least
251 	 * because it doesn't cover all of the interesting cases :-(
252 	 */
253 	mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);
254 
255 	return (mac_fix_cksum(mp));
256 }
257 
/*
 * Ensure that the packet `mp' will end up with a valid transport
 * checksum, using the MAC hardware capabilities described by `capab'
 * (HCKSUM_* flags) where possible and falling back to software
 * checksumming otherwise.  Returns the (possibly replaced) message.
 *
 * Only single-mblk TCP/IPv4 and UDP/IPv4 packets are candidates for
 * hardware offload; everything else takes the software path.
 */
mblk_t *
xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
{
	struct ether_header *ehp;
	uint16_t sap;
	uint32_t offset;
	ipha_t *ipha;

	ASSERT(mp->b_next == NULL);

	/*
	 * Check that the packet is contained in a single mblk.  In
	 * the "from peer" path this is true today, but may change
	 * when scatter gather support is added.  In the "to peer"
	 * path we cannot be sure, but in most cases it will be true
	 * (in the xnbo case the packet has come from a MAC device
	 * which is unlikely to split packets).
	 */
	if (mp->b_cont != NULL)
		goto software;

	/*
	 * If the MAC has no hardware capability don't do any further
	 * checking.
	 */
	if (capab == 0)
		goto software;

	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
	ehp = (struct ether_header *)mp->b_rptr;

	/* Step over an 802.1Q VLAN tag if one is present. */
	if (ntohs(ehp->ether_type) == VLAN_TPID) {
		struct ether_vlan_header *evhp;

		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
		evhp = (struct ether_vlan_header *)mp->b_rptr;
		sap = ntohs(evhp->ether_type);
		offset = sizeof (struct ether_vlan_header);
	} else {
		sap = ntohs(ehp->ether_type);
		offset = sizeof (struct ether_header);
	}

	/*
	 * We only attempt to do IPv4 packets in hardware.
	 */
	if (sap != ETHERTYPE_IP)
		goto software;

	/*
	 * We know that this is an IPv4 packet.
	 */
	ipha = (ipha_t *)(mp->b_rptr + offset);

	switch (ipha->ipha_protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP: {
		uint32_t start, length, stuff, cksum;
		uint16_t *stuffp;

		/*
		 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
		 * can use full IPv4 and partial checksum offload.
		 */
		if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
			break;

		/*
		 * NOTE(review): `start' assumes a simple (option-less)
		 * IPv4 header; nothing here checks ipha_version_and_hdr_length.
		 */
		start = IP_SIMPLE_HDR_LENGTH;
		length = ntohs(ipha->ipha_length);
		if (ipha->ipha_protocol == IPPROTO_TCP) {
			stuff = start + TCP_CHECKSUM_OFFSET;
			cksum = IP_TCP_CSUM_COMP;
		} else {
			stuff = start + UDP_CHECKSUM_OFFSET;
			cksum = IP_UDP_CSUM_COMP;
		}
		/* Address of the transport checksum field in the packet. */
		stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);

		if (capab & HCKSUM_INET_FULL_V4) {
			/*
			 * Some devices require that the checksum
			 * field of the packet is zero for full
			 * offload.
			 */
			*stuffp = 0;

			mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);

			xnbp->xnb_stat_csum_hardware++;

			return (mp);
		}

		if (capab & HCKSUM_INET_PARTIAL) {
			if (*stuffp == 0) {
				ipaddr_t src, dst;

				/*
				 * Older Solaris guests don't insert
				 * the pseudo-header checksum, so we
				 * calculate it here.
				 */
				src = ipha->ipha_src;
				dst = ipha->ipha_dst;

				cksum += (dst >> 16) + (dst & 0xFFFF);
				cksum += (src >> 16) + (src & 0xFFFF);
				cksum += length - IP_SIMPLE_HDR_LENGTH;

				/* Fold carries twice to get a 16 bit sum. */
				cksum = (cksum >> 16) + (cksum & 0xFFFF);
				cksum = (cksum >> 16) + (cksum & 0xFFFF);

				ASSERT(cksum <= 0xFFFF);

				/* A zero sum is stored as its complement. */
				*stuffp = (uint16_t)(cksum ? cksum : ~cksum);
			}

			mac_hcksum_set(mp, start, stuff, length, 0,
			    HCK_PARTIALCKSUM);

			xnbp->xnb_stat_csum_hardware++;

			return (mp);
		}

		/* NOTREACHED */
		break;
	}

	default:
		/* Use software. */
		break;
	}

software:
	/*
	 * We are not able to use any offload so do the whole thing in
	 * software.
	 */
	xnbp->xnb_stat_csum_software++;

	return (xnb_software_csum(xnbp, mp));
}
401 
402 int
403 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
404 {
405 	xnb_t *xnbp;
406 	char *xsname;
407 	char cachename[32];
408 
409 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
410 
411 	xnbp->xnb_flavour = flavour;
412 	xnbp->xnb_flavour_data = flavour_data;
413 	xnbp->xnb_devinfo = dip;
414 	xnbp->xnb_evtchn = INVALID_EVTCHN;
415 	xnbp->xnb_irq = B_FALSE;
416 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
417 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
418 	xnbp->xnb_connected = B_FALSE;
419 	xnbp->xnb_hotplugged = B_FALSE;
420 	xnbp->xnb_detachable = B_FALSE;
421 	xnbp->xnb_peer = xvdi_get_oeid(dip);
422 	xnbp->xnb_be_status = XNB_STATE_INIT;
423 	xnbp->xnb_fe_status = XNB_STATE_INIT;
424 
425 	xnbp->xnb_tx_buf_count = 0;
426 
427 	xnbp->xnb_rx_hv_copy = B_FALSE;
428 	xnbp->xnb_multicast_control = B_FALSE;
429 
430 	xnbp->xnb_rx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
431 	ASSERT(xnbp->xnb_rx_va != NULL);
432 
433 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
434 	    != DDI_SUCCESS)
435 		goto failure;
436 
437 	/* Allocated on demand, when/if we enter xnb_copy_to_peer(). */
438 	xnbp->xnb_rx_cpop = NULL;
439 	xnbp->xnb_rx_cpop_count = 0;
440 
441 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
442 	    xnbp->xnb_icookie);
443 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
444 	    xnbp->xnb_icookie);
445 	mutex_init(&xnbp->xnb_state_lock, NULL, MUTEX_DRIVER,
446 	    xnbp->xnb_icookie);
447 
448 	/* Set driver private pointer now. */
449 	ddi_set_driver_private(dip, xnbp);
450 
451 	(void) sprintf(cachename, "xnb_tx_buf_cache_%d", ddi_get_instance(dip));
452 	xnbp->xnb_tx_buf_cache = kmem_cache_create(cachename,
453 	    sizeof (xnb_txbuf_t), 0,
454 	    xnb_txbuf_constructor, xnb_txbuf_destructor,
455 	    NULL, xnbp, NULL, 0);
456 	if (xnbp->xnb_tx_buf_cache == NULL)
457 		goto failure_0;
458 
459 	if (!xnb_ks_init(xnbp))
460 		goto failure_1;
461 
462 	/*
463 	 * Receive notification of changes in the state of the
464 	 * driver in the guest domain.
465 	 */
466 	if (xvdi_add_event_handler(dip, XS_OE_STATE, xnb_oe_state_change,
467 	    NULL) != DDI_SUCCESS)
468 		goto failure_2;
469 
470 	/*
471 	 * Receive notification of hotplug events.
472 	 */
473 	if (xvdi_add_event_handler(dip, XS_HP_STATE, xnb_hp_state_change,
474 	    NULL) != DDI_SUCCESS)
475 		goto failure_2;
476 
477 	xsname = xvdi_get_xsname(dip);
478 
479 	if (xenbus_printf(XBT_NULL, xsname,
480 	    "feature-multicast-control", "%d",
481 	    xnb_multicast_control ? 1 : 0) != 0)
482 		goto failure_3;
483 
484 	if (xenbus_printf(XBT_NULL, xsname,
485 	    "feature-rx-copy", "%d",  1) != 0)
486 		goto failure_3;
487 	/*
488 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
489 	 * in addition to "feature-rx-copy" being 1. It seems strange
490 	 * to use four possible states to describe a binary decision,
491 	 * but we might as well play nice.
492 	 */
493 	if (xenbus_printf(XBT_NULL, xsname,
494 	    "feature-rx-flip", "%d", 0) != 0)
495 		goto failure_3;
496 
497 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
498 	(void) xvdi_post_event(dip, XEN_HP_ADD);
499 
500 	return (DDI_SUCCESS);
501 
502 failure_3:
503 	xvdi_remove_event_handler(dip, NULL);
504 
505 failure_2:
506 	xnb_ks_free(xnbp);
507 
508 failure_1:
509 	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
510 
511 failure_0:
512 	mutex_destroy(&xnbp->xnb_state_lock);
513 	mutex_destroy(&xnbp->xnb_rx_lock);
514 	mutex_destroy(&xnbp->xnb_tx_lock);
515 
516 failure:
517 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
518 	kmem_free(xnbp, sizeof (*xnbp));
519 	return (DDI_FAILURE);
520 }
521 
/*
 * Common detach processing: undo everything done in xnb_attach().
 * Callers must already have disconnected from the peer (asserted
 * below) and drained all transmit buffers.
 */
void
xnb_detach(dev_info_t *dip)
{
	xnb_t *xnbp = ddi_get_driver_private(dip);

	ASSERT(xnbp != NULL);
	ASSERT(!xnbp->xnb_connected);
	ASSERT(xnbp->xnb_tx_buf_count == 0);

	xnb_disconnect_rings(dip);

	xvdi_remove_event_handler(dip, NULL);

	xnb_ks_free(xnbp);

	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);

	ddi_set_driver_private(dip, NULL);

	mutex_destroy(&xnbp->xnb_state_lock);
	mutex_destroy(&xnbp->xnb_rx_lock);
	mutex_destroy(&xnbp->xnb_tx_lock);

	/* The copy-operation area is only allocated if it was ever used. */
	if (xnbp->xnb_rx_cpop_count > 0)
		kmem_free(xnbp->xnb_rx_cpop, sizeof (xnbp->xnb_rx_cpop[0])
		    * xnbp->xnb_rx_cpop_count);

	ASSERT(xnbp->xnb_rx_va != NULL);
	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);

	kmem_free(xnbp, sizeof (*xnbp));
}
554 
555 /*
556  * Allocate a page from the hypervisor to be flipped to the peer.
557  *
558  * Try to get pages in batches to reduce the overhead of calls into
559  * the balloon driver.
560  */
561 static mfn_t
562 xnb_alloc_page(xnb_t *xnbp)
563 {
564 #define	WARNING_RATE_LIMIT 100
565 #define	BATCH_SIZE 256
566 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
567 	static int nth = BATCH_SIZE;
568 	mfn_t mfn;
569 
570 	mutex_enter(&xnb_alloc_page_lock);
571 	if (nth == BATCH_SIZE) {
572 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
573 			xnbp->xnb_stat_allocation_failure++;
574 			mutex_exit(&xnb_alloc_page_lock);
575 
576 			/*
577 			 * Try for a single page in low memory situations.
578 			 */
579 			if (balloon_alloc_pages(1, &mfn) != 1) {
580 				if ((xnbp->xnb_stat_small_allocation_failure++
581 				    % WARNING_RATE_LIMIT) == 0)
582 					cmn_err(CE_WARN, "xnb_alloc_page: "
583 					    "Cannot allocate memory to "
584 					    "transfer packets to peer.");
585 				return (0);
586 			} else {
587 				xnbp->xnb_stat_small_allocation_success++;
588 				return (mfn);
589 			}
590 		}
591 
592 		nth = 0;
593 		xnbp->xnb_stat_allocation_success++;
594 	}
595 
596 	mfn = mfns[nth++];
597 	mutex_exit(&xnb_alloc_page_lock);
598 
599 	ASSERT(mfn != 0);
600 
601 	return (mfn);
602 #undef BATCH_SIZE
603 #undef WARNING_RATE_LIMIT
604 }
605 
606 /*
607  * Free a page back to the hypervisor.
608  *
609  * This happens only in the error path, so batching is not worth the
610  * complication.
611  */
612 static void
613 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
614 {
615 	_NOTE(ARGUNUSED(xnbp));
616 	int r;
617 	pfn_t pfn;
618 
619 	pfn = xen_assign_pfn(mfn);
620 	pfnzero(pfn, 0, PAGESIZE);
621 	xen_release_pfn(pfn);
622 
623 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
624 		cmn_err(CE_WARN, "free_page: cannot decrease memory "
625 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
626 		    r, mfn);
627 	}
628 }
629 
630 /*
631  * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but using
632  * local variables. Used in both xnb_to_peer() and xnb_copy_to_peer().
633  */
634 #define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
635 	((((_r)->sring->req_prod - loop) <		\
636 		(RING_SIZE(_r) - (loop - prod))) ?	\
637 	    ((_r)->sring->req_prod - loop) :		\
638 	    (RING_SIZE(_r) - (loop - prod)))
639 
640 /*
641  * Pass packets to the peer using page flipping.
642  */
643 mblk_t *
644 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
645 {
646 	mblk_t *free = mp, *prev = NULL;
647 	size_t len;
648 	gnttab_transfer_t *gop;
649 	boolean_t notify;
650 	RING_IDX loop, prod, end;
651 
652 	/*
653 	 * For each packet the sequence of operations is:
654 	 *
655 	 * 1. get a new page from the hypervisor.
656 	 * 2. get a request slot from the ring.
657 	 * 3. copy the data into the new page.
658 	 * 4. transfer the page to the peer.
659 	 * 5. update the request slot.
660 	 * 6. kick the peer.
661 	 * 7. free mp.
662 	 *
663 	 * In order to reduce the number of hypercalls, we prepare
664 	 * several packets for the peer and perform a single hypercall
665 	 * to transfer them.
666 	 */
667 
668 	len = 0;
669 	mutex_enter(&xnbp->xnb_rx_lock);
670 
671 	/*
672 	 * If we are not connected to the peer or have not yet
673 	 * finished hotplug it is too early to pass packets to the
674 	 * peer.
675 	 */
676 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
677 		mutex_exit(&xnbp->xnb_rx_lock);
678 		DTRACE_PROBE(flip_rx_too_early);
679 		xnbp->xnb_stat_rx_too_early++;
680 		return (mp);
681 	}
682 
683 	loop = xnbp->xnb_rx_ring.req_cons;
684 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
685 	gop = xnbp->xnb_rx_top;
686 
687 	while ((mp != NULL) &&
688 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
689 
690 		mfn_t mfn;
691 		pfn_t pfn;
692 		netif_rx_request_t *rxreq;
693 		netif_rx_response_t *rxresp;
694 		char *valoop;
695 		mblk_t *ml;
696 		uint16_t cksum_flags;
697 
698 		/* 1 */
699 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
700 			xnbp->xnb_stat_rx_defer++;
701 			break;
702 		}
703 
704 		/* 2 */
705 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
706 
707 #ifdef XNB_DEBUG
708 		if (!(rxreq->id < NET_RX_RING_SIZE))
709 			cmn_err(CE_PANIC, "xnb_to_peer: "
710 			    "id %d out of range in request 0x%p",
711 			    rxreq->id, (void *)rxreq);
712 #endif /* XNB_DEBUG */
713 
714 		/* Assign a pfn and map the new page at the allocated va. */
715 		pfn = xen_assign_pfn(mfn);
716 		hat_devload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
717 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
718 
719 		/* 3 */
720 		len = 0;
721 		valoop = xnbp->xnb_rx_va;
722 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
723 			size_t chunk = ml->b_wptr - ml->b_rptr;
724 
725 			bcopy(ml->b_rptr, valoop, chunk);
726 			valoop += chunk;
727 			len += chunk;
728 		}
729 
730 		ASSERT(len < PAGESIZE);
731 
732 		/* Release the pfn. */
733 		hat_unload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
734 		    HAT_UNLOAD_UNMAP);
735 		xen_release_pfn(pfn);
736 
737 		/* 4 */
738 		gop->mfn = mfn;
739 		gop->domid = xnbp->xnb_peer;
740 		gop->ref = rxreq->gref;
741 
742 		/* 5.1 */
743 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
744 		rxresp->offset = 0;
745 		rxresp->flags = 0;
746 
747 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
748 		if (cksum_flags != 0)
749 			xnbp->xnb_stat_rx_cksum_deferred++;
750 		rxresp->flags |= cksum_flags;
751 
752 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
753 		rxresp->status = len;
754 
755 		loop++;
756 		prod++;
757 		gop++;
758 		prev = mp;
759 		mp = mp->b_next;
760 	}
761 
762 	/*
763 	 * Did we actually do anything?
764 	 */
765 	if (loop == xnbp->xnb_rx_ring.req_cons) {
766 		mutex_exit(&xnbp->xnb_rx_lock);
767 		return (mp);
768 	}
769 
770 	end = loop;
771 
772 	/*
773 	 * Unlink the end of the 'done' list from the remainder.
774 	 */
775 	ASSERT(prev != NULL);
776 	prev->b_next = NULL;
777 
778 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_rx_top,
779 	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
780 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
781 	}
782 
783 	loop = xnbp->xnb_rx_ring.req_cons;
784 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
785 	gop = xnbp->xnb_rx_top;
786 
787 	while (loop < end) {
788 		int16_t status = NETIF_RSP_OKAY;
789 
790 		if (gop->status != 0) {
791 			status = NETIF_RSP_ERROR;
792 
793 			/*
794 			 * If the status is anything other than
795 			 * GNTST_bad_page then we don't own the page
796 			 * any more, so don't try to give it back.
797 			 */
798 			if (gop->status != GNTST_bad_page)
799 				gop->mfn = 0;
800 		} else {
801 			/* The page is no longer ours. */
802 			gop->mfn = 0;
803 		}
804 
805 		if (gop->mfn != 0)
806 			/*
807 			 * Give back the page, as we won't be using
808 			 * it.
809 			 */
810 			xnb_free_page(xnbp, gop->mfn);
811 		else
812 			/*
813 			 * We gave away a page, update our accounting
814 			 * now.
815 			 */
816 			balloon_drv_subtracted(1);
817 
818 		/* 5.2 */
819 		if (status != NETIF_RSP_OKAY) {
820 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
821 			    status;
822 		} else {
823 			xnbp->xnb_stat_ipackets++;
824 			xnbp->xnb_stat_rbytes += len;
825 		}
826 
827 		loop++;
828 		prod++;
829 		gop++;
830 	}
831 
832 	xnbp->xnb_rx_ring.req_cons = loop;
833 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
834 
835 	/* 6 */
836 	/* LINTED: constant in conditional context */
837 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
838 	if (notify) {
839 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
840 		xnbp->xnb_stat_rx_notify_sent++;
841 	} else {
842 		xnbp->xnb_stat_rx_notify_deferred++;
843 	}
844 
845 	if (mp != NULL)
846 		xnbp->xnb_stat_rx_defer++;
847 
848 	mutex_exit(&xnbp->xnb_rx_lock);
849 
850 	/* Free mblk_t's that we consumed. */
851 	freemsgchain(free);
852 
853 	return (mp);
854 }
855 
856 /* Helper functions for xnb_copy_to_peer(). */
857 
858 /*
859  * Grow the array of copy operation descriptors.
860  */
861 static boolean_t
862 grow_cpop_area(xnb_t *xnbp)
863 {
864 	size_t count;
865 	gnttab_copy_t *new;
866 
867 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
868 
869 	count = xnbp->xnb_rx_cpop_count + CPOP_DEFCNT;
870 
871 	if ((new = kmem_alloc(sizeof (new[0]) * count, KM_NOSLEEP)) == NULL) {
872 		xnbp->xnb_stat_other_allocation_failure++;
873 		return (B_FALSE);
874 	}
875 
876 	bcopy(xnbp->xnb_rx_cpop, new,
877 	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
878 
879 	kmem_free(xnbp->xnb_rx_cpop,
880 	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
881 
882 	xnbp->xnb_rx_cpop = new;
883 	xnbp->xnb_rx_cpop_count = count;
884 
885 	xnbp->xnb_stat_rx_cpoparea_grown++;
886 
887 	return (B_TRUE);
888 }
889 
890 /*
891  * Check whether an address is on a page that's foreign to this domain.
892  */
893 static boolean_t
894 is_foreign(void *addr)
895 {
896 	pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
897 
898 	return ((pfn & PFN_IS_FOREIGN_MFN) == PFN_IS_FOREIGN_MFN);
899 }
900 
901 /*
902  * Insert a newly allocated mblk into a chain, replacing the old one.
903  */
904 static mblk_t *
905 replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
906 {
907 	uint32_t	start, stuff, end, value, flags;
908 	mblk_t		*new_mp;
909 
910 	new_mp = copyb(mp);
911 	if (new_mp == NULL) {
912 		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message"
913 		    "for %p, len %lu", (void *) mp, len);
914 	}
915 
916 	mac_hcksum_get(mp, &start, &stuff, &end, &value, &flags);
917 	mac_hcksum_set(new_mp, start, stuff, end, value, flags);
918 
919 	new_mp->b_next = mp->b_next;
920 	new_mp->b_prev = mp->b_prev;
921 	new_mp->b_cont = mp->b_cont;
922 
923 	/* Make sure we only overwrite pointers to the mblk being replaced. */
924 	if (mp_prev != NULL && mp_prev->b_next == mp)
925 		mp_prev->b_next = new_mp;
926 
927 	if (ml_prev != NULL && ml_prev->b_cont == mp)
928 		ml_prev->b_cont = new_mp;
929 
930 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
931 	freemsg(mp);
932 
933 	return (new_mp);
934 }
935 
936 /*
937  * Set all the fields in a gnttab_copy_t.
938  */
939 static void
940 setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
941     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
942 {
943 	ASSERT(xnbp != NULL && gp != NULL);
944 
945 	gp->source.offset = s_off;
946 	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
947 	gp->source.domid = DOMID_SELF;
948 
949 	gp->len = (uint16_t)len;
950 	gp->flags = GNTCOPY_dest_gref;
951 	gp->status = 0;
952 
953 	gp->dest.u.ref = d_ref;
954 	gp->dest.offset = d_off;
955 	gp->dest.domid = xnbp->xnb_peer;
956 }
957 
958 /*
959  * Pass packets to the peer using hypervisor copy operations.
960  */
961 mblk_t *
962 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
963 {
964 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
965 	mblk_t		*ml, *ml_prev;
966 	boolean_t	notify;
967 	RING_IDX	loop, prod;
968 	int		i;
969 
970 	/*
971 	 * If the peer does not pre-post buffers for received packets,
972 	 * use page flipping to pass packets to it.
973 	 */
974 	if (!xnbp->xnb_rx_hv_copy)
975 		return (xnb_to_peer(xnbp, mp));
976 
977 	/*
978 	 * For each packet the sequence of operations is:
979 	 *
980 	 *  1. get a request slot from the ring.
981 	 *  2. set up data for hypercall (see NOTE below)
982 	 *  3. have the hypervisor copy the data
983 	 *  4. update the request slot.
984 	 *  5. kick the peer.
985 	 *
986 	 * NOTE ad 2.
987 	 *  In order to reduce the number of hypercalls, we prepare
988 	 *  several mblks (mp->b_cont != NULL) for the peer and
989 	 *  perform a single hypercall to transfer them.  We also have
990 	 *  to set up a separate copy operation for every page.
991 	 *
992 	 * If we have more than one packet (mp->b_next != NULL), we do
993 	 * this whole dance repeatedly.
994 	 */
995 
996 	mutex_enter(&xnbp->xnb_rx_lock);
997 
998 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
999 		mutex_exit(&xnbp->xnb_rx_lock);
1000 		DTRACE_PROBE(copy_rx_too_early);
1001 		xnbp->xnb_stat_rx_too_early++;
1002 		return (mp);
1003 	}
1004 
1005 	loop = xnbp->xnb_rx_ring.req_cons;
1006 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
1007 
1008 	while ((mp != NULL) &&
1009 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
1010 		netif_rx_request_t	*rxreq;
1011 		size_t			d_offset, len;
1012 		int			item_count;
1013 		gnttab_copy_t		*gop_cp;
1014 		netif_rx_response_t	*rxresp;
1015 		uint16_t		cksum_flags;
1016 		int16_t			status = NETIF_RSP_OKAY;
1017 
1018 		/* 1 */
1019 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
1020 
1021 #ifdef XNB_DEBUG
1022 		if (!(rxreq->id < NET_RX_RING_SIZE))
1023 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
1024 			    "id %d out of range in request 0x%p",
1025 			    rxreq->id, (void *)rxreq);
1026 #endif /* XNB_DEBUG */
1027 
1028 		/* 2 */
1029 		d_offset = 0;
1030 		len = 0;
1031 		item_count = 0;
1032 
1033 		gop_cp = xnbp->xnb_rx_cpop;
1034 
1035 		/*
1036 		 * We walk the b_cont pointers and set up a
1037 		 * gnttab_copy_t for each sub-page chunk in each data
1038 		 * block.
1039 		 */
1040 		/* 2a */
1041 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
1042 			size_t	chunk = ml->b_wptr - ml->b_rptr;
1043 			uchar_t	*r_tmp,	*rpt_align;
1044 			size_t	r_offset;
1045 
1046 			/*
1047 			 * The hypervisor will not allow us to
1048 			 * reference a foreign page (e.g. one
1049 			 * belonging to another domain) by mfn in the
1050 			 * copy operation. If the data in this mblk is
1051 			 * on such a page we must copy the data into a
1052 			 * local page before initiating the hypervisor
1053 			 * copy operation.
1054 			 */
1055 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
1056 				mblk_t *ml_new = replace_msg(ml, chunk,
1057 				    mp_prev, ml_prev);
1058 
1059 				/* We can still use old ml, but not *ml! */
1060 				if (free == ml)
1061 					free = ml_new;
1062 				if (mp == ml)
1063 					mp = ml_new;
1064 				ml = ml_new;
1065 
1066 				xnbp->xnb_stat_rx_foreign_page++;
1067 			}
1068 
1069 			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
1070 			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
1071 			r_tmp = ml->b_rptr;
1072 
1073 			if (d_offset + chunk > PAGESIZE)
1074 				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
1075 				    "(svd: %p), ml %p,rpt_alg. %p, d_offset "
1076 				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
1077 				    (void *)mp, (void *)saved_mp, (void *)ml,
1078 				    (void *)rpt_align,
1079 				    d_offset, chunk, (int)PAGESIZE);
1080 
1081 			while (chunk > 0) {
1082 				size_t part_len;
1083 
1084 				if (item_count == xnbp->xnb_rx_cpop_count) {
1085 					if (!grow_cpop_area(xnbp))
1086 						goto failure;
1087 					gop_cp = &xnbp->xnb_rx_cpop[item_count];
1088 				}
1089 				/*
1090 				 * If our mblk crosses a page boundary, we need
1091 				 * to do a separate copy for each page.
1092 				 */
1093 				if (r_offset + chunk > PAGESIZE) {
1094 					part_len = PAGESIZE - r_offset;
1095 
1096 					DTRACE_PROBE3(mblk_page_crossed,
1097 					    (mblk_t *), ml, int, chunk, int,
1098 					    (int)r_offset);
1099 
1100 					xnbp->xnb_stat_rx_pagebndry_crossed++;
1101 				} else {
1102 					part_len = chunk;
1103 				}
1104 
1105 				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
1106 				    d_offset, part_len, rxreq->gref);
1107 
1108 				chunk -= part_len;
1109 
1110 				len += part_len;
1111 				d_offset += part_len;
1112 				r_tmp += part_len;
1113 				/*
1114 				 * The 2nd, 3rd ... last copies will always
1115 				 * start at r_tmp, therefore r_offset is 0.
1116 				 */
1117 				r_offset = 0;
1118 				gop_cp++;
1119 				item_count++;
1120 			}
1121 			ml_prev = ml;
1122 
1123 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
1124 			    chunk, int, len, int, item_count);
1125 		}
1126 		/* 3 */
1127 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_rx_cpop,
1128 		    item_count) != 0) {
1129 			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
1130 			DTRACE_PROBE(HV_granttableopfailed);
1131 		}
1132 
1133 		/* 4 */
1134 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
1135 		rxresp->offset = 0;
1136 
1137 		rxresp->flags = 0;
1138 
1139 		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
1140 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1141 		    (int)rxresp->status);
1142 
1143 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
1144 		if (cksum_flags != 0)
1145 			xnbp->xnb_stat_rx_cksum_deferred++;
1146 		rxresp->flags |= cksum_flags;
1147 
1148 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
1149 		rxresp->status = len;
1150 
1151 		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
1152 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1153 		    (int)rxresp->status);
1154 
1155 		for (i = 0; i < item_count; i++) {
1156 			if (xnbp->xnb_rx_cpop[i].status != 0) {
1157 				DTRACE_PROBE2(cpop_status_nonnull, int,
1158 				    (int)xnbp->xnb_rx_cpop[i].status,
1159 				    int, i);
1160 				status = NETIF_RSP_ERROR;
1161 			}
1162 		}
1163 
1164 		/* 5.2 */
1165 		if (status != NETIF_RSP_OKAY) {
1166 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
1167 			    status;
1168 			xnbp->xnb_stat_rx_rsp_notok++;
1169 		} else {
1170 			xnbp->xnb_stat_ipackets++;
1171 			xnbp->xnb_stat_rbytes += len;
1172 		}
1173 
1174 		loop++;
1175 		prod++;
1176 		mp_prev = mp;
1177 		mp = mp->b_next;
1178 	}
1179 failure:
1180 	/*
1181 	 * Did we actually do anything?
1182 	 */
1183 	if (loop == xnbp->xnb_rx_ring.req_cons) {
1184 		mutex_exit(&xnbp->xnb_rx_lock);
1185 		return (mp);
1186 	}
1187 
1188 	/*
1189 	 * Unlink the end of the 'done' list from the remainder.
1190 	 */
1191 	ASSERT(mp_prev != NULL);
1192 	mp_prev->b_next = NULL;
1193 
1194 	xnbp->xnb_rx_ring.req_cons = loop;
1195 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
1196 
1197 	/* 6 */
1198 	/* LINTED: constant in conditional context */
1199 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
1200 	if (notify) {
1201 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1202 		xnbp->xnb_stat_rx_notify_sent++;
1203 	} else {
1204 		xnbp->xnb_stat_rx_notify_deferred++;
1205 	}
1206 
1207 	if (mp != NULL)
1208 		xnbp->xnb_stat_rx_defer++;
1209 
1210 	mutex_exit(&xnbp->xnb_rx_lock);
1211 
1212 	/* Free mblk_t structs we have consumed. */
1213 	freemsgchain(free);
1214 
1215 	return (mp);
1216 }
1217 
1218 
1219 static void
1220 xnb_tx_notify_peer(xnb_t *xnbp, boolean_t force)
1221 {
1222 	boolean_t notify;
1223 
1224 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1225 
1226 	/* LINTED: constant in conditional context */
1227 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
1228 	if (notify || force) {
1229 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1230 		xnbp->xnb_stat_tx_notify_sent++;
1231 	} else {
1232 		xnbp->xnb_stat_tx_notify_deferred++;
1233 	}
1234 }
1235 
1236 static void
1237 xnb_tx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
1238 {
1239 	RING_IDX i;
1240 	netif_tx_response_t *txresp;
1241 
1242 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1243 
1244 	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
1245 
1246 	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
1247 	txresp->id = id;
1248 	txresp->status = status;
1249 
1250 	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
1251 
1252 	/*
1253 	 * Note that we don't push the change to the peer here - that
1254 	 * is the callers responsibility.
1255 	 */
1256 }
1257 
/*
 * Free routine installed in xt_free_rtn by xnb_txbuf_constructor()
 * and invoked when the mblk created by desballoc() is freed: return
 * the transmit buffer to its kmem cache and note that one fewer
 * buffer is out on loan.
 */
static void
xnb_txbuf_recycle(xnb_txbuf_t *txp)
{
	xnb_t *xnbp = txp->xt_xnbp;

	kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);

	/*
	 * NOTE(review): unlike xnb_tx_buf_count (maintained with
	 * atomic_inc_32()/atomic_dec_32() in the constructor and
	 * destructor) this counter is updated without an atomic
	 * operation - presumably serialized elsewhere; confirm.
	 */
	xnbp->xnb_tx_buf_outstanding--;
}
1267 
/*
 * kmem cache constructor for transmit buffers: allocate a page of
 * DMA-able memory for the buffer, bind it and record the MFN backing
 * it (xt_mfn), which xnb_from_peer() later uses as the destination of
 * a hypervisor grant-copy operation.  Returns 0 on success, -1 on
 * failure; resources acquired before a failing step are released in
 * reverse order via the goto chain.
 */
static int
xnb_txbuf_constructor(void *buf, void *arg, int kmflag)
{
	_NOTE(ARGUNUSED(kmflag));
	xnb_txbuf_t *txp = buf;
	xnb_t *xnbp = arg;
	size_t len;
	ddi_dma_cookie_t dma_cookie;
	uint_t ncookies;

	/* Free routine invoked when the associated mblk is freed. */
	txp->xt_free_rtn.free_func = xnb_txbuf_recycle;
	txp->xt_free_rtn.free_arg = (caddr_t)txp;
	txp->xt_xnbp = xnbp;
	txp->xt_next = NULL;

	if (ddi_dma_alloc_handle(xnbp->xnb_devinfo, &buf_dma_attr,
	    0, 0, &txp->xt_dma_handle) != DDI_SUCCESS)
		goto failure;

	if (ddi_dma_mem_alloc(txp->xt_dma_handle, PAGESIZE, &data_accattr,
	    DDI_DMA_STREAMING, 0, 0, &txp->xt_buf, &len,
	    &txp->xt_acc_handle) != DDI_SUCCESS)
		goto failure_1;

	if (ddi_dma_addr_bind_handle(txp->xt_dma_handle, NULL, txp->xt_buf,
	    len, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, 0,
	    &dma_cookie, &ncookies)
	    != DDI_DMA_MAPPED)
		goto failure_2;
	/* A single page must be covered by a single cookie. */
	ASSERT(ncookies == 1);

	txp->xt_mfn = xnb_btop(dma_cookie.dmac_laddress);
	txp->xt_buflen = dma_cookie.dmac_size;

	DTRACE_PROBE(txbuf_allocated);

	atomic_inc_32(&xnbp->xnb_tx_buf_count);
	/*
	 * NOTE(review): this counter is updated without an atomic
	 * operation, unlike xnb_tx_buf_count above - presumably
	 * serialized elsewhere; confirm.
	 */
	xnbp->xnb_tx_buf_outstanding++;

	return (0);

failure_2:
	ddi_dma_mem_free(&txp->xt_acc_handle);

failure_1:
	ddi_dma_free_handle(&txp->xt_dma_handle);

failure:

	return (-1);
}
1319 
/*
 * kmem cache destructor for transmit buffers: release the DMA
 * resources acquired by xnb_txbuf_constructor(), in the reverse of
 * the order in which they were acquired, and drop the buffer count.
 */
static void
xnb_txbuf_destructor(void *buf, void *arg)
{
	xnb_txbuf_t *txp = buf;
	xnb_t *xnbp = arg;

	(void) ddi_dma_unbind_handle(txp->xt_dma_handle);
	ddi_dma_mem_free(&txp->xt_acc_handle);
	ddi_dma_free_handle(&txp->xt_dma_handle);

	atomic_dec_32(&xnbp->xnb_tx_buf_count);
}
1332 
1333 /*
1334  * Take packets from the peer and deliver them onward.
1335  */
1336 static mblk_t *
1337 xnb_from_peer(xnb_t *xnbp)
1338 {
1339 	RING_IDX start, end, loop;
1340 	gnttab_copy_t *cop;
1341 	xnb_txbuf_t **txpp;
1342 	netif_tx_request_t *txreq;
1343 	boolean_t work_to_do, need_notify = B_FALSE;
1344 	mblk_t *head, *tail;
1345 	int n_data_req, i;
1346 
1347 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1348 
1349 	head = tail = NULL;
1350 around:
1351 
1352 	/* LINTED: constant in conditional context */
1353 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
1354 	if (!work_to_do) {
1355 finished:
1356 		xnb_tx_notify_peer(xnbp, need_notify);
1357 
1358 		return (head);
1359 	}
1360 
1361 	start = xnbp->xnb_tx_ring.req_cons;
1362 	end = xnbp->xnb_tx_ring.sring->req_prod;
1363 
1364 	if ((end - start) > NET_TX_RING_SIZE) {
1365 		/*
1366 		 * This usually indicates that the frontend driver is
1367 		 * misbehaving, as it's not possible to have more than
1368 		 * NET_TX_RING_SIZE ring elements in play at any one
1369 		 * time.
1370 		 *
1371 		 * We reset the ring pointers to the state declared by
1372 		 * the frontend and try to carry on.
1373 		 */
1374 		cmn_err(CE_WARN, "xnb_from_peer: domain %d tried to give us %u "
1375 		    "items in the ring, resetting and trying to recover.",
1376 		    xnbp->xnb_peer, (end - start));
1377 
1378 		/* LINTED: constant in conditional context */
1379 		BACK_RING_ATTACH(&xnbp->xnb_tx_ring,
1380 		    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1381 
1382 		goto around;
1383 	}
1384 
1385 	loop = start;
1386 	cop = xnbp->xnb_tx_cop;
1387 	txpp = xnbp->xnb_tx_bufp;
1388 	n_data_req = 0;
1389 
1390 	while (loop < end) {
1391 		static const uint16_t acceptable_flags =
1392 		    NETTXF_csum_blank |
1393 		    NETTXF_data_validated |
1394 		    NETTXF_extra_info;
1395 		uint16_t unexpected_flags;
1396 
1397 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1398 
1399 		unexpected_flags = txreq->flags & ~acceptable_flags;
1400 		if (unexpected_flags != 0) {
1401 			/*
1402 			 * The peer used flag bits that we do not
1403 			 * recognize.
1404 			 */
1405 			cmn_err(CE_WARN, "xnb_from_peer: "
1406 			    "unexpected flag bits (0x%x) from peer "
1407 			    "in transmit request",
1408 			    unexpected_flags);
1409 			xnbp->xnb_stat_tx_unexpected_flags++;
1410 
1411 			/* Mark this entry as failed. */
1412 			xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
1413 			need_notify = B_TRUE;
1414 
1415 		} else if (txreq->flags & NETTXF_extra_info) {
1416 			struct netif_extra_info *erp;
1417 			boolean_t status;
1418 
1419 			loop++; /* Consume another slot in the ring. */
1420 			ASSERT(loop <= end);
1421 
1422 			erp = (struct netif_extra_info *)
1423 			    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1424 
1425 			switch (erp->type) {
1426 			case XEN_NETIF_EXTRA_TYPE_MCAST_ADD:
1427 				ASSERT(xnbp->xnb_multicast_control);
1428 				status = xnbp->xnb_flavour->xf_mcast_add(xnbp,
1429 				    &erp->u.mcast.addr);
1430 				break;
1431 			case XEN_NETIF_EXTRA_TYPE_MCAST_DEL:
1432 				ASSERT(xnbp->xnb_multicast_control);
1433 				status = xnbp->xnb_flavour->xf_mcast_del(xnbp,
1434 				    &erp->u.mcast.addr);
1435 				break;
1436 			default:
1437 				status = B_FALSE;
1438 				cmn_err(CE_WARN, "xnb_from_peer: "
1439 				    "unknown extra type %d", erp->type);
1440 				break;
1441 			}
1442 
1443 			xnb_tx_mark_complete(xnbp, txreq->id,
1444 			    status ? NETIF_RSP_OKAY : NETIF_RSP_ERROR);
1445 			need_notify = B_TRUE;
1446 
1447 		} else if ((txreq->offset > PAGESIZE) ||
1448 		    (txreq->offset + txreq->size > PAGESIZE)) {
1449 			/*
1450 			 * Peer attempted to refer to data beyond the
1451 			 * end of the granted page.
1452 			 */
1453 			cmn_err(CE_WARN, "xnb_from_peer: "
1454 			    "attempt to refer beyond the end of granted "
1455 			    "page in txreq (offset %d, size %d).",
1456 			    txreq->offset, txreq->size);
1457 			xnbp->xnb_stat_tx_overflow_page++;
1458 
1459 			/* Mark this entry as failed. */
1460 			xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
1461 			need_notify = B_TRUE;
1462 
1463 		} else {
1464 			xnb_txbuf_t *txp;
1465 
1466 			txp = kmem_cache_alloc(xnbp->xnb_tx_buf_cache,
1467 			    KM_NOSLEEP);
1468 			if (txp == NULL)
1469 				break;
1470 
1471 			txp->xt_mblk = desballoc((unsigned char *)txp->xt_buf,
1472 			    txp->xt_buflen, 0, &txp->xt_free_rtn);
1473 			if (txp->xt_mblk == NULL) {
1474 				kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
1475 				break;
1476 			}
1477 
1478 			txp->xt_idx = loop;
1479 			txp->xt_id = txreq->id;
1480 
1481 			cop->source.u.ref = txreq->gref;
1482 			cop->source.domid = xnbp->xnb_peer;
1483 			cop->source.offset = txreq->offset;
1484 
1485 			cop->dest.u.gmfn = txp->xt_mfn;
1486 			cop->dest.domid = DOMID_SELF;
1487 			cop->dest.offset = 0;
1488 
1489 			cop->len = txreq->size;
1490 			cop->flags = GNTCOPY_source_gref;
1491 			cop->status = 0;
1492 
1493 			*txpp = txp;
1494 
1495 			txpp++;
1496 			cop++;
1497 			n_data_req++;
1498 
1499 			ASSERT(n_data_req <= NET_TX_RING_SIZE);
1500 		}
1501 
1502 		loop++;
1503 	}
1504 
1505 	xnbp->xnb_tx_ring.req_cons = loop;
1506 
1507 	if (n_data_req == 0)
1508 		goto around;
1509 
1510 	if (HYPERVISOR_grant_table_op(GNTTABOP_copy,
1511 	    xnbp->xnb_tx_cop, n_data_req) != 0) {
1512 
1513 		cmn_err(CE_WARN, "xnb_from_peer: copy operation failed");
1514 
1515 		txpp = xnbp->xnb_tx_bufp;
1516 		i = n_data_req;
1517 		while (i > 0) {
1518 			kmem_cache_free(xnbp->xnb_tx_buf_cache, *txpp);
1519 			txpp++;
1520 			i--;
1521 		}
1522 
1523 		goto finished;
1524 	}
1525 
1526 	txpp = xnbp->xnb_tx_bufp;
1527 	cop = xnbp->xnb_tx_cop;
1528 	i = n_data_req;
1529 
1530 	while (i > 0) {
1531 		xnb_txbuf_t *txp = *txpp;
1532 
1533 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, txp->xt_idx);
1534 
1535 		if (cop->status != 0) {
1536 #ifdef XNB_DEBUG
1537 			cmn_err(CE_WARN, "xnb_from_peer: "
1538 			    "txpp 0x%p failed (%d)",
1539 			    (void *)*txpp, cop->status);
1540 #endif /* XNB_DEBUG */
1541 			xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_ERROR);
1542 			freemsg(txp->xt_mblk);
1543 		} else {
1544 			mblk_t *mp;
1545 
1546 			mp = txp->xt_mblk;
1547 			mp->b_rptr = mp->b_wptr = (unsigned char *)txp->xt_buf;
1548 			mp->b_wptr += txreq->size;
1549 			mp->b_next = NULL;
1550 
1551 			/*
1552 			 * If there are checksum flags, process them
1553 			 * appropriately.
1554 			 */
1555 			if ((txreq->flags &
1556 			    (NETTXF_csum_blank | NETTXF_data_validated))
1557 			    != 0) {
1558 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
1559 				    mp, txreq->flags);
1560 				xnbp->xnb_stat_tx_cksum_no_need++;
1561 
1562 				txp->xt_mblk = mp;
1563 			}
1564 
1565 			if (head == NULL) {
1566 				ASSERT(tail == NULL);
1567 				head = mp;
1568 			} else {
1569 				ASSERT(tail != NULL);
1570 				tail->b_next = mp;
1571 			}
1572 			tail = mp;
1573 
1574 			xnbp->xnb_stat_opackets++;
1575 			xnbp->xnb_stat_obytes += txreq->size;
1576 
1577 			xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_OKAY);
1578 		}
1579 
1580 		txpp++;
1581 		cop++;
1582 		i--;
1583 	}
1584 
1585 	goto around;
1586 	/* NOTREACHED */
1587 }
1588 
1589 static uint_t
1590 xnb_intr(caddr_t arg)
1591 {
1592 	xnb_t *xnbp = (xnb_t *)arg;
1593 	mblk_t *mp;
1594 
1595 	xnbp->xnb_stat_intr++;
1596 
1597 	mutex_enter(&xnbp->xnb_tx_lock);
1598 
1599 	ASSERT(xnbp->xnb_connected);
1600 
1601 	mp = xnb_from_peer(xnbp);
1602 
1603 	mutex_exit(&xnbp->xnb_tx_lock);
1604 
1605 	if (!xnbp->xnb_hotplugged) {
1606 		xnbp->xnb_stat_tx_too_early++;
1607 		goto fail;
1608 	}
1609 	if (mp == NULL) {
1610 		xnbp->xnb_stat_spurious_intr++;
1611 		goto fail;
1612 	}
1613 
1614 	xnbp->xnb_flavour->xf_from_peer(xnbp, mp);
1615 
1616 	return (DDI_INTR_CLAIMED);
1617 
1618 fail:
1619 	freemsgchain(mp);
1620 	return (DDI_INTR_CLAIMED);
1621 }
1622 
1623 /*
1624  * Read our configuration from xenstore.
1625  */
1626 boolean_t
1627 xnb_read_xs_config(xnb_t *xnbp)
1628 {
1629 	char *xsname;
1630 	char mac[ETHERADDRL * 3];
1631 
1632 	xsname = xvdi_get_xsname(xnbp->xnb_devinfo);
1633 
1634 	if (xenbus_scanf(XBT_NULL, xsname,
1635 	    "mac", "%s", mac) != 0) {
1636 		cmn_err(CE_WARN, "xnb_attach: "
1637 		    "cannot read mac address from %s",
1638 		    xsname);
1639 		return (B_FALSE);
1640 	}
1641 
1642 	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
1643 		cmn_err(CE_WARN,
1644 		    "xnb_attach: cannot parse mac address %s",
1645 		    mac);
1646 		return (B_FALSE);
1647 	}
1648 
1649 	return (B_TRUE);
1650 }
1651 
1652 /*
1653  * Read the configuration of the peer from xenstore.
1654  */
1655 boolean_t
1656 xnb_read_oe_config(xnb_t *xnbp)
1657 {
1658 	char *oename;
1659 	int i;
1660 
1661 	oename = xvdi_get_oename(xnbp->xnb_devinfo);
1662 
1663 	if (xenbus_gather(XBT_NULL, oename,
1664 	    "event-channel", "%u", &xnbp->xnb_fe_evtchn,
1665 	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
1666 	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
1667 	    NULL) != 0) {
1668 		cmn_err(CE_WARN, "xnb_read_oe_config: "
1669 		    "cannot read other-end details from %s",
1670 		    oename);
1671 		return (B_FALSE);
1672 	}
1673 
1674 	/*
1675 	 * Check whether our peer requests receive side hypervisor
1676 	 * copy.
1677 	 */
1678 	if (xenbus_scanf(XBT_NULL, oename,
1679 	    "request-rx-copy", "%d", &i) != 0)
1680 		i = 0;
1681 	if (i != 0)
1682 		xnbp->xnb_rx_hv_copy = B_TRUE;
1683 
1684 	/*
1685 	 * Check whether our peer requests multicast_control.
1686 	 */
1687 	if (xenbus_scanf(XBT_NULL, oename,
1688 	    "request-multicast-control", "%d", &i) != 0)
1689 		i = 0;
1690 	if (i != 0)
1691 		xnbp->xnb_multicast_control = B_TRUE;
1692 
1693 	/*
1694 	 * The Linux backend driver here checks to see if the peer has
1695 	 * set 'feature-no-csum-offload'. This is used to indicate
1696 	 * that the guest cannot handle receiving packets without a
1697 	 * valid checksum. We don't check here, because packets passed
1698 	 * to the peer _always_ have a valid checksum.
1699 	 *
1700 	 * There are three cases:
1701 	 *
1702 	 * - the NIC is dedicated: packets from the wire should always
1703 	 *   have a valid checksum. If the hardware validates the
1704 	 *   checksum then the relevant bit will be set in the packet
1705 	 *   attributes and we will inform the peer. It can choose to
1706 	 *   ignore the hardware verification.
1707 	 *
1708 	 * - the NIC is shared (VNIC) and a packet originates from the
1709 	 *   wire: this is the same as the case above - the packets
1710 	 *   will have a valid checksum.
1711 	 *
1712 	 * - the NIC is shared (VNIC) and a packet originates from the
1713 	 *   host: the MAC layer ensures that all such packets have a
1714 	 *   valid checksum by calculating one if the stack did not.
1715 	 */
1716 
1717 	return (B_TRUE);
1718 }
1719 
1720 void
1721 xnb_start_connect(xnb_t *xnbp)
1722 {
1723 	dev_info_t  *dip = xnbp->xnb_devinfo;
1724 
1725 	if (!xnb_connect_rings(dip)) {
1726 		cmn_err(CE_WARN, "xnb_start_connect: "
1727 		    "cannot connect rings");
1728 		goto failed;
1729 	}
1730 
1731 	if (!xnbp->xnb_flavour->xf_start_connect(xnbp)) {
1732 		cmn_err(CE_WARN, "xnb_start_connect: "
1733 		    "flavour failed to connect");
1734 		goto failed;
1735 	}
1736 
1737 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1738 	return;
1739 
1740 failed:
1741 	xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1742 	xnb_disconnect_rings(dip);
1743 	(void) xvdi_switch_state(dip, XBT_NULL,
1744 	    XenbusStateClosed);
1745 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1746 }
1747 
1748 static boolean_t
1749 xnb_connect_rings(dev_info_t *dip)
1750 {
1751 	xnb_t *xnbp = ddi_get_driver_private(dip);
1752 	struct gnttab_map_grant_ref map_op;
1753 
1754 	/*
1755 	 * Cannot attempt to connect the rings if already connected.
1756 	 */
1757 	ASSERT(!xnbp->xnb_connected);
1758 
1759 	/*
1760 	 * 1. allocate a vaddr for the tx page, one for the rx page.
1761 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1762 	 *    into the allocated vaddr (one for tx, one for rx).
1763 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1764 	 *    bound to this domain.
1765 	 * 4. associate the event channel with an interrupt.
1766 	 * 5. enable the interrupt.
1767 	 */
1768 
1769 	/* 1.tx */
1770 	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1771 	    0, 0, 0, 0, VM_SLEEP);
1772 	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
1773 
1774 	/* 2.tx */
1775 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
1776 	map_op.flags = GNTMAP_host_map;
1777 	map_op.ref = xnbp->xnb_tx_ring_ref;
1778 	map_op.dom = xnbp->xnb_peer;
1779 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr, NULL);
1780 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1781 	    map_op.status != 0) {
1782 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1783 		goto fail;
1784 	}
1785 	xnbp->xnb_tx_ring_handle = map_op.handle;
1786 
1787 	/* LINTED: constant in conditional context */
1788 	BACK_RING_INIT(&xnbp->xnb_tx_ring,
1789 	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1790 
1791 	/* 1.rx */
1792 	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1793 	    0, 0, 0, 0, VM_SLEEP);
1794 	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
1795 
1796 	/* 2.rx */
1797 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
1798 	map_op.flags = GNTMAP_host_map;
1799 	map_op.ref = xnbp->xnb_rx_ring_ref;
1800 	map_op.dom = xnbp->xnb_peer;
1801 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr, NULL);
1802 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1803 	    map_op.status != 0) {
1804 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1805 		goto fail;
1806 	}
1807 	xnbp->xnb_rx_ring_handle = map_op.handle;
1808 
1809 	/* LINTED: constant in conditional context */
1810 	BACK_RING_INIT(&xnbp->xnb_rx_ring,
1811 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
1812 
1813 	/* 3 */
1814 	if (xvdi_bind_evtchn(dip, xnbp->xnb_fe_evtchn) != DDI_SUCCESS) {
1815 		cmn_err(CE_WARN, "xnb_connect_rings: "
1816 		    "cannot bind event channel %d", xnbp->xnb_evtchn);
1817 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1818 		goto fail;
1819 	}
1820 	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
1821 
1822 	/*
1823 	 * It would be good to set the state to XenbusStateConnected
1824 	 * here as well, but then what if ddi_add_intr() failed?
1825 	 * Changing the state in the store will be noticed by the peer
1826 	 * and cannot be "taken back".
1827 	 */
1828 	mutex_enter(&xnbp->xnb_tx_lock);
1829 	mutex_enter(&xnbp->xnb_rx_lock);
1830 
1831 	xnbp->xnb_connected = B_TRUE;
1832 
1833 	mutex_exit(&xnbp->xnb_rx_lock);
1834 	mutex_exit(&xnbp->xnb_tx_lock);
1835 
1836 	/* 4, 5 */
1837 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1838 	    != DDI_SUCCESS) {
1839 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1840 		goto fail;
1841 	}
1842 	xnbp->xnb_irq = B_TRUE;
1843 
1844 	return (B_TRUE);
1845 
1846 fail:
1847 	mutex_enter(&xnbp->xnb_tx_lock);
1848 	mutex_enter(&xnbp->xnb_rx_lock);
1849 
1850 	xnbp->xnb_connected = B_FALSE;
1851 
1852 	mutex_exit(&xnbp->xnb_rx_lock);
1853 	mutex_exit(&xnbp->xnb_tx_lock);
1854 
1855 	return (B_FALSE);
1856 }
1857 
/*
 * Undo the work of xnb_connect_rings(): remove the interrupt, free
 * the event channel, then unmap and free the rx and tx ring pages.
 * Each resource is checked before release, so this is safe to call
 * after a partially successful connect.
 */
static void
xnb_disconnect_rings(dev_info_t *dip)
{
	xnb_t *xnbp = ddi_get_driver_private(dip);

	if (xnbp->xnb_irq) {
		ddi_remove_intr(dip, 0, NULL);
		xnbp->xnb_irq = B_FALSE;
	}

	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
		xvdi_free_evtchn(dip);
		xnbp->xnb_evtchn = INVALID_EVTCHN;
	}

	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
		struct gnttab_unmap_grant_ref unmap_op;

		unmap_op.host_addr = (uint64_t)(uintptr_t)
		    xnbp->xnb_rx_ring_addr;
		unmap_op.dev_bus_addr = 0;
		unmap_op.handle = xnbp->xnb_rx_ring_handle;
		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
		    &unmap_op, 1) != 0)
			cmn_err(CE_WARN, "xnb_disconnect_rings: "
			    "cannot unmap rx-ring page (%d)",
			    unmap_op.status);

		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
	}

	if (xnbp->xnb_rx_ring_addr != NULL) {
		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
		xnbp->xnb_rx_ring_addr = NULL;
	}

	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
		struct gnttab_unmap_grant_ref unmap_op;

		unmap_op.host_addr = (uint64_t)(uintptr_t)
		    xnbp->xnb_tx_ring_addr;
		unmap_op.dev_bus_addr = 0;
		unmap_op.handle = xnbp->xnb_tx_ring_handle;
		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
		    &unmap_op, 1) != 0)
			cmn_err(CE_WARN, "xnb_disconnect_rings: "
			    "cannot unmap tx-ring page (%d)",
			    unmap_op.status);

		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
	}

	if (xnbp->xnb_tx_ring_addr != NULL) {
		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
		xnbp->xnb_tx_ring_addr = NULL;
	}
}
1917 
/*
 * Callback invoked when the xenbus state of the other end (the
 * frontend in the peer domain) changes.  On Connected, read the
 * peer's configuration and, if the backend side is also ready, start
 * connecting.  On Closing, mirror the state.  On Closed, disconnect
 * the rings and withdraw the device.
 */
static void
xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
    void *arg, void *impl_data)
{
	_NOTE(ARGUNUSED(id, arg));
	xnb_t *xnbp = ddi_get_driver_private(dip);
	XenbusState new_state = *(XenbusState *)impl_data;

	ASSERT(xnbp != NULL);

	switch (new_state) {
	case XenbusStateConnected:
		/* spurious state change */
		if (xnbp->xnb_connected)
			return;

		if (!xnb_read_oe_config(xnbp) ||
		    !xnbp->xnb_flavour->xf_peer_connected(xnbp)) {
			cmn_err(CE_WARN, "xnb_oe_state_change: "
			    "read otherend config error");
			(void) xvdi_switch_state(dip, XBT_NULL,
			    XenbusStateClosed);
			(void) xvdi_post_event(dip, XEN_HP_REMOVE);

			break;
		}


		/* Connect only once both ends are ready. */
		mutex_enter(&xnbp->xnb_state_lock);
		xnbp->xnb_fe_status = XNB_STATE_READY;
		if (xnbp->xnb_be_status == XNB_STATE_READY)
			xnb_start_connect(xnbp);
		mutex_exit(&xnbp->xnb_state_lock);

		/*
		 * Now that we've attempted to connect it's reasonable
		 * to allow an attempt to detach.
		 */
		xnbp->xnb_detachable = B_TRUE;

		break;

	case XenbusStateClosing:
		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);

		break;

	case XenbusStateClosed:
		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);

		/* Both locks are taken: tx first, then rx. */
		mutex_enter(&xnbp->xnb_tx_lock);
		mutex_enter(&xnbp->xnb_rx_lock);

		xnb_disconnect_rings(dip);
		xnbp->xnb_connected = B_FALSE;

		mutex_exit(&xnbp->xnb_rx_lock);
		mutex_exit(&xnbp->xnb_tx_lock);

		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
		/*
		 * In all likelyhood this is already set (in the above
		 * case), but if the peer never attempted to connect
		 * and the domain is destroyed we get here without
		 * having been through the case above, so we set it to
		 * be sure.
		 */
		xnbp->xnb_detachable = B_TRUE;

		break;

	default:
		break;
	}
}
1994 
1995 static void
1996 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1997     void *arg, void *impl_data)
1998 {
1999 	_NOTE(ARGUNUSED(id, arg));
2000 	xnb_t *xnbp = ddi_get_driver_private(dip);
2001 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
2002 
2003 	ASSERT(xnbp != NULL);
2004 
2005 	switch (state) {
2006 	case Connected:
2007 		/* spurious hotplug event */
2008 		if (xnbp->xnb_hotplugged)
2009 			break;
2010 
2011 		if (!xnb_read_xs_config(xnbp))
2012 			break;
2013 
2014 		if (!xnbp->xnb_flavour->xf_hotplug_connected(xnbp))
2015 			break;
2016 
2017 		mutex_enter(&xnbp->xnb_tx_lock);
2018 		mutex_enter(&xnbp->xnb_rx_lock);
2019 
2020 		xnbp->xnb_hotplugged = B_TRUE;
2021 
2022 		mutex_exit(&xnbp->xnb_rx_lock);
2023 		mutex_exit(&xnbp->xnb_tx_lock);
2024 
2025 		mutex_enter(&xnbp->xnb_state_lock);
2026 		xnbp->xnb_be_status = XNB_STATE_READY;
2027 		if (xnbp->xnb_fe_status == XNB_STATE_READY)
2028 			xnb_start_connect(xnbp);
2029 		mutex_exit(&xnbp->xnb_state_lock);
2030 
2031 		break;
2032 
2033 	default:
2034 		break;
2035 	}
2036 }
2037 
/*
 * Loadable module linkage: xnb is a "misc" module providing shared
 * backend infrastructure (it registers no device operations itself).
 */
static struct modldrv modldrv = {
	&mod_miscops, "xnb",
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};
2045 
2046 int
2047 _init(void)
2048 {
2049 	int i;
2050 
2051 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
2052 
2053 	i = mod_install(&modlinkage);
2054 	if (i != DDI_SUCCESS)
2055 		mutex_destroy(&xnb_alloc_page_lock);
2056 
2057 	return (i);
2058 }
2059 
/*
 * Loadable module entry point: report module information.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
2065 
2066 int
2067 _fini(void)
2068 {
2069 	int i;
2070 
2071 	i = mod_remove(&modlinkage);
2072 	if (i == DDI_SUCCESS)
2073 		mutex_destroy(&xnb_alloc_page_lock);
2074 
2075 	return (i);
2076 }
2077