xref: /illumos-gate/usr/src/uts/common/xen/io/xnb.c (revision 78801af7286cd73dbc996d470f789e75993cf15d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2018 Joyent, Inc.
26  */
27 
28 #ifdef DEBUG
29 #define	XNB_DEBUG 1
30 #endif /* DEBUG */
31 
32 #include "xnb.h"
33 
34 #include <sys/sunddi.h>
35 #include <sys/sunndi.h>
36 #include <sys/modctl.h>
37 #include <sys/conf.h>
38 #include <sys/mac.h>
39 #include <sys/mac_impl.h> /* For mac_fix_cksum(). */
40 #include <sys/dlpi.h>
41 #include <sys/strsubr.h>
42 #include <sys/strsun.h>
43 #include <sys/types.h>
44 #include <sys/pattr.h>
45 #include <vm/seg_kmem.h>
46 #include <vm/hat_i86.h>
47 #include <xen/sys/xenbus_impl.h>
48 #include <xen/sys/xendev.h>
49 #include <sys/balloon_impl.h>
50 #include <sys/evtchn_impl.h>
51 #include <sys/gnttab.h>
52 #include <vm/vm_dep.h>
53 #include <sys/note.h>
54 #include <sys/gld.h>
55 #include <inet/ip.h>
56 #include <inet/ip_impl.h>
57 
58 /*
59  * The terms "transmit" and "receive" are used in alignment with domU,
60  * which means that packets originating from the peer domU are "transmitted"
61  * to other parts of the system and packets are "received" from them.
62  */
63 
64 /*
65  * Should we allow guests to manipulate multicast group membership?
66  */
67 static boolean_t	xnb_multicast_control = B_TRUE;
68 
/*
 * Ring connection management and the two event callbacks that
 * xnb_attach() registers for XS_OE_STATE and XS_HP_STATE.
 */
69 static boolean_t	xnb_connect_rings(dev_info_t *);
70 static void		xnb_disconnect_rings(dev_info_t *);
71 static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
72     void *, void *);
73 static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
74     void *, void *);
75 
/*
 * xnb_txbuf_constructor()/xnb_txbuf_destructor() are handed to
 * kmem_cache_create() in xnb_attach() for the xnb_txbuf_t cache.
 */
76 static int	xnb_txbuf_constructor(void *, void *, int);
77 static void	xnb_txbuf_destructor(void *, void *);
78 static void	xnb_tx_notify_peer(xnb_t *, boolean_t);
79 static void	xnb_tx_mark_complete(xnb_t *, RING_IDX, int16_t);
80 
/* Non-static entry points that pass a chain of packets to the peer. */
81 mblk_t		*xnb_to_peer(xnb_t *, mblk_t *);
82 mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);
83 
/* Helpers for xnb_copy_to_peer(); both are requested to be inlined. */
84 static void		setup_gop(xnb_t *, gnttab_copy_t *, uchar_t *,
85     size_t, size_t, size_t, grant_ref_t);
86 #pragma inline(setup_gop)
87 static boolean_t	is_foreign(void *);
88 #pragma inline(is_foreign)
89 
/*
 * Sentinel values.  xnb_attach() initialises both ring handles to
 * INVALID_GRANT_HANDLE.
 */
90 #define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
91 #define	INVALID_GRANT_REF	((grant_ref_t)-1)
92 
/*
 * Protects the batch of mfns (and the index into it) that
 * xnb_alloc_page() shares between all instances.
 */
93 static kmutex_t	xnb_alloc_page_lock;
94 
95 /*
96  * On a 32 bit PAE system physical and machine addresses are larger
97  * than 32 bits.  ddi_btop() on such systems takes an unsigned long
98  * argument, and so addresses above 4G are truncated before ddi_btop()
99  * gets to see them.  To avoid this, code the shift operation here.
 *
 * Because this is a macro, the shift is performed in the (promoted)
 * type of 'addr', so no narrowing happens before the conversion.
100  */
101 #define	xnb_btop(addr)	((addr) >> PAGESHIFT)
102 
103 /* DMA attributes for transmit and receive data */
/*
 * The initialiser is positional; the trailing comment on each line names
 * the member being set.  As configured here, any 64-bit address is
 * acceptable, buffers are aligned to MMU_PAGESIZE and each binding must
 * fit in a single segment.
 */
104 static ddi_dma_attr_t buf_dma_attr = {
105 	DMA_ATTR_V0,		/* version of this structure */
106 	0,			/* lowest usable address */
107 	0xffffffffffffffffULL,	/* highest usable address */
108 	0x7fffffff,		/* maximum DMAable byte count */
109 	MMU_PAGESIZE,		/* alignment in bytes */
110 	0x7ff,			/* bitmap of burst sizes */
111 	1,			/* minimum transfer */
112 	0xffffffffU,		/* maximum transfer */
113 	0xffffffffffffffffULL,	/* maximum segment length */
114 	1,			/* maximum number of segments */
115 	1,			/* granularity */
116 	0,			/* flags (reserved) */
117 };
118 
119 /* DMA access attributes for data: NOT to be byte swapped. */
/*
 * Positional initialiser: attribute version, then the byte-swap policy
 * (DDI_NEVERSWAP_ACC), then the access-ordering policy
 * (DDI_STRICTORDER_ACC).
 */
120 static ddi_device_acc_attr_t data_accattr = {
121 	DDI_DEVICE_ATTR_V0,
122 	DDI_NEVERSWAP_ACC,
123 	DDI_STRICTORDER_ACC
124 };
125 
126 /*
127  * Statistics.
 *
 * Names of the entries in the "aux_statistics" named kstat that
 * xnb_ks_init() creates; the array length determines how many entries
 * the kstat has.  xnb_ks_aux_update() stores the values positionally,
 * so a name added, removed or moved here needs the matching change in
 * that function's list of assignments.
128  */
129 static const char * const aux_statistics[] = {
130 	"rx_cksum_deferred",
131 	"tx_cksum_no_need",
132 	"rx_rsp_notok",
133 	"tx_notify_deferred",
134 	"tx_notify_sent",
135 	"rx_notify_deferred",
136 	"rx_notify_sent",
137 	"tx_too_early",
138 	"rx_too_early",
139 	"rx_allocb_failed",
140 	"tx_allocb_failed",
141 	"rx_foreign_page",
142 	"mac_full",
143 	"spurious_intr",
144 	"allocation_success",
145 	"allocation_failure",
146 	"small_allocation_success",
147 	"small_allocation_failure",
148 	"other_allocation_failure",
149 	"rx_pageboundary_crossed",
150 	"rx_cpoparea_grown",
151 	"csum_hardware",
152 	"csum_software",
153 	"tx_overflow_page",
154 	"tx_unexpected_flags",
155 };
156 
157 static int
158 xnb_ks_aux_update(kstat_t *ksp, int flag)
159 {
160 	xnb_t *xnbp;
161 	kstat_named_t *knp;
162 
163 	if (flag != KSTAT_READ)
164 		return (EACCES);
165 
166 	xnbp = ksp->ks_private;
167 	knp = ksp->ks_data;
168 
169 	/*
170 	 * Assignment order should match that of the names in
171 	 * aux_statistics.
172 	 */
173 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_deferred;
174 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_no_need;
175 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_rsp_notok;
176 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
177 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
178 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
179 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
180 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
181 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
182 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
183 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
184 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_foreign_page;
185 	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
186 	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
187 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
188 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
189 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
190 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
191 	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
192 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_pagebndry_crossed;
193 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cpoparea_grown;
194 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
195 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
196 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_overflow_page;
197 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_unexpected_flags;
198 
199 	return (0);
200 }
201 
202 static boolean_t
203 xnb_ks_init(xnb_t *xnbp)
204 {
205 	int nstat = sizeof (aux_statistics) /
206 	    sizeof (aux_statistics[0]);
207 	const char * const *cp = aux_statistics;
208 	kstat_named_t *knp;
209 
210 	/*
211 	 * Create and initialise kstats.
212 	 */
213 	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
214 	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
215 	    KSTAT_TYPE_NAMED, nstat, 0);
216 	if (xnbp->xnb_kstat_aux == NULL)
217 		return (B_FALSE);
218 
219 	xnbp->xnb_kstat_aux->ks_private = xnbp;
220 	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
221 
222 	knp = xnbp->xnb_kstat_aux->ks_data;
223 	while (nstat > 0) {
224 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
225 
226 		knp++;
227 		cp++;
228 		nstat--;
229 	}
230 
231 	kstat_install(xnbp->xnb_kstat_aux);
232 
233 	return (B_TRUE);
234 }
235 
236 static void
237 xnb_ks_free(xnb_t *xnbp)
238 {
239 	kstat_delete(xnbp->xnb_kstat_aux);
240 }
241 
242 /*
243  * Calculate and insert the transport checksum for an arbitrary packet.
244  */
245 static mblk_t *
246 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
247 {
248 	_NOTE(ARGUNUSED(xnbp));
249 
250 	/*
251 	 * XXPV dme: shouldn't rely on mac_fix_cksum(), not least
252 	 * because it doesn't cover all of the interesting cases :-(
253 	 */
254 	mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);
255 	mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL);
256 	return (mp);
257 }
258 
259 mblk_t *
260 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
261 {
262 	struct ether_header *ehp;
263 	uint16_t sap;
264 	uint32_t offset;
265 	ipha_t *ipha;
266 
267 	ASSERT(mp->b_next == NULL);
268 
269 	/*
270 	 * Check that the packet is contained in a single mblk.  In
271 	 * the "from peer" path this is true today, but may change
272 	 * when scatter gather support is added.  In the "to peer"
273 	 * path we cannot be sure, but in most cases it will be true
274 	 * (in the xnbo case the packet has come from a MAC device
275 	 * which is unlikely to split packets).
276 	 */
277 	if (mp->b_cont != NULL)
278 		goto software;
279 
280 	/*
281 	 * If the MAC has no hardware capability don't do any further
282 	 * checking.
283 	 */
284 	if (capab == 0)
285 		goto software;
286 
287 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
288 	ehp = (struct ether_header *)mp->b_rptr;
289 
290 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
291 		struct ether_vlan_header *evhp;
292 
293 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
294 		evhp = (struct ether_vlan_header *)mp->b_rptr;
295 		sap = ntohs(evhp->ether_type);
296 		offset = sizeof (struct ether_vlan_header);
297 	} else {
298 		sap = ntohs(ehp->ether_type);
299 		offset = sizeof (struct ether_header);
300 	}
301 
302 	/*
303 	 * We only attempt to do IPv4 packets in hardware.
304 	 */
305 	if (sap != ETHERTYPE_IP)
306 		goto software;
307 
308 	/*
309 	 * We know that this is an IPv4 packet.
310 	 */
311 	ipha = (ipha_t *)(mp->b_rptr + offset);
312 
313 	switch (ipha->ipha_protocol) {
314 	case IPPROTO_TCP:
315 	case IPPROTO_UDP: {
316 		uint32_t start, length, stuff, cksum;
317 		uint16_t *stuffp;
318 
319 		/*
320 		 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
321 		 * can use full IPv4 and partial checksum offload.
322 		 */
323 		if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
324 			break;
325 
326 		start = IP_SIMPLE_HDR_LENGTH;
327 		length = ntohs(ipha->ipha_length);
328 		if (ipha->ipha_protocol == IPPROTO_TCP) {
329 			stuff = start + TCP_CHECKSUM_OFFSET;
330 			cksum = IP_TCP_CSUM_COMP;
331 		} else {
332 			stuff = start + UDP_CHECKSUM_OFFSET;
333 			cksum = IP_UDP_CSUM_COMP;
334 		}
335 		stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);
336 
337 		if (capab & HCKSUM_INET_FULL_V4) {
338 			/*
339 			 * Some devices require that the checksum
340 			 * field of the packet is zero for full
341 			 * offload.
342 			 */
343 			*stuffp = 0;
344 
345 			mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);
346 
347 			xnbp->xnb_stat_csum_hardware++;
348 
349 			return (mp);
350 		}
351 
352 		if (capab & HCKSUM_INET_PARTIAL) {
353 			if (*stuffp == 0) {
354 				ipaddr_t src, dst;
355 
356 				/*
357 				 * Older Solaris guests don't insert
358 				 * the pseudo-header checksum, so we
359 				 * calculate it here.
360 				 */
361 				src = ipha->ipha_src;
362 				dst = ipha->ipha_dst;
363 
364 				cksum += (dst >> 16) + (dst & 0xFFFF);
365 				cksum += (src >> 16) + (src & 0xFFFF);
366 				cksum += length - IP_SIMPLE_HDR_LENGTH;
367 
368 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
369 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
370 
371 				ASSERT(cksum <= 0xFFFF);
372 
373 				*stuffp = (uint16_t)(cksum ? cksum : ~cksum);
374 			}
375 
376 			mac_hcksum_set(mp, start, stuff, length, 0,
377 			    HCK_PARTIALCKSUM);
378 
379 			xnbp->xnb_stat_csum_hardware++;
380 
381 			return (mp);
382 		}
383 
384 		/* NOTREACHED */
385 		break;
386 	}
387 
388 	default:
389 		/* Use software. */
390 		break;
391 	}
392 
393 software:
394 	/*
395 	 * We are not able to use any offload so do the whole thing in
396 	 * software.
397 	 */
398 	xnbp->xnb_stat_csum_software++;
399 
400 	return (xnb_software_csum(xnbp, mp));
401 }
402 
403 int
404 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
405 {
406 	xnb_t *xnbp;
407 	char *xsname;
408 	char cachename[32];
409 
410 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
411 
412 	xnbp->xnb_flavour = flavour;
413 	xnbp->xnb_flavour_data = flavour_data;
414 	xnbp->xnb_devinfo = dip;
415 	xnbp->xnb_evtchn = INVALID_EVTCHN;
416 	xnbp->xnb_irq = B_FALSE;
417 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
418 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
419 	xnbp->xnb_connected = B_FALSE;
420 	xnbp->xnb_hotplugged = B_FALSE;
421 	xnbp->xnb_detachable = B_FALSE;
422 	xnbp->xnb_peer = xvdi_get_oeid(dip);
423 	xnbp->xnb_be_status = XNB_STATE_INIT;
424 	xnbp->xnb_fe_status = XNB_STATE_INIT;
425 
426 	xnbp->xnb_tx_buf_count = 0;
427 
428 	xnbp->xnb_rx_hv_copy = B_FALSE;
429 	xnbp->xnb_multicast_control = B_FALSE;
430 
431 	xnbp->xnb_rx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
432 	ASSERT(xnbp->xnb_rx_va != NULL);
433 
434 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
435 	    != DDI_SUCCESS)
436 		goto failure;
437 
438 	/* Allocated on demand, when/if we enter xnb_copy_to_peer(). */
439 	xnbp->xnb_rx_cpop = NULL;
440 	xnbp->xnb_rx_cpop_count = 0;
441 
442 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
443 	    xnbp->xnb_icookie);
444 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
445 	    xnbp->xnb_icookie);
446 	mutex_init(&xnbp->xnb_state_lock, NULL, MUTEX_DRIVER,
447 	    xnbp->xnb_icookie);
448 
449 	/* Set driver private pointer now. */
450 	ddi_set_driver_private(dip, xnbp);
451 
452 	(void) sprintf(cachename, "xnb_tx_buf_cache_%d", ddi_get_instance(dip));
453 	xnbp->xnb_tx_buf_cache = kmem_cache_create(cachename,
454 	    sizeof (xnb_txbuf_t), 0,
455 	    xnb_txbuf_constructor, xnb_txbuf_destructor,
456 	    NULL, xnbp, NULL, 0);
457 	if (xnbp->xnb_tx_buf_cache == NULL)
458 		goto failure_0;
459 
460 	if (!xnb_ks_init(xnbp))
461 		goto failure_1;
462 
463 	/*
464 	 * Receive notification of changes in the state of the
465 	 * driver in the guest domain.
466 	 */
467 	if (xvdi_add_event_handler(dip, XS_OE_STATE, xnb_oe_state_change,
468 	    NULL) != DDI_SUCCESS)
469 		goto failure_2;
470 
471 	/*
472 	 * Receive notification of hotplug events.
473 	 */
474 	if (xvdi_add_event_handler(dip, XS_HP_STATE, xnb_hp_state_change,
475 	    NULL) != DDI_SUCCESS)
476 		goto failure_2;
477 
478 	xsname = xvdi_get_xsname(dip);
479 
480 	if (xenbus_printf(XBT_NULL, xsname,
481 	    "feature-multicast-control", "%d",
482 	    xnb_multicast_control ? 1 : 0) != 0)
483 		goto failure_3;
484 
485 	if (xenbus_printf(XBT_NULL, xsname,
486 	    "feature-rx-copy", "%d",  1) != 0)
487 		goto failure_3;
488 	/*
489 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
490 	 * in addition to "feature-rx-copy" being 1. It seems strange
491 	 * to use four possible states to describe a binary decision,
492 	 * but we might as well play nice.
493 	 */
494 	if (xenbus_printf(XBT_NULL, xsname,
495 	    "feature-rx-flip", "%d", 0) != 0)
496 		goto failure_3;
497 
498 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
499 	(void) xvdi_post_event(dip, XEN_HP_ADD);
500 
501 	return (DDI_SUCCESS);
502 
503 failure_3:
504 	xvdi_remove_event_handler(dip, NULL);
505 
506 failure_2:
507 	xnb_ks_free(xnbp);
508 
509 failure_1:
510 	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
511 
512 failure_0:
513 	mutex_destroy(&xnbp->xnb_state_lock);
514 	mutex_destroy(&xnbp->xnb_rx_lock);
515 	mutex_destroy(&xnbp->xnb_tx_lock);
516 
517 failure:
518 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
519 	kmem_free(xnbp, sizeof (*xnbp));
520 	return (DDI_FAILURE);
521 }
522 
/*
 * Common detach processing: release everything xnb_attach() set up
 * for the instance attached to 'dip'.
 *
 * The caller must only detach an instance that is no longer connected
 * to its peer and has no transmit buffers outstanding (both are
 * asserted below).
 */
523 void
524 xnb_detach(dev_info_t *dip)
525 {
526 	xnb_t *xnbp = ddi_get_driver_private(dip);
527 
528 	ASSERT(xnbp != NULL);
529 	ASSERT(!xnbp->xnb_connected);
530 	ASSERT(xnbp->xnb_tx_buf_count == 0);
531 
532 	xnb_disconnect_rings(dip);
533 
	/* A NULL name removes all of the handlers registered at attach. */
534 	xvdi_remove_event_handler(dip, NULL);
535 
536 	xnb_ks_free(xnbp);
537 
538 	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
539 
540 	ddi_set_driver_private(dip, NULL);
541 
542 	mutex_destroy(&xnbp->xnb_state_lock);
543 	mutex_destroy(&xnbp->xnb_rx_lock);
544 	mutex_destroy(&xnbp->xnb_tx_lock);
545 
	/*
	 * The copy-operation array is only allocated on demand (see
	 * grow_cpop_area()), so it may never have been created.
	 */
546 	if (xnbp->xnb_rx_cpop_count > 0)
547 		kmem_free(xnbp->xnb_rx_cpop, sizeof (xnbp->xnb_rx_cpop[0])
548 		    * xnbp->xnb_rx_cpop_count);
549 
550 	ASSERT(xnbp->xnb_rx_va != NULL);
551 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
552 
553 	kmem_free(xnbp, sizeof (*xnbp));
554 }
555 
556 /*
557  * Allocate a page from the hypervisor to be flipped to the peer.
558  *
559  * Try to get pages in batches to reduce the overhead of calls into
560  * the balloon driver.
561  */
562 static mfn_t
563 xnb_alloc_page(xnb_t *xnbp)
564 {
565 #define	WARNING_RATE_LIMIT 100
566 #define	BATCH_SIZE 256
567 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
568 	static int nth = BATCH_SIZE;
569 	mfn_t mfn;
570 
571 	mutex_enter(&xnb_alloc_page_lock);
572 	if (nth == BATCH_SIZE) {
573 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
574 			xnbp->xnb_stat_allocation_failure++;
575 			mutex_exit(&xnb_alloc_page_lock);
576 
577 			/*
578 			 * Try for a single page in low memory situations.
579 			 */
580 			if (balloon_alloc_pages(1, &mfn) != 1) {
581 				if ((xnbp->xnb_stat_small_allocation_failure++
582 				    % WARNING_RATE_LIMIT) == 0)
583 					cmn_err(CE_WARN, "xnb_alloc_page: "
584 					    "Cannot allocate memory to "
585 					    "transfer packets to peer.");
586 				return (0);
587 			} else {
588 				xnbp->xnb_stat_small_allocation_success++;
589 				return (mfn);
590 			}
591 		}
592 
593 		nth = 0;
594 		xnbp->xnb_stat_allocation_success++;
595 	}
596 
597 	mfn = mfns[nth++];
598 	mutex_exit(&xnb_alloc_page_lock);
599 
600 	ASSERT(mfn != 0);
601 
602 	return (mfn);
603 #undef BATCH_SIZE
604 #undef WARNING_RATE_LIMIT
605 }
606 
607 /*
608  * Free a page back to the hypervisor.
609  *
610  * This happens only in the error path, so batching is not worth the
611  * complication.
612  */
613 static void
614 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
615 {
616 	_NOTE(ARGUNUSED(xnbp));
617 	int r;
618 	pfn_t pfn;
619 
620 	pfn = xen_assign_pfn(mfn);
621 	pfnzero(pfn, 0, PAGESIZE);
622 	xen_release_pfn(pfn);
623 
624 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
625 		cmn_err(CE_WARN, "free_page: cannot decrease memory "
626 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
627 		    r, mfn);
628 	}
629 }
630 
631 /*
632  * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but using
633  * local variables. Used in both xnb_to_peer() and xnb_copy_to_peer().
 *
 * The expansion refers to two variables that must exist in the
 * caller's scope: 'loop' (the request index being consumed) and 'prod'
 * (the private response producer index).  It evaluates to the smaller
 * of
 *	(_r)->sring->req_prod - loop
 * and
 *	RING_SIZE(_r) - (loop - prod)
 * so it is non-zero exactly when both quantities are non-zero.
634  */
635 #define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
636 	((((_r)->sring->req_prod - loop) <		\
637 		(RING_SIZE(_r) - (loop - prod))) ?	\
638 	    ((_r)->sring->req_prod - loop) :		\
639 	    (RING_SIZE(_r) - (loop - prod)))
640 
641 /*
642  * Pass packets to the peer using page flipping.
643  */
644 mblk_t *
645 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
646 {
647 	mblk_t *free = mp, *prev = NULL;
648 	size_t len;
649 	gnttab_transfer_t *gop;
650 	boolean_t notify;
651 	RING_IDX loop, prod, end;
652 
653 	/*
654 	 * For each packet the sequence of operations is:
655 	 *
656 	 * 1. get a new page from the hypervisor.
657 	 * 2. get a request slot from the ring.
658 	 * 3. copy the data into the new page.
659 	 * 4. transfer the page to the peer.
660 	 * 5. update the request slot.
661 	 * 6. kick the peer.
662 	 * 7. free mp.
663 	 *
664 	 * In order to reduce the number of hypercalls, we prepare
665 	 * several packets for the peer and perform a single hypercall
666 	 * to transfer them.
667 	 */
668 
669 	len = 0;
670 	mutex_enter(&xnbp->xnb_rx_lock);
671 
672 	/*
673 	 * If we are not connected to the peer or have not yet
674 	 * finished hotplug it is too early to pass packets to the
675 	 * peer.
676 	 */
677 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
678 		mutex_exit(&xnbp->xnb_rx_lock);
679 		DTRACE_PROBE(flip_rx_too_early);
680 		xnbp->xnb_stat_rx_too_early++;
681 		return (mp);
682 	}
683 
684 	loop = xnbp->xnb_rx_ring.req_cons;
685 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
686 	gop = xnbp->xnb_rx_top;
687 
688 	while ((mp != NULL) &&
689 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
690 
691 		mfn_t mfn;
692 		pfn_t pfn;
693 		netif_rx_request_t *rxreq;
694 		netif_rx_response_t *rxresp;
695 		char *valoop;
696 		mblk_t *ml;
697 		uint16_t cksum_flags;
698 
699 		/* 1 */
700 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
701 			xnbp->xnb_stat_rx_defer++;
702 			break;
703 		}
704 
705 		/* 2 */
706 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
707 
708 #ifdef XNB_DEBUG
709 		if (!(rxreq->id < NET_RX_RING_SIZE))
710 			cmn_err(CE_PANIC, "xnb_to_peer: "
711 			    "id %d out of range in request 0x%p",
712 			    rxreq->id, (void *)rxreq);
713 #endif /* XNB_DEBUG */
714 
715 		/* Assign a pfn and map the new page at the allocated va. */
716 		pfn = xen_assign_pfn(mfn);
717 		hat_devload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
718 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
719 
720 		/* 3 */
721 		len = 0;
722 		valoop = xnbp->xnb_rx_va;
723 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
724 			size_t chunk = ml->b_wptr - ml->b_rptr;
725 
726 			bcopy(ml->b_rptr, valoop, chunk);
727 			valoop += chunk;
728 			len += chunk;
729 		}
730 
731 		ASSERT(len < PAGESIZE);
732 
733 		/* Release the pfn. */
734 		hat_unload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
735 		    HAT_UNLOAD_UNMAP);
736 		xen_release_pfn(pfn);
737 
738 		/* 4 */
739 		gop->mfn = mfn;
740 		gop->domid = xnbp->xnb_peer;
741 		gop->ref = rxreq->gref;
742 
743 		/* 5.1 */
744 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
745 		rxresp->offset = 0;
746 		rxresp->flags = 0;
747 
748 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
749 		if (cksum_flags != 0)
750 			xnbp->xnb_stat_rx_cksum_deferred++;
751 		rxresp->flags |= cksum_flags;
752 
753 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
754 		rxresp->status = len;
755 
756 		loop++;
757 		prod++;
758 		gop++;
759 		prev = mp;
760 		mp = mp->b_next;
761 	}
762 
763 	/*
764 	 * Did we actually do anything?
765 	 */
766 	if (loop == xnbp->xnb_rx_ring.req_cons) {
767 		mutex_exit(&xnbp->xnb_rx_lock);
768 		return (mp);
769 	}
770 
771 	end = loop;
772 
773 	/*
774 	 * Unlink the end of the 'done' list from the remainder.
775 	 */
776 	ASSERT(prev != NULL);
777 	prev->b_next = NULL;
778 
779 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_rx_top,
780 	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
781 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
782 	}
783 
784 	loop = xnbp->xnb_rx_ring.req_cons;
785 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
786 	gop = xnbp->xnb_rx_top;
787 
788 	while (loop < end) {
789 		int16_t status = NETIF_RSP_OKAY;
790 
791 		if (gop->status != 0) {
792 			status = NETIF_RSP_ERROR;
793 
794 			/*
795 			 * If the status is anything other than
796 			 * GNTST_bad_page then we don't own the page
797 			 * any more, so don't try to give it back.
798 			 */
799 			if (gop->status != GNTST_bad_page)
800 				gop->mfn = 0;
801 		} else {
802 			/* The page is no longer ours. */
803 			gop->mfn = 0;
804 		}
805 
806 		if (gop->mfn != 0)
807 			/*
808 			 * Give back the page, as we won't be using
809 			 * it.
810 			 */
811 			xnb_free_page(xnbp, gop->mfn);
812 		else
813 			/*
814 			 * We gave away a page, update our accounting
815 			 * now.
816 			 */
817 			balloon_drv_subtracted(1);
818 
819 		/* 5.2 */
820 		if (status != NETIF_RSP_OKAY) {
821 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
822 			    status;
823 		} else {
824 			xnbp->xnb_stat_ipackets++;
825 			xnbp->xnb_stat_rbytes += len;
826 		}
827 
828 		loop++;
829 		prod++;
830 		gop++;
831 	}
832 
833 	xnbp->xnb_rx_ring.req_cons = loop;
834 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
835 
836 	/* 6 */
837 	/* LINTED: constant in conditional context */
838 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
839 	if (notify) {
840 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
841 		xnbp->xnb_stat_rx_notify_sent++;
842 	} else {
843 		xnbp->xnb_stat_rx_notify_deferred++;
844 	}
845 
846 	if (mp != NULL)
847 		xnbp->xnb_stat_rx_defer++;
848 
849 	mutex_exit(&xnbp->xnb_rx_lock);
850 
851 	/* Free mblk_t's that we consumed. */
852 	freemsgchain(free);
853 
854 	return (mp);
855 }
856 
857 /* Helper functions for xnb_copy_to_peer(). */
858 
859 /*
860  * Grow the array of copy operation descriptors.
861  */
862 static boolean_t
863 grow_cpop_area(xnb_t *xnbp)
864 {
865 	size_t count;
866 	gnttab_copy_t *new;
867 
868 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
869 
870 	count = xnbp->xnb_rx_cpop_count + CPOP_DEFCNT;
871 
872 	if ((new = kmem_alloc(sizeof (new[0]) * count, KM_NOSLEEP)) == NULL) {
873 		xnbp->xnb_stat_other_allocation_failure++;
874 		return (B_FALSE);
875 	}
876 
877 	bcopy(xnbp->xnb_rx_cpop, new,
878 	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
879 
880 	kmem_free(xnbp->xnb_rx_cpop,
881 	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
882 
883 	xnbp->xnb_rx_cpop = new;
884 	xnbp->xnb_rx_cpop_count = count;
885 
886 	xnbp->xnb_stat_rx_cpoparea_grown++;
887 
888 	return (B_TRUE);
889 }
890 
891 /*
892  * Check whether an address is on a page that's foreign to this domain.
893  */
894 static boolean_t
895 is_foreign(void *addr)
896 {
897 	pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
898 
899 	return ((pfn & PFN_IS_FOREIGN_MFN) == PFN_IS_FOREIGN_MFN);
900 }
901 
902 /*
903  * Insert a newly allocated mblk into a chain, replacing the old one.
904  */
905 static mblk_t *
906 replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
907 {
908 	uint32_t	start, stuff, end, value, flags;
909 	mblk_t		*new_mp;
910 
911 	new_mp = copyb(mp);
912 	if (new_mp == NULL) {
913 		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message"
914 		    "for %p, len %lu", (void *) mp, len);
915 	}
916 
917 	mac_hcksum_get(mp, &start, &stuff, &end, &value, &flags);
918 	mac_hcksum_set(new_mp, start, stuff, end, value, flags);
919 
920 	new_mp->b_next = mp->b_next;
921 	new_mp->b_prev = mp->b_prev;
922 	new_mp->b_cont = mp->b_cont;
923 
924 	/* Make sure we only overwrite pointers to the mblk being replaced. */
925 	if (mp_prev != NULL && mp_prev->b_next == mp)
926 		mp_prev->b_next = new_mp;
927 
928 	if (ml_prev != NULL && ml_prev->b_cont == mp)
929 		ml_prev->b_cont = new_mp;
930 
931 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
932 	freemsg(mp);
933 
934 	return (new_mp);
935 }
936 
937 /*
938  * Set all the fields in a gnttab_copy_t.
939  */
940 static void
941 setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
942     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
943 {
944 	ASSERT(xnbp != NULL && gp != NULL);
945 
946 	gp->source.offset = s_off;
947 	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
948 	gp->source.domid = DOMID_SELF;
949 
950 	gp->len = (uint16_t)len;
951 	gp->flags = GNTCOPY_dest_gref;
952 	gp->status = 0;
953 
954 	gp->dest.u.ref = d_ref;
955 	gp->dest.offset = d_off;
956 	gp->dest.domid = xnbp->xnb_peer;
957 }
958 
959 /*
960  * Pass packets to the peer using hypervisor copy operations.
961  */
962 mblk_t *
963 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
964 {
965 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
966 	mblk_t		*ml, *ml_prev;
967 	boolean_t	notify;
968 	RING_IDX	loop, prod;
969 	int		i;
970 
971 	/*
972 	 * If the peer does not pre-post buffers for received packets,
973 	 * use page flipping to pass packets to it.
974 	 */
975 	if (!xnbp->xnb_rx_hv_copy)
976 		return (xnb_to_peer(xnbp, mp));
977 
978 	/*
979 	 * For each packet the sequence of operations is:
980 	 *
981 	 *  1. get a request slot from the ring.
982 	 *  2. set up data for hypercall (see NOTE below)
983 	 *  3. have the hypervisore copy the data
984 	 *  4. update the request slot.
985 	 *  5. kick the peer.
986 	 *
987 	 * NOTE ad 2.
988 	 *  In order to reduce the number of hypercalls, we prepare
989 	 *  several mblks (mp->b_cont != NULL) for the peer and
990 	 *  perform a single hypercall to transfer them.  We also have
991 	 *  to set up a seperate copy operation for every page.
992 	 *
993 	 * If we have more than one packet (mp->b_next != NULL), we do
994 	 * this whole dance repeatedly.
995 	 */
996 
997 	mutex_enter(&xnbp->xnb_rx_lock);
998 
999 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
1000 		mutex_exit(&xnbp->xnb_rx_lock);
1001 		DTRACE_PROBE(copy_rx_too_early);
1002 		xnbp->xnb_stat_rx_too_early++;
1003 		return (mp);
1004 	}
1005 
1006 	loop = xnbp->xnb_rx_ring.req_cons;
1007 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
1008 
1009 	while ((mp != NULL) &&
1010 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
1011 		netif_rx_request_t	*rxreq;
1012 		size_t			d_offset, len;
1013 		int			item_count;
1014 		gnttab_copy_t		*gop_cp;
1015 		netif_rx_response_t	*rxresp;
1016 		uint16_t		cksum_flags;
1017 		int16_t			status = NETIF_RSP_OKAY;
1018 
1019 		/* 1 */
1020 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
1021 
1022 #ifdef XNB_DEBUG
1023 		if (!(rxreq->id < NET_RX_RING_SIZE))
1024 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
1025 			    "id %d out of range in request 0x%p",
1026 			    rxreq->id, (void *)rxreq);
1027 #endif /* XNB_DEBUG */
1028 
1029 		/* 2 */
1030 		d_offset = 0;
1031 		len = 0;
1032 		item_count = 0;
1033 
1034 		gop_cp = xnbp->xnb_rx_cpop;
1035 
1036 		/*
1037 		 * We walk the b_cont pointers and set up a
1038 		 * gnttab_copy_t for each sub-page chunk in each data
1039 		 * block.
1040 		 */
1041 		/* 2a */
1042 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
1043 			size_t	chunk = ml->b_wptr - ml->b_rptr;
1044 			uchar_t	*r_tmp,	*rpt_align;
1045 			size_t	r_offset;
1046 
1047 			/*
1048 			 * The hypervisor will not allow us to
1049 			 * reference a foreign page (e.g. one
1050 			 * belonging to another domain) by mfn in the
1051 			 * copy operation. If the data in this mblk is
1052 			 * on such a page we must copy the data into a
1053 			 * local page before initiating the hypervisor
1054 			 * copy operation.
1055 			 */
1056 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
1057 				mblk_t *ml_new = replace_msg(ml, chunk,
1058 				    mp_prev, ml_prev);
1059 
1060 				/* We can still use old ml, but not *ml! */
1061 				if (free == ml)
1062 					free = ml_new;
1063 				if (mp == ml)
1064 					mp = ml_new;
1065 				ml = ml_new;
1066 
1067 				xnbp->xnb_stat_rx_foreign_page++;
1068 			}
1069 
1070 			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
1071 			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
1072 			r_tmp = ml->b_rptr;
1073 
1074 			if (d_offset + chunk > PAGESIZE)
1075 				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
1076 				    "(svd: %p), ml %p,rpt_alg. %p, d_offset "
1077 				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
1078 				    (void *)mp, (void *)saved_mp, (void *)ml,
1079 				    (void *)rpt_align,
1080 				    d_offset, chunk, (int)PAGESIZE);
1081 
1082 			while (chunk > 0) {
1083 				size_t part_len;
1084 
1085 				if (item_count == xnbp->xnb_rx_cpop_count) {
1086 					if (!grow_cpop_area(xnbp))
1087 						goto failure;
1088 					gop_cp = &xnbp->xnb_rx_cpop[item_count];
1089 				}
1090 				/*
1091 				 * If our mblk crosses a page boundary, we need
1092 				 * to do a seperate copy for each page.
1093 				 */
1094 				if (r_offset + chunk > PAGESIZE) {
1095 					part_len = PAGESIZE - r_offset;
1096 
1097 					DTRACE_PROBE3(mblk_page_crossed,
1098 					    (mblk_t *), ml, int, chunk, int,
1099 					    (int)r_offset);
1100 
1101 					xnbp->xnb_stat_rx_pagebndry_crossed++;
1102 				} else {
1103 					part_len = chunk;
1104 				}
1105 
1106 				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
1107 				    d_offset, part_len, rxreq->gref);
1108 
1109 				chunk -= part_len;
1110 
1111 				len += part_len;
1112 				d_offset += part_len;
1113 				r_tmp += part_len;
1114 				/*
1115 				 * The 2nd, 3rd ... last copies will always
1116 				 * start at r_tmp, therefore r_offset is 0.
1117 				 */
1118 				r_offset = 0;
1119 				gop_cp++;
1120 				item_count++;
1121 			}
1122 			ml_prev = ml;
1123 
1124 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
1125 			    chunk, int, len, int, item_count);
1126 		}
1127 		/* 3 */
1128 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_rx_cpop,
1129 		    item_count) != 0) {
1130 			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
1131 			DTRACE_PROBE(HV_granttableopfailed);
1132 		}
1133 
1134 		/* 4 */
1135 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
1136 		rxresp->offset = 0;
1137 
1138 		rxresp->flags = 0;
1139 
1140 		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
1141 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1142 		    (int)rxresp->status);
1143 
1144 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
1145 		if (cksum_flags != 0)
1146 			xnbp->xnb_stat_rx_cksum_deferred++;
1147 		rxresp->flags |= cksum_flags;
1148 
1149 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
1150 		rxresp->status = len;
1151 
1152 		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
1153 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1154 		    (int)rxresp->status);
1155 
1156 		for (i = 0; i < item_count; i++) {
1157 			if (xnbp->xnb_rx_cpop[i].status != 0) {
1158 				DTRACE_PROBE2(cpop_status_nonnull, int,
1159 				    (int)xnbp->xnb_rx_cpop[i].status,
1160 				    int, i);
1161 				status = NETIF_RSP_ERROR;
1162 			}
1163 		}
1164 
1165 		/* 5.2 */
1166 		if (status != NETIF_RSP_OKAY) {
1167 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
1168 			    status;
1169 			xnbp->xnb_stat_rx_rsp_notok++;
1170 		} else {
1171 			xnbp->xnb_stat_ipackets++;
1172 			xnbp->xnb_stat_rbytes += len;
1173 		}
1174 
1175 		loop++;
1176 		prod++;
1177 		mp_prev = mp;
1178 		mp = mp->b_next;
1179 	}
1180 failure:
1181 	/*
1182 	 * Did we actually do anything?
1183 	 */
1184 	if (loop == xnbp->xnb_rx_ring.req_cons) {
1185 		mutex_exit(&xnbp->xnb_rx_lock);
1186 		return (mp);
1187 	}
1188 
1189 	/*
1190 	 * Unlink the end of the 'done' list from the remainder.
1191 	 */
1192 	ASSERT(mp_prev != NULL);
1193 	mp_prev->b_next = NULL;
1194 
1195 	xnbp->xnb_rx_ring.req_cons = loop;
1196 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
1197 
1198 	/* 6 */
1199 	/* LINTED: constant in conditional context */
1200 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
1201 	if (notify) {
1202 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1203 		xnbp->xnb_stat_rx_notify_sent++;
1204 	} else {
1205 		xnbp->xnb_stat_rx_notify_deferred++;
1206 	}
1207 
1208 	if (mp != NULL)
1209 		xnbp->xnb_stat_rx_defer++;
1210 
1211 	mutex_exit(&xnbp->xnb_rx_lock);
1212 
1213 	/* Free mblk_t structs we have consumed. */
1214 	freemsgchain(free);
1215 
1216 	return (mp);
1217 }
1218 
1219 
1220 static void
1221 xnb_tx_notify_peer(xnb_t *xnbp, boolean_t force)
1222 {
1223 	boolean_t notify;
1224 
1225 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1226 
1227 	/* LINTED: constant in conditional context */
1228 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
1229 	if (notify || force) {
1230 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1231 		xnbp->xnb_stat_tx_notify_sent++;
1232 	} else {
1233 		xnbp->xnb_stat_tx_notify_deferred++;
1234 	}
1235 }
1236 
1237 static void
1238 xnb_tx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
1239 {
1240 	RING_IDX i;
1241 	netif_tx_response_t *txresp;
1242 
1243 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1244 
1245 	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
1246 
1247 	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
1248 	txresp->id = id;
1249 	txresp->status = status;
1250 
1251 	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
1252 
1253 	/*
1254 	 * Note that we don't push the change to the peer here - that
1255 	 * is the callers responsibility.
1256 	 */
1257 }
1258 
1259 static void
1260 xnb_txbuf_recycle(xnb_txbuf_t *txp)
1261 {
1262 	xnb_t *xnbp = txp->xt_xnbp;
1263 
1264 	kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
1265 
1266 	xnbp->xnb_tx_buf_outstanding--;
1267 }
1268 
1269 static int
1270 xnb_txbuf_constructor(void *buf, void *arg, int kmflag)
1271 {
1272 	_NOTE(ARGUNUSED(kmflag));
1273 	xnb_txbuf_t *txp = buf;
1274 	xnb_t *xnbp = arg;
1275 	size_t len;
1276 	ddi_dma_cookie_t dma_cookie;
1277 	uint_t ncookies;
1278 
1279 	txp->xt_free_rtn.free_func = xnb_txbuf_recycle;
1280 	txp->xt_free_rtn.free_arg = (caddr_t)txp;
1281 	txp->xt_xnbp = xnbp;
1282 	txp->xt_next = NULL;
1283 
1284 	if (ddi_dma_alloc_handle(xnbp->xnb_devinfo, &buf_dma_attr,
1285 	    0, 0, &txp->xt_dma_handle) != DDI_SUCCESS)
1286 		goto failure;
1287 
1288 	if (ddi_dma_mem_alloc(txp->xt_dma_handle, PAGESIZE, &data_accattr,
1289 	    DDI_DMA_STREAMING, 0, 0, &txp->xt_buf, &len,
1290 	    &txp->xt_acc_handle) != DDI_SUCCESS)
1291 		goto failure_1;
1292 
1293 	if (ddi_dma_addr_bind_handle(txp->xt_dma_handle, NULL, txp->xt_buf,
1294 	    len, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, 0,
1295 	    &dma_cookie, &ncookies)
1296 	    != DDI_DMA_MAPPED)
1297 		goto failure_2;
1298 	ASSERT(ncookies == 1);
1299 
1300 	txp->xt_mfn = xnb_btop(dma_cookie.dmac_laddress);
1301 	txp->xt_buflen = dma_cookie.dmac_size;
1302 
1303 	DTRACE_PROBE(txbuf_allocated);
1304 
1305 	atomic_inc_32(&xnbp->xnb_tx_buf_count);
1306 	xnbp->xnb_tx_buf_outstanding++;
1307 
1308 	return (0);
1309 
1310 failure_2:
1311 	ddi_dma_mem_free(&txp->xt_acc_handle);
1312 
1313 failure_1:
1314 	ddi_dma_free_handle(&txp->xt_dma_handle);
1315 
1316 failure:
1317 
1318 	return (-1);
1319 }
1320 
1321 static void
1322 xnb_txbuf_destructor(void *buf, void *arg)
1323 {
1324 	xnb_txbuf_t *txp = buf;
1325 	xnb_t *xnbp = arg;
1326 
1327 	(void) ddi_dma_unbind_handle(txp->xt_dma_handle);
1328 	ddi_dma_mem_free(&txp->xt_acc_handle);
1329 	ddi_dma_free_handle(&txp->xt_dma_handle);
1330 
1331 	atomic_dec_32(&xnbp->xnb_tx_buf_count);
1332 }
1333 
1334 /*
1335  * Take packets from the peer and deliver them onward.
1336  */
1337 static mblk_t *
1338 xnb_from_peer(xnb_t *xnbp)
1339 {
1340 	RING_IDX start, end, loop;
1341 	gnttab_copy_t *cop;
1342 	xnb_txbuf_t **txpp;
1343 	netif_tx_request_t *txreq;
1344 	boolean_t work_to_do, need_notify = B_FALSE;
1345 	mblk_t *head, *tail;
1346 	int n_data_req, i;
1347 
1348 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1349 
1350 	head = tail = NULL;
1351 around:
1352 
1353 	/* LINTED: constant in conditional context */
1354 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
1355 	if (!work_to_do) {
1356 finished:
1357 		xnb_tx_notify_peer(xnbp, need_notify);
1358 
1359 		return (head);
1360 	}
1361 
1362 	start = xnbp->xnb_tx_ring.req_cons;
1363 	end = xnbp->xnb_tx_ring.sring->req_prod;
1364 
1365 	if ((end - start) > NET_TX_RING_SIZE) {
1366 		/*
1367 		 * This usually indicates that the frontend driver is
1368 		 * misbehaving, as it's not possible to have more than
1369 		 * NET_TX_RING_SIZE ring elements in play at any one
1370 		 * time.
1371 		 *
1372 		 * We reset the ring pointers to the state declared by
1373 		 * the frontend and try to carry on.
1374 		 */
1375 		cmn_err(CE_WARN, "xnb_from_peer: domain %d tried to give us %u "
1376 		    "items in the ring, resetting and trying to recover.",
1377 		    xnbp->xnb_peer, (end - start));
1378 
1379 		/* LINTED: constant in conditional context */
1380 		BACK_RING_ATTACH(&xnbp->xnb_tx_ring,
1381 		    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1382 
1383 		goto around;
1384 	}
1385 
1386 	loop = start;
1387 	cop = xnbp->xnb_tx_cop;
1388 	txpp = xnbp->xnb_tx_bufp;
1389 	n_data_req = 0;
1390 
1391 	while (loop < end) {
1392 		static const uint16_t acceptable_flags =
1393 		    NETTXF_csum_blank |
1394 		    NETTXF_data_validated |
1395 		    NETTXF_extra_info;
1396 		uint16_t unexpected_flags;
1397 
1398 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1399 
1400 		unexpected_flags = txreq->flags & ~acceptable_flags;
1401 		if (unexpected_flags != 0) {
1402 			/*
1403 			 * The peer used flag bits that we do not
1404 			 * recognize.
1405 			 */
1406 			cmn_err(CE_WARN, "xnb_from_peer: "
1407 			    "unexpected flag bits (0x%x) from peer "
1408 			    "in transmit request",
1409 			    unexpected_flags);
1410 			xnbp->xnb_stat_tx_unexpected_flags++;
1411 
1412 			/* Mark this entry as failed. */
1413 			xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
1414 			need_notify = B_TRUE;
1415 
1416 		} else if (txreq->flags & NETTXF_extra_info) {
1417 			struct netif_extra_info *erp;
1418 			boolean_t status;
1419 
1420 			loop++; /* Consume another slot in the ring. */
1421 			ASSERT(loop <= end);
1422 
1423 			erp = (struct netif_extra_info *)
1424 			    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1425 
1426 			switch (erp->type) {
1427 			case XEN_NETIF_EXTRA_TYPE_MCAST_ADD:
1428 				ASSERT(xnbp->xnb_multicast_control);
1429 				status = xnbp->xnb_flavour->xf_mcast_add(xnbp,
1430 				    &erp->u.mcast.addr);
1431 				break;
1432 			case XEN_NETIF_EXTRA_TYPE_MCAST_DEL:
1433 				ASSERT(xnbp->xnb_multicast_control);
1434 				status = xnbp->xnb_flavour->xf_mcast_del(xnbp,
1435 				    &erp->u.mcast.addr);
1436 				break;
1437 			default:
1438 				status = B_FALSE;
1439 				cmn_err(CE_WARN, "xnb_from_peer: "
1440 				    "unknown extra type %d", erp->type);
1441 				break;
1442 			}
1443 
1444 			xnb_tx_mark_complete(xnbp, txreq->id,
1445 			    status ? NETIF_RSP_OKAY : NETIF_RSP_ERROR);
1446 			need_notify = B_TRUE;
1447 
1448 		} else if ((txreq->offset > PAGESIZE) ||
1449 		    (txreq->offset + txreq->size > PAGESIZE)) {
1450 			/*
1451 			 * Peer attempted to refer to data beyond the
1452 			 * end of the granted page.
1453 			 */
1454 			cmn_err(CE_WARN, "xnb_from_peer: "
1455 			    "attempt to refer beyond the end of granted "
1456 			    "page in txreq (offset %d, size %d).",
1457 			    txreq->offset, txreq->size);
1458 			xnbp->xnb_stat_tx_overflow_page++;
1459 
1460 			/* Mark this entry as failed. */
1461 			xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
1462 			need_notify = B_TRUE;
1463 
1464 		} else {
1465 			xnb_txbuf_t *txp;
1466 
1467 			txp = kmem_cache_alloc(xnbp->xnb_tx_buf_cache,
1468 			    KM_NOSLEEP);
1469 			if (txp == NULL)
1470 				break;
1471 
1472 			txp->xt_mblk = desballoc((unsigned char *)txp->xt_buf,
1473 			    txp->xt_buflen, 0, &txp->xt_free_rtn);
1474 			if (txp->xt_mblk == NULL) {
1475 				kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
1476 				break;
1477 			}
1478 
1479 			txp->xt_idx = loop;
1480 			txp->xt_id = txreq->id;
1481 
1482 			cop->source.u.ref = txreq->gref;
1483 			cop->source.domid = xnbp->xnb_peer;
1484 			cop->source.offset = txreq->offset;
1485 
1486 			cop->dest.u.gmfn = txp->xt_mfn;
1487 			cop->dest.domid = DOMID_SELF;
1488 			cop->dest.offset = 0;
1489 
1490 			cop->len = txreq->size;
1491 			cop->flags = GNTCOPY_source_gref;
1492 			cop->status = 0;
1493 
1494 			*txpp = txp;
1495 
1496 			txpp++;
1497 			cop++;
1498 			n_data_req++;
1499 
1500 			ASSERT(n_data_req <= NET_TX_RING_SIZE);
1501 		}
1502 
1503 		loop++;
1504 	}
1505 
1506 	xnbp->xnb_tx_ring.req_cons = loop;
1507 
1508 	if (n_data_req == 0)
1509 		goto around;
1510 
1511 	if (HYPERVISOR_grant_table_op(GNTTABOP_copy,
1512 	    xnbp->xnb_tx_cop, n_data_req) != 0) {
1513 
1514 		cmn_err(CE_WARN, "xnb_from_peer: copy operation failed");
1515 
1516 		txpp = xnbp->xnb_tx_bufp;
1517 		i = n_data_req;
1518 		while (i > 0) {
1519 			kmem_cache_free(xnbp->xnb_tx_buf_cache, *txpp);
1520 			txpp++;
1521 			i--;
1522 		}
1523 
1524 		goto finished;
1525 	}
1526 
1527 	txpp = xnbp->xnb_tx_bufp;
1528 	cop = xnbp->xnb_tx_cop;
1529 	i = n_data_req;
1530 
1531 	while (i > 0) {
1532 		xnb_txbuf_t *txp = *txpp;
1533 
1534 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, txp->xt_idx);
1535 
1536 		if (cop->status != 0) {
1537 #ifdef XNB_DEBUG
1538 			cmn_err(CE_WARN, "xnb_from_peer: "
1539 			    "txpp 0x%p failed (%d)",
1540 			    (void *)*txpp, cop->status);
1541 #endif /* XNB_DEBUG */
1542 			xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_ERROR);
1543 			freemsg(txp->xt_mblk);
1544 		} else {
1545 			mblk_t *mp;
1546 
1547 			mp = txp->xt_mblk;
1548 			mp->b_rptr = mp->b_wptr = (unsigned char *)txp->xt_buf;
1549 			mp->b_wptr += txreq->size;
1550 			mp->b_next = NULL;
1551 
1552 			/*
1553 			 * If there are checksum flags, process them
1554 			 * appropriately.
1555 			 */
1556 			if ((txreq->flags &
1557 			    (NETTXF_csum_blank | NETTXF_data_validated))
1558 			    != 0) {
1559 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
1560 				    mp, txreq->flags);
1561 				xnbp->xnb_stat_tx_cksum_no_need++;
1562 
1563 				txp->xt_mblk = mp;
1564 			}
1565 
1566 			if (head == NULL) {
1567 				ASSERT(tail == NULL);
1568 				head = mp;
1569 			} else {
1570 				ASSERT(tail != NULL);
1571 				tail->b_next = mp;
1572 			}
1573 			tail = mp;
1574 
1575 			xnbp->xnb_stat_opackets++;
1576 			xnbp->xnb_stat_obytes += txreq->size;
1577 
1578 			xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_OKAY);
1579 		}
1580 
1581 		txpp++;
1582 		cop++;
1583 		i--;
1584 	}
1585 
1586 	goto around;
1587 	/* NOTREACHED */
1588 }
1589 
1590 static uint_t
1591 xnb_intr(caddr_t arg)
1592 {
1593 	xnb_t *xnbp = (xnb_t *)arg;
1594 	mblk_t *mp;
1595 
1596 	xnbp->xnb_stat_intr++;
1597 
1598 	mutex_enter(&xnbp->xnb_tx_lock);
1599 
1600 	ASSERT(xnbp->xnb_connected);
1601 
1602 	mp = xnb_from_peer(xnbp);
1603 
1604 	mutex_exit(&xnbp->xnb_tx_lock);
1605 
1606 	if (!xnbp->xnb_hotplugged) {
1607 		xnbp->xnb_stat_tx_too_early++;
1608 		goto fail;
1609 	}
1610 	if (mp == NULL) {
1611 		xnbp->xnb_stat_spurious_intr++;
1612 		goto fail;
1613 	}
1614 
1615 	xnbp->xnb_flavour->xf_from_peer(xnbp, mp);
1616 
1617 	return (DDI_INTR_CLAIMED);
1618 
1619 fail:
1620 	freemsgchain(mp);
1621 	return (DDI_INTR_CLAIMED);
1622 }
1623 
1624 /*
1625  * Read our configuration from xenstore.
1626  */
1627 boolean_t
1628 xnb_read_xs_config(xnb_t *xnbp)
1629 {
1630 	char *xsname;
1631 	char mac[ETHERADDRL * 3];
1632 
1633 	xsname = xvdi_get_xsname(xnbp->xnb_devinfo);
1634 
1635 	if (xenbus_scanf(XBT_NULL, xsname,
1636 	    "mac", "%s", mac) != 0) {
1637 		cmn_err(CE_WARN, "xnb_attach: "
1638 		    "cannot read mac address from %s",
1639 		    xsname);
1640 		return (B_FALSE);
1641 	}
1642 
1643 	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
1644 		cmn_err(CE_WARN,
1645 		    "xnb_attach: cannot parse mac address %s",
1646 		    mac);
1647 		return (B_FALSE);
1648 	}
1649 
1650 	return (B_TRUE);
1651 }
1652 
1653 /*
1654  * Read the configuration of the peer from xenstore.
1655  */
1656 boolean_t
1657 xnb_read_oe_config(xnb_t *xnbp)
1658 {
1659 	char *oename;
1660 	int i;
1661 
1662 	oename = xvdi_get_oename(xnbp->xnb_devinfo);
1663 
1664 	if (xenbus_gather(XBT_NULL, oename,
1665 	    "event-channel", "%u", &xnbp->xnb_fe_evtchn,
1666 	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
1667 	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
1668 	    NULL) != 0) {
1669 		cmn_err(CE_WARN, "xnb_read_oe_config: "
1670 		    "cannot read other-end details from %s",
1671 		    oename);
1672 		return (B_FALSE);
1673 	}
1674 
1675 	/*
1676 	 * Check whether our peer requests receive side hypervisor
1677 	 * copy.
1678 	 */
1679 	if (xenbus_scanf(XBT_NULL, oename,
1680 	    "request-rx-copy", "%d", &i) != 0)
1681 		i = 0;
1682 	if (i != 0)
1683 		xnbp->xnb_rx_hv_copy = B_TRUE;
1684 
1685 	/*
1686 	 * Check whether our peer requests multicast_control.
1687 	 */
1688 	if (xenbus_scanf(XBT_NULL, oename,
1689 	    "request-multicast-control", "%d", &i) != 0)
1690 		i = 0;
1691 	if (i != 0)
1692 		xnbp->xnb_multicast_control = B_TRUE;
1693 
1694 	/*
1695 	 * The Linux backend driver here checks to see if the peer has
1696 	 * set 'feature-no-csum-offload'. This is used to indicate
1697 	 * that the guest cannot handle receiving packets without a
1698 	 * valid checksum. We don't check here, because packets passed
1699 	 * to the peer _always_ have a valid checksum.
1700 	 *
1701 	 * There are three cases:
1702 	 *
1703 	 * - the NIC is dedicated: packets from the wire should always
1704 	 *   have a valid checksum. If the hardware validates the
1705 	 *   checksum then the relevant bit will be set in the packet
1706 	 *   attributes and we will inform the peer. It can choose to
1707 	 *   ignore the hardware verification.
1708 	 *
1709 	 * - the NIC is shared (VNIC) and a packet originates from the
1710 	 *   wire: this is the same as the case above - the packets
1711 	 *   will have a valid checksum.
1712 	 *
1713 	 * - the NIC is shared (VNIC) and a packet originates from the
1714 	 *   host: the MAC layer ensures that all such packets have a
1715 	 *   valid checksum by calculating one if the stack did not.
1716 	 */
1717 
1718 	return (B_TRUE);
1719 }
1720 
1721 void
1722 xnb_start_connect(xnb_t *xnbp)
1723 {
1724 	dev_info_t  *dip = xnbp->xnb_devinfo;
1725 
1726 	if (!xnb_connect_rings(dip)) {
1727 		cmn_err(CE_WARN, "xnb_start_connect: "
1728 		    "cannot connect rings");
1729 		goto failed;
1730 	}
1731 
1732 	if (!xnbp->xnb_flavour->xf_start_connect(xnbp)) {
1733 		cmn_err(CE_WARN, "xnb_start_connect: "
1734 		    "flavour failed to connect");
1735 		goto failed;
1736 	}
1737 
1738 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1739 	return;
1740 
1741 failed:
1742 	xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1743 	xnb_disconnect_rings(dip);
1744 	(void) xvdi_switch_state(dip, XBT_NULL,
1745 	    XenbusStateClosed);
1746 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1747 }
1748 
1749 static boolean_t
1750 xnb_connect_rings(dev_info_t *dip)
1751 {
1752 	xnb_t *xnbp = ddi_get_driver_private(dip);
1753 	struct gnttab_map_grant_ref map_op;
1754 
1755 	/*
1756 	 * Cannot attempt to connect the rings if already connected.
1757 	 */
1758 	ASSERT(!xnbp->xnb_connected);
1759 
1760 	/*
1761 	 * 1. allocate a vaddr for the tx page, one for the rx page.
1762 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1763 	 *    into the allocated vaddr (one for tx, one for rx).
1764 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1765 	 *    bound to this domain.
1766 	 * 4. associate the event channel with an interrupt.
1767 	 * 5. enable the interrupt.
1768 	 */
1769 
1770 	/* 1.tx */
1771 	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1772 	    0, 0, 0, 0, VM_SLEEP);
1773 	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
1774 
1775 	/* 2.tx */
1776 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
1777 	map_op.flags = GNTMAP_host_map;
1778 	map_op.ref = xnbp->xnb_tx_ring_ref;
1779 	map_op.dom = xnbp->xnb_peer;
1780 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr, NULL);
1781 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1782 	    map_op.status != 0) {
1783 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1784 		goto fail;
1785 	}
1786 	xnbp->xnb_tx_ring_handle = map_op.handle;
1787 
1788 	/* LINTED: constant in conditional context */
1789 	BACK_RING_INIT(&xnbp->xnb_tx_ring,
1790 	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1791 
1792 	/* 1.rx */
1793 	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1794 	    0, 0, 0, 0, VM_SLEEP);
1795 	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
1796 
1797 	/* 2.rx */
1798 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
1799 	map_op.flags = GNTMAP_host_map;
1800 	map_op.ref = xnbp->xnb_rx_ring_ref;
1801 	map_op.dom = xnbp->xnb_peer;
1802 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr, NULL);
1803 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1804 	    map_op.status != 0) {
1805 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1806 		goto fail;
1807 	}
1808 	xnbp->xnb_rx_ring_handle = map_op.handle;
1809 
1810 	/* LINTED: constant in conditional context */
1811 	BACK_RING_INIT(&xnbp->xnb_rx_ring,
1812 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
1813 
1814 	/* 3 */
1815 	if (xvdi_bind_evtchn(dip, xnbp->xnb_fe_evtchn) != DDI_SUCCESS) {
1816 		cmn_err(CE_WARN, "xnb_connect_rings: "
1817 		    "cannot bind event channel %d", xnbp->xnb_evtchn);
1818 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1819 		goto fail;
1820 	}
1821 	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
1822 
1823 	/*
1824 	 * It would be good to set the state to XenbusStateConnected
1825 	 * here as well, but then what if ddi_add_intr() failed?
1826 	 * Changing the state in the store will be noticed by the peer
1827 	 * and cannot be "taken back".
1828 	 */
1829 	mutex_enter(&xnbp->xnb_tx_lock);
1830 	mutex_enter(&xnbp->xnb_rx_lock);
1831 
1832 	xnbp->xnb_connected = B_TRUE;
1833 
1834 	mutex_exit(&xnbp->xnb_rx_lock);
1835 	mutex_exit(&xnbp->xnb_tx_lock);
1836 
1837 	/* 4, 5 */
1838 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1839 	    != DDI_SUCCESS) {
1840 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1841 		goto fail;
1842 	}
1843 	xnbp->xnb_irq = B_TRUE;
1844 
1845 	return (B_TRUE);
1846 
1847 fail:
1848 	mutex_enter(&xnbp->xnb_tx_lock);
1849 	mutex_enter(&xnbp->xnb_rx_lock);
1850 
1851 	xnbp->xnb_connected = B_FALSE;
1852 
1853 	mutex_exit(&xnbp->xnb_rx_lock);
1854 	mutex_exit(&xnbp->xnb_tx_lock);
1855 
1856 	return (B_FALSE);
1857 }
1858 
1859 static void
1860 xnb_disconnect_rings(dev_info_t *dip)
1861 {
1862 	xnb_t *xnbp = ddi_get_driver_private(dip);
1863 
1864 	if (xnbp->xnb_irq) {
1865 		ddi_remove_intr(dip, 0, NULL);
1866 		xnbp->xnb_irq = B_FALSE;
1867 	}
1868 
1869 	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
1870 		xvdi_free_evtchn(dip);
1871 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1872 	}
1873 
1874 	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
1875 		struct gnttab_unmap_grant_ref unmap_op;
1876 
1877 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1878 		    xnbp->xnb_rx_ring_addr;
1879 		unmap_op.dev_bus_addr = 0;
1880 		unmap_op.handle = xnbp->xnb_rx_ring_handle;
1881 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1882 		    &unmap_op, 1) != 0)
1883 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1884 			    "cannot unmap rx-ring page (%d)",
1885 			    unmap_op.status);
1886 
1887 		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
1888 	}
1889 
1890 	if (xnbp->xnb_rx_ring_addr != NULL) {
1891 		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1892 		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
1893 		xnbp->xnb_rx_ring_addr = NULL;
1894 	}
1895 
1896 	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
1897 		struct gnttab_unmap_grant_ref unmap_op;
1898 
1899 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1900 		    xnbp->xnb_tx_ring_addr;
1901 		unmap_op.dev_bus_addr = 0;
1902 		unmap_op.handle = xnbp->xnb_tx_ring_handle;
1903 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1904 		    &unmap_op, 1) != 0)
1905 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1906 			    "cannot unmap tx-ring page (%d)",
1907 			    unmap_op.status);
1908 
1909 		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
1910 	}
1911 
1912 	if (xnbp->xnb_tx_ring_addr != NULL) {
1913 		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1914 		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
1915 		xnbp->xnb_tx_ring_addr = NULL;
1916 	}
1917 }
1918 
1919 static void
1920 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1921     void *arg, void *impl_data)
1922 {
1923 	_NOTE(ARGUNUSED(id, arg));
1924 	xnb_t *xnbp = ddi_get_driver_private(dip);
1925 	XenbusState new_state = *(XenbusState *)impl_data;
1926 
1927 	ASSERT(xnbp != NULL);
1928 
1929 	switch (new_state) {
1930 	case XenbusStateConnected:
1931 		/* spurious state change */
1932 		if (xnbp->xnb_connected)
1933 			return;
1934 
1935 		if (!xnb_read_oe_config(xnbp) ||
1936 		    !xnbp->xnb_flavour->xf_peer_connected(xnbp)) {
1937 			cmn_err(CE_WARN, "xnb_oe_state_change: "
1938 			    "read otherend config error");
1939 			(void) xvdi_switch_state(dip, XBT_NULL,
1940 			    XenbusStateClosed);
1941 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1942 
1943 			break;
1944 		}
1945 
1946 
1947 		mutex_enter(&xnbp->xnb_state_lock);
1948 		xnbp->xnb_fe_status = XNB_STATE_READY;
1949 		if (xnbp->xnb_be_status == XNB_STATE_READY)
1950 			xnb_start_connect(xnbp);
1951 		mutex_exit(&xnbp->xnb_state_lock);
1952 
1953 		/*
1954 		 * Now that we've attempted to connect it's reasonable
1955 		 * to allow an attempt to detach.
1956 		 */
1957 		xnbp->xnb_detachable = B_TRUE;
1958 
1959 		break;
1960 
1961 	case XenbusStateClosing:
1962 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1963 
1964 		break;
1965 
1966 	case XenbusStateClosed:
1967 		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1968 
1969 		mutex_enter(&xnbp->xnb_tx_lock);
1970 		mutex_enter(&xnbp->xnb_rx_lock);
1971 
1972 		xnb_disconnect_rings(dip);
1973 		xnbp->xnb_connected = B_FALSE;
1974 
1975 		mutex_exit(&xnbp->xnb_rx_lock);
1976 		mutex_exit(&xnbp->xnb_tx_lock);
1977 
1978 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1979 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1980 		/*
1981 		 * In all likelyhood this is already set (in the above
1982 		 * case), but if the peer never attempted to connect
1983 		 * and the domain is destroyed we get here without
1984 		 * having been through the case above, so we set it to
1985 		 * be sure.
1986 		 */
1987 		xnbp->xnb_detachable = B_TRUE;
1988 
1989 		break;
1990 
1991 	default:
1992 		break;
1993 	}
1994 }
1995 
1996 static void
1997 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1998     void *arg, void *impl_data)
1999 {
2000 	_NOTE(ARGUNUSED(id, arg));
2001 	xnb_t *xnbp = ddi_get_driver_private(dip);
2002 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
2003 
2004 	ASSERT(xnbp != NULL);
2005 
2006 	switch (state) {
2007 	case Connected:
2008 		/* spurious hotplug event */
2009 		if (xnbp->xnb_hotplugged)
2010 			break;
2011 
2012 		if (!xnb_read_xs_config(xnbp))
2013 			break;
2014 
2015 		if (!xnbp->xnb_flavour->xf_hotplug_connected(xnbp))
2016 			break;
2017 
2018 		mutex_enter(&xnbp->xnb_tx_lock);
2019 		mutex_enter(&xnbp->xnb_rx_lock);
2020 
2021 		xnbp->xnb_hotplugged = B_TRUE;
2022 
2023 		mutex_exit(&xnbp->xnb_rx_lock);
2024 		mutex_exit(&xnbp->xnb_tx_lock);
2025 
2026 		mutex_enter(&xnbp->xnb_state_lock);
2027 		xnbp->xnb_be_status = XNB_STATE_READY;
2028 		if (xnbp->xnb_fe_status == XNB_STATE_READY)
2029 			xnb_start_connect(xnbp);
2030 		mutex_exit(&xnbp->xnb_state_lock);
2031 
2032 		break;
2033 
2034 	default:
2035 		break;
2036 	}
2037 }
2038 
2039 static struct modldrv modldrv = {
2040 	&mod_miscops, "xnb",
2041 };
2042 
2043 static struct modlinkage modlinkage = {
2044 	MODREV_1, &modldrv, NULL
2045 };
2046 
2047 int
2048 _init(void)
2049 {
2050 	int i;
2051 
2052 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
2053 
2054 	i = mod_install(&modlinkage);
2055 	if (i != DDI_SUCCESS)
2056 		mutex_destroy(&xnb_alloc_page_lock);
2057 
2058 	return (i);
2059 }
2060 
2061 int
2062 _info(struct modinfo *modinfop)
2063 {
2064 	return (mod_info(&modlinkage, modinfop));
2065 }
2066 
2067 int
2068 _fini(void)
2069 {
2070 	int i;
2071 
2072 	i = mod_remove(&modlinkage);
2073 	if (i == DDI_SUCCESS)
2074 		mutex_destroy(&xnb_alloc_page_lock);
2075 
2076 	return (i);
2077 }
2078