xref: /illumos-gate/usr/src/uts/common/xen/io/xnb.c (revision 1ee13a44c225078280d7fba5012905966ff4ad92)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifdef DEBUG
28 #define	XNB_DEBUG 1
29 #endif /* DEBUG */
30 
31 #include "xnb.h"
32 
33 #include <sys/sunddi.h>
34 #include <sys/sunndi.h>
35 #include <sys/modctl.h>
36 #include <sys/conf.h>
37 #include <sys/mac.h>
38 #include <sys/mac_impl.h> /* For mac_fix_cksum(). */
39 #include <sys/dlpi.h>
40 #include <sys/strsubr.h>
41 #include <sys/strsun.h>
42 #include <sys/types.h>
43 #include <sys/pattr.h>
44 #include <vm/seg_kmem.h>
45 #include <vm/hat_i86.h>
46 #include <xen/sys/xenbus_impl.h>
47 #include <xen/sys/xendev.h>
48 #include <sys/balloon_impl.h>
49 #include <sys/evtchn_impl.h>
50 #include <sys/gnttab.h>
51 #include <vm/vm_dep.h>
52 #include <sys/note.h>
53 #include <sys/gld.h>
54 #include <inet/ip.h>
55 #include <inet/ip_impl.h>
56 
57 /*
58  * The terms "transmit" and "receive" follow the peer domU's point of view:
59  * packets originating from the peer are "transmitted" into the rest of the
60  * system, and packets destined for the peer are "received" by it.
61  */
62 
63 /*
64  * Should we allow guests to manipulate multicast group membership?
65  */
66 static boolean_t	xnb_multicast_control = B_TRUE;
67 
68 static boolean_t	xnb_connect_rings(dev_info_t *);
69 static void		xnb_disconnect_rings(dev_info_t *);
70 static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
71     void *, void *);
72 static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
73     void *, void *);
74 
75 static int	xnb_txbuf_constructor(void *, void *, int);
76 static void	xnb_txbuf_destructor(void *, void *);
77 static void	xnb_tx_notify_peer(xnb_t *, boolean_t);
78 static void	xnb_tx_mark_complete(xnb_t *, RING_IDX, int16_t);
79 
80 mblk_t		*xnb_to_peer(xnb_t *, mblk_t *);
81 mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);
82 
83 static void		setup_gop(xnb_t *, gnttab_copy_t *, uchar_t *,
84     size_t, size_t, size_t, grant_ref_t);
85 #pragma inline(setup_gop)
86 static boolean_t	is_foreign(void *);
87 #pragma inline(is_foreign)
88 
89 #define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
90 #define	INVALID_GRANT_REF	((grant_ref_t)-1)
91 
92 static kmutex_t	xnb_alloc_page_lock;	/* protects the MFN batch in xnb_alloc_page() */
93 
94 /*
95  * On a 32 bit PAE system physical and machine addresses are larger
96  * than 32 bits.  ddi_btop() on such systems takes an unsigned long
97  * argument, and so addresses above 4G are truncated before ddi_btop()
98  * gets to see them.  To avoid this, code the shift operation here.
99  */
100 #define	xnb_btop(addr)	((addr) >> PAGESHIFT)
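/*
 * For example, the 36 bit machine address 0x123456000 is truncated to
 * 0x23456000 by ddi_btop() on a 32 bit PAE kernel, giving page 0x23456;
 * xnb_btop(0x123456000ULL) yields the correct page number, 0x123456.
 */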
101 
102 /* DMA attributes for transmit and receive data */
103 static ddi_dma_attr_t buf_dma_attr = {
104 	DMA_ATTR_V0,		/* version of this structure */
105 	0,			/* lowest usable address */
106 	0xffffffffffffffffULL,	/* highest usable address */
107 	0x7fffffff,		/* maximum DMAable byte count */
108 	MMU_PAGESIZE,		/* alignment in bytes */
109 	0x7ff,			/* bitmap of burst sizes */
110 	1,			/* minimum transfer */
111 	0xffffffffU,		/* maximum transfer */
112 	0xffffffffffffffffULL,	/* maximum segment length */
113 	1,			/* maximum number of segments */
114 	1,			/* granularity */
115 	0,			/* flags (reserved) */
116 };
117 
118 /* DMA access attributes for data: NOT to be byte swapped. */
119 static ddi_device_acc_attr_t data_accattr = {
120 	DDI_DEVICE_ATTR_V0,
121 	DDI_NEVERSWAP_ACC,
122 	DDI_STRICTORDER_ACC
123 };
124 
125 /*
126  * Statistics.
127  */
128 static const char * const aux_statistics[] = {
129 	"rx_cksum_deferred",
130 	"tx_cksum_no_need",
131 	"rx_rsp_notok",
132 	"tx_notify_deferred",
133 	"tx_notify_sent",
134 	"rx_notify_deferred",
135 	"rx_notify_sent",
136 	"tx_too_early",
137 	"rx_too_early",
138 	"rx_allocb_failed",
139 	"tx_allocb_failed",
140 	"rx_foreign_page",
141 	"mac_full",
142 	"spurious_intr",
143 	"allocation_success",
144 	"allocation_failure",
145 	"small_allocation_success",
146 	"small_allocation_failure",
147 	"other_allocation_failure",
148 	"rx_pageboundary_crossed",
149 	"rx_cpoparea_grown",
150 	"csum_hardware",
151 	"csum_software",
152 	"tx_overflow_page",
153 	"tx_unexpected_flags",
154 };
155 
156 static int
157 xnb_ks_aux_update(kstat_t *ksp, int flag)
158 {
159 	xnb_t *xnbp;
160 	kstat_named_t *knp;
161 
162 	if (flag != KSTAT_READ)
163 		return (EACCES);
164 
165 	xnbp = ksp->ks_private;
166 	knp = ksp->ks_data;
167 
168 	/*
169 	 * Assignment order should match that of the names in
170 	 * aux_statistics.
171 	 */
172 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_deferred;
173 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_no_need;
174 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_rsp_notok;
175 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
176 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
177 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
178 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
179 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
180 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
181 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
182 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
183 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_foreign_page;
184 	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
185 	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
186 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
187 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
188 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
189 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
190 	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
191 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_pagebndry_crossed;
192 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cpoparea_grown;
193 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
194 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
195 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_overflow_page;
196 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_unexpected_flags;
197 
198 	return (0);
199 }
200 
201 static boolean_t
202 xnb_ks_init(xnb_t *xnbp)
203 {
204 	int nstat = sizeof (aux_statistics) /
205 	    sizeof (aux_statistics[0]);
206 	const char * const *cp = aux_statistics;
207 	kstat_named_t *knp;
208 
209 	/*
210 	 * Create and initialise kstats.
211 	 */
212 	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
213 	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
214 	    KSTAT_TYPE_NAMED, nstat, 0);
215 	if (xnbp->xnb_kstat_aux == NULL)
216 		return (B_FALSE);
217 
218 	xnbp->xnb_kstat_aux->ks_private = xnbp;
219 	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
220 
221 	knp = xnbp->xnb_kstat_aux->ks_data;
222 	while (nstat > 0) {
223 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
224 
225 		knp++;
226 		cp++;
227 		nstat--;
228 	}
229 
230 	kstat_install(xnbp->xnb_kstat_aux);
231 
232 	return (B_TRUE);
233 }
234 
235 static void
236 xnb_ks_free(xnb_t *xnbp)
237 {
238 	kstat_delete(xnbp->xnb_kstat_aux);
239 }
240 
241 /*
242  * Calculate and insert the transport checksum for an arbitrary packet.
243  */
244 static mblk_t *
245 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
246 {
247 	_NOTE(ARGUNUSED(xnbp));
248 
249 	/*
250 	 * XXPV dme: shouldn't rely on mac_fix_cksum(), not least
251 	 * because it doesn't cover all of the interesting cases :-(
252 	 */
253 	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
254 	    HCK_FULLCKSUM, KM_NOSLEEP);
255 
256 	return (mac_fix_cksum(mp));
257 }
258 
259 mblk_t *
260 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
261 {
262 	struct ether_header *ehp;
263 	uint16_t sap;
264 	uint32_t offset;
265 	ipha_t *ipha;
266 
267 	ASSERT(mp->b_next == NULL);
268 
269 	/*
270 	 * Check that the packet is contained in a single mblk.  In
271 	 * the "from peer" path this is true today, but may change
272 	 * when scatter gather support is added.  In the "to peer"
273 	 * path we cannot be sure, but in most cases it will be true
274 	 * (in the xnbo case the packet has come from a MAC device
275 	 * which is unlikely to split packets).
276 	 */
277 	if (mp->b_cont != NULL)
278 		goto software;
279 
280 	/*
281 	 * If the MAC has no hardware capability don't do any further
282 	 * checking.
283 	 */
284 	if (capab == 0)
285 		goto software;
286 
287 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
288 	ehp = (struct ether_header *)mp->b_rptr;
289 
290 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
291 		struct ether_vlan_header *evhp;
292 
293 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
294 		evhp = (struct ether_vlan_header *)mp->b_rptr;
295 		sap = ntohs(evhp->ether_type);
296 		offset = sizeof (struct ether_vlan_header);
297 	} else {
298 		sap = ntohs(ehp->ether_type);
299 		offset = sizeof (struct ether_header);
300 	}
301 
302 	/*
303 	 * We attempt hardware checksum offload only for IPv4 packets.
304 	 */
305 	if (sap != ETHERTYPE_IP)
306 		goto software;
307 
308 	/*
309 	 * We know that this is an IPv4 packet.
310 	 */
311 	ipha = (ipha_t *)(mp->b_rptr + offset);
312 
313 	switch (ipha->ipha_protocol) {
314 	case IPPROTO_TCP:
315 	case IPPROTO_UDP: {
316 		uint32_t start, length, stuff, cksum;
317 		uint16_t *stuffp;
318 
319 		/*
320 		 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
321 		 * can use full IPv4 and partial checksum offload.
322 		 */
323 		if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
324 			break;
325 
326 		start = IP_SIMPLE_HDR_LENGTH;
327 		length = ntohs(ipha->ipha_length);
328 		if (ipha->ipha_protocol == IPPROTO_TCP) {
329 			stuff = start + TCP_CHECKSUM_OFFSET;
330 			cksum = IP_TCP_CSUM_COMP;
331 		} else {
332 			stuff = start + UDP_CHECKSUM_OFFSET;
333 			cksum = IP_UDP_CSUM_COMP;
334 		}
335 		stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);
336 
337 		if (capab & HCKSUM_INET_FULL_V4) {
338 			/*
339 			 * Some devices require that the checksum
340 			 * field of the packet is zero for full
341 			 * offload.
342 			 */
343 			*stuffp = 0;
344 
345 			(void) hcksum_assoc(mp, NULL, NULL,
346 			    0, 0, 0, 0,
347 			    HCK_FULLCKSUM, KM_NOSLEEP);
348 
349 			xnbp->xnb_stat_csum_hardware++;
350 
351 			return (mp);
352 		}
353 
354 		if (capab & HCKSUM_INET_PARTIAL) {
355 			if (*stuffp == 0) {
356 				ipaddr_t src, dst;
357 
358 				/*
359 				 * Older Solaris guests don't insert
360 				 * the pseudo-header checksum, so we
361 				 * calculate it here.
362 				 */
363 				src = ipha->ipha_src;
364 				dst = ipha->ipha_dst;
365 
366 				cksum += (dst >> 16) + (dst & 0xFFFF);
367 				cksum += (src >> 16) + (src & 0xFFFF);
368 				cksum += length - IP_SIMPLE_HDR_LENGTH;
369 
370 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
371 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
372 
373 				ASSERT(cksum <= 0xFFFF);
374 
375 				*stuffp = (uint16_t)(cksum ? cksum : ~cksum);
376 			}
377 
378 			(void) hcksum_assoc(mp, NULL, NULL,
379 			    start, stuff, length, 0,
380 			    HCK_PARTIALCKSUM, KM_NOSLEEP);
381 
382 			xnbp->xnb_stat_csum_hardware++;
383 
384 			return (mp);
385 		}
386 
387 		/* NOTREACHED */
388 		break;
389 	}
390 
391 	default:
392 		/* Use software. */
393 		break;
394 	}
395 
396 software:
397 	/*
398 	 * We are not able to use any offload so do the whole thing in
399 	 * software.
400 	 */
401 	xnbp->xnb_stat_csum_software++;
402 
403 	return (xnb_software_csum(xnbp, mp));
404 }
405 
406 int
407 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
408 {
409 	xnb_t *xnbp;
410 	char *xsname;
411 	char cachename[32];
412 
413 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
414 
415 	xnbp->xnb_flavour = flavour;
416 	xnbp->xnb_flavour_data = flavour_data;
417 	xnbp->xnb_devinfo = dip;
418 	xnbp->xnb_evtchn = INVALID_EVTCHN;
419 	xnbp->xnb_irq = B_FALSE;
420 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
421 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
422 	xnbp->xnb_connected = B_FALSE;
423 	xnbp->xnb_hotplugged = B_FALSE;
424 	xnbp->xnb_detachable = B_FALSE;
425 	xnbp->xnb_peer = xvdi_get_oeid(dip);
426 	xnbp->xnb_be_status = XNB_STATE_INIT;
427 	xnbp->xnb_fe_status = XNB_STATE_INIT;
428 
429 	xnbp->xnb_tx_buf_count = 0;
430 
431 	xnbp->xnb_rx_hv_copy = B_FALSE;
432 	xnbp->xnb_multicast_control = B_FALSE;
433 
434 	xnbp->xnb_rx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
435 	ASSERT(xnbp->xnb_rx_va != NULL);
436 
437 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
438 	    != DDI_SUCCESS)
439 		goto failure;
440 
441 	/* Allocated on demand, when/if we enter xnb_copy_to_peer(). */
442 	xnbp->xnb_rx_cpop = NULL;
443 	xnbp->xnb_rx_cpop_count = 0;
444 
445 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
446 	    xnbp->xnb_icookie);
447 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
448 	    xnbp->xnb_icookie);
449 	mutex_init(&xnbp->xnb_state_lock, NULL, MUTEX_DRIVER,
450 	    xnbp->xnb_icookie);
451 
452 	/* Set driver private pointer now. */
453 	ddi_set_driver_private(dip, xnbp);
454 
455 	(void) sprintf(cachename, "xnb_tx_buf_cache_%d", ddi_get_instance(dip));
456 	xnbp->xnb_tx_buf_cache = kmem_cache_create(cachename,
457 	    sizeof (xnb_txbuf_t), 0,
458 	    xnb_txbuf_constructor, xnb_txbuf_destructor,
459 	    NULL, xnbp, NULL, 0);
460 	if (xnbp->xnb_tx_buf_cache == NULL)
461 		goto failure_0;
462 
463 	if (!xnb_ks_init(xnbp))
464 		goto failure_1;
465 
466 	/*
467 	 * Receive notification of changes in the state of the
468 	 * driver in the guest domain.
469 	 */
470 	if (xvdi_add_event_handler(dip, XS_OE_STATE, xnb_oe_state_change,
471 	    NULL) != DDI_SUCCESS)
472 		goto failure_2;
473 
474 	/*
475 	 * Receive notification of hotplug events.
476 	 */
477 	if (xvdi_add_event_handler(dip, XS_HP_STATE, xnb_hp_state_change,
478 	    NULL) != DDI_SUCCESS)
479 		goto failure_2;
480 
481 	xsname = xvdi_get_xsname(dip);
482 
483 	if (xenbus_printf(XBT_NULL, xsname,
484 	    "feature-multicast-control", "%d",
485 	    xnb_multicast_control ? 1 : 0) != 0)
486 		goto failure_3;
487 
488 	if (xenbus_printf(XBT_NULL, xsname,
489 	    "feature-rx-copy", "%d",  1) != 0)
490 		goto failure_3;
491 	/*
492 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
493 	 * in addition to "feature-rx-copy" being 1. It seems strange
494 	 * to use four possible states to describe a binary decision,
495 	 * but we might as well play nice.
496 	 */
497 	if (xenbus_printf(XBT_NULL, xsname,
498 	    "feature-rx-flip", "%d", 0) != 0)
499 		goto failure_3;
500 
501 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
502 	(void) xvdi_post_event(dip, XEN_HP_ADD);
503 
504 	return (DDI_SUCCESS);
505 
506 failure_3:
507 	xvdi_remove_event_handler(dip, NULL);
508 
509 failure_2:
510 	xnb_ks_free(xnbp);
511 
512 failure_1:
513 	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
514 
515 failure_0:
516 	mutex_destroy(&xnbp->xnb_state_lock);
517 	mutex_destroy(&xnbp->xnb_rx_lock);
518 	mutex_destroy(&xnbp->xnb_tx_lock);
519 
520 failure:
521 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
522 	kmem_free(xnbp, sizeof (*xnbp));
523 	return (DDI_FAILURE);
524 }
525 
526 void
527 xnb_detach(dev_info_t *dip)
528 {
529 	xnb_t *xnbp = ddi_get_driver_private(dip);
530 
531 	ASSERT(xnbp != NULL);
532 	ASSERT(!xnbp->xnb_connected);
533 	ASSERT(xnbp->xnb_tx_buf_count == 0);
534 
535 	xnb_disconnect_rings(dip);
536 
537 	xvdi_remove_event_handler(dip, NULL);
538 
539 	xnb_ks_free(xnbp);
540 
541 	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
542 
543 	ddi_set_driver_private(dip, NULL);
544 
545 	mutex_destroy(&xnbp->xnb_state_lock);
546 	mutex_destroy(&xnbp->xnb_rx_lock);
547 	mutex_destroy(&xnbp->xnb_tx_lock);
548 
549 	if (xnbp->xnb_rx_cpop_count > 0)
550 		kmem_free(xnbp->xnb_rx_cpop, sizeof (xnbp->xnb_rx_cpop[0])
551 		    * xnbp->xnb_rx_cpop_count);
552 
553 	ASSERT(xnbp->xnb_rx_va != NULL);
554 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
555 
556 	kmem_free(xnbp, sizeof (*xnbp));
557 }
558 
559 /*
560  * Allocate a page from the hypervisor to be flipped to the peer.
561  *
562  * Try to get pages in batches to reduce the overhead of calls into
563  * the balloon driver.
564  */
565 static mfn_t
566 xnb_alloc_page(xnb_t *xnbp)
567 {
568 #define	WARNING_RATE_LIMIT 100
569 #define	BATCH_SIZE 256
570 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
571 	static int nth = BATCH_SIZE;
572 	mfn_t mfn;
573 
574 	mutex_enter(&xnb_alloc_page_lock);
575 	if (nth == BATCH_SIZE) {
576 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
577 			xnbp->xnb_stat_allocation_failure++;
578 			mutex_exit(&xnb_alloc_page_lock);
579 
580 			/*
581 			 * Try for a single page in low memory situations.
582 			 */
583 			if (balloon_alloc_pages(1, &mfn) != 1) {
584 				if ((xnbp->xnb_stat_small_allocation_failure++
585 				    % WARNING_RATE_LIMIT) == 0)
586 					cmn_err(CE_WARN, "xnb_alloc_page: "
587 					    "Cannot allocate memory to "
588 					    "transfer packets to peer.");
589 				return (0);
590 			} else {
591 				xnbp->xnb_stat_small_allocation_success++;
592 				return (mfn);
593 			}
594 		}
595 
596 		nth = 0;
597 		xnbp->xnb_stat_allocation_success++;
598 	}
599 
600 	mfn = mfns[nth++];
601 	mutex_exit(&xnb_alloc_page_lock);
602 
603 	ASSERT(mfn != 0);
604 
605 	return (mfn);
606 #undef BATCH_SIZE
607 #undef WARNING_RATE_LIMIT
608 }
609 
610 /*
611  * Free a page back to the hypervisor.
612  *
613  * This happens only in the error path, so batching is not worth the
614  * complication.
615  */
616 static void
617 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
618 {
619 	_NOTE(ARGUNUSED(xnbp));
620 	int r;
621 	pfn_t pfn;
622 
623 	pfn = xen_assign_pfn(mfn);
624 	pfnzero(pfn, 0, PAGESIZE);
625 	xen_release_pfn(pfn);
626 
627 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
628 		cmn_err(CE_WARN, "free_page: cannot decrease memory "
629 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
630 		    r, mfn);
631 	}
632 }
633 
634 /*
635  * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but using
636  * local variables. Used in both xnb_to_peer() and xnb_copy_to_peer().
637  */
638 #define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
639 	((((_r)->sring->req_prod - loop) <		\
640 		(RING_SIZE(_r) - (loop - prod))) ?	\
641 	    ((_r)->sring->req_prod - loop) :		\
642 	    (RING_SIZE(_r) - (loop - prod)))
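/*
 * That is, the macro evaluates to the number of further requests that may
 * safely be consumed: the lesser of the requests posted by the peer but not
 * yet consumed (req_prod - loop) and the response slots still free, i.e.
 * RING_SIZE less the requests consumed but not yet answered (loop - prod).
 */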
643 
644 /*
645  * Pass packets to the peer using page flipping.
646  */
647 mblk_t *
648 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
649 {
650 	mblk_t *free = mp, *prev = NULL;
651 	size_t len;
652 	gnttab_transfer_t *gop;
653 	boolean_t notify;
654 	RING_IDX loop, prod, end;
655 
656 	/*
657 	 * For each packet the sequence of operations is:
658 	 *
659 	 * 1. get a new page from the hypervisor.
660 	 * 2. get a request slot from the ring.
661 	 * 3. copy the data into the new page.
662 	 * 4. transfer the page to the peer.
663 	 * 5. update the request slot.
664 	 * 6. kick the peer.
665 	 * 7. free mp.
666 	 *
667 	 * In order to reduce the number of hypercalls, we prepare
668 	 * several packets for the peer and perform a single hypercall
669 	 * to transfer them.
670 	 */
671 
672 	mutex_enter(&xnbp->xnb_rx_lock);
673 
674 	/*
675 	 * If we are not connected to the peer or have not yet
676 	 * finished hotplug it is too early to pass packets to the
677 	 * peer.
678 	 */
679 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
680 		mutex_exit(&xnbp->xnb_rx_lock);
681 		DTRACE_PROBE(flip_rx_too_early);
682 		xnbp->xnb_stat_rx_too_early++;
683 		return (mp);
684 	}
685 
686 	loop = xnbp->xnb_rx_ring.req_cons;
687 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
688 	gop = xnbp->xnb_rx_top;
689 
690 	while ((mp != NULL) &&
691 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
692 
693 		mfn_t mfn;
694 		pfn_t pfn;
695 		netif_rx_request_t *rxreq;
696 		netif_rx_response_t *rxresp;
697 		char *valoop;
698 		mblk_t *ml;
699 		uint16_t cksum_flags;
700 
701 		/* 1 */
702 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
703 			xnbp->xnb_stat_rx_defer++;
704 			break;
705 		}
706 
707 		/* 2 */
708 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
709 
710 #ifdef XNB_DEBUG
711 		if (!(rxreq->id < NET_RX_RING_SIZE))
712 			cmn_err(CE_PANIC, "xnb_to_peer: "
713 			    "id %d out of range in request 0x%p",
714 			    rxreq->id, (void *)rxreq);
715 #endif /* XNB_DEBUG */
716 
717 		/* Assign a pfn and map the new page at the allocated va. */
718 		pfn = xen_assign_pfn(mfn);
719 		hat_devload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
720 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
721 
722 		/* 3 */
723 		len = 0;
724 		valoop = xnbp->xnb_rx_va;
725 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
726 			size_t chunk = ml->b_wptr - ml->b_rptr;
727 
728 			bcopy(ml->b_rptr, valoop, chunk);
729 			valoop += chunk;
730 			len += chunk;
731 		}
732 
733 		ASSERT(len < PAGESIZE);
734 
735 		/* Release the pfn. */
736 		hat_unload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
737 		    HAT_UNLOAD_UNMAP);
738 		xen_release_pfn(pfn);
739 
740 		/* 4 */
741 		gop->mfn = mfn;
742 		gop->domid = xnbp->xnb_peer;
743 		gop->ref = rxreq->gref;
744 
745 		/* 5.1 */
746 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
747 		rxresp->offset = 0;
748 		rxresp->flags = 0;
749 
750 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
751 		if (cksum_flags != 0)
752 			xnbp->xnb_stat_rx_cksum_deferred++;
753 		rxresp->flags |= cksum_flags;
754 
755 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
756 		rxresp->status = len;
757 
758 		loop++;
759 		prod++;
760 		gop++;
761 		prev = mp;
762 		mp = mp->b_next;
763 	}
764 
765 	/*
766 	 * Did we actually do anything?
767 	 */
768 	if (loop == xnbp->xnb_rx_ring.req_cons) {
769 		mutex_exit(&xnbp->xnb_rx_lock);
770 		return (mp);
771 	}
772 
773 	end = loop;
774 
775 	/*
776 	 * Unlink the end of the 'done' list from the remainder.
777 	 */
778 	ASSERT(prev != NULL);
779 	prev->b_next = NULL;
780 
781 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_rx_top,
782 	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
783 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
784 	}
785 
786 	loop = xnbp->xnb_rx_ring.req_cons;
787 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
788 	gop = xnbp->xnb_rx_top;
789 
790 	while (loop < end) {
791 		int16_t status = NETIF_RSP_OKAY;
792 
793 		if (gop->status != 0) {
794 			status = NETIF_RSP_ERROR;
795 
796 			/*
797 			 * If the status is anything other than
798 			 * GNTST_bad_page then we don't own the page
799 			 * any more, so don't try to give it back.
800 			 */
801 			if (gop->status != GNTST_bad_page)
802 				gop->mfn = 0;
803 		} else {
804 			/* The page is no longer ours. */
805 			gop->mfn = 0;
806 		}
807 
808 		if (gop->mfn != 0)
809 			/*
810 			 * Give back the page, as we won't be using
811 			 * it.
812 			 */
813 			xnb_free_page(xnbp, gop->mfn);
814 		else
815 			/*
816 			 * We gave away a page, update our accounting
817 			 * now.
818 			 */
819 			balloon_drv_subtracted(1);
820 
821 		/* 5.2 */
822 		if (status != NETIF_RSP_OKAY) {
823 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
824 			    status;
825 		} else {
826 			xnbp->xnb_stat_ipackets++;
827 			xnbp->xnb_stat_rbytes += len;
828 		}
829 
830 		loop++;
831 		prod++;
832 		gop++;
833 	}
834 
835 	xnbp->xnb_rx_ring.req_cons = loop;
836 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
837 
838 	/* 6 */
839 	/* LINTED: constant in conditional context */
840 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
841 	if (notify) {
842 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
843 		xnbp->xnb_stat_rx_notify_sent++;
844 	} else {
845 		xnbp->xnb_stat_rx_notify_deferred++;
846 	}
847 
848 	if (mp != NULL)
849 		xnbp->xnb_stat_rx_defer++;
850 
851 	mutex_exit(&xnbp->xnb_rx_lock);
852 
853 	/* Free mblk_t's that we consumed. */
854 	freemsgchain(free);
855 
856 	return (mp);
857 }
858 
859 /* Helper functions for xnb_copy_to_peer(). */
860 
861 /*
862  * Grow the array of copy operation descriptors.
863  */
864 static boolean_t
865 grow_cpop_area(xnb_t *xnbp)
866 {
867 	size_t count;
868 	gnttab_copy_t *new;
869 
870 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
871 
872 	count = xnbp->xnb_rx_cpop_count + CPOP_DEFCNT;
873 
874 	if ((new = kmem_alloc(sizeof (new[0]) * count, KM_NOSLEEP)) == NULL) {
875 		xnbp->xnb_stat_other_allocation_failure++;
876 		return (B_FALSE);
877 	}
878 
879 	bcopy(xnbp->xnb_rx_cpop, new,
880 	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
881 
882 	kmem_free(xnbp->xnb_rx_cpop,
883 	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
884 
885 	xnbp->xnb_rx_cpop = new;
886 	xnbp->xnb_rx_cpop_count = count;
887 
888 	xnbp->xnb_stat_rx_cpoparea_grown++;
889 
890 	return (B_TRUE);
891 }
892 
893 /*
894  * Check whether an address is on a page that's foreign to this domain.
895  */
896 static boolean_t
897 is_foreign(void *addr)
898 {
899 	pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
900 
901 	return ((pfn & PFN_IS_FOREIGN_MFN) == PFN_IS_FOREIGN_MFN);
902 }
903 
904 /*
905  * Insert a newly allocated mblk into a chain, replacing the old one.
906  */
907 static mblk_t *
908 replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
909 {
910 	uint32_t	start, stuff, end, value, flags;
911 	mblk_t		*new_mp;
912 
913 	new_mp = copyb(mp);
914 	if (new_mp == NULL)
915 		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message "
916 		    "for %p, len %lu", (void *)mp, len);
917 
918 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
919 	(void) hcksum_assoc(new_mp, NULL, NULL, start, stuff, end, value,
920 	    flags, KM_NOSLEEP);
921 
922 	new_mp->b_next = mp->b_next;
923 	new_mp->b_prev = mp->b_prev;
924 	new_mp->b_cont = mp->b_cont;
925 
926 	/* Make sure we only overwrite pointers to the mblk being replaced. */
927 	if (mp_prev != NULL && mp_prev->b_next == mp)
928 		mp_prev->b_next = new_mp;
929 
930 	if (ml_prev != NULL && ml_prev->b_cont == mp)
931 		ml_prev->b_cont = new_mp;
932 
933 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
934 	freemsg(mp);
935 
936 	return (new_mp);
937 }
938 
939 /*
940  * Set all the fields in a gnttab_copy_t.
941  */
942 static void
943 setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
944     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
945 {
946 	ASSERT(xnbp != NULL && gp != NULL);
947 
948 	gp->source.offset = s_off;
949 	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
950 	gp->source.domid = DOMID_SELF;
951 
952 	gp->len = (uint16_t)len;
953 	gp->flags = GNTCOPY_dest_gref;
954 	gp->status = 0;
955 
956 	gp->dest.u.ref = d_ref;
957 	gp->dest.offset = d_off;
958 	gp->dest.domid = xnbp->xnb_peer;
959 }
960 
961 /*
962  * Pass packets to the peer using hypervisor copy operations.
963  */
964 mblk_t *
965 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
966 {
967 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
968 	mblk_t		*ml, *ml_prev;
969 	boolean_t	notify;
970 	RING_IDX	loop, prod;
971 	int		i;
972 
973 	/*
974 	 * If the peer does not pre-post buffers for received packets,
975 	 * use page flipping to pass packets to it.
976 	 */
977 	if (!xnbp->xnb_rx_hv_copy)
978 		return (xnb_to_peer(xnbp, mp));
979 
980 	/*
981 	 * For each packet the sequence of operations is:
982 	 *
983 	 *  1. get a request slot from the ring.
984 	 *  2. set up data for hypercall (see NOTE below)
985 	 *  3. have the hypervisor copy the data
986 	 *  4. update the request slot.
987 	 *  5. kick the peer.
988 	 *
989 	 * NOTE ad 2.
990 	 *  In order to reduce the number of hypercalls, we prepare
991 	 *  several mblks (mp->b_cont != NULL) for the peer and
992 	 *  perform a single hypercall to transfer them.  We also have
993 	 *  to set up a separate copy operation for every page.
994 	 *
995 	 * If we have more than one packet (mp->b_next != NULL), we do
996 	 * this whole dance repeatedly.
997 	 */
998 
999 	mutex_enter(&xnbp->xnb_rx_lock);
1000 
1001 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
1002 		mutex_exit(&xnbp->xnb_rx_lock);
1003 		DTRACE_PROBE(copy_rx_too_early);
1004 		xnbp->xnb_stat_rx_too_early++;
1005 		return (mp);
1006 	}
1007 
1008 	loop = xnbp->xnb_rx_ring.req_cons;
1009 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
1010 
1011 	while ((mp != NULL) &&
1012 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
1013 		netif_rx_request_t	*rxreq;
1014 		size_t			d_offset, len;
1015 		int			item_count;
1016 		gnttab_copy_t		*gop_cp;
1017 		netif_rx_response_t	*rxresp;
1018 		uint16_t		cksum_flags;
1019 		int16_t			status = NETIF_RSP_OKAY;
1020 
1021 		/* 1 */
1022 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
1023 
1024 #ifdef XNB_DEBUG
1025 		if (!(rxreq->id < NET_RX_RING_SIZE))
1026 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
1027 			    "id %d out of range in request 0x%p",
1028 			    rxreq->id, (void *)rxreq);
1029 #endif /* XNB_DEBUG */
1030 
1031 		/* 2 */
1032 		d_offset = 0;
1033 		len = 0;
1034 		item_count = 0;
1035 
1036 		gop_cp = xnbp->xnb_rx_cpop;
1037 
1038 		/*
1039 		 * We walk the b_cont pointers and set up a
1040 		 * gnttab_copy_t for each sub-page chunk in each data
1041 		 * block.
1042 		 */
1043 		/* 2a */
1044 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
1045 			size_t	chunk = ml->b_wptr - ml->b_rptr;
1046 			uchar_t	*r_tmp,	*rpt_align;
1047 			size_t	r_offset;
1048 
1049 			/*
1050 			 * The hypervisor will not allow us to
1051 			 * reference a foreign page (e.g. one
1052 			 * belonging to another domain) by mfn in the
1053 			 * copy operation. If the data in this mblk is
1054 			 * on such a page we must copy the data into a
1055 			 * local page before initiating the hypervisor
1056 			 * copy operation.
1057 			 */
1058 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
1059 				mblk_t *ml_new = replace_msg(ml, chunk,
1060 				    mp_prev, ml_prev);
1061 
1062 				/* We can still use old ml, but not *ml! */
1063 				if (free == ml)
1064 					free = ml_new;
1065 				if (mp == ml)
1066 					mp = ml_new;
1067 				ml = ml_new;
1068 
1069 				xnbp->xnb_stat_rx_foreign_page++;
1070 			}
1071 
1072 			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
1073 			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
1074 			r_tmp = ml->b_rptr;
1075 
1076 			if (d_offset + chunk > PAGESIZE)
1077 				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
1078 				    "(svd: %p), ml %p, rpt_align %p, d_offset "
1079 				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
1080 				    (void *)mp, (void *)saved_mp, (void *)ml,
1081 				    (void *)rpt_align,
1082 				    d_offset, chunk, (int)PAGESIZE);
1083 
1084 			while (chunk > 0) {
1085 				size_t part_len;
1086 
1087 				if (item_count == xnbp->xnb_rx_cpop_count) {
1088 					if (!grow_cpop_area(xnbp))
1089 						goto failure;
1090 					gop_cp = &xnbp->xnb_rx_cpop[item_count];
1091 				}
1092 				/*
1093 				 * If our mblk crosses a page boundary, we need
1094 				 * to do a separate copy for each page.
1095 				 */
1096 				if (r_offset + chunk > PAGESIZE) {
1097 					part_len = PAGESIZE - r_offset;
1098 
1099 					DTRACE_PROBE3(mblk_page_crossed,
1100 					    (mblk_t *), ml, int, chunk, int,
1101 					    (int)r_offset);
1102 
1103 					xnbp->xnb_stat_rx_pagebndry_crossed++;
1104 				} else {
1105 					part_len = chunk;
1106 				}
1107 
1108 				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
1109 				    d_offset, part_len, rxreq->gref);
1110 
1111 				chunk -= part_len;
1112 
1113 				len += part_len;
1114 				d_offset += part_len;
1115 				r_tmp += part_len;
1116 				/*
1117 				 * The 2nd, 3rd ... last copies will always
1118 				 * start at r_tmp, therefore r_offset is 0.
1119 				 */
1120 				r_offset = 0;
1121 				gop_cp++;
1122 				item_count++;
1123 			}
1124 			ml_prev = ml;
1125 
1126 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
1127 			    chunk, int, len, int, item_count);
1128 		}
1129 		/* 3 */
1130 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_rx_cpop,
1131 		    item_count) != 0) {
1132 			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
1133 			DTRACE_PROBE(HV_granttableopfailed);
1134 		}
1135 
1136 		/* 4 */
1137 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
1138 		rxresp->offset = 0;
1139 
1140 		rxresp->flags = 0;
1141 
1142 		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
1143 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1144 		    (int)rxresp->status);
1145 
1146 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
1147 		if (cksum_flags != 0)
1148 			xnbp->xnb_stat_rx_cksum_deferred++;
1149 		rxresp->flags |= cksum_flags;
1150 
1151 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
1152 		rxresp->status = len;
1153 
1154 		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
1155 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1156 		    (int)rxresp->status);
1157 
1158 		for (i = 0; i < item_count; i++) {
1159 			if (xnbp->xnb_rx_cpop[i].status != 0) {
1160 				DTRACE_PROBE2(cpop_status_nonnull, int,
1161 				    (int)xnbp->xnb_rx_cpop[i].status,
1162 				    int, i);
1163 				status = NETIF_RSP_ERROR;
1164 			}
1165 		}
1166 
1167 		/* 5.2 */
1168 		if (status != NETIF_RSP_OKAY) {
1169 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
1170 			    status;
1171 			xnbp->xnb_stat_rx_rsp_notok++;
1172 		} else {
1173 			xnbp->xnb_stat_ipackets++;
1174 			xnbp->xnb_stat_rbytes += len;
1175 		}
1176 
1177 		loop++;
1178 		prod++;
1179 		mp_prev = mp;
1180 		mp = mp->b_next;
1181 	}
1182 failure:
1183 	/*
1184 	 * Did we actually do anything?
1185 	 */
1186 	if (loop == xnbp->xnb_rx_ring.req_cons) {
1187 		mutex_exit(&xnbp->xnb_rx_lock);
1188 		return (mp);
1189 	}
1190 
1191 	/*
1192 	 * Unlink the end of the 'done' list from the remainder.
1193 	 */
1194 	ASSERT(mp_prev != NULL);
1195 	mp_prev->b_next = NULL;
1196 
1197 	xnbp->xnb_rx_ring.req_cons = loop;
1198 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
1199 
1200 	/* 6 */
1201 	/* LINTED: constant in conditional context */
1202 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
1203 	if (notify) {
1204 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1205 		xnbp->xnb_stat_rx_notify_sent++;
1206 	} else {
1207 		xnbp->xnb_stat_rx_notify_deferred++;
1208 	}
1209 
1210 	if (mp != NULL)
1211 		xnbp->xnb_stat_rx_defer++;
1212 
1213 	mutex_exit(&xnbp->xnb_rx_lock);
1214 
1215 	/* Free mblk_t structs we have consumed. */
1216 	freemsgchain(free);
1217 
1218 	return (mp);
1219 }
1220 
1221 
1222 static void
1223 xnb_tx_notify_peer(xnb_t *xnbp, boolean_t force)
1224 {
1225 	boolean_t notify;
1226 
1227 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1228 
1229 	/* LINTED: constant in conditional context */
1230 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
1231 	if (notify || force) {
1232 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1233 		xnbp->xnb_stat_tx_notify_sent++;
1234 	} else {
1235 		xnbp->xnb_stat_tx_notify_deferred++;
1236 	}
1237 }
1238 
1239 static void
1240 xnb_tx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
1241 {
1242 	RING_IDX i;
1243 	netif_tx_response_t *txresp;
1244 
1245 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1246 
1247 	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
1248 
1249 	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
1250 	txresp->id = id;
1251 	txresp->status = status;
1252 
1253 	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
1254 
1255 	/*
1256 	 * Note that we don't push the change to the peer here - that
1257 	 * is the caller's responsibility.
1258 	 */
1259 }
1260 
1261 static void
1262 xnb_txbuf_recycle(xnb_txbuf_t *txp)
1263 {
1264 	xnb_t *xnbp = txp->xt_xnbp;
1265 
1266 	kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
1267 
1268 	xnbp->xnb_tx_buf_outstanding--;
1269 }
1270 
1271 static int
1272 xnb_txbuf_constructor(void *buf, void *arg, int kmflag)
1273 {
1274 	_NOTE(ARGUNUSED(kmflag));
1275 	xnb_txbuf_t *txp = buf;
1276 	xnb_t *xnbp = arg;
1277 	size_t len;
1278 	ddi_dma_cookie_t dma_cookie;
1279 	uint_t ncookies;
1280 
1281 	txp->xt_free_rtn.free_func = xnb_txbuf_recycle;
1282 	txp->xt_free_rtn.free_arg = (caddr_t)txp;
1283 	txp->xt_xnbp = xnbp;
1284 	txp->xt_next = NULL;
1285 
1286 	if (ddi_dma_alloc_handle(xnbp->xnb_devinfo, &buf_dma_attr,
1287 	    0, 0, &txp->xt_dma_handle) != DDI_SUCCESS)
1288 		goto failure;
1289 
1290 	if (ddi_dma_mem_alloc(txp->xt_dma_handle, PAGESIZE, &data_accattr,
1291 	    DDI_DMA_STREAMING, 0, 0, &txp->xt_buf, &len,
1292 	    &txp->xt_acc_handle) != DDI_SUCCESS)
1293 		goto failure_1;
1294 
1295 	if (ddi_dma_addr_bind_handle(txp->xt_dma_handle, NULL, txp->xt_buf,
1296 	    len, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, 0,
1297 	    &dma_cookie, &ncookies)
1298 	    != DDI_DMA_MAPPED)
1299 		goto failure_2;
1300 	ASSERT(ncookies == 1);
1301 
1302 	txp->xt_mfn = xnb_btop(dma_cookie.dmac_laddress);
1303 	txp->xt_buflen = dma_cookie.dmac_size;
1304 
1305 	DTRACE_PROBE(txbuf_allocated);
1306 
1307 	atomic_add_32(&xnbp->xnb_tx_buf_count, 1);
1308 	xnbp->xnb_tx_buf_outstanding++;
1309 
1310 	return (0);
1311 
1312 failure_2:
1313 	ddi_dma_mem_free(&txp->xt_acc_handle);
1314 
1315 failure_1:
1316 	ddi_dma_free_handle(&txp->xt_dma_handle);
1317 
1318 failure:
1319 
1320 	return (-1);
1321 }
1322 
1323 static void
1324 xnb_txbuf_destructor(void *buf, void *arg)
1325 {
1326 	xnb_txbuf_t *txp = buf;
1327 	xnb_t *xnbp = arg;
1328 
1329 	(void) ddi_dma_unbind_handle(txp->xt_dma_handle);
1330 	ddi_dma_mem_free(&txp->xt_acc_handle);
1331 	ddi_dma_free_handle(&txp->xt_dma_handle);
1332 
1333 	atomic_add_32(&xnbp->xnb_tx_buf_count, -1);
1334 }
1335 
1336 /*
1337  * Take packets from the peer and deliver them onward.
1338  */
1339 static mblk_t *
1340 xnb_from_peer(xnb_t *xnbp)
1341 {
1342 	RING_IDX start, end, loop;
1343 	gnttab_copy_t *cop;
1344 	xnb_txbuf_t **txpp;
1345 	netif_tx_request_t *txreq;
1346 	boolean_t work_to_do, need_notify = B_FALSE;
1347 	mblk_t *head, *tail;
1348 	int n_data_req, i;
1349 
1350 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1351 
1352 	head = tail = NULL;
1353 around:
1354 
1355 	/* LINTED: constant in conditional context */
1356 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
1357 	if (!work_to_do) {
1358 finished:
1359 		xnb_tx_notify_peer(xnbp, need_notify);
1360 
1361 		return (head);
1362 	}
1363 
1364 	start = xnbp->xnb_tx_ring.req_cons;
1365 	end = xnbp->xnb_tx_ring.sring->req_prod;
1366 
1367 	if ((end - start) > NET_TX_RING_SIZE) {
1368 		/*
1369 		 * This usually indicates that the frontend driver is
1370 		 * misbehaving, as it's not possible to have more than
1371 		 * NET_TX_RING_SIZE ring elements in play at any one
1372 		 * time.
1373 		 *
1374 		 * We reset the ring pointers to the state declared by
1375 		 * the frontend and try to carry on.
1376 		 */
1377 		cmn_err(CE_WARN, "xnb_from_peer: domain %d tried to give us %u "
1378 		    "items in the ring, resetting and trying to recover.",
1379 		    xnbp->xnb_peer, (end - start));
1380 
1381 		/* LINTED: constant in conditional context */
1382 		BACK_RING_ATTACH(&xnbp->xnb_tx_ring,
1383 		    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1384 
1385 		goto around;
1386 	}
1387 
1388 	loop = start;
1389 	cop = xnbp->xnb_tx_cop;
1390 	txpp = xnbp->xnb_tx_bufp;
1391 	n_data_req = 0;
1392 
1393 	while (loop < end) {
1394 		static const uint16_t acceptable_flags =
1395 		    NETTXF_csum_blank |
1396 		    NETTXF_data_validated |
1397 		    NETTXF_extra_info;
1398 		uint16_t unexpected_flags;
1399 
1400 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1401 
1402 		unexpected_flags = txreq->flags & ~acceptable_flags;
1403 		if (unexpected_flags != 0) {
1404 			/*
1405 			 * The peer used flag bits that we do not
1406 			 * recognize.
1407 			 */
1408 			cmn_err(CE_WARN, "xnb_from_peer: "
1409 			    "unexpected flag bits (0x%x) from peer "
1410 			    "in transmit request",
1411 			    unexpected_flags);
1412 			xnbp->xnb_stat_tx_unexpected_flags++;
1413 
1414 			/* Mark this entry as failed. */
1415 			xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
1416 			need_notify = B_TRUE;
1417 
1418 		} else if (txreq->flags & NETTXF_extra_info) {
1419 			struct netif_extra_info *erp;
1420 			boolean_t status;
1421 
1422 			loop++; /* Consume another slot in the ring. */
1423 			ASSERT(loop <= end);
1424 
1425 			erp = (struct netif_extra_info *)
1426 			    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1427 
1428 			switch (erp->type) {
1429 			case XEN_NETIF_EXTRA_TYPE_MCAST_ADD:
1430 				ASSERT(xnbp->xnb_multicast_control);
1431 				status = xnbp->xnb_flavour->xf_mcast_add(xnbp,
1432 				    &erp->u.mcast.addr);
1433 				break;
1434 			case XEN_NETIF_EXTRA_TYPE_MCAST_DEL:
1435 				ASSERT(xnbp->xnb_multicast_control);
1436 				status = xnbp->xnb_flavour->xf_mcast_del(xnbp,
1437 				    &erp->u.mcast.addr);
1438 				break;
1439 			default:
1440 				status = B_FALSE;
1441 				cmn_err(CE_WARN, "xnb_from_peer: "
1442 				    "unknown extra type %d", erp->type);
1443 				break;
1444 			}
1445 
1446 			xnb_tx_mark_complete(xnbp, txreq->id,
1447 			    status ? NETIF_RSP_OKAY : NETIF_RSP_ERROR);
1448 			need_notify = B_TRUE;
1449 
1450 		} else if ((txreq->offset > PAGESIZE) ||
1451 		    (txreq->offset + txreq->size > PAGESIZE)) {
1452 			/*
1453 			 * Peer attempted to refer to data beyond the
1454 			 * end of the granted page.
1455 			 */
1456 			cmn_err(CE_WARN, "xnb_from_peer: "
1457 			    "attempt to refer beyond the end of granted "
1458 			    "page in txreq (offset %d, size %d).",
1459 			    txreq->offset, txreq->size);
1460 			xnbp->xnb_stat_tx_overflow_page++;
1461 
1462 			/* Mark this entry as failed. */
1463 			xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
1464 			need_notify = B_TRUE;
1465 
1466 		} else {
1467 			xnb_txbuf_t *txp;
1468 
1469 			txp = kmem_cache_alloc(xnbp->xnb_tx_buf_cache,
1470 			    KM_NOSLEEP);
1471 			if (txp == NULL)
1472 				break;
1473 
1474 			txp->xt_mblk = desballoc((unsigned char *)txp->xt_buf,
1475 			    txp->xt_buflen, 0, &txp->xt_free_rtn);
1476 			if (txp->xt_mblk == NULL) {
1477 				kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
1478 				break;
1479 			}
1480 
1481 			txp->xt_idx = loop;
1482 			txp->xt_id = txreq->id;
1483 
1484 			cop->source.u.ref = txreq->gref;
1485 			cop->source.domid = xnbp->xnb_peer;
1486 			cop->source.offset = txreq->offset;
1487 
1488 			cop->dest.u.gmfn = txp->xt_mfn;
1489 			cop->dest.domid = DOMID_SELF;
1490 			cop->dest.offset = 0;
1491 
1492 			cop->len = txreq->size;
1493 			cop->flags = GNTCOPY_source_gref;
1494 			cop->status = 0;
1495 
1496 			*txpp = txp;
1497 
1498 			txpp++;
1499 			cop++;
1500 			n_data_req++;
1501 
1502 			ASSERT(n_data_req <= NET_TX_RING_SIZE);
1503 		}
1504 
1505 		loop++;
1506 	}
1507 
1508 	xnbp->xnb_tx_ring.req_cons = loop;
1509 
1510 	if (n_data_req == 0)
1511 		goto around;
1512 
1513 	if (HYPERVISOR_grant_table_op(GNTTABOP_copy,
1514 	    xnbp->xnb_tx_cop, n_data_req) != 0) {
1515 
1516 		cmn_err(CE_WARN, "xnb_from_peer: copy operation failed");
1517 
1518 		txpp = xnbp->xnb_tx_bufp;
1519 		i = n_data_req;
1520 		while (i > 0) {
1521 			kmem_cache_free(xnbp->xnb_tx_buf_cache, *txpp);
1522 			txpp++;
1523 			i--;
1524 		}
1525 
1526 		goto finished;
1527 	}
1528 
1529 	txpp = xnbp->xnb_tx_bufp;
1530 	cop = xnbp->xnb_tx_cop;
1531 	i = n_data_req;
1532 
1533 	while (i > 0) {
1534 		xnb_txbuf_t *txp = *txpp;
1535 
1536 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, txp->xt_idx);
1537 
1538 		if (cop->status != 0) {
1539 #ifdef XNB_DEBUG
1540 			cmn_err(CE_WARN, "xnb_from_peer: "
1541 			    "txpp 0x%p failed (%d)",
1542 			    (void *)*txpp, cop->status);
1543 #endif /* XNB_DEBUG */
1544 			xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_ERROR);
1545 			freemsg(txp->xt_mblk);
1546 		} else {
1547 			mblk_t *mp;
1548 
1549 			mp = txp->xt_mblk;
1550 			mp->b_rptr = mp->b_wptr = (unsigned char *)txp->xt_buf;
1551 			mp->b_wptr += txreq->size;
1552 			mp->b_next = NULL;
1553 
1554 			/*
1555 			 * If there are checksum flags, process them
1556 			 * appropriately.
1557 			 */
1558 			if ((txreq->flags &
1559 			    (NETTXF_csum_blank | NETTXF_data_validated))
1560 			    != 0) {
1561 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
1562 				    mp, txreq->flags);
1563 				xnbp->xnb_stat_tx_cksum_no_need++;
1564 
1565 				txp->xt_mblk = mp;
1566 			}
1567 
1568 			if (head == NULL) {
1569 				ASSERT(tail == NULL);
1570 				head = mp;
1571 			} else {
1572 				ASSERT(tail != NULL);
1573 				tail->b_next = mp;
1574 			}
1575 			tail = mp;
1576 
1577 			xnbp->xnb_stat_opackets++;
1578 			xnbp->xnb_stat_obytes += txreq->size;
1579 
1580 			xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_OKAY);
1581 		}
1582 
1583 		txpp++;
1584 		cop++;
1585 		i--;
1586 	}
1587 
1588 	goto around;
1589 	/* NOTREACHED */
1590 }
1591 
1592 static uint_t
1593 xnb_intr(caddr_t arg)
1594 {
1595 	xnb_t *xnbp = (xnb_t *)arg;
1596 	mblk_t *mp;
1597 
1598 	xnbp->xnb_stat_intr++;
1599 
1600 	mutex_enter(&xnbp->xnb_tx_lock);
1601 
1602 	ASSERT(xnbp->xnb_connected);
1603 
1604 	mp = xnb_from_peer(xnbp);
1605 
1606 	mutex_exit(&xnbp->xnb_tx_lock);
1607 
1608 	if (!xnbp->xnb_hotplugged) {
1609 		xnbp->xnb_stat_tx_too_early++;
1610 		goto fail;
1611 	}
1612 	if (mp == NULL) {
1613 		xnbp->xnb_stat_spurious_intr++;
1614 		goto fail;
1615 	}
1616 
1617 	xnbp->xnb_flavour->xf_from_peer(xnbp, mp);
1618 
1619 	return (DDI_INTR_CLAIMED);
1620 
1621 fail:
1622 	freemsgchain(mp);
1623 	return (DDI_INTR_CLAIMED);
1624 }
1625 
1626 /*
1627  * Read our configuration from xenstore.
1628  */
1629 boolean_t
1630 xnb_read_xs_config(xnb_t *xnbp)
1631 {
1632 	char *xsname;
1633 	char mac[ETHERADDRL * 3];
1634 
1635 	xsname = xvdi_get_xsname(xnbp->xnb_devinfo);
1636 
1637 	if (xenbus_scanf(XBT_NULL, xsname,
1638 	    "mac", "%s", mac) != 0) {
1639 		cmn_err(CE_WARN, "xnb_attach: "
1640 		    "cannot read mac address from %s",
1641 		    xsname);
1642 		return (B_FALSE);
1643 	}
1644 
1645 	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
1646 		cmn_err(CE_WARN,
1647 		    "xnb_attach: cannot parse mac address %s",
1648 		    mac);
1649 		return (B_FALSE);
1650 	}
1651 
1652 	return (B_TRUE);
1653 }
1654 
1655 /*
1656  * Read the configuration of the peer from xenstore.
1657  */
1658 boolean_t
1659 xnb_read_oe_config(xnb_t *xnbp)
1660 {
1661 	char *oename;
1662 	int i;
1663 
1664 	oename = xvdi_get_oename(xnbp->xnb_devinfo);
1665 
1666 	if (xenbus_gather(XBT_NULL, oename,
1667 	    "event-channel", "%u", &xnbp->xnb_fe_evtchn,
1668 	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
1669 	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
1670 	    NULL) != 0) {
1671 		cmn_err(CE_WARN, "xnb_read_oe_config: "
1672 		    "cannot read other-end details from %s",
1673 		    oename);
1674 		return (B_FALSE);
1675 	}
1676 
1677 	/*
1678 	 * Check whether our peer requests receive side hypervisor
1679 	 * copy.
1680 	 */
1681 	if (xenbus_scanf(XBT_NULL, oename,
1682 	    "request-rx-copy", "%d", &i) != 0)
1683 		i = 0;
1684 	if (i != 0)
1685 		xnbp->xnb_rx_hv_copy = B_TRUE;
1686 
1687 	/*
1688 	 * Check whether our peer requests multicast_control.
1689 	 */
1690 	if (xenbus_scanf(XBT_NULL, oename,
1691 	    "request-multicast-control", "%d", &i) != 0)
1692 		i = 0;
1693 	if (i != 0)
1694 		xnbp->xnb_multicast_control = B_TRUE;
1695 
1696 	/*
1697 	 * The Linux backend driver here checks to see if the peer has
1698 	 * set 'feature-no-csum-offload'. This is used to indicate
1699 	 * that the guest cannot handle receiving packets without a
1700 	 * valid checksum. We don't check here, because packets passed
1701 	 * to the peer _always_ have a valid checksum.
1702 	 *
1703 	 * There are three cases:
1704 	 *
1705 	 * - the NIC is dedicated: packets from the wire should always
1706 	 *   have a valid checksum. If the hardware validates the
1707 	 *   checksum then the relevant bit will be set in the packet
1708 	 *   attributes and we will inform the peer. It can choose to
1709 	 *   ignore the hardware verification.
1710 	 *
1711 	 * - the NIC is shared (VNIC) and a packet originates from the
1712 	 *   wire: this is the same as the case above - the packets
1713 	 *   will have a valid checksum.
1714 	 *
1715 	 * - the NIC is shared (VNIC) and a packet originates from the
1716 	 *   host: the MAC layer ensures that all such packets have a
1717 	 *   valid checksum by calculating one if the stack did not.
1718 	 */
1719 
1720 	return (B_TRUE);
1721 }
1722 
1723 void
1724 xnb_start_connect(xnb_t *xnbp)
1725 {
1726 	dev_info_t  *dip = xnbp->xnb_devinfo;
1727 
1728 	if (!xnb_connect_rings(dip)) {
1729 		cmn_err(CE_WARN, "xnb_start_connect: "
1730 		    "cannot connect rings");
1731 		goto failed;
1732 	}
1733 
1734 	if (!xnbp->xnb_flavour->xf_start_connect(xnbp)) {
1735 		cmn_err(CE_WARN, "xnb_start_connect: "
1736 		    "flavour failed to connect");
1737 		goto failed;
1738 	}
1739 
1740 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1741 	return;
1742 
1743 failed:
1744 	xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1745 	xnb_disconnect_rings(dip);
1746 	(void) xvdi_switch_state(dip, XBT_NULL,
1747 	    XenbusStateClosed);
1748 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1749 }
1750 
1751 static boolean_t
1752 xnb_connect_rings(dev_info_t *dip)
1753 {
1754 	xnb_t *xnbp = ddi_get_driver_private(dip);
1755 	struct gnttab_map_grant_ref map_op;
1756 
1757 	/*
1758 	 * Cannot attempt to connect the rings if already connected.
1759 	 */
1760 	ASSERT(!xnbp->xnb_connected);
1761 
1762 	/*
1763 	 * 1. allocate a vaddr for the tx page, one for the rx page.
1764 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1765 	 *    into the allocated vaddr (one for tx, one for rx).
1766 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1767 	 *    bound to this domain.
1768 	 * 4. associate the event channel with an interrupt.
1769 	 * 5. enable the interrupt.
1770 	 */
1771 
1772 	/* 1.tx */
1773 	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1774 	    0, 0, 0, 0, VM_SLEEP);
1775 	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
1776 
1777 	/* 2.tx */
1778 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
1779 	map_op.flags = GNTMAP_host_map;
1780 	map_op.ref = xnbp->xnb_tx_ring_ref;
1781 	map_op.dom = xnbp->xnb_peer;
1782 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr, NULL);
1783 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1784 	    map_op.status != 0) {
1785 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1786 		goto fail;
1787 	}
1788 	xnbp->xnb_tx_ring_handle = map_op.handle;
1789 
1790 	/* LINTED: constant in conditional context */
1791 	BACK_RING_INIT(&xnbp->xnb_tx_ring,
1792 	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1793 
1794 	/* 1.rx */
1795 	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1796 	    0, 0, 0, 0, VM_SLEEP);
1797 	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
1798 
1799 	/* 2.rx */
1800 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
1801 	map_op.flags = GNTMAP_host_map;
1802 	map_op.ref = xnbp->xnb_rx_ring_ref;
1803 	map_op.dom = xnbp->xnb_peer;
1804 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr, NULL);
1805 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1806 	    map_op.status != 0) {
1807 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1808 		goto fail;
1809 	}
1810 	xnbp->xnb_rx_ring_handle = map_op.handle;
1811 
1812 	/* LINTED: constant in conditional context */
1813 	BACK_RING_INIT(&xnbp->xnb_rx_ring,
1814 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
1815 
1816 	/* 3 */
1817 	if (xvdi_bind_evtchn(dip, xnbp->xnb_fe_evtchn) != DDI_SUCCESS) {
1818 		cmn_err(CE_WARN, "xnb_connect_rings: "
1819 		    "cannot bind event channel %d", xnbp->xnb_evtchn);
1820 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1821 		goto fail;
1822 	}
1823 	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
1824 
1825 	/*
1826 	 * It would be good to set the state to XenbusStateConnected
1827 	 * here as well, but then what if ddi_add_intr() failed?
1828 	 * Changing the state in the store will be noticed by the peer
1829 	 * and cannot be "taken back".
1830 	 */
1831 	mutex_enter(&xnbp->xnb_tx_lock);
1832 	mutex_enter(&xnbp->xnb_rx_lock);
1833 
1834 	xnbp->xnb_connected = B_TRUE;
1835 
1836 	mutex_exit(&xnbp->xnb_rx_lock);
1837 	mutex_exit(&xnbp->xnb_tx_lock);
1838 
1839 	/* 4, 5 */
1840 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1841 	    != DDI_SUCCESS) {
1842 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1843 		goto fail;
1844 	}
1845 	xnbp->xnb_irq = B_TRUE;
1846 
1847 	return (B_TRUE);
1848 
1849 fail:
1850 	mutex_enter(&xnbp->xnb_tx_lock);
1851 	mutex_enter(&xnbp->xnb_rx_lock);
1852 
1853 	xnbp->xnb_connected = B_FALSE;
1854 
1855 	mutex_exit(&xnbp->xnb_rx_lock);
1856 	mutex_exit(&xnbp->xnb_tx_lock);
1857 
1858 	return (B_FALSE);
1859 }
1860 
1861 static void
1862 xnb_disconnect_rings(dev_info_t *dip)
1863 {
1864 	xnb_t *xnbp = ddi_get_driver_private(dip);
1865 
1866 	if (xnbp->xnb_irq) {
1867 		ddi_remove_intr(dip, 0, NULL);
1868 		xnbp->xnb_irq = B_FALSE;
1869 	}
1870 
1871 	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
1872 		xvdi_free_evtchn(dip);
1873 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1874 	}
1875 
1876 	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
1877 		struct gnttab_unmap_grant_ref unmap_op;
1878 
1879 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1880 		    xnbp->xnb_rx_ring_addr;
1881 		unmap_op.dev_bus_addr = 0;
1882 		unmap_op.handle = xnbp->xnb_rx_ring_handle;
1883 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1884 		    &unmap_op, 1) != 0)
1885 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1886 			    "cannot unmap rx-ring page (%d)",
1887 			    unmap_op.status);
1888 
1889 		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
1890 	}
1891 
1892 	if (xnbp->xnb_rx_ring_addr != NULL) {
1893 		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1894 		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
1895 		xnbp->xnb_rx_ring_addr = NULL;
1896 	}
1897 
1898 	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
1899 		struct gnttab_unmap_grant_ref unmap_op;
1900 
1901 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1902 		    xnbp->xnb_tx_ring_addr;
1903 		unmap_op.dev_bus_addr = 0;
1904 		unmap_op.handle = xnbp->xnb_tx_ring_handle;
1905 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1906 		    &unmap_op, 1) != 0)
1907 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1908 			    "cannot unmap tx-ring page (%d)",
1909 			    unmap_op.status);
1910 
1911 		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
1912 	}
1913 
1914 	if (xnbp->xnb_tx_ring_addr != NULL) {
1915 		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1916 		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
1917 		xnbp->xnb_tx_ring_addr = NULL;
1918 	}
1919 }
1920 
1921 static void
1922 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1923     void *arg, void *impl_data)
1924 {
1925 	_NOTE(ARGUNUSED(id, arg));
1926 	xnb_t *xnbp = ddi_get_driver_private(dip);
1927 	XenbusState new_state = *(XenbusState *)impl_data;
1928 
1929 	ASSERT(xnbp != NULL);
1930 
1931 	switch (new_state) {
1932 	case XenbusStateConnected:
1933 		/* spurious state change */
1934 		if (xnbp->xnb_connected)
1935 			return;
1936 
1937 		if (!xnb_read_oe_config(xnbp) ||
1938 		    !xnbp->xnb_flavour->xf_peer_connected(xnbp)) {
1939 			cmn_err(CE_WARN, "xnb_oe_state_change: "
1940 			    "read otherend config error");
1941 			(void) xvdi_switch_state(dip, XBT_NULL,
1942 			    XenbusStateClosed);
1943 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1944 
1945 			break;
1946 		}
1947 
1948 
1949 		mutex_enter(&xnbp->xnb_state_lock);
1950 		xnbp->xnb_fe_status = XNB_STATE_READY;
1951 		if (xnbp->xnb_be_status == XNB_STATE_READY)
1952 			xnb_start_connect(xnbp);
1953 		mutex_exit(&xnbp->xnb_state_lock);
1954 
1955 		/*
1956 		 * Now that we've attempted to connect it's reasonable
1957 		 * to allow an attempt to detach.
1958 		 */
1959 		xnbp->xnb_detachable = B_TRUE;
1960 
1961 		break;
1962 
1963 	case XenbusStateClosing:
1964 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1965 
1966 		break;
1967 
1968 	case XenbusStateClosed:
1969 		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1970 
1971 		mutex_enter(&xnbp->xnb_tx_lock);
1972 		mutex_enter(&xnbp->xnb_rx_lock);
1973 
1974 		xnb_disconnect_rings(dip);
1975 		xnbp->xnb_connected = B_FALSE;
1976 
1977 		mutex_exit(&xnbp->xnb_rx_lock);
1978 		mutex_exit(&xnbp->xnb_tx_lock);
1979 
1980 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1981 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1982 		/*
1983 		 * In all likelihood this is already set (in the above
1984 		 * case), but if the peer never attempted to connect
1985 		 * and the domain is destroyed we get here without
1986 		 * having been through the case above, so we set it to
1987 		 * be sure.
1988 		 */
1989 		xnbp->xnb_detachable = B_TRUE;
1990 
1991 		break;
1992 
1993 	default:
1994 		break;
1995 	}
1996 }
1997 
1998 static void
1999 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
2000     void *arg, void *impl_data)
2001 {
2002 	_NOTE(ARGUNUSED(id, arg));
2003 	xnb_t *xnbp = ddi_get_driver_private(dip);
2004 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
2005 
2006 	ASSERT(xnbp != NULL);
2007 
2008 	switch (state) {
2009 	case Connected:
2010 		/* spurious hotplug event */
2011 		if (xnbp->xnb_hotplugged)
2012 			break;
2013 
2014 		if (!xnb_read_xs_config(xnbp))
2015 			break;
2016 
2017 		if (!xnbp->xnb_flavour->xf_hotplug_connected(xnbp))
2018 			break;
2019 
2020 		mutex_enter(&xnbp->xnb_tx_lock);
2021 		mutex_enter(&xnbp->xnb_rx_lock);
2022 
2023 		xnbp->xnb_hotplugged = B_TRUE;
2024 
2025 		mutex_exit(&xnbp->xnb_rx_lock);
2026 		mutex_exit(&xnbp->xnb_tx_lock);
2027 
2028 		mutex_enter(&xnbp->xnb_state_lock);
2029 		xnbp->xnb_be_status = XNB_STATE_READY;
2030 		if (xnbp->xnb_fe_status == XNB_STATE_READY)
2031 			xnb_start_connect(xnbp);
2032 		mutex_exit(&xnbp->xnb_state_lock);
2033 
2034 		break;
2035 
2036 	default:
2037 		break;
2038 	}
2039 }
2040 
2041 static struct modldrv modldrv = {
2042 	&mod_miscops, "xnb",
2043 };
2044 
2045 static struct modlinkage modlinkage = {
2046 	MODREV_1, &modldrv, NULL
2047 };
2048 
2049 int
2050 _init(void)
2051 {
2052 	int i;
2053 
2054 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
2055 
2056 	i = mod_install(&modlinkage);
2057 	if (i != DDI_SUCCESS)
2058 		mutex_destroy(&xnb_alloc_page_lock);
2059 
2060 	return (i);
2061 }
2062 
2063 int
2064 _info(struct modinfo *modinfop)
2065 {
2066 	return (mod_info(&modlinkage, modinfop));
2067 }
2068 
2069 int
2070 _fini(void)
2071 {
2072 	int i;
2073 
2074 	i = mod_remove(&modlinkage);
2075 	if (i == DDI_SUCCESS)
2076 		mutex_destroy(&xnb_alloc_page_lock);
2077 
2078 	return (i);
2079 }
2080