xref: /illumos-gate/usr/src/uts/common/xen/io/xnb.c (revision fcdb3229a31dd4ff700c69238814e326aad49098)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2018 Joyent, Inc.
26  */
27 
28 #ifdef DEBUG
29 #define	XNB_DEBUG 1
30 #endif /* DEBUG */
31 
32 #include "xnb.h"
33 
34 #include <sys/sunddi.h>
35 #include <sys/sunndi.h>
36 #include <sys/modctl.h>
37 #include <sys/conf.h>
38 #include <sys/mac.h>
39 #include <sys/mac_impl.h> /* For mac_fix_cksum(). */
40 #include <sys/dlpi.h>
41 #include <sys/strsubr.h>
42 #include <sys/strsun.h>
43 #include <sys/types.h>
44 #include <sys/pattr.h>
45 #include <vm/seg_kmem.h>
46 #include <vm/hat_i86.h>
47 #include <xen/sys/xenbus_impl.h>
48 #include <xen/sys/xendev.h>
49 #include <sys/balloon_impl.h>
50 #include <sys/evtchn_impl.h>
51 #include <sys/gnttab.h>
52 #include <vm/vm_dep.h>
53 #include <sys/note.h>
54 #include <sys/gld.h>
55 #include <inet/ip.h>
56 #include <inet/ip_impl.h>
57 
58 /*
59  * The terms "transmit" and "receive" are used in alignment with domU,
60  * which means that packets originating from the peer domU are "transmitted"
61  * to other parts of the system and packets are "received" from them.
62  */
63 
64 /*
65  * Should we allow guests to manipulate multicast group membership?
66  */
67 static boolean_t	xnb_multicast_control = B_TRUE;
68 
69 static boolean_t	xnb_connect_rings(dev_info_t *);
70 static void		xnb_disconnect_rings(dev_info_t *);
71 static void		xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
72     void *, void *);
73 static void		xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
74     void *, void *);
75 
76 static int	xnb_txbuf_constructor(void *, void *, int);
77 static void	xnb_txbuf_destructor(void *, void *);
78 static void	xnb_tx_notify_peer(xnb_t *, boolean_t);
79 static void	xnb_tx_mark_complete(xnb_t *, RING_IDX, int16_t);
80 
81 mblk_t		*xnb_to_peer(xnb_t *, mblk_t *);
82 mblk_t		*xnb_copy_to_peer(xnb_t *, mblk_t *);
83 
84 static void		setup_gop(xnb_t *, gnttab_copy_t *, uchar_t *,
85     size_t, size_t, size_t, grant_ref_t);
86 static boolean_t	is_foreign(void *);
87 
88 #define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
89 #define	INVALID_GRANT_REF	((grant_ref_t)-1)
90 
91 static kmutex_t	xnb_alloc_page_lock;
92 
93 /*
94  * On a 32 bit PAE system physical and machine addresses are larger
95  * than 32 bits.  ddi_btop() on such systems takes an unsigned long
96  * argument, and so addresses above 4G are truncated before ddi_btop()
97  * gets to see them.  To avoid this, code the shift operation here.
98  */
99 #define	xnb_btop(addr)	((addr) >> PAGESHIFT)
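/*
 * Editorial sketch (not part of the original source): for a machine
 * address above 4G, e.g. 0x123456000, xnb_btop() yields frame number
 * 0x123456 (PAGESHIFT is 12 on x86), whereas squeezing the same
 * address through a 32-bit unsigned long first truncates it to
 * 0x23456000 and produces the wrong frame.  The address used here is
 * purely illustrative.
 */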
100 
101 /* DMA attributes for transmit and receive data */
102 static ddi_dma_attr_t buf_dma_attr = {
103 	DMA_ATTR_V0,		/* version of this structure */
104 	0,			/* lowest usable address */
105 	0xffffffffffffffffULL,	/* highest usable address */
106 	0x7fffffff,		/* maximum DMAable byte count */
107 	MMU_PAGESIZE,		/* alignment in bytes */
108 	0x7ff,			/* bitmap of burst sizes */
109 	1,			/* minimum transfer */
110 	0xffffffffU,		/* maximum transfer */
111 	0xffffffffffffffffULL,	/* maximum segment length */
112 	1,			/* maximum number of segments */
113 	1,			/* granularity */
114 	0,			/* flags (reserved) */
115 };
116 
117 /* DMA access attributes for data: NOT to be byte swapped. */
118 static ddi_device_acc_attr_t data_accattr = {
119 	DDI_DEVICE_ATTR_V0,
120 	DDI_NEVERSWAP_ACC,
121 	DDI_STRICTORDER_ACC
122 };
123 
124 /*
125  * Statistics.
126  */
127 static const char * const aux_statistics[] = {
128 	"rx_cksum_deferred",
129 	"tx_cksum_no_need",
130 	"rx_rsp_notok",
131 	"tx_notify_deferred",
132 	"tx_notify_sent",
133 	"rx_notify_deferred",
134 	"rx_notify_sent",
135 	"tx_too_early",
136 	"rx_too_early",
137 	"rx_allocb_failed",
138 	"tx_allocb_failed",
139 	"rx_foreign_page",
140 	"mac_full",
141 	"spurious_intr",
142 	"allocation_success",
143 	"allocation_failure",
144 	"small_allocation_success",
145 	"small_allocation_failure",
146 	"other_allocation_failure",
147 	"rx_pageboundary_crossed",
148 	"rx_cpoparea_grown",
149 	"csum_hardware",
150 	"csum_software",
151 	"tx_overflow_page",
152 	"tx_unexpected_flags",
153 };
154 
155 static int
156 xnb_ks_aux_update(kstat_t *ksp, int flag)
157 {
158 	xnb_t *xnbp;
159 	kstat_named_t *knp;
160 
161 	if (flag != KSTAT_READ)
162 		return (EACCES);
163 
164 	xnbp = ksp->ks_private;
165 	knp = ksp->ks_data;
166 
167 	/*
168 	 * Assignment order should match that of the names in
169 	 * aux_statistics.
170 	 */
171 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_deferred;
172 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_no_need;
173 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_rsp_notok;
174 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
175 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
176 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
177 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
178 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
179 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
180 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
181 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
182 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_foreign_page;
183 	(knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
184 	(knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
185 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
186 	(knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
187 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
188 	(knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
189 	(knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
190 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_pagebndry_crossed;
191 	(knp++)->value.ui64 = xnbp->xnb_stat_rx_cpoparea_grown;
192 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
193 	(knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
194 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_overflow_page;
195 	(knp++)->value.ui64 = xnbp->xnb_stat_tx_unexpected_flags;
196 
197 	return (0);
198 }
199 
200 static boolean_t
201 xnb_ks_init(xnb_t *xnbp)
202 {
203 	int nstat = sizeof (aux_statistics) /
204 	    sizeof (aux_statistics[0]);
205 	const char * const *cp = aux_statistics;
206 	kstat_named_t *knp;
207 
208 	/*
209 	 * Create and initialise kstats.
210 	 */
211 	xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
212 	    ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
213 	    KSTAT_TYPE_NAMED, nstat, 0);
214 	if (xnbp->xnb_kstat_aux == NULL)
215 		return (B_FALSE);
216 
217 	xnbp->xnb_kstat_aux->ks_private = xnbp;
218 	xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;
219 
220 	knp = xnbp->xnb_kstat_aux->ks_data;
221 	while (nstat > 0) {
222 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
223 
224 		knp++;
225 		cp++;
226 		nstat--;
227 	}
228 
229 	kstat_install(xnbp->xnb_kstat_aux);
230 
231 	return (B_TRUE);
232 }
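/*
 * Editorial note (not part of the original source): once installed,
 * these counters can be inspected from userland with kstat(1M), the
 * kstat module being the driver name passed to kstat_create() above.
 * Assuming the default tools, something like
 *
 *	kstat -p -n aux_statistics
 *
 * would print each name from aux_statistics with its current 64-bit
 * value; the exact invocation is an illustrative assumption.
 */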
233 
234 static void
235 xnb_ks_free(xnb_t *xnbp)
236 {
237 	kstat_delete(xnbp->xnb_kstat_aux);
238 }
239 
240 /*
241  * Calculate and insert the transport checksum for an arbitrary packet.
242  */
243 static mblk_t *
244 xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
245 {
246 	_NOTE(ARGUNUSED(xnbp));
247 
248 	/*
249 	 * XXPV dme: shouldn't rely on mac_fix_cksum(), not least
250 	 * because it doesn't cover all of the interesting cases :-(
251 	 */
252 	mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);
253 	mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL);
254 	return (mp);
255 }
256 
257 mblk_t *
258 xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
259 {
260 	struct ether_header *ehp;
261 	uint16_t sap;
262 	uint32_t offset;
263 	ipha_t *ipha;
264 
265 	ASSERT(mp->b_next == NULL);
266 
267 	/*
268 	 * Check that the packet is contained in a single mblk.  In
269 	 * the "from peer" path this is true today, but may change
270 	 * when scatter gather support is added.  In the "to peer"
271 	 * path we cannot be sure, but in most cases it will be true
272 	 * (in the xnbo case the packet has come from a MAC device
273 	 * which is unlikely to split packets).
274 	 */
275 	if (mp->b_cont != NULL)
276 		goto software;
277 
278 	/*
279 	 * If the MAC has no hardware capability don't do any further
280 	 * checking.
281 	 */
282 	if (capab == 0)
283 		goto software;
284 
285 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
286 	ehp = (struct ether_header *)mp->b_rptr;
287 
288 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
289 		struct ether_vlan_header *evhp;
290 
291 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
292 		evhp = (struct ether_vlan_header *)mp->b_rptr;
293 		sap = ntohs(evhp->ether_type);
294 		offset = sizeof (struct ether_vlan_header);
295 	} else {
296 		sap = ntohs(ehp->ether_type);
297 		offset = sizeof (struct ether_header);
298 	}
299 
300 	/*
301 	 * We only attempt to do IPv4 packets in hardware.
302 	 */
303 	if (sap != ETHERTYPE_IP)
304 		goto software;
305 
306 	/*
307 	 * We know that this is an IPv4 packet.
308 	 */
309 	ipha = (ipha_t *)(mp->b_rptr + offset);
310 
311 	switch (ipha->ipha_protocol) {
312 	case IPPROTO_TCP:
313 	case IPPROTO_UDP: {
314 		uint32_t start, length, stuff, cksum;
315 		uint16_t *stuffp;
316 
317 		/*
318 		 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
319 		 * can use full IPv4 and partial checksum offload.
320 		 */
321 		if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
322 			break;
323 
324 		start = IP_SIMPLE_HDR_LENGTH;
325 		length = ntohs(ipha->ipha_length);
326 		if (ipha->ipha_protocol == IPPROTO_TCP) {
327 			stuff = start + TCP_CHECKSUM_OFFSET;
328 			cksum = IP_TCP_CSUM_COMP;
329 		} else {
330 			stuff = start + UDP_CHECKSUM_OFFSET;
331 			cksum = IP_UDP_CSUM_COMP;
332 		}
333 		stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);
334 
335 		if (capab & HCKSUM_INET_FULL_V4) {
336 			/*
337 			 * Some devices require that the checksum
338 			 * field of the packet is zero for full
339 			 * offload.
340 			 */
341 			*stuffp = 0;
342 
343 			mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);
344 
345 			xnbp->xnb_stat_csum_hardware++;
346 
347 			return (mp);
348 		}
349 
350 		if (capab & HCKSUM_INET_PARTIAL) {
351 			if (*stuffp == 0) {
352 				ipaddr_t src, dst;
353 
354 				/*
355 				 * Older Solaris guests don't insert
356 				 * the pseudo-header checksum, so we
357 				 * calculate it here.
358 				 */
359 				src = ipha->ipha_src;
360 				dst = ipha->ipha_dst;
361 
362 				cksum += (dst >> 16) + (dst & 0xFFFF);
363 				cksum += (src >> 16) + (src & 0xFFFF);
364 				cksum += length - IP_SIMPLE_HDR_LENGTH;
365 
366 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
367 				cksum = (cksum >> 16) + (cksum & 0xFFFF);
368 
369 				ASSERT(cksum <= 0xFFFF);
370 
371 				*stuffp = (uint16_t)(cksum ? cksum : ~cksum);
372 			}
373 
374 			mac_hcksum_set(mp, start, stuff, length, 0,
375 			    HCK_PARTIALCKSUM);
376 
377 			xnbp->xnb_stat_csum_hardware++;
378 
379 			return (mp);
380 		}
381 
382 		/* NOTREACHED */
383 		break;
384 	}
385 
386 	default:
387 		/* Use software. */
388 		break;
389 	}
390 
391 software:
392 	/*
393 	 * We are not able to use any offload so do the whole thing in
394 	 * software.
395 	 */
396 	xnbp->xnb_stat_csum_software++;
397 
398 	return (xnb_software_csum(xnbp, mp));
399 }
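/*
 * Editorial worked example for the HCKSUM_INET_PARTIAL branch above
 * (values illustrative, shown in network byte order for readability):
 * for a TCP packet with ipha_src 192.168.1.1, ipha_dst 192.168.1.2
 * and ipha_length 40, the pseudo-header sum is
 *
 *	0xC0A8 + 0x0101 + 0xC0A8 + 0x0102	(addresses)
 *	+ IP_TCP_CSUM_COMP			(assumed IPPROTO_TCP, 6)
 *	+ (40 - IP_SIMPLE_HDR_LENGTH) = 20	(TCP length)
 *	= 0x1836D
 *
 * which folds to 0x836E; that value is written to *stuffp before
 * HCK_PARTIALCKSUM is set.
 */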
400 
401 int
402 xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
403 {
404 	xnb_t *xnbp;
405 	char *xsname;
406 	char cachename[32];
407 
408 	xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);
409 
410 	xnbp->xnb_flavour = flavour;
411 	xnbp->xnb_flavour_data = flavour_data;
412 	xnbp->xnb_devinfo = dip;
413 	xnbp->xnb_evtchn = INVALID_EVTCHN;
414 	xnbp->xnb_irq = B_FALSE;
415 	xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
416 	xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
417 	xnbp->xnb_connected = B_FALSE;
418 	xnbp->xnb_hotplugged = B_FALSE;
419 	xnbp->xnb_detachable = B_FALSE;
420 	xnbp->xnb_peer = xvdi_get_oeid(dip);
421 	xnbp->xnb_be_status = XNB_STATE_INIT;
422 	xnbp->xnb_fe_status = XNB_STATE_INIT;
423 
424 	xnbp->xnb_tx_buf_count = 0;
425 
426 	xnbp->xnb_rx_hv_copy = B_FALSE;
427 	xnbp->xnb_multicast_control = B_FALSE;
428 
429 	xnbp->xnb_rx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
430 	ASSERT(xnbp->xnb_rx_va != NULL);
431 
432 	if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
433 	    != DDI_SUCCESS)
434 		goto failure;
435 
436 	/* Allocated on demand, when/if we enter xnb_copy_to_peer(). */
437 	xnbp->xnb_rx_cpop = NULL;
438 	xnbp->xnb_rx_cpop_count = 0;
439 
440 	mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
441 	    xnbp->xnb_icookie);
442 	mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
443 	    xnbp->xnb_icookie);
444 	mutex_init(&xnbp->xnb_state_lock, NULL, MUTEX_DRIVER,
445 	    xnbp->xnb_icookie);
446 
447 	/* Set driver private pointer now. */
448 	ddi_set_driver_private(dip, xnbp);
449 
450 	(void) sprintf(cachename, "xnb_tx_buf_cache_%d", ddi_get_instance(dip));
451 	xnbp->xnb_tx_buf_cache = kmem_cache_create(cachename,
452 	    sizeof (xnb_txbuf_t), 0,
453 	    xnb_txbuf_constructor, xnb_txbuf_destructor,
454 	    NULL, xnbp, NULL, 0);
455 	if (xnbp->xnb_tx_buf_cache == NULL)
456 		goto failure_0;
457 
458 	if (!xnb_ks_init(xnbp))
459 		goto failure_1;
460 
461 	/*
462 	 * Receive notification of changes in the state of the
463 	 * driver in the guest domain.
464 	 */
465 	if (xvdi_add_event_handler(dip, XS_OE_STATE, xnb_oe_state_change,
466 	    NULL) != DDI_SUCCESS)
467 		goto failure_2;
468 
469 	/*
470 	 * Receive notification of hotplug events.
471 	 */
472 	if (xvdi_add_event_handler(dip, XS_HP_STATE, xnb_hp_state_change,
473 	    NULL) != DDI_SUCCESS)
474 		goto failure_2;
475 
476 	xsname = xvdi_get_xsname(dip);
477 
478 	if (xenbus_printf(XBT_NULL, xsname,
479 	    "feature-multicast-control", "%d",
480 	    xnb_multicast_control ? 1 : 0) != 0)
481 		goto failure_3;
482 
483 	if (xenbus_printf(XBT_NULL, xsname,
484 	    "feature-rx-copy", "%d",  1) != 0)
485 		goto failure_3;
486 	/*
487 	 * Linux domUs seem to depend on "feature-rx-flip" being 0
488 	 * in addition to "feature-rx-copy" being 1. It seems strange
489 	 * to use four possible states to describe a binary decision,
490 	 * but we might as well play nice.
491 	 */
492 	if (xenbus_printf(XBT_NULL, xsname,
493 	    "feature-rx-flip", "%d", 0) != 0)
494 		goto failure_3;
495 
496 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
497 	(void) xvdi_post_event(dip, XEN_HP_ADD);
498 
499 	return (DDI_SUCCESS);
500 
501 failure_3:
502 	xvdi_remove_event_handler(dip, NULL);
503 
504 failure_2:
505 	xnb_ks_free(xnbp);
506 
507 failure_1:
508 	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
509 
510 failure_0:
511 	mutex_destroy(&xnbp->xnb_state_lock);
512 	mutex_destroy(&xnbp->xnb_rx_lock);
513 	mutex_destroy(&xnbp->xnb_tx_lock);
514 
515 failure:
516 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
517 	kmem_free(xnbp, sizeof (*xnbp));
518 	return (DDI_FAILURE);
519 }
520 
521 void
522 xnb_detach(dev_info_t *dip)
523 {
524 	xnb_t *xnbp = ddi_get_driver_private(dip);
525 
526 	ASSERT(xnbp != NULL);
527 	ASSERT(!xnbp->xnb_connected);
528 	ASSERT(xnbp->xnb_tx_buf_count == 0);
529 
530 	xnb_disconnect_rings(dip);
531 
532 	xvdi_remove_event_handler(dip, NULL);
533 
534 	xnb_ks_free(xnbp);
535 
536 	kmem_cache_destroy(xnbp->xnb_tx_buf_cache);
537 
538 	ddi_set_driver_private(dip, NULL);
539 
540 	mutex_destroy(&xnbp->xnb_state_lock);
541 	mutex_destroy(&xnbp->xnb_rx_lock);
542 	mutex_destroy(&xnbp->xnb_tx_lock);
543 
544 	if (xnbp->xnb_rx_cpop_count > 0)
545 		kmem_free(xnbp->xnb_rx_cpop, sizeof (xnbp->xnb_rx_cpop[0])
546 		    * xnbp->xnb_rx_cpop_count);
547 
548 	ASSERT(xnbp->xnb_rx_va != NULL);
549 	vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
550 
551 	kmem_free(xnbp, sizeof (*xnbp));
552 }
553 
554 /*
555  * Allocate a page from the hypervisor to be flipped to the peer.
556  *
557  * Try to get pages in batches to reduce the overhead of calls into
558  * the balloon driver.
559  */
560 static mfn_t
561 xnb_alloc_page(xnb_t *xnbp)
562 {
563 #define	WARNING_RATE_LIMIT 100
564 #define	BATCH_SIZE 256
565 	static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
566 	static int nth = BATCH_SIZE;
567 	mfn_t mfn;
568 
569 	mutex_enter(&xnb_alloc_page_lock);
570 	if (nth == BATCH_SIZE) {
571 		if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
572 			xnbp->xnb_stat_allocation_failure++;
573 			mutex_exit(&xnb_alloc_page_lock);
574 
575 			/*
576 			 * Try for a single page in low memory situations.
577 			 */
578 			if (balloon_alloc_pages(1, &mfn) != 1) {
579 				if ((xnbp->xnb_stat_small_allocation_failure++
580 				    % WARNING_RATE_LIMIT) == 0)
581 					cmn_err(CE_WARN, "xnb_alloc_page: "
582 					    "Cannot allocate memory to "
583 					    "transfer packets to peer.");
584 				return (0);
585 			} else {
586 				xnbp->xnb_stat_small_allocation_success++;
587 				return (mfn);
588 			}
589 		}
590 
591 		nth = 0;
592 		xnbp->xnb_stat_allocation_success++;
593 	}
594 
595 	mfn = mfns[nth++];
596 	mutex_exit(&xnb_alloc_page_lock);
597 
598 	ASSERT(mfn != 0);
599 
600 	return (mfn);
601 #undef BATCH_SIZE
602 #undef WARNING_RATE_LIMIT
603 }
604 
605 /*
606  * Free a page back to the hypervisor.
607  *
608  * This happens only in the error path, so batching is not worth the
609  * complication.
610  */
611 static void
612 xnb_free_page(xnb_t *xnbp, mfn_t mfn)
613 {
614 	_NOTE(ARGUNUSED(xnbp));
615 	int r;
616 	pfn_t pfn;
617 
618 	pfn = xen_assign_pfn(mfn);
619 	pfnzero(pfn, 0, PAGESIZE);
620 	xen_release_pfn(pfn);
621 
622 	if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
623 		cmn_err(CE_WARN, "free_page: cannot decrease memory "
624 		    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
625 		    r, mfn);
626 	}
627 }
628 
629 /*
630  * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but using
631  * local variables. Used in both xnb_to_peer() and xnb_copy_to_peer().
632  */
633 #define	XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
634 	((((_r)->sring->req_prod - loop) <		\
635 		(RING_SIZE(_r) - (loop - prod))) ?	\
636 	    ((_r)->sring->req_prod - loop) :		\
637 	    (RING_SIZE(_r) - (loop - prod)))
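/*
 * Editorial worked example (illustrative values only): with a
 * RING_SIZE() of 256, loop == 10, prod == 8 and req_prod == 20, the
 * macro compares (20 - 10) == 10 against (256 - (10 - 8)) == 254 and
 * yields 10, i.e. the number of requests posted by the peer that we
 * have not yet consumed, capped by the response slots still owed.
 */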
638 
639 /*
640  * Pass packets to the peer using page flipping.
641  */
642 mblk_t *
643 xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
644 {
645 	mblk_t *free = mp, *prev = NULL;
646 	size_t len;
647 	gnttab_transfer_t *gop;
648 	boolean_t notify;
649 	RING_IDX loop, prod, end;
650 
651 	/*
652 	 * For each packet the sequence of operations is:
653 	 *
654 	 * 1. get a new page from the hypervisor.
655 	 * 2. get a request slot from the ring.
656 	 * 3. copy the data into the new page.
657 	 * 4. transfer the page to the peer.
658 	 * 5. update the request slot.
659 	 * 6. kick the peer.
660 	 * 7. free mp.
661 	 *
662 	 * In order to reduce the number of hypercalls, we prepare
663 	 * several packets for the peer and perform a single hypercall
664 	 * to transfer them.
665 	 */
666 
667 	len = 0;
668 	mutex_enter(&xnbp->xnb_rx_lock);
669 
670 	/*
671 	 * If we are not connected to the peer or have not yet
672 	 * finished hotplug it is too early to pass packets to the
673 	 * peer.
674 	 */
675 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
676 		mutex_exit(&xnbp->xnb_rx_lock);
677 		DTRACE_PROBE(flip_rx_too_early);
678 		xnbp->xnb_stat_rx_too_early++;
679 		return (mp);
680 	}
681 
682 	loop = xnbp->xnb_rx_ring.req_cons;
683 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
684 	gop = xnbp->xnb_rx_top;
685 
686 	while ((mp != NULL) &&
687 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
688 
689 		mfn_t mfn;
690 		pfn_t pfn;
691 		netif_rx_request_t *rxreq;
692 		netif_rx_response_t *rxresp;
693 		char *valoop;
694 		mblk_t *ml;
695 		uint16_t cksum_flags;
696 
697 		/* 1 */
698 		if ((mfn = xnb_alloc_page(xnbp)) == 0) {
699 			xnbp->xnb_stat_rx_defer++;
700 			break;
701 		}
702 
703 		/* 2 */
704 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
705 
706 #ifdef XNB_DEBUG
707 		if (!(rxreq->id < NET_RX_RING_SIZE))
708 			cmn_err(CE_PANIC, "xnb_to_peer: "
709 			    "id %d out of range in request 0x%p",
710 			    rxreq->id, (void *)rxreq);
711 #endif /* XNB_DEBUG */
712 
713 		/* Assign a pfn and map the new page at the allocated va. */
714 		pfn = xen_assign_pfn(mfn);
715 		hat_devload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
716 		    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);
717 
718 		/* 3 */
719 		len = 0;
720 		valoop = xnbp->xnb_rx_va;
721 		for (ml = mp; ml != NULL; ml = ml->b_cont) {
722 			size_t chunk = ml->b_wptr - ml->b_rptr;
723 
724 			bcopy(ml->b_rptr, valoop, chunk);
725 			valoop += chunk;
726 			len += chunk;
727 		}
728 
729 		ASSERT(len < PAGESIZE);
730 
731 		/* Release the pfn. */
732 		hat_unload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
733 		    HAT_UNLOAD_UNMAP);
734 		xen_release_pfn(pfn);
735 
736 		/* 4 */
737 		gop->mfn = mfn;
738 		gop->domid = xnbp->xnb_peer;
739 		gop->ref = rxreq->gref;
740 
741 		/* 5.1 */
742 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
743 		rxresp->offset = 0;
744 		rxresp->flags = 0;
745 
746 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
747 		if (cksum_flags != 0)
748 			xnbp->xnb_stat_rx_cksum_deferred++;
749 		rxresp->flags |= cksum_flags;
750 
751 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
752 		rxresp->status = len;
753 
754 		loop++;
755 		prod++;
756 		gop++;
757 		prev = mp;
758 		mp = mp->b_next;
759 	}
760 
761 	/*
762 	 * Did we actually do anything?
763 	 */
764 	if (loop == xnbp->xnb_rx_ring.req_cons) {
765 		mutex_exit(&xnbp->xnb_rx_lock);
766 		return (mp);
767 	}
768 
769 	end = loop;
770 
771 	/*
772 	 * Unlink the end of the 'done' list from the remainder.
773 	 */
774 	ASSERT(prev != NULL);
775 	prev->b_next = NULL;
776 
777 	if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_rx_top,
778 	    loop - xnbp->xnb_rx_ring.req_cons) != 0) {
779 		cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
780 	}
781 
782 	loop = xnbp->xnb_rx_ring.req_cons;
783 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
784 	gop = xnbp->xnb_rx_top;
785 
786 	while (loop < end) {
787 		int16_t status = NETIF_RSP_OKAY;
788 
789 		if (gop->status != 0) {
790 			status = NETIF_RSP_ERROR;
791 
792 			/*
793 			 * If the status is anything other than
794 			 * GNTST_bad_page then we don't own the page
795 			 * any more, so don't try to give it back.
796 			 */
797 			if (gop->status != GNTST_bad_page)
798 				gop->mfn = 0;
799 		} else {
800 			/* The page is no longer ours. */
801 			gop->mfn = 0;
802 		}
803 
804 		if (gop->mfn != 0)
805 			/*
806 			 * Give back the page, as we won't be using
807 			 * it.
808 			 */
809 			xnb_free_page(xnbp, gop->mfn);
810 		else
811 			/*
812 			 * We gave away a page, update our accounting
813 			 * now.
814 			 */
815 			balloon_drv_subtracted(1);
816 
817 		/* 5.2 */
818 		if (status != NETIF_RSP_OKAY) {
819 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
820 			    status;
821 		} else {
822 			xnbp->xnb_stat_ipackets++;
823 			xnbp->xnb_stat_rbytes += len;
824 		}
825 
826 		loop++;
827 		prod++;
828 		gop++;
829 	}
830 
831 	xnbp->xnb_rx_ring.req_cons = loop;
832 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
833 
834 	/* 6 */
835 	/* LINTED: constant in conditional context */
836 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
837 	if (notify) {
838 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
839 		xnbp->xnb_stat_rx_notify_sent++;
840 	} else {
841 		xnbp->xnb_stat_rx_notify_deferred++;
842 	}
843 
844 	if (mp != NULL)
845 		xnbp->xnb_stat_rx_defer++;
846 
847 	mutex_exit(&xnbp->xnb_rx_lock);
848 
849 	/* Free mblk_t's that we consumed. */
850 	freemsgchain(free);
851 
852 	return (mp);
853 }
854 
855 /* Helper functions for xnb_copy_to_peer(). */
856 
857 /*
858  * Grow the array of copy operation descriptors.
859  */
860 static boolean_t
861 grow_cpop_area(xnb_t *xnbp)
862 {
863 	size_t count;
864 	gnttab_copy_t *new;
865 
866 	ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
867 
868 	count = xnbp->xnb_rx_cpop_count + CPOP_DEFCNT;
869 
870 	if ((new = kmem_alloc(sizeof (new[0]) * count, KM_NOSLEEP)) == NULL) {
871 		xnbp->xnb_stat_other_allocation_failure++;
872 		return (B_FALSE);
873 	}
874 
875 	bcopy(xnbp->xnb_rx_cpop, new,
876 	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
877 
878 	kmem_free(xnbp->xnb_rx_cpop,
879 	    sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);
880 
881 	xnbp->xnb_rx_cpop = new;
882 	xnbp->xnb_rx_cpop_count = count;
883 
884 	xnbp->xnb_stat_rx_cpoparea_grown++;
885 
886 	return (B_TRUE);
887 }
888 
889 /*
890  * Check whether an address is on a page that's foreign to this domain.
891  */
892 static boolean_t
893 is_foreign(void *addr)
894 {
895 	pfn_t pfn = hat_getpfnum(kas.a_hat, addr);
896 
897 	return ((pfn & PFN_IS_FOREIGN_MFN) == PFN_IS_FOREIGN_MFN);
898 }
899 
900 /*
901  * Insert a newly allocated mblk into a chain, replacing the old one.
902  */
903 static mblk_t *
904 replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
905 {
906 	uint32_t	start, stuff, end, value, flags;
907 	mblk_t		*new_mp;
908 
909 	new_mp = copyb(mp);
910 	if (new_mp == NULL) {
911 		cmn_err(CE_PANIC, "replace_msg: cannot alloc new message "
912 		    "for %p, len %lu", (void *) mp, len);
913 	}
914 
915 	mac_hcksum_get(mp, &start, &stuff, &end, &value, &flags);
916 	mac_hcksum_set(new_mp, start, stuff, end, value, flags);
917 
918 	new_mp->b_next = mp->b_next;
919 	new_mp->b_prev = mp->b_prev;
920 	new_mp->b_cont = mp->b_cont;
921 
922 	/* Make sure we only overwrite pointers to the mblk being replaced. */
923 	if (mp_prev != NULL && mp_prev->b_next == mp)
924 		mp_prev->b_next = new_mp;
925 
926 	if (ml_prev != NULL && ml_prev->b_cont == mp)
927 		ml_prev->b_cont = new_mp;
928 
929 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
930 	freemsg(mp);
931 
932 	return (new_mp);
933 }
934 
935 /*
936  * Set all the fields in a gnttab_copy_t.
937  */
938 static void
939 setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
940     size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
941 {
942 	ASSERT(xnbp != NULL && gp != NULL);
943 
944 	gp->source.offset = s_off;
945 	gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
946 	gp->source.domid = DOMID_SELF;
947 
948 	gp->len = (uint16_t)len;
949 	gp->flags = GNTCOPY_dest_gref;
950 	gp->status = 0;
951 
952 	gp->dest.u.ref = d_ref;
953 	gp->dest.offset = d_off;
954 	gp->dest.domid = xnbp->xnb_peer;
955 }
956 
957 /*
958  * Pass packets to the peer using hypervisor copy operations.
959  */
960 mblk_t *
961 xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
962 {
963 	mblk_t		*free = mp, *mp_prev = NULL, *saved_mp = mp;
964 	mblk_t		*ml, *ml_prev;
965 	boolean_t	notify;
966 	RING_IDX	loop, prod;
967 	int		i;
968 
969 	/*
970 	 * If the peer does not pre-post buffers for received packets,
971 	 * use page flipping to pass packets to it.
972 	 */
973 	if (!xnbp->xnb_rx_hv_copy)
974 		return (xnb_to_peer(xnbp, mp));
975 
976 	/*
977 	 * For each packet the sequence of operations is:
978 	 *
979 	 *  1. get a request slot from the ring.
980 	 *  2. set up data for hypercall (see NOTE below)
981 	 *  3. have the hypervisor copy the data
982 	 *  4. update the request slot.
983 	 *  5. kick the peer.
984 	 *
985 	 * NOTE ad 2.
986 	 *  In order to reduce the number of hypercalls, we prepare
987 	 *  several mblks (mp->b_cont != NULL) for the peer and
988 	 *  perform a single hypercall to transfer them.  We also have
989 	 *  to set up a separate copy operation for every page.
990 	 *
991 	 * If we have more than one packet (mp->b_next != NULL), we do
992 	 * this whole dance repeatedly.
993 	 */
994 
995 	mutex_enter(&xnbp->xnb_rx_lock);
996 
997 	if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
998 		mutex_exit(&xnbp->xnb_rx_lock);
999 		DTRACE_PROBE(copy_rx_too_early);
1000 		xnbp->xnb_stat_rx_too_early++;
1001 		return (mp);
1002 	}
1003 
1004 	loop = xnbp->xnb_rx_ring.req_cons;
1005 	prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
1006 
1007 	while ((mp != NULL) &&
1008 	    XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
1009 		netif_rx_request_t	*rxreq;
1010 		size_t			d_offset, len;
1011 		int			item_count;
1012 		gnttab_copy_t		*gop_cp;
1013 		netif_rx_response_t	*rxresp;
1014 		uint16_t		cksum_flags;
1015 		int16_t			status = NETIF_RSP_OKAY;
1016 
1017 		/* 1 */
1018 		rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);
1019 
1020 #ifdef XNB_DEBUG
1021 		if (!(rxreq->id < NET_RX_RING_SIZE))
1022 			cmn_err(CE_PANIC, "xnb_copy_to_peer: "
1023 			    "id %d out of range in request 0x%p",
1024 			    rxreq->id, (void *)rxreq);
1025 #endif /* XNB_DEBUG */
1026 
1027 		/* 2 */
1028 		d_offset = 0;
1029 		len = 0;
1030 		item_count = 0;
1031 
1032 		gop_cp = xnbp->xnb_rx_cpop;
1033 
1034 		/*
1035 		 * We walk the b_cont pointers and set up a
1036 		 * gnttab_copy_t for each sub-page chunk in each data
1037 		 * block.
1038 		 */
1039 		/* 2a */
1040 		for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
1041 			size_t	chunk = ml->b_wptr - ml->b_rptr;
1042 			uchar_t	*r_tmp,	*rpt_align;
1043 			size_t	r_offset;
1044 
1045 			/*
1046 			 * The hypervisor will not allow us to
1047 			 * reference a foreign page (e.g. one
1048 			 * belonging to another domain) by mfn in the
1049 			 * copy operation. If the data in this mblk is
1050 			 * on such a page we must copy the data into a
1051 			 * local page before initiating the hypervisor
1052 			 * copy operation.
1053 			 */
1054 			if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
1055 				mblk_t *ml_new = replace_msg(ml, chunk,
1056 				    mp_prev, ml_prev);
1057 
1058 				/* We can still use old ml, but not *ml! */
1059 				if (free == ml)
1060 					free = ml_new;
1061 				if (mp == ml)
1062 					mp = ml_new;
1063 				ml = ml_new;
1064 
1065 				xnbp->xnb_stat_rx_foreign_page++;
1066 			}
1067 
1068 			rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
1069 			r_offset = (uint16_t)(ml->b_rptr - rpt_align);
1070 			r_tmp = ml->b_rptr;
1071 
1072 			if (d_offset + chunk > PAGESIZE)
1073 				cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
1074 				    "(svd: %p), ml %p,rpt_alg. %p, d_offset "
1075 				    "(%lu) + chunk (%lu) > PAGESIZE %d!",
1076 				    (void *)mp, (void *)saved_mp, (void *)ml,
1077 				    (void *)rpt_align,
1078 				    d_offset, chunk, (int)PAGESIZE);
1079 
1080 			while (chunk > 0) {
1081 				size_t part_len;
1082 
1083 				if (item_count == xnbp->xnb_rx_cpop_count) {
1084 					if (!grow_cpop_area(xnbp))
1085 						goto failure;
1086 					gop_cp = &xnbp->xnb_rx_cpop[item_count];
1087 				}
1088 				/*
1089 				 * If our mblk crosses a page boundary, we need
1090 				 * to do a separate copy for each page.
1091 				 */
1092 				if (r_offset + chunk > PAGESIZE) {
1093 					part_len = PAGESIZE - r_offset;
1094 
1095 					DTRACE_PROBE3(mblk_page_crossed,
1096 					    (mblk_t *), ml, int, chunk, int,
1097 					    (int)r_offset);
1098 
1099 					xnbp->xnb_stat_rx_pagebndry_crossed++;
1100 				} else {
1101 					part_len = chunk;
1102 				}
1103 
1104 				setup_gop(xnbp, gop_cp, r_tmp, r_offset,
1105 				    d_offset, part_len, rxreq->gref);
1106 
1107 				chunk -= part_len;
1108 
1109 				len += part_len;
1110 				d_offset += part_len;
1111 				r_tmp += part_len;
1112 				/*
1113 				 * The 2nd, 3rd ... last copies will always
1114 				 * start at r_tmp, therefore r_offset is 0.
1115 				 */
1116 				r_offset = 0;
1117 				gop_cp++;
1118 				item_count++;
1119 			}
1120 			ml_prev = ml;
1121 
1122 			DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
1123 			    chunk, int, len, int, item_count);
1124 		}
1125 		/* 3 */
1126 		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_rx_cpop,
1127 		    item_count) != 0) {
1128 			cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
1129 			DTRACE_PROBE(HV_granttableopfailed);
1130 		}
1131 
1132 		/* 4 */
1133 		rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
1134 		rxresp->offset = 0;
1135 
1136 		rxresp->flags = 0;
1137 
1138 		DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
1139 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1140 		    (int)rxresp->status);
1141 
1142 		cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
1143 		if (cksum_flags != 0)
1144 			xnbp->xnb_stat_rx_cksum_deferred++;
1145 		rxresp->flags |= cksum_flags;
1146 
1147 		rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
1148 		rxresp->status = len;
1149 
1150 		DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
1151 		    (int)rxresp->offset, int, (int)rxresp->flags, int,
1152 		    (int)rxresp->status);
1153 
1154 		for (i = 0; i < item_count; i++) {
1155 			if (xnbp->xnb_rx_cpop[i].status != 0) {
1156 				DTRACE_PROBE2(cpop_status_nonnull, int,
1157 				    (int)xnbp->xnb_rx_cpop[i].status,
1158 				    int, i);
1159 				status = NETIF_RSP_ERROR;
1160 			}
1161 		}
1162 
1163 		/* 5.2 */
1164 		if (status != NETIF_RSP_OKAY) {
1165 			RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
1166 			    status;
1167 			xnbp->xnb_stat_rx_rsp_notok++;
1168 		} else {
1169 			xnbp->xnb_stat_ipackets++;
1170 			xnbp->xnb_stat_rbytes += len;
1171 		}
1172 
1173 		loop++;
1174 		prod++;
1175 		mp_prev = mp;
1176 		mp = mp->b_next;
1177 	}
1178 failure:
1179 	/*
1180 	 * Did we actually do anything?
1181 	 */
1182 	if (loop == xnbp->xnb_rx_ring.req_cons) {
1183 		mutex_exit(&xnbp->xnb_rx_lock);
1184 		return (mp);
1185 	}
1186 
1187 	/*
1188 	 * Unlink the end of the 'done' list from the remainder.
1189 	 */
1190 	ASSERT(mp_prev != NULL);
1191 	mp_prev->b_next = NULL;
1192 
1193 	xnbp->xnb_rx_ring.req_cons = loop;
1194 	xnbp->xnb_rx_ring.rsp_prod_pvt = prod;
1195 
1196 	/* 6 */
1197 	/* LINTED: constant in conditional context */
1198 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
1199 	if (notify) {
1200 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1201 		xnbp->xnb_stat_rx_notify_sent++;
1202 	} else {
1203 		xnbp->xnb_stat_rx_notify_deferred++;
1204 	}
1205 
1206 	if (mp != NULL)
1207 		xnbp->xnb_stat_rx_defer++;
1208 
1209 	mutex_exit(&xnbp->xnb_rx_lock);
1210 
1211 	/* Free mblk_t structs we have consumed. */
1212 	freemsgchain(free);
1213 
1214 	return (mp);
1215 }
1216 
1217 
1218 static void
1219 xnb_tx_notify_peer(xnb_t *xnbp, boolean_t force)
1220 {
1221 	boolean_t notify;
1222 
1223 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1224 
1225 	/* LINTED: constant in conditional context */
1226 	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
1227 	if (notify || force) {
1228 		ec_notify_via_evtchn(xnbp->xnb_evtchn);
1229 		xnbp->xnb_stat_tx_notify_sent++;
1230 	} else {
1231 		xnbp->xnb_stat_tx_notify_deferred++;
1232 	}
1233 }
1234 
1235 static void
1236 xnb_tx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
1237 {
1238 	RING_IDX i;
1239 	netif_tx_response_t *txresp;
1240 
1241 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1242 
1243 	i = xnbp->xnb_tx_ring.rsp_prod_pvt;
1244 
1245 	txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
1246 	txresp->id = id;
1247 	txresp->status = status;
1248 
1249 	xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;
1250 
1251 	/*
1252 	 * Note that we don't push the change to the peer here - that
1253 	 * is the caller's responsibility.
1254 	 */
1255 }
1256 
1257 static void
1258 xnb_txbuf_recycle(xnb_txbuf_t *txp)
1259 {
1260 	xnb_t *xnbp = txp->xt_xnbp;
1261 
1262 	kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
1263 
1264 	xnbp->xnb_tx_buf_outstanding--;
1265 }
1266 
1267 static int
1268 xnb_txbuf_constructor(void *buf, void *arg, int kmflag)
1269 {
1270 	_NOTE(ARGUNUSED(kmflag));
1271 	xnb_txbuf_t *txp = buf;
1272 	xnb_t *xnbp = arg;
1273 	size_t len;
1274 	ddi_dma_cookie_t dma_cookie;
1275 	uint_t ncookies;
1276 
1277 	txp->xt_free_rtn.free_func = xnb_txbuf_recycle;
1278 	txp->xt_free_rtn.free_arg = (caddr_t)txp;
1279 	txp->xt_xnbp = xnbp;
1280 	txp->xt_next = NULL;
1281 
1282 	if (ddi_dma_alloc_handle(xnbp->xnb_devinfo, &buf_dma_attr,
1283 	    0, 0, &txp->xt_dma_handle) != DDI_SUCCESS)
1284 		goto failure;
1285 
1286 	if (ddi_dma_mem_alloc(txp->xt_dma_handle, PAGESIZE, &data_accattr,
1287 	    DDI_DMA_STREAMING, 0, 0, &txp->xt_buf, &len,
1288 	    &txp->xt_acc_handle) != DDI_SUCCESS)
1289 		goto failure_1;
1290 
1291 	if (ddi_dma_addr_bind_handle(txp->xt_dma_handle, NULL, txp->xt_buf,
1292 	    len, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, 0,
1293 	    &dma_cookie, &ncookies)
1294 	    != DDI_DMA_MAPPED)
1295 		goto failure_2;
1296 	ASSERT(ncookies == 1);
1297 
1298 	txp->xt_mfn = xnb_btop(dma_cookie.dmac_laddress);
1299 	txp->xt_buflen = dma_cookie.dmac_size;
1300 
1301 	DTRACE_PROBE(txbuf_allocated);
1302 
1303 	atomic_inc_32(&xnbp->xnb_tx_buf_count);
1304 	xnbp->xnb_tx_buf_outstanding++;
1305 
1306 	return (0);
1307 
1308 failure_2:
1309 	ddi_dma_mem_free(&txp->xt_acc_handle);
1310 
1311 failure_1:
1312 	ddi_dma_free_handle(&txp->xt_dma_handle);
1313 
1314 failure:
1315 
1316 	return (-1);
1317 }
1318 
1319 static void
1320 xnb_txbuf_destructor(void *buf, void *arg)
1321 {
1322 	xnb_txbuf_t *txp = buf;
1323 	xnb_t *xnbp = arg;
1324 
1325 	(void) ddi_dma_unbind_handle(txp->xt_dma_handle);
1326 	ddi_dma_mem_free(&txp->xt_acc_handle);
1327 	ddi_dma_free_handle(&txp->xt_dma_handle);
1328 
1329 	atomic_dec_32(&xnbp->xnb_tx_buf_count);
1330 }
1331 
1332 /*
1333  * Take packets from the peer and deliver them onward.
1334  */
1335 static mblk_t *
1336 xnb_from_peer(xnb_t *xnbp)
1337 {
1338 	RING_IDX start, end, loop;
1339 	gnttab_copy_t *cop;
1340 	xnb_txbuf_t **txpp;
1341 	netif_tx_request_t *txreq;
1342 	boolean_t work_to_do, need_notify = B_FALSE;
1343 	mblk_t *head, *tail;
1344 	int n_data_req, i;
1345 
1346 	ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));
1347 
1348 	head = tail = NULL;
1349 around:
1350 
1351 	/* LINTED: constant in conditional context */
1352 	RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
1353 	if (!work_to_do) {
1354 finished:
1355 		xnb_tx_notify_peer(xnbp, need_notify);
1356 
1357 		return (head);
1358 	}
1359 
1360 	start = xnbp->xnb_tx_ring.req_cons;
1361 	end = xnbp->xnb_tx_ring.sring->req_prod;
1362 
1363 	if ((end - start) > NET_TX_RING_SIZE) {
1364 		/*
1365 		 * This usually indicates that the frontend driver is
1366 		 * misbehaving, as it's not possible to have more than
1367 		 * NET_TX_RING_SIZE ring elements in play at any one
1368 		 * time.
1369 		 *
1370 		 * We reset the ring pointers to the state declared by
1371 		 * the frontend and try to carry on.
1372 		 */
1373 		cmn_err(CE_WARN, "xnb_from_peer: domain %d tried to give us %u "
1374 		    "items in the ring, resetting and trying to recover.",
1375 		    xnbp->xnb_peer, (end - start));
1376 
1377 		/* LINTED: constant in conditional context */
1378 		BACK_RING_ATTACH(&xnbp->xnb_tx_ring,
1379 		    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1380 
1381 		goto around;
1382 	}
1383 
1384 	loop = start;
1385 	cop = xnbp->xnb_tx_cop;
1386 	txpp = xnbp->xnb_tx_bufp;
1387 	n_data_req = 0;
1388 
1389 	while (loop < end) {
1390 		static const uint16_t acceptable_flags =
1391 		    NETTXF_csum_blank |
1392 		    NETTXF_data_validated |
1393 		    NETTXF_extra_info;
1394 		uint16_t unexpected_flags;
1395 
1396 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1397 
1398 		unexpected_flags = txreq->flags & ~acceptable_flags;
1399 		if (unexpected_flags != 0) {
1400 			/*
1401 			 * The peer used flag bits that we do not
1402 			 * recognize.
1403 			 */
1404 			cmn_err(CE_WARN, "xnb_from_peer: "
1405 			    "unexpected flag bits (0x%x) from peer "
1406 			    "in transmit request",
1407 			    unexpected_flags);
1408 			xnbp->xnb_stat_tx_unexpected_flags++;
1409 
1410 			/* Mark this entry as failed. */
1411 			xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
1412 			need_notify = B_TRUE;
1413 
1414 		} else if (txreq->flags & NETTXF_extra_info) {
1415 			struct netif_extra_info *erp;
1416 			boolean_t status;
1417 
1418 			loop++; /* Consume another slot in the ring. */
1419 			ASSERT(loop <= end);
1420 
1421 			erp = (struct netif_extra_info *)
1422 			    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);
1423 
1424 			switch (erp->type) {
1425 			case XEN_NETIF_EXTRA_TYPE_MCAST_ADD:
1426 				ASSERT(xnbp->xnb_multicast_control);
1427 				status = xnbp->xnb_flavour->xf_mcast_add(xnbp,
1428 				    &erp->u.mcast.addr);
1429 				break;
1430 			case XEN_NETIF_EXTRA_TYPE_MCAST_DEL:
1431 				ASSERT(xnbp->xnb_multicast_control);
1432 				status = xnbp->xnb_flavour->xf_mcast_del(xnbp,
1433 				    &erp->u.mcast.addr);
1434 				break;
1435 			default:
1436 				status = B_FALSE;
1437 				cmn_err(CE_WARN, "xnb_from_peer: "
1438 				    "unknown extra type %d", erp->type);
1439 				break;
1440 			}
1441 
1442 			xnb_tx_mark_complete(xnbp, txreq->id,
1443 			    status ? NETIF_RSP_OKAY : NETIF_RSP_ERROR);
1444 			need_notify = B_TRUE;
1445 
1446 		} else if ((txreq->offset > PAGESIZE) ||
1447 		    (txreq->offset + txreq->size > PAGESIZE)) {
1448 			/*
1449 			 * Peer attempted to refer to data beyond the
1450 			 * end of the granted page.
1451 			 */
1452 			cmn_err(CE_WARN, "xnb_from_peer: "
1453 			    "attempt to refer beyond the end of granted "
1454 			    "page in txreq (offset %d, size %d).",
1455 			    txreq->offset, txreq->size);
1456 			xnbp->xnb_stat_tx_overflow_page++;
1457 
1458 			/* Mark this entry as failed. */
1459 			xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
1460 			need_notify = B_TRUE;
1461 
1462 		} else {
1463 			xnb_txbuf_t *txp;
1464 
1465 			txp = kmem_cache_alloc(xnbp->xnb_tx_buf_cache,
1466 			    KM_NOSLEEP);
1467 			if (txp == NULL)
1468 				break;
1469 
1470 			txp->xt_mblk = desballoc((unsigned char *)txp->xt_buf,
1471 			    txp->xt_buflen, 0, &txp->xt_free_rtn);
1472 			if (txp->xt_mblk == NULL) {
1473 				kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
1474 				break;
1475 			}
1476 
1477 			txp->xt_idx = loop;
1478 			txp->xt_id = txreq->id;
1479 
1480 			cop->source.u.ref = txreq->gref;
1481 			cop->source.domid = xnbp->xnb_peer;
1482 			cop->source.offset = txreq->offset;
1483 
1484 			cop->dest.u.gmfn = txp->xt_mfn;
1485 			cop->dest.domid = DOMID_SELF;
1486 			cop->dest.offset = 0;
1487 
1488 			cop->len = txreq->size;
1489 			cop->flags = GNTCOPY_source_gref;
1490 			cop->status = 0;
1491 
1492 			*txpp = txp;
1493 
1494 			txpp++;
1495 			cop++;
1496 			n_data_req++;
1497 
1498 			ASSERT(n_data_req <= NET_TX_RING_SIZE);
1499 		}
1500 
1501 		loop++;
1502 	}
1503 
1504 	xnbp->xnb_tx_ring.req_cons = loop;
1505 
1506 	if (n_data_req == 0)
1507 		goto around;
1508 
1509 	if (HYPERVISOR_grant_table_op(GNTTABOP_copy,
1510 	    xnbp->xnb_tx_cop, n_data_req) != 0) {
1511 
1512 		cmn_err(CE_WARN, "xnb_from_peer: copy operation failed");
1513 
1514 		txpp = xnbp->xnb_tx_bufp;
1515 		i = n_data_req;
1516 		while (i > 0) {
1517 			kmem_cache_free(xnbp->xnb_tx_buf_cache, *txpp);
1518 			txpp++;
1519 			i--;
1520 		}
1521 
1522 		goto finished;
1523 	}
1524 
1525 	txpp = xnbp->xnb_tx_bufp;
1526 	cop = xnbp->xnb_tx_cop;
1527 	i = n_data_req;
1528 
1529 	while (i > 0) {
1530 		xnb_txbuf_t *txp = *txpp;
1531 
1532 		txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, txp->xt_idx);
1533 
1534 		if (cop->status != 0) {
1535 #ifdef XNB_DEBUG
1536 			cmn_err(CE_WARN, "xnb_from_peer: "
1537 			    "txpp 0x%p failed (%d)",
1538 			    (void *)*txpp, cop->status);
1539 #endif /* XNB_DEBUG */
1540 			xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_ERROR);
1541 			freemsg(txp->xt_mblk);
1542 		} else {
1543 			mblk_t *mp;
1544 
1545 			mp = txp->xt_mblk;
1546 			mp->b_rptr = mp->b_wptr = (unsigned char *)txp->xt_buf;
1547 			mp->b_wptr += txreq->size;
1548 			mp->b_next = NULL;
1549 
1550 			/*
1551 			 * If there are checksum flags, process them
1552 			 * appropriately.
1553 			 */
1554 			if ((txreq->flags &
1555 			    (NETTXF_csum_blank | NETTXF_data_validated))
1556 			    != 0) {
1557 				mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
1558 				    mp, txreq->flags);
1559 				xnbp->xnb_stat_tx_cksum_no_need++;
1560 
1561 				txp->xt_mblk = mp;
1562 			}
1563 
1564 			if (head == NULL) {
1565 				ASSERT(tail == NULL);
1566 				head = mp;
1567 			} else {
1568 				ASSERT(tail != NULL);
1569 				tail->b_next = mp;
1570 			}
1571 			tail = mp;
1572 
1573 			xnbp->xnb_stat_opackets++;
1574 			xnbp->xnb_stat_obytes += txreq->size;
1575 
1576 			xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_OKAY);
1577 		}
1578 
1579 		txpp++;
1580 		cop++;
1581 		i--;
1582 	}
1583 
1584 	goto around;
1585 	/* NOTREACHED */
1586 }
1587 
1588 static uint_t
1589 xnb_intr(caddr_t arg)
1590 {
1591 	xnb_t *xnbp = (xnb_t *)arg;
1592 	mblk_t *mp;
1593 
1594 	xnbp->xnb_stat_intr++;
1595 
1596 	mutex_enter(&xnbp->xnb_tx_lock);
1597 
1598 	ASSERT(xnbp->xnb_connected);
1599 
1600 	mp = xnb_from_peer(xnbp);
1601 
1602 	mutex_exit(&xnbp->xnb_tx_lock);
1603 
1604 	if (!xnbp->xnb_hotplugged) {
1605 		xnbp->xnb_stat_tx_too_early++;
1606 		goto fail;
1607 	}
1608 	if (mp == NULL) {
1609 		xnbp->xnb_stat_spurious_intr++;
1610 		goto fail;
1611 	}
1612 
1613 	xnbp->xnb_flavour->xf_from_peer(xnbp, mp);
1614 
1615 	return (DDI_INTR_CLAIMED);
1616 
1617 fail:
1618 	freemsgchain(mp);
1619 	return (DDI_INTR_CLAIMED);
1620 }
1621 
1622 /*
1623  * Read our configuration from xenstore.
1624  */
1625 boolean_t
1626 xnb_read_xs_config(xnb_t *xnbp)
1627 {
1628 	char *xsname;
1629 	char mac[ETHERADDRL * 3];
1630 
1631 	xsname = xvdi_get_xsname(xnbp->xnb_devinfo);
1632 
1633 	if (xenbus_scanf(XBT_NULL, xsname,
1634 	    "mac", "%s", mac) != 0) {
1635 		cmn_err(CE_WARN, "xnb_attach: "
1636 		    "cannot read mac address from %s",
1637 		    xsname);
1638 		return (B_FALSE);
1639 	}
1640 
1641 	if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
1642 		cmn_err(CE_WARN,
1643 		    "xnb_attach: cannot parse mac address %s",
1644 		    mac);
1645 		return (B_FALSE);
1646 	}
1647 
1648 	return (B_TRUE);
1649 }
1650 
1651 /*
1652  * Read the configuration of the peer from xenstore.
1653  */
1654 boolean_t
1655 xnb_read_oe_config(xnb_t *xnbp)
1656 {
1657 	char *oename;
1658 	int i;
1659 
1660 	oename = xvdi_get_oename(xnbp->xnb_devinfo);
1661 
1662 	if (xenbus_gather(XBT_NULL, oename,
1663 	    "event-channel", "%u", &xnbp->xnb_fe_evtchn,
1664 	    "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
1665 	    "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
1666 	    NULL) != 0) {
1667 		cmn_err(CE_WARN, "xnb_read_oe_config: "
1668 		    "cannot read other-end details from %s",
1669 		    oename);
1670 		return (B_FALSE);
1671 	}
1672 
1673 	/*
1674 	 * Check whether our peer requests receive side hypervisor
1675 	 * copy.
1676 	 */
1677 	if (xenbus_scanf(XBT_NULL, oename,
1678 	    "request-rx-copy", "%d", &i) != 0)
1679 		i = 0;
1680 	if (i != 0)
1681 		xnbp->xnb_rx_hv_copy = B_TRUE;
1682 
1683 	/*
1684 	 * Check whether our peer requests multicast_control.
1685 	 */
1686 	if (xenbus_scanf(XBT_NULL, oename,
1687 	    "request-multicast-control", "%d", &i) != 0)
1688 		i = 0;
1689 	if (i != 0)
1690 		xnbp->xnb_multicast_control = B_TRUE;
1691 
1692 	/*
1693 	 * The Linux backend driver here checks to see if the peer has
1694 	 * set 'feature-no-csum-offload'. This is used to indicate
1695 	 * that the guest cannot handle receiving packets without a
1696 	 * valid checksum. We don't check here, because packets passed
1697 	 * to the peer _always_ have a valid checksum.
1698 	 *
1699 	 * There are three cases:
1700 	 *
1701 	 * - the NIC is dedicated: packets from the wire should always
1702 	 *   have a valid checksum. If the hardware validates the
1703 	 *   checksum then the relevant bit will be set in the packet
1704 	 *   attributes and we will inform the peer. It can choose to
1705 	 *   ignore the hardware verification.
1706 	 *
1707 	 * - the NIC is shared (VNIC) and a packet originates from the
1708 	 *   wire: this is the same as the case above - the packets
1709 	 *   will have a valid checksum.
1710 	 *
1711 	 * - the NIC is shared (VNIC) and a packet originates from the
1712 	 *   host: the MAC layer ensures that all such packets have a
1713 	 *   valid checksum by calculating one if the stack did not.
1714 	 */
1715 
1716 	return (B_TRUE);
1717 }
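/*
 * Editorial sketch (not part of the original source): the other-end
 * xenstore area read above typically contains nodes along these
 * lines, where the numeric values are illustrative only:
 *
 *	event-channel = "12"
 *	tx-ring-ref = "768"
 *	rx-ring-ref = "769"
 *	request-rx-copy = "1"
 *	request-multicast-control = "1"
 *
 * The first three are mandatory (xenbus_gather() above fails without
 * them); the request-* entries are optional and default to 0, leaving
 * the corresponding feature disabled.
 */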
1718 
1719 void
1720 xnb_start_connect(xnb_t *xnbp)
1721 {
1722 	dev_info_t  *dip = xnbp->xnb_devinfo;
1723 
1724 	if (!xnb_connect_rings(dip)) {
1725 		cmn_err(CE_WARN, "xnb_start_connect: "
1726 		    "cannot connect rings");
1727 		goto failed;
1728 	}
1729 
1730 	if (!xnbp->xnb_flavour->xf_start_connect(xnbp)) {
1731 		cmn_err(CE_WARN, "xnb_start_connect: "
1732 		    "flavour failed to connect");
1733 		goto failed;
1734 	}
1735 
1736 	(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1737 	return;
1738 
1739 failed:
1740 	xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1741 	xnb_disconnect_rings(dip);
1742 	(void) xvdi_switch_state(dip, XBT_NULL,
1743 	    XenbusStateClosed);
1744 	(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1745 }
1746 
1747 static boolean_t
1748 xnb_connect_rings(dev_info_t *dip)
1749 {
1750 	xnb_t *xnbp = ddi_get_driver_private(dip);
1751 	struct gnttab_map_grant_ref map_op;
1752 
1753 	/*
1754 	 * Cannot attempt to connect the rings if already connected.
1755 	 */
1756 	ASSERT(!xnbp->xnb_connected);
1757 
1758 	/*
1759 	 * 1. allocate a vaddr for the tx page, one for the rx page.
1760 	 * 2. call GNTTABOP_map_grant_ref to map the relevant pages
1761 	 *    into the allocated vaddr (one for tx, one for rx).
1762 	 * 3. call EVTCHNOP_bind_interdomain to have the event channel
1763 	 *    bound to this domain.
1764 	 * 4. associate the event channel with an interrupt.
1765 	 * 5. enable the interrupt.
1766 	 */
1767 
1768 	/* 1.tx */
1769 	xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1770 	    0, 0, 0, 0, VM_SLEEP);
1771 	ASSERT(xnbp->xnb_tx_ring_addr != NULL);
1772 
1773 	/* 2.tx */
1774 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
1775 	map_op.flags = GNTMAP_host_map;
1776 	map_op.ref = xnbp->xnb_tx_ring_ref;
1777 	map_op.dom = xnbp->xnb_peer;
1778 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr, NULL);
1779 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1780 	    map_op.status != 0) {
1781 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
1782 		goto fail;
1783 	}
1784 	xnbp->xnb_tx_ring_handle = map_op.handle;
1785 
1786 	/* LINTED: constant in conditional context */
1787 	BACK_RING_INIT(&xnbp->xnb_tx_ring,
1788 	    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);
1789 
1790 	/* 1.rx */
1791 	xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
1792 	    0, 0, 0, 0, VM_SLEEP);
1793 	ASSERT(xnbp->xnb_rx_ring_addr != NULL);
1794 
1795 	/* 2.rx */
1796 	map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
1797 	map_op.flags = GNTMAP_host_map;
1798 	map_op.ref = xnbp->xnb_rx_ring_ref;
1799 	map_op.dom = xnbp->xnb_peer;
1800 	hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr, NULL);
1801 	if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
1802 	    map_op.status != 0) {
1803 		cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
1804 		goto fail;
1805 	}
1806 	xnbp->xnb_rx_ring_handle = map_op.handle;
1807 
1808 	/* LINTED: constant in conditional context */
1809 	BACK_RING_INIT(&xnbp->xnb_rx_ring,
1810 	    (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);
1811 
1812 	/* 3 */
1813 	if (xvdi_bind_evtchn(dip, xnbp->xnb_fe_evtchn) != DDI_SUCCESS) {
1814 		cmn_err(CE_WARN, "xnb_connect_rings: "
1815 		    "cannot bind event channel %d", xnbp->xnb_fe_evtchn);
1816 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1817 		goto fail;
1818 	}
1819 	xnbp->xnb_evtchn = xvdi_get_evtchn(dip);
1820 
1821 	/*
1822 	 * It would be good to set the state to XenbusStateConnected
1823 	 * here as well, but then what if ddi_add_intr() failed?
1824 	 * Changing the state in the store will be noticed by the peer
1825 	 * and cannot be "taken back".
1826 	 */
1827 	mutex_enter(&xnbp->xnb_tx_lock);
1828 	mutex_enter(&xnbp->xnb_rx_lock);
1829 
1830 	xnbp->xnb_connected = B_TRUE;
1831 
1832 	mutex_exit(&xnbp->xnb_rx_lock);
1833 	mutex_exit(&xnbp->xnb_tx_lock);
1834 
1835 	/* 4, 5 */
1836 	if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
1837 	    != DDI_SUCCESS) {
1838 		cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
1839 		goto fail;
1840 	}
1841 	xnbp->xnb_irq = B_TRUE;
1842 
1843 	return (B_TRUE);
1844 
1845 fail:
1846 	mutex_enter(&xnbp->xnb_tx_lock);
1847 	mutex_enter(&xnbp->xnb_rx_lock);
1848 
1849 	xnbp->xnb_connected = B_FALSE;
1850 
1851 	mutex_exit(&xnbp->xnb_rx_lock);
1852 	mutex_exit(&xnbp->xnb_tx_lock);
1853 
1854 	return (B_FALSE);
1855 }
1856 
1857 static void
1858 xnb_disconnect_rings(dev_info_t *dip)
1859 {
1860 	xnb_t *xnbp = ddi_get_driver_private(dip);
1861 
1862 	if (xnbp->xnb_irq) {
1863 		ddi_remove_intr(dip, 0, NULL);
1864 		xnbp->xnb_irq = B_FALSE;
1865 	}
1866 
1867 	if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
1868 		xvdi_free_evtchn(dip);
1869 		xnbp->xnb_evtchn = INVALID_EVTCHN;
1870 	}
1871 
1872 	if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
1873 		struct gnttab_unmap_grant_ref unmap_op;
1874 
1875 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1876 		    xnbp->xnb_rx_ring_addr;
1877 		unmap_op.dev_bus_addr = 0;
1878 		unmap_op.handle = xnbp->xnb_rx_ring_handle;
1879 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1880 		    &unmap_op, 1) != 0)
1881 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1882 			    "cannot unmap rx-ring page (%d)",
1883 			    unmap_op.status);
1884 
1885 		xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
1886 	}
1887 
1888 	if (xnbp->xnb_rx_ring_addr != NULL) {
1889 		hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
1890 		vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
1891 		xnbp->xnb_rx_ring_addr = NULL;
1892 	}
1893 
1894 	if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
1895 		struct gnttab_unmap_grant_ref unmap_op;
1896 
1897 		unmap_op.host_addr = (uint64_t)(uintptr_t)
1898 		    xnbp->xnb_tx_ring_addr;
1899 		unmap_op.dev_bus_addr = 0;
1900 		unmap_op.handle = xnbp->xnb_tx_ring_handle;
1901 		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1902 		    &unmap_op, 1) != 0)
1903 			cmn_err(CE_WARN, "xnb_disconnect_rings: "
1904 			    "cannot unmap tx-ring page (%d)",
1905 			    unmap_op.status);
1906 
1907 		xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
1908 	}
1909 
1910 	if (xnbp->xnb_tx_ring_addr != NULL) {
1911 		hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
1912 		vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
1913 		xnbp->xnb_tx_ring_addr = NULL;
1914 	}
1915 }
1916 
1917 static void
1918 xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1919     void *arg, void *impl_data)
1920 {
1921 	_NOTE(ARGUNUSED(id, arg));
1922 	xnb_t *xnbp = ddi_get_driver_private(dip);
1923 	XenbusState new_state = *(XenbusState *)impl_data;
1924 
1925 	ASSERT(xnbp != NULL);
1926 
1927 	switch (new_state) {
1928 	case XenbusStateConnected:
1929 		/* spurious state change */
1930 		if (xnbp->xnb_connected)
1931 			return;
1932 
1933 		if (!xnb_read_oe_config(xnbp) ||
1934 		    !xnbp->xnb_flavour->xf_peer_connected(xnbp)) {
1935 			cmn_err(CE_WARN, "xnb_oe_state_change: "
1936 			    "read otherend config error");
1937 			(void) xvdi_switch_state(dip, XBT_NULL,
1938 			    XenbusStateClosed);
1939 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1940 
1941 			break;
1942 		}
1943 
1944 
1945 		mutex_enter(&xnbp->xnb_state_lock);
1946 		xnbp->xnb_fe_status = XNB_STATE_READY;
1947 		if (xnbp->xnb_be_status == XNB_STATE_READY)
1948 			xnb_start_connect(xnbp);
1949 		mutex_exit(&xnbp->xnb_state_lock);
1950 
1951 		/*
1952 		 * Now that we've attempted to connect it's reasonable
1953 		 * to allow an attempt to detach.
1954 		 */
1955 		xnbp->xnb_detachable = B_TRUE;
1956 
1957 		break;
1958 
1959 	case XenbusStateClosing:
1960 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);
1961 
1962 		break;
1963 
1964 	case XenbusStateClosed:
1965 		xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
1966 
1967 		mutex_enter(&xnbp->xnb_tx_lock);
1968 		mutex_enter(&xnbp->xnb_rx_lock);
1969 
1970 		xnb_disconnect_rings(dip);
1971 		xnbp->xnb_connected = B_FALSE;
1972 
1973 		mutex_exit(&xnbp->xnb_rx_lock);
1974 		mutex_exit(&xnbp->xnb_tx_lock);
1975 
1976 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
1977 		(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1978 		/*
1979 		 * In all likelihood this is already set (in the above
1980 		 * case), but if the peer never attempted to connect
1981 		 * and the domain is destroyed we get here without
1982 		 * having been through the case above, so we set it to
1983 		 * be sure.
1984 		 */
1985 		xnbp->xnb_detachable = B_TRUE;
1986 
1987 		break;
1988 
1989 	default:
1990 		break;
1991 	}
1992 }
1993 
1994 static void
1995 xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
1996     void *arg, void *impl_data)
1997 {
1998 	_NOTE(ARGUNUSED(id, arg));
1999 	xnb_t *xnbp = ddi_get_driver_private(dip);
2000 	xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
2001 
2002 	ASSERT(xnbp != NULL);
2003 
2004 	switch (state) {
2005 	case Connected:
2006 		/* spurious hotplug event */
2007 		if (xnbp->xnb_hotplugged)
2008 			break;
2009 
2010 		if (!xnb_read_xs_config(xnbp))
2011 			break;
2012 
2013 		if (!xnbp->xnb_flavour->xf_hotplug_connected(xnbp))
2014 			break;
2015 
2016 		mutex_enter(&xnbp->xnb_tx_lock);
2017 		mutex_enter(&xnbp->xnb_rx_lock);
2018 
2019 		xnbp->xnb_hotplugged = B_TRUE;
2020 
2021 		mutex_exit(&xnbp->xnb_rx_lock);
2022 		mutex_exit(&xnbp->xnb_tx_lock);
2023 
2024 		mutex_enter(&xnbp->xnb_state_lock);
2025 		xnbp->xnb_be_status = XNB_STATE_READY;
2026 		if (xnbp->xnb_fe_status == XNB_STATE_READY)
2027 			xnb_start_connect(xnbp);
2028 		mutex_exit(&xnbp->xnb_state_lock);
2029 
2030 		break;
2031 
2032 	default:
2033 		break;
2034 	}
2035 }
2036 
2037 static struct modldrv modldrv = {
2038 	&mod_miscops, "xnb",
2039 };
2040 
2041 static struct modlinkage modlinkage = {
2042 	MODREV_1, &modldrv, NULL
2043 };
2044 
2045 int
2046 _init(void)
2047 {
2048 	int i;
2049 
2050 	mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);
2051 
2052 	i = mod_install(&modlinkage);
2053 	if (i != DDI_SUCCESS)
2054 		mutex_destroy(&xnb_alloc_page_lock);
2055 
2056 	return (i);
2057 }
2058 
2059 int
2060 _info(struct modinfo *modinfop)
2061 {
2062 	return (mod_info(&modlinkage, modinfop));
2063 }
2064 
2065 int
2066 _fini(void)
2067 {
2068 	int i;
2069 
2070 	i = mod_remove(&modlinkage);
2071 	if (i == DDI_SUCCESS)
2072 		mutex_destroy(&xnb_alloc_page_lock);
2073 
2074 	return (i);
2075 }
2076