xref: /titanic_52/usr/src/uts/i86xpv/os/evtchn.c (revision 922d2c76afbee21520ffa2088c4e60dcb80d3945)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * evtchn.c
31  *
32  * Communication via hypervisor event channels.
33  *
34  * Copyright (c) 2002-2005, K A Fraser
35  *
36  * This file may be distributed separately from the Linux kernel, or
37  * incorporated into other software packages, subject to the following license:
38  *
39  * Permission is hereby granted, free of charge, to any person obtaining a copy
40  * of this source file (the "Software"), to deal in the Software without
41  * restriction, including without limitation the rights to use, copy, modify,
42  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
43  * and to permit persons to whom the Software is furnished to do so, subject to
44  * the following conditions:
45  *
46  * The above copyright notice and this permission notice shall be included in
47  * all copies or substantial portions of the Software.
48  *
49  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
52  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
53  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
54  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
55  * IN THE SOFTWARE.
56  */
57 
58 /* some parts derived from netbsd's hypervisor_machdep.c 1.2.2.2 */
59 
60 /*
61  *
62  * Copyright (c) 2004 Christian Limpach.
63  * All rights reserved.
64  *
65  * Redistribution and use in source and binary forms, with or without
66  * modification, are permitted provided that the following conditions
67  * are met:
68  * 1. Redistributions of source code must retain the above copyright
69  *    notice, this list of conditions and the following disclaimer.
70  * 2. Redistributions in binary form must reproduce the above copyright
71  *    notice, this list of conditions and the following disclaimer in the
72  *    documentation and/or other materials provided with the distribution.
73  * 3. This section intentionally left blank.
74  * 4. The name of the author may not be used to endorse or promote products
75  *    derived from this software without specific prior written permission.
76  *
77  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
78  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
79  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
80  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
81  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
82  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
83  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
84  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
85  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
86  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
87  */
88 /*
89  * Section 3 of the above license was updated in response to bug 6379571.
90  */
91 
92 #include <sys/types.h>
93 #include <sys/hypervisor.h>
94 #include <sys/machsystm.h>
95 #include <sys/mutex.h>
96 #include <sys/evtchn_impl.h>
97 #include <sys/ddi_impldefs.h>
98 #include <sys/avintr.h>
99 #include <sys/cpuvar.h>
100 #include <sys/smp_impldefs.h>
101 #include <sys/archsystm.h>
102 #include <sys/sysmacros.h>
103 #include <sys/cmn_err.h>
104 #include <sys/promif.h>
105 #include <sys/debug.h>
106 #include <sys/psm.h>
107 #include <sys/privregs.h>
108 #include <sys/trap.h>
109 #include <sys/atomic.h>
110 #include <sys/cpu.h>
111 #include <sys/psw.h>
112 #include <sys/traptrace.h>
113 #include <sys/stack.h>
114 #include <sys/x_call.h>
115 #include <xen/public/physdev.h>
116 
117 /*
118  * This file manages our association between hypervisor event channels and
119  * Solaris's IRQs.  This is a one-to-one mapping, with the exception of
120  * IPI IRQs, for which there is one event channel per CPU participating
121  * in the IPI, and the clock VIRQ which also has an event channel per cpu
122  * and the IRQ for /dev/xen/evtchn. The IRQ types are:
123  *
124  * IRQT_VIRQ:
125  *	The hypervisor's standard virtual IRQ, used for the clock timer, for
126  *	example.  This code allows any cpu to bind to one of these, although
127  *	some are treated specially (i.e. VIRQ_DEBUG).
128  *	Event channel binding is done via EVTCHNOP_bind_virq.
129  *
130  * IRQT_PIRQ:
131  *	These associate a physical IRQ with an event channel via
132  *	EVTCHNOP_bind_pirq.
133  *
134  * IRQT_IPI:
135  *	A cross-call IRQ. Maps to "ncpus" event channels, each of which is
136  *	bound to exactly one of the vcpus.  We do not currently support
137  *	unbinding of IPIs (since Solaris doesn't need it). Uses
138  *	EVTCHNOP_bind_ipi.
139  *
140  * IRQT_EVTCHN:
141  *	A "normal" binding to an event channel, typically used by the frontend
 *	drivers to bind to their backend event channel.
143  *
144  * IRQT_DEV_EVTCHN:
145  *	This is a one-time IRQ used by /dev/xen/evtchn. Unlike other IRQs, we
146  *	have a one-IRQ to many-evtchn mapping. We only track evtchn->irq for
147  *	these event channels, which are managed via ec_irq_add/rm_evtchn().
148  *	We enforce that IRQT_DEV_EVTCHN's representative evtchn (->ii_evtchn)
149  *	is zero, and make any calls to irq_evtchn() an error, to prevent
150  *	accidentally attempting to use the illegal evtchn 0.
151  *
152  * Suspend/resume
153  *
154  *	During a suspend/resume cycle, we need to tear down the event channels.
155  *	All other mapping data is kept. The drivers will remove their own event
156  *	channels via xendev on receiving a DDI_SUSPEND.  This leaves us with
157  *	the IPIs and VIRQs, which we handle in ec_suspend() and ec_resume()
158  *	below.
159  *
160  * CPU binding
161  *
162  *	When an event channel is bound to a CPU, we set a bit in a mask present
163  *	in the machcpu (evt_affinity) to indicate that this CPU can accept this
164  *	event channel.  For both IPIs and VIRQs, this binding is fixed at
165  *	allocation time and we never modify it.  All other event channels are
166  *	bound via the PSM either as part of add_avintr(), or interrupt
167  *	redistribution (xen_psm_dis/enable_intr()) as a result of CPU
168  *	offline/online.
169  *
170  * Locking
171  *
172  *	Updates are done holding the ec_lock.  The xen_callback_handler()
173  *	routine reads the mapping data in a lockless fashion.  Additionally
174  *	suspend takes ec_lock to prevent update races during a suspend/resume
175  *	cycle.  The IPI info is also examined without the lock; this is OK
176  *	since we only ever change IPI info during initial setup and resume.
177  */
178 
/* True when 'irq' is the IRQ recorded in ipi_info[] for XC_CPUPOKE_PIL. */
#define	IRQ_IS_CPUPOKE(irq) (ipi_info[XC_CPUPOKE_PIL].mi_irq == (irq))

/*
 * Non-zero when the bit for event channel 'ev' is set in the shared-info
 * evtchn_mask[] bitmap.  (ev >> EVTCHN_SHIFT) selects the word and the low
 * EVTCHN_SHIFT bits of 'ev' select the bit within it.
 */
#define	EVTCHN_MASKED(ev) \
	(HYPERVISOR_shared_info->evtchn_mask[(ev) >> EVTCHN_SHIFT] & \
	(1ul << ((ev) & ((1ul << EVTCHN_SHIFT) - 1))))
184 
/* Event channel -> IRQ map; INVALID_IRQ marks a channel with no IRQ. */
static short evtchn_to_irq[NR_EVENT_CHANNELS];
/* For each event channel, the set of CPUs it is currently bound to. */
static cpuset_t evtchn_cpus[NR_EVENT_CHANNELS];
/*
 * NOTE(review): evtchn_owner and evtchn_owner_thread are not referenced in
 * the visible part of this file; presumably they are used by the callback
 * handler further down -- confirm before documenting further.
 */
static int	evtchn_owner[NR_EVENT_CHANNELS];
#ifdef DEBUG
static kthread_t *evtchn_owner_thread[NR_EVENT_CHANNELS];
#endif

/* Per-IRQ state, indexed by IRQ number. */
static irq_info_t irq_info[NR_IRQS];
/* Per-IPI state, indexed by interrupt priority level. */
static mec_info_t ipi_info[MAXIPL];
/* Per-VIRQ state, indexed by VIRQ number. */
static mec_info_t virq_info[NR_VIRQS];
/*
 * Mailbox for communication with the evtchn device driver.
 * We rely on only cpu 0 servicing the event channels associated
 * with the driver.  i.e. all evtchn driver evtchns are bound to cpu 0.
 */
volatile int ec_dev_mbox;	/* mailbox for evtchn device driver */

/*
 * See the locking description above.
 */
kmutex_t ec_lock;

/*
 * Bitmap indicating which PIRQs require the hypervisor to be notified
 * on unmask.
 */
static unsigned long pirq_needs_eoi[NR_PIRQS / (sizeof (unsigned long) * NBBY)];

/* IRQ bound to VIRQ_DEBUG by ec_init_debug_irq(); INVALID_IRQ until then. */
static int ec_debug_irq = INVALID_IRQ;
/* The /dev/xen/evtchn IRQ (see IRQT_DEV_EVTCHN above). */
int ec_dev_irq = INVALID_IRQ;
215 
216 int
217 xen_bind_virq(unsigned int virq, processorid_t cpu, int *port)
218 {
219 	evtchn_bind_virq_t bind;
220 	int err;
221 
222 	bind.virq = virq;
223 	bind.vcpu = cpu;
224 	if ((err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind)) == 0)
225 		*port = bind.port;
226 	else
227 		err = xen_xlate_errcode(err);
228 	return (err);
229 }
230 
231 int
232 xen_bind_interdomain(int domid, int remote_port, int *port)
233 {
234 	evtchn_bind_interdomain_t bind;
235 	int err;
236 
237 	bind.remote_dom  = domid;
238 	bind.remote_port = remote_port;
239 	if ((err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
240 	    &bind)) == 0)
241 		*port = bind.local_port;
242 	else
243 		err = xen_xlate_errcode(err);
244 	return (err);
245 }
246 
247 int
248 xen_alloc_unbound_evtchn(int domid, int *evtchnp)
249 {
250 	evtchn_alloc_unbound_t alloc;
251 	int err;
252 
253 	alloc.dom = DOMID_SELF;
254 	alloc.remote_dom = domid;
255 
256 	if ((err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
257 	    &alloc)) == 0) {
258 		*evtchnp = alloc.port;
259 		/* ensure evtchn is masked till we're ready to use it */
260 		(void) ec_mask_evtchn(*evtchnp);
261 	} else {
262 		err = xen_xlate_errcode(err);
263 	}
264 
265 	return (err);
266 }
267 
268 static int
269 xen_close_evtchn(int evtchn)
270 {
271 	evtchn_close_t close;
272 	int err;
273 
274 	close.port = evtchn;
275 	err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
276 	if (err)
277 		err = xen_xlate_errcode(err);
278 	return (err);
279 }
280 
281 static int
282 xen_bind_ipi(processorid_t cpu)
283 {
284 	evtchn_bind_ipi_t bind;
285 
286 	ASSERT(MUTEX_HELD(&ec_lock));
287 
288 	bind.vcpu = cpu;
289 	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind) != 0)
290 		panic("xen_bind_ipi() failed");
291 	return (bind.port);
292 }
293 
294 /* Send future instances of this interrupt to other vcpu. */
295 static void
296 xen_bind_vcpu(int evtchn, int cpu)
297 {
298 	evtchn_bind_vcpu_t bind;
299 
300 	ASSERT(MUTEX_HELD(&ec_lock));
301 
302 	bind.port = evtchn;
303 	bind.vcpu = cpu;
304 	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind) != 0)
305 		panic("xen_bind_vcpu() failed");
306 }
307 
308 static int
309 xen_bind_pirq(int pirq)
310 {
311 	evtchn_bind_pirq_t bind;
312 	int ret;
313 
314 	bind.pirq = pirq;
315 	bind.flags = BIND_PIRQ__WILL_SHARE;
316 	if ((ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind)) != 0)
317 		panic("xen_bind_pirq() failed (err %d)", ret);
318 	return (bind.port);
319 }
320 
321 /* unmask an evtchn and send upcall to appropriate vcpu if pending bit is set */
322 static void
323 xen_evtchn_unmask(int evtchn)
324 {
325 	evtchn_unmask_t unmask;
326 
327 	unmask.port = evtchn;
328 	if (HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask) != 0)
329 		panic("xen_evtchn_unmask() failed");
330 }
331 
332 static void
333 update_evtchn_affinity(int evtchn)
334 {
335 	cpu_t *cp;
336 	struct xen_evt_data *cpe;
337 
338 	ASSERT(evtchn_to_irq[evtchn] != INVALID_IRQ);
339 	ASSERT(MUTEX_HELD(&ec_lock));
340 
341 	/*
342 	 * Use lockless search of cpu_list, similar to mutex_vector_enter().
343 	 */
344 	kpreempt_disable();
345 	cp = cpu_list;
346 	do {
347 		cpe = cp->cpu_m.mcpu_evt_pend;
348 		if (CPU_IN_SET(evtchn_cpus[evtchn], cp->cpu_id))
349 			SET_EVTCHN_BIT(evtchn, cpe->evt_affinity);
350 		else
351 			CLEAR_EVTCHN_BIT(evtchn, cpe->evt_affinity);
352 	} while ((cp = cp->cpu_next) != cpu_list);
353 	kpreempt_enable();
354 }
355 
356 static void
357 bind_evtchn_to_cpuset(int evtchn, cpuset_t cpus)
358 {
359 	ASSERT(evtchn_to_irq[evtchn] != INVALID_IRQ);
360 
361 	CPUSET_ZERO(evtchn_cpus[evtchn]);
362 	CPUSET_OR(evtchn_cpus[evtchn], cpus);
363 	update_evtchn_affinity(evtchn);
364 }
365 
366 static void
367 clear_evtchn_affinity(int evtchn)
368 {
369 	CPUSET_ZERO(evtchn_cpus[evtchn]);
370 	update_evtchn_affinity(evtchn);
371 }
372 
373 static void
374 alloc_irq_evtchn(int irq, int index, int evtchn, int cpu)
375 {
376 	irq_info_t *irqp = &irq_info[irq];
377 
378 	switch (irqp->ii_type) {
379 	case IRQT_IPI:
380 		ipi_info[index].mi_evtchns[cpu] = evtchn;
381 		irqp->ii_u.index = index;
382 		break;
383 	case IRQT_VIRQ:
384 		virq_info[index].mi_evtchns[cpu] = evtchn;
385 		irqp->ii_u.index = index;
386 		break;
387 	default:
388 		irqp->ii_u.evtchn = evtchn;
389 		break;
390 	}
391 
392 	evtchn_to_irq[evtchn] = irq;
393 
394 	/*
395 	 * If a CPU is not specified, we expect to bind it to a CPU later via
396 	 * the PSM.
397 	 */
398 	if (cpu != -1) {
399 		cpuset_t tcpus;
400 		CPUSET_ONLY(tcpus, cpu);
401 		bind_evtchn_to_cpuset(evtchn, tcpus);
402 	}
403 }
404 
405 static int
406 alloc_irq(int type, int index, int evtchn, int cpu)
407 {
408 	int irq;
409 	irq_info_t *irqp;
410 
411 	ASSERT(MUTEX_HELD(&ec_lock));
412 	ASSERT(type != IRQT_IPI || cpu != -1);
413 
414 	for (irq = 0; irq < NR_IRQS; irq++) {
415 		if (irq_info[irq].ii_type == IRQT_UNBOUND)
416 			break;
417 	}
418 
419 	if (irq == NR_IRQS)
420 		panic("No available IRQ to bind to: increase NR_IRQS!\n");
421 
422 	irqp = &irq_info[irq];
423 
424 	irqp->ii_type = type;
425 	/*
426 	 * Set irq/has_handler field to zero which means handler not installed
427 	 */
428 	irqp->ii_u2.has_handler = 0;
429 
430 	alloc_irq_evtchn(irq, index, evtchn, cpu);
431 	return (irq);
432 }
433 
434 static int
435 irq_evtchn(irq_info_t *irqp)
436 {
437 	int evtchn;
438 
439 	ASSERT(irqp->ii_type != IRQT_DEV_EVTCHN);
440 
441 	switch (irqp->ii_type) {
442 	case IRQT_IPI:
443 		ASSERT(irqp->ii_u.index != 0);
444 		evtchn = ipi_info[irqp->ii_u.index].mi_evtchns[CPU->cpu_id];
445 		break;
446 	case IRQT_VIRQ:
447 		evtchn = virq_info[irqp->ii_u.index].mi_evtchns[CPU->cpu_id];
448 		break;
449 	default:
450 		evtchn = irqp->ii_u.evtchn;
451 		break;
452 	}
453 
454 	return (evtchn);
455 }
456 
457 static void
458 unbind_evtchn(ushort_t *evtchnp)
459 {
460 	int err;
461 
462 	ASSERT(MUTEX_HELD(&ec_lock));
463 
464 	ASSERT(*evtchnp != 0);
465 
466 	err = xen_close_evtchn(*evtchnp);
467 	ASSERT(err == 0);
468 	clear_evtchn_affinity(*evtchnp);
469 	evtchn_to_irq[*evtchnp] = INVALID_IRQ;
470 	*evtchnp = 0;
471 }
472 
473 static void
474 pirq_unmask_notify(int pirq)
475 {
476 	struct physdev_eoi eoi;
477 
478 	if (TEST_EVTCHN_BIT(pirq, &pirq_needs_eoi[0])) {
479 		eoi.irq = pirq;
480 		(void) HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
481 	}
482 }
483 
484 static void
485 pirq_query_unmask(int pirq)
486 {
487 	struct physdev_irq_status_query irq_status;
488 
489 	irq_status.irq = pirq;
490 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status);
491 	CLEAR_EVTCHN_BIT(pirq, &pirq_needs_eoi[0]);
492 	if (irq_status.flags & XENIRQSTAT_needs_eoi)
493 		SET_EVTCHN_BIT(pirq, &pirq_needs_eoi[0]);
494 }
495 
496 static void
497 end_pirq(int irq)
498 {
499 	int evtchn = irq_evtchn(&irq_info[irq]);
500 
501 	ec_unmask_evtchn(evtchn);
502 	pirq_unmask_notify(IRQ_TO_PIRQ(irq));
503 }
504 
505 /*
506  * probe if a pirq is available to bind to, return 1 if available
507  * else return 0.
508  * Note that for debug versions of xen this probe may cause an in use IRQ
509  * warning message from xen.
510  */
511 int
512 ec_probe_pirq(int pirq)
513 {
514 	evtchn_bind_pirq_t bind;
515 
516 	bind.pirq = pirq;
517 	bind.flags = 0;
518 	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind) != 0) {
519 		return (0);
520 	} else {
521 		(void) xen_close_evtchn(bind.port);
522 		return (1);
523 	}
524 }
525 
526 /*
527  * Bind an event channel to a vcpu
528  */
529 void
530 ec_bind_vcpu(int evtchn, int cpu)
531 {
532 	mutex_enter(&ec_lock);
533 	xen_bind_vcpu(evtchn, cpu);
534 	mutex_exit(&ec_lock);
535 }
536 
537 /*
538  * Set up a physical device irq to be associated with an event channel.
539  */
540 void
541 ec_setup_pirq(int irq, int ipl, cpuset_t *cpusp)
542 {
543 	int evtchn;
544 	irq_info_t *irqp = &irq_info[irq];
545 
546 	/*
547 	 * Test if this PIRQ is already bound to an evtchn,
548 	 * which means it is a shared IRQ and we don't want to
549 	 * bind and do some initial setup that has already been
550 	 * done for this irq on a previous trip through this code.
551 	 */
552 	if (irqp->ii_u.evtchn == INVALID_EVTCHN) {
553 		evtchn = xen_bind_pirq(irq);
554 
555 		pirq_query_unmask(IRQ_TO_PIRQ(irq));
556 
557 		irqp->ii_type = IRQT_PIRQ;
558 		irqp->ii_u.evtchn = evtchn;
559 
560 		evtchn_to_irq[evtchn] = irq;
561 		irqp->ii_u2.ipl = ipl;
562 		ec_set_irq_affinity(irq, *cpusp);
563 		ec_enable_irq(irq);
564 		pirq_unmask_notify(IRQ_TO_PIRQ(irq));
565 	} else {
566 		ASSERT(irqp->ii_u2.ipl != 0);
567 		cmn_err(CE_NOTE, "!IRQ%d is shared", irq);
568 		if (ipl > irqp->ii_u2.ipl)
569 			irqp->ii_u2.ipl = ipl;
570 		*cpusp = evtchn_cpus[irqp->ii_u.evtchn];
571 	}
572 }
573 
574 void
575 ec_unbind_irq(int irq)
576 {
577 	irq_info_t *irqp = &irq_info[irq];
578 	mec_info_t *virqp;
579 	int drop_lock = 0;
580 	int type, i;
581 
582 	/*
583 	 * Nasty, but we need this during suspend.
584 	 */
585 	if (mutex_owner(&ec_lock) != curthread) {
586 		mutex_enter(&ec_lock);
587 		drop_lock = 1;
588 	}
589 
590 	type = irqp->ii_type;
591 
592 	ASSERT((type == IRQT_EVTCHN) || (type == IRQT_PIRQ) ||
593 	    (type == IRQT_VIRQ));
594 
595 	if ((type == IRQT_EVTCHN) || (type == IRQT_PIRQ)) {
596 		/* There's only one event channel associated with this irq */
597 		unbind_evtchn(&irqp->ii_u.evtchn);
598 	} else if (type == IRQT_VIRQ) {
599 		/*
600 		 * Each cpu on the system can have it's own event channel
601 		 * associated with a virq.  Unbind them all.
602 		 */
603 		virqp = &virq_info[irqp->ii_u.index];
604 		for (i = 0; i < NCPU; i++) {
605 			if (virqp->mi_evtchns[i] != 0)
606 				unbind_evtchn(&virqp->mi_evtchns[i]);
607 		}
608 		/* Mark the virq structure as invalid. */
609 		virqp->mi_irq = INVALID_IRQ;
610 	}
611 
612 	bzero(irqp, sizeof (*irqp));
613 	/* Re-reserve PIRQ. */
614 	if (type == IRQT_PIRQ)
615 		irqp->ii_type = IRQT_PIRQ;
616 
617 	if (drop_lock)
618 		mutex_exit(&ec_lock);
619 }
620 
621 /*
622  * Rebind an event channel for delivery to a CPU.
623  */
624 void
625 ec_set_irq_affinity(int irq, cpuset_t dest)
626 {
627 	int evtchn, tcpu;
628 	irq_info_t *irqp = &irq_info[irq];
629 
630 	mutex_enter(&ec_lock);
631 
632 	ASSERT(irq < NR_IRQS);
633 	ASSERT(irqp->ii_type != IRQT_UNBOUND);
634 
635 	/*
636 	 * Binding is done at allocation time for these types, so we should
637 	 * never modify them.
638 	 */
639 	if (irqp->ii_type == IRQT_IPI || irqp->ii_type == IRQT_VIRQ ||
640 	    irqp->ii_type == IRQT_DEV_EVTCHN) {
641 		mutex_exit(&ec_lock);
642 		return;
643 	}
644 
645 	CPUSET_FIND(dest, tcpu);
646 	ASSERT(tcpu != CPUSET_NOTINSET);
647 
648 	evtchn = irq_evtchn(irqp);
649 
650 	xen_bind_vcpu(evtchn, tcpu);
651 
652 	bind_evtchn_to_cpuset(evtchn, dest);
653 
654 	mutex_exit(&ec_lock);
655 
656 	/*
657 	 * Now send the new target processor a NOP IPI.
658 	 * It will check for any pending interrupts, and so service any that
659 	 * got delivered to the wrong processor by mistake.
660 	 */
661 	if (ncpus > 1)
662 		poke_cpu(tcpu);
663 }
664 
665 int
666 ec_set_irq_priority(int irq, int pri)
667 {
668 	irq_info_t *irqp;
669 
670 	if (irq >= NR_IRQS)
671 		return (-1);
672 
673 	irqp = &irq_info[irq];
674 
675 	if (irqp->ii_type == IRQT_UNBOUND)
676 		return (-1);
677 
678 	irqp->ii_u2.ipl = pri;
679 
680 	return (0);
681 }
682 
683 void
684 ec_clear_irq_priority(int irq)
685 {
686 	irq_info_t *irqp = &irq_info[irq];
687 
688 	ASSERT(irq < NR_IRQS);
689 	ASSERT(irqp->ii_type != IRQT_UNBOUND);
690 
691 	irqp->ii_u2.ipl = 0;
692 }
693 
694 int
695 ec_bind_evtchn_to_irq(int evtchn)
696 {
697 	mutex_enter(&ec_lock);
698 
699 	ASSERT(evtchn_to_irq[evtchn] == INVALID_IRQ);
700 
701 	(void) alloc_irq(IRQT_EVTCHN, 0, evtchn, -1);
702 
703 	mutex_exit(&ec_lock);
704 	return (evtchn_to_irq[evtchn]);
705 }
706 
707 int
708 ec_bind_virq_to_irq(int virq, int cpu)
709 {
710 	int err;
711 	int evtchn;
712 	mec_info_t *virqp;
713 
714 	virqp = &virq_info[virq];
715 	mutex_enter(&ec_lock);
716 
717 	err = xen_bind_virq(virq, cpu, &evtchn);
718 	ASSERT(err == 0);
719 
720 	ASSERT(evtchn_to_irq[evtchn] == INVALID_IRQ);
721 
722 	if (virqp->mi_irq == INVALID_IRQ) {
723 		virqp->mi_irq = alloc_irq(IRQT_VIRQ, virq, evtchn, cpu);
724 	} else {
725 		alloc_irq_evtchn(virqp->mi_irq, virq, evtchn, cpu);
726 	}
727 
728 	mutex_exit(&ec_lock);
729 
730 	return (virqp->mi_irq);
731 }
732 
733 int
734 ec_bind_ipi_to_irq(int ipl, int cpu)
735 {
736 	int evtchn;
737 	ulong_t flags;
738 	mec_info_t *ipip;
739 
740 	mutex_enter(&ec_lock);
741 
742 	ipip = &ipi_info[ipl];
743 
744 	evtchn = xen_bind_ipi(cpu);
745 
746 	ASSERT(evtchn_to_irq[evtchn] == INVALID_IRQ);
747 
748 	if (ipip->mi_irq == INVALID_IRQ) {
749 		ipip->mi_irq = alloc_irq(IRQT_IPI, ipl, evtchn, cpu);
750 	} else {
751 		alloc_irq_evtchn(ipip->mi_irq, ipl, evtchn, cpu);
752 	}
753 
754 	/*
755 	 * Unmask the new evtchn so that it can be seen by the target cpu
756 	 */
757 	flags = intr_clear();
758 	ec_unmask_evtchn(evtchn);
759 	intr_restore(flags);
760 
761 	mutex_exit(&ec_lock);
762 	return (ipip->mi_irq);
763 }
764 
765 /*
766  * When bringing up a CPU, bind to all the IPIs that CPU0 bound.
767  */
768 void
769 ec_bind_cpu_ipis(int cpu)
770 {
771 	int i;
772 
773 	for (i = 0; i < MAXIPL; i++) {
774 		mec_info_t *ipip = &ipi_info[i];
775 		if (ipip->mi_irq == INVALID_IRQ)
776 			continue;
777 
778 		(void) ec_bind_ipi_to_irq(i, cpu);
779 	}
780 }
781 
782 /*
783  * Can this IRQ be rebound to another CPU?
784  */
785 int
786 ec_irq_rebindable(int irq)
787 {
788 	irq_info_t *irqp = &irq_info[irq];
789 
790 	if (irqp->ii_u.evtchn == 0)
791 		return (0);
792 
793 	return (irqp->ii_type == IRQT_EVTCHN || irqp->ii_type == IRQT_PIRQ);
794 }
795 
796 /*
797  * Should this IRQ be unbound from this CPU (which is being offlined) to
798  * another?
799  */
800 int
801 ec_irq_needs_rebind(int irq, int cpu)
802 {
803 	irq_info_t *irqp = &irq_info[irq];
804 
805 	return (ec_irq_rebindable(irq) &&
806 	    CPU_IN_SET(evtchn_cpus[irqp->ii_u.evtchn], cpu));
807 }
808 
809 void
810 ec_send_ipi(int ipl, int cpu)
811 {
812 	mec_info_t *ipip = &ipi_info[ipl];
813 
814 	ASSERT(ipip->mi_irq != INVALID_IRQ);
815 
816 	ec_notify_via_evtchn(ipip->mi_evtchns[cpu]);
817 }
818 
819 void
820 ec_try_ipi(int ipl, int cpu)
821 {
822 	mec_info_t *ipip = &ipi_info[ipl];
823 
824 	if (ipip->mi_irq == INVALID_IRQ || ipip->mi_irq == 0)
825 		return;
826 
827 	ec_notify_via_evtchn(ipip->mi_evtchns[cpu]);
828 }
829 
830 void
831 ec_irq_add_evtchn(int irq, int evtchn)
832 {
833 	mutex_enter(&ec_lock);
834 
835 	/*
836 	 * See description of IRQT_DEV_EVTCHN above.
837 	 */
838 	ASSERT(irq == ec_dev_irq);
839 
840 	alloc_irq_evtchn(irq, 0, evtchn, 0);
841 	/*
842 	 * We enforce that the representative event channel for IRQT_DEV_EVTCHN
843 	 * is zero, so PSM operations on it have no effect.
844 	 */
845 	irq_info[irq].ii_u.evtchn = 0;
846 	mutex_exit(&ec_lock);
847 }
848 
849 void
850 ec_irq_rm_evtchn(int irq, int evtchn)
851 {
852 	ushort_t ec = evtchn;
853 
854 	mutex_enter(&ec_lock);
855 	ASSERT(irq == ec_dev_irq);
856 	unbind_evtchn(&ec);
857 	mutex_exit(&ec_lock);
858 }
859 
860 /*
861  * Allocate an /dev/xen/evtchn IRQ.  See the big comment at the top
862  * for an explanation.
863  */
864 int
865 ec_dev_alloc_irq(void)
866 {
867 	int i;
868 	irq_info_t *irqp;
869 
870 	for (i = 0; i < NR_IRQS; i++) {
871 		if (irq_info[i].ii_type == IRQT_UNBOUND)
872 			break;
873 	}
874 
875 	ASSERT(i != NR_IRQS);
876 
877 	irqp = &irq_info[i];
878 	irqp->ii_type = IRQT_DEV_EVTCHN;
879 	irqp->ii_u2.ipl = IPL_EVTCHN;
880 	/*
881 	 * Force the evtchn to zero for the special evtchn device irq
882 	 */
883 	irqp->ii_u.evtchn = 0;
884 	return (i);
885 }
886 
887 void
888 ec_enable_irq(unsigned int irq)
889 {
890 	ulong_t flag;
891 	irq_info_t *irqp = &irq_info[irq];
892 
893 	if (irqp->ii_type == IRQT_DEV_EVTCHN)
894 		return;
895 
896 	flag = intr_clear();
897 	ec_unmask_evtchn(irq_evtchn(irqp));
898 	intr_restore(flag);
899 }
900 
901 void
902 ec_disable_irq(unsigned int irq)
903 {
904 	irq_info_t *irqp = &irq_info[irq];
905 
906 	if (irqp->ii_type == IRQT_DEV_EVTCHN)
907 		return;
908 
909 	/*
910 	 * Spin till we are the one to mask the evtchn
911 	 * Ensures no one else can be servicing this evtchn.
912 	 */
913 	while (!ec_mask_evtchn(irq_evtchn(irqp)))
914 		SMT_PAUSE();
915 }
916 
917 static int
918 ec_evtchn_pending(uint_t ev)
919 {
920 	uint_t evi;
921 	shared_info_t *si = HYPERVISOR_shared_info;
922 
923 	evi = ev >> EVTCHN_SHIFT;
924 	ev &= (1ul << EVTCHN_SHIFT) - 1;
925 	return ((si->evtchn_pending[evi] & (1ul << ev)) != 0);
926 }
927 
928 int
929 ec_pending_irq(unsigned int irq)
930 {
931 	int evtchn = irq_evtchn(&irq_info[irq]);
932 
933 	return (ec_evtchn_pending(evtchn));
934 }
935 
936 void
937 ec_clear_irq(int irq)
938 {
939 	irq_info_t *irqp = &irq_info[irq];
940 	int evtchn;
941 
942 	if (irqp->ii_type == IRQT_DEV_EVTCHN)
943 		return;
944 
945 	ASSERT(irqp->ii_type != IRQT_UNBOUND);
946 
947 	evtchn = irq_evtchn(irqp);
948 
949 	ASSERT(EVTCHN_MASKED(evtchn));
950 	ec_clear_evtchn(evtchn);
951 }
952 
953 void
954 ec_unmask_irq(int irq)
955 {
956 	ulong_t flags;
957 	irq_info_t *irqp = &irq_info[irq];
958 
959 	flags = intr_clear();
960 	switch (irqp->ii_type) {
961 	case IRQT_PIRQ:
962 		end_pirq(irq);
963 		break;
964 	case IRQT_DEV_EVTCHN:
965 		break;
966 	default:
967 		ec_unmask_evtchn(irq_evtchn(irqp));
968 		break;
969 	}
970 	intr_restore(flags);
971 }
972 
973 void
974 ec_try_unmask_irq(int irq)
975 {
976 	ulong_t flags;
977 	irq_info_t *irqp = &irq_info[irq];
978 	int evtchn;
979 
980 	flags = intr_clear();
981 	switch (irqp->ii_type) {
982 	case IRQT_PIRQ:
983 		end_pirq(irq);
984 		break;
985 	case IRQT_DEV_EVTCHN:
986 		break;
987 	default:
988 		if ((evtchn = irq_evtchn(irqp)) != 0)
989 			ec_unmask_evtchn(evtchn);
990 		break;
991 	}
992 	intr_restore(flags);
993 }
994 
995 /*
996  * Poll until an event channel is ready or 'check_func' returns true.  This can
997  * only be used in a situation where interrupts are masked, otherwise we have a
998  * classic time-of-check vs. time-of-use race.
999  */
1000 void
1001 ec_wait_on_evtchn(int evtchn, int (*check_func)(void *), void *arg)
1002 {
1003 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1004 		while (!check_func(arg))
1005 			(void) HYPERVISOR_yield();
1006 		return;
1007 	}
1008 
1009 	ASSERT(CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask != 0);
1010 
1011 	for (;;) {
1012 		evtchn_port_t ports[1];
1013 
1014 		ports[0] = evtchn;
1015 
1016 		ec_clear_evtchn(evtchn);
1017 
1018 		if (check_func(arg))
1019 			return;
1020 
1021 		(void) HYPERVISOR_poll(ports, 1, 0);
1022 	}
1023 }
1024 
1025 void
1026 ec_wait_on_ipi(int ipl, int (*check_func)(void *), void *arg)
1027 {
1028 	mec_info_t *ipip = &ipi_info[ipl];
1029 
1030 	if (ipip->mi_irq == INVALID_IRQ || ipip->mi_irq == 0)
1031 		return;
1032 
1033 	ec_wait_on_evtchn(ipip->mi_evtchns[CPU->cpu_id], check_func, arg);
1034 }
1035 
1036 void
1037 ec_suspend(void)
1038 {
1039 	irq_info_t *irqp;
1040 	ushort_t *evtchnp;
1041 	int i;
1042 	int c;
1043 
1044 	ASSERT(MUTEX_HELD(&ec_lock));
1045 
1046 	for (i = 0; i < MAXIPL; i++) {
1047 		if (ipi_info[i].mi_irq == INVALID_IRQ)
1048 			continue;
1049 
1050 		for (c = 0; c < NCPU; c++) {
1051 			if (cpu[c] == NULL)
1052 				continue;
1053 
1054 			if (CPU_IN_SET(cpu_suspend_lost_set, c))
1055 				continue;
1056 
1057 			evtchnp = &ipi_info[i].mi_evtchns[c];
1058 			ASSERT(*evtchnp != 0);
1059 			unbind_evtchn(evtchnp);
1060 		}
1061 	}
1062 
1063 	for (i = 0; i < NR_VIRQS; i++) {
1064 		if (virq_info[i].mi_irq == INVALID_IRQ)
1065 			continue;
1066 
1067 		/*
1068 		 * If we're sharing a single event channel across all CPUs, we
1069 		 * should only unbind once.
1070 		 */
1071 		if (virq_info[i].mi_shared) {
1072 			evtchnp = &virq_info[i].mi_evtchns[0];
1073 			unbind_evtchn(evtchnp);
1074 			for (c = 1; c < NCPU; c++)
1075 				virq_info[i].mi_evtchns[c] = 0;
1076 		} else {
1077 			for (c = 0; c < NCPU; c++) {
1078 				if (cpu[c] == NULL)
1079 					continue;
1080 
1081 				evtchnp = &virq_info[i].mi_evtchns[c];
1082 				if (*evtchnp != 0)
1083 					unbind_evtchn(evtchnp);
1084 			}
1085 		}
1086 	}
1087 
1088 	for (i = 0; i < NR_IRQS; i++) {
1089 		irqp = &irq_info[i];
1090 
1091 		switch (irqp->ii_type) {
1092 		case IRQT_EVTCHN:
1093 		case IRQT_DEV_EVTCHN:
1094 			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
1095 			break;
1096 		case IRQT_PIRQ:
1097 			if (irqp->ii_u.evtchn != 0)
1098 				(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
1099 			break;
1100 		default:
1101 			break;
1102 		}
1103 	}
1104 }
1105 
1106 /*
1107  * The debug irq is special, we only have one evtchn and irq but we allow all
1108  * cpus to service it.  It's marked as shared and we propogate the event
1109  * channel into all CPUs by hand.
1110  */
1111 static void
1112 share_virq(mec_info_t *virqp)
1113 {
1114 	int evtchn = virqp->mi_evtchns[0];
1115 	cpuset_t tset;
1116 	int i;
1117 
1118 	ASSERT(evtchn != 0);
1119 
1120 	virqp->mi_shared = 1;
1121 
1122 	for (i = 1; i < NCPU; i++)
1123 		virqp->mi_evtchns[i] = evtchn;
1124 	CPUSET_ALL(tset);
1125 	bind_evtchn_to_cpuset(evtchn, tset);
1126 }
1127 
1128 static void
1129 virq_resume(int virq)
1130 {
1131 	mec_info_t *virqp = &virq_info[virq];
1132 	int evtchn;
1133 	int i, err;
1134 
1135 	for (i = 0; i < NCPU; i++) {
1136 		cpuset_t tcpus;
1137 
1138 		if (cpu[i] == NULL || CPU_IN_SET(cpu_suspend_lost_set, i))
1139 			continue;
1140 
1141 		err = xen_bind_virq(virq, i, &evtchn);
1142 		ASSERT(err == 0);
1143 
1144 		virqp->mi_evtchns[i] = evtchn;
1145 		evtchn_to_irq[evtchn] = virqp->mi_irq;
1146 		CPUSET_ONLY(tcpus, i);
1147 		bind_evtchn_to_cpuset(evtchn, tcpus);
1148 		ec_unmask_evtchn(evtchn);
1149 		/*
1150 		 * only timer VIRQ is bound to all cpus
1151 		 */
1152 		if (virq != VIRQ_TIMER)
1153 			break;
1154 	}
1155 
1156 	if (virqp->mi_shared)
1157 		share_virq(virqp);
1158 }
1159 
1160 static void
1161 ipi_resume(int ipl)
1162 {
1163 	mec_info_t *ipip = &ipi_info[ipl];
1164 	int i;
1165 
1166 	for (i = 0; i < NCPU; i++) {
1167 		cpuset_t tcpus;
1168 		int evtchn;
1169 
1170 		if (cpu[i] == NULL || CPU_IN_SET(cpu_suspend_lost_set, i))
1171 			continue;
1172 
1173 		evtchn = xen_bind_ipi(i);
1174 		ipip->mi_evtchns[i] = evtchn;
1175 		evtchn_to_irq[evtchn] = ipip->mi_irq;
1176 		CPUSET_ONLY(tcpus, i);
1177 		bind_evtchn_to_cpuset(evtchn, tcpus);
1178 		ec_unmask_evtchn(evtchn);
1179 	}
1180 }
1181 
1182 void
1183 ec_resume(void)
1184 {
1185 	int i;
1186 
1187 	/* New event-channel space is not 'live' yet. */
1188 	for (i = 0; i < NR_EVENT_CHANNELS; i++)
1189 		(void) ec_mask_evtchn(i);
1190 
1191 	for (i = 0; i < MAXIPL; i++) {
1192 		if (ipi_info[i].mi_irq == INVALID_IRQ)
1193 			continue;
1194 		ipi_resume(i);
1195 	}
1196 
1197 	for (i = 0; i < NR_VIRQS; i++) {
1198 		if (virq_info[i].mi_irq == INVALID_IRQ)
1199 			continue;
1200 		virq_resume(i);
1201 	}
1202 }
1203 
1204 void
1205 ec_init(void)
1206 {
1207 	int i;
1208 	mutex_init(&ec_lock, NULL, MUTEX_SPIN, (void *)ipltospl(SPL7));
1209 
1210 	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
1211 		CPUSET_ZERO(evtchn_cpus[i]);
1212 		evtchn_to_irq[i] = INVALID_IRQ;
1213 		(void) ec_mask_evtchn(i);
1214 	}
1215 
1216 	for (i = 0; i < MAXIPL; i++)
1217 		ipi_info[i].mi_irq = INVALID_IRQ;
1218 
1219 	for (i = 0; i < NR_VIRQS; i++)
1220 		virq_info[i].mi_irq = INVALID_IRQ;
1221 
1222 	/*
1223 	 * Phys IRQ space is statically bound (1:1 mapping), grab the IRQs
1224 	 * now.
1225 	 */
1226 	for (i = PIRQ_BASE; i < NR_PIRQS; i++) {
1227 		irq_info[PIRQ_TO_IRQ(i)].ii_type = IRQT_PIRQ;
1228 	}
1229 }
1230 
1231 void
1232 ec_init_debug_irq()
1233 {
1234 	int irq;
1235 
1236 	irq = ec_bind_virq_to_irq(VIRQ_DEBUG, 0);
1237 	(void) add_avintr(NULL, IPL_DEBUG, (avfunc)xen_debug_handler,
1238 	    "debug", irq, NULL, NULL, NULL, NULL);
1239 
1240 	mutex_enter(&ec_lock);
1241 	share_virq(&virq_info[irq_info[irq].ii_u.index]);
1242 	mutex_exit(&ec_lock);
1243 	ec_debug_irq = irq;
1244 }
1245 
/*
 * Yields the bits of word ix that are set in si->evtchn_pending[], clear in
 * si->evtchn_mask[], and set in cpe->evt_affinity[].  The cpu_id argument is
 * accepted but is not referenced by the expansion.
 */
#define	UNBLOCKED_EVENTS(si, ix, cpe, cpu_id) \
	((si)->evtchn_pending[ix] & ~(si)->evtchn_mask[ix] & \
		(cpe)->evt_affinity[ix])
1249 
/*
 * This is the entry point for processing events from xen
 *
 * (See the commentary associated with the shared_info_st structure
 * in hypervisor-if.h)
 *
 * Since the event channel mechanism doesn't really implement the
 * concept of priority like hardware interrupt controllers, we simulate
 * that in software here using the cpu priority field and the pending
 * interrupts field.  Events/interrupts that are not able to be serviced
 * now because they are at a lower priority than the current cpu priority
 * cause a level bit to be recorded in the pending interrupts word.  When
 * the priority is lowered (either by spl or interrupt exit code) the pending
 * levels are checked and an upcall is scheduled if there are events/interrupts
 * that have become deliverable.
 *
 * Arguments:
 *	rp	register state for this upcall; asserted on entry to have
 *		r_trapno == T_AST and r_err == 0.  r_trapno is overwritten
 *		with the irq number before each call to do_interrupt().
 *	ttp	trap trace record; handed to do_interrupt() and, when
 *		TRAPTRACE is defined, filled in for events that have no
 *		handler.
 *
 * The function works in two passes.  The first pass scans the shared-info
 * pending bitmaps and records each event this cpu manages to mask in the
 * per-cpu, per-priority pending structures.  The second pass dispatches
 * the recorded events whose priority is above the cpu's priority at entry.
 */
void
xen_callback_handler(struct regs *rp, trap_trace_rec_t *ttp)
{
	ulong_t pending_sels, pe, selbit;
	int i, j, port, pri, curpri, irq;
	uint16_t pending_ints;
	struct cpu *cpu = CPU;
	volatile shared_info_t *si = HYPERVISOR_shared_info;
	volatile vcpu_info_t *vci = cpu->cpu_m.mcpu_vcpu_info;
	volatile struct xen_evt_data *cpe = cpu->cpu_m.mcpu_evt_pend;
	volatile uint16_t *cpu_ipp = &cpu->cpu_m.mcpu_intr_pending;

	ASSERT(rp->r_trapno == T_AST && rp->r_err == 0);
	ASSERT(&si->vcpu_info[cpu->cpu_id] == vci);
	ASSERT_STACK_ALIGNED();

	/* Acknowledge the upcall before looking at the selector. */
	vci->evtchn_upcall_pending = 0;

	/*
	 * To expedite scanning of pending notifications, any 0->1
	 * pending transition on an unmasked channel causes a
	 * corresponding bit in evtchn_pending_sel to be set.
	 * Each bit in the selector covers a 32-bit word in
	 * the evtchn_pending[] array.
	 */
	membar_enter();
	/* Atomically fetch the selector and reset it to zero. */
	do {
		pending_sels = vci->evtchn_pending_sel;
	} while (atomic_cas_ulong((volatile ulong_t *)&vci->evtchn_pending_sel,
	    pending_sels, 0) != pending_sels);

	/*
	 * First pass: start from the levels already pending on this cpu and
	 * add a level bit for every event recorded below.
	 */
	pending_ints = *cpu_ipp;
	while ((i = ffs(pending_sels)) != 0) {
		i--;
		selbit = 1ul << i;
		pending_sels &= ~selbit;

		membar_enter();
		/*
		 * UNBLOCKED_EVENTS is re-evaluated on every iteration, so
		 * each event handled below must end up either masked or
		 * no longer pending for this loop to make progress.
		 */
		while ((pe = UNBLOCKED_EVENTS(si, i, cpe, cpu->cpu_id)) != 0) {
			j = ffs(pe) - 1;
			pe &= ~(1ul << j);

			port = (i << EVTCHN_SHIFT) + j;

			irq = evtchn_to_irq[port];

			/*
			 * If no irq set, just ignore the event.
			 * On e.g. netbsd they call evtchn_device_upcall(port)
			 * We require the evtchn driver to install a handler
			 * so there will be an irq associated with user mode
			 * evtchns.
			 */
			if (irq == INVALID_IRQ) {
				ec_clear_evtchn(port);
				continue;
			}

			/*
			 * If there's no handler, it could be a poke, so just
			 * accept the event and continue.
			 */
			if (!irq_info[irq].ii_u2.has_handler) {
#ifdef TRAPTRACE
				ttp->ttr_ipl = 0xff;
				if (IRQ_IS_CPUPOKE(irq)) {
					ttp->ttr_ipl = XC_CPUPOKE_PIL;
					ttp->ttr_marker = TT_INTERRUPT;
				}
				ttp->ttr_pri = cpu->cpu_pri;
				ttp->ttr_spl = cpu->cpu_base_spl;
				ttp->ttr_vector = 0xff;
#endif /* TRAPTRACE */
				/*
				 * If this cpu is the one that set the mask
				 * bit, consume the event and unmask again.
				 * Otherwise fall through to the code below.
				 */
				if (ec_mask_evtchn(port)) {
					ec_clear_evtchn(port);
					ec_unmask_evtchn(port);
					continue;
				}
			}

			pri = irq_info[irq].ii_u2.ipl;

			/*
			 * If we are the cpu that successfully masks
			 * the event, then record it as a pending event
			 * for this cpu to service
			 */
			if (ec_mask_evtchn(port)) {
				if (ec_evtchn_pending(port)) {
					cpe->pending_sel[pri] |= selbit;
					cpe->pending_evts[pri][i] |= (1ul << j);
					pending_ints |= 1 << pri;
				} else {
					/*
					 * another cpu serviced this event
					 * before us, clear the mask.
					 */
					ec_unmask_evtchn(port);
				}
			}
		}
	}
	/* Publish the accumulated pending levels for this cpu. */
	*cpu_ipp = pending_ints;
	if (pending_ints == 0)
		return;
	/*
	 * We have gathered all the pending events/interrupts,
	 * go service all the ones we can from highest priority to lowest.
	 * Note: This loop may not actually complete and service all
	 * pending interrupts since one of the interrupt threads may
	 * block and the pinned thread runs.  In that case, when we
	 * exit the interrupt thread that blocked we will check for
	 * any unserviced interrupts and re-post an upcall to process
	 * any unserviced pending events.
	 */
	curpri = cpu->cpu_pri;
	/*
	 * NOTE(review): bsrw_insn() presumably yields the index of the
	 * highest set bit of the pending word, i.e. the highest pending
	 * priority -- confirm against its definition.
	 */
	for (pri = bsrw_insn(*cpu_ipp); pri > curpri; pri--) {
		while ((pending_sels = cpe->pending_sel[pri]) != 0) {
			i = ffs(pending_sels) - 1;
			while ((pe = cpe->pending_evts[pri][i]) != 0) {
				j = ffs(pe) - 1;
				pe &= ~(1ul << j);
				/*
				 * Remove the event from the per-cpu pending
				 * state before it is dispatched.
				 */
				cpe->pending_evts[pri][i] = pe;
				if (pe == 0) {
					/*
					 * Must reload pending selector bits
					 * here as they could have changed on
					 * a previous trip around the inner loop
					 * while we were interrupt enabled
					 * in a interrupt service routine.
					 */
					pending_sels = cpe->pending_sel[pri];
					pending_sels &= ~(1ul << i);
					cpe->pending_sel[pri] = pending_sels;
					if (pending_sels == 0)
						*cpu_ipp &= ~(1 << pri);
				}
				port = (i << EVTCHN_SHIFT) + j;
				irq = evtchn_to_irq[port];
				if (irq == INVALID_IRQ) {
					/*
					 * No longer a handler for this event
					 * channel.  Clear the event and
					 * ignore it, unmask the event.
					 */
					ec_clear_evtchn(port);
					ec_unmask_evtchn(port);
					continue;
				}
				/*
				 * For the ec_dev_irq, pass the port number
				 * along in ec_dev_mbox.
				 */
				if (irq == ec_dev_irq) {
					volatile int *tptr = &ec_dev_mbox;

					ASSERT(ec_dev_mbox == 0);
					/*
					 * NOTE: this gross store thru a pointer
					 * is necessary because of a Sun C
					 * compiler bug that does not properly
					 * honor a volatile declaration.
					 * we really should just be able to say
					 * 	ec_dev_mbox = port;
					 * here
					 */
					*tptr = port;
				}
				/*
				 * Set up the regs struct to
				 * look like a normal hardware int
				 * and do normal interrupt handling.
				 */
				rp->r_trapno = irq;
				do_interrupt(rp, ttp);
				/*
				 * Check for cpu priority change
				 * Can happen if int thread blocks
				 */
				if (cpu->cpu_pri > curpri)
					return;
			}
		}
	}
}
1447 
1448 void
1449 ec_unmask_evtchn(unsigned int ev)
1450 {
1451 	uint_t evi;
1452 	volatile shared_info_t *si = HYPERVISOR_shared_info;
1453 	volatile vcpu_info_t *vci = CPU->cpu_m.mcpu_vcpu_info;
1454 	volatile ulong_t *ulp;
1455 
1456 	ASSERT(!interrupts_enabled());
1457 	/*
1458 	 * Check if we need to take slow path
1459 	 */
1460 	if (!CPU_IN_SET(evtchn_cpus[ev], CPU->cpu_id)) {
1461 		xen_evtchn_unmask(ev);
1462 		return;
1463 	}
1464 	evi = ev >> EVTCHN_SHIFT;
1465 	ev &= (1ul << EVTCHN_SHIFT) - 1;
1466 	ulp = (volatile ulong_t *)&si->evtchn_mask[evi];
1467 	atomic_and_ulong(ulp, ~(1ul << ev));
1468 	/*
1469 	 * The following is basically the equivalent of
1470 	 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the
1471 	 * interrupt edge' if the channel is masked.
1472 	 * XXPV - slight race if upcall was about to be set, we may get
1473 	 * an extra upcall.
1474 	 */
1475 	membar_enter();
1476 	if (si->evtchn_pending[evi] & (1ul << ev)) {
1477 		membar_consumer();
1478 		ulp = (volatile ulong_t *)&vci->evtchn_pending_sel;
1479 		if (!(*ulp & (1ul << evi))) {
1480 			atomic_or_ulong(ulp, (1ul << evi));
1481 		}
1482 		vci->evtchn_upcall_pending = 1;
1483 	}
1484 }
1485 
1486 /*
1487  * Set a bit in an evtchan mask word, return true if we are the cpu that
1488  * set the bit.
1489  */
1490 int
1491 ec_mask_evtchn(unsigned int ev)
1492 {
1493 	uint_t evi, evb;
1494 	ulong_t new, old, bit;
1495 	volatile shared_info_t *si = HYPERVISOR_shared_info;
1496 	volatile ulong_t *maskp;
1497 	int masked;
1498 
1499 	kpreempt_disable();
1500 	evi = ev >> EVTCHN_SHIFT;
1501 	evb = ev & ((1ul << EVTCHN_SHIFT) - 1);
1502 	bit = 1ul << evb;
1503 	maskp = (volatile ulong_t *)&si->evtchn_mask[evi];
1504 	do {
1505 		old = si->evtchn_mask[evi];
1506 		new = old | bit;
1507 	} while (atomic_cas_ulong(maskp, old, new) != old);
1508 	masked = (old & bit) == 0;
1509 	if (masked) {
1510 		evtchn_owner[ev] = CPU->cpu_id;
1511 #ifdef DEBUG
1512 		evtchn_owner_thread[ev] = curthread;
1513 #endif
1514 	}
1515 	kpreempt_enable();
1516 	return (masked);
1517 }
1518 
1519 void
1520 ec_clear_evtchn(unsigned int ev)
1521 {
1522 	uint_t evi;
1523 	shared_info_t *si = HYPERVISOR_shared_info;
1524 	volatile ulong_t *pendp;
1525 
1526 	evi = ev >> EVTCHN_SHIFT;
1527 	ev &= (1ul << EVTCHN_SHIFT) - 1;
1528 	pendp = (volatile ulong_t *)&si->evtchn_pending[evi];
1529 	atomic_and_ulong(pendp, ~(1ul << ev));
1530 }
1531 
1532 void
1533 ec_notify_via_evtchn(unsigned int port)
1534 {
1535 	evtchn_send_t send;
1536 
1537 	ASSERT(port != INVALID_EVTCHN);
1538 
1539 	send.port = port;
1540 	(void) HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
1541 }
1542 
1543 int
1544 ec_block_irq(int irq)
1545 {
1546 	irq_info_t *irqp = &irq_info[irq];
1547 	int evtchn;
1548 
1549 
1550 	evtchn = irq_evtchn(irqp);
1551 	(void) ec_mask_evtchn(evtchn);
1552 	return (evtchn_owner[evtchn]);
1553 }
1554 
1555 /*
1556  * Make a event that is pending for delivery on the current cpu  "go away"
1557  * without servicing the interrupt.
1558  */
1559 void
1560 ec_unpend_irq(int irq)
1561 {
1562 	irq_info_t *irqp = &irq_info[irq];
1563 	int pri = irqp->ii_u2.ipl;
1564 	ulong_t flags;
1565 	uint_t evtchn, evi, bit;
1566 	unsigned long pe, pending_sels;
1567 	struct xen_evt_data *cpe;
1568 
1569 	/*
1570 	 * The evtchn must be masked
1571 	 */
1572 	evtchn = irq_evtchn(irqp);
1573 	ASSERT(EVTCHN_MASKED(evtchn));
1574 	evi = evtchn >> EVTCHN_SHIFT;
1575 	bit = evtchn & (1ul << EVTCHN_SHIFT) - 1;
1576 	flags = intr_clear();
1577 	cpe = CPU->cpu_m.mcpu_evt_pend;
1578 	pe = cpe->pending_evts[pri][evi] & ~(1ul << bit);
1579 	cpe->pending_evts[pri][evi] = pe;
1580 	if (pe == 0) {
1581 		pending_sels = cpe->pending_sel[pri];
1582 		pending_sels &= ~(1ul << evi);
1583 		cpe->pending_sel[pri] = pending_sels;
1584 		if (pending_sels == 0)
1585 			CPU->cpu_m.mcpu_intr_pending &= ~(1 << pri);
1586 	}
1587 	intr_restore(flags);
1588 }
1589