xref: /illumos-gate/usr/src/uts/i86xpv/os/evtchn.c (revision 5422785d352a2bb398daceab3d1898a8aa64d006)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * evtchn.c
29  *
30  * Communication via hypervisor event channels.
31  *
32  * Copyright (c) 2002-2005, K A Fraser
33  *
34  * This file may be distributed separately from the Linux kernel, or
35  * incorporated into other software packages, subject to the following license:
36  *
37  * Permission is hereby granted, free of charge, to any person obtaining a copy
38  * of this source file (the "Software"), to deal in the Software without
39  * restriction, including without limitation the rights to use, copy, modify,
40  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
41  * and to permit persons to whom the Software is furnished to do so, subject to
42  * the following conditions:
43  *
44  * The above copyright notice and this permission notice shall be included in
45  * all copies or substantial portions of the Software.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
48  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
49  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
50  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
51  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
52  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
53  * IN THE SOFTWARE.
54  */
55 
56 /* some parts derived from netbsd's hypervisor_machdep.c 1.2.2.2 */
57 
58 /*
59  *
60  * Copyright (c) 2004 Christian Limpach.
61  * All rights reserved.
62  *
63  * Redistribution and use in source and binary forms, with or without
64  * modification, are permitted provided that the following conditions
65  * are met:
66  * 1. Redistributions of source code must retain the above copyright
67  *    notice, this list of conditions and the following disclaimer.
68  * 2. Redistributions in binary form must reproduce the above copyright
69  *    notice, this list of conditions and the following disclaimer in the
70  *    documentation and/or other materials provided with the distribution.
71  * 3. This section intentionally left blank.
72  * 4. The name of the author may not be used to endorse or promote products
73  *    derived from this software without specific prior written permission.
74  *
75  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
76  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
77  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
78  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
79  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
80  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
81  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
82  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
83  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
84  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85  */
86 /*
87  * Section 3 of the above license was updated in response to bug 6379571.
88  */
89 
90 #include <sys/types.h>
91 #include <sys/hypervisor.h>
92 #include <sys/machsystm.h>
93 #include <sys/mutex.h>
94 #include <sys/evtchn_impl.h>
95 #include <sys/ddi_impldefs.h>
96 #include <sys/avintr.h>
97 #include <sys/cpuvar.h>
98 #include <sys/smp_impldefs.h>
99 #include <sys/archsystm.h>
100 #include <sys/sysmacros.h>
101 #include <sys/cmn_err.h>
102 #include <sys/promif.h>
103 #include <sys/debug.h>
104 #include <sys/psm.h>
105 #include <sys/privregs.h>
106 #include <sys/trap.h>
107 #include <sys/atomic.h>
108 #include <sys/cpu.h>
109 #include <sys/psw.h>
110 #include <sys/traptrace.h>
111 #include <sys/stack.h>
112 #include <sys/x_call.h>
113 #include <xen/public/physdev.h>
114 
115 /*
116  * This file manages our association between hypervisor event channels and
117  * Solaris's IRQs.  This is a one-to-one mapping, with the exception of
118  * IPI IRQs, for which there is one event channel per CPU participating
119  * in the IPI, and the clock VIRQ which also has an event channel per cpu
120  * and the IRQ for /dev/xen/evtchn. The IRQ types are:
121  *
122  * IRQT_VIRQ:
123  *	The hypervisor's standard virtual IRQ, used for the clock timer, for
124  *	example.  This code allows any cpu to bind to one of these, although
125  *	some are treated specially (i.e. VIRQ_DEBUG).
126  *	Event channel binding is done via EVTCHNOP_bind_virq.
127  *
128  * IRQT_PIRQ:
129  *	These associate a physical IRQ with an event channel via
130  *	EVTCHNOP_bind_pirq.
131  *
132  * IRQT_IPI:
133  *	A cross-call IRQ. Maps to "ncpus" event channels, each of which is
134  *	bound to exactly one of the vcpus.  We do not currently support
135  *	unbinding of IPIs (since Solaris doesn't need it). Uses
136  *	EVTCHNOP_bind_ipi.
137  *
138  * IRQT_EVTCHN:
139  *	A "normal" binding to an event channel, typically used by the frontend
140  *	drivers to bind to their backend event channel.
141  *
142  * IRQT_DEV_EVTCHN:
143  *	This is a one-time IRQ used by /dev/xen/evtchn. Unlike other IRQs, we
144  *	have a one-IRQ to many-evtchn mapping. We only track evtchn->irq for
145  *	these event channels, which are managed via ec_irq_add/rm_evtchn().
146  *	We enforce that IRQT_DEV_EVTCHN's representative evtchn (->ii_evtchn)
147  *	is zero, and make any calls to irq_evtchn() an error, to prevent
148  *	accidentally attempting to use the illegal evtchn 0.
149  *
150  * Suspend/resume
151  *
152  *	During a suspend/resume cycle, we need to tear down the event channels.
153  *	All other mapping data is kept. The drivers will remove their own event
154  *	channels via xendev on receiving a DDI_SUSPEND.  This leaves us with
155  *	the IPIs and VIRQs, which we handle in ec_suspend() and ec_resume()
156  *	below.
157  *
158  * CPU binding
159  *
160  *	When an event channel is bound to a CPU, we set a bit in a mask present
161  *	in the machcpu (evt_affinity) to indicate that this CPU can accept this
162  *	event channel.  For both IPIs and VIRQs, this binding is fixed at
163  *	allocation time and we never modify it.  All other event channels are
164  *	bound via the PSM either as part of add_avintr(), or interrupt
165  *	redistribution (xen_psm_dis/enable_intr()) as a result of CPU
166  *	offline/online.
167  *
168  * Locking
169  *
170  *	Updates are done holding the ec_lock.  The xen_callback_handler()
171  *	routine reads the mapping data in a lockless fashion.  Additionally
172  *	suspend takes ec_lock to prevent update races during a suspend/resume
173  *	cycle.  The IPI info is also examined without the lock; this is OK
174  *	since we only ever change IPI info during initial setup and resume.
175  */
176 
/*
 * True when 'irq' equals the IRQ recorded in ipi_info[] for the
 * XC_CPUPOKE_PIL level.
 */
#define	IRQ_IS_CPUPOKE(irq) (ipi_info[XC_CPUPOKE_PIL].mi_irq == (irq))

/*
 * Non-zero when the bit for event channel 'ev' is set in the shared-info
 * evtchn_mask[] array.  The word is selected by (ev >> EVTCHN_SHIFT) and the
 * bit within that word by the low EVTCHN_SHIFT bits of 'ev'.
 */
#define	EVTCHN_MASKED(ev) \
	(HYPERVISOR_shared_info->evtchn_mask[(ev) >> EVTCHN_SHIFT] & \
	(1ul << ((ev) & ((1ul << EVTCHN_SHIFT) - 1))))
182 
/*
 * Per-event-channel state, indexed by event channel number:
 *   evtchn_to_irq	IRQ the channel is bound to, or INVALID_IRQ.
 *   evtchn_cpus	set of CPUs allowed to accept the channel; mirrored
 *			into each CPU's evt_affinity bitmap by
 *			update_evtchn_affinity().
 *   evtchn_owner	NOTE(review): not referenced in the portion of the
 *			file visible here; presumably used by the callback
 *			handler -- confirm.
 */
static short evtchn_to_irq[NR_EVENT_CHANNELS];
static cpuset_t evtchn_cpus[NR_EVENT_CHANNELS];
static int	evtchn_owner[NR_EVENT_CHANNELS];
#ifdef DEBUG
static kthread_t *evtchn_owner_thread[NR_EVENT_CHANNELS];
#endif

/*
 * irq_info[] is indexed by IRQ, ipi_info[] by interrupt priority level and
 * virq_info[] by VIRQ number.
 */
static irq_info_t irq_info[NR_IRQS];
static mec_info_t ipi_info[MAXIPL];
static mec_info_t virq_info[NR_VIRQS];

/*
 * See the locking description above.
 */
kmutex_t ec_lock;

/*
 * Bitmap indicating which PIRQs require the hypervisor to be notified
 * on unmask.
 */
static unsigned long pirq_needs_eoi[NR_PIRQS / (sizeof (unsigned long) * NBBY)];

/* IRQ bound to VIRQ_DEBUG; assigned in ec_init_debug_irq(). */
static int ec_debug_irq = INVALID_IRQ;
/* The IRQT_DEV_EVTCHN IRQ; ec_irq_add/rm_evtchn() assert against it. */
int ec_dev_irq = INVALID_IRQ;
207 
208 int
209 xen_bind_virq(unsigned int virq, processorid_t cpu, int *port)
210 {
211 	evtchn_bind_virq_t bind;
212 	int err;
213 
214 	bind.virq = virq;
215 	bind.vcpu = cpu;
216 	if ((err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind)) == 0)
217 		*port = bind.port;
218 	else
219 		err = xen_xlate_errcode(err);
220 	return (err);
221 }
222 
223 int
224 xen_bind_interdomain(int domid, int remote_port, int *port)
225 {
226 	evtchn_bind_interdomain_t bind;
227 	int err;
228 
229 	bind.remote_dom  = domid;
230 	bind.remote_port = remote_port;
231 	if ((err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
232 	    &bind)) == 0)
233 		*port = bind.local_port;
234 	else
235 		err = xen_xlate_errcode(err);
236 	return (err);
237 }
238 
239 int
240 xen_alloc_unbound_evtchn(int domid, int *evtchnp)
241 {
242 	evtchn_alloc_unbound_t alloc;
243 	int err;
244 
245 	alloc.dom = DOMID_SELF;
246 	alloc.remote_dom = domid;
247 
248 	if ((err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
249 	    &alloc)) == 0) {
250 		*evtchnp = alloc.port;
251 		/* ensure evtchn is masked till we're ready to use it */
252 		(void) ec_mask_evtchn(*evtchnp);
253 	} else {
254 		err = xen_xlate_errcode(err);
255 	}
256 
257 	return (err);
258 }
259 
260 static int
261 xen_close_evtchn(int evtchn)
262 {
263 	evtchn_close_t close;
264 	int err;
265 
266 	close.port = evtchn;
267 	err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
268 	if (err)
269 		err = xen_xlate_errcode(err);
270 	return (err);
271 }
272 
273 static int
274 xen_bind_ipi(processorid_t cpu)
275 {
276 	evtchn_bind_ipi_t bind;
277 
278 	ASSERT(MUTEX_HELD(&ec_lock));
279 
280 	bind.vcpu = cpu;
281 	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind) != 0)
282 		panic("xen_bind_ipi() failed");
283 	return (bind.port);
284 }
285 
286 /* Send future instances of this interrupt to other vcpu. */
287 static void
288 xen_bind_vcpu(int evtchn, int cpu)
289 {
290 	evtchn_bind_vcpu_t bind;
291 
292 	ASSERT(MUTEX_HELD(&ec_lock));
293 
294 	bind.port = evtchn;
295 	bind.vcpu = cpu;
296 	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind) != 0)
297 		panic("xen_bind_vcpu() failed");
298 }
299 
300 static int
301 xen_bind_pirq(int pirq)
302 {
303 	evtchn_bind_pirq_t bind;
304 	int ret;
305 
306 	bind.pirq = pirq;
307 	bind.flags = BIND_PIRQ__WILL_SHARE;
308 	if ((ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind)) != 0)
309 		panic("xen_bind_pirq() failed (err %d)", ret);
310 	return (bind.port);
311 }
312 
313 /* unmask an evtchn and send upcall to appropriate vcpu if pending bit is set */
314 static void
315 xen_evtchn_unmask(int evtchn)
316 {
317 	evtchn_unmask_t unmask;
318 
319 	unmask.port = evtchn;
320 	if (HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask) != 0)
321 		panic("xen_evtchn_unmask() failed");
322 }
323 
324 static void
325 update_evtchn_affinity(int evtchn)
326 {
327 	cpu_t *cp;
328 	struct xen_evt_data *cpe;
329 
330 	ASSERT(evtchn_to_irq[evtchn] != INVALID_IRQ);
331 	ASSERT(MUTEX_HELD(&ec_lock));
332 
333 	/*
334 	 * Use lockless search of cpu_list, similar to mutex_vector_enter().
335 	 */
336 	kpreempt_disable();
337 	cp = cpu_list;
338 	do {
339 		cpe = cp->cpu_m.mcpu_evt_pend;
340 		if (CPU_IN_SET(evtchn_cpus[evtchn], cp->cpu_id))
341 			SET_EVTCHN_BIT(evtchn, cpe->evt_affinity);
342 		else
343 			CLEAR_EVTCHN_BIT(evtchn, cpe->evt_affinity);
344 	} while ((cp = cp->cpu_next) != cpu_list);
345 	kpreempt_enable();
346 }
347 
348 static void
349 bind_evtchn_to_cpuset(int evtchn, cpuset_t cpus)
350 {
351 	ASSERT(evtchn_to_irq[evtchn] != INVALID_IRQ);
352 
353 	CPUSET_ZERO(evtchn_cpus[evtchn]);
354 	CPUSET_OR(evtchn_cpus[evtchn], cpus);
355 	update_evtchn_affinity(evtchn);
356 }
357 
358 static void
359 clear_evtchn_affinity(int evtchn)
360 {
361 	CPUSET_ZERO(evtchn_cpus[evtchn]);
362 	update_evtchn_affinity(evtchn);
363 }
364 
365 static void
366 alloc_irq_evtchn(int irq, int index, int evtchn, int cpu)
367 {
368 	irq_info_t *irqp = &irq_info[irq];
369 
370 	switch (irqp->ii_type) {
371 	case IRQT_IPI:
372 		ipi_info[index].mi_evtchns[cpu] = evtchn;
373 		irqp->ii_u.index = index;
374 		break;
375 	case IRQT_VIRQ:
376 		virq_info[index].mi_evtchns[cpu] = evtchn;
377 		irqp->ii_u.index = index;
378 		break;
379 	default:
380 		irqp->ii_u.evtchn = evtchn;
381 		break;
382 	}
383 
384 	evtchn_to_irq[evtchn] = irq;
385 
386 	/*
387 	 * If a CPU is not specified, we expect to bind it to a CPU later via
388 	 * the PSM.
389 	 */
390 	if (cpu != -1) {
391 		cpuset_t tcpus;
392 		CPUSET_ONLY(tcpus, cpu);
393 		bind_evtchn_to_cpuset(evtchn, tcpus);
394 	}
395 }
396 
397 static int
398 alloc_irq(int type, int index, int evtchn, int cpu)
399 {
400 	int irq;
401 	irq_info_t *irqp;
402 
403 	ASSERT(MUTEX_HELD(&ec_lock));
404 	ASSERT(type != IRQT_IPI || cpu != -1);
405 
406 	for (irq = 0; irq < NR_IRQS; irq++) {
407 		if (irq_info[irq].ii_type == IRQT_UNBOUND)
408 			break;
409 	}
410 
411 	if (irq == NR_IRQS)
412 		panic("No available IRQ to bind to: increase NR_IRQS!\n");
413 
414 	irqp = &irq_info[irq];
415 
416 	irqp->ii_type = type;
417 	/*
418 	 * Set irq/has_handler field to zero which means handler not installed
419 	 */
420 	irqp->ii_u2.has_handler = 0;
421 
422 	alloc_irq_evtchn(irq, index, evtchn, cpu);
423 	return (irq);
424 }
425 
426 static int
427 irq_evtchn(irq_info_t *irqp)
428 {
429 	int evtchn;
430 
431 	ASSERT(irqp->ii_type != IRQT_DEV_EVTCHN);
432 
433 	switch (irqp->ii_type) {
434 	case IRQT_IPI:
435 		ASSERT(irqp->ii_u.index != 0);
436 		evtchn = ipi_info[irqp->ii_u.index].mi_evtchns[CPU->cpu_id];
437 		break;
438 	case IRQT_VIRQ:
439 		evtchn = virq_info[irqp->ii_u.index].mi_evtchns[CPU->cpu_id];
440 		break;
441 	default:
442 		evtchn = irqp->ii_u.evtchn;
443 		break;
444 	}
445 
446 	return (evtchn);
447 }
448 
449 int
450 ec_is_edge_pirq(int irq)
451 {
452 	return (irq_info[irq].ii_type == IRQT_PIRQ &&
453 	    !TEST_EVTCHN_BIT(irq, &pirq_needs_eoi[0]));
454 }
455 
456 static void
457 unbind_evtchn(ushort_t *evtchnp)
458 {
459 	int err;
460 
461 	ASSERT(MUTEX_HELD(&ec_lock));
462 
463 	ASSERT(*evtchnp != 0);
464 
465 	err = xen_close_evtchn(*evtchnp);
466 	ASSERT(err == 0);
467 	clear_evtchn_affinity(*evtchnp);
468 	evtchn_to_irq[*evtchnp] = INVALID_IRQ;
469 	*evtchnp = 0;
470 }
471 
472 static void
473 pirq_unmask_notify(int pirq)
474 {
475 	struct physdev_eoi eoi;
476 
477 	if (TEST_EVTCHN_BIT(pirq, &pirq_needs_eoi[0])) {
478 		eoi.irq = pirq;
479 		(void) HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
480 	}
481 }
482 
483 static void
484 pirq_query_unmask(int pirq)
485 {
486 	struct physdev_irq_status_query irq_status;
487 
488 	irq_status.irq = pirq;
489 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status);
490 	CLEAR_EVTCHN_BIT(pirq, &pirq_needs_eoi[0]);
491 	if (irq_status.flags & XENIRQSTAT_needs_eoi)
492 		SET_EVTCHN_BIT(pirq, &pirq_needs_eoi[0]);
493 }
494 
495 static void
496 end_pirq(int irq)
497 {
498 	int evtchn = irq_evtchn(&irq_info[irq]);
499 
500 	/*
501 	 * If it is an edge-triggered interrupt we have already unmasked
502 	 */
503 	if (TEST_EVTCHN_BIT(irq, &pirq_needs_eoi[0])) {
504 		ec_unmask_evtchn(evtchn);
505 		pirq_unmask_notify(IRQ_TO_PIRQ(irq));
506 	}
507 }
508 
509 /*
510  * Bind an event channel to a vcpu
511  */
512 void
513 ec_bind_vcpu(int evtchn, int cpu)
514 {
515 	mutex_enter(&ec_lock);
516 	xen_bind_vcpu(evtchn, cpu);
517 	mutex_exit(&ec_lock);
518 }
519 
520 /*
521  * Set up a physical device irq to be associated with an event channel.
522  */
523 void
524 ec_setup_pirq(int irq, int ipl, cpuset_t *cpusp)
525 {
526 	int evtchn;
527 	irq_info_t *irqp = &irq_info[irq];
528 
529 	/*
530 	 * Test if this PIRQ is already bound to an evtchn,
531 	 * which means it is a shared IRQ and we don't want to
532 	 * bind and do some initial setup that has already been
533 	 * done for this irq on a previous trip through this code.
534 	 */
535 	if (irqp->ii_u.evtchn == INVALID_EVTCHN) {
536 		evtchn = xen_bind_pirq(irq);
537 
538 		pirq_query_unmask(IRQ_TO_PIRQ(irq));
539 
540 		irqp->ii_type = IRQT_PIRQ;
541 		irqp->ii_u.evtchn = evtchn;
542 
543 		evtchn_to_irq[evtchn] = irq;
544 		irqp->ii_u2.ipl = ipl;
545 		ec_set_irq_affinity(irq, *cpusp);
546 		ec_enable_irq(irq);
547 		pirq_unmask_notify(IRQ_TO_PIRQ(irq));
548 	} else {
549 		ASSERT(irqp->ii_u2.ipl != 0);
550 		cmn_err(CE_NOTE, "!IRQ%d is shared", irq);
551 		if (ipl > irqp->ii_u2.ipl)
552 			irqp->ii_u2.ipl = ipl;
553 		*cpusp = evtchn_cpus[irqp->ii_u.evtchn];
554 	}
555 }
556 
557 void
558 ec_unbind_irq(int irq)
559 {
560 	irq_info_t *irqp = &irq_info[irq];
561 	mec_info_t *virqp;
562 	int drop_lock = 0;
563 	int type, i;
564 
565 	/*
566 	 * Nasty, but we need this during suspend.
567 	 */
568 	if (mutex_owner(&ec_lock) != curthread) {
569 		mutex_enter(&ec_lock);
570 		drop_lock = 1;
571 	}
572 
573 	type = irqp->ii_type;
574 
575 	ASSERT((type == IRQT_EVTCHN) || (type == IRQT_PIRQ) ||
576 	    (type == IRQT_VIRQ));
577 
578 	if ((type == IRQT_EVTCHN) || (type == IRQT_PIRQ)) {
579 		/* There's only one event channel associated with this irq */
580 		unbind_evtchn(&irqp->ii_u.evtchn);
581 	} else if (type == IRQT_VIRQ) {
582 		/*
583 		 * Each cpu on the system can have it's own event channel
584 		 * associated with a virq.  Unbind them all.
585 		 */
586 		virqp = &virq_info[irqp->ii_u.index];
587 		for (i = 0; i < NCPU; i++) {
588 			if (virqp->mi_evtchns[i] != 0)
589 				unbind_evtchn(&virqp->mi_evtchns[i]);
590 		}
591 		/* Mark the virq structure as invalid. */
592 		virqp->mi_irq = INVALID_IRQ;
593 	}
594 
595 	bzero(irqp, sizeof (*irqp));
596 	/* Re-reserve PIRQ. */
597 	if (type == IRQT_PIRQ)
598 		irqp->ii_type = IRQT_PIRQ;
599 
600 	if (drop_lock)
601 		mutex_exit(&ec_lock);
602 }
603 
604 /*
605  * Rebind an event channel for delivery to a CPU.
606  */
607 void
608 ec_set_irq_affinity(int irq, cpuset_t dest)
609 {
610 	int evtchn, tcpu;
611 	irq_info_t *irqp = &irq_info[irq];
612 
613 	mutex_enter(&ec_lock);
614 
615 	ASSERT(irq < NR_IRQS);
616 	ASSERT(irqp->ii_type != IRQT_UNBOUND);
617 
618 	/*
619 	 * Binding is done at allocation time for these types, so we should
620 	 * never modify them.
621 	 */
622 	if (irqp->ii_type == IRQT_IPI || irqp->ii_type == IRQT_VIRQ ||
623 	    irqp->ii_type == IRQT_DEV_EVTCHN) {
624 		mutex_exit(&ec_lock);
625 		return;
626 	}
627 
628 	CPUSET_FIND(dest, tcpu);
629 	ASSERT(tcpu != CPUSET_NOTINSET);
630 
631 	evtchn = irq_evtchn(irqp);
632 
633 	xen_bind_vcpu(evtchn, tcpu);
634 
635 	bind_evtchn_to_cpuset(evtchn, dest);
636 
637 	mutex_exit(&ec_lock);
638 
639 	/*
640 	 * Now send the new target processor a NOP IPI.
641 	 * It will check for any pending interrupts, and so service any that
642 	 * got delivered to the wrong processor by mistake.
643 	 */
644 	if (ncpus > 1)
645 		poke_cpu(tcpu);
646 }
647 
648 int
649 ec_set_irq_priority(int irq, int pri)
650 {
651 	irq_info_t *irqp;
652 
653 	if (irq >= NR_IRQS)
654 		return (-1);
655 
656 	irqp = &irq_info[irq];
657 
658 	if (irqp->ii_type == IRQT_UNBOUND)
659 		return (-1);
660 
661 	irqp->ii_u2.ipl = pri;
662 
663 	return (0);
664 }
665 
666 void
667 ec_clear_irq_priority(int irq)
668 {
669 	irq_info_t *irqp = &irq_info[irq];
670 
671 	ASSERT(irq < NR_IRQS);
672 	ASSERT(irqp->ii_type != IRQT_UNBOUND);
673 
674 	irqp->ii_u2.ipl = 0;
675 }
676 
677 int
678 ec_bind_evtchn_to_irq(int evtchn)
679 {
680 	mutex_enter(&ec_lock);
681 
682 	ASSERT(evtchn_to_irq[evtchn] == INVALID_IRQ);
683 
684 	(void) alloc_irq(IRQT_EVTCHN, 0, evtchn, -1);
685 
686 	mutex_exit(&ec_lock);
687 	return (evtchn_to_irq[evtchn]);
688 }
689 
690 int
691 ec_bind_virq_to_irq(int virq, int cpu)
692 {
693 	int err;
694 	int evtchn;
695 	mec_info_t *virqp;
696 
697 	virqp = &virq_info[virq];
698 	mutex_enter(&ec_lock);
699 
700 	err = xen_bind_virq(virq, cpu, &evtchn);
701 	ASSERT(err == 0);
702 
703 	ASSERT(evtchn_to_irq[evtchn] == INVALID_IRQ);
704 
705 	if (virqp->mi_irq == INVALID_IRQ) {
706 		virqp->mi_irq = alloc_irq(IRQT_VIRQ, virq, evtchn, cpu);
707 	} else {
708 		alloc_irq_evtchn(virqp->mi_irq, virq, evtchn, cpu);
709 	}
710 
711 	mutex_exit(&ec_lock);
712 
713 	return (virqp->mi_irq);
714 }
715 
716 int
717 ec_bind_ipi_to_irq(int ipl, int cpu)
718 {
719 	int evtchn;
720 	ulong_t flags;
721 	mec_info_t *ipip;
722 
723 	mutex_enter(&ec_lock);
724 
725 	ipip = &ipi_info[ipl];
726 
727 	evtchn = xen_bind_ipi(cpu);
728 
729 	ASSERT(evtchn_to_irq[evtchn] == INVALID_IRQ);
730 
731 	if (ipip->mi_irq == INVALID_IRQ) {
732 		ipip->mi_irq = alloc_irq(IRQT_IPI, ipl, evtchn, cpu);
733 	} else {
734 		alloc_irq_evtchn(ipip->mi_irq, ipl, evtchn, cpu);
735 	}
736 
737 	/*
738 	 * Unmask the new evtchn so that it can be seen by the target cpu
739 	 */
740 	flags = intr_clear();
741 	ec_unmask_evtchn(evtchn);
742 	intr_restore(flags);
743 
744 	mutex_exit(&ec_lock);
745 	return (ipip->mi_irq);
746 }
747 
748 /*
749  * When bringing up a CPU, bind to all the IPIs that CPU0 bound.
750  */
751 void
752 ec_bind_cpu_ipis(int cpu)
753 {
754 	int i;
755 
756 	for (i = 0; i < MAXIPL; i++) {
757 		mec_info_t *ipip = &ipi_info[i];
758 		if (ipip->mi_irq == INVALID_IRQ)
759 			continue;
760 
761 		(void) ec_bind_ipi_to_irq(i, cpu);
762 	}
763 }
764 
765 /*
766  * Can this IRQ be rebound to another CPU?
767  */
768 int
769 ec_irq_rebindable(int irq)
770 {
771 	irq_info_t *irqp = &irq_info[irq];
772 
773 	if (irqp->ii_u.evtchn == 0)
774 		return (0);
775 
776 	return (irqp->ii_type == IRQT_EVTCHN || irqp->ii_type == IRQT_PIRQ);
777 }
778 
779 /*
780  * Should this IRQ be unbound from this CPU (which is being offlined) to
781  * another?
782  */
783 int
784 ec_irq_needs_rebind(int irq, int cpu)
785 {
786 	irq_info_t *irqp = &irq_info[irq];
787 
788 	return (ec_irq_rebindable(irq) &&
789 	    CPU_IN_SET(evtchn_cpus[irqp->ii_u.evtchn], cpu));
790 }
791 
792 void
793 ec_send_ipi(int ipl, int cpu)
794 {
795 	mec_info_t *ipip = &ipi_info[ipl];
796 
797 	ASSERT(ipip->mi_irq != INVALID_IRQ);
798 
799 	ec_notify_via_evtchn(ipip->mi_evtchns[cpu]);
800 }
801 
802 void
803 ec_try_ipi(int ipl, int cpu)
804 {
805 	mec_info_t *ipip = &ipi_info[ipl];
806 
807 	if (ipip->mi_irq == INVALID_IRQ || ipip->mi_irq == 0)
808 		return;
809 
810 	ec_notify_via_evtchn(ipip->mi_evtchns[cpu]);
811 }
812 
813 void
814 ec_irq_add_evtchn(int irq, int evtchn)
815 {
816 	mutex_enter(&ec_lock);
817 
818 	/*
819 	 * See description of IRQT_DEV_EVTCHN above.
820 	 */
821 	ASSERT(irq == ec_dev_irq);
822 
823 	alloc_irq_evtchn(irq, 0, evtchn, 0);
824 	/*
825 	 * We enforce that the representative event channel for IRQT_DEV_EVTCHN
826 	 * is zero, so PSM operations on it have no effect.
827 	 */
828 	irq_info[irq].ii_u.evtchn = 0;
829 	mutex_exit(&ec_lock);
830 }
831 
832 void
833 ec_irq_rm_evtchn(int irq, int evtchn)
834 {
835 	ushort_t ec = evtchn;
836 
837 	mutex_enter(&ec_lock);
838 	ASSERT(irq == ec_dev_irq);
839 	unbind_evtchn(&ec);
840 	mutex_exit(&ec_lock);
841 }
842 
843 /*
844  * Allocate an /dev/xen/evtchn IRQ.  See the big comment at the top
845  * for an explanation.
846  */
847 int
848 ec_dev_alloc_irq(void)
849 {
850 	int i;
851 	irq_info_t *irqp;
852 
853 	for (i = 0; i < NR_IRQS; i++) {
854 		if (irq_info[i].ii_type == IRQT_UNBOUND)
855 			break;
856 	}
857 
858 	ASSERT(i != NR_IRQS);
859 
860 	irqp = &irq_info[i];
861 	irqp->ii_type = IRQT_DEV_EVTCHN;
862 	irqp->ii_u2.ipl = IPL_EVTCHN;
863 	/*
864 	 * Force the evtchn to zero for the special evtchn device irq
865 	 */
866 	irqp->ii_u.evtchn = 0;
867 	return (i);
868 }
869 
870 void
871 ec_enable_irq(unsigned int irq)
872 {
873 	ulong_t flag;
874 	irq_info_t *irqp = &irq_info[irq];
875 
876 	if (irqp->ii_type == IRQT_DEV_EVTCHN)
877 		return;
878 
879 	flag = intr_clear();
880 	ec_unmask_evtchn(irq_evtchn(irqp));
881 	intr_restore(flag);
882 }
883 
884 void
885 ec_disable_irq(unsigned int irq)
886 {
887 	irq_info_t *irqp = &irq_info[irq];
888 
889 	if (irqp->ii_type == IRQT_DEV_EVTCHN)
890 		return;
891 
892 	/*
893 	 * Spin till we are the one to mask the evtchn
894 	 * Ensures no one else can be servicing this evtchn.
895 	 */
896 	while (!ec_mask_evtchn(irq_evtchn(irqp)))
897 		SMT_PAUSE();
898 }
899 
900 static int
901 ec_evtchn_pending(uint_t ev)
902 {
903 	uint_t evi;
904 	shared_info_t *si = HYPERVISOR_shared_info;
905 
906 	evi = ev >> EVTCHN_SHIFT;
907 	ev &= (1ul << EVTCHN_SHIFT) - 1;
908 	return ((si->evtchn_pending[evi] & (1ul << ev)) != 0);
909 }
910 
911 int
912 ec_pending_irq(unsigned int irq)
913 {
914 	int evtchn = irq_evtchn(&irq_info[irq]);
915 
916 	return (ec_evtchn_pending(evtchn));
917 }
918 
919 void
920 ec_clear_irq(int irq)
921 {
922 	irq_info_t *irqp = &irq_info[irq];
923 	int evtchn;
924 
925 	if (irqp->ii_type == IRQT_DEV_EVTCHN)
926 		return;
927 
928 	ASSERT(irqp->ii_type != IRQT_UNBOUND);
929 
930 	evtchn = irq_evtchn(irqp);
931 
932 	ASSERT(EVTCHN_MASKED(evtchn));
933 	ec_clear_evtchn(evtchn);
934 }
935 
936 void
937 ec_unmask_irq(int irq)
938 {
939 	ulong_t flags;
940 	irq_info_t *irqp = &irq_info[irq];
941 
942 	flags = intr_clear();
943 	switch (irqp->ii_type) {
944 	case IRQT_PIRQ:
945 		end_pirq(irq);
946 		break;
947 	case IRQT_DEV_EVTCHN:
948 		break;
949 	default:
950 		ec_unmask_evtchn(irq_evtchn(irqp));
951 		break;
952 	}
953 	intr_restore(flags);
954 }
955 
956 void
957 ec_try_unmask_irq(int irq)
958 {
959 	ulong_t flags;
960 	irq_info_t *irqp = &irq_info[irq];
961 	int evtchn;
962 
963 	flags = intr_clear();
964 	switch (irqp->ii_type) {
965 	case IRQT_PIRQ:
966 		end_pirq(irq);
967 		break;
968 	case IRQT_DEV_EVTCHN:
969 		break;
970 	default:
971 		if ((evtchn = irq_evtchn(irqp)) != 0)
972 			ec_unmask_evtchn(evtchn);
973 		break;
974 	}
975 	intr_restore(flags);
976 }
977 
978 /*
979  * Poll until an event channel is ready or 'check_func' returns true.  This can
980  * only be used in a situation where interrupts are masked, otherwise we have a
981  * classic time-of-check vs. time-of-use race.
982  */
983 void
984 ec_wait_on_evtchn(int evtchn, int (*check_func)(void *), void *arg)
985 {
986 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
987 		while (!check_func(arg))
988 			(void) HYPERVISOR_yield();
989 		return;
990 	}
991 
992 	ASSERT(CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask != 0);
993 
994 	for (;;) {
995 		evtchn_port_t ports[1];
996 
997 		ports[0] = evtchn;
998 
999 		ec_clear_evtchn(evtchn);
1000 
1001 		if (check_func(arg))
1002 			return;
1003 
1004 		(void) HYPERVISOR_poll(ports, 1, 0);
1005 	}
1006 }
1007 
1008 void
1009 ec_wait_on_ipi(int ipl, int (*check_func)(void *), void *arg)
1010 {
1011 	mec_info_t *ipip = &ipi_info[ipl];
1012 
1013 	if (ipip->mi_irq == INVALID_IRQ || ipip->mi_irq == 0)
1014 		return;
1015 
1016 	ec_wait_on_evtchn(ipip->mi_evtchns[CPU->cpu_id], check_func, arg);
1017 }
1018 
1019 void
1020 ec_suspend(void)
1021 {
1022 	irq_info_t *irqp;
1023 	ushort_t *evtchnp;
1024 	int i;
1025 	int c;
1026 
1027 	ASSERT(MUTEX_HELD(&ec_lock));
1028 
1029 	for (i = 0; i < MAXIPL; i++) {
1030 		if (ipi_info[i].mi_irq == INVALID_IRQ)
1031 			continue;
1032 
1033 		for (c = 0; c < NCPU; c++) {
1034 			if (cpu[c] == NULL)
1035 				continue;
1036 
1037 			if (CPU_IN_SET(cpu_suspend_lost_set, c))
1038 				continue;
1039 
1040 			evtchnp = &ipi_info[i].mi_evtchns[c];
1041 			ASSERT(*evtchnp != 0);
1042 			unbind_evtchn(evtchnp);
1043 		}
1044 	}
1045 
1046 	for (i = 0; i < NR_VIRQS; i++) {
1047 		if (virq_info[i].mi_irq == INVALID_IRQ)
1048 			continue;
1049 
1050 		/*
1051 		 * If we're sharing a single event channel across all CPUs, we
1052 		 * should only unbind once.
1053 		 */
1054 		if (virq_info[i].mi_shared) {
1055 			evtchnp = &virq_info[i].mi_evtchns[0];
1056 			unbind_evtchn(evtchnp);
1057 			for (c = 1; c < NCPU; c++)
1058 				virq_info[i].mi_evtchns[c] = 0;
1059 		} else {
1060 			for (c = 0; c < NCPU; c++) {
1061 				if (cpu[c] == NULL)
1062 					continue;
1063 
1064 				evtchnp = &virq_info[i].mi_evtchns[c];
1065 				if (*evtchnp != 0)
1066 					unbind_evtchn(evtchnp);
1067 			}
1068 		}
1069 	}
1070 
1071 	for (i = 0; i < NR_IRQS; i++) {
1072 		irqp = &irq_info[i];
1073 
1074 		switch (irqp->ii_type) {
1075 		case IRQT_EVTCHN:
1076 		case IRQT_DEV_EVTCHN:
1077 			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
1078 			break;
1079 		case IRQT_PIRQ:
1080 			if (irqp->ii_u.evtchn != 0)
1081 				(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
1082 			break;
1083 		default:
1084 			break;
1085 		}
1086 	}
1087 }
1088 
1089 /*
1090  * The debug irq is special, we only have one evtchn and irq but we allow all
1091  * cpus to service it.  It's marked as shared and we propogate the event
1092  * channel into all CPUs by hand.
1093  */
1094 static void
1095 share_virq(mec_info_t *virqp)
1096 {
1097 	int evtchn = virqp->mi_evtchns[0];
1098 	cpuset_t tset;
1099 	int i;
1100 
1101 	ASSERT(evtchn != 0);
1102 
1103 	virqp->mi_shared = 1;
1104 
1105 	for (i = 1; i < NCPU; i++)
1106 		virqp->mi_evtchns[i] = evtchn;
1107 	CPUSET_ALL(tset);
1108 	bind_evtchn_to_cpuset(evtchn, tset);
1109 }
1110 
1111 static void
1112 virq_resume(int virq)
1113 {
1114 	mec_info_t *virqp = &virq_info[virq];
1115 	int evtchn;
1116 	int i, err;
1117 
1118 	for (i = 0; i < NCPU; i++) {
1119 		cpuset_t tcpus;
1120 
1121 		if (cpu[i] == NULL || CPU_IN_SET(cpu_suspend_lost_set, i))
1122 			continue;
1123 
1124 		err = xen_bind_virq(virq, i, &evtchn);
1125 		ASSERT(err == 0);
1126 
1127 		virqp->mi_evtchns[i] = evtchn;
1128 		evtchn_to_irq[evtchn] = virqp->mi_irq;
1129 		CPUSET_ONLY(tcpus, i);
1130 		bind_evtchn_to_cpuset(evtchn, tcpus);
1131 		ec_unmask_evtchn(evtchn);
1132 		/*
1133 		 * only timer VIRQ is bound to all cpus
1134 		 */
1135 		if (virq != VIRQ_TIMER)
1136 			break;
1137 	}
1138 
1139 	if (virqp->mi_shared)
1140 		share_virq(virqp);
1141 }
1142 
1143 static void
1144 ipi_resume(int ipl)
1145 {
1146 	mec_info_t *ipip = &ipi_info[ipl];
1147 	int i;
1148 
1149 	for (i = 0; i < NCPU; i++) {
1150 		cpuset_t tcpus;
1151 		int evtchn;
1152 
1153 		if (cpu[i] == NULL || CPU_IN_SET(cpu_suspend_lost_set, i))
1154 			continue;
1155 
1156 		evtchn = xen_bind_ipi(i);
1157 		ipip->mi_evtchns[i] = evtchn;
1158 		evtchn_to_irq[evtchn] = ipip->mi_irq;
1159 		CPUSET_ONLY(tcpus, i);
1160 		bind_evtchn_to_cpuset(evtchn, tcpus);
1161 		ec_unmask_evtchn(evtchn);
1162 	}
1163 }
1164 
1165 void
1166 ec_resume(void)
1167 {
1168 	int i;
1169 
1170 	/* New event-channel space is not 'live' yet. */
1171 	for (i = 0; i < NR_EVENT_CHANNELS; i++)
1172 		(void) ec_mask_evtchn(i);
1173 
1174 	for (i = 0; i < MAXIPL; i++) {
1175 		if (ipi_info[i].mi_irq == INVALID_IRQ)
1176 			continue;
1177 		ipi_resume(i);
1178 	}
1179 
1180 	for (i = 0; i < NR_VIRQS; i++) {
1181 		if (virq_info[i].mi_irq == INVALID_IRQ)
1182 			continue;
1183 		virq_resume(i);
1184 	}
1185 }
1186 
1187 int
1188 ec_init(void)
1189 {
1190 	int i;
1191 	mutex_init(&ec_lock, NULL, MUTEX_SPIN, (void *)ipltospl(SPL7));
1192 
1193 	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
1194 		CPUSET_ZERO(evtchn_cpus[i]);
1195 		evtchn_to_irq[i] = INVALID_IRQ;
1196 		(void) ec_mask_evtchn(i);
1197 	}
1198 
1199 	for (i = 0; i < MAXIPL; i++)
1200 		ipi_info[i].mi_irq = INVALID_IRQ;
1201 
1202 	for (i = 0; i < NR_VIRQS; i++)
1203 		virq_info[i].mi_irq = INVALID_IRQ;
1204 
1205 	/*
1206 	 * Phys IRQ space is statically bound (1:1 mapping), grab the IRQs
1207 	 * now.
1208 	 */
1209 	for (i = PIRQ_BASE; i < NR_PIRQS; i++) {
1210 		irq_info[PIRQ_TO_IRQ(i)].ii_type = IRQT_PIRQ;
1211 	}
1212 
1213 	return (0);
1214 }
1215 
1216 void
1217 ec_init_debug_irq()
1218 {
1219 	int irq;
1220 
1221 	irq = ec_bind_virq_to_irq(VIRQ_DEBUG, 0);
1222 	(void) add_avintr(NULL, IPL_DEBUG, (avfunc)xen_debug_handler,
1223 	    "debug", irq, NULL, NULL, NULL, NULL);
1224 
1225 	mutex_enter(&ec_lock);
1226 	share_virq(&virq_info[irq_info[irq].ii_u.index]);
1227 	mutex_exit(&ec_lock);
1228 	ec_debug_irq = irq;
1229 }
1230 
/*
 * Bits of word 'ix' that are pending in the shared info 'si', are not
 * masked there, and are set in the evt_affinity word of 'cpe'.
 * The cpu_id argument is accepted but not used by the expansion.
 */
#define	UNBLOCKED_EVENTS(si, ix, cpe, cpu_id)		\
	((si)->evtchn_pending[ix] &			\
	    ~(si)->evtchn_mask[ix] &			\
	    (cpe)->evt_affinity[ix])
1234 
1235 
1236 /*
1237  * This is the entry point for processing events from xen
1238  *
1239  * (See the commentary associated with the shared_info_st structure
1240  * in hypervisor-if.h)
1241  *
1242  * Since the event channel mechanism doesn't really implement the
1243  * concept of priority like hardware interrupt controllers, we simulate
1244  * that in software here using the cpu priority field and the pending
1245  * interrupts field.  Events/interrupts that are not able to be serviced
1246  * now because they are at a lower priority than the current cpu priority
1247  * cause a level bit to be recorded in the pending interrupts word.  When
1248  * the priority is lowered (either by spl or interrupt exit code) the pending
1249  * levels are checked and an upcall is scheduled if there are events/interrupts
1250  * that have become deliverable.
1251  */
1252 void
1253 xen_callback_handler(struct regs *rp, trap_trace_rec_t *ttp)
1254 {
1255 	ulong_t pending_sels, pe, selbit;
1256 	int i, j, port, pri, curpri, irq, sipri;
1257 	uint16_t pending_ints, sip;
1258 	struct cpu *cpu = CPU;
1259 	volatile shared_info_t *si = HYPERVISOR_shared_info;
1260 	volatile vcpu_info_t *vci = cpu->cpu_m.mcpu_vcpu_info;
1261 	volatile struct xen_evt_data *cpe = cpu->cpu_m.mcpu_evt_pend;
1262 	volatile uint16_t *cpu_ipp = &cpu->cpu_m.mcpu_intr_pending;
1263 	extern void dosoftint(struct regs *);
1264 
1265 	ASSERT(rp->r_trapno == T_AST && rp->r_err == 0);
1266 	ASSERT(&si->vcpu_info[cpu->cpu_id] == vci);
1267 	ASSERT_STACK_ALIGNED();
1268 
1269 	vci->evtchn_upcall_pending = 0;
1270 
1271 	/*
1272 	 * To expedite scanning of pending notifications, any 0->1
1273 	 * pending transition on an unmasked channel causes a
1274 	 * corresponding bit in evtchn_pending_sel to be set.
1275 	 * Each bit in the selector covers a 32-bit word in
1276 	 * the evtchn_pending[] array.
1277 	 */
1278 	membar_enter();
1279 	do {
1280 		pending_sels = vci->evtchn_pending_sel;
1281 	} while (atomic_cas_ulong((volatile ulong_t *)&vci->evtchn_pending_sel,
1282 	    pending_sels, 0) != pending_sels);
1283 
1284 	pending_ints = *cpu_ipp;
1285 	while ((i = ffs(pending_sels)) != 0) {
1286 		i--;
1287 		selbit = 1ul << i;
1288 		pending_sels &= ~selbit;
1289 
1290 		membar_enter();
1291 		while ((pe = UNBLOCKED_EVENTS(si, i, cpe, cpu->cpu_id)) != 0) {
1292 			j = ffs(pe) - 1;
1293 			pe &= ~(1ul << j);
1294 
1295 			port = (i << EVTCHN_SHIFT) + j;
1296 
1297 			irq = evtchn_to_irq[port];
1298 
1299 			/*
1300 			 * If no irq set, just ignore the event.
1301 			 * On e.g. netbsd they call evtchn_device_upcall(port)
1302 			 * We require the evtchn driver to install a handler
1303 			 * so there will be an irq associated with user mode
1304 			 * evtchns.
1305 			 */
1306 			if (irq == INVALID_IRQ) {
1307 				ec_clear_evtchn(port);
1308 				continue;
1309 			}
1310 
1311 			/*
1312 			 * If there's no handler, it could be a poke, so just
1313 			 * accept the event and continue.
1314 			 */
1315 			if (!irq_info[irq].ii_u2.has_handler) {
1316 #ifdef TRAPTRACE
1317 				ttp->ttr_ipl = 0xff;
1318 				if (IRQ_IS_CPUPOKE(irq)) {
1319 					ttp->ttr_ipl = XC_CPUPOKE_PIL;
1320 					ttp->ttr_marker = TT_INTERRUPT;
1321 				}
1322 				ttp->ttr_pri = cpu->cpu_pri;
1323 				ttp->ttr_spl = cpu->cpu_base_spl;
1324 				ttp->ttr_vector = 0xff;
1325 #endif /* TRAPTRACE */
1326 				if (ec_mask_evtchn(port)) {
1327 					ec_clear_evtchn(port);
1328 					ec_unmask_evtchn(port);
1329 					continue;
1330 				}
1331 			}
1332 
1333 			pri = irq_info[irq].ii_u2.ipl;
1334 
1335 			/*
1336 			 * If we are the cpu that successfully masks
1337 			 * the event, then record it as a pending event
1338 			 * for this cpu to service
1339 			 */
1340 			if (ec_mask_evtchn(port)) {
1341 				if (ec_evtchn_pending(port)) {
1342 					cpe->pending_sel[pri] |= selbit;
1343 					cpe->pending_evts[pri][i] |= (1ul << j);
1344 					pending_ints |= 1 << pri;
1345 					/*
1346 					 * We have recorded a pending interrupt
1347 					 * for this cpu.  If it is an edge
1348 					 * triggered interrupt then we go ahead
1349 					 * and clear the pending and mask bits
1350 					 * from the shared info to avoid having
1351 					 * the hypervisor see the pending event
1352 					 * again and possibly disabling the
1353 					 * interrupt.  This should also help
1354 					 * keep us from missing an interrupt.
1355 					 */
1356 					if (ec_is_edge_pirq(irq)) {
1357 						ec_clear_evtchn(port);
1358 						ec_unmask_evtchn(port);
1359 					}
1360 				} else {
1361 					/*
1362 					 * another cpu serviced this event
1363 					 * before us, clear the mask.
1364 					 */
1365 					ec_unmask_evtchn(port);
1366 				}
1367 			}
1368 		}
1369 	}
1370 	*cpu_ipp = pending_ints;
1371 	if (pending_ints == 0)
1372 		return;
1373 	/*
1374 	 * We have gathered all the pending events/interrupts,
1375 	 * go service all the ones we can from highest priority to lowest.
1376 	 * Note: This loop may not actually complete and service all
1377 	 * pending interrupts since one of the interrupt threads may
1378 	 * block and the pinned thread runs.  In that case, when we
1379 	 * exit the interrupt thread that blocked we will check for
1380 	 * any unserviced interrupts and re-post an upcall to process
1381 	 * any unserviced pending events.
1382 	 */
1383 restart:
1384 	curpri = cpu->cpu_pri;
1385 	pri = bsrw_insn(*cpu_ipp);
1386 	while (pri > curpri) {
1387 		while ((pending_sels = cpe->pending_sel[pri]) != 0) {
1388 			i = ffs(pending_sels) - 1;
1389 			while ((pe = cpe->pending_evts[pri][i]) != 0) {
1390 				j = ffs(pe) - 1;
1391 				port = (i << EVTCHN_SHIFT) + j;
1392 				pe &= ~(1ul << j);
1393 				cpe->pending_evts[pri][i] = pe;
1394 				if (pe == 0) {
1395 					/*
1396 					 * Must reload pending selector bits
1397 					 * here as they could have changed on
1398 					 * a previous trip around the inner loop
1399 					 * while we were interrupt enabled
1400 					 * in a interrupt service routine.
1401 					 */
1402 					pending_sels = cpe->pending_sel[pri];
1403 					pending_sels &= ~(1ul << i);
1404 					cpe->pending_sel[pri] = pending_sels;
1405 					if (pending_sels == 0)
1406 						*cpu_ipp &= ~(1 << pri);
1407 				}
1408 				irq = evtchn_to_irq[port];
1409 				if (irq == INVALID_IRQ) {
1410 					/*
1411 					 * No longer a handler for this event
1412 					 * channel.  Clear the event and
1413 					 * ignore it, unmask the event.
1414 					 */
1415 					ec_clear_evtchn(port);
1416 					ec_unmask_evtchn(port);
1417 					continue;
1418 				}
1419 				if (irq == ec_dev_irq) {
1420 					ASSERT(cpu->cpu_m.mcpu_ec_mbox == 0);
1421 					cpu->cpu_m.mcpu_ec_mbox = port;
1422 				}
1423 				/*
1424 				 * Set up the regs struct to
1425 				 * look like a normal hardware int
1426 				 * and do normal interrupt handling.
1427 				 */
1428 				rp->r_trapno = irq;
1429 				do_interrupt(rp, ttp);
1430 				/*
1431 				 * Check for cpu priority change
1432 				 * Can happen if int thread blocks
1433 				 */
1434 				if (cpu->cpu_pri != curpri)
1435 					goto restart;
1436 			}
1437 		}
1438 		/*
1439 		 * Dispatch any soft interrupts that are
1440 		 * higher priority than any hard ones remaining.
1441 		 */
1442 		pri = bsrw_insn(*cpu_ipp);
1443 		sip = (uint16_t)cpu->cpu_softinfo.st_pending;
1444 		if (sip != 0) {
1445 			sipri = bsrw_insn(sip);
1446 			if (sipri > pri && sipri > cpu->cpu_pri) {
1447 				dosoftint(rp);
1448 				/*
1449 				 * Check for cpu priority change
1450 				 * Can happen if softint thread blocks
1451 				 */
1452 				if (cpu->cpu_pri != curpri)
1453 					goto restart;
1454 			}
1455 		}
1456 	}
1457 	/*
1458 	 * Deliver any pending soft interrupts.
1459 	 */
1460 	if (cpu->cpu_softinfo.st_pending)
1461 		dosoftint(rp);
1462 }
1463 
1464 
1465 void
1466 ec_unmask_evtchn(unsigned int ev)
1467 {
1468 	uint_t evi, evb;
1469 	volatile shared_info_t *si = HYPERVISOR_shared_info;
1470 	volatile vcpu_info_t *vci = CPU->cpu_m.mcpu_vcpu_info;
1471 	volatile ulong_t *ulp;
1472 
1473 	ASSERT(!interrupts_enabled());
1474 	/*
1475 	 * Check if we need to take slow path
1476 	 */
1477 	if (!CPU_IN_SET(evtchn_cpus[ev], CPU->cpu_id)) {
1478 		xen_evtchn_unmask(ev);
1479 		return;
1480 	}
1481 	evi = ev >> EVTCHN_SHIFT;
1482 	evb = ev & ((1ul << EVTCHN_SHIFT) - 1);
1483 	ulp = (volatile ulong_t *)&si->evtchn_mask[evi];
1484 	atomic_and_ulong(ulp, ~(1ul << evb));
1485 	/*
1486 	 * The following is basically the equivalent of
1487 	 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the
1488 	 * interrupt edge' if the channel is masked.
1489 	 * XXPV - slight race if upcall was about to be set, we may get
1490 	 * an extra upcall.
1491 	 */
1492 	membar_enter();
1493 	if (si->evtchn_pending[evi] & (1ul << evb)) {
1494 		membar_consumer();
1495 		ulp = (volatile ulong_t *)&vci->evtchn_pending_sel;
1496 		if (!(*ulp & (1ul << evi))) {
1497 			atomic_or_ulong(ulp, (1ul << evi));
1498 		}
1499 		vci->evtchn_upcall_pending = 1;
1500 	}
1501 }
1502 
1503 /*
1504  * Set a bit in an evtchan mask word, return true if we are the cpu that
1505  * set the bit.
1506  */
1507 int
1508 ec_mask_evtchn(unsigned int ev)
1509 {
1510 	uint_t evi, evb;
1511 	ulong_t new, old, bit;
1512 	volatile shared_info_t *si = HYPERVISOR_shared_info;
1513 	volatile ulong_t *maskp;
1514 	int masked;
1515 
1516 	kpreempt_disable();
1517 	evi = ev >> EVTCHN_SHIFT;
1518 	evb = ev & ((1ul << EVTCHN_SHIFT) - 1);
1519 	bit = 1ul << evb;
1520 	maskp = (volatile ulong_t *)&si->evtchn_mask[evi];
1521 	do {
1522 		old = si->evtchn_mask[evi];
1523 		new = old | bit;
1524 	} while (atomic_cas_ulong(maskp, old, new) != old);
1525 	masked = (old & bit) == 0;
1526 	if (masked) {
1527 		evtchn_owner[ev] = CPU->cpu_id;
1528 #ifdef DEBUG
1529 		evtchn_owner_thread[ev] = curthread;
1530 #endif
1531 	}
1532 	kpreempt_enable();
1533 	return (masked);
1534 }
1535 
1536 void
1537 ec_clear_evtchn(unsigned int ev)
1538 {
1539 	uint_t evi;
1540 	shared_info_t *si = HYPERVISOR_shared_info;
1541 	volatile ulong_t *pendp;
1542 
1543 	evi = ev >> EVTCHN_SHIFT;
1544 	ev &= (1ul << EVTCHN_SHIFT) - 1;
1545 	pendp = (volatile ulong_t *)&si->evtchn_pending[evi];
1546 	atomic_and_ulong(pendp, ~(1ul << ev));
1547 }
1548 
1549 void
1550 ec_notify_via_evtchn(unsigned int port)
1551 {
1552 	evtchn_send_t send;
1553 
1554 	ASSERT(port != INVALID_EVTCHN);
1555 
1556 	send.port = port;
1557 	(void) HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
1558 }
1559 
1560 int
1561 ec_block_irq(int irq)
1562 {
1563 	irq_info_t *irqp = &irq_info[irq];
1564 	int evtchn;
1565 
1566 
1567 	evtchn = irq_evtchn(irqp);
1568 	(void) ec_mask_evtchn(evtchn);
1569 	return (evtchn_owner[evtchn]);
1570 }
1571 
1572 /*
1573  * Make a event that is pending for delivery on the current cpu  "go away"
1574  * without servicing the interrupt.
1575  */
1576 void
1577 ec_unpend_irq(int irq)
1578 {
1579 	irq_info_t *irqp = &irq_info[irq];
1580 	int pri = irqp->ii_u2.ipl;
1581 	ulong_t flags;
1582 	uint_t evtchn, evi, bit;
1583 	unsigned long pe, pending_sels;
1584 	struct xen_evt_data *cpe;
1585 
1586 	/*
1587 	 * The evtchn must be masked
1588 	 */
1589 	evtchn = irq_evtchn(irqp);
1590 	ASSERT(EVTCHN_MASKED(evtchn));
1591 	evi = evtchn >> EVTCHN_SHIFT;
1592 	bit = evtchn & (1ul << EVTCHN_SHIFT) - 1;
1593 	flags = intr_clear();
1594 	cpe = CPU->cpu_m.mcpu_evt_pend;
1595 	pe = cpe->pending_evts[pri][evi] & ~(1ul << bit);
1596 	cpe->pending_evts[pri][evi] = pe;
1597 	if (pe == 0) {
1598 		pending_sels = cpe->pending_sel[pri];
1599 		pending_sels &= ~(1ul << evi);
1600 		cpe->pending_sel[pri] = pending_sels;
1601 		if (pending_sels == 0)
1602 			CPU->cpu_m.mcpu_intr_pending &= ~(1 << pri);
1603 	}
1604 	intr_restore(flags);
1605 }
1606