xref: /titanic_50/usr/src/uts/i86xpv/os/evtchn.c (revision b9bc7f7832704fda46b4d6b04f3f7be1227dc644)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * evtchn.c
31  *
32  * Communication via hypervisor event channels.
33  *
34  * Copyright (c) 2002-2005, K A Fraser
35  *
36  * This file may be distributed separately from the Linux kernel, or
37  * incorporated into other software packages, subject to the following license:
38  *
39  * Permission is hereby granted, free of charge, to any person obtaining a copy
40  * of this source file (the "Software"), to deal in the Software without
41  * restriction, including without limitation the rights to use, copy, modify,
42  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
43  * and to permit persons to whom the Software is furnished to do so, subject to
44  * the following conditions:
45  *
46  * The above copyright notice and this permission notice shall be included in
47  * all copies or substantial portions of the Software.
48  *
49  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
52  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
53  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
54  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
55  * IN THE SOFTWARE.
56  */
57 
58 /* some parts derived from netbsd's hypervisor_machdep.c 1.2.2.2 */
59 
60 /*
61  *
62  * Copyright (c) 2004 Christian Limpach.
63  * All rights reserved.
64  *
65  * Redistribution and use in source and binary forms, with or without
66  * modification, are permitted provided that the following conditions
67  * are met:
68  * 1. Redistributions of source code must retain the above copyright
69  *    notice, this list of conditions and the following disclaimer.
70  * 2. Redistributions in binary form must reproduce the above copyright
71  *    notice, this list of conditions and the following disclaimer in the
72  *    documentation and/or other materials provided with the distribution.
73  * 3. This section intentionally left blank.
74  * 4. The name of the author may not be used to endorse or promote products
75  *    derived from this software without specific prior written permission.
76  *
77  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
78  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
79  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
80  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
81  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
82  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
83  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
84  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
85  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
86  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
87  */
88 /*
89  * Section 3 of the above license was updated in response to bug 6379571.
90  */
91 
92 #include <sys/types.h>
93 #include <sys/hypervisor.h>
94 #include <sys/machsystm.h>
95 #include <sys/mutex.h>
96 #include <sys/evtchn_impl.h>
97 #include <sys/ddi_impldefs.h>
98 #include <sys/avintr.h>
99 #include <sys/cpuvar.h>
100 #include <sys/smp_impldefs.h>
101 #include <sys/archsystm.h>
102 #include <sys/sysmacros.h>
103 #include <sys/cmn_err.h>
104 #include <sys/promif.h>
105 #include <sys/debug.h>
106 #include <sys/psm.h>
107 #include <sys/privregs.h>
108 #include <sys/trap.h>
109 #include <sys/atomic.h>
110 #include <sys/cpu.h>
111 #include <sys/psw.h>
112 #include <sys/traptrace.h>
113 #include <sys/stack.h>
114 #include <sys/x_call.h>
115 #include <xen/public/physdev.h>
116 
117 /*
118  * This file manages our association between hypervisor event channels and
119  * Solaris's IRQs.  This is a one-to-one mapping, with the exception of
120  * IPI IRQs, for which there is one event channel per CPU participating
121  * in the IPI, and the clock VIRQ which also has an event channel per cpu
122  * and the IRQ for /dev/xen/evtchn. The IRQ types are:
123  *
124  * IRQT_VIRQ:
125  *	The hypervisor's standard virtual IRQ, used for the clock timer, for
126  *	example.  This code allows any cpu to bind to one of these, although
127  *	some are treated specially (i.e. VIRQ_DEBUG).
128  *	Event channel binding is done via EVTCHNOP_bind_virq.
129  *
130  * IRQT_PIRQ:
131  *	These associate a physical IRQ with an event channel via
132  *	EVTCHNOP_bind_pirq.
133  *
134  * IRQT_IPI:
135  *	A cross-call IRQ. Maps to "ncpus" event channels, each of which is
136  *	bound to exactly one of the vcpus.  We do not currently support
137  *	unbinding of IPIs (since Solaris doesn't need it). Uses
138  *	EVTCHNOP_bind_ipi.
139  *
140  * IRQT_EVTCHN:
141  *	A "normal" binding to an event channel, typically used by the frontend
 *	drivers to bind to their backend event channel.
143  *
144  * IRQT_DEV_EVTCHN:
145  *	This is a one-time IRQ used by /dev/xen/evtchn. Unlike other IRQs, we
146  *	have a one-IRQ to many-evtchn mapping. We only track evtchn->irq for
147  *	these event channels, which are managed via ec_irq_add/rm_evtchn().
148  *	We enforce that IRQT_DEV_EVTCHN's representative evtchn (->ii_evtchn)
149  *	is zero, and make any calls to irq_evtchn() an error, to prevent
150  *	accidentally attempting to use the illegal evtchn 0.
151  *
152  * Suspend/resume
153  *
154  *	During a suspend/resume cycle, we need to tear down the event channels.
155  *	All other mapping data is kept. The drivers will remove their own event
156  *	channels via xendev on receiving a DDI_SUSPEND.  This leaves us with
157  *	the IPIs and VIRQs, which we handle in ec_suspend() and ec_resume()
158  *	below.
159  *
160  * CPU binding
161  *
162  *	When an event channel is bound to a CPU, we set a bit in a mask present
163  *	in the machcpu (evt_affinity) to indicate that this CPU can accept this
164  *	event channel.  For both IPIs and VIRQs, this binding is fixed at
165  *	allocation time and we never modify it.  All other event channels are
166  *	bound via the PSM either as part of add_avintr(), or interrupt
167  *	redistribution (xen_psm_dis/enable_intr()) as a result of CPU
168  *	offline/online.
169  *
170  * Locking
171  *
172  *	Updates are done holding the ec_lock.  The xen_callback_handler()
173  *	routine reads the mapping data in a lockless fashion.  Additionally
174  *	suspend takes ec_lock to prevent update races during a suspend/resume
175  *	cycle.  The IPI info is also examined without the lock; this is OK
176  *	since we only ever change IPI info during initial setup and resume.
177  */
178 
/*
 * True when 'irq' is the IRQ recorded in the XC_CPUPOKE_PIL slot of ipi_info.
 */
#define	IRQ_IS_CPUPOKE(irq) (ipi_info[XC_CPUPOKE_PIL].mi_irq == (irq))

/*
 * Non-zero when the bit for event channel 'ev' is set in the shared-info
 * evtchn_mask bitmap.  The word is selected by ev >> EVTCHN_SHIFT and the
 * bit within that word by the low EVTCHN_SHIFT bits of ev.
 */
#define	EVTCHN_MASKED(ev) \
	(HYPERVISOR_shared_info->evtchn_mask[(ev) >> EVTCHN_SHIFT] & \
	(1ul << ((ev) & ((1ul << EVTCHN_SHIFT) - 1))))
184 
/*
 * Per-event-channel state, indexed by event channel number.
 *
 * evtchn_to_irq:  IRQ the channel is mapped to, or INVALID_IRQ (set for
 *                 every channel by ec_init()).
 * evtchn_cpus:    set of CPUs the channel is bound to; mirrored into each
 *                 CPU's evt_affinity mask by update_evtchn_affinity().
 * evtchn_owner / evtchn_owner_thread:
 *                 NOTE(review): not referenced in this part of the file;
 *                 presumably used by the interrupt dispatch code - confirm.
 */
static short evtchn_to_irq[NR_EVENT_CHANNELS];
static cpuset_t evtchn_cpus[NR_EVENT_CHANNELS];
static int	evtchn_owner[NR_EVENT_CHANNELS];
#ifdef DEBUG
static kthread_t *evtchn_owner_thread[NR_EVENT_CHANNELS];
#endif

/*
 * irq_info is indexed by IRQ, ipi_info by IPL and virq_info by VIRQ number.
 * The mec_info_t entries hold the per-cpu event channels (mi_evtchns[]) and
 * the IRQ they map to (mi_irq).
 */
static irq_info_t irq_info[NR_IRQS];
static mec_info_t ipi_info[MAXIPL];
static mec_info_t virq_info[NR_VIRQS];
/*
 * Mailbox for communication with the evtchn device driver.
 * We rely on only cpu 0 servicing the event channels associated
 * with the driver.  i.e. all evtchn driver evtchns are bound to cpu 0.
 */
volatile int ec_dev_mbox;	/* mailbox for evtchn device driver */

/*
 * Protects updates to the mapping data; see the "Locking" section of the
 * block comment at the top of this file.
 */
kmutex_t ec_lock;

/*
 * Bitmap indicating which PIRQs require the hypervisor to be notified
 * on unmask.  Maintained by pirq_query_unmask() and consulted by
 * pirq_unmask_notify().
 */
static unsigned long pirq_needs_eoi[NR_PIRQS / (sizeof (unsigned long) * NBBY)];

/* IRQ bound to VIRQ_DEBUG by ec_init_debug_irq(); INVALID_IRQ until then. */
static int ec_debug_irq = INVALID_IRQ;
/* The IRQT_DEV_EVTCHN IRQ used by /dev/xen/evtchn; INVALID_IRQ until set. */
int ec_dev_irq = INVALID_IRQ;
215 
216 int
217 xen_bind_virq(unsigned int virq, processorid_t cpu, int *port)
218 {
219 	evtchn_bind_virq_t bind;
220 	int err;
221 
222 	bind.virq = virq;
223 	bind.vcpu = cpu;
224 	if ((err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind)) == 0)
225 		*port = bind.port;
226 	else
227 		err = xen_xlate_errcode(err);
228 	return (err);
229 }
230 
231 int
232 xen_bind_interdomain(int domid, int remote_port, int *port)
233 {
234 	evtchn_bind_interdomain_t bind;
235 	int err;
236 
237 	bind.remote_dom  = domid;
238 	bind.remote_port = remote_port;
239 	if ((err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
240 	    &bind)) == 0)
241 		*port = bind.local_port;
242 	else
243 		err = xen_xlate_errcode(err);
244 	return (err);
245 }
246 
247 int
248 xen_alloc_unbound_evtchn(int domid, int *evtchnp)
249 {
250 	evtchn_alloc_unbound_t alloc;
251 	int err;
252 
253 	alloc.dom = DOMID_SELF;
254 	alloc.remote_dom = domid;
255 
256 	if ((err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
257 	    &alloc)) == 0) {
258 		*evtchnp = alloc.port;
259 		/* ensure evtchn is masked till we're ready to use it */
260 		(void) ec_mask_evtchn(*evtchnp);
261 	} else {
262 		err = xen_xlate_errcode(err);
263 	}
264 
265 	return (err);
266 }
267 
268 static int
269 xen_close_evtchn(int evtchn)
270 {
271 	evtchn_close_t close;
272 	int err;
273 
274 	close.port = evtchn;
275 	err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
276 	if (err)
277 		err = xen_xlate_errcode(err);
278 	return (err);
279 }
280 
281 static int
282 xen_bind_ipi(processorid_t cpu)
283 {
284 	evtchn_bind_ipi_t bind;
285 
286 	ASSERT(MUTEX_HELD(&ec_lock));
287 
288 	bind.vcpu = cpu;
289 	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind) != 0)
290 		panic("xen_bind_ipi() failed");
291 	return (bind.port);
292 }
293 
294 /* Send future instances of this interrupt to other vcpu. */
295 static void
296 xen_bind_vcpu(int evtchn, int cpu)
297 {
298 	evtchn_bind_vcpu_t bind;
299 
300 	ASSERT(MUTEX_HELD(&ec_lock));
301 
302 	bind.port = evtchn;
303 	bind.vcpu = cpu;
304 	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind) != 0)
305 		panic("xen_bind_vcpu() failed");
306 }
307 
308 static int
309 xen_bind_pirq(int pirq)
310 {
311 	evtchn_bind_pirq_t bind;
312 	int ret;
313 
314 	bind.pirq = pirq;
315 	bind.flags = BIND_PIRQ__WILL_SHARE;
316 	if ((ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind)) != 0)
317 		panic("xen_bind_pirq() failed (err %d)", ret);
318 	return (bind.port);
319 }
320 
321 /* unmask an evtchn and send upcall to appropriate vcpu if pending bit is set */
322 static void
323 xen_evtchn_unmask(int evtchn)
324 {
325 	evtchn_unmask_t unmask;
326 
327 	unmask.port = evtchn;
328 	if (HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask) != 0)
329 		panic("xen_evtchn_unmask() failed");
330 }
331 
332 static void
333 update_evtchn_affinity(int evtchn)
334 {
335 	cpu_t *cp;
336 	struct xen_evt_data *cpe;
337 
338 	ASSERT(evtchn_to_irq[evtchn] != INVALID_IRQ);
339 	ASSERT(MUTEX_HELD(&ec_lock));
340 
341 	/*
342 	 * Use lockless search of cpu_list, similar to mutex_vector_enter().
343 	 */
344 	kpreempt_disable();
345 	cp = cpu_list;
346 	do {
347 		cpe = cp->cpu_m.mcpu_evt_pend;
348 		if (CPU_IN_SET(evtchn_cpus[evtchn], cp->cpu_id))
349 			SET_EVTCHN_BIT(evtchn, cpe->evt_affinity);
350 		else
351 			CLEAR_EVTCHN_BIT(evtchn, cpe->evt_affinity);
352 	} while ((cp = cp->cpu_next) != cpu_list);
353 	kpreempt_enable();
354 }
355 
356 static void
357 bind_evtchn_to_cpuset(int evtchn, cpuset_t cpus)
358 {
359 	ASSERT(evtchn_to_irq[evtchn] != INVALID_IRQ);
360 
361 	CPUSET_ZERO(evtchn_cpus[evtchn]);
362 	CPUSET_OR(evtchn_cpus[evtchn], cpus);
363 	update_evtchn_affinity(evtchn);
364 }
365 
366 static void
367 clear_evtchn_affinity(int evtchn)
368 {
369 	CPUSET_ZERO(evtchn_cpus[evtchn]);
370 	update_evtchn_affinity(evtchn);
371 }
372 
373 static void
374 alloc_irq_evtchn(int irq, int index, int evtchn, int cpu)
375 {
376 	irq_info_t *irqp = &irq_info[irq];
377 
378 	switch (irqp->ii_type) {
379 	case IRQT_IPI:
380 		ipi_info[index].mi_evtchns[cpu] = evtchn;
381 		irqp->ii_u.index = index;
382 		break;
383 	case IRQT_VIRQ:
384 		virq_info[index].mi_evtchns[cpu] = evtchn;
385 		irqp->ii_u.index = index;
386 		break;
387 	default:
388 		irqp->ii_u.evtchn = evtchn;
389 		break;
390 	}
391 
392 	evtchn_to_irq[evtchn] = irq;
393 
394 	/*
395 	 * If a CPU is not specified, we expect to bind it to a CPU later via
396 	 * the PSM.
397 	 */
398 	if (cpu != -1) {
399 		cpuset_t tcpus;
400 		CPUSET_ONLY(tcpus, cpu);
401 		bind_evtchn_to_cpuset(evtchn, tcpus);
402 	}
403 }
404 
405 static int
406 alloc_irq(int type, int index, int evtchn, int cpu)
407 {
408 	int irq;
409 	irq_info_t *irqp;
410 
411 	ASSERT(MUTEX_HELD(&ec_lock));
412 	ASSERT(type != IRQT_IPI || cpu != -1);
413 
414 	for (irq = 0; irq < NR_IRQS; irq++) {
415 		if (irq_info[irq].ii_type == IRQT_UNBOUND)
416 			break;
417 	}
418 
419 	if (irq == NR_IRQS)
420 		panic("No available IRQ to bind to: increase NR_IRQS!\n");
421 
422 	irqp = &irq_info[irq];
423 
424 	irqp->ii_type = type;
425 	/*
426 	 * Set irq/has_handler field to zero which means handler not installed
427 	 */
428 	irqp->ii_u2.has_handler = 0;
429 
430 	alloc_irq_evtchn(irq, index, evtchn, cpu);
431 	return (irq);
432 }
433 
434 static int
435 irq_evtchn(irq_info_t *irqp)
436 {
437 	int evtchn;
438 
439 	ASSERT(irqp->ii_type != IRQT_DEV_EVTCHN);
440 
441 	switch (irqp->ii_type) {
442 	case IRQT_IPI:
443 		ASSERT(irqp->ii_u.index != 0);
444 		evtchn = ipi_info[irqp->ii_u.index].mi_evtchns[CPU->cpu_id];
445 		break;
446 	case IRQT_VIRQ:
447 		evtchn = virq_info[irqp->ii_u.index].mi_evtchns[CPU->cpu_id];
448 		break;
449 	default:
450 		evtchn = irqp->ii_u.evtchn;
451 		break;
452 	}
453 
454 	return (evtchn);
455 }
456 
457 static void
458 unbind_evtchn(ushort_t *evtchnp)
459 {
460 	int err;
461 
462 	ASSERT(MUTEX_HELD(&ec_lock));
463 
464 	ASSERT(*evtchnp != 0);
465 
466 	err = xen_close_evtchn(*evtchnp);
467 	ASSERT(err == 0);
468 	clear_evtchn_affinity(*evtchnp);
469 	evtchn_to_irq[*evtchnp] = INVALID_IRQ;
470 	*evtchnp = 0;
471 }
472 
473 static void
474 pirq_unmask_notify(int pirq)
475 {
476 	struct physdev_eoi eoi;
477 
478 	if (TEST_EVTCHN_BIT(pirq, &pirq_needs_eoi[0])) {
479 		eoi.irq = pirq;
480 		(void) HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
481 	}
482 }
483 
484 static void
485 pirq_query_unmask(int pirq)
486 {
487 	struct physdev_irq_status_query irq_status;
488 
489 	irq_status.irq = pirq;
490 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status);
491 	CLEAR_EVTCHN_BIT(pirq, &pirq_needs_eoi[0]);
492 	if (irq_status.flags & XENIRQSTAT_needs_eoi)
493 		SET_EVTCHN_BIT(pirq, &pirq_needs_eoi[0]);
494 }
495 
496 static void
497 end_pirq(int irq)
498 {
499 	int evtchn = irq_evtchn(&irq_info[irq]);
500 
501 	ec_unmask_evtchn(evtchn);
502 	pirq_unmask_notify(IRQ_TO_PIRQ(irq));
503 }
504 
505 /*
506  * probe if a pirq is available to bind to, return 1 if available
507  * else return 0.
508  * Note that for debug versions of xen this probe may cause an in use IRQ
509  * warning message from xen.
510  */
511 int
512 ec_probe_pirq(int pirq)
513 {
514 	evtchn_bind_pirq_t bind;
515 
516 	bind.pirq = pirq;
517 	bind.flags = 0;
518 	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind) != 0) {
519 		return (0);
520 	} else {
521 		(void) xen_close_evtchn(bind.port);
522 		return (1);
523 	}
524 }
525 
526 /*
527  * Bind an event channel to a vcpu
528  */
529 void
530 ec_bind_vcpu(int evtchn, int cpu)
531 {
532 	mutex_enter(&ec_lock);
533 	xen_bind_vcpu(evtchn, cpu);
534 	mutex_exit(&ec_lock);
535 }
536 
537 /*
538  * Set up a physical device irq to be associated with an event channel.
539  */
540 void
541 ec_setup_pirq(int irq, int ipl, cpuset_t *cpusp)
542 {
543 	int evtchn;
544 	irq_info_t *irqp = &irq_info[irq];
545 
546 	/*
547 	 * Test if this PIRQ is already bound to an evtchn,
548 	 * which means it is a shared IRQ and we don't want to
549 	 * bind and do some initial setup that has already been
550 	 * done for this irq on a previous trip through this code.
551 	 */
552 	if (irqp->ii_u.evtchn == INVALID_EVTCHN) {
553 		evtchn = xen_bind_pirq(irq);
554 
555 		pirq_query_unmask(IRQ_TO_PIRQ(irq));
556 
557 		irqp->ii_type = IRQT_PIRQ;
558 		irqp->ii_u.evtchn = evtchn;
559 
560 		evtchn_to_irq[evtchn] = irq;
561 		irqp->ii_u2.ipl = ipl;
562 		ec_set_irq_affinity(irq, *cpusp);
563 		ec_enable_irq(irq);
564 		pirq_unmask_notify(IRQ_TO_PIRQ(irq));
565 	} else {
566 		ASSERT(irqp->ii_u2.ipl != 0);
567 		cmn_err(CE_NOTE, "IRQ%d is shared", irq);
568 		if (ipl > irqp->ii_u2.ipl)
569 			irqp->ii_u2.ipl = ipl;
570 		*cpusp = evtchn_cpus[irqp->ii_u.evtchn];
571 	}
572 }
573 
574 void
575 ec_unbind_irq(int irq)
576 {
577 	irq_info_t *irqp = &irq_info[irq];
578 	mec_info_t *virqp;
579 	int drop_lock = 0;
580 	int type, i;
581 
582 	/*
583 	 * Nasty, but we need this during suspend.
584 	 */
585 	if (mutex_owner(&ec_lock) != curthread) {
586 		mutex_enter(&ec_lock);
587 		drop_lock = 1;
588 	}
589 
590 	type = irqp->ii_type;
591 
592 	ASSERT((type == IRQT_EVTCHN) || (type == IRQT_PIRQ) ||
593 	    (type == IRQT_VIRQ));
594 
595 	if ((type == IRQT_EVTCHN) || (type == IRQT_PIRQ)) {
596 		/* There's only one event channel associated with this irq */
597 		unbind_evtchn(&irqp->ii_u.evtchn);
598 	} else if (type == IRQT_VIRQ) {
599 		/*
600 		 * Each cpu on the system can have it's own event channel
601 		 * associated with a virq.  Unbind them all.
602 		 */
603 		virqp = &virq_info[irqp->ii_u.index];
604 		for (i = 0; i < NCPU; i++) {
605 			if (virqp->mi_evtchns[i] != 0)
606 				unbind_evtchn(&virqp->mi_evtchns[i]);
607 		}
608 		/* Mark the virq structure as invalid. */
609 		virqp->mi_irq = INVALID_IRQ;
610 	}
611 
612 	bzero(irqp, sizeof (*irqp));
613 	/* Re-reserve PIRQ. */
614 	if (type == IRQT_PIRQ)
615 		irqp->ii_type = IRQT_PIRQ;
616 
617 	if (drop_lock)
618 		mutex_exit(&ec_lock);
619 }
620 
621 /*
622  * Rebind an event channel for delivery to a CPU.
623  */
624 void
625 ec_set_irq_affinity(int irq, cpuset_t dest)
626 {
627 	int evtchn, tcpu;
628 	irq_info_t *irqp = &irq_info[irq];
629 
630 	mutex_enter(&ec_lock);
631 
632 	ASSERT(irq < NR_IRQS);
633 	ASSERT(irqp->ii_type != IRQT_UNBOUND);
634 
635 	/*
636 	 * Binding is done at allocation time for these types, so we should
637 	 * never modify them.
638 	 */
639 	if (irqp->ii_type == IRQT_IPI || irqp->ii_type == IRQT_VIRQ ||
640 	    irqp->ii_type == IRQT_DEV_EVTCHN) {
641 		mutex_exit(&ec_lock);
642 		return;
643 	}
644 
645 	CPUSET_FIND(dest, tcpu);
646 	ASSERT(tcpu != CPUSET_NOTINSET);
647 
648 	evtchn = irq_evtchn(irqp);
649 
650 	xen_bind_vcpu(evtchn, tcpu);
651 
652 	bind_evtchn_to_cpuset(evtchn, dest);
653 
654 	mutex_exit(&ec_lock);
655 
656 	/*
657 	 * Now send the new target processor a NOP IPI.
658 	 * It will check for any pending interrupts, and so service any that
659 	 * got delivered to the wrong processor by mistake.
660 	 */
661 	poke_cpu(tcpu);
662 }
663 
664 int
665 ec_set_irq_priority(int irq, int pri)
666 {
667 	irq_info_t *irqp;
668 
669 	if (irq >= NR_IRQS)
670 		return (-1);
671 
672 	irqp = &irq_info[irq];
673 
674 	if (irqp->ii_type == IRQT_UNBOUND)
675 		return (-1);
676 
677 	irqp->ii_u2.ipl = pri;
678 
679 	return (0);
680 }
681 
682 void
683 ec_clear_irq_priority(int irq)
684 {
685 	irq_info_t *irqp = &irq_info[irq];
686 
687 	ASSERT(irq < NR_IRQS);
688 	ASSERT(irqp->ii_type != IRQT_UNBOUND);
689 
690 	irqp->ii_u2.ipl = 0;
691 }
692 
693 int
694 ec_bind_evtchn_to_irq(int evtchn)
695 {
696 	mutex_enter(&ec_lock);
697 
698 	ASSERT(evtchn_to_irq[evtchn] == INVALID_IRQ);
699 
700 	(void) alloc_irq(IRQT_EVTCHN, 0, evtchn, -1);
701 
702 	mutex_exit(&ec_lock);
703 	return (evtchn_to_irq[evtchn]);
704 }
705 
706 int
707 ec_bind_virq_to_irq(int virq, int cpu)
708 {
709 	int err;
710 	int evtchn;
711 	mec_info_t *virqp;
712 
713 	virqp = &virq_info[virq];
714 	mutex_enter(&ec_lock);
715 
716 	err = xen_bind_virq(virq, cpu, &evtchn);
717 	ASSERT(err == 0);
718 
719 	ASSERT(evtchn_to_irq[evtchn] == INVALID_IRQ);
720 
721 	if (virqp->mi_irq == INVALID_IRQ) {
722 		virqp->mi_irq = alloc_irq(IRQT_VIRQ, virq, evtchn, cpu);
723 	} else {
724 		alloc_irq_evtchn(virqp->mi_irq, virq, evtchn, cpu);
725 	}
726 
727 	mutex_exit(&ec_lock);
728 
729 	return (virqp->mi_irq);
730 }
731 
732 int
733 ec_bind_ipi_to_irq(int ipl, int cpu)
734 {
735 	int evtchn;
736 	ulong_t flags;
737 	mec_info_t *ipip;
738 
739 	mutex_enter(&ec_lock);
740 
741 	ipip = &ipi_info[ipl];
742 
743 	evtchn = xen_bind_ipi(cpu);
744 
745 	ASSERT(evtchn_to_irq[evtchn] == INVALID_IRQ);
746 
747 	if (ipip->mi_irq == INVALID_IRQ) {
748 		ipip->mi_irq = alloc_irq(IRQT_IPI, ipl, evtchn, cpu);
749 	} else {
750 		alloc_irq_evtchn(ipip->mi_irq, ipl, evtchn, cpu);
751 	}
752 
753 	/*
754 	 * Unmask the new evtchn so that it can be seen by the target cpu
755 	 */
756 	flags = intr_clear();
757 	ec_unmask_evtchn(evtchn);
758 	intr_restore(flags);
759 
760 	mutex_exit(&ec_lock);
761 	return (ipip->mi_irq);
762 }
763 
764 /*
765  * When bringing up a CPU, bind to all the IPIs that CPU0 bound.
766  */
767 void
768 ec_bind_cpu_ipis(int cpu)
769 {
770 	int i;
771 
772 	for (i = 0; i < MAXIPL; i++) {
773 		mec_info_t *ipip = &ipi_info[i];
774 		if (ipip->mi_irq == INVALID_IRQ)
775 			continue;
776 
777 		(void) ec_bind_ipi_to_irq(i, cpu);
778 	}
779 }
780 
781 /*
782  * Can this IRQ be rebound to another CPU?
783  */
784 int
785 ec_irq_rebindable(int irq)
786 {
787 	irq_info_t *irqp = &irq_info[irq];
788 
789 	if (irqp->ii_u.evtchn == 0)
790 		return (0);
791 
792 	return (irqp->ii_type == IRQT_EVTCHN || irqp->ii_type == IRQT_PIRQ);
793 }
794 
795 /*
796  * Should this IRQ be unbound from this CPU (which is being offlined) to
797  * another?
798  */
799 int
800 ec_irq_needs_rebind(int irq, int cpu)
801 {
802 	irq_info_t *irqp = &irq_info[irq];
803 
804 	return (ec_irq_rebindable(irq) &&
805 	    CPU_IN_SET(evtchn_cpus[irqp->ii_u.evtchn], cpu));
806 }
807 
808 void
809 ec_send_ipi(int ipl, int cpu)
810 {
811 	mec_info_t *ipip = &ipi_info[ipl];
812 
813 	ASSERT(ipip->mi_irq != INVALID_IRQ);
814 
815 	ec_notify_via_evtchn(ipip->mi_evtchns[cpu]);
816 }
817 
818 void
819 ec_try_ipi(int ipl, int cpu)
820 {
821 	mec_info_t *ipip = &ipi_info[ipl];
822 
823 	if (ipip->mi_irq == INVALID_IRQ || ipip->mi_irq == 0)
824 		return;
825 
826 	ec_notify_via_evtchn(ipip->mi_evtchns[cpu]);
827 }
828 
829 void
830 ec_irq_add_evtchn(int irq, int evtchn)
831 {
832 	mutex_enter(&ec_lock);
833 
834 	/*
835 	 * See description of IRQT_DEV_EVTCHN above.
836 	 */
837 	ASSERT(irq == ec_dev_irq);
838 
839 	alloc_irq_evtchn(irq, 0, evtchn, 0);
840 	/*
841 	 * We enforce that the representative event channel for IRQT_DEV_EVTCHN
842 	 * is zero, so PSM operations on it have no effect.
843 	 */
844 	irq_info[irq].ii_u.evtchn = 0;
845 	mutex_exit(&ec_lock);
846 }
847 
848 void
849 ec_irq_rm_evtchn(int irq, int evtchn)
850 {
851 	ushort_t ec = evtchn;
852 
853 	mutex_enter(&ec_lock);
854 	ASSERT(irq == ec_dev_irq);
855 	unbind_evtchn(&ec);
856 	mutex_exit(&ec_lock);
857 }
858 
859 /*
860  * Allocate an /dev/xen/evtchn IRQ.  See the big comment at the top
861  * for an explanation.
862  */
863 int
864 ec_dev_alloc_irq(void)
865 {
866 	int i;
867 	irq_info_t *irqp;
868 
869 	for (i = 0; i < NR_IRQS; i++) {
870 		if (irq_info[i].ii_type == IRQT_UNBOUND)
871 			break;
872 	}
873 
874 	ASSERT(i != NR_IRQS);
875 
876 	irqp = &irq_info[i];
877 	irqp->ii_type = IRQT_DEV_EVTCHN;
878 	irqp->ii_u2.ipl = IPL_EVTCHN;
879 	/*
880 	 * Force the evtchn to zero for the special evtchn device irq
881 	 */
882 	irqp->ii_u.evtchn = 0;
883 	return (i);
884 }
885 
886 void
887 ec_enable_irq(unsigned int irq)
888 {
889 	ulong_t flag;
890 	irq_info_t *irqp = &irq_info[irq];
891 
892 	if (irqp->ii_type == IRQT_DEV_EVTCHN)
893 		return;
894 
895 	flag = intr_clear();
896 	ec_unmask_evtchn(irq_evtchn(irqp));
897 	intr_restore(flag);
898 }
899 
900 void
901 ec_disable_irq(unsigned int irq)
902 {
903 	irq_info_t *irqp = &irq_info[irq];
904 
905 	if (irqp->ii_type == IRQT_DEV_EVTCHN)
906 		return;
907 
908 	/*
909 	 * Spin till we are the one to mask the evtchn
910 	 * Ensures no one else can be servicing this evtchn.
911 	 */
912 	while (!ec_mask_evtchn(irq_evtchn(irqp)))
913 		SMT_PAUSE();
914 }
915 
916 static int
917 ec_evtchn_pending(uint_t ev)
918 {
919 	uint_t evi;
920 	shared_info_t *si = HYPERVISOR_shared_info;
921 
922 	evi = ev >> EVTCHN_SHIFT;
923 	ev &= (1ul << EVTCHN_SHIFT) - 1;
924 	return ((si->evtchn_pending[evi] & (1ul << ev)) != 0);
925 }
926 
927 int
928 ec_pending_irq(unsigned int irq)
929 {
930 	int evtchn = irq_evtchn(&irq_info[irq]);
931 
932 	return (ec_evtchn_pending(evtchn));
933 }
934 
935 void
936 ec_clear_irq(int irq)
937 {
938 	irq_info_t *irqp = &irq_info[irq];
939 	int evtchn;
940 
941 	if (irqp->ii_type == IRQT_DEV_EVTCHN)
942 		return;
943 
944 	ASSERT(irqp->ii_type != IRQT_UNBOUND);
945 
946 	evtchn = irq_evtchn(irqp);
947 
948 	ASSERT(EVTCHN_MASKED(evtchn));
949 	ec_clear_evtchn(evtchn);
950 }
951 
952 void
953 ec_unmask_irq(int irq)
954 {
955 	ulong_t flags;
956 	irq_info_t *irqp = &irq_info[irq];
957 
958 	flags = intr_clear();
959 	switch (irqp->ii_type) {
960 	case IRQT_PIRQ:
961 		end_pirq(irq);
962 		break;
963 	case IRQT_DEV_EVTCHN:
964 		break;
965 	default:
966 		ec_unmask_evtchn(irq_evtchn(irqp));
967 		break;
968 	}
969 	intr_restore(flags);
970 }
971 
972 void
973 ec_try_unmask_irq(int irq)
974 {
975 	ulong_t flags;
976 	irq_info_t *irqp = &irq_info[irq];
977 	int evtchn;
978 
979 	flags = intr_clear();
980 	switch (irqp->ii_type) {
981 	case IRQT_PIRQ:
982 		end_pirq(irq);
983 		break;
984 	case IRQT_DEV_EVTCHN:
985 		break;
986 	default:
987 		if ((evtchn = irq_evtchn(irqp)) != 0)
988 			ec_unmask_evtchn(evtchn);
989 		break;
990 	}
991 	intr_restore(flags);
992 }
993 
994 /*
995  * Poll until an event channel is ready or 'check_func' returns true.  This can
996  * only be used in a situation where interrupts are masked, otherwise we have a
997  * classic time-of-check vs. time-of-use race.
998  */
999 void
1000 ec_wait_on_evtchn(int evtchn, int (*check_func)(void *), void *arg)
1001 {
1002 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1003 		while (!check_func(arg))
1004 			(void) HYPERVISOR_yield();
1005 		return;
1006 	}
1007 
1008 	ASSERT(CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask != 0);
1009 
1010 	for (;;) {
1011 		evtchn_port_t ports[1];
1012 
1013 		ports[0] = evtchn;
1014 
1015 		ec_clear_evtchn(evtchn);
1016 
1017 		if (check_func(arg))
1018 			return;
1019 
1020 		(void) HYPERVISOR_poll(ports, 1, 0);
1021 	}
1022 }
1023 
1024 void
1025 ec_wait_on_ipi(int ipl, int (*check_func)(void *), void *arg)
1026 {
1027 	mec_info_t *ipip = &ipi_info[ipl];
1028 
1029 	if (ipip->mi_irq == INVALID_IRQ || ipip->mi_irq == 0)
1030 		return;
1031 
1032 	ec_wait_on_evtchn(ipip->mi_evtchns[CPU->cpu_id], check_func, arg);
1033 }
1034 
1035 void
1036 ec_suspend(void)
1037 {
1038 	irq_info_t *irqp;
1039 	ushort_t *evtchnp;
1040 	int i;
1041 	int c;
1042 
1043 	ASSERT(MUTEX_HELD(&ec_lock));
1044 
1045 	for (i = 0; i < MAXIPL; i++) {
1046 		if (ipi_info[i].mi_irq == INVALID_IRQ)
1047 			continue;
1048 
1049 		for (c = 0; c < NCPU; c++) {
1050 			if (cpu[c] == NULL)
1051 				continue;
1052 
1053 			if (CPU_IN_SET(cpu_suspend_lost_set, c))
1054 				continue;
1055 
1056 			evtchnp = &ipi_info[i].mi_evtchns[c];
1057 			ASSERT(*evtchnp != 0);
1058 			unbind_evtchn(evtchnp);
1059 		}
1060 	}
1061 
1062 	for (i = 0; i < NR_VIRQS; i++) {
1063 		if (virq_info[i].mi_irq == INVALID_IRQ)
1064 			continue;
1065 
1066 		/*
1067 		 * If we're sharing a single event channel across all CPUs, we
1068 		 * should only unbind once.
1069 		 */
1070 		if (virq_info[i].mi_shared) {
1071 			evtchnp = &virq_info[i].mi_evtchns[0];
1072 			unbind_evtchn(evtchnp);
1073 			for (c = 1; c < NCPU; c++)
1074 				virq_info[i].mi_evtchns[c] = 0;
1075 		} else {
1076 			for (c = 0; c < NCPU; c++) {
1077 				if (cpu[c] == NULL)
1078 					continue;
1079 
1080 				evtchnp = &virq_info[i].mi_evtchns[c];
1081 				if (*evtchnp != 0)
1082 					unbind_evtchn(evtchnp);
1083 			}
1084 		}
1085 	}
1086 
1087 	for (i = 0; i < NR_IRQS; i++) {
1088 		irqp = &irq_info[i];
1089 
1090 		switch (irqp->ii_type) {
1091 		case IRQT_EVTCHN:
1092 		case IRQT_DEV_EVTCHN:
1093 			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
1094 			break;
1095 		case IRQT_PIRQ:
1096 			if (irqp->ii_u.evtchn != 0)
1097 				(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
1098 			break;
1099 		default:
1100 			break;
1101 		}
1102 	}
1103 }
1104 
1105 /*
1106  * The debug irq is special, we only have one evtchn and irq but we allow all
1107  * cpus to service it.  It's marked as shared and we propogate the event
1108  * channel into all CPUs by hand.
1109  */
1110 static void
1111 share_virq(mec_info_t *virqp)
1112 {
1113 	int evtchn = virqp->mi_evtchns[0];
1114 	cpuset_t tset;
1115 	int i;
1116 
1117 	ASSERT(evtchn != 0);
1118 
1119 	virqp->mi_shared = 1;
1120 
1121 	for (i = 1; i < NCPU; i++)
1122 		virqp->mi_evtchns[i] = evtchn;
1123 	CPUSET_ALL(tset);
1124 	bind_evtchn_to_cpuset(evtchn, tset);
1125 }
1126 
1127 static void
1128 virq_resume(int virq)
1129 {
1130 	mec_info_t *virqp = &virq_info[virq];
1131 	int evtchn;
1132 	int i, err;
1133 
1134 	for (i = 0; i < NCPU; i++) {
1135 		cpuset_t tcpus;
1136 
1137 		if (cpu[i] == NULL || CPU_IN_SET(cpu_suspend_lost_set, i))
1138 			continue;
1139 
1140 		err = xen_bind_virq(virq, i, &evtchn);
1141 		ASSERT(err == 0);
1142 
1143 		virqp->mi_evtchns[i] = evtchn;
1144 		evtchn_to_irq[evtchn] = virqp->mi_irq;
1145 		CPUSET_ONLY(tcpus, i);
1146 		bind_evtchn_to_cpuset(evtchn, tcpus);
1147 		ec_unmask_evtchn(evtchn);
1148 		/*
1149 		 * only timer VIRQ is bound to all cpus
1150 		 */
1151 		if (virq != VIRQ_TIMER)
1152 			break;
1153 	}
1154 
1155 	if (virqp->mi_shared)
1156 		share_virq(virqp);
1157 }
1158 
1159 static void
1160 ipi_resume(int ipl)
1161 {
1162 	mec_info_t *ipip = &ipi_info[ipl];
1163 	int i;
1164 
1165 	for (i = 0; i < NCPU; i++) {
1166 		cpuset_t tcpus;
1167 		int evtchn;
1168 
1169 		if (cpu[i] == NULL || CPU_IN_SET(cpu_suspend_lost_set, i))
1170 			continue;
1171 
1172 		evtchn = xen_bind_ipi(i);
1173 		ipip->mi_evtchns[i] = evtchn;
1174 		evtchn_to_irq[evtchn] = ipip->mi_irq;
1175 		CPUSET_ONLY(tcpus, i);
1176 		bind_evtchn_to_cpuset(evtchn, tcpus);
1177 		ec_unmask_evtchn(evtchn);
1178 	}
1179 }
1180 
1181 void
1182 ec_resume(void)
1183 {
1184 	int i;
1185 
1186 	/* New event-channel space is not 'live' yet. */
1187 	for (i = 0; i < NR_EVENT_CHANNELS; i++)
1188 		(void) ec_mask_evtchn(i);
1189 
1190 	for (i = 0; i < MAXIPL; i++) {
1191 		if (ipi_info[i].mi_irq == INVALID_IRQ)
1192 			continue;
1193 		ipi_resume(i);
1194 	}
1195 
1196 	for (i = 0; i < NR_VIRQS; i++) {
1197 		if (virq_info[i].mi_irq == INVALID_IRQ)
1198 			continue;
1199 		virq_resume(i);
1200 	}
1201 }
1202 
1203 void
1204 ec_init(void)
1205 {
1206 	int i;
1207 	mutex_init(&ec_lock, NULL, MUTEX_SPIN, (void *)ipltospl(SPL7));
1208 
1209 	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
1210 		CPUSET_ZERO(evtchn_cpus[i]);
1211 		evtchn_to_irq[i] = INVALID_IRQ;
1212 		(void) ec_mask_evtchn(i);
1213 	}
1214 
1215 	for (i = 0; i < MAXIPL; i++)
1216 		ipi_info[i].mi_irq = INVALID_IRQ;
1217 
1218 	for (i = 0; i < NR_VIRQS; i++)
1219 		virq_info[i].mi_irq = INVALID_IRQ;
1220 
1221 	/*
1222 	 * Phys IRQ space is statically bound (1:1 mapping), grab the IRQs
1223 	 * now.
1224 	 */
1225 	for (i = PIRQ_BASE; i < NR_PIRQS; i++) {
1226 		irq_info[PIRQ_TO_IRQ(i)].ii_type = IRQT_PIRQ;
1227 	}
1228 }
1229 
1230 void
1231 ec_init_debug_irq()
1232 {
1233 	int irq;
1234 
1235 	irq = ec_bind_virq_to_irq(VIRQ_DEBUG, 0);
1236 	(void) add_avintr(NULL, IPL_DEBUG, (avfunc)xen_debug_handler,
1237 	    "debug", irq, NULL, NULL, NULL, NULL);
1238 
1239 	mutex_enter(&ec_lock);
1240 	share_virq(&virq_info[irq_info[irq].ii_u.index]);
1241 	mutex_exit(&ec_lock);
1242 	ec_debug_irq = irq;
1243 }
1244 
/*
 * Bits of word 'ix' that are set in (si)->evtchn_pending, clear in
 * (si)->evtchn_mask, and set in (cpe)->evt_affinity.  The 'cpu_id'
 * argument is accepted but does not appear in the expansion.
 */
#define	UNBLOCKED_EVENTS(si, ix, cpe, cpu_id)		\
	((si)->evtchn_pending[ix] &			\
	    ~(si)->evtchn_mask[ix] &			\
	    (cpe)->evt_affinity[ix])
1248 
/*
 * This is the entry point for processing events from xen
 *
 * (See the commentary associated with the shared_info_st structure
 * in hypervisor-if.h)
 *
 * Since the event channel mechanism doesn't really implement the
 * concept of priority like hardware interrupt controllers, we simulate
 * that in software here using the cpu priority field and the pending
 * interrupts field.  Events/interrupts that are not able to be serviced
 * now because they are at a lower priority than the current cpu priority
 * cause a level bit to be recorded in the pending interrupts word.  When
 * the priority is lowered (either by spl or interrupt exit code) the pending
 * levels are checked and an upcall is scheduled if there are events/interrupts
 * that have become deliverable.
 *
 * The handler works in two phases:
 *
 *  1. Gather: the per-vcpu pending selector is atomically fetched and
 *     zeroed, and every unblocked event found through it is masked and
 *     recorded in this cpu's per-priority pending bitmaps (pending_sel[]
 *     and pending_evts[][]) and in mcpu_intr_pending.
 *
 *  2. Dispatch: recorded events with a priority above the current cpu
 *     priority are handed to do_interrupt(), highest priority first.
 *
 * Arguments:
 *	rp	register state of the interrupted context; r_trapno is
 *		overwritten with the irq number before each do_interrupt()
 *		call.
 *	ttp	trap trace record; passed through to do_interrupt() and,
 *		under TRAPTRACE, filled in for events without a handler.
 */
void
xen_callback_handler(struct regs *rp, trap_trace_rec_t *ttp)
{
	ulong_t pending_sels, pe, selbit;
	int i, j, port, pri, curpri, irq;
	uint16_t pending_ints;
	struct cpu *cpu = CPU;
	volatile shared_info_t *si = HYPERVISOR_shared_info;
	volatile vcpu_info_t *vci = cpu->cpu_m.mcpu_vcpu_info;
	volatile struct xen_evt_data *cpe = cpu->cpu_m.mcpu_evt_pend;
	volatile uint16_t *cpu_ipp = &cpu->cpu_m.mcpu_intr_pending;

	ASSERT(rp->r_trapno == T_AST && rp->r_err == 0);
	ASSERT(&si->vcpu_info[cpu->cpu_id] == vci);
	ASSERT_STACK_ALIGNED();

	/* Acknowledge the upcall before scanning for events. */
	vci->evtchn_upcall_pending = 0;

	/*
	 * To expedite scanning of pending notifications, any 0->1
	 * pending transition on an unmasked channel causes a
	 * corresponding bit in evtchn_pending_sel to be set.
	 * Each bit in the selector covers one word of
	 * the evtchn_pending[] array.
	 *
	 * The loop below atomically fetches the selector and replaces it
	 * with zero, retrying if it changed between the read and the
	 * compare-and-swap.
	 */
	membar_enter();
	do {
		pending_sels = vci->evtchn_pending_sel;
	} while (atomic_cas_ulong((volatile ulong_t *)&vci->evtchn_pending_sel,
	    pending_sels, 0) != pending_sels);

	/*
	 * Phase 1: start from the levels already recorded as pending and
	 * add a level bit for every event gathered below.
	 */
	pending_ints = *cpu_ipp;
	while ((i = ffs(pending_sels)) != 0) {
		/* ffs() is 1-based; i becomes the index of the selected word */
		i--;
		selbit = 1ul << i;
		pending_sels &= ~selbit;

		membar_enter();
		/*
		 * pe is recomputed from the shared state by the loop
		 * condition on every pass, so each event handled in the
		 * body must end up either cleared or masked for the loop
		 * to make progress.
		 */
		while ((pe = UNBLOCKED_EVENTS(si, i, cpe, cpu->cpu_id)) != 0) {
			j = ffs(pe) - 1;
			pe &= ~(1ul << j);

			port = (i << EVTCHN_SHIFT) + j;

			irq = evtchn_to_irq[port];

			/*
			 * If no irq set, just ignore the event.
			 * On e.g. netbsd they call evtchn_device_upcall(port)
			 * We require the evtchn driver to install a handler
			 * so there will be an irq associated with user mode
			 * evtchns.
			 */
			if (irq == INVALID_IRQ) {
				ec_clear_evtchn(port);
				continue;
			}

			/*
			 * If there's no handler, it could be a poke, so just
			 * accept the event and continue.
			 */
			if (!irq_info[irq].ii_u2.has_handler) {
#ifdef TRAPTRACE
				ttp->ttr_ipl = 0xff;
				if (IRQ_IS_CPUPOKE(irq)) {
					ttp->ttr_ipl = XC_CPUPOKE_PIL;
					ttp->ttr_marker = TT_INTERRUPT;
				}
				ttp->ttr_pri = cpu->cpu_pri;
				ttp->ttr_spl = cpu->cpu_base_spl;
				ttp->ttr_vector = 0xff;
#endif /* TRAPTRACE */
				/*
				 * Only the cpu that wins the mask clears the
				 * pending bit and unmasks the channel again.
				 */
				if (ec_mask_evtchn(port)) {
					ec_clear_evtchn(port);
					ec_unmask_evtchn(port);
					continue;
				}
			}

			pri = irq_info[irq].ii_u2.ipl;

			/*
			 * If we are the cpu that successfully masks
			 * the event, then record it as a pending event
			 * for this cpu to service
			 */
			if (ec_mask_evtchn(port)) {
				if (ec_evtchn_pending(port)) {
					cpe->pending_sel[pri] |= selbit;
					cpe->pending_evts[pri][i] |= (1ul << j);
					pending_ints |= 1 << pri;
				} else {
					/*
					 * another cpu serviced this event
					 * before us, clear the mask.
					 */
					ec_unmask_evtchn(port);
				}
			}
		}
	}
	/* Publish the accumulated level bits; nothing to do if none set. */
	*cpu_ipp = pending_ints;
	if (pending_ints == 0)
		return;
	/*
	 * We have gathered all the pending events/interrupts,
	 * go service all the ones we can from highest priority to lowest.
	 * Note: This loop may not actually complete and service all
	 * pending interrupts since one of the interrupt threads may
	 * block and the pinned thread runs.  In that case, when we
	 * exit the interrupt thread that blocked we will check for
	 * any unserviced interrupts and re-post an upcall to process
	 * any unserviced pending events.
	 */
	curpri = cpu->cpu_pri;
	for (pri = bsrw_insn(*cpu_ipp); pri > curpri; pri--) {
		while ((pending_sels = cpe->pending_sel[pri]) != 0) {
			i = ffs(pending_sels) - 1;
			while ((pe = cpe->pending_evts[pri][i]) != 0) {
				j = ffs(pe) - 1;
				/*
				 * Remove the event from the pending bitmap
				 * before it is dispatched.
				 */
				pe &= ~(1ul << j);
				cpe->pending_evts[pri][i] = pe;
				if (pe == 0) {
					/*
					 * Must reload pending selector bits
					 * here as they could have changed on
					 * a previous trip around the inner loop
					 * while we were interrupt enabled
					 * in a interrupt service routine.
					 */
					pending_sels = cpe->pending_sel[pri];
					pending_sels &= ~(1ul << i);
					cpe->pending_sel[pri] = pending_sels;
					if (pending_sels == 0)
						*cpu_ipp &= ~(1 << pri);
				}
				port = (i << EVTCHN_SHIFT) + j;
				irq = evtchn_to_irq[port];
				if (irq == INVALID_IRQ) {
					/*
					 * No longer a handler for this event
					 * channel.  Clear the event and
					 * ignore it, unmask the event.
					 */
					ec_clear_evtchn(port);
					ec_unmask_evtchn(port);
					continue;
				}
				/*
				 * For the irq recorded in ec_dev_irq, the
				 * port number is stored in ec_dev_mbox
				 * before the interrupt is dispatched.
				 */
				if (irq == ec_dev_irq) {
					volatile int *tptr = &ec_dev_mbox;

					ASSERT(ec_dev_mbox == 0);
					/*
					 * NOTE: this gross store thru a pointer
					 * is necessary because of a Sun C
					 * compiler bug that does not properly
					 * honor a volatile declaration.
					 * we really should just be able to say
					 * 	ec_dev_mbox = port;
					 * here
					 */
					*tptr = port;
				}
				/*
				 * Set up the regs struct to
				 * look like a normal hardware int
				 * and do normal interrupt handling.
				 */
				rp->r_trapno = irq;
				do_interrupt(rp, ttp);
				/*
				 * Check for cpu priority change
				 * Can happen if int thread blocks
				 */
				if (cpu->cpu_pri > curpri)
					return;
			}
		}
	}
}
1446 
1447 void
1448 ec_unmask_evtchn(unsigned int ev)
1449 {
1450 	uint_t evi;
1451 	volatile shared_info_t *si = HYPERVISOR_shared_info;
1452 	volatile vcpu_info_t *vci = CPU->cpu_m.mcpu_vcpu_info;
1453 	volatile ulong_t *ulp;
1454 
1455 	ASSERT(!interrupts_enabled());
1456 	/*
1457 	 * Check if we need to take slow path
1458 	 */
1459 	if (!CPU_IN_SET(evtchn_cpus[ev], CPU->cpu_id)) {
1460 		xen_evtchn_unmask(ev);
1461 		return;
1462 	}
1463 	evi = ev >> EVTCHN_SHIFT;
1464 	ev &= (1ul << EVTCHN_SHIFT) - 1;
1465 	ulp = (volatile ulong_t *)&si->evtchn_mask[evi];
1466 	atomic_and_ulong(ulp, ~(1ul << ev));
1467 	/*
1468 	 * The following is basically the equivalent of
1469 	 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the
1470 	 * interrupt edge' if the channel is masked.
1471 	 * XXPV - slight race if upcall was about to be set, we may get
1472 	 * an extra upcall.
1473 	 */
1474 	membar_enter();
1475 	if (si->evtchn_pending[evi] & (1ul << ev)) {
1476 		membar_consumer();
1477 		ulp = (volatile ulong_t *)&vci->evtchn_pending_sel;
1478 		if (!(*ulp & (1ul << evi))) {
1479 			atomic_or_ulong(ulp, (1ul << evi));
1480 		}
1481 		vci->evtchn_upcall_pending = 1;
1482 	}
1483 }
1484 
1485 /*
1486  * Set a bit in an evtchan mask word, return true if we are the cpu that
1487  * set the bit.
1488  */
1489 int
1490 ec_mask_evtchn(unsigned int ev)
1491 {
1492 	uint_t evi, evb;
1493 	ulong_t new, old, bit;
1494 	volatile shared_info_t *si = HYPERVISOR_shared_info;
1495 	volatile ulong_t *maskp;
1496 	int masked;
1497 
1498 	kpreempt_disable();
1499 	evi = ev >> EVTCHN_SHIFT;
1500 	evb = ev & ((1ul << EVTCHN_SHIFT) - 1);
1501 	bit = 1ul << evb;
1502 	maskp = (volatile ulong_t *)&si->evtchn_mask[evi];
1503 	do {
1504 		old = si->evtchn_mask[evi];
1505 		new = old | bit;
1506 	} while (atomic_cas_ulong(maskp, old, new) != old);
1507 	masked = (old & bit) == 0;
1508 	if (masked) {
1509 		evtchn_owner[ev] = CPU->cpu_id;
1510 #ifdef DEBUG
1511 		evtchn_owner_thread[ev] = curthread;
1512 #endif
1513 	}
1514 	kpreempt_enable();
1515 	return (masked);
1516 }
1517 
1518 void
1519 ec_clear_evtchn(unsigned int ev)
1520 {
1521 	uint_t evi;
1522 	shared_info_t *si = HYPERVISOR_shared_info;
1523 	volatile ulong_t *pendp;
1524 
1525 	evi = ev >> EVTCHN_SHIFT;
1526 	ev &= (1ul << EVTCHN_SHIFT) - 1;
1527 	pendp = (volatile ulong_t *)&si->evtchn_pending[evi];
1528 	atomic_and_ulong(pendp, ~(1ul << ev));
1529 }
1530 
1531 void
1532 ec_notify_via_evtchn(unsigned int port)
1533 {
1534 	evtchn_send_t send;
1535 
1536 	ASSERT(port != INVALID_EVTCHN);
1537 
1538 	send.port = port;
1539 	(void) HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
1540 }
1541 
1542 int
1543 ec_block_irq(int irq)
1544 {
1545 	irq_info_t *irqp = &irq_info[irq];
1546 	int evtchn;
1547 
1548 
1549 	evtchn = irq_evtchn(irqp);
1550 	(void) ec_mask_evtchn(evtchn);
1551 	return (evtchn_owner[evtchn]);
1552 }
1553 
1554 /*
1555  * Make a event that is pending for delivery on the current cpu  "go away"
1556  * without servicing the interrupt.
1557  */
1558 void
1559 ec_unpend_irq(int irq)
1560 {
1561 	irq_info_t *irqp = &irq_info[irq];
1562 	int pri = irqp->ii_u2.ipl;
1563 	ulong_t flags;
1564 	uint_t evtchn, evi, bit;
1565 	unsigned long pe, pending_sels;
1566 	struct xen_evt_data *cpe;
1567 
1568 	/*
1569 	 * The evtchn must be masked
1570 	 */
1571 	evtchn = irq_evtchn(irqp);
1572 	ASSERT(EVTCHN_MASKED(evtchn));
1573 	evi = evtchn >> EVTCHN_SHIFT;
1574 	bit = evtchn & (1ul << EVTCHN_SHIFT) - 1;
1575 	flags = intr_clear();
1576 	cpe = CPU->cpu_m.mcpu_evt_pend;
1577 	pe = cpe->pending_evts[pri][evi] & ~(1ul << bit);
1578 	cpe->pending_evts[pri][evi] = pe;
1579 	if (pe == 0) {
1580 		pending_sels = cpe->pending_sel[pri];
1581 		pending_sels &= ~(1ul << evi);
1582 		cpe->pending_sel[pri] = pending_sels;
1583 		if (pending_sels == 0)
1584 			CPU->cpu_m.mcpu_intr_pending &= ~(1 << pri);
1585 	}
1586 	intr_restore(flags);
1587 }
1588