xref: /titanic_50/usr/src/uts/i86xpv/os/evtchn.c (revision f3324781c875e2f9865c291e43f86ee710b0c145)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * evtchn.c
31  *
32  * Communication via hypervisor event channels.
33  *
34  * Copyright (c) 2002-2005, K A Fraser
35  *
36  * This file may be distributed separately from the Linux kernel, or
37  * incorporated into other software packages, subject to the following license:
38  *
39  * Permission is hereby granted, free of charge, to any person obtaining a copy
40  * of this source file (the "Software"), to deal in the Software without
41  * restriction, including without limitation the rights to use, copy, modify,
42  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
43  * and to permit persons to whom the Software is furnished to do so, subject to
44  * the following conditions:
45  *
46  * The above copyright notice and this permission notice shall be included in
47  * all copies or substantial portions of the Software.
48  *
49  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
52  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
53  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
54  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
55  * IN THE SOFTWARE.
56  */
57 
58 /* some parts derived from netbsd's hypervisor_machdep.c 1.2.2.2 */
59 
60 /*
61  *
62  * Copyright (c) 2004 Christian Limpach.
63  * All rights reserved.
64  *
65  * Redistribution and use in source and binary forms, with or without
66  * modification, are permitted provided that the following conditions
67  * are met:
68  * 1. Redistributions of source code must retain the above copyright
69  *    notice, this list of conditions and the following disclaimer.
70  * 2. Redistributions in binary form must reproduce the above copyright
71  *    notice, this list of conditions and the following disclaimer in the
72  *    documentation and/or other materials provided with the distribution.
73  * 3. This section intentionally left blank.
74  * 4. The name of the author may not be used to endorse or promote products
75  *    derived from this software without specific prior written permission.
76  *
77  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
78  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
79  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
80  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
81  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
82  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
83  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
84  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
85  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
86  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
87  */
88 /*
89  * Section 3 of the above license was updated in response to bug 6379571.
90  */
91 
92 #include <sys/types.h>
93 #include <sys/hypervisor.h>
94 #include <sys/machsystm.h>
95 #include <sys/mutex.h>
96 #include <sys/evtchn_impl.h>
97 #include <sys/ddi_impldefs.h>
98 #include <sys/avintr.h>
99 #include <sys/cpuvar.h>
100 #include <sys/smp_impldefs.h>
101 #include <sys/archsystm.h>
102 #include <sys/sysmacros.h>
103 #include <sys/cmn_err.h>
104 #include <sys/promif.h>
105 #include <sys/debug.h>
106 #include <sys/psm.h>
107 #include <sys/privregs.h>
108 #include <sys/trap.h>
109 #include <sys/atomic.h>
110 #include <sys/cpu.h>
111 #include <sys/psw.h>
112 #include <sys/traptrace.h>
113 #include <sys/stack.h>
114 #include <sys/x_call.h>
115 #include <xen/public/physdev.h>
116 
117 /*
 * This file manages our association between hypervisor event channels and
 * Solaris's IRQs.  This is a one-to-one mapping, with three exceptions:
 * IPI IRQs, for which there is one event channel per CPU participating
 * in the IPI; the clock VIRQ, which also has an event channel per CPU;
 * and the IRQ for /dev/xen/evtchn.  The IRQ types are:
123  *
124  * IRQT_VIRQ:
125  *	The hypervisor's standard virtual IRQ, used for the clock timer, for
126  *	example.  This code allows any cpu to bind to one of these, although
127  *	some are treated specially (i.e. VIRQ_DEBUG).
128  *	Event channel binding is done via EVTCHNOP_bind_virq.
129  *
130  * IRQT_PIRQ:
131  *	These associate a physical IRQ with an event channel via
132  *	EVTCHNOP_bind_pirq.
133  *
134  * IRQT_IPI:
135  *	A cross-call IRQ. Maps to "ncpus" event channels, each of which is
136  *	bound to exactly one of the vcpus.  We do not currently support
137  *	unbinding of IPIs (since Solaris doesn't need it). Uses
138  *	EVTCHNOP_bind_ipi.
139  *
140  * IRQT_EVTCHN:
 *	A "normal" binding to an event channel, typically used by the frontend
 *	drivers to bind to their backend event channel.
143  *
144  * IRQT_DEV_EVTCHN:
145  *	This is a one-time IRQ used by /dev/xen/evtchn. Unlike other IRQs, we
146  *	have a one-IRQ to many-evtchn mapping. We only track evtchn->irq for
147  *	these event channels, which are managed via ec_irq_add/rm_evtchn().
148  *	We enforce that IRQT_DEV_EVTCHN's representative evtchn (->ii_evtchn)
149  *	is zero, and make any calls to irq_evtchn() an error, to prevent
150  *	accidentally attempting to use the illegal evtchn 0.
151  *
152  * Suspend/resume
153  *
154  *	During a suspend/resume cycle, we need to tear down the event channels.
155  *	All other mapping data is kept. The drivers will remove their own event
156  *	channels via xendev on receiving a DDI_SUSPEND.  This leaves us with
157  *	the IPIs and VIRQs, which we handle in ec_suspend() and ec_resume()
158  *	below.
159  *
160  * CPU binding
161  *
162  *	When an event channel is bound to a CPU, we set a bit in a mask present
163  *	in the machcpu (evt_affinity) to indicate that this CPU can accept this
164  *	event channel.  For both IPIs and VIRQs, this binding is fixed at
165  *	allocation time and we never modify it.  All other event channels are
166  *	bound via the PSM either as part of add_avintr(), or interrupt
167  *	redistribution (xen_psm_dis/enable_intr()) as a result of CPU
168  *	offline/online.
169  *
170  * Locking
171  *
172  *	Updates are done holding the ec_lock.  The xen_callback_handler()
173  *	routine reads the mapping data in a lockless fashion.  Additionally
174  *	suspend takes ec_lock to prevent update races during a suspend/resume
175  *	cycle.  The IPI info is also examined without the lock; this is OK
176  *	since we only ever change IPI info during initial setup and resume.
177  */
178 
179 #define	IRQ_IS_CPUPOKE(irq) (ipi_info[XC_CPUPOKE_PIL].mi_irq == (irq))
180 
181 #define	EVTCHN_MASKED(ev) \
182 	(HYPERVISOR_shared_info->evtchn_mask[(ev) >> EVTCHN_SHIFT] & \
183 	(1ul << ((ev) & ((1ul << EVTCHN_SHIFT) - 1))))
184 
185 static short evtchn_to_irq[NR_EVENT_CHANNELS];
186 static cpuset_t evtchn_cpus[NR_EVENT_CHANNELS];
187 static int	evtchn_owner[NR_EVENT_CHANNELS];
188 #ifdef DEBUG
189 static kthread_t *evtchn_owner_thread[NR_EVENT_CHANNELS];
190 #endif
191 
192 static irq_info_t irq_info[NR_IRQS];
193 static mec_info_t ipi_info[MAXIPL];
194 static mec_info_t virq_info[NR_VIRQS];
195 /*
196  * Mailbox for communication with the evtchn device driver.
197  * We rely on only cpu 0 servicing the event channels associated
198  * with the driver.  i.e. all evtchn driver evtchns are bound to cpu 0.
199  */
200 volatile int ec_dev_mbox;	/* mailbox for evtchn device driver */
201 
202 /*
203  * See the locking description above.
204  */
205 kmutex_t ec_lock;
206 
207 /*
208  * Bitmap indicating which PIRQs require the hypervisor to be notified
209  * on unmask.
210  */
211 static unsigned long pirq_needs_eoi[NR_PIRQS / (sizeof (unsigned long) * NBBY)];
212 
213 static int ec_debug_irq = INVALID_IRQ;
214 int ec_dev_irq = INVALID_IRQ;
215 
216 int
217 xen_bind_virq(unsigned int virq, processorid_t cpu, int *port)
218 {
219 	evtchn_bind_virq_t bind;
220 	int err;
221 
222 	bind.virq = virq;
223 	bind.vcpu = cpu;
224 	if ((err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind)) == 0)
225 		*port = bind.port;
226 	else
227 		err = xen_xlate_errcode(err);
228 	return (err);
229 }
230 
231 int
232 xen_bind_interdomain(int domid, int remote_port, int *port)
233 {
234 	evtchn_bind_interdomain_t bind;
235 	int err;
236 
237 	bind.remote_dom  = domid;
238 	bind.remote_port = remote_port;
239 	if ((err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
240 	    &bind)) == 0)
241 		*port = bind.local_port;
242 	else
243 		err = xen_xlate_errcode(err);
244 	return (err);
245 }
246 
247 int
248 xen_alloc_unbound_evtchn(int domid, int *evtchnp)
249 {
250 	evtchn_alloc_unbound_t alloc;
251 	int err;
252 
253 	alloc.dom = DOMID_SELF;
254 	alloc.remote_dom = domid;
255 
256 	if ((err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
257 	    &alloc)) == 0) {
258 		*evtchnp = alloc.port;
259 		/* ensure evtchn is masked till we're ready to use it */
260 		(void) ec_mask_evtchn(*evtchnp);
261 	} else {
262 		err = xen_xlate_errcode(err);
263 	}
264 
265 	return (err);
266 }
267 
268 static int
269 xen_close_evtchn(int evtchn)
270 {
271 	evtchn_close_t close;
272 	int err;
273 
274 	close.port = evtchn;
275 	err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
276 	if (err)
277 		err = xen_xlate_errcode(err);
278 	return (err);
279 }
280 
281 static int
282 xen_bind_ipi(processorid_t cpu)
283 {
284 	evtchn_bind_ipi_t bind;
285 
286 	ASSERT(MUTEX_HELD(&ec_lock));
287 
288 	bind.vcpu = cpu;
289 	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind) != 0)
290 		panic("xen_bind_ipi() failed");
291 	return (bind.port);
292 }
293 
294 /* Send future instances of this interrupt to other vcpu. */
295 static void
296 xen_bind_vcpu(int evtchn, int cpu)
297 {
298 	evtchn_bind_vcpu_t bind;
299 
300 	ASSERT(MUTEX_HELD(&ec_lock));
301 
302 	bind.port = evtchn;
303 	bind.vcpu = cpu;
304 	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind) != 0)
305 		panic("xen_bind_vcpu() failed");
306 }
307 
308 static int
309 xen_bind_pirq(int pirq)
310 {
311 	evtchn_bind_pirq_t bind;
312 	int ret;
313 
314 	bind.pirq = pirq;
315 	bind.flags = BIND_PIRQ__WILL_SHARE;
316 	if ((ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind)) != 0)
317 		panic("xen_bind_pirq() failed (err %d)", ret);
318 	return (bind.port);
319 }
320 
321 /* unmask an evtchn and send upcall to appropriate vcpu if pending bit is set */
322 static void
323 xen_evtchn_unmask(int evtchn)
324 {
325 	evtchn_unmask_t unmask;
326 
327 	unmask.port = evtchn;
328 	if (HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask) != 0)
329 		panic("xen_evtchn_unmask() failed");
330 }
331 
332 static void
333 update_evtchn_affinity(int evtchn)
334 {
335 	cpu_t *cp;
336 	struct xen_evt_data *cpe;
337 
338 	ASSERT(evtchn_to_irq[evtchn] != INVALID_IRQ);
339 	ASSERT(MUTEX_HELD(&ec_lock));
340 
341 	/*
342 	 * Use lockless search of cpu_list, similar to mutex_vector_enter().
343 	 */
344 	kpreempt_disable();
345 	cp = cpu_list;
346 	do {
347 		cpe = cp->cpu_m.mcpu_evt_pend;
348 		if (CPU_IN_SET(evtchn_cpus[evtchn], cp->cpu_id))
349 			SET_EVTCHN_BIT(evtchn, cpe->evt_affinity);
350 		else
351 			CLEAR_EVTCHN_BIT(evtchn, cpe->evt_affinity);
352 	} while ((cp = cp->cpu_next) != cpu_list);
353 	kpreempt_enable();
354 }
355 
356 static void
357 bind_evtchn_to_cpuset(int evtchn, cpuset_t cpus)
358 {
359 	ASSERT(evtchn_to_irq[evtchn] != INVALID_IRQ);
360 
361 	CPUSET_ZERO(evtchn_cpus[evtchn]);
362 	CPUSET_OR(evtchn_cpus[evtchn], cpus);
363 	update_evtchn_affinity(evtchn);
364 }
365 
366 static void
367 clear_evtchn_affinity(int evtchn)
368 {
369 	CPUSET_ZERO(evtchn_cpus[evtchn]);
370 	update_evtchn_affinity(evtchn);
371 }
372 
373 static void
374 alloc_irq_evtchn(int irq, int index, int evtchn, int cpu)
375 {
376 	irq_info_t *irqp = &irq_info[irq];
377 
378 	switch (irqp->ii_type) {
379 	case IRQT_IPI:
380 		ipi_info[index].mi_evtchns[cpu] = evtchn;
381 		irqp->ii_u.index = index;
382 		break;
383 	case IRQT_VIRQ:
384 		virq_info[index].mi_evtchns[cpu] = evtchn;
385 		irqp->ii_u.index = index;
386 		break;
387 	default:
388 		irqp->ii_u.evtchn = evtchn;
389 		break;
390 	}
391 
392 	evtchn_to_irq[evtchn] = irq;
393 
394 	/*
395 	 * If a CPU is not specified, we expect to bind it to a CPU later via
396 	 * the PSM.
397 	 */
398 	if (cpu != -1) {
399 		cpuset_t tcpus;
400 		CPUSET_ONLY(tcpus, cpu);
401 		bind_evtchn_to_cpuset(evtchn, tcpus);
402 	}
403 }
404 
405 static int
406 alloc_irq(int type, int index, int evtchn, int cpu)
407 {
408 	int irq;
409 	irq_info_t *irqp;
410 
411 	ASSERT(MUTEX_HELD(&ec_lock));
412 	ASSERT(type != IRQT_IPI || cpu != -1);
413 
414 	for (irq = 0; irq < NR_IRQS; irq++) {
415 		if (irq_info[irq].ii_type == IRQT_UNBOUND)
416 			break;
417 	}
418 
419 	if (irq == NR_IRQS)
420 		panic("No available IRQ to bind to: increase NR_IRQS!\n");
421 
422 	irqp = &irq_info[irq];
423 
424 	irqp->ii_type = type;
425 	/*
426 	 * Set irq/has_handler field to zero which means handler not installed
427 	 */
428 	irqp->ii_u2.has_handler = 0;
429 
430 	alloc_irq_evtchn(irq, index, evtchn, cpu);
431 	return (irq);
432 }
433 
434 static int
435 irq_evtchn(irq_info_t *irqp)
436 {
437 	int evtchn;
438 
439 	ASSERT(irqp->ii_type != IRQT_DEV_EVTCHN);
440 
441 	switch (irqp->ii_type) {
442 	case IRQT_IPI:
443 		ASSERT(irqp->ii_u.index != 0);
444 		evtchn = ipi_info[irqp->ii_u.index].mi_evtchns[CPU->cpu_id];
445 		break;
446 	case IRQT_VIRQ:
447 		evtchn = virq_info[irqp->ii_u.index].mi_evtchns[CPU->cpu_id];
448 		break;
449 	default:
450 		evtchn = irqp->ii_u.evtchn;
451 		break;
452 	}
453 
454 	return (evtchn);
455 }
456 
457 static void
458 unbind_evtchn(ushort_t *evtchnp)
459 {
460 	int err;
461 
462 	ASSERT(MUTEX_HELD(&ec_lock));
463 
464 	ASSERT(*evtchnp != 0);
465 
466 	err = xen_close_evtchn(*evtchnp);
467 	ASSERT(err == 0);
468 	clear_evtchn_affinity(*evtchnp);
469 	evtchn_to_irq[*evtchnp] = INVALID_IRQ;
470 	*evtchnp = 0;
471 }
472 
473 static void
474 pirq_unmask_notify(int pirq)
475 {
476 	struct physdev_eoi eoi;
477 
478 	if (TEST_EVTCHN_BIT(pirq, &pirq_needs_eoi[0])) {
479 		eoi.irq = pirq;
480 		(void) HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
481 	}
482 }
483 
484 static void
485 pirq_query_unmask(int pirq)
486 {
487 	struct physdev_irq_status_query irq_status;
488 
489 	irq_status.irq = pirq;
490 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status);
491 	CLEAR_EVTCHN_BIT(pirq, &pirq_needs_eoi[0]);
492 	if (irq_status.flags & XENIRQSTAT_needs_eoi)
493 		SET_EVTCHN_BIT(pirq, &pirq_needs_eoi[0]);
494 }
495 
496 static void
497 end_pirq(int irq)
498 {
499 	int evtchn = irq_evtchn(&irq_info[irq]);
500 
501 	ec_unmask_evtchn(evtchn);
502 	pirq_unmask_notify(IRQ_TO_PIRQ(irq));
503 }
504 
505 /*
506  * probe if a pirq is available to bind to, return 1 if available
507  * else return 0.
508  * Note that for debug versions of xen this probe may cause an in use IRQ
509  * warning message from xen.
510  */
511 int
512 ec_probe_pirq(int pirq)
513 {
514 	evtchn_bind_pirq_t bind;
515 
516 	bind.pirq = pirq;
517 	bind.flags = 0;
518 	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind) != 0) {
519 		return (0);
520 	} else {
521 		(void) xen_close_evtchn(bind.port);
522 		return (1);
523 	}
524 }
525 
526 /*
527  * Bind an event channel to a vcpu
528  */
529 void
530 ec_bind_vcpu(int evtchn, int cpu)
531 {
532 	mutex_enter(&ec_lock);
533 	xen_bind_vcpu(evtchn, cpu);
534 	mutex_exit(&ec_lock);
535 }
536 
537 /*
538  * Set up a physical device irq to be associated with an event channel.
539  */
540 void
541 ec_setup_pirq(int irq, int ipl, cpuset_t cpus)
542 {
543 	int evtchn;
544 	irq_info_t *irqp = &irq_info[irq];
545 
546 	/*
547 	 * Test if this PIRQ is already bound to an evtchn,
548 	 * which means it is a shared IRQ and we don't want to
549 	 * bind and do some initial setup that has already been
550 	 * done for this irq on a previous trip through this code.
551 	 */
552 	if (irqp->ii_u.evtchn == INVALID_EVTCHN) {
553 		evtchn = xen_bind_pirq(irq);
554 
555 		pirq_query_unmask(IRQ_TO_PIRQ(irq));
556 
557 		irqp->ii_type = IRQT_PIRQ;
558 		irqp->ii_u.evtchn = evtchn;
559 
560 		evtchn_to_irq[evtchn] = irq;
561 		irqp->ii_u2.ipl = ipl;
562 		ec_set_irq_affinity(irq, cpus);
563 		ec_enable_irq(irq);
564 		pirq_unmask_notify(IRQ_TO_PIRQ(irq));
565 	} else {
566 		ASSERT(irqp->ii_u2.ipl != 0);
567 		cmn_err(CE_NOTE, "IRQ%d is shared", irq);
568 		if (ipl > irqp->ii_u2.ipl)
569 			irqp->ii_u2.ipl = ipl;
570 	}
571 }
572 
573 void
574 ec_unbind_irq(int irq)
575 {
576 	irq_info_t *irqp = &irq_info[irq];
577 	mec_info_t *virqp;
578 	int drop_lock = 0;
579 	int type, i;
580 
581 	/*
582 	 * Nasty, but we need this during suspend.
583 	 */
584 	if (mutex_owner(&ec_lock) != curthread) {
585 		mutex_enter(&ec_lock);
586 		drop_lock = 1;
587 	}
588 
589 	type = irqp->ii_type;
590 
591 	ASSERT((type == IRQT_EVTCHN) || (type == IRQT_PIRQ) ||
592 	    (type == IRQT_VIRQ));
593 
594 	if ((type == IRQT_EVTCHN) || (type == IRQT_PIRQ)) {
595 		/* There's only one event channel associated with this irq */
596 		unbind_evtchn(&irqp->ii_u.evtchn);
597 	} else if (type == IRQT_VIRQ) {
598 		/*
599 		 * Each cpu on the system can have it's own event channel
600 		 * associated with a virq.  Unbind them all.
601 		 */
602 		virqp = &virq_info[irqp->ii_u.index];
603 		for (i = 0; i < NCPU; i++) {
604 			if (virqp->mi_evtchns[i] != 0)
605 				unbind_evtchn(&virqp->mi_evtchns[i]);
606 		}
607 		/* Mark the virq structure as invalid. */
608 		virqp->mi_irq = INVALID_IRQ;
609 	}
610 
611 	bzero(irqp, sizeof (*irqp));
612 	/* Re-reserve PIRQ. */
613 	if (type == IRQT_PIRQ)
614 		irqp->ii_type = IRQT_PIRQ;
615 
616 	if (drop_lock)
617 		mutex_exit(&ec_lock);
618 }
619 
620 /*ARGSUSED*/
621 static int
622 do_nothing_function(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
623 {
624 	return (0);
625 }
626 
627 /*
628  * Rebind an event channel for delivery to a CPU.
629  */
630 void
631 ec_set_irq_affinity(int irq, cpuset_t dest)
632 {
633 	int evtchn, tcpu;
634 	irq_info_t *irqp = &irq_info[irq];
635 
636 	mutex_enter(&ec_lock);
637 
638 	ASSERT(irq < NR_IRQS);
639 	ASSERT(irqp->ii_type != IRQT_UNBOUND);
640 
641 	/*
642 	 * Binding is done at allocation time for these types, so we should
643 	 * never modify them.
644 	 */
645 	if (irqp->ii_type == IRQT_IPI || irqp->ii_type == IRQT_VIRQ ||
646 	    irqp->ii_type == IRQT_DEV_EVTCHN) {
647 		mutex_exit(&ec_lock);
648 		return;
649 	}
650 
651 	CPUSET_FIND(dest, tcpu);
652 	ASSERT(tcpu != CPUSET_NOTINSET);
653 
654 	evtchn = irq_evtchn(irqp);
655 
656 	xen_bind_vcpu(evtchn, tcpu);
657 
658 	bind_evtchn_to_cpuset(evtchn, dest);
659 
660 	mutex_exit(&ec_lock);
661 
662 	/*
663 	 * Now send the new target processor a NOP IPI. When this returns,
664 	 * it will check for any pending interrupts, and so service any that
665 	 * got delivered to the wrong processor by mistake.
666 	 */
667 	xc_call(NULL, NULL, NULL, X_CALL_HIPRI, dest, do_nothing_function);
668 }
669 
670 int
671 ec_set_irq_priority(int irq, int pri)
672 {
673 	irq_info_t *irqp;
674 
675 	if (irq >= NR_IRQS)
676 		return (-1);
677 
678 	irqp = &irq_info[irq];
679 
680 	if (irqp->ii_type == IRQT_UNBOUND)
681 		return (-1);
682 
683 	irqp->ii_u2.ipl = pri;
684 
685 	return (0);
686 }
687 
688 void
689 ec_clear_irq_priority(int irq)
690 {
691 	irq_info_t *irqp = &irq_info[irq];
692 
693 	ASSERT(irq < NR_IRQS);
694 	ASSERT(irqp->ii_type != IRQT_UNBOUND);
695 
696 	irqp->ii_u2.ipl = 0;
697 }
698 
699 int
700 ec_bind_evtchn_to_irq(int evtchn)
701 {
702 	mutex_enter(&ec_lock);
703 
704 	ASSERT(evtchn_to_irq[evtchn] == INVALID_IRQ);
705 
706 	(void) alloc_irq(IRQT_EVTCHN, 0, evtchn, -1);
707 
708 	mutex_exit(&ec_lock);
709 	return (evtchn_to_irq[evtchn]);
710 }
711 
712 int
713 ec_bind_virq_to_irq(int virq, int cpu)
714 {
715 	int err;
716 	int evtchn;
717 	mec_info_t *virqp;
718 
719 	virqp = &virq_info[virq];
720 	mutex_enter(&ec_lock);
721 
722 	err = xen_bind_virq(virq, cpu, &evtchn);
723 	ASSERT(err == 0);
724 
725 	ASSERT(evtchn_to_irq[evtchn] == INVALID_IRQ);
726 
727 	if (virqp->mi_irq == INVALID_IRQ) {
728 		virqp->mi_irq = alloc_irq(IRQT_VIRQ, virq, evtchn, cpu);
729 	} else {
730 		alloc_irq_evtchn(virqp->mi_irq, virq, evtchn, cpu);
731 	}
732 
733 	mutex_exit(&ec_lock);
734 
735 	return (virqp->mi_irq);
736 }
737 
738 int
739 ec_bind_ipi_to_irq(int ipl, int cpu)
740 {
741 	int evtchn;
742 	ulong_t flags;
743 	mec_info_t *ipip;
744 
745 	mutex_enter(&ec_lock);
746 
747 	ipip = &ipi_info[ipl];
748 
749 	evtchn = xen_bind_ipi(cpu);
750 
751 	ASSERT(evtchn_to_irq[evtchn] == INVALID_IRQ);
752 
753 	if (ipip->mi_irq == INVALID_IRQ) {
754 		ipip->mi_irq = alloc_irq(IRQT_IPI, ipl, evtchn, cpu);
755 	} else {
756 		alloc_irq_evtchn(ipip->mi_irq, ipl, evtchn, cpu);
757 	}
758 
759 	/*
760 	 * Unmask the new evtchn so that it can be seen by the target cpu
761 	 */
762 	flags = intr_clear();
763 	ec_unmask_evtchn(evtchn);
764 	intr_restore(flags);
765 
766 	mutex_exit(&ec_lock);
767 	return (ipip->mi_irq);
768 }
769 
770 /*
771  * When bringing up a CPU, bind to all the IPIs that CPU0 bound.
772  */
773 void
774 ec_bind_cpu_ipis(int cpu)
775 {
776 	int i;
777 
778 	for (i = 0; i < MAXIPL; i++) {
779 		mec_info_t *ipip = &ipi_info[i];
780 		if (ipip->mi_irq == INVALID_IRQ)
781 			continue;
782 
783 		(void) ec_bind_ipi_to_irq(i, cpu);
784 	}
785 }
786 
787 /*
788  * Can this IRQ be rebound to another CPU?
789  */
790 int
791 ec_irq_rebindable(int irq)
792 {
793 	irq_info_t *irqp = &irq_info[irq];
794 
795 	if (irqp->ii_u.evtchn == 0)
796 		return (0);
797 
798 	return (irqp->ii_type == IRQT_EVTCHN || irqp->ii_type == IRQT_PIRQ);
799 }
800 
801 /*
802  * Should this IRQ be unbound from this CPU (which is being offlined) to
803  * another?
804  */
805 int
806 ec_irq_needs_rebind(int irq, int cpu)
807 {
808 	irq_info_t *irqp = &irq_info[irq];
809 
810 	return (ec_irq_rebindable(irq) &&
811 	    CPU_IN_SET(evtchn_cpus[irqp->ii_u.evtchn], cpu));
812 }
813 
814 void
815 ec_send_ipi(int ipl, int cpu)
816 {
817 	mec_info_t *ipip = &ipi_info[ipl];
818 
819 	ASSERT(ipip->mi_irq != INVALID_IRQ);
820 
821 	ec_notify_via_evtchn(ipip->mi_evtchns[cpu]);
822 }
823 
824 void
825 ec_try_ipi(int ipl, int cpu)
826 {
827 	mec_info_t *ipip = &ipi_info[ipl];
828 
829 	if (ipip->mi_irq == INVALID_IRQ || ipip->mi_irq == 0)
830 		return;
831 
832 	ec_notify_via_evtchn(ipip->mi_evtchns[cpu]);
833 }
834 
835 void
836 ec_irq_add_evtchn(int irq, int evtchn)
837 {
838 	mutex_enter(&ec_lock);
839 
840 	/*
841 	 * See description of IRQT_DEV_EVTCHN above.
842 	 */
843 	ASSERT(irq == ec_dev_irq);
844 
845 	alloc_irq_evtchn(irq, 0, evtchn, 0);
846 	/*
847 	 * We enforce that the representative event channel for IRQT_DEV_EVTCHN
848 	 * is zero, so PSM operations on it have no effect.
849 	 */
850 	irq_info[irq].ii_u.evtchn = 0;
851 	mutex_exit(&ec_lock);
852 }
853 
854 void
855 ec_irq_rm_evtchn(int irq, int evtchn)
856 {
857 	ushort_t ec = evtchn;
858 
859 	mutex_enter(&ec_lock);
860 	ASSERT(irq == ec_dev_irq);
861 	unbind_evtchn(&ec);
862 	mutex_exit(&ec_lock);
863 }
864 
865 /*
866  * Allocate an /dev/xen/evtchn IRQ.  See the big comment at the top
867  * for an explanation.
868  */
869 int
870 ec_dev_alloc_irq(void)
871 {
872 	int i;
873 	irq_info_t *irqp;
874 
875 	for (i = 0; i < NR_IRQS; i++) {
876 		if (irq_info[i].ii_type == IRQT_UNBOUND)
877 			break;
878 	}
879 
880 	ASSERT(i != NR_IRQS);
881 
882 	irqp = &irq_info[i];
883 	irqp->ii_type = IRQT_DEV_EVTCHN;
884 	irqp->ii_u2.ipl = IPL_EVTCHN;
885 	/*
886 	 * Force the evtchn to zero for the special evtchn device irq
887 	 */
888 	irqp->ii_u.evtchn = 0;
889 	return (i);
890 }
891 
892 void
893 ec_enable_irq(unsigned int irq)
894 {
895 	ulong_t flag;
896 	irq_info_t *irqp = &irq_info[irq];
897 
898 	if (irqp->ii_type == IRQT_DEV_EVTCHN)
899 		return;
900 
901 	flag = intr_clear();
902 	ec_unmask_evtchn(irq_evtchn(irqp));
903 	intr_restore(flag);
904 }
905 
906 void
907 ec_disable_irq(unsigned int irq)
908 {
909 	irq_info_t *irqp = &irq_info[irq];
910 
911 	if (irqp->ii_type == IRQT_DEV_EVTCHN)
912 		return;
913 
914 	/*
915 	 * Spin till we are the one to mask the evtchn
916 	 * Ensures no one else can be servicing this evtchn.
917 	 */
918 	while (!ec_mask_evtchn(irq_evtchn(irqp)))
919 		SMT_PAUSE();
920 }
921 
922 static int
923 ec_evtchn_pending(uint_t ev)
924 {
925 	uint_t evi;
926 	shared_info_t *si = HYPERVISOR_shared_info;
927 
928 	evi = ev >> EVTCHN_SHIFT;
929 	ev &= (1ul << EVTCHN_SHIFT) - 1;
930 	return ((si->evtchn_pending[evi] & (1ul << ev)) != 0);
931 }
932 
933 int
934 ec_pending_irq(unsigned int irq)
935 {
936 	int evtchn = irq_evtchn(&irq_info[irq]);
937 
938 	return (ec_evtchn_pending(evtchn));
939 }
940 
941 void
942 ec_clear_irq(int irq)
943 {
944 	irq_info_t *irqp = &irq_info[irq];
945 	int evtchn;
946 
947 	if (irqp->ii_type == IRQT_DEV_EVTCHN)
948 		return;
949 
950 	ASSERT(irqp->ii_type != IRQT_UNBOUND);
951 
952 	evtchn = irq_evtchn(irqp);
953 
954 	ASSERT(EVTCHN_MASKED(evtchn));
955 	ec_clear_evtchn(evtchn);
956 }
957 
958 void
959 ec_unmask_irq(int irq)
960 {
961 	ulong_t flags;
962 	irq_info_t *irqp = &irq_info[irq];
963 
964 	flags = intr_clear();
965 	switch (irqp->ii_type) {
966 	case IRQT_PIRQ:
967 		end_pirq(irq);
968 		break;
969 	case IRQT_DEV_EVTCHN:
970 		break;
971 	default:
972 		ec_unmask_evtchn(irq_evtchn(irqp));
973 		break;
974 	}
975 	intr_restore(flags);
976 }
977 
978 void
979 ec_try_unmask_irq(int irq)
980 {
981 	ulong_t flags;
982 	irq_info_t *irqp = &irq_info[irq];
983 	int evtchn;
984 
985 	flags = intr_clear();
986 	switch (irqp->ii_type) {
987 	case IRQT_PIRQ:
988 		end_pirq(irq);
989 		break;
990 	case IRQT_DEV_EVTCHN:
991 		break;
992 	default:
993 		if ((evtchn = irq_evtchn(irqp)) != 0)
994 			ec_unmask_evtchn(evtchn);
995 		break;
996 	}
997 	intr_restore(flags);
998 }
999 
1000 /*
1001  * Poll until an event channel is ready or 'check_func' returns true.  This can
1002  * only be used in a situation where interrupts are masked, otherwise we have a
1003  * classic time-of-check vs. time-of-use race.
1004  */
1005 void
1006 ec_wait_on_evtchn(int evtchn, int (*check_func)(void *), void *arg)
1007 {
1008 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1009 		while (!check_func(arg))
1010 			(void) HYPERVISOR_yield();
1011 		return;
1012 	}
1013 
1014 	ASSERT(CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask != 0);
1015 
1016 	for (;;) {
1017 		evtchn_port_t ports[1];
1018 
1019 		ports[0] = evtchn;
1020 
1021 		ec_clear_evtchn(evtchn);
1022 
1023 		if (check_func(arg))
1024 			return;
1025 
1026 		(void) HYPERVISOR_poll(ports, 1, 0);
1027 	}
1028 }
1029 
1030 void
1031 ec_wait_on_ipi(int ipl, int (*check_func)(void *), void *arg)
1032 {
1033 	mec_info_t *ipip = &ipi_info[ipl];
1034 
1035 	if (ipip->mi_irq == INVALID_IRQ || ipip->mi_irq == 0)
1036 		return;
1037 
1038 	ec_wait_on_evtchn(ipip->mi_evtchns[CPU->cpu_id], check_func, arg);
1039 }
1040 
1041 void
1042 ec_suspend(void)
1043 {
1044 	irq_info_t *irqp;
1045 	ushort_t *evtchnp;
1046 	int i;
1047 	int c;
1048 
1049 	ASSERT(MUTEX_HELD(&ec_lock));
1050 
1051 	for (i = 0; i < MAXIPL; i++) {
1052 		if (ipi_info[i].mi_irq == INVALID_IRQ)
1053 			continue;
1054 
1055 		for (c = 0; c < NCPU; c++) {
1056 			if (cpu[c] == NULL)
1057 				continue;
1058 
1059 			if (CPU_IN_SET(cpu_suspend_lost_set, c))
1060 				continue;
1061 
1062 			evtchnp = &ipi_info[i].mi_evtchns[c];
1063 			ASSERT(*evtchnp != 0);
1064 			unbind_evtchn(evtchnp);
1065 		}
1066 	}
1067 
1068 	for (i = 0; i < NR_VIRQS; i++) {
1069 		if (virq_info[i].mi_irq == INVALID_IRQ)
1070 			continue;
1071 
1072 		/*
1073 		 * If we're sharing a single event channel across all CPUs, we
1074 		 * should only unbind once.
1075 		 */
1076 		if (virq_info[i].mi_shared) {
1077 			evtchnp = &virq_info[i].mi_evtchns[0];
1078 			unbind_evtchn(evtchnp);
1079 			for (c = 1; c < NCPU; c++)
1080 				virq_info[i].mi_evtchns[c] = 0;
1081 		} else {
1082 			for (c = 0; c < NCPU; c++) {
1083 				if (cpu[c] == NULL)
1084 					continue;
1085 
1086 				evtchnp = &virq_info[i].mi_evtchns[c];
1087 				if (*evtchnp != 0)
1088 					unbind_evtchn(evtchnp);
1089 			}
1090 		}
1091 	}
1092 
1093 	for (i = 0; i < NR_IRQS; i++) {
1094 		irqp = &irq_info[i];
1095 
1096 		switch (irqp->ii_type) {
1097 		case IRQT_EVTCHN:
1098 		case IRQT_DEV_EVTCHN:
1099 			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
1100 			break;
1101 		case IRQT_PIRQ:
1102 			if (irqp->ii_u.evtchn != 0)
1103 				(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
1104 			break;
1105 		default:
1106 			break;
1107 		}
1108 	}
1109 }
1110 
1111 /*
1112  * The debug irq is special, we only have one evtchn and irq but we allow all
1113  * cpus to service it.  It's marked as shared and we propogate the event
1114  * channel into all CPUs by hand.
1115  */
1116 static void
1117 share_virq(mec_info_t *virqp)
1118 {
1119 	int evtchn = virqp->mi_evtchns[0];
1120 	cpuset_t tset;
1121 	int i;
1122 
1123 	ASSERT(evtchn != 0);
1124 
1125 	virqp->mi_shared = 1;
1126 
1127 	for (i = 1; i < NCPU; i++)
1128 		virqp->mi_evtchns[i] = evtchn;
1129 	CPUSET_ALL(tset);
1130 	bind_evtchn_to_cpuset(evtchn, tset);
1131 }
1132 
1133 static void
1134 virq_resume(int virq)
1135 {
1136 	mec_info_t *virqp = &virq_info[virq];
1137 	int evtchn;
1138 	int i, err;
1139 
1140 	for (i = 0; i < NCPU; i++) {
1141 		cpuset_t tcpus;
1142 
1143 		if (cpu[i] == NULL || CPU_IN_SET(cpu_suspend_lost_set, i))
1144 			continue;
1145 
1146 		err = xen_bind_virq(virq, i, &evtchn);
1147 		ASSERT(err == 0);
1148 
1149 		virqp->mi_evtchns[i] = evtchn;
1150 		evtchn_to_irq[evtchn] = virqp->mi_irq;
1151 		CPUSET_ONLY(tcpus, i);
1152 		bind_evtchn_to_cpuset(evtchn, tcpus);
1153 		ec_unmask_evtchn(evtchn);
1154 		/*
1155 		 * only timer VIRQ is bound to all cpus
1156 		 */
1157 		if (virq != VIRQ_TIMER)
1158 			break;
1159 	}
1160 
1161 	if (virqp->mi_shared)
1162 		share_virq(virqp);
1163 }
1164 
1165 static void
1166 ipi_resume(int ipl)
1167 {
1168 	mec_info_t *ipip = &ipi_info[ipl];
1169 	int i;
1170 
1171 	for (i = 0; i < NCPU; i++) {
1172 		cpuset_t tcpus;
1173 		int evtchn;
1174 
1175 		if (cpu[i] == NULL || CPU_IN_SET(cpu_suspend_lost_set, i))
1176 			continue;
1177 
1178 		evtchn = xen_bind_ipi(i);
1179 		ipip->mi_evtchns[i] = evtchn;
1180 		evtchn_to_irq[evtchn] = ipip->mi_irq;
1181 		CPUSET_ONLY(tcpus, i);
1182 		bind_evtchn_to_cpuset(evtchn, tcpus);
1183 		ec_unmask_evtchn(evtchn);
1184 	}
1185 }
1186 
1187 void
1188 ec_resume(void)
1189 {
1190 	int i;
1191 
1192 	/* New event-channel space is not 'live' yet. */
1193 	for (i = 0; i < NR_EVENT_CHANNELS; i++)
1194 		(void) ec_mask_evtchn(i);
1195 
1196 	for (i = 0; i < MAXIPL; i++) {
1197 		if (ipi_info[i].mi_irq == INVALID_IRQ)
1198 			continue;
1199 		ipi_resume(i);
1200 	}
1201 
1202 	for (i = 0; i < NR_VIRQS; i++) {
1203 		if (virq_info[i].mi_irq == INVALID_IRQ)
1204 			continue;
1205 		virq_resume(i);
1206 	}
1207 }
1208 
1209 void
1210 ec_init(void)
1211 {
1212 	int i;
1213 	mutex_init(&ec_lock, NULL, MUTEX_SPIN, (void *)ipltospl(SPL7));
1214 
1215 	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
1216 		CPUSET_ZERO(evtchn_cpus[i]);
1217 		evtchn_to_irq[i] = INVALID_IRQ;
1218 		(void) ec_mask_evtchn(i);
1219 	}
1220 
1221 	for (i = 0; i < MAXIPL; i++)
1222 		ipi_info[i].mi_irq = INVALID_IRQ;
1223 
1224 	for (i = 0; i < NR_VIRQS; i++)
1225 		virq_info[i].mi_irq = INVALID_IRQ;
1226 
1227 	/*
1228 	 * Phys IRQ space is statically bound (1:1 mapping), grab the IRQs
1229 	 * now.
1230 	 */
1231 	for (i = PIRQ_BASE; i < NR_PIRQS; i++) {
1232 		irq_info[PIRQ_TO_IRQ(i)].ii_type = IRQT_PIRQ;
1233 	}
1234 }
1235 
1236 void
1237 ec_init_debug_irq()
1238 {
1239 	int irq;
1240 
1241 	irq = ec_bind_virq_to_irq(VIRQ_DEBUG, 0);
1242 	(void) add_avintr(NULL, IPL_DEBUG, (avfunc)xen_debug_handler,
1243 	    "debug", irq, NULL, NULL, NULL, NULL);
1244 
1245 	mutex_enter(&ec_lock);
1246 	share_virq(&virq_info[irq_info[irq].ii_u.index]);
1247 	mutex_exit(&ec_lock);
1248 	ec_debug_irq = irq;
1249 }
1250 
1251 #define	UNBLOCKED_EVENTS(si, ix, cpe, cpu_id) \
1252 	((si)->evtchn_pending[ix] & ~(si)->evtchn_mask[ix] & \
1253 		(cpe)->evt_affinity[ix])
1254 
1255 /*
1256  * This is the entry point for processing events from xen
1257  *
1258  * (See the commentary associated with the shared_info_st structure
1259  * in hypervisor-if.h)
1260  *
1261  * Since the event channel mechanism doesn't really implement the
1262  * concept of priority like hardware interrupt controllers, we simulate
1263  * that in software here using the cpu priority field and the pending
1264  * interrupts field.  Events/interrupts that are not able to be serviced
1265  * now because they are at a lower priority than the current cpu priority
1266  * cause a level bit to be recorded in the pending interrupts word.  When
1267  * the priority is lowered (either by spl or interrupt exit code) the pending
1268  * levels are checked and an upcall is scheduled if there are events/interrupts
1269  * that have become deliverable.
1270  */
1271 void
1272 xen_callback_handler(struct regs *rp, trap_trace_rec_t *ttp)
1273 {
1274 	ulong_t pending_sels, pe, selbit;
1275 	int i, j, port, pri, curpri, irq;
1276 	uint16_t pending_ints;
1277 	struct cpu *cpu = CPU;
1278 	volatile shared_info_t *si = HYPERVISOR_shared_info;
1279 	volatile vcpu_info_t *vci = cpu->cpu_m.mcpu_vcpu_info;
1280 	volatile struct xen_evt_data *cpe = cpu->cpu_m.mcpu_evt_pend;
1281 	volatile uint16_t *cpu_ipp = &cpu->cpu_m.mcpu_intr_pending;
1282 
1283 	ASSERT(rp->r_trapno == T_AST && rp->r_err == 0);
1284 	ASSERT(&si->vcpu_info[cpu->cpu_id] == vci);
1285 	ASSERT_STACK_ALIGNED();
1286 
1287 	vci->evtchn_upcall_pending = 0;
1288 
1289 	/*
1290 	 * To expedite scanning of pending notifications, any 0->1
1291 	 * pending transition on an unmasked channel causes a
1292 	 * corresponding bit in evtchn_pending_sel to be set.
1293 	 * Each bit in the selector covers a 32-bit word in
1294 	 * the evtchn_pending[] array.
1295 	 */
1296 	membar_enter();
1297 	do {
1298 		pending_sels = vci->evtchn_pending_sel;
1299 	} while (atomic_cas_ulong((volatile ulong_t *)&vci->evtchn_pending_sel,
1300 	    pending_sels, 0) != pending_sels);
1301 
1302 	pending_ints = *cpu_ipp;
1303 	while ((i = ffs(pending_sels)) != 0) {
1304 		i--;
1305 		selbit = 1ul << i;
1306 		pending_sels &= ~selbit;
1307 
1308 		membar_enter();
1309 		while ((pe = UNBLOCKED_EVENTS(si, i, cpe, cpu->cpu_id)) != 0) {
1310 			j = ffs(pe) - 1;
1311 			pe &= ~(1ul << j);
1312 
1313 			port = (i << EVTCHN_SHIFT) + j;
1314 
1315 			irq = evtchn_to_irq[port];
1316 
1317 			/*
1318 			 * If no irq set, just ignore the event.
1319 			 * On e.g. netbsd they call evtchn_device_upcall(port)
1320 			 * We require the evtchn driver to install a handler
1321 			 * so there will be an irq associated with user mode
1322 			 * evtchns.
1323 			 */
1324 			if (irq == INVALID_IRQ) {
1325 				ec_clear_evtchn(port);
1326 				continue;
1327 			}
1328 
1329 			/*
1330 			 * If there's no handler, it could be a poke, so just
1331 			 * accept the event and continue.
1332 			 */
1333 			if (!irq_info[irq].ii_u2.has_handler) {
1334 #ifdef TRAPTRACE
1335 				ttp->ttr_ipl = 0xff;
1336 				if (IRQ_IS_CPUPOKE(irq)) {
1337 					ttp->ttr_ipl = XC_CPUPOKE_PIL;
1338 					ttp->ttr_marker = TT_INTERRUPT;
1339 				}
1340 				ttp->ttr_pri = cpu->cpu_pri;
1341 				ttp->ttr_spl = cpu->cpu_base_spl;
1342 				ttp->ttr_vector = 0xff;
1343 #endif /* TRAPTRACE */
1344 				if (ec_mask_evtchn(port)) {
1345 					ec_clear_evtchn(port);
1346 					ec_unmask_evtchn(port);
1347 					continue;
1348 				}
1349 			}
1350 
1351 			pri = irq_info[irq].ii_u2.ipl;
1352 
1353 			/*
1354 			 * If we are the cpu that successfully masks
1355 			 * the event, then record it as a pending event
1356 			 * for this cpu to service
1357 			 */
1358 			if (ec_mask_evtchn(port)) {
1359 				if (ec_evtchn_pending(port)) {
1360 					cpe->pending_sel[pri] |= selbit;
1361 					cpe->pending_evts[pri][i] |= (1ul << j);
1362 					pending_ints |= 1 << pri;
1363 				} else {
1364 					/*
1365 					 * another cpu serviced this event
1366 					 * before us, clear the mask.
1367 					 */
1368 					ec_unmask_evtchn(port);
1369 				}
1370 			}
1371 		}
1372 	}
1373 	*cpu_ipp = pending_ints;
1374 	if (pending_ints == 0)
1375 		return;
1376 	/*
1377 	 * We have gathered all the pending events/interrupts,
1378 	 * go service all the ones we can from highest priority to lowest.
1379 	 * Note: This loop may not actually complete and service all
1380 	 * pending interrupts since one of the interrupt threads may
1381 	 * block and the pinned thread runs.  In that case, when we
1382 	 * exit the interrupt thread that blocked we will check for
1383 	 * any unserviced interrupts and re-post an upcall to process
1384 	 * any unserviced pending events.
1385 	 */
1386 	curpri = cpu->cpu_pri;
1387 	for (pri = bsrw_insn(*cpu_ipp); pri > curpri; pri--) {
1388 		while ((pending_sels = cpe->pending_sel[pri]) != 0) {
1389 			i = ffs(pending_sels) - 1;
1390 			while ((pe = cpe->pending_evts[pri][i]) != 0) {
1391 				j = ffs(pe) - 1;
1392 				pe &= ~(1ul << j);
1393 				cpe->pending_evts[pri][i] = pe;
1394 				if (pe == 0) {
1395 					/*
1396 					 * Must reload pending selector bits
1397 					 * here as they could have changed on
1398 					 * a previous trip around the inner loop
1399 					 * while we were interrupt enabled
1400 					 * in a interrupt service routine.
1401 					 */
1402 					pending_sels = cpe->pending_sel[pri];
1403 					pending_sels &= ~(1ul << i);
1404 					cpe->pending_sel[pri] = pending_sels;
1405 					if (pending_sels == 0)
1406 						*cpu_ipp &= ~(1 << pri);
1407 				}
1408 				port = (i << EVTCHN_SHIFT) + j;
1409 				irq = evtchn_to_irq[port];
1410 				if (irq == INVALID_IRQ) {
1411 					/*
1412 					 * No longer a handler for this event
1413 					 * channel.  Clear the event and
1414 					 * ignore it, unmask the event.
1415 					 */
1416 					ec_clear_evtchn(port);
1417 					ec_unmask_evtchn(port);
1418 					continue;
1419 				}
1420 				if (irq == ec_dev_irq) {
1421 					volatile int *tptr = &ec_dev_mbox;
1422 
1423 					ASSERT(ec_dev_mbox == 0);
1424 					/*
1425 					 * NOTE: this gross store thru a pointer
1426 					 * is necessary because of a Sun C
1427 					 * compiler bug that does not properly
1428 					 * honor a volatile declaration.
1429 					 * we really should just be able to say
1430 					 * 	ec_dev_mbox = port;
1431 					 * here
1432 					 */
1433 					*tptr = port;
1434 				}
1435 				/*
1436 				 * Set up the regs struct to
1437 				 * look like a normal hardware int
1438 				 * and do normal interrupt handling.
1439 				 */
1440 				rp->r_trapno = irq;
1441 				do_interrupt(rp, ttp);
1442 				/*
1443 				 * Check for cpu priority change
1444 				 * Can happen if int thread blocks
1445 				 */
1446 				if (cpu->cpu_pri > curpri)
1447 					return;
1448 			}
1449 		}
1450 	}
1451 }
1452 
1453 void
1454 ec_unmask_evtchn(unsigned int ev)
1455 {
1456 	uint_t evi;
1457 	volatile shared_info_t *si = HYPERVISOR_shared_info;
1458 	volatile vcpu_info_t *vci = CPU->cpu_m.mcpu_vcpu_info;
1459 	volatile ulong_t *ulp;
1460 
1461 	ASSERT(!interrupts_enabled());
1462 	/*
1463 	 * Check if we need to take slow path
1464 	 */
1465 	if (!CPU_IN_SET(evtchn_cpus[ev], CPU->cpu_id)) {
1466 		xen_evtchn_unmask(ev);
1467 		return;
1468 	}
1469 	evi = ev >> EVTCHN_SHIFT;
1470 	ev &= (1ul << EVTCHN_SHIFT) - 1;
1471 	ulp = (volatile ulong_t *)&si->evtchn_mask[evi];
1472 	atomic_and_ulong(ulp, ~(1ul << ev));
1473 	/*
1474 	 * The following is basically the equivalent of
1475 	 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose the
1476 	 * interrupt edge' if the channel is masked.
1477 	 * XXPV - slight race if upcall was about to be set, we may get
1478 	 * an extra upcall.
1479 	 */
1480 	membar_enter();
1481 	if (si->evtchn_pending[evi] & (1ul << ev)) {
1482 		membar_consumer();
1483 		ulp = (volatile ulong_t *)&vci->evtchn_pending_sel;
1484 		if (!(*ulp & (1ul << evi))) {
1485 			atomic_or_ulong(ulp, (1ul << evi));
1486 		}
1487 		vci->evtchn_upcall_pending = 1;
1488 	}
1489 }
1490 
1491 /*
1492  * Set a bit in an evtchan mask word, return true if we are the cpu that
1493  * set the bit.
1494  */
1495 int
1496 ec_mask_evtchn(unsigned int ev)
1497 {
1498 	uint_t evi, evb;
1499 	ulong_t new, old, bit;
1500 	volatile shared_info_t *si = HYPERVISOR_shared_info;
1501 	volatile ulong_t *maskp;
1502 	int masked;
1503 
1504 	kpreempt_disable();
1505 	evi = ev >> EVTCHN_SHIFT;
1506 	evb = ev & ((1ul << EVTCHN_SHIFT) - 1);
1507 	bit = 1ul << evb;
1508 	maskp = (volatile ulong_t *)&si->evtchn_mask[evi];
1509 	do {
1510 		old = si->evtchn_mask[evi];
1511 		new = old | bit;
1512 	} while (atomic_cas_ulong(maskp, old, new) != old);
1513 	masked = (old & bit) == 0;
1514 	if (masked) {
1515 		evtchn_owner[ev] = CPU->cpu_id;
1516 #ifdef DEBUG
1517 		evtchn_owner_thread[ev] = curthread;
1518 #endif
1519 	}
1520 	kpreempt_enable();
1521 	return (masked);
1522 }
1523 
1524 void
1525 ec_clear_evtchn(unsigned int ev)
1526 {
1527 	uint_t evi;
1528 	shared_info_t *si = HYPERVISOR_shared_info;
1529 	volatile ulong_t *pendp;
1530 
1531 	evi = ev >> EVTCHN_SHIFT;
1532 	ev &= (1ul << EVTCHN_SHIFT) - 1;
1533 	pendp = (volatile ulong_t *)&si->evtchn_pending[evi];
1534 	atomic_and_ulong(pendp, ~(1ul << ev));
1535 }
1536 
1537 void
1538 ec_notify_via_evtchn(unsigned int port)
1539 {
1540 	evtchn_send_t send;
1541 
1542 	ASSERT(port != INVALID_EVTCHN);
1543 
1544 	send.port = port;
1545 	(void) HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
1546 }
1547 
1548 int
1549 ec_block_irq(int irq)
1550 {
1551 	irq_info_t *irqp = &irq_info[irq];
1552 	int evtchn;
1553 
1554 
1555 	evtchn = irq_evtchn(irqp);
1556 	(void) ec_mask_evtchn(evtchn);
1557 	return (evtchn_owner[evtchn]);
1558 }
1559 
1560 /*
1561  * Make a event that is pending for delivery on the current cpu  "go away"
1562  * without servicing the interrupt.
1563  */
1564 void
1565 ec_unpend_irq(int irq)
1566 {
1567 	irq_info_t *irqp = &irq_info[irq];
1568 	int pri = irqp->ii_u2.ipl;
1569 	ulong_t flags;
1570 	uint_t evtchn, evi, bit;
1571 	unsigned long pe, pending_sels;
1572 	struct xen_evt_data *cpe;
1573 
1574 	/*
1575 	 * The evtchn must be masked
1576 	 */
1577 	evtchn = irq_evtchn(irqp);
1578 	ASSERT(EVTCHN_MASKED(evtchn));
1579 	evi = evtchn >> EVTCHN_SHIFT;
1580 	bit = evtchn & (1ul << EVTCHN_SHIFT) - 1;
1581 	flags = intr_clear();
1582 	cpe = CPU->cpu_m.mcpu_evt_pend;
1583 	pe = cpe->pending_evts[pri][evi] & ~(1ul << bit);
1584 	cpe->pending_evts[pri][evi] = pe;
1585 	if (pe == 0) {
1586 		pending_sels = cpe->pending_sel[pri];
1587 		pending_sels &= ~(1ul << evi);
1588 		cpe->pending_sel[pri] = pending_sels;
1589 		if (pending_sels == 0)
1590 			CPU->cpu_m.mcpu_intr_pending &= ~(1 << pri);
1591 	}
1592 	intr_restore(flags);
1593 }
1594