xref: /titanic_52/usr/src/uts/i86xpv/io/psm/xpv_psm.c (revision d321a33cdd896e6b211d113a33698dd76e89b861)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #define	PSMI_1_5
30 
31 #include <sys/mutex.h>
32 #include <sys/types.h>
33 #include <sys/time.h>
34 #include <sys/clock.h>
35 #include <sys/machlock.h>
36 #include <sys/smp_impldefs.h>
37 #include <sys/uadmin.h>
38 #include <sys/promif.h>
39 #include <sys/psm.h>
40 #include <sys/psm_common.h>
41 #include <sys/atomic.h>
42 #include <sys/apic.h>
43 #include <sys/archsystm.h>
44 #include <sys/mach_intr.h>
45 #include <sys/hypervisor.h>
46 #include <sys/evtchn_impl.h>
47 #include <sys/modctl.h>
48 #include <sys/trap.h>
49 #include <sys/panic.h>
50 
51 #include <xen/public/vcpu.h>
52 #include <xen/public/physdev.h>
53 
54 
/*
 * Global Data
 */

/* bitmask of XEN_PSM_VERBOSE_*_FLAG values selecting diagnostic output */
int xen_psm_verbose = 0;

volatile uint32_t *apicadr = NULL;	/* dummy, so common code will link */
int apic_error = 0;
int apic_verbose = 0;
cpuset_t apic_cpumask;
int apic_forceload = 0;
/*
 * NOTE(review): presumably indexed by vector group (APIC_VECTOR_PER_IPL
 * vectors per entry) and yielding the ipl for that group -- confirm against
 * the common apic code that consumes it.
 */
uchar_t apic_vectortoipl[APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL] = {
	3, 4, 5, 5, 6, 6, 9, 10, 11, 12, 13, 14, 15, 15
};
uchar_t apic_ipltopri[MAXIPL + 1];
uchar_t apic_ipls[APIC_AVAIL_VECTOR];
uint_t apic_picinit_called;	/* set to 1 by xen_psm_picinit() in dom0 */
apic_cpus_info_t *apic_cpus;
/* policy consulted by xen_psm_bind_intr() when choosing a target cpu */
int xen_psm_intr_policy = INTR_ROUND_ROBIN_WITH_AFFINITY;
/* used to make sure only one cpu handles the nmi */
static lock_t xen_psm_nmi_lock;
int xen_psm_kmdb_on_nmi = 0;		/* 0 - no, 1 - yes enter kmdb */
int xen_psm_panic_on_nmi = 0;		/* nonzero - panic when an NMI arrives */
int xen_psm_num_nmis = 0;		/* incremented on every NMI handler entry */

cpuset_t xen_psm_cpus_online;	/* online cpus */
int xen_psm_ncpus = 1;		/* cpu count */
int xen_psm_next_bind_cpu;	/* next cpu to bind an interrupt to */

/*
 * XXPV we flag MSI as not supported, since the hypervisor currently doesn't
 * support MSI at all.  Change this initialization to zero when MSI is
 * supported.
 *
 * As used by xen_intr_ops(): 0 means "not probed yet", 1 means MSI is
 * supported and -1 means it is not.
 */
int xen_support_msi = -1;

/* irq bound to VIRQ_TIMER on cpu 0; cached by xen_psm_get_clockirq() */
static int xen_clock_irq = INVALID_IRQ;
92 
/*
 * Flag definitions for xen_psm_verbose.  Each bit enables one class of
 * diagnostic output.
 */
#define	XEN_PSM_VERBOSE_IRQ_FLAG		0x00000001
#define	XEN_PSM_VERBOSE_POWEROFF_FLAG		0x00000002
#define	XEN_PSM_VERBOSE_POWEROFF_PAUSE_FLAG	0x00000004

/*
 * Conditional diagnostics.  "fmt" is a complete, parenthesized argument
 * list for the underlying print routine, e.g.
 *
 *	XEN_PSM_VERBOSE_IRQ((CE_CONT, "irq = %d\n", irq));
 *
 * Each macro expands to a single do { } while (0) statement so that it can
 * be used as the unbraced body of an if/else without capturing the caller's
 * "else" or leaving a stray empty statement behind.  The caller supplies
 * the terminating semicolon.
 */
#define	XEN_PSM_VERBOSE_IRQ(fmt) \
	do { \
		if (xen_psm_verbose & XEN_PSM_VERBOSE_IRQ_FLAG) \
			cmn_err fmt; \
	} while (0)

#define	XEN_PSM_VERBOSE_POWEROFF(fmt) \
	do { \
		if (xen_psm_verbose & XEN_PSM_VERBOSE_POWEROFF_FLAG) \
			prom_printf fmt; \
	} while (0)
105 
/*
 * Dummy apic register array handed to common routines that want to do some
 * apic manipulation.  Xen doesn't allow guest apic access, so we point such
 * code at these memory locations to fake out those who want to do apic
 * fiddling.  mapin_apic() returns the address of this array.
 */
uint32_t xen_psm_dummy_apic[APIC_IRR_REG + 1];

/* forward declarations for objects defined later in this file */
static struct psm_info xen_psm_info;
static void xen_psm_setspl(int);

static int apic_alloc_vectors(dev_info_t *, int, int, int, int, int);
117 
118 /*
119  * Local support routines
120  */
121 
122 /*
123  * Select vcpu to bind xen virtual device interrupt to.
124  */
125 /*ARGSUSED*/
126 int
127 xen_psm_bind_intr(int irq)
128 {
129 	int bind_cpu, test_cpu;
130 	apic_irq_t *irqptr;
131 
132 	if (xen_psm_intr_policy == INTR_LOWEST_PRIORITY)
133 		return (IRQ_UNBOUND);
134 	if (irq <= APIC_MAX_VECTOR)
135 		irqptr = apic_irq_table[irq];
136 	else
137 		irqptr = NULL;
138 	if (irqptr && (irqptr->airq_cpu & IRQ_USER_BOUND)) {
139 		bind_cpu = irqptr->airq_cpu;
140 		test_cpu = bind_cpu & ~IRQ_USER_BOUND;
141 		if (!CPU_IN_SET(xen_psm_cpus_online, test_cpu))
142 			bind_cpu = 0;
143 		goto done;
144 	}
145 	if (xen_psm_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
146 		do {
147 			bind_cpu = xen_psm_next_bind_cpu++;
148 			if (xen_psm_next_bind_cpu >= xen_psm_ncpus)
149 				xen_psm_next_bind_cpu = 0;
150 		} while (!CPU_IN_SET(xen_psm_cpus_online, bind_cpu));
151 	} else {
152 		bind_cpu = 0;
153 	}
154 done:
155 	return (bind_cpu);
156 }
157 
158 /*
159  * Autoconfiguration Routines
160  */
161 
162 static int
163 xen_psm_probe(void)
164 {
165 	int ret = PSM_SUCCESS;
166 
167 	if (DOMAIN_IS_INITDOMAIN(xen_info))
168 		ret = apic_probe_common(xen_psm_info.p_mach_idstring);
169 	return (ret);
170 }
171 
172 static void
173 xen_psm_softinit(void)
174 {
175 	/* LINTED logical expression always true: op "||" */
176 	ASSERT((1 << EVTCHN_SHIFT) == NBBY * sizeof (ulong_t));
177 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, 0);
178 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
179 		apic_init_common();
180 	}
181 }
182 
183 #define	XEN_NSEC_PER_TICK	10 /* XXX - assume we have a 100 Mhz clock */
184 
185 /*ARGSUSED*/
186 static int
187 xen_psm_clkinit(int hertz)
188 {
189 	extern enum tod_fault_type tod_fault(enum tod_fault_type, int);
190 	extern int dosynctodr;
191 
192 	/*
193 	 * domU cannot set the TOD hardware, fault the TOD clock now to
194 	 * indicate that and turn off attempts to sync TOD hardware
195 	 * with the hires timer.
196 	 */
197 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
198 		mutex_enter(&tod_lock);
199 		(void) tod_fault(TOD_RDONLY, 0);
200 		dosynctodr = 0;
201 		mutex_exit(&tod_lock);
202 	}
203 	/*
204 	 * The hypervisor provides a timer based on the local APIC timer.
205 	 * The interface supports requests of nanosecond resolution.
206 	 * A common frequency of the apic clock is 100 Mhz which
207 	 * gives a resolution of 10 nsec per tick.  What we would really like
208 	 * is a way to get the ns per tick value from xen.
209 	 * XXPV - This is an assumption that needs checking and may change
210 	 */
211 	return (XEN_NSEC_PER_TICK);
212 }
213 
/*
 * High resolution time initialization: just sets the gethrtime_hires flag.
 */
static void
xen_psm_hrtimeinit(void)
{
	extern int gethrtime_hires;

	gethrtime_hires = 1;
}
220 
221 /* xen_psm NMI handler */
222 /*ARGSUSED*/
223 static void
224 xen_psm_nmi_intr(caddr_t arg, struct regs *rp)
225 {
226 	xen_psm_num_nmis++;
227 
228 	if (!lock_try(&xen_psm_nmi_lock))
229 		return;
230 
231 	if (xen_psm_kmdb_on_nmi && psm_debugger()) {
232 		debug_enter("NMI received: entering kmdb\n");
233 	} else if (xen_psm_panic_on_nmi) {
234 		/* Keep panic from entering kmdb. */
235 		nopanicdebug = 1;
236 		panic("NMI received\n");
237 	} else {
238 		/*
239 		 * prom_printf is the best shot we have of something which is
240 		 * problem free from high level/NMI type of interrupts
241 		 */
242 		prom_printf("NMI received\n");
243 	}
244 
245 	lock_clear(&xen_psm_nmi_lock);
246 }
247 
248 static void
249 xen_psm_picinit()
250 {
251 	int cpu, irqno;
252 	cpuset_t cpus;
253 
254 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
255 		/* set a flag so we know we have run xen_psm_picinit() */
256 		apic_picinit_called = 1;
257 		LOCK_INIT_CLEAR(&apic_ioapic_lock);
258 
259 		/* XXPV - do we need to do this? */
260 		picsetup();	 /* initialise the 8259 */
261 
262 		/* enable apic mode if imcr present */
263 		/* XXPV - do we need to do this either? */
264 		if (apic_imcrp) {
265 			outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT);
266 			outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_APIC);
267 		}
268 
269 		ioapic_init_intr(IOAPIC_NOMASK);
270 		/*
271 		 * We never called xen_psm_addspl() when the SCI
272 		 * interrupt was added because that happened before the
273 		 * PSM module was loaded.  Fix that up here by doing
274 		 * any missed operations (e.g. bind to CPU)
275 		 */
276 		if ((irqno = apic_sci_vect) > 0) {
277 			if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
278 				CPUSET_ZERO(cpus);
279 				CPUSET_OR(cpus, xen_psm_cpus_online);
280 			} else {
281 				CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
282 			}
283 			ec_set_irq_affinity(irqno, cpus);
284 			apic_irq_table[irqno]->airq_temp_cpu =
285 			    (uchar_t)(cpu & ~IRQ_USER_BOUND);
286 			ec_enable_irq(irqno);
287 		}
288 	}
289 
290 	/* add nmi handler - least priority nmi handler */
291 	LOCK_INIT_CLEAR(&xen_psm_nmi_lock);
292 
293 	if (!psm_add_nmintr(0, (avfunc) xen_psm_nmi_intr,
294 	    "xVM_psm NMI handler", (caddr_t)NULL))
295 		cmn_err(CE_WARN, "xVM_psm: Unable to add nmi handler");
296 }
297 
298 
299 /*
300  * generates an interprocessor interrupt to another CPU
301  */
302 static void
303 xen_psm_send_ipi(int cpun, int ipl)
304 {
305 	ulong_t flag = intr_clear();
306 
307 	ec_send_ipi(ipl, cpun);
308 	intr_restore(flag);
309 }
310 
311 /*ARGSUSED*/
312 static int
313 xen_psm_addspl(int irqno, int ipl, int min_ipl, int max_ipl)
314 {
315 	int cpu, ret;
316 	cpuset_t cpus;
317 
318 	/*
319 	 * We are called at splhi() so we can't call anything that might end
320 	 * up trying to context switch.
321 	 */
322 	if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
323 	    DOMAIN_IS_INITDOMAIN(xen_info)) {
324 		/*
325 		 * Priority/affinity/enable for PIRQ's is set in ec_setup_pirq()
326 		 */
327 		ret = apic_addspl_common(irqno, ipl, min_ipl, max_ipl);
328 	} else {
329 		/*
330 		 * Set priority/affinity/enable for non PIRQs
331 		 */
332 		ret = ec_set_irq_priority(irqno, ipl);
333 		ASSERT(ret == 0);
334 		if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
335 			CPUSET_ZERO(cpus);
336 			CPUSET_OR(cpus, xen_psm_cpus_online);
337 		} else {
338 			CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
339 		}
340 		ec_set_irq_affinity(irqno, cpus);
341 		ec_enable_irq(irqno);
342 	}
343 	return (ret);
344 }
345 
346 /*
347  * Acquire ownership of this irq on this cpu
348  */
349 void
350 xen_psm_acquire_irq(int irq)
351 {
352 	ulong_t flags;
353 	int cpuid;
354 
355 	/*
356 	 * If the irq is currently being serviced by another cpu
357 	 * we busy-wait for the other cpu to finish.  Take any
358 	 * pending interrupts before retrying.
359 	 */
360 	do {
361 		flags = intr_clear();
362 		cpuid = ec_block_irq(irq);
363 		intr_restore(flags);
364 	} while (cpuid != CPU->cpu_id);
365 }
366 
367 /*ARGSUSED*/
368 static int
369 xen_psm_delspl(int irqno, int ipl, int min_ipl, int max_ipl)
370 {
371 	apic_irq_t *irqptr;
372 	int err = PSM_SUCCESS;
373 
374 	if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
375 	    DOMAIN_IS_INITDOMAIN(xen_info)) {
376 		irqptr = apic_irq_table[irqno];
377 		/*
378 		 * unbind if no more sharers of this irq/evtchn
379 		 */
380 		if (irqptr->airq_share == 1) {
381 			xen_psm_acquire_irq(irqno);
382 			ec_unbind_irq(irqno);
383 		}
384 		err = apic_delspl_common(irqno, ipl, min_ipl, max_ipl);
385 		/*
386 		 * If still in use reset priority
387 		 */
388 		if (!err && irqptr->airq_share != 0) {
389 			err = ec_set_irq_priority(irqno, max_ipl);
390 			return (err);
391 		}
392 	} else {
393 		xen_psm_acquire_irq(irqno);
394 		ec_unbind_irq(irqno);
395 	}
396 	return (err);
397 }
398 
399 static processorid_t
400 xen_psm_get_next_processorid(processorid_t id)
401 {
402 	if (id == -1)
403 		return (0);
404 
405 	for (id++; id < NCPU; id++) {
406 		switch (-HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL)) {
407 		case 0:		/* yeah, that one's there */
408 			return (id);
409 		default:
410 		case X_EINVAL:	/* out of range */
411 			return (-1);
412 		case X_ENOENT:	/* not present in the domain */
413 			/*
414 			 * It's not clear that we -need- to keep looking
415 			 * at this point, if, e.g., we can guarantee
416 			 * the hypervisor always keeps a contiguous range
417 			 * of vcpus around this is equivalent to "out of range".
418 			 *
419 			 * But it would be sad to miss a vcpu we're
420 			 * supposed to be using ..
421 			 */
422 			break;
423 		}
424 	}
425 
426 	return (-1);
427 }
428 
429 /*
430  * XXPV - undo the start cpu op change; return to ignoring this value
431  *	- also tweak error handling in main startup loop
432  */
433 /*ARGSUSED*/
434 static int
435 xen_psm_cpu_start(processorid_t id, caddr_t arg)
436 {
437 	int ret;
438 
439 	ASSERT(id > 0);
440 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, id);
441 	ec_bind_cpu_ipis(id);
442 	(void) ec_bind_virq_to_irq(VIRQ_TIMER, id);
443 	if ((ret = xen_vcpu_up(id)) == 0)
444 		xen_psm_ncpus++;
445 	else
446 		ret = EINVAL;
447 	return (ret);
448 }
449 
/*
 * Allocate an irq for inter cpu signaling at "ipl".  The irq is obtained
 * by binding the IPI on cpu 0; "type" is ignored.
 */
/*ARGSUSED*/
static int
xen_psm_get_ipivect(int ipl, int type)
{
	int irq;

	irq = ec_bind_ipi_to_irq(ipl, 0);
	return (irq);
}
459 
460 /*ARGSUSED*/
461 static int
462 xen_psm_get_clockirq(int ipl)
463 {
464 	if (xen_clock_irq != INVALID_IRQ)
465 		return (xen_clock_irq);
466 
467 	xen_clock_irq = ec_bind_virq_to_irq(VIRQ_TIMER, 0);
468 	return (xen_clock_irq);
469 }
470 
471 /*ARGSUSED*/
472 static void
473 xen_psm_shutdown(int cmd, int fcn)
474 {
475 	XEN_PSM_VERBOSE_POWEROFF(("xen_psm_shutdown(%d,%d);\n", cmd, fcn));
476 
477 	switch (cmd) {
478 	case A_SHUTDOWN:
479 		switch (fcn) {
480 		case AD_BOOT:
481 		case AD_IBOOT:
482 			(void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
483 			break;
484 		case AD_POWEROFF:
485 			/* fall through if domU or if poweroff fails */
486 			if (DOMAIN_IS_INITDOMAIN(xen_info))
487 				if (apic_enable_acpi)
488 					(void) acpi_poweroff();
489 			/* FALLTHRU */
490 		case AD_HALT:
491 		default:
492 			(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
493 			break;
494 		}
495 		break;
496 	case A_REBOOT:
497 		(void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
498 		break;
499 	default:
500 		return;
501 	}
502 }
503 
504 
505 static int
506 xen_psm_translate_irq(dev_info_t *dip, int irqno)
507 {
508 	if (dip == NULL) {
509 		XEN_PSM_VERBOSE_IRQ((CE_CONT, "!xen_psm: irqno = %d"
510 		    " dip = NULL\n", irqno));
511 		return (irqno);
512 	}
513 	return (irqno);
514 }
515 
516 /*
517  * xen_psm_intr_enter() acks the event that triggered the interrupt and
518  * returns the new priority level,
519  */
520 /*ARGSUSED*/
521 static int
522 xen_psm_intr_enter(int ipl, int *vector)
523 {
524 	int newipl;
525 	uint_t intno;
526 	cpu_t *cpu = CPU;
527 
528 	intno = (*vector);
529 
530 	ASSERT(intno < NR_IRQS);
531 	ASSERT(cpu->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask != 0);
532 
533 	ec_clear_irq(intno);
534 
535 	newipl = autovect[intno].avh_hi_pri;
536 	if (newipl == 0) {
537 		/*
538 		 * (newipl == 0) means we have no service routines for this
539 		 * vector.  We will treat this as a spurious interrupt.
540 		 * We have cleared the pending bit already, clear the event
541 		 * mask and return a spurious interrupt.  This case can happen
542 		 * when an interrupt delivery is racing with the removal of
543 		 * of the service routine for that interrupt.
544 		 */
545 		ec_unmask_irq(intno);
546 		newipl = -1;	/* flag spurious interrupt */
547 	} else if (newipl <= cpu->cpu_pri) {
548 		/*
549 		 * (newipl <= cpu->cpu_pri) means that we must be trying to
550 		 * service a vector that was shared with a higher priority
551 		 * isr.  The higher priority handler has been removed and
552 		 * we need to service this int.  We can't return a lower
553 		 * priority than current cpu priority.  Just synthesize a
554 		 * priority to return that should be acceptable.
555 		 */
556 		newipl = cpu->cpu_pri + 1;	/* synthetic priority */
557 	}
558 	return (newipl);
559 }
560 
561 
/*
 * xen_psm_intr_exit() is called after an interrupt has been processed.  It
 * first tries to unmask the irq that was serviced and then restores the old
 * interrupt priority level.
 * It is called with interrupts disabled, and does not enable interrupts.
 */
/* ARGSUSED */
static void
xen_psm_intr_exit(int ipl, int vector)
{
	ec_try_unmask_irq(vector);

	/* may flag an upcall for events that the lower ipl now admits */
	xen_psm_setspl(ipl);
}
574 
575 intr_exit_fn_t
576 psm_intr_exit_fn(void)
577 {
578 	return (xen_psm_intr_exit);
579 }
580 
581 /*
582  * Check if new ipl level allows delivery of previously unserviced events
583  */
584 static void
585 xen_psm_setspl(int ipl)
586 {
587 	struct cpu *cpu = CPU;
588 	volatile vcpu_info_t *vci = cpu->cpu_m.mcpu_vcpu_info;
589 	uint16_t pending;
590 
591 	ASSERT(vci->evtchn_upcall_mask != 0);
592 
593 	/*
594 	 * If new ipl level will enable any pending interrupts, setup so the
595 	 * upcoming sti will cause us to get an upcall.
596 	 */
597 	pending = cpu->cpu_m.mcpu_intr_pending & ~((1 << (ipl + 1)) - 1);
598 	if (pending) {
599 		int i;
600 		ulong_t pending_sels = 0;
601 		volatile ulong_t *selp;
602 		struct xen_evt_data *cpe = cpu->cpu_m.mcpu_evt_pend;
603 
604 		for (i = bsrw_insn(pending); i > ipl; i--)
605 			pending_sels |= cpe->pending_sel[i];
606 		ASSERT(pending_sels);
607 		selp = (volatile ulong_t *)&vci->evtchn_pending_sel;
608 		atomic_or_ulong(selp, pending_sels);
609 		vci->evtchn_upcall_pending = 1;
610 	}
611 }
612 
613 /*
614  * This function provides external interface to the nexus for all
615  * functionality related to the new DDI interrupt framework.
616  *
617  * Input:
618  * dip     - pointer to the dev_info structure of the requested device
619  * hdlp    - pointer to the internal interrupt handle structure for the
620  *	     requested interrupt
621  * intr_op - opcode for this call
622  * result  - pointer to the integer that will hold the result to be
623  *	     passed back if return value is PSM_SUCCESS
624  *
625  * Output:
626  * return value is either PSM_SUCCESS or PSM_FAILURE
627  */
628 int
629 xen_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
630     psm_intr_op_t intr_op, int *result)
631 {
632 	int		cap;
633 	int		err;
634 	int		new_priority;
635 	apic_irq_t	*irqp;
636 	struct intrspec *ispec;
637 
638 	DDI_INTR_IMPLDBG((CE_CONT, "xen_intr_ops: dip: %p hdlp: %p "
639 	    "intr_op: %x\n", (void *)dip, (void *)hdlp, intr_op));
640 
641 	switch (intr_op) {
642 	case PSM_INTR_OP_CHECK_MSI:
643 		if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
644 			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
645 			    DDI_INTR_TYPE_MSIX);
646 			break;
647 		}
648 		/*
649 		 * Check MSI/X is supported or not at APIC level and
650 		 * masked off the MSI/X bits in hdlp->ih_type if not
651 		 * supported before return.  If MSI/X is supported,
652 		 * leave the ih_type unchanged and return.
653 		 *
654 		 * hdlp->ih_type passed in from the nexus has all the
655 		 * interrupt types supported by the device.
656 		 */
657 		if (xen_support_msi == 0) {
658 			/*
659 			 * if xen_support_msi is not set, call
660 			 * apic_check_msi_support() to check whether msi
661 			 * is supported first
662 			 */
663 			if (apic_check_msi_support() == PSM_SUCCESS)
664 				xen_support_msi = 1;
665 			else
666 				xen_support_msi = -1;
667 		}
668 		if (xen_support_msi == 1)
669 			*result = hdlp->ih_type;
670 		else
671 			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
672 			    DDI_INTR_TYPE_MSIX);
673 		break;
674 	case PSM_INTR_OP_ALLOC_VECTORS:
675 		*result = apic_alloc_vectors(dip, hdlp->ih_inum,
676 		    hdlp->ih_scratch1, hdlp->ih_pri, hdlp->ih_type,
677 		    (int)(uintptr_t)hdlp->ih_scratch2);
678 		break;
679 	case PSM_INTR_OP_FREE_VECTORS:
680 		apic_free_vectors(dip, hdlp->ih_inum, hdlp->ih_scratch1,
681 		    hdlp->ih_pri, hdlp->ih_type);
682 		break;
683 	case PSM_INTR_OP_NAVAIL_VECTORS:
684 		/*
685 		 * XXPV - maybe we should make this be:
686 		 * min(APIC_VECTOR_PER_IPL, count of all avail vectors);
687 		 */
688 		if (DOMAIN_IS_INITDOMAIN(xen_info))
689 			*result = APIC_VECTOR_PER_IPL;
690 		else
691 			*result = 1;
692 		break;
693 	case PSM_INTR_OP_XLATE_VECTOR:
694 		ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
695 		if (ispec->intrspec_vec >= PIRQ_BASE &&
696 		    ispec->intrspec_vec < NR_PIRQS &&
697 		    DOMAIN_IS_INITDOMAIN(xen_info)) {
698 			*result = apic_introp_xlate(dip, ispec, hdlp->ih_type);
699 		} else {
700 			*result = ispec->intrspec_vec;
701 		}
702 		break;
703 	case PSM_INTR_OP_GET_PENDING:
704 		/* XXPV - is this enough for dom0 or do we need to ref ioapic */
705 		*result = ec_pending_irq(hdlp->ih_vector);
706 		break;
707 	case PSM_INTR_OP_CLEAR_MASK:
708 		/* XXPV - is this enough for dom0 or do we need to set ioapic */
709 		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
710 			return (PSM_FAILURE);
711 		ec_enable_irq(hdlp->ih_vector);
712 		break;
713 	case PSM_INTR_OP_SET_MASK:
714 		/* XXPV - is this enough for dom0 or do we need to set ioapic */
715 		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
716 			return (PSM_FAILURE);
717 		ec_disable_irq(hdlp->ih_vector);
718 		break;
719 	case PSM_INTR_OP_GET_CAP:
720 		cap = DDI_INTR_FLAG_PENDING | DDI_INTR_FLAG_EDGE;
721 		if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
722 			cap |= DDI_INTR_FLAG_MASKABLE;
723 		*result = cap;
724 		break;
725 	case PSM_INTR_OP_GET_SHARED:
726 		if (DOMAIN_IS_INITDOMAIN(xen_info)) {
727 			if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
728 				return (PSM_FAILURE);
729 			if ((irqp = apic_find_irq(dip, ispec, hdlp->ih_type))
730 			    == NULL)
731 				return (PSM_FAILURE);
732 			*result = irqp->airq_share ? 1: 0;
733 		} else {
734 			return (PSM_FAILURE);
735 		}
736 		break;
737 	case PSM_INTR_OP_SET_PRI:
738 		new_priority = *(int *)result;
739 		err = ec_set_irq_priority(hdlp->ih_vector, new_priority);
740 		if (err != 0)
741 			return (PSM_FAILURE);
742 		break;
743 	case PSM_INTR_OP_GET_INTR:
744 		if (!DOMAIN_IS_INITDOMAIN(xen_info))
745 			return (PSM_FAILURE);
746 		/*
747 		 * The interrupt handle given here has been allocated
748 		 * specifically for this command, and ih_private carries
749 		 * a pointer to a apic_get_intr_t.
750 		 */
751 		if (apic_get_vector_intr_info(
752 		    hdlp->ih_vector, hdlp->ih_private) != PSM_SUCCESS)
753 			return (PSM_FAILURE);
754 		break;
755 	case PSM_INTR_OP_SET_CAP:
756 		/* FALLTHRU */
757 	default:
758 		return (PSM_FAILURE);
759 	}
760 	return (PSM_SUCCESS);
761 }
762 
763 static void
764 xen_psm_rebind_irq(int irq)
765 {
766 	cpuset_t ncpu;
767 	processorid_t newcpu;
768 	apic_irq_t *irqptr;
769 
770 	newcpu = xen_psm_bind_intr(irq);
771 	if (newcpu == IRQ_UNBOUND) {
772 		CPUSET_ZERO(ncpu);
773 		CPUSET_OR(ncpu, xen_psm_cpus_online);
774 	} else {
775 		CPUSET_ONLY(ncpu, newcpu & ~IRQ_USER_BOUND);
776 	}
777 	ec_set_irq_affinity(irq, ncpu);
778 	if (irq <= APIC_MAX_VECTOR) {
779 		irqptr = apic_irq_table[irq];
780 		ASSERT(irqptr != NULL);
781 		irqptr->airq_temp_cpu = (uchar_t)newcpu;
782 	}
783 }
784 
785 /*
786  * Disable all device interrupts for the given cpu.
787  * High priority interrupts are not disabled and will still be serviced.
788  */
789 static int
790 xen_psm_disable_intr(processorid_t cpun)
791 {
792 	int irq;
793 
794 	/*
795 	 * Can't offline VCPU 0 on this hypervisor.  There's no reason
796 	 * anyone would want to given that the CPUs are virtual. Also note
797 	 * that the hypervisor requires suspend/resume to be on VCPU 0.
798 	 */
799 	if (cpun == 0)
800 		return (PSM_FAILURE);
801 
802 	CPUSET_ATOMIC_DEL(xen_psm_cpus_online, cpun);
803 	for (irq = 0; irq < NR_IRQS; irq++) {
804 		if (!ec_irq_needs_rebind(irq, cpun))
805 			continue;
806 		xen_psm_rebind_irq(irq);
807 	}
808 	return (PSM_SUCCESS);
809 }
810 
811 static void
812 xen_psm_enable_intr(processorid_t cpun)
813 {
814 	int irq;
815 
816 	if (cpun == 0)
817 		return;
818 
819 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, cpun);
820 
821 	/*
822 	 * Rebalance device interrupts among online processors
823 	 */
824 	for (irq = 0; irq < NR_IRQS; irq++) {
825 		if (!ec_irq_rebindable(irq))
826 			continue;
827 		xen_psm_rebind_irq(irq);
828 	}
829 }
830 
831 static int
832 xen_psm_post_cpu_start()
833 {
834 	processorid_t cpun;
835 
836 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
837 		cpun = psm_get_cpu_id();
838 		apic_cpus[cpun].aci_status =
839 		    APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE;
840 	}
841 	/*
842 	 * Re-distribute interrupts to include the newly added cpu.
843 	 */
844 	xen_psm_enable_intr(cpun);
845 	return (PSM_SUCCESS);
846 }
847 
848 /*
849  * This function will reprogram the timer.
850  *
851  * When in oneshot mode the argument is the absolute time in future at which to
852  * generate the interrupt.
853  *
854  * When in periodic mode, the argument is the interval at which the
855  * interrupts should be generated. There is no need to support the periodic
856  * mode timer change at this time.
857  *
858  * Note that we must be careful to convert from hrtime to Xen system time (see
859  * xpv_timestamp.c).
860  */
861 static void
862 xen_psm_timer_reprogram(hrtime_t timer_req)
863 {
864 	hrtime_t now, timer_new, time_delta, xen_time;
865 	ulong_t flags;
866 
867 	flags = intr_clear();
868 	/*
869 	 * We should be called from high PIL context (CBE_HIGH_PIL),
870 	 * so kpreempt is disabled.
871 	 */
872 
873 	now = xpv_gethrtime();
874 	xen_time = xpv_getsystime();
875 	if (timer_req <= now) {
876 		/*
877 		 * requested to generate an interrupt in the past
878 		 * generate an interrupt as soon as possible
879 		 */
880 		time_delta = XEN_NSEC_PER_TICK;
881 	} else
882 		time_delta = timer_req - now;
883 
884 	timer_new = xen_time + time_delta;
885 	if (HYPERVISOR_set_timer_op(timer_new) != 0)
886 		panic("can't set hypervisor timer?");
887 	intr_restore(flags);
888 }
889 
890 /*
891  * This function will enable timer interrupts.
892  */
893 static void
894 xen_psm_timer_enable(void)
895 {
896 	ec_unmask_irq(xen_clock_irq);
897 }
898 
899 /*
900  * This function will disable timer interrupts on the current cpu.
901  */
902 static void
903 xen_psm_timer_disable(void)
904 {
905 	(void) ec_block_irq(xen_clock_irq);
906 	/*
907 	 * If the clock irq is pending on this cpu then we need to
908 	 * clear the pending interrupt.
909 	 */
910 	ec_unpend_irq(xen_clock_irq);
911 }
912 
/*
 * The following functions live in this platform-specific file so that their
 * implementations can differ depending on whether we are running on bare
 * metal or on a hypervisor.
 */
919 
920 /*
921  * Allocate a free vector for irq at ipl.
922  */
923 /* ARGSUSED */
924 uchar_t
925 apic_allocate_vector(int ipl, int irq, int pri)
926 {
927 	physdev_irq_t irq_op;
928 	uchar_t vector;
929 
930 	irq_op.irq = irq;
931 
932 	if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
933 		panic("Hypervisor alloc vector failed");
934 	vector = irq_op.vector;
935 	/*
936 	 * No need to worry about vector colliding with our reserved vectors
937 	 * e.g. T_FASTTRAP, xen can differentiate between hardware and software
938 	 * generated traps and handle them properly.
939 	 */
940 	apic_vector_to_irq[vector] = (uchar_t)irq;
941 	return (vector);
942 }
943 
944 /* Mark vector as not being used by any irq */
945 void
946 apic_free_vector(uchar_t vector)
947 {
948 	apic_vector_to_irq[vector] = APIC_RESV_IRQ;
949 }
950 
951 /*
952  * This function allocate "count" vector(s) for the given "dip/pri/type"
953  */
954 static int
955 apic_alloc_vectors(dev_info_t *dip, int inum, int count, int pri, int type,
956     int behavior)
957 {
958 	int	rcount, i;
959 	uchar_t	vector, cpu;
960 	int irqno;
961 	major_t	major;
962 	apic_irq_t	*irqptr;
963 
964 	/* only supports MSI at the moment, will add MSI-X support later */
965 	if (type != DDI_INTR_TYPE_MSI)
966 		return (0);
967 
968 	DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_vectors: dip=0x%p type=%d "
969 	    "inum=0x%x  pri=0x%x count=0x%x behavior=%d\n",
970 	    (void *)dip, type, inum, pri, count, behavior));
971 
972 	if (count > 1) {
973 		if (behavior == DDI_INTR_ALLOC_STRICT &&
974 		    (apic_multi_msi_enable == 0 || count > apic_multi_msi_max))
975 			return (0);
976 
977 		if (apic_multi_msi_enable == 0)
978 			count = 1;
979 		else if (count > apic_multi_msi_max)
980 			count = apic_multi_msi_max;
981 	}
982 
983 	/*
984 	 * XXPV - metal version takes all vectors avail at given pri.
985 	 * Why do that?  For now just allocate count vectors.
986 	 */
987 	rcount = count;
988 
989 	mutex_enter(&airq_mutex);
990 
991 	/*
992 	 * XXPV - currently the hypervisor does not support MSI at all.
993 	 * It doesn't return consecutive vectors.  This code is a first
994 	 * cut for the (future) time that MSI is supported.
995 	 */
996 	major = (dip != NULL) ? ddi_name_to_major(ddi_get_name(dip)) : 0;
997 	for (i = 0; i < rcount; i++) {
998 		if ((irqno = apic_allocate_irq(apic_first_avail_irq)) ==
999 		    INVALID_IRQ) {
1000 			mutex_exit(&airq_mutex);
1001 			DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_vectors: "
1002 			    "apic_allocate_irq failed\n"));
1003 			return (i);
1004 		}
1005 		apic_max_device_irq = max(irqno, apic_max_device_irq);
1006 		apic_min_device_irq = min(irqno, apic_min_device_irq);
1007 		irqptr = apic_irq_table[irqno];
1008 		vector = apic_allocate_vector(pri, irqno, 0);
1009 		apic_vector_to_irq[vector] = (uchar_t)irqno;
1010 #ifdef	DEBUG
1011 		if (apic_vector_to_irq[vector] != APIC_RESV_IRQ)
1012 			DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_vectors: "
1013 			    "apic_vector_to_irq is not APIC_RESV_IRQ\n"));
1014 #endif
1015 
1016 		irqptr->airq_vector = vector;
1017 		irqptr->airq_ioapicindex = (uchar_t)inum;	/* start */
1018 		irqptr->airq_intin_no = (uchar_t)rcount;
1019 		irqptr->airq_ipl = pri;
1020 		irqptr->airq_origirq = (uchar_t)(inum + i);
1021 		irqptr->airq_share_id = 0;
1022 		irqptr->airq_mps_intr_index = MSI_INDEX;
1023 		irqptr->airq_dip = dip;
1024 		irqptr->airq_major = major;
1025 		if (i == 0) /* they all bound to the same cpu */
1026 			cpu = irqptr->airq_cpu = apic_bind_intr(dip, irqno,
1027 			    0xff, 0xff);
1028 		else
1029 			irqptr->airq_cpu = cpu;
1030 		DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_vectors: irq=0x%x "
1031 		    "dip=0x%p vector=0x%x origirq=0x%x pri=0x%x\n", irqno,
1032 		    (void *)irqptr->airq_dip, irqptr->airq_vector,
1033 		    irqptr->airq_origirq, pri));
1034 	}
1035 	mutex_exit(&airq_mutex);
1036 	return (rcount);
1037 }
1038 
1039 /*
1040  * The hypervisor doesn't permit access to local apics directly
1041  */
1042 /* ARGSUSED */
1043 uint32_t *
1044 mapin_apic(uint32_t addr, size_t len, int flags)
1045 {
1046 	/*
1047 	 * Return a pointer to a memory area to fake out the
1048 	 * probe code that wants to read apic registers.
1049 	 * The dummy values will end up being ignored by xen
1050 	 * later on when they are used anyway.
1051 	 */
1052 	xen_psm_dummy_apic[APIC_VERS_REG] = APIC_INTEGRATED_VERS;
1053 	return (xen_psm_dummy_apic);
1054 }
1055 
/*
 * Pretend to map an ioapic.  All arguments are ignored and a fixed non-null
 * token is returned.
 */
/* ARGSUSED */
uint32_t *
mapin_ioapic(uint32_t addr, size_t len, int flags)
{
	/*
	 * Return non-null here to fake out configure code that calls this.
	 * The i86xpv platform will not reference through the returned value.
	 */
	uint32_t *token = (uint32_t *)0x1;

	return (token);
}
1066 
1067 /* ARGSUSED */
1068 void
1069 mapout_apic(caddr_t addr, size_t len)
1070 {
1071 }
1072 
1073 /* ARGSUSED */
1074 void
1075 mapout_ioapic(caddr_t addr, size_t len)
1076 {
1077 }
1078 
1079 uint32_t
1080 ioapic_read(int apic_ix, uint32_t reg)
1081 {
1082 	physdev_apic_t apic;
1083 
1084 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1085 	apic.reg = reg;
1086 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic))
1087 		panic("read ioapic %d reg %d failed", apic_ix, reg);
1088 	return (apic.value);
1089 }
1090 
1091 void
1092 ioapic_write(int apic_ix, uint32_t reg, uint32_t value)
1093 {
1094 	physdev_apic_t apic;
1095 
1096 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1097 	apic.reg = reg;
1098 	apic.value = value;
1099 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
1100 		panic("write ioapic %d reg %d failed", apic_ix, reg);
1101 }
1102 
1103 /*
1104  * Call rebind to do the actual programming.
1105  */
1106 int
1107 apic_setup_io_intr(void *p, int irq, boolean_t deferred)
1108 {
1109 	apic_irq_t *irqptr;
1110 	struct ioapic_reprogram_data *drep = NULL;
1111 	int rv, cpu;
1112 	cpuset_t cpus;
1113 
1114 	/*
1115 	 * Set cpu based on xen idea of online cpu's not apic tables.
1116 	 * Note that xen ignores/sets to it's own preferred value the
1117 	 * target cpu field when programming ioapic anyway.
1118 	 */
1119 	if ((cpu = xen_psm_bind_intr(irq)) == IRQ_UNBOUND) {
1120 		CPUSET_ZERO(cpus);
1121 		CPUSET_OR(cpus, xen_psm_cpus_online);
1122 	} else {
1123 		CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
1124 	}
1125 	apic_irq_table[irq]->airq_cpu = cpu;
1126 	if (deferred) {
1127 		drep = (struct ioapic_reprogram_data *)p;
1128 		ASSERT(drep != NULL);
1129 		irqptr = drep->irqp;
1130 	} else {
1131 		irqptr = (apic_irq_t *)p;
1132 	}
1133 	ASSERT(irqptr != NULL);
1134 	rv = apic_rebind(irqptr, cpu, drep);
1135 	if (rv) {
1136 		/* CPU is not up or interrupt is disabled. Fall back to 0 */
1137 		cpu = 0;
1138 		rv = apic_rebind(irqptr, cpu, drep);
1139 	}
1140 	/*
1141 	 * If rebind successful bind the irq to an event channel
1142 	 */
1143 	if (rv == 0) {
1144 		ec_setup_pirq(irq, irqptr->airq_ipl, &cpus);
1145 		CPUSET_FIND(cpus, cpu);
1146 		apic_irq_table[irq]->airq_temp_cpu = cpu & ~IRQ_USER_BOUND;
1147 	}
1148 	return (rv);
1149 }
1150 
1151 /*
1152  * Allocate a new vector for the given irq
1153  */
1154 /* ARGSUSED */
1155 uchar_t
1156 apic_modify_vector(uchar_t vector, int irq)
1157 {
1158 	return (apic_allocate_vector(0, irq, 0));
1159 }
1160 
1161 /*
1162  * The rest of the file is just generic psm module boilerplate
1163  */
1164 
1165 static struct psm_ops xen_psm_ops = {
1166 	xen_psm_probe,				/* psm_probe		*/
1167 
1168 	xen_psm_softinit,			/* psm_init		*/
1169 	xen_psm_picinit,			/* psm_picinit		*/
1170 	xen_psm_intr_enter,			/* psm_intr_enter	*/
1171 	xen_psm_intr_exit,			/* psm_intr_exit	*/
1172 	xen_psm_setspl,				/* psm_setspl		*/
1173 	xen_psm_addspl,				/* psm_addspl		*/
1174 	xen_psm_delspl,				/* psm_delspl		*/
1175 	xen_psm_disable_intr,			/* psm_disable_intr	*/
1176 	xen_psm_enable_intr,			/* psm_enable_intr	*/
1177 	(int (*)(int))NULL,			/* psm_softlvl_to_irq	*/
1178 	(void (*)(int))NULL,			/* psm_set_softintr	*/
1179 	(void (*)(processorid_t))NULL,		/* psm_set_idlecpu	*/
1180 	(void (*)(processorid_t))NULL,		/* psm_unset_idlecpu	*/
1181 
1182 	xen_psm_clkinit,			/* psm_clkinit		*/
1183 	xen_psm_get_clockirq,			/* psm_get_clockirq	*/
1184 	xen_psm_hrtimeinit,			/* psm_hrtimeinit	*/
1185 	xpv_gethrtime,				/* psm_gethrtime	*/
1186 
1187 	xen_psm_get_next_processorid,		/* psm_get_next_processorid */
1188 	xen_psm_cpu_start,			/* psm_cpu_start	*/
1189 	xen_psm_post_cpu_start,			/* psm_post_cpu_start	*/
1190 	xen_psm_shutdown,			/* psm_shutdown		*/
1191 	xen_psm_get_ipivect,			/* psm_get_ipivect	*/
1192 	xen_psm_send_ipi,			/* psm_send_ipi		*/
1193 
1194 	xen_psm_translate_irq,			/* psm_translate_irq	*/
1195 
1196 	(void (*)(int, char *))NULL,		/* psm_notify_error	*/
1197 	(void (*)(int msg))NULL,		/* psm_notify_func	*/
1198 	xen_psm_timer_reprogram,		/* psm_timer_reprogram	*/
1199 	xen_psm_timer_enable,			/* psm_timer_enable	*/
1200 	xen_psm_timer_disable,			/* psm_timer_disable	*/
1201 	(void (*)(void *arg))NULL,		/* psm_post_cyclic_setup */
1202 	(void (*)(int, int))NULL,		/* psm_preshutdown	*/
1203 	xen_intr_ops			/* Advanced DDI Interrupt framework */
1204 };
1205 
1206 static struct psm_info xen_psm_info = {
1207 	PSM_INFO_VER01_5,	/* version				*/
1208 	PSM_OWN_SYS_DEFAULT,	/* ownership				*/
1209 	&xen_psm_ops,		/* operation				*/
1210 	"xVM_psm",		/* machine name				*/
1211 	"platform module %I%"	/* machine descriptions			*/
1212 };
1213 
/*
 * Module handle; its address is passed to psm_mod_init(), psm_mod_fini()
 * and psm_mod_info().
 */
static void *xen_psm_hdlp = NULL;
1215 
1216 int
1217 _init(void)
1218 {
1219 	return (psm_mod_init(&xen_psm_hdlp, &xen_psm_info));
1220 }
1221 
1222 int
1223 _fini(void)
1224 {
1225 	return (psm_mod_fini(&xen_psm_hdlp, &xen_psm_info));
1226 }
1227 
1228 int
1229 _info(struct modinfo *modinfop)
1230 {
1231 	return (psm_mod_info(&xen_psm_hdlp, &xen_psm_info, modinfop));
1232 }
1233