xref: /titanic_52/usr/src/uts/i86xpv/io/psm/xpv_psm.c (revision 08c92e0e5d8d3c6bb3708cac154d2afba4edb6a4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #define	PSMI_1_6
28 
29 #include <sys/mutex.h>
30 #include <sys/types.h>
31 #include <sys/time.h>
32 #include <sys/clock.h>
33 #include <sys/machlock.h>
34 #include <sys/smp_impldefs.h>
35 #include <sys/uadmin.h>
36 #include <sys/promif.h>
37 #include <sys/psm.h>
38 #include <sys/psm_common.h>
39 #include <sys/atomic.h>
40 #include <sys/apic.h>
41 #include <sys/archsystm.h>
42 #include <sys/mach_intr.h>
43 #include <sys/hypervisor.h>
44 #include <sys/evtchn_impl.h>
45 #include <sys/modctl.h>
46 #include <sys/trap.h>
47 #include <sys/panic.h>
48 
49 #include <xen/public/vcpu.h>
50 #include <xen/public/physdev.h>
51 
52 
53 /*
54  * Global Data
55  */
56 
57 int xen_psm_verbose = 0;
58 
59 /* As of now we don't support x2apic in xVM */
60 volatile uint32_t *apicadr = NULL;	/* dummy, so common code will link */
61 int apic_error = 0;
62 int apic_verbose = 0;
63 cpuset_t apic_cpumask;
64 int apic_forceload = 0;
65 uchar_t apic_vectortoipl[APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL] = {
66 	3, 4, 5, 5, 6, 6, 9, 10, 11, 12, 13, 14, 15, 15
67 };
68 uchar_t apic_ipltopri[MAXIPL + 1];
69 uchar_t apic_ipls[APIC_AVAIL_VECTOR];
70 uint_t apic_picinit_called;
71 apic_cpus_info_t *apic_cpus;
72 int xen_psm_intr_policy = INTR_ROUND_ROBIN_WITH_AFFINITY;
73 /* use to make sure only one cpu handles the nmi */
74 static lock_t xen_psm_nmi_lock;
75 int xen_psm_kmdb_on_nmi = 0;		/* 0 - no, 1 - yes enter kmdb */
76 int xen_psm_panic_on_nmi = 0;
77 int xen_psm_num_nmis = 0;
78 
79 cpuset_t xen_psm_cpus_online;	/* online cpus */
80 int xen_psm_ncpus = 1;		/* cpu count */
81 int xen_psm_next_bind_cpu;	/* next cpu to bind an interrupt to */
82 
83 /*
84  * XXPV we flag MSI as not supported, since the hypervisor currently doesn't
85  * support MSI at all.  Change this initialization to zero when MSI is
86  * supported.
87  */
88 int xen_support_msi = -1;
89 
90 static int xen_clock_irq = INVALID_IRQ;
91 
/* flag definitions for xen_psm_verbose */
#define	XEN_PSM_VERBOSE_IRQ_FLAG		0x00000001
#define	XEN_PSM_VERBOSE_POWEROFF_FLAG		0x00000002
#define	XEN_PSM_VERBOSE_POWEROFF_PAUSE_FLAG	0x00000004

/*
 * Conditional debug output helpers.  "fmt" is a complete, parenthesized
 * argument list for the underlying print routine, e.g.
 *
 *	XEN_PSM_VERBOSE_IRQ((CE_CONT, "!irq %d\n", irq));
 *
 * Each macro expands to exactly one statement (do { } while (0)), so it
 * can be used as the unbraced body of an if/else without capturing a
 * following "else" or leaving a stray empty statement behind.
 */
#define	XEN_PSM_VERBOSE_IRQ(fmt) \
	do { \
		if (xen_psm_verbose & XEN_PSM_VERBOSE_IRQ_FLAG) \
			cmn_err fmt; \
	} while (0)

#define	XEN_PSM_VERBOSE_POWEROFF(fmt) \
	do { \
		if (xen_psm_verbose & XEN_PSM_VERBOSE_POWEROFF_FLAG) \
			prom_printf fmt; \
	} while (0)
104 
105 /*
106  * Dummy apic array to point common routines at that want to do some apic
107  * manipulation.  Xen doesn't allow guest apic access so we point at these
108  * memory locations to fake out those who want to do apic fiddling.
109  */
110 uint32_t xen_psm_dummy_apic[APIC_IRR_REG + 1];
111 
112 static struct psm_info xen_psm_info;
113 static void xen_psm_setspl(int);
114 
115 static int apic_alloc_vectors(dev_info_t *, int, int, int, int, int);
116 
117 /*
118  * Local support routines
119  */
120 
121 /*
122  * Select vcpu to bind xen virtual device interrupt to.
123  */
124 /*ARGSUSED*/
125 int
126 xen_psm_bind_intr(int irq)
127 {
128 	int bind_cpu, test_cpu;
129 	apic_irq_t *irqptr;
130 
131 	if (xen_psm_intr_policy == INTR_LOWEST_PRIORITY)
132 		return (IRQ_UNBOUND);
133 	if (irq <= APIC_MAX_VECTOR)
134 		irqptr = apic_irq_table[irq];
135 	else
136 		irqptr = NULL;
137 	if (irqptr && (irqptr->airq_cpu & IRQ_USER_BOUND)) {
138 		bind_cpu = irqptr->airq_cpu;
139 		test_cpu = bind_cpu & ~IRQ_USER_BOUND;
140 		if (!CPU_IN_SET(xen_psm_cpus_online, test_cpu))
141 			bind_cpu = 0;
142 		goto done;
143 	}
144 	if (xen_psm_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
145 		do {
146 			bind_cpu = xen_psm_next_bind_cpu++;
147 			if (xen_psm_next_bind_cpu >= xen_psm_ncpus)
148 				xen_psm_next_bind_cpu = 0;
149 		} while (!CPU_IN_SET(xen_psm_cpus_online, bind_cpu));
150 	} else {
151 		bind_cpu = 0;
152 	}
153 done:
154 	return (bind_cpu);
155 }
156 
157 /*
158  * Autoconfiguration Routines
159  */
160 
161 static int
162 xen_psm_probe(void)
163 {
164 	int ret = PSM_SUCCESS;
165 
166 	if (DOMAIN_IS_INITDOMAIN(xen_info))
167 		ret = apic_probe_common(xen_psm_info.p_mach_idstring);
168 	return (ret);
169 }
170 
171 static void
172 xen_psm_softinit(void)
173 {
174 	/* LINTED logical expression always true: op "||" */
175 	ASSERT((1 << EVTCHN_SHIFT) == NBBY * sizeof (ulong_t));
176 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, 0);
177 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
178 		apic_init_common();
179 	}
180 }
181 
182 #define	XEN_NSEC_PER_TICK	10 /* XXX - assume we have a 100 Mhz clock */
183 
184 /*ARGSUSED*/
185 static int
186 xen_psm_clkinit(int hertz)
187 {
188 	extern enum tod_fault_type tod_fault(enum tod_fault_type, int);
189 	extern int dosynctodr;
190 
191 	/*
192 	 * domU cannot set the TOD hardware, fault the TOD clock now to
193 	 * indicate that and turn off attempts to sync TOD hardware
194 	 * with the hires timer.
195 	 */
196 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
197 		mutex_enter(&tod_lock);
198 		(void) tod_fault(TOD_RDONLY, 0);
199 		dosynctodr = 0;
200 		mutex_exit(&tod_lock);
201 	}
202 	/*
203 	 * The hypervisor provides a timer based on the local APIC timer.
204 	 * The interface supports requests of nanosecond resolution.
205 	 * A common frequency of the apic clock is 100 Mhz which
206 	 * gives a resolution of 10 nsec per tick.  What we would really like
207 	 * is a way to get the ns per tick value from xen.
208 	 * XXPV - This is an assumption that needs checking and may change
209 	 */
210 	return (XEN_NSEC_PER_TICK);
211 }
212 
/*
 * PSM hrtime initialization: sets the gethrtime_hires flag.
 */
static void
xen_psm_hrtimeinit(void)
{
	extern int gethrtime_hires;

	gethrtime_hires = 1;
}
219 
220 /* xen_psm NMI handler */
221 /*ARGSUSED*/
222 static void
223 xen_psm_nmi_intr(caddr_t arg, struct regs *rp)
224 {
225 	xen_psm_num_nmis++;
226 
227 	if (!lock_try(&xen_psm_nmi_lock))
228 		return;
229 
230 	if (xen_psm_kmdb_on_nmi && psm_debugger()) {
231 		debug_enter("NMI received: entering kmdb\n");
232 	} else if (xen_psm_panic_on_nmi) {
233 		/* Keep panic from entering kmdb. */
234 		nopanicdebug = 1;
235 		panic("NMI received\n");
236 	} else {
237 		/*
238 		 * prom_printf is the best shot we have of something which is
239 		 * problem free from high level/NMI type of interrupts
240 		 */
241 		prom_printf("NMI received\n");
242 	}
243 
244 	lock_clear(&xen_psm_nmi_lock);
245 }
246 
247 static void
248 xen_psm_picinit()
249 {
250 	int cpu, irqno;
251 	cpuset_t cpus;
252 
253 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
254 		/* set a flag so we know we have run xen_psm_picinit() */
255 		apic_picinit_called = 1;
256 		LOCK_INIT_CLEAR(&apic_ioapic_lock);
257 
258 		/* XXPV - do we need to do this? */
259 		picsetup();	 /* initialise the 8259 */
260 
261 		/* enable apic mode if imcr present */
262 		/* XXPV - do we need to do this either? */
263 		if (apic_imcrp) {
264 			outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT);
265 			outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_APIC);
266 		}
267 
268 		ioapic_init_intr(IOAPIC_NOMASK);
269 		/*
270 		 * We never called xen_psm_addspl() when the SCI
271 		 * interrupt was added because that happened before the
272 		 * PSM module was loaded.  Fix that up here by doing
273 		 * any missed operations (e.g. bind to CPU)
274 		 */
275 		if ((irqno = apic_sci_vect) > 0) {
276 			if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
277 				CPUSET_ZERO(cpus);
278 				CPUSET_OR(cpus, xen_psm_cpus_online);
279 			} else {
280 				CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
281 			}
282 			ec_set_irq_affinity(irqno, cpus);
283 			apic_irq_table[irqno]->airq_temp_cpu =
284 			    (uchar_t)(cpu & ~IRQ_USER_BOUND);
285 			ec_enable_irq(irqno);
286 		}
287 	}
288 
289 	/* add nmi handler - least priority nmi handler */
290 	LOCK_INIT_CLEAR(&xen_psm_nmi_lock);
291 
292 	if (!psm_add_nmintr(0, (avfunc) xen_psm_nmi_intr,
293 	    "xVM_psm NMI handler", (caddr_t)NULL))
294 		cmn_err(CE_WARN, "xVM_psm: Unable to add nmi handler");
295 }
296 
297 
298 /*
299  * generates an interprocessor interrupt to another CPU
300  */
301 static void
302 xen_psm_send_ipi(int cpun, int ipl)
303 {
304 	ulong_t flag = intr_clear();
305 
306 	ec_send_ipi(ipl, cpun);
307 	intr_restore(flag);
308 }
309 
310 /*ARGSUSED*/
311 static int
312 xen_psm_addspl(int irqno, int ipl, int min_ipl, int max_ipl)
313 {
314 	int cpu, ret;
315 	cpuset_t cpus;
316 
317 	/*
318 	 * We are called at splhi() so we can't call anything that might end
319 	 * up trying to context switch.
320 	 */
321 	if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
322 	    DOMAIN_IS_INITDOMAIN(xen_info)) {
323 		/*
324 		 * Priority/affinity/enable for PIRQ's is set in ec_setup_pirq()
325 		 */
326 		ret = apic_addspl_common(irqno, ipl, min_ipl, max_ipl);
327 	} else {
328 		/*
329 		 * Set priority/affinity/enable for non PIRQs
330 		 */
331 		ret = ec_set_irq_priority(irqno, ipl);
332 		ASSERT(ret == 0);
333 		if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
334 			CPUSET_ZERO(cpus);
335 			CPUSET_OR(cpus, xen_psm_cpus_online);
336 		} else {
337 			CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
338 		}
339 		ec_set_irq_affinity(irqno, cpus);
340 		ec_enable_irq(irqno);
341 	}
342 	return (ret);
343 }
344 
345 /*
346  * Acquire ownership of this irq on this cpu
347  */
348 void
349 xen_psm_acquire_irq(int irq)
350 {
351 	ulong_t flags;
352 	int cpuid;
353 
354 	/*
355 	 * If the irq is currently being serviced by another cpu
356 	 * we busy-wait for the other cpu to finish.  Take any
357 	 * pending interrupts before retrying.
358 	 */
359 	do {
360 		flags = intr_clear();
361 		cpuid = ec_block_irq(irq);
362 		intr_restore(flags);
363 	} while (cpuid != CPU->cpu_id);
364 }
365 
366 /*ARGSUSED*/
367 static int
368 xen_psm_delspl(int irqno, int ipl, int min_ipl, int max_ipl)
369 {
370 	apic_irq_t *irqptr;
371 	int err = PSM_SUCCESS;
372 
373 	if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
374 	    DOMAIN_IS_INITDOMAIN(xen_info)) {
375 		irqptr = apic_irq_table[irqno];
376 		/*
377 		 * unbind if no more sharers of this irq/evtchn
378 		 */
379 		if (irqptr->airq_share == 1) {
380 			xen_psm_acquire_irq(irqno);
381 			ec_unbind_irq(irqno);
382 		}
383 		err = apic_delspl_common(irqno, ipl, min_ipl, max_ipl);
384 		/*
385 		 * If still in use reset priority
386 		 */
387 		if (!err && irqptr->airq_share != 0) {
388 			err = ec_set_irq_priority(irqno, max_ipl);
389 			return (err);
390 		}
391 	} else {
392 		xen_psm_acquire_irq(irqno);
393 		ec_unbind_irq(irqno);
394 	}
395 	return (err);
396 }
397 
398 static processorid_t
399 xen_psm_get_next_processorid(processorid_t id)
400 {
401 	if (id == -1)
402 		return (0);
403 
404 	for (id++; id < NCPU; id++) {
405 		switch (-HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL)) {
406 		case 0:		/* yeah, that one's there */
407 			return (id);
408 		default:
409 		case X_EINVAL:	/* out of range */
410 			return (-1);
411 		case X_ENOENT:	/* not present in the domain */
412 			/*
413 			 * It's not clear that we -need- to keep looking
414 			 * at this point, if, e.g., we can guarantee
415 			 * the hypervisor always keeps a contiguous range
416 			 * of vcpus around this is equivalent to "out of range".
417 			 *
418 			 * But it would be sad to miss a vcpu we're
419 			 * supposed to be using ..
420 			 */
421 			break;
422 		}
423 	}
424 
425 	return (-1);
426 }
427 
428 /*
429  * XXPV - undo the start cpu op change; return to ignoring this value
430  *	- also tweak error handling in main startup loop
431  */
432 /*ARGSUSED*/
433 static int
434 xen_psm_cpu_start(processorid_t id, caddr_t arg)
435 {
436 	int ret;
437 
438 	ASSERT(id > 0);
439 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, id);
440 	ec_bind_cpu_ipis(id);
441 	(void) ec_bind_virq_to_irq(VIRQ_TIMER, id);
442 	if ((ret = xen_vcpu_up(id)) == 0)
443 		xen_psm_ncpus++;
444 	else
445 		ret = EINVAL;
446 	return (ret);
447 }
448 
/*
 * Allocate an irq for inter cpu signaling at "ipl".  The irq is obtained
 * by binding the IPI on cpu 0; "type" is not used.
 */
/*ARGSUSED*/
static int
xen_psm_get_ipivect(int ipl, int type)
{
	int irq;

	irq = ec_bind_ipi_to_irq(ipl, 0);
	return (irq);
}
458 
459 /*ARGSUSED*/
460 static int
461 xen_psm_get_clockirq(int ipl)
462 {
463 	if (xen_clock_irq != INVALID_IRQ)
464 		return (xen_clock_irq);
465 
466 	xen_clock_irq = ec_bind_virq_to_irq(VIRQ_TIMER, 0);
467 	return (xen_clock_irq);
468 }
469 
470 /*ARGSUSED*/
471 static void
472 xen_psm_shutdown(int cmd, int fcn)
473 {
474 	XEN_PSM_VERBOSE_POWEROFF(("xen_psm_shutdown(%d,%d);\n", cmd, fcn));
475 
476 	switch (cmd) {
477 	case A_SHUTDOWN:
478 		switch (fcn) {
479 		case AD_BOOT:
480 		case AD_IBOOT:
481 			(void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
482 			break;
483 		case AD_POWEROFF:
484 			/* fall through if domU or if poweroff fails */
485 			if (DOMAIN_IS_INITDOMAIN(xen_info))
486 				if (apic_enable_acpi)
487 					(void) acpi_poweroff();
488 			/* FALLTHRU */
489 		case AD_HALT:
490 		default:
491 			(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
492 			break;
493 		}
494 		break;
495 	case A_REBOOT:
496 		(void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
497 		break;
498 	default:
499 		return;
500 	}
501 }
502 
503 
504 static int
505 xen_psm_translate_irq(dev_info_t *dip, int irqno)
506 {
507 	if (dip == NULL) {
508 		XEN_PSM_VERBOSE_IRQ((CE_CONT, "!xen_psm: irqno = %d"
509 		    " dip = NULL\n", irqno));
510 		return (irqno);
511 	}
512 	return (irqno);
513 }
514 
515 /*
516  * xen_psm_intr_enter() acks the event that triggered the interrupt and
517  * returns the new priority level,
518  */
519 /*ARGSUSED*/
520 static int
521 xen_psm_intr_enter(int ipl, int *vector)
522 {
523 	int newipl;
524 	uint_t intno;
525 	cpu_t *cpu = CPU;
526 
527 	intno = (*vector);
528 
529 	ASSERT(intno < NR_IRQS);
530 	ASSERT(cpu->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask != 0);
531 
532 	ec_clear_irq(intno);
533 
534 	newipl = autovect[intno].avh_hi_pri;
535 	if (newipl == 0) {
536 		/*
537 		 * (newipl == 0) means we have no service routines for this
538 		 * vector.  We will treat this as a spurious interrupt.
539 		 * We have cleared the pending bit already, clear the event
540 		 * mask and return a spurious interrupt.  This case can happen
541 		 * when an interrupt delivery is racing with the removal of
542 		 * of the service routine for that interrupt.
543 		 */
544 		ec_unmask_irq(intno);
545 		newipl = -1;	/* flag spurious interrupt */
546 	} else if (newipl <= cpu->cpu_pri) {
547 		/*
548 		 * (newipl <= cpu->cpu_pri) means that we must be trying to
549 		 * service a vector that was shared with a higher priority
550 		 * isr.  The higher priority handler has been removed and
551 		 * we need to service this int.  We can't return a lower
552 		 * priority than current cpu priority.  Just synthesize a
553 		 * priority to return that should be acceptable.
554 		 */
555 		newipl = cpu->cpu_pri + 1;	/* synthetic priority */
556 	}
557 	return (newipl);
558 }
559 
560 
/*
 * xen_psm_intr_exit() is run after an interrupt has been processed: it
 * tries to unmask the irq given by "vector" and then restores the old
 * interrupt priority level "ipl".
 * It is called with interrupts disabled, and does not enable interrupts.
 */
/* ARGSUSED */
static void
xen_psm_intr_exit(int ipl, int vector)
{
	ec_try_unmask_irq(vector);

	/* may arrange for an upcall if events are now deliverable */
	xen_psm_setspl(ipl);
}
573 
574 intr_exit_fn_t
575 psm_intr_exit_fn(void)
576 {
577 	return (xen_psm_intr_exit);
578 }
579 
580 /*
581  * Check if new ipl level allows delivery of previously unserviced events
582  */
583 static void
584 xen_psm_setspl(int ipl)
585 {
586 	struct cpu *cpu = CPU;
587 	volatile vcpu_info_t *vci = cpu->cpu_m.mcpu_vcpu_info;
588 	uint16_t pending;
589 
590 	ASSERT(vci->evtchn_upcall_mask != 0);
591 
592 	/*
593 	 * If new ipl level will enable any pending interrupts, setup so the
594 	 * upcoming sti will cause us to get an upcall.
595 	 */
596 	pending = cpu->cpu_m.mcpu_intr_pending & ~((1 << (ipl + 1)) - 1);
597 	if (pending) {
598 		int i;
599 		ulong_t pending_sels = 0;
600 		volatile ulong_t *selp;
601 		struct xen_evt_data *cpe = cpu->cpu_m.mcpu_evt_pend;
602 
603 		for (i = bsrw_insn(pending); i > ipl; i--)
604 			pending_sels |= cpe->pending_sel[i];
605 		ASSERT(pending_sels);
606 		selp = (volatile ulong_t *)&vci->evtchn_pending_sel;
607 		atomic_or_ulong(selp, pending_sels);
608 		vci->evtchn_upcall_pending = 1;
609 	}
610 }
611 
612 /*
613  * This function provides external interface to the nexus for all
614  * functionality related to the new DDI interrupt framework.
615  *
616  * Input:
617  * dip     - pointer to the dev_info structure of the requested device
618  * hdlp    - pointer to the internal interrupt handle structure for the
619  *	     requested interrupt
620  * intr_op - opcode for this call
621  * result  - pointer to the integer that will hold the result to be
622  *	     passed back if return value is PSM_SUCCESS
623  *
624  * Output:
625  * return value is either PSM_SUCCESS or PSM_FAILURE
626  */
627 int
628 xen_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
629     psm_intr_op_t intr_op, int *result)
630 {
631 	int		cap;
632 	int		err;
633 	int		new_priority;
634 	apic_irq_t	*irqp;
635 	struct intrspec *ispec;
636 
637 	DDI_INTR_IMPLDBG((CE_CONT, "xen_intr_ops: dip: %p hdlp: %p "
638 	    "intr_op: %x\n", (void *)dip, (void *)hdlp, intr_op));
639 
640 	switch (intr_op) {
641 	case PSM_INTR_OP_CHECK_MSI:
642 		if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
643 			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
644 			    DDI_INTR_TYPE_MSIX);
645 			break;
646 		}
647 		/*
648 		 * Check MSI/X is supported or not at APIC level and
649 		 * masked off the MSI/X bits in hdlp->ih_type if not
650 		 * supported before return.  If MSI/X is supported,
651 		 * leave the ih_type unchanged and return.
652 		 *
653 		 * hdlp->ih_type passed in from the nexus has all the
654 		 * interrupt types supported by the device.
655 		 */
656 		if (xen_support_msi == 0) {
657 			/*
658 			 * if xen_support_msi is not set, call
659 			 * apic_check_msi_support() to check whether msi
660 			 * is supported first
661 			 */
662 			if (apic_check_msi_support() == PSM_SUCCESS)
663 				xen_support_msi = 1;
664 			else
665 				xen_support_msi = -1;
666 		}
667 		if (xen_support_msi == 1)
668 			*result = hdlp->ih_type;
669 		else
670 			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
671 			    DDI_INTR_TYPE_MSIX);
672 		break;
673 	case PSM_INTR_OP_ALLOC_VECTORS:
674 		*result = apic_alloc_vectors(dip, hdlp->ih_inum,
675 		    hdlp->ih_scratch1, hdlp->ih_pri, hdlp->ih_type,
676 		    (int)(uintptr_t)hdlp->ih_scratch2);
677 		break;
678 	case PSM_INTR_OP_FREE_VECTORS:
679 		apic_free_vectors(dip, hdlp->ih_inum, hdlp->ih_scratch1,
680 		    hdlp->ih_pri, hdlp->ih_type);
681 		break;
682 	case PSM_INTR_OP_NAVAIL_VECTORS:
683 		/*
684 		 * XXPV - maybe we should make this be:
685 		 * min(APIC_VECTOR_PER_IPL, count of all avail vectors);
686 		 */
687 		if (DOMAIN_IS_INITDOMAIN(xen_info))
688 			*result = APIC_VECTOR_PER_IPL;
689 		else
690 			*result = 1;
691 		break;
692 	case PSM_INTR_OP_XLATE_VECTOR:
693 		ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
694 		if (ispec->intrspec_vec >= PIRQ_BASE &&
695 		    ispec->intrspec_vec < NR_PIRQS &&
696 		    DOMAIN_IS_INITDOMAIN(xen_info)) {
697 			*result = apic_introp_xlate(dip, ispec, hdlp->ih_type);
698 		} else {
699 			*result = ispec->intrspec_vec;
700 		}
701 		break;
702 	case PSM_INTR_OP_GET_PENDING:
703 		/* XXPV - is this enough for dom0 or do we need to ref ioapic */
704 		*result = ec_pending_irq(hdlp->ih_vector);
705 		break;
706 	case PSM_INTR_OP_CLEAR_MASK:
707 		/* XXPV - is this enough for dom0 or do we need to set ioapic */
708 		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
709 			return (PSM_FAILURE);
710 		ec_enable_irq(hdlp->ih_vector);
711 		break;
712 	case PSM_INTR_OP_SET_MASK:
713 		/* XXPV - is this enough for dom0 or do we need to set ioapic */
714 		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
715 			return (PSM_FAILURE);
716 		ec_disable_irq(hdlp->ih_vector);
717 		break;
718 	case PSM_INTR_OP_GET_CAP:
719 		cap = DDI_INTR_FLAG_PENDING | DDI_INTR_FLAG_EDGE;
720 		if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
721 			cap |= DDI_INTR_FLAG_MASKABLE;
722 		*result = cap;
723 		break;
724 	case PSM_INTR_OP_GET_SHARED:
725 		if (DOMAIN_IS_INITDOMAIN(xen_info)) {
726 			if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
727 				return (PSM_FAILURE);
728 			if ((irqp = apic_find_irq(dip, ispec, hdlp->ih_type))
729 			    == NULL)
730 				return (PSM_FAILURE);
731 			*result = irqp->airq_share ? 1: 0;
732 		} else {
733 			return (PSM_FAILURE);
734 		}
735 		break;
736 	case PSM_INTR_OP_SET_PRI:
737 		new_priority = *(int *)result;
738 		err = ec_set_irq_priority(hdlp->ih_vector, new_priority);
739 		if (err != 0)
740 			return (PSM_FAILURE);
741 		break;
742 	case PSM_INTR_OP_GET_INTR:
743 		if (!DOMAIN_IS_INITDOMAIN(xen_info))
744 			return (PSM_FAILURE);
745 		/*
746 		 * The interrupt handle given here has been allocated
747 		 * specifically for this command, and ih_private carries
748 		 * a pointer to a apic_get_intr_t.
749 		 */
750 		if (apic_get_vector_intr_info(
751 		    hdlp->ih_vector, hdlp->ih_private) != PSM_SUCCESS)
752 			return (PSM_FAILURE);
753 		break;
754 	case PSM_INTR_OP_SET_CAP:
755 		/* FALLTHRU */
756 	default:
757 		return (PSM_FAILURE);
758 	}
759 	return (PSM_SUCCESS);
760 }
761 
762 static void
763 xen_psm_rebind_irq(int irq)
764 {
765 	cpuset_t ncpu;
766 	processorid_t newcpu;
767 	apic_irq_t *irqptr;
768 
769 	newcpu = xen_psm_bind_intr(irq);
770 	if (newcpu == IRQ_UNBOUND) {
771 		CPUSET_ZERO(ncpu);
772 		CPUSET_OR(ncpu, xen_psm_cpus_online);
773 	} else {
774 		CPUSET_ONLY(ncpu, newcpu & ~IRQ_USER_BOUND);
775 	}
776 	ec_set_irq_affinity(irq, ncpu);
777 	if (irq <= APIC_MAX_VECTOR) {
778 		irqptr = apic_irq_table[irq];
779 		ASSERT(irqptr != NULL);
780 		irqptr->airq_temp_cpu = (uchar_t)newcpu;
781 	}
782 }
783 
784 /*
785  * Disable all device interrupts for the given cpu.
786  * High priority interrupts are not disabled and will still be serviced.
787  */
788 static int
789 xen_psm_disable_intr(processorid_t cpun)
790 {
791 	int irq;
792 
793 	/*
794 	 * Can't offline VCPU 0 on this hypervisor.  There's no reason
795 	 * anyone would want to given that the CPUs are virtual. Also note
796 	 * that the hypervisor requires suspend/resume to be on VCPU 0.
797 	 */
798 	if (cpun == 0)
799 		return (PSM_FAILURE);
800 
801 	CPUSET_ATOMIC_DEL(xen_psm_cpus_online, cpun);
802 	for (irq = 0; irq < NR_IRQS; irq++) {
803 		if (!ec_irq_needs_rebind(irq, cpun))
804 			continue;
805 		xen_psm_rebind_irq(irq);
806 	}
807 	return (PSM_SUCCESS);
808 }
809 
810 static void
811 xen_psm_enable_intr(processorid_t cpun)
812 {
813 	int irq;
814 
815 	if (cpun == 0)
816 		return;
817 
818 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, cpun);
819 
820 	/*
821 	 * Rebalance device interrupts among online processors
822 	 */
823 	for (irq = 0; irq < NR_IRQS; irq++) {
824 		if (!ec_irq_rebindable(irq))
825 			continue;
826 		xen_psm_rebind_irq(irq);
827 	}
828 
829 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
830 		apic_cpus[cpun].aci_status |= APIC_CPU_INTR_ENABLE;
831 	}
832 }
833 
834 static int
835 xen_psm_post_cpu_start()
836 {
837 	processorid_t cpun;
838 
839 	cpun = psm_get_cpu_id();
840 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
841 		/*
842 		 * Non-virtualized environments can call psm_post_cpu_start
843 		 * from Suspend/Resume with the APIC_CPU_INTR_ENABLE bit set.
844 		 * xen_psm_post_cpu_start() is only called from boot.
845 		 */
846 		apic_cpus[cpun].aci_status |= APIC_CPU_ONLINE;
847 	}
848 	return (PSM_SUCCESS);
849 }
850 
851 /*
852  * This function will reprogram the timer.
853  *
854  * When in oneshot mode the argument is the absolute time in future at which to
855  * generate the interrupt.
856  *
857  * When in periodic mode, the argument is the interval at which the
858  * interrupts should be generated. There is no need to support the periodic
859  * mode timer change at this time.
860  *
861  * Note that we must be careful to convert from hrtime to Xen system time (see
862  * xpv_timestamp.c).
863  */
864 static void
865 xen_psm_timer_reprogram(hrtime_t timer_req)
866 {
867 	hrtime_t now, timer_new, time_delta, xen_time;
868 	ulong_t flags;
869 
870 	flags = intr_clear();
871 	/*
872 	 * We should be called from high PIL context (CBE_HIGH_PIL),
873 	 * so kpreempt is disabled.
874 	 */
875 
876 	now = xpv_gethrtime();
877 	xen_time = xpv_getsystime();
878 	if (timer_req <= now) {
879 		/*
880 		 * requested to generate an interrupt in the past
881 		 * generate an interrupt as soon as possible
882 		 */
883 		time_delta = XEN_NSEC_PER_TICK;
884 	} else
885 		time_delta = timer_req - now;
886 
887 	timer_new = xen_time + time_delta;
888 	if (HYPERVISOR_set_timer_op(timer_new) != 0)
889 		panic("can't set hypervisor timer?");
890 	intr_restore(flags);
891 }
892 
893 /*
894  * This function will enable timer interrupts.
895  */
896 static void
897 xen_psm_timer_enable(void)
898 {
899 	ec_unmask_irq(xen_clock_irq);
900 }
901 
902 /*
903  * This function will disable timer interrupts on the current cpu.
904  */
905 static void
906 xen_psm_timer_disable(void)
907 {
908 	(void) ec_block_irq(xen_clock_irq);
909 	/*
910 	 * If the clock irq is pending on this cpu then we need to
911 	 * clear the pending interrupt.
912 	 */
913 	ec_unpend_irq(xen_clock_irq);
914 }
915 
916 /*
917  *
918  * The following functions are in the platform specific file so that they
919  * can be different functions depending on whether we are running on
920  * bare metal or a hypervisor.
921  */
922 
923 /*
924  * Allocate a free vector for irq at ipl.
925  */
926 /* ARGSUSED */
927 uchar_t
928 apic_allocate_vector(int ipl, int irq, int pri)
929 {
930 	physdev_irq_t irq_op;
931 	uchar_t vector;
932 
933 	irq_op.irq = irq;
934 
935 	if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
936 		panic("Hypervisor alloc vector failed");
937 	vector = irq_op.vector;
938 	/*
939 	 * No need to worry about vector colliding with our reserved vectors
940 	 * e.g. T_FASTTRAP, xen can differentiate between hardware and software
941 	 * generated traps and handle them properly.
942 	 */
943 	apic_vector_to_irq[vector] = (uchar_t)irq;
944 	return (vector);
945 }
946 
947 /* Mark vector as not being used by any irq */
948 void
949 apic_free_vector(uchar_t vector)
950 {
951 	apic_vector_to_irq[vector] = APIC_RESV_IRQ;
952 }
953 
954 /*
955  * This function allocate "count" vector(s) for the given "dip/pri/type"
956  */
957 static int
958 apic_alloc_vectors(dev_info_t *dip, int inum, int count, int pri, int type,
959     int behavior)
960 {
961 	int	rcount, i;
962 	uchar_t	vector, cpu;
963 	int irqno;
964 	major_t	major;
965 	apic_irq_t	*irqptr;
966 
967 	/* only supports MSI at the moment, will add MSI-X support later */
968 	if (type != DDI_INTR_TYPE_MSI)
969 		return (0);
970 
971 	DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_vectors: dip=0x%p type=%d "
972 	    "inum=0x%x  pri=0x%x count=0x%x behavior=%d\n",
973 	    (void *)dip, type, inum, pri, count, behavior));
974 
975 	if (count > 1) {
976 		if (behavior == DDI_INTR_ALLOC_STRICT &&
977 		    (apic_multi_msi_enable == 0 || count > apic_multi_msi_max))
978 			return (0);
979 
980 		if (apic_multi_msi_enable == 0)
981 			count = 1;
982 		else if (count > apic_multi_msi_max)
983 			count = apic_multi_msi_max;
984 	}
985 
986 	/*
987 	 * XXPV - metal version takes all vectors avail at given pri.
988 	 * Why do that?  For now just allocate count vectors.
989 	 */
990 	rcount = count;
991 
992 	mutex_enter(&airq_mutex);
993 
994 	/*
995 	 * XXPV - currently the hypervisor does not support MSI at all.
996 	 * It doesn't return consecutive vectors.  This code is a first
997 	 * cut for the (future) time that MSI is supported.
998 	 */
999 	major = (dip != NULL) ? ddi_name_to_major(ddi_get_name(dip)) : 0;
1000 	for (i = 0; i < rcount; i++) {
1001 		if ((irqno = apic_allocate_irq(apic_first_avail_irq)) ==
1002 		    INVALID_IRQ) {
1003 			mutex_exit(&airq_mutex);
1004 			DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_vectors: "
1005 			    "apic_allocate_irq failed\n"));
1006 			return (i);
1007 		}
1008 		apic_max_device_irq = max(irqno, apic_max_device_irq);
1009 		apic_min_device_irq = min(irqno, apic_min_device_irq);
1010 		irqptr = apic_irq_table[irqno];
1011 		vector = apic_allocate_vector(pri, irqno, 0);
1012 		apic_vector_to_irq[vector] = (uchar_t)irqno;
1013 #ifdef	DEBUG
1014 		if (apic_vector_to_irq[vector] != APIC_RESV_IRQ)
1015 			DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_vectors: "
1016 			    "apic_vector_to_irq is not APIC_RESV_IRQ\n"));
1017 #endif
1018 
1019 		irqptr->airq_vector = vector;
1020 		irqptr->airq_ioapicindex = (uchar_t)inum;	/* start */
1021 		irqptr->airq_intin_no = (uchar_t)rcount;
1022 		irqptr->airq_ipl = pri;
1023 		irqptr->airq_origirq = (uchar_t)(inum + i);
1024 		irqptr->airq_share_id = 0;
1025 		irqptr->airq_mps_intr_index = MSI_INDEX;
1026 		irqptr->airq_dip = dip;
1027 		irqptr->airq_major = major;
1028 		if (i == 0) /* they all bound to the same cpu */
1029 			cpu = irqptr->airq_cpu = apic_bind_intr(dip, irqno,
1030 			    0xff, 0xff);
1031 		else
1032 			irqptr->airq_cpu = cpu;
1033 		DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_vectors: irq=0x%x "
1034 		    "dip=0x%p vector=0x%x origirq=0x%x pri=0x%x\n", irqno,
1035 		    (void *)irqptr->airq_dip, irqptr->airq_vector,
1036 		    irqptr->airq_origirq, pri));
1037 	}
1038 	mutex_exit(&airq_mutex);
1039 	return (rcount);
1040 }
1041 
1042 /*
1043  * The hypervisor doesn't permit access to local apics directly
1044  */
1045 /* ARGSUSED */
1046 uint32_t *
1047 mapin_apic(uint32_t addr, size_t len, int flags)
1048 {
1049 	/*
1050 	 * Return a pointer to a memory area to fake out the
1051 	 * probe code that wants to read apic registers.
1052 	 * The dummy values will end up being ignored by xen
1053 	 * later on when they are used anyway.
1054 	 */
1055 	xen_psm_dummy_apic[APIC_VERS_REG] = APIC_INTEGRATED_VERS;
1056 	return (xen_psm_dummy_apic);
1057 }
1058 
/*
 * Pretend to map an ioapic.  All arguments are ignored and a fixed,
 * non-NULL placeholder pointer is returned.
 */
/* ARGSUSED */
uint32_t *
mapin_ioapic(uint32_t addr, size_t len, int flags)
{
	/*
	 * The configure code that calls this only needs a non-null result.
	 * The i86xpv platform will not reference through the returned value.
	 */
	uint32_t *placeholder = (uint32_t *)0x1;

	return (placeholder);
}
1069 
1070 /* ARGSUSED */
1071 void
1072 mapout_apic(caddr_t addr, size_t len)
1073 {
1074 }
1075 
1076 /* ARGSUSED */
1077 void
1078 mapout_ioapic(caddr_t addr, size_t len)
1079 {
1080 }
1081 
1082 uint32_t
1083 ioapic_read(int apic_ix, uint32_t reg)
1084 {
1085 	physdev_apic_t apic;
1086 
1087 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1088 	apic.reg = reg;
1089 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic))
1090 		panic("read ioapic %d reg %d failed", apic_ix, reg);
1091 	return (apic.value);
1092 }
1093 
1094 void
1095 ioapic_write(int apic_ix, uint32_t reg, uint32_t value)
1096 {
1097 	physdev_apic_t apic;
1098 
1099 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1100 	apic.reg = reg;
1101 	apic.value = value;
1102 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
1103 		panic("write ioapic %d reg %d failed", apic_ix, reg);
1104 }
1105 
1106 /*
1107  * This function was added as part of x2APIC support in pcplusmp.
1108  */
1109 void
1110 ioapic_write_eoi(int apic_ix, uint32_t value)
1111 {
1112 	physdev_apic_t apic;
1113 
1114 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1115 	apic.reg = APIC_IO_EOI;
1116 	apic.value = value;
1117 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
1118 		panic("write ioapic reg : APIC_IO_EOI %d failed", apic_ix);
1119 }
1120 
/*
 * This function was added as part of x2APIC support in pcplusmp to resolve
 * an undefined symbol in xpv_psm.  It is an intentional no-op here.
 *
 * Declared with (void) so the definition is a real prototype instead of
 * an unspecified-argument declarator.
 */
void
x2apic_update_psm(void)
{
}
1129 
/*
 * This function was added as part of x2APIC support in pcplusmp to resolve
 * an undefined symbol in xpv_psm.  It is an intentional no-op here.
 *
 * Declared with (void) so the definition is a real prototype instead of
 * an unspecified-argument declarator.
 */
void
apic_ret(void)
{
}
1138 
1139 /*
1140  * Call rebind to do the actual programming.
1141  */
1142 int
1143 apic_setup_io_intr(void *p, int irq, boolean_t deferred)
1144 {
1145 	apic_irq_t *irqptr;
1146 	struct ioapic_reprogram_data *drep = NULL;
1147 	int rv, cpu;
1148 	cpuset_t cpus;
1149 
1150 	/*
1151 	 * Set cpu based on xen idea of online cpu's not apic tables.
1152 	 * Note that xen ignores/sets to it's own preferred value the
1153 	 * target cpu field when programming ioapic anyway.
1154 	 */
1155 	if ((cpu = xen_psm_bind_intr(irq)) == IRQ_UNBOUND) {
1156 		CPUSET_ZERO(cpus);
1157 		CPUSET_OR(cpus, xen_psm_cpus_online);
1158 	} else {
1159 		CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
1160 	}
1161 	apic_irq_table[irq]->airq_cpu = cpu;
1162 	if (deferred) {
1163 		drep = (struct ioapic_reprogram_data *)p;
1164 		ASSERT(drep != NULL);
1165 		irqptr = drep->irqp;
1166 	} else {
1167 		irqptr = (apic_irq_t *)p;
1168 	}
1169 	ASSERT(irqptr != NULL);
1170 	rv = apic_rebind(irqptr, cpu, drep);
1171 	if (rv) {
1172 		/* CPU is not up or interrupt is disabled. Fall back to 0 */
1173 		cpu = 0;
1174 		rv = apic_rebind(irqptr, cpu, drep);
1175 	}
1176 	/*
1177 	 * If rebind successful bind the irq to an event channel
1178 	 */
1179 	if (rv == 0) {
1180 		ec_setup_pirq(irq, irqptr->airq_ipl, &cpus);
1181 		CPUSET_FIND(cpus, cpu);
1182 		apic_irq_table[irq]->airq_temp_cpu = cpu & ~IRQ_USER_BOUND;
1183 	}
1184 	return (rv);
1185 }
1186 
1187 /*
1188  * Allocate a new vector for the given irq
1189  */
1190 /* ARGSUSED */
1191 uchar_t
1192 apic_modify_vector(uchar_t vector, int irq)
1193 {
1194 	return (apic_allocate_vector(0, irq, 0));
1195 }
1196 
1197 /*
1198  * The rest of the file is just generic psm module boilerplate
1199  */
1200 
1201 static struct psm_ops xen_psm_ops = {
1202 	xen_psm_probe,				/* psm_probe		*/
1203 
1204 	xen_psm_softinit,			/* psm_init		*/
1205 	xen_psm_picinit,			/* psm_picinit		*/
1206 	xen_psm_intr_enter,			/* psm_intr_enter	*/
1207 	xen_psm_intr_exit,			/* psm_intr_exit	*/
1208 	xen_psm_setspl,				/* psm_setspl		*/
1209 	xen_psm_addspl,				/* psm_addspl		*/
1210 	xen_psm_delspl,				/* psm_delspl		*/
1211 	xen_psm_disable_intr,			/* psm_disable_intr	*/
1212 	xen_psm_enable_intr,			/* psm_enable_intr	*/
1213 	(int (*)(int))NULL,			/* psm_softlvl_to_irq	*/
1214 	(void (*)(int))NULL,			/* psm_set_softintr	*/
1215 	(void (*)(processorid_t))NULL,		/* psm_set_idlecpu	*/
1216 	(void (*)(processorid_t))NULL,		/* psm_unset_idlecpu	*/
1217 
1218 	xen_psm_clkinit,			/* psm_clkinit		*/
1219 	xen_psm_get_clockirq,			/* psm_get_clockirq	*/
1220 	xen_psm_hrtimeinit,			/* psm_hrtimeinit	*/
1221 	xpv_gethrtime,				/* psm_gethrtime	*/
1222 
1223 	xen_psm_get_next_processorid,		/* psm_get_next_processorid */
1224 	xen_psm_cpu_start,			/* psm_cpu_start	*/
1225 	xen_psm_post_cpu_start,			/* psm_post_cpu_start	*/
1226 	xen_psm_shutdown,			/* psm_shutdown		*/
1227 	xen_psm_get_ipivect,			/* psm_get_ipivect	*/
1228 	xen_psm_send_ipi,			/* psm_send_ipi		*/
1229 
1230 	xen_psm_translate_irq,			/* psm_translate_irq	*/
1231 
1232 	(void (*)(int, char *))NULL,		/* psm_notify_error	*/
1233 	(void (*)(int msg))NULL,		/* psm_notify_func	*/
1234 	xen_psm_timer_reprogram,		/* psm_timer_reprogram	*/
1235 	xen_psm_timer_enable,			/* psm_timer_enable	*/
1236 	xen_psm_timer_disable,			/* psm_timer_disable	*/
1237 	(void (*)(void *arg))NULL,		/* psm_post_cyclic_setup */
1238 	(void (*)(int, int))NULL,		/* psm_preshutdown	*/
1239 	xen_intr_ops,			/* Advanced DDI Interrupt framework */
1240 	(int (*)(psm_state_request_t *))NULL	/* psm_state		*/
1241 };
1242 
1243 static struct psm_info xen_psm_info = {
1244 	PSM_INFO_VER01_5,	/* version				*/
1245 	PSM_OWN_EXCLUSIVE,	/* ownership				*/
1246 	&xen_psm_ops,		/* operation				*/
1247 	"xVM_psm",		/* machine name				*/
1248 	"platform module"	/* machine descriptions			*/
1249 };
1250 
1251 static void *xen_psm_hdlp;
1252 
1253 int
1254 _init(void)
1255 {
1256 	return (psm_mod_init(&xen_psm_hdlp, &xen_psm_info));
1257 }
1258 
1259 int
1260 _fini(void)
1261 {
1262 	return (psm_mod_fini(&xen_psm_hdlp, &xen_psm_info));
1263 }
1264 
1265 int
1266 _info(struct modinfo *modinfop)
1267 {
1268 	return (psm_mod_info(&xen_psm_hdlp, &xen_psm_info, modinfop));
1269 }
1270