xref: /illumos-gate/usr/src/uts/i86xpv/io/psm/xpv_psm.c (revision a6d4d7d5d0e34964282f736f7bade0574645f1fd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #define	PSMI_1_5
30 
31 #include <sys/mutex.h>
32 #include <sys/types.h>
33 #include <sys/time.h>
34 #include <sys/clock.h>
35 #include <sys/machlock.h>
36 #include <sys/smp_impldefs.h>
37 #include <sys/uadmin.h>
38 #include <sys/promif.h>
39 #include <sys/psm.h>
40 #include <sys/psm_common.h>
41 #include <sys/atomic.h>
42 #include <sys/apic.h>
43 #include <sys/archsystm.h>
44 #include <sys/mach_intr.h>
45 #include <sys/hypervisor.h>
46 #include <sys/evtchn_impl.h>
47 #include <sys/modctl.h>
48 #include <sys/trap.h>
49 #include <sys/panic.h>
50 
51 #include <xen/public/vcpu.h>
52 #include <xen/public/physdev.h>
53 
54 
55 /*
56  * Global Data
57  */
58 
/* Diagnostic bitmask; tested against the XEN_PSM_VERBOSE_*_FLAG values. */
int xen_psm_verbose = 0;

/* As of now we don't support x2apic in xVM */
volatile uint32_t *apicadr = NULL;	/* dummy, so common code will link */
int apic_error = 0;
int apic_verbose = 0;
cpuset_t apic_cpumask;
int apic_forceload = 0;
/* NOTE(review): presumably one IPL per group of APIC_VECTOR_PER_IPL vectors */
uchar_t apic_vectortoipl[APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL] = {
	3, 4, 5, 5, 6, 6, 9, 10, 11, 12, 13, 14, 15, 15
};
uchar_t apic_ipltopri[MAXIPL + 1];
uchar_t apic_ipls[APIC_AVAIL_VECTOR];
uint_t apic_picinit_called;	/* set to 1 by xen_psm_picinit() in dom0 */
apic_cpus_info_t *apic_cpus;
/* policy consulted by xen_psm_bind_intr() when choosing a vcpu */
int xen_psm_intr_policy = INTR_ROUND_ROBIN_WITH_AFFINITY;
/* used to make sure only one cpu handles the nmi */
static lock_t xen_psm_nmi_lock;
int xen_psm_kmdb_on_nmi = 0;		/* 0 - no, 1 - yes enter kmdb */
int xen_psm_panic_on_nmi = 0;		/* non-zero: panic if kmdb not entered */
int xen_psm_num_nmis = 0;		/* bumped on every NMI handler entry */

cpuset_t xen_psm_cpus_online;	/* online cpus */
int xen_psm_ncpus = 1;		/* cpu count */
int xen_psm_next_bind_cpu;	/* next cpu to bind an interrupt to */

/*
 * XXPV we flag MSI as not supported, since the hypervisor currently doesn't
 * support MSI at all.  Change this initialization to zero when MSI is
 * supported.
 *
 * As used by xen_intr_ops(): 0 means "not probed yet", 1 means MSI is
 * supported and -1 means MSI is not supported.
 */
int xen_support_msi = -1;

/* irq returned for VIRQ_TIMER on vcpu 0; INVALID_IRQ until first requested */
static int xen_clock_irq = INVALID_IRQ;
93 
/* flag definitions for xen_psm_verbose */
#define	XEN_PSM_VERBOSE_IRQ_FLAG		0x00000001
#define	XEN_PSM_VERBOSE_POWEROFF_FLAG		0x00000002
#define	XEN_PSM_VERBOSE_POWEROFF_PAUSE_FLAG	0x00000004

/*
 * Conditional diagnostics.  `fmt' is a complete, parenthesized argument
 * list, e.g. XEN_PSM_VERBOSE_IRQ((CE_CONT, "irq %d\n", irq));
 *
 * Each macro expands to a single statement (do { } while (0)) so that it
 * can be used safely as the body of an unbraced if/else; the invocation
 * must be terminated with a semicolon by the caller.
 */
#define	XEN_PSM_VERBOSE_IRQ(fmt) \
	do { \
		if (xen_psm_verbose & XEN_PSM_VERBOSE_IRQ_FLAG) \
			cmn_err fmt; \
	} while (0)

#define	XEN_PSM_VERBOSE_POWEROFF(fmt) \
	do { \
		if (xen_psm_verbose & XEN_PSM_VERBOSE_POWEROFF_FLAG) \
			prom_printf fmt; \
	} while (0)
106 
107 /*
108  * Dummy apic array to point common routines at that want to do some apic
109  * manipulation.  Xen doesn't allow guest apic access so we point at these
110  * memory locations to fake out those who want to do apic fiddling.
111  */
112 uint32_t xen_psm_dummy_apic[APIC_IRR_REG + 1];
113 
114 static struct psm_info xen_psm_info;
115 static void xen_psm_setspl(int);
116 
117 static int apic_alloc_vectors(dev_info_t *, int, int, int, int, int);
118 
119 /*
120  * Local support routines
121  */
122 
123 /*
124  * Select vcpu to bind xen virtual device interrupt to.
125  */
126 /*ARGSUSED*/
127 int
128 xen_psm_bind_intr(int irq)
129 {
130 	int bind_cpu, test_cpu;
131 	apic_irq_t *irqptr;
132 
133 	if (xen_psm_intr_policy == INTR_LOWEST_PRIORITY)
134 		return (IRQ_UNBOUND);
135 	if (irq <= APIC_MAX_VECTOR)
136 		irqptr = apic_irq_table[irq];
137 	else
138 		irqptr = NULL;
139 	if (irqptr && (irqptr->airq_cpu & IRQ_USER_BOUND)) {
140 		bind_cpu = irqptr->airq_cpu;
141 		test_cpu = bind_cpu & ~IRQ_USER_BOUND;
142 		if (!CPU_IN_SET(xen_psm_cpus_online, test_cpu))
143 			bind_cpu = 0;
144 		goto done;
145 	}
146 	if (xen_psm_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
147 		do {
148 			bind_cpu = xen_psm_next_bind_cpu++;
149 			if (xen_psm_next_bind_cpu >= xen_psm_ncpus)
150 				xen_psm_next_bind_cpu = 0;
151 		} while (!CPU_IN_SET(xen_psm_cpus_online, bind_cpu));
152 	} else {
153 		bind_cpu = 0;
154 	}
155 done:
156 	return (bind_cpu);
157 }
158 
159 /*
160  * Autoconfiguration Routines
161  */
162 
163 static int
164 xen_psm_probe(void)
165 {
166 	int ret = PSM_SUCCESS;
167 
168 	if (DOMAIN_IS_INITDOMAIN(xen_info))
169 		ret = apic_probe_common(xen_psm_info.p_mach_idstring);
170 	return (ret);
171 }
172 
173 static void
174 xen_psm_softinit(void)
175 {
176 	/* LINTED logical expression always true: op "||" */
177 	ASSERT((1 << EVTCHN_SHIFT) == NBBY * sizeof (ulong_t));
178 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, 0);
179 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
180 		apic_init_common();
181 	}
182 }
183 
184 #define	XEN_NSEC_PER_TICK	10 /* XXX - assume we have a 100 Mhz clock */
185 
186 /*ARGSUSED*/
187 static int
188 xen_psm_clkinit(int hertz)
189 {
190 	extern enum tod_fault_type tod_fault(enum tod_fault_type, int);
191 	extern int dosynctodr;
192 
193 	/*
194 	 * domU cannot set the TOD hardware, fault the TOD clock now to
195 	 * indicate that and turn off attempts to sync TOD hardware
196 	 * with the hires timer.
197 	 */
198 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
199 		mutex_enter(&tod_lock);
200 		(void) tod_fault(TOD_RDONLY, 0);
201 		dosynctodr = 0;
202 		mutex_exit(&tod_lock);
203 	}
204 	/*
205 	 * The hypervisor provides a timer based on the local APIC timer.
206 	 * The interface supports requests of nanosecond resolution.
207 	 * A common frequency of the apic clock is 100 Mhz which
208 	 * gives a resolution of 10 nsec per tick.  What we would really like
209 	 * is a way to get the ns per tick value from xen.
210 	 * XXPV - This is an assumption that needs checking and may change
211 	 */
212 	return (XEN_NSEC_PER_TICK);
213 }
214 
/*
 * High resolution time initialization: simply sets the kernel's
 * gethrtime_hires flag.
 */
static void
xen_psm_hrtimeinit(void)
{
	extern int gethrtime_hires;

	gethrtime_hires = 1;
}
221 
222 /* xen_psm NMI handler */
223 /*ARGSUSED*/
224 static void
225 xen_psm_nmi_intr(caddr_t arg, struct regs *rp)
226 {
227 	xen_psm_num_nmis++;
228 
229 	if (!lock_try(&xen_psm_nmi_lock))
230 		return;
231 
232 	if (xen_psm_kmdb_on_nmi && psm_debugger()) {
233 		debug_enter("NMI received: entering kmdb\n");
234 	} else if (xen_psm_panic_on_nmi) {
235 		/* Keep panic from entering kmdb. */
236 		nopanicdebug = 1;
237 		panic("NMI received\n");
238 	} else {
239 		/*
240 		 * prom_printf is the best shot we have of something which is
241 		 * problem free from high level/NMI type of interrupts
242 		 */
243 		prom_printf("NMI received\n");
244 	}
245 
246 	lock_clear(&xen_psm_nmi_lock);
247 }
248 
249 static void
250 xen_psm_picinit()
251 {
252 	int cpu, irqno;
253 	cpuset_t cpus;
254 
255 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
256 		/* set a flag so we know we have run xen_psm_picinit() */
257 		apic_picinit_called = 1;
258 		LOCK_INIT_CLEAR(&apic_ioapic_lock);
259 
260 		/* XXPV - do we need to do this? */
261 		picsetup();	 /* initialise the 8259 */
262 
263 		/* enable apic mode if imcr present */
264 		/* XXPV - do we need to do this either? */
265 		if (apic_imcrp) {
266 			outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT);
267 			outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_APIC);
268 		}
269 
270 		ioapic_init_intr(IOAPIC_NOMASK);
271 		/*
272 		 * We never called xen_psm_addspl() when the SCI
273 		 * interrupt was added because that happened before the
274 		 * PSM module was loaded.  Fix that up here by doing
275 		 * any missed operations (e.g. bind to CPU)
276 		 */
277 		if ((irqno = apic_sci_vect) > 0) {
278 			if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
279 				CPUSET_ZERO(cpus);
280 				CPUSET_OR(cpus, xen_psm_cpus_online);
281 			} else {
282 				CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
283 			}
284 			ec_set_irq_affinity(irqno, cpus);
285 			apic_irq_table[irqno]->airq_temp_cpu =
286 			    (uchar_t)(cpu & ~IRQ_USER_BOUND);
287 			ec_enable_irq(irqno);
288 		}
289 	}
290 
291 	/* add nmi handler - least priority nmi handler */
292 	LOCK_INIT_CLEAR(&xen_psm_nmi_lock);
293 
294 	if (!psm_add_nmintr(0, (avfunc) xen_psm_nmi_intr,
295 	    "xVM_psm NMI handler", (caddr_t)NULL))
296 		cmn_err(CE_WARN, "xVM_psm: Unable to add nmi handler");
297 }
298 
299 
300 /*
301  * generates an interprocessor interrupt to another CPU
302  */
303 static void
304 xen_psm_send_ipi(int cpun, int ipl)
305 {
306 	ulong_t flag = intr_clear();
307 
308 	ec_send_ipi(ipl, cpun);
309 	intr_restore(flag);
310 }
311 
312 /*ARGSUSED*/
313 static int
314 xen_psm_addspl(int irqno, int ipl, int min_ipl, int max_ipl)
315 {
316 	int cpu, ret;
317 	cpuset_t cpus;
318 
319 	/*
320 	 * We are called at splhi() so we can't call anything that might end
321 	 * up trying to context switch.
322 	 */
323 	if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
324 	    DOMAIN_IS_INITDOMAIN(xen_info)) {
325 		/*
326 		 * Priority/affinity/enable for PIRQ's is set in ec_setup_pirq()
327 		 */
328 		ret = apic_addspl_common(irqno, ipl, min_ipl, max_ipl);
329 	} else {
330 		/*
331 		 * Set priority/affinity/enable for non PIRQs
332 		 */
333 		ret = ec_set_irq_priority(irqno, ipl);
334 		ASSERT(ret == 0);
335 		if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
336 			CPUSET_ZERO(cpus);
337 			CPUSET_OR(cpus, xen_psm_cpus_online);
338 		} else {
339 			CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
340 		}
341 		ec_set_irq_affinity(irqno, cpus);
342 		ec_enable_irq(irqno);
343 	}
344 	return (ret);
345 }
346 
347 /*
348  * Acquire ownership of this irq on this cpu
349  */
350 void
351 xen_psm_acquire_irq(int irq)
352 {
353 	ulong_t flags;
354 	int cpuid;
355 
356 	/*
357 	 * If the irq is currently being serviced by another cpu
358 	 * we busy-wait for the other cpu to finish.  Take any
359 	 * pending interrupts before retrying.
360 	 */
361 	do {
362 		flags = intr_clear();
363 		cpuid = ec_block_irq(irq);
364 		intr_restore(flags);
365 	} while (cpuid != CPU->cpu_id);
366 }
367 
368 /*ARGSUSED*/
369 static int
370 xen_psm_delspl(int irqno, int ipl, int min_ipl, int max_ipl)
371 {
372 	apic_irq_t *irqptr;
373 	int err = PSM_SUCCESS;
374 
375 	if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
376 	    DOMAIN_IS_INITDOMAIN(xen_info)) {
377 		irqptr = apic_irq_table[irqno];
378 		/*
379 		 * unbind if no more sharers of this irq/evtchn
380 		 */
381 		if (irqptr->airq_share == 1) {
382 			xen_psm_acquire_irq(irqno);
383 			ec_unbind_irq(irqno);
384 		}
385 		err = apic_delspl_common(irqno, ipl, min_ipl, max_ipl);
386 		/*
387 		 * If still in use reset priority
388 		 */
389 		if (!err && irqptr->airq_share != 0) {
390 			err = ec_set_irq_priority(irqno, max_ipl);
391 			return (err);
392 		}
393 	} else {
394 		xen_psm_acquire_irq(irqno);
395 		ec_unbind_irq(irqno);
396 	}
397 	return (err);
398 }
399 
400 static processorid_t
401 xen_psm_get_next_processorid(processorid_t id)
402 {
403 	if (id == -1)
404 		return (0);
405 
406 	for (id++; id < NCPU; id++) {
407 		switch (-HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL)) {
408 		case 0:		/* yeah, that one's there */
409 			return (id);
410 		default:
411 		case X_EINVAL:	/* out of range */
412 			return (-1);
413 		case X_ENOENT:	/* not present in the domain */
414 			/*
415 			 * It's not clear that we -need- to keep looking
416 			 * at this point, if, e.g., we can guarantee
417 			 * the hypervisor always keeps a contiguous range
418 			 * of vcpus around this is equivalent to "out of range".
419 			 *
420 			 * But it would be sad to miss a vcpu we're
421 			 * supposed to be using ..
422 			 */
423 			break;
424 		}
425 	}
426 
427 	return (-1);
428 }
429 
430 /*
431  * XXPV - undo the start cpu op change; return to ignoring this value
432  *	- also tweak error handling in main startup loop
433  */
434 /*ARGSUSED*/
435 static int
436 xen_psm_cpu_start(processorid_t id, caddr_t arg)
437 {
438 	int ret;
439 
440 	ASSERT(id > 0);
441 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, id);
442 	ec_bind_cpu_ipis(id);
443 	(void) ec_bind_virq_to_irq(VIRQ_TIMER, id);
444 	if ((ret = xen_vcpu_up(id)) == 0)
445 		xen_psm_ncpus++;
446 	else
447 		ret = EINVAL;
448 	return (ret);
449 }
450 
/*
 * Allocate an irq for inter cpu signaling at priority `ipl'.  The irq is
 * obtained by binding the IPI on cpu 0; `type' is not used.
 */
/*ARGSUSED*/
static int
xen_psm_get_ipivect(int ipl, int type)
{
	int ipi_irq;

	ipi_irq = ec_bind_ipi_to_irq(ipl, 0);
	return (ipi_irq);
}
460 
461 /*ARGSUSED*/
462 static int
463 xen_psm_get_clockirq(int ipl)
464 {
465 	if (xen_clock_irq != INVALID_IRQ)
466 		return (xen_clock_irq);
467 
468 	xen_clock_irq = ec_bind_virq_to_irq(VIRQ_TIMER, 0);
469 	return (xen_clock_irq);
470 }
471 
472 /*ARGSUSED*/
473 static void
474 xen_psm_shutdown(int cmd, int fcn)
475 {
476 	XEN_PSM_VERBOSE_POWEROFF(("xen_psm_shutdown(%d,%d);\n", cmd, fcn));
477 
478 	switch (cmd) {
479 	case A_SHUTDOWN:
480 		switch (fcn) {
481 		case AD_BOOT:
482 		case AD_IBOOT:
483 			(void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
484 			break;
485 		case AD_POWEROFF:
486 			/* fall through if domU or if poweroff fails */
487 			if (DOMAIN_IS_INITDOMAIN(xen_info))
488 				if (apic_enable_acpi)
489 					(void) acpi_poweroff();
490 			/* FALLTHRU */
491 		case AD_HALT:
492 		default:
493 			(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
494 			break;
495 		}
496 		break;
497 	case A_REBOOT:
498 		(void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
499 		break;
500 	default:
501 		return;
502 	}
503 }
504 
505 
506 static int
507 xen_psm_translate_irq(dev_info_t *dip, int irqno)
508 {
509 	if (dip == NULL) {
510 		XEN_PSM_VERBOSE_IRQ((CE_CONT, "!xen_psm: irqno = %d"
511 		    " dip = NULL\n", irqno));
512 		return (irqno);
513 	}
514 	return (irqno);
515 }
516 
517 /*
518  * xen_psm_intr_enter() acks the event that triggered the interrupt and
519  * returns the new priority level,
520  */
521 /*ARGSUSED*/
522 static int
523 xen_psm_intr_enter(int ipl, int *vector)
524 {
525 	int newipl;
526 	uint_t intno;
527 	cpu_t *cpu = CPU;
528 
529 	intno = (*vector);
530 
531 	ASSERT(intno < NR_IRQS);
532 	ASSERT(cpu->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask != 0);
533 
534 	ec_clear_irq(intno);
535 
536 	newipl = autovect[intno].avh_hi_pri;
537 	if (newipl == 0) {
538 		/*
539 		 * (newipl == 0) means we have no service routines for this
540 		 * vector.  We will treat this as a spurious interrupt.
541 		 * We have cleared the pending bit already, clear the event
542 		 * mask and return a spurious interrupt.  This case can happen
543 		 * when an interrupt delivery is racing with the removal of
544 		 * of the service routine for that interrupt.
545 		 */
546 		ec_unmask_irq(intno);
547 		newipl = -1;	/* flag spurious interrupt */
548 	} else if (newipl <= cpu->cpu_pri) {
549 		/*
550 		 * (newipl <= cpu->cpu_pri) means that we must be trying to
551 		 * service a vector that was shared with a higher priority
552 		 * isr.  The higher priority handler has been removed and
553 		 * we need to service this int.  We can't return a lower
554 		 * priority than current cpu priority.  Just synthesize a
555 		 * priority to return that should be acceptable.
556 		 */
557 		newipl = cpu->cpu_pri + 1;	/* synthetic priority */
558 	}
559 	return (newipl);
560 }
561 
562 
/*
 * xen_psm_intr_exit() is run after an interrupt has been processed: it
 * attempts to unmask the irq and then restores the old interrupt
 * priority level `ipl'.
 * It is called with interrupts disabled, and does not enable interrupts.
 */
/* ARGSUSED */
static void
xen_psm_intr_exit(int ipl, int vector)
{
	/* unmask first, then let setspl look for newly deliverable events */
	ec_try_unmask_irq(vector);
	xen_psm_setspl(ipl);
}
575 
576 intr_exit_fn_t
577 psm_intr_exit_fn(void)
578 {
579 	return (xen_psm_intr_exit);
580 }
581 
582 /*
583  * Check if new ipl level allows delivery of previously unserviced events
584  */
585 static void
586 xen_psm_setspl(int ipl)
587 {
588 	struct cpu *cpu = CPU;
589 	volatile vcpu_info_t *vci = cpu->cpu_m.mcpu_vcpu_info;
590 	uint16_t pending;
591 
592 	ASSERT(vci->evtchn_upcall_mask != 0);
593 
594 	/*
595 	 * If new ipl level will enable any pending interrupts, setup so the
596 	 * upcoming sti will cause us to get an upcall.
597 	 */
598 	pending = cpu->cpu_m.mcpu_intr_pending & ~((1 << (ipl + 1)) - 1);
599 	if (pending) {
600 		int i;
601 		ulong_t pending_sels = 0;
602 		volatile ulong_t *selp;
603 		struct xen_evt_data *cpe = cpu->cpu_m.mcpu_evt_pend;
604 
605 		for (i = bsrw_insn(pending); i > ipl; i--)
606 			pending_sels |= cpe->pending_sel[i];
607 		ASSERT(pending_sels);
608 		selp = (volatile ulong_t *)&vci->evtchn_pending_sel;
609 		atomic_or_ulong(selp, pending_sels);
610 		vci->evtchn_upcall_pending = 1;
611 	}
612 }
613 
614 /*
615  * This function provides external interface to the nexus for all
616  * functionality related to the new DDI interrupt framework.
617  *
618  * Input:
619  * dip     - pointer to the dev_info structure of the requested device
620  * hdlp    - pointer to the internal interrupt handle structure for the
621  *	     requested interrupt
622  * intr_op - opcode for this call
623  * result  - pointer to the integer that will hold the result to be
624  *	     passed back if return value is PSM_SUCCESS
625  *
626  * Output:
627  * return value is either PSM_SUCCESS or PSM_FAILURE
628  */
629 int
630 xen_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
631     psm_intr_op_t intr_op, int *result)
632 {
633 	int		cap;
634 	int		err;
635 	int		new_priority;
636 	apic_irq_t	*irqp;
637 	struct intrspec *ispec;
638 
639 	DDI_INTR_IMPLDBG((CE_CONT, "xen_intr_ops: dip: %p hdlp: %p "
640 	    "intr_op: %x\n", (void *)dip, (void *)hdlp, intr_op));
641 
642 	switch (intr_op) {
643 	case PSM_INTR_OP_CHECK_MSI:
644 		if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
645 			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
646 			    DDI_INTR_TYPE_MSIX);
647 			break;
648 		}
649 		/*
650 		 * Check MSI/X is supported or not at APIC level and
651 		 * masked off the MSI/X bits in hdlp->ih_type if not
652 		 * supported before return.  If MSI/X is supported,
653 		 * leave the ih_type unchanged and return.
654 		 *
655 		 * hdlp->ih_type passed in from the nexus has all the
656 		 * interrupt types supported by the device.
657 		 */
658 		if (xen_support_msi == 0) {
659 			/*
660 			 * if xen_support_msi is not set, call
661 			 * apic_check_msi_support() to check whether msi
662 			 * is supported first
663 			 */
664 			if (apic_check_msi_support() == PSM_SUCCESS)
665 				xen_support_msi = 1;
666 			else
667 				xen_support_msi = -1;
668 		}
669 		if (xen_support_msi == 1)
670 			*result = hdlp->ih_type;
671 		else
672 			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
673 			    DDI_INTR_TYPE_MSIX);
674 		break;
675 	case PSM_INTR_OP_ALLOC_VECTORS:
676 		*result = apic_alloc_vectors(dip, hdlp->ih_inum,
677 		    hdlp->ih_scratch1, hdlp->ih_pri, hdlp->ih_type,
678 		    (int)(uintptr_t)hdlp->ih_scratch2);
679 		break;
680 	case PSM_INTR_OP_FREE_VECTORS:
681 		apic_free_vectors(dip, hdlp->ih_inum, hdlp->ih_scratch1,
682 		    hdlp->ih_pri, hdlp->ih_type);
683 		break;
684 	case PSM_INTR_OP_NAVAIL_VECTORS:
685 		/*
686 		 * XXPV - maybe we should make this be:
687 		 * min(APIC_VECTOR_PER_IPL, count of all avail vectors);
688 		 */
689 		if (DOMAIN_IS_INITDOMAIN(xen_info))
690 			*result = APIC_VECTOR_PER_IPL;
691 		else
692 			*result = 1;
693 		break;
694 	case PSM_INTR_OP_XLATE_VECTOR:
695 		ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
696 		if (ispec->intrspec_vec >= PIRQ_BASE &&
697 		    ispec->intrspec_vec < NR_PIRQS &&
698 		    DOMAIN_IS_INITDOMAIN(xen_info)) {
699 			*result = apic_introp_xlate(dip, ispec, hdlp->ih_type);
700 		} else {
701 			*result = ispec->intrspec_vec;
702 		}
703 		break;
704 	case PSM_INTR_OP_GET_PENDING:
705 		/* XXPV - is this enough for dom0 or do we need to ref ioapic */
706 		*result = ec_pending_irq(hdlp->ih_vector);
707 		break;
708 	case PSM_INTR_OP_CLEAR_MASK:
709 		/* XXPV - is this enough for dom0 or do we need to set ioapic */
710 		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
711 			return (PSM_FAILURE);
712 		ec_enable_irq(hdlp->ih_vector);
713 		break;
714 	case PSM_INTR_OP_SET_MASK:
715 		/* XXPV - is this enough for dom0 or do we need to set ioapic */
716 		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
717 			return (PSM_FAILURE);
718 		ec_disable_irq(hdlp->ih_vector);
719 		break;
720 	case PSM_INTR_OP_GET_CAP:
721 		cap = DDI_INTR_FLAG_PENDING | DDI_INTR_FLAG_EDGE;
722 		if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
723 			cap |= DDI_INTR_FLAG_MASKABLE;
724 		*result = cap;
725 		break;
726 	case PSM_INTR_OP_GET_SHARED:
727 		if (DOMAIN_IS_INITDOMAIN(xen_info)) {
728 			if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
729 				return (PSM_FAILURE);
730 			if ((irqp = apic_find_irq(dip, ispec, hdlp->ih_type))
731 			    == NULL)
732 				return (PSM_FAILURE);
733 			*result = irqp->airq_share ? 1: 0;
734 		} else {
735 			return (PSM_FAILURE);
736 		}
737 		break;
738 	case PSM_INTR_OP_SET_PRI:
739 		new_priority = *(int *)result;
740 		err = ec_set_irq_priority(hdlp->ih_vector, new_priority);
741 		if (err != 0)
742 			return (PSM_FAILURE);
743 		break;
744 	case PSM_INTR_OP_GET_INTR:
745 		if (!DOMAIN_IS_INITDOMAIN(xen_info))
746 			return (PSM_FAILURE);
747 		/*
748 		 * The interrupt handle given here has been allocated
749 		 * specifically for this command, and ih_private carries
750 		 * a pointer to a apic_get_intr_t.
751 		 */
752 		if (apic_get_vector_intr_info(
753 		    hdlp->ih_vector, hdlp->ih_private) != PSM_SUCCESS)
754 			return (PSM_FAILURE);
755 		break;
756 	case PSM_INTR_OP_SET_CAP:
757 		/* FALLTHRU */
758 	default:
759 		return (PSM_FAILURE);
760 	}
761 	return (PSM_SUCCESS);
762 }
763 
764 static void
765 xen_psm_rebind_irq(int irq)
766 {
767 	cpuset_t ncpu;
768 	processorid_t newcpu;
769 	apic_irq_t *irqptr;
770 
771 	newcpu = xen_psm_bind_intr(irq);
772 	if (newcpu == IRQ_UNBOUND) {
773 		CPUSET_ZERO(ncpu);
774 		CPUSET_OR(ncpu, xen_psm_cpus_online);
775 	} else {
776 		CPUSET_ONLY(ncpu, newcpu & ~IRQ_USER_BOUND);
777 	}
778 	ec_set_irq_affinity(irq, ncpu);
779 	if (irq <= APIC_MAX_VECTOR) {
780 		irqptr = apic_irq_table[irq];
781 		ASSERT(irqptr != NULL);
782 		irqptr->airq_temp_cpu = (uchar_t)newcpu;
783 	}
784 }
785 
786 /*
787  * Disable all device interrupts for the given cpu.
788  * High priority interrupts are not disabled and will still be serviced.
789  */
790 static int
791 xen_psm_disable_intr(processorid_t cpun)
792 {
793 	int irq;
794 
795 	/*
796 	 * Can't offline VCPU 0 on this hypervisor.  There's no reason
797 	 * anyone would want to given that the CPUs are virtual. Also note
798 	 * that the hypervisor requires suspend/resume to be on VCPU 0.
799 	 */
800 	if (cpun == 0)
801 		return (PSM_FAILURE);
802 
803 	CPUSET_ATOMIC_DEL(xen_psm_cpus_online, cpun);
804 	for (irq = 0; irq < NR_IRQS; irq++) {
805 		if (!ec_irq_needs_rebind(irq, cpun))
806 			continue;
807 		xen_psm_rebind_irq(irq);
808 	}
809 	return (PSM_SUCCESS);
810 }
811 
812 static void
813 xen_psm_enable_intr(processorid_t cpun)
814 {
815 	int irq;
816 
817 	if (cpun == 0)
818 		return;
819 
820 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, cpun);
821 
822 	/*
823 	 * Rebalance device interrupts among online processors
824 	 */
825 	for (irq = 0; irq < NR_IRQS; irq++) {
826 		if (!ec_irq_rebindable(irq))
827 			continue;
828 		xen_psm_rebind_irq(irq);
829 	}
830 
831 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
832 		apic_cpus[cpun].aci_status |= APIC_CPU_INTR_ENABLE;
833 	}
834 }
835 
836 static int
837 xen_psm_post_cpu_start()
838 {
839 	processorid_t cpun;
840 
841 	cpun = psm_get_cpu_id();
842 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
843 		/*
844 		 * Non-virtualized environments can call psm_post_cpu_start
845 		 * from Suspend/Resume with the APIC_CPU_INTR_ENABLE bit set.
846 		 * xen_psm_post_cpu_start() is only called from boot.
847 		 */
848 		apic_cpus[cpun].aci_status |= APIC_CPU_ONLINE;
849 	}
850 	return (PSM_SUCCESS);
851 }
852 
853 /*
854  * This function will reprogram the timer.
855  *
856  * When in oneshot mode the argument is the absolute time in future at which to
857  * generate the interrupt.
858  *
859  * When in periodic mode, the argument is the interval at which the
860  * interrupts should be generated. There is no need to support the periodic
861  * mode timer change at this time.
862  *
863  * Note that we must be careful to convert from hrtime to Xen system time (see
864  * xpv_timestamp.c).
865  */
866 static void
867 xen_psm_timer_reprogram(hrtime_t timer_req)
868 {
869 	hrtime_t now, timer_new, time_delta, xen_time;
870 	ulong_t flags;
871 
872 	flags = intr_clear();
873 	/*
874 	 * We should be called from high PIL context (CBE_HIGH_PIL),
875 	 * so kpreempt is disabled.
876 	 */
877 
878 	now = xpv_gethrtime();
879 	xen_time = xpv_getsystime();
880 	if (timer_req <= now) {
881 		/*
882 		 * requested to generate an interrupt in the past
883 		 * generate an interrupt as soon as possible
884 		 */
885 		time_delta = XEN_NSEC_PER_TICK;
886 	} else
887 		time_delta = timer_req - now;
888 
889 	timer_new = xen_time + time_delta;
890 	if (HYPERVISOR_set_timer_op(timer_new) != 0)
891 		panic("can't set hypervisor timer?");
892 	intr_restore(flags);
893 }
894 
895 /*
896  * This function will enable timer interrupts.
897  */
898 static void
899 xen_psm_timer_enable(void)
900 {
901 	ec_unmask_irq(xen_clock_irq);
902 }
903 
904 /*
905  * This function will disable timer interrupts on the current cpu.
906  */
907 static void
908 xen_psm_timer_disable(void)
909 {
910 	(void) ec_block_irq(xen_clock_irq);
911 	/*
912 	 * If the clock irq is pending on this cpu then we need to
913 	 * clear the pending interrupt.
914 	 */
915 	ec_unpend_irq(xen_clock_irq);
916 }
917 
918 /*
919  *
920  * The following functions are in the platform specific file so that they
921  * can be different functions depending on whether we are running on
922  * bare metal or a hypervisor.
923  */
924 
925 /*
926  * Allocate a free vector for irq at ipl.
927  */
928 /* ARGSUSED */
929 uchar_t
930 apic_allocate_vector(int ipl, int irq, int pri)
931 {
932 	physdev_irq_t irq_op;
933 	uchar_t vector;
934 
935 	irq_op.irq = irq;
936 
937 	if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
938 		panic("Hypervisor alloc vector failed");
939 	vector = irq_op.vector;
940 	/*
941 	 * No need to worry about vector colliding with our reserved vectors
942 	 * e.g. T_FASTTRAP, xen can differentiate between hardware and software
943 	 * generated traps and handle them properly.
944 	 */
945 	apic_vector_to_irq[vector] = (uchar_t)irq;
946 	return (vector);
947 }
948 
949 /* Mark vector as not being used by any irq */
950 void
951 apic_free_vector(uchar_t vector)
952 {
953 	apic_vector_to_irq[vector] = APIC_RESV_IRQ;
954 }
955 
956 /*
957  * This function allocate "count" vector(s) for the given "dip/pri/type"
958  */
959 static int
960 apic_alloc_vectors(dev_info_t *dip, int inum, int count, int pri, int type,
961     int behavior)
962 {
963 	int	rcount, i;
964 	uchar_t	vector, cpu;
965 	int irqno;
966 	major_t	major;
967 	apic_irq_t	*irqptr;
968 
969 	/* only supports MSI at the moment, will add MSI-X support later */
970 	if (type != DDI_INTR_TYPE_MSI)
971 		return (0);
972 
973 	DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_vectors: dip=0x%p type=%d "
974 	    "inum=0x%x  pri=0x%x count=0x%x behavior=%d\n",
975 	    (void *)dip, type, inum, pri, count, behavior));
976 
977 	if (count > 1) {
978 		if (behavior == DDI_INTR_ALLOC_STRICT &&
979 		    (apic_multi_msi_enable == 0 || count > apic_multi_msi_max))
980 			return (0);
981 
982 		if (apic_multi_msi_enable == 0)
983 			count = 1;
984 		else if (count > apic_multi_msi_max)
985 			count = apic_multi_msi_max;
986 	}
987 
988 	/*
989 	 * XXPV - metal version takes all vectors avail at given pri.
990 	 * Why do that?  For now just allocate count vectors.
991 	 */
992 	rcount = count;
993 
994 	mutex_enter(&airq_mutex);
995 
996 	/*
997 	 * XXPV - currently the hypervisor does not support MSI at all.
998 	 * It doesn't return consecutive vectors.  This code is a first
999 	 * cut for the (future) time that MSI is supported.
1000 	 */
1001 	major = (dip != NULL) ? ddi_name_to_major(ddi_get_name(dip)) : 0;
1002 	for (i = 0; i < rcount; i++) {
1003 		if ((irqno = apic_allocate_irq(apic_first_avail_irq)) ==
1004 		    INVALID_IRQ) {
1005 			mutex_exit(&airq_mutex);
1006 			DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_vectors: "
1007 			    "apic_allocate_irq failed\n"));
1008 			return (i);
1009 		}
1010 		apic_max_device_irq = max(irqno, apic_max_device_irq);
1011 		apic_min_device_irq = min(irqno, apic_min_device_irq);
1012 		irqptr = apic_irq_table[irqno];
1013 		vector = apic_allocate_vector(pri, irqno, 0);
1014 		apic_vector_to_irq[vector] = (uchar_t)irqno;
1015 #ifdef	DEBUG
1016 		if (apic_vector_to_irq[vector] != APIC_RESV_IRQ)
1017 			DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_vectors: "
1018 			    "apic_vector_to_irq is not APIC_RESV_IRQ\n"));
1019 #endif
1020 
1021 		irqptr->airq_vector = vector;
1022 		irqptr->airq_ioapicindex = (uchar_t)inum;	/* start */
1023 		irqptr->airq_intin_no = (uchar_t)rcount;
1024 		irqptr->airq_ipl = pri;
1025 		irqptr->airq_origirq = (uchar_t)(inum + i);
1026 		irqptr->airq_share_id = 0;
1027 		irqptr->airq_mps_intr_index = MSI_INDEX;
1028 		irqptr->airq_dip = dip;
1029 		irqptr->airq_major = major;
1030 		if (i == 0) /* they all bound to the same cpu */
1031 			cpu = irqptr->airq_cpu = apic_bind_intr(dip, irqno,
1032 			    0xff, 0xff);
1033 		else
1034 			irqptr->airq_cpu = cpu;
1035 		DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_vectors: irq=0x%x "
1036 		    "dip=0x%p vector=0x%x origirq=0x%x pri=0x%x\n", irqno,
1037 		    (void *)irqptr->airq_dip, irqptr->airq_vector,
1038 		    irqptr->airq_origirq, pri));
1039 	}
1040 	mutex_exit(&airq_mutex);
1041 	return (rcount);
1042 }
1043 
1044 /*
1045  * The hypervisor doesn't permit access to local apics directly
1046  */
1047 /* ARGSUSED */
1048 uint32_t *
1049 mapin_apic(uint32_t addr, size_t len, int flags)
1050 {
1051 	/*
1052 	 * Return a pointer to a memory area to fake out the
1053 	 * probe code that wants to read apic registers.
1054 	 * The dummy values will end up being ignored by xen
1055 	 * later on when they are used anyway.
1056 	 */
1057 	xen_psm_dummy_apic[APIC_VERS_REG] = APIC_INTEGRATED_VERS;
1058 	return (xen_psm_dummy_apic);
1059 }
1060 
/*
 * Pretend to map an ioapic.  No mapping is made and none of the
 * arguments are used; a fixed non-NULL token is returned.
 */
/* ARGSUSED */
uint32_t *
mapin_ioapic(uint32_t addr, size_t len, int flags)
{
	/*
	 * Non-null so that the configure code calling this is satisfied.
	 * The i86xpv platform will not reference through the returned value..
	 */
	uint32_t *token = (uint32_t *)0x1;

	return (token);
}
1071 
1072 /* ARGSUSED */
1073 void
1074 mapout_apic(caddr_t addr, size_t len)
1075 {
1076 }
1077 
1078 /* ARGSUSED */
1079 void
1080 mapout_ioapic(caddr_t addr, size_t len)
1081 {
1082 }
1083 
1084 uint32_t
1085 ioapic_read(int apic_ix, uint32_t reg)
1086 {
1087 	physdev_apic_t apic;
1088 
1089 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1090 	apic.reg = reg;
1091 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic))
1092 		panic("read ioapic %d reg %d failed", apic_ix, reg);
1093 	return (apic.value);
1094 }
1095 
1096 void
1097 ioapic_write(int apic_ix, uint32_t reg, uint32_t value)
1098 {
1099 	physdev_apic_t apic;
1100 
1101 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1102 	apic.reg = reg;
1103 	apic.value = value;
1104 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
1105 		panic("write ioapic %d reg %d failed", apic_ix, reg);
1106 }
1107 
1108 /*
1109  * This function was added as part of x2APIC support in pcplusmp.
1110  */
1111 void
1112 ioapic_write_eoi(int apic_ix, uint32_t value)
1113 {
1114 	physdev_apic_t apic;
1115 
1116 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1117 	apic.reg = APIC_IO_EOI;
1118 	apic.value = value;
1119 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
1120 		panic("write ioapic reg : APIC_IO_EOI %d failed", apic_ix);
1121 }
1122 
/*
 * Empty stub.  x2APIC support in pcplusmp introduced a reference to this
 * symbol; defining it here keeps xpv_psm free of an undefined symbol.
 */
void
x2apic_update_psm()
{
	/* nothing to do */
}
1131 
/*
 * Empty stub.  x2APIC support in pcplusmp introduced a reference to this
 * symbol; defining it here keeps xpv_psm free of an undefined symbol.
 */
void
apic_ret()
{
	/* nothing to do */
}
1140 
1141 /*
1142  * Call rebind to do the actual programming.
1143  */
1144 int
1145 apic_setup_io_intr(void *p, int irq, boolean_t deferred)
1146 {
1147 	apic_irq_t *irqptr;
1148 	struct ioapic_reprogram_data *drep = NULL;
1149 	int rv, cpu;
1150 	cpuset_t cpus;
1151 
1152 	/*
1153 	 * Set cpu based on xen idea of online cpu's not apic tables.
1154 	 * Note that xen ignores/sets to it's own preferred value the
1155 	 * target cpu field when programming ioapic anyway.
1156 	 */
1157 	if ((cpu = xen_psm_bind_intr(irq)) == IRQ_UNBOUND) {
1158 		CPUSET_ZERO(cpus);
1159 		CPUSET_OR(cpus, xen_psm_cpus_online);
1160 	} else {
1161 		CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
1162 	}
1163 	apic_irq_table[irq]->airq_cpu = cpu;
1164 	if (deferred) {
1165 		drep = (struct ioapic_reprogram_data *)p;
1166 		ASSERT(drep != NULL);
1167 		irqptr = drep->irqp;
1168 	} else {
1169 		irqptr = (apic_irq_t *)p;
1170 	}
1171 	ASSERT(irqptr != NULL);
1172 	rv = apic_rebind(irqptr, cpu, drep);
1173 	if (rv) {
1174 		/* CPU is not up or interrupt is disabled. Fall back to 0 */
1175 		cpu = 0;
1176 		rv = apic_rebind(irqptr, cpu, drep);
1177 	}
1178 	/*
1179 	 * If rebind successful bind the irq to an event channel
1180 	 */
1181 	if (rv == 0) {
1182 		ec_setup_pirq(irq, irqptr->airq_ipl, &cpus);
1183 		CPUSET_FIND(cpus, cpu);
1184 		apic_irq_table[irq]->airq_temp_cpu = cpu & ~IRQ_USER_BOUND;
1185 	}
1186 	return (rv);
1187 }
1188 
1189 /*
1190  * Allocate a new vector for the given irq
1191  */
1192 /* ARGSUSED */
1193 uchar_t
1194 apic_modify_vector(uchar_t vector, int irq)
1195 {
1196 	return (apic_allocate_vector(0, irq, 0));
1197 }
1198 
1199 /*
1200  * The rest of the file is just generic psm module boilerplate
1201  */
1202 
1203 static struct psm_ops xen_psm_ops = {
1204 	xen_psm_probe,				/* psm_probe		*/
1205 
1206 	xen_psm_softinit,			/* psm_init		*/
1207 	xen_psm_picinit,			/* psm_picinit		*/
1208 	xen_psm_intr_enter,			/* psm_intr_enter	*/
1209 	xen_psm_intr_exit,			/* psm_intr_exit	*/
1210 	xen_psm_setspl,				/* psm_setspl		*/
1211 	xen_psm_addspl,				/* psm_addspl		*/
1212 	xen_psm_delspl,				/* psm_delspl		*/
1213 	xen_psm_disable_intr,			/* psm_disable_intr	*/
1214 	xen_psm_enable_intr,			/* psm_enable_intr	*/
1215 	(int (*)(int))NULL,			/* psm_softlvl_to_irq	*/
1216 	(void (*)(int))NULL,			/* psm_set_softintr	*/
1217 	(void (*)(processorid_t))NULL,		/* psm_set_idlecpu	*/
1218 	(void (*)(processorid_t))NULL,		/* psm_unset_idlecpu	*/
1219 
1220 	xen_psm_clkinit,			/* psm_clkinit		*/
1221 	xen_psm_get_clockirq,			/* psm_get_clockirq	*/
1222 	xen_psm_hrtimeinit,			/* psm_hrtimeinit	*/
1223 	xpv_gethrtime,				/* psm_gethrtime	*/
1224 
1225 	xen_psm_get_next_processorid,		/* psm_get_next_processorid */
1226 	xen_psm_cpu_start,			/* psm_cpu_start	*/
1227 	xen_psm_post_cpu_start,			/* psm_post_cpu_start	*/
1228 	xen_psm_shutdown,			/* psm_shutdown		*/
1229 	xen_psm_get_ipivect,			/* psm_get_ipivect	*/
1230 	xen_psm_send_ipi,			/* psm_send_ipi		*/
1231 
1232 	xen_psm_translate_irq,			/* psm_translate_irq	*/
1233 
1234 	(void (*)(int, char *))NULL,		/* psm_notify_error	*/
1235 	(void (*)(int msg))NULL,		/* psm_notify_func	*/
1236 	xen_psm_timer_reprogram,		/* psm_timer_reprogram	*/
1237 	xen_psm_timer_enable,			/* psm_timer_enable	*/
1238 	xen_psm_timer_disable,			/* psm_timer_disable	*/
1239 	(void (*)(void *arg))NULL,		/* psm_post_cyclic_setup */
1240 	(void (*)(int, int))NULL,		/* psm_preshutdown	*/
1241 	xen_intr_ops			/* Advanced DDI Interrupt framework */
1242 };
1243 
1244 static struct psm_info xen_psm_info = {
1245 	PSM_INFO_VER01_5,	/* version				*/
1246 	PSM_OWN_EXCLUSIVE,	/* ownership				*/
1247 	&xen_psm_ops,		/* operation				*/
1248 	"xVM_psm",		/* machine name				*/
1249 	"platform module %I%"	/* machine descriptions			*/
1250 };
1251 
1252 static void *xen_psm_hdlp;
1253 
1254 int
1255 _init(void)
1256 {
1257 	return (psm_mod_init(&xen_psm_hdlp, &xen_psm_info));
1258 }
1259 
1260 int
1261 _fini(void)
1262 {
1263 	return (psm_mod_fini(&xen_psm_hdlp, &xen_psm_info));
1264 }
1265 
1266 int
1267 _info(struct modinfo *modinfop)
1268 {
1269 	return (psm_mod_info(&xen_psm_hdlp, &xen_psm_info, modinfop));
1270 }
1271