xref: /illumos-gate/usr/src/uts/i86xpv/io/psm/xpv_psm.c (revision 23524732d002da91177f82bdfa44378749661577)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #define	PSMI_1_6
28 
29 #include <sys/mutex.h>
30 #include <sys/types.h>
31 #include <sys/time.h>
32 #include <sys/clock.h>
33 #include <sys/machlock.h>
34 #include <sys/smp_impldefs.h>
35 #include <sys/uadmin.h>
36 #include <sys/promif.h>
37 #include <sys/psm.h>
38 #include <sys/psm_common.h>
39 #include <sys/atomic.h>
40 #include <sys/apic.h>
41 #include <sys/archsystm.h>
42 #include <sys/mach_intr.h>
43 #include <sys/hypervisor.h>
44 #include <sys/evtchn_impl.h>
45 #include <sys/modctl.h>
46 #include <sys/trap.h>
47 #include <sys/panic.h>
48 #include <sys/sysmacros.h>
49 #include <sys/pci_intr_lib.h>
50 #include <vm/hat_i86.h>
51 
52 #include <xen/public/vcpu.h>
53 #include <xen/public/physdev.h>
54 
55 
56 /*
57  * Global Data
58  */
59 
60 int xen_psm_verbose = 0;
61 
62 /* As of now we don't support x2apic in xVM */
63 volatile uint32_t *apicadr = NULL;	/* dummy, so common code will link */
64 int apic_error = 0;
65 int apic_verbose = 0;
66 cpuset_t apic_cpumask;
67 int apic_forceload = 0;
68 uchar_t apic_vectortoipl[APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL] = {
69 	3, 4, 5, 5, 6, 6, 9, 10, 11, 12, 13, 14, 15, 15
70 };
71 uchar_t apic_ipltopri[MAXIPL + 1];
72 uchar_t apic_ipls[APIC_AVAIL_VECTOR];
73 uint_t apic_picinit_called;
74 apic_cpus_info_t *apic_cpus;
75 int xen_psm_intr_policy = INTR_ROUND_ROBIN_WITH_AFFINITY;
76 /* use to make sure only one cpu handles the nmi */
77 static lock_t xen_psm_nmi_lock;
78 int xen_psm_kmdb_on_nmi = 0;		/* 0 - no, 1 - yes enter kmdb */
79 int xen_psm_panic_on_nmi = 0;
80 int xen_psm_num_nmis = 0;
81 
82 cpuset_t xen_psm_cpus_online;	/* online cpus */
83 int xen_psm_ncpus = 1;		/* cpu count */
84 int xen_psm_next_bind_cpu;	/* next cpu to bind an interrupt to */
85 
86 int xen_support_msi = -1;
87 
88 static int xen_clock_irq = INVALID_IRQ;
89 
/* flag definitions for xen_psm_verbose */
#define	XEN_PSM_VERBOSE_IRQ_FLAG		0x00000001
#define	XEN_PSM_VERBOSE_POWEROFF_FLAG		0x00000002
#define	XEN_PSM_VERBOSE_POWEROFF_PAUSE_FLAG	0x00000004

/*
 * Conditional debug output.  "fmt" is a complete, parenthesized argument
 * list, e.g. XEN_PSM_VERBOSE_IRQ((CE_CONT, "irq %d\n", irq));
 *
 * Each macro expands to a single statement (do { } while (0)) so that it
 * can be followed by a semicolon and used safely as the body of an
 * unbraced if/else without capturing a following "else".
 */
#define	XEN_PSM_VERBOSE_IRQ(fmt) \
	do { \
		if (xen_psm_verbose & XEN_PSM_VERBOSE_IRQ_FLAG) \
			cmn_err fmt; \
	} while (0)

#define	XEN_PSM_VERBOSE_POWEROFF(fmt) \
	do { \
		if (xen_psm_verbose & XEN_PSM_VERBOSE_POWEROFF_FLAG) \
			prom_printf fmt; \
	} while (0)
102 
103 /*
104  * Dummy apic array to point common routines at that want to do some apic
105  * manipulation.  Xen doesn't allow guest apic access so we point at these
106  * memory locations to fake out those who want to do apic fiddling.
107  */
108 uint32_t xen_psm_dummy_apic[APIC_IRR_REG + 1];
109 
110 static struct psm_info xen_psm_info;
111 static void xen_psm_setspl(int);
112 
113 int
114 apic_alloc_msi_vectors(dev_info_t *dip, int inum, int count, int pri,
115     int behavior);
116 int
117 apic_alloc_msix_vectors(dev_info_t *dip, int inum, int count, int pri,
118     int behavior);
119 
120 /*
121  * Local support routines
122  */
123 
124 /*
125  * Select vcpu to bind xen virtual device interrupt to.
126  */
127 /*ARGSUSED*/
128 int
129 xen_psm_bind_intr(int irq)
130 {
131 	int bind_cpu;
132 	apic_irq_t *irqptr;
133 
134 	bind_cpu = IRQ_UNBOUND;
135 	if (xen_psm_intr_policy == INTR_LOWEST_PRIORITY)
136 		return (bind_cpu);
137 	if (irq <= APIC_MAX_VECTOR)
138 		irqptr = apic_irq_table[irq];
139 	else
140 		irqptr = NULL;
141 	if (irqptr && (irqptr->airq_cpu != IRQ_UNBOUND))
142 		bind_cpu = irqptr->airq_cpu & ~IRQ_USER_BOUND;
143 	if (bind_cpu != IRQ_UNBOUND) {
144 		if (!CPU_IN_SET(xen_psm_cpus_online, bind_cpu))
145 			bind_cpu = 0;
146 		goto done;
147 	}
148 	if (xen_psm_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
149 		do {
150 			bind_cpu = xen_psm_next_bind_cpu++;
151 			if (xen_psm_next_bind_cpu >= xen_psm_ncpus)
152 				xen_psm_next_bind_cpu = 0;
153 		} while (!CPU_IN_SET(xen_psm_cpus_online, bind_cpu));
154 	} else {
155 		bind_cpu = 0;
156 	}
157 done:
158 	return (bind_cpu);
159 }
160 
161 /*
162  * Autoconfiguration Routines
163  */
164 
165 static int
166 xen_psm_probe(void)
167 {
168 	int ret = PSM_SUCCESS;
169 
170 	if (DOMAIN_IS_INITDOMAIN(xen_info))
171 		ret = apic_probe_common(xen_psm_info.p_mach_idstring);
172 	return (ret);
173 }
174 
175 static void
176 xen_psm_softinit(void)
177 {
178 	/* LINTED logical expression always true: op "||" */
179 	ASSERT((1 << EVTCHN_SHIFT) == NBBY * sizeof (ulong_t));
180 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, 0);
181 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
182 		apic_init_common();
183 	}
184 }
185 
186 #define	XEN_NSEC_PER_TICK	10 /* XXX - assume we have a 100 Mhz clock */
187 
188 /*ARGSUSED*/
189 static int
190 xen_psm_clkinit(int hertz)
191 {
192 	extern enum tod_fault_type tod_fault(enum tod_fault_type, int);
193 	extern int dosynctodr;
194 
195 	/*
196 	 * domU cannot set the TOD hardware, fault the TOD clock now to
197 	 * indicate that and turn off attempts to sync TOD hardware
198 	 * with the hires timer.
199 	 */
200 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
201 		mutex_enter(&tod_lock);
202 		(void) tod_fault(TOD_RDONLY, 0);
203 		dosynctodr = 0;
204 		mutex_exit(&tod_lock);
205 	}
206 	/*
207 	 * The hypervisor provides a timer based on the local APIC timer.
208 	 * The interface supports requests of nanosecond resolution.
209 	 * A common frequency of the apic clock is 100 Mhz which
210 	 * gives a resolution of 10 nsec per tick.  What we would really like
211 	 * is a way to get the ns per tick value from xen.
212 	 * XXPV - This is an assumption that needs checking and may change
213 	 */
214 	return (XEN_NSEC_PER_TICK);
215 }
216 
/*
 * High-resolution time initialisation: just sets gethrtime_hires.
 */
static void
xen_psm_hrtimeinit(void)
{
	extern int gethrtime_hires;

	gethrtime_hires = 1;
}
223 
224 /* xen_psm NMI handler */
225 /*ARGSUSED*/
226 static void
227 xen_psm_nmi_intr(caddr_t arg, struct regs *rp)
228 {
229 	xen_psm_num_nmis++;
230 
231 	if (!lock_try(&xen_psm_nmi_lock))
232 		return;
233 
234 	if (xen_psm_kmdb_on_nmi && psm_debugger()) {
235 		debug_enter("NMI received: entering kmdb\n");
236 	} else if (xen_psm_panic_on_nmi) {
237 		/* Keep panic from entering kmdb. */
238 		nopanicdebug = 1;
239 		panic("NMI received\n");
240 	} else {
241 		/*
242 		 * prom_printf is the best shot we have of something which is
243 		 * problem free from high level/NMI type of interrupts
244 		 */
245 		prom_printf("NMI received\n");
246 	}
247 
248 	lock_clear(&xen_psm_nmi_lock);
249 }
250 
251 static void
252 xen_psm_picinit()
253 {
254 	int cpu, irqno;
255 	cpuset_t cpus;
256 
257 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
258 		/* set a flag so we know we have run xen_psm_picinit() */
259 		apic_picinit_called = 1;
260 		LOCK_INIT_CLEAR(&apic_ioapic_lock);
261 
262 		/* XXPV - do we need to do this? */
263 		picsetup();	 /* initialise the 8259 */
264 
265 		/* enable apic mode if imcr present */
266 		/* XXPV - do we need to do this either? */
267 		if (apic_imcrp) {
268 			outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT);
269 			outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_APIC);
270 		}
271 
272 		ioapic_init_intr(IOAPIC_NOMASK);
273 		/*
274 		 * We never called xen_psm_addspl() when the SCI
275 		 * interrupt was added because that happened before the
276 		 * PSM module was loaded.  Fix that up here by doing
277 		 * any missed operations (e.g. bind to CPU)
278 		 */
279 		if ((irqno = apic_sci_vect) > 0) {
280 			if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
281 				CPUSET_ZERO(cpus);
282 				CPUSET_OR(cpus, xen_psm_cpus_online);
283 			} else {
284 				CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
285 			}
286 			ec_set_irq_affinity(irqno, cpus);
287 			apic_irq_table[irqno]->airq_temp_cpu =
288 			    (uchar_t)(cpu & ~IRQ_USER_BOUND);
289 			ec_enable_irq(irqno);
290 		}
291 	}
292 
293 	/* add nmi handler - least priority nmi handler */
294 	LOCK_INIT_CLEAR(&xen_psm_nmi_lock);
295 
296 	if (!psm_add_nmintr(0, (avfunc) xen_psm_nmi_intr,
297 	    "xVM_psm NMI handler", (caddr_t)NULL))
298 		cmn_err(CE_WARN, "xVM_psm: Unable to add nmi handler");
299 }
300 
301 
302 /*
303  * generates an interprocessor interrupt to another CPU
304  */
305 static void
306 xen_psm_send_ipi(int cpun, int ipl)
307 {
308 	ulong_t flag = intr_clear();
309 
310 	ec_send_ipi(ipl, cpun);
311 	intr_restore(flag);
312 }
313 
314 /*ARGSUSED*/
315 static int
316 xen_psm_addspl(int irqno, int ipl, int min_ipl, int max_ipl)
317 {
318 	int cpu, ret;
319 	cpuset_t cpus;
320 
321 	/*
322 	 * We are called at splhi() so we can't call anything that might end
323 	 * up trying to context switch.
324 	 */
325 	if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
326 	    DOMAIN_IS_INITDOMAIN(xen_info)) {
327 		/*
328 		 * Priority/affinity/enable for PIRQ's is set in ec_setup_pirq()
329 		 */
330 		ret = apic_addspl_common(irqno, ipl, min_ipl, max_ipl);
331 	} else {
332 		/*
333 		 * Set priority/affinity/enable for non PIRQs
334 		 */
335 		ret = ec_set_irq_priority(irqno, ipl);
336 		ASSERT(ret == 0);
337 		if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
338 			CPUSET_ZERO(cpus);
339 			CPUSET_OR(cpus, xen_psm_cpus_online);
340 		} else {
341 			CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
342 		}
343 		ec_set_irq_affinity(irqno, cpus);
344 		ec_enable_irq(irqno);
345 	}
346 	return (ret);
347 }
348 
349 /*
350  * Acquire ownership of this irq on this cpu
351  */
352 void
353 xen_psm_acquire_irq(int irq)
354 {
355 	ulong_t flags;
356 	int cpuid;
357 
358 	/*
359 	 * If the irq is currently being serviced by another cpu
360 	 * we busy-wait for the other cpu to finish.  Take any
361 	 * pending interrupts before retrying.
362 	 */
363 	do {
364 		flags = intr_clear();
365 		cpuid = ec_block_irq(irq);
366 		intr_restore(flags);
367 	} while (cpuid != CPU->cpu_id);
368 }
369 
370 /*ARGSUSED*/
371 static int
372 xen_psm_delspl(int irqno, int ipl, int min_ipl, int max_ipl)
373 {
374 	apic_irq_t *irqptr;
375 	int err = PSM_SUCCESS;
376 
377 	if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
378 	    DOMAIN_IS_INITDOMAIN(xen_info)) {
379 		irqptr = apic_irq_table[irqno];
380 		/*
381 		 * unbind if no more sharers of this irq/evtchn
382 		 */
383 		if (irqptr->airq_share == 1) {
384 			xen_psm_acquire_irq(irqno);
385 			ec_unbind_irq(irqno);
386 		}
387 		err = apic_delspl_common(irqno, ipl, min_ipl, max_ipl);
388 		/*
389 		 * If still in use reset priority
390 		 */
391 		if (!err && irqptr->airq_share != 0) {
392 			err = ec_set_irq_priority(irqno, max_ipl);
393 			return (err);
394 		}
395 	} else {
396 		xen_psm_acquire_irq(irqno);
397 		ec_unbind_irq(irqno);
398 	}
399 	return (err);
400 }
401 
402 static processorid_t
403 xen_psm_get_next_processorid(processorid_t id)
404 {
405 	if (id == -1)
406 		return (0);
407 
408 	for (id++; id < NCPU; id++) {
409 		switch (-HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL)) {
410 		case 0:		/* yeah, that one's there */
411 			return (id);
412 		default:
413 		case X_EINVAL:	/* out of range */
414 			return (-1);
415 		case X_ENOENT:	/* not present in the domain */
416 			/*
417 			 * It's not clear that we -need- to keep looking
418 			 * at this point, if, e.g., we can guarantee
419 			 * the hypervisor always keeps a contiguous range
420 			 * of vcpus around this is equivalent to "out of range".
421 			 *
422 			 * But it would be sad to miss a vcpu we're
423 			 * supposed to be using ..
424 			 */
425 			break;
426 		}
427 	}
428 
429 	return (-1);
430 }
431 
432 /*
433  * XXPV - undo the start cpu op change; return to ignoring this value
434  *	- also tweak error handling in main startup loop
435  */
436 /*ARGSUSED*/
437 static int
438 xen_psm_cpu_start(processorid_t id, caddr_t arg)
439 {
440 	int ret;
441 
442 	ASSERT(id > 0);
443 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, id);
444 	ec_bind_cpu_ipis(id);
445 	(void) ec_bind_virq_to_irq(VIRQ_TIMER, id);
446 	if ((ret = xen_vcpu_up(id)) == 0)
447 		xen_psm_ncpus++;
448 	else
449 		ret = EINVAL;
450 	return (ret);
451 }
452 
/*
 * Allocate an irq for inter cpu signaling at the given ipl.  The "type"
 * argument is not used.
 */
/*ARGSUSED*/
static int
xen_psm_get_ipivect(int ipl, int type)
{
	int irq;

	irq = ec_bind_ipi_to_irq(ipl, 0);
	return (irq);
}
462 
463 /*ARGSUSED*/
464 static int
465 xen_psm_get_clockirq(int ipl)
466 {
467 	if (xen_clock_irq != INVALID_IRQ)
468 		return (xen_clock_irq);
469 
470 	xen_clock_irq = ec_bind_virq_to_irq(VIRQ_TIMER, 0);
471 	return (xen_clock_irq);
472 }
473 
474 /*ARGSUSED*/
475 static void
476 xen_psm_shutdown(int cmd, int fcn)
477 {
478 	XEN_PSM_VERBOSE_POWEROFF(("xen_psm_shutdown(%d,%d);\n", cmd, fcn));
479 
480 	switch (cmd) {
481 	case A_SHUTDOWN:
482 		switch (fcn) {
483 		case AD_BOOT:
484 		case AD_IBOOT:
485 			(void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
486 			break;
487 		case AD_POWEROFF:
488 			/* fall through if domU or if poweroff fails */
489 			if (DOMAIN_IS_INITDOMAIN(xen_info))
490 				if (apic_enable_acpi)
491 					(void) acpi_poweroff();
492 			/* FALLTHRU */
493 		case AD_HALT:
494 		default:
495 			(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
496 			break;
497 		}
498 		break;
499 	case A_REBOOT:
500 		(void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
501 		break;
502 	default:
503 		return;
504 	}
505 }
506 
507 
508 static int
509 xen_psm_translate_irq(dev_info_t *dip, int irqno)
510 {
511 	if (dip == NULL) {
512 		XEN_PSM_VERBOSE_IRQ((CE_CONT, "!xen_psm: irqno = %d"
513 		    " dip = NULL\n", irqno));
514 		return (irqno);
515 	}
516 	return (irqno);
517 }
518 
519 /*
520  * xen_psm_intr_enter() acks the event that triggered the interrupt and
521  * returns the new priority level,
522  */
523 /*ARGSUSED*/
524 static int
525 xen_psm_intr_enter(int ipl, int *vector)
526 {
527 	int newipl;
528 	uint_t intno;
529 	cpu_t *cpu = CPU;
530 
531 	intno = (*vector);
532 
533 	ASSERT(intno < NR_IRQS);
534 	ASSERT(cpu->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask != 0);
535 
536 	if (!ec_is_edge_pirq(intno))
537 		ec_clear_irq(intno);
538 
539 	newipl = autovect[intno].avh_hi_pri;
540 	if (newipl == 0) {
541 		/*
542 		 * (newipl == 0) means we have no service routines for this
543 		 * vector.  We will treat this as a spurious interrupt.
544 		 * We have cleared the pending bit already, clear the event
545 		 * mask and return a spurious interrupt.  This case can happen
546 		 * when an interrupt delivery is racing with the removal of
547 		 * of the service routine for that interrupt.
548 		 */
549 		ec_unmask_irq(intno);
550 		newipl = -1;	/* flag spurious interrupt */
551 	} else if (newipl <= cpu->cpu_pri) {
552 		/*
553 		 * (newipl <= cpu->cpu_pri) means that we must be trying to
554 		 * service a vector that was shared with a higher priority
555 		 * isr.  The higher priority handler has been removed and
556 		 * we need to service this int.  We can't return a lower
557 		 * priority than current cpu priority.  Just synthesize a
558 		 * priority to return that should be acceptable.
559 		 */
560 		newipl = cpu->cpu_pri + 1;	/* synthetic priority */
561 	}
562 	return (newipl);
563 }
564 
565 
/*
 * xen_psm_intr_exit() is the counterpart of xen_psm_intr_enter(): after an
 * interrupt has been processed it tries to unmask the irq for "vector" and
 * then restores the old interrupt priority level "ipl".
 *
 * It is called with interrupts disabled, and does not enable interrupts.
 */
/* ARGSUSED */
static void
xen_psm_intr_exit(int ipl, int vector)
{
	ec_try_unmask_irq(vector);

	/* Going back to ipl may make previously held-off events deliverable. */
	xen_psm_setspl(ipl);
}
578 
579 intr_exit_fn_t
580 psm_intr_exit_fn(void)
581 {
582 	return (xen_psm_intr_exit);
583 }
584 
585 /*
586  * Check if new ipl level allows delivery of previously unserviced events
587  */
588 static void
589 xen_psm_setspl(int ipl)
590 {
591 	struct cpu *cpu = CPU;
592 	volatile vcpu_info_t *vci = cpu->cpu_m.mcpu_vcpu_info;
593 	uint16_t pending;
594 
595 	ASSERT(vci->evtchn_upcall_mask != 0);
596 
597 	/*
598 	 * If new ipl level will enable any pending interrupts, setup so the
599 	 * upcoming sti will cause us to get an upcall.
600 	 */
601 	pending = cpu->cpu_m.mcpu_intr_pending & ~((1 << (ipl + 1)) - 1);
602 	if (pending) {
603 		int i;
604 		ulong_t pending_sels = 0;
605 		volatile ulong_t *selp;
606 		struct xen_evt_data *cpe = cpu->cpu_m.mcpu_evt_pend;
607 
608 		for (i = bsrw_insn(pending); i > ipl; i--)
609 			pending_sels |= cpe->pending_sel[i];
610 		ASSERT(pending_sels);
611 		selp = (volatile ulong_t *)&vci->evtchn_pending_sel;
612 		atomic_or_ulong(selp, pending_sels);
613 		vci->evtchn_upcall_pending = 1;
614 	}
615 }
616 
617 /*
618  * This function provides external interface to the nexus for all
619  * functionality related to the new DDI interrupt framework.
620  *
621  * Input:
622  * dip     - pointer to the dev_info structure of the requested device
623  * hdlp    - pointer to the internal interrupt handle structure for the
624  *	     requested interrupt
625  * intr_op - opcode for this call
626  * result  - pointer to the integer that will hold the result to be
627  *	     passed back if return value is PSM_SUCCESS
628  *
629  * Output:
630  * return value is either PSM_SUCCESS or PSM_FAILURE
631  */
632 int
633 xen_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
634     psm_intr_op_t intr_op, int *result)
635 {
636 	int		cap;
637 	int		err;
638 	int		new_priority;
639 	apic_irq_t	*irqp;
640 	struct intrspec *ispec;
641 
642 	DDI_INTR_IMPLDBG((CE_CONT, "xen_intr_ops: dip: %p hdlp: %p "
643 	    "intr_op: %x\n", (void *)dip, (void *)hdlp, intr_op));
644 
645 	switch (intr_op) {
646 	case PSM_INTR_OP_CHECK_MSI:
647 		/*
648 		 * Till PCI passthru is supported, only dom0 has MSI/MSIX
649 		 */
650 		if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
651 			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
652 			    DDI_INTR_TYPE_MSIX);
653 			break;
654 		}
655 		/*
656 		 * Check MSI/X is supported or not at APIC level and
657 		 * masked off the MSI/X bits in hdlp->ih_type if not
658 		 * supported before return.  If MSI/X is supported,
659 		 * leave the ih_type unchanged and return.
660 		 *
661 		 * hdlp->ih_type passed in from the nexus has all the
662 		 * interrupt types supported by the device.
663 		 */
664 		if (xen_support_msi == 0) {
665 			/*
666 			 * if xen_support_msi is not set, call
667 			 * apic_check_msi_support() to check whether msi
668 			 * is supported first
669 			 */
670 			if (apic_check_msi_support() == PSM_SUCCESS)
671 				xen_support_msi = 1;
672 			else
673 				xen_support_msi = -1;
674 		}
675 		if (xen_support_msi == 1)
676 			*result = hdlp->ih_type;
677 		else
678 			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
679 			    DDI_INTR_TYPE_MSIX);
680 		break;
681 	case PSM_INTR_OP_ALLOC_VECTORS:
682 		if (hdlp->ih_type == DDI_INTR_TYPE_MSI)
683 			*result = apic_alloc_msi_vectors(dip, hdlp->ih_inum,
684 			    hdlp->ih_scratch1, hdlp->ih_pri,
685 			    (int)(uintptr_t)hdlp->ih_scratch2);
686 		else
687 			*result = apic_alloc_msix_vectors(dip, hdlp->ih_inum,
688 			    hdlp->ih_scratch1, hdlp->ih_pri,
689 			    (int)(uintptr_t)hdlp->ih_scratch2);
690 		break;
691 	case PSM_INTR_OP_FREE_VECTORS:
692 		apic_free_vectors(dip, hdlp->ih_inum, hdlp->ih_scratch1,
693 		    hdlp->ih_pri, hdlp->ih_type);
694 		break;
695 	case PSM_INTR_OP_NAVAIL_VECTORS:
696 		/*
697 		 * XXPV - maybe we should make this be:
698 		 * min(APIC_VECTOR_PER_IPL, count of all avail vectors);
699 		 */
700 		if (DOMAIN_IS_INITDOMAIN(xen_info))
701 			*result = APIC_VECTOR_PER_IPL;
702 		else
703 			*result = 1;
704 		break;
705 	case PSM_INTR_OP_XLATE_VECTOR:
706 		ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
707 		if (ispec->intrspec_vec >= PIRQ_BASE &&
708 		    ispec->intrspec_vec < NR_PIRQS &&
709 		    DOMAIN_IS_INITDOMAIN(xen_info)) {
710 			*result = apic_introp_xlate(dip, ispec, hdlp->ih_type);
711 		} else {
712 			*result = ispec->intrspec_vec;
713 		}
714 		break;
715 	case PSM_INTR_OP_GET_PENDING:
716 		/* XXPV - is this enough for dom0 or do we need to ref ioapic */
717 		*result = ec_pending_irq(hdlp->ih_vector);
718 		break;
719 	case PSM_INTR_OP_CLEAR_MASK:
720 		/* XXPV - is this enough for dom0 or do we need to set ioapic */
721 		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
722 			return (PSM_FAILURE);
723 		ec_enable_irq(hdlp->ih_vector);
724 		break;
725 	case PSM_INTR_OP_SET_MASK:
726 		/* XXPV - is this enough for dom0 or do we need to set ioapic */
727 		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
728 			return (PSM_FAILURE);
729 		ec_disable_irq(hdlp->ih_vector);
730 		break;
731 	case PSM_INTR_OP_GET_CAP:
732 		cap = DDI_INTR_FLAG_PENDING | DDI_INTR_FLAG_EDGE;
733 		if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
734 			cap |= DDI_INTR_FLAG_MASKABLE;
735 		*result = cap;
736 		break;
737 	case PSM_INTR_OP_GET_SHARED:
738 		if (DOMAIN_IS_INITDOMAIN(xen_info)) {
739 			if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
740 				return (PSM_FAILURE);
741 			ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
742 			if ((irqp = apic_find_irq(dip, ispec, hdlp->ih_type))
743 			    == NULL)
744 				return (PSM_FAILURE);
745 			*result = (irqp->airq_share > 1) ? 1: 0;
746 		} else {
747 			return (PSM_FAILURE);
748 		}
749 		break;
750 	case PSM_INTR_OP_SET_PRI:
751 		new_priority = *(int *)result;
752 		err = ec_set_irq_priority(hdlp->ih_vector, new_priority);
753 		if (err != 0)
754 			return (PSM_FAILURE);
755 		break;
756 	case PSM_INTR_OP_GET_INTR:
757 		if (!DOMAIN_IS_INITDOMAIN(xen_info))
758 			return (PSM_FAILURE);
759 		/*
760 		 * The interrupt handle given here has been allocated
761 		 * specifically for this command, and ih_private carries
762 		 * a pointer to a apic_get_intr_t.
763 		 */
764 		if (apic_get_vector_intr_info(
765 		    hdlp->ih_vector, hdlp->ih_private) != PSM_SUCCESS)
766 			return (PSM_FAILURE);
767 		break;
768 	case PSM_INTR_OP_SET_CAP:
769 		/* FALLTHRU */
770 	default:
771 		return (PSM_FAILURE);
772 	}
773 	return (PSM_SUCCESS);
774 }
775 
776 static void
777 xen_psm_rebind_irq(int irq)
778 {
779 	cpuset_t ncpu;
780 	processorid_t newcpu;
781 	apic_irq_t *irqptr;
782 
783 	newcpu = xen_psm_bind_intr(irq);
784 	if (newcpu == IRQ_UNBOUND) {
785 		CPUSET_ZERO(ncpu);
786 		CPUSET_OR(ncpu, xen_psm_cpus_online);
787 	} else {
788 		CPUSET_ONLY(ncpu, newcpu & ~IRQ_USER_BOUND);
789 	}
790 	ec_set_irq_affinity(irq, ncpu);
791 	if (irq <= APIC_MAX_VECTOR) {
792 		irqptr = apic_irq_table[irq];
793 		ASSERT(irqptr != NULL);
794 		irqptr->airq_temp_cpu = (uchar_t)newcpu;
795 	}
796 }
797 
798 /*
799  * Disable all device interrupts for the given cpu.
800  * High priority interrupts are not disabled and will still be serviced.
801  */
802 static int
803 xen_psm_disable_intr(processorid_t cpun)
804 {
805 	int irq;
806 
807 	/*
808 	 * Can't offline VCPU 0 on this hypervisor.  There's no reason
809 	 * anyone would want to given that the CPUs are virtual. Also note
810 	 * that the hypervisor requires suspend/resume to be on VCPU 0.
811 	 */
812 	if (cpun == 0)
813 		return (PSM_FAILURE);
814 
815 	CPUSET_ATOMIC_DEL(xen_psm_cpus_online, cpun);
816 	for (irq = 0; irq < NR_IRQS; irq++) {
817 		if (!ec_irq_needs_rebind(irq, cpun))
818 			continue;
819 		xen_psm_rebind_irq(irq);
820 	}
821 	return (PSM_SUCCESS);
822 }
823 
824 static void
825 xen_psm_enable_intr(processorid_t cpun)
826 {
827 	int irq;
828 
829 	if (cpun == 0)
830 		return;
831 
832 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, cpun);
833 
834 	/*
835 	 * Rebalance device interrupts among online processors
836 	 */
837 	for (irq = 0; irq < NR_IRQS; irq++) {
838 		if (!ec_irq_rebindable(irq))
839 			continue;
840 		xen_psm_rebind_irq(irq);
841 	}
842 
843 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
844 		apic_cpus[cpun].aci_status |= APIC_CPU_INTR_ENABLE;
845 	}
846 }
847 
848 static int
849 xen_psm_post_cpu_start()
850 {
851 	processorid_t cpun;
852 
853 	cpun = psm_get_cpu_id();
854 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
855 		/*
856 		 * Non-virtualized environments can call psm_post_cpu_start
857 		 * from Suspend/Resume with the APIC_CPU_INTR_ENABLE bit set.
858 		 * xen_psm_post_cpu_start() is only called from boot.
859 		 */
860 		apic_cpus[cpun].aci_status |= APIC_CPU_ONLINE;
861 	}
862 	return (PSM_SUCCESS);
863 }
864 
865 /*
866  * This function will reprogram the timer.
867  *
868  * When in oneshot mode the argument is the absolute time in future at which to
869  * generate the interrupt.
870  *
871  * When in periodic mode, the argument is the interval at which the
872  * interrupts should be generated. There is no need to support the periodic
873  * mode timer change at this time.
874  *
875  * Note that we must be careful to convert from hrtime to Xen system time (see
876  * xpv_timestamp.c).
877  */
878 static void
879 xen_psm_timer_reprogram(hrtime_t timer_req)
880 {
881 	hrtime_t now, timer_new, time_delta, xen_time;
882 	ulong_t flags;
883 
884 	flags = intr_clear();
885 	/*
886 	 * We should be called from high PIL context (CBE_HIGH_PIL),
887 	 * so kpreempt is disabled.
888 	 */
889 
890 	now = xpv_gethrtime();
891 	xen_time = xpv_getsystime();
892 	if (timer_req <= now) {
893 		/*
894 		 * requested to generate an interrupt in the past
895 		 * generate an interrupt as soon as possible
896 		 */
897 		time_delta = XEN_NSEC_PER_TICK;
898 	} else
899 		time_delta = timer_req - now;
900 
901 	timer_new = xen_time + time_delta;
902 	if (HYPERVISOR_set_timer_op(timer_new) != 0)
903 		panic("can't set hypervisor timer?");
904 	intr_restore(flags);
905 }
906 
907 /*
908  * This function will enable timer interrupts.
909  */
910 static void
911 xen_psm_timer_enable(void)
912 {
913 	ec_unmask_irq(xen_clock_irq);
914 }
915 
916 /*
917  * This function will disable timer interrupts on the current cpu.
918  */
919 static void
920 xen_psm_timer_disable(void)
921 {
922 	(void) ec_block_irq(xen_clock_irq);
923 	/*
924 	 * If the clock irq is pending on this cpu then we need to
925 	 * clear the pending interrupt.
926 	 */
927 	ec_unpend_irq(xen_clock_irq);
928 }
929 
/*
 * The following functions are in the platform specific file so that they
 * can be different functions depending on whether we are running on
 * bare metal or a hypervisor.
 */
936 
937 /*
938  * Allocate a free vector for irq at ipl.
939  */
940 /* ARGSUSED */
941 uchar_t
942 apic_allocate_vector(int ipl, int irq, int pri)
943 {
944 	physdev_irq_t irq_op;
945 	uchar_t vector;
946 	int rc;
947 
948 	irq_op.irq = irq;
949 
950 	if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
951 	    != 0)
952 		panic("Hypervisor alloc vector failed err: %d", -rc);
953 	vector = irq_op.vector;
954 	/*
955 	 * No need to worry about vector colliding with our reserved vectors
956 	 * e.g. T_FASTTRAP, xen can differentiate between hardware and software
957 	 * generated traps and handle them properly.
958 	 */
959 	apic_vector_to_irq[vector] = (uchar_t)irq;
960 	return (vector);
961 }
962 
963 /* Mark vector as not being used by any irq */
964 void
965 apic_free_vector(uchar_t vector)
966 {
967 	apic_vector_to_irq[vector] = APIC_RESV_IRQ;
968 }
969 
970 /*
971  * This function returns the no. of vectors available for the pri.
972  * dip is not used at this moment.  If we really don't need that,
973  * it will be removed.  Since priority is not limited by hardware
974  * when running on the hypervisor we simply return the maximum no.
975  * of available contiguous vectors.
976  */
977 /*ARGSUSED*/
978 int
979 apic_navail_vector(dev_info_t *dip, int pri)
980 {
981 	int	lowest, highest, i, navail, count;
982 
983 	DDI_INTR_IMPLDBG((CE_CONT, "apic_navail_vector: dip: %p, pri: %x\n",
984 	    (void *)dip, pri));
985 
986 	highest = APIC_MAX_VECTOR;
987 	lowest = APIC_BASE_VECT;
988 	navail = count = 0;
989 
990 	/* It has to be contiguous */
991 	for (i = lowest; i < highest; i++) {
992 		count = 0;
993 		while ((apic_vector_to_irq[i] == APIC_RESV_IRQ) &&
994 		    (i < highest)) {
995 			count++;
996 			i++;
997 		}
998 		if (count > navail)
999 			navail = count;
1000 	}
1001 	return (navail);
1002 }
1003 
1004 static physdev_manage_pci_t *managed_devlist;
1005 static int mdev_cnt;
1006 static int mdev_size = 128;
1007 static uchar_t	msi_vector_to_pirq[APIC_MAX_VECTOR+1];
1008 
1009 /*
1010  * Add devfn on given bus to devices managed by hypervisor
1011  */
1012 static int
1013 xen_manage_device(uint8_t bus, uint8_t devfn)
1014 {
1015 	physdev_manage_pci_t manage_pci, *newlist;
1016 	int rc, i, oldsize;
1017 
1018 	/*
1019 	 * Check if bus/devfn already managed.  If so just return success.
1020 	 */
1021 	if (managed_devlist == NULL) {
1022 		managed_devlist = kmem_alloc(sizeof (physdev_manage_pci_t) *
1023 		    mdev_size, KM_NOSLEEP);
1024 		if (managed_devlist == NULL) {
1025 			cmn_err(CE_WARN,
1026 			    "Can't alloc space for managed device list");
1027 			return (0);
1028 		}
1029 	};
1030 	for (i = 0; i < mdev_cnt; i++) {
1031 		if (managed_devlist[i].bus == bus &&
1032 		    managed_devlist[i].devfn == devfn)
1033 			return (1); /* device already managed */
1034 	}
1035 	manage_pci.bus = bus;
1036 	manage_pci.devfn = devfn;
1037 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add, &manage_pci);
1038 	if (rc < 0) {
1039 		cmn_err(CE_WARN,
1040 		    "hypervisor add pci device call failed bus:0x%x"
1041 		    " devfn:0x%x", bus, devfn);
1042 		return (0);
1043 	}
1044 	/*
1045 	 * Add device to the managed device list
1046 	 */
1047 	if (i == mdev_size) {
1048 		/*
1049 		 * grow the managed device list
1050 		 */
1051 		oldsize = mdev_size * sizeof (physdev_manage_pci_t);
1052 		mdev_size *= 2;
1053 		newlist = kmem_alloc(sizeof (physdev_manage_pci_t) * mdev_size,
1054 		    KM_NOSLEEP);
1055 		if (newlist == NULL) {
1056 			cmn_err(CE_WARN, "Can't grow managed device list");
1057 			return (0);
1058 		}
1059 		bcopy(managed_devlist, newlist, oldsize);
1060 		kmem_free(managed_devlist, oldsize);
1061 		managed_devlist = newlist;
1062 	}
1063 	managed_devlist[i].bus = bus;
1064 	managed_devlist[i].devfn = devfn;
1065 	mdev_cnt++;
1066 	return (1);
1067 }
1068 
1069 /*
1070  * allocate an apic irq struct for an MSI interrupt
1071  */
1072 static int
1073 msi_allocate_irq(int irq)
1074 {
1075 	apic_irq_t *irqptr = apic_irq_table[irq];
1076 
1077 	if (irqptr == NULL) {
1078 		irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_NOSLEEP);
1079 		if (irqptr == NULL) {
1080 			cmn_err(CE_WARN, "xpv_psm: NO memory to allocate IRQ");
1081 			return (-1);
1082 		}
1083 		apic_irq_table[irq] = irqptr;
1084 	} else {
1085 		if (irq == APIC_RESV_IRQ && irqptr->airq_mps_intr_index == 0)
1086 			irqptr->airq_mps_intr_index = FREE_INDEX;
1087 		if (irqptr->airq_mps_intr_index != FREE_INDEX) {
1088 			cmn_err(CE_WARN, "xpv_psm: MSI IRQ already in use");
1089 			return (-1);
1090 		}
1091 	}
1092 	irqptr->airq_mps_intr_index = FREE_INDEX;
1093 	return (irq);
1094 }
1095 
1096 /*
1097  * read MSI/MSIX vector out of config space
1098  */
1099 static uchar_t
1100 xpv_psm_get_msi_vector(dev_info_t *dip, int type, int entry)
1101 {
1102 	uint64_t		msi_data = 0;
1103 	int			cap_ptr = i_ddi_get_msi_msix_cap_ptr(dip);
1104 	ddi_acc_handle_t	handle = i_ddi_get_pci_config_handle(dip);
1105 	ushort_t		msi_ctrl;
1106 	uchar_t			vector;
1107 
1108 	ASSERT((handle != NULL) && (cap_ptr != 0));
1109 	if (type == DDI_INTR_TYPE_MSI) {
1110 		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1111 		/*
1112 		 * Get vector
1113 		 */
1114 		if (msi_ctrl &  PCI_MSI_64BIT_MASK) {
1115 			msi_data = pci_config_get16(handle,
1116 			    cap_ptr + PCI_MSI_64BIT_DATA);
1117 		} else {
1118 			msi_data = pci_config_get16(handle,
1119 			    cap_ptr + PCI_MSI_32BIT_DATA);
1120 		}
1121 	} else if (type == DDI_INTR_TYPE_MSIX) {
1122 		uintptr_t	off;
1123 		ddi_intr_msix_t	*msix_p = i_ddi_get_msix(dip);
1124 
1125 		/* Offset into the given entry in the MSI-X table */
1126 		off = (uintptr_t)msix_p->msix_tbl_addr +
1127 		    (entry  * PCI_MSIX_VECTOR_SIZE);
1128 
1129 		msi_data = ddi_get32(msix_p->msix_tbl_hdl,
1130 		    (uint32_t *)(off + PCI_MSIX_DATA_OFFSET));
1131 	}
1132 	vector = msi_data & 0xff;
1133 	return (vector);
1134 }
1135 
1136 
1137 static void
1138 get_busdevfn(dev_info_t *dip, int *busp, int *devfnp)
1139 {
1140 	pci_regspec_t *regspec;
1141 	int reglen;
1142 
1143 	/*
1144 	 * Get device reg spec, first word has PCI bus and
1145 	 * device/function info we need.
1146 	 */
1147 	if (ddi_getlongprop(DDI_DEV_T_NONE, dip, DDI_PROP_DONTPASS, "reg",
1148 	    (caddr_t)&regspec, &reglen) != DDI_SUCCESS) {
1149 		cmn_err(CE_WARN,
1150 		    "get_busdevfn() failed to get regspec.");
1151 		return;
1152 	}
1153 	/*
1154 	 * get PCI bus # from reg spec for device
1155 	 */
1156 	*busp = PCI_REG_BUS_G(regspec[0].pci_phys_hi);
1157 	/*
1158 	 * get combined device/function from reg spec for device.
1159 	 */
1160 	*devfnp = (regspec[0].pci_phys_hi & (PCI_REG_FUNC_M | PCI_REG_DEV_M)) >>
1161 	    PCI_REG_FUNC_SHIFT;
1162 
1163 	kmem_free(regspec, reglen);
1164 }
1165 
1166 /*
1167  * This function allocates "count" MSI vector(s) for the given "dip/pri/type"
1168  */
1169 int
1170 apic_alloc_msi_vectors(dev_info_t *dip, int inum, int count, int pri,
1171     int behavior)
1172 {
1173 	int	rcount, i, rc, irqno;
1174 	uchar_t	vector, cpu;
1175 	major_t	major;
1176 	apic_irq_t	*irqptr;
1177 	physdev_map_pirq_t map_irq;
1178 	int busnum, devfn;
1179 
1180 	DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: dip=0x%p "
1181 	    "inum=0x%x  pri=0x%x count=0x%x behavior=%d\n",
1182 	    (void *)dip, inum, pri, count, behavior));
1183 
1184 	if (count > 1) {
1185 		if (behavior == DDI_INTR_ALLOC_STRICT &&
1186 		    apic_multi_msi_enable == 0)
1187 			return (0);
1188 		if (apic_multi_msi_enable == 0)
1189 			count = 1;
1190 	}
1191 
1192 	if ((rcount = apic_navail_vector(dip, pri)) > count)
1193 		rcount = count;
1194 	else if (rcount == 0 || (rcount < count &&
1195 	    behavior == DDI_INTR_ALLOC_STRICT))
1196 		return (0);
1197 
1198 	/* if not ISP2, then round it down */
1199 	if (!ISP2(rcount))
1200 		rcount = 1 << (highbit(rcount) - 1);
1201 
1202 	/*
1203 	 * get PCI bus #  and devfn from reg spec for device
1204 	 */
1205 	get_busdevfn(dip, &busnum, &devfn);
1206 
1207 	/*
1208 	 * Tell xen about this pci device
1209 	 */
1210 	if (!xen_manage_device(busnum, devfn))
1211 		return (0);
1212 
1213 	mutex_enter(&airq_mutex);
1214 
1215 	major = (dip != NULL) ? ddi_name_to_major(ddi_get_name(dip)) : 0;
1216 	for (i = 0; i < rcount; i++) {
1217 		/*
1218 		 * use PHYSDEVOP_map_pirq to have xen map MSI to a pirq
1219 		 */
1220 		map_irq.domid = DOMID_SELF;
1221 		map_irq.type = MAP_PIRQ_TYPE_MSI;
1222 		map_irq.index = -1; /* hypervisor auto allocates vector */
1223 		map_irq.pirq = -1;
1224 		map_irq.bus = busnum;
1225 		map_irq.devfn = devfn;
1226 		map_irq.entry_nr = 0;
1227 		map_irq.table_base = 0;
1228 		rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
1229 		irqno = map_irq.pirq;
1230 		if (rc < 0) {
1231 			mutex_exit(&airq_mutex);
1232 			cmn_err(CE_WARN, "map MSI irq failed err: %d", -rc);
1233 			return (0);
1234 		}
1235 		if (irqno < 0) {
1236 			mutex_exit(&airq_mutex);
1237 			cmn_err(CE_NOTE,
1238 			    "!hypervisor not configured for MSI support");
1239 			xen_support_msi = -1;
1240 			return (0);
1241 		}
1242 		if (msi_allocate_irq(irqno) < 0) {
1243 			mutex_exit(&airq_mutex);
1244 			return (0);
1245 		}
1246 		/*
1247 		 * Find out what vector the hypervisor assigned
1248 		 */
1249 		vector = xpv_psm_get_msi_vector(dip, DDI_INTR_TYPE_MSI, 0);
1250 		apic_max_device_irq = max(irqno, apic_max_device_irq);
1251 		apic_min_device_irq = min(irqno, apic_min_device_irq);
1252 		irqptr = apic_irq_table[irqno];
1253 		ASSERT(irqptr != NULL);
1254 #ifdef	DEBUG
1255 		if (apic_vector_to_irq[vector] != APIC_RESV_IRQ)
1256 			DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: "
1257 			    "apic_vector_to_irq is not APIC_RESV_IRQ\n"));
1258 #endif
1259 		apic_vector_to_irq[vector] = (uchar_t)irqno;
1260 		msi_vector_to_pirq[vector] = (uchar_t)irqno;
1261 
1262 		irqptr->airq_vector = vector;
1263 		irqptr->airq_ioapicindex = (uchar_t)inum;	/* start */
1264 		irqptr->airq_intin_no = (uchar_t)rcount;
1265 		irqptr->airq_ipl = pri;
1266 		irqptr->airq_origirq = (uchar_t)(inum + i);
1267 		irqptr->airq_share_id = 0;
1268 		irqptr->airq_mps_intr_index = MSI_INDEX;
1269 		irqptr->airq_dip = dip;
1270 		irqptr->airq_major = major;
1271 		if (i == 0) /* they all bind to the same cpu */
1272 			cpu = irqptr->airq_cpu = xen_psm_bind_intr(irqno);
1273 		else
1274 			irqptr->airq_cpu = cpu;
1275 		DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: irq=0x%x "
1276 		    "dip=0x%p vector=0x%x origirq=0x%x pri=0x%x\n", irqno,
1277 		    (void *)irqptr->airq_dip, irqptr->airq_vector,
1278 		    irqptr->airq_origirq, pri));
1279 	}
1280 	mutex_exit(&airq_mutex);
1281 	return (rcount);
1282 }
1283 
1284 /*
1285  * This function allocates "count" MSI-X vector(s) for the given "dip/pri/type"
1286  */
1287 int
1288 apic_alloc_msix_vectors(dev_info_t *dip, int inum, int count, int pri,
1289     int behavior)
1290 {
1291 	int	rcount, i, rc;
1292 	major_t	major;
1293 	physdev_map_pirq_t map_irq;
1294 	int busnum, devfn;
1295 	ddi_intr_msix_t *msix_p = i_ddi_get_msix(dip);
1296 	uint64_t table_base;
1297 	pfn_t pfnum;
1298 
1299 	if (msix_p == NULL) {
1300 		msix_p = pci_msix_init(dip);
1301 		if (msix_p != NULL) {
1302 			i_ddi_set_msix(dip, msix_p);
1303 		} else {
1304 			cmn_err(CE_WARN, "apic_alloc_msix_vectors()"
1305 			    " msix_init failed");
1306 			return (0);
1307 		}
1308 	}
1309 	/*
1310 	 * Hypervisor wants PCI config space address of msix table
1311 	 */
1312 	pfnum = hat_getpfnum(kas.a_hat, (caddr_t)msix_p->msix_tbl_addr) &
1313 	    ~PFN_IS_FOREIGN_MFN;
1314 	table_base = (uint64_t)((pfnum << PAGESHIFT) |
1315 	    ((uintptr_t)msix_p->msix_tbl_addr & PAGEOFFSET));
1316 	/*
1317 	 * get PCI bus #  and devfn from reg spec for device
1318 	 */
1319 	get_busdevfn(dip, &busnum, &devfn);
1320 
1321 	/*
1322 	 * Tell xen about this pci device
1323 	 */
1324 	if (!xen_manage_device(busnum, devfn))
1325 		return (0);
1326 	mutex_enter(&airq_mutex);
1327 
1328 	if ((rcount = apic_navail_vector(dip, pri)) > count)
1329 		rcount = count;
1330 	else if (rcount == 0 || (rcount < count &&
1331 	    behavior == DDI_INTR_ALLOC_STRICT)) {
1332 		rcount = 0;
1333 		goto out;
1334 	}
1335 
1336 	major = (dip != NULL) ? ddi_name_to_major(ddi_get_name(dip)) : 0;
1337 	for (i = 0; i < rcount; i++) {
1338 		int irqno;
1339 		uchar_t	vector;
1340 		apic_irq_t	*irqptr;
1341 
1342 		/*
1343 		 * use PHYSDEVOP_map_pirq to have xen map MSI-X to a pirq
1344 		 */
1345 		map_irq.domid = DOMID_SELF;
1346 		map_irq.type = MAP_PIRQ_TYPE_MSI;
1347 		map_irq.index = -1; /* hypervisor auto allocates vector */
1348 		map_irq.pirq = -1;
1349 		map_irq.bus = busnum;
1350 		map_irq.devfn = devfn;
1351 		map_irq.entry_nr = i;
1352 		map_irq.table_base = table_base;
1353 		rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
1354 		irqno = map_irq.pirq;
1355 		if (rc < 0) {
1356 			mutex_exit(&airq_mutex);
1357 			cmn_err(CE_WARN, "map MSI irq failed err: %d", -rc);
1358 			return (0);
1359 		}
1360 		if (irqno < 0) {
1361 			mutex_exit(&airq_mutex);
1362 			cmn_err(CE_NOTE,
1363 			    "!hypervisor not configured for MSI support");
1364 			xen_support_msi = -1;
1365 			return (0);
1366 		}
1367 		/*
1368 		 * Find out what vector the hypervisor assigned
1369 		 */
1370 		vector = xpv_psm_get_msi_vector(dip, DDI_INTR_TYPE_MSIX, i);
1371 		if (msi_allocate_irq(irqno) < 0) {
1372 			mutex_exit(&airq_mutex);
1373 			return (0);
1374 		}
1375 		apic_vector_to_irq[vector] = (uchar_t)irqno;
1376 		msi_vector_to_pirq[vector] = (uchar_t)irqno;
1377 		apic_max_device_irq = max(irqno, apic_max_device_irq);
1378 		apic_min_device_irq = min(irqno, apic_min_device_irq);
1379 		irqptr = apic_irq_table[irqno];
1380 		ASSERT(irqptr != NULL);
1381 		irqptr->airq_vector = (uchar_t)vector;
1382 		irqptr->airq_ipl = pri;
1383 		irqptr->airq_origirq = (uchar_t)(inum + i);
1384 		irqptr->airq_share_id = 0;
1385 		irqptr->airq_mps_intr_index = MSIX_INDEX;
1386 		irqptr->airq_dip = dip;
1387 		irqptr->airq_major = major;
1388 		irqptr->airq_cpu = IRQ_UNBOUND; /* will be bound when addspl */
1389 	}
1390 out:
1391 	mutex_exit(&airq_mutex);
1392 	return (rcount);
1393 }
1394 
1395 
1396 /*
1397  * This finds the apic_irq_t associated with the dip, ispec and type.
1398  * The entry should have already been freed, but it can not have been
1399  * reused yet since the hypervisor can not have reassigned the pirq since
1400  * we have not freed that yet.
1401  */
1402 static apic_irq_t *
1403 msi_find_irq(dev_info_t *dip, struct intrspec *ispec)
1404 {
1405 	apic_irq_t	*irqp;
1406 	int i;
1407 
1408 	for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
1409 		if ((irqp = apic_irq_table[i]) == NULL)
1410 			continue;
1411 		if ((irqp->airq_dip == dip) &&
1412 		    (irqp->airq_origirq == ispec->intrspec_vec) &&
1413 		    (irqp->airq_ipl == ispec->intrspec_pri)) {
1414 			return (irqp);
1415 		}
1416 	}
1417 	return (NULL);
1418 }
1419 
1420 void
1421 apic_free_vectors(dev_info_t *dip, int inum, int count, int pri, int type)
1422 {
1423 	int i, rc;
1424 	physdev_unmap_pirq_t unmap_pirq;
1425 	apic_irq_t *irqptr;
1426 	struct intrspec ispec;
1427 
1428 	DDI_INTR_IMPLDBG((CE_CONT, "apic_free_vectors: dip: %p inum: %x "
1429 	    "count: %x pri: %x type: %x\n",
1430 	    (void *)dip, inum, count, pri, type));
1431 
1432 	/* for MSI/X only */
1433 	if (!DDI_INTR_IS_MSI_OR_MSIX(type))
1434 		return;
1435 
1436 	for (i = 0; i < count; i++) {
1437 		DDI_INTR_IMPLDBG((CE_CONT, "apic_free_vectors: inum=0x%x "
1438 		    "pri=0x%x count=0x%x\n", inum, pri, count));
1439 		ispec.intrspec_vec = inum + i;
1440 		ispec.intrspec_pri = pri;
1441 		if ((irqptr = msi_find_irq(dip, &ispec)) == NULL) {
1442 			cmn_err(CE_WARN,
1443 			    "couldn't find irq %s,%s dip: 0x%p vec: %x pri: %x",
1444 			    ddi_get_name(dip), ddi_get_name_addr(dip),
1445 			    (void *)dip, inum + i, pri);
1446 			continue;
1447 		}
1448 		/*
1449 		 * use PHYSDEVOP_unmap_pirq to have xen unmap MSI from a pirq
1450 		 */
1451 		unmap_pirq.domid = DOMID_SELF;
1452 		unmap_pirq.pirq = msi_vector_to_pirq[irqptr->airq_vector];
1453 		rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_pirq);
1454 		if (rc < 0) {
1455 			cmn_err(CE_WARN, "unmap pirq failed");
1456 			return;
1457 		}
1458 		irqptr->airq_mps_intr_index = FREE_INDEX;
1459 		apic_vector_to_irq[irqptr->airq_vector] = APIC_RESV_IRQ;
1460 	}
1461 }
1462 
1463 /*
1464  * The hypervisor doesn't permit access to local apics directly
1465  */
1466 /* ARGSUSED */
1467 uint32_t *
1468 mapin_apic(uint32_t addr, size_t len, int flags)
1469 {
1470 	/*
1471 	 * Return a pointer to a memory area to fake out the
1472 	 * probe code that wants to read apic registers.
1473 	 * The dummy values will end up being ignored by xen
1474 	 * later on when they are used anyway.
1475 	 */
1476 	xen_psm_dummy_apic[APIC_VERS_REG] = APIC_INTEGRATED_VERS;
1477 	return (xen_psm_dummy_apic);
1478 }
1479 
/*
 * Pretend to map an ioapic.  The configure code that calls this only needs
 * a non-null result; the i86xpv platform will not reference through the
 * returned value.
 */
/* ARGSUSED */
uint32_t *
mapin_ioapic(uint32_t addr, size_t len, int flags)
{
	uint32_t *fake_mapping = (uint32_t *)0x1;

	return (fake_mapping);
}
1490 
1491 /* ARGSUSED */
1492 void
1493 mapout_apic(caddr_t addr, size_t len)
1494 {
1495 }
1496 
1497 /* ARGSUSED */
1498 void
1499 mapout_ioapic(caddr_t addr, size_t len)
1500 {
1501 }
1502 
1503 uint32_t
1504 ioapic_read(int apic_ix, uint32_t reg)
1505 {
1506 	physdev_apic_t apic;
1507 
1508 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1509 	apic.reg = reg;
1510 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic))
1511 		panic("read ioapic %d reg %d failed", apic_ix, reg);
1512 	return (apic.value);
1513 }
1514 
1515 void
1516 ioapic_write(int apic_ix, uint32_t reg, uint32_t value)
1517 {
1518 	physdev_apic_t apic;
1519 
1520 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1521 	apic.reg = reg;
1522 	apic.value = value;
1523 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
1524 		panic("write ioapic %d reg %d failed", apic_ix, reg);
1525 }
1526 
1527 /*
1528  * This function was added as part of x2APIC support in pcplusmp.
1529  */
1530 void
1531 ioapic_write_eoi(int apic_ix, uint32_t value)
1532 {
1533 	physdev_apic_t apic;
1534 
1535 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1536 	apic.reg = APIC_IO_EOI;
1537 	apic.value = value;
1538 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
1539 		panic("write ioapic reg : APIC_IO_EOI %d failed", apic_ix);
1540 }
1541 
/*
 * This function was added as part of x2APIC support in pcplusmp to resolve
 * undefined symbol in xpv_psm.  It takes no arguments and does nothing.
 */
void
x2apic_update_psm(void)
{
}
1550 
/*
 * This function was added as part of x2APIC support in pcplusmp to resolve
 * undefined symbol in xpv_psm.  It takes no arguments and does nothing.
 */
void
apic_ret(void)
{
}
1559 
1560 /*
1561  * Call rebind to do the actual programming.
1562  */
1563 int
1564 apic_setup_io_intr(void *p, int irq, boolean_t deferred)
1565 {
1566 	apic_irq_t *irqptr;
1567 	struct ioapic_reprogram_data *drep = NULL;
1568 	int rv, cpu;
1569 	cpuset_t cpus;
1570 
1571 	if (deferred) {
1572 		drep = (struct ioapic_reprogram_data *)p;
1573 		ASSERT(drep != NULL);
1574 		irqptr = drep->irqp;
1575 	} else {
1576 		irqptr = (apic_irq_t *)p;
1577 	}
1578 	ASSERT(irqptr != NULL);
1579 	/*
1580 	 * Set cpu based on xen idea of online cpu's not apic tables.
1581 	 * Note that xen ignores/sets to it's own preferred value the
1582 	 * target cpu field when programming ioapic anyway.
1583 	 */
1584 	if (irqptr->airq_mps_intr_index == MSI_INDEX)
1585 		cpu = irqptr->airq_cpu; /* MSI cpus are already set */
1586 	else {
1587 		cpu = xen_psm_bind_intr(irq);
1588 		irqptr->airq_cpu = cpu;
1589 	}
1590 	if (cpu == IRQ_UNBOUND) {
1591 		CPUSET_ZERO(cpus);
1592 		CPUSET_OR(cpus, xen_psm_cpus_online);
1593 	} else {
1594 		CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
1595 	}
1596 	rv = apic_rebind(irqptr, cpu, drep);
1597 	if (rv) {
1598 		/* CPU is not up or interrupt is disabled. Fall back to 0 */
1599 		cpu = 0;
1600 		irqptr->airq_cpu = cpu;
1601 		rv = apic_rebind(irqptr, cpu, drep);
1602 	}
1603 	/*
1604 	 * If rebind successful bind the irq to an event channel
1605 	 */
1606 	if (rv == 0) {
1607 		ec_setup_pirq(irq, irqptr->airq_ipl, &cpus);
1608 		CPUSET_FIND(cpus, cpu);
1609 		apic_irq_table[irq]->airq_temp_cpu = cpu & ~IRQ_USER_BOUND;
1610 	}
1611 	return (rv);
1612 }
1613 
1614 /*
1615  * Allocate a new vector for the given irq
1616  */
1617 /* ARGSUSED */
1618 uchar_t
1619 apic_modify_vector(uchar_t vector, int irq)
1620 {
1621 	return (apic_allocate_vector(0, irq, 0));
1622 }
1623 
1624 /*
1625  * The rest of the file is just generic psm module boilerplate
1626  */
1627 
1628 static struct psm_ops xen_psm_ops = {
1629 	xen_psm_probe,				/* psm_probe		*/
1630 
1631 	xen_psm_softinit,			/* psm_init		*/
1632 	xen_psm_picinit,			/* psm_picinit		*/
1633 	xen_psm_intr_enter,			/* psm_intr_enter	*/
1634 	xen_psm_intr_exit,			/* psm_intr_exit	*/
1635 	xen_psm_setspl,				/* psm_setspl		*/
1636 	xen_psm_addspl,				/* psm_addspl		*/
1637 	xen_psm_delspl,				/* psm_delspl		*/
1638 	xen_psm_disable_intr,			/* psm_disable_intr	*/
1639 	xen_psm_enable_intr,			/* psm_enable_intr	*/
1640 	(int (*)(int))NULL,			/* psm_softlvl_to_irq	*/
1641 	(void (*)(int))NULL,			/* psm_set_softintr	*/
1642 	(void (*)(processorid_t))NULL,		/* psm_set_idlecpu	*/
1643 	(void (*)(processorid_t))NULL,		/* psm_unset_idlecpu	*/
1644 
1645 	xen_psm_clkinit,			/* psm_clkinit		*/
1646 	xen_psm_get_clockirq,			/* psm_get_clockirq	*/
1647 	xen_psm_hrtimeinit,			/* psm_hrtimeinit	*/
1648 	xpv_gethrtime,				/* psm_gethrtime	*/
1649 
1650 	xen_psm_get_next_processorid,		/* psm_get_next_processorid */
1651 	xen_psm_cpu_start,			/* psm_cpu_start	*/
1652 	xen_psm_post_cpu_start,			/* psm_post_cpu_start	*/
1653 	xen_psm_shutdown,			/* psm_shutdown		*/
1654 	xen_psm_get_ipivect,			/* psm_get_ipivect	*/
1655 	xen_psm_send_ipi,			/* psm_send_ipi		*/
1656 
1657 	xen_psm_translate_irq,			/* psm_translate_irq	*/
1658 
1659 	(void (*)(int, char *))NULL,		/* psm_notify_error	*/
1660 	(void (*)(int msg))NULL,		/* psm_notify_func	*/
1661 	xen_psm_timer_reprogram,		/* psm_timer_reprogram	*/
1662 	xen_psm_timer_enable,			/* psm_timer_enable	*/
1663 	xen_psm_timer_disable,			/* psm_timer_disable	*/
1664 	(void (*)(void *arg))NULL,		/* psm_post_cyclic_setup */
1665 	(void (*)(int, int))NULL,		/* psm_preshutdown	*/
1666 	xen_intr_ops,			/* Advanced DDI Interrupt framework */
1667 	(int (*)(psm_state_request_t *))NULL	/* psm_state		*/
1668 };
1669 
1670 static struct psm_info xen_psm_info = {
1671 	PSM_INFO_VER01_5,	/* version				*/
1672 	PSM_OWN_EXCLUSIVE,	/* ownership				*/
1673 	&xen_psm_ops,		/* operation				*/
1674 	"xVM_psm",		/* machine name				*/
1675 	"platform module"	/* machine descriptions			*/
1676 };
1677 
1678 static void *xen_psm_hdlp;
1679 
1680 int
1681 _init(void)
1682 {
1683 	return (psm_mod_init(&xen_psm_hdlp, &xen_psm_info));
1684 }
1685 
1686 int
1687 _fini(void)
1688 {
1689 	return (psm_mod_fini(&xen_psm_hdlp, &xen_psm_info));
1690 }
1691 
1692 int
1693 _info(struct modinfo *modinfop)
1694 {
1695 	return (psm_mod_info(&xen_psm_hdlp, &xen_psm_info, modinfop));
1696 }
1697