xref: /illumos-gate/usr/src/uts/i86xpv/io/psm/xpv_psm.c (revision 12042ab213b3af68474f48555504db816a449211)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2018 Joyent, Inc.
26  */
27 
28 #define	PSMI_1_7
29 
30 #include <sys/mutex.h>
31 #include <sys/types.h>
32 #include <sys/time.h>
33 #include <sys/clock.h>
34 #include <sys/machlock.h>
35 #include <sys/smp_impldefs.h>
36 #include <sys/uadmin.h>
37 #include <sys/promif.h>
38 #include <sys/psm.h>
39 #include <sys/psm_common.h>
40 #include <sys/atomic.h>
41 #include <sys/apic.h>
42 #include <sys/archsystm.h>
43 #include <sys/mach_intr.h>
44 #include <sys/hypervisor.h>
45 #include <sys/evtchn_impl.h>
46 #include <sys/modctl.h>
47 #include <sys/trap.h>
48 #include <sys/panic.h>
49 #include <sys/sysmacros.h>
50 #include <sys/pci_intr_lib.h>
51 #include <vm/hat_i86.h>
52 
53 #include <xen/public/vcpu.h>
54 #include <xen/public/physdev.h>
55 
56 
57 /*
58  * Global Data
59  */
60 
61 int xen_psm_verbose = 0;
62 
63 /* As of now we don't support x2apic in xVM */
64 volatile uint32_t *apicadr = NULL;	/* dummy, so common code will link */
65 int apic_error = 0;
66 int apic_verbose = 0;
67 cpuset_t apic_cpumask;
68 int apic_forceload = 0;
69 uchar_t apic_vectortoipl[APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL] = {
70 	3, 4, 5, 5, 6, 6, 9, 10, 11, 12, 13, 14, 15, 15
71 };
72 uchar_t apic_ipltopri[MAXIPL + 1];
73 uchar_t apic_ipls[APIC_AVAIL_VECTOR];
74 uint_t apic_picinit_called;
75 apic_cpus_info_t *apic_cpus;
76 int xen_psm_intr_policy = INTR_ROUND_ROBIN_WITH_AFFINITY;
77 /* use to make sure only one cpu handles the nmi */
78 static lock_t xen_psm_nmi_lock;
79 int xen_psm_kmdb_on_nmi = 0;		/* 0 - no, 1 - yes enter kmdb */
80 int xen_psm_panic_on_nmi = 0;
81 int xen_psm_num_nmis = 0;
82 
83 cpuset_t xen_psm_cpus_online;	/* online cpus */
84 int xen_psm_ncpus = 1;		/* cpu count */
85 int xen_psm_next_bind_cpu;	/* next cpu to bind an interrupt to */
86 
87 int xen_support_msi = 0;
88 
89 static int xen_clock_irq = INVALID_IRQ;
90 
/* flag definitions for xen_psm_verbose */
#define	XEN_PSM_VERBOSE_IRQ_FLAG		0x00000001
#define	XEN_PSM_VERBOSE_POWEROFF_FLAG		0x00000002
#define	XEN_PSM_VERBOSE_POWEROFF_PAUSE_FLAG	0x00000004

/*
 * Conditional debug output helpers.  "fmt" is a complete, parenthesized
 * argument list, e.g. XEN_PSM_VERBOSE_IRQ((CE_CONT, "irq %d\n", irq));
 *
 * The bodies are wrapped in do { } while (0) so that each macro expands to
 * exactly one statement; this keeps them safe inside unbraced if/else
 * constructs (no dangling-else capture, no stray empty statement).
 */
#define	XEN_PSM_VERBOSE_IRQ(fmt) \
	do { \
		if (xen_psm_verbose & XEN_PSM_VERBOSE_IRQ_FLAG) \
			cmn_err fmt; \
	} while (0)

#define	XEN_PSM_VERBOSE_POWEROFF(fmt) \
	do { \
		if (xen_psm_verbose & XEN_PSM_VERBOSE_POWEROFF_FLAG) \
			prom_printf fmt; \
	} while (0)
102 		prom_printf fmt;
103 
104 /*
105  * Dummy apic array to point common routines at that want to do some apic
106  * manipulation.  Xen doesn't allow guest apic access so we point at these
107  * memory locations to fake out those who want to do apic fiddling.
108  */
109 uint32_t xen_psm_dummy_apic[APIC_IRR_REG + 1];
110 
111 static struct psm_info xen_psm_info;
112 static void xen_psm_setspl(int);
113 
114 int
115 apic_alloc_msi_vectors(dev_info_t *dip, int inum, int count, int pri,
116     int behavior);
117 int
118 apic_alloc_msix_vectors(dev_info_t *dip, int inum, int count, int pri,
119     int behavior);
120 
121 /*
122  * Local support routines
123  */
124 
125 /*
126  * Select vcpu to bind xen virtual device interrupt to.
127  */
128 /*ARGSUSED*/
129 int
130 xen_psm_bind_intr(int irq)
131 {
132 	int bind_cpu;
133 	apic_irq_t *irqptr;
134 
135 	bind_cpu = IRQ_UNBOUND;
136 	if (xen_psm_intr_policy == INTR_LOWEST_PRIORITY)
137 		return (bind_cpu);
138 	if (irq <= APIC_MAX_VECTOR)
139 		irqptr = apic_irq_table[irq];
140 	else
141 		irqptr = NULL;
142 	if (irqptr && (irqptr->airq_cpu != IRQ_UNBOUND))
143 		bind_cpu = irqptr->airq_cpu & ~IRQ_USER_BOUND;
144 	if (bind_cpu != IRQ_UNBOUND) {
145 		if (!CPU_IN_SET(xen_psm_cpus_online, bind_cpu))
146 			bind_cpu = 0;
147 		goto done;
148 	}
149 	if (xen_psm_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
150 		do {
151 			bind_cpu = xen_psm_next_bind_cpu++;
152 			if (xen_psm_next_bind_cpu >= xen_psm_ncpus)
153 				xen_psm_next_bind_cpu = 0;
154 		} while (!CPU_IN_SET(xen_psm_cpus_online, bind_cpu));
155 	} else {
156 		bind_cpu = 0;
157 	}
158 done:
159 	return (bind_cpu);
160 }
161 
162 /*
163  * Autoconfiguration Routines
164  */
165 
166 static int
167 xen_psm_probe(void)
168 {
169 	int ret = PSM_SUCCESS;
170 
171 	if (DOMAIN_IS_INITDOMAIN(xen_info))
172 		ret = apic_probe_common(xen_psm_info.p_mach_idstring);
173 	return (ret);
174 }
175 
176 static void
177 xen_psm_softinit(void)
178 {
179 	/* LINTED logical expression always true: op "||" */
180 	ASSERT((1 << EVTCHN_SHIFT) == NBBY * sizeof (ulong_t));
181 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, 0);
182 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
183 		apic_init_common();
184 	}
185 }
186 
187 #define	XEN_NSEC_PER_TICK	10 /* XXX - assume we have a 100 Mhz clock */
188 
189 /*ARGSUSED*/
190 static int
191 xen_psm_clkinit(int hertz)
192 {
193 	extern enum tod_fault_type tod_fault(enum tod_fault_type, int);
194 	extern int dosynctodr;
195 
196 	/*
197 	 * domU cannot set the TOD hardware, fault the TOD clock now to
198 	 * indicate that and turn off attempts to sync TOD hardware
199 	 * with the hires timer.
200 	 */
201 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
202 		mutex_enter(&tod_lock);
203 		(void) tod_fault(TOD_RDONLY, 0);
204 		dosynctodr = 0;
205 		mutex_exit(&tod_lock);
206 	}
207 	/*
208 	 * The hypervisor provides a timer based on the local APIC timer.
209 	 * The interface supports requests of nanosecond resolution.
210 	 * A common frequency of the apic clock is 100 Mhz which
211 	 * gives a resolution of 10 nsec per tick.  What we would really like
212 	 * is a way to get the ns per tick value from xen.
213 	 * XXPV - This is an assumption that needs checking and may change
214 	 */
215 	return (XEN_NSEC_PER_TICK);
216 }
217 
/*
 * High resolution time initialization: just sets gethrtime_hires.
 */
static void
xen_psm_hrtimeinit(void)
{
	extern int gethrtime_hires;

	gethrtime_hires = 1;
}
224 
225 /* xen_psm NMI handler */
226 static uint_t
227 xen_psm_nmi_intr(caddr_t arg __unused, caddr_t arg1 __unused)
228 {
229 	xen_psm_num_nmis++;
230 
231 	if (!lock_try(&xen_psm_nmi_lock))
232 		return (DDI_INTR_UNCLAIMED);
233 
234 	if (xen_psm_kmdb_on_nmi && psm_debugger()) {
235 		debug_enter("NMI received: entering kmdb\n");
236 	} else if (xen_psm_panic_on_nmi) {
237 		/* Keep panic from entering kmdb. */
238 		nopanicdebug = 1;
239 		panic("NMI received\n");
240 	} else {
241 		/*
242 		 * prom_printf is the best shot we have of something which is
243 		 * problem free from high level/NMI type of interrupts
244 		 */
245 		prom_printf("NMI received\n");
246 	}
247 
248 	lock_clear(&xen_psm_nmi_lock);
249 	return (DDI_INTR_CLAIMED);
250 }
251 
252 static void
253 xen_psm_picinit()
254 {
255 	int cpu, irqno;
256 	cpuset_t cpus;
257 
258 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
259 		/* set a flag so we know we have run xen_psm_picinit() */
260 		apic_picinit_called = 1;
261 		LOCK_INIT_CLEAR(&apic_ioapic_lock);
262 
263 		/* XXPV - do we need to do this? */
264 		picsetup();	 /* initialise the 8259 */
265 
266 		/* enable apic mode if imcr present */
267 		/* XXPV - do we need to do this either? */
268 		if (apic_imcrp) {
269 			outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT);
270 			outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_APIC);
271 		}
272 
273 		ioapic_init_intr(IOAPIC_NOMASK);
274 		/*
275 		 * We never called xen_psm_addspl() when the SCI
276 		 * interrupt was added because that happened before the
277 		 * PSM module was loaded.  Fix that up here by doing
278 		 * any missed operations (e.g. bind to CPU)
279 		 */
280 		if ((irqno = apic_sci_vect) > 0) {
281 			if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
282 				CPUSET_ZERO(cpus);
283 				CPUSET_OR(cpus, xen_psm_cpus_online);
284 			} else {
285 				CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
286 			}
287 			ec_set_irq_affinity(irqno, cpus);
288 			apic_irq_table[irqno]->airq_temp_cpu =
289 			    (uchar_t)(cpu & ~IRQ_USER_BOUND);
290 			ec_enable_irq(irqno);
291 		}
292 	}
293 
294 	/* add nmi handler - least priority nmi handler */
295 	LOCK_INIT_CLEAR(&xen_psm_nmi_lock);
296 
297 	if (!psm_add_nmintr(0, xen_psm_nmi_intr,
298 	    "xVM_psm NMI handler", (caddr_t)NULL))
299 		cmn_err(CE_WARN, "xVM_psm: Unable to add nmi handler");
300 }
301 
302 
303 /*
304  * generates an interprocessor interrupt to another CPU
305  */
306 static void
307 xen_psm_send_ipi(int cpun, int ipl)
308 {
309 	ulong_t flag = intr_clear();
310 
311 	ec_send_ipi(ipl, cpun);
312 	intr_restore(flag);
313 }
314 
315 /*ARGSUSED*/
316 static int
317 xen_psm_addspl(int irqno, int ipl, int min_ipl, int max_ipl)
318 {
319 	int cpu, ret;
320 	cpuset_t cpus;
321 
322 	/*
323 	 * We are called at splhi() so we can't call anything that might end
324 	 * up trying to context switch.
325 	 */
326 	if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
327 	    DOMAIN_IS_INITDOMAIN(xen_info)) {
328 		/*
329 		 * Priority/affinity/enable for PIRQ's is set in ec_setup_pirq()
330 		 */
331 		ret = apic_addspl_common(irqno, ipl, min_ipl, max_ipl);
332 	} else {
333 		/*
334 		 * Set priority/affinity/enable for non PIRQs
335 		 */
336 		ret = ec_set_irq_priority(irqno, ipl);
337 		ASSERT(ret == 0);
338 		if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
339 			CPUSET_ZERO(cpus);
340 			CPUSET_OR(cpus, xen_psm_cpus_online);
341 		} else {
342 			CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
343 		}
344 		ec_set_irq_affinity(irqno, cpus);
345 		ec_enable_irq(irqno);
346 	}
347 	return (ret);
348 }
349 
350 /*
351  * Acquire ownership of this irq on this cpu
352  */
353 void
354 xen_psm_acquire_irq(int irq)
355 {
356 	ulong_t flags;
357 	int cpuid;
358 
359 	/*
360 	 * If the irq is currently being serviced by another cpu
361 	 * we busy-wait for the other cpu to finish.  Take any
362 	 * pending interrupts before retrying.
363 	 */
364 	do {
365 		flags = intr_clear();
366 		cpuid = ec_block_irq(irq);
367 		intr_restore(flags);
368 	} while (cpuid != CPU->cpu_id);
369 }
370 
371 /*ARGSUSED*/
372 static int
373 xen_psm_delspl(int irqno, int ipl, int min_ipl, int max_ipl)
374 {
375 	apic_irq_t *irqptr;
376 	int err = PSM_SUCCESS;
377 
378 	if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
379 	    DOMAIN_IS_INITDOMAIN(xen_info)) {
380 		irqptr = apic_irq_table[irqno];
381 		/*
382 		 * unbind if no more sharers of this irq/evtchn
383 		 */
384 		if (irqptr->airq_share == 1) {
385 			xen_psm_acquire_irq(irqno);
386 			ec_unbind_irq(irqno);
387 		}
388 		err = apic_delspl_common(irqno, ipl, min_ipl, max_ipl);
389 		/*
390 		 * If still in use reset priority
391 		 */
392 		if (!err && irqptr->airq_share != 0) {
393 			err = ec_set_irq_priority(irqno, max_ipl);
394 			return (err);
395 		}
396 	} else {
397 		xen_psm_acquire_irq(irqno);
398 		ec_unbind_irq(irqno);
399 	}
400 	return (err);
401 }
402 
403 static processorid_t
404 xen_psm_get_next_processorid(processorid_t id)
405 {
406 	if (id == -1)
407 		return (0);
408 
409 	for (id++; id < NCPU; id++) {
410 		switch (-HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL)) {
411 		case 0:		/* yeah, that one's there */
412 			return (id);
413 		default:
414 		case X_EINVAL:	/* out of range */
415 			return (-1);
416 		case X_ENOENT:	/* not present in the domain */
417 			/*
418 			 * It's not clear that we -need- to keep looking
419 			 * at this point, if, e.g., we can guarantee
420 			 * the hypervisor always keeps a contiguous range
421 			 * of vcpus around this is equivalent to "out of range".
422 			 *
423 			 * But it would be sad to miss a vcpu we're
424 			 * supposed to be using ..
425 			 */
426 			break;
427 		}
428 	}
429 
430 	return (-1);
431 }
432 
433 /*
434  * XXPV - undo the start cpu op change; return to ignoring this value
435  *	- also tweak error handling in main startup loop
436  */
437 /*ARGSUSED*/
438 static int
439 xen_psm_cpu_start(processorid_t id, caddr_t arg)
440 {
441 	int ret;
442 
443 	ASSERT(id > 0);
444 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, id);
445 	ec_bind_cpu_ipis(id);
446 	(void) ec_bind_virq_to_irq(VIRQ_TIMER, id);
447 	if ((ret = xen_vcpu_up(id)) == 0)
448 		xen_psm_ncpus++;
449 	else
450 		ret = EINVAL;
451 	return (ret);
452 }
453 
/*
 * Allocate an irq for inter cpu signaling at the given ipl.
 * The type argument is unused.
 */
/*ARGSUSED*/
static int
xen_psm_get_ipivect(int ipl, int type)
{
	int irq;

	irq = ec_bind_ipi_to_irq(ipl, 0);
	return (irq);
}
463 
464 /*ARGSUSED*/
465 static int
466 xen_psm_get_clockirq(int ipl)
467 {
468 	if (xen_clock_irq != INVALID_IRQ)
469 		return (xen_clock_irq);
470 
471 	xen_clock_irq = ec_bind_virq_to_irq(VIRQ_TIMER, 0);
472 	return (xen_clock_irq);
473 }
474 
475 /*ARGSUSED*/
476 static void
477 xen_psm_shutdown(int cmd, int fcn)
478 {
479 	XEN_PSM_VERBOSE_POWEROFF(("xen_psm_shutdown(%d,%d);\n", cmd, fcn));
480 
481 	switch (cmd) {
482 	case A_SHUTDOWN:
483 		switch (fcn) {
484 		case AD_BOOT:
485 		case AD_IBOOT:
486 			(void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
487 			break;
488 		case AD_POWEROFF:
489 			/* fall through if domU or if poweroff fails */
490 			if (DOMAIN_IS_INITDOMAIN(xen_info))
491 				if (apic_enable_acpi)
492 					(void) acpi_poweroff();
493 			/* FALLTHRU */
494 		case AD_HALT:
495 		default:
496 			(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
497 			break;
498 		}
499 		break;
500 	case A_REBOOT:
501 		(void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
502 		break;
503 	default:
504 		return;
505 	}
506 }
507 
508 
509 static int
510 xen_psm_translate_irq(dev_info_t *dip, int irqno)
511 {
512 	if (dip == NULL) {
513 		XEN_PSM_VERBOSE_IRQ((CE_CONT, "!xen_psm: irqno = %d"
514 		    " dip = NULL\n", irqno));
515 		return (irqno);
516 	}
517 	return (irqno);
518 }
519 
520 /*
521  * xen_psm_intr_enter() acks the event that triggered the interrupt and
522  * returns the new priority level,
523  */
524 /*ARGSUSED*/
525 static int
526 xen_psm_intr_enter(int ipl, int *vector)
527 {
528 	int newipl;
529 	uint_t intno;
530 	cpu_t *cpu = CPU;
531 
532 	intno = (*vector);
533 
534 	ASSERT(intno < NR_IRQS);
535 	ASSERT(cpu->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask != 0);
536 
537 	if (!ec_is_edge_pirq(intno))
538 		ec_clear_irq(intno);
539 
540 	newipl = autovect[intno].avh_hi_pri;
541 	if (newipl == 0) {
542 		/*
543 		 * (newipl == 0) means we have no service routines for this
544 		 * vector.  We will treat this as a spurious interrupt.
545 		 * We have cleared the pending bit already, clear the event
546 		 * mask and return a spurious interrupt.  This case can happen
547 		 * when an interrupt delivery is racing with the removal of
548 		 * of the service routine for that interrupt.
549 		 */
550 		ec_unmask_irq(intno);
551 		newipl = -1;	/* flag spurious interrupt */
552 	} else if (newipl <= cpu->cpu_pri) {
553 		/*
554 		 * (newipl <= cpu->cpu_pri) means that we must be trying to
555 		 * service a vector that was shared with a higher priority
556 		 * isr.  The higher priority handler has been removed and
557 		 * we need to service this int.  We can't return a lower
558 		 * priority than current cpu priority.  Just synthesize a
559 		 * priority to return that should be acceptable.
560 		 * It should never happen that we synthesize a priority that
561 		 * moves us from low-priority to high-priority that would make
562 		 * a us incorrectly run on the high priority stack.
563 		 */
564 		newipl = cpu->cpu_pri + 1;	/* synthetic priority */
565 		ASSERT(newipl != LOCK_LEVEL + 1);
566 	}
567 	return (newipl);
568 }
569 
570 
/*
 * xen_psm_intr_exit() is run after an interrupt has been processed: it
 * tries to unmask the vector and then restores the old interrupt priority
 * level.
 * It is called with interrupts disabled, and does not enable interrupts.
 */
/* ARGSUSED */
static void
xen_psm_intr_exit(int ipl, int vector)
{
	ec_try_unmask_irq(vector);

	/* going back to ipl may make pending events deliverable again */
	xen_psm_setspl(ipl);
}
583 
584 intr_exit_fn_t
585 psm_intr_exit_fn(void)
586 {
587 	return (xen_psm_intr_exit);
588 }
589 
590 /*
591  * Check if new ipl level allows delivery of previously unserviced events
592  */
593 static void
594 xen_psm_setspl(int ipl)
595 {
596 	struct cpu *cpu = CPU;
597 	volatile vcpu_info_t *vci = cpu->cpu_m.mcpu_vcpu_info;
598 	uint16_t pending;
599 
600 	ASSERT(vci->evtchn_upcall_mask != 0);
601 
602 	/*
603 	 * If new ipl level will enable any pending interrupts, setup so the
604 	 * upcoming sti will cause us to get an upcall.
605 	 */
606 	pending = cpu->cpu_m.mcpu_intr_pending & ~((1 << (ipl + 1)) - 1);
607 	if (pending) {
608 		int i;
609 		ulong_t pending_sels = 0;
610 		volatile ulong_t *selp;
611 		struct xen_evt_data *cpe = cpu->cpu_m.mcpu_evt_pend;
612 
613 		for (i = bsrw_insn(pending); i > ipl; i--)
614 			pending_sels |= cpe->pending_sel[i];
615 		ASSERT(pending_sels);
616 		selp = (volatile ulong_t *)&vci->evtchn_pending_sel;
617 		atomic_or_ulong(selp, pending_sels);
618 		vci->evtchn_upcall_pending = 1;
619 	}
620 }
621 
622 /*
623  * This function provides external interface to the nexus for all
624  * functionality related to the new DDI interrupt framework.
625  *
626  * Input:
627  * dip     - pointer to the dev_info structure of the requested device
628  * hdlp    - pointer to the internal interrupt handle structure for the
629  *	     requested interrupt
630  * intr_op - opcode for this call
631  * result  - pointer to the integer that will hold the result to be
632  *	     passed back if return value is PSM_SUCCESS
633  *
634  * Output:
635  * return value is either PSM_SUCCESS or PSM_FAILURE
636  */
637 int
638 xen_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
639     psm_intr_op_t intr_op, int *result)
640 {
641 	int		cap;
642 	int		err;
643 	int		new_priority;
644 	apic_irq_t	*irqp;
645 	struct intrspec *ispec;
646 
647 	DDI_INTR_IMPLDBG((CE_CONT, "xen_intr_ops: dip: %p hdlp: %p "
648 	    "intr_op: %x\n", (void *)dip, (void *)hdlp, intr_op));
649 
650 	switch (intr_op) {
651 	case PSM_INTR_OP_CHECK_MSI:
652 		/*
653 		 * Till PCI passthru is supported, only dom0 has MSI/MSIX
654 		 */
655 		if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
656 			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
657 			    DDI_INTR_TYPE_MSIX);
658 			break;
659 		}
660 		/*
661 		 * Check MSI/X is supported or not at APIC level and
662 		 * masked off the MSI/X bits in hdlp->ih_type if not
663 		 * supported before return.  If MSI/X is supported,
664 		 * leave the ih_type unchanged and return.
665 		 *
666 		 * hdlp->ih_type passed in from the nexus has all the
667 		 * interrupt types supported by the device.
668 		 */
669 		if (xen_support_msi == 0) {
670 			/*
671 			 * if xen_support_msi is not set, call
672 			 * apic_check_msi_support() to check whether msi
673 			 * is supported first
674 			 */
675 			if (apic_check_msi_support() == PSM_SUCCESS)
676 				xen_support_msi = 1;
677 			else
678 				xen_support_msi = -1;
679 		}
680 		if (xen_support_msi == 1)
681 			*result = hdlp->ih_type;
682 		else
683 			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
684 			    DDI_INTR_TYPE_MSIX);
685 		break;
686 	case PSM_INTR_OP_ALLOC_VECTORS:
687 		if (hdlp->ih_type == DDI_INTR_TYPE_MSI)
688 			*result = apic_alloc_msi_vectors(dip, hdlp->ih_inum,
689 			    hdlp->ih_scratch1, hdlp->ih_pri,
690 			    (int)(uintptr_t)hdlp->ih_scratch2);
691 		else
692 			*result = apic_alloc_msix_vectors(dip, hdlp->ih_inum,
693 			    hdlp->ih_scratch1, hdlp->ih_pri,
694 			    (int)(uintptr_t)hdlp->ih_scratch2);
695 		break;
696 	case PSM_INTR_OP_FREE_VECTORS:
697 		apic_free_vectors(dip, hdlp->ih_inum, hdlp->ih_scratch1,
698 		    hdlp->ih_pri, hdlp->ih_type);
699 		break;
700 	case PSM_INTR_OP_NAVAIL_VECTORS:
701 		/*
702 		 * XXPV - maybe we should make this be:
703 		 * min(APIC_VECTOR_PER_IPL, count of all avail vectors);
704 		 */
705 		if (DOMAIN_IS_INITDOMAIN(xen_info))
706 			*result = APIC_VECTOR_PER_IPL;
707 		else
708 			*result = 1;
709 		break;
710 	case PSM_INTR_OP_XLATE_VECTOR:
711 		ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
712 		if (ispec->intrspec_vec >= PIRQ_BASE &&
713 		    ispec->intrspec_vec < NR_PIRQS &&
714 		    DOMAIN_IS_INITDOMAIN(xen_info)) {
715 			*result = apic_introp_xlate(dip, ispec, hdlp->ih_type);
716 		} else {
717 			*result = ispec->intrspec_vec;
718 		}
719 		break;
720 	case PSM_INTR_OP_GET_PENDING:
721 		/* XXPV - is this enough for dom0 or do we need to ref ioapic */
722 		*result = ec_pending_irq(hdlp->ih_vector);
723 		break;
724 	case PSM_INTR_OP_CLEAR_MASK:
725 		/* XXPV - is this enough for dom0 or do we need to set ioapic */
726 		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
727 			return (PSM_FAILURE);
728 		ec_enable_irq(hdlp->ih_vector);
729 		break;
730 	case PSM_INTR_OP_SET_MASK:
731 		/* XXPV - is this enough for dom0 or do we need to set ioapic */
732 		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
733 			return (PSM_FAILURE);
734 		ec_disable_irq(hdlp->ih_vector);
735 		break;
736 	case PSM_INTR_OP_GET_CAP:
737 		cap = DDI_INTR_FLAG_PENDING | DDI_INTR_FLAG_EDGE;
738 		if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
739 			cap |= DDI_INTR_FLAG_MASKABLE;
740 		*result = cap;
741 		break;
742 	case PSM_INTR_OP_GET_SHARED:
743 		if (DOMAIN_IS_INITDOMAIN(xen_info)) {
744 			if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
745 				return (PSM_FAILURE);
746 			ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
747 			if ((irqp = apic_find_irq(dip, ispec, hdlp->ih_type))
748 			    == NULL)
749 				return (PSM_FAILURE);
750 			*result = (irqp->airq_share > 1) ? 1: 0;
751 		} else {
752 			return (PSM_FAILURE);
753 		}
754 		break;
755 	case PSM_INTR_OP_SET_PRI:
756 		new_priority = *(int *)result;
757 		err = ec_set_irq_priority(hdlp->ih_vector, new_priority);
758 		if (err != 0)
759 			return (PSM_FAILURE);
760 		break;
761 	case PSM_INTR_OP_GET_INTR:
762 		if (!DOMAIN_IS_INITDOMAIN(xen_info))
763 			return (PSM_FAILURE);
764 		/*
765 		 * The interrupt handle given here has been allocated
766 		 * specifically for this command, and ih_private carries
767 		 * a pointer to a apic_get_intr_t.
768 		 */
769 		if (apic_get_vector_intr_info(
770 		    hdlp->ih_vector, hdlp->ih_private) != PSM_SUCCESS)
771 			return (PSM_FAILURE);
772 		break;
773 	case PSM_INTR_OP_SET_CAP:
774 		/* FALLTHRU */
775 	default:
776 		return (PSM_FAILURE);
777 	}
778 	return (PSM_SUCCESS);
779 }
780 
781 static void
782 xen_psm_rebind_irq(int irq)
783 {
784 	cpuset_t ncpu;
785 	processorid_t newcpu;
786 	apic_irq_t *irqptr;
787 
788 	newcpu = xen_psm_bind_intr(irq);
789 	if (newcpu == IRQ_UNBOUND) {
790 		CPUSET_ZERO(ncpu);
791 		CPUSET_OR(ncpu, xen_psm_cpus_online);
792 	} else {
793 		CPUSET_ONLY(ncpu, newcpu & ~IRQ_USER_BOUND);
794 	}
795 	ec_set_irq_affinity(irq, ncpu);
796 	if (irq <= APIC_MAX_VECTOR) {
797 		irqptr = apic_irq_table[irq];
798 		ASSERT(irqptr != NULL);
799 		irqptr->airq_temp_cpu = (uchar_t)newcpu;
800 	}
801 }
802 
803 /*
804  * Disable all device interrupts for the given cpu.
805  * High priority interrupts are not disabled and will still be serviced.
806  */
807 static int
808 xen_psm_disable_intr(processorid_t cpun)
809 {
810 	int irq;
811 
812 	/*
813 	 * Can't offline VCPU 0 on this hypervisor.  There's no reason
814 	 * anyone would want to given that the CPUs are virtual. Also note
815 	 * that the hypervisor requires suspend/resume to be on VCPU 0.
816 	 */
817 	if (cpun == 0)
818 		return (PSM_FAILURE);
819 
820 	CPUSET_ATOMIC_DEL(xen_psm_cpus_online, cpun);
821 	for (irq = 0; irq < NR_IRQS; irq++) {
822 		if (!ec_irq_needs_rebind(irq, cpun))
823 			continue;
824 		xen_psm_rebind_irq(irq);
825 	}
826 	return (PSM_SUCCESS);
827 }
828 
829 static void
830 xen_psm_enable_intr(processorid_t cpun)
831 {
832 	int irq;
833 
834 	if (cpun == 0)
835 		return;
836 
837 	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, cpun);
838 
839 	/*
840 	 * Rebalance device interrupts among online processors
841 	 */
842 	for (irq = 0; irq < NR_IRQS; irq++) {
843 		if (!ec_irq_rebindable(irq))
844 			continue;
845 		xen_psm_rebind_irq(irq);
846 	}
847 
848 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
849 		apic_cpus[cpun].aci_status |= APIC_CPU_INTR_ENABLE;
850 	}
851 }
852 
853 static int
854 xen_psm_post_cpu_start()
855 {
856 	processorid_t cpun;
857 
858 	cpun = psm_get_cpu_id();
859 	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
860 		/*
861 		 * Non-virtualized environments can call psm_post_cpu_start
862 		 * from Suspend/Resume with the APIC_CPU_INTR_ENABLE bit set.
863 		 * xen_psm_post_cpu_start() is only called from boot.
864 		 */
865 		apic_cpus[cpun].aci_status |= APIC_CPU_ONLINE;
866 	}
867 	return (PSM_SUCCESS);
868 }
869 
870 /*
871  * This function will reprogram the timer.
872  *
873  * When in oneshot mode the argument is the absolute time in future at which to
874  * generate the interrupt.
875  *
876  * When in periodic mode, the argument is the interval at which the
877  * interrupts should be generated. There is no need to support the periodic
878  * mode timer change at this time.
879  *
880  * Note that we must be careful to convert from hrtime to Xen system time (see
881  * xpv_timestamp.c).
882  */
883 static void
884 xen_psm_timer_reprogram(hrtime_t timer_req)
885 {
886 	hrtime_t now, timer_new, time_delta, xen_time;
887 	ulong_t flags;
888 
889 	flags = intr_clear();
890 	/*
891 	 * We should be called from high PIL context (CBE_HIGH_PIL),
892 	 * so kpreempt is disabled.
893 	 */
894 
895 	now = xpv_gethrtime();
896 	xen_time = xpv_getsystime();
897 	if (timer_req <= now) {
898 		/*
899 		 * requested to generate an interrupt in the past
900 		 * generate an interrupt as soon as possible
901 		 */
902 		time_delta = XEN_NSEC_PER_TICK;
903 	} else
904 		time_delta = timer_req - now;
905 
906 	timer_new = xen_time + time_delta;
907 	if (HYPERVISOR_set_timer_op(timer_new) != 0)
908 		panic("can't set hypervisor timer?");
909 	intr_restore(flags);
910 }
911 
912 /*
913  * This function will enable timer interrupts.
914  */
915 static void
916 xen_psm_timer_enable(void)
917 {
918 	ec_unmask_irq(xen_clock_irq);
919 }
920 
921 /*
922  * This function will disable timer interrupts on the current cpu.
923  */
924 static void
925 xen_psm_timer_disable(void)
926 {
927 	(void) ec_block_irq(xen_clock_irq);
928 	/*
929 	 * If the clock irq is pending on this cpu then we need to
930 	 * clear the pending interrupt.
931 	 */
932 	ec_unpend_irq(xen_clock_irq);
933 }
934 
935 /*
936  *
937  * The following functions are in the platform specific file so that they
938  * can be different functions depending on whether we are running on
939  * bare metal or a hypervisor.
940  */
941 
942 /*
943  * Allocate a free vector for irq at ipl.
944  */
945 /* ARGSUSED */
946 uchar_t
947 apic_allocate_vector(int ipl, int irq, int pri)
948 {
949 	physdev_irq_t irq_op;
950 	uchar_t vector;
951 	int rc;
952 
953 	irq_op.irq = irq;
954 
955 	if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
956 	    != 0)
957 		panic("Hypervisor alloc vector failed err: %d", -rc);
958 	vector = irq_op.vector;
959 	/*
960 	 * No need to worry about vector colliding with our reserved vectors
961 	 * e.g. T_FASTTRAP, xen can differentiate between hardware and software
962 	 * generated traps and handle them properly.
963 	 */
964 	apic_vector_to_irq[vector] = (uchar_t)irq;
965 	return (vector);
966 }
967 
968 /* Mark vector as not being used by any irq */
969 void
970 apic_free_vector(uchar_t vector)
971 {
972 	apic_vector_to_irq[vector] = APIC_RESV_IRQ;
973 }
974 
975 /*
976  * This function returns the no. of vectors available for the pri.
977  * dip is not used at this moment.  If we really don't need that,
978  * it will be removed.  Since priority is not limited by hardware
979  * when running on the hypervisor we simply return the maximum no.
980  * of available contiguous vectors.
981  */
982 /*ARGSUSED*/
983 int
984 apic_navail_vector(dev_info_t *dip, int pri)
985 {
986 	int	lowest, highest, i, navail, count;
987 
988 	DDI_INTR_IMPLDBG((CE_CONT, "apic_navail_vector: dip: %p, pri: %x\n",
989 	    (void *)dip, pri));
990 
991 	highest = APIC_MAX_VECTOR;
992 	lowest = APIC_BASE_VECT;
993 	navail = count = 0;
994 
995 	/* It has to be contiguous */
996 	for (i = lowest; i < highest; i++) {
997 		count = 0;
998 		while ((apic_vector_to_irq[i] == APIC_RESV_IRQ) &&
999 		    (i < highest)) {
1000 			count++;
1001 			i++;
1002 		}
1003 		if (count > navail)
1004 			navail = count;
1005 	}
1006 	return (navail);
1007 }
1008 
1009 static physdev_manage_pci_t *managed_devlist;
1010 static int mdev_cnt;
1011 static int mdev_size = 128;
1012 static uchar_t	msi_vector_to_pirq[APIC_MAX_VECTOR+1];
1013 
1014 /*
1015  * Add devfn on given bus to devices managed by hypervisor
1016  */
1017 static int
1018 xen_manage_device(uint8_t bus, uint8_t devfn)
1019 {
1020 	physdev_manage_pci_t manage_pci, *newlist;
1021 	int rc, i, oldsize;
1022 
1023 	/*
1024 	 * Check if bus/devfn already managed.  If so just return success.
1025 	 */
1026 	if (managed_devlist == NULL) {
1027 		managed_devlist = kmem_alloc(sizeof (physdev_manage_pci_t) *
1028 		    mdev_size, KM_NOSLEEP);
1029 		if (managed_devlist == NULL) {
1030 			cmn_err(CE_WARN,
1031 			    "Can't alloc space for managed device list");
1032 			return (0);
1033 		}
1034 	};
1035 	for (i = 0; i < mdev_cnt; i++) {
1036 		if (managed_devlist[i].bus == bus &&
1037 		    managed_devlist[i].devfn == devfn)
1038 			return (1); /* device already managed */
1039 	}
1040 	manage_pci.bus = bus;
1041 	manage_pci.devfn = devfn;
1042 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add, &manage_pci);
1043 	if (rc < 0) {
1044 		cmn_err(CE_WARN,
1045 		    "hypervisor add pci device call failed bus:0x%x"
1046 		    " devfn:0x%x", bus, devfn);
1047 		return (0);
1048 	}
1049 	/*
1050 	 * Add device to the managed device list
1051 	 */
1052 	if (i == mdev_size) {
1053 		/*
1054 		 * grow the managed device list
1055 		 */
1056 		oldsize = mdev_size * sizeof (physdev_manage_pci_t);
1057 		mdev_size *= 2;
1058 		newlist = kmem_alloc(sizeof (physdev_manage_pci_t) * mdev_size,
1059 		    KM_NOSLEEP);
1060 		if (newlist == NULL) {
1061 			cmn_err(CE_WARN, "Can't grow managed device list");
1062 			return (0);
1063 		}
1064 		bcopy(managed_devlist, newlist, oldsize);
1065 		kmem_free(managed_devlist, oldsize);
1066 		managed_devlist = newlist;
1067 	}
1068 	managed_devlist[i].bus = bus;
1069 	managed_devlist[i].devfn = devfn;
1070 	mdev_cnt++;
1071 	return (1);
1072 }
1073 
1074 /*
1075  * allocate an apic irq struct for an MSI interrupt
1076  */
1077 static int
1078 msi_allocate_irq(int irq)
1079 {
1080 	apic_irq_t *irqptr = apic_irq_table[irq];
1081 
1082 	if (irqptr == NULL) {
1083 		irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_NOSLEEP);
1084 		if (irqptr == NULL) {
1085 			cmn_err(CE_WARN, "xpv_psm: NO memory to allocate IRQ");
1086 			return (-1);
1087 		}
1088 		apic_irq_table[irq] = irqptr;
1089 	} else {
1090 		if (irq == APIC_RESV_IRQ && irqptr->airq_mps_intr_index == 0)
1091 			irqptr->airq_mps_intr_index = FREE_INDEX;
1092 		if (irqptr->airq_mps_intr_index != FREE_INDEX) {
1093 			cmn_err(CE_WARN, "xpv_psm: MSI IRQ already in use");
1094 			return (-1);
1095 		}
1096 	}
1097 	irqptr->airq_mps_intr_index = FREE_INDEX;
1098 	return (irq);
1099 }
1100 
1101 /*
1102  * read MSI/MSIX vector out of config space
1103  */
1104 static uchar_t
1105 xpv_psm_get_msi_vector(dev_info_t *dip, int type, int entry)
1106 {
1107 	uint64_t		msi_data = 0;
1108 	int			cap_ptr = i_ddi_get_msi_msix_cap_ptr(dip);
1109 	ddi_acc_handle_t	handle = i_ddi_get_pci_config_handle(dip);
1110 	ushort_t		msi_ctrl;
1111 	uchar_t			vector;
1112 
1113 	ASSERT((handle != NULL) && (cap_ptr != 0));
1114 	vector = 0;
1115 	if (type == DDI_INTR_TYPE_MSI) {
1116 		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1117 		/*
1118 		 * Get vector
1119 		 */
1120 		if (msi_ctrl &  PCI_MSI_64BIT_MASK) {
1121 			msi_data = pci_config_get16(handle,
1122 			    cap_ptr + PCI_MSI_64BIT_DATA);
1123 		} else {
1124 			msi_data = pci_config_get16(handle,
1125 			    cap_ptr + PCI_MSI_32BIT_DATA);
1126 		}
1127 		vector = (msi_data & 0xff) + entry;
1128 	} else if (type == DDI_INTR_TYPE_MSIX) {
1129 		uintptr_t	off;
1130 		ddi_intr_msix_t	*msix_p = i_ddi_get_msix(dip);
1131 
1132 		/* Offset into the given entry in the MSI-X table */
1133 		off = (uintptr_t)msix_p->msix_tbl_addr +
1134 		    (entry  * PCI_MSIX_VECTOR_SIZE);
1135 
1136 		msi_data = ddi_get32(msix_p->msix_tbl_hdl,
1137 		    (uint32_t *)(off + PCI_MSIX_DATA_OFFSET));
1138 		vector = msi_data & 0xff;
1139 	}
1140 	return (vector);
1141 }
1142 
1143 
1144 static void
1145 get_busdevfn(dev_info_t *dip, int *busp, int *devfnp)
1146 {
1147 	pci_regspec_t *regspec;
1148 	int reglen;
1149 
1150 	/*
1151 	 * Get device reg spec, first word has PCI bus and
1152 	 * device/function info we need.
1153 	 */
1154 	if (ddi_getlongprop(DDI_DEV_T_NONE, dip, DDI_PROP_DONTPASS, "reg",
1155 	    (caddr_t)&regspec, &reglen) != DDI_SUCCESS) {
1156 		cmn_err(CE_WARN,
1157 		    "get_busdevfn() failed to get regspec.");
1158 		return;
1159 	}
1160 	/*
1161 	 * get PCI bus # from reg spec for device
1162 	 */
1163 	*busp = PCI_REG_BUS_G(regspec[0].pci_phys_hi);
1164 	/*
1165 	 * get combined device/function from reg spec for device.
1166 	 */
1167 	*devfnp = (regspec[0].pci_phys_hi & (PCI_REG_FUNC_M | PCI_REG_DEV_M)) >>
1168 	    PCI_REG_FUNC_SHIFT;
1169 
1170 	kmem_free(regspec, reglen);
1171 }
1172 
1173 /*
1174  * This function allocates "count" MSI vector(s) for the given "dip/pri/type"
1175  */
1176 int
1177 apic_alloc_msi_vectors(dev_info_t *dip, int inum, int count, int pri,
1178     int behavior)
1179 {
1180 	int	rcount, i, rc, irqno;
1181 	uchar_t	vector, cpu;
1182 	major_t	major;
1183 	apic_irq_t	*irqptr;
1184 	physdev_map_pirq_t map_irq;
1185 	int busnum, devfn;
1186 
1187 	DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: dip=0x%p "
1188 	    "inum=0x%x  pri=0x%x count=0x%x behavior=%d\n",
1189 	    (void *)dip, inum, pri, count, behavior));
1190 
1191 	if (count > 1) {
1192 		if (behavior == DDI_INTR_ALLOC_STRICT &&
1193 		    apic_multi_msi_enable == 0)
1194 			return (0);
1195 		if (apic_multi_msi_enable == 0)
1196 			count = 1;
1197 	}
1198 
1199 	if ((rcount = apic_navail_vector(dip, pri)) > count)
1200 		rcount = count;
1201 	else if (rcount == 0 || (rcount < count &&
1202 	    behavior == DDI_INTR_ALLOC_STRICT))
1203 		return (0);
1204 
1205 	/* if not ISP2, then round it down */
1206 	if (!ISP2(rcount))
1207 		rcount = 1 << (highbit(rcount) - 1);
1208 
1209 	/*
1210 	 * get PCI bus #  and devfn from reg spec for device
1211 	 */
1212 	get_busdevfn(dip, &busnum, &devfn);
1213 
1214 	/*
1215 	 * Tell xen about this pci device
1216 	 */
1217 	if (!xen_manage_device(busnum, devfn))
1218 		return (0);
1219 
1220 	mutex_enter(&airq_mutex);
1221 
1222 	major = (dip != NULL) ? ddi_name_to_major(ddi_get_name(dip)) : 0;
1223 	for (i = 0; i < rcount; i++) {
1224 		/*
1225 		 * use PHYSDEVOP_map_pirq to have xen map MSI to a pirq
1226 		 */
1227 		map_irq.domid = DOMID_SELF;
1228 		map_irq.type = MAP_PIRQ_TYPE_MSI;
1229 		map_irq.index = -rcount; /* hypervisor auto allocates vectors */
1230 		map_irq.pirq = -1;
1231 		map_irq.bus = busnum;
1232 		map_irq.devfn = devfn;
1233 		map_irq.entry_nr = i;
1234 		map_irq.table_base = 0;
1235 		rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
1236 		irqno = map_irq.pirq;
1237 		if (rc < 0) {
1238 			mutex_exit(&airq_mutex);
1239 			cmn_err(CE_WARN, "map MSI irq failed err: %d", -rc);
1240 			return (i);
1241 		}
1242 		if (irqno < 0) {
1243 			mutex_exit(&airq_mutex);
1244 			cmn_err(CE_NOTE,
1245 			    "!hypervisor not configured for MSI support");
1246 			xen_support_msi = -1;
1247 			return (0);
1248 		}
1249 
1250 		/*
1251 		 * Find out what vector the hypervisor assigned
1252 		 */
1253 		vector = xpv_psm_get_msi_vector(dip, DDI_INTR_TYPE_MSI, i);
1254 
1255 		if (msi_allocate_irq(irqno) < 0) {
1256 			mutex_exit(&airq_mutex);
1257 			return (i);
1258 		}
1259 		apic_max_device_irq = max(irqno, apic_max_device_irq);
1260 		apic_min_device_irq = min(irqno, apic_min_device_irq);
1261 		irqptr = apic_irq_table[irqno];
1262 		ASSERT(irqptr != NULL);
1263 #ifdef	DEBUG
1264 		if (apic_vector_to_irq[vector] != APIC_RESV_IRQ)
1265 			DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: "
1266 			    "apic_vector_to_irq is not APIC_RESV_IRQ\n"));
1267 #endif
1268 		apic_vector_to_irq[vector] = (uchar_t)irqno;
1269 		msi_vector_to_pirq[vector] = (uchar_t)irqno;
1270 
1271 		irqptr->airq_vector = vector;
1272 		irqptr->airq_ioapicindex = (uchar_t)inum;	/* start */
1273 		irqptr->airq_intin_no = (uchar_t)rcount;
1274 		irqptr->airq_ipl = pri;
1275 		irqptr->airq_origirq = (uchar_t)(inum + i);
1276 		irqptr->airq_share_id = 0;
1277 		irqptr->airq_mps_intr_index = MSI_INDEX;
1278 		irqptr->airq_dip = dip;
1279 		irqptr->airq_major = major;
1280 		if (i == 0) /* they all bind to the same cpu */
1281 			cpu = irqptr->airq_cpu = xen_psm_bind_intr(irqno);
1282 		else
1283 			irqptr->airq_cpu = cpu;
1284 		DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: irq=0x%x "
1285 		    "dip=0x%p vector=0x%x origirq=0x%x pri=0x%x\n", irqno,
1286 		    (void *)irqptr->airq_dip, irqptr->airq_vector,
1287 		    irqptr->airq_origirq, pri));
1288 	}
1289 	mutex_exit(&airq_mutex);
1290 	return (rcount);
1291 }
1292 
1293 /*
1294  * This function allocates "count" MSI-X vector(s) for the given "dip/pri/type"
1295  */
1296 int
1297 apic_alloc_msix_vectors(dev_info_t *dip, int inum, int count, int pri,
1298     int behavior)
1299 {
1300 	int	rcount, i, rc;
1301 	major_t	major;
1302 	physdev_map_pirq_t map_irq;
1303 	int busnum, devfn;
1304 	ddi_intr_msix_t *msix_p = i_ddi_get_msix(dip);
1305 	uint64_t table_base;
1306 	pfn_t pfnum;
1307 
1308 	if (msix_p == NULL) {
1309 		msix_p = pci_msix_init(dip);
1310 		if (msix_p != NULL) {
1311 			i_ddi_set_msix(dip, msix_p);
1312 		} else {
1313 			cmn_err(CE_WARN, "apic_alloc_msix_vectors()"
1314 			    " msix_init failed");
1315 			return (0);
1316 		}
1317 	}
1318 	/*
1319 	 * Hypervisor wants PCI config space address of msix table base
1320 	 */
1321 	pfnum = hat_getpfnum(kas.a_hat, (caddr_t)msix_p->msix_tbl_addr) &
1322 	    ~PFN_IS_FOREIGN_MFN;
1323 	table_base = (uint64_t)((pfnum << PAGESHIFT) - msix_p->msix_tbl_offset |
1324 	    ((uintptr_t)msix_p->msix_tbl_addr & PAGEOFFSET));
1325 	/*
1326 	 * get PCI bus #  and devfn from reg spec for device
1327 	 */
1328 	get_busdevfn(dip, &busnum, &devfn);
1329 
1330 	/*
1331 	 * Tell xen about this pci device
1332 	 */
1333 	if (!xen_manage_device(busnum, devfn))
1334 		return (0);
1335 	mutex_enter(&airq_mutex);
1336 
1337 	if ((rcount = apic_navail_vector(dip, pri)) > count)
1338 		rcount = count;
1339 	else if (rcount == 0 || (rcount < count &&
1340 	    behavior == DDI_INTR_ALLOC_STRICT)) {
1341 		rcount = 0;
1342 		goto out;
1343 	}
1344 
1345 	major = (dip != NULL) ? ddi_name_to_major(ddi_get_name(dip)) : 0;
1346 	for (i = 0; i < rcount; i++) {
1347 		int irqno;
1348 		uchar_t	vector;
1349 		apic_irq_t	*irqptr;
1350 
1351 		/*
1352 		 * use PHYSDEVOP_map_pirq to have xen map MSI-X to a pirq
1353 		 */
1354 		map_irq.domid = DOMID_SELF;
1355 		map_irq.type = MAP_PIRQ_TYPE_MSI;
1356 		map_irq.index = -1; /* hypervisor auto allocates vector */
1357 		map_irq.pirq = -1;
1358 		map_irq.bus = busnum;
1359 		map_irq.devfn = devfn;
1360 		map_irq.entry_nr = i;
1361 		map_irq.table_base = table_base;
1362 		rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
1363 		irqno = map_irq.pirq;
1364 		if (rc < 0) {
1365 			mutex_exit(&airq_mutex);
1366 			cmn_err(CE_WARN, "map MSI irq failed err: %d", -rc);
1367 			return (i);
1368 		}
1369 		if (irqno < 0) {
1370 			mutex_exit(&airq_mutex);
1371 			cmn_err(CE_NOTE,
1372 			    "!hypervisor not configured for MSI support");
1373 			xen_support_msi = -1;
1374 			return (0);
1375 		}
1376 		/*
1377 		 * Find out what vector the hypervisor assigned
1378 		 */
1379 		vector = xpv_psm_get_msi_vector(dip, DDI_INTR_TYPE_MSIX, i);
1380 
1381 		if (msi_allocate_irq(irqno) < 0) {
1382 			mutex_exit(&airq_mutex);
1383 			return (i);
1384 		}
1385 		apic_vector_to_irq[vector] = (uchar_t)irqno;
1386 		msi_vector_to_pirq[vector] = (uchar_t)irqno;
1387 		apic_max_device_irq = max(irqno, apic_max_device_irq);
1388 		apic_min_device_irq = min(irqno, apic_min_device_irq);
1389 		irqptr = apic_irq_table[irqno];
1390 		ASSERT(irqptr != NULL);
1391 		irqptr->airq_vector = (uchar_t)vector;
1392 		irqptr->airq_ipl = pri;
1393 		irqptr->airq_origirq = (uchar_t)(inum + i);
1394 		irqptr->airq_share_id = 0;
1395 		irqptr->airq_mps_intr_index = MSIX_INDEX;
1396 		irqptr->airq_dip = dip;
1397 		irqptr->airq_major = major;
1398 		irqptr->airq_cpu = IRQ_UNBOUND; /* will be bound when addspl */
1399 	}
1400 out:
1401 	mutex_exit(&airq_mutex);
1402 	return (rcount);
1403 }
1404 
1405 
1406 /*
1407  * This finds the apic_irq_t associated with the dip, ispec and type.
1408  * The entry should have already been freed, but it can not have been
1409  * reused yet since the hypervisor can not have reassigned the pirq since
1410  * we have not freed that yet.
1411  */
1412 static apic_irq_t *
1413 msi_find_irq(dev_info_t *dip, struct intrspec *ispec)
1414 {
1415 	apic_irq_t	*irqp;
1416 	int i;
1417 
1418 	for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
1419 		if ((irqp = apic_irq_table[i]) == NULL)
1420 			continue;
1421 		if ((irqp->airq_dip == dip) &&
1422 		    (irqp->airq_origirq == ispec->intrspec_vec) &&
1423 		    (irqp->airq_ipl == ispec->intrspec_pri)) {
1424 			return (irqp);
1425 		}
1426 	}
1427 	return (NULL);
1428 }
1429 
1430 void
1431 apic_free_vectors(dev_info_t *dip, int inum, int count, int pri, int type)
1432 {
1433 	int i, rc;
1434 	physdev_unmap_pirq_t unmap_pirq;
1435 	apic_irq_t *irqptr;
1436 	struct intrspec ispec;
1437 
1438 	DDI_INTR_IMPLDBG((CE_CONT, "apic_free_vectors: dip: %p inum: %x "
1439 	    "count: %x pri: %x type: %x\n",
1440 	    (void *)dip, inum, count, pri, type));
1441 
1442 	/* for MSI/X only */
1443 	if (!DDI_INTR_IS_MSI_OR_MSIX(type))
1444 		return;
1445 
1446 	for (i = 0; i < count; i++) {
1447 		DDI_INTR_IMPLDBG((CE_CONT, "apic_free_vectors: inum=0x%x "
1448 		    "pri=0x%x count=0x%x\n", inum, pri, count));
1449 		ispec.intrspec_vec = inum + i;
1450 		ispec.intrspec_pri = pri;
1451 		if ((irqptr = msi_find_irq(dip, &ispec)) == NULL) {
1452 			cmn_err(CE_WARN,
1453 			    "couldn't find irq %s,%s dip: 0x%p vec: %x pri: %x",
1454 			    ddi_get_name(dip), ddi_get_name_addr(dip),
1455 			    (void *)dip, inum + i, pri);
1456 			continue;
1457 		}
1458 		/*
1459 		 * use PHYSDEVOP_unmap_pirq to have xen unmap MSI from a pirq
1460 		 */
1461 		unmap_pirq.domid = DOMID_SELF;
1462 		unmap_pirq.pirq = msi_vector_to_pirq[irqptr->airq_vector];
1463 		rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_pirq);
1464 		if (rc < 0) {
1465 			cmn_err(CE_WARN, "unmap pirq failed");
1466 			return;
1467 		}
1468 		irqptr->airq_mps_intr_index = FREE_INDEX;
1469 		apic_vector_to_irq[irqptr->airq_vector] = APIC_RESV_IRQ;
1470 	}
1471 }
1472 
1473 /*
1474  * The hypervisor doesn't permit access to local apics directly
1475  */
1476 /* ARGSUSED */
1477 uint32_t *
1478 mapin_apic(uint32_t addr, size_t len, int flags)
1479 {
1480 	/*
1481 	 * Return a pointer to a memory area to fake out the
1482 	 * probe code that wants to read apic registers.
1483 	 * The dummy values will end up being ignored by xen
1484 	 * later on when they are used anyway.
1485 	 */
1486 	xen_psm_dummy_apic[APIC_VERS_REG] = APIC_INTEGRATED_VERS;
1487 	return (xen_psm_dummy_apic);
1488 }
1489 
/*
 * No ioapic mapping is made on this platform; the arguments are ignored and
 * a fixed non-null placeholder is returned.
 */
/* ARGSUSED */
uint32_t *
mapin_ioapic(uint32_t addr, size_t len, int flags)
{
	/*
	 * Non-null so that configure code calling this sees success.
	 * The i86xpv platform will not reference through the returned value.
	 */
	uint32_t *placeholder = (uint32_t *)0x1;

	return (placeholder);
}
1500 
/*
 * Counterpart of mapin_apic().  mapin_apic() only returns a pointer to a
 * dummy area, so there is nothing to undo here; both arguments are ignored.
 */
/* ARGSUSED */
void
mapout_apic(caddr_t addr, size_t len)
{
}
1506 
/*
 * Counterpart of mapin_ioapic().  mapin_ioapic() only returns a placeholder
 * value, so there is nothing to undo here; both arguments are ignored.
 */
/* ARGSUSED */
void
mapout_ioapic(caddr_t addr, size_t len)
{
}
1512 
1513 uint32_t
1514 ioapic_read(int apic_ix, uint32_t reg)
1515 {
1516 	physdev_apic_t apic;
1517 
1518 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1519 	apic.reg = reg;
1520 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic))
1521 		panic("read ioapic %d reg %d failed", apic_ix, reg);
1522 	return (apic.value);
1523 }
1524 
1525 void
1526 ioapic_write(int apic_ix, uint32_t reg, uint32_t value)
1527 {
1528 	physdev_apic_t apic;
1529 
1530 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1531 	apic.reg = reg;
1532 	apic.value = value;
1533 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
1534 		panic("write ioapic %d reg %d failed", apic_ix, reg);
1535 }
1536 
1537 /*
1538  * This function was added as part of x2APIC support in pcplusmp.
1539  */
1540 void
1541 ioapic_write_eoi(int apic_ix, uint32_t value)
1542 {
1543 	physdev_apic_t apic;
1544 
1545 	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1546 	apic.reg = APIC_IO_EOI;
1547 	apic.value = value;
1548 	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
1549 		panic("write ioapic reg : APIC_IO_EOI %d failed", apic_ix);
1550 }
1551 
/*
 * This function was added as part of x2APIC support in pcplusmp to resolve
 * undefined symbol in xpv_psm.
 *
 * It is intentionally empty: it takes no action and returns nothing.
 */
void
x2apic_update_psm()
{
}
1560 
/*
 * This function was added as part of x2APIC support in pcplusmp to resolve
 * undefined symbol in xpv_psm.
 *
 * It is intentionally empty: it takes no action and returns nothing.
 */
void
apic_ret()
{
}
1569 
1570 /*
1571  * Call rebind to do the actual programming.
1572  */
1573 int
1574 apic_setup_io_intr(void *p, int irq, boolean_t deferred)
1575 {
1576 	apic_irq_t *irqptr;
1577 	struct ioapic_reprogram_data *drep = NULL;
1578 	int rv, cpu;
1579 	cpuset_t cpus;
1580 
1581 	if (deferred) {
1582 		drep = (struct ioapic_reprogram_data *)p;
1583 		ASSERT(drep != NULL);
1584 		irqptr = drep->irqp;
1585 	} else {
1586 		irqptr = (apic_irq_t *)p;
1587 	}
1588 	ASSERT(irqptr != NULL);
1589 	/*
1590 	 * Set cpu based on xen idea of online cpu's not apic tables.
1591 	 * Note that xen ignores/sets to it's own preferred value the
1592 	 * target cpu field when programming ioapic anyway.
1593 	 */
1594 	if (irqptr->airq_mps_intr_index == MSI_INDEX)
1595 		cpu = irqptr->airq_cpu; /* MSI cpus are already set */
1596 	else {
1597 		cpu = xen_psm_bind_intr(irq);
1598 		irqptr->airq_cpu = cpu;
1599 	}
1600 	if (cpu == IRQ_UNBOUND) {
1601 		CPUSET_ZERO(cpus);
1602 		CPUSET_OR(cpus, xen_psm_cpus_online);
1603 	} else {
1604 		CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
1605 	}
1606 	rv = apic_rebind(irqptr, cpu, drep);
1607 	if (rv) {
1608 		/* CPU is not up or interrupt is disabled. Fall back to 0 */
1609 		cpu = 0;
1610 		irqptr->airq_cpu = cpu;
1611 		rv = apic_rebind(irqptr, cpu, drep);
1612 	}
1613 	/*
1614 	 * If rebind successful bind the irq to an event channel
1615 	 */
1616 	if (rv == 0) {
1617 		ec_setup_pirq(irq, irqptr->airq_ipl, &cpus);
1618 		CPUSET_FIND(cpus, cpu);
1619 		apic_irq_table[irq]->airq_temp_cpu = cpu & ~IRQ_USER_BOUND;
1620 	}
1621 	return (rv);
1622 }
1623 
1624 /*
1625  * Allocate a new vector for the given irq
1626  */
1627 /* ARGSUSED */
1628 uchar_t
1629 apic_modify_vector(uchar_t vector, int irq)
1630 {
1631 	return (apic_allocate_vector(0, irq, 0));
1632 }
1633 
1634 /*
1635  * The rest of the file is just generic psm module boilerplate
1636  */
1637 
1638 static struct psm_ops xen_psm_ops = {
1639 	xen_psm_probe,				/* psm_probe		*/
1640 
1641 	xen_psm_softinit,			/* psm_init		*/
1642 	xen_psm_picinit,			/* psm_picinit		*/
1643 	xen_psm_intr_enter,			/* psm_intr_enter	*/
1644 	xen_psm_intr_exit,			/* psm_intr_exit	*/
1645 	xen_psm_setspl,				/* psm_setspl		*/
1646 	xen_psm_addspl,				/* psm_addspl		*/
1647 	xen_psm_delspl,				/* psm_delspl		*/
1648 	xen_psm_disable_intr,			/* psm_disable_intr	*/
1649 	xen_psm_enable_intr,			/* psm_enable_intr	*/
1650 	(int (*)(int))NULL,			/* psm_softlvl_to_irq	*/
1651 	(void (*)(int))NULL,			/* psm_set_softintr	*/
1652 	(void (*)(processorid_t))NULL,		/* psm_set_idlecpu	*/
1653 	(void (*)(processorid_t))NULL,		/* psm_unset_idlecpu	*/
1654 
1655 	xen_psm_clkinit,			/* psm_clkinit		*/
1656 	xen_psm_get_clockirq,			/* psm_get_clockirq	*/
1657 	xen_psm_hrtimeinit,			/* psm_hrtimeinit	*/
1658 	xpv_gethrtime,				/* psm_gethrtime	*/
1659 
1660 	xen_psm_get_next_processorid,		/* psm_get_next_processorid */
1661 	xen_psm_cpu_start,			/* psm_cpu_start	*/
1662 	xen_psm_post_cpu_start,			/* psm_post_cpu_start	*/
1663 	xen_psm_shutdown,			/* psm_shutdown		*/
1664 	xen_psm_get_ipivect,			/* psm_get_ipivect	*/
1665 	xen_psm_send_ipi,			/* psm_send_ipi		*/
1666 
1667 	xen_psm_translate_irq,			/* psm_translate_irq	*/
1668 
1669 	(void (*)(int, char *))NULL,		/* psm_notify_error	*/
1670 	(void (*)(int msg))NULL,		/* psm_notify_func	*/
1671 	xen_psm_timer_reprogram,		/* psm_timer_reprogram	*/
1672 	xen_psm_timer_enable,			/* psm_timer_enable	*/
1673 	xen_psm_timer_disable,			/* psm_timer_disable	*/
1674 	(void (*)(void *arg))NULL,		/* psm_post_cyclic_setup */
1675 	(void (*)(int, int))NULL,		/* psm_preshutdown	*/
1676 	xen_intr_ops,			/* Advanced DDI Interrupt framework */
1677 	(int (*)(psm_state_request_t *))NULL,	/* psm_state		*/
1678 	(int (*)(psm_cpu_request_t *))NULL,	/* psm_cpu_ops		*/
1679 
1680 	(int (*)(void))NULL,			/* psm_get_pir_ipivect	*/
1681 	(void (*)(processorid_t))NULL,		/* psm_send_pir_ipi	*/
1682 	(void (*)(processorid_t, boolean_t))NULL	/* psm_cmci_setup */
1683 };
1684 
/*
 * Module description passed to psm_mod_init(), psm_mod_fini() and
 * psm_mod_info() by the _init(), _fini() and _info() entry points below.
 * It ties the xen_psm_ops vector to this module's name and description.
 */
static struct psm_info xen_psm_info = {
	PSM_INFO_VER01_5,	/* version				*/
	PSM_OWN_EXCLUSIVE,	/* ownership				*/
	&xen_psm_ops,		/* operation				*/
	"xVM_psm",		/* machine name				*/
	"platform module"	/* machine descriptions			*/
};
1692 
/*
 * Opaque handle whose address is passed to psm_mod_init(), psm_mod_fini()
 * and psm_mod_info(); this file never looks inside it.
 */
static void *xen_psm_hdlp;
1694 
1695 int
1696 _init(void)
1697 {
1698 	return (psm_mod_init(&xen_psm_hdlp, &xen_psm_info));
1699 }
1700 
1701 int
1702 _fini(void)
1703 {
1704 	return (psm_mod_fini(&xen_psm_hdlp, &xen_psm_info));
1705 }
1706 
1707 int
1708 _info(struct modinfo *modinfop)
1709 {
1710 	return (psm_mod_info(&xen_psm_hdlp, &xen_psm_info, modinfop));
1711 }
1712