1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #define PSMI_1_7
28
29 #include <sys/mutex.h>
30 #include <sys/types.h>
31 #include <sys/time.h>
32 #include <sys/clock.h>
33 #include <sys/machlock.h>
34 #include <sys/smp_impldefs.h>
35 #include <sys/uadmin.h>
36 #include <sys/promif.h>
37 #include <sys/psm.h>
38 #include <sys/psm_common.h>
39 #include <sys/atomic.h>
40 #include <sys/apic.h>
41 #include <sys/archsystm.h>
42 #include <sys/mach_intr.h>
43 #include <sys/hypervisor.h>
44 #include <sys/evtchn_impl.h>
45 #include <sys/modctl.h>
46 #include <sys/trap.h>
47 #include <sys/panic.h>
48 #include <sys/sysmacros.h>
49 #include <sys/pci_intr_lib.h>
50 #include <vm/hat_i86.h>
51
52 #include <xen/public/vcpu.h>
53 #include <xen/public/physdev.h>
54
55
56 /*
57 * Global Data
58 */
59
60 int xen_psm_verbose = 0;
61
62 /* As of now we don't support x2apic in xVM */
63 volatile uint32_t *apicadr = NULL; /* dummy, so common code will link */
64 int apic_error = 0;
65 int apic_verbose = 0;
66 cpuset_t apic_cpumask;
67 int apic_forceload = 0;
68 uchar_t apic_vectortoipl[APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL] = {
69 3, 4, 5, 5, 6, 6, 9, 10, 11, 12, 13, 14, 15, 15
70 };
71 uchar_t apic_ipltopri[MAXIPL + 1];
72 uchar_t apic_ipls[APIC_AVAIL_VECTOR];
73 uint_t apic_picinit_called;
74 apic_cpus_info_t *apic_cpus;
75 int xen_psm_intr_policy = INTR_ROUND_ROBIN_WITH_AFFINITY;
/* used to make sure only one cpu handles the nmi */
77 static lock_t xen_psm_nmi_lock;
78 int xen_psm_kmdb_on_nmi = 0; /* 0 - no, 1 - yes enter kmdb */
79 int xen_psm_panic_on_nmi = 0;
80 int xen_psm_num_nmis = 0;
81
82 cpuset_t xen_psm_cpus_online; /* online cpus */
83 int xen_psm_ncpus = 1; /* cpu count */
84 int xen_psm_next_bind_cpu; /* next cpu to bind an interrupt to */
85
86 int xen_support_msi = 0;
87
88 static int xen_clock_irq = INVALID_IRQ;
89
90 /* flag definitions for xen_psm_verbose */
91 #define XEN_PSM_VERBOSE_IRQ_FLAG 0x00000001
92 #define XEN_PSM_VERBOSE_POWEROFF_FLAG 0x00000002
93 #define XEN_PSM_VERBOSE_POWEROFF_PAUSE_FLAG 0x00000004
94
95 #define XEN_PSM_VERBOSE_IRQ(fmt) \
96 if (xen_psm_verbose & XEN_PSM_VERBOSE_IRQ_FLAG) \
97 cmn_err fmt;
98
99 #define XEN_PSM_VERBOSE_POWEROFF(fmt) \
100 if (xen_psm_verbose & XEN_PSM_VERBOSE_POWEROFF_FLAG) \
101 prom_printf fmt;
102
103 /*
 * Dummy apic array for common routines that want to do some apic
 * manipulation to point at. Xen doesn't allow guest apic access, so we
 * point at these memory locations to fake out code that wants to do
 * apic fiddling.
107 */
108 uint32_t xen_psm_dummy_apic[APIC_IRR_REG + 1];
109
110 static struct psm_info xen_psm_info;
111 static void xen_psm_setspl(int);
112
113 int
114 apic_alloc_msi_vectors(dev_info_t *dip, int inum, int count, int pri,
115 int behavior);
116 int
117 apic_alloc_msix_vectors(dev_info_t *dip, int inum, int count, int pri,
118 int behavior);
119
120 /*
121 * Local support routines
122 */
123
124 /*
125 * Select vcpu to bind xen virtual device interrupt to.
126 */
127 /*ARGSUSED*/
128 int
xen_psm_bind_intr(int irq)
130 {
131 int bind_cpu;
132 apic_irq_t *irqptr;
133
134 bind_cpu = IRQ_UNBOUND;
135 if (xen_psm_intr_policy == INTR_LOWEST_PRIORITY)
136 return (bind_cpu);
137 if (irq <= APIC_MAX_VECTOR)
138 irqptr = apic_irq_table[irq];
139 else
140 irqptr = NULL;
141 if (irqptr && (irqptr->airq_cpu != IRQ_UNBOUND))
142 bind_cpu = irqptr->airq_cpu & ~IRQ_USER_BOUND;
143 if (bind_cpu != IRQ_UNBOUND) {
144 if (!CPU_IN_SET(xen_psm_cpus_online, bind_cpu))
145 bind_cpu = 0;
146 goto done;
147 }
148 if (xen_psm_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
149 do {
150 bind_cpu = xen_psm_next_bind_cpu++;
151 if (xen_psm_next_bind_cpu >= xen_psm_ncpus)
152 xen_psm_next_bind_cpu = 0;
153 } while (!CPU_IN_SET(xen_psm_cpus_online, bind_cpu));
154 } else {
155 bind_cpu = 0;
156 }
157 done:
158 return (bind_cpu);
159 }
160
161 /*
162 * Autoconfiguration Routines
163 */
164
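/*
 * PSM probe entry point; in dom0 also run the common APIC probe code.
 */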
165 static int
xen_psm_probe(void)
167 {
168 int ret = PSM_SUCCESS;
169
170 if (DOMAIN_IS_INITDOMAIN(xen_info))
171 ret = apic_probe_common(xen_psm_info.p_mach_idstring);
172 return (ret);
173 }
174
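/*
 * Software init: mark cpu 0 online and, in dom0, do the common apic init.
 */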
175 static void
xen_psm_softinit(void)
177 {
178 /* LINTED logical expression always true: op "||" */
179 ASSERT((1 << EVTCHN_SHIFT) == NBBY * sizeof (ulong_t));
180 CPUSET_ATOMIC_ADD(xen_psm_cpus_online, 0);
181 if (DOMAIN_IS_INITDOMAIN(xen_info)) {
182 apic_init_common();
183 }
184 }
185
#define	XEN_NSEC_PER_TICK	10 /* XXX - assume we have a 100 MHz clock */
187
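/*
 * Clock init; returns the hypervisor timer resolution in nsec per tick.
 */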
188 /*ARGSUSED*/
189 static int
xen_psm_clkinit(int hertz)
191 {
192 extern enum tod_fault_type tod_fault(enum tod_fault_type, int);
193 extern int dosynctodr;
194
195 /*
196 * domU cannot set the TOD hardware, fault the TOD clock now to
197 * indicate that and turn off attempts to sync TOD hardware
198 * with the hires timer.
199 */
200 if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
201 mutex_enter(&tod_lock);
202 (void) tod_fault(TOD_RDONLY, 0);
203 dosynctodr = 0;
204 mutex_exit(&tod_lock);
205 }
206 /*
207 * The hypervisor provides a timer based on the local APIC timer.
208 * The interface supports requests of nanosecond resolution.
 * A common frequency of the apic clock is 100 MHz, which
210 * gives a resolution of 10 nsec per tick. What we would really like
211 * is a way to get the ns per tick value from xen.
212 * XXPV - This is an assumption that needs checking and may change
213 */
214 return (XEN_NSEC_PER_TICK);
215 }
216
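/* High resolution timer init; just note that gethrtime is high resolution. */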
217 static void
xen_psm_hrtimeinit(void)
219 {
220 extern int gethrtime_hires;
221 gethrtime_hires = 1;
222 }
223
224 /* xen_psm NMI handler */
225 /*ARGSUSED*/
226 static void
xen_psm_nmi_intr(caddr_t arg, struct regs *rp)
228 {
229 xen_psm_num_nmis++;
230
231 if (!lock_try(&xen_psm_nmi_lock))
232 return;
233
234 if (xen_psm_kmdb_on_nmi && psm_debugger()) {
235 debug_enter("NMI received: entering kmdb\n");
236 } else if (xen_psm_panic_on_nmi) {
237 /* Keep panic from entering kmdb. */
238 nopanicdebug = 1;
239 panic("NMI received\n");
240 } else {
241 /*
 * prom_printf is the best shot we have at something that is
 * problem-free in high level/NMI interrupt context
244 */
245 prom_printf("NMI received\n");
246 }
247
248 lock_clear(&xen_psm_nmi_lock);
249 }
250
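/*
 * "PIC" initialization: in dom0 set up the 8259/ioapics and the ACPI SCI
 * interrupt; in all domains install the NMI handler.
 */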
251 static void
xen_psm_picinit()
253 {
254 int cpu, irqno;
255 cpuset_t cpus;
256
257 if (DOMAIN_IS_INITDOMAIN(xen_info)) {
258 /* set a flag so we know we have run xen_psm_picinit() */
259 apic_picinit_called = 1;
260 LOCK_INIT_CLEAR(&apic_ioapic_lock);
261
262 /* XXPV - do we need to do this? */
263 picsetup(); /* initialise the 8259 */
264
265 /* enable apic mode if imcr present */
266 /* XXPV - do we need to do this either? */
267 if (apic_imcrp) {
268 outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT);
269 outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_APIC);
270 }
271
272 ioapic_init_intr(IOAPIC_NOMASK);
273 /*
274 * We never called xen_psm_addspl() when the SCI
275 * interrupt was added because that happened before the
276 * PSM module was loaded. Fix that up here by doing
277 * any missed operations (e.g. bind to CPU)
278 */
279 if ((irqno = apic_sci_vect) > 0) {
280 if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
281 CPUSET_ZERO(cpus);
282 CPUSET_OR(cpus, xen_psm_cpus_online);
283 } else {
284 CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
285 }
286 ec_set_irq_affinity(irqno, cpus);
287 apic_irq_table[irqno]->airq_temp_cpu =
288 (uchar_t)(cpu & ~IRQ_USER_BOUND);
289 ec_enable_irq(irqno);
290 }
291 }
292
293 /* add nmi handler - least priority nmi handler */
294 LOCK_INIT_CLEAR(&xen_psm_nmi_lock);
295
296 if (!psm_add_nmintr(0, (avfunc) xen_psm_nmi_intr,
297 "xVM_psm NMI handler", (caddr_t)NULL))
298 cmn_err(CE_WARN, "xVM_psm: Unable to add nmi handler");
299 }
300
301
302 /*
303 * generates an interprocessor interrupt to another CPU
304 */
305 static void
xen_psm_send_ipi(int cpun, int ipl)
307 {
308 ulong_t flag = intr_clear();
309
310 ec_send_ipi(ipl, cpun);
311 intr_restore(flag);
312 }
313
314 /*ARGSUSED*/
315 static int
xen_psm_addspl(int irqno, int ipl, int min_ipl, int max_ipl)
317 {
318 int cpu, ret;
319 cpuset_t cpus;
320
321 /*
322 * We are called at splhi() so we can't call anything that might end
323 * up trying to context switch.
324 */
325 if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
326 DOMAIN_IS_INITDOMAIN(xen_info)) {
327 /*
328 * Priority/affinity/enable for PIRQ's is set in ec_setup_pirq()
329 */
330 ret = apic_addspl_common(irqno, ipl, min_ipl, max_ipl);
331 } else {
332 /*
333 * Set priority/affinity/enable for non PIRQs
334 */
335 ret = ec_set_irq_priority(irqno, ipl);
336 ASSERT(ret == 0);
337 if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
338 CPUSET_ZERO(cpus);
339 CPUSET_OR(cpus, xen_psm_cpus_online);
340 } else {
341 CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
342 }
343 ec_set_irq_affinity(irqno, cpus);
344 ec_enable_irq(irqno);
345 }
346 return (ret);
347 }
348
349 /*
350 * Acquire ownership of this irq on this cpu
351 */
352 void
xen_psm_acquire_irq(int irq)
354 {
355 ulong_t flags;
356 int cpuid;
357
358 /*
359 * If the irq is currently being serviced by another cpu
360 * we busy-wait for the other cpu to finish. Take any
361 * pending interrupts before retrying.
362 */
363 do {
364 flags = intr_clear();
365 cpuid = ec_block_irq(irq);
366 intr_restore(flags);
367 } while (cpuid != CPU->cpu_id);
368 }
369
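/*
 * Remove an interrupt handler priority association; unbind the event
 * channel if this was the last sharer of the irq.
 */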
370 /*ARGSUSED*/
371 static int
xen_psm_delspl(int irqno, int ipl, int min_ipl, int max_ipl)
373 {
374 apic_irq_t *irqptr;
375 int err = PSM_SUCCESS;
376
377 if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
378 DOMAIN_IS_INITDOMAIN(xen_info)) {
379 irqptr = apic_irq_table[irqno];
380 /*
381 * unbind if no more sharers of this irq/evtchn
382 */
383 if (irqptr->airq_share == 1) {
384 xen_psm_acquire_irq(irqno);
385 ec_unbind_irq(irqno);
386 }
387 err = apic_delspl_common(irqno, ipl, min_ipl, max_ipl);
388 /*
389 * If still in use reset priority
390 */
391 if (!err && irqptr->airq_share != 0) {
392 err = ec_set_irq_priority(irqno, max_ipl);
393 return (err);
394 }
395 } else {
396 xen_psm_acquire_irq(irqno);
397 ec_unbind_irq(irqno);
398 }
399 return (err);
400 }
401
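/*
 * Return the id of the next existing vcpu after "id", or -1 if none.
 */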
402 static processorid_t
xen_psm_get_next_processorid(processorid_t id)
404 {
405 if (id == -1)
406 return (0);
407
408 for (id++; id < NCPU; id++) {
409 switch (-HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL)) {
410 case 0: /* yeah, that one's there */
411 return (id);
412 default:
413 case X_EINVAL: /* out of range */
414 return (-1);
415 case X_ENOENT: /* not present in the domain */
416 /*
417 * It's not clear that we -need- to keep looking
418 * at this point, if, e.g., we can guarantee
 * the hypervisor always keeps a contiguous range
 * of vcpus around, this is equivalent to "out of range".
421 *
422 * But it would be sad to miss a vcpu we're
423 * supposed to be using ..
424 */
425 break;
426 }
427 }
428
429 return (-1);
430 }
431
432 /*
433 * XXPV - undo the start cpu op change; return to ignoring this value
434 * - also tweak error handling in main startup loop
435 */
436 /*ARGSUSED*/
437 static int
xen_psm_cpu_start(processorid_t id, caddr_t arg)
439 {
440 int ret;
441
442 ASSERT(id > 0);
443 CPUSET_ATOMIC_ADD(xen_psm_cpus_online, id);
444 ec_bind_cpu_ipis(id);
445 (void) ec_bind_virq_to_irq(VIRQ_TIMER, id);
446 if ((ret = xen_vcpu_up(id)) == 0)
447 xen_psm_ncpus++;
448 else
449 ret = EINVAL;
450 return (ret);
451 }
452
453 /*
454 * Allocate an irq for inter cpu signaling
455 */
456 /*ARGSUSED*/
457 static int
xen_psm_get_ipivect(int ipl, int type)
459 {
460 return (ec_bind_ipi_to_irq(ipl, 0));
461 }
462
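/* Return the clock irq, binding the timer virq to an irq on first use. */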
463 /*ARGSUSED*/
464 static int
xen_psm_get_clockirq(int ipl)
466 {
467 if (xen_clock_irq != INVALID_IRQ)
468 return (xen_clock_irq);
469
470 xen_clock_irq = ec_bind_virq_to_irq(VIRQ_TIMER, 0);
471 return (xen_clock_irq);
472 }
473
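/*
 * Shut down or reboot the domain via the hypervisor; dom0 tries an ACPI
 * poweroff first.
 */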
474 /*ARGSUSED*/
475 static void
xen_psm_shutdown(int cmd, int fcn)
477 {
478 XEN_PSM_VERBOSE_POWEROFF(("xen_psm_shutdown(%d,%d);\n", cmd, fcn));
479
480 switch (cmd) {
481 case A_SHUTDOWN:
482 switch (fcn) {
483 case AD_BOOT:
484 case AD_IBOOT:
485 (void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
486 break;
487 case AD_POWEROFF:
488 /* fall through if domU or if poweroff fails */
489 if (DOMAIN_IS_INITDOMAIN(xen_info))
490 if (apic_enable_acpi)
491 (void) acpi_poweroff();
492 /* FALLTHRU */
493 case AD_HALT:
494 default:
495 (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
496 break;
497 }
498 break;
499 case A_REBOOT:
500 (void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
501 break;
502 default:
503 return;
504 }
505 }
506
507
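/* Translate an irq for the given device; no translation is needed. */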
508 static int
xen_psm_translate_irq(dev_info_t *dip, int irqno)
510 {
511 if (dip == NULL) {
512 XEN_PSM_VERBOSE_IRQ((CE_CONT, "!xen_psm: irqno = %d"
513 " dip = NULL\n", irqno));
514 return (irqno);
515 }
516 return (irqno);
517 }
518
519 /*
520 * xen_psm_intr_enter() acks the event that triggered the interrupt and
 * returns the new priority level.
522 */
523 /*ARGSUSED*/
524 static int
xen_psm_intr_enter(int ipl, int *vector)
526 {
527 int newipl;
528 uint_t intno;
529 cpu_t *cpu = CPU;
530
531 intno = (*vector);
532
533 ASSERT(intno < NR_IRQS);
534 ASSERT(cpu->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask != 0);
535
536 if (!ec_is_edge_pirq(intno))
537 ec_clear_irq(intno);
538
539 newipl = autovect[intno].avh_hi_pri;
540 if (newipl == 0) {
541 /*
542 * (newipl == 0) means we have no service routines for this
543 * vector. We will treat this as a spurious interrupt.
544 * We have cleared the pending bit already, clear the event
545 * mask and return a spurious interrupt. This case can happen
 * when an interrupt delivery is racing with the removal
 * of the service routine for that interrupt.
548 */
549 ec_unmask_irq(intno);
550 newipl = -1; /* flag spurious interrupt */
551 } else if (newipl <= cpu->cpu_pri) {
552 /*
553 * (newipl <= cpu->cpu_pri) means that we must be trying to
554 * service a vector that was shared with a higher priority
555 * isr. The higher priority handler has been removed and
556 * we need to service this int. We can't return a lower
557 * priority than current cpu priority. Just synthesize a
558 * priority to return that should be acceptable.
559 * It should never happen that we synthesize a priority that
 * moves us from low-priority to high-priority, which would make
 * us incorrectly run on the high priority stack.
562 */
563 newipl = cpu->cpu_pri + 1; /* synthetic priority */
564 ASSERT(newipl != LOCK_LEVEL + 1);
565 }
566 return (newipl);
567 }
568
569
570 /*
571 * xen_psm_intr_exit() restores the old interrupt
572 * priority level after processing an interrupt.
573 * It is called with interrupts disabled, and does not enable interrupts.
574 */
575 /* ARGSUSED */
576 static void
xen_psm_intr_exit(int ipl, int vector)
578 {
579 ec_try_unmask_irq(vector);
580 xen_psm_setspl(ipl);
581 }
582
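/* Return this PSM's interrupt exit routine. */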
583 intr_exit_fn_t
psm_intr_exit_fn(void)
585 {
586 return (xen_psm_intr_exit);
587 }
588
589 /*
590 * Check if new ipl level allows delivery of previously unserviced events
591 */
592 static void
xen_psm_setspl(int ipl)
594 {
595 struct cpu *cpu = CPU;
596 volatile vcpu_info_t *vci = cpu->cpu_m.mcpu_vcpu_info;
597 uint16_t pending;
598
599 ASSERT(vci->evtchn_upcall_mask != 0);
600
601 /*
602 * If new ipl level will enable any pending interrupts, setup so the
603 * upcoming sti will cause us to get an upcall.
604 */
605 pending = cpu->cpu_m.mcpu_intr_pending & ~((1 << (ipl + 1)) - 1);
606 if (pending) {
607 int i;
608 ulong_t pending_sels = 0;
609 volatile ulong_t *selp;
610 struct xen_evt_data *cpe = cpu->cpu_m.mcpu_evt_pend;
611
612 for (i = bsrw_insn(pending); i > ipl; i--)
613 pending_sels |= cpe->pending_sel[i];
614 ASSERT(pending_sels);
615 selp = (volatile ulong_t *)&vci->evtchn_pending_sel;
616 atomic_or_ulong(selp, pending_sels);
617 vci->evtchn_upcall_pending = 1;
618 }
619 }
620
621 /*
622 * This function provides external interface to the nexus for all
623 * functionality related to the new DDI interrupt framework.
624 *
625 * Input:
626 * dip - pointer to the dev_info structure of the requested device
627 * hdlp - pointer to the internal interrupt handle structure for the
628 * requested interrupt
629 * intr_op - opcode for this call
630 * result - pointer to the integer that will hold the result to be
631 * passed back if return value is PSM_SUCCESS
632 *
633 * Output:
634 * return value is either PSM_SUCCESS or PSM_FAILURE
635 */
636 int
xen_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
638 psm_intr_op_t intr_op, int *result)
639 {
640 int cap;
641 int err;
642 int new_priority;
643 apic_irq_t *irqp;
644 struct intrspec *ispec;
645
646 DDI_INTR_IMPLDBG((CE_CONT, "xen_intr_ops: dip: %p hdlp: %p "
647 "intr_op: %x\n", (void *)dip, (void *)hdlp, intr_op));
648
649 switch (intr_op) {
650 case PSM_INTR_OP_CHECK_MSI:
651 /*
652 * Till PCI passthru is supported, only dom0 has MSI/MSIX
653 */
654 if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
655 *result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
656 DDI_INTR_TYPE_MSIX);
657 break;
658 }
659 /*
660 * Check MSI/X is supported or not at APIC level and
661 * masked off the MSI/X bits in hdlp->ih_type if not
662 * supported before return. If MSI/X is supported,
663 * leave the ih_type unchanged and return.
664 *
665 * hdlp->ih_type passed in from the nexus has all the
666 * interrupt types supported by the device.
667 */
668 if (xen_support_msi == 0) {
669 /*
670 * if xen_support_msi is not set, call
671 * apic_check_msi_support() to check whether msi
672 * is supported first
673 */
674 if (apic_check_msi_support() == PSM_SUCCESS)
675 xen_support_msi = 1;
676 else
677 xen_support_msi = -1;
678 }
679 if (xen_support_msi == 1)
680 *result = hdlp->ih_type;
681 else
682 *result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
683 DDI_INTR_TYPE_MSIX);
684 break;
685 case PSM_INTR_OP_ALLOC_VECTORS:
686 if (hdlp->ih_type == DDI_INTR_TYPE_MSI)
687 *result = apic_alloc_msi_vectors(dip, hdlp->ih_inum,
688 hdlp->ih_scratch1, hdlp->ih_pri,
689 (int)(uintptr_t)hdlp->ih_scratch2);
690 else
691 *result = apic_alloc_msix_vectors(dip, hdlp->ih_inum,
692 hdlp->ih_scratch1, hdlp->ih_pri,
693 (int)(uintptr_t)hdlp->ih_scratch2);
694 break;
695 case PSM_INTR_OP_FREE_VECTORS:
696 apic_free_vectors(dip, hdlp->ih_inum, hdlp->ih_scratch1,
697 hdlp->ih_pri, hdlp->ih_type);
698 break;
699 case PSM_INTR_OP_NAVAIL_VECTORS:
700 /*
701 * XXPV - maybe we should make this be:
702 * min(APIC_VECTOR_PER_IPL, count of all avail vectors);
703 */
704 if (DOMAIN_IS_INITDOMAIN(xen_info))
705 *result = APIC_VECTOR_PER_IPL;
706 else
707 *result = 1;
708 break;
709 case PSM_INTR_OP_XLATE_VECTOR:
710 ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
711 if (ispec->intrspec_vec >= PIRQ_BASE &&
712 ispec->intrspec_vec < NR_PIRQS &&
713 DOMAIN_IS_INITDOMAIN(xen_info)) {
714 *result = apic_introp_xlate(dip, ispec, hdlp->ih_type);
715 } else {
716 *result = ispec->intrspec_vec;
717 }
718 break;
719 case PSM_INTR_OP_GET_PENDING:
720 /* XXPV - is this enough for dom0 or do we need to ref ioapic */
721 *result = ec_pending_irq(hdlp->ih_vector);
722 break;
723 case PSM_INTR_OP_CLEAR_MASK:
724 /* XXPV - is this enough for dom0 or do we need to set ioapic */
725 if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
726 return (PSM_FAILURE);
727 ec_enable_irq(hdlp->ih_vector);
728 break;
729 case PSM_INTR_OP_SET_MASK:
730 /* XXPV - is this enough for dom0 or do we need to set ioapic */
731 if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
732 return (PSM_FAILURE);
733 ec_disable_irq(hdlp->ih_vector);
734 break;
735 case PSM_INTR_OP_GET_CAP:
736 cap = DDI_INTR_FLAG_PENDING | DDI_INTR_FLAG_EDGE;
737 if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
738 cap |= DDI_INTR_FLAG_MASKABLE;
739 *result = cap;
740 break;
741 case PSM_INTR_OP_GET_SHARED:
742 if (DOMAIN_IS_INITDOMAIN(xen_info)) {
743 if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
744 return (PSM_FAILURE);
745 ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
746 if ((irqp = apic_find_irq(dip, ispec, hdlp->ih_type))
747 == NULL)
748 return (PSM_FAILURE);
749 *result = (irqp->airq_share > 1) ? 1: 0;
750 } else {
751 return (PSM_FAILURE);
752 }
753 break;
754 case PSM_INTR_OP_SET_PRI:
755 new_priority = *(int *)result;
756 err = ec_set_irq_priority(hdlp->ih_vector, new_priority);
757 if (err != 0)
758 return (PSM_FAILURE);
759 break;
760 case PSM_INTR_OP_GET_INTR:
761 if (!DOMAIN_IS_INITDOMAIN(xen_info))
762 return (PSM_FAILURE);
763 /*
764 * The interrupt handle given here has been allocated
765 * specifically for this command, and ih_private carries
766 * a pointer to a apic_get_intr_t.
767 */
768 if (apic_get_vector_intr_info(
769 hdlp->ih_vector, hdlp->ih_private) != PSM_SUCCESS)
770 return (PSM_FAILURE);
771 break;
772 case PSM_INTR_OP_SET_CAP:
773 /* FALLTHRU */
774 default:
775 return (PSM_FAILURE);
776 }
777 return (PSM_SUCCESS);
778 }
779
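/*
 * Rebind the given irq to a newly selected cpu (or to all online cpus if
 * it is unbound) and record the new binding.
 */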
780 static void
xen_psm_rebind_irq(int irq)
782 {
783 cpuset_t ncpu;
784 processorid_t newcpu;
785 apic_irq_t *irqptr;
786
787 newcpu = xen_psm_bind_intr(irq);
788 if (newcpu == IRQ_UNBOUND) {
789 CPUSET_ZERO(ncpu);
790 CPUSET_OR(ncpu, xen_psm_cpus_online);
791 } else {
792 CPUSET_ONLY(ncpu, newcpu & ~IRQ_USER_BOUND);
793 }
794 ec_set_irq_affinity(irq, ncpu);
795 if (irq <= APIC_MAX_VECTOR) {
796 irqptr = apic_irq_table[irq];
797 ASSERT(irqptr != NULL);
798 irqptr->airq_temp_cpu = (uchar_t)newcpu;
799 }
800 }
801
802 /*
803 * Disable all device interrupts for the given cpu.
804 * High priority interrupts are not disabled and will still be serviced.
805 */
806 static int
xen_psm_disable_intr(processorid_t cpun)
808 {
809 int irq;
810
811 /*
812 * Can't offline VCPU 0 on this hypervisor. There's no reason
813 * anyone would want to given that the CPUs are virtual. Also note
814 * that the hypervisor requires suspend/resume to be on VCPU 0.
815 */
816 if (cpun == 0)
817 return (PSM_FAILURE);
818
819 CPUSET_ATOMIC_DEL(xen_psm_cpus_online, cpun);
820 for (irq = 0; irq < NR_IRQS; irq++) {
821 if (!ec_irq_needs_rebind(irq, cpun))
822 continue;
823 xen_psm_rebind_irq(irq);
824 }
825 return (PSM_SUCCESS);
826 }
827
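/*
 * Allow device interrupts on the given cpu again and rebalance irqs
 * among the online cpus.
 */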
828 static void
xen_psm_enable_intr(processorid_t cpun)
830 {
831 int irq;
832
833 if (cpun == 0)
834 return;
835
836 CPUSET_ATOMIC_ADD(xen_psm_cpus_online, cpun);
837
838 /*
839 * Rebalance device interrupts among online processors
840 */
841 for (irq = 0; irq < NR_IRQS; irq++) {
842 if (!ec_irq_rebindable(irq))
843 continue;
844 xen_psm_rebind_irq(irq);
845 }
846
847 if (DOMAIN_IS_INITDOMAIN(xen_info)) {
848 apic_cpus[cpun].aci_status |= APIC_CPU_INTR_ENABLE;
849 }
850 }
851
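/* Called after a cpu starts; in dom0 mark it online in the apic state. */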
852 static int
xen_psm_post_cpu_start()
854 {
855 processorid_t cpun;
856
857 cpun = psm_get_cpu_id();
858 if (DOMAIN_IS_INITDOMAIN(xen_info)) {
859 /*
860 * Non-virtualized environments can call psm_post_cpu_start
861 * from Suspend/Resume with the APIC_CPU_INTR_ENABLE bit set.
862 * xen_psm_post_cpu_start() is only called from boot.
863 */
864 apic_cpus[cpun].aci_status |= APIC_CPU_ONLINE;
865 }
866 return (PSM_SUCCESS);
867 }
868
869 /*
870 * This function will reprogram the timer.
871 *
872 * When in oneshot mode the argument is the absolute time in future at which to
873 * generate the interrupt.
874 *
875 * When in periodic mode, the argument is the interval at which the
876 * interrupts should be generated. There is no need to support the periodic
877 * mode timer change at this time.
878 *
879 * Note that we must be careful to convert from hrtime to Xen system time (see
880 * xpv_timestamp.c).
881 */
882 static void
xen_psm_timer_reprogram(hrtime_t timer_req)
884 {
885 hrtime_t now, timer_new, time_delta, xen_time;
886 ulong_t flags;
887
888 flags = intr_clear();
889 /*
890 * We should be called from high PIL context (CBE_HIGH_PIL),
891 * so kpreempt is disabled.
892 */
893
894 now = xpv_gethrtime();
895 xen_time = xpv_getsystime();
896 if (timer_req <= now) {
897 /*
 * requested to generate an interrupt in the past;
 * generate one as soon as possible
900 */
901 time_delta = XEN_NSEC_PER_TICK;
902 } else
903 time_delta = timer_req - now;
904
905 timer_new = xen_time + time_delta;
906 if (HYPERVISOR_set_timer_op(timer_new) != 0)
907 panic("can't set hypervisor timer?");
908 intr_restore(flags);
909 }
910
911 /*
912 * This function will enable timer interrupts.
913 */
914 static void
xen_psm_timer_enable(void)
916 {
917 ec_unmask_irq(xen_clock_irq);
918 }
919
920 /*
921 * This function will disable timer interrupts on the current cpu.
922 */
923 static void
xen_psm_timer_disable(void)
925 {
926 (void) ec_block_irq(xen_clock_irq);
927 /*
928 * If the clock irq is pending on this cpu then we need to
929 * clear the pending interrupt.
930 */
931 ec_unpend_irq(xen_clock_irq);
932 }
933
934 /*
935 *
936 * The following functions are in the platform specific file so that they
937 * can be different functions depending on whether we are running on
938 * bare metal or a hypervisor.
939 */
940
941 /*
942 * Allocate a free vector for irq at ipl.
943 */
944 /* ARGSUSED */
945 uchar_t
apic_allocate_vector(int ipl, int irq, int pri)
947 {
948 physdev_irq_t irq_op;
949 uchar_t vector;
950 int rc;
951
952 irq_op.irq = irq;
953
954 if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
955 != 0)
956 panic("Hypervisor alloc vector failed err: %d", -rc);
957 vector = irq_op.vector;
958 /*
959 * No need to worry about vector colliding with our reserved vectors
960 * e.g. T_FASTTRAP, xen can differentiate between hardware and software
961 * generated traps and handle them properly.
962 */
963 apic_vector_to_irq[vector] = (uchar_t)irq;
964 return (vector);
965 }
966
967 /* Mark vector as not being used by any irq */
968 void
apic_free_vector(uchar_t vector)
970 {
971 apic_vector_to_irq[vector] = APIC_RESV_IRQ;
972 }
973
974 /*
975 * This function returns the no. of vectors available for the pri.
976 * dip is not used at this moment. If we really don't need that,
977 * it will be removed. Since priority is not limited by hardware
978 * when running on the hypervisor we simply return the maximum no.
979 * of available contiguous vectors.
980 */
981 /*ARGSUSED*/
982 int
apic_navail_vector(dev_info_t *dip, int pri)
984 {
985 int lowest, highest, i, navail, count;
986
987 DDI_INTR_IMPLDBG((CE_CONT, "apic_navail_vector: dip: %p, pri: %x\n",
988 (void *)dip, pri));
989
990 highest = APIC_MAX_VECTOR;
991 lowest = APIC_BASE_VECT;
992 navail = count = 0;
993
994 /* It has to be contiguous */
995 for (i = lowest; i < highest; i++) {
996 count = 0;
997 while ((apic_vector_to_irq[i] == APIC_RESV_IRQ) &&
998 (i < highest)) {
999 count++;
1000 i++;
1001 }
1002 if (count > navail)
1003 navail = count;
1004 }
1005 return (navail);
1006 }
1007
1008 static physdev_manage_pci_t *managed_devlist;
1009 static int mdev_cnt;
1010 static int mdev_size = 128;
1011 static uchar_t msi_vector_to_pirq[APIC_MAX_VECTOR+1];
1012
1013 /*
1014 * Add devfn on given bus to devices managed by hypervisor
1015 */
1016 static int
xen_manage_device(uint8_t bus, uint8_t devfn)
1018 {
1019 physdev_manage_pci_t manage_pci, *newlist;
1020 int rc, i, oldsize;
1021
1022 /*
1023 * Check if bus/devfn already managed. If so just return success.
1024 */
1025 if (managed_devlist == NULL) {
1026 managed_devlist = kmem_alloc(sizeof (physdev_manage_pci_t) *
1027 mdev_size, KM_NOSLEEP);
1028 if (managed_devlist == NULL) {
1029 cmn_err(CE_WARN,
1030 "Can't alloc space for managed device list");
1031 return (0);
1032 }
	}
1034 for (i = 0; i < mdev_cnt; i++) {
1035 if (managed_devlist[i].bus == bus &&
1036 managed_devlist[i].devfn == devfn)
1037 return (1); /* device already managed */
1038 }
1039 manage_pci.bus = bus;
1040 manage_pci.devfn = devfn;
1041 rc = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add, &manage_pci);
1042 if (rc < 0) {
1043 cmn_err(CE_WARN,
1044 "hypervisor add pci device call failed bus:0x%x"
1045 " devfn:0x%x", bus, devfn);
1046 return (0);
1047 }
1048 /*
1049 * Add device to the managed device list
1050 */
1051 if (i == mdev_size) {
1052 /*
1053 * grow the managed device list
1054 */
1055 oldsize = mdev_size * sizeof (physdev_manage_pci_t);
1056 mdev_size *= 2;
1057 newlist = kmem_alloc(sizeof (physdev_manage_pci_t) * mdev_size,
1058 KM_NOSLEEP);
1059 if (newlist == NULL) {
1060 cmn_err(CE_WARN, "Can't grow managed device list");
1061 return (0);
1062 }
1063 bcopy(managed_devlist, newlist, oldsize);
1064 kmem_free(managed_devlist, oldsize);
1065 managed_devlist = newlist;
1066 }
1067 managed_devlist[i].bus = bus;
1068 managed_devlist[i].devfn = devfn;
1069 mdev_cnt++;
1070 return (1);
1071 }
1072
1073 /*
1074 * allocate an apic irq struct for an MSI interrupt
1075 */
1076 static int
msi_allocate_irq(int irq)
1078 {
1079 apic_irq_t *irqptr = apic_irq_table[irq];
1080
1081 if (irqptr == NULL) {
1082 irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_NOSLEEP);
1083 if (irqptr == NULL) {
1084 cmn_err(CE_WARN, "xpv_psm: NO memory to allocate IRQ");
1085 return (-1);
1086 }
1087 apic_irq_table[irq] = irqptr;
1088 } else {
1089 if (irq == APIC_RESV_IRQ && irqptr->airq_mps_intr_index == 0)
1090 irqptr->airq_mps_intr_index = FREE_INDEX;
1091 if (irqptr->airq_mps_intr_index != FREE_INDEX) {
1092 cmn_err(CE_WARN, "xpv_psm: MSI IRQ already in use");
1093 return (-1);
1094 }
1095 }
1096 irqptr->airq_mps_intr_index = FREE_INDEX;
1097 return (irq);
1098 }
1099
1100 /*
1101 * read MSI/MSIX vector out of config space
1102 */
1103 static uchar_t
xpv_psm_get_msi_vector(dev_info_t *dip, int type, int entry)
1105 {
1106 uint64_t msi_data = 0;
1107 int cap_ptr = i_ddi_get_msi_msix_cap_ptr(dip);
1108 ddi_acc_handle_t handle = i_ddi_get_pci_config_handle(dip);
1109 ushort_t msi_ctrl;
1110 uchar_t vector;
1111
1112 ASSERT((handle != NULL) && (cap_ptr != 0));
1113 if (type == DDI_INTR_TYPE_MSI) {
1114 msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1115 /*
1116 * Get vector
1117 */
1118 if (msi_ctrl & PCI_MSI_64BIT_MASK) {
1119 msi_data = pci_config_get16(handle,
1120 cap_ptr + PCI_MSI_64BIT_DATA);
1121 } else {
1122 msi_data = pci_config_get16(handle,
1123 cap_ptr + PCI_MSI_32BIT_DATA);
1124 }
1125 vector = (msi_data & 0xff) + entry;
1126 } else if (type == DDI_INTR_TYPE_MSIX) {
1127 uintptr_t off;
1128 ddi_intr_msix_t *msix_p = i_ddi_get_msix(dip);
1129
1130 /* Offset into the given entry in the MSI-X table */
1131 off = (uintptr_t)msix_p->msix_tbl_addr +
1132 (entry * PCI_MSIX_VECTOR_SIZE);
1133
1134 msi_data = ddi_get32(msix_p->msix_tbl_hdl,
1135 (uint32_t *)(off + PCI_MSIX_DATA_OFFSET));
1136 vector = msi_data & 0xff;
1137 }
1138 return (vector);
1139 }
1140
1141
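/*
 * Extract the PCI bus number and device/function from the device's "reg"
 * property.
 */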
1142 static void
get_busdevfn(dev_info_t *dip, int *busp, int *devfnp)
1144 {
1145 pci_regspec_t *regspec;
1146 int reglen;
1147
1148 /*
1149 * Get device reg spec, first word has PCI bus and
1150 * device/function info we need.
1151 */
1152 if (ddi_getlongprop(DDI_DEV_T_NONE, dip, DDI_PROP_DONTPASS, "reg",
	    (caddr_t)&regspec, &reglen) != DDI_SUCCESS) {
1154 cmn_err(CE_WARN,
1155 "get_busdevfn() failed to get regspec.");
1156 return;
1157 }
1158 /*
1159 * get PCI bus # from reg spec for device
1160 */
1161 *busp = PCI_REG_BUS_G(regspec[0].pci_phys_hi);
1162 /*
1163 * get combined device/function from reg spec for device.
1164 */
1165 *devfnp = (regspec[0].pci_phys_hi & (PCI_REG_FUNC_M | PCI_REG_DEV_M)) >>
1166 PCI_REG_FUNC_SHIFT;
1167
1168 kmem_free(regspec, reglen);
1169 }
1170
1171 /*
1172 * This function allocates "count" MSI vector(s) for the given "dip/pri/type"
1173 */
1174 int
apic_alloc_msi_vectors(dev_info_t *dip, int inum, int count, int pri,
1176 int behavior)
1177 {
1178 int rcount, i, rc, irqno;
1179 uchar_t vector, cpu;
1180 major_t major;
1181 apic_irq_t *irqptr;
1182 physdev_map_pirq_t map_irq;
1183 int busnum, devfn;
1184
1185 DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: dip=0x%p "
1186 "inum=0x%x pri=0x%x count=0x%x behavior=%d\n",
1187 (void *)dip, inum, pri, count, behavior));
1188
1189 if (count > 1) {
1190 if (behavior == DDI_INTR_ALLOC_STRICT &&
1191 apic_multi_msi_enable == 0)
1192 return (0);
1193 if (apic_multi_msi_enable == 0)
1194 count = 1;
1195 }
1196
1197 if ((rcount = apic_navail_vector(dip, pri)) > count)
1198 rcount = count;
1199 else if (rcount == 0 || (rcount < count &&
1200 behavior == DDI_INTR_ALLOC_STRICT))
1201 return (0);
1202
1203 /* if not ISP2, then round it down */
1204 if (!ISP2(rcount))
1205 rcount = 1 << (highbit(rcount) - 1);
1206
1207 /*
1208 * get PCI bus # and devfn from reg spec for device
1209 */
1210 get_busdevfn(dip, &busnum, &devfn);
1211
1212 /*
1213 * Tell xen about this pci device
1214 */
1215 if (!xen_manage_device(busnum, devfn))
1216 return (0);
1217
1218 mutex_enter(&airq_mutex);
1219
1220 major = (dip != NULL) ? ddi_name_to_major(ddi_get_name(dip)) : 0;
1221 for (i = 0; i < rcount; i++) {
1222 /*
1223 * use PHYSDEVOP_map_pirq to have xen map MSI to a pirq
1224 */
1225 map_irq.domid = DOMID_SELF;
1226 map_irq.type = MAP_PIRQ_TYPE_MSI;
1227 map_irq.index = -rcount; /* hypervisor auto allocates vectors */
1228 map_irq.pirq = -1;
1229 map_irq.bus = busnum;
1230 map_irq.devfn = devfn;
1231 map_irq.entry_nr = i;
1232 map_irq.table_base = 0;
1233 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
1234 irqno = map_irq.pirq;
1235 if (rc < 0) {
1236 mutex_exit(&airq_mutex);
1237 cmn_err(CE_WARN, "map MSI irq failed err: %d", -rc);
1238 return (i);
1239 }
1240 if (irqno < 0) {
1241 mutex_exit(&airq_mutex);
1242 cmn_err(CE_NOTE,
1243 "!hypervisor not configured for MSI support");
1244 xen_support_msi = -1;
1245 return (0);
1246 }
1247
1248 /*
1249 * Find out what vector the hypervisor assigned
1250 */
1251 vector = xpv_psm_get_msi_vector(dip, DDI_INTR_TYPE_MSI, i);
1252
1253 if (msi_allocate_irq(irqno) < 0) {
1254 mutex_exit(&airq_mutex);
1255 return (i);
1256 }
1257 apic_max_device_irq = max(irqno, apic_max_device_irq);
1258 apic_min_device_irq = min(irqno, apic_min_device_irq);
1259 irqptr = apic_irq_table[irqno];
1260 ASSERT(irqptr != NULL);
1261 #ifdef DEBUG
1262 if (apic_vector_to_irq[vector] != APIC_RESV_IRQ)
1263 DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: "
1264 "apic_vector_to_irq is not APIC_RESV_IRQ\n"));
1265 #endif
1266 apic_vector_to_irq[vector] = (uchar_t)irqno;
1267 msi_vector_to_pirq[vector] = (uchar_t)irqno;
1268
1269 irqptr->airq_vector = vector;
1270 irqptr->airq_ioapicindex = (uchar_t)inum; /* start */
1271 irqptr->airq_intin_no = (uchar_t)rcount;
1272 irqptr->airq_ipl = pri;
1273 irqptr->airq_origirq = (uchar_t)(inum + i);
1274 irqptr->airq_share_id = 0;
1275 irqptr->airq_mps_intr_index = MSI_INDEX;
1276 irqptr->airq_dip = dip;
1277 irqptr->airq_major = major;
1278 if (i == 0) /* they all bind to the same cpu */
1279 cpu = irqptr->airq_cpu = xen_psm_bind_intr(irqno);
1280 else
1281 irqptr->airq_cpu = cpu;
1282 DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: irq=0x%x "
1283 "dip=0x%p vector=0x%x origirq=0x%x pri=0x%x\n", irqno,
1284 (void *)irqptr->airq_dip, irqptr->airq_vector,
1285 irqptr->airq_origirq, pri));
1286 }
1287 mutex_exit(&airq_mutex);
1288 return (rcount);
1289 }
1290
1291 /*
1292 * This function allocates "count" MSI-X vector(s) for the given "dip/pri/type"
1293 */
1294 int
apic_alloc_msix_vectors(dev_info_t *dip, int inum, int count, int pri,
1296 int behavior)
1297 {
1298 int rcount, i, rc;
1299 major_t major;
1300 physdev_map_pirq_t map_irq;
1301 int busnum, devfn;
1302 ddi_intr_msix_t *msix_p = i_ddi_get_msix(dip);
1303 uint64_t table_base;
1304 pfn_t pfnum;
1305
1306 if (msix_p == NULL) {
1307 msix_p = pci_msix_init(dip);
1308 if (msix_p != NULL) {
1309 i_ddi_set_msix(dip, msix_p);
1310 } else {
1311 cmn_err(CE_WARN, "apic_alloc_msix_vectors()"
1312 " msix_init failed");
1313 return (0);
1314 }
1315 }
1316 /*
1317 * Hypervisor wants PCI config space address of msix table base
1318 */
1319 pfnum = hat_getpfnum(kas.a_hat, (caddr_t)msix_p->msix_tbl_addr) &
1320 ~PFN_IS_FOREIGN_MFN;
1321 table_base = (uint64_t)((pfnum << PAGESHIFT) - msix_p->msix_tbl_offset |
1322 ((uintptr_t)msix_p->msix_tbl_addr & PAGEOFFSET));
1323 /*
1324 * get PCI bus # and devfn from reg spec for device
1325 */
1326 get_busdevfn(dip, &busnum, &devfn);
1327
1328 /*
1329 * Tell xen about this pci device
1330 */
1331 if (!xen_manage_device(busnum, devfn))
1332 return (0);
1333 mutex_enter(&airq_mutex);
1334
1335 if ((rcount = apic_navail_vector(dip, pri)) > count)
1336 rcount = count;
1337 else if (rcount == 0 || (rcount < count &&
1338 behavior == DDI_INTR_ALLOC_STRICT)) {
1339 rcount = 0;
1340 goto out;
1341 }
1342
1343 major = (dip != NULL) ? ddi_name_to_major(ddi_get_name(dip)) : 0;
1344 for (i = 0; i < rcount; i++) {
1345 int irqno;
1346 uchar_t vector;
1347 apic_irq_t *irqptr;
1348
1349 /*
1350 * use PHYSDEVOP_map_pirq to have xen map MSI-X to a pirq
1351 */
1352 map_irq.domid = DOMID_SELF;
1353 map_irq.type = MAP_PIRQ_TYPE_MSI;
1354 map_irq.index = -1; /* hypervisor auto allocates vector */
1355 map_irq.pirq = -1;
1356 map_irq.bus = busnum;
1357 map_irq.devfn = devfn;
1358 map_irq.entry_nr = i;
1359 map_irq.table_base = table_base;
1360 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
1361 irqno = map_irq.pirq;
1362 if (rc < 0) {
1363 mutex_exit(&airq_mutex);
1364 cmn_err(CE_WARN, "map MSI irq failed err: %d", -rc);
1365 return (i);
1366 }
1367 if (irqno < 0) {
1368 mutex_exit(&airq_mutex);
1369 cmn_err(CE_NOTE,
1370 "!hypervisor not configured for MSI support");
1371 xen_support_msi = -1;
1372 return (0);
1373 }
1374 /*
1375 * Find out what vector the hypervisor assigned
1376 */
1377 vector = xpv_psm_get_msi_vector(dip, DDI_INTR_TYPE_MSIX, i);
1378
1379 if (msi_allocate_irq(irqno) < 0) {
1380 mutex_exit(&airq_mutex);
1381 return (i);
1382 }
1383 apic_vector_to_irq[vector] = (uchar_t)irqno;
1384 msi_vector_to_pirq[vector] = (uchar_t)irqno;
1385 apic_max_device_irq = max(irqno, apic_max_device_irq);
1386 apic_min_device_irq = min(irqno, apic_min_device_irq);
1387 irqptr = apic_irq_table[irqno];
1388 ASSERT(irqptr != NULL);
1389 irqptr->airq_vector = (uchar_t)vector;
1390 irqptr->airq_ipl = pri;
1391 irqptr->airq_origirq = (uchar_t)(inum + i);
1392 irqptr->airq_share_id = 0;
1393 irqptr->airq_mps_intr_index = MSIX_INDEX;
1394 irqptr->airq_dip = dip;
1395 irqptr->airq_major = major;
1396 irqptr->airq_cpu = IRQ_UNBOUND; /* will be bound when addspl */
1397 }
1398 out:
1399 mutex_exit(&airq_mutex);
1400 return (rcount);
1401 }
1402
1403
1404 /*
1405 * This finds the apic_irq_t associated with the dip, ispec and type.
 * The entry should have already been freed, but it cannot have been
 * reused yet, since the hypervisor cannot have reassigned the pirq
 * before we free it, which we have not done yet.
1409 */
1410 static apic_irq_t *
msi_find_irq(dev_info_t *dip, struct intrspec *ispec)
1412 {
1413 apic_irq_t *irqp;
1414 int i;
1415
1416 for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
1417 if ((irqp = apic_irq_table[i]) == NULL)
1418 continue;
1419 if ((irqp->airq_dip == dip) &&
1420 (irqp->airq_origirq == ispec->intrspec_vec) &&
1421 (irqp->airq_ipl == ispec->intrspec_pri)) {
1422 return (irqp);
1423 }
1424 }
1425 return (NULL);
1426 }
1427
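/*
 * Free the MSI/MSI-X vectors previously allocated for dip by unmapping
 * their pirqs from the hypervisor and marking the irq entries free.
 */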
1428 void
apic_free_vectors(dev_info_t *dip, int inum, int count, int pri, int type)
1430 {
1431 int i, rc;
1432 physdev_unmap_pirq_t unmap_pirq;
1433 apic_irq_t *irqptr;
1434 struct intrspec ispec;
1435
1436 DDI_INTR_IMPLDBG((CE_CONT, "apic_free_vectors: dip: %p inum: %x "
1437 "count: %x pri: %x type: %x\n",
1438 (void *)dip, inum, count, pri, type));
1439
1440 /* for MSI/X only */
1441 if (!DDI_INTR_IS_MSI_OR_MSIX(type))
1442 return;
1443
1444 for (i = 0; i < count; i++) {
1445 DDI_INTR_IMPLDBG((CE_CONT, "apic_free_vectors: inum=0x%x "
1446 "pri=0x%x count=0x%x\n", inum, pri, count));
1447 ispec.intrspec_vec = inum + i;
1448 ispec.intrspec_pri = pri;
1449 if ((irqptr = msi_find_irq(dip, &ispec)) == NULL) {
1450 cmn_err(CE_WARN,
1451 "couldn't find irq %s,%s dip: 0x%p vec: %x pri: %x",
1452 ddi_get_name(dip), ddi_get_name_addr(dip),
1453 (void *)dip, inum + i, pri);
1454 continue;
1455 }
1456 /*
1457 * use PHYSDEVOP_unmap_pirq to have xen unmap MSI from a pirq
1458 */
1459 unmap_pirq.domid = DOMID_SELF;
1460 unmap_pirq.pirq = msi_vector_to_pirq[irqptr->airq_vector];
1461 rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_pirq);
1462 if (rc < 0) {
1463 cmn_err(CE_WARN, "unmap pirq failed");
1464 return;
1465 }
1466 irqptr->airq_mps_intr_index = FREE_INDEX;
1467 apic_vector_to_irq[irqptr->airq_vector] = APIC_RESV_IRQ;
1468 }
1469 }
1470
1471 /*
1472 * The hypervisor doesn't permit access to local apics directly
1473 */
1474 /* ARGSUSED */
1475 uint32_t *
mapin_apic(uint32_t addr, size_t len, int flags)
1477 {
1478 /*
1479 * Return a pointer to a memory area to fake out the
1480 * probe code that wants to read apic registers.
1481 * The dummy values will end up being ignored by xen
1482 * later on when they are used anyway.
1483 */
1484 xen_psm_dummy_apic[APIC_VERS_REG] = APIC_INTEGRATED_VERS;
1485 return (xen_psm_dummy_apic);
1486 }
1487
1488 /* ARGSUSED */
1489 uint32_t *
mapin_ioapic(uint32_t addr, size_t len, int flags)
1491 {
1492 /*
1493 * Return non-null here to fake out configure code that calls this.
 * The i86xpv platform will not reference through the returned value.
1495 */
1496 return ((uint32_t *)0x1);
1497 }
1498
1499 /* ARGSUSED */
1500 void
mapout_apic(caddr_t addr, size_t len)
1502 {
1503 }
1504
1505 /* ARGSUSED */
1506 void
mapout_ioapic(caddr_t addr, size_t len)
1508 {
1509 }
1510
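/* Read an ioapic register via a hypervisor physdev operation. */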
1511 uint32_t
ioapic_read(int apic_ix, uint32_t reg)
1513 {
1514 physdev_apic_t apic;
1515
1516 apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1517 apic.reg = reg;
1518 if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic))
1519 panic("read ioapic %d reg %d failed", apic_ix, reg);
1520 return (apic.value);
1521 }
1522
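/* Write an ioapic register via a hypervisor physdev operation. */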
1523 void
ioapic_write(int apic_ix, uint32_t reg, uint32_t value)
1525 {
1526 physdev_apic_t apic;
1527
1528 apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1529 apic.reg = reg;
1530 apic.value = value;
1531 if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
1532 panic("write ioapic %d reg %d failed", apic_ix, reg);
1533 }
1534
1535 /*
1536 * This function was added as part of x2APIC support in pcplusmp.
1537 */
1538 void
ioapic_write_eoi(int apic_ix, uint32_t value)
1540 {
1541 physdev_apic_t apic;
1542
1543 apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
1544 apic.reg = APIC_IO_EOI;
1545 apic.value = value;
1546 if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
1547 panic("write ioapic reg : APIC_IO_EOI %d failed", apic_ix);
1548 }
1549
1550 /*
1551 * This function was added as part of x2APIC support in pcplusmp to resolve
1552 * undefined symbol in xpv_psm.
1553 */
1554 void
x2apic_update_psm()
1556 {
1557 }
1558
1559 /*
1560 * This function was added as part of x2APIC support in pcplusmp to resolve
1561 * undefined symbol in xpv_psm.
1562 */
1563 void
apic_ret()
1565 {
1566 }
1567
1568 /*
1569 * Call rebind to do the actual programming.
1570 */
1571 int
apic_setup_io_intr(void *p, int irq, boolean_t deferred)
1573 {
1574 apic_irq_t *irqptr;
1575 struct ioapic_reprogram_data *drep = NULL;
1576 int rv, cpu;
1577 cpuset_t cpus;
1578
1579 if (deferred) {
1580 drep = (struct ioapic_reprogram_data *)p;
1581 ASSERT(drep != NULL);
1582 irqptr = drep->irqp;
1583 } else {
1584 irqptr = (apic_irq_t *)p;
1585 }
1586 ASSERT(irqptr != NULL);
1587 /*
 * Set cpu based on xen's idea of online cpus, not the apic tables.
 * Note that xen ignores/sets to its own preferred value the
 * target cpu field when programming the ioapic anyway.
1591 */
1592 if (irqptr->airq_mps_intr_index == MSI_INDEX)
1593 cpu = irqptr->airq_cpu; /* MSI cpus are already set */
1594 else {
1595 cpu = xen_psm_bind_intr(irq);
1596 irqptr->airq_cpu = cpu;
1597 }
1598 if (cpu == IRQ_UNBOUND) {
1599 CPUSET_ZERO(cpus);
1600 CPUSET_OR(cpus, xen_psm_cpus_online);
1601 } else {
1602 CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
1603 }
1604 rv = apic_rebind(irqptr, cpu, drep);
1605 if (rv) {
1606 /* CPU is not up or interrupt is disabled. Fall back to 0 */
1607 cpu = 0;
1608 irqptr->airq_cpu = cpu;
1609 rv = apic_rebind(irqptr, cpu, drep);
1610 }
1611 /*
1612 * If rebind successful bind the irq to an event channel
1613 */
1614 if (rv == 0) {
1615 ec_setup_pirq(irq, irqptr->airq_ipl, &cpus);
1616 CPUSET_FIND(cpus, cpu);
1617 apic_irq_table[irq]->airq_temp_cpu = cpu & ~IRQ_USER_BOUND;
1618 }
1619 return (rv);
1620 }
1621
1622 /*
1623 * Allocate a new vector for the given irq
1624 */
1625 /* ARGSUSED */
1626 uchar_t
apic_modify_vector(uchar_t vector, int irq)
1628 {
1629 return (apic_allocate_vector(0, irq, 0));
1630 }
1631
1632 /*
1633 * The rest of the file is just generic psm module boilerplate
1634 */
1635
1636 static struct psm_ops xen_psm_ops = {
1637 xen_psm_probe, /* psm_probe */
1638
1639 xen_psm_softinit, /* psm_init */
1640 xen_psm_picinit, /* psm_picinit */
1641 xen_psm_intr_enter, /* psm_intr_enter */
1642 xen_psm_intr_exit, /* psm_intr_exit */
1643 xen_psm_setspl, /* psm_setspl */
1644 xen_psm_addspl, /* psm_addspl */
1645 xen_psm_delspl, /* psm_delspl */
1646 xen_psm_disable_intr, /* psm_disable_intr */
1647 xen_psm_enable_intr, /* psm_enable_intr */
1648 (int (*)(int))NULL, /* psm_softlvl_to_irq */
1649 (void (*)(int))NULL, /* psm_set_softintr */
1650 (void (*)(processorid_t))NULL, /* psm_set_idlecpu */
1651 (void (*)(processorid_t))NULL, /* psm_unset_idlecpu */
1652
1653 xen_psm_clkinit, /* psm_clkinit */
1654 xen_psm_get_clockirq, /* psm_get_clockirq */
1655 xen_psm_hrtimeinit, /* psm_hrtimeinit */
1656 xpv_gethrtime, /* psm_gethrtime */
1657
1658 xen_psm_get_next_processorid, /* psm_get_next_processorid */
1659 xen_psm_cpu_start, /* psm_cpu_start */
1660 xen_psm_post_cpu_start, /* psm_post_cpu_start */
1661 xen_psm_shutdown, /* psm_shutdown */
1662 xen_psm_get_ipivect, /* psm_get_ipivect */
1663 xen_psm_send_ipi, /* psm_send_ipi */
1664
1665 xen_psm_translate_irq, /* psm_translate_irq */
1666
1667 (void (*)(int, char *))NULL, /* psm_notify_error */
1668 (void (*)(int msg))NULL, /* psm_notify_func */
1669 xen_psm_timer_reprogram, /* psm_timer_reprogram */
1670 xen_psm_timer_enable, /* psm_timer_enable */
1671 xen_psm_timer_disable, /* psm_timer_disable */
1672 (void (*)(void *arg))NULL, /* psm_post_cyclic_setup */
1673 (void (*)(int, int))NULL, /* psm_preshutdown */
1674 xen_intr_ops, /* Advanced DDI Interrupt framework */
1675 (int (*)(psm_state_request_t *))NULL, /* psm_state */
1676 (int (*)(psm_cpu_request_t *))NULL /* psm_cpu_ops */
1677 };
1678
1679 static struct psm_info xen_psm_info = {
1680 PSM_INFO_VER01_5, /* version */
1681 PSM_OWN_EXCLUSIVE, /* ownership */
1682 &xen_psm_ops, /* operation */
1683 "xVM_psm", /* machine name */
1684 "platform module" /* machine descriptions */
1685 };
1686
1687 static void *xen_psm_hdlp;
1688
1689 int
_init(void)
1691 {
1692 return (psm_mod_init(&xen_psm_hdlp, &xen_psm_info));
1693 }
1694
1695 int
_fini(void)
1697 {
1698 return (psm_mod_fini(&xen_psm_hdlp, &xen_psm_info));
1699 }
1700
1701 int
_info(struct modinfo *modinfop)
1703 {
1704 return (psm_mod_info(&xen_psm_hdlp, &xen_psm_info, modinfop));
1705 }
1706