/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2018 Joyent, Inc.
 */

#define	PSMI_1_7

#include <sys/mutex.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/clock.h>
#include <sys/machlock.h>
#include <sys/smp_impldefs.h>
#include <sys/uadmin.h>
#include <sys/promif.h>
#include <sys/psm.h>
#include <sys/psm_common.h>
#include <sys/atomic.h>
#include <sys/apic.h>
#include <sys/archsystm.h>
#include <sys/mach_intr.h>
#include <sys/hypervisor.h>
#include <sys/evtchn_impl.h>
#include <sys/modctl.h>
#include <sys/trap.h>
#include <sys/panic.h>
#include <sys/sysmacros.h>
#include <sys/pci_intr_lib.h>
#include <vm/hat_i86.h>

#include <xen/public/vcpu.h>
#include <xen/public/physdev.h>


/*
 * Global Data
 */

int xen_psm_verbose = 0;

/* As of now we don't support x2apic in xVM */
volatile uint32_t *apicadr = NULL;	/* dummy, so common code will link */
int apic_error = 0;
int apic_verbose = 0;
cpuset_t apic_cpumask;
int apic_forceload = 0;
uchar_t apic_vectortoipl[APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL] = {
	3, 4, 5, 5, 6, 6, 9, 10, 11, 12, 13, 14, 15, 15
};
uchar_t apic_ipltopri[MAXIPL + 1];
uchar_t apic_ipls[APIC_AVAIL_VECTOR];
uint_t apic_picinit_called;
apic_cpus_info_t *apic_cpus;
int xen_psm_intr_policy = INTR_ROUND_ROBIN_WITH_AFFINITY;
/* used to make sure only one cpu handles the nmi */
static lock_t xen_psm_nmi_lock;
int xen_psm_kmdb_on_nmi = 0;	/* 0 - no, 1 - yes enter kmdb */
int xen_psm_panic_on_nmi = 0;
int xen_psm_num_nmis = 0;

cpuset_t xen_psm_cpus_online;	/* online cpus */
int xen_psm_ncpus = 1;		/* cpu count */
int xen_psm_next_bind_cpu;	/* next cpu to bind an interrupt to */

int xen_support_msi = 0;

static int xen_clock_irq = INVALID_IRQ;

/* flag definitions for xen_psm_verbose */
#define	XEN_PSM_VERBOSE_IRQ_FLAG		0x00000001
#define	XEN_PSM_VERBOSE_POWEROFF_FLAG		0x00000002
#define	XEN_PSM_VERBOSE_POWEROFF_PAUSE_FLAG	0x00000004

#define	XEN_PSM_VERBOSE_IRQ(fmt) \
	if (xen_psm_verbose & XEN_PSM_VERBOSE_IRQ_FLAG) \
		cmn_err fmt;

#define	XEN_PSM_VERBOSE_POWEROFF(fmt) \
	if (xen_psm_verbose & XEN_PSM_VERBOSE_POWEROFF_FLAG) \
		prom_printf fmt;
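
/*
 * Note that these verbose macros expand to a bare if statement, so the
 * caller must pass the entire cmn_err/prom_printf argument list as a
 * single parenthesized tuple, e.g.
 *	XEN_PSM_VERBOSE_IRQ((CE_CONT, "!xen_psm: irqno = %d\n", irqno));
 */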

/*
 * Dummy apic array to point common routines at when they want to do some
 * apic manipulation.  Xen doesn't allow guest apic access, so we point
 * would-be apic fiddlers at these memory locations to fake them out.
 */
uint32_t xen_psm_dummy_apic[APIC_IRR_REG + 1];

static struct psm_info xen_psm_info;
static void xen_psm_setspl(int);

int
apic_alloc_msi_vectors(dev_info_t *dip, int inum, int count, int pri,
    int behavior);
int
apic_alloc_msix_vectors(dev_info_t *dip, int inum, int count, int pri,
    int behavior);

/*
 * Local support routines
 */

/*
 * Select vcpu to bind xen virtual device interrupt to.
 */
/*ARGSUSED*/
int
xen_psm_bind_intr(int irq)
{
	int bind_cpu;
	apic_irq_t *irqptr;

	bind_cpu = IRQ_UNBOUND;
	if (xen_psm_intr_policy == INTR_LOWEST_PRIORITY)
		return (bind_cpu);
	if (irq <= APIC_MAX_VECTOR)
		irqptr = apic_irq_table[irq];
	else
		irqptr = NULL;
	if (irqptr && (irqptr->airq_cpu != IRQ_UNBOUND))
		bind_cpu = irqptr->airq_cpu & ~IRQ_USER_BOUND;
	if (bind_cpu != IRQ_UNBOUND) {
		if (!CPU_IN_SET(xen_psm_cpus_online, bind_cpu))
			bind_cpu = 0;
		goto done;
	}
	if (xen_psm_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
		do {
			bind_cpu = xen_psm_next_bind_cpu++;
			if (xen_psm_next_bind_cpu >= xen_psm_ncpus)
				xen_psm_next_bind_cpu = 0;
		} while (!CPU_IN_SET(xen_psm_cpus_online, bind_cpu));
	} else {
		bind_cpu = 0;
	}
done:
	return (bind_cpu);
}
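
/*
 * For example, with the default INTR_ROUND_ROBIN_WITH_AFFINITY policy and
 * vcpus 0-3 online, successive unbound irqs are handed out to vcpu 0, 1,
 * 2, 3, 0, ...; an irq that already has an online binding keeps it.
 */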

/*
 * Autoconfiguration Routines
 */

static int
xen_psm_probe(void)
{
	int ret = PSM_SUCCESS;

	if (DOMAIN_IS_INITDOMAIN(xen_info))
		ret = apic_probe_common(xen_psm_info.p_mach_idstring);
	return (ret);
}

static void
xen_psm_softinit(void)
{
	/* LINTED logical expression always true: op "||" */
	ASSERT((1 << EVTCHN_SHIFT) == NBBY * sizeof (ulong_t));
	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, 0);
	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		apic_init_common();
	}
}

#define	XEN_NSEC_PER_TICK	10 /* XXX - assume we have a 100 MHz clock */

/*ARGSUSED*/
static int
xen_psm_clkinit(int hertz)
{
	extern enum tod_fault_type tod_fault(enum tod_fault_type, int);
	extern int dosynctodr;

	/*
	 * domU cannot set the TOD hardware, so fault the TOD clock now to
	 * indicate that, and turn off attempts to sync the TOD hardware
	 * with the hires timer.
	 */
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		mutex_enter(&tod_lock);
		(void) tod_fault(TOD_RDONLY, 0);
		dosynctodr = 0;
		mutex_exit(&tod_lock);
	}
	/*
	 * The hypervisor provides a timer based on the local APIC timer.
	 * The interface supports requests of nanosecond resolution.
	 * A common frequency of the apic clock is 100 MHz, which
	 * gives a resolution of 10 nsec per tick.  What we would really like
	 * is a way to get the ns per tick value from xen.
	 * XXPV - This is an assumption that needs checking and may change
	 */
	return (XEN_NSEC_PER_TICK);
}

static void
xen_psm_hrtimeinit(void)
{
	extern int gethrtime_hires;
	gethrtime_hires = 1;
}

/* xen_psm NMI handler */
static uint_t
xen_psm_nmi_intr(caddr_t arg __unused, caddr_t arg1 __unused)
{
	xen_psm_num_nmis++;

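	/*
	 * Only one cpu services a given NMI at a time; any other cpu that
	 * fails to take the lock reports the interrupt as unclaimed.
	 */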
	if (!lock_try(&xen_psm_nmi_lock))
		return (DDI_INTR_UNCLAIMED);

	if (xen_psm_kmdb_on_nmi && psm_debugger()) {
		debug_enter("NMI received: entering kmdb\n");
	} else if (xen_psm_panic_on_nmi) {
		/* Keep panic from entering kmdb. */
		nopanicdebug = 1;
		panic("NMI received\n");
	} else {
		/*
		 * prom_printf is the best shot we have at something which is
		 * problem-free from high level/NMI type interrupts
		 */
		prom_printf("NMI received\n");
	}

	lock_clear(&xen_psm_nmi_lock);
	return (DDI_INTR_CLAIMED);
}

static void
xen_psm_picinit()
{
	int cpu, irqno;
	cpuset_t cpus;

	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		/* set a flag so we know we have run xen_psm_picinit() */
		apic_picinit_called = 1;
		LOCK_INIT_CLEAR(&apic_ioapic_lock);

		/* XXPV - do we need to do this? */
		picsetup();	/* initialise the 8259 */

		/* enable apic mode if imcr present */
		/* XXPV - do we need to do this either? */
		if (apic_imcrp) {
			outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT);
			outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_APIC);
		}

		ioapic_init_intr(IOAPIC_NOMASK);
		/*
		 * We never called xen_psm_addspl() when the SCI
		 * interrupt was added because that happened before the
		 * PSM module was loaded.  Fix that up here by doing
		 * any missed operations (e.g. bind to CPU)
		 */
		if ((irqno = apic_sci_vect) > 0) {
			if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
				CPUSET_ZERO(cpus);
				CPUSET_OR(cpus, xen_psm_cpus_online);
			} else {
				CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
			}
			ec_set_irq_affinity(irqno, cpus);
			apic_irq_table[irqno]->airq_temp_cpu =
			    (uchar_t)(cpu & ~IRQ_USER_BOUND);
			ec_enable_irq(irqno);
		}
	}

	/* add nmi handler - least priority nmi handler */
	LOCK_INIT_CLEAR(&xen_psm_nmi_lock);

	if (!psm_add_nmintr(0, xen_psm_nmi_intr,
	    "xVM_psm NMI handler", (caddr_t)NULL))
		cmn_err(CE_WARN, "xVM_psm: Unable to add nmi handler");
}


/*
 * generates an interprocessor interrupt to another CPU
 */
static void
xen_psm_send_ipi(int cpun, int ipl)
{
	ulong_t flag = intr_clear();

	ec_send_ipi(ipl, cpun);
	intr_restore(flag);
}

/*ARGSUSED*/
static int
xen_psm_addspl(int irqno, int ipl, int min_ipl, int max_ipl)
{
	int cpu, ret;
	cpuset_t cpus;

	/*
	 * We are called at splhi() so we can't call anything that might end
	 * up trying to context switch.
	 */
	if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
	    DOMAIN_IS_INITDOMAIN(xen_info)) {
		/*
		 * Priority/affinity/enable for PIRQ's is set in ec_setup_pirq()
		 */
		ret = apic_addspl_common(irqno, ipl, min_ipl, max_ipl);
	} else {
		/*
		 * Set priority/affinity/enable for non PIRQs
		 */
		ret = ec_set_irq_priority(irqno, ipl);
		ASSERT(ret == 0);
		if ((cpu = xen_psm_bind_intr(irqno)) == IRQ_UNBOUND) {
			CPUSET_ZERO(cpus);
			CPUSET_OR(cpus, xen_psm_cpus_online);
		} else {
			CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
		}
		ec_set_irq_affinity(irqno, cpus);
		ec_enable_irq(irqno);
	}
	return (ret);
}

/*
 * Acquire ownership of this irq on this cpu
 */
void
xen_psm_acquire_irq(int irq)
{
	ulong_t flags;
	int cpuid;

	/*
	 * If the irq is currently being serviced by another cpu
	 * we busy-wait for the other cpu to finish.  Take any
	 * pending interrupts before retrying.
	 */
	do {
		flags = intr_clear();
		cpuid = ec_block_irq(irq);
		intr_restore(flags);
	} while (cpuid != CPU->cpu_id);
}

/*ARGSUSED*/
static int
xen_psm_delspl(int irqno, int ipl, int min_ipl, int max_ipl)
{
	apic_irq_t *irqptr;
	int err = PSM_SUCCESS;

	if (irqno >= PIRQ_BASE && irqno < NR_PIRQS &&
	    DOMAIN_IS_INITDOMAIN(xen_info)) {
		irqptr = apic_irq_table[irqno];
		/*
		 * unbind if no more sharers of this irq/evtchn
		 */
		if (irqptr->airq_share == 1) {
			xen_psm_acquire_irq(irqno);
			ec_unbind_irq(irqno);
		}
		err = apic_delspl_common(irqno, ipl, min_ipl, max_ipl);
		/*
		 * If still in use reset priority
		 */
		if (!err && irqptr->airq_share != 0) {
			err = ec_set_irq_priority(irqno, max_ipl);
			return (err);
		}
	} else {
		xen_psm_acquire_irq(irqno);
		ec_unbind_irq(irqno);
	}
	return (err);
}

static processorid_t
xen_psm_get_next_processorid(processorid_t id)
{
	if (id == -1)
		return (0);

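	/*
	 * VCPUOP_is_up returns 0 if the vcpu exists and is up, and a
	 * negative errno otherwise; negate the result so it can be
	 * compared against the positive X_* error values below.
	 */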
	for (id++; id < NCPU; id++) {
		switch (-HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL)) {
		case 0:		/* yeah, that one's there */
			return (id);
		default:
		case X_EINVAL:	/* out of range */
			return (-1);
		case X_ENOENT:	/* not present in the domain */
			/*
			 * It's not clear that we -need- to keep looking
			 * at this point.  If, e.g., we can guarantee the
			 * hypervisor always keeps a contiguous range of
			 * vcpus around, this is equivalent to "out of range".
			 *
			 * But it would be sad to miss a vcpu we're
			 * supposed to be using ...
			 */
			break;
		}
	}

	return (-1);
}

/*
 * XXPV - undo the start cpu op change; return to ignoring this value
 *	- also tweak error handling in main startup loop
 */
/*ARGSUSED*/
static int
xen_psm_cpu_start(processorid_t id, caddr_t arg)
{
	int ret;

	ASSERT(id > 0);
	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, id);
	ec_bind_cpu_ipis(id);
	(void) ec_bind_virq_to_irq(VIRQ_TIMER, id);
	if ((ret = xen_vcpu_up(id)) == 0)
		xen_psm_ncpus++;
	else
		ret = EINVAL;
	return (ret);
}

/*
 * Allocate an irq for inter cpu signaling
 */
/*ARGSUSED*/
static int
xen_psm_get_ipivect(int ipl, int type)
{
	return (ec_bind_ipi_to_irq(ipl, 0));
}

/*ARGSUSED*/
static int
xen_psm_get_clockirq(int ipl)
{
	if (xen_clock_irq != INVALID_IRQ)
		return (xen_clock_irq);

	xen_clock_irq = ec_bind_virq_to_irq(VIRQ_TIMER, 0);
	return (xen_clock_irq);
}

/*ARGSUSED*/
static void
xen_psm_shutdown(int cmd, int fcn)
{
	XEN_PSM_VERBOSE_POWEROFF(("xen_psm_shutdown(%d,%d);\n", cmd, fcn));

	switch (cmd) {
	case A_SHUTDOWN:
		switch (fcn) {
		case AD_BOOT:
		case AD_IBOOT:
			(void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
			break;
		case AD_POWEROFF:
			/* fall through if domU or if poweroff fails */
			if (DOMAIN_IS_INITDOMAIN(xen_info))
				if (apic_enable_acpi)
					(void) acpi_poweroff();
			/* FALLTHRU */
		case AD_HALT:
		default:
			(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
			break;
		}
		break;
	case A_REBOOT:
		(void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
		break;
	default:
		return;
	}
}


static int
xen_psm_translate_irq(dev_info_t *dip, int irqno)
{
	if (dip == NULL) {
		XEN_PSM_VERBOSE_IRQ((CE_CONT, "!xen_psm: irqno = %d"
		    " dip = NULL\n", irqno));
		return (irqno);
	}
	return (irqno);
}

/*
 * xen_psm_intr_enter() acks the event that triggered the interrupt and
 * returns the new priority level.
 */
/*ARGSUSED*/
static int
xen_psm_intr_enter(int ipl, int *vector)
{
	int newipl;
	uint_t intno;
	cpu_t *cpu = CPU;

	intno = (*vector);

	ASSERT(intno < NR_IRQS);
	ASSERT(cpu->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask != 0);

	if (!ec_is_edge_pirq(intno))
		ec_clear_irq(intno);

	newipl = autovect[intno].avh_hi_pri;
	if (newipl == 0) {
		/*
		 * (newipl == 0) means we have no service routines for this
		 * vector.  We will treat this as a spurious interrupt.
		 * We have cleared the pending bit already; clear the event
		 * mask and return a spurious interrupt.  This case can happen
		 * when an interrupt delivery is racing with the removal of
		 * the service routine for that interrupt.
		 */
		ec_unmask_irq(intno);
		newipl = -1;	/* flag spurious interrupt */
	} else if (newipl <= cpu->cpu_pri) {
		/*
		 * (newipl <= cpu->cpu_pri) means that we must be trying to
		 * service a vector that was shared with a higher priority
		 * isr.  The higher priority handler has been removed and
		 * we need to service this int.  We can't return a lower
		 * priority than current cpu priority.  Just synthesize a
		 * priority to return that should be acceptable.
		 * It should never happen that we synthesize a priority that
		 * moves us from low-priority to high-priority and would make
		 * us incorrectly run on the high priority stack.
		 */
		newipl = cpu->cpu_pri + 1;	/* synthetic priority */
		ASSERT(newipl != LOCK_LEVEL + 1);
	}
	return (newipl);
}


/*
 * xen_psm_intr_exit() restores the old interrupt
 * priority level after processing an interrupt.
 * It is called with interrupts disabled, and does not enable interrupts.
 */
/* ARGSUSED */
static void
xen_psm_intr_exit(int ipl, int vector)
{
	ec_try_unmask_irq(vector);
	xen_psm_setspl(ipl);
}

intr_exit_fn_t
psm_intr_exit_fn(void)
{
	return (xen_psm_intr_exit);
}

/*
 * Check if new ipl level allows delivery of previously unserviced events
 */
static void
xen_psm_setspl(int ipl)
{
	struct cpu *cpu = CPU;
	volatile vcpu_info_t *vci = cpu->cpu_m.mcpu_vcpu_info;
	uint16_t pending;

	ASSERT(vci->evtchn_upcall_mask != 0);

	/*
	 * If new ipl level will enable any pending interrupts, setup so the
	 * upcoming sti will cause us to get an upcall.
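	 * For example, with ipl == 4 the mask ~((1 << (4 + 1)) - 1) below
	 * clears bits 0-4 of mcpu_intr_pending, so only events pending at
	 * priority levels above 4 are considered.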
	 */
	pending = cpu->cpu_m.mcpu_intr_pending & ~((1 << (ipl + 1)) - 1);
	if (pending) {
		int i;
		ulong_t pending_sels = 0;
		volatile ulong_t *selp;
		struct xen_evt_data *cpe = cpu->cpu_m.mcpu_evt_pend;

		for (i = bsrw_insn(pending); i > ipl; i--)
			pending_sels |= cpe->pending_sel[i];
		ASSERT(pending_sels);
		selp = (volatile ulong_t *)&vci->evtchn_pending_sel;
		atomic_or_ulong(selp, pending_sels);
		vci->evtchn_upcall_pending = 1;
	}
}

/*
 * This function provides an external interface to the nexus for all
 * functionality related to the new DDI interrupt framework.
 *
 * Input:
 * dip     - pointer to the dev_info structure of the requested device
 * hdlp    - pointer to the internal interrupt handle structure for the
 *	     requested interrupt
 * intr_op - opcode for this call
 * result  - pointer to the integer that will hold the result to be
 *	     passed back if return value is PSM_SUCCESS
 *
 * Output:
 * return value is either PSM_SUCCESS or PSM_FAILURE
 */
int
xen_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
    psm_intr_op_t intr_op, int *result)
{
	int cap;
	int err;
	int new_priority;
	apic_irq_t *irqp;
	struct intrspec *ispec;

	DDI_INTR_IMPLDBG((CE_CONT, "xen_intr_ops: dip: %p hdlp: %p "
	    "intr_op: %x\n", (void *)dip, (void *)hdlp, intr_op));

	switch (intr_op) {
	case PSM_INTR_OP_CHECK_MSI:
		/*
		 * Until PCI passthru is supported, only dom0 has MSI/MSIX
		 */
		if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
			    DDI_INTR_TYPE_MSIX);
			break;
		}
		/*
		 * Check whether MSI/X is supported at the APIC level and
		 * mask off the MSI/X bits in hdlp->ih_type if it is not
		 * supported before returning.  If MSI/X is supported,
		 * leave the ih_type unchanged and return.
		 *
		 * hdlp->ih_type passed in from the nexus has all the
		 * interrupt types supported by the device.
		 */
		if (xen_support_msi == 0) {
			/*
			 * if xen_support_msi is not set, call
			 * apic_check_msi_support() to check whether msi
			 * is supported first
			 */
			if (apic_check_msi_support() == PSM_SUCCESS)
				xen_support_msi = 1;
			else
				xen_support_msi = -1;
		}
		if (xen_support_msi == 1)
			*result = hdlp->ih_type;
		else
			*result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
			    DDI_INTR_TYPE_MSIX);
		break;
	case PSM_INTR_OP_ALLOC_VECTORS:
		if (hdlp->ih_type == DDI_INTR_TYPE_MSI)
			*result = apic_alloc_msi_vectors(dip, hdlp->ih_inum,
			    hdlp->ih_scratch1, hdlp->ih_pri,
			    (int)(uintptr_t)hdlp->ih_scratch2);
		else
			*result = apic_alloc_msix_vectors(dip, hdlp->ih_inum,
			    hdlp->ih_scratch1, hdlp->ih_pri,
			    (int)(uintptr_t)hdlp->ih_scratch2);
		break;
	case PSM_INTR_OP_FREE_VECTORS:
		apic_free_vectors(dip, hdlp->ih_inum, hdlp->ih_scratch1,
		    hdlp->ih_pri, hdlp->ih_type);
		break;
	case PSM_INTR_OP_NAVAIL_VECTORS:
		/*
		 * XXPV - maybe we should make this be:
		 * min(APIC_VECTOR_PER_IPL, count of all avail vectors);
		 */
		if (DOMAIN_IS_INITDOMAIN(xen_info))
			*result = APIC_VECTOR_PER_IPL;
		else
			*result = 1;
		break;
	case PSM_INTR_OP_XLATE_VECTOR:
		ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
		if (ispec->intrspec_vec >= PIRQ_BASE &&
		    ispec->intrspec_vec < NR_PIRQS &&
		    DOMAIN_IS_INITDOMAIN(xen_info)) {
			*result = apic_introp_xlate(dip, ispec, hdlp->ih_type);
		} else {
			*result = ispec->intrspec_vec;
		}
		break;
	case PSM_INTR_OP_GET_PENDING:
		/* XXPV - is this enough for dom0 or do we need to ref ioapic */
		*result = ec_pending_irq(hdlp->ih_vector);
		break;
	case PSM_INTR_OP_CLEAR_MASK:
		/* XXPV - is this enough for dom0 or do we need to set ioapic */
		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
			return (PSM_FAILURE);
		ec_enable_irq(hdlp->ih_vector);
		break;
	case PSM_INTR_OP_SET_MASK:
		/* XXPV - is this enough for dom0 or do we need to set ioapic */
		if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
			return (PSM_FAILURE);
		ec_disable_irq(hdlp->ih_vector);
		break;
	case PSM_INTR_OP_GET_CAP:
		cap = DDI_INTR_FLAG_PENDING | DDI_INTR_FLAG_EDGE;
		if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
			cap |= DDI_INTR_FLAG_MASKABLE;
		*result = cap;
		break;
	case PSM_INTR_OP_GET_SHARED:
		if (DOMAIN_IS_INITDOMAIN(xen_info)) {
			if (hdlp->ih_type != DDI_INTR_TYPE_FIXED)
				return (PSM_FAILURE);
			ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
			if ((irqp = apic_find_irq(dip, ispec, hdlp->ih_type))
			    == NULL)
				return (PSM_FAILURE);
			*result = (irqp->airq_share > 1) ? 1 : 0;
		} else {
			return (PSM_FAILURE);
		}
		break;
	case PSM_INTR_OP_SET_PRI:
		new_priority = *(int *)result;
		err = ec_set_irq_priority(hdlp->ih_vector, new_priority);
		if (err != 0)
			return (PSM_FAILURE);
		break;
	case PSM_INTR_OP_GET_INTR:
		if (!DOMAIN_IS_INITDOMAIN(xen_info))
			return (PSM_FAILURE);
		/*
		 * The interrupt handle given here has been allocated
		 * specifically for this command, and ih_private carries
		 * a pointer to an apic_get_intr_t.
		 */
		if (apic_get_vector_intr_info(
		    hdlp->ih_vector, hdlp->ih_private) != PSM_SUCCESS)
			return (PSM_FAILURE);
		break;
	case PSM_INTR_OP_SET_CAP:
		/* FALLTHRU */
	default:
		return (PSM_FAILURE);
	}
	return (PSM_SUCCESS);
}

static void
xen_psm_rebind_irq(int irq)
{
	cpuset_t ncpu;
	processorid_t newcpu;
	apic_irq_t *irqptr;

	newcpu = xen_psm_bind_intr(irq);
	if (newcpu == IRQ_UNBOUND) {
		CPUSET_ZERO(ncpu);
		CPUSET_OR(ncpu, xen_psm_cpus_online);
	} else {
		CPUSET_ONLY(ncpu, newcpu & ~IRQ_USER_BOUND);
	}
	ec_set_irq_affinity(irq, ncpu);
	if (irq <= APIC_MAX_VECTOR) {
		irqptr = apic_irq_table[irq];
		ASSERT(irqptr != NULL);
		irqptr->airq_temp_cpu = (uchar_t)newcpu;
	}
}

/*
 * Disable all device interrupts for the given cpu.
 * High priority interrupts are not disabled and will still be serviced.
 */
static int
xen_psm_disable_intr(processorid_t cpun)
{
	int irq;

	/*
	 * Can't offline VCPU 0 on this hypervisor.  There's no reason
	 * anyone would want to, given that the CPUs are virtual.  Also note
	 * that the hypervisor requires suspend/resume to be on VCPU 0.
	 */
	if (cpun == 0)
		return (PSM_FAILURE);

	CPUSET_ATOMIC_DEL(xen_psm_cpus_online, cpun);
	for (irq = 0; irq < NR_IRQS; irq++) {
		if (!ec_irq_needs_rebind(irq, cpun))
			continue;
		xen_psm_rebind_irq(irq);
	}
	return (PSM_SUCCESS);
}

static void
xen_psm_enable_intr(processorid_t cpun)
{
	int irq;

	if (cpun == 0)
		return;

	CPUSET_ATOMIC_ADD(xen_psm_cpus_online, cpun);

	/*
	 * Rebalance device interrupts among online processors
	 */
	for (irq = 0; irq < NR_IRQS; irq++) {
		if (!ec_irq_rebindable(irq))
			continue;
		xen_psm_rebind_irq(irq);
	}

	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		apic_cpus[cpun].aci_status |= APIC_CPU_INTR_ENABLE;
	}
}

static int
xen_psm_post_cpu_start()
{
	processorid_t cpun;

	cpun = psm_get_cpu_id();
	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		/*
		 * Non-virtualized environments can call psm_post_cpu_start
		 * from Suspend/Resume with the APIC_CPU_INTR_ENABLE bit set.
		 * xen_psm_post_cpu_start() is only called from boot.
		 */
		apic_cpus[cpun].aci_status |= APIC_CPU_ONLINE;
	}
	return (PSM_SUCCESS);
}

/*
 * This function will reprogram the timer.
 *
 * When in oneshot mode the argument is the absolute time in future at which to
 * generate the interrupt.
 *
 * When in periodic mode, the argument is the interval at which the
 * interrupts should be generated.  There is no need to support the periodic
 * mode timer change at this time.
 *
 * Note that we must be careful to convert from hrtime to Xen system time (see
 * xpv_timestamp.c).
 */
static void
xen_psm_timer_reprogram(hrtime_t timer_req)
{
	hrtime_t now, timer_new, time_delta, xen_time;
	ulong_t flags;

	flags = intr_clear();
	/*
	 * We should be called from high PIL context (CBE_HIGH_PIL),
	 * so kpreempt is disabled.
	 */

	now = xpv_gethrtime();
	xen_time = xpv_getsystime();
	if (timer_req <= now) {
		/*
		 * We were asked to generate an interrupt in the past;
		 * generate one as soon as possible.
		 */
		time_delta = XEN_NSEC_PER_TICK;
	} else
		time_delta = timer_req - now;

	timer_new = xen_time + time_delta;
	if (HYPERVISOR_set_timer_op(timer_new) != 0)
		panic("can't set hypervisor timer?");
	intr_restore(flags);
}

/*
 * This function will enable timer interrupts.
 */
static void
xen_psm_timer_enable(void)
{
	ec_unmask_irq(xen_clock_irq);
}

/*
 * This function will disable timer interrupts on the current cpu.
 */
static void
xen_psm_timer_disable(void)
{
	(void) ec_block_irq(xen_clock_irq);
	/*
	 * If the clock irq is pending on this cpu then we need to
	 * clear the pending interrupt.
	 */
	ec_unpend_irq(xen_clock_irq);
}

/*
 * The following functions are in the platform specific file so that they
 * can be different functions depending on whether we are running on
 * bare metal or a hypervisor.
 */

/*
 * Allocate a free vector for irq at ipl.
 */
/* ARGSUSED */
uchar_t
apic_allocate_vector(int ipl, int irq, int pri)
{
	physdev_irq_t irq_op;
	uchar_t vector;
	int rc;

	irq_op.irq = irq;

	if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
	    != 0)
		panic("Hypervisor alloc vector failed err: %d", -rc);
	vector = irq_op.vector;
	/*
	 * No need to worry about the vector colliding with our reserved
	 * vectors, e.g. T_FASTTRAP; xen can differentiate between hardware
	 * and software generated traps and handle them properly.
	 */
	apic_vector_to_irq[vector] = (uchar_t)irq;
	return (vector);
}

/* Mark vector as not being used by any irq */
void
apic_free_vector(uchar_t vector)
{
	apic_vector_to_irq[vector] = APIC_RESV_IRQ;
}

/*
 * This function returns the no. of vectors available for the pri.
 * dip is not used at this moment.  If we really don't need that,
 * it will be removed.  Since priority is not limited by hardware
 * when running on the hypervisor, we simply return the maximum no.
 * of available contiguous vectors.
 */
/*ARGSUSED*/
int
apic_navail_vector(dev_info_t *dip, int pri)
{
	int lowest, highest, i, navail, count;

	DDI_INTR_IMPLDBG((CE_CONT, "apic_navail_vector: dip: %p, pri: %x\n",
	    (void *)dip, pri));

	highest = APIC_MAX_VECTOR;
	lowest = APIC_BASE_VECT;
	navail = count = 0;

	/* It has to be contiguous */
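	/*
	 * i.e. find the longest run of unused (APIC_RESV_IRQ) slots in
	 * apic_vector_to_irq.
	 */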
	for (i = lowest; i < highest; i++) {
		count = 0;
		while ((apic_vector_to_irq[i] == APIC_RESV_IRQ) &&
		    (i < highest)) {
			count++;
			i++;
		}
		if (count > navail)
			navail = count;
	}
	return (navail);
}

static physdev_manage_pci_t *managed_devlist;
static int mdev_cnt;
static int mdev_size = 128;
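/*
 * Remembers which pirq backs each hypervisor-assigned MSI vector, so the
 * mapping can be torn down again (PHYSDEVOP_unmap_pirq) when the vectors
 * are freed in apic_free_vectors().
 */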
static uchar_t msi_vector_to_pirq[APIC_MAX_VECTOR+1];

/*
 * Add devfn on given bus to devices managed by hypervisor
 */
static int
xen_manage_device(uint8_t bus, uint8_t devfn)
{
	physdev_manage_pci_t manage_pci, *newlist;
	int rc, i, oldsize;

	/*
	 * Check if bus/devfn already managed.  If so just return success.
	 */
	if (managed_devlist == NULL) {
		managed_devlist = kmem_alloc(sizeof (physdev_manage_pci_t) *
		    mdev_size, KM_NOSLEEP);
		if (managed_devlist == NULL) {
			cmn_err(CE_WARN,
			    "Can't alloc space for managed device list");
			return (0);
		}
	}
	for (i = 0; i < mdev_cnt; i++) {
		if (managed_devlist[i].bus == bus &&
		    managed_devlist[i].devfn == devfn)
			return (1); /* device already managed */
	}
	manage_pci.bus = bus;
	manage_pci.devfn = devfn;
	rc = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add, &manage_pci);
	if (rc < 0) {
		cmn_err(CE_WARN,
		    "hypervisor add pci device call failed bus:0x%x"
		    " devfn:0x%x", bus, devfn);
		return (0);
	}
	/*
	 * Add device to the managed device list
	 */
	if (i == mdev_size) {
		/*
		 * grow the managed device list
		 */
		oldsize = mdev_size * sizeof (physdev_manage_pci_t);
		mdev_size *= 2;
		newlist = kmem_alloc(sizeof (physdev_manage_pci_t) * mdev_size,
		    KM_NOSLEEP);
		if (newlist == NULL) {
			cmn_err(CE_WARN, "Can't grow managed device list");
			return (0);
		}
		bcopy(managed_devlist, newlist, oldsize);
		kmem_free(managed_devlist, oldsize);
		managed_devlist = newlist;
	}
	managed_devlist[i].bus = bus;
	managed_devlist[i].devfn = devfn;
	mdev_cnt++;
	return (1);
}

/*
 * allocate an apic irq struct for an MSI interrupt
 */
static int
msi_allocate_irq(int irq)
{
	apic_irq_t *irqptr = apic_irq_table[irq];

	if (irqptr == NULL) {
		irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_NOSLEEP);
		if (irqptr == NULL) {
			cmn_err(CE_WARN, "xpv_psm: NO memory to allocate IRQ");
			return (-1);
		}
		apic_irq_table[irq] = irqptr;
	} else {
		if (irq == APIC_RESV_IRQ && irqptr->airq_mps_intr_index == 0)
			irqptr->airq_mps_intr_index = FREE_INDEX;
		if (irqptr->airq_mps_intr_index != FREE_INDEX) {
			cmn_err(CE_WARN, "xpv_psm: MSI IRQ already in use");
			return (-1);
		}
	}
	irqptr->airq_mps_intr_index = FREE_INDEX;
	return (irq);
}

/*
 * read MSI/MSIX vector out of config space
 */
static uchar_t
xpv_psm_get_msi_vector(dev_info_t *dip, int type, int entry)
{
	uint64_t msi_data = 0;
	int cap_ptr = i_ddi_get_msi_msix_cap_ptr(dip);
	ddi_acc_handle_t handle = i_ddi_get_pci_config_handle(dip);
	ushort_t msi_ctrl;
	uchar_t vector;

	ASSERT((handle != NULL) && (cap_ptr != 0));
	vector = 0;
	if (type == DDI_INTR_TYPE_MSI) {
		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
		/*
		 * Get vector
		 */
		if (msi_ctrl & PCI_MSI_64BIT_MASK) {
			msi_data = pci_config_get16(handle,
			    cap_ptr + PCI_MSI_64BIT_DATA);
		} else {
			msi_data = pci_config_get16(handle,
			    cap_ptr + PCI_MSI_32BIT_DATA);
		}
		vector = (msi_data & 0xff) + entry;
	} else if (type == DDI_INTR_TYPE_MSIX) {
		uintptr_t off;
		ddi_intr_msix_t *msix_p = i_ddi_get_msix(dip);

		/* Offset into the given entry in the MSI-X table */
		off = (uintptr_t)msix_p->msix_tbl_addr +
		    (entry * PCI_MSIX_VECTOR_SIZE);

		msi_data = ddi_get32(msix_p->msix_tbl_hdl,
		    (uint32_t *)(off + PCI_MSIX_DATA_OFFSET));
		vector = msi_data & 0xff;
	}
	return (vector);
}

static void
get_busdevfn(dev_info_t *dip, int *busp, int *devfnp)
{
	pci_regspec_t *regspec;
	int reglen;

	/*
	 * Get device reg spec; the first word has the PCI bus and
	 * device/function info we need.
	 */
	if (ddi_getlongprop(DDI_DEV_T_NONE, dip, DDI_PROP_DONTPASS, "reg",
	    (caddr_t)&regspec, &reglen) != DDI_SUCCESS) {
		cmn_err(CE_WARN,
		    "get_busdevfn() failed to get regspec.");
		return;
	}
	/*
	 * get PCI bus # from reg spec for device
	 */
	*busp = PCI_REG_BUS_G(regspec[0].pci_phys_hi);
	/*
	 * get combined device/function from reg spec for device.
	 */
	*devfnp = (regspec[0].pci_phys_hi & (PCI_REG_FUNC_M | PCI_REG_DEV_M)) >>
	    PCI_REG_FUNC_SHIFT;

	kmem_free(regspec, reglen);
}

/*
 * This function allocates "count" MSI vector(s) for the given "dip/pri/type"
 */
int
apic_alloc_msi_vectors(dev_info_t *dip, int inum, int count, int pri,
    int behavior)
{
	int rcount, i, rc, irqno;
	uchar_t vector, cpu;
	major_t major;
	apic_irq_t *irqptr;
	physdev_map_pirq_t map_irq;
	int busnum, devfn;

	DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: dip=0x%p "
	    "inum=0x%x pri=0x%x count=0x%x behavior=%d\n",
	    (void *)dip, inum, pri, count, behavior));

	if (count > 1) {
		if (behavior == DDI_INTR_ALLOC_STRICT &&
		    apic_multi_msi_enable == 0)
			return (0);
		if (apic_multi_msi_enable == 0)
			count = 1;
	}

	if ((rcount = apic_navail_vector(dip, pri)) > count)
		rcount = count;
	else if (rcount == 0 || (rcount < count &&
	    behavior == DDI_INTR_ALLOC_STRICT))
		return (0);

	/* if not ISP2, then round it down */
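	/* e.g. a request for 6 vectors is rounded down to 4 */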
	if (!ISP2(rcount))
		rcount = 1 << (highbit(rcount) - 1);

	/*
	 * get PCI bus # and devfn from reg spec for device
	 */
	get_busdevfn(dip, &busnum, &devfn);

	/*
	 * Tell xen about this pci device
	 */
	if (!xen_manage_device(busnum, devfn))
		return (0);

	mutex_enter(&airq_mutex);

	major = (dip != NULL) ? ddi_name_to_major(ddi_get_name(dip)) : 0;
	for (i = 0; i < rcount; i++) {
		/*
		 * use PHYSDEVOP_map_pirq to have xen map MSI to a pirq
		 */
		map_irq.domid = DOMID_SELF;
		map_irq.type = MAP_PIRQ_TYPE_MSI;
		map_irq.index = -rcount; /* hypervisor auto allocates vectors */
		map_irq.pirq = -1;
		map_irq.bus = busnum;
		map_irq.devfn = devfn;
		map_irq.entry_nr = i;
		map_irq.table_base = 0;
		rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
		irqno = map_irq.pirq;
		if (rc < 0) {
			mutex_exit(&airq_mutex);
			cmn_err(CE_WARN, "map MSI irq failed err: %d", -rc);
			return (i);
		}
		if (irqno < 0) {
			mutex_exit(&airq_mutex);
			cmn_err(CE_NOTE,
			    "!hypervisor not configured for MSI support");
			xen_support_msi = -1;
			return (0);
		}

		/*
		 * Find out what vector the hypervisor assigned
		 */
		vector = xpv_psm_get_msi_vector(dip, DDI_INTR_TYPE_MSI, i);

		if (msi_allocate_irq(irqno) < 0) {
			mutex_exit(&airq_mutex);
			return (i);
		}
		apic_max_device_irq = max(irqno, apic_max_device_irq);
		apic_min_device_irq = min(irqno, apic_min_device_irq);
		irqptr = apic_irq_table[irqno];
		ASSERT(irqptr != NULL);
#ifdef DEBUG
		if (apic_vector_to_irq[vector] != APIC_RESV_IRQ)
			DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: "
			    "apic_vector_to_irq is not APIC_RESV_IRQ\n"));
#endif
		apic_vector_to_irq[vector] = (uchar_t)irqno;
		msi_vector_to_pirq[vector] = (uchar_t)irqno;

		irqptr->airq_vector = vector;
		irqptr->airq_ioapicindex = (uchar_t)inum;	/* start */
		irqptr->airq_intin_no = (uchar_t)rcount;
		irqptr->airq_ipl = pri;
		irqptr->airq_origirq = (uchar_t)(inum + i);
		irqptr->airq_share_id = 0;
		irqptr->airq_mps_intr_index = MSI_INDEX;
		irqptr->airq_dip = dip;
		irqptr->airq_major = major;
		if (i == 0) /* they all bind to the same cpu */
			cpu = irqptr->airq_cpu = xen_psm_bind_intr(irqno);
		else
			irqptr->airq_cpu = cpu;
		DDI_INTR_IMPLDBG((CE_CONT, "apic_alloc_msi_vectors: irq=0x%x "
		    "dip=0x%p vector=0x%x origirq=0x%x pri=0x%x\n", irqno,
		    (void *)irqptr->airq_dip, irqptr->airq_vector,
		    irqptr->airq_origirq, pri));
	}
	mutex_exit(&airq_mutex);
	return (rcount);
}

/*
 * This function allocates "count" MSI-X vector(s) for the given "dip/pri/type"
 */
int
apic_alloc_msix_vectors(dev_info_t *dip, int inum, int count, int pri,
    int behavior)
{
	int rcount, i, rc;
	major_t major;
	physdev_map_pirq_t map_irq;
	int busnum, devfn;
	ddi_intr_msix_t *msix_p = i_ddi_get_msix(dip);
	uint64_t table_base;
	pfn_t pfnum;

	if (msix_p == NULL) {
		msix_p = pci_msix_init(dip);
		if (msix_p != NULL) {
			i_ddi_set_msix(dip, msix_p);
		} else {
			cmn_err(CE_WARN, "apic_alloc_msix_vectors()"
			    " msix_init failed");
			return (0);
		}
	}
	/*
	 * Hypervisor wants PCI config space address of msix table base
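	 * (a machine address), which we reconstruct below from the pfn
	 * backing the mapped table (with the foreign-MFN flag masked off)
	 * and the intra-page offset of the table.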
	 */
	pfnum = hat_getpfnum(kas.a_hat, (caddr_t)msix_p->msix_tbl_addr) &
	    ~PFN_IS_FOREIGN_MFN;
	table_base = (uint64_t)((pfnum << PAGESHIFT) - msix_p->msix_tbl_offset |
	    ((uintptr_t)msix_p->msix_tbl_addr & PAGEOFFSET));
	/*
	 * get PCI bus # and devfn from reg spec for device
	 */
	get_busdevfn(dip, &busnum, &devfn);

	/*
	 * Tell xen about this pci device
	 */
	if (!xen_manage_device(busnum, devfn))
		return (0);
	mutex_enter(&airq_mutex);

	if ((rcount = apic_navail_vector(dip, pri)) > count)
		rcount = count;
	else if (rcount == 0 || (rcount < count &&
	    behavior == DDI_INTR_ALLOC_STRICT)) {
		rcount = 0;
		goto out;
	}

	major = (dip != NULL) ? ddi_name_to_major(ddi_get_name(dip)) : 0;
	for (i = 0; i < rcount; i++) {
		int irqno;
		uchar_t vector;
		apic_irq_t *irqptr;

		/*
		 * use PHYSDEVOP_map_pirq to have xen map MSI-X to a pirq
		 */
		map_irq.domid = DOMID_SELF;
		map_irq.type = MAP_PIRQ_TYPE_MSI;
		map_irq.index = -1; /* hypervisor auto allocates vector */
		map_irq.pirq = -1;
		map_irq.bus = busnum;
		map_irq.devfn = devfn;
		map_irq.entry_nr = i;
		map_irq.table_base = table_base;
		rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
		irqno = map_irq.pirq;
		if (rc < 0) {
			mutex_exit(&airq_mutex);
			cmn_err(CE_WARN, "map MSI irq failed err: %d", -rc);
			return (i);
		}
		if (irqno < 0) {
			mutex_exit(&airq_mutex);
			cmn_err(CE_NOTE,
			    "!hypervisor not configured for MSI support");
			xen_support_msi = -1;
			return (0);
		}
		/*
		 * Find out what vector the hypervisor assigned
		 */
		vector = xpv_psm_get_msi_vector(dip, DDI_INTR_TYPE_MSIX, i);

		if (msi_allocate_irq(irqno) < 0) {
			mutex_exit(&airq_mutex);
			return (i);
		}
		apic_vector_to_irq[vector] = (uchar_t)irqno;
		msi_vector_to_pirq[vector] = (uchar_t)irqno;
		apic_max_device_irq = max(irqno, apic_max_device_irq);
		apic_min_device_irq = min(irqno, apic_min_device_irq);
		irqptr = apic_irq_table[irqno];
		ASSERT(irqptr != NULL);
		irqptr->airq_vector = (uchar_t)vector;
		irqptr->airq_ipl = pri;
		irqptr->airq_origirq = (uchar_t)(inum + i);
		irqptr->airq_share_id = 0;
		irqptr->airq_mps_intr_index = MSIX_INDEX;
		irqptr->airq_dip = dip;
		irqptr->airq_major = major;
		irqptr->airq_cpu = IRQ_UNBOUND; /* will be bound when addspl */
	}
out:
	mutex_exit(&airq_mutex);
	return (rcount);
}


/*
 * This finds the apic_irq_t associated with the dip, ispec and type.
 * The entry should have already been freed, but it cannot have been
 * reused yet, since the hypervisor cannot have reassigned the pirq:
 * we have not freed that yet.
 */
static apic_irq_t *
msi_find_irq(dev_info_t *dip, struct intrspec *ispec)
{
	apic_irq_t *irqp;
	int i;

	for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
		if ((irqp = apic_irq_table[i]) == NULL)
			continue;
		if ((irqp->airq_dip == dip) &&
		    (irqp->airq_origirq == ispec->intrspec_vec) &&
		    (irqp->airq_ipl == ispec->intrspec_pri)) {
			return (irqp);
		}
	}
	return (NULL);
}

void
apic_free_vectors(dev_info_t *dip, int inum, int count, int pri, int type)
{
	int i, rc;
	physdev_unmap_pirq_t unmap_pirq;
	apic_irq_t *irqptr;
	struct intrspec ispec;

	DDI_INTR_IMPLDBG((CE_CONT, "apic_free_vectors: dip: %p inum: %x "
	    "count: %x pri: %x type: %x\n",
	    (void *)dip, inum, count, pri, type));

	/* for MSI/X only */
	if (!DDI_INTR_IS_MSI_OR_MSIX(type))
		return;

	for (i = 0; i < count; i++) {
		DDI_INTR_IMPLDBG((CE_CONT, "apic_free_vectors: inum=0x%x "
		    "pri=0x%x count=0x%x\n", inum, pri, count));
		ispec.intrspec_vec = inum + i;
		ispec.intrspec_pri = pri;
		if ((irqptr = msi_find_irq(dip, &ispec)) == NULL) {
			cmn_err(CE_WARN,
			    "couldn't find irq %s,%s dip: 0x%p vec: %x pri: %x",
			    ddi_get_name(dip), ddi_get_name_addr(dip),
			    (void *)dip, inum + i, pri);
			continue;
		}
		/*
		 * use PHYSDEVOP_unmap_pirq to have xen unmap MSI from a pirq
		 */
		unmap_pirq.domid = DOMID_SELF;
		unmap_pirq.pirq = msi_vector_to_pirq[irqptr->airq_vector];
		rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_pirq);
		if (rc < 0) {
			cmn_err(CE_WARN, "unmap pirq failed");
			return;
		}
		irqptr->airq_mps_intr_index = FREE_INDEX;
		apic_vector_to_irq[irqptr->airq_vector] = APIC_RESV_IRQ;
	}
}

/*
 * The hypervisor doesn't permit access to local apics directly
 */
/* ARGSUSED */
uint32_t *
mapin_apic(uint32_t addr, size_t len, int flags)
{
	/*
	 * Return a pointer to a memory area to fake out the
	 * probe code that wants to read apic registers.
	 * The dummy values will end up being ignored by xen
	 * later on when they are used anyway.
	 */
	xen_psm_dummy_apic[APIC_VERS_REG] = APIC_INTEGRATED_VERS;
	return (xen_psm_dummy_apic);
}

/* ARGSUSED */
uint32_t *
mapin_ioapic(uint32_t addr, size_t len, int flags)
{
	/*
	 * Return non-null here to fake out configure code that calls this.
	 * The i86xpv platform will not reference through the returned value.
	 */
	return ((uint32_t *)0x1);
}

/* ARGSUSED */
void
mapout_apic(caddr_t addr, size_t len)
{
}

/* ARGSUSED */
void
mapout_ioapic(caddr_t addr, size_t len)
{
}

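/*
 * Dom0 cannot touch the physical ioapics directly either; register
 * reads and writes are done via PHYSDEVOP_apic_read/PHYSDEVOP_apic_write
 * hypercalls instead.
 */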
uint32_t
ioapic_read(int apic_ix, uint32_t reg)
{
	physdev_apic_t apic;

	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
	apic.reg = reg;
	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic))
		panic("read ioapic %d reg %d failed", apic_ix, reg);
	return (apic.value);
}

void
ioapic_write(int apic_ix, uint32_t reg, uint32_t value)
{
	physdev_apic_t apic;

	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
	apic.reg = reg;
	apic.value = value;
	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
		panic("write ioapic %d reg %d failed", apic_ix, reg);
}

/*
 * This function was added as part of x2APIC support in pcplusmp.
 */
void
ioapic_write_eoi(int apic_ix, uint32_t value)
{
	physdev_apic_t apic;

	apic.apic_physbase = (unsigned long)apic_physaddr[apic_ix];
	apic.reg = APIC_IO_EOI;
	apic.value = value;
	if (HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic))
		panic("write ioapic reg : APIC_IO_EOI %d failed", apic_ix);
}

/*
 * This function was added as part of x2APIC support in pcplusmp to resolve
 * undefined symbol in xpv_psm.
 */
void
x2apic_update_psm()
{
}

/*
 * This function was added as part of x2APIC support in pcplusmp to resolve
 * undefined symbol in xpv_psm.
 */
void
apic_ret()
{
}

/*
 * Call rebind to do the actual programming.
 */
int
apic_setup_io_intr(void *p, int irq, boolean_t deferred)
{
	apic_irq_t *irqptr;
	struct ioapic_reprogram_data *drep = NULL;
	int rv, cpu;
	cpuset_t cpus;

	if (deferred) {
		drep = (struct ioapic_reprogram_data *)p;
		ASSERT(drep != NULL);
		irqptr = drep->irqp;
	} else {
		irqptr = (apic_irq_t *)p;
	}
	ASSERT(irqptr != NULL);
	/*
	 * Set cpu based on xen's idea of online cpus, not the apic tables.
	 * Note that xen ignores the target cpu field (or sets it to its own
	 * preferred value) when programming the ioapic anyway.
	 */
	if (irqptr->airq_mps_intr_index == MSI_INDEX)
		cpu = irqptr->airq_cpu; /* MSI cpus are already set */
	else {
		cpu = xen_psm_bind_intr(irq);
		irqptr->airq_cpu = cpu;
	}
	if (cpu == IRQ_UNBOUND) {
		CPUSET_ZERO(cpus);
		CPUSET_OR(cpus, xen_psm_cpus_online);
	} else {
		CPUSET_ONLY(cpus, cpu & ~IRQ_USER_BOUND);
	}
	rv = apic_rebind(irqptr, cpu, drep);
	if (rv) {
		/* CPU is not up or interrupt is disabled.  Fall back to 0 */
		cpu = 0;
		irqptr->airq_cpu = cpu;
		rv = apic_rebind(irqptr, cpu, drep);
	}
	/*
	 * If the rebind was successful, bind the irq to an event channel
	 */
	if (rv == 0) {
		ec_setup_pirq(irq, irqptr->airq_ipl, &cpus);
		CPUSET_FIND(cpus, cpu);
		apic_irq_table[irq]->airq_temp_cpu = cpu & ~IRQ_USER_BOUND;
	}
	return (rv);
}

/*
 * Allocate a new vector for the given irq
 */
/* ARGSUSED */
uchar_t
apic_modify_vector(uchar_t vector, int irq)
{
	return (apic_allocate_vector(0, irq, 0));
}

/*
 * The rest of the file is just generic psm module boilerplate
 */

static struct psm_ops xen_psm_ops = {
	xen_psm_probe,				/* psm_probe */

	xen_psm_softinit,			/* psm_init */
	xen_psm_picinit,			/* psm_picinit */
	xen_psm_intr_enter,			/* psm_intr_enter */
	xen_psm_intr_exit,			/* psm_intr_exit */
	xen_psm_setspl,				/* psm_setspl */
	xen_psm_addspl,				/* psm_addspl */
	xen_psm_delspl,				/* psm_delspl */
	xen_psm_disable_intr,			/* psm_disable_intr */
	xen_psm_enable_intr,			/* psm_enable_intr */
	(int (*)(int))NULL,			/* psm_softlvl_to_irq */
	(void (*)(int))NULL,			/* psm_set_softintr */
	(void (*)(processorid_t))NULL,		/* psm_set_idlecpu */
	(void (*)(processorid_t))NULL,		/* psm_unset_idlecpu */

	xen_psm_clkinit,			/* psm_clkinit */
	xen_psm_get_clockirq,			/* psm_get_clockirq */
	xen_psm_hrtimeinit,			/* psm_hrtimeinit */
	xpv_gethrtime,				/* psm_gethrtime */

	xen_psm_get_next_processorid,		/* psm_get_next_processorid */
	xen_psm_cpu_start,			/* psm_cpu_start */
	xen_psm_post_cpu_start,			/* psm_post_cpu_start */
	xen_psm_shutdown,			/* psm_shutdown */
	xen_psm_get_ipivect,			/* psm_get_ipivect */
	xen_psm_send_ipi,			/* psm_send_ipi */

	xen_psm_translate_irq,			/* psm_translate_irq */

	(void (*)(int, char *))NULL,		/* psm_notify_error */
	(void (*)(int msg))NULL,		/* psm_notify_func */
	xen_psm_timer_reprogram,		/* psm_timer_reprogram */
	xen_psm_timer_enable,			/* psm_timer_enable */
	xen_psm_timer_disable,			/* psm_timer_disable */
	(void (*)(void *arg))NULL,		/* psm_post_cyclic_setup */
	(void (*)(int, int))NULL,		/* psm_preshutdown */
	xen_intr_ops,			/* Advanced DDI Interrupt framework */
	(int (*)(psm_state_request_t *))NULL,	/* psm_state */
	(int (*)(psm_cpu_request_t *))NULL,	/* psm_cpu_ops */

	(int (*)(void))NULL,			/* psm_get_pir_ipivect */
	(void (*)(processorid_t))NULL,		/* psm_send_pir_ipi */
	(void (*)(processorid_t, boolean_t))NULL	/* psm_cmci_setup */
};

static struct psm_info xen_psm_info = {
	PSM_INFO_VER01_5,	/* version */
	PSM_OWN_EXCLUSIVE,	/* ownership */
	&xen_psm_ops,		/* operation */
	"xVM_psm",		/* machine name */
	"platform module"	/* machine description */
};

static void *xen_psm_hdlp;

int
_init(void)
{
	return (psm_mod_init(&xen_psm_hdlp, &xen_psm_info));
}

int
_fini(void)
{
	return (psm_mod_fini(&xen_psm_hdlp, &xen_psm_info));
}

int
_info(struct modinfo *modinfop)
{
	return (psm_mod_info(&xen_psm_hdlp, &xen_psm_info, modinfop));
}