xref: /freebsd/sys/kern/subr_intr.c (revision 5036d9652a5701d00e9e40ea942c278e9f77d33d)
1 /*-
2  * Copyright (c) 2015-2016 Svatopluk Kraus
3  * Copyright (c) 2015-2016 Michal Meloun
4  * All rights reserved.
5  * Copyright (c) 2015-2016 The FreeBSD Foundation
6  * Copyright (c) 2021 Jessica Clarke <jrtc27@FreeBSD.org>
7  *
8  * Portions of this software were developed by Andrew Turner under
9  * sponsorship from the FreeBSD Foundation.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 /*
35  *	New-style Interrupt Framework
36  *
37  *  TODO: - add support for disconnected PICs.
38  *        - support enabling IPIs (PPIs) on other CPUs that are already started.
39  *        - complete the support for removable PICs.
40  */
41 
42 #include "opt_ddb.h"
43 #include "opt_hwpmc_hooks.h"
44 #include "opt_iommu.h"
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/asan.h>
49 #include <sys/bitstring.h>
50 #include <sys/bus.h>
51 #include <sys/conf.h>
52 #include <sys/cpuset.h>
53 #include <sys/interrupt.h>
54 #include <sys/intr.h>
55 #include <sys/kernel.h>
56 #include <sys/lock.h>
57 #include <sys/malloc.h>
58 #include <sys/msan.h>
59 #include <sys/mutex.h>
60 #include <sys/proc.h>
61 #include <sys/queue.h>
62 #include <sys/rman.h>
63 #include <sys/sched.h>
64 #include <sys/smp.h>
65 #include <sys/sysctl.h>
66 #include <sys/syslog.h>
67 #include <sys/taskqueue.h>
68 #include <sys/tree.h>
69 #include <sys/vmmeter.h>
70 #ifdef HWPMC_HOOKS
71 #include <sys/pmckern.h>
72 #endif
73 
74 #include <machine/atomic.h>
75 #include <machine/cpu.h>
76 #include <machine/smp.h>
77 #include <machine/stdarg.h>
78 
79 #ifdef DDB
80 #include <ddb/ddb.h>
81 #endif
82 
83 #ifdef IOMMU
84 #include <dev/iommu/iommu_msi.h>
85 #endif
86 
87 #include "pic_if.h"
88 #include "msi_if.h"
89 
/* Size in bytes of one name slot in intrnames, including the NUL. */
90 #define	INTRNAME_LEN	(2*MAXCOMLEN + 1)
91 
92 /*
93  * Archs may define multiple roots with INTR_ROOT_NUM to support different kinds
94  * of interrupts (e.g. arm64 FIQs which use a different exception vector than
95  * IRQs).
96  */
97 #if !defined(INTR_ROOT_NUM)
98 #define	INTR_ROOT_NUM	1
99 #endif
100 
/*
 * debugf() prints the name of the calling function followed by the
 * formatted message when DEBUG is defined; otherwise it expands to nothing.
 */
101 #ifdef DEBUG
102 #define debugf(fmt, args...) do { printf("%s(): ", __func__);	\
103     printf(fmt,##args); } while (0)
104 #else
105 #define debugf(fmt, args...)
106 #endif
107 
/* Malloc type used for the allocations made by the interrupt framework. */
108 MALLOC_DECLARE(M_INTRNG);
109 MALLOC_DEFINE(M_INTRNG, "intr", "intr interrupt handling");
110 
111 /* Root interrupt controller stuff. */
112 struct intr_irq_root {
113 	device_t dev;			/* PIC that claimed this root */
114 	intr_irq_filter_t *filter;	/* called from intr_irq_handler() */
115 	void *arg;			/* argument handed to filter */
116 };
117 
/* One slot per root; a slot is filled in by intr_pic_claim_root(). */
118 static struct intr_irq_root intr_irq_roots[INTR_ROOT_NUM];
119 
/*
 * A child controller hooked onto a parent PIC. intr_child_irq_handler()
 * hands the parent's interrupts in [pc_start, pc_start + pc_length) to
 * pc_filter.
 */
120 struct intr_pic_child {
121 	SLIST_ENTRY(intr_pic_child)	 pc_next;	/* parent's pic_children */
122 	struct intr_pic			*pc_pic;	/* the child PIC */
123 	intr_child_irq_filter_t		*pc_filter;	/* child's filter routine */
124 	void				*pc_filter_arg;	/* argument for pc_filter */
125 	uintptr_t			 pc_start;	/* first irq of the range */
126 	uintptr_t			 pc_length;	/* number of irqs in range */
127 };
128 
129 /* Interrupt controller definition. */
130 struct intr_pic {
131 	SLIST_ENTRY(intr_pic)	pic_next;	/* linkage on pic_list */
132 	intptr_t		pic_xref;	/* hardware identification */
133 	device_t		pic_dev;	/* device of the controller */
134 /* Only one of FLAG_PIC or FLAG_MSI may be set */
135 #define	FLAG_PIC	(1 << 0)
136 #define	FLAG_MSI	(1 << 1)
137 #define	FLAG_TYPE_MASK	(FLAG_PIC | FLAG_MSI)
138 	u_int			pic_flags;	/* FLAG_* controller type */
139 	struct mtx		pic_child_lock;	/* spin mutex for pic_children */
140 	SLIST_HEAD(, intr_pic_child) pic_children; /* attached child PICs */
141 };
142 
143 #ifdef SMP
144 #define INTR_IPI_NAMELEN	(MAXCOMLEN + 1)
145 
/*
 * Per-IPI bookkeeping; ipi_sources[] holds one of these per IPI.
 * NOTE(review): the code that fills in and uses these fields is outside
 * this part of the file.
 */
146 struct intr_ipi {
147 	intr_ipi_handler_t	*ii_handler;
148 	void			*ii_handler_arg;
149 	struct intr_irqsrc	*ii_isrc;
150 	char			ii_name[INTR_IPI_NAMELEN];
151 	u_long			*ii_count;
152 };
153 
/* NOTE(review): presumably describe the device selected to deliver IPIs. */
154 static device_t intr_ipi_dev;
155 static u_int intr_ipi_dev_priority;
156 static bool intr_ipi_dev_frozen;
157 #endif
158 
/* List of created interrupt controllers, protected by pic_list_lock. */
159 static struct mtx pic_list_lock;
160 static SLIST_HEAD(, intr_pic) pic_list;
161 
162 static struct intr_pic *pic_lookup(device_t dev, intptr_t xref, u_int flags);
163 
164 /* Interrupt source definition. */
165 static struct mtx isrc_table_lock;	/* protects the two items below */
166 static struct intr_irqsrc **irq_sources; /* intr_nirq slots, by irq number */
167 static u_int irq_next_free;		/* where the next slot search starts */
168 
169 #ifdef SMP
/*
 * While false, intr_isrc_assign_cpu() only records the requested binding in
 * the interrupt source and does not call PIC_BIND_INTR().
 */
170 #ifdef EARLY_AP_STARTUP
171 static bool irq_assign_cpu = true;
172 #else
173 static bool irq_assign_cpu = false;
174 #endif
175 
176 static struct intr_ipi ipi_sources[INTR_IPI_COUNT];
177 #endif
178 
/* Number of slots in irq_sources; read-only tunable machdep.nirq. */
179 u_int intr_nirq = NIRQ;
180 SYSCTL_UINT(_machdep, OID_AUTO, nirq, CTLFLAG_RDTUN, &intr_nirq, 0,
181     "Number of IRQs");
182 
183 /* Data for MI statistics reporting. */
184 u_long *intrcnt;		/* nintrcnt counters */
185 char *intrnames;		/* nintrcnt names of INTRNAME_LEN bytes each */
186 size_t sintrcnt;		/* size of intrcnt in bytes */
187 size_t sintrnames;		/* size of intrnames in bytes */
188 int nintrcnt;			/* number of counters */
189 static bitstr_t *intrcnt_bitmap; /* set bit = counter slot is allocated */
190 
/*
 * Forward declarations of the resource-id map helpers used below.
 * NOTE(review): their definitions are outside this part of the file.
 */
191 static struct intr_irqsrc *intr_map_get_isrc(u_int res_id);
192 static void intr_map_set_isrc(u_int res_id, struct intr_irqsrc *isrc);
193 static struct intr_map_data * intr_map_get_map_data(u_int res_id);
194 static void intr_map_copy_map_data(u_int res_id, device_t *dev, intptr_t *xref,
195     struct intr_map_data **data);
196 
197 /*
198  *  Interrupt framework initialization routine.
199  */
200 static void
201 intr_irq_init(void *dummy __unused)
202 {
203 
204 	SLIST_INIT(&pic_list);
205 	mtx_init(&pic_list_lock, "intr pic list", NULL, MTX_DEF);
206 
207 	mtx_init(&isrc_table_lock, "intr isrc table", NULL, MTX_DEF);
208 
209 	/*
210 	 * - 2 counters for each I/O interrupt.
211 	 * - mp_maxid + 1 counters for each IPI counters for SMP.
212 	 */
213 	nintrcnt = intr_nirq * 2;
214 #ifdef SMP
215 	nintrcnt += INTR_IPI_COUNT * (mp_maxid + 1);
216 #endif
217 
218 	intrcnt = mallocarray(nintrcnt, sizeof(u_long), M_INTRNG,
219 	    M_WAITOK | M_ZERO);
220 	intrnames = mallocarray(nintrcnt, INTRNAME_LEN, M_INTRNG,
221 	    M_WAITOK | M_ZERO);
222 	sintrcnt = nintrcnt * sizeof(u_long);
223 	sintrnames = nintrcnt * INTRNAME_LEN;
224 
225 	/* Allocate the bitmap tracking counter allocations. */
226 	intrcnt_bitmap = bit_alloc(nintrcnt, M_INTRNG, M_WAITOK | M_ZERO);
227 
228 	irq_sources = mallocarray(intr_nirq, sizeof(struct intr_irqsrc*),
229 	    M_INTRNG, M_WAITOK | M_ZERO);
230 }
231 SYSINIT(intr_irq_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_irq_init, NULL);
232 
233 static void
234 intrcnt_setname(const char *name, int index)
235 {
236 
237 	snprintf(intrnames + INTRNAME_LEN * index, INTRNAME_LEN, "%-*s",
238 	    INTRNAME_LEN - 1, name);
239 }
240 
241 /*
242  *  Update name for interrupt source with interrupt event.
243  */
244 static void
245 intrcnt_updatename(struct intr_irqsrc *isrc)
246 {
247 
248 	/* QQQ: What about stray counter name? */
249 	mtx_assert(&isrc_table_lock, MA_OWNED);
250 	intrcnt_setname(isrc->isrc_event->ie_fullname, isrc->isrc_index);
251 }
252 
253 /*
254  *  Virtualization for interrupt source interrupt counter increment.
255  */
256 static inline void
257 isrc_increment_count(struct intr_irqsrc *isrc)
258 {
259 
260 	if (isrc->isrc_flags & INTR_ISRCF_PPI)
261 		atomic_add_long(&isrc->isrc_count[0], 1);
262 	else
263 		isrc->isrc_count[0]++;
264 }
265 
266 /*
267  *  Virtualization for interrupt source interrupt stray counter increment.
268  */
269 static inline void
270 isrc_increment_straycount(struct intr_irqsrc *isrc)
271 {
272 
273 	isrc->isrc_count[1]++;
274 }
275 
276 /*
277  *  Virtualization for interrupt source interrupt name update.
278  */
279 static void
280 isrc_update_name(struct intr_irqsrc *isrc, const char *name)
281 {
282 	char str[INTRNAME_LEN];
283 
284 	mtx_assert(&isrc_table_lock, MA_OWNED);
285 
286 	if (name != NULL) {
287 		snprintf(str, INTRNAME_LEN, "%s: %s", isrc->isrc_name, name);
288 		intrcnt_setname(str, isrc->isrc_index);
289 		snprintf(str, INTRNAME_LEN, "stray %s: %s", isrc->isrc_name,
290 		    name);
291 		intrcnt_setname(str, isrc->isrc_index + 1);
292 	} else {
293 		snprintf(str, INTRNAME_LEN, "%s:", isrc->isrc_name);
294 		intrcnt_setname(str, isrc->isrc_index);
295 		snprintf(str, INTRNAME_LEN, "stray %s:", isrc->isrc_name);
296 		intrcnt_setname(str, isrc->isrc_index + 1);
297 	}
298 }
299 
300 /*
301  *  Virtualization for interrupt source interrupt counters setup.
302  */
303 static void
304 isrc_setup_counters(struct intr_irqsrc *isrc)
305 {
306 	int index;
307 
308 	mtx_assert(&isrc_table_lock, MA_OWNED);
309 
310 	/*
311 	 * Allocate two counter values, the second tracking "stray" interrupts.
312 	 */
313 	bit_ffc_area(intrcnt_bitmap, nintrcnt, 2, &index);
314 	if (index == -1)
315 		panic("Failed to allocate 2 counters. Array exhausted?");
316 	bit_nset(intrcnt_bitmap, index, index + 1);
317 	isrc->isrc_index = index;
318 	isrc->isrc_count = &intrcnt[index];
319 	isrc_update_name(isrc, NULL);
320 }
321 
322 /*
323  *  Virtualization for interrupt source interrupt counters release.
324  */
325 static void
326 isrc_release_counters(struct intr_irqsrc *isrc)
327 {
328 	int idx = isrc->isrc_index;
329 
330 	mtx_assert(&isrc_table_lock, MA_OWNED);
331 
332 	bit_nclear(intrcnt_bitmap, idx, idx + 1);
333 }
334 
335 /*
336  *  Main interrupt dispatch handler. It's called straight
337  *  from the assembler, where CPU interrupt is served.
338  */
339 void
340 intr_irq_handler(struct trapframe *tf, uint32_t rootnum)
341 {
342 	struct trapframe * oldframe;
343 	struct thread * td;
344 	struct intr_irq_root *root;
345 
346 	KASSERT(rootnum < INTR_ROOT_NUM,
347 	    ("%s: invalid interrupt root %d", __func__, rootnum));
348 
349 	root = &intr_irq_roots[rootnum];
350 	KASSERT(root->filter != NULL, ("%s: no filter", __func__));
351 
352 	kasan_mark(tf, sizeof(*tf), sizeof(*tf), 0);
353 	kmsan_mark(tf, sizeof(*tf), KMSAN_STATE_INITED);
354 
355 	VM_CNT_INC(v_intr);
356 	critical_enter();
357 	td = curthread;
358 	oldframe = td->td_intr_frame;
359 	td->td_intr_frame = tf;
360 	(root->filter)(root->arg);
361 	td->td_intr_frame = oldframe;
362 	critical_exit();
363 #ifdef HWPMC_HOOKS
364 	if (pmc_hook && TRAPF_USERMODE(tf) &&
365 	    (PCPU_GET(curthread)->td_pflags & TDP_CALLCHAIN))
366 		pmc_hook(PCPU_GET(curthread), PMC_FN_USER_CALLCHAIN, tf);
367 #endif
368 }
369 
370 int
371 intr_child_irq_handler(struct intr_pic *parent, uintptr_t irq)
372 {
373 	struct intr_pic_child *child;
374 	bool found;
375 
376 	found = false;
377 	mtx_lock_spin(&parent->pic_child_lock);
378 	SLIST_FOREACH(child, &parent->pic_children, pc_next) {
379 		if (child->pc_start <= irq &&
380 		    irq < (child->pc_start + child->pc_length)) {
381 			found = true;
382 			break;
383 		}
384 	}
385 	mtx_unlock_spin(&parent->pic_child_lock);
386 
387 	if (found)
388 		return (child->pc_filter(child->pc_filter_arg, irq));
389 
390 	return (FILTER_STRAY);
391 }
392 
393 /*
394  *  interrupt controller dispatch function for interrupts. It should
395  *  be called straight from the interrupt controller, when associated interrupt
396  *  source is learned.
397  */
398 int
399 intr_isrc_dispatch(struct intr_irqsrc *isrc, struct trapframe *tf)
400 {
401 
402 	KASSERT(isrc != NULL, ("%s: no source", __func__));
403 
404 	if ((isrc->isrc_flags & INTR_ISRCF_IPI) == 0)
405 		isrc_increment_count(isrc);
406 
407 #ifdef INTR_SOLO
408 	if (isrc->isrc_filter != NULL) {
409 		int error;
410 		error = isrc->isrc_filter(isrc->isrc_arg, tf);
411 		PIC_POST_FILTER(isrc->isrc_dev, isrc);
412 		if (error == FILTER_HANDLED)
413 			return (0);
414 	} else
415 #endif
416 	if (isrc->isrc_event != NULL) {
417 		if (intr_event_handle(isrc->isrc_event, tf) == 0)
418 			return (0);
419 	}
420 
421 	if ((isrc->isrc_flags & INTR_ISRCF_IPI) == 0)
422 		isrc_increment_straycount(isrc);
423 	return (EINVAL);
424 }
425 
426 /*
427  *  Alloc unique interrupt number (resource handle) for interrupt source.
428  *
429  *  There could be various strategies how to allocate free interrupt number
430  *  (resource handle) for new interrupt source.
431  *
432  *  1. Handles are always allocated forward, so handles are not recycled
433  *     immediately. However, if only one free handle left which is reused
434  *     constantly...
435  */
436 static inline int
437 isrc_alloc_irq(struct intr_irqsrc *isrc)
438 {
439 	u_int irq;
440 
441 	mtx_assert(&isrc_table_lock, MA_OWNED);
442 
443 	if (irq_next_free >= intr_nirq)
444 		return (ENOSPC);
445 
446 	for (irq = irq_next_free; irq < intr_nirq; irq++) {
447 		if (irq_sources[irq] == NULL)
448 			goto found;
449 	}
450 	for (irq = 0; irq < irq_next_free; irq++) {
451 		if (irq_sources[irq] == NULL)
452 			goto found;
453 	}
454 
455 	irq_next_free = intr_nirq;
456 	return (ENOSPC);
457 
458 found:
459 	isrc->isrc_irq = irq;
460 	irq_sources[irq] = isrc;
461 
462 	irq_next_free = irq + 1;
463 	if (irq_next_free >= intr_nirq)
464 		irq_next_free = 0;
465 	return (0);
466 }
467 
468 /*
469  *  Free unique interrupt number (resource handle) from interrupt source.
470  */
471 static inline int
472 isrc_free_irq(struct intr_irqsrc *isrc)
473 {
474 
475 	mtx_assert(&isrc_table_lock, MA_OWNED);
476 
477 	if (isrc->isrc_irq >= intr_nirq)
478 		return (EINVAL);
479 	if (irq_sources[isrc->isrc_irq] != isrc)
480 		return (EINVAL);
481 
482 	irq_sources[isrc->isrc_irq] = NULL;
483 	isrc->isrc_irq = INTR_IRQ_INVALID;	/* just to be safe */
484 
485 	/*
486 	 * If we are recovering from the state irq_sources table is full,
487 	 * then the following allocation should check the entire table. This
488 	 * will ensure maximum separation of allocation order from release
489 	 * order.
490 	 */
491 	if (irq_next_free >= intr_nirq)
492 		irq_next_free = 0;
493 
494 	return (0);
495 }
496 
497 device_t
498 intr_irq_root_device(uint32_t rootnum)
499 {
500 	KASSERT(rootnum < INTR_ROOT_NUM,
501 	    ("%s: invalid interrupt root %d", __func__, rootnum));
502 	return (intr_irq_roots[rootnum].dev);
503 }
504 
505 /*
506  *  Initialize interrupt source and register it into global interrupt table.
507  */
508 int
509 intr_isrc_register(struct intr_irqsrc *isrc, device_t dev, u_int flags,
510     const char *fmt, ...)
511 {
512 	int error;
513 	va_list ap;
514 
515 	bzero(isrc, sizeof(struct intr_irqsrc));
516 	isrc->isrc_dev = dev;
517 	isrc->isrc_irq = INTR_IRQ_INVALID;	/* just to be safe */
518 	isrc->isrc_flags = flags;
519 
520 	va_start(ap, fmt);
521 	vsnprintf(isrc->isrc_name, INTR_ISRC_NAMELEN, fmt, ap);
522 	va_end(ap);
523 
524 	mtx_lock(&isrc_table_lock);
525 	error = isrc_alloc_irq(isrc);
526 	if (error != 0) {
527 		mtx_unlock(&isrc_table_lock);
528 		return (error);
529 	}
530 	/*
531 	 * Setup interrupt counters, but not for IPI sources. Those are setup
532 	 * later and only for used ones (up to INTR_IPI_COUNT) to not exhaust
533 	 * our counter pool.
534 	 */
535 	if ((isrc->isrc_flags & INTR_ISRCF_IPI) == 0)
536 		isrc_setup_counters(isrc);
537 	mtx_unlock(&isrc_table_lock);
538 	return (0);
539 }
540 
541 /*
542  *  Deregister interrupt source from global interrupt table.
543  */
544 int
545 intr_isrc_deregister(struct intr_irqsrc *isrc)
546 {
547 	int error;
548 
549 	mtx_lock(&isrc_table_lock);
550 	if ((isrc->isrc_flags & INTR_ISRCF_IPI) == 0)
551 		isrc_release_counters(isrc);
552 	error = isrc_free_irq(isrc);
553 	mtx_unlock(&isrc_table_lock);
554 	return (error);
555 }
556 
#ifdef SMP
/*
 *  A support function for a PIC to decide if provided ISRC should be inited
 *  on given cpu. Only sources with at least one handler that are flagged
 *  INTR_ISRCF_PPI or INTR_ISRCF_IPI qualify. For those:
 *
 *     If INTR_ISRCF_BOUND is set, the ISRC should be inited only on cpus
 *     set in isrc_cpu. If not, the ISRC should be inited on every cpu and
 *     isrc_cpu is kept consistent with it. Thus isrc_cpu is always correct.
 */
bool
intr_isrc_init_on_cpu(struct intr_irqsrc *isrc, u_int cpu)
{
	const u_int percpu = INTR_ISRCF_PPI | INTR_ISRCF_IPI;

	if (isrc->isrc_handlers == 0 || (isrc->isrc_flags & percpu) == 0)
		return (false);

	if ((isrc->isrc_flags & INTR_ISRCF_BOUND) == 0) {
		/* Unbound: record the cpu and init there. */
		CPU_SET(cpu, &isrc->isrc_cpu);
		return (true);
	}

	return (CPU_ISSET(cpu, &isrc->isrc_cpu));
}
#endif
582 
#ifdef INTR_SOLO
/*
 *  Setup filter into interrupt source.
 *
 *  Returns EINVAL for a NULL filter and EBUSY when the source already has
 *  a solo filter or an interrupt event. On success the filter is
 *  installed, the counters are renamed after 'name' and the source itself
 *  is stored in *cookiep.
 */
static int
iscr_setup_filter(struct intr_irqsrc *isrc, const char *name,
    intr_irq_filter_t *filter, void *arg, void **cookiep)
{
	int error;

	if (filter == NULL)
		return (EINVAL);

	error = 0;
	mtx_lock(&isrc_table_lock);
	/*
	 * Make sure that we do not mix the two ways
	 * how we handle interrupt sources.
	 */
	if (isrc->isrc_filter == NULL && isrc->isrc_event == NULL) {
		isrc->isrc_filter = filter;
		isrc->isrc_arg = arg;
		isrc_update_name(isrc, name);
	} else
		error = EBUSY;
	mtx_unlock(&isrc_table_lock);

	if (error == 0)
		*cookiep = isrc;
	return (error);
}
#endif
613 
614 /*
615  *  Interrupt source pre_ithread method for MI interrupt framework.
616  */
617 static void
618 intr_isrc_pre_ithread(void *arg)
619 {
620 	struct intr_irqsrc *isrc = arg;
621 
622 	PIC_PRE_ITHREAD(isrc->isrc_dev, isrc);
623 }
624 
625 /*
626  *  Interrupt source post_ithread method for MI interrupt framework.
627  */
628 static void
629 intr_isrc_post_ithread(void *arg)
630 {
631 	struct intr_irqsrc *isrc = arg;
632 
633 	PIC_POST_ITHREAD(isrc->isrc_dev, isrc);
634 }
635 
636 /*
637  *  Interrupt source post_filter method for MI interrupt framework.
638  */
639 static void
640 intr_isrc_post_filter(void *arg)
641 {
642 	struct intr_irqsrc *isrc = arg;
643 
644 	PIC_POST_FILTER(isrc->isrc_dev, isrc);
645 }
646 
/*
 *  Interrupt source assign_cpu method for MI interrupt framework.
 *
 *  'arg' is the interrupt source. A 'cpu' of NOCPU removes the binding,
 *  any other value binds the source to exactly that cpu. Returns 0 on
 *  success, the error from PIC_BIND_INTR() on failure, or EOPNOTSUPP on
 *  kernels built without SMP.
 */
static int
intr_isrc_assign_cpu(void *arg, int cpu)
{
#ifdef SMP
	struct intr_irqsrc *isrc = arg;
	int error = 0;

	mtx_lock(&isrc_table_lock);
	if (cpu != NOCPU) {
		CPU_SETOF(cpu, &isrc->isrc_cpu);
		isrc->isrc_flags |= INTR_ISRCF_BOUND;
	} else {
		CPU_ZERO(&isrc->isrc_cpu);
		isrc->isrc_flags &= ~INTR_ISRCF_BOUND;
	}

	/*
	 * In NOCPU case, it's up to PIC to either leave ISRC on same CPU or
	 * re-balance it to another CPU or enable it on more CPUs. However,
	 * PIC is expected to change isrc_cpu appropriately to keep us well
	 * informed if the call is successful.
	 */
	if (irq_assign_cpu) {
		error = PIC_BIND_INTR(isrc->isrc_dev, isrc);
		if (error != 0)
			CPU_ZERO(&isrc->isrc_cpu);
	}
	mtx_unlock(&isrc_table_lock);

	return (error);
#else
	return (EOPNOTSUPP);
#endif
}
686 
687 /*
 *  Create interrupt event for interrupt source.
 *
 *  A new event is created first, without isrc_table_lock held, and is
 *  installed into the source only if the source has neither an event nor
 *  (with INTR_SOLO) a solo filter by then; otherwise the new event is
 *  destroyed again.
 *
 *  Returns the error from intr_event_create(), EBUSY when the source
 *  turned out to have an event already, and 0 otherwise.
 *
 *  NOTE(review): when only a solo filter blocks the installation, 0 is
 *  returned although no event was installed - confirm that callers expect
 *  this.
 */
690 static int
691 isrc_event_create(struct intr_irqsrc *isrc)
692 {
693 	struct intr_event *ie;
694 	int error;
695 
	/* The ithread/filter/assign_cpu hooks all get isrc as argument. */
696 	error = intr_event_create(&ie, isrc, 0, isrc->isrc_irq,
697 	    intr_isrc_pre_ithread, intr_isrc_post_ithread, intr_isrc_post_filter,
698 	    intr_isrc_assign_cpu, "%s:", isrc->isrc_name);
699 	if (error)
700 		return (error);
701 
702 	mtx_lock(&isrc_table_lock);
703 	/*
	 * Make sure that we do not mix the two ways
	 * how we handle interrupt sources. Let the contested event win.
	 */
707 #ifdef INTR_SOLO
708 	if (isrc->isrc_filter != NULL || isrc->isrc_event != NULL) {
709 #else
710 	if (isrc->isrc_event != NULL) {
711 #endif
712 		mtx_unlock(&isrc_table_lock);
713 		intr_event_destroy(ie);
		/*
		 * NOTE(review): isrc_event is re-read here after the lock
		 * has been dropped.
		 */
714 		return (isrc->isrc_event != NULL ? EBUSY : 0);
715 	}
716 	isrc->isrc_event = ie;
717 	mtx_unlock(&isrc_table_lock);
718 
719 	return (0);
720 }
#ifdef notyet
/*
 *  Destroy interrupt event for interrupt source.
 *
 *  The event is detached from the source under isrc_table_lock and
 *  destroyed after the lock has been dropped.
 */
static void
isrc_event_destroy(struct intr_irqsrc *isrc)
{
	struct intr_event *old;

	mtx_lock(&isrc_table_lock);
	old = isrc->isrc_event;
	isrc->isrc_event = NULL;
	mtx_unlock(&isrc_table_lock);

	if (old == NULL)
		return;
	intr_event_destroy(old);
}
#endif
739 /*
740  *  Add handler to interrupt source.
741  */
742 static int
743 isrc_add_handler(struct intr_irqsrc *isrc, const char *name,
744     driver_filter_t filter, driver_intr_t handler, void *arg,
745     enum intr_type flags, void **cookiep)
746 {
747 	int error;
748 
749 	if (isrc->isrc_event == NULL) {
750 		error = isrc_event_create(isrc);
751 		if (error)
752 			return (error);
753 	}
754 
755 	error = intr_event_add_handler(isrc->isrc_event, name, filter, handler,
756 	    arg, intr_priority(flags), flags, cookiep);
757 	if (error == 0) {
758 		mtx_lock(&isrc_table_lock);
759 		intrcnt_updatename(isrc);
760 		mtx_unlock(&isrc_table_lock);
761 	}
762 
763 	return (error);
764 }
765 
766 /*
767  *  Lookup interrupt controller locked.
768  */
769 static inline struct intr_pic *
770 pic_lookup_locked(device_t dev, intptr_t xref, u_int flags)
771 {
772 	struct intr_pic *pic;
773 
774 	mtx_assert(&pic_list_lock, MA_OWNED);
775 
776 	if (dev == NULL && xref == 0)
777 		return (NULL);
778 
779 	/* Note that pic->pic_dev is never NULL on registered PIC. */
780 	SLIST_FOREACH(pic, &pic_list, pic_next) {
781 		if ((pic->pic_flags & FLAG_TYPE_MASK) !=
782 		    (flags & FLAG_TYPE_MASK))
783 			continue;
784 
785 		if (dev == NULL) {
786 			if (xref == pic->pic_xref)
787 				return (pic);
788 		} else if (xref == 0 || pic->pic_xref == 0) {
789 			if (dev == pic->pic_dev)
790 				return (pic);
791 		} else if (xref == pic->pic_xref && dev == pic->pic_dev)
792 				return (pic);
793 	}
794 	return (NULL);
795 }
796 
797 /*
798  *  Lookup interrupt controller.
799  */
800 static struct intr_pic *
801 pic_lookup(device_t dev, intptr_t xref, u_int flags)
802 {
803 	struct intr_pic *pic;
804 
805 	mtx_lock(&pic_list_lock);
806 	pic = pic_lookup_locked(dev, xref, flags);
807 	mtx_unlock(&pic_list_lock);
808 	return (pic);
809 }
810 
811 /*
812  *  Create interrupt controller.
813  */
814 static struct intr_pic *
815 pic_create(device_t dev, intptr_t xref, u_int flags)
816 {
817 	struct intr_pic *pic;
818 
819 	mtx_lock(&pic_list_lock);
820 	pic = pic_lookup_locked(dev, xref, flags);
821 	if (pic != NULL) {
822 		mtx_unlock(&pic_list_lock);
823 		return (pic);
824 	}
825 	pic = malloc(sizeof(*pic), M_INTRNG, M_NOWAIT | M_ZERO);
826 	if (pic == NULL) {
827 		mtx_unlock(&pic_list_lock);
828 		return (NULL);
829 	}
830 	pic->pic_xref = xref;
831 	pic->pic_dev = dev;
832 	pic->pic_flags = flags;
833 	mtx_init(&pic->pic_child_lock, "pic child lock", NULL, MTX_SPIN);
834 	SLIST_INSERT_HEAD(&pic_list, pic, pic_next);
835 	mtx_unlock(&pic_list_lock);
836 
837 	return (pic);
838 }
#ifdef notyet
/*
 *  Destroy interrupt controller.
 *
 *  The matching controller, if any, is taken off pic_list under
 *  pic_list_lock and freed after the lock has been dropped.
 */
static void
pic_destroy(device_t dev, intptr_t xref, u_int flags)
{
	struct intr_pic *victim;

	mtx_lock(&pic_list_lock);
	victim = pic_lookup_locked(dev, xref, flags);
	if (victim != NULL)
		SLIST_REMOVE(&pic_list, victim, intr_pic, pic_next);
	mtx_unlock(&pic_list_lock);

	if (victim != NULL)
		free(victim, M_INTRNG);
}
#endif
860 /*
861  *  Register interrupt controller.
862  */
863 struct intr_pic *
864 intr_pic_register(device_t dev, intptr_t xref)
865 {
866 	struct intr_pic *pic;
867 
868 	if (dev == NULL)
869 		return (NULL);
870 	pic = pic_create(dev, xref, FLAG_PIC);
871 	if (pic == NULL)
872 		return (NULL);
873 
874 	debugf("PIC %p registered for %s <dev %p, xref %jx>\n", pic,
875 	    device_get_nameunit(dev), dev, (uintmax_t)xref);
876 	return (pic);
877 }
878 
879 /*
 *  Unregister interrupt controller.
 *
 *  Not implemented yet: any call panics. Neither 'dev' nor 'xref' is
 *  looked at.
 */
882 int
883 intr_pic_deregister(device_t dev, intptr_t xref)
884 {
885 
886 	panic("%s: not implemented", __func__);
887 }
888 
889 /*
890  *  Mark interrupt controller (itself) as a root one.
891  *
892  *  Note that only an interrupt controller can really know its position
893  *  in interrupt controller's tree. So root PIC must claim itself as a root.
894  *
895  *  In FDT case, according to ePAPR approved version 1.1 from 08 April 2011,
896  *  page 30:
897  *    "The root of the interrupt tree is determined when traversal
898  *     of the interrupt tree reaches an interrupt controller node without
899  *     an interrupts property and thus no explicit interrupt parent."
900  */
901 int
902 intr_pic_claim_root(device_t dev, intptr_t xref, intr_irq_filter_t *filter,
903     void *arg, uint32_t rootnum)
904 {
905 	struct intr_pic *pic;
906 	struct intr_irq_root *root;
907 
908 	pic = pic_lookup(dev, xref, FLAG_PIC);
909 	if (pic == NULL) {
910 		device_printf(dev, "not registered\n");
911 		return (EINVAL);
912 	}
913 
914 	KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_PIC,
915 	    ("%s: Found a non-PIC controller: %s", __func__,
916 	     device_get_name(pic->pic_dev)));
917 
918 	if (filter == NULL) {
919 		device_printf(dev, "filter missing\n");
920 		return (EINVAL);
921 	}
922 
923 	/*
924 	 * Only one interrupt controllers could be on the root for now.
925 	 * Note that we further suppose that there is not threaded interrupt
926 	 * routine (handler) on the root. See intr_irq_handler().
927 	 */
928 	KASSERT(rootnum < INTR_ROOT_NUM,
929 	    ("%s: invalid interrupt root %d", __func__, rootnum));
930 	root = &intr_irq_roots[rootnum];
931 	if (root->dev != NULL) {
932 		device_printf(dev, "another root already set\n");
933 		return (EBUSY);
934 	}
935 
936 	root->dev = dev;
937 	root->filter = filter;
938 	root->arg = arg;
939 
940 	debugf("irq root set to %s\n", device_get_nameunit(dev));
941 	return (0);
942 }
943 
944 /*
945  * Add a handler to manage a sub range of a parents interrupts.
946  */
947 int
948 intr_pic_add_handler(device_t parent, struct intr_pic *pic,
949     intr_child_irq_filter_t *filter, void *arg, uintptr_t start,
950     uintptr_t length)
951 {
952 	struct intr_pic *parent_pic;
953 	struct intr_pic_child *newchild;
954 #ifdef INVARIANTS
955 	struct intr_pic_child *child;
956 #endif
957 
958 	/* Find the parent PIC */
959 	parent_pic = pic_lookup(parent, 0, FLAG_PIC);
960 	if (parent_pic == NULL)
961 		return (ENXIO);
962 
963 	newchild = malloc(sizeof(*newchild), M_INTRNG, M_WAITOK | M_ZERO);
964 	newchild->pc_pic = pic;
965 	newchild->pc_filter = filter;
966 	newchild->pc_filter_arg = arg;
967 	newchild->pc_start = start;
968 	newchild->pc_length = length;
969 
970 	mtx_lock_spin(&parent_pic->pic_child_lock);
971 #ifdef INVARIANTS
972 	SLIST_FOREACH(child, &parent_pic->pic_children, pc_next) {
973 		KASSERT(child->pc_pic != pic, ("%s: Adding a child PIC twice",
974 		    __func__));
975 	}
976 #endif
977 	SLIST_INSERT_HEAD(&parent_pic->pic_children, newchild, pc_next);
978 	mtx_unlock_spin(&parent_pic->pic_child_lock);
979 
980 	return (0);
981 }
982 
983 static int
984 intr_resolve_irq(device_t dev, intptr_t xref, struct intr_map_data *data,
985     struct intr_irqsrc **isrc)
986 {
987 	struct intr_pic *pic;
988 	struct intr_map_data_msi *msi;
989 
990 	if (data == NULL)
991 		return (EINVAL);
992 
993 	pic = pic_lookup(dev, xref,
994 	    (data->type == INTR_MAP_DATA_MSI) ? FLAG_MSI : FLAG_PIC);
995 	if (pic == NULL)
996 		return (ESRCH);
997 
998 	switch (data->type) {
999 	case INTR_MAP_DATA_MSI:
1000 		KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI,
1001 		    ("%s: Found a non-MSI controller: %s", __func__,
1002 		     device_get_name(pic->pic_dev)));
1003 		msi = (struct intr_map_data_msi *)data;
1004 		*isrc = msi->isrc;
1005 		return (0);
1006 
1007 	default:
1008 		KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_PIC,
1009 		    ("%s: Found a non-PIC controller: %s", __func__,
1010 		     device_get_name(pic->pic_dev)));
1011 		return (PIC_MAP_INTR(pic->pic_dev, data, isrc));
1012 	}
1013 }
1014 
1015 bool
1016 intr_is_per_cpu(struct resource *res)
1017 {
1018 	u_int res_id;
1019 	struct intr_irqsrc *isrc;
1020 
1021 	res_id = (u_int)rman_get_start(res);
1022 	isrc = intr_map_get_isrc(res_id);
1023 
1024 	if (isrc == NULL)
1025 		panic("Attempt to get isrc for non-active resource id: %u\n",
1026 		    res_id);
1027 	return ((isrc->isrc_flags & INTR_ISRCF_PPI) != 0);
1028 }
1029 
1030 int
1031 intr_activate_irq(device_t dev, struct resource *res)
1032 {
1033 	device_t map_dev;
1034 	intptr_t map_xref;
1035 	struct intr_map_data *data;
1036 	struct intr_irqsrc *isrc;
1037 	u_int res_id;
1038 	int error;
1039 
1040 	KASSERT(rman_get_start(res) == rman_get_end(res),
1041 	    ("%s: more interrupts in resource", __func__));
1042 
1043 	res_id = (u_int)rman_get_start(res);
1044 	if (intr_map_get_isrc(res_id) != NULL)
1045 		panic("Attempt to double activation of resource id: %u\n",
1046 		    res_id);
1047 	intr_map_copy_map_data(res_id, &map_dev, &map_xref, &data);
1048 	error = intr_resolve_irq(map_dev, map_xref, data, &isrc);
1049 	if (error != 0) {
1050 		free(data, M_INTRNG);
1051 		/* XXX TODO DISCONECTED PICs */
1052 		/* if (error == EINVAL) return(0); */
1053 		return (error);
1054 	}
1055 	intr_map_set_isrc(res_id, isrc);
1056 	rman_set_virtual(res, data);
1057 	return (PIC_ACTIVATE_INTR(isrc->isrc_dev, isrc, res, data));
1058 }
1059 
1060 int
1061 intr_deactivate_irq(device_t dev, struct resource *res)
1062 {
1063 	struct intr_map_data *data;
1064 	struct intr_irqsrc *isrc;
1065 	u_int res_id;
1066 	int error;
1067 
1068 	KASSERT(rman_get_start(res) == rman_get_end(res),
1069 	    ("%s: more interrupts in resource", __func__));
1070 
1071 	res_id = (u_int)rman_get_start(res);
1072 	isrc = intr_map_get_isrc(res_id);
1073 	if (isrc == NULL)
1074 		panic("Attempt to deactivate non-active resource id: %u\n",
1075 		    res_id);
1076 
1077 	data = rman_get_virtual(res);
1078 	error = PIC_DEACTIVATE_INTR(isrc->isrc_dev, isrc, res, data);
1079 	intr_map_set_isrc(res_id, NULL);
1080 	rman_set_virtual(res, NULL);
1081 	free(data, M_INTRNG);
1082 	return (error);
1083 }
1084 
1085 int
1086 intr_setup_irq(device_t dev, struct resource *res, driver_filter_t filt,
1087     driver_intr_t hand, void *arg, int flags, void **cookiep)
1088 {
1089 	int error;
1090 	struct intr_map_data *data;
1091 	struct intr_irqsrc *isrc;
1092 	const char *name;
1093 	u_int res_id;
1094 
1095 	KASSERT(rman_get_start(res) == rman_get_end(res),
1096 	    ("%s: more interrupts in resource", __func__));
1097 
1098 	res_id = (u_int)rman_get_start(res);
1099 	isrc = intr_map_get_isrc(res_id);
1100 	if (isrc == NULL) {
1101 		/* XXX TODO DISCONECTED PICs */
1102 		return (EINVAL);
1103 	}
1104 
1105 	data = rman_get_virtual(res);
1106 	name = device_get_nameunit(dev);
1107 
1108 #ifdef INTR_SOLO
1109 	/*
1110 	 * Standard handling is done through MI interrupt framework. However,
1111 	 * some interrupts could request solely own special handling. This
1112 	 * non standard handling can be used for interrupt controllers without
1113 	 * handler (filter only), so in case that interrupt controllers are
1114 	 * chained, MI interrupt framework is called only in leaf controller.
1115 	 *
1116 	 * Note that root interrupt controller routine is served as well,
1117 	 * however in intr_irq_handler(), i.e. main system dispatch routine.
1118 	 */
1119 	if (flags & INTR_SOLO && hand != NULL) {
1120 		debugf("irq %u cannot solo on %s\n", irq, name);
1121 		return (EINVAL);
1122 	}
1123 
1124 	if (flags & INTR_SOLO) {
1125 		error = iscr_setup_filter(isrc, name, (intr_irq_filter_t *)filt,
1126 		    arg, cookiep);
1127 		debugf("irq %u setup filter error %d on %s\n", isrc->isrc_irq, error,
1128 		    name);
1129 	} else
1130 #endif
1131 		{
1132 		error = isrc_add_handler(isrc, name, filt, hand, arg, flags,
1133 		    cookiep);
1134 		debugf("irq %u add handler error %d on %s\n", isrc->isrc_irq, error, name);
1135 	}
1136 	if (error != 0)
1137 		return (error);
1138 
1139 	mtx_lock(&isrc_table_lock);
1140 	error = PIC_SETUP_INTR(isrc->isrc_dev, isrc, res, data);
1141 	if (error == 0) {
1142 		isrc->isrc_handlers++;
1143 		if (isrc->isrc_handlers == 1)
1144 			PIC_ENABLE_INTR(isrc->isrc_dev, isrc);
1145 	}
1146 	mtx_unlock(&isrc_table_lock);
1147 	if (error != 0)
1148 		intr_event_remove_handler(*cookiep);
1149 	return (error);
1150 }
1151 
1152 int
1153 intr_teardown_irq(device_t dev, struct resource *res, void *cookie)
1154 {
1155 	int error;
1156 	struct intr_map_data *data;
1157 	struct intr_irqsrc *isrc;
1158 	u_int res_id;
1159 
1160 	KASSERT(rman_get_start(res) == rman_get_end(res),
1161 	    ("%s: more interrupts in resource", __func__));
1162 
1163 	res_id = (u_int)rman_get_start(res);
1164 	isrc = intr_map_get_isrc(res_id);
1165 	if (isrc == NULL || isrc->isrc_handlers == 0)
1166 		return (EINVAL);
1167 
1168 	data = rman_get_virtual(res);
1169 
1170 #ifdef INTR_SOLO
1171 	if (isrc->isrc_filter != NULL) {
1172 		if (isrc != cookie)
1173 			return (EINVAL);
1174 
1175 		mtx_lock(&isrc_table_lock);
1176 		isrc->isrc_filter = NULL;
1177 		isrc->isrc_arg = NULL;
1178 		isrc->isrc_handlers = 0;
1179 		PIC_DISABLE_INTR(isrc->isrc_dev, isrc);
1180 		PIC_TEARDOWN_INTR(isrc->isrc_dev, isrc, res, data);
1181 		isrc_update_name(isrc, NULL);
1182 		mtx_unlock(&isrc_table_lock);
1183 		return (0);
1184 	}
1185 #endif
1186 	if (isrc != intr_handler_source(cookie))
1187 		return (EINVAL);
1188 
1189 	error = intr_event_remove_handler(cookie);
1190 	if (error == 0) {
1191 		mtx_lock(&isrc_table_lock);
1192 		isrc->isrc_handlers--;
1193 		if (isrc->isrc_handlers == 0)
1194 			PIC_DISABLE_INTR(isrc->isrc_dev, isrc);
1195 		PIC_TEARDOWN_INTR(isrc->isrc_dev, isrc, res, data);
1196 		intrcnt_updatename(isrc);
1197 		mtx_unlock(&isrc_table_lock);
1198 	}
1199 	return (error);
1200 }
1201 
1202 int
1203 intr_describe_irq(device_t dev, struct resource *res, void *cookie,
1204     const char *descr)
1205 {
1206 	int error;
1207 	struct intr_irqsrc *isrc;
1208 	u_int res_id;
1209 
1210 	KASSERT(rman_get_start(res) == rman_get_end(res),
1211 	    ("%s: more interrupts in resource", __func__));
1212 
1213 	res_id = (u_int)rman_get_start(res);
1214 	isrc = intr_map_get_isrc(res_id);
1215 	if (isrc == NULL || isrc->isrc_handlers == 0)
1216 		return (EINVAL);
1217 #ifdef INTR_SOLO
1218 	if (isrc->isrc_filter != NULL) {
1219 		if (isrc != cookie)
1220 			return (EINVAL);
1221 
1222 		mtx_lock(&isrc_table_lock);
1223 		isrc_update_name(isrc, descr);
1224 		mtx_unlock(&isrc_table_lock);
1225 		return (0);
1226 	}
1227 #endif
1228 	error = intr_event_describe_handler(isrc->isrc_event, cookie, descr);
1229 	if (error == 0) {
1230 		mtx_lock(&isrc_table_lock);
1231 		intrcnt_updatename(isrc);
1232 		mtx_unlock(&isrc_table_lock);
1233 	}
1234 	return (error);
1235 }
1236 
1237 #ifdef SMP
1238 int
1239 intr_bind_irq(device_t dev, struct resource *res, int cpu)
1240 {
1241 	struct intr_irqsrc *isrc;
1242 	u_int res_id;
1243 
1244 	KASSERT(rman_get_start(res) == rman_get_end(res),
1245 	    ("%s: more interrupts in resource", __func__));
1246 
1247 	res_id = (u_int)rman_get_start(res);
1248 	isrc = intr_map_get_isrc(res_id);
1249 	if (isrc == NULL || isrc->isrc_handlers == 0)
1250 		return (EINVAL);
1251 #ifdef INTR_SOLO
1252 	if (isrc->isrc_filter != NULL)
1253 		return (intr_isrc_assign_cpu(isrc, cpu));
1254 #endif
1255 	return (intr_event_bind(isrc->isrc_event, cpu));
1256 }
1257 
1258 /*
1259  * Return the CPU that the next interrupt source should use.
1260  * For now just returns the next CPU according to round-robin.
1261  */
1262 u_int
1263 intr_irq_next_cpu(u_int last_cpu, cpuset_t *cpumask)
1264 {
1265 	u_int cpu;
1266 
1267 	KASSERT(!CPU_EMPTY(cpumask), ("%s: Empty CPU mask", __func__));
1268 	if (!irq_assign_cpu || mp_ncpus == 1) {
1269 		cpu = PCPU_GET(cpuid);
1270 
1271 		if (CPU_ISSET(cpu, cpumask))
1272 			return (curcpu);
1273 
1274 		return (CPU_FFS(cpumask) - 1);
1275 	}
1276 
1277 	do {
1278 		last_cpu++;
1279 		if (last_cpu > mp_maxid)
1280 			last_cpu = 0;
1281 	} while (!CPU_ISSET(last_cpu, cpumask));
1282 	return (last_cpu);
1283 }
1284 
1285 #ifndef EARLY_AP_STARTUP
1286 /*
1287  *  Distribute all the interrupt sources among the available
1288  *  CPUs once the AP's have been launched.
1289  */
1290 static void
1291 intr_irq_shuffle(void *arg __unused)
1292 {
1293 	struct intr_irqsrc *isrc;
1294 	u_int i;
1295 
1296 	if (mp_ncpus == 1)
1297 		return;
1298 
1299 	mtx_lock(&isrc_table_lock);
1300 	irq_assign_cpu = true;
1301 	for (i = 0; i < intr_nirq; i++) {
1302 		isrc = irq_sources[i];
1303 		if (isrc == NULL || isrc->isrc_handlers == 0 ||
1304 		    isrc->isrc_flags & (INTR_ISRCF_PPI | INTR_ISRCF_IPI))
1305 			continue;
1306 
1307 		if (isrc->isrc_event != NULL &&
1308 		    isrc->isrc_flags & INTR_ISRCF_BOUND &&
1309 		    isrc->isrc_event->ie_cpu != CPU_FFS(&isrc->isrc_cpu) - 1)
1310 			panic("%s: CPU inconsistency", __func__);
1311 
1312 		if ((isrc->isrc_flags & INTR_ISRCF_BOUND) == 0)
1313 			CPU_ZERO(&isrc->isrc_cpu); /* start again */
1314 
1315 		/*
1316 		 * We are in wicked position here if the following call fails
1317 		 * for bound ISRC. The best thing we can do is to clear
1318 		 * isrc_cpu so inconsistency with ie_cpu will be detectable.
1319 		 */
1320 		if (PIC_BIND_INTR(isrc->isrc_dev, isrc) != 0)
1321 			CPU_ZERO(&isrc->isrc_cpu);
1322 	}
1323 	mtx_unlock(&isrc_table_lock);
1324 }
1325 SYSINIT(intr_irq_shuffle, SI_SUB_SMP, SI_ORDER_SECOND, intr_irq_shuffle, NULL);
1326 #endif /* !EARLY_AP_STARTUP */
1327 
1328 #else
1329 u_int
1330 intr_irq_next_cpu(u_int current_cpu, cpuset_t *cpumask)
1331 {
1332 
1333 	return (PCPU_GET(cpuid));
1334 }
1335 #endif /* SMP */
1336 
1337 /*
1338  * Allocate memory for new intr_map_data structure.
1339  * Initialize common fields.
1340  */
1341 struct intr_map_data *
1342 intr_alloc_map_data(enum intr_map_data_type type, size_t len, int flags)
1343 {
1344 	struct intr_map_data *data;
1345 
1346 	data = malloc(len, M_INTRNG, flags);
1347 	data->type = type;
1348 	data->len = len;
1349 	return (data);
1350 }
1351 
1352 void intr_free_intr_map_data(struct intr_map_data *data)
1353 {
1354 
1355 	free(data, M_INTRNG);
1356 }
1357 
1358 /*
1359  *  Register a MSI/MSI-X interrupt controller
1360  */
1361 int
1362 intr_msi_register(device_t dev, intptr_t xref)
1363 {
1364 	struct intr_pic *pic;
1365 
1366 	if (dev == NULL)
1367 		return (EINVAL);
1368 	pic = pic_create(dev, xref, FLAG_MSI);
1369 	if (pic == NULL)
1370 		return (ENOMEM);
1371 
1372 	debugf("PIC %p registered for %s <dev %p, xref %jx>\n", pic,
1373 	    device_get_nameunit(dev), dev, (uintmax_t)xref);
1374 	return (0);
1375 }
1376 
1377 int
1378 intr_alloc_msi(device_t pci, device_t child, intptr_t xref, int count,
1379     int maxcount, int *irqs)
1380 {
1381 	struct iommu_domain *domain;
1382 	struct intr_irqsrc **isrc;
1383 	struct intr_pic *pic;
1384 	device_t pdev;
1385 	struct intr_map_data_msi *msi;
1386 	int err, i;
1387 
1388 	pic = pic_lookup(NULL, xref, FLAG_MSI);
1389 	if (pic == NULL)
1390 		return (ESRCH);
1391 
1392 	KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI,
1393 	    ("%s: Found a non-MSI controller: %s", __func__,
1394 	     device_get_name(pic->pic_dev)));
1395 
1396 	/*
1397 	 * If this is the first time we have used this context ask the
1398 	 * interrupt controller to map memory the msi source will need.
1399 	 */
1400 	err = MSI_IOMMU_INIT(pic->pic_dev, child, &domain);
1401 	if (err != 0)
1402 		return (err);
1403 
1404 	isrc = malloc(sizeof(*isrc) * count, M_INTRNG, M_WAITOK);
1405 	err = MSI_ALLOC_MSI(pic->pic_dev, child, count, maxcount, &pdev, isrc);
1406 	if (err != 0) {
1407 		free(isrc, M_INTRNG);
1408 		return (err);
1409 	}
1410 
1411 	for (i = 0; i < count; i++) {
1412 		isrc[i]->isrc_iommu = domain;
1413 		msi = (struct intr_map_data_msi *)intr_alloc_map_data(
1414 		    INTR_MAP_DATA_MSI, sizeof(*msi), M_WAITOK | M_ZERO);
1415 		msi-> isrc = isrc[i];
1416 
1417 		irqs[i] = intr_map_irq(pic->pic_dev, xref,
1418 		    (struct intr_map_data *)msi);
1419 	}
1420 	free(isrc, M_INTRNG);
1421 
1422 	return (err);
1423 }
1424 
1425 int
1426 intr_release_msi(device_t pci, device_t child, intptr_t xref, int count,
1427     int *irqs)
1428 {
1429 	struct intr_irqsrc **isrc;
1430 	struct intr_pic *pic;
1431 	struct intr_map_data_msi *msi;
1432 	int i, err;
1433 
1434 	pic = pic_lookup(NULL, xref, FLAG_MSI);
1435 	if (pic == NULL)
1436 		return (ESRCH);
1437 
1438 	KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI,
1439 	    ("%s: Found a non-MSI controller: %s", __func__,
1440 	     device_get_name(pic->pic_dev)));
1441 
1442 	isrc = malloc(sizeof(*isrc) * count, M_INTRNG, M_WAITOK);
1443 
1444 	for (i = 0; i < count; i++) {
1445 		msi = (struct intr_map_data_msi *)
1446 		    intr_map_get_map_data(irqs[i]);
1447 		KASSERT(msi->hdr.type == INTR_MAP_DATA_MSI,
1448 		    ("%s: irq %d map data is not MSI", __func__,
1449 		    irqs[i]));
1450 		isrc[i] = msi->isrc;
1451 	}
1452 
1453 	MSI_IOMMU_DEINIT(pic->pic_dev, child);
1454 
1455 	err = MSI_RELEASE_MSI(pic->pic_dev, child, count, isrc);
1456 
1457 	for (i = 0; i < count; i++) {
1458 		if (isrc[i] != NULL)
1459 			intr_unmap_irq(irqs[i]);
1460 	}
1461 
1462 	free(isrc, M_INTRNG);
1463 	return (err);
1464 }
1465 
1466 int
1467 intr_alloc_msix(device_t pci, device_t child, intptr_t xref, int *irq)
1468 {
1469 	struct iommu_domain *domain;
1470 	struct intr_irqsrc *isrc;
1471 	struct intr_pic *pic;
1472 	device_t pdev;
1473 	struct intr_map_data_msi *msi;
1474 	int err;
1475 
1476 	pic = pic_lookup(NULL, xref, FLAG_MSI);
1477 	if (pic == NULL)
1478 		return (ESRCH);
1479 
1480 	KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI,
1481 	    ("%s: Found a non-MSI controller: %s", __func__,
1482 	     device_get_name(pic->pic_dev)));
1483 
1484 	/*
1485 	 * If this is the first time we have used this context ask the
1486 	 * interrupt controller to map memory the msi source will need.
1487 	 */
1488 	err = MSI_IOMMU_INIT(pic->pic_dev, child, &domain);
1489 	if (err != 0)
1490 		return (err);
1491 
1492 	err = MSI_ALLOC_MSIX(pic->pic_dev, child, &pdev, &isrc);
1493 	if (err != 0)
1494 		return (err);
1495 
1496 	isrc->isrc_iommu = domain;
1497 	msi = (struct intr_map_data_msi *)intr_alloc_map_data(
1498 		    INTR_MAP_DATA_MSI, sizeof(*msi), M_WAITOK | M_ZERO);
1499 	msi->isrc = isrc;
1500 	*irq = intr_map_irq(pic->pic_dev, xref, (struct intr_map_data *)msi);
1501 	return (0);
1502 }
1503 
1504 int
1505 intr_release_msix(device_t pci, device_t child, intptr_t xref, int irq)
1506 {
1507 	struct intr_irqsrc *isrc;
1508 	struct intr_pic *pic;
1509 	struct intr_map_data_msi *msi;
1510 	int err;
1511 
1512 	pic = pic_lookup(NULL, xref, FLAG_MSI);
1513 	if (pic == NULL)
1514 		return (ESRCH);
1515 
1516 	KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI,
1517 	    ("%s: Found a non-MSI controller: %s", __func__,
1518 	     device_get_name(pic->pic_dev)));
1519 
1520 	msi = (struct intr_map_data_msi *)
1521 	    intr_map_get_map_data(irq);
1522 	KASSERT(msi->hdr.type == INTR_MAP_DATA_MSI,
1523 	    ("%s: irq %d map data is not MSI", __func__,
1524 	    irq));
1525 	isrc = msi->isrc;
1526 	if (isrc == NULL) {
1527 		intr_unmap_irq(irq);
1528 		return (EINVAL);
1529 	}
1530 
1531 	MSI_IOMMU_DEINIT(pic->pic_dev, child);
1532 
1533 	err = MSI_RELEASE_MSIX(pic->pic_dev, child, isrc);
1534 	intr_unmap_irq(irq);
1535 
1536 	return (err);
1537 }
1538 
1539 int
1540 intr_map_msi(device_t pci, device_t child, intptr_t xref, int irq,
1541     uint64_t *addr, uint32_t *data)
1542 {
1543 	struct intr_irqsrc *isrc;
1544 	struct intr_pic *pic;
1545 	int err;
1546 
1547 	pic = pic_lookup(NULL, xref, FLAG_MSI);
1548 	if (pic == NULL)
1549 		return (ESRCH);
1550 
1551 	KASSERT((pic->pic_flags & FLAG_TYPE_MASK) == FLAG_MSI,
1552 	    ("%s: Found a non-MSI controller: %s", __func__,
1553 	     device_get_name(pic->pic_dev)));
1554 
1555 	isrc = intr_map_get_isrc(irq);
1556 	if (isrc == NULL)
1557 		return (EINVAL);
1558 
1559 	err = MSI_MAP_MSI(pic->pic_dev, child, isrc, addr, data);
1560 
1561 #ifdef IOMMU
1562 	if (isrc->isrc_iommu != NULL)
1563 		iommu_translate_msi(isrc->isrc_iommu, addr);
1564 #endif
1565 
1566 	return (err);
1567 }
1568 
/*
 * Intentionally empty function; the prototype is declared locally,
 * right before the definition.
 */
void	dosoftints(void);

void
dosoftints(void)
{

}
1574 
1575 #ifdef SMP
1576 /*
1577  *  Init interrupt controller on another CPU.
1578  */
1579 void
1580 intr_pic_init_secondary(void)
1581 {
1582 	device_t dev;
1583 	uint32_t rootnum;
1584 
1585 	/*
1586 	 * QQQ: Only root PICs are aware of other CPUs ???
1587 	 */
1588 	//mtx_lock(&isrc_table_lock);
1589 	for (rootnum = 0; rootnum < INTR_ROOT_NUM; rootnum++) {
1590 		dev = intr_irq_roots[rootnum].dev;
1591 		if (dev != NULL) {
1592 			PIC_INIT_SECONDARY(dev, rootnum);
1593 		}
1594 	}
1595 	//mtx_unlock(&isrc_table_lock);
1596 }
1597 #endif
1598 
1599 #ifdef DDB
1600 DB_SHOW_COMMAND_FLAGS(irqs, db_show_irqs, DB_CMD_MEMSAFE)
1601 {
1602 	u_int i, irqsum;
1603 	u_long num;
1604 	struct intr_irqsrc *isrc;
1605 
1606 	for (irqsum = 0, i = 0; i < intr_nirq; i++) {
1607 		isrc = irq_sources[i];
1608 		if (isrc == NULL)
1609 			continue;
1610 
1611 		num = isrc->isrc_count != NULL ? isrc->isrc_count[0] : 0;
1612 		db_printf("irq%-3u <%s>: cpu %02lx%s cnt %lu\n", i,
1613 		    isrc->isrc_name, isrc->isrc_cpu.__bits[0],
1614 		    isrc->isrc_flags & INTR_ISRCF_BOUND ? " (bound)" : "", num);
1615 		irqsum += num;
1616 	}
1617 	db_printf("irq total %u\n", irqsum);
1618 }
1619 #endif
1620 
/*
 * Interrupt mapping table functions.
 *
 * Please keep this part separate; it can be transformed into an
 * extension of standard resources.
 */
1627 struct intr_map_entry
1628 {
1629 	device_t 		dev;
1630 	intptr_t 		xref;
1631 	struct intr_map_data 	*map_data;
1632 	struct intr_irqsrc 	*isrc;
1633 	/* XXX TODO DISCONECTED PICs */
1634 	/*int			flags */
1635 };
1636 
/* XXX Convert irq_map[] to a dynamically expandable one. */
/* Mapping table indexed by resource id; a NULL slot is free. */
static struct intr_map_entry **irq_map;
/* Number of slots in irq_map[]. */
static u_int irq_map_count;
/* Index at which the search for a free slot starts. */
static u_int irq_map_first_free_idx;
/* Held around accesses to irq_map[] and irq_map_first_free_idx. */
static struct mtx irq_map_lock;
1642 
1643 static struct intr_irqsrc *
1644 intr_map_get_isrc(u_int res_id)
1645 {
1646 	struct intr_irqsrc *isrc;
1647 
1648 	isrc = NULL;
1649 	mtx_lock(&irq_map_lock);
1650 	if (res_id < irq_map_count && irq_map[res_id] != NULL)
1651 		isrc = irq_map[res_id]->isrc;
1652 	mtx_unlock(&irq_map_lock);
1653 
1654 	return (isrc);
1655 }
1656 
1657 static void
1658 intr_map_set_isrc(u_int res_id, struct intr_irqsrc *isrc)
1659 {
1660 
1661 	mtx_lock(&irq_map_lock);
1662 	if (res_id < irq_map_count && irq_map[res_id] != NULL)
1663 		irq_map[res_id]->isrc = isrc;
1664 	mtx_unlock(&irq_map_lock);
1665 }
1666 
1667 /*
1668  * Get a copy of intr_map_entry data
1669  */
1670 static struct intr_map_data *
1671 intr_map_get_map_data(u_int res_id)
1672 {
1673 	struct intr_map_data *data;
1674 
1675 	data = NULL;
1676 	mtx_lock(&irq_map_lock);
1677 	if (res_id >= irq_map_count || irq_map[res_id] == NULL)
1678 		panic("Attempt to copy invalid resource id: %u\n", res_id);
1679 	data = irq_map[res_id]->map_data;
1680 	mtx_unlock(&irq_map_lock);
1681 
1682 	return (data);
1683 }
1684 
1685 /*
1686  * Get a copy of intr_map_entry data
1687  */
1688 static void
1689 intr_map_copy_map_data(u_int res_id, device_t *map_dev, intptr_t *map_xref,
1690     struct intr_map_data **data)
1691 {
1692 	size_t len;
1693 
1694 	len = 0;
1695 	mtx_lock(&irq_map_lock);
1696 	if (res_id >= irq_map_count || irq_map[res_id] == NULL)
1697 		panic("Attempt to copy invalid resource id: %u\n", res_id);
1698 	if (irq_map[res_id]->map_data != NULL)
1699 		len = irq_map[res_id]->map_data->len;
1700 	mtx_unlock(&irq_map_lock);
1701 
1702 	if (len == 0)
1703 		*data = NULL;
1704 	else
1705 		*data = malloc(len, M_INTRNG, M_WAITOK | M_ZERO);
1706 	mtx_lock(&irq_map_lock);
1707 	if (irq_map[res_id] == NULL)
1708 		panic("Attempt to copy invalid resource id: %u\n", res_id);
1709 	if (len != 0) {
1710 		if (len != irq_map[res_id]->map_data->len)
1711 			panic("Resource id: %u has changed.\n", res_id);
1712 		memcpy(*data, irq_map[res_id]->map_data, len);
1713 	}
1714 	*map_dev = irq_map[res_id]->dev;
1715 	*map_xref = irq_map[res_id]->xref;
1716 	mtx_unlock(&irq_map_lock);
1717 }
1718 
1719 /*
1720  * Allocate and fill new entry in irq_map table.
1721  */
1722 u_int
1723 intr_map_irq(device_t dev, intptr_t xref, struct intr_map_data *data)
1724 {
1725 	u_int i;
1726 	struct intr_map_entry *entry;
1727 
1728 	/* Prepare new entry first. */
1729 	entry = malloc(sizeof(*entry), M_INTRNG, M_WAITOK | M_ZERO);
1730 
1731 	entry->dev = dev;
1732 	entry->xref = xref;
1733 	entry->map_data = data;
1734 	entry->isrc = NULL;
1735 
1736 	mtx_lock(&irq_map_lock);
1737 	for (i = irq_map_first_free_idx; i < irq_map_count; i++) {
1738 		if (irq_map[i] == NULL) {
1739 			irq_map[i] = entry;
1740 			irq_map_first_free_idx = i + 1;
1741 			mtx_unlock(&irq_map_lock);
1742 			return (i);
1743 		}
1744 	}
1745 	for (i = 0; i < irq_map_first_free_idx; i++) {
1746 		if (irq_map[i] == NULL) {
1747 			irq_map[i] = entry;
1748 			irq_map_first_free_idx = i + 1;
1749 			mtx_unlock(&irq_map_lock);
1750 			return (i);
1751 		}
1752 	}
1753 	mtx_unlock(&irq_map_lock);
1754 
1755 	/* XXX Expand irq_map table */
1756 	panic("IRQ mapping table is full.");
1757 }
1758 
1759 /*
1760  * Remove and free mapping entry.
1761  */
1762 void
1763 intr_unmap_irq(u_int res_id)
1764 {
1765 	struct intr_map_entry *entry;
1766 
1767 	mtx_lock(&irq_map_lock);
1768 	if ((res_id >= irq_map_count) || (irq_map[res_id] == NULL))
1769 		panic("Attempt to unmap invalid resource id: %u\n", res_id);
1770 	entry = irq_map[res_id];
1771 	irq_map[res_id] = NULL;
1772 	irq_map_first_free_idx = res_id;
1773 	mtx_unlock(&irq_map_lock);
1774 	intr_free_intr_map_data(entry->map_data);
1775 	free(entry, M_INTRNG);
1776 }
1777 
1778 /*
1779  * Clone mapping entry.
1780  */
1781 u_int
1782 intr_map_clone_irq(u_int old_res_id)
1783 {
1784 	device_t map_dev;
1785 	intptr_t map_xref;
1786 	struct intr_map_data *data;
1787 
1788 	intr_map_copy_map_data(old_res_id, &map_dev, &map_xref, &data);
1789 	return (intr_map_irq(map_dev, map_xref, data));
1790 }
1791 
1792 static void
1793 intr_map_init(void *dummy __unused)
1794 {
1795 
1796 	mtx_init(&irq_map_lock, "intr map table", NULL, MTX_DEF);
1797 
1798 	irq_map_count = 2 * intr_nirq;
1799 	irq_map = mallocarray(irq_map_count, sizeof(struct intr_map_entry*),
1800 	    M_INTRNG, M_WAITOK | M_ZERO);
1801 }
1802 SYSINIT(intr_map_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_map_init, NULL);
1803 
1804 #ifdef SMP
1805 /* Virtualization for interrupt source IPI counter increment. */
1806 static inline void
1807 intr_ipi_increment_count(u_long *counter, u_int cpu)
1808 {
1809 
1810 	KASSERT(cpu < mp_maxid + 1, ("%s: too big cpu %u", __func__, cpu));
1811 	counter[cpu]++;
1812 }
1813 
1814 /*
1815  *  Virtualization for interrupt source IPI counters setup.
1816  */
1817 static u_long *
1818 intr_ipi_setup_counters(const char *name)
1819 {
1820 	u_int index, i;
1821 	char str[INTRNAME_LEN];
1822 
1823 	mtx_lock(&isrc_table_lock);
1824 
1825 	/*
1826 	 * We should never have a problem finding mp_maxid + 1 contiguous
1827 	 * counters, in practice. Interrupts will be allocated sequentially
1828 	 * during boot, so the array should fill from low to high index. Once
1829 	 * reserved, the IPI counters will never be released. Similarly, we
1830 	 * will not need to allocate more IPIs once the system is running.
1831 	 */
1832 	bit_ffc_area(intrcnt_bitmap, nintrcnt, mp_maxid + 1, &index);
1833 	if (index == -1)
1834 		panic("Failed to allocate %d counters. Array exhausted?",
1835 		    mp_maxid + 1);
1836 	bit_nset(intrcnt_bitmap, index, index + mp_maxid);
1837 	for (i = 0; i < mp_maxid + 1; i++) {
1838 		snprintf(str, INTRNAME_LEN, "cpu%d:%s", i, name);
1839 		intrcnt_setname(str, index + i);
1840 	}
1841 	mtx_unlock(&isrc_table_lock);
1842 	return (&intrcnt[index]);
1843 }
1844 
1845 /*
1846  *  Lookup IPI source.
1847  */
1848 static struct intr_ipi *
1849 intr_ipi_lookup(u_int ipi)
1850 {
1851 
1852 	if (ipi >= INTR_IPI_COUNT)
1853 		panic("%s: no such IPI %u", __func__, ipi);
1854 
1855 	return (&ipi_sources[ipi]);
1856 }
1857 
1858 int
1859 intr_ipi_pic_register(device_t dev, u_int priority)
1860 {
1861 	if (intr_ipi_dev_frozen) {
1862 		device_printf(dev, "IPI device already frozen");
1863 		return (EBUSY);
1864 	}
1865 
1866 	if (intr_ipi_dev == NULL || priority > intr_ipi_dev_priority) {
1867 		intr_ipi_dev_priority = priority;
1868 		intr_ipi_dev = dev;
1869 	}
1870 
1871 	return (0);
1872 }
1873 
1874 /*
1875  *  Setup IPI handler on interrupt controller.
1876  *
1877  *  Not SMP coherent.
1878  */
1879 void
1880 intr_ipi_setup(u_int ipi, const char *name, intr_ipi_handler_t *hand,
1881     void *arg)
1882 {
1883 	struct intr_irqsrc *isrc;
1884 	struct intr_ipi *ii;
1885 	int error;
1886 
1887 	if (!intr_ipi_dev_frozen) {
1888 		if (intr_ipi_dev == NULL)
1889 			panic("%s: no IPI PIC attached", __func__);
1890 
1891 		intr_ipi_dev_frozen = true;
1892 		device_printf(intr_ipi_dev, "using for IPIs\n");
1893 	}
1894 
1895 	KASSERT(hand != NULL, ("%s: ipi %u no handler", __func__, ipi));
1896 
1897 	error = PIC_IPI_SETUP(intr_ipi_dev, ipi, &isrc);
1898 	if (error != 0)
1899 		return;
1900 
1901 	isrc->isrc_handlers++;
1902 
1903 	ii = intr_ipi_lookup(ipi);
1904 	KASSERT(ii->ii_count == NULL, ("%s: ipi %u reused", __func__, ipi));
1905 
1906 	ii->ii_handler = hand;
1907 	ii->ii_handler_arg = arg;
1908 	ii->ii_isrc = isrc;
1909 	strlcpy(ii->ii_name, name, INTR_IPI_NAMELEN);
1910 	ii->ii_count = intr_ipi_setup_counters(name);
1911 
1912 	PIC_ENABLE_INTR(intr_ipi_dev, isrc);
1913 }
1914 
1915 void
1916 intr_ipi_send(cpuset_t cpus, u_int ipi)
1917 {
1918 	struct intr_ipi *ii;
1919 
1920 	KASSERT(intr_ipi_dev_frozen,
1921 	    ("%s: IPI device not yet frozen", __func__));
1922 
1923 	ii = intr_ipi_lookup(ipi);
1924 	if (ii->ii_count == NULL)
1925 		panic("%s: not setup IPI %u", __func__, ipi);
1926 
1927 	/*
1928 	 * XXX: Surely needed on other architectures too? Either way should be
1929 	 * some kind of MI hook defined in an MD header, or the responsibility
1930 	 * of the MD caller if not widespread.
1931 	 */
1932 #ifdef __aarch64__
1933 	/*
1934 	 * Ensure that this CPU's stores will be visible to IPI
1935 	 * recipients before starting to send the interrupts.
1936 	 */
1937 	dsb(ishst);
1938 #endif
1939 
1940 	PIC_IPI_SEND(intr_ipi_dev, ii->ii_isrc, cpus, ipi);
1941 }
1942 
1943 /*
1944  *  interrupt controller dispatch function for IPIs. It should
1945  *  be called straight from the interrupt controller, when associated
1946  *  interrupt source is learned. Or from anybody who has an interrupt
1947  *  source mapped.
1948  */
1949 void
1950 intr_ipi_dispatch(u_int ipi)
1951 {
1952 	struct intr_ipi *ii;
1953 
1954 	ii = intr_ipi_lookup(ipi);
1955 	if (ii->ii_count == NULL)
1956 		panic("%s: not setup IPI %u", __func__, ipi);
1957 
1958 	intr_ipi_increment_count(ii->ii_count, PCPU_GET(cpuid));
1959 
1960 	ii->ii_handler(ii->ii_handler_arg);
1961 }
1962 #endif
1963