xref: /illumos-gate/usr/src/uts/sun4/os/intr.c (revision 2a1fd0ffe121888d44fdec321c25b53dcfaa9118)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
27  */
28 /*
29  * Copyright 2019 Peter Tribble.
30  */
31 
32 #include <sys/sysmacros.h>
33 #include <sys/stack.h>
34 #include <sys/cpuvar.h>
35 #include <sys/ivintr.h>
36 #include <sys/intreg.h>
37 #include <sys/membar.h>
38 #include <sys/kmem.h>
39 #include <sys/intr.h>
40 #include <sys/sunddi.h>
41 #include <sys/sunndi.h>
42 #include <sys/cmn_err.h>
43 #include <sys/privregs.h>
44 #include <sys/systm.h>
45 #include <sys/archsystm.h>
46 #include <sys/machsystm.h>
47 #include <sys/x_call.h>
48 #include <vm/seg_kp.h>
49 #include <sys/debug.h>
50 #include <sys/cyclic.h>
51 #include <sys/kdi_impl.h>
52 #include <sys/ddi_periodic.h>
53 
54 #include <sys/cpu_sgnblk_defs.h>
55 
56 /* Global locks which protect the interrupt distribution lists */
57 static kmutex_t intr_dist_lock;
58 static kmutex_t intr_dist_cpu_lock;
59 
60 /* Head of the interrupt distribution lists */
61 static struct intr_dist *intr_dist_head = NULL;
62 static struct intr_dist *intr_dist_whead = NULL;
63 
64 static uint64_t siron_inum[DDI_IPL_10]; /* software interrupt numbers */
65 uint64_t *siron_cpu_inum = NULL;
66 uint64_t siron_poke_cpu_inum;
67 static int siron_cpu_setup(cpu_setup_t, int, void *);
68 extern uint_t softlevel1();
69 
70 static uint64_t siron1_inum; /* backward compatibility */
71 uint64_t poke_cpu_inum;
72 uint_t poke_cpu_intr(caddr_t arg1, caddr_t arg2);
73 uint_t siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2);
74 
75 /*
76  * Variable to enable/disable printing a message when an invalid vecintr
77  * is received.
78  */
79 uint_t ignore_invalid_vecintr = 0;
80 
81 /*
82  * Note:-
83  * siron_pending was originally created to prevent a resource over consumption
84  * bug in setsoftint(exhaustion of interrupt pool free list).
85  * It's original intention is obsolete with the use of iv_pending in
86  * setsoftint. However, siron_pending stayed around, acting as a second
87  * gatekeeper preventing soft interrupts from being queued. In this capacity,
88  * it can lead to hangs on MP systems, where due to global visibility issues
89  * it can end up set while iv_pending is reset, preventing soft interrupts from
90  * ever being processed. In addition to its gatekeeper role, init_intr also
91  * uses it to flag the situation where siron() was called before siron_inum has
92  * been defined.
93  *
94  * siron() does not need an extra gatekeeper; any cpu that wishes should be
95  * allowed to queue a soft interrupt. It is softint()'s job to ensure
96  * correct handling of the queues. Therefore, siron_pending has been
97  * stripped of its gatekeeper task, retaining only its intr_init job, where
98  * it indicates that there is a pending need to call siron().
99  */
100 static int siron_pending[DDI_IPL_10]; /* software interrupt pending flags */
101 static int siron1_pending; /* backward compatibility */
102 
103 int intr_policy = INTR_WEIGHTED_DIST;	/* interrupt distribution policy */
104 int intr_dist_debug = 0;
105 int32_t intr_dist_weight_max = 1;
106 int32_t intr_dist_weight_maxmax = 1000;
107 int intr_dist_weight_maxfactor = 2;
/*
 * Emit a cmn_err() message only when intr_dist_debug is non-zero.  "args"
 * is the complete parenthesized cmn_err() argument list, e.g.
 *	INTR_DEBUG((CE_CONT, "fmt %d\n", value));
 *
 * The body is wrapped in do { } while (0) so the macro is a single
 * statement: a following "else" can never bind to the macro's internal
 * "if" when the macro is used unbraced inside an if/else.
 */
#define	INTR_DEBUG(args)	do {			\
	if (intr_dist_debug)				\
		cmn_err args;				\
} while (0)
109 
110 /*
111  * intr_init() - Interrupt initialization
112  *	Initialize the system's interrupt vector table.
113  */
114 void
115 intr_init(cpu_t *cp)
116 {
117 	int i;
118 	extern uint_t softlevel1();
119 
120 	init_ivintr();
121 
122 	/*
123 	 * Register these software interrupts for ddi timer.
124 	 * Software interrupts up to the level 10 are supported.
125 	 */
126 	for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
127 		siron_inum[i - 1] = add_softintr(i,
128 		    (softintrfunc)ddi_periodic_softintr,
129 		    (caddr_t)(uintptr_t)(i), SOFTINT_ST);
130 	}
131 
132 	siron1_inum = add_softintr(PIL_1, softlevel1, 0, SOFTINT_ST);
133 	poke_cpu_inum = add_softintr(PIL_13, poke_cpu_intr, 0, SOFTINT_MT);
134 	siron_poke_cpu_inum = add_softintr(PIL_13,
135 	    siron_poke_cpu_intr, 0, SOFTINT_MT);
136 	cp->cpu_m.poke_cpu_outstanding = B_FALSE;
137 
138 	mutex_init(&intr_dist_lock, NULL, MUTEX_DEFAULT, NULL);
139 	mutex_init(&intr_dist_cpu_lock, NULL, MUTEX_DEFAULT, NULL);
140 
141 	/*
142 	 * A soft interrupt may have been requested prior to the initialization
143 	 * of soft interrupts.  Soft interrupts can't be dispatched until after
144 	 * init_intr(), so we have to wait until now before we can dispatch the
145 	 * pending soft interrupt (if any).
146 	 */
147 	for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
148 		if (siron_pending[i-1]) {
149 			siron_pending[i-1] = 0;
150 			sir_on(i);
151 		}
152 	}
153 	if (siron1_pending) {
154 		siron1_pending = 0;
155 		siron();
156 	}
157 }
158 
159 /*
160  * poke_cpu_intr - fall through when poke_cpu calls
161  */
162 /* ARGSUSED */
163 uint_t
164 poke_cpu_intr(caddr_t arg1, caddr_t arg2)
165 {
166 	CPU->cpu_m.poke_cpu_outstanding = B_FALSE;
167 	membar_stld_stst();
168 	return (1);
169 }
170 
171 /*
172  * Trigger software interrupts dedicated to ddi timer.
173  */
174 void
175 sir_on(int level)
176 {
177 	ASSERT(level >= DDI_IPL_1 && level <= DDI_IPL_10);
178 	if (siron_inum[level-1])
179 		setsoftint(siron_inum[level-1]);
180 	else
181 		siron_pending[level-1] = 1;
182 }
183 
184 /*
185  * kmdb uses siron (and thus setsoftint) while the world is stopped in order to
186  * inform its driver component that there's work to be done.  We need to keep
187  * DTrace from instrumenting kmdb's siron and setsoftint.  We duplicate siron,
188  * giving kmdb's version a kdi_ prefix to keep DTrace at bay.  The
189  * implementation of setsoftint is complicated enough that we don't want to
190  * duplicate it, but at the same time we don't want to preclude tracing either.
191  * The meat of setsoftint() therefore goes into kdi_setsoftint, with
192  * setsoftint() implemented as a wrapper.  This allows tracing, while still
193  * providing a way for kmdb to sneak in unmolested.
194  */
195 void
196 kdi_siron(void)
197 {
198 	if (siron1_inum != 0)
199 		kdi_setsoftint(siron1_inum);
200 	else
201 		siron1_pending = 1;
202 }
203 
204 void
205 setsoftint(uint64_t inum)
206 {
207 	kdi_setsoftint(inum);
208 }
209 
210 /*
211  * Generates softlevel1 interrupt on current CPU if it
212  * is not pending already.
213  */
214 void
215 siron(void)
216 {
217 	uint64_t inum;
218 
219 	if (siron1_inum != 0) {
220 		/*
221 		 * Once siron_cpu_inum has been allocated, we can
222 		 * use per-CPU siron inum.
223 		 */
224 		if (siron_cpu_inum && siron_cpu_inum[CPU->cpu_id] != 0)
225 			inum = siron_cpu_inum[CPU->cpu_id];
226 		else
227 			inum = siron1_inum;
228 
229 		setsoftint(inum);
230 	} else
231 		siron1_pending = 1;
232 }
233 
234 
235 static void
236 siron_init(void)
237 {
238 	/*
239 	 * We just allocate memory for per-cpu siron right now. Rest of
240 	 * the work is done when CPU is configured.
241 	 */
242 	siron_cpu_inum = kmem_zalloc(sizeof (uint64_t) * NCPU, KM_SLEEP);
243 }
244 
245 /*
246  * This routine creates per-CPU siron inum for CPUs which are
247  * configured during boot.
248  */
249 void
250 siron_mp_init()
251 {
252 	cpu_t *c;
253 
254 	/*
255 	 * Get the memory for per-CPU siron inums
256 	 */
257 	siron_init();
258 
259 	mutex_enter(&cpu_lock);
260 	c = cpu_list;
261 	do {
262 		(void) siron_cpu_setup(CPU_CONFIG, c->cpu_id, NULL);
263 	} while ((c = c->cpu_next) != cpu_list);
264 
265 	register_cpu_setup_func(siron_cpu_setup, NULL);
266 	mutex_exit(&cpu_lock);
267 }
268 
269 /*
270  * siron_poke_cpu_intr - cross-call handler.
271  */
272 /* ARGSUSED */
273 uint_t
274 siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2)
275 {
276 	/* generate level1 softint */
277 	siron();
278 	return (1);
279 }
280 
281 /*
282  * This routine generates a cross-call on target CPU(s).
283  */
284 void
285 siron_poke_cpu(cpuset_t poke)
286 {
287 	int cpuid = CPU->cpu_id;
288 
289 	if (CPU_IN_SET(poke, cpuid)) {
290 		siron();
291 		CPUSET_DEL(poke, cpuid);
292 		if (CPUSET_ISNULL(poke))
293 			return;
294 	}
295 
296 	xt_some(poke, setsoftint_tl1, siron_poke_cpu_inum, 0);
297 }
298 
299 /*
300  * This callback function allows us to create per-CPU siron inum.
301  */
302 /* ARGSUSED */
303 static int
304 siron_cpu_setup(cpu_setup_t what, int id, void *arg)
305 {
306 	cpu_t *cp = cpu[id];
307 
308 	ASSERT(MUTEX_HELD(&cpu_lock));
309 	ASSERT(cp != NULL);
310 
311 	switch (what) {
312 	case CPU_CONFIG:
313 		siron_cpu_inum[cp->cpu_id] = add_softintr(PIL_1,
314 		    (softintrfunc)softlevel1, 0, SOFTINT_ST);
315 		break;
316 	case CPU_UNCONFIG:
317 		(void) rem_softintr(siron_cpu_inum[cp->cpu_id]);
318 		siron_cpu_inum[cp->cpu_id] = 0;
319 		break;
320 	default:
321 		break;
322 	}
323 
324 	return (0);
325 }
326 
327 /*
328  * no_ivintr()
329  * 	called by setvecint_tl1() through sys_trap()
330  *	vector interrupt received but not valid or not
331  *	registered in intr_vec_table
332  *	considered as a spurious mondo interrupt
333  */
334 /* ARGSUSED */
335 void
336 no_ivintr(struct regs *rp, int inum, int pil)
337 {
338 	if (!ignore_invalid_vecintr)
339 		cmn_err(CE_WARN, "invalid vector intr: number 0x%x, pil 0x%x",
340 		    inum, pil);
341 
342 #ifdef DEBUG_VEC_INTR
343 	prom_enter_mon();
344 #endif /* DEBUG_VEC_INTR */
345 }
346 
347 void
348 intr_dequeue_req(uint_t pil, uint64_t inum)
349 {
350 	intr_vec_t	*iv, *next, *prev;
351 	struct machcpu	*mcpu;
352 	uint32_t	clr;
353 	processorid_t	cpu_id;
354 	extern uint_t	getpstate(void);
355 
356 	ASSERT((getpstate() & PSTATE_IE) == 0);
357 
358 	mcpu = &CPU->cpu_m;
359 	cpu_id = CPU->cpu_id;
360 
361 	iv = (intr_vec_t *)inum;
362 	prev = NULL;
363 	next = mcpu->intr_head[pil];
364 
365 	/* Find a matching entry in the list */
366 	while (next != NULL) {
367 		if (next == iv)
368 			break;
369 		prev = next;
370 		next = IV_GET_PIL_NEXT(next, cpu_id);
371 	}
372 
373 	if (next != NULL) {
374 		intr_vec_t	*next_iv = IV_GET_PIL_NEXT(next, cpu_id);
375 
376 		/* Remove entry from list */
377 		if (prev != NULL)
378 			IV_SET_PIL_NEXT(prev, cpu_id, next_iv); /* non-head */
379 		else
380 			mcpu->intr_head[pil] = next_iv; /* head */
381 
382 		if (next_iv == NULL)
383 			mcpu->intr_tail[pil] = prev; /* tail */
384 	}
385 
386 	/* Clear pending interrupts at this level if the list is empty */
387 	if (mcpu->intr_head[pil] == NULL) {
388 		clr = 1 << pil;
389 		if (pil == PIL_14)
390 			clr |= (TICK_INT_MASK | STICK_INT_MASK);
391 		wr_clr_softint(clr);
392 	}
393 }
394 
395 
396 /*
397  * Send a directed interrupt of specified interrupt number id to a cpu.
398  */
399 void
400 send_dirint(
401 	int cpuix,		/* cpu to be interrupted */
402 	int intr_id)		/* interrupt number id */
403 {
404 	xt_one(cpuix, setsoftint_tl1, intr_id, 0);
405 }
406 
407 /*
408  * Take the specified CPU out of participation in interrupts.
409  *	Called by p_online(2) when a processor is being taken off-line.
410  *	This allows interrupt threads being handled on the processor to
411  *	complete before the processor is idled.
412  */
413 int
414 cpu_disable_intr(struct cpu *cp)
415 {
416 	ASSERT(MUTEX_HELD(&cpu_lock));
417 
418 	/*
419 	 * Turn off the CPU_ENABLE flag before calling the redistribution
420 	 * function, since it checks for this in the cpu flags.
421 	 */
422 	cp->cpu_flags &= ~CPU_ENABLE;
423 
424 	intr_redist_all_cpus();
425 
426 	return (0);
427 }
428 
429 /*
430  * Allow the specified CPU to participate in interrupts.
431  *	Called by p_online(2) if a processor could not be taken off-line
432  *	because of bound threads, in order to resume processing interrupts.
433  *	Also called after starting a processor.
434  */
435 void
436 cpu_enable_intr(struct cpu *cp)
437 {
438 	ASSERT(MUTEX_HELD(&cpu_lock));
439 
440 	cp->cpu_flags |= CPU_ENABLE;
441 
442 	intr_redist_all_cpus();
443 }
444 
445 /*
446  * Add function to callback list for intr_redist_all_cpus.  We keep two lists,
447  * one for weighted callbacks and one for normal callbacks. Weighted callbacks
448  * are issued to redirect interrupts of a specified weight, from heavy to
449  * light.  This allows all the interrupts of a given weight to be redistributed
450  * for all weighted nexus drivers prior to those of less weight.
451  */
452 static void
453 intr_dist_add_list(struct intr_dist **phead, void (*func)(void *), void *arg)
454 {
455 	struct intr_dist *new = kmem_alloc(sizeof (*new), KM_SLEEP);
456 	struct intr_dist *iptr;
457 	struct intr_dist **pptr;
458 
459 	ASSERT(func);
460 	new->func = func;
461 	new->arg = arg;
462 	new->next = NULL;
463 
464 	/* Add to tail so that redistribution occurs in original order. */
465 	mutex_enter(&intr_dist_lock);
466 	for (iptr = *phead, pptr = phead; iptr != NULL;
467 	    pptr = &iptr->next, iptr = iptr->next) {
468 		/* check for problems as we locate the tail */
469 		if ((iptr->func == func) && (iptr->arg == arg)) {
470 			cmn_err(CE_PANIC, "intr_dist_add_list(): duplicate");
471 			/*NOTREACHED*/
472 		}
473 	}
474 	*pptr = new;
475 
476 	mutex_exit(&intr_dist_lock);
477 }
478 
479 void
480 intr_dist_add(void (*func)(void *), void *arg)
481 {
482 	intr_dist_add_list(&intr_dist_head, (void (*)(void *))func, arg);
483 }
484 
485 void
486 intr_dist_add_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
487 {
488 	intr_dist_add_list(&intr_dist_whead, (void (*)(void *))func, arg);
489 }
490 
491 /*
492  * Search for the interrupt distribution structure with the specified
493  * mondo vec reg in the interrupt distribution list. If a match is found,
494  * then delete the entry from the list. The caller is responsible for
495  * modifying the mondo vector registers.
496  */
497 static void
498 intr_dist_rem_list(struct intr_dist **headp, void (*func)(void *), void *arg)
499 {
500 	struct intr_dist *iptr;
501 	struct intr_dist **vect;
502 
503 	mutex_enter(&intr_dist_lock);
504 	for (iptr = *headp, vect = headp;
505 	    iptr != NULL; vect = &iptr->next, iptr = iptr->next) {
506 		if ((iptr->func == func) && (iptr->arg == arg)) {
507 			*vect = iptr->next;
508 			kmem_free(iptr, sizeof (struct intr_dist));
509 			mutex_exit(&intr_dist_lock);
510 			return;
511 		}
512 	}
513 
514 	if (!panicstr)
515 		cmn_err(CE_PANIC, "intr_dist_rem_list: not found");
516 	mutex_exit(&intr_dist_lock);
517 }
518 
519 void
520 intr_dist_rem(void (*func)(void *), void *arg)
521 {
522 	intr_dist_rem_list(&intr_dist_head, (void (*)(void *))func, arg);
523 }
524 
525 void
526 intr_dist_rem_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
527 {
528 	intr_dist_rem_list(&intr_dist_whead, (void (*)(void *))func, arg);
529 }
530 
531 /*
532  * Initiate interrupt redistribution.  Redistribution improves the isolation
533  * associated with interrupt weights by ordering operations from heavy weight
534  * to light weight.  When a CPUs orientation changes relative to interrupts,
535  * there is *always* a redistribution to accommodate this change (call to
536  * intr_redist_all_cpus()).  As devices (not CPUs) attach/detach it is possible
537  * that a redistribution could improve the quality of an initialization. For
538  * example, if you are not using a NIC it may not be attached with s10 (devfs).
539  * If you then configure the NIC (ifconfig), this may cause the NIC to attach
540  * and plumb interrupts.  The CPU assignment for the NIC's interrupts is
541  * occurring late, so optimal "isolation" relative to weight is not occurring.
542  * The same applies to detach, although in this case doing the redistribution
543  * might improve "spread" for medium weight devices since the "isolation" of
544  * a higher weight device may no longer be present.
545  *
546  * NB: We should provide a utility to trigger redistribution (ala "intradm -r").
547  *
548  * NB: There is risk associated with automatically triggering execution of the
549  * redistribution code at arbitrary times. The risk comes from the fact that
550  * there is a lot of low-level hardware interaction associated with a
551  * redistribution.  At some point we may want this code to perform automatic
552  * redistribution (redistribution thread; trigger timeout when add/remove
553  * weight delta is large enough, and call cv_signal from timeout - causing
554  * thead to call i_ddi_intr_redist_all_cpus()) but this is considered too
555  * risky at this time.
556  */
557 void
558 i_ddi_intr_redist_all_cpus()
559 {
560 	mutex_enter(&cpu_lock);
561 	INTR_DEBUG((CE_CONT, "intr_dist: i_ddi_intr_redist_all_cpus\n"));
562 	intr_redist_all_cpus();
563 	mutex_exit(&cpu_lock);
564 }
565 
566 /*
567  * Redistribute all interrupts
568  *
569  * This function redistributes all interrupting devices, running the
570  * parent callback functions for each node.
571  */
572 void
573 intr_redist_all_cpus(void)
574 {
575 	struct cpu *cp;
576 	struct intr_dist *iptr;
577 	int32_t weight, max_weight;
578 
579 	ASSERT(MUTEX_HELD(&cpu_lock));
580 	mutex_enter(&intr_dist_lock);
581 
582 	/*
583 	 * zero cpu_intr_weight on all cpus - it is safe to traverse
584 	 * cpu_list since we hold cpu_lock.
585 	 */
586 	cp = cpu_list;
587 	do {
588 		cp->cpu_intr_weight = 0;
589 	} while ((cp = cp->cpu_next) != cpu_list);
590 
591 	/*
592 	 * Assume that this redistribution may encounter a device weight
593 	 * via driver.conf tuning of "ddi-intr-weight" that is at most
594 	 * intr_dist_weight_maxfactor times larger.
595 	 */
596 	max_weight = intr_dist_weight_max * intr_dist_weight_maxfactor;
597 	if (max_weight > intr_dist_weight_maxmax)
598 		max_weight = intr_dist_weight_maxmax;
599 	intr_dist_weight_max = 1;
600 
601 	INTR_DEBUG((CE_CONT, "intr_dist: "
602 	    "intr_redist_all_cpus: %d-0\n", max_weight));
603 
604 	/*
605 	 * Redistribute weighted, from heavy to light.  The callback that
606 	 * specifies a weight equal to weight_max should redirect all
607 	 * interrupts of weight weight_max or greater [weight_max, inf.).
608 	 * Interrupts of lesser weight should be processed on the call with
609 	 * the matching weight. This allows all the heaver weight interrupts
610 	 * on all weighted busses (multiple pci busses) to be redirected prior
611 	 * to any lesser weight interrupts.
612 	 */
613 	for (weight = max_weight; weight >= 0; weight--)
614 		for (iptr = intr_dist_whead; iptr != NULL; iptr = iptr->next)
615 			((void (*)(void *, int32_t, int32_t))iptr->func)
616 			    (iptr->arg, max_weight, weight);
617 
618 	/* redistribute normal (non-weighted) interrupts */
619 	for (iptr = intr_dist_head; iptr != NULL; iptr = iptr->next)
620 		((void (*)(void *))iptr->func)(iptr->arg);
621 	mutex_exit(&intr_dist_lock);
622 }
623 
624 void
625 intr_redist_all_cpus_shutdown(void)
626 {
627 	intr_policy = INTR_CURRENT_CPU;
628 	intr_redist_all_cpus();
629 }
630 
631 /*
632  * Determine what CPU to target, based on interrupt policy.
633  *
634  * INTR_FLAT_DIST: hold a current CPU pointer in a static variable and
635  *	advance through interrupt enabled cpus (round-robin).
636  *
637  * INTR_WEIGHTED_DIST: search for an enabled CPU with the lowest
638  *	cpu_intr_weight, round robin when all equal.
639  *
640  *	Weighted interrupt distribution provides two things: "spread" of weight
641  *	(associated with algorithm itself) and "isolation" (associated with a
642  *	particular device weight). A redistribution is what provides optimal
643  *	"isolation" of heavy weight interrupts, optimal "spread" of weight
644  *	(relative to what came before) is always occurring.
645  *
646  *	An interrupt weight is a subjective number that represents the
647  *	percentage of a CPU required to service a device's interrupts: the
648  *	default weight is 0% (however the algorithm still maintains
649  *	round-robin), a network interface controller (NIC) may have a large
650  *	weight (35%). Interrupt weight only has meaning relative to the
651  *	interrupt weight of other devices: a CPU can be weighted more than
652  *	100%, and a single device might consume more than 100% of a CPU.
653  *
654  *	A coarse interrupt weight can be defined by the parent nexus driver
655  *	based on bus specific information, like pci class codes. A nexus
656  *	driver that supports device interrupt weighting for its children
657  *	should call intr_dist_cpuid_add/rem_device_weight(), which adds
658  *	and removes the weight of a device from the CPU that an interrupt
659  *	is directed at.  The quality of initialization improves when the
660  *	device interrupt weights more accuracy reflect actual run-time weights,
661  *	and as the assignments are ordered from is heavy to light.
662  *
663  *	The implementation also supports interrupt weight being specified in
664  *	driver.conf files via the property "ddi-intr-weight", which takes
665  *	precedence over the nexus supplied weight.  This support is added to
666  *	permit possible tweaking in the product in response to customer
667  *	problems. This is not a formal or committed interface.
668  *
669  *	While a weighted approach chooses the CPU providing the best spread
670  *	given past weights, less than optimal isolation can result in cases
671  *	where heavy weight devices show up last. The nexus driver's interrupt
672  *	redistribution logic should use intr_dist_add/rem_weighted so that
673  *	interrupts can be redistributed heavy first for optimal isolation.
674  */
675 uint32_t
676 intr_dist_cpuid(void)
677 {
678 	static struct cpu	*curr_cpu;
679 	struct cpu		*start_cpu;
680 	struct cpu		*new_cpu;
681 	struct cpu		*cp;
682 	int			cpuid = -1;
683 
684 	/* Establish exclusion for curr_cpu and cpu_intr_weight manipulation */
685 	mutex_enter(&intr_dist_cpu_lock);
686 
687 	switch (intr_policy) {
688 	case INTR_CURRENT_CPU:
689 		cpuid = CPU->cpu_id;
690 		break;
691 
692 	case INTR_BOOT_CPU:
693 		panic("INTR_BOOT_CPU no longer supported.");
694 		/*NOTREACHED*/
695 
696 	case INTR_FLAT_DIST:
697 	case INTR_WEIGHTED_DIST:
698 	default:
699 		/*
700 		 * Ensure that curr_cpu is valid - cpu_next will be NULL if
701 		 * the cpu has been deleted (cpu structs are never freed).
702 		 */
703 		if (curr_cpu == NULL || curr_cpu->cpu_next == NULL)
704 			curr_cpu = CPU;
705 
706 		/*
707 		 * Advance to online CPU after curr_cpu (round-robin). For
708 		 * INTR_WEIGHTED_DIST we choose the cpu with the lightest
709 		 * weight.  For a nexus that does not support weight the
710 		 * default weight of zero is used. We degrade to round-robin
711 		 * behavior among equal weightes.  The default weight is zero
712 		 * and round-robin behavior continues.
713 		 *
714 		 * Disable preemption while traversing cpu_next_onln to
715 		 * ensure the list does not change.  This works because
716 		 * modifiers of this list and other lists in a struct cpu
717 		 * call pause_cpus() before making changes.
718 		 */
719 		kpreempt_disable();
720 		cp = start_cpu = curr_cpu->cpu_next_onln;
721 		new_cpu = NULL;
722 		do {
723 			/* Skip CPUs with interrupts disabled */
724 			if ((cp->cpu_flags & CPU_ENABLE) == 0)
725 				continue;
726 
727 			if (intr_policy == INTR_FLAT_DIST) {
728 				/* select CPU */
729 				new_cpu = cp;
730 				break;
731 			} else if ((new_cpu == NULL) ||
732 			    (cp->cpu_intr_weight < new_cpu->cpu_intr_weight)) {
733 				/* Choose if lighter weight */
734 				new_cpu = cp;
735 			}
736 		} while ((cp = cp->cpu_next_onln) != start_cpu);
737 		ASSERT(new_cpu);
738 		cpuid = new_cpu->cpu_id;
739 
740 		INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: "
741 		    "targeted\n", cpuid, new_cpu->cpu_intr_weight));
742 
743 		/* update static pointer for next round-robin */
744 		curr_cpu = new_cpu;
745 		kpreempt_enable();
746 		break;
747 	}
748 	mutex_exit(&intr_dist_cpu_lock);
749 	return (cpuid);
750 }
751 
752 /*
753  * Add or remove the the weight of a device from a CPUs interrupt weight.
754  *
755  * We expect nexus drivers to call intr_dist_cpuid_add/rem_device_weight for
756  * their children to improve the overall quality of interrupt initialization.
757  *
758  * If a nexues shares the CPU returned by a single intr_dist_cpuid() call
759  * among multiple devices (sharing ino) then the nexus should call
760  * intr_dist_cpuid_add/rem_device_weight for each device separately. Devices
761  * that share must specify the same cpuid.
762  *
763  * If a nexus driver is unable to determine the cpu at remove_intr time
764  * for some of its interrupts, then it should not call add_device_weight -
765  * intr_dist_cpuid will still provide round-robin.
766  *
767  * An established device weight (from dev_info node) takes precedence over
768  * the weight passed in.  If a device weight is not already established
769  * then the passed in nexus weight is established.
770  */
771 void
772 intr_dist_cpuid_add_device_weight(uint32_t cpuid,
773     dev_info_t *dip, int32_t nweight)
774 {
775 	int32_t		eweight;
776 
777 	/*
778 	 * For non-weighted policy everything has weight of zero (and we get
779 	 * round-robin distribution from intr_dist_cpuid).
780 	 * NB: intr_policy is limited to this file. A weighted nexus driver is
781 	 * calls this rouitne even if intr_policy has been patched to
782 	 * INTR_FLAG_DIST.
783 	 */
784 	ASSERT(dip);
785 	if (intr_policy != INTR_WEIGHTED_DIST)
786 		return;
787 
788 	eweight = i_ddi_get_intr_weight(dip);
789 	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: +%2d/%2d for "
790 	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight,
791 	    nweight, eweight, ddi_driver_name(ddi_get_parent(dip)),
792 	    ddi_get_instance(ddi_get_parent(dip)),
793 	    ddi_driver_name(dip), ddi_get_instance(dip)));
794 
795 	/* if no establish weight, establish nexus weight */
796 	if (eweight < 0) {
797 		if (nweight > 0)
798 			(void) i_ddi_set_intr_weight(dip, nweight);
799 		else
800 			nweight = 0;
801 	} else
802 		nweight = eweight;	/* use established weight */
803 
804 	/* Establish exclusion for cpu_intr_weight manipulation */
805 	mutex_enter(&intr_dist_cpu_lock);
806 	cpu[cpuid]->cpu_intr_weight += nweight;
807 
808 	/* update intr_dist_weight_max */
809 	if (nweight > intr_dist_weight_max)
810 		intr_dist_weight_max = nweight;
811 	mutex_exit(&intr_dist_cpu_lock);
812 }
813 
814 void
815 intr_dist_cpuid_rem_device_weight(uint32_t cpuid, dev_info_t *dip)
816 {
817 	struct cpu	*cp;
818 	int32_t		weight;
819 
820 	ASSERT(dip);
821 	if (intr_policy != INTR_WEIGHTED_DIST)
822 		return;
823 
824 	/* remove weight of device from cpu */
825 	weight = i_ddi_get_intr_weight(dip);
826 	if (weight < 0)
827 		weight = 0;
828 	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: -%2d    for "
829 	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight, weight,
830 	    ddi_driver_name(ddi_get_parent(dip)),
831 	    ddi_get_instance(ddi_get_parent(dip)),
832 	    ddi_driver_name(dip), ddi_get_instance(dip)));
833 
834 	/* Establish exclusion for cpu_intr_weight manipulation */
835 	mutex_enter(&intr_dist_cpu_lock);
836 	cp = cpu[cpuid];
837 	cp->cpu_intr_weight -= weight;
838 	if (cp->cpu_intr_weight < 0)
839 		cp->cpu_intr_weight = 0;	/* sanity */
840 	mutex_exit(&intr_dist_cpu_lock);
841 }
842 
843 ulong_t
844 create_softint(uint_t pil, uint_t (*func)(caddr_t, caddr_t), caddr_t arg1)
845 {
846 	uint64_t inum;
847 
848 	inum = add_softintr(pil, func, arg1, SOFTINT_MT);
849 	return ((ulong_t)inum);
850 }
851 
852 void
853 invoke_softint(processorid_t cpuid, ulong_t hdl)
854 {
855 	uint64_t inum = hdl;
856 
857 	if (cpuid == CPU->cpu_id)
858 		setsoftint(inum);
859 	else
860 		xt_one(cpuid, setsoftint_tl1, inum, 0);
861 }
862 
863 void
864 remove_softint(ulong_t hdl)
865 {
866 	uint64_t inum = hdl;
867 
868 	(void) rem_softintr(inum);
869 }
870 
871 void
872 sync_softint(cpuset_t set)
873 {
874 	xt_sync(set);
875 }
876