xref: /titanic_44/usr/src/uts/sun4/os/intr.c (revision b9238976491622ad75a67ab0c12edf99e36212b9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/sysmacros.h>
29 #include <sys/stack.h>
30 #include <sys/cpuvar.h>
31 #include <sys/ivintr.h>
32 #include <sys/intreg.h>
33 #include <sys/membar.h>
34 #include <sys/kmem.h>
35 #include <sys/intr.h>
36 #include <sys/sunddi.h>
37 #include <sys/sunndi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/privregs.h>
40 #include <sys/systm.h>
41 #include <sys/archsystm.h>
42 #include <sys/machsystm.h>
43 #include <sys/x_call.h>
44 #include <vm/seg_kp.h>
45 #include <sys/debug.h>
46 #include <sys/cyclic.h>
47 #include <sys/kdi_impl.h>
48 #include <sys/ddi_timer.h>
49 
50 #include <sys/cpu_sgnblk_defs.h>
51 
52 /* Global locks which protect the interrupt distribution lists */
53 static kmutex_t intr_dist_lock;
54 static kmutex_t intr_dist_cpu_lock;
55 
56 /* Head of the interrupt distribution lists */
57 static struct intr_dist *intr_dist_head = NULL;
58 static struct intr_dist *intr_dist_whead = NULL;
59 
60 static uint64_t siron_inum[DDI_IPL_10]; /* software interrupt numbers */
61 uint64_t *siron_cpu_inum = NULL;
62 uint64_t siron_poke_cpu_inum;
63 static int siron_cpu_setup(cpu_setup_t, int, void *);
64 extern uint_t softlevel1();
65 
66 static uint64_t siron1_inum; /* backward compatibility */
67 uint64_t poke_cpu_inum;
68 uint_t poke_cpu_intr(caddr_t arg1, caddr_t arg2);
69 uint_t siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2);
70 
71 /*
72  * Note:-
73  * siron_pending was originally created to prevent a resource over consumption
74  * bug in setsoftint (exhaustion of interrupt pool free list).
75  * Its original intention is obsolete with the use of iv_pending in
76  * setsoftint. However, siron_pending stayed around, acting as a second
77  * gatekeeper preventing soft interrupts from being queued. In this capacity,
78  * it can lead to hangs on MP systems, where due to global visibility issues
79  * it can end up set while iv_pending is reset, preventing soft interrupts from
80  * ever being processed. In addition to its gatekeeper role, intr_init also
81  * uses it to flag the situation where siron() was called before siron_inum has
82  * been defined.
83  *
84  * siron() does not need an extra gatekeeper; any cpu that wishes should be
85  * allowed to queue a soft interrupt. It is softint()'s job to ensure
86  * correct handling of the queues. Therefore, siron_pending has been
87  * stripped of its gatekeeper task, retaining only its intr_init job, where
88  * it indicates that there is a pending need to call siron().
89  */
90 static int siron_pending[DDI_IPL_10]; /* software interrupt pending flags */
91 static int siron1_pending; /* backward compatibility */
92 
93 int intr_policy = INTR_WEIGHTED_DIST;	/* interrupt distribution policy */
94 int intr_dist_debug = 0;
95 int32_t intr_dist_weight_max = 1;
96 int32_t intr_dist_weight_maxmax = 1000;
97 int intr_dist_weight_maxfactor = 2;
/*
 * Emit a cmn_err() message when intr_dist_debug is non-zero.  "args" is the
 * complete, parenthesized cmn_err() argument list.  The do/while wrapper
 * makes the expansion a single statement, so an "else" that follows an
 * INTR_DEBUG() call cannot bind to the macro's internal "if".
 */
#define	INTR_DEBUG(args)	do {					\
	if (intr_dist_debug)						\
		cmn_err args;						\
} while (0)
99 
100 /*
101  * intr_init() - Interrupt initialization
102  *	Initialize the system's interrupt vector table.
103  */
104 void
105 intr_init(cpu_t *cp)
106 {
107 	int i;
108 	extern uint_t softlevel1();
109 
110 	init_ivintr();
111 	REGISTER_BBUS_INTR();
112 
113 	/*
114 	 * We just allocate memory for per-cpu siron right now. Rest of
115 	 * the work is done when CPU is configured.
116 	 */
117 	siron_cpu_inum = kmem_zalloc(sizeof (uint64_t) * NCPU, KM_SLEEP);
118 	/*
119 	 * Register these software interrupts for ddi timer.
120 	 * Software interrupts up to the level 10 are supported.
121 	 */
122 	for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
123 		siron_inum[i-1] = add_softintr(i, (softintrfunc)timer_softintr,
124 		    (caddr_t)(uintptr_t)(i), SOFTINT_ST);
125 	}
126 
127 	siron1_inum = add_softintr(PIL_1, softlevel1, 0, SOFTINT_ST);
128 	poke_cpu_inum = add_softintr(PIL_13, poke_cpu_intr, 0, SOFTINT_MT);
129 	siron_poke_cpu_inum = add_softintr(PIL_13,
130 	    siron_poke_cpu_intr, 0, SOFTINT_MT);
131 	cp->cpu_m.poke_cpu_outstanding = B_FALSE;
132 
133 	mutex_init(&intr_dist_lock, NULL, MUTEX_DEFAULT, NULL);
134 	mutex_init(&intr_dist_cpu_lock, NULL, MUTEX_DEFAULT, NULL);
135 
136 	/*
137 	 * A soft interrupt may have been requested prior to the initialization
138 	 * of soft interrupts.  Soft interrupts can't be dispatched until after
139 	 * init_intr(), so we have to wait until now before we can dispatch the
140 	 * pending soft interrupt (if any).
141 	 */
142 	for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
143 		if (siron_pending[i-1]) {
144 			siron_pending[i-1] = 0;
145 			sir_on(i);
146 		}
147 	}
148 	if (siron1_pending) {
149 		siron1_pending = 0;
150 		siron();
151 	}
152 }
153 
154 /*
155  * poke_cpu_intr - fall through when poke_cpu calls
156  */
157 /* ARGSUSED */
158 uint_t
159 poke_cpu_intr(caddr_t arg1, caddr_t arg2)
160 {
161 	CPU->cpu_m.poke_cpu_outstanding = B_FALSE;
162 	membar_stld_stst();
163 	return (1);
164 }
165 
166 /*
167  * Trigger software interrupts dedicated to ddi timer.
168  */
169 void
170 sir_on(int level)
171 {
172 	ASSERT(level >= DDI_IPL_1 && level <= DDI_IPL_10);
173 	if (siron_inum[level-1])
174 		setsoftint(siron_inum[level-1]);
175 	else
176 		siron_pending[level-1] = 1;
177 }
178 
179 /*
180  * kmdb uses siron (and thus setsoftint) while the world is stopped in order to
181  * inform its driver component that there's work to be done.  We need to keep
182  * DTrace from instrumenting kmdb's siron and setsoftint.  We duplicate siron,
183  * giving kmdb's version a kdi_ prefix to keep DTrace at bay.  The
184  * implementation of setsoftint is complicated enough that we don't want to
185  * duplicate it, but at the same time we don't want to preclude tracing either.
186  * The meat of setsoftint() therefore goes into kdi_setsoftint, with
187  * setsoftint() implemented as a wrapper.  This allows tracing, while still
188  * providing a way for kmdb to sneak in unmolested.
189  */
190 void
191 kdi_siron(void)
192 {
193 	if (siron1_inum != 0)
194 		kdi_setsoftint(siron1_inum);
195 	else
196 		siron1_pending = 1;
197 }
198 
199 void
200 setsoftint(uint64_t inum)
201 {
202 	kdi_setsoftint(inum);
203 }
204 
205 /*
206  * Generates softlevel1 interrupt on current CPU if it
207  * is not pending already.
208  */
209 void
210 siron(void)
211 {
212 	uint64_t inum;
213 
214 	if (siron1_inum != 0) {
215 		if (siron_cpu_inum[CPU->cpu_id] != 0)
216 			inum = siron_cpu_inum[CPU->cpu_id];
217 		else
218 			inum = siron1_inum;
219 
220 		setsoftint(inum);
221 	} else
222 		siron1_pending = 1;
223 }
224 
225 /*
226  * This routine creates per-CPU siron inum for CPUs which are
227  * configured during boot.
228  */
229 void
230 siron_mp_init()
231 {
232 	cpu_t *c;
233 
234 	mutex_enter(&cpu_lock);
235 	c = cpu_list;
236 	do {
237 		(void) siron_cpu_setup(CPU_CONFIG, c->cpu_id, NULL);
238 	} while ((c = c->cpu_next) != cpu_list);
239 
240 	register_cpu_setup_func(siron_cpu_setup, NULL);
241 	mutex_exit(&cpu_lock);
242 }
243 
244 /*
245  * siron_poke_cpu_intr - cross-call handler.
246  */
247 /* ARGSUSED */
248 uint_t
249 siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2)
250 {
251 	/* generate level1 softint */
252 	siron();
253 	return (1);
254 }
255 
256 /*
257  * This routine generates a cross-call on target CPU(s).
258  */
259 void
260 siron_poke_cpu(cpuset_t poke)
261 {
262 	int cpuid = CPU->cpu_id;
263 
264 	if (CPU_IN_SET(poke, cpuid)) {
265 		siron();
266 		CPUSET_DEL(poke, cpuid);
267 		if (CPUSET_ISNULL(poke))
268 			return;
269 	}
270 
271 	xt_some(poke, setsoftint_tl1, siron_poke_cpu_inum, 0);
272 }
273 
274 /*
275  * This callback function allows us to create per-CPU siron inum.
276  */
277 /* ARGSUSED */
278 static int
279 siron_cpu_setup(cpu_setup_t what, int id, void *arg)
280 {
281 	cpu_t *cp = cpu[id];
282 
283 	ASSERT(MUTEX_HELD(&cpu_lock));
284 	ASSERT(cp != NULL);
285 
286 	switch (what) {
287 	case CPU_CONFIG:
288 		siron_cpu_inum[cp->cpu_id] = add_softintr(PIL_1,
289 		    (softintrfunc)softlevel1, 0, SOFTINT_ST);
290 		break;
291 	case CPU_UNCONFIG:
292 		(void) rem_softintr(siron_cpu_inum[cp->cpu_id]);
293 		siron_cpu_inum[cp->cpu_id] = 0;
294 		break;
295 	default:
296 		break;
297 	}
298 
299 	return (0);
300 }
301 
302 /*
303  * no_ivintr()
304  * 	called by setvecint_tl1() through sys_trap()
305  *	vector interrupt received but not valid or not
306  *	registered in intr_vec_table
307  *	considered as a spurious mondo interrupt
308  */
309 /* ARGSUSED */
310 void
311 no_ivintr(struct regs *rp, int inum, int pil)
312 {
313 	cmn_err(CE_WARN, "invalid vector intr: number 0x%x, pil 0x%x",
314 	    inum, pil);
315 
316 #ifdef DEBUG_VEC_INTR
317 	prom_enter_mon();
318 #endif /* DEBUG_VEC_INTR */
319 }
320 
321 void
322 intr_dequeue_req(uint_t pil, uint64_t inum)
323 {
324 	intr_vec_t	*iv, *next, *prev;
325 	struct machcpu	*mcpu;
326 	uint32_t	clr;
327 	processorid_t	cpu_id;
328 	extern uint_t	getpstate(void);
329 
330 	ASSERT((getpstate() & PSTATE_IE) == 0);
331 
332 	mcpu = &CPU->cpu_m;
333 	cpu_id = CPU->cpu_id;
334 
335 	iv = (intr_vec_t *)inum;
336 	prev = NULL;
337 	next = mcpu->intr_head[pil];
338 
339 	/* Find a matching entry in the list */
340 	while (next != NULL) {
341 		if (next == iv)
342 			break;
343 		prev = next;
344 		next = IV_GET_PIL_NEXT(next, cpu_id);
345 	}
346 
347 	if (next != NULL) {
348 		intr_vec_t	*next_iv = IV_GET_PIL_NEXT(next, cpu_id);
349 
350 		/* Remove entry from list */
351 		if (prev != NULL)
352 			IV_SET_PIL_NEXT(prev, cpu_id, next_iv); /* non-head */
353 		else
354 			mcpu->intr_head[pil] = next_iv; /* head */
355 
356 		if (next_iv == NULL)
357 			mcpu->intr_tail[pil] = prev; /* tail */
358 	}
359 
360 	/* Clear pending interrupts at this level if the list is empty */
361 	if (mcpu->intr_head[pil] == NULL) {
362 		clr = 1 << pil;
363 		if (pil == PIL_14)
364 			clr |= (TICK_INT_MASK | STICK_INT_MASK);
365 		wr_clr_softint(clr);
366 	}
367 }
368 
369 
370 /*
371  * Send a directed interrupt of specified interrupt number id to a cpu.
372  */
373 void
374 send_dirint(
375 	int cpuix,		/* cpu to be interrupted */
376 	int intr_id)		/* interrupt number id */
377 {
378 	xt_one(cpuix, setsoftint_tl1, intr_id, 0);
379 }
380 
381 /*
382  * Take the specified CPU out of participation in interrupts.
383  *	Called by p_online(2) when a processor is being taken off-line.
384  *	This allows interrupt threads being handled on the processor to
385  *	complete before the processor is idled.
386  */
387 int
388 cpu_disable_intr(struct cpu *cp)
389 {
390 	ASSERT(MUTEX_HELD(&cpu_lock));
391 
392 	/*
393 	 * Turn off the CPU_ENABLE flag before calling the redistribution
394 	 * function, since it checks for this in the cpu flags.
395 	 */
396 	cp->cpu_flags &= ~CPU_ENABLE;
397 
398 	intr_redist_all_cpus();
399 
400 	return (0);
401 }
402 
403 /*
404  * Allow the specified CPU to participate in interrupts.
405  *	Called by p_online(2) if a processor could not be taken off-line
406  *	because of bound threads, in order to resume processing interrupts.
407  *	Also called after starting a processor.
408  */
409 void
410 cpu_enable_intr(struct cpu *cp)
411 {
412 	ASSERT(MUTEX_HELD(&cpu_lock));
413 
414 	cp->cpu_flags |= CPU_ENABLE;
415 
416 	intr_redist_all_cpus();
417 }
418 
419 /*
420  * Add function to callback list for intr_redist_all_cpus.  We keep two lists,
421  * one for weighted callbacks and one for normal callbacks. Weighted callbacks
422  * are issued to redirect interrupts of a specified weight, from heavy to
423  * light.  This allows all the interrupts of a given weight to be redistributed
424  * for all weighted nexus drivers prior to those of less weight.
425  */
426 static void
427 intr_dist_add_list(struct intr_dist **phead, void (*func)(void *), void *arg)
428 {
429 	struct intr_dist *new = kmem_alloc(sizeof (*new), KM_SLEEP);
430 	struct intr_dist *iptr;
431 	struct intr_dist **pptr;
432 
433 	ASSERT(func);
434 	new->func = func;
435 	new->arg = arg;
436 	new->next = NULL;
437 
438 	/* Add to tail so that redistribution occurs in original order. */
439 	mutex_enter(&intr_dist_lock);
440 	for (iptr = *phead, pptr = phead; iptr != NULL;
441 	    pptr = &iptr->next, iptr = iptr->next) {
442 		/* check for problems as we locate the tail */
443 		if ((iptr->func == func) && (iptr->arg == arg)) {
444 			cmn_err(CE_PANIC, "intr_dist_add_list(): duplicate");
445 			/*NOTREACHED*/
446 		}
447 	}
448 	*pptr = new;
449 
450 	mutex_exit(&intr_dist_lock);
451 }
452 
453 void
454 intr_dist_add(void (*func)(void *), void *arg)
455 {
456 	intr_dist_add_list(&intr_dist_head, (void (*)(void *))func, arg);
457 }
458 
459 void
460 intr_dist_add_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
461 {
462 	intr_dist_add_list(&intr_dist_whead, (void (*)(void *))func, arg);
463 }
464 
465 /*
466  * Search for the interrupt distribution structure with the specified
467  * mondo vec reg in the interrupt distribution list. If a match is found,
468  * then delete the entry from the list. The caller is responsible for
469  * modifying the mondo vector registers.
470  */
471 static void
472 intr_dist_rem_list(struct intr_dist **headp, void (*func)(void *), void *arg)
473 {
474 	struct intr_dist *iptr;
475 	struct intr_dist **vect;
476 
477 	mutex_enter(&intr_dist_lock);
478 	for (iptr = *headp, vect = headp;
479 	    iptr != NULL; vect = &iptr->next, iptr = iptr->next) {
480 		if ((iptr->func == func) && (iptr->arg == arg)) {
481 			*vect = iptr->next;
482 			kmem_free(iptr, sizeof (struct intr_dist));
483 			mutex_exit(&intr_dist_lock);
484 			return;
485 		}
486 	}
487 
488 	if (!panicstr)
489 		cmn_err(CE_PANIC, "intr_dist_rem_list: not found");
490 	mutex_exit(&intr_dist_lock);
491 }
492 
493 void
494 intr_dist_rem(void (*func)(void *), void *arg)
495 {
496 	intr_dist_rem_list(&intr_dist_head, (void (*)(void *))func, arg);
497 }
498 
499 void
500 intr_dist_rem_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
501 {
502 	intr_dist_rem_list(&intr_dist_whead, (void (*)(void *))func, arg);
503 }
504 
505 /*
506  * Initiate interrupt redistribution.  Redistribution improves the isolation
507  * associated with interrupt weights by ordering operations from heavy weight
508  * to light weight.  When a CPU's orientation changes relative to interrupts,
509  * there is *always* a redistribution to accommodate this change (call to
510  * intr_redist_all_cpus()).  As devices (not CPUs) attach/detach it is possible
511  * that a redistribution could improve the quality of an initialization. For
512  * example, if you are not using a NIC it may not be attached with s10 (devfs).
513  * If you then configure the NIC (ifconfig), this may cause the NIC to attach
514  * and plumb interrupts.  The CPU assignment for the NIC's interrupts is
515  * occurring late, so optimal "isolation" relative to weight is not occurring.
516  * The same applies to detach, although in this case doing the redistribution
517  * might improve "spread" for medium weight devices since the "isolation" of
518  * a higher weight device may no longer be present.
519  *
520  * NB: We should provide a utility to trigger redistribution (ala "intradm -r").
521  *
522  * NB: There is risk associated with automatically triggering execution of the
523  * redistribution code at arbitrary times. The risk comes from the fact that
524  * there is a lot of low-level hardware interaction associated with a
525  * redistribution.  At some point we may want this code to perform automatic
526  * redistribution (redistribution thread; trigger timeout when add/remove
527  * weight delta is large enough, and call cv_signal from timeout - causing
528  * thread to call i_ddi_intr_redist_all_cpus()) but this is considered too
529  * risky at this time.
530  */
531 void
532 i_ddi_intr_redist_all_cpus()
533 {
534 	mutex_enter(&cpu_lock);
535 	INTR_DEBUG((CE_CONT, "intr_dist: i_ddi_intr_redist_all_cpus\n"));
536 	intr_redist_all_cpus();
537 	mutex_exit(&cpu_lock);
538 }
539 
540 /*
541  * Redistribute all interrupts
542  *
543  * This function redistributes all interrupting devices, running the
544  * parent callback functions for each node.
545  */
546 void
547 intr_redist_all_cpus(void)
548 {
549 	struct cpu *cp;
550 	struct intr_dist *iptr;
551 	int32_t weight, max_weight;
552 
553 	ASSERT(MUTEX_HELD(&cpu_lock));
554 	mutex_enter(&intr_dist_lock);
555 
556 	/*
557 	 * zero cpu_intr_weight on all cpus - it is safe to traverse
558 	 * cpu_list since we hold cpu_lock.
559 	 */
560 	cp = cpu_list;
561 	do {
562 		cp->cpu_intr_weight = 0;
563 	} while ((cp = cp->cpu_next) != cpu_list);
564 
565 	/*
566 	 * Assume that this redistribution may encounter a device weight
567 	 * via driver.conf tuning of "ddi-intr-weight" that is at most
568 	 * intr_dist_weight_maxfactor times larger.
569 	 */
570 	max_weight = intr_dist_weight_max * intr_dist_weight_maxfactor;
571 	if (max_weight > intr_dist_weight_maxmax)
572 		max_weight = intr_dist_weight_maxmax;
573 	intr_dist_weight_max = 1;
574 
575 	INTR_DEBUG((CE_CONT, "intr_dist: "
576 	    "intr_redist_all_cpus: %d-0\n", max_weight));
577 
578 	/*
579 	 * Redistribute weighted, from heavy to light.  The callback that
580 	 * specifies a weight equal to weight_max should redirect all
581 	 * interrupts of weight weight_max or greater [weight_max, inf.).
582 	 * Interrupts of lesser weight should be processed on the call with
583 	 * the matching weight. This allows all the heaver weight interrupts
584 	 * on all weighted busses (multiple pci busses) to be redirected prior
585 	 * to any lesser weight interrupts.
586 	 */
587 	for (weight = max_weight; weight >= 0; weight--)
588 		for (iptr = intr_dist_whead; iptr != NULL; iptr = iptr->next)
589 			((void (*)(void *, int32_t, int32_t))iptr->func)
590 			    (iptr->arg, max_weight, weight);
591 
592 	/* redistribute normal (non-weighted) interrupts */
593 	for (iptr = intr_dist_head; iptr != NULL; iptr = iptr->next)
594 		((void (*)(void *))iptr->func)(iptr->arg);
595 	mutex_exit(&intr_dist_lock);
596 }
597 
598 void
599 intr_redist_all_cpus_shutdown(void)
600 {
601 	intr_policy = INTR_CURRENT_CPU;
602 	intr_redist_all_cpus();
603 }
604 
605 /*
606  * Determine what CPU to target, based on interrupt policy.
607  *
608  * INTR_FLAT_DIST: hold a current CPU pointer in a static variable and
609  *	advance through interrupt enabled cpus (round-robin).
610  *
611  * INTR_WEIGHTED_DIST: search for an enabled CPU with the lowest
612  *	cpu_intr_weight, round robin when all equal.
613  *
614  *	Weighted interrupt distribution provides two things: "spread" of weight
615  *	(associated with algorithm itself) and "isolation" (associated with a
616  *	particular device weight). A redistribution is what provides optimal
617  *	"isolation" of heavy weight interrupts, optimal "spread" of weight
618  *	(relative to what came before) is always occurring.
619  *
620  *	An interrupt weight is a subjective number that represents the
621  *	percentage of a CPU required to service a device's interrupts: the
622  *	default weight is 0% (however the algorithm still maintains
623  *	round-robin), a network interface controller (NIC) may have a large
624  *	weight (35%). Interrupt weight only has meaning relative to the
625  *	interrupt weight of other devices: a CPU can be weighted more than
626  *	100%, and a single device might consume more than 100% of a CPU.
627  *
628  *	A coarse interrupt weight can be defined by the parent nexus driver
629  *	based on bus specific information, like pci class codes. A nexus
630  *	driver that supports device interrupt weighting for its children
631  *	should call intr_dist_cpuid_add/rem_device_weight(), which adds
632  *	and removes the weight of a device from the CPU that an interrupt
633  *	is directed at.  The quality of initialization improves when the
634  *	device interrupt weights more accurately reflect actual run-time
635  *	weights, and as the assignments are ordered from heavy to light.
636  *
637  *	The implementation also supports interrupt weight being specified in
638  *	driver.conf files via the property "ddi-intr-weight", which takes
639  *	precedence over the nexus supplied weight.  This support is added to
640  *	permit possible tweaking in the product in response to customer
641  *	problems. This is not a formal or committed interface.
642  *
643  *	While a weighted approach chooses the CPU providing the best spread
644  *	given past weights, less than optimal isolation can result in cases
645  *	where heavy weight devices show up last. The nexus driver's interrupt
646  *	redistribution logic should use intr_dist_add/rem_weighted so that
647  *	interrupts can be redistributed heavy first for optimal isolation.
648  */
649 uint32_t
650 intr_dist_cpuid(void)
651 {
652 	static struct cpu	*curr_cpu;
653 	struct cpu		*start_cpu;
654 	struct cpu		*new_cpu;
655 	struct cpu		*cp;
656 	int			cpuid = -1;
657 
658 	/* Establish exclusion for curr_cpu and cpu_intr_weight manipulation */
659 	mutex_enter(&intr_dist_cpu_lock);
660 
661 	switch (intr_policy) {
662 	case INTR_CURRENT_CPU:
663 		cpuid = CPU->cpu_id;
664 		break;
665 
666 	case INTR_BOOT_CPU:
667 		panic("INTR_BOOT_CPU no longer supported.");
668 		/*NOTREACHED*/
669 
670 	case INTR_FLAT_DIST:
671 	case INTR_WEIGHTED_DIST:
672 	default:
673 		/*
674 		 * Ensure that curr_cpu is valid - cpu_next will be NULL if
675 		 * the cpu has been deleted (cpu structs are never freed).
676 		 */
677 		if (curr_cpu == NULL || curr_cpu->cpu_next == NULL)
678 			curr_cpu = CPU;
679 
680 		/*
681 		 * Advance to online CPU after curr_cpu (round-robin). For
682 		 * INTR_WEIGHTED_DIST we choose the cpu with the lightest
683 		 * weight.  For a nexus that does not support weight the
684 		 * default weight of zero is used. We degrade to round-robin
685 		 * behavior among equal weightes.  The default weight is zero
686 		 * and round-robin behavior continues.
687 		 *
688 		 * Disable preemption while traversing cpu_next_onln to
689 		 * ensure the list does not change.  This works because
690 		 * modifiers of this list and other lists in a struct cpu
691 		 * call pause_cpus() before making changes.
692 		 */
693 		kpreempt_disable();
694 		cp = start_cpu = curr_cpu->cpu_next_onln;
695 		new_cpu = NULL;
696 		do {
697 			/* Skip CPUs with interrupts disabled */
698 			if ((cp->cpu_flags & CPU_ENABLE) == 0)
699 				continue;
700 
701 			if (intr_policy == INTR_FLAT_DIST) {
702 				/* select CPU */
703 				new_cpu = cp;
704 				break;
705 			} else if ((new_cpu == NULL) ||
706 			    (cp->cpu_intr_weight < new_cpu->cpu_intr_weight)) {
707 				/* Choose if lighter weight */
708 				new_cpu = cp;
709 			}
710 		} while ((cp = cp->cpu_next_onln) != start_cpu);
711 		ASSERT(new_cpu);
712 		cpuid = new_cpu->cpu_id;
713 
714 		INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: "
715 		    "targeted\n", cpuid, new_cpu->cpu_intr_weight));
716 
717 		/* update static pointer for next round-robin */
718 		curr_cpu = new_cpu;
719 		kpreempt_enable();
720 		break;
721 	}
722 	mutex_exit(&intr_dist_cpu_lock);
723 	return (cpuid);
724 }
725 
726 /*
727  * Add or remove the weight of a device from a CPU's interrupt weight.
728  *
729  * We expect nexus drivers to call intr_dist_cpuid_add/rem_device_weight for
730  * their children to improve the overall quality of interrupt initialization.
731  *
732  * If a nexus shares the CPU returned by a single intr_dist_cpuid() call
733  * among multiple devices (sharing ino) then the nexus should call
734  * intr_dist_cpuid_add/rem_device_weight for each device separately. Devices
735  * that share must specify the same cpuid.
736  *
737  * If a nexus driver is unable to determine the cpu at remove_intr time
738  * for some of its interrupts, then it should not call add_device_weight -
739  * intr_dist_cpuid will still provide round-robin.
740  *
741  * An established device weight (from dev_info node) takes precedence over
742  * the weight passed in.  If a device weight is not already established
743  * then the passed in nexus weight is established.
744  */
745 void
746 intr_dist_cpuid_add_device_weight(uint32_t cpuid,
747     dev_info_t *dip, int32_t nweight)
748 {
749 	int32_t		eweight;
750 
751 	/*
752 	 * For non-weighted policy everything has weight of zero (and we get
753 	 * round-robin distribution from intr_dist_cpuid).
754 	 * NB: intr_policy is limited to this file. A weighted nexus driver is
755 	 * calls this rouitne even if intr_policy has been patched to
756 	 * INTR_FLAG_DIST.
757 	 */
758 	ASSERT(dip);
759 	if (intr_policy != INTR_WEIGHTED_DIST)
760 		return;
761 
762 	eweight = i_ddi_get_intr_weight(dip);
763 	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: +%2d/%2d for "
764 	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight,
765 	    nweight, eweight, ddi_driver_name(ddi_get_parent(dip)),
766 	    ddi_get_instance(ddi_get_parent(dip)),
767 	    ddi_driver_name(dip), ddi_get_instance(dip)));
768 
769 	/* if no establish weight, establish nexus weight */
770 	if (eweight < 0) {
771 		if (nweight > 0)
772 			(void) i_ddi_set_intr_weight(dip, nweight);
773 		else
774 			nweight = 0;
775 	} else
776 		nweight = eweight;	/* use established weight */
777 
778 	/* Establish exclusion for cpu_intr_weight manipulation */
779 	mutex_enter(&intr_dist_cpu_lock);
780 	cpu[cpuid]->cpu_intr_weight += nweight;
781 
782 	/* update intr_dist_weight_max */
783 	if (nweight > intr_dist_weight_max)
784 		intr_dist_weight_max = nweight;
785 	mutex_exit(&intr_dist_cpu_lock);
786 }
787 
788 void
789 intr_dist_cpuid_rem_device_weight(uint32_t cpuid, dev_info_t *dip)
790 {
791 	struct cpu	*cp;
792 	int32_t		weight;
793 
794 	ASSERT(dip);
795 	if (intr_policy != INTR_WEIGHTED_DIST)
796 		return;
797 
798 	/* remove weight of device from cpu */
799 	weight = i_ddi_get_intr_weight(dip);
800 	if (weight < 0)
801 		weight = 0;
802 	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: -%2d    for "
803 	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight, weight,
804 	    ddi_driver_name(ddi_get_parent(dip)),
805 	    ddi_get_instance(ddi_get_parent(dip)),
806 	    ddi_driver_name(dip), ddi_get_instance(dip)));
807 
808 	/* Establish exclusion for cpu_intr_weight manipulation */
809 	mutex_enter(&intr_dist_cpu_lock);
810 	cp = cpu[cpuid];
811 	cp->cpu_intr_weight -= weight;
812 	if (cp->cpu_intr_weight < 0)
813 		cp->cpu_intr_weight = 0;	/* sanity */
814 	mutex_exit(&intr_dist_cpu_lock);
815 }
816