/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/sysmacros.h>
#include <sys/stack.h>
#include <sys/cpuvar.h>
#include <sys/ivintr.h>
#include <sys/intreg.h>
#include <sys/membar.h>
#include <sys/kmem.h>
#include <sys/intr.h>
#include <sys/sunndi.h>
#include <sys/cmn_err.h>
#include <sys/privregs.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/x_call.h>
#include <vm/seg_kp.h>
#include <sys/debug.h>
#include <sys/cyclic.h>

#include <sys/cpu_sgnblk_defs.h>

kmutex_t soft_iv_lock;	/* protect software interrupt vector table */
/* Global locks which protect the interrupt distribution lists */
static kmutex_t intr_dist_lock;
static kmutex_t intr_dist_cpu_lock;

/* Head of the interrupt distribution lists */
static struct intr_dist *intr_dist_head = NULL;
static struct intr_dist *intr_dist_whead = NULL;

uint_t swinum_base;
uint_t maxswinum;
uint_t siron_inum;
uint_t poke_cpu_inum;
int siron_pending;

int intr_policy = INTR_WEIGHTED_DIST;	/* interrupt distribution policy */
int intr_dist_debug = 0;
int32_t intr_dist_weight_max = 1;
int32_t intr_dist_weight_maxmax = 1000;
int intr_dist_weight_maxfactor = 2;
#define	INTR_DEBUG(args) if (intr_dist_debug) cmn_err args

static void sw_ivintr_init(cpu_t *);

/*
 * intr_init() - interrupt initialization
 *	Initialize the system's software interrupt vector table and
 *	CPU's interrupt free list
 */
void
intr_init(cpu_t *cp)
{
	init_ivintr();
	sw_ivintr_init(cp);
	init_intr_pool(cp);

	mutex_init(&intr_dist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&intr_dist_cpu_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * A soft interrupt may have been requested prior to the initialization
	 * of soft interrupts.  Soft interrupts can't be dispatched until after
	 * init_intr_pool, so we have to wait until now before we can dispatch
	 * the pending soft interrupt (if any).
	 */
	if (siron_pending)
		setsoftint(siron_inum);
}

/*
 * poke_cpu_intr - handler for the soft interrupt posted by poke_cpu();
 *	clears the poke_cpu_outstanding flag on the interrupted CPU.
 */

/* ARGSUSED */
uint_t
poke_cpu_intr(caddr_t arg1, caddr_t arg2)
{
	CPU->cpu_m.poke_cpu_outstanding = B_FALSE;
	membar_stld_stst();
	return (1);
}

/*
 * sw_ivintr_init() - software interrupt vector initialization
 *	called after CPU is active
 *	the software interrupt vector table is part of the intr_vector[]
 */
static void
sw_ivintr_init(cpu_t *cp)
{
	extern uint_t softlevel1();

	mutex_init(&soft_iv_lock, NULL, MUTEX_DEFAULT, NULL);

	swinum_base = SOFTIVNUM;

	/*
	 * the maximum software interrupt == MAX_SOFT_INO
	 */
	maxswinum = swinum_base + MAX_SOFT_INO;

	REGISTER_BBUS_INTR();

	siron_inum = add_softintr(PIL_1, softlevel1, 0);
	poke_cpu_inum = add_softintr(PIL_13, poke_cpu_intr, 0);
	cp->cpu_m.poke_cpu_outstanding = B_FALSE;
}

cpuset_t intr_add_pools_inuse;

/*
 * cleanup_intr_pool()
 *	Free up the extra intr request pool for this cpu.
 */
void
cleanup_intr_pool(cpu_t *cp)
{
	extern struct intr_req *intr_add_head;
	int poolno;
	struct intr_req *pool;

	poolno = cp->cpu_m.intr_pool_added;
	if (poolno >= 0) {
		cp->cpu_m.intr_pool_added = -1;
		pool = (poolno * INTR_PENDING_MAX * intr_add_pools) +
			intr_add_head;	/* not byte arithmetic */
		bzero(pool, INTR_PENDING_MAX * intr_add_pools *
		    sizeof (struct intr_req));

		CPUSET_DEL(intr_add_pools_inuse, poolno);
	}
}

/*
 * init_intr_pool()
 *	initialize the intr request pool for the cpu
 *	should be called for each cpu
 */
void
init_intr_pool(cpu_t *cp)
{
	extern struct intr_req *intr_add_head;
#ifdef	DEBUG
	extern struct intr_req *intr_add_tail;
#endif	/* DEBUG */
	int i, pool;

	cp->cpu_m.intr_pool_added = -1;

	for (i = 0; i < INTR_PENDING_MAX-1; i++) {
		cp->cpu_m.intr_pool[i].intr_next =
		    &cp->cpu_m.intr_pool[i+1];
	}
	cp->cpu_m.intr_pool[INTR_PENDING_MAX-1].intr_next = NULL;

	cp->cpu_m.intr_head[0] = &cp->cpu_m.intr_pool[0];
	cp->cpu_m.intr_tail[0] = &cp->cpu_m.intr_pool[INTR_PENDING_MAX-1];

	if (intr_add_pools != 0) {

		/*
		 * If additional interrupt pools have been allocated,
		 * initialize those too and add them to the free list.
		 */

		struct intr_req *trace;

		for (pool = 0; pool < max_ncpus; pool++) {
			if (!(CPU_IN_SET(intr_add_pools_inuse, pool)))
				break;
		}
		if (pool >= max_ncpus) {
			/*
			 * XXX - intr pools are alloc'd, just not as
			 * much as we would like.
			 */
			cmn_err(CE_WARN, "Failed to alloc all requested intr "
			    "pools for cpu%d", cp->cpu_id);
			return;
		}
		CPUSET_ADD(intr_add_pools_inuse, pool);
		cp->cpu_m.intr_pool_added = pool;

		trace = (pool * INTR_PENDING_MAX * intr_add_pools) +
			intr_add_head;	/* not byte arithmetic */

		cp->cpu_m.intr_pool[INTR_PENDING_MAX-1].intr_next = trace;

		for (i = 1; i < intr_add_pools * INTR_PENDING_MAX; i++, trace++)
			trace->intr_next = trace + 1;
		trace->intr_next = NULL;

		ASSERT(trace >= intr_add_head && trace <= intr_add_tail);

		cp->cpu_m.intr_tail[0] = trace;
	}
}


/*
 * siron - primitive for sun/os/softint.c
 */
void
siron(void)
{
	if (!siron_pending) {
		siron_pending = 1;
		if (siron_inum != 0)
			setsoftint(siron_inum);
	}
}

/*
 * no_ivintr()
 *	called by vec_interrupt() through sys_trap()
 *	vector interrupt received but not valid or not
 *	registered in intr_vector[]
 *	considered as a spurious mondo interrupt
 */
/* ARGSUSED */
void
no_ivintr(struct regs *rp, int inum, int pil)
{
	cmn_err(CE_WARN, "invalid vector intr: number 0x%x, pil 0x%x",
	    inum, pil);

#ifdef DEBUG_VEC_INTR
	prom_enter_mon();
#endif /* DEBUG_VEC_INTR */
}

/*
 * no_intr_pool()
 *	called by vec_interrupt() through sys_trap()
 *	vector interrupt received but no intr_req entries
 */
/* ARGSUSED */
void
no_intr_pool(struct regs *rp, int inum, int pil)
{
#ifdef DEBUG_VEC_INTR
	cmn_err(CE_WARN, "intr_req pool empty: num 0x%x, pil 0x%x",
		inum, pil);
	prom_enter_mon();
#else
	cmn_err(CE_PANIC, "intr_req pool empty: num 0x%x, pil 0x%x",
		inum, pil);
#endif /* DEBUG_VEC_INTR */
}

void
intr_dequeue_req(uint_t pil, uint32_t inum)
{
	struct intr_req *ir, *prev;
	struct machcpu *mcpu;
	uint32_t clr;
	extern uint_t getpstate(void);

	ASSERT((getpstate() & PSTATE_IE) == 0);

	mcpu = &CPU->cpu_m;

	/* Find a matching entry in the list */
	prev = NULL;
	ir = mcpu->intr_head[pil];
	while (ir != NULL) {
		if (ir->intr_number == inum)
			break;
		prev = ir;
		ir = ir->intr_next;
	}
	if (ir != NULL) {
		/*
		 * Remove entry from list
		 */
		if (prev != NULL)
			prev->intr_next = ir->intr_next;	/* non-head */
		else
			mcpu->intr_head[pil] = ir->intr_next;	/* head */

		if (ir->intr_next == NULL)
			mcpu->intr_tail[pil] = prev;		/* tail */

		/*
		 * Place on free list
		 */
		ir->intr_next = mcpu->intr_head[0];
		mcpu->intr_head[0] = ir;
	}

	/*
	 * clear pending interrupts at this level if the list is empty
	 */
	if (mcpu->intr_head[pil] == NULL) {
		clr = 1 << pil;
		if (pil == PIL_14)
			clr |= (TICK_INT_MASK | STICK_INT_MASK);
		wr_clr_softint(clr);
	}
}


/*
 * Send a directed interrupt of specified interrupt number id to a cpu.
 */
void
send_dirint(
	int cpuix,		/* cpu to be interrupted */
	int intr_id)		/* interrupt number id */
{
	xt_one(cpuix, setsoftint_tl1, intr_id, 0);
}

void
init_intr_threads(struct cpu *cp)
{
	int i;

	for (i = 0; i < NINTR_THREADS; i++)
		thread_create_intr(cp);

	cp->cpu_intr_stack = (caddr_t)segkp_get(segkp, INTR_STACK_SIZE,
		KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) +
		INTR_STACK_SIZE - SA(MINFRAME);
}

/*
 * Take the specified CPU out of participation in interrupts.
 *	Called by p_online(2) when a processor is being taken off-line.
 *	This allows interrupt threads being handled on the processor to
 *	complete before the processor is idled.
 */
int
cpu_disable_intr(struct cpu *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Turn off the CPU_ENABLE flag before calling the redistribution
	 * function, since it checks for this in the cpu flags.
	 */
	cp->cpu_flags &= ~CPU_ENABLE;

	intr_redist_all_cpus();

	return (0);
}

/*
 * Allow the specified CPU to participate in interrupts.
 *	Called by p_online(2) if a processor could not be taken off-line
 *	because of bound threads, in order to resume processing interrupts.
 *	Also called after starting a processor.
 */
void
cpu_enable_intr(struct cpu *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	cp->cpu_flags |= CPU_ENABLE;

	intr_redist_all_cpus();
}

/*
 * Add function to callback list for intr_redist_all_cpus.  We keep two lists,
 * one for weighted callbacks and one for normal callbacks. Weighted callbacks
 * are issued to redirect interrupts of a specified weight, from heavy to
 * light.  This allows all the interrupts of a given weight to be redistributed
 * for all weighted nexus drivers prior to those of less weight.
 */
static void
intr_dist_add_list(struct intr_dist **phead, void (*func)(void *), void *arg)
{
	struct intr_dist *new = kmem_alloc(sizeof (*new), KM_SLEEP);
	struct intr_dist *iptr;
	struct intr_dist **pptr;

	ASSERT(func);
	new->func = func;
	new->arg = arg;
	new->next = NULL;

	/* Add to tail so that redistribution occurs in original order. */
	mutex_enter(&intr_dist_lock);
	for (iptr = *phead, pptr = phead; iptr != NULL;
	    pptr = &iptr->next, iptr = iptr->next) {
		/* check for problems as we locate the tail */
		if ((iptr->func == func) && (iptr->arg == arg)) {
			cmn_err(CE_PANIC, "intr_dist_add_list(): duplicate");
			/*NOTREACHED*/
		}
	}
	*pptr = new;

	mutex_exit(&intr_dist_lock);
}

void
intr_dist_add(void (*func)(void *), void *arg)
{
	intr_dist_add_list(&intr_dist_head, (void (*)(void *))func, arg);
}

void
intr_dist_add_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
{
	intr_dist_add_list(&intr_dist_whead, (void (*)(void *))func, arg);
}

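/*
 * Illustrative sketch, not part of the original file: how a nexus driver
 * might register and unregister redistribution callbacks with the lists
 * above.  The xx_intr_dist() callback and xx_state_t soft state are
 * hypothetical names; only intr_dist_add(), intr_dist_add_weighted() and
 * the matching _rem routines below are interfaces provided here.
 *
 *	static void
 *	xx_intr_dist(void *arg)
 *	{
 *		xx_state_t *xx_p = (xx_state_t *)arg;
 *
 *		(retarget this nexus' interrupts, e.g. via intr_dist_cpuid())
 *	}
 *
 *	attach:	intr_dist_add(xx_intr_dist, xx_p);
 *	detach:	intr_dist_rem(xx_intr_dist, xx_p);
 *
 * A weight-aware nexus registers with intr_dist_add_weighted() instead and
 * is called once per weight pass by intr_redist_all_cpus() below.
 */
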
/*
 * Search the specified interrupt distribution list for an entry with the
 * given callback function and argument. If a match is found, then delete
 * the entry from the list. The caller is responsible for modifying the
 * mondo vector registers.
 */
static void
intr_dist_rem_list(struct intr_dist **headp, void (*func)(void *), void *arg)
{
	struct intr_dist *iptr;
	struct intr_dist **vect;

	mutex_enter(&intr_dist_lock);
	for (iptr = *headp, vect = headp;
	    iptr != NULL; vect = &iptr->next, iptr = iptr->next) {
		if ((iptr->func == func) && (iptr->arg == arg)) {
			*vect = iptr->next;
			kmem_free(iptr, sizeof (struct intr_dist));
			mutex_exit(&intr_dist_lock);
			return;
		}
	}

	if (!panicstr)
		cmn_err(CE_PANIC, "intr_dist_rem_list: not found");
	mutex_exit(&intr_dist_lock);
}

void
intr_dist_rem(void (*func)(void *), void *arg)
{
	intr_dist_rem_list(&intr_dist_head, (void (*)(void *))func, arg);
}

void
intr_dist_rem_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
{
	intr_dist_rem_list(&intr_dist_whead, (void (*)(void *))func, arg);
}

/*
 * Initiate interrupt redistribution.  Redistribution improves the isolation
 * associated with interrupt weights by ordering operations from heavy weight
 * to light weight.  When a CPU's orientation changes relative to interrupts,
 * there is *always* a redistribution to accommodate this change (call to
 * intr_redist_all_cpus()).  As devices (not CPUs) attach/detach it is possible
 * that a redistribution could improve the quality of an initialization. For
 * example, if you are not using a NIC it may not be attached with s10 (devfs).
 * If you then configure the NIC (ifconfig), this may cause the NIC to attach
 * and plumb interrupts.  The CPU assignment for the NIC's interrupts is
 * occurring late, so optimal "isolation" relative to weight is not occurring.
 * The same applies to detach, although in this case doing the redistribution
 * might improve "spread" for medium weight devices since the "isolation" of
 * a higher weight device may no longer be present.
 *
 * NB: We should provide a utility to trigger redistribution (ala "intradm -r").
 *
 * NB: There is risk associated with automatically triggering execution of the
 * redistribution code at arbitrary times. The risk comes from the fact that
 * there is a lot of low-level hardware interaction associated with a
 * redistribution.  At some point we may want this code to perform automatic
 * redistribution (redistribution thread; trigger timeout when add/remove
 * weight delta is large enough, and call cv_signal from timeout - causing
 * thread to call i_ddi_intr_redist_all_cpus()) but this is considered too
 * risky at this time.
 */
void
i_ddi_intr_redist_all_cpus()
{
	mutex_enter(&cpu_lock);
	INTR_DEBUG((CE_CONT, "intr_dist: i_ddi_intr_redist_all_cpus\n"));
	intr_redist_all_cpus();
	mutex_exit(&cpu_lock);
}

/*
 * Redistribute all interrupts
 *
 * This function redistributes all interrupting devices, running the
 * parent callback functions for each node.
 */
void
intr_redist_all_cpus(void)
{
	struct cpu *cp;
	struct intr_dist *iptr;
	int32_t weight, max_weight;

	ASSERT(MUTEX_HELD(&cpu_lock));
	mutex_enter(&intr_dist_lock);

	/*
	 * zero cpu_intr_weight on all cpus - it is safe to traverse
	 * cpu_list since we hold cpu_lock.
	 */
	cp = cpu_list;
	do {
		cp->cpu_intr_weight = 0;
	} while ((cp = cp->cpu_next) != cpu_list);

	/*
	 * Assume that this redistribution may encounter a device weight
	 * via driver.conf tuning of "ddi-intr-weight" that is at most
	 * intr_dist_weight_maxfactor times larger.
	 */
	max_weight = intr_dist_weight_max * intr_dist_weight_maxfactor;
	if (max_weight > intr_dist_weight_maxmax)
		max_weight = intr_dist_weight_maxmax;
	intr_dist_weight_max = 1;

	INTR_DEBUG((CE_CONT, "intr_dist: "
	    "intr_redist_all_cpus: %d-0\n", max_weight));

	/*
	 * Redistribute weighted, from heavy to light.  The callback that
	 * specifies a weight equal to weight_max should redirect all
	 * interrupts of weight weight_max or greater [weight_max, inf.).
	 * Interrupts of lesser weight should be processed on the call with
	 * the matching weight. This allows all the heavier weight interrupts
	 * on all weighted busses (multiple pci busses) to be redirected prior
	 * to any lesser weight interrupts.
	 */
	for (weight = max_weight; weight >= 0; weight--)
		for (iptr = intr_dist_whead; iptr != NULL; iptr = iptr->next)
			((void (*)(void *, int32_t, int32_t))iptr->func)
			    (iptr->arg, max_weight, weight);

	/* redistribute normal (non-weighted) interrupts */
	for (iptr = intr_dist_head; iptr != NULL; iptr = iptr->next)
		((void (*)(void *))iptr->func)(iptr->arg);
	mutex_exit(&intr_dist_lock);
}

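/*
 * Illustrative sketch, not part of the original file: the shape of a
 * weighted redistribution callback that honors the contract above.  The
 * xx_intr_dist_weighted() name and the per-child iteration are
 * hypothetical; i_ddi_get_intr_weight(), intr_dist_cpuid() and
 * intr_dist_cpuid_add_device_weight() are the interfaces assumed.
 *
 *	static void
 *	xx_intr_dist_weighted(void *arg, int32_t weight_max, int32_t weight)
 *	{
 *		(for each interrupt/child cdip owned by this nexus) {
 *			dweight = i_ddi_get_intr_weight(cdip);
 *			if (dweight < 0)
 *				dweight = 0;
 *
 *			(redirect [weight_max, inf.) on the first pass,
 *			exact matches on the later passes)
 *			if ((weight == weight_max && dweight >= weight_max) ||
 *			    (weight < weight_max && dweight == weight)) {
 *				cpuid = intr_dist_cpuid();
 *				(reprogram the mondo target to cpuid)
 *				intr_dist_cpuid_add_device_weight(cpuid,
 *				    cdip, dweight);
 *			}
 *		}
 *	}
 */
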
void
intr_redist_all_cpus_shutdown(void)
{
	intr_policy = INTR_CURRENT_CPU;
	intr_redist_all_cpus();
}

/*
 * Determine what CPU to target, based on interrupt policy.
 *
 * INTR_FLAT_DIST: hold a current CPU pointer in a static variable and
 *	advance through interrupt enabled cpus (round-robin).
 *
 * INTR_WEIGHTED_DIST: search for an enabled CPU with the lowest
 *	cpu_intr_weight, round robin when all equal.
 *
 *	Weighted interrupt distribution provides two things: "spread" of weight
 *	(associated with algorithm itself) and "isolation" (associated with a
 *	particular device weight). A redistribution is what provides optimal
 *	"isolation" of heavy weight interrupts; optimal "spread" of weight
 *	(relative to what came before) is always occurring.
 *
 *	An interrupt weight is a subjective number that represents the
 *	percentage of a CPU required to service a device's interrupts: the
 *	default weight is 0% (however the algorithm still maintains
 *	round-robin), a network interface controller (NIC) may have a large
 *	weight (35%). Interrupt weight only has meaning relative to the
 *	interrupt weight of other devices: a CPU can be weighted more than
 *	100%, and a single device might consume more than 100% of a CPU.
 *
 *	A coarse interrupt weight can be defined by the parent nexus driver
 *	based on bus specific information, like pci class codes. A nexus
 *	driver that supports device interrupt weighting for its children
 *	should call intr_dist_cpuid_add/rem_device_weight(), which adds
 *	and removes the weight of a device from the CPU that an interrupt
 *	is directed at.  The quality of initialization improves when the
 *	device interrupt weights more accurately reflect actual run-time
 *	weights, and as the assignments are ordered from heavy to light.
 *
 *	The implementation also supports interrupt weight being specified in
 *	driver.conf files via the property "ddi-intr-weight", which takes
 *	precedence over the nexus supplied weight.  This support is added to
 *	permit possible tweaking in the product in response to customer
 *	problems. This is not a formal or committed interface.
 *
 *	While a weighted approach chooses the CPU providing the best spread
 *	given past weights, less than optimal isolation can result in cases
 *	where heavy weight devices show up last. The nexus driver's interrupt
 *	redistribution logic should use intr_dist_add/rem_weighted so that
 *	interrupts can be redistributed heavy first for optimal isolation.
 */
uint32_t
intr_dist_cpuid(void)
{
	static struct cpu	*curr_cpu;
	struct cpu		*start_cpu;
	struct cpu		*new_cpu;
	struct cpu		*cp;
	int			cpuid = -1;

	/* Establish exclusion for curr_cpu and cpu_intr_weight manipulation */
	mutex_enter(&intr_dist_cpu_lock);

	switch (intr_policy) {
	case INTR_CURRENT_CPU:
		cpuid = CPU->cpu_id;
		break;

	case INTR_BOOT_CPU:
		panic("INTR_BOOT_CPU no longer supported.");
		/*NOTREACHED*/

	case INTR_FLAT_DIST:
	case INTR_WEIGHTED_DIST:
	default:
		/*
		 * Ensure that curr_cpu is valid - cpu_next will be NULL if
		 * the cpu has been deleted (cpu structs are never freed).
		 */
		if (curr_cpu == NULL || curr_cpu->cpu_next == NULL)
			curr_cpu = CPU;

		/*
		 * Advance to online CPU after curr_cpu (round-robin). For
		 * INTR_WEIGHTED_DIST we choose the cpu with the lightest
		 * weight.  For a nexus that does not support weight the
		 * default weight of zero is used, and we degrade to
		 * round-robin behavior among equal weights.
		 *
		 * Disable preemption while traversing cpu_next_onln to
		 * ensure the list does not change.  This works because
		 * modifiers of this list and other lists in a struct cpu
		 * call pause_cpus() before making changes.
		 */
		kpreempt_disable();
		cp = start_cpu = curr_cpu->cpu_next_onln;
		new_cpu = NULL;
		do {
			/* Skip CPUs with interrupts disabled */
			if ((cp->cpu_flags & CPU_ENABLE) == 0)
				continue;

			if (intr_policy == INTR_FLAT_DIST) {
				/* select CPU */
				new_cpu = cp;
				break;
			} else if ((new_cpu == NULL) ||
			    (cp->cpu_intr_weight < new_cpu->cpu_intr_weight)) {
				/* Choose if lighter weight */
				new_cpu = cp;
			}
		} while ((cp = cp->cpu_next_onln) != start_cpu);
		ASSERT(new_cpu);
		cpuid = new_cpu->cpu_id;

		INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: "
		    "targeted\n", cpuid, new_cpu->cpu_intr_weight));

		/* update static pointer for next round-robin */
		curr_cpu = new_cpu;
		kpreempt_enable();
		break;
	}
	mutex_exit(&intr_dist_cpu_lock);
	return (cpuid);
}

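/*
 * Illustrative sketch, not part of the original file: the typical caller
 * sequence for intr_dist_cpuid().  The mondo-programming step, cdip and
 * nexus_weight are hypothetical placeholders; intr_dist_cpuid() and
 * intr_dist_cpuid_add_device_weight() are the interfaces assumed.
 *
 *	cpuid = intr_dist_cpuid();
 *	(program the interrupt's mondo/ino target register to cpuid)
 *	intr_dist_cpuid_add_device_weight(cpuid, cdip, nexus_weight);
 *
 * Under INTR_FLAT_DIST the call round-robins across online CPUs that have
 * CPU_ENABLE set; under INTR_WEIGHTED_DIST it returns the enabled CPU with
 * the lowest accumulated cpu_intr_weight.
 */
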
/*
 * Add or remove the weight of a device from a CPU's interrupt weight.
 *
 * We expect nexus drivers to call intr_dist_cpuid_add/rem_device_weight for
 * their children to improve the overall quality of interrupt initialization.
 *
 * If a nexus shares the CPU returned by a single intr_dist_cpuid() call
 * among multiple devices (sharing ino) then the nexus should call
 * intr_dist_cpuid_add/rem_device_weight for each device separately. Devices
 * that share must specify the same cpuid.
 *
 * If a nexus driver is unable to determine the cpu at remove_intr time
 * for some of its interrupts, then it should not call add_device_weight -
 * intr_dist_cpuid will still provide round-robin.
 *
 * An established device weight (from dev_info node) takes precedence over
 * the weight passed in.  If a device weight is not already established
 * then the passed in nexus weight is established.
 */
void
intr_dist_cpuid_add_device_weight(uint32_t cpuid,
    dev_info_t *dip, int32_t nweight)
{
	int32_t		eweight;

	/*
	 * For non-weighted policy everything has weight of zero (and we get
	 * round-robin distribution from intr_dist_cpuid).
	 * NB: intr_policy is limited to this file. A weighted nexus driver
	 * calls this routine even if intr_policy has been patched to
	 * INTR_FLAT_DIST.
	 */
	ASSERT(dip);
	if (intr_policy != INTR_WEIGHTED_DIST)
		return;

	eweight = i_ddi_get_intr_weight(dip);
	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: +%2d/%2d for "
	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight,
	    nweight, eweight, ddi_driver_name(ddi_get_parent(dip)),
	    ddi_get_instance(ddi_get_parent(dip)),
	    ddi_driver_name(dip), ddi_get_instance(dip)));

	/* if no established weight, establish the nexus weight */
	if (eweight < 0) {
		if (nweight > 0)
			(void) i_ddi_set_intr_weight(dip, nweight);
		else
			nweight = 0;
	} else
		nweight = eweight;	/* use established weight */

	/* Establish exclusion for cpu_intr_weight manipulation */
	mutex_enter(&intr_dist_cpu_lock);
	cpu[cpuid]->cpu_intr_weight += nweight;

	/* update intr_dist_weight_max */
	if (nweight > intr_dist_weight_max)
		intr_dist_weight_max = nweight;
	mutex_exit(&intr_dist_cpu_lock);
}

void
intr_dist_cpuid_rem_device_weight(uint32_t cpuid, dev_info_t *dip)
{
	struct cpu	*cp;
	int32_t		weight;

	ASSERT(dip);
	if (intr_policy != INTR_WEIGHTED_DIST)
		return;

	/* remove weight of device from cpu */
	weight = i_ddi_get_intr_weight(dip);
	if (weight < 0)
		weight = 0;
	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: -%2d    for "
	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight, weight,
	    ddi_driver_name(ddi_get_parent(dip)),
	    ddi_get_instance(ddi_get_parent(dip)),
	    ddi_driver_name(dip), ddi_get_instance(dip)));

	/* Establish exclusion for cpu_intr_weight manipulation */
	mutex_enter(&intr_dist_cpu_lock);
	cp = cpu[cpuid];
	cp->cpu_intr_weight -= weight;
	if (cp->cpu_intr_weight < 0)
		cp->cpu_intr_weight = 0;	/* sanity */
	mutex_exit(&intr_dist_cpu_lock);
}
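
/*
 * Illustrative sketch, not part of the original file: the add and rem calls
 * are expected to be paired.  When a nexus tears down or retargets an
 * interrupt it removes the device weight from the CPU it last recorded it
 * against before choosing a new target; old_cpuid, new_cpuid, cdip and
 * weight are hypothetical placeholders.
 *
 *	intr_dist_cpuid_rem_device_weight(old_cpuid, cdip);
 *	...
 *	new_cpuid = intr_dist_cpuid();
 *	intr_dist_cpuid_add_device_weight(new_cpuid, cdip, weight);
 */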
793