xref: /titanic_50/usr/src/uts/sun4/os/intr.c (revision 36ca3987254525f41789b62db675686b5eeb5048)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/sysmacros.h>
29 #include <sys/stack.h>
30 #include <sys/cpuvar.h>
31 #include <sys/ivintr.h>
32 #include <sys/intreg.h>
33 #include <sys/membar.h>
34 #include <sys/kmem.h>
35 #include <sys/intr.h>
36 #include <sys/sunddi.h>
37 #include <sys/sunndi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/privregs.h>
40 #include <sys/systm.h>
41 #include <sys/archsystm.h>
42 #include <sys/machsystm.h>
43 #include <sys/x_call.h>
44 #include <vm/seg_kp.h>
45 #include <sys/debug.h>
46 #include <sys/cyclic.h>
47 #include <sys/kdi_impl.h>
48 #include <sys/ddi_timer.h>
49 
50 #include <sys/cpu_sgnblk_defs.h>
51 
/*
 * Global locks which protect the interrupt distribution lists.
 * intr_dist_lock is held while the callback lists below are walked or
 * modified; intr_dist_cpu_lock is held while a target CPU is chosen and
 * while per-CPU interrupt weights are adjusted.
 */
static kmutex_t intr_dist_lock;
static kmutex_t intr_dist_cpu_lock;

/*
 * Head of the interrupt distribution lists: intr_dist_head holds the normal
 * callbacks, intr_dist_whead holds the weighted callbacks.
 */
static struct intr_dist *intr_dist_head = NULL;
static struct intr_dist *intr_dist_whead = NULL;

/*
 * Software interrupt numbers for the ddi timer levels; the entry for level
 * 'n' lives at index n - 1 and is filled in by intr_init().
 */
static uint64_t siron_inum[DDI_IPL_10]; /* software interrupt numbers */
/* Per-CPU softlevel1 inums, indexed by cpu id; NULL until siron_init() runs */
uint64_t *siron_cpu_inum = NULL;
/* inum whose handler is siron_poke_cpu_intr(); registered by intr_init() */
uint64_t siron_poke_cpu_inum;
static int siron_cpu_setup(cpu_setup_t, int, void *);
extern uint_t softlevel1();

/* PIL_1 softlevel1 inum shared by all CPUs; registered by intr_init() */
static uint64_t siron1_inum; /* backward compatibility */
/* inum whose handler is poke_cpu_intr(); registered by intr_init() */
uint64_t poke_cpu_inum;
uint_t poke_cpu_intr(caddr_t arg1, caddr_t arg2);
uint_t siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2);
70 
/*
 * Note:-
 * siron_pending was originally created to prevent a resource over consumption
 * bug in setsoftint (exhaustion of interrupt pool free list).
 * Its original intention is obsolete with the use of iv_pending in
 * setsoftint. However, siron_pending stayed around, acting as a second
 * gatekeeper preventing soft interrupts from being queued. In this capacity,
 * it can lead to hangs on MP systems, where due to global visibility issues
 * it can end up set while iv_pending is reset, preventing soft interrupts from
 * ever being processed. In addition to its gatekeeper role, intr_init also
 * uses it to flag the situation where siron() was called before siron_inum has
 * been defined.
 *
 * siron() does not need an extra gatekeeper; any cpu that wishes should be
 * allowed to queue a soft interrupt. It is softint()'s job to ensure
 * correct handling of the queues. Therefore, siron_pending has been
 * stripped of its gatekeeper task, retaining only its intr_init job, where
 * it indicates that there is a pending need to call siron().
 *
 * siron_pending[] is indexed by level - 1, matching siron_inum[].
 */
static int siron_pending[DDI_IPL_10]; /* software interrupt pending flags */
static int siron1_pending; /* backward compatibility */
92 
int intr_policy = INTR_WEIGHTED_DIST;	/* interrupt distribution policy */
/* non-zero enables the INTR_DEBUG() messages in this file */
int intr_dist_debug = 0;
/*
 * Largest device weight seen by intr_dist_cpuid_add_device_weight() since
 * the last redistribution; intr_redist_all_cpus() resets it to 1.
 */
int32_t intr_dist_weight_max = 1;
/* upper bound on the starting weight used by intr_redist_all_cpus() */
int32_t intr_dist_weight_maxmax = 1000;
/* multiplier applied to intr_dist_weight_max by intr_redist_all_cpus() */
int intr_dist_weight_maxfactor = 2;
/*
 * Emit a cmn_err() message only when intr_dist_debug is non-zero.  'args' is
 * the complete, parenthesized cmn_err() argument list.  The do/while (0)
 * wrapper makes the expansion a single statement, so a use placed directly
 * in front of an 'else' cannot capture that else.
 */
#define	INTR_DEBUG(args)	do {					\
	if (intr_dist_debug)						\
		cmn_err args;						\
} while (0)
99 
100 /*
101  * intr_init() - Interrupt initialization
102  *	Initialize the system's interrupt vector table.
103  */
104 void
105 intr_init(cpu_t *cp)
106 {
107 	int i;
108 	extern uint_t softlevel1();
109 
110 	init_ivintr();
111 	REGISTER_BBUS_INTR();
112 
113 	/*
114 	 * Register these software interrupts for ddi timer.
115 	 * Software interrupts up to the level 10 are supported.
116 	 */
117 	for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
118 		siron_inum[i-1] = add_softintr(i, (softintrfunc)timer_softintr,
119 		    (caddr_t)(uintptr_t)(i), SOFTINT_ST);
120 	}
121 
122 	siron1_inum = add_softintr(PIL_1, softlevel1, 0, SOFTINT_ST);
123 	poke_cpu_inum = add_softintr(PIL_13, poke_cpu_intr, 0, SOFTINT_MT);
124 	siron_poke_cpu_inum = add_softintr(PIL_13,
125 	    siron_poke_cpu_intr, 0, SOFTINT_MT);
126 	cp->cpu_m.poke_cpu_outstanding = B_FALSE;
127 
128 	mutex_init(&intr_dist_lock, NULL, MUTEX_DEFAULT, NULL);
129 	mutex_init(&intr_dist_cpu_lock, NULL, MUTEX_DEFAULT, NULL);
130 
131 	/*
132 	 * A soft interrupt may have been requested prior to the initialization
133 	 * of soft interrupts.  Soft interrupts can't be dispatched until after
134 	 * init_intr(), so we have to wait until now before we can dispatch the
135 	 * pending soft interrupt (if any).
136 	 */
137 	for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
138 		if (siron_pending[i-1]) {
139 			siron_pending[i-1] = 0;
140 			sir_on(i);
141 		}
142 	}
143 	if (siron1_pending) {
144 		siron1_pending = 0;
145 		siron();
146 	}
147 }
148 
149 /*
150  * poke_cpu_intr - fall through when poke_cpu calls
151  */
152 /* ARGSUSED */
153 uint_t
154 poke_cpu_intr(caddr_t arg1, caddr_t arg2)
155 {
156 	CPU->cpu_m.poke_cpu_outstanding = B_FALSE;
157 	membar_stld_stst();
158 	return (1);
159 }
160 
161 /*
162  * Trigger software interrupts dedicated to ddi timer.
163  */
164 void
165 sir_on(int level)
166 {
167 	ASSERT(level >= DDI_IPL_1 && level <= DDI_IPL_10);
168 	if (siron_inum[level-1])
169 		setsoftint(siron_inum[level-1]);
170 	else
171 		siron_pending[level-1] = 1;
172 }
173 
174 /*
175  * kmdb uses siron (and thus setsoftint) while the world is stopped in order to
176  * inform its driver component that there's work to be done.  We need to keep
177  * DTrace from instrumenting kmdb's siron and setsoftint.  We duplicate siron,
178  * giving kmdb's version a kdi_ prefix to keep DTrace at bay.  The
179  * implementation of setsoftint is complicated enough that we don't want to
180  * duplicate it, but at the same time we don't want to preclude tracing either.
181  * The meat of setsoftint() therefore goes into kdi_setsoftint, with
182  * setsoftint() implemented as a wrapper.  This allows tracing, while still
183  * providing a way for kmdb to sneak in unmolested.
184  */
185 void
186 kdi_siron(void)
187 {
188 	if (siron1_inum != 0)
189 		kdi_setsoftint(siron1_inum);
190 	else
191 		siron1_pending = 1;
192 }
193 
/*
 * Post the soft interrupt identified by inum.  This is only a wrapper: the
 * work is done by kdi_setsoftint().
 */
void
setsoftint(uint64_t inum)
{
	kdi_setsoftint(inum);
}
199 
200 /*
201  * Generates softlevel1 interrupt on current CPU if it
202  * is not pending already.
203  */
204 void
205 siron(void)
206 {
207 	uint64_t inum;
208 
209 	if (siron1_inum != 0) {
210 		/*
211 		 * Once siron_cpu_inum has been allocated, we can
212 		 * use per-CPU siron inum.
213 		 */
214 		if (siron_cpu_inum && siron_cpu_inum[CPU->cpu_id] != 0)
215 			inum = siron_cpu_inum[CPU->cpu_id];
216 		else
217 			inum = siron1_inum;
218 
219 		setsoftint(inum);
220 	} else
221 		siron1_pending = 1;
222 }
223 
224 
225 static void
226 siron_init(void)
227 {
228 	/*
229 	 * We just allocate memory for per-cpu siron right now. Rest of
230 	 * the work is done when CPU is configured.
231 	 */
232 	siron_cpu_inum = kmem_zalloc(sizeof (uint64_t) * NCPU, KM_SLEEP);
233 }
234 
235 /*
236  * This routine creates per-CPU siron inum for CPUs which are
237  * configured during boot.
238  */
239 void
240 siron_mp_init()
241 {
242 	cpu_t *c;
243 
244 	/*
245 	 * Get the memory for per-CPU siron inums
246 	 */
247 	siron_init();
248 
249 	mutex_enter(&cpu_lock);
250 	c = cpu_list;
251 	do {
252 		(void) siron_cpu_setup(CPU_CONFIG, c->cpu_id, NULL);
253 	} while ((c = c->cpu_next) != cpu_list);
254 
255 	register_cpu_setup_func(siron_cpu_setup, NULL);
256 	mutex_exit(&cpu_lock);
257 }
258 
/*
 * siron_poke_cpu_intr - cross-call handler.
 *	Registered by intr_init() as the handler for siron_poke_cpu_inum,
 *	which siron_poke_cpu() posts on other CPUs.  It calls siron() on the
 *	CPU it runs on and always returns 1.  Both arguments are unused.
 */
/* ARGSUSED */
uint_t
siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2)
{
	/* generate level1 softint */
	siron();
	return (1);
}
270 
271 /*
272  * This routine generates a cross-call on target CPU(s).
273  */
274 void
275 siron_poke_cpu(cpuset_t poke)
276 {
277 	int cpuid = CPU->cpu_id;
278 
279 	if (CPU_IN_SET(poke, cpuid)) {
280 		siron();
281 		CPUSET_DEL(poke, cpuid);
282 		if (CPUSET_ISNULL(poke))
283 			return;
284 	}
285 
286 	xt_some(poke, setsoftint_tl1, siron_poke_cpu_inum, 0);
287 }
288 
289 /*
290  * This callback function allows us to create per-CPU siron inum.
291  */
292 /* ARGSUSED */
293 static int
294 siron_cpu_setup(cpu_setup_t what, int id, void *arg)
295 {
296 	cpu_t *cp = cpu[id];
297 
298 	ASSERT(MUTEX_HELD(&cpu_lock));
299 	ASSERT(cp != NULL);
300 
301 	switch (what) {
302 	case CPU_CONFIG:
303 		siron_cpu_inum[cp->cpu_id] = add_softintr(PIL_1,
304 		    (softintrfunc)softlevel1, 0, SOFTINT_ST);
305 		break;
306 	case CPU_UNCONFIG:
307 		(void) rem_softintr(siron_cpu_inum[cp->cpu_id]);
308 		siron_cpu_inum[cp->cpu_id] = 0;
309 		break;
310 	default:
311 		break;
312 	}
313 
314 	return (0);
315 }
316 
/*
 * no_ivintr()
 * 	called by setvecint_tl1() through sys_trap()
 *	vector interrupt received but not valid or not
 *	registered in intr_vec_table
 *	considered as a spurious mondo interrupt
 *
 *	Logs a warning carrying the interrupt number and pil; rp is unused.
 *	When built with DEBUG_VEC_INTR it additionally calls
 *	prom_enter_mon().
 */
/* ARGSUSED */
void
no_ivintr(struct regs *rp, int inum, int pil)
{
	cmn_err(CE_WARN, "invalid vector intr: number 0x%x, pil 0x%x",
	    inum, pil);

#ifdef DEBUG_VEC_INTR
	prom_enter_mon();
#endif /* DEBUG_VEC_INTR */
}
335 
336 void
337 intr_dequeue_req(uint_t pil, uint64_t inum)
338 {
339 	intr_vec_t	*iv, *next, *prev;
340 	struct machcpu	*mcpu;
341 	uint32_t	clr;
342 	processorid_t	cpu_id;
343 	extern uint_t	getpstate(void);
344 
345 	ASSERT((getpstate() & PSTATE_IE) == 0);
346 
347 	mcpu = &CPU->cpu_m;
348 	cpu_id = CPU->cpu_id;
349 
350 	iv = (intr_vec_t *)inum;
351 	prev = NULL;
352 	next = mcpu->intr_head[pil];
353 
354 	/* Find a matching entry in the list */
355 	while (next != NULL) {
356 		if (next == iv)
357 			break;
358 		prev = next;
359 		next = IV_GET_PIL_NEXT(next, cpu_id);
360 	}
361 
362 	if (next != NULL) {
363 		intr_vec_t	*next_iv = IV_GET_PIL_NEXT(next, cpu_id);
364 
365 		/* Remove entry from list */
366 		if (prev != NULL)
367 			IV_SET_PIL_NEXT(prev, cpu_id, next_iv); /* non-head */
368 		else
369 			mcpu->intr_head[pil] = next_iv; /* head */
370 
371 		if (next_iv == NULL)
372 			mcpu->intr_tail[pil] = prev; /* tail */
373 	}
374 
375 	/* Clear pending interrupts at this level if the list is empty */
376 	if (mcpu->intr_head[pil] == NULL) {
377 		clr = 1 << pil;
378 		if (pil == PIL_14)
379 			clr |= (TICK_INT_MASK | STICK_INT_MASK);
380 		wr_clr_softint(clr);
381 	}
382 }
383 
384 
385 /*
386  * Send a directed interrupt of specified interrupt number id to a cpu.
387  */
388 void
389 send_dirint(
390 	int cpuix,		/* cpu to be interrupted */
391 	int intr_id)		/* interrupt number id */
392 {
393 	xt_one(cpuix, setsoftint_tl1, intr_id, 0);
394 }
395 
/*
 * Take the specified CPU out of participation in interrupts.
 *	Called by p_online(2) when a processor is being taken off-line.
 *	This allows interrupt threads being handled on the processor to
 *	complete before the processor is idled.
 *
 *	The caller must hold cpu_lock.  Always returns 0.
 */
int
cpu_disable_intr(struct cpu *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Turn off the CPU_ENABLE flag before calling the redistribution
	 * function, since it checks for this in the cpu flags.
	 */
	cp->cpu_flags &= ~CPU_ENABLE;

	intr_redist_all_cpus();

	return (0);
}
417 
/*
 * Allow the specified CPU to participate in interrupts.
 *	Called by p_online(2) if a processor could not be taken off-line
 *	because of bound threads, in order to resume processing interrupts.
 *	Also called after starting a processor.
 *
 *	The caller must hold cpu_lock.
 */
void
cpu_enable_intr(struct cpu *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	/* set the flag first so the redistribution below sees this cpu */
	cp->cpu_flags |= CPU_ENABLE;

	intr_redist_all_cpus();
}
433 
434 /*
435  * Add function to callback list for intr_redist_all_cpus.  We keep two lists,
436  * one for weighted callbacks and one for normal callbacks. Weighted callbacks
437  * are issued to redirect interrupts of a specified weight, from heavy to
438  * light.  This allows all the interrupts of a given weight to be redistributed
439  * for all weighted nexus drivers prior to those of less weight.
440  */
441 static void
442 intr_dist_add_list(struct intr_dist **phead, void (*func)(void *), void *arg)
443 {
444 	struct intr_dist *new = kmem_alloc(sizeof (*new), KM_SLEEP);
445 	struct intr_dist *iptr;
446 	struct intr_dist **pptr;
447 
448 	ASSERT(func);
449 	new->func = func;
450 	new->arg = arg;
451 	new->next = NULL;
452 
453 	/* Add to tail so that redistribution occurs in original order. */
454 	mutex_enter(&intr_dist_lock);
455 	for (iptr = *phead, pptr = phead; iptr != NULL;
456 	    pptr = &iptr->next, iptr = iptr->next) {
457 		/* check for problems as we locate the tail */
458 		if ((iptr->func == func) && (iptr->arg == arg)) {
459 			cmn_err(CE_PANIC, "intr_dist_add_list(): duplicate");
460 			/*NOTREACHED*/
461 		}
462 	}
463 	*pptr = new;
464 
465 	mutex_exit(&intr_dist_lock);
466 }
467 
468 void
469 intr_dist_add(void (*func)(void *), void *arg)
470 {
471 	intr_dist_add_list(&intr_dist_head, (void (*)(void *))func, arg);
472 }
473 
474 void
475 intr_dist_add_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
476 {
477 	intr_dist_add_list(&intr_dist_whead, (void (*)(void *))func, arg);
478 }
479 
480 /*
481  * Search for the interrupt distribution structure with the specified
482  * mondo vec reg in the interrupt distribution list. If a match is found,
483  * then delete the entry from the list. The caller is responsible for
484  * modifying the mondo vector registers.
485  */
486 static void
487 intr_dist_rem_list(struct intr_dist **headp, void (*func)(void *), void *arg)
488 {
489 	struct intr_dist *iptr;
490 	struct intr_dist **vect;
491 
492 	mutex_enter(&intr_dist_lock);
493 	for (iptr = *headp, vect = headp;
494 	    iptr != NULL; vect = &iptr->next, iptr = iptr->next) {
495 		if ((iptr->func == func) && (iptr->arg == arg)) {
496 			*vect = iptr->next;
497 			kmem_free(iptr, sizeof (struct intr_dist));
498 			mutex_exit(&intr_dist_lock);
499 			return;
500 		}
501 	}
502 
503 	if (!panicstr)
504 		cmn_err(CE_PANIC, "intr_dist_rem_list: not found");
505 	mutex_exit(&intr_dist_lock);
506 }
507 
508 void
509 intr_dist_rem(void (*func)(void *), void *arg)
510 {
511 	intr_dist_rem_list(&intr_dist_head, (void (*)(void *))func, arg);
512 }
513 
514 void
515 intr_dist_rem_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
516 {
517 	intr_dist_rem_list(&intr_dist_whead, (void (*)(void *))func, arg);
518 }
519 
/*
 * Initiate interrupt redistribution.  Redistribution improves the isolation
 * associated with interrupt weights by ordering operations from heavy weight
 * to light weight.  When a CPU's orientation changes relative to interrupts,
 * there is *always* a redistribution to accommodate this change (call to
 * intr_redist_all_cpus()).  As devices (not CPUs) attach/detach it is possible
 * that a redistribution could improve the quality of an initialization. For
 * example, if you are not using a NIC it may not be attached with s10 (devfs).
 * If you then configure the NIC (ifconfig), this may cause the NIC to attach
 * and plumb interrupts.  The CPU assignment for the NIC's interrupts is
 * occurring late, so optimal "isolation" relative to weight is not occurring.
 * The same applies to detach, although in this case doing the redistribution
 * might improve "spread" for medium weight devices since the "isolation" of
 * a higher weight device may no longer be present.
 *
 * NB: We should provide a utility to trigger redistribution (ala "intradm -r").
 *
 * NB: There is risk associated with automatically triggering execution of the
 * redistribution code at arbitrary times. The risk comes from the fact that
 * there is a lot of low-level hardware interaction associated with a
 * redistribution.  At some point we may want this code to perform automatic
 * redistribution (redistribution thread; trigger timeout when add/remove
 * weight delta is large enough, and call cv_signal from timeout - causing
 * thread to call i_ddi_intr_redist_all_cpus()) but this is considered too
 * risky at this time.
 *
 * This entry point acquires cpu_lock itself around the call to
 * intr_redist_all_cpus().
 */
void
i_ddi_intr_redist_all_cpus()
{
	mutex_enter(&cpu_lock);
	INTR_DEBUG((CE_CONT, "intr_dist: i_ddi_intr_redist_all_cpus\n"));
	intr_redist_all_cpus();
	mutex_exit(&cpu_lock);
}
554 
555 /*
556  * Redistribute all interrupts
557  *
558  * This function redistributes all interrupting devices, running the
559  * parent callback functions for each node.
560  */
561 void
562 intr_redist_all_cpus(void)
563 {
564 	struct cpu *cp;
565 	struct intr_dist *iptr;
566 	int32_t weight, max_weight;
567 
568 	ASSERT(MUTEX_HELD(&cpu_lock));
569 	mutex_enter(&intr_dist_lock);
570 
571 	/*
572 	 * zero cpu_intr_weight on all cpus - it is safe to traverse
573 	 * cpu_list since we hold cpu_lock.
574 	 */
575 	cp = cpu_list;
576 	do {
577 		cp->cpu_intr_weight = 0;
578 	} while ((cp = cp->cpu_next) != cpu_list);
579 
580 	/*
581 	 * Assume that this redistribution may encounter a device weight
582 	 * via driver.conf tuning of "ddi-intr-weight" that is at most
583 	 * intr_dist_weight_maxfactor times larger.
584 	 */
585 	max_weight = intr_dist_weight_max * intr_dist_weight_maxfactor;
586 	if (max_weight > intr_dist_weight_maxmax)
587 		max_weight = intr_dist_weight_maxmax;
588 	intr_dist_weight_max = 1;
589 
590 	INTR_DEBUG((CE_CONT, "intr_dist: "
591 	    "intr_redist_all_cpus: %d-0\n", max_weight));
592 
593 	/*
594 	 * Redistribute weighted, from heavy to light.  The callback that
595 	 * specifies a weight equal to weight_max should redirect all
596 	 * interrupts of weight weight_max or greater [weight_max, inf.).
597 	 * Interrupts of lesser weight should be processed on the call with
598 	 * the matching weight. This allows all the heaver weight interrupts
599 	 * on all weighted busses (multiple pci busses) to be redirected prior
600 	 * to any lesser weight interrupts.
601 	 */
602 	for (weight = max_weight; weight >= 0; weight--)
603 		for (iptr = intr_dist_whead; iptr != NULL; iptr = iptr->next)
604 			((void (*)(void *, int32_t, int32_t))iptr->func)
605 			    (iptr->arg, max_weight, weight);
606 
607 	/* redistribute normal (non-weighted) interrupts */
608 	for (iptr = intr_dist_head; iptr != NULL; iptr = iptr->next)
609 		((void (*)(void *))iptr->func)(iptr->arg);
610 	mutex_exit(&intr_dist_lock);
611 }
612 
/*
 * Switch the distribution policy to INTR_CURRENT_CPU and redistribute all
 * interrupts under that policy.
 * NOTE(review): intr_redist_all_cpus() asserts that cpu_lock is held, so the
 * caller presumably holds it - confirm against the callers.
 */
void
intr_redist_all_cpus_shutdown(void)
{
	intr_policy = INTR_CURRENT_CPU;
	intr_redist_all_cpus();
}
619 
620 /*
621  * Determine what CPU to target, based on interrupt policy.
622  *
623  * INTR_FLAT_DIST: hold a current CPU pointer in a static variable and
624  *	advance through interrupt enabled cpus (round-robin).
625  *
626  * INTR_WEIGHTED_DIST: search for an enabled CPU with the lowest
627  *	cpu_intr_weight, round robin when all equal.
628  *
629  *	Weighted interrupt distribution provides two things: "spread" of weight
630  *	(associated with algorithm itself) and "isolation" (associated with a
631  *	particular device weight). A redistribution is what provides optimal
632  *	"isolation" of heavy weight interrupts, optimal "spread" of weight
633  *	(relative to what came before) is always occurring.
634  *
635  *	An interrupt weight is a subjective number that represents the
636  *	percentage of a CPU required to service a device's interrupts: the
637  *	default weight is 0% (however the algorithm still maintains
638  *	round-robin), a network interface controller (NIC) may have a large
639  *	weight (35%). Interrupt weight only has meaning relative to the
640  *	interrupt weight of other devices: a CPU can be weighted more than
641  *	100%, and a single device might consume more than 100% of a CPU.
642  *
643  *	A coarse interrupt weight can be defined by the parent nexus driver
644  *	based on bus specific information, like pci class codes. A nexus
645  *	driver that supports device interrupt weighting for its children
646  *	should call intr_dist_cpuid_add/rem_device_weight(), which adds
647  *	and removes the weight of a device from the CPU that an interrupt
 *	is directed at.  The quality of initialization improves when the
 *	device interrupt weights more accurately reflect actual run-time
 *	weights, and as the assignments are ordered from heavy to light.
651  *
652  *	The implementation also supports interrupt weight being specified in
653  *	driver.conf files via the property "ddi-intr-weight", which takes
654  *	precedence over the nexus supplied weight.  This support is added to
655  *	permit possible tweaking in the product in response to customer
656  *	problems. This is not a formal or committed interface.
657  *
658  *	While a weighted approach chooses the CPU providing the best spread
659  *	given past weights, less than optimal isolation can result in cases
660  *	where heavy weight devices show up last. The nexus driver's interrupt
661  *	redistribution logic should use intr_dist_add/rem_weighted so that
662  *	interrupts can be redistributed heavy first for optimal isolation.
663  */
664 uint32_t
665 intr_dist_cpuid(void)
666 {
667 	static struct cpu	*curr_cpu;
668 	struct cpu		*start_cpu;
669 	struct cpu		*new_cpu;
670 	struct cpu		*cp;
671 	int			cpuid = -1;
672 
673 	/* Establish exclusion for curr_cpu and cpu_intr_weight manipulation */
674 	mutex_enter(&intr_dist_cpu_lock);
675 
676 	switch (intr_policy) {
677 	case INTR_CURRENT_CPU:
678 		cpuid = CPU->cpu_id;
679 		break;
680 
681 	case INTR_BOOT_CPU:
682 		panic("INTR_BOOT_CPU no longer supported.");
683 		/*NOTREACHED*/
684 
685 	case INTR_FLAT_DIST:
686 	case INTR_WEIGHTED_DIST:
687 	default:
688 		/*
689 		 * Ensure that curr_cpu is valid - cpu_next will be NULL if
690 		 * the cpu has been deleted (cpu structs are never freed).
691 		 */
692 		if (curr_cpu == NULL || curr_cpu->cpu_next == NULL)
693 			curr_cpu = CPU;
694 
695 		/*
696 		 * Advance to online CPU after curr_cpu (round-robin). For
697 		 * INTR_WEIGHTED_DIST we choose the cpu with the lightest
698 		 * weight.  For a nexus that does not support weight the
699 		 * default weight of zero is used. We degrade to round-robin
700 		 * behavior among equal weightes.  The default weight is zero
701 		 * and round-robin behavior continues.
702 		 *
703 		 * Disable preemption while traversing cpu_next_onln to
704 		 * ensure the list does not change.  This works because
705 		 * modifiers of this list and other lists in a struct cpu
706 		 * call pause_cpus() before making changes.
707 		 */
708 		kpreempt_disable();
709 		cp = start_cpu = curr_cpu->cpu_next_onln;
710 		new_cpu = NULL;
711 		do {
712 			/* Skip CPUs with interrupts disabled */
713 			if ((cp->cpu_flags & CPU_ENABLE) == 0)
714 				continue;
715 
716 			if (intr_policy == INTR_FLAT_DIST) {
717 				/* select CPU */
718 				new_cpu = cp;
719 				break;
720 			} else if ((new_cpu == NULL) ||
721 			    (cp->cpu_intr_weight < new_cpu->cpu_intr_weight)) {
722 				/* Choose if lighter weight */
723 				new_cpu = cp;
724 			}
725 		} while ((cp = cp->cpu_next_onln) != start_cpu);
726 		ASSERT(new_cpu);
727 		cpuid = new_cpu->cpu_id;
728 
729 		INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: "
730 		    "targeted\n", cpuid, new_cpu->cpu_intr_weight));
731 
732 		/* update static pointer for next round-robin */
733 		curr_cpu = new_cpu;
734 		kpreempt_enable();
735 		break;
736 	}
737 	mutex_exit(&intr_dist_cpu_lock);
738 	return (cpuid);
739 }
740 
741 /*
742  * Add or remove the the weight of a device from a CPUs interrupt weight.
743  *
744  * We expect nexus drivers to call intr_dist_cpuid_add/rem_device_weight for
745  * their children to improve the overall quality of interrupt initialization.
746  *
747  * If a nexues shares the CPU returned by a single intr_dist_cpuid() call
748  * among multiple devices (sharing ino) then the nexus should call
749  * intr_dist_cpuid_add/rem_device_weight for each device separately. Devices
750  * that share must specify the same cpuid.
751  *
752  * If a nexus driver is unable to determine the cpu at remove_intr time
753  * for some of its interrupts, then it should not call add_device_weight -
754  * intr_dist_cpuid will still provide round-robin.
755  *
756  * An established device weight (from dev_info node) takes precedence over
757  * the weight passed in.  If a device weight is not already established
758  * then the passed in nexus weight is established.
759  */
760 void
761 intr_dist_cpuid_add_device_weight(uint32_t cpuid,
762     dev_info_t *dip, int32_t nweight)
763 {
764 	int32_t		eweight;
765 
766 	/*
767 	 * For non-weighted policy everything has weight of zero (and we get
768 	 * round-robin distribution from intr_dist_cpuid).
769 	 * NB: intr_policy is limited to this file. A weighted nexus driver is
770 	 * calls this rouitne even if intr_policy has been patched to
771 	 * INTR_FLAG_DIST.
772 	 */
773 	ASSERT(dip);
774 	if (intr_policy != INTR_WEIGHTED_DIST)
775 		return;
776 
777 	eweight = i_ddi_get_intr_weight(dip);
778 	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: +%2d/%2d for "
779 	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight,
780 	    nweight, eweight, ddi_driver_name(ddi_get_parent(dip)),
781 	    ddi_get_instance(ddi_get_parent(dip)),
782 	    ddi_driver_name(dip), ddi_get_instance(dip)));
783 
784 	/* if no establish weight, establish nexus weight */
785 	if (eweight < 0) {
786 		if (nweight > 0)
787 			(void) i_ddi_set_intr_weight(dip, nweight);
788 		else
789 			nweight = 0;
790 	} else
791 		nweight = eweight;	/* use established weight */
792 
793 	/* Establish exclusion for cpu_intr_weight manipulation */
794 	mutex_enter(&intr_dist_cpu_lock);
795 	cpu[cpuid]->cpu_intr_weight += nweight;
796 
797 	/* update intr_dist_weight_max */
798 	if (nweight > intr_dist_weight_max)
799 		intr_dist_weight_max = nweight;
800 	mutex_exit(&intr_dist_cpu_lock);
801 }
802 
803 void
804 intr_dist_cpuid_rem_device_weight(uint32_t cpuid, dev_info_t *dip)
805 {
806 	struct cpu	*cp;
807 	int32_t		weight;
808 
809 	ASSERT(dip);
810 	if (intr_policy != INTR_WEIGHTED_DIST)
811 		return;
812 
813 	/* remove weight of device from cpu */
814 	weight = i_ddi_get_intr_weight(dip);
815 	if (weight < 0)
816 		weight = 0;
817 	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: -%2d    for "
818 	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight, weight,
819 	    ddi_driver_name(ddi_get_parent(dip)),
820 	    ddi_get_instance(ddi_get_parent(dip)),
821 	    ddi_driver_name(dip), ddi_get_instance(dip)));
822 
823 	/* Establish exclusion for cpu_intr_weight manipulation */
824 	mutex_enter(&intr_dist_cpu_lock);
825 	cp = cpu[cpuid];
826 	cp->cpu_intr_weight -= weight;
827 	if (cp->cpu_intr_weight < 0)
828 		cp->cpu_intr_weight = 0;	/* sanity */
829 	mutex_exit(&intr_dist_cpu_lock);
830 }
831 
832 ulong_t
833 create_softint(uint_t pil, uint_t (*func)(caddr_t, caddr_t), caddr_t arg1)
834 {
835 	uint64_t inum;
836 
837 	inum = add_softintr(pil, func, arg1, SOFTINT_ST);
838 	return ((ulong_t)inum);
839 }
840 
841 void
842 invoke_softint(processorid_t cpuid, ulong_t hdl)
843 {
844 	uint64_t inum = hdl;
845 
846 	if (cpuid == CPU->cpu_id)
847 		setsoftint(inum);
848 	else
849 		xt_one(cpuid, setsoftint_tl1, inum, 0);
850 }
851 
852 void
853 remove_softint(ulong_t hdl)
854 {
855 	uint64_t inum = hdl;
856 
857 	(void) rem_softintr(inum);
858 }
859 
/*
 * Thin wrapper that hands the given cpu set to xt_sync().
 */
void
sync_softint(cpuset_t set)
{
	xt_sync(set);
}
865