xref: /titanic_44/usr/src/uts/sun4/os/intr.c (revision 03fc868668dd42b1b163d1fb8af3968f7283a7eb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
27  */
28 
29 #include <sys/sysmacros.h>
30 #include <sys/stack.h>
31 #include <sys/cpuvar.h>
32 #include <sys/ivintr.h>
33 #include <sys/intreg.h>
34 #include <sys/membar.h>
35 #include <sys/kmem.h>
36 #include <sys/intr.h>
37 #include <sys/sunddi.h>
38 #include <sys/sunndi.h>
39 #include <sys/cmn_err.h>
40 #include <sys/privregs.h>
41 #include <sys/systm.h>
42 #include <sys/archsystm.h>
43 #include <sys/machsystm.h>
44 #include <sys/x_call.h>
45 #include <vm/seg_kp.h>
46 #include <sys/debug.h>
47 #include <sys/cyclic.h>
48 #include <sys/kdi_impl.h>
49 #include <sys/ddi_periodic.h>
50 
51 #include <sys/cpu_sgnblk_defs.h>
52 
53 /* Global locks which protect the interrupt distribution lists */
54 static kmutex_t intr_dist_lock;
55 static kmutex_t intr_dist_cpu_lock;
56 
57 /* Head of the interrupt distribution lists */
58 static struct intr_dist *intr_dist_head = NULL;
59 static struct intr_dist *intr_dist_whead = NULL;
60 
61 static uint64_t siron_inum[DDI_IPL_10]; /* software interrupt numbers */
62 uint64_t *siron_cpu_inum = NULL;
63 uint64_t siron_poke_cpu_inum;
64 static int siron_cpu_setup(cpu_setup_t, int, void *);
65 extern uint_t softlevel1();
66 
67 static uint64_t siron1_inum; /* backward compatibility */
68 uint64_t poke_cpu_inum;
69 uint_t poke_cpu_intr(caddr_t arg1, caddr_t arg2);
70 uint_t siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2);
71 
/*
 * Variable to enable/disable printing a message when an invalid vecintr
 * is received.  The warning in no_ivintr() is printed while this is zero
 * (the default); any non-zero value suppresses it.
 */
uint_t ignore_invalid_vecintr = 0;
77 
/*
 * Note:
 * siron_pending was originally created to prevent a resource over-consumption
 * bug in setsoftint (exhaustion of the interrupt pool free list).
 * Its original intention is obsolete with the use of iv_pending in
 * setsoftint. However, siron_pending stayed around, acting as a second
 * gatekeeper preventing soft interrupts from being queued. In this capacity,
 * it can lead to hangs on MP systems, where due to global visibility issues
 * it can end up set while iv_pending is reset, preventing soft interrupts from
 * ever being processed. In addition to its gatekeeper role, intr_init() also
 * uses it to flag the situation where siron() was called before siron_inum has
 * been defined.
 *
 * siron() does not need an extra gatekeeper; any cpu that wishes should be
 * allowed to queue a soft interrupt. It is softint()'s job to ensure
 * correct handling of the queues. Therefore, siron_pending has been
 * stripped of its gatekeeper task, retaining only its intr_init() job, where
 * it indicates that there is a pending need to call siron().
 */
static int siron_pending[DDI_IPL_10]; /* software interrupt pending flags */
static int siron1_pending; /* backward compatibility */
99 
int intr_policy = INTR_WEIGHTED_DIST;	/* interrupt distribution policy */
int intr_dist_debug = 0;		/* non-zero enables INTR_DEBUG output */
/*
 * intr_dist_weight_max tracks the largest device weight added since the last
 * redistribution; intr_redist_all_cpus() scales it by
 * intr_dist_weight_maxfactor, caps the result at intr_dist_weight_maxmax, and
 * then resets it to 1.
 */
int32_t intr_dist_weight_max = 1;
int32_t intr_dist_weight_maxmax = 1000;
int intr_dist_weight_maxfactor = 2;
/*
 * Emit a debug message through cmn_err() when intr_dist_debug is non-zero.
 * `args' is the complete, parenthesized cmn_err() argument list, e.g.
 * INTR_DEBUG((CE_CONT, "fmt %d\n", val));
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement; a bare `if' would capture the `else' of an enclosing
 * if/else written without braces.
 */
#define	INTR_DEBUG(args)	do {				\
	if (intr_dist_debug)					\
		cmn_err args;					\
} while (0)
106 
107 /*
108  * intr_init() - Interrupt initialization
109  *	Initialize the system's interrupt vector table.
110  */
111 void
112 intr_init(cpu_t *cp)
113 {
114 	int i;
115 	extern uint_t softlevel1();
116 
117 	init_ivintr();
118 	REGISTER_BBUS_INTR();
119 
120 	/*
121 	 * Register these software interrupts for ddi timer.
122 	 * Software interrupts up to the level 10 are supported.
123 	 */
124 	for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
125 		siron_inum[i - 1] = add_softintr(i,
126 		    (softintrfunc)ddi_periodic_softintr,
127 		    (caddr_t)(uintptr_t)(i), SOFTINT_ST);
128 	}
129 
130 	siron1_inum = add_softintr(PIL_1, softlevel1, 0, SOFTINT_ST);
131 	poke_cpu_inum = add_softintr(PIL_13, poke_cpu_intr, 0, SOFTINT_MT);
132 	siron_poke_cpu_inum = add_softintr(PIL_13,
133 	    siron_poke_cpu_intr, 0, SOFTINT_MT);
134 	cp->cpu_m.poke_cpu_outstanding = B_FALSE;
135 
136 	mutex_init(&intr_dist_lock, NULL, MUTEX_DEFAULT, NULL);
137 	mutex_init(&intr_dist_cpu_lock, NULL, MUTEX_DEFAULT, NULL);
138 
139 	/*
140 	 * A soft interrupt may have been requested prior to the initialization
141 	 * of soft interrupts.  Soft interrupts can't be dispatched until after
142 	 * init_intr(), so we have to wait until now before we can dispatch the
143 	 * pending soft interrupt (if any).
144 	 */
145 	for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
146 		if (siron_pending[i-1]) {
147 			siron_pending[i-1] = 0;
148 			sir_on(i);
149 		}
150 	}
151 	if (siron1_pending) {
152 		siron1_pending = 0;
153 		siron();
154 	}
155 }
156 
157 /*
158  * poke_cpu_intr - fall through when poke_cpu calls
159  */
160 /* ARGSUSED */
161 uint_t
162 poke_cpu_intr(caddr_t arg1, caddr_t arg2)
163 {
164 	CPU->cpu_m.poke_cpu_outstanding = B_FALSE;
165 	membar_stld_stst();
166 	return (1);
167 }
168 
169 /*
170  * Trigger software interrupts dedicated to ddi timer.
171  */
172 void
173 sir_on(int level)
174 {
175 	ASSERT(level >= DDI_IPL_1 && level <= DDI_IPL_10);
176 	if (siron_inum[level-1])
177 		setsoftint(siron_inum[level-1]);
178 	else
179 		siron_pending[level-1] = 1;
180 }
181 
182 /*
183  * kmdb uses siron (and thus setsoftint) while the world is stopped in order to
184  * inform its driver component that there's work to be done.  We need to keep
185  * DTrace from instrumenting kmdb's siron and setsoftint.  We duplicate siron,
186  * giving kmdb's version a kdi_ prefix to keep DTrace at bay.  The
187  * implementation of setsoftint is complicated enough that we don't want to
188  * duplicate it, but at the same time we don't want to preclude tracing either.
189  * The meat of setsoftint() therefore goes into kdi_setsoftint, with
190  * setsoftint() implemented as a wrapper.  This allows tracing, while still
191  * providing a way for kmdb to sneak in unmolested.
192  */
193 void
194 kdi_siron(void)
195 {
196 	if (siron1_inum != 0)
197 		kdi_setsoftint(siron1_inum);
198 	else
199 		siron1_pending = 1;
200 }
201 
/*
 * Post the soft interrupt identified by inum.  This is only a wrapper
 * around kdi_setsoftint(), which does the actual work.
 */
void
setsoftint(uint64_t inum)
{
	kdi_setsoftint(inum);
}
207 
208 /*
209  * Generates softlevel1 interrupt on current CPU if it
210  * is not pending already.
211  */
212 void
213 siron(void)
214 {
215 	uint64_t inum;
216 
217 	if (siron1_inum != 0) {
218 		/*
219 		 * Once siron_cpu_inum has been allocated, we can
220 		 * use per-CPU siron inum.
221 		 */
222 		if (siron_cpu_inum && siron_cpu_inum[CPU->cpu_id] != 0)
223 			inum = siron_cpu_inum[CPU->cpu_id];
224 		else
225 			inum = siron1_inum;
226 
227 		setsoftint(inum);
228 	} else
229 		siron1_pending = 1;
230 }
231 
232 
233 static void
234 siron_init(void)
235 {
236 	/*
237 	 * We just allocate memory for per-cpu siron right now. Rest of
238 	 * the work is done when CPU is configured.
239 	 */
240 	siron_cpu_inum = kmem_zalloc(sizeof (uint64_t) * NCPU, KM_SLEEP);
241 }
242 
243 /*
244  * This routine creates per-CPU siron inum for CPUs which are
245  * configured during boot.
246  */
247 void
248 siron_mp_init()
249 {
250 	cpu_t *c;
251 
252 	/*
253 	 * Get the memory for per-CPU siron inums
254 	 */
255 	siron_init();
256 
257 	mutex_enter(&cpu_lock);
258 	c = cpu_list;
259 	do {
260 		(void) siron_cpu_setup(CPU_CONFIG, c->cpu_id, NULL);
261 	} while ((c = c->cpu_next) != cpu_list);
262 
263 	register_cpu_setup_func(siron_cpu_setup, NULL);
264 	mutex_exit(&cpu_lock);
265 }
266 
/*
 * siron_poke_cpu_intr - handler for the siron_poke_cpu_inum soft interrupt,
 * which siron_poke_cpu() posts on other CPUs.  It simply raises the level-1
 * soft interrupt on the CPU it runs on.  Always returns 1.
 */
/* ARGSUSED */
uint_t
siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2)
{
	/* generate level1 softint */
	siron();
	return (1);
}
278 
279 /*
280  * This routine generates a cross-call on target CPU(s).
281  */
282 void
283 siron_poke_cpu(cpuset_t poke)
284 {
285 	int cpuid = CPU->cpu_id;
286 
287 	if (CPU_IN_SET(poke, cpuid)) {
288 		siron();
289 		CPUSET_DEL(poke, cpuid);
290 		if (CPUSET_ISNULL(poke))
291 			return;
292 	}
293 
294 	xt_some(poke, setsoftint_tl1, siron_poke_cpu_inum, 0);
295 }
296 
297 /*
298  * This callback function allows us to create per-CPU siron inum.
299  */
300 /* ARGSUSED */
301 static int
302 siron_cpu_setup(cpu_setup_t what, int id, void *arg)
303 {
304 	cpu_t *cp = cpu[id];
305 
306 	ASSERT(MUTEX_HELD(&cpu_lock));
307 	ASSERT(cp != NULL);
308 
309 	switch (what) {
310 	case CPU_CONFIG:
311 		siron_cpu_inum[cp->cpu_id] = add_softintr(PIL_1,
312 		    (softintrfunc)softlevel1, 0, SOFTINT_ST);
313 		break;
314 	case CPU_UNCONFIG:
315 		(void) rem_softintr(siron_cpu_inum[cp->cpu_id]);
316 		siron_cpu_inum[cp->cpu_id] = 0;
317 		break;
318 	default:
319 		break;
320 	}
321 
322 	return (0);
323 }
324 
325 /*
326  * no_ivintr()
327  * 	called by setvecint_tl1() through sys_trap()
328  *	vector interrupt received but not valid or not
329  *	registered in intr_vec_table
330  *	considered as a spurious mondo interrupt
331  */
332 /* ARGSUSED */
333 void
334 no_ivintr(struct regs *rp, int inum, int pil)
335 {
336 	if (!ignore_invalid_vecintr)
337 		cmn_err(CE_WARN, "invalid vector intr: number 0x%x, pil 0x%x",
338 		    inum, pil);
339 
340 #ifdef DEBUG_VEC_INTR
341 	prom_enter_mon();
342 #endif /* DEBUG_VEC_INTR */
343 }
344 
345 void
346 intr_dequeue_req(uint_t pil, uint64_t inum)
347 {
348 	intr_vec_t	*iv, *next, *prev;
349 	struct machcpu	*mcpu;
350 	uint32_t	clr;
351 	processorid_t	cpu_id;
352 	extern uint_t	getpstate(void);
353 
354 	ASSERT((getpstate() & PSTATE_IE) == 0);
355 
356 	mcpu = &CPU->cpu_m;
357 	cpu_id = CPU->cpu_id;
358 
359 	iv = (intr_vec_t *)inum;
360 	prev = NULL;
361 	next = mcpu->intr_head[pil];
362 
363 	/* Find a matching entry in the list */
364 	while (next != NULL) {
365 		if (next == iv)
366 			break;
367 		prev = next;
368 		next = IV_GET_PIL_NEXT(next, cpu_id);
369 	}
370 
371 	if (next != NULL) {
372 		intr_vec_t	*next_iv = IV_GET_PIL_NEXT(next, cpu_id);
373 
374 		/* Remove entry from list */
375 		if (prev != NULL)
376 			IV_SET_PIL_NEXT(prev, cpu_id, next_iv); /* non-head */
377 		else
378 			mcpu->intr_head[pil] = next_iv; /* head */
379 
380 		if (next_iv == NULL)
381 			mcpu->intr_tail[pil] = prev; /* tail */
382 	}
383 
384 	/* Clear pending interrupts at this level if the list is empty */
385 	if (mcpu->intr_head[pil] == NULL) {
386 		clr = 1 << pil;
387 		if (pil == PIL_14)
388 			clr |= (TICK_INT_MASK | STICK_INT_MASK);
389 		wr_clr_softint(clr);
390 	}
391 }
392 
393 
394 /*
395  * Send a directed interrupt of specified interrupt number id to a cpu.
396  */
397 void
398 send_dirint(
399 	int cpuix,		/* cpu to be interrupted */
400 	int intr_id)		/* interrupt number id */
401 {
402 	xt_one(cpuix, setsoftint_tl1, intr_id, 0);
403 }
404 
/*
 * Take the specified CPU out of participation in interrupts.
 *	Called by p_online(2) when a processor is being taken off-line.
 *	This allows interrupt threads being handled on the processor to
 *	complete before the processor is idled.
 *
 *	The caller must hold cpu_lock.  Always returns 0.
 */
int
cpu_disable_intr(struct cpu *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Turn off the CPU_ENABLE flag before calling the redistribution
	 * function, since it checks for this in the cpu flags.
	 */
	cp->cpu_flags &= ~CPU_ENABLE;

	/* move interrupts away from this CPU */
	intr_redist_all_cpus();

	return (0);
}
426 
/*
 * Allow the specified CPU to participate in interrupts.
 *	Called by p_online(2) if a processor could not be taken off-line
 *	because of bound threads, in order to resume processing interrupts.
 *	Also called after starting a processor.
 *
 *	The caller must hold cpu_lock.
 */
void
cpu_enable_intr(struct cpu *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	/* Set CPU_ENABLE first so the redistribution considers this CPU. */
	cp->cpu_flags |= CPU_ENABLE;

	intr_redist_all_cpus();
}
442 
443 /*
444  * Add function to callback list for intr_redist_all_cpus.  We keep two lists,
445  * one for weighted callbacks and one for normal callbacks. Weighted callbacks
446  * are issued to redirect interrupts of a specified weight, from heavy to
447  * light.  This allows all the interrupts of a given weight to be redistributed
448  * for all weighted nexus drivers prior to those of less weight.
449  */
450 static void
451 intr_dist_add_list(struct intr_dist **phead, void (*func)(void *), void *arg)
452 {
453 	struct intr_dist *new = kmem_alloc(sizeof (*new), KM_SLEEP);
454 	struct intr_dist *iptr;
455 	struct intr_dist **pptr;
456 
457 	ASSERT(func);
458 	new->func = func;
459 	new->arg = arg;
460 	new->next = NULL;
461 
462 	/* Add to tail so that redistribution occurs in original order. */
463 	mutex_enter(&intr_dist_lock);
464 	for (iptr = *phead, pptr = phead; iptr != NULL;
465 	    pptr = &iptr->next, iptr = iptr->next) {
466 		/* check for problems as we locate the tail */
467 		if ((iptr->func == func) && (iptr->arg == arg)) {
468 			cmn_err(CE_PANIC, "intr_dist_add_list(): duplicate");
469 			/*NOTREACHED*/
470 		}
471 	}
472 	*pptr = new;
473 
474 	mutex_exit(&intr_dist_lock);
475 }
476 
477 void
478 intr_dist_add(void (*func)(void *), void *arg)
479 {
480 	intr_dist_add_list(&intr_dist_head, (void (*)(void *))func, arg);
481 }
482 
483 void
484 intr_dist_add_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
485 {
486 	intr_dist_add_list(&intr_dist_whead, (void (*)(void *))func, arg);
487 }
488 
489 /*
490  * Search for the interrupt distribution structure with the specified
491  * mondo vec reg in the interrupt distribution list. If a match is found,
492  * then delete the entry from the list. The caller is responsible for
493  * modifying the mondo vector registers.
494  */
495 static void
496 intr_dist_rem_list(struct intr_dist **headp, void (*func)(void *), void *arg)
497 {
498 	struct intr_dist *iptr;
499 	struct intr_dist **vect;
500 
501 	mutex_enter(&intr_dist_lock);
502 	for (iptr = *headp, vect = headp;
503 	    iptr != NULL; vect = &iptr->next, iptr = iptr->next) {
504 		if ((iptr->func == func) && (iptr->arg == arg)) {
505 			*vect = iptr->next;
506 			kmem_free(iptr, sizeof (struct intr_dist));
507 			mutex_exit(&intr_dist_lock);
508 			return;
509 		}
510 	}
511 
512 	if (!panicstr)
513 		cmn_err(CE_PANIC, "intr_dist_rem_list: not found");
514 	mutex_exit(&intr_dist_lock);
515 }
516 
517 void
518 intr_dist_rem(void (*func)(void *), void *arg)
519 {
520 	intr_dist_rem_list(&intr_dist_head, (void (*)(void *))func, arg);
521 }
522 
523 void
524 intr_dist_rem_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
525 {
526 	intr_dist_rem_list(&intr_dist_whead, (void (*)(void *))func, arg);
527 }
528 
529 /*
530  * Initiate interrupt redistribution.  Redistribution improves the isolation
531  * associated with interrupt weights by ordering operations from heavy weight
 * to light weight.  When a CPU's orientation changes relative to interrupts,
533  * there is *always* a redistribution to accommodate this change (call to
534  * intr_redist_all_cpus()).  As devices (not CPUs) attach/detach it is possible
535  * that a redistribution could improve the quality of an initialization. For
536  * example, if you are not using a NIC it may not be attached with s10 (devfs).
537  * If you then configure the NIC (ifconfig), this may cause the NIC to attach
538  * and plumb interrupts.  The CPU assignment for the NIC's interrupts is
539  * occurring late, so optimal "isolation" relative to weight is not occurring.
540  * The same applies to detach, although in this case doing the redistribution
541  * might improve "spread" for medium weight devices since the "isolation" of
542  * a higher weight device may no longer be present.
543  *
544  * NB: We should provide a utility to trigger redistribution (ala "intradm -r").
545  *
546  * NB: There is risk associated with automatically triggering execution of the
547  * redistribution code at arbitrary times. The risk comes from the fact that
548  * there is a lot of low-level hardware interaction associated with a
549  * redistribution.  At some point we may want this code to perform automatic
550  * redistribution (redistribution thread; trigger timeout when add/remove
551  * weight delta is large enough, and call cv_signal from timeout - causing
 * thread to call i_ddi_intr_redist_all_cpus()) but this is considered too
553  * risky at this time.
554  */
555 void
556 i_ddi_intr_redist_all_cpus()
557 {
558 	mutex_enter(&cpu_lock);
559 	INTR_DEBUG((CE_CONT, "intr_dist: i_ddi_intr_redist_all_cpus\n"));
560 	intr_redist_all_cpus();
561 	mutex_exit(&cpu_lock);
562 }
563 
564 /*
565  * Redistribute all interrupts
566  *
567  * This function redistributes all interrupting devices, running the
568  * parent callback functions for each node.
569  */
570 void
571 intr_redist_all_cpus(void)
572 {
573 	struct cpu *cp;
574 	struct intr_dist *iptr;
575 	int32_t weight, max_weight;
576 
577 	ASSERT(MUTEX_HELD(&cpu_lock));
578 	mutex_enter(&intr_dist_lock);
579 
580 	/*
581 	 * zero cpu_intr_weight on all cpus - it is safe to traverse
582 	 * cpu_list since we hold cpu_lock.
583 	 */
584 	cp = cpu_list;
585 	do {
586 		cp->cpu_intr_weight = 0;
587 	} while ((cp = cp->cpu_next) != cpu_list);
588 
589 	/*
590 	 * Assume that this redistribution may encounter a device weight
591 	 * via driver.conf tuning of "ddi-intr-weight" that is at most
592 	 * intr_dist_weight_maxfactor times larger.
593 	 */
594 	max_weight = intr_dist_weight_max * intr_dist_weight_maxfactor;
595 	if (max_weight > intr_dist_weight_maxmax)
596 		max_weight = intr_dist_weight_maxmax;
597 	intr_dist_weight_max = 1;
598 
599 	INTR_DEBUG((CE_CONT, "intr_dist: "
600 	    "intr_redist_all_cpus: %d-0\n", max_weight));
601 
602 	/*
603 	 * Redistribute weighted, from heavy to light.  The callback that
604 	 * specifies a weight equal to weight_max should redirect all
605 	 * interrupts of weight weight_max or greater [weight_max, inf.).
606 	 * Interrupts of lesser weight should be processed on the call with
607 	 * the matching weight. This allows all the heaver weight interrupts
608 	 * on all weighted busses (multiple pci busses) to be redirected prior
609 	 * to any lesser weight interrupts.
610 	 */
611 	for (weight = max_weight; weight >= 0; weight--)
612 		for (iptr = intr_dist_whead; iptr != NULL; iptr = iptr->next)
613 			((void (*)(void *, int32_t, int32_t))iptr->func)
614 			    (iptr->arg, max_weight, weight);
615 
616 	/* redistribute normal (non-weighted) interrupts */
617 	for (iptr = intr_dist_head; iptr != NULL; iptr = iptr->next)
618 		((void (*)(void *))iptr->func)(iptr->arg);
619 	mutex_exit(&intr_dist_lock);
620 }
621 
/*
 * Switch the distribution policy to INTR_CURRENT_CPU and redistribute, so
 * that intr_dist_cpuid() hands out the calling CPU from now on.
 * NOTE(review): intr_redist_all_cpus() asserts that cpu_lock is held, and
 * it is not taken here - presumably the shutdown path calls this with the
 * lock held or in a context where that does not matter; confirm.
 */
void
intr_redist_all_cpus_shutdown(void)
{
	intr_policy = INTR_CURRENT_CPU;
	intr_redist_all_cpus();
}
628 
629 /*
630  * Determine what CPU to target, based on interrupt policy.
631  *
632  * INTR_FLAT_DIST: hold a current CPU pointer in a static variable and
633  *	advance through interrupt enabled cpus (round-robin).
634  *
635  * INTR_WEIGHTED_DIST: search for an enabled CPU with the lowest
636  *	cpu_intr_weight, round robin when all equal.
637  *
638  *	Weighted interrupt distribution provides two things: "spread" of weight
639  *	(associated with algorithm itself) and "isolation" (associated with a
640  *	particular device weight). A redistribution is what provides optimal
641  *	"isolation" of heavy weight interrupts, optimal "spread" of weight
642  *	(relative to what came before) is always occurring.
643  *
644  *	An interrupt weight is a subjective number that represents the
645  *	percentage of a CPU required to service a device's interrupts: the
646  *	default weight is 0% (however the algorithm still maintains
647  *	round-robin), a network interface controller (NIC) may have a large
648  *	weight (35%). Interrupt weight only has meaning relative to the
649  *	interrupt weight of other devices: a CPU can be weighted more than
650  *	100%, and a single device might consume more than 100% of a CPU.
651  *
652  *	A coarse interrupt weight can be defined by the parent nexus driver
653  *	based on bus specific information, like pci class codes. A nexus
654  *	driver that supports device interrupt weighting for its children
655  *	should call intr_dist_cpuid_add/rem_device_weight(), which adds
656  *	and removes the weight of a device from the CPU that an interrupt
657  *	is directed at.  The quality of initialization improves when the
 *	device interrupt weights more accurately reflect actual run-time
 *	weights, and as the assignments are ordered from heavy to light.
660  *
661  *	The implementation also supports interrupt weight being specified in
662  *	driver.conf files via the property "ddi-intr-weight", which takes
663  *	precedence over the nexus supplied weight.  This support is added to
664  *	permit possible tweaking in the product in response to customer
665  *	problems. This is not a formal or committed interface.
666  *
667  *	While a weighted approach chooses the CPU providing the best spread
668  *	given past weights, less than optimal isolation can result in cases
669  *	where heavy weight devices show up last. The nexus driver's interrupt
670  *	redistribution logic should use intr_dist_add/rem_weighted so that
671  *	interrupts can be redistributed heavy first for optimal isolation.
672  */
673 uint32_t
674 intr_dist_cpuid(void)
675 {
676 	static struct cpu	*curr_cpu;
677 	struct cpu		*start_cpu;
678 	struct cpu		*new_cpu;
679 	struct cpu		*cp;
680 	int			cpuid = -1;
681 
682 	/* Establish exclusion for curr_cpu and cpu_intr_weight manipulation */
683 	mutex_enter(&intr_dist_cpu_lock);
684 
685 	switch (intr_policy) {
686 	case INTR_CURRENT_CPU:
687 		cpuid = CPU->cpu_id;
688 		break;
689 
690 	case INTR_BOOT_CPU:
691 		panic("INTR_BOOT_CPU no longer supported.");
692 		/*NOTREACHED*/
693 
694 	case INTR_FLAT_DIST:
695 	case INTR_WEIGHTED_DIST:
696 	default:
697 		/*
698 		 * Ensure that curr_cpu is valid - cpu_next will be NULL if
699 		 * the cpu has been deleted (cpu structs are never freed).
700 		 */
701 		if (curr_cpu == NULL || curr_cpu->cpu_next == NULL)
702 			curr_cpu = CPU;
703 
704 		/*
705 		 * Advance to online CPU after curr_cpu (round-robin). For
706 		 * INTR_WEIGHTED_DIST we choose the cpu with the lightest
707 		 * weight.  For a nexus that does not support weight the
708 		 * default weight of zero is used. We degrade to round-robin
709 		 * behavior among equal weightes.  The default weight is zero
710 		 * and round-robin behavior continues.
711 		 *
712 		 * Disable preemption while traversing cpu_next_onln to
713 		 * ensure the list does not change.  This works because
714 		 * modifiers of this list and other lists in a struct cpu
715 		 * call pause_cpus() before making changes.
716 		 */
717 		kpreempt_disable();
718 		cp = start_cpu = curr_cpu->cpu_next_onln;
719 		new_cpu = NULL;
720 		do {
721 			/* Skip CPUs with interrupts disabled */
722 			if ((cp->cpu_flags & CPU_ENABLE) == 0)
723 				continue;
724 
725 			if (intr_policy == INTR_FLAT_DIST) {
726 				/* select CPU */
727 				new_cpu = cp;
728 				break;
729 			} else if ((new_cpu == NULL) ||
730 			    (cp->cpu_intr_weight < new_cpu->cpu_intr_weight)) {
731 				/* Choose if lighter weight */
732 				new_cpu = cp;
733 			}
734 		} while ((cp = cp->cpu_next_onln) != start_cpu);
735 		ASSERT(new_cpu);
736 		cpuid = new_cpu->cpu_id;
737 
738 		INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: "
739 		    "targeted\n", cpuid, new_cpu->cpu_intr_weight));
740 
741 		/* update static pointer for next round-robin */
742 		curr_cpu = new_cpu;
743 		kpreempt_enable();
744 		break;
745 	}
746 	mutex_exit(&intr_dist_cpu_lock);
747 	return (cpuid);
748 }
749 
750 /*
751  * Add or remove the the weight of a device from a CPUs interrupt weight.
752  *
753  * We expect nexus drivers to call intr_dist_cpuid_add/rem_device_weight for
754  * their children to improve the overall quality of interrupt initialization.
755  *
756  * If a nexues shares the CPU returned by a single intr_dist_cpuid() call
757  * among multiple devices (sharing ino) then the nexus should call
758  * intr_dist_cpuid_add/rem_device_weight for each device separately. Devices
759  * that share must specify the same cpuid.
760  *
761  * If a nexus driver is unable to determine the cpu at remove_intr time
762  * for some of its interrupts, then it should not call add_device_weight -
763  * intr_dist_cpuid will still provide round-robin.
764  *
765  * An established device weight (from dev_info node) takes precedence over
766  * the weight passed in.  If a device weight is not already established
767  * then the passed in nexus weight is established.
768  */
769 void
770 intr_dist_cpuid_add_device_weight(uint32_t cpuid,
771     dev_info_t *dip, int32_t nweight)
772 {
773 	int32_t		eweight;
774 
775 	/*
776 	 * For non-weighted policy everything has weight of zero (and we get
777 	 * round-robin distribution from intr_dist_cpuid).
778 	 * NB: intr_policy is limited to this file. A weighted nexus driver is
779 	 * calls this rouitne even if intr_policy has been patched to
780 	 * INTR_FLAG_DIST.
781 	 */
782 	ASSERT(dip);
783 	if (intr_policy != INTR_WEIGHTED_DIST)
784 		return;
785 
786 	eweight = i_ddi_get_intr_weight(dip);
787 	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: +%2d/%2d for "
788 	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight,
789 	    nweight, eweight, ddi_driver_name(ddi_get_parent(dip)),
790 	    ddi_get_instance(ddi_get_parent(dip)),
791 	    ddi_driver_name(dip), ddi_get_instance(dip)));
792 
793 	/* if no establish weight, establish nexus weight */
794 	if (eweight < 0) {
795 		if (nweight > 0)
796 			(void) i_ddi_set_intr_weight(dip, nweight);
797 		else
798 			nweight = 0;
799 	} else
800 		nweight = eweight;	/* use established weight */
801 
802 	/* Establish exclusion for cpu_intr_weight manipulation */
803 	mutex_enter(&intr_dist_cpu_lock);
804 	cpu[cpuid]->cpu_intr_weight += nweight;
805 
806 	/* update intr_dist_weight_max */
807 	if (nweight > intr_dist_weight_max)
808 		intr_dist_weight_max = nweight;
809 	mutex_exit(&intr_dist_cpu_lock);
810 }
811 
812 void
813 intr_dist_cpuid_rem_device_weight(uint32_t cpuid, dev_info_t *dip)
814 {
815 	struct cpu	*cp;
816 	int32_t		weight;
817 
818 	ASSERT(dip);
819 	if (intr_policy != INTR_WEIGHTED_DIST)
820 		return;
821 
822 	/* remove weight of device from cpu */
823 	weight = i_ddi_get_intr_weight(dip);
824 	if (weight < 0)
825 		weight = 0;
826 	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: -%2d    for "
827 	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight, weight,
828 	    ddi_driver_name(ddi_get_parent(dip)),
829 	    ddi_get_instance(ddi_get_parent(dip)),
830 	    ddi_driver_name(dip), ddi_get_instance(dip)));
831 
832 	/* Establish exclusion for cpu_intr_weight manipulation */
833 	mutex_enter(&intr_dist_cpu_lock);
834 	cp = cpu[cpuid];
835 	cp->cpu_intr_weight -= weight;
836 	if (cp->cpu_intr_weight < 0)
837 		cp->cpu_intr_weight = 0;	/* sanity */
838 	mutex_exit(&intr_dist_cpu_lock);
839 }
840 
841 ulong_t
842 create_softint(uint_t pil, uint_t (*func)(caddr_t, caddr_t), caddr_t arg1)
843 {
844 	uint64_t inum;
845 
846 	inum = add_softintr(pil, func, arg1, SOFTINT_MT);
847 	return ((ulong_t)inum);
848 }
849 
850 void
851 invoke_softint(processorid_t cpuid, ulong_t hdl)
852 {
853 	uint64_t inum = hdl;
854 
855 	if (cpuid == CPU->cpu_id)
856 		setsoftint(inum);
857 	else
858 		xt_one(cpuid, setsoftint_tl1, inum, 0);
859 }
860 
861 void
862 remove_softint(ulong_t hdl)
863 {
864 	uint64_t inum = hdl;
865 
866 	(void) rem_softintr(inum);
867 }
868 
/*
 * Synchronize with the CPUs in `set' by handing the set to xt_sync().
 * NOTE(review): presumably this waits until cross-calls already sent to
 * those CPUs have been taken; confirm against xt_sync().
 */
void
sync_softint(cpuset_t set)
{
	xt_sync(set);
}
874