xref: /titanic_51/usr/src/uts/i86pc/os/intr.c (revision 0d63ce2b32a9e1cc8ed71d4d92536c44d66a530a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/cpuvar.h>
29 #include <sys/regset.h>
30 #include <sys/psw.h>
31 #include <sys/types.h>
32 #include <sys/thread.h>
33 #include <sys/systm.h>
34 #include <sys/segments.h>
35 #include <sys/pcb.h>
36 #include <sys/trap.h>
37 #include <sys/ftrace.h>
38 #include <sys/traptrace.h>
39 #include <sys/clock.h>
40 #include <sys/panic.h>
41 #include <sys/disp.h>
42 #include <vm/seg_kp.h>
43 #include <sys/stack.h>
44 #include <sys/sysmacros.h>
45 #include <sys/cmn_err.h>
46 #include <sys/kstat.h>
47 #include <sys/smp_impldefs.h>
48 #include <sys/pool_pset.h>
49 #include <sys/zone.h>
50 #include <sys/bitmap.h>
51 #include <sys/archsystm.h>
52 #include <sys/machsystm.h>
53 #include <sys/ontrap.h>
54 #include <sys/x86_archext.h>
55 #include <sys/promif.h>
56 
57 
58 /*
59  * Set cpu's base SPL level to the highest active interrupt level
60  */
61 void
62 set_base_spl(void)
63 {
64 	struct cpu *cpu = CPU;
65 	uint16_t active = (uint16_t)cpu->cpu_intr_actv;
66 
67 	cpu->cpu_base_spl = active == 0 ? 0 : bsrw_insn(active);
68 }
69 
70 /*
71  * Do all the work necessary to set up the cpu and thread structures
72  * to dispatch a high-level interrupt.
73  *
74  * Returns 0 if we're -not- already on the high-level interrupt stack,
75  * (and *must* switch to it), non-zero if we are already on that stack.
76  *
77  * Called with interrupts masked.
78  * The 'pil' is already set to the appropriate level for rp->r_trapno.
79  */
80 static int
81 hilevel_intr_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil, struct regs *rp)
82 {
83 	struct machcpu *mcpu = &cpu->cpu_m;
84 	uint_t mask;
85 	hrtime_t intrtime;
86 	hrtime_t now = tsc_read();
87 
88 	ASSERT(pil > LOCK_LEVEL);
89 
90 	if (pil == CBE_HIGH_PIL) {
91 		cpu->cpu_profile_pil = oldpil;
92 		if (USERMODE(rp->r_cs)) {
93 			cpu->cpu_profile_pc = 0;
94 			cpu->cpu_profile_upc = rp->r_pc;
95 		} else {
96 			cpu->cpu_profile_pc = rp->r_pc;
97 			cpu->cpu_profile_upc = 0;
98 		}
99 	}
100 
101 	mask = cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK;
102 	if (mask != 0) {
103 		int nestpil;
104 
105 		/*
106 		 * We have interrupted another high-level interrupt.
107 		 * Load starting timestamp, compute interval, update
108 		 * cumulative counter.
109 		 */
110 		nestpil = bsrw_insn((uint16_t)mask);
111 		ASSERT(nestpil < pil);
112 		intrtime = now -
113 		    mcpu->pil_high_start[nestpil - (LOCK_LEVEL + 1)];
114 		mcpu->intrstat[nestpil][0] += intrtime;
115 		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
116 		/*
117 		 * Another high-level interrupt is active below this one, so
118 		 * there is no need to check for an interrupt thread.  That
119 		 * will be done by the lowest priority high-level interrupt
120 		 * active.
121 		 */
122 	} else {
123 		kthread_t *t = cpu->cpu_thread;
124 
125 		/*
126 		 * See if we are interrupting a low-level interrupt thread.
127 		 * If so, account for its time slice only if its time stamp
128 		 * is non-zero.
129 		 */
130 		if ((t->t_flag & T_INTR_THREAD) != 0 && t->t_intr_start != 0) {
131 			intrtime = now - t->t_intr_start;
132 			mcpu->intrstat[t->t_pil][0] += intrtime;
133 			cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
134 			t->t_intr_start = 0;
135 		}
136 	}
137 
138 	/*
139 	 * Store starting timestamp in CPU structure for this PIL.
140 	 */
141 	mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] = now;
142 
143 	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);
144 
145 	if (pil == 15) {
146 		/*
147 		 * To support reentrant level 15 interrupts, we maintain a
148 		 * recursion count in the top half of cpu_intr_actv.  Only
149 		 * when this count hits zero do we clear the PIL 15 bit from
150 		 * the lower half of cpu_intr_actv.
151 		 */
152 		uint16_t *refcntp = (uint16_t *)&cpu->cpu_intr_actv + 1;
153 		(*refcntp)++;
154 	}
155 
156 	mask = cpu->cpu_intr_actv;
157 
158 	cpu->cpu_intr_actv |= (1 << pil);
159 
160 	return (mask & CPU_INTR_ACTV_HIGH_LEVEL_MASK);
161 }
162 
163 /*
164  * Does most of the work of returning from a high level interrupt.
165  *
166  * Returns 0 if there are no more high level interrupts (in which
167  * case we must switch back to the interrupted thread stack) or
168  * non-zero if there are more (in which case we should stay on it).
169  *
170  * Called with interrupts masked
171  */
172 static int
173 hilevel_intr_epilog(struct cpu *cpu, uint_t pil, uint_t oldpil, uint_t vecnum)
174 {
175 	struct machcpu *mcpu = &cpu->cpu_m;
176 	uint_t mask;
177 	hrtime_t intrtime;
178 	hrtime_t now = tsc_read();
179 
180 	ASSERT(mcpu->mcpu_pri == pil);
181 
182 	cpu->cpu_stats.sys.intr[pil - 1]++;
183 
184 	ASSERT(cpu->cpu_intr_actv & (1 << pil));
185 
186 	if (pil == 15) {
187 		/*
188 		 * To support reentrant level 15 interrupts, we maintain a
189 		 * recursion count in the top half of cpu_intr_actv.  Only
190 		 * when this count hits zero do we clear the PIL 15 bit from
191 		 * the lower half of cpu_intr_actv.
192 		 */
193 		uint16_t *refcntp = (uint16_t *)&cpu->cpu_intr_actv + 1;
194 
195 		ASSERT(*refcntp > 0);
196 
197 		if (--(*refcntp) == 0)
198 			cpu->cpu_intr_actv &= ~(1 << pil);
199 	} else {
200 		cpu->cpu_intr_actv &= ~(1 << pil);
201 	}
202 
203 	ASSERT(mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] != 0);
204 
205 	intrtime = now - mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)];
206 	mcpu->intrstat[pil][0] += intrtime;
207 	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
208 
209 	/*
210 	 * Check for lower-pil nested high-level interrupt beneath
211 	 * current one.  If so, place a starting timestamp in its
212 	 * pil_high_start entry.
213 	 */
214 	mask = cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK;
215 	if (mask != 0) {
216 		int nestpil;
217 
218 		/*
219 		 * find PIL of nested interrupt
220 		 */
221 		nestpil = bsrw_insn((uint16_t)mask);
222 		ASSERT(nestpil < pil);
223 		mcpu->pil_high_start[nestpil - (LOCK_LEVEL + 1)] = now;
224 		/*
225 		 * (Another high-level interrupt is active below this one,
226 		 * so there is no need to check for an interrupt
227 		 * thread.  That will be done by the lowest priority
228 		 * high-level interrupt active.)
229 		 */
230 	} else {
231 		/*
232 		 * Check to see if there is a low-level interrupt active.
233 		 * If so, place a starting timestamp in the thread
234 		 * structure.
235 		 */
236 		kthread_t *t = cpu->cpu_thread;
237 
238 		if (t->t_flag & T_INTR_THREAD)
239 			t->t_intr_start = now;
240 	}
241 
242 	mcpu->mcpu_pri = oldpil;
243 	(void) (*setlvlx)(oldpil, vecnum);
244 
245 	return (cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK);
246 }
247 
248 /*
249  * Set up the cpu, thread and interrupt thread structures for
250  * executing an interrupt thread.  The new stack pointer of the
251  * interrupt thread (which *must* be switched to) is returned.
252  */
253 static caddr_t
254 intr_thread_prolog(struct cpu *cpu, caddr_t stackptr, uint_t pil)
255 {
256 	struct machcpu *mcpu = &cpu->cpu_m;
257 	kthread_t *t, *volatile it;
258 	hrtime_t now = tsc_read();
259 
260 	ASSERT(pil > 0);
261 	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);
262 	cpu->cpu_intr_actv |= (1 << pil);
263 
264 	/*
265 	 * Get set to run an interrupt thread.
266 	 * There should always be an interrupt thread, since we
267 	 * allocate one for each level on each CPU.
268 	 *
269 	 * t_intr_start could be zero due to cpu_intr_swtch_enter.
270 	 */
271 	t = cpu->cpu_thread;
272 	if ((t->t_flag & T_INTR_THREAD) && t->t_intr_start != 0) {
273 		hrtime_t intrtime = now - t->t_intr_start;
274 		mcpu->intrstat[t->t_pil][0] += intrtime;
275 		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
276 		t->t_intr_start = 0;
277 	}
278 
279 	ASSERT(SA((uintptr_t)stackptr) == (uintptr_t)stackptr);
280 
281 	t->t_sp = (uintptr_t)stackptr;	/* mark stack in curthread for resume */
282 
283 	/*
284 	 * unlink the interrupt thread off the cpu
285 	 *
286 	 * Note that the code in kcpc_overflow_intr -relies- on the
287 	 * ordering of events here - in particular that t->t_lwp of
288 	 * the interrupt thread is set to the pinned thread *before*
289 	 * curthread is changed.
290 	 */
291 	it = cpu->cpu_intr_thread;
292 	cpu->cpu_intr_thread = it->t_link;
293 	it->t_intr = t;
294 	it->t_lwp = t->t_lwp;
295 
296 	/*
297 	 * (threads on the interrupt thread free list could have state
298 	 * preset to TS_ONPROC, but it helps in debugging if
299 	 * they're TS_FREE.)
300 	 */
301 	it->t_state = TS_ONPROC;
302 
303 	cpu->cpu_thread = it;		/* new curthread on this cpu */
304 	it->t_pil = (uchar_t)pil;
305 	it->t_pri = intr_pri + (pri_t)pil;
306 	it->t_intr_start = now;
307 
308 	return (it->t_stk);
309 }
310 
311 
312 #ifdef DEBUG
313 int intr_thread_cnt;
314 #endif
315 
316 /*
317  * Called with interrupts disabled
318  */
319 static void
320 intr_thread_epilog(struct cpu *cpu, uint_t vec, uint_t oldpil)
321 {
322 	struct machcpu *mcpu = &cpu->cpu_m;
323 	kthread_t *t;
324 	kthread_t *it = cpu->cpu_thread;	/* curthread */
325 	uint_t pil, basespl;
326 	hrtime_t intrtime;
327 	hrtime_t now = tsc_read();
328 
329 	pil = it->t_pil;
330 	cpu->cpu_stats.sys.intr[pil - 1]++;
331 
332 	ASSERT(it->t_intr_start != 0);
333 	intrtime = now - it->t_intr_start;
334 	mcpu->intrstat[pil][0] += intrtime;
335 	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
336 
337 	ASSERT(cpu->cpu_intr_actv & (1 << pil));
338 	cpu->cpu_intr_actv &= ~(1 << pil);
339 
340 	/*
341 	 * If there is still an interrupted thread underneath this one
342 	 * then the interrupt was never blocked and the return is
343 	 * fairly simple.  Otherwise it isn't.
344 	 */
345 	if ((t = it->t_intr) == NULL) {
346 		/*
347 		 * The interrupted thread is no longer pinned underneath
348 		 * the interrupt thread.  This means the interrupt must
349 		 * have blocked, and the interrupted thread has been
350 		 * unpinned, and has probably been running around the
351 		 * system for a while.
352 		 *
353 		 * Since there is no longer a thread under this one, put
354 		 * this interrupt thread back on the CPU's free list and
355 		 * resume the idle thread which will dispatch the next
356 		 * thread to run.
357 		 */
358 #ifdef DEBUG
359 		intr_thread_cnt++;
360 #endif
361 		cpu->cpu_stats.sys.intrblk++;
362 		/*
363 		 * Set CPU's base SPL based on active interrupts bitmask
364 		 */
365 		set_base_spl();
366 		basespl = cpu->cpu_base_spl;
367 		mcpu->mcpu_pri = basespl;
368 		(*setlvlx)(basespl, vec);
369 		(void) splhigh();
370 		sti();
371 		it->t_state = TS_FREE;
372 		/*
373 		 * Return interrupt thread to pool
374 		 */
375 		it->t_link = cpu->cpu_intr_thread;
376 		cpu->cpu_intr_thread = it;
377 		swtch();
378 		panic("intr_thread_epilog: swtch returned");
379 		/*NOTREACHED*/
380 	}
381 
382 	/*
383 	 * Return interrupt thread to the pool
384 	 */
385 	it->t_link = cpu->cpu_intr_thread;
386 	cpu->cpu_intr_thread = it;
387 	it->t_state = TS_FREE;
388 
389 	basespl = cpu->cpu_base_spl;
390 	pil = MAX(oldpil, basespl);
391 	mcpu->mcpu_pri = pil;
392 	(*setlvlx)(pil, vec);
393 	t->t_intr_start = now;
394 	cpu->cpu_thread = t;
395 }
396 
397 /*
398  * intr_get_time() is a resource for interrupt handlers to determine how
399  * much time has been spent handling the current interrupt. Such a function
400  * is needed because higher level interrupts can arrive during the
401  * processing of an interrupt.  intr_get_time() only returns time spent in the
402  * current interrupt handler.
403  *
404  * The caller must be calling from an interrupt handler running at a pil
405  * below or at lock level. Timings are not provided for high-level
406  * interrupts.
407  *
408  * The first time intr_get_time() is called while handling an interrupt,
409  * it returns the time since the interrupt handler was invoked. Subsequent
410  * calls will return the time since the prior call to intr_get_time(). Time
411  * is returned as ticks. Use tsc_scalehrtime() to convert ticks to nsec.
412  *
413  * Theory Of Intrstat[][]:
414  *
415  * uint64_t intrstat[pil][0..1] is an array indexed by pil level, with two
416  * uint64_ts per pil.
417  *
418  * intrstat[pil][0] is a cumulative count of the number of ticks spent
419  * handling all interrupts at the specified pil on this CPU. It is
420  * exported via kstats to the user.
421  *
422  * intrstat[pil][1] is always a count of ticks less than or equal to the
423  * value in [0]. The difference between [1] and [0] is the value returned
424  * by a call to intr_get_time(). At the start of interrupt processing,
425  * [0] and [1] will be equal (or nearly so). As the interrupt consumes
426  * time, [0] will increase, but [1] will remain the same. A call to
427  * intr_get_time() will return the difference, then update [1] to be the
428  * same as [0]. Future calls will return the time since the last call.
429  * Finally, when the interrupt completes, [1] is updated to the same as [0].
430  *
431  * Implementation:
432  *
433  * intr_get_time() works much like a higher level interrupt arriving. It
434  * "checkpoints" the timing information by incrementing intrstat[pil][0]
435  * to include elapsed running time, and by setting t_intr_start to rdtsc.
436  * It then sets the return value to intrstat[pil][0] - intrstat[pil][1],
437  * and updates intrstat[pil][1] to be the same as the new value of
438  * intrstat[pil][0].
439  *
440  * In the normal handling of interrupts, after an interrupt handler returns
441  * and the code in intr_thread() updates intrstat[pil][0], it then sets
442  * intrstat[pil][1] to the new value of intrstat[pil][0]. When [0] == [1],
443  * the timings are reset, i.e. intr_get_time() will return [0] - [1] which
444  * is 0.
445  *
446  * Whenever interrupts arrive on a CPU which is handling a lower pil
447  * interrupt, they update the lower pil's [0] to show time spent in the
448  * handler that they've interrupted. This results in a growing discrepancy
449  * between [0] and [1], which is returned the next time intr_get_time() is
450  * called. Time spent in the higher-pil interrupt will not be returned in
451  * the next intr_get_time() call from the original interrupt, because
452  * the higher-pil interrupt's time is accumulated in intrstat[higherpil][].
453  */
454 uint64_t
455 intr_get_time(void)
456 {
457 	struct cpu *cpu;
458 	struct machcpu *mcpu;
459 	kthread_t *t;
460 	uint64_t time, delta, ret;
461 	uint_t pil;
462 
463 	cli();
464 	cpu = CPU;
465 	mcpu = &cpu->cpu_m;
466 	t = cpu->cpu_thread;
467 	pil = t->t_pil;
468 	ASSERT((cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK) == 0);
469 	ASSERT(t->t_flag & T_INTR_THREAD);
470 	ASSERT(pil != 0);
471 	ASSERT(t->t_intr_start != 0);
472 
473 	time = tsc_read();
474 	delta = time - t->t_intr_start;
475 	t->t_intr_start = time;
476 
477 	time = mcpu->intrstat[pil][0] + delta;
478 	ret = time - mcpu->intrstat[pil][1];
479 	mcpu->intrstat[pil][0] = time;
480 	mcpu->intrstat[pil][1] = time;
481 	cpu->cpu_intracct[cpu->cpu_mstate] += delta;
482 
483 	sti();
484 	return (ret);
485 }
486 
487 static caddr_t
488 dosoftint_prolog(
489 	struct cpu *cpu,
490 	caddr_t stackptr,
491 	uint32_t st_pending,
492 	uint_t oldpil)
493 {
494 	kthread_t *t, *volatile it;
495 	struct machcpu *mcpu = &cpu->cpu_m;
496 	uint_t pil;
497 	hrtime_t now;
498 
499 top:
500 	ASSERT(st_pending == mcpu->mcpu_softinfo.st_pending);
501 
502 	pil = bsrw_insn((uint16_t)st_pending);
503 	if (pil <= oldpil || pil <= cpu->cpu_base_spl)
504 		return (0);
505 
506 	/*
507 	 * XX64	Sigh.
508 	 *
509 	 * This is a transliteration of the i386 assembler code for
510 	 * soft interrupts.  One question is "why does this need
511 	 * to be atomic?"  One possible race is -other- processors
512 	 * posting soft interrupts to us in set_pending() i.e. the
513 	 * CPU might get preempted just after the address computation,
514 	 * but just before the atomic transaction, so another CPU would
515 	 * actually set the original CPU's st_pending bit.  However,
516 	 * it looks like it would be simpler to disable preemption there.
517 	 * Are there other races for which preemption control doesn't work?
518 	 *
519 	 * The i386 assembler version -also- checks to see if the bit
520 	 * being cleared was actually set; if it wasn't, it rechecks
521 	 * for more.  This seems a bit strange, as the only code that
522 	 * ever clears the bit is -this- code running with interrupts
523 	 * disabled on -this- CPU.  This code would probably be cheaper:
524 	 *
525 	 * atomic_and_32((uint32_t *)&mcpu->mcpu_softinfo.st_pending,
526 	 *   ~(1 << pil));
527 	 *
528 	 * and t->t_preempt--/++ around set_pending() even cheaper,
529 	 * but at this point, correctness is critical, so we slavishly
530 	 * emulate the i386 port.
531 	 */
532 	if (atomic_btr32((uint32_t *)
533 	    &mcpu->mcpu_softinfo.st_pending, pil) == 0) {
534 		st_pending = mcpu->mcpu_softinfo.st_pending;
535 		goto top;
536 	}
537 
538 	mcpu->mcpu_pri = pil;
539 	(*setspl)(pil);
540 
541 	now = tsc_read();
542 
543 	/*
544 	 * Get set to run interrupt thread.
545 	 * There should always be an interrupt thread since we
546 	 * allocate one for each level on the CPU.
547 	 */
548 	it = cpu->cpu_intr_thread;
549 	cpu->cpu_intr_thread = it->t_link;
550 
551 	/* t_intr_start could be zero due to cpu_intr_swtch_enter. */
552 	t = cpu->cpu_thread;
553 	if ((t->t_flag & T_INTR_THREAD) && t->t_intr_start != 0) {
554 		hrtime_t intrtime = now - t->t_intr_start;
555 		mcpu->intrstat[pil][0] += intrtime;
556 		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
557 		t->t_intr_start = 0;
558 	}
559 
560 	/*
561 	 * Note that the code in kcpc_overflow_intr -relies- on the
562 	 * ordering of events here - in particular that t->t_lwp of
563 	 * the interrupt thread is set to the pinned thread *before*
564 	 * curthread is changed.
565 	 */
566 	it->t_lwp = t->t_lwp;
567 	it->t_state = TS_ONPROC;
568 
569 	/*
570 	 * Push interrupted thread onto list from new thread.
571 	 * Set the new thread as the current one.
572 	 * Set interrupted thread's T_SP because if it is the idle thread,
573 	 * resume() may use that stack between threads.
574 	 */
575 
576 	ASSERT(SA((uintptr_t)stackptr) == (uintptr_t)stackptr);
577 	t->t_sp = (uintptr_t)stackptr;
578 
579 	it->t_intr = t;
580 	cpu->cpu_thread = it;
581 
582 	/*
583 	 * Set bit for this pil in CPU's interrupt active bitmask.
584 	 */
585 	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);
586 	cpu->cpu_intr_actv |= (1 << pil);
587 
588 	/*
589 	 * Initialize thread priority level from intr_pri
590 	 */
591 	it->t_pil = (uchar_t)pil;
592 	it->t_pri = (pri_t)pil + intr_pri;
593 	it->t_intr_start = now;
594 
595 	return (it->t_stk);
596 }
597 
598 static void
599 dosoftint_epilog(struct cpu *cpu, uint_t oldpil)
600 {
601 	struct machcpu *mcpu = &cpu->cpu_m;
602 	kthread_t *t, *it;
603 	uint_t pil, basespl;
604 	hrtime_t intrtime;
605 	hrtime_t now = tsc_read();
606 
607 	it = cpu->cpu_thread;
608 	pil = it->t_pil;
609 
610 	cpu->cpu_stats.sys.intr[pil - 1]++;
611 
612 	ASSERT(cpu->cpu_intr_actv & (1 << pil));
613 	cpu->cpu_intr_actv &= ~(1 << pil);
614 	intrtime = now - it->t_intr_start;
615 	mcpu->intrstat[pil][0] += intrtime;
616 	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
617 
618 	/*
619 	 * If there is still an interrupted thread underneath this one
620 	 * then the interrupt was never blocked and the return is
621 	 * fairly simple.  Otherwise it isn't.
622 	 */
623 	if ((t = it->t_intr) == NULL) {
624 		/*
625 		 * Put thread back on the interrupt thread list.
626 		 * This was an interrupt thread, so set CPU's base SPL.
627 		 */
628 		set_base_spl();
629 		it->t_state = TS_FREE;
630 		it->t_link = cpu->cpu_intr_thread;
631 		cpu->cpu_intr_thread = it;
632 		(void) splhigh();
633 		sti();
634 		swtch();
635 		/*NOTREACHED*/
636 		panic("dosoftint_epilog: swtch returned");
637 	}
638 	it->t_link = cpu->cpu_intr_thread;
639 	cpu->cpu_intr_thread = it;
640 	it->t_state = TS_FREE;
641 	cpu->cpu_thread = t;
642 	if (t->t_flag & T_INTR_THREAD)
643 		t->t_intr_start = now;
644 	basespl = cpu->cpu_base_spl;
645 	pil = MAX(oldpil, basespl);
646 	mcpu->mcpu_pri = pil;
647 	(*setspl)(pil);
648 }
649 
650 
651 /*
652  * Make the interrupted thread 'to' be runnable.
653  *
654  * Since t->t_sp has already been saved, t->t_pc is all
655  * that needs to be set in this function.
656  *
657  * Returns the interrupt level of the interrupt thread.
658  */
659 int
660 intr_passivate(
661 	kthread_t *it,		/* interrupt thread */
662 	kthread_t *t)		/* interrupted thread */
663 {
664 	extern void _sys_rtt();
665 
666 	ASSERT(it->t_flag & T_INTR_THREAD);
667 	ASSERT(SA(t->t_sp) == t->t_sp);
668 
669 	t->t_pc = (uintptr_t)_sys_rtt;
670 	return (it->t_pil);
671 }
672 
673 /*
674  * Create interrupt kstats for this CPU.
675  */
676 void
677 cpu_create_intrstat(cpu_t *cp)
678 {
679 	int		i;
680 	kstat_t		*intr_ksp;
681 	kstat_named_t	*knp;
682 	char		name[KSTAT_STRLEN];
683 	zoneid_t	zoneid;
684 
685 	ASSERT(MUTEX_HELD(&cpu_lock));
686 
687 	if (pool_pset_enabled())
688 		zoneid = GLOBAL_ZONEID;
689 	else
690 		zoneid = ALL_ZONES;
691 
692 	intr_ksp = kstat_create_zone("cpu", cp->cpu_id, "intrstat", "misc",
693 	    KSTAT_TYPE_NAMED, PIL_MAX * 2, NULL, zoneid);
694 
695 	/*
696 	 * Initialize each PIL's named kstat
697 	 */
698 	if (intr_ksp != NULL) {
699 		intr_ksp->ks_update = cpu_kstat_intrstat_update;
700 		knp = (kstat_named_t *)intr_ksp->ks_data;
701 		intr_ksp->ks_private = cp;
702 		for (i = 0; i < PIL_MAX; i++) {
703 			(void) snprintf(name, KSTAT_STRLEN, "level-%d-time",
704 			    i + 1);
705 			kstat_named_init(&knp[i * 2], name, KSTAT_DATA_UINT64);
706 			(void) snprintf(name, KSTAT_STRLEN, "level-%d-count",
707 			    i + 1);
708 			kstat_named_init(&knp[(i * 2) + 1], name,
709 			    KSTAT_DATA_UINT64);
710 		}
711 		kstat_install(intr_ksp);
712 	}
713 }
714 
/*
 * Delete interrupt kstats for this CPU.
 *
 * The kstat is looked up by name ("cpu", cp->cpu_id, "intrstat").  The
 * lookup always passes ALL_ZONES, although cpu_create_intrstat() may
 * have created the kstat with GLOBAL_ZONEID.
 * NOTE(review): presumably ALL_ZONES matches a kstat in any zone --
 * confirm against kstat_delete_byname_zone().
 */
void
cpu_delete_intrstat(cpu_t *cp)
{
	kstat_delete_byname_zone("cpu", cp->cpu_id, "intrstat", ALL_ZONES);
}
723 
724 /*
725  * Convert interrupt statistics from CPU ticks to nanoseconds and
726  * update kstat.
727  */
728 int
729 cpu_kstat_intrstat_update(kstat_t *ksp, int rw)
730 {
731 	kstat_named_t	*knp = ksp->ks_data;
732 	cpu_t		*cpup = (cpu_t *)ksp->ks_private;
733 	int		i;
734 	hrtime_t	hrt;
735 
736 	if (rw == KSTAT_WRITE)
737 		return (EACCES);
738 
739 	for (i = 0; i < PIL_MAX; i++) {
740 		hrt = (hrtime_t)cpup->cpu_m.intrstat[i + 1][0];
741 		tsc_scalehrtime(&hrt);
742 		knp[i * 2].value.ui64 = (uint64_t)hrt;
743 		knp[(i * 2) + 1].value.ui64 = cpup->cpu_stats.sys.intr[i];
744 	}
745 
746 	return (0);
747 }
748 
749 /*
750  * An interrupt thread is ending a time slice, so compute the interval it
751  * ran for and update the statistic for its PIL.
752  */
753 void
754 cpu_intr_swtch_enter(kthread_id_t t)
755 {
756 	uint64_t	interval;
757 	uint64_t	start;
758 	cpu_t		*cpu;
759 
760 	ASSERT((t->t_flag & T_INTR_THREAD) != 0);
761 	ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL);
762 
763 	/*
764 	 * We could be here with a zero timestamp. This could happen if:
765 	 * an interrupt thread which no longer has a pinned thread underneath
766 	 * it (i.e. it blocked at some point in its past) has finished running
767 	 * its handler. intr_thread() updated the interrupt statistic for its
768 	 * PIL and zeroed its timestamp. Since there was no pinned thread to
769 	 * return to, swtch() gets called and we end up here.
770 	 *
771 	 * Note that we use atomic ops below (cas64 and atomic_add_64), which
772 	 * we don't use in the functions above, because we're not called
773 	 * with interrupts blocked, but the epilog/prolog functions are.
774 	 */
775 	if (t->t_intr_start) {
776 		do {
777 			start = t->t_intr_start;
778 			interval = tsc_read() - start;
779 		} while (cas64(&t->t_intr_start, start, 0) != start);
780 		cpu = CPU;
781 		cpu->cpu_m.intrstat[t->t_pil][0] += interval;
782 
783 		atomic_add_64((uint64_t *)&cpu->cpu_intracct[cpu->cpu_mstate],
784 		    interval);
785 	} else
786 		ASSERT(t->t_intr == NULL);
787 }
788 
789 /*
790  * An interrupt thread is returning from swtch(). Place a starting timestamp
791  * in its thread structure.
792  */
793 void
794 cpu_intr_swtch_exit(kthread_id_t t)
795 {
796 	uint64_t ts;
797 
798 	ASSERT((t->t_flag & T_INTR_THREAD) != 0);
799 	ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL);
800 
801 	do {
802 		ts = t->t_intr_start;
803 	} while (cas64(&t->t_intr_start, ts, tsc_read()) != ts);
804 }
805 
/*
 * Dispatch a hilevel interrupt (one above LOCK_LEVEL)
 *
 * Interrupts are enabled around the av_dispatch_autovect() call for
 * 'vector' and disabled again before returning.  The second argument is
 * not used.
 * NOTE(review): presumably it exists only so that the signature matches
 * what switch_sp_and_call() expects -- confirm.
 */
/*ARGSUSED*/
static void
dispatch_hilevel(uint_t vector, uint_t arg2)
{
	sti();
	av_dispatch_autovect(vector);
	cli();
}
817 
/*
 * Dispatch a soft interrupt
 *
 * Runs the soft interrupt handlers for the current thread's t_pil with
 * interrupts enabled, then, with interrupts disabled again, runs
 * dosoftint_epilog() with 'oldpil'.  The second argument is not used.
 */
/*ARGSUSED*/
static void
dispatch_softint(uint_t oldpil, uint_t arg2)
{
	struct cpu *cpu = CPU;

	sti();
	av_dispatch_softvect((int)cpu->cpu_thread->t_pil);
	cli();

	/*
	 * Must run softint_epilog() on the interrupt thread stack, since
	 * there may not be a return from it if the interrupt thread blocked.
	 */
	dosoftint_epilog(cpu, oldpil);
}
837 
/*
 * Dispatch a normal interrupt
 *
 * Runs the handlers for 'vector' with interrupts enabled, then, with
 * interrupts disabled again, runs intr_thread_epilog() with the vector
 * and 'oldipl', the priority in effect before the interrupt.
 */
static void
dispatch_hardint(uint_t vector, uint_t oldipl)
{
	struct cpu *cpu = CPU;

	sti();
	av_dispatch_autovect(vector);
	cli();

	/*
	 * Must run intr_thread_epilog() on the interrupt thread stack, since
	 * there may not be a return from it if the interrupt thread blocked.
	 */
	intr_thread_epilog(cpu, vector, oldipl);
}
856 
857 /*
858  * Deliver any softints the current interrupt priority allows.
859  * Called with interrupts disabled.
860  */
861 void
862 dosoftint(struct regs *regs)
863 {
864 	struct cpu *cpu = CPU;
865 	int oldipl;
866 	caddr_t newsp;
867 
868 	while (cpu->cpu_softinfo.st_pending) {
869 		oldipl = cpu->cpu_pri;
870 		newsp = dosoftint_prolog(cpu, (caddr_t)regs,
871 			cpu->cpu_softinfo.st_pending, oldipl);
872 		/*
873 		 * If returned stack pointer is NULL, priority is too high
874 		 * to run any of the pending softints now.
875 		 * Break out and they will be run later.
876 		 */
877 		if (newsp == NULL)
878 			break;
879 		switch_sp_and_call(newsp, dispatch_softint, oldipl, 0);
880 	}
881 }
882 
883 /*
884  * Interrupt service routine, called with interrupts disabled.
885  */
886 /*ARGSUSED*/
887 void
888 do_interrupt(struct regs *rp, trap_trace_rec_t *ttp)
889 {
890 	struct cpu *cpu = CPU;
891 	int newipl, oldipl = cpu->cpu_pri;
892 	uint_t vector;
893 	caddr_t newsp;
894 
895 #ifdef TRAPTRACE
896 	ttp->ttr_marker = TT_INTERRUPT;
897 	ttp->ttr_ipl = 0xff;
898 	ttp->ttr_pri = oldipl;
899 	ttp->ttr_spl = cpu->cpu_base_spl;
900 	ttp->ttr_vector = 0xff;
901 #endif	/* TRAPTRACE */
902 
903 	/*
904 	 * If it's a softint go do it now.
905 	 */
906 	if (rp->r_trapno == T_SOFTINT) {
907 		dosoftint(rp);
908 		ASSERT(!interrupts_enabled());
909 		return;
910 	}
911 
912 	/*
913 	 * Raise the interrupt priority.
914 	 */
915 	newipl = (*setlvl)(oldipl, (int *)&rp->r_trapno);
916 #ifdef TRAPTRACE
917 	ttp->ttr_ipl = newipl;
918 #endif	/* TRAPTRACE */
919 
920 	/*
921 	 * Bail if it is a spurious interrupt
922 	 */
923 	if (newipl == -1)
924 		return;
925 	cpu->cpu_pri = newipl;
926 	vector = rp->r_trapno;
927 #ifdef TRAPTRACE
928 	ttp->ttr_vector = vector;
929 #endif	/* TRAPTRACE */
930 	if (newipl > LOCK_LEVEL) {
931 		/*
932 		 * High priority interrupts run on this cpu's interrupt stack.
933 		 */
934 		if (hilevel_intr_prolog(cpu, newipl, oldipl, rp) == 0) {
935 			newsp = cpu->cpu_intr_stack;
936 			switch_sp_and_call(newsp, dispatch_hilevel, vector, 0);
937 		} else { /* already on the interrupt stack */
938 			dispatch_hilevel(vector, 0);
939 		}
940 		(void) hilevel_intr_epilog(cpu, newipl, oldipl, vector);
941 	} else {
942 		/*
943 		 * Run this interrupt in a separate thread.
944 		 */
945 		newsp = intr_thread_prolog(cpu, (caddr_t)rp, newipl);
946 		switch_sp_and_call(newsp, dispatch_hardint, vector, oldipl);
947 	}
948 
949 	/*
950 	 * Deliver any pending soft interrupts.
951 	 */
952 	if (cpu->cpu_softinfo.st_pending)
953 		dosoftint(rp);
954 }
955 
956 /*
957  * Common tasks always done by _sys_rtt, called with interrupts disabled.
958  * Returns 1 if returning to userland, 0 if returning to system mode.
959  */
960 int
961 sys_rtt_common(struct regs *rp)
962 {
963 	kthread_t *tp;
964 	extern void mutex_exit_critical_start();
965 	extern long mutex_exit_critical_size;
966 
967 loop:
968 
969 	/*
970 	 * Check if returning to user
971 	 */
972 	tp = CPU->cpu_thread;
973 	if (USERMODE(rp->r_cs)) {
974 		/*
975 		 * Check if AST pending.
976 		 */
977 		if (tp->t_astflag) {
978 			/*
979 			 * Let trap() handle the AST
980 			 */
981 			sti();
982 			rp->r_trapno = T_AST;
983 			trap(rp, (caddr_t)0, CPU->cpu_id);
984 			cli();
985 			goto loop;
986 		}
987 
988 #if defined(__amd64)
989 		/*
990 		 * We are done if segment registers do not need updating.
991 		 */
992 		if ((tp->t_lwp->lwp_pcb.pcb_flags & RUPDATE_PENDING) == 0)
993 			return (1);
994 
995 		if (update_sregs(rp, tp->t_lwp)) {
996 			/*
997 			 * 1 or more of the selectors is bad.
998 			 * Deliver a SIGSEGV.
999 			 */
1000 			proc_t *p = ttoproc(tp);
1001 
1002 			sti();
1003 			mutex_enter(&p->p_lock);
1004 			tp->t_lwp->lwp_cursig = SIGSEGV;
1005 			mutex_exit(&p->p_lock);
1006 			psig();
1007 			tp->t_sig_check = 1;
1008 			cli();
1009 		}
1010 		tp->t_lwp->lwp_pcb.pcb_flags &= ~RUPDATE_PENDING;
1011 
1012 #endif	/* __amd64 */
1013 		return (1);
1014 	}
1015 
1016 	/*
1017 	 * Here if we are returning to supervisor mode.
1018 	 * Check for a kernel preemption request.
1019 	 */
1020 	if (CPU->cpu_kprunrun && (rp->r_ps & PS_IE)) {
1021 
1022 		/*
1023 		 * Do nothing if already in kpreempt
1024 		 */
1025 		if (!tp->t_preempt_lk) {
1026 			tp->t_preempt_lk = 1;
1027 			sti();
1028 			kpreempt(1); /* asynchronous kpreempt call */
1029 			cli();
1030 			tp->t_preempt_lk = 0;
1031 		}
1032 	}
1033 
1034 	/*
1035 	 * If we interrupted the mutex_exit() critical region we must
1036 	 * reset the PC back to the beginning to prevent missed wakeups
1037 	 * See the comments in mutex_exit() for details.
1038 	 */
1039 	if ((uintptr_t)rp->r_pc - (uintptr_t)mutex_exit_critical_start <
1040 	    mutex_exit_critical_size) {
1041 		rp->r_pc = (greg_t)mutex_exit_critical_start;
1042 	}
1043 	return (0);
1044 }
1045 
/*
 * Forward 'cpuid' and 'int_level' to the function that send_dirintf
 * points at.
 * NOTE(review): presumably this posts a directed interrupt at the given
 * level to the given CPU -- confirm against the code that installs
 * send_dirintf.
 */
void
send_dirint(int cpuid, int int_level)
{
	(*send_dirintf)(cpuid, int_level);
}
1051 
1052 /*
1053  * do_splx routine, takes new ipl to set
1054  * returns the old ipl.
1055  * We are careful not to set priority lower than CPU->cpu_base_pri,
1056  * even though it seems we're raising the priority, it could be set
1057  * higher at any time by an interrupt routine, so we must block interrupts
1058  * and look at CPU->cpu_base_pri
1059  */
1060 int
1061 do_splx(int newpri)
1062 {
1063 	ulong_t	flag;
1064 	cpu_t	*cpu;
1065 	int	curpri, basepri;
1066 
1067 	flag = intr_clear();
1068 	cpu = CPU; /* ints are disabled, now safe to cache cpu ptr */
1069 	curpri = cpu->cpu_m.mcpu_pri;
1070 	basepri = cpu->cpu_base_spl;
1071 	if (newpri < basepri)
1072 		newpri = basepri;
1073 	cpu->cpu_m.mcpu_pri = newpri;
1074 	(*setspl)(newpri);
1075 	/*
1076 	 * If we are going to reenable interrupts see if new priority level
1077 	 * allows pending softint delivery.
1078 	 */
1079 	if ((flag & PS_IE) &&
1080 	    bsrw_insn((uint16_t)cpu->cpu_softinfo.st_pending) > newpri)
1081 		fakesoftint();
1082 	ASSERT(!interrupts_enabled());
1083 	intr_restore(flag);
1084 	return (curpri);
1085 }
1086 
1087 /*
1088  * Common spl raise routine, takes new ipl to set
1089  * returns the old ipl, will not lower ipl.
1090  */
1091 int
1092 splr(int newpri)
1093 {
1094 	ulong_t	flag;
1095 	cpu_t	*cpu;
1096 	int	curpri, basepri;
1097 
1098 	flag = intr_clear();
1099 	cpu = CPU; /* ints are disabled, now safe to cache cpu ptr */
1100 	curpri = cpu->cpu_m.mcpu_pri;
1101 	/*
1102 	 * Only do something if new priority is larger
1103 	 */
1104 	if (newpri > curpri) {
1105 		basepri = cpu->cpu_base_spl;
1106 		if (newpri < basepri)
1107 			newpri = basepri;
1108 		cpu->cpu_m.mcpu_pri = newpri;
1109 		(*setspl)(newpri);
1110 		/*
1111 		 * See if new priority level allows pending softint delivery
1112 		 */
1113 		if ((flag & PS_IE) &&
1114 		    bsrw_insn((uint16_t)cpu->cpu_softinfo.st_pending) > newpri)
1115 			fakesoftint();
1116 	}
1117 	intr_restore(flag);
1118 	return (curpri);
1119 }
1120 
/*
 * Return the priority level currently recorded in this CPU's mcpu_pri.
 */
int
getpil(void)
{
	return (CPU->cpu_m.mcpu_pri);
}
1126 
1127 int
1128 interrupts_enabled(void)
1129 {
1130 	ulong_t	flag;
1131 
1132 	flag = getflags();
1133 	return ((flag & PS_IE) == PS_IE);
1134 }
1135 
#ifdef DEBUG
/*
 * DEBUG-only check: asserts that interrupts are enabled, unless
 * interrupts_unleashed is zero.
 * NOTE(review): interrupts_unleashed is defined outside this file;
 * presumably it becomes non-zero once interrupts are first enabled
 * during boot -- confirm.
 */
void
assert_ints_enabled(void)
{
	ASSERT(!interrupts_unleashed || interrupts_enabled());
}
#endif	/* DEBUG */
1143