xref: /titanic_52/usr/src/uts/i86pc/os/intr.c (revision c40d696f8f0e05103b3795dd37198e00ae7ef955)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/cpuvar.h>
29 #include <sys/regset.h>
30 #include <sys/psw.h>
31 #include <sys/types.h>
32 #include <sys/thread.h>
33 #include <sys/systm.h>
34 #include <sys/segments.h>
35 #include <sys/pcb.h>
36 #include <sys/trap.h>
37 #include <sys/ftrace.h>
38 #include <sys/traptrace.h>
39 #include <sys/clock.h>
40 #include <sys/panic.h>
41 #include <sys/disp.h>
42 #include <vm/seg_kp.h>
43 #include <sys/stack.h>
44 #include <sys/sysmacros.h>
45 #include <sys/cmn_err.h>
46 #include <sys/kstat.h>
47 #include <sys/smp_impldefs.h>
48 #include <sys/pool_pset.h>
49 #include <sys/zone.h>
50 #include <sys/bitmap.h>
51 #include <sys/archsystm.h>
52 #include <sys/machsystm.h>
53 #include <sys/ontrap.h>
54 #include <sys/x86_archext.h>
55 #include <sys/promif.h>
56 #include <vm/hat_i86.h>
57 
58 
59 /*
60  * Set cpu's base SPL level to the highest active interrupt level
61  */
62 void
63 set_base_spl(void)
64 {
65 	struct cpu *cpu = CPU;
66 	uint16_t active = (uint16_t)cpu->cpu_intr_actv;
67 
68 	cpu->cpu_base_spl = active == 0 ? 0 : bsrw_insn(active);
69 }
70 
/*
 * Do all the work necessary to set up the cpu and thread structures
 * to dispatch a high-level interrupt.
 *
 * cpu    - the CPU taking the interrupt
 * pil    - priority level of the new interrupt; must be above LOCK_LEVEL
 * oldpil - priority level that was in effect when the interrupt arrived
 * rp     - saved register state of the interrupted context
 *
 * Returns 0 if we're -not- already on the high-level interrupt stack,
 * (and *must* switch to it), non-zero if we are already on that stack.
 *
 * Called with interrupts masked.
 * The 'pil' is already set to the appropriate level for rp->r_trapno.
 */
static int
hilevel_intr_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil, struct regs *rp)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	uint_t mask;
	hrtime_t intrtime;
	hrtime_t now = tsc_read();

	ASSERT(pil > LOCK_LEVEL);

	/*
	 * For an interrupt at CBE_HIGH_PIL, record what was interrupted:
	 * the previous PIL and the interrupted PC, stored as a user PC or
	 * a kernel PC (the other one zeroed) according to rp->r_cs.
	 */
	if (pil == CBE_HIGH_PIL) {
		cpu->cpu_profile_pil = oldpil;
		if (USERMODE(rp->r_cs)) {
			cpu->cpu_profile_pc = 0;
			cpu->cpu_profile_upc = rp->r_pc;
		} else {
			cpu->cpu_profile_pc = rp->r_pc;
			cpu->cpu_profile_upc = 0;
		}
	}

	mask = cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK;
	if (mask != 0) {
		int nestpil;

		/*
		 * We have interrupted another high-level interrupt.
		 * Load starting timestamp, compute interval, update
		 * cumulative counter.
		 */
		nestpil = bsrw_insn((uint16_t)mask);
		ASSERT(nestpil < pil);
		intrtime = now -
		    mcpu->pil_high_start[nestpil - (LOCK_LEVEL + 1)];
		mcpu->intrstat[nestpil][0] += intrtime;
		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
		/*
		 * Another high-level interrupt is active below this one, so
		 * there is no need to check for an interrupt thread.  That
		 * will be done by the lowest priority high-level interrupt
		 * active.
		 */
	} else {
		kthread_t *t = cpu->cpu_thread;

		/*
		 * See if we are interrupting a low-level interrupt thread.
		 * If so, account for its time slice only if its time stamp
		 * is non-zero.  The time stamp is then zeroed so the same
		 * interval is not charged a second time.
		 */
		if ((t->t_flag & T_INTR_THREAD) != 0 && t->t_intr_start != 0) {
			intrtime = now - t->t_intr_start;
			mcpu->intrstat[t->t_pil][0] += intrtime;
			cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
			t->t_intr_start = 0;
		}
	}

	/*
	 * Store starting timestamp in CPU structure for this PIL.
	 * pil_high_start[] is indexed from the first level above LOCK_LEVEL.
	 */
	mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] = now;

	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);

	if (pil == 15) {
		/*
		 * To support reentrant level 15 interrupts, we maintain a
		 * recursion count in the top half of cpu_intr_actv.  Only
		 * when this count hits zero do we clear the PIL 15 bit from
		 * the lower half of cpu_intr_actv.
		 */
		uint16_t *refcntp = (uint16_t *)&cpu->cpu_intr_actv + 1;
		(*refcntp)++;
	}

	/*
	 * Snapshot the bitmask before setting this PIL's bit, so that the
	 * return value reflects only high-level interrupts that were
	 * already active when this one arrived.
	 */
	mask = cpu->cpu_intr_actv;

	cpu->cpu_intr_actv |= (1 << pil);

	return (mask & CPU_INTR_ACTV_HIGH_LEVEL_MASK);
}
163 
/*
 * Does most of the work of returning from a high level interrupt.
 *
 * cpu    - the CPU that handled the interrupt
 * pil    - priority level of the interrupt that just finished
 * oldpil - priority level to restore
 * vecnum - vector number, passed through to (*setlvlx)()
 *
 * Returns 0 if there are no more high level interrupts (in which
 * case we must switch back to the interrupted thread stack) or
 * non-zero if there are more (in which case we should stay on it).
 *
 * Called with interrupts masked
 */
static int
hilevel_intr_epilog(struct cpu *cpu, uint_t pil, uint_t oldpil, uint_t vecnum)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	uint_t mask;
	hrtime_t intrtime;
	hrtime_t now = tsc_read();

	ASSERT(mcpu->mcpu_pri == pil);

	/* Per-level interrupt counts are stored 0-based, i.e. at pil - 1. */
	cpu->cpu_stats.sys.intr[pil - 1]++;

	ASSERT(cpu->cpu_intr_actv & (1 << pil));

	if (pil == 15) {
		/*
		 * To support reentrant level 15 interrupts, we maintain a
		 * recursion count in the top half of cpu_intr_actv.  Only
		 * when this count hits zero do we clear the PIL 15 bit from
		 * the lower half of cpu_intr_actv.
		 */
		uint16_t *refcntp = (uint16_t *)&cpu->cpu_intr_actv + 1;

		ASSERT(*refcntp > 0);

		if (--(*refcntp) == 0)
			cpu->cpu_intr_actv &= ~(1 << pil);
	} else {
		cpu->cpu_intr_actv &= ~(1 << pil);
	}

	ASSERT(mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] != 0);

	/*
	 * Charge the ticks elapsed since this PIL's starting timestamp to
	 * this PIL's cumulative counter and to the per-mstate accounting.
	 */
	intrtime = now - mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)];
	mcpu->intrstat[pil][0] += intrtime;
	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;

	/*
	 * Check for lower-pil nested high-level interrupt beneath
	 * current one.  If so, place a starting timestamp in its
	 * pil_high_start entry.
	 */
	mask = cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK;
	if (mask != 0) {
		int nestpil;

		/*
		 * find PIL of nested interrupt
		 */
		nestpil = bsrw_insn((uint16_t)mask);
		ASSERT(nestpil < pil);
		mcpu->pil_high_start[nestpil - (LOCK_LEVEL + 1)] = now;
		/*
		 * (Another high-level interrupt is active below this one,
		 * so there is no need to check for an interrupt
		 * thread.  That will be done by the lowest priority
		 * high-level interrupt active.)
		 */
	} else {
		/*
		 * Check to see if there is a low-level interrupt active.
		 * If so, place a starting timestamp in the thread
		 * structure.
		 */
		kthread_t *t = cpu->cpu_thread;

		if (t->t_flag & T_INTR_THREAD)
			t->t_intr_start = now;
	}

	/* Restore the previous priority level. */
	mcpu->mcpu_pri = oldpil;
	(void) (*setlvlx)(oldpil, vecnum);

	return (cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK);
}
248 
/*
 * Set up the cpu, thread and interrupt thread structures for
 * executing an interrupt thread.  The new stack pointer of the
 * interrupt thread (which *must* be switched to) is returned.
 *
 * cpu      - the CPU taking the interrupt
 * stackptr - stack pointer to record in the interrupted thread's t_sp
 * pil      - priority level of the interrupt; must be non-zero
 */
static caddr_t
intr_thread_prolog(struct cpu *cpu, caddr_t stackptr, uint_t pil)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	kthread_t *t, *volatile it;
	hrtime_t now = tsc_read();

	ASSERT(pil > 0);
	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);
	cpu->cpu_intr_actv |= (1 << pil);

	/*
	 * Get set to run an interrupt thread.
	 * There should always be an interrupt thread, since we
	 * allocate one for each level on each CPU.
	 *
	 * If the thread being interrupted is itself an interrupt thread,
	 * charge the time it has run so far to its own PIL and zero its
	 * timestamp.
	 *
	 * t_intr_start could be zero due to cpu_intr_swtch_enter.
	 */
	t = cpu->cpu_thread;
	if ((t->t_flag & T_INTR_THREAD) && t->t_intr_start != 0) {
		hrtime_t intrtime = now - t->t_intr_start;
		mcpu->intrstat[t->t_pil][0] += intrtime;
		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
		t->t_intr_start = 0;
	}

	ASSERT(SA((uintptr_t)stackptr) == (uintptr_t)stackptr);

	t->t_sp = (uintptr_t)stackptr;	/* mark stack in curthread for resume */

	/*
	 * unlink the interrupt thread off the cpu
	 *
	 * Note that the code in kcpc_overflow_intr -relies- on the
	 * ordering of events here - in particular that t->t_lwp of
	 * the interrupt thread is set to the pinned thread *before*
	 * curthread is changed.
	 */
	it = cpu->cpu_intr_thread;
	cpu->cpu_intr_thread = it->t_link;
	it->t_intr = t;
	it->t_lwp = t->t_lwp;

	/*
	 * (threads on the interrupt thread free list could have state
	 * preset to TS_ONPROC, but it helps in debugging if
	 * they're TS_FREE.)
	 */
	it->t_state = TS_ONPROC;

	cpu->cpu_thread = it;		/* new curthread on this cpu */
	it->t_pil = (uchar_t)pil;
	it->t_pri = intr_pri + (pri_t)pil;
	it->t_intr_start = now;		/* start timing the new handler */

	return (it->t_stk);
}
311 
312 
#ifdef DEBUG
/*
 * Incremented in intr_thread_epilog() each time an interrupt thread
 * finishes with no interrupted thread still pinned beneath it.
 */
int intr_thread_cnt;
#endif
316 
/*
 * Finish running an interrupt thread: update the interrupt statistics,
 * clear the level's bit in cpu_intr_actv, return the interrupt thread to
 * the CPU's free list, and either resume the pinned thread or, if there
 * is none, call swtch().
 *
 * cpu    - the CPU that ran the interrupt thread
 * vec    - vector number, passed through to (*setlvlx)()
 * oldpil - priority level in effect before the interrupt
 *
 * Called with interrupts disabled
 */
static void
intr_thread_epilog(struct cpu *cpu, uint_t vec, uint_t oldpil)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	kthread_t *t;
	kthread_t *it = cpu->cpu_thread;	/* curthread */
	uint_t pil, basespl;
	hrtime_t intrtime;
	hrtime_t now = tsc_read();

	pil = it->t_pil;
	cpu->cpu_stats.sys.intr[pil - 1]++;

	/* Charge the time since the last timestamp to this thread's PIL. */
	ASSERT(it->t_intr_start != 0);
	intrtime = now - it->t_intr_start;
	mcpu->intrstat[pil][0] += intrtime;
	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;

	ASSERT(cpu->cpu_intr_actv & (1 << pil));
	cpu->cpu_intr_actv &= ~(1 << pil);

	/*
	 * If there is still an interrupted thread underneath this one
	 * then the interrupt was never blocked and the return is
	 * fairly simple.  Otherwise it isn't.
	 */
	if ((t = it->t_intr) == NULL) {
		/*
		 * The interrupted thread is no longer pinned underneath
		 * the interrupt thread.  This means the interrupt must
		 * have blocked, and the interrupted thread has been
		 * unpinned, and has probably been running around the
		 * system for a while.
		 *
		 * Since there is no longer a thread under this one, put
		 * this interrupt thread back on the CPU's free list and
		 * resume the idle thread which will dispatch the next
		 * thread to run.
		 */
#ifdef DEBUG
		intr_thread_cnt++;
#endif
		cpu->cpu_stats.sys.intrblk++;
		/*
		 * Set CPU's base SPL based on active interrupts bitmask
		 */
		set_base_spl();
		basespl = cpu->cpu_base_spl;
		mcpu->mcpu_pri = basespl;
		(*setlvlx)(basespl, vec);
		(void) splhigh();
		sti();
		it->t_state = TS_FREE;
		/*
		 * Return interrupt thread to pool
		 */
		it->t_link = cpu->cpu_intr_thread;
		cpu->cpu_intr_thread = it;
		swtch();
		panic("intr_thread_epilog: swtch returned");
		/*NOTREACHED*/
	}

	/*
	 * Return interrupt thread to the pool
	 */
	it->t_link = cpu->cpu_intr_thread;
	cpu->cpu_intr_thread = it;
	it->t_state = TS_FREE;

	/*
	 * Drop back to the interrupted priority level, but never below the
	 * CPU's base SPL, then make the pinned thread current again.  Its
	 * timestamp is set unconditionally here.
	 */
	basespl = cpu->cpu_base_spl;
	pil = MAX(oldpil, basespl);
	mcpu->mcpu_pri = pil;
	(*setlvlx)(pil, vec);
	t->t_intr_start = now;
	cpu->cpu_thread = t;
}
397 
398 /*
399  * intr_get_time() is a resource for interrupt handlers to determine how
400  * much time has been spent handling the current interrupt. Such a function
401  * is needed because higher level interrupts can arrive during the
402  * processing of an interrupt.  intr_get_time() only returns time spent in the
403  * current interrupt handler.
404  *
405  * The caller must be calling from an interrupt handler running at a pil
406  * below or at lock level. Timings are not provided for high-level
407  * interrupts.
408  *
409  * The first time intr_get_time() is called while handling an interrupt,
410  * it returns the time since the interrupt handler was invoked. Subsequent
411  * calls will return the time since the prior call to intr_get_time(). Time
412  * is returned as ticks. Use tsc_scalehrtime() to convert ticks to nsec.
413  *
414  * Theory Of Intrstat[][]:
415  *
416  * uint64_t intrstat[pil][0..1] is an array indexed by pil level, with two
417  * uint64_ts per pil.
418  *
419  * intrstat[pil][0] is a cumulative count of the number of ticks spent
420  * handling all interrupts at the specified pil on this CPU. It is
421  * exported via kstats to the user.
422  *
423  * intrstat[pil][1] is always a count of ticks less than or equal to the
424  * value in [0]. The difference between [1] and [0] is the value returned
425  * by a call to intr_get_time(). At the start of interrupt processing,
426  * [0] and [1] will be equal (or nearly so). As the interrupt consumes
427  * time, [0] will increase, but [1] will remain the same. A call to
428  * intr_get_time() will return the difference, then update [1] to be the
429  * same as [0]. Future calls will return the time since the last call.
430  * Finally, when the interrupt completes, [1] is updated to the same as [0].
431  *
432  * Implementation:
433  *
434  * intr_get_time() works much like a higher level interrupt arriving. It
435  * "checkpoints" the timing information by incrementing intrstat[pil][0]
436  * to include elapsed running time, and by setting t_intr_start to rdtsc.
437  * It then sets the return value to intrstat[pil][0] - intrstat[pil][1],
438  * and updates intrstat[pil][1] to be the same as the new value of
439  * intrstat[pil][0].
440  *
441  * In the normal handling of interrupts, after an interrupt handler returns
442  * and the code in intr_thread() updates intrstat[pil][0], it then sets
443  * intrstat[pil][1] to the new value of intrstat[pil][0]. When [0] == [1],
444  * the timings are reset, i.e. intr_get_time() will return [0] - [1] which
445  * is 0.
446  *
447  * Whenever interrupts arrive on a CPU which is handling a lower pil
448  * interrupt, they update the lower pil's [0] to show time spent in the
449  * handler that they've interrupted. This results in a growing discrepancy
450  * between [0] and [1], which is returned the next time intr_get_time() is
451  * called. Time spent in the higher-pil interrupt will not be returned in
452  * the next intr_get_time() call from the original interrupt, because
453  * the higher-pil interrupt's time is accumulated in intrstat[higherpil][].
454  */
455 uint64_t
456 intr_get_time(void)
457 {
458 	struct cpu *cpu;
459 	struct machcpu *mcpu;
460 	kthread_t *t;
461 	uint64_t time, delta, ret;
462 	uint_t pil;
463 
464 	cli();
465 	cpu = CPU;
466 	mcpu = &cpu->cpu_m;
467 	t = cpu->cpu_thread;
468 	pil = t->t_pil;
469 	ASSERT((cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK) == 0);
470 	ASSERT(t->t_flag & T_INTR_THREAD);
471 	ASSERT(pil != 0);
472 	ASSERT(t->t_intr_start != 0);
473 
474 	time = tsc_read();
475 	delta = time - t->t_intr_start;
476 	t->t_intr_start = time;
477 
478 	time = mcpu->intrstat[pil][0] + delta;
479 	ret = time - mcpu->intrstat[pil][1];
480 	mcpu->intrstat[pil][0] = time;
481 	mcpu->intrstat[pil][1] = time;
482 	cpu->cpu_intracct[cpu->cpu_mstate] += delta;
483 
484 	sti();
485 	return (ret);
486 }
487 
488 static caddr_t
489 dosoftint_prolog(
490 	struct cpu *cpu,
491 	caddr_t stackptr,
492 	uint32_t st_pending,
493 	uint_t oldpil)
494 {
495 	kthread_t *t, *volatile it;
496 	struct machcpu *mcpu = &cpu->cpu_m;
497 	uint_t pil;
498 	hrtime_t now;
499 
500 top:
501 	ASSERT(st_pending == mcpu->mcpu_softinfo.st_pending);
502 
503 	pil = bsrw_insn((uint16_t)st_pending);
504 	if (pil <= oldpil || pil <= cpu->cpu_base_spl)
505 		return (0);
506 
507 	/*
508 	 * XX64	Sigh.
509 	 *
510 	 * This is a transliteration of the i386 assembler code for
511 	 * soft interrupts.  One question is "why does this need
512 	 * to be atomic?"  One possible race is -other- processors
513 	 * posting soft interrupts to us in set_pending() i.e. the
514 	 * CPU might get preempted just after the address computation,
515 	 * but just before the atomic transaction, so another CPU would
516 	 * actually set the original CPU's st_pending bit.  However,
517 	 * it looks like it would be simpler to disable preemption there.
518 	 * Are there other races for which preemption control doesn't work?
519 	 *
520 	 * The i386 assembler version -also- checks to see if the bit
521 	 * being cleared was actually set; if it wasn't, it rechecks
522 	 * for more.  This seems a bit strange, as the only code that
523 	 * ever clears the bit is -this- code running with interrupts
524 	 * disabled on -this- CPU.  This code would probably be cheaper:
525 	 *
526 	 * atomic_and_32((uint32_t *)&mcpu->mcpu_softinfo.st_pending,
527 	 *   ~(1 << pil));
528 	 *
529 	 * and t->t_preempt--/++ around set_pending() even cheaper,
530 	 * but at this point, correctness is critical, so we slavishly
531 	 * emulate the i386 port.
532 	 */
533 	if (atomic_btr32((uint32_t *)
534 	    &mcpu->mcpu_softinfo.st_pending, pil) == 0) {
535 		st_pending = mcpu->mcpu_softinfo.st_pending;
536 		goto top;
537 	}
538 
539 	mcpu->mcpu_pri = pil;
540 	(*setspl)(pil);
541 
542 	now = tsc_read();
543 
544 	/*
545 	 * Get set to run interrupt thread.
546 	 * There should always be an interrupt thread since we
547 	 * allocate one for each level on the CPU.
548 	 */
549 	it = cpu->cpu_intr_thread;
550 	cpu->cpu_intr_thread = it->t_link;
551 
552 	/* t_intr_start could be zero due to cpu_intr_swtch_enter. */
553 	t = cpu->cpu_thread;
554 	if ((t->t_flag & T_INTR_THREAD) && t->t_intr_start != 0) {
555 		hrtime_t intrtime = now - t->t_intr_start;
556 		mcpu->intrstat[pil][0] += intrtime;
557 		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
558 		t->t_intr_start = 0;
559 	}
560 
561 	/*
562 	 * Note that the code in kcpc_overflow_intr -relies- on the
563 	 * ordering of events here - in particular that t->t_lwp of
564 	 * the interrupt thread is set to the pinned thread *before*
565 	 * curthread is changed.
566 	 */
567 	it->t_lwp = t->t_lwp;
568 	it->t_state = TS_ONPROC;
569 
570 	/*
571 	 * Push interrupted thread onto list from new thread.
572 	 * Set the new thread as the current one.
573 	 * Set interrupted thread's T_SP because if it is the idle thread,
574 	 * resume() may use that stack between threads.
575 	 */
576 
577 	ASSERT(SA((uintptr_t)stackptr) == (uintptr_t)stackptr);
578 	t->t_sp = (uintptr_t)stackptr;
579 
580 	it->t_intr = t;
581 	cpu->cpu_thread = it;
582 
583 	/*
584 	 * Set bit for this pil in CPU's interrupt active bitmask.
585 	 */
586 	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);
587 	cpu->cpu_intr_actv |= (1 << pil);
588 
589 	/*
590 	 * Initialize thread priority level from intr_pri
591 	 */
592 	it->t_pil = (uchar_t)pil;
593 	it->t_pri = (pri_t)pil + intr_pri;
594 	it->t_intr_start = now;
595 
596 	return (it->t_stk);
597 }
598 
/*
 * Finish running a soft interrupt thread: update the interrupt
 * statistics, clear the level's bit in cpu_intr_actv, return the
 * interrupt thread to the CPU's free list, and either resume the pinned
 * thread or, if there is none, call swtch().
 *
 * cpu    - the CPU that ran the soft interrupt
 * oldpil - priority level in effect before the soft interrupt
 */
static void
dosoftint_epilog(struct cpu *cpu, uint_t oldpil)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	kthread_t *t, *it;
	uint_t pil, basespl;
	hrtime_t intrtime;
	hrtime_t now = tsc_read();

	it = cpu->cpu_thread;
	pil = it->t_pil;

	cpu->cpu_stats.sys.intr[pil - 1]++;

	ASSERT(cpu->cpu_intr_actv & (1 << pil));
	cpu->cpu_intr_actv &= ~(1 << pil);
	/* Charge the time since the last timestamp to this thread's PIL. */
	intrtime = now - it->t_intr_start;
	mcpu->intrstat[pil][0] += intrtime;
	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;

	/*
	 * If there is still an interrupted thread underneath this one
	 * then the interrupt was never blocked and the return is
	 * fairly simple.  Otherwise it isn't.
	 */
	if ((t = it->t_intr) == NULL) {
		/*
		 * Put thread back on the interrupt thread list.
		 * This was an interrupt thread, so set CPU's base SPL.
		 */
		set_base_spl();
		it->t_state = TS_FREE;
		it->t_link = cpu->cpu_intr_thread;
		cpu->cpu_intr_thread = it;
		(void) splhigh();
		sti();
		swtch();
		/*NOTREACHED*/
		panic("dosoftint_epilog: swtch returned");
	}
	/*
	 * Return the interrupt thread to the free list and make the pinned
	 * thread current again.  Its timestamp is restarted only if it is
	 * itself an interrupt thread.
	 */
	it->t_link = cpu->cpu_intr_thread;
	cpu->cpu_intr_thread = it;
	it->t_state = TS_FREE;
	cpu->cpu_thread = t;
	if (t->t_flag & T_INTR_THREAD)
		t->t_intr_start = now;
	/* Restore the old priority level, but never below the base SPL. */
	basespl = cpu->cpu_base_spl;
	pil = MAX(oldpil, basespl);
	mcpu->mcpu_pri = pil;
	(*setspl)(pil);
}
650 
651 
652 /*
653  * Make the interrupted thread 'to' be runnable.
654  *
655  * Since t->t_sp has already been saved, t->t_pc is all
656  * that needs to be set in this function.
657  *
658  * Returns the interrupt level of the interrupt thread.
659  */
660 int
661 intr_passivate(
662 	kthread_t *it,		/* interrupt thread */
663 	kthread_t *t)		/* interrupted thread */
664 {
665 	extern void _sys_rtt();
666 
667 	ASSERT(it->t_flag & T_INTR_THREAD);
668 	ASSERT(SA(t->t_sp) == t->t_sp);
669 
670 	t->t_pc = (uintptr_t)_sys_rtt;
671 	return (it->t_pil);
672 }
673 
674 /*
675  * Create interrupt kstats for this CPU.
676  */
677 void
678 cpu_create_intrstat(cpu_t *cp)
679 {
680 	int		i;
681 	kstat_t		*intr_ksp;
682 	kstat_named_t	*knp;
683 	char		name[KSTAT_STRLEN];
684 	zoneid_t	zoneid;
685 
686 	ASSERT(MUTEX_HELD(&cpu_lock));
687 
688 	if (pool_pset_enabled())
689 		zoneid = GLOBAL_ZONEID;
690 	else
691 		zoneid = ALL_ZONES;
692 
693 	intr_ksp = kstat_create_zone("cpu", cp->cpu_id, "intrstat", "misc",
694 	    KSTAT_TYPE_NAMED, PIL_MAX * 2, NULL, zoneid);
695 
696 	/*
697 	 * Initialize each PIL's named kstat
698 	 */
699 	if (intr_ksp != NULL) {
700 		intr_ksp->ks_update = cpu_kstat_intrstat_update;
701 		knp = (kstat_named_t *)intr_ksp->ks_data;
702 		intr_ksp->ks_private = cp;
703 		for (i = 0; i < PIL_MAX; i++) {
704 			(void) snprintf(name, KSTAT_STRLEN, "level-%d-time",
705 			    i + 1);
706 			kstat_named_init(&knp[i * 2], name, KSTAT_DATA_UINT64);
707 			(void) snprintf(name, KSTAT_STRLEN, "level-%d-count",
708 			    i + 1);
709 			kstat_named_init(&knp[(i * 2) + 1], name,
710 			    KSTAT_DATA_UINT64);
711 		}
712 		kstat_install(intr_ksp);
713 	}
714 }
715 
/*
 * Delete interrupt kstats for this CPU.
 *
 * Looks the kstat up by module "cpu", the CPU's id as instance, and name
 * "intrstat", the same triple that cpu_create_intrstat() uses.
 */
void
cpu_delete_intrstat(cpu_t *cp)
{
	kstat_delete_byname_zone("cpu", cp->cpu_id, "intrstat", ALL_ZONES);
}
724 
725 /*
726  * Convert interrupt statistics from CPU ticks to nanoseconds and
727  * update kstat.
728  */
729 int
730 cpu_kstat_intrstat_update(kstat_t *ksp, int rw)
731 {
732 	kstat_named_t	*knp = ksp->ks_data;
733 	cpu_t		*cpup = (cpu_t *)ksp->ks_private;
734 	int		i;
735 	hrtime_t	hrt;
736 
737 	if (rw == KSTAT_WRITE)
738 		return (EACCES);
739 
740 	for (i = 0; i < PIL_MAX; i++) {
741 		hrt = (hrtime_t)cpup->cpu_m.intrstat[i + 1][0];
742 		tsc_scalehrtime(&hrt);
743 		knp[i * 2].value.ui64 = (uint64_t)hrt;
744 		knp[(i * 2) + 1].value.ui64 = cpup->cpu_stats.sys.intr[i];
745 	}
746 
747 	return (0);
748 }
749 
750 /*
751  * An interrupt thread is ending a time slice, so compute the interval it
752  * ran for and update the statistic for its PIL.
753  */
754 void
755 cpu_intr_swtch_enter(kthread_id_t t)
756 {
757 	uint64_t	interval;
758 	uint64_t	start;
759 	cpu_t		*cpu;
760 
761 	ASSERT((t->t_flag & T_INTR_THREAD) != 0);
762 	ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL);
763 
764 	/*
765 	 * We could be here with a zero timestamp. This could happen if:
766 	 * an interrupt thread which no longer has a pinned thread underneath
767 	 * it (i.e. it blocked at some point in its past) has finished running
768 	 * its handler. intr_thread() updated the interrupt statistic for its
769 	 * PIL and zeroed its timestamp. Since there was no pinned thread to
770 	 * return to, swtch() gets called and we end up here.
771 	 *
772 	 * Note that we use atomic ops below (cas64 and atomic_add_64), which
773 	 * we don't use in the functions above, because we're not called
774 	 * with interrupts blocked, but the epilog/prolog functions are.
775 	 */
776 	if (t->t_intr_start) {
777 		do {
778 			start = t->t_intr_start;
779 			interval = tsc_read() - start;
780 		} while (cas64(&t->t_intr_start, start, 0) != start);
781 		cpu = CPU;
782 		cpu->cpu_m.intrstat[t->t_pil][0] += interval;
783 
784 		atomic_add_64((uint64_t *)&cpu->cpu_intracct[cpu->cpu_mstate],
785 		    interval);
786 	} else
787 		ASSERT(t->t_intr == NULL);
788 }
789 
790 /*
791  * An interrupt thread is returning from swtch(). Place a starting timestamp
792  * in its thread structure.
793  */
794 void
795 cpu_intr_swtch_exit(kthread_id_t t)
796 {
797 	uint64_t ts;
798 
799 	ASSERT((t->t_flag & T_INTR_THREAD) != 0);
800 	ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL);
801 
802 	do {
803 		ts = t->t_intr_start;
804 	} while (cas64(&t->t_intr_start, ts, tsc_read()) != ts);
805 }
806 
/*
 * Dispatch a hilevel interrupt (one above LOCK_LEVEL)
 *
 * Interrupts are enabled only for the duration of the handler dispatch
 * and are disabled again before returning.  The second argument is
 * unused; do_interrupt() passes this function to switch_sp_and_call()
 * with two arguments.
 */
/*ARGSUSED*/
static void
dispatch_hilevel(uint_t vector, uint_t arg2)
{
	sti();
	av_dispatch_autovect(vector);
	cli();
}
818 
/*
 * Dispatch a soft interrupt
 *
 * Runs the soft interrupt handlers for the current thread's PIL with
 * interrupts enabled, then finishes with dosoftint_epilog().  oldpil is
 * the priority level to hand to the epilog; the second argument is unused.
 */
/*ARGSUSED*/
static void
dispatch_softint(uint_t oldpil, uint_t arg2)
{
	struct cpu *cpu = CPU;

	sti();
	av_dispatch_softvect((int)cpu->cpu_thread->t_pil);
	cli();

	/*
	 * Must run dosoftint_epilog() on the interrupt thread stack, since
	 * there may not be a return from it if the interrupt thread blocked.
	 */
	dosoftint_epilog(cpu, oldpil);
}
838 
/*
 * Dispatch a normal interrupt
 *
 * Runs the handlers for 'vector' with interrupts enabled, then finishes
 * with intr_thread_epilog().  oldipl is the priority level to hand to
 * the epilog.
 */
static void
dispatch_hardint(uint_t vector, uint_t oldipl)
{
	struct cpu *cpu = CPU;

	sti();
	av_dispatch_autovect(vector);
	cli();

	/*
	 * Must run intr_thread_epilog() on the interrupt thread stack, since
	 * there may not be a return from it if the interrupt thread blocked.
	 */
	intr_thread_epilog(cpu, vector, oldipl);
}
857 
858 /*
859  * Deliver any softints the current interrupt priority allows.
860  * Called with interrupts disabled.
861  */
862 void
863 dosoftint(struct regs *regs)
864 {
865 	struct cpu *cpu = CPU;
866 	int oldipl;
867 	caddr_t newsp;
868 
869 	while (cpu->cpu_softinfo.st_pending) {
870 		oldipl = cpu->cpu_pri;
871 		newsp = dosoftint_prolog(cpu, (caddr_t)regs,
872 			cpu->cpu_softinfo.st_pending, oldipl);
873 		/*
874 		 * If returned stack pointer is NULL, priority is too high
875 		 * to run any of the pending softints now.
876 		 * Break out and they will be run later.
877 		 */
878 		if (newsp == NULL)
879 			break;
880 		switch_sp_and_call(newsp, dispatch_softint, oldipl, 0);
881 	}
882 }
883 
/*
 * Interrupt service routine, called with interrupts disabled.
 *
 * rp  - saved register state; rp->r_trapno identifies the interrupt and
 *       may be rewritten by (*setlvl)()
 * ttp - trap trace record; only written when TRAPTRACE is defined
 *
 * Soft interrupts are handed straight to dosoftint().  Otherwise the
 * priority is raised, and the interrupt is run either on the CPU's
 * interrupt stack (levels above LOCK_LEVEL) or in an interrupt thread.
 */
/*ARGSUSED*/
void
do_interrupt(struct regs *rp, trap_trace_rec_t *ttp)
{
	struct cpu *cpu = CPU;
	int newipl, oldipl = cpu->cpu_pri;
	uint_t vector;
	caddr_t newsp;

#ifdef TRAPTRACE
	ttp->ttr_marker = TT_INTERRUPT;
	ttp->ttr_ipl = 0xff;
	ttp->ttr_pri = oldipl;
	ttp->ttr_spl = cpu->cpu_base_spl;
	ttp->ttr_vector = 0xff;
#endif	/* TRAPTRACE */

	/*
	 * Handle any pending TLB flushing
	 */
	tlb_service();

	/*
	 * If it's a softint go do it now.
	 */
	if (rp->r_trapno == T_SOFTINT) {
		dosoftint(rp);
		ASSERT(!interrupts_enabled());
		return;
	}

	/*
	 * Raise the interrupt priority.
	 */
	newipl = (*setlvl)(oldipl, (int *)&rp->r_trapno);
#ifdef TRAPTRACE
	ttp->ttr_ipl = newipl;
#endif	/* TRAPTRACE */

	/*
	 * Bail if it is a spurious interrupt
	 */
	if (newipl == -1)
		return;
	cpu->cpu_pri = newipl;
	vector = rp->r_trapno;
#ifdef TRAPTRACE
	ttp->ttr_vector = vector;
#endif	/* TRAPTRACE */
	if (newipl > LOCK_LEVEL) {
		/*
		 * High priority interrupts run on this cpu's interrupt stack.
		 */
		if (hilevel_intr_prolog(cpu, newipl, oldipl, rp) == 0) {
			newsp = cpu->cpu_intr_stack;
			switch_sp_and_call(newsp, dispatch_hilevel, vector, 0);
		} else { /* already on the interrupt stack */
			dispatch_hilevel(vector, 0);
		}
		(void) hilevel_intr_epilog(cpu, newipl, oldipl, vector);
	} else {
		/*
		 * Run this interrupt in a separate thread.
		 * The epilog runs inside dispatch_hardint(), on the
		 * interrupt thread's stack.
		 */
		newsp = intr_thread_prolog(cpu, (caddr_t)rp, newipl);
		switch_sp_and_call(newsp, dispatch_hardint, vector, oldipl);
	}

	/*
	 * Deliver any pending soft interrupts.
	 */
	if (cpu->cpu_softinfo.st_pending)
		dosoftint(rp);
}
961 
/*
 * Common tasks always done by _sys_rtt, called with interrupts disabled.
 * Returns 1 if returning to userland, 0 if returning to system mode.
 *
 * rp - saved register state of the context being returned to; r_trapno
 *      and r_pc may be modified.
 */
int
sys_rtt_common(struct regs *rp)
{
	kthread_t *tp;
	extern void mutex_exit_critical_start();
	extern long mutex_exit_critical_size;

loop:

	/*
	 * Check if returning to user
	 */
	tp = CPU->cpu_thread;
	if (USERMODE(rp->r_cs)) {
		/*
		 * Check if AST pending.
		 */
		if (tp->t_astflag) {
			/*
			 * Let trap() handle the AST
			 *
			 * Interrupts are enabled around the call, so start
			 * over from the top afterwards (the current thread
			 * is re-read and the AST flag re-tested).
			 */
			sti();
			rp->r_trapno = T_AST;
			trap(rp, (caddr_t)0, CPU->cpu_id);
			cli();
			goto loop;
		}

#if defined(__amd64)
		/*
		 * We are done if segment registers do not need updating.
		 */
		if (tp->t_lwp->lwp_pcb.pcb_rupdate == 0)
			return (1);

		if (update_sregs(rp, tp->t_lwp)) {
			/*
			 * 1 or more of the selectors is bad.
			 * Deliver a SIGSEGV.
			 */
			proc_t *p = ttoproc(tp);

			sti();
			mutex_enter(&p->p_lock);
			tp->t_lwp->lwp_cursig = SIGSEGV;
			mutex_exit(&p->p_lock);
			psig();
			tp->t_sig_check = 1;
			cli();
		}
		tp->t_lwp->lwp_pcb.pcb_rupdate = 0;

#endif	/* __amd64 */
		return (1);
	}

	/*
	 * Here if we are returning to supervisor mode.
	 * Check for a kernel preemption request.
	 * Only acted on when the interrupted context had interrupts
	 * enabled (PS_IE set in the saved flags).
	 */
	if (CPU->cpu_kprunrun && (rp->r_ps & PS_IE)) {

		/*
		 * Do nothing if already in kpreempt
		 */
		if (!tp->t_preempt_lk) {
			tp->t_preempt_lk = 1;
			sti();
			kpreempt(1); /* asynchronous kpreempt call */
			cli();
			tp->t_preempt_lk = 0;
		}
	}

	/*
	 * If we interrupted the mutex_exit() critical region we must
	 * reset the PC back to the beginning to prevent missed wakeups
	 * See the comments in mutex_exit() for details.
	 *
	 * The unsigned subtraction turns this into a single range check:
	 * a PC below the start of the region wraps to a large value and
	 * fails the comparison.
	 */
	if ((uintptr_t)rp->r_pc - (uintptr_t)mutex_exit_critical_start <
	    mutex_exit_critical_size) {
		rp->r_pc = (greg_t)mutex_exit_critical_start;
	}
	return (0);
}
1051 
/*
 * Forward a directed-interrupt request for 'cpuid' at 'int_level' to the
 * routine installed in the send_dirintf function pointer.
 */
void
send_dirint(int cpuid, int int_level)
{
	(*send_dirintf)(cpuid, int_level);
}
1057 
/*
 * do_splx routine, takes new ipl to set
 * returns the old ipl.
 * We are careful not to set the priority lower than CPU->cpu_base_spl.
 * Even when it seems we're raising the priority, the base SPL could be
 * set higher at any time by an interrupt routine, so we must block
 * interrupts and look at CPU->cpu_base_spl.
 */
int
do_splx(int newpri)
{
	ulong_t	flag;
	cpu_t	*cpu;
	int	curpri, basepri;

	flag = intr_clear();
	cpu = CPU; /* ints are disabled, now safe to cache cpu ptr */
	curpri = cpu->cpu_m.mcpu_pri;
	basepri = cpu->cpu_base_spl;
	/* Clamp the requested level so it never drops below the base SPL. */
	if (newpri < basepri)
		newpri = basepri;
	cpu->cpu_m.mcpu_pri = newpri;
	(*setspl)(newpri);
	/*
	 * If we are going to reenable interrupts see if new priority level
	 * allows pending softint delivery.
	 */
	if ((flag & PS_IE) &&
	    bsrw_insn((uint16_t)cpu->cpu_softinfo.st_pending) > newpri)
		fakesoftint();
	ASSERT(!interrupts_enabled());
	intr_restore(flag);
	return (curpri);
}
1092 
/*
 * Common spl raise routine, takes new ipl to set
 * returns the old ipl, will not lower ipl.
 *
 * If newpri is not above the current priority nothing is changed and
 * the current priority is simply returned.
 */
int
splr(int newpri)
{
	ulong_t	flag;
	cpu_t	*cpu;
	int	curpri, basepri;

	flag = intr_clear();
	cpu = CPU; /* ints are disabled, now safe to cache cpu ptr */
	curpri = cpu->cpu_m.mcpu_pri;
	/*
	 * Only do something if new priority is larger
	 */
	if (newpri > curpri) {
		basepri = cpu->cpu_base_spl;
		/* Never set the level below the CPU's base SPL. */
		if (newpri < basepri)
			newpri = basepri;
		cpu->cpu_m.mcpu_pri = newpri;
		(*setspl)(newpri);
		/*
		 * See if new priority level allows pending softint delivery
		 * (only when the caller had interrupts enabled).
		 */
		if ((flag & PS_IE) &&
		    bsrw_insn((uint16_t)cpu->cpu_softinfo.st_pending) > newpri)
			fakesoftint();
	}
	intr_restore(flag);
	return (curpri);
}
1126 
/*
 * Return the current CPU's interrupt priority level (mcpu_pri).
 */
int
getpil(void)
{
	return (CPU->cpu_m.mcpu_pri);
}
1132 
1133 int
1134 interrupts_enabled(void)
1135 {
1136 	ulong_t	flag;
1137 
1138 	flag = getflags();
1139 	return ((flag & PS_IE) == PS_IE);
1140 }
1141 
#ifdef DEBUG
/*
 * DEBUG-only check: once interrupts_unleashed is set, assert that
 * interrupts are currently enabled.
 */
void
assert_ints_enabled(void)
{
	ASSERT(!interrupts_unleashed || interrupts_enabled());
}
#endif	/* DEBUG */
1149