xref: /titanic_50/usr/src/uts/i86pc/os/intr.c (revision a307732568c3d861c38b0342ae32434226d10e94)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/cpuvar.h>
27 #include <sys/cpu_event.h>
28 #include <sys/regset.h>
29 #include <sys/psw.h>
30 #include <sys/types.h>
31 #include <sys/thread.h>
32 #include <sys/systm.h>
33 #include <sys/segments.h>
34 #include <sys/pcb.h>
35 #include <sys/trap.h>
36 #include <sys/ftrace.h>
37 #include <sys/traptrace.h>
38 #include <sys/clock.h>
39 #include <sys/panic.h>
40 #include <sys/disp.h>
41 #include <vm/seg_kp.h>
42 #include <sys/stack.h>
43 #include <sys/sysmacros.h>
44 #include <sys/cmn_err.h>
45 #include <sys/kstat.h>
46 #include <sys/smp_impldefs.h>
47 #include <sys/pool_pset.h>
48 #include <sys/zone.h>
49 #include <sys/bitmap.h>
50 #include <sys/archsystm.h>
51 #include <sys/machsystm.h>
52 #include <sys/ontrap.h>
53 #include <sys/x86_archext.h>
54 #include <sys/promif.h>
55 #include <vm/hat_i86.h>
56 #if defined(__xpv)
57 #include <sys/hypervisor.h>
58 #endif
59 
60 
#if defined(__xpv) && defined(DEBUG)

/*
 * This panic message is intended as an aid to interrupt debugging.
 *
 * The associated assertion tests the condition of enabling
 * events when events are already enabled.  The implication
 * being that whatever code the programmer thought was
 * protected by having events disabled until the second
 * enable happened really wasn't protected at all.
 *
 * These variables only exist on DEBUG builds for the hypervisor (__xpv).
 */

int stistipanic = 1;	/* controls the debug panic check */
const char *stistimsg = "stisti";
/* NOTE(review): presumably the last place events were enabled, per cpu */
ulong_t laststi[NCPU];

/*
 * This variable tracks the last place events were disabled on each cpu;
 * it assists in debugging when asserts that interrupts are enabled trip.
 */
ulong_t lastcli[NCPU];

#endif
84 
/* Forward declaration; do_interrupt() is defined further down this file. */
void do_interrupt(struct regs *rp, trap_trace_rec_t *ttp);

/*
 * Indirect entry point for interrupt service; it is initialised to
 * do_interrupt().
 */
void (*do_interrupt_common)(struct regs *, trap_trace_rec_t *) = do_interrupt;
/* Initialised to NULL.  NOTE(review): presumably installed elsewhere. */
uintptr_t (*get_intr_handler)(int, short) = NULL;
89 
90 /*
91  * Set cpu's base SPL level to the highest active interrupt level
92  */
93 void
94 set_base_spl(void)
95 {
96 	struct cpu *cpu = CPU;
97 	uint16_t active = (uint16_t)cpu->cpu_intr_actv;
98 
99 	cpu->cpu_base_spl = active == 0 ? 0 : bsrw_insn(active);
100 }
101 
102 /*
103  * Do all the work necessary to set up the cpu and thread structures
104  * to dispatch a high-level interrupt.
105  *
106  * Returns 0 if we're -not- already on the high-level interrupt stack,
107  * (and *must* switch to it), non-zero if we are already on that stack.
108  *
109  * Called with interrupts masked.
110  * The 'pil' is already set to the appropriate level for rp->r_trapno.
111  */
112 static int
113 hilevel_intr_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil, struct regs *rp)
114 {
115 	struct machcpu *mcpu = &cpu->cpu_m;
116 	uint_t mask;
117 	hrtime_t intrtime;
118 	hrtime_t now = tsc_read();
119 
120 	ASSERT(pil > LOCK_LEVEL);
121 
122 	if (pil == CBE_HIGH_PIL) {
123 		cpu->cpu_profile_pil = oldpil;
124 		if (USERMODE(rp->r_cs)) {
125 			cpu->cpu_profile_pc = 0;
126 			cpu->cpu_profile_upc = rp->r_pc;
127 			cpu->cpu_cpcprofile_pc = 0;
128 			cpu->cpu_cpcprofile_upc = rp->r_pc;
129 		} else {
130 			cpu->cpu_profile_pc = rp->r_pc;
131 			cpu->cpu_profile_upc = 0;
132 			cpu->cpu_cpcprofile_pc = rp->r_pc;
133 			cpu->cpu_cpcprofile_upc = 0;
134 		}
135 	}
136 
137 	mask = cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK;
138 	if (mask != 0) {
139 		int nestpil;
140 
141 		/*
142 		 * We have interrupted another high-level interrupt.
143 		 * Load starting timestamp, compute interval, update
144 		 * cumulative counter.
145 		 */
146 		nestpil = bsrw_insn((uint16_t)mask);
147 		ASSERT(nestpil < pil);
148 		intrtime = now -
149 		    mcpu->pil_high_start[nestpil - (LOCK_LEVEL + 1)];
150 		mcpu->intrstat[nestpil][0] += intrtime;
151 		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
152 		/*
153 		 * Another high-level interrupt is active below this one, so
154 		 * there is no need to check for an interrupt thread.  That
155 		 * will be done by the lowest priority high-level interrupt
156 		 * active.
157 		 */
158 	} else {
159 		kthread_t *t = cpu->cpu_thread;
160 
161 		/*
162 		 * See if we are interrupting a low-level interrupt thread.
163 		 * If so, account for its time slice only if its time stamp
164 		 * is non-zero.
165 		 */
166 		if ((t->t_flag & T_INTR_THREAD) != 0 && t->t_intr_start != 0) {
167 			intrtime = now - t->t_intr_start;
168 			mcpu->intrstat[t->t_pil][0] += intrtime;
169 			cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
170 			t->t_intr_start = 0;
171 		}
172 	}
173 
174 	/*
175 	 * Store starting timestamp in CPU structure for this PIL.
176 	 */
177 	mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] = now;
178 
179 	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);
180 
181 	if (pil == 15) {
182 		/*
183 		 * To support reentrant level 15 interrupts, we maintain a
184 		 * recursion count in the top half of cpu_intr_actv.  Only
185 		 * when this count hits zero do we clear the PIL 15 bit from
186 		 * the lower half of cpu_intr_actv.
187 		 */
188 		uint16_t *refcntp = (uint16_t *)&cpu->cpu_intr_actv + 1;
189 		(*refcntp)++;
190 	}
191 
192 	mask = cpu->cpu_intr_actv;
193 
194 	cpu->cpu_intr_actv |= (1 << pil);
195 
196 	return (mask & CPU_INTR_ACTV_HIGH_LEVEL_MASK);
197 }
198 
199 /*
200  * Does most of the work of returning from a high level interrupt.
201  *
202  * Returns 0 if there are no more high level interrupts (in which
203  * case we must switch back to the interrupted thread stack) or
204  * non-zero if there are more (in which case we should stay on it).
205  *
206  * Called with interrupts masked
207  */
208 static int
209 hilevel_intr_epilog(struct cpu *cpu, uint_t pil, uint_t oldpil, uint_t vecnum)
210 {
211 	struct machcpu *mcpu = &cpu->cpu_m;
212 	uint_t mask;
213 	hrtime_t intrtime;
214 	hrtime_t now = tsc_read();
215 
216 	ASSERT(mcpu->mcpu_pri == pil);
217 
218 	cpu->cpu_stats.sys.intr[pil - 1]++;
219 
220 	ASSERT(cpu->cpu_intr_actv & (1 << pil));
221 
222 	if (pil == 15) {
223 		/*
224 		 * To support reentrant level 15 interrupts, we maintain a
225 		 * recursion count in the top half of cpu_intr_actv.  Only
226 		 * when this count hits zero do we clear the PIL 15 bit from
227 		 * the lower half of cpu_intr_actv.
228 		 */
229 		uint16_t *refcntp = (uint16_t *)&cpu->cpu_intr_actv + 1;
230 
231 		ASSERT(*refcntp > 0);
232 
233 		if (--(*refcntp) == 0)
234 			cpu->cpu_intr_actv &= ~(1 << pil);
235 	} else {
236 		cpu->cpu_intr_actv &= ~(1 << pil);
237 	}
238 
239 	ASSERT(mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] != 0);
240 
241 	intrtime = now - mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)];
242 	mcpu->intrstat[pil][0] += intrtime;
243 	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
244 
245 	/*
246 	 * Check for lower-pil nested high-level interrupt beneath
247 	 * current one.  If so, place a starting timestamp in its
248 	 * pil_high_start entry.
249 	 */
250 	mask = cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK;
251 	if (mask != 0) {
252 		int nestpil;
253 
254 		/*
255 		 * find PIL of nested interrupt
256 		 */
257 		nestpil = bsrw_insn((uint16_t)mask);
258 		ASSERT(nestpil < pil);
259 		mcpu->pil_high_start[nestpil - (LOCK_LEVEL + 1)] = now;
260 		/*
261 		 * (Another high-level interrupt is active below this one,
262 		 * so there is no need to check for an interrupt
263 		 * thread.  That will be done by the lowest priority
264 		 * high-level interrupt active.)
265 		 */
266 	} else {
267 		/*
268 		 * Check to see if there is a low-level interrupt active.
269 		 * If so, place a starting timestamp in the thread
270 		 * structure.
271 		 */
272 		kthread_t *t = cpu->cpu_thread;
273 
274 		if (t->t_flag & T_INTR_THREAD)
275 			t->t_intr_start = now;
276 	}
277 
278 	mcpu->mcpu_pri = oldpil;
279 	(void) (*setlvlx)(oldpil, vecnum);
280 
281 	return (cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK);
282 }
283 
284 /*
285  * Set up the cpu, thread and interrupt thread structures for
286  * executing an interrupt thread.  The new stack pointer of the
287  * interrupt thread (which *must* be switched to) is returned.
288  */
289 static caddr_t
290 intr_thread_prolog(struct cpu *cpu, caddr_t stackptr, uint_t pil)
291 {
292 	struct machcpu *mcpu = &cpu->cpu_m;
293 	kthread_t *t, *volatile it;
294 	hrtime_t now = tsc_read();
295 
296 	ASSERT(pil > 0);
297 	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);
298 	cpu->cpu_intr_actv |= (1 << pil);
299 
300 	/*
301 	 * Get set to run an interrupt thread.
302 	 * There should always be an interrupt thread, since we
303 	 * allocate one for each level on each CPU.
304 	 *
305 	 * t_intr_start could be zero due to cpu_intr_swtch_enter.
306 	 */
307 	t = cpu->cpu_thread;
308 	if ((t->t_flag & T_INTR_THREAD) && t->t_intr_start != 0) {
309 		hrtime_t intrtime = now - t->t_intr_start;
310 		mcpu->intrstat[t->t_pil][0] += intrtime;
311 		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
312 		t->t_intr_start = 0;
313 	}
314 
315 	ASSERT(SA((uintptr_t)stackptr) == (uintptr_t)stackptr);
316 
317 	t->t_sp = (uintptr_t)stackptr;	/* mark stack in curthread for resume */
318 
319 	/*
320 	 * unlink the interrupt thread off the cpu
321 	 *
322 	 * Note that the code in kcpc_overflow_intr -relies- on the
323 	 * ordering of events here - in particular that t->t_lwp of
324 	 * the interrupt thread is set to the pinned thread *before*
325 	 * curthread is changed.
326 	 */
327 	it = cpu->cpu_intr_thread;
328 	cpu->cpu_intr_thread = it->t_link;
329 	it->t_intr = t;
330 	it->t_lwp = t->t_lwp;
331 
332 	/*
333 	 * (threads on the interrupt thread free list could have state
334 	 * preset to TS_ONPROC, but it helps in debugging if
335 	 * they're TS_FREE.)
336 	 */
337 	it->t_state = TS_ONPROC;
338 
339 	cpu->cpu_thread = it;		/* new curthread on this cpu */
340 	it->t_pil = (uchar_t)pil;
341 	it->t_pri = intr_pri + (pri_t)pil;
342 	it->t_intr_start = now;
343 
344 	return (it->t_stk);
345 }
346 
347 
#ifdef DEBUG
/*
 * DEBUG-only counter, bumped by intr_thread_epilog() each time an
 * interrupt thread finishes with no interrupted thread left beneath it.
 */
int intr_thread_cnt;
#endif
351 
352 /*
353  * Called with interrupts disabled
354  */
355 static void
356 intr_thread_epilog(struct cpu *cpu, uint_t vec, uint_t oldpil)
357 {
358 	struct machcpu *mcpu = &cpu->cpu_m;
359 	kthread_t *t;
360 	kthread_t *it = cpu->cpu_thread;	/* curthread */
361 	uint_t pil, basespl;
362 	hrtime_t intrtime;
363 	hrtime_t now = tsc_read();
364 
365 	pil = it->t_pil;
366 	cpu->cpu_stats.sys.intr[pil - 1]++;
367 
368 	ASSERT(it->t_intr_start != 0);
369 	intrtime = now - it->t_intr_start;
370 	mcpu->intrstat[pil][0] += intrtime;
371 	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
372 
373 	ASSERT(cpu->cpu_intr_actv & (1 << pil));
374 	cpu->cpu_intr_actv &= ~(1 << pil);
375 
376 	/*
377 	 * If there is still an interrupted thread underneath this one
378 	 * then the interrupt was never blocked and the return is
379 	 * fairly simple.  Otherwise it isn't.
380 	 */
381 	if ((t = it->t_intr) == NULL) {
382 		/*
383 		 * The interrupted thread is no longer pinned underneath
384 		 * the interrupt thread.  This means the interrupt must
385 		 * have blocked, and the interrupted thread has been
386 		 * unpinned, and has probably been running around the
387 		 * system for a while.
388 		 *
389 		 * Since there is no longer a thread under this one, put
390 		 * this interrupt thread back on the CPU's free list and
391 		 * resume the idle thread which will dispatch the next
392 		 * thread to run.
393 		 */
394 #ifdef DEBUG
395 		intr_thread_cnt++;
396 #endif
397 		cpu->cpu_stats.sys.intrblk++;
398 		/*
399 		 * Set CPU's base SPL based on active interrupts bitmask
400 		 */
401 		set_base_spl();
402 		basespl = cpu->cpu_base_spl;
403 		mcpu->mcpu_pri = basespl;
404 		(*setlvlx)(basespl, vec);
405 		(void) splhigh();
406 		sti();
407 		it->t_state = TS_FREE;
408 		/*
409 		 * Return interrupt thread to pool
410 		 */
411 		it->t_link = cpu->cpu_intr_thread;
412 		cpu->cpu_intr_thread = it;
413 		swtch();
414 		panic("intr_thread_epilog: swtch returned");
415 		/*NOTREACHED*/
416 	}
417 
418 	/*
419 	 * Return interrupt thread to the pool
420 	 */
421 	it->t_link = cpu->cpu_intr_thread;
422 	cpu->cpu_intr_thread = it;
423 	it->t_state = TS_FREE;
424 
425 	basespl = cpu->cpu_base_spl;
426 	pil = MAX(oldpil, basespl);
427 	mcpu->mcpu_pri = pil;
428 	(*setlvlx)(pil, vec);
429 	t->t_intr_start = now;
430 	cpu->cpu_thread = t;
431 }
432 
433 /*
434  * intr_get_time() is a resource for interrupt handlers to determine how
435  * much time has been spent handling the current interrupt. Such a function
436  * is needed because higher level interrupts can arrive during the
437  * processing of an interrupt.  intr_get_time() only returns time spent in the
438  * current interrupt handler.
439  *
440  * The caller must be calling from an interrupt handler running at a pil
441  * below or at lock level. Timings are not provided for high-level
442  * interrupts.
443  *
444  * The first time intr_get_time() is called while handling an interrupt,
445  * it returns the time since the interrupt handler was invoked. Subsequent
446  * calls will return the time since the prior call to intr_get_time(). Time
447  * is returned as ticks. Use scalehrtimef() to convert ticks to nsec.
448  *
449  * Theory Of Intrstat[][]:
450  *
451  * uint64_t intrstat[pil][0..1] is an array indexed by pil level, with two
452  * uint64_ts per pil.
453  *
454  * intrstat[pil][0] is a cumulative count of the number of ticks spent
455  * handling all interrupts at the specified pil on this CPU. It is
456  * exported via kstats to the user.
457  *
458  * intrstat[pil][1] is always a count of ticks less than or equal to the
459  * value in [0]. The difference between [1] and [0] is the value returned
460  * by a call to intr_get_time(). At the start of interrupt processing,
461  * [0] and [1] will be equal (or nearly so). As the interrupt consumes
462  * time, [0] will increase, but [1] will remain the same. A call to
463  * intr_get_time() will return the difference, then update [1] to be the
464  * same as [0]. Future calls will return the time since the last call.
465  * Finally, when the interrupt completes, [1] is updated to the same as [0].
466  *
467  * Implementation:
468  *
469  * intr_get_time() works much like a higher level interrupt arriving. It
470  * "checkpoints" the timing information by incrementing intrstat[pil][0]
471  * to include elapsed running time, and by setting t_intr_start to rdtsc.
472  * It then sets the return value to intrstat[pil][0] - intrstat[pil][1],
473  * and updates intrstat[pil][1] to be the same as the new value of
474  * intrstat[pil][0].
475  *
476  * In the normal handling of interrupts, after an interrupt handler returns
477  * and the code in intr_thread() updates intrstat[pil][0], it then sets
478  * intrstat[pil][1] to the new value of intrstat[pil][0]. When [0] == [1],
479  * the timings are reset, i.e. intr_get_time() will return [0] - [1] which
480  * is 0.
481  *
482  * Whenever interrupts arrive on a CPU which is handling a lower pil
483  * interrupt, they update the lower pil's [0] to show time spent in the
484  * handler that they've interrupted. This results in a growing discrepancy
485  * between [0] and [1], which is returned the next time intr_get_time() is
486  * called. Time spent in the higher-pil interrupt will not be returned in
487  * the next intr_get_time() call from the original interrupt, because
488  * the higher-pil interrupt's time is accumulated in intrstat[higherpil][].
489  */
490 uint64_t
491 intr_get_time(void)
492 {
493 	struct cpu *cpu;
494 	struct machcpu *mcpu;
495 	kthread_t *t;
496 	uint64_t time, delta, ret;
497 	uint_t pil;
498 
499 	cli();
500 	cpu = CPU;
501 	mcpu = &cpu->cpu_m;
502 	t = cpu->cpu_thread;
503 	pil = t->t_pil;
504 	ASSERT((cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK) == 0);
505 	ASSERT(t->t_flag & T_INTR_THREAD);
506 	ASSERT(pil != 0);
507 	ASSERT(t->t_intr_start != 0);
508 
509 	time = tsc_read();
510 	delta = time - t->t_intr_start;
511 	t->t_intr_start = time;
512 
513 	time = mcpu->intrstat[pil][0] + delta;
514 	ret = time - mcpu->intrstat[pil][1];
515 	mcpu->intrstat[pil][0] = time;
516 	mcpu->intrstat[pil][1] = time;
517 	cpu->cpu_intracct[cpu->cpu_mstate] += delta;
518 
519 	sti();
520 	return (ret);
521 }
522 
523 static caddr_t
524 dosoftint_prolog(
525 	struct cpu *cpu,
526 	caddr_t stackptr,
527 	uint32_t st_pending,
528 	uint_t oldpil)
529 {
530 	kthread_t *t, *volatile it;
531 	struct machcpu *mcpu = &cpu->cpu_m;
532 	uint_t pil;
533 	hrtime_t now;
534 
535 top:
536 	ASSERT(st_pending == mcpu->mcpu_softinfo.st_pending);
537 
538 	pil = bsrw_insn((uint16_t)st_pending);
539 	if (pil <= oldpil || pil <= cpu->cpu_base_spl)
540 		return (0);
541 
542 	/*
543 	 * XX64	Sigh.
544 	 *
545 	 * This is a transliteration of the i386 assembler code for
546 	 * soft interrupts.  One question is "why does this need
547 	 * to be atomic?"  One possible race is -other- processors
548 	 * posting soft interrupts to us in set_pending() i.e. the
549 	 * CPU might get preempted just after the address computation,
550 	 * but just before the atomic transaction, so another CPU would
551 	 * actually set the original CPU's st_pending bit.  However,
552 	 * it looks like it would be simpler to disable preemption there.
553 	 * Are there other races for which preemption control doesn't work?
554 	 *
555 	 * The i386 assembler version -also- checks to see if the bit
556 	 * being cleared was actually set; if it wasn't, it rechecks
557 	 * for more.  This seems a bit strange, as the only code that
558 	 * ever clears the bit is -this- code running with interrupts
559 	 * disabled on -this- CPU.  This code would probably be cheaper:
560 	 *
561 	 * atomic_and_32((uint32_t *)&mcpu->mcpu_softinfo.st_pending,
562 	 *   ~(1 << pil));
563 	 *
564 	 * and t->t_preempt--/++ around set_pending() even cheaper,
565 	 * but at this point, correctness is critical, so we slavishly
566 	 * emulate the i386 port.
567 	 */
568 	if (atomic_btr32((uint32_t *)
569 	    &mcpu->mcpu_softinfo.st_pending, pil) == 0) {
570 		st_pending = mcpu->mcpu_softinfo.st_pending;
571 		goto top;
572 	}
573 
574 	mcpu->mcpu_pri = pil;
575 	(*setspl)(pil);
576 
577 	now = tsc_read();
578 
579 	/*
580 	 * Get set to run interrupt thread.
581 	 * There should always be an interrupt thread since we
582 	 * allocate one for each level on the CPU.
583 	 */
584 	it = cpu->cpu_intr_thread;
585 	cpu->cpu_intr_thread = it->t_link;
586 
587 	/* t_intr_start could be zero due to cpu_intr_swtch_enter. */
588 	t = cpu->cpu_thread;
589 	if ((t->t_flag & T_INTR_THREAD) && t->t_intr_start != 0) {
590 		hrtime_t intrtime = now - t->t_intr_start;
591 		mcpu->intrstat[pil][0] += intrtime;
592 		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
593 		t->t_intr_start = 0;
594 	}
595 
596 	/*
597 	 * Note that the code in kcpc_overflow_intr -relies- on the
598 	 * ordering of events here - in particular that t->t_lwp of
599 	 * the interrupt thread is set to the pinned thread *before*
600 	 * curthread is changed.
601 	 */
602 	it->t_lwp = t->t_lwp;
603 	it->t_state = TS_ONPROC;
604 
605 	/*
606 	 * Push interrupted thread onto list from new thread.
607 	 * Set the new thread as the current one.
608 	 * Set interrupted thread's T_SP because if it is the idle thread,
609 	 * resume() may use that stack between threads.
610 	 */
611 
612 	ASSERT(SA((uintptr_t)stackptr) == (uintptr_t)stackptr);
613 	t->t_sp = (uintptr_t)stackptr;
614 
615 	it->t_intr = t;
616 	cpu->cpu_thread = it;
617 
618 	/*
619 	 * Set bit for this pil in CPU's interrupt active bitmask.
620 	 */
621 	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);
622 	cpu->cpu_intr_actv |= (1 << pil);
623 
624 	/*
625 	 * Initialize thread priority level from intr_pri
626 	 */
627 	it->t_pil = (uchar_t)pil;
628 	it->t_pri = (pri_t)pil + intr_pri;
629 	it->t_intr_start = now;
630 
631 	return (it->t_stk);
632 }
633 
634 static void
635 dosoftint_epilog(struct cpu *cpu, uint_t oldpil)
636 {
637 	struct machcpu *mcpu = &cpu->cpu_m;
638 	kthread_t *t, *it;
639 	uint_t pil, basespl;
640 	hrtime_t intrtime;
641 	hrtime_t now = tsc_read();
642 
643 	it = cpu->cpu_thread;
644 	pil = it->t_pil;
645 
646 	cpu->cpu_stats.sys.intr[pil - 1]++;
647 
648 	ASSERT(cpu->cpu_intr_actv & (1 << pil));
649 	cpu->cpu_intr_actv &= ~(1 << pil);
650 	intrtime = now - it->t_intr_start;
651 	mcpu->intrstat[pil][0] += intrtime;
652 	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
653 
654 	/*
655 	 * If there is still an interrupted thread underneath this one
656 	 * then the interrupt was never blocked and the return is
657 	 * fairly simple.  Otherwise it isn't.
658 	 */
659 	if ((t = it->t_intr) == NULL) {
660 		/*
661 		 * Put thread back on the interrupt thread list.
662 		 * This was an interrupt thread, so set CPU's base SPL.
663 		 */
664 		set_base_spl();
665 		it->t_state = TS_FREE;
666 		it->t_link = cpu->cpu_intr_thread;
667 		cpu->cpu_intr_thread = it;
668 		(void) splhigh();
669 		sti();
670 		swtch();
671 		/*NOTREACHED*/
672 		panic("dosoftint_epilog: swtch returned");
673 	}
674 	it->t_link = cpu->cpu_intr_thread;
675 	cpu->cpu_intr_thread = it;
676 	it->t_state = TS_FREE;
677 	cpu->cpu_thread = t;
678 	if (t->t_flag & T_INTR_THREAD)
679 		t->t_intr_start = now;
680 	basespl = cpu->cpu_base_spl;
681 	pil = MAX(oldpil, basespl);
682 	mcpu->mcpu_pri = pil;
683 	(*setspl)(pil);
684 }
685 
686 
687 /*
688  * Make the interrupted thread 'to' be runnable.
689  *
690  * Since t->t_sp has already been saved, t->t_pc is all
691  * that needs to be set in this function.
692  *
693  * Returns the interrupt level of the interrupt thread.
694  */
695 int
696 intr_passivate(
697 	kthread_t *it,		/* interrupt thread */
698 	kthread_t *t)		/* interrupted thread */
699 {
700 	extern void _sys_rtt();
701 
702 	ASSERT(it->t_flag & T_INTR_THREAD);
703 	ASSERT(SA(t->t_sp) == t->t_sp);
704 
705 	t->t_pc = (uintptr_t)_sys_rtt;
706 	return (it->t_pil);
707 }
708 
709 /*
710  * Create interrupt kstats for this CPU.
711  */
712 void
713 cpu_create_intrstat(cpu_t *cp)
714 {
715 	int		i;
716 	kstat_t		*intr_ksp;
717 	kstat_named_t	*knp;
718 	char		name[KSTAT_STRLEN];
719 	zoneid_t	zoneid;
720 
721 	ASSERT(MUTEX_HELD(&cpu_lock));
722 
723 	if (pool_pset_enabled())
724 		zoneid = GLOBAL_ZONEID;
725 	else
726 		zoneid = ALL_ZONES;
727 
728 	intr_ksp = kstat_create_zone("cpu", cp->cpu_id, "intrstat", "misc",
729 	    KSTAT_TYPE_NAMED, PIL_MAX * 2, NULL, zoneid);
730 
731 	/*
732 	 * Initialize each PIL's named kstat
733 	 */
734 	if (intr_ksp != NULL) {
735 		intr_ksp->ks_update = cpu_kstat_intrstat_update;
736 		knp = (kstat_named_t *)intr_ksp->ks_data;
737 		intr_ksp->ks_private = cp;
738 		for (i = 0; i < PIL_MAX; i++) {
739 			(void) snprintf(name, KSTAT_STRLEN, "level-%d-time",
740 			    i + 1);
741 			kstat_named_init(&knp[i * 2], name, KSTAT_DATA_UINT64);
742 			(void) snprintf(name, KSTAT_STRLEN, "level-%d-count",
743 			    i + 1);
744 			kstat_named_init(&knp[(i * 2) + 1], name,
745 			    KSTAT_DATA_UINT64);
746 		}
747 		kstat_install(intr_ksp);
748 	}
749 }
750 
751 /*
752  * Delete interrupt kstats for this CPU.
753  */
754 void
755 cpu_delete_intrstat(cpu_t *cp)
756 {
757 	kstat_delete_byname_zone("cpu", cp->cpu_id, "intrstat", ALL_ZONES);
758 }
759 
760 /*
761  * Convert interrupt statistics from CPU ticks to nanoseconds and
762  * update kstat.
763  */
764 int
765 cpu_kstat_intrstat_update(kstat_t *ksp, int rw)
766 {
767 	kstat_named_t	*knp = ksp->ks_data;
768 	cpu_t		*cpup = (cpu_t *)ksp->ks_private;
769 	int		i;
770 	hrtime_t	hrt;
771 
772 	if (rw == KSTAT_WRITE)
773 		return (EACCES);
774 
775 	for (i = 0; i < PIL_MAX; i++) {
776 		hrt = (hrtime_t)cpup->cpu_m.intrstat[i + 1][0];
777 		scalehrtimef(&hrt);
778 		knp[i * 2].value.ui64 = (uint64_t)hrt;
779 		knp[(i * 2) + 1].value.ui64 = cpup->cpu_stats.sys.intr[i];
780 	}
781 
782 	return (0);
783 }
784 
785 /*
786  * An interrupt thread is ending a time slice, so compute the interval it
787  * ran for and update the statistic for its PIL.
788  */
789 void
790 cpu_intr_swtch_enter(kthread_id_t t)
791 {
792 	uint64_t	interval;
793 	uint64_t	start;
794 	cpu_t		*cpu;
795 
796 	ASSERT((t->t_flag & T_INTR_THREAD) != 0);
797 	ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL);
798 
799 	/*
800 	 * We could be here with a zero timestamp. This could happen if:
801 	 * an interrupt thread which no longer has a pinned thread underneath
802 	 * it (i.e. it blocked at some point in its past) has finished running
803 	 * its handler. intr_thread() updated the interrupt statistic for its
804 	 * PIL and zeroed its timestamp. Since there was no pinned thread to
805 	 * return to, swtch() gets called and we end up here.
806 	 *
807 	 * Note that we use atomic ops below (cas64 and atomic_add_64), which
808 	 * we don't use in the functions above, because we're not called
809 	 * with interrupts blocked, but the epilog/prolog functions are.
810 	 */
811 	if (t->t_intr_start) {
812 		do {
813 			start = t->t_intr_start;
814 			interval = tsc_read() - start;
815 		} while (cas64(&t->t_intr_start, start, 0) != start);
816 		cpu = CPU;
817 		cpu->cpu_m.intrstat[t->t_pil][0] += interval;
818 
819 		atomic_add_64((uint64_t *)&cpu->cpu_intracct[cpu->cpu_mstate],
820 		    interval);
821 	} else
822 		ASSERT(t->t_intr == NULL);
823 }
824 
825 /*
826  * An interrupt thread is returning from swtch(). Place a starting timestamp
827  * in its thread structure.
828  */
829 void
830 cpu_intr_swtch_exit(kthread_id_t t)
831 {
832 	uint64_t ts;
833 
834 	ASSERT((t->t_flag & T_INTR_THREAD) != 0);
835 	ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL);
836 
837 	do {
838 		ts = t->t_intr_start;
839 	} while (cas64(&t->t_intr_start, ts, tsc_read()) != ts);
840 }
841 
842 /*
843  * Dispatch a hilevel interrupt (one above LOCK_LEVEL)
844  */
845 /*ARGSUSED*/
846 static void
847 dispatch_hilevel(uint_t vector, uint_t arg2)
848 {
849 	sti();
850 	av_dispatch_autovect(vector);
851 	cli();
852 }
853 
854 /*
855  * Dispatch a soft interrupt
856  */
857 /*ARGSUSED*/
858 static void
859 dispatch_softint(uint_t oldpil, uint_t arg2)
860 {
861 	struct cpu *cpu = CPU;
862 
863 	sti();
864 	av_dispatch_softvect((int)cpu->cpu_thread->t_pil);
865 	cli();
866 
867 	/*
868 	 * Must run softint_epilog() on the interrupt thread stack, since
869 	 * there may not be a return from it if the interrupt thread blocked.
870 	 */
871 	dosoftint_epilog(cpu, oldpil);
872 }
873 
874 /*
875  * Dispatch a normal interrupt
876  */
877 static void
878 dispatch_hardint(uint_t vector, uint_t oldipl)
879 {
880 	struct cpu *cpu = CPU;
881 
882 	sti();
883 	av_dispatch_autovect(vector);
884 	cli();
885 
886 	/*
887 	 * Must run intr_thread_epilog() on the interrupt thread stack, since
888 	 * there may not be a return from it if the interrupt thread blocked.
889 	 */
890 	intr_thread_epilog(cpu, vector, oldipl);
891 }
892 
893 /*
894  * Deliver any softints the current interrupt priority allows.
895  * Called with interrupts disabled.
896  */
897 void
898 dosoftint(struct regs *regs)
899 {
900 	struct cpu *cpu = CPU;
901 	int oldipl;
902 	caddr_t newsp;
903 
904 	while (cpu->cpu_softinfo.st_pending) {
905 		oldipl = cpu->cpu_pri;
906 		newsp = dosoftint_prolog(cpu, (caddr_t)regs,
907 		    cpu->cpu_softinfo.st_pending, oldipl);
908 		/*
909 		 * If returned stack pointer is NULL, priority is too high
910 		 * to run any of the pending softints now.
911 		 * Break out and they will be run later.
912 		 */
913 		if (newsp == NULL)
914 			break;
915 		switch_sp_and_call(newsp, dispatch_softint, oldipl, 0);
916 	}
917 }
918 
919 /*
920  * Interrupt service routine, called with interrupts disabled.
921  */
922 /*ARGSUSED*/
923 void
924 do_interrupt(struct regs *rp, trap_trace_rec_t *ttp)
925 {
926 	struct cpu *cpu = CPU;
927 	int newipl, oldipl = cpu->cpu_pri;
928 	uint_t vector;
929 	caddr_t newsp;
930 
931 #ifdef TRAPTRACE
932 	ttp->ttr_marker = TT_INTERRUPT;
933 	ttp->ttr_ipl = 0xff;
934 	ttp->ttr_pri = oldipl;
935 	ttp->ttr_spl = cpu->cpu_base_spl;
936 	ttp->ttr_vector = 0xff;
937 #endif	/* TRAPTRACE */
938 
939 	cpu_idle_exit(CPU_IDLE_CB_FLAG_INTR);
940 
941 	++*(uint16_t *)&cpu->cpu_m.mcpu_istamp;
942 
943 	/*
944 	 * If it's a softint go do it now.
945 	 */
946 	if (rp->r_trapno == T_SOFTINT) {
947 		dosoftint(rp);
948 		ASSERT(!interrupts_enabled());
949 		return;
950 	}
951 
952 	/*
953 	 * Raise the interrupt priority.
954 	 */
955 	newipl = (*setlvl)(oldipl, (int *)&rp->r_trapno);
956 #ifdef TRAPTRACE
957 	ttp->ttr_ipl = newipl;
958 #endif	/* TRAPTRACE */
959 
960 	/*
961 	 * Bail if it is a spurious interrupt
962 	 */
963 	if (newipl == -1)
964 		return;
965 	cpu->cpu_pri = newipl;
966 	vector = rp->r_trapno;
967 #ifdef TRAPTRACE
968 	ttp->ttr_vector = vector;
969 #endif	/* TRAPTRACE */
970 	if (newipl > LOCK_LEVEL) {
971 		/*
972 		 * High priority interrupts run on this cpu's interrupt stack.
973 		 */
974 		if (hilevel_intr_prolog(cpu, newipl, oldipl, rp) == 0) {
975 			newsp = cpu->cpu_intr_stack;
976 			switch_sp_and_call(newsp, dispatch_hilevel, vector, 0);
977 		} else { /* already on the interrupt stack */
978 			dispatch_hilevel(vector, 0);
979 		}
980 		(void) hilevel_intr_epilog(cpu, newipl, oldipl, vector);
981 	} else {
982 		/*
983 		 * Run this interrupt in a separate thread.
984 		 */
985 		newsp = intr_thread_prolog(cpu, (caddr_t)rp, newipl);
986 		switch_sp_and_call(newsp, dispatch_hardint, vector, oldipl);
987 	}
988 
989 #if !defined(__xpv)
990 	/*
991 	 * Deliver any pending soft interrupts.
992 	 */
993 	if (cpu->cpu_softinfo.st_pending)
994 		dosoftint(rp);
995 #endif	/* !__xpv */
996 }
997 
998 
999 /*
1000  * Common tasks always done by _sys_rtt, called with interrupts disabled.
1001  * Returns 1 if returning to userland, 0 if returning to system mode.
1002  */
1003 int
1004 sys_rtt_common(struct regs *rp)
1005 {
1006 	kthread_t *tp;
1007 	extern void mutex_exit_critical_start();
1008 	extern long mutex_exit_critical_size;
1009 	extern void mutex_owner_running_critical_start();
1010 	extern long mutex_owner_running_critical_size;
1011 
1012 loop:
1013 
1014 	/*
1015 	 * Check if returning to user
1016 	 */
1017 	tp = CPU->cpu_thread;
1018 	if (USERMODE(rp->r_cs)) {
1019 		/*
1020 		 * Check if AST pending.
1021 		 */
1022 		if (tp->t_astflag) {
1023 			/*
1024 			 * Let trap() handle the AST
1025 			 */
1026 			sti();
1027 			rp->r_trapno = T_AST;
1028 			trap(rp, (caddr_t)0, CPU->cpu_id);
1029 			cli();
1030 			goto loop;
1031 		}
1032 
1033 #if defined(__amd64)
1034 		/*
1035 		 * We are done if segment registers do not need updating.
1036 		 */
1037 		if (tp->t_lwp->lwp_pcb.pcb_rupdate == 0)
1038 			return (1);
1039 
1040 		if (update_sregs(rp, tp->t_lwp)) {
1041 			/*
1042 			 * 1 or more of the selectors is bad.
1043 			 * Deliver a SIGSEGV.
1044 			 */
1045 			proc_t *p = ttoproc(tp);
1046 
1047 			sti();
1048 			mutex_enter(&p->p_lock);
1049 			tp->t_lwp->lwp_cursig = SIGSEGV;
1050 			mutex_exit(&p->p_lock);
1051 			psig();
1052 			tp->t_sig_check = 1;
1053 			cli();
1054 		}
1055 		tp->t_lwp->lwp_pcb.pcb_rupdate = 0;
1056 
1057 #endif	/* __amd64 */
1058 		return (1);
1059 	}
1060 
1061 	/*
1062 	 * Here if we are returning to supervisor mode.
1063 	 * Check for a kernel preemption request.
1064 	 */
1065 	if (CPU->cpu_kprunrun && (rp->r_ps & PS_IE)) {
1066 
1067 		/*
1068 		 * Do nothing if already in kpreempt
1069 		 */
1070 		if (!tp->t_preempt_lk) {
1071 			tp->t_preempt_lk = 1;
1072 			sti();
1073 			kpreempt(1); /* asynchronous kpreempt call */
1074 			cli();
1075 			tp->t_preempt_lk = 0;
1076 		}
1077 	}
1078 
1079 	/*
1080 	 * If we interrupted the mutex_exit() critical region we must
1081 	 * reset the PC back to the beginning to prevent missed wakeups
1082 	 * See the comments in mutex_exit() for details.
1083 	 */
1084 	if ((uintptr_t)rp->r_pc - (uintptr_t)mutex_exit_critical_start <
1085 	    mutex_exit_critical_size) {
1086 		rp->r_pc = (greg_t)mutex_exit_critical_start;
1087 	}
1088 
1089 	/*
1090 	 * If we interrupted the mutex_owner_running() critical region we
1091 	 * must reset the PC back to the beginning to prevent dereferencing
1092 	 * of a freed thread pointer. See the comments in mutex_owner_running
1093 	 * for details.
1094 	 */
1095 	if ((uintptr_t)rp->r_pc -
1096 	    (uintptr_t)mutex_owner_running_critical_start <
1097 	    mutex_owner_running_critical_size) {
1098 		rp->r_pc = (greg_t)mutex_owner_running_critical_start;
1099 	}
1100 
1101 	return (0);
1102 }
1103 
/*
 * Forward a directed-interrupt request to whatever routine is currently
 * installed in send_dirintf, passing the target cpuid and int_level
 * through unchanged.
 */
void
send_dirint(int cpuid, int int_level)
{
	send_dirintf(cpuid, int_level);
}
1109 
/*
 * True when the saved flags word has PS_IE set and either the value
 * returned by (*get_pending_spl)() or bsrw_insn() applied to the low
 * 16 bits of the pending-softint mask exceeds newpri.
 *
 * NOTE: the expansion refers to a variable named `cpu', so it can only be
 * used where a cpu_t pointer of that name is in scope.
 */
#define	IS_FAKE_SOFTINT(flag, newpri)					\
	(((flag) & PS_IE) &&						\
	    (((*get_pending_spl)() > (newpri)) ||			\
	    bsrw_insn((uint16_t)cpu->cpu_softinfo.st_pending) > (newpri)))
1114 
1115 /*
1116  * do_splx routine, takes new ipl to set
1117  * returns the old ipl.
1118  * We are careful not to set priority lower than CPU->cpu_base_pri,
1119  * even though it seems we're raising the priority, it could be set
1120  * higher at any time by an interrupt routine, so we must block interrupts
1121  * and look at CPU->cpu_base_pri
1122  */
1123 int
1124 do_splx(int newpri)
1125 {
1126 	ulong_t	flag;
1127 	cpu_t	*cpu;
1128 	int	curpri, basepri;
1129 
1130 	flag = intr_clear();
1131 	cpu = CPU; /* ints are disabled, now safe to cache cpu ptr */
1132 	curpri = cpu->cpu_m.mcpu_pri;
1133 	basepri = cpu->cpu_base_spl;
1134 	if (newpri < basepri)
1135 		newpri = basepri;
1136 	cpu->cpu_m.mcpu_pri = newpri;
1137 	(*setspl)(newpri);
1138 	/*
1139 	 * If we are going to reenable interrupts see if new priority level
1140 	 * allows pending softint delivery.
1141 	 */
1142 	if (IS_FAKE_SOFTINT(flag, newpri))
1143 		fakesoftint();
1144 	ASSERT(!interrupts_enabled());
1145 	intr_restore(flag);
1146 	return (curpri);
1147 }
1148 
1149 /*
1150  * Common spl raise routine, takes new ipl to set
1151  * returns the old ipl, will not lower ipl.
1152  */
1153 int
1154 splr(int newpri)
1155 {
1156 	ulong_t	flag;
1157 	cpu_t	*cpu;
1158 	int	curpri, basepri;
1159 
1160 	flag = intr_clear();
1161 	cpu = CPU; /* ints are disabled, now safe to cache cpu ptr */
1162 	curpri = cpu->cpu_m.mcpu_pri;
1163 	/*
1164 	 * Only do something if new priority is larger
1165 	 */
1166 	if (newpri > curpri) {
1167 		basepri = cpu->cpu_base_spl;
1168 		if (newpri < basepri)
1169 			newpri = basepri;
1170 		cpu->cpu_m.mcpu_pri = newpri;
1171 		(*setspl)(newpri);
1172 		/*
1173 		 * See if new priority level allows pending softint delivery
1174 		 */
1175 		if (IS_FAKE_SOFTINT(flag, newpri))
1176 			fakesoftint();
1177 	}
1178 	intr_restore(flag);
1179 	return (curpri);
1180 }
1181 
1182 int
1183 getpil(void)
1184 {
1185 	return (CPU->cpu_m.mcpu_pri);
1186 }
1187 
1188 int
1189 spl_xcall(void)
1190 {
1191 	return (splr(ipltospl(XCALL_PIL)));
1192 }
1193 
1194 int
1195 interrupts_enabled(void)
1196 {
1197 	ulong_t	flag;
1198 
1199 	flag = getflags();
1200 	return ((flag & PS_IE) == PS_IE);
1201 }
1202 
#ifdef DEBUG
/*
 * DEBUG-only sanity check: once interrupts_unleashed is non-zero,
 * interrupts must be enabled whenever this is called.  While
 * interrupts_unleashed is still zero the check passes unconditionally.
 */
void
assert_ints_enabled(void)
{
	ASSERT(!interrupts_unleashed || interrupts_enabled());
}
#endif	/* DEBUG */
1210