xref: /illumos-gate/usr/src/uts/i86pc/os/intr.c (revision c211fc479225fa54805cf480633bf6689ca9a2db)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/cpuvar.h>
28 #include <sys/regset.h>
29 #include <sys/psw.h>
30 #include <sys/types.h>
31 #include <sys/thread.h>
32 #include <sys/systm.h>
33 #include <sys/segments.h>
34 #include <sys/pcb.h>
35 #include <sys/trap.h>
36 #include <sys/ftrace.h>
37 #include <sys/traptrace.h>
38 #include <sys/clock.h>
39 #include <sys/panic.h>
40 #include <sys/disp.h>
41 #include <vm/seg_kp.h>
42 #include <sys/stack.h>
43 #include <sys/sysmacros.h>
44 #include <sys/cmn_err.h>
45 #include <sys/kstat.h>
46 #include <sys/smp_impldefs.h>
47 #include <sys/pool_pset.h>
48 #include <sys/zone.h>
49 #include <sys/bitmap.h>
50 #include <sys/archsystm.h>
51 #include <sys/machsystm.h>
52 #include <sys/ontrap.h>
53 #include <sys/x86_archext.h>
54 #include <sys/promif.h>
55 #include <vm/hat_i86.h>
56 #if defined(__xpv)
57 #include <sys/hypervisor.h>
58 #endif
59 
60 
#if defined(__xpv) && defined(DEBUG)

/*
 * Debugging aids that are only compiled into DEBUG hypervisor (__xpv)
 * kernels.  None of them is referenced elsewhere in this file.
 *
 * This panic message is intended as an aid to interrupt debugging.
 *
 * The associated assertion tests the condition of enabling
 * events when events are already enabled.  The implication
 * being that whatever code the programmer thought was
 * protected by having events disabled until the second
 * enable happened really wasn't protected at all ..
 */

int stistipanic = 1;	/* controls the debug panic check */
const char *stistimsg = "stisti";
/*
 * NOTE(review): presumably the counterpart of lastcli[] below, i.e. the
 * last place events were enabled on each cpu -- confirm against the code
 * that writes it.
 */
ulong_t laststi[NCPU];

/*
 * This variable tracks the last place events were disabled on each cpu;
 * it assists in debugging when asserts that interrupts are enabled trip.
 */
ulong_t lastcli[NCPU];

#endif
84 
85 /*
86  * Set cpu's base SPL level to the highest active interrupt level
87  */
88 void
89 set_base_spl(void)
90 {
91 	struct cpu *cpu = CPU;
92 	uint16_t active = (uint16_t)cpu->cpu_intr_actv;
93 
94 	cpu->cpu_base_spl = active == 0 ? 0 : bsrw_insn(active);
95 }
96 
/*
 * Do all the work necessary to set up the cpu and thread structures
 * to dispatch a high-level interrupt.
 *
 * Arguments:
 *	cpu	- the CPU taking the interrupt
 *	pil	- level of the new interrupt; asserted to be above LOCK_LEVEL
 *	oldpil	- priority in effect before the interrupt; saved in
 *		  cpu_profile_pil when pil is CBE_HIGH_PIL
 *	rp	- saved registers of the interrupted context; r_cs and r_pc
 *		  are sampled when pil is CBE_HIGH_PIL
 *
 * Returns 0 if we're -not- already on the high-level interrupt stack,
 * (and *must* switch to it), non-zero if we are already on that stack.
 * The value returned is the set of high-level bits that were present in
 * cpu_intr_actv before the bit for 'pil' was added.
 *
 * Called with interrupts masked.
 * The 'pil' is already set to the appropriate level for rp->r_trapno.
 */
static int
hilevel_intr_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil, struct regs *rp)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	uint_t mask;
	hrtime_t intrtime;
	hrtime_t now = tsc_read();

	ASSERT(pil > LOCK_LEVEL);

	/*
	 * At CBE_HIGH_PIL, record the interrupted pil and PC.  The user
	 * PC fields are filled in when the interrupt arrived in user mode
	 * and the kernel PC fields otherwise; the unused pair is zeroed.
	 */
	if (pil == CBE_HIGH_PIL) {
		cpu->cpu_profile_pil = oldpil;
		if (USERMODE(rp->r_cs)) {
			cpu->cpu_profile_pc = 0;
			cpu->cpu_profile_upc = rp->r_pc;
			cpu->cpu_cpcprofile_pc = 0;
			cpu->cpu_cpcprofile_upc = rp->r_pc;
		} else {
			cpu->cpu_profile_pc = rp->r_pc;
			cpu->cpu_profile_upc = 0;
			cpu->cpu_cpcprofile_pc = rp->r_pc;
			cpu->cpu_cpcprofile_upc = 0;
		}
	}

	mask = cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK;
	if (mask != 0) {
		int nestpil;

		/*
		 * We have interrupted another high-level interrupt.
		 * Load starting timestamp, compute interval, update
		 * cumulative counter.
		 */
		nestpil = bsrw_insn((uint16_t)mask);
		ASSERT(nestpil < pil);
		intrtime = now -
		    mcpu->pil_high_start[nestpil - (LOCK_LEVEL + 1)];
		mcpu->intrstat[nestpil][0] += intrtime;
		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
		/*
		 * Another high-level interrupt is active below this one, so
		 * there is no need to check for an interrupt thread.  That
		 * will be done by the lowest priority high-level interrupt
		 * active.
		 */
	} else {
		kthread_t *t = cpu->cpu_thread;

		/*
		 * See if we are interrupting a low-level interrupt thread.
		 * If so, account for its time slice only if its time stamp
		 * is non-zero.
		 */
		if ((t->t_flag & T_INTR_THREAD) != 0 && t->t_intr_start != 0) {
			intrtime = now - t->t_intr_start;
			mcpu->intrstat[t->t_pil][0] += intrtime;
			cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
			/* zero marks the thread as not currently being timed */
			t->t_intr_start = 0;
		}
	}

	/*
	 * Store starting timestamp in CPU structure for this PIL.
	 * pil_high_start[] is indexed from the first level above LOCK_LEVEL.
	 */
	mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] = now;

	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);

	if (pil == 15) {
		/*
		 * To support reentrant level 15 interrupts, we maintain a
		 * recursion count in the top half of cpu_intr_actv.  Only
		 * when this count hits zero do we clear the PIL 15 bit from
		 * the lower half of cpu_intr_actv.
		 */
		uint16_t *refcntp = (uint16_t *)&cpu->cpu_intr_actv + 1;
		(*refcntp)++;
	}

	/*
	 * Snapshot the active set before marking this level so that the
	 * return value reflects only what was already running.
	 */
	mask = cpu->cpu_intr_actv;

	cpu->cpu_intr_actv |= (1 << pil);

	return (mask & CPU_INTR_ACTV_HIGH_LEVEL_MASK);
}
193 
/*
 * Does most of the work of returning from a high level interrupt.
 *
 * Arguments:
 *	cpu	- the CPU that took the interrupt
 *	pil	- level of the interrupt being completed; asserted to equal
 *		  the cpu's current mcpu_pri
 *	oldpil	- priority to restore
 *	vecnum	- vector number, handed to (*setlvlx)() along with oldpil
 *
 * Returns 0 if there are no more high level interrupts (in which
 * case we must switch back to the interrupted thread stack) or
 * non-zero if there are more (in which case we should stay on it).
 *
 * Called with interrupts masked
 */
static int
hilevel_intr_epilog(struct cpu *cpu, uint_t pil, uint_t oldpil, uint_t vecnum)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	uint_t mask;
	hrtime_t intrtime;
	hrtime_t now = tsc_read();

	ASSERT(mcpu->mcpu_pri == pil);

	/* per-level interrupt counters are indexed from level 1 */
	cpu->cpu_stats.sys.intr[pil - 1]++;

	ASSERT(cpu->cpu_intr_actv & (1 << pil));

	if (pil == 15) {
		/*
		 * To support reentrant level 15 interrupts, we maintain a
		 * recursion count in the top half of cpu_intr_actv.  Only
		 * when this count hits zero do we clear the PIL 15 bit from
		 * the lower half of cpu_intr_actv.
		 */
		uint16_t *refcntp = (uint16_t *)&cpu->cpu_intr_actv + 1;

		ASSERT(*refcntp > 0);

		if (--(*refcntp) == 0)
			cpu->cpu_intr_actv &= ~(1 << pil);
	} else {
		cpu->cpu_intr_actv &= ~(1 << pil);
	}

	ASSERT(mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] != 0);

	/* Charge the time since this level's last start timestamp to it. */
	intrtime = now - mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)];
	mcpu->intrstat[pil][0] += intrtime;
	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;

	/*
	 * Check for lower-pil nested high-level interrupt beneath
	 * current one.  If so, place a starting timestamp in its
	 * pil_high_start entry.
	 */
	mask = cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK;
	if (mask != 0) {
		int nestpil;

		/*
		 * find PIL of nested interrupt
		 */
		nestpil = bsrw_insn((uint16_t)mask);
		ASSERT(nestpil < pil);
		mcpu->pil_high_start[nestpil - (LOCK_LEVEL + 1)] = now;
		/*
		 * (Another high-level interrupt is active below this one,
		 * so there is no need to check for an interrupt
		 * thread.  That will be done by the lowest priority
		 * high-level interrupt active.)
		 */
	} else {
		/*
		 * Check to see if there is a low-level interrupt active.
		 * If so, place a starting timestamp in the thread
		 * structure.
		 */
		kthread_t *t = cpu->cpu_thread;

		if (t->t_flag & T_INTR_THREAD)
			t->t_intr_start = now;
	}

	/* Drop back to the priority that was in effect before this one. */
	mcpu->mcpu_pri = oldpil;
	(void) (*setlvlx)(oldpil, vecnum);

	return (cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK);
}
278 
/*
 * Set up the cpu, thread and interrupt thread structures for
 * executing an interrupt thread.  The new stack pointer of the
 * interrupt thread (which *must* be switched to) is returned.
 *
 * Arguments:
 *	cpu	 - the CPU taking the interrupt
 *	stackptr - stack pointer of the interrupted thread; saved in its
 *		   t_sp and asserted to be SA()-aligned
 *	pil	 - level of the interrupt; asserted non-zero and not
 *		   already marked active in cpu_intr_actv
 *
 * Returns the t_stk of the interrupt thread taken from the cpu's
 * cpu_intr_thread list.
 */
static caddr_t
intr_thread_prolog(struct cpu *cpu, caddr_t stackptr, uint_t pil)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	kthread_t *t, *volatile it;
	hrtime_t now = tsc_read();

	ASSERT(pil > 0);
	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);
	cpu->cpu_intr_actv |= (1 << pil);

	/*
	 * Get set to run an interrupt thread.
	 * There should always be an interrupt thread, since we
	 * allocate one for each level on each CPU.
	 *
	 * If the thread being interrupted is itself an interrupt thread
	 * with a running timestamp, charge the elapsed time to its own
	 * pil and stop its clock.
	 *
	 * t_intr_start could be zero due to cpu_intr_swtch_enter.
	 */
	t = cpu->cpu_thread;
	if ((t->t_flag & T_INTR_THREAD) && t->t_intr_start != 0) {
		hrtime_t intrtime = now - t->t_intr_start;
		mcpu->intrstat[t->t_pil][0] += intrtime;
		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
		t->t_intr_start = 0;
	}

	ASSERT(SA((uintptr_t)stackptr) == (uintptr_t)stackptr);

	t->t_sp = (uintptr_t)stackptr;	/* mark stack in curthread for resume */

	/*
	 * unlink the interrupt thread off the cpu
	 *
	 * Note that the code in kcpc_overflow_intr -relies- on the
	 * ordering of events here - in particular that t->t_lwp of
	 * the interrupt thread is set to the pinned thread *before*
	 * curthread is changed.
	 */
	it = cpu->cpu_intr_thread;
	cpu->cpu_intr_thread = it->t_link;
	it->t_intr = t;
	it->t_lwp = t->t_lwp;

	/*
	 * (threads on the interrupt thread free list could have state
	 * preset to TS_ONPROC, but it helps in debugging if
	 * they're TS_FREE.)
	 */
	it->t_state = TS_ONPROC;

	cpu->cpu_thread = it;		/* new curthread on this cpu */
	it->t_pil = (uchar_t)pil;
	it->t_pri = intr_pri + (pri_t)pil;
	it->t_intr_start = now;		/* start timing the new handler */

	return (it->t_stk);
}
341 
342 
#ifdef DEBUG
/*
 * Incremented by intr_thread_epilog() each time it finds that the
 * interrupt thread no longer has a pinned thread beneath it.
 */
int intr_thread_cnt;
#endif

/*
 * Finish a low-level (interrupt thread) interrupt: account for the time
 * spent in the handler, clear the level from cpu_intr_actv, return the
 * interrupt thread to the cpu's free list and either resume the pinned
 * thread or, if there is none, call swtch().
 *
 * Arguments:
 *	cpu	- the CPU the interrupt thread is running on
 *	vec	- vector number, handed to (*setlvlx)()
 *	oldpil	- priority in effect before the interrupt
 *
 * Called with interrupts disabled
 */
static void
intr_thread_epilog(struct cpu *cpu, uint_t vec, uint_t oldpil)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	kthread_t *t;
	kthread_t *it = cpu->cpu_thread;	/* curthread */
	uint_t pil, basespl;
	hrtime_t intrtime;
	hrtime_t now = tsc_read();

	pil = it->t_pil;
	cpu->cpu_stats.sys.intr[pil - 1]++;

	/* Charge the time since the handler's last timestamp to its pil. */
	ASSERT(it->t_intr_start != 0);
	intrtime = now - it->t_intr_start;
	mcpu->intrstat[pil][0] += intrtime;
	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;

	ASSERT(cpu->cpu_intr_actv & (1 << pil));
	cpu->cpu_intr_actv &= ~(1 << pil);

	/*
	 * If there is still an interrupted thread underneath this one
	 * then the interrupt was never blocked and the return is
	 * fairly simple.  Otherwise it isn't.
	 */
	if ((t = it->t_intr) == NULL) {
		/*
		 * The interrupted thread is no longer pinned underneath
		 * the interrupt thread.  This means the interrupt must
		 * have blocked, and the interrupted thread has been
		 * unpinned, and has probably been running around the
		 * system for a while.
		 *
		 * Since there is no longer a thread under this one, put
		 * this interrupt thread back on the CPU's free list and
		 * resume the idle thread which will dispatch the next
		 * thread to run.
		 *
		 * NOTE(review): it->t_intr_start is left non-zero on this
		 * path, while the comment in cpu_intr_swtch_enter()
		 * describes the timestamp as already zeroed when swtch()
		 * is reached from here -- confirm that the interval is not
		 * accounted a second time.
		 */
#ifdef DEBUG
		intr_thread_cnt++;
#endif
		cpu->cpu_stats.sys.intrblk++;
		/*
		 * Set CPU's base SPL based on active interrupts bitmask
		 */
		set_base_spl();
		basespl = cpu->cpu_base_spl;
		mcpu->mcpu_pri = basespl;
		(*setlvlx)(basespl, vec);
		(void) splhigh();
		sti();
		it->t_state = TS_FREE;
		/*
		 * Return interrupt thread to pool
		 */
		it->t_link = cpu->cpu_intr_thread;
		cpu->cpu_intr_thread = it;
		swtch();
		panic("intr_thread_epilog: swtch returned");
		/*NOTREACHED*/
	}

	/*
	 * Return interrupt thread to the pool
	 */
	it->t_link = cpu->cpu_intr_thread;
	cpu->cpu_intr_thread = it;
	it->t_state = TS_FREE;

	/*
	 * Restore the interrupted priority, but never drop below the
	 * cpu's base SPL, then make the pinned thread current again.
	 */
	basespl = cpu->cpu_base_spl;
	pil = MAX(oldpil, basespl);
	mcpu->mcpu_pri = pil;
	(*setlvlx)(pil, vec);
	t->t_intr_start = now;
	cpu->cpu_thread = t;
}
427 
428 /*
429  * intr_get_time() is a resource for interrupt handlers to determine how
430  * much time has been spent handling the current interrupt. Such a function
431  * is needed because higher level interrupts can arrive during the
432  * processing of an interrupt.  intr_get_time() only returns time spent in the
433  * current interrupt handler.
434  *
435  * The caller must be calling from an interrupt handler running at a pil
436  * below or at lock level. Timings are not provided for high-level
437  * interrupts.
438  *
439  * The first time intr_get_time() is called while handling an interrupt,
440  * it returns the time since the interrupt handler was invoked. Subsequent
441  * calls will return the time since the prior call to intr_get_time(). Time
442  * is returned as ticks. Use scalehrtimef() to convert ticks to nsec.
443  *
444  * Theory Of Intrstat[][]:
445  *
446  * uint64_t intrstat[pil][0..1] is an array indexed by pil level, with two
447  * uint64_ts per pil.
448  *
449  * intrstat[pil][0] is a cumulative count of the number of ticks spent
450  * handling all interrupts at the specified pil on this CPU. It is
451  * exported via kstats to the user.
452  *
453  * intrstat[pil][1] is always a count of ticks less than or equal to the
454  * value in [0]. The difference between [1] and [0] is the value returned
455  * by a call to intr_get_time(). At the start of interrupt processing,
456  * [0] and [1] will be equal (or nearly so). As the interrupt consumes
457  * time, [0] will increase, but [1] will remain the same. A call to
458  * intr_get_time() will return the difference, then update [1] to be the
459  * same as [0]. Future calls will return the time since the last call.
460  * Finally, when the interrupt completes, [1] is updated to the same as [0].
461  *
462  * Implementation:
463  *
464  * intr_get_time() works much like a higher level interrupt arriving. It
465  * "checkpoints" the timing information by incrementing intrstat[pil][0]
466  * to include elapsed running time, and by setting t_intr_start to rdtsc.
467  * It then sets the return value to intrstat[pil][0] - intrstat[pil][1],
468  * and updates intrstat[pil][1] to be the same as the new value of
469  * intrstat[pil][0].
470  *
471  * In the normal handling of interrupts, after an interrupt handler returns
472  * and the code in intr_thread() updates intrstat[pil][0], it then sets
473  * intrstat[pil][1] to the new value of intrstat[pil][0]. When [0] == [1],
474  * the timings are reset, i.e. intr_get_time() will return [0] - [1] which
475  * is 0.
476  *
477  * Whenever interrupts arrive on a CPU which is handling a lower pil
478  * interrupt, they update the lower pil's [0] to show time spent in the
479  * handler that they've interrupted. This results in a growing discrepancy
480  * between [0] and [1], which is returned the next time intr_get_time() is
481  * called. Time spent in the higher-pil interrupt will not be returned in
482  * the next intr_get_time() call from the original interrupt, because
483  * the higher-pil interrupt's time is accumulated in intrstat[higherpil][].
484  */
485 uint64_t
486 intr_get_time(void)
487 {
488 	struct cpu *cpu;
489 	struct machcpu *mcpu;
490 	kthread_t *t;
491 	uint64_t time, delta, ret;
492 	uint_t pil;
493 
494 	cli();
495 	cpu = CPU;
496 	mcpu = &cpu->cpu_m;
497 	t = cpu->cpu_thread;
498 	pil = t->t_pil;
499 	ASSERT((cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK) == 0);
500 	ASSERT(t->t_flag & T_INTR_THREAD);
501 	ASSERT(pil != 0);
502 	ASSERT(t->t_intr_start != 0);
503 
504 	time = tsc_read();
505 	delta = time - t->t_intr_start;
506 	t->t_intr_start = time;
507 
508 	time = mcpu->intrstat[pil][0] + delta;
509 	ret = time - mcpu->intrstat[pil][1];
510 	mcpu->intrstat[pil][0] = time;
511 	mcpu->intrstat[pil][1] = time;
512 	cpu->cpu_intracct[cpu->cpu_mstate] += delta;
513 
514 	sti();
515 	return (ret);
516 }
517 
518 static caddr_t
519 dosoftint_prolog(
520 	struct cpu *cpu,
521 	caddr_t stackptr,
522 	uint32_t st_pending,
523 	uint_t oldpil)
524 {
525 	kthread_t *t, *volatile it;
526 	struct machcpu *mcpu = &cpu->cpu_m;
527 	uint_t pil;
528 	hrtime_t now;
529 
530 top:
531 	ASSERT(st_pending == mcpu->mcpu_softinfo.st_pending);
532 
533 	pil = bsrw_insn((uint16_t)st_pending);
534 	if (pil <= oldpil || pil <= cpu->cpu_base_spl)
535 		return (0);
536 
537 	/*
538 	 * XX64	Sigh.
539 	 *
540 	 * This is a transliteration of the i386 assembler code for
541 	 * soft interrupts.  One question is "why does this need
542 	 * to be atomic?"  One possible race is -other- processors
543 	 * posting soft interrupts to us in set_pending() i.e. the
544 	 * CPU might get preempted just after the address computation,
545 	 * but just before the atomic transaction, so another CPU would
546 	 * actually set the original CPU's st_pending bit.  However,
547 	 * it looks like it would be simpler to disable preemption there.
548 	 * Are there other races for which preemption control doesn't work?
549 	 *
550 	 * The i386 assembler version -also- checks to see if the bit
551 	 * being cleared was actually set; if it wasn't, it rechecks
552 	 * for more.  This seems a bit strange, as the only code that
553 	 * ever clears the bit is -this- code running with interrupts
554 	 * disabled on -this- CPU.  This code would probably be cheaper:
555 	 *
556 	 * atomic_and_32((uint32_t *)&mcpu->mcpu_softinfo.st_pending,
557 	 *   ~(1 << pil));
558 	 *
559 	 * and t->t_preempt--/++ around set_pending() even cheaper,
560 	 * but at this point, correctness is critical, so we slavishly
561 	 * emulate the i386 port.
562 	 */
563 	if (atomic_btr32((uint32_t *)
564 	    &mcpu->mcpu_softinfo.st_pending, pil) == 0) {
565 		st_pending = mcpu->mcpu_softinfo.st_pending;
566 		goto top;
567 	}
568 
569 	mcpu->mcpu_pri = pil;
570 	(*setspl)(pil);
571 
572 	now = tsc_read();
573 
574 	/*
575 	 * Get set to run interrupt thread.
576 	 * There should always be an interrupt thread since we
577 	 * allocate one for each level on the CPU.
578 	 */
579 	it = cpu->cpu_intr_thread;
580 	cpu->cpu_intr_thread = it->t_link;
581 
582 	/* t_intr_start could be zero due to cpu_intr_swtch_enter. */
583 	t = cpu->cpu_thread;
584 	if ((t->t_flag & T_INTR_THREAD) && t->t_intr_start != 0) {
585 		hrtime_t intrtime = now - t->t_intr_start;
586 		mcpu->intrstat[pil][0] += intrtime;
587 		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
588 		t->t_intr_start = 0;
589 	}
590 
591 	/*
592 	 * Note that the code in kcpc_overflow_intr -relies- on the
593 	 * ordering of events here - in particular that t->t_lwp of
594 	 * the interrupt thread is set to the pinned thread *before*
595 	 * curthread is changed.
596 	 */
597 	it->t_lwp = t->t_lwp;
598 	it->t_state = TS_ONPROC;
599 
600 	/*
601 	 * Push interrupted thread onto list from new thread.
602 	 * Set the new thread as the current one.
603 	 * Set interrupted thread's T_SP because if it is the idle thread,
604 	 * resume() may use that stack between threads.
605 	 */
606 
607 	ASSERT(SA((uintptr_t)stackptr) == (uintptr_t)stackptr);
608 	t->t_sp = (uintptr_t)stackptr;
609 
610 	it->t_intr = t;
611 	cpu->cpu_thread = it;
612 
613 	/*
614 	 * Set bit for this pil in CPU's interrupt active bitmask.
615 	 */
616 	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);
617 	cpu->cpu_intr_actv |= (1 << pil);
618 
619 	/*
620 	 * Initialize thread priority level from intr_pri
621 	 */
622 	it->t_pil = (uchar_t)pil;
623 	it->t_pri = (pri_t)pil + intr_pri;
624 	it->t_intr_start = now;
625 
626 	return (it->t_stk);
627 }
628 
/*
 * Finish a soft interrupt: account for the time spent in the handler,
 * clear the level from cpu_intr_actv, return the interrupt thread to the
 * cpu's free list and either resume the pinned thread or, if there is
 * none, call swtch().
 *
 * Arguments:
 *	cpu	- the CPU the soft interrupt thread is running on
 *	oldpil	- priority in effect before the soft interrupt
 */
static void
dosoftint_epilog(struct cpu *cpu, uint_t oldpil)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	kthread_t *t, *it;
	uint_t pil, basespl;
	hrtime_t intrtime;
	hrtime_t now = tsc_read();

	it = cpu->cpu_thread;
	pil = it->t_pil;

	cpu->cpu_stats.sys.intr[pil - 1]++;

	ASSERT(cpu->cpu_intr_actv & (1 << pil));
	cpu->cpu_intr_actv &= ~(1 << pil);
	/* Charge the time since the handler's last timestamp to its pil. */
	intrtime = now - it->t_intr_start;
	mcpu->intrstat[pil][0] += intrtime;
	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;

	/*
	 * If there is still an interrupted thread underneath this one
	 * then the interrupt was never blocked and the return is
	 * fairly simple.  Otherwise it isn't.
	 */
	if ((t = it->t_intr) == NULL) {
		/*
		 * Put thread back on the interrupt thread list.
		 * This was an interrupt thread, so set CPU's base SPL.
		 */
		set_base_spl();
		it->t_state = TS_FREE;
		it->t_link = cpu->cpu_intr_thread;
		cpu->cpu_intr_thread = it;
		(void) splhigh();
		sti();
		swtch();
		/*NOTREACHED*/
		panic("dosoftint_epilog: swtch returned");
	}
	/* Return the interrupt thread to the pool and resume the pinned one. */
	it->t_link = cpu->cpu_intr_thread;
	cpu->cpu_intr_thread = it;
	it->t_state = TS_FREE;
	cpu->cpu_thread = t;
	/* Restart the clock of an interrupt thread we had interrupted. */
	if (t->t_flag & T_INTR_THREAD)
		t->t_intr_start = now;
	/* Restore the old priority, but never go below the base SPL. */
	basespl = cpu->cpu_base_spl;
	pil = MAX(oldpil, basespl);
	mcpu->mcpu_pri = pil;
	(*setspl)(pil);
}
680 
681 
682 /*
683  * Make the interrupted thread 'to' be runnable.
684  *
685  * Since t->t_sp has already been saved, t->t_pc is all
686  * that needs to be set in this function.
687  *
688  * Returns the interrupt level of the interrupt thread.
689  */
690 int
691 intr_passivate(
692 	kthread_t *it,		/* interrupt thread */
693 	kthread_t *t)		/* interrupted thread */
694 {
695 	extern void _sys_rtt();
696 
697 	ASSERT(it->t_flag & T_INTR_THREAD);
698 	ASSERT(SA(t->t_sp) == t->t_sp);
699 
700 	t->t_pc = (uintptr_t)_sys_rtt;
701 	return (it->t_pil);
702 }
703 
704 /*
705  * Create interrupt kstats for this CPU.
706  */
707 void
708 cpu_create_intrstat(cpu_t *cp)
709 {
710 	int		i;
711 	kstat_t		*intr_ksp;
712 	kstat_named_t	*knp;
713 	char		name[KSTAT_STRLEN];
714 	zoneid_t	zoneid;
715 
716 	ASSERT(MUTEX_HELD(&cpu_lock));
717 
718 	if (pool_pset_enabled())
719 		zoneid = GLOBAL_ZONEID;
720 	else
721 		zoneid = ALL_ZONES;
722 
723 	intr_ksp = kstat_create_zone("cpu", cp->cpu_id, "intrstat", "misc",
724 	    KSTAT_TYPE_NAMED, PIL_MAX * 2, NULL, zoneid);
725 
726 	/*
727 	 * Initialize each PIL's named kstat
728 	 */
729 	if (intr_ksp != NULL) {
730 		intr_ksp->ks_update = cpu_kstat_intrstat_update;
731 		knp = (kstat_named_t *)intr_ksp->ks_data;
732 		intr_ksp->ks_private = cp;
733 		for (i = 0; i < PIL_MAX; i++) {
734 			(void) snprintf(name, KSTAT_STRLEN, "level-%d-time",
735 			    i + 1);
736 			kstat_named_init(&knp[i * 2], name, KSTAT_DATA_UINT64);
737 			(void) snprintf(name, KSTAT_STRLEN, "level-%d-count",
738 			    i + 1);
739 			kstat_named_init(&knp[(i * 2) + 1], name,
740 			    KSTAT_DATA_UINT64);
741 		}
742 		kstat_install(intr_ksp);
743 	}
744 }
745 
/*
 * Delete interrupt kstats for this CPU.
 *
 * Removes the "intrstat" kstat of module "cpu" whose instance number is
 * cp->cpu_id, i.e. the one set up by cpu_create_intrstat().  The lookup
 * is always done with ALL_ZONES.
 */
void
cpu_delete_intrstat(cpu_t *cp)
{
	kstat_delete_byname_zone("cpu", cp->cpu_id, "intrstat", ALL_ZONES);
}
754 
755 /*
756  * Convert interrupt statistics from CPU ticks to nanoseconds and
757  * update kstat.
758  */
759 int
760 cpu_kstat_intrstat_update(kstat_t *ksp, int rw)
761 {
762 	kstat_named_t	*knp = ksp->ks_data;
763 	cpu_t		*cpup = (cpu_t *)ksp->ks_private;
764 	int		i;
765 	hrtime_t	hrt;
766 
767 	if (rw == KSTAT_WRITE)
768 		return (EACCES);
769 
770 	for (i = 0; i < PIL_MAX; i++) {
771 		hrt = (hrtime_t)cpup->cpu_m.intrstat[i + 1][0];
772 		scalehrtimef(&hrt);
773 		knp[i * 2].value.ui64 = (uint64_t)hrt;
774 		knp[(i * 2) + 1].value.ui64 = cpup->cpu_stats.sys.intr[i];
775 	}
776 
777 	return (0);
778 }
779 
780 /*
781  * An interrupt thread is ending a time slice, so compute the interval it
782  * ran for and update the statistic for its PIL.
783  */
784 void
785 cpu_intr_swtch_enter(kthread_id_t t)
786 {
787 	uint64_t	interval;
788 	uint64_t	start;
789 	cpu_t		*cpu;
790 
791 	ASSERT((t->t_flag & T_INTR_THREAD) != 0);
792 	ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL);
793 
794 	/*
795 	 * We could be here with a zero timestamp. This could happen if:
796 	 * an interrupt thread which no longer has a pinned thread underneath
797 	 * it (i.e. it blocked at some point in its past) has finished running
798 	 * its handler. intr_thread() updated the interrupt statistic for its
799 	 * PIL and zeroed its timestamp. Since there was no pinned thread to
800 	 * return to, swtch() gets called and we end up here.
801 	 *
802 	 * Note that we use atomic ops below (cas64 and atomic_add_64), which
803 	 * we don't use in the functions above, because we're not called
804 	 * with interrupts blocked, but the epilog/prolog functions are.
805 	 */
806 	if (t->t_intr_start) {
807 		do {
808 			start = t->t_intr_start;
809 			interval = tsc_read() - start;
810 		} while (cas64(&t->t_intr_start, start, 0) != start);
811 		cpu = CPU;
812 		cpu->cpu_m.intrstat[t->t_pil][0] += interval;
813 
814 		atomic_add_64((uint64_t *)&cpu->cpu_intracct[cpu->cpu_mstate],
815 		    interval);
816 	} else
817 		ASSERT(t->t_intr == NULL);
818 }
819 
/*
 * An interrupt thread is returning from swtch(). Place a starting timestamp
 * in its thread structure.
 *
 * 't' must be an interrupt thread (T_INTR_THREAD set) with a t_pil in
 * the range 1..LOCK_LEVEL; both conditions are asserted.
 */
void
cpu_intr_swtch_exit(kthread_id_t t)
{
	uint64_t ts;

	ASSERT((t->t_flag & T_INTR_THREAD) != 0);
	ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL);

	/*
	 * Replace whatever timestamp is there with a fresh tsc_read(),
	 * retrying if t_intr_start changed after we sampled it.
	 */
	do {
		ts = t->t_intr_start;
	} while (cas64(&t->t_intr_start, ts, tsc_read()) != ts);
}
836 
/*
 * Dispatch a hilevel interrupt (one above LOCK_LEVEL)
 *
 * Interrupts are enabled while the handlers for 'vector' run and are
 * disabled again before returning.  'arg2' is unused; do_interrupt()
 * invokes this function both directly and through switch_sp_and_call(),
 * passing 0 for it in both cases.
 */
/*ARGSUSED*/
static void
dispatch_hilevel(uint_t vector, uint_t arg2)
{
	sti();
	av_dispatch_autovect(vector);
	cli();
}
848 
/*
 * Dispatch a soft interrupt
 *
 * Runs the soft interrupt handlers for the level of the current thread
 * (cpu_thread->t_pil) with interrupts enabled, then disables interrupts
 * and runs the epilog with 'oldpil'.  'arg2' is unused.
 */
/*ARGSUSED*/
static void
dispatch_softint(uint_t oldpil, uint_t arg2)
{
	struct cpu *cpu = CPU;

	sti();
	av_dispatch_softvect((int)cpu->cpu_thread->t_pil);
	cli();

	/*
	 * Must run dosoftint_epilog() on the interrupt thread stack, since
	 * there may not be a return from it if the interrupt thread blocked.
	 */
	dosoftint_epilog(cpu, oldpil);
}
868 
/*
 * Dispatch a normal interrupt
 *
 * Runs the handlers for 'vector' with interrupts enabled, then disables
 * interrupts and runs the interrupt thread epilog, which restores a
 * priority derived from 'oldipl'.
 */
static void
dispatch_hardint(uint_t vector, uint_t oldipl)
{
	struct cpu *cpu = CPU;

	sti();
	av_dispatch_autovect(vector);
	cli();

	/*
	 * Must run intr_thread_epilog() on the interrupt thread stack, since
	 * there may not be a return from it if the interrupt thread blocked.
	 */
	intr_thread_epilog(cpu, vector, oldipl);
}
887 
888 /*
889  * Deliver any softints the current interrupt priority allows.
890  * Called with interrupts disabled.
891  */
892 void
893 dosoftint(struct regs *regs)
894 {
895 	struct cpu *cpu = CPU;
896 	int oldipl;
897 	caddr_t newsp;
898 
899 	while (cpu->cpu_softinfo.st_pending) {
900 		oldipl = cpu->cpu_pri;
901 		newsp = dosoftint_prolog(cpu, (caddr_t)regs,
902 		    cpu->cpu_softinfo.st_pending, oldipl);
903 		/*
904 		 * If returned stack pointer is NULL, priority is too high
905 		 * to run any of the pending softints now.
906 		 * Break out and they will be run later.
907 		 */
908 		if (newsp == NULL)
909 			break;
910 		switch_sp_and_call(newsp, dispatch_softint, oldipl, 0);
911 	}
912 }
913 
/*
 * Interrupt service routine, called with interrupts disabled.
 *
 * Arguments:
 *	rp	- saved registers of the interrupted context; r_trapno
 *		  identifies the interrupt and may be updated by (*setlvl)()
 *	ttp	- trap trace record, filled in only when TRAPTRACE is defined
 *
 * Soft interrupts (T_SOFTINT) are handed straight to dosoftint().  For
 * anything else the priority is raised through (*setlvl)(); levels above
 * LOCK_LEVEL are run on the cpu's interrupt stack, all others on an
 * interrupt thread.  Pending soft interrupts are delivered before
 * returning.
 */
/*ARGSUSED*/
void
do_interrupt(struct regs *rp, trap_trace_rec_t *ttp)
{
	struct cpu *cpu = CPU;
	int newipl, oldipl = cpu->cpu_pri;
	uint_t vector;
	caddr_t newsp;

#ifdef TRAPTRACE
	/* 0xff marks the ipl and vector as not yet known */
	ttp->ttr_marker = TT_INTERRUPT;
	ttp->ttr_ipl = 0xff;
	ttp->ttr_pri = oldipl;
	ttp->ttr_spl = cpu->cpu_base_spl;
	ttp->ttr_vector = 0xff;
#endif	/* TRAPTRACE */

#if !defined(__xpv)
	/*
	 * Handle any pending TLB flushing
	 */
	tlb_service();
#endif

	/*
	 * If it's a softint go do it now.
	 */
	if (rp->r_trapno == T_SOFTINT) {
		dosoftint(rp);
		ASSERT(!interrupts_enabled());
		return;
	}

	/*
	 * Raise the interrupt priority.
	 */
	newipl = (*setlvl)(oldipl, (int *)&rp->r_trapno);
#ifdef TRAPTRACE
	ttp->ttr_ipl = newipl;
#endif	/* TRAPTRACE */

	/*
	 * Bail if it is a spurious interrupt
	 */
	if (newipl == -1)
		return;
	cpu->cpu_pri = newipl;
	/* read the vector only after (*setlvl)() has had a chance to set it */
	vector = rp->r_trapno;
#ifdef TRAPTRACE
	ttp->ttr_vector = vector;
#endif	/* TRAPTRACE */
	if (newipl > LOCK_LEVEL) {
		/*
		 * High priority interrupts run on this cpu's interrupt stack.
		 */
		if (hilevel_intr_prolog(cpu, newipl, oldipl, rp) == 0) {
			newsp = cpu->cpu_intr_stack;
			switch_sp_and_call(newsp, dispatch_hilevel, vector, 0);
		} else { /* already on the interrupt stack */
			dispatch_hilevel(vector, 0);
		}
		(void) hilevel_intr_epilog(cpu, newipl, oldipl, vector);
	} else {
		/*
		 * Run this interrupt in a separate thread.
		 */
		newsp = intr_thread_prolog(cpu, (caddr_t)rp, newipl);
		switch_sp_and_call(newsp, dispatch_hardint, vector, oldipl);
	}

	/*
	 * Deliver any pending soft interrupts.
	 */
	if (cpu->cpu_softinfo.st_pending)
		dosoftint(rp);
}
993 
/*
 * Common tasks always done by _sys_rtt, called with interrupts disabled.
 * Returns 1 if returning to userland, 0 if returning to system mode.
 *
 * 'rp' holds the saved registers of the context being returned to.  Its
 * r_trapno is overwritten with T_AST when an AST is handed to trap(),
 * and its r_pc may be moved back to the start of one of the mutex
 * critical regions checked below.
 */
int
sys_rtt_common(struct regs *rp)
{
	kthread_t *tp;
	extern void mutex_exit_critical_start();
	extern long mutex_exit_critical_size;
	extern void mutex_owner_running_critical_start();
	extern long mutex_owner_running_critical_size;

loop:

	/*
	 * Check if returning to user
	 */
	tp = CPU->cpu_thread;
	if (USERMODE(rp->r_cs)) {
		/*
		 * Check if AST pending.
		 */
		if (tp->t_astflag) {
			/*
			 * Let trap() handle the AST, with interrupts
			 * enabled, then start over from the top.
			 */
			sti();
			rp->r_trapno = T_AST;
			trap(rp, (caddr_t)0, CPU->cpu_id);
			cli();
			goto loop;
		}

#if defined(__amd64)
		/*
		 * We are done if segment registers do not need updating.
		 */
		if (tp->t_lwp->lwp_pcb.pcb_rupdate == 0)
			return (1);

		if (update_sregs(rp, tp->t_lwp)) {
			/*
			 * 1 or more of the selectors is bad.
			 * Deliver a SIGSEGV.
			 */
			proc_t *p = ttoproc(tp);

			sti();
			mutex_enter(&p->p_lock);
			tp->t_lwp->lwp_cursig = SIGSEGV;
			mutex_exit(&p->p_lock);
			psig();
			tp->t_sig_check = 1;
			cli();
		}
		tp->t_lwp->lwp_pcb.pcb_rupdate = 0;

#endif	/* __amd64 */
		return (1);
	}

	/*
	 * Here if we are returning to supervisor mode.
	 * Check for a kernel preemption request.  It is only honored if
	 * the interrupted context had interrupts enabled (PS_IE set in
	 * the saved flags).
	 */
	if (CPU->cpu_kprunrun && (rp->r_ps & PS_IE)) {

		/*
		 * Do nothing if already in kpreempt
		 */
		if (!tp->t_preempt_lk) {
			tp->t_preempt_lk = 1;
			sti();
			kpreempt(1); /* asynchronous kpreempt call */
			cli();
			tp->t_preempt_lk = 0;
		}
	}

	/*
	 * If we interrupted the mutex_exit() critical region we must
	 * reset the PC back to the beginning to prevent missed wakeups
	 * See the comments in mutex_exit() for details.
	 *
	 * The unsigned subtraction makes a PC below the start of the
	 * region wrap to a large value, so the single comparison tests
	 * both ends of the region.
	 */
	if ((uintptr_t)rp->r_pc - (uintptr_t)mutex_exit_critical_start <
	    mutex_exit_critical_size) {
		rp->r_pc = (greg_t)mutex_exit_critical_start;
	}

	/*
	 * If we interrupted the mutex_owner_running() critical region we
	 * must reset the PC back to the beginning to prevent dereferencing
	 * of a freed thread pointer. See the comments in mutex_owner_running
	 * for details.
	 */
	if ((uintptr_t)rp->r_pc -
	    (uintptr_t)mutex_owner_running_critical_start <
	    mutex_owner_running_critical_size) {
		rp->r_pc = (greg_t)mutex_owner_running_critical_start;
	}

	return (0);
}
1098 
/*
 * Forward the request to the routine installed in the send_dirintf
 * function pointer, passing 'cpuid' and 'int_level' through unchanged.
 */
void
send_dirint(int cpuid, int int_level)
{
	(*send_dirintf)(cpuid, int_level);
}
1104 
/*
 * do_splx routine, takes new ipl to set
 * returns the old ipl.
 * We are careful not to set priority lower than CPU->cpu_base_spl,
 * even though it seems we're raising the priority, it could be set
 * higher at any time by an interrupt routine, so we must block interrupts
 * and look at CPU->cpu_base_spl
 */
int
do_splx(int newpri)
{
	ulong_t	flag;
	cpu_t	*cpu;
	int	curpri, basepri;

	flag = intr_clear();
	cpu = CPU; /* ints are disabled, now safe to cache cpu ptr */
	curpri = cpu->cpu_m.mcpu_pri;
	basepri = cpu->cpu_base_spl;
	/* never go below the cpu's base SPL */
	if (newpri < basepri)
		newpri = basepri;
	cpu->cpu_m.mcpu_pri = newpri;
	(*setspl)(newpri);
	/*
	 * If we are going to reenable interrupts see if new priority level
	 * allows pending softint delivery.
	 */
	if ((flag & PS_IE) &&
	    bsrw_insn((uint16_t)cpu->cpu_softinfo.st_pending) > newpri)
		fakesoftint();
	ASSERT(!interrupts_enabled());
	/* put the interrupt flag back the way intr_clear() found it */
	intr_restore(flag);
	return (curpri);
}
1139 
/*
 * Common spl raise routine, takes new ipl to set
 * returns the old ipl, will not lower ipl.
 *
 * Nothing is changed unless 'newpri' is above the cpu's current
 * priority.  When it is, the priority actually set is never below the
 * cpu's base SPL.
 */
int
splr(int newpri)
{
	ulong_t	flag;
	cpu_t	*cpu;
	int	curpri, basepri;

	flag = intr_clear();
	cpu = CPU; /* ints are disabled, now safe to cache cpu ptr */
	curpri = cpu->cpu_m.mcpu_pri;
	/*
	 * Only do something if new priority is larger
	 */
	if (newpri > curpri) {
		basepri = cpu->cpu_base_spl;
		if (newpri < basepri)
			newpri = basepri;
		cpu->cpu_m.mcpu_pri = newpri;
		(*setspl)(newpri);
		/*
		 * See if new priority level allows pending softint delivery
		 * (only when the caller had interrupts enabled).
		 */
		if ((flag & PS_IE) &&
		    bsrw_insn((uint16_t)cpu->cpu_softinfo.st_pending) > newpri)
			fakesoftint();
	}
	/* put the interrupt flag back the way intr_clear() found it */
	intr_restore(flag);
	return (curpri);
}
1173 
/*
 * Return the current cpu's interrupt priority (cpu_m.mcpu_pri).
 */
int
getpil(void)
{
	return (CPU->cpu_m.mcpu_pri);
}
1179 
1180 int
1181 interrupts_enabled(void)
1182 {
1183 	ulong_t	flag;
1184 
1185 	flag = getflags();
1186 	return ((flag & PS_IE) == PS_IE);
1187 }
1188 
#ifdef DEBUG
/*
 * DEBUG-only check that interrupts are enabled.  The assertion is
 * satisfied unconditionally while interrupts_unleashed is still zero.
 */
void
assert_ints_enabled(void)
{
	ASSERT(!interrupts_unleashed || interrupts_enabled());
}
#endif	/* DEBUG */
1196