xref: /titanic_52/usr/src/uts/i86pc/os/intr.c (revision b72c368a02e0464faeef362bc5a1cf0fc69981da)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/cpuvar.h>
28 #include <sys/cpu_event.h>
29 #include <sys/regset.h>
30 #include <sys/psw.h>
31 #include <sys/types.h>
32 #include <sys/thread.h>
33 #include <sys/systm.h>
34 #include <sys/segments.h>
35 #include <sys/pcb.h>
36 #include <sys/trap.h>
37 #include <sys/ftrace.h>
38 #include <sys/traptrace.h>
39 #include <sys/clock.h>
40 #include <sys/panic.h>
41 #include <sys/disp.h>
42 #include <vm/seg_kp.h>
43 #include <sys/stack.h>
44 #include <sys/sysmacros.h>
45 #include <sys/cmn_err.h>
46 #include <sys/kstat.h>
47 #include <sys/smp_impldefs.h>
48 #include <sys/pool_pset.h>
49 #include <sys/zone.h>
50 #include <sys/bitmap.h>
51 #include <sys/archsystm.h>
52 #include <sys/machsystm.h>
53 #include <sys/ontrap.h>
54 #include <sys/x86_archext.h>
55 #include <sys/promif.h>
56 #include <vm/hat_i86.h>
57 #if defined(__xpv)
58 #include <sys/hypervisor.h>
59 #endif
60 
61 
62 #if defined(__xpv) && defined(DEBUG)
63 
64 /*
65  * This panic message is intended as an aid to interrupt debugging.
66  *
67  * The associated assertion tests the condition of enabling
68  * events when events are already enabled.  The implication
69  * being that whatever code the programmer thought was
70  * protected by having events disabled until the second
71  * enable happened really wasn't protected at all ..
72  */
73 
int stistipanic = 1;	/* controls the debug panic check */
const char *stistimsg = "stisti";	/* text of the debug panic message */
/* Per-cpu slot; presumably the last place events were enabled -- confirm. */
ulong_t laststi[NCPU];

/*
 * This variable tracks the last place events were disabled on each cpu.
 * It assists in debugging when an assertion that interrupts are enabled
 * trips.
 */
ulong_t lastcli[NCPU];
83 
84 #endif
85 
86 /*
87  * Set cpu's base SPL level to the highest active interrupt level
88  */
89 void
90 set_base_spl(void)
91 {
92 	struct cpu *cpu = CPU;
93 	uint16_t active = (uint16_t)cpu->cpu_intr_actv;
94 
95 	cpu->cpu_base_spl = active == 0 ? 0 : bsrw_insn(active);
96 }
97 
98 /*
99  * Do all the work necessary to set up the cpu and thread structures
100  * to dispatch a high-level interrupt.
101  *
102  * Returns 0 if we're -not- already on the high-level interrupt stack,
103  * (and *must* switch to it), non-zero if we are already on that stack.
104  *
105  * Called with interrupts masked.
106  * The 'pil' is already set to the appropriate level for rp->r_trapno.
107  */
108 static int
109 hilevel_intr_prolog(struct cpu *cpu, uint_t pil, uint_t oldpil, struct regs *rp)
110 {
111 	struct machcpu *mcpu = &cpu->cpu_m;
112 	uint_t mask;
113 	hrtime_t intrtime;
114 	hrtime_t now = tsc_read();
115 
116 	ASSERT(pil > LOCK_LEVEL);
117 
118 	if (pil == CBE_HIGH_PIL) {
119 		cpu->cpu_profile_pil = oldpil;
120 		if (USERMODE(rp->r_cs)) {
121 			cpu->cpu_profile_pc = 0;
122 			cpu->cpu_profile_upc = rp->r_pc;
123 			cpu->cpu_cpcprofile_pc = 0;
124 			cpu->cpu_cpcprofile_upc = rp->r_pc;
125 		} else {
126 			cpu->cpu_profile_pc = rp->r_pc;
127 			cpu->cpu_profile_upc = 0;
128 			cpu->cpu_cpcprofile_pc = rp->r_pc;
129 			cpu->cpu_cpcprofile_upc = 0;
130 		}
131 	}
132 
133 	mask = cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK;
134 	if (mask != 0) {
135 		int nestpil;
136 
137 		/*
138 		 * We have interrupted another high-level interrupt.
139 		 * Load starting timestamp, compute interval, update
140 		 * cumulative counter.
141 		 */
142 		nestpil = bsrw_insn((uint16_t)mask);
143 		ASSERT(nestpil < pil);
144 		intrtime = now -
145 		    mcpu->pil_high_start[nestpil - (LOCK_LEVEL + 1)];
146 		mcpu->intrstat[nestpil][0] += intrtime;
147 		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
148 		/*
149 		 * Another high-level interrupt is active below this one, so
150 		 * there is no need to check for an interrupt thread.  That
151 		 * will be done by the lowest priority high-level interrupt
152 		 * active.
153 		 */
154 	} else {
155 		kthread_t *t = cpu->cpu_thread;
156 
157 		/*
158 		 * See if we are interrupting a low-level interrupt thread.
159 		 * If so, account for its time slice only if its time stamp
160 		 * is non-zero.
161 		 */
162 		if ((t->t_flag & T_INTR_THREAD) != 0 && t->t_intr_start != 0) {
163 			intrtime = now - t->t_intr_start;
164 			mcpu->intrstat[t->t_pil][0] += intrtime;
165 			cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
166 			t->t_intr_start = 0;
167 		}
168 	}
169 
170 	/*
171 	 * Store starting timestamp in CPU structure for this PIL.
172 	 */
173 	mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] = now;
174 
175 	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);
176 
177 	if (pil == 15) {
178 		/*
179 		 * To support reentrant level 15 interrupts, we maintain a
180 		 * recursion count in the top half of cpu_intr_actv.  Only
181 		 * when this count hits zero do we clear the PIL 15 bit from
182 		 * the lower half of cpu_intr_actv.
183 		 */
184 		uint16_t *refcntp = (uint16_t *)&cpu->cpu_intr_actv + 1;
185 		(*refcntp)++;
186 	}
187 
188 	mask = cpu->cpu_intr_actv;
189 
190 	cpu->cpu_intr_actv |= (1 << pil);
191 
192 	return (mask & CPU_INTR_ACTV_HIGH_LEVEL_MASK);
193 }
194 
195 /*
196  * Does most of the work of returning from a high level interrupt.
197  *
198  * Returns 0 if there are no more high level interrupts (in which
199  * case we must switch back to the interrupted thread stack) or
200  * non-zero if there are more (in which case we should stay on it).
201  *
202  * Called with interrupts masked
203  */
204 static int
205 hilevel_intr_epilog(struct cpu *cpu, uint_t pil, uint_t oldpil, uint_t vecnum)
206 {
207 	struct machcpu *mcpu = &cpu->cpu_m;
208 	uint_t mask;
209 	hrtime_t intrtime;
210 	hrtime_t now = tsc_read();
211 
212 	ASSERT(mcpu->mcpu_pri == pil);
213 
214 	cpu->cpu_stats.sys.intr[pil - 1]++;
215 
216 	ASSERT(cpu->cpu_intr_actv & (1 << pil));
217 
218 	if (pil == 15) {
219 		/*
220 		 * To support reentrant level 15 interrupts, we maintain a
221 		 * recursion count in the top half of cpu_intr_actv.  Only
222 		 * when this count hits zero do we clear the PIL 15 bit from
223 		 * the lower half of cpu_intr_actv.
224 		 */
225 		uint16_t *refcntp = (uint16_t *)&cpu->cpu_intr_actv + 1;
226 
227 		ASSERT(*refcntp > 0);
228 
229 		if (--(*refcntp) == 0)
230 			cpu->cpu_intr_actv &= ~(1 << pil);
231 	} else {
232 		cpu->cpu_intr_actv &= ~(1 << pil);
233 	}
234 
235 	ASSERT(mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)] != 0);
236 
237 	intrtime = now - mcpu->pil_high_start[pil - (LOCK_LEVEL + 1)];
238 	mcpu->intrstat[pil][0] += intrtime;
239 	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
240 
241 	/*
242 	 * Check for lower-pil nested high-level interrupt beneath
243 	 * current one.  If so, place a starting timestamp in its
244 	 * pil_high_start entry.
245 	 */
246 	mask = cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK;
247 	if (mask != 0) {
248 		int nestpil;
249 
250 		/*
251 		 * find PIL of nested interrupt
252 		 */
253 		nestpil = bsrw_insn((uint16_t)mask);
254 		ASSERT(nestpil < pil);
255 		mcpu->pil_high_start[nestpil - (LOCK_LEVEL + 1)] = now;
256 		/*
257 		 * (Another high-level interrupt is active below this one,
258 		 * so there is no need to check for an interrupt
259 		 * thread.  That will be done by the lowest priority
260 		 * high-level interrupt active.)
261 		 */
262 	} else {
263 		/*
264 		 * Check to see if there is a low-level interrupt active.
265 		 * If so, place a starting timestamp in the thread
266 		 * structure.
267 		 */
268 		kthread_t *t = cpu->cpu_thread;
269 
270 		if (t->t_flag & T_INTR_THREAD)
271 			t->t_intr_start = now;
272 	}
273 
274 	mcpu->mcpu_pri = oldpil;
275 	(void) (*setlvlx)(oldpil, vecnum);
276 
277 	return (cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK);
278 }
279 
/*
 * Set up the cpu, thread and interrupt thread structures for
 * executing an interrupt thread.  The new stack pointer of the
 * interrupt thread (which *must* be switched to) is returned.
 *
 *	cpu		the cpu taking the interrupt
 *	stackptr	current stack pointer; recorded as the interrupted
 *			thread's t_sp and asserted to be SA()-aligned
 *	pil		level of the interrupt; asserted to be non-zero and
 *			not already marked active in cpu_intr_actv
 *
 * The value returned is the t_stk of the interrupt thread taken from the
 * head of the cpu's cpu_intr_thread list.
 */
static caddr_t
intr_thread_prolog(struct cpu *cpu, caddr_t stackptr, uint_t pil)
{
	struct machcpu *mcpu = &cpu->cpu_m;
	kthread_t *t, *volatile it;
	hrtime_t now = tsc_read();

	ASSERT(pil > 0);
	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);
	/* Mark this level active on the cpu. */
	cpu->cpu_intr_actv |= (1 << pil);

	/*
	 * Get set to run an interrupt thread.
	 * There should always be an interrupt thread, since we
	 * allocate one for each level on each CPU.
	 *
	 * t_intr_start could be zero due to cpu_intr_swtch_enter.
	 */
	t = cpu->cpu_thread;
	if ((t->t_flag & T_INTR_THREAD) && t->t_intr_start != 0) {
		/*
		 * We are interrupting a lower-level interrupt thread whose
		 * clock is running: charge it for the time used so far and
		 * stop its clock.
		 */
		hrtime_t intrtime = now - t->t_intr_start;
		mcpu->intrstat[t->t_pil][0] += intrtime;
		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
		t->t_intr_start = 0;
	}

	ASSERT(SA((uintptr_t)stackptr) == (uintptr_t)stackptr);

	t->t_sp = (uintptr_t)stackptr;	/* mark stack in curthread for resume */

	/*
	 * unlink the interrupt thread off the cpu
	 *
	 * Note that the code in kcpc_overflow_intr -relies- on the
	 * ordering of events here - in particular that t->t_lwp of
	 * the interrupt thread is set to the pinned thread *before*
	 * curthread is changed.
	 */
	it = cpu->cpu_intr_thread;
	cpu->cpu_intr_thread = it->t_link;
	it->t_intr = t;		/* the interrupted thread is pinned beneath */
	it->t_lwp = t->t_lwp;

	/*
	 * (threads on the interrupt thread free list could have state
	 * preset to TS_ONPROC, but it helps in debugging if
	 * they're TS_FREE.)
	 */
	it->t_state = TS_ONPROC;

	cpu->cpu_thread = it;		/* new curthread on this cpu */
	it->t_pil = (uchar_t)pil;
	it->t_pri = intr_pri + (pri_t)pil;
	it->t_intr_start = now;		/* start the new thread's clock */

	return (it->t_stk);
}
342 
343 
#ifdef DEBUG
/*
 * Incremented by intr_thread_epilog() each time an interrupt thread
 * finishes with no interrupted thread pinned beneath it.
 */
int intr_thread_cnt;
#endif
347 
348 /*
349  * Called with interrupts disabled
350  */
351 static void
352 intr_thread_epilog(struct cpu *cpu, uint_t vec, uint_t oldpil)
353 {
354 	struct machcpu *mcpu = &cpu->cpu_m;
355 	kthread_t *t;
356 	kthread_t *it = cpu->cpu_thread;	/* curthread */
357 	uint_t pil, basespl;
358 	hrtime_t intrtime;
359 	hrtime_t now = tsc_read();
360 
361 	pil = it->t_pil;
362 	cpu->cpu_stats.sys.intr[pil - 1]++;
363 
364 	ASSERT(it->t_intr_start != 0);
365 	intrtime = now - it->t_intr_start;
366 	mcpu->intrstat[pil][0] += intrtime;
367 	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
368 
369 	ASSERT(cpu->cpu_intr_actv & (1 << pil));
370 	cpu->cpu_intr_actv &= ~(1 << pil);
371 
372 	/*
373 	 * If there is still an interrupted thread underneath this one
374 	 * then the interrupt was never blocked and the return is
375 	 * fairly simple.  Otherwise it isn't.
376 	 */
377 	if ((t = it->t_intr) == NULL) {
378 		/*
379 		 * The interrupted thread is no longer pinned underneath
380 		 * the interrupt thread.  This means the interrupt must
381 		 * have blocked, and the interrupted thread has been
382 		 * unpinned, and has probably been running around the
383 		 * system for a while.
384 		 *
385 		 * Since there is no longer a thread under this one, put
386 		 * this interrupt thread back on the CPU's free list and
387 		 * resume the idle thread which will dispatch the next
388 		 * thread to run.
389 		 */
390 #ifdef DEBUG
391 		intr_thread_cnt++;
392 #endif
393 		cpu->cpu_stats.sys.intrblk++;
394 		/*
395 		 * Set CPU's base SPL based on active interrupts bitmask
396 		 */
397 		set_base_spl();
398 		basespl = cpu->cpu_base_spl;
399 		mcpu->mcpu_pri = basespl;
400 		(*setlvlx)(basespl, vec);
401 		(void) splhigh();
402 		sti();
403 		it->t_state = TS_FREE;
404 		/*
405 		 * Return interrupt thread to pool
406 		 */
407 		it->t_link = cpu->cpu_intr_thread;
408 		cpu->cpu_intr_thread = it;
409 		swtch();
410 		panic("intr_thread_epilog: swtch returned");
411 		/*NOTREACHED*/
412 	}
413 
414 	/*
415 	 * Return interrupt thread to the pool
416 	 */
417 	it->t_link = cpu->cpu_intr_thread;
418 	cpu->cpu_intr_thread = it;
419 	it->t_state = TS_FREE;
420 
421 	basespl = cpu->cpu_base_spl;
422 	pil = MAX(oldpil, basespl);
423 	mcpu->mcpu_pri = pil;
424 	(*setlvlx)(pil, vec);
425 	t->t_intr_start = now;
426 	cpu->cpu_thread = t;
427 }
428 
429 /*
430  * intr_get_time() is a resource for interrupt handlers to determine how
431  * much time has been spent handling the current interrupt. Such a function
432  * is needed because higher level interrupts can arrive during the
433  * processing of an interrupt.  intr_get_time() only returns time spent in the
434  * current interrupt handler.
435  *
436  * The caller must be calling from an interrupt handler running at a pil
437  * below or at lock level. Timings are not provided for high-level
438  * interrupts.
439  *
440  * The first time intr_get_time() is called while handling an interrupt,
441  * it returns the time since the interrupt handler was invoked. Subsequent
442  * calls will return the time since the prior call to intr_get_time(). Time
443  * is returned as ticks. Use scalehrtimef() to convert ticks to nsec.
444  *
445  * Theory Of Intrstat[][]:
446  *
447  * uint64_t intrstat[pil][0..1] is an array indexed by pil level, with two
448  * uint64_ts per pil.
449  *
450  * intrstat[pil][0] is a cumulative count of the number of ticks spent
451  * handling all interrupts at the specified pil on this CPU. It is
452  * exported via kstats to the user.
453  *
454  * intrstat[pil][1] is always a count of ticks less than or equal to the
455  * value in [0]. The difference between [1] and [0] is the value returned
456  * by a call to intr_get_time(). At the start of interrupt processing,
457  * [0] and [1] will be equal (or nearly so). As the interrupt consumes
458  * time, [0] will increase, but [1] will remain the same. A call to
459  * intr_get_time() will return the difference, then update [1] to be the
460  * same as [0]. Future calls will return the time since the last call.
461  * Finally, when the interrupt completes, [1] is updated to the same as [0].
462  *
463  * Implementation:
464  *
465  * intr_get_time() works much like a higher level interrupt arriving. It
466  * "checkpoints" the timing information by incrementing intrstat[pil][0]
467  * to include elapsed running time, and by setting t_intr_start to rdtsc.
468  * It then sets the return value to intrstat[pil][0] - intrstat[pil][1],
469  * and updates intrstat[pil][1] to be the same as the new value of
470  * intrstat[pil][0].
471  *
472  * In the normal handling of interrupts, after an interrupt handler returns
473  * and the code in intr_thread() updates intrstat[pil][0], it then sets
474  * intrstat[pil][1] to the new value of intrstat[pil][0]. When [0] == [1],
475  * the timings are reset, i.e. intr_get_time() will return [0] - [1] which
476  * is 0.
477  *
478  * Whenever interrupts arrive on a CPU which is handling a lower pil
479  * interrupt, they update the lower pil's [0] to show time spent in the
480  * handler that they've interrupted. This results in a growing discrepancy
481  * between [0] and [1], which is returned the next time intr_get_time() is
482  * called. Time spent in the higher-pil interrupt will not be returned in
483  * the next intr_get_time() call from the original interrupt, because
484  * the higher-pil interrupt's time is accumulated in intrstat[higherpil][].
485  */
486 uint64_t
487 intr_get_time(void)
488 {
489 	struct cpu *cpu;
490 	struct machcpu *mcpu;
491 	kthread_t *t;
492 	uint64_t time, delta, ret;
493 	uint_t pil;
494 
495 	cli();
496 	cpu = CPU;
497 	mcpu = &cpu->cpu_m;
498 	t = cpu->cpu_thread;
499 	pil = t->t_pil;
500 	ASSERT((cpu->cpu_intr_actv & CPU_INTR_ACTV_HIGH_LEVEL_MASK) == 0);
501 	ASSERT(t->t_flag & T_INTR_THREAD);
502 	ASSERT(pil != 0);
503 	ASSERT(t->t_intr_start != 0);
504 
505 	time = tsc_read();
506 	delta = time - t->t_intr_start;
507 	t->t_intr_start = time;
508 
509 	time = mcpu->intrstat[pil][0] + delta;
510 	ret = time - mcpu->intrstat[pil][1];
511 	mcpu->intrstat[pil][0] = time;
512 	mcpu->intrstat[pil][1] = time;
513 	cpu->cpu_intracct[cpu->cpu_mstate] += delta;
514 
515 	sti();
516 	return (ret);
517 }
518 
519 static caddr_t
520 dosoftint_prolog(
521 	struct cpu *cpu,
522 	caddr_t stackptr,
523 	uint32_t st_pending,
524 	uint_t oldpil)
525 {
526 	kthread_t *t, *volatile it;
527 	struct machcpu *mcpu = &cpu->cpu_m;
528 	uint_t pil;
529 	hrtime_t now;
530 
531 top:
532 	ASSERT(st_pending == mcpu->mcpu_softinfo.st_pending);
533 
534 	pil = bsrw_insn((uint16_t)st_pending);
535 	if (pil <= oldpil || pil <= cpu->cpu_base_spl)
536 		return (0);
537 
538 	/*
539 	 * XX64	Sigh.
540 	 *
541 	 * This is a transliteration of the i386 assembler code for
542 	 * soft interrupts.  One question is "why does this need
543 	 * to be atomic?"  One possible race is -other- processors
544 	 * posting soft interrupts to us in set_pending() i.e. the
545 	 * CPU might get preempted just after the address computation,
546 	 * but just before the atomic transaction, so another CPU would
547 	 * actually set the original CPU's st_pending bit.  However,
548 	 * it looks like it would be simpler to disable preemption there.
549 	 * Are there other races for which preemption control doesn't work?
550 	 *
551 	 * The i386 assembler version -also- checks to see if the bit
552 	 * being cleared was actually set; if it wasn't, it rechecks
553 	 * for more.  This seems a bit strange, as the only code that
554 	 * ever clears the bit is -this- code running with interrupts
555 	 * disabled on -this- CPU.  This code would probably be cheaper:
556 	 *
557 	 * atomic_and_32((uint32_t *)&mcpu->mcpu_softinfo.st_pending,
558 	 *   ~(1 << pil));
559 	 *
560 	 * and t->t_preempt--/++ around set_pending() even cheaper,
561 	 * but at this point, correctness is critical, so we slavishly
562 	 * emulate the i386 port.
563 	 */
564 	if (atomic_btr32((uint32_t *)
565 	    &mcpu->mcpu_softinfo.st_pending, pil) == 0) {
566 		st_pending = mcpu->mcpu_softinfo.st_pending;
567 		goto top;
568 	}
569 
570 	mcpu->mcpu_pri = pil;
571 	(*setspl)(pil);
572 
573 	now = tsc_read();
574 
575 	/*
576 	 * Get set to run interrupt thread.
577 	 * There should always be an interrupt thread since we
578 	 * allocate one for each level on the CPU.
579 	 */
580 	it = cpu->cpu_intr_thread;
581 	cpu->cpu_intr_thread = it->t_link;
582 
583 	/* t_intr_start could be zero due to cpu_intr_swtch_enter. */
584 	t = cpu->cpu_thread;
585 	if ((t->t_flag & T_INTR_THREAD) && t->t_intr_start != 0) {
586 		hrtime_t intrtime = now - t->t_intr_start;
587 		mcpu->intrstat[pil][0] += intrtime;
588 		cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
589 		t->t_intr_start = 0;
590 	}
591 
592 	/*
593 	 * Note that the code in kcpc_overflow_intr -relies- on the
594 	 * ordering of events here - in particular that t->t_lwp of
595 	 * the interrupt thread is set to the pinned thread *before*
596 	 * curthread is changed.
597 	 */
598 	it->t_lwp = t->t_lwp;
599 	it->t_state = TS_ONPROC;
600 
601 	/*
602 	 * Push interrupted thread onto list from new thread.
603 	 * Set the new thread as the current one.
604 	 * Set interrupted thread's T_SP because if it is the idle thread,
605 	 * resume() may use that stack between threads.
606 	 */
607 
608 	ASSERT(SA((uintptr_t)stackptr) == (uintptr_t)stackptr);
609 	t->t_sp = (uintptr_t)stackptr;
610 
611 	it->t_intr = t;
612 	cpu->cpu_thread = it;
613 
614 	/*
615 	 * Set bit for this pil in CPU's interrupt active bitmask.
616 	 */
617 	ASSERT((cpu->cpu_intr_actv & (1 << pil)) == 0);
618 	cpu->cpu_intr_actv |= (1 << pil);
619 
620 	/*
621 	 * Initialize thread priority level from intr_pri
622 	 */
623 	it->t_pil = (uchar_t)pil;
624 	it->t_pri = (pri_t)pil + intr_pri;
625 	it->t_intr_start = now;
626 
627 	return (it->t_stk);
628 }
629 
630 static void
631 dosoftint_epilog(struct cpu *cpu, uint_t oldpil)
632 {
633 	struct machcpu *mcpu = &cpu->cpu_m;
634 	kthread_t *t, *it;
635 	uint_t pil, basespl;
636 	hrtime_t intrtime;
637 	hrtime_t now = tsc_read();
638 
639 	it = cpu->cpu_thread;
640 	pil = it->t_pil;
641 
642 	cpu->cpu_stats.sys.intr[pil - 1]++;
643 
644 	ASSERT(cpu->cpu_intr_actv & (1 << pil));
645 	cpu->cpu_intr_actv &= ~(1 << pil);
646 	intrtime = now - it->t_intr_start;
647 	mcpu->intrstat[pil][0] += intrtime;
648 	cpu->cpu_intracct[cpu->cpu_mstate] += intrtime;
649 
650 	/*
651 	 * If there is still an interrupted thread underneath this one
652 	 * then the interrupt was never blocked and the return is
653 	 * fairly simple.  Otherwise it isn't.
654 	 */
655 	if ((t = it->t_intr) == NULL) {
656 		/*
657 		 * Put thread back on the interrupt thread list.
658 		 * This was an interrupt thread, so set CPU's base SPL.
659 		 */
660 		set_base_spl();
661 		it->t_state = TS_FREE;
662 		it->t_link = cpu->cpu_intr_thread;
663 		cpu->cpu_intr_thread = it;
664 		(void) splhigh();
665 		sti();
666 		swtch();
667 		/*NOTREACHED*/
668 		panic("dosoftint_epilog: swtch returned");
669 	}
670 	it->t_link = cpu->cpu_intr_thread;
671 	cpu->cpu_intr_thread = it;
672 	it->t_state = TS_FREE;
673 	cpu->cpu_thread = t;
674 	if (t->t_flag & T_INTR_THREAD)
675 		t->t_intr_start = now;
676 	basespl = cpu->cpu_base_spl;
677 	pil = MAX(oldpil, basespl);
678 	mcpu->mcpu_pri = pil;
679 	(*setspl)(pil);
680 }
681 
682 
683 /*
684  * Make the interrupted thread 'to' be runnable.
685  *
686  * Since t->t_sp has already been saved, t->t_pc is all
687  * that needs to be set in this function.
688  *
689  * Returns the interrupt level of the interrupt thread.
690  */
691 int
692 intr_passivate(
693 	kthread_t *it,		/* interrupt thread */
694 	kthread_t *t)		/* interrupted thread */
695 {
696 	extern void _sys_rtt();
697 
698 	ASSERT(it->t_flag & T_INTR_THREAD);
699 	ASSERT(SA(t->t_sp) == t->t_sp);
700 
701 	t->t_pc = (uintptr_t)_sys_rtt;
702 	return (it->t_pil);
703 }
704 
705 /*
706  * Create interrupt kstats for this CPU.
707  */
708 void
709 cpu_create_intrstat(cpu_t *cp)
710 {
711 	int		i;
712 	kstat_t		*intr_ksp;
713 	kstat_named_t	*knp;
714 	char		name[KSTAT_STRLEN];
715 	zoneid_t	zoneid;
716 
717 	ASSERT(MUTEX_HELD(&cpu_lock));
718 
719 	if (pool_pset_enabled())
720 		zoneid = GLOBAL_ZONEID;
721 	else
722 		zoneid = ALL_ZONES;
723 
724 	intr_ksp = kstat_create_zone("cpu", cp->cpu_id, "intrstat", "misc",
725 	    KSTAT_TYPE_NAMED, PIL_MAX * 2, NULL, zoneid);
726 
727 	/*
728 	 * Initialize each PIL's named kstat
729 	 */
730 	if (intr_ksp != NULL) {
731 		intr_ksp->ks_update = cpu_kstat_intrstat_update;
732 		knp = (kstat_named_t *)intr_ksp->ks_data;
733 		intr_ksp->ks_private = cp;
734 		for (i = 0; i < PIL_MAX; i++) {
735 			(void) snprintf(name, KSTAT_STRLEN, "level-%d-time",
736 			    i + 1);
737 			kstat_named_init(&knp[i * 2], name, KSTAT_DATA_UINT64);
738 			(void) snprintf(name, KSTAT_STRLEN, "level-%d-count",
739 			    i + 1);
740 			kstat_named_init(&knp[(i * 2) + 1], name,
741 			    KSTAT_DATA_UINT64);
742 		}
743 		kstat_install(intr_ksp);
744 	}
745 }
746 
/*
 * Delete interrupt kstats for this CPU.
 *
 * The kstat is looked up by the same module ("cpu"), instance
 * (cp->cpu_id) and name ("intrstat") that cpu_create_intrstat() used,
 * with ALL_ZONES passed as the zone.
 */
void
cpu_delete_intrstat(cpu_t *cp)
{
	kstat_delete_byname_zone("cpu", cp->cpu_id, "intrstat", ALL_ZONES);
}
755 
756 /*
757  * Convert interrupt statistics from CPU ticks to nanoseconds and
758  * update kstat.
759  */
760 int
761 cpu_kstat_intrstat_update(kstat_t *ksp, int rw)
762 {
763 	kstat_named_t	*knp = ksp->ks_data;
764 	cpu_t		*cpup = (cpu_t *)ksp->ks_private;
765 	int		i;
766 	hrtime_t	hrt;
767 
768 	if (rw == KSTAT_WRITE)
769 		return (EACCES);
770 
771 	for (i = 0; i < PIL_MAX; i++) {
772 		hrt = (hrtime_t)cpup->cpu_m.intrstat[i + 1][0];
773 		scalehrtimef(&hrt);
774 		knp[i * 2].value.ui64 = (uint64_t)hrt;
775 		knp[(i * 2) + 1].value.ui64 = cpup->cpu_stats.sys.intr[i];
776 	}
777 
778 	return (0);
779 }
780 
781 /*
782  * An interrupt thread is ending a time slice, so compute the interval it
783  * ran for and update the statistic for its PIL.
784  */
785 void
786 cpu_intr_swtch_enter(kthread_id_t t)
787 {
788 	uint64_t	interval;
789 	uint64_t	start;
790 	cpu_t		*cpu;
791 
792 	ASSERT((t->t_flag & T_INTR_THREAD) != 0);
793 	ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL);
794 
795 	/*
796 	 * We could be here with a zero timestamp. This could happen if:
797 	 * an interrupt thread which no longer has a pinned thread underneath
798 	 * it (i.e. it blocked at some point in its past) has finished running
799 	 * its handler. intr_thread() updated the interrupt statistic for its
800 	 * PIL and zeroed its timestamp. Since there was no pinned thread to
801 	 * return to, swtch() gets called and we end up here.
802 	 *
803 	 * Note that we use atomic ops below (cas64 and atomic_add_64), which
804 	 * we don't use in the functions above, because we're not called
805 	 * with interrupts blocked, but the epilog/prolog functions are.
806 	 */
807 	if (t->t_intr_start) {
808 		do {
809 			start = t->t_intr_start;
810 			interval = tsc_read() - start;
811 		} while (cas64(&t->t_intr_start, start, 0) != start);
812 		cpu = CPU;
813 		cpu->cpu_m.intrstat[t->t_pil][0] += interval;
814 
815 		atomic_add_64((uint64_t *)&cpu->cpu_intracct[cpu->cpu_mstate],
816 		    interval);
817 	} else
818 		ASSERT(t->t_intr == NULL);
819 }
820 
821 /*
822  * An interrupt thread is returning from swtch(). Place a starting timestamp
823  * in its thread structure.
824  */
825 void
826 cpu_intr_swtch_exit(kthread_id_t t)
827 {
828 	uint64_t ts;
829 
830 	ASSERT((t->t_flag & T_INTR_THREAD) != 0);
831 	ASSERT(t->t_pil > 0 && t->t_pil <= LOCK_LEVEL);
832 
833 	do {
834 		ts = t->t_intr_start;
835 	} while (cas64(&t->t_intr_start, ts, tsc_read()) != ts);
836 }
837 
/*
 * Dispatch a hilevel interrupt (one above LOCK_LEVEL)
 *
 * Interrupts are enabled around the call to the handlers for 'vector'
 * and disabled again before returning.  'arg2' is not used.
 */
/*ARGSUSED*/
static void
dispatch_hilevel(uint_t vector, uint_t arg2)
{
	sti();
	av_dispatch_autovect(vector);
	cli();
}
849 
850 /*
851  * Dispatch a soft interrupt
852  */
853 /*ARGSUSED*/
854 static void
855 dispatch_softint(uint_t oldpil, uint_t arg2)
856 {
857 	struct cpu *cpu = CPU;
858 
859 	sti();
860 	av_dispatch_softvect((int)cpu->cpu_thread->t_pil);
861 	cli();
862 
863 	/*
864 	 * Must run softint_epilog() on the interrupt thread stack, since
865 	 * there may not be a return from it if the interrupt thread blocked.
866 	 */
867 	dosoftint_epilog(cpu, oldpil);
868 }
869 
870 /*
871  * Dispatch a normal interrupt
872  */
873 static void
874 dispatch_hardint(uint_t vector, uint_t oldipl)
875 {
876 	struct cpu *cpu = CPU;
877 
878 	sti();
879 	av_dispatch_autovect(vector);
880 	cli();
881 
882 	/*
883 	 * Must run intr_thread_epilog() on the interrupt thread stack, since
884 	 * there may not be a return from it if the interrupt thread blocked.
885 	 */
886 	intr_thread_epilog(cpu, vector, oldipl);
887 }
888 
889 /*
890  * Deliver any softints the current interrupt priority allows.
891  * Called with interrupts disabled.
892  */
893 void
894 dosoftint(struct regs *regs)
895 {
896 	struct cpu *cpu = CPU;
897 	int oldipl;
898 	caddr_t newsp;
899 
900 	while (cpu->cpu_softinfo.st_pending) {
901 		oldipl = cpu->cpu_pri;
902 		newsp = dosoftint_prolog(cpu, (caddr_t)regs,
903 		    cpu->cpu_softinfo.st_pending, oldipl);
904 		/*
905 		 * If returned stack pointer is NULL, priority is too high
906 		 * to run any of the pending softints now.
907 		 * Break out and they will be run later.
908 		 */
909 		if (newsp == NULL)
910 			break;
911 		switch_sp_and_call(newsp, dispatch_softint, oldipl, 0);
912 	}
913 }
914 
/*
 * Interrupt service routine, called with interrupts disabled.
 *
 *	rp	saved registers of the interrupted context; r_trapno
 *		identifies the interrupt and may be updated by setlvl
 *	ttp	trap trace record, only written when TRAPTRACE is defined
 *
 * Soft interrupts are handed straight to dosoftint().  For hardware
 * interrupts the priority is raised first; levels above LOCK_LEVEL are
 * then run on the cpu's interrupt stack and all others in an interrupt
 * thread.
 */
/*ARGSUSED*/
void
do_interrupt(struct regs *rp, trap_trace_rec_t *ttp)
{
	struct cpu *cpu = CPU;
	int newipl, oldipl = cpu->cpu_pri;
	uint_t vector;
	caddr_t newsp;

#ifdef TRAPTRACE
	ttp->ttr_marker = TT_INTERRUPT;
	ttp->ttr_ipl = 0xff;
	ttp->ttr_pri = oldipl;
	ttp->ttr_spl = cpu->cpu_base_spl;
	ttp->ttr_vector = 0xff;
#endif	/* TRAPTRACE */

	cpu_idle_exit(CPU_IDLE_CB_FLAG_INTR);

	/* Bump mcpu_istamp, accessed here as a 16-bit counter. */
	++*(uint16_t *)&cpu->cpu_m.mcpu_istamp;

	/*
	 * If it's a softint go do it now.
	 */
	if (rp->r_trapno == T_SOFTINT) {
		dosoftint(rp);
		ASSERT(!interrupts_enabled());
		return;
	}

	/*
	 * Raise the interrupt priority.
	 */
	newipl = (*setlvl)(oldipl, (int *)&rp->r_trapno);
#ifdef TRAPTRACE
	ttp->ttr_ipl = newipl;
#endif	/* TRAPTRACE */

	/*
	 * Bail if it is a spurious interrupt
	 */
	if (newipl == -1)
		return;
	cpu->cpu_pri = newipl;
	/* Read the vector only after setlvl, which was given &r_trapno. */
	vector = rp->r_trapno;
#ifdef TRAPTRACE
	ttp->ttr_vector = vector;
#endif	/* TRAPTRACE */
	if (newipl > LOCK_LEVEL) {
		/*
		 * High priority interrupts run on this cpu's interrupt stack.
		 */
		if (hilevel_intr_prolog(cpu, newipl, oldipl, rp) == 0) {
			/* not yet on the interrupt stack: switch to it */
			newsp = cpu->cpu_intr_stack;
			switch_sp_and_call(newsp, dispatch_hilevel, vector, 0);
		} else { /* already on the interrupt stack */
			dispatch_hilevel(vector, 0);
		}
		(void) hilevel_intr_epilog(cpu, newipl, oldipl, vector);
	} else {
		/*
		 * Run this interrupt in a separate thread.
		 * The epilogue is run by dispatch_hardint() itself.
		 */
		newsp = intr_thread_prolog(cpu, (caddr_t)rp, newipl);
		switch_sp_and_call(newsp, dispatch_hardint, vector, oldipl);
	}

#if !defined(__xpv)
	/*
	 * Deliver any pending soft interrupts.
	 */
	if (cpu->cpu_softinfo.st_pending)
		dosoftint(rp);
#endif	/* !__xpv */
}
993 
994 
/*
 * Common tasks always done by _sys_rtt, called with interrupts disabled.
 * Returns 1 if returning to userland, 0 if returning to system mode.
 *
 *	rp	saved registers of the context being returned to; r_trapno
 *		and r_pc may be modified here
 *
 * Interrupts are briefly re-enabled around the calls to trap(), psig()
 * and kpreempt(), and are disabled again before continuing.
 */
int
sys_rtt_common(struct regs *rp)
{
	kthread_t *tp;
	extern void mutex_exit_critical_start();
	extern long mutex_exit_critical_size;
	extern void mutex_owner_running_critical_start();
	extern long mutex_owner_running_critical_size;

loop:

	/*
	 * Check if returning to user
	 */
	tp = CPU->cpu_thread;
	if (USERMODE(rp->r_cs)) {
		/*
		 * Check if AST pending.
		 */
		if (tp->t_astflag) {
			/*
			 * Let trap() handle the AST
			 */
			sti();
			rp->r_trapno = T_AST;
			trap(rp, (caddr_t)0, CPU->cpu_id);
			cli();
			/* start over: re-read curthread and re-check */
			goto loop;
		}

#if defined(__amd64)
		/*
		 * We are done if segment registers do not need updating.
		 */
		if (tp->t_lwp->lwp_pcb.pcb_rupdate == 0)
			return (1);

		if (update_sregs(rp, tp->t_lwp)) {
			/*
			 * 1 or more of the selectors is bad.
			 * Deliver a SIGSEGV.
			 */
			proc_t *p = ttoproc(tp);

			sti();
			mutex_enter(&p->p_lock);
			tp->t_lwp->lwp_cursig = SIGSEGV;
			mutex_exit(&p->p_lock);
			psig();
			tp->t_sig_check = 1;
			cli();
		}
		/* the pending segment register update has been dealt with */
		tp->t_lwp->lwp_pcb.pcb_rupdate = 0;

#endif	/* __amd64 */
		return (1);
	}

	/*
	 * Here if we are returning to supervisor mode.
	 * Check for a kernel preemption request.
	 * It is only honored if the interrupted context had interrupts
	 * enabled (PS_IE set in the saved flags).
	 */
	if (CPU->cpu_kprunrun && (rp->r_ps & PS_IE)) {

		/*
		 * Do nothing if already in kpreempt
		 */
		if (!tp->t_preempt_lk) {
			tp->t_preempt_lk = 1;
			sti();
			kpreempt(1); /* asynchronous kpreempt call */
			cli();
			tp->t_preempt_lk = 0;
		}
	}

	/*
	 * If we interrupted the mutex_exit() critical region we must
	 * reset the PC back to the beginning to prevent missed wakeups
	 * See the comments in mutex_exit() for details.
	 *
	 * The subtraction is unsigned, so a pc below the start of the
	 * region wraps to a large value and a single comparison checks
	 * both ends of the range.
	 */
	if ((uintptr_t)rp->r_pc - (uintptr_t)mutex_exit_critical_start <
	    mutex_exit_critical_size) {
		rp->r_pc = (greg_t)mutex_exit_critical_start;
	}

	/*
	 * If we interrupted the mutex_owner_running() critical region we
	 * must reset the PC back to the beginning to prevent dereferencing
	 * of a freed thread pointer. See the comments in mutex_owner_running
	 * for details.
	 */
	if ((uintptr_t)rp->r_pc -
	    (uintptr_t)mutex_owner_running_critical_start <
	    mutex_owner_running_critical_size) {
		rp->r_pc = (greg_t)mutex_owner_running_critical_start;
	}

	return (0);
}
1099 
/*
 * Forward the request to the routine installed in the send_dirintf
 * function pointer, passing the cpu id and interrupt level unchanged.
 * (Presumably this posts an interrupt directed at 'cpuid' -- confirm.)
 */
void
send_dirint(int cpuid, int int_level)
{
	(*send_dirintf)(cpuid, int_level);
}
1105 
1106 /*
1107  * do_splx routine, takes new ipl to set
1108  * returns the old ipl.
1109  * We are careful not to set priority lower than CPU->cpu_base_pri,
1110  * even though it seems we're raising the priority, it could be set
1111  * higher at any time by an interrupt routine, so we must block interrupts
1112  * and look at CPU->cpu_base_pri
1113  */
1114 int
1115 do_splx(int newpri)
1116 {
1117 	ulong_t	flag;
1118 	cpu_t	*cpu;
1119 	int	curpri, basepri;
1120 
1121 	flag = intr_clear();
1122 	cpu = CPU; /* ints are disabled, now safe to cache cpu ptr */
1123 	curpri = cpu->cpu_m.mcpu_pri;
1124 	basepri = cpu->cpu_base_spl;
1125 	if (newpri < basepri)
1126 		newpri = basepri;
1127 	cpu->cpu_m.mcpu_pri = newpri;
1128 	(*setspl)(newpri);
1129 	/*
1130 	 * If we are going to reenable interrupts see if new priority level
1131 	 * allows pending softint delivery.
1132 	 */
1133 	if ((flag & PS_IE) &&
1134 	    bsrw_insn((uint16_t)cpu->cpu_softinfo.st_pending) > newpri)
1135 		fakesoftint();
1136 	ASSERT(!interrupts_enabled());
1137 	intr_restore(flag);
1138 	return (curpri);
1139 }
1140 
1141 /*
1142  * Common spl raise routine, takes new ipl to set
1143  * returns the old ipl, will not lower ipl.
1144  */
1145 int
1146 splr(int newpri)
1147 {
1148 	ulong_t	flag;
1149 	cpu_t	*cpu;
1150 	int	curpri, basepri;
1151 
1152 	flag = intr_clear();
1153 	cpu = CPU; /* ints are disabled, now safe to cache cpu ptr */
1154 	curpri = cpu->cpu_m.mcpu_pri;
1155 	/*
1156 	 * Only do something if new priority is larger
1157 	 */
1158 	if (newpri > curpri) {
1159 		basepri = cpu->cpu_base_spl;
1160 		if (newpri < basepri)
1161 			newpri = basepri;
1162 		cpu->cpu_m.mcpu_pri = newpri;
1163 		(*setspl)(newpri);
1164 		/*
1165 		 * See if new priority level allows pending softint delivery
1166 		 */
1167 		if ((flag & PS_IE) &&
1168 		    bsrw_insn((uint16_t)cpu->cpu_softinfo.st_pending) > newpri)
1169 			fakesoftint();
1170 	}
1171 	intr_restore(flag);
1172 	return (curpri);
1173 }
1174 
/*
 * Return the current cpu's priority level as recorded in mcpu_pri.
 */
int
getpil(void)
{
	return (CPU->cpu_m.mcpu_pri);
}
1180 
/*
 * Raise the priority level to that of XCALL_PIL by way of splr() and
 * return splr()'s result, i.e. the level that was in effect before.
 */
int
spl_xcall(void)
{
	return (splr(ipltospl(XCALL_PIL)));
}
1186 
1187 int
1188 interrupts_enabled(void)
1189 {
1190 	ulong_t	flag;
1191 
1192 	flag = getflags();
1193 	return ((flag & PS_IE) == PS_IE);
1194 }
1195 
1196 #ifdef DEBUG
/*
 * DEBUG-only check: assert that interrupts are enabled.  The assertion
 * is satisfied unconditionally while interrupts_unleashed is zero.
 */
void
assert_ints_enabled(void)
{
	ASSERT(!interrupts_unleashed || interrupts_enabled());
}
1202 #endif	/* DEBUG */
1203