xref: /titanic_51/usr/src/uts/common/disp/thread.c (revision 4dd87b61fd8fd35014b2dc0e39c822fa0741a007)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/sysmacros.h>
32 #include <sys/signal.h>
33 #include <sys/stack.h>
34 #include <sys/pcb.h>
35 #include <sys/user.h>
36 #include <sys/systm.h>
37 #include <sys/sysinfo.h>
38 #include <sys/errno.h>
39 #include <sys/cmn_err.h>
40 #include <sys/cred.h>
41 #include <sys/resource.h>
42 #include <sys/task.h>
43 #include <sys/project.h>
44 #include <sys/proc.h>
45 #include <sys/debug.h>
46 #include <sys/disp.h>
47 #include <sys/class.h>
48 #include <vm/seg_kmem.h>
49 #include <vm/seg_kp.h>
50 #include <sys/machlock.h>
51 #include <sys/kmem.h>
52 #include <sys/varargs.h>
53 #include <sys/turnstile.h>
54 #include <sys/poll.h>
55 #include <sys/vtrace.h>
56 #include <sys/callb.h>
57 #include <c2/audit.h>
58 #include <sys/tnf.h>
59 #include <sys/sobject.h>
60 #include <sys/cpupart.h>
61 #include <sys/pset.h>
62 #include <sys/door.h>
63 #include <sys/spl.h>
64 #include <sys/copyops.h>
65 #include <sys/rctl.h>
66 #include <sys/brand.h>
67 #include <sys/pool.h>
68 #include <sys/zone.h>
69 #include <sys/tsol/label.h>
70 #include <sys/tsol/tndb.h>
71 #include <sys/cpc_impl.h>
72 #include <sys/sdt.h>
73 #include <sys/reboot.h>
74 #include <sys/kdi.h>
75 #include <sys/schedctl.h>
76 #include <sys/waitq.h>
77 #include <sys/cpucaps.h>
78 #include <sys/kiconv.h>
79 
80 struct kmem_cache *thread_cache;	/* cache of free threads */
81 struct kmem_cache *lwp_cache;		/* cache of free lwps */
82 struct kmem_cache *turnstile_cache;	/* cache of free turnstiles */
83 
84 /*
85  * allthreads is only for use by kmem_readers.  All kernel loops can use
86  * the current thread as a start/end point.
87  */
88 static kthread_t *allthreads = &t0;	/* circular list of all threads */
89 
90 static kcondvar_t reaper_cv;		/* synchronization var */
91 kthread_t	*thread_deathrow;	/* circular list of reapable threads */
92 kthread_t	*lwp_deathrow;		/* circular list of reapable threads */
93 kmutex_t	reaplock;		/* protects lwp and thread deathrows */
94 int	thread_reapcnt = 0;		/* number of threads on deathrow */
95 int	lwp_reapcnt = 0;		/* number of lwps on deathrow */
96 int	reaplimit = 16;			/* delay reaping until reaplimit */
97 
98 thread_free_lock_t	*thread_free_lock;
99 					/* protects tick thread from reaper */
100 
101 extern int nthread;
102 
103 id_t	syscid;				/* system scheduling class ID */
104 void	*segkp_thread;			/* cookie for segkp pool */
105 
106 int lwp_cache_sz = 32;
107 int t_cache_sz = 8;
108 static kt_did_t next_t_id = 1;
109 
110 /*
111  * Min/Max stack sizes for stack size parameters
112  */
113 #define	MAX_STKSIZE	(32 * DEFAULTSTKSZ)
114 #define	MIN_STKSIZE	DEFAULTSTKSZ
115 
116 /*
117  * default_stksize overrides lwp_default_stksize if it is set.
118  */
119 int	default_stksize;
120 int	lwp_default_stksize;
121 
122 static zone_key_t zone_thread_key;
123 
124 /*
125  * forward declarations for internal thread specific data (tsd)
126  */
127 static void *tsd_realloc(void *, size_t, size_t);
128 
129 void thread_reaper(void);
130 
131 /*ARGSUSED*/
132 static int
133 turnstile_constructor(void *buf, void *cdrarg, int kmflags)
134 {
135 	bzero(buf, sizeof (turnstile_t));
136 	return (0);
137 }
138 
139 /*ARGSUSED*/
140 static void
141 turnstile_destructor(void *buf, void *cdrarg)
142 {
143 	turnstile_t *ts = buf;
144 
145 	ASSERT(ts->ts_free == NULL);
146 	ASSERT(ts->ts_waiters == 0);
147 	ASSERT(ts->ts_inheritor == NULL);
148 	ASSERT(ts->ts_sleepq[0].sq_first == NULL);
149 	ASSERT(ts->ts_sleepq[1].sq_first == NULL);
150 }
151 
152 void
153 thread_init(void)
154 {
155 	kthread_t *tp;
156 	extern char sys_name[];
157 	extern void idle();
158 	struct cpu *cpu = CPU;
159 	int i;
160 	kmutex_t *lp;
161 
162 	mutex_init(&reaplock, NULL, MUTEX_SPIN, (void *)ipltospl(DISP_LEVEL));
163 	thread_free_lock =
164 	    kmem_alloc(sizeof (thread_free_lock_t) * THREAD_FREE_NUM, KM_SLEEP);
165 	for (i = 0; i < THREAD_FREE_NUM; i++) {
166 		lp = &thread_free_lock[i].tf_lock;
167 		mutex_init(lp, NULL, MUTEX_DEFAULT, NULL);
168 	}
169 
170 #if defined(__i386) || defined(__amd64)
171 	thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
172 	    PTR24_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
173 
174 	/*
175 	 * "struct _klwp" includes a "struct pcb", which includes a
176 	 * "struct fpu", which needs to be 16-byte aligned on amd64
177 	 * (and even on i386 for fxsave/fxrstor).
178 	 */
179 	lwp_cache = kmem_cache_create("lwp_cache", sizeof (klwp_t),
180 	    16, NULL, NULL, NULL, NULL, NULL, 0);
181 #else
182 	/*
183 	 * Allocate thread structures from static_arena.  This prevents
184 	 * issues where a thread tries to relocate its own thread
185 	 * structure and touches it after the mapping has been suspended.
186 	 */
187 	thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
188 	    PTR24_ALIGN, NULL, NULL, NULL, NULL, static_arena, 0);
189 
190 	lwp_stk_cache_init();
191 
192 	lwp_cache = kmem_cache_create("lwp_cache", sizeof (klwp_t),
193 	    0, NULL, NULL, NULL, NULL, NULL, 0);
194 #endif
195 
196 	turnstile_cache = kmem_cache_create("turnstile_cache",
197 	    sizeof (turnstile_t), 0,
198 	    turnstile_constructor, turnstile_destructor, NULL, NULL, NULL, 0);
199 
200 	label_init();
201 	cred_init();
202 
203 	/*
204 	 * Initialize various resource management facilities.
205 	 */
206 	rctl_init();
207 	cpucaps_init();
208 	/*
209 	 * Zone_init() should be called before project_init() so that project ID
210 	 * for the first project is initialized correctly.
211 	 */
212 	zone_init();
213 	project_init();
214 	brand_init();
215 	kiconv_init();
216 	task_init();
217 	tcache_init();
218 	pool_init();
219 
220 	curthread->t_ts = kmem_cache_alloc(turnstile_cache, KM_SLEEP);
221 
222 	/*
223 	 * Originally, we had two parameters to set default stack
224 	 * size: one for lwp's (lwp_default_stksize), and one for
225 	 * kernel-only threads (DEFAULTSTKSZ, a.k.a. _defaultstksz).
226 	 * Now we have a third parameter that overrides both if it is
227 	 * set to a legal stack size, called default_stksize.
228 	 */
229 
230 	if (default_stksize == 0) {
231 		default_stksize = DEFAULTSTKSZ;
232 	} else if (default_stksize % PAGESIZE != 0 ||
233 	    default_stksize > MAX_STKSIZE ||
234 	    default_stksize < MIN_STKSIZE) {
235 		cmn_err(CE_WARN, "Illegal stack size. Using %d",
236 		    (int)DEFAULTSTKSZ);
237 		default_stksize = DEFAULTSTKSZ;
238 	} else {
239 		lwp_default_stksize = default_stksize;
240 	}
241 
242 	if (lwp_default_stksize == 0) {
243 		lwp_default_stksize = default_stksize;
244 	} else if (lwp_default_stksize % PAGESIZE != 0 ||
245 	    lwp_default_stksize > MAX_STKSIZE ||
246 	    lwp_default_stksize < MIN_STKSIZE) {
247 		cmn_err(CE_WARN, "Illegal stack size. Using %d",
248 		    default_stksize);
249 		lwp_default_stksize = default_stksize;
250 	}
251 
252 	segkp_lwp = segkp_cache_init(segkp, lwp_cache_sz,
253 	    lwp_default_stksize,
254 	    (KPD_NOWAIT | KPD_HASREDZONE | KPD_LOCKED));
255 
256 	segkp_thread = segkp_cache_init(segkp, t_cache_sz,
257 	    default_stksize, KPD_HASREDZONE | KPD_LOCKED | KPD_NO_ANON);
258 
259 	(void) getcid(sys_name, &syscid);
260 	curthread->t_cid = syscid;	/* current thread is t0 */
261 
262 	/*
263 	 * Set up the first CPU's idle thread.
264 	 * It runs whenever the CPU has nothing worthwhile to do.
265 	 */
266 	tp = thread_create(NULL, 0, idle, NULL, 0, &p0, TS_STOPPED, -1);
267 	cpu->cpu_idle_thread = tp;
268 	tp->t_preempt = 1;
269 	tp->t_disp_queue = cpu->cpu_disp;
270 	ASSERT(tp->t_disp_queue != NULL);
271 	tp->t_bound_cpu = cpu;
272 	tp->t_affinitycnt = 1;
273 
274 	/*
275 	 * Registering a thread in the callback table is usually
276 	 * done in the initialization code of the thread. In this
277 	 * case, we do it right after thread creation to avoid
278 	 * blocking idle thread while registering itself. It also
279 	 * avoids the possibility of reregistration in case a CPU
280 	 * restarts its idle thread.
281 	 */
282 	CALLB_CPR_INIT_SAFE(tp, "idle");
283 
284 	/*
285 	 * Create the thread_reaper daemon. From this point on, exited
286 	 * threads will get reaped.
287 	 */
288 	(void) thread_create(NULL, 0, (void (*)())thread_reaper,
289 	    NULL, 0, &p0, TS_RUN, minclsyspri);
290 
291 	/*
292 	 * Finish initializing the kernel memory allocator now that
293 	 * thread_create() is available.
294 	 */
295 	kmem_thread_init();
296 
297 	if (boothowto & RB_DEBUG)
298 		kdi_dvec_thravail();
299 }
300 
301 /*
302  * Create a thread.
303  *
304  * thread_create() blocks for memory if necessary.  It never fails.
305  *
306  * If stk is NULL, the thread is created at the base of the stack
307  * and cannot be swapped.
308  */
309 kthread_t *
310 thread_create(
311 	caddr_t	stk,
312 	size_t	stksize,
313 	void	(*proc)(),
314 	void	*arg,
315 	size_t	len,
316 	proc_t	 *pp,
317 	int	state,
318 	pri_t	pri)
319 {
320 	kthread_t *t;
321 	extern struct classfuncs sys_classfuncs;
322 	turnstile_t *ts;
323 
324 	/*
325 	 * Every thread keeps a turnstile around in case it needs to block.
326 	 * The only reason the turnstile is not simply part of the thread
327 	 * structure is that we may have to break the association whenever
328 	 * more than one thread blocks on a given synchronization object.
329 	 * From a memory-management standpoint, turnstiles are like the
330 	 * "attached mblks" that hang off dblks in the streams allocator.
331 	 */
332 	ts = kmem_cache_alloc(turnstile_cache, KM_SLEEP);
333 
334 	if (stk == NULL) {
335 		/*
336 		 * alloc both thread and stack in segkp chunk
337 		 */
338 
339 		if (stksize < default_stksize)
340 			stksize = default_stksize;
341 
342 		if (stksize == default_stksize) {
343 			stk = (caddr_t)segkp_cache_get(segkp_thread);
344 		} else {
345 			stksize = roundup(stksize, PAGESIZE);
346 			stk = (caddr_t)segkp_get(segkp, stksize,
347 			    (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED));
348 		}
349 
350 		ASSERT(stk != NULL);
351 
352 		/*
353 		 * The machine-dependent mutex code may require that
354 		 * thread pointers (since they may be used for mutex owner
355 		 * fields) have certain alignment requirements.
356 		 * PTR24_ALIGN is the size of the alignment quanta.
357 		 * XXX - assumes stack grows toward low addresses.
358 		 */
359 		if (stksize <= sizeof (kthread_t) + PTR24_ALIGN)
360 			cmn_err(CE_PANIC, "thread_create: proposed stack size"
361 			    " too small to hold thread.");
362 #ifdef STACK_GROWTH_DOWN
363 		stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1);
364 		stksize &= -PTR24_ALIGN;	/* make thread aligned */
365 		t = (kthread_t *)(stk + stksize);
366 		bzero(t, sizeof (kthread_t));
367 		if (audit_active)
368 			audit_thread_create(t);
369 		t->t_stk = stk + stksize;
370 		t->t_stkbase = stk;
371 #else	/* stack grows to larger addresses */
372 		stksize -= SA(sizeof (kthread_t));
373 		t = (kthread_t *)(stk);
374 		bzero(t, sizeof (kthread_t));
375 		t->t_stk = stk + sizeof (kthread_t);
376 		t->t_stkbase = stk + stksize + sizeof (kthread_t);
377 #endif	/* STACK_GROWTH_DOWN */
378 		t->t_flag |= T_TALLOCSTK;
379 		t->t_swap = stk;
380 	} else {
381 		t = kmem_cache_alloc(thread_cache, KM_SLEEP);
382 		bzero(t, sizeof (kthread_t));
383 		ASSERT(((uintptr_t)t & (PTR24_ALIGN - 1)) == 0);
384 		if (audit_active)
385 			audit_thread_create(t);
386 		/*
387 		 * Initialize t_stk to the kernel stack pointer to use
388 		 * upon entry to the kernel
389 		 */
390 #ifdef STACK_GROWTH_DOWN
391 		t->t_stk = stk + stksize;
392 		t->t_stkbase = stk;
393 #else
394 		t->t_stk = stk;			/* 3b2-like */
395 		t->t_stkbase = stk + stksize;
396 #endif /* STACK_GROWTH_DOWN */
397 	}
398 
399 	/* set default stack flag */
400 	if (stksize == lwp_default_stksize)
401 		t->t_flag |= T_DFLTSTK;
402 
403 	t->t_ts = ts;
404 
405 	/*
406 	 * p_cred could be NULL if it thread_create is called before cred_init
407 	 * is called in main.
408 	 */
409 	mutex_enter(&pp->p_crlock);
410 	if (pp->p_cred)
411 		crhold(t->t_cred = pp->p_cred);
412 	mutex_exit(&pp->p_crlock);
413 	t->t_start = gethrestime_sec();
414 	t->t_startpc = proc;
415 	t->t_procp = pp;
416 	t->t_clfuncs = &sys_classfuncs.thread;
417 	t->t_cid = syscid;
418 	t->t_pri = pri;
419 	t->t_stime = lbolt;
420 	t->t_schedflag = TS_LOAD | TS_DONT_SWAP;
421 	t->t_bind_cpu = PBIND_NONE;
422 	t->t_bind_pset = PS_NONE;
423 	t->t_plockp = &pp->p_lock;
424 	t->t_copyops = NULL;
425 	t->t_taskq = NULL;
426 	t->t_anttime = 0;
427 	t->t_hatdepth = 0;
428 
429 	t->t_dtrace_vtime = 1;	/* assure vtimestamp is always non-zero */
430 
431 	CPU_STATS_ADDQ(CPU, sys, nthreads, 1);
432 #ifndef NPROBE
433 	/* Kernel probe */
434 	tnf_thread_create(t);
435 #endif /* NPROBE */
436 	LOCK_INIT_CLEAR(&t->t_lock);
437 
438 	/*
439 	 * Callers who give us a NULL proc must do their own
440 	 * stack initialization.  e.g. lwp_create()
441 	 */
442 	if (proc != NULL) {
443 		t->t_stk = thread_stk_init(t->t_stk);
444 		thread_load(t, proc, arg, len);
445 	}
446 
447 	/*
448 	 * Put a hold on project0. If this thread is actually in a
449 	 * different project, then t_proj will be changed later in
450 	 * lwp_create().  All kernel-only threads must be in project 0.
451 	 */
452 	t->t_proj = project_hold(proj0p);
453 
454 	lgrp_affinity_init(&t->t_lgrp_affinity);
455 
456 	mutex_enter(&pidlock);
457 	nthread++;
458 	t->t_did = next_t_id++;
459 	t->t_prev = curthread->t_prev;
460 	t->t_next = curthread;
461 
462 	/*
463 	 * Add the thread to the list of all threads, and initialize
464 	 * its t_cpu pointer.  We need to block preemption since
465 	 * cpu_offline walks the thread list looking for threads
466 	 * with t_cpu pointing to the CPU being offlined.  We want
467 	 * to make sure that the list is consistent and that if t_cpu
468 	 * is set, the thread is on the list.
469 	 */
470 	kpreempt_disable();
471 	curthread->t_prev->t_next = t;
472 	curthread->t_prev = t;
473 
474 	/*
475 	 * Threads should never have a NULL t_cpu pointer so assign it
476 	 * here.  If the thread is being created with state TS_RUN a
477 	 * better CPU may be chosen when it is placed on the run queue.
478 	 *
479 	 * We need to keep kernel preemption disabled when setting all
480 	 * three fields to keep them in sync.  Also, always create in
481 	 * the default partition since that's where kernel threads go
482 	 * (if this isn't a kernel thread, t_cpupart will be changed
483 	 * in lwp_create before setting the thread runnable).
484 	 */
485 	t->t_cpupart = &cp_default;
486 
487 	/*
488 	 * For now, affiliate this thread with the root lgroup.
489 	 * Since the kernel does not (presently) allocate its memory
490 	 * in a locality aware fashion, the root is an appropriate home.
491 	 * If this thread is later associated with an lwp, it will have
492 	 * it's lgroup re-assigned at that time.
493 	 */
494 	lgrp_move_thread(t, &cp_default.cp_lgrploads[LGRP_ROOTID], 1);
495 
496 	/*
497 	 * Inherit the current cpu.  If this cpu isn't part of the chosen
498 	 * lgroup, a new cpu will be chosen by cpu_choose when the thread
499 	 * is ready to run.
500 	 */
501 	if (CPU->cpu_part == &cp_default)
502 		t->t_cpu = CPU;
503 	else
504 		t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t->t_lpl,
505 		    t->t_pri, NULL);
506 
507 	t->t_disp_queue = t->t_cpu->cpu_disp;
508 	kpreempt_enable();
509 
510 	/*
511 	 * Initialize thread state and the dispatcher lock pointer.
512 	 * Need to hold onto pidlock to block allthreads walkers until
513 	 * the state is set.
514 	 */
515 	switch (state) {
516 	case TS_RUN:
517 		curthread->t_oldspl = splhigh();	/* get dispatcher spl */
518 		THREAD_SET_STATE(t, TS_STOPPED, &transition_lock);
519 		CL_SETRUN(t);
520 		thread_unlock(t);
521 		break;
522 
523 	case TS_ONPROC:
524 		THREAD_ONPROC(t, t->t_cpu);
525 		break;
526 
527 	case TS_FREE:
528 		/*
529 		 * Free state will be used for intr threads.
530 		 * The interrupt routine must set the thread dispatcher
531 		 * lock pointer (t_lockp) if starting on a CPU
532 		 * other than the current one.
533 		 */
534 		THREAD_FREEINTR(t, CPU);
535 		break;
536 
537 	case TS_STOPPED:
538 		THREAD_SET_STATE(t, TS_STOPPED, &stop_lock);
539 		break;
540 
541 	default:			/* TS_SLEEP, TS_ZOMB or TS_TRANS */
542 		cmn_err(CE_PANIC, "thread_create: invalid state %d", state);
543 	}
544 	mutex_exit(&pidlock);
545 	return (t);
546 }
547 
548 /*
549  * Move thread to project0 and take care of project reference counters.
550  */
551 void
552 thread_rele(kthread_t *t)
553 {
554 	kproject_t *kpj;
555 
556 	thread_lock(t);
557 
558 	ASSERT(t == curthread || t->t_state == TS_FREE || t->t_procp == &p0);
559 	kpj = ttoproj(t);
560 	t->t_proj = proj0p;
561 
562 	thread_unlock(t);
563 
564 	if (kpj != proj0p) {
565 		project_rele(kpj);
566 		(void) project_hold(proj0p);
567 	}
568 }
569 
570 void
571 thread_exit(void)
572 {
573 	kthread_t *t = curthread;
574 
575 	if ((t->t_proc_flag & TP_ZTHREAD) != 0)
576 		cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called");
577 
578 	tsd_exit();		/* Clean up this thread's TSD */
579 
580 	kcpc_passivate();	/* clean up performance counter state */
581 
582 	/*
583 	 * No kernel thread should have called poll() without arranging
584 	 * calling pollcleanup() here.
585 	 */
586 	ASSERT(t->t_pollstate == NULL);
587 	ASSERT(t->t_schedctl == NULL);
588 	if (t->t_door)
589 		door_slam();	/* in case thread did an upcall */
590 
591 #ifndef NPROBE
592 	/* Kernel probe */
593 	if (t->t_tnf_tpdp)
594 		tnf_thread_exit();
595 #endif /* NPROBE */
596 
597 	thread_rele(t);
598 	t->t_preempt++;
599 
600 	/*
601 	 * remove thread from the all threads list so that
602 	 * death-row can use the same pointers.
603 	 */
604 	mutex_enter(&pidlock);
605 	t->t_next->t_prev = t->t_prev;
606 	t->t_prev->t_next = t->t_next;
607 	ASSERT(allthreads != t);	/* t0 never exits */
608 	cv_broadcast(&t->t_joincv);	/* wake up anyone in thread_join */
609 	mutex_exit(&pidlock);
610 
611 	if (t->t_ctx != NULL)
612 		exitctx(t);
613 	if (t->t_procp->p_pctx != NULL)
614 		exitpctx(t->t_procp);
615 
616 	t->t_state = TS_ZOMB;	/* set zombie thread */
617 
618 	swtch_from_zombie();	/* give up the CPU */
619 	/* NOTREACHED */
620 }
621 
622 /*
623  * Check to see if the specified thread is active (defined as being on
624  * the thread list).  This is certainly a slow way to do this; if there's
625  * ever a reason to speed it up, we could maintain a hash table of active
626  * threads indexed by their t_did.
627  */
628 static kthread_t *
629 did_to_thread(kt_did_t tid)
630 {
631 	kthread_t *t;
632 
633 	ASSERT(MUTEX_HELD(&pidlock));
634 	for (t = curthread->t_next; t != curthread; t = t->t_next) {
635 		if (t->t_did == tid)
636 			break;
637 	}
638 	if (t->t_did == tid)
639 		return (t);
640 	else
641 		return (NULL);
642 }
643 
644 /*
645  * Wait for specified thread to exit.  Returns immediately if the thread
646  * could not be found, meaning that it has either already exited or never
647  * existed.
648  */
649 void
650 thread_join(kt_did_t tid)
651 {
652 	kthread_t *t;
653 
654 	ASSERT(tid != curthread->t_did);
655 	ASSERT(tid != t0.t_did);
656 
657 	mutex_enter(&pidlock);
658 	/*
659 	 * Make sure we check that the thread is on the thread list
660 	 * before blocking on it; otherwise we could end up blocking on
661 	 * a cv that's already been freed.  In other words, don't cache
662 	 * the thread pointer across calls to cv_wait.
663 	 *
664 	 * The choice of loop invariant means that whenever a thread
665 	 * is taken off the allthreads list, a cv_broadcast must be
666 	 * performed on that thread's t_joincv to wake up any waiters.
667 	 * The broadcast doesn't have to happen right away, but it
668 	 * shouldn't be postponed indefinitely (e.g., by doing it in
669 	 * thread_free which may only be executed when the deathrow
670 	 * queue is processed.
671 	 */
672 	while (t = did_to_thread(tid))
673 		cv_wait(&t->t_joincv, &pidlock);
674 	mutex_exit(&pidlock);
675 }
676 
677 void
678 thread_free_prevent(kthread_t *t)
679 {
680 	kmutex_t *lp;
681 
682 	lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
683 	mutex_enter(lp);
684 }
685 
686 void
687 thread_free_allow(kthread_t *t)
688 {
689 	kmutex_t *lp;
690 
691 	lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
692 	mutex_exit(lp);
693 }
694 
695 static void
696 thread_free_barrier(kthread_t *t)
697 {
698 	kmutex_t *lp;
699 
700 	lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
701 	mutex_enter(lp);
702 	mutex_exit(lp);
703 }
704 
705 void
706 thread_free(kthread_t *t)
707 {
708 	ASSERT(t != &t0 && t->t_state == TS_FREE);
709 	ASSERT(t->t_door == NULL);
710 	ASSERT(t->t_schedctl == NULL);
711 	ASSERT(t->t_pollstate == NULL);
712 
713 	t->t_pri = 0;
714 	t->t_pc = 0;
715 	t->t_sp = 0;
716 	t->t_wchan0 = NULL;
717 	t->t_wchan = NULL;
718 	if (t->t_cred != NULL) {
719 		crfree(t->t_cred);
720 		t->t_cred = 0;
721 	}
722 	if (t->t_pdmsg) {
723 		kmem_free(t->t_pdmsg, strlen(t->t_pdmsg) + 1);
724 		t->t_pdmsg = NULL;
725 	}
726 	if (audit_active)
727 		audit_thread_free(t);
728 #ifndef NPROBE
729 	if (t->t_tnf_tpdp)
730 		tnf_thread_free(t);
731 #endif /* NPROBE */
732 	if (t->t_cldata) {
733 		CL_EXITCLASS(t->t_cid, (caddr_t *)t->t_cldata);
734 	}
735 	if (t->t_rprof != NULL) {
736 		kmem_free(t->t_rprof, sizeof (*t->t_rprof));
737 		t->t_rprof = NULL;
738 	}
739 	t->t_lockp = NULL;	/* nothing should try to lock this thread now */
740 	if (t->t_lwp)
741 		lwp_freeregs(t->t_lwp, 0);
742 	if (t->t_ctx)
743 		freectx(t, 0);
744 	t->t_stk = NULL;
745 	if (t->t_lwp)
746 		lwp_stk_fini(t->t_lwp);
747 	lock_clear(&t->t_lock);
748 
749 	if (t->t_ts->ts_waiters > 0)
750 		panic("thread_free: turnstile still active");
751 
752 	kmem_cache_free(turnstile_cache, t->t_ts);
753 
754 	free_afd(&t->t_activefd);
755 
756 	/*
757 	 * Barrier for the tick accounting code.  The tick accounting code
758 	 * holds this lock to keep the thread from going away while it's
759 	 * looking at it.
760 	 */
761 	thread_free_barrier(t);
762 
763 	ASSERT(ttoproj(t) == proj0p);
764 	project_rele(ttoproj(t));
765 
766 	lgrp_affinity_free(&t->t_lgrp_affinity);
767 
768 	/*
769 	 * Free thread struct and its stack.
770 	 */
771 	if (t->t_flag & T_TALLOCSTK) {
772 		/* thread struct is embedded in stack */
773 		segkp_release(segkp, t->t_swap);
774 		mutex_enter(&pidlock);
775 		nthread--;
776 		mutex_exit(&pidlock);
777 	} else {
778 		if (t->t_swap) {
779 			segkp_release(segkp, t->t_swap);
780 			t->t_swap = NULL;
781 		}
782 		if (t->t_lwp) {
783 			kmem_cache_free(lwp_cache, t->t_lwp);
784 			t->t_lwp = NULL;
785 		}
786 		mutex_enter(&pidlock);
787 		nthread--;
788 		mutex_exit(&pidlock);
789 		kmem_cache_free(thread_cache, t);
790 	}
791 }
792 
793 /*
794  * Removes threads associated with the given zone from a deathrow queue.
795  * tp is a pointer to the head of the deathrow queue, and countp is a
796  * pointer to the current deathrow count.  Returns a linked list of
797  * threads removed from the list.
798  */
799 static kthread_t *
800 thread_zone_cleanup(kthread_t **tp, int *countp, zoneid_t zoneid)
801 {
802 	kthread_t *tmp, *list = NULL;
803 	cred_t *cr;
804 
805 	ASSERT(MUTEX_HELD(&reaplock));
806 	while (*tp != NULL) {
807 		if ((cr = (*tp)->t_cred) != NULL && crgetzoneid(cr) == zoneid) {
808 			tmp = *tp;
809 			*tp = tmp->t_forw;
810 			tmp->t_forw = list;
811 			list = tmp;
812 			(*countp)--;
813 		} else {
814 			tp = &(*tp)->t_forw;
815 		}
816 	}
817 	return (list);
818 }
819 
820 static void
821 thread_reap_list(kthread_t *t)
822 {
823 	kthread_t *next;
824 
825 	while (t != NULL) {
826 		next = t->t_forw;
827 		thread_free(t);
828 		t = next;
829 	}
830 }
831 
832 /* ARGSUSED */
833 static void
834 thread_zone_destroy(zoneid_t zoneid, void *unused)
835 {
836 	kthread_t *t, *l;
837 
838 	mutex_enter(&reaplock);
839 	/*
840 	 * Pull threads and lwps associated with zone off deathrow lists.
841 	 */
842 	t = thread_zone_cleanup(&thread_deathrow, &thread_reapcnt, zoneid);
843 	l = thread_zone_cleanup(&lwp_deathrow, &lwp_reapcnt, zoneid);
844 	mutex_exit(&reaplock);
845 
846 	/*
847 	 * Guard against race condition in mutex_owner_running:
848 	 * 	thread=owner(mutex)
849 	 * 	<interrupt>
850 	 * 				thread exits mutex
851 	 * 				thread exits
852 	 * 				thread reaped
853 	 * 				thread struct freed
854 	 * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
855 	 * A cross call to all cpus will cause the interrupt handler
856 	 * to reset the PC if it is in mutex_owner_running, refreshing
857 	 * stale thread pointers.
858 	 */
859 	mutex_sync();   /* sync with mutex code */
860 
861 	/*
862 	 * Reap threads
863 	 */
864 	thread_reap_list(t);
865 
866 	/*
867 	 * Reap lwps
868 	 */
869 	thread_reap_list(l);
870 }
871 
872 /*
873  * cleanup zombie threads that are on deathrow.
874  */
875 void
876 thread_reaper()
877 {
878 	kthread_t *t, *l;
879 	callb_cpr_t cprinfo;
880 
881 	/*
882 	 * Register callback to clean up threads when zone is destroyed.
883 	 */
884 	zone_key_create(&zone_thread_key, NULL, NULL, thread_zone_destroy);
885 
886 	CALLB_CPR_INIT(&cprinfo, &reaplock, callb_generic_cpr, "t_reaper");
887 	for (;;) {
888 		mutex_enter(&reaplock);
889 		while (thread_deathrow == NULL && lwp_deathrow == NULL) {
890 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
891 			cv_wait(&reaper_cv, &reaplock);
892 			CALLB_CPR_SAFE_END(&cprinfo, &reaplock);
893 		}
894 		/*
895 		 * mutex_sync() needs to be called when reaping, but
896 		 * not too often.  We limit reaping rate to once
897 		 * per second.  Reaplimit is max rate at which threads can
898 		 * be freed. Does not impact thread destruction/creation.
899 		 */
900 		t = thread_deathrow;
901 		l = lwp_deathrow;
902 		thread_deathrow = NULL;
903 		lwp_deathrow = NULL;
904 		thread_reapcnt = 0;
905 		lwp_reapcnt = 0;
906 		mutex_exit(&reaplock);
907 
908 		/*
909 		 * Guard against race condition in mutex_owner_running:
910 		 * 	thread=owner(mutex)
911 		 * 	<interrupt>
912 		 * 				thread exits mutex
913 		 * 				thread exits
914 		 * 				thread reaped
915 		 * 				thread struct freed
916 		 * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
917 		 * A cross call to all cpus will cause the interrupt handler
918 		 * to reset the PC if it is in mutex_owner_running, refreshing
919 		 * stale thread pointers.
920 		 */
921 		mutex_sync();   /* sync with mutex code */
922 		/*
923 		 * Reap threads
924 		 */
925 		thread_reap_list(t);
926 
927 		/*
928 		 * Reap lwps
929 		 */
930 		thread_reap_list(l);
931 		delay(hz);
932 	}
933 }
934 
935 /*
936  * This is called by lwpcreate, etc.() to put a lwp_deathrow thread onto
937  * thread_deathrow. The thread's state is changed already TS_FREE to indicate
938  * that is reapable. The thread already holds the reaplock, and was already
939  * freed.
940  */
941 void
942 reapq_move_lq_to_tq(kthread_t *t)
943 {
944 	ASSERT(t->t_state == TS_FREE);
945 	ASSERT(MUTEX_HELD(&reaplock));
946 	t->t_forw = thread_deathrow;
947 	thread_deathrow = t;
948 	thread_reapcnt++;
949 	if (lwp_reapcnt + thread_reapcnt > reaplimit)
950 		cv_signal(&reaper_cv);  /* wake the reaper */
951 }
952 
953 /*
954  * This is called by resume() to put a zombie thread onto deathrow.
955  * The thread's state is changed to TS_FREE to indicate that is reapable.
956  * This is called from the idle thread so it must not block - just spin.
957  */
958 void
959 reapq_add(kthread_t *t)
960 {
961 	mutex_enter(&reaplock);
962 
963 	/*
964 	 * lwp_deathrow contains only threads with lwp linkage
965 	 * that are of the default stacksize. Anything else goes
966 	 * on thread_deathrow.
967 	 */
968 	if (ttolwp(t) && (t->t_flag & T_DFLTSTK)) {
969 		t->t_forw = lwp_deathrow;
970 		lwp_deathrow = t;
971 		lwp_reapcnt++;
972 	} else {
973 		t->t_forw = thread_deathrow;
974 		thread_deathrow = t;
975 		thread_reapcnt++;
976 	}
977 	if (lwp_reapcnt + thread_reapcnt > reaplimit)
978 		cv_signal(&reaper_cv);	/* wake the reaper */
979 	t->t_state = TS_FREE;
980 	lock_clear(&t->t_lock);
981 
982 	/*
983 	 * Before we return, we need to grab and drop the thread lock for
984 	 * the dead thread.  At this point, the current thread is the idle
985 	 * thread, and the dead thread's CPU lock points to the current
986 	 * CPU -- and we must grab and drop the lock to synchronize with
987 	 * a racing thread walking a blocking chain that the zombie thread
988 	 * was recently in.  By this point, that blocking chain is (by
989 	 * definition) stale:  the dead thread is not holding any locks, and
990 	 * is therefore not in any blocking chains -- but if we do not regrab
991 	 * our lock before freeing the dead thread's data structures, the
992 	 * thread walking the (stale) blocking chain will die on memory
993 	 * corruption when it attempts to drop the dead thread's lock.  We
994 	 * only need do this once because there is no way for the dead thread
995 	 * to ever again be on a blocking chain:  once we have grabbed and
996 	 * dropped the thread lock, we are guaranteed that anyone that could
997 	 * have seen this thread in a blocking chain can no longer see it.
998 	 */
999 	thread_lock(t);
1000 	thread_unlock(t);
1001 
1002 	mutex_exit(&reaplock);
1003 }
1004 
1005 /*
1006  * Install thread context ops for the current thread.
1007  */
1008 void
1009 installctx(
1010 	kthread_t *t,
1011 	void	*arg,
1012 	void	(*save)(void *),
1013 	void	(*restore)(void *),
1014 	void	(*fork)(void *, void *),
1015 	void	(*lwp_create)(void *, void *),
1016 	void	(*exit)(void *),
1017 	void	(*free)(void *, int))
1018 {
1019 	struct ctxop *ctx;
1020 
1021 	ctx = kmem_alloc(sizeof (struct ctxop), KM_SLEEP);
1022 	ctx->save_op = save;
1023 	ctx->restore_op = restore;
1024 	ctx->fork_op = fork;
1025 	ctx->lwp_create_op = lwp_create;
1026 	ctx->exit_op = exit;
1027 	ctx->free_op = free;
1028 	ctx->arg = arg;
1029 	ctx->next = t->t_ctx;
1030 	t->t_ctx = ctx;
1031 }
1032 
1033 /*
1034  * Remove the thread context ops from a thread.
1035  */
1036 int
1037 removectx(
1038 	kthread_t *t,
1039 	void	*arg,
1040 	void	(*save)(void *),
1041 	void	(*restore)(void *),
1042 	void	(*fork)(void *, void *),
1043 	void	(*lwp_create)(void *, void *),
1044 	void	(*exit)(void *),
1045 	void	(*free)(void *, int))
1046 {
1047 	struct ctxop *ctx, *prev_ctx;
1048 
1049 	/*
1050 	 * The incoming kthread_t (which is the thread for which the
1051 	 * context ops will be removed) should be one of the following:
1052 	 *
1053 	 * a) the current thread,
1054 	 *
1055 	 * b) a thread of a process that's being forked (SIDL),
1056 	 *
1057 	 * c) a thread that belongs to the same process as the current
1058 	 *    thread and for which the current thread is the agent thread,
1059 	 *
1060 	 * d) a thread that is TS_STOPPED which is indicative of it
1061 	 *    being (if curthread is not an agent) a thread being created
1062 	 *    as part of an lwp creation.
1063 	 */
1064 	ASSERT(t == curthread || ttoproc(t)->p_stat == SIDL ||
1065 	    ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1066 
1067 	/*
1068 	 * Serialize modifications to t->t_ctx to prevent the agent thread
1069 	 * and the target thread from racing with each other during lwp exit.
1070 	 */
1071 	mutex_enter(&t->t_ctx_lock);
1072 	prev_ctx = NULL;
1073 	for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) {
1074 		if (ctx->save_op == save && ctx->restore_op == restore &&
1075 		    ctx->fork_op == fork && ctx->lwp_create_op == lwp_create &&
1076 		    ctx->exit_op == exit && ctx->free_op == free &&
1077 		    ctx->arg == arg) {
1078 			if (prev_ctx)
1079 				prev_ctx->next = ctx->next;
1080 			else
1081 				t->t_ctx = ctx->next;
1082 			mutex_exit(&t->t_ctx_lock);
1083 			if (ctx->free_op != NULL)
1084 				(ctx->free_op)(ctx->arg, 0);
1085 			kmem_free(ctx, sizeof (struct ctxop));
1086 			return (1);
1087 		}
1088 		prev_ctx = ctx;
1089 	}
1090 	mutex_exit(&t->t_ctx_lock);
1091 
1092 	return (0);
1093 }
1094 
1095 void
1096 savectx(kthread_t *t)
1097 {
1098 	struct ctxop *ctx;
1099 
1100 	ASSERT(t == curthread);
1101 	for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
1102 		if (ctx->save_op != NULL)
1103 			(ctx->save_op)(ctx->arg);
1104 }
1105 
1106 void
1107 restorectx(kthread_t *t)
1108 {
1109 	struct ctxop *ctx;
1110 
1111 	ASSERT(t == curthread);
1112 	for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
1113 		if (ctx->restore_op != NULL)
1114 			(ctx->restore_op)(ctx->arg);
1115 }
1116 
1117 void
1118 forkctx(kthread_t *t, kthread_t *ct)
1119 {
1120 	struct ctxop *ctx;
1121 
1122 	for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1123 		if (ctx->fork_op != NULL)
1124 			(ctx->fork_op)(t, ct);
1125 }
1126 
1127 /*
1128  * Note that this operator is only invoked via the _lwp_create
1129  * system call.  The system may have other reasons to create lwps
1130  * e.g. the agent lwp or the doors unreferenced lwp.
1131  */
1132 void
1133 lwp_createctx(kthread_t *t, kthread_t *ct)
1134 {
1135 	struct ctxop *ctx;
1136 
1137 	for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1138 		if (ctx->lwp_create_op != NULL)
1139 			(ctx->lwp_create_op)(t, ct);
1140 }
1141 
1142 /*
1143  * exitctx is called from thread_exit() and lwp_exit() to perform any actions
1144  * needed when the thread/LWP leaves the processor for the last time. This
1145  * routine is not intended to deal with freeing memory; freectx() is used for
1146  * that purpose during thread_free(). This routine is provided to allow for
1147  * clean-up that can't wait until thread_free().
1148  */
1149 void
1150 exitctx(kthread_t *t)
1151 {
1152 	struct ctxop *ctx;
1153 
1154 	for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1155 		if (ctx->exit_op != NULL)
1156 			(ctx->exit_op)(t);
1157 }
1158 
1159 /*
1160  * freectx is called from thread_free() and exec() to get
1161  * rid of old thread context ops.
1162  */
1163 void
1164 freectx(kthread_t *t, int isexec)
1165 {
1166 	struct ctxop *ctx;
1167 
1168 	while ((ctx = t->t_ctx) != NULL) {
1169 		t->t_ctx = ctx->next;
1170 		if (ctx->free_op != NULL)
1171 			(ctx->free_op)(ctx->arg, isexec);
1172 		kmem_free(ctx, sizeof (struct ctxop));
1173 	}
1174 }
1175 
1176 /*
1177  * freectx_ctx is called from lwp_create() when lwp is reused from
1178  * lwp_deathrow and its thread structure is added to thread_deathrow.
1179  * The thread structure to which this ctx was attached may be already
1180  * freed by the thread reaper so free_op implementations shouldn't rely
1181  * on thread structure to which this ctx was attached still being around.
1182  */
1183 void
1184 freectx_ctx(struct ctxop *ctx)
1185 {
1186 	struct ctxop *nctx;
1187 
1188 	ASSERT(ctx != NULL);
1189 
1190 	do {
1191 		nctx = ctx->next;
1192 		if (ctx->free_op != NULL)
1193 			(ctx->free_op)(ctx->arg, 0);
1194 		kmem_free(ctx, sizeof (struct ctxop));
1195 	} while ((ctx = nctx) != NULL);
1196 }
1197 
1198 /*
1199  * Set the thread running; arrange for it to be swapped in if necessary.
1200  */
1201 void
1202 setrun_locked(kthread_t *t)
1203 {
1204 	ASSERT(THREAD_LOCK_HELD(t));
1205 	if (t->t_state == TS_SLEEP) {
1206 		/*
1207 		 * Take off sleep queue.
1208 		 */
1209 		SOBJ_UNSLEEP(t->t_sobj_ops, t);
1210 	} else if (t->t_state & (TS_RUN | TS_ONPROC)) {
1211 		/*
1212 		 * Already on dispatcher queue.
1213 		 */
1214 		return;
1215 	} else if (t->t_state == TS_WAIT) {
1216 		waitq_setrun(t);
1217 	} else if (t->t_state == TS_STOPPED) {
1218 		/*
1219 		 * All of the sending of SIGCONT (TC_XSTART) and /proc
1220 		 * (TC_PSTART) and lwp_continue() (TC_CSTART) must have
1221 		 * requested that the thread be run.
1222 		 * Just calling setrun() is not sufficient to set a stopped
1223 		 * thread running.  TP_TXSTART is always set if the thread
1224 		 * is not stopped by a jobcontrol stop signal.
1225 		 * TP_TPSTART is always set if /proc is not controlling it.
1226 		 * TP_TCSTART is always set if lwp_suspend() didn't stop it.
1227 		 * The thread won't be stopped unless one of these
1228 		 * three mechanisms did it.
1229 		 *
1230 		 * These flags must be set before calling setrun_locked(t).
1231 		 * They can't be passed as arguments because the streams
1232 		 * code calls setrun() indirectly and the mechanism for
1233 		 * doing so admits only one argument.  Note that the
1234 		 * thread must be locked in order to change t_schedflags.
1235 		 */
1236 		if ((t->t_schedflag & TS_ALLSTART) != TS_ALLSTART)
1237 			return;
1238 		/*
1239 		 * Process is no longer stopped (a thread is running).
1240 		 */
1241 		t->t_whystop = 0;
1242 		t->t_whatstop = 0;
1243 		/*
1244 		 * Strictly speaking, we do not have to clear these
1245 		 * flags here; they are cleared on entry to stop().
1246 		 * However, they are confusing when doing kernel
1247 		 * debugging or when they are revealed by ps(1).
1248 		 */
1249 		t->t_schedflag &= ~TS_ALLSTART;
1250 		THREAD_TRANSITION(t);	/* drop stopped-thread lock */
1251 		ASSERT(t->t_lockp == &transition_lock);
1252 		ASSERT(t->t_wchan0 == NULL && t->t_wchan == NULL);
1253 		/*
1254 		 * Let the class put the process on the dispatcher queue.
1255 		 */
1256 		CL_SETRUN(t);
1257 	}
1258 }
1259 
1260 void
1261 setrun(kthread_t *t)
1262 {
1263 	thread_lock(t);
1264 	setrun_locked(t);
1265 	thread_unlock(t);
1266 }
1267 
1268 /*
1269  * Unpin an interrupted thread.
1270  *	When an interrupt occurs, the interrupt is handled on the stack
1271  *	of an interrupt thread, taken from a pool linked to the CPU structure.
1272  *
1273  *	When swtch() is switching away from an interrupt thread because it
1274  *	blocked or was preempted, this routine is called to complete the
1275  *	saving of the interrupted thread state, and returns the interrupted
1276  *	thread pointer so it may be resumed.
1277  *
1278  *	Called by swtch() only at high spl.
1279  */
1280 kthread_t *
1281 thread_unpin()
1282 {
1283 	kthread_t	*t = curthread;	/* current thread */
1284 	kthread_t	*itp;		/* interrupted thread */
1285 	int		i;		/* interrupt level */
1286 	extern int	intr_passivate();
1287 
1288 	ASSERT(t->t_intr != NULL);
1289 
1290 	itp = t->t_intr;		/* interrupted thread */
1291 	t->t_intr = NULL;		/* clear interrupt ptr */
1292 
1293 	/*
1294 	 * Get state from interrupt thread for the one
1295 	 * it interrupted.
1296 	 */
1297 
1298 	i = intr_passivate(t, itp);
1299 
1300 	TRACE_5(TR_FAC_INTR, TR_INTR_PASSIVATE,
1301 	    "intr_passivate:level %d curthread %p (%T) ithread %p (%T)",
1302 	    i, t, t, itp, itp);
1303 
1304 	/*
1305 	 * Dissociate the current thread from the interrupted thread's LWP.
1306 	 */
1307 	t->t_lwp = NULL;
1308 
1309 	/*
1310 	 * Interrupt handlers above the level that spinlocks block must
1311 	 * not block.
1312 	 */
1313 #if DEBUG
1314 	if (i < 0 || i > LOCK_LEVEL)
1315 		cmn_err(CE_PANIC, "thread_unpin: ipl out of range %x", i);
1316 #endif
1317 
1318 	/*
1319 	 * Compute the CPU's base interrupt level based on the active
1320 	 * interrupts.
1321 	 */
1322 	ASSERT(CPU->cpu_intr_actv & (1 << i));
1323 	set_base_spl();
1324 
1325 	return (itp);
1326 }
1327 
1328 /*
1329  * Create and initialize an interrupt thread.
1330  *	Returns non-zero on error.
1331  *	Called at spl7() or better.
1332  */
1333 void
1334 thread_create_intr(struct cpu *cp)
1335 {
1336 	kthread_t *tp;
1337 
1338 	tp = thread_create(NULL, 0,
1339 	    (void (*)())thread_create_intr, NULL, 0, &p0, TS_ONPROC, 0);
1340 
1341 	/*
1342 	 * Set the thread in the TS_FREE state.  The state will change
1343 	 * to TS_ONPROC only while the interrupt is active.  Think of these
1344 	 * as being on a private free list for the CPU.  Being TS_FREE keeps
1345 	 * inactive interrupt threads out of debugger thread lists.
1346 	 *
1347 	 * We cannot call thread_create with TS_FREE because of the current
1348 	 * checks there for ONPROC.  Fix this when thread_create takes flags.
1349 	 */
1350 	THREAD_FREEINTR(tp, cp);
1351 
1352 	/*
1353 	 * Nobody should ever reference the credentials of an interrupt
1354 	 * thread so make it NULL to catch any such references.
1355 	 */
1356 	tp->t_cred = NULL;
1357 	tp->t_flag |= T_INTR_THREAD;
1358 	tp->t_cpu = cp;
1359 	tp->t_bound_cpu = cp;
1360 	tp->t_disp_queue = cp->cpu_disp;
1361 	tp->t_affinitycnt = 1;
1362 	tp->t_preempt = 1;
1363 
1364 	/*
1365 	 * Don't make a user-requested binding on this thread so that
1366 	 * the processor can be offlined.
1367 	 */
1368 	tp->t_bind_cpu = PBIND_NONE;	/* no USER-requested binding */
1369 	tp->t_bind_pset = PS_NONE;
1370 
1371 #if defined(__i386) || defined(__amd64)
1372 	tp->t_stk -= STACK_ALIGN;
1373 	*(tp->t_stk) = 0;		/* terminate intr thread stack */
1374 #endif
1375 
1376 	/*
1377 	 * Link onto CPU's interrupt pool.
1378 	 */
1379 	tp->t_link = cp->cpu_intr_thread;
1380 	cp->cpu_intr_thread = tp;
1381 }
1382 
/*
 * TSD -- THREAD SPECIFIC DATA
 *
 * Keys handed out by tsd_create() are 1-based indices into the
 * destructor array below; a key value of 0 means "not allocated".
 * Each thread that has stored at least one value owns a tsd_thread
 * holding its per-key value array, and is linked onto tsd_list.
 */
/*
 * Held by tsd_create(), tsd_destroy() and tsd_exit(), and by
 * tsd_agent_set() while it links a thread onto tsd_list or grows the
 * thread's value array.
 */
static kmutex_t		tsd_mutex;	 /* linked list spin lock */
static uint_t		tsd_nkeys;	 /* size of destructor array */
/* per-key destructor funcs; a NULL entry marks an unused key */
static void 		(**tsd_destructor)(void *);
/* list of tsd_thread's (doubly linked via ts_next/ts_prev) */
static struct tsd_thread	*tsd_list;
1392 
/*
 * Default destructor: deliberately does nothing.
 *	It exists because a NULL entry in the destructor table means the
 *	key is unused, so a key created without a destructor still needs
 *	a non-NULL function to mark it as allocated.
 */
/* ARGSUSED */
void
tsd_defaultdestructor(void *value)
{
}
1401 
1402 /*
1403  * Create a key (index into per thread array)
1404  *	Locks out tsd_create, tsd_destroy, and tsd_exit
1405  *	May allocate memory with lock held
1406  */
1407 void
1408 tsd_create(uint_t *keyp, void (*destructor)(void *))
1409 {
1410 	int	i;
1411 	uint_t	nkeys;
1412 
1413 	/*
1414 	 * if key is allocated, do nothing
1415 	 */
1416 	mutex_enter(&tsd_mutex);
1417 	if (*keyp) {
1418 		mutex_exit(&tsd_mutex);
1419 		return;
1420 	}
1421 	/*
1422 	 * find an unused key
1423 	 */
1424 	if (destructor == NULL)
1425 		destructor = tsd_defaultdestructor;
1426 
1427 	for (i = 0; i < tsd_nkeys; ++i)
1428 		if (tsd_destructor[i] == NULL)
1429 			break;
1430 
1431 	/*
1432 	 * if no unused keys, increase the size of the destructor array
1433 	 */
1434 	if (i == tsd_nkeys) {
1435 		if ((nkeys = (tsd_nkeys << 1)) == 0)
1436 			nkeys = 1;
1437 		tsd_destructor =
1438 		    (void (**)(void *))tsd_realloc((void *)tsd_destructor,
1439 		    (size_t)(tsd_nkeys * sizeof (void (*)(void *))),
1440 		    (size_t)(nkeys * sizeof (void (*)(void *))));
1441 		tsd_nkeys = nkeys;
1442 	}
1443 
1444 	/*
1445 	 * allocate the next available unused key
1446 	 */
1447 	tsd_destructor[i] = destructor;
1448 	*keyp = i + 1;
1449 	mutex_exit(&tsd_mutex);
1450 }
1451 
1452 /*
1453  * Destroy a key -- this is for unloadable modules
1454  *
1455  * Assumes that the caller is preventing tsd_set and tsd_get
1456  * Locks out tsd_create, tsd_destroy, and tsd_exit
1457  * May free memory with lock held
1458  */
1459 void
1460 tsd_destroy(uint_t *keyp)
1461 {
1462 	uint_t key;
1463 	struct tsd_thread *tsd;
1464 
1465 	/*
1466 	 * protect the key namespace and our destructor lists
1467 	 */
1468 	mutex_enter(&tsd_mutex);
1469 	key = *keyp;
1470 	*keyp = 0;
1471 
1472 	ASSERT(key <= tsd_nkeys);
1473 
1474 	/*
1475 	 * if the key is valid
1476 	 */
1477 	if (key != 0) {
1478 		uint_t k = key - 1;
1479 		/*
1480 		 * for every thread with TSD, call key's destructor
1481 		 */
1482 		for (tsd = tsd_list; tsd; tsd = tsd->ts_next) {
1483 			/*
1484 			 * no TSD for key in this thread
1485 			 */
1486 			if (key > tsd->ts_nkeys)
1487 				continue;
1488 			/*
1489 			 * call destructor for key
1490 			 */
1491 			if (tsd->ts_value[k] && tsd_destructor[k])
1492 				(*tsd_destructor[k])(tsd->ts_value[k]);
1493 			/*
1494 			 * reset value for key
1495 			 */
1496 			tsd->ts_value[k] = NULL;
1497 		}
1498 		/*
1499 		 * actually free the key (NULL destructor == unused)
1500 		 */
1501 		tsd_destructor[k] = NULL;
1502 	}
1503 
1504 	mutex_exit(&tsd_mutex);
1505 }
1506 
1507 /*
1508  * Quickly return the per thread value that was stored with the specified key
1509  * Assumes the caller is protecting key from tsd_create and tsd_destroy
1510  */
1511 void *
1512 tsd_get(uint_t key)
1513 {
1514 	return (tsd_agent_get(curthread, key));
1515 }
1516 
1517 /*
1518  * Set a per thread value indexed with the specified key
1519  */
1520 int
1521 tsd_set(uint_t key, void *value)
1522 {
1523 	return (tsd_agent_set(curthread, key, value));
1524 }
1525 
1526 /*
1527  * Like tsd_get(), except that the agent lwp can get the tsd of
1528  * another thread in the same process (the agent thread only runs when the
1529  * process is completely stopped by /proc), or syslwp is creating a new lwp.
1530  */
1531 void *
1532 tsd_agent_get(kthread_t *t, uint_t key)
1533 {
1534 	struct tsd_thread *tsd = t->t_tsd;
1535 
1536 	ASSERT(t == curthread ||
1537 	    ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1538 
1539 	if (key && tsd != NULL && key <= tsd->ts_nkeys)
1540 		return (tsd->ts_value[key - 1]);
1541 	return (NULL);
1542 }
1543 
1544 /*
1545  * Like tsd_set(), except that the agent lwp can set the tsd of
1546  * another thread in the same process, or syslwp can set the tsd
1547  * of a thread it's in the middle of creating.
1548  *
1549  * Assumes the caller is protecting key from tsd_create and tsd_destroy
1550  * May lock out tsd_destroy (and tsd_create), may allocate memory with
1551  * lock held
1552  */
1553 int
1554 tsd_agent_set(kthread_t *t, uint_t key, void *value)
1555 {
1556 	struct tsd_thread *tsd = t->t_tsd;
1557 
1558 	ASSERT(t == curthread ||
1559 	    ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1560 
1561 	if (key == 0)
1562 		return (EINVAL);
1563 	if (tsd == NULL)
1564 		tsd = t->t_tsd = kmem_zalloc(sizeof (*tsd), KM_SLEEP);
1565 	if (key <= tsd->ts_nkeys) {
1566 		tsd->ts_value[key - 1] = value;
1567 		return (0);
1568 	}
1569 
1570 	ASSERT(key <= tsd_nkeys);
1571 
1572 	/*
1573 	 * lock out tsd_destroy()
1574 	 */
1575 	mutex_enter(&tsd_mutex);
1576 	if (tsd->ts_nkeys == 0) {
1577 		/*
1578 		 * Link onto list of threads with TSD
1579 		 */
1580 		if ((tsd->ts_next = tsd_list) != NULL)
1581 			tsd_list->ts_prev = tsd;
1582 		tsd_list = tsd;
1583 	}
1584 
1585 	/*
1586 	 * Allocate thread local storage and set the value for key
1587 	 */
1588 	tsd->ts_value = tsd_realloc(tsd->ts_value,
1589 	    tsd->ts_nkeys * sizeof (void *),
1590 	    key * sizeof (void *));
1591 	tsd->ts_nkeys = key;
1592 	tsd->ts_value[key - 1] = value;
1593 	mutex_exit(&tsd_mutex);
1594 
1595 	return (0);
1596 }
1597 
1598 
1599 /*
1600  * Return the per thread value that was stored with the specified key
1601  *	If necessary, create the key and the value
1602  *	Assumes the caller is protecting *keyp from tsd_destroy
1603  */
1604 void *
1605 tsd_getcreate(uint_t *keyp, void (*destroy)(void *), void *(*allocate)(void))
1606 {
1607 	void *value;
1608 	uint_t key = *keyp;
1609 	struct tsd_thread *tsd = curthread->t_tsd;
1610 
1611 	if (tsd == NULL)
1612 		tsd = curthread->t_tsd = kmem_zalloc(sizeof (*tsd), KM_SLEEP);
1613 	if (key && key <= tsd->ts_nkeys && (value = tsd->ts_value[key - 1]))
1614 		return (value);
1615 	if (key == 0)
1616 		tsd_create(keyp, destroy);
1617 	(void) tsd_set(*keyp, value = (*allocate)());
1618 
1619 	return (value);
1620 }
1621 
1622 /*
1623  * Called from thread_exit() to run the destructor function for each tsd
1624  *	Locks out tsd_create and tsd_destroy
1625  *	Assumes that the destructor *DOES NOT* use tsd
1626  */
1627 void
1628 tsd_exit(void)
1629 {
1630 	int i;
1631 	struct tsd_thread *tsd = curthread->t_tsd;
1632 
1633 	if (tsd == NULL)
1634 		return;
1635 
1636 	if (tsd->ts_nkeys == 0) {
1637 		kmem_free(tsd, sizeof (*tsd));
1638 		curthread->t_tsd = NULL;
1639 		return;
1640 	}
1641 
1642 	/*
1643 	 * lock out tsd_create and tsd_destroy, call
1644 	 * the destructor, and mark the value as destroyed.
1645 	 */
1646 	mutex_enter(&tsd_mutex);
1647 
1648 	for (i = 0; i < tsd->ts_nkeys; i++) {
1649 		if (tsd->ts_value[i] && tsd_destructor[i])
1650 			(*tsd_destructor[i])(tsd->ts_value[i]);
1651 		tsd->ts_value[i] = NULL;
1652 	}
1653 
1654 	/*
1655 	 * remove from linked list of threads with TSD
1656 	 */
1657 	if (tsd->ts_next)
1658 		tsd->ts_next->ts_prev = tsd->ts_prev;
1659 	if (tsd->ts_prev)
1660 		tsd->ts_prev->ts_next = tsd->ts_next;
1661 	if (tsd_list == tsd)
1662 		tsd_list = tsd->ts_next;
1663 
1664 	mutex_exit(&tsd_mutex);
1665 
1666 	/*
1667 	 * free up the TSD
1668 	 */
1669 	kmem_free(tsd->ts_value, tsd->ts_nkeys * sizeof (void *));
1670 	kmem_free(tsd, sizeof (struct tsd_thread));
1671 	curthread->t_tsd = NULL;
1672 }
1673 
1674 /*
1675  * realloc
1676  */
1677 static void *
1678 tsd_realloc(void *old, size_t osize, size_t nsize)
1679 {
1680 	void *new;
1681 
1682 	new = kmem_zalloc(nsize, KM_SLEEP);
1683 	if (old) {
1684 		bcopy(old, new, osize);
1685 		kmem_free(old, osize);
1686 	}
1687 	return (new);
1688 }
1689 
1690 /*
1691  * Check to see if an interrupt thread might be active at a given ipl.
1692  * If so return true.
1693  * We must be conservative--it is ok to give a false yes, but a false no
1694  * will cause disaster.  (But if the situation changes after we check it is
1695  * ok--the caller is trying to ensure that an interrupt routine has been
1696  * exited).
1697  * This is used when trying to remove an interrupt handler from an autovector
1698  * list in avintr.c.
1699  */
1700 int
1701 intr_active(struct cpu *cp, int level)
1702 {
1703 	if (level <= LOCK_LEVEL)
1704 		return (cp->cpu_thread != cp->cpu_dispthread);
1705 	else
1706 		return (CPU_ON_INTR(cp));
1707 }
1708 
1709 /*
1710  * Return non-zero if an interrupt is being serviced.
1711  */
1712 int
1713 servicing_interrupt()
1714 {
1715 	int onintr = 0;
1716 
1717 	/* Are we an interrupt thread */
1718 	if (curthread->t_flag & T_INTR_THREAD)
1719 		return (1);
1720 	/* Are we servicing a high level interrupt? */
1721 	if (CPU_ON_INTR(CPU)) {
1722 		kpreempt_disable();
1723 		onintr = CPU_ON_INTR(CPU);
1724 		kpreempt_enable();
1725 	}
1726 	return (onintr);
1727 }
1728 
1729 
1730 /*
1731  * Change the dispatch priority of a thread in the system.
1732  * Used when raising or lowering a thread's priority.
1733  * (E.g., priority inheritance)
1734  *
1735  * Since threads are queued according to their priority, we
1736  * we must check the thread's state to determine whether it
1737  * is on a queue somewhere. If it is, we've got to:
1738  *
1739  *	o Dequeue the thread.
1740  *	o Change its effective priority.
1741  *	o Enqueue the thread.
1742  *
1743  * Assumptions: The thread whose priority we wish to change
1744  * must be locked before we call thread_change_(e)pri().
1745  * The thread_change(e)pri() function doesn't drop the thread
1746  * lock--that must be done by its caller.
1747  */
1748 void
1749 thread_change_epri(kthread_t *t, pri_t disp_pri)
1750 {
1751 	uint_t	state;
1752 
1753 	ASSERT(THREAD_LOCK_HELD(t));
1754 
1755 	/*
1756 	 * If the inherited priority hasn't actually changed,
1757 	 * just return.
1758 	 */
1759 	if (t->t_epri == disp_pri)
1760 		return;
1761 
1762 	state = t->t_state;
1763 
1764 	/*
1765 	 * If it's not on a queue, change the priority with impunity.
1766 	 */
1767 	if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
1768 		t->t_epri = disp_pri;
1769 		if (state == TS_ONPROC) {
1770 			cpu_t *cp = t->t_disp_queue->disp_cpu;
1771 
1772 			if (t == cp->cpu_dispthread)
1773 				cp->cpu_dispatch_pri = DISP_PRIO(t);
1774 		}
1775 	} else if (state == TS_SLEEP) {
1776 		/*
1777 		 * Take the thread out of its sleep queue.
1778 		 * Change the inherited priority.
1779 		 * Re-enqueue the thread.
1780 		 * Each synchronization object exports a function
1781 		 * to do this in an appropriate manner.
1782 		 */
1783 		SOBJ_CHANGE_EPRI(t->t_sobj_ops, t, disp_pri);
1784 	} else if (state == TS_WAIT) {
1785 		/*
1786 		 * Re-enqueue a thread on the wait queue if its
1787 		 * effective priority needs to change.
1788 		 */
1789 		if (disp_pri != t->t_epri)
1790 			waitq_change_pri(t, disp_pri);
1791 	} else {
1792 		/*
1793 		 * The thread is on a run queue.
1794 		 * Note: setbackdq() may not put the thread
1795 		 * back on the same run queue where it originally
1796 		 * resided.
1797 		 */
1798 		(void) dispdeq(t);
1799 		t->t_epri = disp_pri;
1800 		setbackdq(t);
1801 	}
1802 	schedctl_set_cidpri(t);
1803 }
1804 
1805 /*
1806  * Function: Change the t_pri field of a thread.
1807  * Side Effects: Adjust the thread ordering on a run queue
1808  *		 or sleep queue, if necessary.
1809  * Returns: 1 if the thread was on a run queue, else 0.
1810  */
1811 int
1812 thread_change_pri(kthread_t *t, pri_t disp_pri, int front)
1813 {
1814 	uint_t	state;
1815 	int	on_rq = 0;
1816 
1817 	ASSERT(THREAD_LOCK_HELD(t));
1818 
1819 	state = t->t_state;
1820 	THREAD_WILLCHANGE_PRI(t, disp_pri);
1821 
1822 	/*
1823 	 * If it's not on a queue, change the priority with impunity.
1824 	 */
1825 	if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
1826 		t->t_pri = disp_pri;
1827 
1828 		if (state == TS_ONPROC) {
1829 			cpu_t *cp = t->t_disp_queue->disp_cpu;
1830 
1831 			if (t == cp->cpu_dispthread)
1832 				cp->cpu_dispatch_pri = DISP_PRIO(t);
1833 		}
1834 	} else if (state == TS_SLEEP) {
1835 		/*
1836 		 * If the priority has changed, take the thread out of
1837 		 * its sleep queue and change the priority.
1838 		 * Re-enqueue the thread.
1839 		 * Each synchronization object exports a function
1840 		 * to do this in an appropriate manner.
1841 		 */
1842 		if (disp_pri != t->t_pri)
1843 			SOBJ_CHANGE_PRI(t->t_sobj_ops, t, disp_pri);
1844 	} else if (state == TS_WAIT) {
1845 		/*
1846 		 * Re-enqueue a thread on the wait queue if its
1847 		 * priority needs to change.
1848 		 */
1849 		if (disp_pri != t->t_pri)
1850 			waitq_change_pri(t, disp_pri);
1851 	} else {
1852 		/*
1853 		 * The thread is on a run queue.
1854 		 * Note: setbackdq() may not put the thread
1855 		 * back on the same run queue where it originally
1856 		 * resided.
1857 		 *
1858 		 * We still requeue the thread even if the priority
1859 		 * is unchanged to preserve round-robin (and other)
1860 		 * effects between threads of the same priority.
1861 		 */
1862 		on_rq = dispdeq(t);
1863 		ASSERT(on_rq);
1864 		t->t_pri = disp_pri;
1865 		if (front) {
1866 			setfrontdq(t);
1867 		} else {
1868 			setbackdq(t);
1869 		}
1870 	}
1871 	schedctl_set_cidpri(t);
1872 	return (on_rq);
1873 }
1874