xref: /freebsd/sys/kern/kern_thread.c (revision ce46e2059e16557a44be599f86de42c0e1a13220)
1 /*
2  * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
3  *  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice(s), this list of conditions and the following disclaimer as
10  *    the first lines of this file unmodified other than the possible
11  *    addition of one or more copyright notices.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice(s), this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
26  * DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/lock.h>
36 #include <sys/mutex.h>
37 #include <sys/proc.h>
38 #include <sys/smp.h>
39 #include <sys/sysctl.h>
40 #include <sys/sched.h>
41 #include <sys/sleepqueue.h>
42 #include <sys/turnstile.h>
43 #include <sys/ktr.h>
44 
45 #include <vm/vm.h>
46 #include <vm/vm_extern.h>
47 #include <vm/uma.h>
48 
49 /*
50  * KSEGRP related storage.
51  */
52 static uma_zone_t ksegrp_zone;
53 static uma_zone_t kse_zone;
54 static uma_zone_t thread_zone;
55 
56 /* DEBUG ONLY */
57 SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
58 static int thread_debug = 0;
59 SYSCTL_INT(_kern_threads, OID_AUTO, debug, CTLFLAG_RW,
60 	&thread_debug, 0, "thread debug");
61 
62 int max_threads_per_proc = 1500;
63 SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
64 	&max_threads_per_proc, 0, "Limit on threads per proc");
65 
66 int max_groups_per_proc = 500;
67 SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW,
68 	&max_groups_per_proc, 0, "Limit on thread groups per proc");
69 
70 int max_threads_hits;
71 SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD,
72 	&max_threads_hits, 0, "");
73 
74 int virtual_cpu;
75 
76 #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
77 
78 TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
79 TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses);
80 TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps);
81 struct mtx kse_zombie_lock;
82 MTX_SYSINIT(kse_zombie_lock, &kse_zombie_lock, "kse zombie lock", MTX_SPIN);
83 
84 void kse_purge(struct proc *p, struct thread *td);
85 void kse_purge_group(struct thread *td);
86 
87 /* move to proc.h */
88 extern void	kseinit(void);
89 extern void	kse_GC(void);
90 
91 
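/*
 * Sysctl handler for kern.threads.virtual_cpu: report the number of
 * virtual CPUs used by KSE (mp_ncpus while virtual_cpu is 0) and
 * accept a new, non-negative value.
 */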
92 static int
93 sysctl_kse_virtual_cpu(SYSCTL_HANDLER_ARGS)
94 {
95 	int error, new_val;
96 	int def_val;
97 
98 	def_val = mp_ncpus;
99 	if (virtual_cpu == 0)
100 		new_val = def_val;
101 	else
102 		new_val = virtual_cpu;
103 	error = sysctl_handle_int(oidp, &new_val, 0, req);
104 	if (error != 0 || req->newptr == NULL)
105 		return (error);
106 	if (new_val < 0)
107 		return (EINVAL);
108 	virtual_cpu = new_val;
109 	return (0);
110 }
111 
112 /* DEBUG ONLY */
113 SYSCTL_PROC(_kern_threads, OID_AUTO, virtual_cpu, CTLTYPE_INT|CTLFLAG_RW,
114 	0, sizeof(virtual_cpu), sysctl_kse_virtual_cpu, "I",
115 	"debug virtual cpus");
116 
117 /*
118  * Thread ID allocator. The allocator keeps track of assigned IDs by
119  * using a bitmap. The bitmap is created in parts. The parts are linked
120  * together.
121  */
122 typedef u_long tid_bitmap_word;
123 
124 #define	TID_IDS_PER_PART	1024
125 #define	TID_IDS_PER_IDX		(sizeof(tid_bitmap_word) << 3)
126 #define	TID_BITMAP_SIZE		(TID_IDS_PER_PART / TID_IDS_PER_IDX)
127 #define	TID_MIN			(PID_MAX + 1)
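/*
 * Illustration (assuming a 64-bit u_long and, say, a PID_MAX of 99999,
 * so TID_MIN is 100000): each part then covers 1024 IDs in 16 bitmap
 * words, and TID 100123 lives in the first part (bmp_base 100000) at
 * word index (100123 - 100000) / 64 = 1, bit (100123 - 100000) % 64 = 59.
 */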
128 
129 struct tid_bitmap_part {
130 	STAILQ_ENTRY(tid_bitmap_part) bmp_next;
131 	tid_bitmap_word	bmp_bitmap[TID_BITMAP_SIZE];
132 	lwpid_t		bmp_base;
133 	int		bmp_free;
134 };
135 
136 static STAILQ_HEAD(, tid_bitmap_part) tid_bitmap =
137     STAILQ_HEAD_INITIALIZER(tid_bitmap);
138 static uma_zone_t tid_zone;
139 
140 struct mtx tid_lock;
141 MTX_SYSINIT(tid_lock, &tid_lock, "TID lock", MTX_DEF);
142 
143 /*
144  * Prepare a thread for use.
145  */
146 static int
147 thread_ctor(void *mem, int size, void *arg, int flags)
148 {
149 	struct thread	*td;
150 
151 	td = (struct thread *)mem;
152 	td->td_state = TDS_INACTIVE;
153 	td->td_oncpu	= NOCPU;
154 
155 	/*
156 	 * Note that td_critnest begins life as 1 because the thread is not
157 	 * running and is thereby implicitly waiting to be on the receiving
158 	 * end of a context switch.  A context switch must occur inside a
159 	 * critical section, and in fact, includes hand-off of the sched_lock.
160 	 * After a context switch to a newly created thread, it will release
161 	 * sched_lock for the first time, and its td_critnest will hit 0 for
162 	 * the first time.  This happens on the far end of a context switch,
163 	 * and when it context switches away from itself, it will in fact go
164 	 * back into a critical section, and hand off the sched lock to the
165 	 * next thread.
166 	 */
167 	td->td_critnest = 1;
168 	return (0);
169 }
170 
171 /*
172  * Reclaim a thread after use.
173  */
174 static void
175 thread_dtor(void *mem, int size, void *arg)
176 {
177 	struct thread *td;
178 
179 	td = (struct thread *)mem;
180 
181 #ifdef INVARIANTS
182 	/* Verify that this thread is in a safe state to free. */
183 	switch (td->td_state) {
184 	case TDS_INHIBITED:
185 	case TDS_RUNNING:
186 	case TDS_CAN_RUN:
187 	case TDS_RUNQ:
188 		/*
189 		 * We must never unlink a thread that is in one of
190 		 * these states, because it is currently active.
191 		 */
192 		panic("bad state for thread unlinking");
193 		/* NOTREACHED */
194 	case TDS_INACTIVE:
195 		break;
196 	default:
197 		panic("bad thread state");
198 		/* NOTREACHED */
199 	}
200 #endif
201 }
202 
203 /*
204  * Initialize type-stable parts of a thread (when newly created).
205  */
206 static int
207 thread_init(void *mem, int size, int flags)
208 {
209 	struct thread *td;
210 	struct tid_bitmap_part *bmp, *new;
211 	int bit, idx;
212 
213 	td = (struct thread *)mem;
214 
215 	mtx_lock(&tid_lock);
216 	STAILQ_FOREACH(bmp, &tid_bitmap, bmp_next) {
217 		if (bmp->bmp_free)
218 			break;
219 	}
220 	/* Create a new bitmap if we run out of free bits. */
221 	if (bmp == NULL) {
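		/*
		 * Drop tid_lock across the M_WAITOK allocation, which may
		 * sleep.  Another thread may have added a part meanwhile,
		 * so re-check the tail of the list before installing the
		 * new one.
		 */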
222 		mtx_unlock(&tid_lock);
223 		new = uma_zalloc(tid_zone, M_WAITOK);
224 		mtx_lock(&tid_lock);
225 		bmp = STAILQ_LAST(&tid_bitmap, tid_bitmap_part, bmp_next);
226 		if (bmp == NULL || bmp->bmp_free < TID_IDS_PER_PART/2) {
227 			/* 1=free, 0=assigned. This way we can use ffsl(). */
228 			memset(new->bmp_bitmap, ~0U, sizeof(new->bmp_bitmap));
229 			new->bmp_base = (bmp == NULL) ? TID_MIN :
230 			    bmp->bmp_base + TID_IDS_PER_PART;
231 			new->bmp_free = TID_IDS_PER_PART;
232 			STAILQ_INSERT_TAIL(&tid_bitmap, new, bmp_next);
233 			bmp = new;
234 			new = NULL;
235 		}
236 	} else
237 		new = NULL;
238 	/* We have a bitmap with available IDs. */
239 	idx = 0;
240 	while (idx < TID_BITMAP_SIZE && bmp->bmp_bitmap[idx] == 0UL)
241 		idx++;
242 	bit = ffsl(bmp->bmp_bitmap[idx]) - 1;
243 	td->td_tid = bmp->bmp_base + idx * TID_IDS_PER_IDX + bit;
244 	bmp->bmp_bitmap[idx] &= ~(1UL << bit);
245 	bmp->bmp_free--;
246 	mtx_unlock(&tid_lock);
247 	if (new != NULL)
248 		uma_zfree(tid_zone, new);
249 
250 	vm_thread_new(td, 0);
251 	cpu_thread_setup(td);
252 	td->td_sleepqueue = sleepq_alloc();
253 	td->td_turnstile = turnstile_alloc();
254 	td->td_sched = (struct td_sched *)&td[1];
255 	return (0);
256 }
257 
258 /*
259  * Tear down type-stable parts of a thread (just before being discarded).
260  */
261 static void
262 thread_fini(void *mem, int size)
263 {
264 	struct thread *td;
265 	struct tid_bitmap_part *bmp;
266 	lwpid_t tid;
267 	int bit, idx;
268 
269 	td = (struct thread *)mem;
270 	turnstile_free(td->td_turnstile);
271 	sleepq_free(td->td_sleepqueue);
272 	vm_thread_dispose(td);
273 
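	/* Return the thread's TID to the bitmap part it came from. */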
274 	STAILQ_FOREACH(bmp, &tid_bitmap, bmp_next) {
275 		if (td->td_tid >= bmp->bmp_base &&
276 		    td->td_tid < bmp->bmp_base + TID_IDS_PER_PART)
277 			break;
278 	}
279 	KASSERT(bmp != NULL, ("No TID bitmap?"));
280 	mtx_lock(&tid_lock);
281 	tid = td->td_tid - bmp->bmp_base;
282 	idx = tid / TID_IDS_PER_IDX;
283 	bit = tid % TID_IDS_PER_IDX;
284 	bmp->bmp_bitmap[idx] |= (1UL << bit);
285 	bmp->bmp_free++;
286 	mtx_unlock(&tid_lock);
287 }
288 
289 /*
290  * Initialize type-stable parts of a kse (when newly created).
291  */
292 static int
293 kse_init(void *mem, int size, int flags)
294 {
295 	struct kse	*ke;
296 
297 	ke = (struct kse *)mem;
298 	ke->ke_sched = (struct ke_sched *)&ke[1];
299 	return (0);
300 }
301 
302 /*
303  * Initialize type-stable parts of a ksegrp (when newly created).
304  */
305 static int
306 ksegrp_init(void *mem, int size, int flags)
307 {
308 	struct ksegrp	*kg;
309 
310 	kg = (struct ksegrp *)mem;
311 	kg->kg_sched = (struct kg_sched *)&kg[1];
312 	return (0);
313 }
314 
315 /*
316  * Link a KSE into its ksegrp.
317  */
318 void
319 kse_link(struct kse *ke, struct ksegrp *kg)
320 {
321 	struct proc *p = kg->kg_proc;
322 
323 	TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
324 	kg->kg_kses++;
325 	ke->ke_state	= KES_UNQUEUED;
326 	ke->ke_proc	= p;
327 	ke->ke_ksegrp	= kg;
328 	ke->ke_thread	= NULL;
329 	ke->ke_oncpu	= NOCPU;
330 	ke->ke_flags	= 0;
331 }
332 
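/*
 * Remove a KSE from its ksegrp and stash it on the zombie list.
 * The caller must hold sched_lock.
 */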
333 void
334 kse_unlink(struct kse *ke)
335 {
336 	struct ksegrp *kg;
337 
338 	mtx_assert(&sched_lock, MA_OWNED);
339 	kg = ke->ke_ksegrp;
340 	TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
341 	if (ke->ke_state == KES_IDLE) {
342 		TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
343 		kg->kg_idle_kses--;
344 	}
345 	--kg->kg_kses;
346 	/*
347 	 * Aggregate stats from the KSE
348 	 */
349 	kse_stash(ke);
350 }
351 
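/*
 * Initialize a ksegrp's queues and counters and link it into a process.
 */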
352 void
353 ksegrp_link(struct ksegrp *kg, struct proc *p)
354 {
355 
356 	TAILQ_INIT(&kg->kg_threads);
357 	TAILQ_INIT(&kg->kg_runq);	/* links with td_runq */
358 	TAILQ_INIT(&kg->kg_slpq);	/* links with td_runq */
359 	TAILQ_INIT(&kg->kg_kseq);	/* all kses in ksegrp */
360 	TAILQ_INIT(&kg->kg_iq);		/* all idle kses in ksegrp */
361 	TAILQ_INIT(&kg->kg_upcalls);	/* all upcall structures in ksegrp */
362 	kg->kg_proc = p;
363 	/*
364 	 * the following counters are in the -zero- section
365 	 * and may not need clearing
366 	 */
367 	kg->kg_numthreads = 0;
368 	kg->kg_runnable   = 0;
369 	kg->kg_kses       = 0;
370 	kg->kg_runq_kses  = 0; /* XXXKSE change name */
371 	kg->kg_idle_kses  = 0;
372 	kg->kg_numupcalls = 0;
373 	/* link it in now that it's consistent */
374 	p->p_numksegrps++;
375 	TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
376 }
377 
378 void
379 ksegrp_unlink(struct ksegrp *kg)
380 {
381 	struct proc *p;
382 
383 	mtx_assert(&sched_lock, MA_OWNED);
384 	KASSERT((kg->kg_numthreads == 0), ("ksegrp_unlink: residual threads"));
385 	KASSERT((kg->kg_kses == 0), ("ksegrp_unlink: residual kses"));
386 	KASSERT((kg->kg_numupcalls == 0), ("ksegrp_unlink: residual upcalls"));
387 
388 	p = kg->kg_proc;
389 	TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
390 	p->p_numksegrps--;
391 	/*
392 	 * Aggregate stats from the KSE
393 	 */
394 	ksegrp_stash(kg);
395 }
396 
397 /*
398  * For a newly created process,
399  * link up all the structures and its initial threads etc.
400  */
401 void
402 proc_linkup(struct proc *p, struct ksegrp *kg,
403 	    struct kse *ke, struct thread *td)
404 {
405 
406 	TAILQ_INIT(&p->p_ksegrps);	     /* all ksegrps in proc */
407 	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
408 	TAILQ_INIT(&p->p_suspended);	     /* Threads suspended */
409 	p->p_numksegrps = 0;
410 	p->p_numthreads = 0;
411 
412 	ksegrp_link(kg, p);
413 	kse_link(ke, kg);
414 	thread_link(td, kg);
415 }
416 
417 /*
418  * Initialize global thread allocation resources.
419  */
420 void
421 threadinit(void)
422 {
423 
424 	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
425 	    thread_ctor, thread_dtor, thread_init, thread_fini,
426 	    UMA_ALIGN_CACHE, 0);
427 	tid_zone = uma_zcreate("TID", sizeof(struct tid_bitmap_part),
428 	    NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
429 	ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(),
430 	    NULL, NULL, ksegrp_init, NULL,
431 	    UMA_ALIGN_CACHE, 0);
432 	kse_zone = uma_zcreate("KSE", sched_sizeof_kse(),
433 	    NULL, NULL, kse_init, NULL,
434 	    UMA_ALIGN_CACHE, 0);
435 	kseinit();
436 }
437 
438 /*
439  * Stash an embarrassingly extra thread into the zombie thread queue.
440  */
441 void
442 thread_stash(struct thread *td)
443 {
444 	mtx_lock_spin(&kse_zombie_lock);
445 	TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq);
446 	mtx_unlock_spin(&kse_zombie_lock);
447 }
448 
449 /*
450  * Stash an embarrassingly extra kse into the zombie kse queue.
451  */
452 void
453 kse_stash(struct kse *ke)
454 {
455 	mtx_lock_spin(&kse_zombie_lock);
456 	TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq);
457 	mtx_unlock_spin(&kse_zombie_lock);
458 }
459 
460 /*
461  * Stash an embarrassingly extra ksegrp into the zombie ksegrp queue.
462  */
463 void
464 ksegrp_stash(struct ksegrp *kg)
465 {
466 	mtx_lock_spin(&kse_zombie_lock);
467 	TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp);
468 	mtx_unlock_spin(&kse_zombie_lock);
469 }
470 
471 /*
472  * Reap zombie thread, kse, and ksegrp resources.
473  */
474 void
475 thread_reap(void)
476 {
477 	struct thread *td_first, *td_next;
478 	struct kse *ke_first, *ke_next;
479 	struct ksegrp *kg_first, *kg_next;
480 
481 	/*
482 	 * Don't even bother to lock if none at this instant,
483 	 * we really don't care about the next instant..
484 	 */
485 	if ((!TAILQ_EMPTY(&zombie_threads))
486 	    || (!TAILQ_EMPTY(&zombie_kses))
487 	    || (!TAILQ_EMPTY(&zombie_ksegrps))) {
488 		mtx_lock_spin(&kse_zombie_lock);
489 		td_first = TAILQ_FIRST(&zombie_threads);
490 		ke_first = TAILQ_FIRST(&zombie_kses);
491 		kg_first = TAILQ_FIRST(&zombie_ksegrps);
492 		if (td_first)
493 			TAILQ_INIT(&zombie_threads);
494 		if (ke_first)
495 			TAILQ_INIT(&zombie_kses);
496 		if (kg_first)
497 			TAILQ_INIT(&zombie_ksegrps);
498 		mtx_unlock_spin(&kse_zombie_lock);
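		/*
		 * The zombie lists were detached above; free their former
		 * contents without holding the spin lock.
		 */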
499 		while (td_first) {
500 			td_next = TAILQ_NEXT(td_first, td_runq);
501 			if (td_first->td_ucred)
502 				crfree(td_first->td_ucred);
503 			thread_free(td_first);
504 			td_first = td_next;
505 		}
506 		while (ke_first) {
507 			ke_next = TAILQ_NEXT(ke_first, ke_procq);
508 			kse_free(ke_first);
509 			ke_first = ke_next;
510 		}
511 		while (kg_first) {
512 			kg_next = TAILQ_NEXT(kg_first, kg_ksegrp);
513 			ksegrp_free(kg_first);
514 			kg_first = kg_next;
515 		}
516 	}
517 	kse_GC();
518 }
519 
520 /*
521  * Allocate a ksegrp.
522  */
523 struct ksegrp *
524 ksegrp_alloc(void)
525 {
526 	return (uma_zalloc(ksegrp_zone, M_WAITOK));
527 }
528 
529 /*
530  * Allocate a kse.
531  */
532 struct kse *
533 kse_alloc(void)
534 {
535 	return (uma_zalloc(kse_zone, M_WAITOK));
536 }
537 
538 /*
539  * Allocate a thread.
540  */
541 struct thread *
542 thread_alloc(void)
543 {
544 	thread_reap(); /* check if any zombies to get */
545 	return (uma_zalloc(thread_zone, M_WAITOK));
546 }
547 
548 /*
549  * Deallocate a ksegrp.
550  */
551 void
552 ksegrp_free(struct ksegrp *td)
553 {
554 	uma_zfree(ksegrp_zone, td);
555 }
556 
557 /*
558  * Deallocate a kse.
559  */
560 void
561 kse_free(struct kse *td)
562 {
563 	uma_zfree(kse_zone, td);
564 }
565 
566 /*
567  * Deallocate a thread.
568  */
569 void
570 thread_free(struct thread *td)
571 {
572 
573 	cpu_thread_clean(td);
574 	uma_zfree(thread_zone, td);
575 }
576 
577 /*
578  * Discard the current thread and exit from its context.
579  * Always called with scheduler locked.
580  *
581  * Because we can't free a thread while we're operating under its context,
582  * push the current thread into our CPU's deadthread holder. This means
583  * we needn't worry about someone else grabbing our context before we
584  * do a cpu_throw().  This may not be needed now as we are under schedlock.
585  * Maybe we can just do a thread_stash() as thr_exit1 does.
586  */
587 /*  XXX
588  * libthr expects its thread exit to return for the last
589  * thread, meaning that the program is back to non-threaded
590  * mode I guess. Because we do this (cpu_throw) unconditionally
591  * here, they have their own version of it (thr_exit1())
592  * that doesn't do it all if this was the last thread.
593  * It is also called from thread_suspend_check().
594  * Of course in the end, they end up coming here through exit1()
595  * anyhow.  After fixing 'thr' to play by the rules we should be able
596  * to merge these two functions together.
597  */
598 void
599 thread_exit(void)
600 {
601 	struct thread *td;
602 	struct kse *ke;
603 	struct proc *p;
604 	struct ksegrp	*kg;
605 
606 	td = curthread;
607 	kg = td->td_ksegrp;
608 	p = td->td_proc;
609 	ke = td->td_kse;
610 
611 	mtx_assert(&sched_lock, MA_OWNED);
612 	KASSERT(p != NULL, ("thread exiting without a process"));
613 	KASSERT(ke != NULL, ("thread exiting without a kse"));
614 	KASSERT(kg != NULL, ("thread exiting without a kse group"));
615 	PROC_LOCK_ASSERT(p, MA_OWNED);
616 	CTR1(KTR_PROC, "thread_exit: thread %p", td);
617 	mtx_assert(&Giant, MA_NOTOWNED);
618 
619 	if (td->td_standin != NULL) {
620 		thread_stash(td->td_standin);
621 		td->td_standin = NULL;
622 	}
623 
624 	cpu_thread_exit(td);	/* XXXSMP */
625 
626 	/*
627 	 * The last thread is left attached to the process
628 	 * so that the whole bundle gets recycled. Skip
629 	 * all this stuff.
630 	 */
631 	if (p->p_numthreads > 1) {
632 		thread_unlink(td);
633 		if (p->p_maxthrwaits)
634 			wakeup(&p->p_numthreads);
635 		/*
636 		 * The test below is NOT true if we are the
637 	 * sole exiting thread. P_STOPPED_SINGLE is unset
638 		 * in exit1() after it is the only survivor.
639 		 */
640 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
641 			if (p->p_numthreads == p->p_suspcount) {
642 				thread_unsuspend_one(p->p_singlethread);
643 			}
644 		}
645 
646 		/*
647 		 * Because each upcall structure has an owner thread,
648 		 * and that owner exits only when the process is exiting,
649 		 * upcalls to userland are no longer needed, so deleting
650 		 * the upcall structure is safe here.
651 		 * Thus, when all threads in a group have exited, all
652 		 * upcalls in the group are automatically freed.
653 		 */
654 		if (td->td_upcall)
655 			upcall_remove(td);
656 
657 		sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
658 		sched_exit_kse(FIRST_KSE_IN_PROC(p), td);
659 		ke->ke_state = KES_UNQUEUED;
660 		ke->ke_thread = NULL;
661 		/*
662 		 * Decide what to do with the KSE attached to this thread.
663 		 */
664 		if (ke->ke_flags & KEF_EXIT) {
665 			kse_unlink(ke);
666 			if (kg->kg_kses == 0) {
667 				sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
668 				ksegrp_unlink(kg);
669 			}
670 		}
671 		else
672 			kse_reassign(ke);
673 		PROC_UNLOCK(p);
674 		td->td_kse	= NULL;
675 #if 0
676 		td->td_proc	= NULL;
677 #endif
678 		td->td_ksegrp	= NULL;
679 		td->td_last_kse	= NULL;
680 		PCPU_SET(deadthread, td);
681 	} else {
682 		PROC_UNLOCK(p);
683 	}
684 	td->td_state	= TDS_INACTIVE;
685 	/* XXX Shouldn't cpu_throw() here. */
686 	mtx_assert(&sched_lock, MA_OWNED);
687 	cpu_throw(td, choosethread());
688 	panic("I'm a teapot!");
689 	/* NOTREACHED */
690 }
691 
692 /*
693  * Do any thread-specific cleanups that may be needed in wait().
694  * Called with Giant, proc and schedlock not held.
695  */
696 void
697 thread_wait(struct proc *p)
698 {
699 	struct thread *td;
700 
701 	mtx_assert(&Giant, MA_NOTOWNED);
702 	KASSERT((p->p_numthreads == 1), ("Multiple threads in wait1()"));
703 	KASSERT((p->p_numksegrps == 1), ("Multiple ksegrps in wait1()"));
704 	FOREACH_THREAD_IN_PROC(p, td) {
705 		if (td->td_standin != NULL) {
706 			thread_free(td->td_standin);
707 			td->td_standin = NULL;
708 		}
709 		cpu_thread_clean(td);
710 	}
711 	thread_reap();	/* check for zombie threads etc. */
712 }
713 
714 /*
715  * Link a thread to a process.
716  * Set up anything that needs to be initialized for it to
717  * be used by the process.
718  *
719  * Note that we do not link to the proc's ucred here.
720  * The thread is linked as if running but with no KSE assigned.
721  */
722 void
723 thread_link(struct thread *td, struct ksegrp *kg)
724 {
725 	struct proc *p;
726 
727 	p = kg->kg_proc;
728 	td->td_state    = TDS_INACTIVE;
729 	td->td_proc     = p;
730 	td->td_ksegrp   = kg;
731 	td->td_last_kse = NULL;
732 	td->td_flags    = 0;
733 	td->td_kflags	= 0;
734 	td->td_kse      = NULL;
735 
736 	LIST_INIT(&td->td_contested);
737 	callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
738 	TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
739 	TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist);
740 	p->p_numthreads++;
741 	kg->kg_numthreads++;
742 }
743 
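/*
 * Remove a thread from its process and ksegrp.
 * The caller must hold sched_lock.
 */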
744 void
745 thread_unlink(struct thread *td)
746 {
747 	struct proc *p = td->td_proc;
748 	struct ksegrp *kg = td->td_ksegrp;
749 
750 	mtx_assert(&sched_lock, MA_OWNED);
751 	TAILQ_REMOVE(&p->p_threads, td, td_plist);
752 	p->p_numthreads--;
753 	TAILQ_REMOVE(&kg->kg_threads, td, td_kglist);
754 	kg->kg_numthreads--;
755 	/* could clear a few other things here */
756 }
757 
758 /*
759  * Purge a ksegrp's resources. When a ksegrp is preparing to
760  * exit, this function is called to release its idle KSEs.
761  */
762 void
763 kse_purge_group(struct thread *td)
764 {
765 	struct ksegrp *kg;
766 	struct kse *ke;
767 
768 	kg = td->td_ksegrp;
769  	KASSERT(kg->kg_numthreads == 1, ("%s: bad thread number", __func__));
770 	while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) {
771 		KASSERT(ke->ke_state == KES_IDLE,
772 			("%s: wrong idle KSE state", __func__));
773 		kse_unlink(ke);
774 	}
775 	KASSERT((kg->kg_kses == 1),
776 		("%s: ksegrp still has %d KSEs", __func__, kg->kg_kses));
777 	KASSERT((kg->kg_numupcalls == 0),
778 	        ("%s: ksegrp still has %d upcall structures",
779 		__func__, kg->kg_numupcalls));
780 }
781 
782 /*
783  * Purge a process's KSE resource. When a process is preparing to
784  * exit, it calls kse_purge to release any extra KSE resources in
785  * the process.
786  */
787 void
788 kse_purge(struct proc *p, struct thread *td)
789 {
790 	struct ksegrp *kg;
791 	struct kse *ke;
792 
793  	KASSERT(p->p_numthreads == 1, ("bad thread number"));
794 	while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) {
795 		TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
796 		p->p_numksegrps--;
797 		/*
798 		 * KSEs have no owner, so after all threads in the
799 		 * group have exited it is possible that some KSEs were
800 		 * left on the idle queue; gc them now.
801 		 */
802 		while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) {
803 			KASSERT(ke->ke_state == KES_IDLE,
804 			   ("%s: wrong idle KSE state", __func__));
805 			TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
806 			kg->kg_idle_kses--;
807 			TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
808 			kg->kg_kses--;
809 			kse_stash(ke);
810 		}
811 		KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) ||
812 		        ((kg->kg_kses == 1) && (kg == td->td_ksegrp)),
813 		        ("ksegrp has wrong kg_kses: %d", kg->kg_kses));
814 		KASSERT((kg->kg_numupcalls == 0),
815 		        ("%s: ksegrp still has %d upcall structures",
816 			__func__, kg->kg_numupcalls));
817 
818 		if (kg != td->td_ksegrp)
819 			ksegrp_stash(kg);
820 	}
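	/*
	 * The loop above emptied p_ksegrps; put the caller's own ksegrp,
	 * the only survivor, back on the list.
	 */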
821 	TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp);
822 	p->p_numksegrps++;
823 }
824 
825 /*
826  * Enforce single-threading.
827  *
828  * Returns 1 if the caller must abort (another thread is waiting to
829  * exit the process or similar). Process is locked!
830  * Returns 0 when you are successfully the only thread running.
831  * A process has successfully single threaded in the suspend mode when
832  * there are no threads in user mode. Threads in the kernel must be
833  * allowed to continue until they get to the user boundary. They may even
834  * copy out their return values and data before suspending. They may,
835  * however, be accelerated in reaching the user boundary, as we will wake
836  * up any sleeping threads that are interruptible (PCATCH).
837  */
838 int
839 thread_single(int force_exit)
840 {
841 	struct thread *td;
842 	struct thread *td2;
843 	struct proc *p;
844 	int remaining;
845 
846 	td = curthread;
847 	p = td->td_proc;
848 	mtx_assert(&Giant, MA_NOTOWNED);
849 	PROC_LOCK_ASSERT(p, MA_OWNED);
850 	KASSERT((td != NULL), ("curthread is NULL"));
851 
852 	if ((p->p_flag & P_SA) == 0 && p->p_numthreads == 1)
853 		return (0);
854 
855 	/* Is someone already single threading? */
856 	if (p->p_singlethread)
857 		return (1);
858 
859 	if (force_exit == SINGLE_EXIT) {
860 		p->p_flag |= P_SINGLE_EXIT;
861 	} else
862 		p->p_flag &= ~P_SINGLE_EXIT;
863 	p->p_flag |= P_STOPPED_SINGLE;
864 	mtx_lock_spin(&sched_lock);
865 	p->p_singlethread = td;
866 	if (force_exit == SINGLE_EXIT)
867 		remaining = p->p_numthreads;
868 	else
869 		remaining = p->p_numthreads - p->p_suspcount;
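	/*
	 * Loop until this thread is the only one left unsuspended (or,
	 * for SINGLE_EXIT, the only one left at all), nudging the others
	 * toward suspension or exit and suspending ourselves between
	 * passes.
	 */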
870 	while (remaining != 1) {
871 		FOREACH_THREAD_IN_PROC(p, td2) {
872 			if (td2 == td)
873 				continue;
874 			td2->td_flags |= TDF_ASTPENDING;
875 			if (TD_IS_INHIBITED(td2)) {
876 				if (force_exit == SINGLE_EXIT) {
877 					if (td->td_flags & TDF_DBSUSPEND)
878 						td->td_flags &= ~TDF_DBSUSPEND;
879 					if (TD_IS_SUSPENDED(td2)) {
880 						thread_unsuspend_one(td2);
881 					}
882 					if (TD_ON_SLEEPQ(td2) &&
883 					    (td2->td_flags & TDF_SINTR)) {
884 						sleepq_abort(td2);
885 					}
886 				} else {
887 					if (TD_IS_SUSPENDED(td2))
888 						continue;
889 					/*
890 					 * maybe other inhibited states too?
891 					 * XXXKSE Is it totally safe to
892 					 * suspend a non-interruptible thread?
893 					 */
894 					if (td2->td_inhibitors &
895 					    (TDI_SLEEPING | TDI_SWAPPED))
896 						thread_suspend_one(td2);
897 				}
898 			}
899 		}
900 		if (force_exit == SINGLE_EXIT)
901 			remaining = p->p_numthreads;
902 		else
903 			remaining = p->p_numthreads - p->p_suspcount;
904 
905 		/*
906 		 * Maybe we suspended some threads.. was it enough?
907 		 */
908 		if (remaining == 1)
909 			break;
910 
911 		/*
912 		 * Wake us up when everyone else has suspended.
913 		 * In the mean time we suspend as well.
914 		 */
915 		thread_suspend_one(td);
916 		PROC_UNLOCK(p);
917 		mi_switch(SW_VOL, NULL);
918 		mtx_unlock_spin(&sched_lock);
919 		PROC_LOCK(p);
920 		mtx_lock_spin(&sched_lock);
921 		if (force_exit == SINGLE_EXIT)
922 			remaining = p->p_numthreads;
923 		else
924 			remaining = p->p_numthreads - p->p_suspcount;
925 	}
926 	if (force_exit == SINGLE_EXIT) {
927 		if (td->td_upcall)
928 			upcall_remove(td);
929 		kse_purge(p, td);
930 	}
931 	mtx_unlock_spin(&sched_lock);
932 	return (0);
933 }
934 
935 /*
936  * Called in from locations that can safely check to see
937  * whether we have to suspend or at least throttle for a
938  * single-thread event (e.g. fork).
939  *
940  * Such locations include userret().
941  * If the "return_instead" argument is non-zero, the thread must be able to
942  * accept 0 (caller may continue), or 1 (caller must abort) as a result.
943  *
944  * The 'return_instead' argument tells the function if it may do a
945  * thread_exit() or suspend, or whether the caller must abort and back
946  * out instead.
947  *
948  * If the thread that set the single_threading request has set the
949  * P_SINGLE_EXIT bit in the process flags then this call will never return
950  * if 'return_instead' is false, but will exit.
951  *
952  * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
953  *---------------+--------------------+---------------------
954  *       0       | returns 0          |   returns 0 or 1
955  *               | when ST ends       |   immediately
956  *---------------+--------------------+---------------------
957  *       1       | thread exits       |   returns 1
958  *               |                    |  immediately
959  * 0 = thread_exit() or suspension ok,
960  * other = return error instead of stopping the thread.
961  *
962  * While a full suspension is under effect, even a single threading
963  * thread would be suspended if it made this call (but it shouldn't).
964  * This call should only be made from places where
965  * thread_exit() would be safe as that may be the outcome unless
966  * return_instead is set.
967  */
968 int
969 thread_suspend_check(int return_instead)
970 {
971 	struct thread *td;
972 	struct proc *p;
973 
974 	td = curthread;
975 	p = td->td_proc;
976 	mtx_assert(&Giant, MA_NOTOWNED);
977 	PROC_LOCK_ASSERT(p, MA_OWNED);
978 	while (P_SHOULDSTOP(p) ||
979 	      ((p->p_flag & P_TRACED) && (td->td_flags & TDF_DBSUSPEND))) {
980 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
981 			KASSERT(p->p_singlethread != NULL,
982 			    ("singlethread not set"));
983 			/*
984 			 * The only suspension in effect is
985 			 * single-threading; the single threader need not stop.
986 			 * XXX Should be safe to access unlocked
987 			 * as it can only be set to be true by us.
988 			 */
989 			if (p->p_singlethread == td)
990 				return (0);	/* Exempt from stopping. */
991 		}
992 		if (return_instead)
993 			return (1);
994 
995 		mtx_lock_spin(&sched_lock);
996 		thread_stopped(p);
997 		/*
998 		 * If the process is waiting for us to exit,
999 		 * this thread should just suicide.
1000 		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
1001 		 */
1002 		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
1003 			if (p->p_flag & P_SA)
1004 				thread_exit();
1005 			else
1006 				thr_exit1();
1007 		}
1008 
1009 		/*
1010 		 * When a thread suspends, it just
1011 		 * moves to the process's suspend queue
1012 		 * and stays there.
1013 		 */
1014 		thread_suspend_one(td);
1015 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1016 			if (p->p_numthreads == p->p_suspcount) {
1017 				thread_unsuspend_one(p->p_singlethread);
1018 			}
1019 		}
1020 		PROC_UNLOCK(p);
1021 		mi_switch(SW_INVOL, NULL);
1022 		mtx_unlock_spin(&sched_lock);
1023 		PROC_LOCK(p);
1024 	}
1025 	return (0);
1026 }
1027 
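/*
 * Suspend a single thread: mark it suspended and place it on the
 * process's suspend queue.  Called with sched_lock and the proc lock held.
 */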
1028 void
1029 thread_suspend_one(struct thread *td)
1030 {
1031 	struct proc *p = td->td_proc;
1032 
1033 	mtx_assert(&sched_lock, MA_OWNED);
1034 	PROC_LOCK_ASSERT(p, MA_OWNED);
1035 	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
1036 	p->p_suspcount++;
1037 	TD_SET_SUSPENDED(td);
1038 	TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq);
1039 	/*
1040 	 * Hack: If we are suspending but are on the sleep queue
1041 	 * then we are in msleep or the cv equivalent. We
1042 	 * want to look like we have two Inhibitors.
1043 	 * May already be set.. doesn't matter.
1044 	 */
1045 	if (TD_ON_SLEEPQ(td))
1046 		TD_SET_SLEEPING(td);
1047 }
1048 
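/*
 * Resume a single suspended thread: remove it from the suspend queue
 * and make it runnable again.  Called with sched_lock and the proc lock held.
 */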
1049 void
1050 thread_unsuspend_one(struct thread *td)
1051 {
1052 	struct proc *p = td->td_proc;
1053 
1054 	mtx_assert(&sched_lock, MA_OWNED);
1055 	PROC_LOCK_ASSERT(p, MA_OWNED);
1056 	TAILQ_REMOVE(&p->p_suspended, td, td_runq);
1057 	TD_CLR_SUSPENDED(td);
1058 	p->p_suspcount--;
1059 	setrunnable(td);
1060 }
1061 
1062 /*
1063  * Allow all threads blocked by single threading to continue running.
1064  */
1065 void
1066 thread_unsuspend(struct proc *p)
1067 {
1068 	struct thread *td;
1069 
1070 	mtx_assert(&sched_lock, MA_OWNED);
1071 	PROC_LOCK_ASSERT(p, MA_OWNED);
1072 	if (!P_SHOULDSTOP(p)) {
1073 		while ((td = TAILQ_FIRST(&p->p_suspended))) {
1074 			thread_unsuspend_one(td);
1075 		}
1076 	} else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
1077 	    (p->p_numthreads == p->p_suspcount)) {
1078 		/*
1079 		 * Stopping everything also did the job for the single
1080 		 * threading request. Now we've downgraded to single-threaded,
1081 		 * let it continue.
1082 		 */
1083 		thread_unsuspend_one(p->p_singlethread);
1084 	}
1085 }
1086 
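/*
 * End the single-threading mode begun by thread_single() and, unless
 * the whole process is stopped, let any suspended threads run again.
 */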
1087 void
1088 thread_single_end(void)
1089 {
1090 	struct thread *td;
1091 	struct proc *p;
1092 
1093 	td = curthread;
1094 	p = td->td_proc;
1095 	PROC_LOCK_ASSERT(p, MA_OWNED);
1096 	p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT);
1097 	mtx_lock_spin(&sched_lock);
1098 	p->p_singlethread = NULL;
1099 	/*
1100 	 * If there are other threads they may now run,
1101 	 * unless of course there is a blanket 'stop order'
1102 	 * on the process. The single threader must be allowed
1103 	 * to continue, however, as this is a bad place to stop.
1104 	 */
1105 	if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
1106 		while ((td = TAILQ_FIRST(&p->p_suspended))) {
1107 			thread_unsuspend_one(td);
1108 		}
1109 	}
1110 	mtx_unlock_spin(&sched_lock);
1111 }
1112 
1113