xref: /freebsd/sys/kern/kern_thread.c (revision 6b3455a7665208c366849f0b2b3bc916fb97516e)
1 /*
2  * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
3  *  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice(s), this list of conditions and the following disclaimer as
10  *    the first lines of this file unmodified other than the possible
11  *    addition of one or more copyright notices.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice(s), this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
26  * DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/lock.h>
36 #include <sys/mutex.h>
37 #include <sys/proc.h>
38 #include <sys/smp.h>
39 #include <sys/sysctl.h>
40 #include <sys/sched.h>
41 #include <sys/sleepqueue.h>
42 #include <sys/turnstile.h>
43 #include <sys/ktr.h>
44 
45 #include <vm/vm.h>
46 #include <vm/vm_extern.h>
47 #include <vm/uma.h>
48 
49 /*
50  * KSEGRP related storage.
51  */
52 static uma_zone_t ksegrp_zone;
53 static uma_zone_t kse_zone;
54 static uma_zone_t thread_zone;
55 
56 /* DEBUG ONLY */
57 SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
58 static int thread_debug = 0;
59 SYSCTL_INT(_kern_threads, OID_AUTO, debug, CTLFLAG_RW,
60 	&thread_debug, 0, "thread debug");
61 
62 int max_threads_per_proc = 1500;
63 SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
64 	&max_threads_per_proc, 0, "Limit on threads per proc");
65 
66 int max_groups_per_proc = 500;
67 SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW,
68 	&max_groups_per_proc, 0, "Limit on thread groups per proc");
69 
70 int max_threads_hits;
71 SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD,
72 	&max_threads_hits, 0, "Number of times the per-process thread limit was hit");
73 
74 int virtual_cpu;
75 
76 #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
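
/*
 * Illustrative use (a sketch, not code from this file): RANGEOF() gives the
 * byte length between two members, so callers elsewhere in the kernel can
 * clear a contiguous run of structure fields in one go, e.g.
 *
 *	bzero(&p->p_startzero,
 *	    (unsigned)RANGEOF(struct proc, p_startzero, p_endzero));
 *
 * where p_startzero/p_endzero are assumed to be marker members bracketing
 * the range to be zeroed.
 */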
77 
78 TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
79 TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses);
80 TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps);
81 struct mtx kse_zombie_lock;
82 MTX_SYSINIT(kse_zombie_lock, &kse_zombie_lock, "kse zombie lock", MTX_SPIN);
83 
84 void kse_purge(struct proc *p, struct thread *td);
85 void kse_purge_group(struct thread *td);
86 
87 /* move to proc.h */
88 extern void	kseinit(void);
89 extern void	kse_GC(void);
90 
91 
92 static int
93 sysctl_kse_virtual_cpu(SYSCTL_HANDLER_ARGS)
94 {
95 	int error, new_val;
96 	int def_val;
97 
98 	def_val = mp_ncpus;
99 	if (virtual_cpu == 0)
100 		new_val = def_val;
101 	else
102 		new_val = virtual_cpu;
103 	error = sysctl_handle_int(oidp, &new_val, 0, req);
104 	if (error != 0 || req->newptr == NULL)
105 		return (error);
106 	if (new_val < 0)
107 		return (EINVAL);
108 	virtual_cpu = new_val;
109 	return (0);
110 }
111 
112 /* DEBUG ONLY */
113 SYSCTL_PROC(_kern_threads, OID_AUTO, virtual_cpu, CTLTYPE_INT|CTLFLAG_RW,
114 	0, sizeof(virtual_cpu), sysctl_kse_virtual_cpu, "I",
115 	"debug virtual cpus");
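
/*
 * Illustrative usage of the knob above from userland (assuming the usual
 * sysctl(8) tool):
 *
 *	sysctl kern.threads.virtual_cpu		(read; 0 means "use mp_ncpus")
 *	sysctl kern.threads.virtual_cpu=4	(override the debug value)
 *
 * The handler rejects negative values with EINVAL.
 */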
116 
117 /*
118  * Thread ID allocator. The allocator keeps track of assigned IDs by
119  * using a bitmap. The bitmap is created in parts. The parts are linked
120  * together.
121  */
122 typedef u_long tid_bitmap_word;
123 
124 #define	TID_IDS_PER_PART	1024
125 #define	TID_IDS_PER_IDX		(sizeof(tid_bitmap_word) << 3)
126 #define	TID_BITMAP_SIZE		(TID_IDS_PER_PART / TID_IDS_PER_IDX)
127 #define	TID_MIN			(PID_MAX + 1)
128 
129 struct tid_bitmap_part {
130 	STAILQ_ENTRY(tid_bitmap_part) bmp_next;
131 	tid_bitmap_word	bmp_bitmap[TID_BITMAP_SIZE];
132 	lwpid_t		bmp_base;
133 	int		bmp_free;
134 };
135 
136 static STAILQ_HEAD(, tid_bitmap_part) tid_bitmap =
137     STAILQ_HEAD_INITIALIZER(tid_bitmap);
138 static uma_zone_t tid_zone;
139 
140 struct mtx tid_lock;
141 MTX_SYSINIT(tid_lock, &tid_lock, "TID lock", MTX_DEF);
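
/*
 * Worked example (illustrative): with a 64-bit u_long, each bitmap part
 * covers TID_IDS_PER_PART (1024) IDs as TID_BITMAP_SIZE (16) words of
 * TID_IDS_PER_IDX (64) bits.  A tid is located within its part by
 *
 *	off = tid - bmp->bmp_base;
 *	idx = off / TID_IDS_PER_IDX;		(which bitmap word)
 *	bit = off % TID_IDS_PER_IDX;		(which bit in that word)
 *
 * and conversely tid = bmp->bmp_base + idx * TID_IDS_PER_IDX + bit, which
 * is the arithmetic used by thread_init() and thread_fini() below.
 */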
142 
143 /*
144  * Prepare a thread for use.
145  */
146 static void
147 thread_ctor(void *mem, int size, void *arg)
148 {
149 	struct thread	*td;
150 
151 	td = (struct thread *)mem;
152 	td->td_state = TDS_INACTIVE;
153 	td->td_oncpu	= NOCPU;
154 
155 	/*
156 	 * Note that td_critnest begins life as 1 because the thread is not
157 	 * running and is thereby implicitly waiting to be on the receiving
158 	 * end of a context switch.  A context switch must occur inside a
159 	 * critical section, and in fact, includes hand-off of the sched_lock.
160 	 * After a context switch to a newly created thread, it will release
161 	 * sched_lock for the first time, and its td_critnest will hit 0 for
162 	 * the first time.  This happens on the far end of a context switch,
163 	 * and when it context switches away from itself, it will in fact go
164 	 * back into a critical section, and hand off the sched lock to the
165 	 * next thread.
166 	 */
167 	td->td_critnest = 1;
168 }
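
/*
 * For reference (a sketch, not part of the ctor): td_critnest is the same
 * nesting count maintained by critical_enter()/critical_exit(), e.g.
 *
 *	critical_enter();	(td_critnest 0 -> 1, preemption deferred)
 *	... work that must not be switched away from ...
 *	critical_exit();	(td_critnest 1 -> 0, preemption allowed again)
 *
 * Starting a new thread at 1 makes it look as if it is already inside the
 * critical section that every context switch runs in.
 */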
169 
170 /*
171  * Reclaim a thread after use.
172  */
173 static void
174 thread_dtor(void *mem, int size, void *arg)
175 {
176 	struct thread *td;
177 
178 	td = (struct thread *)mem;
179 
180 #ifdef INVARIANTS
181 	/* Verify that this thread is in a safe state to free. */
182 	switch (td->td_state) {
183 	case TDS_INHIBITED:
184 	case TDS_RUNNING:
185 	case TDS_CAN_RUN:
186 	case TDS_RUNQ:
187 		/*
188 		 * We must never unlink a thread that is in one of
189 		 * these states, because it is currently active.
190 		 */
191 		panic("bad state for thread unlinking");
192 		/* NOTREACHED */
193 	case TDS_INACTIVE:
194 		break;
195 	default:
196 		panic("bad thread state");
197 		/* NOTREACHED */
198 	}
199 #endif
200 }
201 
202 /*
203  * Initialize type-stable parts of a thread (when newly created).
204  */
205 static void
206 thread_init(void *mem, int size)
207 {
208 	struct thread *td;
209 	struct tid_bitmap_part *bmp, *new;
210 	int bit, idx;
211 
212 	td = (struct thread *)mem;
213 
214 	mtx_lock(&tid_lock);
215 	STAILQ_FOREACH(bmp, &tid_bitmap, bmp_next) {
216 		if (bmp->bmp_free)
217 			break;
218 	}
219 	/* Create a new bitmap if we run out of free bits. */
220 	if (bmp == NULL) {
221 		mtx_unlock(&tid_lock);
222 		new = uma_zalloc(tid_zone, M_WAITOK);
223 		mtx_lock(&tid_lock);
224 		bmp = STAILQ_LAST(&tid_bitmap, tid_bitmap_part, bmp_next);
225 		if (bmp == NULL || bmp->bmp_free < TID_IDS_PER_PART/2) {
226 			/* 1=free, 0=assigned. This way we can use ffsl(). */
227 			memset(new->bmp_bitmap, ~0U, sizeof(new->bmp_bitmap));
228 			new->bmp_base = (bmp == NULL) ? TID_MIN :
229 			    bmp->bmp_base + TID_IDS_PER_PART;
230 			new->bmp_free = TID_IDS_PER_PART;
231 			STAILQ_INSERT_TAIL(&tid_bitmap, new, bmp_next);
232 			bmp = new;
233 			new = NULL;
234 		}
235 	} else
236 		new = NULL;
237 	/* We have a bitmap with available IDs. */
238 	idx = 0;
239 	while (idx < TID_BITMAP_SIZE && bmp->bmp_bitmap[idx] == 0UL)
240 		idx++;
241 	bit = ffsl(bmp->bmp_bitmap[idx]) - 1;
242 	td->td_tid = bmp->bmp_base + idx * TID_IDS_PER_IDX + bit;
243 	bmp->bmp_bitmap[idx] &= ~(1UL << bit);
244 	bmp->bmp_free--;
245 	mtx_unlock(&tid_lock);
246 	if (new != NULL)
247 		uma_zfree(tid_zone, new);
248 
249 	vm_thread_new(td, 0);
250 	cpu_thread_setup(td);
251 	td->td_sleepqueue = sleepq_alloc();
252 	td->td_turnstile = turnstile_alloc();
253 	td->td_sched = (struct td_sched *)&td[1];
254 }
255 
256 /*
257  * Tear down type-stable parts of a thread (just before being discarded).
258  */
259 static void
260 thread_fini(void *mem, int size)
261 {
262 	struct thread *td;
263 	struct tid_bitmap_part *bmp;
264 	lwpid_t tid;
265 	int bit, idx;
266 
267 	td = (struct thread *)mem;
268 	turnstile_free(td->td_turnstile);
269 	sleepq_free(td->td_sleepqueue);
270 	vm_thread_dispose(td);
271 
272 	STAILQ_FOREACH(bmp, &tid_bitmap, bmp_next) {
273 		if (td->td_tid >= bmp->bmp_base &&
274 		    td->td_tid < bmp->bmp_base + TID_IDS_PER_PART)
275 			break;
276 	}
277 	KASSERT(bmp != NULL, ("No TID bitmap?"));
278 	mtx_lock(&tid_lock);
279 	tid = td->td_tid - bmp->bmp_base;
280 	idx = tid / TID_IDS_PER_IDX;
281 	bit = tid % TID_IDS_PER_IDX;
282 	bmp->bmp_bitmap[idx] |= (1UL << bit);
283 	bmp->bmp_free++;
284 	mtx_unlock(&tid_lock);
285 }
286 
287 /*
288  * Initialize type-stable parts of a kse (when newly created).
289  */
290 static void
291 kse_init(void *mem, int size)
292 {
293 	struct kse	*ke;
294 
295 	ke = (struct kse *)mem;
296 	ke->ke_sched = (struct ke_sched *)&ke[1];
297 }
298 
299 /*
300  * Initialize type-stable parts of a ksegrp (when newly created).
301  */
302 static void
303 ksegrp_init(void *mem, int size)
304 {
305 	struct ksegrp	*kg;
306 
307 	kg = (struct ksegrp *)mem;
308 	kg->kg_sched = (struct kg_sched *)&kg[1];
309 }
310 
311 /*
312  * Link a KSE into its KSE group.
313  */
314 void
315 kse_link(struct kse *ke, struct ksegrp *kg)
316 {
317 	struct proc *p = kg->kg_proc;
318 
319 	TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
320 	kg->kg_kses++;
321 	ke->ke_state	= KES_UNQUEUED;
322 	ke->ke_proc	= p;
323 	ke->ke_ksegrp	= kg;
324 	ke->ke_thread	= NULL;
325 	ke->ke_oncpu	= NOCPU;
326 	ke->ke_flags	= 0;
327 }
328 
329 void
330 kse_unlink(struct kse *ke)
331 {
332 	struct ksegrp *kg;
333 
334 	mtx_assert(&sched_lock, MA_OWNED);
335 	kg = ke->ke_ksegrp;
336 	TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
337 	if (ke->ke_state == KES_IDLE) {
338 		TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
339 		kg->kg_idle_kses--;
340 	}
341 	--kg->kg_kses;
342 	/*
343 	 * Stash the KSE on the zombie list for later reaping.
344 	 */
345 	kse_stash(ke);
346 }
347 
348 void
349 ksegrp_link(struct ksegrp *kg, struct proc *p)
350 {
351 
352 	TAILQ_INIT(&kg->kg_threads);
353 	TAILQ_INIT(&kg->kg_runq);	/* links with td_runq */
354 	TAILQ_INIT(&kg->kg_slpq);	/* links with td_runq */
355 	TAILQ_INIT(&kg->kg_kseq);	/* all kses in ksegrp */
356 	TAILQ_INIT(&kg->kg_iq);		/* all idle kses in ksegrp */
357 	TAILQ_INIT(&kg->kg_upcalls);	/* all upcall structures in ksegrp */
358 	kg->kg_proc = p;
359 	/*
360 	 * The following counters are in the -zero- section
361 	 * and may not need clearing.
362 	 */
363 	kg->kg_numthreads = 0;
364 	kg->kg_runnable   = 0;
365 	kg->kg_kses       = 0;
366 	kg->kg_runq_kses  = 0; /* XXXKSE change name */
367 	kg->kg_idle_kses  = 0;
368 	kg->kg_numupcalls = 0;
369 	/* link it in now that it's consistent */
370 	p->p_numksegrps++;
371 	TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
372 }
373 
374 void
375 ksegrp_unlink(struct ksegrp *kg)
376 {
377 	struct proc *p;
378 
379 	mtx_assert(&sched_lock, MA_OWNED);
380 	KASSERT((kg->kg_numthreads == 0), ("ksegrp_unlink: residual threads"));
381 	KASSERT((kg->kg_kses == 0), ("ksegrp_unlink: residual kses"));
382 	KASSERT((kg->kg_numupcalls == 0), ("ksegrp_unlink: residual upcalls"));
383 
384 	p = kg->kg_proc;
385 	TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
386 	p->p_numksegrps--;
387 	/*
388 	 * Stash the ksegrp on the zombie list for later reaping.
389 	 */
390 	ksegrp_stash(kg);
391 }
392 
393 /*
394  * For a newly created process, link up all the structures:
395  * the initial KSE group, KSE and thread.
396  */
397 void
398 proc_linkup(struct proc *p, struct ksegrp *kg,
399 	    struct kse *ke, struct thread *td)
400 {
401 
402 	TAILQ_INIT(&p->p_ksegrps);	     /* all ksegrps in proc */
403 	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
404 	TAILQ_INIT(&p->p_suspended);	     /* Threads suspended */
405 	p->p_numksegrps = 0;
406 	p->p_numthreads = 0;
407 
408 	ksegrp_link(kg, p);
409 	kse_link(ke, kg);
410 	thread_link(td, kg);
411 }
412 
413 /*
414  * Initialize global thread allocation resources.
415  */
416 void
417 threadinit(void)
418 {
419 
420 	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
421 	    thread_ctor, thread_dtor, thread_init, thread_fini,
422 	    UMA_ALIGN_CACHE, 0);
423 	tid_zone = uma_zcreate("TID", sizeof(struct tid_bitmap_part),
424 	    NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
425 	ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(),
426 	    NULL, NULL, ksegrp_init, NULL,
427 	    UMA_ALIGN_CACHE, 0);
428 	kse_zone = uma_zcreate("KSE", sched_sizeof_kse(),
429 	    NULL, NULL, kse_init, NULL,
430 	    UMA_ALIGN_CACHE, 0);
431 	kseinit();
432 }
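
/*
 * A note on the zone hooks above (a sketch of the uma(9) contract):
 * ctor/dtor run on every uma_zalloc()/uma_zfree() of an item, while
 * init/fini run only when an item first enters, or finally leaves, the
 * zone.  So, roughly:
 *
 *	td = thread_alloc();	uma_zalloc: thread_init (new item only),
 *				then thread_ctor
 *	thread_free(td);	uma_zfree: thread_dtor; thread_fini runs
 *				later, when UMA reclaims the item
 *
 * which is why the expensive, type-stable state (stack, sleep queue,
 * turnstile, TID) lives in thread_init()/thread_fini().
 */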
433 
434 /*
435  * Stash an embarrassingly extra thread into the zombie thread queue.
436  */
437 void
438 thread_stash(struct thread *td)
439 {
440 	mtx_lock_spin(&kse_zombie_lock);
441 	TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq);
442 	mtx_unlock_spin(&kse_zombie_lock);
443 }
444 
445 /*
446  * Stash an embarrassingly extra kse into the zombie kse queue.
447  */
448 void
449 kse_stash(struct kse *ke)
450 {
451 	mtx_lock_spin(&kse_zombie_lock);
452 	TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq);
453 	mtx_unlock_spin(&kse_zombie_lock);
454 }
455 
456 /*
457  * Stash an embarrassingly extra ksegrp into the zombie ksegrp queue.
458  */
459 void
460 ksegrp_stash(struct ksegrp *kg)
461 {
462 	mtx_lock_spin(&kse_zombie_lock);
463 	TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp);
464 	mtx_unlock_spin(&kse_zombie_lock);
465 }
466 
467 /*
468  * Reap zombie thread, KSE and ksegrp resources.
469  */
470 void
471 thread_reap(void)
472 {
473 	struct thread *td_first, *td_next;
474 	struct kse *ke_first, *ke_next;
475 	struct ksegrp *kg_first, *kg_next;
476 
477 	/*
478 	 * Don't even bother to lock if there are none at this instant;
479 	 * we really don't care about the next instant.
480 	 */
481 	if ((!TAILQ_EMPTY(&zombie_threads))
482 	    || (!TAILQ_EMPTY(&zombie_kses))
483 	    || (!TAILQ_EMPTY(&zombie_ksegrps))) {
484 		mtx_lock_spin(&kse_zombie_lock);
485 		td_first = TAILQ_FIRST(&zombie_threads);
486 		ke_first = TAILQ_FIRST(&zombie_kses);
487 		kg_first = TAILQ_FIRST(&zombie_ksegrps);
488 		if (td_first)
489 			TAILQ_INIT(&zombie_threads);
490 		if (ke_first)
491 			TAILQ_INIT(&zombie_kses);
492 		if (kg_first)
493 			TAILQ_INIT(&zombie_ksegrps);
494 		mtx_unlock_spin(&kse_zombie_lock);
495 		while (td_first) {
496 			td_next = TAILQ_NEXT(td_first, td_runq);
497 			if (td_first->td_ucred)
498 				crfree(td_first->td_ucred);
499 			thread_free(td_first);
500 			td_first = td_next;
501 		}
502 		while (ke_first) {
503 			ke_next = TAILQ_NEXT(ke_first, ke_procq);
504 			kse_free(ke_first);
505 			ke_first = ke_next;
506 		}
507 		while (kg_first) {
508 			kg_next = TAILQ_NEXT(kg_first, kg_ksegrp);
509 			ksegrp_free(kg_first);
510 			kg_first = kg_next;
511 		}
512 	}
513 	kse_GC();
514 }
515 
516 /*
517  * Allocate a ksegrp.
518  */
519 struct ksegrp *
520 ksegrp_alloc(void)
521 {
522 	return (uma_zalloc(ksegrp_zone, M_WAITOK));
523 }
524 
525 /*
526  * Allocate a kse.
527  */
528 struct kse *
529 kse_alloc(void)
530 {
531 	return (uma_zalloc(kse_zone, M_WAITOK));
532 }
533 
534 /*
535  * Allocate a thread.
536  */
537 struct thread *
538 thread_alloc(void)
539 {
540 	thread_reap(); /* check if any zombies to get */
541 	return (uma_zalloc(thread_zone, M_WAITOK));
542 }
543 
544 /*
545  * Deallocate a ksegrp.
546  */
547 void
548 ksegrp_free(struct ksegrp *kg)
549 {
550 	uma_zfree(ksegrp_zone, kg);
551 }
552 
553 /*
554  * Deallocate a kse.
555  */
556 void
557 kse_free(struct kse *ke)
558 {
559 	uma_zfree(kse_zone, ke);
560 }
561 
562 /*
563  * Deallocate a thread.
564  */
565 void
566 thread_free(struct thread *td)
567 {
568 
569 	cpu_thread_clean(td);
570 	uma_zfree(thread_zone, td);
571 }
572 
573 /*
574  * Discard the current thread and exit from its context.
575  * Always called with scheduler locked.
576  *
577  * Because we can't free a thread while we're operating under its context,
578  * push the current thread into our CPU's deadthread holder. This means
579  * we needn't worry about someone else grabbing our context before we
580  * do a cpu_throw().  This may not be needed now as we are under schedlock.
581  * Maybe we can just do a thread_stash() as thr_exit1 does.
582  */
583 /*  XXX
584  * libthr expects its thread exit to return for the last
585  * thread, meaning that the program is back to non-threaded
586  * mode, I guess.  Because we do this (cpu_throw) unconditionally
587  * here, libthr has its own version (thr_exit1()) that skips it
588  * when this was the last thread.
589  * It is also called from thread_suspend_check().
590  * Of course, in the end, they end up coming here through exit1()
591  * anyhow.  After fixing 'thr' to play by the rules we should be
592  * able to merge these two functions.
593  */
594 void
595 thread_exit(void)
596 {
597 	struct thread *td;
598 	struct kse *ke;
599 	struct proc *p;
600 	struct ksegrp	*kg;
601 
602 	td = curthread;
603 	kg = td->td_ksegrp;
604 	p = td->td_proc;
605 	ke = td->td_kse;
606 
607 	mtx_assert(&sched_lock, MA_OWNED);
608 	KASSERT(p != NULL, ("thread exiting without a process"));
609 	KASSERT(ke != NULL, ("thread exiting without a kse"));
610 	KASSERT(kg != NULL, ("thread exiting without a kse group"));
611 	PROC_LOCK_ASSERT(p, MA_OWNED);
612 	CTR1(KTR_PROC, "thread_exit: thread %p", td);
613 	mtx_assert(&Giant, MA_NOTOWNED);
614 
615 	if (td->td_standin != NULL) {
616 		thread_stash(td->td_standin);
617 		td->td_standin = NULL;
618 	}
619 
620 	cpu_thread_exit(td);	/* XXXSMP */
621 
622 	/*
623 	 * The last thread is left attached to the process
624 	 * so that the whole bundle gets recycled. Skip
625 	 * all this stuff.
626 	 */
627 	if (p->p_numthreads > 1) {
628 		thread_unlink(td);
629 		if (p->p_maxthrwaits)
630 			wakeup(&p->p_numthreads);
631 		/*
632 		 * The test below is NOT true if we are the
633 		 * sole exiting thread. P_STOPPED_SINGLE is unset
634 		 * in exit1() after it is the only survivor.
635 		 */
636 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
637 			if (p->p_numthreads == p->p_suspcount) {
638 				thread_unsuspend_one(p->p_singlethread);
639 			}
640 		}
641 
642 		/*
643 		 * Because each upcall structure has an owner thread,
644 		 * and the owner thread exits only when the process is
645 		 * exiting, an upcall to userland is no longer needed
646 		 * and deleting the upcall structure is safe here.
647 		 * Thus, when all threads in a group have exited, all
648 		 * upcalls in the group have been freed automatically.
649 		 */
650 		if (td->td_upcall)
651 			upcall_remove(td);
652 
653 		sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
654 		sched_exit_kse(FIRST_KSE_IN_PROC(p), td);
655 		ke->ke_state = KES_UNQUEUED;
656 		ke->ke_thread = NULL;
657 		/*
658 		 * Decide what to do with the KSE attached to this thread.
659 		 */
660 		if (ke->ke_flags & KEF_EXIT) {
661 			kse_unlink(ke);
662 			if (kg->kg_kses == 0) {
663 				sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
664 				ksegrp_unlink(kg);
665 			}
666 		}
667 		else
668 			kse_reassign(ke);
669 		PROC_UNLOCK(p);
670 		td->td_kse	= NULL;
671 #if 0
672 		td->td_proc	= NULL;
673 #endif
674 		td->td_ksegrp	= NULL;
675 		td->td_last_kse	= NULL;
676 		PCPU_SET(deadthread, td);
677 	} else {
678 		PROC_UNLOCK(p);
679 	}
680 	td->td_state	= TDS_INACTIVE;
681 	/* XXX Shouldn't cpu_throw() here. */
682 	mtx_assert(&sched_lock, MA_OWNED);
683 	cpu_throw(td, choosethread());
684 	panic("I'm a teapot!");
685 	/* NOTREACHED */
686 }
687 
688 /*
689  * Do any thread-specific cleanup that may be needed in wait().
690  * Called with Giant, proc and schedlock not held.
691  */
692 void
693 thread_wait(struct proc *p)
694 {
695 	struct thread *td;
696 
697 	mtx_assert(&Giant, MA_NOTOWNED);
698 	KASSERT((p->p_numthreads == 1), ("Multiple threads in wait1()"));
699 	KASSERT((p->p_numksegrps == 1), ("Multiple ksegrps in wait1()"));
700 	FOREACH_THREAD_IN_PROC(p, td) {
701 		if (td->td_standin != NULL) {
702 			thread_free(td->td_standin);
703 			td->td_standin = NULL;
704 		}
705 		cpu_thread_clean(td);
706 	}
707 	thread_reap();	/* check for zombie threads etc. */
708 }
709 
710 /*
711  * Link a thread to a process.
712  * Set up anything that needs to be initialized for it to
713  * be used by the process.
714  *
715  * Note that we do not link to the proc's ucred here.
716  * The thread is linked as if running but no KSE assigned.
717  */
718 void
719 thread_link(struct thread *td, struct ksegrp *kg)
720 {
721 	struct proc *p;
722 
723 	p = kg->kg_proc;
724 	td->td_state    = TDS_INACTIVE;
725 	td->td_proc     = p;
726 	td->td_ksegrp   = kg;
727 	td->td_last_kse = NULL;
728 	td->td_flags    = 0;
729 	td->td_kflags	= 0;
730 	td->td_kse      = NULL;
731 
732 	LIST_INIT(&td->td_contested);
733 	callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
734 	TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
735 	TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist);
736 	p->p_numthreads++;
737 	kg->kg_numthreads++;
738 }
739 
740 void
741 thread_unlink(struct thread *td)
742 {
743 	struct proc *p = td->td_proc;
744 	struct ksegrp *kg = td->td_ksegrp;
745 
746 	mtx_assert(&sched_lock, MA_OWNED);
747 	TAILQ_REMOVE(&p->p_threads, td, td_plist);
748 	p->p_numthreads--;
749 	TAILQ_REMOVE(&kg->kg_threads, td, td_kglist);
750 	kg->kg_numthreads--;
751 	/* could clear a few other things here */
752 }
753 
754 /*
755  * Purge a ksegrp resource. When a ksegrp is preparing to
756  * exit, it calls this function.
757  */
758 void
759 kse_purge_group(struct thread *td)
760 {
761 	struct ksegrp *kg;
762 	struct kse *ke;
763 
764 	kg = td->td_ksegrp;
765 	KASSERT(kg->kg_numthreads == 1, ("%s: bad thread number", __func__));
766 	while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) {
767 		KASSERT(ke->ke_state == KES_IDLE,
768 			("%s: wrong idle KSE state", __func__));
769 		kse_unlink(ke);
770 	}
771 	KASSERT((kg->kg_kses == 1),
772 		("%s: ksegrp still has %d KSEs", __func__, kg->kg_kses));
773 	KASSERT((kg->kg_numupcalls == 0),
774 	        ("%s: ksegrp still has %d upcall datas",
775 		__func__, kg->kg_numupcalls));
776 }
777 
778 /*
779  * Purge a process's KSE resource. When a process is preparing to
780  * exit, it calls kse_purge to release any extra KSE resources in
781  * the process.
782  */
783 void
784 kse_purge(struct proc *p, struct thread *td)
785 {
786 	struct ksegrp *kg;
787 	struct kse *ke;
788 
789 	KASSERT(p->p_numthreads == 1, ("bad thread number"));
790 	while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) {
791 		TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
792 		p->p_numksegrps--;
793 		/*
794 		 * There is no ownership for KSEs; after all threads
795 		 * in the group have exited, some KSEs may have been
796 		 * left on the idle queue, so GC them now.
797 		 */
798 		while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) {
799 			KASSERT(ke->ke_state == KES_IDLE,
800 			   ("%s: wrong idle KSE state", __func__));
801 			TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
802 			kg->kg_idle_kses--;
803 			TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
804 			kg->kg_kses--;
805 			kse_stash(ke);
806 		}
807 		KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) ||
808 		        ((kg->kg_kses == 1) && (kg == td->td_ksegrp)),
809 		        ("ksegrp has wrong kg_kses: %d", kg->kg_kses));
810 		KASSERT((kg->kg_numupcalls == 0),
811 		        ("%s: ksegrp still has %d upcall datas",
812 			__func__, kg->kg_numupcalls));
813 
814 		if (kg != td->td_ksegrp)
815 			ksegrp_stash(kg);
816 	}
817 	TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp);
818 	p->p_numksegrps++;
819 }
820 
821 /*
822  * Enforce single-threading.
823  *
824  * Returns 1 if the caller must abort (another thread is waiting to
825  * exit the process or similar). Process is locked!
826  * Returns 0 when you are successfully the only thread running.
827  * A process has successfully single threaded in the suspend mode when
828  * there are no threads in user mode. Threads in the kernel must be
829  * allowed to continue until they get to the user boundary. They may even
830  * copy out their return values and data before suspending. They may,
831  * however, be accelerated in reaching the user boundary as we will wake
832  * up any sleeping threads that are interruptible (PCATCH).
833  */
834 int
835 thread_single(int force_exit)
836 {
837 	struct thread *td;
838 	struct thread *td2;
839 	struct proc *p;
840 	int remaining;
841 
842 	td = curthread;
843 	p = td->td_proc;
844 	mtx_assert(&Giant, MA_NOTOWNED);
845 	PROC_LOCK_ASSERT(p, MA_OWNED);
846 	KASSERT((td != NULL), ("curthread is NULL"));
847 
848 	if ((p->p_flag & P_SA) == 0 && p->p_numthreads == 1)
849 		return (0);
850 
851 	/* Is someone already single threading? */
852 	if (p->p_singlethread)
853 		return (1);
854 
855 	if (force_exit == SINGLE_EXIT) {
856 		p->p_flag |= P_SINGLE_EXIT;
857 	} else
858 		p->p_flag &= ~P_SINGLE_EXIT;
859 	p->p_flag |= P_STOPPED_SINGLE;
860 	mtx_lock_spin(&sched_lock);
861 	p->p_singlethread = td;
862 	if (force_exit == SINGLE_EXIT)
863 		remaining = p->p_numthreads;
864 	else
865 		remaining = p->p_numthreads - p->p_suspcount;
866 	while (remaining != 1) {
867 		FOREACH_THREAD_IN_PROC(p, td2) {
868 			if (td2 == td)
869 				continue;
870 			td2->td_flags |= TDF_ASTPENDING;
871 			if (TD_IS_INHIBITED(td2)) {
872 				if (force_exit == SINGLE_EXIT) {
873 					if (td->td_flags & TDF_DBSUSPEND)
874 						td->td_flags &= ~TDF_DBSUSPEND;
875 					if (TD_IS_SUSPENDED(td2)) {
876 						thread_unsuspend_one(td2);
877 					}
878 					if (TD_ON_SLEEPQ(td2) &&
879 					    (td2->td_flags & TDF_SINTR)) {
880 						sleepq_abort(td2);
881 					}
882 				} else {
883 					if (TD_IS_SUSPENDED(td2))
884 						continue;
885 					/*
886 					 * Maybe other inhibited states too?
887 					 * XXXKSE Is it totally safe to
888 					 * suspend a non-interruptible thread?
889 					 */
890 					if (td2->td_inhibitors &
891 					    (TDI_SLEEPING | TDI_SWAPPED))
892 						thread_suspend_one(td2);
893 				}
894 			}
895 		}
896 		if (force_exit == SINGLE_EXIT)
897 			remaining = p->p_numthreads;
898 		else
899 			remaining = p->p_numthreads - p->p_suspcount;
900 
901 		/*
902 		 * Maybe we suspended some threads; was it enough?
903 		 */
904 		if (remaining == 1)
905 			break;
906 
907 		/*
908 		 * Wake us up when everyone else has suspended.
909 		 * In the meantime we suspend as well.
910 		 */
911 		thread_suspend_one(td);
912 		PROC_UNLOCK(p);
913 		mi_switch(SW_VOL, NULL);
914 		mtx_unlock_spin(&sched_lock);
915 		PROC_LOCK(p);
916 		mtx_lock_spin(&sched_lock);
917 		if (force_exit == SINGLE_EXIT)
918 			remaining = p->p_numthreads;
919 		else
920 			remaining = p->p_numthreads - p->p_suspcount;
921 	}
922 	if (force_exit == SINGLE_EXIT) {
923 		if (td->td_upcall)
924 			upcall_remove(td);
925 		kse_purge(p, td);
926 	}
927 	mtx_unlock_spin(&sched_lock);
928 	return (0);
929 }
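
/*
 * Typical call pattern (an illustrative sketch; exit1() and execve() do
 * something along these lines):
 *
 *	PROC_LOCK(p);
 *	error = thread_single(SINGLE_EXIT);	(0 on success, 1 to abort)
 *	...					(p now has a single thread)
 *	PROC_UNLOCK(p);
 *
 * Callers that do not pass SINGLE_EXIT leave the other threads suspended
 * at the user boundary and later release them with thread_single_end().
 */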
930 
931 /*
932  * Called from locations that can safely check to see
933  * whether we have to suspend or at least throttle for a
934  * single-thread event (e.g. fork).
935  *
936  * Such locations include userret().
937  * If the "return_instead" argument is non-zero, the thread must be able to
938  * accept 0 (caller may continue), or 1 (caller must abort) as a result.
939  *
940  * The 'return_instead' argument tells the function if it may do a
941  * thread_exit() or suspend, or whether the caller must abort and back
942  * out instead.
943  *
944  * If the thread that set the single_threading request has set the
945  * P_SINGLE_EXIT bit in the process flags then this call will never return
946  * if 'return_instead' is false, but will exit.
947  *
948  * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
949  *---------------+--------------------+---------------------
950  *       0       | returns 0          |   returns 0 or 1
951  *               | when ST ends       |   immediately
952  *---------------+--------------------+---------------------
953  *       1       | thread exits       |   returns 1
954  *               |                    |  immediately
955  * (return_instead: 0 = thread_exit() or suspension ok,
956  *  other = return an error instead of stopping the thread.)
957  *
958  * While a full suspension is under effect, even a single threading
959  * thread would be suspended if it made this call (but it shouldn't).
960  * This call should only be made from places where
961  * thread_exit() would be safe as that may be the outcome unless
962  * return_instead is set.
963  */
964 int
965 thread_suspend_check(int return_instead)
966 {
967 	struct thread *td;
968 	struct proc *p;
969 
970 	td = curthread;
971 	p = td->td_proc;
972 	mtx_assert(&Giant, MA_NOTOWNED);
973 	PROC_LOCK_ASSERT(p, MA_OWNED);
974 	while (P_SHOULDSTOP(p) ||
975 	      ((p->p_flag & P_TRACED) && (td->td_flags & TDF_DBSUSPEND))) {
976 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
977 			KASSERT(p->p_singlethread != NULL,
978 			    ("singlethread not set"));
979 			/*
980 			 * The only suspension in action is a
981 			 * single-threading. The single threader need not stop.
982 			 * XXX Should be safe to access unlocked
983 			 * as it can only be made true by us.
984 			 */
985 			if (p->p_singlethread == td)
986 				return (0);	/* Exempt from stopping. */
987 		}
988 		if (return_instead)
989 			return (1);
990 
991 		mtx_lock_spin(&sched_lock);
992 		thread_stopped(p);
993 		/*
994 		 * If the process is waiting for us to exit,
995 		 * this thread should just suicide.
996 		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
997 		 */
998 		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
999 			if (p->p_flag & P_SA)
1000 				thread_exit();
1001 			else
1002 				thr_exit1();
1003 		}
1004 
1005 		/*
1006 		 * When a thread suspends, it just
1007 		 * moves to the process's suspend queue
1008 		 * and stays there.
1009 		 */
1010 		thread_suspend_one(td);
1011 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1012 			if (p->p_numthreads == p->p_suspcount) {
1013 				thread_unsuspend_one(p->p_singlethread);
1014 			}
1015 		}
1016 		PROC_UNLOCK(p);
1017 		mi_switch(SW_INVOL, NULL);
1018 		mtx_unlock_spin(&sched_lock);
1019 		PROC_LOCK(p);
1020 	}
1021 	return (0);
1022 }
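
/*
 * Illustrative caller (a sketch in the style of the userret()/ast() path):
 *
 *	if (P_SHOULDSTOP(p)) {
 *		PROC_LOCK(p);
 *		thread_suspend_check(0);	(may suspend, or never return)
 *		PROC_UNLOCK(p);
 *	}
 *
 * Code that must not block here instead passes return_instead != 0 and
 * backs out (e.g. returning ERESTART) when 1 comes back.
 */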
1023 
1024 void
1025 thread_suspend_one(struct thread *td)
1026 {
1027 	struct proc *p = td->td_proc;
1028 
1029 	mtx_assert(&sched_lock, MA_OWNED);
1030 	PROC_LOCK_ASSERT(p, MA_OWNED);
1031 	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
1032 	p->p_suspcount++;
1033 	TD_SET_SUSPENDED(td);
1034 	TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq);
1035 	/*
1036 	 * Hack: If we are suspending but are on the sleep queue
1037 	 * then we are in msleep or the cv equivalent. We
1038 	 * want to look like we have two inhibitors.
1039 	 * It may already be set; that doesn't matter.
1040 	 */
1041 	if (TD_ON_SLEEPQ(td))
1042 		TD_SET_SLEEPING(td);
1043 }
1044 
1045 void
1046 thread_unsuspend_one(struct thread *td)
1047 {
1048 	struct proc *p = td->td_proc;
1049 
1050 	mtx_assert(&sched_lock, MA_OWNED);
1051 	PROC_LOCK_ASSERT(p, MA_OWNED);
1052 	TAILQ_REMOVE(&p->p_suspended, td, td_runq);
1053 	TD_CLR_SUSPENDED(td);
1054 	p->p_suspcount--;
1055 	setrunnable(td);
1056 }
1057 
1058 /*
1059  * Allow all threads blocked by single threading to continue running.
1060  */
1061 void
1062 thread_unsuspend(struct proc *p)
1063 {
1064 	struct thread *td;
1065 
1066 	mtx_assert(&sched_lock, MA_OWNED);
1067 	PROC_LOCK_ASSERT(p, MA_OWNED);
1068 	if (!P_SHOULDSTOP(p)) {
1069 		while ((td = TAILQ_FIRST(&p->p_suspended))) {
1070 			thread_unsuspend_one(td);
1071 		}
1072 	} else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
1073 	    (p->p_numthreads == p->p_suspcount)) {
1074 		/*
1075 		 * Stopping everything also did the job for the single
1076 		 * threading request. Now we've downgraded to single-threaded,
1077 		 * let it continue.
1078 		 */
1079 		thread_unsuspend_one(p->p_singlethread);
1080 	}
1081 }
1082 
1083 void
1084 thread_single_end(void)
1085 {
1086 	struct thread *td;
1087 	struct proc *p;
1088 
1089 	td = curthread;
1090 	p = td->td_proc;
1091 	PROC_LOCK_ASSERT(p, MA_OWNED);
1092 	p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT);
1093 	mtx_lock_spin(&sched_lock);
1094 	p->p_singlethread = NULL;
1095 	/*
1096 	 * If there are other threads they may now run,
1097 	 * unless of course there is a blanket 'stop order'
1098 	 * on the process. The single threader must be allowed
1099 	 * to continue however as this is a bad place to stop.
1100 	 */
1101 	if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
1102 		while ((td = TAILQ_FIRST(&p->p_suspended))) {
1103 			thread_unsuspend_one(td);
1104 		}
1105 	}
1106 	mtx_unlock_spin(&sched_lock);
1107 }
1108 
1109