xref: /freebsd/sys/kern/kern_thread.c (revision 6f8132a867d53fbc48dd8222a4fd1408ff1d9226)
1 /*
2  * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
3  *  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice(s), this list of conditions and the following disclaimer as
10  *    the first lines of this file unmodified other than the possible
11  *    addition of one or more copyright notices.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice(s), this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
26  * DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/kernel.h>
34 #include <sys/lock.h>
35 #include <sys/malloc.h>
36 #include <sys/mutex.h>
37 #include <sys/proc.h>
38 #include <sys/smp.h>
39 #include <sys/sysctl.h>
40 #include <sys/sysproto.h>
41 #include <sys/filedesc.h>
42 #include <sys/sched.h>
43 #include <sys/signalvar.h>
44 #include <sys/sx.h>
45 #include <sys/tty.h>
46 #include <sys/user.h>
47 #include <sys/jail.h>
48 #include <sys/kse.h>
49 #include <sys/ktr.h>
50 #include <sys/ucontext.h>
51 
52 #include <vm/vm.h>
53 #include <vm/vm_object.h>
54 #include <vm/pmap.h>
55 #include <vm/uma.h>
56 #include <vm/vm_map.h>
57 
58 #include <machine/frame.h>
59 
60 /*
61  * KSEGRP related storage.
62  */
63 static uma_zone_t ksegrp_zone;
64 static uma_zone_t kse_zone;
65 static uma_zone_t thread_zone;
66 
67 /* DEBUG ONLY */
68 SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
69 static int thread_debug = 0;
70 SYSCTL_INT(_kern_threads, OID_AUTO, debug, CTLFLAG_RW,
71 	&thread_debug, 0, "thread debug");
72 
73 static int max_threads_per_proc = 30;
74 SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
75 	&max_threads_per_proc, 0, "Limit on threads per proc");
76 
77 static int max_groups_per_proc = 5;
78 SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW,
79 	&max_groups_per_proc, 0, "Limit on thread groups per proc");
80 
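/*
 * RANGEOF gives the size in bytes of the region of a structure lying
 * between two members.  It is used with bzero()/bcopy() below on the
 * startzero/endzero and startcopy/endcopy marker fields.
 */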
81 #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
82 
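/*
 * Discarded threads, KSEs and ksegrps are parked on these zombie
 * lists, protected by a spin lock, and reclaimed later by
 * thread_reap().
 */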
83 struct threadqueue zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
84 TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses);
85 TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps);
86 struct mtx zombie_thread_lock;
87 MTX_SYSINIT(zombie_thread_lock, &zombie_thread_lock,
88     "zombie_thread_lock", MTX_SPIN);
89 
90 static void kse_purge(struct proc *p, struct thread *td);
91 
92 /*
93  * Prepare a thread for use.
94  */
95 static void
96 thread_ctor(void *mem, int size, void *arg)
97 {
98 	struct thread	*td;
99 
100 	td = (struct thread *)mem;
101 	td->td_state = TDS_INACTIVE;
102 	td->td_flags |= TDF_UNBOUND;
103 }
104 
105 /*
106  * Reclaim a thread after use.
107  */
108 static void
109 thread_dtor(void *mem, int size, void *arg)
110 {
111 	struct thread	*td;
112 
113 	td = (struct thread *)mem;
114 
115 #ifdef INVARIANTS
116 	/* Verify that this thread is in a safe state to free. */
117 	switch (td->td_state) {
118 	case TDS_INHIBITED:
119 	case TDS_RUNNING:
120 	case TDS_CAN_RUN:
121 	case TDS_RUNQ:
122 		/*
123 		 * We must never unlink a thread that is in one of
124 		 * these states, because it is currently active.
125 		 */
126 		panic("bad state for thread unlinking");
127 		/* NOTREACHED */
128 	case TDS_INACTIVE:
129 		break;
130 	default:
131 		panic("bad thread state");
132 		/* NOTREACHED */
133 	}
134 #endif
135 }
136 
137 /*
138  * Initialize type-stable parts of a thread (when newly created).
139  */
140 static void
141 thread_init(void *mem, int size)
142 {
143 	struct thread	*td;
144 
145 	td = (struct thread *)mem;
146 	mtx_lock(&Giant);
147 	pmap_new_thread(td, 0);
148 	mtx_unlock(&Giant);
149 	cpu_thread_setup(td);
150 	td->td_sched = (struct td_sched *)&td[1];
151 }
152 
153 /*
154  * Tear down type-stable parts of a thread (just before being discarded).
155  */
156 static void
157 thread_fini(void *mem, int size)
158 {
159 	struct thread	*td;
160 
161 	td = (struct thread *)mem;
162 	pmap_dispose_thread(td);
163 }
164 /*
165  * Initialize type-stable parts of a kse (when newly created).
166  */
167 static void
168 kse_init(void *mem, int size)
169 {
170 	struct kse	*ke;
171 
172 	ke = (struct kse *)mem;
173 	ke->ke_sched = (struct ke_sched *)&ke[1];
174 }
175 /*
176  * Initialize type-stable parts of a ksegrp (when newly created).
177  */
178 static void
179 ksegrp_init(void *mem, int size)
180 {
181 	struct ksegrp	*kg;
182 
183 	kg = (struct ksegrp *)mem;
184 	kg->kg_sched = (struct kg_sched *)&kg[1];
185 }
186 
187 /*
188  * Link a KSE onto its ksegrp's list of KSEs.
189  */
190 void
191 kse_link(struct kse *ke, struct ksegrp *kg)
192 {
193 	struct proc *p = kg->kg_proc;
194 
195 	TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
196 	kg->kg_kses++;
197 	ke->ke_state = KES_UNQUEUED;
198 	ke->ke_proc	= p;
199 	ke->ke_ksegrp	= kg;
200 	ke->ke_owner	= NULL;
201 	ke->ke_thread	= NULL;
202 	ke->ke_oncpu = NOCPU;
203 }
204 
205 void
206 kse_unlink(struct kse *ke)
207 {
208 	struct ksegrp *kg;
209 
210 	mtx_assert(&sched_lock, MA_OWNED);
211 	kg = ke->ke_ksegrp;
212 
213 	TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
214 	if (--kg->kg_kses == 0) {
215 			ksegrp_unlink(kg);
216 	}
217 	/*
218 	 * Aggregate stats from the KSE
219 	 */
220 	kse_stash(ke);
221 }
222 
223 void
224 ksegrp_link(struct ksegrp *kg, struct proc *p)
225 {
226 
227 	TAILQ_INIT(&kg->kg_threads);
228 	TAILQ_INIT(&kg->kg_runq);	/* links with td_runq */
229 	TAILQ_INIT(&kg->kg_slpq);	/* links with td_runq */
230 	TAILQ_INIT(&kg->kg_kseq);	/* all kses in ksegrp */
231 	TAILQ_INIT(&kg->kg_lq);		/* loan kses in ksegrp */
232 	kg->kg_proc	= p;
233 /* the following counters are in the -zero- section and may not need clearing */
234 	kg->kg_numthreads = 0;
235 	kg->kg_runnable = 0;
236 	kg->kg_kses = 0;
237 	kg->kg_loan_kses = 0;
238 	kg->kg_runq_kses = 0; /* XXXKSE change name */
239 /* link it in now that it's consistent */
240 	p->p_numksegrps++;
241 	TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
242 }
243 
244 void
245 ksegrp_unlink(struct ksegrp *kg)
246 {
247 	struct proc *p;
248 
249 	mtx_assert(&sched_lock, MA_OWNED);
250 	p = kg->kg_proc;
251 	KASSERT(((kg->kg_numthreads == 0) && (kg->kg_kses == 0)),
252 	    ("kseg_unlink: residual threads or KSEs"));
253 	TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
254 	p->p_numksegrps--;
255 	/*
256 	 * Aggregate stats from the KSE
257 	 */
258 	ksegrp_stash(kg);
259 }
260 
261 /*
262  * For a newly created process,
263  * link up the structure and its initial threads etc.
264  */
265 void
266 proc_linkup(struct proc *p, struct ksegrp *kg,
267 			struct kse *ke, struct thread *td)
268 {
269 
270 	TAILQ_INIT(&p->p_ksegrps);	     /* all ksegrps in proc */
271 	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
272 	TAILQ_INIT(&p->p_suspended);	     /* Threads suspended */
273 	p->p_numksegrps = 0;
274 	p->p_numthreads = 0;
275 
276 	ksegrp_link(kg, p);
277 	kse_link(ke, kg);
278 	thread_link(td, kg);
279 }
280 
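/*
 * Interrupt the thread whose mailbox matches the one given.
 * Mark it with TDF_INTERRUPT and, if it is in an interruptible
 * sleep, abort that sleep so it heads back to the user boundary.
 */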
281 int
282 kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap)
283 {
284 	struct proc *p;
285 	struct thread *td2;
286 
287 	p = td->td_proc;
288 	/* KSE-enabled processes only, please. */
289 	if (!(p->p_flag & P_KSES))
290 		return (EINVAL);
291 	if (uap->tmbx == NULL)
292 		return (EINVAL);
293 	mtx_lock_spin(&sched_lock);
294 	FOREACH_THREAD_IN_PROC(p, td2) {
295 		if (td2->td_mailbox == uap->tmbx) {
296 			td2->td_flags |= TDF_INTERRUPT;
297 			if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) {
298 				if (td2->td_flags & TDF_CVWAITQ)
299 					cv_abort(td2);
300 				else
301 					abortsleep(td2);
302 			}
303 			mtx_unlock_spin(&sched_lock);
304 			return (0);
305 		}
306 	}
307 	mtx_unlock_spin(&sched_lock);
308 	return (ESRCH);
309 }
310 
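/*
 * Called by the UTS to retire its KSE.
 * If this is the last thread in the process, just drop back to
 * non-KSE mode; otherwise mark the KSE for exit and exit this thread.
 * Refuse (EDEADLK) if other threads in the group still need the KSE.
 */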
311 int
312 kse_exit(struct thread *td, struct kse_exit_args *uap)
313 {
314 	struct proc *p;
315 	struct ksegrp *kg;
316 	struct kse *ke;
317 
318 	p = td->td_proc;
319 	/* Only UTS can do the syscall */
320 	if (!(p->p_flag & P_KSES) || (td->td_mailbox != NULL))
321 		return (EINVAL);
322 	kg = td->td_ksegrp;
323 	/* serialize killing kse */
324 	PROC_LOCK(p);
325 	mtx_lock_spin(&sched_lock);
326 	if ((kg->kg_kses == 1) && (kg->kg_numthreads > 1)) {
327 		mtx_unlock_spin(&sched_lock);
328 		PROC_UNLOCK(p);
329 		return (EDEADLK);
330 	}
331 	ke = td->td_kse;
332 	if (p->p_numthreads == 1) {
333 		ke->ke_flags &= ~KEF_DOUPCALL;
334 		ke->ke_mailbox = NULL;
335 		p->p_flag &= ~P_KSES;
336 		mtx_unlock_spin(&sched_lock);
337 		PROC_UNLOCK(p);
338 	} else {
339 		ke->ke_flags |= KEF_EXIT;
340 		thread_exit();
341 		/* NOTREACHED */
342 	}
343 	return (0);
344 }
345 
346 /*
347  * Either becomes an upcall or waits for an awakening event and
348  * THEN becomes an upcall. Only error cases return.
349  */
350 int
351 kse_release(struct thread * td, struct kse_release_args * uap)
352 {
353 	struct proc *p;
354 	struct ksegrp *kg;
355 
356 	p = td->td_proc;
357 	kg = td->td_ksegrp;
358 	/*
359 	 * kse must have a mailbox ready for upcall, and only UTS can
360 	 * do the syscall.
361 	 */
362 	if (!(p->p_flag & P_KSES) ||
363 	    (td->td_mailbox != NULL) ||
364 	    (td->td_kse->ke_mailbox == NULL))
365 		return (EINVAL);
366 
367 	PROC_LOCK(p);
368 	mtx_lock_spin(&sched_lock);
369 	/* Change OURSELF to become an upcall. */
370 	td->td_flags = TDF_UPCALLING; /* BOUND */
371 	if (!(td->td_kse->ke_flags & (KEF_DOUPCALL|KEF_ASTPENDING)) &&
372 	    (kg->kg_completed == NULL)) {
373 		/*
374 		 * The KSE will however be lendable.
375 		 */
376 		TD_SET_IDLE(td);
377 		PROC_UNLOCK(p);
378 		p->p_stats->p_ru.ru_nvcsw++;
379 		mi_switch();
380 		mtx_unlock_spin(&sched_lock);
381 	} else {
382 		mtx_unlock_spin(&sched_lock);
383 		PROC_UNLOCK(p);
384 	}
385 	return (0);
386 }
387 
388 /* struct kse_wakeup_args {
389 	struct kse_mailbox *mbx;
390 }; */
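/*
 * Wake the owner of the KSE whose mailbox matches uap->mbx (or, if no
 * mailbox is given, any idle KSE owner in our own group) so that an
 * upcall is delivered as soon as possible.
 */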
391 int
392 kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
393 {
394 	struct proc *p;
395 	struct kse *ke;
396 	struct ksegrp *kg;
397 	struct thread *td2;
398 
399 	p = td->td_proc;
400 	td2 = NULL;
401 	/* KSE-enabled processes only, please. */
402 	if (!(p->p_flag & P_KSES))
403 		return EINVAL;
404 
405 	mtx_lock_spin(&sched_lock);
406 	if (uap->mbx) {
407 		FOREACH_KSEGRP_IN_PROC(p, kg) {
408 			FOREACH_KSE_IN_GROUP(kg, ke) {
409 				if (ke->ke_mailbox != uap->mbx)
410 					continue;
411 				td2 = ke->ke_owner;
412 				KASSERT((td2 != NULL),("KSE with no owner"));
413 				break;
414 			}
415 			if (td2) {
416 				break;
417 			}
418 		}
419 	} else {
420 		/*
421 		 * look for any idle KSE to resurrect.
422 		 */
423 		kg = td->td_ksegrp;
424 		FOREACH_KSE_IN_GROUP(kg, ke) {
425 			td2 = ke->ke_owner;
426 			KASSERT((td2 != NULL),("KSE with no owner2"));
427 			if (TD_IS_IDLE(td2))
428 				break;
429 		}
430 		KASSERT((td2 != NULL), ("no thread(s)"));
431 	}
432 	if (td2) {
433 		if (TD_IS_IDLE(td2)) {
434 			TD_CLR_IDLE(td2);
435 			setrunnable(td2);
436 		} else if (td != td2) {
437 			/* guarantee it does an upcall ASAP */
438 			td2->td_kse->ke_flags |= KEF_DOUPCALL;
439 		}
440 		mtx_unlock_spin(&sched_lock);
441 		return (0);
442 	}
443 	mtx_unlock_spin(&sched_lock);
444 	return (ESRCH);
445 }
446 
447 /*
448  * No new KSEG: on the first call, use the current KSE and don't schedule an upcall.
449  * In all other situations, allocate a new KSE and schedule an upcall on it.
450  */
451 /* struct kse_create_args {
452 	struct kse_mailbox *mbx;
453 	int newgroup;
454 }; */
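/*
 * Illustrative sketch only (an assumption, not part of this file): a UTS
 * might register an upcall with the kernel roughly like this, where
 * uts_upcall_entry, upcall_stack and UPCALL_STACK_SIZE are hypothetical
 * names supplied by the userland threading library:
 *
 *	struct kse_mailbox mbx;
 *
 *	bzero(&mbx, sizeof(mbx));
 *	mbx.km_func = uts_upcall_entry;		(upcall entry point)
 *	mbx.km_stack.ss_sp = upcall_stack;	(stack used for upcalls)
 *	mbx.km_stack.ss_size = UPCALL_STACK_SIZE;
 *	error = kse_create(&mbx, 0);		(first call: reuses the current KSE)
 */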
455 int
456 kse_create(struct thread *td, struct kse_create_args *uap)
457 {
458 	struct kse *newke;
459 	struct kse *ke;
460 	struct ksegrp *newkg;
461 	struct ksegrp *kg;
462 	struct proc *p;
463 	struct kse_mailbox mbx;
464 	int err;
465 
466 	p = td->td_proc;
467 	if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
468 		return (err);
469 
470 	p->p_flag |= P_KSES; /* easier to just set it than to test and set */
471 	kg = td->td_ksegrp;
472 	if (uap->newgroup) {
473 		if (p->p_numksegrps >= max_groups_per_proc)
474 			return (EPROCLIM);
475 		/*
476 		 * If we want a new KSEGRP it doesn't matter whether
477 		 * we have already fired up KSE mode before or not.
478 		 * We put the process in KSE mode and create a new KSEGRP
479 		 * and KSE. If our KSE has not got a mailbox yet then
480 		 * that doesn't matter, just leave it that way. It will
481 		 * ensure that this thread stays BOUND. It's possible
482 		 * that the call came from a threaded library and the main
483 		 * program knows nothing of threads.
484 		 */
485 		newkg = ksegrp_alloc();
486 		bzero(&newkg->kg_startzero, RANGEOF(struct ksegrp,
487 		      kg_startzero, kg_endzero));
488 		bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
489 		      RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy));
490 		newke = kse_alloc();
491 	} else {
492 		/*
493 		 * Otherwise, if we have already set this KSE
494 		 * to have a mailbox, we want to make another KSE here,
495 		 * but only if we are not already at the limit, which
496 		 * is 1 per CPU max.
497 		 *
498 		 * If the current KSE doesn't have a mailbox we just use it
499 		 * and give it one.
500 		 *
501 		 * Because we don't like to access
502 		 * the KSE outside of schedlock if we are UNBOUND,
503 		 * (because it can change if we are preempted by an interrupt)
504 		 * we can deduce it as having a mailbox if we are UNBOUND,
505 		 * and only need to actually look at it if we are BOUND,
506 		 * which is safe.
507 		 */
508 		if ((td->td_flags & TDF_UNBOUND) || td->td_kse->ke_mailbox) {
509 			if (thread_debug == 0) { /* if debugging, allow more */
510 #ifdef SMP
511 			if (kg->kg_kses > mp_ncpus)
512 #endif
513 				return (EPROCLIM);
514 			}
515 			newke = kse_alloc();
516 		} else {
517 			newke = NULL;
518 		}
519 		newkg = NULL;
520 	}
521 	if (newke) {
522 		bzero(&newke->ke_startzero, RANGEOF(struct kse,
523 		      ke_startzero, ke_endzero));
524 #if 0
525 		bcopy(&ke->ke_startcopy, &newke->ke_startcopy,
526 		      RANGEOF(struct kse, ke_startcopy, ke_endcopy));
527 #endif
528 		/* For the first call this may not have been set */
529 		if (td->td_standin == NULL) {
530 			td->td_standin = thread_alloc();
531 		}
532 		mtx_lock_spin(&sched_lock);
533 		if (newkg) {
534 			if (p->p_numksegrps >= max_groups_per_proc) {
535 				mtx_unlock_spin(&sched_lock);
536 				ksegrp_free(newkg);
537 				kse_free(newke);
538 				return (EPROCLIM);
539 			}
540 			ksegrp_link(newkg, p);
541 		}
542 		else
543 			newkg = kg;
544 		kse_link(newke, newkg);
545 		if (p->p_sflag & PS_NEEDSIGCHK)
546 			newke->ke_flags |= KEF_ASTPENDING;
547 		newke->ke_mailbox = uap->mbx;
548 		newke->ke_upcall = mbx.km_func;
549 		bcopy(&mbx.km_stack, &newke->ke_stack, sizeof(stack_t));
550 		thread_schedule_upcall(td, newke);
551 		mtx_unlock_spin(&sched_lock);
552 	} else {
553 		/*
554 		 * If we didn't allocate a new KSE then we are using
555 		 * the existing (BOUND) kse.
556 		 */
557 		ke = td->td_kse;
558 		ke->ke_mailbox = uap->mbx;
559 		ke->ke_upcall = mbx.km_func;
560 		bcopy(&mbx.km_stack, &ke->ke_stack, sizeof(stack_t));
561 	}
562 	/*
563 	 * Fill out the KSE-mode specific fields of the new kse.
564 	 */
565 	return (0);
566 }
567 
568 /*
569  * Fill a ucontext_t with a thread's context information.
570  *
571  * This is an analogue to getcontext(3).
572  */
573 void
574 thread_getcontext(struct thread *td, ucontext_t *uc)
575 {
576 
577 /*
578  * XXX this is declared in a MD include file, i386/include/ucontext.h but
579  * is used in MI code.
580  */
581 #ifdef __i386__
582 	get_mcontext(td, &uc->uc_mcontext);
583 #endif
584 	uc->uc_sigmask = td->td_proc->p_sigmask;
585 }
586 
587 /*
588  * Set a thread's context from a ucontext_t.
589  *
590  * This is an analogue to setcontext(3).
591  */
592 int
593 thread_setcontext(struct thread *td, ucontext_t *uc)
594 {
595 	int ret;
596 
597 /*
598  * XXX this is declared in a MD include file, i386/include/ucontext.h but
599  * is used in MI code.
600  */
601 #ifdef __i386__
602 	ret = set_mcontext(td, &uc->uc_mcontext);
603 #else
604 	ret = ENOSYS;
605 #endif
606 	if (ret == 0) {
607 		SIG_CANTMASK(uc->uc_sigmask);
608 		PROC_LOCK(td->td_proc);
609 		td->td_proc->p_sigmask = uc->uc_sigmask;
610 		PROC_UNLOCK(td->td_proc);
611 	}
612 	return (ret);
613 }
614 
615 /*
616  * Initialize global thread allocation resources.
617  */
618 void
619 threadinit(void)
620 {
621 
622 #ifndef __ia64__
623 	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
624 	    thread_ctor, thread_dtor, thread_init, thread_fini,
625 	    UMA_ALIGN_CACHE, 0);
626 #else
627 	/*
628 	 * XXX the ia64 kstack allocator is really lame and is at the mercy
629 	 * of contigmalloc().  This hackery is to pre-construct a whole
630 	 * pile of thread structures with associated kernel stacks early
631 	 * in the system startup while contigmalloc() still works. Once we
632 	 * have them, keep them.  Sigh.
633 	 */
634 	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
635 	    thread_ctor, thread_dtor, thread_init, thread_fini,
636 	    UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
637 	uma_prealloc(thread_zone, 512);		/* XXX arbitrary */
638 #endif
639 	ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(),
640 	    NULL, NULL, ksegrp_init, NULL,
641 	    UMA_ALIGN_CACHE, 0);
642 	kse_zone = uma_zcreate("KSE", sched_sizeof_kse(),
643 	    NULL, NULL, kse_init, NULL,
644 	    UMA_ALIGN_CACHE, 0);
645 }
646 
647 /*
648  * Stash an embarrassingly extra thread into the zombie thread queue.
649  */
650 void
651 thread_stash(struct thread *td)
652 {
653 	mtx_lock_spin(&zombie_thread_lock);
654 	TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq);
655 	mtx_unlock_spin(&zombie_thread_lock);
656 }
657 
658 /*
659  * Stash an embarrassingly extra kse into the zombie kse queue.
660  */
661 void
662 kse_stash(struct kse *ke)
663 {
664 	mtx_lock_spin(&zombie_thread_lock);
665 	TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq);
666 	mtx_unlock_spin(&zombie_thread_lock);
667 }
668 
669 /*
670  * Stash an embarrassingly extra ksegrp into the zombie ksegrp queue.
671  */
672 void
673 ksegrp_stash(struct ksegrp *kg)
674 {
675 	mtx_lock_spin(&zombie_thread_lock);
676 	TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp);
677 	mtx_unlock_spin(&zombie_thread_lock);
678 }
679 
680 /*
681  * Reap zombie threads.
682  */
683 void
684 thread_reap(void)
685 {
686 	struct thread *td_first, *td_next;
687 	struct kse *ke_first, *ke_next;
688 	struct ksegrp *kg_first, * kg_next;
689 
690 	/*
691 	 * Don't even bother to lock if there are none at this instant;
692 	 * we really don't care about the next instant.
693 	 */
694 	if ((!TAILQ_EMPTY(&zombie_threads))
695 	    || (!TAILQ_EMPTY(&zombie_kses))
696 	    || (!TAILQ_EMPTY(&zombie_ksegrps))) {
697 		mtx_lock_spin(&zombie_thread_lock);
698 		td_first = TAILQ_FIRST(&zombie_threads);
699 		ke_first = TAILQ_FIRST(&zombie_kses);
700 		kg_first = TAILQ_FIRST(&zombie_ksegrps);
701 		if (td_first)
702 			TAILQ_INIT(&zombie_threads);
703 		if (ke_first)
704 			TAILQ_INIT(&zombie_kses);
705 		if (kg_first)
706 			TAILQ_INIT(&zombie_ksegrps);
707 		mtx_unlock_spin(&zombie_thread_lock);
708 		while (td_first) {
709 			td_next = TAILQ_NEXT(td_first, td_runq);
710 			thread_free(td_first);
711 			td_first = td_next;
712 		}
713 		while (ke_first) {
714 			ke_next = TAILQ_NEXT(ke_first, ke_procq);
715 			kse_free(ke_first);
716 			ke_first = ke_next;
717 		}
718 		while (kg_first) {
719 			kg_next = TAILQ_NEXT(kg_first, kg_ksegrp);
720 			ksegrp_free(kg_first);
721 			kg_first = kg_next;
722 		}
723 	}
724 }
725 
726 /*
727  * Allocate a ksegrp.
728  */
729 struct ksegrp *
730 ksegrp_alloc(void)
731 {
732 	return (uma_zalloc(ksegrp_zone, 0));
733 }
734 
735 /*
736  * Allocate a kse.
737  */
738 struct kse *
739 kse_alloc(void)
740 {
741 	return (uma_zalloc(kse_zone, 0));
742 }
743 
744 /*
745  * Allocate a thread.
746  */
747 struct thread *
748 thread_alloc(void)
749 {
750 	thread_reap(); /* check if any zombies to get */
751 	return (uma_zalloc(thread_zone, 0));
752 }
753 
754 /*
755  * Deallocate a ksegrp.
756  */
757 void
758 ksegrp_free(struct ksegrp *td)
759 {
760 	uma_zfree(ksegrp_zone, td);
761 }
762 
763 /*
764  * Deallocate a kse.
765  */
766 void
767 kse_free(struct kse *td)
768 {
769 	uma_zfree(kse_zone, td);
770 }
771 
772 /*
773  * Deallocate a thread.
774  */
775 void
776 thread_free(struct thread *td)
777 {
778 
779 	cpu_thread_clean(td);
780 	uma_zfree(thread_zone, td);
781 }
782 
783 /*
784  * Store the thread context in the UTS's mailbox,
785  * then add the mailbox at the head of a list we are building in user space.
786  * The list is anchored in the ksegrp structure.
787  */
788 int
789 thread_export_context(struct thread *td)
790 {
791 	struct proc *p;
792 	struct ksegrp *kg;
793 	uintptr_t mbx;
794 	void *addr;
795 	int error;
796 	ucontext_t uc;
797 	uint temp;
798 
799 	p = td->td_proc;
800 	kg = td->td_ksegrp;
801 
802 	/* Export the user/machine context. */
803 #if 0
804 	addr = (caddr_t)td->td_mailbox +
805 	    offsetof(struct kse_thr_mailbox, tm_context);
806 #else /* if user pointer arithmetic is valid in the kernel */
807 		addr = (void *)(&td->td_mailbox->tm_context);
808 #endif
809 	error = copyin(addr, &uc, sizeof(ucontext_t));
810 	if (error)
811 		goto bad;
812 
813 	thread_getcontext(td, &uc);
814 	error = copyout(&uc, addr, sizeof(ucontext_t));
815 	if (error)
816 		goto bad;
817 
818 	/* get address in latest mbox of list pointer */
819 #if 0
820 	addr = (caddr_t)td->td_mailbox
821 	    + offsetof(struct kse_thr_mailbox , tm_next);
822 #else /* if user pointer arithmetic is valid in the kernel */
823 	addr = (void *)(&td->td_mailbox->tm_next);
824 #endif
825 	/*
826 	 * Put the saved address of the previous first
827 	 * entry into this one
828 	 */
829 	for (;;) {
830 		mbx = (uintptr_t)kg->kg_completed;
831 		if (suword(addr, mbx)) {
832 			error = EFAULT;
833 			goto bad;
834 		}
835 		PROC_LOCK(p);
836 		if (mbx == (uintptr_t)kg->kg_completed) {
837 			kg->kg_completed = td->td_mailbox;
838 			PROC_UNLOCK(p);
839 			break;
840 		}
841 		PROC_UNLOCK(p);
842 	}
843 	addr = (caddr_t)td->td_mailbox
844 		 + offsetof(struct kse_thr_mailbox, tm_sticks);
845 	temp = fuword(addr) + td->td_usticks;
846 	if (suword(addr, temp))
847 		goto bad;
848 	return (0);
849 
850 bad:
851 	PROC_LOCK(p);
852 	psignal(p, SIGSEGV);
853 	PROC_UNLOCK(p);
854 	return (error);
855 }
856 
857 /*
858  * Take the list of completed mailboxes for this KSEGRP and put them on this
859  * KSE's mailbox as it's the next one going up.
860  */
861 static int
862 thread_link_mboxes(struct ksegrp *kg, struct kse *ke)
863 {
864 	struct proc *p = kg->kg_proc;
865 	void *addr;
866 	uintptr_t mbx;
867 
868 #if 0
869 	addr = (caddr_t)ke->ke_mailbox
870 	    + offsetof(struct kse_mailbox, km_completed);
871 #else /* if user pointer arithmetic is valid in the kernel */
872 		addr = (void *)(&ke->ke_mailbox->km_completed);
873 #endif
874 	for (;;) {
875 		mbx = (uintptr_t)kg->kg_completed;
876 		if (suword(addr, mbx)) {
877 			PROC_LOCK(p);
878 			psignal(p, SIGSEGV);
879 			PROC_UNLOCK(p);
880 			return (EFAULT);
881 		}
882 		/* XXXKSE could use atomic CMPXCH here */
883 		PROC_LOCK(p);
884 		if (mbx == (uintptr_t)kg->kg_completed) {
885 			kg->kg_completed = NULL;
886 			PROC_UNLOCK(p);
887 			break;
888 		}
889 		PROC_UNLOCK(p);
890 	}
891 	return (0);
892 }
893 
894 /*
895  * This function should be called at statclock interrupt time
896  */
897 int
898 thread_add_ticks_intr(int user, uint ticks)
899 {
900 	struct thread *td = curthread;
901 	struct kse *ke = td->td_kse;
902 
903 	if (ke->ke_mailbox == NULL)
904 		return -1;
905 	if (user) {
906 		/* Currently always done via ast() */
907 		ke->ke_flags |= KEF_ASTPENDING;
908 		ke->ke_uuticks += ticks;
909 	} else {
910 		if (td->td_mailbox != NULL)
911 			td->td_usticks += ticks;
912 		else
913 			ke->ke_usticks += ticks;
914 	}
915 	return 0;
916 }
917 
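/*
 * Fold the tick counts accumulated in the KSE into the tm_uticks and
 * tm_sticks fields of the current thread mailbox in userland,
 * signalling SIGSEGV if the mailbox cannot be written.
 */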
918 static int
919 thread_update_uticks(void)
920 {
921 	struct thread *td = curthread;
922 	struct proc *p = td->td_proc;
923 	struct kse *ke = td->td_kse;
924 	struct kse_thr_mailbox *tmbx;
925 	caddr_t addr;
926 	uint uticks, sticks;
927 
928 	if (ke->ke_mailbox == NULL)
929 		return 0;
930 
931 	uticks = ke->ke_uuticks;
932 	ke->ke_uuticks = 0;
933 	sticks = ke->ke_usticks;
934 	ke->ke_usticks = 0;
935 #if 0
936 	tmbx = (void *)fuword((caddr_t)ke->ke_mailbox
937 	    + offsetof(struct kse_mailbox, km_curthread));
938 #else /* if user pointer arithmetic is ok in the kernel */
939 	tmbx = (void *)fuword( (void *)&ke->ke_mailbox->km_curthread);
940 #endif
941 	if ((tmbx == NULL) || (tmbx == (void *)-1))
942 		return 0;
943 	if (uticks) {
944 		addr = (caddr_t)tmbx + offsetof(struct kse_thr_mailbox, tm_uticks);
945 		uticks += fuword(addr);
946 		if (suword(addr, uticks))
947 			goto bad;
948 	}
949 	if (sticks) {
950 		addr = (caddr_t)tmbx + offsetof(struct kse_thr_mailbox, tm_sticks);
951 		sticks += fuword(addr);
952 		if (suword(addr, sticks))
953 			goto bad;
954 	}
955 	return 0;
956 bad:
957 	PROC_LOCK(p);
958 	psignal(p, SIGSEGV);
959 	PROC_UNLOCK(p);
960 	return -1;
961 }
962 
963 /*
964  * Discard the current thread and exit from its context.
965  *
966  * Because we can't free a thread while we're operating under its context,
967  * push the current thread into our CPU's deadthread holder. This means
968  * we needn't worry about someone else grabbing our context before we
969  * do a cpu_throw().
970  */
971 void
972 thread_exit(void)
973 {
974 	struct thread *td;
975 	struct kse *ke;
976 	struct proc *p;
977 	struct ksegrp	*kg;
978 
979 	td = curthread;
980 	kg = td->td_ksegrp;
981 	p = td->td_proc;
982 	ke = td->td_kse;
983 
984 	mtx_assert(&sched_lock, MA_OWNED);
985 	KASSERT(p != NULL, ("thread exiting without a process"));
986 	KASSERT(ke != NULL, ("thread exiting without a kse"));
987 	KASSERT(kg != NULL, ("thread exiting without a kse group"));
988 	PROC_LOCK_ASSERT(p, MA_OWNED);
989 	CTR1(KTR_PROC, "thread_exit: thread %p", td);
990 	KASSERT(!mtx_owned(&Giant), ("dying thread owns giant"));
991 
992 	if (td->td_standin != NULL) {
993 		thread_stash(td->td_standin);
994 		td->td_standin = NULL;
995 	}
996 
997 	cpu_thread_exit(td);	/* XXXSMP */
998 
999 	/*
1000 	 * The last thread is left attached to the process
1001 	 * so that the whole bundle gets recycled. Skip
1002 	 * all this stuff.
1003 	 */
1004 	if (p->p_numthreads > 1) {
1005 		/*
1006 		 * Unlink this thread from its proc and the kseg.
1007 		 * In keeping with the other structs we probably should
1008 		 * have a thread_unlink() that does some of this but it
1009 		 * would only be called from here (I think) so it would
1010 		 * be a waste. (might be useful for proc_fini() as well.)
1011  		 */
1012 		TAILQ_REMOVE(&p->p_threads, td, td_plist);
1013 		p->p_numthreads--;
1014 		TAILQ_REMOVE(&kg->kg_threads, td, td_kglist);
1015 		kg->kg_numthreads--;
1016 		/*
1017 		 * The test below is NOT true if we are the
1018 		 * sole exiting thread. P_STOPPED_SINGLE is unset
1019 		 * in exit1() after it is the only survivor.
1020 		 */
1021 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1022 			if (p->p_numthreads == p->p_suspcount) {
1023 				thread_unsuspend_one(p->p_singlethread);
1024 			}
1025 		}
1026 
1027 		/* Reassign this thread's KSE. */
1028 		ke->ke_state = KES_UNQUEUED;
1029 
1030 		/*
1031 		 * Decide what to do with the KSE attached to this thread.
1032 		 * XXX Possibly kse_reassign should do both cases as it already
1033 		 * does some of this.
1034 		 */
1035 		if (ke->ke_flags & KEF_EXIT) {
1036 			KASSERT((ke->ke_owner == td),
1037 		    	    ("thread_exit: KSE exiting with non-owner thread"));
1038 			ke->ke_thread = NULL;
1039 			td->td_kse = NULL;
1040 			kse_unlink(ke);
1041 		} else {
1042 			TD_SET_EXITING(td);	/* definitely not runnable */
1043 			kse_reassign(ke);
1044 		}
1045 		PROC_UNLOCK(p);
1046 		td->td_state	= TDS_INACTIVE;
1047 		td->td_proc	= NULL;
1048 		td->td_ksegrp	= NULL;
1049 		td->td_last_kse	= NULL;
1050 		PCPU_SET(deadthread, td);
1051 	} else {
1052 		PROC_UNLOCK(p);
1053 	}
1054 	cpu_throw();
1055 	/* NOTREACHED */
1056 }
1057 
1058 /*
1059  * Do any thread specific cleanups that may be needed in wait()
1060  * called with Giant held, proc and schedlock not held.
1061  */
1062 void
1063 thread_wait(struct proc *p)
1064 {
1065 	struct thread *td;
1066 
1067 	KASSERT((p->p_numthreads == 1), ("Multiple threads in wait1()"));
1068 	KASSERT((p->p_numksegrps == 1), ("Multiple ksegrps in wait1()"));
1069 	FOREACH_THREAD_IN_PROC(p, td) {
1070 		if (td->td_standin != NULL) {
1071 			thread_free(td->td_standin);
1072 			td->td_standin = NULL;
1073 		}
1074 		cpu_thread_clean(td);
1075 	}
1076 	thread_reap();	/* check for zombie threads etc. */
1077 }
1078 
1079 /*
1080  * Link a thread to a process.
1081  * Set up anything that needs to be initialized for it to
1082  * be used by the process.
1083  *
1084  * Note that we do not link to the proc's ucred here.
1085  * The thread is linked as if running but no KSE assigned.
1086  */
1087 void
1088 thread_link(struct thread *td, struct ksegrp *kg)
1089 {
1090 	struct proc *p;
1091 
1092 	p = kg->kg_proc;
1093 	td->td_state = TDS_INACTIVE;
1094 	td->td_proc	= p;
1095 	td->td_ksegrp	= kg;
1096 	td->td_last_kse	= NULL;
1097 
1098 	LIST_INIT(&td->td_contested);
1099 	callout_init(&td->td_slpcallout, 1);
1100 	TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
1101 	TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist);
1102 	p->p_numthreads++;
1103 	kg->kg_numthreads++;
1104 	td->td_kse	= NULL;
1105 }
1106 
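/*
 * Discard all but the caller's ksegrp; any other group must already
 * have lost its KSEs and threads, so just stash it for reaping.
 * Called while single-threading a process for exit.
 */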
1107 void
1108 kse_purge(struct proc *p, struct thread *td)
1109 {
1110 	/* XXXKSE think about this..
1111 		may need to wake up threads on loan queue. */
1112 	struct ksegrp *kg;
1113 
1114  	KASSERT(p->p_numthreads == 1, ("bad thread number"));
1115 	mtx_lock_spin(&sched_lock);
1116 	while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) {
1117 		TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
1118 		p->p_numksegrps--;
1119 		KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) ||
1120 		    ((kg->kg_kses == 1) && (kg == td->td_ksegrp)),
1121 			("wrong kg_kses"));
1122 		if (kg != td->td_ksegrp) {
1123 			ksegrp_stash(kg);
1124 		}
1125 	}
1126 	TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp);
1127 	p->p_numksegrps++;
1128 	mtx_unlock_spin(&sched_lock);
1129 }
1130 
1131 
1132 /*
1133  * Create a thread and schedule it for upcall on the KSE given.
1134  * Use our thread's standin so that we don't have to allocate one.
1135  */
1136 struct thread *
1137 thread_schedule_upcall(struct thread *td, struct kse *ke)
1138 {
1139 	struct thread *td2;
1140 	int newkse;
1141 
1142 	mtx_assert(&sched_lock, MA_OWNED);
1143 	newkse = (ke != td->td_kse);
1144 
1145 	/*
1146 	 * If the owner and kse are BOUND then that thread is planning to
1147 	 * go to userland and upcalls are not expected. So don't make one.
1148 	 * If it is not bound then make it so with the spare thread
1149 	 * and then borrow back the KSE to allow us to complete some in-kernel
1150 	 * work. When we complete, the Bound thread will have the chance to
1151 	 * complete. This thread will sleep as planned. Hopefully there will
1152 	 * eventually be an unbound thread that can be converted to an
1153 	 * upcall to report the completion of this thread.
1154 	 */
1155 
1156 	if ((td2 = td->td_standin) != NULL) {
1157 		td->td_standin = NULL;
1158 	} else {
1159 		if (newkse)
1160 			panic("no reserve thread when called with a new kse");
1161 		/*
1162 		 * If called from (e.g.) sleep and we do not have
1163 		 * a reserve thread, then we've used it, so do not
1164 		 * create an upcall.
1165 		 */
1166 		return (NULL);
1167 	}
1168 	CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)",
1169 	     td2, td->td_proc->p_pid, td->td_proc->p_comm);
1170 	bzero(&td2->td_startzero,
1171 	    (unsigned)RANGEOF(struct thread, td_startzero, td_endzero));
1172 	bcopy(&td->td_startcopy, &td2->td_startcopy,
1173 	    (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy));
1174 	thread_link(td2, ke->ke_ksegrp);
1175 	cpu_set_upcall(td2, td->td_pcb);
1176 
1177 	/*
1178 	 * XXXKSE do we really need this? (default values for the
1179 	 * frame).
1180 	 */
1181 	bcopy(td->td_frame, td2->td_frame, sizeof(struct trapframe));
1182 
1183 	/*
1184 	 * Bind the new thread to the KSE,
1185 	 * and if it's our KSE, lend it back to ourself
1186 	 * so we can continue running.
1187 	 */
1188 	td2->td_ucred = crhold(td->td_ucred);
1189 	td2->td_flags = TDF_UPCALLING; /* note: BOUND */
1190 	td2->td_kse = ke;
1191 	td2->td_state = TDS_CAN_RUN;
1192 	td2->td_inhibitors = 0;
1193 	ke->ke_owner = td2;
1194 	/*
1195 	 * If called from kse_reassign(), we are working on the current
1196 	 * KSE so fake that we borrowed it. If called from
1197 	 * kse_create(), don't, as we have a new kse too.
1198 	 */
1199 	if (!newkse) {
1200 		/*
1201 		 * This thread will be scheduled when the current thread
1202 		 * blocks, exits or tries to enter userspace, (which ever
1203 		 * happens first). When that happens the KSE will "revert"
1204 		 * to this thread in a BOUND manner. Since we are called
1205 		 * from msleep() this is going to be "very soon" in nearly
1206 		 * all cases.
1207 		 */
1208 		TD_SET_LOAN(td2);
1209 	} else {
1210 		ke->ke_thread = td2;
1211 		ke->ke_state = KES_THREAD;
1212 		setrunqueue(td2);
1213 	}
1214 	return (td2);	/* bogus.. should be a void function */
1215 }
1216 
1217 /*
1218  * Schedule an upcall to notify a KSE process that it received signals.
1219  *
1220  * XXX - Modifying a sigset_t like this is totally bogus.
1221  */
1222 struct thread *
1223 signal_upcall(struct proc *p, int sig)
1224 {
1225 	struct thread *td, *td2;
1226 	struct kse *ke;
1227 	sigset_t ss;
1228 	int error;
1229 
1230 	PROC_LOCK_ASSERT(p, MA_OWNED);
1231 return (NULL);
1232 
1233 	td = FIRST_THREAD_IN_PROC(p);
1234 	ke = td->td_kse;
1235 	PROC_UNLOCK(p);
1236 	error = copyin(&ke->ke_mailbox->km_sigscaught, &ss, sizeof(sigset_t));
1237 	PROC_LOCK(p);
1238 	if (error)
1239 		return (NULL);
1240 	SIGADDSET(ss, sig);
1241 	PROC_UNLOCK(p);
1242 	error = copyout(&ss, &ke->ke_mailbox->km_sigscaught, sizeof(sigset_t));
1243 	PROC_LOCK(p);
1244 	if (error)
1245 		return (NULL);
1246 	if (td->td_standin == NULL)
1247 		td->td_standin = thread_alloc();
1248 	mtx_lock_spin(&sched_lock);
1249 	td2 = thread_schedule_upcall(td, ke); /* Bogus JRE */
1250 	mtx_unlock_spin(&sched_lock);
1251 	return (td2);
1252 }
1253 
1254 /*
1255  * Setup done on the thread when it enters the kernel.
1256  * XXXKSE Presently only for syscalls but eventually all kernel entries.
1257  */
1258 void
1259 thread_user_enter(struct proc *p, struct thread *td)
1260 {
1261 	struct kse *ke;
1262 
1263 	/*
1264 	 * First check that we shouldn't just abort.
1265 	 * But check if we are the single thread first!
1266 	 * XXX p_singlethread not locked, but should be safe.
1267 	 */
1268 	if ((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) {
1269 		PROC_LOCK(p);
1270 		mtx_lock_spin(&sched_lock);
1271 		thread_exit();
1272 		/* NOTREACHED */
1273 	}
1274 
1275 	/*
1276 	 * If we are doing a syscall in a KSE environment,
1277 	 * note where our mailbox is. There is always the
1278 	 * possibility that we could do this lazily (in kse_reassign()),
1279 	 * but for now do it every time.
1280 	 */
1281 	ke = td->td_kse;
1282 	td->td_flags &= ~TDF_UNBOUND;
1283 	if (ke->ke_mailbox != NULL) {
1284 #if 0
1285 		td->td_mailbox = (void *)fuword((caddr_t)ke->ke_mailbox
1286 		    + offsetof(struct kse_mailbox, km_curthread));
1287 #else /* if user pointer arithmetic is ok in the kernel */
1288 		td->td_mailbox =
1289 		    (void *)fuword( (void *)&ke->ke_mailbox->km_curthread);
1290 #endif
1291 		if ((td->td_mailbox == NULL) ||
1292 		    (td->td_mailbox == (void *)-1)) {
1293 			td->td_mailbox = NULL;	/* single thread it.. */
1294 			mtx_lock_spin(&sched_lock);
1295 			td->td_flags &= ~(TDF_UNBOUND|TDF_CAN_UNBIND);
1296 			mtx_unlock_spin(&sched_lock);
1297 		} else {
1298 			/*
1299 			 * When the thread limit is reached, act as if the thread
1300 			 * has already done an upcall.
1301 			 */
1302 			if (p->p_numthreads > max_threads_per_proc) {
1303 				if (td->td_standin != NULL) {
1304 					thread_stash(td->td_standin);
1305 					td->td_standin = NULL;
1306 				}
1307 			} else {
1308 				if (td->td_standin == NULL)
1309 					td->td_standin = thread_alloc();
1310 			}
1311 			mtx_lock_spin(&sched_lock);
1312 			td->td_flags |= TDF_CAN_UNBIND;
1313 			mtx_unlock_spin(&sched_lock);
1314 			KASSERT((ke->ke_owner == td),
1315 			    ("thread_user_enter: No starting owner "));
1316 			ke->ke_owner = td;
1317 			td->td_usticks = 0;
1318 		}
1319 	}
1320 }
1321 
1322 /*
1323  * The extra work we go through if we are a threaded process when we
1324  * return to userland.
1325  *
1326  * If we are a KSE process and returning to user mode, check for
1327  * extra work to do before we return (e.g. for more syscalls
1328  * to complete first).  If we were in a critical section, we should
1329  * just return to let it finish. Same if we were in the UTS (in
1330  * which case the mailbox's context's busy indicator will be set).
1331  * The only traps we support will have set the mailbox.
1332  * We will clear it here.
1333  */
1334 int
1335 thread_userret(struct thread *td, struct trapframe *frame)
1336 {
1337 	int error;
1338 	int unbound;
1339 	struct kse *ke;
1340 	struct ksegrp *kg;
1341 	struct thread *worktodo;
1342 	struct proc *p;
1343 	struct timespec ts;
1344 
1345 	KASSERT((td->td_kse && td->td_kse->ke_thread && td->td_kse->ke_owner),
1346 	    ("thread_userret: bad thread/kse pointers"));
1347 	KASSERT((td == curthread),
1348 	    ("thread_userret: bad thread argument"));
1349 
1350 
1351 	kg = td->td_ksegrp;
1352 	p = td->td_proc;
1353 	error = 0;
1354 	unbound = TD_IS_UNBOUND(td);
1355 
1356 	mtx_lock_spin(&sched_lock);
1357 	if ((worktodo = kg->kg_last_assigned))
1358 		worktodo = TAILQ_NEXT(worktodo, td_runq);
1359 	else
1360 		worktodo = TAILQ_FIRST(&kg->kg_runq);
1361 
1362 	/*
1363 	 * Permanently bound threads never upcall but they may
1364 	 * loan out their KSE at this point.
1365 	 * Upcalls imply bound. They also may want to do some philanthropy.
1366 	 * Temporarily bound threads on the other hand either yield
1367 	 * to other work and transform into an upcall, or proceed back to
1368 	 * userland.
1369 	 */
1370 
1371 	if (TD_CAN_UNBIND(td)) {
1372 		td->td_flags &= ~(TDF_UNBOUND|TDF_CAN_UNBIND);
1373 		if (!worktodo && (kg->kg_completed == NULL) &&
1374 		    !(td->td_kse->ke_flags & KEF_DOUPCALL)) {
1375 			/*
1376 			 * This thread has not started any upcall.
1377 			 * If there is no work to report other than
1378 			 * ourself, then it can return direct to userland.
1379 			 */
1380 justreturn:
1381 			mtx_unlock_spin(&sched_lock);
1382 			thread_update_uticks();
1383 			td->td_mailbox = NULL;
1384 			return (0);
1385 		}
1386 		mtx_unlock_spin(&sched_lock);
1387 		error = thread_export_context(td);
1388 		td->td_usticks = 0;
1389 		if (error) {
1390 			/*
1391 			 * As we are not running on a borrowed KSE,
1392 			 * failing to do the KSE operation just defaults
1393 			 * back to synchronous operation, so just return from
1394 			 * the syscall.
1395 			 */
1396 			goto justreturn;
1397 		}
1398 		mtx_lock_spin(&sched_lock);
1399 		/*
1400 		 * Turn ourself into a bound upcall.
1401 		 * We will rely on kse_reassign()
1402 		 * to make us run at a later time.
1403 		 */
1404 		td->td_flags |= TDF_UPCALLING;
1405 
1406 		/* there may be more work since we re-locked schedlock */
1407 		if ((worktodo = kg->kg_last_assigned))
1408 			worktodo = TAILQ_NEXT(worktodo, td_runq);
1409 		else
1410 			worktodo = TAILQ_FIRST(&kg->kg_runq);
1411 	} else if (unbound) {
1412 		/*
1413 		 * We are an unbound thread, looking to
1414 		 * return to user space. There must be another owner
1415 		 * of this KSE.
1416 		 * We are using a borrowed KSE. Save state and exit.
1417 		 * kse_reassign() will recycle the kse as needed.
1418 		 */
1419 		mtx_unlock_spin(&sched_lock);
1420 		error = thread_export_context(td);
1421 		td->td_usticks = 0;
1422 		if (error) {
1423 			/*
1424 			 * There is nothing we can do.
1425 			 * We just lose that context. We
1426 			 * probably should note this somewhere and send
1427 			 * the process a signal.
1428 			 */
1429 			PROC_LOCK(td->td_proc);
1430 			psignal(td->td_proc, SIGSEGV);
1431 			mtx_lock_spin(&sched_lock);
1432 			ke = td->td_kse;
1433 			/* possibly upcall with error? */
1434 		} else {
1435 			/*
1436 			 * Don't make an upcall, just exit so that the owner
1437 			 * can get its KSE if it wants it.
1438 			 * Our context is already safely stored for later
1439 			 * use by the UTS.
1440 			 */
1441 			PROC_LOCK(p);
1442 			mtx_lock_spin(&sched_lock);
1443 			ke = td->td_kse;
1444 		}
1445 		/*
1446 		 * If the owner is idling, we now have something for it
1447 		 * to report, so make it runnable.
1448 		 * If the owner is not an upcall, make an attempt to
1449 		 * ensure that at least one of any IDLED upcalls can
1450 		 * wake up.
1451 		 */
1452 		if (ke->ke_owner->td_flags & TDF_UPCALLING) {
1453 			TD_CLR_IDLE(ke->ke_owner);
1454 		} else {
1455 			FOREACH_KSE_IN_GROUP(kg, ke) {
1456 				if (TD_IS_IDLE(ke->ke_owner)) {
1457 					TD_CLR_IDLE(ke->ke_owner);
1458 					setrunnable(ke->ke_owner);
1459 					break;
1460 				}
1461 			}
1462 		}
1463 		thread_exit();
1464 	}
1465 	/*
1466 	 * We ARE going back to userland with this KSE.
1467 	 * We are permanently bound. We may be an upcall.
1468 	 * If an upcall, check for threads that need to borrow the KSE.
1469 	 * Any other thread that comes ready after this missed the boat.
1470 	 */
1471 	ke = td->td_kse;
1472 
1473 	/*
1474 	 *  If not upcalling, go back to userspace.
1475 	 * If we are, get the upcall set up.
1476 	 */
1477 	if (td->td_flags & TDF_UPCALLING) {
1478 		if (worktodo)  {
1479 			/*
1480 			 * force a switch to more urgent 'in kernel'
1481 			 * work. Control will return to this thread
1482 			 * when there is no more work to do.
1483 			 * kse_reassign() will do that for us.
1484 			 */
1485 			TD_SET_LOAN(td);
1486 			p->p_stats->p_ru.ru_nvcsw++;
1487 			mi_switch(); /* kse_reassign() will (re)find worktodo */
1488 		}
1489 		td->td_flags &= ~TDF_UPCALLING;
1490 		if (ke->ke_flags & KEF_DOUPCALL)
1491 			ke->ke_flags &= ~KEF_DOUPCALL;
1492 		mtx_unlock_spin(&sched_lock);
1493 
1494 		/*
1495 		 * There is no more work to do and we are going to ride
1496 		 * this thread/KSE up to userland as an upcall.
1497 		 * Do the last parts of the setup needed for the upcall.
1498 		 */
1499 		CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)",
1500 		    td, td->td_proc->p_pid, td->td_proc->p_comm);
1501 
1502 		/*
1503 		 * Set user context to the UTS.
1504 		 * Will use Giant in cpu_thread_clean() because it uses
1505 		 * kmem_free(kernel_map, ...)
1506 		 */
1507 		cpu_set_upcall_kse(td, ke);
1508 
1509 		/*
1510 		 * Unhook the list of completed threads.
1511 		 * Anything that completes after this gets to
1512 		 * come in next time.
1513 		 * Put the list of completed thread mailboxes on
1514 		 * this KSE's mailbox.
1515 		 */
1516 		error = thread_link_mboxes(kg, ke);
1517 		if (error)
1518 			goto bad;
1519 
1520 		/*
1521 		 * Set state and clear the thread mailbox pointer.
1522 		 * From now on we are just a bound outgoing process.
1523 		 * **Problem** userret is often called several times.
1524 		 * It would be nice if this all happened only on the first
1525 		 * time through (the scan for extra work etc.).
1526 		 */
1527 #if 0
1528 		error = suword((caddr_t)ke->ke_mailbox +
1529 		    offsetof(struct kse_mailbox, km_curthread), 0);
1530 #else	/* if user pointer arithmetic is ok in the kernel */
1531 		error = suword((caddr_t)&ke->ke_mailbox->km_curthread, 0);
1532 #endif
1533 		ke->ke_uuticks = ke->ke_usticks = 0;
1534 		if (error)
1535 			goto bad;
1536 		nanotime(&ts);
1537 		if (copyout(&ts,
1538 		    (caddr_t)&ke->ke_mailbox->km_timeofday, sizeof(ts))) {
1539 			goto bad;
1540 		}
1541 	} else {
1542 		mtx_unlock_spin(&sched_lock);
1543 	}
1544 	/*
1545 	 * Optimisation:
1546 	 * Ensure that we have a spare thread available,
1547 	 * for when we re-enter the kernel.
1548 	 */
1549 	if (td->td_standin == NULL) {
1550 		td->td_standin = thread_alloc();
1551 	}
1552 
1553 	thread_update_uticks();
1554 	td->td_mailbox = NULL;
1555 	return (0);
1556 
1557 bad:
1558 	/*
1559 	 * Things are going to be so screwed we should just kill the process.
1560 	 * How do we do that?
1561 	 */
1562 	PROC_LOCK(td->td_proc);
1563 	psignal(td->td_proc, SIGSEGV);
1564 	PROC_UNLOCK(td->td_proc);
1565 	td->td_mailbox = NULL;
1566 	return (error);	/* go sync */
1567 }
1568 
1569 /*
1570  * Enforce single-threading.
1571  *
1572  * Returns 1 if the caller must abort (another thread is waiting to
1573  * exit the process or similar). Process is locked!
1574  * Returns 0 when you are successfully the only thread running.
1575  * A process has successfully single threaded in the suspend mode when
1576  * there are no threads in user mode. Threads in the kernel must be
1577  * allowed to continue until they get to the user boundary. They may even
1578  * copy out their return values and data before suspending. They may however be
1579  * accelerated in reaching the user boundary as we will wake up
1580  * any sleeping threads that are interruptible (PCATCH).
1581  */
1582 int
1583 thread_single(int force_exit)
1584 {
1585 	struct thread *td;
1586 	struct thread *td2;
1587 	struct proc *p;
1588 
1589 	td = curthread;
1590 	p = td->td_proc;
1591 	mtx_assert(&Giant, MA_OWNED);
1592 	PROC_LOCK_ASSERT(p, MA_OWNED);
1593 	KASSERT((td != NULL), ("curthread is NULL"));
1594 
1595 	if ((p->p_flag & P_KSES) == 0)
1596 		return (0);
1597 
1598 	/* Is someone already single threading? */
1599 	if (p->p_singlethread)
1600 		return (1);
1601 
1602 	if (force_exit == SINGLE_EXIT) {
1603 		p->p_flag |= P_SINGLE_EXIT;
1604 		td->td_flags &= ~TDF_UNBOUND;
1605 	} else
1606 		p->p_flag &= ~P_SINGLE_EXIT;
1607 	p->p_flag |= P_STOPPED_SINGLE;
1608 	p->p_singlethread = td;
1609 	/* XXXKSE Which lock protects the below values? */
1610 	while ((p->p_numthreads - p->p_suspcount) != 1) {
1611 		mtx_lock_spin(&sched_lock);
1612 		FOREACH_THREAD_IN_PROC(p, td2) {
1613 			if (td2 == td)
1614 				continue;
1615 			if (TD_IS_INHIBITED(td2)) {
1616 				if (force_exit == SINGLE_EXIT) {
1617 					if (TD_IS_SUSPENDED(td2)) {
1618 						thread_unsuspend_one(td2);
1619 					}
1620 					if (TD_ON_SLEEPQ(td2) &&
1621 					    (td2->td_flags & TDF_SINTR)) {
1622 						if (td2->td_flags & TDF_CVWAITQ)
1623 							cv_abort(td2);
1624 						else
1625 							abortsleep(td2);
1626 					}
1627 					if (TD_IS_IDLE(td2)) {
1628 						TD_CLR_IDLE(td2);
1629 					}
1630 				} else {
1631 					if (TD_IS_SUSPENDED(td2))
1632 						continue;
1633 					/* maybe other inhibited states too? */
1634 					if (td2->td_inhibitors &
1635 					    (TDI_SLEEPING | TDI_SWAPPED |
1636 					    TDI_LOAN | TDI_IDLE |
1637 					    TDI_EXITING))
1638 						thread_suspend_one(td2);
1639 				}
1640 			}
1641 		}
1642 		/*
1643 		 * Maybe we suspended some threads.. was it enough?
1644 		 */
1645 		if ((p->p_numthreads - p->p_suspcount) == 1) {
1646 			mtx_unlock_spin(&sched_lock);
1647 			break;
1648 		}
1649 
1650 		/*
1651 		 * Wake us up when everyone else has suspended.
1652 		 * In the meantime we suspend as well.
1653 		 */
1654 		thread_suspend_one(td);
1655 		mtx_unlock(&Giant);
1656 		PROC_UNLOCK(p);
1657 		p->p_stats->p_ru.ru_nvcsw++;
1658 		mi_switch();
1659 		mtx_unlock_spin(&sched_lock);
1660 		mtx_lock(&Giant);
1661 		PROC_LOCK(p);
1662 	}
1663 	if (force_exit == SINGLE_EXIT)
1664 		kse_purge(p, td);
1665 	return (0);
1666 }
1667 
1668 /*
1669  * Called in from locations that can safely check to see
1670  * whether we have to suspend or at least throttle for a
1671  * single-thread event (e.g. fork).
1672  *
1673  * Such locations include userret().
1674  * If the "return_instead" argument is non zero, the thread must be able to
1675  * accept 0 (caller may continue), or 1 (caller must abort) as a result.
1676  *
1677  * The 'return_instead' argument tells the function if it may do a
1678  * thread_exit() or suspend, or whether the caller must abort and back
1679  * out instead.
1680  *
1681  * If the thread that set the single_threading request has set the
1682  * P_SINGLE_EXIT bit in the process flags then this call will never return
1683  * if 'return_instead' is false, but will exit.
1684  *
1685  * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
1686  *---------------+--------------------+---------------------
1687  *       0       | returns 0          |   returns 0 or 1
1688  *               | when ST ends       |   immediately
1689  *---------------+--------------------+---------------------
1690  *       1       | thread exits       |   returns 1
1691  *               |                    |  immediately
1692  * 0 = thread_exit() or suspension ok,
1693  * other = return error instead of stopping the thread.
1694  *
1695  * While a full suspension is under effect, even a single threading
1696  * thread would be suspended if it made this call (but it shouldn't).
1697  * This call should only be made from places where
1698  * thread_exit() would be safe as that may be the outcome unless
1699  * return_instead is set.
1700  */
1701 int
1702 thread_suspend_check(int return_instead)
1703 {
1704 	struct thread *td;
1705 	struct proc *p;
1706 	struct kse *ke;
1707 	struct ksegrp *kg;
1708 
1709 	td = curthread;
1710 	p = td->td_proc;
1711 	kg = td->td_ksegrp;
1712 	PROC_LOCK_ASSERT(p, MA_OWNED);
1713 	while (P_SHOULDSTOP(p)) {
1714 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1715 			KASSERT(p->p_singlethread != NULL,
1716 			    ("singlethread not set"));
1717 			/*
1718 			 * The only suspension in action is a
1719 			 * single-threading. The single threader need not stop.
1720 			 * XXX Should be safe to access unlocked
1721 			 * as it can only be set to be true by us.
1722 			 */
1723 			if (p->p_singlethread == td)
1724 				return (0);	/* Exempt from stopping. */
1725 		}
1726 		if (return_instead)
1727 			return (1);
1728 
1729 		/*
1730 		 * If the process is waiting for us to exit,
1731 		 * this thread should just suicide.
1732 		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
1733 		 */
1734 		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
1735 			mtx_lock_spin(&sched_lock);
1736 			while (mtx_owned(&Giant))
1737 				mtx_unlock(&Giant);
1738 			/*
1739 			 * All threads should be exiting
1740 			 * All threads should be exiting
1741 			 * unless they are the active "singlethread".
1742 			 * Destroy un-needed KSEs as we go.
1743 			 */
1744 			ke = td->td_kse;
1745 			if (ke->ke_owner == td &&
1746 			    (kg->kg_kses >= kg->kg_numthreads ))
1747 				ke->ke_flags |= KEF_EXIT;
1748 			thread_exit();
1749 		}
1750 
1751 		/*
1752 		 * When a thread suspends, it just
1753 		 * moves to the process's suspend queue
1754 		 * and stays there.
1755 		 *
1756 		 * XXXKSE if TDF_BOUND is true
1757 		 * it will not release its KSE which might
1758 		 * lead to deadlock if there are not enough KSEs
1759 		 * to complete all waiting threads.
1760 		 * Maybe be able to 'lend' it out again.
1761 		 * (lent kse's can not go back to userland?)
1762 		 * and can only be lent in STOPPED state.
1763 		 */
1764 		mtx_lock_spin(&sched_lock);
1765 		if ((p->p_flag & P_STOPPED_SIG) &&
1766 		    (p->p_suspcount+1 == p->p_numthreads)) {
1767 			mtx_unlock_spin(&sched_lock);
1768 			PROC_LOCK(p->p_pptr);
1769 			if ((p->p_pptr->p_procsig->ps_flag &
1770 				PS_NOCLDSTOP) == 0) {
1771 				psignal(p->p_pptr, SIGCHLD);
1772 			}
1773 			PROC_UNLOCK(p->p_pptr);
1774 			mtx_lock_spin(&sched_lock);
1775 		}
1776 		mtx_assert(&Giant, MA_NOTOWNED);
1777 		thread_suspend_one(td);
1778 		PROC_UNLOCK(p);
1779 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1780 			if (p->p_numthreads == p->p_suspcount) {
1781 				thread_unsuspend_one(p->p_singlethread);
1782 			}
1783 		}
1784 		p->p_stats->p_ru.ru_nivcsw++;
1785 		mi_switch();
1786 		mtx_unlock_spin(&sched_lock);
1787 		PROC_LOCK(p);
1788 	}
1789 	return (0);
1790 }
1791 
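/*
 * Suspend a single thread: account for it and put it on the process's
 * suspend queue.  The caller must hold sched_lock.
 */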
1792 void
1793 thread_suspend_one(struct thread *td)
1794 {
1795 	struct proc *p = td->td_proc;
1796 
1797 	mtx_assert(&sched_lock, MA_OWNED);
1798 	p->p_suspcount++;
1799 	TD_SET_SUSPENDED(td);
1800 	TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq);
1801 	/*
1802 	 * Hack: If we are suspending but are on the sleep queue
1803 	 * then we are in msleep or the cv equivalent. We
1804 	 * want to look like we have two Inhibitors.
1805 	 * May already be set.. doesn't matter.
1806 	 */
1807 	if (TD_ON_SLEEPQ(td))
1808 		TD_SET_SLEEPING(td);
1809 }
1810 
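/*
 * Remove a thread from the suspend queue and make it runnable again.
 * The caller must hold sched_lock.
 */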
1811 void
1812 thread_unsuspend_one(struct thread *td)
1813 {
1814 	struct proc *p = td->td_proc;
1815 
1816 	mtx_assert(&sched_lock, MA_OWNED);
1817 	TAILQ_REMOVE(&p->p_suspended, td, td_runq);
1818 	TD_CLR_SUSPENDED(td);
1819 	p->p_suspcount--;
1820 	setrunnable(td);
1821 }
1822 
1823 /*
1824  * Allow all threads blocked by single threading to continue running.
1825  */
1826 void
1827 thread_unsuspend(struct proc *p)
1828 {
1829 	struct thread *td;
1830 
1831 	mtx_assert(&sched_lock, MA_OWNED);
1832 	PROC_LOCK_ASSERT(p, MA_OWNED);
1833 	if (!P_SHOULDSTOP(p)) {
1834 		while (( td = TAILQ_FIRST(&p->p_suspended))) {
1835 			thread_unsuspend_one(td);
1836 		}
1837 	} else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
1838 	    (p->p_numthreads == p->p_suspcount)) {
1839 		/*
1840 		 * Stopping everything also did the job for the single
1841 		 * threading request. Now we've downgraded to single-threaded,
1842 		 * let it continue.
1843 		 */
1844 		thread_unsuspend_one(p->p_singlethread);
1845 	}
1846 }
1847 
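/*
 * End the single threading state, letting any suspended threads run
 * again unless the whole process is under a blanket stop.
 */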
1848 void
1849 thread_single_end(void)
1850 {
1851 	struct thread *td;
1852 	struct proc *p;
1853 
1854 	td = curthread;
1855 	p = td->td_proc;
1856 	PROC_LOCK_ASSERT(p, MA_OWNED);
1857 	p->p_flag &= ~P_STOPPED_SINGLE;
1858 	p->p_singlethread = NULL;
1859 	/*
1860 	 * If there are other threads they may now run,
1861 	 * unless of course there is a blanket 'stop order'
1862 	 * on the process. The single threader must be allowed
1863 	 * to continue however as this is a bad place to stop.
1864 	 */
1865 	if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
1866 		mtx_lock_spin(&sched_lock);
1867 		while (( td = TAILQ_FIRST(&p->p_suspended))) {
1868 			thread_unsuspend_one(td);
1869 		}
1870 		mtx_unlock_spin(&sched_lock);
1871 	}
1872 }
1873 
1874 
1875