xref: /freebsd/sys/kern/kern_thread.c (revision b19d9defef17fd447813157b9e7fd8ad26a78cb2)
1 /*
2  * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
3  *  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice(s), this list of conditions and the following disclaimer as
10  *    the first lines of this file unmodified other than the possible
11  *    addition of one or more copyright notices.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice(s), this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
26  * DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/kernel.h>
34 #include <sys/lock.h>
35 #include <sys/malloc.h>
36 #include <sys/mutex.h>
37 #include <sys/proc.h>
38 #include <sys/smp.h>
39 #include <sys/sysctl.h>
40 #include <sys/sysproto.h>
41 #include <sys/filedesc.h>
42 #include <sys/sched.h>
43 #include <sys/signalvar.h>
44 #include <sys/sx.h>
45 #include <sys/tty.h>
46 #include <sys/user.h>
47 #include <sys/jail.h>
48 #include <sys/kse.h>
49 #include <sys/ktr.h>
50 #include <sys/ucontext.h>
51 
52 #include <vm/vm.h>
53 #include <vm/vm_object.h>
54 #include <vm/pmap.h>
55 #include <vm/uma.h>
56 #include <vm/vm_map.h>
57 
58 #include <machine/frame.h>
59 
60 /*
61  * KSEGRP related storage.
62  */
63 static uma_zone_t ksegrp_zone;
64 static uma_zone_t kse_zone;
65 static uma_zone_t thread_zone;
66 
67 /* DEBUG ONLY */
68 SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
69 static int oiks_debug = 0;	/* 0 disable, 1 printf, 2 enter debugger */
70 SYSCTL_INT(_kern_threads, OID_AUTO, oiks, CTLFLAG_RW,
71 	&oiks_debug, 0, "OIKS thread debug");
72 
73 static int oiks_max_threads_per_proc = 10;
74 SYSCTL_INT(_kern_threads, OID_AUTO, oiks_max_per_proc, CTLFLAG_RW,
75 	&oiks_max_threads_per_proc, 0, "Debug limit on threads per proc");
76 
77 static int max_threads_per_proc = 30;
78 SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
79 	&max_threads_per_proc, 0, "Limit on threads per proc");
80 
81 static int max_groups_per_proc = 5;
82 SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW,
83 	&max_groups_per_proc, 0, "Limit on thread groups per proc");
84 
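/*
 * RANGEOF() gives the size in bytes of the region of a structure that lies
 * between two members; it is used with bzero()/bcopy() on the
 * startzero/endzero and startcopy/endcopy sections of the structures below.
 */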
85 #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
86 
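/*
 * Deferred-free ("zombie") queues.  Threads, KSEs and KSEGRPs that cannot be
 * freed in their own context (e.g. a thread still running on its own stack)
 * are parked here under zombie_thread_lock and reclaimed later by
 * thread_reap().
 */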
87 struct threadqueue zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
88 TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses);
89 TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps);
90 struct mtx zombie_thread_lock;
91 MTX_SYSINIT(zombie_thread_lock, &zombie_thread_lock,
92     "zombie_thread_lock", MTX_SPIN);
93 
94 
95 
96 void kse_purge(struct proc *p, struct thread *td);
97 /*
98  * Prepare a thread for use.
99  */
100 static void
101 thread_ctor(void *mem, int size, void *arg)
102 {
103 	struct thread	*td;
104 
105 	td = (struct thread *)mem;
106 	td->td_state = TDS_INACTIVE;
107 	td->td_flags |= TDF_UNBOUND;
108 }
109 
110 /*
111  * Reclaim a thread after use.
112  */
113 static void
114 thread_dtor(void *mem, int size, void *arg)
115 {
116 	struct thread	*td;
117 
118 	mtx_assert(&Giant, MA_OWNED);
119 	td = (struct thread *)mem;
120 
121 #ifdef INVARIANTS
122 	/* Verify that this thread is in a safe state to free. */
123 	switch (td->td_state) {
124 	case TDS_INHIBITED:
125 	case TDS_RUNNING:
126 	case TDS_CAN_RUN:
127 	case TDS_RUNQ:
128 		/*
129 		 * We must never unlink a thread that is in one of
130 		 * these states, because it is currently active.
131 		 */
132 		panic("bad state for thread unlinking");
133 		/* NOTREACHED */
134 	case TDS_INACTIVE:
135 		break;
136 	default:
137 		panic("bad thread state");
138 		/* NOTREACHED */
139 	}
140 #endif
141 
142 	cpu_thread_dtor(td);
143 }
144 
145 /*
146  * Initialize type-stable parts of a thread (when newly created).
147  */
148 static void
149 thread_init(void *mem, int size)
150 {
151 	struct thread	*td;
152 
153 	td = (struct thread *)mem;
154 	mtx_lock(&Giant);
155 	pmap_new_thread(td, 0);
156 	mtx_unlock(&Giant);
157 	cpu_thread_setup(td);
158 	td->td_sched = (struct td_sched *)&td[1];
159 }
160 
161 /*
162  * Tear down type-stable parts of a thread (just before being discarded).
163  */
164 static void
165 thread_fini(void *mem, int size)
166 {
167 	struct thread	*td;
168 
169 	td = (struct thread *)mem;
170 	pmap_dispose_thread(td);
171 }
172 /*
173  * Initialize type-stable parts of a kse (when newly created).
174  */
175 static void
176 kse_init(void *mem, int size)
177 {
178 	struct kse	*ke;
179 
180 	ke = (struct kse *)mem;
181 	ke->ke_sched = (struct ke_sched *)&ke[1];
182 }
183 /*
184  * Initialize type-stable parts of a ksegrp (when newly created).
185  */
186 static void
187 ksegrp_init(void *mem, int size)
188 {
189 	struct ksegrp	*kg;
190 
191 	kg = (struct ksegrp *)mem;
192 	kg->kg_sched = (struct kg_sched *)&kg[1];
193 }
194 
195 /*
196  * KSE is linked onto the idle queue.
197  */
198 void
199 kse_link(struct kse *ke, struct ksegrp *kg)
200 {
201 	struct proc *p = kg->kg_proc;
202 
203 	TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
204 	kg->kg_kses++;
205 	ke->ke_state = KES_UNQUEUED;
206 	ke->ke_proc	= p;
207 	ke->ke_ksegrp	= kg;
208 	ke->ke_thread	= NULL;
209 	ke->ke_oncpu = NOCPU;
210 }
211 
212 void
213 kse_unlink(struct kse *ke)
214 {
215 	struct ksegrp *kg;
216 
217 	mtx_assert(&sched_lock, MA_OWNED);
218 	kg = ke->ke_ksegrp;
219 	if (ke->ke_state == KES_IDLE) {
220 		kg->kg_idle_kses--;
221 		TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
222 	}
223 
224 	TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
225 	if (--kg->kg_kses == 0) {
226 			ksegrp_unlink(kg);
227 	}
228 	/*
229 	 * Aggregate stats from the KSE
230 	 */
231 	kse_stash(ke);
232 }
233 
234 void
235 ksegrp_link(struct ksegrp *kg, struct proc *p)
236 {
237 
238 	TAILQ_INIT(&kg->kg_threads);
239 	TAILQ_INIT(&kg->kg_runq);	/* links with td_runq */
240 	TAILQ_INIT(&kg->kg_slpq);	/* links with td_runq */
241 	TAILQ_INIT(&kg->kg_kseq);	/* all kses in ksegrp */
242 	TAILQ_INIT(&kg->kg_iq);		/* idle kses in ksegrp */
243 	TAILQ_INIT(&kg->kg_lq);		/* loan kses in ksegrp */
244 	kg->kg_proc	= p;
245 /* the following counters are in the -zero- section and may not need clearing */
246 	kg->kg_numthreads = 0;
247 	kg->kg_runnable = 0;
248 	kg->kg_kses = 0;
249 	kg->kg_idle_kses = 0;
250 	kg->kg_loan_kses = 0;
251 	kg->kg_runq_kses = 0; /* XXXKSE change name */
252 /* link it in now that it's consistent */
253 	p->p_numksegrps++;
254 	TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
255 }
256 
257 void
258 ksegrp_unlink(struct ksegrp *kg)
259 {
260 	struct proc *p;
261 
262 	mtx_assert(&sched_lock, MA_OWNED);
263 	p = kg->kg_proc;
264 	KASSERT(((kg->kg_numthreads == 0) && (kg->kg_kses == 0)),
265 	    ("kseg_unlink: residual threads or KSEs"));
266 	TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
267 	p->p_numksegrps--;
268 	/*
269 	 * Aggregate stats from the KSE
270 	 */
271 	ksegrp_stash(kg);
272 }
273 
274 /*
275  * For a newly created process,
276  * link up the structure and its initial threads, etc.
277  */
278 void
279 proc_linkup(struct proc *p, struct ksegrp *kg,
280 			struct kse *ke, struct thread *td)
281 {
282 
283 	TAILQ_INIT(&p->p_ksegrps);	     /* all ksegrps in proc */
284 	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
285 	TAILQ_INIT(&p->p_suspended);	     /* Threads suspended */
286 	p->p_numksegrps = 0;
287 	p->p_numthreads = 0;
288 
289 	ksegrp_link(kg, p);
290 	kse_link(ke, kg);
291 	thread_link(td, kg);
292 }
293 
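/*
 * Interrupt the thread whose mailbox matches uap->tmbx: flag it with
 * TDF_INTERRUPT and, if it is in an interruptible sleep or cv wait,
 * abort that sleep.  Returns ESRCH if no thread has that mailbox.
 */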
294 int
295 kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap)
296 {
297 	struct proc *p;
298 	struct thread *td2;
299 
300 	p = td->td_proc;
301 	/* KSE-enabled processes only, please. */
302 	if (!(p->p_flag & P_KSES))
303 		return (EINVAL);
304 	if (uap->tmbx == NULL)
305 		return (EINVAL);
306 	mtx_lock_spin(&sched_lock);
307 	FOREACH_THREAD_IN_PROC(p, td2) {
308 		if (td2->td_mailbox == uap->tmbx) {
309 			td2->td_flags |= TDF_INTERRUPT;
310 			if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) {
311 				if (td2->td_flags & TDF_CVWAITQ)
312 					cv_abort(td2);
313 				else
314 					abortsleep(td2);
315 			}
316 			mtx_unlock_spin(&sched_lock);
317 			td->td_retval[0] = 0;
318 			td->td_retval[1] = 0;
319 			return (0);
320 		}
321 	}
322 	mtx_unlock_spin(&sched_lock);
323 	return (ESRCH);
324 }
325 
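/*
 * Shut down the calling (bound) thread's KSE.  Fails with EDEADLK if this is
 * the last KSE in a ksegrp that still has other threads.  If the process is
 * down to one thread and one ksegrp, just leave KSE mode; otherwise mark the
 * KSE with KEF_EXIT and exit this thread so that both are reclaimed.
 */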
326 int
327 kse_exit(struct thread *td, struct kse_exit_args *uap)
328 {
329 	struct proc *p;
330 	struct ksegrp *kg;
331 
332 	p = td->td_proc;
333 	/* KSE-enabled processes only, please. */
334 	if (!(p->p_flag & P_KSES))
335 		return (EINVAL);
336 	/* must be a bound thread */
337 	if (td->td_flags & TDF_UNBOUND)
338 		return (EINVAL);
339 	kg = td->td_ksegrp;
340 	/* serialize killing kse */
341 	PROC_LOCK(p);
342 	mtx_lock_spin(&sched_lock);
343 	if ((kg->kg_kses == 1) && (kg->kg_numthreads > 1)) {
344 		mtx_unlock_spin(&sched_lock);
345 		PROC_UNLOCK(p);
346 		return (EDEADLK);
347 	}
348 	if ((p->p_numthreads == 1) && (p->p_numksegrps == 1)) {
349 		p->p_flag &= ~P_KSES;
350 		mtx_unlock_spin(&sched_lock);
351 		PROC_UNLOCK(p);
352 	} else {
353 		while (mtx_owned(&Giant))
354 			mtx_unlock(&Giant);
355 		td->td_kse->ke_flags |= KEF_EXIT;
356 		thread_exit();
357 		/* NOTREACHED */
358 	}
359 	return (0);
360 }
361 
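/*
 * A bound thread with an upcall-capable KSE reports that it has no more work.
 * If it is the last thread in the process it first sleeps ("pause"), then
 * becomes unbound and schedules an upcall; in either case the calling thread
 * exits.  Returns EINVAL for unbound threads or KSEs without a mailbox.
 */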
362 int
363 kse_release(struct thread *td, struct kse_release_args *uap)
364 {
365 	struct proc *p;
366 
367 	p = td->td_proc;
368 	/* KSE-enabled processes only */
369 	if (!(p->p_flag & P_KSES))
370 		return (EINVAL);
371 	/*
372 	 * Must be a bound thread, and the KSE must have a mailbox ready;
373 	 * if not, the KSE cannot generate an upcall.
374 	 */
375 	if (!(td->td_flags & TDF_UNBOUND) && (td->td_kse->ke_mailbox != NULL)) {
376 		PROC_LOCK(p);
377 		mtx_lock_spin(&sched_lock);
378 		/* prevent last thread from exiting */
379 		if (p->p_numthreads == 1) {
380 			mtx_unlock_spin(&sched_lock);
381 			if (td->td_standin == NULL) {
382 				PROC_UNLOCK(p);
383 				td->td_standin = thread_alloc();
384 				PROC_LOCK(p);
385 			}
386 			msleep(p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH,
387 			       "pause", 0);
388 			mtx_lock_spin(&sched_lock);
389 			td->td_flags |= TDF_UNBOUND;
390 			thread_schedule_upcall(td, td->td_kse);
391 		}
392 		thread_exit();
393 		/* NOTREACHED */
394 	}
395 	return (EINVAL);
396 }
397 
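/*
 * Wake a KSE so that it will make an upcall.  If a mailbox is supplied, the
 * idle KSE owning that mailbox is used (a non-idle match just returns
 * success); otherwise the first idle KSE in the caller's ksegrp is chosen.
 * Returns ESRCH if there is nothing to wake.
 */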
398 /* struct kse_wakeup_args {
399 	struct kse_mailbox *mbx;
400 }; */
401 int
402 kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
403 {
404 	struct proc *p;
405 	struct kse *ke, *ke2;
406 	struct ksegrp *kg;
407 
408 	p = td->td_proc;
409 	/* KSE-enabled processes only, please. */
410 	if (!(p->p_flag & P_KSES))
411 		return EINVAL;
412 	if (td->td_standin == NULL)
413 		td->td_standin = thread_alloc();
414 	ke = NULL;
415 	mtx_lock_spin(&sched_lock);
416 	if (uap->mbx) {
417 		FOREACH_KSEGRP_IN_PROC(p, kg) {
418 			FOREACH_KSE_IN_GROUP(kg, ke2) {
419 				if (ke2->ke_mailbox != uap->mbx)
420 					continue;
421 				if (ke2->ke_state == KES_IDLE) {
422 					ke = ke2;
423 					goto found;
424 				} else {
425 					mtx_unlock_spin(&sched_lock);
426 					td->td_retval[0] = 0;
427 					td->td_retval[1] = 0;
428 					return (0);
429 				}
430 			}
431 		}
432 	} else {
433 		kg = td->td_ksegrp;
434 		ke = TAILQ_FIRST(&kg->kg_iq);
435 	}
436 	if (ke == NULL) {
437 		mtx_unlock_spin(&sched_lock);
438 		return (ESRCH);
439 	}
440 found:
441 	thread_schedule_upcall(td, ke);
442 	mtx_unlock_spin(&sched_lock);
443 	td->td_retval[0] = 0;
444 	td->td_retval[1] = 0;
445 	return (0);
446 }
447 
448 /*
449  * No new KSEGRP and first call: use the current KSE, don't schedule an upcall.
450  * In all other situations, allocate a new KSE and schedule an upcall on it.
451  */
452 /* struct kse_create_args {
453 	struct kse_mailbox *mbx;
454 	int newgroup;
455 }; */
456 int
457 kse_create(struct thread *td, struct kse_create_args *uap)
458 {
459 	struct kse *newke;
460 	struct kse *ke;
461 	struct ksegrp *newkg;
462 	struct ksegrp *kg;
463 	struct proc *p;
464 	struct kse_mailbox mbx;
465 	int err;
466 
467 	p = td->td_proc;
468 	if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
469 		return (err);
470 
471 	p->p_flag |= P_KSES; /* easier to just set it than to test and set */
472 	kg = td->td_ksegrp;
473 	if (uap->newgroup) {
474 		if (p->p_numksegrps >= max_groups_per_proc)
475 			return (EPROCLIM);
476 		/*
477 		 * If we want a new KSEGRP it doesn't matter whether
478 		 * we have already fired up KSE mode before or not.
479 		 * We put the process in KSE mode and create a new KSEGRP
480 		 * and KSE. If our KSE has not got a mailbox yet then
481 		 * that doesn't matter, just leave it that way. It will
482 		 * ensure that this thread stays BOUND. It's possible
483 		 * that the call came from a threaded library and the main
484 		 * program knows nothing of threads.
485 		 */
486 		newkg = ksegrp_alloc();
487 		bzero(&newkg->kg_startzero, RANGEOF(struct ksegrp,
488 		      kg_startzero, kg_endzero));
489 		bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
490 		      RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy));
491 		newke = kse_alloc();
492 	} else {
493 		/*
494 		 * Otherwise, if we have already set this KSE
495 		 * to have a mailbox, we want to make another KSE here,
496 		 * but only if we are not already at the limit,
497 		 * which is one per CPU.
498 		 *
499 		 * If the current KSE doesn't have a mailbox we just use it
500 		 * and give it one.
501 		 *
502 		 * Because we don't like to access
503 		 * the KSE outside of schedlock if we are UNBOUND,
504 		 * (because it can change if we are preempted by an interrupt)
505 		 * we can deduce that it has a mailbox if we are UNBOUND,
506 		 * and only need to actually look at it if we are BOUND,
507 		 * which is safe.
508 		 */
509 		if ((td->td_flags & TDF_UNBOUND) || td->td_kse->ke_mailbox) {
510 			if (oiks_debug == 0) {
511 #ifdef SMP
512 			if (kg->kg_kses > mp_ncpus)
513 #endif
514 				return (EPROCLIM);
515 			}
516 			newke = kse_alloc();
517 		} else {
518 			newke = NULL;
519 		}
520 		newkg = NULL;
521 	}
522 	if (newke) {
523 		bzero(&newke->ke_startzero, RANGEOF(struct kse,
524 		      ke_startzero, ke_endzero));
525 #if 0
526 		bcopy(&ke->ke_startcopy, &newke->ke_startcopy,
527 		      RANGEOF(struct kse, ke_startcopy, ke_endcopy));
528 #endif
529 		/* For the first call this may not have been set */
530 		if (td->td_standin == NULL) {
531 			td->td_standin = thread_alloc();
532 		}
533 		mtx_lock_spin(&sched_lock);
534 		if (newkg) {
535 			if (p->p_numksegrps >= max_groups_per_proc) {
536 				mtx_unlock_spin(&sched_lock);
537 				ksegrp_free(newkg);
538 				kse_free(newke);
539 				return (EPROCLIM);
540 			}
541 			ksegrp_link(newkg, p);
542 		}
543 		else
544 			newkg = kg;
545 		kse_link(newke, newkg);
546 		if (p->p_sflag & PS_NEEDSIGCHK)
547 			newke->ke_flags |= KEF_ASTPENDING;
548 		newke->ke_mailbox = uap->mbx;
549 		newke->ke_upcall = mbx.km_func;
550 		bcopy(&mbx.km_stack, &newke->ke_stack, sizeof(stack_t));
551 		thread_schedule_upcall(td, newke);
552 		mtx_unlock_spin(&sched_lock);
553 	} else {
554 		/*
555 		 * If we didn't allocate a new KSE then we are using
556 		 * the existing (BOUND) kse.
557 		 */
558 		ke = td->td_kse;
559 		ke->ke_mailbox = uap->mbx;
560 		ke->ke_upcall = mbx.km_func;
561 		bcopy(&mbx.km_stack, &ke->ke_stack, sizeof(stack_t));
562 	}
563 	/*
564 	 * Fill out the KSE-mode specific fields of the new kse.
565 	 */
566 
567 	td->td_retval[0] = 0;
568 	td->td_retval[1] = 0;
569 	return (0);
570 }
571 
572 /*
573  * Fill a ucontext_t with a thread's context information.
574  *
575  * This is an analogue to getcontext(3).
576  */
577 void
578 thread_getcontext(struct thread *td, ucontext_t *uc)
579 {
580 
581 /*
582  * XXX this is declared in a MD include file, i386/include/ucontext.h but
583  * is used in MI code.
584  */
585 #ifdef __i386__
586 	get_mcontext(td, &uc->uc_mcontext);
587 #endif
588 	uc->uc_sigmask = td->td_proc->p_sigmask;
589 }
590 
591 /*
592  * Set a thread's context from a ucontext_t.
593  *
594  * This is an analogue to setcontext(3).
595  */
596 int
597 thread_setcontext(struct thread *td, ucontext_t *uc)
598 {
599 	int ret;
600 
601 /*
602  * XXX this is declared in a MD include file, i386/include/ucontext.h but
603  * is used in MI code.
604  */
605 #ifdef __i386__
606 	ret = set_mcontext(td, &uc->uc_mcontext);
607 #else
608 	ret = ENOSYS;
609 #endif
610 	if (ret == 0) {
611 		SIG_CANTMASK(uc->uc_sigmask);
612 		PROC_LOCK(td->td_proc);
613 		td->td_proc->p_sigmask = uc->uc_sigmask;
614 		PROC_UNLOCK(td->td_proc);
615 	}
616 	return (ret);
617 }
618 
619 /*
620  * Initialize global thread allocation resources.
621  */
622 void
623 threadinit(void)
624 {
625 
626 #ifndef __ia64__
627 	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
628 	    thread_ctor, thread_dtor, thread_init, thread_fini,
629 	    UMA_ALIGN_CACHE, 0);
630 #else
631 	/*
632 	 * XXX the ia64 kstack allocator is really lame and is at the mercy
633 	 * of contigmalloc().  This hackery is to pre-construct a whole
634 	 * pile of thread structures with associated kernel stacks early
635 	 * in the system startup while contigmalloc() still works. Once we
636 	 * have them, keep them.  Sigh.
637 	 */
638 	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
639 	    thread_ctor, thread_dtor, thread_init, thread_fini,
640 	    UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
641 	uma_prealloc(thread_zone, 512);		/* XXX arbitrary */
642 #endif
643 	ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(),
644 	    NULL, NULL, ksegrp_init, NULL,
645 	    UMA_ALIGN_CACHE, 0);
646 	kse_zone = uma_zcreate("KSE", sched_sizeof_kse(),
647 	    NULL, NULL, kse_init, NULL,
648 	    UMA_ALIGN_CACHE, 0);
649 }
650 
651 /*
652  * Stash an embarrassingly extra thread into the zombie thread queue.
653  */
654 void
655 thread_stash(struct thread *td)
656 {
657 	mtx_lock_spin(&zombie_thread_lock);
658 	TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq);
659 	mtx_unlock_spin(&zombie_thread_lock);
660 }
661 
662 /*
663  * Stash an embarrassingly extra kse into the zombie kse queue.
664  */
665 void
666 kse_stash(struct kse *ke)
667 {
668 	mtx_lock_spin(&zombie_thread_lock);
669 	TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq);
670 	mtx_unlock_spin(&zombie_thread_lock);
671 }
672 
673 /*
674  * Stash an embarrassingly extra ksegrp into the zombie ksegrp queue.
675  */
676 void
677 ksegrp_stash(struct ksegrp *kg)
678 {
679 	mtx_lock_spin(&zombie_thread_lock);
680 	TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp);
681 	mtx_unlock_spin(&zombie_thread_lock);
682 }
683 
684 /*
685  * Reap zombie threads.
686  */
687 void
688 thread_reap(void)
689 {
690 	struct thread *td_first, *td_next;
691 	struct kse *ke_first, *ke_next;
692 	struct ksegrp *kg_first, * kg_next;
693 
694 	/*
695 	 * Don't even bother to lock if there are none at this instant;
696 	 * we really don't care about the next instant.
697 	 */
698 	if ((!TAILQ_EMPTY(&zombie_threads))
699 	    || (!TAILQ_EMPTY(&zombie_kses))
700 	    || (!TAILQ_EMPTY(&zombie_ksegrps))) {
701 		mtx_lock_spin(&zombie_thread_lock);
702 		td_first = TAILQ_FIRST(&zombie_threads);
703 		ke_first = TAILQ_FIRST(&zombie_kses);
704 		kg_first = TAILQ_FIRST(&zombie_ksegrps);
705 		if (td_first)
706 			TAILQ_INIT(&zombie_threads);
707 		if (ke_first)
708 			TAILQ_INIT(&zombie_kses);
709 		if (kg_first)
710 			TAILQ_INIT(&zombie_ksegrps);
711 		mtx_unlock_spin(&zombie_thread_lock);
712 		while (td_first) {
713 			td_next = TAILQ_NEXT(td_first, td_runq);
714 			thread_free(td_first);
715 			td_first = td_next;
716 		}
717 		while (ke_first) {
718 			ke_next = TAILQ_NEXT(ke_first, ke_procq);
719 			kse_free(ke_first);
720 			ke_first = ke_next;
721 		}
722 		while (kg_first) {
723 			kg_next = TAILQ_NEXT(kg_first, kg_ksegrp);
724 			ksegrp_free(kg_first);
725 			kg_first = kg_next;
726 		}
727 	}
728 }
729 
730 /*
731  * Allocate a ksegrp.
732  */
733 struct ksegrp *
734 ksegrp_alloc(void)
735 {
736 	return (uma_zalloc(ksegrp_zone, M_WAITOK));
737 }
738 
739 /*
740  * Allocate a kse.
741  */
742 struct kse *
743 kse_alloc(void)
744 {
745 	return (uma_zalloc(kse_zone, M_WAITOK));
746 }
747 
748 /*
749  * Allocate a thread.
750  */
751 struct thread *
752 thread_alloc(void)
753 {
754 	thread_reap(); /* check if any zombies to get */
755 	return (uma_zalloc(thread_zone, M_WAITOK));
756 }
757 
758 /*
759  * Deallocate a ksegrp.
760  */
761 void
762 ksegrp_free(struct ksegrp *td)
763 {
764 	uma_zfree(ksegrp_zone, td);
765 }
766 
767 /*
768  * Deallocate a kse.
769  */
770 void
771 kse_free(struct kse *td)
772 {
773 	uma_zfree(kse_zone, td);
774 }
775 
776 /*
777  * Deallocate a thread.
778  */
779 void
780 thread_free(struct thread *td)
781 {
782 	uma_zfree(thread_zone, td);
783 }
784 
785 /*
786  * Store the thread context in the UTS's mailbox,
787  * then add the mailbox at the head of a list we are building in user space.
788  * The list is anchored in the ksegrp structure.
789  */
790 int
791 thread_export_context(struct thread *td)
792 {
793 	struct proc *p;
794 	struct ksegrp *kg;
795 	uintptr_t mbx;
796 	void *addr;
797 	int error;
798 	ucontext_t uc;
799 	uint temp;
800 
801 	p = td->td_proc;
802 	kg = td->td_ksegrp;
803 
804 	/* Export the user/machine context. */
805 #if 0
806 	addr = (caddr_t)td->td_mailbox +
807 	    offsetof(struct kse_thr_mailbox, tm_context);
808 #else /* if user pointer arithmetic is valid in the kernel */
809 	addr = (void *)(&td->td_mailbox->tm_context);
810 #endif
811 	error = copyin(addr, &uc, sizeof(ucontext_t));
812 	if (error == 0) {
813 		thread_getcontext(td, &uc);
814 		error = copyout(&uc, addr, sizeof(ucontext_t));
815 
816 	}
817 	if (error) {
818 		PROC_LOCK(p);
819 		psignal(p, SIGSEGV);
820 		PROC_UNLOCK(p);
821 		return (error);
822 	}
823 	/* get address in latest mbox of list pointer */
824 #if 0
825 	addr = (caddr_t)td->td_mailbox
826 	    + offsetof(struct kse_thr_mailbox , tm_next);
827 #else /* if user pointer arithmetic is valid in the kernel */
828 	addr = (void *)(&td->td_mailbox->tm_next);
829 #endif
830 	/*
831 	 * Put the saved address of the previous first
832 	 * entry into this one
833 	 */
834 	for (;;) {
835 		mbx = (uintptr_t)kg->kg_completed;
836 		if (suword(addr, mbx)) {
837 			goto bad;
838 		}
839 		PROC_LOCK(p);
840 		if (mbx == (uintptr_t)kg->kg_completed) {
841 			kg->kg_completed = td->td_mailbox;
842 			PROC_UNLOCK(p);
843 			break;
844 		}
845 		PROC_UNLOCK(p);
846 	}
847 	addr = (caddr_t)td->td_mailbox
848 		 + offsetof(struct kse_thr_mailbox, tm_sticks);
849 	temp = fuword(addr) + td->td_usticks;
850 	if (suword(addr, temp))
851 		goto bad;
852 	return (0);
853 
854 bad:
855 	PROC_LOCK(p);
856 	psignal(p, SIGSEGV);
857 	PROC_UNLOCK(p);
858 	return (EFAULT);
859 }
860 
861 /*
862  * Take the list of completed mailboxes for this KSEGRP and put them on this
863  * KSE's mailbox as it's the next one going up.
864  */
865 static int
866 thread_link_mboxes(struct ksegrp *kg, struct kse *ke)
867 {
868 	struct proc *p = kg->kg_proc;
869 	void *addr;
870 	uintptr_t mbx;
871 
872 #if 0
873 	addr = (caddr_t)ke->ke_mailbox
874 	    + offsetof(struct kse_mailbox, km_completed);
875 #else /* if user pointer arithmetic is valid in the kernel */
876 	addr = (void *)(&ke->ke_mailbox->km_completed);
877 #endif
878 	for (;;) {
879 		mbx = (uintptr_t)kg->kg_completed;
880 		if (suword(addr, mbx)) {
881 			PROC_LOCK(p);
882 			psignal(p, SIGSEGV);
883 			PROC_UNLOCK(p);
884 			return (EFAULT);
885 		}
886 		/* XXXKSE could use atomic CMPXCH here */
887 		PROC_LOCK(p);
888 		if (mbx == (uintptr_t)kg->kg_completed) {
889 			kg->kg_completed = NULL;
890 			PROC_UNLOCK(p);
891 			break;
892 		}
893 		PROC_UNLOCK(p);
894 	}
895 	return (0);
896 }
897 
898 /*
899  * This function should be called at statclock interrupt time
900  */
901 int
902 thread_add_ticks_intr(int user, uint ticks)
903 {
904 	struct thread *td = curthread;
905 	struct kse *ke = td->td_kse;
906 
907 	if (ke->ke_mailbox == NULL)
908 		return -1;
909 	if (user) {
910 		/* Currently always done via ast() */
911 		ke->ke_flags |= KEF_ASTPENDING;
912 		ke->ke_uuticks += ticks;
913 	} else {
914 		if (td->td_mailbox != NULL)
915 			td->td_usticks += ticks;
916 		else
917 			ke->ke_usticks += ticks;
918 	}
919 	return 0;
920 }
921 
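/*
 * Fold the tick counts accumulated on the KSE (ke_uuticks/ke_usticks) into
 * the tm_uticks/tm_sticks fields of the mailbox currently named by the KSE's
 * km_curthread pointer, using fuword()/suword() on the userland structures.
 * A fault sends SIGSEGV to the process and returns -1.
 */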
922 static int
923 thread_update_uticks(void)
924 {
925 	struct thread *td = curthread;
926 	struct proc *p = td->td_proc;
927 	struct kse *ke = td->td_kse;
928 	struct kse_thr_mailbox *tmbx;
929 	caddr_t addr;
930 	uint uticks, sticks;
931 
932 	KASSERT(!(td->td_flags & TDF_UNBOUND), ("thread not bound."));
933 
934 	if (ke->ke_mailbox == NULL)
935 		return 0;
936 
937 	uticks = ke->ke_uuticks;
938 	ke->ke_uuticks = 0;
939 	sticks = ke->ke_usticks;
940 	ke->ke_usticks = 0;
941 	tmbx = (void *)fuword((caddr_t)ke->ke_mailbox
942 			+ offsetof(struct kse_mailbox, km_curthread));
943 	if ((tmbx == NULL) || (tmbx == (void *)-1))
944 		return 0;
945 	if (uticks) {
946 		addr = (caddr_t)tmbx + offsetof(struct kse_thr_mailbox, tm_uticks);
947 		uticks += fuword(addr);
948 		if (suword(addr, uticks))
949 			goto bad;
950 	}
951 	if (sticks) {
952 		addr = (caddr_t)tmbx + offsetof(struct kse_thr_mailbox, tm_sticks);
953 		sticks += fuword(addr);
954 		if (suword(addr, sticks))
955 			goto bad;
956 	}
957 	return 0;
958 bad:
959 	PROC_LOCK(p);
960 	psignal(p, SIGSEGV);
961 	PROC_UNLOCK(p);
962 	return -1;
963 }
964 
965 /*
966  * Discard the current thread and exit from its context.
967  *
968  * Because we can't free a thread while we're operating under its context,
969  * push the current thread into our KSE's ke_tdspare slot, freeing the
970  * thread that might be there currently. Because we know that only this
971  * processor will run our KSE, we needn't worry about someone else grabbing
972  * our context before we do a cpu_throw.
973  */
974 void
975 thread_exit(void)
976 {
977 	struct thread *td;
978 	struct kse *ke;
979 	struct proc *p;
980 	struct ksegrp	*kg;
981 
982 	td = curthread;
983 	kg = td->td_ksegrp;
984 	p = td->td_proc;
985 	ke = td->td_kse;
986 
987 	mtx_assert(&sched_lock, MA_OWNED);
988 	KASSERT(p != NULL, ("thread exiting without a process"));
989 	KASSERT(ke != NULL, ("thread exiting without a kse"));
990 	KASSERT(kg != NULL, ("thread exiting without a kse group"));
991 	PROC_LOCK_ASSERT(p, MA_OWNED);
992 	CTR1(KTR_PROC, "thread_exit: thread %p", td);
993 	KASSERT(!mtx_owned(&Giant), ("dying thread owns giant"));
994 
995 	if (ke->ke_tdspare != NULL) {
996 		thread_stash(ke->ke_tdspare);
997 		ke->ke_tdspare = NULL;
998 	}
999 	if (td->td_standin != NULL) {
1000 		thread_stash(td->td_standin);
1001 		td->td_standin = NULL;
1002 	}
1003 
1004 	cpu_thread_exit(td);	/* XXXSMP */
1005 
1006 	/*
1007 	 * The last thread is left attached to the process
1008 	 * so that the whole bundle gets recycled. Skip
1009 	 * all this stuff.
1010 	 */
1011 	if (p->p_numthreads > 1) {
1012 		/*
1013 		 * Unlink this thread from its proc and the kseg.
1014 		 * In keeping with the other structs we probably should
1015 		 * have a thread_unlink() that does some of this but it
1016 		 * would only be called from here (I think) so it would
1017 		 * be a waste. (might be useful for proc_fini() as well.)
1018  		 */
1019 		TAILQ_REMOVE(&p->p_threads, td, td_plist);
1020 		p->p_numthreads--;
1021 		TAILQ_REMOVE(&kg->kg_threads, td, td_kglist);
1022 		kg->kg_numthreads--;
1023 		/*
1024 		 * The test below is NOT true if we are the
1025 		 * sole exiting thread. P_STOPPED_SINGLE is unset
1026 		 * in exit1() after it is the only survivor.
1027 		 */
1028 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1029 			if (p->p_numthreads == p->p_suspcount) {
1030 				thread_unsuspend_one(p->p_singlethread);
1031 			}
1032 		}
1033 
1034 		/* Reassign this thread's KSE. */
1035 		ke->ke_thread = NULL;
1036 		td->td_kse = NULL;
1037 		ke->ke_state = KES_UNQUEUED;
1038 		KASSERT((ke->ke_bound != td),
1039 		    ("thread_exit: entered with ke_bound set"));
1040 
1041 		/*
1042 		 * The reason for all this hoopla is
1043 		 * an attempt to stop our thread stack from being freed
1044 		 * until AFTER we have stopped running on it.
1045 		 * Since we are under schedlock, almost any method where
1046 		 * it is eventually freed by someone else is probably ok.
1047 		 * (Especially if they do it under schedlock). We could
1048 		 * almost free it here if we could be certain that
1049 		 * the uma code wouldn't pull it apart immediately,
1050 		 * but unfortunately we cannot guarantee that.
1051 		 *
1052 		 * For threads that are exiting and NOT killing their
1053 		 * KSEs we can just stash it in the KSE, however
1054 		 * in the case where the KSE is also being deallocated,
1055 		 * we need to store it somewhere else. It turns out that
1056 		 * we will never free the last KSE, so there is always one
1057 		 * other KSE available. We might as well just choose one
1058 		 * and stash it there. Being under schedlock should make that
1059 		 * safe.
1060 		 *
1061 		 * In borrower threads, we can stash it in the lender,
1062 		 * where it won't be needed until this thread is long gone.
1063 		 * Borrower threads can't kill their KSE anyhow, so even
1064 		 * the KSE would be a safe place for them. It is not
1065 		 * necessary to have a KSE (or KSEGRP) at all beyond this
1066 		 * point, while we are under the protection of schedlock.
1067 		 *
1068 		 * Either give the KSE to another thread to use (or make
1069 		 * it idle), or free it entirely, possibly along with its
1070 		 * ksegrp if it's the last one.
1071 		 */
1072 		if (ke->ke_flags & KEF_EXIT) {
1073 			kse_unlink(ke);
1074 			/*
1075 			 * Designate another KSE to hold our thread.
1076 			 * Safe as long as we abide by whatever lock
1077 			 * we control it with. The other KSE will not
1078 			 * be able to run it until we release the schedlock,
1079 			 * but we need to be careful about it deciding to
1080 			 * write to the stack before then. Luckily
1081 			 * I believe that while another thread's
1082 			 * standin thread can be used in this way, the
1083 			 * spare thread for the KSE cannot be used without
1084 			 * holding schedlock at least once.
1085 			 */
1086 			ke =  FIRST_KSE_IN_PROC(p);
1087 		} else {
1088 			kse_reassign(ke);
1089 		}
1090 #if 0
1091 		if (ke->ke_bound) {
1092 			/*
1093 			 * WE are a borrower..
1094 			 * stash our thread with the owner.
1095 			 */
1096 			if (ke->ke_bound->td_standin) {
1097 				thread_stash(ke->ke_bound->td_standin);
1098 			}
1099 			ke->ke_bound->td_standin = td;
1100 		} else {
1101 #endif
1102 			if (ke->ke_tdspare != NULL) {
1103 				thread_stash(ke->ke_tdspare);
1104 				ke->ke_tdspare = NULL;
1105 			}
1106 			ke->ke_tdspare = td;
1107 #if 0
1108 		}
1109 #endif
1110 		PROC_UNLOCK(p);
1111 		td->td_state	= TDS_INACTIVE;
1112 		td->td_proc	= NULL;
1113 		td->td_ksegrp	= NULL;
1114 		td->td_last_kse	= NULL;
1115 	} else {
1116 		PROC_UNLOCK(p);
1117 	}
1118 
1119 	cpu_throw();
1120 	/* NOTREACHED */
1121 }
1122 
1123 /*
1124  * Link a thread to a process.
1125  * Set up anything that needs to be initialized for it to
1126  * be used by the process.
1127  *
1128  * Note that we do not link to the proc's ucred here.
1129  * The thread is linked as if running but no KSE assigned.
1130  */
1131 void
1132 thread_link(struct thread *td, struct ksegrp *kg)
1133 {
1134 	struct proc *p;
1135 
1136 	p = kg->kg_proc;
1137 	td->td_state = TDS_INACTIVE;
1138 	td->td_proc	= p;
1139 	td->td_ksegrp	= kg;
1140 	td->td_last_kse	= NULL;
1141 
1142 	LIST_INIT(&td->td_contested);
1143 	callout_init(&td->td_slpcallout, 1);
1144 	TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
1145 	TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist);
1146 	p->p_numthreads++;
1147 	kg->kg_numthreads++;
1148 	if (oiks_debug && (p->p_numthreads > oiks_max_threads_per_proc)) {
1149 		printf("OIKS %d\n", p->p_numthreads);
1150 		if (oiks_debug > 1)
1151 			Debugger("OIKS");
1152 	}
1153 	td->td_kse	= NULL;
1154 }
1155 
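/*
 * Strip a now single-threaded process of all idle KSEs and of every ksegrp
 * except the caller's, stashing them on the zombie queues for later reaping.
 * Called from thread_single() when single-threading for exit.
 */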
1156 void
1157 kse_purge(struct proc *p, struct thread *td)
1158 {
1159 	struct kse *ke;
1160 	struct ksegrp *kg;
1161 
1162  	KASSERT(p->p_numthreads == 1, ("bad thread number"));
1163 	mtx_lock_spin(&sched_lock);
1164 	while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) {
1165 		while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) {
1166 			TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
1167 			kg->kg_idle_kses--;
1168 			TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
1169 			kg->kg_kses--;
1170 			if (ke->ke_tdspare)
1171 				thread_stash(ke->ke_tdspare);
1172    			kse_stash(ke);
1173 		}
1174 		TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
1175 		p->p_numksegrps--;
1176 		KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) ||
1177 		    ((kg->kg_kses == 1) && (kg == td->td_ksegrp)),
1178 			("wrong kg_kses"));
1179 		if (kg != td->td_ksegrp) {
1180 			ksegrp_stash(kg);
1181 		}
1182 	}
1183 	TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp);
1184 	p->p_numksegrps++;
1185 	mtx_unlock_spin(&sched_lock);
1186 }
1187 
1188 
1189 /*
1190  * Create a thread and schedule it for upcall on the KSE given.
1191  */
1192 struct thread *
1193 thread_schedule_upcall(struct thread *td, struct kse *ke)
1194 {
1195 	struct thread *td2;
1196 	struct ksegrp *kg;
1197 	int newkse;
1198 
1199 	mtx_assert(&sched_lock, MA_OWNED);
1200 	newkse = (ke != td->td_kse);
1201 
1202 	/*
1203 	 * If the kse is already owned by another thread then we can't
1204 	 * schedule an upcall because the other thread must be BOUND
1205 	 * which means it is not in a position to take an upcall.
1206 	 * We must be borrowing the KSE to allow us to complete some in-kernel
1207 	 * work. When we complete, the Bound thread will have the chance to
1208 	 * complete. This thread will sleep as planned. Hopefully there will
1209 	 * eventually be an unbound thread that can be converted to an
1210 	 * upcall to report the completion of this thread.
1211 	 */
1212 	if (ke->ke_bound && ((ke->ke_bound->td_flags & TDF_UNBOUND) == 0)) {
1213 		return (NULL);
1214 	}
1215 	KASSERT((ke->ke_bound == NULL), ("kse already bound"));
1216 
1217 	if (ke->ke_state == KES_IDLE) {
1218 		kg = ke->ke_ksegrp;
1219 		TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
1220 		kg->kg_idle_kses--;
1221 		ke->ke_state = KES_UNQUEUED;
1222 	}
1223 	if ((td2 = td->td_standin) != NULL) {
1224 		td->td_standin = NULL;
1225 	} else {
1226 		if (newkse)
1227 			panic("no reserve thread when called with a new kse");
1228 		/*
1229 		 * If called from (e.g.) sleep and we do not have
1230 		 * a reserve thread, then we've used it, so do not
1231 		 * create an upcall.
1232 		 */
1233 		return (NULL);
1234 	}
1235 	CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)",
1236 	     td2, td->td_proc->p_pid, td->td_proc->p_comm);
1237 	bzero(&td2->td_startzero,
1238 	    (unsigned)RANGEOF(struct thread, td_startzero, td_endzero));
1239 	bcopy(&td->td_startcopy, &td2->td_startcopy,
1240 	    (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy));
1241 	thread_link(td2, ke->ke_ksegrp);
1242 	cpu_set_upcall(td2, td->td_pcb);
1243 
1244 	/*
1245 	 * XXXKSE do we really need this? (default values for the
1246 	 * frame).
1247 	 */
1248 	bcopy(td->td_frame, td2->td_frame, sizeof(struct trapframe));
1249 
1250 	/*
1251 	 * Bind the new thread to the KSE,
1252 	 * and if it's our KSE, lend it back to ourself
1253 	 * so we can continue running.
1254 	 */
1255 	td2->td_ucred = crhold(td->td_ucred);
1256 	td2->td_flags = TDF_UPCALLING; /* note: BOUND */
1257 	td2->td_kse = ke;
1258 	td2->td_state = TDS_CAN_RUN;
1259 	td2->td_inhibitors = 0;
1260 	/*
1261 	 * If called from msleep(), we are working on the current
1262 	 * KSE so fake that we borrowed it. If called from
1263 	 * kse_create(), don't, as we have a new kse too.
1264 	 */
1265 	if (!newkse) {
1266 		/*
1267 		 * This thread will be scheduled when the current thread
1268 		 * blocks, exits or tries to enter userspace (whichever
1269 		 * happens first). When that happens the KSE will "revert"
1270 		 * to this thread in a BOUND manner. Since we are called
1271 		 * from msleep() this is going to be "very soon" in nearly
1272 		 * all cases.
1273 		 */
1274 		ke->ke_bound = td2;
1275 		TD_SET_LOAN(td2);
1276 	} else {
1277 		ke->ke_bound = NULL;
1278 		ke->ke_thread = td2;
1279 		ke->ke_state = KES_THREAD;
1280 		setrunqueue(td2);
1281 	}
1282 	return (td2);	/* bogus.. should be a void function */
1283 }
1284 
1285 /*
1286  * Schedule an upcall to notify a KSE process of received signals.
1287  *
1288  * XXX - Modifying a sigset_t like this is totally bogus.
1289  */
1290 struct thread *
1291 signal_upcall(struct proc *p, int sig)
1292 {
1293 	struct thread *td, *td2;
1294 	struct kse *ke;
1295 	sigset_t ss;
1296 	int error;
1297 
1298 	PROC_LOCK_ASSERT(p, MA_OWNED);
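	/* XXXKSE: this function is effectively disabled by the unconditional return below. */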
1299 return (NULL);
1300 
1301 	td = FIRST_THREAD_IN_PROC(p);
1302 	ke = td->td_kse;
1303 	PROC_UNLOCK(p);
1304 	error = copyin(&ke->ke_mailbox->km_sigscaught, &ss, sizeof(sigset_t));
1305 	PROC_LOCK(p);
1306 	if (error)
1307 		return (NULL);
1308 	SIGADDSET(ss, sig);
1309 	PROC_UNLOCK(p);
1310 	error = copyout(&ss, &ke->ke_mailbox->km_sigscaught, sizeof(sigset_t));
1311 	PROC_LOCK(p);
1312 	if (error)
1313 		return (NULL);
1314 	if (td->td_standin == NULL)
1315 		td->td_standin = thread_alloc();
1316 	mtx_lock_spin(&sched_lock);
1317 	td2 = thread_schedule_upcall(td, ke); /* Bogus JRE */
1318 	mtx_unlock_spin(&sched_lock);
1319 	return (td2);
1320 }
1321 
1322 /*
1323  * Setup done on the thread when it enters the kernel.
1324  * XXXKSE Presently only for syscalls but eventually all kernel entries.
1325  */
1326 void
1327 thread_user_enter(struct proc *p, struct thread *td)
1328 {
1329 	struct kse *ke;
1330 
1331 	/*
1332 	 * First check that we shouldn't just abort.
1333 	 * But check if we are the single thread first!
1334 	 * XXX p_singlethread not locked, but should be safe.
1335 	 */
1336 	if ((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) {
1337 		PROC_LOCK(p);
1338 		mtx_lock_spin(&sched_lock);
1339 		thread_exit();
1340 		/* NOTREACHED */
1341 	}
1342 
1343 	/*
1344 	 * If we are doing a syscall in a KSE environment,
1345 	 * note where our mailbox is. There is always the
1346 	 * possibility that we could do this lazily (in sleep()),
1347 	 * but for now do it every time.
1348 	 */
1349 	ke = td->td_kse;
1350 	if (ke->ke_mailbox != NULL) {
1351 #if 0
1352 		td->td_mailbox = (void *)fuword((caddr_t)ke->ke_mailbox
1353 		    + offsetof(struct kse_mailbox, km_curthread));
1354 #else /* if user pointer arithmetic is ok in the kernel */
1355 		td->td_mailbox =
1356 		    (void *)fuword((void *)&ke->ke_mailbox->km_curthread);
1357 #endif
1358 		if ((td->td_mailbox == NULL) ||
1359 		    (td->td_mailbox == (void *)-1)) {
1360 			td->td_mailbox = NULL;	/* single thread it.. */
1361 			mtx_lock_spin(&sched_lock);
1362 			td->td_flags &= ~TDF_UNBOUND;
1363 			mtx_unlock_spin(&sched_lock);
1364 		} else {
1365 			/*
1366 			 * When the thread limit is reached, act as if the thread
1367 			 * has already done an upcall.
1368 			 */
1369 		    	if (p->p_numthreads > max_threads_per_proc) {
1370 				if (td->td_standin != NULL)
1371 					thread_stash(td->td_standin);
1372 				td->td_standin = NULL;
1373 			} else {
1374 				if (td->td_standin == NULL)
1375 					td->td_standin = thread_alloc();
1376 			}
1377 			mtx_lock_spin(&sched_lock);
1378 			td->td_flags |= TDF_UNBOUND;
1379 			mtx_unlock_spin(&sched_lock);
1380 			td->td_usticks = 0;
1381 		}
1382 	}
1383 }
1384 
1385 /*
1386  * The extra work we go through if we are a threaded process when we
1387  * return to userland.
1388  *
1389  * If we are a KSE process and returning to user mode, check for
1390  * extra work to do before we return (e.g. for more syscalls
1391  * to complete first).  If we were in a critical section, we should
1392  * just return to let it finish. Same if we were in the UTS (in
1393  * which case the mailbox's context's busy indicator will be set).
1394  * The only traps we support will have set the mailbox.
1395  * We will clear it here.
1396  */
1397 int
1398 thread_userret(struct thread *td, struct trapframe *frame)
1399 {
1400 	int error;
1401 	int unbound;
1402 	struct kse *ke;
1403 	struct ksegrp *kg;
1404 	struct thread *td2;
1405 	struct proc *p;
1406 	struct timespec ts;
1407 
1408 	error = 0;
1409 
1410 	unbound = td->td_flags & TDF_UNBOUND;
1411 
1412 	kg = td->td_ksegrp;
1413 	p = td->td_proc;
1414 
1415 	/*
1416 	 * Originally bound threads never upcall but they may
1417 	 * loan out their KSE at this point.
1418 	 * Upcalls imply bound. They also may want to do some philanthropy.
1419 	 * Unbound threads on the other hand either yield to other work
1420 	 * or transform into an upcall.
1421 	 * (having saved their context to user space in both cases)
1422 	 */
1423 	if (unbound) {
1424 		/*
1425 		 * We are an unbound thread, looking to return to
1426 		 * user space.
1427 		 * There are several possibilities:
1428 		 * 1) we are using a borrowed KSE. save state and exit.
1429 		 *    kse_reassign() will recycle the kse as needed,
1430 		 * 2) we are not.. save state, and then convert ourself
1431 		 *    to be an upcall, bound to the KSE.
1432 		 *    if there are others that need the kse,
1433 		 *    give them a chance by doing an mi_switch().
1434 		 *    Because we are bound, control will eventually return
1435 		 *    to us here.
1436 		 * ***
1437 		 * Save the thread's context, and link it
1438 		 * into the KSEGRP's list of completed threads.
1439 		 */
1440 		error = thread_export_context(td);
1441 		td->td_mailbox = NULL;
1442 		td->td_usticks = 0;
1443 		if (error) {
1444 			/*
1445 			 * If we are not running on a borrowed KSE, then
1446 			 * failing to do the KSE operation just defaults
1447 			 * back to synchronous operation, so just return from
1448 			 * the syscall. If it IS borrowed, there is nothing
1449 			 * we can do. We just lose that context. We
1450 			 * probably should note this somewhere and send
1451 			 * the process a signal.
1452 			 */
1453 			PROC_LOCK(td->td_proc);
1454 			psignal(td->td_proc, SIGSEGV);
1455 			mtx_lock_spin(&sched_lock);
1456 			if (td->td_kse->ke_bound == NULL) {
1457 				td->td_flags &= ~TDF_UNBOUND;
1458 				PROC_UNLOCK(td->td_proc);
1459 				mtx_unlock_spin(&sched_lock);
1460 				thread_update_uticks();
1461 				return (error);	/* go sync */
1462 			}
1463 			thread_exit();
1464 		}
1465 
1466 		/*
1467 		 * if the KSE is owned and we are borrowing it,
1468 		 * don't make an upcall, just exit so that the owner
1469 		 * can get its KSE if it wants it.
1470 		 * Our context is already safely stored for later
1471 		 * use by the UTS.
1472 		 */
1473 		PROC_LOCK(p);
1474 		mtx_lock_spin(&sched_lock);
1475 		if (td->td_kse->ke_bound) {
1476 			thread_exit();
1477 		}
1478 		PROC_UNLOCK(p);
1479 
1480 		/*
1481 		 * Turn ourself into a bound upcall.
1482 		 * We will rely on kse_reassign()
1483 		 * to make us run at a later time.
1484 		 * We should look just like a scheduled upcall
1485 		 * from msleep() or cv_wait().
1486 		 */
1487 		td->td_flags &= ~TDF_UNBOUND;
1488 		td->td_flags |= TDF_UPCALLING;
1489 		/* Only get here if we have become an upcall */
1490 
1491 	} else {
1492 		mtx_lock_spin(&sched_lock);
1493 	}
1494 	/*
1495 	 * We ARE going back to userland with this KSE.
1496 	 * Check for threads that need to borrow it.
1497 	 * Optimisation: don't call mi_switch if no-one wants the KSE.
1498 	 * Any other thread that comes ready after this missed the boat.
1499 	 */
1500 	ke = td->td_kse;
1501 	if ((td2 = kg->kg_last_assigned))
1502 		td2 = TAILQ_NEXT(td2, td_runq);
1503 	else
1504 		td2 = TAILQ_FIRST(&kg->kg_runq);
1505 	if (td2)  {
1506 		/*
1507 		 * force a switch to more urgent 'in kernel'
1508 		 * work. Control will return to this thread
1509 		 * when there is no more work to do.
1510 		 * kse_reassign() will do that for us.
1511 		 */
1512 		TD_SET_LOAN(td);
1513 		ke->ke_bound = td;
1514 		ke->ke_thread = NULL;
1515 		mi_switch(); /* kse_reassign() will (re)find td2 */
1516 	}
1517 	mtx_unlock_spin(&sched_lock);
1518 
1519 	/*
1520 	 * Optimisation:
1521 	 * Ensure that we have a spare thread available,
1522 	 * for when we re-enter the kernel.
1523 	 */
1524 	if (td->td_standin == NULL) {
1525 		if (ke->ke_tdspare) {
1526 			td->td_standin = ke->ke_tdspare;
1527 			ke->ke_tdspare = NULL;
1528 		} else {
1529 			td->td_standin = thread_alloc();
1530 		}
1531 	}
1532 
1533 	thread_update_uticks();
1534 	/*
1535 	 * To get here, we know there is no other need for our
1536 	 * KSE so we can proceed. If not upcalling, go back to
1537 	 * userspace. If we are, get the upcall set up.
1538 	 */
1539 	if ((td->td_flags & TDF_UPCALLING) == 0)
1540 		return (0);
1541 
1542 	/*
1543 	 * We must be an upcall to get this far.
1544 	 * There is no more work to do and we are going to ride
1545 	 * this thread/KSE up to userland as an upcall.
1546 	 * Do the last parts of the setup needed for the upcall.
1547 	 */
1548 	CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)",
1549 	    td, td->td_proc->p_pid, td->td_proc->p_comm);
1550 
1551 	/*
1552 	 * Set user context to the UTS.
1553 	 */
1554 	cpu_set_upcall_kse(td, ke);
1555 
1556 	/*
1557 	 * Put any completed mailboxes on this KSE's list.
1558 	 */
1559 	error = thread_link_mboxes(kg, ke);
1560 	if (error)
1561 		goto bad;
1562 
1563 	/*
1564 	 * Set state and mailbox.
1565 	 * From now on we are just a bound outgoing process.
1566 	 * **Problem** userret is often called several times.
1567 	 * It would be nice if this all happened only on the first time
1568 	 * through. (the scan for extra work etc.)
1569 	 */
1570 	mtx_lock_spin(&sched_lock);
1571 	td->td_flags &= ~TDF_UPCALLING;
1572 	mtx_unlock_spin(&sched_lock);
1573 #if 0
1574 	error = suword((caddr_t)ke->ke_mailbox +
1575 	    offsetof(struct kse_mailbox, km_curthread), 0);
1576 #else	/* if user pointer arithmetic is ok in the kernel */
1577 	error = suword((caddr_t)&ke->ke_mailbox->km_curthread, 0);
1578 #endif
1579 	ke->ke_uuticks = ke->ke_usticks = 0;
1580 	if (!error) {
1581 		nanotime(&ts);
1582 		if (copyout(&ts, (caddr_t)&ke->ke_mailbox->km_timeofday,
1583 		    sizeof(ts))) {
1584 			goto bad;
1585 		}
1586 	}
1587 	return (0);
1588 
1589 bad:
1590 	/*
1591 	 * Things are going to be so screwed we should just kill the process.
1592 	 * How do we do that?
1593 	 */
1594 	PROC_LOCK(td->td_proc);
1595 	psignal(td->td_proc, SIGSEGV);
1596 	PROC_UNLOCK(td->td_proc);
1597 	return (error);	/* go sync */
1598 }
1599 
1600 /*
1601  * Enforce single-threading.
1602  *
1603  * Returns 1 if the caller must abort (another thread is waiting to
1604  * exit the process or similar). Process is locked!
1605  * Returns 0 when you are successfully the only thread running.
1606  * A process has successfully single-threaded in the suspend mode when
1607  * there are no threads in user mode. Threads in the kernel must be
1608  * allowed to continue until they get to the user boundary. They may even
1609  * copy out their return values and data before suspending. They may however be
1610  * accelerated in reaching the user boundary as we will wake up
1611  * any sleeping threads that are interruptible (PCATCH).
1612  */
1613 int
1614 thread_single(int force_exit)
1615 {
1616 	struct thread *td;
1617 	struct thread *td2;
1618 	struct proc *p;
1619 
1620 	td = curthread;
1621 	p = td->td_proc;
1622 	PROC_LOCK_ASSERT(p, MA_OWNED);
1623 	KASSERT((td != NULL), ("curthread is NULL"));
1624 
1625 	if ((p->p_flag & P_KSES) == 0)
1626 		return (0);
1627 
1628 	/* Is someone already single threading? */
1629 	if (p->p_singlethread)
1630 		return (1);
1631 
1632 	if (force_exit == SINGLE_EXIT)
1633 		p->p_flag |= P_SINGLE_EXIT;
1634 	else
1635 		p->p_flag &= ~P_SINGLE_EXIT;
1636 	p->p_flag |= P_STOPPED_SINGLE;
1637 	p->p_singlethread = td;
1638 	/* XXXKSE Which lock protects the below values? */
1639 	while ((p->p_numthreads - p->p_suspcount) != 1) {
1640 		mtx_lock_spin(&sched_lock);
1641 		FOREACH_THREAD_IN_PROC(p, td2) {
1642 			if (td2 == td)
1643 				continue;
1644 			if (TD_IS_INHIBITED(td2)) {
1645 				if (force_exit == SINGLE_EXIT) {
1646 					if (TD_IS_SUSPENDED(td2)) {
1647 						thread_unsuspend_one(td2);
1648 					}
1649 					if (TD_ON_SLEEPQ(td2) &&
1650 					    (td2->td_flags & TDF_SINTR)) {
1651 						if (td2->td_flags & TDF_CVWAITQ)
1652 							cv_abort(td2);
1653 						else
1654 							abortsleep(td2);
1655 					}
1656 				} else {
1657 					if (TD_IS_SUSPENDED(td2))
1658 						continue;
1659 					/* maybe other inhibitted states too? */
1660 					if (TD_IS_SLEEPING(td2))
1661 						thread_suspend_one(td2);
1662 				}
1663 			}
1664 		}
1665 		/*
1666 		 * Maybe we suspended some threads.. was it enough?
1667 		 */
1668 		if ((p->p_numthreads - p->p_suspcount) == 1) {
1669 			mtx_unlock_spin(&sched_lock);
1670 			break;
1671 		}
1672 
1673 		/*
1674 		 * Wake us up when everyone else has suspended.
1675 		 * In the mean time we suspend as well.
1676 		 */
1677 		thread_suspend_one(td);
1678 		mtx_unlock(&Giant);
1679 		PROC_UNLOCK(p);
1680 		mi_switch();
1681 		mtx_unlock_spin(&sched_lock);
1682 		mtx_lock(&Giant);
1683 		PROC_LOCK(p);
1684 	}
1685 	if (force_exit == SINGLE_EXIT)
1686 		kse_purge(p, td);
1687 	return (0);
1688 }
1689 
1690 /*
1691  * Called in from locations that can safely check to see
1692  * whether we have to suspend or at least throttle for a
1693  * single-thread event (e.g. fork).
1694  *
1695  * Such locations include userret().
1696  * If the "return_instead" argument is non-zero, the thread must be able to
1697  * accept 0 (caller may continue), or 1 (caller must abort) as a result.
1698  *
1699  * The 'return_instead' argument tells the function if it may do a
1700  * thread_exit() or suspend, or whether the caller must abort and back
1701  * out instead.
1702  *
1703  * If the thread that set the single_threading request has set the
1704  * P_SINGLE_EXIT bit in the process flags then this call will never return
1705  * if 'return_instead' is false, but will exit.
1706  *
1707  * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
1708  *---------------+--------------------+---------------------
1709  *       0       | returns 0          |   returns 0 or 1
1710  *               | when ST ends       |   immediately
1711  *---------------+--------------------+---------------------
1712  *       1       | thread exits       |   returns 1
1713  *               |                    |  immediately
1714  * 0 = thread_exit() or suspension ok,
1715  * other = return error instead of stopping the thread.
1716  *
1717  * While a full suspension is under effect, even a single threading
1718  * thread would be suspended if it made this call (but it shouldn't).
1719  * This call should only be made from places where
1720  * thread_exit() would be safe as that may be the outcome unless
1721  * return_instead is set.
1722  */
1723 int
1724 thread_suspend_check(int return_instead)
1725 {
1726 	struct thread *td;
1727 	struct proc *p;
1728 	struct kse *ke;
1729 	struct ksegrp *kg;
1730 
1731 	td = curthread;
1732 	p = td->td_proc;
1733 	kg = td->td_ksegrp;
1734 	PROC_LOCK_ASSERT(p, MA_OWNED);
1735 	while (P_SHOULDSTOP(p)) {
1736 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1737 			KASSERT(p->p_singlethread != NULL,
1738 			    ("singlethread not set"));
1739 			/*
1740 			 * The only suspension in action is a
1741 			 * single-threading. Single threader need not stop.
1742 			 * XXX Should be safe to access unlocked
1743 			 * as it can only be set to be true by us.
1744 			 */
1745 			if (p->p_singlethread == td)
1746 				return (0);	/* Exempt from stopping. */
1747 		}
1748 		if (return_instead)
1749 			return (1);
1750 
1751 		/*
1752 		 * If the process is waiting for us to exit,
1753 		 * this thread should just suicide.
1754 		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
1755 		 */
1756 		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
1757 			mtx_lock_spin(&sched_lock);
1758 			while (mtx_owned(&Giant))
1759 				mtx_unlock(&Giant);
1760 			/*
1761 			 * Free extra KSEs and ksegrps.  We needn't worry
1762 			 * that the current thread is in the same ksegrp as
1763 			 * p_singlethread and that the last KSE in the group
1764 			 * could be killed; this is protected by kg_numthreads,
1765 			 * since in that case kg_numthreads must be > 1.
1766 			 */
1767 			ke = td->td_kse;
1768 			if (ke->ke_bound == NULL &&
1769 			    ((kg->kg_kses != 1) || (kg->kg_numthreads == 1)))
1770 				ke->ke_flags |= KEF_EXIT;
1771 			thread_exit();
1772 		}
1773 
1774 		/*
1775 		 * When a thread suspends, it just
1776 		 * moves to the process's suspend queue
1777 		 * and stays there.
1778 		 *
1779 		 * XXXKSE if TDF_BOUND is true
1780 		 * it will not release its KSE, which might
1781 		 * lead to deadlock if there are not enough KSEs
1782 		 * to complete all waiting threads.
1783 		 * Maybe be able to 'lend' it out again.
1784 		 * (lent KSEs cannot go back to userland?)
1785 		 * and can only be lent in STOPPED state.
1786 		 */
1787 		mtx_lock_spin(&sched_lock);
1788 		if ((p->p_flag & P_STOPPED_SIG) &&
1789 		    (p->p_suspcount+1 == p->p_numthreads)) {
1790 			mtx_unlock_spin(&sched_lock);
1791 			PROC_LOCK(p->p_pptr);
1792 			if ((p->p_pptr->p_procsig->ps_flag &
1793 				PS_NOCLDSTOP) == 0) {
1794 				psignal(p->p_pptr, SIGCHLD);
1795 			}
1796 			PROC_UNLOCK(p->p_pptr);
1797 			mtx_lock_spin(&sched_lock);
1798 		}
1799 		mtx_assert(&Giant, MA_NOTOWNED);
1800 		thread_suspend_one(td);
1801 		PROC_UNLOCK(p);
1802 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1803 			if (p->p_numthreads == p->p_suspcount) {
1804 				thread_unsuspend_one(p->p_singlethread);
1805 			}
1806 		}
1807 		p->p_stats->p_ru.ru_nivcsw++;
1808 		mi_switch();
1809 		mtx_unlock_spin(&sched_lock);
1810 		PROC_LOCK(p);
1811 	}
1812 	return (0);
1813 }
1814 
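/*
 * Suspend one thread: account for it in p_suspcount, mark it suspended and
 * put it on the process's suspend queue.  Called with sched_lock held.
 */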
1815 void
1816 thread_suspend_one(struct thread *td)
1817 {
1818 	struct proc *p = td->td_proc;
1819 
1820 	mtx_assert(&sched_lock, MA_OWNED);
1821 	p->p_suspcount++;
1822 	TD_SET_SUSPENDED(td);
1823 	TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq);
1824 	/*
1825 	 * Hack: If we are suspending but are on the sleep queue
1826 	 * then we are in msleep or the cv equivalent. We
1827 	 * want to look like we have two Inhibitors.
1828 	 * May already be set.. doesn't matter.
1829 	 * It may already be set; that doesn't matter.
1830 	if (TD_ON_SLEEPQ(td))
1831 		TD_SET_SLEEPING(td);
1832 }
1833 
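/*
 * Undo a single suspension: take the thread off the suspend queue and make
 * it runnable again.  Called with sched_lock held.
 */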
1834 void
1835 thread_unsuspend_one(struct thread *td)
1836 {
1837 	struct proc *p = td->td_proc;
1838 
1839 	mtx_assert(&sched_lock, MA_OWNED);
1840 	TAILQ_REMOVE(&p->p_suspended, td, td_runq);
1841 	TD_CLR_SUSPENDED(td);
1842 	p->p_suspcount--;
1843 	setrunnable(td);
1844 }
1845 
1846 /*
1847  * Allow all threads blocked by single threading to continue running.
1848  */
1849 void
1850 thread_unsuspend(struct proc *p)
1851 {
1852 	struct thread *td;
1853 
1854 	mtx_assert(&sched_lock, MA_OWNED);
1855 	PROC_LOCK_ASSERT(p, MA_OWNED);
1856 	if (!P_SHOULDSTOP(p)) {
1857 		while (( td = TAILQ_FIRST(&p->p_suspended))) {
1858 			thread_unsuspend_one(td);
1859 		}
1860 	} else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
1861 	    (p->p_numthreads == p->p_suspcount)) {
1862 		/*
1863 		 * Stopping everything also did the job for the single
1864 		 * threading request. Now we've downgraded to single-threaded,
1865 		 * let it continue.
1866 		 */
1867 		thread_unsuspend_one(p->p_singlethread);
1868 	}
1869 }
1870 
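/*
 * End the single-threading started by thread_single() and let any
 * suspended threads run again.
 */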
1871 void
1872 thread_single_end(void)
1873 {
1874 	struct thread *td;
1875 	struct proc *p;
1876 
1877 	td = curthread;
1878 	p = td->td_proc;
1879 	PROC_LOCK_ASSERT(p, MA_OWNED);
1880 	p->p_flag &= ~P_STOPPED_SINGLE;
1881 	p->p_singlethread = NULL;
1882 	/*
1883 	 * If there are other threads they may now run,
1884 	 * unless of course there is a blanket 'stop order'
1885 	 * on the process. The single threader must be allowed
1886 	 * to continue however as this is a bad place to stop.
1887 	 */
1888 	if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
1889 		mtx_lock_spin(&sched_lock);
1890 		while (( td = TAILQ_FIRST(&p->p_suspended))) {
1891 			thread_unsuspend_one(td);
1892 		}
1893 		mtx_unlock_spin(&sched_lock);
1894 	}
1895 }
1896 
1897 
1898