xref: /freebsd/sys/kern/kern_thread.c (revision de028f5a4a67b635ea3e45799be822c8daa1ff20)
1 /*
2  * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
3  *  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice(s), this list of conditions and the following disclaimer as
10  *    the first lines of this file unmodified other than the possible
11  *    addition of one or more copyright notices.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice(s), this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
26  * DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/kernel.h>
34 #include <sys/lock.h>
35 #include <sys/malloc.h>
36 #include <sys/mutex.h>
37 #include <sys/proc.h>
38 #include <sys/smp.h>
39 #include <sys/sysctl.h>
40 #include <sys/sysproto.h>
41 #include <sys/filedesc.h>
42 #include <sys/sched.h>
43 #include <sys/signalvar.h>
44 #include <sys/sx.h>
45 #include <sys/tty.h>
46 #include <sys/user.h>
47 #include <sys/jail.h>
48 #include <sys/kse.h>
49 #include <sys/ktr.h>
50 #include <sys/ucontext.h>
51 
52 #include <vm/vm.h>
53 #include <vm/vm_object.h>
54 #include <vm/pmap.h>
55 #include <vm/uma.h>
56 #include <vm/vm_map.h>
57 
58 #include <machine/frame.h>
59 
60 /*
61  * KSEGRP related storage.
62  */
63 static uma_zone_t ksegrp_zone;
64 static uma_zone_t kse_zone;
65 static uma_zone_t thread_zone;
66 
67 /* DEBUG ONLY */
68 SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
69 static int oiks_debug = 0;	/* 0 disable, 1 printf, 2 enter debugger */
70 SYSCTL_INT(_kern_threads, OID_AUTO, oiks, CTLFLAG_RW,
71 	&oiks_debug, 0, "OIKS thread debug");
72 
73 static int oiks_max_threads_per_proc = 10;
74 SYSCTL_INT(_kern_threads, OID_AUTO, oiks_max_per_proc, CTLFLAG_RW,
75 	&oiks_max_threads_per_proc, 0, "Debug limit on threads per proc");
76 
77 static int max_threads_per_proc = 30;
78 SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
79 	&max_threads_per_proc, 0, "Limit on threads per proc");
80 
81 static int max_groups_per_proc = 5;
82 SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW,
83 	&max_groups_per_proc, 0, "Limit on thread groups per proc");
84 
85 #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
86 
87 struct threadqueue zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
88 TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses);
89 TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps);
90 struct mtx zombie_thread_lock;
91 MTX_SYSINIT(zombie_thread_lock, &zombie_thread_lock,
92     "zombie_thread_lock", MTX_SPIN);
93 
94 
95 
96 void kse_purge(struct proc *p, struct thread *td);
97 /*
98  * Prepare a thread for use.
99  */
100 static void
101 thread_ctor(void *mem, int size, void *arg)
102 {
103 	struct thread	*td;
104 
105 	td = (struct thread *)mem;
106 	td->td_state = TDS_INACTIVE;
107 	td->td_flags |= TDF_UNBOUND;
108 }
109 
110 /*
111  * Reclaim a thread after use.
112  */
113 static void
114 thread_dtor(void *mem, int size, void *arg)
115 {
116 	struct thread	*td;
117 
118 	td = (struct thread *)mem;
119 
120 #ifdef INVARIANTS
121 	/* Verify that this thread is in a safe state to free. */
122 	switch (td->td_state) {
123 	case TDS_INHIBITED:
124 	case TDS_RUNNING:
125 	case TDS_CAN_RUN:
126 	case TDS_RUNQ:
127 		/*
128 		 * We must never unlink a thread that is in one of
129 		 * these states, because it is currently active.
130 		 */
131 		panic("bad state for thread unlinking");
132 		/* NOTREACHED */
133 	case TDS_INACTIVE:
134 		break;
135 	default:
136 		panic("bad thread state");
137 		/* NOTREACHED */
138 	}
139 #endif
140 }
141 
142 /*
143  * Initialize type-stable parts of a thread (when newly created).
144  */
145 static void
146 thread_init(void *mem, int size)
147 {
148 	struct thread	*td;
149 
150 	td = (struct thread *)mem;
151 	mtx_lock(&Giant);
152 	pmap_new_thread(td, 0);
153 	mtx_unlock(&Giant);
154 	cpu_thread_setup(td);
155 	td->td_sched = (struct td_sched *)&td[1];
156 }
157 
158 /*
159  * Tear down type-stable parts of a thread (just before being discarded).
160  */
161 static void
162 thread_fini(void *mem, int size)
163 {
164 	struct thread	*td;
165 
166 	td = (struct thread *)mem;
167 	pmap_dispose_thread(td);
168 }
169 /*
170  * Initialize type-stable parts of a kse (when newly created).
171  */
172 static void
173 kse_init(void *mem, int size)
174 {
175 	struct kse	*ke;
176 
177 	ke = (struct kse *)mem;
178 	ke->ke_sched = (struct ke_sched *)&ke[1];
179 }
180 /*
181  * Initialize type-stable parts of a ksegrp (when newly created).
182  */
183 static void
184 ksegrp_init(void *mem, int size)
185 {
186 	struct ksegrp	*kg;
187 
188 	kg = (struct ksegrp *)mem;
189 	kg->kg_sched = (struct kg_sched *)&kg[1];
190 }
191 
192 /*
193  * KSE is linked onto the idle queue.
194  */
195 void
196 kse_link(struct kse *ke, struct ksegrp *kg)
197 {
198 	struct proc *p = kg->kg_proc;
199 
200 	TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
201 	kg->kg_kses++;
202 	ke->ke_state = KES_UNQUEUED;
203 	ke->ke_proc	= p;
204 	ke->ke_ksegrp	= kg;
205 	ke->ke_thread	= NULL;
206 	ke->ke_oncpu = NOCPU;
207 }
208 
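/*
 * Unlink a KSE from its ksegrp: pull it off the idle queue if it is idle,
 * remove it from the group's KSE list (unlinking the whole ksegrp if this
 * was its last KSE), and stash it for later reaping.
 * Called with sched_lock held.
 */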
209 void
210 kse_unlink(struct kse *ke)
211 {
212 	struct ksegrp *kg;
213 
214 	mtx_assert(&sched_lock, MA_OWNED);
215 	kg = ke->ke_ksegrp;
216 	if (ke->ke_state == KES_IDLE) {
217 		kg->kg_idle_kses--;
218 		TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
219 	}
220 
221 	TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
222 	if (--kg->kg_kses == 0) {
223 			ksegrp_unlink(kg);
224 	}
225 	/*
226 	 * Aggregate stats from the KSE
227 	 */
228 	kse_stash(ke);
229 }
230 
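/*
 * Initialize a new ksegrp's queues and counters and link it into
 * the given process.
 */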
231 void
232 ksegrp_link(struct ksegrp *kg, struct proc *p)
233 {
234 
235 	TAILQ_INIT(&kg->kg_threads);
236 	TAILQ_INIT(&kg->kg_runq);	/* links with td_runq */
237 	TAILQ_INIT(&kg->kg_slpq);	/* links with td_runq */
238 	TAILQ_INIT(&kg->kg_kseq);	/* all kses in ksegrp */
239 	TAILQ_INIT(&kg->kg_iq);		/* idle kses in ksegrp */
240 	TAILQ_INIT(&kg->kg_lq);		/* loan kses in ksegrp */
241 	kg->kg_proc	= p;
242 /* the following counters are in the -zero- section and may not need clearing */
243 	kg->kg_numthreads = 0;
244 	kg->kg_runnable = 0;
245 	kg->kg_kses = 0;
246 	kg->kg_idle_kses = 0;
247 	kg->kg_loan_kses = 0;
248 	kg->kg_runq_kses = 0; /* XXXKSE change name */
249 /* link it in now that it's consistent */
250 	p->p_numksegrps++;
251 	TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
252 }
253 
254 void
255 ksegrp_unlink(struct ksegrp *kg)
256 {
257 	struct proc *p;
258 
259 	mtx_assert(&sched_lock, MA_OWNED);
260 	p = kg->kg_proc;
261 	KASSERT(((kg->kg_numthreads == 0) && (kg->kg_kses == 0)),
262 	    ("kseg_unlink: residual threads or KSEs"));
263 	TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
264 	p->p_numksegrps--;
265 	/*
266 	 * Aggregate stats from the KSE
267 	 */
268 	ksegrp_stash(kg);
269 }
270 
271 /*
272  * for a newly created process,
273  * link up the structure and its initial threads, etc.
274  */
275 void
276 proc_linkup(struct proc *p, struct ksegrp *kg,
277 			struct kse *ke, struct thread *td)
278 {
279 
280 	TAILQ_INIT(&p->p_ksegrps);	     /* all ksegrps in proc */
281 	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
282 	TAILQ_INIT(&p->p_suspended);	     /* Threads suspended */
283 	p->p_numksegrps = 0;
284 	p->p_numthreads = 0;
285 
286 	ksegrp_link(kg, p);
287 	kse_link(ke, kg);
288 	thread_link(td, kg);
289 }
290 
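/*
 * System call: interrupt the thread whose mailbox is uap->tmbx.
 * The target thread is flagged with TDF_INTERRUPT and, if it is in an
 * interruptible sleep, the sleep is aborted.  Returns ESRCH if no
 * thread with that mailbox exists.
 */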
291 int
292 kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap)
293 {
294 	struct proc *p;
295 	struct thread *td2;
296 
297 	p = td->td_proc;
298 	/* KSE-enabled processes only, please. */
299 	if (!(p->p_flag & P_KSES))
300 		return (EINVAL);
301 	if (uap->tmbx == NULL)
302 		return (EINVAL);
303 	mtx_lock_spin(&sched_lock);
304 	FOREACH_THREAD_IN_PROC(p, td2) {
305 		if (td2->td_mailbox == uap->tmbx) {
306 			td2->td_flags |= TDF_INTERRUPT;
307 			if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) {
308 				if (td2->td_flags & TDF_CVWAITQ)
309 					cv_abort(td2);
310 				else
311 					abortsleep(td2);
312 			}
313 			mtx_unlock_spin(&sched_lock);
314 			td->td_retval[0] = 0;
315 			td->td_retval[1] = 0;
316 			return (0);
317 		}
318 	}
319 	mtx_unlock_spin(&sched_lock);
320 	return (ESRCH);
321 }
322 
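/*
 * System call: the calling (bound) thread is finished with its KSE.
 * If it is the only thread and ksegrp left, simply turn off KSE mode;
 * otherwise mark the KSE for destruction and exit the thread.
 */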
323 int
324 kse_exit(struct thread *td, struct kse_exit_args *uap)
325 {
326 	struct proc *p;
327 	struct ksegrp *kg;
328 
329 	p = td->td_proc;
330 	/* KSE-enabled processes only, please. */
331 	if (!(p->p_flag & P_KSES))
332 		return (EINVAL);
333 	/* must be a bound thread */
334 	if (td->td_flags & TDF_UNBOUND)
335 		return (EINVAL);
336 	kg = td->td_ksegrp;
337 	/* serialize killing kse */
338 	PROC_LOCK(p);
339 	mtx_lock_spin(&sched_lock);
340 	if ((kg->kg_kses == 1) && (kg->kg_numthreads > 1)) {
341 		mtx_unlock_spin(&sched_lock);
342 		PROC_UNLOCK(p);
343 		return (EDEADLK);
344 	}
345 	if ((p->p_numthreads == 1) && (p->p_numksegrps == 1)) {
346 		p->p_flag &= ~P_KSES;
347 		mtx_unlock_spin(&sched_lock);
348 		PROC_UNLOCK(p);
349 	} else {
350 		while (mtx_owned(&Giant))
351 			mtx_unlock(&Giant);
352 		td->td_kse->ke_flags |= KEF_EXIT;
353 		thread_exit();
354 		/* NOTREACHED */
355 	}
356 	return (0);
357 }
358 
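/*
 * System call: release the calling thread's KSE for use by upcalls.
 * Only valid for a bound thread whose KSE has a mailbox.  If this is
 * the last thread in the process, sleep until an upcall is wanted and
 * schedule one; the calling thread then exits.
 */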
359 int
360 kse_release(struct thread *td, struct kse_release_args *uap)
361 {
362 	struct proc *p;
363 
364 	p = td->td_proc;
365 	/* KSE-enabled processes only */
366 	if (!(p->p_flag & P_KSES))
367 		return (EINVAL);
368 	/*
369 	 * Must be a bound thread, and the kse must have a mailbox ready;
370 	 * if not, the kse cannot generate an upcall.
371 	 */
372 	if (!(td->td_flags & TDF_UNBOUND) && (td->td_kse->ke_mailbox != NULL)) {
373 		PROC_LOCK(p);
374 		mtx_lock_spin(&sched_lock);
375 		/* prevent last thread from exiting */
376 		if (p->p_numthreads == 1) {
377 			mtx_unlock_spin(&sched_lock);
378 			if (td->td_standin == NULL) {
379 				PROC_UNLOCK(p);
380 				td->td_standin = thread_alloc();
381 				PROC_LOCK(p);
382 			}
383 			msleep(p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH,
384 			       "pause", 0);
385 			mtx_lock_spin(&sched_lock);
386 			td->td_flags |= TDF_UNBOUND;
387 			thread_schedule_upcall(td, td->td_kse);
388 		}
389 		thread_exit();
390 		/* NOTREACHED */
391 	}
392 	return (EINVAL);
393 }
394 
395 /* struct kse_wakeup_args {
396 	struct kse_mailbox *mbx;
397 }; */
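/*
 * System call: wake a KSE so that it performs an upcall.  If a mailbox
 * is supplied, the matching KSE is used (only if it is idle); otherwise
 * the first idle KSE in the caller's ksegrp is chosen.
 */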
398 int
399 kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
400 {
401 	struct proc *p;
402 	struct kse *ke, *ke2;
403 	struct ksegrp *kg;
404 
405 	p = td->td_proc;
406 	/* KSE-enabled processes only, please. */
407 	if (!(p->p_flag & P_KSES))
408 		return EINVAL;
409 	if (td->td_standin == NULL)
410 		td->td_standin = thread_alloc();
411 	ke = NULL;
412 	mtx_lock_spin(&sched_lock);
413 	if (uap->mbx) {
414 		FOREACH_KSEGRP_IN_PROC(p, kg) {
415 			FOREACH_KSE_IN_GROUP(kg, ke2) {
416 				if (ke2->ke_mailbox != uap->mbx)
417 					continue;
418 				if (ke2->ke_state == KES_IDLE) {
419 					ke = ke2;
420 					goto found;
421 				} else {
422 					mtx_unlock_spin(&sched_lock);
423 					td->td_retval[0] = 0;
424 					td->td_retval[1] = 0;
425 					return (0);
426 				}
427 			}
428 		}
429 	} else {
430 		kg = td->td_ksegrp;
431 		ke = TAILQ_FIRST(&kg->kg_iq);
432 	}
433 	if (ke == NULL) {
434 		mtx_unlock_spin(&sched_lock);
435 		return (ESRCH);
436 	}
437 found:
438 	thread_schedule_upcall(td, ke);
439 	mtx_unlock_spin(&sched_lock);
440 	td->td_retval[0] = 0;
441 	td->td_retval[1] = 0;
442 	return (0);
443 }
444 
445 /*
446  * No new KSEG: on the first call, use the current KSE and don't schedule an upcall.
447  * In all other situations, allocate a new KSE and schedule an upcall on it.
448  */
449 /* struct kse_create_args {
450 	struct kse_mailbox *mbx;
451 	int newgroup;
452 }; */
453 int
454 kse_create(struct thread *td, struct kse_create_args *uap)
455 {
456 	struct kse *newke;
457 	struct kse *ke;
458 	struct ksegrp *newkg;
459 	struct ksegrp *kg;
460 	struct proc *p;
461 	struct kse_mailbox mbx;
462 	int err;
463 
464 	p = td->td_proc;
465 	if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
466 		return (err);
467 
468 	p->p_flag |= P_KSES; /* easier to just set it than to test and set */
469 	kg = td->td_ksegrp;
470 	if (uap->newgroup) {
471 		if (p->p_numksegrps >= max_groups_per_proc)
472 			return (EPROCLIM);
473 		/*
474 		 * If we want a new KSEGRP it doesn't matter whether
475 		 * we have already fired up KSE mode before or not.
476 		 * We put the process in KSE mode and create a new KSEGRP
477 		 * and KSE. If our KSE has not got a mailbox yet then
478 		 * that doesn't matter, just leave it that way. It will
479 		 * ensure that this thread stays BOUND. It's possible
480 		 * that the call came from a threaded library and the main
481 		 * program knows nothing of threads.
482 		 */
483 		newkg = ksegrp_alloc();
484 		bzero(&newkg->kg_startzero, RANGEOF(struct ksegrp,
485 		      kg_startzero, kg_endzero));
486 		bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
487 		      RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy));
488 		newke = kse_alloc();
489 	} else {
490 		/*
491 		 * Otherwise, if we have already set this KSE
492 		 * to have a mailbox, we want to make another KSE here,
493 		 * but only if we are not already at the limit, which
494 		 * is 1 per CPU max.
495 		 *
496 		 * If the current KSE doesn't have a mailbox we just use it
497 		 * and give it one.
498 		 *
499 		 * Because we don't like to access
500 		 * the KSE outside of schedlock if we are UNBOUND,
501 		 * (because it can change if we are preempted by an interrupt)
502 		 * we can infer that it has a mailbox if we are UNBOUND,
503 		 * and only need to actually look at it if we are BOUND,
504 		 * which is safe.
505 		 */
506 		if ((td->td_flags & TDF_UNBOUND) || td->td_kse->ke_mailbox) {
507 			if (oiks_debug == 0) {
508 #ifdef SMP
509 			if (kg->kg_kses > mp_ncpus)
510 #endif
511 				return (EPROCLIM);
512 			}
513 			newke = kse_alloc();
514 		} else {
515 			newke = NULL;
516 		}
517 		newkg = NULL;
518 	}
519 	if (newke) {
520 		bzero(&newke->ke_startzero, RANGEOF(struct kse,
521 		      ke_startzero, ke_endzero));
522 #if 0
523 		bcopy(&ke->ke_startcopy, &newke->ke_startcopy,
524 		      RANGEOF(struct kse, ke_startcopy, ke_endcopy));
525 #endif
526 		/* For the first call this may not have been set */
527 		if (td->td_standin == NULL) {
528 			td->td_standin = thread_alloc();
529 		}
530 		mtx_lock_spin(&sched_lock);
531 		if (newkg) {
532 			if (p->p_numksegrps >= max_groups_per_proc) {
533 				mtx_unlock_spin(&sched_lock);
534 				ksegrp_free(newkg);
535 				kse_free(newke);
536 				return (EPROCLIM);
537 			}
538 			ksegrp_link(newkg, p);
539 		}
540 		else
541 			newkg = kg;
542 		kse_link(newke, newkg);
543 		if (p->p_sflag & PS_NEEDSIGCHK)
544 			newke->ke_flags |= KEF_ASTPENDING;
545 		newke->ke_mailbox = uap->mbx;
546 		newke->ke_upcall = mbx.km_func;
547 		bcopy(&mbx.km_stack, &newke->ke_stack, sizeof(stack_t));
548 		thread_schedule_upcall(td, newke);
549 		mtx_unlock_spin(&sched_lock);
550 	} else {
551 		/*
552 		 * If we didn't allocate a new KSE then we are using
553 		 * the existing (BOUND) kse.
554 		 */
555 		ke = td->td_kse;
556 		ke->ke_mailbox = uap->mbx;
557 		ke->ke_upcall = mbx.km_func;
558 		bcopy(&mbx.km_stack, &ke->ke_stack, sizeof(stack_t));
559 	}
560 	/*
561 	 * Fill out the KSE-mode specific fields of the new kse.
562 	 */
563 
564 	td->td_retval[0] = 0;
565 	td->td_retval[1] = 0;
566 	return (0);
567 }
568 
569 /*
570  * Fill a ucontext_t with a thread's context information.
571  *
572  * This is an analogue to getcontext(3).
573  */
574 void
575 thread_getcontext(struct thread *td, ucontext_t *uc)
576 {
577 
578 /*
579  * XXX this is declared in a MD include file, i386/include/ucontext.h but
580  * is used in MI code.
581  */
582 #ifdef __i386__
583 	get_mcontext(td, &uc->uc_mcontext);
584 #endif
585 	uc->uc_sigmask = td->td_proc->p_sigmask;
586 }
587 
588 /*
589  * Set a thread's context from a ucontext_t.
590  *
591  * This is an analogue to setcontext(3).
592  */
593 int
594 thread_setcontext(struct thread *td, ucontext_t *uc)
595 {
596 	int ret;
597 
598 /*
599  * XXX this is declared in a MD include file, i386/include/ucontext.h but
600  * is used in MI code.
601  */
602 #ifdef __i386__
603 	ret = set_mcontext(td, &uc->uc_mcontext);
604 #else
605 	ret = ENOSYS;
606 #endif
607 	if (ret == 0) {
608 		SIG_CANTMASK(uc->uc_sigmask);
609 		PROC_LOCK(td->td_proc);
610 		td->td_proc->p_sigmask = uc->uc_sigmask;
611 		PROC_UNLOCK(td->td_proc);
612 	}
613 	return (ret);
614 }
615 
616 /*
617  * Initialize global thread allocation resources.
618  */
619 void
620 threadinit(void)
621 {
622 
623 #ifndef __ia64__
624 	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
625 	    thread_ctor, thread_dtor, thread_init, thread_fini,
626 	    UMA_ALIGN_CACHE, 0);
627 #else
628 	/*
629 	 * XXX the ia64 kstack allocator is really lame and is at the mercy
630 	 * of contigmalloc().  This hackery is to pre-construct a whole
631 	 * pile of thread structures with associated kernel stacks early
632 	 * in the system startup while contigmalloc() still works. Once we
633 	 * have them, keep them.  Sigh.
634 	 */
635 	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
636 	    thread_ctor, thread_dtor, thread_init, thread_fini,
637 	    UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
638 	uma_prealloc(thread_zone, 512);		/* XXX arbitrary */
639 #endif
640 	ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(),
641 	    NULL, NULL, ksegrp_init, NULL,
642 	    UMA_ALIGN_CACHE, 0);
643 	kse_zone = uma_zcreate("KSE", sched_sizeof_kse(),
644 	    NULL, NULL, kse_init, NULL,
645 	    UMA_ALIGN_CACHE, 0);
646 }
647 
648 /*
649  * Stash an embarrassingly extra thread into the zombie thread queue.
650  */
651 void
652 thread_stash(struct thread *td)
653 {
654 	mtx_lock_spin(&zombie_thread_lock);
655 	TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq);
656 	mtx_unlock_spin(&zombie_thread_lock);
657 }
658 
659 /*
660  * Stash an embarrassingly extra kse into the zombie kse queue.
661  */
662 void
663 kse_stash(struct kse *ke)
664 {
665 	mtx_lock_spin(&zombie_thread_lock);
666 	TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq);
667 	mtx_unlock_spin(&zombie_thread_lock);
668 }
669 
670 /*
671  * Stash an embarrassingly extra ksegrp into the zombie ksegrp queue.
672  */
673 void
674 ksegrp_stash(struct ksegrp *kg)
675 {
676 	mtx_lock_spin(&zombie_thread_lock);
677 	TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp);
678 	mtx_unlock_spin(&zombie_thread_lock);
679 }
680 
681 /*
682  * Reap zombie threads.
683  */
684 void
685 thread_reap(void)
686 {
687 	struct thread *td_first, *td_next;
688 	struct kse *ke_first, *ke_next;
689 	struct ksegrp *kg_first, * kg_next;
690 
691 	/*
692 	 * Don't even bother to lock if there are none at this instant;
693 	 * we really don't care about the next instant.
694 	 */
695 	if ((!TAILQ_EMPTY(&zombie_threads))
696 	    || (!TAILQ_EMPTY(&zombie_kses))
697 	    || (!TAILQ_EMPTY(&zombie_ksegrps))) {
698 		mtx_lock_spin(&zombie_thread_lock);
699 		td_first = TAILQ_FIRST(&zombie_threads);
700 		ke_first = TAILQ_FIRST(&zombie_kses);
701 		kg_first = TAILQ_FIRST(&zombie_ksegrps);
702 		if (td_first)
703 			TAILQ_INIT(&zombie_threads);
704 		if (ke_first)
705 			TAILQ_INIT(&zombie_kses);
706 		if (kg_first)
707 			TAILQ_INIT(&zombie_ksegrps);
708 		mtx_unlock_spin(&zombie_thread_lock);
709 		while (td_first) {
710 			td_next = TAILQ_NEXT(td_first, td_runq);
711 			thread_free(td_first);
712 			td_first = td_next;
713 		}
714 		while (ke_first) {
715 			ke_next = TAILQ_NEXT(ke_first, ke_procq);
716 			kse_free(ke_first);
717 			ke_first = ke_next;
718 		}
719 		while (kg_first) {
720 			kg_next = TAILQ_NEXT(kg_first, kg_ksegrp);
721 			ksegrp_free(kg_first);
722 			kg_first = kg_next;
723 		}
724 	}
725 }
726 
727 /*
728  * Allocate a ksegrp.
729  */
730 struct ksegrp *
731 ksegrp_alloc(void)
732 {
733 	return (uma_zalloc(ksegrp_zone, M_WAITOK));
734 }
735 
736 /*
737  * Allocate a kse.
738  */
739 struct kse *
740 kse_alloc(void)
741 {
742 	return (uma_zalloc(kse_zone, M_WAITOK));
743 }
744 
745 /*
746  * Allocate a thread.
747  */
748 struct thread *
749 thread_alloc(void)
750 {
751 	thread_reap(); /* check if any zombies to get */
752 	return (uma_zalloc(thread_zone, M_WAITOK));
753 }
754 
755 /*
756  * Deallocate a ksegrp.
757  */
758 void
759 ksegrp_free(struct ksegrp *td)
760 {
761 	uma_zfree(ksegrp_zone, td);
762 }
763 
764 /*
765  * Deallocate a kse.
766  */
767 void
768 kse_free(struct kse *td)
769 {
770 	uma_zfree(kse_zone, td);
771 }
772 
773 /*
774  * Deallocate a thread.
775  */
776 void
777 thread_free(struct thread *td)
778 {
779 	uma_zfree(thread_zone, td);
780 }
781 
782 /*
783  * Store the thread context in the UTS's mailbox, then add the mailbox
784  * at the head of a list we are building in user space.
785  * The list is anchored in the ksegrp structure.
786  */
787 int
788 thread_export_context(struct thread *td)
789 {
790 	struct proc *p;
791 	struct ksegrp *kg;
792 	uintptr_t mbx;
793 	void *addr;
794 	int error;
795 	ucontext_t uc;
796 	uint temp;
797 
798 	p = td->td_proc;
799 	kg = td->td_ksegrp;
800 
801 	/* Export the user/machine context. */
802 #if 0
803 	addr = (caddr_t)td->td_mailbox +
804 	    offsetof(struct kse_thr_mailbox, tm_context);
805 #else /* if user pointer arithmetic is valid in the kernel */
806 		addr = (void *)(&td->td_mailbox->tm_context);
807 #endif
808 	error = copyin(addr, &uc, sizeof(ucontext_t));
809 	if (error == 0) {
810 		thread_getcontext(td, &uc);
811 		error = copyout(&uc, addr, sizeof(ucontext_t));
812 
813 	}
814 	if (error) {
815 		PROC_LOCK(p);
816 		psignal(p, SIGSEGV);
817 		PROC_UNLOCK(p);
818 		return (error);
819 	}
820 	/* get address in latest mbox of list pointer */
821 #if 0
822 	addr = (caddr_t)td->td_mailbox
823 	    + offsetof(struct kse_thr_mailbox , tm_next);
824 #else /* if user pointer arithmetic is valid in the kernel */
825 	addr = (void *)(&td->td_mailbox->tm_next);
826 #endif
827 	/*
828 	 * Put the saved address of the previous first
829 	 * entry into this one
830 	 */
831 	for (;;) {
832 		mbx = (uintptr_t)kg->kg_completed;
833 		if (suword(addr, mbx)) {
834 			goto bad;
835 		}
836 		PROC_LOCK(p);
837 		if (mbx == (uintptr_t)kg->kg_completed) {
838 			kg->kg_completed = td->td_mailbox;
839 			PROC_UNLOCK(p);
840 			break;
841 		}
842 		PROC_UNLOCK(p);
843 	}
844 	addr = (caddr_t)td->td_mailbox
845 		 + offsetof(struct kse_thr_mailbox, tm_sticks);
846 	temp = fuword(addr) + td->td_usticks;
847 	if (suword(addr, temp))
848 		goto bad;
849 	return (0);
850 
851 bad:
852 	PROC_LOCK(p);
853 	psignal(p, SIGSEGV);
854 	PROC_UNLOCK(p);
855 	return (EFAULT);
856 }
857 
858 /*
859  * Take the list of completed mailboxes for this KSEGRP and put them on this
860  * KSE's mailbox as it's the next one going up.
861  */
862 static int
863 thread_link_mboxes(struct ksegrp *kg, struct kse *ke)
864 {
865 	struct proc *p = kg->kg_proc;
866 	void *addr;
867 	uintptr_t mbx;
868 
869 #if 0
870 	addr = (caddr_t)ke->ke_mailbox
871 	    + offsetof(struct kse_mailbox, km_completed);
872 #else /* if user pointer arithmetic is valid in the kernel */
873 		addr = (void *)(&ke->ke_mailbox->km_completed);
874 #endif
875 	for (;;) {
876 		mbx = (uintptr_t)kg->kg_completed;
877 		if (suword(addr, mbx)) {
878 			PROC_LOCK(p);
879 			psignal(p, SIGSEGV);
880 			PROC_UNLOCK(p);
881 			return (EFAULT);
882 		}
883 		/* XXXKSE could use atomic CMPXCH here */
884 		PROC_LOCK(p);
885 		if (mbx == (uintptr_t)kg->kg_completed) {
886 			kg->kg_completed = NULL;
887 			PROC_UNLOCK(p);
888 			break;
889 		}
890 		PROC_UNLOCK(p);
891 	}
892 	return (0);
893 }
894 
895 /*
896  * This function should be called at statclock interrupt time
897  */
898 int
899 thread_add_ticks_intr(int user, uint ticks)
900 {
901 	struct thread *td = curthread;
902 	struct kse *ke = td->td_kse;
903 
904 	if (ke->ke_mailbox == NULL)
905 		return -1;
906 	if (user) {
907 		/* Currently always done via ast() */
908 		ke->ke_flags |= KEF_ASTPENDING;
909 		ke->ke_uuticks += ticks;
910 	} else {
911 		if (td->td_mailbox != NULL)
912 			td->td_usticks += ticks;
913 		else
914 			ke->ke_usticks += ticks;
915 	}
916 	return 0;
917 }
918 
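/*
 * Push the tick counts accumulated in the current (bound) thread's KSE
 * out to the tm_uticks/tm_sticks fields of the thread mailbox currently
 * named in the KSE mailbox.
 */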
919 static int
920 thread_update_uticks(void)
921 {
922 	struct thread *td = curthread;
923 	struct proc *p = td->td_proc;
924 	struct kse *ke = td->td_kse;
925 	struct kse_thr_mailbox *tmbx;
926 	caddr_t addr;
927 	uint uticks, sticks;
928 
929 	KASSERT(!(td->td_flags & TDF_UNBOUND), ("thread not bound."));
930 
931 	if (ke->ke_mailbox == NULL)
932 		return 0;
933 
934 	uticks = ke->ke_uuticks;
935 	ke->ke_uuticks = 0;
936 	sticks = ke->ke_usticks;
937 	ke->ke_usticks = 0;
938 	tmbx = (void *)fuword((caddr_t)ke->ke_mailbox
939 			+ offsetof(struct kse_mailbox, km_curthread));
940 	if ((tmbx == NULL) || (tmbx == (void *)-1))
941 		return 0;
942 	if (uticks) {
943 		addr = (caddr_t)tmbx + offsetof(struct kse_thr_mailbox, tm_uticks);
944 		uticks += fuword(addr);
945 		if (suword(addr, uticks))
946 			goto bad;
947 	}
948 	if (sticks) {
949 		addr = (caddr_t)tmbx + offsetof(struct kse_thr_mailbox, tm_sticks);
950 		sticks += fuword(addr);
951 		if (suword(addr, sticks))
952 			goto bad;
953 	}
954 	return 0;
955 bad:
956 	PROC_LOCK(p);
957 	psignal(p, SIGSEGV);
958 	PROC_UNLOCK(p);
959 	return -1;
960 }
961 
962 /*
963  * Discard the current thread and exit from its context.
964  *
965  * Because we can't free a thread while we're operating under its context,
966  * push the current thread into our KSE's ke_tdspare slot, freeing the
967  * thread that might be there currently. Because we know that only this
968  * processor will run our KSE, we needn't worry about someone else grabbing
969  * our context before we do a cpu_throw.
970  */
971 void
972 thread_exit(void)
973 {
974 	struct thread *td;
975 	struct kse *ke;
976 	struct proc *p;
977 	struct ksegrp	*kg;
978 
979 	td = curthread;
980 	kg = td->td_ksegrp;
981 	p = td->td_proc;
982 	ke = td->td_kse;
983 
984 	mtx_assert(&sched_lock, MA_OWNED);
985 	KASSERT(p != NULL, ("thread exiting without a process"));
986 	KASSERT(ke != NULL, ("thread exiting without a kse"));
987 	KASSERT(kg != NULL, ("thread exiting without a kse group"));
988 	PROC_LOCK_ASSERT(p, MA_OWNED);
989 	CTR1(KTR_PROC, "thread_exit: thread %p", td);
990 	KASSERT(!mtx_owned(&Giant), ("dying thread owns giant"));
991 
992 	if (ke->ke_tdspare != NULL) {
993 		thread_stash(ke->ke_tdspare);
994 		ke->ke_tdspare = NULL;
995 	}
996 	if (td->td_standin != NULL) {
997 		thread_stash(td->td_standin);
998 		td->td_standin = NULL;
999 	}
1000 
1001 	cpu_thread_exit(td);	/* XXXSMP */
1002 
1003 	/*
1004 	 * The last thread is left attached to the process
1005 	 * so that the whole bundle gets recycled. Skip
1006 	 * all this stuff.
1007 	 */
1008 	if (p->p_numthreads > 1) {
1009 		/*
1010 		 * Unlink this thread from its proc and the kseg.
1011 		 * In keeping with the other structs we probably should
1012 		 * have a thread_unlink() that does some of this but it
1013 		 * would only be called from here (I think) so it would
1014 		 * be a waste. (might be useful for proc_fini() as well.)
1015  		 */
1016 		TAILQ_REMOVE(&p->p_threads, td, td_plist);
1017 		p->p_numthreads--;
1018 		TAILQ_REMOVE(&kg->kg_threads, td, td_kglist);
1019 		kg->kg_numthreads--;
1020 		/*
1021 		 * The test below is NOT true if we are the
1022 		 * sole exiting thread. P_STOPPED_SINGLE is unset
1023 		 * in exit1() after it is the only survivor.
1024 		 */
1025 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1026 			if (p->p_numthreads == p->p_suspcount) {
1027 				thread_unsuspend_one(p->p_singlethread);
1028 			}
1029 		}
1030 
1031 		/* Reassign this thread's KSE. */
1032 		ke->ke_thread = NULL;
1033 		td->td_kse = NULL;
1034 		ke->ke_state = KES_UNQUEUED;
1035 		KASSERT((ke->ke_bound != td),
1036 		    ("thread_exit: entered with ke_bound set"));
1037 
1038 		/*
1039 		 * The reason for all this hoopla is
1040 		 * an attempt to stop our thread stack from being freed
1041 		 * until AFTER we have stopped running on it.
1042 		 * Since we are under schedlock, almost any method where
1043 		 * it is eventually freed by someone else is probably ok.
1044 		 * (Especially if they do it under schedlock). We could
1045 		 * almost free it here if we could be certain that
1046 		 * the uma code wouldn't pull it apart immediately,
1047 		 * but unfortunately we cannot guarantee that.
1048 		 *
1049 		 * For threads that are exiting and NOT killing their
1050 		 * KSEs we can just stash it in the KSE, however
1051 		 * in the case where the KSE is also being deallocated,
1052 		 * we need to store it somewhere else. It turns out that
1053 		 * we will never free the last KSE, so there is always one
1054 		 * other KSE available. We might as well just choose one
1055 		 * and stash it there. Being under schedlock should make that
1056 		 * safe.
1057 		 *
1058 		 * In borrower threads, we can stash it in the lender,
1059 		 * where it won't be needed until this thread is long gone.
1060 		 * Borrower threads can't kill their KSE anyhow, so even
1061 		 * the KSE would be a safe place for them. It is not
1062 		 * necessary to have a KSE (or KSEGRP) at all beyond this
1063 		 * point, while we are under the protection of schedlock.
1064 		 *
1065 		 * Either give the KSE to another thread to use (or make
1066 		 * it idle), or free it entirely, possibly along with its
1067 		 * ksegrp if it's the last one.
1068 		 */
1069 		if (ke->ke_flags & KEF_EXIT) {
1070 			kse_unlink(ke);
1071 			/*
1072 			 * Designate another KSE to hold our thread.
1073 			 * Safe as long as we abide by whatever lock
1074 			 * we control it with. The other KSE will not
1075 			 * be able to run it until we release the schedlock,
1076 			 * but we need to be careful about it deciding to
1077 			 * write to the stack before then. Luckily
1078 			 * I believe that while another thread's
1079 			 * standin thread can be used in this way, the
1080 			 * spare thread for the KSE cannot be used without
1081 			 * holding schedlock at least once.
1082 			 */
1083 			ke =  FIRST_KSE_IN_PROC(p);
1084 		} else {
1085 			kse_reassign(ke);
1086 		}
1087 #if 0
1088 		if (ke->ke_bound) {
1089 			/*
1090 			 * WE are a borrower..
1091 			 * stash our thread with the owner.
1092 			 */
1093 			if (ke->ke_bound->td_standin) {
1094 				thread_stash(ke->ke_bound->td_standin);
1095 			}
1096 			ke->ke_bound->td_standin = td;
1097 		} else {
1098 #endif
1099 			if (ke->ke_tdspare != NULL) {
1100 				thread_stash(ke->ke_tdspare);
1101 				ke->ke_tdspare = NULL;
1102 			}
1103 			ke->ke_tdspare = td;
1104 #if 0
1105 		}
1106 #endif
1107 		PROC_UNLOCK(p);
1108 		td->td_state	= TDS_INACTIVE;
1109 		td->td_proc	= NULL;
1110 		td->td_ksegrp	= NULL;
1111 		td->td_last_kse	= NULL;
1112 	} else {
1113 		PROC_UNLOCK(p);
1114 	}
1115 
1116 	cpu_throw();
1117 	/* NOTREACHED */
1118 }
1119 
1120 /*
1121  * Link a thread to a process.
1122  * Set up anything that needs to be initialized for it to
1123  * be used by the process.
1124  *
1125  * Note that we do not link to the proc's ucred here.
1126  * The thread is linked as if running but no KSE assigned.
1127  */
1128 void
1129 thread_link(struct thread *td, struct ksegrp *kg)
1130 {
1131 	struct proc *p;
1132 
1133 	p = kg->kg_proc;
1134 	td->td_state = TDS_INACTIVE;
1135 	td->td_proc	= p;
1136 	td->td_ksegrp	= kg;
1137 	td->td_last_kse	= NULL;
1138 
1139 	LIST_INIT(&td->td_contested);
1140 	callout_init(&td->td_slpcallout, 1);
1141 	TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
1142 	TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist);
1143 	p->p_numthreads++;
1144 	kg->kg_numthreads++;
1145 	if (oiks_debug && (p->p_numthreads > oiks_max_threads_per_proc)) {
1146 		printf("OIKS %d\n", p->p_numthreads);
1147 		if (oiks_debug > 1)
1148 			Debugger("OIKS");
1149 	}
1150 	td->td_kse	= NULL;
1151 }
1152 
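/*
 * Strip the process down to its last ksegrp: free all idle KSEs and
 * discard every ksegrp except the one owning the surviving thread.
 * Only valid once the process is down to a single thread.
 */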
1153 void
1154 kse_purge(struct proc *p, struct thread *td)
1155 {
1156 	struct kse *ke;
1157 	struct ksegrp *kg;
1158 
1159  	KASSERT(p->p_numthreads == 1, ("bad thread number"));
1160 	mtx_lock_spin(&sched_lock);
1161 	while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) {
1162 		while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) {
1163 			TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
1164 			kg->kg_idle_kses--;
1165 			TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
1166 			kg->kg_kses--;
1167 			if (ke->ke_tdspare)
1168 				thread_stash(ke->ke_tdspare);
1169    			kse_stash(ke);
1170 		}
1171 		TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
1172 		p->p_numksegrps--;
1173 		KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) ||
1174 		    ((kg->kg_kses == 1) && (kg == td->td_ksegrp)),
1175 			("wrong kg_kses"));
1176 		if (kg != td->td_ksegrp) {
1177 			ksegrp_stash(kg);
1178 		}
1179 	}
1180 	TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp);
1181 	p->p_numksegrps++;
1182 	mtx_unlock_spin(&sched_lock);
1183 }
1184 
1185 
1186 /*
1187  * Create a thread and schedule it for upcall on the KSE given.
1188  */
1189 struct thread *
1190 thread_schedule_upcall(struct thread *td, struct kse *ke)
1191 {
1192 	struct thread *td2;
1193 	struct ksegrp *kg;
1194 	int newkse;
1195 
1196 	mtx_assert(&sched_lock, MA_OWNED);
1197 	newkse = (ke != td->td_kse);
1198 
1199 	/*
1200 	 * If the kse is already owned by another thread then we can't
1201 	 * schedule an upcall because the other thread must be BOUND
1202 	 * which means it is not in a position to take an upcall.
1203 	 * We must be borrowing the KSE to allow us to complete some in-kernel
1204 	 * work. When we complete, the Bound thread will have the chance to
1205 	 * complete. This thread will sleep as planned. Hopefully there will
1206 	 * eventually be an unbound thread that can be converted to an
1207 	 * upcall to report the completion of this thread.
1208 	 */
1209 	if (ke->ke_bound && ((ke->ke_bound->td_flags & TDF_UNBOUND) == 0)) {
1210 		return (NULL);
1211 	}
1212 	KASSERT((ke->ke_bound == NULL), ("kse already bound"));
1213 
1214 	if (ke->ke_state == KES_IDLE) {
1215 		kg = ke->ke_ksegrp;
1216 		TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
1217 		kg->kg_idle_kses--;
1218 		ke->ke_state = KES_UNQUEUED;
1219 	}
1220 	if ((td2 = td->td_standin) != NULL) {
1221 		td->td_standin = NULL;
1222 	} else {
1223 		if (newkse)
1224 			panic("no reserve thread when called with a new kse");
1225 		/*
1226 		 * If called from (e.g.) sleep and we do not have
1227 		 * a reserve thread, then we've used it, so do not
1228 		 * create an upcall.
1229 		 */
1230 		return (NULL);
1231 	}
1232 	CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)",
1233 	     td2, td->td_proc->p_pid, td->td_proc->p_comm);
1234 	bzero(&td2->td_startzero,
1235 	    (unsigned)RANGEOF(struct thread, td_startzero, td_endzero));
1236 	bcopy(&td->td_startcopy, &td2->td_startcopy,
1237 	    (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy));
1238 	thread_link(td2, ke->ke_ksegrp);
1239 	cpu_set_upcall(td2, td->td_pcb);
1240 
1241 	/*
1242 	 * XXXKSE do we really need this? (default values for the
1243 	 * frame).
1244 	 */
1245 	bcopy(td->td_frame, td2->td_frame, sizeof(struct trapframe));
1246 
1247 	/*
1248 	 * Bind the new thread to the KSE,
1249 	 * and if it's our KSE, lend it back to ourself
1250 	 * so we can continue running.
1251 	 */
1252 	td2->td_ucred = crhold(td->td_ucred);
1253 	td2->td_flags = TDF_UPCALLING; /* note: BOUND */
1254 	td2->td_kse = ke;
1255 	td2->td_state = TDS_CAN_RUN;
1256 	td2->td_inhibitors = 0;
1257 	/*
1258 	 * If called from msleep(), we are working on the current
1259 	 * KSE so fake that we borrowed it. If called from
1260 	 * kse_create(), don't, as we have a new kse too.
1261 	 */
1262 	if (!newkse) {
1263 		/*
1264 		 * This thread will be scheduled when the current thread
1265 		 * blocks, exits or tries to enter userspace (whichever
1266 		 * happens first). When that happens the KSE will "revert"
1267 		 * to this thread in a BOUND manner. Since we are called
1268 		 * from msleep() this is going to be "very soon" in nearly
1269 		 * all cases.
1270 		 */
1271 		ke->ke_bound = td2;
1272 		TD_SET_LOAN(td2);
1273 	} else {
1274 		ke->ke_bound = NULL;
1275 		ke->ke_thread = td2;
1276 		ke->ke_state = KES_THREAD;
1277 		setrunqueue(td2);
1278 	}
1279 	return (td2);	/* bogus.. should be a void function */
1280 }
1281 
1282 /*
1283  * Schedule an upcall to notify a KSE process that it received signals.
1284  *
1285  * XXX - Modifying a sigset_t like this is totally bogus.
1286  */
1287 struct thread *
1288 signal_upcall(struct proc *p, int sig)
1289 {
1290 	struct thread *td, *td2;
1291 	struct kse *ke;
1292 	sigset_t ss;
1293 	int error;
1294 
1295 	PROC_LOCK_ASSERT(p, MA_OWNED);
1296 return (NULL);
1297 
1298 	td = FIRST_THREAD_IN_PROC(p);
1299 	ke = td->td_kse;
1300 	PROC_UNLOCK(p);
1301 	error = copyin(&ke->ke_mailbox->km_sigscaught, &ss, sizeof(sigset_t));
1302 	PROC_LOCK(p);
1303 	if (error)
1304 		return (NULL);
1305 	SIGADDSET(ss, sig);
1306 	PROC_UNLOCK(p);
1307 	error = copyout(&ss, &ke->ke_mailbox->km_sigscaught, sizeof(sigset_t));
1308 	PROC_LOCK(p);
1309 	if (error)
1310 		return (NULL);
1311 	if (td->td_standin == NULL)
1312 		td->td_standin = thread_alloc();
1313 	mtx_lock_spin(&sched_lock);
1314 	td2 = thread_schedule_upcall(td, ke); /* Bogus JRE */
1315 	mtx_unlock_spin(&sched_lock);
1316 	return (td2);
1317 }
1318 
1319 /*
1320  * Setup done on the thread when it enters the kernel.
1321  * XXXKSE Presently only for syscalls but eventually all kernel entries.
1322  */
1323 void
1324 thread_user_enter(struct proc *p, struct thread *td)
1325 {
1326 	struct kse *ke;
1327 
1328 	/*
1329 	 * First check that we shouldn't just abort.
1330 	 * But check if we are the single thread first!
1331 	 * XXX p_singlethread not locked, but should be safe.
1332 	 */
1333 	if ((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) {
1334 		PROC_LOCK(p);
1335 		mtx_lock_spin(&sched_lock);
1336 		thread_exit();
1337 		/* NOTREACHED */
1338 	}
1339 
1340 	/*
1341 	 * If we are doing a syscall in a KSE environment,
1342 	 * note where our mailbox is. There is always the
1343 	 * possibility that we could do this lazily (in sleep()),
1344 	 * but for now do it every time.
1345 	 */
1346 	ke = td->td_kse;
1347 	if (ke->ke_mailbox != NULL) {
1348 #if 0
1349 		td->td_mailbox = (void *)fuword((caddr_t)ke->ke_mailbox
1350 		    + offsetof(struct kse_mailbox, km_curthread));
1351 #else /* if user pointer arithmetic is ok in the kernel */
1352 		td->td_mailbox =
1353 		    (void *)fuword( (void *)&ke->ke_mailbox->km_curthread);
1354 #endif
1355 		if ((td->td_mailbox == NULL) ||
1356 		    (td->td_mailbox == (void *)-1)) {
1357 			td->td_mailbox = NULL;	/* single thread it.. */
1358 			mtx_lock_spin(&sched_lock);
1359 			td->td_flags &= ~TDF_UNBOUND;
1360 			mtx_unlock_spin(&sched_lock);
1361 		} else {
1362 			/*
1363 			 * When the thread limit is reached, act as if the thread
1364 			 * has already done an upcall.
1365 			 */
1366 		    	if (p->p_numthreads > max_threads_per_proc) {
1367 				if (td->td_standin != NULL)
1368 					thread_stash(td->td_standin);
1369 				td->td_standin = NULL;
1370 			} else {
1371 				if (td->td_standin == NULL)
1372 					td->td_standin = thread_alloc();
1373 			}
1374 			mtx_lock_spin(&sched_lock);
1375 			td->td_flags |= TDF_UNBOUND;
1376 			mtx_unlock_spin(&sched_lock);
1377 			td->td_usticks = 0;
1378 		}
1379 	}
1380 }
1381 
1382 /*
1383  * The extra work we go through if we are a threaded process when we
1384  * return to userland.
1385  *
1386  * If we are a KSE process and returning to user mode, check for
1387  * extra work to do before we return (e.g. for more syscalls
1388  * to complete first).  If we were in a critical section, we should
1389  * just return to let it finish. Same if we were in the UTS (in
1390  * which case the mailbox's context's busy indicator will be set).
1391  * The only traps we support will have set the mailbox.
1392  * We will clear it here.
1393  */
1394 int
1395 thread_userret(struct thread *td, struct trapframe *frame)
1396 {
1397 	int error;
1398 	int unbound;
1399 	struct kse *ke;
1400 	struct ksegrp *kg;
1401 	struct thread *td2;
1402 	struct proc *p;
1403 	struct timespec ts;
1404 
1405 	error = 0;
1406 
1407 	unbound = td->td_flags & TDF_UNBOUND;
1408 
1409 	kg = td->td_ksegrp;
1410 	p = td->td_proc;
1411 
1412 	/*
1413 	 * Originally bound threads never upcall but they may
1414 	 * loan out their KSE at this point.
1415 	 * Upcalls imply bound. They also may want to do some philanthropy.
1416 	 * Unbound threads on the other hand either yield to other work
1417 	 * or transform into an upcall.
1418 	 * (having saved their context to user space in both cases)
1419 	 */
1420 	if (unbound) {
1421 		/*
1422 		 * We are an unbound thread, looking to return to
1423 		 * user space.
1424 		 * There are several possibilities:
1425 		 * 1) we are using a borrowed KSE. save state and exit.
1426 		 *    kse_reassign() will recycle the kse as needed,
1427 		 * 2) we are not.. save state, and then convert ourself
1428 		 *    to be an upcall, bound to the KSE.
1429 		 *    if there are others that need the kse,
1430 		 *    give them a chance by doing an mi_switch().
1431 		 *    Because we are bound, control will eventually return
1432 		 *    to us here.
1433 		 * ***
1434 		 * Save the thread's context, and link it
1435 		 * into the KSEGRP's list of completed threads.
1436 		 */
1437 		error = thread_export_context(td);
1438 		td->td_mailbox = NULL;
1439 		td->td_usticks = 0;
1440 		if (error) {
1441 			/*
1442 			 * If we are not running on a borrowed KSE, then
1443 			 * failing to do the KSE operation just defaults
1444 			 * back to synchronous operation, so just return from
1445 			 * the syscall. If it IS borrowed, there is nothing
1446 			 * we can do. We just lose that context. We
1447 			 * probably should note this somewhere and send
1448 			 * the process a signal.
1449 			 */
1450 			PROC_LOCK(td->td_proc);
1451 			psignal(td->td_proc, SIGSEGV);
1452 			mtx_lock_spin(&sched_lock);
1453 			if (td->td_kse->ke_bound == NULL) {
1454 				td->td_flags &= ~TDF_UNBOUND;
1455 				PROC_UNLOCK(td->td_proc);
1456 				mtx_unlock_spin(&sched_lock);
1457 				thread_update_uticks();
1458 				return (error);	/* go sync */
1459 			}
1460 			thread_exit();
1461 		}
1462 
1463 		/*
1464 		 * if the KSE is owned and we are borrowing it,
1465 		 * don't make an upcall, just exit so that the owner
1466 		 * can get its KSE if it wants it.
1467 		 * Our context is already safely stored for later
1468 		 * use by the UTS.
1469 		 */
1470 		PROC_LOCK(p);
1471 		mtx_lock_spin(&sched_lock);
1472 		if (td->td_kse->ke_bound) {
1473 			thread_exit();
1474 		}
1475 		PROC_UNLOCK(p);
1476 
1477 		/*
1478 		 * Turn ourself into a bound upcall.
1479 		 * We will rely on kse_reassign()
1480 		 * to make us run at a later time.
1481 		 * We should look just like a scheduled upcall
1482 		 * from msleep() or cv_wait().
1483 		 */
1484 		td->td_flags &= ~TDF_UNBOUND;
1485 		td->td_flags |= TDF_UPCALLING;
1486 		/* Only get here if we have become an upcall */
1487 
1488 	} else {
1489 		mtx_lock_spin(&sched_lock);
1490 	}
1491 	/*
1492 	 * We ARE going back to userland with this KSE.
1493 	 * Check for threads that need to borrow it.
1494 	 * Optimisation: don't call mi_switch if no-one wants the KSE.
1495 	 * Any other thread that comes ready after this missed the boat.
1496 	 */
1497 	ke = td->td_kse;
1498 	if ((td2 = kg->kg_last_assigned))
1499 		td2 = TAILQ_NEXT(td2, td_runq);
1500 	else
1501 		td2 = TAILQ_FIRST(&kg->kg_runq);
1502 	if (td2)  {
1503 		/*
1504 		 * force a switch to more urgent 'in kernel'
1505 		 * work. Control will return to this thread
1506 		 * when there is no more work to do.
1507 		 * kse_reassign() will do that for us.
1508 		 */
1509 		TD_SET_LOAN(td);
1510 		ke->ke_bound = td;
1511 		ke->ke_thread = NULL;
1512 		mi_switch(); /* kse_reassign() will (re)find td2 */
1513 	}
1514 	mtx_unlock_spin(&sched_lock);
1515 
1516 	/*
1517 	 * Optimisation:
1518 	 * Ensure that we have a spare thread available,
1519 	 * for when we re-enter the kernel.
1520 	 */
1521 	if (td->td_standin == NULL) {
1522 		if (ke->ke_tdspare) {
1523 			td->td_standin = ke->ke_tdspare;
1524 			ke->ke_tdspare = NULL;
1525 		} else {
1526 			td->td_standin = thread_alloc();
1527 		}
1528 	}
1529 
1530 	thread_update_uticks();
1531 	/*
1532 	 * To get here, we know there is no other need for our
1533 	 * KSE so we can proceed. If not upcalling, go back to
1534 	 * userspace. If we are, get the upcall set up.
1535 	 */
1536 	if ((td->td_flags & TDF_UPCALLING) == 0)
1537 		return (0);
1538 
1539 	/*
1540 	 * We must be an upcall to get this far.
1541 	 * There is no more work to do and we are going to ride
1542 	 * this thread/KSE up to userland as an upcall.
1543 	 * Do the last parts of the setup needed for the upcall.
1544 	 */
1545 	CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)",
1546 	    td, td->td_proc->p_pid, td->td_proc->p_comm);
1547 
1548 	/*
1549 	 * Set user context to the UTS.
1550 	 */
1551 	cpu_set_upcall_kse(td, ke);
1552 
1553 	/*
1554 	 * Put any completed mailboxes on this KSE's list.
1555 	 */
1556 	error = thread_link_mboxes(kg, ke);
1557 	if (error)
1558 		goto bad;
1559 
1560 	/*
1561 	 * Set state and mailbox.
1562 	 * From now on we are just a bound outgoing process.
1563 	 * **Problem** userret is often called several times.
1564 	 * It would be nice if this all happened only on the first time
1565 	 * through (the scan for extra work, etc.).
1566 	 */
1567 	mtx_lock_spin(&sched_lock);
1568 	td->td_flags &= ~TDF_UPCALLING;
1569 	mtx_unlock_spin(&sched_lock);
1570 #if 0
1571 	error = suword((caddr_t)ke->ke_mailbox +
1572 	    offsetof(struct kse_mailbox, km_curthread), 0);
1573 #else	/* if user pointer arithmetic is ok in the kernel */
1574 	error = suword((caddr_t)&ke->ke_mailbox->km_curthread, 0);
1575 #endif
1576 	ke->ke_uuticks = ke->ke_usticks = 0;
1577 	if (!error) {
1578 		nanotime(&ts);
1579 		if (copyout(&ts, (caddr_t)&ke->ke_mailbox->km_timeofday,
1580 		    sizeof(ts))) {
1581 			goto bad;
1582 		}
1583 	}
1584 	return (0);
1585 
1586 bad:
1587 	/*
1588 	 * Things are going to be so screwed we should just kill the process.
1589  	 * How do we do that?
1590 	 */
1591 	PROC_LOCK(td->td_proc);
1592 	psignal(td->td_proc, SIGSEGV);
1593 	PROC_UNLOCK(td->td_proc);
1594 	return (error);	/* go sync */
1595 }
1596 
1597 /*
1598  * Enforce single-threading.
1599  *
1600  * Returns 1 if the caller must abort (another thread is waiting to
1601  * exit the process or similar). Process is locked!
1602  * Returns 0 when you are successfully the only thread running.
1603  * A process has successfully single threaded in the suspend mode when
1604  * There are no threads in user mode. Threads in the kernel must be
1605  * allowed to continue until they get to the user boundary. They may even
1606  * copy out their return values and data before suspending. They may however be
1607  * accellerated in reaching the user boundary as we will wake up
1608  * any sleeping threads that are interruptable. (PCATCH).
1609  */
1610 int
1611 thread_single(int force_exit)
1612 {
1613 	struct thread *td;
1614 	struct thread *td2;
1615 	struct proc *p;
1616 
1617 	td = curthread;
1618 	p = td->td_proc;
1619 	PROC_LOCK_ASSERT(p, MA_OWNED);
1620 	KASSERT((td != NULL), ("curthread is NULL"));
1621 
1622 	if ((p->p_flag & P_KSES) == 0)
1623 		return (0);
1624 
1625 	/* Is someone already single threading? */
1626 	if (p->p_singlethread)
1627 		return (1);
1628 
1629 	if (force_exit == SINGLE_EXIT)
1630 		p->p_flag |= P_SINGLE_EXIT;
1631 	else
1632 		p->p_flag &= ~P_SINGLE_EXIT;
1633 	p->p_flag |= P_STOPPED_SINGLE;
1634 	p->p_singlethread = td;
1635 	/* XXXKSE Which lock protects the below values? */
1636 	while ((p->p_numthreads - p->p_suspcount) != 1) {
1637 		mtx_lock_spin(&sched_lock);
1638 		FOREACH_THREAD_IN_PROC(p, td2) {
1639 			if (td2 == td)
1640 				continue;
1641 			if (TD_IS_INHIBITED(td2)) {
1642 				if (force_exit == SINGLE_EXIT) {
1643 					if (TD_IS_SUSPENDED(td2)) {
1644 						thread_unsuspend_one(td2);
1645 					}
1646 					if (TD_ON_SLEEPQ(td2) &&
1647 					    (td2->td_flags & TDF_SINTR)) {
1648 						if (td2->td_flags & TDF_CVWAITQ)
1649 							cv_abort(td2);
1650 						else
1651 							abortsleep(td2);
1652 					}
1653 				} else {
1654 					if (TD_IS_SUSPENDED(td2))
1655 						continue;
1656 					/* maybe other inhibited states too? */
1657 					if (TD_IS_SLEEPING(td2))
1658 						thread_suspend_one(td2);
1659 				}
1660 			}
1661 		}
1662 		/*
1663 		 * Maybe we suspended some threads.. was it enough?
1664 		 */
1665 		if ((p->p_numthreads - p->p_suspcount) == 1) {
1666 			mtx_unlock_spin(&sched_lock);
1667 			break;
1668 		}
1669 
1670 		/*
1671 		 * Wake us up when everyone else has suspended.
1672 		 * In the meantime we suspend as well.
1673 		 */
1674 		thread_suspend_one(td);
1675 		mtx_unlock(&Giant);
1676 		PROC_UNLOCK(p);
1677 		mi_switch();
1678 		mtx_unlock_spin(&sched_lock);
1679 		mtx_lock(&Giant);
1680 		PROC_LOCK(p);
1681 	}
1682 	if (force_exit == SINGLE_EXIT)
1683 		kse_purge(p, td);
1684 	return (0);
1685 }
1686 
1687 /*
1688  * Called in from locations that can safely check to see
1689  * whether we have to suspend or at least throttle for a
1690  * single-thread event (e.g. fork).
1691  *
1692  * Such locations include userret().
1693  * If the "return_instead" argument is non zero, the thread must be able to
1694  * accept 0 (caller may continue), or 1 (caller must abort) as a result.
1695  *
1696  * The 'return_instead' argument tells the function if it may do a
1697  * thread_exit() or suspend, or whether the caller must abort and back
1698  * out instead.
1699  *
1700  * If the thread that set the single_threading request has set the
1701  * P_SINGLE_EXIT bit in the process flags then this call will never return
1702  * if 'return_instead' is false, but will exit.
1703  *
1704  * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
1705  *---------------+--------------------+---------------------
1706  *       0       | returns 0          |   returns 0 or 1
1707  *               | when ST ends       |   immediately
1708  *---------------+--------------------+---------------------
1709  *       1       | thread exits       |   returns 1
1710  *               |                    |  immediately
1711  * 0 = thread_exit() or suspension ok,
1712  * other = return error instead of stopping the thread.
1713  *
1714  * While a full suspension is under effect, even a single threading
1715  * thread would be suspended if it made this call (but it shouldn't).
1716  * This call should only be made from places where
1717  * thread_exit() would be safe as that may be the outcome unless
1718  * return_instead is set.
1719  */
1720 int
1721 thread_suspend_check(int return_instead)
1722 {
1723 	struct thread *td;
1724 	struct proc *p;
1725 	struct kse *ke;
1726 	struct ksegrp *kg;
1727 
1728 	td = curthread;
1729 	p = td->td_proc;
1730 	kg = td->td_ksegrp;
1731 	PROC_LOCK_ASSERT(p, MA_OWNED);
1732 	while (P_SHOULDSTOP(p)) {
1733 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1734 			KASSERT(p->p_singlethread != NULL,
1735 			    ("singlethread not set"));
1736 			/*
1737 			 * The only suspension in action is a
1738 			 * single-threading. Single threader need not stop.
1739 			 * XXX Should be safe to access unlocked
1740 			 * as it can only be set to be true by us.
1741 			 */
1742 			if (p->p_singlethread == td)
1743 				return (0);	/* Exempt from stopping. */
1744 		}
1745 		if (return_instead)
1746 			return (1);
1747 
1748 		/*
1749 		 * If the process is waiting for us to exit,
1750 		 * this thread should just suicide.
1751 		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
1752 		 */
1753 		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
1754 			mtx_lock_spin(&sched_lock);
1755 			while (mtx_owned(&Giant))
1756 				mtx_unlock(&Giant);
1757 			/*
1758 			 * Free extra kses and ksegrps. We needn't worry that the
1759 			 * current thread is in the same ksegrp as p_singlethread
1760 			 * and that the last kse in the group could be killed;
1761 			 * kg_numthreads protects against that, since in that case
1762 			 * kg_numthreads must be > 1.
1763 			 */
1764 			ke = td->td_kse;
1765 			if (ke->ke_bound == NULL &&
1766 			    ((kg->kg_kses != 1) || (kg->kg_numthreads == 1)))
1767 				ke->ke_flags |= KEF_EXIT;
1768 			thread_exit();
1769 		}
1770 
1771 		/*
1772 		 * When a thread suspends, it just
1773 		 * moves to the process's suspend queue
1774 		 * and stays there.
1775 		 *
1776 		 * XXXKSE if TDF_BOUND is true
1777 		 * it will not release its KSE, which might
1778 		 * lead to deadlock if there are not enough KSEs
1779 		 * to complete all waiting threads.
1780 		 * Maybe be able to 'lend' it out again.
1781 		 * (lent kse's can not go back to userland?)
1782 		 * and can only be lent in STOPPED state.
1783 		 */
1784 		mtx_lock_spin(&sched_lock);
1785 		if ((p->p_flag & P_STOPPED_SIG) &&
1786 		    (p->p_suspcount+1 == p->p_numthreads)) {
1787 			mtx_unlock_spin(&sched_lock);
1788 			PROC_LOCK(p->p_pptr);
1789 			if ((p->p_pptr->p_procsig->ps_flag &
1790 				PS_NOCLDSTOP) == 0) {
1791 				psignal(p->p_pptr, SIGCHLD);
1792 			}
1793 			PROC_UNLOCK(p->p_pptr);
1794 			mtx_lock_spin(&sched_lock);
1795 		}
1796 		mtx_assert(&Giant, MA_NOTOWNED);
1797 		thread_suspend_one(td);
1798 		PROC_UNLOCK(p);
1799 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1800 			if (p->p_numthreads == p->p_suspcount) {
1801 				thread_unsuspend_one(p->p_singlethread);
1802 			}
1803 		}
1804 		p->p_stats->p_ru.ru_nivcsw++;
1805 		mi_switch();
1806 		mtx_unlock_spin(&sched_lock);
1807 		PROC_LOCK(p);
1808 	}
1809 	return (0);
1810 }
1811 
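/*
 * Suspend one thread: mark it suspended and put it on the process's
 * suspend queue.  Called with sched_lock held.
 */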
1812 void
1813 thread_suspend_one(struct thread *td)
1814 {
1815 	struct proc *p = td->td_proc;
1816 
1817 	mtx_assert(&sched_lock, MA_OWNED);
1818 	p->p_suspcount++;
1819 	TD_SET_SUSPENDED(td);
1820 	TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq);
1821 	/*
1822 	 * Hack: If we are suspending but are on the sleep queue
1823 	 * then we are in msleep or the cv equivalent. We
1824 	 * want to look like we have two Inhibitors.
1825 	 * May already be set.. doesn't matter.
1826 	 */
1827 	if (TD_ON_SLEEPQ(td))
1828 		TD_SET_SLEEPING(td);
1829 }
1830 
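/*
 * Resume one suspended thread and make it runnable again.
 * Called with sched_lock held.
 */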
1831 void
1832 thread_unsuspend_one(struct thread *td)
1833 {
1834 	struct proc *p = td->td_proc;
1835 
1836 	mtx_assert(&sched_lock, MA_OWNED);
1837 	TAILQ_REMOVE(&p->p_suspended, td, td_runq);
1838 	TD_CLR_SUSPENDED(td);
1839 	p->p_suspcount--;
1840 	setrunnable(td);
1841 }
1842 
1843 /*
1844  * Allow all threads blocked by single threading to continue running.
1845  */
1846 void
1847 thread_unsuspend(struct proc *p)
1848 {
1849 	struct thread *td;
1850 
1851 	mtx_assert(&sched_lock, MA_OWNED);
1852 	PROC_LOCK_ASSERT(p, MA_OWNED);
1853 	if (!P_SHOULDSTOP(p)) {
1854 		while (( td = TAILQ_FIRST(&p->p_suspended))) {
1855 			thread_unsuspend_one(td);
1856 		}
1857 	} else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
1858 	    (p->p_numthreads == p->p_suspcount)) {
1859 		/*
1860 		 * Stopping everything also did the job for the single
1861 		 * threading request. Now we've downgraded to single-threaded,
1862 		 * let it continue.
1863 		 */
1864 		thread_unsuspend_one(p->p_singlethread);
1865 	}
1866 }
1867 
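/*
 * End a single-threading episode: clear the single-threading state and,
 * unless the whole process is stopped, let the suspended threads run again.
 */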
1868 void
1869 thread_single_end(void)
1870 {
1871 	struct thread *td;
1872 	struct proc *p;
1873 
1874 	td = curthread;
1875 	p = td->td_proc;
1876 	PROC_LOCK_ASSERT(p, MA_OWNED);
1877 	p->p_flag &= ~P_STOPPED_SINGLE;
1878 	p->p_singlethread = NULL;
1879 	/*
1880 	 * If there are other threads they mey now run,
1881 	 * If there are other threads, they may now run,
1882 	 * on the process. The single threader must be allowed
1883 	 * to continue however as this is a bad place to stop.
1884 	 */
1885 	if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
1886 		mtx_lock_spin(&sched_lock);
1887 		while (( td = TAILQ_FIRST(&p->p_suspended))) {
1888 			thread_unsuspend_one(td);
1889 		}
1890 		mtx_unlock_spin(&sched_lock);
1891 	}
1892 }
1893 
1894 
1895