xref: /freebsd/sys/kern/kern_thread.c (revision 71fe318b852b8dfb3e799cb12ef184750f7f8eac)
1 /*
2  * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
3  *  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice(s), this list of conditions and the following disclaimer as
10  *    the first lines of this file unmodified other than the possible
11  *    addition of one or more copyright notices.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice(s), this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
26  * DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/kernel.h>
34 #include <sys/lock.h>
35 #include <sys/malloc.h>
36 #include <sys/mutex.h>
37 #include <sys/proc.h>
38 #include <sys/sysctl.h>
39 #include <sys/sysproto.h>
40 #include <sys/filedesc.h>
41 #include <sys/tty.h>
42 #include <sys/signalvar.h>
43 #include <sys/sx.h>
44 #include <sys/user.h>
45 #include <sys/jail.h>
46 #include <sys/kse.h>
47 #include <sys/ktr.h>
48 #include <sys/ucontext.h>
49 
50 #include <vm/vm.h>
51 #include <vm/vm_object.h>
52 #include <vm/pmap.h>
53 #include <vm/uma.h>
54 #include <vm/vm_map.h>
55 
56 #include <machine/frame.h>
57 
58 /*
59  * KSEGRP related storage.
60  */
61 static uma_zone_t ksegrp_zone;
62 static uma_zone_t kse_zone;
63 static uma_zone_t thread_zone;
64 
65 /* DEBUG ONLY */
66 SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
67 static int oiks_debug = 1;	/* 0 disable, 1 printf, 2 enter debugger */
68 SYSCTL_INT(_kern_threads, OID_AUTO, oiks, CTLFLAG_RW,
69 	&oiks_debug, 0, "OIKS thread debug");
70 
71 static int max_threads_per_proc = 10;
72 SYSCTL_INT(_kern_threads, OID_AUTO, max_per_proc, CTLFLAG_RW,
73 	&max_threads_per_proc, 0, "Limit on threads per proc");
74 
75 #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
76 
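/*
 * Zombie lists: threads, KSEs and KSEGRPs discarded by an exiting thread
 * are parked here, under zombie_thread_lock, until thread_reap() frees
 * them.  A thread cannot free its own structures while it is still
 * running on its stack.
 */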
77 struct threadqueue zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
78 TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses);
79 TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps);
80 struct mtx zombie_thread_lock;
81 MTX_SYSINIT(zombie_thread_lock, &zombie_thread_lock,
82     "zombie_thread_lock", MTX_SPIN);
83 
84 
85 
86 void kse_purge(struct proc *p, struct thread *td);
87 /*
88  * Prepare a thread for use.
89  */
90 static void
91 thread_ctor(void *mem, int size, void *arg)
92 {
93 	struct thread	*td;
94 
95 	KASSERT((size == sizeof(struct thread)),
96 	    ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread)));
97 
98 	td = (struct thread *)mem;
99 	td->td_state = TDS_INACTIVE;
100 	td->td_flags |= TDF_UNBOUND;
101 }
102 
103 /*
104  * Reclaim a thread after use.
105  */
106 static void
107 thread_dtor(void *mem, int size, void *arg)
108 {
109 	struct thread	*td;
110 
111 	KASSERT((size == sizeof(struct thread)),
112 	    ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread)));
113 
114 	td = (struct thread *)mem;
115 
116 #ifdef INVARIANTS
117 	/* Verify that this thread is in a safe state to free. */
118 	switch (td->td_state) {
119 	case TDS_INHIBITED:
120 	case TDS_RUNNING:
121 	case TDS_CAN_RUN:
122 	case TDS_RUNQ:
123 		/*
124 		 * We must never unlink a thread that is in one of
125 		 * these states, because it is currently active.
126 		 */
127 		panic("bad state for thread unlinking");
128 		/* NOTREACHED */
129 	case TDS_INACTIVE:
130 		break;
131 	default:
132 		panic("bad thread state");
133 		/* NOTREACHED */
134 	}
135 #endif
136 }
137 
138 /*
139  * Initialize type-stable parts of a thread (when newly created).
140  */
141 static void
142 thread_init(void *mem, int size)
143 {
144 	struct thread	*td;
145 
146 	KASSERT((size == sizeof(struct thread)),
147 	    ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread)));
148 
149 	td = (struct thread *)mem;
150 	mtx_lock(&Giant);
151 	pmap_new_thread(td, 0);
152 	mtx_unlock(&Giant);
153 	cpu_thread_setup(td);
154 }
155 
156 /*
157  * Tear down type-stable parts of a thread (just before being discarded).
158  */
159 static void
160 thread_fini(void *mem, int size)
161 {
162 	struct thread	*td;
163 
164 	KASSERT((size == sizeof(struct thread)),
165 	    ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread)));
166 
167 	td = (struct thread *)mem;
168 	pmap_dispose_thread(td);
169 }
170 
171 /*
172  * The KSE is linked into its KSEGRP's list of KSEs.
173  */
174 void
175 kse_link(struct kse *ke, struct ksegrp *kg)
176 {
177 	struct proc *p = kg->kg_proc;
178 
179 	TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
180 	kg->kg_kses++;
181 	ke->ke_state = KES_UNQUEUED;
182 	ke->ke_proc	= p;
183 	ke->ke_ksegrp	= kg;
184 	ke->ke_thread	= NULL;
185 	ke->ke_oncpu = NOCPU;
186 }
187 
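/*
 * Unlink a KSE from its KSEGRP: pull it off the idle queue if it is there,
 * drop the group's KSE count (unlinking the whole KSEGRP if this was its
 * last KSE) and stash the KSE on the zombie list for later reaping.
 * Called with sched_lock held.
 */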
188 void
189 kse_unlink(struct kse *ke)
190 {
191 	struct ksegrp *kg;
192 
193 	mtx_assert(&sched_lock, MA_OWNED);
194 	kg = ke->ke_ksegrp;
195 	if (ke->ke_state == KES_IDLE) {
196 		kg->kg_idle_kses--;
197 		TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
198 	}
199 
200 	TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
201 	if (--kg->kg_kses == 0) {
202 			ksegrp_unlink(kg);
203 	}
204 	/*
205 	 * Aggregate stats from the KSE
206 	 */
207 	kse_stash(ke);
208 }
209 
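/*
 * Initialize a new KSEGRP's queues and counters and link it into its process.
 */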
210 void
211 ksegrp_link(struct ksegrp *kg, struct proc *p)
212 {
213 
214 	TAILQ_INIT(&kg->kg_threads);
215 	TAILQ_INIT(&kg->kg_runq);	/* links with td_runq */
216 	TAILQ_INIT(&kg->kg_slpq);	/* links with td_runq */
217 	TAILQ_INIT(&kg->kg_kseq);	/* all kses in ksegrp */
218 	TAILQ_INIT(&kg->kg_iq);		/* idle kses in ksegrp */
219 	TAILQ_INIT(&kg->kg_lq);		/* loan kses in ksegrp */
220 	kg->kg_proc	= p;
221 /* the following counters are in the -zero- section and may not need clearing */
222 	kg->kg_numthreads = 0;
223 	kg->kg_runnable = 0;
224 	kg->kg_kses = 0;
225 	kg->kg_idle_kses = 0;
226 	kg->kg_loan_kses = 0;
227 	kg->kg_runq_kses = 0; /* XXXKSE change name */
228 /* link it in now that it's consistent */
229 	p->p_numksegrps++;
230 	TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
231 }
232 
233 void
234 ksegrp_unlink(struct ksegrp *kg)
235 {
236 	struct proc *p;
237 
238 	mtx_assert(&sched_lock, MA_OWNED);
239 	p = kg->kg_proc;
240 	KASSERT(((kg->kg_numthreads == 0) && (kg->kg_kses == 0)),
241 	    ("kseg_unlink: residual threads or KSEs"));
242 	TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
243 	p->p_numksegrps--;
244 	/*
245 	 * Aggregate stats from the KSE
246 	 */
247 	ksegrp_stash(kg);
248 }
249 
250 /*
251  * For a newly created process,
252  * link up the structures and its initial threads etc.
253  */
254 void
255 proc_linkup(struct proc *p, struct ksegrp *kg,
256 			struct kse *ke, struct thread *td)
257 {
258 
259 	TAILQ_INIT(&p->p_ksegrps);	     /* all ksegrps in proc */
260 	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
261 	TAILQ_INIT(&p->p_suspended);	     /* Threads suspended */
262 	p->p_numksegrps = 0;
263 	p->p_numthreads = 0;
264 
265 	ksegrp_link(kg, p);
266 	kse_link(ke, kg);
267 	thread_link(td, kg);
268 }
269 
270 int
271 kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap)
272 {
273 
274 	return(ENOSYS);
275 }
276 
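/*
 * kse_exit() syscall: tear down the calling bound thread and its KSE.
 * Only valid for a bound thread in a KSE process; refuses (EDEADLK) to
 * remove the last KSE of a group that still has other threads.  If this
 * is the process's last thread and KSEGRP, simply drop back out of KSE
 * mode; otherwise mark the KSE for destruction and exit the thread.
 */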
277 int
278 kse_exit(struct thread *td, struct kse_exit_args *uap)
279 {
280 	struct proc *p;
281 	struct ksegrp *kg;
282 
283 	p = td->td_proc;
284 	/* KSE-enabled processes only, please. */
285 	if (!(p->p_flag & P_KSES))
286 		return EINVAL;
287 	/* must be a bound thread */
288 	if (td->td_flags & TDF_UNBOUND)
289 		return EINVAL;
290 	kg = td->td_ksegrp;
291 	/* serialize killing kse */
292 	PROC_LOCK(p);
293 	mtx_lock_spin(&sched_lock);
294 	if ((kg->kg_kses == 1) && (kg->kg_numthreads > 1)) {
295 		mtx_unlock_spin(&sched_lock);
296 		PROC_UNLOCK(p);
297 		return (EDEADLK);
298 	}
299 	if ((p->p_numthreads == 1) && (p->p_numksegrps == 1)) {
300 		p->p_flag &= ~P_KSES;
301 		mtx_unlock_spin(&sched_lock);
302 		PROC_UNLOCK(p);
303 	} else {
304 		while (mtx_owned(&Giant))
305 			mtx_unlock(&Giant);
306 		td->td_kse->ke_flags |= KEF_EXIT;
307 		thread_exit();
308 		/* NOTREACHED */
309 	}
310 	return 0;
311 }
312 
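/*
 * kse_release() syscall: in a KSE process the calling thread simply exits
 * here (thread_exit() does not return); non-KSE processes get EINVAL.
 */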
313 int
314 kse_release(struct thread *td, struct kse_release_args *uap)
315 {
316 	struct proc *p;
317 
318 	p = td->td_proc;
319 	/* KSE-enabled processes only, please. */
320 	if (p->p_flag & P_KSES) {
321 		PROC_LOCK(p);
322 		mtx_lock_spin(&sched_lock);
323 		thread_exit();
324 		/* NOTREACHED */
325 	}
326 	return (EINVAL);
327 }
328 
329 /* struct kse_wakeup_args {
330 	struct kse_mailbox *mbx;
331 }; */
332 int
333 kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
334 {
335 	struct proc *p;
336 	struct kse *ke, *ke2;
337 	struct ksegrp *kg;
338 
339 	p = td->td_proc;
340 	/* KSE-enabled processes only, please. */
341 	if (!(p->p_flag & P_KSES))
342 		return EINVAL;
343 	if (td->td_standin == NULL)
344 		td->td_standin = thread_alloc();
345 	ke = NULL;
346 	mtx_lock_spin(&sched_lock);
347 	if (uap->mbx) {
348 		FOREACH_KSEGRP_IN_PROC(p, kg) {
349 			FOREACH_KSE_IN_GROUP(kg, ke2) {
350 				if (ke2->ke_mailbox != uap->mbx)
351 					continue;
352 				if (ke2->ke_state == KES_IDLE) {
353 					ke = ke2;
354 					goto found;
355 				} else {
356 					mtx_unlock_spin(&sched_lock);
357 					td->td_retval[0] = 0;
358 					td->td_retval[1] = 0;
359 					return 0;
360 				}
361 			}
362 		}
363 	} else {
364 		kg = td->td_ksegrp;
365 		ke = TAILQ_FIRST(&kg->kg_iq);
366 	}
367 	if (ke == NULL) {
368 		mtx_unlock_spin(&sched_lock);
369 		return ESRCH;
370 	}
371 found:
372 	thread_schedule_upcall(td, ke);
373 	mtx_unlock_spin(&sched_lock);
374 	td->td_retval[0] = 0;
375 	td->td_retval[1] = 0;
376 	return 0;
377 }
378 
379 /*
380  * No new KSEGRP: on the first call, use the current KSE and don't schedule an upcall.
381  * In all other situations, allocate a new KSE and schedule an upcall on it.
382  */
383 /* struct kse_create_args {
384 	struct kse_mailbox *mbx;
385 	int newgroup;
386 }; */
387 int
388 kse_create(struct thread *td, struct kse_create_args *uap)
389 {
390 	struct kse *newke;
391 	struct kse *ke;
392 	struct ksegrp *newkg;
393 	struct ksegrp *kg;
394 	struct proc *p;
395 	struct kse_mailbox mbx;
396 	int err;
397 
398 	p = td->td_proc;
399 	if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
400 		return (err);
401 
402 	p->p_flag |= P_KSES; /* easier to just set it than to test and set */
403 	kg = td->td_ksegrp;
404 	if (uap->newgroup) {
405 		/*
406 		 * If we want a new KSEGRP it doesn't matter whether
407 		 * we have already fired up KSE mode before or not.
408 		 * We put the process in KSE mode and create a new KSEGRP
409 		 * and KSE. If our KSE has not got a mailbox yet then
410 		 * that doesn't matter, just leave it that way. It will
411 		 * ensure that this thread stays BOUND. It's possible
412 		 * that the call came from a threaded library and the main
413 		 * program knows nothing of threads.
414 		 */
415 		newkg = ksegrp_alloc();
416 		bzero(&newkg->kg_startzero, RANGEOF(struct ksegrp,
417 		      kg_startzero, kg_endzero));
418 		bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
419 		      RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy));
420 		newke = kse_alloc();
421 	} else {
422 		/*
423 		 * Otherwise, if we have already set this KSE
424 		 * to have a mailbox, we want to make another KSE here,
425 		 * but only if we are not already at the limit, which
426 		 * is 1 per CPU max.
427 		 *
428 		 * If the current KSE doesn't have a mailbox we just use it
429 		 * and give it one.
430 		 *
431 		 * Because we don't like to access
432 		 * the KSE outside of schedlock if we are UNBOUND,
433 		 * (because it can change if we are preempted by an interrupt)
434 		 * we can deduce it as having a mailbox if we are UNBOUND,
435 		 * and only need to actually look at it if we are BOUND,
436 		 * which is safe.
437 		 */
438 		if ((td->td_flags & TDF_UNBOUND) || td->td_kse->ke_mailbox) {
439 #if 0  /* while debugging */
440 #ifdef SMP
441 			if (kg->kg_kses > mp_ncpus)
442 #endif
443 				return (EPROCLIM);
444 #endif
445 			newke = kse_alloc();
446 		} else {
447 			newke = NULL;
448 		}
449 		newkg = NULL;
450 	}
451 	if (newke) {
452 		bzero(&newke->ke_startzero, RANGEOF(struct kse,
453 		      ke_startzero, ke_endzero));
454 #if 0
455 		bcopy(&ke->ke_startcopy, &newke->ke_startcopy,
456 		      RANGEOF(struct kse, ke_startcopy, ke_endcopy));
457 #endif
458 		/* For the first call this may not have been set */
459 		if (td->td_standin == NULL) {
460 			td->td_standin = thread_alloc();
461 		}
462 		mtx_lock_spin(&sched_lock);
463 		if (newkg)
464 			ksegrp_link(newkg, p);
465 		else
466 			newkg = kg;
467 		kse_link(newke, newkg);
468 		if (p->p_sflag & PS_NEEDSIGCHK)
469 			newke->ke_flags |= KEF_ASTPENDING;
470 		newke->ke_mailbox = uap->mbx;
471 		newke->ke_upcall = mbx.km_func;
472 		bcopy(&mbx.km_stack, &newke->ke_stack, sizeof(stack_t));
473 		thread_schedule_upcall(td, newke);
474 		mtx_unlock_spin(&sched_lock);
475 	} else {
476 		/*
477 		 * If we didn't allocate a new KSE then we are using
478 		 * the existing (BOUND) kse.
479 		 */
480 		ke = td->td_kse;
481 		ke->ke_mailbox = uap->mbx;
482 		ke->ke_upcall = mbx.km_func;
483 		bcopy(&mbx.km_stack, &ke->ke_stack, sizeof(stack_t));
484 	}
485 	/*
486 	 * Fill out the KSE-mode specific fields of the new kse.
487 	 */
488 
489 	td->td_retval[0] = 0;
490 	td->td_retval[1] = 0;
491 	return (0);
492 }
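
/*
 * Rough sketch of how a userland UTS might drive kse_create()/kse_wakeup()
 * (illustrative only; uts_entry, uts_stack and UTS_STACK_SIZE are
 * hypothetical names, not defined in this file):
 *
 *	struct kse_mailbox km;
 *
 *	bzero(&km, sizeof(km));
 *	km.km_func = uts_entry;			(upcall entry point)
 *	km.km_stack.ss_sp = uts_stack;		(stack the UTS runs on)
 *	km.km_stack.ss_size = UTS_STACK_SIZE;
 *	kse_create(&km, 0);			(enter KSE mode, same ksegrp)
 *	...
 *	kse_wakeup(&km);			(later: kick the KSE if it went idle)
 */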
493 
494 /*
495  * Fill a ucontext_t with a thread's context information.
496  *
497  * This is an analogue to getcontext(3).
498  */
499 void
500 thread_getcontext(struct thread *td, ucontext_t *uc)
501 {
502 
503 /*
504  * XXX this is declared in a MD include file, i386/include/ucontext.h but
505  * is used in MI code.
506  */
507 #ifdef __i386__
508 	get_mcontext(td, &uc->uc_mcontext);
509 #endif
510 	uc->uc_sigmask = td->td_proc->p_sigmask;
511 }
512 
513 /*
514  * Set a thread's context from a ucontext_t.
515  *
516  * This is an analogue to setcontext(3).
517  */
518 int
519 thread_setcontext(struct thread *td, ucontext_t *uc)
520 {
521 	int ret;
522 
523 /*
524  * XXX this is declared in a MD include file, i386/include/ucontext.h but
525  * is used in MI code.
526  */
527 #ifdef __i386__
528 	ret = set_mcontext(td, &uc->uc_mcontext);
529 #else
530 	ret = ENOSYS;
531 #endif
532 	if (ret == 0) {
533 		SIG_CANTMASK(uc->uc_sigmask);
534 		PROC_LOCK(td->td_proc);
535 		td->td_proc->p_sigmask = uc->uc_sigmask;
536 		PROC_UNLOCK(td->td_proc);
537 	}
538 	return (ret);
539 }
540 
541 /*
542  * Initialize global thread allocation resources.
543  */
544 void
545 threadinit(void)
546 {
547 
548 #ifndef __ia64__
549 	thread_zone = uma_zcreate("THREAD", sizeof (struct thread),
550 	    thread_ctor, thread_dtor, thread_init, thread_fini,
551 	    UMA_ALIGN_CACHE, 0);
552 #else
553 	/*
554 	 * XXX the ia64 kstack allocator is really lame and is at the mercy
555 	 * of contigmalloc().  This hackery is to pre-construct a whole
556 	 * pile of thread structures with associated kernel stacks early
557 	 * in the system startup while contigmalloc() still works. Once we
558 	 * have them, keep them.  Sigh.
559 	 */
560 	thread_zone = uma_zcreate("THREAD", sizeof (struct thread),
561 	    thread_ctor, thread_dtor, thread_init, thread_fini,
562 	    UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
563 	uma_prealloc(thread_zone, 512);		/* XXX arbitrary */
564 #endif
565 	ksegrp_zone = uma_zcreate("KSEGRP", sizeof (struct ksegrp),
566 	    NULL, NULL, NULL, NULL,
567 	    UMA_ALIGN_CACHE, 0);
568 	kse_zone = uma_zcreate("KSE", sizeof (struct kse),
569 	    NULL, NULL, NULL, NULL,
570 	    UMA_ALIGN_CACHE, 0);
571 }
572 
573 /*
574  * Stash an embarrassingly extra thread into the zombie thread queue.
575  */
576 void
577 thread_stash(struct thread *td)
578 {
579 	mtx_lock_spin(&zombie_thread_lock);
580 	TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq);
581 	mtx_unlock_spin(&zombie_thread_lock);
582 }
583 
584 /*
585  * Stash an embarrassingly extra kse into the zombie kse queue.
586  */
587 void
588 kse_stash(struct kse *ke)
589 {
590 	mtx_lock_spin(&zombie_thread_lock);
591 	TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq);
592 	mtx_unlock_spin(&zombie_thread_lock);
593 }
594 
595 /*
596  * Stash an embarrassingly extra ksegrp into the zombie ksegrp queue.
597  */
598 void
599 ksegrp_stash(struct ksegrp *kg)
600 {
601 	mtx_lock_spin(&zombie_thread_lock);
602 	TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp);
603 	mtx_unlock_spin(&zombie_thread_lock);
604 }
605 
606 /*
607  * Reap zombie threads.
608  */
609 void
610 thread_reap(void)
611 {
612 	struct thread *td_first, *td_next;
613 	struct kse *ke_first, *ke_next;
614 	struct ksegrp *kg_first, * kg_next;
615 
616 	/*
617 	 * Don't even bother to lock if there are none at this instant;
618 	 * we really don't care about the next instant..
619 	 */
620 	if ((!TAILQ_EMPTY(&zombie_threads))
621 	    || (!TAILQ_EMPTY(&zombie_kses))
622 	    || (!TAILQ_EMPTY(&zombie_ksegrps))) {
623 		mtx_lock_spin(&zombie_thread_lock);
624 		td_first = TAILQ_FIRST(&zombie_threads);
625 		ke_first = TAILQ_FIRST(&zombie_kses);
626 		kg_first = TAILQ_FIRST(&zombie_ksegrps);
627 		if (td_first)
628 			TAILQ_INIT(&zombie_threads);
629 		if (ke_first)
630 			TAILQ_INIT(&zombie_kses);
631 		if (kg_first)
632 			TAILQ_INIT(&zombie_ksegrps);
633 		mtx_unlock_spin(&zombie_thread_lock);
634 		while (td_first) {
635 			td_next = TAILQ_NEXT(td_first, td_runq);
636 			thread_free(td_first);
637 			td_first = td_next;
638 		}
639 		while (ke_first) {
640 			ke_next = TAILQ_NEXT(ke_first, ke_procq);
641 			kse_free(ke_first);
642 			ke_first = ke_next;
643 		}
644 		while (kg_first) {
645 			kg_next = TAILQ_NEXT(kg_first, kg_ksegrp);
646 			ksegrp_free(kg_first);
647 			kg_first = kg_next;
648 		}
649 	}
650 }
651 
652 /*
653  * Allocate a ksegrp.
654  */
655 struct ksegrp *
656 ksegrp_alloc(void)
657 {
658 	return (uma_zalloc(ksegrp_zone, M_WAITOK));
659 }
660 
661 /*
662  * Allocate a kse.
663  */
664 struct kse *
665 kse_alloc(void)
666 {
667 	return (uma_zalloc(kse_zone, M_WAITOK));
668 }
669 
670 /*
671  * Allocate a thread.
672  */
673 struct thread *
674 thread_alloc(void)
675 {
676 	thread_reap(); /* check if any zombies to get */
677 	return (uma_zalloc(thread_zone, M_WAITOK));
678 }
679 
680 /*
681  * Deallocate a ksegrp.
682  */
683 void
684 ksegrp_free(struct ksegrp *td)
685 {
686 	uma_zfree(ksegrp_zone, td);
687 }
688 
689 /*
690  * Deallocate a kse.
691  */
692 void
693 kse_free(struct kse *td)
694 {
695 	uma_zfree(kse_zone, td);
696 }
697 
698 /*
699  * Deallocate a thread.
700  */
701 void
702 thread_free(struct thread *td)
703 {
704 	uma_zfree(thread_zone, td);
705 }
706 
707 /*
708  * Store the thread context in the UTS's mailbox,
709  * then add the mailbox at the head of a list we are building in user space.
710  * The list is anchored in the ksegrp structure.
711  */
712 int
713 thread_export_context(struct thread *td)
714 {
715 	struct proc *p;
716 	struct ksegrp *kg;
717 	uintptr_t mbx;
718 	void *addr;
719 	int error;
720 	ucontext_t uc;
721 
722 	p = td->td_proc;
723 	kg = td->td_ksegrp;
724 
725 	/* Export the user/machine context. */
726 #if 0
727 	addr = (caddr_t)td->td_mailbox +
728 	    offsetof(struct kse_thr_mailbox, tm_context);
729 #else /* if user pointer arithmetic is valid in the kernel */
730 		addr = (void *)(&td->td_mailbox->tm_context);
731 #endif
732 	error = copyin(addr, &uc, sizeof(ucontext_t));
733 	if (error == 0) {
734 		thread_getcontext(td, &uc);
735 		error = copyout(&uc, addr, sizeof(ucontext_t));
736 
737 	}
738 	if (error) {
739 		PROC_LOCK(p);
740 		psignal(p, SIGSEGV);
741 		PROC_UNLOCK(p);
742 		return (error);
743 	}
744 	/* Get the address of the list pointer in the latest mailbox. */
745 #if 0
746 	addr = (caddr_t)td->td_mailbox
747 	    + offsetof(struct kse_thr_mailbox , tm_next);
748 #else /* if user pointer arithmetic is valid in the kernel */
749 	addr = (void *)(&td->td_mailbox->tm_next);
750 #endif
751 	/*
752 	 * Put the saved address of the previous first
753 	 * entry (kg_completed) into this one's tm_next.
754 	 */
755 	for (;;) {
756 		mbx = (uintptr_t)kg->kg_completed;
757 		if (suword(addr, mbx)) {
758 			PROC_LOCK(p);
759 			psignal(p, SIGSEGV);
760 			PROC_UNLOCK(p);
761 			return (EFAULT);
762 		}
763 		PROC_LOCK(p);
764 		if (mbx == (uintptr_t)kg->kg_completed) {
765 			kg->kg_completed = td->td_mailbox;
766 			PROC_UNLOCK(p);
767 			break;
768 		}
769 		PROC_UNLOCK(p);
770 	}
771 	return (0);
772 }
773 
774 /*
775  * Take the list of completed mailboxes for this KSEGRP and put them on this
776  * KSE's mailbox as it's the next one going up.
777  */
778 static int
779 thread_link_mboxes(struct ksegrp *kg, struct kse *ke)
780 {
781 	struct proc *p = kg->kg_proc;
782 	void *addr;
783 	uintptr_t mbx;
784 
785 #if 0
786 	addr = (caddr_t)ke->ke_mailbox
787 	    + offsetof(struct kse_mailbox, km_completed);
788 #else /* if user pointer arithmetic is valid in the kernel */
789 		addr = (void *)(&ke->ke_mailbox->km_completed);
790 #endif
791 	for (;;) {
792 		mbx = (uintptr_t)kg->kg_completed;
793 		if (suword(addr, mbx)) {
794 			PROC_LOCK(p);
795 			psignal(p, SIGSEGV);
796 			PROC_UNLOCK(p);
797 			return (EFAULT);
798 		}
799 		/* XXXKSE could use atomic CMPXCH here */
800 		PROC_LOCK(p);
801 		if (mbx == (uintptr_t)kg->kg_completed) {
802 			kg->kg_completed = NULL;
803 			PROC_UNLOCK(p);
804 			break;
805 		}
806 		PROC_UNLOCK(p);
807 	}
808 	return (0);
809 }
810 
811 /*
812  * Discard the current thread and exit from its context.
813  *
814  * Because we can't free a thread while we're operating under its context,
815  * push the current thread into our KSE's ke_tdspare slot, freeing the
816  * thread that might be there currently. Because we know that only this
817  * processor will run our KSE, we needn't worry about someone else grabbing
818  * our context before we do a cpu_throw.
819  */
820 void
821 thread_exit(void)
822 {
823 	struct thread *td;
824 	struct kse *ke;
825 	struct proc *p;
826 	struct ksegrp	*kg;
827 
828 	td = curthread;
829 	kg = td->td_ksegrp;
830 	p = td->td_proc;
831 	ke = td->td_kse;
832 
833 	mtx_assert(&sched_lock, MA_OWNED);
834 	KASSERT(p != NULL, ("thread exiting without a process"));
835 	KASSERT(ke != NULL, ("thread exiting without a kse"));
836 	KASSERT(kg != NULL, ("thread exiting without a kse group"));
837 	PROC_LOCK_ASSERT(p, MA_OWNED);
838 	CTR1(KTR_PROC, "thread_exit: thread %p", td);
839 	KASSERT(!mtx_owned(&Giant), ("dying thread owns giant"));
840 
841 	if (ke->ke_tdspare != NULL) {
842 		thread_stash(ke->ke_tdspare);
843 		ke->ke_tdspare = NULL;
844 	}
845 	if (td->td_standin != NULL) {
846 		thread_stash(td->td_standin);
847 		td->td_standin = NULL;
848 	}
849 
850 	cpu_thread_exit(td);	/* XXXSMP */
851 
852 	/*
853 	 * The last thread is left attached to the process
854 	 * so that the whole bundle gets recycled. Skip
855 	 * all this stuff.
856 	 */
857 	if (p->p_numthreads > 1) {
858 		/*
859 		 * Unlink this thread from its proc and the kseg.
860 		 * In keeping with the other structs we probably should
861 		 * have a thread_unlink() that does some of this but it
862 		 * would only be called from here (I think) so it would
863 		 * be a waste. (might be useful for proc_fini() as well.)
864  		 */
865 		TAILQ_REMOVE(&p->p_threads, td, td_plist);
866 		p->p_numthreads--;
867 		TAILQ_REMOVE(&kg->kg_threads, td, td_kglist);
868 		kg->kg_numthreads--;
869 		/*
870 		 * The test below is NOT true if we are the
871 	 * sole exiting thread. P_STOPPED_SINGLE is unset
872 		 * in exit1() after it is the only survivor.
873 		 */
874 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
875 			if (p->p_numthreads == p->p_suspcount) {
876 				thread_unsuspend_one(p->p_singlethread);
877 			}
878 		}
879 
880 		/* Reassign this thread's KSE. */
881 		ke->ke_thread = NULL;
882 		td->td_kse = NULL;
883 		ke->ke_state = KES_UNQUEUED;
884 		KASSERT((ke->ke_bound != td),
885 		    ("thread_exit: entered with ke_bound set"));
886 
887 		/*
888 		 * The reason for all this hoopla is
889 		 * an attempt to stop our thread stack from being freed
890 		 * until AFTER we have stopped running on it.
891 		 * Since we are under schedlock, almost any method where
892 		 * it is eventually freed by someone else is probably ok.
893 		 * (Especially if they do it under schedlock). We could
894 		 * almost free it here if we could be certain that
895 		 * the uma code wouldn't pull it apart immediately,
896 		 * but unfortunately we cannot guarantee that.
897 		 *
898 		 * For threads that are exiting and NOT killing their
899 		 * KSEs we can just stash it in the KSE, however
900 		 * in the case where the KSE is also being deallocated,
901 		 * we need to store it somewhere else. It turns out that
902 		 * we will never free the last KSE, so there is always one
903 		 * other KSE available. We might as well just choose one
904 		 * and stash it there. Being under schedlock should make that
905 		 * safe.
906 		 *
907 		 * In borrower threads, we can stash it in the lender,
908 		 * where it won't be needed until this thread is long gone.
909 		 * Borrower threads can't kill their KSE anyhow, so even
910 		 * the KSE would be a safe place for them. It is not
911 		 * necessary to have a KSE (or KSEGRP) at all beyond this
912 		 * point, while we are under the protection of schedlock.
913 		 *
914 		 * Either give the KSE to another thread to use (or make
915 		 * it idle), or free it entirely, possibly along with its
916 		 * ksegrp if it's the last one.
917 		 */
918 		if (ke->ke_flags & KEF_EXIT) {
919 			kse_unlink(ke);
920 			/*
921 			 * Designate another KSE to hold our thread.
922 			 * Safe as long as we abide by whatever lock
923 			 * we control it with.. The other KSE will not
924 			 * be able to run it until we release the schedlock,
925 			 * but we need to be careful about it deciding to
926 			 * write to the stack before then. Luckily
927 			 * I believe that while another thread's
928 			 * standin thread can be used in this way, the
929 			 * spare thread for the KSE cannot be used without
930 			 * holding schedlock at least once.
931 			 */
932 			ke =  FIRST_KSE_IN_PROC(p);
933 		} else {
934 			kse_reassign(ke);
935 		}
936 		if (ke->ke_bound) {
937 			/*
938 			 * We are a borrower..
939 			 * stash our thread with the owner.
940 			 */
941 			if (ke->ke_bound->td_standin) {
942 				thread_stash(ke->ke_bound->td_standin);
943 			}
944 			ke->ke_bound->td_standin = td;
945 		} else {
946 			if (ke->ke_tdspare != NULL) {
947 				thread_stash(ke->ke_tdspare);
948 				ke->ke_tdspare = NULL;
949 			}
950 			ke->ke_tdspare = td;
951 		}
952 		PROC_UNLOCK(p);
953 		td->td_state	= TDS_INACTIVE;
954 		td->td_proc	= NULL;
955 		td->td_ksegrp	= NULL;
956 		td->td_last_kse	= NULL;
957 	} else {
958 		PROC_UNLOCK(p);
959 	}
960 
961 	cpu_throw();
962 	/* NOTREACHED */
963 }
964 
965 /*
966  * Link a thread to a process.
967  * Set up anything that needs to be initialized for it to
968  * be used by the process.
969  *
970  * Note that we do not link to the proc's ucred here.
971  * The thread is linked as if running but no KSE assigned.
972  */
973 void
974 thread_link(struct thread *td, struct ksegrp *kg)
975 {
976 	struct proc *p;
977 
978 	p = kg->kg_proc;
979 	td->td_state = TDS_INACTIVE;
980 	td->td_proc	= p;
981 	td->td_ksegrp	= kg;
982 	td->td_last_kse	= NULL;
983 
984 	LIST_INIT(&td->td_contested);
985 	callout_init(&td->td_slpcallout, 1);
986 	TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
987 	TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist);
988 	p->p_numthreads++;
989 	kg->kg_numthreads++;
990 	if (oiks_debug && p->p_numthreads > max_threads_per_proc) {
991 		printf("OIKS %d\n", p->p_numthreads);
992 		if (oiks_debug > 1)
993 			Debugger("OIKS");
994 	}
995 	td->td_kse	= NULL;
996 }
997 
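/*
 * kse_purge() is called once the process is down to one thread, when single
 * threading for exit: throw away every idle KSE and every KSEGRP except the
 * one the surviving thread belongs to, stashing them (and any spare threads
 * they hold) on the zombie lists for thread_reap().
 */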
998 void
999 kse_purge(struct proc *p, struct thread *td)
1000 {
1001 	struct kse *ke;
1002 	struct ksegrp *kg;
1003 
1004  	KASSERT(p->p_numthreads == 1, ("bad thread number"));
1005 	mtx_lock_spin(&sched_lock);
1006 	while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) {
1007 		while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) {
1008 			TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
1009 			kg->kg_idle_kses--;
1010 			TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
1011 			kg->kg_kses--;
1012 			if (ke->ke_tdspare)
1013 				thread_stash(ke->ke_tdspare);
1014    			kse_stash(ke);
1015 		}
1016 		TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
1017 		p->p_numksegrps--;
1018 		KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) ||
1019 		    ((kg->kg_kses == 1) && (kg == td->td_ksegrp)),
1020 			("wrong kg_kses"));
1021 		if (kg != td->td_ksegrp) {
1022 			ksegrp_stash(kg);
1023 		}
1024 	}
1025 	TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp);
1026 	p->p_numksegrps++;
1027 	mtx_unlock_spin(&sched_lock);
1028 }
1029 
1030 
1031 /*
1032  * Create a thread and schedule it for upcall on the KSE given.
1033  */
1034 struct thread *
1035 thread_schedule_upcall(struct thread *td, struct kse *ke)
1036 {
1037 	struct thread *td2;
1038 	struct ksegrp *kg;
1039 	int newkse;
1040 
1041 	mtx_assert(&sched_lock, MA_OWNED);
1042 	newkse = (ke != td->td_kse);
1043 
1044 	/*
1045 	 * If the kse is already owned by another thread then we can't
1046 	 * schedule an upcall because the other thread must be BOUND
1047 	 * which means it is not in a position to take an upcall.
1048 	 * We must be borrowing the KSE to allow us to complete some in-kernel
1049 	 * work. When we complete, the BOUND thread will have the chance to
1050 	 * complete. This thread will sleep as planned. Hopefully there will
1051 	 * eventually be an unbound thread that can be converted to an
1052 	 * upcall to report the completion of this thread.
1053 	 */
1054 	if (ke->ke_bound && ((ke->ke_bound->td_flags & TDF_UNBOUND) == 0)) {
1055 		return (NULL);
1056 	}
1057 	KASSERT((ke->ke_bound == NULL), ("kse already bound"));
1058 
1059 	if (ke->ke_state == KES_IDLE) {
1060 		kg = ke->ke_ksegrp;
1061 		TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
1062 		kg->kg_idle_kses--;
1063 		ke->ke_state = KES_UNQUEUED;
1064 	}
1065 	if ((td2 = td->td_standin) != NULL) {
1066 		td->td_standin = NULL;
1067 	} else {
1068 		if (newkse)
1069 			panic("no reserve thread when called with a new kse");
1070 		/*
1071 		 * If called from (e.g.) sleep and we do not have
1072 		 * a reserve thread, then we've used it, so do not
1073 		 * create an upcall.
1074 		 */
1075 		return(NULL);
1076 	}
1077 	CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)",
1078 	     td2, td->td_proc->p_pid, td->td_proc->p_comm);
1079 	bzero(&td2->td_startzero,
1080 	    (unsigned)RANGEOF(struct thread, td_startzero, td_endzero));
1081 	bcopy(&td->td_startcopy, &td2->td_startcopy,
1082 	    (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy));
1083 	thread_link(td2, ke->ke_ksegrp);
1084 	cpu_set_upcall(td2, td->td_pcb);
1085 
1086 	/*
1087 	 * XXXKSE do we really need this? (default values for the
1088 	 * frame).
1089 	 */
1090 	bcopy(td->td_frame, td2->td_frame, sizeof(struct trapframe));
1091 
1092 	/*
1093 	 * Bind the new thread to the KSE,
1094 	 * and if it's our KSE, lend it back to ourself
1095 	 * so we can continue running.
1096 	 */
1097 	td2->td_ucred = crhold(td->td_ucred);
1098 	td2->td_flags = TDF_UPCALLING; /* note: BOUND */
1099 	td2->td_kse = ke;
1100 	td2->td_state = TDS_CAN_RUN;
1101 	td2->td_inhibitors = 0;
1102 	/*
1103 	 * If called from msleep(), we are working on the current
1104 	 * KSE so fake that we borrowed it. If called from
1105 	 * kse_create(), don't, as we have a new kse too.
1106 	 */
1107 	if (!newkse) {
1108 		/*
1109 		 * This thread will be scheduled when the current thread
1110 		 * blocks, exits or tries to enter userspace (whichever
1111 		 * happens first). When that happens the KSE will "revert"
1112 		 * to this thread in a BOUND manner. Since we are called
1113 		 * from msleep() this is going to be "very soon" in nearly
1114 		 * all cases.
1115 		 */
1116 		ke->ke_bound = td2;
1117 		TD_SET_LOAN(td2);
1118 	} else {
1119 		ke->ke_bound = NULL;
1120 		ke->ke_thread = td2;
1121 		ke->ke_state = KES_THREAD;
1122 		setrunqueue(td2);
1123 	}
1124 	return (td2);	/* bogus.. should be a void function */
1125 }
1126 
1127 /*
1128  * Schedule an upcall to notify a KSE process of received signals.
1129  *
1130  * XXX - Modifying a sigset_t like this is totally bogus.
1131  */
1132 struct thread *
1133 signal_upcall(struct proc *p, int sig)
1134 {
1135 	struct thread *td, *td2;
1136 	struct kse *ke;
1137 	sigset_t ss;
1138 	int error;
1139 
1140 	PROC_LOCK_ASSERT(p, MA_OWNED);
1141 return (NULL);
1142 
1143 	td = FIRST_THREAD_IN_PROC(p);
1144 	ke = td->td_kse;
1145 	PROC_UNLOCK(p);
1146 	error = copyin(&ke->ke_mailbox->km_sigscaught, &ss, sizeof(sigset_t));
1147 	PROC_LOCK(p);
1148 	if (error)
1149 		return (NULL);
1150 	SIGADDSET(ss, sig);
1151 	PROC_UNLOCK(p);
1152 	error = copyout(&ss, &ke->ke_mailbox->km_sigscaught, sizeof(sigset_t));
1153 	PROC_LOCK(p);
1154 	if (error)
1155 		return (NULL);
1156 	if (td->td_standin == NULL)
1157 		td->td_standin = thread_alloc();
1158 	mtx_lock_spin(&sched_lock);
1159 	td2 = thread_schedule_upcall(td, ke); /* Bogus JRE */
1160 	mtx_unlock_spin(&sched_lock);
1161 	return (td2);
1162 }
1163 
1164 /*
1165  * Setup done on the thread when it enters the kernel.
1166  * XXXKSE Presently only for syscalls but eventually all kernel entries.
1167  */
1168 void
1169 thread_user_enter(struct proc *p, struct thread *td)
1170 {
1171 	struct kse *ke;
1172 
1173 	/*
1174 	 * First check that we shouldn't just abort.
1175 	 * But check if we are the single thread first!
1176 	 * XXX p_singlethread not locked, but should be safe.
1177 	 */
1178 	if ((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) {
1179 		PROC_LOCK(p);
1180 		mtx_lock_spin(&sched_lock);
1181 		thread_exit();
1182 		/* NOTREACHED */
1183 	}
1184 
1185 	/*
1186 	 * If we are doing a syscall in a KSE environment,
1187 	 * note where our mailbox is. There is always the
1188 	 * possibility that we could do this lazily (in sleep()),
1189 	 * but for now do it every time.
1190 	 */
1191 	ke = td->td_kse;
1192 	if (ke->ke_mailbox != NULL) {
1193 #if 0
1194 		td->td_mailbox = (void *)fuword((caddr_t)ke->ke_mailbox
1195 		    + offsetof(struct kse_mailbox, km_curthread));
1196 #else /* if user pointer arithmetic is ok in the kernel */
1197 		td->td_mailbox =
1198 		    (void *)fuword( (void *)&ke->ke_mailbox->km_curthread);
1199 #endif
1200 		if ((td->td_mailbox == NULL) ||
1201 		    (td->td_mailbox == (void *)-1)) {
1202 			td->td_mailbox = NULL;	/* single thread it.. */
1203 			td->td_flags &= ~TDF_UNBOUND;
1204 		} else {
1205 			if (td->td_standin == NULL)
1206 				td->td_standin = thread_alloc();
1207 			td->td_flags |= TDF_UNBOUND;
1208 		}
1209 	}
1210 }
1211 
1212 /*
1213  * The extra work we go through if we are a threaded process when we
1214  * return to userland.
1215  *
1216  * If we are a KSE process and returning to user mode, check for
1217  * extra work to do before we return (e.g. for more syscalls
1218  * to complete first).  If we were in a critical section, we should
1219  * just return to let it finish. Same if we were in the UTS (in
1220  * which case the mailbox's context's busy indicator will be set).
1221  * The only traps we support will have set the mailbox.
1222  * We will clear it here.
1223  */
1224 int
1225 thread_userret(struct thread *td, struct trapframe *frame)
1226 {
1227 	int error;
1228 	int unbound;
1229 	struct kse *ke;
1230 	struct ksegrp *kg;
1231 	struct thread *td2;
1232 	struct proc *p;
1233 
1234 	error = 0;
1235 
1236 	unbound = td->td_flags & TDF_UNBOUND;
1237 
1238 	kg = td->td_ksegrp;
1239 	p = td->td_proc;
1240 
1241 	/*
1242 	 * Originally bound threads never upcall but they may
1243 	 * loan out their KSE at this point.
1244 	 * Upcalls imply bound.. They also may want to do some philanthropy.
1245 	 * Unbound threads on the other hand either yield to other work
1246 	 * or transform into an upcall.
1247 	 * (having saved their context to user space in both cases)
1248 	 */
1249 	if (unbound) {
1250 		/*
1251 		 * We are an unbound thread, looking to return to
1252 		 * user space.
1253 		 * There are several possibilities:
1254 		 * 1) we are using a borrowed KSE. save state and exit.
1255 		 *    kse_reassign() will recycle the kse as needed,
1256 		 * 2) we are not.. save state, and then convert ourself
1257 		 *    to be an upcall, bound to the KSE.
1258 		 *    if there are others that need the kse,
1259 		 *    give them a chance by doing an mi_switch().
1260 		 *    Because we are bound, control will eventually return
1261 		 *    to us here.
1262 		 * ***
1263 		 * Save the thread's context, and link it
1264 		 * into the KSEGRP's list of completed threads.
1265 		 */
1266 		error = thread_export_context(td);
1267 		td->td_mailbox = NULL;
1268 		if (error) {
1269 			/*
1270 			 * If we are not running on a borrowed KSE, then
1271 			 * failing to do the KSE operation just defaults
1272 			 * back to synchronous operation, so just return from
1273 			 * the syscall. If it IS borrowed, there is nothing
1274 			 * we can do. We just lose that context. We
1275 			 * probably should note this somewhere and send
1276 			 * the process a signal.
1277 			 */
1278 			PROC_LOCK(td->td_proc);
1279 			psignal(td->td_proc, SIGSEGV);
1280 			mtx_lock_spin(&sched_lock);
1281 			if (td->td_kse->ke_bound == NULL) {
1282 				td->td_flags &= ~TDF_UNBOUND;
1283 				PROC_UNLOCK(td->td_proc);
1284 				mtx_unlock_spin(&sched_lock);
1285 				return (error);	/* go sync */
1286 			}
1287 			thread_exit();
1288 		}
1289 
1290 		/*
1291 		 * If the KSE is owned and we are borrowing it,
1292 		 * don't make an upcall, just exit so that the owner
1293 		 * can get its KSE if it wants it.
1294 		 * Our context is already safely stored for later
1295 		 * use by the UTS.
1296 		 */
1297 		PROC_LOCK(p);
1298 		mtx_lock_spin(&sched_lock);
1299 		if (td->td_kse->ke_bound) {
1300 			thread_exit();
1301 		}
1302 		PROC_UNLOCK(p);
1303 
1304 		/*
1305 		 * Turn ourself into a bound upcall.
1306 		 * We will rely on kse_reassign()
1307 		 * to make us run at a later time.
1308 		 * We should look just like a scheduled upcall
1309 		 * from msleep() or cv_wait().
1310 		 */
1311 		td->td_flags &= ~TDF_UNBOUND;
1312 		td->td_flags |= TDF_UPCALLING;
1313 		/* Only get here if we have become an upcall */
1314 
1315 	} else {
1316 		mtx_lock_spin(&sched_lock);
1317 	}
1318 	/*
1319 	 * We ARE going back to userland with this KSE.
1320 	 * Check for threads that need to borrow it.
1321 	 * Optimisation: don't call mi_switch if no-one wants the KSE.
1322 	 * Any other thread that comes ready after this missed the boat.
1323 	 */
1324 	ke = td->td_kse;
1325 	if ((td2 = kg->kg_last_assigned))
1326 		td2 = TAILQ_NEXT(td2, td_runq);
1327 	else
1328 		td2 = TAILQ_FIRST(&kg->kg_runq);
1329 	if (td2)  {
1330 		/*
1331 		 * Force a switch to more urgent 'in kernel'
1332 		 * work. Control will return to this thread
1333 		 * when there is no more work to do.
1334 		 * kse_reassign() will do that for us.
1335 		 */
1336 		TD_SET_LOAN(td);
1337 		ke->ke_bound = td;
1338 		ke->ke_thread = NULL;
1339 		mi_switch(); /* kse_reassign() will (re)find td2 */
1340 	}
1341 	mtx_unlock_spin(&sched_lock);
1342 
1343 	/*
1344 	 * Optimisation:
1345 	 * Ensure that we have a spare thread available,
1346 	 * for when we re-enter the kernel.
1347 	 */
1348 	if (td->td_standin == NULL) {
1349 		if (ke->ke_tdspare) {
1350 			td->td_standin = ke->ke_tdspare;
1351 			ke->ke_tdspare = NULL;
1352 		} else {
1353 			td->td_standin = thread_alloc();
1354 		}
1355 	}
1356 
1357 	/*
1358 	 * To get here, we know there is no other need for our
1359 	 * KSE so we can proceed. If not upcalling, go back to
1360 	 * userspace. If we are, get the upcall set up.
1361 	 */
1362 	if ((td->td_flags & TDF_UPCALLING) == 0)
1363 		return (0);
1364 
1365 	/*
1366 	 * We must be an upcall to get this far.
1367 	 * There is no more work to do and we are going to ride
1368 	 * this thread/KSE up to userland as an upcall.
1369 	 * Do the last parts of the setup needed for the upcall.
1370 	 */
1371 	CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)",
1372 	    td, td->td_proc->p_pid, td->td_proc->p_comm);
1373 
1374 	/*
1375 	 * Set user context to the UTS.
1376 	 */
1377 	cpu_set_upcall_kse(td, ke);
1378 
1379 	/*
1380 	 * Put any completed mailboxes on this KSE's list.
1381 	 */
1382 	error = thread_link_mboxes(kg, ke);
1383 	if (error)
1384 		goto bad;
1385 
1386 	/*
1387 	 * Set state and mailbox.
1388 	 * From now on we are just a bound outgoing process.
1389 	 * **Problem** userret is often called several times.
1390 	 * It would be nice if this all happened only on the first time
1391 	 * through (the scan for extra work etc.).
1392 	 */
1393 	td->td_flags &= ~TDF_UPCALLING;
1394 #if 0
1395 	error = suword((caddr_t)ke->ke_mailbox +
1396 	    offsetof(struct kse_mailbox, km_curthread), 0);
1397 #else	/* if user pointer arithmetic is ok in the kernel */
1398 	error = suword((caddr_t)&ke->ke_mailbox->km_curthread, 0);
1399 #endif
1400 	if (!error)
1401 		return (0);
1402 
1403 bad:
1404 	/*
1405 	 * Things are going to be so screwed we should just kill the process.
1406  	 * How do we do that?
1407 	 */
1408 	PROC_LOCK(td->td_proc);
1409 	psignal(td->td_proc, SIGSEGV);
1410 	PROC_UNLOCK(td->td_proc);
1411 	return (error);	/* go sync */
1412 }
1413 
1414 /*
1415  * Enforce single-threading.
1416  *
1417  * Returns 1 if the caller must abort (another thread is waiting to
1418  * exit the process or similar). Process is locked!
1419  * Returns 0 when you are successfully the only thread running.
1420  * A process has successfully single threaded in the suspend mode when
1421  * there are no threads in user mode. Threads in the kernel must be
1422  * allowed to continue until they get to the user boundary. They may even
1423  * copy out their return values and data before suspending. They may however be
1424  * accelerated in reaching the user boundary as we will wake up
1425  * any sleeping threads that are interruptible (PCATCH).
1426  */
1427 int
1428 thread_single(int force_exit)
1429 {
1430 	struct thread *td;
1431 	struct thread *td2;
1432 	struct proc *p;
1433 
1434 	td = curthread;
1435 	p = td->td_proc;
1436 	PROC_LOCK_ASSERT(p, MA_OWNED);
1437 	KASSERT((td != NULL), ("curthread is NULL"));
1438 
1439 	if ((p->p_flag & P_KSES) == 0)
1440 		return (0);
1441 
1442 	/* Is someone already single threading? */
1443 	if (p->p_singlethread)
1444 		return (1);
1445 
1446 	if (force_exit == SINGLE_EXIT)
1447 		p->p_flag |= P_SINGLE_EXIT;
1448 	else
1449 		p->p_flag &= ~P_SINGLE_EXIT;
1450 	p->p_flag |= P_STOPPED_SINGLE;
1451 	p->p_singlethread = td;
1452 	/* XXXKSE Which lock protects the below values? */
1453 	while ((p->p_numthreads - p->p_suspcount) != 1) {
1454 		mtx_lock_spin(&sched_lock);
1455 		FOREACH_THREAD_IN_PROC(p, td2) {
1456 			if (td2 == td)
1457 				continue;
1458 			if (TD_IS_INHIBITED(td2)) {
1459 				if (force_exit == SINGLE_EXIT) {
1460 					if (TD_IS_SUSPENDED(td2)) {
1461 						thread_unsuspend_one(td2);
1462 					}
1463 					if (TD_ON_SLEEPQ(td2) &&
1464 					    (td2->td_flags & TDF_SINTR)) {
1465 						if (td2->td_flags & TDF_CVWAITQ)
1466 							cv_abort(td2);
1467 						else
1468 							abortsleep(td2);
1469 					}
1470 				} else {
1471 					if (TD_IS_SUSPENDED(td2))
1472 						continue;
1473 					/* maybe other inhibited states too? */
1474 					if (TD_IS_SLEEPING(td2))
1475 						thread_suspend_one(td2);
1476 				}
1477 			}
1478 		}
1479 		/*
1480 		 * Maybe we suspended some threads.. was it enough?
1481 		 */
1482 		if ((p->p_numthreads - p->p_suspcount) == 1) {
1483 			mtx_unlock_spin(&sched_lock);
1484 			break;
1485 		}
1486 
1487 		/*
1488 		 * Wake us up when everyone else has suspended.
1489 		 * In the meantime we suspend as well.
1490 		 */
1491 		thread_suspend_one(td);
1492 		mtx_unlock(&Giant);
1493 		PROC_UNLOCK(p);
1494 		mi_switch();
1495 		mtx_unlock_spin(&sched_lock);
1496 		mtx_lock(&Giant);
1497 		PROC_LOCK(p);
1498 	}
1499 	if (force_exit == SINGLE_EXIT)
1500 		kse_purge(p, td);
1501 	return (0);
1502 }
1503 
1504 /*
1505  * Called in from locations that can safely check to see
1506  * whether we have to suspend or at least throttle for a
1507  * single-thread event (e.g. fork).
1508  *
1509  * Such locations include userret().
1510  * If the "return_instead" argument is non-zero, the thread must be able to
1511  * accept 0 (caller may continue), or 1 (caller must abort) as a result.
1512  *
1513  * The 'return_instead' argument tells the function if it may do a
1514  * thread_exit() or suspend, or whether the caller must abort and back
1515  * out instead.
1516  *
1517  * If the thread that set the single_threading request has set the
1518  * P_SINGLE_EXIT bit in the process flags then this call will never return
1519  * if 'return_instead' is false, but will exit.
1520  *
1521  * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
1522  *---------------+--------------------+---------------------
1523  *       0       | returns 0          |   returns 0 or 1
1524  *               | when ST ends       |   immediately
1525  *---------------+--------------------+---------------------
1526  *       1       | thread exits       |   returns 1
1527  *               |                    |  immediately
1528  * 0 = thread_exit() or suspension ok,
1529  * other = return error instead of stopping the thread.
1530  *
1531  * While a full suspension is under effect, even a single threading
1532  * thread would be suspended if it made this call (but it shouldn't).
1533  * This call should only be made from places where
1534  * thread_exit() would be safe as that may be the outcome unless
1535  * return_instead is set.
1536  */
1537 int
1538 thread_suspend_check(int return_instead)
1539 {
1540 	struct thread *td;
1541 	struct proc *p;
1542 	struct kse *ke;
1543 	struct ksegrp *kg;
1544 
1545 	td = curthread;
1546 	p = td->td_proc;
1547 	kg = td->td_ksegrp;
1548 	PROC_LOCK_ASSERT(p, MA_OWNED);
1549 	while (P_SHOULDSTOP(p)) {
1550 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1551 			KASSERT(p->p_singlethread != NULL,
1552 			    ("singlethread not set"));
1553 			/*
1554 			 * The only suspension in action is a
1555 			 * single-threading. The single threader need not stop.
1556 			 * XXX Should be safe to access unlocked
1557 			 * as it can only be set to be true by us.
1558 			 */
1559 			if (p->p_singlethread == td)
1560 				return (0);	/* Exempt from stopping. */
1561 		}
1562 		if (return_instead)
1563 			return (1);
1564 
1565 		/*
1566 		 * If the process is waiting for us to exit,
1567 		 * this thread should just suicide.
1568 		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
1569 		 */
1570 		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
1571 			mtx_lock_spin(&sched_lock);
1572 			while (mtx_owned(&Giant))
1573 				mtx_unlock(&Giant);
1574 			/*
1575 			 * Free extra KSEs and KSEGRPs.  We needn't worry
1576 			 * about the current thread being in the same ksegrp
1577 			 * as p_singlethread with the last KSE in that group
1578 			 * getting killed: that case is guarded by
1579 			 * kg_numthreads, which must then be > 1.
1580 			 */
1581 			ke = td->td_kse;
1582 			if (ke->ke_bound == NULL &&
1583 			    ((kg->kg_kses != 1) || (kg->kg_numthreads == 1)))
1584 				ke->ke_flags |= KEF_EXIT;
1585 			thread_exit();
1586 		}
1587 
1588 		/*
1589 		 * When a thread suspends, it just
1590 		 * moves to the process's suspend queue
1591 		 * and stays there.
1592 		 *
1593 		 * XXXKSE if TDF_BOUND is true
1594 		 * it will not release its KSE which might
1595 		 * lead to deadlock if there are not enough KSEs
1596 		 * to complete all waiting threads.
1597 		 * Maybe we could 'lend' it out again.
1598 		 * (lent KSEs cannot go back to userland?)
1599 		 * and can only be lent in STOPPED state.
1600 		 */
1601 		mtx_lock_spin(&sched_lock);
1602 		if ((p->p_flag & P_STOPPED_SIG) &&
1603 		    (p->p_suspcount+1 == p->p_numthreads)) {
1604 			mtx_unlock_spin(&sched_lock);
1605 			PROC_LOCK(p->p_pptr);
1606 			if ((p->p_pptr->p_procsig->ps_flag &
1607 				PS_NOCLDSTOP) == 0) {
1608 				psignal(p->p_pptr, SIGCHLD);
1609 			}
1610 			PROC_UNLOCK(p->p_pptr);
1611 			mtx_lock_spin(&sched_lock);
1612 		}
1613 		mtx_assert(&Giant, MA_NOTOWNED);
1614 		thread_suspend_one(td);
1615 		PROC_UNLOCK(p);
1616 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1617 			if (p->p_numthreads == p->p_suspcount) {
1618 				thread_unsuspend_one(p->p_singlethread);
1619 			}
1620 		}
1621 		p->p_stats->p_ru.ru_nivcsw++;
1622 		mi_switch();
1623 		mtx_unlock_spin(&sched_lock);
1624 		PROC_LOCK(p);
1625 	}
1626 	return (0);
1627 }
1628 
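/*
 * Mark a thread as suspended, bump p_suspcount and put it on the
 * process's suspended queue.  Called with sched_lock held.
 */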
1629 void
1630 thread_suspend_one(struct thread *td)
1631 {
1632 	struct proc *p = td->td_proc;
1633 
1634 	mtx_assert(&sched_lock, MA_OWNED);
1635 	p->p_suspcount++;
1636 	TD_SET_SUSPENDED(td);
1637 	TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq);
1638 	/*
1639 	 * Hack: If we are suspending but are on the sleep queue
1640 	 * then we are in msleep or the cv equivalent. We
1641 	 * want to look like we have two Inhibitors.
1642 	 * May already be set.. doesn't matter.
1643 	 */
1644 	if (TD_ON_SLEEPQ(td))
1645 		TD_SET_SLEEPING(td);
1646 }
1647 
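/*
 * Take a thread off the suspended queue, clear its suspended state and
 * make it runnable again.  Called with sched_lock held.
 */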
1648 void
1649 thread_unsuspend_one(struct thread *td)
1650 {
1651 	struct proc *p = td->td_proc;
1652 
1653 	mtx_assert(&sched_lock, MA_OWNED);
1654 	TAILQ_REMOVE(&p->p_suspended, td, td_runq);
1655 	TD_CLR_SUSPENDED(td);
1656 	p->p_suspcount--;
1657 	setrunnable(td);
1658 }
1659 
1660 /*
1661  * Allow all threads blocked by single threading to continue running.
1662  */
1663 void
1664 thread_unsuspend(struct proc *p)
1665 {
1666 	struct thread *td;
1667 
1668 	mtx_assert(&sched_lock, MA_OWNED);
1669 	PROC_LOCK_ASSERT(p, MA_OWNED);
1670 	if (!P_SHOULDSTOP(p)) {
1671 		while (( td = TAILQ_FIRST(&p->p_suspended))) {
1672 			thread_unsuspend_one(td);
1673 		}
1674 	} else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
1675 	    (p->p_numthreads == p->p_suspcount)) {
1676 		/*
1677 		 * Stopping everything also did the job for the single
1678 		 * threading request. Now we've downgraded to single-threaded,
1679 		 * let it continue.
1680 		 */
1681 		thread_unsuspend_one(p->p_singlethread);
1682 	}
1683 }
1684 
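/*
 * Leave the single-threading state: clear P_STOPPED_SINGLE and
 * p_singlethread and let any suspended threads run again, unless a
 * blanket stop is still in effect on the process.
 */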
1685 void
1686 thread_single_end(void)
1687 {
1688 	struct thread *td;
1689 	struct proc *p;
1690 
1691 	td = curthread;
1692 	p = td->td_proc;
1693 	PROC_LOCK_ASSERT(p, MA_OWNED);
1694 	p->p_flag &= ~P_STOPPED_SINGLE;
1695 	p->p_singlethread = NULL;
1696 	/*
1697 	 * If there are other threads they may now run,
1698 	 * unless of course there is a blanket 'stop order'
1699 	 * on the process. The single threader must be allowed
1700 	 * to continue however as this is a bad place to stop.
1701 	 */
1702 	if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
1703 		mtx_lock_spin(&sched_lock);
1704 		while (( td = TAILQ_FIRST(&p->p_suspended))) {
1705 			thread_unsuspend_one(td);
1706 		}
1707 		mtx_unlock_spin(&sched_lock);
1708 	}
1709 }
1710 
1711 
1712