xref: /freebsd/sys/kern/kern_thread.c (revision 7e49f04c88390c4c07f90c733c0d35ad6ff00f1c)
1  /*-
2   * SPDX-License-Identifier: BSD-2-Clause
3   *
4   * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
5   *  All rights reserved.
6   *
7   * Redistribution and use in source and binary forms, with or without
8   * modification, are permitted provided that the following conditions
9   * are met:
10   * 1. Redistributions of source code must retain the above copyright
11   *    notice(s), this list of conditions and the following disclaimer as
12   *    the first lines of this file unmodified other than the possible
13   *    addition of one or more copyright notices.
14   * 2. Redistributions in binary form must reproduce the above copyright
15   *    notice(s), this list of conditions and the following disclaimer in the
16   *    documentation and/or other materials provided with the distribution.
17   *
18   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
19   * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21   * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
22   * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24   * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27   * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
28   * DAMAGE.
29   */
30  
31  #include "opt_witness.h"
32  #include "opt_hwpmc_hooks.h"
33  
34  #include <sys/systm.h>
35  #include <sys/asan.h>
36  #include <sys/kernel.h>
37  #include <sys/lock.h>
38  #include <sys/msan.h>
39  #include <sys/mutex.h>
40  #include <sys/proc.h>
41  #include <sys/bitstring.h>
42  #include <sys/epoch.h>
43  #include <sys/rangelock.h>
44  #include <sys/resourcevar.h>
45  #include <sys/sdt.h>
46  #include <sys/smp.h>
47  #include <sys/sched.h>
48  #include <sys/sleepqueue.h>
49  #include <sys/selinfo.h>
50  #include <sys/syscallsubr.h>
51  #include <sys/dtrace_bsd.h>
52  #include <sys/sysent.h>
53  #include <sys/turnstile.h>
54  #include <sys/taskqueue.h>
55  #include <sys/ktr.h>
56  #include <sys/rwlock.h>
57  #include <sys/umtxvar.h>
58  #include <sys/vmmeter.h>
59  #include <sys/cpuset.h>
60  #ifdef	HWPMC_HOOKS
61  #include <sys/pmckern.h>
62  #endif
63  #include <sys/priv.h>
64  
65  #include <security/audit/audit.h>
66  
67  #include <vm/pmap.h>
68  #include <vm/vm.h>
69  #include <vm/vm_extern.h>
70  #include <vm/uma.h>
71  #include <vm/vm_phys.h>
72  #include <sys/eventhandler.h>
73  
74  /*
75   * Asserts below verify the stability of struct thread and struct proc
76   * layout, as exposed by KBI to modules.  On head, the KBI is allowed
77   * to drift; changes to the structures must be accompanied by updates
78   * to the asserts.
79   *
80   * On the stable branches after KBI freeze, conditions must not be
81   * violated.  Typically new fields are moved to the end of the
82   * structures.
83   */
84  #ifdef __amd64__
85  _Static_assert(offsetof(struct thread, td_flags) == 0x108,
86      "struct thread KBI td_flags");
87  _Static_assert(offsetof(struct thread, td_pflags) == 0x114,
88      "struct thread KBI td_pflags");
89  _Static_assert(offsetof(struct thread, td_frame) == 0x4b8,
90      "struct thread KBI td_frame");
91  _Static_assert(offsetof(struct thread, td_emuldata) == 0x6c0,
92      "struct thread KBI td_emuldata");
93  _Static_assert(offsetof(struct proc, p_flag) == 0xb8,
94      "struct proc KBI p_flag");
95  _Static_assert(offsetof(struct proc, p_pid) == 0xc4,
96      "struct proc KBI p_pid");
97  _Static_assert(offsetof(struct proc, p_filemon) == 0x3c8,
98      "struct proc KBI p_filemon");
99  _Static_assert(offsetof(struct proc, p_comm) == 0x3e0,
100      "struct proc KBI p_comm");
101  _Static_assert(offsetof(struct proc, p_emuldata) == 0x4d0,
102      "struct proc KBI p_emuldata");
103  #endif
104  #ifdef __i386__
105  _Static_assert(offsetof(struct thread, td_flags) == 0x9c,
106      "struct thread KBI td_flags");
107  _Static_assert(offsetof(struct thread, td_pflags) == 0xa8,
108      "struct thread KBI td_pflags");
109  _Static_assert(offsetof(struct thread, td_frame) == 0x318,
110      "struct thread KBI td_frame");
111  _Static_assert(offsetof(struct thread, td_emuldata) == 0x35c,
112      "struct thread KBI td_emuldata");
113  _Static_assert(offsetof(struct proc, p_flag) == 0x6c,
114      "struct proc KBI p_flag");
115  _Static_assert(offsetof(struct proc, p_pid) == 0x78,
116      "struct proc KBI p_pid");
117  _Static_assert(offsetof(struct proc, p_filemon) == 0x270,
118      "struct proc KBI p_filemon");
119  _Static_assert(offsetof(struct proc, p_comm) == 0x284,
120      "struct proc KBI p_comm");
121  _Static_assert(offsetof(struct proc, p_emuldata) == 0x318,
122      "struct proc KBI p_emuldata");
123  #endif
124  
125  SDT_PROVIDER_DECLARE(proc);
126  SDT_PROBE_DEFINE(proc, , , lwp__exit);
127  
128  /*
129   * thread related storage.
130   */
131  static uma_zone_t thread_zone;
132  
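/*
 * Per-domain lists of zombie threads awaiting reaping; kept cache-line
 * aligned to avoid false sharing between domains.
 */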
133  struct thread_domain_data {
134  	struct thread	*tdd_zombies;
135  	int		tdd_reapticks;
136  } __aligned(CACHE_LINE_SIZE);
137  
138  static struct thread_domain_data thread_domain_data[MAXMEMDOM];
139  
140  static struct task	thread_reap_task;
141  static struct callout  	thread_reap_callout;
142  
143  static void thread_zombie(struct thread *);
144  static void thread_reap(void);
145  static void thread_reap_all(void);
146  static void thread_reap_task_cb(void *, int);
147  static void thread_reap_callout_cb(void *);
148  static void thread_unsuspend_one(struct thread *td, struct proc *p,
149      bool boundary);
150  static void thread_free_batched(struct thread *td);
151  
152  static __exclusive_cache_line struct mtx tid_lock;
153  static bitstr_t *tid_bitmap;
154  
155  static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash");
156  
157  static int maxthread;
158  SYSCTL_INT(_kern, OID_AUTO, maxthread, CTLFLAG_RDTUN,
159      &maxthread, 0, "Maximum number of threads");
160  
161  static __exclusive_cache_line int nthreads;
162  
163  static LIST_HEAD(tidhashhead, thread) *tidhashtbl;
164  static u_long	tidhash;
165  static u_long	tidhashlock;
166  static struct	rwlock *tidhashtbl_lock;
167  #define	TIDHASH(tid)		(&tidhashtbl[(tid) & tidhash])
168  #define	TIDHASHLOCK(tid)	(&tidhashtbl_lock[(tid) & tidhashlock])
169  
170  EVENTHANDLER_LIST_DEFINE(thread_ctor);
171  EVENTHANDLER_LIST_DEFINE(thread_dtor);
172  EVENTHANDLER_LIST_DEFINE(thread_init);
173  EVENTHANDLER_LIST_DEFINE(thread_fini);
174  
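/*
 * Try to account for a new thread.  The last 100 slots below maxthread
 * are reserved for callers with PRIV_MAXPROC privilege.
 */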
175  static bool
176  thread_count_inc_try(void)
177  {
178  	int nthreads_new;
179  
180  	nthreads_new = atomic_fetchadd_int(&nthreads, 1) + 1;
181  	if (nthreads_new >= maxthread - 100) {
182  		if (priv_check_cred(curthread->td_ucred, PRIV_MAXPROC) != 0 ||
183  		    nthreads_new >= maxthread) {
184  			atomic_subtract_int(&nthreads, 1);
185  			return (false);
186  		}
187  	}
188  	return (true);
189  }
190  
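/*
 * Account for a new thread, reaping zombie threads first to free up
 * slots if the limit was hit.  Failures are reported at most once per
 * second.
 */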
191  static bool
192  thread_count_inc(void)
193  {
194  	static struct timeval lastfail;
195  	static int curfail;
196  
197  	thread_reap();
198  	if (thread_count_inc_try()) {
199  		return (true);
200  	}
201  
202  	thread_reap_all();
203  	if (thread_count_inc_try()) {
204  		return (true);
205  	}
206  
207  	if (ppsratecheck(&lastfail, &curfail, 1)) {
208  		printf("maxthread limit exceeded by uid %u "
209  		    "(pid %d); consider increasing kern.maxthread\n",
210  		    curthread->td_ucred->cr_ruid, curproc->p_pid);
211  	}
212  	return (false);
213  }
214  
215  static void
216  thread_count_sub(int n)
217  {
218  
219  	atomic_subtract_int(&nthreads, n);
220  }
221  
222  static void
223  thread_count_dec(void)
224  {
225  
226  	thread_count_sub(1);
227  }
228  
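/*
 * Allocate a thread ID from the bitmap, scanning forward from the last
 * allocation point and wrapping around at most once.  Returned TIDs are
 * offset by NO_PID to keep them above the PID range.
 */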
229  static lwpid_t
230  tid_alloc(void)
231  {
232  	static lwpid_t trytid;
233  	lwpid_t tid;
234  
235  	mtx_lock(&tid_lock);
236  	/*
237  	 * It is an invariant that the bitmap is big enough to hold maxthread
238  	 * IDs. If we got to this point there has to be at least one free.
239  	 */
240  	if (trytid >= maxthread)
241  		trytid = 0;
242  	bit_ffc_at(tid_bitmap, trytid, maxthread, &tid);
243  	if (tid == -1) {
244  		KASSERT(trytid != 0, ("unexpectedly ran out of IDs"));
245  		trytid = 0;
246  		bit_ffc_at(tid_bitmap, trytid, maxthread, &tid);
247  		KASSERT(tid != -1, ("unexpectedly ran out of IDs"));
248  	}
249  	bit_set(tid_bitmap, tid);
250  	trytid = tid + 1;
251  	mtx_unlock(&tid_lock);
252  	return (tid + NO_PID);
253  }
254  
255  static void
256  tid_free_locked(lwpid_t rtid)
257  {
258  	lwpid_t tid;
259  
260  	mtx_assert(&tid_lock, MA_OWNED);
261  	KASSERT(rtid >= NO_PID,
262  	    ("%s: invalid tid %d\n", __func__, rtid));
263  	tid = rtid - NO_PID;
264  	KASSERT(bit_test(tid_bitmap, tid) != 0,
265  	    ("thread ID %d not allocated\n", rtid));
266  	bit_clear(tid_bitmap, tid);
267  }
268  
269  static void
270  tid_free(lwpid_t rtid)
271  {
272  
273  	mtx_lock(&tid_lock);
274  	tid_free_locked(rtid);
275  	mtx_unlock(&tid_lock);
276  }
277  
278  static void
279  tid_free_batch(lwpid_t *batch, int n)
280  {
281  	int i;
282  
283  	mtx_lock(&tid_lock);
284  	for (i = 0; i < n; i++) {
285  		tid_free_locked(batch[i]);
286  	}
287  	mtx_unlock(&tid_lock);
288  }
289  
290  /*
291   * Batching for thread reaping.
292   */
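/*
 * Each batch follows the same prep/add/process/final protocol: prep
 * initializes the batch, add records one thread, process flushes the
 * batch once it fills up, and final flushes any remainder.
 */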
293  struct tidbatch {
294  	lwpid_t tab[16];
295  	int n;
296  };
297  
298  static void
299  tidbatch_prep(struct tidbatch *tb)
300  {
301  
302  	tb->n = 0;
303  }
304  
305  static void
306  tidbatch_add(struct tidbatch *tb, struct thread *td)
307  {
308  
309  	KASSERT(tb->n < nitems(tb->tab),
310  	    ("%s: count too high %d", __func__, tb->n));
311  	tb->tab[tb->n] = td->td_tid;
312  	tb->n++;
313  }
314  
315  static void
316  tidbatch_process(struct tidbatch *tb)
317  {
318  
319  	KASSERT(tb->n <= nitems(tb->tab),
320  	    ("%s: count too high %d", __func__, tb->n));
321  	if (tb->n == nitems(tb->tab)) {
322  		tid_free_batch(tb->tab, tb->n);
323  		tb->n = 0;
324  	}
325  }
326  
327  static void
328  tidbatch_final(struct tidbatch *tb)
329  {
330  
331  	KASSERT(tb->n <= nitems(tb->tab),
332  	    ("%s: count too high %d", __func__, tb->n));
333  	if (tb->n != 0) {
334  		tid_free_batch(tb->tab, tb->n);
335  	}
336  }
337  
338  /*
339   * Batching of thread count decrements, for consistency with the other batches.
340   */
341  struct tdcountbatch {
342  	int n;
343  };
344  
345  static void
346  tdcountbatch_prep(struct tdcountbatch *tb)
347  {
348  
349  	tb->n = 0;
350  }
351  
352  static void
353  tdcountbatch_add(struct tdcountbatch *tb, struct thread *td __unused)
354  {
355  
356  	tb->n++;
357  }
358  
359  static void
360  tdcountbatch_process(struct tdcountbatch *tb)
361  {
362  
363  	if (tb->n == 32) {
364  		thread_count_sub(tb->n);
365  		tb->n = 0;
366  	}
367  }
368  
369  static void
370  tdcountbatch_final(struct tdcountbatch *tb)
371  {
372  
373  	if (tb->n != 0) {
374  		thread_count_sub(tb->n);
375  	}
376  }
377  
378  /*
379   * Prepare a thread for use.
380   */
381  static int
382  thread_ctor(void *mem, int size, void *arg, int flags)
383  {
384  	struct thread	*td;
385  
386  	td = (struct thread *)mem;
387  	TD_SET_STATE(td, TDS_INACTIVE);
388  	td->td_lastcpu = td->td_oncpu = NOCPU;
389  
390  	/*
391  	 * Note that td_critnest begins life as 1 because the thread is not
392  	 * running and is thereby implicitly waiting to be on the receiving
393  	 * end of a context switch.
394  	 */
395  	td->td_critnest = 1;
396  	td->td_lend_user_pri = PRI_MAX;
397  #ifdef AUDIT
398  	audit_thread_alloc(td);
399  #endif
400  #ifdef KDTRACE_HOOKS
401  	kdtrace_thread_ctor(td);
402  #endif
403  	umtx_thread_alloc(td);
404  	MPASS(td->td_sel == NULL);
405  	return (0);
406  }
407  
408  /*
409   * Reclaim a thread after use.
410   */
411  static void
412  thread_dtor(void *mem, int size, void *arg)
413  {
414  	struct thread *td;
415  
416  	td = (struct thread *)mem;
417  
418  #ifdef INVARIANTS
419  	/* Verify that this thread is in a safe state to free. */
420  	switch (TD_GET_STATE(td)) {
421  	case TDS_INHIBITED:
422  	case TDS_RUNNING:
423  	case TDS_CAN_RUN:
424  	case TDS_RUNQ:
425  		/*
426  		 * We must never unlink a thread that is in one of
427  		 * these states, because it is currently active.
428  		 */
429  		panic("bad state for thread unlinking");
430  		/* NOTREACHED */
431  	case TDS_INACTIVE:
432  		break;
433  	default:
434  		panic("bad thread state");
435  		/* NOTREACHED */
436  	}
437  #endif
438  #ifdef AUDIT
439  	audit_thread_free(td);
440  #endif
441  #ifdef KDTRACE_HOOKS
442  	kdtrace_thread_dtor(td);
443  #endif
444  	/* Free all OSD associated to this thread. */
445  	osd_thread_exit(td);
446  	ast_kclear(td);
447  	seltdfini(td);
448  }
449  
450  /*
451   * Initialize type-stable parts of a thread (when newly created).
452   */
453  static int
454  thread_init(void *mem, int size, int flags)
455  {
456  	struct thread *td;
457  
458  	td = (struct thread *)mem;
459  
460  	td->td_allocdomain = vm_phys_domain(vtophys(td));
461  	td->td_sleepqueue = sleepq_alloc();
462  	td->td_turnstile = turnstile_alloc();
463  	EVENTHANDLER_DIRECT_INVOKE(thread_init, td);
464  	umtx_thread_init(td);
465  	td->td_kstack = 0;
466  	td->td_sel = NULL;
467  	return (0);
468  }
469  
470  /*
471   * Tear down type-stable parts of a thread (just before being discarded).
472   */
473  static void
474  thread_fini(void *mem, int size)
475  {
476  	struct thread *td;
477  
478  	td = (struct thread *)mem;
479  	EVENTHANDLER_DIRECT_INVOKE(thread_fini, td);
480  	turnstile_free(td->td_turnstile);
481  	sleepq_free(td->td_sleepqueue);
482  	umtx_thread_fini(td);
483  	MPASS(td->td_sel == NULL);
484  }
485  
486  /*
487   * For a newly created process,
488   * link up all the structures and its initial thread, etc.
489   * Called from:
490   * {arch}/{arch}/machdep.c   {arch}_init(), init386() etc.
491   * proc_dtor() (should go away)
492   * proc_init()
493   */
494  void
495  proc_linkup0(struct proc *p, struct thread *td)
496  {
497  	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
498  	proc_linkup(p, td);
499  }
500  
501  void
502  proc_linkup(struct proc *p, struct thread *td)
503  {
504  
505  	sigqueue_init(&p->p_sigqueue, p);
506  	p->p_ksi = ksiginfo_alloc(M_WAITOK);
507  	if (p->p_ksi != NULL) {
508  		/* XXX p_ksi may be null if ksiginfo zone is not ready */
509  		p->p_ksi->ksi_flags = KSI_EXT | KSI_INS;
510  	}
511  	LIST_INIT(&p->p_mqnotifier);
512  	p->p_numthreads = 0;
513  	thread_link(td, p);
514  }
515  
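/*
 * AST handler for TDA_SUSPEND: check whether this thread must suspend
 * or exit due to a single-threading or stop request.
 */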
516  static void
517  ast_suspend(struct thread *td, int tda __unused)
518  {
519  	struct proc *p;
520  
521  	p = td->td_proc;
522  	/*
523  	 * We need to check to see if we have to exit or wait due to a
524  	 * single threading requirement or some other STOP condition.
525  	 */
526  	PROC_LOCK(p);
527  	thread_suspend_check(0);
528  	PROC_UNLOCK(p);
529  }
530  
531  extern int max_threads_per_proc;
532  
533  /*
534   * Initialize global thread allocation resources.
535   */
536  void
537  threadinit(void)
538  {
539  	u_long i;
540  	lwpid_t tid0;
541  
542  	/*
543  	 * Place an upper limit on threads which can be allocated.
544  	 *
545  	 * Note that other factors may make the de facto limit much lower.
546  	 *
547  	 * Platform limits are somewhat arbitrary but deemed "more than good
548   * enough" for the foreseeable future.
549  	 */
550  	if (maxthread == 0) {
551  #ifdef _LP64
552  		maxthread = MIN(maxproc * max_threads_per_proc, 1000000);
553  #else
554  		maxthread = MIN(maxproc * max_threads_per_proc, 100000);
555  #endif
556  	}
557  
558  	mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF);
559  	tid_bitmap = bit_alloc(maxthread, M_TIDHASH, M_WAITOK);
560  	/*
561  	 * Handle thread0.
562  	 */
563  	thread_count_inc();
564  	tid0 = tid_alloc();
565  	if (tid0 != THREAD0_TID)
566  		panic("tid0 %d != %d\n", tid0, THREAD0_TID);
567  
568  	/*
569  	 * Thread structures are specially aligned so that (at least) the
570   * 5 lower bits of a pointer to 'struct thread' must be 0.  These bits
571  	 * are used by synchronization primitives to store flags in pointers to
572  	 * such structures.
573  	 */
574  	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
575  	    thread_ctor, thread_dtor, thread_init, thread_fini,
576  	    UMA_ALIGN_CACHE_AND_MASK(32 - 1), UMA_ZONE_NOFREE);
577  	tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash);
578  	tidhashlock = (tidhash + 1) / 64;
579  	if (tidhashlock > 0)
580  		tidhashlock--;
581  	tidhashtbl_lock = malloc(sizeof(*tidhashtbl_lock) * (tidhashlock + 1),
582  	    M_TIDHASH, M_WAITOK | M_ZERO);
583  	for (i = 0; i < tidhashlock + 1; i++)
584  		rw_init(&tidhashtbl_lock[i], "tidhash");
585  
586  	TASK_INIT(&thread_reap_task, 0, thread_reap_task_cb, NULL);
587  	callout_init(&thread_reap_callout, 1);
588  	callout_reset(&thread_reap_callout, 5 * hz,
589  	    thread_reap_callout_cb, NULL);
590  	ast_register(TDA_SUSPEND, ASTR_ASTF_REQUIRED, 0, ast_suspend);
591  }
592  
593  /*
594   * Place an unused thread on the zombie list.
595   */
596  void
597  thread_zombie(struct thread *td)
598  {
599  	struct thread_domain_data *tdd;
600  	struct thread *ztd;
601  
602  	tdd = &thread_domain_data[td->td_allocdomain];
603  	ztd = atomic_load_ptr(&tdd->tdd_zombies);
604  	for (;;) {
605  		td->td_zombie = ztd;
606  		if (atomic_fcmpset_rel_ptr((uintptr_t *)&tdd->tdd_zombies,
607  		    (uintptr_t *)&ztd, (uintptr_t)td))
608  			break;
609  		continue;
610  	}
611  }
612  
613  /*
614   * Release a thread that has exited after cpu_throw().
615   */
616  void
617  thread_stash(struct thread *td)
618  {
619  	atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1);
620  	thread_zombie(td);
621  }
622  
623  /*
624   * Reap zombies from passed domain.
625   */
626  static void
627  thread_reap_domain(struct thread_domain_data *tdd)
628  {
629  	struct thread *itd, *ntd;
630  	struct tidbatch tidbatch;
631  	struct credbatch credbatch;
632  	struct limbatch limbatch;
633  	struct tdcountbatch tdcountbatch;
634  
635  	/*
636  	 * Reading upfront is pessimal if followed by concurrent atomic_swap,
637  	 * but most of the time the list is empty.
638  	 */
639  	if (tdd->tdd_zombies == NULL)
640  		return;
641  
642  	itd = (struct thread *)atomic_swap_ptr((uintptr_t *)&tdd->tdd_zombies,
643  	    (uintptr_t)NULL);
644  	if (itd == NULL)
645  		return;
646  
647  	/*
648  	 * Multiple CPUs can get here; the race is fine as ticks is only
649  	 * advisory.
650  	 */
651  	tdd->tdd_reapticks = ticks;
652  
653  	tidbatch_prep(&tidbatch);
654  	credbatch_prep(&credbatch);
655  	limbatch_prep(&limbatch);
656  	tdcountbatch_prep(&tdcountbatch);
657  
658  	while (itd != NULL) {
659  		ntd = itd->td_zombie;
660  		EVENTHANDLER_DIRECT_INVOKE(thread_dtor, itd);
661  
662  		tidbatch_add(&tidbatch, itd);
663  		credbatch_add(&credbatch, itd);
664  		limbatch_add(&limbatch, itd);
665  		tdcountbatch_add(&tdcountbatch, itd);
666  
667  		thread_free_batched(itd);
668  
669  		tidbatch_process(&tidbatch);
670  		credbatch_process(&credbatch);
671  		limbatch_process(&limbatch);
672  		tdcountbatch_process(&tdcountbatch);
673  
674  		itd = ntd;
675  	}
676  
677  	tidbatch_final(&tidbatch);
678  	credbatch_final(&credbatch);
679  	limbatch_final(&limbatch);
680  	tdcountbatch_final(&tdcountbatch);
681  }
682  
683  /*
684   * Reap zombies from all domains.
685   */
686  static void
687  thread_reap_all(void)
688  {
689  	struct thread_domain_data *tdd;
690  	int i, domain;
691  
692  	domain = PCPU_GET(domain);
693  	for (i = 0; i < vm_ndomains; i++) {
694  		tdd = &thread_domain_data[(i + domain) % vm_ndomains];
695  		thread_reap_domain(tdd);
696  	}
697  }
698  
699  /*
700   * Reap zombies from local domain.
701   */
702  static void
703  thread_reap(void)
704  {
705  	struct thread_domain_data *tdd;
706  	int domain;
707  
708  	domain = PCPU_GET(domain);
709  	tdd = &thread_domain_data[domain];
710  
711  	thread_reap_domain(tdd);
712  }
713  
714  static void
715  thread_reap_task_cb(void *arg __unused, int pending __unused)
716  {
717  
718  	thread_reap_all();
719  }
720  
721  static void
722  thread_reap_callout_cb(void *arg __unused)
723  {
724  	struct thread_domain_data *tdd;
725  	int i, cticks, lticks;
726  	bool wantreap;
727  
728  	wantreap = false;
729  	cticks = atomic_load_int(&ticks);
730  	for (i = 0; i < vm_ndomains; i++) {
731  		tdd = &thread_domain_data[i];
732  		lticks = tdd->tdd_reapticks;
733  		if (tdd->tdd_zombies != NULL &&
734  		    (u_int)(cticks - lticks) > 5 * hz) {
735  			wantreap = true;
736  			break;
737  		}
738  	}
739  
740  	if (wantreap)
741  		taskqueue_enqueue(taskqueue_thread, &thread_reap_task);
742  	callout_reset(&thread_reap_callout, 5 * hz,
743  	    thread_reap_callout_cb, NULL);
744  }
745  
746  /*
747   * Calling this function guarantees that any thread that exited before
748   * the call is reaped when the function returns.  By 'exited' we mean
749   * a thread removed from the process linkage with thread_unlink().
750   * Practically this means that the caller must lock/unlock the corresponding
751   * process lock before the call, to synchronize with thread_exit().
752   */
753  void
754  thread_reap_barrier(void)
755  {
756  	struct task *t;
757  
758  	/*
759  	 * First do context switches to each CPU to ensure that all
760  	 * PCPU pc_deadthreads are moved to zombie list.
761  	 */
762  	quiesce_all_cpus("", PDROP);
763  
764  	/*
765  	 * Second, fire the task in the same thread as normal
766  	 * thread_reap() is done, to serialize reaping.
767  	 */
768  	t = malloc(sizeof(*t), M_TEMP, M_WAITOK);
769  	TASK_INIT(t, 0, thread_reap_task_cb, t);
770  	taskqueue_enqueue(taskqueue_thread, t);
771  	taskqueue_drain(taskqueue_thread, t);
772  	free(t, M_TEMP);
773  }
774  
775  /*
776   * Allocate a thread.
777   */
778  struct thread *
779  thread_alloc(int pages)
780  {
781  	struct thread *td;
782  	lwpid_t tid;
783  
784  	if (!thread_count_inc()) {
785  		return (NULL);
786  	}
787  
788  	tid = tid_alloc();
789  	td = uma_zalloc(thread_zone, M_WAITOK);
790  	KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack"));
791  	if (!vm_thread_new(td, pages)) {
792  		uma_zfree(thread_zone, td);
793  		tid_free(tid);
794  		thread_count_dec();
795  		return (NULL);
796  	}
797  	td->td_tid = tid;
798  	bzero(&td->td_sa.args, sizeof(td->td_sa.args));
799  	kasan_thread_alloc(td);
800  	kmsan_thread_alloc(td);
801  	cpu_thread_alloc(td);
802  	EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td);
803  	return (td);
804  }
805  
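/*
 * Prepare an existing thread for reuse, replacing its kernel stack if
 * the current one is missing or has the wrong number of pages.
 */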
806  int
807  thread_recycle(struct thread *td, int pages)
808  {
809  	if (td->td_kstack == 0 || td->td_kstack_pages != pages) {
810  		if (td->td_kstack != 0)
811  			vm_thread_dispose(td);
812  		if (!vm_thread_new(td, pages))
813  			return (ENOMEM);
814  		cpu_thread_alloc(td);
815  	}
816  	kasan_thread_alloc(td);
817  	kmsan_thread_alloc(td);
818  	return (0);
819  }
820  
821  /*
822   * Deallocate a thread.
823   */
824  static void
825  thread_free_batched(struct thread *td)
826  {
827  
828  	lock_profile_thread_exit(td);
829  	if (td->td_cpuset)
830  		cpuset_rel(td->td_cpuset);
831  	td->td_cpuset = NULL;
832  	cpu_thread_free(td);
833  	if (td->td_kstack != 0)
834  		vm_thread_dispose(td);
835  	callout_drain(&td->td_slpcallout);
836  	/*
837  	 * Freeing handled by the caller.
838  	 */
839  	td->td_tid = -1;
840  	kmsan_thread_free(td);
841  	uma_zfree(thread_zone, td);
842  }
843  
844  void
845  thread_free(struct thread *td)
846  {
847  	lwpid_t tid;
848  
849  	EVENTHANDLER_DIRECT_INVOKE(thread_dtor, td);
850  	tid = td->td_tid;
851  	thread_free_batched(td);
852  	tid_free(tid);
853  	thread_count_dec();
854  }
855  
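/*
 * Copy-on-write references to per-process data (credentials and
 * resource limits) cached in the thread.  td_cowgen records the
 * generation of the snapshot; thread_cow_update() refreshes it.
 */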
856  void
857  thread_cow_get_proc(struct thread *newtd, struct proc *p)
858  {
859  
860  	PROC_LOCK_ASSERT(p, MA_OWNED);
861  	newtd->td_realucred = crcowget(p->p_ucred);
862  	newtd->td_ucred = newtd->td_realucred;
863  	newtd->td_limit = lim_hold(p->p_limit);
864  	newtd->td_cowgen = p->p_cowgen;
865  }
866  
867  void
868  thread_cow_get(struct thread *newtd, struct thread *td)
869  {
870  
871  	MPASS(td->td_realucred == td->td_ucred);
872  	newtd->td_realucred = crcowget(td->td_realucred);
873  	newtd->td_ucred = newtd->td_realucred;
874  	newtd->td_limit = lim_hold(td->td_limit);
875  	newtd->td_cowgen = td->td_cowgen;
876  }
877  
878  void
879  thread_cow_free(struct thread *td)
880  {
881  
882  	if (td->td_realucred != NULL)
883  		crcowfree(td);
884  	if (td->td_limit != NULL)
885  		lim_free(td->td_limit);
886  }
887  
888  void
889  thread_cow_update(struct thread *td)
890  {
891  	struct proc *p;
892  	struct ucred *oldcred;
893  	struct plimit *oldlimit;
894  
895  	p = td->td_proc;
896  	PROC_LOCK(p);
897  	oldcred = crcowsync();
898  	oldlimit = lim_cowsync();
899  	td->td_cowgen = p->p_cowgen;
900  	PROC_UNLOCK(p);
901  	if (oldcred != NULL)
902  		crfree(oldcred);
903  	if (oldlimit != NULL)
904  		lim_free(oldlimit);
905  }
906  
907  void
908  thread_cow_synced(struct thread *td)
909  {
910  	struct proc *p;
911  
912  	p = td->td_proc;
913  	PROC_LOCK_ASSERT(p, MA_OWNED);
914  	MPASS(td->td_cowgen != p->p_cowgen);
915  	MPASS(td->td_ucred == p->p_ucred);
916  	MPASS(td->td_limit == p->p_limit);
917  	td->td_cowgen = p->p_cowgen;
918  }
919  
920  /*
921   * Discard the current thread and exit from its context.
922   * Always called with scheduler locked.
923   *
924   * Because we can't free a thread while we're operating under its context,
925   * push the current thread into our CPU's deadthread holder. This means
926   * we needn't worry about someone else grabbing our context before we
927   * do a cpu_throw().
928   */
929  void
930  thread_exit(void)
931  {
932  	uint64_t runtime, new_switchtime;
933  	struct thread *td;
934  	struct thread *td2;
935  	struct proc *p;
936  
937  	td = curthread;
938  	p = td->td_proc;
939  
940  	PROC_SLOCK_ASSERT(p, MA_OWNED);
941  	mtx_assert(&Giant, MA_NOTOWNED);
942  
943  	PROC_LOCK_ASSERT(p, MA_OWNED);
944  	KASSERT(p != NULL, ("thread exiting without a process"));
945  	CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td,
946  	    (long)p->p_pid, td->td_name);
947  	SDT_PROBE0(proc, , , lwp__exit);
948  	KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending"));
949  	MPASS(td->td_realucred == td->td_ucred);
950  
951  	/*
952  	 * Drop FPU & debug register state storage, or any other
953  	 * architecture-specific resources that
954  	 * would not be present in a new, untouched process.
955  	 */
956  	cpu_thread_exit(td);
957  
958  	/*
959  	 * The last thread is left attached to the process
960  	 * so that the whole bundle gets recycled.  Skip
961  	 * all this stuff if we never had threads.
962  	 * EXIT clears all signs of other threads when
963  	 * it goes to single threading, so the last thread always
964  	 * takes the short path.
965  	 */
966  	if (p->p_flag & P_HADTHREADS) {
967  		if (p->p_numthreads > 1) {
968  			atomic_add_int(&td->td_proc->p_exitthreads, 1);
969  			thread_unlink(td);
970  			td2 = FIRST_THREAD_IN_PROC(p);
971  			sched_exit_thread(td2, td);
972  
973  			/*
974  			 * The test below is NOT true if we are the
975  			 * sole exiting thread. P_STOPPED_SINGLE is unset
976  			 * in exit1() after it is the only survivor.
977  			 */
978  			if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
979  				if (p->p_numthreads == p->p_suspcount) {
980  					thread_lock(p->p_singlethread);
981  					thread_unsuspend_one(p->p_singlethread,
982  					    p, false);
983  				}
984  			}
985  
986  			PCPU_SET(deadthread, td);
987  		} else {
988  			/*
989  			 * The last thread is exiting.. but not through exit()
990  			 * The last thread is exiting, but not through exit().
991  			panic ("thread_exit: Last thread exiting on its own");
992  		}
993  	}
994  #ifdef	HWPMC_HOOKS
995  	/*
996  	 * If this thread is part of a process that is being tracked by hwpmc(4),
997  	 * inform the module of the thread's impending exit.
998  	 */
999  	if (PMC_PROC_IS_USING_PMCS(td->td_proc)) {
1000  		PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1001  		PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT, NULL);
1002  	} else if (PMC_SYSTEM_SAMPLING_ACTIVE())
1003  		PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT_LOG, NULL);
1004  #endif
1005  	PROC_UNLOCK(p);
1006  	PROC_STATLOCK(p);
1007  	thread_lock(td);
1008  	PROC_SUNLOCK(p);
1009  
1010  	/* Do the same timestamp bookkeeping that mi_switch() would do. */
1011  	new_switchtime = cpu_ticks();
1012  	runtime = new_switchtime - PCPU_GET(switchtime);
1013  	td->td_runtime += runtime;
1014  	td->td_incruntime += runtime;
1015  	PCPU_SET(switchtime, new_switchtime);
1016  	PCPU_SET(switchticks, ticks);
1017  	VM_CNT_INC(v_swtch);
1018  
1019  	/* Save our resource usage in our process. */
1020  	td->td_ru.ru_nvcsw++;
1021  	ruxagg_locked(p, td);
1022  	rucollect(&p->p_ru, &td->td_ru);
1023  	PROC_STATUNLOCK(p);
1024  
1025  	TD_SET_STATE(td, TDS_INACTIVE);
1026  #ifdef WITNESS
1027  	witness_thread_exit(td);
1028  #endif
1029  	CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td);
1030  	sched_throw(td);
1031  	panic("I'm a teapot!");
1032  	/* NOTREACHED */
1033  }
1034  
1035  /*
1036   * Do any thread-specific cleanups that may be needed in wait().
1037   * Called with Giant, proc and schedlock not held.
1038   */
1039  void
1040  thread_wait(struct proc *p)
1041  {
1042  	struct thread *td;
1043  
1044  	mtx_assert(&Giant, MA_NOTOWNED);
1045  	KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()"));
1046  	KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking"));
1047  	td = FIRST_THREAD_IN_PROC(p);
1048  	/* Lock the last thread so we spin until it exits cpu_throw(). */
1049  	thread_lock(td);
1050  	thread_unlock(td);
1051  	lock_profile_thread_exit(td);
1052  	cpuset_rel(td->td_cpuset);
1053  	td->td_cpuset = NULL;
1054  	cpu_thread_clean(td);
1055  	thread_cow_free(td);
1056  	callout_drain(&td->td_slpcallout);
1057  	thread_reap();	/* check for zombie threads etc. */
1058  }
1059  
1060  /*
1061   * Link a thread to a process.
1062   * Set up anything that needs to be initialized for it to
1063   * be used by the process.
1064   */
1065  void
1066  thread_link(struct thread *td, struct proc *p)
1067  {
1068  
1069  	/*
1070  	 * XXX This can't be enabled because it's called for proc0 before
1071  	 * its lock has been created.
1072  	 * PROC_LOCK_ASSERT(p, MA_OWNED);
1073  	 */
1074  	TD_SET_STATE(td, TDS_INACTIVE);
1075  	td->td_proc     = p;
1076  	td->td_flags    = TDF_INMEM;
1077  
1078  	LIST_INIT(&td->td_contested);
1079  	LIST_INIT(&td->td_lprof[0]);
1080  	LIST_INIT(&td->td_lprof[1]);
1081  #ifdef EPOCH_TRACE
1082  	SLIST_INIT(&td->td_epochs);
1083  #endif
1084  	sigqueue_init(&td->td_sigqueue, p);
1085  	callout_init(&td->td_slpcallout, 1);
1086  	TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist);
1087  	p->p_numthreads++;
1088  }
1089  
1090  /*
1091   * Called from:
1092   *  thread_exit()
1093   */
1094  void
1095  thread_unlink(struct thread *td)
1096  {
1097  	struct proc *p = td->td_proc;
1098  
1099  	PROC_LOCK_ASSERT(p, MA_OWNED);
1100  #ifdef EPOCH_TRACE
1101  	MPASS(SLIST_EMPTY(&td->td_epochs));
1102  #endif
1103  
1104  	TAILQ_REMOVE(&p->p_threads, td, td_plist);
1105  	p->p_numthreads--;
1106  	/* could clear a few other things here */
1107  	/* Must  NOT clear links to proc! */
1108  }
1109  
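/*
 * Count the threads that still have to comply with the single-threading
 * request, depending on the requested mode.
 */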
1110  static int
1111  calc_remaining(struct proc *p, int mode)
1112  {
1113  	int remaining;
1114  
1115  	PROC_LOCK_ASSERT(p, MA_OWNED);
1116  	PROC_SLOCK_ASSERT(p, MA_OWNED);
1117  	if (mode == SINGLE_EXIT)
1118  		remaining = p->p_numthreads;
1119  	else if (mode == SINGLE_BOUNDARY)
1120  		remaining = p->p_numthreads - p->p_boundary_count;
1121  	else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC)
1122  		remaining = p->p_numthreads - p->p_suspcount;
1123  	else
1124  		panic("calc_remaining: wrong mode %d", mode);
1125  	return (remaining);
1126  }
1127  
1128  static int
1129  remain_for_mode(int mode)
1130  {
1131  
1132  	return (mode == SINGLE_ALLPROC ? 0 : 1);
1133  }
1134  
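/*
 * Kick an inhibited thread towards compliance with the single-threading
 * request: unsuspend it or abort its interruptible sleep, as appropriate
 * for the requested mode.
 */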
1135  static void
1136  weed_inhib(int mode, struct thread *td2, struct proc *p)
1137  {
1138  	PROC_LOCK_ASSERT(p, MA_OWNED);
1139  	PROC_SLOCK_ASSERT(p, MA_OWNED);
1140  	THREAD_LOCK_ASSERT(td2, MA_OWNED);
1141  
1142  	/*
1143  	 * Since the thread lock is dropped by the scheduler we have
1144  	 * to retry to check for races.
1145  	 */
1146  restart:
1147  	switch (mode) {
1148  	case SINGLE_EXIT:
1149  		if (TD_IS_SUSPENDED(td2)) {
1150  			thread_unsuspend_one(td2, p, true);
1151  			thread_lock(td2);
1152  			goto restart;
1153  		}
1154  		if (TD_CAN_ABORT(td2)) {
1155  			sleepq_abort(td2, EINTR);
1156  			return;
1157  		}
1158  		break;
1159  	case SINGLE_BOUNDARY:
1160  	case SINGLE_NO_EXIT:
1161  		if (TD_IS_SUSPENDED(td2) &&
1162  		    (td2->td_flags & TDF_BOUNDARY) == 0) {
1163  			thread_unsuspend_one(td2, p, false);
1164  			thread_lock(td2);
1165  			goto restart;
1166  		}
1167  		if (TD_CAN_ABORT(td2)) {
1168  			sleepq_abort(td2, ERESTART);
1169  			return;
1170  		}
1171  		break;
1172  	case SINGLE_ALLPROC:
1173  		/*
1174  		 * ALLPROC suspend tries to avoid spurious EINTR for
1175  		 * threads sleeping interruptibly, by suspending the
1176  		 * thread directly, similarly to sig_suspend_threads().
1177  		 * Since such a sleep is not necessarily performed at the user
1178  		 * boundary, TDF_ALLPROCSUSP is used to avoid an immediate
1179  		 * un-suspend.
1180  		 */
1181  		if (TD_IS_SUSPENDED(td2) &&
1182  		    (td2->td_flags & TDF_ALLPROCSUSP) == 0) {
1183  			thread_unsuspend_one(td2, p, false);
1184  			thread_lock(td2);
1185  			goto restart;
1186  		}
1187  		if (TD_CAN_ABORT(td2)) {
1188  			td2->td_flags |= TDF_ALLPROCSUSP;
1189  			sleepq_abort(td2, ERESTART);
1190  			return;
1191  		}
1192  		break;
1193  	default:
1194  		break;
1195  	}
1196  	thread_unlock(td2);
1197  }
1198  
1199  /*
1200   * Enforce single-threading.
1201   *
1202   * Returns 1 if the caller must abort (another thread is waiting to
1203   * exit the process or similar). Process is locked!
1204   * Returns 0 when you are successfully the only thread running.
1205   * A process has successfully single threaded in the suspend mode when
1206   * there are no threads in user mode.  Threads in the kernel must be
1207   * allowed to continue until they get to the user boundary. They may even
1208   * copy out their return values and data before suspending. They may however be
1209   * accelerated in reaching the user boundary as we will wake up
1210   * any sleeping threads that are interruptible (PCATCH).
1211   */
1212  int
1213  thread_single(struct proc *p, int mode)
1214  {
1215  	struct thread *td;
1216  	struct thread *td2;
1217  	int remaining;
1218  
1219  	td = curthread;
1220  	KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY ||
1221  	    mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT,
1222  	    ("invalid mode %d", mode));
1223  	/*
1224  	 * If allowing non-ALLPROC singlethreading for non-curproc
1225  	 * callers, calc_remaining() and remain_for_mode() should be
1226  	 * adjusted to also account for td->td_proc != p.  For now
1227  	 * this is not implemented because it is not used.
1228  	 */
1229  	KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) ||
1230  	    (mode != SINGLE_ALLPROC && td->td_proc == p),
1231  	    ("mode %d proc %p curproc %p", mode, p, td->td_proc));
1232  	mtx_assert(&Giant, MA_NOTOWNED);
1233  	PROC_LOCK_ASSERT(p, MA_OWNED);
1234  
1235  	/*
1236  	 * Is someone already single threading?
1237   * Or maybe singlethreading is not needed at all.
1238  	 */
1239  	if (mode == SINGLE_ALLPROC) {
1240  		while ((p->p_flag & P_STOPPED_SINGLE) != 0) {
1241  			if ((p->p_flag2 & P2_WEXIT) != 0)
1242  				return (1);
1243  			msleep(&p->p_flag, &p->p_mtx, PCATCH, "thrsgl", 0);
1244  		}
1245  		if ((p->p_flag & (P_STOPPED_SIG | P_TRACED)) != 0 ||
1246  		    (p->p_flag2 & P2_WEXIT) != 0)
1247  			return (1);
1248  	} else if ((p->p_flag & P_HADTHREADS) == 0)
1249  		return (0);
1250  	if (p->p_singlethread != NULL && p->p_singlethread != td)
1251  		return (1);
1252  
1253  	if (mode == SINGLE_EXIT) {
1254  		p->p_flag |= P_SINGLE_EXIT;
1255  		p->p_flag &= ~P_SINGLE_BOUNDARY;
1256  	} else {
1257  		p->p_flag &= ~P_SINGLE_EXIT;
1258  		if (mode == SINGLE_BOUNDARY)
1259  			p->p_flag |= P_SINGLE_BOUNDARY;
1260  		else
1261  			p->p_flag &= ~P_SINGLE_BOUNDARY;
1262  	}
1263  	if (mode == SINGLE_ALLPROC)
1264  		p->p_flag |= P_TOTAL_STOP;
1265  	p->p_flag |= P_STOPPED_SINGLE;
1266  	PROC_SLOCK(p);
1267  	p->p_singlethread = td;
1268  	remaining = calc_remaining(p, mode);
1269  	while (remaining != remain_for_mode(mode)) {
1270  		if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE)
1271  			goto stopme;
1272  		FOREACH_THREAD_IN_PROC(p, td2) {
1273  			if (td2 == td)
1274  				continue;
1275  			thread_lock(td2);
1276  			ast_sched_locked(td2, TDA_SUSPEND);
1277  			if (TD_IS_INHIBITED(td2)) {
1278  				weed_inhib(mode, td2, p);
1279  #ifdef SMP
1280  			} else if (TD_IS_RUNNING(td2)) {
1281  				forward_signal(td2);
1282  				thread_unlock(td2);
1283  #endif
1284  			} else
1285  				thread_unlock(td2);
1286  		}
1287  		remaining = calc_remaining(p, mode);
1288  
1289  		/*
1290  		 * Maybe we suspended some threads; was it enough?
1291  		 */
1292  		if (remaining == remain_for_mode(mode))
1293  			break;
1294  
1295  stopme:
1296  		/*
1297  		 * Wake us up when everyone else has suspended.
1298  		 * In the meantime we suspend as well.
1299  		 */
1300  		thread_suspend_switch(td, p);
1301  		remaining = calc_remaining(p, mode);
1302  	}
1303  	if (mode == SINGLE_EXIT) {
1304  		/*
1305  		 * Convert the process to an unthreaded process.  The
1306  		 * SINGLE_EXIT is called by exit1() or execve(), in
1307  		 * both cases other threads must be retired.
1308  		 */
1309  		KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads"));
1310  		p->p_singlethread = NULL;
1311  		p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS);
1312  
1313  		/*
1314  		 * Wait for any remaining threads to exit cpu_throw().
1315  		 */
1316  		while (p->p_exitthreads != 0) {
1317  			PROC_SUNLOCK(p);
1318  			PROC_UNLOCK(p);
1319  			sched_relinquish(td);
1320  			PROC_LOCK(p);
1321  			PROC_SLOCK(p);
1322  		}
1323  	} else if (mode == SINGLE_BOUNDARY) {
1324  		/*
1325  		 * Wait until all suspended threads are removed from
1326  		 * the processors.  The thread_suspend_check()
1327  		 * increments p_boundary_count while it is still
1328  		 * running, which makes it possible for the execve()
1329  		 * to destroy vmspace while our other threads are
1330  		 * still using the address space.
1331  		 *
1332  		 * We lock the thread, which is only allowed to
1333  		 * succeed after context switch code finished using
1334  		 * the address space.
1335  		 */
1336  		FOREACH_THREAD_IN_PROC(p, td2) {
1337  			if (td2 == td)
1338  				continue;
1339  			thread_lock(td2);
1340  			KASSERT((td2->td_flags & TDF_BOUNDARY) != 0,
1341  			    ("td %p not on boundary", td2));
1342  			KASSERT(TD_IS_SUSPENDED(td2),
1343  			    ("td %p is not suspended", td2));
1344  			thread_unlock(td2);
1345  		}
1346  	}
1347  	PROC_SUNLOCK(p);
1348  	return (0);
1349  }
1350  
1351  bool
1352  thread_suspend_check_needed(void)
1353  {
1354  	struct proc *p;
1355  	struct thread *td;
1356  
1357  	td = curthread;
1358  	p = td->td_proc;
1359  	PROC_LOCK_ASSERT(p, MA_OWNED);
1360  	return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 &&
1361  	    (td->td_dbgflags & TDB_SUSPEND) != 0));
1362  }
1363  
1364  /*
1365   * Called from locations that can safely check to see
1366   * whether we have to suspend or at least throttle for a
1367   * single-thread event (e.g. fork).
1368   *
1369   * Such locations include userret().
1370   * If the "return_instead" argument is non zero, the thread must be able to
1371   * accept 0 (caller may continue), or 1 (caller must abort) as a result.
1372   *
1373   * The 'return_instead' argument tells the function if it may do a
1374   * thread_exit() or suspend, or whether the caller must abort and back
1375   * out instead.
1376   *
1377   * If the thread that set the single_threading request has set the
1378   * P_SINGLE_EXIT bit in the process flags then this call will never return
1379   * if 'return_instead' is false, but will exit.
1380   *
1381   * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
1382   *---------------+--------------------+---------------------
1383   *       0       | returns 0          |   returns 0 or 1
1384   *               | when ST ends       |   immediately
1385   *---------------+--------------------+---------------------
1386   *       1       | thread exits       |   returns 1
1387   *               |                    |  immediately
1388   * 0 = thread_exit() or suspension ok,
1389   * other = return error instead of stopping the thread.
1390   *
1391   * While a full suspension is in effect, even a single threading
1392   * thread would be suspended if it made this call (but it shouldn't).
1393   * This call should only be made from places where
1394   * thread_exit() would be safe as that may be the outcome unless
1395   * return_instead is set.
1396   */
1397  int
1398  thread_suspend_check(int return_instead)
1399  {
1400  	struct thread *td;
1401  	struct proc *p;
1402  
1403  	td = curthread;
1404  	p = td->td_proc;
1405  	mtx_assert(&Giant, MA_NOTOWNED);
1406  	PROC_LOCK_ASSERT(p, MA_OWNED);
1407  	while (thread_suspend_check_needed()) {
1408  		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1409  			KASSERT(p->p_singlethread != NULL,
1410  			    ("singlethread not set"));
1411  			/*
1412  			 * The only suspension in action is a
1413  			 * single-threading. Single threader need not stop.
1414  			 * It is safe to access p->p_singlethread unlocked
1415  			 * because it can only be set to our address by us.
1416  			 */
1417  			if (p->p_singlethread == td)
1418  				return (0);	/* Exempt from stopping. */
1419  		}
1420  		if ((p->p_flag & P_SINGLE_EXIT) && return_instead)
1421  			return (EINTR);
1422  
1423  		/* Should we go to the user boundary if we didn't come from there? */
1424  		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
1425  		    (p->p_flag & P_SINGLE_BOUNDARY) && return_instead)
1426  			return (ERESTART);
1427  
1428  		/*
1429  		 * Ignore suspend requests if they are deferred.
1430  		 */
1431  		if ((td->td_flags & TDF_SBDRY) != 0) {
1432  			KASSERT(return_instead,
1433  			    ("TDF_SBDRY set for unsafe thread_suspend_check"));
1434  			KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) !=
1435  			    (TDF_SEINTR | TDF_SERESTART),
1436  			    ("both TDF_SEINTR and TDF_SERESTART"));
1437  			return (TD_SBDRY_INTR(td) ? TD_SBDRY_ERRNO(td) : 0);
1438  		}
1439  
1440  		/*
1441  		 * If the process is waiting for us to exit,
1442  		 * this thread should just suicide.
1443  		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
1444  		 */
1445  		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
1446  			PROC_UNLOCK(p);
1447  
1448  			/*
1449  			 * Allow Linux emulation layer to do some work
1450  			 * before thread suicide.
1451  			 */
1452  			if (__predict_false(p->p_sysent->sv_thread_detach != NULL))
1453  				(p->p_sysent->sv_thread_detach)(td);
1454  			umtx_thread_exit(td);
1455  			kern_thr_exit(td);
1456  			panic("stopped thread did not exit");
1457  		}
1458  
1459  		PROC_SLOCK(p);
1460  		thread_stopped(p);
1461  		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
1462  			if (p->p_numthreads == p->p_suspcount + 1) {
1463  				thread_lock(p->p_singlethread);
1464  				thread_unsuspend_one(p->p_singlethread, p,
1465  				    false);
1466  			}
1467  		}
1468  		PROC_UNLOCK(p);
1469  		thread_lock(td);
1470  		/*
1471  		 * When a thread suspends, it just
1472  		 * gets taken off all queues.
1473  		 */
1474  		thread_suspend_one(td);
1475  		if (return_instead == 0) {
1476  			p->p_boundary_count++;
1477  			td->td_flags |= TDF_BOUNDARY;
1478  		}
1479  		PROC_SUNLOCK(p);
1480  		mi_switch(SW_INVOL | SWT_SUSPEND);
1481  		PROC_LOCK(p);
1482  	}
1483  	return (0);
1484  }
1485  
1486  /*
1487   * Check for possible stops and suspensions while executing a
1488   * casueword or similar transiently failing operation.
1489   *
1490   * The sleep argument controls whether the function can handle a stop
1491   * request itself or it should return ERESTART and the request is
1492   * processed at the kernel/user boundary in ast.
1493   *
1494   * Typically, when retrying due to casueword(9) failure (rv == 1), we
1495   * should handle the stop requests there, with exception of cases when
1496   * the thread owns a kernel resource, for instance busied the umtx
1497   * key, or when functions return immediately if thread_check_susp()
1498   * returned non-zero.  On the other hand, when retrying the whole lock
1499   * operation, we had better not stop there but delegate the handling to
1500   * ast.
1501   *
1502   * If the request is for thread termination P_SINGLE_EXIT, we cannot
1503   * handle it at all, and simply return EINTR.
1504   */
1505  int
1506  thread_check_susp(struct thread *td, bool sleep)
1507  {
1508  	struct proc *p;
1509  	int error;
1510  
1511  	/*
1512  	 * The check for TDA_SUSPEND is racy, but it is enough to
1513  	 * eventually break the lockstep loop.
1514  	 */
1515  	if (!td_ast_pending(td, TDA_SUSPEND))
1516  		return (0);
1517  	error = 0;
1518  	p = td->td_proc;
1519  	PROC_LOCK(p);
1520  	if (p->p_flag & P_SINGLE_EXIT)
1521  		error = EINTR;
1522  	else if (P_SHOULDSTOP(p) ||
1523  	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND)))
1524  		error = sleep ? thread_suspend_check(0) : ERESTART;
1525  	PROC_UNLOCK(p);
1526  	return (error);
1527  }
1528  
1529  void
1530  thread_suspend_switch(struct thread *td, struct proc *p)
1531  {
1532  
1533  	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
1534  	PROC_LOCK_ASSERT(p, MA_OWNED);
1535  	PROC_SLOCK_ASSERT(p, MA_OWNED);
1536  	/*
1537  	 * We implement thread_suspend_one in stages here to avoid
1538  	 * dropping the proc lock while the thread lock is owned.
1539  	 */
1540  	if (p == td->td_proc) {
1541  		thread_stopped(p);
1542  		p->p_suspcount++;
1543  	}
1544  	PROC_UNLOCK(p);
1545  	thread_lock(td);
1546  	ast_unsched_locked(td, TDA_SUSPEND);
1547  	TD_SET_SUSPENDED(td);
1548  	sched_sleep(td, 0);
1549  	PROC_SUNLOCK(p);
1550  	DROP_GIANT();
1551  	mi_switch(SW_VOL | SWT_SUSPEND);
1552  	PICKUP_GIANT();
1553  	PROC_LOCK(p);
1554  	PROC_SLOCK(p);
1555  }
1556  
1557  void
1558  thread_suspend_one(struct thread *td)
1559  {
1560  	struct proc *p;
1561  
1562  	p = td->td_proc;
1563  	PROC_SLOCK_ASSERT(p, MA_OWNED);
1564  	THREAD_LOCK_ASSERT(td, MA_OWNED);
1565  	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
1566  	p->p_suspcount++;
1567  	ast_unsched_locked(td, TDA_SUSPEND);
1568  	TD_SET_SUSPENDED(td);
1569  	sched_sleep(td, 0);
1570  }
1571  
1572  static void
1573  thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary)
1574  {
1575  
1576  	THREAD_LOCK_ASSERT(td, MA_OWNED);
1577  	KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended"));
1578  	TD_CLR_SUSPENDED(td);
1579  	td->td_flags &= ~TDF_ALLPROCSUSP;
1580  	if (td->td_proc == p) {
1581  		PROC_SLOCK_ASSERT(p, MA_OWNED);
1582  		p->p_suspcount--;
1583  		if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) {
1584  			td->td_flags &= ~TDF_BOUNDARY;
1585  			p->p_boundary_count--;
1586  		}
1587  	}
1588  	setrunnable(td, 0);
1589  }
1590  
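/*
 * Pull a suspended thread off its sleep queue if necessary, drop it
 * from the process suspend count, and make it runnable again.
 */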
1591  void
1592  thread_run_flash(struct thread *td)
1593  {
1594  	struct proc *p;
1595  
1596  	p = td->td_proc;
1597  	PROC_LOCK_ASSERT(p, MA_OWNED);
1598  
1599  	if (TD_ON_SLEEPQ(td))
1600  		sleepq_remove_nested(td);
1601  	else
1602  		thread_lock(td);
1603  
1604  	THREAD_LOCK_ASSERT(td, MA_OWNED);
1605  	KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended"));
1606  
1607  	TD_CLR_SUSPENDED(td);
1608  	PROC_SLOCK(p);
1609  	MPASS(p->p_suspcount > 0);
1610  	p->p_suspcount--;
1611  	PROC_SUNLOCK(p);
1612  	setrunnable(td, 0);
1613  }
1614  
1615  /*
1616   * Allow all threads blocked by single threading to continue running.
1617   */
1618  void
1619  thread_unsuspend(struct proc *p)
1620  {
1621  	struct thread *td;
1622  
1623  	PROC_LOCK_ASSERT(p, MA_OWNED);
1624  	PROC_SLOCK_ASSERT(p, MA_OWNED);
1625  	if (!P_SHOULDSTOP(p)) {
1626                  FOREACH_THREAD_IN_PROC(p, td) {
1627  			thread_lock(td);
1628  			if (TD_IS_SUSPENDED(td))
1629  				thread_unsuspend_one(td, p, true);
1630  			else
1631  				thread_unlock(td);
1632  		}
1633  	} else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
1634  	    p->p_numthreads == p->p_suspcount) {
1635  		/*
1636  		 * Stopping everything also did the job for the single
1637  		 * threading request.  Now that we've downgraded to single-threaded,
1638  		 * let it continue.
1639  		 */
1640  		if (p->p_singlethread->td_proc == p) {
1641  			thread_lock(p->p_singlethread);
1642  			thread_unsuspend_one(p->p_singlethread, p, false);
1643  		}
1644  	}
1645  }
1646  
1647  /*
1648   * End the single threading mode.
1649   */
1650  void
1651  thread_single_end(struct proc *p, int mode)
1652  {
1653  	struct thread *td;
1654  
1655  	KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY ||
1656  	    mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT,
1657  	    ("invalid mode %d", mode));
1658  	PROC_LOCK_ASSERT(p, MA_OWNED);
1659  	KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) ||
1660  	    (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0),
1661  	    ("mode %d does not match P_TOTAL_STOP", mode));
1662  	KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread,
1663  	    ("thread_single_end from other thread %p %p",
1664  	    curthread, p->p_singlethread));
1665  	KASSERT(mode != SINGLE_BOUNDARY ||
1666  	    (p->p_flag & P_SINGLE_BOUNDARY) != 0,
1667  	    ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag));
1668  	p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY |
1669  	    P_TOTAL_STOP);
1670  	PROC_SLOCK(p);
1671  	p->p_singlethread = NULL;
1672  
1673  	/*
1674  	 * If there are other threads they may now run,
1675  	 * unless of course there is a blanket 'stop order'
1676  	 * on the process. The single threader must be allowed
1677  	 * to continue however as this is a bad place to stop.
1678  	 */
1679  	if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) {
1680                  FOREACH_THREAD_IN_PROC(p, td) {
1681  			thread_lock(td);
1682  			if (TD_IS_SUSPENDED(td))
1683  				thread_unsuspend_one(td, p, true);
1684  			else
1685  				thread_unlock(td);
1686  		}
1687  	}
1688  	KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0,
1689  	    ("inconsistent boundary count %d", p->p_boundary_count));
1690  	PROC_SUNLOCK(p);
1691  	wakeup(&p->p_flag);
1692  }
1693  
1694  /*
1695   * Locate a thread by number and return with proc lock held.
1696   *
1697   * thread exit establishes proc -> tidhash lock ordering, but lookup
1698   * takes tidhash first and needs to return locked proc.
1699   *
1700   * The problem is worked around by relying on type-safety of both
1701   * structures and doing the work in 2 steps:
1702   * - tidhash-locked lookup which saves both thread and proc pointers
1703   * - proc-locked verification that the found thread still matches
1704   */
1705  static bool
1706  tdfind_hash(lwpid_t tid, pid_t pid, struct proc **pp, struct thread **tdp)
1707  {
1708  #define RUN_THRESH	16
1709  	struct proc *p;
1710  	struct thread *td;
1711  	int run;
1712  	bool locked;
1713  
1714  	run = 0;
1715  	rw_rlock(TIDHASHLOCK(tid));
1716  	locked = true;
1717  	LIST_FOREACH(td, TIDHASH(tid), td_hash) {
1718  		if (td->td_tid != tid) {
1719  			run++;
1720  			continue;
1721  		}
1722  		p = td->td_proc;
1723  		if (pid != -1 && p->p_pid != pid) {
1724  			td = NULL;
1725  			break;
1726  		}
1727  		if (run > RUN_THRESH) {
1728  			if (rw_try_upgrade(TIDHASHLOCK(tid))) {
1729  				LIST_REMOVE(td, td_hash);
1730  				LIST_INSERT_HEAD(TIDHASH(td->td_tid),
1731  					td, td_hash);
1732  				rw_wunlock(TIDHASHLOCK(tid));
1733  				locked = false;
1734  				break;
1735  			}
1736  		}
1737  		break;
1738  	}
1739  	if (locked)
1740  		rw_runlock(TIDHASHLOCK(tid));
1741  	if (td == NULL)
1742  		return (false);
1743  	*pp = p;
1744  	*tdp = td;
1745  	return (true);
1746  }
1747  
1748  struct thread *
1749  tdfind(lwpid_t tid, pid_t pid)
1750  {
1751  	struct proc *p;
1752  	struct thread *td;
1753  
1754  	td = curthread;
1755  	if (td->td_tid == tid) {
1756  		if (pid != -1 && td->td_proc->p_pid != pid)
1757  			return (NULL);
1758  		PROC_LOCK(td->td_proc);
1759  		return (td);
1760  	}
1761  
1762  	for (;;) {
1763  		if (!tdfind_hash(tid, pid, &p, &td))
1764  			return (NULL);
1765  		PROC_LOCK(p);
1766  		if (td->td_tid != tid) {
1767  			PROC_UNLOCK(p);
1768  			continue;
1769  		}
1770  		if (td->td_proc != p) {
1771  			PROC_UNLOCK(p);
1772  			continue;
1773  		}
1774  		if (p->p_state == PRS_NEW) {
1775  			PROC_UNLOCK(p);
1776  			return (NULL);
1777  		}
1778  		return (td);
1779  	}
1780  }
1781  
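/*
 * Insert the thread into the TID hash table so that tdfind() can
 * locate it.
 */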
1782  void
1783  tidhash_add(struct thread *td)
1784  {
1785  	rw_wlock(TIDHASHLOCK(td->td_tid));
1786  	LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
1787  	rw_wunlock(TIDHASHLOCK(td->td_tid));
1788  }
1789  
1790  void
1791  tidhash_remove(struct thread *td)
1792  {
1793  
1794  	rw_wlock(TIDHASHLOCK(td->td_tid));
1795  	LIST_REMOVE(td, td_hash);
1796  	rw_wunlock(TIDHASHLOCK(td->td_tid));
1797  }
1798