xref: /linux/kernel/exit.c (revision c717993dd76a1049093af5c262e751d901b8da10)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   *  linux/kernel/exit.c
4   *
5   *  Copyright (C) 1991, 1992  Linus Torvalds
6   */
7  
8  #include <linux/mm.h>
9  #include <linux/slab.h>
10  #include <linux/sched/autogroup.h>
11  #include <linux/sched/mm.h>
12  #include <linux/sched/stat.h>
13  #include <linux/sched/task.h>
14  #include <linux/sched/task_stack.h>
15  #include <linux/sched/cputime.h>
16  #include <linux/interrupt.h>
17  #include <linux/module.h>
18  #include <linux/capability.h>
19  #include <linux/completion.h>
20  #include <linux/personality.h>
21  #include <linux/tty.h>
22  #include <linux/iocontext.h>
23  #include <linux/key.h>
24  #include <linux/cpu.h>
25  #include <linux/acct.h>
26  #include <linux/tsacct_kern.h>
27  #include <linux/file.h>
28  #include <linux/fdtable.h>
29  #include <linux/freezer.h>
30  #include <linux/binfmts.h>
31  #include <linux/nsproxy.h>
32  #include <linux/pid_namespace.h>
33  #include <linux/ptrace.h>
34  #include <linux/profile.h>
35  #include <linux/mount.h>
36  #include <linux/proc_fs.h>
37  #include <linux/kthread.h>
38  #include <linux/mempolicy.h>
39  #include <linux/taskstats_kern.h>
40  #include <linux/delayacct.h>
41  #include <linux/cgroup.h>
42  #include <linux/syscalls.h>
43  #include <linux/signal.h>
44  #include <linux/posix-timers.h>
45  #include <linux/cn_proc.h>
46  #include <linux/mutex.h>
47  #include <linux/futex.h>
48  #include <linux/pipe_fs_i.h>
49  #include <linux/audit.h> /* for audit_free() */
50  #include <linux/resource.h>
51  #include <linux/task_io_accounting_ops.h>
52  #include <linux/tracehook.h>
53  #include <linux/fs_struct.h>
54  #include <linux/init_task.h>
55  #include <linux/perf_event.h>
56  #include <trace/events/sched.h>
57  #include <linux/hw_breakpoint.h>
58  #include <linux/oom.h>
59  #include <linux/writeback.h>
60  #include <linux/shm.h>
61  #include <linux/kcov.h>
62  #include <linux/random.h>
63  #include <linux/rcuwait.h>
64  #include <linux/compat.h>
65  #include <linux/io_uring.h>
66  #include <linux/kprobes.h>
67  
68  #include <linux/uaccess.h>
69  #include <asm/unistd.h>
70  #include <asm/mmu_context.h>
71  
72  static void __unhash_process(struct task_struct *p, bool group_dead)
73  {
74  	nr_threads--;
75  	detach_pid(p, PIDTYPE_PID);
76  	if (group_dead) {
77  		detach_pid(p, PIDTYPE_TGID);
78  		detach_pid(p, PIDTYPE_PGID);
79  		detach_pid(p, PIDTYPE_SID);
80  
81  		list_del_rcu(&p->tasks);
82  		list_del_init(&p->sibling);
83  		__this_cpu_dec(process_counts);
84  	}
85  	list_del_rcu(&p->thread_group);
86  	list_del_rcu(&p->thread_node);
87  }
88  
89  /*
90   * This function expects the tasklist_lock write-locked.
91   */
92  static void __exit_signal(struct task_struct *tsk)
93  {
94  	struct signal_struct *sig = tsk->signal;
95  	bool group_dead = thread_group_leader(tsk);
96  	struct sighand_struct *sighand;
97  	struct tty_struct *tty;
98  	u64 utime, stime;
99  
100  	sighand = rcu_dereference_check(tsk->sighand,
101  					lockdep_tasklist_lock_is_held());
102  	spin_lock(&sighand->siglock);
103  
104  #ifdef CONFIG_POSIX_TIMERS
105  	posix_cpu_timers_exit(tsk);
106  	if (group_dead)
107  		posix_cpu_timers_exit_group(tsk);
108  #endif
109  
110  	if (group_dead) {
111  		tty = sig->tty;
112  		sig->tty = NULL;
113  	} else {
114  		/*
115  		 * If there is any task waiting for the group exit
116  		 * then notify it:
117  		 */
118  		if (sig->notify_count > 0 && !--sig->notify_count)
119  			wake_up_process(sig->group_exec_task);
120  
121  		if (tsk == sig->curr_target)
122  			sig->curr_target = next_thread(tsk);
123  	}
124  
125  	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
126  			      sizeof(unsigned long long));
127  
128  	/*
129  	 * Accumulate here the counters for all threads as they die. We could
130  	 * skip the group leader because it is the last user of signal_struct,
131  	 * but we want to avoid the race with thread_group_cputime() which can
132  	 * see the empty ->thread_head list.
133  	 */
134  	task_cputime(tsk, &utime, &stime);
135  	write_seqlock(&sig->stats_lock);
136  	sig->utime += utime;
137  	sig->stime += stime;
138  	sig->gtime += task_gtime(tsk);
139  	sig->min_flt += tsk->min_flt;
140  	sig->maj_flt += tsk->maj_flt;
141  	sig->nvcsw += tsk->nvcsw;
142  	sig->nivcsw += tsk->nivcsw;
143  	sig->inblock += task_io_get_inblock(tsk);
144  	sig->oublock += task_io_get_oublock(tsk);
145  	task_io_accounting_add(&sig->ioac, &tsk->ioac);
146  	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
147  	sig->nr_threads--;
148  	__unhash_process(tsk, group_dead);
149  	write_sequnlock(&sig->stats_lock);
150  
151  	/*
152  	 * Do this under ->siglock, we can race with another thread
153  	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
154  	 */
155  	flush_sigqueue(&tsk->pending);
156  	tsk->sighand = NULL;
157  	spin_unlock(&sighand->siglock);
158  
159  	__cleanup_sighand(sighand);
160  	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
161  	if (group_dead) {
162  		flush_sigqueue(&sig->shared_pending);
163  		tty_kref_put(tty);
164  	}
165  }
166  
167  static void delayed_put_task_struct(struct rcu_head *rhp)
168  {
169  	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
170  
171  	kprobe_flush_task(tsk);
172  	perf_event_delayed_put(tsk);
173  	trace_sched_process_free(tsk);
174  	put_task_struct(tsk);
175  }
176  
177  void put_task_struct_rcu_user(struct task_struct *task)
178  {
179  	if (refcount_dec_and_test(&task->rcu_users))
180  		call_rcu(&task->rcu, delayed_put_task_struct);
181  }
182  
183  void release_task(struct task_struct *p)
184  {
185  	struct task_struct *leader;
186  	struct pid *thread_pid;
187  	int zap_leader;
188  repeat:
189  	/* don't need to get the RCU readlock here - the process is dead and
190  	 * can't be modifying its own credentials. But shut RCU-lockdep up */
191  	rcu_read_lock();
192  	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
193  	rcu_read_unlock();
194  
195  	cgroup_release(p);
196  
197  	write_lock_irq(&tasklist_lock);
198  	ptrace_release_task(p);
199  	thread_pid = get_pid(p->thread_pid);
200  	__exit_signal(p);
201  
202  	/*
203  	 * If we are the last non-leader member of the thread
204  	 * group, and the leader is zombie, then notify the
205  	 * group leader's parent process. (if it wants notification.)
206  	 */
207  	zap_leader = 0;
208  	leader = p->group_leader;
209  	if (leader != p && thread_group_empty(leader)
210  			&& leader->exit_state == EXIT_ZOMBIE) {
211  		/*
212  		 * If we were the last child thread and the leader has
213  		 * exited already, and the leader's parent ignores SIGCHLD,
214  		 * then we are the one who should release the leader.
215  		 */
216  		zap_leader = do_notify_parent(leader, leader->exit_signal);
217  		if (zap_leader)
218  			leader->exit_state = EXIT_DEAD;
219  	}
220  
221  	write_unlock_irq(&tasklist_lock);
222  	seccomp_filter_release(p);
223  	proc_flush_pid(thread_pid);
224  	put_pid(thread_pid);
225  	release_thread(p);
226  	put_task_struct_rcu_user(p);
227  
228  	p = leader;
229  	if (unlikely(zap_leader))
230  		goto repeat;
231  }
232  
233  int rcuwait_wake_up(struct rcuwait *w)
234  {
235  	int ret = 0;
236  	struct task_struct *task;
237  
238  	rcu_read_lock();
239  
240  	/*
241  	 * Order condition vs @task, such that everything prior to the load
242  	 * of @task is visible. This is the condition as to why the user called
243  	 * rcuwait_wake() in the first place. Pairs with set_current_state()
244  	 * barrier (A) in rcuwait_wait_event().
245  	 *
246  	 *    WAIT                WAKE
247  	 *    [S] tsk = current	  [S] cond = true
248  	 *        MB (A)	      MB (B)
249  	 *    [L] cond		  [L] tsk
250  	 */
251  	smp_mb(); /* (B) */
252  
253  	task = rcu_dereference(w->task);
254  	if (task)
255  		ret = wake_up_process(task);
256  	rcu_read_unlock();
257  
258  	return ret;
259  }
260  EXPORT_SYMBOL_GPL(rcuwait_wake_up);
261  
262  /*
263   * Determine if a process group is "orphaned", according to the POSIX
264   * definition in 2.2.2.52.  Orphaned process groups are not to be affected
265   * by terminal-generated stop signals.  Newly orphaned process groups are
266   * to receive a SIGHUP and a SIGCONT.
267   *
268   * "I ask you, have you ever known what it is to be an orphan?"
269   */
270  static int will_become_orphaned_pgrp(struct pid *pgrp,
271  					struct task_struct *ignored_task)
272  {
273  	struct task_struct *p;
274  
275  	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
276  		if ((p == ignored_task) ||
277  		    (p->exit_state && thread_group_empty(p)) ||
278  		    is_global_init(p->real_parent))
279  			continue;
280  
281  		if (task_pgrp(p->real_parent) != pgrp &&
282  		    task_session(p->real_parent) == task_session(p))
283  			return 0;
284  	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
285  
286  	return 1;
287  }
288  
289  int is_current_pgrp_orphaned(void)
290  {
291  	int retval;
292  
293  	read_lock(&tasklist_lock);
294  	retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
295  	read_unlock(&tasklist_lock);
296  
297  	return retval;
298  }
299  
300  static bool has_stopped_jobs(struct pid *pgrp)
301  {
302  	struct task_struct *p;
303  
304  	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
305  		if (p->signal->flags & SIGNAL_STOP_STOPPED)
306  			return true;
307  	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
308  
309  	return false;
310  }
311  
312  /*
313   * Check to see if any process groups have become orphaned as
314   * a result of our exiting, and if they have any stopped jobs,
315   * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
316   */
317  static void
318  kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
319  {
320  	struct pid *pgrp = task_pgrp(tsk);
321  	struct task_struct *ignored_task = tsk;
322  
323  	if (!parent)
324  		/* exit: our father is in a different pgrp than
325  		 * we are and we were the only connection outside.
326  		 */
327  		parent = tsk->real_parent;
328  	else
329  		/* reparent: our child is in a different pgrp than
330  		 * we are, and it was the only connection outside.
331  		 */
332  		ignored_task = NULL;
333  
334  	if (task_pgrp(parent) != pgrp &&
335  	    task_session(parent) == task_session(tsk) &&
336  	    will_become_orphaned_pgrp(pgrp, ignored_task) &&
337  	    has_stopped_jobs(pgrp)) {
338  		__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
339  		__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
340  	}
341  }
342  
343  static void coredump_task_exit(struct task_struct *tsk)
344  {
345  	struct core_state *core_state;
346  
347  	/*
348  	 * Serialize with any possible pending coredump.
349  	 * We must hold siglock around checking core_state
350  	 * and setting PF_POSTCOREDUMP.  The core-inducing thread
351  	 * will increment ->nr_threads for each thread in the
352  	 * group without PF_POSTCOREDUMP set.
353  	 */
354  	spin_lock_irq(&tsk->sighand->siglock);
355  	tsk->flags |= PF_POSTCOREDUMP;
356  	core_state = tsk->signal->core_state;
357  	spin_unlock_irq(&tsk->sighand->siglock);
358  	if (core_state) {
359  		struct core_thread self;
360  
361  		self.task = current;
362  		if (self.task->flags & PF_SIGNALED)
363  			self.next = xchg(&core_state->dumper.next, &self);
364  		else
365  			self.task = NULL;
366  		/*
367  		 * Implies mb(), the result of xchg() must be visible
368  		 * to core_state->dumper.
369  		 */
370  		if (atomic_dec_and_test(&core_state->nr_threads))
371  			complete(&core_state->startup);
372  
373  		for (;;) {
374  			set_current_state(TASK_UNINTERRUPTIBLE);
375  			if (!self.task) /* see coredump_finish() */
376  				break;
377  			freezable_schedule();
378  		}
379  		__set_current_state(TASK_RUNNING);
380  	}
381  }
382  
383  #ifdef CONFIG_MEMCG
384  /*
385   * A task is exiting.   If it owned this mm, find a new owner for the mm.
386   */
387  void mm_update_next_owner(struct mm_struct *mm)
388  {
389  	struct task_struct *c, *g, *p = current;
390  
391  retry:
392  	/*
393  	 * If the exiting or execing task is not the owner, it's
394  	 * someone else's problem.
395  	 */
396  	if (mm->owner != p)
397  		return;
398  	/*
399  	 * The current owner is exiting/execing and there are no other
400  	 * candidates.  Do not leave the mm pointing to a possibly
401  	 * freed task structure.
402  	 */
403  	if (atomic_read(&mm->mm_users) <= 1) {
404  		WRITE_ONCE(mm->owner, NULL);
405  		return;
406  	}
407  
408  	read_lock(&tasklist_lock);
409  	/*
410  	 * Search in the children
411  	 */
412  	list_for_each_entry(c, &p->children, sibling) {
413  		if (c->mm == mm)
414  			goto assign_new_owner;
415  	}
416  
417  	/*
418  	 * Search in the siblings
419  	 */
420  	list_for_each_entry(c, &p->real_parent->children, sibling) {
421  		if (c->mm == mm)
422  			goto assign_new_owner;
423  	}
424  
425  	/*
426  	 * Search through everything else, we should not get here often.
427  	 */
428  	for_each_process(g) {
429  		if (g->flags & PF_KTHREAD)
430  			continue;
431  		for_each_thread(g, c) {
432  			if (c->mm == mm)
433  				goto assign_new_owner;
434  			if (c->mm)
435  				break;
436  		}
437  	}
438  	read_unlock(&tasklist_lock);
439  	/*
440  	 * We found no owner yet mm_users > 1: this implies that we are
441  	 * most likely racing with swapoff (try_to_unuse()) or /proc or
442  	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
443  	 */
444  	WRITE_ONCE(mm->owner, NULL);
445  	return;
446  
447  assign_new_owner:
448  	BUG_ON(c == p);
449  	get_task_struct(c);
450  	/*
451  	 * The task_lock protects c->mm from changing.
452  	 * We always want mm->owner->mm == mm
453  	 */
454  	task_lock(c);
455  	/*
456  	 * Delay read_unlock() till we have the task_lock()
457  	 * to ensure that c does not slip away underneath us
458  	 */
459  	read_unlock(&tasklist_lock);
460  	if (c->mm != mm) {
461  		task_unlock(c);
462  		put_task_struct(c);
463  		goto retry;
464  	}
465  	WRITE_ONCE(mm->owner, c);
466  	task_unlock(c);
467  	put_task_struct(c);
468  }
469  #endif /* CONFIG_MEMCG */
470  
471  /*
472   * Turn us into a lazy TLB process if we
473   * aren't already..
474   */
475  static void exit_mm(void)
476  {
477  	struct mm_struct *mm = current->mm;
478  
479  	exit_mm_release(current, mm);
480  	if (!mm)
481  		return;
482  	sync_mm_rss(mm);
483  	mmap_read_lock(mm);
484  	mmgrab(mm);
485  	BUG_ON(mm != current->active_mm);
486  	/* more a memory barrier than a real lock */
487  	task_lock(current);
488  	/*
489  	 * When a thread stops operating on an address space, the loop
490  	 * in membarrier_private_expedited() may not observe that
491  	 * tsk->mm, and the loop in membarrier_global_expedited() may
492  	 * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
493  	 * rq->membarrier_state, so those would not issue an IPI.
494  	 * Membarrier requires a memory barrier after accessing
495  	 * user-space memory, before clearing tsk->mm or the
496  	 * rq->membarrier_state.
497  	 */
498  	smp_mb__after_spinlock();
499  	local_irq_disable();
500  	current->mm = NULL;
501  	membarrier_update_current_mm(NULL);
502  	enter_lazy_tlb(mm, current);
503  	local_irq_enable();
504  	task_unlock(current);
505  	mmap_read_unlock(mm);
506  	mm_update_next_owner(mm);
507  	mmput(mm);
508  	if (test_thread_flag(TIF_MEMDIE))
509  		exit_oom_victim();
510  }
511  
512  static struct task_struct *find_alive_thread(struct task_struct *p)
513  {
514  	struct task_struct *t;
515  
516  	for_each_thread(p, t) {
517  		if (!(t->flags & PF_EXITING))
518  			return t;
519  	}
520  	return NULL;
521  }
522  
523  static struct task_struct *find_child_reaper(struct task_struct *father,
524  						struct list_head *dead)
525  	__releases(&tasklist_lock)
526  	__acquires(&tasklist_lock)
527  {
528  	struct pid_namespace *pid_ns = task_active_pid_ns(father);
529  	struct task_struct *reaper = pid_ns->child_reaper;
530  	struct task_struct *p, *n;
531  
532  	if (likely(reaper != father))
533  		return reaper;
534  
535  	reaper = find_alive_thread(father);
536  	if (reaper) {
537  		pid_ns->child_reaper = reaper;
538  		return reaper;
539  	}
540  
541  	write_unlock_irq(&tasklist_lock);
542  
543  	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
544  		list_del_init(&p->ptrace_entry);
545  		release_task(p);
546  	}
547  
548  	zap_pid_ns_processes(pid_ns);
549  	write_lock_irq(&tasklist_lock);
550  
551  	return father;
552  }
553  
554  /*
555   * When we die, we re-parent all our children, and try to:
556   * 1. give them to another thread in our thread group, if such a member exists
557   * 2. give it to the first ancestor process which prctl'd itself as a
558   *    child_subreaper for its children (like a service manager)
559   * 3. give it to the init process (PID 1) in our pid namespace
560   */
561  static struct task_struct *find_new_reaper(struct task_struct *father,
562  					   struct task_struct *child_reaper)
563  {
564  	struct task_struct *thread, *reaper;
565  
566  	thread = find_alive_thread(father);
567  	if (thread)
568  		return thread;
569  
570  	if (father->signal->has_child_subreaper) {
571  		unsigned int ns_level = task_pid(father)->level;
572  		/*
573  		 * Find the first ->is_child_subreaper ancestor in our pid_ns.
574  		 * We can't check reaper != child_reaper to ensure we do not
575  		 * cross the namespaces, the exiting parent could be injected
576  		 * by setns() + fork().
577  		 * We check pid->level, this is slightly more efficient than
578  		 * task_active_pid_ns(reaper) != task_active_pid_ns(father).
579  		 */
580  		for (reaper = father->real_parent;
581  		     task_pid(reaper)->level == ns_level;
582  		     reaper = reaper->real_parent) {
583  			if (reaper == &init_task)
584  				break;
585  			if (!reaper->signal->is_child_subreaper)
586  				continue;
587  			thread = find_alive_thread(reaper);
588  			if (thread)
589  				return thread;
590  		}
591  	}
592  
593  	return child_reaper;
594  }
595  
596  /*
597  * Any that need to be release_task'd are put on the @dead list.
598   */
599  static void reparent_leader(struct task_struct *father, struct task_struct *p,
600  				struct list_head *dead)
601  {
602  	if (unlikely(p->exit_state == EXIT_DEAD))
603  		return;
604  
605  	/* We don't want people slaying init. */
606  	p->exit_signal = SIGCHLD;
607  
608  	/* If it has exited notify the new parent about this child's death. */
609  	if (!p->ptrace &&
610  	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
611  		if (do_notify_parent(p, p->exit_signal)) {
612  			p->exit_state = EXIT_DEAD;
613  			list_add(&p->ptrace_entry, dead);
614  		}
615  	}
616  
617  	kill_orphaned_pgrp(p, father);
618  }
619  
620  /*
621   * This does two things:
622   *
623   * A.  Make init inherit all the child processes
624   * B.  Check to see if any process groups have become orphaned
625   *	as a result of our exiting, and if they have any stopped
626   *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
627   */
628  static void forget_original_parent(struct task_struct *father,
629  					struct list_head *dead)
630  {
631  	struct task_struct *p, *t, *reaper;
632  
633  	if (unlikely(!list_empty(&father->ptraced)))
634  		exit_ptrace(father, dead);
635  
636  	/* Can drop and reacquire tasklist_lock */
637  	reaper = find_child_reaper(father, dead);
638  	if (list_empty(&father->children))
639  		return;
640  
641  	reaper = find_new_reaper(father, reaper);
642  	list_for_each_entry(p, &father->children, sibling) {
643  		for_each_thread(p, t) {
644  			RCU_INIT_POINTER(t->real_parent, reaper);
645  			BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
646  			if (likely(!t->ptrace))
647  				t->parent = t->real_parent;
648  			if (t->pdeath_signal)
649  				group_send_sig_info(t->pdeath_signal,
650  						    SEND_SIG_NOINFO, t,
651  						    PIDTYPE_TGID);
652  		}
653  		/*
654  		 * If this is a threaded reparent there is no need to
655  		 * notify anyone anything has happened.
656  		 */
657  		if (!same_thread_group(reaper, father))
658  			reparent_leader(father, p, dead);
659  	}
660  	list_splice_tail_init(&father->children, &reaper->children);
661  }
662  
663  /*
664   * Send signals to all our closest relatives so that they know
665   * to properly mourn us..
666   */
667  static void exit_notify(struct task_struct *tsk, int group_dead)
668  {
669  	bool autoreap;
670  	struct task_struct *p, *n;
671  	LIST_HEAD(dead);
672  
673  	write_lock_irq(&tasklist_lock);
674  	forget_original_parent(tsk, &dead);
675  
676  	if (group_dead)
677  		kill_orphaned_pgrp(tsk->group_leader, NULL);
678  
679  	tsk->exit_state = EXIT_ZOMBIE;
680  	if (unlikely(tsk->ptrace)) {
681  		int sig = thread_group_leader(tsk) &&
682  				thread_group_empty(tsk) &&
683  				!ptrace_reparented(tsk) ?
684  			tsk->exit_signal : SIGCHLD;
685  		autoreap = do_notify_parent(tsk, sig);
686  	} else if (thread_group_leader(tsk)) {
687  		autoreap = thread_group_empty(tsk) &&
688  			do_notify_parent(tsk, tsk->exit_signal);
689  	} else {
690  		autoreap = true;
691  	}
692  
693  	if (autoreap) {
694  		tsk->exit_state = EXIT_DEAD;
695  		list_add(&tsk->ptrace_entry, &dead);
696  	}
697  
698  	/* mt-exec, de_thread() is waiting for group leader */
699  	if (unlikely(tsk->signal->notify_count < 0))
700  		wake_up_process(tsk->signal->group_exec_task);
701  	write_unlock_irq(&tasklist_lock);
702  
703  	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
704  		list_del_init(&p->ptrace_entry);
705  		release_task(p);
706  	}
707  }
708  
709  #ifdef CONFIG_DEBUG_STACK_USAGE
710  static void check_stack_usage(void)
711  {
712  	static DEFINE_SPINLOCK(low_water_lock);
713  	static int lowest_to_date = THREAD_SIZE;
714  	unsigned long free;
715  
716  	free = stack_not_used(current);
717  
718  	if (free >= lowest_to_date)
719  		return;
720  
721  	spin_lock(&low_water_lock);
722  	if (free < lowest_to_date) {
723  		pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
724  			current->comm, task_pid_nr(current), free);
725  		lowest_to_date = free;
726  	}
727  	spin_unlock(&low_water_lock);
728  }
729  #else
730  static inline void check_stack_usage(void) {}
731  #endif
732  
733  void __noreturn do_exit(long code)
734  {
735  	struct task_struct *tsk = current;
736  	int group_dead;
737  
738  	WARN_ON(blk_needs_flush_plug(tsk));
739  
740  	/*
741  	 * If do_dead is called because this processes oopsed, it's possible
742  	 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
743  	 * continuing. Amongst other possible reasons, this is to prevent
744  	 * mm_release()->clear_child_tid() from writing to a user-controlled
745  	 * kernel address.
746  	 *
747  	 * On uptodate architectures force_uaccess_begin is a noop.  On
748  	 * architectures that still have set_fs/get_fs in addition to handling
749  	 * oopses handles kernel threads that run as set_fs(KERNEL_DS) by
750  	 * default.
751  	 */
752  	force_uaccess_begin();
753  
754  	kcov_task_exit(tsk);
755  
756  	coredump_task_exit(tsk);
757  	ptrace_event(PTRACE_EVENT_EXIT, code);
758  
759  	validate_creds_for_do_exit(tsk);
760  
761  	io_uring_files_cancel();
762  	exit_signals(tsk);  /* sets PF_EXITING */
763  
764  	/* sync mm's RSS info before statistics gathering */
765  	if (tsk->mm)
766  		sync_mm_rss(tsk->mm);
767  	acct_update_integrals(tsk);
768  	group_dead = atomic_dec_and_test(&tsk->signal->live);
769  	if (group_dead) {
770  		/*
771  		 * If the last thread of global init has exited, panic
772  		 * immediately to get a useable coredump.
773  		 */
774  		if (unlikely(is_global_init(tsk)))
775  			panic("Attempted to kill init! exitcode=0x%08x\n",
776  				tsk->signal->group_exit_code ?: (int)code);
777  
778  #ifdef CONFIG_POSIX_TIMERS
779  		hrtimer_cancel(&tsk->signal->real_timer);
780  		exit_itimers(tsk->signal);
781  #endif
782  		if (tsk->mm)
783  			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
784  	}
785  	acct_collect(code, group_dead);
786  	if (group_dead)
787  		tty_audit_exit();
788  	audit_free(tsk);
789  
790  	tsk->exit_code = code;
791  	taskstats_exit(tsk, group_dead);
792  
793  	exit_mm();
794  
795  	if (group_dead)
796  		acct_process();
797  	trace_sched_process_exit(tsk);
798  
799  	exit_sem(tsk);
800  	exit_shm(tsk);
801  	exit_files(tsk);
802  	exit_fs(tsk);
803  	if (group_dead)
804  		disassociate_ctty(1);
805  	exit_task_namespaces(tsk);
806  	exit_task_work(tsk);
807  	exit_thread(tsk);
808  
809  	/*
810  	 * Flush inherited counters to the parent - before the parent
811  	 * gets woken up by child-exit notifications.
812  	 *
813  	 * because of cgroup mode, must be called before cgroup_exit()
814  	 */
815  	perf_event_exit_task(tsk);
816  
817  	sched_autogroup_exit_task(tsk);
818  	cgroup_exit(tsk);
819  
820  	/*
821  	 * FIXME: do that only when needed, using sched_exit tracepoint
822  	 */
823  	flush_ptrace_hw_breakpoint(tsk);
824  
825  	exit_tasks_rcu_start();
826  	exit_notify(tsk, group_dead);
827  	proc_exit_connector(tsk);
828  	mpol_put_task_policy(tsk);
829  #ifdef CONFIG_FUTEX
830  	if (unlikely(current->pi_state_cache))
831  		kfree(current->pi_state_cache);
832  #endif
833  	/*
834  	 * Make sure we are holding no locks:
835  	 */
836  	debug_check_no_locks_held();
837  
838  	if (tsk->io_context)
839  		exit_io_context(tsk);
840  
841  	if (tsk->splice_pipe)
842  		free_pipe_info(tsk->splice_pipe);
843  
844  	if (tsk->task_frag.page)
845  		put_page(tsk->task_frag.page);
846  
847  	validate_creds_for_do_exit(tsk);
848  
849  	check_stack_usage();
850  	preempt_disable();
851  	if (tsk->nr_dirtied)
852  		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
853  	exit_rcu();
854  	exit_tasks_rcu_finish();
855  
856  	lockdep_free_task(tsk);
857  	do_task_dead();
858  }
859  
860  void __noreturn make_task_dead(int signr)
861  {
862  	/*
863  	 * Take the task off the cpu after something catastrophic has
864  	 * happened.
865  	 *
866  	 * We can get here from a kernel oops, sometimes with preemption off.
867  	 * Start by checking for critical errors.
868  	 * Then fix up important state like USER_DS and preemption.
869  	 * Then do everything else.
870  	 */
871  	struct task_struct *tsk = current;
872  
873  	if (unlikely(in_interrupt()))
874  		panic("Aiee, killing interrupt handler!");
875  	if (unlikely(!tsk->pid))
876  		panic("Attempted to kill the idle task!");
877  
878  	if (unlikely(in_atomic())) {
879  		pr_info("note: %s[%d] exited with preempt_count %d\n",
880  			current->comm, task_pid_nr(current),
881  			preempt_count());
882  		preempt_count_set(PREEMPT_ENABLED);
883  	}
884  
885  	/*
886  	 * We're taking recursive faults here in make_task_dead. Safest is to just
887  	 * leave this task alone and wait for reboot.
888  	 */
889  	if (unlikely(tsk->flags & PF_EXITING)) {
890  		pr_alert("Fixing recursive fault but reboot is needed!\n");
891  		futex_exit_recursive(tsk);
892  		tsk->exit_state = EXIT_DEAD;
893  		refcount_inc(&tsk->rcu_users);
894  		do_task_dead();
895  	}
896  
897  	do_exit(signr);
898  }
899  
900  SYSCALL_DEFINE1(exit, int, error_code)
901  {
902  	do_exit((error_code&0xff)<<8);
903  }
904  
905  /*
906   * Take down every thread in the group.  This is called by fatal signals
907   * as well as by sys_exit_group (below).
908   */
909  void
910  do_group_exit(int exit_code)
911  {
912  	struct signal_struct *sig = current->signal;
913  
914  	if (sig->flags & SIGNAL_GROUP_EXIT)
915  		exit_code = sig->group_exit_code;
916  	else if (sig->group_exec_task)
917  		exit_code = 0;
918  	else if (!thread_group_empty(current)) {
919  		struct sighand_struct *const sighand = current->sighand;
920  
921  		spin_lock_irq(&sighand->siglock);
922  		if (sig->flags & SIGNAL_GROUP_EXIT)
923  			/* Another thread got here before we took the lock.  */
924  			exit_code = sig->group_exit_code;
925  		else if (sig->group_exec_task)
926  			exit_code = 0;
927  		else {
928  			sig->group_exit_code = exit_code;
929  			sig->flags = SIGNAL_GROUP_EXIT;
930  			zap_other_threads(current);
931  		}
932  		spin_unlock_irq(&sighand->siglock);
933  	}
934  
935  	do_exit(exit_code);
936  	/* NOTREACHED */
937  }
938  
939  /*
940   * this kills every thread in the thread group. Note that any externally
941   * wait4()-ing process will get the correct exit code - even if this
942   * thread is not the thread group leader.
943   */
944  SYSCALL_DEFINE1(exit_group, int, error_code)
945  {
946  	do_group_exit((error_code & 0xff) << 8);
947  	/* NOTREACHED */
948  	return 0;
949  }
950  
951  struct waitid_info {
952  	pid_t pid;
953  	uid_t uid;
954  	int status;
955  	int cause;
956  };
957  
958  struct wait_opts {
959  	enum pid_type		wo_type;
960  	int			wo_flags;
961  	struct pid		*wo_pid;
962  
963  	struct waitid_info	*wo_info;
964  	int			wo_stat;
965  	struct rusage		*wo_rusage;
966  
967  	wait_queue_entry_t		child_wait;
968  	int			notask_error;
969  };
970  
971  static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
972  {
973  	return	wo->wo_type == PIDTYPE_MAX ||
974  		task_pid_type(p, wo->wo_type) == wo->wo_pid;
975  }
976  
977  static int
978  eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
979  {
980  	if (!eligible_pid(wo, p))
981  		return 0;
982  
983  	/*
984  	 * Wait for all children (clone and not) if __WALL is set or
985  	 * if it is traced by us.
986  	 */
987  	if (ptrace || (wo->wo_flags & __WALL))
988  		return 1;
989  
990  	/*
991  	 * Otherwise, wait for clone children *only* if __WCLONE is set;
992  	 * otherwise, wait for non-clone children *only*.
993  	 *
994  	 * Note: a "clone" child here is one that reports to its parent
995  	 * using a signal other than SIGCHLD, or a non-leader thread which
996  	 * we can only see if it is traced by us.
997  	 */
998  	if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
999  		return 0;
1000  
1001  	return 1;
1002  }
1003  
1004  /*
1005   * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
1006   * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
1007   * the lock and this task is uninteresting.  If we return nonzero, we have
1008   * released the lock and the system call should return.
1009   */
1010  static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1011  {
1012  	int state, status;
1013  	pid_t pid = task_pid_vnr(p);
1014  	uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
1015  	struct waitid_info *infop;
1016  
1017  	if (!likely(wo->wo_flags & WEXITED))
1018  		return 0;
1019  
1020  	if (unlikely(wo->wo_flags & WNOWAIT)) {
1021  		status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1022  			? p->signal->group_exit_code : p->exit_code;
1023  		get_task_struct(p);
1024  		read_unlock(&tasklist_lock);
1025  		sched_annotate_sleep();
1026  		if (wo->wo_rusage)
1027  			getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1028  		put_task_struct(p);
1029  		goto out_info;
1030  	}
1031  	/*
1032  	 * Move the task's state to DEAD/TRACE, only one thread can do this.
1033  	 */
1034  	state = (ptrace_reparented(p) && thread_group_leader(p)) ?
1035  		EXIT_TRACE : EXIT_DEAD;
1036  	if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1037  		return 0;
1038  	/*
1039  	 * We own this thread, nobody else can reap it.
1040  	 */
1041  	read_unlock(&tasklist_lock);
1042  	sched_annotate_sleep();
1043  
1044  	/*
1045  	 * Check thread_group_leader() to exclude the traced sub-threads.
1046  	 */
1047  	if (state == EXIT_DEAD && thread_group_leader(p)) {
1048  		struct signal_struct *sig = p->signal;
1049  		struct signal_struct *psig = current->signal;
1050  		unsigned long maxrss;
1051  		u64 tgutime, tgstime;
1052  
1053  		/*
1054  		 * The resource counters for the group leader are in its
1055  		 * own task_struct.  Those for dead threads in the group
1056  		 * are in its signal_struct, as are those for the child
1057  		 * processes it has previously reaped.  All these
1058  		 * accumulate in the parent's signal_struct c* fields.
1059  		 *
1060  		 * We don't bother to take a lock here to protect these
1061  		 * p->signal fields because the whole thread group is dead
1062  		 * and nobody can change them.
1063  		 *
1064  		 * psig->stats_lock also protects us from our sub-theads
1065  		 * which can reap other children at the same time. Until
1066  		 * we change k_getrusage()-like users to rely on this lock
1067  		 * we have to take ->siglock as well.
1068  		 *
1069  		 * We use thread_group_cputime_adjusted() to get times for
1070  		 * the thread group, which consolidates times for all threads
1071  		 * in the group including the group leader.
1072  		 */
1073  		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1074  		spin_lock_irq(&current->sighand->siglock);
1075  		write_seqlock(&psig->stats_lock);
1076  		psig->cutime += tgutime + sig->cutime;
1077  		psig->cstime += tgstime + sig->cstime;
1078  		psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
1079  		psig->cmin_flt +=
1080  			p->min_flt + sig->min_flt + sig->cmin_flt;
1081  		psig->cmaj_flt +=
1082  			p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1083  		psig->cnvcsw +=
1084  			p->nvcsw + sig->nvcsw + sig->cnvcsw;
1085  		psig->cnivcsw +=
1086  			p->nivcsw + sig->nivcsw + sig->cnivcsw;
1087  		psig->cinblock +=
1088  			task_io_get_inblock(p) +
1089  			sig->inblock + sig->cinblock;
1090  		psig->coublock +=
1091  			task_io_get_oublock(p) +
1092  			sig->oublock + sig->coublock;
1093  		maxrss = max(sig->maxrss, sig->cmaxrss);
1094  		if (psig->cmaxrss < maxrss)
1095  			psig->cmaxrss = maxrss;
1096  		task_io_accounting_add(&psig->ioac, &p->ioac);
1097  		task_io_accounting_add(&psig->ioac, &sig->ioac);
1098  		write_sequnlock(&psig->stats_lock);
1099  		spin_unlock_irq(&current->sighand->siglock);
1100  	}
1101  
1102  	if (wo->wo_rusage)
1103  		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1104  	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1105  		? p->signal->group_exit_code : p->exit_code;
1106  	wo->wo_stat = status;
1107  
1108  	if (state == EXIT_TRACE) {
1109  		write_lock_irq(&tasklist_lock);
1110  		/* We dropped tasklist, ptracer could die and untrace */
1111  		ptrace_unlink(p);
1112  
1113  		/* If parent wants a zombie, don't release it now */
1114  		state = EXIT_ZOMBIE;
1115  		if (do_notify_parent(p, p->exit_signal))
1116  			state = EXIT_DEAD;
1117  		p->exit_state = state;
1118  		write_unlock_irq(&tasklist_lock);
1119  	}
1120  	if (state == EXIT_DEAD)
1121  		release_task(p);
1122  
1123  out_info:
1124  	infop = wo->wo_info;
1125  	if (infop) {
1126  		if ((status & 0x7f) == 0) {
1127  			infop->cause = CLD_EXITED;
1128  			infop->status = status >> 8;
1129  		} else {
1130  			infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1131  			infop->status = status & 0x7f;
1132  		}
1133  		infop->pid = pid;
1134  		infop->uid = uid;
1135  	}
1136  
1137  	return pid;
1138  }
1139  
1140  static int *task_stopped_code(struct task_struct *p, bool ptrace)
1141  {
1142  	if (ptrace) {
1143  		if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
1144  			return &p->exit_code;
1145  	} else {
1146  		if (p->signal->flags & SIGNAL_STOP_STOPPED)
1147  			return &p->signal->group_exit_code;
1148  	}
1149  	return NULL;
1150  }
1151  
1152  /**
1153   * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1154   * @wo: wait options
1155   * @ptrace: is the wait for ptrace
1156   * @p: task to wait for
1157   *
1158   * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1159   *
1160   * CONTEXT:
1161   * read_lock(&tasklist_lock), which is released if return value is
1162   * non-zero.  Also, grabs and releases @p->sighand->siglock.
1163   *
1164   * RETURNS:
1165   * 0 if wait condition didn't exist and search for other wait conditions
1166   * should continue.  Non-zero return, -errno on failure and @p's pid on
1167   * success, implies that tasklist_lock is released and wait condition
1168   * search should terminate.
1169   */
1170  static int wait_task_stopped(struct wait_opts *wo,
1171  				int ptrace, struct task_struct *p)
1172  {
1173  	struct waitid_info *infop;
1174  	int exit_code, *p_code, why;
1175  	uid_t uid = 0; /* unneeded, required by compiler */
1176  	pid_t pid;
1177  
1178  	/*
1179  	 * Traditionally we see ptrace'd stopped tasks regardless of options.
1180  	 */
1181  	if (!ptrace && !(wo->wo_flags & WUNTRACED))
1182  		return 0;
1183  
1184  	if (!task_stopped_code(p, ptrace))
1185  		return 0;
1186  
1187  	exit_code = 0;
1188  	spin_lock_irq(&p->sighand->siglock);
1189  
1190  	p_code = task_stopped_code(p, ptrace);
1191  	if (unlikely(!p_code))
1192  		goto unlock_sig;
1193  
1194  	exit_code = *p_code;
1195  	if (!exit_code)
1196  		goto unlock_sig;
1197  
1198  	if (!unlikely(wo->wo_flags & WNOWAIT))
1199  		*p_code = 0;
1200  
1201  	uid = from_kuid_munged(current_user_ns(), task_uid(p));
1202  unlock_sig:
1203  	spin_unlock_irq(&p->sighand->siglock);
1204  	if (!exit_code)
1205  		return 0;
1206  
1207  	/*
1208  	 * Now we are pretty sure this task is interesting.
1209  	 * Make sure it doesn't get reaped out from under us while we
1210  	 * give up the lock and then examine it below.  We don't want to
1211  	 * keep holding onto the tasklist_lock while we call getrusage and
1212  	 * possibly take page faults for user memory.
1213  	 */
1214  	get_task_struct(p);
1215  	pid = task_pid_vnr(p);
1216  	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1217  	read_unlock(&tasklist_lock);
1218  	sched_annotate_sleep();
1219  	if (wo->wo_rusage)
1220  		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1221  	put_task_struct(p);
1222  
1223  	if (likely(!(wo->wo_flags & WNOWAIT)))
1224  		wo->wo_stat = (exit_code << 8) | 0x7f;
1225  
1226  	infop = wo->wo_info;
1227  	if (infop) {
1228  		infop->cause = why;
1229  		infop->status = exit_code;
1230  		infop->pid = pid;
1231  		infop->uid = uid;
1232  	}
1233  	return pid;
1234  }
1235  
1236  /*
1237   * Handle do_wait work for one task in a live, non-stopped state.
1238   * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
1239   * the lock and this task is uninteresting.  If we return nonzero, we have
1240   * released the lock and the system call should return.
1241   */
1242  static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1243  {
1244  	struct waitid_info *infop;
1245  	pid_t pid;
1246  	uid_t uid;
1247  
1248  	if (!unlikely(wo->wo_flags & WCONTINUED))
1249  		return 0;
1250  
1251  	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1252  		return 0;
1253  
1254  	spin_lock_irq(&p->sighand->siglock);
1255  	/* Re-check with the lock held.  */
1256  	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1257  		spin_unlock_irq(&p->sighand->siglock);
1258  		return 0;
1259  	}
1260  	if (!unlikely(wo->wo_flags & WNOWAIT))
1261  		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1262  	uid = from_kuid_munged(current_user_ns(), task_uid(p));
1263  	spin_unlock_irq(&p->sighand->siglock);
1264  
1265  	pid = task_pid_vnr(p);
1266  	get_task_struct(p);
1267  	read_unlock(&tasklist_lock);
1268  	sched_annotate_sleep();
1269  	if (wo->wo_rusage)
1270  		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
1271  	put_task_struct(p);
1272  
1273  	infop = wo->wo_info;
1274  	if (!infop) {
1275  		wo->wo_stat = 0xffff;
1276  	} else {
1277  		infop->cause = CLD_CONTINUED;
1278  		infop->pid = pid;
1279  		infop->uid = uid;
1280  		infop->status = SIGCONT;
1281  	}
1282  	return pid;
1283  }
1284  
1285  /*
1286   * Consider @p for a wait by @parent.
1287   *
1288   * -ECHILD should be in ->notask_error before the first call.
1289   * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1290   * Returns zero if the search for a child should continue;
1291   * then ->notask_error is 0 if @p is an eligible child,
1292   * or still -ECHILD.
1293   */
1294  static int wait_consider_task(struct wait_opts *wo, int ptrace,
1295  				struct task_struct *p)
1296  {
1297  	/*
1298  	 * We can race with wait_task_zombie() from another thread.
1299  	 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1300  	 * can't confuse the checks below.
1301  	 */
1302  	int exit_state = READ_ONCE(p->exit_state);
1303  	int ret;
1304  
1305  	if (unlikely(exit_state == EXIT_DEAD))
1306  		return 0;
1307  
1308  	ret = eligible_child(wo, ptrace, p);
1309  	if (!ret)
1310  		return ret;
1311  
1312  	if (unlikely(exit_state == EXIT_TRACE)) {
1313  		/*
1314  		 * ptrace == 0 means we are the natural parent. In this case
1315  		 * we should clear notask_error, debugger will notify us.
1316  		 */
1317  		if (likely(!ptrace))
1318  			wo->notask_error = 0;
1319  		return 0;
1320  	}
1321  
1322  	if (likely(!ptrace) && unlikely(p->ptrace)) {
1323  		/*
1324  		 * If it is traced by its real parent's group, just pretend
1325  		 * the caller is ptrace_do_wait() and reap this child if it
1326  		 * is zombie.
1327  		 *
1328  		 * This also hides group stop state from real parent; otherwise
1329  		 * a single stop can be reported twice as group and ptrace stop.
1330  		 * If a ptracer wants to distinguish these two events for its
1331  		 * own children it should create a separate process which takes
1332  		 * the role of real parent.
1333  		 */
1334  		if (!ptrace_reparented(p))
1335  			ptrace = 1;
1336  	}
1337  
1338  	/* slay zombie? */
1339  	if (exit_state == EXIT_ZOMBIE) {
1340  		/* we don't reap group leaders with subthreads */
1341  		if (!delay_group_leader(p)) {
1342  			/*
1343  			 * A zombie ptracee is only visible to its ptracer.
1344  			 * Notification and reaping will be cascaded to the
1345  			 * real parent when the ptracer detaches.
1346  			 */
1347  			if (unlikely(ptrace) || likely(!p->ptrace))
1348  				return wait_task_zombie(wo, p);
1349  		}
1350  
1351  		/*
1352  		 * Allow access to stopped/continued state via zombie by
1353  		 * falling through.  Clearing of notask_error is complex.
1354  		 *
1355  		 * When !@ptrace:
1356  		 *
1357  		 * If WEXITED is set, notask_error should naturally be
1358  		 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
1359  		 * so, if there are live subthreads, there are events to
1360  		 * wait for.  If all subthreads are dead, it's still safe
1361  		 * to clear - this function will be called again in finite
1362  		 * amount time once all the subthreads are released and
1363  		 * will then return without clearing.
1364  		 *
1365  		 * When @ptrace:
1366  		 *
1367  		 * Stopped state is per-task and thus can't change once the
1368  		 * target task dies.  Only continued and exited can happen.
1369  		 * Clear notask_error if WCONTINUED | WEXITED.
1370  		 */
1371  		if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1372  			wo->notask_error = 0;
1373  	} else {
1374  		/*
1375  		 * @p is alive and it's gonna stop, continue or exit, so
1376  		 * there always is something to wait for.
1377  		 */
1378  		wo->notask_error = 0;
1379  	}
1380  
1381  	/*
1382  	 * Wait for stopped.  Depending on @ptrace, different stopped state
1383  	 * is used and the two don't interact with each other.
1384  	 */
1385  	ret = wait_task_stopped(wo, ptrace, p);
1386  	if (ret)
1387  		return ret;
1388  
1389  	/*
1390  	 * Wait for continued.  There's only one continued state and the
1391  	 * ptracer can consume it which can confuse the real parent.  Don't
1392  	 * use WCONTINUED from ptracer.  You don't need or want it.
1393  	 */
1394  	return wait_task_continued(wo, p);
1395  }
1396  
1397  /*
1398   * Do the work of do_wait() for one thread in the group, @tsk.
1399   *
1400   * -ECHILD should be in ->notask_error before the first call.
1401   * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1402   * Returns zero if the search for a child should continue; then
1403   * ->notask_error is 0 if there were any eligible children,
1404   * or still -ECHILD.
1405   */
1406  static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1407  {
1408  	struct task_struct *p;
1409  
1410  	list_for_each_entry(p, &tsk->children, sibling) {
1411  		int ret = wait_consider_task(wo, 0, p);
1412  
1413  		if (ret)
1414  			return ret;
1415  	}
1416  
1417  	return 0;
1418  }
1419  
1420  static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1421  {
1422  	struct task_struct *p;
1423  
1424  	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1425  		int ret = wait_consider_task(wo, 1, p);
1426  
1427  		if (ret)
1428  			return ret;
1429  	}
1430  
1431  	return 0;
1432  }
1433  
1434  static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
1435  				int sync, void *key)
1436  {
1437  	struct wait_opts *wo = container_of(wait, struct wait_opts,
1438  						child_wait);
1439  	struct task_struct *p = key;
1440  
1441  	if (!eligible_pid(wo, p))
1442  		return 0;
1443  
1444  	if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1445  		return 0;
1446  
1447  	return default_wake_function(wait, mode, sync, key);
1448  }
1449  
1450  void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1451  {
1452  	__wake_up_sync_key(&parent->signal->wait_chldexit,
1453  			   TASK_INTERRUPTIBLE, p);
1454  }
1455  
1456  static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
1457  				 struct task_struct *target)
1458  {
1459  	struct task_struct *parent =
1460  		!ptrace ? target->real_parent : target->parent;
1461  
1462  	return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
1463  				     same_thread_group(current, parent));
1464  }
1465  
1466  /*
1467   * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
1468   * and tracee lists to find the target task.
1469   */
1470  static int do_wait_pid(struct wait_opts *wo)
1471  {
1472  	bool ptrace;
1473  	struct task_struct *target;
1474  	int retval;
1475  
1476  	ptrace = false;
1477  	target = pid_task(wo->wo_pid, PIDTYPE_TGID);
1478  	if (target && is_effectively_child(wo, ptrace, target)) {
1479  		retval = wait_consider_task(wo, ptrace, target);
1480  		if (retval)
1481  			return retval;
1482  	}
1483  
1484  	ptrace = true;
1485  	target = pid_task(wo->wo_pid, PIDTYPE_PID);
1486  	if (target && target->ptrace &&
1487  	    is_effectively_child(wo, ptrace, target)) {
1488  		retval = wait_consider_task(wo, ptrace, target);
1489  		if (retval)
1490  			return retval;
1491  	}
1492  
1493  	return 0;
1494  }
1495  
1496  static long do_wait(struct wait_opts *wo)
1497  {
1498  	int retval;
1499  
1500  	trace_sched_process_wait(wo->wo_pid);
1501  
1502  	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1503  	wo->child_wait.private = current;
1504  	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1505  repeat:
1506  	/*
1507  	 * If there is nothing that can match our criteria, just get out.
1508  	 * We will clear ->notask_error to zero if we see any child that
1509  	 * might later match our criteria, even if we are not able to reap
1510  	 * it yet.
1511  	 */
1512  	wo->notask_error = -ECHILD;
1513  	if ((wo->wo_type < PIDTYPE_MAX) &&
1514  	   (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
1515  		goto notask;
1516  
1517  	set_current_state(TASK_INTERRUPTIBLE);
1518  	read_lock(&tasklist_lock);
1519  
1520  	if (wo->wo_type == PIDTYPE_PID) {
1521  		retval = do_wait_pid(wo);
1522  		if (retval)
1523  			goto end;
1524  	} else {
1525  		struct task_struct *tsk = current;
1526  
1527  		do {
1528  			retval = do_wait_thread(wo, tsk);
1529  			if (retval)
1530  				goto end;
1531  
1532  			retval = ptrace_do_wait(wo, tsk);
1533  			if (retval)
1534  				goto end;
1535  
1536  			if (wo->wo_flags & __WNOTHREAD)
1537  				break;
1538  		} while_each_thread(current, tsk);
1539  	}
1540  	read_unlock(&tasklist_lock);
1541  
1542  notask:
1543  	retval = wo->notask_error;
1544  	if (!retval && !(wo->wo_flags & WNOHANG)) {
1545  		retval = -ERESTARTSYS;
1546  		if (!signal_pending(current)) {
1547  			schedule();
1548  			goto repeat;
1549  		}
1550  	}
1551  end:
1552  	__set_current_state(TASK_RUNNING);
1553  	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1554  	return retval;
1555  }
1556  
1557  static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
1558  			  int options, struct rusage *ru)
1559  {
1560  	struct wait_opts wo;
1561  	struct pid *pid = NULL;
1562  	enum pid_type type;
1563  	long ret;
1564  	unsigned int f_flags = 0;
1565  
1566  	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
1567  			__WNOTHREAD|__WCLONE|__WALL))
1568  		return -EINVAL;
1569  	if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1570  		return -EINVAL;
1571  
1572  	switch (which) {
1573  	case P_ALL:
1574  		type = PIDTYPE_MAX;
1575  		break;
1576  	case P_PID:
1577  		type = PIDTYPE_PID;
1578  		if (upid <= 0)
1579  			return -EINVAL;
1580  
1581  		pid = find_get_pid(upid);
1582  		break;
1583  	case P_PGID:
1584  		type = PIDTYPE_PGID;
1585  		if (upid < 0)
1586  			return -EINVAL;
1587  
1588  		if (upid)
1589  			pid = find_get_pid(upid);
1590  		else
1591  			pid = get_task_pid(current, PIDTYPE_PGID);
1592  		break;
1593  	case P_PIDFD:
1594  		type = PIDTYPE_PID;
1595  		if (upid < 0)
1596  			return -EINVAL;
1597  
1598  		pid = pidfd_get_pid(upid, &f_flags);
1599  		if (IS_ERR(pid))
1600  			return PTR_ERR(pid);
1601  
1602  		break;
1603  	default:
1604  		return -EINVAL;
1605  	}
1606  
1607  	wo.wo_type	= type;
1608  	wo.wo_pid	= pid;
1609  	wo.wo_flags	= options;
1610  	wo.wo_info	= infop;
1611  	wo.wo_rusage	= ru;
1612  	if (f_flags & O_NONBLOCK)
1613  		wo.wo_flags |= WNOHANG;
1614  
1615  	ret = do_wait(&wo);
1616  	if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
1617  		ret = -EAGAIN;
1618  
1619  	put_pid(pid);
1620  	return ret;
1621  }
1622  
1623  SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1624  		infop, int, options, struct rusage __user *, ru)
1625  {
1626  	struct rusage r;
1627  	struct waitid_info info = {.status = 0};
1628  	long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
1629  	int signo = 0;
1630  
1631  	if (err > 0) {
1632  		signo = SIGCHLD;
1633  		err = 0;
1634  		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1635  			return -EFAULT;
1636  	}
1637  	if (!infop)
1638  		return err;
1639  
1640  	if (!user_write_access_begin(infop, sizeof(*infop)))
1641  		return -EFAULT;
1642  
1643  	unsafe_put_user(signo, &infop->si_signo, Efault);
1644  	unsafe_put_user(0, &infop->si_errno, Efault);
1645  	unsafe_put_user(info.cause, &infop->si_code, Efault);
1646  	unsafe_put_user(info.pid, &infop->si_pid, Efault);
1647  	unsafe_put_user(info.uid, &infop->si_uid, Efault);
1648  	unsafe_put_user(info.status, &infop->si_status, Efault);
1649  	user_write_access_end();
1650  	return err;
1651  Efault:
1652  	user_write_access_end();
1653  	return -EFAULT;
1654  }
1655  
1656  long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
1657  		  struct rusage *ru)
1658  {
1659  	struct wait_opts wo;
1660  	struct pid *pid = NULL;
1661  	enum pid_type type;
1662  	long ret;
1663  
1664  	if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1665  			__WNOTHREAD|__WCLONE|__WALL))
1666  		return -EINVAL;
1667  
1668  	/* -INT_MIN is not defined */
1669  	if (upid == INT_MIN)
1670  		return -ESRCH;
1671  
1672  	if (upid == -1)
1673  		type = PIDTYPE_MAX;
1674  	else if (upid < 0) {
1675  		type = PIDTYPE_PGID;
1676  		pid = find_get_pid(-upid);
1677  	} else if (upid == 0) {
1678  		type = PIDTYPE_PGID;
1679  		pid = get_task_pid(current, PIDTYPE_PGID);
1680  	} else /* upid > 0 */ {
1681  		type = PIDTYPE_PID;
1682  		pid = find_get_pid(upid);
1683  	}
1684  
1685  	wo.wo_type	= type;
1686  	wo.wo_pid	= pid;
1687  	wo.wo_flags	= options | WEXITED;
1688  	wo.wo_info	= NULL;
1689  	wo.wo_stat	= 0;
1690  	wo.wo_rusage	= ru;
1691  	ret = do_wait(&wo);
1692  	put_pid(pid);
1693  	if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
1694  		ret = -EFAULT;
1695  
1696  	return ret;
1697  }
1698  
1699  int kernel_wait(pid_t pid, int *stat)
1700  {
1701  	struct wait_opts wo = {
1702  		.wo_type	= PIDTYPE_PID,
1703  		.wo_pid		= find_get_pid(pid),
1704  		.wo_flags	= WEXITED,
1705  	};
1706  	int ret;
1707  
1708  	ret = do_wait(&wo);
1709  	if (ret > 0 && wo.wo_stat)
1710  		*stat = wo.wo_stat;
1711  	put_pid(wo.wo_pid);
1712  	return ret;
1713  }
1714  
1715  SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1716  		int, options, struct rusage __user *, ru)
1717  {
1718  	struct rusage r;
1719  	long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
1720  
1721  	if (err > 0) {
1722  		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
1723  			return -EFAULT;
1724  	}
1725  	return err;
1726  }
1727  
1728  #ifdef __ARCH_WANT_SYS_WAITPID
1729  
1730  /*
1731   * sys_waitpid() remains for compatibility. waitpid() should be
1732   * implemented by calling sys_wait4() from libc.a.
1733   */
1734  SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1735  {
1736  	return kernel_wait4(pid, stat_addr, options, NULL);
1737  }
1738  
1739  #endif
1740  
1741  #ifdef CONFIG_COMPAT
1742  COMPAT_SYSCALL_DEFINE4(wait4,
1743  	compat_pid_t, pid,
1744  	compat_uint_t __user *, stat_addr,
1745  	int, options,
1746  	struct compat_rusage __user *, ru)
1747  {
1748  	struct rusage r;
1749  	long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
1750  	if (err > 0) {
1751  		if (ru && put_compat_rusage(&r, ru))
1752  			return -EFAULT;
1753  	}
1754  	return err;
1755  }
1756  
1757  COMPAT_SYSCALL_DEFINE5(waitid,
1758  		int, which, compat_pid_t, pid,
1759  		struct compat_siginfo __user *, infop, int, options,
1760  		struct compat_rusage __user *, uru)
1761  {
1762  	struct rusage ru;
1763  	struct waitid_info info = {.status = 0};
1764  	long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
1765  	int signo = 0;
1766  	if (err > 0) {
1767  		signo = SIGCHLD;
1768  		err = 0;
1769  		if (uru) {
1770  			/* kernel_waitid() overwrites everything in ru */
1771  			if (COMPAT_USE_64BIT_TIME)
1772  				err = copy_to_user(uru, &ru, sizeof(ru));
1773  			else
1774  				err = put_compat_rusage(&ru, uru);
1775  			if (err)
1776  				return -EFAULT;
1777  		}
1778  	}
1779  
1780  	if (!infop)
1781  		return err;
1782  
1783  	if (!user_write_access_begin(infop, sizeof(*infop)))
1784  		return -EFAULT;
1785  
1786  	unsafe_put_user(signo, &infop->si_signo, Efault);
1787  	unsafe_put_user(0, &infop->si_errno, Efault);
1788  	unsafe_put_user(info.cause, &infop->si_code, Efault);
1789  	unsafe_put_user(info.pid, &infop->si_pid, Efault);
1790  	unsafe_put_user(info.uid, &infop->si_uid, Efault);
1791  	unsafe_put_user(info.status, &infop->si_status, Efault);
1792  	user_write_access_end();
1793  	return err;
1794  Efault:
1795  	user_write_access_end();
1796  	return -EFAULT;
1797  }
1798  #endif
1799  
1800  /**
1801   * thread_group_exited - check that a thread group has exited
1802   * @pid: tgid of thread group to be checked.
1803   *
1804   * Test if the thread group represented by tgid has exited (all
1805   * threads are zombies, dead or completely gone).
1806   *
1807   * Return: true if the thread group has exited. false otherwise.
1808   */
1809  bool thread_group_exited(struct pid *pid)
1810  {
1811  	struct task_struct *task;
1812  	bool exited;
1813  
1814  	rcu_read_lock();
1815  	task = pid_task(pid, PIDTYPE_PID);
1816  	exited = !task ||
1817  		(READ_ONCE(task->exit_state) && thread_group_empty(task));
1818  	rcu_read_unlock();
1819  
1820  	return exited;
1821  }
1822  EXPORT_SYMBOL(thread_group_exited);
1823  
1824  __weak void abort(void)
1825  {
1826  	BUG();
1827  
1828  	/* if that doesn't kill us, halt */
1829  	panic("Oops failed to kill thread");
1830  }
1831  EXPORT_SYMBOL(abort);
1832