xref: /linux/kernel/exit.c (revision 3e85fd614c7b6bb7f33bb04a0dcb5a3bfca4c0fe)
1  /*
2   *  linux/kernel/exit.c
3   *
4   *  Copyright (C) 1991, 1992  Linus Torvalds
5   */
6  
7  #include <linux/mm.h>
8  #include <linux/slab.h>
9  #include <linux/interrupt.h>
10  #include <linux/module.h>
11  #include <linux/capability.h>
12  #include <linux/completion.h>
13  #include <linux/personality.h>
14  #include <linux/tty.h>
15  #include <linux/iocontext.h>
16  #include <linux/key.h>
17  #include <linux/security.h>
18  #include <linux/cpu.h>
19  #include <linux/acct.h>
20  #include <linux/tsacct_kern.h>
21  #include <linux/file.h>
22  #include <linux/fdtable.h>
23  #include <linux/binfmts.h>
24  #include <linux/nsproxy.h>
25  #include <linux/pid_namespace.h>
26  #include <linux/ptrace.h>
27  #include <linux/profile.h>
28  #include <linux/mount.h>
29  #include <linux/proc_fs.h>
30  #include <linux/kthread.h>
31  #include <linux/mempolicy.h>
32  #include <linux/taskstats_kern.h>
33  #include <linux/delayacct.h>
34  #include <linux/freezer.h>
35  #include <linux/cgroup.h>
36  #include <linux/syscalls.h>
37  #include <linux/signal.h>
38  #include <linux/posix-timers.h>
39  #include <linux/cn_proc.h>
40  #include <linux/mutex.h>
41  #include <linux/futex.h>
42  #include <linux/pipe_fs_i.h>
43  #include <linux/audit.h> /* for audit_free() */
44  #include <linux/resource.h>
45  #include <linux/blkdev.h>
46  #include <linux/task_io_accounting_ops.h>
47  #include <linux/tracehook.h>
48  #include <linux/fs_struct.h>
49  #include <linux/init_task.h>
50  #include <linux/perf_event.h>
51  #include <trace/events/sched.h>
52  #include <linux/hw_breakpoint.h>
53  
54  #include <asm/uaccess.h>
55  #include <asm/unistd.h>
56  #include <asm/pgtable.h>
57  #include <asm/mmu_context.h>
58  #include "cred-internals.h"
59  
60  static void exit_mm(struct task_struct * tsk);
61  
62  static void __unhash_process(struct task_struct *p)
63  {
64  	nr_threads--;
65  	detach_pid(p, PIDTYPE_PID);
66  	if (thread_group_leader(p)) {
67  		detach_pid(p, PIDTYPE_PGID);
68  		detach_pid(p, PIDTYPE_SID);
69  
70  		list_del_rcu(&p->tasks);
71  		__get_cpu_var(process_counts)--;
72  	}
73  	list_del_rcu(&p->thread_group);
74  	list_del_init(&p->sibling);
75  }
76  
77  /*
78   * This function expects the tasklist_lock write-locked.
79   */
80  static void __exit_signal(struct task_struct *tsk)
81  {
82  	struct signal_struct *sig = tsk->signal;
83  	struct sighand_struct *sighand;
84  
85  	BUG_ON(!sig);
86  	BUG_ON(!atomic_read(&sig->count));
87  
88  	sighand = rcu_dereference(tsk->sighand);
89  	spin_lock(&sighand->siglock);
90  
91  	posix_cpu_timers_exit(tsk);
92  	if (atomic_dec_and_test(&sig->count))
93  		posix_cpu_timers_exit_group(tsk);
94  	else {
95  		/*
96  		 * If there is any task waiting for the group exit
97  		 * then notify it:
98  		 */
99  		if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count)
100  			wake_up_process(sig->group_exit_task);
101  
102  		if (tsk == sig->curr_target)
103  			sig->curr_target = next_thread(tsk);
104  		/*
105  		 * Accumulate here the counters for all threads but the
106  		 * group leader as they die, so they can be added into
107  		 * the process-wide totals when those are taken.
108  		 * The group leader stays around as a zombie as long
109  		 * as there are other threads.  When it gets reaped,
110  		 * the exit.c code will add its counts into these totals.
111  		 * We won't ever get here for the group leader, since it
112  		 * will have been the last reference on the signal_struct.
113  		 */
114  		sig->utime = cputime_add(sig->utime, tsk->utime);
115  		sig->stime = cputime_add(sig->stime, tsk->stime);
116  		sig->gtime = cputime_add(sig->gtime, tsk->gtime);
117  		sig->min_flt += tsk->min_flt;
118  		sig->maj_flt += tsk->maj_flt;
119  		sig->nvcsw += tsk->nvcsw;
120  		sig->nivcsw += tsk->nivcsw;
121  		sig->inblock += task_io_get_inblock(tsk);
122  		sig->oublock += task_io_get_oublock(tsk);
123  		task_io_accounting_add(&sig->ioac, &tsk->ioac);
124  		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
125  		sig = NULL; /* Marker for below. */
126  	}
127  
128  	__unhash_process(tsk);
129  
130  	/*
131  	 * Do this under ->siglock, we can race with another thread
132  	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
133  	 */
134  	flush_sigqueue(&tsk->pending);
135  
136  	tsk->signal = NULL;
137  	tsk->sighand = NULL;
138  	spin_unlock(&sighand->siglock);
139  
140  	__cleanup_sighand(sighand);
141  	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
142  	if (sig) {
143  		flush_sigqueue(&sig->shared_pending);
144  		taskstats_tgid_free(sig);
145  		/*
146  		 * Make sure ->signal can't go away under rq->lock,
147  		 * see account_group_exec_runtime().
148  		 */
149  		task_rq_unlock_wait(tsk);
150  		__cleanup_signal(sig);
151  	}
152  }
153  
154  static void delayed_put_task_struct(struct rcu_head *rhp)
155  {
156  	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
157  
158  #ifdef CONFIG_PERF_EVENTS
159  	WARN_ON_ONCE(tsk->perf_event_ctxp);
160  #endif
161  	trace_sched_process_free(tsk);
162  	put_task_struct(tsk);
163  }
164  
165  
166  void release_task(struct task_struct * p)
167  {
168  	struct task_struct *leader;
169  	int zap_leader;
170  repeat:
171  	tracehook_prepare_release_task(p);
172  	/* don't need to get the RCU readlock here - the process is dead and
173  	 * can't be modifying its own credentials */
174  	atomic_dec(&__task_cred(p)->user->processes);
175  
176  	proc_flush_task(p);
177  
178  	write_lock_irq(&tasklist_lock);
179  	tracehook_finish_release_task(p);
180  	__exit_signal(p);
181  
182  	/*
183  	 * If we are the last non-leader member of the thread
184  	 * group, and the leader is zombie, then notify the
185   * group, and the leader is a zombie, then notify the
186   * group leader's parent process (if it wants notification).
187  	zap_leader = 0;
188  	leader = p->group_leader;
189  	if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
190  		BUG_ON(task_detached(leader));
191  		do_notify_parent(leader, leader->exit_signal);
192  		/*
193  		 * If we were the last child thread and the leader has
194  		 * exited already, and the leader's parent ignores SIGCHLD,
195  		 * then we are the one who should release the leader.
196  		 *
197  		 * do_notify_parent() will have marked it self-reaping in
198  		 * that case.
199  		 */
200  		zap_leader = task_detached(leader);
201  
202  		/*
203  		 * This maintains the invariant that release_task()
204  		 * only runs on a task in EXIT_DEAD, just for sanity.
205  		 */
206  		if (zap_leader)
207  			leader->exit_state = EXIT_DEAD;
208  	}
209  
210  	write_unlock_irq(&tasklist_lock);
211  	release_thread(p);
212  	call_rcu(&p->rcu, delayed_put_task_struct);
213  
214  	p = leader;
215  	if (unlikely(zap_leader))
216  		goto repeat;
217  }
218  
219  /*
220   * This checks not only the pgrp, but falls back on the pid if no
221   * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
222   * without this...
223   *
224   * The caller must hold rcu lock or the tasklist lock.
225   */
226  struct pid *session_of_pgrp(struct pid *pgrp)
227  {
228  	struct task_struct *p;
229  	struct pid *sid = NULL;
230  
231  	p = pid_task(pgrp, PIDTYPE_PGID);
232  	if (p == NULL)
233  		p = pid_task(pgrp, PIDTYPE_PID);
234  	if (p != NULL)
235  		sid = task_session(p);
236  
237  	return sid;
238  }
239  
240  /*
241   * Determine if a process group is "orphaned", according to the POSIX
242   * definition in 2.2.2.52.  Orphaned process groups are not to be affected
243   * by terminal-generated stop signals.  Newly orphaned process groups are
244   * to receive a SIGHUP and a SIGCONT.
245   *
246   * "I ask you, have you ever known what it is to be an orphan?"
247   */
248  static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
249  {
250  	struct task_struct *p;
251  
252  	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
253  		if ((p == ignored_task) ||
254  		    (p->exit_state && thread_group_empty(p)) ||
255  		    is_global_init(p->real_parent))
256  			continue;
257  
258  		if (task_pgrp(p->real_parent) != pgrp &&
259  		    task_session(p->real_parent) == task_session(p))
260  			return 0;
261  	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
262  
263  	return 1;
264  }
265  
266  int is_current_pgrp_orphaned(void)
267  {
268  	int retval;
269  
270  	read_lock(&tasklist_lock);
271  	retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
272  	read_unlock(&tasklist_lock);
273  
274  	return retval;
275  }
276  
277  static int has_stopped_jobs(struct pid *pgrp)
278  {
279  	int retval = 0;
280  	struct task_struct *p;
281  
282  	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
283  		if (!task_is_stopped(p))
284  			continue;
285  		retval = 1;
286  		break;
287  	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
288  	return retval;
289  }
290  
291  /*
292   * Check to see if any process groups have become orphaned as
293   * a result of our exiting, and if they have any stopped jobs,
294   * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
295   */
296  static void
297  kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
298  {
299  	struct pid *pgrp = task_pgrp(tsk);
300  	struct task_struct *ignored_task = tsk;
301  
302  	if (!parent)
303  		 /* exit: our father is in a different pgrp than
304  		  * we are and we were the only connection outside.
305  		  */
306  		parent = tsk->real_parent;
307  	else
308  		/* reparent: our child is in a different pgrp than
309  		 * we are, and it was the only connection outside.
310  		 */
311  		ignored_task = NULL;
312  
313  	if (task_pgrp(parent) != pgrp &&
314  	    task_session(parent) == task_session(tsk) &&
315  	    will_become_orphaned_pgrp(pgrp, ignored_task) &&
316  	    has_stopped_jobs(pgrp)) {
317  		__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
318  		__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
319  	}
320  }
321  
322  /**
323   * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
324   *
325   * If a kernel thread is launched as a result of a system call, or if
326   * it ever exits, it should generally reparent itself to kthreadd so it
327   * isn't in the way of other processes and is correctly cleaned up on exit.
328   *
329   * The various task state such as scheduling policy and priority may have
330   * been inherited from a user process, so we reset them to sane values here.
331   *
332   * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
333   */
334  static void reparent_to_kthreadd(void)
335  {
336  	write_lock_irq(&tasklist_lock);
337  
338  	ptrace_unlink(current);
339  	/* Reparent to kthreadd */
340  	current->real_parent = current->parent = kthreadd_task;
341  	list_move_tail(&current->sibling, &current->real_parent->children);
342  
343  	/* Set the exit signal to SIGCHLD so our new parent is notified on exit */
344  	current->exit_signal = SIGCHLD;
345  
346  	if (task_nice(current) < 0)
347  		set_user_nice(current, 0);
348  	/* cpus_allowed? */
349  	/* rt_priority? */
350  	/* signals? */
351  	memcpy(current->signal->rlim, init_task.signal->rlim,
352  	       sizeof(current->signal->rlim));
353  
354  	atomic_inc(&init_cred.usage);
355  	commit_creds(&init_cred);
356  	write_unlock_irq(&tasklist_lock);
357  }
358  
359  void __set_special_pids(struct pid *pid)
360  {
361  	struct task_struct *curr = current->group_leader;
362  
363  	if (task_session(curr) != pid)
364  		change_pid(curr, PIDTYPE_SID, pid);
365  
366  	if (task_pgrp(curr) != pid)
367  		change_pid(curr, PIDTYPE_PGID, pid);
368  }
369  
370  static void set_special_pids(struct pid *pid)
371  {
372  	write_lock_irq(&tasklist_lock);
373  	__set_special_pids(pid);
374  	write_unlock_irq(&tasklist_lock);
375  }
376  
377  /*
378   * Let kernel threads use this to say that they allow a certain signal.
379   * Must not be used if kthread was cloned with CLONE_SIGHAND.
380   */
381  int allow_signal(int sig)
382  {
383  	if (!valid_signal(sig) || sig < 1)
384  		return -EINVAL;
385  
386  	spin_lock_irq(&current->sighand->siglock);
387  	/* This is only needed for daemonize()'ed kthreads */
388  	sigdelset(&current->blocked, sig);
389  	/*
390  	 * Kernel threads handle their own signals. Let the signal code
391  	 * know it'll be handled, so that they don't get converted to
392  	 * SIGKILL or just silently dropped.
393  	 */
394  	current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
395  	recalc_sigpending();
396  	spin_unlock_irq(&current->sighand->siglock);
397  	return 0;
398  }
399  
400  EXPORT_SYMBOL(allow_signal);
401  
402  int disallow_signal(int sig)
403  {
404  	if (!valid_signal(sig) || sig < 1)
405  		return -EINVAL;
406  
407  	spin_lock_irq(&current->sighand->siglock);
408  	current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
409  	recalc_sigpending();
410  	spin_unlock_irq(&current->sighand->siglock);
411  	return 0;
412  }
413  
414  EXPORT_SYMBOL(disallow_signal);
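
/*
 * Illustrative sketch, not part of exit.c: how a kernel thread created with
 * kthread_run() might use allow_signal() above so that user space can ask it
 * to stop with SIGKILL.  The function name is an assumption for the example.
 */
#if 0	/* example only */
static int example_thread(void *unused)
{
	/* Kernel threads ignore all signals by default; accept SIGKILL. */
	allow_signal(SIGKILL);

	while (!kthread_should_stop()) {
		if (signal_pending(current)) {
			/* A SIGKILL was queued to us: clean up and leave. */
			flush_signals(current);
			break;
		}
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}
#endif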
415  
416  /*
417   *	Put all the gunge required to become a kernel thread without
418   *	attached user resources in one place where it belongs.
419   */
420  
421  void daemonize(const char *name, ...)
422  {
423  	va_list args;
424  	sigset_t blocked;
425  
426  	va_start(args, name);
427  	vsnprintf(current->comm, sizeof(current->comm), name, args);
428  	va_end(args);
429  
430  	/*
431  	 * If we were started as result of loading a module, close all of the
432  	 * user space pages.  We don't need them, and if we didn't close them
433  	 * they would be locked into memory.
434  	 */
435  	exit_mm(current);
436  	/*
437  	 * We don't want to have TIF_FREEZE set if the system-wide hibernation
438  	 * or suspend transition begins right now.
439  	 */
440  	current->flags |= (PF_NOFREEZE | PF_KTHREAD);
441  
442  	if (current->nsproxy != &init_nsproxy) {
443  		get_nsproxy(&init_nsproxy);
444  		switch_task_namespaces(current, &init_nsproxy);
445  	}
446  	set_special_pids(&init_struct_pid);
447  	proc_clear_tty(current);
448  
449  	/* Block and flush all signals */
450  	sigfillset(&blocked);
451  	sigprocmask(SIG_BLOCK, &blocked, NULL);
452  	flush_signals(current);
453  
454  	/* Become as one with the init task */
455  
456  	daemonize_fs_struct();
457  	exit_files(current);
458  	current->files = init_task.files;
459  	atomic_inc(&current->files->count);
460  
461  	reparent_to_kthreadd();
462  }
463  
464  EXPORT_SYMBOL(daemonize);
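
/*
 * Illustrative sketch, not part of exit.c: the legacy pattern daemonize()
 * exists for.  A thread started with kernel_thread() detaches itself from
 * user-space resources before doing its work; new code should prefer the
 * kthread API.  Names below are assumptions for the example.
 */
#if 0	/* example only */
static int legacy_daemon(void *unused)
{
	daemonize("my-daemon");		/* shed mm, files, fs; block signals */
	allow_signal(SIGTERM);		/* optionally accept a stop request */

	while (!signal_pending(current))
		schedule_timeout_interruptible(HZ);

	return 0;
}
#endif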
465  
466  static void close_files(struct files_struct * files)
467  {
468  	int i, j;
469  	struct fdtable *fdt;
470  
471  	j = 0;
472  
473  	/*
474  	 * It is safe to dereference the fd table without RCU or
475  	 * ->file_lock because this is the last reference to the
476  	 * files structure.
477  	 */
478  	fdt = files_fdtable(files);
479  	for (;;) {
480  		unsigned long set;
481  		i = j * __NFDBITS;
482  		if (i >= fdt->max_fds)
483  			break;
484  		set = fdt->open_fds->fds_bits[j++];
485  		while (set) {
486  			if (set & 1) {
487  				struct file * file = xchg(&fdt->fd[i], NULL);
488  				if (file) {
489  					filp_close(file, files);
490  					cond_resched();
491  				}
492  			}
493  			i++;
494  			set >>= 1;
495  		}
496  	}
497  }
498  
499  struct files_struct *get_files_struct(struct task_struct *task)
500  {
501  	struct files_struct *files;
502  
503  	task_lock(task);
504  	files = task->files;
505  	if (files)
506  		atomic_inc(&files->count);
507  	task_unlock(task);
508  
509  	return files;
510  }
511  
512  void put_files_struct(struct files_struct *files)
513  {
514  	struct fdtable *fdt;
515  
516  	if (atomic_dec_and_test(&files->count)) {
517  		close_files(files);
518  		/*
519  		 * Free the fd and fdset arrays if we expanded them.
520  		 * If the fdtable was embedded, pass files for freeing
521  		 * at the end of the RCU grace period. Otherwise,
522  		 * you can free files immediately.
523  		 */
524  		fdt = files_fdtable(files);
525  		if (fdt != &files->fdtab)
526  			kmem_cache_free(files_cachep, files);
527  		free_fdtable(fdt);
528  	}
529  }
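
/*
 * Illustrative sketch, not part of exit.c: the reference-counting discipline
 * callers are expected to follow.  get_files_struct() pins another task's
 * files_struct so it cannot be freed underneath us; put_files_struct() drops
 * the reference and, on the last put, closes everything as above.  The helper
 * name is an assumption for the example.
 */
#if 0	/* example only */
static unsigned int example_max_fds(struct task_struct *task)
{
	struct files_struct *files = get_files_struct(task);
	unsigned int max_fds = 0;

	if (files) {
		rcu_read_lock();
		max_fds = files_fdtable(files)->max_fds;
		rcu_read_unlock();
		put_files_struct(files);
	}
	return max_fds;
}
#endif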
530  
531  void reset_files_struct(struct files_struct *files)
532  {
533  	struct task_struct *tsk = current;
534  	struct files_struct *old;
535  
536  	old = tsk->files;
537  	task_lock(tsk);
538  	tsk->files = files;
539  	task_unlock(tsk);
540  	put_files_struct(old);
541  }
542  
543  void exit_files(struct task_struct *tsk)
544  {
545  	struct files_struct * files = tsk->files;
546  
547  	if (files) {
548  		task_lock(tsk);
549  		tsk->files = NULL;
550  		task_unlock(tsk);
551  		put_files_struct(files);
552  	}
553  }
554  
555  #ifdef CONFIG_MM_OWNER
556  /*
557   * Task p is exiting and it owned the mm; let's find a new owner for it
558   */
559  static inline int
560  mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
561  {
562  	/*
563  	 * If there are other users of the mm and the owner (us) is exiting
564  	 * we need to find a new owner to take on the responsibility.
565  	 */
566  	if (atomic_read(&mm->mm_users) <= 1)
567  		return 0;
568  	if (mm->owner != p)
569  		return 0;
570  	return 1;
571  }
572  
573  void mm_update_next_owner(struct mm_struct *mm)
574  {
575  	struct task_struct *c, *g, *p = current;
576  
577  retry:
578  	if (!mm_need_new_owner(mm, p))
579  		return;
580  
581  	read_lock(&tasklist_lock);
582  	/*
583  	 * Search in the children
584  	 */
585  	list_for_each_entry(c, &p->children, sibling) {
586  		if (c->mm == mm)
587  			goto assign_new_owner;
588  	}
589  
590  	/*
591  	 * Search in the siblings
592  	 */
593  	list_for_each_entry(c, &p->real_parent->children, sibling) {
594  		if (c->mm == mm)
595  			goto assign_new_owner;
596  	}
597  
598  	/*
599  	 * Search through everything else. We should not get
600  	 * here often
601  	 */
602  	do_each_thread(g, c) {
603  		if (c->mm == mm)
604  			goto assign_new_owner;
605  	} while_each_thread(g, c);
606  
607  	read_unlock(&tasklist_lock);
608  	/*
609  	 * We found no owner, yet mm_users > 1: this implies that we are
610  	 * most likely racing with swapoff (try_to_unuse()) or /proc or
611  	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
612  	 */
613  	mm->owner = NULL;
614  	return;
615  
616  assign_new_owner:
617  	BUG_ON(c == p);
618  	get_task_struct(c);
619  	/*
620  	 * The task_lock protects c->mm from changing.
621  	 * We always want mm->owner->mm == mm
622  	 */
623  	task_lock(c);
624  	/*
625  	 * Delay read_unlock() till we have the task_lock()
626  	 * to ensure that c does not slip away underneath us
627  	 */
628  	read_unlock(&tasklist_lock);
629  	if (c->mm != mm) {
630  		task_unlock(c);
631  		put_task_struct(c);
632  		goto retry;
633  	}
634  	mm->owner = c;
635  	task_unlock(c);
636  	put_task_struct(c);
637  }
638  #endif /* CONFIG_MM_OWNER */
639  
640  /*
641   * Turn us into a lazy TLB process if we
642   * aren't already..
643   */
644  static void exit_mm(struct task_struct * tsk)
645  {
646  	struct mm_struct *mm = tsk->mm;
647  	struct core_state *core_state;
648  
649  	mm_release(tsk, mm);
650  	if (!mm)
651  		return;
652  	/*
653  	 * Serialize with any possible pending coredump.
654  	 * We must hold mmap_sem around checking core_state
655  	 * and clearing tsk->mm.  The core-inducing thread
656  	 * will increment ->nr_threads for each thread in the
657  	 * group with ->mm != NULL.
658  	 */
659  	down_read(&mm->mmap_sem);
660  	core_state = mm->core_state;
661  	if (core_state) {
662  		struct core_thread self;
663  		up_read(&mm->mmap_sem);
664  
665  		self.task = tsk;
666  		self.next = xchg(&core_state->dumper.next, &self);
667  		/*
668  		 * Implies mb(), the result of xchg() must be visible
669  		 * to core_state->dumper.
670  		 */
671  		if (atomic_dec_and_test(&core_state->nr_threads))
672  			complete(&core_state->startup);
673  
674  		for (;;) {
675  			set_task_state(tsk, TASK_UNINTERRUPTIBLE);
676  			if (!self.task) /* see coredump_finish() */
677  				break;
678  			schedule();
679  		}
680  		__set_task_state(tsk, TASK_RUNNING);
681  		down_read(&mm->mmap_sem);
682  	}
683  	atomic_inc(&mm->mm_count);
684  	BUG_ON(mm != tsk->active_mm);
685  	/* more a memory barrier than a real lock */
686  	task_lock(tsk);
687  	tsk->mm = NULL;
688  	up_read(&mm->mmap_sem);
689  	enter_lazy_tlb(mm, current);
690  	/* We don't want this task to be frozen prematurely */
691  	clear_freeze_flag(tsk);
692  	task_unlock(tsk);
693  	mm_update_next_owner(mm);
694  	mmput(mm);
695  }
696  
697  /*
698   * When we die, we re-parent all our children.
699   * Try to give them to another thread in our thread
700   * group, and if no such member exists, give it to
701   * the child reaper process (ie "init") in our pid
702   * space.
703   */
704  static struct task_struct *find_new_reaper(struct task_struct *father)
705  {
706  	struct pid_namespace *pid_ns = task_active_pid_ns(father);
707  	struct task_struct *thread;
708  
709  	thread = father;
710  	while_each_thread(father, thread) {
711  		if (thread->flags & PF_EXITING)
712  			continue;
713  		if (unlikely(pid_ns->child_reaper == father))
714  			pid_ns->child_reaper = thread;
715  		return thread;
716  	}
717  
718  	if (unlikely(pid_ns->child_reaper == father)) {
719  		write_unlock_irq(&tasklist_lock);
720  		if (unlikely(pid_ns == &init_pid_ns))
721  			panic("Attempted to kill init!");
722  
723  		zap_pid_ns_processes(pid_ns);
724  		write_lock_irq(&tasklist_lock);
725  		/*
726  		 * We cannot clear ->child_reaper or leave it alone.
727  		 * There may be stealth EXIT_DEAD tasks on ->children,
728  		 * forget_original_parent() must move them somewhere.
729  		 */
730  		pid_ns->child_reaper = init_pid_ns.child_reaper;
731  	}
732  
733  	return pid_ns->child_reaper;
734  }
735  
736  /*
737   * Any children that need to be release_task'd are put on the @dead list.
738   */
739  static void reparent_thread(struct task_struct *father, struct task_struct *p,
740  				struct list_head *dead)
741  {
742  	if (p->pdeath_signal)
743  		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
744  
745  	list_move_tail(&p->sibling, &p->real_parent->children);
746  
747  	if (task_detached(p))
748  		return;
749  	/*
750  	 * If this is a threaded reparent there is no need to
751  	 * notify anyone anything has happened.
752  	 */
753  	if (same_thread_group(p->real_parent, father))
754  		return;
755  
756  	/* We don't want people slaying init.  */
757  	p->exit_signal = SIGCHLD;
758  
759  	/* If it has exited notify the new parent about this child's death. */
760  	if (!task_ptrace(p) &&
761  	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
762  		do_notify_parent(p, p->exit_signal);
763  		if (task_detached(p)) {
764  			p->exit_state = EXIT_DEAD;
765  			list_move_tail(&p->sibling, dead);
766  		}
767  	}
768  
769  	kill_orphaned_pgrp(p, father);
770  }
771  
772  static void forget_original_parent(struct task_struct *father)
773  {
774  	struct task_struct *p, *n, *reaper;
775  	LIST_HEAD(dead_children);
776  
777  	exit_ptrace(father);
778  
779  	write_lock_irq(&tasklist_lock);
780  	reaper = find_new_reaper(father);
781  
782  	list_for_each_entry_safe(p, n, &father->children, sibling) {
783  		p->real_parent = reaper;
784  		if (p->parent == father) {
785  			BUG_ON(task_ptrace(p));
786  			p->parent = p->real_parent;
787  		}
788  		reparent_thread(father, p, &dead_children);
789  	}
790  	write_unlock_irq(&tasklist_lock);
791  
792  	BUG_ON(!list_empty(&father->children));
793  
794  	list_for_each_entry_safe(p, n, &dead_children, sibling) {
795  		list_del_init(&p->sibling);
796  		release_task(p);
797  	}
798  }
799  
800  /*
801   * Send signals to all our closest relatives so that they know
802   * to properly mourn us..
803   */
804  static void exit_notify(struct task_struct *tsk, int group_dead)
805  {
806  	int signal;
807  	void *cookie;
808  
809  	/*
810  	 * This does two things:
811  	 *
812  	 * A.  Make init inherit all the child processes
813  	 * B.  Check to see if any process groups have become orphaned
814  	 *	as a result of our exiting, and if they have any stopped
815  	 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
816  	 */
817  	forget_original_parent(tsk);
818  	exit_task_namespaces(tsk);
819  
820  	write_lock_irq(&tasklist_lock);
821  	if (group_dead)
822  		kill_orphaned_pgrp(tsk->group_leader, NULL);
823  
824  	/* Let father know we died
825  	 *
826  	 * Thread signals are configurable, but you aren't going to use
827  	 * that to send signals to arbitrary processes.
828  	 * That stops right now.
829  	 *
830  	 * If the parent exec id doesn't match the exec id we saved
831  	 * when we started then we know the parent has changed security
832  	 * domain.
833  	 *
834  	 * If our self_exec id doesn't match our parent_exec_id then
835  	 * we have changed execution domain as these two values started
836  	 * the same after a fork.
837  	 */
838  	if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
839  	    (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
840  	     tsk->self_exec_id != tsk->parent_exec_id))
841  		tsk->exit_signal = SIGCHLD;
842  
843  	signal = tracehook_notify_death(tsk, &cookie, group_dead);
844  	if (signal >= 0)
845  		signal = do_notify_parent(tsk, signal);
846  
847  	tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
848  
849  	/* mt-exec, de_thread() is waiting for us */
850  	if (thread_group_leader(tsk) &&
851  	    tsk->signal->group_exit_task &&
852  	    tsk->signal->notify_count < 0)
853  		wake_up_process(tsk->signal->group_exit_task);
854  
855  	write_unlock_irq(&tasklist_lock);
856  
857  	tracehook_report_death(tsk, signal, cookie, group_dead);
858  
859  	/* If the process is dead, release it - nobody will wait for it */
860  	if (signal == DEATH_REAP)
861  		release_task(tsk);
862  }
863  
864  #ifdef CONFIG_DEBUG_STACK_USAGE
865  static void check_stack_usage(void)
866  {
867  	static DEFINE_SPINLOCK(low_water_lock);
868  	static int lowest_to_date = THREAD_SIZE;
869  	unsigned long free;
870  
871  	free = stack_not_used(current);
872  
873  	if (free >= lowest_to_date)
874  		return;
875  
876  	spin_lock(&low_water_lock);
877  	if (free < lowest_to_date) {
878  		printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
879  				"left\n",
880  				current->comm, free);
881  		lowest_to_date = free;
882  	}
883  	spin_unlock(&low_water_lock);
884  }
885  #else
886  static inline void check_stack_usage(void) {}
887  #endif
888  
889  NORET_TYPE void do_exit(long code)
890  {
891  	struct task_struct *tsk = current;
892  	int group_dead;
893  
894  	profile_task_exit(tsk);
895  
896  	WARN_ON(atomic_read(&tsk->fs_excl));
897  
898  	if (unlikely(in_interrupt()))
899  		panic("Aiee, killing interrupt handler!");
900  	if (unlikely(!tsk->pid))
901  		panic("Attempted to kill the idle task!");
902  
903  	tracehook_report_exit(&code);
904  
905  	validate_creds_for_do_exit(tsk);
906  
907  	/*
908  	 * We're taking recursive faults here in do_exit. Safest is to just
909  	 * leave this task alone and wait for reboot.
910  	 */
911  	if (unlikely(tsk->flags & PF_EXITING)) {
912  		printk(KERN_ALERT
913  			"Fixing recursive fault but reboot is needed!\n");
914  		/*
915  		 * We can do this unlocked here. The futex code uses
916  		 * this flag just to verify whether the pi state
917  		 * cleanup has been done or not. In the worst case it
918  		 * loops once more. We pretend that the cleanup was
919  		 * done as there is no way to return. Either the
920  		 * OWNER_DIED bit is set by now or we push the blocked
921  		 * task into the wait-forever nirvana as well.
922  		 */
923  		tsk->flags |= PF_EXITPIDONE;
924  		set_current_state(TASK_UNINTERRUPTIBLE);
925  		schedule();
926  	}
927  
928  	exit_irq_thread();
929  
930  	exit_signals(tsk);  /* sets PF_EXITING */
931  	/*
932  	 * tsk->flags are checked in the futex code to protect against
933  	 * an exiting task cleaning up the robust pi futexes.
934  	 */
935  	smp_mb();
936  	raw_spin_unlock_wait(&tsk->pi_lock);
937  
938  	if (unlikely(in_atomic()))
939  		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
940  				current->comm, task_pid_nr(current),
941  				preempt_count());
942  
943  	acct_update_integrals(tsk);
944  
945  	group_dead = atomic_dec_and_test(&tsk->signal->live);
946  	if (group_dead) {
947  		hrtimer_cancel(&tsk->signal->real_timer);
948  		exit_itimers(tsk->signal);
949  		if (tsk->mm)
950  			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
951  	}
952  	acct_collect(code, group_dead);
953  	if (group_dead)
954  		tty_audit_exit();
955  	if (unlikely(tsk->audit_context))
956  		audit_free(tsk);
957  
958  	tsk->exit_code = code;
959  	taskstats_exit(tsk, group_dead);
960  
961  	exit_mm(tsk);
962  
963  	if (group_dead)
964  		acct_process();
965  	trace_sched_process_exit(tsk);
966  
967  	exit_sem(tsk);
968  	exit_files(tsk);
969  	exit_fs(tsk);
970  	check_stack_usage();
971  	exit_thread();
972  	cgroup_exit(tsk, 1);
973  
974  	if (group_dead)
975  		disassociate_ctty(1);
976  
977  	module_put(task_thread_info(tsk)->exec_domain->module);
978  
979  	proc_exit_connector(tsk);
980  
981  	/*
982  	 * FIXME: do that only when needed, using sched_exit tracepoint
983  	 */
984  	flush_ptrace_hw_breakpoint(tsk);
985  	/*
986  	 * Flush inherited counters to the parent - before the parent
987  	 * gets woken up by child-exit notifications.
988  	 */
989  	perf_event_exit_task(tsk);
990  
991  	exit_notify(tsk, group_dead);
992  #ifdef CONFIG_NUMA
993  	mpol_put(tsk->mempolicy);
994  	tsk->mempolicy = NULL;
995  #endif
996  #ifdef CONFIG_FUTEX
997  	if (unlikely(current->pi_state_cache))
998  		kfree(current->pi_state_cache);
999  #endif
1000  	/*
1001  	 * Make sure we are holding no locks:
1002  	 */
1003  	debug_check_no_locks_held(tsk);
1004  	/*
1005  	 * We can do this unlocked here. The futex code uses this flag
1006  	 * just to verify whether the pi state cleanup has been done
1007  	 * or not. In the worst case it loops once more.
1008  	 */
1009  	tsk->flags |= PF_EXITPIDONE;
1010  
1011  	if (tsk->io_context)
1012  		exit_io_context(tsk);
1013  
1014  	if (tsk->splice_pipe)
1015  		__free_pipe_info(tsk->splice_pipe);
1016  
1017  	validate_creds_for_do_exit(tsk);
1018  
1019  	preempt_disable();
1020  	exit_rcu();
1021  	/* causes final put_task_struct in finish_task_switch(). */
1022  	tsk->state = TASK_DEAD;
1023  	schedule();
1024  	BUG();
1025  	/* Avoid "noreturn function does return".  */
1026  	for (;;)
1027  		cpu_relax();	/* For when BUG is null */
1028  }
1029  
1030  EXPORT_SYMBOL_GPL(do_exit);
1031  
1032  NORET_TYPE void complete_and_exit(struct completion *comp, long code)
1033  {
1034  	if (comp)
1035  		complete(comp);
1036  
1037  	do_exit(code);
1038  }
1039  
1040  EXPORT_SYMBOL(complete_and_exit);
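
/*
 * Illustrative sketch, not part of exit.c: complete_and_exit() is typically
 * used by module threads started with kernel_thread(), so the module's exit
 * routine can wait until the thread has really left module code before the
 * text is unloaded.  Names below are assumptions for the example.
 */
#if 0	/* example only */
static DECLARE_COMPLETION(example_thread_done);

static int example_thread(void *unused)
{
	/* ... do the work ... */
	complete_and_exit(&example_thread_done, 0);
	/* not reached */
}

static void example_cleanup_module(void)
{
	/* ask the thread to stop (not shown), then wait for it to exit */
	wait_for_completion(&example_thread_done);
}
#endif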
1041  
1042  SYSCALL_DEFINE1(exit, int, error_code)
1043  {
1044  	do_exit((error_code & 0xff) << 8);
1045  }
1046  
1047  /*
1048   * Take down every thread in the group.  This is called by fatal signals
1049   * as well as by sys_exit_group (below).
1050   */
1051  NORET_TYPE void
1052  do_group_exit(int exit_code)
1053  {
1054  	struct signal_struct *sig = current->signal;
1055  
1056  	BUG_ON(exit_code & 0x80); /* core dumps don't get here */
1057  
1058  	if (signal_group_exit(sig))
1059  		exit_code = sig->group_exit_code;
1060  	else if (!thread_group_empty(current)) {
1061  		struct sighand_struct *const sighand = current->sighand;
1062  		spin_lock_irq(&sighand->siglock);
1063  		if (signal_group_exit(sig))
1064  			/* Another thread got here before we took the lock.  */
1065  			exit_code = sig->group_exit_code;
1066  		else {
1067  			sig->group_exit_code = exit_code;
1068  			sig->flags = SIGNAL_GROUP_EXIT;
1069  			zap_other_threads(current);
1070  		}
1071  		spin_unlock_irq(&sighand->siglock);
1072  	}
1073  
1074  	do_exit(exit_code);
1075  	/* NOTREACHED */
1076  }
1077  
1078  /*
1079   * this kills every thread in the thread group. Note that any externally
1080   * wait4()-ing process will get the correct exit code - even if this
1081   * thread is not the thread group leader.
1082   */
1083  SYSCALL_DEFINE1(exit_group, int, error_code)
1084  {
1085  	do_group_exit((error_code & 0xff) << 8);
1086  	/* NOTREACHED */
1087  	return 0;
1088  }
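
/*
 * Illustrative sketch, not part of exit.c: how the (error_code & 0xff) << 8
 * encoding above looks from user space.  The parent's wait status carries the
 * low byte of the exit_group()/exit() argument in WEXITSTATUS().
 */
#if 0	/* user-space example only */
#include <sys/types.h>
#include <sys/wait.h>

void example_reap(pid_t child)
{
	int status;

	if (waitpid(child, &status, 0) == child && WIFEXITED(status)) {
		/* WEXITSTATUS(status) is the code the child passed to exit() */
		int code = WEXITSTATUS(status);
		(void)code;
	}
}
#endif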
1089  
1090  struct wait_opts {
1091  	enum pid_type		wo_type;
1092  	int			wo_flags;
1093  	struct pid		*wo_pid;
1094  
1095  	struct siginfo __user	*wo_info;
1096  	int __user		*wo_stat;
1097  	struct rusage __user	*wo_rusage;
1098  
1099  	wait_queue_t		child_wait;
1100  	int			notask_error;
1101  };
1102  
1103  static inline
1104  struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1105  {
1106  	if (type != PIDTYPE_PID)
1107  		task = task->group_leader;
1108  	return task->pids[type].pid;
1109  }
1110  
1111  static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1112  {
1113  	return	wo->wo_type == PIDTYPE_MAX ||
1114  		task_pid_type(p, wo->wo_type) == wo->wo_pid;
1115  }
1116  
1117  static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1118  {
1119  	if (!eligible_pid(wo, p))
1120  		return 0;
1121  	/* Wait for all children (clone and not) if __WALL is set;
1122  	 * otherwise, wait for clone children *only* if __WCLONE is
1123  	 * set; otherwise, wait for non-clone children *only*.  (Note:
1124  	 * A "clone" child here is one that reports to its parent
1125  	 * using a signal other than SIGCHLD.) */
1126  	if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1127  	    && !(wo->wo_flags & __WALL))
1128  		return 0;
1129  
1130  	return 1;
1131  }
1132  
1133  static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1134  				pid_t pid, uid_t uid, int why, int status)
1135  {
1136  	struct siginfo __user *infop;
1137  	int retval = wo->wo_rusage
1138  		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1139  
1140  	put_task_struct(p);
1141  	infop = wo->wo_info;
1142  	if (infop) {
1143  		if (!retval)
1144  			retval = put_user(SIGCHLD, &infop->si_signo);
1145  		if (!retval)
1146  			retval = put_user(0, &infop->si_errno);
1147  		if (!retval)
1148  			retval = put_user((short)why, &infop->si_code);
1149  		if (!retval)
1150  			retval = put_user(pid, &infop->si_pid);
1151  		if (!retval)
1152  			retval = put_user(uid, &infop->si_uid);
1153  		if (!retval)
1154  			retval = put_user(status, &infop->si_status);
1155  	}
1156  	if (!retval)
1157  		retval = pid;
1158  	return retval;
1159  }
1160  
1161  /*
1162   * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
1163   * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
1164   * the lock and this task is uninteresting.  If we return nonzero, we have
1165   * released the lock and the system call should return.
1166   */
1167  static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1168  {
1169  	unsigned long state;
1170  	int retval, status, traced;
1171  	pid_t pid = task_pid_vnr(p);
1172  	uid_t uid = __task_cred(p)->uid;
1173  	struct siginfo __user *infop;
1174  
1175  	if (!likely(wo->wo_flags & WEXITED))
1176  		return 0;
1177  
1178  	if (unlikely(wo->wo_flags & WNOWAIT)) {
1179  		int exit_code = p->exit_code;
1180  		int why, status;
1181  
1182  		get_task_struct(p);
1183  		read_unlock(&tasklist_lock);
1184  		if ((exit_code & 0x7f) == 0) {
1185  			why = CLD_EXITED;
1186  			status = exit_code >> 8;
1187  		} else {
1188  			why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1189  			status = exit_code & 0x7f;
1190  		}
1191  		return wait_noreap_copyout(wo, p, pid, uid, why, status);
1192  	}
1193  
1194  	/*
1195  	 * Try to move the task's state to DEAD
1196  	 * only one thread is allowed to do this:
1197  	 */
1198  	state = xchg(&p->exit_state, EXIT_DEAD);
1199  	if (state != EXIT_ZOMBIE) {
1200  		BUG_ON(state != EXIT_DEAD);
1201  		return 0;
1202  	}
1203  
1204  	traced = ptrace_reparented(p);
1205  	/*
1206  	 * It can be ptraced but not reparented, check
1207  	 * !task_detached() to filter out sub-threads.
1208  	 */
1209  	if (likely(!traced) && likely(!task_detached(p))) {
1210  		struct signal_struct *psig;
1211  		struct signal_struct *sig;
1212  		unsigned long maxrss;
1213  		cputime_t tgutime, tgstime;
1214  
1215  		/*
1216  		 * The resource counters for the group leader are in its
1217  		 * own task_struct.  Those for dead threads in the group
1218  		 * are in its signal_struct, as are those for the child
1219  		 * processes it has previously reaped.  All these
1220  		 * accumulate in the parent's signal_struct c* fields.
1221  		 *
1222  		 * We don't bother to take a lock here to protect these
1223  		 * p->signal fields, because they are only touched by
1224  		 * __exit_signal, which runs with tasklist_lock
1225  		 * write-locked anyway, and so is excluded here.  We do
1226  		 * need to protect the access to parent->signal fields,
1227  		 * as other threads in the parent group can be right
1228  		 * here reaping other children at the same time.
1229  		 *
1230  		 * We use thread_group_times() to get times for the thread
1231  		 * group, which consolidates times for all threads in the
1232  		 * group including the group leader.
1233  		 */
1234  		thread_group_times(p, &tgutime, &tgstime);
1235  		spin_lock_irq(&p->real_parent->sighand->siglock);
1236  		psig = p->real_parent->signal;
1237  		sig = p->signal;
1238  		psig->cutime =
1239  			cputime_add(psig->cutime,
1240  			cputime_add(tgutime,
1241  				    sig->cutime));
1242  		psig->cstime =
1243  			cputime_add(psig->cstime,
1244  			cputime_add(tgstime,
1245  				    sig->cstime));
1246  		psig->cgtime =
1247  			cputime_add(psig->cgtime,
1248  			cputime_add(p->gtime,
1249  			cputime_add(sig->gtime,
1250  				    sig->cgtime)));
1251  		psig->cmin_flt +=
1252  			p->min_flt + sig->min_flt + sig->cmin_flt;
1253  		psig->cmaj_flt +=
1254  			p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1255  		psig->cnvcsw +=
1256  			p->nvcsw + sig->nvcsw + sig->cnvcsw;
1257  		psig->cnivcsw +=
1258  			p->nivcsw + sig->nivcsw + sig->cnivcsw;
1259  		psig->cinblock +=
1260  			task_io_get_inblock(p) +
1261  			sig->inblock + sig->cinblock;
1262  		psig->coublock +=
1263  			task_io_get_oublock(p) +
1264  			sig->oublock + sig->coublock;
1265  		maxrss = max(sig->maxrss, sig->cmaxrss);
1266  		if (psig->cmaxrss < maxrss)
1267  			psig->cmaxrss = maxrss;
1268  		task_io_accounting_add(&psig->ioac, &p->ioac);
1269  		task_io_accounting_add(&psig->ioac, &sig->ioac);
1270  		spin_unlock_irq(&p->real_parent->sighand->siglock);
1271  	}
1272  
1273  	/*
1274  	 * Now we are sure this task is interesting, and no other
1275  	 * thread can reap it because we set its state to EXIT_DEAD.
1276  	 */
1277  	read_unlock(&tasklist_lock);
1278  
1279  	retval = wo->wo_rusage
1280  		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1281  	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1282  		? p->signal->group_exit_code : p->exit_code;
1283  	if (!retval && wo->wo_stat)
1284  		retval = put_user(status, wo->wo_stat);
1285  
1286  	infop = wo->wo_info;
1287  	if (!retval && infop)
1288  		retval = put_user(SIGCHLD, &infop->si_signo);
1289  	if (!retval && infop)
1290  		retval = put_user(0, &infop->si_errno);
1291  	if (!retval && infop) {
1292  		int why;
1293  
1294  		if ((status & 0x7f) == 0) {
1295  			why = CLD_EXITED;
1296  			status >>= 8;
1297  		} else {
1298  			why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1299  			status &= 0x7f;
1300  		}
1301  		retval = put_user((short)why, &infop->si_code);
1302  		if (!retval)
1303  			retval = put_user(status, &infop->si_status);
1304  	}
1305  	if (!retval && infop)
1306  		retval = put_user(pid, &infop->si_pid);
1307  	if (!retval && infop)
1308  		retval = put_user(uid, &infop->si_uid);
1309  	if (!retval)
1310  		retval = pid;
1311  
1312  	if (traced) {
1313  		write_lock_irq(&tasklist_lock);
1314  		/* We dropped tasklist, ptracer could die and untrace */
1315  		ptrace_unlink(p);
1316  		/*
1317  		 * If this is not a detached task, notify the parent.
1318  		 * If it's still not detached after that, don't release
1319  		 * it now.
1320  		 */
1321  		if (!task_detached(p)) {
1322  			do_notify_parent(p, p->exit_signal);
1323  			if (!task_detached(p)) {
1324  				p->exit_state = EXIT_ZOMBIE;
1325  				p = NULL;
1326  			}
1327  		}
1328  		write_unlock_irq(&tasklist_lock);
1329  	}
1330  	if (p != NULL)
1331  		release_task(p);
1332  
1333  	return retval;
1334  }
1335  
1336  static int *task_stopped_code(struct task_struct *p, bool ptrace)
1337  {
1338  	if (ptrace) {
1339  		if (task_is_stopped_or_traced(p))
1340  			return &p->exit_code;
1341  	} else {
1342  		if (p->signal->flags & SIGNAL_STOP_STOPPED)
1343  			return &p->signal->group_exit_code;
1344  	}
1345  	return NULL;
1346  }
1347  
1348  /*
1349   * Handle sys_wait4 work for one task in state TASK_STOPPED.  We hold
1350   * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
1351   * the lock and this task is uninteresting.  If we return nonzero, we have
1352   * released the lock and the system call should return.
1353   */
1354  static int wait_task_stopped(struct wait_opts *wo,
1355  				int ptrace, struct task_struct *p)
1356  {
1357  	struct siginfo __user *infop;
1358  	int retval, exit_code, *p_code, why;
1359  	uid_t uid = 0; /* unneeded, required by compiler */
1360  	pid_t pid;
1361  
1362  	/*
1363  	 * Traditionally we see ptrace'd stopped tasks regardless of options.
1364  	 */
1365  	if (!ptrace && !(wo->wo_flags & WUNTRACED))
1366  		return 0;
1367  
1368  	exit_code = 0;
1369  	spin_lock_irq(&p->sighand->siglock);
1370  
1371  	p_code = task_stopped_code(p, ptrace);
1372  	if (unlikely(!p_code))
1373  		goto unlock_sig;
1374  
1375  	exit_code = *p_code;
1376  	if (!exit_code)
1377  		goto unlock_sig;
1378  
1379  	if (!unlikely(wo->wo_flags & WNOWAIT))
1380  		*p_code = 0;
1381  
1382  	/* don't need the RCU readlock here as we're holding a spinlock */
1383  	uid = __task_cred(p)->uid;
1384  unlock_sig:
1385  	spin_unlock_irq(&p->sighand->siglock);
1386  	if (!exit_code)
1387  		return 0;
1388  
1389  	/*
1390  	 * Now we are pretty sure this task is interesting.
1391  	 * Make sure it doesn't get reaped out from under us while we
1392  	 * give up the lock and then examine it below.  We don't want to
1393  	 * keep holding onto the tasklist_lock while we call getrusage and
1394  	 * possibly take page faults for user memory.
1395  	 */
1396  	get_task_struct(p);
1397  	pid = task_pid_vnr(p);
1398  	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1399  	read_unlock(&tasklist_lock);
1400  
1401  	if (unlikely(wo->wo_flags & WNOWAIT))
1402  		return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
1403  
1404  	retval = wo->wo_rusage
1405  		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1406  	if (!retval && wo->wo_stat)
1407  		retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
1408  
1409  	infop = wo->wo_info;
1410  	if (!retval && infop)
1411  		retval = put_user(SIGCHLD, &infop->si_signo);
1412  	if (!retval && infop)
1413  		retval = put_user(0, &infop->si_errno);
1414  	if (!retval && infop)
1415  		retval = put_user((short)why, &infop->si_code);
1416  	if (!retval && infop)
1417  		retval = put_user(exit_code, &infop->si_status);
1418  	if (!retval && infop)
1419  		retval = put_user(pid, &infop->si_pid);
1420  	if (!retval && infop)
1421  		retval = put_user(uid, &infop->si_uid);
1422  	if (!retval)
1423  		retval = pid;
1424  	put_task_struct(p);
1425  
1426  	BUG_ON(!retval);
1427  	return retval;
1428  }
1429  
1430  /*
1431   * Handle do_wait work for one task in a live, non-stopped state.
1432   * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
1433   * the lock and this task is uninteresting.  If we return nonzero, we have
1434   * released the lock and the system call should return.
1435   */
1436  static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1437  {
1438  	int retval;
1439  	pid_t pid;
1440  	uid_t uid;
1441  
1442  	if (!unlikely(wo->wo_flags & WCONTINUED))
1443  		return 0;
1444  
1445  	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1446  		return 0;
1447  
1448  	spin_lock_irq(&p->sighand->siglock);
1449  	/* Re-check with the lock held.  */
1450  	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1451  		spin_unlock_irq(&p->sighand->siglock);
1452  		return 0;
1453  	}
1454  	if (!unlikely(wo->wo_flags & WNOWAIT))
1455  		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1456  	uid = __task_cred(p)->uid;
1457  	spin_unlock_irq(&p->sighand->siglock);
1458  
1459  	pid = task_pid_vnr(p);
1460  	get_task_struct(p);
1461  	read_unlock(&tasklist_lock);
1462  
1463  	if (!wo->wo_info) {
1464  		retval = wo->wo_rusage
1465  			? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1466  		put_task_struct(p);
1467  		if (!retval && wo->wo_stat)
1468  			retval = put_user(0xffff, wo->wo_stat);
1469  		if (!retval)
1470  			retval = pid;
1471  	} else {
1472  		retval = wait_noreap_copyout(wo, p, pid, uid,
1473  					     CLD_CONTINUED, SIGCONT);
1474  		BUG_ON(retval == 0);
1475  	}
1476  
1477  	return retval;
1478  }
1479  
1480  /*
1481   * Consider @p for a wait by @parent.
1482   *
1483   * -ECHILD should be in ->notask_error before the first call.
1484   * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1485   * Returns zero if the search for a child should continue;
1486   * then ->notask_error is 0 if @p is an eligible child,
1487   * or another error from security_task_wait(), or still -ECHILD.
1488   */
1489  static int wait_consider_task(struct wait_opts *wo, int ptrace,
1490  				struct task_struct *p)
1491  {
1492  	int ret = eligible_child(wo, p);
1493  	if (!ret)
1494  		return ret;
1495  
1496  	ret = security_task_wait(p);
1497  	if (unlikely(ret < 0)) {
1498  		/*
1499  		 * If we have not yet seen any eligible child,
1500  		 * then let this error code replace -ECHILD.
1501  		 * A permission error will give the user a clue
1502  		 * to look for security policy problems, rather
1503  		 * than for mysterious wait bugs.
1504  		 */
1505  		if (wo->notask_error)
1506  			wo->notask_error = ret;
1507  		return 0;
1508  	}
1509  
1510  	if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1511  		/*
1512  		 * This child is hidden by ptrace.
1513  		 * We aren't allowed to see it now, but eventually we will.
1514  		 */
1515  		wo->notask_error = 0;
1516  		return 0;
1517  	}
1518  
1519  	if (p->exit_state == EXIT_DEAD)
1520  		return 0;
1521  
1522  	/*
1523  	 * We don't reap group leaders with subthreads.
1524  	 */
1525  	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
1526  		return wait_task_zombie(wo, p);
1527  
1528  	/*
1529  	 * It's stopped or running now, so it might
1530  	 * later continue, exit, or stop again.
1531  	 */
1532  	wo->notask_error = 0;
1533  
1534  	if (task_stopped_code(p, ptrace))
1535  		return wait_task_stopped(wo, ptrace, p);
1536  
1537  	return wait_task_continued(wo, p);
1538  }
1539  
1540  /*
1541   * Do the work of do_wait() for one thread in the group, @tsk.
1542   *
1543   * -ECHILD should be in ->notask_error before the first call.
1544   * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1545   * Returns zero if the search for a child should continue; then
1546   * ->notask_error is 0 if there were any eligible children,
1547   * or another error from security_task_wait(), or still -ECHILD.
1548   */
1549  static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1550  {
1551  	struct task_struct *p;
1552  
1553  	list_for_each_entry(p, &tsk->children, sibling) {
1554  		/*
1555  		 * Do not consider detached threads.
1556  		 */
1557  		if (!task_detached(p)) {
1558  			int ret = wait_consider_task(wo, 0, p);
1559  			if (ret)
1560  				return ret;
1561  		}
1562  	}
1563  
1564  	return 0;
1565  }
1566  
1567  static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1568  {
1569  	struct task_struct *p;
1570  
1571  	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1572  		int ret = wait_consider_task(wo, 1, p);
1573  		if (ret)
1574  			return ret;
1575  	}
1576  
1577  	return 0;
1578  }
1579  
1580  static int child_wait_callback(wait_queue_t *wait, unsigned mode,
1581  				int sync, void *key)
1582  {
1583  	struct wait_opts *wo = container_of(wait, struct wait_opts,
1584  						child_wait);
1585  	struct task_struct *p = key;
1586  
1587  	if (!eligible_pid(wo, p))
1588  		return 0;
1589  
1590  	if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1591  		return 0;
1592  
1593  	return default_wake_function(wait, mode, sync, key);
1594  }
1595  
1596  void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1597  {
1598  	__wake_up_sync_key(&parent->signal->wait_chldexit,
1599  				TASK_INTERRUPTIBLE, 1, p);
1600  }
1601  
1602  static long do_wait(struct wait_opts *wo)
1603  {
1604  	struct task_struct *tsk;
1605  	int retval;
1606  
1607  	trace_sched_process_wait(wo->wo_pid);
1608  
1609  	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1610  	wo->child_wait.private = current;
1611  	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1612  repeat:
1613  	/*
1614  	 * If there is nothing that can match our criteria, just get out.
1615  	 * We will clear ->notask_error to zero if we see any child that
1616  	 * might later match our criteria, even if we are not able to reap
1617  	 * it yet.
1618  	 */
1619  	wo->notask_error = -ECHILD;
1620  	if ((wo->wo_type < PIDTYPE_MAX) &&
1621  	   (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1622  		goto notask;
1623  
1624  	set_current_state(TASK_INTERRUPTIBLE);
1625  	read_lock(&tasklist_lock);
1626  	tsk = current;
1627  	do {
1628  		retval = do_wait_thread(wo, tsk);
1629  		if (retval)
1630  			goto end;
1631  
1632  		retval = ptrace_do_wait(wo, tsk);
1633  		if (retval)
1634  			goto end;
1635  
1636  		if (wo->wo_flags & __WNOTHREAD)
1637  			break;
1638  	} while_each_thread(current, tsk);
1639  	read_unlock(&tasklist_lock);
1640  
1641  notask:
1642  	retval = wo->notask_error;
1643  	if (!retval && !(wo->wo_flags & WNOHANG)) {
1644  		retval = -ERESTARTSYS;
1645  		if (!signal_pending(current)) {
1646  			schedule();
1647  			goto repeat;
1648  		}
1649  	}
1650  end:
1651  	__set_current_state(TASK_RUNNING);
1652  	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1653  	return retval;
1654  }
1655  
1656  SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1657  		infop, int, options, struct rusage __user *, ru)
1658  {
1659  	struct wait_opts wo;
1660  	struct pid *pid = NULL;
1661  	enum pid_type type;
1662  	long ret;
1663  
1664  	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
1665  		return -EINVAL;
1666  	if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1667  		return -EINVAL;
1668  
1669  	switch (which) {
1670  	case P_ALL:
1671  		type = PIDTYPE_MAX;
1672  		break;
1673  	case P_PID:
1674  		type = PIDTYPE_PID;
1675  		if (upid <= 0)
1676  			return -EINVAL;
1677  		break;
1678  	case P_PGID:
1679  		type = PIDTYPE_PGID;
1680  		if (upid <= 0)
1681  			return -EINVAL;
1682  		break;
1683  	default:
1684  		return -EINVAL;
1685  	}
1686  
1687  	if (type < PIDTYPE_MAX)
1688  		pid = find_get_pid(upid);
1689  
1690  	wo.wo_type	= type;
1691  	wo.wo_pid	= pid;
1692  	wo.wo_flags	= options;
1693  	wo.wo_info	= infop;
1694  	wo.wo_stat	= NULL;
1695  	wo.wo_rusage	= ru;
1696  	ret = do_wait(&wo);
1697  
1698  	if (ret > 0) {
1699  		ret = 0;
1700  	} else if (infop) {
1701  		/*
1702  		 * For a WNOHANG return, clear out all the fields
1703  		 * we would set so the user can easily tell the
1704  		 * difference.
1705  		 */
1706  		if (!ret)
1707  			ret = put_user(0, &infop->si_signo);
1708  		if (!ret)
1709  			ret = put_user(0, &infop->si_errno);
1710  		if (!ret)
1711  			ret = put_user(0, &infop->si_code);
1712  		if (!ret)
1713  			ret = put_user(0, &infop->si_pid);
1714  		if (!ret)
1715  			ret = put_user(0, &infop->si_uid);
1716  		if (!ret)
1717  			ret = put_user(0, &infop->si_status);
1718  	}
1719  
1720  	put_pid(pid);
1721  
1722  	/* avoid REGPARM breakage on x86: */
1723  	asmlinkage_protect(5, ret, which, upid, infop, options, ru);
1724  	return ret;
1725  }
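
/*
 * Illustrative sketch, not part of exit.c: the user-space view of the WNOHANG
 * handling above.  When no child has changed state, the syscall succeeds and
 * the siginfo fields are cleared, so callers can test si_pid to tell "nothing
 * yet" apart from a real child report.
 */
#if 0	/* user-space example only */
#include <sys/types.h>
#include <sys/wait.h>

int example_poll_child(pid_t child)
{
	siginfo_t info = { 0 };		/* cleared, as POSIX also recommends */

	if (waitid(P_PID, child, &info, WEXITED | WNOHANG) < 0)
		return -1;		/* real error */
	if (info.si_pid == 0)
		return 0;		/* child has not exited yet */
	return 1;			/* exited: info.si_status/si_code valid */
}
#endif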
1726  
1727  SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1728  		int, options, struct rusage __user *, ru)
1729  {
1730  	struct wait_opts wo;
1731  	struct pid *pid = NULL;
1732  	enum pid_type type;
1733  	long ret;
1734  
1735  	if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1736  			__WNOTHREAD|__WCLONE|__WALL))
1737  		return -EINVAL;
1738  
1739  	if (upid == -1)
1740  		type = PIDTYPE_MAX;
1741  	else if (upid < 0) {
1742  		type = PIDTYPE_PGID;
1743  		pid = find_get_pid(-upid);
1744  	} else if (upid == 0) {
1745  		type = PIDTYPE_PGID;
1746  		pid = get_task_pid(current, PIDTYPE_PGID);
1747  	} else /* upid > 0 */ {
1748  		type = PIDTYPE_PID;
1749  		pid = find_get_pid(upid);
1750  	}
1751  
1752  	wo.wo_type	= type;
1753  	wo.wo_pid	= pid;
1754  	wo.wo_flags	= options | WEXITED;
1755  	wo.wo_info	= NULL;
1756  	wo.wo_stat	= stat_addr;
1757  	wo.wo_rusage	= ru;
1758  	ret = do_wait(&wo);
1759  	put_pid(pid);
1760  
1761  	/* avoid REGPARM breakage on x86: */
1762  	asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
1763  	return ret;
1764  }
1765  
1766  #ifdef __ARCH_WANT_SYS_WAITPID
1767  
1768  /*
1769   * sys_waitpid() remains for compatibility. waitpid() should be
1770   * implemented by calling sys_wait4() from libc.a.
1771   */
1772  SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1773  {
1774  	return sys_wait4(pid, stat_addr, options, NULL);
1775  }
1776  
1777  #endif
1778