xref: /freebsd/sys/kern/kern_fork.c (revision 800da341bc4a35f4b4d82d104b130825d9a42ffa)
19454b2d8SWarner Losh /*-
251369649SPedro F. Giffuni  * SPDX-License-Identifier: BSD-3-Clause
351369649SPedro F. Giffuni  *
4df8bae1dSRodney W. Grimes  * Copyright (c) 1982, 1986, 1989, 1991, 1993
5df8bae1dSRodney W. Grimes  *	The Regents of the University of California.  All rights reserved.
6df8bae1dSRodney W. Grimes  * (c) UNIX System Laboratories, Inc.
7df8bae1dSRodney W. Grimes  * All or some portions of this file are derived from material licensed
8df8bae1dSRodney W. Grimes  * to the University of California by American Telephone and Telegraph
9df8bae1dSRodney W. Grimes  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10df8bae1dSRodney W. Grimes  * the permission of UNIX System Laboratories, Inc.
11df8bae1dSRodney W. Grimes  *
12df8bae1dSRodney W. Grimes  * Redistribution and use in source and binary forms, with or without
13df8bae1dSRodney W. Grimes  * modification, are permitted provided that the following conditions
14df8bae1dSRodney W. Grimes  * are met:
15df8bae1dSRodney W. Grimes  * 1. Redistributions of source code must retain the above copyright
16df8bae1dSRodney W. Grimes  *    notice, this list of conditions and the following disclaimer.
17df8bae1dSRodney W. Grimes  * 2. Redistributions in binary form must reproduce the above copyright
18df8bae1dSRodney W. Grimes  *    notice, this list of conditions and the following disclaimer in the
19df8bae1dSRodney W. Grimes  *    documentation and/or other materials provided with the distribution.
2069a28758SEd Maste  * 3. Neither the name of the University nor the names of its contributors
21df8bae1dSRodney W. Grimes  *    may be used to endorse or promote products derived from this software
22df8bae1dSRodney W. Grimes  *    without specific prior written permission.
23df8bae1dSRodney W. Grimes  *
24df8bae1dSRodney W. Grimes  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25df8bae1dSRodney W. Grimes  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26df8bae1dSRodney W. Grimes  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27df8bae1dSRodney W. Grimes  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28df8bae1dSRodney W. Grimes  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29df8bae1dSRodney W. Grimes  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30df8bae1dSRodney W. Grimes  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31df8bae1dSRodney W. Grimes  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32df8bae1dSRodney W. Grimes  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33df8bae1dSRodney W. Grimes  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34df8bae1dSRodney W. Grimes  * SUCH DAMAGE.
35df8bae1dSRodney W. Grimes  */
36df8bae1dSRodney W. Grimes 
37677b542eSDavid E. O'Brien #include <sys/cdefs.h>
38db6a20e2SGarrett Wollman #include "opt_ktrace.h"
398a945d10SKonstantin Belousov #include "opt_kstack_pages.h"
40db6a20e2SGarrett Wollman 
41df8bae1dSRodney W. Grimes #include <sys/param.h>
42df8bae1dSRodney W. Grimes #include <sys/systm.h>
4334ebdceaSMateusz Guzik #include <sys/bitstring.h>
44d2d3e875SBruce Evans #include <sys/sysproto.h>
4575b8b3b2SJohn Baldwin #include <sys/eventhandler.h>
46cfb5f768SJonathan Anderson #include <sys/fcntl.h>
47df8bae1dSRodney W. Grimes #include <sys/filedesc.h>
480304c731SJamie Gritton #include <sys/jail.h>
49df8bae1dSRodney W. Grimes #include <sys/kernel.h>
5070fca427SJohn Baldwin #include <sys/kthread.h>
51c76e95c3SPeter Wemm #include <sys/sysctl.h>
5219284646SJohn Baldwin #include <sys/lock.h>
53df8bae1dSRodney W. Grimes #include <sys/malloc.h>
545dda15adSMark Johnston #include <sys/msan.h>
5535e0e5b3SJohn Baldwin #include <sys/mutex.h>
56acd3428bSRobert Watson #include <sys/priv.h>
57df8bae1dSRodney W. Grimes #include <sys/proc.h>
58cfb5f768SJonathan Anderson #include <sys/procdesc.h>
59189ac973SJohn Baldwin #include <sys/ptrace.h>
60097055e2SEdward Tomasz Napierala #include <sys/racct.h>
61df8bae1dSRodney W. Grimes #include <sys/resourcevar.h>
62b43179fbSJeff Roberson #include <sys/sched.h>
63a7b124c3SJohn Baldwin #include <sys/syscall.h>
6470fca427SJohn Baldwin #include <sys/vmmeter.h>
65df8bae1dSRodney W. Grimes #include <sys/vnode.h>
66df8bae1dSRodney W. Grimes #include <sys/acct.h>
670384fff8SJason Evans #include <sys/ktr.h>
68df8bae1dSRodney W. Grimes #include <sys/ktrace.h>
69b71fec07SBruce Evans #include <sys/unistd.h>
705d217f17SJohn Birrell #include <sys/sdt.h>
7157934cd3SJohn Baldwin #include <sys/sx.h>
72e5d81ef1SDmitry Chagin #include <sys/sysent.h>
736004362eSDavid Schultz #include <sys/signalvar.h>
74df8bae1dSRodney W. Grimes 
75fcf7f27aSRobert Watson #include <security/audit/audit.h>
76aed55708SRobert Watson #include <security/mac/mac_framework.h>
77fcf7f27aSRobert Watson 
78d93f860cSPoul-Henning Kamp #include <vm/vm.h>
79dabee6feSPeter Wemm #include <vm/pmap.h>
80dabee6feSPeter Wemm #include <vm/vm_map.h>
81efeaf95aSDavid Greenman #include <vm/vm_extern.h>
82c897b813SJeff Roberson #include <vm/uma.h>
83d93f860cSPoul-Henning Kamp 
/*
 * DTrace glue.  When the kernel is built with KDTRACE_HOOKS this file
 * provides storage for the dtrace_fasttrap_fork hook (type
 * dtrace_fork_func_t, declared in <sys/dtrace_bsd.h>).
 */
845d217f17SJohn Birrell #ifdef KDTRACE_HOOKS
855d217f17SJohn Birrell #include <sys/dtrace_bsd.h>
865d217f17SJohn Birrell dtrace_fork_func_t	dtrace_fasttrap_fork;
875d217f17SJohn Birrell #endif
885d217f17SJohn Birrell 
/*
 * Static probe "create" in the "proc" provider.  It takes three arguments
 * typed "struct proc *", "struct proc *" and "int"; the provider itself is
 * only declared here, not defined.
 */
895d217f17SJohn Birrell SDT_PROVIDER_DECLARE(proc);
9036160958SMark Johnston SDT_PROBE_DEFINE3(proc, , , create, "struct proc *", "struct proc *", "int");
9188c5ea45SJulian Elischer 
/*
 * Fallback definition of the fork(2) argument structure, used only when
 * <sys/sysproto.h> has not already supplied one.  sys_fork() never reads
 * its argument, so the structure carries a single placeholder member.
 */
92d2d3e875SBruce Evans #ifndef _SYS_SYSPROTO_H_
93ad7507e2SSteven Wallace struct fork_args {
94ad7507e2SSteven Wallace 	int     dummy;
95ad7507e2SSteven Wallace };
96d2d3e875SBruce Evans #endif
97ad7507e2SSteven Wallace 
98df8bae1dSRodney W. Grimes /* ARGSUSED */
9926f9a767SRodney W. Grimes int
1008451d0ddSKip Macy sys_fork(struct thread *td, struct fork_args *uap)
101df8bae1dSRodney W. Grimes {
10233fd9b9aSMateusz Guzik 	struct fork_req fr;
103813361c1SMateusz Guzik 	int error, pid;
104be67169aSBruce Evans 
10533fd9b9aSMateusz Guzik 	bzero(&fr, sizeof(fr));
10633fd9b9aSMateusz Guzik 	fr.fr_flags = RFFDG | RFPROC;
107813361c1SMateusz Guzik 	fr.fr_pidp = &pid;
10833fd9b9aSMateusz Guzik 	error = fork1(td, &fr);
109df8abd0bSPeter Wemm 	if (error == 0) {
110813361c1SMateusz Guzik 		td->td_retval[0] = pid;
111b40ce416SJulian Elischer 		td->td_retval[1] = 0;
112df8abd0bSPeter Wemm 	}
11370fca427SJohn Baldwin 	return (error);
114df8bae1dSRodney W. Grimes }
115df8bae1dSRodney W. Grimes 
116cfb5f768SJonathan Anderson /* ARGUSED */
117cfb5f768SJonathan Anderson int
1180c829a30SMateusz Guzik sys_pdfork(struct thread *td, struct pdfork_args *uap)
119cfb5f768SJonathan Anderson {
12033fd9b9aSMateusz Guzik 	struct fork_req fr;
121813361c1SMateusz Guzik 	int error, fd, pid;
122cfb5f768SJonathan Anderson 
12333fd9b9aSMateusz Guzik 	bzero(&fr, sizeof(fr));
12433fd9b9aSMateusz Guzik 	fr.fr_flags = RFFDG | RFPROC | RFPROCDESC;
125813361c1SMateusz Guzik 	fr.fr_pidp = &pid;
12633fd9b9aSMateusz Guzik 	fr.fr_pd_fd = &fd;
12733fd9b9aSMateusz Guzik 	fr.fr_pd_flags = uap->flags;
128757a5642SChristian S.J. Peron 	AUDIT_ARG_FFLAGS(uap->flags);
129cfb5f768SJonathan Anderson 	/*
130cfb5f768SJonathan Anderson 	 * It is necessary to return fd by reference because 0 is a valid file
131cfb5f768SJonathan Anderson 	 * descriptor number, and the child needs to be able to distinguish
132cfb5f768SJonathan Anderson 	 * itself from the parent using the return value.
133cfb5f768SJonathan Anderson 	 */
13433fd9b9aSMateusz Guzik 	error = fork1(td, &fr);
135cfb5f768SJonathan Anderson 	if (error == 0) {
136813361c1SMateusz Guzik 		td->td_retval[0] = pid;
137cfb5f768SJonathan Anderson 		td->td_retval[1] = 0;
138cfb5f768SJonathan Anderson 		error = copyout(&fd, uap->fdp, sizeof(fd));
139cfb5f768SJonathan Anderson 	}
140cfb5f768SJonathan Anderson 	return (error);
141cfb5f768SJonathan Anderson }
142cfb5f768SJonathan Anderson 
143df8bae1dSRodney W. Grimes /* ARGSUSED */
14426f9a767SRodney W. Grimes int
1458451d0ddSKip Macy sys_vfork(struct thread *td, struct vfork_args *uap)
146df8bae1dSRodney W. Grimes {
14733fd9b9aSMateusz Guzik 	struct fork_req fr;
148813361c1SMateusz Guzik 	int error, pid;
149be67169aSBruce Evans 
15033fd9b9aSMateusz Guzik 	bzero(&fr, sizeof(fr));
15133fd9b9aSMateusz Guzik 	fr.fr_flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
152813361c1SMateusz Guzik 	fr.fr_pidp = &pid;
15333fd9b9aSMateusz Guzik 	error = fork1(td, &fr);
154df8abd0bSPeter Wemm 	if (error == 0) {
155813361c1SMateusz Guzik 		td->td_retval[0] = pid;
156b40ce416SJulian Elischer 		td->td_retval[1] = 0;
157df8abd0bSPeter Wemm 	}
15870fca427SJohn Baldwin 	return (error);
159df8bae1dSRodney W. Grimes }
160df8bae1dSRodney W. Grimes 
161dabee6feSPeter Wemm int
1628451d0ddSKip Macy sys_rfork(struct thread *td, struct rfork_args *uap)
163dabee6feSPeter Wemm {
16433fd9b9aSMateusz Guzik 	struct fork_req fr;
165813361c1SMateusz Guzik 	int error, pid;
166be67169aSBruce Evans 
167c8564ad4SBruce Evans 	/* Don't allow kernel-only flags. */
168885ccc61SJohn Baldwin 	if ((uap->flags & RFKERNELONLY) != 0)
169885ccc61SJohn Baldwin 		return (EINVAL);
170079c5b9eSKyle Evans 	/* RFSPAWN must not appear with others */
171079c5b9eSKyle Evans 	if ((uap->flags & RFSPAWN) != 0 && uap->flags != RFSPAWN)
172079c5b9eSKyle Evans 		return (EINVAL);
173c8564ad4SBruce Evans 
17414961ba7SRobert Watson 	AUDIT_ARG_FFLAGS(uap->flags);
17533fd9b9aSMateusz Guzik 	bzero(&fr, sizeof(fr));
176079c5b9eSKyle Evans 	if ((uap->flags & RFSPAWN) != 0) {
177079c5b9eSKyle Evans 		fr.fr_flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
178079c5b9eSKyle Evans 		fr.fr_flags2 = FR2_DROPSIG_CAUGHT;
179079c5b9eSKyle Evans 	} else {
18033fd9b9aSMateusz Guzik 		fr.fr_flags = uap->flags;
181079c5b9eSKyle Evans 	}
182813361c1SMateusz Guzik 	fr.fr_pidp = &pid;
18333fd9b9aSMateusz Guzik 	error = fork1(td, &fr);
184df8abd0bSPeter Wemm 	if (error == 0) {
185813361c1SMateusz Guzik 		td->td_retval[0] = pid;
186b40ce416SJulian Elischer 		td->td_retval[1] = 0;
187df8abd0bSPeter Wemm 	}
18870fca427SJohn Baldwin 	return (error);
189dabee6feSPeter Wemm }
190dabee6feSPeter Wemm 
/*
 * nprocs starts at 1 to account for process 0 (presumably it is the
 * running count of processes -- confirm against its users).
 *
 * lastpid is the starting point for the next pid search: fork_findpid()
 * begins at lastpid + 1 and stores the pid it hands out back here, except
 * for RFHIGHPID requests.  It is exported read-only as kern.lastpid.
 */
19137d2b1f3SMateusz Guzik int __exclusive_cache_line	nprocs = 1;		/* process 0 */
1928f7e4eb5SDag-Erling Smørgrav int	lastpid = 0;
1938f7e4eb5SDag-Erling Smørgrav SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
194d941d475SRobert Watson     "Last used PID");
195df8bae1dSRodney W. Grimes 
196bb6a234eSPeter Wemm /*
1978f7e4eb5SDag-Erling Smørgrav  * Random component to lastpid generation.  We mix in a random factor to make
198bb6a234eSPeter Wemm  * it a little harder to predict.  We sanity check the modulus value to avoid
199bb6a234eSPeter Wemm  * doing it in critical paths.  Don't let it be too small or we pointlessly
200bb6a234eSPeter Wemm  * waste randomness entropy, and don't let it be impossibly large.  Using a
201bb6a234eSPeter Wemm  * modulus that is too big causes a LOT more process table scans and slows
202bb6a234eSPeter Wemm  * down fork processing as the pidchecked caching is defeated.
 *
 * The variable below holds that modulus.  Zero turns randomization off;
 * any other value makes fork_findpid() add (arc4random() % randompid) to
 * its starting candidate for requests without RFHIGHPID.  It is set
 * through the kern.randompid sysctl handler, sysctl_kern_randompid().
203bb6a234eSPeter Wemm  */
204ee3fd601SDan Moschuk static int randompid = 0;
205bb6a234eSPeter Wemm 
206bb6a234eSPeter Wemm static int
20782d9ae4eSPoul-Henning Kamp sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
208bb6a234eSPeter Wemm {
209bb6a234eSPeter Wemm 	int error, pid;
210bb6a234eSPeter Wemm 
21147934cefSDon Lewis 	error = sysctl_wire_old_buffer(req, sizeof(int));
21247934cefSDon Lewis 	if (error != 0)
21347934cefSDon Lewis 		return(error);
2143fc755c1SJohn Baldwin 	sx_xlock(&allproc_lock);
215bb6a234eSPeter Wemm 	pid = randompid;
216bb6a234eSPeter Wemm 	error = sysctl_handle_int(oidp, &pid, 0, req);
2173fc755c1SJohn Baldwin 	if (error == 0 && req->newptr != NULL) {
218008a0935SDag-Erling Smørgrav 		if (pid == 0)
219008a0935SDag-Erling Smørgrav 			randompid = 0;
220008a0935SDag-Erling Smørgrav 		else if (pid == 1)
221008a0935SDag-Erling Smørgrav 			/* generate a random PID modulus between 100 and 1123 */
222008a0935SDag-Erling Smørgrav 			randompid = 100 + arc4random() % 1024;
223008a0935SDag-Erling Smørgrav 		else if (pid < 0 || pid > pid_max - 100)
224008a0935SDag-Erling Smørgrav 			/* out of range */
225008a0935SDag-Erling Smørgrav 			randompid = pid_max - 100;
226008a0935SDag-Erling Smørgrav 		else if (pid < 100)
227008a0935SDag-Erling Smørgrav 			/* Make it reasonable */
228008a0935SDag-Erling Smørgrav 			randompid = 100;
229008a0935SDag-Erling Smørgrav 		else
230bb6a234eSPeter Wemm 			randompid = pid;
2313fc755c1SJohn Baldwin 	}
2323fc755c1SJohn Baldwin 	sx_xunlock(&allproc_lock);
233bb6a234eSPeter Wemm 	return (error);
234bb6a234eSPeter Wemm }
235bb6a234eSPeter Wemm 
/*
 * Register kern.randompid as a read/write integer sysctl whose reads and
 * writes are both routed through sysctl_kern_randompid().
 */
2367029da5cSPawel Biernacki SYSCTL_PROC(_kern, OID_AUTO, randompid,
2377029da5cSPawel Biernacki     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
2387029da5cSPawel Biernacki     sysctl_kern_randompid, "I",
2397029da5cSPawel Biernacki     "Random PID modulus. Special values: 0: disable, 1: choose random value");
240ee3fd601SDan Moschuk 
/*
 * ID bitmaps defined outside this file.  fork_findpid() searches
 * proc_id_pidmap for a clear bit and sets the bit it picks; the other
 * three maps are only tested, so that a number whose bit is set in the
 * grpid, sessid or reap map is skipped.
 */
24134ebdceaSMateusz Guzik extern bitstr_t proc_id_pidmap;
24234ebdceaSMateusz Guzik extern bitstr_t proc_id_grpidmap;
24334ebdceaSMateusz Guzik extern bitstr_t proc_id_sessidmap;
24434ebdceaSMateusz Guzik extern bitstr_t proc_id_reapmap;
24534ebdceaSMateusz Guzik 
24619b75ef5SMateusz Guzik /*
24719b75ef5SMateusz Guzik  * Find an unused process ID
24819b75ef5SMateusz Guzik  *
24919b75ef5SMateusz Guzik  * If RFHIGHPID is set (used during system boot), do not allocate
25019b75ef5SMateusz Guzik  * low-numbered pids.
25119b75ef5SMateusz Guzik  */
2521d845e86SEdward Tomasz Napierala static int
253afd01097SEdward Tomasz Napierala fork_findpid(int flags)
254afd01097SEdward Tomasz Napierala {
25534ebdceaSMateusz Guzik 	pid_t result;
25650c7615fSMateusz Guzik 	int trypid, random;
25750c7615fSMateusz Guzik 
25850c7615fSMateusz Guzik 	/*
25950c7615fSMateusz Guzik 	 * Avoid calling arc4random with procid_lock held.
26050c7615fSMateusz Guzik 	 */
26150c7615fSMateusz Guzik 	random = 0;
26250c7615fSMateusz Guzik 	if (__predict_false(randompid))
26350c7615fSMateusz Guzik 		random = arc4random() % randompid;
26450c7615fSMateusz Guzik 
26550c7615fSMateusz Guzik 	mtx_lock(&procid_lock);
266afd01097SEdward Tomasz Napierala 
267afd01097SEdward Tomasz Napierala 	trypid = lastpid + 1;
268afd01097SEdward Tomasz Napierala 	if (flags & RFHIGHPID) {
269afd01097SEdward Tomasz Napierala 		if (trypid < 10)
270afd01097SEdward Tomasz Napierala 			trypid = 10;
271afd01097SEdward Tomasz Napierala 	} else {
27250c7615fSMateusz Guzik 		trypid += random;
273afd01097SEdward Tomasz Napierala 	}
274afd01097SEdward Tomasz Napierala retry:
275b05641b6SMateusz Guzik 	if (trypid >= pid_max)
276b05641b6SMateusz Guzik 		trypid = 2;
277afd01097SEdward Tomasz Napierala 
27834ebdceaSMateusz Guzik 	bit_ffc_at(&proc_id_pidmap, trypid, pid_max, &result);
279eab2132aSMateusz Guzik 	if (result == -1) {
280b05641b6SMateusz Guzik 		KASSERT(trypid != 2, ("unexpectedly ran out of IDs"));
281b05641b6SMateusz Guzik 		trypid = 2;
282afd01097SEdward Tomasz Napierala 		goto retry;
283eab2132aSMateusz Guzik 	}
28434ebdceaSMateusz Guzik 	if (bit_test(&proc_id_grpidmap, result) ||
28534ebdceaSMateusz Guzik 	    bit_test(&proc_id_sessidmap, result) ||
28634ebdceaSMateusz Guzik 	    bit_test(&proc_id_reapmap, result)) {
28719b75ef5SMateusz Guzik 		trypid = result + 1;
28834ebdceaSMateusz Guzik 		goto retry;
289afd01097SEdward Tomasz Napierala 	}
290afd01097SEdward Tomasz Napierala 
291afd01097SEdward Tomasz Napierala 	/*
292afd01097SEdward Tomasz Napierala 	 * RFHIGHPID does not mess with the lastpid counter during boot.
293afd01097SEdward Tomasz Napierala 	 */
29434ebdceaSMateusz Guzik 	if ((flags & RFHIGHPID) == 0)
29534ebdceaSMateusz Guzik 		lastpid = result;
296afd01097SEdward Tomasz Napierala 
29734ebdceaSMateusz Guzik 	bit_set(&proc_id_pidmap, result);
29834ebdceaSMateusz Guzik 	mtx_unlock(&procid_lock);
2991e9a1bf5SMateusz Guzik 
30034ebdceaSMateusz Guzik 	return (result);
301afd01097SEdward Tomasz Napierala }
302afd01097SEdward Tomasz Napierala 
303afd01097SEdward Tomasz Napierala static int
3043e73ff1eSEdward Tomasz Napierala fork_norfproc(struct thread *td, int flags)
3051d845e86SEdward Tomasz Napierala {
3061d845e86SEdward Tomasz Napierala 	struct proc *p1;
307bd76586bSKonstantin Belousov 	int error;
3081d845e86SEdward Tomasz Napierala 
309087bfb0eSEdward Tomasz Napierala 	KASSERT((flags & RFPROC) == 0,
310087bfb0eSEdward Tomasz Napierala 	    ("fork_norfproc called with RFPROC set"));
3111d845e86SEdward Tomasz Napierala 	p1 = td->td_proc;
3121d845e86SEdward Tomasz Napierala 
3139246b309SMark Johnston 	/*
3149246b309SMark Johnston 	 * Quiesce other threads if necessary.  If RFMEM is not specified we
3159246b309SMark Johnston 	 * must ensure that other threads do not concurrently create a second
3169246b309SMark Johnston 	 * process sharing the vmspace, see vmspace_unshare().
3179246b309SMark Johnston 	 */
3189246b309SMark Johnston 	if ((p1->p_flag & (P_HADTHREADS | P_SYSTEM)) == P_HADTHREADS &&
3199246b309SMark Johnston 	    ((flags & (RFCFDG | RFFDG)) != 0 || (flags & RFMEM) == 0)) {
3201d845e86SEdward Tomasz Napierala 		PROC_LOCK(p1);
3216ddcc233SKonstantin Belousov 		if (thread_single(p1, SINGLE_BOUNDARY)) {
3221d845e86SEdward Tomasz Napierala 			PROC_UNLOCK(p1);
3231d845e86SEdward Tomasz Napierala 			return (ERESTART);
3241d845e86SEdward Tomasz Napierala 		}
3251d845e86SEdward Tomasz Napierala 		PROC_UNLOCK(p1);
3261d845e86SEdward Tomasz Napierala 	}
3271d845e86SEdward Tomasz Napierala 
3281d845e86SEdward Tomasz Napierala 	error = vm_forkproc(td, NULL, NULL, NULL, flags);
329bd76586bSKonstantin Belousov 	if (error != 0)
3301d845e86SEdward Tomasz Napierala 		goto fail;
3311d845e86SEdward Tomasz Napierala 
3321d845e86SEdward Tomasz Napierala 	/*
3331d845e86SEdward Tomasz Napierala 	 * Close all file descriptors.
3341d845e86SEdward Tomasz Napierala 	 */
335bd76586bSKonstantin Belousov 	if ((flags & RFCFDG) != 0) {
3361d845e86SEdward Tomasz Napierala 		struct filedesc *fdtmp;
33785078b85SConrad Meyer 		struct pwddesc *pdtmp;
338bd76586bSKonstantin Belousov 
33985078b85SConrad Meyer 		pdtmp = pdinit(td->td_proc->p_pd, false);
340893d20c9SMateusz Guzik 		fdtmp = fdinit();
34185078b85SConrad Meyer 		pdescfree(td);
3422609222aSPawel Jakub Dawidek 		fdescfree(td);
3431d845e86SEdward Tomasz Napierala 		p1->p_fd = fdtmp;
34485078b85SConrad Meyer 		p1->p_pd = pdtmp;
3451d845e86SEdward Tomasz Napierala 	}
3461d845e86SEdward Tomasz Napierala 
3471d845e86SEdward Tomasz Napierala 	/*
3481d845e86SEdward Tomasz Napierala 	 * Unshare file descriptors (from parent).
3491d845e86SEdward Tomasz Napierala 	 */
350bd76586bSKonstantin Belousov 	if ((flags & RFFDG) != 0) {
351b9d32c36SMateusz Guzik 		fdunshare(td);
35285078b85SConrad Meyer 		pdunshare(td);
35385078b85SConrad Meyer 	}
3541d845e86SEdward Tomasz Napierala 
3551d845e86SEdward Tomasz Napierala fail:
3569246b309SMark Johnston 	if ((p1->p_flag & (P_HADTHREADS | P_SYSTEM)) == P_HADTHREADS &&
3579246b309SMark Johnston 	    ((flags & (RFCFDG | RFFDG)) != 0 || (flags & RFMEM) == 0)) {
3581d845e86SEdward Tomasz Napierala 		PROC_LOCK(p1);
3596ddcc233SKonstantin Belousov 		thread_single_end(p1, SINGLE_BOUNDARY);
3601d845e86SEdward Tomasz Napierala 		PROC_UNLOCK(p1);
3611d845e86SEdward Tomasz Napierala 	}
3621d845e86SEdward Tomasz Napierala 	return (error);
3631d845e86SEdward Tomasz Napierala }
3641d845e86SEdward Tomasz Napierala 
365afd01097SEdward Tomasz Napierala static void
366813361c1SMateusz Guzik do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread *td2,
367813361c1SMateusz Guzik     struct vmspace *vm2, struct file *fp_procdesc)
368df8bae1dSRodney W. Grimes {
369afd01097SEdward Tomasz Napierala 	struct proc *p1, *pptr;
3705641ae5dSJohn Baldwin 	struct filedesc *fd;
371ad05d580STor Egge 	struct filedesc_to_leader *fdtol;
37285078b85SConrad Meyer 	struct pwddesc *pd;
3733fc755c1SJohn Baldwin 	struct sigacts *newsigacts;
3745856e12eSJohn Dyson 
37570fca427SJohn Baldwin 	p1 = td->td_proc;
37670fca427SJohn Baldwin 
3771ad9ee86SXin LI 	PROC_LOCK(p1);
3781ad9ee86SXin LI 	bcopy(&p1->p_startcopy, &p2->p_startcopy,
3791ad9ee86SXin LI 	    __rangeof(struct proc, p_startcopy, p_endcopy));
3808b4a2800SKonstantin Belousov 	pargs_hold(p2->p_args);
3811ad9ee86SXin LI 	PROC_UNLOCK(p1);
3821ad9ee86SXin LI 
3831ad9ee86SXin LI 	bzero(&p2->p_startzero,
3841ad9ee86SXin LI 	    __rangeof(struct proc, p_startzero, p_endzero));
3851ad9ee86SXin LI 
3860304c731SJamie Gritton 	/* Tell the prison that we exist. */
387413628a7SBjoern A. Zeeb 	prison_proc_hold(p2->p_ucred->cr_prison);
388413628a7SBjoern A. Zeeb 
38950c7615fSMateusz Guzik 	p2->p_state = PRS_NEW;		/* protect against others */
39050c7615fSMateusz Guzik 	p2->p_pid = fork_findpid(fr->fr_flags);
39150c7615fSMateusz Guzik 	AUDIT_ARG_PID(p2->p_pid);
39246dd801aSColin Percival 	TSFORK(p2->p_pid, p1->p_pid);
39350c7615fSMateusz Guzik 
39450c7615fSMateusz Guzik 	sx_xlock(&allproc_lock);
39550c7615fSMateusz Guzik 	LIST_INSERT_HEAD(&allproc, p2, p_list);
39650c7615fSMateusz Guzik 	allproc_gen++;
3975ecb5444SMateusz Guzik 	prison_proc_link(p2->p_ucred->cr_prison, p2);
39850c7615fSMateusz Guzik 	sx_xunlock(&allproc_lock);
39950c7615fSMateusz Guzik 
40050c7615fSMateusz Guzik 	sx_xlock(PIDHASHLOCK(p2->p_pid));
40150c7615fSMateusz Guzik 	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
40250c7615fSMateusz Guzik 	sx_xunlock(PIDHASHLOCK(p2->p_pid));
4031ad9ee86SXin LI 
40481d68271SMateusz Guzik 	tidhash_add(td2);
40581d68271SMateusz Guzik 
4060384fff8SJason Evans 	/*
4073fc755c1SJohn Baldwin 	 * Malloc things while we don't hold any locks.
4083fc755c1SJohn Baldwin 	 */
409813361c1SMateusz Guzik 	if (fr->fr_flags & RFSIGSHARE)
4103fc755c1SJohn Baldwin 		newsigacts = NULL;
41190af4afaSJohn Baldwin 	else
41290af4afaSJohn Baldwin 		newsigacts = sigacts_alloc();
4133fc755c1SJohn Baldwin 
4143fc755c1SJohn Baldwin 	/*
4153fc755c1SJohn Baldwin 	 * Copy filedesc.
4163fc755c1SJohn Baldwin 	 */
417813361c1SMateusz Guzik 	if (fr->fr_flags & RFCFDG) {
41885078b85SConrad Meyer 		pd = pdinit(p1->p_pd, false);
419893d20c9SMateusz Guzik 		fd = fdinit();
420ad05d580STor Egge 		fdtol = NULL;
421813361c1SMateusz Guzik 	} else if (fr->fr_flags & RFFDG) {
422f8f74aaaSConrad Meyer 		if (fr->fr_flags2 & FR2_SHARE_PATHS)
423f8f74aaaSConrad Meyer 			pd = pdshare(p1->p_pd);
424f8f74aaaSConrad Meyer 		else
42585078b85SConrad Meyer 			pd = pdcopy(p1->p_pd);
426598b7ec8SPoul-Henning Kamp 		fd = fdcopy(p1->p_fd);
427ad05d580STor Egge 		fdtol = NULL;
428ad05d580STor Egge 	} else {
429f8f74aaaSConrad Meyer 		if (fr->fr_flags2 & FR2_SHARE_PATHS)
430f8f74aaaSConrad Meyer 			pd = pdcopy(p1->p_pd);
431f8f74aaaSConrad Meyer 		else
43285078b85SConrad Meyer 			pd = pdshare(p1->p_pd);
433c7f1c11bSAlfred Perlstein 		fd = fdshare(p1->p_fd);
434ad05d580STor Egge 		if (p1->p_fdtol == NULL)
4353e73ff1eSEdward Tomasz Napierala 			p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL,
436ad05d580STor Egge 			    p1->p_leader);
437813361c1SMateusz Guzik 		if ((fr->fr_flags & RFTHREAD) != 0) {
438ad05d580STor Egge 			/*
4393e73ff1eSEdward Tomasz Napierala 			 * Shared file descriptor table, and shared
4403e73ff1eSEdward Tomasz Napierala 			 * process leaders.
441ad05d580STor Egge 			 */
442d07675a9SMark Johnston 			fdtol = filedesc_to_leader_share(p1->p_fdtol, p1->p_fd);
443ad05d580STor Egge 		} else {
444ad05d580STor Egge 			/*
4453e73ff1eSEdward Tomasz Napierala 			 * Shared file descriptor table, and different
4463e73ff1eSEdward Tomasz Napierala 			 * process leaders.
447ad05d580STor Egge 			 */
448ad05d580STor Egge 			fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
4493e73ff1eSEdward Tomasz Napierala 			    p1->p_fd, p2);
450ad05d580STor Egge 		}
451ad05d580STor Egge 	}
4523fc755c1SJohn Baldwin 	/*
453df8bae1dSRodney W. Grimes 	 * Make a proc table entry for the new process.
454df8bae1dSRodney W. Grimes 	 * Start by zeroing the section of proc that is zero-initialized,
455df8bae1dSRodney W. Grimes 	 * then copy the section that is copied directly from the parent.
456df8bae1dSRodney W. Grimes 	 */
457316ec49aSScott Long 
4587d447c95SJohn Baldwin 	PROC_LOCK(p2);
4597d447c95SJohn Baldwin 	PROC_LOCK(p1);
4607d447c95SJohn Baldwin 
461079b7badSJulian Elischer 	bzero(&td2->td_startzero,
4626db36923SDavid Schultz 	    __rangeof(struct thread, td_startzero, td_endzero));
463079b7badSJulian Elischer 
464079b7badSJulian Elischer 	bcopy(&td->td_startcopy, &td2->td_startcopy,
4656db36923SDavid Schultz 	    __rangeof(struct thread, td_startcopy, td_endcopy));
466df8bae1dSRodney W. Grimes 
4674b9322aeSJulian Elischer 	bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
468a30ec4b9SDavid Xu 	td2->td_sigstk = td->td_sigstk;
469b61ce5b0SJeff Roberson 	td2->td_flags = TDF_INMEM;
470acbe332aSDavid Xu 	td2->td_lend_user_pri = PRI_MAX;
471a30ec4b9SDavid Xu 
47221ca7b57SMarko Zec #ifdef VIMAGE
47321ca7b57SMarko Zec 	td2->td_vnet = NULL;
47421ca7b57SMarko Zec 	td2->td_vnet_lpush = NULL;
47521ca7b57SMarko Zec #endif
47621ca7b57SMarko Zec 
477df8bae1dSRodney W. Grimes 	/*
47822d19207SJohn Baldwin 	 * Allow the scheduler to initialize the child.
47922d19207SJohn Baldwin 	 */
48022d19207SJohn Baldwin 	thread_lock(td);
48122d19207SJohn Baldwin 	sched_fork(td, td2);
482626d6992SEdward Tomasz Napierala 	/*
483626d6992SEdward Tomasz Napierala 	 * Request AST to check for TDP_RFPPWAIT.  Do it here
484626d6992SEdward Tomasz Napierala 	 * to avoid calling thread_lock() again.
485626d6992SEdward Tomasz Napierala 	 */
486626d6992SEdward Tomasz Napierala 	if ((fr->fr_flags & RFPPWAIT) != 0)
487c6d31b83SKonstantin Belousov 		ast_sched_locked(td, TDA_VFORK);
48822d19207SJohn Baldwin 	thread_unlock(td);
48922d19207SJohn Baldwin 
49022d19207SJohn Baldwin 	/*
491df8bae1dSRodney W. Grimes 	 * Duplicate sub-structures as needed.
492df8bae1dSRodney W. Grimes 	 * Increase reference counts on shared objects.
493df8bae1dSRodney W. Grimes 	 */
494b61ce5b0SJeff Roberson 	p2->p_flag = P_INMEM;
495fa50a355SKonstantin Belousov 	p2->p_flag2 = p1->p_flag2 & (P2_ASLR_DISABLE | P2_ASLR_ENABLE |
4962ffee5c1SMark Johnston 	    P2_ASLR_IGNSTART | P2_NOTRACE | P2_NOTRACE_EXEC |
497fe69291fSKonstantin Belousov 	    P2_PROTMAX_ENABLE | P2_PROTMAX_DISABLE | P2_TRAPCAP |
498796a8e1aSKonstantin Belousov 	    P2_STKGAP_DISABLE | P2_STKGAP_DISABLE_EXEC | P2_NO_NEW_PRIVS |
499796a8e1aSKonstantin Belousov 	    P2_WXORX_DISABLE | P2_WXORX_ENABLE_EXEC);
50054b0e65fSJeff Roberson 	p2->p_swtick = ticks;
5019752f794SJohn Baldwin 	if (p1->p_flag & P_PROFIL)
5029752f794SJohn Baldwin 		startprofclock(p2);
503b9df5231SPoul-Henning Kamp 
504813361c1SMateusz Guzik 	if (fr->fr_flags & RFSIGSHARE) {
50590af4afaSJohn Baldwin 		p2->p_sigacts = sigacts_hold(p1->p_sigacts);
5066626c604SJulian Elischer 	} else {
50790af4afaSJohn Baldwin 		sigacts_copy(newsigacts, p1->p_sigacts);
50890af4afaSJohn Baldwin 		p2->p_sigacts = newsigacts;
509640d5404SJohn Baldwin 		if ((fr->fr_flags2 & (FR2_DROPSIG_CAUGHT | FR2_KPROC)) != 0) {
510079c5b9eSKyle Evans 			mtx_lock(&p2->p_sigacts->ps_mtx);
511640d5404SJohn Baldwin 			if ((fr->fr_flags2 & FR2_DROPSIG_CAUGHT) != 0)
512079c5b9eSKyle Evans 				sig_drop_caught(p2);
513640d5404SJohn Baldwin 			if ((fr->fr_flags2 & FR2_KPROC) != 0)
514640d5404SJohn Baldwin 				p2->p_sigacts->ps_flag |= PS_NOCLDWAIT;
515079c5b9eSKyle Evans 			mtx_unlock(&p2->p_sigacts->ps_mtx);
516079c5b9eSKyle Evans 		}
5176626c604SJulian Elischer 	}
518f49d8202SKonstantin Belousov 
519813361c1SMateusz Guzik 	if (fr->fr_flags & RFTSIGZMB)
520813361c1SMateusz Guzik 	        p2->p_sigparent = RFTSIGNUM(fr->fr_flags);
521813361c1SMateusz Guzik 	else if (fr->fr_flags & RFLINUXTHPN)
5226626c604SJulian Elischer 	        p2->p_sigparent = SIGUSR1;
5234ac9ae70SJulian Elischer 	else
5244ac9ae70SJulian Elischer 	        p2->p_sigparent = SIGCHLD;
52588c5ea45SJulian Elischer 
526640d5404SJohn Baldwin 	if ((fr->fr_flags2 & FR2_KPROC) != 0) {
527640d5404SJohn Baldwin 		p2->p_flag |= P_SYSTEM | P_KPROC;
528640d5404SJohn Baldwin 		td2->td_pflags |= TDP_KTHREAD;
529640d5404SJohn Baldwin 	}
530640d5404SJohn Baldwin 
531df8bae1dSRodney W. Grimes 	p2->p_textvp = p1->p_textvp;
532351d5f7fSKonstantin Belousov 	p2->p_textdvp = p1->p_textdvp;
5335641ae5dSJohn Baldwin 	p2->p_fd = fd;
534ad05d580STor Egge 	p2->p_fdtol = fdtol;
53585078b85SConrad Meyer 	p2->p_pd = pd;
536dabee6feSPeter Wemm 
53755648840SJohn Baldwin 	if (p1->p_flag2 & P2_INHERIT_PROTECTED) {
53855648840SJohn Baldwin 		p2->p_flag |= P_PROTECTED;
53955648840SJohn Baldwin 		p2->p_flag2 |= P2_INHERIT_PROTECTED;
54055648840SJohn Baldwin 	}
54155648840SJohn Baldwin 
542df8bae1dSRodney W. Grimes 	/*
543c8564ad4SBruce Evans 	 * p_limit is copy-on-write.  Bump its refcount.
544df8bae1dSRodney W. Grimes 	 */
5451c4bcd05SJeff Roberson 	lim_fork(p1, p2);
5468b059651SDavid Schultz 
5474ea6a9a2SMateusz Guzik 	thread_cow_get_proc(td2, p2);
5484ea6a9a2SMateusz Guzik 
5498b059651SDavid Schultz 	pstats_fork(p1->p_stats, p2->p_stats);
5508b059651SDavid Schultz 
551299bc736SDavid Schultz 	PROC_UNLOCK(p1);
552cda5aba4SDavid Schultz 	PROC_UNLOCK(p2);
553df8bae1dSRodney W. Grimes 
554351d5f7fSKonstantin Belousov 	/*
555351d5f7fSKonstantin Belousov 	 * Bump references to the text vnode and directory, and copy
556351d5f7fSKonstantin Belousov 	 * the hardlink name.
557351d5f7fSKonstantin Belousov 	 */
558351d5f7fSKonstantin Belousov 	if (p2->p_textvp != NULL)
5595afb134cSMateusz Guzik 		vrefact(p2->p_textvp);
560351d5f7fSKonstantin Belousov 	if (p2->p_textdvp != NULL)
561351d5f7fSKonstantin Belousov 		vrefact(p2->p_textdvp);
562351d5f7fSKonstantin Belousov 	p2->p_binname = p1->p_binname == NULL ? NULL :
563351d5f7fSKonstantin Belousov 	    strdup(p1->p_binname, M_PARGS);
564a69d88afSPeter Wemm 
565c6544064SJohn Baldwin 	/*
566c8564ad4SBruce Evans 	 * Set up linkage for kernel based threading.
567c6544064SJohn Baldwin 	 */
568813361c1SMateusz Guzik 	if ((fr->fr_flags & RFTHREAD) != 0) {
569c6544064SJohn Baldwin 		mtx_lock(&ppeers_lock);
570c6544064SJohn Baldwin 		p2->p_peers = p1->p_peers;
571c6544064SJohn Baldwin 		p1->p_peers = p2;
572c6544064SJohn Baldwin 		p2->p_leader = p1->p_leader;
573c6544064SJohn Baldwin 		mtx_unlock(&ppeers_lock);
574c6544064SJohn Baldwin 		PROC_LOCK(p1->p_leader);
575c6544064SJohn Baldwin 		if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
576c6544064SJohn Baldwin 			PROC_UNLOCK(p1->p_leader);
577c6544064SJohn Baldwin 			/*
578c6544064SJohn Baldwin 			 * The task leader is exiting, so process p1 is
579c6544064SJohn Baldwin 			 * going to be killed shortly.  Since p1 obviously
580c6544064SJohn Baldwin 			 * isn't dead yet, we know that the leader is either
581c6544064SJohn Baldwin 			 * sending SIGKILL's to all the processes in this
582c6544064SJohn Baldwin 			 * task or is sleeping waiting for all the peers to
583c6544064SJohn Baldwin 			 * exit.  We let p1 complete the fork, but we need
584c6544064SJohn Baldwin 			 * to go ahead and kill the new process p2 since
585c6544064SJohn Baldwin 			 * the task leader may not get a chance to send
586c6544064SJohn Baldwin 			 * SIGKILL to it.  We leave it on the list so that
587c6544064SJohn Baldwin 			 * the task leader will wait for this new process
588c6544064SJohn Baldwin 			 * to commit suicide.
589c6544064SJohn Baldwin 			 */
590c6544064SJohn Baldwin 			PROC_LOCK(p2);
5918451d0ddSKip Macy 			kern_psignal(p2, SIGKILL);
592c6544064SJohn Baldwin 			PROC_UNLOCK(p2);
593293d2d22SRobert Watson 		} else
594293d2d22SRobert Watson 			PROC_UNLOCK(p1->p_leader);
595c6544064SJohn Baldwin 	} else {
596c6544064SJohn Baldwin 		p2->p_peers = NULL;
597c6544064SJohn Baldwin 		p2->p_leader = p2;
598c6544064SJohn Baldwin 	}
599c6544064SJohn Baldwin 
6003fc755c1SJohn Baldwin 	sx_xlock(&proctree_lock);
6013fc755c1SJohn Baldwin 	PGRP_LOCK(p1->p_pgrp);
6023fc755c1SJohn Baldwin 	PROC_LOCK(p2);
6033fc755c1SJohn Baldwin 	PROC_LOCK(p1);
6043fc755c1SJohn Baldwin 
60570e534e7SDavid Greenman 	/*
6069752f794SJohn Baldwin 	 * Preserve some more flags in subprocess.  P_PROFIL has already
607be67169aSBruce Evans 	 * been preserved.
60870e534e7SDavid Greenman 	 */
609a30ec4b9SDavid Xu 	p2->p_flag |= p1->p_flag & P_SUGID;
610a0558fe9SMateusz Guzik 	td2->td_pflags |= (td->td_pflags & (TDP_ALTSTACK | TDP_SIGFASTBLOCK));
611f591779bSSeigo Tanimura 	SESS_LOCK(p1->p_session);
612df8bae1dSRodney W. Grimes 	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
613df8bae1dSRodney W. Grimes 		p2->p_flag |= P_CONTROLT;
614f591779bSSeigo Tanimura 	SESS_UNLOCK(p1->p_session);
615813361c1SMateusz Guzik 	if (fr->fr_flags & RFPPWAIT)
616df8bae1dSRodney W. Grimes 		p2->p_flag |= P_PPWAIT;
617be67169aSBruce Evans 
6185cded904SOlivier Houchard 	p2->p_pgrp = p1->p_pgrp;
619b75356e1SJeffrey Hsu 	LIST_INSERT_AFTER(p1, p2, p_pglist);
6202a60b9b9SSeigo Tanimura 	PGRP_UNLOCK(p1->p_pgrp);
621b75356e1SJeffrey Hsu 	LIST_INIT(&p2->p_children);
622dcd43281SKonstantin Belousov 	LIST_INIT(&p2->p_orphans);
623b75356e1SJeffrey Hsu 
624f7e50ea7SKonstantin Belousov 	callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0);
6254f559836SJake Burkholder 
626df8bae1dSRodney W. Grimes 	/*
627df8bae1dSRodney W. Grimes 	 * This begins the section where we must prevent the parent
628cda5aba4SDavid Schultz 	 * from being swapped.
629df8bae1dSRodney W. Grimes 	 */
630cda5aba4SDavid Schultz 	_PHOLD(p1);
63157934cd3SJohn Baldwin 	PROC_UNLOCK(p1);
6320d2afceeSDavid Greenman 
633df8bae1dSRodney W. Grimes 	/*
6343fc755c1SJohn Baldwin 	 * Attach the new process to its parent.
6353fc755c1SJohn Baldwin 	 *
6363fc755c1SJohn Baldwin 	 * If RFNOWAIT is set, the newly created process becomes a child
6373fc755c1SJohn Baldwin 	 * of init.  This effectively disassociates the child from the
6383fc755c1SJohn Baldwin 	 * parent.
6393fc755c1SJohn Baldwin 	 */
640813361c1SMateusz Guzik 	if ((fr->fr_flags & RFNOWAIT) != 0) {
641237623b0SKonstantin Belousov 		pptr = p1->p_reaper;
642237623b0SKonstantin Belousov 		p2->p_reaper = pptr;
643237623b0SKonstantin Belousov 	} else {
644237623b0SKonstantin Belousov 		p2->p_reaper = (p1->p_treeflag & P_TREE_REAPER) != 0 ?
645237623b0SKonstantin Belousov 		    p1 : p1->p_reaper;
6463fc755c1SJohn Baldwin 		pptr = p1;
647237623b0SKonstantin Belousov 	}
6483fc755c1SJohn Baldwin 	p2->p_pptr = pptr;
6492c054ce9SMateusz Guzik 	p2->p_oppid = pptr->p_pid;
6503fc755c1SJohn Baldwin 	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
651237623b0SKonstantin Belousov 	LIST_INIT(&p2->p_reaplist);
652237623b0SKonstantin Belousov 	LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling);
65334ebdceaSMateusz Guzik 	if (p2->p_reaper == p1 && p1 != initproc) {
654237623b0SKonstantin Belousov 		p2->p_reapsubtree = p2->p_pid;
65534ebdceaSMateusz Guzik 		proc_id_set_cond(PROC_ID_REAP, p2->p_pid);
65634ebdceaSMateusz Guzik 	}
6573fc755c1SJohn Baldwin 	sx_xunlock(&proctree_lock);
6583fc755c1SJohn Baldwin 
659bb0e8070SJohn Baldwin 	/* Inform accounting that we have forked. */
660bb0e8070SJohn Baldwin 	p2->p_acflag = AFORK;
661bb0e8070SJohn Baldwin 	PROC_UNLOCK(p2);
662bb0e8070SJohn Baldwin 
6637705d4b2SDmitry Chagin #ifdef KTRACE
6647705d4b2SDmitry Chagin 	ktrprocfork(p1, p2);
6657705d4b2SDmitry Chagin #endif
6667705d4b2SDmitry Chagin 
6673fc755c1SJohn Baldwin 	/*
668a2a1c95cSPeter Wemm 	 * Finish creating the child process.  It will return via a different
669a2a1c95cSPeter Wemm 	 * execution path later.  (ie: directly into user mode)
670dabee6feSPeter Wemm 	 */
671813361c1SMateusz Guzik 	vm_forkproc(td, p2, td2, vm2, fr->fr_flags);
672df8bae1dSRodney W. Grimes 
673813361c1SMateusz Guzik 	if (fr->fr_flags == (RFFDG | RFPROC)) {
67483c9dea1SGleb Smirnoff 		VM_CNT_INC(v_forks);
67583c9dea1SGleb Smirnoff 		VM_CNT_ADD(v_forkpages, p2->p_vmspace->vm_dsize +
67694ddc707SAlan Cox 		    p2->p_vmspace->vm_ssize);
677813361c1SMateusz Guzik 	} else if (fr->fr_flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
67883c9dea1SGleb Smirnoff 		VM_CNT_INC(v_vforks);
67983c9dea1SGleb Smirnoff 		VM_CNT_ADD(v_vforkpages, p2->p_vmspace->vm_dsize +
68094ddc707SAlan Cox 		    p2->p_vmspace->vm_ssize);
6815d22597fSHajimu UMEMOTO 	} else if (p1 == &proc0) {
68283c9dea1SGleb Smirnoff 		VM_CNT_INC(v_kthreads);
68383c9dea1SGleb Smirnoff 		VM_CNT_ADD(v_kthreadpages, p2->p_vmspace->vm_dsize +
68494ddc707SAlan Cox 		    p2->p_vmspace->vm_ssize);
6855d22597fSHajimu UMEMOTO 	} else {
68683c9dea1SGleb Smirnoff 		VM_CNT_INC(v_rforks);
68783c9dea1SGleb Smirnoff 		VM_CNT_ADD(v_rforkpages, p2->p_vmspace->vm_dsize +
68894ddc707SAlan Cox 		    p2->p_vmspace->vm_ssize);
6895d22597fSHajimu UMEMOTO 	}
6905d22597fSHajimu UMEMOTO 
691cfb5f768SJonathan Anderson 	/*
692cfb5f768SJonathan Anderson 	 * Associate the process descriptor with the process before anything
693cfb5f768SJonathan Anderson 	 * can happen that might cause that process to need the descriptor.
694cfb5f768SJonathan Anderson 	 * However, don't do this until after fork(2) can no longer fail.
695cfb5f768SJonathan Anderson 	 */
696813361c1SMateusz Guzik 	if (fr->fr_flags & RFPROCDESC)
697813361c1SMateusz Guzik 		procdesc_new(p2, fr->fr_pd_flags);
698cfb5f768SJonathan Anderson 
699df8bae1dSRodney W. Grimes 	/*
700e9189611SPeter Wemm 	 * Both processes are set up, now check if any loadable modules want
701e0d898b4SJulian Elischer 	 * to adjust anything.
702fed06968SJulian Elischer 	 */
7032ca45184SMatt Joras 	EVENTHANDLER_DIRECT_INVOKE(process_fork, p1, p2, fr->fr_flags);
704fed06968SJulian Elischer 
705fed06968SJulian Elischer 	/*
7064c3558aaSJohn Baldwin 	 * Set the child start time and mark the process as being complete.
7074c3558aaSJohn Baldwin 	 */
7088e6fa660SJohn Baldwin 	PROC_LOCK(p2);
7098e6fa660SJohn Baldwin 	PROC_LOCK(p1);
7104c3558aaSJohn Baldwin 	microuptime(&p2->p_stats->p_start);
71111bda9b8SJeff Roberson 	PROC_SLOCK(p2);
7124c3558aaSJohn Baldwin 	p2->p_state = PRS_NORMAL;
71311bda9b8SJeff Roberson 	PROC_SUNLOCK(p2);
7146fa39a73SKonstantin Belousov 
715d3555b6fSRui Paulo #ifdef KDTRACE_HOOKS
716d3555b6fSRui Paulo 	/*
7177159310fSMark Johnston 	 * Tell the DTrace fasttrap provider about the new process so that any
7187159310fSMark Johnston 	 * tracepoints inherited from the parent can be removed. We have to do
7197159310fSMark Johnston 	 * this only after p_state is PRS_NORMAL since the fasttrap module will
7207159310fSMark Johnston 	 * use pfind() later on.
721d3555b6fSRui Paulo 	 */
722813361c1SMateusz Guzik 	if ((fr->fr_flags & RFMEM) == 0 && dtrace_fasttrap_fork)
723d3555b6fSRui Paulo 		dtrace_fasttrap_fork(p1, p2);
724d3555b6fSRui Paulo #endif
725813361c1SMateusz Guzik 	if (fr->fr_flags & RFPPWAIT) {
7261d7ca9bbSKonstantin Belousov 		td->td_pflags |= TDP_RFPPWAIT;
7271d7ca9bbSKonstantin Belousov 		td->td_rfppwait_p = p2;
728fc4f075aSJohn Baldwin 		td->td_dbgflags |= TDB_VFORK;
7291d7ca9bbSKonstantin Belousov 	}
7308e6fa660SJohn Baldwin 	PROC_UNLOCK(p2);
731df8bae1dSRodney W. Grimes 
732df8bae1dSRodney W. Grimes 	/*
733e52327e3SMateusz Guzik 	 * Tell any interested parties about the new process.
734e52327e3SMateusz Guzik 	 */
735e52327e3SMateusz Guzik 	knote_fork(p1->p_klist, p2->p_pid);
736e52327e3SMateusz Guzik 
737e52327e3SMateusz Guzik 	/*
738df8bae1dSRodney W. Grimes 	 * Now can be swapped.
739df8bae1dSRodney W. Grimes 	 */
74057934cd3SJohn Baldwin 	_PRELE(p1);
7417054ee4eSKonstantin Belousov 	PROC_UNLOCK(p1);
742813361c1SMateusz Guzik 	SDT_PROBE3(proc, , , create, p2, p1, fr->fr_flags);
7435d217f17SJohn Birrell 
744813361c1SMateusz Guzik 	if (fr->fr_flags & RFPROCDESC) {
745813361c1SMateusz Guzik 		procdesc_finit(p2->p_procdesc, fp_procdesc);
746813361c1SMateusz Guzik 		fdrop(fp_procdesc, td);
747813361c1SMateusz Guzik 	}
748813361c1SMateusz Guzik 
7496e22bbf6SKonstantin Belousov 	/*
7506e22bbf6SKonstantin Belousov 	 * Speculative check for PTRACE_FORK. PTRACE_FORK is not
7516e22bbf6SKonstantin Belousov 	 * synced with forks in progress so it is OK if we miss it
7526e22bbf6SKonstantin Belousov 	 * if being set atm.
7536e22bbf6SKonstantin Belousov 	 */
7546e22bbf6SKonstantin Belousov 	if ((p1->p_ptevents & PTRACE_FORK) != 0) {
7556e22bbf6SKonstantin Belousov 		sx_xlock(&proctree_lock);
7566e22bbf6SKonstantin Belousov 		PROC_LOCK(p2);
7576e22bbf6SKonstantin Belousov 
7586e22bbf6SKonstantin Belousov 		/*
7596e22bbf6SKonstantin Belousov 		 * p1->p_ptevents & p1->p_pptr are protected by both
7606e22bbf6SKonstantin Belousov 		 * process and proctree locks for modifications,
7616e22bbf6SKonstantin Belousov 		 * so owning proctree_lock allows the race-free read.
7626e22bbf6SKonstantin Belousov 		 */
7636e22bbf6SKonstantin Belousov 		if ((p1->p_ptevents & PTRACE_FORK) != 0) {
7646e22bbf6SKonstantin Belousov 			/*
7656e22bbf6SKonstantin Belousov 			 * Arrange for debugger to receive the fork event.
7666e22bbf6SKonstantin Belousov 			 *
7676e22bbf6SKonstantin Belousov 			 * We can report PL_FLAG_FORKED regardless of
7686e22bbf6SKonstantin Belousov 			 * P_FOLLOWFORK settings, but it does not make a sense
7696e22bbf6SKonstantin Belousov 			 * for runaway child.
7706e22bbf6SKonstantin Belousov 			 */
7716e22bbf6SKonstantin Belousov 			td->td_dbgflags |= TDB_FORK;
7726e22bbf6SKonstantin Belousov 			td->td_dbg_forked = p2->p_pid;
7736e22bbf6SKonstantin Belousov 			td2->td_dbgflags |= TDB_STOPATFORK;
7746e22bbf6SKonstantin Belousov 			proc_set_traced(p2, true);
7756e22bbf6SKonstantin Belousov 			CTR2(KTR_PTRACE,
7766e22bbf6SKonstantin Belousov 			    "do_fork: attaching to new child pid %d: oppid %d",
7776e22bbf6SKonstantin Belousov 			    p2->p_pid, p2->p_oppid);
7782c054ce9SMateusz Guzik 			proc_reparent(p2, p1->p_pptr, false);
7796e22bbf6SKonstantin Belousov 		}
7806e22bbf6SKonstantin Belousov 		PROC_UNLOCK(p2);
7816e22bbf6SKonstantin Belousov 		sx_xunlock(&proctree_lock);
7826e22bbf6SKonstantin Belousov 	}
7836e22bbf6SKonstantin Belousov 
784a5ac8272SMateusz Guzik 	racct_proc_fork_done(p2);
785a5ac8272SMateusz Guzik 
786813361c1SMateusz Guzik 	if ((fr->fr_flags & RFSTOPPED) == 0) {
787a5ac8272SMateusz Guzik 		if (fr->fr_pidp != NULL)
788a5ac8272SMateusz Guzik 			*fr->fr_pidp = p2->p_pid;
789813361c1SMateusz Guzik 		/*
790813361c1SMateusz Guzik 		 * If RFSTOPPED not requested, make child runnable and
791813361c1SMateusz Guzik 		 * add to run queue.
792813361c1SMateusz Guzik 		 */
793813361c1SMateusz Guzik 		thread_lock(td2);
794813361c1SMateusz Guzik 		TD_SET_CAN_RUN(td2);
795813361c1SMateusz Guzik 		sched_add(td2, SRQ_BORING);
796813361c1SMateusz Guzik 	} else {
797813361c1SMateusz Guzik 		*fr->fr_procp = p2;
798813361c1SMateusz Guzik 	}
799afd01097SEdward Tomasz Napierala }
800afd01097SEdward Tomasz Napierala 
801c6d31b83SKonstantin Belousov static void
802c6d31b83SKonstantin Belousov ast_vfork(struct thread *td, int tda __unused)
8037d065d87SMateusz Guzik {
8047d065d87SMateusz Guzik 	struct proc *p, *p2;
8057d065d87SMateusz Guzik 
8067d065d87SMateusz Guzik 	MPASS(td->td_pflags & TDP_RFPPWAIT);
8077d065d87SMateusz Guzik 
8087d065d87SMateusz Guzik 	p = td->td_proc;
8097d065d87SMateusz Guzik 	/*
8107d065d87SMateusz Guzik 	 * Preserve synchronization semantics of vfork.  If
8117d065d87SMateusz Guzik 	 * waiting for child to exec or exit, fork set
8127d065d87SMateusz Guzik 	 * P_PPWAIT on child, and there we sleep on our proc
8137d065d87SMateusz Guzik 	 * (in case of exit).
8147d065d87SMateusz Guzik 	 *
8157d065d87SMateusz Guzik 	 * Do it after the ptracestop() above is finished, to
8167d065d87SMateusz Guzik 	 * not block our debugger until child execs or exits
8177d065d87SMateusz Guzik 	 * to finish vfork wait.
8187d065d87SMateusz Guzik 	 */
8197d065d87SMateusz Guzik 	td->td_pflags &= ~TDP_RFPPWAIT;
8207d065d87SMateusz Guzik 	p2 = td->td_rfppwait_p;
8217d065d87SMateusz Guzik again:
8227d065d87SMateusz Guzik 	PROC_LOCK(p2);
8237d065d87SMateusz Guzik 	while (p2->p_flag & P_PPWAIT) {
8247d065d87SMateusz Guzik 		PROC_LOCK(p);
8257d065d87SMateusz Guzik 		if (thread_suspend_check_needed()) {
8267d065d87SMateusz Guzik 			PROC_UNLOCK(p2);
8277d065d87SMateusz Guzik 			thread_suspend_check(0);
8287d065d87SMateusz Guzik 			PROC_UNLOCK(p);
8297d065d87SMateusz Guzik 			goto again;
8307d065d87SMateusz Guzik 		} else {
8317d065d87SMateusz Guzik 			PROC_UNLOCK(p);
8327d065d87SMateusz Guzik 		}
8337d065d87SMateusz Guzik 		cv_timedwait(&p2->p_pwait, &p2->p_mtx, hz);
8347d065d87SMateusz Guzik 	}
8357d065d87SMateusz Guzik 	PROC_UNLOCK(p2);
8367d065d87SMateusz Guzik 
8377d065d87SMateusz Guzik 	if (td->td_dbgflags & TDB_VFORK) {
8387d065d87SMateusz Guzik 		PROC_LOCK(p);
8397d065d87SMateusz Guzik 		if (p->p_ptevents & PTRACE_VFORK)
8407d065d87SMateusz Guzik 			ptracestop(td, SIGTRAP, NULL);
8417d065d87SMateusz Guzik 		td->td_dbgflags &= ~TDB_VFORK;
8427d065d87SMateusz Guzik 		PROC_UNLOCK(p);
8437d065d87SMateusz Guzik 	}
8447d065d87SMateusz Guzik }
8457d065d87SMateusz Guzik 
846afd01097SEdward Tomasz Napierala int
84733fd9b9aSMateusz Guzik fork1(struct thread *td, struct fork_req *fr)
848afd01097SEdward Tomasz Napierala {
8494b48959fSKonstantin Belousov 	struct proc *p1, *newproc;
850afd01097SEdward Tomasz Napierala 	struct thread *td2;
851afd01097SEdward Tomasz Napierala 	struct vmspace *vm2;
85260cdcb64SMateusz Guzik 	struct ucred *cred;
8534b48959fSKonstantin Belousov 	struct file *fp_procdesc;
8543360b485SKonstantin Belousov 	struct pgrp *pg;
855afd01097SEdward Tomasz Napierala 	vm_ooffset_t mem_charged;
85660cdcb64SMateusz Guzik 	int error, nprocs_new;
857afd01097SEdward Tomasz Napierala 	static int curfail;
858afd01097SEdward Tomasz Napierala 	static struct timeval lastfail;
85933fd9b9aSMateusz Guzik 	int flags, pages;
860232b922cSKonstantin Belousov 	bool killsx_locked, singlethreaded;
86133fd9b9aSMateusz Guzik 
86233fd9b9aSMateusz Guzik 	flags = fr->fr_flags;
86333fd9b9aSMateusz Guzik 	pages = fr->fr_pages;
864afd01097SEdward Tomasz Napierala 
865813361c1SMateusz Guzik 	if ((flags & RFSTOPPED) != 0)
866813361c1SMateusz Guzik 		MPASS(fr->fr_procp != NULL && fr->fr_pidp == NULL);
867813361c1SMateusz Guzik 	else
868813361c1SMateusz Guzik 		MPASS(fr->fr_procp == NULL);
869813361c1SMateusz Guzik 
870f49d8202SKonstantin Belousov 	/* Check for the undefined or unimplemented flags. */
871f49d8202SKonstantin Belousov 	if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0)
872f49d8202SKonstantin Belousov 		return (EINVAL);
873f49d8202SKonstantin Belousov 
874f49d8202SKonstantin Belousov 	/* Signal value requires RFTSIGZMB. */
875f49d8202SKonstantin Belousov 	if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0)
876f49d8202SKonstantin Belousov 		return (EINVAL);
877f49d8202SKonstantin Belousov 
878afd01097SEdward Tomasz Napierala 	/* Can't copy and clear. */
879afd01097SEdward Tomasz Napierala 	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
880afd01097SEdward Tomasz Napierala 		return (EINVAL);
881afd01097SEdward Tomasz Napierala 
882f49d8202SKonstantin Belousov 	/* Check the validity of the signal number. */
883f49d8202SKonstantin Belousov 	if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG)
884f49d8202SKonstantin Belousov 		return (EINVAL);
885f49d8202SKonstantin Belousov 
886cfb5f768SJonathan Anderson 	if ((flags & RFPROCDESC) != 0) {
887cfb5f768SJonathan Anderson 		/* Can't not create a process yet get a process descriptor. */
888cfb5f768SJonathan Anderson 		if ((flags & RFPROC) == 0)
889cfb5f768SJonathan Anderson 			return (EINVAL);
890cfb5f768SJonathan Anderson 
891cfb5f768SJonathan Anderson 		/* Must provide a place to put a procdesc if creating one. */
89233fd9b9aSMateusz Guzik 		if (fr->fr_pd_fd == NULL)
893cfb5f768SJonathan Anderson 			return (EINVAL);
894b3a73448SMariusz Zaborski 
895b3a73448SMariusz Zaborski 		/* Check if we are using supported flags. */
896b3a73448SMariusz Zaborski 		if ((fr->fr_pd_flags & ~PD_ALLOWED_AT_FORK) != 0)
897b3a73448SMariusz Zaborski 			return (EINVAL);
898cfb5f768SJonathan Anderson 	}
899cfb5f768SJonathan Anderson 
900afd01097SEdward Tomasz Napierala 	p1 = td->td_proc;
901afd01097SEdward Tomasz Napierala 
902afd01097SEdward Tomasz Napierala 	/*
903afd01097SEdward Tomasz Napierala 	 * Here we don't create a new process, but we divorce
904afd01097SEdward Tomasz Napierala 	 * certain parts of a process from itself.
905afd01097SEdward Tomasz Napierala 	 */
9063e73ff1eSEdward Tomasz Napierala 	if ((flags & RFPROC) == 0) {
907813361c1SMateusz Guzik 		if (fr->fr_procp != NULL)
90833fd9b9aSMateusz Guzik 			*fr->fr_procp = NULL;
909813361c1SMateusz Guzik 		else if (fr->fr_pidp != NULL)
910813361c1SMateusz Guzik 			*fr->fr_pidp = 0;
9113e73ff1eSEdward Tomasz Napierala 		return (fork_norfproc(td, flags));
9123e73ff1eSEdward Tomasz Napierala 	}
913afd01097SEdward Tomasz Napierala 
9144b48959fSKonstantin Belousov 	fp_procdesc = NULL;
9154b48959fSKonstantin Belousov 	newproc = NULL;
9164b48959fSKonstantin Belousov 	vm2 = NULL;
9173360b485SKonstantin Belousov 	killsx_locked = false;
918232b922cSKonstantin Belousov 	singlethreaded = false;
9194b48959fSKonstantin Belousov 
9204b48959fSKonstantin Belousov 	/*
9214b48959fSKonstantin Belousov 	 * Increment the nprocs resource before allocations occur.
9224b48959fSKonstantin Belousov 	 * Although process entries are dynamically created, we still
9234b48959fSKonstantin Belousov 	 * keep a global limit on the maximum number we will
9244b48959fSKonstantin Belousov 	 * create. There are hard-limits as to the number of processes
9254b48959fSKonstantin Belousov 	 * that can run, established by the KVA and memory usage for
9264b48959fSKonstantin Belousov 	 * the process data.
9274b48959fSKonstantin Belousov 	 *
9284b48959fSKonstantin Belousov 	 * Don't allow a nonprivileged user to use the last ten
9294b48959fSKonstantin Belousov 	 * processes; don't let root exceed the limit.
9304b48959fSKonstantin Belousov 	 */
9314b48959fSKonstantin Belousov 	nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1;
9327d43b5c9SMark Johnston 	if (nprocs_new >= maxproc - 10) {
9337d43b5c9SMark Johnston 		if (priv_check_cred(td->td_ucred, PRIV_MAXPROC) != 0 ||
934be8dd142SKonstantin Belousov 		    nprocs_new >= maxproc) {
9354b48959fSKonstantin Belousov 			error = EAGAIN;
9364b48959fSKonstantin Belousov 			sx_xlock(&allproc_lock);
9374b48959fSKonstantin Belousov 			if (ppsratecheck(&lastfail, &curfail, 1)) {
9387d43b5c9SMark Johnston 				printf("maxproc limit exceeded by uid %u "
9397d43b5c9SMark Johnston 				    "(pid %d); see tuning(7) and "
9407d43b5c9SMark Johnston 				    "login.conf(5)\n",
9414b48959fSKonstantin Belousov 				    td->td_ucred->cr_ruid, p1->p_pid);
9424b48959fSKonstantin Belousov 			}
9434b48959fSKonstantin Belousov 			sx_xunlock(&allproc_lock);
9444b48959fSKonstantin Belousov 			goto fail2;
9454b48959fSKonstantin Belousov 		}
9467d43b5c9SMark Johnston 	}
9474b48959fSKonstantin Belousov 
948cfb5f768SJonathan Anderson 	/*
949232b922cSKonstantin Belousov 	 * If we are possibly multi-threaded, and there is a process
950232b922cSKonstantin Belousov 	 * sending a signal to our group right now, ensure that our
951232b922cSKonstantin Belousov 	 * other threads cannot be chosen for the signal queueing.
952232b922cSKonstantin Belousov 	 * Otherwise, this might delay signal action, and make the new
953232b922cSKonstantin Belousov 	 * child escape the signaling.
9543360b485SKonstantin Belousov 	 */
9553360b485SKonstantin Belousov 	pg = p1->p_pgrp;
956232b922cSKonstantin Belousov 	if (p1->p_numthreads > 1) {
957232b922cSKonstantin Belousov 		if (sx_try_slock(&pg->pg_killsx) != 0) {
958232b922cSKonstantin Belousov 			killsx_locked = true;
959232b922cSKonstantin Belousov 		} else {
960232b922cSKonstantin Belousov 			PROC_LOCK(p1);
961232b922cSKonstantin Belousov 			if (thread_single(p1, SINGLE_BOUNDARY)) {
962232b922cSKonstantin Belousov 				PROC_UNLOCK(p1);
9633360b485SKonstantin Belousov 				error = ERESTART;
9643360b485SKonstantin Belousov 				goto fail2;
965232b922cSKonstantin Belousov 			}
966232b922cSKonstantin Belousov 			PROC_UNLOCK(p1);
967232b922cSKonstantin Belousov 			singlethreaded = true;
968232b922cSKonstantin Belousov 		}
969232b922cSKonstantin Belousov 	}
970232b922cSKonstantin Belousov 
971232b922cSKonstantin Belousov 	/*
972232b922cSKonstantin Belousov 	 * Atomically check for signals and block processes from sending
973232b922cSKonstantin Belousov 	 * a signal to our process group until the child is visible.
974232b922cSKonstantin Belousov 	 */
975232b922cSKonstantin Belousov 	if (!killsx_locked && sx_slock_sig(&pg->pg_killsx) != 0) {
976232b922cSKonstantin Belousov 		error = ERESTART;
977232b922cSKonstantin Belousov 		goto fail2;
978232b922cSKonstantin Belousov 	}
979232b922cSKonstantin Belousov 	if (__predict_false(p1->p_pgrp != pg || sig_intr() != 0)) {
9803360b485SKonstantin Belousov 		/*
9813360b485SKonstantin Belousov 		 * Either the process was moved to other process
9823360b485SKonstantin Belousov 		 * group, or there is pending signal.  sx_slock_sig()
9833360b485SKonstantin Belousov 		 * does not check for signals if not sleeping for the
9843360b485SKonstantin Belousov 		 * lock.
9853360b485SKonstantin Belousov 		 */
9863360b485SKonstantin Belousov 		sx_sunlock(&pg->pg_killsx);
987474708c3SKonstantin Belousov 		killsx_locked = false;
9883360b485SKonstantin Belousov 		error = ERESTART;
9893360b485SKonstantin Belousov 		goto fail2;
9903360b485SKonstantin Belousov 	} else {
9913360b485SKonstantin Belousov 		killsx_locked = true;
9923360b485SKonstantin Belousov 	}
9933360b485SKonstantin Belousov 
9943360b485SKonstantin Belousov 	/*
995cfb5f768SJonathan Anderson 	 * If required, create a process descriptor in the parent first; we
996cfb5f768SJonathan Anderson 	 * will abandon it if something goes wrong. We don't finit() until
997cfb5f768SJonathan Anderson 	 * later.
998cfb5f768SJonathan Anderson 	 */
999cfb5f768SJonathan Anderson 	if (flags & RFPROCDESC) {
1000b3a73448SMariusz Zaborski 		error = procdesc_falloc(td, &fp_procdesc, fr->fr_pd_fd,
1001b3a73448SMariusz Zaborski 		    fr->fr_pd_flags, fr->fr_pd_fcaps);
1002b38520f0SEdward Tomasz Napierala 		if (error != 0)
1003d8f3dc78SKonstantin Belousov 			goto fail2;
1004757a5642SChristian S.J. Peron 		AUDIT_ARG_FD(*fr->fr_pd_fd);
1005cfb5f768SJonathan Anderson 	}
1006cfb5f768SJonathan Anderson 
1007afd01097SEdward Tomasz Napierala 	mem_charged = 0;
1008afd01097SEdward Tomasz Napierala 	if (pages == 0)
1009edc82223SKonstantin Belousov 		pages = kstack_pages;
1010afd01097SEdward Tomasz Napierala 	/* Allocate new proc. */
1011afd01097SEdward Tomasz Napierala 	newproc = uma_zalloc(proc_zone, M_WAITOK);
1012afd01097SEdward Tomasz Napierala 	td2 = FIRST_THREAD_IN_PROC(newproc);
1013afd01097SEdward Tomasz Napierala 	if (td2 == NULL) {
1014afd01097SEdward Tomasz Napierala 		td2 = thread_alloc(pages);
1015afd01097SEdward Tomasz Napierala 		if (td2 == NULL) {
1016afd01097SEdward Tomasz Napierala 			error = ENOMEM;
101712cec311SMateusz Guzik 			goto fail2;
1018afd01097SEdward Tomasz Napierala 		}
1019afd01097SEdward Tomasz Napierala 		proc_linkup(newproc, td2);
1020afd01097SEdward Tomasz Napierala 	} else {
1021*800da341SMark Johnston 		error = thread_recycle(td2, pages);
1022*800da341SMark Johnston 		if (error != 0)
102312cec311SMateusz Guzik 			goto fail2;
1024afd01097SEdward Tomasz Napierala 	}
1025afd01097SEdward Tomasz Napierala 
1026afd01097SEdward Tomasz Napierala 	if ((flags & RFMEM) == 0) {
1027afd01097SEdward Tomasz Napierala 		vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
1028afd01097SEdward Tomasz Napierala 		if (vm2 == NULL) {
1029afd01097SEdward Tomasz Napierala 			error = ENOMEM;
103012cec311SMateusz Guzik 			goto fail2;
1031afd01097SEdward Tomasz Napierala 		}
1032afd01097SEdward Tomasz Napierala 		if (!swap_reserve(mem_charged)) {
1033afd01097SEdward Tomasz Napierala 			/*
1034afd01097SEdward Tomasz Napierala 			 * The swap reservation failed. The accounting
1035afd01097SEdward Tomasz Napierala 			 * from the entries of the copied vm2 will be
1036e3043798SPedro F. Giffuni 			 * subtracted in vmspace_free(), so force the
1037afd01097SEdward Tomasz Napierala 			 * reservation there.
1038afd01097SEdward Tomasz Napierala 			 */
1039afd01097SEdward Tomasz Napierala 			swap_reserve_force(mem_charged);
1040afd01097SEdward Tomasz Napierala 			error = ENOMEM;
104112cec311SMateusz Guzik 			goto fail2;
1042afd01097SEdward Tomasz Napierala 		}
1043afd01097SEdward Tomasz Napierala 	} else
1044afd01097SEdward Tomasz Napierala 		vm2 = NULL;
1045afd01097SEdward Tomasz Napierala 
1046097055e2SEdward Tomasz Napierala 	/*
1047097055e2SEdward Tomasz Napierala 	 * XXX: This is ugly; when we copy resource usage, we need to bump
1048097055e2SEdward Tomasz Napierala 	 *      per-cred resource counters.
1049097055e2SEdward Tomasz Napierala 	 */
105092541c12SOlivier Certner 	newproc->p_ucred = crcowget(td->td_ucred);
1051097055e2SEdward Tomasz Napierala 
1052097055e2SEdward Tomasz Napierala 	/*
1053097055e2SEdward Tomasz Napierala 	 * Initialize resource accounting for the child process.
1054097055e2SEdward Tomasz Napierala 	 */
1055097055e2SEdward Tomasz Napierala 	error = racct_proc_fork(p1, newproc);
1056097055e2SEdward Tomasz Napierala 	if (error != 0) {
1057097055e2SEdward Tomasz Napierala 		error = EAGAIN;
1058097055e2SEdward Tomasz Napierala 		goto fail1;
1059097055e2SEdward Tomasz Napierala 	}
1060097055e2SEdward Tomasz Napierala 
10611dbf9dccSEdward Tomasz Napierala #ifdef MAC
10621dbf9dccSEdward Tomasz Napierala 	mac_proc_init(newproc);
10631dbf9dccSEdward Tomasz Napierala #endif
10649e590ff0SKonstantin Belousov 	newproc->p_klist = knlist_alloc(&newproc->p_mtx);
10651dbf9dccSEdward Tomasz Napierala 	STAILQ_INIT(&newproc->p_ktr);
10661dbf9dccSEdward Tomasz Napierala 
106758c77a9dSEdward Tomasz Napierala 	/*
1068afd01097SEdward Tomasz Napierala 	 * Increment the count of procs running with this uid. Don't allow
1069afd01097SEdward Tomasz Napierala 	 * a nonprivileged user to exceed their current limit.
1070afd01097SEdward Tomasz Napierala 	 */
107160cdcb64SMateusz Guzik 	cred = td->td_ucred;
107260cdcb64SMateusz Guzik 	if (!chgproccnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPROC))) {
107360cdcb64SMateusz Guzik 		if (priv_check_cred(cred, PRIV_PROC_LIMIT) != 0)
107460cdcb64SMateusz Guzik 			goto fail0;
107560cdcb64SMateusz Guzik 		chgproccnt(cred->cr_ruidinfo, 1, 0);
1076afd01097SEdward Tomasz Napierala 	}
1077afd01097SEdward Tomasz Napierala 
107860cdcb64SMateusz Guzik 	do_fork(td, fr, newproc, td2, vm2, fp_procdesc);
1079232b922cSKonstantin Belousov 	error = 0;
1080232b922cSKonstantin Belousov 	goto cleanup;
108160cdcb64SMateusz Guzik fail0:
1082afd01097SEdward Tomasz Napierala 	error = EAGAIN;
10836bea667fSRobert Watson #ifdef MAC
108430d239bcSRobert Watson 	mac_proc_destroy(newproc);
10856bea667fSRobert Watson #endif
10861dbf9dccSEdward Tomasz Napierala 	racct_proc_exit(newproc);
1087ab27d5d8SEdward Tomasz Napierala fail1:
10881724c563SMateusz Guzik 	proc_unset_cred(newproc);
108912cec311SMateusz Guzik fail2:
109069aa768aSKonstantin Belousov 	if (vm2 != NULL)
109169aa768aSKonstantin Belousov 		vmspace_free(vm2);
1092c6544064SJohn Baldwin 	uma_zfree(proc_zone, newproc);
1093de265498SPawel Jakub Dawidek 	if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) {
109433fd9b9aSMateusz Guzik 		fdclose(td, fp_procdesc, *fr->fr_pd_fd);
1095cfb5f768SJonathan Anderson 		fdrop(fp_procdesc, td);
10960a7007b9SPawel Jakub Dawidek 	}
10974b48959fSKonstantin Belousov 	atomic_add_int(&nprocs, -1);
1098232b922cSKonstantin Belousov cleanup:
10993360b485SKonstantin Belousov 	if (killsx_locked)
11003360b485SKonstantin Belousov 		sx_sunlock(&pg->pg_killsx);
1101232b922cSKonstantin Belousov 	if (singlethreaded) {
1102232b922cSKonstantin Belousov 		PROC_LOCK(p1);
1103232b922cSKonstantin Belousov 		thread_single_end(p1, SINGLE_BOUNDARY);
1104232b922cSKonstantin Belousov 		PROC_UNLOCK(p1);
1105232b922cSKonstantin Belousov 	}
1106232b922cSKonstantin Belousov 	if (error != 0)
110784d37a46SJohn Baldwin 		pause("fork", hz / 2);
1108c6544064SJohn Baldwin 	return (error);
1109df8bae1dSRodney W. Grimes }
1110fed06968SJulian Elischer 
1111e0d898b4SJulian Elischer /*
1112a7b124c3SJohn Baldwin  * Handle the return of a child process from fork1().  This function
1113a7b124c3SJohn Baldwin  * is called from the MD fork_trampoline() entry point.
1114a7b124c3SJohn Baldwin  */
1115a7b124c3SJohn Baldwin void
11161d845e86SEdward Tomasz Napierala fork_exit(void (*callout)(void *, struct trapframe *), void *arg,
11171d845e86SEdward Tomasz Napierala     struct trapframe *frame)
1118a7b124c3SJohn Baldwin {
1119696058c3SJulian Elischer 	struct proc *p;
112070fca427SJohn Baldwin 	struct thread *td;
1121fe54587fSJeff Roberson 	struct thread *dtd;
112270fca427SJohn Baldwin 
1123b0f71f1bSMark Johnston 	kmsan_mark(frame, sizeof(*frame), KMSAN_STATE_INITED);
1124b0f71f1bSMark Johnston 
11250047b9a9SBosko Milekic 	td = curthread;
11260047b9a9SBosko Milekic 	p = td->td_proc;
11270047b9a9SBosko Milekic 	KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));
11280047b9a9SBosko Milekic 
11296617724cSJeff Roberson 	CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)",
113093ccd6bfSKonstantin Belousov 	    td, td_get_sched(td), p->p_pid, td->td_name);
11310047b9a9SBosko Milekic 
113211bda9b8SJeff Roberson 	sched_fork_exit(td);
1133fce3b1c3SKonstantin Belousov 
1134a7b124c3SJohn Baldwin 	/*
1135fe54587fSJeff Roberson 	 * Processes normally resume in mi_switch() after being
1136fe54587fSJeff Roberson 	 * cpu_switch()'ed to, but when children start up they arrive here
1137fe54587fSJeff Roberson 	 * instead, so we must do much the same things as mi_switch() would.
1138fe54587fSJeff Roberson 	 */
1139fe54587fSJeff Roberson 	if ((dtd = PCPU_GET(deadthread))) {
1140fe54587fSJeff Roberson 		PCPU_SET(deadthread, NULL);
1141fe54587fSJeff Roberson 		thread_stash(dtd);
1142fe54587fSJeff Roberson 	}
1143fe54587fSJeff Roberson 	thread_unlock(td);
1144fe54587fSJeff Roberson 
1145fe54587fSJeff Roberson 	/*
11465c2cf818SKonstantin Belousov 	 * cpu_fork_kthread_handler intercepts this function call to
1147a7b124c3SJohn Baldwin 	 * have this call a non-return function to stay in kernel mode.
1148a7b124c3SJohn Baldwin 	 * initproc has its own fork handler, but it does return.
1149a7b124c3SJohn Baldwin 	 */
11505813dc03SJohn Baldwin 	KASSERT(callout != NULL, ("NULL callout in fork_exit"));
11518865286bSJohn Baldwin 	callout(arg, frame);
1152a7b124c3SJohn Baldwin 
1153a7b124c3SJohn Baldwin 	/*
1154a7b124c3SJohn Baldwin 	 * Check if a kernel thread misbehaved and returned from its main
1155a7b124c3SJohn Baldwin 	 * function.
1156a7b124c3SJohn Baldwin 	 */
1157db57c70aSKonstantin Belousov 	if (p->p_flag & P_KPROC) {
1158a7b124c3SJohn Baldwin 		printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
1159e01eafefSJulian Elischer 		    td->td_name, p->p_pid);
1160fb1f4582SJohn Baldwin 		kthread_exit();
1161a7b124c3SJohn Baldwin 	}
1162a7b124c3SJohn Baldwin 	mtx_assert(&Giant, MA_NOTOWNED);
1163993182e5SAlexander Leidinger 
1164eac62420SOlivier Certner 	/*
1165eac62420SOlivier Certner 	 * Now going to return to userland.
1166eac62420SOlivier Certner 	 */
1167eac62420SOlivier Certner 
1168e5d81ef1SDmitry Chagin 	if (p->p_sysent->sv_schedtail != NULL)
1169e5d81ef1SDmitry Chagin 		(p->p_sysent->sv_schedtail)(td);
1170eac62420SOlivier Certner 
1171eac62420SOlivier Certner 	userret(td, frame);
1172a7b124c3SJohn Baldwin }
1173a7b124c3SJohn Baldwin 
1174a7b124c3SJohn Baldwin /*
1175a7b124c3SJohn Baldwin  * Simplified back end of syscall(), used when returning from fork()
1176e69ba32fSKonstantin Belousov  * directly into user mode.  This function is passed in to fork_exit()
1177e69ba32fSKonstantin Belousov  * as the first parameter and is called when returning to a new
1178e69ba32fSKonstantin Belousov  * userland process.
1179a7b124c3SJohn Baldwin  */
1180a7b124c3SJohn Baldwin void
11811d845e86SEdward Tomasz Napierala fork_return(struct thread *td, struct trapframe *frame)
1182a7b124c3SJohn Baldwin {
11836e22bbf6SKonstantin Belousov 	struct proc *p;
11846fa39a73SKonstantin Belousov 
11856fa39a73SKonstantin Belousov 	p = td->td_proc;
1186189ac973SJohn Baldwin 	if (td->td_dbgflags & TDB_STOPATFORK) {
11876fa39a73SKonstantin Belousov 		PROC_LOCK(p);
11886e22bbf6SKonstantin Belousov 		if ((p->p_flag & P_TRACED) != 0) {
11896fa39a73SKonstantin Belousov 			/*
11906e22bbf6SKonstantin Belousov 			 * Inform the debugger if one is still present.
11916fa39a73SKonstantin Belousov 			 */
1192b7a25e63SKonstantin Belousov 			td->td_dbgflags |= TDB_CHILD | TDB_SCX | TDB_FSTP;
119382a4538fSEric Badger 			ptracestop(td, SIGSTOP, NULL);
1194189ac973SJohn Baldwin 			td->td_dbgflags &= ~(TDB_CHILD | TDB_SCX);
11956fa39a73SKonstantin Belousov 		} else {
11966fa39a73SKonstantin Belousov 			/*
11976fa39a73SKonstantin Belousov 			 * ... otherwise clear the request.
11986fa39a73SKonstantin Belousov 			 */
11996fa39a73SKonstantin Belousov 			td->td_dbgflags &= ~TDB_STOPATFORK;
12006fa39a73SKonstantin Belousov 		}
12016fa39a73SKonstantin Belousov 		PROC_UNLOCK(p);
1202653738e8SJohn Baldwin 	} else if (p->p_flag & P_TRACED) {
1203189ac973SJohn Baldwin  		/*
1204189ac973SJohn Baldwin 		 * This is the start of a new thread in a traced
1205189ac973SJohn Baldwin 		 * process.  Report a system call exit event.
1206189ac973SJohn Baldwin 		 */
1207189ac973SJohn Baldwin 		PROC_LOCK(p);
1208189ac973SJohn Baldwin 		td->td_dbgflags |= TDB_SCX;
12098d570f64SJohn Baldwin 		if ((p->p_ptevents & PTRACE_SCX) != 0 ||
12105fcfab6eSJohn Baldwin 		    (td->td_dbgflags & TDB_BORN) != 0)
121182a4538fSEric Badger 			ptracestop(td, SIGTRAP, NULL);
12125fcfab6eSJohn Baldwin 		td->td_dbgflags &= ~(TDB_SCX | TDB_BORN);
1213189ac973SJohn Baldwin 		PROC_UNLOCK(p);
12146fa39a73SKonstantin Belousov 	}
1215a7b124c3SJohn Baldwin 
1216cc7b7306SJamie Gritton 	/*
1217cc7b7306SJamie Gritton 	 * If the prison was killed mid-fork, die along with it.
1218cc7b7306SJamie Gritton 	 */
1219cc7b7306SJamie Gritton 	if (!prison_isalive(td->td_ucred->cr_prison))
1220cc7b7306SJamie Gritton 		exit1(td, 0, SIGKILL);
1221cc7b7306SJamie Gritton 
1222a7b124c3SJohn Baldwin #ifdef KTRACE
12230282f875SDmitry Chagin 	if (KTRPOINT(td, KTR_SYSRET))
12240282f875SDmitry Chagin 		ktrsysret(td->td_sa.code, 0, 0);
1225a7b124c3SJohn Baldwin #endif
1226a7b124c3SJohn Baldwin }
1227c6d31b83SKonstantin Belousov 
1228c6d31b83SKonstantin Belousov static void
1229c6d31b83SKonstantin Belousov fork_init(void *arg __unused)
1230c6d31b83SKonstantin Belousov {
1231c6d31b83SKonstantin Belousov 	ast_register(TDA_VFORK, ASTR_ASTF_REQUIRED | ASTR_TDP, TDP_RFPPWAIT,
1232c6d31b83SKonstantin Belousov 	    ast_vfork);
1233c6d31b83SKonstantin Belousov }
1234c6d31b83SKonstantin Belousov SYSINIT(fork, SI_SUB_INTRINSIC, SI_ORDER_ANY, fork_init, NULL);
1235