xref: /freebsd/sys/kern/kern_fork.c (revision a1a4f1a0d87b594d3f17a97dc0127eec1417e6f6)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
39  * $FreeBSD$
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/kernel.h>
49 #include <sys/sysctl.h>
50 #include <sys/malloc.h>
51 #include <sys/proc.h>
52 #include <sys/resourcevar.h>
53 #include <sys/vnode.h>
54 #include <sys/acct.h>
55 #include <sys/ktrace.h>
56 #include <sys/unistd.h>
57 #include <sys/jail.h>
58 
59 #include <vm/vm.h>
60 #include <sys/lock.h>
61 #include <vm/pmap.h>
62 #include <vm/vm_map.h>
63 #include <vm/vm_extern.h>
64 #include <vm/vm_zone.h>
65 
66 #include <machine/frame.h>
67 #include <sys/user.h>
68 
/*
 * Integer knob, initially 1, exported read-write under the _kern sysctl
 * node with the name fast_vfork.
 * NOTE(review): the variable is static and nothing in the visible part of
 * this file reads it, so it currently looks reachable only through the
 * sysctl -- confirm whether it is still needed.
 */
static int	fast_vfork = 1;
SYSCTL_INT(_kern, OID_AUTO, fast_vfork, CTLFLAG_RW, &fast_vfork, 0, "");
71 
/*
 * These are the structures used to create a callout list for things to do
 * when forking a process.
 *
 * The list is singly linked through `next' and headed by fork_list.
 * Entries are added by at_fork(), removed by rm_at_fork(), and each
 * entry's `function' is invoked by fork1() once the child is set up.
 */
typedef struct fork_list_element {
	struct fork_list_element *next;	/* following entry, NULL at the end */
	forklist_fn function;		/* callout invoked from fork1() */
} *fle_p;

static fle_p	fork_list;		/* head of the fork callout list */
82 
/*
 * Argument structure for the fork() system call handler below.  It is
 * only declared here when _SYS_SYSPROTO_H_ is not defined (presumably
 * the include guard of <sys/sysproto.h>, which would otherwise supply
 * it).  fork() never looks at its arguments, so the single member is
 * just a placeholder.
 */
#ifndef _SYS_SYSPROTO_H_
struct fork_args {
	int     dummy;
};
#endif
88 
89 /* ARGSUSED */
90 int
91 fork(p, uap)
92 	struct proc *p;
93 	struct fork_args *uap;
94 {
95 	int error;
96 	struct proc *p2;
97 
98 	error = fork1(p, RFFDG | RFPROC, &p2);
99 	if (error == 0) {
100 		p->p_retval[0] = p2->p_pid;
101 		p->p_retval[1] = 0;
102 	}
103 	return error;
104 }
105 
106 /* ARGSUSED */
107 int
108 vfork(p, uap)
109 	struct proc *p;
110 	struct vfork_args *uap;
111 {
112 	int error;
113 	struct proc *p2;
114 
115 	error = fork1(p, RFFDG | RFPROC | RFPPWAIT | RFMEM, &p2);
116 	if (error == 0) {
117 		p->p_retval[0] = p2->p_pid;
118 		p->p_retval[1] = 0;
119 	}
120 	return error;
121 }
122 
123 int
124 rfork(p, uap)
125 	struct proc *p;
126 	struct rfork_args *uap;
127 {
128 	int error;
129 	struct proc *p2;
130 
131 	error = fork1(p, uap->flags, &p2);
132 	if (error == 0) {
133 		p->p_retval[0] = p2 ? p2->p_pid : 0;
134 		p->p_retval[1] = 0;
135 	}
136 	return error;
137 }
138 
139 
/*
 * Bookkeeping used by fork1():
 *   nprocs  - current number of processes; fork1() compares it against
 *             maxproc and bumps it for every process it creates.
 *   nextpid - pid most recently handed out; fork1() advances it to find
 *             the next unused process ID.
 */
int	nprocs = 1;		/* process 0 */
static int nextpid = 0;
142 
143 int
144 fork1(p1, flags, procp)
145 	struct proc *p1;
146 	int flags;
147 	struct proc **procp;
148 {
149 	struct proc *p2, *pptr;
150 	uid_t uid;
151 	struct proc *newproc;
152 	int count;
153 	static int pidchecked = 0;
154 	fle_p ep ;
155 
156 	ep = fork_list;
157 
158 	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
159 		return (EINVAL);
160 
161 	/*
162 	 * Here we don't create a new process, but we divorce
163 	 * certain parts of a process from itself.
164 	 */
165 	if ((flags & RFPROC) == 0) {
166 
167 		/*
168 		 * Divorce the memory, if it is shared, essentially
169 		 * this changes shared memory amongst threads, into
170 		 * COW locally.
171 		 */
172 		if ((flags & RFMEM) == 0) {
173 			if (p1->p_vmspace->vm_refcnt > 1) {
174 				vmspace_unshare(p1);
175 			}
176 		}
177 
178 		/*
179 		 * Close all file descriptors.
180 		 */
181 		if (flags & RFCFDG) {
182 			struct filedesc *fdtmp;
183 			fdtmp = fdinit(p1);
184 			fdfree(p1);
185 			p1->p_fd = fdtmp;
186 		}
187 
188 		/*
189 		 * Unshare file descriptors (from parent.)
190 		 */
191 		if (flags & RFFDG) {
192 			if (p1->p_fd->fd_refcnt > 1) {
193 				struct filedesc *newfd;
194 				newfd = fdcopy(p1);
195 				fdfree(p1);
196 				p1->p_fd = newfd;
197 			}
198 		}
199 		*procp = NULL;
200 		return (0);
201 	}
202 
203 	/*
204 	 * Although process entries are dynamically created, we still keep
205 	 * a global limit on the maximum number we will create.  Don't allow
206 	 * a nonprivileged user to use the last process; don't let root
207 	 * exceed the limit. The variable nprocs is the current number of
208 	 * processes, maxproc is the limit.
209 	 */
210 	uid = p1->p_cred->p_ruid;
211 	if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) {
212 		tablefull("proc");
213 		return (EAGAIN);
214 	}
215 	/*
216 	 * Increment the nprocs resource before blocking can occur.  There
217 	 * are hard-limits as to the number of processes that can run.
218 	 */
219 	nprocs++;
220 
221 	/*
222 	 * Increment the count of procs running with this uid. Don't allow
223 	 * a nonprivileged user to exceed their current limit.
224 	 */
225 	count = chgproccnt(uid, 1);
226 	if (uid != 0 && count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur) {
227 		(void)chgproccnt(uid, -1);
228 		/*
229 		 * Back out the process count
230 		 */
231 		nprocs--;
232 		return (EAGAIN);
233 	}
234 
235 	/* Allocate new proc. */
236 	newproc = zalloc(proc_zone);
237 
238 	/*
239 	 * Setup linkage for kernel based threading
240 	 */
241 	if((flags & RFTHREAD) != 0) {
242 		newproc->p_peers = p1->p_peers;
243 		p1->p_peers = newproc;
244 		newproc->p_leader = p1->p_leader;
245 	} else {
246 		newproc->p_peers = 0;
247 		newproc->p_leader = newproc;
248 	}
249 
250 	newproc->p_wakeup = 0;
251 
252 	newproc->p_vmspace = NULL;
253 
254 	/*
255 	 * Find an unused process ID.  We remember a range of unused IDs
256 	 * ready to use (from nextpid+1 through pidchecked-1).
257 	 */
258 	nextpid++;
259 retry:
260 	/*
261 	 * If the process ID prototype has wrapped around,
262 	 * restart somewhat above 0, as the low-numbered procs
263 	 * tend to include daemons that don't exit.
264 	 */
265 	if (nextpid >= PID_MAX) {
266 		nextpid = 100;
267 		pidchecked = 0;
268 	}
269 	if (nextpid >= pidchecked) {
270 		int doingzomb = 0;
271 
272 		pidchecked = PID_MAX;
273 		/*
274 		 * Scan the active and zombie procs to check whether this pid
275 		 * is in use.  Remember the lowest pid that's greater
276 		 * than nextpid, so we can avoid checking for a while.
277 		 */
278 		p2 = allproc.lh_first;
279 again:
280 		for (; p2 != 0; p2 = p2->p_list.le_next) {
281 			while (p2->p_pid == nextpid ||
282 			    p2->p_pgrp->pg_id == nextpid ||
283 			    p2->p_session->s_sid == nextpid) {
284 				nextpid++;
285 				if (nextpid >= pidchecked)
286 					goto retry;
287 			}
288 			if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
289 				pidchecked = p2->p_pid;
290 			if (p2->p_pgrp->pg_id > nextpid &&
291 			    pidchecked > p2->p_pgrp->pg_id)
292 				pidchecked = p2->p_pgrp->pg_id;
293 			if (p2->p_session->s_sid > nextpid &&
294 			    pidchecked > p2->p_session->s_sid)
295 				pidchecked = p2->p_session->s_sid;
296 		}
297 		if (!doingzomb) {
298 			doingzomb = 1;
299 			p2 = zombproc.lh_first;
300 			goto again;
301 		}
302 	}
303 
304 	p2 = newproc;
305 	p2->p_stat = SIDL;			/* protect against others */
306 	p2->p_pid = nextpid;
307 	LIST_INSERT_HEAD(&allproc, p2, p_list);
308 	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
309 
310 	/*
311 	 * Make a proc table entry for the new process.
312 	 * Start by zeroing the section of proc that is zero-initialized,
313 	 * then copy the section that is copied directly from the parent.
314 	 */
315 	bzero(&p2->p_startzero,
316 	    (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
317 	bcopy(&p1->p_startcopy, &p2->p_startcopy,
318 	    (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));
319 
320 	p2->p_aioinfo = NULL;
321 
322 	/*
323 	 * Duplicate sub-structures as needed.
324 	 * Increase reference counts on shared objects.
325 	 * The p_stats and p_sigacts substructs are set in vm_fork.
326 	 */
327 	p2->p_flag = P_INMEM;
328 	if (p1->p_flag & P_PROFIL)
329 		startprofclock(p2);
330 	MALLOC(p2->p_cred, struct pcred *, sizeof(struct pcred),
331 	    M_SUBPROC, M_WAITOK);
332 	bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred));
333 	p2->p_cred->p_refcnt = 1;
334 	crhold(p1->p_ucred);
335 
336 	if (p2->p_prison) {
337 		p2->p_prison->pr_ref++;
338 		p2->p_flag |= P_JAILED;
339 	}
340 
341 	if (flags & RFSIGSHARE) {
342 		p2->p_procsig = p1->p_procsig;
343 		p2->p_procsig->ps_refcnt++;
344 		if (p1->p_sigacts == &p1->p_addr->u_sigacts) {
345 			struct sigacts *newsigacts;
346 			int s;
347 
348 			/* Create the shared sigacts structure */
349 			MALLOC(newsigacts, struct sigacts *,
350 			    sizeof(struct sigacts), M_SUBPROC, M_WAITOK);
351 			s = splhigh();
352 			/*
353 			 * Set p_sigacts to the new shared structure.
354 			 * Note that this is updating p1->p_sigacts at the
355 			 * same time, since p_sigacts is just a pointer to
356 			 * the shared p_procsig->ps_sigacts.
357 			 */
358 			p2->p_sigacts  = newsigacts;
359 			bcopy(&p1->p_addr->u_sigacts, p2->p_sigacts,
360 			    sizeof(*p2->p_sigacts));
361 			*p2->p_sigacts = p1->p_addr->u_sigacts;
362 			splx(s);
363 		}
364 	} else {
365 		MALLOC(p2->p_procsig, struct procsig *, sizeof(struct procsig),
366 		    M_SUBPROC, M_WAITOK);
367 		bcopy(p1->p_procsig, p2->p_procsig, sizeof(*p2->p_procsig));
368 		p2->p_procsig->ps_refcnt = 1;
369 		p2->p_sigacts = NULL;	/* finished in vm_fork() */
370 	}
371 	if (flags & RFLINUXTHPN)
372 	        p2->p_sigparent = SIGUSR1;
373 	else
374 	        p2->p_sigparent = SIGCHLD;
375 
376 	/* bump references to the text vnode (for procfs) */
377 	p2->p_textvp = p1->p_textvp;
378 	if (p2->p_textvp)
379 		VREF(p2->p_textvp);
380 
381 	if (flags & RFCFDG)
382 		p2->p_fd = fdinit(p1);
383 	else if (flags & RFFDG)
384 		p2->p_fd = fdcopy(p1);
385 	else
386 		p2->p_fd = fdshare(p1);
387 
388 	/*
389 	 * If p_limit is still copy-on-write, bump refcnt,
390 	 * otherwise get a copy that won't be modified.
391 	 * (If PL_SHAREMOD is clear, the structure is shared
392 	 * copy-on-write.)
393 	 */
394 	if (p1->p_limit->p_lflags & PL_SHAREMOD)
395 		p2->p_limit = limcopy(p1->p_limit);
396 	else {
397 		p2->p_limit = p1->p_limit;
398 		p2->p_limit->p_refcnt++;
399 	}
400 
401 	/*
402 	 * Preserve some more flags in subprocess.  P_PROFIL has already
403 	 * been preserved.
404 	 */
405 	p2->p_flag |= p1->p_flag & P_SUGID;
406 	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
407 		p2->p_flag |= P_CONTROLT;
408 	if (flags & RFPPWAIT)
409 		p2->p_flag |= P_PPWAIT;
410 
411 	LIST_INSERT_AFTER(p1, p2, p_pglist);
412 
413 	/*
414 	 * Attach the new process to its parent.
415 	 *
416 	 * If RFNOWAIT is set, the newly created process becomes a child
417 	 * of init.  This effectively disassociates the child from the
418 	 * parent.
419 	 */
420 	if (flags & RFNOWAIT)
421 		pptr = initproc;
422 	else
423 		pptr = p1;
424 	p2->p_pptr = pptr;
425 	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
426 	LIST_INIT(&p2->p_children);
427 
428 #ifdef KTRACE
429 	/*
430 	 * Copy traceflag and tracefile if enabled.
431 	 * If not inherited, these were zeroed above.
432 	 */
433 	if (p1->p_traceflag&KTRFAC_INHERIT) {
434 		p2->p_traceflag = p1->p_traceflag;
435 		if ((p2->p_tracep = p1->p_tracep) != NULL)
436 			VREF(p2->p_tracep);
437 	}
438 #endif
439 
440 	/*
441 	 * set priority of child to be that of parent
442 	 */
443 	p2->p_estcpu = p1->p_estcpu;
444 
445 	/*
446 	 * This begins the section where we must prevent the parent
447 	 * from being swapped.
448 	 */
449 	PHOLD(p1);
450 
451 	/*
452 	 * Finish creating the child process.  It will return via a different
453 	 * execution path later.  (ie: directly into user mode)
454 	 */
455 	vm_fork(p1, p2, flags);
456 
457 	/*
458 	 * Both processes are set up, now check if any loadable modules want
459 	 * to adjust anything.
460 	 *   What if they have an error? XXX
461 	 */
462 	while (ep) {
463 		(*ep->function)(p1, p2, flags);
464 		ep = ep->next;
465 	}
466 
467 	/*
468 	 * Make child runnable and add to run queue.
469 	 */
470 	microtime(&(p2->p_stats->p_start));
471 	p2->p_acflag = AFORK;
472 	(void) splhigh();
473 	p2->p_stat = SRUN;
474 	setrunqueue(p2);
475 	(void) spl0();
476 
477 	/*
478 	 * Now can be swapped.
479 	 */
480 	PRELE(p1);
481 
482 	/*
483 	 * Preserve synchronization semantics of vfork.  If waiting for
484 	 * child to exec or exit, set P_PPWAIT on child, and sleep on our
485 	 * proc (in case of exit).
486 	 */
487 	while (p2->p_flag & P_PPWAIT)
488 		tsleep(p1, PWAIT, "ppwait", 0);
489 
490 	/*
491 	 * Return child proc pointer to parent.
492 	 */
493 	*procp = p2;
494 	return (0);
495 }
496 
497 /*
498  * The next two functionms are general routines to handle adding/deleting
499  * items on the fork callout list.
500  *
501  * at_fork():
502  * Take the arguments given and put them onto the fork callout list,
503  * However first make sure that it's not already there.
504  * Returns 0 on success or a standard error number.
505  */
506 int
507 at_fork(function)
508 	forklist_fn function;
509 {
510 	fle_p ep;
511 
512 	/* let the programmer know if he's been stupid */
513 	if (rm_at_fork(function))
514 		printf("fork callout entry already present\n");
515 	ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT);
516 	if (ep == NULL)
517 		return (ENOMEM);
518 	ep->next = fork_list;
519 	ep->function = function;
520 	fork_list = ep;
521 	return (0);
522 }
523 
524 /*
525  * Scan the exit callout list for the given items and remove them.
526  * Returns the number of items removed.
527  * Theoretically this value can only be 0 or 1.
528  */
529 int
530 rm_at_fork(function)
531 	forklist_fn function;
532 {
533 	fle_p *epp, ep;
534 	int count;
535 
536 	count= 0;
537 	epp = &fork_list;
538 	ep = *epp;
539 	while (ep) {
540 		if (ep->function == function) {
541 			*epp = ep->next;
542 			free(ep, M_TEMP);
543 			count++;
544 		} else {
545 			epp = &ep->next;
546 		}
547 		ep = *epp;
548 	}
549 	return (count);
550 }
551