xref: /freebsd/sys/kern/kern_fork.c (revision 4cf49a43559ed9fdad601bdcccd2c55963008675)
/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
 * $FreeBSD$
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/acct.h>
#include <sys/ktrace.h>
#include <sys/unistd.h>
#include <sys/jail.h>

#include <vm/vm.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <sys/user.h>

static int	fast_vfork = 1;
SYSCTL_INT(_kern, OID_AUTO, fast_vfork, CTLFLAG_RW, &fast_vfork, 0, "");

/*
 * These are the structures used to create a callout list for things to do
 * when forking a process.
 */
typedef struct fork_list_element {
	struct fork_list_element *next;
	forklist_fn function;
} *fle_p;

static fle_p	fork_list;

#ifndef _SYS_SYSPROTO_H_
struct fork_args {
	int     dummy;
};
#endif

/* ARGSUSED */
int
fork(p, uap)
	struct proc *p;
	struct fork_args *uap;
{
	int error;
	struct proc *p2;

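	/*
	 * A plain fork(2): create a new process (RFPROC) and give it a
	 * copy of the parent's file descriptor table (RFFDG).
	 */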
	error = fork1(p, RFFDG | RFPROC, &p2);
	if (error == 0) {
		p->p_retval[0] = p2->p_pid;
		p->p_retval[1] = 0;
	}
	return error;
}

/* ARGSUSED */
int
vfork(p, uap)
	struct proc *p;
	struct vfork_args *uap;
{
	int error;
	struct proc *p2;

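	/*
	 * vfork(2) additionally shares the address space with the child
	 * (RFMEM) and makes the parent wait until the child execs or
	 * exits (RFPPWAIT).
	 */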
	error = fork1(p, RFFDG | RFPROC | RFPPWAIT | RFMEM, &p2);
	if (error == 0) {
		p->p_retval[0] = p2->p_pid;
		p->p_retval[1] = 0;
	}
	return error;
}

int
rfork(p, uap)
	struct proc *p;
	struct rfork_args *uap;
{
	int error;
	struct proc *p2;

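	/*
	 * rfork(2) passes the caller's RF* flags through unchanged.  If
	 * RFPROC is not set no child is created, hence the NULL check on
	 * p2 below.
	 */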
	error = fork1(p, uap->flags, &p2);
	if (error == 0) {
		p->p_retval[0] = p2 ? p2->p_pid : 0;
		p->p_retval[1] = 0;
	}
	return error;
}


int	nprocs = 1;		/* process 0 */
static int nextpid = 0;

int
fork1(p1, flags, procp)
	struct proc *p1;
	int flags;
	struct proc **procp;
{
	struct proc *p2, *pptr;
	uid_t uid;
	struct proc *newproc;
	int count;
	static int pidchecked = 0;
	fle_p ep;

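	/*
	 * Grab the head of the fork callout list up front; the registered
	 * functions are invoked near the end of fork1(), once both
	 * processes have been set up.
	 */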
	ep = fork_list;

	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
		return (EINVAL);

	/*
	 * Here we don't create a new process, but we divorce
	 * certain parts of a process from itself.
	 */
	if ((flags & RFPROC) == 0) {

		/*
		 * Divorce the memory, if it is shared; essentially this
		 * turns memory shared amongst threads into memory that is
		 * copy-on-write locally.
		 */
		if ((flags & RFMEM) == 0) {
			if (p1->p_vmspace->vm_refcnt > 1) {
				vmspace_unshare(p1);
			}
		}

		/*
		 * Close all file descriptors.
		 */
		if (flags & RFCFDG) {
			struct filedesc *fdtmp;
			fdtmp = fdinit(p1);
			fdfree(p1);
			p1->p_fd = fdtmp;
		}

		/*
		 * Unshare file descriptors (from parent).
		 */
		if (flags & RFFDG) {
			if (p1->p_fd->fd_refcnt > 1) {
				struct filedesc *newfd;
				newfd = fdcopy(p1);
				fdfree(p1);
				p1->p_fd = newfd;
			}
		}
		*procp = NULL;
		return (0);
	}


	/*
	 * Although process entries are dynamically created, we still keep
	 * a global limit on the maximum number we will create.  Don't allow
	 * a nonprivileged user to use the last process; don't let root
	 * exceed the limit. The variable nprocs is the current number of
	 * processes, maxproc is the limit.
	 */
	uid = p1->p_cred->p_ruid;
	if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) {
		tablefull("proc");
		return (EAGAIN);
	}
	/*
	 * Increment the nprocs resource before blocking can occur.  There
	 * are hard-limits as to the number of processes that can run.
	 */
	nprocs++;

	/*
	 * Increment the count of procs running with this uid. Don't allow
	 * a nonprivileged user to exceed their current limit.
	 */
	count = chgproccnt(uid, 1);
	if (uid != 0 && count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur) {
		(void)chgproccnt(uid, -1);
		/*
		 * Back out the process count
		 */
		nprocs--;
		return (EAGAIN);
	}

	/* Allocate new proc. */
	newproc = zalloc(proc_zone);

	/*
	 * Set up linkage for kernel-based threading.
	 */
	if ((flags & RFTHREAD) != 0) {
		newproc->p_peers = p1->p_peers;
		p1->p_peers = newproc;
		newproc->p_leader = p1->p_leader;
	} else {
		newproc->p_peers = 0;
		newproc->p_leader = newproc;
	}

	newproc->p_wakeup = 0;

	newproc->p_vmspace = NULL;


	/*
	 * Find an unused process ID.  We remember a range of unused IDs
	 * ready to use (from nextpid+1 through pidchecked-1).
	 */
	nextpid++;
retry:
	/*
	 * If the process ID prototype has wrapped around,
	 * restart somewhat above 0, as the low-numbered procs
	 * tend to include daemons that don't exit.
	 */
	if (nextpid >= PID_MAX) {
		nextpid = 100;
		pidchecked = 0;
	}
	if (nextpid >= pidchecked) {
		int doingzomb = 0;

		pidchecked = PID_MAX;
		/*
		 * Scan the active and zombie procs to check whether this pid
		 * is in use.  Remember the lowest pid that's greater
		 * than nextpid, so we can avoid checking for a while.
		 */
		p2 = allproc.lh_first;
again:
		for (; p2 != 0; p2 = p2->p_list.le_next) {
			while (p2->p_pid == nextpid ||
			    p2->p_pgrp->pg_id == nextpid ||
			    p2->p_session->s_sid == nextpid) {
				nextpid++;
				if (nextpid >= pidchecked)
					goto retry;
			}
			if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
				pidchecked = p2->p_pid;
			if (p2->p_pgrp->pg_id > nextpid &&
			    pidchecked > p2->p_pgrp->pg_id)
				pidchecked = p2->p_pgrp->pg_id;
			if (p2->p_session->s_sid > nextpid &&
			    pidchecked > p2->p_session->s_sid)
				pidchecked = p2->p_session->s_sid;
		}
		if (!doingzomb) {
			doingzomb = 1;
			p2 = zombproc.lh_first;
			goto again;
		}
	}

	p2 = newproc;
	p2->p_stat = SIDL;			/* protect against others */
	p2->p_pid = nextpid;
	LIST_INSERT_HEAD(&allproc, p2, p_list);
	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
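	/*
	 * The pid is now visible on allproc and in the pid hash; the
	 * SIDL state set above protects the half-constructed process
	 * while the rest of it is filled in below.
	 */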

	/*
	 * Make a proc table entry for the new process.
	 * Start by zeroing the section of proc that is zero-initialized,
	 * then copy the section that is copied directly from the parent.
	 */
	bzero(&p2->p_startzero,
	    (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
	bcopy(&p1->p_startcopy, &p2->p_startcopy,
	    (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));

	p2->p_aioinfo = NULL;

	/*
	 * Duplicate sub-structures as needed.
	 * Increase reference counts on shared objects.
	 * The p_stats and p_sigacts substructs are set in vm_fork.
	 */
	p2->p_flag = P_INMEM;
	if (p1->p_flag & P_PROFIL)
		startprofclock(p2);
	MALLOC(p2->p_cred, struct pcred *, sizeof(struct pcred),
	    M_SUBPROC, M_WAITOK);
	bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred));
	p2->p_cred->p_refcnt = 1;
	crhold(p1->p_ucred);
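	/*
	 * The bcopy above duplicated the pcred, including its pointer to
	 * the ucred, so the ucred itself remains shared; take an extra
	 * reference on it for the child.
	 */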

	if (p2->p_prison) {
		p2->p_prison->pr_ref++;
		p2->p_flag |= P_JAILED;
	}

	if (flags & RFSIGSHARE) {
		p2->p_procsig = p1->p_procsig;
		p2->p_procsig->ps_refcnt++;
		if (p1->p_sigacts == &p1->p_addr->u_sigacts) {
			struct sigacts *newsigacts;
			int s;

			/* Create the shared sigacts structure */
			MALLOC(newsigacts, struct sigacts *,
			    sizeof(struct sigacts), M_SUBPROC, M_WAITOK);
			s = splhigh();
			/*
			 * Set p_sigacts to the new shared structure.
			 * Note that this is updating p1->p_sigacts at the
			 * same time, since p_sigacts is just a pointer to
			 * the shared p_procsig->ps_sigacts.
			 */
			p2->p_sigacts = newsigacts;
			*p2->p_sigacts = p1->p_addr->u_sigacts;
			splx(s);
		}
	} else {
		MALLOC(p2->p_procsig, struct procsig *, sizeof(struct procsig),
		    M_SUBPROC, M_WAITOK);
		bcopy(p1->p_procsig, p2->p_procsig, sizeof(*p2->p_procsig));
		p2->p_procsig->ps_refcnt = 1;
		p2->p_sigacts = NULL;	/* finished in vm_fork() */
	}
	if (flags & RFLINUXTHPN)
		p2->p_sigparent = SIGUSR1;
	else
		p2->p_sigparent = SIGCHLD;

	/* bump references to the text vnode (for procfs) */
	p2->p_textvp = p1->p_textvp;
	if (p2->p_textvp)
		VREF(p2->p_textvp);

	if (flags & RFCFDG)
		p2->p_fd = fdinit(p1);
	else if (flags & RFFDG)
		p2->p_fd = fdcopy(p1);
	else
		p2->p_fd = fdshare(p1);

	/*
	 * If p_limit is still copy-on-write, bump refcnt,
	 * otherwise get a copy that won't be modified.
	 * (If PL_SHAREMOD is clear, the structure is shared
	 * copy-on-write.)
	 */
	if (p1->p_limit->p_lflags & PL_SHAREMOD)
		p2->p_limit = limcopy(p1->p_limit);
	else {
		p2->p_limit = p1->p_limit;
		p2->p_limit->p_refcnt++;
	}

	/*
	 * Preserve some more flags in subprocess.  P_PROFIL has already
	 * been preserved.
	 */
	p2->p_flag |= p1->p_flag & P_SUGID;
	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
		p2->p_flag |= P_CONTROLT;
	if (flags & RFPPWAIT)
		p2->p_flag |= P_PPWAIT;

	LIST_INSERT_AFTER(p1, p2, p_pglist);

	/*
	 * Attach the new process to its parent.
	 *
	 * If RFNOWAIT is set, the newly created process becomes a child
	 * of init.  This effectively disassociates the child from the
	 * parent.
	 */
	if (flags & RFNOWAIT)
		pptr = initproc;
	else
		pptr = p1;
	p2->p_pptr = pptr;
	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
	LIST_INIT(&p2->p_children);

#ifdef KTRACE
	/*
	 * Copy traceflag and tracefile if enabled.
	 * If not inherited, these were zeroed above.
	 */
	if (p1->p_traceflag & KTRFAC_INHERIT) {
		p2->p_traceflag = p1->p_traceflag;
		if ((p2->p_tracep = p1->p_tracep) != NULL)
			VREF(p2->p_tracep);
	}
#endif

	/*
	 * Set the priority of the child to be that of the parent by
	 * inheriting the parent's scheduling estimate (p_estcpu).
	 */
	p2->p_estcpu = p1->p_estcpu;

	/*
	 * This begins the section where we must prevent the parent
	 * from being swapped.
	 */
	PHOLD(p1);

	/*
	 * Finish creating the child process.  It will return via a different
	 * execution path later (i.e. directly into user mode).
	 */
	vm_fork(p1, p2, flags);

	/*
	 * Both processes are set up, now check if any loadable modules want
	 * to adjust anything.
	 *   What if they have an error? XXX
	 */
	while (ep) {
		(*ep->function)(p1, p2, flags);
		ep = ep->next;
	}

	/*
	 * Make child runnable and add to run queue.
	 */
	microtime(&(p2->p_stats->p_start));
	p2->p_acflag = AFORK;
	(void) splhigh();
	p2->p_stat = SRUN;
	setrunqueue(p2);
	(void) spl0();

	/*
	 * Now can be swapped.
	 */
	PRELE(p1);

	/*
	 * Preserve synchronization semantics of vfork.  If waiting for
	 * child to exec or exit, set P_PPWAIT on child, and sleep on our
	 * proc (in case of exit).
	 */
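	/*
	 * P_PPWAIT was set on the child above when RFPPWAIT was given;
	 * the child clears it and wakes us up when it execs or exits.
	 */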
	while (p2->p_flag & P_PPWAIT)
		tsleep(p1, PWAIT, "ppwait", 0);

	/*
	 * Return child proc pointer to parent.
	 */
	*procp = p2;
	return (0);
}

/*
 * The next two functions are general routines to handle adding/deleting
 * items on the fork callout list.
 *
 * at_fork():
 * Take the arguments given and put them onto the fork callout list;
 * however, first make sure that it's not already there.
 * Returns 0 on success or a standard error number.
 */
int
at_fork(function)
	forklist_fn function;
{
	fle_p ep;

	/* let the programmer know if he's been stupid */
	if (rm_at_fork(function))
		printf("fork callout entry already present\n");
	ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT);
	if (ep == NULL)
		return (ENOMEM);
	ep->next = fork_list;
	ep->function = function;
	fork_list = ep;
	return (0);
}

/*
 * Scan the fork callout list for the given item and remove it.
 * Returns the number of items removed.
 * Theoretically this value can only be 0 or 1.
 */
int
rm_at_fork(function)
	forklist_fn function;
{
	fle_p *epp, ep;
	int count;

	count = 0;
	epp = &fork_list;
	ep = *epp;
	while (ep) {
		if (ep->function == function) {
			*epp = ep->next;
			free(ep, M_TEMP);
			count++;
		} else {
			epp = &ep->next;
		}
		ep = *epp;
	}
	return (count);
}
550