xref: /freebsd/sys/kern/kern_fork.c (revision 6e8394b8baa7d5d9153ab90de6824bcd19b3b4e1)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
39  * $Id: kern_fork.c,v 1.60 1999/04/28 01:04:27 luoqi Exp $
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/kernel.h>
49 #include <sys/sysctl.h>
50 #include <sys/malloc.h>
51 #include <sys/proc.h>
52 #include <sys/resourcevar.h>
53 #include <sys/vnode.h>
54 #include <sys/acct.h>
55 #include <sys/ktrace.h>
56 #include <sys/unistd.h>
57 #include <sys/jail.h>
58 
59 #include <vm/vm.h>
60 #include <sys/lock.h>
61 #include <vm/pmap.h>
62 #include <vm/vm_map.h>
63 #include <vm/vm_extern.h>
64 #include <vm/vm_zone.h>
65 
66 #include <machine/frame.h>
67 #include <sys/user.h>
68 
69 static int	fast_vfork = 1;
70 SYSCTL_INT(_kern, OID_AUTO, fast_vfork, CTLFLAG_RW, &fast_vfork, 0, "");
71 
72 /*
73  * These are the stuctures used to create a callout list for things to do
74  * when forking a process
75  */
76 typedef struct fork_list_element {
77 	struct fork_list_element *next;
78 	forklist_fn function;
79 } *fle_p;
80 
81 static fle_p	fork_list;
82 
83 #ifndef _SYS_SYSPROTO_H_
84 struct fork_args {
85 	int     dummy;
86 };
87 #endif
88 
89 /* ARGSUSED */
90 int
91 fork(p, uap)
92 	struct proc *p;
93 	struct fork_args *uap;
94 {
95 
96 	return (fork1(p, RFFDG | RFPROC));
97 }
98 
99 /* ARGSUSED */
100 int
101 vfork(p, uap)
102 	struct proc *p;
103 	struct vfork_args *uap;
104 {
105 
106 	return (fork1(p, RFFDG | RFPROC | RFPPWAIT | (fast_vfork ? RFMEM : 0)));
107 }
108 
109 /* ARGSUSED */
110 int
111 rfork(p, uap)
112 	struct proc *p;
113 	struct rfork_args *uap;
114 {
115 
116 	return (fork1(p, uap->flags));
117 }
118 
119 
120 int	nprocs = 1;		/* process 0 */
121 static int nextpid = 0;
122 
123 int
124 fork1(p1, flags)
125 	register struct proc *p1;
126 	int flags;
127 {
128 	register struct proc *p2, *pptr;
129 	register uid_t uid;
130 	struct proc *newproc;
131 	int count;
132 	static int pidchecked = 0;
133 	fle_p ep ;
134 
135 	ep = fork_list;
136 
137 	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
138 		return (EINVAL);
139 
140 	/*
141 	 * Here we don't create a new process, but we divorce
142 	 * certain parts of a process from itself.
143 	 */
144 	if ((flags & RFPROC) == 0) {
145 
146 		/*
147 		 * Divorce the memory, if it is shared, essentially
148 		 * this changes shared memory amongst threads, into
149 		 * COW locally.
150 		 */
151 		if ((flags & RFMEM) == 0) {
152 			if (p1->p_vmspace->vm_refcnt > 1) {
153 				vmspace_unshare(p1);
154 			}
155 		}
156 
157 		/*
158 		 * Close all file descriptors.
159 		 */
160 		if (flags & RFCFDG) {
161 			struct filedesc *fdtmp;
162 			fdtmp = fdinit(p1);
163 			fdfree(p1);
164 			p1->p_fd = fdtmp;
165 		}
166 
167 		/*
168 		 * Unshare file descriptors (from parent.)
169 		 */
170 		if (flags & RFFDG) {
171 			if (p1->p_fd->fd_refcnt > 1) {
172 				struct filedesc *newfd;
173 				newfd = fdcopy(p1);
174 				fdfree(p1);
175 				p1->p_fd = newfd;
176 			}
177 		}
178 		return (0);
179 	}
180 
181 	/*
182 	 * Although process entries are dynamically created, we still keep
183 	 * a global limit on the maximum number we will create.  Don't allow
184 	 * a nonprivileged user to use the last process; don't let root
185 	 * exceed the limit. The variable nprocs is the current number of
186 	 * processes, maxproc is the limit.
187 	 */
188 	uid = p1->p_cred->p_ruid;
189 	if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) {
190 		tablefull("proc");
191 		return (EAGAIN);
192 	}
193 	/*
194 	 * Increment the nprocs resource before blocking can occur.  There
195 	 * are hard-limits as to the number of processes that can run.
196 	 */
197 	nprocs++;
198 
199 	/*
200 	 * Increment the count of procs running with this uid. Don't allow
201 	 * a nonprivileged user to exceed their current limit.
202 	 */
203 	count = chgproccnt(uid, 1);
204 	if (uid != 0 && count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur) {
205 		(void)chgproccnt(uid, -1);
206 		/*
207 		 * Back out the process count
208 		 */
209 		nprocs--;
210 		return (EAGAIN);
211 	}
212 
213 	/* Allocate new proc. */
214 	newproc = zalloc(proc_zone);
215 
216 /*
217  * Setup linkage for kernel based threading
218  */
219 	if((flags & RFTHREAD) != 0) {
220 		newproc->p_peers = p1->p_peers;
221 		p1->p_peers = newproc;
222 		newproc->p_leader = p1->p_leader;
223 	} else {
224 		newproc->p_peers = 0;
225 		newproc->p_leader = newproc;
226 	}
227 
228 	newproc->p_wakeup = 0;
229 
230 	/*
231 	 * Find an unused process ID.  We remember a range of unused IDs
232 	 * ready to use (from nextpid+1 through pidchecked-1).
233 	 */
234 	nextpid++;
235 retry:
236 	/*
237 	 * If the process ID prototype has wrapped around,
238 	 * restart somewhat above 0, as the low-numbered procs
239 	 * tend to include daemons that don't exit.
240 	 */
241 	if (nextpid >= PID_MAX) {
242 		nextpid = 100;
243 		pidchecked = 0;
244 	}
245 	if (nextpid >= pidchecked) {
246 		int doingzomb = 0;
247 
248 		pidchecked = PID_MAX;
249 		/*
250 		 * Scan the active and zombie procs to check whether this pid
251 		 * is in use.  Remember the lowest pid that's greater
252 		 * than nextpid, so we can avoid checking for a while.
253 		 */
254 		p2 = allproc.lh_first;
255 again:
256 		for (; p2 != 0; p2 = p2->p_list.le_next) {
257 			while (p2->p_pid == nextpid ||
258 			    p2->p_pgrp->pg_id == nextpid ||
259 			    p2->p_session->s_sid == nextpid) {
260 				nextpid++;
261 				if (nextpid >= pidchecked)
262 					goto retry;
263 			}
264 			if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
265 				pidchecked = p2->p_pid;
266 			if (p2->p_pgrp->pg_id > nextpid &&
267 			    pidchecked > p2->p_pgrp->pg_id)
268 				pidchecked = p2->p_pgrp->pg_id;
269 			if (p2->p_session->s_sid > nextpid &&
270 			    pidchecked > p2->p_session->s_sid)
271 				pidchecked = p2->p_session->s_sid;
272 		}
273 		if (!doingzomb) {
274 			doingzomb = 1;
275 			p2 = zombproc.lh_first;
276 			goto again;
277 		}
278 	}
279 
280 	p2 = newproc;
281 	p2->p_stat = SIDL;			/* protect against others */
282 	p2->p_pid = nextpid;
283 	LIST_INSERT_HEAD(&allproc, p2, p_list);
284 	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
285 
286 	/*
287 	 * Make a proc table entry for the new process.
288 	 * Start by zeroing the section of proc that is zero-initialized,
289 	 * then copy the section that is copied directly from the parent.
290 	 */
291 	bzero(&p2->p_startzero,
292 	    (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
293 	bcopy(&p1->p_startcopy, &p2->p_startcopy,
294 	    (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));
295 
296 	p2->p_aioinfo = NULL;
297 
298 	/*
299 	 * Duplicate sub-structures as needed.
300 	 * Increase reference counts on shared objects.
301 	 * The p_stats and p_sigacts substructs are set in vm_fork.
302 	 */
303 	p2->p_flag = P_INMEM;
304 	if (p1->p_flag & P_PROFIL)
305 		startprofclock(p2);
306 	MALLOC(p2->p_cred, struct pcred *, sizeof(struct pcred),
307 	    M_SUBPROC, M_WAITOK);
308 	bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred));
309 	p2->p_cred->p_refcnt = 1;
310 	crhold(p1->p_ucred);
311 
312 	if (p2->p_prison) {
313 		p2->p_prison->pr_ref++;
314 		p2->p_flag |= P_JAILED;
315 	}
316 
317 	if (flags & RFSIGSHARE) {
318 		p2->p_procsig = p1->p_procsig;
319 		p2->p_procsig->ps_refcnt++;
320 		if (p1->p_sigacts == &p1->p_addr->u_sigacts) {
321 			struct sigacts *newsigacts;
322 			int s;
323 
324 			if (p2->p_procsig->ps_refcnt != 2)
325 				printf ("PID:%d Creating shared sigacts with procsig->ps_refcnt %d\n",
326 					p2->p_pid, p2->p_procsig->ps_refcnt);
327 			/* Create the shared sigacts structure */
328 			MALLOC (newsigacts, struct sigacts *, sizeof (struct sigacts),
329 				M_SUBPROC, M_WAITOK);
330 			s = splhigh();
331 			/* Set p_sigacts to the new shared structure.  Note that this
332 			 * is updating p1->p_sigacts at the same time, since p_sigacts
333 			 * is just a pointer to the shared p_procsig->ps_sigacts.
334 			 */
335 			p2->p_sigacts  = newsigacts;
336 			/* Copy in the values from the u area */
337 			*p2->p_sigacts = p1->p_addr->u_sigacts;
338 			splx (s);
339 		}
340 	} else {
341 		MALLOC (p2->p_procsig, struct procsig *, sizeof(struct procsig),
342 			M_SUBPROC, M_WAITOK);
343 		bcopy(&p1->p_procsig->ps_begincopy, &p2->p_procsig->ps_begincopy,
344 			(char *)&p1->p_procsig->ps_endcopy -
345 			(char *)&p1->p_procsig->ps_begincopy);
346 		p2->p_procsig->ps_refcnt = 1;
347 		/* Note that we fill in the values of sigacts in vm_fork */
348 		p2->p_sigacts = NULL;
349 	}
350 	if (flags & RFLINUXTHPN)
351 	        p2->p_sigparent = SIGUSR1;
352 	else
353 	        p2->p_sigparent = SIGCHLD;
354 
355 	/* bump references to the text vnode (for procfs) */
356 	p2->p_textvp = p1->p_textvp;
357 	if (p2->p_textvp)
358 		VREF(p2->p_textvp);
359 
360 	if (flags & RFCFDG)
361 		p2->p_fd = fdinit(p1);
362 	else if (flags & RFFDG)
363 		p2->p_fd = fdcopy(p1);
364 	else
365 		p2->p_fd = fdshare(p1);
366 
367 	/*
368 	 * If p_limit is still copy-on-write, bump refcnt,
369 	 * otherwise get a copy that won't be modified.
370 	 * (If PL_SHAREMOD is clear, the structure is shared
371 	 * copy-on-write.)
372 	 */
373 	if (p1->p_limit->p_lflags & PL_SHAREMOD)
374 		p2->p_limit = limcopy(p1->p_limit);
375 	else {
376 		p2->p_limit = p1->p_limit;
377 		p2->p_limit->p_refcnt++;
378 	}
379 
380 	/*
381 	 * Preserve some more flags in subprocess.  P_PROFIL has already
382 	 * been preserved.
383 	 */
384 	p2->p_flag |= p1->p_flag & P_SUGID;
385 	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
386 		p2->p_flag |= P_CONTROLT;
387 	if (flags & RFPPWAIT)
388 		p2->p_flag |= P_PPWAIT;
389 
390 	LIST_INSERT_AFTER(p1, p2, p_pglist);
391 
392 	/*
393 	 * Attach the new process to its parent.
394 	 *
395 	 * If RFNOWAIT is set, the newly created process becomes a child
396 	 * of init.  This effectively disassociates the child from the
397 	 * parent.
398 	 */
399 	if (flags & RFNOWAIT)
400 		pptr = initproc;
401 	else
402 		pptr = p1;
403 	p2->p_pptr = pptr;
404 	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
405 	LIST_INIT(&p2->p_children);
406 
407 #ifdef KTRACE
408 	/*
409 	 * Copy traceflag and tracefile if enabled.
410 	 * If not inherited, these were zeroed above.
411 	 */
412 	if (p1->p_traceflag&KTRFAC_INHERIT) {
413 		p2->p_traceflag = p1->p_traceflag;
414 		if ((p2->p_tracep = p1->p_tracep) != NULL)
415 			VREF(p2->p_tracep);
416 	}
417 #endif
418 
419 	/*
420 	 * set priority of child to be that of parent
421 	 */
422 	p2->p_estcpu = p1->p_estcpu;
423 
424 	/*
425 	 * This begins the section where we must prevent the parent
426 	 * from being swapped.
427 	 */
428 	PHOLD(p1);
429 
430 	/*
431 	 * Finish creating the child process.  It will return via a different
432 	 * execution path later.  (ie: directly into user mode)
433 	 */
434 	vm_fork(p1, p2, flags);
435 
436 	/*
437 	 * Both processes are set up, now check if any loadable modules want
438 	 * to adjust anything.
439 	 *   What if they have an error? XXX
440 	 */
441 	while (ep) {
442 		(*ep->function)(p1, p2, flags);
443 		ep = ep->next;
444 	}
445 
446 	/*
447 	 * Make child runnable and add to run queue.
448 	 */
449 	microtime(&(p2->p_stats->p_start));
450 	p2->p_acflag = AFORK;
451 	(void) splhigh();
452 	p2->p_stat = SRUN;
453 	setrunqueue(p2);
454 	(void) spl0();
455 
456 	/*
457 	 * Now can be swapped.
458 	 */
459 	PRELE(p1);
460 
461 	/*
462 	 * Preserve synchronization semantics of vfork.  If waiting for
463 	 * child to exec or exit, set P_PPWAIT on child, and sleep on our
464 	 * proc (in case of exit).
465 	 */
466 	while (p2->p_flag & P_PPWAIT)
467 		tsleep(p1, PWAIT, "ppwait", 0);
468 
469 	/*
470 	 * Return child pid to parent process,
471 	 * marking us as parent via p1->p_retval[1].
472 	 */
473 	p1->p_retval[0] = p2->p_pid;
474 	p1->p_retval[1] = 0;
475 	return (0);
476 }
477 
478 /*
479  * The next two functionms are general routines to handle adding/deleting
480  * items on the fork callout list.
481  *
482  * at_fork():
483  * Take the arguments given and put them onto the fork callout list,
484  * However first make sure that it's not already there.
485  * Returns 0 on success or a standard error number.
486  */
487 int
488 at_fork(function)
489 	forklist_fn function;
490 {
491 	fle_p ep;
492 
493 	/* let the programmer know if he's been stupid */
494 	if (rm_at_fork(function))
495 		printf("fork callout entry already present\n");
496 	ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT);
497 	if (ep == NULL)
498 		return (ENOMEM);
499 	ep->next = fork_list;
500 	ep->function = function;
501 	fork_list = ep;
502 	return (0);
503 }
504 
505 /*
506  * Scan the exit callout list for the given items and remove them.
507  * Returns the number of items removed.
508  * Theoretically this value can only be 0 or 1.
509  */
510 int
511 rm_at_fork(function)
512 	forklist_fn function;
513 {
514 	fle_p *epp, ep;
515 	int count;
516 
517 	count= 0;
518 	epp = &fork_list;
519 	ep = *epp;
520 	while (ep) {
521 		if (ep->function == function) {
522 			*epp = ep->next;
523 			free(ep, M_TEMP);
524 			count++;
525 		} else {
526 			epp = &ep->next;
527 		}
528 		ep = *epp;
529 	}
530 	return (count);
531 }
532