xref: /freebsd/sys/kern/kern_fork.c (revision 2ad872c5794e4c26fdf6ed219ad3f09ca0d5304a)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
39  * $Id: kern_fork.c,v 1.53 1998/12/19 02:55:33 julian Exp $
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/kernel.h>
49 #include <sys/sysctl.h>
50 #include <sys/malloc.h>
51 #include <sys/proc.h>
52 #include <sys/resourcevar.h>
53 #include <sys/vnode.h>
54 #include <sys/acct.h>
55 #include <sys/ktrace.h>
56 #include <sys/unistd.h>
57 
58 #include <vm/vm.h>
59 #include <sys/lock.h>
60 #include <vm/pmap.h>
61 #include <vm/vm_map.h>
62 #include <vm/vm_extern.h>
63 #include <vm/vm_zone.h>
64 
65 #ifdef COMPAT_LINUX_THREADS
66 #include <machine/frame.h>
67 #include <sys/user.h>
68 #endif /* COMPAT_LINUX_THREADS */
/*
 * fast_vfork: when non-zero, vfork(2) passes RFMEM to fork1() so the
 * child shares the parent's address space instead of getting a COW
 * copy.  Defaults off under SMP because a shared page table directory
 * is rejected there (see the RFMEM check in fork1()).  Run-time
 * tunable via the kern.fast_vfork sysctl.
 */
#ifdef SMP
static int	fast_vfork = 0;	/* Doesn't work on SMP yet. */
#else
static int	fast_vfork = 1;
#endif
SYSCTL_INT(_kern, OID_AUTO, fast_vfork, CTLFLAG_RW, &fast_vfork, 0, "");
75 
/*
 * These are the structures used to create a callout list for things to do
 * when forking a process.  Each element holds one function (of type
 * forklist_fn) that fork1() invokes for every successful fork, in list
 * order.  Entries are added with at_fork() and removed with rm_at_fork().
 */
typedef struct fork_list_element {
	struct fork_list_element *next;	/* singly-linked; NULL terminates */
	forklist_fn function;		/* called as (*function)(p1, p2, flags) */
} *fle_p;

/* Head of the fork callout list; NULL when no callouts are registered. */
static fle_p	fork_list;
86 
#ifndef _SYS_SYSPROTO_H_
/* Placeholder argument struct for the fork(2) syscall stub; fork takes
 * no user arguments, so only a dummy member is present. */
struct fork_args {
	int     dummy;
};
#endif
92 
93 /* ARGSUSED */
94 int
95 fork(p, uap)
96 	struct proc *p;
97 	struct fork_args *uap;
98 {
99 
100 	return (fork1(p, RFFDG | RFPROC));
101 }
102 
103 /* ARGSUSED */
104 int
105 vfork(p, uap)
106 	struct proc *p;
107 	struct vfork_args *uap;
108 {
109 
110 	return (fork1(p, RFFDG | RFPROC | RFPPWAIT | (fast_vfork ? RFMEM : 0)));
111 }
112 
113 /* ARGSUSED */
114 int
115 rfork(p, uap)
116 	struct proc *p;
117 	struct rfork_args *uap;
118 {
119 
120 	return (fork1(p, uap->flags));
121 }
122 
123 
/* Current number of processes in the system (starts at 1 for proc0). */
int	nprocs = 1;		/* process 0 */
/* Last pid handed out; fork1() advances this when allocating a new pid. */
static int nextpid = 0;
126 
/*
 * fork1 --
 *	Common back end for fork(2), vfork(2) and rfork(2).  `flags' is
 *	an rfork(2)-style RF* flag word.  With RFPROC clear, no child is
 *	created; instead the requested resources (address space, file
 *	descriptor table) are unshared in p1 itself.  With RFPROC set, a
 *	new process is created, linked into the proc lists, and made
 *	runnable.
 *
 *	Returns 0 on success (child pid placed in p1->p_retval[0]),
 *	or EINVAL / EOPNOTSUPP / EAGAIN on failure.
 */
int
fork1(p1, flags)
	register struct proc *p1;
	int flags;
{
	register struct proc *p2, *pptr;
	register uid_t uid;
	struct proc *newproc;
	int count;
	static int pidchecked = 0;
	fle_p ep ;

	/* Snapshot the fork callout list head; walked after vm_fork(). */
	ep = fork_list;

	/* RFFDG (copy fd table) and RFCFDG (fresh fd table) are mutually
	 * exclusive. */
	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
		return (EINVAL);

#ifdef SMP
	/*
	 * FATAL now, we cannot have the same PTD on both cpus, the PTD
	 * needs to move out of PTmap and be per-process, even for shared
	 * page table processes.  Unfortunately, this means either removing
	 * PTD[] as a fixed virtual address, or move it to the per-cpu map
	 * area for SMP mode.  Both cases require separate management of
	 * the per-process-even-if-PTmap-is-shared PTD.
	 */
	if (flags & RFMEM) {
		printf("shared address space fork attempted: pid: %d\n",
		    p1->p_pid);
		return (EOPNOTSUPP);
	}
#endif

	/*
	 * Here we don't create a new process, but we divorce
	 * certain parts of a process from itself.
	 */
	if ((flags & RFPROC) == 0) {

		/*
		 * Divorce the memory, if it is shared, essentially
		 * this changes shared memory amongst threads, into
		 * COW locally.
		 */
		if ((flags & RFMEM) == 0) {
			if (p1->p_vmspace->vm_refcnt > 1) {
				vmspace_unshare(p1);
			}
		}

		/*
		 * Close all file descriptors.
		 */
		if (flags & RFCFDG) {
			struct filedesc *fdtmp;
			fdtmp = fdinit(p1);
			fdfree(p1);
			p1->p_fd = fdtmp;
		}

		/*
		 * Unshare file descriptors (from parent.)
		 */
		if (flags & RFFDG) {
			if (p1->p_fd->fd_refcnt > 1) {
				struct filedesc *newfd;
				newfd = fdcopy(p1);
				fdfree(p1);
				p1->p_fd = newfd;
			}
		}
		/* Unsharing only -- no child was created. */
		return (0);
	}

	/*
	 * Although process entries are dynamically created, we still keep
	 * a global limit on the maximum number we will create.  Don't allow
	 * a nonprivileged user to use the last process; don't let root
	 * exceed the limit. The variable nprocs is the current number of
	 * processes, maxproc is the limit.
	 */
	uid = p1->p_cred->p_ruid;
	if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) {
		tablefull("proc");
		return (EAGAIN);
	}
	/*
	 * Increment the nprocs resource before blocking can occur.  There
	 * are hard-limits as to the number of processes that can run.
	 */
	nprocs++;

	/*
	 * Increment the count of procs running with this uid. Don't allow
	 * a nonprivileged user to exceed their current limit.
	 */
	count = chgproccnt(uid, 1);
	if (uid != 0 && count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur) {
		(void)chgproccnt(uid, -1);
		/*
		 * Back out the process count
		 */
		nprocs--;
		return (EAGAIN);
	}

	/* Allocate new proc. */
	newproc = zalloc(proc_zone);

/*
 * Setup linkage for kernel based threading
 */
	if((flags & RFTHREAD) != 0) {
		/* Join the parent's peer ring and inherit its leader. */
		newproc->p_peers = p1->p_peers;
		p1->p_peers = newproc;
		newproc->p_leader = p1->p_leader;
	} else {
		newproc->p_peers = 0;
		newproc->p_leader = newproc;
	}

	newproc->p_wakeup = 0;

	/*
	 * Find an unused process ID.  We remember a range of unused IDs
	 * ready to use (from nextpid+1 through pidchecked-1).
	 */
	nextpid++;
retry:
	/*
	 * If the process ID prototype has wrapped around,
	 * restart somewhat above 0, as the low-numbered procs
	 * tend to include daemons that don't exit.
	 */
	if (nextpid >= PID_MAX) {
		nextpid = 100;
		pidchecked = 0;
	}
	if (nextpid >= pidchecked) {
		int doingzomb = 0;

		pidchecked = PID_MAX;
		/*
		 * Scan the active and zombie procs to check whether this pid
		 * is in use.  Remember the lowest pid that's greater
		 * than nextpid, so we can avoid checking for a while.
		 * A pid collides if it matches any pid, pgid or session id.
		 */
		p2 = allproc.lh_first;
again:
		for (; p2 != 0; p2 = p2->p_list.le_next) {
			while (p2->p_pid == nextpid ||
			    p2->p_pgrp->pg_id == nextpid ||
			    p2->p_session->s_sid == nextpid) {
				nextpid++;
				if (nextpid >= pidchecked)
					goto retry;
			}
			if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
				pidchecked = p2->p_pid;
			if (p2->p_pgrp->pg_id > nextpid &&
			    pidchecked > p2->p_pgrp->pg_id)
				pidchecked = p2->p_pgrp->pg_id;
			if (p2->p_session->s_sid > nextpid &&
			    pidchecked > p2->p_session->s_sid)
				pidchecked = p2->p_session->s_sid;
		}
		if (!doingzomb) {
			/* Repeat the scan over the zombie list. */
			doingzomb = 1;
			p2 = zombproc.lh_first;
			goto again;
		}
	}

	p2 = newproc;
	p2->p_stat = SIDL;			/* protect against others */
	p2->p_pid = nextpid;
	LIST_INSERT_HEAD(&allproc, p2, p_list);
	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);

	/*
	 * Make a proc table entry for the new process.
	 * Start by zeroing the section of proc that is zero-initialized,
	 * then copy the section that is copied directly from the parent.
	 */
	bzero(&p2->p_startzero,
	    (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
	bcopy(&p1->p_startcopy, &p2->p_startcopy,
	    (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));

	p2->p_aioinfo = NULL;

	/*
	 * Duplicate sub-structures as needed.
	 * Increase reference counts on shared objects.
	 * The p_stats and p_sigacts substructs are set in vm_fork.
	 */
	p2->p_flag = P_INMEM;
	if (p1->p_flag & P_PROFIL)
		startprofclock(p2);
	MALLOC(p2->p_cred, struct pcred *, sizeof(struct pcred),
	    M_SUBPROC, M_WAITOK);
	bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred));
	p2->p_cred->p_refcnt = 1;
	crhold(p1->p_ucred);

#ifdef COMPAT_LINUX_THREADS
	if (flags & RFSIGSHARE) {
		/* Share the parent's signal state rather than copying it. */
		p2->p_procsig = p1->p_procsig;
		p2->p_procsig->ps_refcnt++;
		if (p1->p_sigacts == &p1->p_addr->u_sigacts) {
			struct sigacts *newsigacts;
			int s;

			if (p2->p_procsig->ps_refcnt != 2)
				printf ("PID:%d Creating shared sigacts with procsig->ps_refcnt %d\n",
					p2->p_pid, p2->p_procsig->ps_refcnt);
			/* Create the shared sigacts structure */
			MALLOC (newsigacts, struct sigacts *, sizeof (struct sigacts),
				M_SUBPROC, M_WAITOK);
			s = splhigh();
			/* Set p_sigacts to the new shared structure.  Note that this
			 * is updating p1->p_sigacts at the same time, since p_sigacts
			 * is just a pointer to the shared p_procsig->ps_sigacts.
			 */
			p2->p_sigacts  = newsigacts;
			/* Copy in the values from the u area */
			*p2->p_sigacts = p1->p_addr->u_sigacts;
			splx (s);
		}
	} else {
		MALLOC (p2->p_procsig, struct procsig *, sizeof(struct procsig),
			M_SUBPROC, M_WAITOK);
		bcopy(&p1->p_procsig->ps_begincopy, &p2->p_procsig->ps_begincopy,
			(unsigned)&p1->p_procsig->ps_endcopy -
			(unsigned)&p1->p_procsig->ps_begincopy);
		p2->p_procsig->ps_refcnt = 1;
		/* Note that we fill in the values of sigacts in vm_fork */
		p2->p_sigacts = NULL;
	}
	if (flags & RFLINUXTHPN) {
	        p2->p_sigparent = SIGUSR1;
	}
#endif /* COMPAT_LINUX_THREADS */
	/* bump references to the text vnode (for procfs) */
	p2->p_textvp = p1->p_textvp;
	if (p2->p_textvp)
		VREF(p2->p_textvp);

	/* New, copied, or shared descriptor table per the RF* flags. */
	if (flags & RFCFDG)
		p2->p_fd = fdinit(p1);
	else if (flags & RFFDG)
		p2->p_fd = fdcopy(p1);
	else
		p2->p_fd = fdshare(p1);

	/*
	 * If p_limit is still copy-on-write, bump refcnt,
	 * otherwise get a copy that won't be modified.
	 * (If PL_SHAREMOD is clear, the structure is shared
	 * copy-on-write.)
	 */
	if (p1->p_limit->p_lflags & PL_SHAREMOD)
		p2->p_limit = limcopy(p1->p_limit);
	else {
		p2->p_limit = p1->p_limit;
		p2->p_limit->p_refcnt++;
	}

	/*
	 * Preserve some more flags in subprocess.  P_PROFIL has already
	 * been preserved.
	 */
	p2->p_flag |= p1->p_flag & P_SUGID;
	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
		p2->p_flag |= P_CONTROLT;
	if (flags & RFPPWAIT)
		p2->p_flag |= P_PPWAIT;

	LIST_INSERT_AFTER(p1, p2, p_pglist);

	/*
	 * Attach the new process to its parent.
	 *
	 * If RFNOWAIT is set, the newly created process becomes a child
	 * of init.  This effectively disassociates the child from the
	 * parent.
	 */
	if (flags & RFNOWAIT)
		pptr = initproc;
	else
		pptr = p1;
	p2->p_pptr = pptr;
	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
	LIST_INIT(&p2->p_children);

#ifdef KTRACE
	/*
	 * Copy traceflag and tracefile if enabled.
	 * If not inherited, these were zeroed above.
	 */
	if (p1->p_traceflag&KTRFAC_INHERIT) {
		p2->p_traceflag = p1->p_traceflag;
		if ((p2->p_tracep = p1->p_tracep) != NULL)
			VREF(p2->p_tracep);
	}
#endif

	/*
	 * set priority of child to be that of parent
	 */
	p2->p_estcpu = p1->p_estcpu;

	/*
	 * This begins the section where we must prevent the parent
	 * from being swapped.
	 */
	p1->p_flag |= P_NOSWAP;

	/*
	 * Finish creating the child process.  It will return via a different
	 * execution path later.  (ie: directly into user mode)
	 */
	vm_fork(p1, p2, flags);

	/*
	 * Both processes are set up, now check if any LKMs want
	 * to adjust anything.
	 *   What if they have an error? XXX
	 */
	while (ep) {
		(*ep->function)(p1, p2, flags);
		ep = ep->next;
	}

	/*
	 * Make child runnable and add to run queue.
	 */
	microtime(&(p2->p_stats->p_start));
	p2->p_acflag = AFORK;
	(void) splhigh();
	p2->p_stat = SRUN;
	setrunqueue(p2);
	(void) spl0();

	/*
	 * Now can be swapped.
	 */
	p1->p_flag &= ~P_NOSWAP;

	/*
	 * Preserve synchronization semantics of vfork.  If waiting for
	 * child to exec or exit, set P_PPWAIT on child, and sleep on our
	 * proc (in case of exit).
	 */
	while (p2->p_flag & P_PPWAIT)
		tsleep(p1, PWAIT, "ppwait", 0);

	/*
	 * Return child pid to parent process,
	 * marking us as parent via p1->p_retval[1].
	 */
	p1->p_retval[0] = p2->p_pid;
	p1->p_retval[1] = 0;
	return (0);
}
492 
493 /*
 * The next two functions are general routines to handle adding/deleting
495  * items on the fork callout list.
496  *
497  * at_fork():
498  * Take the arguments given and put them onto the fork callout list,
499  * However first make sure that it's not already there.
500  * Returns 0 on success or a standard error number.
501  */
502 int
503 at_fork(function)
504 	forklist_fn function;
505 {
506 	fle_p ep;
507 
508 	/* let the programmer know if he's been stupid */
509 	if (rm_at_fork(function))
510 		printf("fork callout entry already present\n");
511 	ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT);
512 	if (ep == NULL)
513 		return (ENOMEM);
514 	ep->next = fork_list;
515 	ep->function = function;
516 	fork_list = ep;
517 	return (0);
518 }
519 
520 /*
 * Scan the fork callout list for the given items and remove them.
522  * Returns the number of items removed.
523  * Theoretically this value can only be 0 or 1.
524  */
525 int
526 rm_at_fork(function)
527 	forklist_fn function;
528 {
529 	fle_p *epp, ep;
530 	int count;
531 
532 	count= 0;
533 	epp = &fork_list;
534 	ep = *epp;
535 	while (ep) {
536 		if (ep->function == function) {
537 			*epp = ep->next;
538 			free(ep, M_TEMP);
539 			count++;
540 		} else {
541 			epp = &ep->next;
542 		}
543 		ep = *epp;
544 	}
545 	return (count);
546 }
547