xref: /freebsd/sys/kern/kern_fork.c (revision a8445737e740901f5f2c8d24c12ef7fc8b00134e)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
39  * $Id: kern_fork.c,v 1.50 1997/12/12 04:00:58 dyson Exp $
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/kernel.h>
49 #include <sys/sysctl.h>
50 #include <sys/malloc.h>
51 #include <sys/proc.h>
52 #include <sys/resourcevar.h>
53 #include <sys/vnode.h>
54 #include <sys/acct.h>
55 #include <sys/ktrace.h>
56 #include <sys/unistd.h>
57 
58 #include <vm/vm.h>
59 #include <sys/lock.h>
60 #include <vm/pmap.h>
61 #include <vm/vm_map.h>
62 #include <vm/vm_extern.h>
63 #include <vm/vm_zone.h>
64 
/*
 * fast_vfork: when nonzero, vfork() adds RFMEM to the flags it hands to
 * fork1().  It defaults to 0 when the kernel is built with SMP and to 1
 * otherwise, and is exported read-write under the kern sysctl node.
 */
#ifdef SMP
static int	fast_vfork = 0;	/* Doesn't work on SMP yet. */
#else
static int	fast_vfork = 1;
#endif
SYSCTL_INT(_kern, OID_AUTO, fast_vfork, CTLFLAG_RW, &fast_vfork, 0, "");
71 
/*
 * These are the structures used to create a callout list for things to do
 * when forking a process.
 */
typedef struct fork_list_element {
	struct fork_list_element *next;	/* next entry; NULL ends the list */
	forklist_fn function;		/* invoked by fork1() as (p1, p2, flags) */
} *fle_p;

/*
 * Head of the singly linked fork callout list.  at_fork() pushes new
 * entries at the head, rm_at_fork() unlinks them, and fork1() walks it.
 */
static fle_p	fork_list;
82 
/*
 * Fallback declaration of the fork() argument structure, compiled only
 * when _SYS_SYSPROTO_H_ is not defined.  fork() never reads its uap.
 */
#ifndef _SYS_SYSPROTO_H_
struct fork_args {
	int     dummy;
};
#endif
88 
89 /* ARGSUSED */
90 int
91 fork(p, uap)
92 	struct proc *p;
93 	struct fork_args *uap;
94 {
95 
96 	return (fork1(p, RFFDG | RFPROC));
97 }
98 
99 /* ARGSUSED */
100 int
101 vfork(p, uap)
102 	struct proc *p;
103 	struct vfork_args *uap;
104 {
105 
106 	return (fork1(p, RFFDG | RFPROC | RFPPWAIT | (fast_vfork ? RFMEM : 0)));
107 }
108 
109 /* ARGSUSED */
110 int
111 rfork(p, uap)
112 	struct proc *p;
113 	struct rfork_args *uap;
114 {
115 
116 	return (fork1(p, uap->flags));
117 }
118 
119 
/*
 * nprocs is the current number of processes; fork1() checks it against
 * maxproc and increments it for each process it creates.
 * nextpid is the pid prototype: fork1() advances it to find an unused
 * process ID and assigns the resulting value to the new process.
 */
int	nprocs = 1;		/* process 0 */
static int nextpid = 0;
122 
123 int
124 fork1(p1, flags)
125 	register struct proc *p1;
126 	int flags;
127 {
128 	register struct proc *p2, *pptr;
129 	register uid_t uid;
130 	struct proc *newproc;
131 	int count;
132 	static int pidchecked = 0;
133 	fle_p ep ;
134 
135 	ep = fork_list;
136 
137 	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
138 		return (EINVAL);
139 
140 #ifdef SMP
141 	/*
142 	 * FATAL now, we cannot have the same PTD on both cpus, the PTD
143 	 * needs to move out of PTmap and be per-process, even for shared
144 	 * page table processes.  Unfortunately, this means either removing
145 	 * PTD[] as a fixed virtual address, or move it to the per-cpu map
146 	 * area for SMP mode.  Both cases require seperate management of
147 	 * the per-process-even-if-PTmap-is-shared PTD.
148 	 */
149 	if (flags & RFMEM) {
150 		printf("shared address space fork attempted: pid: %d\n",
151 		    p1->p_pid);
152 		return (EOPNOTSUPP);
153 	}
154 #endif
155 
156 	/*
157 	 * Here we don't create a new process, but we divorce
158 	 * certain parts of a process from itself.
159 	 */
160 	if ((flags & RFPROC) == 0) {
161 
162 		/*
163 		 * Divorce the memory, if it is shared, essentially
164 		 * this changes shared memory amongst threads, into
165 		 * COW locally.
166 		 */
167 		if ((flags & RFMEM) == 0) {
168 			if (p1->p_vmspace->vm_refcnt > 1) {
169 				vmspace_unshare(p1);
170 			}
171 		}
172 
173 		/*
174 		 * Close all file descriptors.
175 		 */
176 		if (flags & RFCFDG) {
177 			struct filedesc *fdtmp;
178 			fdtmp = fdinit(p1);
179 			fdfree(p1);
180 			p1->p_fd = fdtmp;
181 		}
182 
183 		/*
184 		 * Unshare file descriptors (from parent.)
185 		 */
186 		if (flags & RFFDG) {
187 			if (p1->p_fd->fd_refcnt > 1) {
188 				struct filedesc *newfd;
189 				newfd = fdcopy(p1);
190 				fdfree(p1);
191 				p1->p_fd = newfd;
192 			}
193 		}
194 		return (0);
195 	}
196 
197 	/*
198 	 * Although process entries are dynamically created, we still keep
199 	 * a global limit on the maximum number we will create.  Don't allow
200 	 * a nonprivileged user to use the last process; don't let root
201 	 * exceed the limit. The variable nprocs is the current number of
202 	 * processes, maxproc is the limit.
203 	 */
204 	uid = p1->p_cred->p_ruid;
205 	if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) {
206 		tablefull("proc");
207 		return (EAGAIN);
208 	}
209 	/*
210 	 * Increment the nprocs resource before blocking can occur.  There
211 	 * are hard-limits as to the number of processes that can run.
212 	 */
213 	nprocs++;
214 
215 	/*
216 	 * Increment the count of procs running with this uid. Don't allow
217 	 * a nonprivileged user to exceed their current limit.
218 	 */
219 	count = chgproccnt(uid, 1);
220 	if (uid != 0 && count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur) {
221 		(void)chgproccnt(uid, -1);
222 		/*
223 		 * Back out the process count
224 		 */
225 		nprocs--;
226 		return (EAGAIN);
227 	}
228 
229 	/* Allocate new proc. */
230 	newproc = zalloc(proc_zone);
231 
232 /*
233  * Setup linkage for kernel based threading
234  */
235 	if((flags & RFTHREAD) != 0) {
236 		newproc->p_peers = p1->p_peers;
237 		p1->p_peers = newproc;
238 		newproc->p_leader = p1->p_leader;
239 	} else {
240 		newproc->p_peers = 0;
241 		newproc->p_leader = newproc;
242 	}
243 
244 	newproc->p_wakeup = 0;
245 
246 	/*
247 	 * Find an unused process ID.  We remember a range of unused IDs
248 	 * ready to use (from nextpid+1 through pidchecked-1).
249 	 */
250 	nextpid++;
251 retry:
252 	/*
253 	 * If the process ID prototype has wrapped around,
254 	 * restart somewhat above 0, as the low-numbered procs
255 	 * tend to include daemons that don't exit.
256 	 */
257 	if (nextpid >= PID_MAX) {
258 		nextpid = 100;
259 		pidchecked = 0;
260 	}
261 	if (nextpid >= pidchecked) {
262 		int doingzomb = 0;
263 
264 		pidchecked = PID_MAX;
265 		/*
266 		 * Scan the active and zombie procs to check whether this pid
267 		 * is in use.  Remember the lowest pid that's greater
268 		 * than nextpid, so we can avoid checking for a while.
269 		 */
270 		p2 = allproc.lh_first;
271 again:
272 		for (; p2 != 0; p2 = p2->p_list.le_next) {
273 			while (p2->p_pid == nextpid ||
274 			    p2->p_pgrp->pg_id == nextpid) {
275 				nextpid++;
276 				if (nextpid >= pidchecked)
277 					goto retry;
278 			}
279 			if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
280 				pidchecked = p2->p_pid;
281 			if (p2->p_pgrp->pg_id > nextpid &&
282 			    pidchecked > p2->p_pgrp->pg_id)
283 				pidchecked = p2->p_pgrp->pg_id;
284 		}
285 		if (!doingzomb) {
286 			doingzomb = 1;
287 			p2 = zombproc.lh_first;
288 			goto again;
289 		}
290 	}
291 
292 	p2 = newproc;
293 	p2->p_stat = SIDL;			/* protect against others */
294 	p2->p_pid = nextpid;
295 	LIST_INSERT_HEAD(&allproc, p2, p_list);
296 	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
297 
298 	/*
299 	 * Make a proc table entry for the new process.
300 	 * Start by zeroing the section of proc that is zero-initialized,
301 	 * then copy the section that is copied directly from the parent.
302 	 */
303 	bzero(&p2->p_startzero,
304 	    (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
305 	bcopy(&p1->p_startcopy, &p2->p_startcopy,
306 	    (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));
307 
308 	p2->p_aioinfo = NULL;
309 
310 	/*
311 	 * Duplicate sub-structures as needed.
312 	 * Increase reference counts on shared objects.
313 	 * The p_stats and p_sigacts substructs are set in vm_fork.
314 	 */
315 	p2->p_flag = P_INMEM;
316 	if (p1->p_flag & P_PROFIL)
317 		startprofclock(p2);
318 	MALLOC(p2->p_cred, struct pcred *, sizeof(struct pcred),
319 	    M_SUBPROC, M_WAITOK);
320 	bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred));
321 	p2->p_cred->p_refcnt = 1;
322 	crhold(p1->p_ucred);
323 
324 	/* bump references to the text vnode (for procfs) */
325 	p2->p_textvp = p1->p_textvp;
326 	if (p2->p_textvp)
327 		VREF(p2->p_textvp);
328 
329 	if (flags & RFCFDG)
330 		p2->p_fd = fdinit(p1);
331 	else if (flags & RFFDG)
332 		p2->p_fd = fdcopy(p1);
333 	else
334 		p2->p_fd = fdshare(p1);
335 
336 	/*
337 	 * If p_limit is still copy-on-write, bump refcnt,
338 	 * otherwise get a copy that won't be modified.
339 	 * (If PL_SHAREMOD is clear, the structure is shared
340 	 * copy-on-write.)
341 	 */
342 	if (p1->p_limit->p_lflags & PL_SHAREMOD)
343 		p2->p_limit = limcopy(p1->p_limit);
344 	else {
345 		p2->p_limit = p1->p_limit;
346 		p2->p_limit->p_refcnt++;
347 	}
348 
349 	/*
350 	 * Preserve some more flags in subprocess.  P_PROFIL has already
351 	 * been preserved.
352 	 */
353 	p2->p_flag |= p1->p_flag & P_SUGID;
354 	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
355 		p2->p_flag |= P_CONTROLT;
356 	if (flags & RFPPWAIT)
357 		p2->p_flag |= P_PPWAIT;
358 
359 	LIST_INSERT_AFTER(p1, p2, p_pglist);
360 
361 	/*
362 	 * Attach the new process to its parent.
363 	 *
364 	 * If RFNOWAIT is set, the newly created process becomes a child
365 	 * of init.  This effectively disassociates the child from the
366 	 * parent.
367 	 */
368 	if (flags & RFNOWAIT)
369 		pptr = initproc;
370 	else
371 		pptr = p1;
372 	p2->p_pptr = pptr;
373 	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
374 	LIST_INIT(&p2->p_children);
375 
376 #ifdef KTRACE
377 	/*
378 	 * Copy traceflag and tracefile if enabled.
379 	 * If not inherited, these were zeroed above.
380 	 */
381 	if (p1->p_traceflag&KTRFAC_INHERIT) {
382 		p2->p_traceflag = p1->p_traceflag;
383 		if ((p2->p_tracep = p1->p_tracep) != NULL)
384 			VREF(p2->p_tracep);
385 	}
386 #endif
387 
388 	/*
389 	 * set priority of child to be that of parent
390 	 */
391 	p2->p_estcpu = p1->p_estcpu;
392 
393 	/*
394 	 * This begins the section where we must prevent the parent
395 	 * from being swapped.
396 	 */
397 	p1->p_flag |= P_NOSWAP;
398 
399 	/*
400 	 * Finish creating the child process.  It will return via a different
401 	 * execution path later.  (ie: directly into user mode)
402 	 */
403 	vm_fork(p1, p2, flags);
404 
405 	/*
406 	 * Both processes are set up, now check if any LKMs want
407 	 * to adjust anything.
408 	 *   What if they have an error? XXX
409 	 */
410 	while (ep) {
411 		(*ep->function)(p1, p2, flags);
412 		ep = ep->next;
413 	}
414 
415 	/*
416 	 * Make child runnable and add to run queue.
417 	 */
418 	microtime(&(p2->p_stats->p_start));
419 	p2->p_acflag = AFORK;
420 	(void) splhigh();
421 	p2->p_stat = SRUN;
422 	setrunqueue(p2);
423 	(void) spl0();
424 
425 	/*
426 	 * Now can be swapped.
427 	 */
428 	p1->p_flag &= ~P_NOSWAP;
429 
430 	/*
431 	 * Preserve synchronization semantics of vfork.  If waiting for
432 	 * child to exec or exit, set P_PPWAIT on child, and sleep on our
433 	 * proc (in case of exit).
434 	 */
435 	while (p2->p_flag & P_PPWAIT)
436 		tsleep(p1, PWAIT, "ppwait", 0);
437 
438 	/*
439 	 * Return child pid to parent process,
440 	 * marking us as parent via p1->p_retval[1].
441 	 */
442 	p1->p_retval[0] = p2->p_pid;
443 	p1->p_retval[1] = 0;
444 	return (0);
445 }
446 
447 /*
448  * The next two functionms are general routines to handle adding/deleting
449  * items on the fork callout list.
450  *
451  * at_fork():
452  * Take the arguments given and put them onto the fork callout list,
453  * However first make sure that it's not already there.
454  * Returns 0 on success or a standard error number.
455  */
456 int
457 at_fork(function)
458 	forklist_fn function;
459 {
460 	fle_p ep;
461 
462 	/* let the programmer know if he's been stupid */
463 	if (rm_at_fork(function))
464 		printf("fork callout entry already present\n");
465 	ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT);
466 	if (ep == NULL)
467 		return (ENOMEM);
468 	ep->next = fork_list;
469 	ep->function = function;
470 	fork_list = ep;
471 	return (0);
472 }
473 
474 /*
475  * Scan the exit callout list for the given items and remove them.
476  * Returns the number of items removed.
477  * Theoretically this value can only be 0 or 1.
478  */
479 int
480 rm_at_fork(function)
481 	forklist_fn function;
482 {
483 	fle_p *epp, ep;
484 	int count;
485 
486 	count= 0;
487 	epp = &fork_list;
488 	ep = *epp;
489 	while (ep) {
490 		if (ep->function == function) {
491 			*epp = ep->next;
492 			free(ep, M_TEMP);
493 			count++;
494 		} else {
495 			epp = &ep->next;
496 		}
497 		ep = *epp;
498 	}
499 	return (count);
500 }
501