xref: /freebsd/sys/kern/kern_fork.c (revision 0de89efe5c443f213c7ea28773ef2dc6cf3af2ed)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
39  * $Id: kern_fork.c,v 1.46 1997/08/22 15:10:00 peter Exp $
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/kernel.h>
49 #include <sys/sysctl.h>
50 #include <sys/malloc.h>
51 #include <sys/proc.h>
52 #include <sys/resourcevar.h>
53 #include <sys/vnode.h>
54 #include <sys/acct.h>
55 #include <sys/ktrace.h>
56 #include <sys/unistd.h>
57 
58 #include <vm/vm.h>
59 #include <vm/vm_param.h>
60 #include <sys/lock.h>
61 #include <vm/pmap.h>
62 #include <vm/vm_map.h>
63 #include <vm/vm_extern.h>
64 #include <vm/vm_inherit.h>
65 
66 #ifdef SMP
67 int fast_vfork = 0;	/* Doesn't work on SMP yet */
68 #else
69 int fast_vfork = 1;
70 #endif
71 SYSCTL_INT(_kern, OID_AUTO, fast_vfork, CTLFLAG_RW, &fast_vfork, 0, "");
72 
73 static int fork1 __P((struct proc *p, int flags, int *retval));
74 
75 /*
76  * These are the stuctures used to create a callout list for things to do
77  * when forking a process
78  */
79 typedef struct fork_list_element {
80 	struct fork_list_element *next;
81 	forklist_fn function;
82 } *fle_p;
83 
84 static fle_p	fork_list;
85 
86 #ifndef _SYS_SYSPROTO_H_
87 struct fork_args {
88 	int     dummy;
89 };
90 #endif
91 
92 /* ARGSUSED */
93 int
94 fork(p, uap, retval)
95 	struct proc *p;
96 	struct fork_args *uap;
97 	int retval[];
98 {
99 	return (fork1(p, (RFFDG|RFPROC), retval));
100 }
101 
102 /* ARGSUSED */
103 int
104 vfork(p, uap, retval)
105 	struct proc *p;
106 	struct vfork_args *uap;
107 	int retval[];
108 {
109 	return (fork1(p, (RFFDG|RFPROC|RFPPWAIT|(fast_vfork ? RFMEM : 0)),
110 		retval));
111 }
112 
113 /* ARGSUSED */
114 int
115 rfork(p, uap, retval)
116 	struct proc *p;
117 	struct rfork_args *uap;
118 	int retval[];
119 {
120 	return (fork1(p, uap->flags, retval));
121 }
122 
123 
124 int	nprocs = 1;		/* process 0 */
125 static int nextpid = 0;
126 
127 static int
128 fork1(p1, flags, retval)
129 	register struct proc *p1;
130 	int flags;
131 	int retval[];
132 {
133 	register struct proc *p2, *pptr;
134 	register uid_t uid;
135 	struct proc *newproc;
136 	int count;
137 	static int pidchecked = 0;
138 	fle_p ep ;
139 
140 	ep = fork_list;
141 
142 	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
143 		return (EINVAL);
144 
145 #ifdef SMP
146 	/*
147 	 * FATAL now, we cannot have the same PTD on both cpus, the PTD
148 	 * needs to move out of PTmap and be per-process, even for shared
149 	 * page table processes.  Unfortunately, this means either removing
150 	 * PTD[] as a fixed virtual address, or move it to the per-cpu map
151 	 * area for SMP mode.  Both cases require seperate management of
152 	 * the per-process-even-if-PTmap-is-shared PTD.
153 	 */
154 	if (flags & RFMEM) {
155 		printf("shared address space fork attempted: pid: %d\n",
156 		    p1->p_pid);
157 		return (EOPNOTSUPP);
158 	}
159 #endif
160 
161 	/*
162 	 * Here we don't create a new process, but we divorce
163 	 * certain parts of a process from itself.
164 	 */
165 	if ((flags & RFPROC) == 0) {
166 
167 		/*
168 		 * Divorce the memory, if it is shared, essentially
169 		 * this changes shared memory amongst threads, into
170 		 * COW locally.
171 		 */
172 		if ((flags & RFMEM) == 0) {
173 			if (p1->p_vmspace->vm_refcnt > 1) {
174 				vmspace_unshare(p1);
175 			}
176 		}
177 
178 		/*
179 		 * Close all file descriptors.
180 		 */
181 		if (flags & RFCFDG) {
182 			struct filedesc *fdtmp;
183 			fdtmp = fdinit(p1);
184 			fdfree(p1);
185 			p1->p_fd = fdtmp;
186 		}
187 
188 		/*
189 		 * Unshare file descriptors (from parent.)
190 		 */
191 		if (flags & RFFDG) {
192 			if (p1->p_fd->fd_refcnt > 1) {
193 				struct filedesc *newfd;
194 				newfd = fdcopy(p1);
195 				fdfree(p1);
196 				p1->p_fd = newfd;
197 			}
198 		}
199 		return (0);
200 	}
201 
202 	/*
203 	 * Although process entries are dynamically created, we still keep
204 	 * a global limit on the maximum number we will create.  Don't allow
205 	 * a nonprivileged user to use the last process; don't let root
206 	 * exceed the limit. The variable nprocs is the current number of
207 	 * processes, maxproc is the limit.
208 	 */
209 	uid = p1->p_cred->p_ruid;
210 	if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) {
211 		tablefull("proc");
212 		return (EAGAIN);
213 	}
214 	/*
215 	 * Increment the nprocs resource before blocking can occur.  There
216 	 * are hard-limits as to the number of processes that can run.
217 	 */
218 	nprocs++;
219 
220 	/*
221 	 * Increment the count of procs running with this uid. Don't allow
222 	 * a nonprivileged user to exceed their current limit.
223 	 */
224 	count = chgproccnt(uid, 1);
225 	if (uid != 0 && count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur) {
226 		(void)chgproccnt(uid, -1);
227 		/*
228 		 * Back out the process count
229 		 */
230 		nprocs--;
231 		return (EAGAIN);
232 	}
233 
234 	/* Allocate new proc. */
235 	MALLOC(newproc, struct proc *, sizeof(struct proc), M_PROC, M_WAITOK);
236 
237 /*
238  * Setup linkage for kernel based threading
239  */
240 	if((flags & RFTHREAD) != 0) {
241 		newproc->p_peers = p1->p_peers;
242 		p1->p_peers = newproc;
243 		newproc->p_leader = p1->p_leader;
244 	} else {
245 		newproc->p_peers = 0;
246 		newproc->p_leader = newproc;
247 	}
248 
249 	newproc->p_wakeup = 0;
250 
251 	/*
252 	 * Find an unused process ID.  We remember a range of unused IDs
253 	 * ready to use (from nextpid+1 through pidchecked-1).
254 	 */
255 	nextpid++;
256 retry:
257 	/*
258 	 * If the process ID prototype has wrapped around,
259 	 * restart somewhat above 0, as the low-numbered procs
260 	 * tend to include daemons that don't exit.
261 	 */
262 	if (nextpid >= PID_MAX) {
263 		nextpid = 100;
264 		pidchecked = 0;
265 	}
266 	if (nextpid >= pidchecked) {
267 		int doingzomb = 0;
268 
269 		pidchecked = PID_MAX;
270 		/*
271 		 * Scan the active and zombie procs to check whether this pid
272 		 * is in use.  Remember the lowest pid that's greater
273 		 * than nextpid, so we can avoid checking for a while.
274 		 */
275 		p2 = allproc.lh_first;
276 again:
277 		for (; p2 != 0; p2 = p2->p_list.le_next) {
278 			while (p2->p_pid == nextpid ||
279 			    p2->p_pgrp->pg_id == nextpid) {
280 				nextpid++;
281 				if (nextpid >= pidchecked)
282 					goto retry;
283 			}
284 			if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
285 				pidchecked = p2->p_pid;
286 			if (p2->p_pgrp->pg_id > nextpid &&
287 			    pidchecked > p2->p_pgrp->pg_id)
288 				pidchecked = p2->p_pgrp->pg_id;
289 		}
290 		if (!doingzomb) {
291 			doingzomb = 1;
292 			p2 = zombproc.lh_first;
293 			goto again;
294 		}
295 	}
296 
297 	p2 = newproc;
298 	p2->p_stat = SIDL;			/* protect against others */
299 	p2->p_pid = nextpid;
300 	LIST_INSERT_HEAD(&allproc, p2, p_list);
301 	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
302 
303 	/*
304 	 * Make a proc table entry for the new process.
305 	 * Start by zeroing the section of proc that is zero-initialized,
306 	 * then copy the section that is copied directly from the parent.
307 	 */
308 	bzero(&p2->p_startzero,
309 	    (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
310 	bcopy(&p1->p_startcopy, &p2->p_startcopy,
311 	    (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));
312 
313 	p2->p_aioinfo = NULL;
314 
315 	/*
316 	 * Duplicate sub-structures as needed.
317 	 * Increase reference counts on shared objects.
318 	 * The p_stats and p_sigacts substructs are set in vm_fork.
319 	 */
320 	p2->p_flag = P_INMEM;
321 	if (p1->p_flag & P_PROFIL)
322 		startprofclock(p2);
323 	MALLOC(p2->p_cred, struct pcred *, sizeof(struct pcred),
324 	    M_SUBPROC, M_WAITOK);
325 	bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred));
326 	p2->p_cred->p_refcnt = 1;
327 	crhold(p1->p_ucred);
328 
329 	/* bump references to the text vnode (for procfs) */
330 	p2->p_textvp = p1->p_textvp;
331 	if (p2->p_textvp)
332 		VREF(p2->p_textvp);
333 
334 	if (flags & RFCFDG)
335 		p2->p_fd = fdinit(p1);
336 	else if (flags & RFFDG)
337 		p2->p_fd = fdcopy(p1);
338 	else
339 		p2->p_fd = fdshare(p1);
340 
341 	/*
342 	 * If p_limit is still copy-on-write, bump refcnt,
343 	 * otherwise get a copy that won't be modified.
344 	 * (If PL_SHAREMOD is clear, the structure is shared
345 	 * copy-on-write.)
346 	 */
347 	if (p1->p_limit->p_lflags & PL_SHAREMOD)
348 		p2->p_limit = limcopy(p1->p_limit);
349 	else {
350 		p2->p_limit = p1->p_limit;
351 		p2->p_limit->p_refcnt++;
352 	}
353 
354 	/*
355 	 * Preserve some flags in subprocess.
356 	 */
357 	p2->p_flag |= p1->p_flag & P_SUGID;
358 	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
359 		p2->p_flag |= P_CONTROLT;
360 	if (flags & RFPPWAIT)
361 		p2->p_flag |= P_PPWAIT;
362 	LIST_INSERT_AFTER(p1, p2, p_pglist);
363 
364 	/*
365 	 * Attach the new process to its parent.
366 	 *
367 	 * If RFNOWAIT is set, the newly created process becomes a child
368 	 * of init.  This effectively disassociates the child from the
369 	 * parent.
370 	 */
371 	if (flags & RFNOWAIT)
372 		pptr = initproc;
373 	else
374 		pptr = p1;
375 	p2->p_pptr = pptr;
376 	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
377 	LIST_INIT(&p2->p_children);
378 
379 #ifdef KTRACE
380 	/*
381 	 * Copy traceflag and tracefile if enabled.
382 	 * If not inherited, these were zeroed above.
383 	 */
384 	if (p1->p_traceflag&KTRFAC_INHERIT) {
385 		p2->p_traceflag = p1->p_traceflag;
386 		if ((p2->p_tracep = p1->p_tracep) != NULL)
387 			VREF(p2->p_tracep);
388 	}
389 #endif
390 
391 	/*
392 	 * set priority of child to be that of parent
393 	 */
394 	p2->p_estcpu = p1->p_estcpu;
395 
396 	/*
397 	 * This begins the section where we must prevent the parent
398 	 * from being swapped.
399 	 */
400 	p1->p_flag |= P_NOSWAP;
401 
402 	/*
403 	 * Finish creating the child process.  It will return via a different
404 	 * execution path later.  (ie: directly into user mode)
405 	 */
406 	vm_fork(p1, p2, flags);
407 
408 	/*
409 	 * Both processes are set up, now check if any LKMs want
410 	 * to adjust anything.
411 	 *   What if they have an error? XXX
412 	 */
413 	while (ep) {
414 		(*ep->function)(p1, p2, flags);
415 		ep = ep->next;
416 	}
417 
418 	/*
419 	 * Make child runnable and add to run queue.
420 	 */
421 	microtime(&(p2->p_stats->p_start));
422 	p2->p_acflag = AFORK;
423 	(void) splhigh();
424 	p2->p_stat = SRUN;
425 	setrunqueue(p2);
426 	(void) spl0();
427 
428 	/*
429 	 * Now can be swapped.
430 	 */
431 	p1->p_flag &= ~P_NOSWAP;
432 
433 	/*
434 	 * Preserve synchronization semantics of vfork.  If waiting for
435 	 * child to exec or exit, set P_PPWAIT on child, and sleep on our
436 	 * proc (in case of exit).
437 	 */
438 	while (p2->p_flag & P_PPWAIT)
439 		tsleep(p1, PWAIT, "ppwait", 0);
440 
441 	/*
442 	 * Return child pid to parent process,
443 	 * marking us as parent via retval[1].
444 	 */
445 	retval[0] = p2->p_pid;
446 	retval[1] = 0;
447 	return (0);
448 }
449 
450 /*
451  * The next two functionms are general routines to handle adding/deleting
452  * items on the fork callout list.
453  *
454  * at_fork():
455  * Take the arguments given and put them onto the fork callout list,
456  * However first make sure that it's not already there.
457  * Returns 0 on success or a standard error number.
458  */
459 int
460 at_fork(function)
461 	forklist_fn function;
462 {
463 	fle_p ep;
464 
465 	/* let the programmer know if he's been stupid */
466 	if (rm_at_fork(function))
467 		printf("fork callout entry already present\n");
468 	ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT);
469 	if (ep == NULL)
470 		return (ENOMEM);
471 	ep->next = fork_list;
472 	ep->function = function;
473 	fork_list = ep;
474 	return (0);
475 }
476 
477 /*
478  * Scan the exit callout list for the given items and remove them.
479  * Returns the number of items removed.
480  * Theoretically this value can only be 0 or 1.
481  */
482 int
483 rm_at_fork(function)
484 	forklist_fn function;
485 {
486 	fle_p *epp, ep;
487 	int count;
488 
489 	count= 0;
490 	epp = &fork_list;
491 	ep = *epp;
492 	while (ep) {
493 		if (ep->function == function) {
494 			*epp = ep->next;
495 			free(ep, M_TEMP);
496 			count++;
497 		} else {
498 			epp = &ep->next;
499 		}
500 		ep = *epp;
501 	}
502 	return (count);
503 }
504