xref: /freebsd/sys/kern/kern_jail.c (revision 6378393308bc6bd81fb871dacf6b03cf1a390d8b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 1999 Poul-Henning Kamp.
5  * Copyright (c) 2008 Bjoern A. Zeeb.
6  * Copyright (c) 2009 James Gritton.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_ddb.h"
35 #include "opt_inet.h"
36 #include "opt_inet6.h"
37 
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/systm.h>
42 #include <sys/errno.h>
43 #include <sys/sysproto.h>
44 #include <sys/malloc.h>
45 #include <sys/osd.h>
46 #include <sys/priv.h>
47 #include <sys/proc.h>
48 #include <sys/taskqueue.h>
49 #include <sys/fcntl.h>
50 #include <sys/jail.h>
51 #include <sys/linker.h>
52 #include <sys/lock.h>
53 #include <sys/mutex.h>
54 #include <sys/racct.h>
55 #include <sys/rctl.h>
56 #include <sys/refcount.h>
57 #include <sys/sx.h>
58 #include <sys/sysent.h>
59 #include <sys/namei.h>
60 #include <sys/mount.h>
61 #include <sys/queue.h>
62 #include <sys/socket.h>
63 #include <sys/syscallsubr.h>
64 #include <sys/sysctl.h>
65 #include <sys/uuid.h>
66 #include <sys/vnode.h>
67 
68 #include <net/if.h>
69 #include <net/vnet.h>
70 
71 #include <netinet/in.h>
72 
73 #ifdef DDB
74 #include <ddb/ddb.h>
75 #endif /* DDB */
76 
77 #include <security/mac/mac_framework.h>
78 
/*
 * Preload type string that prison0_init() hands to preload_search_by_type()
 * when looking for a loader-supplied host UUID.
 */
79 #define	PRISON0_HOSTUUID_MODULE	"hostuuid"
80 
/*
 * Allocation types declared by this file.  M_PRISON is not static, so it is
 * visible outside this file; kern_jail_set() uses it for the per-jail address
 * arrays.  M_PRISON_RACCT is private to this file.
 */
81 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
82 static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
83 
/*
 * Keep struct prison prison0 and some code in kern_jail_set() readable.
 *
 * _PR_IP_SADDRSEL is the union of the PR_IP4_SADDRSEL / PR_IP6_SADDRSEL
 * flags for whichever address families are compiled in, or 0 when neither
 * is.  The two-flag form is parenthesized so the macro always behaves as a
 * single operand, whatever operator it is combined with at the point of use.
 */
#ifdef INET
#ifdef INET6
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL | PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	PR_IP4_SADDRSEL
#endif
#else /* !INET */
#ifdef INET6
#define	_PR_IP_SADDRSEL	PR_IP6_SADDRSEL
#else
#define	_PR_IP_SADDRSEL	0
#endif
#endif
98 
/*
 * Static initializer for the host's own prison (jid 0, name "0", path "/").
 * Only fields with compile-time-constant values are set here;
 * prison0_init() later fills in pr_cpuset, pr_osreldate and pr_osrelease,
 * and may replace pr_hostuuid with a value preloaded by the loader.
 * pr_flags always carries PR_HOST plus the compiled-in _PR_IP_SADDRSEL
 * bits, and additionally PR_VNET on VIMAGE kernels.
 * The MTX_SYSINIT that follows registers prison0.pr_mtx for initialization
 * as an MTX_DEF mutex named "jail mutex".
 */
99 /* prison0 describes what is "real" about the system. */
100 struct prison prison0 = {
101 	.pr_id		= 0,
102 	.pr_name	= "0",
103 	.pr_ref		= 1,
104 	.pr_uref	= 1,
105 	.pr_path	= "/",
106 	.pr_securelevel	= -1,
107 	.pr_devfs_rsnum = 0,
108 	.pr_state	= PRISON_STATE_ALIVE,
109 	.pr_childmax	= JAIL_MAX,
110 	.pr_hostuuid	= DEFAULT_HOSTUUID,
111 	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
112 #ifdef VIMAGE
113 	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
114 #else
115 	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
116 #endif
117 	.pr_allow	= PR_ALLOW_ALL_STATIC,
118 };
119 MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
120 
/*
 * Table entry for a boolean jail parameter (used by pr_flag_bool and
 * pr_flag_allow).  kern_jail_set() looks up "name" to collect the bits being
 * requested and "noname" to collect bits that are being changed without
 * being requested; kern_jail() picks one of the two strings depending on
 * whether the bit is set in jail_default_allow.
 * "flag" is volatile because pr_flag_allow entries are read without a lock
 * via atomic_load_int(); a zero flag ends the scan of that table.
 */
121 struct bool_flags {
122 	const char	*name;
123 	const char	*noname;
124 	volatile u_int	 flag;
125 };
/*
 * Table entry for a jailsys parameter (used by pr_flag_jailsys), whose
 * value is one of JAIL_SYS_DISABLE / JAIL_SYS_NEW / JAIL_SYS_INHERIT.
 * In kern_jail_set(), JAIL_SYS_DISABLE ORs "disable" into the requested
 * flags and is rejected with EINVAL when "disable" is 0; JAIL_SYS_NEW ORs
 * in "new"; JAIL_SYS_INHERIT sets neither.  Both masks are always added to
 * the set of changed flags.
 */
126 struct jailsys_flags {
127 	const char	*name;
128 	unsigned	 disable;
129 	unsigned	 new;
130 };
131 
/*
 * Global jail bookkeeping.  kern_jail_set() takes allprison_lock
 * exclusively before searching allprison, and its jid lookup skips entries
 * with a smaller pr_id and stops at the first larger one, i.e. it relies on
 * the list being kept in ascending pr_id order.
 */
132 /* allprison, allprison_racct and lastprid are protected by allprison_lock. */
133 struct	sx allprison_lock;
134 SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
135 struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
136 LIST_HEAD(, prison_racct) allprison_racct;
137 int	lastprid = 0;
138 
/*
 * Forward declarations for file-local (static) helpers.  The prison_racct_*
 * helpers exist only in kernels built with RACCT.
 */
139 static int get_next_prid(struct prison **insprp);
140 static int do_jail_attach(struct thread *td, struct prison *pr, int drflags);
141 static void prison_complete(void *context, int pending);
142 static void prison_deref(struct prison *pr, int flags);
143 static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison);
144 static int prison_lock_xlock(struct prison *pr, int flags);
145 static void prison_free_not_last(struct prison *pr);
146 static void prison_proc_free_not_last(struct prison *pr);
147 static void prison_set_allow_locked(struct prison *pr, unsigned flag,
148     int enable);
149 static char *prison_path(struct prison *pr1, struct prison *pr2);
150 #ifdef RACCT
151 static void prison_racct_attach(struct prison *pr);
152 static void prison_racct_modify(struct prison *pr);
153 static void prison_racct_detach(struct prison *pr);
154 #endif
155 
/*
 * The low three bits select what prison_deref() should do; the 0x10-0x40
 * bits describe which locks the caller already holds.  kern_jail_set()
 * accumulates the lock bits in its drflags variable as it acquires
 * allprison_lock (PD_LIST_XLOCKED) and a prison's pr_mtx (PD_LOCKED).
 * PD_OP_FLAGS and PD_LOCK_FLAGS are the masks covering each group.
 */
156 /* Flags for prison_deref */
157 #define	PD_DEREF	0x01	/* Decrement pr_ref */
158 #define	PD_DEUREF	0x02	/* Decrement pr_uref */
159 #define	PD_KILL		0x04	/* Remove jail, kill processes, etc */
160 #define	PD_LOCKED	0x10	/* pr_mtx is held */
161 #define	PD_LIST_SLOCKED	0x20	/* allprison_lock is held shared */
162 #define	PD_LIST_XLOCKED	0x40	/* allprison_lock is held exclusive */
163 #define PD_OP_FLAGS	0x07	/* Operation flags */
164 #define PD_LOCK_FLAGS	0x70	/* Lock status flags */
165 
166 /*
167  * Parameter names corresponding to PR_* flag values.  Size values are for kvm
168  * as we cannot figure out the size of a sparse array, or an array without a
169  * terminating entry.
170  */
171 static struct bool_flags pr_flag_bool[] = {
172 	{"persist", "nopersist", PR_PERSIST},
173 #ifdef INET
174 	{"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL},
175 #endif
176 #ifdef INET6
177 	{"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL},
178 #endif
179 };
180 const size_t pr_flag_bool_size = sizeof(pr_flag_bool);
181 
182 static struct jailsys_flags pr_flag_jailsys[] = {
183 	{"host", 0, PR_HOST},
184 #ifdef VIMAGE
185 	{"vnet", 0, PR_VNET},
186 #endif
187 #ifdef INET
188 	{"ip4", PR_IP4_USER, PR_IP4_USER},
189 #endif
190 #ifdef INET6
191 	{"ip6", PR_IP6_USER, PR_IP6_USER},
192 #endif
193 };
194 const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
195 
196 /*
197  * Make this array full-size so dynamic parameters can be added.
198  * It is protected by prison0.mtx, but lockless reading is allowed
199  * with an atomic check of the flag values.
200  */
201 static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
202 	{"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME},
203 	{"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC},
204 	{"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS},
205 	{"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS},
206 	{"allow.mount", "allow.nomount", PR_ALLOW_MOUNT},
207 	{"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS},
208 	{"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF},
209 	{"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK},
210 	{"allow.reserved_ports", "allow.noreserved_ports",
211 	 PR_ALLOW_RESERVED_PORTS},
212 	{"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF},
213 	{"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug",
214 	 PR_ALLOW_UNPRIV_DEBUG},
215 	{"allow.suser", "allow.nosuser", PR_ALLOW_SUSER},
216 };
217 static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC;
218 const size_t pr_flag_allow_size = sizeof(pr_flag_allow);
219 
/*
 * Defaults used by kern_jail() when an unjailed caller creates a jail:
 * jail_default_allow decides, per allow.* flag, whether the positive or the
 * negated parameter name is passed on, and jail_default_enforce_statfs is
 * passed as "enforce_statfs".  (kern_jail() describes these as coming from
 * sysctls; the sysctl definitions are not next to these variables.)
 * jail_max_af_ips caps the number of addresses accepted per address family
 * in both kern_jail() and kern_jail_set(), and only exists when at least
 * one of INET / INET6 is compiled in.
 */
220 #define	JAIL_DEFAULT_ALLOW		(PR_ALLOW_SET_HOSTNAME | \
221 					 PR_ALLOW_RESERVED_PORTS | \
222 					 PR_ALLOW_UNPRIV_DEBUG | \
223 					 PR_ALLOW_SUSER)
224 #define	JAIL_DEFAULT_ENFORCE_STATFS	2
225 #define	JAIL_DEFAULT_DEVFS_RSNUM	0
226 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
227 static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
228 static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
229 #if defined(INET) || defined(INET6)
230 static unsigned jail_max_af_ips = 255;
231 #endif
232 
233 /*
234  * Initialize the parts of prison0 that can't be static-initialized with
235  * constants.  This is called from proc0_init() after creating thread0 cpuset.
236  */
237 void
238 prison0_init(void)
239 {
240 	uint8_t *file, *data;
241 	size_t size;
242 	char buf[sizeof(prison0.pr_hostuuid)];
243 	bool valid;
244 
245 	prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
246 	prison0.pr_osreldate = osreldate;
247 	strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
248 
249 	/* If we have a preloaded hostuuid, use it. */
250 	file = preload_search_by_type(PRISON0_HOSTUUID_MODULE);
251 	if (file != NULL) {
252 		data = preload_fetch_addr(file);
253 		size = preload_fetch_size(file);
254 		if (data != NULL) {
255 			/*
256 			 * The preloaded data may include trailing whitespace, almost
257 			 * certainly a newline; skip over any whitespace or
258 			 * non-printable characters to be safe.
259 			 */
260 			while (size > 0 && data[size - 1] <= 0x20) {
261 				size--;
262 			}
263 
264 			valid = false;
265 
266 			/*
267 			 * Not NUL-terminated when passed from loader, but
268 			 * validate_uuid requires that due to using sscanf (as
269 			 * does the subsequent strlcpy, since it still reads
270 			 * past the given size to return the true length);
271 			 * bounce to a temporary buffer to fix.
272 			 */
273 			if (size >= sizeof(buf))
274 				goto done;
275 
276 			memcpy(buf, data, size);
277 			buf[size] = '\0';
278 
279 			if (validate_uuid(buf, size, NULL, 0) != 0)
280 				goto done;
281 
282 			valid = true;
283 			(void)strlcpy(prison0.pr_hostuuid, buf,
284 			    sizeof(prison0.pr_hostuuid));
285 
286 done:
287 			if (bootverbose && !valid) {
288 				printf("hostuuid: preload data malformed: '%.*s'\n",
289 				    (int)size, data);
290 			}
291 		}
292 	}
293 	if (bootverbose)
294 		printf("hostuuid: using %s\n", prison0.pr_hostuuid);
295 }
296 
297 /*
298  * struct jail_args {
299  *	struct jail *jail;
300  * };
301  */
302 int
303 sys_jail(struct thread *td, struct jail_args *uap)
304 {
305 	uint32_t version;
306 	int error;
307 	struct jail j;
308 
309 	error = copyin(uap->jail, &version, sizeof(uint32_t));
310 	if (error)
311 		return (error);
312 
313 	switch (version) {
314 	case 0:
315 	{
316 		struct jail_v0 j0;
317 
318 		/* FreeBSD single IPv4 jails. */
319 		bzero(&j, sizeof(struct jail));
320 		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
321 		if (error)
322 			return (error);
323 		j.version = j0.version;
324 		j.path = j0.path;
325 		j.hostname = j0.hostname;
326 		j.ip4s = htonl(j0.ip_number);	/* jail_v0 is host order */
327 		break;
328 	}
329 
330 	case 1:
331 		/*
332 		 * Version 1 was used by multi-IPv4 jail implementations
333 		 * that never made it into the official kernel.
334 		 */
335 		return (EINVAL);
336 
337 	case 2:	/* JAIL_API_VERSION */
338 		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
339 		error = copyin(uap->jail, &j, sizeof(struct jail));
340 		if (error)
341 			return (error);
342 		break;
343 
344 	default:
345 		/* Sci-Fi jails are not supported, sorry. */
346 		return (EINVAL);
347 	}
348 	return (kern_jail(td, &j));
349 }
350 
351 int
352 kern_jail(struct thread *td, struct jail *j)
353 {
354 	struct iovec optiov[2 * (4 + nitems(pr_flag_allow)
355 #ifdef INET
356 			    + 1
357 #endif
358 #ifdef INET6
359 			    + 1
360 #endif
361 			    )];
362 	struct uio opt;
363 	char *u_path, *u_hostname, *u_name;
364 	struct bool_flags *bf;
365 #ifdef INET
366 	uint32_t ip4s;
367 	struct in_addr *u_ip4;
368 #endif
369 #ifdef INET6
370 	struct in6_addr *u_ip6;
371 #endif
372 	size_t tmplen;
373 	int error, enforce_statfs;
374 
375 	bzero(&optiov, sizeof(optiov));
376 	opt.uio_iov = optiov;
377 	opt.uio_iovcnt = 0;
378 	opt.uio_offset = -1;
379 	opt.uio_resid = -1;
380 	opt.uio_segflg = UIO_SYSSPACE;
381 	opt.uio_rw = UIO_READ;
382 	opt.uio_td = td;
383 
384 	/* Set permissions for top-level jails from sysctls. */
385 	if (!jailed(td->td_ucred)) {
386 		for (bf = pr_flag_allow;
387 		     bf < pr_flag_allow + nitems(pr_flag_allow) &&
388 			atomic_load_int(&bf->flag) != 0;
389 		     bf++) {
390 			optiov[opt.uio_iovcnt].iov_base = __DECONST(char *,
391 			    (jail_default_allow & bf->flag)
392 			    ? bf->name : bf->noname);
393 			optiov[opt.uio_iovcnt].iov_len =
394 			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
395 			opt.uio_iovcnt += 2;
396 		}
397 		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
398 		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
399 		opt.uio_iovcnt++;
400 		enforce_statfs = jail_default_enforce_statfs;
401 		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
402 		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
403 		opt.uio_iovcnt++;
404 	}
405 
406 	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
407 #ifdef INET
408 	ip4s = (j->version == 0) ? 1 : j->ip4s;
409 	if (ip4s > jail_max_af_ips)
410 		return (EINVAL);
411 	tmplen += ip4s * sizeof(struct in_addr);
412 #else
413 	if (j->ip4s > 0)
414 		return (EINVAL);
415 #endif
416 #ifdef INET6
417 	if (j->ip6s > jail_max_af_ips)
418 		return (EINVAL);
419 	tmplen += j->ip6s * sizeof(struct in6_addr);
420 #else
421 	if (j->ip6s > 0)
422 		return (EINVAL);
423 #endif
424 	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
425 	u_hostname = u_path + MAXPATHLEN;
426 	u_name = u_hostname + MAXHOSTNAMELEN;
427 #ifdef INET
428 	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
429 #endif
430 #ifdef INET6
431 #ifdef INET
432 	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
433 #else
434 	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
435 #endif
436 #endif
437 	optiov[opt.uio_iovcnt].iov_base = "path";
438 	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
439 	opt.uio_iovcnt++;
440 	optiov[opt.uio_iovcnt].iov_base = u_path;
441 	error = copyinstr(j->path, u_path, MAXPATHLEN,
442 	    &optiov[opt.uio_iovcnt].iov_len);
443 	if (error) {
444 		free(u_path, M_TEMP);
445 		return (error);
446 	}
447 	opt.uio_iovcnt++;
448 	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
449 	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
450 	opt.uio_iovcnt++;
451 	optiov[opt.uio_iovcnt].iov_base = u_hostname;
452 	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
453 	    &optiov[opt.uio_iovcnt].iov_len);
454 	if (error) {
455 		free(u_path, M_TEMP);
456 		return (error);
457 	}
458 	opt.uio_iovcnt++;
459 	if (j->jailname != NULL) {
460 		optiov[opt.uio_iovcnt].iov_base = "name";
461 		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
462 		opt.uio_iovcnt++;
463 		optiov[opt.uio_iovcnt].iov_base = u_name;
464 		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
465 		    &optiov[opt.uio_iovcnt].iov_len);
466 		if (error) {
467 			free(u_path, M_TEMP);
468 			return (error);
469 		}
470 		opt.uio_iovcnt++;
471 	}
472 #ifdef INET
473 	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
474 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
475 	opt.uio_iovcnt++;
476 	optiov[opt.uio_iovcnt].iov_base = u_ip4;
477 	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
478 	if (j->version == 0)
479 		u_ip4->s_addr = j->ip4s;
480 	else {
481 		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
482 		if (error) {
483 			free(u_path, M_TEMP);
484 			return (error);
485 		}
486 	}
487 	opt.uio_iovcnt++;
488 #endif
489 #ifdef INET6
490 	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
491 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
492 	opt.uio_iovcnt++;
493 	optiov[opt.uio_iovcnt].iov_base = u_ip6;
494 	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
495 	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
496 	if (error) {
497 		free(u_path, M_TEMP);
498 		return (error);
499 	}
500 	opt.uio_iovcnt++;
501 #endif
502 	KASSERT(opt.uio_iovcnt <= nitems(optiov),
503 		("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
504 	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
505 	free(u_path, M_TEMP);
506 	return (error);
507 }
508 
509 /*
510  * struct jail_set_args {
511  *	struct iovec *iovp;
512  *	unsigned int iovcnt;
513  *	int flags;
514  * };
515  */
516 int
517 sys_jail_set(struct thread *td, struct jail_set_args *uap)
518 {
519 	struct uio *auio;
520 	int error;
521 
522 	/* Check that we have an even number of iovecs. */
523 	if (uap->iovcnt & 1)
524 		return (EINVAL);
525 
526 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
527 	if (error)
528 		return (error);
529 	error = kern_jail_set(td, auio, uap->flags);
530 	free(auio, M_IOV);
531 	return (error);
532 }
533 
534 int
535 kern_jail_set(struct thread *td, struct uio *optuio, int flags)
536 {
537 	struct nameidata nd;
538 #ifdef INET
539 	struct in_addr *ip4;
540 #endif
541 #ifdef INET6
542 	struct in6_addr *ip6;
543 #endif
544 	struct vfsopt *opt;
545 	struct vfsoptlist *opts;
546 	struct prison *pr, *deadpr, *inspr, *mypr, *ppr, *tpr;
547 	struct vnode *root;
548 	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
549 	char *g_path, *osrelstr;
550 	struct bool_flags *bf;
551 	struct jailsys_flags *jsf;
552 #if defined(INET) || defined(INET6)
553 	struct prison *tppr;
554 	void *op;
555 #endif
556 	unsigned long hid;
557 	size_t namelen, onamelen, pnamelen;
558 	int born, created, cuflags, descend, drflags, enforce;
559 	int error, errmsg_len, errmsg_pos;
560 	int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
561 	int jid, jsys, len, level;
562 	int childmax, osreldt, rsnum, slevel;
563 #if defined(INET) || defined(INET6)
564 	int ii, ij;
565 #endif
566 #ifdef INET
567 	int ip4s, redo_ip4;
568 #endif
569 #ifdef INET6
570 	int ip6s, redo_ip6;
571 #endif
572 	uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
573 	uint64_t pr_allow_diff;
574 	unsigned tallow;
575 	char numbuf[12];
576 
577 	error = priv_check(td, PRIV_JAIL_SET);
578 	if (!error && (flags & JAIL_ATTACH))
579 		error = priv_check(td, PRIV_JAIL_ATTACH);
580 	if (error)
581 		return (error);
582 	mypr = td->td_ucred->cr_prison;
583 	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
584 		return (EPERM);
585 	if (flags & ~JAIL_SET_MASK)
586 		return (EINVAL);
587 
588 	/*
589 	 * Check all the parameters before committing to anything.  Not all
590 	 * errors can be caught early, but we may as well try.  Also, this
591 	 * takes care of some expensive stuff (path lookup) before getting
592 	 * the allprison lock.
593 	 *
594 	 * XXX Jails are not filesystems, and jail parameters are not mount
595 	 *     options.  But it makes more sense to re-use the vfsopt code
596 	 *     than duplicate it under a different name.
597 	 */
598 	error = vfs_buildopts(optuio, &opts);
599 	if (error)
600 		return (error);
601 #ifdef INET
602 	ip4 = NULL;
603 #endif
604 #ifdef INET6
605 	ip6 = NULL;
606 #endif
607 	g_path = NULL;
608 
609 	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
610 	if (!cuflags) {
611 		error = EINVAL;
612 		vfs_opterror(opts, "no valid operation (create or update)");
613 		goto done_errmsg;
614 	}
615 
616 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
617 	if (error == ENOENT)
618 		jid = 0;
619 	else if (error != 0)
620 		goto done_free;
621 
622 	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
623 	if (error == ENOENT)
624 		gotslevel = 0;
625 	else if (error != 0)
626 		goto done_free;
627 	else
628 		gotslevel = 1;
629 
630 	error =
631 	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
632 	if (error == ENOENT)
633 		gotchildmax = 0;
634 	else if (error != 0)
635 		goto done_free;
636 	else
637 		gotchildmax = 1;
638 
639 	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
640 	if (error == ENOENT)
641 		gotenforce = 0;
642 	else if (error != 0)
643 		goto done_free;
644 	else if (enforce < 0 || enforce > 2) {
645 		error = EINVAL;
646 		goto done_free;
647 	} else
648 		gotenforce = 1;
649 
650 	error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
651 	if (error == ENOENT)
652 		gotrsnum = 0;
653 	else if (error != 0)
654 		goto done_free;
655 	else
656 		gotrsnum = 1;
657 
658 	pr_flags = ch_flags = 0;
659 	for (bf = pr_flag_bool;
660 	     bf < pr_flag_bool + nitems(pr_flag_bool);
661 	     bf++) {
662 		vfs_flagopt(opts, bf->name, &pr_flags, bf->flag);
663 		vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag);
664 	}
665 	ch_flags |= pr_flags;
666 	for (jsf = pr_flag_jailsys;
667 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
668 	     jsf++) {
669 		error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys));
670 		if (error == ENOENT)
671 			continue;
672 		if (error != 0)
673 			goto done_free;
674 		switch (jsys) {
675 		case JAIL_SYS_DISABLE:
676 			if (!jsf->disable) {
677 				error = EINVAL;
678 				goto done_free;
679 			}
680 			pr_flags |= jsf->disable;
681 			break;
682 		case JAIL_SYS_NEW:
683 			pr_flags |= jsf->new;
684 			break;
685 		case JAIL_SYS_INHERIT:
686 			break;
687 		default:
688 			error = EINVAL;
689 			goto done_free;
690 		}
691 		ch_flags |= jsf->new | jsf->disable;
692 	}
693 	if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE
694 	    && !(pr_flags & PR_PERSIST)) {
695 		error = EINVAL;
696 		vfs_opterror(opts, "new jail must persist or attach");
697 		goto done_errmsg;
698 	}
699 #ifdef VIMAGE
700 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
701 		error = EINVAL;
702 		vfs_opterror(opts, "vnet cannot be changed after creation");
703 		goto done_errmsg;
704 	}
705 #endif
706 #ifdef INET
707 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
708 		error = EINVAL;
709 		vfs_opterror(opts, "ip4 cannot be changed after creation");
710 		goto done_errmsg;
711 	}
712 #endif
713 #ifdef INET6
714 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
715 		error = EINVAL;
716 		vfs_opterror(opts, "ip6 cannot be changed after creation");
717 		goto done_errmsg;
718 	}
719 #endif
720 
721 	pr_allow = ch_allow = 0;
722 	for (bf = pr_flag_allow;
723 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
724 		atomic_load_int(&bf->flag) != 0;
725 	     bf++) {
726 		vfs_flagopt(opts, bf->name, &pr_allow, bf->flag);
727 		vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag);
728 	}
729 	ch_allow |= pr_allow;
730 
731 	error = vfs_getopt(opts, "name", (void **)&name, &len);
732 	if (error == ENOENT)
733 		name = NULL;
734 	else if (error != 0)
735 		goto done_free;
736 	else {
737 		if (len == 0 || name[len - 1] != '\0') {
738 			error = EINVAL;
739 			goto done_free;
740 		}
741 		if (len > MAXHOSTNAMELEN) {
742 			error = ENAMETOOLONG;
743 			goto done_free;
744 		}
745 	}
746 
747 	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
748 	if (error == ENOENT)
749 		host = NULL;
750 	else if (error != 0)
751 		goto done_free;
752 	else {
753 		ch_flags |= PR_HOST;
754 		pr_flags |= PR_HOST;
755 		if (len == 0 || host[len - 1] != '\0') {
756 			error = EINVAL;
757 			goto done_free;
758 		}
759 		if (len > MAXHOSTNAMELEN) {
760 			error = ENAMETOOLONG;
761 			goto done_free;
762 		}
763 	}
764 
765 	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
766 	if (error == ENOENT)
767 		domain = NULL;
768 	else if (error != 0)
769 		goto done_free;
770 	else {
771 		ch_flags |= PR_HOST;
772 		pr_flags |= PR_HOST;
773 		if (len == 0 || domain[len - 1] != '\0') {
774 			error = EINVAL;
775 			goto done_free;
776 		}
777 		if (len > MAXHOSTNAMELEN) {
778 			error = ENAMETOOLONG;
779 			goto done_free;
780 		}
781 	}
782 
783 	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
784 	if (error == ENOENT)
785 		uuid = NULL;
786 	else if (error != 0)
787 		goto done_free;
788 	else {
789 		ch_flags |= PR_HOST;
790 		pr_flags |= PR_HOST;
791 		if (len == 0 || uuid[len - 1] != '\0') {
792 			error = EINVAL;
793 			goto done_free;
794 		}
795 		if (len > HOSTUUIDLEN) {
796 			error = ENAMETOOLONG;
797 			goto done_free;
798 		}
799 	}
800 
801 #ifdef COMPAT_FREEBSD32
802 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
803 		uint32_t hid32;
804 
805 		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
806 		hid = hid32;
807 	} else
808 #endif
809 		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
810 	if (error == ENOENT)
811 		gothid = 0;
812 	else if (error != 0)
813 		goto done_free;
814 	else {
815 		gothid = 1;
816 		ch_flags |= PR_HOST;
817 		pr_flags |= PR_HOST;
818 	}
819 
820 #ifdef INET
821 	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
822 	if (error == ENOENT)
823 		ip4s = 0;
824 	else if (error != 0)
825 		goto done_free;
826 	else if (ip4s & (sizeof(*ip4) - 1)) {
827 		error = EINVAL;
828 		goto done_free;
829 	} else {
830 		ch_flags |= PR_IP4_USER;
831 		pr_flags |= PR_IP4_USER;
832 		if (ip4s > 0) {
833 			ip4s /= sizeof(*ip4);
834 			if (ip4s > jail_max_af_ips) {
835 				error = EINVAL;
836 				vfs_opterror(opts, "too many IPv4 addresses");
837 				goto done_errmsg;
838 			}
839 			ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
840 			bcopy(op, ip4, ip4s * sizeof(*ip4));
841 			/*
842 			 * IP addresses are all sorted but ip[0] to preserve
843 			 * the primary IP address as given from userland.
844 			 * This special IP is used for unbound outgoing
845 			 * connections as well for "loopback" traffic in case
846 			 * source address selection cannot find any more fitting
847 			 * address to connect from.
848 			 */
849 			if (ip4s > 1)
850 				qsort(ip4 + 1, ip4s - 1, sizeof(*ip4),
851 				    prison_qcmp_v4);
852 			/*
853 			 * Check for duplicate addresses and do some simple
854 			 * zero and broadcast checks. If users give other bogus
855 			 * addresses it is their problem.
856 			 *
857 			 * We do not have to care about byte order for these
858 			 * checks so we will do them in NBO.
859 			 */
860 			for (ii = 0; ii < ip4s; ii++) {
861 				if (ip4[ii].s_addr == INADDR_ANY ||
862 				    ip4[ii].s_addr == INADDR_BROADCAST) {
863 					error = EINVAL;
864 					goto done_free;
865 				}
866 				if ((ii+1) < ip4s &&
867 				    (ip4[0].s_addr == ip4[ii+1].s_addr ||
868 				     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
869 					error = EINVAL;
870 					goto done_free;
871 				}
872 			}
873 		}
874 	}
875 #endif
876 
877 #ifdef INET6
878 	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
879 	if (error == ENOENT)
880 		ip6s = 0;
881 	else if (error != 0)
882 		goto done_free;
883 	else if (ip6s & (sizeof(*ip6) - 1)) {
884 		error = EINVAL;
885 		goto done_free;
886 	} else {
887 		ch_flags |= PR_IP6_USER;
888 		pr_flags |= PR_IP6_USER;
889 		if (ip6s > 0) {
890 			ip6s /= sizeof(*ip6);
891 			if (ip6s > jail_max_af_ips) {
892 				error = EINVAL;
893 				vfs_opterror(opts, "too many IPv6 addresses");
894 				goto done_errmsg;
895 			}
896 			ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
897 			bcopy(op, ip6, ip6s * sizeof(*ip6));
898 			if (ip6s > 1)
899 				qsort(ip6 + 1, ip6s - 1, sizeof(*ip6),
900 				    prison_qcmp_v6);
901 			for (ii = 0; ii < ip6s; ii++) {
902 				if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
903 					error = EINVAL;
904 					goto done_free;
905 				}
906 				if ((ii+1) < ip6s &&
907 				    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
908 				     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
909 				{
910 					error = EINVAL;
911 					goto done_free;
912 				}
913 			}
914 		}
915 	}
916 #endif
917 
918 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
919 	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
920 		error = EINVAL;
921 		vfs_opterror(opts,
922 		    "vnet jails cannot have IP address restrictions");
923 		goto done_errmsg;
924 	}
925 #endif
926 
927 	error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
928 	if (error == ENOENT)
929 		osrelstr = NULL;
930 	else if (error != 0)
931 		goto done_free;
932 	else {
933 		if (flags & JAIL_UPDATE) {
934 			error = EINVAL;
935 			vfs_opterror(opts,
936 			    "osrelease cannot be changed after creation");
937 			goto done_errmsg;
938 		}
939 		if (len == 0 || osrelstr[len - 1] != '\0') {
940 			error = EINVAL;
941 			goto done_free;
942 		}
943 		if (len >= OSRELEASELEN) {
944 			error = ENAMETOOLONG;
945 			vfs_opterror(opts,
946 			    "osrelease string must be 1-%d bytes long",
947 			    OSRELEASELEN - 1);
948 			goto done_errmsg;
949 		}
950 	}
951 
952 	error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
953 	if (error == ENOENT)
954 		osreldt = 0;
955 	else if (error != 0)
956 		goto done_free;
957 	else {
958 		if (flags & JAIL_UPDATE) {
959 			error = EINVAL;
960 			vfs_opterror(opts,
961 			    "osreldate cannot be changed after creation");
962 			goto done_errmsg;
963 		}
964 		if (osreldt == 0) {
965 			error = EINVAL;
966 			vfs_opterror(opts, "osreldate cannot be 0");
967 			goto done_errmsg;
968 		}
969 	}
970 
971 	root = NULL;
972 	error = vfs_getopt(opts, "path", (void **)&path, &len);
973 	if (error == ENOENT)
974 		path = NULL;
975 	else if (error != 0)
976 		goto done_free;
977 	else {
978 		if (flags & JAIL_UPDATE) {
979 			error = EINVAL;
980 			vfs_opterror(opts,
981 			    "path cannot be changed after creation");
982 			goto done_errmsg;
983 		}
984 		if (len == 0 || path[len - 1] != '\0') {
985 			error = EINVAL;
986 			goto done_free;
987 		}
988 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path);
989 		error = namei(&nd);
990 		if (error)
991 			goto done_free;
992 		root = nd.ni_vp;
993 		NDFREE(&nd, NDF_ONLY_PNBUF);
994 		g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
995 		strlcpy(g_path, path, MAXPATHLEN);
996 		error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
997 		if (error == 0) {
998 			path = g_path;
999 		} else {
1000 			/* exit on other errors */
1001 			goto done_free;
1002 		}
1003 		if (root->v_type != VDIR) {
1004 			error = ENOTDIR;
1005 			vput(root);
1006 			goto done_free;
1007 		}
1008 		VOP_UNLOCK(root);
1009 	}
1010 
1011 	/*
1012 	 * Find the specified jail, or at least its parent.
1013 	 * This abuses the file error codes ENOENT and EEXIST.
1014 	 */
1015 	pr = NULL;
1016 	inspr = NULL;
1017 	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
1018 		namelc = strrchr(name, '.');
1019 		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
1020 		if (*p != '\0')
1021 			jid = 0;
1022 	}
1023 	sx_xlock(&allprison_lock);
1024 	drflags = PD_LIST_XLOCKED;
1025 	ppr = mypr;
1026 	if (!prison_isalive(ppr)) {
1027 		/* This jail is dying.  This process will surely follow. */
1028 		error = EAGAIN;
1029 		goto done_deref;
1030 	}
1031 	if (jid != 0) {
1032 		if (jid < 0) {
1033 			error = EINVAL;
1034 			vfs_opterror(opts, "negative jid");
1035 			goto done_deref;
1036 		}
1037 		/*
1038 		 * See if a requested jid already exists.  Keep track of
1039 		 * where it can be inserted later.
1040 		 */
1041 		TAILQ_FOREACH(inspr, &allprison, pr_list) {
1042 			if (inspr->pr_id < jid)
1043 				continue;
1044 			if (inspr->pr_id > jid)
1045 				break;
1046 			pr = inspr;
1047 			mtx_lock(&pr->pr_mtx);
1048 			drflags |= PD_LOCKED;
1049 			inspr = NULL;
1050 			break;
1051 		}
1052 		if (pr != NULL) {
1053 			/* Create: jid must not exist. */
1054 			if (cuflags == JAIL_CREATE) {
1055 				/*
1056 				 * Even creators that cannot see the jail will
1057 				 * get EEXIST.
1058 				 */
1059 				error = EEXIST;
1060 				vfs_opterror(opts, "jail %d already exists",
1061 				    jid);
1062 				goto done_deref;
1063 			}
1064 			if (!prison_ischild(mypr, pr)) {
1065 				/*
1066 				 * Updaters get ENOENT if they cannot see the
1067 				 * jail.  This is true even for CREATE | UPDATE,
1068 				 * which normally cannot give this error.
1069 				 */
1070 				error = ENOENT;
1071 				vfs_opterror(opts, "jail %d not found", jid);
1072 				goto done_deref;
1073 			}
1074 			ppr = pr->pr_parent;
1075 			if (!prison_isalive(ppr)) {
1076 				error = ENOENT;
1077 				vfs_opterror(opts, "jail %d is dying",
1078 				    ppr->pr_id);
1079 				goto done_deref;
1080 			}
1081 			if (!prison_isalive(pr)) {
1082 				if (!(flags & JAIL_DYING)) {
1083 					error = ENOENT;
1084 					vfs_opterror(opts, "jail %d is dying",
1085 					    jid);
1086 					goto done_deref;
1087 				}
1088 				if ((flags & JAIL_ATTACH) ||
1089 				    (pr_flags & PR_PERSIST)) {
1090 					/*
1091 					 * A dying jail might be resurrected
1092 					 * (via attach or persist), but first
1093 					 * it must determine if another jail
1094 					 * has claimed its name.  Accomplish
1095 					 * this by implicitly re-setting the
1096 					 * name.
1097 					 */
1098 					if (name == NULL)
1099 						name = prison_name(mypr, pr);
1100 				}
1101 			}
1102 		} else {
1103 			/* Update: jid must exist. */
1104 			if (cuflags == JAIL_UPDATE) {
1105 				error = ENOENT;
1106 				vfs_opterror(opts, "jail %d not found", jid);
1107 				goto done_deref;
1108 			}
1109 		}
1110 	}
1111 	/*
1112 	 * If the caller provided a name, look for a jail by that name.
1113 	 * This has different semantics for creates and updates keyed by jid
1114 	 * (where the name must not already exist in a different jail),
1115 	 * and updates keyed by the name itself (where the name must exist
1116 	 * because that is the jail being updated).
1117 	 */
1118 	namelc = NULL;
1119 	if (name != NULL) {
1120 		namelc = strrchr(name, '.');
1121 		if (namelc == NULL)
1122 			namelc = name;
1123 		else {
1124 			/*
1125 			 * This is a hierarchical name.  Split it into the
1126 			 * parent and child names, and make sure the parent
1127 			 * exists or matches an already found jail.
1128 			 */
1129 			if (pr != NULL) {
1130 				if (strncmp(name, ppr->pr_name, namelc - name)
1131 				    || ppr->pr_name[namelc - name] != '\0') {
1132 					error = EINVAL;
1133 					vfs_opterror(opts,
1134 					    "cannot change jail's parent");
1135 					goto done_deref;
1136 				}
1137 			} else {
1138 				*namelc = '\0';
1139 				ppr = prison_find_name(mypr, name);
1140 				if (ppr == NULL) {
1141 					error = ENOENT;
1142 					vfs_opterror(opts,
1143 					    "jail \"%s\" not found", name);
1144 					goto done_deref;
1145 				}
1146 				mtx_unlock(&ppr->pr_mtx);
1147 				if (!prison_isalive(ppr)) {
1148 					error = ENOENT;
1149 					vfs_opterror(opts,
1150 					    "jail \"%s\" is dying", name);
1151 					goto done_deref;
1152 				}
1153 				*namelc = '.';
1154 			}
1155 			namelc++;
1156 		}
1157 		if (namelc[0] != '\0') {
1158 			pnamelen =
1159 			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1160 			deadpr = NULL;
1161 			FOREACH_PRISON_CHILD(ppr, tpr) {
1162 				if (tpr != pr &&
1163 				    !strcmp(tpr->pr_name + pnamelen, namelc)) {
1164 					if (prison_isalive(tpr)) {
1165 						if (pr == NULL &&
1166 						    cuflags != JAIL_CREATE) {
1167 							/*
1168 							 * Use this jail
1169 							 * for updates.
1170 							 */
1171 							pr = tpr;
1172 							mtx_lock(&pr->pr_mtx);
1173 							drflags |= PD_LOCKED;
1174 							break;
1175 						}
1176 						/*
1177 						 * Create, or update(jid):
1178 						 * name must not exist in an
1179 						 * active sibling jail.
1180 						 */
1181 						error = EEXIST;
1182 						vfs_opterror(opts,
1183 						   "jail \"%s\" already exists",
1184 						   name);
1185 						goto done_deref;
1186 					}
1187 					if (pr == NULL &&
1188 					    cuflags != JAIL_CREATE) {
1189 						deadpr = tpr;
1190 					}
1191 				}
1192 			}
1193 			/* If no active jail is found, use a dying one. */
1194 			if (deadpr != NULL && pr == NULL) {
1195 				if (flags & JAIL_DYING) {
1196 					pr = deadpr;
1197 					mtx_lock(&pr->pr_mtx);
1198 					drflags |= PD_LOCKED;
1199 				} else if (cuflags == JAIL_UPDATE) {
1200 					error = ENOENT;
1201 					vfs_opterror(opts,
1202 					    "jail \"%s\" is dying", name);
1203 					goto done_deref;
1204 				}
1205 			}
1206 			/* Update: name must exist if no jid. */
1207 			else if (cuflags == JAIL_UPDATE && pr == NULL) {
1208 				error = ENOENT;
1209 				vfs_opterror(opts, "jail \"%s\" not found",
1210 				    name);
1211 				goto done_deref;
1212 			}
1213 		}
1214 	}
1215 	/* Update: must provide a jid or name. */
1216 	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1217 		error = ENOENT;
1218 		vfs_opterror(opts, "update specified no jail");
1219 		goto done_deref;
1220 	}
1221 
1222 	/* If there's no prison to update, create a new one and link it in. */
1223 	created = pr == NULL;
1224 	if (created) {
1225 		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1226 			if (tpr->pr_childcount >= tpr->pr_childmax) {
1227 				error = EPERM;
1228 				vfs_opterror(opts, "prison limit exceeded");
1229 				goto done_deref;
1230 			}
1231 		if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) {
1232 			error = EAGAIN;
1233 			vfs_opterror(opts, "no available jail IDs");
1234 			goto done_deref;
1235 		}
1236 
1237 		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1238 		pr->pr_state = PRISON_STATE_INVALID;
1239 		refcount_init(&pr->pr_ref, 1);
1240 		refcount_init(&pr->pr_uref, 0);
1241 		drflags |= PD_DEREF;
1242 		LIST_INIT(&pr->pr_children);
1243 		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1244 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
1245 
1246 		pr->pr_id = jid;
1247 		if (inspr != NULL)
1248 			TAILQ_INSERT_BEFORE(inspr, pr, pr_list);
1249 		else
1250 			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1251 
1252 		pr->pr_parent = ppr;
1253 		prison_hold(ppr);
1254 		prison_proc_hold(ppr);
1255 		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1256 		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1257 			tpr->pr_childcount++;
1258 
1259 		/* Set some default values, and inherit some from the parent. */
1260 		if (namelc == NULL)
1261 			namelc = "";
1262 		if (path == NULL) {
1263 			path = "/";
1264 			root = mypr->pr_root;
1265 			vref(root);
1266 		}
1267 		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1268 		pr->pr_flags |= PR_HOST;
1269 #if defined(INET) || defined(INET6)
1270 #ifdef VIMAGE
1271 		if (!(pr_flags & PR_VNET))
1272 #endif
1273 		{
1274 #ifdef INET
1275 			if (!(ch_flags & PR_IP4_USER))
1276 				pr->pr_flags |= PR_IP4 | PR_IP4_USER;
1277 			else if (!(pr_flags & PR_IP4_USER)) {
1278 				pr->pr_flags |= ppr->pr_flags & PR_IP4;
1279 				if (ppr->pr_ip4 != NULL) {
1280 					pr->pr_ip4s = ppr->pr_ip4s;
1281 					pr->pr_ip4 = malloc(pr->pr_ip4s *
1282 					    sizeof(struct in_addr), M_PRISON,
1283 					    M_WAITOK);
1284 					bcopy(ppr->pr_ip4, pr->pr_ip4,
1285 					    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1286 				}
1287 			}
1288 #endif
1289 #ifdef INET6
1290 			if (!(ch_flags & PR_IP6_USER))
1291 				pr->pr_flags |= PR_IP6 | PR_IP6_USER;
1292 			else if (!(pr_flags & PR_IP6_USER)) {
1293 				pr->pr_flags |= ppr->pr_flags & PR_IP6;
1294 				if (ppr->pr_ip6 != NULL) {
1295 					pr->pr_ip6s = ppr->pr_ip6s;
1296 					pr->pr_ip6 = malloc(pr->pr_ip6s *
1297 					    sizeof(struct in6_addr), M_PRISON,
1298 					    M_WAITOK);
1299 					bcopy(ppr->pr_ip6, pr->pr_ip6,
1300 					    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1301 				}
1302 			}
1303 #endif
1304 		}
1305 #endif
1306 		/* Source address selection is always on by default. */
1307 		pr->pr_flags |= _PR_IP_SADDRSEL;
1308 
1309 		pr->pr_securelevel = ppr->pr_securelevel;
1310 		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1311 		pr->pr_enforce_statfs = jail_default_enforce_statfs;
1312 		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
1313 
1314 		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
1315 		if (osrelstr == NULL)
1316 			strlcpy(pr->pr_osrelease, ppr->pr_osrelease,
1317 			    sizeof(pr->pr_osrelease));
1318 		else
1319 			strlcpy(pr->pr_osrelease, osrelstr,
1320 			    sizeof(pr->pr_osrelease));
1321 
1322 #ifdef VIMAGE
1323 		/* Allocate a new vnet if specified. */
1324 		pr->pr_vnet = (pr_flags & PR_VNET)
1325 		    ? vnet_alloc() : ppr->pr_vnet;
1326 #endif
1327 		/*
1328 		 * Allocate a dedicated cpuset for each jail.
1329 		 * Unlike other initial settings, this may return an error.
1330 		 */
1331 		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1332 		if (error)
1333 			goto done_deref;
1334 
1335 		mtx_lock(&pr->pr_mtx);
1336 		drflags |= PD_LOCKED;
1337 	} else {
1338 		/*
1339 		 * Grab a reference for existing prisons, to ensure they
1340 		 * continue to exist for the duration of the call.
1341 		 */
1342 		prison_hold(pr);
1343 		drflags |= PD_DEREF;
1344 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
1345 		if ((pr->pr_flags & PR_VNET) &&
1346 		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1347 			error = EINVAL;
1348 			vfs_opterror(opts,
1349 			    "vnet jails cannot have IP address restrictions");
1350 			goto done_deref;
1351 		}
1352 #endif
1353 #ifdef INET
1354 		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1355 			error = EINVAL;
1356 			vfs_opterror(opts,
1357 			    "ip4 cannot be changed after creation");
1358 			goto done_deref;
1359 		}
1360 #endif
1361 #ifdef INET6
1362 		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1363 			error = EINVAL;
1364 			vfs_opterror(opts,
1365 			    "ip6 cannot be changed after creation");
1366 			goto done_deref;
1367 		}
1368 #endif
1369 	}
1370 
1371 	/* Do final error checking before setting anything. */
1372 	if (gotslevel) {
1373 		if (slevel < ppr->pr_securelevel) {
1374 			error = EPERM;
1375 			goto done_deref;
1376 		}
1377 	}
1378 	if (gotchildmax) {
1379 		if (childmax >= ppr->pr_childmax) {
1380 			error = EPERM;
1381 			goto done_deref;
1382 		}
1383 	}
1384 	if (gotenforce) {
1385 		if (enforce < ppr->pr_enforce_statfs) {
1386 			error = EPERM;
1387 			goto done_deref;
1388 		}
1389 	}
1390 	if (gotrsnum) {
1391 		/*
1392 		 * devfs_rsnum is a uint16_t
1393 		 */
1394 		if (rsnum < 0 || rsnum > 65535) {
1395 			error = EINVAL;
1396 			goto done_deref;
1397 		}
1398 		/*
1399 		 * Nested jails always inherit parent's devfs ruleset
1400 		 */
1401 		if (jailed(td->td_ucred)) {
1402 			if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
1403 				error = EPERM;
1404 				goto done_deref;
1405 			} else
1406 				rsnum = ppr->pr_devfs_rsnum;
1407 		}
1408 	}
1409 #ifdef INET
1410 	if (ip4s > 0) {
1411 		if (ppr->pr_flags & PR_IP4) {
1412 			/*
1413 			 * Make sure the new set of IP addresses is a
1414 			 * subset of the parent's list.  Don't worry
1415 			 * about the parent being unlocked, as any
1416 			 * setting is done with allprison_lock held.
1417 			 */
1418 			for (ij = 0; ij < ppr->pr_ip4s; ij++)
1419 				if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
1420 					break;
1421 			if (ij == ppr->pr_ip4s) {
1422 				error = EPERM;
1423 				goto done_deref;
1424 			}
1425 			if (ip4s > 1) {
1426 				for (ii = ij = 1; ii < ip4s; ii++) {
1427 					if (ip4[ii].s_addr ==
1428 					    ppr->pr_ip4[0].s_addr)
1429 						continue;
1430 					for (; ij < ppr->pr_ip4s; ij++)
1431 						if (ip4[ii].s_addr ==
1432 						    ppr->pr_ip4[ij].s_addr)
1433 							break;
1434 					if (ij == ppr->pr_ip4s)
1435 						break;
1436 				}
1437 				if (ij == ppr->pr_ip4s) {
1438 					error = EPERM;
1439 					goto done_deref;
1440 				}
1441 			}
1442 		}
1443 		/*
1444 		 * Check for conflicting IP addresses.  We permit them
1445 		 * if there is no more than one IP on each jail.  If
1446 		 * there is a duplicate on a jail with more than one
1447 		 * IP stop checking and return error.
1448 		 */
1449 #ifdef VIMAGE
1450 		for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
1451 			if (tppr->pr_flags & PR_VNET)
1452 				break;
1453 #else
1454 		tppr = &prison0;
1455 #endif
1456 		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1457 			if (tpr == pr ||
1458 #ifdef VIMAGE
1459 			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1460 #endif
1461 			    !prison_isalive(tpr)) {
1462 				descend = 0;
1463 				continue;
1464 			}
1465 			if (!(tpr->pr_flags & PR_IP4_USER))
1466 				continue;
1467 			descend = 0;
1468 			if (tpr->pr_ip4 == NULL ||
1469 			    (ip4s == 1 && tpr->pr_ip4s == 1))
1470 				continue;
1471 			for (ii = 0; ii < ip4s; ii++) {
1472 				if (prison_check_ip4_locked(tpr, &ip4[ii]) ==
1473 				    0) {
1474 					error = EADDRINUSE;
1475 					vfs_opterror(opts,
1476 					    "IPv4 addresses clash");
1477 					goto done_deref;
1478 				}
1479 			}
1480 		}
1481 	}
1482 #endif
1483 #ifdef INET6
1484 	if (ip6s > 0) {
1485 		if (ppr->pr_flags & PR_IP6) {
1486 			/*
1487 			 * Make sure the new set of IP addresses is a
1488 			 * subset of the parent's list.
1489 			 */
1490 			for (ij = 0; ij < ppr->pr_ip6s; ij++)
1491 				if (IN6_ARE_ADDR_EQUAL(&ip6[0],
1492 				    &ppr->pr_ip6[ij]))
1493 					break;
1494 			if (ij == ppr->pr_ip6s) {
1495 				error = EPERM;
1496 				goto done_deref;
1497 			}
1498 			if (ip6s > 1) {
1499 				for (ii = ij = 1; ii < ip6s; ii++) {
1500 					if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
1501 					     &ppr->pr_ip6[0]))
1502 						continue;
1503 					for (; ij < ppr->pr_ip6s; ij++)
1504 						if (IN6_ARE_ADDR_EQUAL(
1505 						    &ip6[ii], &ppr->pr_ip6[ij]))
1506 							break;
1507 					if (ij == ppr->pr_ip6s)
1508 						break;
1509 				}
1510 				if (ij == ppr->pr_ip6s) {
1511 					error = EPERM;
1512 					goto done_deref;
1513 				}
1514 			}
1515 		}
1516 		/* Check for conflicting IP addresses. */
1517 #ifdef VIMAGE
1518 		for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
1519 			if (tppr->pr_flags & PR_VNET)
1520 				break;
1521 #else
1522 		tppr = &prison0;
1523 #endif
1524 		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1525 			if (tpr == pr ||
1526 #ifdef VIMAGE
1527 			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1528 #endif
1529 			    !prison_isalive(tpr)) {
1530 				descend = 0;
1531 				continue;
1532 			}
1533 			if (!(tpr->pr_flags & PR_IP6_USER))
1534 				continue;
1535 			descend = 0;
1536 			if (tpr->pr_ip6 == NULL ||
1537 			    (ip6s == 1 && tpr->pr_ip6s == 1))
1538 				continue;
1539 			for (ii = 0; ii < ip6s; ii++) {
1540 				if (prison_check_ip6_locked(tpr, &ip6[ii]) ==
1541 				    0) {
1542 					error = EADDRINUSE;
1543 					vfs_opterror(opts,
1544 					    "IPv6 addresses clash");
1545 					goto done_deref;
1546 				}
1547 			}
1548 		}
1549 	}
1550 #endif
1551 	onamelen = namelen = 0;
1552 	if (namelc != NULL) {
1553 		/* Give a default name of the jid.  Also allow the name to be
1554 		 * explicitly the jid - but not any other number, and only in
1555 		 * normal form (no leading zero/etc).
1556 		 */
1557 		if (namelc[0] == '\0')
1558 			snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
1559 		else if ((strtoul(namelc, &p, 10) != jid ||
1560 			  namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
1561 			error = EINVAL;
1562 			vfs_opterror(opts,
1563 			    "name cannot be numeric (unless it is the jid)");
1564 			goto done_deref;
1565 		}
1566 		/*
1567 		 * Make sure the name isn't too long for the prison or its
1568 		 * children.
1569 		 */
1570 		pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1571 		onamelen = strlen(pr->pr_name + pnamelen);
1572 		namelen = strlen(namelc);
1573 		if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
1574 			error = ENAMETOOLONG;
1575 			goto done_deref;
1576 		}
1577 		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1578 			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1579 			    sizeof(pr->pr_name)) {
1580 				error = ENAMETOOLONG;
1581 				goto done_deref;
1582 			}
1583 		}
1584 	}
1585 	pr_allow_diff = pr_allow & ~ppr->pr_allow;
1586 	if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) {
1587 		error = EPERM;
1588 		goto done_deref;
1589 	}
1590 
1591 	/*
1592 	 * Let modules check their parameters.  This requires unlocking and
1593 	 * then re-locking the prison, but this is still a valid state as long
1594 	 * as allprison_lock remains xlocked.
1595 	 */
1596 	mtx_unlock(&pr->pr_mtx);
1597 	drflags &= ~PD_LOCKED;
1598 	error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
1599 	if (error != 0)
1600 		goto done_deref;
1601 	mtx_lock(&pr->pr_mtx);
1602 	drflags |= PD_LOCKED;
1603 
1604 	/* At this point, all valid parameters should have been noted. */
1605 	TAILQ_FOREACH(opt, opts, link) {
1606 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
1607 			error = EINVAL;
1608 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
1609 			goto done_deref;
1610 		}
1611 	}
1612 
1613 	/* Set the parameters of the prison. */
1614 #ifdef INET
1615 	redo_ip4 = 0;
1616 	if (pr_flags & PR_IP4_USER) {
1617 		pr->pr_flags |= PR_IP4;
1618 		free(pr->pr_ip4, M_PRISON);
1619 		pr->pr_ip4s = ip4s;
1620 		pr->pr_ip4 = ip4;
1621 		ip4 = NULL;
1622 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1623 #ifdef VIMAGE
1624 			if (tpr->pr_flags & PR_VNET) {
1625 				descend = 0;
1626 				continue;
1627 			}
1628 #endif
1629 			if (prison_restrict_ip4(tpr, NULL)) {
1630 				redo_ip4 = 1;
1631 				descend = 0;
1632 			}
1633 		}
1634 	}
1635 #endif
1636 #ifdef INET6
1637 	redo_ip6 = 0;
1638 	if (pr_flags & PR_IP6_USER) {
1639 		pr->pr_flags |= PR_IP6;
1640 		free(pr->pr_ip6, M_PRISON);
1641 		pr->pr_ip6s = ip6s;
1642 		pr->pr_ip6 = ip6;
1643 		ip6 = NULL;
1644 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1645 #ifdef VIMAGE
1646 			if (tpr->pr_flags & PR_VNET) {
1647 				descend = 0;
1648 				continue;
1649 			}
1650 #endif
1651 			if (prison_restrict_ip6(tpr, NULL)) {
1652 				redo_ip6 = 1;
1653 				descend = 0;
1654 			}
1655 		}
1656 	}
1657 #endif
1658 	if (gotslevel) {
1659 		pr->pr_securelevel = slevel;
1660 		/* Set all child jails to be at least this level. */
1661 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1662 			if (tpr->pr_securelevel < slevel)
1663 				tpr->pr_securelevel = slevel;
1664 	}
1665 	if (gotchildmax) {
1666 		pr->pr_childmax = childmax;
1667 		/* Set all child jails to under this limit. */
1668 		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1669 			if (tpr->pr_childmax > childmax - level)
1670 				tpr->pr_childmax = childmax > level
1671 				    ? childmax - level : 0;
1672 	}
1673 	if (gotenforce) {
1674 		pr->pr_enforce_statfs = enforce;
1675 		/* Pass this restriction on to the children. */
1676 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1677 			if (tpr->pr_enforce_statfs < enforce)
1678 				tpr->pr_enforce_statfs = enforce;
1679 	}
1680 	if (gotrsnum) {
1681 		pr->pr_devfs_rsnum = rsnum;
1682 		/* Pass this restriction on to the children. */
1683 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1684 			tpr->pr_devfs_rsnum = rsnum;
1685 	}
1686 	if (namelc != NULL) {
1687 		if (ppr == &prison0)
1688 			strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
1689 		else
1690 			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1691 			    ppr->pr_name, namelc);
1692 		/* Change this component of child names. */
1693 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1694 			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1695 			    strlen(tpr->pr_name + onamelen) + 1);
1696 			bcopy(pr->pr_name, tpr->pr_name, namelen);
1697 		}
1698 	}
1699 	if (path != NULL) {
1700 		/* Try to keep a real-rooted full pathname. */
1701 		strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1702 		pr->pr_root = root;
1703 		root = NULL;
1704 	}
1705 	if (PR_HOST & ch_flags & ~pr_flags) {
1706 		if (pr->pr_flags & PR_HOST) {
1707 			/*
1708 			 * Copy the parent's host info.  As with pr_ip4 above,
1709 			 * the lack of a lock on the parent is not a problem;
1710 			 * it is always set with allprison_lock at least
1711 			 * shared, and is held exclusively here.
1712 			 */
1713 			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1714 			    sizeof(pr->pr_hostname));
1715 			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1716 			    sizeof(pr->pr_domainname));
1717 			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1718 			    sizeof(pr->pr_hostuuid));
1719 			pr->pr_hostid = pr->pr_parent->pr_hostid;
1720 		}
1721 	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1722 		/* Set this prison, and any descendants without PR_HOST. */
1723 		if (host != NULL)
1724 			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1725 		if (domain != NULL)
1726 			strlcpy(pr->pr_domainname, domain,
1727 			    sizeof(pr->pr_domainname));
1728 		if (uuid != NULL)
1729 			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1730 		if (gothid)
1731 			pr->pr_hostid = hid;
1732 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1733 			if (tpr->pr_flags & PR_HOST)
1734 				descend = 0;
1735 			else {
1736 				if (host != NULL)
1737 					strlcpy(tpr->pr_hostname,
1738 					    pr->pr_hostname,
1739 					    sizeof(tpr->pr_hostname));
1740 				if (domain != NULL)
1741 					strlcpy(tpr->pr_domainname,
1742 					    pr->pr_domainname,
1743 					    sizeof(tpr->pr_domainname));
1744 				if (uuid != NULL)
1745 					strlcpy(tpr->pr_hostuuid,
1746 					    pr->pr_hostuuid,
1747 					    sizeof(tpr->pr_hostuuid));
1748 				if (gothid)
1749 					tpr->pr_hostid = hid;
1750 			}
1751 		}
1752 	}
1753 	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1754 	if ((tallow = ch_allow & ~pr_allow))
1755 		prison_set_allow_locked(pr, tallow, 0);
1756 	/*
1757 	 * Persistent prisons get an extra reference, and prisons losing their
1758 	 * persist flag lose that reference.
1759 	 */
1760 	born = !prison_isalive(pr);
1761 	if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) {
1762 		if (pr_flags & PR_PERSIST) {
1763 			prison_hold(pr);
1764 			/*
1765 			 * This may make a dead prison alive again, but wait
1766 			 * to label it as such until after OSD calls have had
1767 			 * a chance to run (and perhaps to fail).
1768 			 */
1769 			refcount_acquire(&pr->pr_uref);
1770 		} else {
1771 			drflags |= PD_DEUREF;
1772 			prison_free_not_last(pr);
1773 		}
1774 	}
1775 	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1776 	mtx_unlock(&pr->pr_mtx);
1777 	drflags &= ~PD_LOCKED;
1778 	/*
1779 	 * Any errors past this point will need to de-persist newly created
1780 	 * prisons, as well as call remove methods.
1781 	 */
1782 	if (born)
1783 		drflags |= PD_KILL;
1784 
1785 #ifdef RACCT
1786 	if (racct_enable && created)
1787 		prison_racct_attach(pr);
1788 #endif
1789 
1790 	/* Locks may have prevented a complete restriction of child IP
1791 	 * addresses.  If so, allocate some more memory and try again.
1792 	 */
1793 #ifdef INET
1794 	while (redo_ip4) {
1795 		ip4s = pr->pr_ip4s;
1796 		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
1797 		mtx_lock(&pr->pr_mtx);
1798 		redo_ip4 = 0;
1799 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1800 #ifdef VIMAGE
1801 			if (tpr->pr_flags & PR_VNET) {
1802 				descend = 0;
1803 				continue;
1804 			}
1805 #endif
1806 			if (prison_restrict_ip4(tpr, ip4)) {
1807 				if (ip4 != NULL)
1808 					ip4 = NULL;
1809 				else
1810 					redo_ip4 = 1;
1811 			}
1812 		}
1813 		mtx_unlock(&pr->pr_mtx);
1814 	}
1815 #endif
1816 #ifdef INET6
1817 	while (redo_ip6) {
1818 		ip6s = pr->pr_ip6s;
1819 		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
1820 		mtx_lock(&pr->pr_mtx);
1821 		redo_ip6 = 0;
1822 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1823 #ifdef VIMAGE
1824 			if (tpr->pr_flags & PR_VNET) {
1825 				descend = 0;
1826 				continue;
1827 			}
1828 #endif
1829 			if (prison_restrict_ip6(tpr, ip6)) {
1830 				if (ip6 != NULL)
1831 					ip6 = NULL;
1832 				else
1833 					redo_ip6 = 1;
1834 			}
1835 		}
1836 		mtx_unlock(&pr->pr_mtx);
1837 	}
1838 #endif
1839 
1840 	/* Let the modules do their work. */
1841 	if (born) {
1842 		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1843 		if (error)
1844 			goto done_deref;
1845 	}
1846 	error = osd_jail_call(pr, PR_METHOD_SET, opts);
1847 	if (error)
1848 		goto done_deref;
1849 
1850 	/*
1851 	 * A new prison is now ready to be seen; either it has gained a user
1852 	 * reference via persistence, or is about to gain one via attachment.
1853 	 */
1854 	if (born) {
1855 		drflags = prison_lock_xlock(pr, drflags);
1856 		pr->pr_state = PRISON_STATE_ALIVE;
1857 	}
1858 
1859 	/* Attach this process to the prison if requested. */
1860 	if (flags & JAIL_ATTACH) {
1861 		error = do_jail_attach(td, pr,
1862 		    prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS));
1863 		drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED);
1864 		if (error) {
1865 			vfs_opterror(opts, "attach failed");
1866 			goto done_deref;
1867 		}
1868 	}
1869 
1870 #ifdef RACCT
1871 	if (racct_enable && !created) {
1872 		if (drflags & PD_LOCKED) {
1873 			mtx_unlock(&pr->pr_mtx);
1874 			drflags &= ~PD_LOCKED;
1875 		}
1876 		if (drflags & PD_LIST_XLOCKED) {
1877 			sx_xunlock(&allprison_lock);
1878 			drflags &= ~PD_LIST_XLOCKED;
1879 		}
1880 		prison_racct_modify(pr);
1881 	}
1882 #endif
1883 
1884 	drflags &= ~PD_KILL;
1885 	td->td_retval[0] = pr->pr_id;
1886 
1887  done_deref:
1888 	/* Release any temporary prison holds and/or locks. */
1889 	if (pr != NULL)
1890 		prison_deref(pr, drflags);
1891 	else if (drflags & PD_LIST_SLOCKED)
1892 		sx_sunlock(&allprison_lock);
1893 	else if (drflags & PD_LIST_XLOCKED)
1894 		sx_xunlock(&allprison_lock);
1895 	if (root != NULL)
1896 		vrele(root);
1897  done_errmsg:
1898 	if (error) {
1899 		/* Write the error message back to userspace. */
1900 		if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
1901 		    &errmsg_len) == 0 && errmsg_len > 0) {
1902 			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1903 			if (optuio->uio_segflg == UIO_SYSSPACE)
1904 				bcopy(errmsg,
1905 				    optuio->uio_iov[errmsg_pos].iov_base,
1906 				    errmsg_len);
1907 			else
1908 				copyout(errmsg,
1909 				    optuio->uio_iov[errmsg_pos].iov_base,
1910 				    errmsg_len);
1911 		}
1912 	}
1913  done_free:
1914 #ifdef INET
1915 	free(ip4, M_PRISON);
1916 #endif
1917 #ifdef INET6
1918 	free(ip6, M_PRISON);
1919 #endif
1920 	if (g_path != NULL)
1921 		free(g_path, M_TEMP);
1922 	vfs_freeopts(opts);
1923 	return (error);
1924 }
1925 
1926 /*
1927  * Find the next available prison ID.  Return the ID on success, or zero
1928  * on failure.  Also set a pointer to the allprison list entry the prison
1929  * should be inserted before.
1930  */
1931 static int
1932 get_next_prid(struct prison **insprp)
1933 {
1934 	struct prison *inspr;
1935 	int jid, maxid;
1936 
1937 	jid = lastprid % JAIL_MAX + 1;
1938 	if (TAILQ_EMPTY(&allprison) ||
1939 	    TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) {
1940 		/*
1941 		 * A common case is for all jails to be implicitly numbered,
1942 		 * which means they'll go on the end of the list, at least
1943 		 * for the first JAIL_MAX times.
1944 		 */
1945 		inspr = NULL;
1946 	} else {
1947 		/*
1948 		 * Take two passes through the allprison list: first starting
1949 		 * with the proposed jid, then ending with it.
1950 		 */
1951 		for (maxid = JAIL_MAX; maxid != 0; ) {
1952 			TAILQ_FOREACH(inspr, &allprison, pr_list) {
1953 				if (inspr->pr_id < jid)
1954 					continue;
1955 				if (inspr->pr_id > jid) {
1956 					/* Found an opening. */
1957 					maxid = 0;
1958 					break;
1959 				}
1960 				if (++jid > maxid) {
1961 					if (lastprid == maxid || lastprid == 0)
1962 					{
1963 						/*
1964 						 * The entire legal range
1965 						 * has been traversed
1966 						 */
1967 						return 0;
1968 					}
1969 					/* Try again from the start. */
1970 					jid = 1;
1971 					maxid = lastprid;
1972 					break;
1973 				}
1974 			}
1975 			if (inspr == NULL) {
1976 				/* Found room at the end of the list. */
1977 				break;
1978 			}
1979 		}
1980 	}
1981 	*insprp = inspr;
1982 	lastprid = jid;
1983 	return (jid);
1984 }
1985 
1986 /*
1987  * struct jail_get_args {
1988  *	struct iovec *iovp;
1989  *	unsigned int iovcnt;
1990  *	int flags;
1991  * };
1992  */
1993 int
1994 sys_jail_get(struct thread *td, struct jail_get_args *uap)
1995 {
1996 	struct uio *auio;
1997 	int error;
1998 
1999 	/* Check that we have an even number of iovecs. */
2000 	if (uap->iovcnt & 1)
2001 		return (EINVAL);
2002 
2003 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
2004 	if (error)
2005 		return (error);
2006 	error = kern_jail_get(td, auio, uap->flags);
2007 	if (error == 0)
2008 		error = copyout(auio->uio_iov, uap->iovp,
2009 		    uap->iovcnt * sizeof (struct iovec));
2010 	free(auio, M_IOV);
2011 	return (error);
2012 }
2013 
2014 int
2015 kern_jail_get(struct thread *td, struct uio *optuio, int flags)
2016 {
2017 	struct bool_flags *bf;
2018 	struct jailsys_flags *jsf;
2019 	struct prison *pr, *mypr;
2020 	struct vfsopt *opt;
2021 	struct vfsoptlist *opts;
2022 	char *errmsg, *name;
2023 	int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos;
2024 	unsigned f;
2025 
2026 	if (flags & ~JAIL_GET_MASK)
2027 		return (EINVAL);
2028 
2029 	/* Get the parameter list. */
2030 	error = vfs_buildopts(optuio, &opts);
2031 	if (error)
2032 		return (error);
2033 	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
2034 	mypr = td->td_ucred->cr_prison;
2035 	pr = NULL;
2036 
2037 	/*
2038 	 * Find the prison specified by one of: lastjid, jid, name.
2039 	 */
2040 	sx_slock(&allprison_lock);
2041 	drflags = PD_LIST_SLOCKED;
2042 	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
2043 	if (error == 0) {
2044 		TAILQ_FOREACH(pr, &allprison, pr_list) {
2045 			if (pr->pr_id > jid &&
2046 			    ((flags & JAIL_DYING) || prison_isalive(pr)) &&
2047 			    prison_ischild(mypr, pr)) {
2048 				mtx_lock(&pr->pr_mtx);
2049 				drflags |= PD_LOCKED;
2050 				goto found_prison;
2051 			}
2052 		}
2053 		error = ENOENT;
2054 		vfs_opterror(opts, "no jail after %d", jid);
2055 		goto done;
2056 	} else if (error != ENOENT)
2057 		goto done;
2058 
2059 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
2060 	if (error == 0) {
2061 		if (jid != 0) {
2062 			pr = prison_find_child(mypr, jid);
2063 			if (pr != NULL) {
2064 				drflags |= PD_LOCKED;
2065 				if (!(prison_isalive(pr) ||
2066 				    (flags & JAIL_DYING))) {
2067 					error = ENOENT;
2068 					vfs_opterror(opts, "jail %d is dying",
2069 					    jid);
2070 					goto done;
2071 				}
2072 				goto found_prison;
2073 			}
2074 			error = ENOENT;
2075 			vfs_opterror(opts, "jail %d not found", jid);
2076 			goto done;
2077 		}
2078 	} else if (error != ENOENT)
2079 		goto done;
2080 
2081 	error = vfs_getopt(opts, "name", (void **)&name, &len);
2082 	if (error == 0) {
2083 		if (len == 0 || name[len - 1] != '\0') {
2084 			error = EINVAL;
2085 			goto done;
2086 		}
2087 		pr = prison_find_name(mypr, name);
2088 		if (pr != NULL) {
2089 			drflags |= PD_LOCKED;
2090 			if (!(prison_isalive(pr) || (flags & JAIL_DYING))) {
2091 				error = ENOENT;
2092 				vfs_opterror(opts, "jail \"%s\" is dying",
2093 				    name);
2094 				goto done;
2095 			}
2096 			goto found_prison;
2097 		}
2098 		error = ENOENT;
2099 		vfs_opterror(opts, "jail \"%s\" not found", name);
2100 		goto done;
2101 	} else if (error != ENOENT)
2102 		goto done;
2103 
2104 	vfs_opterror(opts, "no jail specified");
2105 	error = ENOENT;
2106 	goto done;
2107 
2108  found_prison:
2109 	/* Get the parameters of the prison. */
2110 	prison_hold(pr);
2111 	drflags |= PD_DEREF;
2112 	td->td_retval[0] = pr->pr_id;
2113 	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
2114 	if (error != 0 && error != ENOENT)
2115 		goto done;
2116 	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
2117 	error = vfs_setopt(opts, "parent", &i, sizeof(i));
2118 	if (error != 0 && error != ENOENT)
2119 		goto done;
2120 	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
2121 	if (error != 0 && error != ENOENT)
2122 		goto done;
2123 	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
2124 	    sizeof(pr->pr_cpuset->cs_id));
2125 	if (error != 0 && error != ENOENT)
2126 		goto done;
2127 	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
2128 	if (error != 0 && error != ENOENT)
2129 		goto done;
2130 #ifdef INET
2131 	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
2132 	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
2133 	if (error != 0 && error != ENOENT)
2134 		goto done;
2135 #endif
2136 #ifdef INET6
2137 	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
2138 	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
2139 	if (error != 0 && error != ENOENT)
2140 		goto done;
2141 #endif
2142 	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
2143 	    sizeof(pr->pr_securelevel));
2144 	if (error != 0 && error != ENOENT)
2145 		goto done;
2146 	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
2147 	    sizeof(pr->pr_childcount));
2148 	if (error != 0 && error != ENOENT)
2149 		goto done;
2150 	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
2151 	    sizeof(pr->pr_childmax));
2152 	if (error != 0 && error != ENOENT)
2153 		goto done;
2154 	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
2155 	if (error != 0 && error != ENOENT)
2156 		goto done;
2157 	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
2158 	if (error != 0 && error != ENOENT)
2159 		goto done;
2160 	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
2161 	if (error != 0 && error != ENOENT)
2162 		goto done;
2163 #ifdef COMPAT_FREEBSD32
2164 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
2165 		uint32_t hid32 = pr->pr_hostid;
2166 
2167 		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
2168 	} else
2169 #endif
2170 	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
2171 	    sizeof(pr->pr_hostid));
2172 	if (error != 0 && error != ENOENT)
2173 		goto done;
2174 	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
2175 	    sizeof(pr->pr_enforce_statfs));
2176 	if (error != 0 && error != ENOENT)
2177 		goto done;
2178 	error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
2179 	    sizeof(pr->pr_devfs_rsnum));
2180 	if (error != 0 && error != ENOENT)
2181 		goto done;
2182 	for (bf = pr_flag_bool;
2183 	     bf < pr_flag_bool + nitems(pr_flag_bool);
2184 	     bf++) {
2185 		i = (pr->pr_flags & bf->flag) ? 1 : 0;
2186 		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2187 		if (error != 0 && error != ENOENT)
2188 			goto done;
2189 		i = !i;
2190 		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2191 		if (error != 0 && error != ENOENT)
2192 			goto done;
2193 	}
2194 	for (jsf = pr_flag_jailsys;
2195 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
2196 	     jsf++) {
2197 		f = pr->pr_flags & (jsf->disable | jsf->new);
2198 		i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE
2199 		    : (f == jsf->new) ? JAIL_SYS_NEW
2200 		    : JAIL_SYS_INHERIT;
2201 		error = vfs_setopt(opts, jsf->name, &i, sizeof(i));
2202 		if (error != 0 && error != ENOENT)
2203 			goto done;
2204 	}
2205 	for (bf = pr_flag_allow;
2206 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
2207 		atomic_load_int(&bf->flag) != 0;
2208 	     bf++) {
2209 		i = (pr->pr_allow & bf->flag) ? 1 : 0;
2210 		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2211 		if (error != 0 && error != ENOENT)
2212 			goto done;
2213 		i = !i;
2214 		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2215 		if (error != 0 && error != ENOENT)
2216 			goto done;
2217 	}
2218 	i = !prison_isalive(pr);
2219 	error = vfs_setopt(opts, "dying", &i, sizeof(i));
2220 	if (error != 0 && error != ENOENT)
2221 		goto done;
2222 	i = !i;
2223 	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2224 	if (error != 0 && error != ENOENT)
2225 		goto done;
2226 	error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
2227 	    sizeof(pr->pr_osreldate));
2228 	if (error != 0 && error != ENOENT)
2229 		goto done;
2230 	error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
2231 	if (error != 0 && error != ENOENT)
2232 		goto done;
2233 
2234 	/* Get the module parameters. */
2235 	mtx_unlock(&pr->pr_mtx);
2236 	drflags &= ~PD_LOCKED;
2237 	error = osd_jail_call(pr, PR_METHOD_GET, opts);
2238 	if (error)
2239 		goto done;
2240 	prison_deref(pr, drflags);
2241 	pr = NULL;
2242 	drflags = 0;
2243 
2244 	/* By now, all parameters should have been noted. */
2245 	TAILQ_FOREACH(opt, opts, link) {
2246 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
2247 			error = EINVAL;
2248 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
2249 			goto done;
2250 		}
2251 	}
2252 
2253 	/* Write the fetched parameters back to userspace. */
2254 	error = 0;
2255 	TAILQ_FOREACH(opt, opts, link) {
2256 		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2257 			pos = 2 * opt->pos + 1;
2258 			optuio->uio_iov[pos].iov_len = opt->len;
2259 			if (opt->value != NULL) {
2260 				if (optuio->uio_segflg == UIO_SYSSPACE) {
2261 					bcopy(opt->value,
2262 					    optuio->uio_iov[pos].iov_base,
2263 					    opt->len);
2264 				} else {
2265 					error = copyout(opt->value,
2266 					    optuio->uio_iov[pos].iov_base,
2267 					    opt->len);
2268 					if (error)
2269 						break;
2270 				}
2271 			}
2272 		}
2273 	}
2274 
2275  done:
2276 	/* Release any temporary prison holds and/or locks. */
2277 	if (pr != NULL)
2278 		prison_deref(pr, drflags);
2279 	else if (drflags & PD_LIST_SLOCKED)
2280 		sx_sunlock(&allprison_lock);
2281 	if (error && errmsg_pos >= 0) {
2282 		/* Write the error message back to userspace. */
2283 		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2284 		errmsg_pos = 2 * errmsg_pos + 1;
2285 		if (errmsg_len > 0) {
2286 			if (optuio->uio_segflg == UIO_SYSSPACE)
2287 				bcopy(errmsg,
2288 				    optuio->uio_iov[errmsg_pos].iov_base,
2289 				    errmsg_len);
2290 			else
2291 				copyout(errmsg,
2292 				    optuio->uio_iov[errmsg_pos].iov_base,
2293 				    errmsg_len);
2294 		}
2295 	}
2296 	vfs_freeopts(opts);
2297 	return (error);
2298 }
2299 
2300 /*
2301  * struct jail_remove_args {
2302  *	int jid;
2303  * };
2304  */
2305 int
2306 sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
2307 {
2308 	struct prison *pr;
2309 	int error;
2310 
2311 	error = priv_check(td, PRIV_JAIL_REMOVE);
2312 	if (error)
2313 		return (error);
2314 
2315 	sx_xlock(&allprison_lock);
2316 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2317 	if (pr == NULL) {
2318 		sx_xunlock(&allprison_lock);
2319 		return (EINVAL);
2320 	}
2321 	if (!prison_isalive(pr)) {
2322 		/* Silently ignore already-dying prisons. */
2323 		mtx_unlock(&pr->pr_mtx);
2324 		sx_xunlock(&allprison_lock);
2325 		return (0);
2326 	}
2327 	prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED);
2328 	return (0);
2329 }
2330 
2331 /*
2332  * struct jail_attach_args {
2333  *	int jid;
2334  * };
2335  */
2336 int
2337 sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
2338 {
2339 	struct prison *pr;
2340 	int error;
2341 
2342 	error = priv_check(td, PRIV_JAIL_ATTACH);
2343 	if (error)
2344 		return (error);
2345 
2346 	sx_slock(&allprison_lock);
2347 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2348 	if (pr == NULL) {
2349 		sx_sunlock(&allprison_lock);
2350 		return (EINVAL);
2351 	}
2352 
2353 	/* Do not allow a process to attach to a prison that is not alive. */
2354 	if (!prison_isalive(pr)) {
2355 		mtx_unlock(&pr->pr_mtx);
2356 		sx_sunlock(&allprison_lock);
2357 		return (EINVAL);
2358 	}
2359 
2360 	return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED));
2361 }
2362 
/*
 * Attach the calling thread's process to the prison pr.
 *
 * On entry the prison mutex must be owned and allprison_lock must be held
 * (both are asserted below); drflags describes that lock state, and only
 * its PD_LOCK_FLAGS bits are kept.  The function takes a reference and a
 * user reference on pr, runs the PR_METHOD_ATTACH module hooks, updates the
 * process cpuset from pr->pr_cpuset, switches to pr->pr_root via
 * change_dir()/pwd_chroot_chdir(), and installs a new credential whose
 * cr_prison is pr.
 *
 * Returns 0 on success, or the error of the first step that failed.
 */
static int
do_jail_attach(struct thread *td, struct prison *pr, int drflags)
{
	struct proc *p;
	struct ucred *newcred, *oldcred;
	int error;

	mtx_assert(&pr->pr_mtx, MA_OWNED);
	sx_assert(&allprison_lock, SX_LOCKED);
	/* Keep only the lock-state bits of the caller's flags. */
	drflags &= PD_LOCK_FLAGS;
	/*
	 * XXX: Note that there is a slight race here if two threads
	 * in the same privileged process attempt to attach to two
	 * different jails at the same time.  It is important for
	 * user processes not to do this, or they might end up with
	 * a process root from one prison, but attached to the jail
	 * of another.
	 */
	/*
	 * Take a reference and a user reference on the target.  The error
	 * paths give them back through prison_deref(pr, drflags).
	 */
	prison_hold(pr);
	refcount_acquire(&pr->pr_uref);
	drflags |= PD_DEREF | PD_DEUREF;
	mtx_unlock(&pr->pr_mtx);
	drflags &= ~PD_LOCKED;

	/* Let modules do whatever they need to prepare for attaching. */
	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
	if (error) {
		/* drflags still names the list lock, so it is released too. */
		prison_deref(pr, drflags);
		return (error);
	}
	/* Drop the list lock (shared or exclusive) for the remaining steps. */
	sx_unlock(&allprison_lock);
	drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED);

	/*
	 * Reparent the newly attached process to this jail.
	 */
	p = td->td_proc;
	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
	if (error)
		goto e_revert_osd;

	/* The root vnode stays locked across the directory/MAC checks. */
	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
	if ((error = change_dir(pr->pr_root, td)) != 0)
		goto e_unlock;
#ifdef MAC
	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
		goto e_unlock;
#endif
	VOP_UNLOCK(pr->pr_root);
	if ((error = pwd_chroot_chdir(td, pr->pr_root)))
		goto e_revert_osd;

	/* Build a copy of the current credential that points at pr. */
	newcred = crget();
	PROC_LOCK(p);
	oldcred = crcopysafe(p, newcred);
	newcred->cr_prison = pr;
	proc_set_cred(p, newcred);
	setsugid(p);
#ifdef RACCT
	racct_proc_ucred_changed(p, oldcred, newcred);
	/* Extra hold on newcred; its crfree() is in the RCTL section below. */
	crhold(newcred);
#endif
	PROC_UNLOCK(p);
#ifdef RCTL
	rctl_proc_ucred_changed(p, newcred);
	crfree(newcred);
#endif
	/*
	 * The lock bits have been cleared from drflags by now, so this
	 * drops a reference and a user reference on the old prison.
	 */
	prison_deref(oldcred->cr_prison, drflags);
	crfree(oldcred);

	/*
	 * If the prison was killed while changing credentials, die along
	 * with it.
	 */
	if (!prison_isalive(pr)) {
		PROC_LOCK(p);
		kern_psignal(p, SIGKILL);
		PROC_UNLOCK(p);
	}

	return (0);

 e_unlock:
	VOP_UNLOCK(pr->pr_root);
 e_revert_osd:
	/* Tell modules this thread is still in its old jail after all. */
	/* Retake the list lock shared; prison_deref() is told to drop it. */
	sx_slock(&allprison_lock);
	drflags |= PD_LIST_SLOCKED;
	(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
	prison_deref(pr, drflags);
	return (error);
}
2455 
2456 /*
2457  * Returns a locked prison instance, or NULL on failure.
2458  */
2459 struct prison *
2460 prison_find(int prid)
2461 {
2462 	struct prison *pr;
2463 
2464 	sx_assert(&allprison_lock, SX_LOCKED);
2465 	TAILQ_FOREACH(pr, &allprison, pr_list) {
2466 		if (pr->pr_id < prid)
2467 			continue;
2468 		if (pr->pr_id > prid)
2469 			break;
2470 		KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr));
2471 		mtx_lock(&pr->pr_mtx);
2472 		return (pr);
2473 	}
2474 	return (NULL);
2475 }
2476 
2477 /*
2478  * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2479  */
2480 struct prison *
2481 prison_find_child(struct prison *mypr, int prid)
2482 {
2483 	struct prison *pr;
2484 	int descend;
2485 
2486 	sx_assert(&allprison_lock, SX_LOCKED);
2487 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2488 		if (pr->pr_id == prid) {
2489 			KASSERT(prison_isvalid(pr),
2490 			    ("Found invalid prison %p", pr));
2491 			mtx_lock(&pr->pr_mtx);
2492 			return (pr);
2493 		}
2494 	}
2495 	return (NULL);
2496 }
2497 
2498 /*
2499  * Look for the name relative to mypr.  Returns a locked prison or NULL.
2500  */
2501 struct prison *
2502 prison_find_name(struct prison *mypr, const char *name)
2503 {
2504 	struct prison *pr, *deadpr;
2505 	size_t mylen;
2506 	int descend;
2507 
2508 	sx_assert(&allprison_lock, SX_LOCKED);
2509 	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2510 	deadpr = NULL;
2511 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2512 		if (!strcmp(pr->pr_name + mylen, name)) {
2513 			KASSERT(prison_isvalid(pr),
2514 			    ("Found invalid prison %p", pr));
2515 			if (prison_isalive(pr)) {
2516 				mtx_lock(&pr->pr_mtx);
2517 				return (pr);
2518 			}
2519 			deadpr = pr;
2520 		}
2521 	}
2522 	/* There was no valid prison - perhaps there was a dying one. */
2523 	if (deadpr != NULL)
2524 		mtx_lock(&deadpr->pr_mtx);
2525 	return (deadpr);
2526 }
2527 
2528 /*
2529  * See if a prison has the specific flag set.  The prison should be locked,
2530  * unless checking for flags that are only set at jail creation (such as
2531  * PR_IP4 and PR_IP6), or only the single bit is examined, without regard
2532  * to any other prison data.
2533  */
2534 int
2535 prison_flag(struct ucred *cred, unsigned flag)
2536 {
2537 
2538 	return (cred->cr_prison->pr_flags & flag);
2539 }
2540 
2541 int
2542 prison_allow(struct ucred *cred, unsigned flag)
2543 {
2544 
2545 	return ((cred->cr_prison->pr_allow & flag) != 0);
2546 }
2547 
2548 /*
2549  * Hold a prison reference, by incrementing pr_ref.  It is generally
2550  * an error to hold a prison that does not already have a reference.
2551  * A prison record will remain valid as long as it has at least one
2552  * reference, and will not be removed as long as either the prison
2553  * mutex or the allprison lock is held (allprison_lock may be shared).
2554  */
void
prison_hold_locked(struct prison *pr)
{

	/*
	 * Thin wrapper: taking a reference no longer needs the prison
	 * mutex, so this simply forwards to prison_hold().
	 */
	prison_hold(pr);
}
2562 
2563 void
2564 prison_hold(struct prison *pr)
2565 {
2566 #ifdef INVARIANTS
2567 	int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref);
2568 
2569 	KASSERT(was_valid,
2570 	    ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
2571 #else
2572 	refcount_acquire(&pr->pr_ref);
2573 #endif
2574 }
2575 
2576 /*
2577  * Remove a prison reference.  If that was the last reference, the
2578  * prison will be removed (at a later time).
2579  */
2580 void
2581 prison_free_locked(struct prison *pr)
2582 {
2583 
2584 	mtx_assert(&pr->pr_mtx, MA_OWNED);
2585 	/*
2586 	 * Locking is no longer required, but unlock because the caller
2587 	 * expects it.
2588 	 */
2589 	mtx_unlock(&pr->pr_mtx);
2590 	prison_free(pr);
2591 }
2592 
2593 void
2594 prison_free(struct prison *pr)
2595 {
2596 
2597 	KASSERT(refcount_load(&pr->pr_ref) > 0,
2598 	    ("Trying to free dead prison %p (jid=%d).",
2599 	     pr, pr->pr_id));
2600 	if (!refcount_release_if_not_last(&pr->pr_ref)) {
2601 		/*
2602 		 * Don't remove the last reference in this context,
2603 		 * in case there are locks held.
2604 		 */
2605 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2606 	}
2607 }
2608 
2609 static void
2610 prison_free_not_last(struct prison *pr)
2611 {
2612 #ifdef INVARIANTS
2613 	int lastref;
2614 
2615 	KASSERT(refcount_load(&pr->pr_ref) > 0,
2616 	    ("Trying to free dead prison %p (jid=%d).",
2617 	     pr, pr->pr_id));
2618 	lastref = refcount_release(&pr->pr_ref);
2619 	KASSERT(!lastref,
2620 	    ("prison_free_not_last freed last ref on prison %p (jid=%d).",
2621 	     pr, pr->pr_id));
2622 #else
2623 	refcount_release(&pr->pr_ref);
2624 #endif
2625 }
2626 
2627 /*
 * Hold a prison for user visibility, by incrementing pr_uref.
 * It is generally an error to hold a prison that isn't already
 * user-visible, except through the jail system calls.  It is also
2631  * an error to hold an invalid prison.  A prison record will remain
2632  * alive as long as it has at least one user reference, and will not
2633  * be set to the dying state until the prison mutex and allprison_lock
2634  * are both freed.
2635  */
2636 void
2637 prison_proc_hold(struct prison *pr)
2638 {
2639 #ifdef INVARIANTS
2640 	int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref);
2641 
2642 	KASSERT(was_alive,
2643 	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2644 #else
2645 	refcount_acquire(&pr->pr_uref);
2646 #endif
2647 }
2648 
2649 /*
2650  * Remove a prison user reference.  If it was the last reference, the
2651  * prison will be considered "dying", and may be removed once all of
2652  * its references are dropped.
2653  */
2654 void
2655 prison_proc_free(struct prison *pr)
2656 {
2657 
2658 	/*
2659 	 * Locking is only required when releasing the last reference.
2660 	 * This allows assurance that a locked prison will remain alive
2661 	 * until it is unlocked.
2662 	 */
2663 	KASSERT(refcount_load(&pr->pr_uref) > 0,
2664 	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2665 	if (!refcount_release_if_not_last(&pr->pr_uref)) {
2666 		/*
2667 		 * Don't remove the last user reference in this context,
2668 		 * which is expected to be a process that is not only locked,
2669 		 * but also half dead.  Add a reference so any calls to
2670 		 * prison_free() won't re-submit the task.
2671 		 */
2672 		prison_hold(pr);
2673 		mtx_lock(&pr->pr_mtx);
2674 		KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC),
2675 		    ("Redundant last reference in prison_proc_free (jid=%d)",
2676 		     pr->pr_id));
2677 		pr->pr_flags |= PR_COMPLETE_PROC;
2678 		mtx_unlock(&pr->pr_mtx);
2679 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2680 	}
2681 }
2682 
2683 static void
2684 prison_proc_free_not_last(struct prison *pr)
2685 {
2686 #ifdef INVARIANTS
2687 	int lastref;
2688 
2689 	KASSERT(refcount_load(&pr->pr_uref) > 0,
2690 	    ("Trying to free dead prison %p (jid=%d).",
2691 	     pr, pr->pr_id));
2692 	lastref = refcount_release(&pr->pr_uref);
2693 	KASSERT(!lastref,
2694 	    ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).",
2695 	     pr, pr->pr_id));
2696 #else
2697 	refcount_release(&pr->pr_uref);
2698 #endif
2699 }
2700 
2701 /*
2702  * Complete a call to either prison_free or prison_proc_free.
2703  */
2704 static void
2705 prison_complete(void *context, int pending)
2706 {
2707 	struct prison *pr = context;
2708 	int drflags;
2709 
2710 	/*
2711 	 * This could be called to release the last reference, or the last
2712 	 * user reference (plus the reference held in prison_proc_free).
2713 	 */
2714 	drflags = prison_lock_xlock(pr, PD_DEREF);
2715 	if (pr->pr_flags & PR_COMPLETE_PROC) {
2716 		pr->pr_flags &= ~PR_COMPLETE_PROC;
2717 		drflags |= PD_DEUREF;
2718 	}
2719 	prison_deref(pr, drflags);
2720 }
2721 
2722 /*
2723  * Remove a prison reference and/or user reference (usually).
2724  * This assumes context that allows sleeping (for allprison_lock),
2725  * with no non-sleeping locks held, except perhaps the prison itself.
2726  * If there are no more references, release and delist the prison.
2727  * On completion, the prison lock and the allprison lock are both
2728  * unlocked.
2729  */
static void
prison_deref(struct prison *pr, int flags)
{
	struct prisonlist freeprison;
	struct prison *killpr, *rpr, *ppr, *tpr;
	struct proc *p;

	/*
	 * flags combines the requested actions (PD_KILL, PD_DEUREF,
	 * PD_DEREF) with the caller's current lock state (PD_LOCKED,
	 * PD_LIST_SLOCKED, PD_LIST_XLOCKED).  The lock bits are kept up to
	 * date as locks are taken and dropped below.
	 *
	 * killpr is set when a killed prison still has user references, so
	 * that its processes can be signalled once the locks are released.
	 * freeprison collects prisons that lost their last reference; they
	 * are torn down at the end of the function.
	 */
	killpr = NULL;
	TAILQ_INIT(&freeprison);
	/*
	 * Release this prison as requested, which may cause its parent
	 * to be released, and then maybe its grandparent, etc.
	 */
	for (;;) {
		if (flags & PD_KILL) {
			/* Kill the prison and its descendants. */
			KASSERT(pr != &prison0,
			    ("prison_deref trying to kill prison0"));
			/*
			 * Make sure a reference is held across the kill;
			 * it is dropped by the PD_DEREF step below.
			 */
			if (!(flags & PD_DEREF)) {
				prison_hold(pr);
				flags |= PD_DEREF;
			}
			flags = prison_lock_xlock(pr, flags);
			prison_deref_kill(pr, &freeprison);
		}
		if (flags & PD_DEUREF) {
			/* Drop a user reference. */
			KASSERT(refcount_load(&pr->pr_uref) > 0,
			    ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
			     pr->pr_id));
			/*
			 * A release that is not the last needs no locks.
			 * Otherwise hold a reference, take both locks, and
			 * only then drop the user reference for real.
			 */
			if (!refcount_release_if_not_last(&pr->pr_uref)) {
				if (!(flags & PD_DEREF)) {
					prison_hold(pr);
					flags |= PD_DEREF;
				}
				flags = prison_lock_xlock(pr, flags);
				if (refcount_release(&pr->pr_uref) &&
				    pr->pr_state == PRISON_STATE_ALIVE) {
					/*
					 * When the last user reference goes,
					 * this becomes a dying prison.
					 */
					KASSERT(
					    refcount_load(&prison0.pr_uref) > 0,
					    ("prison0 pr_uref=0"));
					pr->pr_state = PRISON_STATE_DYING;
					/* The remove hooks run unlocked. */
					mtx_unlock(&pr->pr_mtx);
					flags &= ~PD_LOCKED;
					(void)osd_jail_call(pr,
					    PR_METHOD_REMOVE, NULL);
				}
			}
		}
		if (flags & PD_KILL) {
			/*
			 * Any remaining user references are probably processes
			 * that need to be killed, either in this prison or its
			 * descendants.
			 */
			if (refcount_load(&pr->pr_uref) > 0)
				killpr = pr;
			/* Make sure the parent prison doesn't get killed. */
			flags &= ~PD_KILL;
		}
		if (flags & PD_DEREF) {
			/* Drop a reference. */
			KASSERT(refcount_load(&pr->pr_ref) > 0,
			    ("prison_deref PD_DEREF on a dead prison (jid=%d)",
			     pr->pr_id));
			/*
			 * As above: only the release of the last reference
			 * needs the exclusive list lock and the prison mutex.
			 */
			if (!refcount_release_if_not_last(&pr->pr_ref)) {
				flags = prison_lock_xlock(pr, flags);
				if (refcount_release(&pr->pr_ref)) {
					/*
					 * When the last reference goes,
					 * unlink the prison and set it aside.
					 */
					KASSERT(
					    refcount_load(&pr->pr_uref) == 0,
					    ("prison_deref: last ref, "
					     "but still has %d urefs (jid=%d)",
					     pr->pr_uref, pr->pr_id));
					KASSERT(
					    refcount_load(&prison0.pr_ref) != 0,
					    ("prison0 pr_ref=0"));
					pr->pr_state = PRISON_STATE_INVALID;
					TAILQ_REMOVE(&allprison, pr, pr_list);
					LIST_REMOVE(pr, pr_sibling);
					TAILQ_INSERT_TAIL(&freeprison, pr,
					    pr_list);
					/* Every ancestor has one child fewer. */
					for (ppr = pr->pr_parent;
					     ppr != NULL;
					     ppr = ppr->pr_parent)
						ppr->pr_childcount--;
					/*
					 * Removing a prison frees references
					 * from its parent.
					 */
					mtx_unlock(&pr->pr_mtx);
					flags &= ~PD_LOCKED;
					/*
					 * Go around again with the parent as
					 * the target, dropping the reference
					 * and user reference held on it.
					 */
					pr = pr->pr_parent;
					flags |= PD_DEREF | PD_DEUREF;
					continue;
				}
			}
		}
		break;
	}

	/* Release all the prison locks. */
	if (flags & PD_LOCKED)
		mtx_unlock(&pr->pr_mtx);
	if (flags & PD_LIST_SLOCKED)
		sx_sunlock(&allprison_lock);
	else if (flags & PD_LIST_XLOCKED)
		sx_xunlock(&allprison_lock);

	/* Kill any processes attached to a killed prison. */
	if (killpr != NULL) {
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			if (p->p_state != PRS_NEW && p->p_ucred != NULL) {
				/*
				 * Walk up from the process's prison towards
				 * prison0; the process is signalled if killpr
				 * is its prison or one of its ancestors.
				 */
				for (ppr = p->p_ucred->cr_prison;
				     ppr != &prison0;
				     ppr = ppr->pr_parent)
					if (ppr == killpr) {
						kern_psignal(p, SIGKILL);
						break;
					}
			}
			PROC_UNLOCK(p);
		}
		sx_sunlock(&allproc_lock);
	}

	/*
	 * Finish removing any unreferenced prisons, which couldn't happen
	 * while allprison_lock was held (to avoid a LOR on vrele).
	 */
	TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) {
#ifdef VIMAGE
		/* Only destroy a vnet that is not shared with the parent. */
		if (rpr->pr_vnet != rpr->pr_parent->pr_vnet)
			vnet_destroy(rpr->pr_vnet);
#endif
		if (rpr->pr_root != NULL)
			vrele(rpr->pr_root);
		mtx_destroy(&rpr->pr_mtx);
#ifdef INET
		free(rpr->pr_ip4, M_PRISON);
#endif
#ifdef INET6
		free(rpr->pr_ip6, M_PRISON);
#endif
		if (rpr->pr_cpuset != NULL)
			cpuset_rel(rpr->pr_cpuset);
		osd_jail_exit(rpr);
#ifdef RACCT
		if (racct_enable)
			prison_racct_detach(rpr);
#endif
		TAILQ_REMOVE(&freeprison, rpr, pr_list);
		free(rpr, M_PRISON);
	}
}
2894 
2895 /*
2896  * Kill the prison and its descendants.  Mark them as dying, clear the
2897  * persist flag, and call module remove methods.
2898  */
static void
prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
{
	struct prison *cpr, *ppr, *rpr;
	bool descend;

	/*
	 * Unlike the descendants, the target prison can be killed
	 * even if it is currently dying.  This is useful for failed
	 * creation in jail_set(2).
	 */
	KASSERT(refcount_load(&pr->pr_ref) > 0,
	    ("Trying to kill dead prison %p (jid=%d).",
	     pr, pr->pr_id));
	/*
	 * Temporary user reference on the target; it is released by the
	 * last statement of this function.
	 */
	refcount_acquire(&pr->pr_uref);
	pr->pr_state = PRISON_STATE_DYING;
	mtx_unlock(&pr->pr_mtx);

	/*
	 * rpr tracks the most recently freed descendant, whose removal
	 * from its sibling list is postponed (see below).
	 */
	rpr = NULL;
	FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) {
		if (descend) {
			/*
			 * First visit of cpr (descend is presumably true on
			 * the way down the tree and false on the way back
			 * up - confirm against the macro definition).
			 * Prisons that are not alive are skipped, and
			 * clearing descend is presumably what keeps the
			 * walk out of their children.
			 */
			if (!prison_isalive(cpr)) {
				descend = false;
				continue;
			}
			/*
			 * Mark the child as dying.  The reference and user
			 * reference taken here are released on the second
			 * visit, which PR_REMOVE identifies.
			 */
			prison_hold(cpr);
			prison_proc_hold(cpr);
			mtx_lock(&cpr->pr_mtx);
			cpr->pr_state = PRISON_STATE_DYING;
			cpr->pr_flags |= PR_REMOVE;
			mtx_unlock(&cpr->pr_mtx);
			continue;
		}
		/* Second visit: only handle prisons marked above. */
		if (!(cpr->pr_flags & PR_REMOVE))
			continue;
		(void)osd_jail_call(cpr, PR_METHOD_REMOVE, NULL);
		mtx_lock(&cpr->pr_mtx);
		cpr->pr_flags &= ~PR_REMOVE;
		/*
		 * Clearing PR_PERSIST also drops the reference and user
		 * reference that accompany the flag.
		 */
		if (cpr->pr_flags & PR_PERSIST) {
			cpr->pr_flags &= ~PR_PERSIST;
			prison_proc_free_not_last(cpr);
			prison_free_not_last(cpr);
		}
		/* Release the two references taken on the first visit. */
		(void)refcount_release(&cpr->pr_uref);
		if (refcount_release(&cpr->pr_ref)) {
			/*
			 * When the last reference goes, unlink the prison
			 * and set it aside for prison_deref() to handle.
			 * Delay unlinking the sibling list to keep the loop
			 * safe.
			 */
			if (rpr != NULL)
				LIST_REMOVE(rpr, pr_sibling);
			rpr = cpr;
			rpr->pr_state = PRISON_STATE_INVALID;
			TAILQ_REMOVE(&allprison, rpr, pr_list);
			TAILQ_INSERT_TAIL(freeprison, rpr, pr_list);
			/*
			 * Removing a prison frees references from its parent.
			 */
			ppr = rpr->pr_parent;
			prison_proc_free_not_last(ppr);
			prison_free_not_last(ppr);
			/* Every ancestor has one child fewer. */
			for (; ppr != NULL; ppr = ppr->pr_parent)
				ppr->pr_childcount--;
		}
		mtx_unlock(&cpr->pr_mtx);
	}
	/* Finish the sibling-list removal postponed for the last one. */
	if (rpr != NULL)
		LIST_REMOVE(rpr, pr_sibling);

	/* Finally handle the target itself; it is left locked on return. */
	(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
	mtx_lock(&pr->pr_mtx);
	if (pr->pr_flags & PR_PERSIST) {
		pr->pr_flags &= ~PR_PERSIST;
		prison_proc_free_not_last(pr);
		prison_free_not_last(pr);
	}
	(void)refcount_release(&pr->pr_uref);
}
2979 
2980 /*
2981  * Given the current locking state in the flags, make sure allprison_lock
2982  * is held exclusive, and the prison is locked.  Return flags indicating
2983  * the new state.
2984  */
2985 static int
2986 prison_lock_xlock(struct prison *pr, int flags)
2987 {
2988 
2989 	if (!(flags & PD_LIST_XLOCKED)) {
2990 		/*
2991 		 * Get allprison_lock, which may be an upgrade,
2992 		 * and may require unlocking the prison.
2993 		 */
2994 		if (flags & PD_LOCKED) {
2995 			mtx_unlock(&pr->pr_mtx);
2996 			flags &= ~PD_LOCKED;
2997 		}
2998 		if (flags & PD_LIST_SLOCKED) {
2999 			if (!sx_try_upgrade(&allprison_lock)) {
3000 				sx_sunlock(&allprison_lock);
3001 				sx_xlock(&allprison_lock);
3002 			}
3003 			flags &= ~PD_LIST_SLOCKED;
3004 		} else
3005 			sx_xlock(&allprison_lock);
3006 		flags |= PD_LIST_XLOCKED;
3007 	}
3008 	if (!(flags & PD_LOCKED)) {
3009 		/* Lock the prison mutex. */
3010 		mtx_lock(&pr->pr_mtx);
3011 		flags |= PD_LOCKED;
3012 	}
3013 	return flags;
3014 }
3015 
3016 /*
3017  * Set or clear a permission bit in the pr_allow field, passing restrictions
3018  * (cleared permission) down to child jails.
3019  */
3020 void
3021 prison_set_allow(struct ucred *cred, unsigned flag, int enable)
3022 {
3023 	struct prison *pr;
3024 
3025 	pr = cred->cr_prison;
3026 	sx_slock(&allprison_lock);
3027 	mtx_lock(&pr->pr_mtx);
3028 	prison_set_allow_locked(pr, flag, enable);
3029 	mtx_unlock(&pr->pr_mtx);
3030 	sx_sunlock(&allprison_lock);
3031 }
3032 
3033 static void
3034 prison_set_allow_locked(struct prison *pr, unsigned flag, int enable)
3035 {
3036 	struct prison *cpr;
3037 	int descend;
3038 
3039 	if (enable != 0)
3040 		pr->pr_allow |= flag;
3041 	else {
3042 		pr->pr_allow &= ~flag;
3043 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
3044 			cpr->pr_allow &= ~flag;
3045 	}
3046 }
3047 
3048 /*
3049  * Check if a jail supports the given address family.
3050  *
3051  * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3052  * if not.
3053  */
3054 int
3055 prison_check_af(struct ucred *cred, int af)
3056 {
3057 	struct prison *pr;
3058 	int error;
3059 
3060 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3061 
3062 	pr = cred->cr_prison;
3063 #ifdef VIMAGE
3064 	/* Prisons with their own network stack are not limited. */
3065 	if (prison_owns_vnet(cred))
3066 		return (0);
3067 #endif
3068 
3069 	error = 0;
3070 	switch (af)
3071 	{
3072 #ifdef INET
3073 	case AF_INET:
3074 		if (pr->pr_flags & PR_IP4)
3075 		{
3076 			mtx_lock(&pr->pr_mtx);
3077 			if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
3078 				error = EAFNOSUPPORT;
3079 			mtx_unlock(&pr->pr_mtx);
3080 		}
3081 		break;
3082 #endif
3083 #ifdef INET6
3084 	case AF_INET6:
3085 		if (pr->pr_flags & PR_IP6)
3086 		{
3087 			mtx_lock(&pr->pr_mtx);
3088 			if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
3089 				error = EAFNOSUPPORT;
3090 			mtx_unlock(&pr->pr_mtx);
3091 		}
3092 		break;
3093 #endif
3094 	case AF_LOCAL:
3095 	case AF_ROUTE:
3096 		break;
3097 	default:
3098 		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3099 			error = EAFNOSUPPORT;
3100 	}
3101 	return (error);
3102 }
3103 
3104 /*
3105  * Check if given address belongs to the jail referenced by cred (wrapper to
3106  * prison_check_ip[46]).
3107  *
3108  * Returns 0 if jail doesn't restrict the address family or if address belongs
3109  * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3110  * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3111  */
3112 int
3113 prison_if(struct ucred *cred, const struct sockaddr *sa)
3114 {
3115 #ifdef INET
3116 	const struct sockaddr_in *sai;
3117 #endif
3118 #ifdef INET6
3119 	const struct sockaddr_in6 *sai6;
3120 #endif
3121 	int error;
3122 
3123 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3124 	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3125 
3126 #ifdef VIMAGE
3127 	if (prison_owns_vnet(cred))
3128 		return (0);
3129 #endif
3130 
3131 	error = 0;
3132 	switch (sa->sa_family)
3133 	{
3134 #ifdef INET
3135 	case AF_INET:
3136 		sai = (const struct sockaddr_in *)sa;
3137 		error = prison_check_ip4(cred, &sai->sin_addr);
3138 		break;
3139 #endif
3140 #ifdef INET6
3141 	case AF_INET6:
3142 		sai6 = (const struct sockaddr_in6 *)sa;
3143 		error = prison_check_ip6(cred, &sai6->sin6_addr);
3144 		break;
3145 #endif
3146 	default:
3147 		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3148 			error = EAFNOSUPPORT;
3149 	}
3150 	return (error);
3151 }
3152 
3153 /*
3154  * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3155  */
3156 int
3157 prison_check(struct ucred *cred1, struct ucred *cred2)
3158 {
3159 
3160 	return ((cred1->cr_prison == cred2->cr_prison ||
3161 	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3162 }
3163 
3164 /*
 * Return 1 if pr2 is a descendant of pr1 (at any depth), otherwise 0.
3166  */
3167 int
3168 prison_ischild(struct prison *pr1, struct prison *pr2)
3169 {
3170 
3171 	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3172 		if (pr1 == pr2)
3173 			return (1);
3174 	return (0);
3175 }
3176 
3177 /*
3178  * Return true if the prison is currently alive.  A prison is alive if it
3179  * holds user references and it isn't being removed.
3180  */
3181 bool
3182 prison_isalive(struct prison *pr)
3183 {
3184 
3185 	if (__predict_false(pr->pr_state != PRISON_STATE_ALIVE))
3186 		return (false);
3187 	return (true);
3188 }
3189 
3190 /*
3191  * Return true if the prison is currently valid.  A prison is valid if it has
3192  * been fully created, and is not being destroyed.  Note that dying prisons
3193  * are still considered valid.  Invalid prisons won't be found under normal
3194  * circumstances, as they're only put in that state by functions that have
3195  * an exclusive hold on allprison_lock.
3196  */
3197 bool
3198 prison_isvalid(struct prison *pr)
3199 {
3200 
3201 	if (__predict_false(pr->pr_state == PRISON_STATE_INVALID))
3202 		return (false);
3203 	if (__predict_false(refcount_load(&pr->pr_ref) == 0))
3204 		return (false);
3205 	return (true);
3206 }
3207 
/*
 * Return 1 if the passed credential is jailed and (on VIMAGE kernels) its
 * prison does not own a virtual network stack; otherwise return 0.
 * Without VIMAGE the answer depends only on whether the credential is
 * jailed.
 */
int
jailed_without_vnet(struct ucred *cred)
{

#ifdef VIMAGE
	return ((jailed(cred) && !prison_owns_vnet(cred)) ? 1 : 0);
#else
	return (jailed(cred) ? 1 : 0);
#endif
}
3225 
3226 /*
3227  * Return the correct hostname (domainname, et al) for the passed credential.
3228  */
3229 void
3230 getcredhostname(struct ucred *cred, char *buf, size_t size)
3231 {
3232 	struct prison *pr;
3233 
3234 	/*
3235 	 * A NULL credential can be used to shortcut to the physical
3236 	 * system's hostname.
3237 	 */
3238 	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3239 	mtx_lock(&pr->pr_mtx);
3240 	strlcpy(buf, pr->pr_hostname, size);
3241 	mtx_unlock(&pr->pr_mtx);
3242 }
3243 
3244 void
3245 getcreddomainname(struct ucred *cred, char *buf, size_t size)
3246 {
3247 
3248 	mtx_lock(&cred->cr_prison->pr_mtx);
3249 	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3250 	mtx_unlock(&cred->cr_prison->pr_mtx);
3251 }
3252 
3253 void
3254 getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3255 {
3256 
3257 	mtx_lock(&cred->cr_prison->pr_mtx);
3258 	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3259 	mtx_unlock(&cred->cr_prison->pr_mtx);
3260 }
3261 
3262 void
3263 getcredhostid(struct ucred *cred, unsigned long *hostid)
3264 {
3265 
3266 	mtx_lock(&cred->cr_prison->pr_mtx);
3267 	*hostid = cred->cr_prison->pr_hostid;
3268 	mtx_unlock(&cred->cr_prison->pr_mtx);
3269 }
3270 
3271 void
3272 getjailname(struct ucred *cred, char *name, size_t len)
3273 {
3274 
3275 	mtx_lock(&cred->cr_prison->pr_mtx);
3276 	strlcpy(name, cred->cr_prison->pr_name, len);
3277 	mtx_unlock(&cred->cr_prison->pr_mtx);
3278 }
3279 
#ifdef VIMAGE
/*
 * Tell whether the prison that cred belongs to owns its vnet, as opposed
 * to having inherited it.
 *
 * Returns 1 when PR_VNET is set in the prison's flags, 0 otherwise.
 */
int
prison_owns_vnet(struct ucred *cred)
{

	/*
	 * No locking: vnets cannot be added/removed after jail creation.
	 */
	if ((cred->cr_prison->pr_flags & PR_VNET) != 0)
		return (1);
	return (0);
}
#endif
3298 
3299 /*
3300  * Determine whether the subject represented by cred can "see"
3301  * status of a mount point.
3302  * Returns: 0 for permitted, ENOENT otherwise.
3303  * XXX: This function should be called cr_canseemount() and should be
3304  *      placed in kern_prot.c.
3305  */
3306 int
3307 prison_canseemount(struct ucred *cred, struct mount *mp)
3308 {
3309 	struct prison *pr;
3310 	struct statfs *sp;
3311 	size_t len;
3312 
3313 	pr = cred->cr_prison;
3314 	if (pr->pr_enforce_statfs == 0)
3315 		return (0);
3316 	if (pr->pr_root->v_mount == mp)
3317 		return (0);
3318 	if (pr->pr_enforce_statfs == 2)
3319 		return (ENOENT);
3320 	/*
3321 	 * If jail's chroot directory is set to "/" we should be able to see
3322 	 * all mount-points from inside a jail.
3323 	 * This is ugly check, but this is the only situation when jail's
3324 	 * directory ends with '/'.
3325 	 */
3326 	if (strcmp(pr->pr_path, "/") == 0)
3327 		return (0);
3328 	len = strlen(pr->pr_path);
3329 	sp = &mp->mnt_stat;
3330 	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3331 		return (ENOENT);
3332 	/*
3333 	 * Be sure that we don't have situation where jail's root directory
3334 	 * is "/some/path" and mount point is "/some/pathpath".
3335 	 */
3336 	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3337 		return (ENOENT);
3338 	return (0);
3339 }
3340 
3341 void
3342 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3343 {
3344 	char jpath[MAXPATHLEN];
3345 	struct prison *pr;
3346 	size_t len;
3347 
3348 	pr = cred->cr_prison;
3349 	if (pr->pr_enforce_statfs == 0)
3350 		return;
3351 	if (prison_canseemount(cred, mp) != 0) {
3352 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3353 		strlcpy(sp->f_mntonname, "[restricted]",
3354 		    sizeof(sp->f_mntonname));
3355 		return;
3356 	}
3357 	if (pr->pr_root->v_mount == mp) {
3358 		/*
3359 		 * Clear current buffer data, so we are sure nothing from
3360 		 * the valid path left there.
3361 		 */
3362 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3363 		*sp->f_mntonname = '/';
3364 		return;
3365 	}
3366 	/*
3367 	 * If jail's chroot directory is set to "/" we should be able to see
3368 	 * all mount-points from inside a jail.
3369 	 */
3370 	if (strcmp(pr->pr_path, "/") == 0)
3371 		return;
3372 	len = strlen(pr->pr_path);
3373 	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3374 	/*
3375 	 * Clear current buffer data, so we are sure nothing from
3376 	 * the valid path left there.
3377 	 */
3378 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3379 	if (*jpath == '\0') {
3380 		/* Should never happen. */
3381 		*sp->f_mntonname = '/';
3382 	} else {
3383 		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3384 	}
3385 }
3386 
3387 /*
3388  * Check with permission for a specific privilege is granted within jail.  We
3389  * have a specific list of accepted privileges; the rest are denied.
3390  */
3391 int
3392 prison_priv_check(struct ucred *cred, int priv)
3393 {
3394 	struct prison *pr;
3395 	int error;
3396 
3397 	/*
3398 	 * Some policies have custom handlers. This routine should not be
3399 	 * called for them. See priv_check_cred().
3400 	 */
3401 	switch (priv) {
3402 	case PRIV_VFS_LOOKUP:
3403 	case PRIV_VFS_GENERATION:
3404 		KASSERT(0, ("prison_priv_check instead of a custom handler "
3405 		    "called for %d\n", priv));
3406 	}
3407 
3408 	if (!jailed(cred))
3409 		return (0);
3410 
3411 #ifdef VIMAGE
3412 	/*
3413 	 * Privileges specific to prisons with a virtual network stack.
3414 	 * There might be a duplicate entry here in case the privilege
3415 	 * is only granted conditionally in the legacy jail case.
3416 	 */
3417 	switch (priv) {
3418 #ifdef notyet
3419 		/*
3420 		 * NFS-specific privileges.
3421 		 */
3422 	case PRIV_NFS_DAEMON:
3423 	case PRIV_NFS_LOCKD:
3424 #endif
3425 		/*
3426 		 * Network stack privileges.
3427 		 */
3428 	case PRIV_NET_BRIDGE:
3429 	case PRIV_NET_GRE:
3430 	case PRIV_NET_BPF:
3431 	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
3432 	case PRIV_NET_ROUTE:
3433 	case PRIV_NET_TAP:
3434 	case PRIV_NET_SETIFMTU:
3435 	case PRIV_NET_SETIFFLAGS:
3436 	case PRIV_NET_SETIFCAP:
3437 	case PRIV_NET_SETIFDESCR:
3438 	case PRIV_NET_SETIFNAME	:
3439 	case PRIV_NET_SETIFMETRIC:
3440 	case PRIV_NET_SETIFPHYS:
3441 	case PRIV_NET_SETIFMAC:
3442 	case PRIV_NET_SETLANPCP:
3443 	case PRIV_NET_ADDMULTI:
3444 	case PRIV_NET_DELMULTI:
3445 	case PRIV_NET_HWIOCTL:
3446 	case PRIV_NET_SETLLADDR:
3447 	case PRIV_NET_ADDIFGROUP:
3448 	case PRIV_NET_DELIFGROUP:
3449 	case PRIV_NET_IFCREATE:
3450 	case PRIV_NET_IFDESTROY:
3451 	case PRIV_NET_ADDIFADDR:
3452 	case PRIV_NET_DELIFADDR:
3453 	case PRIV_NET_LAGG:
3454 	case PRIV_NET_GIF:
3455 	case PRIV_NET_SETIFVNET:
3456 	case PRIV_NET_SETIFFIB:
3457 
3458 		/*
3459 		 * 802.11-related privileges.
3460 		 */
3461 	case PRIV_NET80211_VAP_GETKEY:
3462 	case PRIV_NET80211_VAP_MANAGE:
3463 
3464 #ifdef notyet
3465 		/*
3466 		 * ATM privileges.
3467 		 */
3468 	case PRIV_NETATM_CFG:
3469 	case PRIV_NETATM_ADD:
3470 	case PRIV_NETATM_DEL:
3471 	case PRIV_NETATM_SET:
3472 
3473 		/*
3474 		 * Bluetooth privileges.
3475 		 */
3476 	case PRIV_NETBLUETOOTH_RAW:
3477 #endif
3478 
3479 		/*
3480 		 * Netgraph and netgraph module privileges.
3481 		 */
3482 	case PRIV_NETGRAPH_CONTROL:
3483 #ifdef notyet
3484 	case PRIV_NETGRAPH_TTY:
3485 #endif
3486 
3487 		/*
3488 		 * IPv4 and IPv6 privileges.
3489 		 */
3490 	case PRIV_NETINET_IPFW:
3491 	case PRIV_NETINET_DIVERT:
3492 	case PRIV_NETINET_PF:
3493 	case PRIV_NETINET_DUMMYNET:
3494 	case PRIV_NETINET_CARP:
3495 	case PRIV_NETINET_MROUTE:
3496 	case PRIV_NETINET_RAW:
3497 	case PRIV_NETINET_ADDRCTRL6:
3498 	case PRIV_NETINET_ND6:
3499 	case PRIV_NETINET_SCOPE6:
3500 	case PRIV_NETINET_ALIFETIME6:
3501 	case PRIV_NETINET_IPSEC:
3502 	case PRIV_NETINET_BINDANY:
3503 
3504 #ifdef notyet
3505 		/*
3506 		 * NCP privileges.
3507 		 */
3508 	case PRIV_NETNCP:
3509 
3510 		/*
3511 		 * SMB privileges.
3512 		 */
3513 	case PRIV_NETSMB:
3514 #endif
3515 
3516 	/*
3517 	 * No default: or deny here.
3518 	 * In case of no permit fall through to next switch().
3519 	 */
3520 		if (cred->cr_prison->pr_flags & PR_VNET)
3521 			return (0);
3522 	}
3523 #endif /* VIMAGE */
3524 
3525 	switch (priv) {
3526 		/*
3527 		 * Allow ktrace privileges for root in jail.
3528 		 */
3529 	case PRIV_KTRACE:
3530 
3531 #if 0
3532 		/*
3533 		 * Allow jailed processes to configure audit identity and
3534 		 * submit audit records (login, etc).  In the future we may
3535 		 * want to further refine the relationship between audit and
3536 		 * jail.
3537 		 */
3538 	case PRIV_AUDIT_GETAUDIT:
3539 	case PRIV_AUDIT_SETAUDIT:
3540 	case PRIV_AUDIT_SUBMIT:
3541 #endif
3542 
3543 		/*
3544 		 * Allow jailed processes to manipulate process UNIX
3545 		 * credentials in any way they see fit.
3546 		 */
3547 	case PRIV_CRED_SETUID:
3548 	case PRIV_CRED_SETEUID:
3549 	case PRIV_CRED_SETGID:
3550 	case PRIV_CRED_SETEGID:
3551 	case PRIV_CRED_SETGROUPS:
3552 	case PRIV_CRED_SETREUID:
3553 	case PRIV_CRED_SETREGID:
3554 	case PRIV_CRED_SETRESUID:
3555 	case PRIV_CRED_SETRESGID:
3556 
3557 		/*
3558 		 * Jail implements visibility constraints already, so allow
3559 		 * jailed root to override uid/gid-based constraints.
3560 		 */
3561 	case PRIV_SEEOTHERGIDS:
3562 	case PRIV_SEEOTHERUIDS:
3563 
3564 		/*
3565 		 * Jail implements inter-process debugging limits already, so
3566 		 * allow jailed root various debugging privileges.
3567 		 */
3568 	case PRIV_DEBUG_DIFFCRED:
3569 	case PRIV_DEBUG_SUGID:
3570 	case PRIV_DEBUG_UNPRIV:
3571 
3572 		/*
3573 		 * Allow jail to set various resource limits and login
3574 		 * properties, and for now, exceed process resource limits.
3575 		 */
3576 	case PRIV_PROC_LIMIT:
3577 	case PRIV_PROC_SETLOGIN:
3578 	case PRIV_PROC_SETRLIMIT:
3579 
3580 		/*
3581 		 * System V and POSIX IPC privileges are granted in jail.
3582 		 */
3583 	case PRIV_IPC_READ:
3584 	case PRIV_IPC_WRITE:
3585 	case PRIV_IPC_ADMIN:
3586 	case PRIV_IPC_MSGSIZE:
3587 	case PRIV_MQ_ADMIN:
3588 
3589 		/*
3590 		 * Jail operations within a jail work on child jails.
3591 		 */
3592 	case PRIV_JAIL_ATTACH:
3593 	case PRIV_JAIL_SET:
3594 	case PRIV_JAIL_REMOVE:
3595 
3596 		/*
3597 		 * Jail implements its own inter-process limits, so allow
3598 		 * root processes in jail to change scheduling on other
3599 		 * processes in the same jail.  Likewise for signalling.
3600 		 */
3601 	case PRIV_SCHED_DIFFCRED:
3602 	case PRIV_SCHED_CPUSET:
3603 	case PRIV_SIGNAL_DIFFCRED:
3604 	case PRIV_SIGNAL_SUGID:
3605 
3606 		/*
3607 		 * Allow jailed processes to write to sysctls marked as jail
3608 		 * writable.
3609 		 */
3610 	case PRIV_SYSCTL_WRITEJAIL:
3611 
3612 		/*
3613 		 * Allow root in jail to manage a variety of quota
3614 		 * properties.  These should likely be conditional on a
3615 		 * configuration option.
3616 		 */
3617 	case PRIV_VFS_GETQUOTA:
3618 	case PRIV_VFS_SETQUOTA:
3619 
3620 		/*
3621 		 * Since Jail relies on chroot() to implement file system
3622 		 * protections, grant many VFS privileges to root in jail.
3623 		 * Be careful to exclude mount-related and NFS-related
3624 		 * privileges.
3625 		 */
3626 	case PRIV_VFS_READ:
3627 	case PRIV_VFS_WRITE:
3628 	case PRIV_VFS_ADMIN:
3629 	case PRIV_VFS_EXEC:
3630 	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
3631 	case PRIV_VFS_CHFLAGS_DEV:
3632 	case PRIV_VFS_CHOWN:
3633 	case PRIV_VFS_CHROOT:
3634 	case PRIV_VFS_RETAINSUGID:
3635 	case PRIV_VFS_FCHROOT:
3636 	case PRIV_VFS_LINK:
3637 	case PRIV_VFS_SETGID:
3638 	case PRIV_VFS_STAT:
3639 	case PRIV_VFS_STICKYFILE:
3640 
3641 		/*
3642 		 * As in the non-jail case, non-root users are expected to be
3643 		 * able to read kernel/physical memory (provided /dev/[k]mem
3644 		 * exists in the jail and they have permission to access it).
3645 		 */
3646 	case PRIV_KMEM_READ:
3647 		return (0);
3648 
3649 		/*
3650 		 * Depending on the global setting, allow privilege of
3651 		 * setting system flags.
3652 		 */
3653 	case PRIV_VFS_SYSFLAGS:
3654 		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
3655 			return (0);
3656 		else
3657 			return (EPERM);
3658 
3659 		/*
3660 		 * Depending on the global setting, allow privilege of
3661 		 * mounting/unmounting file systems.
3662 		 */
3663 	case PRIV_VFS_MOUNT:
3664 	case PRIV_VFS_UNMOUNT:
3665 	case PRIV_VFS_MOUNT_NONUSER:
3666 	case PRIV_VFS_MOUNT_OWNER:
3667 		pr = cred->cr_prison;
3668 		prison_lock(pr);
3669 		if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2)
3670 			error = 0;
3671 		else
3672 			error = EPERM;
3673 		prison_unlock(pr);
3674 		return (error);
3675 
3676 		/*
3677 		 * Jails should hold no disposition on the PRIV_VFS_READ_DIR
3678 		 * policy.  priv_check_cred will not specifically allow it, and
3679 		 * we may want a MAC policy to allow it.
3680 		 */
3681 	case PRIV_VFS_READ_DIR:
3682 		return (0);
3683 
3684 		/*
3685 		 * Conditionnaly allow locking (unlocking) physical pages
3686 		 * in memory.
3687 		 */
3688 	case PRIV_VM_MLOCK:
3689 	case PRIV_VM_MUNLOCK:
3690 		if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK)
3691 			return (0);
3692 		else
3693 			return (EPERM);
3694 
3695 		/*
3696 		 * Conditionally allow jailed root to bind reserved ports.
3697 		 */
3698 	case PRIV_NETINET_RESERVEDPORT:
3699 		if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS)
3700 			return (0);
3701 		else
3702 			return (EPERM);
3703 
3704 		/*
3705 		 * Allow jailed root to reuse in-use ports.
3706 		 */
3707 	case PRIV_NETINET_REUSEPORT:
3708 		return (0);
3709 
3710 		/*
3711 		 * Allow jailed root to set certain IPv4/6 (option) headers.
3712 		 */
3713 	case PRIV_NETINET_SETHDROPTS:
3714 		return (0);
3715 
3716 		/*
3717 		 * Conditionally allow creating raw sockets in jail.
3718 		 */
3719 	case PRIV_NETINET_RAW:
3720 		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
3721 			return (0);
3722 		else
3723 			return (EPERM);
3724 
3725 		/*
3726 		 * Since jail implements its own visibility limits on netstat
3727 		 * sysctls, allow getcred.  This allows identd to work in
3728 		 * jail.
3729 		 */
3730 	case PRIV_NETINET_GETCRED:
3731 		return (0);
3732 
3733 		/*
3734 		 * Allow jailed root to set loginclass.
3735 		 */
3736 	case PRIV_PROC_SETLOGINCLASS:
3737 		return (0);
3738 
3739 		/*
3740 		 * Do not allow a process inside a jail to read the kernel
3741 		 * message buffer unless explicitly permitted.
3742 		 */
3743 	case PRIV_MSGBUF:
3744 		if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF)
3745 			return (0);
3746 		return (EPERM);
3747 
3748 	default:
3749 		/*
3750 		 * In all remaining cases, deny the privilege request.  This
3751 		 * includes almost all network privileges, many system
3752 		 * configuration privileges.
3753 		 */
3754 		return (EPERM);
3755 	}
3756 }
3757 
3758 /*
3759  * Return the part of pr2's name that is relative to pr1, or the whole name
3760  * if it does not directly follow.
3761  */
3762 
3763 char *
3764 prison_name(struct prison *pr1, struct prison *pr2)
3765 {
3766 	char *name;
3767 
3768 	/* Jails see themselves as "0" (if they see themselves at all). */
3769 	if (pr1 == pr2)
3770 		return "0";
3771 	name = pr2->pr_name;
3772 	if (prison_ischild(pr1, pr2)) {
3773 		/*
3774 		 * pr1 isn't locked (and allprison_lock may not be either)
3775 		 * so its length can't be counted on.  But the number of dots
3776 		 * can be counted on - and counted.
3777 		 */
3778 		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
3779 			name = strchr(name, '.') + 1;
3780 	}
3781 	return (name);
3782 }
3783 
3784 /*
3785  * Return the part of pr2's path that is relative to pr1, or the whole path
3786  * if it does not directly follow.
3787  */
3788 static char *
3789 prison_path(struct prison *pr1, struct prison *pr2)
3790 {
3791 	char *path1, *path2;
3792 	int len1;
3793 
3794 	path1 = pr1->pr_path;
3795 	path2 = pr2->pr_path;
3796 	if (!strcmp(path1, "/"))
3797 		return (path2);
3798 	len1 = strlen(path1);
3799 	if (strncmp(path1, path2, len1))
3800 		return (path2);
3801 	if (path2[len1] == '\0')
3802 		return "/";
3803 	if (path2[len1] == '/')
3804 		return (path2 + len1);
3805 	return (path2);
3806 }
3807 
3808 /*
3809  * Jail-related sysctls.
3810  */
3811 static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
3812     "Jails");
3813 
3814 static int
3815 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
3816 {
3817 	struct xprison *xp;
3818 	struct prison *pr, *cpr;
3819 #ifdef INET
3820 	struct in_addr *ip4 = NULL;
3821 	int ip4s = 0;
3822 #endif
3823 #ifdef INET6
3824 	struct in6_addr *ip6 = NULL;
3825 	int ip6s = 0;
3826 #endif
3827 	int descend, error;
3828 
3829 	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
3830 	pr = req->td->td_ucred->cr_prison;
3831 	error = 0;
3832 	sx_slock(&allprison_lock);
3833 	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
3834 #if defined(INET) || defined(INET6)
3835  again:
3836 #endif
3837 		mtx_lock(&cpr->pr_mtx);
3838 #ifdef INET
3839 		if (cpr->pr_ip4s > 0) {
3840 			if (ip4s < cpr->pr_ip4s) {
3841 				ip4s = cpr->pr_ip4s;
3842 				mtx_unlock(&cpr->pr_mtx);
3843 				ip4 = realloc(ip4, ip4s *
3844 				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
3845 				goto again;
3846 			}
3847 			bcopy(cpr->pr_ip4, ip4,
3848 			    cpr->pr_ip4s * sizeof(struct in_addr));
3849 		}
3850 #endif
3851 #ifdef INET6
3852 		if (cpr->pr_ip6s > 0) {
3853 			if (ip6s < cpr->pr_ip6s) {
3854 				ip6s = cpr->pr_ip6s;
3855 				mtx_unlock(&cpr->pr_mtx);
3856 				ip6 = realloc(ip6, ip6s *
3857 				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
3858 				goto again;
3859 			}
3860 			bcopy(cpr->pr_ip6, ip6,
3861 			    cpr->pr_ip6s * sizeof(struct in6_addr));
3862 		}
3863 #endif
3864 		bzero(xp, sizeof(*xp));
3865 		xp->pr_version = XPRISON_VERSION;
3866 		xp->pr_id = cpr->pr_id;
3867 		xp->pr_state = cpr->pr_state;
3868 		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
3869 		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
3870 		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
3871 #ifdef INET
3872 		xp->pr_ip4s = cpr->pr_ip4s;
3873 #endif
3874 #ifdef INET6
3875 		xp->pr_ip6s = cpr->pr_ip6s;
3876 #endif
3877 		mtx_unlock(&cpr->pr_mtx);
3878 		error = SYSCTL_OUT(req, xp, sizeof(*xp));
3879 		if (error)
3880 			break;
3881 #ifdef INET
3882 		if (xp->pr_ip4s > 0) {
3883 			error = SYSCTL_OUT(req, ip4,
3884 			    xp->pr_ip4s * sizeof(struct in_addr));
3885 			if (error)
3886 				break;
3887 		}
3888 #endif
3889 #ifdef INET6
3890 		if (xp->pr_ip6s > 0) {
3891 			error = SYSCTL_OUT(req, ip6,
3892 			    xp->pr_ip6s * sizeof(struct in6_addr));
3893 			if (error)
3894 				break;
3895 		}
3896 #endif
3897 	}
3898 	sx_sunlock(&allprison_lock);
3899 	free(xp, M_TEMP);
3900 #ifdef INET
3901 	free(ip4, M_TEMP);
3902 #endif
3903 #ifdef INET6
3904 	free(ip6, M_TEMP);
3905 #endif
3906 	return (error);
3907 }
3908 
3909 SYSCTL_OID(_security_jail, OID_AUTO, list,
3910     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3911     sysctl_jail_list, "S", "List of active jails");
3912 
3913 static int
3914 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
3915 {
3916 	int error, injail;
3917 
3918 	injail = jailed(req->td->td_ucred);
3919 	error = SYSCTL_OUT(req, &injail, sizeof(injail));
3920 
3921 	return (error);
3922 }
3923 
3924 SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
3925     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3926     sysctl_jail_jailed, "I", "Process in jail?");
3927 
3928 static int
3929 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
3930 {
3931 	int error, havevnet;
3932 #ifdef VIMAGE
3933 	struct ucred *cred = req->td->td_ucred;
3934 
3935 	havevnet = jailed(cred) && prison_owns_vnet(cred);
3936 #else
3937 	havevnet = 0;
3938 #endif
3939 	error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
3940 
3941 	return (error);
3942 }
3943 
3944 SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
3945     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3946     sysctl_jail_vnet, "I", "Jail owns vnet?");
3947 
#if defined(INET) || defined(INET6)
/* Only present when at least one IP address family is compiled in. */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips,
    CTLFLAG_RW, &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family (deprecated)");
#endif
3953 
3954 /*
3955  * Default parameters for jail(2) compatibility.  For historical reasons,
3956  * the sysctl names have varying similarity to the parameter names.  Prisons
3957  * just see their own parameters, and can't change them.
3958  */
3959 static int
3960 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
3961 {
3962 	int error, i;
3963 
3964 	/* Get the current flag value, and convert it to a boolean. */
3965 	if (req->td->td_ucred->cr_prison == &prison0) {
3966 		mtx_lock(&prison0.pr_mtx);
3967 		i = (jail_default_allow & arg2) != 0;
3968 		mtx_unlock(&prison0.pr_mtx);
3969 	} else
3970 		i = prison_allow(req->td->td_ucred, arg2);
3971 
3972 	if (arg1 != NULL)
3973 		i = !i;
3974 	error = sysctl_handle_int(oidp, &i, 0, req);
3975 	if (error || !req->newptr)
3976 		return (error);
3977 	i = i ? arg2 : 0;
3978 	if (arg1 != NULL)
3979 		i ^= arg2;
3980 	/*
3981 	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
3982 	 * for writing.
3983 	 */
3984 	mtx_lock(&prison0.pr_mtx);
3985 	jail_default_allow = (jail_default_allow & ~arg2) | i;
3986 	mtx_unlock(&prison0.pr_mtx);
3987 	return (0);
3988 }
3989 
3990 SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
3991     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3992     NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
3993     "Processes in jail can set their hostnames (deprecated)");
3994 SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
3995     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3996     (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
3997     "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
3998 SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
3999     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4000     NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
4001     "Processes in jail can use System V IPC primitives (deprecated)");
4002 SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
4003     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4004     NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
4005     "Prison root can create raw sockets (deprecated)");
4006 SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
4007     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4008     NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
4009     "Processes in jail can alter system file flags (deprecated)");
4010 SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
4011     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4012     NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
4013     "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
4014 
4015 static int
4016 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
4017 {
4018 	struct prison *pr;
4019 	int level, error;
4020 
4021 	pr = req->td->td_ucred->cr_prison;
4022 	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4023 	error = sysctl_handle_int(oidp, &level, 0, req);
4024 	if (error || !req->newptr)
4025 		return (error);
4026 	*(int *)arg1 = level;
4027 	return (0);
4028 }
4029 
4030 SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
4031     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4032     &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
4033     sysctl_jail_default_level, "I",
4034     "Processes in jail cannot see all mounted file systems (deprecated)");
4035 
4036 SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
4037     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4038     &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
4039     sysctl_jail_default_level, "I",
4040     "Ruleset for the devfs filesystem in jail (deprecated)");
4041 
4042 /*
4043  * Nodes to describe jail parameters.  Maximum length of string parameters
4044  * is returned in the string itself, and the other parameters exist merely
4045  * to make themselves and their types known.
4046  */
4047 SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
4048     "Jail parameters");
4049 
4050 int
4051 sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4052 {
4053 	int i;
4054 	long l;
4055 	size_t s;
4056 	char numbuf[12];
4057 
4058 	switch (oidp->oid_kind & CTLTYPE)
4059 	{
4060 	case CTLTYPE_LONG:
4061 	case CTLTYPE_ULONG:
4062 		l = 0;
4063 #ifdef SCTL_MASK32
4064 		if (!(req->flags & SCTL_MASK32))
4065 #endif
4066 			return (SYSCTL_OUT(req, &l, sizeof(l)));
4067 	case CTLTYPE_INT:
4068 	case CTLTYPE_UINT:
4069 		i = 0;
4070 		return (SYSCTL_OUT(req, &i, sizeof(i)));
4071 	case CTLTYPE_STRING:
4072 		snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
4073 		return
4074 		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4075 	case CTLTYPE_STRUCT:
4076 		s = (size_t)arg2;
4077 		return (SYSCTL_OUT(req, &s, sizeof(s)));
4078 	}
4079 	return (0);
4080 }
4081 
4082 /*
4083  * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
4084  * jail creation time but cannot be changed in an existing jail.
4085  */
4086 SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
4087 SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
4088 SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
4089 SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
4090 SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
4091     "I", "Jail secure level");
4092 SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
4093     "Jail value for kern.osreldate and uname -K");
4094 SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
4095     "Jail value for kern.osrelease and uname -r");
4096 SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
4097     "I", "Jail cannot see all mounted file systems");
4098 SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
4099     "I", "Ruleset for in-jail devfs mounts");
4100 SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
4101     "B", "Jail persistence");
4102 #ifdef VIMAGE
4103 SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
4104     "E,jailsys", "Virtual network stack");
4105 #endif
4106 SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
4107     "B", "Jail is in the process of shutting down");
4108 
4109 SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
4110 SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
4111     "I", "Current number of child jails");
4112 SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
4113     "I", "Maximum number of child jails");
4114 
4115 SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
4116 SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
4117     "Jail hostname");
4118 SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
4119     "Jail NIS domainname");
4120 SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
4121     "Jail host UUID");
4122 SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
4123     "LU", "Jail host ID");
4124 
4125 SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
4126 SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
4127 
4128 #ifdef INET
4129 SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
4130     "Jail IPv4 address virtualization");
4131 SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
4132     "S,in_addr,a", "Jail IPv4 addresses");
4133 SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
4134     "B", "Do (not) use IPv4 source address selection rather than the "
4135     "primary jail IPv4 address.");
4136 #endif
4137 #ifdef INET6
4138 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
4139     "Jail IPv6 address virtualization");
4140 SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
4141     "S,in6_addr,a", "Jail IPv6 addresses");
4142 SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
4143     "B", "Do (not) use IPv6 source address selection rather than the "
4144     "primary jail IPv6 address.");
4145 #endif
4146 
4147 SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
4148 SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
4149     "B", "Jail may set hostname");
4150 SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
4151     "B", "Jail may use SYSV IPC");
4152 SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
4153     "B", "Jail may create raw sockets");
4154 SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
4155     "B", "Jail may alter system file flags");
4156 SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
4157     "B", "Jail may set file quotas");
4158 SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
4159     "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
4160 SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW,
4161     "B", "Jail may lock (unlock) physical pages in memory");
4162 SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW,
4163     "B", "Jail may bind sockets to reserved ports");
4164 SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
4165     "B", "Jail may read the kernel message buffer");
4166 SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW,
4167     "B", "Unprivileged processes may use process debugging facilities");
4168 SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW,
4169     "B", "Processes in jail with uid 0 have privilege");
4170 
4171 SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
4172 SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
4173     "B", "Jail may mount/unmount jail-friendly file systems in general");
4174 
4175 /*
4176  * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>.  Return
4177  * its associated bit in the pr_allow bitmask, or zero if the parameter was
4178  * not created.
4179  */
4180 unsigned
4181 prison_add_allow(const char *prefix, const char *name, const char *prefix_descr,
4182     const char *descr)
4183 {
4184 	struct bool_flags *bf;
4185 	struct sysctl_oid *parent;
4186 	char *allow_name, *allow_noname, *allowed;
4187 #ifndef NO_SYSCTL_DESCR
4188 	char *descr_deprecated;
4189 #endif
4190 	u_int allow_flag;
4191 
4192 	if (prefix
4193 	    ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name)
4194 		< 0 ||
4195 	      asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name)
4196 		< 0
4197 	    : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 ||
4198 	      asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) {
4199 		free(allow_name, M_PRISON);
4200 		return 0;
4201 	}
4202 
4203 	/*
4204 	 * See if this parameter has already beed added, i.e. a module was
4205 	 * previously loaded/unloaded.
4206 	 */
4207 	mtx_lock(&prison0.pr_mtx);
4208 	for (bf = pr_flag_allow;
4209 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
4210 		atomic_load_int(&bf->flag) != 0;
4211 	     bf++) {
4212 		if (strcmp(bf->name, allow_name) == 0) {
4213 			allow_flag = bf->flag;
4214 			goto no_add;
4215 		}
4216 	}
4217 
4218 	/*
4219 	 * Find a free bit in pr_allow_all, failing if there are none
4220 	 * (which shouldn't happen as long as we keep track of how many
4221 	 * potential dynamic flags exist).
4222 	 */
4223 	for (allow_flag = 1;; allow_flag <<= 1) {
4224 		if (allow_flag == 0)
4225 			goto no_add;
4226 		if ((pr_allow_all & allow_flag) == 0)
4227 			break;
4228 	}
4229 
4230 	/* Note the parameter in the next open slot in pr_flag_allow. */
4231 	for (bf = pr_flag_allow; ; bf++) {
4232 		if (bf == pr_flag_allow + nitems(pr_flag_allow)) {
4233 			/* This should never happen, but is not fatal. */
4234 			allow_flag = 0;
4235 			goto no_add;
4236 		}
4237 		if (atomic_load_int(&bf->flag) == 0)
4238 			break;
4239 	}
4240 	bf->name = allow_name;
4241 	bf->noname = allow_noname;
4242 	pr_allow_all |= allow_flag;
4243 	/*
4244 	 * prison0 always has permission for the new parameter.
4245 	 * Other jails must have it granted to them.
4246 	 */
4247 	prison0.pr_allow |= allow_flag;
4248 	/* The flag indicates a valid entry, so make sure it is set last. */
4249 	atomic_store_rel_int(&bf->flag, allow_flag);
4250 	mtx_unlock(&prison0.pr_mtx);
4251 
4252 	/*
4253 	 * Create sysctls for the paramter, and the back-compat global
4254 	 * permission.
4255 	 */
4256 	parent = prefix
4257 	    ? SYSCTL_ADD_NODE(NULL,
4258 		  SYSCTL_CHILDREN(&sysctl___security_jail_param_allow),
4259 		  OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr)
4260 	    : &sysctl___security_jail_param_allow;
4261 	(void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
4262 	    name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4263 	    NULL, 0, sysctl_jail_param, "B", descr);
4264 	if ((prefix
4265 	     ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name)
4266 	     : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) {
4267 #ifndef NO_SYSCTL_DESCR
4268 		(void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)",
4269 		    descr);
4270 #endif
4271 		(void)SYSCTL_ADD_PROC(NULL,
4272 		    SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed,
4273 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag,
4274 		    sysctl_jail_default_allow, "I", descr_deprecated);
4275 #ifndef NO_SYSCTL_DESCR
4276 		free(descr_deprecated, M_TEMP);
4277 #endif
4278 		free(allowed, M_TEMP);
4279 	}
4280 	return allow_flag;
4281 
4282  no_add:
4283 	mtx_unlock(&prison0.pr_mtx);
4284 	free(allow_name, M_PRISON);
4285 	free(allow_noname, M_PRISON);
4286 	return allow_flag;
4287 }
4288 
4289 /*
4290  * The VFS system will register jail-aware filesystems here.  They each get
4291  * a parameter allow.mount.xxxfs and a flag to check when a jailed user
4292  * attempts to mount.
4293  */
4294 void
4295 prison_add_vfs(struct vfsconf *vfsp)
4296 {
4297 #ifdef NO_SYSCTL_DESCR
4298 
4299 	vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
4300 	    NULL, NULL);
4301 #else
4302 	char *descr;
4303 
4304 	(void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system",
4305 	    vfsp->vfc_name);
4306 	vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
4307 	    NULL, descr);
4308 	free(descr, M_TEMP);
4309 #endif
4310 }
4311 
4312 #ifdef RACCT
4313 void
4314 prison_racct_foreach(void (*callback)(struct racct *racct,
4315     void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
4316     void *arg2, void *arg3)
4317 {
4318 	struct prison_racct *prr;
4319 
4320 	ASSERT_RACCT_ENABLED();
4321 
4322 	sx_slock(&allprison_lock);
4323 	if (pre != NULL)
4324 		(pre)();
4325 	LIST_FOREACH(prr, &allprison_racct, prr_next)
4326 		(callback)(prr->prr_racct, arg2, arg3);
4327 	if (post != NULL)
4328 		(post)();
4329 	sx_sunlock(&allprison_lock);
4330 }
4331 
4332 static struct prison_racct *
4333 prison_racct_find_locked(const char *name)
4334 {
4335 	struct prison_racct *prr;
4336 
4337 	ASSERT_RACCT_ENABLED();
4338 	sx_assert(&allprison_lock, SA_XLOCKED);
4339 
4340 	if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
4341 		return (NULL);
4342 
4343 	LIST_FOREACH(prr, &allprison_racct, prr_next) {
4344 		if (strcmp(name, prr->prr_name) != 0)
4345 			continue;
4346 
4347 		/* Found prison_racct with a matching name? */
4348 		prison_racct_hold(prr);
4349 		return (prr);
4350 	}
4351 
4352 	/* Add new prison_racct. */
4353 	prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
4354 	racct_create(&prr->prr_racct);
4355 
4356 	strcpy(prr->prr_name, name);
4357 	refcount_init(&prr->prr_refcount, 1);
4358 	LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
4359 
4360 	return (prr);
4361 }
4362 
4363 struct prison_racct *
4364 prison_racct_find(const char *name)
4365 {
4366 	struct prison_racct *prr;
4367 
4368 	ASSERT_RACCT_ENABLED();
4369 
4370 	sx_xlock(&allprison_lock);
4371 	prr = prison_racct_find_locked(name);
4372 	sx_xunlock(&allprison_lock);
4373 	return (prr);
4374 }
4375 
4376 void
4377 prison_racct_hold(struct prison_racct *prr)
4378 {
4379 
4380 	ASSERT_RACCT_ENABLED();
4381 
4382 	refcount_acquire(&prr->prr_refcount);
4383 }
4384 
4385 static void
4386 prison_racct_free_locked(struct prison_racct *prr)
4387 {
4388 
4389 	ASSERT_RACCT_ENABLED();
4390 	sx_assert(&allprison_lock, SA_XLOCKED);
4391 
4392 	if (refcount_release(&prr->prr_refcount)) {
4393 		racct_destroy(&prr->prr_racct);
4394 		LIST_REMOVE(prr, prr_next);
4395 		free(prr, M_PRISON_RACCT);
4396 	}
4397 }
4398 
4399 void
4400 prison_racct_free(struct prison_racct *prr)
4401 {
4402 
4403 	ASSERT_RACCT_ENABLED();
4404 	sx_assert(&allprison_lock, SA_UNLOCKED);
4405 
4406 	if (refcount_release_if_not_last(&prr->prr_refcount))
4407 		return;
4408 
4409 	sx_xlock(&allprison_lock);
4410 	prison_racct_free_locked(prr);
4411 	sx_xunlock(&allprison_lock);
4412 }
4413 
4414 static void
4415 prison_racct_attach(struct prison *pr)
4416 {
4417 	struct prison_racct *prr;
4418 
4419 	ASSERT_RACCT_ENABLED();
4420 	sx_assert(&allprison_lock, SA_XLOCKED);
4421 
4422 	prr = prison_racct_find_locked(pr->pr_name);
4423 	KASSERT(prr != NULL, ("cannot find prison_racct"));
4424 
4425 	pr->pr_prison_racct = prr;
4426 }
4427 
4428 /*
4429  * Handle jail renaming.  From the racct point of view, renaming means
4430  * moving from one prison_racct to another.
4431  */
4432 static void
4433 prison_racct_modify(struct prison *pr)
4434 {
4435 #ifdef RCTL
4436 	struct proc *p;
4437 	struct ucred *cred;
4438 #endif
4439 	struct prison_racct *oldprr;
4440 
4441 	ASSERT_RACCT_ENABLED();
4442 
4443 	sx_slock(&allproc_lock);
4444 	sx_xlock(&allprison_lock);
4445 
4446 	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
4447 		sx_xunlock(&allprison_lock);
4448 		sx_sunlock(&allproc_lock);
4449 		return;
4450 	}
4451 
4452 	oldprr = pr->pr_prison_racct;
4453 	pr->pr_prison_racct = NULL;
4454 
4455 	prison_racct_attach(pr);
4456 
4457 	/*
4458 	 * Move resource utilisation records.
4459 	 */
4460 	racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
4461 
4462 #ifdef RCTL
4463 	/*
4464 	 * Force rctl to reattach rules to processes.
4465 	 */
4466 	FOREACH_PROC_IN_SYSTEM(p) {
4467 		PROC_LOCK(p);
4468 		cred = crhold(p->p_ucred);
4469 		PROC_UNLOCK(p);
4470 		rctl_proc_ucred_changed(p, cred);
4471 		crfree(cred);
4472 	}
4473 #endif
4474 
4475 	sx_sunlock(&allproc_lock);
4476 	prison_racct_free_locked(oldprr);
4477 	sx_xunlock(&allprison_lock);
4478 }
4479 
4480 static void
4481 prison_racct_detach(struct prison *pr)
4482 {
4483 
4484 	ASSERT_RACCT_ENABLED();
4485 	sx_assert(&allprison_lock, SA_UNLOCKED);
4486 
4487 	if (pr->pr_prison_racct == NULL)
4488 		return;
4489 	prison_racct_free(pr->pr_prison_racct);
4490 	pr->pr_prison_racct = NULL;
4491 }
4492 #endif /* RACCT */
4493 
4494 #ifdef DDB
4495 
4496 static void
4497 db_show_prison(struct prison *pr)
4498 {
4499 	struct bool_flags *bf;
4500 	struct jailsys_flags *jsf;
4501 #if defined(INET) || defined(INET6)
4502 	int ii;
4503 #endif
4504 	unsigned f;
4505 #ifdef INET
4506 	char ip4buf[INET_ADDRSTRLEN];
4507 #endif
4508 #ifdef INET6
4509 	char ip6buf[INET6_ADDRSTRLEN];
4510 #endif
4511 
4512 	db_printf("prison %p:\n", pr);
4513 	db_printf(" jid             = %d\n", pr->pr_id);
4514 	db_printf(" name            = %s\n", pr->pr_name);
4515 	db_printf(" parent          = %p\n", pr->pr_parent);
4516 	db_printf(" ref             = %d\n", pr->pr_ref);
4517 	db_printf(" uref            = %d\n", pr->pr_uref);
4518 	db_printf(" state           = %s\n",
4519 	    pr->pr_state == PRISON_STATE_ALIVE ? "alive" :
4520 	    pr->pr_state == PRISON_STATE_DYING ? "dying" :
4521 	    "invalid");
4522 	db_printf(" path            = %s\n", pr->pr_path);
4523 	db_printf(" cpuset          = %d\n", pr->pr_cpuset
4524 	    ? pr->pr_cpuset->cs_id : -1);
4525 #ifdef VIMAGE
4526 	db_printf(" vnet            = %p\n", pr->pr_vnet);
4527 #endif
4528 	db_printf(" root            = %p\n", pr->pr_root);
4529 	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
4530 	db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
4531 	db_printf(" children.max    = %d\n", pr->pr_childmax);
4532 	db_printf(" children.cur    = %d\n", pr->pr_childcount);
4533 	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
4534 	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
4535 	db_printf(" flags           = 0x%x", pr->pr_flags);
4536 	for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++)
4537 		if (pr->pr_flags & bf->flag)
4538 			db_printf(" %s", bf->name);
4539 	for (jsf = pr_flag_jailsys;
4540 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
4541 	     jsf++) {
4542 		f = pr->pr_flags & (jsf->disable | jsf->new);
4543 		db_printf(" %-16s= %s\n", jsf->name,
4544 		    (f != 0 && f == jsf->disable) ? "disable"
4545 		    : (f == jsf->new) ? "new"
4546 		    : "inherit");
4547 	}
4548 	db_printf(" allow           = 0x%x", pr->pr_allow);
4549 	for (bf = pr_flag_allow;
4550 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
4551 		atomic_load_int(&bf->flag) != 0;
4552 	     bf++)
4553 		if (pr->pr_allow & bf->flag)
4554 			db_printf(" %s", bf->name);
4555 	db_printf("\n");
4556 	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
4557 	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
4558 	db_printf(" host.domainname = %s\n", pr->pr_domainname);
4559 	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
4560 	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
4561 #ifdef INET
4562 	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
4563 	for (ii = 0; ii < pr->pr_ip4s; ii++)
4564 		db_printf(" %s %s\n",
4565 		    ii == 0 ? "ip4.addr        =" : "                 ",
4566 		    inet_ntoa_r(pr->pr_ip4[ii], ip4buf));
4567 #endif
4568 #ifdef INET6
4569 	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
4570 	for (ii = 0; ii < pr->pr_ip6s; ii++)
4571 		db_printf(" %s %s\n",
4572 		    ii == 0 ? "ip6.addr        =" : "                 ",
4573 		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
4574 #endif
4575 }
4576 
4577 DB_SHOW_COMMAND(prison, db_show_prison_command)
4578 {
4579 	struct prison *pr;
4580 
4581 	if (!have_addr) {
4582 		/*
4583 		 * Show all prisons in the list, and prison0 which is not
4584 		 * listed.
4585 		 */
4586 		db_show_prison(&prison0);
4587 		if (!db_pager_quit) {
4588 			TAILQ_FOREACH(pr, &allprison, pr_list) {
4589 				db_show_prison(pr);
4590 				if (db_pager_quit)
4591 					break;
4592 			}
4593 		}
4594 		return;
4595 	}
4596 
4597 	if (addr == 0)
4598 		pr = &prison0;
4599 	else {
4600 		/* Look for a prison with the ID and with references. */
4601 		TAILQ_FOREACH(pr, &allprison, pr_list)
4602 			if (pr->pr_id == addr && pr->pr_ref > 0)
4603 				break;
4604 		if (pr == NULL)
4605 			/* Look again, without requiring a reference. */
4606 			TAILQ_FOREACH(pr, &allprison, pr_list)
4607 				if (pr->pr_id == addr)
4608 					break;
4609 		if (pr == NULL)
4610 			/* Assume address points to a valid prison. */
4611 			pr = (struct prison *)addr;
4612 	}
4613 	db_show_prison(pr);
4614 }
4615 
4616 #endif /* DDB */
4617