xref: /freebsd/sys/kern/kern_jail.c (revision 908f215e80fa482aa953c39afa6bb516f561fc00)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 1999 Poul-Henning Kamp.
5  * Copyright (c) 2008 Bjoern A. Zeeb.
6  * Copyright (c) 2009 James Gritton.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 #include "opt_ddb.h"
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 #include "opt_nfs.h"
36 
37 #include <sys/param.h>
38 #include <sys/types.h>
39 #include <sys/kernel.h>
40 #include <sys/systm.h>
41 #include <sys/errno.h>
42 #include <sys/sysproto.h>
43 #include <sys/malloc.h>
44 #include <sys/osd.h>
45 #include <sys/priv.h>
46 #include <sys/proc.h>
47 #include <sys/epoch.h>
48 #include <sys/taskqueue.h>
49 #include <sys/fcntl.h>
50 #include <sys/jail.h>
51 #include <sys/linker.h>
52 #include <sys/lock.h>
53 #include <sys/mman.h>
54 #include <sys/mutex.h>
55 #include <sys/racct.h>
56 #include <sys/rctl.h>
57 #include <sys/refcount.h>
58 #include <sys/sx.h>
59 #include <sys/sysent.h>
60 #include <sys/namei.h>
61 #include <sys/mount.h>
62 #include <sys/queue.h>
63 #include <sys/socket.h>
64 #include <sys/syscallsubr.h>
65 #include <sys/sysctl.h>
66 #include <sys/uuid.h>
67 #include <sys/vnode.h>
68 
69 #include <net/if.h>
70 #include <net/vnet.h>
71 
72 #include <netinet/in.h>
73 
74 #ifdef DDB
75 #include <ddb/ddb.h>
76 #endif /* DDB */
77 
78 #include <security/mac/mac_framework.h>
79 
80 #define	PRISON0_HOSTUUID_MODULE	"hostuuid"
81 
82 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
83 static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
84 
/*
 * Keep struct prison prison0 and some code in kern_jail_set() readable.
 *
 * _PR_IP_SADDRSEL is the set of source-address-selection flags for the
 * address families compiled into this kernel (0 when there are none).
 * Every replacement list is parenthesized so that the macro always acts as
 * a single operand, even when it is used next to an operator that binds
 * tighter than '|' (such as '&', '~' or '==').
 */
#ifdef INET
#ifdef INET6
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL | PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL)
#endif
#else /* !INET */
#ifdef INET6
#define	_PR_IP_SADDRSEL	(PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	(0)
#endif
#endif
99 
/*
 * prison0 describes what is "real" about the system.
 *
 * It is statically initialized with jail ID 0, name "0" and path "/", in
 * the PRISON_STATE_ALIVE state, holding one reference and one user
 * reference.  The parts that can't be static-initialized with constants
 * are filled in later by prison0_init().
 */
struct prison prison0 = {
	.pr_id		= 0,
	.pr_name	= "0",
	.pr_ref		= 1,
	.pr_uref	= 1,
	.pr_path	= "/",
	.pr_securelevel	= -1,
	.pr_devfs_rsnum = 0,
	.pr_state	= PRISON_STATE_ALIVE,
	.pr_childmax	= JAIL_MAX,
	.pr_hostuuid	= DEFAULT_HOSTUUID,
	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
#ifdef VIMAGE
	/* With VIMAGE, prison0 additionally carries PR_VNET. */
	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
#else
	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
#endif
	.pr_allow	= PR_ALLOW_ALL_STATIC,
};
/* prison0.pr_mtx is set up through the SYSINIT mechanism. */
MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
121 
/*
 * A boolean jail parameter: "name" is the parameter that sets the flag,
 * "noname" is the parameter that clears it, and "flag" is the bit itself.
 * kern_jail() reads "flag" with atomic_load_int() and treats a zero flag
 * as the end of the pr_flag_allow table.
 */
struct bool_flags {
	const char	*name;
	const char	*noname;
	volatile u_int	 flag;
};
/*
 * A jail subsystem parameter that takes a JAIL_SYS_* value.  In
 * kern_jail_set(), JAIL_SYS_DISABLE sets the "disable" bits (and is
 * rejected with EINVAL when "disable" is zero), and JAIL_SYS_NEW sets the
 * "new" bits.
 */
struct jailsys_flags {
	const char	*name;
	unsigned	 disable;
	unsigned	 new;
};
132 
/* allprison, allprison_racct and lastprid are protected by allprison_lock. */
struct	sx allprison_lock;
SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
LIST_HEAD(, prison_racct) allprison_racct;
int	lastprid = 0;
int	lastdeadid = 0;

/* Prototypes for functions private to this file. */
static int get_next_prid(struct prison **insprp);
static int get_next_deadid(struct prison **insprp);
static int do_jail_attach(struct thread *td, struct prison *pr, int drflags);
static void prison_complete(void *context, int pending);
static void prison_deref(struct prison *pr, int flags);
static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison);
static int prison_lock_xlock(struct prison *pr, int flags);
static void prison_cleanup(struct prison *pr);
static void prison_free_not_last(struct prison *pr);
static void prison_proc_free_not_last(struct prison *pr);
static void prison_proc_relink(struct prison *opr, struct prison *npr,
    struct proc *p);
static void prison_set_allow_locked(struct prison *pr, unsigned flag,
    int enable);
static char *prison_path(struct prison *pr1, struct prison *pr2);
#ifdef RACCT
static void prison_racct_attach(struct prison *pr);
static void prison_racct_modify(struct prison *pr);
static void prison_racct_detach(struct prison *pr);
#endif

/*
 * Flags for prison_deref.  The low bits select the operation and the high
 * bits describe which locks are held.
 */
#define	PD_DEREF	0x01	/* Decrement pr_ref */
#define	PD_DEUREF	0x02	/* Decrement pr_uref */
#define	PD_KILL		0x04	/* Remove jail, kill processes, etc */
#define	PD_LOCKED	0x10	/* pr_mtx is held */
#define	PD_LIST_SLOCKED	0x20	/* allprison_lock is held shared */
#define	PD_LIST_XLOCKED	0x40	/* allprison_lock is held exclusive */
#define PD_OP_FLAGS	0x07	/* Operation flags: PD_DEREF|PD_DEUREF|PD_KILL */
#define PD_LOCK_FLAGS	0x70	/* Lock status flags: PD_LOCKED|PD_LIST_[SX]LOCKED */
171 
/*
 * Parameter names corresponding to PR_* flag values.  Size values are for kvm
 * as we cannot figure out the size of a sparse array, or an array without a
 * terminating entry.
 *
 * Each entry pairs a boolean parameter and its "no" form with one PR_*
 * flag.  The per-family entries exist only when that address family is
 * compiled in.
 */
static struct bool_flags pr_flag_bool[] = {
	{"persist", "nopersist", PR_PERSIST},
#ifdef INET
	{"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL},
#endif
#ifdef INET6
	{"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL},
#endif
};
/* Size of the table in bytes (not the number of entries). */
const size_t pr_flag_bool_size = sizeof(pr_flag_bool);
187 
/*
 * Jail subsystem parameters that take a JAIL_SYS_* value.  "host" and
 * "vnet" have no disable flag, so kern_jail_set() rejects JAIL_SYS_DISABLE
 * for them with EINVAL.
 */
static struct jailsys_flags pr_flag_jailsys[] = {
	{"host", 0, PR_HOST},
#ifdef VIMAGE
	{"vnet", 0, PR_VNET},
#endif
#ifdef INET
	{"ip4", PR_IP4_USER, PR_IP4_USER},
#endif
#ifdef INET6
	{"ip6", PR_IP6_USER, PR_IP6_USER},
#endif
};
/* Size of the table in bytes (not the number of entries). */
const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
201 
/*
 * Make this array full-size so dynamic parameters can be added.
 * It is protected by prison0.pr_mtx, but lockless reading is allowed
 * with an atomic check of the flag values.
 *
 * Slots without an initializer have a zero flag; kern_jail() stops walking
 * the array at the first such slot.
 */
static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
	{"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME},
	{"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC},
	{"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS},
	{"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS},
	{"allow.mount", "allow.nomount", PR_ALLOW_MOUNT},
	{"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS},
	{"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF},
	{"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK},
	{"allow.reserved_ports", "allow.noreserved_ports",
	 PR_ALLOW_RESERVED_PORTS},
	{"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF},
	{"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug",
	 PR_ALLOW_UNPRIV_DEBUG},
	{"allow.suser", "allow.nosuser", PR_ALLOW_SUSER},
#ifdef VIMAGE
	{"allow.nfsd", "allow.nonfsd", PR_ALLOW_NFSD},
#endif
	{"allow.extattr", "allow.noextattr", PR_ALLOW_EXTATTR},
	{"allow.adjtime", "allow.noadjtime", PR_ALLOW_ADJTIME},
	{"allow.settime", "allow.nosettime", PR_ALLOW_SETTIME},
};
/* Starts out as the statically defined allow bits. */
static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC;
/* Size of the whole (full-size) table in bytes. */
const size_t pr_flag_allow_size = sizeof(pr_flag_allow);
231 
/*
 * Built-in defaults and the variables that hold the current values.
 * kern_jail() applies jail_default_allow and jail_default_enforce_statfs
 * to jails created by a caller that is not itself jailed.
 */
#define	JAIL_DEFAULT_ALLOW		(PR_ALLOW_SET_HOSTNAME | \
					 PR_ALLOW_RESERVED_PORTS | \
					 PR_ALLOW_UNPRIV_DEBUG | \
					 PR_ALLOW_SUSER)
#define	JAIL_DEFAULT_ENFORCE_STATFS	2
#define	JAIL_DEFAULT_DEVFS_RSNUM	0
static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
#if defined(INET) || defined(INET6)
/* kern_jail() rejects a request with more addresses than this per family. */
static unsigned jail_max_af_ips = 255;
#endif
244 
245 /*
246  * Initialize the parts of prison0 that can't be static-initialized with
247  * constants.  This is called from proc0_init() after creating thread0 cpuset.
248  */
249 void
250 prison0_init(void)
251 {
252 	uint8_t *file, *data;
253 	size_t size;
254 	char buf[sizeof(prison0.pr_hostuuid)];
255 	bool valid;
256 
257 	prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
258 	prison0.pr_osreldate = osreldate;
259 	strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
260 
261 	/* If we have a preloaded hostuuid, use it. */
262 	file = preload_search_by_type(PRISON0_HOSTUUID_MODULE);
263 	if (file != NULL) {
264 		data = preload_fetch_addr(file);
265 		size = preload_fetch_size(file);
266 		if (data != NULL) {
267 			/*
268 			 * The preloaded data may include trailing whitespace, almost
269 			 * certainly a newline; skip over any whitespace or
270 			 * non-printable characters to be safe.
271 			 */
272 			while (size > 0 && data[size - 1] <= 0x20) {
273 				size--;
274 			}
275 
276 			valid = false;
277 
278 			/*
279 			 * Not NUL-terminated when passed from loader, but
280 			 * validate_uuid requires that due to using sscanf (as
281 			 * does the subsequent strlcpy, since it still reads
282 			 * past the given size to return the true length);
283 			 * bounce to a temporary buffer to fix.
284 			 */
285 			if (size >= sizeof(buf))
286 				goto done;
287 
288 			memcpy(buf, data, size);
289 			buf[size] = '\0';
290 
291 			if (validate_uuid(buf, size, NULL, 0) != 0)
292 				goto done;
293 
294 			valid = true;
295 			(void)strlcpy(prison0.pr_hostuuid, buf,
296 			    sizeof(prison0.pr_hostuuid));
297 
298 done:
299 			if (bootverbose && !valid) {
300 				printf("hostuuid: preload data malformed: '%.*s'\n",
301 				    (int)size, data);
302 			}
303 		}
304 	}
305 	if (bootverbose)
306 		printf("hostuuid: using %s\n", prison0.pr_hostuuid);
307 }
308 
309 /*
310  * struct jail_args {
311  *	struct jail *jail;
312  * };
313  */
314 int
315 sys_jail(struct thread *td, struct jail_args *uap)
316 {
317 	uint32_t version;
318 	int error;
319 	struct jail j;
320 
321 	error = copyin(uap->jail, &version, sizeof(uint32_t));
322 	if (error)
323 		return (error);
324 
325 	switch (version) {
326 	case 0:
327 	{
328 		struct jail_v0 j0;
329 
330 		/* FreeBSD single IPv4 jails. */
331 		bzero(&j, sizeof(struct jail));
332 		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
333 		if (error)
334 			return (error);
335 		j.version = j0.version;
336 		j.path = j0.path;
337 		j.hostname = j0.hostname;
338 		j.ip4s = htonl(j0.ip_number);	/* jail_v0 is host order */
339 		break;
340 	}
341 
342 	case 1:
343 		/*
344 		 * Version 1 was used by multi-IPv4 jail implementations
345 		 * that never made it into the official kernel.
346 		 */
347 		return (EINVAL);
348 
349 	case 2:	/* JAIL_API_VERSION */
350 		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
351 		error = copyin(uap->jail, &j, sizeof(struct jail));
352 		if (error)
353 			return (error);
354 		break;
355 
356 	default:
357 		/* Sci-Fi jails are not supported, sorry. */
358 		return (EINVAL);
359 	}
360 	return (kern_jail(td, &j));
361 }
362 
363 int
364 kern_jail(struct thread *td, struct jail *j)
365 {
366 	struct iovec optiov[2 * (4 + nitems(pr_flag_allow)
367 #ifdef INET
368 			    + 1
369 #endif
370 #ifdef INET6
371 			    + 1
372 #endif
373 			    )];
374 	struct uio opt;
375 	char *u_path, *u_hostname, *u_name;
376 	struct bool_flags *bf;
377 #ifdef INET
378 	uint32_t ip4s;
379 	struct in_addr *u_ip4;
380 #endif
381 #ifdef INET6
382 	struct in6_addr *u_ip6;
383 #endif
384 	size_t tmplen;
385 	int error, enforce_statfs;
386 
387 	bzero(&optiov, sizeof(optiov));
388 	opt.uio_iov = optiov;
389 	opt.uio_iovcnt = 0;
390 	opt.uio_offset = -1;
391 	opt.uio_resid = -1;
392 	opt.uio_segflg = UIO_SYSSPACE;
393 	opt.uio_rw = UIO_READ;
394 	opt.uio_td = td;
395 
396 	/* Set permissions for top-level jails from sysctls. */
397 	if (!jailed(td->td_ucred)) {
398 		for (bf = pr_flag_allow;
399 		     bf < pr_flag_allow + nitems(pr_flag_allow) &&
400 			atomic_load_int(&bf->flag) != 0;
401 		     bf++) {
402 			optiov[opt.uio_iovcnt].iov_base = __DECONST(char *,
403 			    (jail_default_allow & bf->flag)
404 			    ? bf->name : bf->noname);
405 			optiov[opt.uio_iovcnt].iov_len =
406 			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
407 			opt.uio_iovcnt += 2;
408 		}
409 		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
410 		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
411 		opt.uio_iovcnt++;
412 		enforce_statfs = jail_default_enforce_statfs;
413 		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
414 		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
415 		opt.uio_iovcnt++;
416 	}
417 
418 	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
419 #ifdef INET
420 	ip4s = (j->version == 0) ? 1 : j->ip4s;
421 	if (ip4s > jail_max_af_ips)
422 		return (EINVAL);
423 	tmplen += ip4s * sizeof(struct in_addr);
424 #else
425 	if (j->ip4s > 0)
426 		return (EINVAL);
427 #endif
428 #ifdef INET6
429 	if (j->ip6s > jail_max_af_ips)
430 		return (EINVAL);
431 	tmplen += j->ip6s * sizeof(struct in6_addr);
432 #else
433 	if (j->ip6s > 0)
434 		return (EINVAL);
435 #endif
436 	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
437 	u_hostname = u_path + MAXPATHLEN;
438 	u_name = u_hostname + MAXHOSTNAMELEN;
439 #ifdef INET
440 	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
441 #endif
442 #ifdef INET6
443 #ifdef INET
444 	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
445 #else
446 	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
447 #endif
448 #endif
449 	optiov[opt.uio_iovcnt].iov_base = "path";
450 	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
451 	opt.uio_iovcnt++;
452 	optiov[opt.uio_iovcnt].iov_base = u_path;
453 	error = copyinstr(j->path, u_path, MAXPATHLEN,
454 	    &optiov[opt.uio_iovcnt].iov_len);
455 	if (error) {
456 		free(u_path, M_TEMP);
457 		return (error);
458 	}
459 	opt.uio_iovcnt++;
460 	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
461 	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
462 	opt.uio_iovcnt++;
463 	optiov[opt.uio_iovcnt].iov_base = u_hostname;
464 	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
465 	    &optiov[opt.uio_iovcnt].iov_len);
466 	if (error) {
467 		free(u_path, M_TEMP);
468 		return (error);
469 	}
470 	opt.uio_iovcnt++;
471 	if (j->jailname != NULL) {
472 		optiov[opt.uio_iovcnt].iov_base = "name";
473 		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
474 		opt.uio_iovcnt++;
475 		optiov[opt.uio_iovcnt].iov_base = u_name;
476 		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
477 		    &optiov[opt.uio_iovcnt].iov_len);
478 		if (error) {
479 			free(u_path, M_TEMP);
480 			return (error);
481 		}
482 		opt.uio_iovcnt++;
483 	}
484 #ifdef INET
485 	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
486 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
487 	opt.uio_iovcnt++;
488 	optiov[opt.uio_iovcnt].iov_base = u_ip4;
489 	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
490 	if (j->version == 0)
491 		u_ip4->s_addr = j->ip4s;
492 	else {
493 		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
494 		if (error) {
495 			free(u_path, M_TEMP);
496 			return (error);
497 		}
498 	}
499 	opt.uio_iovcnt++;
500 #endif
501 #ifdef INET6
502 	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
503 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
504 	opt.uio_iovcnt++;
505 	optiov[opt.uio_iovcnt].iov_base = u_ip6;
506 	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
507 	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
508 	if (error) {
509 		free(u_path, M_TEMP);
510 		return (error);
511 	}
512 	opt.uio_iovcnt++;
513 #endif
514 	KASSERT(opt.uio_iovcnt <= nitems(optiov),
515 		("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
516 	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
517 	free(u_path, M_TEMP);
518 	return (error);
519 }
520 
521 /*
522  * struct jail_set_args {
523  *	struct iovec *iovp;
524  *	unsigned int iovcnt;
525  *	int flags;
526  * };
527  */
528 int
529 sys_jail_set(struct thread *td, struct jail_set_args *uap)
530 {
531 	struct uio *auio;
532 	int error;
533 
534 	/* Check that we have an even number of iovecs. */
535 	if (uap->iovcnt & 1)
536 		return (EINVAL);
537 
538 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
539 	if (error)
540 		return (error);
541 	error = kern_jail_set(td, auio, uap->flags);
542 	freeuio(auio);
543 	return (error);
544 }
545 
546 #if defined(INET) || defined(INET6)
/* Ordering function for two addresses; also used to test for equality. */
typedef int prison_addr_cmp_t(const void *, const void *);
/* Predicate deciding whether a single address is acceptable. */
typedef bool prison_addr_valid_t(const void *);
/*
 * Per-address-family constants and callbacks, indexed by pr_family_t:
 *	size	- size in bytes of one address
 *	cmp	- comparison function (passed to qsort and used for lookups)
 *	valid	- validity check applied to each address copied in
 *	ip_flag	- the PR_IP*_USER flag tested in a prison's pr_flags
 * Only the families compiled into the kernel get an entry.
 */
static const struct pr_family {
	size_t			size;
	prison_addr_cmp_t	*cmp;
	prison_addr_valid_t	*valid;
	int			ip_flag;
} pr_families[PR_FAMILY_MAX] = {
#ifdef INET
	[PR_INET] = {
		.size = sizeof(struct in_addr),
		.cmp = prison_qcmp_v4,
		.valid = prison_valid_v4,
		.ip_flag = PR_IP4_USER,
	 },
#endif
#ifdef INET6
	[PR_INET6] = {
		.size = sizeof(struct in6_addr),
		.cmp = prison_qcmp_v6,
		.valid = prison_valid_v6,
		.ip_flag = PR_IP6_USER,
	},
#endif
};
572 
/*
 * Network address lists (pr_addrs) allocation for jails.  The addresses
 * are accessed locklessly by the network stack, thus need to be protected by
 * the network epoch.
 *
 * "ctx" must remain the first member: the deferred free callback receives
 * a pointer to it and frees that pointer as the whole allocation.  "ips" is
 * the number of addresses, and "pr_ip" holds them back to back, each
 * pr_families[af].size bytes long (see PR_IP()).
 */
struct prison_ip {
	struct epoch_context ctx;
	uint32_t	ips;
#ifdef FUTURE_C
	/*
	 * XXX Variable-length automatic arrays in union may be
	 * supported in future C.
	 */
	union {
		char pr_ip[];
		struct in_addr pr_ip4[];
		struct in6_addr pr_ip6[];
	};
#else /* No future C :( */
	char pr_ip[];
#endif
};
595 
596 static char *
597 PR_IP(struct prison_ip *pip, const pr_family_t af, int idx)
598 {
599 	MPASS(pip);
600 	MPASS(af < PR_FAMILY_MAX);
601 	MPASS(idx >= 0 && idx < pip->ips);
602 
603 	return (pip->pr_ip + pr_families[af].size * idx);
604 }
605 
606 static struct prison_ip *
607 prison_ip_alloc(const pr_family_t af, uint32_t cnt, int flags)
608 {
609 	struct prison_ip *pip;
610 
611 	pip = malloc(sizeof(struct prison_ip) + cnt * pr_families[af].size,
612 	    M_PRISON, flags);
613 	if (pip != NULL)
614 		pip->ips = cnt;
615 	return (pip);
616 }
617 
618 /*
619  * Allocate and copyin user supplied address list, sorting and validating.
620  * kern_jail_set() helper.
621  */
622 static struct prison_ip *
623 prison_ip_copyin(const pr_family_t af, void *op, uint32_t cnt)
624 {
625 	prison_addr_cmp_t *const cmp = pr_families[af].cmp;
626 	const size_t size = pr_families[af].size;
627 	struct prison_ip *pip;
628 
629 	pip = prison_ip_alloc(af, cnt, M_WAITOK);
630 	bcopy(op, pip->pr_ip, cnt * size);
631 	/*
632 	 * IP addresses are all sorted but ip[0] to preserve
633 	 * the primary IP address as given from userland.
634 	 * This special IP is used for unbound outgoing
635 	 * connections as well for "loopback" traffic in case
636 	 * source address selection cannot find any more fitting
637 	 * address to connect from.
638 	 */
639 	if (cnt > 1)
640 		qsort(PR_IP(pip, af, 1), cnt - 1, size, cmp);
641 	/*
642 	 * Check for duplicate addresses and do some simple
643 	 * zero and broadcast checks. If users give other bogus
644 	 * addresses it is their problem.
645 	 */
646 	for (int i = 0; i < cnt; i++) {
647 		if (!pr_families[af].valid(PR_IP(pip, af, i))) {
648 			free(pip, M_PRISON);
649 			return (NULL);
650 		}
651 		if (i + 1 < cnt &&
652 		    (cmp(PR_IP(pip, af, 0), PR_IP(pip, af, i + 1)) == 0 ||
653 		     cmp(PR_IP(pip, af, i), PR_IP(pip, af, i + 1)) == 0)) {
654 			free(pip, M_PRISON);
655 			return (NULL);
656 		}
657 	}
658 
659 	return (pip);
660 }
661 
662 /*
663  * Allocate and dup parent prison address list.
664  * kern_jail_set() helper.
665  */
666 static void
667 prison_ip_dup(struct prison *ppr, struct prison *pr, const pr_family_t af)
668 {
669 	const struct prison_ip *ppip = ppr->pr_addrs[af];
670 	struct prison_ip *pip;
671 
672 	if (ppip != NULL) {
673 		pip = prison_ip_alloc(af, ppip->ips, M_WAITOK);
674 		bcopy(ppip->pr_ip, pip->pr_ip, pip->ips * pr_families[af].size);
675 		pr->pr_addrs[af] = pip;
676 	}
677 }
678 
679 /*
680  * Make sure the new set of IP addresses is a subset of the parent's list.
681  * Don't worry about the parent being unlocked, as any setting is done with
682  * allprison_lock held.
683  * kern_jail_set() helper.
684  */
685 static bool
686 prison_ip_parent_match(struct prison_ip *ppip, struct prison_ip *pip,
687     const pr_family_t af)
688 {
689 	prison_addr_cmp_t *const cmp = pr_families[af].cmp;
690 	int i, j;
691 
692 	if (ppip == NULL)
693 		return (false);
694 
695 	for (i = 0; i < ppip->ips; i++)
696 		if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, i)) == 0)
697 			break;
698 
699 	if (i == ppip->ips)
700 		/* Main address not present in parent. */
701 		return (false);
702 
703 	if (pip->ips > 1) {
704 		for (i = j = 1; i < pip->ips; i++) {
705 			if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0)
706 				/* Equals to parent primary address. */
707 				continue;
708 			for (; j < ppip->ips; j++)
709 				if (cmp(PR_IP(pip, af, i),
710 				    PR_IP(ppip, af, j)) == 0)
711 					break;
712 			if (j == ppip->ips)
713 				break;
714 		}
715 		if (j == ppip->ips)
716 			/* Address not present in parent. */
717 			return (false);
718 	}
719 	return (true);
720 }
721 
722 /*
723  * Check for conflicting IP addresses.  We permit them if there is no more
724  * than one IP on each jail.  If there is a duplicate on a jail with more
725  * than one IP stop checking and return error.
726  * kern_jail_set() helper.
727  */
728 static bool
729 prison_ip_conflict_check(const struct prison *ppr, const struct prison *pr,
730     struct prison_ip *pip, pr_family_t af)
731 {
732 	const struct prison *tppr, *tpr;
733 	int descend;
734 
735 #ifdef VIMAGE
736 	for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
737 		if (tppr->pr_flags & PR_VNET)
738 			break;
739 #else
740 	tppr = &prison0;
741 #endif
742 	FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
743 		if (tpr == pr ||
744 #ifdef VIMAGE
745 		    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
746 #endif
747 		    !prison_isalive(tpr)) {
748 			descend = 0;
749 			continue;
750 		}
751 		if (!(tpr->pr_flags & pr_families[af].ip_flag))
752 			continue;
753 		descend = 0;
754 		if (tpr->pr_addrs[af] == NULL ||
755 		    (pip->ips == 1 && tpr->pr_addrs[af]->ips == 1))
756 			continue;
757 		for (int i = 0; i < pip->ips; i++)
758 			if (prison_ip_check(tpr, af, PR_IP(pip, af, i)) == 0)
759 				return (false);
760 	}
761 
762 	return (true);
763 }
764 
765 _Static_assert(offsetof(struct prison_ip, ctx) == 0,
766     "prison must start with epoch context");
767 static void
768 prison_ip_free_deferred(epoch_context_t ctx)
769 {
770 
771 	free(ctx, M_PRISON);
772 }
773 
774 static void
775 prison_ip_free(struct prison_ip *pip)
776 {
777 
778 	if (pip != NULL)
779 		NET_EPOCH_CALL(prison_ip_free_deferred, &pip->ctx);
780 }
781 
782 static void
783 prison_ip_set(struct prison *pr, const pr_family_t af, struct prison_ip *new)
784 {
785 	struct prison_ip **mem, *old;
786 
787 	mtx_assert(&pr->pr_mtx, MA_OWNED);
788 
789 	mem = &pr->pr_addrs[af];
790 
791 	old = *mem;
792 	atomic_store_ptr(mem, new);
793 	prison_ip_free(old);
794 }
795 
796 /*
797  * Restrict a prison's IP address list with its parent's, possibly replacing
798  * it.  Return true if succeed, otherwise should redo.
799  * kern_jail_set() helper.
800  */
801 static bool
802 prison_ip_restrict(struct prison *pr, const pr_family_t af,
803     struct prison_ip **newp)
804 {
805 	struct prison_ip *ppip = pr->pr_parent->pr_addrs[af];
806 	struct prison_ip *pip = pr->pr_addrs[af];
807 	int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
808 	const size_t size = pr_families[af].size;
809 	struct prison_ip *new = newp != NULL ? *newp : NULL;
810 	uint32_t ips;
811 
812 	mtx_assert(&pr->pr_mtx, MA_OWNED);
813 
814 	/*
815 	 * Due to epoch-synchronized access to the IP address lists we always
816 	 * allocate a new list even if the old one has enough space.  We could
817 	 * atomically update an IPv4 address inside a list, but that would
818 	 * screw up sorting, and in case of IPv6 we can't even atomically write
819 	 * one.
820 	 */
821 	if (ppip == NULL) {
822 		if (pip != NULL)
823 			prison_ip_set(pr, af, NULL);
824 		return (true);
825 	}
826 
827 	if (!(pr->pr_flags & pr_families[af].ip_flag)) {
828 		if (new == NULL) {
829 			new = prison_ip_alloc(af, ppip->ips, M_NOWAIT);
830 			if (new == NULL)
831 				return (false); /* Redo */
832 		}
833 		/* This has no user settings, so just copy the parent's list. */
834 		MPASS(new->ips == ppip->ips);
835 		bcopy(ppip->pr_ip, new->pr_ip, ppip->ips * size);
836 		prison_ip_set(pr, af, new);
837 		if (newp != NULL)
838 			*newp = NULL; /* Used */
839 	} else if (pip != NULL) {
840 		/* Remove addresses that aren't in the parent. */
841 		int i;
842 
843 		i = 0; /* index in pip */
844 		ips = 0; /* index in new */
845 
846 		if (new == NULL) {
847 			new = prison_ip_alloc(af, pip->ips, M_NOWAIT);
848 			if (new == NULL)
849 				return (false); /* Redo */
850 		}
851 
852 		for (int pi = 0; pi < ppip->ips; pi++)
853 			if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, pi)) == 0) {
854 				/* Found our primary address in parent. */
855 				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
856 				    size);
857 				i++;
858 				ips++;
859 				break;
860 			}
861 		for (int pi = 1; i < pip->ips; ) {
862 			/* Check against primary, which is unsorted. */
863 			if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0) {
864 				/* Matches parent's primary address. */
865 				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
866 				    size);
867 				i++;
868 				ips++;
869 				continue;
870 			}
871 			/* The rest are sorted. */
872 			switch (pi >= ppip->ips ? -1 :
873 				cmp(PR_IP(pip, af, i), PR_IP(ppip, af, pi))) {
874 			case -1:
875 				i++;
876 				break;
877 			case 0:
878 				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
879 				    size);
880 				i++;
881 				pi++;
882 				ips++;
883 				break;
884 			case 1:
885 				pi++;
886 				break;
887 			}
888 		}
889 		if (ips == 0) {
890 			if (newp == NULL || *newp == NULL)
891 				prison_ip_free(new);
892 			new = NULL;
893 		} else {
894 			/* Shrink to real size */
895 			KASSERT((new->ips >= ips),
896 			    ("Out-of-bounds write to prison_ip %p", new));
897 			new->ips = ips;
898 		}
899 		prison_ip_set(pr, af, new);
900 		if (newp != NULL)
901 			*newp = NULL; /* Used */
902 	}
903 	return (true);
904 }
905 
906 /*
907  * Fast-path check if an address belongs to a prison.
908  */
909 int
910 prison_ip_check(const struct prison *pr, const pr_family_t af,
911     const void *addr)
912 {
913 	int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
914 	struct prison_ip *pip;
915 	int i, a, z, d;
916 
917 	MPASS(mtx_owned(&pr->pr_mtx) ||
918 	    in_epoch(net_epoch_preempt) ||
919 	    sx_xlocked(&allprison_lock));
920 
921 	pip = atomic_load_ptr(&pr->pr_addrs[af]);
922 	if (__predict_false(pip == NULL))
923 		return (EAFNOSUPPORT);
924 
925 	/* Check the primary IP. */
926 	if (cmp(PR_IP(pip, af, 0), addr) == 0)
927 		return (0);
928 
929 	/*
930 	 * All the other IPs are sorted so we can do a binary search.
931 	 */
932 	a = 0;
933 	z = pip->ips - 2;
934 	while (a <= z) {
935 		i = (a + z) / 2;
936 		d = cmp(PR_IP(pip, af, i + 1), addr);
937 		if (d > 0)
938 			z = i - 1;
939 		else if (d < 0)
940 			a = i + 1;
941 		else
942 			return (0);
943 	}
944 
945 	return (EADDRNOTAVAIL);
946 }
947 
948 /*
949  * Grab primary IP.  Historically required mutex, but nothing prevents
950  * us to support epoch-protected access.  Is it used in fast path?
951  * in{6}_jail.c helper
952  */
953 const void *
954 prison_ip_get0(const struct prison *pr, const pr_family_t af)
955 {
956 	const struct prison_ip *pip = pr->pr_addrs[af];
957 
958 	mtx_assert(&pr->pr_mtx, MA_OWNED);
959 	MPASS(pip);
960 
961 	return (pip->pr_ip);
962 }
963 
964 u_int
965 prison_ip_cnt(const struct prison *pr, const pr_family_t af)
966 {
967 
968 	return (pr->pr_addrs[af]->ips);
969 }
970 #endif	/* defined(INET) || defined(INET6) */
971 
972 int
973 kern_jail_set(struct thread *td, struct uio *optuio, int flags)
974 {
975 	struct nameidata nd;
976 #ifdef INET
977 	struct prison_ip *ip4;
978 #endif
979 #ifdef INET6
980 	struct prison_ip *ip6;
981 #endif
982 	struct vfsopt *opt;
983 	struct vfsoptlist *opts;
984 	struct prison *pr, *deadpr, *dinspr, *inspr, *mypr, *ppr, *tpr;
985 	struct vnode *root;
986 	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
987 	char *g_path, *osrelstr;
988 	struct bool_flags *bf;
989 	struct jailsys_flags *jsf;
990 #if defined(INET) || defined(INET6)
991 	void *op;
992 #endif
993 	unsigned long hid;
994 	size_t namelen, onamelen, pnamelen;
995 	int created, cuflags, descend, drflags, enforce;
996 	int error, errmsg_len, errmsg_pos;
997 	int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
998 	int deadid, jid, jsys, len, level;
999 	int childmax, osreldt, rsnum, slevel;
1000 #ifdef INET
1001 	int ip4s;
1002 	bool redo_ip4;
1003 #endif
1004 #ifdef INET6
1005 	int ip6s;
1006 	bool redo_ip6;
1007 #endif
1008 	uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
1009 	uint64_t pr_allow_diff;
1010 	unsigned tallow;
1011 	char numbuf[12];
1012 
1013 	error = priv_check(td, PRIV_JAIL_SET);
1014 	if (!error && (flags & JAIL_ATTACH))
1015 		error = priv_check(td, PRIV_JAIL_ATTACH);
1016 	if (error)
1017 		return (error);
1018 	mypr = td->td_ucred->cr_prison;
1019 	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
1020 		return (EPERM);
1021 	if (flags & ~JAIL_SET_MASK)
1022 		return (EINVAL);
1023 
1024 	/*
1025 	 * Check all the parameters before committing to anything.  Not all
1026 	 * errors can be caught early, but we may as well try.  Also, this
1027 	 * takes care of some expensive stuff (path lookup) before getting
1028 	 * the allprison lock.
1029 	 *
1030 	 * XXX Jails are not filesystems, and jail parameters are not mount
1031 	 *     options.  But it makes more sense to re-use the vfsopt code
1032 	 *     than duplicate it under a different name.
1033 	 */
1034 	error = vfs_buildopts(optuio, &opts);
1035 	if (error)
1036 		return (error);
1037 #ifdef INET
1038 	ip4 = NULL;
1039 #endif
1040 #ifdef INET6
1041 	ip6 = NULL;
1042 #endif
1043 	g_path = NULL;
1044 
1045 	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
1046 	if (!cuflags) {
1047 		error = EINVAL;
1048 		vfs_opterror(opts, "no valid operation (create or update)");
1049 		goto done_errmsg;
1050 	}
1051 
1052 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
1053 	if (error == ENOENT)
1054 		jid = 0;
1055 	else if (error != 0)
1056 		goto done_free;
1057 
1058 	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
1059 	if (error == ENOENT)
1060 		gotslevel = 0;
1061 	else if (error != 0)
1062 		goto done_free;
1063 	else
1064 		gotslevel = 1;
1065 
1066 	error =
1067 	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
1068 	if (error == ENOENT)
1069 		gotchildmax = 0;
1070 	else if (error != 0)
1071 		goto done_free;
1072 	else
1073 		gotchildmax = 1;
1074 
1075 	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
1076 	if (error == ENOENT)
1077 		gotenforce = 0;
1078 	else if (error != 0)
1079 		goto done_free;
1080 	else if (enforce < 0 || enforce > 2) {
1081 		error = EINVAL;
1082 		goto done_free;
1083 	} else
1084 		gotenforce = 1;
1085 
1086 	error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
1087 	if (error == ENOENT)
1088 		gotrsnum = 0;
1089 	else if (error != 0)
1090 		goto done_free;
1091 	else
1092 		gotrsnum = 1;
1093 
1094 	pr_flags = ch_flags = 0;
1095 	for (bf = pr_flag_bool;
1096 	     bf < pr_flag_bool + nitems(pr_flag_bool);
1097 	     bf++) {
1098 		vfs_flagopt(opts, bf->name, &pr_flags, bf->flag);
1099 		vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag);
1100 	}
1101 	ch_flags |= pr_flags;
1102 	for (jsf = pr_flag_jailsys;
1103 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
1104 	     jsf++) {
1105 		error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys));
1106 		if (error == ENOENT)
1107 			continue;
1108 		if (error != 0)
1109 			goto done_free;
1110 		switch (jsys) {
1111 		case JAIL_SYS_DISABLE:
1112 			if (!jsf->disable) {
1113 				error = EINVAL;
1114 				goto done_free;
1115 			}
1116 			pr_flags |= jsf->disable;
1117 			break;
1118 		case JAIL_SYS_NEW:
1119 			pr_flags |= jsf->new;
1120 			break;
1121 		case JAIL_SYS_INHERIT:
1122 			break;
1123 		default:
1124 			error = EINVAL;
1125 			goto done_free;
1126 		}
1127 		ch_flags |= jsf->new | jsf->disable;
1128 	}
1129 	if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE
1130 	    && !(pr_flags & PR_PERSIST)) {
1131 		error = EINVAL;
1132 		vfs_opterror(opts, "new jail must persist or attach");
1133 		goto done_errmsg;
1134 	}
1135 #ifdef VIMAGE
1136 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
1137 		error = EINVAL;
1138 		vfs_opterror(opts, "vnet cannot be changed after creation");
1139 		goto done_errmsg;
1140 	}
1141 #endif
1142 #ifdef INET
1143 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
1144 		error = EINVAL;
1145 		vfs_opterror(opts, "ip4 cannot be changed after creation");
1146 		goto done_errmsg;
1147 	}
1148 #endif
1149 #ifdef INET6
1150 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
1151 		error = EINVAL;
1152 		vfs_opterror(opts, "ip6 cannot be changed after creation");
1153 		goto done_errmsg;
1154 	}
1155 #endif
1156 
1157 	pr_allow = ch_allow = 0;
1158 	for (bf = pr_flag_allow;
1159 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
1160 		atomic_load_int(&bf->flag) != 0;
1161 	     bf++) {
1162 		vfs_flagopt(opts, bf->name, &pr_allow, bf->flag);
1163 		vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag);
1164 	}
1165 	ch_allow |= pr_allow;
1166 
1167 	error = vfs_getopt(opts, "name", (void **)&name, &len);
1168 	if (error == ENOENT)
1169 		name = NULL;
1170 	else if (error != 0)
1171 		goto done_free;
1172 	else {
1173 		if (len == 0 || name[len - 1] != '\0') {
1174 			error = EINVAL;
1175 			goto done_free;
1176 		}
1177 		if (len > MAXHOSTNAMELEN) {
1178 			error = ENAMETOOLONG;
1179 			goto done_free;
1180 		}
1181 	}
1182 
1183 	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
1184 	if (error == ENOENT)
1185 		host = NULL;
1186 	else if (error != 0)
1187 		goto done_free;
1188 	else {
1189 		ch_flags |= PR_HOST;
1190 		pr_flags |= PR_HOST;
1191 		if (len == 0 || host[len - 1] != '\0') {
1192 			error = EINVAL;
1193 			goto done_free;
1194 		}
1195 		if (len > MAXHOSTNAMELEN) {
1196 			error = ENAMETOOLONG;
1197 			goto done_free;
1198 		}
1199 	}
1200 
1201 	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
1202 	if (error == ENOENT)
1203 		domain = NULL;
1204 	else if (error != 0)
1205 		goto done_free;
1206 	else {
1207 		ch_flags |= PR_HOST;
1208 		pr_flags |= PR_HOST;
1209 		if (len == 0 || domain[len - 1] != '\0') {
1210 			error = EINVAL;
1211 			goto done_free;
1212 		}
1213 		if (len > MAXHOSTNAMELEN) {
1214 			error = ENAMETOOLONG;
1215 			goto done_free;
1216 		}
1217 	}
1218 
1219 	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
1220 	if (error == ENOENT)
1221 		uuid = NULL;
1222 	else if (error != 0)
1223 		goto done_free;
1224 	else {
1225 		ch_flags |= PR_HOST;
1226 		pr_flags |= PR_HOST;
1227 		if (len == 0 || uuid[len - 1] != '\0') {
1228 			error = EINVAL;
1229 			goto done_free;
1230 		}
1231 		if (len > HOSTUUIDLEN) {
1232 			error = ENAMETOOLONG;
1233 			goto done_free;
1234 		}
1235 	}
1236 
1237 #ifdef COMPAT_FREEBSD32
1238 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
1239 		uint32_t hid32;
1240 
1241 		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
1242 		hid = hid32;
1243 	} else
1244 #endif
1245 		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
1246 	if (error == ENOENT)
1247 		gothid = 0;
1248 	else if (error != 0)
1249 		goto done_free;
1250 	else {
1251 		gothid = 1;
1252 		ch_flags |= PR_HOST;
1253 		pr_flags |= PR_HOST;
1254 	}
1255 
1256 #ifdef INET
1257 	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
1258 	if (error == ENOENT)
1259 		ip4s = 0;
1260 	else if (error != 0)
1261 		goto done_free;
1262 	else if (ip4s & (sizeof(struct in_addr) - 1)) {
1263 		error = EINVAL;
1264 		goto done_free;
1265 	} else {
1266 		ch_flags |= PR_IP4_USER;
1267 		pr_flags |= PR_IP4_USER;
1268 		if (ip4s > 0) {
1269 			ip4s /= sizeof(struct in_addr);
1270 			if (ip4s > jail_max_af_ips) {
1271 				error = EINVAL;
1272 				vfs_opterror(opts, "too many IPv4 addresses");
1273 				goto done_errmsg;
1274 			}
1275 			ip4 = prison_ip_copyin(PR_INET, op, ip4s);
1276 			if (ip4 == NULL) {
1277 				error = EINVAL;
1278 				goto done_free;
1279 			}
1280 		}
1281 	}
1282 #endif
1283 
1284 #ifdef INET6
1285 	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
1286 	if (error == ENOENT)
1287 		ip6s = 0;
1288 	else if (error != 0)
1289 		goto done_free;
1290 	else if (ip6s & (sizeof(struct in6_addr) - 1)) {
1291 		error = EINVAL;
1292 		goto done_free;
1293 	} else {
1294 		ch_flags |= PR_IP6_USER;
1295 		pr_flags |= PR_IP6_USER;
1296 		if (ip6s > 0) {
1297 			ip6s /= sizeof(struct in6_addr);
1298 			if (ip6s > jail_max_af_ips) {
1299 				error = EINVAL;
1300 				vfs_opterror(opts, "too many IPv6 addresses");
1301 				goto done_errmsg;
1302 			}
1303 			ip6 = prison_ip_copyin(PR_INET6, op, ip6s);
1304 			if (ip6 == NULL) {
1305 				error = EINVAL;
1306 				goto done_free;
1307 			}
1308 		}
1309 	}
1310 #endif
1311 
1312 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
1313 	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1314 		error = EINVAL;
1315 		vfs_opterror(opts,
1316 		    "vnet jails cannot have IP address restrictions");
1317 		goto done_errmsg;
1318 	}
1319 #endif
1320 
1321 	error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
1322 	if (error == ENOENT)
1323 		osrelstr = NULL;
1324 	else if (error != 0)
1325 		goto done_free;
1326 	else {
1327 		if (flags & JAIL_UPDATE) {
1328 			error = EINVAL;
1329 			vfs_opterror(opts,
1330 			    "osrelease cannot be changed after creation");
1331 			goto done_errmsg;
1332 		}
1333 		if (len == 0 || osrelstr[len - 1] != '\0') {
1334 			error = EINVAL;
1335 			goto done_free;
1336 		}
1337 		if (len >= OSRELEASELEN) {
1338 			error = ENAMETOOLONG;
1339 			vfs_opterror(opts,
1340 			    "osrelease string must be 1-%d bytes long",
1341 			    OSRELEASELEN - 1);
1342 			goto done_errmsg;
1343 		}
1344 	}
1345 
1346 	error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
1347 	if (error == ENOENT)
1348 		osreldt = 0;
1349 	else if (error != 0)
1350 		goto done_free;
1351 	else {
1352 		if (flags & JAIL_UPDATE) {
1353 			error = EINVAL;
1354 			vfs_opterror(opts,
1355 			    "osreldate cannot be changed after creation");
1356 			goto done_errmsg;
1357 		}
1358 		if (osreldt == 0) {
1359 			error = EINVAL;
1360 			vfs_opterror(opts, "osreldate cannot be 0");
1361 			goto done_errmsg;
1362 		}
1363 	}
1364 
1365 	root = NULL;
1366 	error = vfs_getopt(opts, "path", (void **)&path, &len);
1367 	if (error == ENOENT)
1368 		path = NULL;
1369 	else if (error != 0)
1370 		goto done_free;
1371 	else {
1372 		if (flags & JAIL_UPDATE) {
1373 			error = EINVAL;
1374 			vfs_opterror(opts,
1375 			    "path cannot be changed after creation");
1376 			goto done_errmsg;
1377 		}
1378 		if (len == 0 || path[len - 1] != '\0') {
1379 			error = EINVAL;
1380 			goto done_free;
1381 		}
1382 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path);
1383 		error = namei(&nd);
1384 		if (error)
1385 			goto done_free;
1386 		root = nd.ni_vp;
1387 		NDFREE_PNBUF(&nd);
1388 		g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1389 		strlcpy(g_path, path, MAXPATHLEN);
1390 		error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
1391 		if (error == 0) {
1392 			path = g_path;
1393 		} else {
1394 			/* exit on any error */
1395 			goto done_free;
1396 		}
1397 		if (root->v_type != VDIR) {
1398 			error = ENOTDIR;
1399 			vput(root);
1400 			goto done_free;
1401 		}
1402 		VOP_UNLOCK(root);
1403 	}
1404 
1405 	/*
1406 	 * Find the specified jail, or at least its parent.
1407 	 * This abuses the file error codes ENOENT and EEXIST.
1408 	 */
1409 	pr = NULL;
1410 	inspr = NULL;
1411 	deadpr = NULL;
1412 	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
1413 		namelc = strrchr(name, '.');
1414 		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
1415 		if (*p != '\0')
1416 			jid = 0;
1417 	}
1418 	sx_xlock(&allprison_lock);
1419 	drflags = PD_LIST_XLOCKED;
1420 	ppr = mypr;
1421 	if (!prison_isalive(ppr)) {
1422 		/* This jail is dying.  This process will surely follow. */
1423 		error = EAGAIN;
1424 		goto done_deref;
1425 	}
1426 	if (jid != 0) {
1427 		if (jid < 0) {
1428 			error = EINVAL;
1429 			vfs_opterror(opts, "negative jid");
1430 			goto done_deref;
1431 		}
1432 		/*
1433 		 * See if a requested jid already exists.  Keep track of
1434 		 * where it can be inserted later.
1435 		 */
1436 		TAILQ_FOREACH(inspr, &allprison, pr_list) {
1437 			if (inspr->pr_id < jid)
1438 				continue;
1439 			if (inspr->pr_id > jid)
1440 				break;
1441 			if (prison_isalive(inspr)) {
1442 				pr = inspr;
1443 				mtx_lock(&pr->pr_mtx);
1444 				drflags |= PD_LOCKED;
1445 			} else {
1446 				/* Note a dying jail to handle later. */
1447 				deadpr = inspr;
1448 			}
1449 			inspr = NULL;
1450 			break;
1451 		}
1452 		if (cuflags == JAIL_CREATE && pr != NULL) {
1453 			/*
1454 			 * Even creators that cannot see the jail will
1455 			 * get EEXIST.
1456 			 */
1457 			error = EEXIST;
1458 			vfs_opterror(opts, "jail %d already exists", jid);
1459 			goto done_deref;
1460 		}
1461 		if ((pr == NULL)
1462 		    ? cuflags == JAIL_UPDATE
1463 		    : !prison_ischild(mypr, pr)) {
1464 			/*
1465 			 * Updaters get ENOENT for nonexistent jails,
1466 			 * or for jails they cannot see.  The latter
1467 			 * case is true even for CREATE | UPDATE,
1468 			 * which normally cannot give this error.
1469 			 */
1470 			error = ENOENT;
1471 			vfs_opterror(opts, "jail %d not found", jid);
1472 			goto done_deref;
1473 		}
1474 	}
1475 	/*
1476 	 * If the caller provided a name, look for a jail by that name.
1477 	 * This has different semantics for creates and updates keyed by jid
1478 	 * (where the name must not already exist in a different jail),
1479 	 * and updates keyed by the name itself (where the name must exist
1480 	 * because that is the jail being updated).
1481 	 */
1482 	namelc = NULL;
1483 	if (name != NULL) {
1484 		namelc = strrchr(name, '.');
1485 		if (namelc == NULL)
1486 			namelc = name;
1487 		else {
1488 			/*
1489 			 * This is a hierarchical name.  Split it into the
1490 			 * parent and child names, and make sure the parent
1491 			 * exists or matches an already found jail.
1492 			 */
1493 			if (pr != NULL) {
1494 				if (strncmp(name, ppr->pr_name, namelc - name)
1495 				    || ppr->pr_name[namelc - name] != '\0') {
1496 					error = EINVAL;
1497 					vfs_opterror(opts,
1498 					    "cannot change jail's parent");
1499 					goto done_deref;
1500 				}
1501 			} else {
1502 				*namelc = '\0';
1503 				ppr = prison_find_name(mypr, name);
1504 				if (ppr == NULL) {
1505 					error = ENOENT;
1506 					vfs_opterror(opts,
1507 					    "jail \"%s\" not found", name);
1508 					goto done_deref;
1509 				}
1510 				mtx_unlock(&ppr->pr_mtx);
1511 				if (!prison_isalive(ppr)) {
1512 					error = ENOENT;
1513 					vfs_opterror(opts,
1514 					    "jail \"%s\" is dying", name);
1515 					goto done_deref;
1516 				}
1517 				*namelc = '.';
1518 			}
1519 			namelc++;
1520 		}
1521 		if (namelc[0] != '\0') {
1522 			pnamelen =
1523 			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1524 			FOREACH_PRISON_CHILD(ppr, tpr) {
1525 				if (tpr == pr || !prison_isalive(tpr) ||
1526 				    strcmp(tpr->pr_name + pnamelen, namelc))
1527 					continue;
1528 				if (cuflags == JAIL_CREATE || pr != NULL) {
1529 					/*
1530 					 * Create, or update(jid): name must
1531 					 * not exist in an active sibling jail.
1532 					 */
1533 					error = EEXIST;
1534 					vfs_opterror(opts,
1535 					    "jail \"%s\" already exists", name);
1536 					goto done_deref;
1537 				}
1538 				/* Use this jail for updates. */
1539 				pr = tpr;
1540 				mtx_lock(&pr->pr_mtx);
1541 				drflags |= PD_LOCKED;
1542 				break;
1543 			}
1544 			/*
1545 			 * Update: name must exist if no jid is specified.
1546 			 * As with the jid case, the jail must be currently
1547 			 * visible, or else even CREATE | UPDATE will get
1548 			 * an error.
1549 			 */
1550 			if ((pr == NULL)
1551 			    ? cuflags == JAIL_UPDATE
1552 			    : !prison_isalive(pr)) {
1553 				error = ENOENT;
1554 				vfs_opterror(opts, "jail \"%s\" not found",
1555 				    name);
1556 				goto done_deref;
1557 			}
1558 		}
1559 	}
1560 	/* Update: must provide a jid or name. */
1561 	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1562 		error = ENOENT;
1563 		vfs_opterror(opts, "update specified no jail");
1564 		goto done_deref;
1565 	}
1566 
1567 	/* If there's no prison to update, create a new one and link it in. */
1568 	created = pr == NULL;
1569 	if (created) {
1570 		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1571 			if (tpr->pr_childcount >= tpr->pr_childmax) {
1572 				error = EPERM;
1573 				vfs_opterror(opts, "prison limit exceeded");
1574 				goto done_deref;
1575 			}
1576 
1577 		if (deadpr != NULL) {
1578 			/*
1579 			 * The prison being created has the same ID as a dying
1580 			 * one.  Handle this by giving the dying jail a new ID.
1581 			 * This may cause some confusion to user space, but
1582 			 * only to those listing dying jails.
1583 			 */
1584 			deadid = get_next_deadid(&dinspr);
1585 			if (deadid == 0) {
1586 				error = EAGAIN;
1587 				vfs_opterror(opts, "no available jail IDs");
1588 				goto done_deref;
1589 			}
1590 			mtx_lock(&deadpr->pr_mtx);
1591 			deadpr->pr_id = deadid;
1592 			mtx_unlock(&deadpr->pr_mtx);
1593 			if (dinspr == deadpr)
1594 				inspr = deadpr;
1595 			else {
1596 				inspr = TAILQ_NEXT(deadpr, pr_list);
1597 				TAILQ_REMOVE(&allprison, deadpr, pr_list);
1598 				if (dinspr != NULL)
1599 					TAILQ_INSERT_AFTER(&allprison, dinspr,
1600 					    deadpr, pr_list);
1601 				else
1602 					TAILQ_INSERT_HEAD(&allprison, deadpr,
1603 					    pr_list);
1604 			}
1605 		}
1606 		if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) {
1607 			error = EAGAIN;
1608 			vfs_opterror(opts, "no available jail IDs");
1609 			goto done_deref;
1610 		}
1611 
1612 		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1613 		pr->pr_state = PRISON_STATE_INVALID;
1614 		refcount_init(&pr->pr_ref, 1);
1615 		refcount_init(&pr->pr_uref, 0);
1616 		drflags |= PD_DEREF;
1617 		LIST_INIT(&pr->pr_children);
1618 		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1619 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
1620 
1621 		pr->pr_id = jid;
1622 		if (inspr != NULL)
1623 			TAILQ_INSERT_BEFORE(inspr, pr, pr_list);
1624 		else
1625 			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1626 
1627 		pr->pr_parent = ppr;
1628 		prison_hold(ppr);
1629 		prison_proc_hold(ppr);
1630 		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1631 		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1632 			tpr->pr_childcount++;
1633 
1634 		/* Set some default values, and inherit some from the parent. */
1635 		if (namelc == NULL)
1636 			namelc = "";
1637 		if (path == NULL) {
1638 			path = "/";
1639 			root = mypr->pr_root;
1640 			vref(root);
1641 		}
1642 		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1643 		pr->pr_flags |= PR_HOST;
1644 #if defined(INET) || defined(INET6)
1645 #ifdef VIMAGE
1646 		if (!(pr_flags & PR_VNET))
1647 #endif
1648 		{
1649 #ifdef INET
1650 			if (!(ch_flags & PR_IP4_USER))
1651 				pr->pr_flags |= PR_IP4 | PR_IP4_USER;
1652 			else if (!(pr_flags & PR_IP4_USER)) {
1653 				pr->pr_flags |= ppr->pr_flags & PR_IP4;
1654 				prison_ip_dup(ppr, pr, PR_INET);
1655 			}
1656 #endif
1657 #ifdef INET6
1658 			if (!(ch_flags & PR_IP6_USER))
1659 				pr->pr_flags |= PR_IP6 | PR_IP6_USER;
1660 			else if (!(pr_flags & PR_IP6_USER)) {
1661 				pr->pr_flags |= ppr->pr_flags & PR_IP6;
1662 				prison_ip_dup(ppr, pr, PR_INET6);
1663 			}
1664 #endif
1665 		}
1666 #endif
1667 		/* Source address selection is always on by default. */
1668 		pr->pr_flags |= _PR_IP_SADDRSEL;
1669 
1670 		pr->pr_securelevel = ppr->pr_securelevel;
1671 		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1672 		pr->pr_enforce_statfs = jail_default_enforce_statfs;
1673 		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
1674 
1675 		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
1676 		if (osrelstr == NULL)
1677 			strlcpy(pr->pr_osrelease, ppr->pr_osrelease,
1678 			    sizeof(pr->pr_osrelease));
1679 		else
1680 			strlcpy(pr->pr_osrelease, osrelstr,
1681 			    sizeof(pr->pr_osrelease));
1682 
1683 #ifdef VIMAGE
1684 		/* Allocate a new vnet if specified. */
1685 		pr->pr_vnet = (pr_flags & PR_VNET)
1686 		    ? vnet_alloc() : ppr->pr_vnet;
1687 #endif
1688 		/*
1689 		 * Allocate a dedicated cpuset for each jail.
1690 		 * Unlike other initial settings, this may return an error.
1691 		 */
1692 		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1693 		if (error)
1694 			goto done_deref;
1695 
1696 		mtx_lock(&pr->pr_mtx);
1697 		drflags |= PD_LOCKED;
1698 	} else {
1699 		/*
1700 		 * Grab a reference for existing prisons, to ensure they
1701 		 * continue to exist for the duration of the call.
1702 		 */
1703 		prison_hold(pr);
1704 		drflags |= PD_DEREF;
1705 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
1706 		if ((pr->pr_flags & PR_VNET) &&
1707 		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1708 			error = EINVAL;
1709 			vfs_opterror(opts,
1710 			    "vnet jails cannot have IP address restrictions");
1711 			goto done_deref;
1712 		}
1713 #endif
1714 #ifdef INET
1715 		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1716 			error = EINVAL;
1717 			vfs_opterror(opts,
1718 			    "ip4 cannot be changed after creation");
1719 			goto done_deref;
1720 		}
1721 #endif
1722 #ifdef INET6
1723 		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1724 			error = EINVAL;
1725 			vfs_opterror(opts,
1726 			    "ip6 cannot be changed after creation");
1727 			goto done_deref;
1728 		}
1729 #endif
1730 	}
1731 
1732 	/* Do final error checking before setting anything. */
1733 	if (gotslevel) {
1734 		if (slevel < ppr->pr_securelevel) {
1735 			error = EPERM;
1736 			goto done_deref;
1737 		}
1738 	}
1739 	if (gotchildmax) {
1740 		if (childmax >= ppr->pr_childmax) {
1741 			error = EPERM;
1742 			goto done_deref;
1743 		}
1744 	}
1745 	if (gotenforce) {
1746 		if (enforce < ppr->pr_enforce_statfs) {
1747 			error = EPERM;
1748 			goto done_deref;
1749 		}
1750 	}
1751 	if (gotrsnum) {
1752 		/*
1753 		 * devfs_rsnum is a uint16_t
1754 		 */
1755 		if (rsnum < 0 || rsnum > 65535) {
1756 			error = EINVAL;
1757 			goto done_deref;
1758 		}
1759 		/*
1760 		 * Nested jails always inherit parent's devfs ruleset
1761 		 */
1762 		if (jailed(td->td_ucred)) {
1763 			if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
1764 				error = EPERM;
1765 				goto done_deref;
1766 			} else
1767 				rsnum = ppr->pr_devfs_rsnum;
1768 		}
1769 	}
1770 #ifdef INET
1771 	if (ip4s > 0) {
1772 		if ((ppr->pr_flags & PR_IP4) &&
1773 		    !prison_ip_parent_match(ppr->pr_addrs[PR_INET], ip4,
1774 		    PR_INET)) {
1775 			error = EPERM;
1776 			goto done_deref;
1777 		}
1778 		if (!prison_ip_conflict_check(ppr, pr, ip4, PR_INET)) {
1779 			error = EADDRINUSE;
1780 			vfs_opterror(opts, "IPv4 addresses clash");
1781 			goto done_deref;
1782 		}
1783 	}
1784 #endif
1785 #ifdef INET6
1786 	if (ip6s > 0) {
1787 		if ((ppr->pr_flags & PR_IP6) &&
1788 		    !prison_ip_parent_match(ppr->pr_addrs[PR_INET6], ip6,
1789 		    PR_INET6)) {
1790 			error = EPERM;
1791 			goto done_deref;
1792 		}
1793 		if (!prison_ip_conflict_check(ppr, pr, ip6, PR_INET6)) {
1794 			error = EADDRINUSE;
1795 			vfs_opterror(opts, "IPv6 addresses clash");
1796 			goto done_deref;
1797 		}
1798 	}
1799 #endif
1800 	onamelen = namelen = 0;
1801 	if (namelc != NULL) {
1802 		/* Give a default name of the jid.  Also allow the name to be
1803 		 * explicitly the jid - but not any other number, and only in
1804 		 * normal form (no leading zero/etc).
1805 		 */
1806 		if (namelc[0] == '\0')
1807 			snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
1808 		else if ((strtoul(namelc, &p, 10) != jid ||
1809 			  namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
1810 			error = EINVAL;
1811 			vfs_opterror(opts,
1812 			    "name cannot be numeric (unless it is the jid)");
1813 			goto done_deref;
1814 		}
1815 		/*
1816 		 * Make sure the name isn't too long for the prison or its
1817 		 * children.
1818 		 */
1819 		pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1820 		onamelen = strlen(pr->pr_name + pnamelen);
1821 		namelen = strlen(namelc);
1822 		if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
1823 			error = ENAMETOOLONG;
1824 			goto done_deref;
1825 		}
1826 		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1827 			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1828 			    sizeof(pr->pr_name)) {
1829 				error = ENAMETOOLONG;
1830 				goto done_deref;
1831 			}
1832 		}
1833 	}
1834 	pr_allow_diff = pr_allow & ~ppr->pr_allow;
1835 	if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) {
1836 		error = EPERM;
1837 		goto done_deref;
1838 	}
1839 
1840 	/*
1841 	 * Let modules check their parameters.  This requires unlocking and
1842 	 * then re-locking the prison, but this is still a valid state as long
1843 	 * as allprison_lock remains xlocked.
1844 	 */
1845 	mtx_unlock(&pr->pr_mtx);
1846 	drflags &= ~PD_LOCKED;
1847 	error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
1848 	if (error != 0)
1849 		goto done_deref;
1850 	mtx_lock(&pr->pr_mtx);
1851 	drflags |= PD_LOCKED;
1852 
1853 	/* At this point, all valid parameters should have been noted. */
1854 	TAILQ_FOREACH(opt, opts, link) {
1855 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
1856 			error = EINVAL;
1857 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
1858 			goto done_deref;
1859 		}
1860 	}
1861 
1862 	/* Set the parameters of the prison. */
1863 #ifdef INET
1864 	redo_ip4 = false;
1865 	if (pr_flags & PR_IP4_USER) {
1866 		pr->pr_flags |= PR_IP4;
1867 		prison_ip_set(pr, PR_INET, ip4);
1868 		ip4 = NULL;
1869 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1870 #ifdef VIMAGE
1871 			if (tpr->pr_flags & PR_VNET) {
1872 				descend = 0;
1873 				continue;
1874 			}
1875 #endif
1876 			if (!prison_ip_restrict(tpr, PR_INET, NULL)) {
1877 				redo_ip4 = true;
1878 				descend = 0;
1879 			}
1880 		}
1881 	}
1882 #endif
1883 #ifdef INET6
1884 	redo_ip6 = false;
1885 	if (pr_flags & PR_IP6_USER) {
1886 		pr->pr_flags |= PR_IP6;
1887 		prison_ip_set(pr, PR_INET6, ip6);
1888 		ip6 = NULL;
1889 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1890 #ifdef VIMAGE
1891 			if (tpr->pr_flags & PR_VNET) {
1892 				descend = 0;
1893 				continue;
1894 			}
1895 #endif
1896 			if (!prison_ip_restrict(tpr, PR_INET6, NULL)) {
1897 				redo_ip6 = true;
1898 				descend = 0;
1899 			}
1900 		}
1901 	}
1902 #endif
1903 	if (gotslevel) {
1904 		pr->pr_securelevel = slevel;
1905 		/* Set all child jails to be at least this level. */
1906 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1907 			if (tpr->pr_securelevel < slevel)
1908 				tpr->pr_securelevel = slevel;
1909 	}
1910 	if (gotchildmax) {
1911 		pr->pr_childmax = childmax;
1912 		/* Set all child jails to under this limit. */
1913 		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1914 			if (tpr->pr_childmax > childmax - level)
1915 				tpr->pr_childmax = childmax > level
1916 				    ? childmax - level : 0;
1917 	}
1918 	if (gotenforce) {
1919 		pr->pr_enforce_statfs = enforce;
1920 		/* Pass this restriction on to the children. */
1921 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1922 			if (tpr->pr_enforce_statfs < enforce)
1923 				tpr->pr_enforce_statfs = enforce;
1924 	}
1925 	if (gotrsnum) {
1926 		pr->pr_devfs_rsnum = rsnum;
1927 		/* Pass this restriction on to the children. */
1928 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1929 			tpr->pr_devfs_rsnum = rsnum;
1930 	}
1931 	if (namelc != NULL) {
1932 		if (ppr == &prison0)
1933 			strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
1934 		else
1935 			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1936 			    ppr->pr_name, namelc);
1937 		/* Change this component of child names. */
1938 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1939 			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1940 			    strlen(tpr->pr_name + onamelen) + 1);
1941 			bcopy(pr->pr_name, tpr->pr_name, namelen);
1942 		}
1943 	}
1944 	if (path != NULL) {
1945 		/* Try to keep a real-rooted full pathname. */
1946 		strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1947 		pr->pr_root = root;
1948 		root = NULL;
1949 	}
1950 	if (PR_HOST & ch_flags & ~pr_flags) {
1951 		if (pr->pr_flags & PR_HOST) {
1952 			/*
1953 			 * Copy the parent's host info.  As with pr_ip4 above,
1954 			 * the lack of a lock on the parent is not a problem;
1955 			 * it is always set with allprison_lock at least
1956 			 * shared, and is held exclusively here.
1957 			 */
1958 			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1959 			    sizeof(pr->pr_hostname));
1960 			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1961 			    sizeof(pr->pr_domainname));
1962 			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1963 			    sizeof(pr->pr_hostuuid));
1964 			pr->pr_hostid = pr->pr_parent->pr_hostid;
1965 		}
1966 	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1967 		/* Set this prison, and any descendants without PR_HOST. */
1968 		if (host != NULL)
1969 			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1970 		if (domain != NULL)
1971 			strlcpy(pr->pr_domainname, domain,
1972 			    sizeof(pr->pr_domainname));
1973 		if (uuid != NULL)
1974 			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1975 		if (gothid)
1976 			pr->pr_hostid = hid;
1977 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1978 			if (tpr->pr_flags & PR_HOST)
1979 				descend = 0;
1980 			else {
1981 				if (host != NULL)
1982 					strlcpy(tpr->pr_hostname,
1983 					    pr->pr_hostname,
1984 					    sizeof(tpr->pr_hostname));
1985 				if (domain != NULL)
1986 					strlcpy(tpr->pr_domainname,
1987 					    pr->pr_domainname,
1988 					    sizeof(tpr->pr_domainname));
1989 				if (uuid != NULL)
1990 					strlcpy(tpr->pr_hostuuid,
1991 					    pr->pr_hostuuid,
1992 					    sizeof(tpr->pr_hostuuid));
1993 				if (gothid)
1994 					tpr->pr_hostid = hid;
1995 			}
1996 		}
1997 	}
1998 	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1999 	if ((tallow = ch_allow & ~pr_allow))
2000 		prison_set_allow_locked(pr, tallow, 0);
2001 	/*
2002 	 * Persistent prisons get an extra reference, and prisons losing their
2003 	 * persist flag lose that reference.
2004 	 */
2005 	if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) {
2006 		if (pr_flags & PR_PERSIST) {
2007 			prison_hold(pr);
2008 			/*
2009 			 * This may be a new prison's first user reference,
2010 			 * but wait to call it alive until after OSD calls
2011 			 * have had a chance to run (and perhaps to fail).
2012 			 */
2013 			refcount_acquire(&pr->pr_uref);
2014 		} else {
2015 			drflags |= PD_DEUREF;
2016 			prison_free_not_last(pr);
2017 		}
2018 	}
2019 	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
2020 	mtx_unlock(&pr->pr_mtx);
2021 	drflags &= ~PD_LOCKED;
2022 	/*
2023 	 * Any errors past this point will need to de-persist newly created
2024 	 * prisons, as well as call remove methods.
2025 	 */
2026 	if (created)
2027 		drflags |= PD_KILL;
2028 
2029 #ifdef RACCT
2030 	if (racct_enable && created)
2031 		prison_racct_attach(pr);
2032 #endif
2033 
2034 	/* Locks may have prevented a complete restriction of child IP
2035 	 * addresses.  If so, allocate some more memory and try again.
2036 	 */
2037 #ifdef INET
2038 	while (redo_ip4) {
2039 		ip4s = pr->pr_addrs[PR_INET]->ips;
2040 		MPASS(ip4 == NULL);
2041 		ip4 = prison_ip_alloc(PR_INET, ip4s, M_WAITOK);
2042 		mtx_lock(&pr->pr_mtx);
2043 		redo_ip4 = false;
2044 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2045 #ifdef VIMAGE
2046 			if (tpr->pr_flags & PR_VNET) {
2047 				descend = 0;
2048 				continue;
2049 			}
2050 #endif
2051 			if (!prison_ip_restrict(tpr, PR_INET, &ip4))
2052 				redo_ip4 = true;
2053 		}
2054 		mtx_unlock(&pr->pr_mtx);
2055 	}
2056 #endif
2057 #ifdef INET6
2058 	while (redo_ip6) {
2059 		ip6s = pr->pr_addrs[PR_INET6]->ips;
2060 		MPASS(ip6 == NULL);
2061 		ip6 = prison_ip_alloc(PR_INET6, ip6s, M_WAITOK);
2062 		mtx_lock(&pr->pr_mtx);
2063 		redo_ip6 = false;
2064 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2065 #ifdef VIMAGE
2066 			if (tpr->pr_flags & PR_VNET) {
2067 				descend = 0;
2068 				continue;
2069 			}
2070 #endif
2071 			if (!prison_ip_restrict(tpr, PR_INET6, &ip6))
2072 				redo_ip6 = true;
2073 		}
2074 		mtx_unlock(&pr->pr_mtx);
2075 	}
2076 #endif
2077 
2078 	/* Let the modules do their work. */
2079 	if (created) {
2080 		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
2081 		if (error)
2082 			goto done_deref;
2083 	}
2084 	error = osd_jail_call(pr, PR_METHOD_SET, opts);
2085 	if (error)
2086 		goto done_deref;
2087 
2088 	/*
2089 	 * A new prison is now ready to be seen; either it has gained a user
2090 	 * reference via persistence, or is about to gain one via attachment.
2091 	 */
2092 	if (created) {
2093 		drflags = prison_lock_xlock(pr, drflags);
2094 		pr->pr_state = PRISON_STATE_ALIVE;
2095 	}
2096 
2097 	/* Attach this process to the prison if requested. */
2098 	if (flags & JAIL_ATTACH) {
2099 		error = do_jail_attach(td, pr,
2100 		    prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS));
2101 		drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED);
2102 		if (error) {
2103 			vfs_opterror(opts, "attach failed");
2104 			goto done_deref;
2105 		}
2106 	}
2107 
2108 #ifdef RACCT
2109 	if (racct_enable && !created) {
2110 		if (drflags & PD_LOCKED) {
2111 			mtx_unlock(&pr->pr_mtx);
2112 			drflags &= ~PD_LOCKED;
2113 		}
2114 		if (drflags & PD_LIST_XLOCKED) {
2115 			sx_xunlock(&allprison_lock);
2116 			drflags &= ~PD_LIST_XLOCKED;
2117 		}
2118 		prison_racct_modify(pr);
2119 	}
2120 #endif
2121 
2122 	if (created && pr != &prison0 && (pr->pr_allow & PR_ALLOW_NFSD) != 0 &&
2123 	    (pr->pr_root->v_vflag & VV_ROOT) == 0)
2124 		printf("Warning jail jid=%d: mountd/nfsd requires a separate"
2125 		   " file system\n", pr->pr_id);
2126 
2127 	drflags &= ~PD_KILL;
2128 	td->td_retval[0] = pr->pr_id;
2129 
2130  done_deref:
2131 	/* Release any temporary prison holds and/or locks. */
2132 	if (pr != NULL)
2133 		prison_deref(pr, drflags);
2134 	else if (drflags & PD_LIST_SLOCKED)
2135 		sx_sunlock(&allprison_lock);
2136 	else if (drflags & PD_LIST_XLOCKED)
2137 		sx_xunlock(&allprison_lock);
2138 	if (root != NULL)
2139 		vrele(root);
2140  done_errmsg:
2141 	if (error) {
2142 		/* Write the error message back to userspace. */
2143 		if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
2144 		    &errmsg_len) == 0 && errmsg_len > 0) {
2145 			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
2146 			if (optuio->uio_segflg == UIO_SYSSPACE)
2147 				bcopy(errmsg,
2148 				    optuio->uio_iov[errmsg_pos].iov_base,
2149 				    errmsg_len);
2150 			else
2151 				(void)copyout(errmsg,
2152 				    optuio->uio_iov[errmsg_pos].iov_base,
2153 				    errmsg_len);
2154 		}
2155 	}
2156  done_free:
2157 #ifdef INET
2158 	prison_ip_free(ip4);
2159 #endif
2160 #ifdef INET6
2161 	prison_ip_free(ip6);
2162 #endif
2163 	if (g_path != NULL)
2164 		free(g_path, M_TEMP);
2165 	vfs_freeopts(opts);
2166 	return (error);
2167 }
2168 
2169 /*
2170  * Find the next available prison ID.  Return the ID on success, or zero
2171  * on failure.  Also set a pointer to the allprison list entry the prison
2172  * should be inserted before.
2173  */
2174 static int
2175 get_next_prid(struct prison **insprp)
2176 {
2177 	struct prison *inspr;
2178 	int jid, maxid;
2179 
2180 	jid = lastprid % JAIL_MAX + 1;
2181 	if (TAILQ_EMPTY(&allprison) ||
2182 	    TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) {
2183 		/*
2184 		 * A common case is for all jails to be implicitly numbered,
2185 		 * which means they'll go on the end of the list, at least
2186 		 * for the first JAIL_MAX times.
2187 		 */
2188 		inspr = NULL;
2189 	} else {
2190 		/*
2191 		 * Take two passes through the allprison list: first starting
2192 		 * with the proposed jid, then ending with it.
2193 		 */
2194 		for (maxid = JAIL_MAX; maxid != 0; ) {
2195 			TAILQ_FOREACH(inspr, &allprison, pr_list) {
2196 				if (inspr->pr_id < jid)
2197 					continue;
2198 				if (inspr->pr_id > jid) {
2199 					/* Found an opening. */
2200 					maxid = 0;
2201 					break;
2202 				}
2203 				if (++jid > maxid) {
2204 					if (lastprid == maxid || lastprid == 0)
2205 					{
2206 						/*
2207 						 * The entire legal range
2208 						 * has been traversed
2209 						 */
2210 						return 0;
2211 					}
2212 					/* Try again from the start. */
2213 					jid = 1;
2214 					maxid = lastprid;
2215 					break;
2216 				}
2217 			}
2218 			if (inspr == NULL) {
2219 				/* Found room at the end of the list. */
2220 				break;
2221 			}
2222 		}
2223 	}
2224 	*insprp = inspr;
2225 	lastprid = jid;
2226 	return (jid);
2227 }
2228 
2229 /*
2230  * Find the next available ID for a renumbered dead prison.  This is the same
2231  * as get_next_prid, but counting backward from the end of the range.
2232  */
2233 static int
2234 get_next_deadid(struct prison **dinsprp)
2235 {
2236 	struct prison *dinspr;
2237 	int deadid, minid;
2238 
2239 	deadid = lastdeadid ? lastdeadid - 1 : JAIL_MAX;
2240 	/*
2241 	 * Take two reverse passes through the allprison list: first
2242 	 * starting with the proposed deadid, then ending with it.
2243 	 */
2244 	for (minid = 1; minid != 0; ) {
2245 		TAILQ_FOREACH_REVERSE(dinspr, &allprison, prisonlist, pr_list) {
2246 			if (dinspr->pr_id > deadid)
2247 				continue;
2248 			if (dinspr->pr_id < deadid) {
2249 				/* Found an opening. */
2250 				minid = 0;
2251 				break;
2252 			}
2253 			if (--deadid < minid) {
2254 				if (lastdeadid == minid || lastdeadid == 0)
2255 				{
2256 					/*
2257 					 * The entire legal range
2258 					 * has been traversed
2259 					 */
2260 					return 0;
2261 				}
2262 				/* Try again from the end. */
2263 				deadid = JAIL_MAX;
2264 				minid = lastdeadid;
2265 				break;
2266 			}
2267 		}
2268 		if (dinspr == NULL) {
2269 			/* Found room at the beginning of the list. */
2270 			break;
2271 		}
2272 	}
2273 	*dinsprp = dinspr;
2274 	lastdeadid = deadid;
2275 	return (deadid);
2276 }
2277 
2278 /*
2279  * struct jail_get_args {
2280  *	struct iovec *iovp;
2281  *	unsigned int iovcnt;
2282  *	int flags;
2283  * };
2284  */
2285 int
2286 sys_jail_get(struct thread *td, struct jail_get_args *uap)
2287 {
2288 	struct uio *auio;
2289 	int error;
2290 
2291 	/* Check that we have an even number of iovecs. */
2292 	if (uap->iovcnt & 1)
2293 		return (EINVAL);
2294 
2295 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
2296 	if (error)
2297 		return (error);
2298 	error = kern_jail_get(td, auio, uap->flags);
2299 	if (error == 0)
2300 		error = copyout(auio->uio_iov, uap->iovp,
2301 		    uap->iovcnt * sizeof(struct iovec));
2302 	freeuio(auio);
2303 	return (error);
2304 }
2305 
2306 int
2307 kern_jail_get(struct thread *td, struct uio *optuio, int flags)
2308 {
2309 	struct bool_flags *bf;
2310 	struct jailsys_flags *jsf;
2311 	struct prison *pr, *mypr;
2312 	struct vfsopt *opt;
2313 	struct vfsoptlist *opts;
2314 	char *errmsg, *name;
2315 	int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos;
2316 	unsigned f;
2317 
2318 	if (flags & ~JAIL_GET_MASK)
2319 		return (EINVAL);
2320 
2321 	/* Get the parameter list. */
2322 	error = vfs_buildopts(optuio, &opts);
2323 	if (error)
2324 		return (error);
2325 	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
2326 	mypr = td->td_ucred->cr_prison;
2327 	pr = NULL;
2328 
2329 	/*
2330 	 * Find the prison specified by one of: lastjid, jid, name.
2331 	 */
2332 	sx_slock(&allprison_lock);
2333 	drflags = PD_LIST_SLOCKED;
2334 	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
2335 	if (error == 0) {
2336 		TAILQ_FOREACH(pr, &allprison, pr_list) {
2337 			if (pr->pr_id > jid &&
2338 			    ((flags & JAIL_DYING) || prison_isalive(pr)) &&
2339 			    prison_ischild(mypr, pr)) {
2340 				mtx_lock(&pr->pr_mtx);
2341 				drflags |= PD_LOCKED;
2342 				goto found_prison;
2343 			}
2344 		}
2345 		error = ENOENT;
2346 		vfs_opterror(opts, "no jail after %d", jid);
2347 		goto done;
2348 	} else if (error != ENOENT)
2349 		goto done;
2350 
2351 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
2352 	if (error == 0) {
2353 		if (jid != 0) {
2354 			pr = prison_find_child(mypr, jid);
2355 			if (pr != NULL) {
2356 				drflags |= PD_LOCKED;
2357 				if (!(prison_isalive(pr) ||
2358 				    (flags & JAIL_DYING))) {
2359 					error = ENOENT;
2360 					vfs_opterror(opts, "jail %d is dying",
2361 					    jid);
2362 					goto done;
2363 				}
2364 				goto found_prison;
2365 			}
2366 			error = ENOENT;
2367 			vfs_opterror(opts, "jail %d not found", jid);
2368 			goto done;
2369 		}
2370 	} else if (error != ENOENT)
2371 		goto done;
2372 
2373 	error = vfs_getopt(opts, "name", (void **)&name, &len);
2374 	if (error == 0) {
2375 		if (len == 0 || name[len - 1] != '\0') {
2376 			error = EINVAL;
2377 			goto done;
2378 		}
2379 		pr = prison_find_name(mypr, name);
2380 		if (pr != NULL) {
2381 			drflags |= PD_LOCKED;
2382 			if (!(prison_isalive(pr) || (flags & JAIL_DYING))) {
2383 				error = ENOENT;
2384 				vfs_opterror(opts, "jail \"%s\" is dying",
2385 				    name);
2386 				goto done;
2387 			}
2388 			goto found_prison;
2389 		}
2390 		error = ENOENT;
2391 		vfs_opterror(opts, "jail \"%s\" not found", name);
2392 		goto done;
2393 	} else if (error != ENOENT)
2394 		goto done;
2395 
2396 	vfs_opterror(opts, "no jail specified");
2397 	error = ENOENT;
2398 	goto done;
2399 
2400  found_prison:
2401 	/* Get the parameters of the prison. */
2402 	prison_hold(pr);
2403 	drflags |= PD_DEREF;
2404 	td->td_retval[0] = pr->pr_id;
2405 	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
2406 	if (error != 0 && error != ENOENT)
2407 		goto done;
2408 	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
2409 	error = vfs_setopt(opts, "parent", &i, sizeof(i));
2410 	if (error != 0 && error != ENOENT)
2411 		goto done;
2412 	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
2413 	if (error != 0 && error != ENOENT)
2414 		goto done;
2415 	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
2416 	    sizeof(pr->pr_cpuset->cs_id));
2417 	if (error != 0 && error != ENOENT)
2418 		goto done;
2419 	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
2420 	if (error != 0 && error != ENOENT)
2421 		goto done;
2422 #ifdef INET
2423 	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_addrs[PR_INET]->pr_ip,
2424 	    pr->pr_addrs[PR_INET] ? pr->pr_addrs[PR_INET]->ips *
2425 	    pr_families[PR_INET].size : 0 );
2426 	if (error != 0 && error != ENOENT)
2427 		goto done;
2428 #endif
2429 #ifdef INET6
2430 	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_addrs[PR_INET6]->pr_ip,
2431 	    pr->pr_addrs[PR_INET6] ? pr->pr_addrs[PR_INET6]->ips *
2432 	    pr_families[PR_INET6].size : 0 );
2433 	if (error != 0 && error != ENOENT)
2434 		goto done;
2435 #endif
2436 	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
2437 	    sizeof(pr->pr_securelevel));
2438 	if (error != 0 && error != ENOENT)
2439 		goto done;
2440 	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
2441 	    sizeof(pr->pr_childcount));
2442 	if (error != 0 && error != ENOENT)
2443 		goto done;
2444 	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
2445 	    sizeof(pr->pr_childmax));
2446 	if (error != 0 && error != ENOENT)
2447 		goto done;
2448 	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
2449 	if (error != 0 && error != ENOENT)
2450 		goto done;
2451 	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
2452 	if (error != 0 && error != ENOENT)
2453 		goto done;
2454 	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
2455 	if (error != 0 && error != ENOENT)
2456 		goto done;
2457 #ifdef COMPAT_FREEBSD32
2458 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
2459 		uint32_t hid32 = pr->pr_hostid;
2460 
2461 		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
2462 	} else
2463 #endif
2464 	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
2465 	    sizeof(pr->pr_hostid));
2466 	if (error != 0 && error != ENOENT)
2467 		goto done;
2468 	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
2469 	    sizeof(pr->pr_enforce_statfs));
2470 	if (error != 0 && error != ENOENT)
2471 		goto done;
2472 	error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
2473 	    sizeof(pr->pr_devfs_rsnum));
2474 	if (error != 0 && error != ENOENT)
2475 		goto done;
2476 	for (bf = pr_flag_bool;
2477 	     bf < pr_flag_bool + nitems(pr_flag_bool);
2478 	     bf++) {
2479 		i = (pr->pr_flags & bf->flag) ? 1 : 0;
2480 		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2481 		if (error != 0 && error != ENOENT)
2482 			goto done;
2483 		i = !i;
2484 		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2485 		if (error != 0 && error != ENOENT)
2486 			goto done;
2487 	}
2488 	for (jsf = pr_flag_jailsys;
2489 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
2490 	     jsf++) {
2491 		f = pr->pr_flags & (jsf->disable | jsf->new);
2492 		i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE
2493 		    : (f == jsf->new) ? JAIL_SYS_NEW
2494 		    : JAIL_SYS_INHERIT;
2495 		error = vfs_setopt(opts, jsf->name, &i, sizeof(i));
2496 		if (error != 0 && error != ENOENT)
2497 			goto done;
2498 	}
2499 	for (bf = pr_flag_allow;
2500 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
2501 		atomic_load_int(&bf->flag) != 0;
2502 	     bf++) {
2503 		i = (pr->pr_allow & bf->flag) ? 1 : 0;
2504 		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2505 		if (error != 0 && error != ENOENT)
2506 			goto done;
2507 		i = !i;
2508 		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2509 		if (error != 0 && error != ENOENT)
2510 			goto done;
2511 	}
2512 	i = !prison_isalive(pr);
2513 	error = vfs_setopt(opts, "dying", &i, sizeof(i));
2514 	if (error != 0 && error != ENOENT)
2515 		goto done;
2516 	i = !i;
2517 	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2518 	if (error != 0 && error != ENOENT)
2519 		goto done;
2520 	error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
2521 	    sizeof(pr->pr_osreldate));
2522 	if (error != 0 && error != ENOENT)
2523 		goto done;
2524 	error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
2525 	if (error != 0 && error != ENOENT)
2526 		goto done;
2527 
2528 	/* Get the module parameters. */
2529 	mtx_unlock(&pr->pr_mtx);
2530 	drflags &= ~PD_LOCKED;
2531 	error = osd_jail_call(pr, PR_METHOD_GET, opts);
2532 	if (error)
2533 		goto done;
2534 	prison_deref(pr, drflags);
2535 	pr = NULL;
2536 	drflags = 0;
2537 
2538 	/* By now, all parameters should have been noted. */
2539 	TAILQ_FOREACH(opt, opts, link) {
2540 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
2541 			error = EINVAL;
2542 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
2543 			goto done;
2544 		}
2545 	}
2546 
2547 	/* Write the fetched parameters back to userspace. */
2548 	error = 0;
2549 	TAILQ_FOREACH(opt, opts, link) {
2550 		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2551 			pos = 2 * opt->pos + 1;
2552 			optuio->uio_iov[pos].iov_len = opt->len;
2553 			if (opt->value != NULL) {
2554 				if (optuio->uio_segflg == UIO_SYSSPACE) {
2555 					bcopy(opt->value,
2556 					    optuio->uio_iov[pos].iov_base,
2557 					    opt->len);
2558 				} else {
2559 					error = copyout(opt->value,
2560 					    optuio->uio_iov[pos].iov_base,
2561 					    opt->len);
2562 					if (error)
2563 						break;
2564 				}
2565 			}
2566 		}
2567 	}
2568 
2569  done:
2570 	/* Release any temporary prison holds and/or locks. */
2571 	if (pr != NULL)
2572 		prison_deref(pr, drflags);
2573 	else if (drflags & PD_LIST_SLOCKED)
2574 		sx_sunlock(&allprison_lock);
2575 	if (error && errmsg_pos >= 0) {
2576 		/* Write the error message back to userspace. */
2577 		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2578 		errmsg_pos = 2 * errmsg_pos + 1;
2579 		if (errmsg_len > 0) {
2580 			if (optuio->uio_segflg == UIO_SYSSPACE)
2581 				bcopy(errmsg,
2582 				    optuio->uio_iov[errmsg_pos].iov_base,
2583 				    errmsg_len);
2584 			else
2585 				(void)copyout(errmsg,
2586 				    optuio->uio_iov[errmsg_pos].iov_base,
2587 				    errmsg_len);
2588 		}
2589 	}
2590 	vfs_freeopts(opts);
2591 	return (error);
2592 }
2593 
2594 /*
2595  * struct jail_remove_args {
2596  *	int jid;
2597  * };
2598  */
2599 int
2600 sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
2601 {
2602 	struct prison *pr;
2603 	int error;
2604 
2605 	error = priv_check(td, PRIV_JAIL_REMOVE);
2606 	if (error)
2607 		return (error);
2608 
2609 	sx_xlock(&allprison_lock);
2610 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2611 	if (pr == NULL) {
2612 		sx_xunlock(&allprison_lock);
2613 		return (EINVAL);
2614 	}
2615 	if (!prison_isalive(pr)) {
2616 		/* Silently ignore already-dying prisons. */
2617 		mtx_unlock(&pr->pr_mtx);
2618 		sx_xunlock(&allprison_lock);
2619 		return (0);
2620 	}
2621 	prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED);
2622 	return (0);
2623 }
2624 
2625 /*
2626  * struct jail_attach_args {
2627  *	int jid;
2628  * };
2629  */
2630 int
2631 sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
2632 {
2633 	struct prison *pr;
2634 	int error;
2635 
2636 	error = priv_check(td, PRIV_JAIL_ATTACH);
2637 	if (error)
2638 		return (error);
2639 
2640 	sx_slock(&allprison_lock);
2641 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2642 	if (pr == NULL) {
2643 		sx_sunlock(&allprison_lock);
2644 		return (EINVAL);
2645 	}
2646 
2647 	/* Do not allow a process to attach to a prison that is not alive. */
2648 	if (!prison_isalive(pr)) {
2649 		mtx_unlock(&pr->pr_mtx);
2650 		sx_sunlock(&allprison_lock);
2651 		return (EINVAL);
2652 	}
2653 
2654 	return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED));
2655 }
2656 
2657 static int
2658 do_jail_attach(struct thread *td, struct prison *pr, int drflags)
2659 {
2660 	struct proc *p;
2661 	struct ucred *newcred, *oldcred;
2662 	int error;
2663 
2664 	mtx_assert(&pr->pr_mtx, MA_OWNED);
2665 	sx_assert(&allprison_lock, SX_LOCKED);
2666 	drflags &= PD_LOCK_FLAGS;
2667 	/*
2668 	 * XXX: Note that there is a slight race here if two threads
2669 	 * in the same privileged process attempt to attach to two
2670 	 * different jails at the same time.  It is important for
2671 	 * user processes not to do this, or they might end up with
2672 	 * a process root from one prison, but attached to the jail
2673 	 * of another.
2674 	 */
2675 	prison_hold(pr);
2676 	refcount_acquire(&pr->pr_uref);
2677 	drflags |= PD_DEREF | PD_DEUREF;
2678 	mtx_unlock(&pr->pr_mtx);
2679 	drflags &= ~PD_LOCKED;
2680 
2681 	/* Let modules do whatever they need to prepare for attaching. */
2682 	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
2683 	if (error) {
2684 		prison_deref(pr, drflags);
2685 		return (error);
2686 	}
2687 	sx_unlock(&allprison_lock);
2688 	drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED);
2689 
2690 	/*
2691 	 * Reparent the newly attached process to this jail.
2692 	 */
2693 	p = td->td_proc;
2694 	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
2695 	if (error)
2696 		goto e_revert_osd;
2697 
2698 	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
2699 	if ((error = change_dir(pr->pr_root, td)) != 0)
2700 		goto e_unlock;
2701 #ifdef MAC
2702 	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
2703 		goto e_unlock;
2704 #endif
2705 	VOP_UNLOCK(pr->pr_root);
2706 	if ((error = pwd_chroot_chdir(td, pr->pr_root)))
2707 		goto e_revert_osd;
2708 
2709 	newcred = crget();
2710 	PROC_LOCK(p);
2711 	oldcred = crcopysafe(p, newcred);
2712 	newcred->cr_prison = pr;
2713 	proc_set_cred(p, newcred);
2714 	setsugid(p);
2715 #ifdef RACCT
2716 	racct_proc_ucred_changed(p, oldcred, newcred);
2717 	crhold(newcred);
2718 #endif
2719 	PROC_UNLOCK(p);
2720 #ifdef RCTL
2721 	rctl_proc_ucred_changed(p, newcred);
2722 	crfree(newcred);
2723 #endif
2724 	prison_proc_relink(oldcred->cr_prison, pr, p);
2725 	prison_deref(oldcred->cr_prison, drflags);
2726 	crfree(oldcred);
2727 
2728 	/*
2729 	 * If the prison was killed while changing credentials, die along
2730 	 * with it.
2731 	 */
2732 	if (!prison_isalive(pr)) {
2733 		PROC_LOCK(p);
2734 		kern_psignal(p, SIGKILL);
2735 		PROC_UNLOCK(p);
2736 	}
2737 
2738 	return (0);
2739 
2740  e_unlock:
2741 	VOP_UNLOCK(pr->pr_root);
2742  e_revert_osd:
2743 	/* Tell modules this thread is still in its old jail after all. */
2744 	sx_slock(&allprison_lock);
2745 	drflags |= PD_LIST_SLOCKED;
2746 	(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
2747 	prison_deref(pr, drflags);
2748 	return (error);
2749 }
2750 
2751 /*
2752  * Returns a locked prison instance, or NULL on failure.
2753  */
2754 struct prison *
2755 prison_find(int prid)
2756 {
2757 	struct prison *pr;
2758 
2759 	sx_assert(&allprison_lock, SX_LOCKED);
2760 	TAILQ_FOREACH(pr, &allprison, pr_list) {
2761 		if (pr->pr_id < prid)
2762 			continue;
2763 		if (pr->pr_id > prid)
2764 			break;
2765 		KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr));
2766 		mtx_lock(&pr->pr_mtx);
2767 		return (pr);
2768 	}
2769 	return (NULL);
2770 }
2771 
2772 /*
2773  * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2774  */
2775 struct prison *
2776 prison_find_child(struct prison *mypr, int prid)
2777 {
2778 	struct prison *pr;
2779 	int descend;
2780 
2781 	sx_assert(&allprison_lock, SX_LOCKED);
2782 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2783 		if (pr->pr_id == prid) {
2784 			KASSERT(prison_isvalid(pr),
2785 			    ("Found invalid prison %p", pr));
2786 			mtx_lock(&pr->pr_mtx);
2787 			return (pr);
2788 		}
2789 	}
2790 	return (NULL);
2791 }
2792 
2793 /*
2794  * Look for the name relative to mypr.  Returns a locked prison or NULL.
2795  */
2796 struct prison *
2797 prison_find_name(struct prison *mypr, const char *name)
2798 {
2799 	struct prison *pr, *deadpr;
2800 	size_t mylen;
2801 	int descend;
2802 
2803 	sx_assert(&allprison_lock, SX_LOCKED);
2804 	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2805 	deadpr = NULL;
2806 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2807 		if (!strcmp(pr->pr_name + mylen, name)) {
2808 			KASSERT(prison_isvalid(pr),
2809 			    ("Found invalid prison %p", pr));
2810 			if (prison_isalive(pr)) {
2811 				mtx_lock(&pr->pr_mtx);
2812 				return (pr);
2813 			}
2814 			deadpr = pr;
2815 		}
2816 	}
2817 	/* There was no valid prison - perhaps there was a dying one. */
2818 	if (deadpr != NULL)
2819 		mtx_lock(&deadpr->pr_mtx);
2820 	return (deadpr);
2821 }
2822 
2823 /*
2824  * See if a prison has the specific flag set.  The prison should be locked,
2825  * unless checking for flags that are only set at jail creation (such as
2826  * PR_IP4 and PR_IP6), or only the single bit is examined, without regard
2827  * to any other prison data.
2828  */
2829 bool
2830 prison_flag(struct ucred *cred, unsigned flag)
2831 {
2832 
2833 	return ((cred->cr_prison->pr_flags & flag) != 0);
2834 }
2835 
2836 /*
2837  * See if a prison has the specific allow flag set.
2838  * The prison *should* be locked, or only a single bit is examined, without
2839  * regard to any other prison data.
2840  */
2841 bool
2842 prison_allow(struct ucred *cred, unsigned flag)
2843 {
2844 
2845 	return ((cred->cr_prison->pr_allow & flag) != 0);
2846 }
2847 
/*
 * Hold a prison reference, by incrementing pr_ref.  It is generally
 * an error to hold a prison that does not already have a reference.
 * A prison record will remain valid as long as it has at least one
 * reference, and will not be removed as long as either the prison
 * mutex or the allprison lock is held (allprison_lock may be shared).
 *
 * This variant is simply a forwarder to prison_hold(); it neither
 * checks nor needs any lock.
 */
void
prison_hold_locked(struct prison *pr)
{

	prison_hold(pr);
}
2862 
2863 void
2864 prison_hold(struct prison *pr)
2865 {
2866 #ifdef INVARIANTS
2867 	int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref);
2868 
2869 	KASSERT(was_valid,
2870 	    ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
2871 #else
2872 	refcount_acquire(&pr->pr_ref);
2873 #endif
2874 }
2875 
2876 /*
2877  * Remove a prison reference.  If that was the last reference, the
2878  * prison will be removed (at a later time).
2879  */
2880 void
2881 prison_free_locked(struct prison *pr)
2882 {
2883 
2884 	mtx_assert(&pr->pr_mtx, MA_OWNED);
2885 	/*
2886 	 * Locking is no longer required, but unlock because the caller
2887 	 * expects it.
2888 	 */
2889 	mtx_unlock(&pr->pr_mtx);
2890 	prison_free(pr);
2891 }
2892 
2893 void
2894 prison_free(struct prison *pr)
2895 {
2896 
2897 	KASSERT(refcount_load(&pr->pr_ref) > 0,
2898 	    ("Trying to free dead prison %p (jid=%d).",
2899 	     pr, pr->pr_id));
2900 	if (!refcount_release_if_not_last(&pr->pr_ref)) {
2901 		/*
2902 		 * Don't remove the last reference in this context,
2903 		 * in case there are locks held.
2904 		 */
2905 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2906 	}
2907 }
2908 
2909 static void
2910 prison_free_not_last(struct prison *pr)
2911 {
2912 #ifdef INVARIANTS
2913 	int lastref;
2914 
2915 	KASSERT(refcount_load(&pr->pr_ref) > 0,
2916 	    ("Trying to free dead prison %p (jid=%d).",
2917 	     pr, pr->pr_id));
2918 	lastref = refcount_release(&pr->pr_ref);
2919 	KASSERT(!lastref,
2920 	    ("prison_free_not_last freed last ref on prison %p (jid=%d).",
2921 	     pr, pr->pr_id));
2922 #else
2923 	refcount_release(&pr->pr_ref);
2924 #endif
2925 }
2926 
2927 /*
2928  * Hold a prison for user visibility, by incrementing pr_uref.
2929  * It is generally an error to hold a prison that isn't already
2930  * user-visible, except through the jail system calls.  It is also
2931  * an error to hold an invalid prison.  A prison record will remain
2932  * alive as long as it has at least one user reference, and will not
2933  * be set to the dying state until the prison mutex and allprison_lock
2934  * are both freed.
2935  */
2936 void
2937 prison_proc_hold(struct prison *pr)
2938 {
2939 #ifdef INVARIANTS
2940 	int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref);
2941 
2942 	KASSERT(was_alive,
2943 	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2944 #else
2945 	refcount_acquire(&pr->pr_uref);
2946 #endif
2947 }
2948 
2949 /*
2950  * Remove a prison user reference.  If it was the last reference, the
2951  * prison will be considered "dying", and may be removed once all of
2952  * its references are dropped.
2953  */
2954 void
2955 prison_proc_free(struct prison *pr)
2956 {
2957 
2958 	/*
2959 	 * Locking is only required when releasing the last reference.
2960 	 * This allows assurance that a locked prison will remain alive
2961 	 * until it is unlocked.
2962 	 */
2963 	KASSERT(refcount_load(&pr->pr_uref) > 0,
2964 	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2965 	if (!refcount_release_if_not_last(&pr->pr_uref)) {
2966 		/*
2967 		 * Don't remove the last user reference in this context,
2968 		 * which is expected to be a process that is not only locked,
2969 		 * but also half dead.  Add a reference so any calls to
2970 		 * prison_free() won't re-submit the task.
2971 		 */
2972 		prison_hold(pr);
2973 		mtx_lock(&pr->pr_mtx);
2974 		KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC),
2975 		    ("Redundant last reference in prison_proc_free (jid=%d)",
2976 		     pr->pr_id));
2977 		pr->pr_flags |= PR_COMPLETE_PROC;
2978 		mtx_unlock(&pr->pr_mtx);
2979 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2980 	}
2981 }
2982 
2983 static void
2984 prison_proc_free_not_last(struct prison *pr)
2985 {
2986 #ifdef INVARIANTS
2987 	int lastref;
2988 
2989 	KASSERT(refcount_load(&pr->pr_uref) > 0,
2990 	    ("Trying to free dead prison %p (jid=%d).",
2991 	     pr, pr->pr_id));
2992 	lastref = refcount_release(&pr->pr_uref);
2993 	KASSERT(!lastref,
2994 	    ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).",
2995 	     pr, pr->pr_id));
2996 #else
2997 	refcount_release(&pr->pr_uref);
2998 #endif
2999 }
3000 
3001 void
3002 prison_proc_link(struct prison *pr, struct proc *p)
3003 {
3004 
3005 	sx_assert(&allproc_lock, SA_XLOCKED);
3006 	LIST_INSERT_HEAD(&pr->pr_proclist, p, p_jaillist);
3007 }
3008 
3009 void
3010 prison_proc_unlink(struct prison *pr, struct proc *p)
3011 {
3012 
3013 	sx_assert(&allproc_lock, SA_XLOCKED);
3014 	LIST_REMOVE(p, p_jaillist);
3015 }
3016 
3017 static void
3018 prison_proc_relink(struct prison *opr, struct prison *npr, struct proc *p)
3019 {
3020 
3021 	sx_xlock(&allproc_lock);
3022 	prison_proc_unlink(opr, p);
3023 	prison_proc_link(npr, p);
3024 	sx_xunlock(&allproc_lock);
3025 }
3026 
3027 /*
3028  * Complete a call to either prison_free or prison_proc_free.
3029  */
3030 static void
3031 prison_complete(void *context, int pending)
3032 {
3033 	struct prison *pr = context;
3034 	int drflags;
3035 
3036 	/*
3037 	 * This could be called to release the last reference, or the last
3038 	 * user reference (plus the reference held in prison_proc_free).
3039 	 */
3040 	drflags = prison_lock_xlock(pr, PD_DEREF);
3041 	if (pr->pr_flags & PR_COMPLETE_PROC) {
3042 		pr->pr_flags &= ~PR_COMPLETE_PROC;
3043 		drflags |= PD_DEUREF;
3044 	}
3045 	prison_deref(pr, drflags);
3046 }
3047 
3048 static void
3049 prison_kill_processes_cb(struct proc *p, void *arg __unused)
3050 {
3051 
3052 	kern_psignal(p, SIGKILL);
3053 }
3054 
3055 /*
3056  * Note the iteration does not guarantee acting on all processes.
3057  * Most notably there may be fork or jail_attach in progress.
3058  */
3059 void
3060 prison_proc_iterate(struct prison *pr, void (*cb)(struct proc *, void *),
3061     void *cbarg)
3062 {
3063 	struct prison *ppr;
3064 	struct proc *p;
3065 
3066 	if (atomic_load_int(&pr->pr_childcount) == 0) {
3067 		sx_slock(&allproc_lock);
3068 		LIST_FOREACH(p, &pr->pr_proclist, p_jaillist) {
3069 			if (p->p_state == PRS_NEW)
3070 				continue;
3071 			PROC_LOCK(p);
3072 			cb(p, cbarg);
3073 			PROC_UNLOCK(p);
3074 		}
3075 		sx_sunlock(&allproc_lock);
3076 		if (atomic_load_int(&pr->pr_childcount) == 0)
3077 			return;
3078 		/*
3079 		 * Some jails popped up during the iteration, fall through to a
3080 		 * system-wide search.
3081 		 */
3082 	}
3083 
3084 	sx_slock(&allproc_lock);
3085 	FOREACH_PROC_IN_SYSTEM(p) {
3086 		PROC_LOCK(p);
3087 		if (p->p_state != PRS_NEW && p->p_ucred != NULL) {
3088 			for (ppr = p->p_ucred->cr_prison; ppr != NULL;
3089 			    ppr = ppr->pr_parent) {
3090 				if (ppr == pr) {
3091 					cb(p, cbarg);
3092 					break;
3093 				}
3094 			}
3095 		}
3096 		PROC_UNLOCK(p);
3097 	}
3098 	sx_sunlock(&allproc_lock);
3099 }
3100 
3101 /*
3102  * Remove a prison reference and/or user reference (usually).
3103  * This assumes context that allows sleeping (for allprison_lock),
3104  * with no non-sleeping locks held, except perhaps the prison itself.
3105  * If there are no more references, release and delist the prison.
3106  * On completion, the prison lock and the allprison lock are both
3107  * unlocked.
3108  */
3109 static void
3110 prison_deref(struct prison *pr, int flags)
3111 {
3112 	struct prisonlist freeprison;
3113 	struct prison *killpr, *rpr, *ppr, *tpr;
3114 
3115 	killpr = NULL;
3116 	TAILQ_INIT(&freeprison);
3117 	/*
3118 	 * Release this prison as requested, which may cause its parent
3119 	 * to be released, and then maybe its grandparent, etc.
3120 	 */
3121 	for (;;) {
3122 		if (flags & PD_KILL) {
3123 			/* Kill the prison and its descendents. */
3124 			KASSERT(pr != &prison0,
3125 			    ("prison_deref trying to kill prison0"));
3126 			if (!(flags & PD_DEREF)) {
3127 				prison_hold(pr);
3128 				flags |= PD_DEREF;
3129 			}
3130 			flags = prison_lock_xlock(pr, flags);
3131 			prison_deref_kill(pr, &freeprison);
3132 		}
3133 		if (flags & PD_DEUREF) {
3134 			/* Drop a user reference. */
3135 			KASSERT(refcount_load(&pr->pr_uref) > 0,
3136 			    ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
3137 			     pr->pr_id));
3138 			if (!refcount_release_if_not_last(&pr->pr_uref)) {
3139 				if (!(flags & PD_DEREF)) {
3140 					prison_hold(pr);
3141 					flags |= PD_DEREF;
3142 				}
3143 				flags = prison_lock_xlock(pr, flags);
3144 				if (refcount_release(&pr->pr_uref) &&
3145 				    pr->pr_state == PRISON_STATE_ALIVE) {
3146 					/*
3147 					 * When the last user references goes,
3148 					 * this becomes a dying prison.
3149 					 */
3150 					KASSERT(
3151 					    refcount_load(&prison0.pr_uref) > 0,
3152 					    ("prison0 pr_uref=0"));
3153 					pr->pr_state = PRISON_STATE_DYING;
3154 					mtx_unlock(&pr->pr_mtx);
3155 					flags &= ~PD_LOCKED;
3156 					prison_cleanup(pr);
3157 				}
3158 			}
3159 		}
3160 		if (flags & PD_KILL) {
3161 			/*
3162 			 * Any remaining user references are probably processes
3163 			 * that need to be killed, either in this prison or its
3164 			 * descendants.
3165 			 */
3166 			if (refcount_load(&pr->pr_uref) > 0)
3167 				killpr = pr;
3168 			/* Make sure the parent prison doesn't get killed. */
3169 			flags &= ~PD_KILL;
3170 		}
3171 		if (flags & PD_DEREF) {
3172 			/* Drop a reference. */
3173 			KASSERT(refcount_load(&pr->pr_ref) > 0,
3174 			    ("prison_deref PD_DEREF on a dead prison (jid=%d)",
3175 			     pr->pr_id));
3176 			if (!refcount_release_if_not_last(&pr->pr_ref)) {
3177 				flags = prison_lock_xlock(pr, flags);
3178 				if (refcount_release(&pr->pr_ref)) {
3179 					/*
3180 					 * When the last reference goes,
3181 					 * unlink the prison and set it aside.
3182 					 */
3183 					KASSERT(
3184 					    refcount_load(&pr->pr_uref) == 0,
3185 					    ("prison_deref: last ref, "
3186 					     "but still has %d urefs (jid=%d)",
3187 					     pr->pr_uref, pr->pr_id));
3188 					KASSERT(
3189 					    refcount_load(&prison0.pr_ref) != 0,
3190 					    ("prison0 pr_ref=0"));
3191 					pr->pr_state = PRISON_STATE_INVALID;
3192 					TAILQ_REMOVE(&allprison, pr, pr_list);
3193 					LIST_REMOVE(pr, pr_sibling);
3194 					TAILQ_INSERT_TAIL(&freeprison, pr,
3195 					    pr_list);
3196 					for (ppr = pr->pr_parent;
3197 					     ppr != NULL;
3198 					     ppr = ppr->pr_parent)
3199 						ppr->pr_childcount--;
3200 					/*
3201 					 * Removing a prison frees references
3202 					 * from its parent.
3203 					 */
3204 					mtx_unlock(&pr->pr_mtx);
3205 					flags &= ~PD_LOCKED;
3206 					pr = pr->pr_parent;
3207 					flags |= PD_DEREF | PD_DEUREF;
3208 					continue;
3209 				}
3210 			}
3211 		}
3212 		break;
3213 	}
3214 
3215 	/* Release all the prison locks. */
3216 	if (flags & PD_LOCKED)
3217 		mtx_unlock(&pr->pr_mtx);
3218 	if (flags & PD_LIST_SLOCKED)
3219 		sx_sunlock(&allprison_lock);
3220 	else if (flags & PD_LIST_XLOCKED)
3221 		sx_xunlock(&allprison_lock);
3222 
3223 	/* Kill any processes attached to a killed prison. */
3224 	if (killpr != NULL)
3225 		prison_proc_iterate(killpr, prison_kill_processes_cb, NULL);
3226 
3227 	/*
3228 	 * Finish removing any unreferenced prisons, which couldn't happen
3229 	 * while allprison_lock was held (to avoid a LOR on vrele).
3230 	 */
3231 	TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) {
3232 #ifdef VIMAGE
3233 		if (rpr->pr_vnet != rpr->pr_parent->pr_vnet)
3234 			vnet_destroy(rpr->pr_vnet);
3235 #endif
3236 		if (rpr->pr_root != NULL)
3237 			vrele(rpr->pr_root);
3238 		mtx_destroy(&rpr->pr_mtx);
3239 #ifdef INET
3240 		prison_ip_free(rpr->pr_addrs[PR_INET]);
3241 #endif
3242 #ifdef INET6
3243 		prison_ip_free(rpr->pr_addrs[PR_INET6]);
3244 #endif
3245 		if (rpr->pr_cpuset != NULL)
3246 			cpuset_rel(rpr->pr_cpuset);
3247 		osd_jail_exit(rpr);
3248 #ifdef RACCT
3249 		if (racct_enable)
3250 			prison_racct_detach(rpr);
3251 #endif
3252 		TAILQ_REMOVE(&freeprison, rpr, pr_list);
3253 		free(rpr, M_PRISON);
3254 	}
3255 }
3256 
/*
 * Kill the prison and its descendants.  Mark them as dying, clear the
 * persist flag, and call module remove methods.
 *
 * pr is the prison being killed; its mutex must be held on entry (it is
 * released almost immediately) and is held again on return.  Because
 * prison_cleanup() is called here, allprison_lock must be held exclusively.
 * Descendants whose last reference is dropped are unlinked from allprison
 * and appended to freeprison so the caller can finish destroying them.
 */
static void
prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
{
	struct prison *cpr, *ppr, *rpr;
	bool descend;

	/*
	 * Unlike the descendants, the target prison can be killed
	 * even if it is currently dying.  This is useful for failed
	 * creation in jail_set(2).
	 */
	KASSERT(refcount_load(&pr->pr_ref) > 0,
	    ("Trying to kill dead prison %p (jid=%d).",
	     pr, pr->pr_id));
	/*
	 * Take a temporary user reference on the target for the duration of
	 * this function; it is released on the last line.
	 */
	refcount_acquire(&pr->pr_uref);
	pr->pr_state = PRISON_STATE_DYING;
	mtx_unlock(&pr->pr_mtx);

	/*
	 * rpr remembers the most recently freed descendant whose removal
	 * from its sibling list is still pending.
	 */
	rpr = NULL;
	FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) {
		if (descend) {
			/*
			 * On the way down: skip subtrees rooted at a prison
			 * that is not alive; otherwise take a reference and a
			 * user reference, then mark the prison dying and
			 * flag it with PR_REMOVE for the way back up.
			 */
			if (!prison_isalive(cpr)) {
				descend = false;
				continue;
			}
			prison_hold(cpr);
			prison_proc_hold(cpr);
			mtx_lock(&cpr->pr_mtx);
			cpr->pr_state = PRISON_STATE_DYING;
			cpr->pr_flags |= PR_REMOVE;
			mtx_unlock(&cpr->pr_mtx);
			continue;
		}
		/* On the way up: only handle prisons flagged on the way down. */
		if (!(cpr->pr_flags & PR_REMOVE))
			continue;
		prison_cleanup(cpr);
		mtx_lock(&cpr->pr_mtx);
		cpr->pr_flags &= ~PR_REMOVE;
		if (cpr->pr_flags & PR_PERSIST) {
			/* Clearing PR_PERSIST drops the references it stood for. */
			cpr->pr_flags &= ~PR_PERSIST;
			prison_proc_free_not_last(cpr);
			prison_free_not_last(cpr);
		}
		/* Drop the user reference and reference taken on the way down. */
		(void)refcount_release(&cpr->pr_uref);
		if (refcount_release(&cpr->pr_ref)) {
			/*
			 * When the last reference goes, unlink the prison
			 * and set it aside for prison_deref() to handle.
			 * Delay unlinking the sibling list to keep the loop
			 * safe.
			 */
			if (rpr != NULL)
				LIST_REMOVE(rpr, pr_sibling);
			rpr = cpr;
			rpr->pr_state = PRISON_STATE_INVALID;
			TAILQ_REMOVE(&allprison, rpr, pr_list);
			TAILQ_INSERT_TAIL(freeprison, rpr, pr_list);
			/*
			 * Removing a prison frees references from its parent.
			 */
			ppr = rpr->pr_parent;
			prison_proc_free_not_last(ppr);
			prison_free_not_last(ppr);
			/* Every ancestor now has one fewer descendant. */
			for (; ppr != NULL; ppr = ppr->pr_parent)
				ppr->pr_childcount--;
		}
		mtx_unlock(&cpr->pr_mtx);
	}
	/* Finish the sibling-list removal that was deferred inside the loop. */
	if (rpr != NULL)
		LIST_REMOVE(rpr, pr_sibling);

	/* Finally handle the target prison itself; return with it locked. */
	prison_cleanup(pr);
	mtx_lock(&pr->pr_mtx);
	if (pr->pr_flags & PR_PERSIST) {
		pr->pr_flags &= ~PR_PERSIST;
		prison_proc_free_not_last(pr);
		prison_free_not_last(pr);
	}
	/* Release the temporary user reference acquired at the top. */
	(void)refcount_release(&pr->pr_uref);
}
3341 
3342 /*
3343  * Given the current locking state in the flags, make sure allprison_lock
3344  * is held exclusive, and the prison is locked.  Return flags indicating
3345  * the new state.
3346  */
3347 static int
3348 prison_lock_xlock(struct prison *pr, int flags)
3349 {
3350 
3351 	if (!(flags & PD_LIST_XLOCKED)) {
3352 		/*
3353 		 * Get allprison_lock, which may be an upgrade,
3354 		 * and may require unlocking the prison.
3355 		 */
3356 		if (flags & PD_LOCKED) {
3357 			mtx_unlock(&pr->pr_mtx);
3358 			flags &= ~PD_LOCKED;
3359 		}
3360 		if (flags & PD_LIST_SLOCKED) {
3361 			if (!sx_try_upgrade(&allprison_lock)) {
3362 				sx_sunlock(&allprison_lock);
3363 				sx_xlock(&allprison_lock);
3364 			}
3365 			flags &= ~PD_LIST_SLOCKED;
3366 		} else
3367 			sx_xlock(&allprison_lock);
3368 		flags |= PD_LIST_XLOCKED;
3369 	}
3370 	if (!(flags & PD_LOCKED)) {
3371 		/* Lock the prison mutex. */
3372 		mtx_lock(&pr->pr_mtx);
3373 		flags |= PD_LOCKED;
3374 	}
3375 	return flags;
3376 }
3377 
/*
 * Release a prison's resources when it starts dying (when the last user
 * reference is dropped, or when it is killed).
 *
 * The caller must hold allprison_lock exclusively and must not own the
 * prison's mutex; both conditions are asserted on entry.
 */
static void
prison_cleanup(struct prison *pr)
{
	sx_assert(&allprison_lock, SA_XLOCKED);
	mtx_assert(&pr->pr_mtx, MA_NOTOWNED);
	vfs_exjail_delete(pr);
	shm_remove_prison(pr);
	/* Run the PR_METHOD_REMOVE OSD method; its result is ignored. */
	(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
}
3391 
3392 /*
3393  * Set or clear a permission bit in the pr_allow field, passing restrictions
3394  * (cleared permission) down to child jails.
3395  */
3396 void
3397 prison_set_allow(struct ucred *cred, unsigned flag, int enable)
3398 {
3399 	struct prison *pr;
3400 
3401 	pr = cred->cr_prison;
3402 	sx_slock(&allprison_lock);
3403 	mtx_lock(&pr->pr_mtx);
3404 	prison_set_allow_locked(pr, flag, enable);
3405 	mtx_unlock(&pr->pr_mtx);
3406 	sx_sunlock(&allprison_lock);
3407 }
3408 
3409 static void
3410 prison_set_allow_locked(struct prison *pr, unsigned flag, int enable)
3411 {
3412 	struct prison *cpr;
3413 	int descend;
3414 
3415 	if (enable != 0)
3416 		pr->pr_allow |= flag;
3417 	else {
3418 		pr->pr_allow &= ~flag;
3419 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
3420 			cpr->pr_allow &= ~flag;
3421 	}
3422 }
3423 
3424 /*
3425  * Check if a jail supports the given address family.
3426  *
3427  * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3428  * if not.
3429  */
3430 int
3431 prison_check_af(struct ucred *cred, int af)
3432 {
3433 	struct prison *pr;
3434 	int error;
3435 
3436 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3437 
3438 	pr = cred->cr_prison;
3439 #ifdef VIMAGE
3440 	/* Prisons with their own network stack are not limited. */
3441 	if (prison_owns_vnet(cred))
3442 		return (0);
3443 #endif
3444 
3445 	error = 0;
3446 	switch (af)
3447 	{
3448 #ifdef INET
3449 	case AF_INET:
3450 		if (pr->pr_flags & PR_IP4)
3451 		{
3452 			mtx_lock(&pr->pr_mtx);
3453 			if ((pr->pr_flags & PR_IP4) &&
3454 			    pr->pr_addrs[PR_INET] == NULL)
3455 				error = EAFNOSUPPORT;
3456 			mtx_unlock(&pr->pr_mtx);
3457 		}
3458 		break;
3459 #endif
3460 #ifdef INET6
3461 	case AF_INET6:
3462 		if (pr->pr_flags & PR_IP6)
3463 		{
3464 			mtx_lock(&pr->pr_mtx);
3465 			if ((pr->pr_flags & PR_IP6) &&
3466 			    pr->pr_addrs[PR_INET6] == NULL)
3467 				error = EAFNOSUPPORT;
3468 			mtx_unlock(&pr->pr_mtx);
3469 		}
3470 		break;
3471 #endif
3472 	case AF_LOCAL:
3473 	case AF_ROUTE:
3474 	case AF_NETLINK:
3475 		break;
3476 	default:
3477 		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3478 			error = EAFNOSUPPORT;
3479 	}
3480 	return (error);
3481 }
3482 
3483 /*
3484  * Check if given address belongs to the jail referenced by cred (wrapper to
3485  * prison_check_ip[46]).
3486  *
3487  * Returns 0 if jail doesn't restrict the address family or if address belongs
3488  * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3489  * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3490  */
3491 int
3492 prison_if(struct ucred *cred, const struct sockaddr *sa)
3493 {
3494 #ifdef INET
3495 	const struct sockaddr_in *sai;
3496 #endif
3497 #ifdef INET6
3498 	const struct sockaddr_in6 *sai6;
3499 #endif
3500 	int error;
3501 
3502 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3503 	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3504 
3505 #ifdef VIMAGE
3506 	if (prison_owns_vnet(cred))
3507 		return (0);
3508 #endif
3509 
3510 	error = 0;
3511 	switch (sa->sa_family)
3512 	{
3513 #ifdef INET
3514 	case AF_INET:
3515 		sai = (const struct sockaddr_in *)sa;
3516 		error = prison_check_ip4(cred, &sai->sin_addr);
3517 		break;
3518 #endif
3519 #ifdef INET6
3520 	case AF_INET6:
3521 		sai6 = (const struct sockaddr_in6 *)sa;
3522 		error = prison_check_ip6(cred, &sai6->sin6_addr);
3523 		break;
3524 #endif
3525 	default:
3526 		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3527 			error = EAFNOSUPPORT;
3528 	}
3529 	return (error);
3530 }
3531 
3532 /*
3533  * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3534  */
3535 int
3536 prison_check(struct ucred *cred1, struct ucred *cred2)
3537 {
3538 
3539 	return ((cred1->cr_prison == cred2->cr_prison ||
3540 	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3541 }
3542 
3543 /*
3544  * For mountd/nfsd to run within a prison, it must be:
3545  * - A vnet prison.
3546  * - PR_ALLOW_NFSD must be set on it.
3547  * - The root directory (pr_root) of the prison must be
3548  *   a file system mount point, so the mountd can hang
3549  *   export information on it.
3550  * - The prison's enforce_statfs cannot be 0, so that
3551  *   mountd(8) can do exports.
3552  */
3553 bool
3554 prison_check_nfsd(struct ucred *cred)
3555 {
3556 
3557 	if (jailed_without_vnet(cred))
3558 		return (false);
3559 	if (!prison_allow(cred, PR_ALLOW_NFSD))
3560 		return (false);
3561 	if ((cred->cr_prison->pr_root->v_vflag & VV_ROOT) == 0)
3562 		return (false);
3563 	if (cred->cr_prison->pr_enforce_statfs == 0)
3564 		return (false);
3565 	return (true);
3566 }
3567 
3568 /*
3569  * Return true if p2 is a child of p1, otherwise false.
3570  */
3571 bool
3572 prison_ischild(struct prison *pr1, struct prison *pr2)
3573 {
3574 
3575 	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3576 		if (pr1 == pr2)
3577 			return (true);
3578 	return (false);
3579 }
3580 
3581 /*
3582  * Return true if the prison is currently alive.  A prison is alive if it
3583  * holds user references and it isn't being removed.
3584  */
3585 bool
3586 prison_isalive(const struct prison *pr)
3587 {
3588 
3589 	if (__predict_false(pr->pr_state != PRISON_STATE_ALIVE))
3590 		return (false);
3591 	return (true);
3592 }
3593 
3594 /*
3595  * Return true if the prison is currently valid.  A prison is valid if it has
3596  * been fully created, and is not being destroyed.  Note that dying prisons
3597  * are still considered valid.  Invalid prisons won't be found under normal
3598  * circumstances, as they're only put in that state by functions that have
3599  * an exclusive hold on allprison_lock.
3600  */
3601 bool
3602 prison_isvalid(struct prison *pr)
3603 {
3604 
3605 	if (__predict_false(pr->pr_state == PRISON_STATE_INVALID))
3606 		return (false);
3607 	if (__predict_false(refcount_load(&pr->pr_ref) == 0))
3608 		return (false);
3609 	return (true);
3610 }
3611 
/*
 * Return true if the passed credential is in a jail and that jail does not
 * have its own virtual network stack, otherwise false.  Without VIMAGE no
 * jail can own a vnet, so the answer is simply whether cred is jailed.
 */
bool
jailed_without_vnet(struct ucred *cred)
{

#ifdef VIMAGE
	return (jailed(cred) && !prison_owns_vnet(cred));
#else
	return (jailed(cred));
#endif
}
3629 
3630 /*
3631  * Return the correct hostname (domainname, et al) for the passed credential.
3632  */
3633 void
3634 getcredhostname(struct ucred *cred, char *buf, size_t size)
3635 {
3636 	struct prison *pr;
3637 
3638 	/*
3639 	 * A NULL credential can be used to shortcut to the physical
3640 	 * system's hostname.
3641 	 */
3642 	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3643 	mtx_lock(&pr->pr_mtx);
3644 	strlcpy(buf, pr->pr_hostname, size);
3645 	mtx_unlock(&pr->pr_mtx);
3646 }
3647 
3648 void
3649 getcreddomainname(struct ucred *cred, char *buf, size_t size)
3650 {
3651 
3652 	mtx_lock(&cred->cr_prison->pr_mtx);
3653 	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3654 	mtx_unlock(&cred->cr_prison->pr_mtx);
3655 }
3656 
3657 void
3658 getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3659 {
3660 
3661 	mtx_lock(&cred->cr_prison->pr_mtx);
3662 	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3663 	mtx_unlock(&cred->cr_prison->pr_mtx);
3664 }
3665 
3666 void
3667 getcredhostid(struct ucred *cred, unsigned long *hostid)
3668 {
3669 
3670 	mtx_lock(&cred->cr_prison->pr_mtx);
3671 	*hostid = cred->cr_prison->pr_hostid;
3672 	mtx_unlock(&cred->cr_prison->pr_mtx);
3673 }
3674 
3675 void
3676 getjailname(struct ucred *cred, char *name, size_t len)
3677 {
3678 
3679 	mtx_lock(&cred->cr_prison->pr_mtx);
3680 	strlcpy(name, cred->cr_prison->pr_name, len);
3681 	mtx_unlock(&cred->cr_prison->pr_mtx);
3682 }
3683 
3684 #ifdef VIMAGE
3685 /*
3686  * Determine whether the prison represented by cred owns
3687  * its vnet rather than having it inherited.
3688  *
3689  * Returns true in case the prison owns the vnet, false otherwise.
3690  */
3691 bool
3692 prison_owns_vnet(struct ucred *cred)
3693 {
3694 
3695 	/*
3696 	 * vnets cannot be added/removed after jail creation,
3697 	 * so no need to lock here.
3698 	 */
3699 	return ((cred->cr_prison->pr_flags & PR_VNET) != 0);
3700 }
3701 #endif
3702 
3703 /*
3704  * Determine whether the subject represented by cred can "see"
3705  * status of a mount point.
3706  * Returns: 0 for permitted, ENOENT otherwise.
3707  * XXX: This function should be called cr_canseemount() and should be
3708  *      placed in kern_prot.c.
3709  */
3710 int
3711 prison_canseemount(struct ucred *cred, struct mount *mp)
3712 {
3713 	struct prison *pr;
3714 	struct statfs *sp;
3715 	size_t len;
3716 
3717 	pr = cred->cr_prison;
3718 	if (pr->pr_enforce_statfs == 0)
3719 		return (0);
3720 	if (pr->pr_root->v_mount == mp)
3721 		return (0);
3722 	if (pr->pr_enforce_statfs == 2)
3723 		return (ENOENT);
3724 	/*
3725 	 * If jail's chroot directory is set to "/" we should be able to see
3726 	 * all mount-points from inside a jail.
3727 	 * This is ugly check, but this is the only situation when jail's
3728 	 * directory ends with '/'.
3729 	 */
3730 	if (strcmp(pr->pr_path, "/") == 0)
3731 		return (0);
3732 	len = strlen(pr->pr_path);
3733 	sp = &mp->mnt_stat;
3734 	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3735 		return (ENOENT);
3736 	/*
3737 	 * Be sure that we don't have situation where jail's root directory
3738 	 * is "/some/path" and mount point is "/some/pathpath".
3739 	 */
3740 	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3741 		return (ENOENT);
3742 	return (0);
3743 }
3744 
3745 void
3746 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3747 {
3748 	char jpath[MAXPATHLEN];
3749 	struct prison *pr;
3750 	size_t len;
3751 
3752 	pr = cred->cr_prison;
3753 	if (pr->pr_enforce_statfs == 0)
3754 		return;
3755 	if (prison_canseemount(cred, mp) != 0) {
3756 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3757 		strlcpy(sp->f_mntonname, "[restricted]",
3758 		    sizeof(sp->f_mntonname));
3759 		return;
3760 	}
3761 	if (pr->pr_root->v_mount == mp) {
3762 		/*
3763 		 * Clear current buffer data, so we are sure nothing from
3764 		 * the valid path left there.
3765 		 */
3766 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3767 		*sp->f_mntonname = '/';
3768 		return;
3769 	}
3770 	/*
3771 	 * If jail's chroot directory is set to "/" we should be able to see
3772 	 * all mount-points from inside a jail.
3773 	 */
3774 	if (strcmp(pr->pr_path, "/") == 0)
3775 		return;
3776 	len = strlen(pr->pr_path);
3777 	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3778 	/*
3779 	 * Clear current buffer data, so we are sure nothing from
3780 	 * the valid path left there.
3781 	 */
3782 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3783 	if (*jpath == '\0') {
3784 		/* Should never happen. */
3785 		*sp->f_mntonname = '/';
3786 	} else {
3787 		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3788 	}
3789 }
3790 
/*
 * Check whether permission for a specific privilege is granted within the
 * jail of the given credential.  We have a specific list of accepted
 * privileges; the rest are denied.
 *
 * Returns 0 if cred is not jailed or the privilege is granted, and EPERM
 * otherwise.
 */
int
prison_priv_check(struct ucred *cred, int priv)
{
	struct prison *pr;
	int error;

	/*
	 * Some policies have custom handlers. This routine should not be
	 * called for them. See priv_check_cred().
	 */
	switch (priv) {
	case PRIV_VFS_LOOKUP:
	case PRIV_VFS_GENERATION:
		KASSERT(0, ("prison_priv_check instead of a custom handler "
		    "called for %d\n", priv));
	}

	if (!jailed(cred))
		return (0);

#ifdef VIMAGE
	/*
	 * Privileges specific to prisons with a virtual network stack.
	 * There might be a duplicate entry here in case the privilege
	 * is only granted conditionally in the legacy jail case.
	 */
	switch (priv) {
		/*
		 * NFS-specific privileges.
		 */
	case PRIV_NFS_DAEMON:
	case PRIV_VFS_GETFH:
	case PRIV_VFS_MOUNT_EXPORTED:
		if (!prison_check_nfsd(cred))
			return (EPERM);
		/*
		 * FALLTHROUGH: when the nfsd check passes, these are
		 * subject to the PR_VNET test at the end of this switch.
		 */
#ifdef notyet
	case PRIV_NFS_LOCKD:
#endif
		/*
		 * Network stack privileges.
		 */
	case PRIV_NET_BRIDGE:
	case PRIV_NET_GRE:
	case PRIV_NET_BPF:
	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
	case PRIV_NET_ROUTE:
	case PRIV_NET_TAP:
	case PRIV_NET_SETIFMTU:
	case PRIV_NET_SETIFFLAGS:
	case PRIV_NET_SETIFCAP:
	case PRIV_NET_SETIFDESCR:
	case PRIV_NET_SETIFNAME	:
	case PRIV_NET_SETIFMETRIC:
	case PRIV_NET_SETIFPHYS:
	case PRIV_NET_SETIFMAC:
	case PRIV_NET_SETLANPCP:
	case PRIV_NET_ADDMULTI:
	case PRIV_NET_DELMULTI:
	case PRIV_NET_HWIOCTL:
	case PRIV_NET_SETLLADDR:
	case PRIV_NET_ADDIFGROUP:
	case PRIV_NET_DELIFGROUP:
	case PRIV_NET_IFCREATE:
	case PRIV_NET_IFDESTROY:
	case PRIV_NET_ADDIFADDR:
	case PRIV_NET_DELIFADDR:
	case PRIV_NET_LAGG:
	case PRIV_NET_GIF:
	case PRIV_NET_SETIFVNET:
	case PRIV_NET_SETIFFIB:
	case PRIV_NET_OVPN:
	case PRIV_NET_ME:
	case PRIV_NET_WG:

		/*
		 * 802.11-related privileges.
		 */
	case PRIV_NET80211_VAP_GETKEY:
	case PRIV_NET80211_VAP_MANAGE:

#ifdef notyet
		/*
		 * ATM privileges.
		 */
	case PRIV_NETATM_CFG:
	case PRIV_NETATM_ADD:
	case PRIV_NETATM_DEL:
	case PRIV_NETATM_SET:

		/*
		 * Bluetooth privileges.
		 */
	case PRIV_NETBLUETOOTH_RAW:
#endif

		/*
		 * Netgraph and netgraph module privileges.
		 */
	case PRIV_NETGRAPH_CONTROL:
#ifdef notyet
	case PRIV_NETGRAPH_TTY:
#endif

		/*
		 * IPv4 and IPv6 privileges.
		 */
	case PRIV_NETINET_IPFW:
	case PRIV_NETINET_DIVERT:
	case PRIV_NETINET_PF:
	case PRIV_NETINET_DUMMYNET:
	case PRIV_NETINET_CARP:
	case PRIV_NETINET_MROUTE:
	case PRIV_NETINET_RAW:
	case PRIV_NETINET_ADDRCTRL6:
	case PRIV_NETINET_ND6:
	case PRIV_NETINET_SCOPE6:
	case PRIV_NETINET_ALIFETIME6:
	case PRIV_NETINET_IPSEC:
	case PRIV_NETINET_BINDANY:

#ifdef notyet
		/*
		 * NCP privileges.
		 */
	case PRIV_NETNCP:

		/*
		 * SMB privileges.
		 */
	case PRIV_NETSMB:
#endif

	/*
	 * No default: or deny here.
	 * In case of no permit fall through to next switch().
	 */
		if (cred->cr_prison->pr_flags & PR_VNET)
			return (0);
	}
#endif /* VIMAGE */

	switch (priv) {
		/*
		 * Allow ktrace privileges for root in jail.
		 */
	case PRIV_KTRACE:

#if 0
		/*
		 * Allow jailed processes to configure audit identity and
		 * submit audit records (login, etc).  In the future we may
		 * want to further refine the relationship between audit and
		 * jail.
		 */
	case PRIV_AUDIT_GETAUDIT:
	case PRIV_AUDIT_SETAUDIT:
	case PRIV_AUDIT_SUBMIT:
#endif

		/*
		 * Allow jailed processes to manipulate process UNIX
		 * credentials in any way they see fit.
		 */
	case PRIV_CRED_SETUID:
	case PRIV_CRED_SETEUID:
	case PRIV_CRED_SETGID:
	case PRIV_CRED_SETEGID:
	case PRIV_CRED_SETGROUPS:
	case PRIV_CRED_SETREUID:
	case PRIV_CRED_SETREGID:
	case PRIV_CRED_SETRESUID:
	case PRIV_CRED_SETRESGID:

		/*
		 * Jail implements visibility constraints already, so allow
		 * jailed root to override uid/gid-based constraints.
		 */
	case PRIV_SEEOTHERGIDS:
	case PRIV_SEEOTHERUIDS:
	case PRIV_SEEJAILPROC:

		/*
		 * Jail implements inter-process debugging limits already, so
		 * allow jailed root various debugging privileges.
		 */
	case PRIV_DEBUG_DIFFCRED:
	case PRIV_DEBUG_SUGID:
	case PRIV_DEBUG_UNPRIV:

		/*
		 * Allow jail to set various resource limits and login
		 * properties, and for now, exceed process resource limits.
		 */
	case PRIV_PROC_LIMIT:
	case PRIV_PROC_SETLOGIN:
	case PRIV_PROC_SETRLIMIT:

		/*
		 * System V and POSIX IPC privileges are granted in jail.
		 */
	case PRIV_IPC_READ:
	case PRIV_IPC_WRITE:
	case PRIV_IPC_ADMIN:
	case PRIV_IPC_MSGSIZE:
	case PRIV_MQ_ADMIN:

		/*
		 * Jail operations within a jail work on child jails.
		 */
	case PRIV_JAIL_ATTACH:
	case PRIV_JAIL_SET:
	case PRIV_JAIL_REMOVE:

		/*
		 * Jail implements its own inter-process limits, so allow
		 * root processes in jail to change scheduling on other
		 * processes in the same jail.  Likewise for signalling.
		 */
	case PRIV_SCHED_DIFFCRED:
	case PRIV_SCHED_CPUSET:
	case PRIV_SIGNAL_DIFFCRED:
	case PRIV_SIGNAL_SUGID:

		/*
		 * Allow jailed processes to write to sysctls marked as jail
		 * writable.
		 */
	case PRIV_SYSCTL_WRITEJAIL:

		/*
		 * Allow root in jail to manage a variety of quota
		 * properties.  These should likely be conditional on a
		 * configuration option.
		 */
	case PRIV_VFS_GETQUOTA:
	case PRIV_VFS_SETQUOTA:

		/*
		 * Since Jail relies on chroot() to implement file system
		 * protections, grant many VFS privileges to root in jail.
		 * Be careful to exclude mount-related and NFS-related
		 * privileges.
		 */
	case PRIV_VFS_READ:
	case PRIV_VFS_WRITE:
	case PRIV_VFS_ADMIN:
	case PRIV_VFS_EXEC:
	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
	case PRIV_VFS_CHFLAGS_DEV:
	case PRIV_VFS_CHOWN:
	case PRIV_VFS_CHROOT:
	case PRIV_VFS_RETAINSUGID:
	case PRIV_VFS_FCHROOT:
	case PRIV_VFS_LINK:
	case PRIV_VFS_SETGID:
	case PRIV_VFS_STAT:
	case PRIV_VFS_STICKYFILE:

		/*
		 * As in the non-jail case, non-root users are expected to be
		 * able to read kernel/physical memory (provided /dev/[k]mem
		 * exists in the jail and they have permission to access it).
		 */
	case PRIV_KMEM_READ:
		return (0);

		/*
		 * Depending on the global setting, allow privilege of
		 * setting system flags.
		 */
	case PRIV_VFS_SYSFLAGS:
		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
			return (0);
		else
			return (EPERM);

		/*
		 * Depending on the global setting, allow privilege of
		 * mounting/unmounting file systems.  Both PR_ALLOW_MOUNT and
		 * an enforce_statfs level below 2 are required; the two
		 * fields are read with the prison locked.
		 */
	case PRIV_VFS_MOUNT:
	case PRIV_VFS_UNMOUNT:
	case PRIV_VFS_MOUNT_NONUSER:
	case PRIV_VFS_MOUNT_OWNER:
		pr = cred->cr_prison;
		prison_lock(pr);
		if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2)
			error = 0;
		else
			error = EPERM;
		prison_unlock(pr);
		return (error);

		/*
		 * Jails should hold no disposition on the PRIV_VFS_READ_DIR
		 * policy.  priv_check_cred will not specifically allow it, and
		 * we may want a MAC policy to allow it.
		 */
	case PRIV_VFS_READ_DIR:
		return (0);

		/*
		 * Conditionally allow privileged process in the jail to
		 * manipulate filesystem extended attributes in the system
		 * namespace.
		 */
	case PRIV_VFS_EXTATTR_SYSTEM:
		if ((cred->cr_prison->pr_allow & PR_ALLOW_EXTATTR) != 0)
			return (0);
		else
			return (EPERM);

		/*
		 * Conditionally allow locking (unlocking) physical pages
		 * in memory.
		 */
	case PRIV_VM_MLOCK:
	case PRIV_VM_MUNLOCK:
		if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK)
			return (0);
		else
			return (EPERM);

		/*
		 * Conditionally allow jailed root to bind reserved ports.
		 */
	case PRIV_NETINET_RESERVEDPORT:
		if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS)
			return (0);
		else
			return (EPERM);

		/*
		 * Allow jailed root to reuse in-use ports.
		 */
	case PRIV_NETINET_REUSEPORT:
		return (0);

		/*
		 * Allow jailed root to set certain IPv4/6 (option) headers.
		 */
	case PRIV_NETINET_SETHDROPTS:
		return (0);

		/*
		 * Conditionally allow creating raw sockets in jail.
		 */
	case PRIV_NETINET_RAW:
		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
			return (0);
		else
			return (EPERM);

		/*
		 * Since jail implements its own visibility limits on netstat
		 * sysctls, allow getcred.  This allows identd to work in
		 * jail.
		 */
	case PRIV_NETINET_GETCRED:
		return (0);

		/*
		 * Allow jailed root to set loginclass.
		 */
	case PRIV_PROC_SETLOGINCLASS:
		return (0);

		/*
		 * Do not allow a process inside a jail to read the kernel
		 * message buffer unless explicitly permitted.
		 */
	case PRIV_MSGBUF:
		if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF)
			return (0);
		return (EPERM);

		/*
		 * Conditionally allow privileged process in the jail adjust
		 * machine time.  Either PR_ALLOW_ADJTIME or PR_ALLOW_SETTIME
		 * is sufficient.
		 */
	case PRIV_ADJTIME:
	case PRIV_NTP_ADJTIME:
		if (cred->cr_prison->pr_allow &
		    (PR_ALLOW_ADJTIME | PR_ALLOW_SETTIME)) {
			return (0);
		}
		return (EPERM);

		/*
		 * Conditionally allow privileged process in the jail set
		 * machine time.
		 */
	case PRIV_CLOCK_SETTIME:
		if (cred->cr_prison->pr_allow & PR_ALLOW_SETTIME)
			return (0);
		else
			return (EPERM);

	default:
		/*
		 * In all remaining cases, deny the privilege request.  This
		 * includes almost all network privileges, many system
		 * configuration privileges.
		 */
		return (EPERM);
	}
}
4202 
4203 /*
4204  * Return the part of pr2's name that is relative to pr1, or the whole name
4205  * if it does not directly follow.
4206  */
4207 
4208 char *
4209 prison_name(struct prison *pr1, struct prison *pr2)
4210 {
4211 	char *name;
4212 
4213 	/* Jails see themselves as "0" (if they see themselves at all). */
4214 	if (pr1 == pr2)
4215 		return "0";
4216 	name = pr2->pr_name;
4217 	if (prison_ischild(pr1, pr2)) {
4218 		/*
4219 		 * pr1 isn't locked (and allprison_lock may not be either)
4220 		 * so its length can't be counted on.  But the number of dots
4221 		 * can be counted on - and counted.
4222 		 */
4223 		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
4224 			name = strchr(name, '.') + 1;
4225 	}
4226 	return (name);
4227 }
4228 
4229 /*
4230  * Return the part of pr2's path that is relative to pr1, or the whole path
4231  * if it does not directly follow.
4232  */
4233 static char *
4234 prison_path(struct prison *pr1, struct prison *pr2)
4235 {
4236 	char *path1, *path2;
4237 	int len1;
4238 
4239 	path1 = pr1->pr_path;
4240 	path2 = pr2->pr_path;
4241 	if (!strcmp(path1, "/"))
4242 		return (path2);
4243 	len1 = strlen(path1);
4244 	if (strncmp(path1, path2, len1))
4245 		return (path2);
4246 	if (path2[len1] == '\0')
4247 		return "/";
4248 	if (path2[len1] == '/')
4249 		return (path2 + len1);
4250 	return (path2);
4251 }
4252 
/*
 * Jail-related sysctls.
 *
 * Declares the "jail" node under _security, described as "Jails"; the
 * entries registered against _security_jail hang off this node.
 */
static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Jails");
4258 
4259 #if defined(INET) || defined(INET6)
4260 /*
4261  * Copy address array to memory that would be then SYSCTL_OUT-ed.
4262  * sysctl_jail_list() helper.
4263  */
4264 static void
4265 prison_ip_copyout(struct prison *pr, const pr_family_t af, void **out, int *len)
4266 {
4267 	const struct prison_ip *pip;
4268 	const size_t size = pr_families[af].size;
4269 
4270  again:
4271 	mtx_assert(&pr->pr_mtx, MA_OWNED);
4272 	if ((pip = pr->pr_addrs[af]) != NULL) {
4273 		if (*len < pip->ips) {
4274 			*len = pip->ips;
4275 			mtx_unlock(&pr->pr_mtx);
4276 			*out = realloc(*out, *len * size, M_TEMP, M_WAITOK);
4277 			mtx_lock(&pr->pr_mtx);
4278 			goto again;
4279 		}
4280 		bcopy(pip->pr_ip, *out, pip->ips * size);
4281 	}
4282 }
4283 #endif
4284 
4285 static int
4286 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
4287 {
4288 	struct xprison *xp;
4289 	struct prison *pr, *cpr;
4290 #ifdef INET
4291 	struct in_addr *ip4 = NULL;
4292 	int ip4s = 0;
4293 #endif
4294 #ifdef INET6
4295 	struct in6_addr *ip6 = NULL;
4296 	int ip6s = 0;
4297 #endif
4298 	int descend, error;
4299 
4300 	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
4301 	pr = req->td->td_ucred->cr_prison;
4302 	error = 0;
4303 	sx_slock(&allprison_lock);
4304 	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
4305 		mtx_lock(&cpr->pr_mtx);
4306 #ifdef INET
4307 		prison_ip_copyout(cpr, PR_INET, (void **)&ip4, &ip4s);
4308 #endif
4309 #ifdef INET6
4310 		prison_ip_copyout(cpr, PR_INET6, (void **)&ip6, &ip6s);
4311 #endif
4312 		bzero(xp, sizeof(*xp));
4313 		xp->pr_version = XPRISON_VERSION;
4314 		xp->pr_id = cpr->pr_id;
4315 		xp->pr_state = cpr->pr_state;
4316 		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
4317 		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
4318 		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
4319 #ifdef INET
4320 		xp->pr_ip4s = ip4s;
4321 #endif
4322 #ifdef INET6
4323 		xp->pr_ip6s = ip6s;
4324 #endif
4325 		mtx_unlock(&cpr->pr_mtx);
4326 		error = SYSCTL_OUT(req, xp, sizeof(*xp));
4327 		if (error)
4328 			break;
4329 #ifdef INET
4330 		if (xp->pr_ip4s > 0) {
4331 			error = SYSCTL_OUT(req, ip4,
4332 			    xp->pr_ip4s * sizeof(struct in_addr));
4333 			if (error)
4334 				break;
4335 		}
4336 #endif
4337 #ifdef INET6
4338 		if (xp->pr_ip6s > 0) {
4339 			error = SYSCTL_OUT(req, ip6,
4340 			    xp->pr_ip6s * sizeof(struct in6_addr));
4341 			if (error)
4342 				break;
4343 		}
4344 #endif
4345 	}
4346 	sx_sunlock(&allprison_lock);
4347 	free(xp, M_TEMP);
4348 #ifdef INET
4349 	free(ip4, M_TEMP);
4350 #endif
4351 #ifdef INET6
4352 	free(ip6, M_TEMP);
4353 #endif
4354 	return (error);
4355 }
4356 
4357 SYSCTL_OID(_security_jail, OID_AUTO, list,
4358     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4359     sysctl_jail_list, "S", "List of active jails");
4360 
4361 static int
4362 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
4363 {
4364 	int error, injail;
4365 
4366 	injail = jailed(req->td->td_ucred);
4367 	error = SYSCTL_OUT(req, &injail, sizeof(injail));
4368 
4369 	return (error);
4370 }
4371 
4372 SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
4373     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4374     sysctl_jail_jailed, "I", "Process in jail?");
4375 
4376 static int
4377 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
4378 {
4379 	int error, havevnet;
4380 #ifdef VIMAGE
4381 	struct ucred *cred = req->td->td_ucred;
4382 
4383 	havevnet = jailed(cred) && prison_owns_vnet(cred);
4384 #else
4385 	havevnet = 0;
4386 #endif
4387 	error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
4388 
4389 	return (error);
4390 }
4391 
4392 SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
4393     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4394     sysctl_jail_vnet, "I", "Jail owns vnet?");
4395 
4396 #if defined(INET) || defined(INET6)
4397 SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
4398     &jail_max_af_ips, 0,
4399     "Number of IP addresses a jail may have at most per address family (deprecated)");
4400 #endif
4401 
4402 /*
4403  * Default parameters for jail(2) compatibility.  For historical reasons,
4404  * the sysctl names have varying similarity to the parameter names.  Prisons
4405  * just see their own parameters, and can't change them.
4406  */
4407 static int
4408 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
4409 {
4410 	int error, i;
4411 
4412 	/* Get the current flag value, and convert it to a boolean. */
4413 	if (req->td->td_ucred->cr_prison == &prison0) {
4414 		mtx_lock(&prison0.pr_mtx);
4415 		i = (jail_default_allow & arg2) != 0;
4416 		mtx_unlock(&prison0.pr_mtx);
4417 	} else
4418 		i = prison_allow(req->td->td_ucred, arg2);
4419 
4420 	if (arg1 != NULL)
4421 		i = !i;
4422 	error = sysctl_handle_int(oidp, &i, 0, req);
4423 	if (error || !req->newptr)
4424 		return (error);
4425 	i = i ? arg2 : 0;
4426 	if (arg1 != NULL)
4427 		i ^= arg2;
4428 	/*
4429 	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
4430 	 * for writing.
4431 	 */
4432 	mtx_lock(&prison0.pr_mtx);
4433 	jail_default_allow = (jail_default_allow & ~arg2) | i;
4434 	mtx_unlock(&prison0.pr_mtx);
4435 	return (0);
4436 }
4437 
4438 SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
4439     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4440     NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
4441     "Processes in jail can set their hostnames (deprecated)");
4442 SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
4443     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4444     (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
4445     "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
4446 SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
4447     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4448     NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
4449     "Processes in jail can use System V IPC primitives (deprecated)");
4450 SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
4451     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4452     NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
4453     "Prison root can create raw sockets (deprecated)");
4454 SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
4455     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4456     NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
4457     "Processes in jail can alter system file flags (deprecated)");
4458 SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
4459     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4460     NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
4461     "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
4462 SYSCTL_PROC(_security_jail, OID_AUTO, mlock_allowed,
4463     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4464     NULL, PR_ALLOW_MLOCK, sysctl_jail_default_allow, "I",
4465     "Processes in jail can lock/unlock physical pages in memory");
4466 
4467 static int
4468 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
4469 {
4470 	struct prison *pr;
4471 	int level, error;
4472 
4473 	pr = req->td->td_ucred->cr_prison;
4474 	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4475 	error = sysctl_handle_int(oidp, &level, 0, req);
4476 	if (error || !req->newptr)
4477 		return (error);
4478 	*(int *)arg1 = level;
4479 	return (0);
4480 }
4481 
4482 SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
4483     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4484     &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
4485     sysctl_jail_default_level, "I",
4486     "Processes in jail cannot see all mounted file systems (deprecated)");
4487 
4488 SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
4489     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4490     &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
4491     sysctl_jail_default_level, "I",
4492     "Ruleset for the devfs filesystem in jail (deprecated)");
4493 
4494 SYSCTL_NODE(_security_jail, OID_AUTO, children, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
4495     "Limits and stats of child jails");
4496 
4497 static int
4498 sysctl_jail_children(SYSCTL_HANDLER_ARGS)
4499 {
4500 	struct prison *pr;
4501 	int i;
4502 
4503 	pr = req->td->td_ucred->cr_prison;
4504 
4505 	switch (oidp->oid_kind & CTLTYPE) {
4506 	case CTLTYPE_INT:
4507 		i = *(int *)((char *)pr + arg2);
4508 		return (SYSCTL_OUT(req, &i, sizeof(i)));
4509 	}
4510 
4511 	return (0);
4512 }
4513 
4514 SYSCTL_PROC(_security_jail_children, OID_AUTO, max,
4515     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4516     NULL, offsetof(struct prison, pr_childmax), sysctl_jail_children,
4517     "I", "Maximum number of child jails");
4518 SYSCTL_PROC(_security_jail_children, OID_AUTO, cur,
4519     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4520     NULL, offsetof(struct prison, pr_childcount), sysctl_jail_children,
4521     "I", "Current number of child jails");
4522 
4523 /*
4524  * Nodes to describe jail parameters.  Maximum length of string parameters
4525  * is returned in the string itself, and the other parameters exist merely
4526  * to make themselves and their types known.
4527  */
4528 SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
4529     "Jail parameters");
4530 
4531 int
4532 sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4533 {
4534 	int i;
4535 	long l;
4536 	size_t s;
4537 	char numbuf[12];
4538 
4539 	switch (oidp->oid_kind & CTLTYPE)
4540 	{
4541 	case CTLTYPE_LONG:
4542 	case CTLTYPE_ULONG:
4543 		l = 0;
4544 #ifdef SCTL_MASK32
4545 		if (!(req->flags & SCTL_MASK32))
4546 #endif
4547 			return (SYSCTL_OUT(req, &l, sizeof(l)));
4548 	case CTLTYPE_INT:
4549 	case CTLTYPE_UINT:
4550 		i = 0;
4551 		return (SYSCTL_OUT(req, &i, sizeof(i)));
4552 	case CTLTYPE_STRING:
4553 		snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
4554 		return
4555 		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4556 	case CTLTYPE_STRUCT:
4557 		s = (size_t)arg2;
4558 		return (SYSCTL_OUT(req, &s, sizeof(s)));
4559 	}
4560 	return (0);
4561 }
4562 
4563 /*
4564  * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
4565  * jail creation time but cannot be changed in an existing jail.
4566  */
4567 SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
4568 SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
4569 SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
4570 SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
4571 SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
4572     "I", "Jail secure level");
4573 SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
4574     "Jail value for kern.osreldate and uname -K");
4575 SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
4576     "Jail value for kern.osrelease and uname -r");
4577 SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
4578     "I", "Jail cannot see all mounted file systems");
4579 SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
4580     "I", "Ruleset for in-jail devfs mounts");
4581 SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
4582     "B", "Jail persistence");
4583 #ifdef VIMAGE
4584 SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
4585     "E,jailsys", "Virtual network stack");
4586 #endif
4587 SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
4588     "B", "Jail is in the process of shutting down");
4589 
4590 SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
4591 SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
4592     "I", "Current number of child jails");
4593 SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
4594     "I", "Maximum number of child jails");
4595 
4596 SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
4597 SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
4598     "Jail hostname");
4599 SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
4600     "Jail NIS domainname");
4601 SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
4602     "Jail host UUID");
4603 SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
4604     "LU", "Jail host ID");
4605 
4606 SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
4607 SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
4608 
4609 #ifdef INET
4610 SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
4611     "Jail IPv4 address virtualization");
4612 SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
4613     "S,in_addr,a", "Jail IPv4 addresses");
4614 SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
4615     "B", "Do (not) use IPv4 source address selection rather than the "
4616     "primary jail IPv4 address.");
4617 #endif
4618 #ifdef INET6
4619 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
4620     "Jail IPv6 address virtualization");
4621 SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
4622     "S,in6_addr,a", "Jail IPv6 addresses");
4623 SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
4624     "B", "Do (not) use IPv6 source address selection rather than the "
4625     "primary jail IPv6 address.");
4626 #endif
4627 
4628 SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
4629 SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
4630     "B", "Jail may set hostname");
4631 SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
4632     "B", "Jail may use SYSV IPC");
4633 SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
4634     "B", "Jail may create raw sockets");
4635 SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
4636     "B", "Jail may alter system file flags");
4637 SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
4638     "B", "Jail may set file quotas");
4639 SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
4640     "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
4641 SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW,
4642     "B", "Jail may lock (unlock) physical pages in memory");
4643 SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW,
4644     "B", "Jail may bind sockets to reserved ports");
4645 SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
4646     "B", "Jail may read the kernel message buffer");
4647 SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW,
4648     "B", "Unprivileged processes may use process debugging facilities");
4649 SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW,
4650     "B", "Processes in jail with uid 0 have privilege");
4651 #ifdef VIMAGE
4652 SYSCTL_JAIL_PARAM(_allow, nfsd, CTLTYPE_INT | CTLFLAG_RW,
4653     "B", "Mountd/nfsd may run in the jail");
4654 #endif
4655 SYSCTL_JAIL_PARAM(_allow, extattr, CTLTYPE_INT | CTLFLAG_RW,
4656     "B", "Jail may set system-level filesystem extended attributes");
4657 SYSCTL_JAIL_PARAM(_allow, adjtime, CTLTYPE_INT | CTLFLAG_RW,
4658     "B", "Jail may adjust system time");
4659 SYSCTL_JAIL_PARAM(_allow, settime, CTLTYPE_INT | CTLFLAG_RW,
4660     "B", "Jail may set system time");
4661 
4662 SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
4663 SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
4664     "B", "Jail may mount/unmount jail-friendly file systems in general");
4665 
4666 /*
4667  * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>.  Return
4668  * its associated bit in the pr_allow bitmask, or zero if the parameter was
4669  * not created.
4670  */
4671 unsigned
4672 prison_add_allow(const char *prefix, const char *name, const char *prefix_descr,
4673     const char *descr)
4674 {
4675 	struct bool_flags *bf;
4676 	struct sysctl_oid *parent;
4677 	char *allow_name, *allow_noname, *allowed;
4678 #ifndef NO_SYSCTL_DESCR
4679 	char *descr_deprecated;
4680 #endif
4681 	u_int allow_flag;
4682 
4683 	if (prefix
4684 	    ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name)
4685 		< 0 ||
4686 	      asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name)
4687 		< 0
4688 	    : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 ||
4689 	      asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) {
4690 		free(allow_name, M_PRISON);
4691 		return 0;
4692 	}
4693 
4694 	/*
4695 	 * See if this parameter has already beed added, i.e. a module was
4696 	 * previously loaded/unloaded.
4697 	 */
4698 	mtx_lock(&prison0.pr_mtx);
4699 	for (bf = pr_flag_allow;
4700 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
4701 		atomic_load_int(&bf->flag) != 0;
4702 	     bf++) {
4703 		if (strcmp(bf->name, allow_name) == 0) {
4704 			allow_flag = bf->flag;
4705 			goto no_add;
4706 		}
4707 	}
4708 
4709 	/*
4710 	 * Find a free bit in pr_allow_all, failing if there are none
4711 	 * (which shouldn't happen as long as we keep track of how many
4712 	 * potential dynamic flags exist).
4713 	 */
4714 	for (allow_flag = 1;; allow_flag <<= 1) {
4715 		if (allow_flag == 0)
4716 			goto no_add;
4717 		if ((pr_allow_all & allow_flag) == 0)
4718 			break;
4719 	}
4720 
4721 	/* Note the parameter in the next open slot in pr_flag_allow. */
4722 	for (bf = pr_flag_allow; ; bf++) {
4723 		if (bf == pr_flag_allow + nitems(pr_flag_allow)) {
4724 			/* This should never happen, but is not fatal. */
4725 			allow_flag = 0;
4726 			goto no_add;
4727 		}
4728 		if (atomic_load_int(&bf->flag) == 0)
4729 			break;
4730 	}
4731 	bf->name = allow_name;
4732 	bf->noname = allow_noname;
4733 	pr_allow_all |= allow_flag;
4734 	/*
4735 	 * prison0 always has permission for the new parameter.
4736 	 * Other jails must have it granted to them.
4737 	 */
4738 	prison0.pr_allow |= allow_flag;
4739 	/* The flag indicates a valid entry, so make sure it is set last. */
4740 	atomic_store_rel_int(&bf->flag, allow_flag);
4741 	mtx_unlock(&prison0.pr_mtx);
4742 
4743 	/*
4744 	 * Create sysctls for the parameter, and the back-compat global
4745 	 * permission.
4746 	 */
4747 	parent = prefix
4748 	    ? SYSCTL_ADD_NODE(NULL,
4749 		  SYSCTL_CHILDREN(&sysctl___security_jail_param_allow),
4750 		  OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr)
4751 	    : &sysctl___security_jail_param_allow;
4752 	(void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
4753 	    name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4754 	    NULL, 0, sysctl_jail_param, "B", descr);
4755 	if ((prefix
4756 	     ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name)
4757 	     : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) {
4758 #ifndef NO_SYSCTL_DESCR
4759 		(void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)",
4760 		    descr);
4761 #endif
4762 		(void)SYSCTL_ADD_PROC(NULL,
4763 		    SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed,
4764 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag,
4765 		    sysctl_jail_default_allow, "I", descr_deprecated);
4766 #ifndef NO_SYSCTL_DESCR
4767 		free(descr_deprecated, M_TEMP);
4768 #endif
4769 		free(allowed, M_TEMP);
4770 	}
4771 	return allow_flag;
4772 
4773  no_add:
4774 	mtx_unlock(&prison0.pr_mtx);
4775 	free(allow_name, M_PRISON);
4776 	free(allow_noname, M_PRISON);
4777 	return allow_flag;
4778 }
4779 
4780 /*
4781  * The VFS system will register jail-aware filesystems here.  They each get
4782  * a parameter allow.mount.xxxfs and a flag to check when a jailed user
4783  * attempts to mount.
4784  */
4785 void
4786 prison_add_vfs(struct vfsconf *vfsp)
4787 {
4788 #ifdef NO_SYSCTL_DESCR
4789 
4790 	vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
4791 	    NULL, NULL);
4792 #else
4793 	char *descr;
4794 
4795 	(void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system",
4796 	    vfsp->vfc_name);
4797 	vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
4798 	    NULL, descr);
4799 	free(descr, M_TEMP);
4800 #endif
4801 }
4802 
4803 #ifdef RACCT
4804 void
4805 prison_racct_foreach(void (*callback)(struct racct *racct,
4806     void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
4807     void *arg2, void *arg3)
4808 {
4809 	struct prison_racct *prr;
4810 
4811 	ASSERT_RACCT_ENABLED();
4812 
4813 	sx_slock(&allprison_lock);
4814 	if (pre != NULL)
4815 		(pre)();
4816 	LIST_FOREACH(prr, &allprison_racct, prr_next)
4817 		(callback)(prr->prr_racct, arg2, arg3);
4818 	if (post != NULL)
4819 		(post)();
4820 	sx_sunlock(&allprison_lock);
4821 }
4822 
4823 static struct prison_racct *
4824 prison_racct_find_locked(const char *name)
4825 {
4826 	struct prison_racct *prr;
4827 
4828 	ASSERT_RACCT_ENABLED();
4829 	sx_assert(&allprison_lock, SA_XLOCKED);
4830 
4831 	if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
4832 		return (NULL);
4833 
4834 	LIST_FOREACH(prr, &allprison_racct, prr_next) {
4835 		if (strcmp(name, prr->prr_name) != 0)
4836 			continue;
4837 
4838 		/* Found prison_racct with a matching name? */
4839 		prison_racct_hold(prr);
4840 		return (prr);
4841 	}
4842 
4843 	/* Add new prison_racct. */
4844 	prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
4845 	racct_create(&prr->prr_racct);
4846 
4847 	strcpy(prr->prr_name, name);
4848 	refcount_init(&prr->prr_refcount, 1);
4849 	LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
4850 
4851 	return (prr);
4852 }
4853 
4854 struct prison_racct *
4855 prison_racct_find(const char *name)
4856 {
4857 	struct prison_racct *prr;
4858 
4859 	ASSERT_RACCT_ENABLED();
4860 
4861 	sx_xlock(&allprison_lock);
4862 	prr = prison_racct_find_locked(name);
4863 	sx_xunlock(&allprison_lock);
4864 	return (prr);
4865 }
4866 
4867 void
4868 prison_racct_hold(struct prison_racct *prr)
4869 {
4870 
4871 	ASSERT_RACCT_ENABLED();
4872 
4873 	refcount_acquire(&prr->prr_refcount);
4874 }
4875 
4876 static void
4877 prison_racct_free_locked(struct prison_racct *prr)
4878 {
4879 
4880 	ASSERT_RACCT_ENABLED();
4881 	sx_assert(&allprison_lock, SA_XLOCKED);
4882 
4883 	if (refcount_release(&prr->prr_refcount)) {
4884 		racct_destroy(&prr->prr_racct);
4885 		LIST_REMOVE(prr, prr_next);
4886 		free(prr, M_PRISON_RACCT);
4887 	}
4888 }
4889 
4890 void
4891 prison_racct_free(struct prison_racct *prr)
4892 {
4893 
4894 	ASSERT_RACCT_ENABLED();
4895 	sx_assert(&allprison_lock, SA_UNLOCKED);
4896 
4897 	if (refcount_release_if_not_last(&prr->prr_refcount))
4898 		return;
4899 
4900 	sx_xlock(&allprison_lock);
4901 	prison_racct_free_locked(prr);
4902 	sx_xunlock(&allprison_lock);
4903 }
4904 
4905 static void
4906 prison_racct_attach(struct prison *pr)
4907 {
4908 	struct prison_racct *prr;
4909 
4910 	ASSERT_RACCT_ENABLED();
4911 	sx_assert(&allprison_lock, SA_XLOCKED);
4912 
4913 	prr = prison_racct_find_locked(pr->pr_name);
4914 	KASSERT(prr != NULL, ("cannot find prison_racct"));
4915 
4916 	pr->pr_prison_racct = prr;
4917 }
4918 
4919 /*
4920  * Handle jail renaming.  From the racct point of view, renaming means
4921  * moving from one prison_racct to another.
4922  */
4923 static void
4924 prison_racct_modify(struct prison *pr)
4925 {
4926 #ifdef RCTL
4927 	struct proc *p;
4928 	struct ucred *cred;
4929 #endif
4930 	struct prison_racct *oldprr;
4931 
4932 	ASSERT_RACCT_ENABLED();
4933 
4934 	sx_slock(&allproc_lock);
4935 	sx_xlock(&allprison_lock);
4936 
4937 	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
4938 		sx_xunlock(&allprison_lock);
4939 		sx_sunlock(&allproc_lock);
4940 		return;
4941 	}
4942 
4943 	oldprr = pr->pr_prison_racct;
4944 	pr->pr_prison_racct = NULL;
4945 
4946 	prison_racct_attach(pr);
4947 
4948 	/*
4949 	 * Move resource utilisation records.
4950 	 */
4951 	racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
4952 
4953 #ifdef RCTL
4954 	/*
4955 	 * Force rctl to reattach rules to processes.
4956 	 */
4957 	FOREACH_PROC_IN_SYSTEM(p) {
4958 		PROC_LOCK(p);
4959 		cred = crhold(p->p_ucred);
4960 		PROC_UNLOCK(p);
4961 		rctl_proc_ucred_changed(p, cred);
4962 		crfree(cred);
4963 	}
4964 #endif
4965 
4966 	sx_sunlock(&allproc_lock);
4967 	prison_racct_free_locked(oldprr);
4968 	sx_xunlock(&allprison_lock);
4969 }
4970 
4971 static void
4972 prison_racct_detach(struct prison *pr)
4973 {
4974 
4975 	ASSERT_RACCT_ENABLED();
4976 	sx_assert(&allprison_lock, SA_UNLOCKED);
4977 
4978 	if (pr->pr_prison_racct == NULL)
4979 		return;
4980 	prison_racct_free(pr->pr_prison_racct);
4981 	pr->pr_prison_racct = NULL;
4982 }
4983 #endif /* RACCT */
4984 
4985 #ifdef DDB
4986 
4987 static void
4988 db_show_prison(struct prison *pr)
4989 {
4990 	struct bool_flags *bf;
4991 	struct jailsys_flags *jsf;
4992 #if defined(INET) || defined(INET6)
4993 	int ii;
4994 	struct prison_ip *pip;
4995 #endif
4996 	unsigned f;
4997 #ifdef INET
4998 	char ip4buf[INET_ADDRSTRLEN];
4999 #endif
5000 #ifdef INET6
5001 	char ip6buf[INET6_ADDRSTRLEN];
5002 #endif
5003 
5004 	db_printf("prison %p:\n", pr);
5005 	db_printf(" jid             = %d\n", pr->pr_id);
5006 	db_printf(" name            = %s\n", pr->pr_name);
5007 	db_printf(" parent          = %p\n", pr->pr_parent);
5008 	db_printf(" ref             = %d\n", pr->pr_ref);
5009 	db_printf(" uref            = %d\n", pr->pr_uref);
5010 	db_printf(" state           = %s\n",
5011 	    pr->pr_state == PRISON_STATE_ALIVE ? "alive" :
5012 	    pr->pr_state == PRISON_STATE_DYING ? "dying" :
5013 	    "invalid");
5014 	db_printf(" path            = %s\n", pr->pr_path);
5015 	db_printf(" cpuset          = %d\n", pr->pr_cpuset
5016 	    ? pr->pr_cpuset->cs_id : -1);
5017 #ifdef VIMAGE
5018 	db_printf(" vnet            = %p\n", pr->pr_vnet);
5019 #endif
5020 	db_printf(" root            = %p\n", pr->pr_root);
5021 	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
5022 	db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
5023 	db_printf(" children.max    = %d\n", pr->pr_childmax);
5024 	db_printf(" children.cur    = %d\n", pr->pr_childcount);
5025 	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
5026 	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
5027 	db_printf(" flags           = 0x%x", pr->pr_flags);
5028 	for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++)
5029 		if (pr->pr_flags & bf->flag)
5030 			db_printf(" %s", bf->name);
5031 	for (jsf = pr_flag_jailsys;
5032 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
5033 	     jsf++) {
5034 		f = pr->pr_flags & (jsf->disable | jsf->new);
5035 		db_printf(" %-16s= %s\n", jsf->name,
5036 		    (f != 0 && f == jsf->disable) ? "disable"
5037 		    : (f == jsf->new) ? "new"
5038 		    : "inherit");
5039 	}
5040 	db_printf(" allow           = 0x%x", pr->pr_allow);
5041 	for (bf = pr_flag_allow;
5042 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
5043 		atomic_load_int(&bf->flag) != 0;
5044 	     bf++)
5045 		if (pr->pr_allow & bf->flag)
5046 			db_printf(" %s", bf->name);
5047 	db_printf("\n");
5048 	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
5049 	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
5050 	db_printf(" host.domainname = %s\n", pr->pr_domainname);
5051 	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
5052 	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
5053 #ifdef INET
5054 	if ((pip = pr->pr_addrs[PR_INET]) != NULL) {
5055 		db_printf(" ip4s            = %d\n", pip->ips);
5056 		for (ii = 0; ii < pip->ips; ii++)
5057 			db_printf(" %s %s\n",
5058 			    ii == 0 ? "ip4.addr        =" : "                 ",
5059 			    inet_ntoa_r(
5060 			    *(const struct in_addr *)PR_IP(pip, PR_INET, ii),
5061 			    ip4buf));
5062 	}
5063 #endif
5064 #ifdef INET6
5065 	if ((pip = pr->pr_addrs[PR_INET6]) != NULL) {
5066 		db_printf(" ip6s            = %d\n", pip->ips);
5067 		for (ii = 0; ii < pip->ips; ii++)
5068 			db_printf(" %s %s\n",
5069 			    ii == 0 ? "ip6.addr        =" : "                 ",
5070 			    ip6_sprintf(ip6buf,
5071 			    (const struct in6_addr *)PR_IP(pip, PR_INET6, ii)));
5072 	}
5073 #endif
5074 }
5075 
5076 DB_SHOW_COMMAND(prison, db_show_prison_command)
5077 {
5078 	struct prison *pr;
5079 
5080 	if (!have_addr) {
5081 		/*
5082 		 * Show all prisons in the list, and prison0 which is not
5083 		 * listed.
5084 		 */
5085 		db_show_prison(&prison0);
5086 		if (!db_pager_quit) {
5087 			TAILQ_FOREACH(pr, &allprison, pr_list) {
5088 				db_show_prison(pr);
5089 				if (db_pager_quit)
5090 					break;
5091 			}
5092 		}
5093 		return;
5094 	}
5095 
5096 	if (addr == 0)
5097 		pr = &prison0;
5098 	else {
5099 		/* Look for a prison with the ID and with references. */
5100 		TAILQ_FOREACH(pr, &allprison, pr_list)
5101 			if (pr->pr_id == addr && pr->pr_ref > 0)
5102 				break;
5103 		if (pr == NULL)
5104 			/* Look again, without requiring a reference. */
5105 			TAILQ_FOREACH(pr, &allprison, pr_list)
5106 				if (pr->pr_id == addr)
5107 					break;
5108 		if (pr == NULL)
5109 			/* Assume address points to a valid prison. */
5110 			pr = (struct prison *)addr;
5111 	}
5112 	db_show_prison(pr);
5113 }
5114 
5115 #endif /* DDB */
5116