xref: /freebsd/sys/kern/kern_jail.c (revision 357378bbdedf24ce2b90e9bd831af4a9db3ec70a)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 1999 Poul-Henning Kamp.
5  * Copyright (c) 2008 Bjoern A. Zeeb.
6  * Copyright (c) 2009 James Gritton.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 #include "opt_ddb.h"
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 #include "opt_nfs.h"
36 
37 #include <sys/param.h>
38 #include <sys/types.h>
39 #include <sys/kernel.h>
40 #include <sys/systm.h>
41 #include <sys/errno.h>
42 #include <sys/sysproto.h>
43 #include <sys/malloc.h>
44 #include <sys/osd.h>
45 #include <sys/priv.h>
46 #include <sys/proc.h>
47 #include <sys/epoch.h>
48 #include <sys/taskqueue.h>
49 #include <sys/fcntl.h>
50 #include <sys/jail.h>
51 #include <sys/linker.h>
52 #include <sys/lock.h>
53 #include <sys/mman.h>
54 #include <sys/mutex.h>
55 #include <sys/racct.h>
56 #include <sys/rctl.h>
57 #include <sys/refcount.h>
58 #include <sys/sx.h>
59 #include <sys/sysent.h>
60 #include <sys/namei.h>
61 #include <sys/mount.h>
62 #include <sys/queue.h>
63 #include <sys/socket.h>
64 #include <sys/syscallsubr.h>
65 #include <sys/sysctl.h>
66 #include <sys/uuid.h>
67 #include <sys/vnode.h>
68 
69 #include <net/if.h>
70 #include <net/vnet.h>
71 
72 #include <netinet/in.h>
73 
74 #ifdef DDB
75 #include <ddb/ddb.h>
76 #endif /* DDB */
77 
78 #include <security/mac/mac_framework.h>
79 
/* Preload type that prison0_init() searches for to obtain the host UUID. */
#define	PRISON0_HOSTUUID_MODULE	"hostuuid"

/* Kernel malloc types used for prison and prison racct structures. */
MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
84 
/*
 * The PR_IP*_SADDRSEL flags for the address families compiled into this
 * kernel.  Keeps struct prison prison0 and some code in kern_jail_set()
 * readable.  The expansion is parenthesized so that the macro remains a
 * single operand when it is used next to operators that bind tighter
 * than '|' (such as '&').
 */
#if defined(INET) && defined(INET6)
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL | PR_IP6_SADDRSEL)
#elif defined(INET)
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL)
#elif defined(INET6)
#define	_PR_IP_SADDRSEL	(PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	0
#endif
99 
/*
 * prison0 describes what is "real" about the system.  Only members with
 * compile-time constant values are set here; prison0_init() fills in
 * pr_cpuset, pr_osreldate and pr_osrelease at boot and may replace
 * pr_hostuuid with a preloaded value.
 */
struct prison prison0 = {
	.pr_id		= 0,
	.pr_name	= "0",
	.pr_ref		= 1,
	.pr_uref	= 1,
	.pr_path	= "/",
	.pr_securelevel	= -1,
	.pr_devfs_rsnum = 0,
	.pr_state	= PRISON_STATE_ALIVE,
	.pr_childmax	= JAIL_MAX,
	.pr_hostuuid	= DEFAULT_HOSTUUID,
	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
#ifdef VIMAGE
	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
#else
	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
#endif
	.pr_allow	= PR_ALLOW_ALL_STATIC,
};
/* Set up prison0.pr_mtx, named "jail mutex", through SYSINIT. */
MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
121 
/*
 * Ties one flag bit to the pair of parameter names that express it.
 * kern_jail() emits `name' when the bit is set in the default mask and
 * `noname' otherwise.
 */
struct bool_flags {
	const char	*name;		/* parameter name for "flag on" */
	const char	*noname;	/* parameter name for "flag off" */
	/*
	 * The flag bit.  kern_jail() reads it with atomic_load_int() and
	 * stops at the first pr_flag_allow entry whose flag is zero.
	 */
	volatile u_int	 flag;
};
/*
 * Ties a jailsys parameter name to the flag bits selected by its value;
 * see the JAIL_SYS_* switch in kern_jail_set().
 */
struct jailsys_flags {
	const char	*name;		/* parameter name */
	/* Bits set for JAIL_SYS_DISABLE; 0 makes that value EINVAL. */
	unsigned	 disable;
	unsigned	 new;		/* bits set for JAIL_SYS_NEW */
};
132 
/* allprison, allprison_racct and lastprid are protected by allprison_lock. */
struct	sx allprison_lock;
SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
LIST_HEAD(, prison_racct) allprison_racct;
int	lastprid = 0;
/* NOTE(review): presumably protected by allprison_lock as well -- confirm. */
int	lastdeadid = 0;
140 
/* Prototypes for file-local helpers that are used before they are defined. */
static int get_next_prid(struct prison **insprp);
static int get_next_deadid(struct prison **insprp);
static int do_jail_attach(struct thread *td, struct prison *pr, int drflags);
static void prison_complete(void *context, int pending);
static void prison_deref(struct prison *pr, int flags);
static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison);
static int prison_lock_xlock(struct prison *pr, int flags);
static void prison_cleanup(struct prison *pr);
static void prison_free_not_last(struct prison *pr);
static void prison_proc_free_not_last(struct prison *pr);
static void prison_proc_relink(struct prison *opr, struct prison *npr,
    struct proc *p);
static void prison_set_allow_locked(struct prison *pr, unsigned flag,
    int enable);
static char *prison_path(struct prison *pr1, struct prison *pr2);
/* The racct helpers are only declared when RACCT is configured. */
#ifdef RACCT
static void prison_racct_attach(struct prison *pr);
static void prison_racct_modify(struct prison *pr);
static void prison_racct_detach(struct prison *pr);
#endif
161 
/*
 * Flags for prison_deref.  The low bits name the operations to perform,
 * the 0x10..0x40 bits tell which locks the caller already holds.
 */
#define	PD_DEREF	0x01	/* Decrement pr_ref */
#define	PD_DEUREF	0x02	/* Decrement pr_uref */
#define	PD_KILL		0x04	/* Remove jail, kill processes, etc */
#define	PD_LOCKED	0x10	/* pr_mtx is held */
#define	PD_LIST_SLOCKED	0x20	/* allprison_lock is held shared */
#define	PD_LIST_XLOCKED	0x40	/* allprison_lock is held exclusive */
/* Operation flags */
#define	PD_OP_FLAGS	(PD_DEREF | PD_DEUREF | PD_KILL)
/* Lock status flags */
#define	PD_LOCK_FLAGS	(PD_LOCKED | PD_LIST_SLOCKED | PD_LIST_XLOCKED)
171 
/*
 * Parameter names corresponding to PR_* flag values.  Size values are for kvm
 * as we cannot figure out the size of a sparse array, or an array without a
 * terminating entry.
 *
 * The per-family entries exist only when that address family is compiled
 * in.
 */
static struct bool_flags pr_flag_bool[] = {
	{"persist", "nopersist", PR_PERSIST},
#ifdef INET
	{"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL},
#endif
#ifdef INET6
	{"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL},
#endif
};
/* Size of the whole table in bytes (sizeof), not its number of entries. */
const size_t pr_flag_bool_size = sizeof(pr_flag_bool);
187 
/*
 * Jailsys parameters and the PR_* bits behind them.  "host" and "vnet"
 * have a zero disable mask, so kern_jail_set() rejects JAIL_SYS_DISABLE
 * for them with EINVAL.
 */
static struct jailsys_flags pr_flag_jailsys[] = {
	{"host", 0, PR_HOST},
#ifdef VIMAGE
	{"vnet", 0, PR_VNET},
#endif
#ifdef INET
	{"ip4", PR_IP4_USER, PR_IP4_USER},
#endif
#ifdef INET6
	{"ip6", PR_IP6_USER, PR_IP6_USER},
#endif
};
/* Size of the whole table in bytes (sizeof), not its number of entries. */
const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
201 
/*
 * Make this array full-size so dynamic parameters can be added.
 * It is protected by prison0.pr_mtx, but lockless reading is allowed
 * with an atomic check of the flag values.
 *
 * Slots past the static entries below are zero-initialized; readers such
 * as kern_jail() treat the first entry with a zero flag as the end.
 */
static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
	{"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME},
	{"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC},
	{"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS},
	{"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS},
	{"allow.mount", "allow.nomount", PR_ALLOW_MOUNT},
	{"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS},
	{"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF},
	{"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK},
	{"allow.reserved_ports", "allow.noreserved_ports",
	 PR_ALLOW_RESERVED_PORTS},
	{"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF},
	{"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug",
	 PR_ALLOW_UNPRIV_DEBUG},
	{"allow.suser", "allow.nosuser", PR_ALLOW_SUSER},
#ifdef VIMAGE
	{"allow.nfsd", "allow.nonfsd", PR_ALLOW_NFSD},
#endif
	{"allow.extattr", "allow.noextattr", PR_ALLOW_EXTATTR},
	{"allow.adjtime", "allow.noadjtime", PR_ALLOW_ADJTIME},
	{"allow.settime", "allow.nosettime", PR_ALLOW_SETTIME},
};
/*
 * Starts out as the mask of the static allow bits.
 * NOTE(review): presumably extended when dynamic parameters are added --
 * confirm against the code that registers them.
 */
static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC;
/* Size of the whole table in bytes (sizeof), not its number of entries. */
const size_t pr_flag_allow_size = sizeof(pr_flag_allow);
231 
/*
 * Defaults that kern_jail() applies to jails created by a non-jailed
 * caller: the allow.* bits and the enforce_statfs value.
 */
#define	JAIL_DEFAULT_ALLOW		(PR_ALLOW_SET_HOSTNAME | \
					 PR_ALLOW_RESERVED_PORTS | \
					 PR_ALLOW_UNPRIV_DEBUG | \
					 PR_ALLOW_SUSER)
#define	JAIL_DEFAULT_ENFORCE_STATFS	2
#define	JAIL_DEFAULT_DEVFS_RSNUM	0
static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
#if defined(INET) || defined(INET6)
/* Upper bound on the per-family address count accepted by kern_jail(). */
static unsigned jail_max_af_ips = 255;
#endif
244 
245 /*
246  * Initialize the parts of prison0 that can't be static-initialized with
247  * constants.  This is called from proc0_init() after creating thread0 cpuset.
248  */
249 void
250 prison0_init(void)
251 {
252 	uint8_t *file, *data;
253 	size_t size;
254 	char buf[sizeof(prison0.pr_hostuuid)];
255 	bool valid;
256 
257 	prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
258 	prison0.pr_osreldate = osreldate;
259 	strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
260 
261 	/* If we have a preloaded hostuuid, use it. */
262 	file = preload_search_by_type(PRISON0_HOSTUUID_MODULE);
263 	if (file != NULL) {
264 		data = preload_fetch_addr(file);
265 		size = preload_fetch_size(file);
266 		if (data != NULL) {
267 			/*
268 			 * The preloaded data may include trailing whitespace, almost
269 			 * certainly a newline; skip over any whitespace or
270 			 * non-printable characters to be safe.
271 			 */
272 			while (size > 0 && data[size - 1] <= 0x20) {
273 				size--;
274 			}
275 
276 			valid = false;
277 
278 			/*
279 			 * Not NUL-terminated when passed from loader, but
280 			 * validate_uuid requires that due to using sscanf (as
281 			 * does the subsequent strlcpy, since it still reads
282 			 * past the given size to return the true length);
283 			 * bounce to a temporary buffer to fix.
284 			 */
285 			if (size >= sizeof(buf))
286 				goto done;
287 
288 			memcpy(buf, data, size);
289 			buf[size] = '\0';
290 
291 			if (validate_uuid(buf, size, NULL, 0) != 0)
292 				goto done;
293 
294 			valid = true;
295 			(void)strlcpy(prison0.pr_hostuuid, buf,
296 			    sizeof(prison0.pr_hostuuid));
297 
298 done:
299 			if (bootverbose && !valid) {
300 				printf("hostuuid: preload data malformed: '%.*s'\n",
301 				    (int)size, data);
302 			}
303 		}
304 	}
305 	if (bootverbose)
306 		printf("hostuuid: using %s\n", prison0.pr_hostuuid);
307 }
308 
309 /*
310  * struct jail_args {
311  *	struct jail *jail;
312  * };
313  */
314 int
315 sys_jail(struct thread *td, struct jail_args *uap)
316 {
317 	uint32_t version;
318 	int error;
319 	struct jail j;
320 
321 	error = copyin(uap->jail, &version, sizeof(uint32_t));
322 	if (error)
323 		return (error);
324 
325 	switch (version) {
326 	case 0:
327 	{
328 		struct jail_v0 j0;
329 
330 		/* FreeBSD single IPv4 jails. */
331 		bzero(&j, sizeof(struct jail));
332 		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
333 		if (error)
334 			return (error);
335 		j.version = j0.version;
336 		j.path = j0.path;
337 		j.hostname = j0.hostname;
338 		j.ip4s = htonl(j0.ip_number);	/* jail_v0 is host order */
339 		break;
340 	}
341 
342 	case 1:
343 		/*
344 		 * Version 1 was used by multi-IPv4 jail implementations
345 		 * that never made it into the official kernel.
346 		 */
347 		return (EINVAL);
348 
349 	case 2:	/* JAIL_API_VERSION */
350 		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
351 		error = copyin(uap->jail, &j, sizeof(struct jail));
352 		if (error)
353 			return (error);
354 		break;
355 
356 	default:
357 		/* Sci-Fi jails are not supported, sorry. */
358 		return (EINVAL);
359 	}
360 	return (kern_jail(td, &j));
361 }
362 
363 int
364 kern_jail(struct thread *td, struct jail *j)
365 {
366 	struct iovec optiov[2 * (4 + nitems(pr_flag_allow)
367 #ifdef INET
368 			    + 1
369 #endif
370 #ifdef INET6
371 			    + 1
372 #endif
373 			    )];
374 	struct uio opt;
375 	char *u_path, *u_hostname, *u_name;
376 	struct bool_flags *bf;
377 #ifdef INET
378 	uint32_t ip4s;
379 	struct in_addr *u_ip4;
380 #endif
381 #ifdef INET6
382 	struct in6_addr *u_ip6;
383 #endif
384 	size_t tmplen;
385 	int error, enforce_statfs;
386 
387 	bzero(&optiov, sizeof(optiov));
388 	opt.uio_iov = optiov;
389 	opt.uio_iovcnt = 0;
390 	opt.uio_offset = -1;
391 	opt.uio_resid = -1;
392 	opt.uio_segflg = UIO_SYSSPACE;
393 	opt.uio_rw = UIO_READ;
394 	opt.uio_td = td;
395 
396 	/* Set permissions for top-level jails from sysctls. */
397 	if (!jailed(td->td_ucred)) {
398 		for (bf = pr_flag_allow;
399 		     bf < pr_flag_allow + nitems(pr_flag_allow) &&
400 			atomic_load_int(&bf->flag) != 0;
401 		     bf++) {
402 			optiov[opt.uio_iovcnt].iov_base = __DECONST(char *,
403 			    (jail_default_allow & bf->flag)
404 			    ? bf->name : bf->noname);
405 			optiov[opt.uio_iovcnt].iov_len =
406 			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
407 			opt.uio_iovcnt += 2;
408 		}
409 		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
410 		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
411 		opt.uio_iovcnt++;
412 		enforce_statfs = jail_default_enforce_statfs;
413 		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
414 		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
415 		opt.uio_iovcnt++;
416 	}
417 
418 	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
419 #ifdef INET
420 	ip4s = (j->version == 0) ? 1 : j->ip4s;
421 	if (ip4s > jail_max_af_ips)
422 		return (EINVAL);
423 	tmplen += ip4s * sizeof(struct in_addr);
424 #else
425 	if (j->ip4s > 0)
426 		return (EINVAL);
427 #endif
428 #ifdef INET6
429 	if (j->ip6s > jail_max_af_ips)
430 		return (EINVAL);
431 	tmplen += j->ip6s * sizeof(struct in6_addr);
432 #else
433 	if (j->ip6s > 0)
434 		return (EINVAL);
435 #endif
436 	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
437 	u_hostname = u_path + MAXPATHLEN;
438 	u_name = u_hostname + MAXHOSTNAMELEN;
439 #ifdef INET
440 	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
441 #endif
442 #ifdef INET6
443 #ifdef INET
444 	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
445 #else
446 	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
447 #endif
448 #endif
449 	optiov[opt.uio_iovcnt].iov_base = "path";
450 	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
451 	opt.uio_iovcnt++;
452 	optiov[opt.uio_iovcnt].iov_base = u_path;
453 	error = copyinstr(j->path, u_path, MAXPATHLEN,
454 	    &optiov[opt.uio_iovcnt].iov_len);
455 	if (error) {
456 		free(u_path, M_TEMP);
457 		return (error);
458 	}
459 	opt.uio_iovcnt++;
460 	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
461 	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
462 	opt.uio_iovcnt++;
463 	optiov[opt.uio_iovcnt].iov_base = u_hostname;
464 	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
465 	    &optiov[opt.uio_iovcnt].iov_len);
466 	if (error) {
467 		free(u_path, M_TEMP);
468 		return (error);
469 	}
470 	opt.uio_iovcnt++;
471 	if (j->jailname != NULL) {
472 		optiov[opt.uio_iovcnt].iov_base = "name";
473 		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
474 		opt.uio_iovcnt++;
475 		optiov[opt.uio_iovcnt].iov_base = u_name;
476 		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
477 		    &optiov[opt.uio_iovcnt].iov_len);
478 		if (error) {
479 			free(u_path, M_TEMP);
480 			return (error);
481 		}
482 		opt.uio_iovcnt++;
483 	}
484 #ifdef INET
485 	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
486 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
487 	opt.uio_iovcnt++;
488 	optiov[opt.uio_iovcnt].iov_base = u_ip4;
489 	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
490 	if (j->version == 0)
491 		u_ip4->s_addr = j->ip4s;
492 	else {
493 		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
494 		if (error) {
495 			free(u_path, M_TEMP);
496 			return (error);
497 		}
498 	}
499 	opt.uio_iovcnt++;
500 #endif
501 #ifdef INET6
502 	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
503 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
504 	opt.uio_iovcnt++;
505 	optiov[opt.uio_iovcnt].iov_base = u_ip6;
506 	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
507 	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
508 	if (error) {
509 		free(u_path, M_TEMP);
510 		return (error);
511 	}
512 	opt.uio_iovcnt++;
513 #endif
514 	KASSERT(opt.uio_iovcnt <= nitems(optiov),
515 		("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
516 	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
517 	free(u_path, M_TEMP);
518 	return (error);
519 }
520 
521 /*
522  * struct jail_set_args {
523  *	struct iovec *iovp;
524  *	unsigned int iovcnt;
525  *	int flags;
526  * };
527  */
528 int
529 sys_jail_set(struct thread *td, struct jail_set_args *uap)
530 {
531 	struct uio *auio;
532 	int error;
533 
534 	/* Check that we have an even number of iovecs. */
535 	if (uap->iovcnt & 1)
536 		return (EINVAL);
537 
538 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
539 	if (error)
540 		return (error);
541 	error = kern_jail_set(td, auio, uap->flags);
542 	freeuio(auio);
543 	return (error);
544 }
545 
546 #if defined(INET) || defined(INET6)
/* Address comparison callback; also handed to qsort() in prison_ip_copyin(). */
typedef int prison_addr_cmp_t(const void *, const void *);
/* Address sanity check; prison_ip_copyin() rejects a list when it fails. */
typedef bool prison_addr_valid_t(const void *);
/*
 * Per address family properties, indexed by pr_family_t, so that the
 * prison_ip_*() helpers below can be written once for all families.
 */
static const struct pr_family {
	size_t			size;		/* bytes per address */
	prison_addr_cmp_t	*cmp;		/* ordering/equality */
	prison_addr_valid_t	*valid;		/* validity check */
	int			ip_flag;	/* PR_IP*_USER bit of the family */
} pr_families[PR_FAMILY_MAX] = {
#ifdef INET
	[PR_INET] = {
		.size = sizeof(struct in_addr),
		.cmp = prison_qcmp_v4,
		.valid = prison_valid_v4,
		.ip_flag = PR_IP4_USER,
	 },
#endif
#ifdef INET6
	[PR_INET6] = {
		.size = sizeof(struct in6_addr),
		.cmp = prison_qcmp_v6,
		.valid = prison_valid_v6,
		.ip_flag = PR_IP6_USER,
	},
#endif
};
572 
/*
 * Network address lists (pr_addrs) allocation for jails.  The addresses
 * are accessed locklessly by the network stack, thus need to be protected by
 * the network epoch.
 *
 * Entry 0 is the primary address; prison_ip_copyin() keeps it in place and
 * sorts the remaining entries, which prison_ip_check() relies on for its
 * binary search.
 */
struct prison_ip {
	/* Used by prison_ip_free() for the deferred free; must stay first. */
	struct epoch_context ctx;
	uint32_t	ips;		/* number of addresses in pr_ip */
#ifdef FUTURE_C
	/*
	 * XXX Variable-length automatic arrays in union may be
	 * supported in future C.
	 */
	union {
		char pr_ip[];
		struct in_addr pr_ip4[];
		struct in6_addr pr_ip6[];
	};
#else /* No future C :( */
	/* Packed addresses, pr_families[af].size bytes each; see PR_IP(). */
	char pr_ip[];
#endif
};
595 
596 static char *
597 PR_IP(struct prison_ip *pip, const pr_family_t af, int idx)
598 {
599 	MPASS(pip);
600 	MPASS(af < PR_FAMILY_MAX);
601 	MPASS(idx >= 0 && idx < pip->ips);
602 
603 	return (pip->pr_ip + pr_families[af].size * idx);
604 }
605 
606 static struct prison_ip *
607 prison_ip_alloc(const pr_family_t af, uint32_t cnt, int flags)
608 {
609 	struct prison_ip *pip;
610 
611 	pip = malloc(sizeof(struct prison_ip) + cnt * pr_families[af].size,
612 	    M_PRISON, flags);
613 	if (pip != NULL)
614 		pip->ips = cnt;
615 	return (pip);
616 }
617 
618 /*
619  * Allocate and copyin user supplied address list, sorting and validating.
620  * kern_jail_set() helper.
621  */
622 static struct prison_ip *
623 prison_ip_copyin(const pr_family_t af, void *op, uint32_t cnt)
624 {
625 	prison_addr_cmp_t *const cmp = pr_families[af].cmp;
626 	const size_t size = pr_families[af].size;
627 	struct prison_ip *pip;
628 
629 	pip = prison_ip_alloc(af, cnt, M_WAITOK);
630 	bcopy(op, pip->pr_ip, cnt * size);
631 	/*
632 	 * IP addresses are all sorted but ip[0] to preserve
633 	 * the primary IP address as given from userland.
634 	 * This special IP is used for unbound outgoing
635 	 * connections as well for "loopback" traffic in case
636 	 * source address selection cannot find any more fitting
637 	 * address to connect from.
638 	 */
639 	if (cnt > 1)
640 		qsort(PR_IP(pip, af, 1), cnt - 1, size, cmp);
641 	/*
642 	 * Check for duplicate addresses and do some simple
643 	 * zero and broadcast checks. If users give other bogus
644 	 * addresses it is their problem.
645 	 */
646 	for (int i = 0; i < cnt; i++) {
647 		if (!pr_families[af].valid(PR_IP(pip, af, i))) {
648 			free(pip, M_PRISON);
649 			return (NULL);
650 		}
651 		if (i + 1 < cnt &&
652 		    (cmp(PR_IP(pip, af, 0), PR_IP(pip, af, i + 1)) == 0 ||
653 		     cmp(PR_IP(pip, af, i), PR_IP(pip, af, i + 1)) == 0)) {
654 			free(pip, M_PRISON);
655 			return (NULL);
656 		}
657 	}
658 
659 	return (pip);
660 }
661 
662 /*
663  * Allocate and dup parent prison address list.
664  * kern_jail_set() helper.
665  */
666 static void
667 prison_ip_dup(struct prison *ppr, struct prison *pr, const pr_family_t af)
668 {
669 	const struct prison_ip *ppip = ppr->pr_addrs[af];
670 	struct prison_ip *pip;
671 
672 	if (ppip != NULL) {
673 		pip = prison_ip_alloc(af, ppip->ips, M_WAITOK);
674 		bcopy(ppip->pr_ip, pip->pr_ip, pip->ips * pr_families[af].size);
675 		pr->pr_addrs[af] = pip;
676 	}
677 }
678 
679 /*
680  * Make sure the new set of IP addresses is a subset of the parent's list.
681  * Don't worry about the parent being unlocked, as any setting is done with
682  * allprison_lock held.
683  * kern_jail_set() helper.
684  */
685 static bool
686 prison_ip_parent_match(struct prison_ip *ppip, struct prison_ip *pip,
687     const pr_family_t af)
688 {
689 	prison_addr_cmp_t *const cmp = pr_families[af].cmp;
690 	int i, j;
691 
692 	if (ppip == NULL)
693 		return (false);
694 
695 	for (i = 0; i < ppip->ips; i++)
696 		if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, i)) == 0)
697 			break;
698 
699 	if (i == ppip->ips)
700 		/* Main address not present in parent. */
701 		return (false);
702 
703 	if (pip->ips > 1) {
704 		for (i = j = 1; i < pip->ips; i++) {
705 			if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0)
706 				/* Equals to parent primary address. */
707 				continue;
708 			for (; j < ppip->ips; j++)
709 				if (cmp(PR_IP(pip, af, i),
710 				    PR_IP(ppip, af, j)) == 0)
711 					break;
712 			if (j == ppip->ips)
713 				break;
714 		}
715 		if (j == ppip->ips)
716 			/* Address not present in parent. */
717 			return (false);
718 	}
719 	return (true);
720 }
721 
/*
 * Check for conflicting IP addresses.  We permit them if there is no more
 * than one IP on each jail.  If there is a duplicate on a jail with more
 * than one IP stop checking and return error.
 * kern_jail_set() helper.
 *
 * pip is the address list to check for family af; pr is skipped during
 * the walk.  Returns true when no conflict was found and false as soon as
 * one of pip's addresses is found in another jail by prison_ip_check().
 */
static bool
prison_ip_conflict_check(const struct prison *ppr, const struct prison *pr,
    struct prison_ip *pip, pr_family_t af)
{
	const struct prison *tppr, *tpr;
	int descend;

	/*
	 * Pick the root of the walk: with VIMAGE, ppr or its nearest
	 * ancestor flagged PR_VNET (prison0 if there is none); otherwise
	 * prison0.
	 */
#ifdef VIMAGE
	for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
		if (tppr->pr_flags & PR_VNET)
			break;
#else
	tppr = &prison0;
#endif
	FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
		/*
		 * Skip pr itself, jails flagged PR_VNET and jails that are
		 * not alive.
		 * NOTE(review): clearing descend presumably also keeps the
		 * iterator out of that jail's children -- confirm against
		 * the macro definition.
		 */
		if (tpr == pr ||
#ifdef VIMAGE
		    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
#endif
		    !prison_isalive(tpr)) {
			descend = 0;
			continue;
		}
		/* Without the family's PR_IP*_USER flag, move on. */
		if (!(tpr->pr_flags & pr_families[af].ip_flag))
			continue;
		descend = 0;
		/* Nothing to compare, or both sides have a single address. */
		if (tpr->pr_addrs[af] == NULL ||
		    (pip->ips == 1 && tpr->pr_addrs[af]->ips == 1))
			continue;
		for (int i = 0; i < pip->ips; i++)
			if (prison_ip_check(tpr, af, PR_IP(pip, af, i)) == 0)
				return (false);
	}

	return (true);
}
764 
765 _Static_assert(offsetof(struct prison_ip, ctx) == 0,
766     "prison must start with epoch context");
767 static void
768 prison_ip_free_deferred(epoch_context_t ctx)
769 {
770 
771 	free(ctx, M_PRISON);
772 }
773 
774 static void
775 prison_ip_free(struct prison_ip *pip)
776 {
777 
778 	if (pip != NULL)
779 		NET_EPOCH_CALL(prison_ip_free_deferred, &pip->ctx);
780 }
781 
782 static void
783 prison_ip_set(struct prison *pr, const pr_family_t af, struct prison_ip *new)
784 {
785 	struct prison_ip **mem, *old;
786 
787 	mtx_assert(&pr->pr_mtx, MA_OWNED);
788 
789 	mem = &pr->pr_addrs[af];
790 
791 	old = *mem;
792 	atomic_store_ptr(mem, new);
793 	prison_ip_free(old);
794 }
795 
/*
 * Restrict a prison's IP address list with its parent's, possibly replacing
 * it.  Return true if succeed, otherwise should redo.
 * kern_jail_set() helper.
 *
 * newp, when not NULL, may point to a list allocated by the caller; it is
 * used instead of allocating with M_NOWAIT here, and *newp is set to NULL
 * on the paths that go through prison_ip_set().  false is returned only
 * when an M_NOWAIT allocation fails.  The prison mutex must be held.
 */
static bool
prison_ip_restrict(struct prison *pr, const pr_family_t af,
    struct prison_ip **newp)
{
	struct prison_ip *ppip = pr->pr_parent->pr_addrs[af];
	struct prison_ip *pip = pr->pr_addrs[af];
	int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
	const size_t size = pr_families[af].size;
	struct prison_ip *new = newp != NULL ? *newp : NULL;
	uint32_t ips;

	mtx_assert(&pr->pr_mtx, MA_OWNED);

	/*
	 * Due to epoch-synchronized access to the IP address lists we always
	 * allocate a new list even if the old one has enough space.  We could
	 * atomically update an IPv4 address inside a list, but that would
	 * screw up sorting, and in case of IPv6 we can't even atomically write
	 * one.
	 */
	/* Parent has no list: drop ours as well. */
	if (ppip == NULL) {
		if (pip != NULL)
			prison_ip_set(pr, af, NULL);
		return (true);
	}

	if (!(pr->pr_flags & pr_families[af].ip_flag)) {
		if (new == NULL) {
			new = prison_ip_alloc(af, ppip->ips, M_NOWAIT);
			if (new == NULL)
				return (false); /* Redo */
		}
		/* This has no user settings, so just copy the parent's list. */
		MPASS(new->ips == ppip->ips);
		bcopy(ppip->pr_ip, new->pr_ip, ppip->ips * size);
		prison_ip_set(pr, af, new);
		if (newp != NULL)
			*newp = NULL; /* Used */
	} else if (pip != NULL) {
		/* Remove addresses that aren't in the parent. */
		int i;

		i = 0; /* index in pip */
		ips = 0; /* index in new */

		if (new == NULL) {
			new = prison_ip_alloc(af, pip->ips, M_NOWAIT);
			if (new == NULL)
				return (false); /* Redo */
		}

		/* Keep our primary address if the parent has it anywhere. */
		for (int pi = 0; pi < ppip->ips; pi++)
			if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, pi)) == 0) {
				/* Found our primary address in parent. */
				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
				    size);
				i++;
				ips++;
				break;
			}
		/*
		 * Merge-style walk over the remaining entries: i advances
		 * through our list, pi through the parent's sorted part.
		 */
		for (int pi = 1; i < pip->ips; ) {
			/* Check against primary, which is unsorted. */
			if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0) {
				/* Matches parent's primary address. */
				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
				    size);
				i++;
				ips++;
				continue;
			}
			/* The rest are sorted. */
			/*
			 * An exhausted parent list is treated like "ours
			 * sorts first", i.e. the address is dropped.
			 * NOTE(review): the cases assume cmp returns exactly
			 * -1, 0 or 1 -- confirm for both families.
			 */
			switch (pi >= ppip->ips ? -1 :
				cmp(PR_IP(pip, af, i), PR_IP(ppip, af, pi))) {
			case -1:
				i++;
				break;
			case 0:
				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
				    size);
				i++;
				pi++;
				ips++;
				break;
			case 1:
				pi++;
				break;
			}
		}
		if (ips == 0) {
			/* Nothing survived; only free a list allocated here. */
			if (newp == NULL || *newp == NULL)
				prison_ip_free(new);
			new = NULL;
		} else {
			/* Shrink to real size */
			KASSERT((new->ips >= ips),
			    ("Out-of-bounds write to prison_ip %p", new));
			new->ips = ips;
		}
		prison_ip_set(pr, af, new);
		if (newp != NULL)
			*newp = NULL; /* Used */
	}
	return (true);
}
905 
906 /*
907  * Fast-path check if an address belongs to a prison.
908  */
909 int
910 prison_ip_check(const struct prison *pr, const pr_family_t af,
911     const void *addr)
912 {
913 	int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
914 	struct prison_ip *pip;
915 	int i, a, z, d;
916 
917 	MPASS(mtx_owned(&pr->pr_mtx) ||
918 	    in_epoch(net_epoch_preempt) ||
919 	    sx_xlocked(&allprison_lock));
920 
921 	pip = atomic_load_ptr(&pr->pr_addrs[af]);
922 	if (__predict_false(pip == NULL))
923 		return (EAFNOSUPPORT);
924 
925 	/* Check the primary IP. */
926 	if (cmp(PR_IP(pip, af, 0), addr) == 0)
927 		return (0);
928 
929 	/*
930 	 * All the other IPs are sorted so we can do a binary search.
931 	 */
932 	a = 0;
933 	z = pip->ips - 2;
934 	while (a <= z) {
935 		i = (a + z) / 2;
936 		d = cmp(PR_IP(pip, af, i + 1), addr);
937 		if (d > 0)
938 			z = i - 1;
939 		else if (d < 0)
940 			a = i + 1;
941 		else
942 			return (0);
943 	}
944 
945 	return (EADDRNOTAVAIL);
946 }
947 
948 /*
949  * Grab primary IP.  Historically required mutex, but nothing prevents
950  * us to support epoch-protected access.  Is it used in fast path?
951  * in{6}_jail.c helper
952  */
953 const void *
954 prison_ip_get0(const struct prison *pr, const pr_family_t af)
955 {
956 	const struct prison_ip *pip = pr->pr_addrs[af];
957 
958 	mtx_assert(&pr->pr_mtx, MA_OWNED);
959 	MPASS(pip);
960 
961 	return (pip->pr_ip);
962 }
963 
964 u_int
965 prison_ip_cnt(const struct prison *pr, const pr_family_t af)
966 {
967 
968 	return (pr->pr_addrs[af]->ips);
969 }
970 #endif	/* defined(INET) || defined(INET6) */
971 
972 int
973 kern_jail_set(struct thread *td, struct uio *optuio, int flags)
974 {
975 	struct nameidata nd;
976 #ifdef INET
977 	struct prison_ip *ip4;
978 #endif
979 #ifdef INET6
980 	struct prison_ip *ip6;
981 #endif
982 	struct vfsopt *opt;
983 	struct vfsoptlist *opts;
984 	struct prison *pr, *deadpr, *dinspr, *inspr, *mypr, *ppr, *tpr;
985 	struct vnode *root;
986 	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
987 	char *g_path, *osrelstr;
988 	struct bool_flags *bf;
989 	struct jailsys_flags *jsf;
990 #if defined(INET) || defined(INET6)
991 	void *op;
992 #endif
993 	unsigned long hid;
994 	size_t namelen, onamelen, pnamelen;
995 	int created, cuflags, descend, drflags, enforce;
996 	int error, errmsg_len, errmsg_pos;
997 	int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
998 	int deadid, jid, jsys, len, level;
999 	int childmax, osreldt, rsnum, slevel;
1000 #ifdef INET
1001 	int ip4s;
1002 	bool redo_ip4;
1003 #endif
1004 #ifdef INET6
1005 	int ip6s;
1006 	bool redo_ip6;
1007 #endif
1008 	uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
1009 	uint64_t pr_allow_diff;
1010 	unsigned tallow;
1011 	char numbuf[12];
1012 
1013 	error = priv_check(td, PRIV_JAIL_SET);
1014 	if (!error && (flags & JAIL_ATTACH))
1015 		error = priv_check(td, PRIV_JAIL_ATTACH);
1016 	if (error)
1017 		return (error);
1018 	mypr = td->td_ucred->cr_prison;
1019 	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
1020 		return (EPERM);
1021 	if (flags & ~JAIL_SET_MASK)
1022 		return (EINVAL);
1023 
1024 	/*
1025 	 * Check all the parameters before committing to anything.  Not all
1026 	 * errors can be caught early, but we may as well try.  Also, this
1027 	 * takes care of some expensive stuff (path lookup) before getting
1028 	 * the allprison lock.
1029 	 *
1030 	 * XXX Jails are not filesystems, and jail parameters are not mount
1031 	 *     options.  But it makes more sense to re-use the vfsopt code
1032 	 *     than duplicate it under a different name.
1033 	 */
1034 	error = vfs_buildopts(optuio, &opts);
1035 	if (error)
1036 		return (error);
1037 #ifdef INET
1038 	ip4 = NULL;
1039 #endif
1040 #ifdef INET6
1041 	ip6 = NULL;
1042 #endif
1043 	g_path = NULL;
1044 
1045 	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
1046 	if (!cuflags) {
1047 		error = EINVAL;
1048 		vfs_opterror(opts, "no valid operation (create or update)");
1049 		goto done_errmsg;
1050 	}
1051 
1052 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
1053 	if (error == ENOENT)
1054 		jid = 0;
1055 	else if (error != 0)
1056 		goto done_free;
1057 
1058 	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
1059 	if (error == ENOENT)
1060 		gotslevel = 0;
1061 	else if (error != 0)
1062 		goto done_free;
1063 	else
1064 		gotslevel = 1;
1065 
1066 	error =
1067 	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
1068 	if (error == ENOENT)
1069 		gotchildmax = 0;
1070 	else if (error != 0)
1071 		goto done_free;
1072 	else
1073 		gotchildmax = 1;
1074 
1075 	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
1076 	if (error == ENOENT)
1077 		gotenforce = 0;
1078 	else if (error != 0)
1079 		goto done_free;
1080 	else if (enforce < 0 || enforce > 2) {
1081 		error = EINVAL;
1082 		goto done_free;
1083 	} else
1084 		gotenforce = 1;
1085 
1086 	error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
1087 	if (error == ENOENT)
1088 		gotrsnum = 0;
1089 	else if (error != 0)
1090 		goto done_free;
1091 	else
1092 		gotrsnum = 1;
1093 
1094 	pr_flags = ch_flags = 0;
1095 	for (bf = pr_flag_bool;
1096 	     bf < pr_flag_bool + nitems(pr_flag_bool);
1097 	     bf++) {
1098 		vfs_flagopt(opts, bf->name, &pr_flags, bf->flag);
1099 		vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag);
1100 	}
1101 	ch_flags |= pr_flags;
1102 	for (jsf = pr_flag_jailsys;
1103 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
1104 	     jsf++) {
1105 		error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys));
1106 		if (error == ENOENT)
1107 			continue;
1108 		if (error != 0)
1109 			goto done_free;
1110 		switch (jsys) {
1111 		case JAIL_SYS_DISABLE:
1112 			if (!jsf->disable) {
1113 				error = EINVAL;
1114 				goto done_free;
1115 			}
1116 			pr_flags |= jsf->disable;
1117 			break;
1118 		case JAIL_SYS_NEW:
1119 			pr_flags |= jsf->new;
1120 			break;
1121 		case JAIL_SYS_INHERIT:
1122 			break;
1123 		default:
1124 			error = EINVAL;
1125 			goto done_free;
1126 		}
1127 		ch_flags |= jsf->new | jsf->disable;
1128 	}
1129 	if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE
1130 	    && !(pr_flags & PR_PERSIST)) {
1131 		error = EINVAL;
1132 		vfs_opterror(opts, "new jail must persist or attach");
1133 		goto done_errmsg;
1134 	}
1135 #ifdef VIMAGE
1136 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
1137 		error = EINVAL;
1138 		vfs_opterror(opts, "vnet cannot be changed after creation");
1139 		goto done_errmsg;
1140 	}
1141 #endif
1142 #ifdef INET
1143 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
1144 		error = EINVAL;
1145 		vfs_opterror(opts, "ip4 cannot be changed after creation");
1146 		goto done_errmsg;
1147 	}
1148 #endif
1149 #ifdef INET6
1150 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
1151 		error = EINVAL;
1152 		vfs_opterror(opts, "ip6 cannot be changed after creation");
1153 		goto done_errmsg;
1154 	}
1155 #endif
1156 
1157 	pr_allow = ch_allow = 0;
1158 	for (bf = pr_flag_allow;
1159 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
1160 		atomic_load_int(&bf->flag) != 0;
1161 	     bf++) {
1162 		vfs_flagopt(opts, bf->name, &pr_allow, bf->flag);
1163 		vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag);
1164 	}
1165 	ch_allow |= pr_allow;
1166 
1167 	error = vfs_getopt(opts, "name", (void **)&name, &len);
1168 	if (error == ENOENT)
1169 		name = NULL;
1170 	else if (error != 0)
1171 		goto done_free;
1172 	else {
1173 		if (len == 0 || name[len - 1] != '\0') {
1174 			error = EINVAL;
1175 			goto done_free;
1176 		}
1177 		if (len > MAXHOSTNAMELEN) {
1178 			error = ENAMETOOLONG;
1179 			goto done_free;
1180 		}
1181 	}
1182 
1183 	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
1184 	if (error == ENOENT)
1185 		host = NULL;
1186 	else if (error != 0)
1187 		goto done_free;
1188 	else {
1189 		ch_flags |= PR_HOST;
1190 		pr_flags |= PR_HOST;
1191 		if (len == 0 || host[len - 1] != '\0') {
1192 			error = EINVAL;
1193 			goto done_free;
1194 		}
1195 		if (len > MAXHOSTNAMELEN) {
1196 			error = ENAMETOOLONG;
1197 			goto done_free;
1198 		}
1199 	}
1200 
1201 	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
1202 	if (error == ENOENT)
1203 		domain = NULL;
1204 	else if (error != 0)
1205 		goto done_free;
1206 	else {
1207 		ch_flags |= PR_HOST;
1208 		pr_flags |= PR_HOST;
1209 		if (len == 0 || domain[len - 1] != '\0') {
1210 			error = EINVAL;
1211 			goto done_free;
1212 		}
1213 		if (len > MAXHOSTNAMELEN) {
1214 			error = ENAMETOOLONG;
1215 			goto done_free;
1216 		}
1217 	}
1218 
1219 	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
1220 	if (error == ENOENT)
1221 		uuid = NULL;
1222 	else if (error != 0)
1223 		goto done_free;
1224 	else {
1225 		ch_flags |= PR_HOST;
1226 		pr_flags |= PR_HOST;
1227 		if (len == 0 || uuid[len - 1] != '\0') {
1228 			error = EINVAL;
1229 			goto done_free;
1230 		}
1231 		if (len > HOSTUUIDLEN) {
1232 			error = ENAMETOOLONG;
1233 			goto done_free;
1234 		}
1235 	}
1236 
1237 #ifdef COMPAT_FREEBSD32
1238 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
1239 		uint32_t hid32;
1240 
1241 		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
1242 		hid = hid32;
1243 	} else
1244 #endif
1245 		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
1246 	if (error == ENOENT)
1247 		gothid = 0;
1248 	else if (error != 0)
1249 		goto done_free;
1250 	else {
1251 		gothid = 1;
1252 		ch_flags |= PR_HOST;
1253 		pr_flags |= PR_HOST;
1254 	}
1255 
1256 #ifdef INET
1257 	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
1258 	if (error == ENOENT)
1259 		ip4s = 0;
1260 	else if (error != 0)
1261 		goto done_free;
1262 	else if (ip4s & (sizeof(struct in_addr) - 1)) {
1263 		error = EINVAL;
1264 		goto done_free;
1265 	} else {
1266 		ch_flags |= PR_IP4_USER;
1267 		pr_flags |= PR_IP4_USER;
1268 		if (ip4s > 0) {
1269 			ip4s /= sizeof(struct in_addr);
1270 			if (ip4s > jail_max_af_ips) {
1271 				error = EINVAL;
1272 				vfs_opterror(opts, "too many IPv4 addresses");
1273 				goto done_errmsg;
1274 			}
1275 			ip4 = prison_ip_copyin(PR_INET, op, ip4s);
1276 			if (ip4 == NULL) {
1277 				error = EINVAL;
1278 				goto done_free;
1279 			}
1280 		}
1281 	}
1282 #endif
1283 
1284 #ifdef INET6
1285 	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
1286 	if (error == ENOENT)
1287 		ip6s = 0;
1288 	else if (error != 0)
1289 		goto done_free;
1290 	else if (ip6s & (sizeof(struct in6_addr) - 1)) {
1291 		error = EINVAL;
1292 		goto done_free;
1293 	} else {
1294 		ch_flags |= PR_IP6_USER;
1295 		pr_flags |= PR_IP6_USER;
1296 		if (ip6s > 0) {
1297 			ip6s /= sizeof(struct in6_addr);
1298 			if (ip6s > jail_max_af_ips) {
1299 				error = EINVAL;
1300 				vfs_opterror(opts, "too many IPv6 addresses");
1301 				goto done_errmsg;
1302 			}
1303 			ip6 = prison_ip_copyin(PR_INET6, op, ip6s);
1304 			if (ip6 == NULL) {
1305 				error = EINVAL;
1306 				goto done_free;
1307 			}
1308 		}
1309 	}
1310 #endif
1311 
1312 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
1313 	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1314 		error = EINVAL;
1315 		vfs_opterror(opts,
1316 		    "vnet jails cannot have IP address restrictions");
1317 		goto done_errmsg;
1318 	}
1319 #endif
1320 
1321 	error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
1322 	if (error == ENOENT)
1323 		osrelstr = NULL;
1324 	else if (error != 0)
1325 		goto done_free;
1326 	else {
1327 		if (flags & JAIL_UPDATE) {
1328 			error = EINVAL;
1329 			vfs_opterror(opts,
1330 			    "osrelease cannot be changed after creation");
1331 			goto done_errmsg;
1332 		}
1333 		if (len == 0 || osrelstr[len - 1] != '\0') {
1334 			error = EINVAL;
1335 			goto done_free;
1336 		}
1337 		if (len >= OSRELEASELEN) {
1338 			error = ENAMETOOLONG;
1339 			vfs_opterror(opts,
1340 			    "osrelease string must be 1-%d bytes long",
1341 			    OSRELEASELEN - 1);
1342 			goto done_errmsg;
1343 		}
1344 	}
1345 
1346 	error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
1347 	if (error == ENOENT)
1348 		osreldt = 0;
1349 	else if (error != 0)
1350 		goto done_free;
1351 	else {
1352 		if (flags & JAIL_UPDATE) {
1353 			error = EINVAL;
1354 			vfs_opterror(opts,
1355 			    "osreldate cannot be changed after creation");
1356 			goto done_errmsg;
1357 		}
1358 		if (osreldt == 0) {
1359 			error = EINVAL;
1360 			vfs_opterror(opts, "osreldate cannot be 0");
1361 			goto done_errmsg;
1362 		}
1363 	}
1364 
1365 	root = NULL;
1366 	error = vfs_getopt(opts, "path", (void **)&path, &len);
1367 	if (error == ENOENT)
1368 		path = NULL;
1369 	else if (error != 0)
1370 		goto done_free;
1371 	else {
1372 		if (flags & JAIL_UPDATE) {
1373 			error = EINVAL;
1374 			vfs_opterror(opts,
1375 			    "path cannot be changed after creation");
1376 			goto done_errmsg;
1377 		}
1378 		if (len == 0 || path[len - 1] != '\0') {
1379 			error = EINVAL;
1380 			goto done_free;
1381 		}
1382 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path);
1383 		error = namei(&nd);
1384 		if (error)
1385 			goto done_free;
1386 		root = nd.ni_vp;
1387 		NDFREE_PNBUF(&nd);
1388 		g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1389 		strlcpy(g_path, path, MAXPATHLEN);
1390 		error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
1391 		if (error == 0) {
1392 			path = g_path;
1393 		} else {
1394 			/* exit on other errors */
1395 			goto done_free;
1396 		}
1397 		if (root->v_type != VDIR) {
1398 			error = ENOTDIR;
1399 			vput(root);
1400 			goto done_free;
1401 		}
1402 		VOP_UNLOCK(root);
1403 	}
1404 
1405 	/*
1406 	 * Find the specified jail, or at least its parent.
1407 	 * This abuses the file error codes ENOENT and EEXIST.
1408 	 */
1409 	pr = NULL;
1410 	inspr = NULL;
1411 	deadpr = NULL;
1412 	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
1413 		namelc = strrchr(name, '.');
1414 		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
1415 		if (*p != '\0')
1416 			jid = 0;
1417 	}
1418 	sx_xlock(&allprison_lock);
1419 	drflags = PD_LIST_XLOCKED;
1420 	ppr = mypr;
1421 	if (!prison_isalive(ppr)) {
1422 		/* This jail is dying.  This process will surely follow. */
1423 		error = EAGAIN;
1424 		goto done_deref;
1425 	}
1426 	if (jid != 0) {
1427 		if (jid < 0) {
1428 			error = EINVAL;
1429 			vfs_opterror(opts, "negative jid");
1430 			goto done_deref;
1431 		}
1432 		/*
1433 		 * See if a requested jid already exists.  Keep track of
1434 		 * where it can be inserted later.
1435 		 */
1436 		TAILQ_FOREACH(inspr, &allprison, pr_list) {
1437 			if (inspr->pr_id < jid)
1438 				continue;
1439 			if (inspr->pr_id > jid)
1440 				break;
1441 			if (prison_isalive(inspr)) {
1442 				pr = inspr;
1443 				mtx_lock(&pr->pr_mtx);
1444 				drflags |= PD_LOCKED;
1445 			} else {
1446 				/* Note a dying jail to handle later. */
1447 				deadpr = inspr;
1448 			}
1449 			inspr = NULL;
1450 			break;
1451 		}
1452 		if (cuflags == JAIL_CREATE && pr != NULL) {
1453 			/*
1454 			 * Even creators that cannot see the jail will
1455 			 * get EEXIST.
1456 			 */
1457 			error = EEXIST;
1458 			vfs_opterror(opts, "jail %d already exists", jid);
1459 			goto done_deref;
1460 		}
1461 		if ((pr == NULL)
1462 		    ? cuflags == JAIL_UPDATE
1463 		    : !prison_ischild(mypr, pr)) {
1464 			/*
1465 			 * Updaters get ENOENT for nonexistent jails,
1466 			 * or for jails they cannot see.  The latter
1467 			 * case is true even for CREATE | UPDATE,
1468 			 * which normally cannot give this error.
1469 			 */
1470 			error = ENOENT;
1471 			vfs_opterror(opts, "jail %d not found", jid);
1472 			goto done_deref;
1473 		}
1474 	}
1475 	/*
1476 	 * If the caller provided a name, look for a jail by that name.
1477 	 * This has different semantics for creates and updates keyed by jid
1478 	 * (where the name must not already exist in a different jail),
1479 	 * and updates keyed by the name itself (where the name must exist
1480 	 * because that is the jail being updated).
1481 	 */
1482 	namelc = NULL;
1483 	if (name != NULL) {
1484 		namelc = strrchr(name, '.');
1485 		if (namelc == NULL)
1486 			namelc = name;
1487 		else {
1488 			/*
1489 			 * This is a hierarchical name.  Split it into the
1490 			 * parent and child names, and make sure the parent
1491 			 * exists or matches an already found jail.
1492 			 */
1493 			if (pr != NULL) {
1494 				if (strncmp(name, ppr->pr_name, namelc - name)
1495 				    || ppr->pr_name[namelc - name] != '\0') {
1496 					error = EINVAL;
1497 					vfs_opterror(opts,
1498 					    "cannot change jail's parent");
1499 					goto done_deref;
1500 				}
1501 			} else {
1502 				*namelc = '\0';
1503 				ppr = prison_find_name(mypr, name);
1504 				if (ppr == NULL) {
1505 					error = ENOENT;
1506 					vfs_opterror(opts,
1507 					    "jail \"%s\" not found", name);
1508 					goto done_deref;
1509 				}
1510 				mtx_unlock(&ppr->pr_mtx);
1511 				if (!prison_isalive(ppr)) {
1512 					error = ENOENT;
1513 					vfs_opterror(opts,
1514 					    "jail \"%s\" is dying", name);
1515 					goto done_deref;
1516 				}
1517 				*namelc = '.';
1518 			}
1519 			namelc++;
1520 		}
1521 		if (namelc[0] != '\0') {
1522 			pnamelen =
1523 			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1524 			FOREACH_PRISON_CHILD(ppr, tpr) {
1525 				if (tpr == pr || !prison_isalive(tpr) ||
1526 				    strcmp(tpr->pr_name + pnamelen, namelc))
1527 					continue;
1528 				if (cuflags == JAIL_CREATE || pr != NULL) {
1529 					/*
1530 					 * Create, or update(jid): name must
1531 					 * not exist in an active sibling jail.
1532 					 */
1533 					error = EEXIST;
1534 					vfs_opterror(opts,
1535 					    "jail \"%s\" already exists", name);
1536 					goto done_deref;
1537 				}
1538 				/* Use this jail for updates. */
1539 				pr = tpr;
1540 				mtx_lock(&pr->pr_mtx);
1541 				drflags |= PD_LOCKED;
1542 				break;
1543 			}
1544 			/*
1545 			 * Update: name must exist if no jid is specified.
1546 			 * As with the jid case, the jail must be currently
1547 			 * visible, or else even CREATE | UPDATE will get
1548 			 * an error.
1549 			 */
1550 			if ((pr == NULL)
1551 			    ? cuflags == JAIL_UPDATE
1552 			    : !prison_isalive(pr)) {
1553 				error = ENOENT;
1554 				vfs_opterror(opts, "jail \"%s\" not found",
1555 				    name);
1556 				goto done_deref;
1557 			}
1558 		}
1559 	}
1560 	/* Update: must provide a jid or name. */
1561 	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1562 		error = ENOENT;
1563 		vfs_opterror(opts, "update specified no jail");
1564 		goto done_deref;
1565 	}
1566 
1567 	/* If there's no prison to update, create a new one and link it in. */
1568 	created = pr == NULL;
1569 	if (created) {
1570 		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1571 			if (tpr->pr_childcount >= tpr->pr_childmax) {
1572 				error = EPERM;
1573 				vfs_opterror(opts, "prison limit exceeded");
1574 				goto done_deref;
1575 			}
1576 
1577 		if (deadpr != NULL) {
1578 			/*
1579 			 * The prison being created has the same ID as a dying
1580 			 * one.  Handle this by giving the dying jail a new ID.
1581 			 * This may cause some confusion to user space, but
1582 			 * only to those listing dying jails.
1583 			 */
1584 			deadid = get_next_deadid(&dinspr);
1585 			if (deadid == 0) {
1586 				error = EAGAIN;
1587 				vfs_opterror(opts, "no available jail IDs");
1588 				goto done_deref;
1589 			}
1590 			mtx_lock(&deadpr->pr_mtx);
1591 			deadpr->pr_id = deadid;
1592 			mtx_unlock(&deadpr->pr_mtx);
1593 			if (dinspr == deadpr)
1594 				inspr = deadpr;
1595 			else {
1596 				inspr = TAILQ_NEXT(deadpr, pr_list);
1597 				TAILQ_REMOVE(&allprison, deadpr, pr_list);
1598 				if (dinspr != NULL)
1599 					TAILQ_INSERT_AFTER(&allprison, dinspr,
1600 					    deadpr, pr_list);
1601 				else
1602 					TAILQ_INSERT_HEAD(&allprison, deadpr,
1603 					    pr_list);
1604 			}
1605 		}
1606 		if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) {
1607 			error = EAGAIN;
1608 			vfs_opterror(opts, "no available jail IDs");
1609 			goto done_deref;
1610 		}
1611 
1612 		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1613 		pr->pr_state = PRISON_STATE_INVALID;
1614 		refcount_init(&pr->pr_ref, 1);
1615 		refcount_init(&pr->pr_uref, 0);
1616 		drflags |= PD_DEREF;
1617 		LIST_INIT(&pr->pr_children);
1618 		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1619 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
1620 
1621 		pr->pr_id = jid;
1622 		if (inspr != NULL)
1623 			TAILQ_INSERT_BEFORE(inspr, pr, pr_list);
1624 		else
1625 			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1626 
1627 		pr->pr_parent = ppr;
1628 		prison_hold(ppr);
1629 		prison_proc_hold(ppr);
1630 		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1631 		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1632 			tpr->pr_childcount++;
1633 
1634 		/* Set some default values, and inherit some from the parent. */
1635 		if (namelc == NULL)
1636 			namelc = "";
1637 		if (path == NULL) {
1638 			path = "/";
1639 			root = mypr->pr_root;
1640 			vref(root);
1641 		}
1642 		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1643 		pr->pr_flags |= PR_HOST;
1644 #if defined(INET) || defined(INET6)
1645 #ifdef VIMAGE
1646 		if (!(pr_flags & PR_VNET))
1647 #endif
1648 		{
1649 #ifdef INET
1650 			if (!(ch_flags & PR_IP4_USER))
1651 				pr->pr_flags |= PR_IP4 | PR_IP4_USER;
1652 			else if (!(pr_flags & PR_IP4_USER)) {
1653 				pr->pr_flags |= ppr->pr_flags & PR_IP4;
1654 				prison_ip_dup(ppr, pr, PR_INET);
1655 			}
1656 #endif
1657 #ifdef INET6
1658 			if (!(ch_flags & PR_IP6_USER))
1659 				pr->pr_flags |= PR_IP6 | PR_IP6_USER;
1660 			else if (!(pr_flags & PR_IP6_USER)) {
1661 				pr->pr_flags |= ppr->pr_flags & PR_IP6;
1662 				prison_ip_dup(ppr, pr, PR_INET6);
1663 			}
1664 #endif
1665 		}
1666 #endif
1667 		/* Source address selection is always on by default. */
1668 		pr->pr_flags |= _PR_IP_SADDRSEL;
1669 
1670 		pr->pr_securelevel = ppr->pr_securelevel;
1671 		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1672 		pr->pr_enforce_statfs = jail_default_enforce_statfs;
1673 		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
1674 
1675 		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
1676 		if (osrelstr == NULL)
1677 			strlcpy(pr->pr_osrelease, ppr->pr_osrelease,
1678 			    sizeof(pr->pr_osrelease));
1679 		else
1680 			strlcpy(pr->pr_osrelease, osrelstr,
1681 			    sizeof(pr->pr_osrelease));
1682 
1683 #ifdef VIMAGE
1684 		/* Allocate a new vnet if specified. */
1685 		pr->pr_vnet = (pr_flags & PR_VNET)
1686 		    ? vnet_alloc() : ppr->pr_vnet;
1687 #endif
1688 		/*
1689 		 * Allocate a dedicated cpuset for each jail.
1690 		 * Unlike other initial settings, this may return an error.
1691 		 */
1692 		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1693 		if (error)
1694 			goto done_deref;
1695 
1696 		mtx_lock(&pr->pr_mtx);
1697 		drflags |= PD_LOCKED;
1698 	} else {
1699 		/*
1700 		 * Grab a reference for existing prisons, to ensure they
1701 		 * continue to exist for the duration of the call.
1702 		 */
1703 		prison_hold(pr);
1704 		drflags |= PD_DEREF;
1705 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
1706 		if ((pr->pr_flags & PR_VNET) &&
1707 		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1708 			error = EINVAL;
1709 			vfs_opterror(opts,
1710 			    "vnet jails cannot have IP address restrictions");
1711 			goto done_deref;
1712 		}
1713 #endif
1714 #ifdef INET
1715 		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1716 			error = EINVAL;
1717 			vfs_opterror(opts,
1718 			    "ip4 cannot be changed after creation");
1719 			goto done_deref;
1720 		}
1721 #endif
1722 #ifdef INET6
1723 		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1724 			error = EINVAL;
1725 			vfs_opterror(opts,
1726 			    "ip6 cannot be changed after creation");
1727 			goto done_deref;
1728 		}
1729 #endif
1730 	}
1731 
1732 	/* Do final error checking before setting anything. */
1733 	if (gotslevel) {
1734 		if (slevel < ppr->pr_securelevel) {
1735 			error = EPERM;
1736 			goto done_deref;
1737 		}
1738 	}
1739 	if (gotchildmax) {
1740 		if (childmax >= ppr->pr_childmax) {
1741 			error = EPERM;
1742 			goto done_deref;
1743 		}
1744 	}
1745 	if (gotenforce) {
1746 		if (enforce < ppr->pr_enforce_statfs) {
1747 			error = EPERM;
1748 			goto done_deref;
1749 		}
1750 	}
1751 	if (gotrsnum) {
1752 		/*
1753 		 * devfs_rsnum is a uint16_t
1754 		 */
1755 		if (rsnum < 0 || rsnum > 65535) {
1756 			error = EINVAL;
1757 			goto done_deref;
1758 		}
1759 		/*
1760 		 * Nested jails always inherit parent's devfs ruleset
1761 		 */
1762 		if (jailed(td->td_ucred)) {
1763 			if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
1764 				error = EPERM;
1765 				goto done_deref;
1766 			} else
1767 				rsnum = ppr->pr_devfs_rsnum;
1768 		}
1769 	}
1770 #ifdef INET
1771 	if (ip4s > 0) {
1772 		if ((ppr->pr_flags & PR_IP4) &&
1773 		    !prison_ip_parent_match(ppr->pr_addrs[PR_INET], ip4,
1774 		    PR_INET)) {
1775 			error = EPERM;
1776 			goto done_deref;
1777 		}
1778 		if (!prison_ip_conflict_check(ppr, pr, ip4, PR_INET)) {
1779 			error = EADDRINUSE;
1780 			vfs_opterror(opts, "IPv4 addresses clash");
1781 			goto done_deref;
1782 		}
1783 	}
1784 #endif
1785 #ifdef INET6
1786 	if (ip6s > 0) {
1787 		if ((ppr->pr_flags & PR_IP6) &&
1788 		    !prison_ip_parent_match(ppr->pr_addrs[PR_INET6], ip6,
1789 		    PR_INET6)) {
1790 			error = EPERM;
1791 			goto done_deref;
1792 		}
1793 		if (!prison_ip_conflict_check(ppr, pr, ip6, PR_INET6)) {
1794 			error = EADDRINUSE;
1795 			vfs_opterror(opts, "IPv6 addresses clash");
1796 			goto done_deref;
1797 		}
1798 	}
1799 #endif
1800 	onamelen = namelen = 0;
1801 	if (namelc != NULL) {
1802 		/* Give a default name of the jid.  Also allow the name to be
1803 		 * explicitly the jid - but not any other number, and only in
1804 		 * normal form (no leading zero/etc).
1805 		 */
1806 		if (namelc[0] == '\0')
1807 			snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
1808 		else if ((strtoul(namelc, &p, 10) != jid ||
1809 			  namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
1810 			error = EINVAL;
1811 			vfs_opterror(opts,
1812 			    "name cannot be numeric (unless it is the jid)");
1813 			goto done_deref;
1814 		}
1815 		/*
1816 		 * Make sure the name isn't too long for the prison or its
1817 		 * children.
1818 		 */
1819 		pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1820 		onamelen = strlen(pr->pr_name + pnamelen);
1821 		namelen = strlen(namelc);
1822 		if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
1823 			error = ENAMETOOLONG;
1824 			goto done_deref;
1825 		}
1826 		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1827 			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1828 			    sizeof(pr->pr_name)) {
1829 				error = ENAMETOOLONG;
1830 				goto done_deref;
1831 			}
1832 		}
1833 	}
1834 	pr_allow_diff = pr_allow & ~ppr->pr_allow;
1835 	if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) {
1836 		error = EPERM;
1837 		goto done_deref;
1838 	}
1839 
1840 	/*
1841 	 * Let modules check their parameters.  This requires unlocking and
1842 	 * then re-locking the prison, but this is still a valid state as long
1843 	 * as allprison_lock remains xlocked.
1844 	 */
1845 	mtx_unlock(&pr->pr_mtx);
1846 	drflags &= ~PD_LOCKED;
1847 	error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
1848 	if (error != 0)
1849 		goto done_deref;
1850 	mtx_lock(&pr->pr_mtx);
1851 	drflags |= PD_LOCKED;
1852 
1853 	/* At this point, all valid parameters should have been noted. */
1854 	TAILQ_FOREACH(opt, opts, link) {
1855 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
1856 			error = EINVAL;
1857 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
1858 			goto done_deref;
1859 		}
1860 	}
1861 
1862 	/* Set the parameters of the prison. */
1863 #ifdef INET
1864 	redo_ip4 = false;
1865 	if (pr_flags & PR_IP4_USER) {
1866 		pr->pr_flags |= PR_IP4;
1867 		prison_ip_set(pr, PR_INET, ip4);
1868 		ip4 = NULL;
1869 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1870 #ifdef VIMAGE
1871 			if (tpr->pr_flags & PR_VNET) {
1872 				descend = 0;
1873 				continue;
1874 			}
1875 #endif
1876 			if (!prison_ip_restrict(tpr, PR_INET, NULL)) {
1877 				redo_ip4 = true;
1878 				descend = 0;
1879 			}
1880 		}
1881 	}
1882 #endif
1883 #ifdef INET6
1884 	redo_ip6 = false;
1885 	if (pr_flags & PR_IP6_USER) {
1886 		pr->pr_flags |= PR_IP6;
1887 		prison_ip_set(pr, PR_INET6, ip6);
1888 		ip6 = NULL;
1889 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1890 #ifdef VIMAGE
1891 			if (tpr->pr_flags & PR_VNET) {
1892 				descend = 0;
1893 				continue;
1894 			}
1895 #endif
1896 			if (!prison_ip_restrict(tpr, PR_INET6, NULL)) {
1897 				redo_ip6 = true;
1898 				descend = 0;
1899 			}
1900 		}
1901 	}
1902 #endif
1903 	if (gotslevel) {
1904 		pr->pr_securelevel = slevel;
1905 		/* Set all child jails to be at least this level. */
1906 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1907 			if (tpr->pr_securelevel < slevel)
1908 				tpr->pr_securelevel = slevel;
1909 	}
1910 	if (gotchildmax) {
1911 		pr->pr_childmax = childmax;
1912 		/* Set all child jails to under this limit. */
1913 		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1914 			if (tpr->pr_childmax > childmax - level)
1915 				tpr->pr_childmax = childmax > level
1916 				    ? childmax - level : 0;
1917 	}
1918 	if (gotenforce) {
1919 		pr->pr_enforce_statfs = enforce;
1920 		/* Pass this restriction on to the children. */
1921 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1922 			if (tpr->pr_enforce_statfs < enforce)
1923 				tpr->pr_enforce_statfs = enforce;
1924 	}
1925 	if (gotrsnum) {
1926 		pr->pr_devfs_rsnum = rsnum;
1927 		/* Pass this restriction on to the children. */
1928 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1929 			tpr->pr_devfs_rsnum = rsnum;
1930 	}
1931 	if (namelc != NULL) {
1932 		if (ppr == &prison0)
1933 			strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
1934 		else
1935 			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1936 			    ppr->pr_name, namelc);
1937 		/* Change this component of child names. */
1938 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1939 			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1940 			    strlen(tpr->pr_name + onamelen) + 1);
1941 			bcopy(pr->pr_name, tpr->pr_name, namelen);
1942 		}
1943 	}
1944 	if (path != NULL) {
1945 		/* Try to keep a real-rooted full pathname. */
1946 		strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1947 		pr->pr_root = root;
1948 		root = NULL;
1949 	}
1950 	if (PR_HOST & ch_flags & ~pr_flags) {
1951 		if (pr->pr_flags & PR_HOST) {
1952 			/*
1953 			 * Copy the parent's host info.  As with pr_ip4 above,
1954 			 * the lack of a lock on the parent is not a problem;
1955 			 * it is always set with allprison_lock at least
1956 			 * shared, and is held exclusively here.
1957 			 */
1958 			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1959 			    sizeof(pr->pr_hostname));
1960 			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1961 			    sizeof(pr->pr_domainname));
1962 			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1963 			    sizeof(pr->pr_hostuuid));
1964 			pr->pr_hostid = pr->pr_parent->pr_hostid;
1965 		}
1966 	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1967 		/* Set this prison, and any descendants without PR_HOST. */
1968 		if (host != NULL)
1969 			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1970 		if (domain != NULL)
1971 			strlcpy(pr->pr_domainname, domain,
1972 			    sizeof(pr->pr_domainname));
1973 		if (uuid != NULL)
1974 			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1975 		if (gothid)
1976 			pr->pr_hostid = hid;
1977 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1978 			if (tpr->pr_flags & PR_HOST)
1979 				descend = 0;
1980 			else {
1981 				if (host != NULL)
1982 					strlcpy(tpr->pr_hostname,
1983 					    pr->pr_hostname,
1984 					    sizeof(tpr->pr_hostname));
1985 				if (domain != NULL)
1986 					strlcpy(tpr->pr_domainname,
1987 					    pr->pr_domainname,
1988 					    sizeof(tpr->pr_domainname));
1989 				if (uuid != NULL)
1990 					strlcpy(tpr->pr_hostuuid,
1991 					    pr->pr_hostuuid,
1992 					    sizeof(tpr->pr_hostuuid));
1993 				if (gothid)
1994 					tpr->pr_hostid = hid;
1995 			}
1996 		}
1997 	}
1998 	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1999 	if ((tallow = ch_allow & ~pr_allow))
2000 		prison_set_allow_locked(pr, tallow, 0);
2001 	/*
2002 	 * Persistent prisons get an extra reference, and prisons losing their
2003 	 * persist flag lose that reference.
2004 	 */
2005 	if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) {
2006 		if (pr_flags & PR_PERSIST) {
2007 			prison_hold(pr);
2008 			/*
2009 			 * This may be a new prison's first user reference,
2010 			 * but wait to call it alive until after OSD calls
2011 			 * have had a chance to run (and perhaps to fail).
2012 			 */
2013 			refcount_acquire(&pr->pr_uref);
2014 		} else {
2015 			drflags |= PD_DEUREF;
2016 			prison_free_not_last(pr);
2017 		}
2018 	}
2019 	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
2020 	mtx_unlock(&pr->pr_mtx);
2021 	drflags &= ~PD_LOCKED;
2022 	/*
2023 	 * Any errors past this point will need to de-persist newly created
2024 	 * prisons, as well as call remove methods.
2025 	 */
2026 	if (created)
2027 		drflags |= PD_KILL;
2028 
2029 #ifdef RACCT
2030 	if (racct_enable && created)
2031 		prison_racct_attach(pr);
2032 #endif
2033 
2034 	/* Locks may have prevented a complete restriction of child IP
2035 	 * addresses.  If so, allocate some more memory and try again.
2036 	 */
2037 #ifdef INET
2038 	while (redo_ip4) {
2039 		ip4s = pr->pr_addrs[PR_INET]->ips;
2040 		MPASS(ip4 == NULL);
2041 		ip4 = prison_ip_alloc(PR_INET, ip4s, M_WAITOK);
2042 		mtx_lock(&pr->pr_mtx);
2043 		redo_ip4 = false;
2044 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2045 #ifdef VIMAGE
2046 			if (tpr->pr_flags & PR_VNET) {
2047 				descend = 0;
2048 				continue;
2049 			}
2050 #endif
2051 			if (!prison_ip_restrict(tpr, PR_INET, &ip4))
2052 				redo_ip4 = true;
2053 		}
2054 		mtx_unlock(&pr->pr_mtx);
2055 	}
2056 #endif
2057 #ifdef INET6
2058 	while (redo_ip6) {
2059 		ip6s = pr->pr_addrs[PR_INET6]->ips;
2060 		MPASS(ip6 == NULL);
2061 		ip6 = prison_ip_alloc(PR_INET6, ip6s, M_WAITOK);
2062 		mtx_lock(&pr->pr_mtx);
2063 		redo_ip6 = false;
2064 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2065 #ifdef VIMAGE
2066 			if (tpr->pr_flags & PR_VNET) {
2067 				descend = 0;
2068 				continue;
2069 			}
2070 #endif
2071 			if (!prison_ip_restrict(tpr, PR_INET6, &ip6))
2072 				redo_ip6 = true;
2073 		}
2074 		mtx_unlock(&pr->pr_mtx);
2075 	}
2076 #endif
2077 
2078 	/* Let the modules do their work. */
2079 	if (created) {
2080 		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
2081 		if (error)
2082 			goto done_deref;
2083 	}
2084 	error = osd_jail_call(pr, PR_METHOD_SET, opts);
2085 	if (error)
2086 		goto done_deref;
2087 
2088 	/*
2089 	 * A new prison is now ready to be seen; either it has gained a user
2090 	 * reference via persistence, or is about to gain one via attachment.
2091 	 */
2092 	if (created) {
2093 		drflags = prison_lock_xlock(pr, drflags);
2094 		pr->pr_state = PRISON_STATE_ALIVE;
2095 	}
2096 
2097 	/* Attach this process to the prison if requested. */
2098 	if (flags & JAIL_ATTACH) {
2099 		error = do_jail_attach(td, pr,
2100 		    prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS));
2101 		drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED);
2102 		if (error) {
2103 			vfs_opterror(opts, "attach failed");
2104 			goto done_deref;
2105 		}
2106 	}
2107 
2108 #ifdef RACCT
2109 	if (racct_enable && !created) {
2110 		if (drflags & PD_LOCKED) {
2111 			mtx_unlock(&pr->pr_mtx);
2112 			drflags &= ~PD_LOCKED;
2113 		}
2114 		if (drflags & PD_LIST_XLOCKED) {
2115 			sx_xunlock(&allprison_lock);
2116 			drflags &= ~PD_LIST_XLOCKED;
2117 		}
2118 		prison_racct_modify(pr);
2119 	}
2120 #endif
2121 
2122 	if (created && pr != &prison0 && (pr->pr_allow & PR_ALLOW_NFSD) != 0 &&
2123 	    (pr->pr_root->v_vflag & VV_ROOT) == 0)
2124 		printf("Warning jail jid=%d: mountd/nfsd requires a separate"
2125 		   " file system\n", pr->pr_id);
2126 
2127 	drflags &= ~PD_KILL;
2128 	td->td_retval[0] = pr->pr_id;
2129 
2130  done_deref:
2131 	/* Release any temporary prison holds and/or locks. */
2132 	if (pr != NULL)
2133 		prison_deref(pr, drflags);
2134 	else if (drflags & PD_LIST_SLOCKED)
2135 		sx_sunlock(&allprison_lock);
2136 	else if (drflags & PD_LIST_XLOCKED)
2137 		sx_xunlock(&allprison_lock);
2138 	if (root != NULL)
2139 		vrele(root);
2140  done_errmsg:
2141 	if (error) {
2142 		/* Write the error message back to userspace. */
2143 		if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
2144 		    &errmsg_len) == 0 && errmsg_len > 0) {
2145 			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
2146 			if (optuio->uio_segflg == UIO_SYSSPACE)
2147 				bcopy(errmsg,
2148 				    optuio->uio_iov[errmsg_pos].iov_base,
2149 				    errmsg_len);
2150 			else
2151 				(void)copyout(errmsg,
2152 				    optuio->uio_iov[errmsg_pos].iov_base,
2153 				    errmsg_len);
2154 		}
2155 	}
2156  done_free:
2157 #ifdef INET
2158 	prison_ip_free(ip4);
2159 #endif
2160 #ifdef INET6
2161 	prison_ip_free(ip6);
2162 #endif
2163 	if (g_path != NULL)
2164 		free(g_path, M_TEMP);
2165 	vfs_freeopts(opts);
2166 	return (error);
2167 }
2168 
2169 /*
2170  * Find the next available prison ID.  Return the ID on success, or zero
2171  * on failure.  Also set a pointer to the allprison list entry the prison
2172  * should be inserted before.
2173  */
2174 static int
2175 get_next_prid(struct prison **insprp)
2176 {
2177 	struct prison *inspr;
2178 	int jid, maxid;
2179 
2180 	jid = lastprid % JAIL_MAX + 1;
2181 	if (TAILQ_EMPTY(&allprison) ||
2182 	    TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) {
2183 		/*
2184 		 * A common case is for all jails to be implicitly numbered,
2185 		 * which means they'll go on the end of the list, at least
2186 		 * for the first JAIL_MAX times.
2187 		 */
2188 		inspr = NULL;
2189 	} else {
2190 		/*
2191 		 * Take two passes through the allprison list: first starting
2192 		 * with the proposed jid, then ending with it.
2193 		 */
2194 		for (maxid = JAIL_MAX; maxid != 0; ) {
2195 			TAILQ_FOREACH(inspr, &allprison, pr_list) {
2196 				if (inspr->pr_id < jid)
2197 					continue;
2198 				if (inspr->pr_id > jid) {
2199 					/* Found an opening. */
2200 					maxid = 0;
2201 					break;
2202 				}
2203 				if (++jid > maxid) {
2204 					if (lastprid == maxid || lastprid == 0)
2205 					{
2206 						/*
2207 						 * The entire legal range
2208 						 * has been traversed
2209 						 */
2210 						return 0;
2211 					}
2212 					/* Try again from the start. */
2213 					jid = 1;
2214 					maxid = lastprid;
2215 					break;
2216 				}
2217 			}
2218 			if (inspr == NULL) {
2219 				/* Found room at the end of the list. */
2220 				break;
2221 			}
2222 		}
2223 	}
2224 	*insprp = inspr;
2225 	lastprid = jid;
2226 	return (jid);
2227 }
2228 
2229 /*
2230  * Find the next available ID for a renumbered dead prison.  This is the same
2231  * as get_next_prid, but counting backward from the end of the range.
2232  */
2233 static int
2234 get_next_deadid(struct prison **dinsprp)
2235 {
2236 	struct prison *dinspr;
2237 	int deadid, minid;
2238 
2239 	deadid = lastdeadid ? lastdeadid - 1 : JAIL_MAX;
2240 	/*
2241 	 * Take two reverse passes through the allprison list: first
2242 	 * starting with the proposed deadid, then ending with it.
2243 	 */
2244 	for (minid = 1; minid != 0; ) {
2245 		TAILQ_FOREACH_REVERSE(dinspr, &allprison, prisonlist, pr_list) {
2246 			if (dinspr->pr_id > deadid)
2247 				continue;
2248 			if (dinspr->pr_id < deadid) {
2249 				/* Found an opening. */
2250 				minid = 0;
2251 				break;
2252 			}
2253 			if (--deadid < minid) {
2254 				if (lastdeadid == minid || lastdeadid == 0)
2255 				{
2256 					/*
2257 					 * The entire legal range
2258 					 * has been traversed
2259 					 */
2260 					return 0;
2261 				}
2262 				/* Try again from the end. */
2263 				deadid = JAIL_MAX;
2264 				minid = lastdeadid;
2265 				break;
2266 			}
2267 		}
2268 		if (dinspr == NULL) {
2269 			/* Found room at the beginning of the list. */
2270 			break;
2271 		}
2272 	}
2273 	*dinsprp = dinspr;
2274 	lastdeadid = deadid;
2275 	return (deadid);
2276 }
2277 
2278 /*
2279  * struct jail_get_args {
2280  *	struct iovec *iovp;
2281  *	unsigned int iovcnt;
2282  *	int flags;
2283  * };
2284  */
2285 int
2286 sys_jail_get(struct thread *td, struct jail_get_args *uap)
2287 {
2288 	struct uio *auio;
2289 	int error;
2290 
2291 	/* Check that we have an even number of iovecs. */
2292 	if (uap->iovcnt & 1)
2293 		return (EINVAL);
2294 
2295 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
2296 	if (error)
2297 		return (error);
2298 	error = kern_jail_get(td, auio, uap->flags);
2299 	if (error == 0)
2300 		error = copyout(auio->uio_iov, uap->iovp,
2301 		    uap->iovcnt * sizeof(struct iovec));
2302 	freeuio(auio);
2303 	return (error);
2304 }
2305 
2306 int
2307 kern_jail_get(struct thread *td, struct uio *optuio, int flags)
2308 {
2309 	struct bool_flags *bf;
2310 	struct jailsys_flags *jsf;
2311 	struct prison *pr, *mypr;
2312 	struct vfsopt *opt;
2313 	struct vfsoptlist *opts;
2314 	char *errmsg, *name;
2315 	int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos;
2316 	unsigned f;
2317 
2318 	if (flags & ~JAIL_GET_MASK)
2319 		return (EINVAL);
2320 
2321 	/* Get the parameter list. */
2322 	error = vfs_buildopts(optuio, &opts);
2323 	if (error)
2324 		return (error);
2325 	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
2326 	mypr = td->td_ucred->cr_prison;
2327 	pr = NULL;
2328 
2329 	/*
2330 	 * Find the prison specified by one of: lastjid, jid, name.
2331 	 */
2332 	sx_slock(&allprison_lock);
2333 	drflags = PD_LIST_SLOCKED;
2334 	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
2335 	if (error == 0) {
2336 		TAILQ_FOREACH(pr, &allprison, pr_list) {
2337 			if (pr->pr_id > jid &&
2338 			    ((flags & JAIL_DYING) || prison_isalive(pr)) &&
2339 			    prison_ischild(mypr, pr)) {
2340 				mtx_lock(&pr->pr_mtx);
2341 				drflags |= PD_LOCKED;
2342 				goto found_prison;
2343 			}
2344 		}
2345 		error = ENOENT;
2346 		vfs_opterror(opts, "no jail after %d", jid);
2347 		goto done;
2348 	} else if (error != ENOENT)
2349 		goto done;
2350 
2351 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
2352 	if (error == 0) {
2353 		if (jid != 0) {
2354 			pr = prison_find_child(mypr, jid);
2355 			if (pr != NULL) {
2356 				drflags |= PD_LOCKED;
2357 				if (!(prison_isalive(pr) ||
2358 				    (flags & JAIL_DYING))) {
2359 					error = ENOENT;
2360 					vfs_opterror(opts, "jail %d is dying",
2361 					    jid);
2362 					goto done;
2363 				}
2364 				goto found_prison;
2365 			}
2366 			error = ENOENT;
2367 			vfs_opterror(opts, "jail %d not found", jid);
2368 			goto done;
2369 		}
2370 	} else if (error != ENOENT)
2371 		goto done;
2372 
2373 	error = vfs_getopt(opts, "name", (void **)&name, &len);
2374 	if (error == 0) {
2375 		if (len == 0 || name[len - 1] != '\0') {
2376 			error = EINVAL;
2377 			goto done;
2378 		}
2379 		pr = prison_find_name(mypr, name);
2380 		if (pr != NULL) {
2381 			drflags |= PD_LOCKED;
2382 			if (!(prison_isalive(pr) || (flags & JAIL_DYING))) {
2383 				error = ENOENT;
2384 				vfs_opterror(opts, "jail \"%s\" is dying",
2385 				    name);
2386 				goto done;
2387 			}
2388 			goto found_prison;
2389 		}
2390 		error = ENOENT;
2391 		vfs_opterror(opts, "jail \"%s\" not found", name);
2392 		goto done;
2393 	} else if (error != ENOENT)
2394 		goto done;
2395 
2396 	vfs_opterror(opts, "no jail specified");
2397 	error = ENOENT;
2398 	goto done;
2399 
2400  found_prison:
2401 	/* Get the parameters of the prison. */
2402 	prison_hold(pr);
2403 	drflags |= PD_DEREF;
2404 	td->td_retval[0] = pr->pr_id;
2405 	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
2406 	if (error != 0 && error != ENOENT)
2407 		goto done;
2408 	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
2409 	error = vfs_setopt(opts, "parent", &i, sizeof(i));
2410 	if (error != 0 && error != ENOENT)
2411 		goto done;
2412 	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
2413 	if (error != 0 && error != ENOENT)
2414 		goto done;
2415 	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
2416 	    sizeof(pr->pr_cpuset->cs_id));
2417 	if (error != 0 && error != ENOENT)
2418 		goto done;
2419 	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
2420 	if (error != 0 && error != ENOENT)
2421 		goto done;
2422 #ifdef INET
2423 	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_addrs[PR_INET]->pr_ip,
2424 	    pr->pr_addrs[PR_INET] ? pr->pr_addrs[PR_INET]->ips *
2425 	    pr_families[PR_INET].size : 0 );
2426 	if (error != 0 && error != ENOENT)
2427 		goto done;
2428 #endif
2429 #ifdef INET6
2430 	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_addrs[PR_INET6]->pr_ip,
2431 	    pr->pr_addrs[PR_INET6] ? pr->pr_addrs[PR_INET6]->ips *
2432 	    pr_families[PR_INET6].size : 0 );
2433 	if (error != 0 && error != ENOENT)
2434 		goto done;
2435 #endif
2436 	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
2437 	    sizeof(pr->pr_securelevel));
2438 	if (error != 0 && error != ENOENT)
2439 		goto done;
2440 	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
2441 	    sizeof(pr->pr_childcount));
2442 	if (error != 0 && error != ENOENT)
2443 		goto done;
2444 	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
2445 	    sizeof(pr->pr_childmax));
2446 	if (error != 0 && error != ENOENT)
2447 		goto done;
2448 	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
2449 	if (error != 0 && error != ENOENT)
2450 		goto done;
2451 	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
2452 	if (error != 0 && error != ENOENT)
2453 		goto done;
2454 	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
2455 	if (error != 0 && error != ENOENT)
2456 		goto done;
2457 #ifdef COMPAT_FREEBSD32
2458 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
2459 		uint32_t hid32 = pr->pr_hostid;
2460 
2461 		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
2462 	} else
2463 #endif
2464 	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
2465 	    sizeof(pr->pr_hostid));
2466 	if (error != 0 && error != ENOENT)
2467 		goto done;
2468 	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
2469 	    sizeof(pr->pr_enforce_statfs));
2470 	if (error != 0 && error != ENOENT)
2471 		goto done;
2472 	error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
2473 	    sizeof(pr->pr_devfs_rsnum));
2474 	if (error != 0 && error != ENOENT)
2475 		goto done;
2476 	for (bf = pr_flag_bool;
2477 	     bf < pr_flag_bool + nitems(pr_flag_bool);
2478 	     bf++) {
2479 		i = (pr->pr_flags & bf->flag) ? 1 : 0;
2480 		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2481 		if (error != 0 && error != ENOENT)
2482 			goto done;
2483 		i = !i;
2484 		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2485 		if (error != 0 && error != ENOENT)
2486 			goto done;
2487 	}
2488 	for (jsf = pr_flag_jailsys;
2489 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
2490 	     jsf++) {
2491 		f = pr->pr_flags & (jsf->disable | jsf->new);
2492 		i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE
2493 		    : (f == jsf->new) ? JAIL_SYS_NEW
2494 		    : JAIL_SYS_INHERIT;
2495 		error = vfs_setopt(opts, jsf->name, &i, sizeof(i));
2496 		if (error != 0 && error != ENOENT)
2497 			goto done;
2498 	}
2499 	for (bf = pr_flag_allow;
2500 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
2501 		atomic_load_int(&bf->flag) != 0;
2502 	     bf++) {
2503 		i = (pr->pr_allow & bf->flag) ? 1 : 0;
2504 		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2505 		if (error != 0 && error != ENOENT)
2506 			goto done;
2507 		i = !i;
2508 		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2509 		if (error != 0 && error != ENOENT)
2510 			goto done;
2511 	}
2512 	i = !prison_isalive(pr);
2513 	error = vfs_setopt(opts, "dying", &i, sizeof(i));
2514 	if (error != 0 && error != ENOENT)
2515 		goto done;
2516 	i = !i;
2517 	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2518 	if (error != 0 && error != ENOENT)
2519 		goto done;
2520 	error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
2521 	    sizeof(pr->pr_osreldate));
2522 	if (error != 0 && error != ENOENT)
2523 		goto done;
2524 	error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
2525 	if (error != 0 && error != ENOENT)
2526 		goto done;
2527 
2528 	/* Get the module parameters. */
2529 	mtx_unlock(&pr->pr_mtx);
2530 	drflags &= ~PD_LOCKED;
2531 	error = osd_jail_call(pr, PR_METHOD_GET, opts);
2532 	if (error)
2533 		goto done;
2534 	prison_deref(pr, drflags);
2535 	pr = NULL;
2536 	drflags = 0;
2537 
2538 	/* By now, all parameters should have been noted. */
2539 	TAILQ_FOREACH(opt, opts, link) {
2540 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
2541 			error = EINVAL;
2542 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
2543 			goto done;
2544 		}
2545 	}
2546 
2547 	/* Write the fetched parameters back to userspace. */
2548 	error = 0;
2549 	TAILQ_FOREACH(opt, opts, link) {
2550 		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2551 			pos = 2 * opt->pos + 1;
2552 			optuio->uio_iov[pos].iov_len = opt->len;
2553 			if (opt->value != NULL) {
2554 				if (optuio->uio_segflg == UIO_SYSSPACE) {
2555 					bcopy(opt->value,
2556 					    optuio->uio_iov[pos].iov_base,
2557 					    opt->len);
2558 				} else {
2559 					error = copyout(opt->value,
2560 					    optuio->uio_iov[pos].iov_base,
2561 					    opt->len);
2562 					if (error)
2563 						break;
2564 				}
2565 			}
2566 		}
2567 	}
2568 
2569  done:
2570 	/* Release any temporary prison holds and/or locks. */
2571 	if (pr != NULL)
2572 		prison_deref(pr, drflags);
2573 	else if (drflags & PD_LIST_SLOCKED)
2574 		sx_sunlock(&allprison_lock);
2575 	if (error && errmsg_pos >= 0) {
2576 		/* Write the error message back to userspace. */
2577 		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2578 		errmsg_pos = 2 * errmsg_pos + 1;
2579 		if (errmsg_len > 0) {
2580 			if (optuio->uio_segflg == UIO_SYSSPACE)
2581 				bcopy(errmsg,
2582 				    optuio->uio_iov[errmsg_pos].iov_base,
2583 				    errmsg_len);
2584 			else
2585 				(void)copyout(errmsg,
2586 				    optuio->uio_iov[errmsg_pos].iov_base,
2587 				    errmsg_len);
2588 		}
2589 	}
2590 	vfs_freeopts(opts);
2591 	return (error);
2592 }
2593 
2594 /*
2595  * struct jail_remove_args {
2596  *	int jid;
2597  * };
2598  */
2599 int
2600 sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
2601 {
2602 	struct prison *pr;
2603 	int error;
2604 
2605 	error = priv_check(td, PRIV_JAIL_REMOVE);
2606 	if (error)
2607 		return (error);
2608 
2609 	sx_xlock(&allprison_lock);
2610 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2611 	if (pr == NULL) {
2612 		sx_xunlock(&allprison_lock);
2613 		return (EINVAL);
2614 	}
2615 	if (!prison_isalive(pr)) {
2616 		/* Silently ignore already-dying prisons. */
2617 		mtx_unlock(&pr->pr_mtx);
2618 		sx_xunlock(&allprison_lock);
2619 		return (0);
2620 	}
2621 	prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED);
2622 	return (0);
2623 }
2624 
2625 /*
2626  * struct jail_attach_args {
2627  *	int jid;
2628  * };
2629  */
2630 int
2631 sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
2632 {
2633 	struct prison *pr;
2634 	int error;
2635 
2636 	error = priv_check(td, PRIV_JAIL_ATTACH);
2637 	if (error)
2638 		return (error);
2639 
2640 	sx_slock(&allprison_lock);
2641 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2642 	if (pr == NULL) {
2643 		sx_sunlock(&allprison_lock);
2644 		return (EINVAL);
2645 	}
2646 
2647 	/* Do not allow a process to attach to a prison that is not alive. */
2648 	if (!prison_isalive(pr)) {
2649 		mtx_unlock(&pr->pr_mtx);
2650 		sx_sunlock(&allprison_lock);
2651 		return (EINVAL);
2652 	}
2653 
2654 	return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED));
2655 }
2656 
2657 static int
2658 do_jail_attach(struct thread *td, struct prison *pr, int drflags)
2659 {
2660 	struct proc *p;
2661 	struct ucred *newcred, *oldcred;
2662 	int error;
2663 
2664 	mtx_assert(&pr->pr_mtx, MA_OWNED);
2665 	sx_assert(&allprison_lock, SX_LOCKED);
2666 	drflags &= PD_LOCK_FLAGS;
2667 	/*
2668 	 * XXX: Note that there is a slight race here if two threads
2669 	 * in the same privileged process attempt to attach to two
2670 	 * different jails at the same time.  It is important for
2671 	 * user processes not to do this, or they might end up with
2672 	 * a process root from one prison, but attached to the jail
2673 	 * of another.
2674 	 */
2675 	prison_hold(pr);
2676 	refcount_acquire(&pr->pr_uref);
2677 	drflags |= PD_DEREF | PD_DEUREF;
2678 	mtx_unlock(&pr->pr_mtx);
2679 	drflags &= ~PD_LOCKED;
2680 
2681 	/* Let modules do whatever they need to prepare for attaching. */
2682 	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
2683 	if (error) {
2684 		prison_deref(pr, drflags);
2685 		return (error);
2686 	}
2687 	sx_unlock(&allprison_lock);
2688 	drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED);
2689 
2690 	/*
2691 	 * Reparent the newly attached process to this jail.
2692 	 */
2693 	p = td->td_proc;
2694 	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
2695 	if (error)
2696 		goto e_revert_osd;
2697 
2698 	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
2699 	if ((error = change_dir(pr->pr_root, td)) != 0)
2700 		goto e_unlock;
2701 #ifdef MAC
2702 	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
2703 		goto e_unlock;
2704 #endif
2705 	VOP_UNLOCK(pr->pr_root);
2706 	if ((error = pwd_chroot_chdir(td, pr->pr_root)))
2707 		goto e_revert_osd;
2708 
2709 	newcred = crget();
2710 	PROC_LOCK(p);
2711 	oldcred = crcopysafe(p, newcred);
2712 	newcred->cr_prison = pr;
2713 	proc_set_cred(p, newcred);
2714 	setsugid(p);
2715 #ifdef RACCT
2716 	racct_proc_ucred_changed(p, oldcred, newcred);
2717 	crhold(newcred);
2718 #endif
2719 	PROC_UNLOCK(p);
2720 #ifdef RCTL
2721 	rctl_proc_ucred_changed(p, newcred);
2722 	crfree(newcred);
2723 #endif
2724 	prison_proc_relink(oldcred->cr_prison, pr, p);
2725 	prison_deref(oldcred->cr_prison, drflags);
2726 	crfree(oldcred);
2727 
2728 	/*
2729 	 * If the prison was killed while changing credentials, die along
2730 	 * with it.
2731 	 */
2732 	if (!prison_isalive(pr)) {
2733 		PROC_LOCK(p);
2734 		kern_psignal(p, SIGKILL);
2735 		PROC_UNLOCK(p);
2736 	}
2737 
2738 	return (0);
2739 
2740  e_unlock:
2741 	VOP_UNLOCK(pr->pr_root);
2742  e_revert_osd:
2743 	/* Tell modules this thread is still in its old jail after all. */
2744 	sx_slock(&allprison_lock);
2745 	drflags |= PD_LIST_SLOCKED;
2746 	(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
2747 	prison_deref(pr, drflags);
2748 	return (error);
2749 }
2750 
2751 /*
2752  * Returns a locked prison instance, or NULL on failure.
2753  */
2754 struct prison *
2755 prison_find(int prid)
2756 {
2757 	struct prison *pr;
2758 
2759 	sx_assert(&allprison_lock, SX_LOCKED);
2760 	TAILQ_FOREACH(pr, &allprison, pr_list) {
2761 		if (pr->pr_id < prid)
2762 			continue;
2763 		if (pr->pr_id > prid)
2764 			break;
2765 		KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr));
2766 		mtx_lock(&pr->pr_mtx);
2767 		return (pr);
2768 	}
2769 	return (NULL);
2770 }
2771 
2772 /*
2773  * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2774  */
2775 struct prison *
2776 prison_find_child(struct prison *mypr, int prid)
2777 {
2778 	struct prison *pr;
2779 	int descend;
2780 
2781 	sx_assert(&allprison_lock, SX_LOCKED);
2782 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2783 		if (pr->pr_id == prid) {
2784 			KASSERT(prison_isvalid(pr),
2785 			    ("Found invalid prison %p", pr));
2786 			mtx_lock(&pr->pr_mtx);
2787 			return (pr);
2788 		}
2789 	}
2790 	return (NULL);
2791 }
2792 
2793 /*
2794  * Look for the name relative to mypr.  Returns a locked prison or NULL.
2795  */
2796 struct prison *
2797 prison_find_name(struct prison *mypr, const char *name)
2798 {
2799 	struct prison *pr, *deadpr;
2800 	size_t mylen;
2801 	int descend;
2802 
2803 	sx_assert(&allprison_lock, SX_LOCKED);
2804 	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2805 	deadpr = NULL;
2806 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2807 		if (!strcmp(pr->pr_name + mylen, name)) {
2808 			KASSERT(prison_isvalid(pr),
2809 			    ("Found invalid prison %p", pr));
2810 			if (prison_isalive(pr)) {
2811 				mtx_lock(&pr->pr_mtx);
2812 				return (pr);
2813 			}
2814 			deadpr = pr;
2815 		}
2816 	}
2817 	/* There was no valid prison - perhaps there was a dying one. */
2818 	if (deadpr != NULL)
2819 		mtx_lock(&deadpr->pr_mtx);
2820 	return (deadpr);
2821 }
2822 
2823 /*
2824  * See if a prison has the specific flag set.  The prison should be locked,
2825  * unless checking for flags that are only set at jail creation (such as
2826  * PR_IP4 and PR_IP6), or only the single bit is examined, without regard
2827  * to any other prison data.
2828  */
2829 bool
2830 prison_flag(struct ucred *cred, unsigned flag)
2831 {
2832 
2833 	return ((cred->cr_prison->pr_flags & flag) != 0);
2834 }
2835 
2836 /*
2837  * See if a prison has the specific allow flag set.
2838  * The prison *should* be locked, or only a single bit is examined, without
2839  * regard to any other prison data.
2840  */
2841 bool
2842 prison_allow(struct ucred *cred, unsigned flag)
2843 {
2844 
2845 	return ((cred->cr_prison->pr_allow & flag) != 0);
2846 }
2847 
/*
 * Take a prison reference by incrementing pr_ref.  It is generally an
 * error to hold a prison that does not already have a reference.  A prison
 * record stays valid while it has at least one reference, and is not
 * removed while either the prison mutex or the allprison lock is held
 * (allprison_lock may be shared).
 *
 * This variant is for callers that already hold the prison mutex; it simply
 * forwards to prison_hold().
 */
void
prison_hold_locked(struct prison *pr)
{
	/* Locking is no longer required. */
	prison_hold(pr);
}
2862 
2863 void
2864 prison_hold(struct prison *pr)
2865 {
2866 #ifdef INVARIANTS
2867 	int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref);
2868 
2869 	KASSERT(was_valid,
2870 	    ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
2871 #else
2872 	refcount_acquire(&pr->pr_ref);
2873 #endif
2874 }
2875 
2876 /*
2877  * Remove a prison reference.  If that was the last reference, the
2878  * prison will be removed (at a later time).
2879  */
2880 void
2881 prison_free_locked(struct prison *pr)
2882 {
2883 
2884 	mtx_assert(&pr->pr_mtx, MA_OWNED);
2885 	/*
2886 	 * Locking is no longer required, but unlock because the caller
2887 	 * expects it.
2888 	 */
2889 	mtx_unlock(&pr->pr_mtx);
2890 	prison_free(pr);
2891 }
2892 
2893 void
2894 prison_free(struct prison *pr)
2895 {
2896 
2897 	KASSERT(refcount_load(&pr->pr_ref) > 0,
2898 	    ("Trying to free dead prison %p (jid=%d).",
2899 	     pr, pr->pr_id));
2900 	if (!refcount_release_if_not_last(&pr->pr_ref)) {
2901 		/*
2902 		 * Don't remove the last reference in this context,
2903 		 * in case there are locks held.
2904 		 */
2905 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2906 	}
2907 }
2908 
2909 static void
2910 prison_free_not_last(struct prison *pr)
2911 {
2912 #ifdef INVARIANTS
2913 	int lastref;
2914 
2915 	KASSERT(refcount_load(&pr->pr_ref) > 0,
2916 	    ("Trying to free dead prison %p (jid=%d).",
2917 	     pr, pr->pr_id));
2918 	lastref = refcount_release(&pr->pr_ref);
2919 	KASSERT(!lastref,
2920 	    ("prison_free_not_last freed last ref on prison %p (jid=%d).",
2921 	     pr, pr->pr_id));
2922 #else
2923 	refcount_release(&pr->pr_ref);
2924 #endif
2925 }
2926 
2927 /*
2928  * Hold a prison for user visibility, by incrementing pr_uref.
2929  * It is generally an error to hold a prison that isn't already
2930  * user-visible, except through the jail system calls.  It is also
2931  * an error to hold an invalid prison.  A prison record will remain
2932  * alive as long as it has at least one user reference, and will not
2933  * be set to the dying state until the prison mutex and allprison_lock
2934  * are both freed.
2935  */
2936 void
2937 prison_proc_hold(struct prison *pr)
2938 {
2939 #ifdef INVARIANTS
2940 	int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref);
2941 
2942 	KASSERT(was_alive,
2943 	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2944 #else
2945 	refcount_acquire(&pr->pr_uref);
2946 #endif
2947 }
2948 
2949 /*
2950  * Remove a prison user reference.  If it was the last reference, the
2951  * prison will be considered "dying", and may be removed once all of
2952  * its references are dropped.
2953  */
2954 void
2955 prison_proc_free(struct prison *pr)
2956 {
2957 
2958 	/*
2959 	 * Locking is only required when releasing the last reference.
2960 	 * This allows assurance that a locked prison will remain alive
2961 	 * until it is unlocked.
2962 	 */
2963 	KASSERT(refcount_load(&pr->pr_uref) > 0,
2964 	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2965 	if (!refcount_release_if_not_last(&pr->pr_uref)) {
2966 		/*
2967 		 * Don't remove the last user reference in this context,
2968 		 * which is expected to be a process that is not only locked,
2969 		 * but also half dead.  Add a reference so any calls to
2970 		 * prison_free() won't re-submit the task.
2971 		 */
2972 		prison_hold(pr);
2973 		mtx_lock(&pr->pr_mtx);
2974 		KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC),
2975 		    ("Redundant last reference in prison_proc_free (jid=%d)",
2976 		     pr->pr_id));
2977 		pr->pr_flags |= PR_COMPLETE_PROC;
2978 		mtx_unlock(&pr->pr_mtx);
2979 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2980 	}
2981 }
2982 
2983 static void
2984 prison_proc_free_not_last(struct prison *pr)
2985 {
2986 #ifdef INVARIANTS
2987 	int lastref;
2988 
2989 	KASSERT(refcount_load(&pr->pr_uref) > 0,
2990 	    ("Trying to free dead prison %p (jid=%d).",
2991 	     pr, pr->pr_id));
2992 	lastref = refcount_release(&pr->pr_uref);
2993 	KASSERT(!lastref,
2994 	    ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).",
2995 	     pr, pr->pr_id));
2996 #else
2997 	refcount_release(&pr->pr_uref);
2998 #endif
2999 }
3000 
3001 void
3002 prison_proc_link(struct prison *pr, struct proc *p)
3003 {
3004 
3005 	sx_assert(&allproc_lock, SA_XLOCKED);
3006 	LIST_INSERT_HEAD(&pr->pr_proclist, p, p_jaillist);
3007 }
3008 
3009 void
3010 prison_proc_unlink(struct prison *pr, struct proc *p)
3011 {
3012 
3013 	sx_assert(&allproc_lock, SA_XLOCKED);
3014 	LIST_REMOVE(p, p_jaillist);
3015 }
3016 
3017 static void
3018 prison_proc_relink(struct prison *opr, struct prison *npr, struct proc *p)
3019 {
3020 
3021 	sx_xlock(&allproc_lock);
3022 	prison_proc_unlink(opr, p);
3023 	prison_proc_link(npr, p);
3024 	sx_xunlock(&allproc_lock);
3025 }
3026 
3027 /*
3028  * Complete a call to either prison_free or prison_proc_free.
3029  */
3030 static void
3031 prison_complete(void *context, int pending)
3032 {
3033 	struct prison *pr = context;
3034 	int drflags;
3035 
3036 	/*
3037 	 * This could be called to release the last reference, or the last
3038 	 * user reference (plus the reference held in prison_proc_free).
3039 	 */
3040 	drflags = prison_lock_xlock(pr, PD_DEREF);
3041 	if (pr->pr_flags & PR_COMPLETE_PROC) {
3042 		pr->pr_flags &= ~PR_COMPLETE_PROC;
3043 		drflags |= PD_DEUREF;
3044 	}
3045 	prison_deref(pr, drflags);
3046 }
3047 
3048 static void
3049 prison_kill_processes_cb(struct proc *p, void *arg __unused)
3050 {
3051 
3052 	kern_psignal(p, SIGKILL);
3053 }
3054 
3055 /*
3056  * Note the iteration does not guarantee acting on all processes.
3057  * Most notably there may be fork or jail_attach in progress.
3058  */
3059 void
3060 prison_proc_iterate(struct prison *pr, void (*cb)(struct proc *, void *),
3061     void *cbarg)
3062 {
3063 	struct prison *ppr;
3064 	struct proc *p;
3065 
3066 	if (atomic_load_int(&pr->pr_childcount) == 0) {
3067 		sx_slock(&allproc_lock);
3068 		LIST_FOREACH(p, &pr->pr_proclist, p_jaillist) {
3069 			if (p->p_state == PRS_NEW)
3070 				continue;
3071 			PROC_LOCK(p);
3072 			cb(p, cbarg);
3073 			PROC_UNLOCK(p);
3074 		}
3075 		sx_sunlock(&allproc_lock);
3076 		if (atomic_load_int(&pr->pr_childcount) == 0)
3077 			return;
3078 		/*
3079 		 * Some jails popped up during the iteration, fall through to a
3080 		 * system-wide search.
3081 		 */
3082 	}
3083 
3084 	sx_slock(&allproc_lock);
3085 	FOREACH_PROC_IN_SYSTEM(p) {
3086 		PROC_LOCK(p);
3087 		if (p->p_state != PRS_NEW && p->p_ucred != NULL) {
3088 			for (ppr = p->p_ucred->cr_prison;
3089 			    ppr != &prison0;
3090 			    ppr = ppr->pr_parent) {
3091 				if (ppr == pr) {
3092 					cb(p, cbarg);
3093 					break;
3094 				}
3095 			}
3096 		}
3097 		PROC_UNLOCK(p);
3098 	}
3099 	sx_sunlock(&allproc_lock);
3100 }
3101 
3102 /*
3103  * Remove a prison reference and/or user reference (usually).
3104  * This assumes context that allows sleeping (for allprison_lock),
3105  * with no non-sleeping locks held, except perhaps the prison itself.
3106  * If there are no more references, release and delist the prison.
3107  * On completion, the prison lock and the allprison lock are both
3108  * unlocked.
3109  */
3110 static void
3111 prison_deref(struct prison *pr, int flags)
3112 {
3113 	struct prisonlist freeprison;
3114 	struct prison *killpr, *rpr, *ppr, *tpr;
3115 
3116 	killpr = NULL;
3117 	TAILQ_INIT(&freeprison);
3118 	/*
3119 	 * Release this prison as requested, which may cause its parent
3120 	 * to be released, and then maybe its grandparent, etc.
3121 	 */
3122 	for (;;) {
3123 		if (flags & PD_KILL) {
3124 			/* Kill the prison and its descendents. */
3125 			KASSERT(pr != &prison0,
3126 			    ("prison_deref trying to kill prison0"));
3127 			if (!(flags & PD_DEREF)) {
3128 				prison_hold(pr);
3129 				flags |= PD_DEREF;
3130 			}
3131 			flags = prison_lock_xlock(pr, flags);
3132 			prison_deref_kill(pr, &freeprison);
3133 		}
3134 		if (flags & PD_DEUREF) {
3135 			/* Drop a user reference. */
3136 			KASSERT(refcount_load(&pr->pr_uref) > 0,
3137 			    ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
3138 			     pr->pr_id));
3139 			if (!refcount_release_if_not_last(&pr->pr_uref)) {
3140 				if (!(flags & PD_DEREF)) {
3141 					prison_hold(pr);
3142 					flags |= PD_DEREF;
3143 				}
3144 				flags = prison_lock_xlock(pr, flags);
3145 				if (refcount_release(&pr->pr_uref) &&
3146 				    pr->pr_state == PRISON_STATE_ALIVE) {
3147 					/*
3148 					 * When the last user references goes,
3149 					 * this becomes a dying prison.
3150 					 */
3151 					KASSERT(
3152 					    refcount_load(&prison0.pr_uref) > 0,
3153 					    ("prison0 pr_uref=0"));
3154 					pr->pr_state = PRISON_STATE_DYING;
3155 					mtx_unlock(&pr->pr_mtx);
3156 					flags &= ~PD_LOCKED;
3157 					prison_cleanup(pr);
3158 				}
3159 			}
3160 		}
3161 		if (flags & PD_KILL) {
3162 			/*
3163 			 * Any remaining user references are probably processes
3164 			 * that need to be killed, either in this prison or its
3165 			 * descendants.
3166 			 */
3167 			if (refcount_load(&pr->pr_uref) > 0)
3168 				killpr = pr;
3169 			/* Make sure the parent prison doesn't get killed. */
3170 			flags &= ~PD_KILL;
3171 		}
3172 		if (flags & PD_DEREF) {
3173 			/* Drop a reference. */
3174 			KASSERT(refcount_load(&pr->pr_ref) > 0,
3175 			    ("prison_deref PD_DEREF on a dead prison (jid=%d)",
3176 			     pr->pr_id));
3177 			if (!refcount_release_if_not_last(&pr->pr_ref)) {
3178 				flags = prison_lock_xlock(pr, flags);
3179 				if (refcount_release(&pr->pr_ref)) {
3180 					/*
3181 					 * When the last reference goes,
3182 					 * unlink the prison and set it aside.
3183 					 */
3184 					KASSERT(
3185 					    refcount_load(&pr->pr_uref) == 0,
3186 					    ("prison_deref: last ref, "
3187 					     "but still has %d urefs (jid=%d)",
3188 					     pr->pr_uref, pr->pr_id));
3189 					KASSERT(
3190 					    refcount_load(&prison0.pr_ref) != 0,
3191 					    ("prison0 pr_ref=0"));
3192 					pr->pr_state = PRISON_STATE_INVALID;
3193 					TAILQ_REMOVE(&allprison, pr, pr_list);
3194 					LIST_REMOVE(pr, pr_sibling);
3195 					TAILQ_INSERT_TAIL(&freeprison, pr,
3196 					    pr_list);
3197 					for (ppr = pr->pr_parent;
3198 					     ppr != NULL;
3199 					     ppr = ppr->pr_parent)
3200 						ppr->pr_childcount--;
3201 					/*
3202 					 * Removing a prison frees references
3203 					 * from its parent.
3204 					 */
3205 					mtx_unlock(&pr->pr_mtx);
3206 					flags &= ~PD_LOCKED;
3207 					pr = pr->pr_parent;
3208 					flags |= PD_DEREF | PD_DEUREF;
3209 					continue;
3210 				}
3211 			}
3212 		}
3213 		break;
3214 	}
3215 
3216 	/* Release all the prison locks. */
3217 	if (flags & PD_LOCKED)
3218 		mtx_unlock(&pr->pr_mtx);
3219 	if (flags & PD_LIST_SLOCKED)
3220 		sx_sunlock(&allprison_lock);
3221 	else if (flags & PD_LIST_XLOCKED)
3222 		sx_xunlock(&allprison_lock);
3223 
3224 	/* Kill any processes attached to a killed prison. */
3225 	if (killpr != NULL)
3226 		prison_proc_iterate(killpr, prison_kill_processes_cb, NULL);
3227 
3228 	/*
3229 	 * Finish removing any unreferenced prisons, which couldn't happen
3230 	 * while allprison_lock was held (to avoid a LOR on vrele).
3231 	 */
3232 	TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) {
3233 #ifdef VIMAGE
3234 		if (rpr->pr_vnet != rpr->pr_parent->pr_vnet)
3235 			vnet_destroy(rpr->pr_vnet);
3236 #endif
3237 		if (rpr->pr_root != NULL)
3238 			vrele(rpr->pr_root);
3239 		mtx_destroy(&rpr->pr_mtx);
3240 #ifdef INET
3241 		prison_ip_free(rpr->pr_addrs[PR_INET]);
3242 #endif
3243 #ifdef INET6
3244 		prison_ip_free(rpr->pr_addrs[PR_INET6]);
3245 #endif
3246 		if (rpr->pr_cpuset != NULL)
3247 			cpuset_rel(rpr->pr_cpuset);
3248 		osd_jail_exit(rpr);
3249 #ifdef RACCT
3250 		if (racct_enable)
3251 			prison_racct_detach(rpr);
3252 #endif
3253 		TAILQ_REMOVE(&freeprison, rpr, pr_list);
3254 		free(rpr, M_PRISON);
3255 	}
3256 }
3257 
/*
 * Kill the prison and its descendants.  Mark them as dying, clear the
 * persist flag, and call module remove methods.
 *
 * pr is the target prison.  Its mutex is released shortly after entry and
 * is held again when the function returns, so the caller is expected to
 * enter with pr->pr_mtx locked.  prison_cleanup() is called from here, and
 * it asserts that allprison_lock is held exclusively.
 *
 * freeprison is the list onto which any descendant whose last reference is
 * dropped here gets moved (after being taken off allprison); the caller is
 * left to finish releasing those prisons.
 */
static void
prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
{
	struct prison *cpr, *ppr, *rpr;
	bool descend;

	/*
	 * Unlike the descendants, the target prison can be killed
	 * even if it is currently dying.  This is useful for failed
	 * creation in jail_set(2).
	 */
	KASSERT(refcount_load(&pr->pr_ref) > 0,
	    ("Trying to kill dead prison %p (jid=%d).",
	     pr, pr->pr_id));
	/*
	 * Take a temporary user reference on the target; it is released
	 * again at the very end of this function.
	 */
	refcount_acquire(&pr->pr_uref);
	pr->pr_state = PRISON_STATE_DYING;
	mtx_unlock(&pr->pr_mtx);

	/* rpr tracks the most recent child whose sibling unlink is pending. */
	rpr = NULL;
	/*
	 * NOTE(review): the iterator macro is defined outside this view; from
	 * its use here the body runs with descend set on the way down and
	 * clear on the way back up -- confirm against the macro definition.
	 */
	FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) {
		if (descend) {
			/* Way down: skip prisons (and subtrees) not alive. */
			if (!prison_isalive(cpr)) {
				descend = false;
				continue;
			}
			/*
			 * Pin the child with both a reference and a user
			 * reference, then mark it dying and tag it with
			 * PR_REMOVE so the second visit knows to process it.
			 */
			prison_hold(cpr);
			prison_proc_hold(cpr);
			mtx_lock(&cpr->pr_mtx);
			cpr->pr_state = PRISON_STATE_DYING;
			cpr->pr_flags |= PR_REMOVE;
			mtx_unlock(&cpr->pr_mtx);
			continue;
		}
		/* Second visit: only handle children tagged above. */
		if (!(cpr->pr_flags & PR_REMOVE))
			continue;
		/* prison_cleanup() asserts the prison mutex is not owned. */
		prison_cleanup(cpr);
		mtx_lock(&cpr->pr_mtx);
		cpr->pr_flags &= ~PR_REMOVE;
		if (cpr->pr_flags & PR_PERSIST) {
			/* Clearing persist drops the references it stood for. */
			cpr->pr_flags &= ~PR_PERSIST;
			prison_proc_free_not_last(cpr);
			prison_free_not_last(cpr);
		}
		/* Release the uref and ref taken on the first visit. */
		(void)refcount_release(&cpr->pr_uref);
		if (refcount_release(&cpr->pr_ref)) {
			/*
			 * When the last reference goes, unlink the prison
			 * and set it aside for prison_deref() to handle.
			 * Delay unlinking the sibling list to keep the loop
			 * safe.
			 */
			if (rpr != NULL)
				LIST_REMOVE(rpr, pr_sibling);
			rpr = cpr;
			rpr->pr_state = PRISON_STATE_INVALID;
			TAILQ_REMOVE(&allprison, rpr, pr_list);
			TAILQ_INSERT_TAIL(freeprison, rpr, pr_list);
			/*
			 * Removing a prison frees references from its parent.
			 */
			ppr = rpr->pr_parent;
			prison_proc_free_not_last(ppr);
			prison_free_not_last(ppr);
			/* Every ancestor now has one descendant fewer. */
			for (; ppr != NULL; ppr = ppr->pr_parent)
				ppr->pr_childcount--;
		}
		mtx_unlock(&cpr->pr_mtx);
	}
	/* Perform the final deferred sibling-list unlink, if any. */
	if (rpr != NULL)
		LIST_REMOVE(rpr, pr_sibling);

	/* Finally give the target itself the same treatment. */
	prison_cleanup(pr);
	mtx_lock(&pr->pr_mtx);
	if (pr->pr_flags & PR_PERSIST) {
		pr->pr_flags &= ~PR_PERSIST;
		prison_proc_free_not_last(pr);
		prison_free_not_last(pr);
	}
	/* Drop the temporary user reference; pr->pr_mtx stays locked. */
	(void)refcount_release(&pr->pr_uref);
}
3342 
3343 /*
3344  * Given the current locking state in the flags, make sure allprison_lock
3345  * is held exclusive, and the prison is locked.  Return flags indicating
3346  * the new state.
3347  */
3348 static int
3349 prison_lock_xlock(struct prison *pr, int flags)
3350 {
3351 
3352 	if (!(flags & PD_LIST_XLOCKED)) {
3353 		/*
3354 		 * Get allprison_lock, which may be an upgrade,
3355 		 * and may require unlocking the prison.
3356 		 */
3357 		if (flags & PD_LOCKED) {
3358 			mtx_unlock(&pr->pr_mtx);
3359 			flags &= ~PD_LOCKED;
3360 		}
3361 		if (flags & PD_LIST_SLOCKED) {
3362 			if (!sx_try_upgrade(&allprison_lock)) {
3363 				sx_sunlock(&allprison_lock);
3364 				sx_xlock(&allprison_lock);
3365 			}
3366 			flags &= ~PD_LIST_SLOCKED;
3367 		} else
3368 			sx_xlock(&allprison_lock);
3369 		flags |= PD_LIST_XLOCKED;
3370 	}
3371 	if (!(flags & PD_LOCKED)) {
3372 		/* Lock the prison mutex. */
3373 		mtx_lock(&pr->pr_mtx);
3374 		flags |= PD_LOCKED;
3375 	}
3376 	return flags;
3377 }
3378 
/*
 * Release a prison's resources when it starts dying (when the last user
 * reference is dropped, or when it is killed).
 *
 * Locking: allprison_lock must be held exclusively and the prison's own
 * mutex must not be held by the caller; both conditions are asserted.
 */
static void
prison_cleanup(struct prison *pr)
{
	sx_assert(&allprison_lock, SA_XLOCKED);
	mtx_assert(&pr->pr_mtx, MA_NOTOWNED);
	vfs_exjail_delete(pr);
	shm_remove_prison(pr);
	/* Run the PR_METHOD_REMOVE OSD methods; their result is ignored. */
	(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
}
3392 
3393 /*
3394  * Set or clear a permission bit in the pr_allow field, passing restrictions
3395  * (cleared permission) down to child jails.
3396  */
3397 void
3398 prison_set_allow(struct ucred *cred, unsigned flag, int enable)
3399 {
3400 	struct prison *pr;
3401 
3402 	pr = cred->cr_prison;
3403 	sx_slock(&allprison_lock);
3404 	mtx_lock(&pr->pr_mtx);
3405 	prison_set_allow_locked(pr, flag, enable);
3406 	mtx_unlock(&pr->pr_mtx);
3407 	sx_sunlock(&allprison_lock);
3408 }
3409 
3410 static void
3411 prison_set_allow_locked(struct prison *pr, unsigned flag, int enable)
3412 {
3413 	struct prison *cpr;
3414 	int descend;
3415 
3416 	if (enable != 0)
3417 		pr->pr_allow |= flag;
3418 	else {
3419 		pr->pr_allow &= ~flag;
3420 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
3421 			cpr->pr_allow &= ~flag;
3422 	}
3423 }
3424 
3425 /*
3426  * Check if a jail supports the given address family.
3427  *
3428  * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3429  * if not.
3430  */
3431 int
3432 prison_check_af(struct ucred *cred, int af)
3433 {
3434 	struct prison *pr;
3435 	int error;
3436 
3437 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3438 
3439 	pr = cred->cr_prison;
3440 #ifdef VIMAGE
3441 	/* Prisons with their own network stack are not limited. */
3442 	if (prison_owns_vnet(cred))
3443 		return (0);
3444 #endif
3445 
3446 	error = 0;
3447 	switch (af)
3448 	{
3449 #ifdef INET
3450 	case AF_INET:
3451 		if (pr->pr_flags & PR_IP4)
3452 		{
3453 			mtx_lock(&pr->pr_mtx);
3454 			if ((pr->pr_flags & PR_IP4) &&
3455 			    pr->pr_addrs[PR_INET] == NULL)
3456 				error = EAFNOSUPPORT;
3457 			mtx_unlock(&pr->pr_mtx);
3458 		}
3459 		break;
3460 #endif
3461 #ifdef INET6
3462 	case AF_INET6:
3463 		if (pr->pr_flags & PR_IP6)
3464 		{
3465 			mtx_lock(&pr->pr_mtx);
3466 			if ((pr->pr_flags & PR_IP6) &&
3467 			    pr->pr_addrs[PR_INET6] == NULL)
3468 				error = EAFNOSUPPORT;
3469 			mtx_unlock(&pr->pr_mtx);
3470 		}
3471 		break;
3472 #endif
3473 	case AF_LOCAL:
3474 	case AF_ROUTE:
3475 	case AF_NETLINK:
3476 		break;
3477 	default:
3478 		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3479 			error = EAFNOSUPPORT;
3480 	}
3481 	return (error);
3482 }
3483 
3484 /*
3485  * Check if given address belongs to the jail referenced by cred (wrapper to
3486  * prison_check_ip[46]).
3487  *
3488  * Returns 0 if jail doesn't restrict the address family or if address belongs
3489  * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3490  * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3491  */
3492 int
3493 prison_if(struct ucred *cred, const struct sockaddr *sa)
3494 {
3495 #ifdef INET
3496 	const struct sockaddr_in *sai;
3497 #endif
3498 #ifdef INET6
3499 	const struct sockaddr_in6 *sai6;
3500 #endif
3501 	int error;
3502 
3503 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3504 	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3505 
3506 #ifdef VIMAGE
3507 	if (prison_owns_vnet(cred))
3508 		return (0);
3509 #endif
3510 
3511 	error = 0;
3512 	switch (sa->sa_family)
3513 	{
3514 #ifdef INET
3515 	case AF_INET:
3516 		sai = (const struct sockaddr_in *)sa;
3517 		error = prison_check_ip4(cred, &sai->sin_addr);
3518 		break;
3519 #endif
3520 #ifdef INET6
3521 	case AF_INET6:
3522 		sai6 = (const struct sockaddr_in6 *)sa;
3523 		error = prison_check_ip6(cred, &sai6->sin6_addr);
3524 		break;
3525 #endif
3526 	default:
3527 		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3528 			error = EAFNOSUPPORT;
3529 	}
3530 	return (error);
3531 }
3532 
3533 /*
3534  * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3535  */
3536 int
3537 prison_check(struct ucred *cred1, struct ucred *cred2)
3538 {
3539 
3540 	return ((cred1->cr_prison == cred2->cr_prison ||
3541 	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3542 }
3543 
3544 /*
3545  * For mountd/nfsd to run within a prison, it must be:
3546  * - A vnet prison.
3547  * - PR_ALLOW_NFSD must be set on it.
3548  * - The root directory (pr_root) of the prison must be
3549  *   a file system mount point, so the mountd can hang
3550  *   export information on it.
3551  * - The prison's enforce_statfs cannot be 0, so that
3552  *   mountd(8) can do exports.
3553  */
3554 bool
3555 prison_check_nfsd(struct ucred *cred)
3556 {
3557 
3558 	if (jailed_without_vnet(cred))
3559 		return (false);
3560 	if (!prison_allow(cred, PR_ALLOW_NFSD))
3561 		return (false);
3562 	if ((cred->cr_prison->pr_root->v_vflag & VV_ROOT) == 0)
3563 		return (false);
3564 	if (cred->cr_prison->pr_enforce_statfs == 0)
3565 		return (false);
3566 	return (true);
3567 }
3568 
3569 /*
3570  * Return true if p2 is a child of p1, otherwise false.
3571  */
3572 bool
3573 prison_ischild(struct prison *pr1, struct prison *pr2)
3574 {
3575 
3576 	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3577 		if (pr1 == pr2)
3578 			return (true);
3579 	return (false);
3580 }
3581 
3582 /*
3583  * Return true if the prison is currently alive.  A prison is alive if it
3584  * holds user references and it isn't being removed.
3585  */
3586 bool
3587 prison_isalive(const struct prison *pr)
3588 {
3589 
3590 	if (__predict_false(pr->pr_state != PRISON_STATE_ALIVE))
3591 		return (false);
3592 	return (true);
3593 }
3594 
3595 /*
3596  * Return true if the prison is currently valid.  A prison is valid if it has
3597  * been fully created, and is not being destroyed.  Note that dying prisons
3598  * are still considered valid.  Invalid prisons won't be found under normal
3599  * circumstances, as they're only put in that state by functions that have
3600  * an exclusive hold on allprison_lock.
3601  */
3602 bool
3603 prison_isvalid(struct prison *pr)
3604 {
3605 
3606 	if (__predict_false(pr->pr_state == PRISON_STATE_INVALID))
3607 		return (false);
3608 	if (__predict_false(refcount_load(&pr->pr_ref) == 0))
3609 		return (false);
3610 	return (true);
3611 }
3612 
3613 /*
3614  * Return true if the passed credential is in a jail and that jail does not
3615  * have its own virtual network stack, otherwise false.
3616  */
3617 bool
3618 jailed_without_vnet(struct ucred *cred)
3619 {
3620 
3621 	if (!jailed(cred))
3622 		return (false);
3623 #ifdef VIMAGE
3624 	if (prison_owns_vnet(cred))
3625 		return (false);
3626 #endif
3627 
3628 	return (true);
3629 }
3630 
3631 /*
3632  * Return the correct hostname (domainname, et al) for the passed credential.
3633  */
3634 void
3635 getcredhostname(struct ucred *cred, char *buf, size_t size)
3636 {
3637 	struct prison *pr;
3638 
3639 	/*
3640 	 * A NULL credential can be used to shortcut to the physical
3641 	 * system's hostname.
3642 	 */
3643 	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3644 	mtx_lock(&pr->pr_mtx);
3645 	strlcpy(buf, pr->pr_hostname, size);
3646 	mtx_unlock(&pr->pr_mtx);
3647 }
3648 
3649 void
3650 getcreddomainname(struct ucred *cred, char *buf, size_t size)
3651 {
3652 
3653 	mtx_lock(&cred->cr_prison->pr_mtx);
3654 	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3655 	mtx_unlock(&cred->cr_prison->pr_mtx);
3656 }
3657 
3658 void
3659 getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3660 {
3661 
3662 	mtx_lock(&cred->cr_prison->pr_mtx);
3663 	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3664 	mtx_unlock(&cred->cr_prison->pr_mtx);
3665 }
3666 
3667 void
3668 getcredhostid(struct ucred *cred, unsigned long *hostid)
3669 {
3670 
3671 	mtx_lock(&cred->cr_prison->pr_mtx);
3672 	*hostid = cred->cr_prison->pr_hostid;
3673 	mtx_unlock(&cred->cr_prison->pr_mtx);
3674 }
3675 
3676 void
3677 getjailname(struct ucred *cred, char *name, size_t len)
3678 {
3679 
3680 	mtx_lock(&cred->cr_prison->pr_mtx);
3681 	strlcpy(name, cred->cr_prison->pr_name, len);
3682 	mtx_unlock(&cred->cr_prison->pr_mtx);
3683 }
3684 
#ifdef VIMAGE
/*
 * Report whether the prison of cred has its own vnet (PR_VNET set in its
 * flags) as opposed to using an inherited one.
 *
 * Returns true if the prison owns the vnet, false otherwise.
 */
bool
prison_owns_vnet(struct ucred *cred)
{
	const struct prison *pr;

	/*
	 * No locking: vnets cannot be added/removed after jail creation,
	 * so the flag is stable.
	 */
	pr = cred->cr_prison;
	if ((pr->pr_flags & PR_VNET) == 0)
		return (false);
	return (true);
}
#endif
3703 
3704 /*
3705  * Determine whether the subject represented by cred can "see"
3706  * status of a mount point.
3707  * Returns: 0 for permitted, ENOENT otherwise.
3708  * XXX: This function should be called cr_canseemount() and should be
3709  *      placed in kern_prot.c.
3710  */
3711 int
3712 prison_canseemount(struct ucred *cred, struct mount *mp)
3713 {
3714 	struct prison *pr;
3715 	struct statfs *sp;
3716 	size_t len;
3717 
3718 	pr = cred->cr_prison;
3719 	if (pr->pr_enforce_statfs == 0)
3720 		return (0);
3721 	if (pr->pr_root->v_mount == mp)
3722 		return (0);
3723 	if (pr->pr_enforce_statfs == 2)
3724 		return (ENOENT);
3725 	/*
3726 	 * If jail's chroot directory is set to "/" we should be able to see
3727 	 * all mount-points from inside a jail.
3728 	 * This is ugly check, but this is the only situation when jail's
3729 	 * directory ends with '/'.
3730 	 */
3731 	if (strcmp(pr->pr_path, "/") == 0)
3732 		return (0);
3733 	len = strlen(pr->pr_path);
3734 	sp = &mp->mnt_stat;
3735 	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3736 		return (ENOENT);
3737 	/*
3738 	 * Be sure that we don't have situation where jail's root directory
3739 	 * is "/some/path" and mount point is "/some/pathpath".
3740 	 */
3741 	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3742 		return (ENOENT);
3743 	return (0);
3744 }
3745 
3746 void
3747 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3748 {
3749 	char jpath[MAXPATHLEN];
3750 	struct prison *pr;
3751 	size_t len;
3752 
3753 	pr = cred->cr_prison;
3754 	if (pr->pr_enforce_statfs == 0)
3755 		return;
3756 	if (prison_canseemount(cred, mp) != 0) {
3757 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3758 		strlcpy(sp->f_mntonname, "[restricted]",
3759 		    sizeof(sp->f_mntonname));
3760 		return;
3761 	}
3762 	if (pr->pr_root->v_mount == mp) {
3763 		/*
3764 		 * Clear current buffer data, so we are sure nothing from
3765 		 * the valid path left there.
3766 		 */
3767 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3768 		*sp->f_mntonname = '/';
3769 		return;
3770 	}
3771 	/*
3772 	 * If jail's chroot directory is set to "/" we should be able to see
3773 	 * all mount-points from inside a jail.
3774 	 */
3775 	if (strcmp(pr->pr_path, "/") == 0)
3776 		return;
3777 	len = strlen(pr->pr_path);
3778 	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3779 	/*
3780 	 * Clear current buffer data, so we are sure nothing from
3781 	 * the valid path left there.
3782 	 */
3783 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3784 	if (*jpath == '\0') {
3785 		/* Should never happen. */
3786 		*sp->f_mntonname = '/';
3787 	} else {
3788 		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3789 	}
3790 }
3791 
/*
 * Check whether permission for a specific privilege is granted within jail.
 * We have a specific list of accepted privileges; the rest are denied.
 *
 * Returns 0 if cred is not jailed or the privilege is granted, and EPERM
 * if it is denied.  Privileges listed in the VIMAGE-only switch are
 * granted to prisons with PR_VNET set; whatever is not granted there falls
 * through to the general switch below it.
 */
int
prison_priv_check(struct ucred *cred, int priv)
{
	struct prison *pr;
	int error;

	/*
	 * Some policies have custom handlers. This routine should not be
	 * called for them. See priv_check_cred().
	 */
	switch (priv) {
	case PRIV_VFS_LOOKUP:
	case PRIV_VFS_GENERATION:
		KASSERT(0, ("prison_priv_check instead of a custom handler "
		    "called for %d\n", priv));
	}

	if (!jailed(cred))
		return (0);

#ifdef VIMAGE
	/*
	 * Privileges specific to prisons with a virtual network stack.
	 * There might be a duplicate entry here in case the privilege
	 * is only granted conditionally in the legacy jail case.
	 */
	switch (priv) {
		/*
		 * NFS-specific privileges.
		 */
	case PRIV_NFS_DAEMON:
	case PRIV_VFS_GETFH:
	case PRIV_VFS_MOUNT_EXPORTED:
		if (!prison_check_nfsd(cred))
			return (EPERM);
		/* FALLTHROUGH */
#ifdef notyet
	case PRIV_NFS_LOCKD:
#endif
		/*
		 * Network stack privileges.
		 */
	case PRIV_NET_BRIDGE:
	case PRIV_NET_GRE:
	case PRIV_NET_BPF:
	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
	case PRIV_NET_ROUTE:
	case PRIV_NET_TAP:
	case PRIV_NET_SETIFMTU:
	case PRIV_NET_SETIFFLAGS:
	case PRIV_NET_SETIFCAP:
	case PRIV_NET_SETIFDESCR:
	case PRIV_NET_SETIFNAME	:
	case PRIV_NET_SETIFMETRIC:
	case PRIV_NET_SETIFPHYS:
	case PRIV_NET_SETIFMAC:
	case PRIV_NET_SETLANPCP:
	case PRIV_NET_ADDMULTI:
	case PRIV_NET_DELMULTI:
	case PRIV_NET_HWIOCTL:
	case PRIV_NET_SETLLADDR:
	case PRIV_NET_ADDIFGROUP:
	case PRIV_NET_DELIFGROUP:
	case PRIV_NET_IFCREATE:
	case PRIV_NET_IFDESTROY:
	case PRIV_NET_ADDIFADDR:
	case PRIV_NET_DELIFADDR:
	case PRIV_NET_LAGG:
	case PRIV_NET_GIF:
	case PRIV_NET_SETIFVNET:
	case PRIV_NET_SETIFFIB:
	case PRIV_NET_OVPN:
	case PRIV_NET_ME:
	case PRIV_NET_WG:

		/*
		 * 802.11-related privileges.
		 */
	case PRIV_NET80211_VAP_GETKEY:
	case PRIV_NET80211_VAP_MANAGE:

#ifdef notyet
		/*
		 * ATM privileges.
		 */
	case PRIV_NETATM_CFG:
	case PRIV_NETATM_ADD:
	case PRIV_NETATM_DEL:
	case PRIV_NETATM_SET:

		/*
		 * Bluetooth privileges.
		 */
	case PRIV_NETBLUETOOTH_RAW:
#endif

		/*
		 * Netgraph and netgraph module privileges.
		 */
	case PRIV_NETGRAPH_CONTROL:
#ifdef notyet
	case PRIV_NETGRAPH_TTY:
#endif

		/*
		 * IPv4 and IPv6 privileges.
		 */
	case PRIV_NETINET_IPFW:
	case PRIV_NETINET_DIVERT:
	case PRIV_NETINET_PF:
	case PRIV_NETINET_DUMMYNET:
	case PRIV_NETINET_CARP:
	case PRIV_NETINET_MROUTE:
	case PRIV_NETINET_RAW:
	case PRIV_NETINET_ADDRCTRL6:
	case PRIV_NETINET_ND6:
	case PRIV_NETINET_SCOPE6:
	case PRIV_NETINET_ALIFETIME6:
	case PRIV_NETINET_IPSEC:
	case PRIV_NETINET_BINDANY:

#ifdef notyet
		/*
		 * NCP privileges.
		 */
	case PRIV_NETNCP:

		/*
		 * SMB privileges.
		 */
	case PRIV_NETSMB:
#endif

	/*
	 * No default: or deny here.
	 * In case of no permit fall through to next switch().
	 */
		if (cred->cr_prison->pr_flags & PR_VNET)
			return (0);
	}
#endif /* VIMAGE */

	switch (priv) {
		/*
		 * Allow ktrace privileges for root in jail.
		 */
	case PRIV_KTRACE:

#if 0
		/*
		 * Allow jailed processes to configure audit identity and
		 * submit audit records (login, etc).  In the future we may
		 * want to further refine the relationship between audit and
		 * jail.
		 */
	case PRIV_AUDIT_GETAUDIT:
	case PRIV_AUDIT_SETAUDIT:
	case PRIV_AUDIT_SUBMIT:
#endif

		/*
		 * Allow jailed processes to manipulate process UNIX
		 * credentials in any way they see fit.
		 */
	case PRIV_CRED_SETUID:
	case PRIV_CRED_SETEUID:
	case PRIV_CRED_SETGID:
	case PRIV_CRED_SETEGID:
	case PRIV_CRED_SETGROUPS:
	case PRIV_CRED_SETREUID:
	case PRIV_CRED_SETREGID:
	case PRIV_CRED_SETRESUID:
	case PRIV_CRED_SETRESGID:

		/*
		 * Jail implements visibility constraints already, so allow
		 * jailed root to override uid/gid-based constraints.
		 */
	case PRIV_SEEOTHERGIDS:
	case PRIV_SEEOTHERUIDS:
	case PRIV_SEEJAILPROC:

		/*
		 * Jail implements inter-process debugging limits already, so
		 * allow jailed root various debugging privileges.
		 */
	case PRIV_DEBUG_DIFFCRED:
	case PRIV_DEBUG_SUGID:
	case PRIV_DEBUG_UNPRIV:

		/*
		 * Allow jail to set various resource limits and login
		 * properties, and for now, exceed process resource limits.
		 */
	case PRIV_PROC_LIMIT:
	case PRIV_PROC_SETLOGIN:
	case PRIV_PROC_SETRLIMIT:

		/*
		 * System V and POSIX IPC privileges are granted in jail.
		 */
	case PRIV_IPC_READ:
	case PRIV_IPC_WRITE:
	case PRIV_IPC_ADMIN:
	case PRIV_IPC_MSGSIZE:
	case PRIV_MQ_ADMIN:

		/*
		 * Jail operations within a jail work on child jails.
		 */
	case PRIV_JAIL_ATTACH:
	case PRIV_JAIL_SET:
	case PRIV_JAIL_REMOVE:

		/*
		 * Jail implements its own inter-process limits, so allow
		 * root processes in jail to change scheduling on other
		 * processes in the same jail.  Likewise for signalling.
		 */
	case PRIV_SCHED_DIFFCRED:
	case PRIV_SCHED_CPUSET:
	case PRIV_SIGNAL_DIFFCRED:
	case PRIV_SIGNAL_SUGID:

		/*
		 * Allow jailed processes to write to sysctls marked as jail
		 * writable.
		 */
	case PRIV_SYSCTL_WRITEJAIL:

		/*
		 * Allow root in jail to manage a variety of quota
		 * properties.  These should likely be conditional on a
		 * configuration option.
		 */
	case PRIV_VFS_GETQUOTA:
	case PRIV_VFS_SETQUOTA:

		/*
		 * Since Jail relies on chroot() to implement file system
		 * protections, grant many VFS privileges to root in jail.
		 * Be careful to exclude mount-related and NFS-related
		 * privileges.
		 */
	case PRIV_VFS_READ:
	case PRIV_VFS_WRITE:
	case PRIV_VFS_ADMIN:
	case PRIV_VFS_EXEC:
	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
	case PRIV_VFS_CHFLAGS_DEV:
	case PRIV_VFS_CHOWN:
	case PRIV_VFS_CHROOT:
	case PRIV_VFS_RETAINSUGID:
	case PRIV_VFS_FCHROOT:
	case PRIV_VFS_LINK:
	case PRIV_VFS_SETGID:
	case PRIV_VFS_STAT:
	case PRIV_VFS_STICKYFILE:

		/*
		 * As in the non-jail case, non-root users are expected to be
		 * able to read kernel/physical memory (provided /dev/[k]mem
		 * exists in the jail and they have permission to access it).
		 */
	case PRIV_KMEM_READ:
		return (0);

		/*
		 * Depending on the global setting, allow privilege of
		 * setting system flags.
		 */
	case PRIV_VFS_SYSFLAGS:
		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
			return (0);
		else
			return (EPERM);

		/*
		 * Depending on the global setting, allow privilege of
		 * mounting/unmounting file systems.
		 */
	case PRIV_VFS_MOUNT:
	case PRIV_VFS_UNMOUNT:
	case PRIV_VFS_MOUNT_NONUSER:
	case PRIV_VFS_MOUNT_OWNER:
		pr = cred->cr_prison;
		/* Both fields are examined together under the prison lock. */
		prison_lock(pr);
		if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2)
			error = 0;
		else
			error = EPERM;
		prison_unlock(pr);
		return (error);

		/*
		 * Jails should hold no disposition on the PRIV_VFS_READ_DIR
		 * policy.  priv_check_cred will not specifically allow it, and
		 * we may want a MAC policy to allow it.
		 */
	case PRIV_VFS_READ_DIR:
		return (0);

		/*
		 * Conditionally allow privileged process in the jail to
		 * manipulate filesystem extended attributes in the system
		 * namespace.
		 */
	case PRIV_VFS_EXTATTR_SYSTEM:
		if ((cred->cr_prison->pr_allow & PR_ALLOW_EXTATTR) != 0)
			return (0);
		else
			return (EPERM);

		/*
		 * Conditionally allow locking (unlocking) physical pages
		 * in memory.
		 */
	case PRIV_VM_MLOCK:
	case PRIV_VM_MUNLOCK:
		if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK)
			return (0);
		else
			return (EPERM);

		/*
		 * Conditionally allow jailed root to bind reserved ports.
		 */
	case PRIV_NETINET_RESERVEDPORT:
		if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS)
			return (0);
		else
			return (EPERM);

		/*
		 * Allow jailed root to reuse in-use ports.
		 */
	case PRIV_NETINET_REUSEPORT:
		return (0);

		/*
		 * Allow jailed root to set certain IPv4/6 (option) headers.
		 */
	case PRIV_NETINET_SETHDROPTS:
		return (0);

		/*
		 * Conditionally allow creating raw sockets in jail.
		 */
	case PRIV_NETINET_RAW:
		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
			return (0);
		else
			return (EPERM);

		/*
		 * Since jail implements its own visibility limits on netstat
		 * sysctls, allow getcred.  This allows identd to work in
		 * jail.
		 */
	case PRIV_NETINET_GETCRED:
		return (0);

		/*
		 * Allow jailed root to set loginclass.
		 */
	case PRIV_PROC_SETLOGINCLASS:
		return (0);

		/*
		 * Do not allow a process inside a jail to read the kernel
		 * message buffer unless explicitly permitted.
		 */
	case PRIV_MSGBUF:
		if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF)
			return (0);
		return (EPERM);

		/*
		 * Conditionally allow privileged process in the jail adjust
		 * machine time.
		 */
	case PRIV_ADJTIME:
	case PRIV_NTP_ADJTIME:
		if (cred->cr_prison->pr_allow &
		    (PR_ALLOW_ADJTIME | PR_ALLOW_SETTIME)) {
			return (0);
		}
		return (EPERM);

		/*
		 * Conditionally allow privileged process in the jail set
		 * machine time.
		 */
	case PRIV_CLOCK_SETTIME:
		if (cred->cr_prison->pr_allow & PR_ALLOW_SETTIME)
			return (0);
		else
			return (EPERM);

	default:
		/*
		 * In all remaining cases, deny the privilege request.  This
		 * includes almost all network privileges, many system
		 * configuration privileges.
		 */
		return (EPERM);
	}
}
4203 
4204 /*
4205  * Return the part of pr2's name that is relative to pr1, or the whole name
4206  * if it does not directly follow.
4207  */
4208 
4209 char *
4210 prison_name(struct prison *pr1, struct prison *pr2)
4211 {
4212 	char *name;
4213 
4214 	/* Jails see themselves as "0" (if they see themselves at all). */
4215 	if (pr1 == pr2)
4216 		return "0";
4217 	name = pr2->pr_name;
4218 	if (prison_ischild(pr1, pr2)) {
4219 		/*
4220 		 * pr1 isn't locked (and allprison_lock may not be either)
4221 		 * so its length can't be counted on.  But the number of dots
4222 		 * can be counted on - and counted.
4223 		 */
4224 		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
4225 			name = strchr(name, '.') + 1;
4226 	}
4227 	return (name);
4228 }
4229 
4230 /*
4231  * Return the part of pr2's path that is relative to pr1, or the whole path
4232  * if it does not directly follow.
4233  */
4234 static char *
4235 prison_path(struct prison *pr1, struct prison *pr2)
4236 {
4237 	char *path1, *path2;
4238 	int len1;
4239 
4240 	path1 = pr1->pr_path;
4241 	path2 = pr2->pr_path;
4242 	if (!strcmp(path1, "/"))
4243 		return (path2);
4244 	len1 = strlen(path1);
4245 	if (strncmp(path1, path2, len1))
4246 		return (path2);
4247 	if (path2[len1] == '\0')
4248 		return "/";
4249 	if (path2[len1] == '/')
4250 		return (path2 + len1);
4251 	return (path2);
4252 }
4253 
/*
 * Jail-related sysctls.  This declares the "jail" node beneath the
 * _security parent; the jail sysctl handlers attach to it.
 */
static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Jails");
4259 
#if defined(INET) || defined(INET6)
/*
 * sysctl_jail_list() helper: copy a prison's address array for family af
 * into the scratch buffer *out, which is later SYSCTL_OUT-ed.
 *
 * Called and returns with pr->pr_mtx held.  *len is the capacity of *out
 * in addresses; when the prison has more addresses than that, the mutex
 * is dropped, the buffer is grown with realloc(M_WAITOK), the mutex is
 * retaken and the address list is looked at afresh.  Note that *len is
 * only ever raised to a new capacity: it is left untouched when the
 * prison has fewer addresses, or none at all (in which case nothing is
 * copied either).
 */
static void
prison_ip_copyout(struct prison *pr, const pr_family_t af, void **out, int *len)
{
	const struct prison_ip *pip;
	const size_t size = pr_families[af].size;

	for (;;) {
		mtx_assert(&pr->pr_mtx, MA_OWNED);
		pip = pr->pr_addrs[af];
		if (pip == NULL)
			return;
		if (*len >= pip->ips)
			break;
		/* Too small: grow unlocked, then re-examine the list. */
		*len = pip->ips;
		mtx_unlock(&pr->pr_mtx);
		*out = realloc(*out, *len * size, M_TEMP, M_WAITOK);
		mtx_lock(&pr->pr_mtx);
	}
	bcopy(pip->pr_ip, *out, pip->ips * size);
}
#endif
4285 
4286 static int
4287 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
4288 {
4289 	struct xprison *xp;
4290 	struct prison *pr, *cpr;
4291 #ifdef INET
4292 	struct in_addr *ip4 = NULL;
4293 	int ip4s = 0;
4294 #endif
4295 #ifdef INET6
4296 	struct in6_addr *ip6 = NULL;
4297 	int ip6s = 0;
4298 #endif
4299 	int descend, error;
4300 
4301 	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
4302 	pr = req->td->td_ucred->cr_prison;
4303 	error = 0;
4304 	sx_slock(&allprison_lock);
4305 	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
4306 		mtx_lock(&cpr->pr_mtx);
4307 #ifdef INET
4308 		prison_ip_copyout(cpr, PR_INET, (void **)&ip4, &ip4s);
4309 #endif
4310 #ifdef INET6
4311 		prison_ip_copyout(cpr, PR_INET6, (void **)&ip6, &ip6s);
4312 #endif
4313 		bzero(xp, sizeof(*xp));
4314 		xp->pr_version = XPRISON_VERSION;
4315 		xp->pr_id = cpr->pr_id;
4316 		xp->pr_state = cpr->pr_state;
4317 		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
4318 		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
4319 		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
4320 #ifdef INET
4321 		xp->pr_ip4s = ip4s;
4322 #endif
4323 #ifdef INET6
4324 		xp->pr_ip6s = ip6s;
4325 #endif
4326 		mtx_unlock(&cpr->pr_mtx);
4327 		error = SYSCTL_OUT(req, xp, sizeof(*xp));
4328 		if (error)
4329 			break;
4330 #ifdef INET
4331 		if (xp->pr_ip4s > 0) {
4332 			error = SYSCTL_OUT(req, ip4,
4333 			    xp->pr_ip4s * sizeof(struct in_addr));
4334 			if (error)
4335 				break;
4336 		}
4337 #endif
4338 #ifdef INET6
4339 		if (xp->pr_ip6s > 0) {
4340 			error = SYSCTL_OUT(req, ip6,
4341 			    xp->pr_ip6s * sizeof(struct in6_addr));
4342 			if (error)
4343 				break;
4344 		}
4345 #endif
4346 	}
4347 	sx_sunlock(&allprison_lock);
4348 	free(xp, M_TEMP);
4349 #ifdef INET
4350 	free(ip4, M_TEMP);
4351 #endif
4352 #ifdef INET6
4353 	free(ip6, M_TEMP);
4354 #endif
4355 	return (error);
4356 }
4357 
4358 SYSCTL_OID(_security_jail, OID_AUTO, list,
4359     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4360     sysctl_jail_list, "S", "List of active jails");
4361 
4362 static int
4363 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
4364 {
4365 	int error, injail;
4366 
4367 	injail = jailed(req->td->td_ucred);
4368 	error = SYSCTL_OUT(req, &injail, sizeof(injail));
4369 
4370 	return (error);
4371 }
4372 
4373 SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
4374     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4375     sysctl_jail_jailed, "I", "Process in jail?");
4376 
4377 static int
4378 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
4379 {
4380 	int error, havevnet;
4381 #ifdef VIMAGE
4382 	struct ucred *cred = req->td->td_ucred;
4383 
4384 	havevnet = jailed(cred) && prison_owns_vnet(cred);
4385 #else
4386 	havevnet = 0;
4387 #endif
4388 	error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
4389 
4390 	return (error);
4391 }
4392 
4393 SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
4394     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4395     sysctl_jail_vnet, "I", "Jail owns vnet?");
4396 
#if defined(INET) || defined(INET6)
/* security.jail.jail_max_af_ips: read/write access to jail_max_af_ips. */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips,
    CTLFLAG_RW, &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family (deprecated)");
#endif
4402 
4403 /*
4404  * Default parameters for jail(2) compatibility.  For historical reasons,
4405  * the sysctl names have varying similarity to the parameter names.  Prisons
4406  * just see their own parameters, and can't change them.
4407  */
4408 static int
4409 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
4410 {
4411 	int error, i;
4412 
4413 	/* Get the current flag value, and convert it to a boolean. */
4414 	if (req->td->td_ucred->cr_prison == &prison0) {
4415 		mtx_lock(&prison0.pr_mtx);
4416 		i = (jail_default_allow & arg2) != 0;
4417 		mtx_unlock(&prison0.pr_mtx);
4418 	} else
4419 		i = prison_allow(req->td->td_ucred, arg2);
4420 
4421 	if (arg1 != NULL)
4422 		i = !i;
4423 	error = sysctl_handle_int(oidp, &i, 0, req);
4424 	if (error || !req->newptr)
4425 		return (error);
4426 	i = i ? arg2 : 0;
4427 	if (arg1 != NULL)
4428 		i ^= arg2;
4429 	/*
4430 	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
4431 	 * for writing.
4432 	 */
4433 	mtx_lock(&prison0.pr_mtx);
4434 	jail_default_allow = (jail_default_allow & ~arg2) | i;
4435 	mtx_unlock(&prison0.pr_mtx);
4436 	return (0);
4437 }
4438 
4439 SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
4440     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4441     NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
4442     "Processes in jail can set their hostnames (deprecated)");
4443 SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
4444     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4445     (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
4446     "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
4447 SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
4448     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4449     NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
4450     "Processes in jail can use System V IPC primitives (deprecated)");
4451 SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
4452     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4453     NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
4454     "Prison root can create raw sockets (deprecated)");
4455 SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
4456     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4457     NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
4458     "Processes in jail can alter system file flags (deprecated)");
4459 SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
4460     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4461     NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
4462     "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
4463 SYSCTL_PROC(_security_jail, OID_AUTO, mlock_allowed,
4464     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4465     NULL, PR_ALLOW_MLOCK, sysctl_jail_default_allow, "I",
4466     "Processes in jail can lock/unlock physical pages in memory");
4467 
4468 static int
4469 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
4470 {
4471 	struct prison *pr;
4472 	int level, error;
4473 
4474 	pr = req->td->td_ucred->cr_prison;
4475 	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4476 	error = sysctl_handle_int(oidp, &level, 0, req);
4477 	if (error || !req->newptr)
4478 		return (error);
4479 	*(int *)arg1 = level;
4480 	return (0);
4481 }
4482 
4483 SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
4484     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4485     &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
4486     sysctl_jail_default_level, "I",
4487     "Processes in jail cannot see all mounted file systems (deprecated)");
4488 
4489 SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
4490     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4491     &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
4492     sysctl_jail_default_level, "I",
4493     "Ruleset for the devfs filesystem in jail (deprecated)");
4494 
4495 SYSCTL_NODE(_security_jail, OID_AUTO, children, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
4496     "Limits and stats of child jails");
4497 
4498 static int
4499 sysctl_jail_children(SYSCTL_HANDLER_ARGS)
4500 {
4501 	struct prison *pr;
4502 	int i;
4503 
4504 	pr = req->td->td_ucred->cr_prison;
4505 
4506 	switch (oidp->oid_kind & CTLTYPE) {
4507 	case CTLTYPE_INT:
4508 		i = *(int *)((char *)pr + arg2);
4509 		return (SYSCTL_OUT(req, &i, sizeof(i)));
4510 	}
4511 
4512 	return (0);
4513 }
4514 
4515 SYSCTL_PROC(_security_jail_children, OID_AUTO, max,
4516     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4517     NULL, offsetof(struct prison, pr_childmax), sysctl_jail_children,
4518     "I", "Maximum number of child jails");
4519 SYSCTL_PROC(_security_jail_children, OID_AUTO, cur,
4520     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4521     NULL, offsetof(struct prison, pr_childcount), sysctl_jail_children,
4522     "I", "Current number of child jails");
4523 
4524 /*
4525  * Nodes to describe jail parameters.  Maximum length of string parameters
4526  * is returned in the string itself, and the other parameters exist merely
4527  * to make themselves and their types known.
4528  */
4529 SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
4530     "Jail parameters");
4531 
4532 int
4533 sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4534 {
4535 	int i;
4536 	long l;
4537 	size_t s;
4538 	char numbuf[12];
4539 
4540 	switch (oidp->oid_kind & CTLTYPE)
4541 	{
4542 	case CTLTYPE_LONG:
4543 	case CTLTYPE_ULONG:
4544 		l = 0;
4545 #ifdef SCTL_MASK32
4546 		if (!(req->flags & SCTL_MASK32))
4547 #endif
4548 			return (SYSCTL_OUT(req, &l, sizeof(l)));
4549 	case CTLTYPE_INT:
4550 	case CTLTYPE_UINT:
4551 		i = 0;
4552 		return (SYSCTL_OUT(req, &i, sizeof(i)));
4553 	case CTLTYPE_STRING:
4554 		snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
4555 		return
4556 		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4557 	case CTLTYPE_STRUCT:
4558 		s = (size_t)arg2;
4559 		return (SYSCTL_OUT(req, &s, sizeof(s)));
4560 	}
4561 	return (0);
4562 }
4563 
4564 /*
4565  * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
4566  * jail creation time but cannot be changed in an existing jail.
4567  */
4568 SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
4569 SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
4570 SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
4571 SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
4572 SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
4573     "I", "Jail secure level");
4574 SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
4575     "Jail value for kern.osreldate and uname -K");
4576 SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
4577     "Jail value for kern.osrelease and uname -r");
4578 SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
4579     "I", "Jail cannot see all mounted file systems");
4580 SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
4581     "I", "Ruleset for in-jail devfs mounts");
4582 SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
4583     "B", "Jail persistence");
4584 #ifdef VIMAGE
4585 SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
4586     "E,jailsys", "Virtual network stack");
4587 #endif
4588 SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
4589     "B", "Jail is in the process of shutting down");
4590 
4591 SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
4592 SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
4593     "I", "Current number of child jails");
4594 SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
4595     "I", "Maximum number of child jails");
4596 
4597 SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
4598 SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
4599     "Jail hostname");
4600 SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
4601     "Jail NIS domainname");
4602 SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
4603     "Jail host UUID");
4604 SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
4605     "LU", "Jail host ID");
4606 
4607 SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
4608 SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
4609 
4610 #ifdef INET
4611 SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
4612     "Jail IPv4 address virtualization");
4613 SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
4614     "S,in_addr,a", "Jail IPv4 addresses");
4615 SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
4616     "B", "Do (not) use IPv4 source address selection rather than the "
4617     "primary jail IPv4 address.");
4618 #endif
4619 #ifdef INET6
4620 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
4621     "Jail IPv6 address virtualization");
4622 SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
4623     "S,in6_addr,a", "Jail IPv6 addresses");
4624 SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
4625     "B", "Do (not) use IPv6 source address selection rather than the "
4626     "primary jail IPv6 address.");
4627 #endif
4628 
4629 SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
4630 SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
4631     "B", "Jail may set hostname");
4632 SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
4633     "B", "Jail may use SYSV IPC");
4634 SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
4635     "B", "Jail may create raw sockets");
4636 SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
4637     "B", "Jail may alter system file flags");
4638 SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
4639     "B", "Jail may set file quotas");
4640 SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
4641     "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
4642 SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW,
4643     "B", "Jail may lock (unlock) physical pages in memory");
4644 SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW,
4645     "B", "Jail may bind sockets to reserved ports");
4646 SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
4647     "B", "Jail may read the kernel message buffer");
4648 SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW,
4649     "B", "Unprivileged processes may use process debugging facilities");
4650 SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW,
4651     "B", "Processes in jail with uid 0 have privilege");
4652 #ifdef VIMAGE
4653 SYSCTL_JAIL_PARAM(_allow, nfsd, CTLTYPE_INT | CTLFLAG_RW,
4654     "B", "Mountd/nfsd may run in the jail");
4655 #endif
4656 SYSCTL_JAIL_PARAM(_allow, extattr, CTLTYPE_INT | CTLFLAG_RW,
4657     "B", "Jail may set system-level filesystem extended attributes");
4658 SYSCTL_JAIL_PARAM(_allow, adjtime, CTLTYPE_INT | CTLFLAG_RW,
4659     "B", "Jail may adjust system time");
4660 SYSCTL_JAIL_PARAM(_allow, settime, CTLTYPE_INT | CTLFLAG_RW,
4661     "B", "Jail may set system time");
4662 
4663 SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
4664 SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
4665     "B", "Jail may mount/unmount jail-friendly file systems in general");
4666 
4667 /*
4668  * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>.  Return
4669  * its associated bit in the pr_allow bitmask, or zero if the parameter was
4670  * not created.
4671  */
4672 unsigned
4673 prison_add_allow(const char *prefix, const char *name, const char *prefix_descr,
4674     const char *descr)
4675 {
4676 	struct bool_flags *bf;
4677 	struct sysctl_oid *parent;
4678 	char *allow_name, *allow_noname, *allowed;
4679 #ifndef NO_SYSCTL_DESCR
4680 	char *descr_deprecated;
4681 #endif
4682 	u_int allow_flag;
4683 
4684 	if (prefix
4685 	    ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name)
4686 		< 0 ||
4687 	      asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name)
4688 		< 0
4689 	    : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 ||
4690 	      asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) {
4691 		free(allow_name, M_PRISON);
4692 		return 0;
4693 	}
4694 
4695 	/*
4696 	 * See if this parameter has already beed added, i.e. a module was
4697 	 * previously loaded/unloaded.
4698 	 */
4699 	mtx_lock(&prison0.pr_mtx);
4700 	for (bf = pr_flag_allow;
4701 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
4702 		atomic_load_int(&bf->flag) != 0;
4703 	     bf++) {
4704 		if (strcmp(bf->name, allow_name) == 0) {
4705 			allow_flag = bf->flag;
4706 			goto no_add;
4707 		}
4708 	}
4709 
4710 	/*
4711 	 * Find a free bit in pr_allow_all, failing if there are none
4712 	 * (which shouldn't happen as long as we keep track of how many
4713 	 * potential dynamic flags exist).
4714 	 */
4715 	for (allow_flag = 1;; allow_flag <<= 1) {
4716 		if (allow_flag == 0)
4717 			goto no_add;
4718 		if ((pr_allow_all & allow_flag) == 0)
4719 			break;
4720 	}
4721 
4722 	/* Note the parameter in the next open slot in pr_flag_allow. */
4723 	for (bf = pr_flag_allow; ; bf++) {
4724 		if (bf == pr_flag_allow + nitems(pr_flag_allow)) {
4725 			/* This should never happen, but is not fatal. */
4726 			allow_flag = 0;
4727 			goto no_add;
4728 		}
4729 		if (atomic_load_int(&bf->flag) == 0)
4730 			break;
4731 	}
4732 	bf->name = allow_name;
4733 	bf->noname = allow_noname;
4734 	pr_allow_all |= allow_flag;
4735 	/*
4736 	 * prison0 always has permission for the new parameter.
4737 	 * Other jails must have it granted to them.
4738 	 */
4739 	prison0.pr_allow |= allow_flag;
4740 	/* The flag indicates a valid entry, so make sure it is set last. */
4741 	atomic_store_rel_int(&bf->flag, allow_flag);
4742 	mtx_unlock(&prison0.pr_mtx);
4743 
4744 	/*
4745 	 * Create sysctls for the parameter, and the back-compat global
4746 	 * permission.
4747 	 */
4748 	parent = prefix
4749 	    ? SYSCTL_ADD_NODE(NULL,
4750 		  SYSCTL_CHILDREN(&sysctl___security_jail_param_allow),
4751 		  OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr)
4752 	    : &sysctl___security_jail_param_allow;
4753 	(void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
4754 	    name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4755 	    NULL, 0, sysctl_jail_param, "B", descr);
4756 	if ((prefix
4757 	     ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name)
4758 	     : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) {
4759 #ifndef NO_SYSCTL_DESCR
4760 		(void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)",
4761 		    descr);
4762 #endif
4763 		(void)SYSCTL_ADD_PROC(NULL,
4764 		    SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed,
4765 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag,
4766 		    sysctl_jail_default_allow, "I", descr_deprecated);
4767 #ifndef NO_SYSCTL_DESCR
4768 		free(descr_deprecated, M_TEMP);
4769 #endif
4770 		free(allowed, M_TEMP);
4771 	}
4772 	return allow_flag;
4773 
4774  no_add:
4775 	mtx_unlock(&prison0.pr_mtx);
4776 	free(allow_name, M_PRISON);
4777 	free(allow_noname, M_PRISON);
4778 	return allow_flag;
4779 }
4780 
4781 /*
4782  * The VFS system will register jail-aware filesystems here.  They each get
4783  * a parameter allow.mount.xxxfs and a flag to check when a jailed user
4784  * attempts to mount.
4785  */
4786 void
4787 prison_add_vfs(struct vfsconf *vfsp)
4788 {
4789 #ifdef NO_SYSCTL_DESCR
4790 
4791 	vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
4792 	    NULL, NULL);
4793 #else
4794 	char *descr;
4795 
4796 	(void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system",
4797 	    vfsp->vfc_name);
4798 	vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
4799 	    NULL, descr);
4800 	free(descr, M_TEMP);
4801 #endif
4802 }
4803 
4804 #ifdef RACCT
4805 void
4806 prison_racct_foreach(void (*callback)(struct racct *racct,
4807     void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
4808     void *arg2, void *arg3)
4809 {
4810 	struct prison_racct *prr;
4811 
4812 	ASSERT_RACCT_ENABLED();
4813 
4814 	sx_slock(&allprison_lock);
4815 	if (pre != NULL)
4816 		(pre)();
4817 	LIST_FOREACH(prr, &allprison_racct, prr_next)
4818 		(callback)(prr->prr_racct, arg2, arg3);
4819 	if (post != NULL)
4820 		(post)();
4821 	sx_sunlock(&allprison_lock);
4822 }
4823 
4824 static struct prison_racct *
4825 prison_racct_find_locked(const char *name)
4826 {
4827 	struct prison_racct *prr;
4828 
4829 	ASSERT_RACCT_ENABLED();
4830 	sx_assert(&allprison_lock, SA_XLOCKED);
4831 
4832 	if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
4833 		return (NULL);
4834 
4835 	LIST_FOREACH(prr, &allprison_racct, prr_next) {
4836 		if (strcmp(name, prr->prr_name) != 0)
4837 			continue;
4838 
4839 		/* Found prison_racct with a matching name? */
4840 		prison_racct_hold(prr);
4841 		return (prr);
4842 	}
4843 
4844 	/* Add new prison_racct. */
4845 	prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
4846 	racct_create(&prr->prr_racct);
4847 
4848 	strcpy(prr->prr_name, name);
4849 	refcount_init(&prr->prr_refcount, 1);
4850 	LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
4851 
4852 	return (prr);
4853 }
4854 
4855 struct prison_racct *
4856 prison_racct_find(const char *name)
4857 {
4858 	struct prison_racct *prr;
4859 
4860 	ASSERT_RACCT_ENABLED();
4861 
4862 	sx_xlock(&allprison_lock);
4863 	prr = prison_racct_find_locked(name);
4864 	sx_xunlock(&allprison_lock);
4865 	return (prr);
4866 }
4867 
4868 void
4869 prison_racct_hold(struct prison_racct *prr)
4870 {
4871 
4872 	ASSERT_RACCT_ENABLED();
4873 
4874 	refcount_acquire(&prr->prr_refcount);
4875 }
4876 
4877 static void
4878 prison_racct_free_locked(struct prison_racct *prr)
4879 {
4880 
4881 	ASSERT_RACCT_ENABLED();
4882 	sx_assert(&allprison_lock, SA_XLOCKED);
4883 
4884 	if (refcount_release(&prr->prr_refcount)) {
4885 		racct_destroy(&prr->prr_racct);
4886 		LIST_REMOVE(prr, prr_next);
4887 		free(prr, M_PRISON_RACCT);
4888 	}
4889 }
4890 
4891 void
4892 prison_racct_free(struct prison_racct *prr)
4893 {
4894 
4895 	ASSERT_RACCT_ENABLED();
4896 	sx_assert(&allprison_lock, SA_UNLOCKED);
4897 
4898 	if (refcount_release_if_not_last(&prr->prr_refcount))
4899 		return;
4900 
4901 	sx_xlock(&allprison_lock);
4902 	prison_racct_free_locked(prr);
4903 	sx_xunlock(&allprison_lock);
4904 }
4905 
4906 static void
4907 prison_racct_attach(struct prison *pr)
4908 {
4909 	struct prison_racct *prr;
4910 
4911 	ASSERT_RACCT_ENABLED();
4912 	sx_assert(&allprison_lock, SA_XLOCKED);
4913 
4914 	prr = prison_racct_find_locked(pr->pr_name);
4915 	KASSERT(prr != NULL, ("cannot find prison_racct"));
4916 
4917 	pr->pr_prison_racct = prr;
4918 }
4919 
4920 /*
4921  * Handle jail renaming.  From the racct point of view, renaming means
4922  * moving from one prison_racct to another.
4923  */
4924 static void
4925 prison_racct_modify(struct prison *pr)
4926 {
4927 #ifdef RCTL
4928 	struct proc *p;
4929 	struct ucred *cred;
4930 #endif
4931 	struct prison_racct *oldprr;
4932 
4933 	ASSERT_RACCT_ENABLED();
4934 
4935 	sx_slock(&allproc_lock);
4936 	sx_xlock(&allprison_lock);
4937 
4938 	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
4939 		sx_xunlock(&allprison_lock);
4940 		sx_sunlock(&allproc_lock);
4941 		return;
4942 	}
4943 
4944 	oldprr = pr->pr_prison_racct;
4945 	pr->pr_prison_racct = NULL;
4946 
4947 	prison_racct_attach(pr);
4948 
4949 	/*
4950 	 * Move resource utilisation records.
4951 	 */
4952 	racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
4953 
4954 #ifdef RCTL
4955 	/*
4956 	 * Force rctl to reattach rules to processes.
4957 	 */
4958 	FOREACH_PROC_IN_SYSTEM(p) {
4959 		PROC_LOCK(p);
4960 		cred = crhold(p->p_ucred);
4961 		PROC_UNLOCK(p);
4962 		rctl_proc_ucred_changed(p, cred);
4963 		crfree(cred);
4964 	}
4965 #endif
4966 
4967 	sx_sunlock(&allproc_lock);
4968 	prison_racct_free_locked(oldprr);
4969 	sx_xunlock(&allprison_lock);
4970 }
4971 
4972 static void
4973 prison_racct_detach(struct prison *pr)
4974 {
4975 
4976 	ASSERT_RACCT_ENABLED();
4977 	sx_assert(&allprison_lock, SA_UNLOCKED);
4978 
4979 	if (pr->pr_prison_racct == NULL)
4980 		return;
4981 	prison_racct_free(pr->pr_prison_racct);
4982 	pr->pr_prison_racct = NULL;
4983 }
4984 #endif /* RACCT */
4985 
4986 #ifdef DDB
4987 
4988 static void
4989 db_show_prison(struct prison *pr)
4990 {
4991 	struct bool_flags *bf;
4992 	struct jailsys_flags *jsf;
4993 #if defined(INET) || defined(INET6)
4994 	int ii;
4995 	struct prison_ip *pip;
4996 #endif
4997 	unsigned f;
4998 #ifdef INET
4999 	char ip4buf[INET_ADDRSTRLEN];
5000 #endif
5001 #ifdef INET6
5002 	char ip6buf[INET6_ADDRSTRLEN];
5003 #endif
5004 
5005 	db_printf("prison %p:\n", pr);
5006 	db_printf(" jid             = %d\n", pr->pr_id);
5007 	db_printf(" name            = %s\n", pr->pr_name);
5008 	db_printf(" parent          = %p\n", pr->pr_parent);
5009 	db_printf(" ref             = %d\n", pr->pr_ref);
5010 	db_printf(" uref            = %d\n", pr->pr_uref);
5011 	db_printf(" state           = %s\n",
5012 	    pr->pr_state == PRISON_STATE_ALIVE ? "alive" :
5013 	    pr->pr_state == PRISON_STATE_DYING ? "dying" :
5014 	    "invalid");
5015 	db_printf(" path            = %s\n", pr->pr_path);
5016 	db_printf(" cpuset          = %d\n", pr->pr_cpuset
5017 	    ? pr->pr_cpuset->cs_id : -1);
5018 #ifdef VIMAGE
5019 	db_printf(" vnet            = %p\n", pr->pr_vnet);
5020 #endif
5021 	db_printf(" root            = %p\n", pr->pr_root);
5022 	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
5023 	db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
5024 	db_printf(" children.max    = %d\n", pr->pr_childmax);
5025 	db_printf(" children.cur    = %d\n", pr->pr_childcount);
5026 	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
5027 	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
5028 	db_printf(" flags           = 0x%x", pr->pr_flags);
5029 	for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++)
5030 		if (pr->pr_flags & bf->flag)
5031 			db_printf(" %s", bf->name);
5032 	for (jsf = pr_flag_jailsys;
5033 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
5034 	     jsf++) {
5035 		f = pr->pr_flags & (jsf->disable | jsf->new);
5036 		db_printf(" %-16s= %s\n", jsf->name,
5037 		    (f != 0 && f == jsf->disable) ? "disable"
5038 		    : (f == jsf->new) ? "new"
5039 		    : "inherit");
5040 	}
5041 	db_printf(" allow           = 0x%x", pr->pr_allow);
5042 	for (bf = pr_flag_allow;
5043 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
5044 		atomic_load_int(&bf->flag) != 0;
5045 	     bf++)
5046 		if (pr->pr_allow & bf->flag)
5047 			db_printf(" %s", bf->name);
5048 	db_printf("\n");
5049 	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
5050 	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
5051 	db_printf(" host.domainname = %s\n", pr->pr_domainname);
5052 	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
5053 	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
5054 #ifdef INET
5055 	if ((pip = pr->pr_addrs[PR_INET]) != NULL) {
5056 		db_printf(" ip4s            = %d\n", pip->ips);
5057 		for (ii = 0; ii < pip->ips; ii++)
5058 			db_printf(" %s %s\n",
5059 			    ii == 0 ? "ip4.addr        =" : "                 ",
5060 			    inet_ntoa_r(
5061 			    *(const struct in_addr *)PR_IP(pip, PR_INET, ii),
5062 			    ip4buf));
5063 	}
5064 #endif
5065 #ifdef INET6
5066 	if ((pip = pr->pr_addrs[PR_INET6]) != NULL) {
5067 		db_printf(" ip6s            = %d\n", pip->ips);
5068 		for (ii = 0; ii < pip->ips; ii++)
5069 			db_printf(" %s %s\n",
5070 			    ii == 0 ? "ip6.addr        =" : "                 ",
5071 			    ip6_sprintf(ip6buf,
5072 			    (const struct in6_addr *)PR_IP(pip, PR_INET6, ii)));
5073 	}
5074 #endif
5075 }
5076 
5077 DB_SHOW_COMMAND(prison, db_show_prison_command)
5078 {
5079 	struct prison *pr;
5080 
5081 	if (!have_addr) {
5082 		/*
5083 		 * Show all prisons in the list, and prison0 which is not
5084 		 * listed.
5085 		 */
5086 		db_show_prison(&prison0);
5087 		if (!db_pager_quit) {
5088 			TAILQ_FOREACH(pr, &allprison, pr_list) {
5089 				db_show_prison(pr);
5090 				if (db_pager_quit)
5091 					break;
5092 			}
5093 		}
5094 		return;
5095 	}
5096 
5097 	if (addr == 0)
5098 		pr = &prison0;
5099 	else {
5100 		/* Look for a prison with the ID and with references. */
5101 		TAILQ_FOREACH(pr, &allprison, pr_list)
5102 			if (pr->pr_id == addr && pr->pr_ref > 0)
5103 				break;
5104 		if (pr == NULL)
5105 			/* Look again, without requiring a reference. */
5106 			TAILQ_FOREACH(pr, &allprison, pr_list)
5107 				if (pr->pr_id == addr)
5108 					break;
5109 		if (pr == NULL)
5110 			/* Assume address points to a valid prison. */
5111 			pr = (struct prison *)addr;
5112 	}
5113 	db_show_prison(pr);
5114 }
5115 
5116 #endif /* DDB */
5117