xref: /freebsd/sys/kern/kern_jail.c (revision 195ebc7e9e4b129de810833791a19dfb4349d6a9)
1 /*-
2  * Copyright (c) 1999 Poul-Henning Kamp.
3  * Copyright (c) 2008 Bjoern A. Zeeb.
4  * Copyright (c) 2009 James Gritton.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include "opt_compat.h"
33 #include "opt_ddb.h"
34 #include "opt_inet.h"
35 #include "opt_inet6.h"
36 #include "opt_mac.h"
37 
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/systm.h>
42 #include <sys/errno.h>
43 #include <sys/sysproto.h>
44 #include <sys/malloc.h>
45 #include <sys/osd.h>
46 #include <sys/priv.h>
47 #include <sys/proc.h>
48 #include <sys/taskqueue.h>
49 #include <sys/fcntl.h>
50 #include <sys/jail.h>
51 #include <sys/lock.h>
52 #include <sys/mutex.h>
53 #include <sys/sx.h>
54 #include <sys/sysent.h>
55 #include <sys/namei.h>
56 #include <sys/mount.h>
57 #include <sys/queue.h>
58 #include <sys/socket.h>
59 #include <sys/syscallsubr.h>
60 #include <sys/sysctl.h>
61 #include <sys/vnode.h>
62 #include <sys/vimage.h>
63 #include <net/if.h>
64 #include <netinet/in.h>
65 #ifdef DDB
66 #include <ddb/ddb.h>
67 #ifdef INET6
68 #include <netinet6/in6_var.h>
69 #endif /* INET6 */
70 #endif /* DDB */
71 
72 #include <security/mac/mac_framework.h>
73 
74 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
75 
76 /* prison0 describes what is "real" about the system. */
77 struct prison prison0 = {
78 	.pr_id		= 0,
79 	.pr_name	= "0",
80 	.pr_ref		= 1,
81 	.pr_uref	= 1,
82 	.pr_path	= "/",
83 	.pr_securelevel	= -1,
84 	.pr_uuid	= "00000000-0000-0000-0000-000000000000",
85 	.pr_children	= LIST_HEAD_INITIALIZER(&prison0.pr_children),
86 	.pr_flags	= PR_HOST,
87 	.pr_allow	= PR_ALLOW_ALL,
88 };
89 MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
90 
91 /* allprison and lastprid are protected by allprison_lock. */
92 struct	sx allprison_lock;
93 SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
94 struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
95 int	lastprid = 0;
96 
/* Forward declarations of file-local (static) helpers. */
static void prison_complete(void *context, int pending);
static void prison_deref(struct prison *pr, int flags);
static void prison_remove_one(struct prison *pr);
static char *prison_path(struct prison *pr1, struct prison *pr2);
static int do_jail_attach(struct thread *td, struct prison *pr);
#ifdef INET
/* IPv4-only helpers. */
static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4);
static int _prison_check_ip4(struct prison *pr, struct in_addr *ia);
#endif
#ifdef INET6
/* IPv6-only helpers. */
static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6);
static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
#endif
110 
/*
 * Flag bits accepted by prison_deref().  Each flag occupies its own bit so
 * that callers may OR several of them together.
 */
#define	PD_DEREF	(1 << 0)
#define	PD_DEUREF	(1 << 1)
#define	PD_LOCKED	(1 << 2)
#define	PD_LIST_SLOCKED	(1 << 3)
#define	PD_LIST_XLOCKED	(1 << 4)
117 
/*
 * Parameter names corresponding to PR_* flag values.
 *
 * Entry i of each table names the parameter that sets (pr_flag_names) or
 * clears (pr_flag_nonames) flag bit (1 << i).  Slots belonging to an
 * address family that is not compiled in are left NULL (or absent), so
 * users of these tables must skip NULL entries.
 */
static char *pr_flag_names[] = {
	[0] = "persist",
	[1] = "host",
#ifdef INET
	[2] = "ip4",
#endif
#ifdef INET6
	[3] = "ip6",
#endif
};

static char *pr_flag_nonames[] = {
	[0] = "nopersist",
	[1] = "nohost",
#ifdef INET
	[2] = "noip4",
#endif
#ifdef INET6
	[3] = "noip6",
#endif
};
142 
/*
 * Parameter names for the allow.* permission bits.  Entry i names the
 * parameter that grants (pr_allow_names) or revokes (pr_allow_nonames)
 * permission bit (1 << i); the two tables must stay the same length and
 * in the same order.
 */
static char *pr_allow_names[] = {
	[0] = "allow.set_hostname",
	[1] = "allow.sysvipc",
	[2] = "allow.raw_sockets",
	[3] = "allow.chflags",
	[4] = "allow.mount",
	[5] = "allow.quotas",
	[6] = "allow.jails",
	[7] = "allow.socket_af",
};

static char *pr_allow_nonames[] = {
	[0] = "allow.noset_hostname",
	[1] = "allow.nosysvipc",
	[2] = "allow.noraw_sockets",
	[3] = "allow.nochflags",
	[4] = "allow.nomount",
	[5] = "allow.noquotas",
	[6] = "allow.nojails",
	[7] = "allow.nosocket_af",
};
164 
165 #define	JAIL_DEFAULT_ALLOW	PR_ALLOW_SET_HOSTNAME
166 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
167 static int jail_default_enforce_statfs = 2;
168 #if defined(INET) || defined(INET6)
169 static int jail_max_af_ips = 255;
170 #endif
171 
172 #ifdef INET
/*
 * qsort(3)-style comparator for two struct in_addr objects.
 *
 * The addresses are stored in network byte order; they are converted to
 * host byte order first so that the resulting order is numeric.  Returns
 * 1, -1 or 0 when the first address is greater than, less than or equal
 * to the second.
 */
static int
qcmp_v4(const void *ip1, const void *ip2)
{
	const struct in_addr *lhs, *rhs;
	in_addr_t lhs_hbo, rhs_hbo;

	lhs = ip1;
	rhs = ip2;
	lhs_hbo = ntohl(lhs->s_addr);
	rhs_hbo = ntohl(rhs->s_addr);

	/*
	 * A plain subtraction would not fit into an int, so derive the
	 * sign from the two relational results instead.
	 */
	return ((lhs_hbo > rhs_hbo) - (lhs_hbo < rhs_hbo));
}
197 #endif
198 
199 #ifdef INET6
/*
 * qsort(3)-style comparator for two struct in6_addr objects.
 *
 * The addresses are compared byte by byte from the first byte on; the
 * first differing byte decides.  Returns 1, -1 or 0 when the first address
 * is greater than, less than or equal to the second.
 */
static int
qcmp_v6(const void *ip1, const void *ip2)
{
	const uint8_t *lhs, *rhs;
	size_t off;

	lhs = ((const struct in6_addr *)ip1)->s6_addr;
	rhs = ((const struct in6_addr *)ip2)->s6_addr;

	for (off = 0; off < sizeof(struct in6_addr); off++) {
		if (lhs[off] != rhs[off])
			return (lhs[off] > rhs[off] ? 1 : -1);
	}
	return (0);
}
218 #endif
219 
220 /*
221  * struct jail_args {
222  *	struct jail *jail;
223  * };
224  */
225 int
226 jail(struct thread *td, struct jail_args *uap)
227 {
228 	uint32_t version;
229 	int error;
230 	struct jail j;
231 
232 	error = copyin(uap->jail, &version, sizeof(uint32_t));
233 	if (error)
234 		return (error);
235 
236 	switch (version) {
237 	case 0:
238 	{
239 		struct jail_v0 j0;
240 
241 		/* FreeBSD single IPv4 jails. */
242 		bzero(&j, sizeof(struct jail));
243 		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
244 		if (error)
245 			return (error);
246 		j.version = j0.version;
247 		j.path = j0.path;
248 		j.hostname = j0.hostname;
249 		j.ip4s = j0.ip_number;
250 		break;
251 	}
252 
253 	case 1:
254 		/*
255 		 * Version 1 was used by multi-IPv4 jail implementations
256 		 * that never made it into the official kernel.
257 		 */
258 		return (EINVAL);
259 
260 	case 2:	/* JAIL_API_VERSION */
261 		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
262 		error = copyin(uap->jail, &j, sizeof(struct jail));
263 		if (error)
264 			return (error);
265 		break;
266 
267 	default:
268 		/* Sci-Fi jails are not supported, sorry. */
269 		return (EINVAL);
270 	}
271 	return (kern_jail(td, &j));
272 }
273 
274 int
275 kern_jail(struct thread *td, struct jail *j)
276 {
277 	struct iovec optiov[24];
278 	struct uio opt;
279 	char *u_path, *u_hostname, *u_name;
280 #ifdef INET
281 	int ip4s;
282 	struct in_addr *u_ip4;
283 #endif
284 #ifdef INET6
285 	struct in6_addr *u_ip6;
286 #endif
287 	size_t tmplen;
288 	int error, enforce_statfs, fi;
289 
290 	bzero(&optiov, sizeof(optiov));
291 	opt.uio_iov = optiov;
292 	opt.uio_iovcnt = 0;
293 	opt.uio_offset = -1;
294 	opt.uio_resid = -1;
295 	opt.uio_segflg = UIO_SYSSPACE;
296 	opt.uio_rw = UIO_READ;
297 	opt.uio_td = td;
298 
299 	/* Set permissions for top-level jails from sysctls. */
300 	if (!jailed(td->td_ucred)) {
301 		for (fi = 0; fi < sizeof(pr_allow_names) /
302 		     sizeof(pr_allow_names[0]); fi++) {
303 			optiov[opt.uio_iovcnt].iov_base =
304 			    (jail_default_allow & (1 << fi))
305 			    ? pr_allow_names[fi] : pr_allow_nonames[fi];
306 			optiov[opt.uio_iovcnt].iov_len =
307 			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
308 			opt.uio_iovcnt += 2;
309 		}
310 		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
311 		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
312 		opt.uio_iovcnt++;
313 		enforce_statfs = jail_default_enforce_statfs;
314 		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
315 		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
316 		opt.uio_iovcnt++;
317 	}
318 
319 	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
320 #ifdef INET
321 	ip4s = (j->version == 0) ? 1 : j->ip4s;
322 	if (ip4s > jail_max_af_ips)
323 		return (EINVAL);
324 	tmplen += ip4s * sizeof(struct in_addr);
325 #else
326 	if (j->ip4s > 0)
327 		return (EINVAL);
328 #endif
329 #ifdef INET6
330 	if (j->ip6s > jail_max_af_ips)
331 		return (EINVAL);
332 	tmplen += j->ip6s * sizeof(struct in6_addr);
333 #else
334 	if (j->ip6s > 0)
335 		return (EINVAL);
336 #endif
337 	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
338 	u_hostname = u_path + MAXPATHLEN;
339 	u_name = u_hostname + MAXHOSTNAMELEN;
340 #ifdef INET
341 	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
342 #endif
343 #ifdef INET6
344 #ifdef INET
345 	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
346 #else
347 	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
348 #endif
349 #endif
350 	optiov[opt.uio_iovcnt].iov_base = "path";
351 	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
352 	opt.uio_iovcnt++;
353 	optiov[opt.uio_iovcnt].iov_base = u_path;
354 	error = copyinstr(j->path, u_path, MAXPATHLEN,
355 	    &optiov[opt.uio_iovcnt].iov_len);
356 	if (error) {
357 		free(u_path, M_TEMP);
358 		return (error);
359 	}
360 	opt.uio_iovcnt++;
361 	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
362 	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
363 	opt.uio_iovcnt++;
364 	optiov[opt.uio_iovcnt].iov_base = u_hostname;
365 	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
366 	    &optiov[opt.uio_iovcnt].iov_len);
367 	if (error) {
368 		free(u_path, M_TEMP);
369 		return (error);
370 	}
371 	opt.uio_iovcnt++;
372 	if (j->jailname != NULL) {
373 		optiov[opt.uio_iovcnt].iov_base = "name";
374 		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
375 		opt.uio_iovcnt++;
376 		optiov[opt.uio_iovcnt].iov_base = u_name;
377 		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
378 		    &optiov[opt.uio_iovcnt].iov_len);
379 		if (error) {
380 			free(u_path, M_TEMP);
381 			return (error);
382 		}
383 		opt.uio_iovcnt++;
384 	}
385 #ifdef INET
386 	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
387 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
388 	opt.uio_iovcnt++;
389 	optiov[opt.uio_iovcnt].iov_base = u_ip4;
390 	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
391 	if (j->version == 0)
392 		u_ip4->s_addr = j->ip4s;
393 	else {
394 		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
395 		if (error) {
396 			free(u_path, M_TEMP);
397 			return (error);
398 		}
399 	}
400 	opt.uio_iovcnt++;
401 #endif
402 #ifdef INET6
403 	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
404 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
405 	opt.uio_iovcnt++;
406 	optiov[opt.uio_iovcnt].iov_base = u_ip6;
407 	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
408 	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
409 	if (error) {
410 		free(u_path, M_TEMP);
411 		return (error);
412 	}
413 	opt.uio_iovcnt++;
414 #endif
415 	KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]),
416 	    ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
417 	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
418 	free(u_path, M_TEMP);
419 	return (error);
420 }
421 
422 
423 /*
424  * struct jail_set_args {
425  *	struct iovec *iovp;
426  *	unsigned int iovcnt;
427  *	int flags;
428  * };
429  */
430 int
431 jail_set(struct thread *td, struct jail_set_args *uap)
432 {
433 	struct uio *auio;
434 	int error;
435 
436 	/* Check that we have an even number of iovecs. */
437 	if (uap->iovcnt & 1)
438 		return (EINVAL);
439 
440 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
441 	if (error)
442 		return (error);
443 	error = kern_jail_set(td, auio, uap->flags);
444 	free(auio, M_IOV);
445 	return (error);
446 }
447 
448 int
449 kern_jail_set(struct thread *td, struct uio *optuio, int flags)
450 {
451 	struct nameidata nd;
452 #ifdef INET
453 	struct in_addr *ip4;
454 #endif
455 #ifdef INET6
456 	struct in6_addr *ip6;
457 #endif
458 	struct vfsopt *opt;
459 	struct vfsoptlist *opts;
460 	struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
461 	struct vnode *root;
462 	char *domain, *errmsg, *host, *name, *p, *path, *uuid;
463 #if defined(INET) || defined(INET6)
464 	void *op;
465 #endif
466 	unsigned long hid;
467 	size_t namelen, onamelen;
468 	int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos;
469 	int gotenforce, gothid, gotslevel, fi, jid, len;
470 	int slevel, vfslocked;
471 #if defined(INET) || defined(INET6)
472 	int ii, ij;
473 #endif
474 #ifdef INET
475 	int ip4s, ip4a, redo_ip4;
476 #endif
477 #ifdef INET6
478 	int ip6s, ip6a, redo_ip6;
479 #endif
480 	unsigned pr_flags, ch_flags;
481 	unsigned pr_allow, ch_allow, tallow;
482 	char numbuf[12];
483 
484 	error = priv_check(td, PRIV_JAIL_SET);
485 	if (!error && (flags & JAIL_ATTACH))
486 		error = priv_check(td, PRIV_JAIL_ATTACH);
487 	if (error)
488 		return (error);
489 	mypr = ppr = td->td_ucred->cr_prison;
490 	if ((flags & JAIL_CREATE) && !(mypr->pr_allow & PR_ALLOW_JAILS))
491 		return (EPERM);
492 	if (flags & ~JAIL_SET_MASK)
493 		return (EINVAL);
494 
495 	/*
496 	 * Check all the parameters before committing to anything.  Not all
497 	 * errors can be caught early, but we may as well try.  Also, this
498 	 * takes care of some expensive stuff (path lookup) before getting
499 	 * the allprison lock.
500 	 *
501 	 * XXX Jails are not filesystems, and jail parameters are not mount
502 	 *     options.  But it makes more sense to re-use the vfsopt code
503 	 *     than duplicate it under a different name.
504 	 */
505 	error = vfs_buildopts(optuio, &opts);
506 	if (error)
507 		return (error);
508 #ifdef INET
509 	ip4a = 0;
510 	ip4 = NULL;
511 #endif
512 #ifdef INET6
513 	ip6a = 0;
514 	ip6 = NULL;
515 #endif
516 
517 #if defined(INET) || defined(INET6)
518  again:
519 #endif
520 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
521 	if (error == ENOENT)
522 		jid = 0;
523 	else if (error != 0)
524 		goto done_free;
525 
526 	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
527 	if (error == ENOENT)
528 		gotslevel = 0;
529 	else if (error != 0)
530 		goto done_free;
531 	else
532 		gotslevel = 1;
533 
534 	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
535 	gotenforce = (error == 0);
536 	if (gotenforce) {
537 		if (enforce < 0 || enforce > 2)
538 			return (EINVAL);
539 	} else if (error != ENOENT)
540 		goto done_free;
541 
542 	pr_flags = ch_flags = 0;
543 	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
544 	    fi++) {
545 		if (pr_flag_names[fi] == NULL)
546 			continue;
547 		vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
548 		vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
549 	}
550 	ch_flags |= pr_flags;
551 	if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
552 	    && !(pr_flags & PR_PERSIST)) {
553 		error = EINVAL;
554 		vfs_opterror(opts, "new jail must persist or attach");
555 		goto done_errmsg;
556 	}
557 
558 	pr_allow = ch_allow = 0;
559 	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
560 	    fi++) {
561 		vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
562 		vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
563 	}
564 	ch_allow |= pr_allow;
565 
566 	error = vfs_getopt(opts, "name", (void **)&name, &len);
567 	if (error == ENOENT)
568 		name = NULL;
569 	else if (error != 0)
570 		goto done_free;
571 	else {
572 		if (len == 0 || name[len - 1] != '\0') {
573 			error = EINVAL;
574 			goto done_free;
575 		}
576 		if (len > MAXHOSTNAMELEN) {
577 			error = ENAMETOOLONG;
578 			goto done_free;
579 		}
580 	}
581 
582 	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
583 	if (error == ENOENT)
584 		host = NULL;
585 	else if (error != 0)
586 		goto done_free;
587 	else {
588 		ch_flags |= PR_HOST;
589 		pr_flags |= PR_HOST;
590 		if (len == 0 || host[len - 1] != '\0') {
591 			error = EINVAL;
592 			goto done_free;
593 		}
594 		if (len > MAXHOSTNAMELEN) {
595 			error = ENAMETOOLONG;
596 			goto done_free;
597 		}
598 	}
599 
600 	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
601 	if (error == ENOENT)
602 		domain = NULL;
603 	else if (error != 0)
604 		goto done_free;
605 	else {
606 		ch_flags |= PR_HOST;
607 		pr_flags |= PR_HOST;
608 		if (len == 0 || domain[len - 1] != '\0') {
609 			error = EINVAL;
610 			goto done_free;
611 		}
612 		if (len > MAXHOSTNAMELEN) {
613 			error = ENAMETOOLONG;
614 			goto done_free;
615 		}
616 	}
617 
618 	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
619 	if (error == ENOENT)
620 		uuid = NULL;
621 	else if (error != 0)
622 		goto done_free;
623 	else {
624 		ch_flags |= PR_HOST;
625 		pr_flags |= PR_HOST;
626 		if (len == 0 || uuid[len - 1] != '\0') {
627 			error = EINVAL;
628 			goto done_free;
629 		}
630 		if (len > HOSTUUIDLEN) {
631 			error = ENAMETOOLONG;
632 			goto done_free;
633 		}
634 	}
635 
636 #ifdef COMPAT_IA32
637 	if (td->td_proc->p_sysent->sv_flags & SV_IA32) {
638 		uint32_t hid32;
639 
640 		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
641 		hid = hid32;
642 	} else
643 #endif
644 		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
645 	if (error == ENOENT)
646 		gothid = 0;
647 	else if (error != 0)
648 		goto done_free;
649 	else {
650 		gothid = 1;
651 		ch_flags |= PR_HOST;
652 		pr_flags |= PR_HOST;
653 	}
654 
655 	/* This might be the second time around for this option. */
656 #ifdef INET
657 	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
658 	if (error == ENOENT)
659 		ip4s = -1;
660 	else if (error != 0)
661 		goto done_free;
662 	else if (ip4s & (sizeof(*ip4) - 1)) {
663 		error = EINVAL;
664 		goto done_free;
665 	} else {
666 		ch_flags |= PR_IP4_USER;
667 		pr_flags |= PR_IP4_USER;
668 		if (ip4s > 0) {
669 			ip4s /= sizeof(*ip4);
670 			if (ip4s > jail_max_af_ips) {
671 				error = EINVAL;
672 				vfs_opterror(opts, "too many IPv4 addresses");
673 				goto done_errmsg;
674 			}
675 			if (ip4a < ip4s) {
676 				ip4a = ip4s;
677 				free(ip4, M_PRISON);
678 				ip4 = NULL;
679 			}
680 			if (ip4 == NULL)
681 				ip4 = malloc(ip4a * sizeof(*ip4), M_PRISON,
682 				    M_WAITOK);
683 			bcopy(op, ip4, ip4s * sizeof(*ip4));
684 			/*
685 			 * IP addresses are all sorted but ip[0] to preserve
686 			 * the primary IP address as given from userland.
687 			 * This special IP is used for unbound outgoing
688 			 * connections as well for "loopback" traffic.
689 			 */
690 			if (ip4s > 1)
691 				qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
692 			/*
693 			 * Check for duplicate addresses and do some simple
694 			 * zero and broadcast checks. If users give other bogus
695 			 * addresses it is their problem.
696 			 *
697 			 * We do not have to care about byte order for these
698 			 * checks so we will do them in NBO.
699 			 */
700 			for (ii = 0; ii < ip4s; ii++) {
701 				if (ip4[ii].s_addr == INADDR_ANY ||
702 				    ip4[ii].s_addr == INADDR_BROADCAST) {
703 					error = EINVAL;
704 					goto done_free;
705 				}
706 				if ((ii+1) < ip4s &&
707 				    (ip4[0].s_addr == ip4[ii+1].s_addr ||
708 				     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
709 					error = EINVAL;
710 					goto done_free;
711 				}
712 			}
713 		}
714 	}
715 #endif
716 
717 #ifdef INET6
718 	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
719 	if (error == ENOENT)
720 		ip6s = -1;
721 	else if (error != 0)
722 		goto done_free;
723 	else if (ip6s & (sizeof(*ip6) - 1)) {
724 		error = EINVAL;
725 		goto done_free;
726 	} else {
727 		ch_flags |= PR_IP6_USER;
728 		pr_flags |= PR_IP6_USER;
729 		if (ip6s > 0) {
730 			ip6s /= sizeof(*ip6);
731 			if (ip6s > jail_max_af_ips) {
732 				error = EINVAL;
733 				vfs_opterror(opts, "too many IPv6 addresses");
734 				goto done_errmsg;
735 			}
736 			if (ip6a < ip6s) {
737 				ip6a = ip6s;
738 				free(ip6, M_PRISON);
739 				ip6 = NULL;
740 			}
741 			if (ip6 == NULL)
742 				ip6 = malloc(ip6a * sizeof(*ip6), M_PRISON,
743 				    M_WAITOK);
744 			bcopy(op, ip6, ip6s * sizeof(*ip6));
745 			if (ip6s > 1)
746 				qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
747 			for (ii = 0; ii < ip6s; ii++) {
748 				if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
749 					error = EINVAL;
750 					goto done_free;
751 				}
752 				if ((ii+1) < ip6s &&
753 				    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
754 				     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
755 				{
756 					error = EINVAL;
757 					goto done_free;
758 				}
759 			}
760 		}
761 	}
762 #endif
763 
764 	root = NULL;
765 	error = vfs_getopt(opts, "path", (void **)&path, &len);
766 	if (error == ENOENT)
767 		path = NULL;
768 	else if (error != 0)
769 		goto done_free;
770 	else {
771 		if (flags & JAIL_UPDATE) {
772 			error = EINVAL;
773 			vfs_opterror(opts,
774 			    "path cannot be changed after creation");
775 			goto done_errmsg;
776 		}
777 		if (len == 0 || path[len - 1] != '\0') {
778 			error = EINVAL;
779 			goto done_free;
780 		}
781 		if (len < 2 || (len == 2 && path[0] == '/'))
782 			path = NULL;
783 		else {
784 			/* Leave room for a real-root full pathname. */
785 			if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
786 			    ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
787 				error = ENAMETOOLONG;
788 				goto done_free;
789 			}
790 			NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_SYSSPACE,
791 			    path, td);
792 			error = namei(&nd);
793 			if (error)
794 				goto done_free;
795 			vfslocked = NDHASGIANT(&nd);
796 			root = nd.ni_vp;
797 			NDFREE(&nd, NDF_ONLY_PNBUF);
798 			if (root->v_type != VDIR) {
799 				error = ENOTDIR;
800 				vrele(root);
801 				VFS_UNLOCK_GIANT(vfslocked);
802 				goto done_free;
803 			}
804 			VFS_UNLOCK_GIANT(vfslocked);
805 		}
806 	}
807 
808 	/*
809 	 * Grab the allprison lock before letting modules check their
810 	 * parameters.  Once we have it, do not let go so we'll have a
811 	 * consistent view of the OSD list.
812 	 */
813 	sx_xlock(&allprison_lock);
814 	error = osd_jail_call(NULL, PR_METHOD_CHECK, opts);
815 	if (error)
816 		goto done_unlock_list;
817 
818 	/* By now, all parameters should have been noted. */
819 	TAILQ_FOREACH(opt, opts, link) {
820 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
821 			error = EINVAL;
822 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
823 			goto done_unlock_list;
824 		}
825 	}
826 
827 	/*
828 	 * See if we are creating a new record or updating an existing one.
829 	 * This abuses the file error codes ENOENT and EEXIST.
830 	 */
831 	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
832 	if (!cuflags) {
833 		error = EINVAL;
834 		vfs_opterror(opts, "no valid operation (create or update)");
835 		goto done_unlock_list;
836 	}
837 	pr = NULL;
838 	if (jid != 0) {
839 		/*
840 		 * See if a requested jid already exists.  There is an
841 		 * information leak here if the jid exists but is not within
842 		 * the caller's jail hierarchy.  Jail creators will get EEXIST
843 		 * even though they cannot see the jail, and CREATE | UPDATE
844 		 * will return ENOENT which is not normally a valid error.
845 		 */
846 		if (jid < 0) {
847 			error = EINVAL;
848 			vfs_opterror(opts, "negative jid");
849 			goto done_unlock_list;
850 		}
851 		pr = prison_find(jid);
852 		if (pr != NULL) {
853 			ppr = pr->pr_parent;
854 			/* Create: jid must not exist. */
855 			if (cuflags == JAIL_CREATE) {
856 				mtx_unlock(&pr->pr_mtx);
857 				error = EEXIST;
858 				vfs_opterror(opts, "jail %d already exists",
859 				    jid);
860 				goto done_unlock_list;
861 			}
862 			if (!prison_ischild(mypr, pr)) {
863 				mtx_unlock(&pr->pr_mtx);
864 				pr = NULL;
865 			} else if (pr->pr_uref == 0) {
866 				if (!(flags & JAIL_DYING)) {
867 					mtx_unlock(&pr->pr_mtx);
868 					error = ENOENT;
869 					vfs_opterror(opts, "jail %d is dying",
870 					    jid);
871 					goto done_unlock_list;
872 				} else if ((flags & JAIL_ATTACH) ||
873 				    (pr_flags & PR_PERSIST)) {
874 					/*
875 					 * A dying jail might be resurrected
876 					 * (via attach or persist), but first
877 					 * it must determine if another jail
878 					 * has claimed its name.  Accomplish
879 					 * this by implicitly re-setting the
880 					 * name.
881 					 */
882 					if (name == NULL)
883 						name = prison_name(mypr, pr);
884 				}
885 			}
886 		}
887 		if (pr == NULL) {
888 			/* Update: jid must exist. */
889 			if (cuflags == JAIL_UPDATE) {
890 				error = ENOENT;
891 				vfs_opterror(opts, "jail %d not found", jid);
892 				goto done_unlock_list;
893 			}
894 		}
895 	}
896 	/*
897 	 * If the caller provided a name, look for a jail by that name.
898 	 * This has different semantics for creates and updates keyed by jid
899 	 * (where the name must not already exist in a different jail),
900 	 * and updates keyed by the name itself (where the name must exist
901 	 * because that is the jail being updated).
902 	 */
903 	if (name != NULL) {
904 		p = strrchr(name, '.');
905 		if (p != NULL) {
906 			/*
907 			 * This is a hierarchical name.  Split it into the
908 			 * parent and child names, and make sure the parent
909 			 * exists or matches an already found jail.
910 			 */
911 			*p = '\0';
912 			if (pr != NULL) {
913 				if (strncmp(name, ppr->pr_name, p - name) ||
914 				    ppr->pr_name[p - name] != '\0') {
915 					mtx_unlock(&pr->pr_mtx);
916 					error = EINVAL;
917 					vfs_opterror(opts,
918 					    "cannot change jail's parent");
919 					goto done_unlock_list;
920 				}
921 			} else {
922 				ppr = prison_find_name(mypr, name);
923 				if (ppr == NULL) {
924 					error = ENOENT;
925 					vfs_opterror(opts,
926 					    "jail \"%s\" not found", name);
927 					goto done_unlock_list;
928 				}
929 				mtx_unlock(&ppr->pr_mtx);
930 			}
931 			name = p + 1;
932 		}
933 		if (name[0] != '\0') {
934 			namelen =
935 			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
936  name_again:
937 			deadpr = NULL;
938 			FOREACH_PRISON_CHILD(ppr, tpr) {
939 				if (tpr != pr && tpr->pr_ref > 0 &&
940 				    !strcmp(tpr->pr_name + namelen, name)) {
941 					if (pr == NULL &&
942 					    cuflags != JAIL_CREATE) {
943 						mtx_lock(&tpr->pr_mtx);
944 						if (tpr->pr_ref > 0) {
945 							/*
946 							 * Use this jail
947 							 * for updates.
948 							 */
949 							if (tpr->pr_uref > 0) {
950 								pr = tpr;
951 								break;
952 							}
953 							deadpr = tpr;
954 						}
955 						mtx_unlock(&tpr->pr_mtx);
956 					} else if (tpr->pr_uref > 0) {
957 						/*
958 						 * Create, or update(jid):
959 						 * name must not exist in an
960 						 * active sibling jail.
961 						 */
962 						error = EEXIST;
963 						if (pr != NULL)
964 							mtx_unlock(&pr->pr_mtx);
965 						vfs_opterror(opts,
966 						   "jail \"%s\" already exists",
967 						   name);
968 						goto done_unlock_list;
969 					}
970 				}
971 			}
972 			/* If no active jail is found, use a dying one. */
973 			if (deadpr != NULL && pr == NULL) {
974 				if (flags & JAIL_DYING) {
975 					mtx_lock(&deadpr->pr_mtx);
976 					if (deadpr->pr_ref == 0) {
977 						mtx_unlock(&deadpr->pr_mtx);
978 						goto name_again;
979 					}
980 					pr = deadpr;
981 				} else if (cuflags == JAIL_UPDATE) {
982 					error = ENOENT;
983 					vfs_opterror(opts,
984 					    "jail \"%s\" is dying", name);
985 					goto done_unlock_list;
986 				}
987 			}
988 			/* Update: name must exist if no jid. */
989 			else if (cuflags == JAIL_UPDATE && pr == NULL) {
990 				error = ENOENT;
991 				vfs_opterror(opts, "jail \"%s\" not found",
992 				    name);
993 				goto done_unlock_list;
994 			}
995 		}
996 	}
997 	/* Update: must provide a jid or name. */
998 	else if (cuflags == JAIL_UPDATE && pr == NULL) {
999 		error = ENOENT;
1000 		vfs_opterror(opts, "update specified no jail");
1001 		goto done_unlock_list;
1002 	}
1003 
1004 	/* If there's no prison to update, create a new one and link it in. */
1005 	if (pr == NULL) {
1006 		created = 1;
1007 		mtx_lock(&ppr->pr_mtx);
1008 		if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) {
1009 			mtx_unlock(&ppr->pr_mtx);
1010 			error = ENOENT;
1011 			vfs_opterror(opts, "parent jail went away!");
1012 			goto done_unlock_list;
1013 		}
1014 		ppr->pr_ref++;
1015 		ppr->pr_uref++;
1016 		mtx_unlock(&ppr->pr_mtx);
1017 		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1018 		if (jid == 0) {
1019 			/* Find the next free jid. */
1020 			jid = lastprid + 1;
1021  findnext:
1022 			if (jid == JAIL_MAX)
1023 				jid = 1;
1024 			TAILQ_FOREACH(tpr, &allprison, pr_list) {
1025 				if (tpr->pr_id < jid)
1026 					continue;
1027 				if (tpr->pr_id > jid || tpr->pr_ref == 0) {
1028 					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1029 					break;
1030 				}
1031 				if (jid == lastprid) {
1032 					error = EAGAIN;
1033 					vfs_opterror(opts,
1034 					    "no available jail IDs");
1035 					free(pr, M_PRISON);
1036 					prison_deref(ppr, PD_DEREF |
1037 					    PD_DEUREF | PD_LIST_XLOCKED);
1038 					goto done_releroot;
1039 				}
1040 				jid++;
1041 				goto findnext;
1042 			}
1043 			lastprid = jid;
1044 		} else {
1045 			/*
1046 			 * The jail already has a jid (that did not yet exist),
1047 			 * so just find where to insert it.
1048 			 */
1049 			TAILQ_FOREACH(tpr, &allprison, pr_list)
1050 				if (tpr->pr_id >= jid) {
1051 					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1052 					break;
1053 				}
1054 		}
1055 		if (tpr == NULL)
1056 			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1057 		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1058 		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1059 			tpr->pr_prisoncount++;
1060 
1061 		pr->pr_parent = ppr;
1062 		pr->pr_id = jid;
1063 
1064 		/* Set some default values, and inherit some from the parent. */
1065 		if (name == NULL)
1066 			name = "";
1067 		if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1068 			if (host == NULL)
1069 				host = ppr->pr_host;
1070 			if (domain == NULL)
1071 				domain = ppr->pr_domain;
1072 			if (uuid == NULL)
1073 				uuid = ppr->pr_uuid;
1074 			if (!gothid)
1075 				hid = ppr->pr_hostid;
1076 		}
1077 		if (path == NULL) {
1078 			path = "/";
1079 			root = mypr->pr_root;
1080 			vref(root);
1081 		}
1082 #ifdef INET
1083 		pr->pr_flags |= ppr->pr_flags & PR_IP4;
1084 		pr->pr_ip4s = ppr->pr_ip4s;
1085 		if (ppr->pr_ip4 != NULL) {
1086 			pr->pr_ip4 = malloc(pr->pr_ip4s *
1087 			    sizeof(struct in_addr), M_PRISON, M_WAITOK);
1088 			bcopy(ppr->pr_ip4, pr->pr_ip4,
1089 			    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1090 		}
1091 #endif
1092 #ifdef INET6
1093 		pr->pr_flags |= ppr->pr_flags & PR_IP6;
1094 		pr->pr_ip6s = ppr->pr_ip6s;
1095 		if (ppr->pr_ip6 != NULL) {
1096 			pr->pr_ip6 = malloc(pr->pr_ip6s *
1097 			    sizeof(struct in6_addr), M_PRISON, M_WAITOK);
1098 			bcopy(ppr->pr_ip6, pr->pr_ip6,
1099 			    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1100 		}
1101 #endif
1102 		pr->pr_securelevel = ppr->pr_securelevel;
1103 		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1104 		pr->pr_enforce_statfs = ppr->pr_enforce_statfs;
1105 
1106 		LIST_INIT(&pr->pr_children);
1107 		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1108 
1109 		/*
1110 		 * Allocate a dedicated cpuset for each jail.
1111 		 * Unlike other initial settings, this may return an error.
1112 		 */
1113 		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1114 		if (error) {
1115 			prison_deref(pr, PD_LIST_XLOCKED);
1116 			goto done_releroot;
1117 		}
1118 
1119 		mtx_lock(&pr->pr_mtx);
1120 		/*
1121 		 * New prisons do not yet have a reference, because we do not
1122 		 * want others to see the incomplete prison once the
1123 		 * allprison_lock is downgraded.
1124 		 */
1125 	} else {
1126 		created = 0;
1127 		/*
1128 		 * Grab a reference for existing prisons, to ensure they
1129 		 * continue to exist for the duration of the call.
1130 		 */
1131 		pr->pr_ref++;
1132 	}
1133 
1134 	/* Do final error checking before setting anything. */
1135 	if (gotslevel) {
1136 		if (slevel < ppr->pr_securelevel) {
1137 			error = EPERM;
1138 			goto done_deref_locked;
1139 		}
1140 	}
1141 	if (gotenforce) {
1142 		if (enforce < ppr->pr_enforce_statfs) {
1143 			error = EPERM;
1144 			goto done_deref_locked;
1145 		}
1146 	}
1147 #ifdef INET
1148 	if (ch_flags & PR_IP4_USER) {
1149 		if (ppr->pr_flags & PR_IP4) {
1150 			if (!(pr_flags & PR_IP4_USER)) {
1151 				/*
1152 				 * Silently ignore attempts to make the IP
1153 				 * addresses unrestricted when the parent is
1154 				 * restricted; in other words, interpret
1155 				 * "unrestricted" as "as unrestricted as
1156 				 * possible".
1157 				 */
1158 				ip4s = ppr->pr_ip4s;
1159 				if (ip4s == 0) {
1160 					free(ip4, M_PRISON);
1161 					ip4 = NULL;
1162 				} else if (ip4s <= ip4a) {
1163 					/* Inherit the parent's address(es). */
1164 					bcopy(ppr->pr_ip4, ip4,
1165 					    ip4s * sizeof(*ip4));
1166 				} else {
1167 					/*
1168 					 * There's no room for the parent's
1169 					 * address list.  Allocate some more.
1170 					 */
1171 					ip4a = ip4s;
1172 					free(ip4, M_PRISON);
1173 					ip4 = malloc(ip4a * sizeof(*ip4),
1174 					    M_PRISON, M_NOWAIT);
1175 					if (ip4 != NULL)
1176 						bcopy(ppr->pr_ip4, ip4,
1177 						    ip4s * sizeof(*ip4));
1178 					else {
1179 						/* Allocation failed without
1180 						 * sleeping.  Unlocking the
1181 						 * prison now will invalidate
1182 						 * some checks and prematurely
1183 						 * show an unfinished new jail.
1184 						 * So let go of everything and
1185 						 * start over.
1186 						 */
1187 						prison_deref(pr, created
1188 						    ? PD_LOCKED |
1189 						      PD_LIST_XLOCKED
1190 						    : PD_DEREF | PD_LOCKED |
1191 						      PD_LIST_XLOCKED);
1192 						if (root != NULL) {
1193 							vfslocked =
1194 							    VFS_LOCK_GIANT(
1195 							    root->v_mount);
1196 							vrele(root);
1197 							VFS_UNLOCK_GIANT(
1198 							    vfslocked);
1199 						}
1200 						ip4 = malloc(ip4a *
1201 						    sizeof(*ip4), M_PRISON,
1202 						    M_WAITOK);
1203 						goto again;
1204 					}
1205 				}
1206 			} else if (ip4s > 0) {
1207 				/*
1208 				 * Make sure the new set of IP addresses is a
1209 				 * subset of the parent's list.  Don't worry
1210 				 * about the parent being unlocked, as any
1211 				 * setting is done with allprison_lock held.
1212 				 */
1213 				for (ij = 0; ij < ppr->pr_ip4s; ij++)
1214 					if (ip4[0].s_addr ==
1215 					    ppr->pr_ip4[ij].s_addr)
1216 						break;
1217 				if (ij == ppr->pr_ip4s) {
1218 					error = EPERM;
1219 					goto done_deref_locked;
1220 				}
1221 				if (ip4s > 1) {
1222 					for (ii = ij = 1; ii < ip4s; ii++) {
1223 						if (ip4[ii].s_addr ==
1224 						    ppr->pr_ip4[0].s_addr)
1225 							continue;
1226 						for (; ij < ppr->pr_ip4s; ij++)
1227 						    if (ip4[ii].s_addr ==
1228 							ppr->pr_ip4[ij].s_addr)
1229 							    break;
1230 						if (ij == ppr->pr_ip4s)
1231 							break;
1232 					}
1233 					if (ij == ppr->pr_ip4s) {
1234 						error = EPERM;
1235 						goto done_deref_locked;
1236 					}
1237 				}
1238 			}
1239 		}
1240 		if (ip4s > 0) {
1241 			/*
1242 			 * Check for conflicting IP addresses.  We permit them
1243 			 * if there is no more than one IP on each jail.  If
1244 			 * there is a duplicate on a jail with more than one
1245 			 * IP stop checking and return error.
1246 			 */
1247 			FOREACH_PRISON_DESCENDANT(&prison0, tpr, descend) {
1248 				if (tpr == pr || tpr->pr_uref == 0) {
1249 					descend = 0;
1250 					continue;
1251 				}
1252 				if (!(tpr->pr_flags & PR_IP4_USER))
1253 					continue;
1254 				descend = 0;
1255 				if (tpr->pr_ip4 == NULL ||
1256 				    (ip4s == 1 && tpr->pr_ip4s == 1))
1257 					continue;
1258 				for (ii = 0; ii < ip4s; ii++) {
1259 					if (_prison_check_ip4(tpr,
1260 					    &ip4[ii]) == 0) {
1261 						error = EADDRINUSE;
1262 						vfs_opterror(opts,
1263 						    "IPv4 addresses clash");
1264 						goto done_deref_locked;
1265 					}
1266 				}
1267 			}
1268 		}
1269 	}
1270 #endif
1271 #ifdef INET6
1272 	if (ch_flags & PR_IP6_USER) {
1273 		if (ppr->pr_flags & PR_IP6) {
1274 			if (!(pr_flags & PR_IP6_USER)) {
1275 				/*
1276 				 * Silently ignore attempts to make the IP
1277 				 * addresses unrestricted when the parent is
1278 				 * restricted.
1279 				 */
1280 				ip6s = ppr->pr_ip6s;
1281 				if (ip6s == 0) {
1282 					free(ip6, M_PRISON);
1283 					ip6 = NULL;
1284 				} else if (ip6s <= ip6a) {
1285 					/* Inherit the parent's address(es). */
1286 					bcopy(ppr->pr_ip6, ip6,
1287 					    ip6s * sizeof(*ip6));
1288 				} else {
1289 					/*
1290 					 * There's no room for the parent's
1291 					 * address list.
1292 					 */
1293 					ip6a = ip6s;
1294 					free(ip6, M_PRISON);
1295 					ip6 = malloc(ip6a * sizeof(*ip6),
1296 					    M_PRISON, M_NOWAIT);
1297 					if (ip6 != NULL)
1298 						bcopy(ppr->pr_ip6, ip6,
1299 						    ip6s * sizeof(*ip6));
1300 					else {
1301 						prison_deref(pr, created
1302 						    ? PD_LOCKED |
1303 						      PD_LIST_XLOCKED
1304 						    : PD_DEREF | PD_LOCKED |
1305 						      PD_LIST_XLOCKED);
1306 						if (root != NULL) {
1307 							vfslocked =
1308 							    VFS_LOCK_GIANT(
1309 							    root->v_mount);
1310 							vrele(root);
1311 							VFS_UNLOCK_GIANT(
1312 							    vfslocked);
1313 						}
1314 						ip6 = malloc(ip6a *
1315 						    sizeof(*ip6), M_PRISON,
1316 						    M_WAITOK);
1317 						goto again;
1318 					}
1319 				}
1320 			} else if (ip6s > 0) {
1321 				/*
1322 				 * Make sure the new set of IP addresses is a
1323 				 * subset of the parent's list.
1324 				 */
1325 				for (ij = 0; ij < ppr->pr_ip6s; ij++)
1326 					if (IN6_ARE_ADDR_EQUAL(&ip6[0],
1327 					    &ppr->pr_ip6[ij]))
1328 						break;
1329 				if (ij == ppr->pr_ip6s) {
1330 					error = EPERM;
1331 					goto done_deref_locked;
1332 				}
1333 				if (ip6s > 1) {
1334 					for (ii = ij = 1; ii < ip6s; ii++) {
1335 						if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
1336 						    &ppr->pr_ip6[0]))
1337 							continue;
1338 						for (; ij < ppr->pr_ip6s; ij++)
1339 							if (IN6_ARE_ADDR_EQUAL(
1340 							    &ip6[ii],
1341 							    &ppr->pr_ip6[ij]))
1342 								break;
1343 						if (ij == ppr->pr_ip6s)
1344 							break;
1345 					}
1346 					if (ij == ppr->pr_ip6s) {
1347 						error = EPERM;
1348 						goto done_deref_locked;
1349 					}
1350 				}
1351 			}
1352 		}
1353 		if (ip6s > 0) {
1354 			/* Check for conflicting IP addresses. */
1355 			FOREACH_PRISON_DESCENDANT(&prison0, tpr, descend) {
1356 				if (tpr == pr || tpr->pr_uref == 0) {
1357 					descend = 0;
1358 					continue;
1359 				}
1360 				if (!(tpr->pr_flags & PR_IP6_USER))
1361 					continue;
1362 				descend = 0;
1363 				if (tpr->pr_ip6 == NULL ||
1364 				    (ip6s == 1 && tpr->pr_ip6s == 1))
1365 					continue;
1366 				for (ii = 0; ii < ip6s; ii++) {
1367 					if (_prison_check_ip6(tpr,
1368 					    &ip6[ii]) == 0) {
1369 						error = EADDRINUSE;
1370 						vfs_opterror(opts,
1371 						    "IPv6 addresses clash");
1372 						goto done_deref_locked;
1373 					}
1374 				}
1375 			}
1376 		}
1377 	}
1378 #endif
1379 	onamelen = namelen = 0;
1380 	if (name != NULL) {
1381 		/* Give a default name of the jid. */
1382 		if (name[0] == '\0')
1383 			snprintf(name = numbuf, sizeof(numbuf), "%d", jid);
1384 		else if (strtoul(name, &p, 10) != jid && *p == '\0') {
1385 			error = EINVAL;
1386 			vfs_opterror(opts, "name cannot be numeric");
1387 			goto done_deref_locked;
1388 		}
1389 		/*
1390 		 * Make sure the name isn't too long for the prison or its
1391 		 * children.
1392 		 */
1393 		onamelen = strlen(pr->pr_name);
1394 		namelen = strlen(name);
1395 		if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) {
1396 			error = ENAMETOOLONG;
1397 			goto done_deref_locked;
1398 		}
1399 		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1400 			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1401 			    sizeof(pr->pr_name)) {
1402 				error = ENAMETOOLONG;
1403 				goto done_deref_locked;
1404 			}
1405 		}
1406 	}
1407 	if (pr_allow & ~ppr->pr_allow) {
1408 		error = EPERM;
1409 		goto done_deref_locked;
1410 	}
1411 
1412 	/* Set the parameters of the prison. */
1413 #ifdef INET
1414 	redo_ip4 = 0;
1415 	if (ch_flags & PR_IP4_USER) {
1416 		if (pr_flags & PR_IP4_USER) {
1417 			/* Some restriction set. */
1418 			pr->pr_flags |= PR_IP4;
1419 			if (ip4s >= 0) {
1420 				free(pr->pr_ip4, M_PRISON);
1421 				pr->pr_ip4s = ip4s;
1422 				pr->pr_ip4 = ip4;
1423 				ip4 = NULL;
1424 			}
1425 		} else if (ppr->pr_flags & PR_IP4) {
1426 			/* This restriction cleared, but keep inherited. */
1427 			free(pr->pr_ip4, M_PRISON);
1428 			pr->pr_ip4s = ip4s;
1429 			pr->pr_ip4 = ip4;
1430 			ip4 = NULL;
1431 		} else {
1432 			/* Restriction cleared, now unrestricted. */
1433 			pr->pr_flags &= ~PR_IP4;
1434 			free(pr->pr_ip4, M_PRISON);
1435 			pr->pr_ip4s = 0;
1436 		}
1437 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1438 			if (prison_restrict_ip4(tpr, NULL)) {
1439 				redo_ip4 = 1;
1440 				descend = 0;
1441 			}
1442 		}
1443 	}
1444 #endif
1445 #ifdef INET6
1446 	redo_ip6 = 0;
1447 	if (ch_flags & PR_IP6_USER) {
1448 		if (pr_flags & PR_IP6_USER) {
1449 			/* Some restriction set. */
1450 			pr->pr_flags |= PR_IP6;
1451 			if (ip6s >= 0) {
1452 				free(pr->pr_ip6, M_PRISON);
1453 				pr->pr_ip6s = ip6s;
1454 				pr->pr_ip6 = ip6;
1455 				ip6 = NULL;
1456 			}
1457 		} else if (ppr->pr_flags & PR_IP6) {
1458 			/* This restriction cleared, but keep inherited. */
1459 			free(pr->pr_ip6, M_PRISON);
1460 			pr->pr_ip6s = ip6s;
1461 			pr->pr_ip6 = ip6;
1462 			ip6 = NULL;
1463 		} else {
1464 			/* Restriction cleared, now unrestricted. */
1465 			pr->pr_flags &= ~PR_IP6;
1466 			free(pr->pr_ip6, M_PRISON);
1467 			pr->pr_ip6s = 0;
1468 		}
1469 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1470 			if (prison_restrict_ip6(tpr, NULL)) {
1471 				redo_ip6 = 1;
1472 				descend = 0;
1473 			}
1474 		}
1475 	}
1476 #endif
1477 	if (gotslevel) {
1478 		pr->pr_securelevel = slevel;
1479 		/* Set all child jails to be at least this level. */
1480 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1481 			if (tpr->pr_securelevel < slevel)
1482 				tpr->pr_securelevel = slevel;
1483 	}
1484 	if (gotenforce) {
1485 		pr->pr_enforce_statfs = enforce;
1486 		/* Pass this restriction on to the children. */
1487 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1488 			if (tpr->pr_enforce_statfs < enforce)
1489 				tpr->pr_enforce_statfs = enforce;
1490 	}
1491 	if (name != NULL) {
1492 		if (ppr == &prison0)
1493 			strlcpy(pr->pr_name, name, sizeof(pr->pr_name));
1494 		else
1495 			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1496 			    ppr->pr_name, name);
1497 		/* Change this component of child names. */
1498 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1499 			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1500 			    strlen(tpr->pr_name + onamelen) + 1);
1501 			bcopy(pr->pr_name, tpr->pr_name, namelen);
1502 		}
1503 	}
1504 	if (path != NULL) {
1505 		/* Try to keep a real-rooted full pathname. */
1506 		if (path[0] == '/' && strcmp(mypr->pr_path, "/"))
1507 			snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
1508 			    mypr->pr_path, path);
1509 		else
1510 			strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1511 		pr->pr_root = root;
1512 	}
1513 	if (PR_HOST & ch_flags & ~pr_flags) {
1514 		if (pr->pr_flags & PR_HOST) {
1515 			/*
1516 			 * Copy the parent's host info.  As with pr_ip4 above,
1517 			 * the lack of a lock on the parent is not a problem;
1518 			 * it is always set with allprison_lock at least
1519 			 * shared, and is held exclusively here.
1520 			 */
1521 			strlcpy(pr->pr_host, pr->pr_parent->pr_host,
1522 			    sizeof(pr->pr_host));
1523 			strlcpy(pr->pr_domain, pr->pr_parent->pr_domain,
1524 			    sizeof(pr->pr_domain));
1525 			strlcpy(pr->pr_uuid, pr->pr_parent->pr_uuid,
1526 			    sizeof(pr->pr_uuid));
1527 			pr->pr_hostid = pr->pr_parent->pr_hostid;
1528 		}
1529 	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1530 		/* Set this prison, and any descendants without PR_HOST. */
1531 		if (host != NULL)
1532 			strlcpy(pr->pr_host, host, sizeof(pr->pr_host));
1533 		if (domain != NULL)
1534 			strlcpy(pr->pr_domain, domain, sizeof(pr->pr_domain));
1535 		if (uuid != NULL)
1536 			strlcpy(pr->pr_uuid, uuid, sizeof(pr->pr_uuid));
1537 		if (gothid)
1538 			pr->pr_hostid = hid;
1539 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1540 			if (tpr->pr_flags & PR_HOST)
1541 				descend = 0;
1542 			else {
1543 				if (host != NULL)
1544 					strlcpy(tpr->pr_host, pr->pr_host,
1545 					    sizeof(tpr->pr_host));
1546 				if (domain != NULL)
1547 					strlcpy(tpr->pr_domain, pr->pr_domain,
1548 					    sizeof(tpr->pr_domain));
1549 				if (uuid != NULL)
1550 					strlcpy(tpr->pr_uuid, pr->pr_uuid,
1551 					    sizeof(tpr->pr_uuid));
1552 				if (gothid)
1553 					tpr->pr_hostid = hid;
1554 			}
1555 		}
1556 	}
1557 	if ((tallow = ch_allow & ~pr_allow)) {
1558 		/* Clear allow bits in all children. */
1559 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1560 			tpr->pr_allow &= ~tallow;
1561 	}
1562 	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1563 	/*
1564 	 * Persistent prisons get an extra reference, and prisons losing their
1565 	 * persist flag lose that reference.  Only do this for existing prisons
1566 	 * for now, so new ones will remain unseen until after the module
1567 	 * handlers have completed.
1568 	 */
1569 	if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
1570 		if (pr_flags & PR_PERSIST) {
1571 			pr->pr_ref++;
1572 			pr->pr_uref++;
1573 		} else {
1574 			pr->pr_ref--;
1575 			pr->pr_uref--;
1576 		}
1577 	}
1578 	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1579 	mtx_unlock(&pr->pr_mtx);
1580 
1581 	/* Locks may have prevented a complete restriction of child IP
1582 	 * addresses.  If so, allocate some more memory and try again.
1583 	 */
1584 #ifdef INET
1585 	while (redo_ip4) {
1586 		ip4s = pr->pr_ip4s;
1587 		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
1588 		mtx_lock(&pr->pr_mtx);
1589 		redo_ip4 = 0;
1590 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1591 			if (prison_restrict_ip4(tpr, ip4)) {
1592 				if (ip4 != NULL)
1593 					ip4 = NULL;
1594 				else
1595 					redo_ip4 = 1;
1596 			}
1597 		}
1598 		mtx_unlock(&pr->pr_mtx);
1599 	}
1600 #endif
1601 #ifdef INET6
1602 	while (redo_ip6) {
1603 		ip6s = pr->pr_ip6s;
1604 		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
1605 		mtx_lock(&pr->pr_mtx);
1606 		redo_ip6 = 0;
1607 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1608 			if (prison_restrict_ip6(tpr, ip6)) {
1609 				if (ip6 != NULL)
1610 					ip6 = NULL;
1611 				else
1612 					redo_ip6 = 1;
1613 			}
1614 		}
1615 		mtx_unlock(&pr->pr_mtx);
1616 	}
1617 #endif
1618 
1619 	/* Let the modules do their work. */
1620 	sx_downgrade(&allprison_lock);
1621 	if (created) {
1622 		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1623 		if (error) {
1624 			prison_deref(pr, PD_LIST_SLOCKED);
1625 			goto done_errmsg;
1626 		}
1627 	}
1628 	error = osd_jail_call(pr, PR_METHOD_SET, opts);
1629 	if (error) {
1630 		prison_deref(pr, created
1631 		    ? PD_LIST_SLOCKED
1632 		    : PD_DEREF | PD_LIST_SLOCKED);
1633 		goto done_errmsg;
1634 	}
1635 
1636 	/* Attach this process to the prison if requested. */
1637 	if (flags & JAIL_ATTACH) {
1638 		mtx_lock(&pr->pr_mtx);
1639 		error = do_jail_attach(td, pr);
1640 		if (error) {
1641 			vfs_opterror(opts, "attach failed");
1642 			if (!created)
1643 				prison_deref(pr, PD_DEREF);
1644 			goto done_errmsg;
1645 		}
1646 	}
1647 
1648 	/*
1649 	 * Now that it is all there, drop the temporary reference from existing
1650 	 * prisons.  Or add a reference to newly created persistent prisons
1651 	 * (which was not done earlier so that the prison would not be publicly
1652 	 * visible).
1653 	 */
1654 	if (!created) {
1655 		prison_deref(pr, (flags & JAIL_ATTACH)
1656 		    ? PD_DEREF
1657 		    : PD_DEREF | PD_LIST_SLOCKED);
1658 	} else {
1659 		if (pr_flags & PR_PERSIST) {
1660 			mtx_lock(&pr->pr_mtx);
1661 			pr->pr_ref++;
1662 			pr->pr_uref++;
1663 			mtx_unlock(&pr->pr_mtx);
1664 		}
1665 		if (!(flags & JAIL_ATTACH))
1666 			sx_sunlock(&allprison_lock);
1667 	}
1668 	td->td_retval[0] = pr->pr_id;
1669 	goto done_errmsg;
1670 
1671  done_deref_locked:
1672 	prison_deref(pr, created
1673 	    ? PD_LOCKED | PD_LIST_XLOCKED
1674 	    : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
1675 	goto done_releroot;
1676  done_unlock_list:
1677 	sx_xunlock(&allprison_lock);
1678  done_releroot:
1679 	if (root != NULL) {
1680 		vfslocked = VFS_LOCK_GIANT(root->v_mount);
1681 		vrele(root);
1682 		VFS_UNLOCK_GIANT(vfslocked);
1683 	}
1684  done_errmsg:
1685 	if (error) {
1686 		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
1687 		if (errmsg_len > 0) {
1688 			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1689 			if (errmsg_pos > 0) {
1690 				if (optuio->uio_segflg == UIO_SYSSPACE)
1691 					bcopy(errmsg,
1692 					   optuio->uio_iov[errmsg_pos].iov_base,
1693 					   errmsg_len);
1694 				else
1695 					copyout(errmsg,
1696 					   optuio->uio_iov[errmsg_pos].iov_base,
1697 					   errmsg_len);
1698 			}
1699 		}
1700 	}
1701  done_free:
1702 #ifdef INET
1703 	free(ip4, M_PRISON);
1704 #endif
1705 #ifdef INET6
1706 	free(ip6, M_PRISON);
1707 #endif
1708 	vfs_freeopts(opts);
1709 	return (error);
1710 }
1711 
1712 
1713 /*
1714  * struct jail_get_args {
1715  *	struct iovec *iovp;
1716  *	unsigned int iovcnt;
1717  *	int flags;
1718  * };
1719  */
1720 int
1721 jail_get(struct thread *td, struct jail_get_args *uap)
1722 {
1723 	struct uio *auio;
1724 	int error;
1725 
1726 	/* Check that we have an even number of iovecs. */
1727 	if (uap->iovcnt & 1)
1728 		return (EINVAL);
1729 
1730 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
1731 	if (error)
1732 		return (error);
1733 	error = kern_jail_get(td, auio, uap->flags);
1734 	if (error == 0)
1735 		error = copyout(auio->uio_iov, uap->iovp,
1736 		    uap->iovcnt * sizeof (struct iovec));
1737 	free(auio, M_IOV);
1738 	return (error);
1739 }
1740 
1741 int
1742 kern_jail_get(struct thread *td, struct uio *optuio, int flags)
1743 {
1744 	struct prison *pr, *mypr;
1745 	struct vfsopt *opt;
1746 	struct vfsoptlist *opts;
1747 	char *errmsg, *name;
1748 	int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;
1749 
1750 	if (flags & ~JAIL_GET_MASK)
1751 		return (EINVAL);
1752 
1753 	/* Get the parameter list. */
1754 	error = vfs_buildopts(optuio, &opts);
1755 	if (error)
1756 		return (error);
1757 	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
1758 	mypr = td->td_ucred->cr_prison;
1759 
1760 	/*
1761 	 * Find the prison specified by one of: lastjid, jid, name.
1762 	 */
1763 	sx_slock(&allprison_lock);
1764 	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
1765 	if (error == 0) {
1766 		TAILQ_FOREACH(pr, &allprison, pr_list) {
1767 			if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
1768 				mtx_lock(&pr->pr_mtx);
1769 				if (pr->pr_ref > 0 &&
1770 				    (pr->pr_uref > 0 || (flags & JAIL_DYING)))
1771 					break;
1772 				mtx_unlock(&pr->pr_mtx);
1773 			}
1774 		}
1775 		if (pr != NULL)
1776 			goto found_prison;
1777 		error = ENOENT;
1778 		vfs_opterror(opts, "no jail after %d", jid);
1779 		goto done_unlock_list;
1780 	} else if (error != ENOENT)
1781 		goto done_unlock_list;
1782 
1783 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
1784 	if (error == 0) {
1785 		if (jid != 0) {
1786 			pr = prison_find_child(mypr, jid);
1787 			if (pr != NULL) {
1788 				if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
1789 					mtx_unlock(&pr->pr_mtx);
1790 					error = ENOENT;
1791 					vfs_opterror(opts, "jail %d is dying",
1792 					    jid);
1793 					goto done_unlock_list;
1794 				}
1795 				goto found_prison;
1796 			}
1797 			error = ENOENT;
1798 			vfs_opterror(opts, "jail %d not found", jid);
1799 			goto done_unlock_list;
1800 		}
1801 	} else if (error != ENOENT)
1802 		goto done_unlock_list;
1803 
1804 	error = vfs_getopt(opts, "name", (void **)&name, &len);
1805 	if (error == 0) {
1806 		if (len == 0 || name[len - 1] != '\0') {
1807 			error = EINVAL;
1808 			goto done_unlock_list;
1809 		}
1810 		pr = prison_find_name(mypr, name);
1811 		if (pr != NULL) {
1812 			if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
1813 				mtx_unlock(&pr->pr_mtx);
1814 				error = ENOENT;
1815 				vfs_opterror(opts, "jail \"%s\" is dying",
1816 				    name);
1817 				goto done_unlock_list;
1818 			}
1819 			goto found_prison;
1820 		}
1821 		error = ENOENT;
1822 		vfs_opterror(opts, "jail \"%s\" not found", name);
1823 		goto done_unlock_list;
1824 	} else if (error != ENOENT)
1825 		goto done_unlock_list;
1826 
1827 	vfs_opterror(opts, "no jail specified");
1828 	error = ENOENT;
1829 	goto done_unlock_list;
1830 
1831  found_prison:
1832 	/* Get the parameters of the prison. */
1833 	pr->pr_ref++;
1834 	locked = PD_LOCKED;
1835 	td->td_retval[0] = pr->pr_id;
1836 	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
1837 	if (error != 0 && error != ENOENT)
1838 		goto done_deref;
1839 	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
1840 	error = vfs_setopt(opts, "parent", &i, sizeof(i));
1841 	if (error != 0 && error != ENOENT)
1842 		goto done_deref;
1843 	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
1844 	if (error != 0 && error != ENOENT)
1845 		goto done_deref;
1846 	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
1847 	    sizeof(pr->pr_cpuset->cs_id));
1848 	if (error != 0 && error != ENOENT)
1849 		goto done_deref;
1850 	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
1851 	if (error != 0 && error != ENOENT)
1852 		goto done_deref;
1853 #ifdef INET
1854 	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
1855 	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1856 	if (error != 0 && error != ENOENT)
1857 		goto done_deref;
1858 #endif
1859 #ifdef INET6
1860 	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
1861 	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1862 	if (error != 0 && error != ENOENT)
1863 		goto done_deref;
1864 #endif
1865 	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
1866 	    sizeof(pr->pr_securelevel));
1867 	if (error != 0 && error != ENOENT)
1868 		goto done_deref;
1869 	error = vfs_setopts(opts, "host.hostname", pr->pr_host);
1870 	if (error != 0 && error != ENOENT)
1871 		goto done_deref;
1872 	error = vfs_setopts(opts, "host.domainname", pr->pr_domain);
1873 	if (error != 0 && error != ENOENT)
1874 		goto done_deref;
1875 	error = vfs_setopts(opts, "host.hostuuid", pr->pr_uuid);
1876 	if (error != 0 && error != ENOENT)
1877 		goto done_deref;
1878 #ifdef COMPAT_IA32
1879 	if (td->td_proc->p_sysent->sv_flags & SV_IA32) {
1880 		uint32_t hid32 = pr->pr_hostid;
1881 
1882 		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
1883 	} else
1884 #endif
1885 	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
1886 	    sizeof(pr->pr_hostid));
1887 	if (error != 0 && error != ENOENT)
1888 		goto done_deref;
1889 	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
1890 	    sizeof(pr->pr_enforce_statfs));
1891 	if (error != 0 && error != ENOENT)
1892 		goto done_deref;
1893 	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
1894 	    fi++) {
1895 		if (pr_flag_names[fi] == NULL)
1896 			continue;
1897 		i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
1898 		error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
1899 		if (error != 0 && error != ENOENT)
1900 			goto done_deref;
1901 		i = !i;
1902 		error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
1903 		if (error != 0 && error != ENOENT)
1904 			goto done_deref;
1905 	}
1906 	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
1907 	    fi++) {
1908 		if (pr_allow_names[fi] == NULL)
1909 			continue;
1910 		i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
1911 		error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
1912 		if (error != 0 && error != ENOENT)
1913 			goto done_deref;
1914 		i = !i;
1915 		error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
1916 		if (error != 0 && error != ENOENT)
1917 			goto done_deref;
1918 	}
1919 	i = (pr->pr_uref == 0);
1920 	error = vfs_setopt(opts, "dying", &i, sizeof(i));
1921 	if (error != 0 && error != ENOENT)
1922 		goto done_deref;
1923 	i = !i;
1924 	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
1925 	if (error != 0 && error != ENOENT)
1926 		goto done_deref;
1927 
1928 	/* Get the module parameters. */
1929 	mtx_unlock(&pr->pr_mtx);
1930 	locked = 0;
1931 	error = osd_jail_call(pr, PR_METHOD_GET, opts);
1932 	if (error)
1933 		goto done_deref;
1934 	prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
1935 
1936 	/* By now, all parameters should have been noted. */
1937 	TAILQ_FOREACH(opt, opts, link) {
1938 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
1939 			error = EINVAL;
1940 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
1941 			goto done_errmsg;
1942 		}
1943 	}
1944 
1945 	/* Write the fetched parameters back to userspace. */
1946 	error = 0;
1947 	TAILQ_FOREACH(opt, opts, link) {
1948 		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
1949 			pos = 2 * opt->pos + 1;
1950 			optuio->uio_iov[pos].iov_len = opt->len;
1951 			if (opt->value != NULL) {
1952 				if (optuio->uio_segflg == UIO_SYSSPACE) {
1953 					bcopy(opt->value,
1954 					    optuio->uio_iov[pos].iov_base,
1955 					    opt->len);
1956 				} else {
1957 					error = copyout(opt->value,
1958 					    optuio->uio_iov[pos].iov_base,
1959 					    opt->len);
1960 					if (error)
1961 						break;
1962 				}
1963 			}
1964 		}
1965 	}
1966 	goto done_errmsg;
1967 
1968  done_deref:
1969 	prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
1970 	goto done_errmsg;
1971 
1972  done_unlock_list:
1973 	sx_sunlock(&allprison_lock);
1974  done_errmsg:
1975 	if (error && errmsg_pos >= 0) {
1976 		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
1977 		errmsg_pos = 2 * errmsg_pos + 1;
1978 		if (errmsg_len > 0) {
1979 			if (optuio->uio_segflg == UIO_SYSSPACE)
1980 				bcopy(errmsg,
1981 				    optuio->uio_iov[errmsg_pos].iov_base,
1982 				    errmsg_len);
1983 			else
1984 				copyout(errmsg,
1985 				    optuio->uio_iov[errmsg_pos].iov_base,
1986 				    errmsg_len);
1987 		}
1988 	}
1989 	vfs_freeopts(opts);
1990 	return (error);
1991 }
1992 
1993 
1994 /*
1995  * struct jail_remove_args {
1996  *	int jid;
1997  * };
1998  */
1999 int
2000 jail_remove(struct thread *td, struct jail_remove_args *uap)
2001 {
2002 	struct prison *pr, *cpr, *lpr, *tpr;
2003 	int descend, error;
2004 
2005 	error = priv_check(td, PRIV_JAIL_REMOVE);
2006 	if (error)
2007 		return (error);
2008 
2009 	sx_xlock(&allprison_lock);
2010 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2011 	if (pr == NULL) {
2012 		sx_xunlock(&allprison_lock);
2013 		return (EINVAL);
2014 	}
2015 
2016 	/* Remove all descendants of this prison, then remove this prison. */
2017 	pr->pr_ref++;
2018 	pr->pr_flags |= PR_REMOVE;
2019 	if (!LIST_EMPTY(&pr->pr_children)) {
2020 		mtx_unlock(&pr->pr_mtx);
2021 		lpr = NULL;
2022 		FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
2023 			mtx_lock(&cpr->pr_mtx);
2024 			if (cpr->pr_ref > 0) {
2025 				tpr = cpr;
2026 				cpr->pr_ref++;
2027 				cpr->pr_flags |= PR_REMOVE;
2028 			} else {
2029 				/* Already removed - do not do it again. */
2030 				tpr = NULL;
2031 			}
2032 			mtx_unlock(&cpr->pr_mtx);
2033 			if (lpr != NULL) {
2034 				mtx_lock(&lpr->pr_mtx);
2035 				prison_remove_one(lpr);
2036 				sx_xlock(&allprison_lock);
2037 			}
2038 			lpr = tpr;
2039 		}
2040 		if (lpr != NULL) {
2041 			mtx_lock(&lpr->pr_mtx);
2042 			prison_remove_one(lpr);
2043 			sx_xlock(&allprison_lock);
2044 		}
2045 		mtx_lock(&pr->pr_mtx);
2046 	}
2047 	prison_remove_one(pr);
2048 	return (0);
2049 }
2050 
2051 static void
2052 prison_remove_one(struct prison *pr)
2053 {
2054 	struct proc *p;
2055 	int deuref;
2056 
2057 	/* If the prison was persistent, it is not anymore. */
2058 	deuref = 0;
2059 	if (pr->pr_flags & PR_PERSIST) {
2060 		pr->pr_ref--;
2061 		deuref = PD_DEUREF;
2062 		pr->pr_flags &= ~PR_PERSIST;
2063 	}
2064 
2065 	/*
2066 	 * jail_remove added a reference.  If that's the only one, remove
2067 	 * the prison now.
2068 	 */
2069 	KASSERT(pr->pr_ref > 0,
2070 	    ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
2071 	if (pr->pr_ref == 1) {
2072 		prison_deref(pr,
2073 		    deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
2074 		return;
2075 	}
2076 
2077 	mtx_unlock(&pr->pr_mtx);
2078 	sx_xunlock(&allprison_lock);
2079 	/*
2080 	 * Kill all processes unfortunate enough to be attached to this prison.
2081 	 */
2082 	sx_slock(&allproc_lock);
2083 	LIST_FOREACH(p, &allproc, p_list) {
2084 		PROC_LOCK(p);
2085 		if (p->p_state != PRS_NEW && p->p_ucred &&
2086 		    p->p_ucred->cr_prison == pr)
2087 			psignal(p, SIGKILL);
2088 		PROC_UNLOCK(p);
2089 	}
2090 	sx_sunlock(&allproc_lock);
2091 	/* Remove the temporary reference added by jail_remove. */
2092 	prison_deref(pr, deuref | PD_DEREF);
2093 }
2094 
2095 
2096 /*
2097  * struct jail_attach_args {
2098  *	int jid;
2099  * };
2100  */
2101 int
2102 jail_attach(struct thread *td, struct jail_attach_args *uap)
2103 {
2104 	struct prison *pr;
2105 	int error;
2106 
2107 	error = priv_check(td, PRIV_JAIL_ATTACH);
2108 	if (error)
2109 		return (error);
2110 
2111 	sx_slock(&allprison_lock);
2112 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2113 	if (pr == NULL) {
2114 		sx_sunlock(&allprison_lock);
2115 		return (EINVAL);
2116 	}
2117 
2118 	/*
2119 	 * Do not allow a process to attach to a prison that is not
2120 	 * considered to be "alive".
2121 	 */
2122 	if (pr->pr_uref == 0) {
2123 		mtx_unlock(&pr->pr_mtx);
2124 		sx_sunlock(&allprison_lock);
2125 		return (EINVAL);
2126 	}
2127 
2128 	return (do_jail_attach(td, pr));
2129 }
2130 
/*
 * Move the process owning td into prison pr.
 *
 * Entered with pr's mutex held and allprison_lock share-locked (see
 * jail_attach); both are released on every path out of this function.
 * On success the process gets a new credential pointing at pr and the
 * reference it held on its previous prison is dropped.  On failure the
 * references taken here are dropped again and the error is returned.
 */
static int
do_jail_attach(struct thread *td, struct prison *pr)
{
	struct prison *ppr;
	struct proc *p;
	struct ucred *newcred, *oldcred;
	int vfslocked, error;

	/*
	 * XXX: Note that there is a slight race here if two threads
	 * in the same privileged process attempt to attach to two
	 * different jails at the same time.  It is important for
	 * user processes not to do this, or they might end up with
	 * a process root from one prison, but attached to the jail
	 * of another.
	 */
	/* Take both a plain and a user reference on behalf of the process. */
	pr->pr_ref++;
	pr->pr_uref++;
	mtx_unlock(&pr->pr_mtx);

	/* Let modules do whatever they need to prepare for attaching. */
	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
	if (error) {
		/* prison_deref() also releases the shared allprison lock. */
		prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
		return (error);
	}
	sx_sunlock(&allprison_lock);

	/*
	 * Reparent the newly attached process to this jail.
	 */
	ppr = td->td_ucred->cr_prison;
	p = td->td_proc;
	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
	if (error)
		goto e_revert_osd;

	/* Switch the current directory, then the root, to the jail's root. */
	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
	if ((error = change_dir(pr->pr_root, td)) != 0)
		goto e_unlock;
#ifdef MAC
	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
		goto e_unlock;
#endif
	VOP_UNLOCK(pr->pr_root, 0);
	if ((error = change_root(pr->pr_root, td)))
		goto e_unlock_giant;
	VFS_UNLOCK_GIANT(vfslocked);

	/* Swap in a copy of the credential that points at the new prison. */
	newcred = crget();
	PROC_LOCK(p);
	oldcred = p->p_ucred;
	setsugid(p);
	crcopy(newcred, oldcred);
	newcred->cr_prison = pr;
	p->p_ucred = newcred;
	PROC_UNLOCK(p);
	crfree(oldcred);
	/* The process no longer counts against its previous prison. */
	prison_deref(ppr, PD_DEREF | PD_DEUREF);
	return (0);
 e_unlock:
	VOP_UNLOCK(pr->pr_root, 0);
 e_unlock_giant:
	VFS_UNLOCK_GIANT(vfslocked);
 e_revert_osd:
	/* Tell modules this thread is still in its old jail after all. */
	(void)osd_jail_call(ppr, PR_METHOD_ATTACH, td);
	/* Drop the references taken at the top of the function. */
	prison_deref(pr, PD_DEREF | PD_DEUREF);
	return (error);
}
2202 
2203 
2204 /*
2205  * Returns a locked prison instance, or NULL on failure.
2206  */
2207 struct prison *
2208 prison_find(int prid)
2209 {
2210 	struct prison *pr;
2211 
2212 	sx_assert(&allprison_lock, SX_LOCKED);
2213 	TAILQ_FOREACH(pr, &allprison, pr_list) {
2214 		if (pr->pr_id == prid) {
2215 			mtx_lock(&pr->pr_mtx);
2216 			if (pr->pr_ref > 0)
2217 				return (pr);
2218 			mtx_unlock(&pr->pr_mtx);
2219 		}
2220 	}
2221 	return (NULL);
2222 }
2223 
2224 /*
2225  * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2226  */
2227 struct prison *
2228 prison_find_child(struct prison *mypr, int prid)
2229 {
2230 	struct prison *pr;
2231 	int descend;
2232 
2233 	sx_assert(&allprison_lock, SX_LOCKED);
2234 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2235 		if (pr->pr_id == prid) {
2236 			mtx_lock(&pr->pr_mtx);
2237 			if (pr->pr_ref > 0)
2238 				return (pr);
2239 			mtx_unlock(&pr->pr_mtx);
2240 		}
2241 	}
2242 	return (NULL);
2243 }
2244 
/*
 * Look for the name relative to mypr.  Returns a locked prison or NULL.
 *
 * The caller must hold allprison_lock.  Names of descendants are compared
 * after skipping mypr's own name plus one more character (nothing is
 * skipped when mypr is prison0).  A prison that still has user references
 * is preferred; if only a referenced prison without user references
 * matches, that one is returned instead.
 */
struct prison *
prison_find_name(struct prison *mypr, const char *name)
{
	struct prison *pr, *deadpr;
	size_t mylen;
	int descend;

	sx_assert(&allprison_lock, SX_LOCKED);
	/* Length of the prefix to strip from each descendant's full name. */
	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
 again:
	deadpr = NULL;
	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
		if (!strcmp(pr->pr_name + mylen, name)) {
			mtx_lock(&pr->pr_mtx);
			if (pr->pr_ref > 0) {
				/* A live match wins immediately (kept locked). */
				if (pr->pr_uref > 0)
					return (pr);
				/* Remember the dying match as a fallback. */
				deadpr = pr;
			}
			mtx_unlock(&pr->pr_mtx);
		}
	}
	/* There was no valid prison - perhaps there was a dying one. */
	if (deadpr != NULL) {
		/*
		 * The fallback was unlocked during the scan, so re-check it
		 * under its mutex; if it lost its last reference in the
		 * meantime, start the search over.
		 */
		mtx_lock(&deadpr->pr_mtx);
		if (deadpr->pr_ref == 0) {
			mtx_unlock(&deadpr->pr_mtx);
			goto again;
		}
	}
	return (deadpr);
}
2280 
2281 /*
2282  * See if a prison has the specific flag set.
2283  */
2284 int
2285 prison_flag(struct ucred *cred, unsigned flag)
2286 {
2287 
2288 	/* This is an atomic read, so no locking is necessary. */
2289 	return (cred->cr_prison->pr_flags & flag);
2290 }
2291 
2292 int
2293 prison_allow(struct ucred *cred, unsigned flag)
2294 {
2295 
2296 	/* This is an atomic read, so no locking is necessary. */
2297 	return (cred->cr_prison->pr_allow & flag);
2298 }
2299 
2300 /*
2301  * Remove a prison reference.  If that was the last reference, remove the
2302  * prison itself - but not in this context in case there are locks held.
2303  */
2304 void
2305 prison_free_locked(struct prison *pr)
2306 {
2307 
2308 	mtx_assert(&pr->pr_mtx, MA_OWNED);
2309 	pr->pr_ref--;
2310 	if (pr->pr_ref == 0) {
2311 		mtx_unlock(&pr->pr_mtx);
2312 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
2313 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2314 		return;
2315 	}
2316 	mtx_unlock(&pr->pr_mtx);
2317 }
2318 
/*
 * Drop one reference on a prison.  Takes the prison mutex and lets
 * prison_free_locked() do the work, which also releases the mutex.
 */
void
prison_free(struct prison *pr)
{

	mtx_lock(&pr->pr_mtx);
	prison_free_locked(pr);
}
2326 
/*
 * Task handler queued by prison_free_locked().  The context is the prison
 * to finish off; prison_deref() is called with no flags, so it takes no
 * further reference away and just performs the cleanup.  The pending count
 * is unused.
 */
static void
prison_complete(void *context, int pending)
{
	struct prison *pr;

	pr = context;
	prison_deref(pr, 0);
}
2333 
/*
 * Remove a prison reference (usually).  This internal version assumes no
 * mutexes are held, except perhaps the prison itself.  If there are no more
 * references, release and delist the prison.  On completion, the prison lock
 * and the allprison lock are both unlocked.
 *
 * The flags describe what to drop and what the caller already holds:
 *	PD_DEREF	 decrement pr_ref
 *	PD_DEUREF	 decrement pr_uref (and that of parents, see below)
 *	PD_LOCKED	 the caller holds pr's mutex
 *	PD_LIST_SLOCKED	 the caller holds allprison_lock shared
 *	PD_LIST_XLOCKED	 the caller holds allprison_lock exclusively
 */
static void
prison_deref(struct prison *pr, int flags)
{
	struct prison *ppr, *tpr;
	int vfslocked;

	if (!(flags & PD_LOCKED))
		mtx_lock(&pr->pr_mtx);
	/* Decrement the user references in a separate loop. */
	if (flags & PD_DEUREF) {
		/*
		 * Walk up the parent chain, dropping one user reference at
		 * each level, until a prison is reached that still has user
		 * references afterwards.  The loop exits with that prison
		 * (tpr) locked; every prison passed on the way, including pr
		 * itself if its count hit zero, has been unlocked.
		 */
		for (tpr = pr;; tpr = tpr->pr_parent) {
			if (tpr != pr)
				mtx_lock(&tpr->pr_mtx);
			if (--tpr->pr_uref > 0)
				break;
			KASSERT(tpr != &prison0, ("prison0 pr_uref=0"));
			mtx_unlock(&tpr->pr_mtx);
		}
		/* Done if there were only user references to remove. */
		if (!(flags & PD_DEREF)) {
			mtx_unlock(&tpr->pr_mtx);
			if (flags & PD_LIST_SLOCKED)
				sx_sunlock(&allprison_lock);
			else if (flags & PD_LIST_XLOCKED)
				sx_xunlock(&allprison_lock);
			return;
		}
		/* Get back to holding pr's own mutex for the code below. */
		if (tpr != pr) {
			mtx_unlock(&tpr->pr_mtx);
			mtx_lock(&pr->pr_mtx);
		}
	}

	/*
	 * Each pass handles one prison: drop the reference, and if it was
	 * the last one, tear the prison down and continue with its parent.
	 */
	for (;;) {
		if (flags & PD_DEREF)
			pr->pr_ref--;
		/* If the prison still has references, nothing else to do. */
		if (pr->pr_ref > 0) {
			mtx_unlock(&pr->pr_mtx);
			if (flags & PD_LIST_SLOCKED)
				sx_sunlock(&allprison_lock);
			else if (flags & PD_LIST_XLOCKED)
				sx_xunlock(&allprison_lock);
			return;
		}

		/* Delisting needs allprison_lock held exclusively. */
		mtx_unlock(&pr->pr_mtx);
		if (flags & PD_LIST_SLOCKED) {
			if (!sx_try_upgrade(&allprison_lock)) {
				sx_sunlock(&allprison_lock);
				sx_xlock(&allprison_lock);
			}
		} else if (!(flags & PD_LIST_XLOCKED))
			sx_xlock(&allprison_lock);

		TAILQ_REMOVE(&allprison, pr, pr_list);
		LIST_REMOVE(pr, pr_sibling);
		ppr = pr->pr_parent;
		/* Every ancestor loses one from its prison count. */
		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
			tpr->pr_prisoncount--;
		/* The list is consistent again; a shared hold suffices now. */
		sx_downgrade(&allprison_lock);

		/* Release everything the prison owned, then the prison. */
		if (pr->pr_root != NULL) {
			vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
			vrele(pr->pr_root);
			VFS_UNLOCK_GIANT(vfslocked);
		}
		mtx_destroy(&pr->pr_mtx);
#ifdef INET
		free(pr->pr_ip4, M_PRISON);
#endif
#ifdef INET6
		free(pr->pr_ip6, M_PRISON);
#endif
		if (pr->pr_cpuset != NULL)
			cpuset_rel(pr->pr_cpuset);
		osd_jail_exit(pr);
		free(pr, M_PRISON);

		/* Removing a prison frees a reference on its parent. */
		pr = ppr;
		mtx_lock(&pr->pr_mtx);
		/* allprison_lock is share-locked after the downgrade above. */
		flags = PD_DEREF | PD_LIST_SLOCKED;
	}
}
2425 
/*
 * Add a reference to a prison whose mutex the caller already holds.  The
 * prison must still have at least one reference (asserted).
 */
void
prison_hold_locked(struct prison *pr)
{

	mtx_assert(&pr->pr_mtx, MA_OWNED);
	KASSERT(pr->pr_ref > 0,
	    ("Trying to hold dead prison (jid=%d).", pr->pr_id));
	pr->pr_ref++;
}
2435 
/*
 * Add a reference to a prison, taking and releasing its mutex around the
 * update.
 */
void
prison_hold(struct prison *pr)
{

	mtx_lock(&pr->pr_mtx);
	prison_hold_locked(pr);
	mtx_unlock(&pr->pr_mtx);
}
2444 
/*
 * Add a user reference (pr_uref) to a prison under its mutex.  The prison
 * must already have at least one user reference (asserted), i.e. it must
 * still be alive.
 */
void
prison_proc_hold(struct prison *pr)
{

	mtx_lock(&pr->pr_mtx);
	KASSERT(pr->pr_uref > 0,
	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
	pr->pr_uref++;
	mtx_unlock(&pr->pr_mtx);
}
2455 
/*
 * Drop a user reference (pr_uref) from a prison.  The mutex is taken here
 * and handed to prison_deref() (PD_LOCKED), which releases it.
 */
void
prison_proc_free(struct prison *pr)
{

	mtx_lock(&pr->pr_mtx);
	KASSERT(pr->pr_uref > 0,
	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
	prison_deref(pr, PD_DEUREF | PD_LOCKED);
}
2465 
2466 
2467 #ifdef INET
/*
 * Restrict a prison's IP address list with its parent's, possibly replacing
 * it.  Return true if the replacement buffer was used (or would have been).
 *
 * newip4 is an optional caller-supplied buffer for the case where the
 * parent's list does not fit in the prison's current one.  A return of 1
 * means either that buffer now belongs to the prison, or that none was
 * passed and the M_NOWAIT allocation failed (the list is then unchanged).
 */
static int
prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
{
	int ii, ij, used;
	struct prison *ppr;

	ppr = pr->pr_parent;
	if (!(pr->pr_flags & PR_IP4_USER)) {
		/* This has no user settings, so just copy the parent's list. */
		if (pr->pr_ip4s < ppr->pr_ip4s) {
			/*
			 * There's no room for the parent's list.  Use the
			 * new list buffer, which is assumed to be big enough
			 * (if it was passed).  If there's no buffer, try to
			 * allocate one.
			 */
			used = 1;
			if (newip4 == NULL) {
				newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
				    M_PRISON, M_NOWAIT);
				/* A locally allocated buffer is not "used". */
				if (newip4 != NULL)
					used = 0;
			}
			if (newip4 != NULL) {
				bcopy(ppr->pr_ip4, newip4,
				    ppr->pr_ip4s * sizeof(*newip4));
				free(pr->pr_ip4, M_PRISON);
				pr->pr_ip4 = newip4;
				pr->pr_ip4s = ppr->pr_ip4s;
				pr->pr_flags |= PR_IP4;
			}
			return (used);
		}
		/* The existing buffer is large enough; copy in place. */
		pr->pr_ip4s = ppr->pr_ip4s;
		if (pr->pr_ip4s > 0)
			bcopy(ppr->pr_ip4, pr->pr_ip4,
			    pr->pr_ip4s * sizeof(*newip4));
		else if (pr->pr_ip4 != NULL) {
			free(pr->pr_ip4, M_PRISON);
			pr->pr_ip4 = NULL;
		}
		/* Inherit the parent's PR_IP4 bit. */
		pr->pr_flags =
			(pr->pr_flags & ~PR_IP4) | (ppr->pr_flags & PR_IP4);
	} else if (pr->pr_ip4s > 0 && (ppr->pr_flags & PR_IP4)) {
		/* Remove addresses that aren't in the parent. */
		/* First look for this prison's entry 0 anywhere in the parent. */
		for (ij = 0; ij < ppr->pr_ip4s; ij++)
			if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
				break;
		if (ij < ppr->pr_ip4s)
			ii = 1;
		else {
			/* Entry 0 is not allowed; shift the rest down by one. */
			bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
			    --pr->pr_ip4s * sizeof(*pr->pr_ip4));
			ii = 0;
		}
		/*
		 * Walk the remaining entries (ii) against the parent's
		 * entries from index 1 on (ij), using qcmp_v4 ordering.
		 * An entry equal to the parent's entry 0 is always kept.
		 * NOTE(review): ppr->pr_ip4[0] is read here without checking
		 * ppr->pr_ip4s - confirm a parent with PR_IP4 set and an
		 * empty list cannot reach this loop.
		 */
		for (ij = 1; ii < pr->pr_ip4s; ) {
			if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
				ii++;
				continue;
			}
			switch (ij >= ppr->pr_ip4s ? -1 :
				qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
			case -1:
				/* Not in the parent: delete entry ii. */
				bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
				    (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
				break;
			case 0:
				/* Present in both: keep it and move on. */
				ii++;
				ij++;
				break;
			case 1:
				/* Parent entry sorts lower: try the next one. */
				ij++;
				break;
			}
		}
		if (pr->pr_ip4s == 0) {
			free(pr->pr_ip4, M_PRISON);
			pr->pr_ip4 = NULL;
		}
	}
	return (0);
}
2554 
2555 /*
2556  * Pass back primary IPv4 address of this jail.
2557  *
2558  * If not restricted return success but do not alter the address.  Caller has
2559  * to make sure to initialize it correctly (e.g. INADDR_ANY).
2560  *
2561  * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2562  * Address returned in NBO.
2563  */
2564 int
2565 prison_get_ip4(struct ucred *cred, struct in_addr *ia)
2566 {
2567 	struct prison *pr;
2568 
2569 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2570 	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2571 
2572 	pr = cred->cr_prison;
2573 	if (!(pr->pr_flags & PR_IP4))
2574 		return (0);
2575 	mtx_lock(&pr->pr_mtx);
2576 	if (!(pr->pr_flags & PR_IP4)) {
2577 		mtx_unlock(&pr->pr_mtx);
2578 		return (0);
2579 	}
2580 	if (pr->pr_ip4 == NULL) {
2581 		mtx_unlock(&pr->pr_mtx);
2582 		return (EAFNOSUPPORT);
2583 	}
2584 
2585 	ia->s_addr = pr->pr_ip4[0].s_addr;
2586 	mtx_unlock(&pr->pr_mtx);
2587 	return (0);
2588 }
2589 
2590 /*
2591  * Return true if pr1 and pr2 have the same IPv4 address restrictions.
2592  */
2593 int
2594 prison_equal_ip4(struct prison *pr1, struct prison *pr2)
2595 {
2596 
2597 	if (pr1 == pr2)
2598 		return (1);
2599 
2600 	/*
2601 	 * jail_set maintains an exclusive hold on allprison_lock while it
2602 	 * changes the IP addresses, so only a shared hold is needed.  This is
2603 	 * easier than locking the two prisons which would require finding the
2604 	 * proper locking order and end up needing allprison_lock anyway.
2605 	 */
2606 	sx_slock(&allprison_lock);
2607 	while (pr1 != &prison0 && !(pr1->pr_flags & PR_IP4_USER))
2608 		pr1 = pr1->pr_parent;
2609 	while (pr2 != &prison0 && !(pr2->pr_flags & PR_IP4_USER))
2610 		pr2 = pr2->pr_parent;
2611 	sx_sunlock(&allprison_lock);
2612 	return (pr1 == pr2);
2613 }
2614 
2615 /*
2616  * Make sure our (source) address is set to something meaningful to this
2617  * jail.
2618  *
2619  * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2620  * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2621  * doesn't allow IPv4.  Address passed in in NBO and returned in NBO.
2622  */
2623 int
2624 prison_local_ip4(struct ucred *cred, struct in_addr *ia)
2625 {
2626 	struct prison *pr;
2627 	struct in_addr ia0;
2628 	int error;
2629 
2630 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2631 	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2632 
2633 	pr = cred->cr_prison;
2634 	if (!(pr->pr_flags & PR_IP4))
2635 		return (0);
2636 	mtx_lock(&pr->pr_mtx);
2637 	if (!(pr->pr_flags & PR_IP4)) {
2638 		mtx_unlock(&pr->pr_mtx);
2639 		return (0);
2640 	}
2641 	if (pr->pr_ip4 == NULL) {
2642 		mtx_unlock(&pr->pr_mtx);
2643 		return (EAFNOSUPPORT);
2644 	}
2645 
2646 	ia0.s_addr = ntohl(ia->s_addr);
2647 	if (ia0.s_addr == INADDR_LOOPBACK) {
2648 		ia->s_addr = pr->pr_ip4[0].s_addr;
2649 		mtx_unlock(&pr->pr_mtx);
2650 		return (0);
2651 	}
2652 
2653 	if (ia0.s_addr == INADDR_ANY) {
2654 		/*
2655 		 * In case there is only 1 IPv4 address, bind directly.
2656 		 */
2657 		if (pr->pr_ip4s == 1)
2658 			ia->s_addr = pr->pr_ip4[0].s_addr;
2659 		mtx_unlock(&pr->pr_mtx);
2660 		return (0);
2661 	}
2662 
2663 	error = _prison_check_ip4(pr, ia);
2664 	mtx_unlock(&pr->pr_mtx);
2665 	return (error);
2666 }
2667 
2668 /*
2669  * Rewrite destination address in case we will connect to loopback address.
2670  *
2671  * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2672  * Address passed in in NBO and returned in NBO.
2673  */
2674 int
2675 prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
2676 {
2677 	struct prison *pr;
2678 
2679 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2680 	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2681 
2682 	pr = cred->cr_prison;
2683 	if (!(pr->pr_flags & PR_IP4))
2684 		return (0);
2685 	mtx_lock(&pr->pr_mtx);
2686 	if (!(pr->pr_flags & PR_IP4)) {
2687 		mtx_unlock(&pr->pr_mtx);
2688 		return (0);
2689 	}
2690 	if (pr->pr_ip4 == NULL) {
2691 		mtx_unlock(&pr->pr_mtx);
2692 		return (EAFNOSUPPORT);
2693 	}
2694 
2695 	if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
2696 		ia->s_addr = pr->pr_ip4[0].s_addr;
2697 		mtx_unlock(&pr->pr_mtx);
2698 		return (0);
2699 	}
2700 
2701 	/*
2702 	 * Return success because nothing had to be changed.
2703 	 */
2704 	mtx_unlock(&pr->pr_mtx);
2705 	return (0);
2706 }
2707 
2708 /*
2709  * Check if given address belongs to the jail referenced by cred/prison.
2710  *
2711  * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2712  * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2713  * doesn't allow IPv4.  Address passed in in NBO.
2714  */
2715 static int
2716 _prison_check_ip4(struct prison *pr, struct in_addr *ia)
2717 {
2718 	int i, a, z, d;
2719 
2720 	/*
2721 	 * Check the primary IP.
2722 	 */
2723 	if (pr->pr_ip4[0].s_addr == ia->s_addr)
2724 		return (0);
2725 
2726 	/*
2727 	 * All the other IPs are sorted so we can do a binary search.
2728 	 */
2729 	a = 0;
2730 	z = pr->pr_ip4s - 2;
2731 	while (a <= z) {
2732 		i = (a + z) / 2;
2733 		d = qcmp_v4(&pr->pr_ip4[i+1], ia);
2734 		if (d > 0)
2735 			z = i - 1;
2736 		else if (d < 0)
2737 			a = i + 1;
2738 		else
2739 			return (0);
2740 	}
2741 
2742 	return (EADDRNOTAVAIL);
2743 }
2744 
2745 int
2746 prison_check_ip4(struct ucred *cred, struct in_addr *ia)
2747 {
2748 	struct prison *pr;
2749 	int error;
2750 
2751 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2752 	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2753 
2754 	pr = cred->cr_prison;
2755 	if (!(pr->pr_flags & PR_IP4))
2756 		return (0);
2757 	mtx_lock(&pr->pr_mtx);
2758 	if (!(pr->pr_flags & PR_IP4)) {
2759 		mtx_unlock(&pr->pr_mtx);
2760 		return (0);
2761 	}
2762 	if (pr->pr_ip4 == NULL) {
2763 		mtx_unlock(&pr->pr_mtx);
2764 		return (EAFNOSUPPORT);
2765 	}
2766 
2767 	error = _prison_check_ip4(pr, ia);
2768 	mtx_unlock(&pr->pr_mtx);
2769 	return (error);
2770 }
2771 #endif
2772 
2773 #ifdef INET6
2774 static int
2775 prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6)
2776 {
2777 	int ii, ij, used;
2778 	struct prison *ppr;
2779 
2780 	ppr = pr->pr_parent;
2781 	if (!(pr->pr_flags & PR_IP6_USER)) {
2782 		/* This has no user settings, so just copy the parent's list. */
2783 		if (pr->pr_ip6s < ppr->pr_ip6s) {
2784 			/*
2785 			 * There's no room for the parent's list.  Use the
2786 			 * new list buffer, which is assumed to be big enough
2787 			 * (if it was passed).  If there's no buffer, try to
2788 			 * allocate one.
2789 			 */
2790 			used = 1;
2791 			if (newip6 == NULL) {
2792 				newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6),
2793 				    M_PRISON, M_NOWAIT);
2794 				if (newip6 != NULL)
2795 					used = 0;
2796 			}
2797 			if (newip6 != NULL) {
2798 				bcopy(ppr->pr_ip6, newip6,
2799 				    ppr->pr_ip6s * sizeof(*newip6));
2800 				free(pr->pr_ip6, M_PRISON);
2801 				pr->pr_ip6 = newip6;
2802 				pr->pr_ip6s = ppr->pr_ip6s;
2803 				pr->pr_flags |= PR_IP6;
2804 			}
2805 			return (used);
2806 		}
2807 		pr->pr_ip6s = ppr->pr_ip6s;
2808 		if (pr->pr_ip6s > 0)
2809 			bcopy(ppr->pr_ip6, pr->pr_ip6,
2810 			    pr->pr_ip6s * sizeof(*newip6));
2811 		else if (pr->pr_ip6 != NULL) {
2812 			free(pr->pr_ip6, M_PRISON);
2813 			pr->pr_ip6 = NULL;
2814 		}
2815 		pr->pr_flags =
2816 			(pr->pr_flags & ~PR_IP6) | (ppr->pr_flags & PR_IP6);
2817 	} else if (pr->pr_ip6s > 0 && (ppr->pr_flags & PR_IP6)) {
2818 		/* Remove addresses that aren't in the parent. */
2819 		for (ij = 0; ij < ppr->pr_ip6s; ij++)
2820 			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0],
2821 			    &ppr->pr_ip6[ij]))
2822 				break;
2823 		if (ij < ppr->pr_ip6s)
2824 			ii = 1;
2825 		else {
2826 			bcopy(pr->pr_ip6 + 1, pr->pr_ip6,
2827 			    --pr->pr_ip6s * sizeof(*pr->pr_ip6));
2828 			ii = 0;
2829 		}
2830 		for (ij = 1; ii < pr->pr_ip6s; ) {
2831 			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii],
2832 			    &ppr->pr_ip6[0])) {
2833 				ii++;
2834 				continue;
2835 			}
2836 			switch (ij >= ppr->pr_ip4s ? -1 :
2837 				qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
2838 			case -1:
2839 				bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
2840 				    (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6));
2841 				break;
2842 			case 0:
2843 				ii++;
2844 				ij++;
2845 				break;
2846 			case 1:
2847 				ij++;
2848 				break;
2849 			}
2850 		}
2851 		if (pr->pr_ip6s == 0) {
2852 			free(pr->pr_ip6, M_PRISON);
2853 			pr->pr_ip6 = NULL;
2854 		}
2855 	}
2856 	return 0;
2857 }
2858 
2859 /*
2860  * Pass back primary IPv6 address for this jail.
2861  *
2862  * If not restricted return success but do not alter the address.  Caller has
2863  * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
2864  *
2865  * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
2866  */
2867 int
2868 prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
2869 {
2870 	struct prison *pr;
2871 
2872 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2873 	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
2874 
2875 	pr = cred->cr_prison;
2876 	if (!(pr->pr_flags & PR_IP6))
2877 		return (0);
2878 	mtx_lock(&pr->pr_mtx);
2879 	if (!(pr->pr_flags & PR_IP6)) {
2880 		mtx_unlock(&pr->pr_mtx);
2881 		return (0);
2882 	}
2883 	if (pr->pr_ip6 == NULL) {
2884 		mtx_unlock(&pr->pr_mtx);
2885 		return (EAFNOSUPPORT);
2886 	}
2887 
2888 	bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
2889 	mtx_unlock(&pr->pr_mtx);
2890 	return (0);
2891 }
2892 
2893 /*
2894  * Return true if pr1 and pr2 have the same IPv6 address restrictions.
2895  */
2896 int
2897 prison_equal_ip6(struct prison *pr1, struct prison *pr2)
2898 {
2899 
2900 	if (pr1 == pr2)
2901 		return (1);
2902 
2903 	sx_slock(&allprison_lock);
2904 	while (pr1 != &prison0 && !(pr1->pr_flags & PR_IP6_USER))
2905 		pr1 = pr1->pr_parent;
2906 	while (pr2 != &prison0 && !(pr2->pr_flags & PR_IP6_USER))
2907 		pr2 = pr2->pr_parent;
2908 	sx_sunlock(&allprison_lock);
2909 	return (pr1 == pr2);
2910 }
2911 
2912 /*
2913  * Make sure our (source) address is set to something meaningful to this jail.
2914  *
2915  * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
2916  * when needed while binding.
2917  *
2918  * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
2919  * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2920  * doesn't allow IPv6.
2921  */
2922 int
2923 prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
2924 {
2925 	struct prison *pr;
2926 	int error;
2927 
2928 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2929 	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
2930 
2931 	pr = cred->cr_prison;
2932 	if (!(pr->pr_flags & PR_IP6))
2933 		return (0);
2934 	mtx_lock(&pr->pr_mtx);
2935 	if (!(pr->pr_flags & PR_IP6)) {
2936 		mtx_unlock(&pr->pr_mtx);
2937 		return (0);
2938 	}
2939 	if (pr->pr_ip6 == NULL) {
2940 		mtx_unlock(&pr->pr_mtx);
2941 		return (EAFNOSUPPORT);
2942 	}
2943 
2944 	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
2945 		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
2946 		mtx_unlock(&pr->pr_mtx);
2947 		return (0);
2948 	}
2949 
2950 	if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
2951 		/*
2952 		 * In case there is only 1 IPv6 address, and v6only is true,
2953 		 * then bind directly.
2954 		 */
2955 		if (v6only != 0 && pr->pr_ip6s == 1)
2956 			bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
2957 		mtx_unlock(&pr->pr_mtx);
2958 		return (0);
2959 	}
2960 
2961 	error = _prison_check_ip6(pr, ia6);
2962 	mtx_unlock(&pr->pr_mtx);
2963 	return (error);
2964 }
2965 
2966 /*
2967  * Rewrite destination address in case we will connect to loopback address.
2968  *
2969  * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
2970  */
2971 int
2972 prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
2973 {
2974 	struct prison *pr;
2975 
2976 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2977 	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
2978 
2979 	pr = cred->cr_prison;
2980 	if (!(pr->pr_flags & PR_IP6))
2981 		return (0);
2982 	mtx_lock(&pr->pr_mtx);
2983 	if (!(pr->pr_flags & PR_IP6)) {
2984 		mtx_unlock(&pr->pr_mtx);
2985 		return (0);
2986 	}
2987 	if (pr->pr_ip6 == NULL) {
2988 		mtx_unlock(&pr->pr_mtx);
2989 		return (EAFNOSUPPORT);
2990 	}
2991 
2992 	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
2993 		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
2994 		mtx_unlock(&pr->pr_mtx);
2995 		return (0);
2996 	}
2997 
2998 	/*
2999 	 * Return success because nothing had to be changed.
3000 	 */
3001 	mtx_unlock(&pr->pr_mtx);
3002 	return (0);
3003 }
3004 
3005 /*
3006  * Check if given address belongs to the jail referenced by cred/prison.
3007  *
3008  * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3009  * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3010  * doesn't allow IPv6.
3011  */
3012 static int
3013 _prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
3014 {
3015 	int i, a, z, d;
3016 
3017 	/*
3018 	 * Check the primary IP.
3019 	 */
3020 	if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
3021 		return (0);
3022 
3023 	/*
3024 	 * All the other IPs are sorted so we can do a binary search.
3025 	 */
3026 	a = 0;
3027 	z = pr->pr_ip6s - 2;
3028 	while (a <= z) {
3029 		i = (a + z) / 2;
3030 		d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
3031 		if (d > 0)
3032 			z = i - 1;
3033 		else if (d < 0)
3034 			a = i + 1;
3035 		else
3036 			return (0);
3037 	}
3038 
3039 	return (EADDRNOTAVAIL);
3040 }
3041 
3042 int
3043 prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
3044 {
3045 	struct prison *pr;
3046 	int error;
3047 
3048 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3049 	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3050 
3051 	pr = cred->cr_prison;
3052 	if (!(pr->pr_flags & PR_IP6))
3053 		return (0);
3054 	mtx_lock(&pr->pr_mtx);
3055 	if (!(pr->pr_flags & PR_IP6)) {
3056 		mtx_unlock(&pr->pr_mtx);
3057 		return (0);
3058 	}
3059 	if (pr->pr_ip6 == NULL) {
3060 		mtx_unlock(&pr->pr_mtx);
3061 		return (EAFNOSUPPORT);
3062 	}
3063 
3064 	error = _prison_check_ip6(pr, ia6);
3065 	mtx_unlock(&pr->pr_mtx);
3066 	return (error);
3067 }
3068 #endif
3069 
3070 /*
3071  * Check if a jail supports the given address family.
3072  *
3073  * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3074  * if not.
3075  */
3076 int
3077 prison_check_af(struct ucred *cred, int af)
3078 {
3079 	struct prison *pr;
3080 	int error;
3081 
3082 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3083 
3084 	pr = cred->cr_prison;
3085 	error = 0;
3086 	switch (af)
3087 	{
3088 #ifdef INET
3089 	case AF_INET:
3090 		if (pr->pr_flags & PR_IP4)
3091 		{
3092 			mtx_lock(&pr->pr_mtx);
3093 			if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
3094 				error = EAFNOSUPPORT;
3095 			mtx_unlock(&pr->pr_mtx);
3096 		}
3097 		break;
3098 #endif
3099 #ifdef INET6
3100 	case AF_INET6:
3101 		if (pr->pr_flags & PR_IP6)
3102 		{
3103 			mtx_lock(&pr->pr_mtx);
3104 			if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
3105 				error = EAFNOSUPPORT;
3106 			mtx_unlock(&pr->pr_mtx);
3107 		}
3108 		break;
3109 #endif
3110 	case AF_LOCAL:
3111 	case AF_ROUTE:
3112 		break;
3113 	default:
3114 		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3115 			error = EAFNOSUPPORT;
3116 	}
3117 	return (error);
3118 }
3119 
3120 /*
3121  * Check if given address belongs to the jail referenced by cred (wrapper to
3122  * prison_check_ip[46]).
3123  *
3124  * Returns 0 if jail doesn't restrict the address family or if address belongs
3125  * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3126  * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3127  */
3128 int
3129 prison_if(struct ucred *cred, struct sockaddr *sa)
3130 {
3131 #ifdef INET
3132 	struct sockaddr_in *sai;
3133 #endif
3134 #ifdef INET6
3135 	struct sockaddr_in6 *sai6;
3136 #endif
3137 	int error;
3138 
3139 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3140 	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3141 
3142 	error = 0;
3143 	switch (sa->sa_family)
3144 	{
3145 #ifdef INET
3146 	case AF_INET:
3147 		sai = (struct sockaddr_in *)sa;
3148 		error = prison_check_ip4(cred, &sai->sin_addr);
3149 		break;
3150 #endif
3151 #ifdef INET6
3152 	case AF_INET6:
3153 		sai6 = (struct sockaddr_in6 *)sa;
3154 		error = prison_check_ip6(cred, &sai6->sin6_addr);
3155 		break;
3156 #endif
3157 	default:
3158 		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3159 			error = EAFNOSUPPORT;
3160 	}
3161 	return (error);
3162 }
3163 
3164 /*
3165  * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3166  */
3167 int
3168 prison_check(struct ucred *cred1, struct ucred *cred2)
3169 {
3170 
3171 #ifdef VIMAGE
3172 	if (cred2->cr_vimage->v_procg != cred1->cr_vimage->v_procg)
3173 		return (ESRCH);
3174 #endif
3175 	return ((cred1->cr_prison == cred2->cr_prison ||
3176 	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3177 }
3178 
3179 /*
3180  * Return 1 if p2 is a child of p1, otherwise 0.
3181  */
3182 int
3183 prison_ischild(struct prison *pr1, struct prison *pr2)
3184 {
3185 
3186 	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3187 		if (pr1 == pr2)
3188 			return (1);
3189 	return (0);
3190 }
3191 
3192 /*
3193  * Return 1 if the passed credential is in a jail, otherwise 0.
3194  */
3195 int
3196 jailed(struct ucred *cred)
3197 {
3198 
3199 	return (cred->cr_prison != &prison0);
3200 }
3201 
3202 /*
3203  * Return the correct hostname for the passed credential.
3204  */
3205 void
3206 getcredhostname(struct ucred *cred, char *buf, size_t size)
3207 {
3208 	struct prison *pr;
3209 
3210 	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3211 	mtx_lock(&pr->pr_mtx);
3212 	strlcpy(buf, pr->pr_host, size);
3213 	mtx_unlock(&pr->pr_mtx);
3214 }
3215 
3216 /*
3217  * Determine whether the subject represented by cred can "see"
3218  * status of a mount point.
3219  * Returns: 0 for permitted, ENOENT otherwise.
3220  * XXX: This function should be called cr_canseemount() and should be
3221  *      placed in kern_prot.c.
3222  */
3223 int
3224 prison_canseemount(struct ucred *cred, struct mount *mp)
3225 {
3226 	struct prison *pr;
3227 	struct statfs *sp;
3228 	size_t len;
3229 
3230 	pr = cred->cr_prison;
3231 	if (pr->pr_enforce_statfs == 0)
3232 		return (0);
3233 	if (pr->pr_root->v_mount == mp)
3234 		return (0);
3235 	if (pr->pr_enforce_statfs == 2)
3236 		return (ENOENT);
3237 	/*
3238 	 * If jail's chroot directory is set to "/" we should be able to see
3239 	 * all mount-points from inside a jail.
3240 	 * This is ugly check, but this is the only situation when jail's
3241 	 * directory ends with '/'.
3242 	 */
3243 	if (strcmp(pr->pr_path, "/") == 0)
3244 		return (0);
3245 	len = strlen(pr->pr_path);
3246 	sp = &mp->mnt_stat;
3247 	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3248 		return (ENOENT);
3249 	/*
3250 	 * Be sure that we don't have situation where jail's root directory
3251 	 * is "/some/path" and mount point is "/some/pathpath".
3252 	 */
3253 	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3254 		return (ENOENT);
3255 	return (0);
3256 }
3257 
3258 void
3259 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3260 {
3261 	char jpath[MAXPATHLEN];
3262 	struct prison *pr;
3263 	size_t len;
3264 
3265 	pr = cred->cr_prison;
3266 	if (pr->pr_enforce_statfs == 0)
3267 		return;
3268 	if (prison_canseemount(cred, mp) != 0) {
3269 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3270 		strlcpy(sp->f_mntonname, "[restricted]",
3271 		    sizeof(sp->f_mntonname));
3272 		return;
3273 	}
3274 	if (pr->pr_root->v_mount == mp) {
3275 		/*
3276 		 * Clear current buffer data, so we are sure nothing from
3277 		 * the valid path left there.
3278 		 */
3279 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3280 		*sp->f_mntonname = '/';
3281 		return;
3282 	}
3283 	/*
3284 	 * If jail's chroot directory is set to "/" we should be able to see
3285 	 * all mount-points from inside a jail.
3286 	 */
3287 	if (strcmp(pr->pr_path, "/") == 0)
3288 		return;
3289 	len = strlen(pr->pr_path);
3290 	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3291 	/*
3292 	 * Clear current buffer data, so we are sure nothing from
3293 	 * the valid path left there.
3294 	 */
3295 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3296 	if (*jpath == '\0') {
3297 		/* Should never happen. */
3298 		*sp->f_mntonname = '/';
3299 	} else {
3300 		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3301 	}
3302 }
3303 
3304 /*
3305  * Check with permission for a specific privilege is granted within jail.  We
3306  * have a specific list of accepted privileges; the rest are denied.
3307  */
3308 int
3309 prison_priv_check(struct ucred *cred, int priv)
3310 {
3311 
3312 	if (!jailed(cred))
3313 		return (0);
3314 
3315 	switch (priv) {
3316 
3317 		/*
3318 		 * Allow ktrace privileges for root in jail.
3319 		 */
3320 	case PRIV_KTRACE:
3321 
3322 #if 0
3323 		/*
3324 		 * Allow jailed processes to configure audit identity and
3325 		 * submit audit records (login, etc).  In the future we may
3326 		 * want to further refine the relationship between audit and
3327 		 * jail.
3328 		 */
3329 	case PRIV_AUDIT_GETAUDIT:
3330 	case PRIV_AUDIT_SETAUDIT:
3331 	case PRIV_AUDIT_SUBMIT:
3332 #endif
3333 
3334 		/*
3335 		 * Allow jailed processes to manipulate process UNIX
3336 		 * credentials in any way they see fit.
3337 		 */
3338 	case PRIV_CRED_SETUID:
3339 	case PRIV_CRED_SETEUID:
3340 	case PRIV_CRED_SETGID:
3341 	case PRIV_CRED_SETEGID:
3342 	case PRIV_CRED_SETGROUPS:
3343 	case PRIV_CRED_SETREUID:
3344 	case PRIV_CRED_SETREGID:
3345 	case PRIV_CRED_SETRESUID:
3346 	case PRIV_CRED_SETRESGID:
3347 
3348 		/*
3349 		 * Jail implements visibility constraints already, so allow
3350 		 * jailed root to override uid/gid-based constraints.
3351 		 */
3352 	case PRIV_SEEOTHERGIDS:
3353 	case PRIV_SEEOTHERUIDS:
3354 
3355 		/*
3356 		 * Jail implements inter-process debugging limits already, so
3357 		 * allow jailed root various debugging privileges.
3358 		 */
3359 	case PRIV_DEBUG_DIFFCRED:
3360 	case PRIV_DEBUG_SUGID:
3361 	case PRIV_DEBUG_UNPRIV:
3362 
3363 		/*
3364 		 * Allow jail to set various resource limits and login
3365 		 * properties, and for now, exceed process resource limits.
3366 		 */
3367 	case PRIV_PROC_LIMIT:
3368 	case PRIV_PROC_SETLOGIN:
3369 	case PRIV_PROC_SETRLIMIT:
3370 
3371 		/*
3372 		 * System V and POSIX IPC privileges are granted in jail.
3373 		 */
3374 	case PRIV_IPC_READ:
3375 	case PRIV_IPC_WRITE:
3376 	case PRIV_IPC_ADMIN:
3377 	case PRIV_IPC_MSGSIZE:
3378 	case PRIV_MQ_ADMIN:
3379 
3380 		/*
3381 		 * Jail operations within a jail work on child jails.
3382 		 */
3383 	case PRIV_JAIL_ATTACH:
3384 	case PRIV_JAIL_SET:
3385 	case PRIV_JAIL_REMOVE:
3386 
3387 		/*
3388 		 * Jail implements its own inter-process limits, so allow
3389 		 * root processes in jail to change scheduling on other
3390 		 * processes in the same jail.  Likewise for signalling.
3391 		 */
3392 	case PRIV_SCHED_DIFFCRED:
3393 	case PRIV_SCHED_CPUSET:
3394 	case PRIV_SIGNAL_DIFFCRED:
3395 	case PRIV_SIGNAL_SUGID:
3396 
3397 		/*
3398 		 * Allow jailed processes to write to sysctls marked as jail
3399 		 * writable.
3400 		 */
3401 	case PRIV_SYSCTL_WRITEJAIL:
3402 
3403 		/*
3404 		 * Allow root in jail to manage a variety of quota
3405 		 * properties.  These should likely be conditional on a
3406 		 * configuration option.
3407 		 */
3408 	case PRIV_VFS_GETQUOTA:
3409 	case PRIV_VFS_SETQUOTA:
3410 
3411 		/*
3412 		 * Since Jail relies on chroot() to implement file system
3413 		 * protections, grant many VFS privileges to root in jail.
3414 		 * Be careful to exclude mount-related and NFS-related
3415 		 * privileges.
3416 		 */
3417 	case PRIV_VFS_READ:
3418 	case PRIV_VFS_WRITE:
3419 	case PRIV_VFS_ADMIN:
3420 	case PRIV_VFS_EXEC:
3421 	case PRIV_VFS_LOOKUP:
3422 	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
3423 	case PRIV_VFS_CHFLAGS_DEV:
3424 	case PRIV_VFS_CHOWN:
3425 	case PRIV_VFS_CHROOT:
3426 	case PRIV_VFS_RETAINSUGID:
3427 	case PRIV_VFS_FCHROOT:
3428 	case PRIV_VFS_LINK:
3429 	case PRIV_VFS_SETGID:
3430 	case PRIV_VFS_STAT:
3431 	case PRIV_VFS_STICKYFILE:
3432 		return (0);
3433 
3434 		/*
3435 		 * Depending on the global setting, allow privilege of
3436 		 * setting system flags.
3437 		 */
3438 	case PRIV_VFS_SYSFLAGS:
3439 		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
3440 			return (0);
3441 		else
3442 			return (EPERM);
3443 
3444 		/*
3445 		 * Depending on the global setting, allow privilege of
3446 		 * mounting/unmounting file systems.
3447 		 */
3448 	case PRIV_VFS_MOUNT:
3449 	case PRIV_VFS_UNMOUNT:
3450 	case PRIV_VFS_MOUNT_NONUSER:
3451 	case PRIV_VFS_MOUNT_OWNER:
3452 		if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT)
3453 			return (0);
3454 		else
3455 			return (EPERM);
3456 
3457 		/*
3458 		 * Allow jailed root to bind reserved ports and reuse in-use
3459 		 * ports.
3460 		 */
3461 	case PRIV_NETINET_RESERVEDPORT:
3462 	case PRIV_NETINET_REUSEPORT:
3463 		return (0);
3464 
3465 		/*
3466 		 * Allow jailed root to set certian IPv4/6 (option) headers.
3467 		 */
3468 	case PRIV_NETINET_SETHDROPTS:
3469 		return (0);
3470 
3471 		/*
3472 		 * Conditionally allow creating raw sockets in jail.
3473 		 */
3474 	case PRIV_NETINET_RAW:
3475 		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
3476 			return (0);
3477 		else
3478 			return (EPERM);
3479 
3480 		/*
3481 		 * Since jail implements its own visibility limits on netstat
3482 		 * sysctls, allow getcred.  This allows identd to work in
3483 		 * jail.
3484 		 */
3485 	case PRIV_NETINET_GETCRED:
3486 		return (0);
3487 
3488 	default:
3489 		/*
3490 		 * In all remaining cases, deny the privilege request.  This
3491 		 * includes almost all network privileges, many system
3492 		 * configuration privileges.
3493 		 */
3494 		return (EPERM);
3495 	}
3496 }
3497 
3498 /*
3499  * Return the part of pr2's name that is relative to pr1, or the whole name
3500  * if it does not directly follow.
3501  */
3502 
3503 char *
3504 prison_name(struct prison *pr1, struct prison *pr2)
3505 {
3506 	char *name;
3507 
3508 	/* Jails see themselves as "0" (if they see themselves at all). */
3509 	if (pr1 == pr2)
3510 		return "0";
3511 	name = pr2->pr_name;
3512 	if (prison_ischild(pr1, pr2)) {
3513 		/*
3514 		 * pr1 isn't locked (and allprison_lock may not be either)
3515 		 * so its length can't be counted on.  But the number of dots
3516 		 * can be counted on - and counted.
3517 		 */
3518 		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
3519 			name = strchr(name, '.') + 1;
3520 	}
3521 	return (name);
3522 }
3523 
3524 /*
3525  * Return the part of pr2's path that is relative to pr1, or the whole path
3526  * if it does not directly follow.
3527  */
3528 static char *
3529 prison_path(struct prison *pr1, struct prison *pr2)
3530 {
3531 	char *path1, *path2;
3532 	int len1;
3533 
3534 	path1 = pr1->pr_path;
3535 	path2 = pr2->pr_path;
3536 	if (!strcmp(path1, "/"))
3537 		return (path2);
3538 	len1 = strlen(path1);
3539 	if (strncmp(path1, path2, len1))
3540 		return (path2);
3541 	if (path2[len1] == '\0')
3542 		return "/";
3543 	if (path2[len1] == '/')
3544 		return (path2 + len1);
3545 	return (path2);
3546 }
3547 
3548 
3549 /*
3550  * Jail-related sysctls.
3551  */
3552 SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
3553     "Jails");
3554 
3555 static int
3556 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
3557 {
3558 	struct xprison *xp;
3559 	struct prison *pr, *cpr;
3560 #ifdef INET
3561 	struct in_addr *ip4 = NULL;
3562 	int ip4s = 0;
3563 #endif
3564 #ifdef INET6
3565 	struct in_addr *ip6 = NULL;
3566 	int ip6s = 0;
3567 #endif
3568 	int descend, error;
3569 
3570 	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
3571 	pr = req->td->td_ucred->cr_prison;
3572 	error = 0;
3573 	sx_slock(&allprison_lock);
3574 	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
3575 #if defined(INET) || defined(INET6)
3576  again:
3577 #endif
3578 		mtx_lock(&cpr->pr_mtx);
3579 #ifdef INET
3580 		if (cpr->pr_ip4s > 0) {
3581 			if (ip4s < cpr->pr_ip4s) {
3582 				ip4s = cpr->pr_ip4s;
3583 				mtx_unlock(&cpr->pr_mtx);
3584 				ip4 = realloc(ip4, ip4s *
3585 				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
3586 				goto again;
3587 			}
3588 			bcopy(cpr->pr_ip4, ip4,
3589 			    cpr->pr_ip4s * sizeof(struct in_addr));
3590 		}
3591 #endif
3592 #ifdef INET6
3593 		if (cpr->pr_ip6s > 0) {
3594 			if (ip6s < cpr->pr_ip6s) {
3595 				ip6s = cpr->pr_ip6s;
3596 				mtx_unlock(&cpr->pr_mtx);
3597 				ip6 = realloc(ip6, ip6s *
3598 				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
3599 				goto again;
3600 			}
3601 			bcopy(cpr->pr_ip6, ip6,
3602 			    cpr->pr_ip6s * sizeof(struct in6_addr));
3603 		}
3604 #endif
3605 		if (cpr->pr_ref == 0) {
3606 			mtx_unlock(&cpr->pr_mtx);
3607 			continue;
3608 		}
3609 		bzero(xp, sizeof(*xp));
3610 		xp->pr_version = XPRISON_VERSION;
3611 		xp->pr_id = cpr->pr_id;
3612 		xp->pr_state = cpr->pr_uref > 0
3613 		    ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
3614 		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
3615 		strlcpy(xp->pr_host, cpr->pr_host, sizeof(xp->pr_host));
3616 		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
3617 #ifdef INET
3618 		xp->pr_ip4s = cpr->pr_ip4s;
3619 #endif
3620 #ifdef INET6
3621 		xp->pr_ip6s = cpr->pr_ip6s;
3622 #endif
3623 		mtx_unlock(&cpr->pr_mtx);
3624 		error = SYSCTL_OUT(req, xp, sizeof(*xp));
3625 		if (error)
3626 			break;
3627 #ifdef INET
3628 		if (xp->pr_ip4s > 0) {
3629 			error = SYSCTL_OUT(req, ip4,
3630 			    xp->pr_ip4s * sizeof(struct in_addr));
3631 			if (error)
3632 				break;
3633 		}
3634 #endif
3635 #ifdef INET6
3636 		if (xp->pr_ip6s > 0) {
3637 			error = SYSCTL_OUT(req, ip6,
3638 			    xp->pr_ip6s * sizeof(struct in6_addr));
3639 			if (error)
3640 				break;
3641 		}
3642 #endif
3643 	}
3644 	sx_sunlock(&allprison_lock);
3645 	free(xp, M_TEMP);
3646 #ifdef INET
3647 	free(ip4, M_TEMP);
3648 #endif
3649 #ifdef INET6
3650 	free(ip6, M_TEMP);
3651 #endif
3652 	return (error);
3653 }
3654 
3655 SYSCTL_OID(_security_jail, OID_AUTO, list,
3656     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3657     sysctl_jail_list, "S", "List of active jails");
3658 
3659 static int
3660 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
3661 {
3662 	int error, injail;
3663 
3664 	injail = jailed(req->td->td_ucred);
3665 	error = SYSCTL_OUT(req, &injail, sizeof(injail));
3666 
3667 	return (error);
3668 }
3669 
3670 SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
3671     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3672     sysctl_jail_jailed, "I", "Process in jail?");
3673 
3674 #if defined(INET) || defined(INET6)
3675 SYSCTL_INT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
3676     &jail_max_af_ips, 0,
3677     "Number of IP addresses a jail may have at most per address family");
3678 #endif
3679 
3680 /*
3681  * Default parameters for jail(2) compatability.  For historical reasons,
3682  * the sysctl names have varying similarity to the parameter names.  Prisons
3683  * just see their own parameters, and can't change them.
3684  */
3685 static int
3686 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
3687 {
3688 	struct prison *pr;
3689 	int allow, error, i;
3690 
3691 	pr = req->td->td_ucred->cr_prison;
3692 	allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
3693 
3694 	/* Get the current flag value, and convert it to a boolean. */
3695 	i = (allow & arg2) ? 1 : 0;
3696 	if (arg1 != NULL)
3697 		i = !i;
3698 	error = sysctl_handle_int(oidp, &i, 0, req);
3699 	if (error || !req->newptr)
3700 		return (error);
3701 	i = i ? arg2 : 0;
3702 	if (arg1 != NULL)
3703 		i ^= arg2;
3704 	/*
3705 	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
3706 	 * for writing.
3707 	 */
3708 	mtx_lock(&prison0.pr_mtx);
3709 	jail_default_allow = (jail_default_allow & ~arg2) | i;
3710 	mtx_unlock(&prison0.pr_mtx);
3711 	return (0);
3712 }
3713 
3714 SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
3715     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3716     NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
3717     "Processes in jail can set their hostnames");
3718 SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
3719     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3720     (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
3721     "Processes in jail are limited to creating UNIX/IP/route sockets only");
3722 SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
3723     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3724     NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
3725     "Processes in jail can use System V IPC primitives");
3726 SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
3727     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3728     NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
3729     "Prison root can create raw sockets");
3730 SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
3731     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3732     NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
3733     "Processes in jail can alter system file flags");
3734 SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
3735     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3736     NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
3737     "Processes in jail can mount/unmount jail-friendly file systems");
3738 
3739 static int
3740 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
3741 {
3742 	struct prison *pr;
3743 	int level, error;
3744 
3745 	pr = req->td->td_ucred->cr_prison;
3746 	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
3747 	error = sysctl_handle_int(oidp, &level, 0, req);
3748 	if (error || !req->newptr)
3749 		return (error);
3750 	*(int *)arg1 = level;
3751 	return (0);
3752 }
3753 
3754 SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
3755     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3756     &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
3757     sysctl_jail_default_level, "I",
3758     "Processes in jail cannot see all mounted file systems");
3759 
3760 /*
3761  * Nodes to describe jail parameters.  Maximum length of string parameters
3762  * is returned in the string itself, and the other parameters exist merely
3763  * to make themselves and their types known.
3764  */
3765 SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
3766     "Jail parameters");
3767 
3768 int
3769 sysctl_jail_param(SYSCTL_HANDLER_ARGS)
3770 {
3771 	int i;
3772 	long l;
3773 	size_t s;
3774 	char numbuf[12];
3775 
3776 	switch (oidp->oid_kind & CTLTYPE)
3777 	{
3778 	case CTLTYPE_LONG:
3779 	case CTLTYPE_ULONG:
3780 		l = 0;
3781 #ifdef SCTL_MASK32
3782 		if (!(req->flags & SCTL_MASK32))
3783 #endif
3784 			return (SYSCTL_OUT(req, &l, sizeof(l)));
3785 	case CTLTYPE_INT:
3786 	case CTLTYPE_UINT:
3787 		i = 0;
3788 		return (SYSCTL_OUT(req, &i, sizeof(i)));
3789 	case CTLTYPE_STRING:
3790 		snprintf(numbuf, sizeof(numbuf), "%d", arg2);
3791 		return
3792 		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
3793 	case CTLTYPE_STRUCT:
3794 		s = (size_t)arg2;
3795 		return (SYSCTL_OUT(req, &s, sizeof(s)));
3796 	}
3797 	return (0);
3798 }
3799 
3800 SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
3801 SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
3802 SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
3803 SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
3804 SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
3805     "I", "Jail secure level");
3806 SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
3807     "I", "Jail cannot see all mounted file systems");
3808 SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
3809     "B", "Jail persistence");
3810 SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
3811     "B", "Jail is in the process of shutting down");
3812 
3813 SYSCTL_JAIL_PARAM_NODE(host, "Jail host info");
3814 SYSCTL_JAIL_PARAM(, nohost, CTLTYPE_INT | CTLFLAG_RW,
3815     "BN", "Jail w/ no host info");
3816 SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
3817     "Jail hostname");
3818 SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
3819     "Jail NIS domainname");
3820 SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
3821     "Jail host UUID");
3822 SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
3823     "LU", "Jail host ID");
3824 
3825 SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
3826 SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
3827 
3828 #ifdef INET
3829 SYSCTL_JAIL_PARAM_NODE(ip4, "Jail IPv4 address virtualization");
3830 SYSCTL_JAIL_PARAM(, noip4, CTLTYPE_INT | CTLFLAG_RW,
3831     "BN", "Jail w/ no IP address virtualization");
3832 SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
3833     "S,in_addr,a", "Jail IPv4 addresses");
3834 #endif
3835 #ifdef INET6
3836 SYSCTL_JAIL_PARAM_NODE(ip6, "Jail IPv6 address virtualization");
3837 SYSCTL_JAIL_PARAM(, noip6, CTLTYPE_INT | CTLFLAG_RW,
3838     "BN", "Jail w/ no IP address virtualization");
3839 SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
3840     "S,in6_addr,a", "Jail IPv6 addresses");
3841 #endif
3842 
3843 SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
3844 SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
3845     "B", "Jail may set hostname");
3846 SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
3847     "B", "Jail may use SYSV IPC");
3848 SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
3849     "B", "Jail may create raw sockets");
3850 SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
3851     "B", "Jail may alter system file flags");
3852 SYSCTL_JAIL_PARAM(_allow, mount, CTLTYPE_INT | CTLFLAG_RW,
3853     "B", "Jail may mount/unmount jail-friendly file systems");
3854 SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
3855     "B", "Jail may set file quotas");
3856 SYSCTL_JAIL_PARAM(_allow, jails, CTLTYPE_INT | CTLFLAG_RW,
3857     "B", "Jail may create child jails");
3858 SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
3859     "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
3860 
3861 
3862 #ifdef DDB
3863 
3864 static void
3865 db_show_prison(struct prison *pr)
3866 {
3867 	int fi;
3868 #if defined(INET) || defined(INET6)
3869 	int ii;
3870 #endif
3871 #ifdef INET6
3872 	char ip6buf[INET6_ADDRSTRLEN];
3873 #endif
3874 
3875 	db_printf("prison %p:\n", pr);
3876 	db_printf(" jid             = %d\n", pr->pr_id);
3877 	db_printf(" name            = %s\n", pr->pr_name);
3878 	db_printf(" parent          = %p\n", pr->pr_parent);
3879 	db_printf(" ref             = %d\n", pr->pr_ref);
3880 	db_printf(" uref            = %d\n", pr->pr_uref);
3881 	db_printf(" path            = %s\n", pr->pr_path);
3882 	db_printf(" cpuset          = %d\n", pr->pr_cpuset
3883 	    ? pr->pr_cpuset->cs_id : -1);
3884 	db_printf(" root            = %p\n", pr->pr_root);
3885 	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
3886 	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
3887 	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
3888 	db_printf(" flags           = %x", pr->pr_flags);
3889 	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
3890 	    fi++)
3891 		if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi)))
3892 			db_printf(" %s", pr_flag_names[fi]);
3893 	db_printf(" allow           = %x", pr->pr_allow);
3894 	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
3895 	    fi++)
3896 		if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi)))
3897 			db_printf(" %s", pr_allow_names[fi]);
3898 	db_printf("\n");
3899 	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
3900 	db_printf(" host.hostname   = %s\n", pr->pr_host);
3901 	db_printf(" host.domainname = %s\n", pr->pr_domain);
3902 	db_printf(" host.hostuuid   = %s\n", pr->pr_uuid);
3903 	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
3904 #ifdef INET
3905 	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
3906 	for (ii = 0; ii < pr->pr_ip4s; ii++)
3907 		db_printf(" %s %s\n",
3908 		    ii == 0 ? "ip4             =" : "                 ",
3909 		    inet_ntoa(pr->pr_ip4[ii]));
3910 #endif
3911 #ifdef INET6
3912 	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
3913 	for (ii = 0; ii < pr->pr_ip6s; ii++)
3914 		db_printf(" %s %s\n",
3915 		    ii == 0 ? "ip6             =" : "                 ",
3916 		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
3917 #endif
3918 }
3919 
3920 DB_SHOW_COMMAND(prison, db_show_prison_command)
3921 {
3922 	struct prison *pr;
3923 
3924 	if (!have_addr) {
3925 		/*
3926 		 * Show all prisons in the list, and prison0 which is not
3927 		 * listed.
3928 		 */
3929 		db_show_prison(&prison0);
3930 		if (!db_pager_quit) {
3931 			TAILQ_FOREACH(pr, &allprison, pr_list) {
3932 				db_show_prison(pr);
3933 				if (db_pager_quit)
3934 					break;
3935 			}
3936 		}
3937 		return;
3938 	}
3939 
3940 	if (addr == 0)
3941 		pr = &prison0;
3942 	else {
3943 		/* Look for a prison with the ID and with references. */
3944 		TAILQ_FOREACH(pr, &allprison, pr_list)
3945 			if (pr->pr_id == addr && pr->pr_ref > 0)
3946 				break;
3947 		if (pr == NULL)
3948 			/* Look again, without requiring a reference. */
3949 			TAILQ_FOREACH(pr, &allprison, pr_list)
3950 				if (pr->pr_id == addr)
3951 					break;
3952 		if (pr == NULL)
3953 			/* Assume address points to a valid prison. */
3954 			pr = (struct prison *)addr;
3955 	}
3956 	db_show_prison(pr);
3957 }
3958 
3959 #endif /* DDB */
3960