1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 1999 Poul-Henning Kamp.
5 * Copyright (c) 2008 Bjoern A. Zeeb.
6 * Copyright (c) 2009 James Gritton.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 #include <sys/cdefs.h>
32 #include "opt_ddb.h"
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 #include "opt_nfs.h"
36
37 #include <sys/param.h>
38 #include <sys/types.h>
39 #include <sys/kernel.h>
40 #include <sys/systm.h>
41 #include <sys/errno.h>
42 #include <sys/sysproto.h>
43 #include <sys/malloc.h>
44 #include <sys/osd.h>
45 #include <sys/priv.h>
46 #include <sys/proc.h>
47 #include <sys/epoch.h>
48 #include <sys/taskqueue.h>
49 #include <sys/fcntl.h>
50 #include <sys/jail.h>
51 #include <sys/linker.h>
52 #include <sys/lock.h>
53 #include <sys/mman.h>
54 #include <sys/mutex.h>
55 #include <sys/racct.h>
56 #include <sys/rctl.h>
57 #include <sys/refcount.h>
58 #include <sys/sx.h>
59 #include <sys/sysent.h>
60 #include <sys/namei.h>
61 #include <sys/mount.h>
62 #include <sys/queue.h>
63 #include <sys/socket.h>
64 #include <sys/syscallsubr.h>
65 #include <sys/sysctl.h>
66 #include <sys/uuid.h>
67 #include <sys/vnode.h>
68
69 #include <net/if.h>
70 #include <net/vnet.h>
71
72 #include <netinet/in.h>
73
74 #ifdef DDB
75 #include <ddb/ddb.h>
76 #endif /* DDB */
77
78 #include <security/mac/mac_framework.h>
79
80 #define PRISON0_HOSTUUID_MODULE "hostuuid"
81
82 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
83 static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
84
85 /* Keep struct prison prison0 and some code in kern_jail_set() readable. */
86 #ifdef INET
87 #ifdef INET6
88 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL
89 #else
90 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL
91 #endif
92 #else /* !INET */
93 #ifdef INET6
94 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL
95 #else
96 #define _PR_IP_SADDRSEL 0
97 #endif
98 #endif
99
100 /* prison0 describes what is "real" about the system. */
101 struct prison prison0 = {
102 .pr_id = 0,
103 .pr_name = "0",
104 .pr_ref = 1,
105 .pr_uref = 1,
106 .pr_path = "/",
107 .pr_securelevel = -1,
108 .pr_devfs_rsnum = 0,
109 .pr_state = PRISON_STATE_ALIVE,
110 .pr_childmax = JAIL_MAX,
111 .pr_hostuuid = DEFAULT_HOSTUUID,
112 .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children),
113 #ifdef VIMAGE
114 .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
115 #else
116 .pr_flags = PR_HOST|_PR_IP_SADDRSEL,
117 #endif
118 .pr_allow = PR_ALLOW_ALL_STATIC,
119 };
120 MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
121
122 struct bool_flags {
123 const char *name;
124 const char *noname;
125 volatile u_int flag;
126 };
127 struct jailsys_flags {
128 const char *name;
129 unsigned disable;
130 unsigned new;
131 };
132
133 /* allprison, allprison_racct and lastprid are protected by allprison_lock. */
134 struct sx allprison_lock;
135 SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
136 struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
137 LIST_HEAD(, prison_racct) allprison_racct;
138 int lastprid = 0;
139 int lastdeadid = 0;
140
141 static int get_next_prid(struct prison **insprp);
142 static int get_next_deadid(struct prison **insprp);
143 static int do_jail_attach(struct thread *td, struct prison *pr, int drflags);
144 static void prison_complete(void *context, int pending);
145 static void prison_deref(struct prison *pr, int flags);
146 static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison);
147 static int prison_lock_xlock(struct prison *pr, int flags);
148 static void prison_cleanup(struct prison *pr);
149 static void prison_free_not_last(struct prison *pr);
150 static void prison_proc_free_not_last(struct prison *pr);
151 static void prison_proc_relink(struct prison *opr, struct prison *npr,
152 struct proc *p);
153 static void prison_set_allow_locked(struct prison *pr, unsigned flag,
154 int enable);
155 static char *prison_path(struct prison *pr1, struct prison *pr2);
156 #ifdef RACCT
157 static void prison_racct_attach(struct prison *pr);
158 static void prison_racct_modify(struct prison *pr);
159 static void prison_racct_detach(struct prison *pr);
160 #endif
161
162 /* Flags for prison_deref */
163 #define PD_DEREF 0x01 /* Decrement pr_ref */
164 #define PD_DEUREF 0x02 /* Decrement pr_uref */
165 #define PD_KILL 0x04 /* Remove jail, kill processes, etc */
166 #define PD_LOCKED 0x10 /* pr_mtx is held */
167 #define PD_LIST_SLOCKED 0x20 /* allprison_lock is held shared */
168 #define PD_LIST_XLOCKED 0x40 /* allprison_lock is held exclusive */
169 #define PD_OP_FLAGS 0x07 /* Operation flags */
170 #define PD_LOCK_FLAGS 0x70 /* Lock status flags */
171
172 /*
173 * Parameter names corresponding to PR_* flag values. Size values are for kvm
174 * as we cannot figure out the size of a sparse array, or an array without a
175 * terminating entry.
176 */
177 static struct bool_flags pr_flag_bool[] = {
178 {"persist", "nopersist", PR_PERSIST},
179 #ifdef INET
180 {"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL},
181 #endif
182 #ifdef INET6
183 {"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL},
184 #endif
185 };
186 const size_t pr_flag_bool_size = sizeof(pr_flag_bool);
187
188 static struct jailsys_flags pr_flag_jailsys[] = {
189 {"host", 0, PR_HOST},
190 #ifdef VIMAGE
191 {"vnet", 0, PR_VNET},
192 #endif
193 #ifdef INET
194 {"ip4", PR_IP4_USER, PR_IP4_USER},
195 #endif
196 #ifdef INET6
197 {"ip6", PR_IP6_USER, PR_IP6_USER},
198 #endif
199 };
200 const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
201
202 /*
203 * Make this array full-size so dynamic parameters can be added.
204 * It is protected by prison0.mtx, but lockless reading is allowed
205 * with an atomic check of the flag values.
206 */
207 static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
208 {"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME},
209 {"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC},
210 {"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS},
211 {"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS},
212 {"allow.mount", "allow.nomount", PR_ALLOW_MOUNT},
213 {"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS},
214 {"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF},
215 {"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK},
216 {"allow.reserved_ports", "allow.noreserved_ports",
217 PR_ALLOW_RESERVED_PORTS},
218 {"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF},
219 {"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug",
220 PR_ALLOW_UNPRIV_DEBUG},
221 {"allow.suser", "allow.nosuser", PR_ALLOW_SUSER},
222 #ifdef VIMAGE
223 {"allow.nfsd", "allow.nonfsd", PR_ALLOW_NFSD},
224 #endif
225 {"allow.extattr", "allow.noextattr", PR_ALLOW_EXTATTR},
226 {"allow.adjtime", "allow.noadjtime", PR_ALLOW_ADJTIME},
227 {"allow.settime", "allow.nosettime", PR_ALLOW_SETTIME},
228 };
229 static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC;
230 const size_t pr_flag_allow_size = sizeof(pr_flag_allow);
231
232 #define JAIL_DEFAULT_ALLOW (PR_ALLOW_SET_HOSTNAME | \
233 PR_ALLOW_RESERVED_PORTS | \
234 PR_ALLOW_UNPRIV_DEBUG | \
235 PR_ALLOW_SUSER)
236 #define JAIL_DEFAULT_ENFORCE_STATFS 2
237 #define JAIL_DEFAULT_DEVFS_RSNUM 0
238 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
239 static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
240 static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
241 #if defined(INET) || defined(INET6)
242 static unsigned jail_max_af_ips = 255;
243 #endif
244
245 /*
246 * Initialize the parts of prison0 that can't be static-initialized with
247 * constants. This is called from proc0_init() after creating thread0 cpuset.
248 */
249 void
prison0_init(void)250 prison0_init(void)
251 {
252 uint8_t *file, *data;
253 size_t size;
254 char buf[sizeof(prison0.pr_hostuuid)];
255 bool valid;
256
257 prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
258 prison0.pr_osreldate = osreldate;
259 strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
260
261 /* If we have a preloaded hostuuid, use it. */
262 file = preload_search_by_type(PRISON0_HOSTUUID_MODULE);
263 if (file != NULL) {
264 data = preload_fetch_addr(file);
265 size = preload_fetch_size(file);
266 if (data != NULL) {
267 /*
268 * The preloaded data may include trailing whitespace, almost
269 * certainly a newline; skip over any whitespace or
270 * non-printable characters to be safe.
271 */
272 while (size > 0 && data[size - 1] <= 0x20) {
273 size--;
274 }
275
276 valid = false;
277
278 /*
279 * Not NUL-terminated when passed from loader, but
280 * validate_uuid requires that due to using sscanf (as
281 * does the subsequent strlcpy, since it still reads
282 * past the given size to return the true length);
283 * bounce to a temporary buffer to fix.
284 */
285 if (size >= sizeof(buf))
286 goto done;
287
288 memcpy(buf, data, size);
289 buf[size] = '\0';
290
291 if (validate_uuid(buf, size, NULL, 0) != 0)
292 goto done;
293
294 valid = true;
295 (void)strlcpy(prison0.pr_hostuuid, buf,
296 sizeof(prison0.pr_hostuuid));
297
298 done:
299 if (bootverbose && !valid) {
300 printf("hostuuid: preload data malformed: '%.*s'\n",
301 (int)size, data);
302 }
303 }
304 }
305 if (bootverbose)
306 printf("hostuuid: using %s\n", prison0.pr_hostuuid);
307 }
308
309 /*
310 * struct jail_args {
311 * struct jail *jail;
312 * };
313 */
314 int
sys_jail(struct thread * td,struct jail_args * uap)315 sys_jail(struct thread *td, struct jail_args *uap)
316 {
317 uint32_t version;
318 int error;
319 struct jail j;
320
321 error = copyin(uap->jail, &version, sizeof(uint32_t));
322 if (error)
323 return (error);
324
325 switch (version) {
326 case 0:
327 {
328 struct jail_v0 j0;
329
330 /* FreeBSD single IPv4 jails. */
331 bzero(&j, sizeof(struct jail));
332 error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
333 if (error)
334 return (error);
335 j.version = j0.version;
336 j.path = j0.path;
337 j.hostname = j0.hostname;
338 j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */
339 break;
340 }
341
342 case 1:
343 /*
344 * Version 1 was used by multi-IPv4 jail implementations
345 * that never made it into the official kernel.
346 */
347 return (EINVAL);
348
349 case 2: /* JAIL_API_VERSION */
350 /* FreeBSD multi-IPv4/IPv6,noIP jails. */
351 error = copyin(uap->jail, &j, sizeof(struct jail));
352 if (error)
353 return (error);
354 break;
355
356 default:
357 /* Sci-Fi jails are not supported, sorry. */
358 return (EINVAL);
359 }
360 return (kern_jail(td, &j));
361 }
362
363 int
kern_jail(struct thread * td,struct jail * j)364 kern_jail(struct thread *td, struct jail *j)
365 {
366 struct iovec optiov[2 * (4 + nitems(pr_flag_allow)
367 #ifdef INET
368 + 1
369 #endif
370 #ifdef INET6
371 + 1
372 #endif
373 )];
374 struct uio opt;
375 char *u_path, *u_hostname, *u_name;
376 struct bool_flags *bf;
377 #ifdef INET
378 uint32_t ip4s;
379 struct in_addr *u_ip4;
380 #endif
381 #ifdef INET6
382 struct in6_addr *u_ip6;
383 #endif
384 size_t tmplen;
385 int error, enforce_statfs;
386
387 bzero(&optiov, sizeof(optiov));
388 opt.uio_iov = optiov;
389 opt.uio_iovcnt = 0;
390 opt.uio_offset = -1;
391 opt.uio_resid = -1;
392 opt.uio_segflg = UIO_SYSSPACE;
393 opt.uio_rw = UIO_READ;
394 opt.uio_td = td;
395
396 /* Set permissions for top-level jails from sysctls. */
397 if (!jailed(td->td_ucred)) {
398 for (bf = pr_flag_allow;
399 bf < pr_flag_allow + nitems(pr_flag_allow) &&
400 atomic_load_int(&bf->flag) != 0;
401 bf++) {
402 optiov[opt.uio_iovcnt].iov_base = __DECONST(char *,
403 (jail_default_allow & bf->flag)
404 ? bf->name : bf->noname);
405 optiov[opt.uio_iovcnt].iov_len =
406 strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
407 opt.uio_iovcnt += 2;
408 }
409 optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
410 optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
411 opt.uio_iovcnt++;
412 enforce_statfs = jail_default_enforce_statfs;
413 optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
414 optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
415 opt.uio_iovcnt++;
416 }
417
418 tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
419 #ifdef INET
420 ip4s = (j->version == 0) ? 1 : j->ip4s;
421 if (ip4s > jail_max_af_ips)
422 return (EINVAL);
423 tmplen += ip4s * sizeof(struct in_addr);
424 #else
425 if (j->ip4s > 0)
426 return (EINVAL);
427 #endif
428 #ifdef INET6
429 if (j->ip6s > jail_max_af_ips)
430 return (EINVAL);
431 tmplen += j->ip6s * sizeof(struct in6_addr);
432 #else
433 if (j->ip6s > 0)
434 return (EINVAL);
435 #endif
436 u_path = malloc(tmplen, M_TEMP, M_WAITOK);
437 u_hostname = u_path + MAXPATHLEN;
438 u_name = u_hostname + MAXHOSTNAMELEN;
439 #ifdef INET
440 u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
441 #endif
442 #ifdef INET6
443 #ifdef INET
444 u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
445 #else
446 u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
447 #endif
448 #endif
449 optiov[opt.uio_iovcnt].iov_base = "path";
450 optiov[opt.uio_iovcnt].iov_len = sizeof("path");
451 opt.uio_iovcnt++;
452 optiov[opt.uio_iovcnt].iov_base = u_path;
453 error = copyinstr(j->path, u_path, MAXPATHLEN,
454 &optiov[opt.uio_iovcnt].iov_len);
455 if (error) {
456 free(u_path, M_TEMP);
457 return (error);
458 }
459 opt.uio_iovcnt++;
460 optiov[opt.uio_iovcnt].iov_base = "host.hostname";
461 optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
462 opt.uio_iovcnt++;
463 optiov[opt.uio_iovcnt].iov_base = u_hostname;
464 error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
465 &optiov[opt.uio_iovcnt].iov_len);
466 if (error) {
467 free(u_path, M_TEMP);
468 return (error);
469 }
470 opt.uio_iovcnt++;
471 if (j->jailname != NULL) {
472 optiov[opt.uio_iovcnt].iov_base = "name";
473 optiov[opt.uio_iovcnt].iov_len = sizeof("name");
474 opt.uio_iovcnt++;
475 optiov[opt.uio_iovcnt].iov_base = u_name;
476 error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
477 &optiov[opt.uio_iovcnt].iov_len);
478 if (error) {
479 free(u_path, M_TEMP);
480 return (error);
481 }
482 opt.uio_iovcnt++;
483 }
484 #ifdef INET
485 optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
486 optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
487 opt.uio_iovcnt++;
488 optiov[opt.uio_iovcnt].iov_base = u_ip4;
489 optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
490 if (j->version == 0)
491 u_ip4->s_addr = j->ip4s;
492 else {
493 error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
494 if (error) {
495 free(u_path, M_TEMP);
496 return (error);
497 }
498 }
499 opt.uio_iovcnt++;
500 #endif
501 #ifdef INET6
502 optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
503 optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
504 opt.uio_iovcnt++;
505 optiov[opt.uio_iovcnt].iov_base = u_ip6;
506 optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
507 error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
508 if (error) {
509 free(u_path, M_TEMP);
510 return (error);
511 }
512 opt.uio_iovcnt++;
513 #endif
514 KASSERT(opt.uio_iovcnt <= nitems(optiov),
515 ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
516 error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
517 free(u_path, M_TEMP);
518 return (error);
519 }
520
521 /*
522 * struct jail_set_args {
523 * struct iovec *iovp;
524 * unsigned int iovcnt;
525 * int flags;
526 * };
527 */
528 int
sys_jail_set(struct thread * td,struct jail_set_args * uap)529 sys_jail_set(struct thread *td, struct jail_set_args *uap)
530 {
531 struct uio *auio;
532 int error;
533
534 /* Check that we have an even number of iovecs. */
535 if (uap->iovcnt & 1)
536 return (EINVAL);
537
538 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
539 if (error)
540 return (error);
541 error = kern_jail_set(td, auio, uap->flags);
542 freeuio(auio);
543 return (error);
544 }
545
546 #if defined(INET) || defined(INET6)
547 typedef int prison_addr_cmp_t(const void *, const void *);
548 typedef bool prison_addr_valid_t(const void *);
549 static const struct pr_family {
550 size_t size;
551 prison_addr_cmp_t *cmp;
552 prison_addr_valid_t *valid;
553 int ip_flag;
554 } pr_families[PR_FAMILY_MAX] = {
555 #ifdef INET
556 [PR_INET] = {
557 .size = sizeof(struct in_addr),
558 .cmp = prison_qcmp_v4,
559 .valid = prison_valid_v4,
560 .ip_flag = PR_IP4_USER,
561 },
562 #endif
563 #ifdef INET6
564 [PR_INET6] = {
565 .size = sizeof(struct in6_addr),
566 .cmp = prison_qcmp_v6,
567 .valid = prison_valid_v6,
568 .ip_flag = PR_IP6_USER,
569 },
570 #endif
571 };
572
573 /*
574 * Network address lists (pr_addrs) allocation for jails. The addresses
575 * are accessed locklessly by the network stack, thus need to be protected by
576 * the network epoch.
577 */
578 struct prison_ip {
579 struct epoch_context ctx;
580 uint32_t ips;
581 #ifdef FUTURE_C
582 /*
583 * XXX Variable-length automatic arrays in union may be
584 * supported in future C.
585 */
586 union {
587 char pr_ip[];
588 struct in_addr pr_ip4[];
589 struct in6_addr pr_ip6[];
590 };
591 #else /* No future C :( */
592 char pr_ip[];
593 #endif
594 };
595
596 static char *
PR_IP(struct prison_ip * pip,const pr_family_t af,int idx)597 PR_IP(struct prison_ip *pip, const pr_family_t af, int idx)
598 {
599 MPASS(pip);
600 MPASS(af < PR_FAMILY_MAX);
601 MPASS(idx >= 0 && idx < pip->ips);
602
603 return (pip->pr_ip + pr_families[af].size * idx);
604 }
605
606 static struct prison_ip *
prison_ip_alloc(const pr_family_t af,uint32_t cnt,int flags)607 prison_ip_alloc(const pr_family_t af, uint32_t cnt, int flags)
608 {
609 struct prison_ip *pip;
610
611 pip = malloc(sizeof(struct prison_ip) + cnt * pr_families[af].size,
612 M_PRISON, flags);
613 if (pip != NULL)
614 pip->ips = cnt;
615 return (pip);
616 }
617
618 /*
619 * Allocate and copyin user supplied address list, sorting and validating.
620 * kern_jail_set() helper.
621 */
622 static struct prison_ip *
prison_ip_copyin(const pr_family_t af,void * op,uint32_t cnt)623 prison_ip_copyin(const pr_family_t af, void *op, uint32_t cnt)
624 {
625 prison_addr_cmp_t *const cmp = pr_families[af].cmp;
626 const size_t size = pr_families[af].size;
627 struct prison_ip *pip;
628
629 pip = prison_ip_alloc(af, cnt, M_WAITOK);
630 bcopy(op, pip->pr_ip, cnt * size);
631 /*
632 * IP addresses are all sorted but ip[0] to preserve
633 * the primary IP address as given from userland.
634 * This special IP is used for unbound outgoing
635 * connections as well for "loopback" traffic in case
636 * source address selection cannot find any more fitting
637 * address to connect from.
638 */
639 if (cnt > 1)
640 qsort(PR_IP(pip, af, 1), cnt - 1, size, cmp);
641 /*
642 * Check for duplicate addresses and do some simple
643 * zero and broadcast checks. If users give other bogus
644 * addresses it is their problem.
645 */
646 for (int i = 0; i < cnt; i++) {
647 if (!pr_families[af].valid(PR_IP(pip, af, i))) {
648 free(pip, M_PRISON);
649 return (NULL);
650 }
651 if (i + 1 < cnt &&
652 (cmp(PR_IP(pip, af, 0), PR_IP(pip, af, i + 1)) == 0 ||
653 cmp(PR_IP(pip, af, i), PR_IP(pip, af, i + 1)) == 0)) {
654 free(pip, M_PRISON);
655 return (NULL);
656 }
657 }
658
659 return (pip);
660 }
661
662 /*
663 * Allocate and dup parent prison address list.
664 * kern_jail_set() helper.
665 */
666 static void
prison_ip_dup(struct prison * ppr,struct prison * pr,const pr_family_t af)667 prison_ip_dup(struct prison *ppr, struct prison *pr, const pr_family_t af)
668 {
669 const struct prison_ip *ppip = ppr->pr_addrs[af];
670 struct prison_ip *pip;
671
672 if (ppip != NULL) {
673 pip = prison_ip_alloc(af, ppip->ips, M_WAITOK);
674 bcopy(ppip->pr_ip, pip->pr_ip, pip->ips * pr_families[af].size);
675 pr->pr_addrs[af] = pip;
676 }
677 }
678
679 /*
680 * Make sure the new set of IP addresses is a subset of the parent's list.
681 * Don't worry about the parent being unlocked, as any setting is done with
682 * allprison_lock held.
683 * kern_jail_set() helper.
684 */
685 static bool
prison_ip_parent_match(struct prison_ip * ppip,struct prison_ip * pip,const pr_family_t af)686 prison_ip_parent_match(struct prison_ip *ppip, struct prison_ip *pip,
687 const pr_family_t af)
688 {
689 prison_addr_cmp_t *const cmp = pr_families[af].cmp;
690 int i, j;
691
692 if (ppip == NULL)
693 return (false);
694
695 for (i = 0; i < ppip->ips; i++)
696 if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, i)) == 0)
697 break;
698
699 if (i == ppip->ips)
700 /* Main address not present in parent. */
701 return (false);
702
703 if (pip->ips > 1) {
704 for (i = j = 1; i < pip->ips; i++) {
705 if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0)
706 /* Equals to parent primary address. */
707 continue;
708 for (; j < ppip->ips; j++)
709 if (cmp(PR_IP(pip, af, i),
710 PR_IP(ppip, af, j)) == 0)
711 break;
712 if (j == ppip->ips)
713 break;
714 }
715 if (j == ppip->ips)
716 /* Address not present in parent. */
717 return (false);
718 }
719 return (true);
720 }
721
722 /*
723 * Check for conflicting IP addresses. We permit them if there is no more
724 * than one IP on each jail. If there is a duplicate on a jail with more
725 * than one IP stop checking and return error.
726 * kern_jail_set() helper.
727 */
728 static bool
prison_ip_conflict_check(const struct prison * ppr,const struct prison * pr,struct prison_ip * pip,pr_family_t af)729 prison_ip_conflict_check(const struct prison *ppr, const struct prison *pr,
730 struct prison_ip *pip, pr_family_t af)
731 {
732 const struct prison *tppr, *tpr;
733 int descend;
734
735 #ifdef VIMAGE
736 for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
737 if (tppr->pr_flags & PR_VNET)
738 break;
739 #else
740 tppr = &prison0;
741 #endif
742 FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
743 if (tpr == pr ||
744 #ifdef VIMAGE
745 (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
746 #endif
747 !prison_isalive(tpr)) {
748 descend = 0;
749 continue;
750 }
751 if (!(tpr->pr_flags & pr_families[af].ip_flag))
752 continue;
753 descend = 0;
754 if (tpr->pr_addrs[af] == NULL ||
755 (pip->ips == 1 && tpr->pr_addrs[af]->ips == 1))
756 continue;
757 for (int i = 0; i < pip->ips; i++)
758 if (prison_ip_check(tpr, af, PR_IP(pip, af, i)) == 0)
759 return (false);
760 }
761
762 return (true);
763 }
764
765 _Static_assert(offsetof(struct prison_ip, ctx) == 0,
766 "prison must start with epoch context");
767 static void
prison_ip_free_deferred(epoch_context_t ctx)768 prison_ip_free_deferred(epoch_context_t ctx)
769 {
770
771 free(ctx, M_PRISON);
772 }
773
774 static void
prison_ip_free(struct prison_ip * pip)775 prison_ip_free(struct prison_ip *pip)
776 {
777
778 if (pip != NULL)
779 NET_EPOCH_CALL(prison_ip_free_deferred, &pip->ctx);
780 }
781
782 static void
prison_ip_set(struct prison * pr,const pr_family_t af,struct prison_ip * new)783 prison_ip_set(struct prison *pr, const pr_family_t af, struct prison_ip *new)
784 {
785 struct prison_ip **mem, *old;
786
787 mtx_assert(&pr->pr_mtx, MA_OWNED);
788
789 mem = &pr->pr_addrs[af];
790
791 old = *mem;
792 atomic_store_ptr(mem, new);
793 prison_ip_free(old);
794 }
795
796 /*
797 * Restrict a prison's IP address list with its parent's, possibly replacing
798 * it. Return true if succeed, otherwise should redo.
799 * kern_jail_set() helper.
800 */
801 static bool
prison_ip_restrict(struct prison * pr,const pr_family_t af,struct prison_ip ** newp)802 prison_ip_restrict(struct prison *pr, const pr_family_t af,
803 struct prison_ip **newp)
804 {
805 struct prison_ip *ppip = pr->pr_parent->pr_addrs[af];
806 struct prison_ip *pip = pr->pr_addrs[af];
807 int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
808 const size_t size = pr_families[af].size;
809 struct prison_ip *new = newp != NULL ? *newp : NULL;
810 uint32_t ips;
811
812 mtx_assert(&pr->pr_mtx, MA_OWNED);
813
814 /*
815 * Due to epoch-synchronized access to the IP address lists we always
816 * allocate a new list even if the old one has enough space. We could
817 * atomically update an IPv4 address inside a list, but that would
818 * screw up sorting, and in case of IPv6 we can't even atomically write
819 * one.
820 */
821 if (ppip == NULL) {
822 if (pip != NULL)
823 prison_ip_set(pr, af, NULL);
824 return (true);
825 }
826
827 if (!(pr->pr_flags & pr_families[af].ip_flag)) {
828 if (new == NULL) {
829 new = prison_ip_alloc(af, ppip->ips, M_NOWAIT);
830 if (new == NULL)
831 return (false); /* Redo */
832 }
833 /* This has no user settings, so just copy the parent's list. */
834 MPASS(new->ips == ppip->ips);
835 bcopy(ppip->pr_ip, new->pr_ip, ppip->ips * size);
836 prison_ip_set(pr, af, new);
837 if (newp != NULL)
838 *newp = NULL; /* Used */
839 } else if (pip != NULL) {
840 /* Remove addresses that aren't in the parent. */
841 int i;
842
843 i = 0; /* index in pip */
844 ips = 0; /* index in new */
845
846 if (new == NULL) {
847 new = prison_ip_alloc(af, pip->ips, M_NOWAIT);
848 if (new == NULL)
849 return (false); /* Redo */
850 }
851
852 for (int pi = 0; pi < ppip->ips; pi++)
853 if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, pi)) == 0) {
854 /* Found our primary address in parent. */
855 bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
856 size);
857 i++;
858 ips++;
859 break;
860 }
861 for (int pi = 1; i < pip->ips; ) {
862 /* Check against primary, which is unsorted. */
863 if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0) {
864 /* Matches parent's primary address. */
865 bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
866 size);
867 i++;
868 ips++;
869 continue;
870 }
871 /* The rest are sorted. */
872 switch (pi >= ppip->ips ? -1 :
873 cmp(PR_IP(pip, af, i), PR_IP(ppip, af, pi))) {
874 case -1:
875 i++;
876 break;
877 case 0:
878 bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
879 size);
880 i++;
881 pi++;
882 ips++;
883 break;
884 case 1:
885 pi++;
886 break;
887 }
888 }
889 if (ips == 0) {
890 if (newp == NULL || *newp == NULL)
891 prison_ip_free(new);
892 new = NULL;
893 } else {
894 /* Shrink to real size */
895 KASSERT((new->ips >= ips),
896 ("Out-of-bounds write to prison_ip %p", new));
897 new->ips = ips;
898 }
899 prison_ip_set(pr, af, new);
900 if (newp != NULL)
901 *newp = NULL; /* Used */
902 }
903 return (true);
904 }
905
906 /*
907 * Fast-path check if an address belongs to a prison.
908 */
909 int
prison_ip_check(const struct prison * pr,const pr_family_t af,const void * addr)910 prison_ip_check(const struct prison *pr, const pr_family_t af,
911 const void *addr)
912 {
913 int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
914 struct prison_ip *pip;
915 int i, a, z, d;
916
917 MPASS(mtx_owned(&pr->pr_mtx) ||
918 in_epoch(net_epoch_preempt) ||
919 sx_xlocked(&allprison_lock));
920
921 pip = atomic_load_ptr(&pr->pr_addrs[af]);
922 if (__predict_false(pip == NULL))
923 return (EAFNOSUPPORT);
924
925 /* Check the primary IP. */
926 if (cmp(PR_IP(pip, af, 0), addr) == 0)
927 return (0);
928
929 /*
930 * All the other IPs are sorted so we can do a binary search.
931 */
932 a = 0;
933 z = pip->ips - 2;
934 while (a <= z) {
935 i = (a + z) / 2;
936 d = cmp(PR_IP(pip, af, i + 1), addr);
937 if (d > 0)
938 z = i - 1;
939 else if (d < 0)
940 a = i + 1;
941 else
942 return (0);
943 }
944
945 return (EADDRNOTAVAIL);
946 }
947
948 /*
949 * Grab primary IP. Historically required mutex, but nothing prevents
950 * us to support epoch-protected access. Is it used in fast path?
951 * in{6}_jail.c helper
952 */
953 const void *
prison_ip_get0(const struct prison * pr,const pr_family_t af)954 prison_ip_get0(const struct prison *pr, const pr_family_t af)
955 {
956 const struct prison_ip *pip = pr->pr_addrs[af];
957
958 mtx_assert(&pr->pr_mtx, MA_OWNED);
959 MPASS(pip);
960
961 return (pip->pr_ip);
962 }
963
964 u_int
prison_ip_cnt(const struct prison * pr,const pr_family_t af)965 prison_ip_cnt(const struct prison *pr, const pr_family_t af)
966 {
967
968 return (pr->pr_addrs[af]->ips);
969 }
970 #endif /* defined(INET) || defined(INET6) */
971
972 int
kern_jail_set(struct thread * td,struct uio * optuio,int flags)973 kern_jail_set(struct thread *td, struct uio *optuio, int flags)
974 {
975 struct nameidata nd;
976 #ifdef INET
977 struct prison_ip *ip4;
978 #endif
979 #ifdef INET6
980 struct prison_ip *ip6;
981 #endif
982 struct vfsopt *opt;
983 struct vfsoptlist *opts;
984 struct prison *pr, *deadpr, *dinspr, *inspr, *mypr, *ppr, *tpr;
985 struct vnode *root;
986 char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
987 char *g_path, *osrelstr;
988 struct bool_flags *bf;
989 struct jailsys_flags *jsf;
990 #if defined(INET) || defined(INET6)
991 void *op;
992 #endif
993 unsigned long hid;
994 size_t namelen, onamelen, pnamelen;
995 int created, cuflags, descend, drflags, enforce;
996 int error, errmsg_len, errmsg_pos;
997 int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
998 int deadid, jid, jsys, len, level;
999 int childmax, osreldt, rsnum, slevel;
1000 #ifdef INET
1001 int ip4s;
1002 bool redo_ip4;
1003 #endif
1004 #ifdef INET6
1005 int ip6s;
1006 bool redo_ip6;
1007 #endif
1008 uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
1009 uint64_t pr_allow_diff;
1010 unsigned tallow;
1011 char numbuf[12];
1012
1013 error = priv_check(td, PRIV_JAIL_SET);
1014 if (!error && (flags & JAIL_ATTACH))
1015 error = priv_check(td, PRIV_JAIL_ATTACH);
1016 if (error)
1017 return (error);
1018 mypr = td->td_ucred->cr_prison;
1019 if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
1020 return (EPERM);
1021 if (flags & ~JAIL_SET_MASK)
1022 return (EINVAL);
1023
1024 /*
1025 * Check all the parameters before committing to anything. Not all
1026 * errors can be caught early, but we may as well try. Also, this
1027 * takes care of some expensive stuff (path lookup) before getting
1028 * the allprison lock.
1029 *
1030 * XXX Jails are not filesystems, and jail parameters are not mount
1031 * options. But it makes more sense to re-use the vfsopt code
1032 * than duplicate it under a different name.
1033 */
1034 error = vfs_buildopts(optuio, &opts);
1035 if (error)
1036 return (error);
1037 #ifdef INET
1038 ip4 = NULL;
1039 #endif
1040 #ifdef INET6
1041 ip6 = NULL;
1042 #endif
1043 g_path = NULL;
1044
1045 cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
1046 if (!cuflags) {
1047 error = EINVAL;
1048 vfs_opterror(opts, "no valid operation (create or update)");
1049 goto done_errmsg;
1050 }
1051
1052 error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
1053 if (error == ENOENT)
1054 jid = 0;
1055 else if (error != 0)
1056 goto done_free;
1057
1058 error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
1059 if (error == ENOENT)
1060 gotslevel = 0;
1061 else if (error != 0)
1062 goto done_free;
1063 else
1064 gotslevel = 1;
1065
1066 error =
1067 vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
1068 if (error == ENOENT)
1069 gotchildmax = 0;
1070 else if (error != 0)
1071 goto done_free;
1072 else
1073 gotchildmax = 1;
1074
1075 error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
1076 if (error == ENOENT)
1077 gotenforce = 0;
1078 else if (error != 0)
1079 goto done_free;
1080 else if (enforce < 0 || enforce > 2) {
1081 error = EINVAL;
1082 goto done_free;
1083 } else
1084 gotenforce = 1;
1085
1086 error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
1087 if (error == ENOENT)
1088 gotrsnum = 0;
1089 else if (error != 0)
1090 goto done_free;
1091 else
1092 gotrsnum = 1;
1093
1094 pr_flags = ch_flags = 0;
1095 for (bf = pr_flag_bool;
1096 bf < pr_flag_bool + nitems(pr_flag_bool);
1097 bf++) {
1098 vfs_flagopt(opts, bf->name, &pr_flags, bf->flag);
1099 vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag);
1100 }
1101 ch_flags |= pr_flags;
1102 for (jsf = pr_flag_jailsys;
1103 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
1104 jsf++) {
1105 error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys));
1106 if (error == ENOENT)
1107 continue;
1108 if (error != 0)
1109 goto done_free;
1110 switch (jsys) {
1111 case JAIL_SYS_DISABLE:
1112 if (!jsf->disable) {
1113 error = EINVAL;
1114 goto done_free;
1115 }
1116 pr_flags |= jsf->disable;
1117 break;
1118 case JAIL_SYS_NEW:
1119 pr_flags |= jsf->new;
1120 break;
1121 case JAIL_SYS_INHERIT:
1122 break;
1123 default:
1124 error = EINVAL;
1125 goto done_free;
1126 }
1127 ch_flags |= jsf->new | jsf->disable;
1128 }
1129 if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE
1130 && !(pr_flags & PR_PERSIST)) {
1131 error = EINVAL;
1132 vfs_opterror(opts, "new jail must persist or attach");
1133 goto done_errmsg;
1134 }
1135 #ifdef VIMAGE
1136 if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
1137 error = EINVAL;
1138 vfs_opterror(opts, "vnet cannot be changed after creation");
1139 goto done_errmsg;
1140 }
1141 #endif
1142 #ifdef INET
1143 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
1144 error = EINVAL;
1145 vfs_opterror(opts, "ip4 cannot be changed after creation");
1146 goto done_errmsg;
1147 }
1148 #endif
1149 #ifdef INET6
1150 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
1151 error = EINVAL;
1152 vfs_opterror(opts, "ip6 cannot be changed after creation");
1153 goto done_errmsg;
1154 }
1155 #endif
1156
1157 pr_allow = ch_allow = 0;
1158 for (bf = pr_flag_allow;
1159 bf < pr_flag_allow + nitems(pr_flag_allow) &&
1160 atomic_load_int(&bf->flag) != 0;
1161 bf++) {
1162 vfs_flagopt(opts, bf->name, &pr_allow, bf->flag);
1163 vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag);
1164 }
1165 ch_allow |= pr_allow;
1166
1167 error = vfs_getopt(opts, "name", (void **)&name, &len);
1168 if (error == ENOENT)
1169 name = NULL;
1170 else if (error != 0)
1171 goto done_free;
1172 else {
1173 if (len == 0 || name[len - 1] != '\0') {
1174 error = EINVAL;
1175 goto done_free;
1176 }
1177 if (len > MAXHOSTNAMELEN) {
1178 error = ENAMETOOLONG;
1179 goto done_free;
1180 }
1181 }
1182
1183 error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
1184 if (error == ENOENT)
1185 host = NULL;
1186 else if (error != 0)
1187 goto done_free;
1188 else {
1189 ch_flags |= PR_HOST;
1190 pr_flags |= PR_HOST;
1191 if (len == 0 || host[len - 1] != '\0') {
1192 error = EINVAL;
1193 goto done_free;
1194 }
1195 if (len > MAXHOSTNAMELEN) {
1196 error = ENAMETOOLONG;
1197 goto done_free;
1198 }
1199 }
1200
1201 error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
1202 if (error == ENOENT)
1203 domain = NULL;
1204 else if (error != 0)
1205 goto done_free;
1206 else {
1207 ch_flags |= PR_HOST;
1208 pr_flags |= PR_HOST;
1209 if (len == 0 || domain[len - 1] != '\0') {
1210 error = EINVAL;
1211 goto done_free;
1212 }
1213 if (len > MAXHOSTNAMELEN) {
1214 error = ENAMETOOLONG;
1215 goto done_free;
1216 }
1217 }
1218
1219 error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
1220 if (error == ENOENT)
1221 uuid = NULL;
1222 else if (error != 0)
1223 goto done_free;
1224 else {
1225 ch_flags |= PR_HOST;
1226 pr_flags |= PR_HOST;
1227 if (len == 0 || uuid[len - 1] != '\0') {
1228 error = EINVAL;
1229 goto done_free;
1230 }
1231 if (len > HOSTUUIDLEN) {
1232 error = ENAMETOOLONG;
1233 goto done_free;
1234 }
1235 }
1236
1237 #ifdef COMPAT_FREEBSD32
1238 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
1239 uint32_t hid32;
1240
1241 error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
1242 hid = hid32;
1243 } else
1244 #endif
1245 error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
1246 if (error == ENOENT)
1247 gothid = 0;
1248 else if (error != 0)
1249 goto done_free;
1250 else {
1251 gothid = 1;
1252 ch_flags |= PR_HOST;
1253 pr_flags |= PR_HOST;
1254 }
1255
1256 #ifdef INET
1257 error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
1258 if (error == ENOENT)
1259 ip4s = 0;
1260 else if (error != 0)
1261 goto done_free;
1262 else if (ip4s & (sizeof(struct in_addr) - 1)) {
1263 error = EINVAL;
1264 goto done_free;
1265 } else {
1266 ch_flags |= PR_IP4_USER;
1267 pr_flags |= PR_IP4_USER;
1268 if (ip4s > 0) {
1269 ip4s /= sizeof(struct in_addr);
1270 if (ip4s > jail_max_af_ips) {
1271 error = EINVAL;
1272 vfs_opterror(opts, "too many IPv4 addresses");
1273 goto done_errmsg;
1274 }
1275 ip4 = prison_ip_copyin(PR_INET, op, ip4s);
1276 if (ip4 == NULL) {
1277 error = EINVAL;
1278 goto done_free;
1279 }
1280 }
1281 }
1282 #endif
1283
1284 #ifdef INET6
1285 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
1286 if (error == ENOENT)
1287 ip6s = 0;
1288 else if (error != 0)
1289 goto done_free;
1290 else if (ip6s & (sizeof(struct in6_addr) - 1)) {
1291 error = EINVAL;
1292 goto done_free;
1293 } else {
1294 ch_flags |= PR_IP6_USER;
1295 pr_flags |= PR_IP6_USER;
1296 if (ip6s > 0) {
1297 ip6s /= sizeof(struct in6_addr);
1298 if (ip6s > jail_max_af_ips) {
1299 error = EINVAL;
1300 vfs_opterror(opts, "too many IPv6 addresses");
1301 goto done_errmsg;
1302 }
1303 ip6 = prison_ip_copyin(PR_INET6, op, ip6s);
1304 if (ip6 == NULL) {
1305 error = EINVAL;
1306 goto done_free;
1307 }
1308 }
1309 }
1310 #endif
1311
1312 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
1313 if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1314 error = EINVAL;
1315 vfs_opterror(opts,
1316 "vnet jails cannot have IP address restrictions");
1317 goto done_errmsg;
1318 }
1319 #endif
1320
1321 error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
1322 if (error == ENOENT)
1323 osrelstr = NULL;
1324 else if (error != 0)
1325 goto done_free;
1326 else {
1327 if (flags & JAIL_UPDATE) {
1328 error = EINVAL;
1329 vfs_opterror(opts,
1330 "osrelease cannot be changed after creation");
1331 goto done_errmsg;
1332 }
1333 if (len == 0 || osrelstr[len - 1] != '\0') {
1334 error = EINVAL;
1335 goto done_free;
1336 }
1337 if (len >= OSRELEASELEN) {
1338 error = ENAMETOOLONG;
1339 vfs_opterror(opts,
1340 "osrelease string must be 1-%d bytes long",
1341 OSRELEASELEN - 1);
1342 goto done_errmsg;
1343 }
1344 }
1345
1346 error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
1347 if (error == ENOENT)
1348 osreldt = 0;
1349 else if (error != 0)
1350 goto done_free;
1351 else {
1352 if (flags & JAIL_UPDATE) {
1353 error = EINVAL;
1354 vfs_opterror(opts,
1355 "osreldate cannot be changed after creation");
1356 goto done_errmsg;
1357 }
1358 if (osreldt == 0) {
1359 error = EINVAL;
1360 vfs_opterror(opts, "osreldate cannot be 0");
1361 goto done_errmsg;
1362 }
1363 }
1364
1365 root = NULL;
1366 error = vfs_getopt(opts, "path", (void **)&path, &len);
1367 if (error == ENOENT)
1368 path = NULL;
1369 else if (error != 0)
1370 goto done_free;
1371 else {
1372 if (flags & JAIL_UPDATE) {
1373 error = EINVAL;
1374 vfs_opterror(opts,
1375 "path cannot be changed after creation");
1376 goto done_errmsg;
1377 }
1378 if (len == 0 || path[len - 1] != '\0') {
1379 error = EINVAL;
1380 goto done_free;
1381 }
1382 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path);
1383 error = namei(&nd);
1384 if (error)
1385 goto done_free;
1386 root = nd.ni_vp;
1387 NDFREE_PNBUF(&nd);
1388 g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1389 strlcpy(g_path, path, MAXPATHLEN);
1390 error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
1391 if (error == 0) {
1392 path = g_path;
1393 } else {
1394 /* exit on other errors */
1395 goto done_free;
1396 }
1397 if (root->v_type != VDIR) {
1398 error = ENOTDIR;
1399 vput(root);
1400 goto done_free;
1401 }
1402 VOP_UNLOCK(root);
1403 }
1404
1405 /*
1406 * Find the specified jail, or at least its parent.
1407 * This abuses the file error codes ENOENT and EEXIST.
1408 */
1409 pr = NULL;
1410 inspr = NULL;
1411 deadpr = NULL;
1412 if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
1413 namelc = strrchr(name, '.');
1414 jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
1415 if (*p != '\0')
1416 jid = 0;
1417 }
1418 sx_xlock(&allprison_lock);
1419 drflags = PD_LIST_XLOCKED;
1420 ppr = mypr;
1421 if (!prison_isalive(ppr)) {
1422 /* This jail is dying. This process will surely follow. */
1423 error = EAGAIN;
1424 goto done_deref;
1425 }
1426 if (jid != 0) {
1427 if (jid < 0) {
1428 error = EINVAL;
1429 vfs_opterror(opts, "negative jid");
1430 goto done_deref;
1431 }
1432 /*
1433 * See if a requested jid already exists. Keep track of
1434 * where it can be inserted later.
1435 */
1436 TAILQ_FOREACH(inspr, &allprison, pr_list) {
1437 if (inspr->pr_id < jid)
1438 continue;
1439 if (inspr->pr_id > jid)
1440 break;
1441 if (prison_isalive(inspr)) {
1442 pr = inspr;
1443 mtx_lock(&pr->pr_mtx);
1444 drflags |= PD_LOCKED;
1445 } else {
1446 /* Note a dying jail to handle later. */
1447 deadpr = inspr;
1448 }
1449 inspr = NULL;
1450 break;
1451 }
1452 if (cuflags == JAIL_CREATE && pr != NULL) {
1453 /*
1454 * Even creators that cannot see the jail will
1455 * get EEXIST.
1456 */
1457 error = EEXIST;
1458 vfs_opterror(opts, "jail %d already exists", jid);
1459 goto done_deref;
1460 }
1461 if ((pr == NULL)
1462 ? cuflags == JAIL_UPDATE
1463 : !prison_ischild(mypr, pr)) {
1464 /*
1465 * Updaters get ENOENT for nonexistent jails,
1466 * or for jails they cannot see. The latter
1467 * case is true even for CREATE | UPDATE,
1468 * which normally cannot give this error.
1469 */
1470 error = ENOENT;
1471 vfs_opterror(opts, "jail %d not found", jid);
1472 goto done_deref;
1473 }
1474 }
1475 /*
1476 * If the caller provided a name, look for a jail by that name.
1477 * This has different semantics for creates and updates keyed by jid
1478 * (where the name must not already exist in a different jail),
1479 * and updates keyed by the name itself (where the name must exist
1480 * because that is the jail being updated).
1481 */
1482 namelc = NULL;
1483 if (name != NULL) {
1484 namelc = strrchr(name, '.');
1485 if (namelc == NULL)
1486 namelc = name;
1487 else {
1488 /*
1489 * This is a hierarchical name. Split it into the
1490 * parent and child names, and make sure the parent
1491 * exists or matches an already found jail.
1492 */
1493 if (pr != NULL) {
1494 if (strncmp(name, ppr->pr_name, namelc - name)
1495 || ppr->pr_name[namelc - name] != '\0') {
1496 error = EINVAL;
1497 vfs_opterror(opts,
1498 "cannot change jail's parent");
1499 goto done_deref;
1500 }
1501 } else {
1502 *namelc = '\0';
1503 ppr = prison_find_name(mypr, name);
1504 if (ppr == NULL) {
1505 error = ENOENT;
1506 vfs_opterror(opts,
1507 "jail \"%s\" not found", name);
1508 goto done_deref;
1509 }
1510 mtx_unlock(&ppr->pr_mtx);
1511 if (!prison_isalive(ppr)) {
1512 error = ENOENT;
1513 vfs_opterror(opts,
1514 "jail \"%s\" is dying", name);
1515 goto done_deref;
1516 }
1517 *namelc = '.';
1518 }
1519 namelc++;
1520 }
1521 if (namelc[0] != '\0') {
1522 pnamelen =
1523 (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1524 FOREACH_PRISON_CHILD(ppr, tpr) {
1525 if (tpr == pr || !prison_isalive(tpr) ||
1526 strcmp(tpr->pr_name + pnamelen, namelc))
1527 continue;
1528 if (cuflags == JAIL_CREATE || pr != NULL) {
1529 /*
1530 * Create, or update(jid): name must
1531 * not exist in an active sibling jail.
1532 */
1533 error = EEXIST;
1534 vfs_opterror(opts,
1535 "jail \"%s\" already exists", name);
1536 goto done_deref;
1537 }
1538 /* Use this jail for updates. */
1539 pr = tpr;
1540 mtx_lock(&pr->pr_mtx);
1541 drflags |= PD_LOCKED;
1542 break;
1543 }
1544 /*
1545 * Update: name must exist if no jid is specified.
1546 * As with the jid case, the jail must be currently
1547 * visible, or else even CREATE | UPDATE will get
1548 * an error.
1549 */
1550 if ((pr == NULL)
1551 ? cuflags == JAIL_UPDATE
1552 : !prison_isalive(pr)) {
1553 error = ENOENT;
1554 vfs_opterror(opts, "jail \"%s\" not found",
1555 name);
1556 goto done_deref;
1557 }
1558 }
1559 }
1560 /* Update: must provide a jid or name. */
1561 else if (cuflags == JAIL_UPDATE && pr == NULL) {
1562 error = ENOENT;
1563 vfs_opterror(opts, "update specified no jail");
1564 goto done_deref;
1565 }
1566
1567 /* If there's no prison to update, create a new one and link it in. */
1568 created = pr == NULL;
1569 if (created) {
1570 for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1571 if (tpr->pr_childcount >= tpr->pr_childmax) {
1572 error = EPERM;
1573 vfs_opterror(opts, "prison limit exceeded");
1574 goto done_deref;
1575 }
1576
1577 if (deadpr != NULL) {
1578 /*
1579 * The prison being created has the same ID as a dying
1580 * one. Handle this by giving the dying jail a new ID.
1581 * This may cause some confusion to user space, but
1582 * only to those listing dying jails.
1583 */
1584 deadid = get_next_deadid(&dinspr);
1585 if (deadid == 0) {
1586 error = EAGAIN;
1587 vfs_opterror(opts, "no available jail IDs");
1588 goto done_deref;
1589 }
1590 mtx_lock(&deadpr->pr_mtx);
1591 deadpr->pr_id = deadid;
1592 mtx_unlock(&deadpr->pr_mtx);
1593 if (dinspr == deadpr)
1594 inspr = deadpr;
1595 else {
1596 inspr = TAILQ_NEXT(deadpr, pr_list);
1597 TAILQ_REMOVE(&allprison, deadpr, pr_list);
1598 if (dinspr != NULL)
1599 TAILQ_INSERT_AFTER(&allprison, dinspr,
1600 deadpr, pr_list);
1601 else
1602 TAILQ_INSERT_HEAD(&allprison, deadpr,
1603 pr_list);
1604 }
1605 }
1606 if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) {
1607 error = EAGAIN;
1608 vfs_opterror(opts, "no available jail IDs");
1609 goto done_deref;
1610 }
1611
1612 pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1613 pr->pr_state = PRISON_STATE_INVALID;
1614 refcount_init(&pr->pr_ref, 1);
1615 refcount_init(&pr->pr_uref, 0);
1616 drflags |= PD_DEREF;
1617 LIST_INIT(&pr->pr_children);
1618 mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1619 TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
1620
1621 pr->pr_id = jid;
1622 if (inspr != NULL)
1623 TAILQ_INSERT_BEFORE(inspr, pr, pr_list);
1624 else
1625 TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1626
1627 pr->pr_parent = ppr;
1628 prison_hold(ppr);
1629 prison_proc_hold(ppr);
1630 LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1631 for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1632 tpr->pr_childcount++;
1633
1634 /* Set some default values, and inherit some from the parent. */
1635 if (namelc == NULL)
1636 namelc = "";
1637 if (path == NULL) {
1638 path = "/";
1639 root = mypr->pr_root;
1640 vref(root);
1641 }
1642 strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1643 pr->pr_flags |= PR_HOST;
1644 #if defined(INET) || defined(INET6)
1645 #ifdef VIMAGE
1646 if (!(pr_flags & PR_VNET))
1647 #endif
1648 {
1649 #ifdef INET
1650 if (!(ch_flags & PR_IP4_USER))
1651 pr->pr_flags |= PR_IP4 | PR_IP4_USER;
1652 else if (!(pr_flags & PR_IP4_USER)) {
1653 pr->pr_flags |= ppr->pr_flags & PR_IP4;
1654 prison_ip_dup(ppr, pr, PR_INET);
1655 }
1656 #endif
1657 #ifdef INET6
1658 if (!(ch_flags & PR_IP6_USER))
1659 pr->pr_flags |= PR_IP6 | PR_IP6_USER;
1660 else if (!(pr_flags & PR_IP6_USER)) {
1661 pr->pr_flags |= ppr->pr_flags & PR_IP6;
1662 prison_ip_dup(ppr, pr, PR_INET6);
1663 }
1664 #endif
1665 }
1666 #endif
1667 /* Source address selection is always on by default. */
1668 pr->pr_flags |= _PR_IP_SADDRSEL;
1669
1670 pr->pr_securelevel = ppr->pr_securelevel;
1671 pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1672 pr->pr_enforce_statfs = jail_default_enforce_statfs;
1673 pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
1674
1675 pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
1676 if (osrelstr == NULL)
1677 strlcpy(pr->pr_osrelease, ppr->pr_osrelease,
1678 sizeof(pr->pr_osrelease));
1679 else
1680 strlcpy(pr->pr_osrelease, osrelstr,
1681 sizeof(pr->pr_osrelease));
1682
1683 #ifdef VIMAGE
1684 /* Allocate a new vnet if specified. */
1685 pr->pr_vnet = (pr_flags & PR_VNET)
1686 ? vnet_alloc() : ppr->pr_vnet;
1687 #endif
1688 /*
1689 * Allocate a dedicated cpuset for each jail.
1690 * Unlike other initial settings, this may return an error.
1691 */
1692 error = cpuset_create_root(ppr, &pr->pr_cpuset);
1693 if (error)
1694 goto done_deref;
1695
1696 mtx_lock(&pr->pr_mtx);
1697 drflags |= PD_LOCKED;
1698 } else {
1699 /*
1700 * Grab a reference for existing prisons, to ensure they
1701 * continue to exist for the duration of the call.
1702 */
1703 prison_hold(pr);
1704 drflags |= PD_DEREF;
1705 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
1706 if ((pr->pr_flags & PR_VNET) &&
1707 (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1708 error = EINVAL;
1709 vfs_opterror(opts,
1710 "vnet jails cannot have IP address restrictions");
1711 goto done_deref;
1712 }
1713 #endif
1714 #ifdef INET
1715 if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1716 error = EINVAL;
1717 vfs_opterror(opts,
1718 "ip4 cannot be changed after creation");
1719 goto done_deref;
1720 }
1721 #endif
1722 #ifdef INET6
1723 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1724 error = EINVAL;
1725 vfs_opterror(opts,
1726 "ip6 cannot be changed after creation");
1727 goto done_deref;
1728 }
1729 #endif
1730 }
1731
1732 /* Do final error checking before setting anything. */
1733 if (gotslevel) {
1734 if (slevel < ppr->pr_securelevel) {
1735 error = EPERM;
1736 goto done_deref;
1737 }
1738 }
1739 if (gotchildmax) {
1740 if (childmax >= ppr->pr_childmax) {
1741 error = EPERM;
1742 goto done_deref;
1743 }
1744 }
1745 if (gotenforce) {
1746 if (enforce < ppr->pr_enforce_statfs) {
1747 error = EPERM;
1748 goto done_deref;
1749 }
1750 }
1751 if (gotrsnum) {
1752 /*
1753 * devfs_rsnum is a uint16_t
1754 */
1755 if (rsnum < 0 || rsnum > 65535) {
1756 error = EINVAL;
1757 goto done_deref;
1758 }
1759 /*
1760 * Nested jails always inherit parent's devfs ruleset
1761 */
1762 if (jailed(td->td_ucred)) {
1763 if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
1764 error = EPERM;
1765 goto done_deref;
1766 } else
1767 rsnum = ppr->pr_devfs_rsnum;
1768 }
1769 }
1770 #ifdef INET
1771 if (ip4s > 0) {
1772 if ((ppr->pr_flags & PR_IP4) &&
1773 !prison_ip_parent_match(ppr->pr_addrs[PR_INET], ip4,
1774 PR_INET)) {
1775 error = EPERM;
1776 goto done_deref;
1777 }
1778 if (!prison_ip_conflict_check(ppr, pr, ip4, PR_INET)) {
1779 error = EADDRINUSE;
1780 vfs_opterror(opts, "IPv4 addresses clash");
1781 goto done_deref;
1782 }
1783 }
1784 #endif
1785 #ifdef INET6
1786 if (ip6s > 0) {
1787 if ((ppr->pr_flags & PR_IP6) &&
1788 !prison_ip_parent_match(ppr->pr_addrs[PR_INET6], ip6,
1789 PR_INET6)) {
1790 error = EPERM;
1791 goto done_deref;
1792 }
1793 if (!prison_ip_conflict_check(ppr, pr, ip6, PR_INET6)) {
1794 error = EADDRINUSE;
1795 vfs_opterror(opts, "IPv6 addresses clash");
1796 goto done_deref;
1797 }
1798 }
1799 #endif
1800 onamelen = namelen = 0;
1801 if (namelc != NULL) {
1802 /* Give a default name of the jid. Also allow the name to be
1803 * explicitly the jid - but not any other number, and only in
1804 * normal form (no leading zero/etc).
1805 */
1806 if (namelc[0] == '\0')
1807 snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
1808 else if ((strtoul(namelc, &p, 10) != jid ||
1809 namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
1810 error = EINVAL;
1811 vfs_opterror(opts,
1812 "name cannot be numeric (unless it is the jid)");
1813 goto done_deref;
1814 }
1815 /*
1816 * Make sure the name isn't too long for the prison or its
1817 * children.
1818 */
1819 pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1820 onamelen = strlen(pr->pr_name + pnamelen);
1821 namelen = strlen(namelc);
1822 if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
1823 error = ENAMETOOLONG;
1824 goto done_deref;
1825 }
1826 FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1827 if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1828 sizeof(pr->pr_name)) {
1829 error = ENAMETOOLONG;
1830 goto done_deref;
1831 }
1832 }
1833 }
1834 pr_allow_diff = pr_allow & ~ppr->pr_allow;
1835 if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) {
1836 error = EPERM;
1837 goto done_deref;
1838 }
1839
1840 /*
1841 * Let modules check their parameters. This requires unlocking and
1842 * then re-locking the prison, but this is still a valid state as long
1843 * as allprison_lock remains xlocked.
1844 */
1845 mtx_unlock(&pr->pr_mtx);
1846 drflags &= ~PD_LOCKED;
1847 error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
1848 if (error != 0)
1849 goto done_deref;
1850 mtx_lock(&pr->pr_mtx);
1851 drflags |= PD_LOCKED;
1852
1853 /* At this point, all valid parameters should have been noted. */
1854 TAILQ_FOREACH(opt, opts, link) {
1855 if (!opt->seen && strcmp(opt->name, "errmsg")) {
1856 error = EINVAL;
1857 vfs_opterror(opts, "unknown parameter: %s", opt->name);
1858 goto done_deref;
1859 }
1860 }
1861
1862 /* Set the parameters of the prison. */
1863 #ifdef INET
1864 redo_ip4 = false;
1865 if (pr_flags & PR_IP4_USER) {
1866 pr->pr_flags |= PR_IP4;
1867 prison_ip_set(pr, PR_INET, ip4);
1868 ip4 = NULL;
1869 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1870 #ifdef VIMAGE
1871 if (tpr->pr_flags & PR_VNET) {
1872 descend = 0;
1873 continue;
1874 }
1875 #endif
1876 if (!prison_ip_restrict(tpr, PR_INET, NULL)) {
1877 redo_ip4 = true;
1878 descend = 0;
1879 }
1880 }
1881 }
1882 #endif
1883 #ifdef INET6
1884 redo_ip6 = false;
1885 if (pr_flags & PR_IP6_USER) {
1886 pr->pr_flags |= PR_IP6;
1887 prison_ip_set(pr, PR_INET6, ip6);
1888 ip6 = NULL;
1889 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1890 #ifdef VIMAGE
1891 if (tpr->pr_flags & PR_VNET) {
1892 descend = 0;
1893 continue;
1894 }
1895 #endif
1896 if (!prison_ip_restrict(tpr, PR_INET6, NULL)) {
1897 redo_ip6 = true;
1898 descend = 0;
1899 }
1900 }
1901 }
1902 #endif
1903 if (gotslevel) {
1904 pr->pr_securelevel = slevel;
1905 /* Set all child jails to be at least this level. */
1906 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1907 if (tpr->pr_securelevel < slevel)
1908 tpr->pr_securelevel = slevel;
1909 }
1910 if (gotchildmax) {
1911 pr->pr_childmax = childmax;
1912 /* Set all child jails to under this limit. */
1913 FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1914 if (tpr->pr_childmax > childmax - level)
1915 tpr->pr_childmax = childmax > level
1916 ? childmax - level : 0;
1917 }
1918 if (gotenforce) {
1919 pr->pr_enforce_statfs = enforce;
1920 /* Pass this restriction on to the children. */
1921 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1922 if (tpr->pr_enforce_statfs < enforce)
1923 tpr->pr_enforce_statfs = enforce;
1924 }
1925 if (gotrsnum) {
1926 pr->pr_devfs_rsnum = rsnum;
1927 /* Pass this restriction on to the children. */
1928 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1929 tpr->pr_devfs_rsnum = rsnum;
1930 }
1931 if (namelc != NULL) {
1932 if (ppr == &prison0)
1933 strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
1934 else
1935 snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1936 ppr->pr_name, namelc);
1937 /* Change this component of child names. */
1938 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1939 bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1940 strlen(tpr->pr_name + onamelen) + 1);
1941 bcopy(pr->pr_name, tpr->pr_name, namelen);
1942 }
1943 }
1944 if (path != NULL) {
1945 /* Try to keep a real-rooted full pathname. */
1946 strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1947 pr->pr_root = root;
1948 root = NULL;
1949 }
1950 if (PR_HOST & ch_flags & ~pr_flags) {
1951 if (pr->pr_flags & PR_HOST) {
1952 /*
1953 * Copy the parent's host info. As with pr_ip4 above,
1954 * the lack of a lock on the parent is not a problem;
1955 * it is always set with allprison_lock at least
1956 * shared, and is held exclusively here.
1957 */
1958 strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1959 sizeof(pr->pr_hostname));
1960 strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1961 sizeof(pr->pr_domainname));
1962 strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1963 sizeof(pr->pr_hostuuid));
1964 pr->pr_hostid = pr->pr_parent->pr_hostid;
1965 }
1966 } else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1967 /* Set this prison, and any descendants without PR_HOST. */
1968 if (host != NULL)
1969 strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1970 if (domain != NULL)
1971 strlcpy(pr->pr_domainname, domain,
1972 sizeof(pr->pr_domainname));
1973 if (uuid != NULL)
1974 strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1975 if (gothid)
1976 pr->pr_hostid = hid;
1977 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1978 if (tpr->pr_flags & PR_HOST)
1979 descend = 0;
1980 else {
1981 if (host != NULL)
1982 strlcpy(tpr->pr_hostname,
1983 pr->pr_hostname,
1984 sizeof(tpr->pr_hostname));
1985 if (domain != NULL)
1986 strlcpy(tpr->pr_domainname,
1987 pr->pr_domainname,
1988 sizeof(tpr->pr_domainname));
1989 if (uuid != NULL)
1990 strlcpy(tpr->pr_hostuuid,
1991 pr->pr_hostuuid,
1992 sizeof(tpr->pr_hostuuid));
1993 if (gothid)
1994 tpr->pr_hostid = hid;
1995 }
1996 }
1997 }
1998 pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1999 if ((tallow = ch_allow & ~pr_allow))
2000 prison_set_allow_locked(pr, tallow, 0);
2001 /*
2002 * Persistent prisons get an extra reference, and prisons losing their
2003 * persist flag lose that reference.
2004 */
2005 if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) {
2006 if (pr_flags & PR_PERSIST) {
2007 prison_hold(pr);
2008 /*
2009 * This may be a new prison's first user reference,
2010 * but wait to call it alive until after OSD calls
2011 * have had a chance to run (and perhaps to fail).
2012 */
2013 refcount_acquire(&pr->pr_uref);
2014 } else {
2015 drflags |= PD_DEUREF;
2016 prison_free_not_last(pr);
2017 }
2018 }
2019 pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
2020 mtx_unlock(&pr->pr_mtx);
2021 drflags &= ~PD_LOCKED;
2022 /*
2023 * Any errors past this point will need to de-persist newly created
2024 * prisons, as well as call remove methods.
2025 */
2026 if (created)
2027 drflags |= PD_KILL;
2028
2029 #ifdef RACCT
2030 if (racct_enable && created)
2031 prison_racct_attach(pr);
2032 #endif
2033
2034 /* Locks may have prevented a complete restriction of child IP
2035 * addresses. If so, allocate some more memory and try again.
2036 */
2037 #ifdef INET
2038 while (redo_ip4) {
2039 ip4s = pr->pr_addrs[PR_INET]->ips;
2040 MPASS(ip4 == NULL);
2041 ip4 = prison_ip_alloc(PR_INET, ip4s, M_WAITOK);
2042 mtx_lock(&pr->pr_mtx);
2043 redo_ip4 = false;
2044 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2045 #ifdef VIMAGE
2046 if (tpr->pr_flags & PR_VNET) {
2047 descend = 0;
2048 continue;
2049 }
2050 #endif
2051 if (!prison_ip_restrict(tpr, PR_INET, &ip4))
2052 redo_ip4 = true;
2053 }
2054 mtx_unlock(&pr->pr_mtx);
2055 }
2056 #endif
2057 #ifdef INET6
2058 while (redo_ip6) {
2059 ip6s = pr->pr_addrs[PR_INET6]->ips;
2060 MPASS(ip6 == NULL);
2061 ip6 = prison_ip_alloc(PR_INET6, ip6s, M_WAITOK);
2062 mtx_lock(&pr->pr_mtx);
2063 redo_ip6 = false;
2064 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2065 #ifdef VIMAGE
2066 if (tpr->pr_flags & PR_VNET) {
2067 descend = 0;
2068 continue;
2069 }
2070 #endif
2071 if (!prison_ip_restrict(tpr, PR_INET6, &ip6))
2072 redo_ip6 = true;
2073 }
2074 mtx_unlock(&pr->pr_mtx);
2075 }
2076 #endif
2077
2078 /* Let the modules do their work. */
2079 if (created) {
2080 error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
2081 if (error)
2082 goto done_deref;
2083 }
2084 error = osd_jail_call(pr, PR_METHOD_SET, opts);
2085 if (error)
2086 goto done_deref;
2087
2088 /*
2089 * A new prison is now ready to be seen; either it has gained a user
2090 * reference via persistence, or is about to gain one via attachment.
2091 */
2092 if (created) {
2093 drflags = prison_lock_xlock(pr, drflags);
2094 pr->pr_state = PRISON_STATE_ALIVE;
2095 }
2096
2097 /* Attach this process to the prison if requested. */
2098 if (flags & JAIL_ATTACH) {
2099 error = do_jail_attach(td, pr,
2100 prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS));
2101 drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED);
2102 if (error) {
2103 vfs_opterror(opts, "attach failed");
2104 goto done_deref;
2105 }
2106 }
2107
2108 #ifdef RACCT
2109 if (racct_enable && !created) {
2110 if (drflags & PD_LOCKED) {
2111 mtx_unlock(&pr->pr_mtx);
2112 drflags &= ~PD_LOCKED;
2113 }
2114 if (drflags & PD_LIST_XLOCKED) {
2115 sx_xunlock(&allprison_lock);
2116 drflags &= ~PD_LIST_XLOCKED;
2117 }
2118 prison_racct_modify(pr);
2119 }
2120 #endif
2121
2122 if (created && pr != &prison0 && (pr->pr_allow & PR_ALLOW_NFSD) != 0 &&
2123 (pr->pr_root->v_vflag & VV_ROOT) == 0)
2124 printf("Warning jail jid=%d: mountd/nfsd requires a separate"
2125 " file system\n", pr->pr_id);
2126
2127 drflags &= ~PD_KILL;
2128 td->td_retval[0] = pr->pr_id;
2129
2130 done_deref:
2131 /* Release any temporary prison holds and/or locks. */
2132 if (pr != NULL)
2133 prison_deref(pr, drflags);
2134 else if (drflags & PD_LIST_SLOCKED)
2135 sx_sunlock(&allprison_lock);
2136 else if (drflags & PD_LIST_XLOCKED)
2137 sx_xunlock(&allprison_lock);
2138 if (root != NULL)
2139 vrele(root);
2140 done_errmsg:
2141 if (error) {
2142 /* Write the error message back to userspace. */
2143 if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
2144 &errmsg_len) == 0 && errmsg_len > 0) {
2145 errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
2146 if (optuio->uio_segflg == UIO_SYSSPACE)
2147 bcopy(errmsg,
2148 optuio->uio_iov[errmsg_pos].iov_base,
2149 errmsg_len);
2150 else
2151 (void)copyout(errmsg,
2152 optuio->uio_iov[errmsg_pos].iov_base,
2153 errmsg_len);
2154 }
2155 }
2156 done_free:
2157 #ifdef INET
2158 prison_ip_free(ip4);
2159 #endif
2160 #ifdef INET6
2161 prison_ip_free(ip6);
2162 #endif
2163 if (g_path != NULL)
2164 free(g_path, M_TEMP);
2165 vfs_freeopts(opts);
2166 return (error);
2167 }
2168
2169 /*
2170 * Find the next available prison ID. Return the ID on success, or zero
2171 * on failure. Also set a pointer to the allprison list entry the prison
2172 * should be inserted before.
2173 */
2174 static int
get_next_prid(struct prison ** insprp)2175 get_next_prid(struct prison **insprp)
2176 {
2177 struct prison *inspr;
2178 int jid, maxid;
2179
2180 jid = lastprid % JAIL_MAX + 1;
2181 if (TAILQ_EMPTY(&allprison) ||
2182 TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) {
2183 /*
2184 * A common case is for all jails to be implicitly numbered,
2185 * which means they'll go on the end of the list, at least
2186 * for the first JAIL_MAX times.
2187 */
2188 inspr = NULL;
2189 } else {
2190 /*
2191 * Take two passes through the allprison list: first starting
2192 * with the proposed jid, then ending with it.
2193 */
2194 for (maxid = JAIL_MAX; maxid != 0; ) {
2195 TAILQ_FOREACH(inspr, &allprison, pr_list) {
2196 if (inspr->pr_id < jid)
2197 continue;
2198 if (inspr->pr_id > jid) {
2199 /* Found an opening. */
2200 maxid = 0;
2201 break;
2202 }
2203 if (++jid > maxid) {
2204 if (lastprid == maxid || lastprid == 0)
2205 {
2206 /*
2207 * The entire legal range
2208 * has been traversed
2209 */
2210 return 0;
2211 }
2212 /* Try again from the start. */
2213 jid = 1;
2214 maxid = lastprid;
2215 break;
2216 }
2217 }
2218 if (inspr == NULL) {
2219 /* Found room at the end of the list. */
2220 break;
2221 }
2222 }
2223 }
2224 *insprp = inspr;
2225 lastprid = jid;
2226 return (jid);
2227 }
2228
2229 /*
2230 * Find the next available ID for a renumbered dead prison. This is the same
2231 * as get_next_prid, but counting backward from the end of the range.
2232 */
2233 static int
get_next_deadid(struct prison ** dinsprp)2234 get_next_deadid(struct prison **dinsprp)
2235 {
2236 struct prison *dinspr;
2237 int deadid, minid;
2238
2239 deadid = lastdeadid ? lastdeadid - 1 : JAIL_MAX;
2240 /*
2241 * Take two reverse passes through the allprison list: first
2242 * starting with the proposed deadid, then ending with it.
2243 */
2244 for (minid = 1; minid != 0; ) {
2245 TAILQ_FOREACH_REVERSE(dinspr, &allprison, prisonlist, pr_list) {
2246 if (dinspr->pr_id > deadid)
2247 continue;
2248 if (dinspr->pr_id < deadid) {
2249 /* Found an opening. */
2250 minid = 0;
2251 break;
2252 }
2253 if (--deadid < minid) {
2254 if (lastdeadid == minid || lastdeadid == 0)
2255 {
2256 /*
2257 * The entire legal range
2258 * has been traversed
2259 */
2260 return 0;
2261 }
2262 /* Try again from the end. */
2263 deadid = JAIL_MAX;
2264 minid = lastdeadid;
2265 break;
2266 }
2267 }
2268 if (dinspr == NULL) {
2269 /* Found room at the beginning of the list. */
2270 break;
2271 }
2272 }
2273 *dinsprp = dinspr;
2274 lastdeadid = deadid;
2275 return (deadid);
2276 }
2277
2278 /*
2279 * struct jail_get_args {
2280 * struct iovec *iovp;
2281 * unsigned int iovcnt;
2282 * int flags;
2283 * };
2284 */
2285 int
sys_jail_get(struct thread * td,struct jail_get_args * uap)2286 sys_jail_get(struct thread *td, struct jail_get_args *uap)
2287 {
2288 struct uio *auio;
2289 int error;
2290
2291 /* Check that we have an even number of iovecs. */
2292 if (uap->iovcnt & 1)
2293 return (EINVAL);
2294
2295 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
2296 if (error)
2297 return (error);
2298 error = kern_jail_get(td, auio, uap->flags);
2299 if (error == 0)
2300 error = copyout(auio->uio_iov, uap->iovp,
2301 uap->iovcnt * sizeof(struct iovec));
2302 freeuio(auio);
2303 return (error);
2304 }
2305
2306 int
kern_jail_get(struct thread * td,struct uio * optuio,int flags)2307 kern_jail_get(struct thread *td, struct uio *optuio, int flags)
2308 {
2309 struct bool_flags *bf;
2310 struct jailsys_flags *jsf;
2311 struct prison *pr, *mypr;
2312 struct vfsopt *opt;
2313 struct vfsoptlist *opts;
2314 char *errmsg, *name;
2315 int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos;
2316 unsigned f;
2317
2318 if (flags & ~JAIL_GET_MASK)
2319 return (EINVAL);
2320
2321 /* Get the parameter list. */
2322 error = vfs_buildopts(optuio, &opts);
2323 if (error)
2324 return (error);
2325 errmsg_pos = vfs_getopt_pos(opts, "errmsg");
2326 mypr = td->td_ucred->cr_prison;
2327 pr = NULL;
2328
2329 /*
2330 * Find the prison specified by one of: lastjid, jid, name.
2331 */
2332 sx_slock(&allprison_lock);
2333 drflags = PD_LIST_SLOCKED;
2334 error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
2335 if (error == 0) {
2336 TAILQ_FOREACH(pr, &allprison, pr_list) {
2337 if (pr->pr_id > jid &&
2338 ((flags & JAIL_DYING) || prison_isalive(pr)) &&
2339 prison_ischild(mypr, pr)) {
2340 mtx_lock(&pr->pr_mtx);
2341 drflags |= PD_LOCKED;
2342 goto found_prison;
2343 }
2344 }
2345 error = ENOENT;
2346 vfs_opterror(opts, "no jail after %d", jid);
2347 goto done;
2348 } else if (error != ENOENT)
2349 goto done;
2350
2351 error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
2352 if (error == 0) {
2353 if (jid != 0) {
2354 pr = prison_find_child(mypr, jid);
2355 if (pr != NULL) {
2356 drflags |= PD_LOCKED;
2357 if (!(prison_isalive(pr) ||
2358 (flags & JAIL_DYING))) {
2359 error = ENOENT;
2360 vfs_opterror(opts, "jail %d is dying",
2361 jid);
2362 goto done;
2363 }
2364 goto found_prison;
2365 }
2366 error = ENOENT;
2367 vfs_opterror(opts, "jail %d not found", jid);
2368 goto done;
2369 }
2370 } else if (error != ENOENT)
2371 goto done;
2372
2373 error = vfs_getopt(opts, "name", (void **)&name, &len);
2374 if (error == 0) {
2375 if (len == 0 || name[len - 1] != '\0') {
2376 error = EINVAL;
2377 goto done;
2378 }
2379 pr = prison_find_name(mypr, name);
2380 if (pr != NULL) {
2381 drflags |= PD_LOCKED;
2382 if (!(prison_isalive(pr) || (flags & JAIL_DYING))) {
2383 error = ENOENT;
2384 vfs_opterror(opts, "jail \"%s\" is dying",
2385 name);
2386 goto done;
2387 }
2388 goto found_prison;
2389 }
2390 error = ENOENT;
2391 vfs_opterror(opts, "jail \"%s\" not found", name);
2392 goto done;
2393 } else if (error != ENOENT)
2394 goto done;
2395
2396 vfs_opterror(opts, "no jail specified");
2397 error = ENOENT;
2398 goto done;
2399
2400 found_prison:
2401 /* Get the parameters of the prison. */
2402 prison_hold(pr);
2403 drflags |= PD_DEREF;
2404 td->td_retval[0] = pr->pr_id;
2405 error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
2406 if (error != 0 && error != ENOENT)
2407 goto done;
2408 i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
2409 error = vfs_setopt(opts, "parent", &i, sizeof(i));
2410 if (error != 0 && error != ENOENT)
2411 goto done;
2412 error = vfs_setopts(opts, "name", prison_name(mypr, pr));
2413 if (error != 0 && error != ENOENT)
2414 goto done;
2415 error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
2416 sizeof(pr->pr_cpuset->cs_id));
2417 if (error != 0 && error != ENOENT)
2418 goto done;
2419 error = vfs_setopts(opts, "path", prison_path(mypr, pr));
2420 if (error != 0 && error != ENOENT)
2421 goto done;
2422 #ifdef INET
2423 error = vfs_setopt_part(opts, "ip4.addr", pr->pr_addrs[PR_INET]->pr_ip,
2424 pr->pr_addrs[PR_INET] ? pr->pr_addrs[PR_INET]->ips *
2425 pr_families[PR_INET].size : 0 );
2426 if (error != 0 && error != ENOENT)
2427 goto done;
2428 #endif
2429 #ifdef INET6
2430 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_addrs[PR_INET6]->pr_ip,
2431 pr->pr_addrs[PR_INET6] ? pr->pr_addrs[PR_INET6]->ips *
2432 pr_families[PR_INET6].size : 0 );
2433 if (error != 0 && error != ENOENT)
2434 goto done;
2435 #endif
2436 error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
2437 sizeof(pr->pr_securelevel));
2438 if (error != 0 && error != ENOENT)
2439 goto done;
2440 error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
2441 sizeof(pr->pr_childcount));
2442 if (error != 0 && error != ENOENT)
2443 goto done;
2444 error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
2445 sizeof(pr->pr_childmax));
2446 if (error != 0 && error != ENOENT)
2447 goto done;
2448 error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
2449 if (error != 0 && error != ENOENT)
2450 goto done;
2451 error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
2452 if (error != 0 && error != ENOENT)
2453 goto done;
2454 error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
2455 if (error != 0 && error != ENOENT)
2456 goto done;
2457 #ifdef COMPAT_FREEBSD32
2458 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
2459 uint32_t hid32 = pr->pr_hostid;
2460
2461 error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
2462 } else
2463 #endif
2464 error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
2465 sizeof(pr->pr_hostid));
2466 if (error != 0 && error != ENOENT)
2467 goto done;
2468 error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
2469 sizeof(pr->pr_enforce_statfs));
2470 if (error != 0 && error != ENOENT)
2471 goto done;
2472 error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
2473 sizeof(pr->pr_devfs_rsnum));
2474 if (error != 0 && error != ENOENT)
2475 goto done;
2476 for (bf = pr_flag_bool;
2477 bf < pr_flag_bool + nitems(pr_flag_bool);
2478 bf++) {
2479 i = (pr->pr_flags & bf->flag) ? 1 : 0;
2480 error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2481 if (error != 0 && error != ENOENT)
2482 goto done;
2483 i = !i;
2484 error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2485 if (error != 0 && error != ENOENT)
2486 goto done;
2487 }
2488 for (jsf = pr_flag_jailsys;
2489 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
2490 jsf++) {
2491 f = pr->pr_flags & (jsf->disable | jsf->new);
2492 i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE
2493 : (f == jsf->new) ? JAIL_SYS_NEW
2494 : JAIL_SYS_INHERIT;
2495 error = vfs_setopt(opts, jsf->name, &i, sizeof(i));
2496 if (error != 0 && error != ENOENT)
2497 goto done;
2498 }
2499 for (bf = pr_flag_allow;
2500 bf < pr_flag_allow + nitems(pr_flag_allow) &&
2501 atomic_load_int(&bf->flag) != 0;
2502 bf++) {
2503 i = (pr->pr_allow & bf->flag) ? 1 : 0;
2504 error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2505 if (error != 0 && error != ENOENT)
2506 goto done;
2507 i = !i;
2508 error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2509 if (error != 0 && error != ENOENT)
2510 goto done;
2511 }
2512 i = !prison_isalive(pr);
2513 error = vfs_setopt(opts, "dying", &i, sizeof(i));
2514 if (error != 0 && error != ENOENT)
2515 goto done;
2516 i = !i;
2517 error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2518 if (error != 0 && error != ENOENT)
2519 goto done;
2520 error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
2521 sizeof(pr->pr_osreldate));
2522 if (error != 0 && error != ENOENT)
2523 goto done;
2524 error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
2525 if (error != 0 && error != ENOENT)
2526 goto done;
2527
2528 /* Get the module parameters. */
2529 mtx_unlock(&pr->pr_mtx);
2530 drflags &= ~PD_LOCKED;
2531 error = osd_jail_call(pr, PR_METHOD_GET, opts);
2532 if (error)
2533 goto done;
2534 prison_deref(pr, drflags);
2535 pr = NULL;
2536 drflags = 0;
2537
2538 /* By now, all parameters should have been noted. */
2539 TAILQ_FOREACH(opt, opts, link) {
2540 if (!opt->seen && strcmp(opt->name, "errmsg")) {
2541 error = EINVAL;
2542 vfs_opterror(opts, "unknown parameter: %s", opt->name);
2543 goto done;
2544 }
2545 }
2546
2547 /* Write the fetched parameters back to userspace. */
2548 error = 0;
2549 TAILQ_FOREACH(opt, opts, link) {
2550 if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2551 pos = 2 * opt->pos + 1;
2552 optuio->uio_iov[pos].iov_len = opt->len;
2553 if (opt->value != NULL) {
2554 if (optuio->uio_segflg == UIO_SYSSPACE) {
2555 bcopy(opt->value,
2556 optuio->uio_iov[pos].iov_base,
2557 opt->len);
2558 } else {
2559 error = copyout(opt->value,
2560 optuio->uio_iov[pos].iov_base,
2561 opt->len);
2562 if (error)
2563 break;
2564 }
2565 }
2566 }
2567 }
2568
2569 done:
2570 /* Release any temporary prison holds and/or locks. */
2571 if (pr != NULL)
2572 prison_deref(pr, drflags);
2573 else if (drflags & PD_LIST_SLOCKED)
2574 sx_sunlock(&allprison_lock);
2575 if (error && errmsg_pos >= 0) {
2576 /* Write the error message back to userspace. */
2577 vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2578 errmsg_pos = 2 * errmsg_pos + 1;
2579 if (errmsg_len > 0) {
2580 if (optuio->uio_segflg == UIO_SYSSPACE)
2581 bcopy(errmsg,
2582 optuio->uio_iov[errmsg_pos].iov_base,
2583 errmsg_len);
2584 else
2585 (void)copyout(errmsg,
2586 optuio->uio_iov[errmsg_pos].iov_base,
2587 errmsg_len);
2588 }
2589 }
2590 vfs_freeopts(opts);
2591 return (error);
2592 }
2593
2594 /*
2595 * struct jail_remove_args {
2596 * int jid;
2597 * };
2598 */
2599 int
sys_jail_remove(struct thread * td,struct jail_remove_args * uap)2600 sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
2601 {
2602 struct prison *pr;
2603 int error;
2604
2605 error = priv_check(td, PRIV_JAIL_REMOVE);
2606 if (error)
2607 return (error);
2608
2609 sx_xlock(&allprison_lock);
2610 pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2611 if (pr == NULL) {
2612 sx_xunlock(&allprison_lock);
2613 return (EINVAL);
2614 }
2615 if (!prison_isalive(pr)) {
2616 /* Silently ignore already-dying prisons. */
2617 mtx_unlock(&pr->pr_mtx);
2618 sx_xunlock(&allprison_lock);
2619 return (0);
2620 }
2621 prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED);
2622 return (0);
2623 }
2624
2625 /*
2626 * struct jail_attach_args {
2627 * int jid;
2628 * };
2629 */
2630 int
sys_jail_attach(struct thread * td,struct jail_attach_args * uap)2631 sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
2632 {
2633 struct prison *pr;
2634 int error;
2635
2636 error = priv_check(td, PRIV_JAIL_ATTACH);
2637 if (error)
2638 return (error);
2639
2640 sx_slock(&allprison_lock);
2641 pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2642 if (pr == NULL) {
2643 sx_sunlock(&allprison_lock);
2644 return (EINVAL);
2645 }
2646
2647 /* Do not allow a process to attach to a prison that is not alive. */
2648 if (!prison_isalive(pr)) {
2649 mtx_unlock(&pr->pr_mtx);
2650 sx_sunlock(&allprison_lock);
2651 return (EINVAL);
2652 }
2653
2654 return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED));
2655 }
2656
2657 static int
do_jail_attach(struct thread * td,struct prison * pr,int drflags)2658 do_jail_attach(struct thread *td, struct prison *pr, int drflags)
2659 {
2660 struct proc *p;
2661 struct ucred *newcred, *oldcred;
2662 int error;
2663
2664 mtx_assert(&pr->pr_mtx, MA_OWNED);
2665 sx_assert(&allprison_lock, SX_LOCKED);
2666 drflags &= PD_LOCK_FLAGS;
2667 /*
2668 * XXX: Note that there is a slight race here if two threads
2669 * in the same privileged process attempt to attach to two
2670 * different jails at the same time. It is important for
2671 * user processes not to do this, or they might end up with
2672 * a process root from one prison, but attached to the jail
2673 * of another.
2674 */
2675 prison_hold(pr);
2676 refcount_acquire(&pr->pr_uref);
2677 drflags |= PD_DEREF | PD_DEUREF;
2678 mtx_unlock(&pr->pr_mtx);
2679 drflags &= ~PD_LOCKED;
2680
2681 /* Let modules do whatever they need to prepare for attaching. */
2682 error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
2683 if (error) {
2684 prison_deref(pr, drflags);
2685 return (error);
2686 }
2687 sx_unlock(&allprison_lock);
2688 drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED);
2689
2690 /*
2691 * Reparent the newly attached process to this jail.
2692 */
2693 p = td->td_proc;
2694 error = cpuset_setproc_update_set(p, pr->pr_cpuset);
2695 if (error)
2696 goto e_revert_osd;
2697
2698 vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
2699 if ((error = change_dir(pr->pr_root, td)) != 0)
2700 goto e_unlock;
2701 #ifdef MAC
2702 if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
2703 goto e_unlock;
2704 #endif
2705 VOP_UNLOCK(pr->pr_root);
2706 if ((error = pwd_chroot_chdir(td, pr->pr_root)))
2707 goto e_revert_osd;
2708
2709 newcred = crget();
2710 PROC_LOCK(p);
2711 oldcred = crcopysafe(p, newcred);
2712 newcred->cr_prison = pr;
2713 proc_set_cred(p, newcred);
2714 setsugid(p);
2715 #ifdef RACCT
2716 racct_proc_ucred_changed(p, oldcred, newcred);
2717 crhold(newcred);
2718 #endif
2719 PROC_UNLOCK(p);
2720 #ifdef RCTL
2721 rctl_proc_ucred_changed(p, newcred);
2722 crfree(newcred);
2723 #endif
2724 prison_proc_relink(oldcred->cr_prison, pr, p);
2725 prison_deref(oldcred->cr_prison, drflags);
2726 crfree(oldcred);
2727
2728 /*
2729 * If the prison was killed while changing credentials, die along
2730 * with it.
2731 */
2732 if (!prison_isalive(pr)) {
2733 PROC_LOCK(p);
2734 kern_psignal(p, SIGKILL);
2735 PROC_UNLOCK(p);
2736 }
2737
2738 return (0);
2739
2740 e_unlock:
2741 VOP_UNLOCK(pr->pr_root);
2742 e_revert_osd:
2743 /* Tell modules this thread is still in its old jail after all. */
2744 sx_slock(&allprison_lock);
2745 drflags |= PD_LIST_SLOCKED;
2746 (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
2747 prison_deref(pr, drflags);
2748 return (error);
2749 }
2750
2751 /*
2752 * Returns a locked prison instance, or NULL on failure.
2753 */
2754 struct prison *
prison_find(int prid)2755 prison_find(int prid)
2756 {
2757 struct prison *pr;
2758
2759 sx_assert(&allprison_lock, SX_LOCKED);
2760 TAILQ_FOREACH(pr, &allprison, pr_list) {
2761 if (pr->pr_id < prid)
2762 continue;
2763 if (pr->pr_id > prid)
2764 break;
2765 KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr));
2766 mtx_lock(&pr->pr_mtx);
2767 return (pr);
2768 }
2769 return (NULL);
2770 }
2771
2772 /*
2773 * Find a prison that is a descendant of mypr. Returns a locked prison or NULL.
2774 */
2775 struct prison *
prison_find_child(struct prison * mypr,int prid)2776 prison_find_child(struct prison *mypr, int prid)
2777 {
2778 struct prison *pr;
2779 int descend;
2780
2781 sx_assert(&allprison_lock, SX_LOCKED);
2782 FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2783 if (pr->pr_id == prid) {
2784 KASSERT(prison_isvalid(pr),
2785 ("Found invalid prison %p", pr));
2786 mtx_lock(&pr->pr_mtx);
2787 return (pr);
2788 }
2789 }
2790 return (NULL);
2791 }
2792
2793 /*
2794 * Look for the name relative to mypr. Returns a locked prison or NULL.
2795 */
2796 struct prison *
prison_find_name(struct prison * mypr,const char * name)2797 prison_find_name(struct prison *mypr, const char *name)
2798 {
2799 struct prison *pr, *deadpr;
2800 size_t mylen;
2801 int descend;
2802
2803 sx_assert(&allprison_lock, SX_LOCKED);
2804 mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2805 deadpr = NULL;
2806 FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2807 if (!strcmp(pr->pr_name + mylen, name)) {
2808 KASSERT(prison_isvalid(pr),
2809 ("Found invalid prison %p", pr));
2810 if (prison_isalive(pr)) {
2811 mtx_lock(&pr->pr_mtx);
2812 return (pr);
2813 }
2814 deadpr = pr;
2815 }
2816 }
2817 /* There was no valid prison - perhaps there was a dying one. */
2818 if (deadpr != NULL)
2819 mtx_lock(&deadpr->pr_mtx);
2820 return (deadpr);
2821 }
2822
2823 /*
2824 * See if a prison has the specific flag set. The prison should be locked,
2825 * unless checking for flags that are only set at jail creation (such as
2826 * PR_IP4 and PR_IP6), or only the single bit is examined, without regard
2827 * to any other prison data.
2828 */
2829 bool
prison_flag(struct ucred * cred,unsigned flag)2830 prison_flag(struct ucred *cred, unsigned flag)
2831 {
2832
2833 return ((cred->cr_prison->pr_flags & flag) != 0);
2834 }
2835
2836 /*
2837 * See if a prison has the specific allow flag set.
2838 * The prison *should* be locked, or only a single bit is examined, without
2839 * regard to any other prison data.
2840 */
2841 bool
prison_allow(struct ucred * cred,unsigned flag)2842 prison_allow(struct ucred *cred, unsigned flag)
2843 {
2844
2845 return ((cred->cr_prison->pr_allow & flag) != 0);
2846 }
2847
/*
 * Take a prison reference by bumping pr_ref.  Holding a prison that has
 * no reference yet is generally an error.  A prison record stays valid
 * while it has at least one reference, and it will not be removed while
 * either the prison mutex or the allprison lock (possibly shared) is held.
 *
 * This variant exists for callers that already have the prison locked;
 * the lock is not needed any more, so it simply forwards to prison_hold().
 */
void
prison_hold_locked(struct prison *pr)
{
	prison_hold(pr);
}
2862
2863 void
prison_hold(struct prison * pr)2864 prison_hold(struct prison *pr)
2865 {
2866 #ifdef INVARIANTS
2867 int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref);
2868
2869 KASSERT(was_valid,
2870 ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
2871 #else
2872 refcount_acquire(&pr->pr_ref);
2873 #endif
2874 }
2875
2876 /*
2877 * Remove a prison reference. If that was the last reference, the
2878 * prison will be removed (at a later time).
2879 */
2880 void
prison_free_locked(struct prison * pr)2881 prison_free_locked(struct prison *pr)
2882 {
2883
2884 mtx_assert(&pr->pr_mtx, MA_OWNED);
2885 /*
2886 * Locking is no longer required, but unlock because the caller
2887 * expects it.
2888 */
2889 mtx_unlock(&pr->pr_mtx);
2890 prison_free(pr);
2891 }
2892
2893 void
prison_free(struct prison * pr)2894 prison_free(struct prison *pr)
2895 {
2896
2897 KASSERT(refcount_load(&pr->pr_ref) > 0,
2898 ("Trying to free dead prison %p (jid=%d).",
2899 pr, pr->pr_id));
2900 if (!refcount_release_if_not_last(&pr->pr_ref)) {
2901 /*
2902 * Don't remove the last reference in this context,
2903 * in case there are locks held.
2904 */
2905 taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2906 }
2907 }
2908
2909 static void
prison_free_not_last(struct prison * pr)2910 prison_free_not_last(struct prison *pr)
2911 {
2912 #ifdef INVARIANTS
2913 int lastref;
2914
2915 KASSERT(refcount_load(&pr->pr_ref) > 0,
2916 ("Trying to free dead prison %p (jid=%d).",
2917 pr, pr->pr_id));
2918 lastref = refcount_release(&pr->pr_ref);
2919 KASSERT(!lastref,
2920 ("prison_free_not_last freed last ref on prison %p (jid=%d).",
2921 pr, pr->pr_id));
2922 #else
2923 refcount_release(&pr->pr_ref);
2924 #endif
2925 }
2926
2927 /*
2928 * Hold a prison for user visibility, by incrementing pr_uref.
2929 * It is generally an error to hold a prison that isn't already
2930 * user-visible, except through the jail system calls. It is also
2931 * an error to hold an invalid prison. A prison record will remain
2932 * alive as long as it has at least one user reference, and will not
2933 * be set to the dying state until the prison mutex and allprison_lock
2934 * are both freed.
2935 */
2936 void
prison_proc_hold(struct prison * pr)2937 prison_proc_hold(struct prison *pr)
2938 {
2939 #ifdef INVARIANTS
2940 int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref);
2941
2942 KASSERT(was_alive,
2943 ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2944 #else
2945 refcount_acquire(&pr->pr_uref);
2946 #endif
2947 }
2948
2949 /*
2950 * Remove a prison user reference. If it was the last reference, the
2951 * prison will be considered "dying", and may be removed once all of
2952 * its references are dropped.
2953 */
2954 void
prison_proc_free(struct prison * pr)2955 prison_proc_free(struct prison *pr)
2956 {
2957
2958 /*
2959 * Locking is only required when releasing the last reference.
2960 * This allows assurance that a locked prison will remain alive
2961 * until it is unlocked.
2962 */
2963 KASSERT(refcount_load(&pr->pr_uref) > 0,
2964 ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2965 if (!refcount_release_if_not_last(&pr->pr_uref)) {
2966 /*
2967 * Don't remove the last user reference in this context,
2968 * which is expected to be a process that is not only locked,
2969 * but also half dead. Add a reference so any calls to
2970 * prison_free() won't re-submit the task.
2971 */
2972 prison_hold(pr);
2973 mtx_lock(&pr->pr_mtx);
2974 KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC),
2975 ("Redundant last reference in prison_proc_free (jid=%d)",
2976 pr->pr_id));
2977 pr->pr_flags |= PR_COMPLETE_PROC;
2978 mtx_unlock(&pr->pr_mtx);
2979 taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2980 }
2981 }
2982
2983 static void
prison_proc_free_not_last(struct prison * pr)2984 prison_proc_free_not_last(struct prison *pr)
2985 {
2986 #ifdef INVARIANTS
2987 int lastref;
2988
2989 KASSERT(refcount_load(&pr->pr_uref) > 0,
2990 ("Trying to free dead prison %p (jid=%d).",
2991 pr, pr->pr_id));
2992 lastref = refcount_release(&pr->pr_uref);
2993 KASSERT(!lastref,
2994 ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).",
2995 pr, pr->pr_id));
2996 #else
2997 refcount_release(&pr->pr_uref);
2998 #endif
2999 }
3000
3001 void
prison_proc_link(struct prison * pr,struct proc * p)3002 prison_proc_link(struct prison *pr, struct proc *p)
3003 {
3004
3005 sx_assert(&allproc_lock, SA_XLOCKED);
3006 LIST_INSERT_HEAD(&pr->pr_proclist, p, p_jaillist);
3007 }
3008
3009 void
prison_proc_unlink(struct prison * pr,struct proc * p)3010 prison_proc_unlink(struct prison *pr, struct proc *p)
3011 {
3012
3013 sx_assert(&allproc_lock, SA_XLOCKED);
3014 LIST_REMOVE(p, p_jaillist);
3015 }
3016
3017 static void
prison_proc_relink(struct prison * opr,struct prison * npr,struct proc * p)3018 prison_proc_relink(struct prison *opr, struct prison *npr, struct proc *p)
3019 {
3020
3021 sx_xlock(&allproc_lock);
3022 prison_proc_unlink(opr, p);
3023 prison_proc_link(npr, p);
3024 sx_xunlock(&allproc_lock);
3025 }
3026
3027 /*
3028 * Complete a call to either prison_free or prison_proc_free.
3029 */
3030 static void
prison_complete(void * context,int pending)3031 prison_complete(void *context, int pending)
3032 {
3033 struct prison *pr = context;
3034 int drflags;
3035
3036 /*
3037 * This could be called to release the last reference, or the last
3038 * user reference (plus the reference held in prison_proc_free).
3039 */
3040 drflags = prison_lock_xlock(pr, PD_DEREF);
3041 if (pr->pr_flags & PR_COMPLETE_PROC) {
3042 pr->pr_flags &= ~PR_COMPLETE_PROC;
3043 drflags |= PD_DEUREF;
3044 }
3045 prison_deref(pr, drflags);
3046 }
3047
3048 static void
prison_kill_processes_cb(struct proc * p,void * arg __unused)3049 prison_kill_processes_cb(struct proc *p, void *arg __unused)
3050 {
3051
3052 kern_psignal(p, SIGKILL);
3053 }
3054
3055 /*
3056 * Note the iteration does not guarantee acting on all processes.
3057 * Most notably there may be fork or jail_attach in progress.
3058 */
3059 void
prison_proc_iterate(struct prison * pr,void (* cb)(struct proc *,void *),void * cbarg)3060 prison_proc_iterate(struct prison *pr, void (*cb)(struct proc *, void *),
3061 void *cbarg)
3062 {
3063 struct prison *ppr;
3064 struct proc *p;
3065
3066 if (atomic_load_int(&pr->pr_childcount) == 0) {
3067 sx_slock(&allproc_lock);
3068 LIST_FOREACH(p, &pr->pr_proclist, p_jaillist) {
3069 if (p->p_state == PRS_NEW)
3070 continue;
3071 PROC_LOCK(p);
3072 cb(p, cbarg);
3073 PROC_UNLOCK(p);
3074 }
3075 sx_sunlock(&allproc_lock);
3076 if (atomic_load_int(&pr->pr_childcount) == 0)
3077 return;
3078 /*
3079 * Some jails popped up during the iteration, fall through to a
3080 * system-wide search.
3081 */
3082 }
3083
3084 sx_slock(&allproc_lock);
3085 FOREACH_PROC_IN_SYSTEM(p) {
3086 PROC_LOCK(p);
3087 if (p->p_state != PRS_NEW && p->p_ucred != NULL) {
3088 for (ppr = p->p_ucred->cr_prison;
3089 ppr != &prison0;
3090 ppr = ppr->pr_parent) {
3091 if (ppr == pr) {
3092 cb(p, cbarg);
3093 break;
3094 }
3095 }
3096 }
3097 PROC_UNLOCK(p);
3098 }
3099 sx_sunlock(&allproc_lock);
3100 }
3101
3102 /*
3103 * Remove a prison reference and/or user reference (usually).
3104 * This assumes context that allows sleeping (for allprison_lock),
3105 * with no non-sleeping locks held, except perhaps the prison itself.
3106 * If there are no more references, release and delist the prison.
3107 * On completion, the prison lock and the allprison lock are both
3108 * unlocked.
3109 */
3110 static void
prison_deref(struct prison * pr,int flags)3111 prison_deref(struct prison *pr, int flags)
3112 {
3113 struct prisonlist freeprison;
3114 struct prison *killpr, *rpr, *ppr, *tpr;
3115
3116 killpr = NULL;
3117 TAILQ_INIT(&freeprison);
3118 /*
3119 * Release this prison as requested, which may cause its parent
3120 * to be released, and then maybe its grandparent, etc.
3121 */
3122 for (;;) {
3123 if (flags & PD_KILL) {
3124 /* Kill the prison and its descendents. */
3125 KASSERT(pr != &prison0,
3126 ("prison_deref trying to kill prison0"));
3127 if (!(flags & PD_DEREF)) {
3128 prison_hold(pr);
3129 flags |= PD_DEREF;
3130 }
3131 flags = prison_lock_xlock(pr, flags);
3132 prison_deref_kill(pr, &freeprison);
3133 }
3134 if (flags & PD_DEUREF) {
3135 /* Drop a user reference. */
3136 KASSERT(refcount_load(&pr->pr_uref) > 0,
3137 ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
3138 pr->pr_id));
3139 if (!refcount_release_if_not_last(&pr->pr_uref)) {
3140 if (!(flags & PD_DEREF)) {
3141 prison_hold(pr);
3142 flags |= PD_DEREF;
3143 }
3144 flags = prison_lock_xlock(pr, flags);
3145 if (refcount_release(&pr->pr_uref) &&
3146 pr->pr_state == PRISON_STATE_ALIVE) {
3147 /*
3148 * When the last user references goes,
3149 * this becomes a dying prison.
3150 */
3151 KASSERT(
3152 refcount_load(&prison0.pr_uref) > 0,
3153 ("prison0 pr_uref=0"));
3154 pr->pr_state = PRISON_STATE_DYING;
3155 mtx_unlock(&pr->pr_mtx);
3156 flags &= ~PD_LOCKED;
3157 prison_cleanup(pr);
3158 }
3159 }
3160 }
3161 if (flags & PD_KILL) {
3162 /*
3163 * Any remaining user references are probably processes
3164 * that need to be killed, either in this prison or its
3165 * descendants.
3166 */
3167 if (refcount_load(&pr->pr_uref) > 0)
3168 killpr = pr;
3169 /* Make sure the parent prison doesn't get killed. */
3170 flags &= ~PD_KILL;
3171 }
3172 if (flags & PD_DEREF) {
3173 /* Drop a reference. */
3174 KASSERT(refcount_load(&pr->pr_ref) > 0,
3175 ("prison_deref PD_DEREF on a dead prison (jid=%d)",
3176 pr->pr_id));
3177 if (!refcount_release_if_not_last(&pr->pr_ref)) {
3178 flags = prison_lock_xlock(pr, flags);
3179 if (refcount_release(&pr->pr_ref)) {
3180 /*
3181 * When the last reference goes,
3182 * unlink the prison and set it aside.
3183 */
3184 KASSERT(
3185 refcount_load(&pr->pr_uref) == 0,
3186 ("prison_deref: last ref, "
3187 "but still has %d urefs (jid=%d)",
3188 pr->pr_uref, pr->pr_id));
3189 KASSERT(
3190 refcount_load(&prison0.pr_ref) != 0,
3191 ("prison0 pr_ref=0"));
3192 pr->pr_state = PRISON_STATE_INVALID;
3193 TAILQ_REMOVE(&allprison, pr, pr_list);
3194 LIST_REMOVE(pr, pr_sibling);
3195 TAILQ_INSERT_TAIL(&freeprison, pr,
3196 pr_list);
3197 for (ppr = pr->pr_parent;
3198 ppr != NULL;
3199 ppr = ppr->pr_parent)
3200 ppr->pr_childcount--;
3201 /*
3202 * Removing a prison frees references
3203 * from its parent.
3204 */
3205 mtx_unlock(&pr->pr_mtx);
3206 flags &= ~PD_LOCKED;
3207 pr = pr->pr_parent;
3208 flags |= PD_DEREF | PD_DEUREF;
3209 continue;
3210 }
3211 }
3212 }
3213 break;
3214 }
3215
3216 /* Release all the prison locks. */
3217 if (flags & PD_LOCKED)
3218 mtx_unlock(&pr->pr_mtx);
3219 if (flags & PD_LIST_SLOCKED)
3220 sx_sunlock(&allprison_lock);
3221 else if (flags & PD_LIST_XLOCKED)
3222 sx_xunlock(&allprison_lock);
3223
3224 /* Kill any processes attached to a killed prison. */
3225 if (killpr != NULL)
3226 prison_proc_iterate(killpr, prison_kill_processes_cb, NULL);
3227
3228 /*
3229 * Finish removing any unreferenced prisons, which couldn't happen
3230 * while allprison_lock was held (to avoid a LOR on vrele).
3231 */
3232 TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) {
3233 #ifdef VIMAGE
3234 if (rpr->pr_vnet != rpr->pr_parent->pr_vnet)
3235 vnet_destroy(rpr->pr_vnet);
3236 #endif
3237 if (rpr->pr_root != NULL)
3238 vrele(rpr->pr_root);
3239 mtx_destroy(&rpr->pr_mtx);
3240 #ifdef INET
3241 prison_ip_free(rpr->pr_addrs[PR_INET]);
3242 #endif
3243 #ifdef INET6
3244 prison_ip_free(rpr->pr_addrs[PR_INET6]);
3245 #endif
3246 if (rpr->pr_cpuset != NULL)
3247 cpuset_rel(rpr->pr_cpuset);
3248 osd_jail_exit(rpr);
3249 #ifdef RACCT
3250 if (racct_enable)
3251 prison_racct_detach(rpr);
3252 #endif
3253 TAILQ_REMOVE(&freeprison, rpr, pr_list);
3254 free(rpr, M_PRISON);
3255 }
3256 }
3257
/*
 * Kill the prison and its descendants.  Mark them as dying, clear the
 * persist flag, and call module remove methods.
 *
 * pr's mutex is dropped near the top and taken again before returning, so
 * the function is entered and left with pr locked.  Descendants that lose
 * their last reference along the way are moved from allprison to the
 * caller's freeprison list.  prison_cleanup() is called for every prison
 * killed here, and it asserts that allprison_lock is exclusively held.
 */
static void
prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
{
	struct prison *cpr, *ppr, *rpr;
	bool descend;

	/*
	 * Unlike the descendants, the target prison can be killed
	 * even if it is currently dying.  This is useful for failed
	 * creation in jail_set(2).
	 */
	KASSERT(refcount_load(&pr->pr_ref) > 0,
	    ("Trying to kill dead prison %p (jid=%d).",
	    pr, pr->pr_id));
	/* Extra user reference on the target; released on the last line. */
	refcount_acquire(&pr->pr_uref);
	pr->pr_state = PRISON_STATE_DYING;
	mtx_unlock(&pr->pr_mtx);

	/* rpr tracks the last descendant set aside for freeing, if any. */
	rpr = NULL;
	FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) {
		/*
		 * NOTE(review): descend appears to be set when a prison is
		 * first reached and clear when the walk comes back to it --
		 * confirm against FOREACH_PRISON_DESCENDANT_PRE_POST.
		 */
		if (descend) {
			if (!prison_isalive(cpr)) {
				/*
				 * Not alive: leave it unmarked and do not
				 * descend (presumably skips its children).
				 */
				descend = false;
				continue;
			}
			/*
			 * Hold the prison and tag it with PR_REMOVE so the
			 * second half of the loop knows it was marked here.
			 */
			prison_hold(cpr);
			prison_proc_hold(cpr);
			mtx_lock(&cpr->pr_mtx);
			cpr->pr_state = PRISON_STATE_DYING;
			cpr->pr_flags |= PR_REMOVE;
			mtx_unlock(&cpr->pr_mtx);
			continue;
		}
		/* Only finish off the prisons that were tagged above. */
		if (!(cpr->pr_flags & PR_REMOVE))
			continue;
		/* prison_cleanup() asserts that cpr's mutex is not owned. */
		prison_cleanup(cpr);
		mtx_lock(&cpr->pr_mtx);
		cpr->pr_flags &= ~PR_REMOVE;
		if (cpr->pr_flags & PR_PERSIST) {
			cpr->pr_flags &= ~PR_PERSIST;
			prison_proc_free_not_last(cpr);
			prison_free_not_last(cpr);
		}
		(void)refcount_release(&cpr->pr_uref);
		if (refcount_release(&cpr->pr_ref)) {
			/*
			 * When the last reference goes, unlink the prison
			 * and set it aside for prison_deref() to handle.
			 * Delay unlinking the sibling list to keep the loop
			 * safe.
			 */
			if (rpr != NULL)
				LIST_REMOVE(rpr, pr_sibling);
			rpr = cpr;
			rpr->pr_state = PRISON_STATE_INVALID;
			TAILQ_REMOVE(&allprison, rpr, pr_list);
			TAILQ_INSERT_TAIL(freeprison, rpr, pr_list);
			/*
			 * Removing a prison frees references from its parent.
			 */
			ppr = rpr->pr_parent;
			prison_proc_free_not_last(ppr);
			prison_free_not_last(ppr);
			/* Every ancestor has one descendant fewer. */
			for (; ppr != NULL; ppr = ppr->pr_parent)
				ppr->pr_childcount--;
		}
		mtx_unlock(&cpr->pr_mtx);
	}
	/* The loop is over, so the deferred sibling unlink can happen now. */
	if (rpr != NULL)
		LIST_REMOVE(rpr, pr_sibling);

	/* Finally give the target itself the same treatment. */
	prison_cleanup(pr);
	mtx_lock(&pr->pr_mtx);
	if (pr->pr_flags & PR_PERSIST) {
		pr->pr_flags &= ~PR_PERSIST;
		prison_proc_free_not_last(pr);
		prison_free_not_last(pr);
	}
	(void)refcount_release(&pr->pr_uref);
}
3342
3343 /*
3344 * Given the current locking state in the flags, make sure allprison_lock
3345 * is held exclusive, and the prison is locked. Return flags indicating
3346 * the new state.
3347 */
3348 static int
prison_lock_xlock(struct prison * pr,int flags)3349 prison_lock_xlock(struct prison *pr, int flags)
3350 {
3351
3352 if (!(flags & PD_LIST_XLOCKED)) {
3353 /*
3354 * Get allprison_lock, which may be an upgrade,
3355 * and may require unlocking the prison.
3356 */
3357 if (flags & PD_LOCKED) {
3358 mtx_unlock(&pr->pr_mtx);
3359 flags &= ~PD_LOCKED;
3360 }
3361 if (flags & PD_LIST_SLOCKED) {
3362 if (!sx_try_upgrade(&allprison_lock)) {
3363 sx_sunlock(&allprison_lock);
3364 sx_xlock(&allprison_lock);
3365 }
3366 flags &= ~PD_LIST_SLOCKED;
3367 } else
3368 sx_xlock(&allprison_lock);
3369 flags |= PD_LIST_XLOCKED;
3370 }
3371 if (!(flags & PD_LOCKED)) {
3372 /* Lock the prison mutex. */
3373 mtx_lock(&pr->pr_mtx);
3374 flags |= PD_LOCKED;
3375 }
3376 return flags;
3377 }
3378
/*
 * Release a prison's resources when it starts dying (when the last user
 * reference is dropped, or when it is killed).
 *
 * Must be called with allprison_lock held exclusively and without the
 * prison's own mutex; both conditions are asserted below.
 */
static void
prison_cleanup(struct prison *pr)
{
	sx_assert(&allprison_lock, SA_XLOCKED);
	mtx_assert(&pr->pr_mtx, MA_NOTOWNED);
	vfs_exjail_delete(pr);
	shm_remove_prison(pr);
	/* Run the PR_METHOD_REMOVE OSD methods; the result is ignored. */
	(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
}
3392
3393 /*
3394 * Set or clear a permission bit in the pr_allow field, passing restrictions
3395 * (cleared permission) down to child jails.
3396 */
3397 void
prison_set_allow(struct ucred * cred,unsigned flag,int enable)3398 prison_set_allow(struct ucred *cred, unsigned flag, int enable)
3399 {
3400 struct prison *pr;
3401
3402 pr = cred->cr_prison;
3403 sx_slock(&allprison_lock);
3404 mtx_lock(&pr->pr_mtx);
3405 prison_set_allow_locked(pr, flag, enable);
3406 mtx_unlock(&pr->pr_mtx);
3407 sx_sunlock(&allprison_lock);
3408 }
3409
3410 static void
prison_set_allow_locked(struct prison * pr,unsigned flag,int enable)3411 prison_set_allow_locked(struct prison *pr, unsigned flag, int enable)
3412 {
3413 struct prison *cpr;
3414 int descend;
3415
3416 if (enable != 0)
3417 pr->pr_allow |= flag;
3418 else {
3419 pr->pr_allow &= ~flag;
3420 FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
3421 cpr->pr_allow &= ~flag;
3422 }
3423 }
3424
3425 /*
3426 * Check if a jail supports the given address family.
3427 *
3428 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3429 * if not.
3430 */
3431 int
prison_check_af(struct ucred * cred,int af)3432 prison_check_af(struct ucred *cred, int af)
3433 {
3434 struct prison *pr;
3435 int error;
3436
3437 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3438
3439 pr = cred->cr_prison;
3440 #ifdef VIMAGE
3441 /* Prisons with their own network stack are not limited. */
3442 if (prison_owns_vnet(cred))
3443 return (0);
3444 #endif
3445
3446 error = 0;
3447 switch (af)
3448 {
3449 #ifdef INET
3450 case AF_INET:
3451 if (pr->pr_flags & PR_IP4)
3452 {
3453 mtx_lock(&pr->pr_mtx);
3454 if ((pr->pr_flags & PR_IP4) &&
3455 pr->pr_addrs[PR_INET] == NULL)
3456 error = EAFNOSUPPORT;
3457 mtx_unlock(&pr->pr_mtx);
3458 }
3459 break;
3460 #endif
3461 #ifdef INET6
3462 case AF_INET6:
3463 if (pr->pr_flags & PR_IP6)
3464 {
3465 mtx_lock(&pr->pr_mtx);
3466 if ((pr->pr_flags & PR_IP6) &&
3467 pr->pr_addrs[PR_INET6] == NULL)
3468 error = EAFNOSUPPORT;
3469 mtx_unlock(&pr->pr_mtx);
3470 }
3471 break;
3472 #endif
3473 case AF_LOCAL:
3474 case AF_ROUTE:
3475 case AF_NETLINK:
3476 break;
3477 default:
3478 if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3479 error = EAFNOSUPPORT;
3480 }
3481 return (error);
3482 }
3483
3484 /*
3485 * Check if given address belongs to the jail referenced by cred (wrapper to
3486 * prison_check_ip[46]).
3487 *
3488 * Returns 0 if jail doesn't restrict the address family or if address belongs
3489 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3490 * the jail doesn't allow the address family. IPv4 Address passed in in NBO.
3491 */
3492 int
prison_if(struct ucred * cred,const struct sockaddr * sa)3493 prison_if(struct ucred *cred, const struct sockaddr *sa)
3494 {
3495 #ifdef INET
3496 const struct sockaddr_in *sai;
3497 #endif
3498 #ifdef INET6
3499 const struct sockaddr_in6 *sai6;
3500 #endif
3501 int error;
3502
3503 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3504 KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3505
3506 #ifdef VIMAGE
3507 if (prison_owns_vnet(cred))
3508 return (0);
3509 #endif
3510
3511 error = 0;
3512 switch (sa->sa_family)
3513 {
3514 #ifdef INET
3515 case AF_INET:
3516 sai = (const struct sockaddr_in *)sa;
3517 error = prison_check_ip4(cred, &sai->sin_addr);
3518 break;
3519 #endif
3520 #ifdef INET6
3521 case AF_INET6:
3522 sai6 = (const struct sockaddr_in6 *)sa;
3523 error = prison_check_ip6(cred, &sai6->sin6_addr);
3524 break;
3525 #endif
3526 default:
3527 if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3528 error = EAFNOSUPPORT;
3529 }
3530 return (error);
3531 }
3532
3533 /*
3534 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3535 */
3536 int
prison_check(struct ucred * cred1,struct ucred * cred2)3537 prison_check(struct ucred *cred1, struct ucred *cred2)
3538 {
3539
3540 return ((cred1->cr_prison == cred2->cr_prison ||
3541 prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3542 }
3543
3544 /*
3545 * For mountd/nfsd to run within a prison, it must be:
3546 * - A vnet prison.
3547 * - PR_ALLOW_NFSD must be set on it.
3548 * - The root directory (pr_root) of the prison must be
3549 * a file system mount point, so the mountd can hang
3550 * export information on it.
3551 * - The prison's enforce_statfs cannot be 0, so that
3552 * mountd(8) can do exports.
3553 */
3554 bool
prison_check_nfsd(struct ucred * cred)3555 prison_check_nfsd(struct ucred *cred)
3556 {
3557
3558 if (jailed_without_vnet(cred))
3559 return (false);
3560 if (!prison_allow(cred, PR_ALLOW_NFSD))
3561 return (false);
3562 if ((cred->cr_prison->pr_root->v_vflag & VV_ROOT) == 0)
3563 return (false);
3564 if (cred->cr_prison->pr_enforce_statfs == 0)
3565 return (false);
3566 return (true);
3567 }
3568
3569 /*
3570 * Return true if p2 is a child of p1, otherwise false.
3571 */
3572 bool
prison_ischild(struct prison * pr1,struct prison * pr2)3573 prison_ischild(struct prison *pr1, struct prison *pr2)
3574 {
3575
3576 for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3577 if (pr1 == pr2)
3578 return (true);
3579 return (false);
3580 }
3581
3582 /*
3583 * Return true if the prison is currently alive. A prison is alive if it
3584 * holds user references and it isn't being removed.
3585 */
3586 bool
prison_isalive(const struct prison * pr)3587 prison_isalive(const struct prison *pr)
3588 {
3589
3590 if (__predict_false(pr->pr_state != PRISON_STATE_ALIVE))
3591 return (false);
3592 return (true);
3593 }
3594
3595 /*
3596 * Return true if the prison is currently valid. A prison is valid if it has
3597 * been fully created, and is not being destroyed. Note that dying prisons
3598 * are still considered valid. Invalid prisons won't be found under normal
3599 * circumstances, as they're only put in that state by functions that have
3600 * an exclusive hold on allprison_lock.
3601 */
3602 bool
prison_isvalid(struct prison * pr)3603 prison_isvalid(struct prison *pr)
3604 {
3605
3606 if (__predict_false(pr->pr_state == PRISON_STATE_INVALID))
3607 return (false);
3608 if (__predict_false(refcount_load(&pr->pr_ref) == 0))
3609 return (false);
3610 return (true);
3611 }
3612
/*
 * Return true when cred is jailed and, on VIMAGE kernels, its prison does
 * not own a virtual network stack; otherwise false.
 */
bool
jailed_without_vnet(struct ucred *cred)
{
	bool restricted;

	restricted = jailed(cred);
#ifdef VIMAGE
	/* Only a jailed credential is asked about its vnet. */
	if (restricted && prison_owns_vnet(cred))
		restricted = false;
#endif

	return (restricted);
}
3630
3631 /*
3632 * Return the correct hostname (domainname, et al) for the passed credential.
3633 */
3634 void
getcredhostname(struct ucred * cred,char * buf,size_t size)3635 getcredhostname(struct ucred *cred, char *buf, size_t size)
3636 {
3637 struct prison *pr;
3638
3639 /*
3640 * A NULL credential can be used to shortcut to the physical
3641 * system's hostname.
3642 */
3643 pr = (cred != NULL) ? cred->cr_prison : &prison0;
3644 mtx_lock(&pr->pr_mtx);
3645 strlcpy(buf, pr->pr_hostname, size);
3646 mtx_unlock(&pr->pr_mtx);
3647 }
3648
3649 void
getcreddomainname(struct ucred * cred,char * buf,size_t size)3650 getcreddomainname(struct ucred *cred, char *buf, size_t size)
3651 {
3652
3653 mtx_lock(&cred->cr_prison->pr_mtx);
3654 strlcpy(buf, cred->cr_prison->pr_domainname, size);
3655 mtx_unlock(&cred->cr_prison->pr_mtx);
3656 }
3657
3658 void
getcredhostuuid(struct ucred * cred,char * buf,size_t size)3659 getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3660 {
3661
3662 mtx_lock(&cred->cr_prison->pr_mtx);
3663 strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3664 mtx_unlock(&cred->cr_prison->pr_mtx);
3665 }
3666
3667 void
getcredhostid(struct ucred * cred,unsigned long * hostid)3668 getcredhostid(struct ucred *cred, unsigned long *hostid)
3669 {
3670
3671 mtx_lock(&cred->cr_prison->pr_mtx);
3672 *hostid = cred->cr_prison->pr_hostid;
3673 mtx_unlock(&cred->cr_prison->pr_mtx);
3674 }
3675
3676 void
getjailname(struct ucred * cred,char * name,size_t len)3677 getjailname(struct ucred *cred, char *name, size_t len)
3678 {
3679
3680 mtx_lock(&cred->cr_prison->pr_mtx);
3681 strlcpy(name, cred->cr_prison->pr_name, len);
3682 mtx_unlock(&cred->cr_prison->pr_mtx);
3683 }
3684
#ifdef VIMAGE
/*
 * Report whether the prison that cred belongs to has a vnet of its own
 * (PR_VNET set) as opposed to an inherited one.
 *
 * Returns true if the prison owns the vnet, false otherwise.
 */
bool
prison_owns_vnet(struct ucred *cred)
{

	/*
	 * No locking: a vnet cannot be added to or removed from a jail
	 * after it has been created.
	 */
	if ((cred->cr_prison->pr_flags & PR_VNET) == 0)
		return (false);
	return (true);
}
#endif
3703
3704 /*
3705 * Determine whether the subject represented by cred can "see"
3706 * status of a mount point.
3707 * Returns: 0 for permitted, ENOENT otherwise.
3708 * XXX: This function should be called cr_canseemount() and should be
3709 * placed in kern_prot.c.
3710 */
3711 int
prison_canseemount(struct ucred * cred,struct mount * mp)3712 prison_canseemount(struct ucred *cred, struct mount *mp)
3713 {
3714 struct prison *pr;
3715 struct statfs *sp;
3716 size_t len;
3717
3718 pr = cred->cr_prison;
3719 if (pr->pr_enforce_statfs == 0)
3720 return (0);
3721 if (pr->pr_root->v_mount == mp)
3722 return (0);
3723 if (pr->pr_enforce_statfs == 2)
3724 return (ENOENT);
3725 /*
3726 * If jail's chroot directory is set to "/" we should be able to see
3727 * all mount-points from inside a jail.
3728 * This is ugly check, but this is the only situation when jail's
3729 * directory ends with '/'.
3730 */
3731 if (strcmp(pr->pr_path, "/") == 0)
3732 return (0);
3733 len = strlen(pr->pr_path);
3734 sp = &mp->mnt_stat;
3735 if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3736 return (ENOENT);
3737 /*
3738 * Be sure that we don't have situation where jail's root directory
3739 * is "/some/path" and mount point is "/some/pathpath".
3740 */
3741 if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3742 return (ENOENT);
3743 return (0);
3744 }
3745
3746 void
prison_enforce_statfs(struct ucred * cred,struct mount * mp,struct statfs * sp)3747 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3748 {
3749 char jpath[MAXPATHLEN];
3750 struct prison *pr;
3751 size_t len;
3752
3753 pr = cred->cr_prison;
3754 if (pr->pr_enforce_statfs == 0)
3755 return;
3756 if (prison_canseemount(cred, mp) != 0) {
3757 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3758 strlcpy(sp->f_mntonname, "[restricted]",
3759 sizeof(sp->f_mntonname));
3760 return;
3761 }
3762 if (pr->pr_root->v_mount == mp) {
3763 /*
3764 * Clear current buffer data, so we are sure nothing from
3765 * the valid path left there.
3766 */
3767 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3768 *sp->f_mntonname = '/';
3769 return;
3770 }
3771 /*
3772 * If jail's chroot directory is set to "/" we should be able to see
3773 * all mount-points from inside a jail.
3774 */
3775 if (strcmp(pr->pr_path, "/") == 0)
3776 return;
3777 len = strlen(pr->pr_path);
3778 strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3779 /*
3780 * Clear current buffer data, so we are sure nothing from
3781 * the valid path left there.
3782 */
3783 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3784 if (*jpath == '\0') {
3785 /* Should never happen. */
3786 *sp->f_mntonname = '/';
3787 } else {
3788 strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3789 }
3790 }
3791
/*
 * Check whether permission for a specific privilege is granted within a
 * jail.  We have a specific list of accepted privileges; the rest are
 * denied.
 *
 * Returns 0 when cred is not jailed or the privilege is granted, and EPERM
 * otherwise.  On VIMAGE kernels a first switch grants the network-related
 * privileges to prisons with PR_VNET set; everything that is not granted
 * there is decided by the second switch.
 */
int
prison_priv_check(struct ucred *cred, int priv)
{
	struct prison *pr;
	int error;

	/*
	 * Some policies have custom handlers. This routine should not be
	 * called for them. See priv_check_cred().
	 */
	switch (priv) {
	case PRIV_VFS_LOOKUP:
	case PRIV_VFS_GENERATION:
		KASSERT(0, ("prison_priv_check instead of a custom handler "
		    "called for %d\n", priv));
	}

	if (!jailed(cred))
		return (0);

#ifdef VIMAGE
	/*
	 * Privileges specific to prisons with a virtual network stack.
	 * There might be a duplicate entry here in case the privilege
	 * is only granted conditionally in the legacy jail case.
	 */
	switch (priv) {
	/*
	 * NFS-specific privileges.
	 */
	case PRIV_NFS_DAEMON:
	case PRIV_VFS_GETFH:
	case PRIV_VFS_MOUNT_EXPORTED:
		if (!prison_check_nfsd(cred))
			return (EPERM);
		/*
		 * No break: when the check passes, control continues into
		 * the labels below and reaches the PR_VNET test at the end
		 * of this switch.
		 */
#ifdef notyet
	case PRIV_NFS_LOCKD:
#endif
	/*
	 * Network stack privileges.
	 */
	case PRIV_NET_BRIDGE:
	case PRIV_NET_GRE:
	case PRIV_NET_BPF:
	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
	case PRIV_NET_ROUTE:
	case PRIV_NET_TAP:
	case PRIV_NET_SETIFMTU:
	case PRIV_NET_SETIFFLAGS:
	case PRIV_NET_SETIFCAP:
	case PRIV_NET_SETIFDESCR:
	case PRIV_NET_SETIFNAME	:
	case PRIV_NET_SETIFMETRIC:
	case PRIV_NET_SETIFPHYS:
	case PRIV_NET_SETIFMAC:
	case PRIV_NET_SETLANPCP:
	case PRIV_NET_ADDMULTI:
	case PRIV_NET_DELMULTI:
	case PRIV_NET_HWIOCTL:
	case PRIV_NET_SETLLADDR:
	case PRIV_NET_ADDIFGROUP:
	case PRIV_NET_DELIFGROUP:
	case PRIV_NET_IFCREATE:
	case PRIV_NET_IFDESTROY:
	case PRIV_NET_ADDIFADDR:
	case PRIV_NET_DELIFADDR:
	case PRIV_NET_LAGG:
	case PRIV_NET_GIF:
	case PRIV_NET_SETIFVNET:
	case PRIV_NET_SETIFFIB:
	case PRIV_NET_OVPN:
	case PRIV_NET_ME:
	case PRIV_NET_WG:

	/*
	 * 802.11-related privileges.
	 */
	case PRIV_NET80211_VAP_GETKEY:
	case PRIV_NET80211_VAP_MANAGE:

#ifdef notyet
	/*
	 * ATM privileges.
	 */
	case PRIV_NETATM_CFG:
	case PRIV_NETATM_ADD:
	case PRIV_NETATM_DEL:
	case PRIV_NETATM_SET:

	/*
	 * Bluetooth privileges.
	 */
	case PRIV_NETBLUETOOTH_RAW:
#endif

	/*
	 * Netgraph and netgraph module privileges.
	 */
	case PRIV_NETGRAPH_CONTROL:
#ifdef notyet
	case PRIV_NETGRAPH_TTY:
#endif

	/*
	 * IPv4 and IPv6 privileges.
	 */
	case PRIV_NETINET_IPFW:
	case PRIV_NETINET_DIVERT:
	case PRIV_NETINET_PF:
	case PRIV_NETINET_DUMMYNET:
	case PRIV_NETINET_CARP:
	case PRIV_NETINET_MROUTE:
	case PRIV_NETINET_RAW:
	case PRIV_NETINET_ADDRCTRL6:
	case PRIV_NETINET_ND6:
	case PRIV_NETINET_SCOPE6:
	case PRIV_NETINET_ALIFETIME6:
	case PRIV_NETINET_IPSEC:
	case PRIV_NETINET_BINDANY:

#ifdef notyet
	/*
	 * NCP privileges.
	 */
	case PRIV_NETNCP:

	/*
	 * SMB privileges.
	 */
	case PRIV_NETSMB:
#endif

	/*
	 * No default: or deny here.
	 * In case of no permit fall through to next switch().
	 */
		if (cred->cr_prison->pr_flags & PR_VNET)
			return (0);
	}
#endif /* VIMAGE */

	switch (priv) {
	/*
	 * Allow ktrace privileges for root in jail.
	 */
	case PRIV_KTRACE:

#if 0
	/*
	 * Allow jailed processes to configure audit identity and
	 * submit audit records (login, etc).  In the future we may
	 * want to further refine the relationship between audit and
	 * jail.
	 */
	case PRIV_AUDIT_GETAUDIT:
	case PRIV_AUDIT_SETAUDIT:
	case PRIV_AUDIT_SUBMIT:
#endif

	/*
	 * Allow jailed processes to manipulate process UNIX
	 * credentials in any way they see fit.
	 */
	case PRIV_CRED_SETUID:
	case PRIV_CRED_SETEUID:
	case PRIV_CRED_SETGID:
	case PRIV_CRED_SETEGID:
	case PRIV_CRED_SETGROUPS:
	case PRIV_CRED_SETREUID:
	case PRIV_CRED_SETREGID:
	case PRIV_CRED_SETRESUID:
	case PRIV_CRED_SETRESGID:

	/*
	 * Jail implements visibility constraints already, so allow
	 * jailed root to override uid/gid-based constraints.
	 */
	case PRIV_SEEOTHERGIDS:
	case PRIV_SEEOTHERUIDS:
	case PRIV_SEEJAILPROC:

	/*
	 * Jail implements inter-process debugging limits already, so
	 * allow jailed root various debugging privileges.
	 */
	case PRIV_DEBUG_DIFFCRED:
	case PRIV_DEBUG_SUGID:
	case PRIV_DEBUG_UNPRIV:

	/*
	 * Allow jail to set various resource limits and login
	 * properties, and for now, exceed process resource limits.
	 */
	case PRIV_PROC_LIMIT:
	case PRIV_PROC_SETLOGIN:
	case PRIV_PROC_SETRLIMIT:

	/*
	 * System V and POSIX IPC privileges are granted in jail.
	 */
	case PRIV_IPC_READ:
	case PRIV_IPC_WRITE:
	case PRIV_IPC_ADMIN:
	case PRIV_IPC_MSGSIZE:
	case PRIV_MQ_ADMIN:

	/*
	 * Jail operations within a jail work on child jails.
	 */
	case PRIV_JAIL_ATTACH:
	case PRIV_JAIL_SET:
	case PRIV_JAIL_REMOVE:

	/*
	 * Jail implements its own inter-process limits, so allow
	 * root processes in jail to change scheduling on other
	 * processes in the same jail.  Likewise for signalling.
	 */
	case PRIV_SCHED_DIFFCRED:
	case PRIV_SCHED_CPUSET:
	case PRIV_SIGNAL_DIFFCRED:
	case PRIV_SIGNAL_SUGID:

	/*
	 * Allow jailed processes to write to sysctls marked as jail
	 * writable.
	 */
	case PRIV_SYSCTL_WRITEJAIL:

	/*
	 * Allow root in jail to manage a variety of quota
	 * properties.  These should likely be conditional on a
	 * configuration option.
	 */
	case PRIV_VFS_GETQUOTA:
	case PRIV_VFS_SETQUOTA:

	/*
	 * Since Jail relies on chroot() to implement file system
	 * protections, grant many VFS privileges to root in jail.
	 * Be careful to exclude mount-related and NFS-related
	 * privileges.
	 */
	case PRIV_VFS_READ:
	case PRIV_VFS_WRITE:
	case PRIV_VFS_ADMIN:
	case PRIV_VFS_EXEC:
	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
	case PRIV_VFS_CHFLAGS_DEV:
	case PRIV_VFS_CHOWN:
	case PRIV_VFS_CHROOT:
	case PRIV_VFS_RETAINSUGID:
	case PRIV_VFS_FCHROOT:
	case PRIV_VFS_LINK:
	case PRIV_VFS_SETGID:
	case PRIV_VFS_STAT:
	case PRIV_VFS_STICKYFILE:

	/*
	 * As in the non-jail case, non-root users are expected to be
	 * able to read kernel/physical memory (provided /dev/[k]mem
	 * exists in the jail and they have permission to access it).
	 */
	case PRIV_KMEM_READ:
		/* Every label from PRIV_KTRACE down to here is granted. */
		return (0);

	/*
	 * Depending on the global setting, allow privilege of
	 * setting system flags.
	 */
	case PRIV_VFS_SYSFLAGS:
		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
			return (0);
		else
			return (EPERM);

	/*
	 * Depending on the global setting, allow privilege of
	 * mounting/unmounting file systems.
	 */
	case PRIV_VFS_MOUNT:
	case PRIV_VFS_UNMOUNT:
	case PRIV_VFS_MOUNT_NONUSER:
	case PRIV_VFS_MOUNT_OWNER:
		pr = cred->cr_prison;
		/* Both fields are read together under the prison lock. */
		prison_lock(pr);
		if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2)
			error = 0;
		else
			error = EPERM;
		prison_unlock(pr);
		return (error);

	/*
	 * Jails should hold no disposition on the PRIV_VFS_READ_DIR
	 * policy.  priv_check_cred will not specifically allow it, and
	 * we may want a MAC policy to allow it.
	 */
	case PRIV_VFS_READ_DIR:
		return (0);

	/*
	 * Conditionally allow privileged process in the jail to
	 * manipulate filesystem extended attributes in the system
	 * namespace.
	 */
	case PRIV_VFS_EXTATTR_SYSTEM:
		if ((cred->cr_prison->pr_allow & PR_ALLOW_EXTATTR) != 0)
			return (0);
		else
			return (EPERM);

	/*
	 * Conditionally allow locking (unlocking) physical pages
	 * in memory.
	 */
	case PRIV_VM_MLOCK:
	case PRIV_VM_MUNLOCK:
		if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK)
			return (0);
		else
			return (EPERM);

	/*
	 * Conditionally allow jailed root to bind reserved ports.
	 */
	case PRIV_NETINET_RESERVEDPORT:
		if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS)
			return (0);
		else
			return (EPERM);

	/*
	 * Allow jailed root to reuse in-use ports.
	 */
	case PRIV_NETINET_REUSEPORT:
		return (0);

	/*
	 * Allow jailed root to set certain IPv4/6 (option) headers.
	 */
	case PRIV_NETINET_SETHDROPTS:
		return (0);

	/*
	 * Conditionally allow creating raw sockets in jail.
	 */
	case PRIV_NETINET_RAW:
		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
			return (0);
		else
			return (EPERM);

	/*
	 * Since jail implements its own visibility limits on netstat
	 * sysctls, allow getcred.  This allows identd to work in
	 * jail.
	 */
	case PRIV_NETINET_GETCRED:
		return (0);

	/*
	 * Allow jailed root to set loginclass.
	 */
	case PRIV_PROC_SETLOGINCLASS:
		return (0);

	/*
	 * Do not allow a process inside a jail to read the kernel
	 * message buffer unless explicitly permitted.
	 */
	case PRIV_MSGBUF:
		if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF)
			return (0);
		return (EPERM);

	/*
	 * Conditionally allow privileged process in the jail to adjust
	 * machine time.  Either PR_ALLOW_ADJTIME or PR_ALLOW_SETTIME is
	 * sufficient.
	 */
	case PRIV_ADJTIME:
	case PRIV_NTP_ADJTIME:
		if (cred->cr_prison->pr_allow &
		    (PR_ALLOW_ADJTIME | PR_ALLOW_SETTIME)) {
			return (0);
		}
		return (EPERM);

	/*
	 * Conditionally allow privileged process in the jail to set
	 * machine time.
	 */
	case PRIV_CLOCK_SETTIME:
		if (cred->cr_prison->pr_allow & PR_ALLOW_SETTIME)
			return (0);
		else
			return (EPERM);

	default:
		/*
		 * In all remaining cases, deny the privilege request.  This
		 * includes almost all network privileges, many system
		 * configuration privileges.
		 */
		return (EPERM);
	}
}
4203
4204 /*
4205 * Return the part of pr2's name that is relative to pr1, or the whole name
4206 * if it does not directly follow.
4207 */
4208
4209 char *
prison_name(struct prison * pr1,struct prison * pr2)4210 prison_name(struct prison *pr1, struct prison *pr2)
4211 {
4212 char *name;
4213
4214 /* Jails see themselves as "0" (if they see themselves at all). */
4215 if (pr1 == pr2)
4216 return "0";
4217 name = pr2->pr_name;
4218 if (prison_ischild(pr1, pr2)) {
4219 /*
4220 * pr1 isn't locked (and allprison_lock may not be either)
4221 * so its length can't be counted on. But the number of dots
4222 * can be counted on - and counted.
4223 */
4224 for (; pr1 != &prison0; pr1 = pr1->pr_parent)
4225 name = strchr(name, '.') + 1;
4226 }
4227 return (name);
4228 }
4229
4230 /*
4231 * Return the part of pr2's path that is relative to pr1, or the whole path
4232 * if it does not directly follow.
4233 */
4234 static char *
prison_path(struct prison * pr1,struct prison * pr2)4235 prison_path(struct prison *pr1, struct prison *pr2)
4236 {
4237 char *path1, *path2;
4238 int len1;
4239
4240 path1 = pr1->pr_path;
4241 path2 = pr2->pr_path;
4242 if (!strcmp(path1, "/"))
4243 return (path2);
4244 len1 = strlen(path1);
4245 if (strncmp(path1, path2, len1))
4246 return (path2);
4247 if (path2[len1] == '\0')
4248 return "/";
4249 if (path2[len1] == '/')
4250 return (path2 + len1);
4251 return (path2);
4252 }
4253
/*
 * Jail-related sysctls.
 *
 * This declares the "jail" node under _security; the SYSCTL_* declarations
 * in this file that name _security_jail as their parent hang off it.
 */
static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Jails");
4259
#if defined(INET) || defined(INET6)
/*
 * sysctl_jail_list() helper: copy the prison's address array for family
 * af into the buffer at *out, which is later handed to SYSCTL_OUT.
 *
 * *len is the buffer's capacity in addresses.  When it is too small it is
 * raised to the prison's address count and the buffer is reallocated with
 * the prison mutex dropped, after which the check starts over.  The mutex
 * must be held on entry and is held on return.  Nothing is copied when the
 * prison has no address list for af.
 */
static void
prison_ip_copyout(struct prison *pr, const pr_family_t af, void **out, int *len)
{
	const struct prison_ip *pip;
	const size_t size = pr_families[af].size;

	for (;;) {
		mtx_assert(&pr->pr_mtx, MA_OWNED);
		pip = pr->pr_addrs[af];
		if (pip == NULL)
			return;
		if (*len >= pip->ips)
			break;
		/* Grow unlocked (M_WAITOK), then look at the list afresh. */
		*len = pip->ips;
		mtx_unlock(&pr->pr_mtx);
		*out = realloc(*out, *len * size, M_TEMP, M_WAITOK);
		mtx_lock(&pr->pr_mtx);
	}
	bcopy(pip->pr_ip, *out, pip->ips * size);
}
#endif
4285
4286 static int
sysctl_jail_list(SYSCTL_HANDLER_ARGS)4287 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
4288 {
4289 struct xprison *xp;
4290 struct prison *pr, *cpr;
4291 #ifdef INET
4292 struct in_addr *ip4 = NULL;
4293 int ip4s = 0;
4294 #endif
4295 #ifdef INET6
4296 struct in6_addr *ip6 = NULL;
4297 int ip6s = 0;
4298 #endif
4299 int descend, error;
4300
4301 xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
4302 pr = req->td->td_ucred->cr_prison;
4303 error = 0;
4304 sx_slock(&allprison_lock);
4305 FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
4306 mtx_lock(&cpr->pr_mtx);
4307 #ifdef INET
4308 prison_ip_copyout(cpr, PR_INET, (void **)&ip4, &ip4s);
4309 #endif
4310 #ifdef INET6
4311 prison_ip_copyout(cpr, PR_INET6, (void **)&ip6, &ip6s);
4312 #endif
4313 bzero(xp, sizeof(*xp));
4314 xp->pr_version = XPRISON_VERSION;
4315 xp->pr_id = cpr->pr_id;
4316 xp->pr_state = cpr->pr_state;
4317 strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
4318 strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
4319 strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
4320 #ifdef INET
4321 xp->pr_ip4s = ip4s;
4322 #endif
4323 #ifdef INET6
4324 xp->pr_ip6s = ip6s;
4325 #endif
4326 mtx_unlock(&cpr->pr_mtx);
4327 error = SYSCTL_OUT(req, xp, sizeof(*xp));
4328 if (error)
4329 break;
4330 #ifdef INET
4331 if (xp->pr_ip4s > 0) {
4332 error = SYSCTL_OUT(req, ip4,
4333 xp->pr_ip4s * sizeof(struct in_addr));
4334 if (error)
4335 break;
4336 }
4337 #endif
4338 #ifdef INET6
4339 if (xp->pr_ip6s > 0) {
4340 error = SYSCTL_OUT(req, ip6,
4341 xp->pr_ip6s * sizeof(struct in6_addr));
4342 if (error)
4343 break;
4344 }
4345 #endif
4346 }
4347 sx_sunlock(&allprison_lock);
4348 free(xp, M_TEMP);
4349 #ifdef INET
4350 free(ip4, M_TEMP);
4351 #endif
4352 #ifdef INET6
4353 free(ip6, M_TEMP);
4354 #endif
4355 return (error);
4356 }
4357
4358 SYSCTL_OID(_security_jail, OID_AUTO, list,
4359 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4360 sysctl_jail_list, "S", "List of active jails");
4361
4362 static int
sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)4363 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
4364 {
4365 int error, injail;
4366
4367 injail = jailed(req->td->td_ucred);
4368 error = SYSCTL_OUT(req, &injail, sizeof(injail));
4369
4370 return (error);
4371 }
4372
4373 SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
4374 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4375 sysctl_jail_jailed, "I", "Process in jail?");
4376
4377 static int
sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)4378 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
4379 {
4380 int error, havevnet;
4381 #ifdef VIMAGE
4382 struct ucred *cred = req->td->td_ucred;
4383
4384 havevnet = jailed(cred) && prison_owns_vnet(cred);
4385 #else
4386 havevnet = 0;
4387 #endif
4388 error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
4389
4390 return (error);
4391 }
4392
4393 SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
4394 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4395 sysctl_jail_vnet, "I", "Jail owns vnet?");
4396
#if defined(INET) || defined(INET6)
/*
 * Read/write sysctl exposing jail_max_af_ips.  Only built when at least
 * one of INET or INET6 is configured.
 */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
    &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family (deprecated)");
#endif
4402
4403 /*
4404 * Default parameters for jail(2) compatibility. For historical reasons,
4405 * the sysctl names have varying similarity to the parameter names. Prisons
4406 * just see their own parameters, and can't change them.
4407 */
4408 static int
sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)4409 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
4410 {
4411 int error, i;
4412
4413 /* Get the current flag value, and convert it to a boolean. */
4414 if (req->td->td_ucred->cr_prison == &prison0) {
4415 mtx_lock(&prison0.pr_mtx);
4416 i = (jail_default_allow & arg2) != 0;
4417 mtx_unlock(&prison0.pr_mtx);
4418 } else
4419 i = prison_allow(req->td->td_ucred, arg2);
4420
4421 if (arg1 != NULL)
4422 i = !i;
4423 error = sysctl_handle_int(oidp, &i, 0, req);
4424 if (error || !req->newptr)
4425 return (error);
4426 i = i ? arg2 : 0;
4427 if (arg1 != NULL)
4428 i ^= arg2;
4429 /*
4430 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
4431 * for writing.
4432 */
4433 mtx_lock(&prison0.pr_mtx);
4434 jail_default_allow = (jail_default_allow & ~arg2) | i;
4435 mtx_unlock(&prison0.pr_mtx);
4436 return (0);
4437 }
4438
/*
 * Legacy security.jail.* permission knobs.  Every entry is served by
 * sysctl_jail_default_allow(), with arg2 naming the PR_ALLOW_* bit that the
 * knob reads and writes.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
    "Processes in jail can set their hostnames (deprecated)");
/*
 * The non-NULL arg1 makes the handler present the inverse of the bit: this
 * knob reads 1 while PR_ALLOW_SOCKET_AF is clear.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
    "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
    "Processes in jail can use System V IPC primitives (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
    "Prison root can create raw sockets (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
    "Processes in jail can alter system file flags (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
    "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
/*
 * NOTE(review): unlike its siblings above, this description does not carry
 * the "(deprecated)" suffix -- confirm whether that is intentional.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, mlock_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MLOCK, sysctl_jail_default_allow, "I",
    "Processes in jail can lock/unlock physical pages in memory");
4467
4468 static int
sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)4469 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
4470 {
4471 struct prison *pr;
4472 int level, error;
4473
4474 pr = req->td->td_ucred->cr_prison;
4475 level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4476 error = sysctl_handle_int(oidp, &level, 0, req);
4477 if (error || !req->newptr)
4478 return (error);
4479 *(int *)arg1 = level;
4480 return (0);
4481 }
4482
/*
 * Integer defaults served by sysctl_jail_default_level(): arg1 is the
 * address of the global default and arg2 the offset of the corresponding
 * field in struct prison.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
    sysctl_jail_default_level, "I",
    "Processes in jail cannot see all mounted file systems (deprecated)");

/* Declared read-only (CTLFLAG_RD), unlike enforce_statfs above. */
SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
    &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
    sysctl_jail_default_level, "I",
    "Ruleset for the devfs filesystem in jail (deprecated)");

/* Parent node for the security.jail.children.* entries. */
SYSCTL_NODE(_security_jail, OID_AUTO, children, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Limits and stats of child jails");
4497
4498 static int
sysctl_jail_children(SYSCTL_HANDLER_ARGS)4499 sysctl_jail_children(SYSCTL_HANDLER_ARGS)
4500 {
4501 struct prison *pr;
4502 int i;
4503
4504 pr = req->td->td_ucred->cr_prison;
4505
4506 switch (oidp->oid_kind & CTLTYPE) {
4507 case CTLTYPE_INT:
4508 i = *(int *)((char *)pr + arg2);
4509 return (SYSCTL_OUT(req, &i, sizeof(i)));
4510 }
4511
4512 return (0);
4513 }
4514
/*
 * Read-only child-jail counters.  arg2 is the offset of the struct prison
 * field that sysctl_jail_children() copies out for the calling prison.
 */
SYSCTL_PROC(_security_jail_children, OID_AUTO, max,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, offsetof(struct prison, pr_childmax), sysctl_jail_children,
    "I", "Maximum number of child jails");
SYSCTL_PROC(_security_jail_children, OID_AUTO, cur,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, offsetof(struct prison, pr_childcount), sysctl_jail_children,
    "I", "Current number of child jails");
4523
/*
 * Nodes to describe jail parameters.  These sysctls do not carry jail
 * state: the maximum length of a string parameter is returned in the
 * string itself, and the other parameters exist merely to make themselves
 * and their types known (see sysctl_jail_param()).
 */
SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Jail parameters");
4531
4532 int
sysctl_jail_param(SYSCTL_HANDLER_ARGS)4533 sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4534 {
4535 int i;
4536 long l;
4537 size_t s;
4538 char numbuf[12];
4539
4540 switch (oidp->oid_kind & CTLTYPE)
4541 {
4542 case CTLTYPE_LONG:
4543 case CTLTYPE_ULONG:
4544 l = 0;
4545 #ifdef SCTL_MASK32
4546 if (!(req->flags & SCTL_MASK32))
4547 #endif
4548 return (SYSCTL_OUT(req, &l, sizeof(l)));
4549 case CTLTYPE_INT:
4550 case CTLTYPE_UINT:
4551 i = 0;
4552 return (SYSCTL_OUT(req, &i, sizeof(i)));
4553 case CTLTYPE_STRING:
4554 snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
4555 return
4556 (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4557 case CTLTYPE_STRUCT:
4558 s = (size_t)arg2;
4559 return (SYSCTL_OUT(req, &s, sizeof(s)));
4560 }
4561 return (0);
4562 }
4563
/*
 * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
 * jail creation time but cannot be changed in an existing jail.
 */
/* Parameters declared directly under the parameter root (empty first arg). */
SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail secure level");
SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
    "Jail value for kern.osreldate and uname -K");
SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
    "Jail value for kern.osrelease and uname -r");
SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail cannot see all mounted file systems");
SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Ruleset for in-jail devfs mounts");
SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail persistence");
/* Only declared when the kernel is built with VIMAGE. */
#ifdef VIMAGE
SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
    "E,jailsys", "Virtual network stack");
#endif
SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
    "B", "Jail is in the process of shutting down");

/* Parameters under the "children" node. */
SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
    "I", "Current number of child jails");
SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Maximum number of child jails");

/* Parameters under the "host" node. */
SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail hostname");
SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail NIS domainname");
SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
    "Jail host UUID");
SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
    "LU", "Jail host ID");

/* Parameters under the "cpuset" node. */
SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
4609
/*
 * Per-address-family parameters; each family is declared only when the
 * kernel is built with the matching option.  The addr entries are struct
 * parameters whose declared size is that of a single address.
 */
#ifdef INET
SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
    "Jail IPv4 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
    "S,in_addr,a", "Jail IPv4 addresses");
SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv4 source address selection rather than the "
    "primary jail IPv4 address.");
#endif
#ifdef INET6
SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
    "Jail IPv6 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
    "S,in6_addr,a", "Jail IPv6 addresses");
SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv6 source address selection rather than the "
    "primary jail IPv6 address.");
#endif
4628
/*
 * Statically declared parameters under the "allow" node.  Further allow.*
 * parameters can be added at run time by prison_add_allow().
 */
SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set hostname");
SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may use SYSV IPC");
SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create raw sockets");
SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may alter system file flags");
SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set file quotas");
SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may lock (unlock) physical pages in memory");
SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may bind sockets to reserved ports");
SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may read the kernel message buffer");
SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Unprivileged processes may use process debugging facilities");
SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Processes in jail with uid 0 have privilege");
/* Only declared when the kernel is built with VIMAGE. */
#ifdef VIMAGE
SYSCTL_JAIL_PARAM(_allow, nfsd, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Mountd/nfsd may run in the jail");
#endif
SYSCTL_JAIL_PARAM(_allow, extattr, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set system-level filesystem extended attributes");
SYSCTL_JAIL_PARAM(_allow, adjtime, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may adjust system time");
SYSCTL_JAIL_PARAM(_allow, settime, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set system time");

/*
 * The "mount" subnode of "allow".  The entry with an empty name is
 * declared against the subnode itself.
 */
SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount/unmount jail-friendly file systems in general");
4666
/*
 * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>.  Return
 * its associated bit in the pr_allow bitmask, or zero if the parameter was
 * not created.
 *
 * prefix       - optional subnode name under "allow"; NULL for none.
 * name         - parameter name.
 * prefix_descr - description used for the prefix sysctl node.
 * descr        - description used for the parameter's sysctls.
 *
 * If a parameter of the same name is already recorded in pr_flag_allow,
 * its existing bit is returned and nothing new is registered.
 */
unsigned
prison_add_allow(const char *prefix, const char *name, const char *prefix_descr,
    const char *descr)
{
	struct bool_flags *bf;
	struct sysctl_oid *parent;
	char *allow_name, *allow_noname, *allowed;
#ifndef NO_SYSCTL_DESCR
	char *descr_deprecated;
#endif
	u_int allow_flag;

	/*
	 * Build the positive and negated parameter names.  The second
	 * asprintf() is attempted only if the first succeeded, so only
	 * allow_name is released on failure.
	 * NOTE(review): this relies on asprintf() leaving a pointer that is
	 * safe to free() when it fails -- confirm against its contract.
	 */
	if (prefix
	    ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name)
		< 0 ||
	      asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name)
		< 0
	    : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 ||
	      asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) {
		free(allow_name, M_PRISON);
		return 0;
	}

	/*
	 * See if this parameter has already been added, i.e. a module was
	 * previously loaded/unloaded.
	 */
	mtx_lock(&prison0.pr_mtx);
	/* Only entries with a non-zero flag are in use; stop at the first free one. */
	for (bf = pr_flag_allow;
	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
		atomic_load_int(&bf->flag) != 0;
	     bf++) {
		if (strcmp(bf->name, allow_name) == 0) {
			/* Reuse the existing bit; the new strings are freed at no_add. */
			allow_flag = bf->flag;
			goto no_add;
		}
	}

	/*
	 * Find a free bit in pr_allow_all, failing if there are none
	 * (which shouldn't happen as long as we keep track of how many
	 * potential dynamic flags exist).
	 */
	for (allow_flag = 1;; allow_flag <<= 1) {
		/* The bit was shifted out of the word: every bit is taken. */
		if (allow_flag == 0)
			goto no_add;
		if ((pr_allow_all & allow_flag) == 0)
			break;
	}

	/* Note the parameter in the next open slot in pr_flag_allow. */
	for (bf = pr_flag_allow; ; bf++) {
		if (bf == pr_flag_allow + nitems(pr_flag_allow)) {
			/* This should never happen, but is not fatal. */
			allow_flag = 0;
			goto no_add;
		}
		if (atomic_load_int(&bf->flag) == 0)
			break;
	}
	/* The table entry takes ownership of both name strings. */
	bf->name = allow_name;
	bf->noname = allow_noname;
	pr_allow_all |= allow_flag;
	/*
	 * prison0 always has permission for the new parameter.
	 * Other jails must have it granted to them.
	 */
	prison0.pr_allow |= allow_flag;
	/* The flag indicates a valid entry, so make sure it is set last. */
	atomic_store_rel_int(&bf->flag, allow_flag);
	mtx_unlock(&prison0.pr_mtx);

	/*
	 * Create sysctls for the parameter, and the back-compat global
	 * permission.
	 */
	parent = prefix
	    ? SYSCTL_ADD_NODE(NULL,
		  SYSCTL_CHILDREN(&sysctl___security_jail_param_allow),
		  OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr)
	    : &sysctl___security_jail_param_allow;
	(void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
	    name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    NULL, 0, sysctl_jail_param, "B", descr);
	/*
	 * The legacy knob is named <prefix>_<name>_allowed or <name>_allowed
	 * and is quietly skipped if that name cannot be allocated.
	 */
	if ((prefix
	     ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name)
	     : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) {
#ifndef NO_SYSCTL_DESCR
		(void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)",
		    descr);
#endif
		(void)SYSCTL_ADD_PROC(NULL,
		    SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed,
		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag,
		    sysctl_jail_default_allow, "I", descr_deprecated);
#ifndef NO_SYSCTL_DESCR
		free(descr_deprecated, M_TEMP);
#endif
		free(allowed, M_TEMP);
	}
	return allow_flag;

	/* Nothing registered: drop the lock and the names built above. */
 no_add:
	mtx_unlock(&prison0.pr_mtx);
	free(allow_name, M_PRISON);
	free(allow_noname, M_PRISON);
	return allow_flag;
}
4780
4781 /*
4782 * The VFS system will register jail-aware filesystems here. They each get
4783 * a parameter allow.mount.xxxfs and a flag to check when a jailed user
4784 * attempts to mount.
4785 */
4786 void
prison_add_vfs(struct vfsconf * vfsp)4787 prison_add_vfs(struct vfsconf *vfsp)
4788 {
4789 #ifdef NO_SYSCTL_DESCR
4790
4791 vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
4792 NULL, NULL);
4793 #else
4794 char *descr;
4795
4796 (void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system",
4797 vfsp->vfc_name);
4798 vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
4799 NULL, descr);
4800 free(descr, M_TEMP);
4801 #endif
4802 }
4803
4804 #ifdef RACCT
4805 void
prison_racct_foreach(void (* callback)(struct racct * racct,void * arg2,void * arg3),void (* pre)(void),void (* post)(void),void * arg2,void * arg3)4806 prison_racct_foreach(void (*callback)(struct racct *racct,
4807 void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
4808 void *arg2, void *arg3)
4809 {
4810 struct prison_racct *prr;
4811
4812 ASSERT_RACCT_ENABLED();
4813
4814 sx_slock(&allprison_lock);
4815 if (pre != NULL)
4816 (pre)();
4817 LIST_FOREACH(prr, &allprison_racct, prr_next)
4818 (callback)(prr->prr_racct, arg2, arg3);
4819 if (post != NULL)
4820 (post)();
4821 sx_sunlock(&allprison_lock);
4822 }
4823
4824 static struct prison_racct *
prison_racct_find_locked(const char * name)4825 prison_racct_find_locked(const char *name)
4826 {
4827 struct prison_racct *prr;
4828
4829 ASSERT_RACCT_ENABLED();
4830 sx_assert(&allprison_lock, SA_XLOCKED);
4831
4832 if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
4833 return (NULL);
4834
4835 LIST_FOREACH(prr, &allprison_racct, prr_next) {
4836 if (strcmp(name, prr->prr_name) != 0)
4837 continue;
4838
4839 /* Found prison_racct with a matching name? */
4840 prison_racct_hold(prr);
4841 return (prr);
4842 }
4843
4844 /* Add new prison_racct. */
4845 prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
4846 racct_create(&prr->prr_racct);
4847
4848 strcpy(prr->prr_name, name);
4849 refcount_init(&prr->prr_refcount, 1);
4850 LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
4851
4852 return (prr);
4853 }
4854
/*
 * Locking wrapper around prison_racct_find_locked(): takes allprison_lock
 * exclusively for the duration of the lookup and returns its result.
 */
struct prison_racct *
prison_racct_find(const char *name)
{
	struct prison_racct *prr;

	ASSERT_RACCT_ENABLED();

	sx_xlock(&allprison_lock);
	prr = prison_racct_find_locked(name);
	sx_xunlock(&allprison_lock);
	return (prr);
}
4867
/*
 * Take an additional reference on a prison_racct.
 */
void
prison_racct_hold(struct prison_racct *prr)
{

	ASSERT_RACCT_ENABLED();

	refcount_acquire(&prr->prr_refcount);
}
4876
4877 static void
prison_racct_free_locked(struct prison_racct * prr)4878 prison_racct_free_locked(struct prison_racct *prr)
4879 {
4880
4881 ASSERT_RACCT_ENABLED();
4882 sx_assert(&allprison_lock, SA_XLOCKED);
4883
4884 if (refcount_release(&prr->prr_refcount)) {
4885 racct_destroy(&prr->prr_racct);
4886 LIST_REMOVE(prr, prr_next);
4887 free(prr, M_PRISON_RACCT);
4888 }
4889 }
4890
4891 void
prison_racct_free(struct prison_racct * prr)4892 prison_racct_free(struct prison_racct *prr)
4893 {
4894
4895 ASSERT_RACCT_ENABLED();
4896 sx_assert(&allprison_lock, SA_UNLOCKED);
4897
4898 if (refcount_release_if_not_last(&prr->prr_refcount))
4899 return;
4900
4901 sx_xlock(&allprison_lock);
4902 prison_racct_free_locked(prr);
4903 sx_xunlock(&allprison_lock);
4904 }
4905
/*
 * Point pr->pr_prison_racct at the prison_racct matching pr->pr_name,
 * as returned by prison_racct_find_locked().  The caller must hold
 * allprison_lock exclusively.
 */
static void
prison_racct_attach(struct prison *pr)
{
	struct prison_racct *prr;

	ASSERT_RACCT_ENABLED();
	sx_assert(&allprison_lock, SA_XLOCKED);

	prr = prison_racct_find_locked(pr->pr_name);
	/* The lookup returns NULL only for an empty or over-long name. */
	KASSERT(prr != NULL, ("cannot find prison_racct"));

	pr->pr_prison_racct = prr;
}
4919
/*
 * Handle jail renaming.  From the racct point of view, renaming means
 * moving from one prison_racct to another.
 *
 * Does nothing if pr->pr_name still matches the name of the attached
 * prison_racct.  Otherwise attaches the prison_racct for the new name,
 * hands both raccts to racct_move(), and releases the reference on the
 * old one.
 */
static void
prison_racct_modify(struct prison *pr)
{
#ifdef RCTL
	struct proc *p;
	struct ucred *cred;
#endif
	struct prison_racct *oldprr;

	ASSERT_RACCT_ENABLED();

	/* allproc_lock (shared) is taken before allprison_lock (exclusive). */
	sx_slock(&allproc_lock);
	sx_xlock(&allprison_lock);

	/* Name unchanged: nothing to move. */
	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
		sx_xunlock(&allprison_lock);
		sx_sunlock(&allproc_lock);
		return;
	}

	/* Remember the old entry; its reference is dropped at the end. */
	oldprr = pr->pr_prison_racct;
	pr->pr_prison_racct = NULL;

	prison_racct_attach(pr);

	/*
	 * Move resource utilisation records.
	 */
	racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);

#ifdef RCTL
	/*
	 * Force rctl to reattach rules to processes.
	 */
	FOREACH_PROC_IN_SYSTEM(p) {
		/* Hold the credential so it can be used after PROC_UNLOCK(). */
		PROC_LOCK(p);
		cred = crhold(p->p_ucred);
		PROC_UNLOCK(p);
		rctl_proc_ucred_changed(p, cred);
		crfree(cred);
	}
#endif

	sx_sunlock(&allproc_lock);
	/* allprison_lock is still held exclusively here, as required. */
	prison_racct_free_locked(oldprr);
	sx_xunlock(&allprison_lock);
}
4971
4972 static void
prison_racct_detach(struct prison * pr)4973 prison_racct_detach(struct prison *pr)
4974 {
4975
4976 ASSERT_RACCT_ENABLED();
4977 sx_assert(&allprison_lock, SA_UNLOCKED);
4978
4979 if (pr->pr_prison_racct == NULL)
4980 return;
4981 prison_racct_free(pr->pr_prison_racct);
4982 pr->pr_prison_racct = NULL;
4983 }
4984 #endif /* RACCT */
4985
4986 #ifdef DDB
4987
4988 static void
db_show_prison(struct prison * pr)4989 db_show_prison(struct prison *pr)
4990 {
4991 struct bool_flags *bf;
4992 struct jailsys_flags *jsf;
4993 #if defined(INET) || defined(INET6)
4994 int ii;
4995 struct prison_ip *pip;
4996 #endif
4997 unsigned f;
4998 #ifdef INET
4999 char ip4buf[INET_ADDRSTRLEN];
5000 #endif
5001 #ifdef INET6
5002 char ip6buf[INET6_ADDRSTRLEN];
5003 #endif
5004
5005 db_printf("prison %p:\n", pr);
5006 db_printf(" jid = %d\n", pr->pr_id);
5007 db_printf(" name = %s\n", pr->pr_name);
5008 db_printf(" parent = %p\n", pr->pr_parent);
5009 db_printf(" ref = %d\n", pr->pr_ref);
5010 db_printf(" uref = %d\n", pr->pr_uref);
5011 db_printf(" state = %s\n",
5012 pr->pr_state == PRISON_STATE_ALIVE ? "alive" :
5013 pr->pr_state == PRISON_STATE_DYING ? "dying" :
5014 "invalid");
5015 db_printf(" path = %s\n", pr->pr_path);
5016 db_printf(" cpuset = %d\n", pr->pr_cpuset
5017 ? pr->pr_cpuset->cs_id : -1);
5018 #ifdef VIMAGE
5019 db_printf(" vnet = %p\n", pr->pr_vnet);
5020 #endif
5021 db_printf(" root = %p\n", pr->pr_root);
5022 db_printf(" securelevel = %d\n", pr->pr_securelevel);
5023 db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum);
5024 db_printf(" children.max = %d\n", pr->pr_childmax);
5025 db_printf(" children.cur = %d\n", pr->pr_childcount);
5026 db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children));
5027 db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling));
5028 db_printf(" flags = 0x%x", pr->pr_flags);
5029 for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++)
5030 if (pr->pr_flags & bf->flag)
5031 db_printf(" %s", bf->name);
5032 for (jsf = pr_flag_jailsys;
5033 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
5034 jsf++) {
5035 f = pr->pr_flags & (jsf->disable | jsf->new);
5036 db_printf(" %-16s= %s\n", jsf->name,
5037 (f != 0 && f == jsf->disable) ? "disable"
5038 : (f == jsf->new) ? "new"
5039 : "inherit");
5040 }
5041 db_printf(" allow = 0x%x", pr->pr_allow);
5042 for (bf = pr_flag_allow;
5043 bf < pr_flag_allow + nitems(pr_flag_allow) &&
5044 atomic_load_int(&bf->flag) != 0;
5045 bf++)
5046 if (pr->pr_allow & bf->flag)
5047 db_printf(" %s", bf->name);
5048 db_printf("\n");
5049 db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs);
5050 db_printf(" host.hostname = %s\n", pr->pr_hostname);
5051 db_printf(" host.domainname = %s\n", pr->pr_domainname);
5052 db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid);
5053 db_printf(" host.hostid = %lu\n", pr->pr_hostid);
5054 #ifdef INET
5055 if ((pip = pr->pr_addrs[PR_INET]) != NULL) {
5056 db_printf(" ip4s = %d\n", pip->ips);
5057 for (ii = 0; ii < pip->ips; ii++)
5058 db_printf(" %s %s\n",
5059 ii == 0 ? "ip4.addr =" : " ",
5060 inet_ntoa_r(
5061 *(const struct in_addr *)PR_IP(pip, PR_INET, ii),
5062 ip4buf));
5063 }
5064 #endif
5065 #ifdef INET6
5066 if ((pip = pr->pr_addrs[PR_INET6]) != NULL) {
5067 db_printf(" ip6s = %d\n", pip->ips);
5068 for (ii = 0; ii < pip->ips; ii++)
5069 db_printf(" %s %s\n",
5070 ii == 0 ? "ip6.addr =" : " ",
5071 ip6_sprintf(ip6buf,
5072 (const struct in6_addr *)PR_IP(pip, PR_INET6, ii)));
5073 }
5074 #endif
5075 }
5076
DB_SHOW_COMMAND(prison,db_show_prison_command)5077 DB_SHOW_COMMAND(prison, db_show_prison_command)
5078 {
5079 struct prison *pr;
5080
5081 if (!have_addr) {
5082 /*
5083 * Show all prisons in the list, and prison0 which is not
5084 * listed.
5085 */
5086 db_show_prison(&prison0);
5087 if (!db_pager_quit) {
5088 TAILQ_FOREACH(pr, &allprison, pr_list) {
5089 db_show_prison(pr);
5090 if (db_pager_quit)
5091 break;
5092 }
5093 }
5094 return;
5095 }
5096
5097 if (addr == 0)
5098 pr = &prison0;
5099 else {
5100 /* Look for a prison with the ID and with references. */
5101 TAILQ_FOREACH(pr, &allprison, pr_list)
5102 if (pr->pr_id == addr && pr->pr_ref > 0)
5103 break;
5104 if (pr == NULL)
5105 /* Look again, without requiring a reference. */
5106 TAILQ_FOREACH(pr, &allprison, pr_list)
5107 if (pr->pr_id == addr)
5108 break;
5109 if (pr == NULL)
5110 /* Assume address points to a valid prison. */
5111 pr = (struct prison *)addr;
5112 }
5113 db_show_prison(pr);
5114 }
5115
5116 #endif /* DDB */
5117