1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 1999 Poul-Henning Kamp.
5 * Copyright (c) 2008 Bjoern A. Zeeb.
6 * Copyright (c) 2009 James Gritton.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 #include <sys/cdefs.h>
32 #include "opt_ddb.h"
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 #include "opt_nfs.h"
36
37 #include <sys/param.h>
38 #include <sys/types.h>
39 #include <sys/kernel.h>
40 #include <sys/systm.h>
41 #include <sys/errno.h>
42 #include <sys/file.h>
43 #include <sys/sysproto.h>
44 #include <sys/malloc.h>
45 #include <sys/osd.h>
46 #include <sys/priv.h>
47 #include <sys/proc.h>
48 #include <sys/epoch.h>
49 #include <sys/event.h>
50 #include <sys/taskqueue.h>
51 #include <sys/fcntl.h>
52 #include <sys/jail.h>
53 #include <sys/jaildesc.h>
54 #include <sys/linker.h>
55 #include <sys/lock.h>
56 #include <sys/mman.h>
57 #include <sys/mutex.h>
58 #include <sys/racct.h>
59 #include <sys/rctl.h>
60 #include <sys/refcount.h>
61 #include <sys/sx.h>
62 #include <sys/sysent.h>
63 #include <sys/namei.h>
64 #include <sys/mount.h>
65 #include <sys/queue.h>
66 #include <sys/socket.h>
67 #include <sys/syscallsubr.h>
68 #include <sys/sysctl.h>
69 #include <sys/uuid.h>
70 #include <sys/vnode.h>
71
72 #include <net/if.h>
73 #include <net/vnet.h>
74
75 #include <netinet/in.h>
76
77 #ifdef DDB
78 #include <ddb/ddb.h>
79 #endif /* DDB */
80
81 #include <security/mac/mac_framework.h>
82
/* Preload type that prison0_init() searches for to seed prison0's host UUID. */
#define	PRISON0_HOSTUUID_MODULE	"hostuuid"

/* Malloc type used for prison allocations, including the per-jail IP lists. */
MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
#ifdef RACCT
/* Malloc type for prison racct structures; only present in RACCT kernels. */
static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
#endif
89
/*
 * Keep struct prison prison0 and some code in kern_jail_set() readable.
 *
 * _PR_IP_SADDRSEL is the union of the source-address-selection flags for
 * the address families compiled into this kernel, or 0 when neither INET
 * nor INET6 is configured.  The replacement list is parenthesized so the
 * macro always behaves as a single operand, whatever operators surround
 * it at the point of use.
 */
#if defined(INET) && defined(INET6)
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL | PR_IP6_SADDRSEL)
#elif defined(INET)
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL)
#elif defined(INET6)
#define	_PR_IP_SADDRSEL	(PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	0
#endif
104
/*
 * prison0 describes what is "real" about the system.
 *
 * Only members with compile-time constant values are set here.
 * prison0_init() fills in pr_cpuset, pr_osreldate and pr_osrelease at boot,
 * and may replace pr_hostuuid with a value preloaded by the loader.
 */
struct prison prison0 = {
	.pr_id		= 0,
	.pr_name	= "0",
	.pr_ref		= 1,
	.pr_uref	= 1,
	.pr_path	= "/",
	.pr_securelevel	= -1,
	.pr_devfs_rsnum = 0,
	.pr_state	= PRISON_STATE_ALIVE,
	.pr_childmax	= JAIL_MAX,
	.pr_hostuuid	= DEFAULT_HOSTUUID,
	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
#ifdef VIMAGE
	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
#else
	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
#endif
	.pr_allow	= PR_ALLOW_PRISON0,
};
/* Every allow bit granted to prison0 must be a statically reserved one. */
_Static_assert((PR_ALLOW_PRISON0 & ~PR_ALLOW_ALL_STATIC) == 0,
    "Bits enabled in PR_ALLOW_PRISON0 that are not statically reserved");

/* prison0 is statically allocated, so its mutex is set up via SYSINIT. */
MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
129
/*
 * A boolean jail parameter: the parameter name used when the flag is set,
 * the negated name used when it is clear, and the flag bit itself.
 * kern_jail() picks between name and noname based on whether the flag is
 * present in the default allow mask.
 */
struct bool_flags {
	const char	*name;		/* e.g. "persist", "allow.mount" */
	const char	*noname;	/* e.g. "nopersist", "allow.nomount" */
	volatile u_int	 flag;		/* PR_* or PR_ALLOW_* bit; read atomically */
};
/*
 * A named parameter backed by a pair of flag masks.
 * NOTE(review): the exact meaning of "disable" and "new" is defined by the
 * code that consumes pr_flag_jailsys (not visible here) -- confirm there.
 */
struct jailsys_flags {
	const char	*name;
	unsigned	 disable;
	unsigned	 new;
};
140
/*
 * Handle jail teardown in a dedicated thread to avoid deadlocks from
 * vnet_destroy().
 */
TASKQUEUE_DEFINE_THREAD(jail_remove);

/* allprison, allprison_racct and lastprid are protected by allprison_lock. */
struct	sx allprison_lock;
SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
/* List head for all prisons; statically initialized empty. */
struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
/* List head for prison_racct structures. */
LIST_HEAD(, prison_racct) allprison_racct;
/*
 * NOTE(review): presumably the most recently assigned jail ID and dead-jail
 * ID, consumed by get_next_prid()/get_next_deadid() -- confirm in those
 * functions.
 */
int	lastprid = 0;
int	lastdeadid = 0;
154
/* Forward declarations of file-local helpers defined later in this file. */
static int get_next_prid(struct prison **insprp);
static int get_next_deadid(struct prison **insprp);
static int do_jail_attach(struct thread *td, struct prison *pr, int drflags);
static void prison_complete(void *context, int pending);
static void prison_deref(struct prison *pr, int flags);
static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison);
static int prison_lock_xlock(struct prison *pr, int flags);
static void prison_cleanup_locked(struct prison *pr);
static void prison_cleanup_unlocked(struct prison *pr);
static void prison_free_not_last(struct prison *pr);
static void prison_proc_free_not_last(struct prison *pr);
static void prison_proc_relink(struct prison *opr, struct prison *npr,
    struct proc *p);
static void prison_set_allow_locked(struct prison *pr, unsigned flag,
    int enable);
static char *prison_path(struct prison *pr1, struct prison *pr2);
#ifdef RACCT
static void prison_racct_attach(struct prison *pr);
static void prison_racct_modify(struct prison *pr);
static void prison_racct_detach(struct prison *pr);
#endif
static void prison_knote(struct prison *pr, long hint);

/*
 * Flags for prison_deref.  The low bits (PD_OP_FLAGS) select the operation
 * to perform; the PD_LOCK_FLAGS bits describe which locks are already held.
 */
#define	PD_DEREF	0x01	/* Decrement pr_ref */
#define	PD_DEUREF	0x02	/* Decrement pr_uref */
#define	PD_KILL		0x04	/* Remove jail, kill processes, etc */
#define	PD_LOCKED	0x10	/* pr_mtx is held */
#define	PD_LIST_SLOCKED	0x20	/* allprison_lock is held shared */
#define	PD_LIST_XLOCKED	0x40	/* allprison_lock is held exclusive */
#define	PD_OP_FLAGS	0x07	/* Operation flags */
#define	PD_LOCK_FLAGS	0x70	/* Lock status flags */
187
/*
 * Parameter names corresponding to PR_* flag values.  Size values are for kvm
 * as we cannot figure out the size of a sparse array, or an array without a
 * terminating entry.
 */
/* Boolean parameters: { name, negated name, PR_* flag }. */
static struct bool_flags pr_flag_bool[] = {
	{"persist", "nopersist", PR_PERSIST},
#ifdef INET
	{"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL},
#endif
#ifdef INET6
	{"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL},
#endif
};
/* Size of the table in bytes (not an element count). */
const size_t pr_flag_bool_size = sizeof(pr_flag_bool);

/* Jailsys parameters: { name, disable mask, new mask }. */
static struct jailsys_flags pr_flag_jailsys[] = {
	{"host", 0, PR_HOST},
#ifdef VIMAGE
	{"vnet", 0, PR_VNET},
#endif
#ifdef INET
	{"ip4", PR_IP4_USER, PR_IP4_USER},
#endif
#ifdef INET6
	{"ip6", PR_IP6_USER, PR_IP6_USER},
#endif
};
/* Size of the table in bytes (not an element count). */
const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
217
/*
 * Make this array full-size so dynamic parameters can be added.
 * It is protected by prison0.mtx, but lockless reading is allowed
 * with an atomic check of the flag values.
 *
 * Unused trailing slots are zero-initialized; kern_jail() walks the table
 * from the start and stops at the first entry whose flag reads as zero.
 */
static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
	{"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME},
	{"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC},
	{"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS},
	{"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS},
	{"allow.mount", "allow.nomount", PR_ALLOW_MOUNT},
	{"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS},
	{"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF},
	{"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK},
	{"allow.reserved_ports", "allow.noreserved_ports",
	    PR_ALLOW_RESERVED_PORTS},
	{"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF},
	{"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug",
	    PR_ALLOW_UNPRIV_DEBUG},
	{"allow.suser", "allow.nosuser", PR_ALLOW_SUSER},
#ifdef VIMAGE
	{"allow.nfsd", "allow.nonfsd", PR_ALLOW_NFSD},
#endif
	{"allow.extattr", "allow.noextattr", PR_ALLOW_EXTATTR},
	{"allow.adjtime", "allow.noadjtime", PR_ALLOW_ADJTIME},
	{"allow.settime", "allow.nosettime", PR_ALLOW_SETTIME},
	{"allow.routing", "allow.norouting", PR_ALLOW_ROUTING},
	{"allow.unprivileged_parent_tampering",
	    "allow.nounprivileged_parent_tampering",
	    PR_ALLOW_UNPRIV_PARENT_TAMPER},
#ifdef AUDIT
	{"allow.setaudit", "allow.nosetaudit", PR_ALLOW_SETAUDIT},
#endif
};
/*
 * Starts out as the statically reserved allow bits.
 * NOTE(review): presumably grows as dynamic allow parameters are registered
 * -- confirm where pr_allow_all is written.
 */
static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC;
/* Size of the full-size table in bytes (not an element count). */
const size_t pr_flag_allow_size = sizeof(pr_flag_allow);
254
/* Allow bits given by default to jails created through kern_jail(). */
#define	JAIL_DEFAULT_ALLOW		(PR_ALLOW_SET_HOSTNAME | \
					 PR_ALLOW_RESERVED_PORTS | \
					 PR_ALLOW_UNPRIV_DEBUG | \
					 PR_ALLOW_SUSER)
#define	JAIL_DEFAULT_ENFORCE_STATFS	2
#define	JAIL_DEFAULT_DEVFS_RSNUM	0
/*
 * Run-time copies of the defaults above.  kern_jail() applies
 * jail_default_allow and jail_default_enforce_statfs when the caller is not
 * itself jailed.
 */
static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
#if defined(INET) || defined(INET6)
/* Upper bound on the per-family address count that kern_jail() accepts. */
static unsigned jail_max_af_ips = 255;
#endif
267
268 /*
269 * Initialize the parts of prison0 that can't be static-initialized with
270 * constants. This is called from proc0_init() after creating thread0 cpuset.
271 */
272 void
prison0_init(void)273 prison0_init(void)
274 {
275 uint8_t *file, *data;
276 size_t size;
277 char buf[sizeof(prison0.pr_hostuuid)];
278 bool valid;
279
280 prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
281 prison0.pr_osreldate = osreldate;
282 strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
283
284 /* If we have a preloaded hostuuid, use it. */
285 file = preload_search_by_type(PRISON0_HOSTUUID_MODULE);
286 if (file != NULL) {
287 data = preload_fetch_addr(file);
288 size = preload_fetch_size(file);
289 if (data != NULL) {
290 /*
291 * The preloaded data may include trailing whitespace, almost
292 * certainly a newline; skip over any whitespace or
293 * non-printable characters to be safe.
294 */
295 while (size > 0 && data[size - 1] <= 0x20) {
296 size--;
297 }
298
299 valid = false;
300
301 /*
302 * Not NUL-terminated when passed from loader, but
303 * validate_uuid requires that due to using sscanf (as
304 * does the subsequent strlcpy, since it still reads
305 * past the given size to return the true length);
306 * bounce to a temporary buffer to fix.
307 */
308 if (size >= sizeof(buf))
309 goto done;
310
311 memcpy(buf, data, size);
312 buf[size] = '\0';
313
314 if (validate_uuid(buf, size, NULL, 0) != 0)
315 goto done;
316
317 valid = true;
318 (void)strlcpy(prison0.pr_hostuuid, buf,
319 sizeof(prison0.pr_hostuuid));
320
321 done:
322 if (bootverbose && !valid) {
323 printf("hostuuid: preload data malformed: '%.*s'\n",
324 (int)size, data);
325 }
326 }
327 }
328 if (bootverbose)
329 printf("hostuuid: using %s\n", prison0.pr_hostuuid);
330 }
331
332 /*
333 * struct jail_args {
334 * struct jail *jail;
335 * };
336 */
337 int
sys_jail(struct thread * td,struct jail_args * uap)338 sys_jail(struct thread *td, struct jail_args *uap)
339 {
340 uint32_t version;
341 int error;
342 struct jail j;
343
344 error = copyin(uap->jail, &version, sizeof(uint32_t));
345 if (error)
346 return (error);
347
348 switch (version) {
349 case 0:
350 {
351 struct jail_v0 j0;
352
353 /* FreeBSD single IPv4 jails. */
354 bzero(&j, sizeof(struct jail));
355 error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
356 if (error)
357 return (error);
358 j.version = j0.version;
359 j.path = j0.path;
360 j.hostname = j0.hostname;
361 j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */
362 break;
363 }
364
365 case 1:
366 /*
367 * Version 1 was used by multi-IPv4 jail implementations
368 * that never made it into the official kernel.
369 */
370 return (EINVAL);
371
372 case 2: /* JAIL_API_VERSION */
373 /* FreeBSD multi-IPv4/IPv6,noIP jails. */
374 error = copyin(uap->jail, &j, sizeof(struct jail));
375 if (error)
376 return (error);
377 break;
378
379 default:
380 /* Sci-Fi jails are not supported, sorry. */
381 return (EINVAL);
382 }
383 return (kern_jail(td, &j));
384 }
385
386 int
kern_jail(struct thread * td,struct jail * j)387 kern_jail(struct thread *td, struct jail *j)
388 {
389 struct iovec optiov[2 * (4 + nitems(pr_flag_allow)
390 #ifdef INET
391 + 1
392 #endif
393 #ifdef INET6
394 + 1
395 #endif
396 )];
397 struct uio opt;
398 char *u_path, *u_hostname, *u_name;
399 struct bool_flags *bf;
400 #ifdef INET
401 uint32_t ip4s;
402 struct in_addr *u_ip4;
403 #endif
404 #ifdef INET6
405 struct in6_addr *u_ip6;
406 #endif
407 size_t tmplen;
408 int error, enforce_statfs;
409
410 bzero(&optiov, sizeof(optiov));
411 opt.uio_iov = optiov;
412 opt.uio_iovcnt = 0;
413 opt.uio_offset = -1;
414 opt.uio_resid = -1;
415 opt.uio_segflg = UIO_SYSSPACE;
416 opt.uio_rw = UIO_READ;
417 opt.uio_td = td;
418
419 /* Set permissions for top-level jails from sysctls. */
420 if (!jailed(td->td_ucred)) {
421 for (bf = pr_flag_allow;
422 bf < pr_flag_allow + nitems(pr_flag_allow) &&
423 atomic_load_int(&bf->flag) != 0;
424 bf++) {
425 optiov[opt.uio_iovcnt].iov_base = __DECONST(char *,
426 (jail_default_allow & bf->flag)
427 ? bf->name : bf->noname);
428 optiov[opt.uio_iovcnt].iov_len =
429 strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
430 opt.uio_iovcnt += 2;
431 }
432 optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
433 optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
434 opt.uio_iovcnt++;
435 enforce_statfs = jail_default_enforce_statfs;
436 optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
437 optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
438 opt.uio_iovcnt++;
439 }
440
441 tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
442 #ifdef INET
443 ip4s = (j->version == 0) ? 1 : j->ip4s;
444 if (ip4s > jail_max_af_ips)
445 return (EINVAL);
446 tmplen += ip4s * sizeof(struct in_addr);
447 #else
448 if (j->ip4s > 0)
449 return (EINVAL);
450 #endif
451 #ifdef INET6
452 if (j->ip6s > jail_max_af_ips)
453 return (EINVAL);
454 tmplen += j->ip6s * sizeof(struct in6_addr);
455 #else
456 if (j->ip6s > 0)
457 return (EINVAL);
458 #endif
459 u_path = malloc(tmplen, M_TEMP, M_WAITOK);
460 u_hostname = u_path + MAXPATHLEN;
461 u_name = u_hostname + MAXHOSTNAMELEN;
462 #ifdef INET
463 u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
464 #endif
465 #ifdef INET6
466 #ifdef INET
467 u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
468 #else
469 u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
470 #endif
471 #endif
472 optiov[opt.uio_iovcnt].iov_base = "path";
473 optiov[opt.uio_iovcnt].iov_len = sizeof("path");
474 opt.uio_iovcnt++;
475 optiov[opt.uio_iovcnt].iov_base = u_path;
476 error = copyinstr(j->path, u_path, MAXPATHLEN,
477 &optiov[opt.uio_iovcnt].iov_len);
478 if (error) {
479 free(u_path, M_TEMP);
480 return (error);
481 }
482 opt.uio_iovcnt++;
483 optiov[opt.uio_iovcnt].iov_base = "host.hostname";
484 optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
485 opt.uio_iovcnt++;
486 optiov[opt.uio_iovcnt].iov_base = u_hostname;
487 error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
488 &optiov[opt.uio_iovcnt].iov_len);
489 if (error) {
490 free(u_path, M_TEMP);
491 return (error);
492 }
493 opt.uio_iovcnt++;
494 if (j->jailname != NULL) {
495 optiov[opt.uio_iovcnt].iov_base = "name";
496 optiov[opt.uio_iovcnt].iov_len = sizeof("name");
497 opt.uio_iovcnt++;
498 optiov[opt.uio_iovcnt].iov_base = u_name;
499 error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
500 &optiov[opt.uio_iovcnt].iov_len);
501 if (error) {
502 free(u_path, M_TEMP);
503 return (error);
504 }
505 opt.uio_iovcnt++;
506 }
507 #ifdef INET
508 optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
509 optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
510 opt.uio_iovcnt++;
511 optiov[opt.uio_iovcnt].iov_base = u_ip4;
512 optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
513 if (j->version == 0)
514 u_ip4->s_addr = j->ip4s;
515 else {
516 error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
517 if (error) {
518 free(u_path, M_TEMP);
519 return (error);
520 }
521 }
522 opt.uio_iovcnt++;
523 #endif
524 #ifdef INET6
525 optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
526 optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
527 opt.uio_iovcnt++;
528 optiov[opt.uio_iovcnt].iov_base = u_ip6;
529 optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
530 error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
531 if (error) {
532 free(u_path, M_TEMP);
533 return (error);
534 }
535 opt.uio_iovcnt++;
536 #endif
537 KASSERT(opt.uio_iovcnt <= nitems(optiov),
538 ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
539 error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
540 free(u_path, M_TEMP);
541 return (error);
542 }
543
544 /*
545 * struct jail_set_args {
546 * struct iovec *iovp;
547 * unsigned int iovcnt;
548 * int flags;
549 * };
550 */
551 int
sys_jail_set(struct thread * td,struct jail_set_args * uap)552 sys_jail_set(struct thread *td, struct jail_set_args *uap)
553 {
554 struct uio *auio;
555 int error;
556
557 /* Check that we have an even number of iovecs. */
558 if (uap->iovcnt & 1)
559 return (EINVAL);
560
561 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
562 if (error)
563 return (error);
564 error = kern_jail_set(td, auio, uap->flags);
565 freeuio(auio);
566 return (error);
567 }
568
569 #if defined(INET) || defined(INET6)
/* qsort-style comparison of two addresses of one family. */
typedef int prison_addr_cmp_t(const void *, const void *);
/* Predicate telling whether an address is acceptable for a jail. */
typedef bool prison_addr_valid_t(const void *);
/*
 * Per-address-family description, indexed by pr_family_t: the size of one
 * address, how to compare and validate addresses, and the pr_flags bit that
 * marks a jail as having its own setting for that family.
 */
static const struct pr_family {
	size_t			size;
	prison_addr_cmp_t	*cmp;
	prison_addr_valid_t	*valid;
	int			ip_flag;
} pr_families[PR_FAMILY_MAX] = {
#ifdef INET
	[PR_INET] = {
		.size = sizeof(struct in_addr),
		.cmp = prison_qcmp_v4,
		.valid = prison_valid_v4,
		.ip_flag = PR_IP4_USER,
	 },
#endif
#ifdef INET6
	[PR_INET6] = {
		.size = sizeof(struct in6_addr),
		.cmp = prison_qcmp_v6,
		.valid = prison_valid_v6,
		.ip_flag = PR_IP6_USER,
	},
#endif
};
595
/*
 * Network address lists (pr_addrs) allocation for jails.  The addresses
 * are accessed locklessly by the network stack, thus need to be protected by
 * the network epoch.
 *
 * The epoch context must stay the first member: the deferred free callback
 * frees the context pointer it is given (see the _Static_assert next to
 * prison_ip_free_deferred()).
 */
struct prison_ip {
	struct epoch_context ctx;
	uint32_t	ips;		/* number of addresses in pr_ip */
#ifdef FUTURE_C
	/*
	 * XXX Variable-length automatic arrays in union may be
	 * supported in future C.
	 */
	union {
		char pr_ip[];
		struct in_addr pr_ip4[];
		struct in6_addr pr_ip6[];
	};
#else /* No future C :( */
	/* Raw address storage; index it with PR_IP() for a given family. */
	char pr_ip[];
#endif
};
618
619 static char *
PR_IP(struct prison_ip * pip,const pr_family_t af,int idx)620 PR_IP(struct prison_ip *pip, const pr_family_t af, int idx)
621 {
622 MPASS(pip);
623 MPASS(af < PR_FAMILY_MAX);
624 MPASS(idx >= 0 && idx < pip->ips);
625
626 return (pip->pr_ip + pr_families[af].size * idx);
627 }
628
629 static struct prison_ip *
prison_ip_alloc(const pr_family_t af,uint32_t cnt,int flags)630 prison_ip_alloc(const pr_family_t af, uint32_t cnt, int flags)
631 {
632 struct prison_ip *pip;
633
634 pip = malloc(sizeof(struct prison_ip) + cnt * pr_families[af].size,
635 M_PRISON, flags);
636 if (pip != NULL)
637 pip->ips = cnt;
638 return (pip);
639 }
640
641 /*
642 * Allocate and copyin user supplied address list, sorting and validating.
643 * kern_jail_set() helper.
644 */
645 static struct prison_ip *
prison_ip_copyin(const pr_family_t af,void * op,uint32_t cnt)646 prison_ip_copyin(const pr_family_t af, void *op, uint32_t cnt)
647 {
648 prison_addr_cmp_t *const cmp = pr_families[af].cmp;
649 const size_t size = pr_families[af].size;
650 struct prison_ip *pip;
651
652 pip = prison_ip_alloc(af, cnt, M_WAITOK);
653 bcopy(op, pip->pr_ip, cnt * size);
654 /*
655 * IP addresses are all sorted but ip[0] to preserve
656 * the primary IP address as given from userland.
657 * This special IP is used for unbound outgoing
658 * connections as well for "loopback" traffic in case
659 * source address selection cannot find any more fitting
660 * address to connect from.
661 */
662 if (cnt > 1)
663 qsort(PR_IP(pip, af, 1), cnt - 1, size, cmp);
664 /*
665 * Check for duplicate addresses and do some simple
666 * zero and broadcast checks. If users give other bogus
667 * addresses it is their problem.
668 */
669 for (int i = 0; i < cnt; i++) {
670 if (!pr_families[af].valid(PR_IP(pip, af, i))) {
671 free(pip, M_PRISON);
672 return (NULL);
673 }
674 if (i + 1 < cnt &&
675 (cmp(PR_IP(pip, af, 0), PR_IP(pip, af, i + 1)) == 0 ||
676 cmp(PR_IP(pip, af, i), PR_IP(pip, af, i + 1)) == 0)) {
677 free(pip, M_PRISON);
678 return (NULL);
679 }
680 }
681
682 return (pip);
683 }
684
685 /*
686 * Allocate and dup parent prison address list.
687 * kern_jail_set() helper.
688 */
689 static void
prison_ip_dup(struct prison * ppr,struct prison * pr,const pr_family_t af)690 prison_ip_dup(struct prison *ppr, struct prison *pr, const pr_family_t af)
691 {
692 const struct prison_ip *ppip = ppr->pr_addrs[af];
693 struct prison_ip *pip;
694
695 if (ppip != NULL) {
696 pip = prison_ip_alloc(af, ppip->ips, M_WAITOK);
697 bcopy(ppip->pr_ip, pip->pr_ip, pip->ips * pr_families[af].size);
698 pr->pr_addrs[af] = pip;
699 }
700 }
701
702 /*
703 * Make sure the new set of IP addresses is a subset of the parent's list.
704 * Don't worry about the parent being unlocked, as any setting is done with
705 * allprison_lock held.
706 * kern_jail_set() helper.
707 */
708 static bool
prison_ip_parent_match(struct prison_ip * ppip,struct prison_ip * pip,const pr_family_t af)709 prison_ip_parent_match(struct prison_ip *ppip, struct prison_ip *pip,
710 const pr_family_t af)
711 {
712 prison_addr_cmp_t *const cmp = pr_families[af].cmp;
713 int i, j;
714
715 if (ppip == NULL)
716 return (false);
717
718 for (i = 0; i < ppip->ips; i++)
719 if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, i)) == 0)
720 break;
721
722 if (i == ppip->ips)
723 /* Main address not present in parent. */
724 return (false);
725
726 if (pip->ips > 1) {
727 for (i = j = 1; i < pip->ips; i++) {
728 if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0)
729 /* Equals to parent primary address. */
730 continue;
731 for (; j < ppip->ips; j++)
732 if (cmp(PR_IP(pip, af, i),
733 PR_IP(ppip, af, j)) == 0)
734 break;
735 if (j == ppip->ips)
736 break;
737 }
738 if (j == ppip->ips)
739 /* Address not present in parent. */
740 return (false);
741 }
742 return (true);
743 }
744
/*
 * Check for conflicting IP addresses.  We permit them if there is no more
 * than one IP on each jail.  If there is a duplicate on a jail with more
 * than one IP stop checking and return error.
 * kern_jail_set() helper.
 *
 * ppr is the parent of the jail being configured, pr the jail itself (it is
 * skipped during the scan) and pip the proposed address list for family af.
 * Returns true when no conflict was found, false otherwise.
 */
static bool
prison_ip_conflict_check(const struct prison *ppr, const struct prison *pr,
    struct prison_ip *pip, pr_family_t af)
{
	const struct prison *tppr, *tpr;
	int descend;

#ifdef VIMAGE
	/* Start from the nearest ancestor that is a vnet jail, or prison0. */
	for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
		if (tppr->pr_flags & PR_VNET)
			break;
#else
	tppr = &prison0;
#endif
	FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
		/*
		 * Ignore the jail being configured, other vnet jails and
		 * dead jails.
		 * NOTE(review): clearing descend presumably also skips the
		 * subtree below tpr -- confirm against the macro definition.
		 */
		if (tpr == pr ||
#ifdef VIMAGE
		    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
#endif
		    !prison_isalive(tpr)) {
			descend = 0;
			continue;
		}
		/* Jails without their own setting for this family are passed over. */
		if (!(tpr->pr_flags & pr_families[af].ip_flag))
			continue;
		descend = 0;
		/* Sharing is tolerated when both sides have a single address. */
		if (tpr->pr_addrs[af] == NULL ||
		    (pip->ips == 1 && tpr->pr_addrs[af]->ips == 1))
			continue;
		/* prison_ip_check() returns 0 when the address belongs to tpr. */
		for (int i = 0; i < pip->ips; i++)
			if (prison_ip_check(tpr, af, PR_IP(pip, af, i)) == 0)
				return (false);
	}

	return (true);
}
787
788 _Static_assert(offsetof(struct prison_ip, ctx) == 0,
789 "prison must start with epoch context");
790 static void
prison_ip_free_deferred(epoch_context_t ctx)791 prison_ip_free_deferred(epoch_context_t ctx)
792 {
793
794 free(ctx, M_PRISON);
795 }
796
797 static void
prison_ip_free(struct prison_ip * pip)798 prison_ip_free(struct prison_ip *pip)
799 {
800
801 if (pip != NULL)
802 NET_EPOCH_CALL(prison_ip_free_deferred, &pip->ctx);
803 }
804
805 static void
prison_ip_set(struct prison * pr,const pr_family_t af,struct prison_ip * new)806 prison_ip_set(struct prison *pr, const pr_family_t af, struct prison_ip *new)
807 {
808 struct prison_ip **mem, *old;
809
810 mtx_assert(&pr->pr_mtx, MA_OWNED);
811
812 mem = &pr->pr_addrs[af];
813
814 old = *mem;
815 atomic_store_ptr(mem, new);
816 prison_ip_free(old);
817 }
818
819 /*
820 * Restrict a prison's IP address list with its parent's, possibly replacing
821 * it. Return true if succeed, otherwise should redo.
822 * kern_jail_set() helper.
823 */
824 static bool
prison_ip_restrict(struct prison * pr,const pr_family_t af,struct prison_ip ** newp)825 prison_ip_restrict(struct prison *pr, const pr_family_t af,
826 struct prison_ip **newp)
827 {
828 struct prison_ip *ppip = pr->pr_parent->pr_addrs[af];
829 struct prison_ip *pip = pr->pr_addrs[af];
830 int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
831 const size_t size = pr_families[af].size;
832 struct prison_ip *new = newp != NULL ? *newp : NULL;
833 uint32_t ips;
834
835 mtx_assert(&pr->pr_mtx, MA_OWNED);
836
837 /*
838 * Due to epoch-synchronized access to the IP address lists we always
839 * allocate a new list even if the old one has enough space. We could
840 * atomically update an IPv4 address inside a list, but that would
841 * screw up sorting, and in case of IPv6 we can't even atomically write
842 * one.
843 */
844 if (ppip == NULL) {
845 if (pip != NULL)
846 prison_ip_set(pr, af, NULL);
847 return (true);
848 }
849
850 if (!(pr->pr_flags & pr_families[af].ip_flag)) {
851 if (new == NULL) {
852 new = prison_ip_alloc(af, ppip->ips, M_NOWAIT);
853 if (new == NULL)
854 return (false); /* Redo */
855 }
856 /* This has no user settings, so just copy the parent's list. */
857 MPASS(new->ips == ppip->ips);
858 bcopy(ppip->pr_ip, new->pr_ip, ppip->ips * size);
859 prison_ip_set(pr, af, new);
860 if (newp != NULL)
861 *newp = NULL; /* Used */
862 } else if (pip != NULL) {
863 /* Remove addresses that aren't in the parent. */
864 int i;
865
866 i = 0; /* index in pip */
867 ips = 0; /* index in new */
868
869 if (new == NULL) {
870 new = prison_ip_alloc(af, pip->ips, M_NOWAIT);
871 if (new == NULL)
872 return (false); /* Redo */
873 }
874
875 for (int pi = 0; pi < ppip->ips; pi++)
876 if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, pi)) == 0) {
877 /* Found our primary address in parent. */
878 bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
879 size);
880 i++;
881 ips++;
882 break;
883 }
884 for (int pi = 1; i < pip->ips; ) {
885 /* Check against primary, which is unsorted. */
886 if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0) {
887 /* Matches parent's primary address. */
888 bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
889 size);
890 i++;
891 ips++;
892 continue;
893 }
894 /* The rest are sorted. */
895 switch (pi >= ppip->ips ? -1 :
896 cmp(PR_IP(pip, af, i), PR_IP(ppip, af, pi))) {
897 case -1:
898 i++;
899 break;
900 case 0:
901 bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
902 size);
903 i++;
904 pi++;
905 ips++;
906 break;
907 case 1:
908 pi++;
909 break;
910 }
911 }
912 if (ips == 0) {
913 if (newp == NULL || *newp == NULL)
914 prison_ip_free(new);
915 new = NULL;
916 } else {
917 /* Shrink to real size */
918 KASSERT((new->ips >= ips),
919 ("Out-of-bounds write to prison_ip %p", new));
920 new->ips = ips;
921 }
922 prison_ip_set(pr, af, new);
923 if (newp != NULL)
924 *newp = NULL; /* Used */
925 }
926 return (true);
927 }
928
929 /*
930 * Fast-path check if an address belongs to a prison.
931 */
932 int
prison_ip_check(const struct prison * pr,const pr_family_t af,const void * addr)933 prison_ip_check(const struct prison *pr, const pr_family_t af,
934 const void *addr)
935 {
936 int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
937 struct prison_ip *pip;
938 int i, a, z, d;
939
940 MPASS(mtx_owned(&pr->pr_mtx) ||
941 in_epoch(net_epoch_preempt) ||
942 sx_xlocked(&allprison_lock));
943
944 pip = atomic_load_ptr(&pr->pr_addrs[af]);
945 if (__predict_false(pip == NULL))
946 return (EAFNOSUPPORT);
947
948 /* Check the primary IP. */
949 if (cmp(PR_IP(pip, af, 0), addr) == 0)
950 return (0);
951
952 /*
953 * All the other IPs are sorted so we can do a binary search.
954 */
955 a = 0;
956 z = pip->ips - 2;
957 while (a <= z) {
958 i = (a + z) / 2;
959 d = cmp(PR_IP(pip, af, i + 1), addr);
960 if (d > 0)
961 z = i - 1;
962 else if (d < 0)
963 a = i + 1;
964 else
965 return (0);
966 }
967
968 return (EADDRNOTAVAIL);
969 }
970
971 /*
972 * Grab primary IP. Historically required mutex, but nothing prevents
973 * us to support epoch-protected access. Is it used in fast path?
974 * in{6}_jail.c helper
975 */
976 const void *
prison_ip_get0(const struct prison * pr,const pr_family_t af)977 prison_ip_get0(const struct prison *pr, const pr_family_t af)
978 {
979 const struct prison_ip *pip = pr->pr_addrs[af];
980
981 mtx_assert(&pr->pr_mtx, MA_OWNED);
982 MPASS(pip);
983
984 return (pip->pr_ip);
985 }
986
987 u_int
prison_ip_cnt(const struct prison * pr,const pr_family_t af)988 prison_ip_cnt(const struct prison *pr, const pr_family_t af)
989 {
990
991 return (pr->pr_addrs[af]->ips);
992 }
993 #endif /* defined(INET) || defined(INET6) */
994
995 int
kern_jail_set(struct thread * td,struct uio * optuio,int flags)996 kern_jail_set(struct thread *td, struct uio *optuio, int flags)
997 {
998 struct file *jfp_out;
999 struct nameidata nd;
1000 #ifdef INET
1001 struct prison_ip *ip4;
1002 #endif
1003 #ifdef INET6
1004 struct prison_ip *ip6;
1005 #endif
1006 struct vfsopt *opt;
1007 struct vfsoptlist *opts;
1008 struct prison *pr, *deadpr, *dinspr, *inspr, *mypr, *ppr, *tpr;
1009 struct ucred *jdcred;
1010 struct vnode *root;
1011 char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
1012 char *g_path, *osrelstr;
1013 struct bool_flags *bf;
1014 struct jailsys_flags *jsf;
1015 #if defined(INET) || defined(INET6)
1016 void *op;
1017 #endif
1018 unsigned long hid;
1019 size_t namelen, onamelen, pnamelen;
1020 int created, cuflags, descend, drflags, enforce;
1021 int error, errmsg_len, errmsg_pos;
1022 int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
1023 int deadid, jfd_in, jfd_out, jfd_pos, jid, jsys, len, level;
1024 int childmax, osreldt, rsnum, slevel;
1025 #ifdef INET
1026 int ip4s;
1027 bool redo_ip4;
1028 #endif
1029 #ifdef INET6
1030 int ip6s;
1031 bool redo_ip6;
1032 #endif
1033 bool maybe_changed;
1034 uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
1035 uint64_t pr_allow_diff;
1036 unsigned tallow;
1037 char numbuf[12];
1038
1039 mypr = td->td_ucred->cr_prison;
1040 if (((flags & (JAIL_CREATE | JAIL_AT_DESC)) == JAIL_CREATE) &&
1041 mypr->pr_childmax == 0)
1042 return (EPERM);
1043 if (flags & ~JAIL_SET_MASK)
1044 return (EINVAL);
1045 if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) ==
1046 (JAIL_USE_DESC | JAIL_AT_DESC))
1047 return (EINVAL);
1048 prison_hold(mypr);
1049
1050 #ifdef INET
1051 ip4 = NULL;
1052 #endif
1053 #ifdef INET6
1054 ip6 = NULL;
1055 #endif
1056 g_path = NULL;
1057 jfp_out = NULL;
1058 jfd_out = -1;
1059 /*
1060 * Check all the parameters before committing to anything. Not all
1061 * errors can be caught early, but we may as well try. Also, this
1062 * takes care of some expensive stuff (path lookup) before getting
1063 * the allprison lock.
1064 *
1065 * XXX Jails are not filesystems, and jail parameters are not mount
1066 * options. But it makes more sense to re-use the vfsopt code
1067 * than duplicate it under a different name.
1068 */
1069 error = vfs_buildopts(optuio, &opts);
1070 if (error) {
1071 opts = NULL;
1072 goto done_free;
1073 }
1074
1075 cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
1076 if (!cuflags) {
1077 error = EINVAL;
1078 vfs_opterror(opts, "no valid operation (create or update)");
1079 goto done_errmsg;
1080 }
1081
1082 error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in));
1083 if (error == ENOENT) {
1084 if (flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC |
1085 JAIL_OWN_DESC)) {
1086 vfs_opterror(opts, "missing desc");
1087 goto done_errmsg;
1088 }
1089 jfd_in = -1;
1090 } else if (error != 0)
1091 goto done_free;
1092 else {
1093 if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC |
1094 JAIL_OWN_DESC))) {
1095 error = EINVAL;
1096 vfs_opterror(opts, "unexpected desc");
1097 goto done_errmsg;
1098 }
1099 if (flags & JAIL_AT_DESC) {
1100 /*
1101 * Look up and create jails based on the
1102 * descriptor's prison.
1103 */
1104 prison_free(mypr);
1105 error = jaildesc_find(td, jfd_in, &mypr, NULL);
1106 if (error != 0) {
1107 vfs_opterror(opts, error == ENOENT ?
1108 "descriptor to dead jail" :
1109 "not a jail descriptor");
1110 goto done_errmsg;
1111 }
1112 if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) {
1113 error = EPERM;
1114 goto done_free;
1115 }
1116 }
1117 if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) {
1118 /* Allocate a jail descriptor to return later. */
1119 error = jaildesc_alloc(td, &jfp_out, &jfd_out,
1120 flags & JAIL_OWN_DESC);
1121 if (error)
1122 goto done_free;
1123 }
1124 }
1125
1126 /*
1127 * Delay the permission check if using a jail descriptor,
1128 * until we get the descriptor's credentials.
1129 */
1130 if (!(flags & JAIL_USE_DESC)) {
1131 error = priv_check(td, PRIV_JAIL_SET);
1132 if (error == 0 && (flags & JAIL_ATTACH))
1133 error = priv_check(td, PRIV_JAIL_ATTACH);
1134 if (error)
1135 goto done_free;
1136 }
1137
1138 error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
1139 if (error == ENOENT)
1140 jid = 0;
1141 else if (error != 0)
1142 goto done_free;
1143
1144 error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
1145 if (error == ENOENT)
1146 gotslevel = 0;
1147 else if (error != 0)
1148 goto done_free;
1149 else
1150 gotslevel = 1;
1151
1152 error =
1153 vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
1154 if (error == ENOENT)
1155 gotchildmax = 0;
1156 else if (error != 0)
1157 goto done_free;
1158 else
1159 gotchildmax = 1;
1160
1161 error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
1162 if (error == ENOENT)
1163 gotenforce = 0;
1164 else if (error != 0)
1165 goto done_free;
1166 else if (enforce < 0 || enforce > 2) {
1167 error = EINVAL;
1168 goto done_free;
1169 } else
1170 gotenforce = 1;
1171
1172 error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
1173 if (error == ENOENT)
1174 gotrsnum = 0;
1175 else if (error != 0)
1176 goto done_free;
1177 else
1178 gotrsnum = 1;
1179
1180 pr_flags = ch_flags = 0;
1181 for (bf = pr_flag_bool;
1182 bf < pr_flag_bool + nitems(pr_flag_bool);
1183 bf++) {
1184 vfs_flagopt(opts, bf->name, &pr_flags, bf->flag);
1185 vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag);
1186 }
1187 ch_flags |= pr_flags;
1188 for (jsf = pr_flag_jailsys;
1189 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
1190 jsf++) {
1191 error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys));
1192 if (error == ENOENT)
1193 continue;
1194 if (error != 0)
1195 goto done_free;
1196 switch (jsys) {
1197 case JAIL_SYS_DISABLE:
1198 if (!jsf->disable) {
1199 error = EINVAL;
1200 goto done_free;
1201 }
1202 pr_flags |= jsf->disable;
1203 break;
1204 case JAIL_SYS_NEW:
1205 pr_flags |= jsf->new;
1206 break;
1207 case JAIL_SYS_INHERIT:
1208 break;
1209 default:
1210 error = EINVAL;
1211 goto done_free;
1212 }
1213 ch_flags |= jsf->new | jsf->disable;
1214 }
1215 if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE
1216 && !(pr_flags & PR_PERSIST)) {
1217 error = EINVAL;
1218 vfs_opterror(opts, "new jail must persist or attach");
1219 goto done_errmsg;
1220 }
1221 #ifdef VIMAGE
1222 if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
1223 error = EINVAL;
1224 vfs_opterror(opts, "vnet cannot be changed after creation");
1225 goto done_errmsg;
1226 }
1227 #endif
1228 #ifdef INET
1229 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
1230 error = EINVAL;
1231 vfs_opterror(opts, "ip4 cannot be changed after creation");
1232 goto done_errmsg;
1233 }
1234 #endif
1235 #ifdef INET6
1236 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
1237 error = EINVAL;
1238 vfs_opterror(opts, "ip6 cannot be changed after creation");
1239 goto done_errmsg;
1240 }
1241 #endif
1242
1243 pr_allow = ch_allow = 0;
1244 for (bf = pr_flag_allow;
1245 bf < pr_flag_allow + nitems(pr_flag_allow) &&
1246 atomic_load_int(&bf->flag) != 0;
1247 bf++) {
1248 vfs_flagopt(opts, bf->name, &pr_allow, bf->flag);
1249 vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag);
1250 }
1251 ch_allow |= pr_allow;
1252
1253 error = vfs_getopt(opts, "name", (void **)&name, &len);
1254 if (error == ENOENT)
1255 name = NULL;
1256 else if (error != 0)
1257 goto done_free;
1258 else {
1259 if (len == 0 || name[len - 1] != '\0') {
1260 error = EINVAL;
1261 goto done_free;
1262 }
1263 if (len > MAXHOSTNAMELEN) {
1264 error = ENAMETOOLONG;
1265 goto done_free;
1266 }
1267 }
1268
1269 error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
1270 if (error == ENOENT)
1271 host = NULL;
1272 else if (error != 0)
1273 goto done_free;
1274 else {
1275 ch_flags |= PR_HOST;
1276 pr_flags |= PR_HOST;
1277 if (len == 0 || host[len - 1] != '\0') {
1278 error = EINVAL;
1279 goto done_free;
1280 }
1281 if (len > MAXHOSTNAMELEN) {
1282 error = ENAMETOOLONG;
1283 goto done_free;
1284 }
1285 }
1286
1287 error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
1288 if (error == ENOENT)
1289 domain = NULL;
1290 else if (error != 0)
1291 goto done_free;
1292 else {
1293 ch_flags |= PR_HOST;
1294 pr_flags |= PR_HOST;
1295 if (len == 0 || domain[len - 1] != '\0') {
1296 error = EINVAL;
1297 goto done_free;
1298 }
1299 if (len > MAXHOSTNAMELEN) {
1300 error = ENAMETOOLONG;
1301 goto done_free;
1302 }
1303 }
1304
1305 error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
1306 if (error == ENOENT)
1307 uuid = NULL;
1308 else if (error != 0)
1309 goto done_free;
1310 else {
1311 ch_flags |= PR_HOST;
1312 pr_flags |= PR_HOST;
1313 if (len == 0 || uuid[len - 1] != '\0') {
1314 error = EINVAL;
1315 goto done_free;
1316 }
1317 if (len > HOSTUUIDLEN) {
1318 error = ENAMETOOLONG;
1319 goto done_free;
1320 }
1321 }
1322
1323 #ifdef COMPAT_FREEBSD32
1324 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
1325 uint32_t hid32;
1326
1327 error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
1328 hid = hid32;
1329 } else
1330 #endif
1331 error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
1332 if (error == ENOENT)
1333 gothid = 0;
1334 else if (error != 0)
1335 goto done_free;
1336 else {
1337 gothid = 1;
1338 ch_flags |= PR_HOST;
1339 pr_flags |= PR_HOST;
1340 }
1341
1342 #ifdef INET
1343 error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
1344 if (error == ENOENT)
1345 ip4s = 0;
1346 else if (error != 0)
1347 goto done_free;
1348 else if (ip4s & (sizeof(struct in_addr) - 1)) {
1349 error = EINVAL;
1350 goto done_free;
1351 } else {
1352 ch_flags |= PR_IP4_USER;
1353 pr_flags |= PR_IP4_USER;
1354 if (ip4s > 0) {
1355 ip4s /= sizeof(struct in_addr);
1356 if (ip4s > jail_max_af_ips) {
1357 error = EINVAL;
1358 vfs_opterror(opts, "too many IPv4 addresses");
1359 goto done_errmsg;
1360 }
1361 ip4 = prison_ip_copyin(PR_INET, op, ip4s);
1362 if (ip4 == NULL) {
1363 error = EINVAL;
1364 goto done_free;
1365 }
1366 }
1367 }
1368 #endif
1369
1370 #ifdef INET6
1371 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
1372 if (error == ENOENT)
1373 ip6s = 0;
1374 else if (error != 0)
1375 goto done_free;
1376 else if (ip6s & (sizeof(struct in6_addr) - 1)) {
1377 error = EINVAL;
1378 goto done_free;
1379 } else {
1380 ch_flags |= PR_IP6_USER;
1381 pr_flags |= PR_IP6_USER;
1382 if (ip6s > 0) {
1383 ip6s /= sizeof(struct in6_addr);
1384 if (ip6s > jail_max_af_ips) {
1385 error = EINVAL;
1386 vfs_opterror(opts, "too many IPv6 addresses");
1387 goto done_errmsg;
1388 }
1389 ip6 = prison_ip_copyin(PR_INET6, op, ip6s);
1390 if (ip6 == NULL) {
1391 error = EINVAL;
1392 goto done_free;
1393 }
1394 }
1395 }
1396 #endif
1397
1398 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
1399 if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1400 error = EINVAL;
1401 vfs_opterror(opts,
1402 "vnet jails cannot have IP address restrictions");
1403 goto done_errmsg;
1404 }
1405 #endif
1406
1407 error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
1408 if (error == ENOENT)
1409 osrelstr = NULL;
1410 else if (error != 0)
1411 goto done_free;
1412 else {
1413 if (flags & JAIL_UPDATE) {
1414 error = EINVAL;
1415 vfs_opterror(opts,
1416 "osrelease cannot be changed after creation");
1417 goto done_errmsg;
1418 }
1419 if (len == 0 || osrelstr[len - 1] != '\0') {
1420 error = EINVAL;
1421 goto done_free;
1422 }
1423 if (len >= OSRELEASELEN) {
1424 error = ENAMETOOLONG;
1425 vfs_opterror(opts,
1426 "osrelease string must be 1-%d bytes long",
1427 OSRELEASELEN - 1);
1428 goto done_errmsg;
1429 }
1430 }
1431
1432 error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
1433 if (error == ENOENT)
1434 osreldt = 0;
1435 else if (error != 0)
1436 goto done_free;
1437 else {
1438 if (flags & JAIL_UPDATE) {
1439 error = EINVAL;
1440 vfs_opterror(opts,
1441 "osreldate cannot be changed after creation");
1442 goto done_errmsg;
1443 }
1444 if (osreldt == 0) {
1445 error = EINVAL;
1446 vfs_opterror(opts, "osreldate cannot be 0");
1447 goto done_errmsg;
1448 }
1449 }
1450
1451 root = NULL;
1452 error = vfs_getopt(opts, "path", (void **)&path, &len);
1453 if (error == ENOENT)
1454 path = NULL;
1455 else if (error != 0)
1456 goto done_free;
1457 else {
1458 if (flags & JAIL_UPDATE) {
1459 error = EINVAL;
1460 vfs_opterror(opts,
1461 "path cannot be changed after creation");
1462 goto done_errmsg;
1463 }
1464 if (len == 0 || path[len - 1] != '\0') {
1465 error = EINVAL;
1466 goto done_free;
1467 }
1468 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path);
1469 error = namei(&nd);
1470 if (error)
1471 goto done_free;
1472 root = nd.ni_vp;
1473 NDFREE_PNBUF(&nd);
1474 g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1475 strlcpy(g_path, path, MAXPATHLEN);
1476 error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
1477 if (error == 0) {
1478 path = g_path;
1479 } else {
1480 /* exit on other errors */
1481 goto done_free;
1482 }
1483 if (root->v_type != VDIR) {
1484 error = ENOTDIR;
1485 vput(root);
1486 goto done_free;
1487 }
1488 VOP_UNLOCK(root);
1489 }
1490
1491 /*
1492 * Find the specified jail, or at least its parent.
1493 * This abuses the file error codes ENOENT and EEXIST.
1494 */
1495 pr = NULL;
1496 inspr = NULL;
1497 deadpr = NULL;
1498 maybe_changed = false;
1499 if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
1500 namelc = strrchr(name, '.');
1501 jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
1502 if (*p != '\0')
1503 jid = 0;
1504 }
1505 sx_xlock(&allprison_lock);
1506 drflags = PD_LIST_XLOCKED;
1507 ppr = mypr;
1508 if (!prison_isalive(ppr)) {
1509 /* This jail is dying. This process will surely follow. */
1510 error = EAGAIN;
1511 goto done_deref;
1512 }
1513 if (flags & JAIL_USE_DESC) {
1514 /* Get the jail from its descriptor. */
1515 error = jaildesc_find(td, jfd_in, &pr, &jdcred);
1516 if (error) {
1517 vfs_opterror(opts, error == ENOENT ?
1518 "descriptor to dead jail" :
1519 "not a jail descriptor");
1520 goto done_deref;
1521 }
1522 drflags |= PD_DEREF;
1523 error = priv_check_cred(jdcred, PRIV_JAIL_SET);
1524 if (error == 0 && (flags & JAIL_ATTACH))
1525 error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH);
1526 crfree(jdcred);
1527 if (error)
1528 goto done_deref;
1529 mtx_lock(&pr->pr_mtx);
1530 drflags |= PD_LOCKED;
1531 if (cuflags == JAIL_CREATE) {
1532 error = EEXIST;
1533 vfs_opterror(opts, "jail %d already exists",
1534 pr->pr_id);
1535 goto done_deref;
1536 }
1537 if (!prison_isalive(pr)) {
1538 /* While a jid can be resurrected, the prison
1539 * itself cannot.
1540 */
1541 error = ENOENT;
1542 vfs_opterror(opts, "jail %d is dying", pr->pr_id);
1543 goto done_deref;
1544 }
1545 if (jid != 0 && jid != pr->pr_id) {
1546 error = EINVAL;
1547 vfs_opterror(opts, "cannot change jid");
1548 goto done_deref;
1549 }
1550 jid = pr->pr_id;
1551 } else if (jid != 0) {
1552 if (jid < 0) {
1553 error = EINVAL;
1554 vfs_opterror(opts, "negative jid");
1555 goto done_deref;
1556 }
1557 /*
1558 * See if a requested jid already exists. Keep track of
1559 * where it can be inserted later.
1560 */
1561 TAILQ_FOREACH(inspr, &allprison, pr_list) {
1562 if (inspr->pr_id < jid)
1563 continue;
1564 if (inspr->pr_id > jid)
1565 break;
1566 if (prison_isalive(inspr)) {
1567 pr = inspr;
1568 mtx_lock(&pr->pr_mtx);
1569 drflags |= PD_LOCKED;
1570 } else {
1571 /* Note a dying jail to handle later. */
1572 deadpr = inspr;
1573 }
1574 inspr = NULL;
1575 break;
1576 }
1577 if (cuflags == JAIL_CREATE && pr != NULL) {
1578 /*
1579 * Even creators that cannot see the jail will
1580 * get EEXIST.
1581 */
1582 error = EEXIST;
1583 vfs_opterror(opts, "jail %d already exists", jid);
1584 goto done_deref;
1585 }
1586 if ((pr == NULL)
1587 ? cuflags == JAIL_UPDATE
1588 : !prison_ischild(mypr, pr)) {
1589 /*
1590 * Updaters get ENOENT for nonexistent jails,
1591 * or for jails they cannot see. The latter
1592 * case is true even for CREATE | UPDATE,
1593 * which normally cannot give this error.
1594 */
1595 error = ENOENT;
1596 vfs_opterror(opts, "jail %d not found", jid);
1597 goto done_deref;
1598 }
1599 }
1600 /*
1601 * If the caller provided a name, look for a jail by that name.
1602 * This has different semantics for creates and updates keyed by jid
1603 * (where the name must not already exist in a different jail),
1604 * and updates keyed by the name itself (where the name must exist
1605 * because that is the jail being updated).
1606 */
1607 namelc = NULL;
1608 if (name != NULL) {
1609 namelc = strrchr(name, '.');
1610 if (namelc == NULL)
1611 namelc = name;
1612 else {
1613 /*
1614 * This is a hierarchical name. Split it into the
1615 * parent and child names, and make sure the parent
1616 * exists or matches an already found jail.
1617 */
1618 if (pr != NULL) {
1619 if (strncmp(name, ppr->pr_name, namelc - name)
1620 || ppr->pr_name[namelc - name] != '\0') {
1621 error = EINVAL;
1622 vfs_opterror(opts,
1623 "cannot change jail's parent");
1624 goto done_deref;
1625 }
1626 } else {
1627 *namelc = '\0';
1628 ppr = prison_find_name(mypr, name);
1629 if (ppr == NULL) {
1630 error = ENOENT;
1631 vfs_opterror(opts,
1632 "jail \"%s\" not found", name);
1633 goto done_deref;
1634 }
1635 mtx_unlock(&ppr->pr_mtx);
1636 if (!prison_isalive(ppr)) {
1637 error = ENOENT;
1638 vfs_opterror(opts,
1639 "jail \"%s\" is dying", name);
1640 goto done_deref;
1641 }
1642 *namelc = '.';
1643 }
1644 namelc++;
1645 }
1646 if (namelc[0] != '\0') {
1647 pnamelen =
1648 (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1649 FOREACH_PRISON_CHILD(ppr, tpr) {
1650 if (tpr == pr || !prison_isalive(tpr) ||
1651 strcmp(tpr->pr_name + pnamelen, namelc))
1652 continue;
1653 if (cuflags == JAIL_CREATE || pr != NULL) {
1654 /*
1655 * Create, or update(jid): name must
1656 * not exist in an active sibling jail.
1657 */
1658 error = EEXIST;
1659 vfs_opterror(opts,
1660 "jail \"%s\" already exists", name);
1661 goto done_deref;
1662 }
1663 /* Use this jail for updates. */
1664 pr = tpr;
1665 mtx_lock(&pr->pr_mtx);
1666 drflags |= PD_LOCKED;
1667 break;
1668 }
1669 /*
1670 * Update: name must exist if no jid is specified.
1671 * As with the jid case, the jail must be currently
1672 * visible, or else even CREATE | UPDATE will get
1673 * an error.
1674 */
1675 if ((pr == NULL)
1676 ? cuflags == JAIL_UPDATE
1677 : !prison_isalive(pr)) {
1678 error = ENOENT;
1679 vfs_opterror(opts, "jail \"%s\" not found",
1680 name);
1681 goto done_deref;
1682 }
1683 }
1684 }
1685 /* Update: must provide a desc, jid, or name. */
1686 else if (cuflags == JAIL_UPDATE && pr == NULL) {
1687 error = ENOENT;
1688 vfs_opterror(opts, "update specified no jail");
1689 goto done_deref;
1690 }
1691
1692 /* If there's no prison to update, create a new one and link it in. */
1693 created = pr == NULL;
1694 if (created) {
1695 for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1696 if (tpr->pr_childcount >= tpr->pr_childmax) {
1697 error = EPERM;
1698 vfs_opterror(opts, "prison limit exceeded");
1699 goto done_deref;
1700 }
1701
1702 if (deadpr != NULL) {
1703 /*
1704 * The prison being created has the same ID as a dying
1705 * one. Handle this by giving the dying jail a new ID.
1706 * This may cause some confusion to user space, but
1707 * only to those listing dying jails.
1708 */
1709 deadid = get_next_deadid(&dinspr);
1710 if (deadid == 0) {
1711 error = EAGAIN;
1712 vfs_opterror(opts, "no available jail IDs");
1713 goto done_deref;
1714 }
1715 mtx_lock(&deadpr->pr_mtx);
1716 deadpr->pr_id = deadid;
1717 mtx_unlock(&deadpr->pr_mtx);
1718 if (dinspr == deadpr)
1719 inspr = deadpr;
1720 else {
1721 inspr = TAILQ_NEXT(deadpr, pr_list);
1722 TAILQ_REMOVE(&allprison, deadpr, pr_list);
1723 if (dinspr != NULL)
1724 TAILQ_INSERT_AFTER(&allprison, dinspr,
1725 deadpr, pr_list);
1726 else
1727 TAILQ_INSERT_HEAD(&allprison, deadpr,
1728 pr_list);
1729 }
1730 }
1731 if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) {
1732 error = EAGAIN;
1733 vfs_opterror(opts, "no available jail IDs");
1734 goto done_deref;
1735 }
1736
1737 pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1738 pr->pr_state = PRISON_STATE_INVALID;
1739 refcount_init(&pr->pr_ref, 1);
1740 refcount_init(&pr->pr_uref, 0);
1741 drflags |= PD_DEREF;
1742 LIST_INIT(&pr->pr_children);
1743 mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1744 TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
1745
1746 pr->pr_id = jid;
1747 if (inspr != NULL)
1748 TAILQ_INSERT_BEFORE(inspr, pr, pr_list);
1749 else
1750 TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1751
1752 pr->pr_parent = ppr;
1753 prison_hold(ppr);
1754 prison_proc_hold(ppr);
1755 LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1756 for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1757 tpr->pr_childcount++;
1758 pr->pr_klist = knlist_alloc(&pr->pr_mtx);
1759
1760 /* Set some default values, and inherit some from the parent. */
1761 if (namelc == NULL)
1762 namelc = "";
1763 if (path == NULL) {
1764 path = "/";
1765 root = mypr->pr_root;
1766 vref(root);
1767 }
1768 strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1769 pr->pr_flags |= PR_HOST;
1770 #if defined(INET) || defined(INET6)
1771 #ifdef VIMAGE
1772 if (!(pr_flags & PR_VNET))
1773 #endif
1774 {
1775 #ifdef INET
1776 if (!(ch_flags & PR_IP4_USER))
1777 pr->pr_flags |= PR_IP4 | PR_IP4_USER;
1778 else if (!(pr_flags & PR_IP4_USER)) {
1779 pr->pr_flags |= ppr->pr_flags & PR_IP4;
1780 prison_ip_dup(ppr, pr, PR_INET);
1781 }
1782 #endif
1783 #ifdef INET6
1784 if (!(ch_flags & PR_IP6_USER))
1785 pr->pr_flags |= PR_IP6 | PR_IP6_USER;
1786 else if (!(pr_flags & PR_IP6_USER)) {
1787 pr->pr_flags |= ppr->pr_flags & PR_IP6;
1788 prison_ip_dup(ppr, pr, PR_INET6);
1789 }
1790 #endif
1791 }
1792 #endif
1793 /* Source address selection is always on by default. */
1794 pr->pr_flags |= _PR_IP_SADDRSEL;
1795
1796 pr->pr_securelevel = ppr->pr_securelevel;
1797 pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1798 pr->pr_enforce_statfs = jail_default_enforce_statfs;
1799 pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
1800
1801 pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
1802 if (osrelstr == NULL)
1803 strlcpy(pr->pr_osrelease, ppr->pr_osrelease,
1804 sizeof(pr->pr_osrelease));
1805 else
1806 strlcpy(pr->pr_osrelease, osrelstr,
1807 sizeof(pr->pr_osrelease));
1808
1809 #ifdef VIMAGE
1810 /*
1811 * Allocate a new vnet if specified.
1812 *
1813 * Set PR_VNET now if so, so that the vnet is disposed of
1814 * properly when the jail is destroyed.
1815 */
1816 if (pr_flags & PR_VNET) {
1817 pr->pr_flags |= PR_VNET;
1818 pr->pr_vnet = vnet_alloc();
1819 } else {
1820 pr->pr_vnet = ppr->pr_vnet;
1821 }
1822 #endif
1823 /*
1824 * Allocate a dedicated cpuset for each jail.
1825 * Unlike other initial settings, this may return an error.
1826 */
1827 error = cpuset_create_root(ppr, &pr->pr_cpuset);
1828 if (error)
1829 goto done_deref;
1830
1831 mtx_lock(&pr->pr_mtx);
1832 drflags |= PD_LOCKED;
1833 } else {
1834 /*
1835 * Grab a reference for existing prisons, to ensure they
1836 * continue to exist for the duration of the call.
1837 */
1838 if (!(drflags & PD_DEREF)) {
1839 prison_hold(pr);
1840 drflags |= PD_DEREF;
1841 }
1842 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
1843 if ((pr->pr_flags & PR_VNET) &&
1844 (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1845 error = EINVAL;
1846 vfs_opterror(opts,
1847 "vnet jails cannot have IP address restrictions");
1848 goto done_deref;
1849 }
1850 #endif
1851 #ifdef INET
1852 if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1853 error = EINVAL;
1854 vfs_opterror(opts,
1855 "ip4 cannot be changed after creation");
1856 goto done_deref;
1857 }
1858 #endif
1859 #ifdef INET6
1860 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1861 error = EINVAL;
1862 vfs_opterror(opts,
1863 "ip6 cannot be changed after creation");
1864 goto done_deref;
1865 }
1866 #endif
1867 }
1868
1869 /* Do final error checking before setting anything. */
1870 if (gotslevel) {
1871 if (slevel < ppr->pr_securelevel) {
1872 error = EPERM;
1873 goto done_deref;
1874 }
1875 }
1876 if (gotchildmax) {
1877 if (childmax >= ppr->pr_childmax) {
1878 error = EPERM;
1879 goto done_deref;
1880 }
1881 }
1882 if (gotenforce) {
1883 if (enforce < ppr->pr_enforce_statfs) {
1884 error = EPERM;
1885 goto done_deref;
1886 }
1887 }
1888 if (gotrsnum) {
1889 /*
1890 * devfs_rsnum is a uint16_t
1891 */
1892 if (rsnum < 0 || rsnum > 65535) {
1893 error = EINVAL;
1894 goto done_deref;
1895 }
1896 /*
1897 * Nested jails always inherit parent's devfs ruleset
1898 */
1899 if (jailed(td->td_ucred)) {
1900 if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
1901 error = EPERM;
1902 goto done_deref;
1903 } else
1904 rsnum = ppr->pr_devfs_rsnum;
1905 }
1906 }
1907 #ifdef INET
1908 if (ip4s > 0) {
1909 if ((ppr->pr_flags & PR_IP4) &&
1910 !prison_ip_parent_match(ppr->pr_addrs[PR_INET], ip4,
1911 PR_INET)) {
1912 error = EPERM;
1913 goto done_deref;
1914 }
1915 if (!prison_ip_conflict_check(ppr, pr, ip4, PR_INET)) {
1916 error = EADDRINUSE;
1917 vfs_opterror(opts, "IPv4 addresses clash");
1918 goto done_deref;
1919 }
1920 }
1921 #endif
1922 #ifdef INET6
1923 if (ip6s > 0) {
1924 if ((ppr->pr_flags & PR_IP6) &&
1925 !prison_ip_parent_match(ppr->pr_addrs[PR_INET6], ip6,
1926 PR_INET6)) {
1927 error = EPERM;
1928 goto done_deref;
1929 }
1930 if (!prison_ip_conflict_check(ppr, pr, ip6, PR_INET6)) {
1931 error = EADDRINUSE;
1932 vfs_opterror(opts, "IPv6 addresses clash");
1933 goto done_deref;
1934 }
1935 }
1936 #endif
1937 onamelen = namelen = 0;
1938 if (namelc != NULL) {
1939 /* Give a default name of the jid. Also allow the name to be
1940 * explicitly the jid - but not any other number, and only in
1941 * normal form (no leading zero/etc).
1942 */
1943 if (namelc[0] == '\0')
1944 snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
1945 else if ((strtoul(namelc, &p, 10) != jid ||
1946 namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
1947 error = EINVAL;
1948 vfs_opterror(opts,
1949 "name cannot be numeric (unless it is the jid)");
1950 goto done_deref;
1951 }
1952 /*
1953 * Make sure the name isn't too long for the prison or its
1954 * children.
1955 */
1956 pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1957 onamelen = strlen(pr->pr_name + pnamelen);
1958 namelen = strlen(namelc);
1959 if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
1960 error = ENAMETOOLONG;
1961 goto done_deref;
1962 }
1963 FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1964 if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1965 sizeof(pr->pr_name)) {
1966 error = ENAMETOOLONG;
1967 goto done_deref;
1968 }
1969 }
1970 }
1971 pr_allow_diff = pr_allow & ~ppr->pr_allow;
1972 if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) {
1973 error = EPERM;
1974 goto done_deref;
1975 }
1976
1977 /*
1978 * Let modules check their parameters. This requires unlocking and
1979 * then re-locking the prison, but this is still a valid state as long
1980 * as allprison_lock remains xlocked.
1981 */
1982 mtx_unlock(&pr->pr_mtx);
1983 drflags &= ~PD_LOCKED;
1984 error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
1985 if (error != 0)
1986 goto done_deref;
1987 mtx_lock(&pr->pr_mtx);
1988 drflags |= PD_LOCKED;
1989
1990 /* At this point, all valid parameters should have been noted. */
1991 TAILQ_FOREACH(opt, opts, link) {
1992 if (!opt->seen && strcmp(opt->name, "errmsg")) {
1993 error = EINVAL;
1994 vfs_opterror(opts, "unknown parameter: %s", opt->name);
1995 goto done_deref;
1996 }
1997 }
1998 maybe_changed = true;
1999
2000 /* Set the parameters of the prison. */
2001 #ifdef INET
2002 redo_ip4 = false;
2003 if (pr_flags & PR_IP4_USER) {
2004 pr->pr_flags |= PR_IP4;
2005 prison_ip_set(pr, PR_INET, ip4);
2006 ip4 = NULL;
2007 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2008 #ifdef VIMAGE
2009 if (tpr->pr_flags & PR_VNET) {
2010 descend = 0;
2011 continue;
2012 }
2013 #endif
2014 if (!prison_ip_restrict(tpr, PR_INET, NULL)) {
2015 redo_ip4 = true;
2016 descend = 0;
2017 }
2018 }
2019 }
2020 #endif
2021 #ifdef INET6
2022 redo_ip6 = false;
2023 if (pr_flags & PR_IP6_USER) {
2024 pr->pr_flags |= PR_IP6;
2025 prison_ip_set(pr, PR_INET6, ip6);
2026 ip6 = NULL;
2027 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2028 #ifdef VIMAGE
2029 if (tpr->pr_flags & PR_VNET) {
2030 descend = 0;
2031 continue;
2032 }
2033 #endif
2034 if (!prison_ip_restrict(tpr, PR_INET6, NULL)) {
2035 redo_ip6 = true;
2036 descend = 0;
2037 }
2038 }
2039 }
2040 #endif
2041 if (gotslevel) {
2042 pr->pr_securelevel = slevel;
2043 /* Set all child jails to be at least this level. */
2044 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
2045 if (tpr->pr_securelevel < slevel)
2046 tpr->pr_securelevel = slevel;
2047 }
2048 if (gotchildmax) {
2049 pr->pr_childmax = childmax;
2050 /* Set all child jails to under this limit. */
2051 FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
2052 if (tpr->pr_childmax > childmax - level)
2053 tpr->pr_childmax = childmax > level
2054 ? childmax - level : 0;
2055 }
2056 if (gotenforce) {
2057 pr->pr_enforce_statfs = enforce;
2058 /* Pass this restriction on to the children. */
2059 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
2060 if (tpr->pr_enforce_statfs < enforce)
2061 tpr->pr_enforce_statfs = enforce;
2062 }
2063 if (gotrsnum) {
2064 pr->pr_devfs_rsnum = rsnum;
2065 /* Pass this restriction on to the children. */
2066 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
2067 tpr->pr_devfs_rsnum = rsnum;
2068 }
2069 if (namelc != NULL) {
2070 if (ppr == &prison0)
2071 strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
2072 else
2073 snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
2074 ppr->pr_name, namelc);
2075 /* Change this component of child names. */
2076 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2077 bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
2078 strlen(tpr->pr_name + onamelen) + 1);
2079 bcopy(pr->pr_name, tpr->pr_name, namelen);
2080 }
2081 }
2082 if (path != NULL) {
2083 /* Try to keep a real-rooted full pathname. */
2084 strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
2085 pr->pr_root = root;
2086 root = NULL;
2087 }
2088 if (PR_HOST & ch_flags & ~pr_flags) {
2089 if (pr->pr_flags & PR_HOST) {
2090 /*
2091 * Copy the parent's host info. As with pr_ip4 above,
2092 * the lack of a lock on the parent is not a problem;
2093 * it is always set with allprison_lock at least
2094 * shared, and is held exclusively here.
2095 */
2096 strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
2097 sizeof(pr->pr_hostname));
2098 strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
2099 sizeof(pr->pr_domainname));
2100 strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
2101 sizeof(pr->pr_hostuuid));
2102 pr->pr_hostid = pr->pr_parent->pr_hostid;
2103 }
2104 } else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
2105 /* Set this prison, and any descendants without PR_HOST. */
2106 if (host != NULL)
2107 strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
2108 if (domain != NULL)
2109 strlcpy(pr->pr_domainname, domain,
2110 sizeof(pr->pr_domainname));
2111 if (uuid != NULL)
2112 strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
2113 if (gothid)
2114 pr->pr_hostid = hid;
2115 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2116 if (tpr->pr_flags & PR_HOST)
2117 descend = 0;
2118 else {
2119 if (host != NULL)
2120 strlcpy(tpr->pr_hostname,
2121 pr->pr_hostname,
2122 sizeof(tpr->pr_hostname));
2123 if (domain != NULL)
2124 strlcpy(tpr->pr_domainname,
2125 pr->pr_domainname,
2126 sizeof(tpr->pr_domainname));
2127 if (uuid != NULL)
2128 strlcpy(tpr->pr_hostuuid,
2129 pr->pr_hostuuid,
2130 sizeof(tpr->pr_hostuuid));
2131 if (gothid)
2132 tpr->pr_hostid = hid;
2133 }
2134 }
2135 }
2136 pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
2137 if ((tallow = ch_allow & ~pr_allow))
2138 prison_set_allow_locked(pr, tallow, 0);
2139 /*
2140 * Persistent prisons get an extra reference, and prisons losing their
2141 * persist flag lose that reference.
2142 */
2143 if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) {
2144 if (pr_flags & PR_PERSIST) {
2145 prison_hold(pr);
2146 /*
2147 * This may be a new prison's first user reference,
2148 * but wait to call it alive until after OSD calls
2149 * have had a chance to run (and perhaps to fail).
2150 */
2151 refcount_acquire(&pr->pr_uref);
2152 } else {
2153 drflags |= PD_DEUREF;
2154 prison_free_not_last(pr);
2155 }
2156 }
2157 pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
2158 mtx_unlock(&pr->pr_mtx);
2159 drflags &= ~PD_LOCKED;
2160 /*
2161 * Any errors past this point will need to de-persist newly created
2162 * prisons, as well as call remove methods.
2163 */
2164 if (created)
2165 drflags |= PD_KILL;
2166
2167 #ifdef RACCT
2168 if (racct_enable && created)
2169 prison_racct_attach(pr);
2170 #endif
2171
2172 /* Locks may have prevented a complete restriction of child IP
2173 * addresses. If so, allocate some more memory and try again.
2174 */
2175 #ifdef INET
2176 while (redo_ip4) {
2177 ip4s = pr->pr_addrs[PR_INET]->ips;
2178 MPASS(ip4 == NULL);
2179 ip4 = prison_ip_alloc(PR_INET, ip4s, M_WAITOK);
2180 mtx_lock(&pr->pr_mtx);
2181 redo_ip4 = false;
2182 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2183 #ifdef VIMAGE
2184 if (tpr->pr_flags & PR_VNET) {
2185 descend = 0;
2186 continue;
2187 }
2188 #endif
2189 if (!prison_ip_restrict(tpr, PR_INET, &ip4))
2190 redo_ip4 = true;
2191 }
2192 mtx_unlock(&pr->pr_mtx);
2193 }
2194 #endif
2195 #ifdef INET6
2196 while (redo_ip6) {
2197 ip6s = pr->pr_addrs[PR_INET6]->ips;
2198 MPASS(ip6 == NULL);
2199 ip6 = prison_ip_alloc(PR_INET6, ip6s, M_WAITOK);
2200 mtx_lock(&pr->pr_mtx);
2201 redo_ip6 = false;
2202 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2203 #ifdef VIMAGE
2204 if (tpr->pr_flags & PR_VNET) {
2205 descend = 0;
2206 continue;
2207 }
2208 #endif
2209 if (!prison_ip_restrict(tpr, PR_INET6, &ip6))
2210 redo_ip6 = true;
2211 }
2212 mtx_unlock(&pr->pr_mtx);
2213 }
2214 #endif
2215
2216 /* Let the modules do their work. */
2217 if (created) {
2218 error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
2219 if (error)
2220 goto done_deref;
2221 }
2222 error = osd_jail_call(pr, PR_METHOD_SET, opts);
2223 if (error)
2224 goto done_deref;
2225
2226 /*
2227 * A new prison is now ready to be seen; either it has gained a user
2228 * reference via persistence, or is about to gain one via attachment.
2229 */
2230 if (created) {
2231 sx_assert(&allprison_lock, SX_XLOCKED);
2232 prison_knote(ppr, NOTE_JAIL_CHILD | pr->pr_id);
2233 mtx_lock(&pr->pr_mtx);
2234 drflags |= PD_LOCKED;
2235 pr->pr_state = PRISON_STATE_ALIVE;
2236 }
2237
2238 /* Attach this process to the prison if requested. */
2239 if (flags & JAIL_ATTACH) {
2240 error = do_jail_attach(td, pr,
2241 prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS));
2242 drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED);
2243 if (error) {
2244 vfs_opterror(opts, "attach failed");
2245 goto done_deref;
2246 }
2247 }
2248
2249 #ifdef RACCT
2250 if (racct_enable && !created) {
2251 if (drflags & PD_LOCKED) {
2252 mtx_unlock(&pr->pr_mtx);
2253 drflags &= ~PD_LOCKED;
2254 }
2255 if (drflags & PD_LIST_XLOCKED) {
2256 sx_xunlock(&allprison_lock);
2257 drflags &= ~PD_LIST_XLOCKED;
2258 }
2259 prison_racct_modify(pr);
2260 }
2261 #endif
2262
2263 if (created && pr != &prison0 && (pr->pr_allow & PR_ALLOW_NFSD) != 0 &&
2264 (pr->pr_root->v_vflag & VV_ROOT) == 0)
2265 printf("Warning jail jid=%d: mountd/nfsd requires a separate"
2266 " file system\n", pr->pr_id);
2267
2268 /*
2269 * Now that the prison is fully created without error, set the
2270 * jail descriptor if one was requested. This is the only
2271 * parameter that is returned to the caller (except the error
2272 * message).
2273 */
2274 if (jfd_out >= 0) {
2275 if (!(drflags & PD_LOCKED)) {
2276 mtx_lock(&pr->pr_mtx);
2277 drflags |= PD_LOCKED;
2278 }
2279 jfd_pos = 2 * vfs_getopt_pos(opts, "desc") + 1;
2280 if (optuio->uio_segflg == UIO_SYSSPACE)
2281 *(int*)optuio->uio_iov[jfd_pos].iov_base = jfd_out;
2282 else
2283 (void)copyout(&jfd_out,
2284 optuio->uio_iov[jfd_pos].iov_base, sizeof(jfd_out));
2285 jaildesc_set_prison(jfp_out, pr);
2286 }
2287
2288 drflags &= ~PD_KILL;
2289 td->td_retval[0] = pr->pr_id;
2290
2291 done_deref:
2292 /*
2293 * Report changes to kevent. This can happen even if the
2294 * system call fails, as changes might have been made before
2295 * the failure.
2296 */
2297 if (maybe_changed && !created)
2298 prison_knote(pr, NOTE_JAIL_SET);
2299 /* Release any temporary prison holds and/or locks. */
2300 if (pr != NULL)
2301 prison_deref(pr, drflags);
2302 else if (drflags & PD_LIST_SLOCKED)
2303 sx_sunlock(&allprison_lock);
2304 else if (drflags & PD_LIST_XLOCKED)
2305 sx_xunlock(&allprison_lock);
2306 if (root != NULL)
2307 vrele(root);
2308 done_errmsg:
2309 if (error) {
2310 /* Write the error message back to userspace. */
2311 if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
2312 &errmsg_len) == 0 && errmsg_len > 0) {
2313 errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
2314 if (optuio->uio_segflg == UIO_SYSSPACE)
2315 bcopy(errmsg,
2316 optuio->uio_iov[errmsg_pos].iov_base,
2317 errmsg_len);
2318 else
2319 (void)copyout(errmsg,
2320 optuio->uio_iov[errmsg_pos].iov_base,
2321 errmsg_len);
2322 }
2323 }
2324 done_free:
2325 /* Clean up other resources. */
2326 #ifdef INET
2327 prison_ip_free(ip4);
2328 #endif
2329 #ifdef INET6
2330 prison_ip_free(ip6);
2331 #endif
2332 if (jfp_out != NULL)
2333 fdrop(jfp_out, td);
2334 if (error && jfd_out >= 0)
2335 (void)kern_close(td, jfd_out);
2336 if (g_path != NULL)
2337 free(g_path, M_TEMP);
2338 if (opts != NULL)
2339 vfs_freeopts(opts);
2340 prison_free(mypr);
2341 return (error);
2342 }
2343
2344 /*
2345 * Find the next available prison ID. Return the ID on success, or zero
2346 * on failure. Also set a pointer to the allprison list entry the prison
2347 * should be inserted before.
2348 */
2349 static int
get_next_prid(struct prison ** insprp)2350 get_next_prid(struct prison **insprp)
2351 {
2352 struct prison *inspr;
2353 int jid, maxid;
2354
2355 jid = lastprid % JAIL_MAX + 1;
2356 if (TAILQ_EMPTY(&allprison) ||
2357 TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) {
2358 /*
2359 * A common case is for all jails to be implicitly numbered,
2360 * which means they'll go on the end of the list, at least
2361 * for the first JAIL_MAX times.
2362 */
2363 inspr = NULL;
2364 } else {
2365 /*
2366 * Take two passes through the allprison list: first starting
2367 * with the proposed jid, then ending with it.
2368 */
2369 for (maxid = JAIL_MAX; maxid != 0; ) {
2370 TAILQ_FOREACH(inspr, &allprison, pr_list) {
2371 if (inspr->pr_id < jid)
2372 continue;
2373 if (inspr->pr_id > jid) {
2374 /* Found an opening. */
2375 maxid = 0;
2376 break;
2377 }
2378 if (++jid > maxid) {
2379 if (lastprid == maxid || lastprid == 0)
2380 {
2381 /*
2382 * The entire legal range
2383 * has been traversed
2384 */
2385 return 0;
2386 }
2387 /* Try again from the start. */
2388 jid = 1;
2389 maxid = lastprid;
2390 break;
2391 }
2392 }
2393 if (inspr == NULL) {
2394 /* Found room at the end of the list. */
2395 break;
2396 }
2397 }
2398 }
2399 *insprp = inspr;
2400 lastprid = jid;
2401 return (jid);
2402 }
2403
2404 /*
2405 * Find the next available ID for a renumbered dead prison. This is the same
2406 * as get_next_prid, but counting backward from the end of the range.
2407 */
2408 static int
get_next_deadid(struct prison ** dinsprp)2409 get_next_deadid(struct prison **dinsprp)
2410 {
2411 struct prison *dinspr;
2412 int deadid, minid;
2413
2414 deadid = lastdeadid ? lastdeadid - 1 : JAIL_MAX;
2415 /*
2416 * Take two reverse passes through the allprison list: first
2417 * starting with the proposed deadid, then ending with it.
2418 */
2419 for (minid = 1; minid != 0; ) {
2420 TAILQ_FOREACH_REVERSE(dinspr, &allprison, prisonlist, pr_list) {
2421 if (dinspr->pr_id > deadid)
2422 continue;
2423 if (dinspr->pr_id < deadid) {
2424 /* Found an opening. */
2425 minid = 0;
2426 break;
2427 }
2428 if (--deadid < minid) {
2429 if (lastdeadid == minid || lastdeadid == 0)
2430 {
2431 /*
2432 * The entire legal range
2433 * has been traversed
2434 */
2435 return 0;
2436 }
2437 /* Try again from the end. */
2438 deadid = JAIL_MAX;
2439 minid = lastdeadid;
2440 break;
2441 }
2442 }
2443 if (dinspr == NULL) {
2444 /* Found room at the beginning of the list. */
2445 break;
2446 }
2447 }
2448 *dinsprp = dinspr;
2449 lastdeadid = deadid;
2450 return (deadid);
2451 }
2452
2453 /*
2454 * struct jail_get_args {
2455 * struct iovec *iovp;
2456 * unsigned int iovcnt;
2457 * int flags;
2458 * };
2459 */
2460 int
sys_jail_get(struct thread * td,struct jail_get_args * uap)2461 sys_jail_get(struct thread *td, struct jail_get_args *uap)
2462 {
2463 struct uio *auio;
2464 int error;
2465
2466 /* Check that we have an even number of iovecs. */
2467 if (uap->iovcnt & 1)
2468 return (EINVAL);
2469
2470 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
2471 if (error)
2472 return (error);
2473 error = kern_jail_get(td, auio, uap->flags);
2474 if (error == 0)
2475 error = copyout(auio->uio_iov, uap->iovp,
2476 uap->iovcnt * sizeof(struct iovec));
2477 freeuio(auio);
2478 return (error);
2479 }
2480
2481 int
kern_jail_get(struct thread * td,struct uio * optuio,int flags)2482 kern_jail_get(struct thread *td, struct uio *optuio, int flags)
2483 {
2484 struct bool_flags *bf;
2485 struct file *jfp_out;
2486 struct jailsys_flags *jsf;
2487 struct prison *pr, *mypr;
2488 struct vfsopt *opt;
2489 struct vfsoptlist *opts;
2490 char *errmsg, *name;
2491 int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos;
2492 int jfd_in, jfd_out;
2493 unsigned f;
2494
2495 if (flags & ~JAIL_GET_MASK)
2496 return (EINVAL);
2497 if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) ==
2498 (JAIL_USE_DESC | JAIL_AT_DESC))
2499 return (EINVAL);
2500
2501 /* Get the parameter list. */
2502 error = vfs_buildopts(optuio, &opts);
2503 if (error)
2504 return (error);
2505 errmsg_pos = vfs_getopt_pos(opts, "errmsg");
2506 mypr = td->td_ucred->cr_prison;
2507 prison_hold(mypr);
2508 pr = NULL;
2509 jfp_out = NULL;
2510 jfd_out = -1;
2511
2512 /*
2513 * Find the prison specified by one of: desc, lastjid, jid, name.
2514 */
2515 sx_slock(&allprison_lock);
2516 drflags = PD_LIST_SLOCKED;
2517
2518 error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in));
2519 if (error == ENOENT) {
2520 if (flags & (JAIL_AT_DESC | JAIL_GET_DESC | JAIL_OWN_DESC)) {
2521 vfs_opterror(opts, "missing desc");
2522 goto done;
2523 }
2524 } else if (error == 0) {
2525 if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC |
2526 JAIL_OWN_DESC))) {
2527 error = EINVAL;
2528 vfs_opterror(opts, "unexpected desc");
2529 goto done;
2530 }
2531 if (flags & JAIL_USE_DESC) {
2532 /* Get the jail from its descriptor. */
2533 error = jaildesc_find(td, jfd_in, &pr, NULL);
2534 if (error) {
2535 vfs_opterror(opts, error == ENOENT ?
2536 "descriptor to dead jail" :
2537 "not a jail descriptor");
2538 goto done;
2539 }
2540 drflags |= PD_DEREF;
2541 mtx_lock(&pr->pr_mtx);
2542 drflags |= PD_LOCKED;
2543 if (!(prison_isalive(pr) || (flags & JAIL_DYING))) {
2544 error = ENOENT;
2545 vfs_opterror(opts, "jail %d is dying",
2546 pr->pr_id);
2547 goto done;
2548 }
2549 goto found_prison;
2550 }
2551 if (flags & JAIL_AT_DESC) {
2552 /* Look up jails based on the descriptor's prison. */
2553 prison_free(mypr);
2554 error = jaildesc_find(td, jfd_in, &mypr, NULL);
2555 if (error != 0) {
2556 vfs_opterror(opts, error == ENOENT ?
2557 "descriptor to dead jail" :
2558 "not a jail descriptor");
2559 goto done;
2560 }
2561 }
2562 if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) {
2563 /* Allocate a jail descriptor to return later. */
2564 error = jaildesc_alloc(td, &jfp_out, &jfd_out,
2565 flags & JAIL_OWN_DESC);
2566 if (error)
2567 goto done;
2568 }
2569 } else
2570 goto done;
2571
2572 error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
2573 if (error == 0) {
2574 TAILQ_FOREACH(pr, &allprison, pr_list) {
2575 if (pr->pr_id > jid &&
2576 ((flags & JAIL_DYING) || prison_isalive(pr)) &&
2577 prison_ischild(mypr, pr)) {
2578 mtx_lock(&pr->pr_mtx);
2579 drflags |= PD_LOCKED;
2580 goto found_prison;
2581 }
2582 }
2583 error = ENOENT;
2584 vfs_opterror(opts, "no jail after %d", jid);
2585 goto done;
2586 } else if (error != ENOENT)
2587 goto done;
2588
2589 error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
2590 if (error == 0) {
2591 if (jid != 0) {
2592 pr = prison_find_child(mypr, jid);
2593 if (pr != NULL) {
2594 drflags |= PD_LOCKED;
2595 if (!(prison_isalive(pr) ||
2596 (flags & JAIL_DYING))) {
2597 error = ENOENT;
2598 vfs_opterror(opts, "jail %d is dying",
2599 jid);
2600 goto done;
2601 }
2602 goto found_prison;
2603 }
2604 error = ENOENT;
2605 vfs_opterror(opts, "jail %d not found", jid);
2606 goto done;
2607 }
2608 } else if (error != ENOENT)
2609 goto done;
2610
2611 error = vfs_getopt(opts, "name", (void **)&name, &len);
2612 if (error == 0) {
2613 if (len == 0 || name[len - 1] != '\0') {
2614 error = EINVAL;
2615 goto done;
2616 }
2617 pr = prison_find_name(mypr, name);
2618 if (pr != NULL) {
2619 drflags |= PD_LOCKED;
2620 if (!(prison_isalive(pr) || (flags & JAIL_DYING))) {
2621 error = ENOENT;
2622 vfs_opterror(opts, "jail \"%s\" is dying",
2623 name);
2624 goto done;
2625 }
2626 goto found_prison;
2627 }
2628 error = ENOENT;
2629 vfs_opterror(opts, "jail \"%s\" not found", name);
2630 goto done;
2631 } else if (error != ENOENT)
2632 goto done;
2633
2634 vfs_opterror(opts, "no jail specified");
2635 error = ENOENT;
2636 goto done;
2637
2638 found_prison:
2639 /* Get the parameters of the prison. */
2640 if (!(drflags & PD_DEREF)) {
2641 prison_hold(pr);
2642 drflags |= PD_DEREF;
2643 }
2644 td->td_retval[0] = pr->pr_id;
2645 if (jfd_out >= 0) {
2646 error = vfs_setopt(opts, "desc", &jfd_out, sizeof(jfd_out));
2647 if (error != 0 && error != ENOENT)
2648 goto done;
2649 jaildesc_set_prison(jfp_out, pr);
2650 }
2651 error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
2652 if (error != 0 && error != ENOENT)
2653 goto done;
2654 i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
2655 error = vfs_setopt(opts, "parent", &i, sizeof(i));
2656 if (error != 0 && error != ENOENT)
2657 goto done;
2658 error = vfs_setopts(opts, "name", prison_name(mypr, pr));
2659 if (error != 0 && error != ENOENT)
2660 goto done;
2661 error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
2662 sizeof(pr->pr_cpuset->cs_id));
2663 if (error != 0 && error != ENOENT)
2664 goto done;
2665 error = vfs_setopts(opts, "path", prison_path(mypr, pr));
2666 if (error != 0 && error != ENOENT)
2667 goto done;
2668 #ifdef INET
2669 error = vfs_setopt_part(opts, "ip4.addr", pr->pr_addrs[PR_INET]->pr_ip,
2670 pr->pr_addrs[PR_INET] ? pr->pr_addrs[PR_INET]->ips *
2671 pr_families[PR_INET].size : 0 );
2672 if (error != 0 && error != ENOENT)
2673 goto done;
2674 #endif
2675 #ifdef INET6
2676 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_addrs[PR_INET6]->pr_ip,
2677 pr->pr_addrs[PR_INET6] ? pr->pr_addrs[PR_INET6]->ips *
2678 pr_families[PR_INET6].size : 0 );
2679 if (error != 0 && error != ENOENT)
2680 goto done;
2681 #endif
2682 error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
2683 sizeof(pr->pr_securelevel));
2684 if (error != 0 && error != ENOENT)
2685 goto done;
2686 error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
2687 sizeof(pr->pr_childcount));
2688 if (error != 0 && error != ENOENT)
2689 goto done;
2690 error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
2691 sizeof(pr->pr_childmax));
2692 if (error != 0 && error != ENOENT)
2693 goto done;
2694 error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
2695 if (error != 0 && error != ENOENT)
2696 goto done;
2697 error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
2698 if (error != 0 && error != ENOENT)
2699 goto done;
2700 error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
2701 if (error != 0 && error != ENOENT)
2702 goto done;
2703 #ifdef COMPAT_FREEBSD32
2704 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
2705 uint32_t hid32 = pr->pr_hostid;
2706
2707 error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
2708 } else
2709 #endif
2710 error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
2711 sizeof(pr->pr_hostid));
2712 if (error != 0 && error != ENOENT)
2713 goto done;
2714 error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
2715 sizeof(pr->pr_enforce_statfs));
2716 if (error != 0 && error != ENOENT)
2717 goto done;
2718 error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
2719 sizeof(pr->pr_devfs_rsnum));
2720 if (error != 0 && error != ENOENT)
2721 goto done;
2722 for (bf = pr_flag_bool;
2723 bf < pr_flag_bool + nitems(pr_flag_bool);
2724 bf++) {
2725 i = (pr->pr_flags & bf->flag) ? 1 : 0;
2726 error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2727 if (error != 0 && error != ENOENT)
2728 goto done;
2729 i = !i;
2730 error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2731 if (error != 0 && error != ENOENT)
2732 goto done;
2733 }
2734 for (jsf = pr_flag_jailsys;
2735 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
2736 jsf++) {
2737 f = pr->pr_flags & (jsf->disable | jsf->new);
2738 i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE
2739 : (f == jsf->new) ? JAIL_SYS_NEW
2740 : JAIL_SYS_INHERIT;
2741 error = vfs_setopt(opts, jsf->name, &i, sizeof(i));
2742 if (error != 0 && error != ENOENT)
2743 goto done;
2744 }
2745 for (bf = pr_flag_allow;
2746 bf < pr_flag_allow + nitems(pr_flag_allow) &&
2747 atomic_load_int(&bf->flag) != 0;
2748 bf++) {
2749 i = (pr->pr_allow & bf->flag) ? 1 : 0;
2750 error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2751 if (error != 0 && error != ENOENT)
2752 goto done;
2753 i = !i;
2754 error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2755 if (error != 0 && error != ENOENT)
2756 goto done;
2757 }
2758 i = !prison_isalive(pr);
2759 error = vfs_setopt(opts, "dying", &i, sizeof(i));
2760 if (error != 0 && error != ENOENT)
2761 goto done;
2762 i = !i;
2763 error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2764 if (error != 0 && error != ENOENT)
2765 goto done;
2766 error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
2767 sizeof(pr->pr_osreldate));
2768 if (error != 0 && error != ENOENT)
2769 goto done;
2770 error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
2771 if (error != 0 && error != ENOENT)
2772 goto done;
2773
2774 /* Get the module parameters. */
2775 mtx_unlock(&pr->pr_mtx);
2776 drflags &= ~PD_LOCKED;
2777 error = osd_jail_call(pr, PR_METHOD_GET, opts);
2778 if (error)
2779 goto done;
2780 prison_deref(pr, drflags);
2781 pr = NULL;
2782 drflags = 0;
2783
2784 /* By now, all parameters should have been noted. */
2785 TAILQ_FOREACH(opt, opts, link) {
2786 if (!opt->seen &&
2787 (strstr(opt->name, JAIL_META_PRIVATE ".") == opt->name ||
2788 strstr(opt->name, JAIL_META_SHARED ".") == opt->name)) {
2789 /* Communicate back a missing key. */
2790 free(opt->value, M_MOUNT);
2791 opt->value = NULL;
2792 opt->len = 0;
2793 continue;
2794 }
2795 if (!opt->seen && strcmp(opt->name, "errmsg")) {
2796 error = EINVAL;
2797 vfs_opterror(opts, "unknown parameter: %s", opt->name);
2798 goto done;
2799 }
2800 }
2801
2802 /* Write the fetched parameters back to userspace. */
2803 error = 0;
2804 TAILQ_FOREACH(opt, opts, link) {
2805 if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2806 pos = 2 * opt->pos + 1;
2807 optuio->uio_iov[pos].iov_len = opt->len;
2808 if (opt->value != NULL) {
2809 if (optuio->uio_segflg == UIO_SYSSPACE) {
2810 bcopy(opt->value,
2811 optuio->uio_iov[pos].iov_base,
2812 opt->len);
2813 } else {
2814 error = copyout(opt->value,
2815 optuio->uio_iov[pos].iov_base,
2816 opt->len);
2817 if (error)
2818 break;
2819 }
2820 }
2821 }
2822 }
2823
2824 done:
2825 /* Release any temporary prison holds and/or locks. */
2826 if (pr != NULL)
2827 prison_deref(pr, drflags);
2828 else if (drflags & PD_LIST_SLOCKED)
2829 sx_sunlock(&allprison_lock);
2830 else if (drflags & PD_LIST_XLOCKED)
2831 sx_xunlock(&allprison_lock);
2832 /* Clean up other resources. */
2833 if (jfp_out != NULL)
2834 (void)fdrop(jfp_out, td);
2835 if (error && jfd_out >= 0)
2836 (void)kern_close(td, jfd_out);
2837 if (error && errmsg_pos >= 0) {
2838 /* Write the error message back to userspace. */
2839 vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2840 errmsg_pos = 2 * errmsg_pos + 1;
2841 if (errmsg_len > 0) {
2842 if (optuio->uio_segflg == UIO_SYSSPACE)
2843 bcopy(errmsg,
2844 optuio->uio_iov[errmsg_pos].iov_base,
2845 errmsg_len);
2846 else
2847 (void)copyout(errmsg,
2848 optuio->uio_iov[errmsg_pos].iov_base,
2849 errmsg_len);
2850 }
2851 }
2852 vfs_freeopts(opts);
2853 prison_free(mypr);
2854 return (error);
2855 }
2856
2857 /*
2858 * struct jail_remove_args {
2859 * int jid;
2860 * };
2861 */
2862 int
sys_jail_remove(struct thread * td,struct jail_remove_args * uap)2863 sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
2864 {
2865 struct prison *pr;
2866 int error;
2867
2868 error = priv_check(td, PRIV_JAIL_REMOVE);
2869 if (error)
2870 return (error);
2871
2872 sx_xlock(&allprison_lock);
2873 pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2874 if (pr == NULL) {
2875 sx_xunlock(&allprison_lock);
2876 return (EINVAL);
2877 }
2878 prison_hold(pr);
2879 prison_remove(pr);
2880 return (0);
2881 }
2882
2883 /*
2884 * struct jail_remove_jd_args {
2885 * int fd;
2886 * };
2887 */
2888 int
sys_jail_remove_jd(struct thread * td,struct jail_remove_jd_args * uap)2889 sys_jail_remove_jd(struct thread *td, struct jail_remove_jd_args *uap)
2890 {
2891 struct prison *pr;
2892 struct ucred *jdcred;
2893 int error;
2894
2895 error = jaildesc_find(td, uap->fd, &pr, &jdcred);
2896 if (error)
2897 return (error);
2898 error = priv_check_cred(jdcred, PRIV_JAIL_REMOVE);
2899 crfree(jdcred);
2900 if (error) {
2901 prison_free(pr);
2902 return (error);
2903 }
2904 sx_xlock(&allprison_lock);
2905 mtx_lock(&pr->pr_mtx);
2906 prison_remove(pr);
2907 return (0);
2908 }
2909
2910 /*
2911 * Begin the removal process for a prison. The allprison lock should
2912 * be held exclusively, and the prison should be both locked and held.
2913 */
2914 void
prison_remove(struct prison * pr)2915 prison_remove(struct prison *pr)
2916 {
2917 sx_assert(&allprison_lock, SA_XLOCKED);
2918 mtx_assert(&pr->pr_mtx, MA_OWNED);
2919 prison_deref(pr, PD_KILL | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
2920 }
2921
2922 /*
2923 * struct jail_attach_args {
2924 * int jid;
2925 * };
2926 */
2927 int
sys_jail_attach(struct thread * td,struct jail_attach_args * uap)2928 sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
2929 {
2930 struct prison *pr;
2931 int error;
2932
2933 error = priv_check(td, PRIV_JAIL_ATTACH);
2934 if (error)
2935 return (error);
2936
2937 sx_slock(&allprison_lock);
2938 pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2939 if (pr == NULL) {
2940 sx_sunlock(&allprison_lock);
2941 return (EINVAL);
2942 }
2943
2944 /* Do not allow a process to attach to a prison that is not alive. */
2945 if (!prison_isalive(pr)) {
2946 mtx_unlock(&pr->pr_mtx);
2947 sx_sunlock(&allprison_lock);
2948 return (EINVAL);
2949 }
2950
2951 return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED));
2952 }
2953
2954 /*
2955 * struct jail_attach_jd_args {
2956 * int fd;
2957 * };
2958 */
2959 int
sys_jail_attach_jd(struct thread * td,struct jail_attach_jd_args * uap)2960 sys_jail_attach_jd(struct thread *td, struct jail_attach_jd_args *uap)
2961 {
2962 struct prison *pr;
2963 struct ucred *jdcred;
2964 int drflags, error;
2965
2966 sx_slock(&allprison_lock);
2967 drflags = PD_LIST_SLOCKED;
2968 error = jaildesc_find(td, uap->fd, &pr, &jdcred);
2969 if (error)
2970 goto fail;
2971 drflags |= PD_DEREF;
2972 error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH);
2973 crfree(jdcred);
2974 if (error)
2975 goto fail;
2976 mtx_lock(&pr->pr_mtx);
2977 drflags |= PD_LOCKED;
2978
2979 /* Do not allow a process to attach to a prison that is not alive. */
2980 if (!prison_isalive(pr)) {
2981 error = EINVAL;
2982 goto fail;
2983 }
2984
2985 return (do_jail_attach(td, pr, drflags));
2986
2987 fail:
2988 prison_deref(pr, drflags);
2989 return (error);
2990 }
2991
2992 static int
do_jail_attach(struct thread * td,struct prison * pr,int drflags)2993 do_jail_attach(struct thread *td, struct prison *pr, int drflags)
2994 {
2995 struct proc *p;
2996 struct ucred *newcred, *oldcred;
2997 int error;
2998
2999 mtx_assert(&pr->pr_mtx, MA_OWNED);
3000 sx_assert(&allprison_lock, SX_LOCKED);
3001 drflags &= PD_LOCK_FLAGS;
3002 /*
3003 * XXX: Note that there is a slight race here if two threads
3004 * in the same privileged process attempt to attach to two
3005 * different jails at the same time. It is important for
3006 * user processes not to do this, or they might end up with
3007 * a process root from one prison, but attached to the jail
3008 * of another.
3009 */
3010 if (!(drflags & PD_DEREF)) {
3011 prison_hold(pr);
3012 drflags |= PD_DEREF;
3013 }
3014 refcount_acquire(&pr->pr_uref);
3015 drflags |= PD_DEUREF;
3016 mtx_unlock(&pr->pr_mtx);
3017 drflags &= ~PD_LOCKED;
3018
3019 /* Let modules do whatever they need to prepare for attaching. */
3020 error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
3021 if (error) {
3022 prison_deref(pr, drflags);
3023 return (error);
3024 }
3025 sx_unlock(&allprison_lock);
3026 drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED);
3027
3028 /*
3029 * Reparent the newly attached process to this jail.
3030 */
3031 p = td->td_proc;
3032 error = cpuset_setproc_update_set(p, pr->pr_cpuset);
3033 if (error)
3034 goto e_revert_osd;
3035
3036 vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
3037 if ((error = change_dir(pr->pr_root, td)) != 0)
3038 goto e_unlock;
3039 #ifdef MAC
3040 if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
3041 goto e_unlock;
3042 #endif
3043 VOP_UNLOCK(pr->pr_root);
3044 if ((error = pwd_chroot_chdir(td, pr->pr_root)))
3045 goto e_revert_osd;
3046
3047 newcred = crget();
3048 PROC_LOCK(p);
3049 oldcred = crcopysafe(p, newcred);
3050 newcred->cr_prison = pr;
3051 #ifdef RACCT
3052 racct_proc_ucred_changed(p, oldcred, newcred);
3053 #endif
3054 #ifdef RCTL
3055 crhold(newcred);
3056 #endif
3057 /*
3058 * Takes over 'newcred''s reference, so 'newcred' must not be used
3059 * besides this point except on RCTL where we took an additional
3060 * reference above.
3061 */
3062 proc_set_cred(p, newcred);
3063 setsugid(p);
3064 PROC_UNLOCK(p);
3065 #ifdef RCTL
3066 rctl_proc_ucred_changed(p, newcred);
3067 crfree(newcred);
3068 #endif
3069 prison_proc_relink(oldcred->cr_prison, pr, p);
3070 prison_deref(oldcred->cr_prison, drflags);
3071 crfree(oldcred);
3072 prison_knote(pr, NOTE_JAIL_ATTACH | td->td_proc->p_pid);
3073
3074 /*
3075 * If the prison was killed while changing credentials, die along
3076 * with it.
3077 */
3078 if (!prison_isalive(pr)) {
3079 PROC_LOCK(p);
3080 kern_psignal(p, SIGKILL);
3081 PROC_UNLOCK(p);
3082 }
3083
3084 return (0);
3085
3086 e_unlock:
3087 VOP_UNLOCK(pr->pr_root);
3088 e_revert_osd:
3089 /* Tell modules this thread is still in its old jail after all. */
3090 sx_slock(&allprison_lock);
3091 drflags |= PD_LIST_SLOCKED;
3092 (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
3093 prison_deref(pr, drflags);
3094 return (error);
3095 }
3096
3097 /*
3098 * Returns a locked prison instance, or NULL on failure.
3099 */
3100 struct prison *
prison_find(int prid)3101 prison_find(int prid)
3102 {
3103 struct prison *pr;
3104
3105 sx_assert(&allprison_lock, SX_LOCKED);
3106 TAILQ_FOREACH(pr, &allprison, pr_list) {
3107 if (pr->pr_id < prid)
3108 continue;
3109 if (pr->pr_id > prid)
3110 break;
3111 KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr));
3112 mtx_lock(&pr->pr_mtx);
3113 return (pr);
3114 }
3115 return (NULL);
3116 }
3117
3118 /*
3119 * Find a prison that is a descendant of mypr. Returns a locked prison or NULL.
3120 */
3121 struct prison *
prison_find_child(struct prison * mypr,int prid)3122 prison_find_child(struct prison *mypr, int prid)
3123 {
3124 struct prison *pr;
3125 int descend;
3126
3127 sx_assert(&allprison_lock, SX_LOCKED);
3128 FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
3129 if (pr->pr_id == prid) {
3130 KASSERT(prison_isvalid(pr),
3131 ("Found invalid prison %p", pr));
3132 mtx_lock(&pr->pr_mtx);
3133 return (pr);
3134 }
3135 }
3136 return (NULL);
3137 }
3138
3139 /*
3140 * Look for the name relative to mypr. Returns a locked prison or NULL.
3141 */
3142 struct prison *
prison_find_name(struct prison * mypr,const char * name)3143 prison_find_name(struct prison *mypr, const char *name)
3144 {
3145 struct prison *pr, *deadpr;
3146 size_t mylen;
3147 int descend;
3148
3149 sx_assert(&allprison_lock, SX_LOCKED);
3150 mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
3151 deadpr = NULL;
3152 FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
3153 if (!strcmp(pr->pr_name + mylen, name)) {
3154 KASSERT(prison_isvalid(pr),
3155 ("Found invalid prison %p", pr));
3156 if (prison_isalive(pr)) {
3157 mtx_lock(&pr->pr_mtx);
3158 return (pr);
3159 }
3160 deadpr = pr;
3161 }
3162 }
3163 /* There was no valid prison - perhaps there was a dying one. */
3164 if (deadpr != NULL)
3165 mtx_lock(&deadpr->pr_mtx);
3166 return (deadpr);
3167 }
3168
3169 /*
3170 * See if a prison has the specific flag set. The prison should be locked,
3171 * unless checking for flags that are only set at jail creation (such as
3172 * PR_IP4 and PR_IP6), or only the single bit is examined, without regard
3173 * to any other prison data.
3174 */
3175 bool
prison_flag(struct ucred * cred,unsigned flag)3176 prison_flag(struct ucred *cred, unsigned flag)
3177 {
3178
3179 return ((cred->cr_prison->pr_flags & flag) != 0);
3180 }
3181
3182 /*
3183 * See if a prison has the specific allow flag set.
3184 * The prison *should* be locked, or only a single bit is examined, without
3185 * regard to any other prison data.
3186 */
3187 bool
prison_allow(struct ucred * cred,unsigned flag)3188 prison_allow(struct ucred *cred, unsigned flag)
3189 {
3190
3191 return ((cred->cr_prison->pr_allow & flag) != 0);
3192 }
3193
/*
 * Take a prison reference by incrementing pr_ref.  Holding a prison that
 * does not already have a reference is generally an error.  A prison record
 * stays valid while it has at least one reference, and is not removed while
 * either the prison mutex or the allprison lock is held (allprison_lock may
 * be shared).
 *
 * This variant exists for callers that hold the prison mutex; it simply
 * forwards to prison_hold().
 */
void
prison_hold_locked(struct prison *pr)
{

	/* The mutex is no longer needed for taking a reference. */
	prison_hold(pr);
}
3208
3209 void
prison_hold(struct prison * pr)3210 prison_hold(struct prison *pr)
3211 {
3212 #ifdef INVARIANTS
3213 int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref);
3214
3215 KASSERT(was_valid,
3216 ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
3217 #else
3218 refcount_acquire(&pr->pr_ref);
3219 #endif
3220 }
3221
3222 /*
3223 * Remove a prison reference. If that was the last reference, the
3224 * prison will be removed (at a later time).
3225 */
3226 void
prison_free_locked(struct prison * pr)3227 prison_free_locked(struct prison *pr)
3228 {
3229
3230 mtx_assert(&pr->pr_mtx, MA_OWNED);
3231 /*
3232 * Locking is no longer required, but unlock because the caller
3233 * expects it.
3234 */
3235 mtx_unlock(&pr->pr_mtx);
3236 prison_free(pr);
3237 }
3238
3239 void
prison_free(struct prison * pr)3240 prison_free(struct prison *pr)
3241 {
3242
3243 KASSERT(refcount_load(&pr->pr_ref) > 0,
3244 ("Trying to free dead prison %p (jid=%d).",
3245 pr, pr->pr_id));
3246 if (!refcount_release_if_not_last(&pr->pr_ref)) {
3247 /*
3248 * Don't remove the last reference in this context,
3249 * in case there are locks held.
3250 */
3251 taskqueue_enqueue(taskqueue_jail_remove, &pr->pr_task);
3252 }
3253 }
3254
3255 static void
prison_free_not_last(struct prison * pr)3256 prison_free_not_last(struct prison *pr)
3257 {
3258 #ifdef INVARIANTS
3259 int lastref;
3260
3261 KASSERT(refcount_load(&pr->pr_ref) > 0,
3262 ("Trying to free dead prison %p (jid=%d).",
3263 pr, pr->pr_id));
3264 lastref = refcount_release(&pr->pr_ref);
3265 KASSERT(!lastref,
3266 ("prison_free_not_last freed last ref on prison %p (jid=%d).",
3267 pr, pr->pr_id));
3268 #else
3269 refcount_release(&pr->pr_ref);
3270 #endif
3271 }
3272
3273 /*
3274 * Hold a prison for user visibility, by incrementing pr_uref.
3275 * It is generally an error to hold a prison that isn't already
3276 * user-visible, except through the jail system calls. It is also
3277 * an error to hold an invalid prison. A prison record will remain
3278 * alive as long as it has at least one user reference, and will not
3279 * be set to the dying state until the prison mutex and allprison_lock
3280 * are both freed.
3281 */
3282 void
prison_proc_hold(struct prison * pr)3283 prison_proc_hold(struct prison *pr)
3284 {
3285 #ifdef INVARIANTS
3286 int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref);
3287
3288 KASSERT(was_alive,
3289 ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
3290 #else
3291 refcount_acquire(&pr->pr_uref);
3292 #endif
3293 }
3294
3295 /*
3296 * Remove a prison user reference. If it was the last reference, the
3297 * prison will be considered "dying", and may be removed once all of
3298 * its references are dropped.
3299 */
3300 void
prison_proc_free(struct prison * pr)3301 prison_proc_free(struct prison *pr)
3302 {
3303
3304 /*
3305 * Locking is only required when releasing the last reference.
3306 * This allows assurance that a locked prison will remain alive
3307 * until it is unlocked.
3308 */
3309 KASSERT(refcount_load(&pr->pr_uref) > 0,
3310 ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
3311 if (!refcount_release_if_not_last(&pr->pr_uref)) {
3312 /*
3313 * Don't remove the last user reference in this context,
3314 * which is expected to be a process that is not only locked,
3315 * but also half dead. Add a reference so any calls to
3316 * prison_free() won't re-submit the task.
3317 */
3318 prison_hold(pr);
3319 mtx_lock(&pr->pr_mtx);
3320 KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC),
3321 ("Redundant last reference in prison_proc_free (jid=%d)",
3322 pr->pr_id));
3323 pr->pr_flags |= PR_COMPLETE_PROC;
3324 mtx_unlock(&pr->pr_mtx);
3325 taskqueue_enqueue(taskqueue_jail_remove, &pr->pr_task);
3326 }
3327 }
3328
3329 static void
prison_proc_free_not_last(struct prison * pr)3330 prison_proc_free_not_last(struct prison *pr)
3331 {
3332 #ifdef INVARIANTS
3333 int lastref;
3334
3335 KASSERT(refcount_load(&pr->pr_uref) > 0,
3336 ("Trying to free dead prison %p (jid=%d).",
3337 pr, pr->pr_id));
3338 lastref = refcount_release(&pr->pr_uref);
3339 KASSERT(!lastref,
3340 ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).",
3341 pr, pr->pr_id));
3342 #else
3343 refcount_release(&pr->pr_uref);
3344 #endif
3345 }
3346
/*
 * Insert process p at the head of prison pr's process list
 * (pr_proclist, linked through p_jaillist).  The caller must hold
 * allproc_lock exclusively; this is asserted.
 */
void
prison_proc_link(struct prison *pr, struct proc *p)
{

	sx_assert(&allproc_lock, SA_XLOCKED);
	LIST_INSERT_HEAD(&pr->pr_proclist, p, p_jaillist);
}
3354
/*
 * Remove process p from the prison process list it is linked on through
 * p_jaillist.  The pr argument is not referenced by the removal itself.
 * The caller must hold allproc_lock exclusively; this is asserted.
 */
void
prison_proc_unlink(struct prison *pr, struct proc *p)
{

	sx_assert(&allproc_lock, SA_XLOCKED);
	LIST_REMOVE(p, p_jaillist);
}
3362
/*
 * Move process p from prison opr's process list to prison npr's.
 * allproc_lock is taken exclusively around the unlink/link pair, so the
 * caller must not already hold it.
 */
static void
prison_proc_relink(struct prison *opr, struct prison *npr, struct proc *p)
{

	sx_xlock(&allproc_lock);
	prison_proc_unlink(opr, p);
	prison_proc_link(npr, p);
	sx_xunlock(&allproc_lock);
}
3372
3373 /*
3374 * Complete a call to either prison_free or prison_proc_free.
3375 */
3376 static void
prison_complete(void * context,int pending)3377 prison_complete(void *context, int pending)
3378 {
3379 struct prison *pr = context;
3380 int drflags;
3381
3382 /*
3383 * This could be called to release the last reference, or the last
3384 * user reference (plus the reference held in prison_proc_free).
3385 */
3386 drflags = prison_lock_xlock(pr, PD_DEREF);
3387 if (pr->pr_flags & PR_COMPLETE_PROC) {
3388 pr->pr_flags &= ~PR_COMPLETE_PROC;
3389 drflags |= PD_DEUREF;
3390 }
3391 prison_deref(pr, drflags);
3392 }
3393
/*
 * Callback for prison_proc_iterate() that sends SIGKILL to the process.
 * The callback argument is unused.
 */
static void
prison_kill_processes_cb(struct proc *p, void *arg __unused)
{

	kern_psignal(p, SIGKILL);
}
3400
3401 /*
3402 * Note the iteration does not guarantee acting on all processes.
3403 * Most notably there may be fork or jail_attach in progress.
3404 */
3405 void
prison_proc_iterate(struct prison * pr,void (* cb)(struct proc *,void *),void * cbarg)3406 prison_proc_iterate(struct prison *pr, void (*cb)(struct proc *, void *),
3407 void *cbarg)
3408 {
3409 struct prison *ppr;
3410 struct proc *p;
3411
3412 if (atomic_load_int(&pr->pr_childcount) == 0) {
3413 sx_slock(&allproc_lock);
3414 LIST_FOREACH(p, &pr->pr_proclist, p_jaillist) {
3415 if (p->p_state == PRS_NEW)
3416 continue;
3417 PROC_LOCK(p);
3418 cb(p, cbarg);
3419 PROC_UNLOCK(p);
3420 }
3421 sx_sunlock(&allproc_lock);
3422 if (atomic_load_int(&pr->pr_childcount) == 0)
3423 return;
3424 /*
3425 * Some jails popped up during the iteration, fall through to a
3426 * system-wide search.
3427 */
3428 }
3429
3430 sx_slock(&allproc_lock);
3431 FOREACH_PROC_IN_SYSTEM(p) {
3432 PROC_LOCK(p);
3433 if (p->p_state != PRS_NEW && p->p_ucred != NULL) {
3434 for (ppr = p->p_ucred->cr_prison; ppr != NULL;
3435 ppr = ppr->pr_parent) {
3436 if (ppr == pr) {
3437 cb(p, cbarg);
3438 break;
3439 }
3440 }
3441 }
3442 PROC_UNLOCK(p);
3443 }
3444 sx_sunlock(&allproc_lock);
3445 }
3446
/*
 * Remove a prison reference and/or user reference (usually).
 * This assumes context that allows sleeping (for allprison_lock),
 * with no non-sleeping locks held, except perhaps the prison itself.
 * If there are no more references, release and delist the prison.
 * On completion, the prison lock and the allprison lock are both
 * unlocked.
 *
 * The flags argument is a mask of PD_* bits.  PD_KILL, PD_DEUREF and
 * PD_DEREF select the work to do (kill the prison, drop a user
 * reference, drop a reference); PD_LOCKED, PD_LIST_SLOCKED and
 * PD_LIST_XLOCKED describe which locks are held on entry and are kept
 * up to date as locks are taken and dropped.
 */
static void
prison_deref(struct prison *pr, int flags)
{
	struct prisonlist freeprison;
	struct prison *killpr, *rpr, *ppr, *tpr;

	killpr = NULL;
	TAILQ_INIT(&freeprison);
	/*
	 * Release this prison as requested, which may cause its parent
	 * to be released, and then maybe its grandparent, etc.
	 */
	for (;;) {
		if (flags & PD_KILL) {
			/* Kill the prison and its descendants. */
			KASSERT(pr != &prison0,
			    ("prison_deref trying to kill prison0"));
			if (!prison_isalive(pr)) {
				/* Silently ignore already-dying prisons. */
				flags &= ~PD_KILL;
			} else {
				/*
				 * Hold a reference across the kill if the
				 * caller did not pass one in; it is dropped
				 * by the PD_DEREF step below.
				 */
				if (!(flags & PD_DEREF)) {
					prison_hold(pr);
					flags |= PD_DEREF;
				}
				flags = prison_lock_xlock(pr, flags);
				prison_deref_kill(pr, &freeprison);
			}
		}
		if (flags & PD_DEUREF) {
			/* Drop a user reference. */
			KASSERT(refcount_load(&pr->pr_uref) > 0,
			    ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
			    pr->pr_id));
			if (!refcount_release_if_not_last(&pr->pr_uref)) {
				/*
				 * This looks like the last user reference:
				 * take the locks before really dropping it.
				 */
				if (!(flags & PD_DEREF)) {
					prison_hold(pr);
					flags |= PD_DEREF;
				}
				flags = prison_lock_xlock(pr, flags);
				if (refcount_release(&pr->pr_uref) &&
				    pr->pr_state == PRISON_STATE_ALIVE) {
					/*
					 * When the last user reference goes,
					 * this becomes a dying prison.
					 */
					KASSERT(
					    refcount_load(&prison0.pr_uref) > 0,
					    ("prison0 pr_uref=0"));
					pr->pr_state = PRISON_STATE_DYING;
					prison_cleanup_locked(pr);
					mtx_unlock(&pr->pr_mtx);
					flags &= ~PD_LOCKED;
					prison_cleanup_unlocked(pr);
				}
			}
		}
		if (flags & PD_KILL) {
			/*
			 * Any remaining user references are probably processes
			 * that need to be killed, either in this prison or its
			 * descendants.
			 */
			if (refcount_load(&pr->pr_uref) > 0)
				killpr = pr;
			/* Make sure the parent prison doesn't get killed. */
			flags &= ~PD_KILL;
		}
		if (flags & PD_DEREF) {
			/* Drop a reference. */
			KASSERT(refcount_load(&pr->pr_ref) > 0,
			    ("prison_deref PD_DEREF on a dead prison (jid=%d)",
			    pr->pr_id));
			if (!refcount_release_if_not_last(&pr->pr_ref)) {
				flags = prison_lock_xlock(pr, flags);
				if (refcount_release(&pr->pr_ref)) {
					/*
					 * When the last reference goes,
					 * unlink the prison and set it aside.
					 */
					KASSERT(
					    refcount_load(&pr->pr_uref) == 0,
					    ("prison_deref: last ref, "
					    "but still has %d urefs (jid=%d)",
					    pr->pr_uref, pr->pr_id));
					KASSERT(
					    refcount_load(&prison0.pr_ref) != 0,
					    ("prison0 pr_ref=0"));
					pr->pr_state = PRISON_STATE_INVALID;
					TAILQ_REMOVE(&allprison, pr, pr_list);
					LIST_REMOVE(pr, pr_sibling);
					TAILQ_INSERT_TAIL(&freeprison, pr,
					    pr_list);
					for (ppr = pr->pr_parent;
					    ppr != NULL;
					    ppr = ppr->pr_parent)
						ppr->pr_childcount--;
					/*
					 * Removing a prison frees references
					 * from its parent.
					 */
					ppr = pr->pr_parent;
					pr->pr_parent = NULL;
					mtx_unlock(&pr->pr_mtx);

					/*
					 * Go around again for the parent,
					 * dropping both a reference and a
					 * user reference on it.
					 */
					pr = ppr;
					flags &= ~PD_LOCKED;
					flags |= PD_DEREF | PD_DEUREF;
					continue;
				}
			}
		}
		break;
	}

	/* Release all the prison locks. */
	if (flags & PD_LOCKED)
		mtx_unlock(&pr->pr_mtx);
	if (flags & PD_LIST_SLOCKED)
		sx_sunlock(&allprison_lock);
	else if (flags & PD_LIST_XLOCKED)
		sx_xunlock(&allprison_lock);

	/* Kill any processes attached to a killed prison. */
	if (killpr != NULL)
		prison_proc_iterate(killpr, prison_kill_processes_cb, NULL);

	/*
	 * Finish removing any unreferenced prisons, which couldn't happen
	 * while allprison_lock was held (to avoid a LOR on vrele).
	 */
	TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) {
#ifdef VIMAGE
		if (rpr->pr_flags & PR_VNET)
			vnet_destroy(rpr->pr_vnet);
#endif
		if (rpr->pr_root != NULL)
			vrele(rpr->pr_root);
		mtx_destroy(&rpr->pr_mtx);
#ifdef INET
		prison_ip_free(rpr->pr_addrs[PR_INET]);
#endif
#ifdef INET6
		prison_ip_free(rpr->pr_addrs[PR_INET6]);
#endif
		if (rpr->pr_cpuset != NULL)
			cpuset_rel(rpr->pr_cpuset);
		osd_jail_exit(rpr);
#ifdef RACCT
		if (racct_enable)
			prison_racct_detach(rpr);
#endif
		/* Unlink from the local list before freeing the record. */
		TAILQ_REMOVE(&freeprison, rpr, pr_list);
		free(rpr, M_PRISON);
	}
}
3611
/*
 * Kill the prison and its descendants.  Mark them as dying, clear the
 * persist flag, and call module remove methods.
 *
 * Called with pr's mutex held; the mutex is dropped while the
 * descendants are processed and is held again on return.  The cleanup
 * helpers called here assert that allprison_lock is held exclusively.
 * Descendants whose last reference is dropped are moved from allprison
 * to the caller's freeprison list.
 */
static void
prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
{
	struct prison *cpr, *ppr, *rpr;
	bool descend;

	/*
	 * Unlike the descendants, the target prison can be killed
	 * even if it is currently dying.  This is useful for failed
	 * creation in jail_set(2).
	 */
	KASSERT(refcount_load(&pr->pr_ref) > 0,
	    ("Trying to kill dead prison %p (jid=%d).",
	    pr, pr->pr_id));
	/* Temporary user reference, released at the end of this function. */
	refcount_acquire(&pr->pr_uref);
	pr->pr_state = PRISON_STATE_DYING;
	mtx_unlock(&pr->pr_mtx);

	/* rpr tracks a removed child whose sibling unlink is still pending. */
	rpr = NULL;
	FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) {
		if (descend) {
			/* Skip prisons that are not alive, and their subtrees. */
			if (!prison_isalive(cpr)) {
				descend = false;
				continue;
			}
			/*
			 * Hold the child and mark it dying; PR_REMOVE tags it
			 * for the second half of the loop body.
			 */
			prison_hold(cpr);
			prison_proc_hold(cpr);
			mtx_lock(&cpr->pr_mtx);
			cpr->pr_state = PRISON_STATE_DYING;
			cpr->pr_flags |= PR_REMOVE;
			mtx_unlock(&cpr->pr_mtx);
			continue;
		}
		/* Only finish prisons that were tagged above. */
		if (!(cpr->pr_flags & PR_REMOVE))
			continue;
		prison_cleanup_unlocked(cpr);
		mtx_lock(&cpr->pr_mtx);
		prison_cleanup_locked(cpr);
		cpr->pr_flags &= ~PR_REMOVE;
		if (cpr->pr_flags & PR_PERSIST) {
			/* Drop the references that go with PR_PERSIST. */
			cpr->pr_flags &= ~PR_PERSIST;
			prison_proc_free_not_last(cpr);
			prison_free_not_last(cpr);
		}
		/* Drop the references taken when the child was tagged. */
		(void)refcount_release(&cpr->pr_uref);
		if (refcount_release(&cpr->pr_ref)) {
			/*
			 * When the last reference goes, unlink the prison
			 * and set it aside for prison_deref() to handle.
			 * Delay unlinking the sibling list to keep the loop
			 * safe.
			 */
			if (rpr != NULL)
				LIST_REMOVE(rpr, pr_sibling);
			rpr = cpr;
			rpr->pr_state = PRISON_STATE_INVALID;
			TAILQ_REMOVE(&allprison, rpr, pr_list);
			TAILQ_INSERT_TAIL(freeprison, rpr, pr_list);
			/*
			 * Removing a prison frees references from its parent.
			 */
			ppr = rpr->pr_parent;
			prison_proc_free_not_last(ppr);
			prison_free_not_last(ppr);
			for (; ppr != NULL; ppr = ppr->pr_parent)
				ppr->pr_childcount--;
		}
		mtx_unlock(&cpr->pr_mtx);
	}
	/* Finish the sibling unlink that was delayed inside the loop. */
	if (rpr != NULL)
		LIST_REMOVE(rpr, pr_sibling);

	prison_cleanup_unlocked(pr);
	mtx_lock(&pr->pr_mtx);
	prison_cleanup_locked(pr);
	if (pr->pr_flags & PR_PERSIST) {
		pr->pr_flags &= ~PR_PERSIST;
		prison_proc_free_not_last(pr);
		prison_free_not_last(pr);
	}
	/* Release the temporary user reference taken on entry. */
	(void)refcount_release(&pr->pr_uref);
}
3698
3699 /*
3700 * Given the current locking state in the flags, make sure allprison_lock
3701 * is held exclusive, and the prison is locked. Return flags indicating
3702 * the new state.
3703 */
3704 static int
prison_lock_xlock(struct prison * pr,int flags)3705 prison_lock_xlock(struct prison *pr, int flags)
3706 {
3707
3708 if (!(flags & PD_LIST_XLOCKED)) {
3709 /*
3710 * Get allprison_lock, which may be an upgrade,
3711 * and may require unlocking the prison.
3712 */
3713 if (flags & PD_LOCKED) {
3714 mtx_unlock(&pr->pr_mtx);
3715 flags &= ~PD_LOCKED;
3716 }
3717 if (flags & PD_LIST_SLOCKED) {
3718 if (!sx_try_upgrade(&allprison_lock)) {
3719 sx_sunlock(&allprison_lock);
3720 sx_xlock(&allprison_lock);
3721 }
3722 flags &= ~PD_LIST_SLOCKED;
3723 } else
3724 sx_xlock(&allprison_lock);
3725 flags |= PD_LIST_XLOCKED;
3726 }
3727 if (!(flags & PD_LOCKED)) {
3728 /* Lock the prison mutex. */
3729 mtx_lock(&pr->pr_mtx);
3730 flags |= PD_LOCKED;
3731 }
3732 return flags;
3733 }
3734
/*
 * Release a prison's resources when it starts dying (when the last user
 * reference is dropped, or when it is killed).  Two functions are called,
 * for work that requires a locked prison or an unlocked one.
 *
 * This is the locked half: allprison_lock must be held exclusively and
 * the prison mutex must be owned; both are asserted.
 */
static void
prison_cleanup_locked(struct prison *pr)
{
	sx_assert(&allprison_lock, SA_XLOCKED);
	mtx_assert(&pr->pr_mtx, MA_OWNED);
	/* Post NOTE_JAIL_REMOVE before the knote list is detached. */
	prison_knote(pr, NOTE_JAIL_REMOVE);
	knlist_detach(pr->pr_klist);
	jaildesc_prison_cleanup(pr);
	pr->pr_klist = NULL;
}
3750
/*
 * The unlocked half of the cleanup done when a prison starts dying:
 * allprison_lock must be held exclusively and the prison mutex must not
 * be owned; both are asserted.
 */
static void
prison_cleanup_unlocked(struct prison *pr)
{
	sx_assert(&allprison_lock, SA_XLOCKED);
	mtx_assert(&pr->pr_mtx, MA_NOTOWNED);
	vfs_exjail_delete(pr);
	shm_remove_prison(pr);
	/* The result of the PR_METHOD_REMOVE methods is deliberately ignored. */
	(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
}
3760
3761 /*
3762 * Set or clear a permission bit in the pr_allow field, passing restrictions
3763 * (cleared permission) down to child jails.
3764 */
3765 void
prison_set_allow(struct ucred * cred,unsigned flag,int enable)3766 prison_set_allow(struct ucred *cred, unsigned flag, int enable)
3767 {
3768 struct prison *pr;
3769
3770 pr = cred->cr_prison;
3771 sx_slock(&allprison_lock);
3772 mtx_lock(&pr->pr_mtx);
3773 prison_set_allow_locked(pr, flag, enable);
3774 mtx_unlock(&pr->pr_mtx);
3775 sx_sunlock(&allprison_lock);
3776 }
3777
3778 static void
prison_set_allow_locked(struct prison * pr,unsigned flag,int enable)3779 prison_set_allow_locked(struct prison *pr, unsigned flag, int enable)
3780 {
3781 struct prison *cpr;
3782 int descend;
3783
3784 if (enable != 0)
3785 pr->pr_allow |= flag;
3786 else {
3787 pr->pr_allow &= ~flag;
3788 FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
3789 cpr->pr_allow &= ~flag;
3790 }
3791 }
3792
3793 /*
3794 * Check if a jail supports the given address family.
3795 *
3796 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3797 * if not.
3798 */
3799 int
prison_check_af(struct ucred * cred,int af)3800 prison_check_af(struct ucred *cred, int af)
3801 {
3802 struct prison *pr;
3803 int error;
3804
3805 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3806
3807 pr = cred->cr_prison;
3808 #ifdef VIMAGE
3809 /* Prisons with their own network stack are not limited. */
3810 if (prison_owns_vnet(pr))
3811 return (0);
3812 #endif
3813
3814 error = 0;
3815 switch (af)
3816 {
3817 #ifdef INET
3818 case AF_INET:
3819 if (pr->pr_flags & PR_IP4)
3820 {
3821 mtx_lock(&pr->pr_mtx);
3822 if ((pr->pr_flags & PR_IP4) &&
3823 pr->pr_addrs[PR_INET] == NULL)
3824 error = EAFNOSUPPORT;
3825 mtx_unlock(&pr->pr_mtx);
3826 }
3827 break;
3828 #endif
3829 #ifdef INET6
3830 case AF_INET6:
3831 if (pr->pr_flags & PR_IP6)
3832 {
3833 mtx_lock(&pr->pr_mtx);
3834 if ((pr->pr_flags & PR_IP6) &&
3835 pr->pr_addrs[PR_INET6] == NULL)
3836 error = EAFNOSUPPORT;
3837 mtx_unlock(&pr->pr_mtx);
3838 }
3839 break;
3840 #endif
3841 case AF_LOCAL:
3842 case AF_ROUTE:
3843 case AF_NETLINK:
3844 break;
3845 default:
3846 if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3847 error = EAFNOSUPPORT;
3848 }
3849 return (error);
3850 }
3851
3852 /*
3853 * Check if given address belongs to the jail referenced by cred (wrapper to
3854 * prison_check_ip[46]).
3855 *
3856 * Returns 0 if jail doesn't restrict the address family or if address belongs
3857 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3858 * the jail doesn't allow the address family. IPv4 Address passed in in NBO.
3859 */
3860 int
prison_if(struct ucred * cred,const struct sockaddr * sa)3861 prison_if(struct ucred *cred, const struct sockaddr *sa)
3862 {
3863 #ifdef INET
3864 const struct sockaddr_in *sai;
3865 #endif
3866 #ifdef INET6
3867 const struct sockaddr_in6 *sai6;
3868 #endif
3869 int error;
3870
3871 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3872 KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3873
3874 #ifdef VIMAGE
3875 if (prison_owns_vnet(cred->cr_prison))
3876 return (0);
3877 #endif
3878
3879 error = 0;
3880 switch (sa->sa_family)
3881 {
3882 #ifdef INET
3883 case AF_INET:
3884 sai = (const struct sockaddr_in *)sa;
3885 error = prison_check_ip4(cred, &sai->sin_addr);
3886 break;
3887 #endif
3888 #ifdef INET6
3889 case AF_INET6:
3890 sai6 = (const struct sockaddr_in6 *)sa;
3891 error = prison_check_ip6(cred, &sai6->sin6_addr);
3892 break;
3893 #endif
3894 default:
3895 if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3896 error = EAFNOSUPPORT;
3897 }
3898 return (error);
3899 }
3900
3901 /*
3902 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3903 */
3904 int
prison_check(struct ucred * cred1,struct ucred * cred2)3905 prison_check(struct ucred *cred1, struct ucred *cred2)
3906 {
3907
3908 return ((cred1->cr_prison == cred2->cr_prison ||
3909 prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3910 }
3911
3912 /*
3913 * For mountd/nfsd to run within a prison, it must be:
3914 * - A vnet prison.
3915 * - PR_ALLOW_NFSD must be set on it.
3916 * - The root directory (pr_root) of the prison must be
3917 * a file system mount point, so the mountd can hang
3918 * export information on it.
3919 * - The prison's enforce_statfs cannot be 0, so that
3920 * mountd(8) can do exports.
3921 */
3922 bool
prison_check_nfsd(struct ucred * cred)3923 prison_check_nfsd(struct ucred *cred)
3924 {
3925
3926 if (jailed_without_vnet(cred))
3927 return (false);
3928 if (!prison_allow(cred, PR_ALLOW_NFSD))
3929 return (false);
3930 if ((cred->cr_prison->pr_root->v_vflag & VV_ROOT) == 0)
3931 return (false);
3932 if (cred->cr_prison->pr_enforce_statfs == 0)
3933 return (false);
3934 return (true);
3935 }
3936
3937 /*
3938 * Return true if p2 is a child of p1, otherwise false.
3939 */
3940 bool
prison_ischild(struct prison * pr1,struct prison * pr2)3941 prison_ischild(struct prison *pr1, struct prison *pr2)
3942 {
3943
3944 for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3945 if (pr1 == pr2)
3946 return (true);
3947 return (false);
3948 }
3949
3950 /*
3951 * Return true if the prison is currently alive. A prison is alive if it
3952 * holds user references and it isn't being removed.
3953 */
3954 bool
prison_isalive(const struct prison * pr)3955 prison_isalive(const struct prison *pr)
3956 {
3957
3958 if (__predict_false(pr->pr_state != PRISON_STATE_ALIVE))
3959 return (false);
3960 return (true);
3961 }
3962
3963 /*
3964 * Return true if the prison is currently valid. A prison is valid if it has
3965 * been fully created, and is not being destroyed. Note that dying prisons
3966 * are still considered valid. Invalid prisons won't be found under normal
3967 * circumstances, as they're only put in that state by functions that have
3968 * an exclusive hold on allprison_lock.
3969 */
3970 bool
prison_isvalid(struct prison * pr)3971 prison_isvalid(struct prison *pr)
3972 {
3973
3974 if (__predict_false(pr->pr_state == PRISON_STATE_INVALID))
3975 return (false);
3976 if (__predict_false(refcount_load(&pr->pr_ref) == 0))
3977 return (false);
3978 return (true);
3979 }
3980
/*
 * Return true if the passed credential is in a jail and that jail does not
 * have its own virtual network stack, otherwise false.
 */
bool
jailed_without_vnet(struct ucred *cred)
{
	bool restricted;

	restricted = jailed(cred);
#ifdef VIMAGE
	/* A jail that owns its vnet does not count. */
	if (restricted && prison_owns_vnet(cred->cr_prison))
		restricted = false;
#endif

	return (restricted);
}
3998
3999 /*
4000 * Return the correct hostname (domainname, et al) for the passed credential.
4001 */
4002 void
getcredhostname(struct ucred * cred,char * buf,size_t size)4003 getcredhostname(struct ucred *cred, char *buf, size_t size)
4004 {
4005 struct prison *pr;
4006
4007 /*
4008 * A NULL credential can be used to shortcut to the physical
4009 * system's hostname.
4010 */
4011 pr = (cred != NULL) ? cred->cr_prison : &prison0;
4012 mtx_lock(&pr->pr_mtx);
4013 strlcpy(buf, pr->pr_hostname, size);
4014 mtx_unlock(&pr->pr_mtx);
4015 }
4016
4017 void
getcreddomainname(struct ucred * cred,char * buf,size_t size)4018 getcreddomainname(struct ucred *cred, char *buf, size_t size)
4019 {
4020
4021 mtx_lock(&cred->cr_prison->pr_mtx);
4022 strlcpy(buf, cred->cr_prison->pr_domainname, size);
4023 mtx_unlock(&cred->cr_prison->pr_mtx);
4024 }
4025
4026 void
getcredhostuuid(struct ucred * cred,char * buf,size_t size)4027 getcredhostuuid(struct ucred *cred, char *buf, size_t size)
4028 {
4029
4030 mtx_lock(&cred->cr_prison->pr_mtx);
4031 strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
4032 mtx_unlock(&cred->cr_prison->pr_mtx);
4033 }
4034
4035 void
getcredhostid(struct ucred * cred,unsigned long * hostid)4036 getcredhostid(struct ucred *cred, unsigned long *hostid)
4037 {
4038
4039 mtx_lock(&cred->cr_prison->pr_mtx);
4040 *hostid = cred->cr_prison->pr_hostid;
4041 mtx_unlock(&cred->cr_prison->pr_mtx);
4042 }
4043
4044 void
getjailname(struct ucred * cred,char * name,size_t len)4045 getjailname(struct ucred *cred, char *name, size_t len)
4046 {
4047
4048 mtx_lock(&cred->cr_prison->pr_mtx);
4049 strlcpy(name, cred->cr_prison->pr_name, len);
4050 mtx_unlock(&cred->cr_prison->pr_mtx);
4051 }
4052
#ifdef VIMAGE
/*
 * Determine whether the prison owns its VNET, i.e. whether PR_VNET is
 * set in its flags.
 */
bool
prison_owns_vnet(struct prison *pr)
{

	/*
	 * vnets cannot be added/removed after jail creation,
	 * so no need to lock here.
	 */
	if ((pr->pr_flags & PR_VNET) == 0)
		return (false);
	return (true);
}
#endif
4068
4069 /*
4070 * Determine whether the subject represented by cred can "see"
4071 * status of a mount point.
4072 * Returns: 0 for permitted, ENOENT otherwise.
4073 * XXX: This function should be called cr_canseemount() and should be
4074 * placed in kern_prot.c.
4075 */
4076 int
prison_canseemount(struct ucred * cred,struct mount * mp)4077 prison_canseemount(struct ucred *cred, struct mount *mp)
4078 {
4079 struct prison *pr;
4080 struct statfs *sp;
4081 size_t len;
4082
4083 pr = cred->cr_prison;
4084 if (pr->pr_enforce_statfs == 0)
4085 return (0);
4086 if (pr->pr_root->v_mount == mp)
4087 return (0);
4088 if (pr->pr_enforce_statfs == 2)
4089 return (ENOENT);
4090 /*
4091 * If jail's chroot directory is set to "/" we should be able to see
4092 * all mount-points from inside a jail.
4093 * This is ugly check, but this is the only situation when jail's
4094 * directory ends with '/'.
4095 */
4096 if (strcmp(pr->pr_path, "/") == 0)
4097 return (0);
4098 len = strlen(pr->pr_path);
4099 sp = &mp->mnt_stat;
4100 if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
4101 return (ENOENT);
4102 /*
4103 * Be sure that we don't have situation where jail's root directory
4104 * is "/some/path" and mount point is "/some/pathpath".
4105 */
4106 if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
4107 return (ENOENT);
4108 return (0);
4109 }
4110
4111 void
prison_enforce_statfs(struct ucred * cred,struct mount * mp,struct statfs * sp)4112 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
4113 {
4114 char jpath[MAXPATHLEN];
4115 struct prison *pr;
4116 size_t len;
4117
4118 pr = cred->cr_prison;
4119 if (pr->pr_enforce_statfs == 0)
4120 return;
4121 if (prison_canseemount(cred, mp) != 0) {
4122 bzero(&sp->f_fsid, sizeof(sp->f_fsid));
4123 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
4124 strlcpy(sp->f_mntonname, "[restricted]",
4125 sizeof(sp->f_mntonname));
4126 return;
4127 }
4128 if (pr->pr_enforce_statfs > 1)
4129 bzero(&sp->f_fsid, sizeof(sp->f_fsid));
4130 if (pr->pr_root->v_mount == mp) {
4131 /*
4132 * Clear current buffer data, so we are sure nothing from
4133 * the valid path left there.
4134 */
4135 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
4136 *sp->f_mntonname = '/';
4137 return;
4138 }
4139 /*
4140 * If jail's chroot directory is set to "/" we should be able to see
4141 * all mount-points from inside a jail.
4142 */
4143 if (strcmp(pr->pr_path, "/") == 0)
4144 return;
4145 len = strlen(pr->pr_path);
4146 strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
4147 /*
4148 * Clear current buffer data, so we are sure nothing from
4149 * the valid path left there.
4150 */
4151 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
4152 if (*jpath == '\0') {
4153 /* Should never happen. */
4154 *sp->f_mntonname = '/';
4155 } else {
4156 strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
4157 }
4158 }
4159
4160 /*
 * Check whether permission for a specific privilege is granted within a
 * jail.  We have a specific list of accepted privileges; the rest are denied.
4163 */
4164 int
prison_priv_check(struct ucred * cred,int priv)4165 prison_priv_check(struct ucred *cred, int priv)
4166 {
4167 struct prison *pr;
4168 int error;
4169
4170 /*
4171 * Some policies have custom handlers. This routine should not be
4172 * called for them. See priv_check_cred().
4173 */
4174 switch (priv) {
4175 case PRIV_VFS_LOOKUP:
4176 case PRIV_VFS_GENERATION:
4177 KASSERT(0, ("prison_priv_check instead of a custom handler "
4178 "called for %d\n", priv));
4179 }
4180
4181 if (!jailed(cred))
4182 return (0);
4183
4184 #ifdef VIMAGE
4185 /*
4186 * Privileges specific to prisons with a virtual network stack.
4187 * There might be a duplicate entry here in case the privilege
4188 * is only granted conditionally in the legacy jail case.
4189 */
4190 switch (priv) {
4191 /*
4192 * NFS-specific privileges.
4193 */
4194 case PRIV_NFS_DAEMON:
4195 case PRIV_VFS_GETFH:
4196 case PRIV_VFS_MOUNT_EXPORTED:
4197 if (!prison_check_nfsd(cred))
4198 return (EPERM);
4199 #ifdef notyet
4200 case PRIV_NFS_LOCKD:
4201 #endif
4202 /*
4203 * Network stack privileges.
4204 */
4205 case PRIV_NET_BRIDGE:
4206 case PRIV_NET_GRE:
4207 case PRIV_NET_BPF:
4208 case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */
4209 case PRIV_NET_ROUTE:
4210 case PRIV_NET_TAP:
4211 case PRIV_NET_SETIFMTU:
4212 case PRIV_NET_SETIFFLAGS:
4213 case PRIV_NET_SETIFCAP:
4214 case PRIV_NET_SETIFDESCR:
4215 case PRIV_NET_SETIFNAME :
4216 case PRIV_NET_SETIFMETRIC:
4217 case PRIV_NET_SETIFPHYS:
4218 case PRIV_NET_SETIFMAC:
4219 case PRIV_NET_SETLANPCP:
4220 case PRIV_NET_ADDMULTI:
4221 case PRIV_NET_DELMULTI:
4222 case PRIV_NET_HWIOCTL:
4223 case PRIV_NET_SETLLADDR:
4224 case PRIV_NET_ADDIFGROUP:
4225 case PRIV_NET_DELIFGROUP:
4226 case PRIV_NET_IFCREATE:
4227 case PRIV_NET_IFDESTROY:
4228 case PRIV_NET_ADDIFADDR:
4229 case PRIV_NET_DELIFADDR:
4230 case PRIV_NET_LAGG:
4231 case PRIV_NET_GIF:
4232 case PRIV_NET_SETIFVNET:
4233 case PRIV_NET_SETIFFIB:
4234 case PRIV_NET_OVPN:
4235 case PRIV_NET_ME:
4236 case PRIV_NET_WG:
4237
4238 /*
4239 * 802.11-related privileges.
4240 */
4241 case PRIV_NET80211_VAP_GETKEY:
4242 case PRIV_NET80211_VAP_MANAGE:
4243
4244 #ifdef notyet
4245 /*
4246 * ATM privileges.
4247 */
4248 case PRIV_NETATM_CFG:
4249 case PRIV_NETATM_ADD:
4250 case PRIV_NETATM_DEL:
4251 case PRIV_NETATM_SET:
4252
4253 /*
4254 * Bluetooth privileges.
4255 */
4256 case PRIV_NETBLUETOOTH_RAW:
4257 #endif
4258
4259 /*
4260 * Netgraph and netgraph module privileges.
4261 */
4262 case PRIV_NETGRAPH_CONTROL:
4263 #ifdef notyet
4264 case PRIV_NETGRAPH_TTY:
4265 #endif
4266
4267 /*
4268 * IPv4 and IPv6 privileges.
4269 */
4270 case PRIV_NETINET_IPFW:
4271 case PRIV_NETINET_DIVERT:
4272 case PRIV_NETINET_PF:
4273 case PRIV_NETINET_DUMMYNET:
4274 case PRIV_NETINET_CARP:
4275 case PRIV_NETINET_MROUTE:
4276 case PRIV_NETINET_RAW:
4277 case PRIV_NETINET_ADDRCTRL6:
4278 case PRIV_NETINET_ND6:
4279 case PRIV_NETINET_SCOPE6:
4280 case PRIV_NETINET_ALIFETIME6:
4281 case PRIV_NETINET_IPSEC:
4282 case PRIV_NETINET_BINDANY:
4283
4284 #ifdef notyet
4285 /*
4286 * NCP privileges.
4287 */
4288 case PRIV_NETNCP:
4289
4290 /*
4291 * SMB privileges.
4292 */
4293 case PRIV_NETSMB:
4294 #endif
4295
4296 /*
4297 * No default: or deny here.
4298 * In case of no permit fall through to next switch().
4299 */
4300 if (cred->cr_prison->pr_flags & PR_VNET)
4301 return (0);
4302 }
4303 #endif /* VIMAGE */
4304
4305 switch (priv) {
4306 /*
4307 * Allow ktrace privileges for root in jail.
4308 */
4309 case PRIV_KTRACE:
4310
4311 /*
4312 * Allow jailed processes to configure audit identity and
4313 * submit audit records (login, etc). In the future we may
4314 * want to further refine the relationship between audit and
4315 * jail.
4316 */
4317 case PRIV_AUDIT_GETAUDIT:
4318 case PRIV_AUDIT_SETAUDIT:
4319 if (cred->cr_prison->pr_allow & PR_ALLOW_SETAUDIT)
4320 return (0);
4321 else
4322 return (EPERM);
4323 #if 0
4324 case PRIV_AUDIT_SUBMIT:
4325 #endif
4326
4327 /*
4328 * Allow jailed processes to manipulate process UNIX
4329 * credentials in any way they see fit.
4330 */
4331 case PRIV_CRED_SETCRED:
4332 case PRIV_CRED_SETUID:
4333 case PRIV_CRED_SETEUID:
4334 case PRIV_CRED_SETGID:
4335 case PRIV_CRED_SETEGID:
4336 case PRIV_CRED_SETGROUPS:
4337 case PRIV_CRED_SETREUID:
4338 case PRIV_CRED_SETREGID:
4339 case PRIV_CRED_SETRESUID:
4340 case PRIV_CRED_SETRESGID:
4341
4342 /*
4343 * Jail implements visibility constraints already, so allow
4344 * jailed root to override uid/gid-based constraints.
4345 */
4346 case PRIV_SEEOTHERGIDS:
4347 case PRIV_SEEOTHERUIDS:
4348 case PRIV_SEEJAILPROC:
4349
4350 /*
4351 * Jail implements inter-process debugging limits already, so
4352 * allow jailed root various debugging privileges.
4353 */
4354 case PRIV_DEBUG_DIFFCRED:
4355 case PRIV_DEBUG_SUGID:
4356 case PRIV_DEBUG_UNPRIV:
4357 case PRIV_DEBUG_DIFFJAIL:
4358
4359 /*
4360 * Allow jail to set various resource limits and login
4361 * properties, and for now, exceed process resource limits.
4362 */
4363 case PRIV_PROC_LIMIT:
4364 case PRIV_PROC_SETLOGIN:
4365 case PRIV_PROC_SETRLIMIT:
4366
4367 /*
4368 * Debuggers should work in jails.
4369 */
4370 case PRIV_PROC_MEM_WRITE:
4371
4372 /*
4373 * System V and POSIX IPC privileges are granted in jail.
4374 */
4375 case PRIV_IPC_READ:
4376 case PRIV_IPC_WRITE:
4377 case PRIV_IPC_ADMIN:
4378 case PRIV_IPC_MSGSIZE:
4379 case PRIV_MQ_ADMIN:
4380
4381 /*
4382 * Jail operations within a jail work on child jails.
4383 */
4384 case PRIV_JAIL_ATTACH:
4385 case PRIV_JAIL_SET:
4386 case PRIV_JAIL_REMOVE:
4387
4388 /*
4389 * Jail implements its own inter-process limits, so allow
4390 * root processes in jail to change scheduling on other
4391 * processes in the same jail. Likewise for signalling.
4392 */
4393 case PRIV_SCHED_DIFFCRED:
4394 case PRIV_SCHED_CPUSET:
4395 case PRIV_SCHED_DIFFJAIL:
4396 case PRIV_SIGNAL_DIFFCRED:
4397 case PRIV_SIGNAL_SUGID:
4398 case PRIV_SIGNAL_DIFFJAIL:
4399
4400 /*
4401 * Allow jailed processes to write to sysctls marked as jail
4402 * writable.
4403 */
4404 case PRIV_SYSCTL_WRITEJAIL:
4405
4406 /*
4407 * Allow root in jail to manage a variety of quota
4408 * properties. These should likely be conditional on a
4409 * configuration option.
4410 */
4411 case PRIV_VFS_GETQUOTA:
4412 case PRIV_VFS_SETQUOTA:
4413
4414 /*
4415 * Since Jail relies on chroot() to implement file system
4416 * protections, grant many VFS privileges to root in jail.
4417 * Be careful to exclude mount-related and NFS-related
4418 * privileges.
4419 */
4420 case PRIV_VFS_READ:
4421 case PRIV_VFS_WRITE:
4422 case PRIV_VFS_ADMIN:
4423 case PRIV_VFS_EXEC:
4424 case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */
4425 case PRIV_VFS_CHFLAGS_DEV:
4426 case PRIV_VFS_CHOWN:
4427 case PRIV_VFS_CHROOT:
4428 case PRIV_VFS_RETAINSUGID:
4429 case PRIV_VFS_FCHROOT:
4430 case PRIV_VFS_LINK:
4431 case PRIV_VFS_SETGID:
4432 case PRIV_VFS_STAT:
4433 case PRIV_VFS_STICKYFILE:
4434
4435 /*
4436 * As in the non-jail case, non-root users are expected to be
4437 * able to read kernel/physical memory (provided /dev/[k]mem
4438 * exists in the jail and they have permission to access it).
4439 */
4440 case PRIV_KMEM_READ:
4441 return (0);
4442
4443 /*
4444 * Depending on the global setting, allow privilege of
4445 * setting system flags.
4446 */
4447 case PRIV_VFS_SYSFLAGS:
4448 if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
4449 return (0);
4450 else
4451 return (EPERM);
4452
4453 /*
4454 * Depending on the global setting, allow privilege of
4455 * mounting/unmounting file systems.
4456 */
4457 case PRIV_VFS_MOUNT:
4458 case PRIV_VFS_UNMOUNT:
4459 case PRIV_VFS_MOUNT_NONUSER:
4460 case PRIV_VFS_MOUNT_OWNER:
4461 pr = cred->cr_prison;
4462 prison_lock(pr);
4463 if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2)
4464 error = 0;
4465 else
4466 error = EPERM;
4467 prison_unlock(pr);
4468 return (error);
4469
4470 /*
4471 * Jails should hold no disposition on the PRIV_VFS_READ_DIR
4472 * policy. priv_check_cred will not specifically allow it, and
4473 * we may want a MAC policy to allow it.
4474 */
4475 case PRIV_VFS_READ_DIR:
4476 return (0);
4477
4478 /*
4479 * Conditionally allow privileged process in the jail to
4480 * manipulate filesystem extended attributes in the system
4481 * namespace.
4482 */
4483 case PRIV_VFS_EXTATTR_SYSTEM:
4484 if ((cred->cr_prison->pr_allow & PR_ALLOW_EXTATTR) != 0)
4485 return (0);
4486 else
4487 return (EPERM);
4488
4489 /*
	 * Conditionally allow locking (unlocking) physical pages
4491 * in memory.
4492 */
4493 case PRIV_VM_MLOCK:
4494 case PRIV_VM_MUNLOCK:
4495 if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK)
4496 return (0);
4497 else
4498 return (EPERM);
4499
4500 /*
4501 * Conditionally allow jailed root to bind reserved ports.
4502 */
4503 case PRIV_NETINET_RESERVEDPORT:
4504 if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS)
4505 return (0);
4506 else
4507 return (EPERM);
4508
4509 /*
4510 * Allow jailed root to reuse in-use ports.
4511 */
4512 case PRIV_NETINET_REUSEPORT:
4513 return (0);
4514
4515 /*
4516 * Allow jailed root to set certain IPv4/6 (option) headers.
4517 */
4518 case PRIV_NETINET_SETHDROPTS:
4519 return (0);
4520
4521 /*
4522 * Conditionally allow creating raw sockets in jail.
4523 */
4524 case PRIV_NETINET_RAW:
4525 if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
4526 return (0);
4527 else
4528 return (EPERM);
4529
4530 /*
4531 * Since jail implements its own visibility limits on netstat
4532 * sysctls, allow getcred. This allows identd to work in
4533 * jail.
4534 */
4535 case PRIV_NETINET_GETCRED:
4536 return (0);
4537
4538 /*
4539 * Allow jailed root to set loginclass.
4540 */
4541 case PRIV_PROC_SETLOGINCLASS:
4542 return (0);
4543
4544 /*
4545 * Do not allow a process inside a jail to read the kernel
4546 * message buffer unless explicitly permitted.
4547 */
4548 case PRIV_MSGBUF:
4549 if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF)
4550 return (0);
4551 return (EPERM);
4552
4553 /*
4554 * Conditionally allow privileged process in the jail adjust
4555 * machine time.
4556 */
4557 case PRIV_ADJTIME:
4558 case PRIV_NTP_ADJTIME:
4559 if (cred->cr_prison->pr_allow &
4560 (PR_ALLOW_ADJTIME | PR_ALLOW_SETTIME)) {
4561 return (0);
4562 }
4563 return (EPERM);
4564
4565 /*
4566 * Conditionally allow privileged process in the jail set
4567 * machine time.
4568 */
4569 case PRIV_SETTIMEOFDAY:
4570 case PRIV_CLOCK_SETTIME:
4571 if (cred->cr_prison->pr_allow & PR_ALLOW_SETTIME)
4572 return (0);
4573 else
4574 return (EPERM);
4575
4576 /*
4577 * Conditionally allow privileged process in the jail to modify
4578 * the routing table.
4579 */
4580 case PRIV_NET_ROUTE:
4581 if (cred->cr_prison->pr_allow & PR_ALLOW_ROUTING)
4582 return (0);
4583 else
4584 return (EPERM);
4585
4586 default:
4587 /*
4588 * In all remaining cases, deny the privilege request. This
4589 * includes almost all network privileges, many system
4590 * configuration privileges.
4591 */
4592 return (EPERM);
4593 }
4594 }
4595
4596 /*
4597 * Return the part of pr2's name that is relative to pr1, or the whole name
4598 * if it does not directly follow.
4599 */
4600
4601 char *
prison_name(struct prison * pr1,struct prison * pr2)4602 prison_name(struct prison *pr1, struct prison *pr2)
4603 {
4604 char *name;
4605
4606 /* Jails see themselves as "0" (if they see themselves at all). */
4607 if (pr1 == pr2)
4608 return "0";
4609 name = pr2->pr_name;
4610 if (prison_ischild(pr1, pr2)) {
4611 /*
4612 * pr1 isn't locked (and allprison_lock may not be either)
4613 * so its length can't be counted on. But the number of dots
4614 * can be counted on - and counted.
4615 */
4616 for (; pr1 != &prison0; pr1 = pr1->pr_parent)
4617 name = strchr(name, '.') + 1;
4618 }
4619 return (name);
4620 }
4621
4622 /*
4623 * Return the part of pr2's path that is relative to pr1, or the whole path
4624 * if it does not directly follow.
4625 */
4626 static char *
prison_path(struct prison * pr1,struct prison * pr2)4627 prison_path(struct prison *pr1, struct prison *pr2)
4628 {
4629 char *path1, *path2;
4630 int len1;
4631
4632 path1 = pr1->pr_path;
4633 path2 = pr2->pr_path;
4634 if (!strcmp(path1, "/"))
4635 return (path2);
4636 len1 = strlen(path1);
4637 if (strncmp(path1, path2, len1))
4638 return (path2);
4639 if (path2[len1] == '\0')
4640 return "/";
4641 if (path2[len1] == '/')
4642 return (path2 + len1);
4643 return (path2);
4644 }
4645
/*
 * Jail-related sysctls.
 *
 * This declares the "jail" node beneath _security; the sysctl
 * declarations that follow in this file name _security_jail as their
 * parent.
 */
SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Jails");
4651
#if defined(INET) || defined(INET6)
/*
 * sysctl_jail_list() helper: copy the address array of family "af" from
 * "pr" into the scratch buffer *out, which will later be SYSCTL_OUT-ed.
 *
 * *len is the capacity of *out counted in addresses.  When the prison holds
 * more addresses than that, *len is raised to the prison's count and the
 * buffer is reallocated with the prison mutex dropped, after which the
 * address list is examined again from scratch.  Nothing is copied, and
 * *len is left untouched, when the prison has no address list for "af".
 *
 * The prison mutex must be owned on entry and is owned again on return.
 */
static void
prison_ip_copyout(struct prison *pr, const pr_family_t af, void **out, int *len)
{
	const struct prison_ip *cur;
	const size_t elsz = pr_families[af].size;

	for (;;) {
		mtx_assert(&pr->pr_mtx, MA_OWNED);
		cur = pr->pr_addrs[af];
		if (cur == NULL)
			return;
		if (*len >= cur->ips)
			break;

		/*
		 * Too small.  Record the wanted capacity first, since "cur"
		 * must not be used once the mutex has been released.
		 */
		*len = cur->ips;
		mtx_unlock(&pr->pr_mtx);
		*out = realloc(*out, *len * elsz, M_TEMP, M_WAITOK);
		mtx_lock(&pr->pr_mtx);
	}
	bcopy(cur->pr_ip, *out, cur->ips * elsz);
}
#endif
4677
4678 static int
sysctl_jail_list(SYSCTL_HANDLER_ARGS)4679 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
4680 {
4681 struct xprison *xp;
4682 struct prison *pr, *cpr;
4683 #ifdef INET
4684 struct in_addr *ip4 = NULL;
4685 int ip4s = 0;
4686 #endif
4687 #ifdef INET6
4688 struct in6_addr *ip6 = NULL;
4689 int ip6s = 0;
4690 #endif
4691 int descend, error;
4692
4693 xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
4694 pr = req->td->td_ucred->cr_prison;
4695 error = 0;
4696 sx_slock(&allprison_lock);
4697 FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
4698 mtx_lock(&cpr->pr_mtx);
4699 #ifdef INET
4700 prison_ip_copyout(cpr, PR_INET, (void **)&ip4, &ip4s);
4701 #endif
4702 #ifdef INET6
4703 prison_ip_copyout(cpr, PR_INET6, (void **)&ip6, &ip6s);
4704 #endif
4705 bzero(xp, sizeof(*xp));
4706 xp->pr_version = XPRISON_VERSION;
4707 xp->pr_id = cpr->pr_id;
4708 xp->pr_state = cpr->pr_state;
4709 strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
4710 strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
4711 strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
4712 #ifdef INET
4713 xp->pr_ip4s = ip4s;
4714 #endif
4715 #ifdef INET6
4716 xp->pr_ip6s = ip6s;
4717 #endif
4718 mtx_unlock(&cpr->pr_mtx);
4719 error = SYSCTL_OUT(req, xp, sizeof(*xp));
4720 if (error)
4721 break;
4722 #ifdef INET
4723 if (xp->pr_ip4s > 0) {
4724 error = SYSCTL_OUT(req, ip4,
4725 xp->pr_ip4s * sizeof(struct in_addr));
4726 if (error)
4727 break;
4728 }
4729 #endif
4730 #ifdef INET6
4731 if (xp->pr_ip6s > 0) {
4732 error = SYSCTL_OUT(req, ip6,
4733 xp->pr_ip6s * sizeof(struct in6_addr));
4734 if (error)
4735 break;
4736 }
4737 #endif
4738 }
4739 sx_sunlock(&allprison_lock);
4740 free(xp, M_TEMP);
4741 #ifdef INET
4742 free(ip4, M_TEMP);
4743 #endif
4744 #ifdef INET6
4745 free(ip6, M_TEMP);
4746 #endif
4747 return (error);
4748 }
4749
4750 SYSCTL_OID(_security_jail, OID_AUTO, list,
4751 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4752 sysctl_jail_list, "S", "List of active jails");
4753
4754 static int
sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)4755 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
4756 {
4757 int error, injail;
4758
4759 injail = jailed(req->td->td_ucred);
4760 error = SYSCTL_OUT(req, &injail, sizeof(injail));
4761
4762 return (error);
4763 }
4764
4765 SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
4766 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4767 sysctl_jail_jailed, "I", "Process in jail?");
4768
4769 static int
sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)4770 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
4771 {
4772 int error, havevnet;
4773 #ifdef VIMAGE
4774 struct ucred *cred = req->td->td_ucred;
4775
4776 havevnet = jailed(cred) && prison_owns_vnet(cred->cr_prison);
4777 #else
4778 havevnet = 0;
4779 #endif
4780 error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
4781
4782 return (error);
4783 }
4784
4785 SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
4786 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4787 sysctl_jail_vnet, "I", "Jail owns vnet?");
4788
#if defined(INET) || defined(INET6)
/*
 * security.jail.jail_max_af_ips: read/write view of the jail_max_af_ips
 * variable.  Only present when at least one of INET/INET6 is compiled in.
 */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
    &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family (deprecated)");
#endif
4794
4795 /*
4796 * Default parameters for jail(2) compatibility. For historical reasons,
4797 * the sysctl names have varying similarity to the parameter names. Prisons
4798 * just see their own parameters, and can't change them.
4799 */
4800 static int
sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)4801 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
4802 {
4803 int error, i;
4804
4805 /* Get the current flag value, and convert it to a boolean. */
4806 if (req->td->td_ucred->cr_prison == &prison0) {
4807 mtx_lock(&prison0.pr_mtx);
4808 i = (jail_default_allow & arg2) != 0;
4809 mtx_unlock(&prison0.pr_mtx);
4810 } else
4811 i = prison_allow(req->td->td_ucred, arg2);
4812
4813 if (arg1 != NULL)
4814 i = !i;
4815 error = sysctl_handle_int(oidp, &i, 0, req);
4816 if (error || !req->newptr)
4817 return (error);
4818 i = i ? arg2 : 0;
4819 if (arg1 != NULL)
4820 i ^= arg2;
4821 /*
4822 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
4823 * for writing.
4824 */
4825 mtx_lock(&prison0.pr_mtx);
4826 jail_default_allow = (jail_default_allow & ~arg2) | i;
4827 mtx_unlock(&prison0.pr_mtx);
4828 return (0);
4829 }
4830
4831 SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
4832 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4833 NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
4834 "Processes in jail can set their hostnames (deprecated)");
4835 SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
4836 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4837 (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
4838 "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
4839 SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
4840 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4841 NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
4842 "Processes in jail can use System V IPC primitives (deprecated)");
4843 SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
4844 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4845 NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
4846 "Prison root can create raw sockets (deprecated)");
4847 SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
4848 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4849 NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
4850 "Processes in jail can alter system file flags (deprecated)");
4851 SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
4852 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4853 NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
4854 "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
4855 SYSCTL_PROC(_security_jail, OID_AUTO, mlock_allowed,
4856 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4857 NULL, PR_ALLOW_MLOCK, sysctl_jail_default_allow, "I",
4858 "Processes in jail can lock/unlock physical pages in memory");
4859
4860 static int
sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)4861 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
4862 {
4863 struct prison *pr;
4864 int level, error;
4865
4866 pr = req->td->td_ucred->cr_prison;
4867 level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4868 error = sysctl_handle_int(oidp, &level, 0, req);
4869 if (error || !req->newptr)
4870 return (error);
4871 *(int *)arg1 = level;
4872 return (0);
4873 }
4874
4875 SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
4876 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4877 &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
4878 sysctl_jail_default_level, "I",
4879 "Processes in jail cannot see all mounted file systems (deprecated)");
4880
4881 SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
4882 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4883 &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
4884 sysctl_jail_default_level, "I",
4885 "Ruleset for the devfs filesystem in jail (deprecated)");
4886
4887 SYSCTL_NODE(_security_jail, OID_AUTO, children, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
4888 "Limits and stats of child jails");
4889
4890 static int
sysctl_jail_children(SYSCTL_HANDLER_ARGS)4891 sysctl_jail_children(SYSCTL_HANDLER_ARGS)
4892 {
4893 struct prison *pr;
4894 int i;
4895
4896 pr = req->td->td_ucred->cr_prison;
4897
4898 switch (oidp->oid_kind & CTLTYPE) {
4899 case CTLTYPE_INT:
4900 i = *(int *)((char *)pr + arg2);
4901 return (SYSCTL_OUT(req, &i, sizeof(i)));
4902 }
4903
4904 return (0);
4905 }
4906
4907 SYSCTL_PROC(_security_jail_children, OID_AUTO, max,
4908 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4909 NULL, offsetof(struct prison, pr_childmax), sysctl_jail_children,
4910 "I", "Maximum number of child jails");
4911 SYSCTL_PROC(_security_jail_children, OID_AUTO, cur,
4912 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4913 NULL, offsetof(struct prison, pr_childcount), sysctl_jail_children,
4914 "I", "Current number of child jails");
4915
4916 /*
4917 * Nodes to describe jail parameters. Maximum length of string parameters
4918 * is returned in the string itself, and the other parameters exist merely
4919 * to make themselves and their types known.
4920 */
4921 SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
4922 "Jail parameters");
4923
4924 int
sysctl_jail_param(SYSCTL_HANDLER_ARGS)4925 sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4926 {
4927 int i;
4928 long l;
4929 size_t s;
4930 char numbuf[12];
4931
4932 switch (oidp->oid_kind & CTLTYPE)
4933 {
4934 case CTLTYPE_LONG:
4935 case CTLTYPE_ULONG:
4936 l = 0;
4937 #ifdef SCTL_MASK32
4938 if (!(req->flags & SCTL_MASK32))
4939 #endif
4940 return (SYSCTL_OUT(req, &l, sizeof(l)));
4941 case CTLTYPE_INT:
4942 case CTLTYPE_UINT:
4943 i = 0;
4944 return (SYSCTL_OUT(req, &i, sizeof(i)));
4945 case CTLTYPE_STRING:
4946 snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
4947 return
4948 (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4949 case CTLTYPE_STRUCT:
4950 s = (size_t)arg2;
4951 return (SYSCTL_OUT(req, &s, sizeof(s)));
4952 }
4953 return (0);
4954 }
4955
/*
 * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
 * jail creation time but cannot be changed in an existing jail.
 *
 * NOTE(review): the SYSCTL_JAIL_PARAM* macros are defined outside this part
 * of the file; presumably they declare description nodes below
 * security.jail.param -- confirm against their definition.
 */

/* Top-level parameters (empty first macro argument). */
SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail secure level");
SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
    "Jail value for kern.osreldate and uname -K");
SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
    "Jail value for kern.osrelease and uname -r");
SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail cannot see all mounted file systems");
SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Ruleset for in-jail devfs mounts");
SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail persistence");
#ifdef VIMAGE
SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
    "E,jailsys", "Virtual network stack");
#endif
SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
    "B", "Jail is in the process of shutting down");

/* children.*: child jail accounting. */
SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
    "I", "Current number of child jails");
SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Maximum number of child jails");

/* host.*: per-jail host identity. */
SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail hostname");
SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail NIS domainname");
SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
    "Jail host UUID");
SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
    "LU", "Jail host ID");

/* cpuset.*: the jail's cpuset. */
SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");

/* ip4.* and ip6.*: only present when the address family is compiled in. */
#ifdef INET
SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
    "Jail IPv4 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
    "S,in_addr,a", "Jail IPv4 addresses");
SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv4 source address selection rather than the "
    "primary jail IPv4 address.");
#endif
#ifdef INET6
SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
    "Jail IPv6 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
    "S,in6_addr,a", "Jail IPv6 addresses");
SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv6 source address selection rather than the "
    "primary jail IPv6 address.");
#endif

/*
 * allow.*: statically declared permission flags.  Further allow.* entries
 * can be added at run time by prison_add_allow().
 */
SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set hostname");
SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may use SYSV IPC");
SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create raw sockets");
SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may alter system file flags");
SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set file quotas");
SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may lock (unlock) physical pages in memory");
SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may bind sockets to reserved ports");
SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may read the kernel message buffer");
SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Unprivileged processes may use process debugging facilities");
SYSCTL_JAIL_PARAM(_allow, unprivileged_parent_tampering,
    CTLTYPE_INT | CTLFLAG_RW, "B",
    "Unprivileged parent jail processes may tamper with same-uid processes"
    " (signal/debug/cpuset)");
SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Processes in jail with uid 0 have privilege");
#ifdef VIMAGE
SYSCTL_JAIL_PARAM(_allow, nfsd, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Mountd/nfsd may run in the jail");
#endif
SYSCTL_JAIL_PARAM(_allow, extattr, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set system-level filesystem extended attributes");
SYSCTL_JAIL_PARAM(_allow, adjtime, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may adjust system time");
SYSCTL_JAIL_PARAM(_allow, settime, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set system time");
SYSCTL_JAIL_PARAM(_allow, routing, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may modify routing table");
#ifdef AUDIT
SYSCTL_JAIL_PARAM(_allow, setaudit, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set and get audit session state");
#endif

/*
 * allow.mount: the general mount permission.  The empty name argument
 * declares the parameter on the subnode itself.
 */
SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount/unmount jail-friendly file systems in general");
5068
5069 /*
5070 * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>. Return
5071 * its associated bit in the pr_allow bitmask, or zero if the parameter was
5072 * not created.
5073 */
5074 unsigned
prison_add_allow(const char * prefix,const char * name,const char * prefix_descr,const char * descr)5075 prison_add_allow(const char *prefix, const char *name, const char *prefix_descr,
5076 const char *descr)
5077 {
5078 struct bool_flags *bf;
5079 struct sysctl_oid *parent;
5080 char *allow_name, *allow_noname, *allowed;
5081 #ifndef NO_SYSCTL_DESCR
5082 char *descr_deprecated;
5083 #endif
5084 u_int allow_flag;
5085
5086 if (prefix
5087 ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name)
5088 < 0 ||
5089 asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name)
5090 < 0
5091 : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 ||
5092 asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) {
5093 free(allow_name, M_PRISON);
5094 return 0;
5095 }
5096
5097 /*
5098 * See if this parameter has already beed added, i.e. a module was
5099 * previously loaded/unloaded.
5100 */
5101 mtx_lock(&prison0.pr_mtx);
5102 for (bf = pr_flag_allow;
5103 bf < pr_flag_allow + nitems(pr_flag_allow) &&
5104 atomic_load_int(&bf->flag) != 0;
5105 bf++) {
5106 if (strcmp(bf->name, allow_name) == 0) {
5107 allow_flag = bf->flag;
5108 goto no_add;
5109 }
5110 }
5111
5112 /*
5113 * Find a free bit in pr_allow_all, failing if there are none
5114 * (which shouldn't happen as long as we keep track of how many
5115 * potential dynamic flags exist).
5116 */
5117 for (allow_flag = 1;; allow_flag <<= 1) {
5118 if (allow_flag == 0)
5119 goto no_add;
5120 if ((pr_allow_all & allow_flag) == 0)
5121 break;
5122 }
5123
5124 /* Note the parameter in the next open slot in pr_flag_allow. */
5125 for (bf = pr_flag_allow; ; bf++) {
5126 if (bf == pr_flag_allow + nitems(pr_flag_allow)) {
5127 /* This should never happen, but is not fatal. */
5128 allow_flag = 0;
5129 goto no_add;
5130 }
5131 if (atomic_load_int(&bf->flag) == 0)
5132 break;
5133 }
5134 bf->name = allow_name;
5135 bf->noname = allow_noname;
5136 pr_allow_all |= allow_flag;
5137 /*
5138 * prison0 always has permission for the new parameter.
5139 * Other jails must have it granted to them.
5140 */
5141 prison0.pr_allow |= allow_flag;
5142 /* The flag indicates a valid entry, so make sure it is set last. */
5143 atomic_store_rel_int(&bf->flag, allow_flag);
5144 mtx_unlock(&prison0.pr_mtx);
5145
5146 /*
5147 * Create sysctls for the parameter, and the back-compat global
5148 * permission.
5149 */
5150 parent = prefix
5151 ? SYSCTL_ADD_NODE(NULL,
5152 SYSCTL_CHILDREN(&sysctl___security_jail_param_allow),
5153 OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr)
5154 : &sysctl___security_jail_param_allow;
5155 (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
5156 name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
5157 NULL, 0, sysctl_jail_param, "B", descr);
5158 if ((prefix
5159 ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name)
5160 : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) {
5161 #ifndef NO_SYSCTL_DESCR
5162 (void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)",
5163 descr);
5164 #endif
5165 (void)SYSCTL_ADD_PROC(NULL,
5166 SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed,
5167 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag,
5168 sysctl_jail_default_allow, "I", descr_deprecated);
5169 #ifndef NO_SYSCTL_DESCR
5170 free(descr_deprecated, M_TEMP);
5171 #endif
5172 free(allowed, M_TEMP);
5173 }
5174 return allow_flag;
5175
5176 no_add:
5177 mtx_unlock(&prison0.pr_mtx);
5178 free(allow_name, M_PRISON);
5179 free(allow_noname, M_PRISON);
5180 return allow_flag;
5181 }
5182
5183 /*
5184 * The VFS system will register jail-aware filesystems here. They each get
5185 * a parameter allow.mount.xxxfs and a flag to check when a jailed user
5186 * attempts to mount.
5187 */
5188 void
prison_add_vfs(struct vfsconf * vfsp)5189 prison_add_vfs(struct vfsconf *vfsp)
5190 {
5191 #ifdef NO_SYSCTL_DESCR
5192
5193 vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
5194 NULL, NULL);
5195 #else
5196 char *descr;
5197
5198 (void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system",
5199 vfsp->vfc_name);
5200 vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
5201 NULL, descr);
5202 free(descr, M_TEMP);
5203 #endif
5204 }
5205
5206 #ifdef RACCT
5207 void
prison_racct_foreach(void (* callback)(struct racct * racct,void * arg2,void * arg3),void (* pre)(void),void (* post)(void),void * arg2,void * arg3)5208 prison_racct_foreach(void (*callback)(struct racct *racct,
5209 void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
5210 void *arg2, void *arg3)
5211 {
5212 struct prison_racct *prr;
5213
5214 ASSERT_RACCT_ENABLED();
5215
5216 sx_slock(&allprison_lock);
5217 if (pre != NULL)
5218 (pre)();
5219 LIST_FOREACH(prr, &allprison_racct, prr_next)
5220 (callback)(prr->prr_racct, arg2, arg3);
5221 if (post != NULL)
5222 (post)();
5223 sx_sunlock(&allprison_lock);
5224 }
5225
5226 static struct prison_racct *
prison_racct_find_locked(const char * name)5227 prison_racct_find_locked(const char *name)
5228 {
5229 struct prison_racct *prr;
5230
5231 ASSERT_RACCT_ENABLED();
5232 sx_assert(&allprison_lock, SA_XLOCKED);
5233
5234 if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
5235 return (NULL);
5236
5237 LIST_FOREACH(prr, &allprison_racct, prr_next) {
5238 if (strcmp(name, prr->prr_name) != 0)
5239 continue;
5240
5241 /* Found prison_racct with a matching name? */
5242 prison_racct_hold(prr);
5243 return (prr);
5244 }
5245
5246 /* Add new prison_racct. */
5247 prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
5248 racct_create(&prr->prr_racct);
5249
5250 strcpy(prr->prr_name, name);
5251 refcount_init(&prr->prr_refcount, 1);
5252 LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
5253
5254 return (prr);
5255 }
5256
5257 struct prison_racct *
prison_racct_find(const char * name)5258 prison_racct_find(const char *name)
5259 {
5260 struct prison_racct *prr;
5261
5262 ASSERT_RACCT_ENABLED();
5263
5264 sx_xlock(&allprison_lock);
5265 prr = prison_racct_find_locked(name);
5266 sx_xunlock(&allprison_lock);
5267 return (prr);
5268 }
5269
5270 void
prison_racct_hold(struct prison_racct * prr)5271 prison_racct_hold(struct prison_racct *prr)
5272 {
5273
5274 ASSERT_RACCT_ENABLED();
5275
5276 refcount_acquire(&prr->prr_refcount);
5277 }
5278
5279 static void
prison_racct_free_locked(struct prison_racct * prr)5280 prison_racct_free_locked(struct prison_racct *prr)
5281 {
5282
5283 ASSERT_RACCT_ENABLED();
5284 sx_assert(&allprison_lock, SA_XLOCKED);
5285
5286 if (refcount_release(&prr->prr_refcount)) {
5287 racct_destroy(&prr->prr_racct);
5288 LIST_REMOVE(prr, prr_next);
5289 free(prr, M_PRISON_RACCT);
5290 }
5291 }
5292
5293 void
prison_racct_free(struct prison_racct * prr)5294 prison_racct_free(struct prison_racct *prr)
5295 {
5296
5297 ASSERT_RACCT_ENABLED();
5298 sx_assert(&allprison_lock, SA_UNLOCKED);
5299
5300 if (refcount_release_if_not_last(&prr->prr_refcount))
5301 return;
5302
5303 sx_xlock(&allprison_lock);
5304 prison_racct_free_locked(prr);
5305 sx_xunlock(&allprison_lock);
5306 }
5307
5308 static void
prison_racct_attach(struct prison * pr)5309 prison_racct_attach(struct prison *pr)
5310 {
5311 struct prison_racct *prr;
5312
5313 ASSERT_RACCT_ENABLED();
5314 sx_assert(&allprison_lock, SA_XLOCKED);
5315
5316 prr = prison_racct_find_locked(pr->pr_name);
5317 KASSERT(prr != NULL, ("cannot find prison_racct"));
5318
5319 pr->pr_prison_racct = prr;
5320 }
5321
5322 /*
5323 * Handle jail renaming. From the racct point of view, renaming means
5324 * moving from one prison_racct to another.
5325 */
5326 static void
prison_racct_modify(struct prison * pr)5327 prison_racct_modify(struct prison *pr)
5328 {
5329 #ifdef RCTL
5330 struct proc *p;
5331 struct ucred *cred;
5332 #endif
5333 struct prison_racct *oldprr;
5334
5335 ASSERT_RACCT_ENABLED();
5336
5337 sx_slock(&allproc_lock);
5338 sx_xlock(&allprison_lock);
5339
5340 if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
5341 sx_xunlock(&allprison_lock);
5342 sx_sunlock(&allproc_lock);
5343 return;
5344 }
5345
5346 oldprr = pr->pr_prison_racct;
5347 pr->pr_prison_racct = NULL;
5348
5349 prison_racct_attach(pr);
5350
5351 /*
5352 * Move resource utilisation records.
5353 */
5354 racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
5355
5356 #ifdef RCTL
5357 /*
5358 * Force rctl to reattach rules to processes.
5359 */
5360 FOREACH_PROC_IN_SYSTEM(p) {
5361 PROC_LOCK(p);
5362 cred = crhold(p->p_ucred);
5363 PROC_UNLOCK(p);
5364 rctl_proc_ucred_changed(p, cred);
5365 crfree(cred);
5366 }
5367 #endif
5368
5369 sx_sunlock(&allproc_lock);
5370 prison_racct_free_locked(oldprr);
5371 sx_xunlock(&allprison_lock);
5372 }
5373
5374 static void
prison_racct_detach(struct prison * pr)5375 prison_racct_detach(struct prison *pr)
5376 {
5377
5378 ASSERT_RACCT_ENABLED();
5379 sx_assert(&allprison_lock, SA_UNLOCKED);
5380
5381 if (pr->pr_prison_racct == NULL)
5382 return;
5383 prison_racct_free(pr->pr_prison_racct);
5384 pr->pr_prison_racct = NULL;
5385 }
5386 #endif /* RACCT */
5387
5388 /*
5389 * Submit a knote for a prison, locking if necessary.
5390 */
5391 static void
prison_knote(struct prison * pr,long hint)5392 prison_knote(struct prison *pr, long hint)
5393 {
5394 int locked;
5395
5396 locked = mtx_owned(&pr->pr_mtx);
5397 if (!locked)
5398 mtx_lock(&pr->pr_mtx);
5399 KNOTE_LOCKED(pr->pr_klist, hint);
5400 jaildesc_knote(pr, hint);
5401 if (!locked)
5402 mtx_unlock(&pr->pr_mtx);
5403 }
5404
5405 #ifdef DDB
5406
5407 static void
db_show_prison(struct prison * pr)5408 db_show_prison(struct prison *pr)
5409 {
5410 struct bool_flags *bf;
5411 struct jailsys_flags *jsf;
5412 #if defined(INET) || defined(INET6)
5413 int ii;
5414 struct prison_ip *pip;
5415 #endif
5416 unsigned f;
5417 #ifdef INET
5418 char ip4buf[INET_ADDRSTRLEN];
5419 #endif
5420 #ifdef INET6
5421 char ip6buf[INET6_ADDRSTRLEN];
5422 #endif
5423
5424 db_printf("prison %p:\n", pr);
5425 db_printf(" jid = %d\n", pr->pr_id);
5426 db_printf(" name = %s\n", pr->pr_name);
5427 db_printf(" parent = %p\n", pr->pr_parent);
5428 db_printf(" ref = %d\n", pr->pr_ref);
5429 db_printf(" uref = %d\n", pr->pr_uref);
5430 db_printf(" state = %s\n",
5431 pr->pr_state == PRISON_STATE_ALIVE ? "alive" :
5432 pr->pr_state == PRISON_STATE_DYING ? "dying" :
5433 "invalid");
5434 db_printf(" path = %s\n", pr->pr_path);
5435 db_printf(" cpuset = %d\n", pr->pr_cpuset
5436 ? pr->pr_cpuset->cs_id : -1);
5437 #ifdef VIMAGE
5438 db_printf(" vnet = %p\n", pr->pr_vnet);
5439 #endif
5440 db_printf(" root = %p\n", pr->pr_root);
5441 db_printf(" securelevel = %d\n", pr->pr_securelevel);
5442 db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum);
5443 db_printf(" children.max = %d\n", pr->pr_childmax);
5444 db_printf(" children.cur = %d\n", pr->pr_childcount);
5445 db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children));
5446 db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling));
5447 db_printf(" flags = 0x%x", pr->pr_flags);
5448 for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++)
5449 if (pr->pr_flags & bf->flag)
5450 db_printf(" %s", bf->name);
5451 for (jsf = pr_flag_jailsys;
5452 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
5453 jsf++) {
5454 f = pr->pr_flags & (jsf->disable | jsf->new);
5455 db_printf(" %-16s= %s\n", jsf->name,
5456 (f != 0 && f == jsf->disable) ? "disable"
5457 : (f == jsf->new) ? "new"
5458 : "inherit");
5459 }
5460 db_printf(" allow = 0x%x", pr->pr_allow);
5461 for (bf = pr_flag_allow;
5462 bf < pr_flag_allow + nitems(pr_flag_allow) &&
5463 atomic_load_int(&bf->flag) != 0;
5464 bf++)
5465 if (pr->pr_allow & bf->flag)
5466 db_printf(" %s", bf->name);
5467 db_printf("\n");
5468 db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs);
5469 db_printf(" host.hostname = %s\n", pr->pr_hostname);
5470 db_printf(" host.domainname = %s\n", pr->pr_domainname);
5471 db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid);
5472 db_printf(" host.hostid = %lu\n", pr->pr_hostid);
5473 #ifdef INET
5474 if ((pip = pr->pr_addrs[PR_INET]) != NULL) {
5475 db_printf(" ip4s = %d\n", pip->ips);
5476 for (ii = 0; ii < pip->ips; ii++)
5477 db_printf(" %s %s\n",
5478 ii == 0 ? "ip4.addr =" : " ",
5479 inet_ntoa_r(
5480 *(const struct in_addr *)PR_IP(pip, PR_INET, ii),
5481 ip4buf));
5482 }
5483 #endif
5484 #ifdef INET6
5485 if ((pip = pr->pr_addrs[PR_INET6]) != NULL) {
5486 db_printf(" ip6s = %d\n", pip->ips);
5487 for (ii = 0; ii < pip->ips; ii++)
5488 db_printf(" %s %s\n",
5489 ii == 0 ? "ip6.addr =" : " ",
5490 ip6_sprintf(ip6buf,
5491 (const struct in6_addr *)PR_IP(pip, PR_INET6, ii)));
5492 }
5493 #endif
5494 }
5495
DB_SHOW_COMMAND(prison,db_show_prison_command)5496 DB_SHOW_COMMAND(prison, db_show_prison_command)
5497 {
5498 struct prison *pr;
5499
5500 if (!have_addr) {
5501 /*
5502 * Show all prisons in the list, and prison0 which is not
5503 * listed.
5504 */
5505 db_show_prison(&prison0);
5506 if (!db_pager_quit) {
5507 TAILQ_FOREACH(pr, &allprison, pr_list) {
5508 db_show_prison(pr);
5509 if (db_pager_quit)
5510 break;
5511 }
5512 }
5513 return;
5514 }
5515
5516 if (addr == 0)
5517 pr = &prison0;
5518 else {
5519 /* Look for a prison with the ID and with references. */
5520 TAILQ_FOREACH(pr, &allprison, pr_list)
5521 if (pr->pr_id == addr && pr->pr_ref > 0)
5522 break;
5523 if (pr == NULL)
5524 /* Look again, without requiring a reference. */
5525 TAILQ_FOREACH(pr, &allprison, pr_list)
5526 if (pr->pr_id == addr)
5527 break;
5528 if (pr == NULL)
5529 /* Assume address points to a valid prison. */
5530 pr = (struct prison *)addr;
5531 }
5532 db_show_prison(pr);
5533 }
5534
5535 #endif /* DDB */
5536