1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 1999 Poul-Henning Kamp. 5 * Copyright (c) 2008 Bjoern A. Zeeb. 6 * Copyright (c) 2009 James Gritton. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <sys/cdefs.h> 32 #include "opt_ddb.h" 33 #include "opt_inet.h" 34 #include "opt_inet6.h" 35 #include "opt_nfs.h" 36 37 #include <sys/param.h> 38 #include <sys/types.h> 39 #include <sys/ctype.h> 40 #include <sys/kernel.h> 41 #include <sys/systm.h> 42 #include <sys/errno.h> 43 #include <sys/file.h> 44 #include <sys/sysproto.h> 45 #include <sys/malloc.h> 46 #include <sys/osd.h> 47 #include <sys/priv.h> 48 #include <sys/proc.h> 49 #include <sys/epoch.h> 50 #include <sys/event.h> 51 #include <sys/taskqueue.h> 52 #include <sys/fcntl.h> 53 #include <sys/jail.h> 54 #include <sys/jaildesc.h> 55 #include <sys/linker.h> 56 #include <sys/lock.h> 57 #include <sys/mman.h> 58 #include <sys/mutex.h> 59 #include <sys/racct.h> 60 #include <sys/rctl.h> 61 #include <sys/refcount.h> 62 #include <sys/sx.h> 63 #include <sys/sysent.h> 64 #include <sys/namei.h> 65 #include <sys/mount.h> 66 #include <sys/queue.h> 67 #include <sys/socket.h> 68 #include <sys/syscallsubr.h> 69 #include <sys/sysctl.h> 70 #include <sys/uuid.h> 71 #include <sys/vnode.h> 72 73 #include <net/if.h> 74 #include <net/vnet.h> 75 76 #include <netinet/in.h> 77 78 #ifdef DDB 79 #include <ddb/ddb.h> 80 #endif /* DDB */ 81 82 #include <security/mac/mac_framework.h> 83 #include <security/mac/mac_policy.h> 84 #include <security/mac/mac_syscalls.h> 85 86 #define PRISON0_HOSTUUID_MODULE "hostuuid" 87 88 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); 89 #ifdef RACCT 90 static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures"); 91 #endif 92 93 /* Keep struct prison prison0 and some code in kern_jail_set() readable. */ 94 #ifdef INET 95 #ifdef INET6 96 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL 97 #else 98 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL 99 #endif 100 #else /* !INET */ 101 #ifdef INET6 102 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL 103 #else 104 #define _PR_IP_SADDRSEL 0 105 #endif 106 #endif 107 108 /* prison0 describes what is "real" about the system. */ 109 struct prison prison0 = { 110 .pr_id = 0, 111 .pr_name = "0", 112 .pr_ref = 1, 113 .pr_uref = 1, 114 .pr_path = "/", 115 .pr_securelevel = -1, 116 .pr_devfs_rsnum = 0, 117 .pr_state = PRISON_STATE_ALIVE, 118 .pr_childmax = JAIL_MAX, 119 .pr_hostuuid = DEFAULT_HOSTUUID, 120 .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), 121 #ifdef VIMAGE 122 .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, 123 #else 124 .pr_flags = PR_HOST|_PR_IP_SADDRSEL, 125 #endif 126 .pr_allow = PR_ALLOW_PRISON0, 127 }; 128 _Static_assert((PR_ALLOW_PRISON0 & ~PR_ALLOW_ALL_STATIC) == 0, 129 "Bits enabled in PR_ALLOW_PRISON0 that are not statically reserved"); 130 131 MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF); 132 133 struct bool_flags { 134 const char *name; 135 const char *noname; 136 volatile u_int flag; 137 }; 138 struct jailsys_flags { 139 const char *name; 140 unsigned disable; 141 unsigned new; 142 }; 143 144 /* 145 * Handle jail teardown in a dedicated thread to avoid deadlocks from 146 * vnet_destroy(). 147 */ 148 TASKQUEUE_DEFINE_THREAD(jail_remove); 149 150 /* allprison, allprison_racct and lastprid are protected by allprison_lock. */ 151 struct sx allprison_lock; 152 SX_SYSINIT(allprison_lock, &allprison_lock, "allprison"); 153 struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); 154 LIST_HEAD(, prison_racct) allprison_racct; 155 int lastprid = 0; 156 int lastdeadid = 0; 157 158 static int get_next_prid(struct prison **insprp); 159 static int get_next_deadid(struct prison **insprp); 160 static int do_jail_attach(struct thread *td, struct prison *pr, int drflags); 161 static void prison_complete(void *context, int pending); 162 static void prison_deref(struct prison *pr, int flags); 163 static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison); 164 static int prison_lock_xlock(struct prison *pr, int flags); 165 static void prison_cleanup_locked(struct prison *pr); 166 static void prison_cleanup_unlocked(struct prison *pr); 167 static void prison_free_not_last(struct prison *pr); 168 static void prison_proc_free_not_last(struct prison *pr); 169 static void prison_proc_relink(struct prison *opr, struct prison *npr, 170 struct proc *p); 171 static void prison_set_allow_locked(struct prison *pr, unsigned flag, 172 int enable); 173 static char *prison_path(struct prison *pr1, struct prison *pr2); 174 #ifdef RACCT 175 static void prison_racct_attach(struct prison *pr); 176 static void prison_racct_modify(struct prison *pr); 177 static void prison_racct_detach(struct prison *pr); 178 #endif 179 static void prison_knote(struct prison *pr, long hint); 180 181 /* Flags for prison_deref */ 182 #define PD_DEREF 0x01 /* Decrement pr_ref */ 183 #define PD_DEUREF 0x02 /* Decrement pr_uref */ 184 #define PD_KILL 0x04 /* Remove jail, kill processes, etc */ 185 #define PD_LOCKED 0x10 /* pr_mtx is held */ 186 #define PD_LIST_SLOCKED 0x20 /* allprison_lock is held shared */ 187 #define PD_LIST_XLOCKED 0x40 /* allprison_lock is held exclusive */ 188 #define PD_OP_FLAGS 0x07 /* Operation flags */ 189 #define PD_LOCK_FLAGS 0x70 /* Lock status flags */ 190 191 /* 192 * Parameter names corresponding to PR_* flag values. Size values are for kvm 193 * as we cannot figure out the size of a sparse array, or an array without a 194 * terminating entry. 195 */ 196 static struct bool_flags pr_flag_bool[] = { 197 {"persist", "nopersist", PR_PERSIST}, 198 #ifdef INET 199 {"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL}, 200 #endif 201 #ifdef INET6 202 {"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL}, 203 #endif 204 }; 205 const size_t pr_flag_bool_size = sizeof(pr_flag_bool); 206 207 static struct jailsys_flags pr_flag_jailsys[] = { 208 {"host", 0, PR_HOST}, 209 #ifdef VIMAGE 210 {"vnet", 0, PR_VNET}, 211 #endif 212 #ifdef INET 213 {"ip4", PR_IP4_USER, PR_IP4_USER}, 214 #endif 215 #ifdef INET6 216 {"ip6", PR_IP6_USER, PR_IP6_USER}, 217 #endif 218 }; 219 const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys); 220 221 /* 222 * Make this array full-size so dynamic parameters can be added. 223 * It is protected by prison0.mtx, but lockless reading is allowed 224 * with an atomic check of the flag values. 225 */ 226 static struct bool_flags pr_flag_allow[NBBY * NBPW] = { 227 {"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME}, 228 {"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC}, 229 {"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS}, 230 {"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS}, 231 {"allow.mount", "allow.nomount", PR_ALLOW_MOUNT}, 232 {"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS}, 233 {"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF}, 234 {"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK}, 235 {"allow.reserved_ports", "allow.noreserved_ports", 236 PR_ALLOW_RESERVED_PORTS}, 237 {"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF}, 238 {"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug", 239 PR_ALLOW_UNPRIV_DEBUG}, 240 {"allow.suser", "allow.nosuser", PR_ALLOW_SUSER}, 241 #ifdef VIMAGE 242 {"allow.nfsd", "allow.nonfsd", PR_ALLOW_NFSD}, 243 #endif 244 {"allow.extattr", "allow.noextattr", PR_ALLOW_EXTATTR}, 245 {"allow.adjtime", "allow.noadjtime", PR_ALLOW_ADJTIME}, 246 {"allow.settime", "allow.nosettime", PR_ALLOW_SETTIME}, 247 {"allow.routing", "allow.norouting", PR_ALLOW_ROUTING}, 248 {"allow.unprivileged_parent_tampering", 249 "allow.nounprivileged_parent_tampering", 250 PR_ALLOW_UNPRIV_PARENT_TAMPER}, 251 #ifdef AUDIT 252 {"allow.setaudit", "allow.nosetaudit", PR_ALLOW_SETAUDIT}, 253 #endif 254 }; 255 static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC; 256 const size_t pr_flag_allow_size = sizeof(pr_flag_allow); 257 258 #define JAIL_DEFAULT_ALLOW (PR_ALLOW_SET_HOSTNAME | \ 259 PR_ALLOW_RESERVED_PORTS | \ 260 PR_ALLOW_UNPRIV_DEBUG | \ 261 PR_ALLOW_SUSER) 262 #define JAIL_DEFAULT_ENFORCE_STATFS 2 263 #define JAIL_DEFAULT_DEVFS_RSNUM 0 264 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW; 265 static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS; 266 static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM; 267 #if defined(INET) || defined(INET6) 268 static unsigned jail_max_af_ips = 255; 269 #endif 270 271 /* 272 * Initialize the parts of prison0 that can't be static-initialized with 273 * constants. This is called from proc0_init() after creating thread0 cpuset. 274 */ 275 void 276 prison0_init(void) 277 { 278 uint8_t *file, *data; 279 size_t size; 280 char buf[sizeof(prison0.pr_hostuuid)]; 281 #ifdef MAC 282 int error __diagused; 283 #endif 284 bool valid; 285 286 #ifdef MAC 287 error = mac_prison_init(&prison0, M_WAITOK); 288 MPASS(error == 0); 289 290 mtx_unlock(&prison0.pr_mtx); 291 #endif 292 prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset); 293 prison0.pr_osreldate = osreldate; 294 strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease)); 295 296 /* If we have a preloaded hostuuid, use it. */ 297 file = preload_search_by_type(PRISON0_HOSTUUID_MODULE); 298 if (file != NULL) { 299 data = preload_fetch_addr(file); 300 size = preload_fetch_size(file); 301 if (data != NULL) { 302 /* 303 * The preloaded data may include trailing whitespace, almost 304 * certainly a newline; skip over any whitespace or 305 * non-printable characters to be safe. 306 */ 307 while (size > 0 && data[size - 1] <= 0x20) { 308 size--; 309 } 310 311 valid = false; 312 313 /* 314 * Not NUL-terminated when passed from loader, but 315 * validate_uuid requires that due to using sscanf (as 316 * does the subsequent strlcpy, since it still reads 317 * past the given size to return the true length); 318 * bounce to a temporary buffer to fix. 319 */ 320 if (size >= sizeof(buf)) 321 goto done; 322 323 memcpy(buf, data, size); 324 buf[size] = '\0'; 325 326 if (validate_uuid(buf, size, NULL, 0) != 0) 327 goto done; 328 329 valid = true; 330 (void)strlcpy(prison0.pr_hostuuid, buf, 331 sizeof(prison0.pr_hostuuid)); 332 333 done: 334 if (bootverbose && !valid) { 335 printf("hostuuid: preload data malformed: '%.*s'\n", 336 (int)size, data); 337 } 338 } 339 } 340 if (bootverbose) 341 printf("hostuuid: using %s\n", prison0.pr_hostuuid); 342 } 343 344 /* 345 * struct jail_args { 346 * struct jail *jail; 347 * }; 348 */ 349 int 350 sys_jail(struct thread *td, struct jail_args *uap) 351 { 352 uint32_t version; 353 int error; 354 struct jail j; 355 356 error = copyin(uap->jail, &version, sizeof(uint32_t)); 357 if (error) 358 return (error); 359 360 switch (version) { 361 case 0: 362 { 363 struct jail_v0 j0; 364 365 /* FreeBSD single IPv4 jails. */ 366 bzero(&j, sizeof(struct jail)); 367 error = copyin(uap->jail, &j0, sizeof(struct jail_v0)); 368 if (error) 369 return (error); 370 j.version = j0.version; 371 j.path = j0.path; 372 j.hostname = j0.hostname; 373 j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */ 374 break; 375 } 376 377 case 1: 378 /* 379 * Version 1 was used by multi-IPv4 jail implementations 380 * that never made it into the official kernel. 381 */ 382 return (EINVAL); 383 384 case 2: /* JAIL_API_VERSION */ 385 /* FreeBSD multi-IPv4/IPv6,noIP jails. */ 386 error = copyin(uap->jail, &j, sizeof(struct jail)); 387 if (error) 388 return (error); 389 break; 390 391 default: 392 /* Sci-Fi jails are not supported, sorry. */ 393 return (EINVAL); 394 } 395 return (kern_jail(td, &j)); 396 } 397 398 int 399 kern_jail(struct thread *td, struct jail *j) 400 { 401 struct iovec optiov[2 * (4 + nitems(pr_flag_allow) 402 #ifdef INET 403 + 1 404 #endif 405 #ifdef INET6 406 + 1 407 #endif 408 )]; 409 struct uio opt; 410 char *u_path, *u_hostname, *u_name; 411 struct bool_flags *bf; 412 #ifdef INET 413 uint32_t ip4s; 414 struct in_addr *u_ip4; 415 #endif 416 #ifdef INET6 417 struct in6_addr *u_ip6; 418 #endif 419 size_t tmplen; 420 int error, enforce_statfs; 421 422 bzero(&optiov, sizeof(optiov)); 423 opt.uio_iov = optiov; 424 opt.uio_iovcnt = 0; 425 opt.uio_offset = -1; 426 opt.uio_resid = -1; 427 opt.uio_segflg = UIO_SYSSPACE; 428 opt.uio_rw = UIO_READ; 429 opt.uio_td = td; 430 431 /* Set permissions for top-level jails from sysctls. */ 432 if (!jailed(td->td_ucred)) { 433 for (bf = pr_flag_allow; 434 bf < pr_flag_allow + nitems(pr_flag_allow) && 435 atomic_load_int(&bf->flag) != 0; 436 bf++) { 437 optiov[opt.uio_iovcnt].iov_base = __DECONST(char *, 438 (jail_default_allow & bf->flag) 439 ? bf->name : bf->noname); 440 optiov[opt.uio_iovcnt].iov_len = 441 strlen(optiov[opt.uio_iovcnt].iov_base) + 1; 442 opt.uio_iovcnt += 2; 443 } 444 optiov[opt.uio_iovcnt].iov_base = "enforce_statfs"; 445 optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs"); 446 opt.uio_iovcnt++; 447 enforce_statfs = jail_default_enforce_statfs; 448 optiov[opt.uio_iovcnt].iov_base = &enforce_statfs; 449 optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs); 450 opt.uio_iovcnt++; 451 } 452 453 tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; 454 #ifdef INET 455 ip4s = (j->version == 0) ? 1 : j->ip4s; 456 if (ip4s > jail_max_af_ips) 457 return (EINVAL); 458 tmplen += ip4s * sizeof(struct in_addr); 459 #else 460 if (j->ip4s > 0) 461 return (EINVAL); 462 #endif 463 #ifdef INET6 464 if (j->ip6s > jail_max_af_ips) 465 return (EINVAL); 466 tmplen += j->ip6s * sizeof(struct in6_addr); 467 #else 468 if (j->ip6s > 0) 469 return (EINVAL); 470 #endif 471 u_path = malloc(tmplen, M_TEMP, M_WAITOK); 472 u_hostname = u_path + MAXPATHLEN; 473 u_name = u_hostname + MAXHOSTNAMELEN; 474 #ifdef INET 475 u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); 476 #endif 477 #ifdef INET6 478 #ifdef INET 479 u_ip6 = (struct in6_addr *)(u_ip4 + ip4s); 480 #else 481 u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); 482 #endif 483 #endif 484 optiov[opt.uio_iovcnt].iov_base = "path"; 485 optiov[opt.uio_iovcnt].iov_len = sizeof("path"); 486 opt.uio_iovcnt++; 487 optiov[opt.uio_iovcnt].iov_base = u_path; 488 error = copyinstr(j->path, u_path, MAXPATHLEN, 489 &optiov[opt.uio_iovcnt].iov_len); 490 if (error) { 491 free(u_path, M_TEMP); 492 return (error); 493 } 494 opt.uio_iovcnt++; 495 optiov[opt.uio_iovcnt].iov_base = "host.hostname"; 496 optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname"); 497 opt.uio_iovcnt++; 498 optiov[opt.uio_iovcnt].iov_base = u_hostname; 499 error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN, 500 &optiov[opt.uio_iovcnt].iov_len); 501 if (error) { 502 free(u_path, M_TEMP); 503 return (error); 504 } 505 opt.uio_iovcnt++; 506 if (j->jailname != NULL) { 507 optiov[opt.uio_iovcnt].iov_base = "name"; 508 optiov[opt.uio_iovcnt].iov_len = sizeof("name"); 509 opt.uio_iovcnt++; 510 optiov[opt.uio_iovcnt].iov_base = u_name; 511 error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN, 512 &optiov[opt.uio_iovcnt].iov_len); 513 if (error) { 514 free(u_path, M_TEMP); 515 return (error); 516 } 517 opt.uio_iovcnt++; 518 } 519 #ifdef INET 520 optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; 521 optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); 522 opt.uio_iovcnt++; 523 optiov[opt.uio_iovcnt].iov_base = u_ip4; 524 optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr); 525 if (j->version == 0) 526 u_ip4->s_addr = j->ip4s; 527 else { 528 error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); 529 if (error) { 530 free(u_path, M_TEMP); 531 return (error); 532 } 533 } 534 opt.uio_iovcnt++; 535 #endif 536 #ifdef INET6 537 optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; 538 optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr"); 539 opt.uio_iovcnt++; 540 optiov[opt.uio_iovcnt].iov_base = u_ip6; 541 optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr); 542 error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); 543 if (error) { 544 free(u_path, M_TEMP); 545 return (error); 546 } 547 opt.uio_iovcnt++; 548 #endif 549 KASSERT(opt.uio_iovcnt <= nitems(optiov), 550 ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt)); 551 error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); 552 free(u_path, M_TEMP); 553 return (error); 554 } 555 556 /* 557 * struct jail_set_args { 558 * struct iovec *iovp; 559 * unsigned int iovcnt; 560 * int flags; 561 * }; 562 */ 563 int 564 sys_jail_set(struct thread *td, struct jail_set_args *uap) 565 { 566 struct uio *auio; 567 int error; 568 569 /* Check that we have an even number of iovecs. */ 570 if (uap->iovcnt & 1) 571 return (EINVAL); 572 573 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 574 if (error) 575 return (error); 576 error = kern_jail_set(td, auio, uap->flags); 577 freeuio(auio); 578 return (error); 579 } 580 581 #if defined(INET) || defined(INET6) 582 typedef int prison_addr_cmp_t(const void *, const void *); 583 typedef bool prison_addr_valid_t(const void *); 584 static const struct pr_family { 585 size_t size; 586 prison_addr_cmp_t *cmp; 587 prison_addr_valid_t *valid; 588 int ip_flag; 589 } pr_families[PR_FAMILY_MAX] = { 590 #ifdef INET 591 [PR_INET] = { 592 .size = sizeof(struct in_addr), 593 .cmp = prison_qcmp_v4, 594 .valid = prison_valid_v4, 595 .ip_flag = PR_IP4_USER, 596 }, 597 #endif 598 #ifdef INET6 599 [PR_INET6] = { 600 .size = sizeof(struct in6_addr), 601 .cmp = prison_qcmp_v6, 602 .valid = prison_valid_v6, 603 .ip_flag = PR_IP6_USER, 604 }, 605 #endif 606 }; 607 608 /* 609 * Network address lists (pr_addrs) allocation for jails. The addresses 610 * are accessed locklessly by the network stack, thus need to be protected by 611 * the network epoch. 612 */ 613 struct prison_ip { 614 struct epoch_context ctx; 615 uint32_t ips; 616 #ifdef FUTURE_C 617 /* 618 * XXX Variable-length automatic arrays in union may be 619 * supported in future C. 620 */ 621 union { 622 char pr_ip[]; 623 struct in_addr pr_ip4[]; 624 struct in6_addr pr_ip6[]; 625 }; 626 #else /* No future C :( */ 627 char pr_ip[]; 628 #endif 629 }; 630 631 static char * 632 PR_IP(struct prison_ip *pip, const pr_family_t af, int idx) 633 { 634 MPASS(pip); 635 MPASS(af < PR_FAMILY_MAX); 636 MPASS(idx >= 0 && idx < pip->ips); 637 638 return (pip->pr_ip + pr_families[af].size * idx); 639 } 640 641 static struct prison_ip * 642 prison_ip_alloc(const pr_family_t af, uint32_t cnt, int flags) 643 { 644 struct prison_ip *pip; 645 646 pip = malloc(sizeof(struct prison_ip) + cnt * pr_families[af].size, 647 M_PRISON, flags); 648 if (pip != NULL) 649 pip->ips = cnt; 650 return (pip); 651 } 652 653 /* 654 * Allocate and copyin user supplied address list, sorting and validating. 655 * kern_jail_set() helper. 656 */ 657 static struct prison_ip * 658 prison_ip_copyin(const pr_family_t af, void *op, uint32_t cnt) 659 { 660 prison_addr_cmp_t *const cmp = pr_families[af].cmp; 661 const size_t size = pr_families[af].size; 662 struct prison_ip *pip; 663 664 pip = prison_ip_alloc(af, cnt, M_WAITOK); 665 bcopy(op, pip->pr_ip, cnt * size); 666 /* 667 * IP addresses are all sorted but ip[0] to preserve 668 * the primary IP address as given from userland. 669 * This special IP is used for unbound outgoing 670 * connections as well for "loopback" traffic in case 671 * source address selection cannot find any more fitting 672 * address to connect from. 673 */ 674 if (cnt > 1) 675 qsort(PR_IP(pip, af, 1), cnt - 1, size, cmp); 676 /* 677 * Check for duplicate addresses and do some simple 678 * zero and broadcast checks. If users give other bogus 679 * addresses it is their problem. 680 */ 681 for (int i = 0; i < cnt; i++) { 682 if (!pr_families[af].valid(PR_IP(pip, af, i))) { 683 free(pip, M_PRISON); 684 return (NULL); 685 } 686 if (i + 1 < cnt && 687 (cmp(PR_IP(pip, af, 0), PR_IP(pip, af, i + 1)) == 0 || 688 cmp(PR_IP(pip, af, i), PR_IP(pip, af, i + 1)) == 0)) { 689 free(pip, M_PRISON); 690 return (NULL); 691 } 692 } 693 694 return (pip); 695 } 696 697 /* 698 * Allocate and dup parent prison address list. 699 * kern_jail_set() helper. 700 */ 701 static void 702 prison_ip_dup(struct prison *ppr, struct prison *pr, const pr_family_t af) 703 { 704 const struct prison_ip *ppip = ppr->pr_addrs[af]; 705 struct prison_ip *pip; 706 707 if (ppip != NULL) { 708 pip = prison_ip_alloc(af, ppip->ips, M_WAITOK); 709 bcopy(ppip->pr_ip, pip->pr_ip, pip->ips * pr_families[af].size); 710 pr->pr_addrs[af] = pip; 711 } 712 } 713 714 /* 715 * Make sure the new set of IP addresses is a subset of the parent's list. 716 * Don't worry about the parent being unlocked, as any setting is done with 717 * allprison_lock held. 718 * kern_jail_set() helper. 719 */ 720 static bool 721 prison_ip_parent_match(struct prison_ip *ppip, struct prison_ip *pip, 722 const pr_family_t af) 723 { 724 prison_addr_cmp_t *const cmp = pr_families[af].cmp; 725 int i, j; 726 727 if (ppip == NULL) 728 return (false); 729 730 for (i = 0; i < ppip->ips; i++) 731 if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, i)) == 0) 732 break; 733 734 if (i == ppip->ips) 735 /* Main address not present in parent. */ 736 return (false); 737 738 if (pip->ips > 1) { 739 for (i = j = 1; i < pip->ips; i++) { 740 if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0) 741 /* Equals to parent primary address. */ 742 continue; 743 for (; j < ppip->ips; j++) 744 if (cmp(PR_IP(pip, af, i), 745 PR_IP(ppip, af, j)) == 0) 746 break; 747 if (j == ppip->ips) 748 break; 749 } 750 if (j == ppip->ips) 751 /* Address not present in parent. */ 752 return (false); 753 } 754 return (true); 755 } 756 757 /* 758 * Check for conflicting IP addresses. We permit them if there is no more 759 * than one IP on each jail. If there is a duplicate on a jail with more 760 * than one IP stop checking and return error. 761 * kern_jail_set() helper. 762 */ 763 static bool 764 prison_ip_conflict_check(const struct prison *ppr, const struct prison *pr, 765 struct prison_ip *pip, pr_family_t af) 766 { 767 const struct prison *tppr, *tpr; 768 int descend; 769 770 #ifdef VIMAGE 771 for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent) 772 if (tppr->pr_flags & PR_VNET) 773 break; 774 #else 775 tppr = &prison0; 776 #endif 777 FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { 778 if (tpr == pr || 779 #ifdef VIMAGE 780 (tpr != tppr && (tpr->pr_flags & PR_VNET)) || 781 #endif 782 !prison_isalive(tpr)) { 783 descend = 0; 784 continue; 785 } 786 if (!(tpr->pr_flags & pr_families[af].ip_flag)) 787 continue; 788 descend = 0; 789 if (tpr->pr_addrs[af] == NULL || 790 (pip->ips == 1 && tpr->pr_addrs[af]->ips == 1)) 791 continue; 792 for (int i = 0; i < pip->ips; i++) 793 if (prison_ip_check(tpr, af, PR_IP(pip, af, i)) == 0) 794 return (false); 795 } 796 797 return (true); 798 } 799 800 _Static_assert(offsetof(struct prison_ip, ctx) == 0, 801 "prison must start with epoch context"); 802 static void 803 prison_ip_free_deferred(epoch_context_t ctx) 804 { 805 806 free(ctx, M_PRISON); 807 } 808 809 static void 810 prison_ip_free(struct prison_ip *pip) 811 { 812 813 if (pip != NULL) 814 NET_EPOCH_CALL(prison_ip_free_deferred, &pip->ctx); 815 } 816 817 static void 818 prison_ip_set(struct prison *pr, const pr_family_t af, struct prison_ip *new) 819 { 820 struct prison_ip **mem, *old; 821 822 mtx_assert(&pr->pr_mtx, MA_OWNED); 823 824 mem = &pr->pr_addrs[af]; 825 826 old = *mem; 827 atomic_store_ptr(mem, new); 828 prison_ip_free(old); 829 } 830 831 /* 832 * Restrict a prison's IP address list with its parent's, possibly replacing 833 * it. Return true if succeed, otherwise should redo. 834 * kern_jail_set() helper. 835 */ 836 static bool 837 prison_ip_restrict(struct prison *pr, const pr_family_t af, 838 struct prison_ip **newp) 839 { 840 struct prison_ip *ppip = pr->pr_parent->pr_addrs[af]; 841 struct prison_ip *pip = pr->pr_addrs[af]; 842 int (*const cmp)(const void *, const void *) = pr_families[af].cmp; 843 const size_t size = pr_families[af].size; 844 struct prison_ip *new = newp != NULL ? *newp : NULL; 845 uint32_t ips; 846 847 mtx_assert(&pr->pr_mtx, MA_OWNED); 848 849 /* 850 * Due to epoch-synchronized access to the IP address lists we always 851 * allocate a new list even if the old one has enough space. We could 852 * atomically update an IPv4 address inside a list, but that would 853 * screw up sorting, and in case of IPv6 we can't even atomically write 854 * one. 855 */ 856 if (ppip == NULL) { 857 if (pip != NULL) 858 prison_ip_set(pr, af, NULL); 859 return (true); 860 } 861 862 if (!(pr->pr_flags & pr_families[af].ip_flag)) { 863 if (new == NULL) { 864 new = prison_ip_alloc(af, ppip->ips, M_NOWAIT); 865 if (new == NULL) 866 return (false); /* Redo */ 867 } 868 /* This has no user settings, so just copy the parent's list. */ 869 MPASS(new->ips == ppip->ips); 870 bcopy(ppip->pr_ip, new->pr_ip, ppip->ips * size); 871 prison_ip_set(pr, af, new); 872 if (newp != NULL) 873 *newp = NULL; /* Used */ 874 } else if (pip != NULL) { 875 /* Remove addresses that aren't in the parent. */ 876 int i; 877 878 i = 0; /* index in pip */ 879 ips = 0; /* index in new */ 880 881 if (new == NULL) { 882 new = prison_ip_alloc(af, pip->ips, M_NOWAIT); 883 if (new == NULL) 884 return (false); /* Redo */ 885 } 886 887 for (int pi = 0; pi < ppip->ips; pi++) 888 if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, pi)) == 0) { 889 /* Found our primary address in parent. */ 890 bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips), 891 size); 892 i++; 893 ips++; 894 break; 895 } 896 for (int pi = 1; i < pip->ips; ) { 897 /* Check against primary, which is unsorted. */ 898 if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0) { 899 /* Matches parent's primary address. */ 900 bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips), 901 size); 902 i++; 903 ips++; 904 continue; 905 } 906 /* The rest are sorted. */ 907 switch (pi >= ppip->ips ? -1 : 908 cmp(PR_IP(pip, af, i), PR_IP(ppip, af, pi))) { 909 case -1: 910 i++; 911 break; 912 case 0: 913 bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips), 914 size); 915 i++; 916 pi++; 917 ips++; 918 break; 919 case 1: 920 pi++; 921 break; 922 } 923 } 924 if (ips == 0) { 925 if (newp == NULL || *newp == NULL) 926 prison_ip_free(new); 927 new = NULL; 928 } else { 929 /* Shrink to real size */ 930 KASSERT((new->ips >= ips), 931 ("Out-of-bounds write to prison_ip %p", new)); 932 new->ips = ips; 933 } 934 prison_ip_set(pr, af, new); 935 if (newp != NULL) 936 *newp = NULL; /* Used */ 937 } 938 return (true); 939 } 940 941 /* 942 * Fast-path check if an address belongs to a prison. 943 */ 944 int 945 prison_ip_check(const struct prison *pr, const pr_family_t af, 946 const void *addr) 947 { 948 int (*const cmp)(const void *, const void *) = pr_families[af].cmp; 949 struct prison_ip *pip; 950 int i, a, z, d; 951 952 MPASS(mtx_owned(&pr->pr_mtx) || 953 in_epoch(net_epoch_preempt) || 954 sx_xlocked(&allprison_lock)); 955 956 pip = atomic_load_ptr(&pr->pr_addrs[af]); 957 if (__predict_false(pip == NULL)) 958 return (EAFNOSUPPORT); 959 960 /* Check the primary IP. */ 961 if (cmp(PR_IP(pip, af, 0), addr) == 0) 962 return (0); 963 964 /* 965 * All the other IPs are sorted so we can do a binary search. 966 */ 967 a = 0; 968 z = pip->ips - 2; 969 while (a <= z) { 970 i = (a + z) / 2; 971 d = cmp(PR_IP(pip, af, i + 1), addr); 972 if (d > 0) 973 z = i - 1; 974 else if (d < 0) 975 a = i + 1; 976 else 977 return (0); 978 } 979 980 return (EADDRNOTAVAIL); 981 } 982 983 /* 984 * Grab primary IP. Historically required mutex, but nothing prevents 985 * us to support epoch-protected access. Is it used in fast path? 986 * in{6}_jail.c helper 987 */ 988 const void * 989 prison_ip_get0(const struct prison *pr, const pr_family_t af) 990 { 991 const struct prison_ip *pip = pr->pr_addrs[af]; 992 993 mtx_assert(&pr->pr_mtx, MA_OWNED); 994 MPASS(pip); 995 996 return (pip->pr_ip); 997 } 998 999 u_int 1000 prison_ip_cnt(const struct prison *pr, const pr_family_t af) 1001 { 1002 1003 return (pr->pr_addrs[af]->ips); 1004 } 1005 #endif /* defined(INET) || defined(INET6) */ 1006 1007 int 1008 kern_jail_set(struct thread *td, struct uio *optuio, int flags) 1009 { 1010 struct file *jfp_out; 1011 struct nameidata nd; 1012 #ifdef INET 1013 struct prison_ip *ip4; 1014 #endif 1015 #ifdef INET6 1016 struct prison_ip *ip6; 1017 #endif 1018 struct vfsopt *opt; 1019 struct vfsoptlist *opts; 1020 struct prison *pr, *deadpr, *dinspr, *inspr, *mypr, *ppr, *tpr; 1021 struct ucred *jdcred; 1022 struct vnode *root; 1023 char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; 1024 char *g_path, *osrelstr; 1025 struct bool_flags *bf; 1026 struct jailsys_flags *jsf; 1027 #if defined(INET) || defined(INET6) 1028 void *op; 1029 #endif 1030 unsigned long hid; 1031 size_t namelen, onamelen, pnamelen; 1032 #ifdef MAC 1033 void *mac_set_prison_data = NULL; 1034 int gotmaclabel; 1035 #endif 1036 int created, cuflags, descend, drflags, enforce; 1037 int error, errmsg_len, errmsg_pos; 1038 int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; 1039 int deadid, jfd_in, jfd_out, jfd_pos, jid, jsys, len, level; 1040 int childmax, osreldt, rsnum, slevel; 1041 #ifdef INET 1042 int ip4s; 1043 bool redo_ip4; 1044 #endif 1045 #ifdef INET6 1046 int ip6s; 1047 bool redo_ip6; 1048 #endif 1049 bool maybe_changed; 1050 uint64_t pr_allow, ch_allow, pr_flags, ch_flags; 1051 uint64_t pr_allow_diff; 1052 unsigned tallow; 1053 char numbuf[12]; 1054 1055 mypr = td->td_ucred->cr_prison; 1056 if (((flags & (JAIL_CREATE | JAIL_AT_DESC)) == JAIL_CREATE) && 1057 mypr->pr_childmax == 0) 1058 return (EPERM); 1059 if (flags & ~JAIL_SET_MASK) 1060 return (EINVAL); 1061 if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) == 1062 (JAIL_USE_DESC | JAIL_AT_DESC)) 1063 return (EINVAL); 1064 prison_hold(mypr); 1065 1066 #ifdef INET 1067 ip4 = NULL; 1068 #endif 1069 #ifdef INET6 1070 ip6 = NULL; 1071 #endif 1072 g_path = NULL; 1073 jfp_out = NULL; 1074 jfd_out = -1; 1075 /* 1076 * Check all the parameters before committing to anything. Not all 1077 * errors can be caught early, but we may as well try. Also, this 1078 * takes care of some expensive stuff (path lookup) before getting 1079 * the allprison lock. 1080 * 1081 * XXX Jails are not filesystems, and jail parameters are not mount 1082 * options. But it makes more sense to re-use the vfsopt code 1083 * than duplicate it under a different name. 1084 */ 1085 error = vfs_buildopts(optuio, &opts); 1086 if (error) { 1087 opts = NULL; 1088 goto done_free; 1089 } 1090 1091 cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); 1092 if (!cuflags) { 1093 error = EINVAL; 1094 vfs_opterror(opts, "no valid operation (create or update)"); 1095 goto done_errmsg; 1096 } 1097 1098 error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in)); 1099 if (error == ENOENT) { 1100 if (flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | 1101 JAIL_OWN_DESC)) { 1102 vfs_opterror(opts, "missing desc"); 1103 goto done_errmsg; 1104 } 1105 jfd_in = -1; 1106 } else if (error != 0) 1107 goto done_free; 1108 else { 1109 if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | 1110 JAIL_OWN_DESC))) { 1111 error = EINVAL; 1112 vfs_opterror(opts, "unexpected desc"); 1113 goto done_errmsg; 1114 } 1115 if (flags & JAIL_AT_DESC) { 1116 /* 1117 * Look up and create jails based on the 1118 * descriptor's prison. 1119 */ 1120 prison_free(mypr); 1121 error = jaildesc_find(td, jfd_in, &mypr, NULL); 1122 if (error != 0) { 1123 vfs_opterror(opts, error == ENOENT ? 1124 "descriptor to dead jail" : 1125 "not a jail descriptor"); 1126 goto done_errmsg; 1127 } 1128 if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) { 1129 error = EPERM; 1130 goto done_free; 1131 } 1132 } 1133 if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) { 1134 /* Allocate a jail descriptor to return later. */ 1135 error = jaildesc_alloc(td, &jfp_out, &jfd_out, 1136 flags & JAIL_OWN_DESC); 1137 if (error) 1138 goto done_free; 1139 } 1140 } 1141 1142 /* 1143 * Delay the permission check if using a jail descriptor, 1144 * until we get the descriptor's credentials. 1145 */ 1146 if (!(flags & JAIL_USE_DESC)) { 1147 error = priv_check(td, PRIV_JAIL_SET); 1148 if (error == 0 && (flags & JAIL_ATTACH)) 1149 error = priv_check(td, PRIV_JAIL_ATTACH); 1150 if (error) 1151 goto done_free; 1152 } 1153 1154 error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); 1155 if (error == ENOENT) 1156 jid = 0; 1157 else if (error != 0) 1158 goto done_free; 1159 1160 error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel)); 1161 if (error == ENOENT) 1162 gotslevel = 0; 1163 else if (error != 0) 1164 goto done_free; 1165 else 1166 gotslevel = 1; 1167 1168 error = 1169 vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax)); 1170 if (error == ENOENT) 1171 gotchildmax = 0; 1172 else if (error != 0) 1173 goto done_free; 1174 else 1175 gotchildmax = 1; 1176 1177 error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); 1178 if (error == ENOENT) 1179 gotenforce = 0; 1180 else if (error != 0) 1181 goto done_free; 1182 else if (enforce < 0 || enforce > 2) { 1183 error = EINVAL; 1184 goto done_free; 1185 } else 1186 gotenforce = 1; 1187 1188 error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum)); 1189 if (error == ENOENT) 1190 gotrsnum = 0; 1191 else if (error != 0) 1192 goto done_free; 1193 else 1194 gotrsnum = 1; 1195 1196 pr_flags = ch_flags = 0; 1197 for (bf = pr_flag_bool; 1198 bf < pr_flag_bool + nitems(pr_flag_bool); 1199 bf++) { 1200 vfs_flagopt(opts, bf->name, &pr_flags, bf->flag); 1201 vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag); 1202 } 1203 ch_flags |= pr_flags; 1204 for (jsf = pr_flag_jailsys; 1205 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); 1206 jsf++) { 1207 error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys)); 1208 if (error == ENOENT) 1209 continue; 1210 if (error != 0) 1211 goto done_free; 1212 switch (jsys) { 1213 case JAIL_SYS_DISABLE: 1214 if (!jsf->disable) { 1215 error = EINVAL; 1216 goto done_free; 1217 } 1218 pr_flags |= jsf->disable; 1219 break; 1220 case JAIL_SYS_NEW: 1221 pr_flags |= jsf->new; 1222 break; 1223 case JAIL_SYS_INHERIT: 1224 break; 1225 default: 1226 error = EINVAL; 1227 goto done_free; 1228 } 1229 ch_flags |= jsf->new | jsf->disable; 1230 } 1231 if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE 1232 && !(pr_flags & PR_PERSIST)) { 1233 error = EINVAL; 1234 vfs_opterror(opts, "new jail must persist or attach"); 1235 goto done_errmsg; 1236 } 1237 #ifdef VIMAGE 1238 if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) { 1239 error = EINVAL; 1240 vfs_opterror(opts, "vnet cannot be changed after creation"); 1241 goto done_errmsg; 1242 } 1243 #endif 1244 #ifdef INET 1245 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { 1246 error = EINVAL; 1247 vfs_opterror(opts, "ip4 cannot be changed after creation"); 1248 goto done_errmsg; 1249 } 1250 #endif 1251 #ifdef INET6 1252 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) { 1253 error = EINVAL; 1254 vfs_opterror(opts, "ip6 cannot be changed after creation"); 1255 goto done_errmsg; 1256 } 1257 #endif 1258 1259 pr_allow = ch_allow = 0; 1260 for (bf = pr_flag_allow; 1261 bf < pr_flag_allow + nitems(pr_flag_allow) && 1262 atomic_load_int(&bf->flag) != 0; 1263 bf++) { 1264 vfs_flagopt(opts, bf->name, &pr_allow, bf->flag); 1265 vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag); 1266 } 1267 ch_allow |= pr_allow; 1268 1269 error = vfs_getopt(opts, "name", (void **)&name, &len); 1270 if (error == ENOENT) 1271 name = NULL; 1272 else if (error != 0) 1273 goto done_free; 1274 else { 1275 if (len == 0 || name[len - 1] != '\0') { 1276 error = EINVAL; 1277 goto done_free; 1278 } 1279 if (len > MAXHOSTNAMELEN) { 1280 error = ENAMETOOLONG; 1281 goto done_free; 1282 } 1283 } 1284 1285 error = vfs_getopt(opts, "host.hostname", (void **)&host, &len); 1286 if (error == ENOENT) 1287 host = NULL; 1288 else if (error != 0) 1289 goto done_free; 1290 else { 1291 ch_flags |= PR_HOST; 1292 pr_flags |= PR_HOST; 1293 if (len == 0 || host[len - 1] != '\0') { 1294 error = EINVAL; 1295 goto done_free; 1296 } 1297 if (len > MAXHOSTNAMELEN) { 1298 error = ENAMETOOLONG; 1299 goto done_free; 1300 } 1301 } 1302 1303 error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len); 1304 if (error == ENOENT) 1305 domain = NULL; 1306 else if (error != 0) 1307 goto done_free; 1308 else { 1309 ch_flags |= PR_HOST; 1310 pr_flags |= PR_HOST; 1311 if (len == 0 || domain[len - 1] != '\0') { 1312 error = EINVAL; 1313 goto done_free; 1314 } 1315 if (len > MAXHOSTNAMELEN) { 1316 error = ENAMETOOLONG; 1317 goto done_free; 1318 } 1319 } 1320 1321 error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len); 1322 if (error == ENOENT) 1323 uuid = NULL; 1324 else if (error != 0) 1325 goto done_free; 1326 else { 1327 ch_flags |= PR_HOST; 1328 pr_flags |= PR_HOST; 1329 if (len == 0 || uuid[len - 1] != '\0') { 1330 error = EINVAL; 1331 goto done_free; 1332 } 1333 if (len > HOSTUUIDLEN) { 1334 error = ENAMETOOLONG; 1335 goto done_free; 1336 } 1337 } 1338 1339 #ifdef COMPAT_FREEBSD32 1340 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { 1341 uint32_t hid32; 1342 1343 error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32)); 1344 hid = hid32; 1345 } else 1346 #endif 1347 error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid)); 1348 if (error == ENOENT) 1349 gothid = 0; 1350 else if (error != 0) 1351 goto done_free; 1352 else { 1353 gothid = 1; 1354 ch_flags |= PR_HOST; 1355 pr_flags |= PR_HOST; 1356 } 1357 1358 #ifdef MAC 1359 /* Process the mac.label vfsopt */ 1360 error = mac_set_prison_prepare(td, opts, &mac_set_prison_data); 1361 if (error == ENOENT) 1362 gotmaclabel = 0; 1363 else if (error != 0) 1364 goto done_errmsg; 1365 else 1366 gotmaclabel = 1; 1367 #endif 1368 1369 #ifdef INET 1370 error = vfs_getopt(opts, "ip4.addr", &op, &ip4s); 1371 if (error == ENOENT) 1372 ip4s = 0; 1373 else if (error != 0) 1374 goto done_free; 1375 else if (ip4s & (sizeof(struct in_addr) - 1)) { 1376 error = EINVAL; 1377 goto done_free; 1378 } else { 1379 ch_flags |= PR_IP4_USER; 1380 pr_flags |= PR_IP4_USER; 1381 if (ip4s > 0) { 1382 ip4s /= sizeof(struct in_addr); 1383 if (ip4s > jail_max_af_ips) { 1384 error = EINVAL; 1385 vfs_opterror(opts, "too many IPv4 addresses"); 1386 goto done_errmsg; 1387 } 1388 ip4 = prison_ip_copyin(PR_INET, op, ip4s); 1389 if (ip4 == NULL) { 1390 error = EINVAL; 1391 goto done_free; 1392 } 1393 } 1394 } 1395 #endif 1396 1397 #ifdef INET6 1398 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s); 1399 if (error == ENOENT) 1400 ip6s = 0; 1401 else if (error != 0) 1402 goto done_free; 1403 else if (ip6s & (sizeof(struct in6_addr) - 1)) { 1404 error = EINVAL; 1405 goto done_free; 1406 } else { 1407 ch_flags |= PR_IP6_USER; 1408 pr_flags |= PR_IP6_USER; 1409 if (ip6s > 0) { 1410 ip6s /= sizeof(struct in6_addr); 1411 if (ip6s > jail_max_af_ips) { 1412 error = EINVAL; 1413 vfs_opterror(opts, "too many IPv6 addresses"); 1414 goto done_errmsg; 1415 } 1416 ip6 = prison_ip_copyin(PR_INET6, op, ip6s); 1417 if (ip6 == NULL) { 1418 error = EINVAL; 1419 goto done_free; 1420 } 1421 } 1422 } 1423 #endif 1424 1425 #if defined(VIMAGE) && (defined(INET) || defined(INET6)) 1426 if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { 1427 error = EINVAL; 1428 vfs_opterror(opts, 1429 "vnet jails cannot have IP address restrictions"); 1430 goto done_errmsg; 1431 } 1432 #endif 1433 1434 error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len); 1435 if (error == ENOENT) 1436 osrelstr = NULL; 1437 else if (error != 0) 1438 goto done_free; 1439 else { 1440 if (flags & JAIL_UPDATE) { 1441 error = EINVAL; 1442 vfs_opterror(opts, 1443 "osrelease cannot be changed after creation"); 1444 goto done_errmsg; 1445 } 1446 if (len == 0 || osrelstr[len - 1] != '\0') { 1447 error = EINVAL; 1448 goto done_free; 1449 } 1450 if (len >= OSRELEASELEN) { 1451 error = ENAMETOOLONG; 1452 vfs_opterror(opts, 1453 "osrelease string must be 1-%d bytes long", 1454 OSRELEASELEN - 1); 1455 goto done_errmsg; 1456 } 1457 } 1458 1459 error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt)); 1460 if (error == ENOENT) 1461 osreldt = 0; 1462 else if (error != 0) 1463 goto done_free; 1464 else { 1465 if (flags & JAIL_UPDATE) { 1466 error = EINVAL; 1467 vfs_opterror(opts, 1468 "osreldate cannot be changed after creation"); 1469 goto done_errmsg; 1470 } 1471 if (osreldt == 0) { 1472 error = EINVAL; 1473 vfs_opterror(opts, "osreldate cannot be 0"); 1474 goto done_errmsg; 1475 } 1476 } 1477 1478 root = NULL; 1479 error = vfs_getopt(opts, "path", (void **)&path, &len); 1480 if (error == ENOENT) 1481 path = NULL; 1482 else if (error != 0) 1483 goto done_free; 1484 else { 1485 if (flags & JAIL_UPDATE) { 1486 error = EINVAL; 1487 vfs_opterror(opts, 1488 "path cannot be changed after creation"); 1489 goto done_errmsg; 1490 } 1491 if (len == 0 || path[len - 1] != '\0') { 1492 error = EINVAL; 1493 goto done_free; 1494 } 1495 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path); 1496 error = namei(&nd); 1497 if (error) 1498 goto done_free; 1499 root = nd.ni_vp; 1500 NDFREE_PNBUF(&nd); 1501 g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 1502 strlcpy(g_path, path, MAXPATHLEN); 1503 error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN); 1504 if (error == 0) { 1505 path = g_path; 1506 } else { 1507 /* exit on other errors */ 1508 goto done_free; 1509 } 1510 if (root->v_type != VDIR) { 1511 error = ENOTDIR; 1512 vput(root); 1513 goto done_free; 1514 } 1515 VOP_UNLOCK(root); 1516 } 1517 1518 /* 1519 * Find the specified jail, or at least its parent. 1520 * This abuses the file error codes ENOENT and EEXIST. 1521 */ 1522 pr = NULL; 1523 inspr = NULL; 1524 deadpr = NULL; 1525 maybe_changed = false; 1526 if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { 1527 namelc = strrchr(name, '.'); 1528 jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); 1529 if (*p != '\0') 1530 jid = 0; 1531 } 1532 sx_xlock(&allprison_lock); 1533 drflags = PD_LIST_XLOCKED; 1534 ppr = mypr; 1535 if (!prison_isalive(ppr)) { 1536 /* This jail is dying. This process will surely follow. */ 1537 error = EAGAIN; 1538 goto done_deref; 1539 } 1540 if (flags & JAIL_USE_DESC) { 1541 /* Get the jail from its descriptor. */ 1542 error = jaildesc_find(td, jfd_in, &pr, &jdcred); 1543 if (error) { 1544 vfs_opterror(opts, error == ENOENT ? 1545 "descriptor to dead jail" : 1546 "not a jail descriptor"); 1547 goto done_deref; 1548 } 1549 drflags |= PD_DEREF; 1550 error = priv_check_cred(jdcred, PRIV_JAIL_SET); 1551 if (error == 0 && (flags & JAIL_ATTACH)) 1552 error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH); 1553 crfree(jdcred); 1554 if (error) 1555 goto done_deref; 1556 mtx_lock(&pr->pr_mtx); 1557 drflags |= PD_LOCKED; 1558 if (cuflags == JAIL_CREATE) { 1559 error = EEXIST; 1560 vfs_opterror(opts, "jail %d already exists", 1561 pr->pr_id); 1562 goto done_deref; 1563 } 1564 if (!prison_isalive(pr)) { 1565 /* While a jid can be resurrected, the prison 1566 * itself cannot. 1567 */ 1568 error = ENOENT; 1569 vfs_opterror(opts, "jail %d is dying", pr->pr_id); 1570 goto done_deref; 1571 } 1572 if (jid != 0 && jid != pr->pr_id) { 1573 error = EINVAL; 1574 vfs_opterror(opts, "cannot change jid"); 1575 goto done_deref; 1576 } 1577 jid = pr->pr_id; 1578 } else if (jid != 0) { 1579 if (jid < 0) { 1580 error = EINVAL; 1581 vfs_opterror(opts, "negative jid"); 1582 goto done_deref; 1583 } 1584 /* 1585 * See if a requested jid already exists. Keep track of 1586 * where it can be inserted later. 1587 */ 1588 TAILQ_FOREACH(inspr, &allprison, pr_list) { 1589 if (inspr->pr_id < jid) 1590 continue; 1591 if (inspr->pr_id > jid) 1592 break; 1593 if (prison_isalive(inspr)) { 1594 pr = inspr; 1595 mtx_lock(&pr->pr_mtx); 1596 drflags |= PD_LOCKED; 1597 } else { 1598 /* Note a dying jail to handle later. */ 1599 deadpr = inspr; 1600 } 1601 inspr = NULL; 1602 break; 1603 } 1604 if (cuflags == JAIL_CREATE && pr != NULL) { 1605 /* 1606 * Even creators that cannot see the jail will 1607 * get EEXIST. 1608 */ 1609 error = EEXIST; 1610 vfs_opterror(opts, "jail %d already exists", jid); 1611 goto done_deref; 1612 } 1613 if ((pr == NULL) 1614 ? cuflags == JAIL_UPDATE 1615 : !prison_ischild(mypr, pr)) { 1616 /* 1617 * Updaters get ENOENT for nonexistent jails, 1618 * or for jails they cannot see. The latter 1619 * case is true even for CREATE | UPDATE, 1620 * which normally cannot give this error. 1621 */ 1622 error = ENOENT; 1623 vfs_opterror(opts, "jail %d not found", jid); 1624 goto done_deref; 1625 } 1626 } 1627 /* 1628 * If the caller provided a name, look for a jail by that name. 1629 * This has different semantics for creates and updates keyed by jid 1630 * (where the name must not already exist in a different jail), 1631 * and updates keyed by the name itself (where the name must exist 1632 * because that is the jail being updated). 1633 */ 1634 namelc = NULL; 1635 if (name != NULL) { 1636 namelc = strrchr(name, '.'); 1637 if (namelc == NULL) 1638 namelc = name; 1639 else { 1640 /* 1641 * This is a hierarchical name. Split it into the 1642 * parent and child names, and make sure the parent 1643 * exists or matches an already found jail. 1644 */ 1645 if (pr != NULL) { 1646 if (strncmp(name, ppr->pr_name, namelc - name) 1647 || ppr->pr_name[namelc - name] != '\0') { 1648 error = EINVAL; 1649 vfs_opterror(opts, 1650 "cannot change jail's parent"); 1651 goto done_deref; 1652 } 1653 } else { 1654 *namelc = '\0'; 1655 ppr = prison_find_name(mypr, name); 1656 if (ppr == NULL) { 1657 error = ENOENT; 1658 vfs_opterror(opts, 1659 "jail \"%s\" not found", name); 1660 goto done_deref; 1661 } 1662 mtx_unlock(&ppr->pr_mtx); 1663 if (!prison_isalive(ppr)) { 1664 error = ENOENT; 1665 vfs_opterror(opts, 1666 "jail \"%s\" is dying", name); 1667 goto done_deref; 1668 } 1669 *namelc = '.'; 1670 } 1671 namelc++; 1672 } 1673 if (namelc[0] != '\0') { 1674 pnamelen = 1675 (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; 1676 FOREACH_PRISON_CHILD(ppr, tpr) { 1677 if (tpr == pr || !prison_isalive(tpr) || 1678 strcmp(tpr->pr_name + pnamelen, namelc)) 1679 continue; 1680 if (cuflags == JAIL_CREATE || pr != NULL) { 1681 /* 1682 * Create, or update(jid): name must 1683 * not exist in an active sibling jail. 1684 */ 1685 error = EEXIST; 1686 vfs_opterror(opts, 1687 "jail \"%s\" already exists", name); 1688 goto done_deref; 1689 } 1690 /* Use this jail for updates. */ 1691 pr = tpr; 1692 mtx_lock(&pr->pr_mtx); 1693 drflags |= PD_LOCKED; 1694 break; 1695 } 1696 /* 1697 * Update: name must exist if no jid is specified. 1698 * As with the jid case, the jail must be currently 1699 * visible, or else even CREATE | UPDATE will get 1700 * an error. 1701 */ 1702 if ((pr == NULL) 1703 ? cuflags == JAIL_UPDATE 1704 : !prison_isalive(pr)) { 1705 error = ENOENT; 1706 vfs_opterror(opts, "jail \"%s\" not found", 1707 name); 1708 goto done_deref; 1709 } 1710 } 1711 } 1712 /* Update: must provide a desc, jid, or name. */ 1713 else if (cuflags == JAIL_UPDATE && pr == NULL) { 1714 error = ENOENT; 1715 vfs_opterror(opts, "update specified no jail"); 1716 goto done_deref; 1717 } 1718 1719 /* If there's no prison to update, create a new one and link it in. */ 1720 created = pr == NULL; 1721 if (created) { 1722 #ifdef MAC 1723 error = mac_prison_check_create(td->td_ucred, opts, flags); 1724 if (error != 0) 1725 goto done_deref; 1726 #endif 1727 for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent) 1728 if (tpr->pr_childcount >= tpr->pr_childmax) { 1729 error = EPERM; 1730 vfs_opterror(opts, "prison limit exceeded"); 1731 goto done_deref; 1732 } 1733 1734 if (deadpr != NULL) { 1735 /* 1736 * The prison being created has the same ID as a dying 1737 * one. Handle this by giving the dying jail a new ID. 1738 * This may cause some confusion to user space, but 1739 * only to those listing dying jails. 1740 */ 1741 deadid = get_next_deadid(&dinspr); 1742 if (deadid == 0) { 1743 error = EAGAIN; 1744 vfs_opterror(opts, "no available jail IDs"); 1745 goto done_deref; 1746 } 1747 mtx_lock(&deadpr->pr_mtx); 1748 deadpr->pr_id = deadid; 1749 mtx_unlock(&deadpr->pr_mtx); 1750 if (dinspr == deadpr) 1751 inspr = deadpr; 1752 else { 1753 inspr = TAILQ_NEXT(deadpr, pr_list); 1754 TAILQ_REMOVE(&allprison, deadpr, pr_list); 1755 if (dinspr != NULL) 1756 TAILQ_INSERT_AFTER(&allprison, dinspr, 1757 deadpr, pr_list); 1758 else 1759 TAILQ_INSERT_HEAD(&allprison, deadpr, 1760 pr_list); 1761 } 1762 } 1763 if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) { 1764 error = EAGAIN; 1765 vfs_opterror(opts, "no available jail IDs"); 1766 goto done_deref; 1767 } 1768 1769 pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); 1770 pr->pr_state = PRISON_STATE_INVALID; 1771 refcount_init(&pr->pr_ref, 1); 1772 refcount_init(&pr->pr_uref, 0); 1773 drflags |= PD_DEREF; 1774 LIST_INIT(&pr->pr_children); 1775 mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); 1776 TASK_INIT(&pr->pr_task, 0, prison_complete, pr); 1777 1778 pr->pr_id = jid; 1779 if (inspr != NULL) 1780 TAILQ_INSERT_BEFORE(inspr, pr, pr_list); 1781 else 1782 TAILQ_INSERT_TAIL(&allprison, pr, pr_list); 1783 1784 pr->pr_parent = ppr; 1785 prison_hold(ppr); 1786 prison_proc_hold(ppr); 1787 LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); 1788 for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) 1789 tpr->pr_childcount++; 1790 pr->pr_klist = knlist_alloc(&pr->pr_mtx); 1791 1792 /* Set some default values, and inherit some from the parent. */ 1793 if (namelc == NULL) 1794 namelc = ""; 1795 if (path == NULL) { 1796 path = "/"; 1797 root = mypr->pr_root; 1798 vref(root); 1799 } 1800 strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN); 1801 pr->pr_flags |= PR_HOST; 1802 #if defined(INET) || defined(INET6) 1803 #ifdef VIMAGE 1804 if (!(pr_flags & PR_VNET)) 1805 #endif 1806 { 1807 #ifdef INET 1808 if (!(ch_flags & PR_IP4_USER)) 1809 pr->pr_flags |= PR_IP4 | PR_IP4_USER; 1810 else if (!(pr_flags & PR_IP4_USER)) { 1811 pr->pr_flags |= ppr->pr_flags & PR_IP4; 1812 prison_ip_dup(ppr, pr, PR_INET); 1813 } 1814 #endif 1815 #ifdef INET6 1816 if (!(ch_flags & PR_IP6_USER)) 1817 pr->pr_flags |= PR_IP6 | PR_IP6_USER; 1818 else if (!(pr_flags & PR_IP6_USER)) { 1819 pr->pr_flags |= ppr->pr_flags & PR_IP6; 1820 prison_ip_dup(ppr, pr, PR_INET6); 1821 } 1822 #endif 1823 } 1824 #endif 1825 /* Source address selection is always on by default. */ 1826 pr->pr_flags |= _PR_IP_SADDRSEL; 1827 1828 pr->pr_securelevel = ppr->pr_securelevel; 1829 pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow; 1830 pr->pr_enforce_statfs = jail_default_enforce_statfs; 1831 pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum; 1832 1833 pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate; 1834 if (osrelstr == NULL) 1835 strlcpy(pr->pr_osrelease, ppr->pr_osrelease, 1836 sizeof(pr->pr_osrelease)); 1837 else 1838 strlcpy(pr->pr_osrelease, osrelstr, 1839 sizeof(pr->pr_osrelease)); 1840 1841 #ifdef VIMAGE 1842 /* 1843 * Allocate a new vnet if specified. 1844 * 1845 * Set PR_VNET now if so, so that the vnet is disposed of 1846 * properly when the jail is destroyed. 1847 */ 1848 if (pr_flags & PR_VNET) { 1849 pr->pr_flags |= PR_VNET; 1850 pr->pr_vnet = vnet_alloc(); 1851 } else { 1852 pr->pr_vnet = ppr->pr_vnet; 1853 } 1854 #endif 1855 /* 1856 * Allocate a dedicated cpuset for each jail. 1857 * Unlike other initial settings, this may return an error. 1858 */ 1859 error = cpuset_create_root(ppr, &pr->pr_cpuset); 1860 if (error) 1861 goto done_deref; 1862 1863 #ifdef MAC 1864 error = mac_prison_init(pr, M_WAITOK); 1865 MPASS(error == 0); 1866 1867 mtx_assert(&pr->pr_mtx, MA_OWNED); 1868 #else 1869 mtx_lock(&pr->pr_mtx); 1870 #endif 1871 drflags |= PD_LOCKED; 1872 } else { 1873 /* 1874 * Grab a reference for existing prisons, to ensure they 1875 * continue to exist for the duration of the call. 1876 */ 1877 if (!(drflags & PD_DEREF)) { 1878 prison_hold(pr); 1879 drflags |= PD_DEREF; 1880 } 1881 #ifdef MAC 1882 error = mac_prison_check_set(td->td_ucred, pr, opts, flags); 1883 if (error != 0) 1884 goto done_deref; 1885 #endif 1886 #if defined(VIMAGE) && (defined(INET) || defined(INET6)) 1887 if ((pr->pr_flags & PR_VNET) && 1888 (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { 1889 error = EINVAL; 1890 vfs_opterror(opts, 1891 "vnet jails cannot have IP address restrictions"); 1892 goto done_deref; 1893 } 1894 #endif 1895 #ifdef INET 1896 if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { 1897 error = EINVAL; 1898 vfs_opterror(opts, 1899 "ip4 cannot be changed after creation"); 1900 goto done_deref; 1901 } 1902 #endif 1903 #ifdef INET6 1904 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { 1905 error = EINVAL; 1906 vfs_opterror(opts, 1907 "ip6 cannot be changed after creation"); 1908 goto done_deref; 1909 } 1910 #endif 1911 } 1912 1913 /* Do final error checking before setting anything. */ 1914 if (gotslevel) { 1915 if (slevel < ppr->pr_securelevel) { 1916 error = EPERM; 1917 goto done_deref; 1918 } 1919 } 1920 if (gotchildmax) { 1921 if (childmax >= ppr->pr_childmax) { 1922 error = EPERM; 1923 goto done_deref; 1924 } 1925 } 1926 if (gotenforce) { 1927 if (enforce < ppr->pr_enforce_statfs) { 1928 error = EPERM; 1929 goto done_deref; 1930 } 1931 } 1932 if (gotrsnum) { 1933 /* 1934 * devfs_rsnum is a uint16_t 1935 */ 1936 if (rsnum < 0 || rsnum > 65535) { 1937 error = EINVAL; 1938 goto done_deref; 1939 } 1940 /* 1941 * Nested jails always inherit parent's devfs ruleset 1942 */ 1943 if (jailed(td->td_ucred)) { 1944 if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) { 1945 error = EPERM; 1946 goto done_deref; 1947 } else 1948 rsnum = ppr->pr_devfs_rsnum; 1949 } 1950 } 1951 #ifdef INET 1952 if (ip4s > 0) { 1953 if ((ppr->pr_flags & PR_IP4) && 1954 !prison_ip_parent_match(ppr->pr_addrs[PR_INET], ip4, 1955 PR_INET)) { 1956 error = EPERM; 1957 goto done_deref; 1958 } 1959 if (!prison_ip_conflict_check(ppr, pr, ip4, PR_INET)) { 1960 error = EADDRINUSE; 1961 vfs_opterror(opts, "IPv4 addresses clash"); 1962 goto done_deref; 1963 } 1964 } 1965 #endif 1966 #ifdef INET6 1967 if (ip6s > 0) { 1968 if ((ppr->pr_flags & PR_IP6) && 1969 !prison_ip_parent_match(ppr->pr_addrs[PR_INET6], ip6, 1970 PR_INET6)) { 1971 error = EPERM; 1972 goto done_deref; 1973 } 1974 if (!prison_ip_conflict_check(ppr, pr, ip6, PR_INET6)) { 1975 error = EADDRINUSE; 1976 vfs_opterror(opts, "IPv6 addresses clash"); 1977 goto done_deref; 1978 } 1979 } 1980 #endif 1981 onamelen = namelen = 0; 1982 if (namelc != NULL) { 1983 /* Give a default name of the jid. Also allow the name to be 1984 * explicitly the jid - but not any other number, and only in 1985 * normal form (no leading zero/etc). 1986 */ 1987 if (namelc[0] == '\0') 1988 snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid); 1989 else if ((strtoul(namelc, &p, 10) != jid || 1990 namelc[0] < '1' || namelc[0] > '9') && *p == '\0') { 1991 error = EINVAL; 1992 vfs_opterror(opts, 1993 "name cannot be numeric (unless it is the jid)"); 1994 goto done_deref; 1995 } 1996 /* 1997 * Make sure the name isn't too long for the prison or its 1998 * children. 1999 */ 2000 pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; 2001 onamelen = strlen(pr->pr_name + pnamelen); 2002 namelen = strlen(namelc); 2003 if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) { 2004 error = ENAMETOOLONG; 2005 goto done_deref; 2006 } 2007 FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { 2008 if (strlen(tpr->pr_name) + (namelen - onamelen) >= 2009 sizeof(pr->pr_name)) { 2010 error = ENAMETOOLONG; 2011 goto done_deref; 2012 } 2013 } 2014 } 2015 pr_allow_diff = pr_allow & ~ppr->pr_allow; 2016 if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) { 2017 error = EPERM; 2018 goto done_deref; 2019 } 2020 2021 /* 2022 * Let modules check their parameters. This requires unlocking and 2023 * then re-locking the prison, but this is still a valid state as long 2024 * as allprison_lock remains xlocked. 2025 */ 2026 mtx_unlock(&pr->pr_mtx); 2027 drflags &= ~PD_LOCKED; 2028 error = osd_jail_call(pr, PR_METHOD_CHECK, opts); 2029 if (error != 0) 2030 goto done_deref; 2031 mtx_lock(&pr->pr_mtx); 2032 drflags |= PD_LOCKED; 2033 2034 /* At this point, all valid parameters should have been noted. */ 2035 TAILQ_FOREACH(opt, opts, link) { 2036 if (!opt->seen && strcmp(opt->name, "errmsg")) { 2037 error = EINVAL; 2038 vfs_opterror(opts, "unknown parameter: %s", opt->name); 2039 goto done_deref; 2040 } 2041 } 2042 maybe_changed = true; 2043 2044 /* Set the parameters of the prison. */ 2045 #ifdef INET 2046 redo_ip4 = false; 2047 if (pr_flags & PR_IP4_USER) { 2048 pr->pr_flags |= PR_IP4; 2049 prison_ip_set(pr, PR_INET, ip4); 2050 ip4 = NULL; 2051 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 2052 #ifdef VIMAGE 2053 if (tpr->pr_flags & PR_VNET) { 2054 descend = 0; 2055 continue; 2056 } 2057 #endif 2058 if (!prison_ip_restrict(tpr, PR_INET, NULL)) { 2059 redo_ip4 = true; 2060 descend = 0; 2061 } 2062 } 2063 } 2064 #endif 2065 #ifdef INET6 2066 redo_ip6 = false; 2067 if (pr_flags & PR_IP6_USER) { 2068 pr->pr_flags |= PR_IP6; 2069 prison_ip_set(pr, PR_INET6, ip6); 2070 ip6 = NULL; 2071 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 2072 #ifdef VIMAGE 2073 if (tpr->pr_flags & PR_VNET) { 2074 descend = 0; 2075 continue; 2076 } 2077 #endif 2078 if (!prison_ip_restrict(tpr, PR_INET6, NULL)) { 2079 redo_ip6 = true; 2080 descend = 0; 2081 } 2082 } 2083 } 2084 #endif 2085 if (gotslevel) { 2086 pr->pr_securelevel = slevel; 2087 /* Set all child jails to be at least this level. */ 2088 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) 2089 if (tpr->pr_securelevel < slevel) 2090 tpr->pr_securelevel = slevel; 2091 } 2092 if (gotchildmax) { 2093 pr->pr_childmax = childmax; 2094 /* Set all child jails to under this limit. */ 2095 FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level) 2096 if (tpr->pr_childmax > childmax - level) 2097 tpr->pr_childmax = childmax > level 2098 ? childmax - level : 0; 2099 } 2100 if (gotenforce) { 2101 pr->pr_enforce_statfs = enforce; 2102 /* Pass this restriction on to the children. */ 2103 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) 2104 if (tpr->pr_enforce_statfs < enforce) 2105 tpr->pr_enforce_statfs = enforce; 2106 } 2107 if (gotrsnum) { 2108 pr->pr_devfs_rsnum = rsnum; 2109 /* Pass this restriction on to the children. */ 2110 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) 2111 tpr->pr_devfs_rsnum = rsnum; 2112 } 2113 if (namelc != NULL) { 2114 if (ppr == &prison0) 2115 strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name)); 2116 else 2117 snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", 2118 ppr->pr_name, namelc); 2119 /* Change this component of child names. */ 2120 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 2121 bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, 2122 strlen(tpr->pr_name + onamelen) + 1); 2123 bcopy(pr->pr_name, tpr->pr_name, namelen); 2124 } 2125 } 2126 if (path != NULL) { 2127 /* Try to keep a real-rooted full pathname. */ 2128 strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); 2129 pr->pr_root = root; 2130 root = NULL; 2131 } 2132 if (PR_HOST & ch_flags & ~pr_flags) { 2133 if (pr->pr_flags & PR_HOST) { 2134 /* 2135 * Copy the parent's host info. As with pr_ip4 above, 2136 * the lack of a lock on the parent is not a problem; 2137 * it is always set with allprison_lock at least 2138 * shared, and is held exclusively here. 2139 */ 2140 strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname, 2141 sizeof(pr->pr_hostname)); 2142 strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname, 2143 sizeof(pr->pr_domainname)); 2144 strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid, 2145 sizeof(pr->pr_hostuuid)); 2146 pr->pr_hostid = pr->pr_parent->pr_hostid; 2147 } 2148 } else if (host != NULL || domain != NULL || uuid != NULL || gothid) { 2149 /* Set this prison, and any descendants without PR_HOST. */ 2150 if (host != NULL) 2151 strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname)); 2152 if (domain != NULL) 2153 strlcpy(pr->pr_domainname, domain, 2154 sizeof(pr->pr_domainname)); 2155 if (uuid != NULL) 2156 strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid)); 2157 if (gothid) 2158 pr->pr_hostid = hid; 2159 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 2160 if (tpr->pr_flags & PR_HOST) 2161 descend = 0; 2162 else { 2163 if (host != NULL) 2164 strlcpy(tpr->pr_hostname, 2165 pr->pr_hostname, 2166 sizeof(tpr->pr_hostname)); 2167 if (domain != NULL) 2168 strlcpy(tpr->pr_domainname, 2169 pr->pr_domainname, 2170 sizeof(tpr->pr_domainname)); 2171 if (uuid != NULL) 2172 strlcpy(tpr->pr_hostuuid, 2173 pr->pr_hostuuid, 2174 sizeof(tpr->pr_hostuuid)); 2175 if (gothid) 2176 tpr->pr_hostid = hid; 2177 } 2178 } 2179 } 2180 pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow; 2181 if ((tallow = ch_allow & ~pr_allow)) 2182 prison_set_allow_locked(pr, tallow, 0); 2183 /* 2184 * Persistent prisons get an extra reference, and prisons losing their 2185 * persist flag lose that reference. 2186 */ 2187 if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) { 2188 if (pr_flags & PR_PERSIST) { 2189 prison_hold(pr); 2190 /* 2191 * This may be a new prison's first user reference, 2192 * but wait to call it alive until after OSD calls 2193 * have had a chance to run (and perhaps to fail). 2194 */ 2195 refcount_acquire(&pr->pr_uref); 2196 } else { 2197 drflags |= PD_DEUREF; 2198 prison_free_not_last(pr); 2199 } 2200 } 2201 pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; 2202 2203 #ifdef MAC 2204 /* Apply any request MAC label before we let modules do their work. */ 2205 if (gotmaclabel) { 2206 error = mac_set_prison_core(td, pr, mac_set_prison_data); 2207 if (error) { 2208 vfs_opterror(opts, "mac relabel denied"); 2209 goto done_deref; 2210 } 2211 } 2212 #endif 2213 mtx_unlock(&pr->pr_mtx); 2214 drflags &= ~PD_LOCKED; 2215 /* 2216 * Any errors past this point will need to de-persist newly created 2217 * prisons, as well as call remove methods. 2218 */ 2219 if (created) 2220 drflags |= PD_KILL; 2221 2222 #ifdef RACCT 2223 if (racct_enable && created) 2224 prison_racct_attach(pr); 2225 #endif 2226 2227 /* Locks may have prevented a complete restriction of child IP 2228 * addresses. If so, allocate some more memory and try again. 2229 */ 2230 #ifdef INET 2231 while (redo_ip4) { 2232 ip4s = pr->pr_addrs[PR_INET]->ips; 2233 MPASS(ip4 == NULL); 2234 ip4 = prison_ip_alloc(PR_INET, ip4s, M_WAITOK); 2235 mtx_lock(&pr->pr_mtx); 2236 redo_ip4 = false; 2237 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 2238 #ifdef VIMAGE 2239 if (tpr->pr_flags & PR_VNET) { 2240 descend = 0; 2241 continue; 2242 } 2243 #endif 2244 if (!prison_ip_restrict(tpr, PR_INET, &ip4)) 2245 redo_ip4 = true; 2246 } 2247 mtx_unlock(&pr->pr_mtx); 2248 } 2249 #endif 2250 #ifdef INET6 2251 while (redo_ip6) { 2252 ip6s = pr->pr_addrs[PR_INET6]->ips; 2253 MPASS(ip6 == NULL); 2254 ip6 = prison_ip_alloc(PR_INET6, ip6s, M_WAITOK); 2255 mtx_lock(&pr->pr_mtx); 2256 redo_ip6 = false; 2257 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 2258 #ifdef VIMAGE 2259 if (tpr->pr_flags & PR_VNET) { 2260 descend = 0; 2261 continue; 2262 } 2263 #endif 2264 if (!prison_ip_restrict(tpr, PR_INET6, &ip6)) 2265 redo_ip6 = true; 2266 } 2267 mtx_unlock(&pr->pr_mtx); 2268 } 2269 #endif 2270 2271 /* Let the modules do their work. */ 2272 if (created) { 2273 error = osd_jail_call(pr, PR_METHOD_CREATE, opts); 2274 if (error) 2275 goto done_deref; 2276 } 2277 error = osd_jail_call(pr, PR_METHOD_SET, opts); 2278 if (error) 2279 goto done_deref; 2280 2281 /* 2282 * A new prison is now ready to be seen; either it has gained a user 2283 * reference via persistence, or is about to gain one via attachment. 2284 */ 2285 if (created) { 2286 sx_assert(&allprison_lock, SX_XLOCKED); 2287 prison_knote(ppr, NOTE_JAIL_CHILD | pr->pr_id); 2288 #ifdef MAC 2289 /* 2290 * Note that mac_prison_created() assumes that it's called in a 2291 * sleepable context. 2292 */ 2293 mac_prison_created(td->td_ucred, pr); 2294 #endif 2295 mtx_lock(&pr->pr_mtx); 2296 drflags |= PD_LOCKED; 2297 pr->pr_state = PRISON_STATE_ALIVE; 2298 } 2299 2300 /* Attach this process to the prison if requested. */ 2301 if (flags & JAIL_ATTACH) { 2302 #ifdef MAC 2303 error = mac_prison_check_attach(td->td_ucred, pr); 2304 if (error != 0) { 2305 vfs_opterror(opts, 2306 "attach operation denied by MAC policy"); 2307 goto done_deref; 2308 } 2309 #endif 2310 error = do_jail_attach(td, pr, 2311 prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS)); 2312 drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED); 2313 if (error) { 2314 vfs_opterror(opts, "attach failed"); 2315 goto done_deref; 2316 } 2317 } 2318 2319 #ifdef RACCT 2320 if (racct_enable && !created) { 2321 if (drflags & PD_LOCKED) { 2322 mtx_unlock(&pr->pr_mtx); 2323 drflags &= ~PD_LOCKED; 2324 } 2325 if (drflags & PD_LIST_XLOCKED) { 2326 sx_xunlock(&allprison_lock); 2327 drflags &= ~PD_LIST_XLOCKED; 2328 } 2329 prison_racct_modify(pr); 2330 } 2331 #endif 2332 2333 if (created && pr != &prison0 && (pr->pr_allow & PR_ALLOW_NFSD) != 0 && 2334 (pr->pr_root->v_vflag & VV_ROOT) == 0) 2335 printf("Warning jail jid=%d: mountd/nfsd requires a separate" 2336 " file system\n", pr->pr_id); 2337 2338 /* 2339 * Now that the prison is fully created without error, set the 2340 * jail descriptor if one was requested. This is the only 2341 * parameter that is returned to the caller (except the error 2342 * message). 2343 */ 2344 if (jfd_out >= 0) { 2345 if (!(drflags & PD_LOCKED)) { 2346 mtx_lock(&pr->pr_mtx); 2347 drflags |= PD_LOCKED; 2348 } 2349 jfd_pos = 2 * vfs_getopt_pos(opts, "desc") + 1; 2350 if (optuio->uio_segflg == UIO_SYSSPACE) 2351 *(int*)optuio->uio_iov[jfd_pos].iov_base = jfd_out; 2352 else 2353 (void)copyout(&jfd_out, 2354 optuio->uio_iov[jfd_pos].iov_base, sizeof(jfd_out)); 2355 jaildesc_set_prison(jfp_out, pr); 2356 } 2357 2358 drflags &= ~PD_KILL; 2359 td->td_retval[0] = pr->pr_id; 2360 2361 done_deref: 2362 /* 2363 * Report changes to kevent. This can happen even if the 2364 * system call fails, as changes might have been made before 2365 * the failure. 2366 */ 2367 if (maybe_changed && !created) 2368 prison_knote(pr, NOTE_JAIL_SET); 2369 /* Release any temporary prison holds and/or locks. */ 2370 if (pr != NULL) 2371 prison_deref(pr, drflags); 2372 else if (drflags & PD_LIST_SLOCKED) 2373 sx_sunlock(&allprison_lock); 2374 else if (drflags & PD_LIST_XLOCKED) 2375 sx_xunlock(&allprison_lock); 2376 if (root != NULL) 2377 vrele(root); 2378 done_errmsg: 2379 if (error) { 2380 /* Write the error message back to userspace. */ 2381 if (vfs_getopt(opts, "errmsg", (void **)&errmsg, 2382 &errmsg_len) == 0 && errmsg_len > 0) { 2383 errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1; 2384 if (optuio->uio_segflg == UIO_SYSSPACE) 2385 bcopy(errmsg, 2386 optuio->uio_iov[errmsg_pos].iov_base, 2387 errmsg_len); 2388 else 2389 (void)copyout(errmsg, 2390 optuio->uio_iov[errmsg_pos].iov_base, 2391 errmsg_len); 2392 } 2393 } 2394 done_free: 2395 /* Clean up other resources. */ 2396 #ifdef INET 2397 prison_ip_free(ip4); 2398 #endif 2399 #ifdef INET6 2400 prison_ip_free(ip6); 2401 #endif 2402 #ifdef MAC 2403 if (mac_set_prison_data != NULL) 2404 mac_set_prison_finish(td, error == 0, mac_set_prison_data); 2405 #endif 2406 if (jfp_out != NULL) 2407 fdrop(jfp_out, td); 2408 if (error && jfd_out >= 0) 2409 (void)kern_close(td, jfd_out); 2410 if (g_path != NULL) 2411 free(g_path, M_TEMP); 2412 if (opts != NULL) 2413 vfs_freeopts(opts); 2414 prison_free(mypr); 2415 return (error); 2416 } 2417 2418 /* 2419 * Find the next available prison ID. Return the ID on success, or zero 2420 * on failure. Also set a pointer to the allprison list entry the prison 2421 * should be inserted before. 2422 */ 2423 static int 2424 get_next_prid(struct prison **insprp) 2425 { 2426 struct prison *inspr; 2427 int jid, maxid; 2428 2429 jid = lastprid % JAIL_MAX + 1; 2430 if (TAILQ_EMPTY(&allprison) || 2431 TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) { 2432 /* 2433 * A common case is for all jails to be implicitly numbered, 2434 * which means they'll go on the end of the list, at least 2435 * for the first JAIL_MAX times. 2436 */ 2437 inspr = NULL; 2438 } else { 2439 /* 2440 * Take two passes through the allprison list: first starting 2441 * with the proposed jid, then ending with it. 2442 */ 2443 for (maxid = JAIL_MAX; maxid != 0; ) { 2444 TAILQ_FOREACH(inspr, &allprison, pr_list) { 2445 if (inspr->pr_id < jid) 2446 continue; 2447 if (inspr->pr_id > jid) { 2448 /* Found an opening. */ 2449 maxid = 0; 2450 break; 2451 } 2452 if (++jid > maxid) { 2453 if (lastprid == maxid || lastprid == 0) 2454 { 2455 /* 2456 * The entire legal range 2457 * has been traversed 2458 */ 2459 return 0; 2460 } 2461 /* Try again from the start. */ 2462 jid = 1; 2463 maxid = lastprid; 2464 break; 2465 } 2466 } 2467 if (inspr == NULL) { 2468 /* Found room at the end of the list. */ 2469 break; 2470 } 2471 } 2472 } 2473 *insprp = inspr; 2474 lastprid = jid; 2475 return (jid); 2476 } 2477 2478 /* 2479 * Find the next available ID for a renumbered dead prison. This is the same 2480 * as get_next_prid, but counting backward from the end of the range. 2481 */ 2482 static int 2483 get_next_deadid(struct prison **dinsprp) 2484 { 2485 struct prison *dinspr; 2486 int deadid, minid; 2487 2488 deadid = lastdeadid ? lastdeadid - 1 : JAIL_MAX; 2489 /* 2490 * Take two reverse passes through the allprison list: first 2491 * starting with the proposed deadid, then ending with it. 2492 */ 2493 for (minid = 1; minid != 0; ) { 2494 TAILQ_FOREACH_REVERSE(dinspr, &allprison, prisonlist, pr_list) { 2495 if (dinspr->pr_id > deadid) 2496 continue; 2497 if (dinspr->pr_id < deadid) { 2498 /* Found an opening. */ 2499 minid = 0; 2500 break; 2501 } 2502 if (--deadid < minid) { 2503 if (lastdeadid == minid || lastdeadid == 0) 2504 { 2505 /* 2506 * The entire legal range 2507 * has been traversed 2508 */ 2509 return 0; 2510 } 2511 /* Try again from the end. */ 2512 deadid = JAIL_MAX; 2513 minid = lastdeadid; 2514 break; 2515 } 2516 } 2517 if (dinspr == NULL) { 2518 /* Found room at the beginning of the list. */ 2519 break; 2520 } 2521 } 2522 *dinsprp = dinspr; 2523 lastdeadid = deadid; 2524 return (deadid); 2525 } 2526 2527 /* 2528 * struct jail_get_args { 2529 * struct iovec *iovp; 2530 * unsigned int iovcnt; 2531 * int flags; 2532 * }; 2533 */ 2534 int 2535 sys_jail_get(struct thread *td, struct jail_get_args *uap) 2536 { 2537 struct uio *auio; 2538 int error; 2539 2540 /* Check that we have an even number of iovecs. */ 2541 if (uap->iovcnt & 1) 2542 return (EINVAL); 2543 2544 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 2545 if (error) 2546 return (error); 2547 error = kern_jail_get(td, auio, uap->flags); 2548 if (error == 0) 2549 error = copyout(auio->uio_iov, uap->iovp, 2550 uap->iovcnt * sizeof(struct iovec)); 2551 freeuio(auio); 2552 return (error); 2553 } 2554 2555 int 2556 kern_jail_get(struct thread *td, struct uio *optuio, int flags) 2557 { 2558 struct bool_flags *bf; 2559 struct file *jfp_out; 2560 struct jailsys_flags *jsf; 2561 struct prison *pr, *mypr; 2562 struct vfsopt *opt; 2563 struct vfsoptlist *opts; 2564 char *errmsg, *name; 2565 int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos; 2566 int jfd_in, jfd_out; 2567 unsigned f; 2568 2569 if (flags & ~JAIL_GET_MASK) 2570 return (EINVAL); 2571 if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) == 2572 (JAIL_USE_DESC | JAIL_AT_DESC)) 2573 return (EINVAL); 2574 2575 /* Get the parameter list. */ 2576 error = vfs_buildopts(optuio, &opts); 2577 if (error) 2578 return (error); 2579 errmsg_pos = vfs_getopt_pos(opts, "errmsg"); 2580 mypr = td->td_ucred->cr_prison; 2581 prison_hold(mypr); 2582 pr = NULL; 2583 jfp_out = NULL; 2584 jfd_out = -1; 2585 2586 /* 2587 * Find the prison specified by one of: desc, lastjid, jid, name. 2588 */ 2589 sx_slock(&allprison_lock); 2590 drflags = PD_LIST_SLOCKED; 2591 2592 error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in)); 2593 if (error == ENOENT) { 2594 if (flags & (JAIL_AT_DESC | JAIL_GET_DESC | JAIL_OWN_DESC)) { 2595 vfs_opterror(opts, "missing desc"); 2596 goto done; 2597 } 2598 } else if (error == 0) { 2599 if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | 2600 JAIL_OWN_DESC))) { 2601 error = EINVAL; 2602 vfs_opterror(opts, "unexpected desc"); 2603 goto done; 2604 } 2605 if (flags & JAIL_USE_DESC) { 2606 /* Get the jail from its descriptor. */ 2607 error = jaildesc_find(td, jfd_in, &pr, NULL); 2608 if (error) { 2609 vfs_opterror(opts, error == ENOENT ? 2610 "descriptor to dead jail" : 2611 "not a jail descriptor"); 2612 goto done; 2613 } 2614 drflags |= PD_DEREF; 2615 mtx_lock(&pr->pr_mtx); 2616 drflags |= PD_LOCKED; 2617 goto found_prison; 2618 } 2619 if (flags & JAIL_AT_DESC) { 2620 /* Look up jails based on the descriptor's prison. */ 2621 prison_free(mypr); 2622 error = jaildesc_find(td, jfd_in, &mypr, NULL); 2623 if (error != 0) { 2624 vfs_opterror(opts, error == ENOENT ? 2625 "descriptor to dead jail" : 2626 "not a jail descriptor"); 2627 goto done; 2628 } 2629 } 2630 if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) { 2631 /* Allocate a jail descriptor to return later. */ 2632 error = jaildesc_alloc(td, &jfp_out, &jfd_out, 2633 flags & JAIL_OWN_DESC); 2634 if (error) 2635 goto done; 2636 } 2637 } else 2638 goto done; 2639 2640 error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); 2641 if (error == 0) { 2642 TAILQ_FOREACH(pr, &allprison, pr_list) { 2643 if (pr->pr_id > jid && 2644 ((flags & JAIL_DYING) || prison_isalive(pr)) && 2645 prison_ischild(mypr, pr)) { 2646 mtx_lock(&pr->pr_mtx); 2647 drflags |= PD_LOCKED; 2648 #ifdef MAC 2649 /* 2650 * We special-case this one check because we 2651 * don't want MAC to break jail enumeration. We 2652 * need to just move on to the next accessible 2653 * and alive prison. 2654 */ 2655 error = mac_prison_check_get(td->td_ucred, pr, 2656 opts, flags); 2657 if (error != 0) { 2658 mtx_unlock(&pr->pr_mtx); 2659 drflags &= ~PD_LOCKED; 2660 continue; 2661 } 2662 2663 /* 2664 * Avoid potentially expensive trip back into 2665 * the MAC framework. 2666 */ 2667 goto found_prison_nomac_alive; 2668 #else 2669 goto found_prison; 2670 #endif 2671 } 2672 } 2673 error = ENOENT; 2674 vfs_opterror(opts, "no jail after %d", jid); 2675 goto done; 2676 } else if (error != ENOENT) 2677 goto done; 2678 2679 error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); 2680 if (error == 0) { 2681 if (jid != 0) { 2682 pr = prison_find_child(mypr, jid); 2683 if (pr != NULL) { 2684 drflags |= PD_LOCKED; 2685 goto found_prison; 2686 } 2687 error = ENOENT; 2688 vfs_opterror(opts, "jail %d not found", jid); 2689 goto done; 2690 } 2691 } else if (error != ENOENT) 2692 goto done; 2693 2694 error = vfs_getopt(opts, "name", (void **)&name, &len); 2695 if (error == 0) { 2696 if (len == 0 || name[len - 1] != '\0') { 2697 error = EINVAL; 2698 goto done; 2699 } 2700 pr = prison_find_name(mypr, name); 2701 if (pr != NULL) { 2702 drflags |= PD_LOCKED; 2703 goto found_prison; 2704 } 2705 error = ENOENT; 2706 vfs_opterror(opts, "jail \"%s\" not found", name); 2707 goto done; 2708 } else if (error != ENOENT) 2709 goto done; 2710 2711 vfs_opterror(opts, "no jail specified"); 2712 error = ENOENT; 2713 goto done; 2714 2715 found_prison: 2716 #ifdef MAC 2717 error = mac_prison_check_get(td->td_ucred, pr, opts, flags); 2718 if (error != 0) 2719 goto done; 2720 #endif 2721 if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { 2722 error = ENOENT; 2723 if (pr->pr_name[0] != '0' && isdigit(pr->pr_name[0])) { 2724 vfs_opterror(opts, "jail %d is dying", 2725 pr->pr_id); 2726 } else { 2727 vfs_opterror(opts, "jail \"%s\" (%d) is dying", 2728 pr->pr_name, pr->pr_id); 2729 } 2730 goto done; 2731 } 2732 #ifdef MAC 2733 found_prison_nomac_alive: 2734 #endif 2735 /* Get the parameters of the prison. */ 2736 if (!(drflags & PD_DEREF)) { 2737 prison_hold(pr); 2738 drflags |= PD_DEREF; 2739 } 2740 td->td_retval[0] = pr->pr_id; 2741 if (jfd_out >= 0) { 2742 error = vfs_setopt(opts, "desc", &jfd_out, sizeof(jfd_out)); 2743 if (error != 0 && error != ENOENT) 2744 goto done; 2745 jaildesc_set_prison(jfp_out, pr); 2746 } 2747 error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); 2748 if (error != 0 && error != ENOENT) 2749 goto done; 2750 i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id; 2751 error = vfs_setopt(opts, "parent", &i, sizeof(i)); 2752 if (error != 0 && error != ENOENT) 2753 goto done; 2754 error = vfs_setopts(opts, "name", prison_name(mypr, pr)); 2755 if (error != 0 && error != ENOENT) 2756 goto done; 2757 error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id, 2758 sizeof(pr->pr_cpuset->cs_id)); 2759 if (error != 0 && error != ENOENT) 2760 goto done; 2761 error = vfs_setopts(opts, "path", prison_path(mypr, pr)); 2762 if (error != 0 && error != ENOENT) 2763 goto done; 2764 #ifdef INET 2765 error = vfs_setopt_part(opts, "ip4.addr", pr->pr_addrs[PR_INET]->pr_ip, 2766 pr->pr_addrs[PR_INET] ? pr->pr_addrs[PR_INET]->ips * 2767 pr_families[PR_INET].size : 0 ); 2768 if (error != 0 && error != ENOENT) 2769 goto done; 2770 #endif 2771 #ifdef INET6 2772 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_addrs[PR_INET6]->pr_ip, 2773 pr->pr_addrs[PR_INET6] ? pr->pr_addrs[PR_INET6]->ips * 2774 pr_families[PR_INET6].size : 0 ); 2775 if (error != 0 && error != ENOENT) 2776 goto done; 2777 #endif 2778 error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel, 2779 sizeof(pr->pr_securelevel)); 2780 if (error != 0 && error != ENOENT) 2781 goto done; 2782 error = vfs_setopt(opts, "children.cur", &pr->pr_childcount, 2783 sizeof(pr->pr_childcount)); 2784 if (error != 0 && error != ENOENT) 2785 goto done; 2786 error = vfs_setopt(opts, "children.max", &pr->pr_childmax, 2787 sizeof(pr->pr_childmax)); 2788 if (error != 0 && error != ENOENT) 2789 goto done; 2790 error = vfs_setopts(opts, "host.hostname", pr->pr_hostname); 2791 if (error != 0 && error != ENOENT) 2792 goto done; 2793 error = vfs_setopts(opts, "host.domainname", pr->pr_domainname); 2794 if (error != 0 && error != ENOENT) 2795 goto done; 2796 error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid); 2797 if (error != 0 && error != ENOENT) 2798 goto done; 2799 #ifdef COMPAT_FREEBSD32 2800 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { 2801 uint32_t hid32 = pr->pr_hostid; 2802 2803 error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32)); 2804 } else 2805 #endif 2806 error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid, 2807 sizeof(pr->pr_hostid)); 2808 if (error != 0 && error != ENOENT) 2809 goto done; 2810 error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs, 2811 sizeof(pr->pr_enforce_statfs)); 2812 if (error != 0 && error != ENOENT) 2813 goto done; 2814 error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum, 2815 sizeof(pr->pr_devfs_rsnum)); 2816 if (error != 0 && error != ENOENT) 2817 goto done; 2818 for (bf = pr_flag_bool; 2819 bf < pr_flag_bool + nitems(pr_flag_bool); 2820 bf++) { 2821 i = (pr->pr_flags & bf->flag) ? 1 : 0; 2822 error = vfs_setopt(opts, bf->name, &i, sizeof(i)); 2823 if (error != 0 && error != ENOENT) 2824 goto done; 2825 i = !i; 2826 error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); 2827 if (error != 0 && error != ENOENT) 2828 goto done; 2829 } 2830 for (jsf = pr_flag_jailsys; 2831 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); 2832 jsf++) { 2833 f = pr->pr_flags & (jsf->disable | jsf->new); 2834 i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE 2835 : (f == jsf->new) ? JAIL_SYS_NEW 2836 : JAIL_SYS_INHERIT; 2837 error = vfs_setopt(opts, jsf->name, &i, sizeof(i)); 2838 if (error != 0 && error != ENOENT) 2839 goto done; 2840 } 2841 for (bf = pr_flag_allow; 2842 bf < pr_flag_allow + nitems(pr_flag_allow) && 2843 atomic_load_int(&bf->flag) != 0; 2844 bf++) { 2845 i = (pr->pr_allow & bf->flag) ? 1 : 0; 2846 error = vfs_setopt(opts, bf->name, &i, sizeof(i)); 2847 if (error != 0 && error != ENOENT) 2848 goto done; 2849 i = !i; 2850 error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); 2851 if (error != 0 && error != ENOENT) 2852 goto done; 2853 } 2854 i = !prison_isalive(pr); 2855 error = vfs_setopt(opts, "dying", &i, sizeof(i)); 2856 if (error != 0 && error != ENOENT) 2857 goto done; 2858 i = !i; 2859 error = vfs_setopt(opts, "nodying", &i, sizeof(i)); 2860 if (error != 0 && error != ENOENT) 2861 goto done; 2862 error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate, 2863 sizeof(pr->pr_osreldate)); 2864 if (error != 0 && error != ENOENT) 2865 goto done; 2866 error = vfs_setopts(opts, "osrelease", pr->pr_osrelease); 2867 if (error != 0 && error != ENOENT) 2868 goto done; 2869 2870 #ifdef MAC 2871 /* 2872 * We get the MAC label last because we'll let the MAC framework drop 2873 * pr_mtx to externalize the label. 2874 */ 2875 error = mac_get_prison(td, pr, opts); 2876 mtx_assert(&pr->pr_mtx, MA_NOTOWNED); 2877 drflags &= ~PD_LOCKED; 2878 if (error != 0 && error != ENOENT) 2879 goto done; 2880 #else 2881 mtx_unlock(&pr->pr_mtx); 2882 drflags &= ~PD_LOCKED; 2883 #endif 2884 2885 /* Get the module parameters. */ 2886 error = osd_jail_call(pr, PR_METHOD_GET, opts); 2887 if (error) 2888 goto done; 2889 prison_deref(pr, drflags); 2890 pr = NULL; 2891 drflags = 0; 2892 2893 /* By now, all parameters should have been noted. */ 2894 TAILQ_FOREACH(opt, opts, link) { 2895 if (!opt->seen && 2896 (strstr(opt->name, JAIL_META_PRIVATE ".") == opt->name || 2897 strstr(opt->name, JAIL_META_SHARED ".") == opt->name)) { 2898 /* Communicate back a missing key. */ 2899 free(opt->value, M_MOUNT); 2900 opt->value = NULL; 2901 opt->len = 0; 2902 continue; 2903 } 2904 if (!opt->seen && strcmp(opt->name, "errmsg")) { 2905 error = EINVAL; 2906 vfs_opterror(opts, "unknown parameter: %s", opt->name); 2907 goto done; 2908 } 2909 } 2910 2911 /* Write the fetched parameters back to userspace. */ 2912 error = 0; 2913 TAILQ_FOREACH(opt, opts, link) { 2914 if (opt->pos >= 0 && opt->pos != errmsg_pos) { 2915 pos = 2 * opt->pos + 1; 2916 optuio->uio_iov[pos].iov_len = opt->len; 2917 if (opt->value != NULL) { 2918 if (optuio->uio_segflg == UIO_SYSSPACE) { 2919 bcopy(opt->value, 2920 optuio->uio_iov[pos].iov_base, 2921 opt->len); 2922 } else { 2923 error = copyout(opt->value, 2924 optuio->uio_iov[pos].iov_base, 2925 opt->len); 2926 if (error) 2927 break; 2928 } 2929 } 2930 } 2931 } 2932 2933 done: 2934 /* Release any temporary prison holds and/or locks. */ 2935 if (pr != NULL) 2936 prison_deref(pr, drflags); 2937 else if (drflags & PD_LIST_SLOCKED) 2938 sx_sunlock(&allprison_lock); 2939 else if (drflags & PD_LIST_XLOCKED) 2940 sx_xunlock(&allprison_lock); 2941 /* Clean up other resources. */ 2942 if (jfp_out != NULL) 2943 (void)fdrop(jfp_out, td); 2944 if (error && jfd_out >= 0) 2945 (void)kern_close(td, jfd_out); 2946 if (error && errmsg_pos >= 0) { 2947 /* Write the error message back to userspace. */ 2948 vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); 2949 errmsg_pos = 2 * errmsg_pos + 1; 2950 if (errmsg_len > 0) { 2951 if (optuio->uio_segflg == UIO_SYSSPACE) 2952 bcopy(errmsg, 2953 optuio->uio_iov[errmsg_pos].iov_base, 2954 errmsg_len); 2955 else 2956 (void)copyout(errmsg, 2957 optuio->uio_iov[errmsg_pos].iov_base, 2958 errmsg_len); 2959 } 2960 } 2961 vfs_freeopts(opts); 2962 prison_free(mypr); 2963 return (error); 2964 } 2965 2966 /* 2967 * struct jail_remove_args { 2968 * int jid; 2969 * }; 2970 */ 2971 int 2972 sys_jail_remove(struct thread *td, struct jail_remove_args *uap) 2973 { 2974 struct prison *pr; 2975 int error; 2976 2977 error = priv_check(td, PRIV_JAIL_REMOVE); 2978 if (error) 2979 return (error); 2980 2981 sx_xlock(&allprison_lock); 2982 pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); 2983 if (pr == NULL) { 2984 sx_xunlock(&allprison_lock); 2985 return (EINVAL); 2986 } 2987 #ifdef MAC 2988 error = mac_prison_check_remove(td->td_ucred, pr); 2989 if (error != 0) { 2990 mtx_unlock(&pr->pr_mtx); 2991 sx_xunlock(&allprison_lock); 2992 return (error); 2993 } 2994 #endif 2995 prison_hold(pr); 2996 prison_remove(pr); 2997 return (0); 2998 } 2999 3000 /* 3001 * struct jail_remove_jd_args { 3002 * int fd; 3003 * }; 3004 */ 3005 int 3006 sys_jail_remove_jd(struct thread *td, struct jail_remove_jd_args *uap) 3007 { 3008 struct prison *pr; 3009 struct ucred *jdcred; 3010 int error; 3011 3012 error = jaildesc_find(td, uap->fd, &pr, &jdcred); 3013 if (error) 3014 return (error); 3015 error = priv_check_cred(jdcred, PRIV_JAIL_REMOVE); 3016 crfree(jdcred); 3017 #ifdef MAC 3018 if (error == 0) 3019 error = mac_prison_check_remove(td->td_ucred, pr); 3020 #endif 3021 if (error) { 3022 prison_free(pr); 3023 return (error); 3024 } 3025 sx_xlock(&allprison_lock); 3026 mtx_lock(&pr->pr_mtx); 3027 prison_remove(pr); 3028 return (0); 3029 } 3030 3031 /* 3032 * Begin the removal process for a prison. The allprison lock should 3033 * be held exclusively, and the prison should be both locked and held. 3034 */ 3035 void 3036 prison_remove(struct prison *pr) 3037 { 3038 sx_assert(&allprison_lock, SA_XLOCKED); 3039 mtx_assert(&pr->pr_mtx, MA_OWNED); 3040 prison_deref(pr, PD_KILL | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); 3041 } 3042 3043 /* 3044 * struct jail_attach_args { 3045 * int jid; 3046 * }; 3047 */ 3048 int 3049 sys_jail_attach(struct thread *td, struct jail_attach_args *uap) 3050 { 3051 struct prison *pr; 3052 int error; 3053 3054 error = priv_check(td, PRIV_JAIL_ATTACH); 3055 if (error) 3056 return (error); 3057 3058 sx_slock(&allprison_lock); 3059 pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); 3060 if (pr == NULL) { 3061 sx_sunlock(&allprison_lock); 3062 return (EINVAL); 3063 } 3064 3065 #ifdef MAC 3066 error = mac_prison_check_attach(td->td_ucred, pr); 3067 if (error != 0) 3068 goto unlock; 3069 #endif 3070 3071 /* Do not allow a process to attach to a prison that is not alive. */ 3072 if (!prison_isalive(pr)) { 3073 error = EINVAL; 3074 goto unlock; 3075 } 3076 3077 return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED)); 3078 3079 unlock: 3080 3081 mtx_unlock(&pr->pr_mtx); 3082 sx_sunlock(&allprison_lock); 3083 return (error); 3084 } 3085 3086 /* 3087 * struct jail_attach_jd_args { 3088 * int fd; 3089 * }; 3090 */ 3091 int 3092 sys_jail_attach_jd(struct thread *td, struct jail_attach_jd_args *uap) 3093 { 3094 struct prison *pr; 3095 struct ucred *jdcred; 3096 int drflags, error; 3097 3098 sx_slock(&allprison_lock); 3099 drflags = PD_LIST_SLOCKED; 3100 error = jaildesc_find(td, uap->fd, &pr, &jdcred); 3101 if (error) 3102 goto fail; 3103 drflags |= PD_DEREF; 3104 error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH); 3105 #ifdef MAC 3106 if (error == 0) 3107 error = mac_prison_check_attach(td->td_ucred, pr); 3108 #endif 3109 crfree(jdcred); 3110 if (error) 3111 goto fail; 3112 mtx_lock(&pr->pr_mtx); 3113 drflags |= PD_LOCKED; 3114 3115 /* Do not allow a process to attach to a prison that is not alive. */ 3116 if (!prison_isalive(pr)) { 3117 error = EINVAL; 3118 goto fail; 3119 } 3120 3121 return (do_jail_attach(td, pr, drflags)); 3122 3123 fail: 3124 prison_deref(pr, drflags); 3125 return (error); 3126 } 3127 3128 static int 3129 do_jail_attach(struct thread *td, struct prison *pr, int drflags) 3130 { 3131 struct proc *p; 3132 struct ucred *newcred, *oldcred; 3133 int error; 3134 3135 mtx_assert(&pr->pr_mtx, MA_OWNED); 3136 sx_assert(&allprison_lock, SX_LOCKED); 3137 drflags &= PD_LOCK_FLAGS; 3138 /* 3139 * XXX: Note that there is a slight race here if two threads 3140 * in the same privileged process attempt to attach to two 3141 * different jails at the same time. It is important for 3142 * user processes not to do this, or they might end up with 3143 * a process root from one prison, but attached to the jail 3144 * of another. 3145 */ 3146 if (!(drflags & PD_DEREF)) { 3147 prison_hold(pr); 3148 drflags |= PD_DEREF; 3149 } 3150 refcount_acquire(&pr->pr_uref); 3151 drflags |= PD_DEUREF; 3152 mtx_unlock(&pr->pr_mtx); 3153 drflags &= ~PD_LOCKED; 3154 3155 /* Let modules do whatever they need to prepare for attaching. */ 3156 error = osd_jail_call(pr, PR_METHOD_ATTACH, td); 3157 if (error) { 3158 prison_deref(pr, drflags); 3159 return (error); 3160 } 3161 sx_unlock(&allprison_lock); 3162 drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED); 3163 3164 /* 3165 * Reparent the newly attached process to this jail. 3166 */ 3167 p = td->td_proc; 3168 error = cpuset_setproc_update_set(p, pr->pr_cpuset); 3169 if (error) 3170 goto e_revert_osd; 3171 3172 vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); 3173 if ((error = change_dir(pr->pr_root, td)) != 0) 3174 goto e_unlock; 3175 #ifdef MAC 3176 if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) 3177 goto e_unlock; 3178 #endif 3179 VOP_UNLOCK(pr->pr_root); 3180 if ((error = pwd_chroot_chdir(td, pr->pr_root))) 3181 goto e_revert_osd; 3182 3183 newcred = crget(); 3184 PROC_LOCK(p); 3185 oldcred = crcopysafe(p, newcred); 3186 newcred->cr_prison = pr; 3187 #ifdef RACCT 3188 racct_proc_ucred_changed(p, oldcred, newcred); 3189 #endif 3190 #ifdef RCTL 3191 crhold(newcred); 3192 #endif 3193 /* 3194 * Takes over 'newcred''s reference, so 'newcred' must not be used 3195 * besides this point except on RCTL where we took an additional 3196 * reference above. 3197 */ 3198 proc_set_cred(p, newcred); 3199 setsugid(p); 3200 PROC_UNLOCK(p); 3201 #ifdef RCTL 3202 rctl_proc_ucred_changed(p, newcred); 3203 crfree(newcred); 3204 #endif 3205 prison_proc_relink(oldcred->cr_prison, pr, p); 3206 prison_deref(oldcred->cr_prison, drflags); 3207 crfree(oldcred); 3208 prison_knote(pr, NOTE_JAIL_ATTACH | td->td_proc->p_pid); 3209 #ifdef MAC 3210 /* 3211 * Note that mac_prison_attached() assumes that it's called in a 3212 * sleepable context. 3213 */ 3214 mac_prison_attached(td->td_ucred, pr, td->td_proc); 3215 #endif 3216 3217 /* 3218 * If the prison was killed while changing credentials, die along 3219 * with it. 3220 */ 3221 if (!prison_isalive(pr)) { 3222 PROC_LOCK(p); 3223 kern_psignal(p, SIGKILL); 3224 PROC_UNLOCK(p); 3225 } 3226 3227 return (0); 3228 3229 e_unlock: 3230 VOP_UNLOCK(pr->pr_root); 3231 e_revert_osd: 3232 /* Tell modules this thread is still in its old jail after all. */ 3233 sx_slock(&allprison_lock); 3234 drflags |= PD_LIST_SLOCKED; 3235 (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td); 3236 prison_deref(pr, drflags); 3237 return (error); 3238 } 3239 3240 /* 3241 * Returns a locked prison instance, or NULL on failure. 3242 */ 3243 struct prison * 3244 prison_find(int prid) 3245 { 3246 struct prison *pr; 3247 3248 sx_assert(&allprison_lock, SX_LOCKED); 3249 TAILQ_FOREACH(pr, &allprison, pr_list) { 3250 if (pr->pr_id < prid) 3251 continue; 3252 if (pr->pr_id > prid) 3253 break; 3254 KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); 3255 mtx_lock(&pr->pr_mtx); 3256 return (pr); 3257 } 3258 return (NULL); 3259 } 3260 3261 /* 3262 * Find a prison that is a descendant of mypr. Returns a locked prison or NULL. 3263 */ 3264 struct prison * 3265 prison_find_child(struct prison *mypr, int prid) 3266 { 3267 struct prison *pr; 3268 int descend; 3269 3270 sx_assert(&allprison_lock, SX_LOCKED); 3271 FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { 3272 if (pr->pr_id == prid) { 3273 KASSERT(prison_isvalid(pr), 3274 ("Found invalid prison %p", pr)); 3275 mtx_lock(&pr->pr_mtx); 3276 return (pr); 3277 } 3278 } 3279 return (NULL); 3280 } 3281 3282 /* 3283 * Look for the name relative to mypr. Returns a locked prison or NULL. 3284 */ 3285 struct prison * 3286 prison_find_name(struct prison *mypr, const char *name) 3287 { 3288 struct prison *pr, *deadpr; 3289 size_t mylen; 3290 int descend; 3291 3292 sx_assert(&allprison_lock, SX_LOCKED); 3293 mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1; 3294 deadpr = NULL; 3295 FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { 3296 if (!strcmp(pr->pr_name + mylen, name)) { 3297 KASSERT(prison_isvalid(pr), 3298 ("Found invalid prison %p", pr)); 3299 if (prison_isalive(pr)) { 3300 mtx_lock(&pr->pr_mtx); 3301 return (pr); 3302 } 3303 deadpr = pr; 3304 } 3305 } 3306 /* There was no valid prison - perhaps there was a dying one. */ 3307 if (deadpr != NULL) 3308 mtx_lock(&deadpr->pr_mtx); 3309 return (deadpr); 3310 } 3311 3312 /* 3313 * See if a prison has the specific flag set. The prison should be locked, 3314 * unless checking for flags that are only set at jail creation (such as 3315 * PR_IP4 and PR_IP6), or only the single bit is examined, without regard 3316 * to any other prison data. 3317 */ 3318 bool 3319 prison_flag(struct ucred *cred, unsigned flag) 3320 { 3321 3322 return ((cred->cr_prison->pr_flags & flag) != 0); 3323 } 3324 3325 /* 3326 * See if a prison has the specific allow flag set. 3327 * The prison *should* be locked, or only a single bit is examined, without 3328 * regard to any other prison data. 3329 */ 3330 bool 3331 prison_allow(struct ucred *cred, unsigned flag) 3332 { 3333 3334 return ((cred->cr_prison->pr_allow & flag) != 0); 3335 } 3336 3337 /* 3338 * Hold a prison reference, by incrementing pr_ref. It is generally 3339 * an error to hold a prison that does not already have a reference. 3340 * A prison record will remain valid as long as it has at least one 3341 * reference, and will not be removed as long as either the prison 3342 * mutex or the allprison lock is held (allprison_lock may be shared). 3343 */ 3344 void 3345 prison_hold_locked(struct prison *pr) 3346 { 3347 3348 /* Locking is no longer required. */ 3349 prison_hold(pr); 3350 } 3351 3352 void 3353 prison_hold(struct prison *pr) 3354 { 3355 #ifdef INVARIANTS 3356 int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref); 3357 3358 KASSERT(was_valid, 3359 ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id)); 3360 #else 3361 refcount_acquire(&pr->pr_ref); 3362 #endif 3363 } 3364 3365 /* 3366 * Remove a prison reference. If that was the last reference, the 3367 * prison will be removed (at a later time). 3368 */ 3369 void 3370 prison_free_locked(struct prison *pr) 3371 { 3372 3373 mtx_assert(&pr->pr_mtx, MA_OWNED); 3374 /* 3375 * Locking is no longer required, but unlock because the caller 3376 * expects it. 3377 */ 3378 mtx_unlock(&pr->pr_mtx); 3379 prison_free(pr); 3380 } 3381 3382 void 3383 prison_free(struct prison *pr) 3384 { 3385 3386 KASSERT(refcount_load(&pr->pr_ref) > 0, 3387 ("Trying to free dead prison %p (jid=%d).", 3388 pr, pr->pr_id)); 3389 if (!refcount_release_if_not_last(&pr->pr_ref)) { 3390 /* 3391 * Don't remove the last reference in this context, 3392 * in case there are locks held. 3393 */ 3394 taskqueue_enqueue(taskqueue_jail_remove, &pr->pr_task); 3395 } 3396 } 3397 3398 static void 3399 prison_free_not_last(struct prison *pr) 3400 { 3401 #ifdef INVARIANTS 3402 int lastref; 3403 3404 KASSERT(refcount_load(&pr->pr_ref) > 0, 3405 ("Trying to free dead prison %p (jid=%d).", 3406 pr, pr->pr_id)); 3407 lastref = refcount_release(&pr->pr_ref); 3408 KASSERT(!lastref, 3409 ("prison_free_not_last freed last ref on prison %p (jid=%d).", 3410 pr, pr->pr_id)); 3411 #else 3412 refcount_release(&pr->pr_ref); 3413 #endif 3414 } 3415 3416 /* 3417 * Hold a prison for user visibility, by incrementing pr_uref. 3418 * It is generally an error to hold a prison that isn't already 3419 * user-visible, except through the jail system calls. It is also 3420 * an error to hold an invalid prison. A prison record will remain 3421 * alive as long as it has at least one user reference, and will not 3422 * be set to the dying state until the prison mutex and allprison_lock 3423 * are both freed. 3424 */ 3425 void 3426 prison_proc_hold(struct prison *pr) 3427 { 3428 #ifdef INVARIANTS 3429 int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref); 3430 3431 KASSERT(was_alive, 3432 ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id)); 3433 #else 3434 refcount_acquire(&pr->pr_uref); 3435 #endif 3436 } 3437 3438 /* 3439 * Remove a prison user reference. If it was the last reference, the 3440 * prison will be considered "dying", and may be removed once all of 3441 * its references are dropped. 3442 */ 3443 void 3444 prison_proc_free(struct prison *pr) 3445 { 3446 3447 /* 3448 * Locking is only required when releasing the last reference. 3449 * This allows assurance that a locked prison will remain alive 3450 * until it is unlocked. 3451 */ 3452 KASSERT(refcount_load(&pr->pr_uref) > 0, 3453 ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); 3454 if (!refcount_release_if_not_last(&pr->pr_uref)) { 3455 /* 3456 * Don't remove the last user reference in this context, 3457 * which is expected to be a process that is not only locked, 3458 * but also half dead. Add a reference so any calls to 3459 * prison_free() won't re-submit the task. 3460 */ 3461 prison_hold(pr); 3462 mtx_lock(&pr->pr_mtx); 3463 KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC), 3464 ("Redundant last reference in prison_proc_free (jid=%d)", 3465 pr->pr_id)); 3466 pr->pr_flags |= PR_COMPLETE_PROC; 3467 mtx_unlock(&pr->pr_mtx); 3468 taskqueue_enqueue(taskqueue_jail_remove, &pr->pr_task); 3469 } 3470 } 3471 3472 static void 3473 prison_proc_free_not_last(struct prison *pr) 3474 { 3475 #ifdef INVARIANTS 3476 int lastref; 3477 3478 KASSERT(refcount_load(&pr->pr_uref) > 0, 3479 ("Trying to free dead prison %p (jid=%d).", 3480 pr, pr->pr_id)); 3481 lastref = refcount_release(&pr->pr_uref); 3482 KASSERT(!lastref, 3483 ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).", 3484 pr, pr->pr_id)); 3485 #else 3486 refcount_release(&pr->pr_uref); 3487 #endif 3488 } 3489 3490 void 3491 prison_proc_link(struct prison *pr, struct proc *p) 3492 { 3493 3494 sx_assert(&allproc_lock, SA_XLOCKED); 3495 LIST_INSERT_HEAD(&pr->pr_proclist, p, p_jaillist); 3496 } 3497 3498 void 3499 prison_proc_unlink(struct prison *pr, struct proc *p) 3500 { 3501 3502 sx_assert(&allproc_lock, SA_XLOCKED); 3503 LIST_REMOVE(p, p_jaillist); 3504 } 3505 3506 static void 3507 prison_proc_relink(struct prison *opr, struct prison *npr, struct proc *p) 3508 { 3509 3510 sx_xlock(&allproc_lock); 3511 prison_proc_unlink(opr, p); 3512 prison_proc_link(npr, p); 3513 sx_xunlock(&allproc_lock); 3514 } 3515 3516 /* 3517 * Complete a call to either prison_free or prison_proc_free. 3518 */ 3519 static void 3520 prison_complete(void *context, int pending) 3521 { 3522 struct prison *pr = context; 3523 int drflags; 3524 3525 /* 3526 * This could be called to release the last reference, or the last 3527 * user reference (plus the reference held in prison_proc_free). 3528 */ 3529 drflags = prison_lock_xlock(pr, PD_DEREF); 3530 if (pr->pr_flags & PR_COMPLETE_PROC) { 3531 pr->pr_flags &= ~PR_COMPLETE_PROC; 3532 drflags |= PD_DEUREF; 3533 } 3534 prison_deref(pr, drflags); 3535 } 3536 3537 static void 3538 prison_kill_processes_cb(struct proc *p, void *arg __unused) 3539 { 3540 3541 kern_psignal(p, SIGKILL); 3542 } 3543 3544 /* 3545 * Note the iteration does not guarantee acting on all processes. 3546 * Most notably there may be fork or jail_attach in progress. 3547 */ 3548 void 3549 prison_proc_iterate(struct prison *pr, void (*cb)(struct proc *, void *), 3550 void *cbarg) 3551 { 3552 struct prison *ppr; 3553 struct proc *p; 3554 3555 if (atomic_load_int(&pr->pr_childcount) == 0) { 3556 sx_slock(&allproc_lock); 3557 LIST_FOREACH(p, &pr->pr_proclist, p_jaillist) { 3558 if (p->p_state == PRS_NEW) 3559 continue; 3560 PROC_LOCK(p); 3561 cb(p, cbarg); 3562 PROC_UNLOCK(p); 3563 } 3564 sx_sunlock(&allproc_lock); 3565 if (atomic_load_int(&pr->pr_childcount) == 0) 3566 return; 3567 /* 3568 * Some jails popped up during the iteration, fall through to a 3569 * system-wide search. 3570 */ 3571 } 3572 3573 sx_slock(&allproc_lock); 3574 FOREACH_PROC_IN_SYSTEM(p) { 3575 PROC_LOCK(p); 3576 if (p->p_state != PRS_NEW && p->p_ucred != NULL) { 3577 for (ppr = p->p_ucred->cr_prison; ppr != NULL; 3578 ppr = ppr->pr_parent) { 3579 if (ppr == pr) { 3580 cb(p, cbarg); 3581 break; 3582 } 3583 } 3584 } 3585 PROC_UNLOCK(p); 3586 } 3587 sx_sunlock(&allproc_lock); 3588 } 3589 3590 /* 3591 * Remove a prison reference and/or user reference (usually). 3592 * This assumes context that allows sleeping (for allprison_lock), 3593 * with no non-sleeping locks held, except perhaps the prison itself. 3594 * If there are no more references, release and delist the prison. 3595 * On completion, the prison lock and the allprison lock are both 3596 * unlocked. 3597 */ 3598 static void 3599 prison_deref(struct prison *pr, int flags) 3600 { 3601 struct prisonlist freeprison; 3602 struct prison *killpr, *rpr, *ppr, *tpr; 3603 3604 killpr = NULL; 3605 TAILQ_INIT(&freeprison); 3606 /* 3607 * Release this prison as requested, which may cause its parent 3608 * to be released, and then maybe its grandparent, etc. 3609 */ 3610 for (;;) { 3611 if (flags & PD_KILL) { 3612 /* Kill the prison and its descendents. */ 3613 KASSERT(pr != &prison0, 3614 ("prison_deref trying to kill prison0")); 3615 if (!prison_isalive(pr)) { 3616 /* Silently ignore already-dying prisons. */ 3617 flags &= ~PD_KILL; 3618 } else { 3619 if (!(flags & PD_DEREF)) { 3620 prison_hold(pr); 3621 flags |= PD_DEREF; 3622 } 3623 flags = prison_lock_xlock(pr, flags); 3624 prison_deref_kill(pr, &freeprison); 3625 } 3626 } 3627 if (flags & PD_DEUREF) { 3628 /* Drop a user reference. */ 3629 KASSERT(refcount_load(&pr->pr_uref) > 0, 3630 ("prison_deref PD_DEUREF on a dead prison (jid=%d)", 3631 pr->pr_id)); 3632 if (!refcount_release_if_not_last(&pr->pr_uref)) { 3633 if (!(flags & PD_DEREF)) { 3634 prison_hold(pr); 3635 flags |= PD_DEREF; 3636 } 3637 flags = prison_lock_xlock(pr, flags); 3638 if (refcount_release(&pr->pr_uref) && 3639 pr->pr_state == PRISON_STATE_ALIVE) { 3640 /* 3641 * When the last user references goes, 3642 * this becomes a dying prison. 3643 */ 3644 KASSERT( 3645 refcount_load(&prison0.pr_uref) > 0, 3646 ("prison0 pr_uref=0")); 3647 pr->pr_state = PRISON_STATE_DYING; 3648 prison_cleanup_locked(pr); 3649 mtx_unlock(&pr->pr_mtx); 3650 flags &= ~PD_LOCKED; 3651 prison_cleanup_unlocked(pr); 3652 } 3653 } 3654 } 3655 if (flags & PD_KILL) { 3656 /* 3657 * Any remaining user references are probably processes 3658 * that need to be killed, either in this prison or its 3659 * descendants. 3660 */ 3661 if (refcount_load(&pr->pr_uref) > 0) 3662 killpr = pr; 3663 /* Make sure the parent prison doesn't get killed. */ 3664 flags &= ~PD_KILL; 3665 } 3666 if (flags & PD_DEREF) { 3667 /* Drop a reference. */ 3668 KASSERT(refcount_load(&pr->pr_ref) > 0, 3669 ("prison_deref PD_DEREF on a dead prison (jid=%d)", 3670 pr->pr_id)); 3671 if (!refcount_release_if_not_last(&pr->pr_ref)) { 3672 flags = prison_lock_xlock(pr, flags); 3673 if (refcount_release(&pr->pr_ref)) { 3674 /* 3675 * When the last reference goes, 3676 * unlink the prison and set it aside. 3677 */ 3678 KASSERT( 3679 refcount_load(&pr->pr_uref) == 0, 3680 ("prison_deref: last ref, " 3681 "but still has %d urefs (jid=%d)", 3682 pr->pr_uref, pr->pr_id)); 3683 KASSERT( 3684 refcount_load(&prison0.pr_ref) != 0, 3685 ("prison0 pr_ref=0")); 3686 #ifdef MAC 3687 /* 3688 * The MAC framework will call into any 3689 * policies that want to hook 3690 * prison_destroy_label, so ideally we 3691 * call this prior to any final state 3692 * invalidation to be safe. 3693 */ 3694 mac_prison_destroy(pr); 3695 #endif 3696 pr->pr_state = PRISON_STATE_INVALID; 3697 TAILQ_REMOVE(&allprison, pr, pr_list); 3698 LIST_REMOVE(pr, pr_sibling); 3699 TAILQ_INSERT_TAIL(&freeprison, pr, 3700 pr_list); 3701 for (ppr = pr->pr_parent; 3702 ppr != NULL; 3703 ppr = ppr->pr_parent) 3704 ppr->pr_childcount--; 3705 /* 3706 * Removing a prison frees references 3707 * from its parent. 3708 */ 3709 ppr = pr->pr_parent; 3710 pr->pr_parent = NULL; 3711 mtx_unlock(&pr->pr_mtx); 3712 3713 pr = ppr; 3714 flags &= ~PD_LOCKED; 3715 flags |= PD_DEREF | PD_DEUREF; 3716 continue; 3717 } 3718 } 3719 } 3720 break; 3721 } 3722 3723 /* Release all the prison locks. */ 3724 if (flags & PD_LOCKED) 3725 mtx_unlock(&pr->pr_mtx); 3726 if (flags & PD_LIST_SLOCKED) 3727 sx_sunlock(&allprison_lock); 3728 else if (flags & PD_LIST_XLOCKED) 3729 sx_xunlock(&allprison_lock); 3730 3731 /* Kill any processes attached to a killed prison. */ 3732 if (killpr != NULL) 3733 prison_proc_iterate(killpr, prison_kill_processes_cb, NULL); 3734 3735 /* 3736 * Finish removing any unreferenced prisons, which couldn't happen 3737 * while allprison_lock was held (to avoid a LOR on vrele). 3738 */ 3739 TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) { 3740 #ifdef VIMAGE 3741 if (rpr->pr_flags & PR_VNET) 3742 vnet_destroy(rpr->pr_vnet); 3743 #endif 3744 if (rpr->pr_root != NULL) 3745 vrele(rpr->pr_root); 3746 mtx_destroy(&rpr->pr_mtx); 3747 #ifdef INET 3748 prison_ip_free(rpr->pr_addrs[PR_INET]); 3749 #endif 3750 #ifdef INET6 3751 prison_ip_free(rpr->pr_addrs[PR_INET6]); 3752 #endif 3753 if (rpr->pr_cpuset != NULL) 3754 cpuset_rel(rpr->pr_cpuset); 3755 osd_jail_exit(rpr); 3756 #ifdef RACCT 3757 if (racct_enable) 3758 prison_racct_detach(rpr); 3759 #endif 3760 TAILQ_REMOVE(&freeprison, rpr, pr_list); 3761 free(rpr, M_PRISON); 3762 } 3763 } 3764 3765 /* 3766 * Kill the prison and its descendants. Mark them as dying, clear the 3767 * persist flag, and call module remove methods. 3768 */ 3769 static void 3770 prison_deref_kill(struct prison *pr, struct prisonlist *freeprison) 3771 { 3772 struct prison *cpr, *ppr, *rpr; 3773 bool descend; 3774 3775 /* 3776 * Unlike the descendants, the target prison can be killed 3777 * even if it is currently dying. This is useful for failed 3778 * creation in jail_set(2). 3779 */ 3780 KASSERT(refcount_load(&pr->pr_ref) > 0, 3781 ("Trying to kill dead prison %p (jid=%d).", 3782 pr, pr->pr_id)); 3783 refcount_acquire(&pr->pr_uref); 3784 pr->pr_state = PRISON_STATE_DYING; 3785 mtx_unlock(&pr->pr_mtx); 3786 3787 rpr = NULL; 3788 FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) { 3789 if (descend) { 3790 if (!prison_isalive(cpr)) { 3791 descend = false; 3792 continue; 3793 } 3794 prison_hold(cpr); 3795 prison_proc_hold(cpr); 3796 mtx_lock(&cpr->pr_mtx); 3797 cpr->pr_state = PRISON_STATE_DYING; 3798 cpr->pr_flags |= PR_REMOVE; 3799 mtx_unlock(&cpr->pr_mtx); 3800 continue; 3801 } 3802 if (!(cpr->pr_flags & PR_REMOVE)) 3803 continue; 3804 prison_cleanup_unlocked(cpr); 3805 mtx_lock(&cpr->pr_mtx); 3806 prison_cleanup_locked(cpr); 3807 cpr->pr_flags &= ~PR_REMOVE; 3808 if (cpr->pr_flags & PR_PERSIST) { 3809 cpr->pr_flags &= ~PR_PERSIST; 3810 prison_proc_free_not_last(cpr); 3811 prison_free_not_last(cpr); 3812 } 3813 (void)refcount_release(&cpr->pr_uref); 3814 if (refcount_release(&cpr->pr_ref)) { 3815 /* 3816 * When the last reference goes, unlink the prison 3817 * and set it aside for prison_deref() to handle. 3818 * Delay unlinking the sibling list to keep the loop 3819 * safe. 3820 */ 3821 if (rpr != NULL) 3822 LIST_REMOVE(rpr, pr_sibling); 3823 rpr = cpr; 3824 rpr->pr_state = PRISON_STATE_INVALID; 3825 TAILQ_REMOVE(&allprison, rpr, pr_list); 3826 TAILQ_INSERT_TAIL(freeprison, rpr, pr_list); 3827 /* 3828 * Removing a prison frees references from its parent. 3829 */ 3830 ppr = rpr->pr_parent; 3831 prison_proc_free_not_last(ppr); 3832 prison_free_not_last(ppr); 3833 for (; ppr != NULL; ppr = ppr->pr_parent) 3834 ppr->pr_childcount--; 3835 } 3836 mtx_unlock(&cpr->pr_mtx); 3837 } 3838 if (rpr != NULL) 3839 LIST_REMOVE(rpr, pr_sibling); 3840 3841 prison_cleanup_unlocked(pr); 3842 mtx_lock(&pr->pr_mtx); 3843 prison_cleanup_locked(pr); 3844 if (pr->pr_flags & PR_PERSIST) { 3845 pr->pr_flags &= ~PR_PERSIST; 3846 prison_proc_free_not_last(pr); 3847 prison_free_not_last(pr); 3848 } 3849 (void)refcount_release(&pr->pr_uref); 3850 } 3851 3852 /* 3853 * Given the current locking state in the flags, make sure allprison_lock 3854 * is held exclusive, and the prison is locked. Return flags indicating 3855 * the new state. 3856 */ 3857 static int 3858 prison_lock_xlock(struct prison *pr, int flags) 3859 { 3860 3861 if (!(flags & PD_LIST_XLOCKED)) { 3862 /* 3863 * Get allprison_lock, which may be an upgrade, 3864 * and may require unlocking the prison. 3865 */ 3866 if (flags & PD_LOCKED) { 3867 mtx_unlock(&pr->pr_mtx); 3868 flags &= ~PD_LOCKED; 3869 } 3870 if (flags & PD_LIST_SLOCKED) { 3871 if (!sx_try_upgrade(&allprison_lock)) { 3872 sx_sunlock(&allprison_lock); 3873 sx_xlock(&allprison_lock); 3874 } 3875 flags &= ~PD_LIST_SLOCKED; 3876 } else 3877 sx_xlock(&allprison_lock); 3878 flags |= PD_LIST_XLOCKED; 3879 } 3880 if (!(flags & PD_LOCKED)) { 3881 /* Lock the prison mutex. */ 3882 mtx_lock(&pr->pr_mtx); 3883 flags |= PD_LOCKED; 3884 } 3885 return flags; 3886 } 3887 3888 /* 3889 * Release a prison's resources when it starts dying (when the last user 3890 * reference is dropped, or when it is killed). Two functions are called, 3891 * for work that requires a locked prison or an unlocked one. 3892 */ 3893 static void 3894 prison_cleanup_locked(struct prison *pr) 3895 { 3896 sx_assert(&allprison_lock, SA_XLOCKED); 3897 mtx_assert(&pr->pr_mtx, MA_OWNED); 3898 prison_knote(pr, NOTE_JAIL_REMOVE); 3899 knlist_detach(pr->pr_klist); 3900 jaildesc_prison_cleanup(pr); 3901 pr->pr_klist = NULL; 3902 } 3903 3904 static void 3905 prison_cleanup_unlocked(struct prison *pr) 3906 { 3907 sx_assert(&allprison_lock, SA_XLOCKED); 3908 mtx_assert(&pr->pr_mtx, MA_NOTOWNED); 3909 vfs_exjail_delete(pr); 3910 shm_remove_prison(pr); 3911 (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); 3912 } 3913 3914 /* 3915 * Set or clear a permission bit in the pr_allow field, passing restrictions 3916 * (cleared permission) down to child jails. 3917 */ 3918 void 3919 prison_set_allow(struct ucred *cred, unsigned flag, int enable) 3920 { 3921 struct prison *pr; 3922 3923 pr = cred->cr_prison; 3924 sx_slock(&allprison_lock); 3925 mtx_lock(&pr->pr_mtx); 3926 prison_set_allow_locked(pr, flag, enable); 3927 mtx_unlock(&pr->pr_mtx); 3928 sx_sunlock(&allprison_lock); 3929 } 3930 3931 static void 3932 prison_set_allow_locked(struct prison *pr, unsigned flag, int enable) 3933 { 3934 struct prison *cpr; 3935 int descend; 3936 3937 if (enable != 0) 3938 pr->pr_allow |= flag; 3939 else { 3940 pr->pr_allow &= ~flag; 3941 FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) 3942 cpr->pr_allow &= ~flag; 3943 } 3944 } 3945 3946 /* 3947 * Check if a jail supports the given address family. 3948 * 3949 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT 3950 * if not. 3951 */ 3952 int 3953 prison_check_af(struct ucred *cred, int af) 3954 { 3955 struct prison *pr; 3956 int error; 3957 3958 KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); 3959 3960 pr = cred->cr_prison; 3961 #ifdef VIMAGE 3962 /* Prisons with their own network stack are not limited. */ 3963 if (prison_owns_vnet(pr)) 3964 return (0); 3965 #endif 3966 3967 error = 0; 3968 switch (af) 3969 { 3970 #ifdef INET 3971 case AF_INET: 3972 if (pr->pr_flags & PR_IP4) 3973 { 3974 mtx_lock(&pr->pr_mtx); 3975 if ((pr->pr_flags & PR_IP4) && 3976 pr->pr_addrs[PR_INET] == NULL) 3977 error = EAFNOSUPPORT; 3978 mtx_unlock(&pr->pr_mtx); 3979 } 3980 break; 3981 #endif 3982 #ifdef INET6 3983 case AF_INET6: 3984 if (pr->pr_flags & PR_IP6) 3985 { 3986 mtx_lock(&pr->pr_mtx); 3987 if ((pr->pr_flags & PR_IP6) && 3988 pr->pr_addrs[PR_INET6] == NULL) 3989 error = EAFNOSUPPORT; 3990 mtx_unlock(&pr->pr_mtx); 3991 } 3992 break; 3993 #endif 3994 case AF_LOCAL: 3995 case AF_ROUTE: 3996 case AF_NETLINK: 3997 break; 3998 default: 3999 if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF)) 4000 error = EAFNOSUPPORT; 4001 } 4002 return (error); 4003 } 4004 4005 /* 4006 * Check if given address belongs to the jail referenced by cred (wrapper to 4007 * prison_check_ip[46]). 4008 * 4009 * Returns 0 if jail doesn't restrict the address family or if address belongs 4010 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if 4011 * the jail doesn't allow the address family. IPv4 Address passed in in NBO. 4012 */ 4013 int 4014 prison_if(struct ucred *cred, const struct sockaddr *sa) 4015 { 4016 #ifdef INET 4017 const struct sockaddr_in *sai; 4018 #endif 4019 #ifdef INET6 4020 const struct sockaddr_in6 *sai6; 4021 #endif 4022 int error; 4023 4024 KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); 4025 KASSERT(sa != NULL, ("%s: sa is NULL", __func__)); 4026 4027 #ifdef VIMAGE 4028 if (prison_owns_vnet(cred->cr_prison)) 4029 return (0); 4030 #endif 4031 4032 error = 0; 4033 switch (sa->sa_family) 4034 { 4035 #ifdef INET 4036 case AF_INET: 4037 sai = (const struct sockaddr_in *)sa; 4038 error = prison_check_ip4(cred, &sai->sin_addr); 4039 break; 4040 #endif 4041 #ifdef INET6 4042 case AF_INET6: 4043 sai6 = (const struct sockaddr_in6 *)sa; 4044 error = prison_check_ip6(cred, &sai6->sin6_addr); 4045 break; 4046 #endif 4047 default: 4048 if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF)) 4049 error = EAFNOSUPPORT; 4050 } 4051 return (error); 4052 } 4053 4054 /* 4055 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. 4056 */ 4057 int 4058 prison_check(struct ucred *cred1, struct ucred *cred2) 4059 { 4060 4061 return ((cred1->cr_prison == cred2->cr_prison || 4062 prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH); 4063 } 4064 4065 /* 4066 * For mountd/nfsd to run within a prison, it must be: 4067 * - A vnet prison. 4068 * - PR_ALLOW_NFSD must be set on it. 4069 * - The root directory (pr_root) of the prison must be 4070 * a file system mount point, so the mountd can hang 4071 * export information on it. 4072 * - The prison's enforce_statfs cannot be 0, so that 4073 * mountd(8) can do exports. 4074 */ 4075 bool 4076 prison_check_nfsd(struct ucred *cred) 4077 { 4078 4079 if (jailed_without_vnet(cred)) 4080 return (false); 4081 if (!prison_allow(cred, PR_ALLOW_NFSD)) 4082 return (false); 4083 if ((cred->cr_prison->pr_root->v_vflag & VV_ROOT) == 0) 4084 return (false); 4085 if (cred->cr_prison->pr_enforce_statfs == 0) 4086 return (false); 4087 return (true); 4088 } 4089 4090 /* 4091 * Return true if p2 is a child of p1, otherwise false. 4092 */ 4093 bool 4094 prison_ischild(struct prison *pr1, struct prison *pr2) 4095 { 4096 4097 for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent) 4098 if (pr1 == pr2) 4099 return (true); 4100 return (false); 4101 } 4102 4103 /* 4104 * Return true if the prison is currently alive. A prison is alive if it 4105 * holds user references and it isn't being removed. 4106 */ 4107 bool 4108 prison_isalive(const struct prison *pr) 4109 { 4110 4111 if (__predict_false(pr->pr_state != PRISON_STATE_ALIVE)) 4112 return (false); 4113 return (true); 4114 } 4115 4116 /* 4117 * Return true if the prison is currently valid. A prison is valid if it has 4118 * been fully created, and is not being destroyed. Note that dying prisons 4119 * are still considered valid. Invalid prisons won't be found under normal 4120 * circumstances, as they're only put in that state by functions that have 4121 * an exclusive hold on allprison_lock. 4122 */ 4123 bool 4124 prison_isvalid(struct prison *pr) 4125 { 4126 4127 if (__predict_false(pr->pr_state == PRISON_STATE_INVALID)) 4128 return (false); 4129 if (__predict_false(refcount_load(&pr->pr_ref) == 0)) 4130 return (false); 4131 return (true); 4132 } 4133 4134 /* 4135 * Return true if the passed credential is in a jail and that jail does not 4136 * have its own virtual network stack, otherwise false. 4137 */ 4138 bool 4139 jailed_without_vnet(struct ucred *cred) 4140 { 4141 4142 if (!jailed(cred)) 4143 return (false); 4144 #ifdef VIMAGE 4145 if (prison_owns_vnet(cred->cr_prison)) 4146 return (false); 4147 #endif 4148 4149 return (true); 4150 } 4151 4152 /* 4153 * Return the correct hostname (domainname, et al) for the passed credential. 4154 */ 4155 void 4156 getcredhostname(struct ucred *cred, char *buf, size_t size) 4157 { 4158 struct prison *pr; 4159 4160 /* 4161 * A NULL credential can be used to shortcut to the physical 4162 * system's hostname. 4163 */ 4164 pr = (cred != NULL) ? cred->cr_prison : &prison0; 4165 mtx_lock(&pr->pr_mtx); 4166 strlcpy(buf, pr->pr_hostname, size); 4167 mtx_unlock(&pr->pr_mtx); 4168 } 4169 4170 void 4171 getcreddomainname(struct ucred *cred, char *buf, size_t size) 4172 { 4173 4174 mtx_lock(&cred->cr_prison->pr_mtx); 4175 strlcpy(buf, cred->cr_prison->pr_domainname, size); 4176 mtx_unlock(&cred->cr_prison->pr_mtx); 4177 } 4178 4179 void 4180 getcredhostuuid(struct ucred *cred, char *buf, size_t size) 4181 { 4182 4183 mtx_lock(&cred->cr_prison->pr_mtx); 4184 strlcpy(buf, cred->cr_prison->pr_hostuuid, size); 4185 mtx_unlock(&cred->cr_prison->pr_mtx); 4186 } 4187 4188 void 4189 getcredhostid(struct ucred *cred, unsigned long *hostid) 4190 { 4191 4192 mtx_lock(&cred->cr_prison->pr_mtx); 4193 *hostid = cred->cr_prison->pr_hostid; 4194 mtx_unlock(&cred->cr_prison->pr_mtx); 4195 } 4196 4197 void 4198 getjailname(struct ucred *cred, char *name, size_t len) 4199 { 4200 4201 mtx_lock(&cred->cr_prison->pr_mtx); 4202 strlcpy(name, cred->cr_prison->pr_name, len); 4203 mtx_unlock(&cred->cr_prison->pr_mtx); 4204 } 4205 4206 #ifdef VIMAGE 4207 /* 4208 * Determine whether the prison owns its VNET. 4209 */ 4210 bool 4211 prison_owns_vnet(struct prison *pr) 4212 { 4213 4214 /* 4215 * vnets cannot be added/removed after jail creation, 4216 * so no need to lock here. 4217 */ 4218 return ((pr->pr_flags & PR_VNET) != 0); 4219 } 4220 #endif 4221 4222 /* 4223 * Determine whether the subject represented by cred can "see" 4224 * status of a mount point. 4225 * Returns: 0 for permitted, ENOENT otherwise. 4226 * XXX: This function should be called cr_canseemount() and should be 4227 * placed in kern_prot.c. 4228 */ 4229 int 4230 prison_canseemount(struct ucred *cred, struct mount *mp) 4231 { 4232 struct prison *pr; 4233 struct statfs *sp; 4234 size_t len; 4235 4236 pr = cred->cr_prison; 4237 if (pr->pr_enforce_statfs == 0) 4238 return (0); 4239 if (pr->pr_root->v_mount == mp) 4240 return (0); 4241 if (pr->pr_enforce_statfs == 2) 4242 return (ENOENT); 4243 /* 4244 * If jail's chroot directory is set to "/" we should be able to see 4245 * all mount-points from inside a jail. 4246 * This is ugly check, but this is the only situation when jail's 4247 * directory ends with '/'. 4248 */ 4249 if (strcmp(pr->pr_path, "/") == 0) 4250 return (0); 4251 len = strlen(pr->pr_path); 4252 sp = &mp->mnt_stat; 4253 if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) 4254 return (ENOENT); 4255 /* 4256 * Be sure that we don't have situation where jail's root directory 4257 * is "/some/path" and mount point is "/some/pathpath". 4258 */ 4259 if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') 4260 return (ENOENT); 4261 return (0); 4262 } 4263 4264 void 4265 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) 4266 { 4267 char jpath[MAXPATHLEN]; 4268 struct prison *pr; 4269 size_t len; 4270 4271 pr = cred->cr_prison; 4272 if (pr->pr_enforce_statfs == 0) 4273 return; 4274 if (prison_canseemount(cred, mp) != 0) { 4275 bzero(&sp->f_fsid, sizeof(sp->f_fsid)); 4276 bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); 4277 strlcpy(sp->f_mntonname, "[restricted]", 4278 sizeof(sp->f_mntonname)); 4279 return; 4280 } 4281 if (pr->pr_enforce_statfs > 1) 4282 bzero(&sp->f_fsid, sizeof(sp->f_fsid)); 4283 if (pr->pr_root->v_mount == mp) { 4284 /* 4285 * Clear current buffer data, so we are sure nothing from 4286 * the valid path left there. 4287 */ 4288 bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); 4289 *sp->f_mntonname = '/'; 4290 return; 4291 } 4292 /* 4293 * If jail's chroot directory is set to "/" we should be able to see 4294 * all mount-points from inside a jail. 4295 */ 4296 if (strcmp(pr->pr_path, "/") == 0) 4297 return; 4298 len = strlen(pr->pr_path); 4299 strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); 4300 /* 4301 * Clear current buffer data, so we are sure nothing from 4302 * the valid path left there. 4303 */ 4304 bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); 4305 if (*jpath == '\0') { 4306 /* Should never happen. */ 4307 *sp->f_mntonname = '/'; 4308 } else { 4309 strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); 4310 } 4311 } 4312 4313 /* 4314 * Check with permission for a specific privilege is granted within jail. We 4315 * have a specific list of accepted privileges; the rest are denied. 4316 */ 4317 int 4318 prison_priv_check(struct ucred *cred, int priv) 4319 { 4320 struct prison *pr; 4321 int error; 4322 4323 /* 4324 * Some policies have custom handlers. This routine should not be 4325 * called for them. See priv_check_cred(). 4326 */ 4327 switch (priv) { 4328 case PRIV_VFS_LOOKUP: 4329 case PRIV_VFS_GENERATION: 4330 KASSERT(0, ("prison_priv_check instead of a custom handler " 4331 "called for %d\n", priv)); 4332 } 4333 4334 if (!jailed(cred)) 4335 return (0); 4336 4337 #ifdef VIMAGE 4338 /* 4339 * Privileges specific to prisons with a virtual network stack. 4340 * There might be a duplicate entry here in case the privilege 4341 * is only granted conditionally in the legacy jail case. 4342 */ 4343 switch (priv) { 4344 /* 4345 * NFS-specific privileges. 4346 */ 4347 case PRIV_NFS_DAEMON: 4348 case PRIV_VFS_GETFH: 4349 case PRIV_VFS_MOUNT_EXPORTED: 4350 if (!prison_check_nfsd(cred)) 4351 return (EPERM); 4352 #ifdef notyet 4353 case PRIV_NFS_LOCKD: 4354 #endif 4355 /* 4356 * Network stack privileges. 4357 */ 4358 case PRIV_NET_BRIDGE: 4359 case PRIV_NET_GRE: 4360 case PRIV_NET_BPF: 4361 case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */ 4362 case PRIV_NET_ROUTE: 4363 case PRIV_NET_TAP: 4364 case PRIV_NET_SETIFMTU: 4365 case PRIV_NET_SETIFFLAGS: 4366 case PRIV_NET_SETIFCAP: 4367 case PRIV_NET_SETIFDESCR: 4368 case PRIV_NET_SETIFNAME : 4369 case PRIV_NET_SETIFMETRIC: 4370 case PRIV_NET_SETIFPHYS: 4371 case PRIV_NET_SETIFMAC: 4372 case PRIV_NET_SETLANPCP: 4373 case PRIV_NET_ADDMULTI: 4374 case PRIV_NET_DELMULTI: 4375 case PRIV_NET_HWIOCTL: 4376 case PRIV_NET_SETLLADDR: 4377 case PRIV_NET_ADDIFGROUP: 4378 case PRIV_NET_DELIFGROUP: 4379 case PRIV_NET_IFCREATE: 4380 case PRIV_NET_IFDESTROY: 4381 case PRIV_NET_ADDIFADDR: 4382 case PRIV_NET_DELIFADDR: 4383 case PRIV_NET_LAGG: 4384 case PRIV_NET_GIF: 4385 case PRIV_NET_SETIFVNET: 4386 case PRIV_NET_SETIFFIB: 4387 case PRIV_NET_OVPN: 4388 case PRIV_NET_ME: 4389 case PRIV_NET_WG: 4390 4391 /* 4392 * 802.11-related privileges. 4393 */ 4394 case PRIV_NET80211_VAP_GETKEY: 4395 case PRIV_NET80211_VAP_MANAGE: 4396 4397 #ifdef notyet 4398 /* 4399 * ATM privileges. 4400 */ 4401 case PRIV_NETATM_CFG: 4402 case PRIV_NETATM_ADD: 4403 case PRIV_NETATM_DEL: 4404 case PRIV_NETATM_SET: 4405 4406 /* 4407 * Bluetooth privileges. 4408 */ 4409 case PRIV_NETBLUETOOTH_RAW: 4410 #endif 4411 4412 /* 4413 * Netgraph and netgraph module privileges. 4414 */ 4415 case PRIV_NETGRAPH_CONTROL: 4416 #ifdef notyet 4417 case PRIV_NETGRAPH_TTY: 4418 #endif 4419 4420 /* 4421 * IPv4 and IPv6 privileges. 4422 */ 4423 case PRIV_NETINET_IPFW: 4424 case PRIV_NETINET_DIVERT: 4425 case PRIV_NETINET_PF: 4426 case PRIV_NETINET_DUMMYNET: 4427 case PRIV_NETINET_CARP: 4428 case PRIV_NETINET_MROUTE: 4429 case PRIV_NETINET_RAW: 4430 case PRIV_NETINET_ADDRCTRL6: 4431 case PRIV_NETINET_ND6: 4432 case PRIV_NETINET_SCOPE6: 4433 case PRIV_NETINET_ALIFETIME6: 4434 case PRIV_NETINET_IPSEC: 4435 case PRIV_NETINET_BINDANY: 4436 4437 #ifdef notyet 4438 /* 4439 * NCP privileges. 4440 */ 4441 case PRIV_NETNCP: 4442 4443 /* 4444 * SMB privileges. 4445 */ 4446 case PRIV_NETSMB: 4447 #endif 4448 4449 /* 4450 * No default: or deny here. 4451 * In case of no permit fall through to next switch(). 4452 */ 4453 if (cred->cr_prison->pr_flags & PR_VNET) 4454 return (0); 4455 } 4456 #endif /* VIMAGE */ 4457 4458 switch (priv) { 4459 /* 4460 * Allow ktrace privileges for root in jail. 4461 */ 4462 case PRIV_KTRACE: 4463 4464 /* 4465 * Allow jailed processes to configure audit identity and 4466 * submit audit records (login, etc). In the future we may 4467 * want to further refine the relationship between audit and 4468 * jail. 4469 */ 4470 case PRIV_AUDIT_GETAUDIT: 4471 case PRIV_AUDIT_SETAUDIT: 4472 if (cred->cr_prison->pr_allow & PR_ALLOW_SETAUDIT) 4473 return (0); 4474 else 4475 return (EPERM); 4476 #if 0 4477 case PRIV_AUDIT_SUBMIT: 4478 #endif 4479 4480 /* 4481 * Allow jailed processes to manipulate process UNIX 4482 * credentials in any way they see fit. 4483 */ 4484 case PRIV_CRED_SETCRED: 4485 case PRIV_CRED_SETUID: 4486 case PRIV_CRED_SETEUID: 4487 case PRIV_CRED_SETGID: 4488 case PRIV_CRED_SETEGID: 4489 case PRIV_CRED_SETGROUPS: 4490 case PRIV_CRED_SETREUID: 4491 case PRIV_CRED_SETREGID: 4492 case PRIV_CRED_SETRESUID: 4493 case PRIV_CRED_SETRESGID: 4494 4495 /* 4496 * Jail implements visibility constraints already, so allow 4497 * jailed root to override uid/gid-based constraints. 4498 */ 4499 case PRIV_SEEOTHERGIDS: 4500 case PRIV_SEEOTHERUIDS: 4501 case PRIV_SEEJAILPROC: 4502 4503 /* 4504 * Jail implements inter-process debugging limits already, so 4505 * allow jailed root various debugging privileges. 4506 */ 4507 case PRIV_DEBUG_DIFFCRED: 4508 case PRIV_DEBUG_SUGID: 4509 case PRIV_DEBUG_UNPRIV: 4510 case PRIV_DEBUG_DIFFJAIL: 4511 4512 /* 4513 * Allow jail to set various resource limits and login 4514 * properties, and for now, exceed process resource limits. 4515 */ 4516 case PRIV_PROC_LIMIT: 4517 case PRIV_PROC_SETLOGIN: 4518 case PRIV_PROC_SETRLIMIT: 4519 4520 /* 4521 * Debuggers should work in jails. 4522 */ 4523 case PRIV_PROC_MEM_WRITE: 4524 4525 /* 4526 * System V and POSIX IPC privileges are granted in jail. 4527 */ 4528 case PRIV_IPC_READ: 4529 case PRIV_IPC_WRITE: 4530 case PRIV_IPC_ADMIN: 4531 case PRIV_IPC_MSGSIZE: 4532 case PRIV_MQ_ADMIN: 4533 4534 /* 4535 * Jail operations within a jail work on child jails. 4536 */ 4537 case PRIV_JAIL_ATTACH: 4538 case PRIV_JAIL_SET: 4539 case PRIV_JAIL_REMOVE: 4540 4541 /* 4542 * Jail implements its own inter-process limits, so allow 4543 * root processes in jail to change scheduling on other 4544 * processes in the same jail. Likewise for signalling. 4545 */ 4546 case PRIV_SCHED_DIFFCRED: 4547 case PRIV_SCHED_CPUSET: 4548 case PRIV_SCHED_DIFFJAIL: 4549 case PRIV_SIGNAL_DIFFCRED: 4550 case PRIV_SIGNAL_SUGID: 4551 case PRIV_SIGNAL_DIFFJAIL: 4552 4553 /* 4554 * Allow jailed processes to write to sysctls marked as jail 4555 * writable. 4556 */ 4557 case PRIV_SYSCTL_WRITEJAIL: 4558 4559 /* 4560 * Allow root in jail to manage a variety of quota 4561 * properties. These should likely be conditional on a 4562 * configuration option. 4563 */ 4564 case PRIV_VFS_GETQUOTA: 4565 case PRIV_VFS_SETQUOTA: 4566 4567 /* 4568 * Since Jail relies on chroot() to implement file system 4569 * protections, grant many VFS privileges to root in jail. 4570 * Be careful to exclude mount-related and NFS-related 4571 * privileges. 4572 */ 4573 case PRIV_VFS_READ: 4574 case PRIV_VFS_WRITE: 4575 case PRIV_VFS_ADMIN: 4576 case PRIV_VFS_EXEC: 4577 case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */ 4578 case PRIV_VFS_CHFLAGS_DEV: 4579 case PRIV_VFS_CHOWN: 4580 case PRIV_VFS_CHROOT: 4581 case PRIV_VFS_RETAINSUGID: 4582 case PRIV_VFS_FCHROOT: 4583 case PRIV_VFS_LINK: 4584 case PRIV_VFS_SETGID: 4585 case PRIV_VFS_STAT: 4586 case PRIV_VFS_STICKYFILE: 4587 4588 /* 4589 * As in the non-jail case, non-root users are expected to be 4590 * able to read kernel/physical memory (provided /dev/[k]mem 4591 * exists in the jail and they have permission to access it). 4592 */ 4593 case PRIV_KMEM_READ: 4594 return (0); 4595 4596 /* 4597 * Depending on the global setting, allow privilege of 4598 * setting system flags. 4599 */ 4600 case PRIV_VFS_SYSFLAGS: 4601 if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS) 4602 return (0); 4603 else 4604 return (EPERM); 4605 4606 /* 4607 * Depending on the global setting, allow privilege of 4608 * mounting/unmounting file systems. 4609 */ 4610 case PRIV_VFS_MOUNT: 4611 case PRIV_VFS_UNMOUNT: 4612 case PRIV_VFS_MOUNT_NONUSER: 4613 case PRIV_VFS_MOUNT_OWNER: 4614 pr = cred->cr_prison; 4615 prison_lock(pr); 4616 if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2) 4617 error = 0; 4618 else 4619 error = EPERM; 4620 prison_unlock(pr); 4621 return (error); 4622 4623 /* 4624 * Jails should hold no disposition on the PRIV_VFS_READ_DIR 4625 * policy. priv_check_cred will not specifically allow it, and 4626 * we may want a MAC policy to allow it. 4627 */ 4628 case PRIV_VFS_READ_DIR: 4629 return (0); 4630 4631 /* 4632 * Conditionally allow privileged process in the jail to 4633 * manipulate filesystem extended attributes in the system 4634 * namespace. 4635 */ 4636 case PRIV_VFS_EXTATTR_SYSTEM: 4637 if ((cred->cr_prison->pr_allow & PR_ALLOW_EXTATTR) != 0) 4638 return (0); 4639 else 4640 return (EPERM); 4641 4642 /* 4643 * Conditionnaly allow locking (unlocking) physical pages 4644 * in memory. 4645 */ 4646 case PRIV_VM_MLOCK: 4647 case PRIV_VM_MUNLOCK: 4648 if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK) 4649 return (0); 4650 else 4651 return (EPERM); 4652 4653 /* 4654 * Conditionally allow jailed root to bind reserved ports. 4655 */ 4656 case PRIV_NETINET_RESERVEDPORT: 4657 if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS) 4658 return (0); 4659 else 4660 return (EPERM); 4661 4662 /* 4663 * Allow jailed root to reuse in-use ports. 4664 */ 4665 case PRIV_NETINET_REUSEPORT: 4666 return (0); 4667 4668 /* 4669 * Allow jailed root to set certain IPv4/6 (option) headers. 4670 */ 4671 case PRIV_NETINET_SETHDROPTS: 4672 return (0); 4673 4674 /* 4675 * Conditionally allow creating raw sockets in jail. 4676 */ 4677 case PRIV_NETINET_RAW: 4678 if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS) 4679 return (0); 4680 else 4681 return (EPERM); 4682 4683 /* 4684 * Since jail implements its own visibility limits on netstat 4685 * sysctls, allow getcred. This allows identd to work in 4686 * jail. 4687 */ 4688 case PRIV_NETINET_GETCRED: 4689 return (0); 4690 4691 /* 4692 * Allow jailed root to set loginclass. 4693 */ 4694 case PRIV_PROC_SETLOGINCLASS: 4695 return (0); 4696 4697 /* 4698 * Do not allow a process inside a jail to read the kernel 4699 * message buffer unless explicitly permitted. 4700 */ 4701 case PRIV_MSGBUF: 4702 if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF) 4703 return (0); 4704 return (EPERM); 4705 4706 /* 4707 * Conditionally allow privileged process in the jail adjust 4708 * machine time. 4709 */ 4710 case PRIV_ADJTIME: 4711 case PRIV_NTP_ADJTIME: 4712 if (cred->cr_prison->pr_allow & 4713 (PR_ALLOW_ADJTIME | PR_ALLOW_SETTIME)) { 4714 return (0); 4715 } 4716 return (EPERM); 4717 4718 /* 4719 * Conditionally allow privileged process in the jail set 4720 * machine time. 4721 */ 4722 case PRIV_SETTIMEOFDAY: 4723 case PRIV_CLOCK_SETTIME: 4724 if (cred->cr_prison->pr_allow & PR_ALLOW_SETTIME) 4725 return (0); 4726 else 4727 return (EPERM); 4728 4729 /* 4730 * Conditionally allow privileged process in the jail to modify 4731 * the routing table. 4732 */ 4733 case PRIV_NET_ROUTE: 4734 if (cred->cr_prison->pr_allow & PR_ALLOW_ROUTING) 4735 return (0); 4736 else 4737 return (EPERM); 4738 4739 case PRIV_VMM_PPTDEV: 4740 /* 4741 * Allow jailed root to manage passthrough devices. vmm(4) also 4742 * checks for the dynamically added allow.vmm_ppt. 4743 */ 4744 return (0); 4745 4746 default: 4747 /* 4748 * In all remaining cases, deny the privilege request. This 4749 * includes almost all network privileges, many system 4750 * configuration privileges. 4751 */ 4752 return (EPERM); 4753 } 4754 } 4755 4756 /* 4757 * Return the part of pr2's name that is relative to pr1, or the whole name 4758 * if it does not directly follow. 4759 */ 4760 4761 char * 4762 prison_name(struct prison *pr1, struct prison *pr2) 4763 { 4764 char *name; 4765 4766 /* Jails see themselves as "0" (if they see themselves at all). */ 4767 if (pr1 == pr2) 4768 return "0"; 4769 name = pr2->pr_name; 4770 if (prison_ischild(pr1, pr2)) { 4771 /* 4772 * pr1 isn't locked (and allprison_lock may not be either) 4773 * so its length can't be counted on. But the number of dots 4774 * can be counted on - and counted. 4775 */ 4776 for (; pr1 != &prison0; pr1 = pr1->pr_parent) 4777 name = strchr(name, '.') + 1; 4778 } 4779 return (name); 4780 } 4781 4782 /* 4783 * Return the part of pr2's path that is relative to pr1, or the whole path 4784 * if it does not directly follow. 4785 */ 4786 static char * 4787 prison_path(struct prison *pr1, struct prison *pr2) 4788 { 4789 char *path1, *path2; 4790 int len1; 4791 4792 path1 = pr1->pr_path; 4793 path2 = pr2->pr_path; 4794 if (!strcmp(path1, "/")) 4795 return (path2); 4796 len1 = strlen(path1); 4797 if (strncmp(path1, path2, len1)) 4798 return (path2); 4799 if (path2[len1] == '\0') 4800 return "/"; 4801 if (path2[len1] == '/') 4802 return (path2 + len1); 4803 return (path2); 4804 } 4805 4806 /* 4807 * Jail-related sysctls. 4808 */ 4809 SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 4810 "Jails"); 4811 4812 #if defined(INET) || defined(INET6) 4813 /* 4814 * Copy address array to memory that would be then SYSCTL_OUT-ed. 4815 * sysctl_jail_list() helper. 4816 */ 4817 static void 4818 prison_ip_copyout(struct prison *pr, const pr_family_t af, void **out, int *len) 4819 { 4820 const struct prison_ip *pip; 4821 const size_t size = pr_families[af].size; 4822 4823 again: 4824 mtx_assert(&pr->pr_mtx, MA_OWNED); 4825 if ((pip = pr->pr_addrs[af]) != NULL) { 4826 if (*len < pip->ips) { 4827 *len = pip->ips; 4828 mtx_unlock(&pr->pr_mtx); 4829 *out = realloc(*out, *len * size, M_TEMP, M_WAITOK); 4830 mtx_lock(&pr->pr_mtx); 4831 goto again; 4832 } 4833 bcopy(pip->pr_ip, *out, pip->ips * size); 4834 } 4835 } 4836 #endif 4837 4838 static int 4839 sysctl_jail_list(SYSCTL_HANDLER_ARGS) 4840 { 4841 struct xprison *xp; 4842 struct prison *pr, *cpr; 4843 #ifdef INET 4844 struct in_addr *ip4 = NULL; 4845 int ip4s = 0; 4846 #endif 4847 #ifdef INET6 4848 struct in6_addr *ip6 = NULL; 4849 int ip6s = 0; 4850 #endif 4851 int descend, error; 4852 4853 xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK); 4854 pr = req->td->td_ucred->cr_prison; 4855 error = 0; 4856 sx_slock(&allprison_lock); 4857 FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { 4858 mtx_lock(&cpr->pr_mtx); 4859 #ifdef INET 4860 prison_ip_copyout(cpr, PR_INET, (void **)&ip4, &ip4s); 4861 #endif 4862 #ifdef INET6 4863 prison_ip_copyout(cpr, PR_INET6, (void **)&ip6, &ip6s); 4864 #endif 4865 bzero(xp, sizeof(*xp)); 4866 xp->pr_version = XPRISON_VERSION; 4867 xp->pr_id = cpr->pr_id; 4868 xp->pr_state = cpr->pr_state; 4869 strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path)); 4870 strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host)); 4871 strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name)); 4872 #ifdef INET 4873 xp->pr_ip4s = ip4s; 4874 #endif 4875 #ifdef INET6 4876 xp->pr_ip6s = ip6s; 4877 #endif 4878 mtx_unlock(&cpr->pr_mtx); 4879 error = SYSCTL_OUT(req, xp, sizeof(*xp)); 4880 if (error) 4881 break; 4882 #ifdef INET 4883 if (xp->pr_ip4s > 0) { 4884 error = SYSCTL_OUT(req, ip4, 4885 xp->pr_ip4s * sizeof(struct in_addr)); 4886 if (error) 4887 break; 4888 } 4889 #endif 4890 #ifdef INET6 4891 if (xp->pr_ip6s > 0) { 4892 error = SYSCTL_OUT(req, ip6, 4893 xp->pr_ip6s * sizeof(struct in6_addr)); 4894 if (error) 4895 break; 4896 } 4897 #endif 4898 } 4899 sx_sunlock(&allprison_lock); 4900 free(xp, M_TEMP); 4901 #ifdef INET 4902 free(ip4, M_TEMP); 4903 #endif 4904 #ifdef INET6 4905 free(ip6, M_TEMP); 4906 #endif 4907 return (error); 4908 } 4909 4910 SYSCTL_OID(_security_jail, OID_AUTO, list, 4911 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 4912 sysctl_jail_list, "S", "List of active jails"); 4913 4914 static int 4915 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) 4916 { 4917 int error, injail; 4918 4919 injail = jailed(req->td->td_ucred); 4920 error = SYSCTL_OUT(req, &injail, sizeof(injail)); 4921 4922 return (error); 4923 } 4924 4925 SYSCTL_PROC(_security_jail, OID_AUTO, jailed, 4926 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 4927 sysctl_jail_jailed, "I", "Process in jail?"); 4928 4929 static int 4930 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS) 4931 { 4932 int error, havevnet; 4933 #ifdef VIMAGE 4934 struct ucred *cred = req->td->td_ucred; 4935 4936 havevnet = jailed(cred) && prison_owns_vnet(cred->cr_prison); 4937 #else 4938 havevnet = 0; 4939 #endif 4940 error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet)); 4941 4942 return (error); 4943 } 4944 4945 SYSCTL_PROC(_security_jail, OID_AUTO, vnet, 4946 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 4947 sysctl_jail_vnet, "I", "Jail owns vnet?"); 4948 4949 #if defined(INET) || defined(INET6) 4950 SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, 4951 &jail_max_af_ips, 0, 4952 "Number of IP addresses a jail may have at most per address family (deprecated)"); 4953 #endif 4954 4955 /* 4956 * Default parameters for jail(2) compatibility. For historical reasons, 4957 * the sysctl names have varying similarity to the parameter names. Prisons 4958 * just see their own parameters, and can't change them. 4959 */ 4960 static int 4961 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS) 4962 { 4963 int error, i; 4964 4965 /* Get the current flag value, and convert it to a boolean. */ 4966 if (req->td->td_ucred->cr_prison == &prison0) { 4967 mtx_lock(&prison0.pr_mtx); 4968 i = (jail_default_allow & arg2) != 0; 4969 mtx_unlock(&prison0.pr_mtx); 4970 } else 4971 i = prison_allow(req->td->td_ucred, arg2); 4972 4973 if (arg1 != NULL) 4974 i = !i; 4975 error = sysctl_handle_int(oidp, &i, 0, req); 4976 if (error || !req->newptr) 4977 return (error); 4978 i = i ? arg2 : 0; 4979 if (arg1 != NULL) 4980 i ^= arg2; 4981 /* 4982 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0 4983 * for writing. 4984 */ 4985 mtx_lock(&prison0.pr_mtx); 4986 jail_default_allow = (jail_default_allow & ~arg2) | i; 4987 mtx_unlock(&prison0.pr_mtx); 4988 return (0); 4989 } 4990 4991 SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed, 4992 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 4993 NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I", 4994 "Processes in jail can set their hostnames (deprecated)"); 4995 SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only, 4996 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 4997 (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I", 4998 "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)"); 4999 SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed, 5000 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 5001 NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I", 5002 "Processes in jail can use System V IPC primitives (deprecated)"); 5003 SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets, 5004 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 5005 NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I", 5006 "Prison root can create raw sockets (deprecated)"); 5007 SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed, 5008 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 5009 NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I", 5010 "Processes in jail can alter system file flags (deprecated)"); 5011 SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed, 5012 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 5013 NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I", 5014 "Processes in jail can mount/unmount jail-friendly file systems (deprecated)"); 5015 SYSCTL_PROC(_security_jail, OID_AUTO, mlock_allowed, 5016 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 5017 NULL, PR_ALLOW_MLOCK, sysctl_jail_default_allow, "I", 5018 "Processes in jail can lock/unlock physical pages in memory"); 5019 5020 static int 5021 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS) 5022 { 5023 struct prison *pr; 5024 int level, error; 5025 5026 pr = req->td->td_ucred->cr_prison; 5027 level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2); 5028 error = sysctl_handle_int(oidp, &level, 0, req); 5029 if (error || !req->newptr) 5030 return (error); 5031 *(int *)arg1 = level; 5032 return (0); 5033 } 5034 5035 SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs, 5036 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 5037 &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs), 5038 sysctl_jail_default_level, "I", 5039 "Processes in jail cannot see all mounted file systems (deprecated)"); 5040 5041 SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset, 5042 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 5043 &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum), 5044 sysctl_jail_default_level, "I", 5045 "Ruleset for the devfs filesystem in jail (deprecated)"); 5046 5047 SYSCTL_NODE(_security_jail, OID_AUTO, children, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 5048 "Limits and stats of child jails"); 5049 5050 static int 5051 sysctl_jail_children(SYSCTL_HANDLER_ARGS) 5052 { 5053 struct prison *pr; 5054 int i; 5055 5056 pr = req->td->td_ucred->cr_prison; 5057 5058 switch (oidp->oid_kind & CTLTYPE) { 5059 case CTLTYPE_INT: 5060 i = *(int *)((char *)pr + arg2); 5061 return (SYSCTL_OUT(req, &i, sizeof(i))); 5062 } 5063 5064 return (0); 5065 } 5066 5067 SYSCTL_PROC(_security_jail_children, OID_AUTO, max, 5068 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 5069 NULL, offsetof(struct prison, pr_childmax), sysctl_jail_children, 5070 "I", "Maximum number of child jails"); 5071 SYSCTL_PROC(_security_jail_children, OID_AUTO, cur, 5072 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 5073 NULL, offsetof(struct prison, pr_childcount), sysctl_jail_children, 5074 "I", "Current number of child jails"); 5075 5076 /* 5077 * Nodes to describe jail parameters. Maximum length of string parameters 5078 * is returned in the string itself, and the other parameters exist merely 5079 * to make themselves and their types known. 5080 */ 5081 SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 5082 "Jail parameters"); 5083 5084 int 5085 sysctl_jail_param(SYSCTL_HANDLER_ARGS) 5086 { 5087 int i; 5088 long l; 5089 size_t s; 5090 char numbuf[12]; 5091 5092 switch (oidp->oid_kind & CTLTYPE) 5093 { 5094 case CTLTYPE_LONG: 5095 case CTLTYPE_ULONG: 5096 l = 0; 5097 #ifdef SCTL_MASK32 5098 if (!(req->flags & SCTL_MASK32)) 5099 #endif 5100 return (SYSCTL_OUT(req, &l, sizeof(l))); 5101 case CTLTYPE_INT: 5102 case CTLTYPE_UINT: 5103 i = 0; 5104 return (SYSCTL_OUT(req, &i, sizeof(i))); 5105 case CTLTYPE_STRING: 5106 snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2); 5107 return 5108 (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); 5109 case CTLTYPE_STRUCT: 5110 s = (size_t)arg2; 5111 return (SYSCTL_OUT(req, &s, sizeof(s))); 5112 } 5113 return (0); 5114 } 5115 5116 /* 5117 * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at 5118 * jail creation time but cannot be changed in an existing jail. 5119 */ 5120 SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); 5121 SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); 5122 SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); 5123 SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); 5124 SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, 5125 "I", "Jail secure level"); 5126 SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", 5127 "Jail value for kern.osreldate and uname -K"); 5128 SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, 5129 "Jail value for kern.osrelease and uname -r"); 5130 SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW, 5131 "I", "Jail cannot see all mounted file systems"); 5132 SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW, 5133 "I", "Ruleset for in-jail devfs mounts"); 5134 SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, 5135 "B", "Jail persistence"); 5136 #ifdef VIMAGE 5137 SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, 5138 "E,jailsys", "Virtual network stack"); 5139 #endif 5140 SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, 5141 "B", "Jail is in the process of shutting down"); 5142 5143 SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails"); 5144 SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD, 5145 "I", "Current number of child jails"); 5146 SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW, 5147 "I", "Maximum number of child jails"); 5148 5149 SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info"); 5150 SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, 5151 "Jail hostname"); 5152 SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN, 5153 "Jail NIS domainname"); 5154 SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN, 5155 "Jail host UUID"); 5156 SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW, 5157 "LU", "Jail host ID"); 5158 5159 SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); 5160 SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); 5161 5162 #ifdef MAC 5163 SYSCTL_JAIL_PARAM_STRUCT(_mac, label, CTLFLAG_RW, sizeof(struct mac), 5164 "S,mac", "Jail MAC label"); 5165 #endif 5166 5167 #ifdef INET 5168 SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN, 5169 "Jail IPv4 address virtualization"); 5170 SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), 5171 "S,in_addr,a", "Jail IPv4 addresses"); 5172 SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW, 5173 "B", "Do (not) use IPv4 source address selection rather than the " 5174 "primary jail IPv4 address."); 5175 #endif 5176 #ifdef INET6 5177 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN, 5178 "Jail IPv6 address virtualization"); 5179 SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), 5180 "S,in6_addr,a", "Jail IPv6 addresses"); 5181 SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW, 5182 "B", "Do (not) use IPv6 source address selection rather than the " 5183 "primary jail IPv6 address."); 5184 #endif 5185 5186 SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags"); 5187 SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW, 5188 "B", "Jail may set hostname"); 5189 SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW, 5190 "B", "Jail may use SYSV IPC"); 5191 SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW, 5192 "B", "Jail may create raw sockets"); 5193 SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW, 5194 "B", "Jail may alter system file flags"); 5195 SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW, 5196 "B", "Jail may set file quotas"); 5197 SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, 5198 "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); 5199 SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW, 5200 "B", "Jail may lock (unlock) physical pages in memory"); 5201 SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW, 5202 "B", "Jail may bind sockets to reserved ports"); 5203 SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW, 5204 "B", "Jail may read the kernel message buffer"); 5205 SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW, 5206 "B", "Unprivileged processes may use process debugging facilities"); 5207 SYSCTL_JAIL_PARAM(_allow, unprivileged_parent_tampering, 5208 CTLTYPE_INT | CTLFLAG_RW, "B", 5209 "Unprivileged parent jail processes may tamper with same-uid processes" 5210 " (signal/debug/cpuset)"); 5211 SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW, 5212 "B", "Processes in jail with uid 0 have privilege"); 5213 #ifdef VIMAGE 5214 SYSCTL_JAIL_PARAM(_allow, nfsd, CTLTYPE_INT | CTLFLAG_RW, 5215 "B", "Mountd/nfsd may run in the jail"); 5216 #endif 5217 SYSCTL_JAIL_PARAM(_allow, extattr, CTLTYPE_INT | CTLFLAG_RW, 5218 "B", "Jail may set system-level filesystem extended attributes"); 5219 SYSCTL_JAIL_PARAM(_allow, adjtime, CTLTYPE_INT | CTLFLAG_RW, 5220 "B", "Jail may adjust system time"); 5221 SYSCTL_JAIL_PARAM(_allow, settime, CTLTYPE_INT | CTLFLAG_RW, 5222 "B", "Jail may set system time"); 5223 SYSCTL_JAIL_PARAM(_allow, routing, CTLTYPE_INT | CTLFLAG_RW, 5224 "B", "Jail may modify routing table"); 5225 #ifdef AUDIT 5226 SYSCTL_JAIL_PARAM(_allow, setaudit, CTLTYPE_INT | CTLFLAG_RW, 5227 "B", "Jail may set and get audit session state"); 5228 #endif 5229 5230 SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags"); 5231 SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW, 5232 "B", "Jail may mount/unmount jail-friendly file systems in general"); 5233 5234 /* 5235 * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>. Return 5236 * its associated bit in the pr_allow bitmask, or zero if the parameter was 5237 * not created. 5238 */ 5239 unsigned 5240 prison_add_allow(const char *prefix, const char *name, const char *prefix_descr, 5241 const char *descr) 5242 { 5243 struct bool_flags *bf; 5244 struct sysctl_oid *parent; 5245 char *allow_name, *allow_noname, *allowed; 5246 #ifndef NO_SYSCTL_DESCR 5247 char *descr_deprecated; 5248 #endif 5249 u_int allow_flag; 5250 5251 if (prefix 5252 ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name) 5253 < 0 || 5254 asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name) 5255 < 0 5256 : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 || 5257 asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) { 5258 free(allow_name, M_PRISON); 5259 return 0; 5260 } 5261 5262 /* 5263 * See if this parameter has already beed added, i.e. a module was 5264 * previously loaded/unloaded. 5265 */ 5266 mtx_lock(&prison0.pr_mtx); 5267 for (bf = pr_flag_allow; 5268 bf < pr_flag_allow + nitems(pr_flag_allow) && 5269 atomic_load_int(&bf->flag) != 0; 5270 bf++) { 5271 if (strcmp(bf->name, allow_name) == 0) { 5272 allow_flag = bf->flag; 5273 goto no_add; 5274 } 5275 } 5276 5277 /* 5278 * Find a free bit in pr_allow_all, failing if there are none 5279 * (which shouldn't happen as long as we keep track of how many 5280 * potential dynamic flags exist). 5281 */ 5282 for (allow_flag = 1;; allow_flag <<= 1) { 5283 if (allow_flag == 0) 5284 goto no_add; 5285 if ((pr_allow_all & allow_flag) == 0) 5286 break; 5287 } 5288 5289 /* Note the parameter in the next open slot in pr_flag_allow. */ 5290 for (bf = pr_flag_allow; ; bf++) { 5291 if (bf == pr_flag_allow + nitems(pr_flag_allow)) { 5292 /* This should never happen, but is not fatal. */ 5293 allow_flag = 0; 5294 goto no_add; 5295 } 5296 if (atomic_load_int(&bf->flag) == 0) 5297 break; 5298 } 5299 bf->name = allow_name; 5300 bf->noname = allow_noname; 5301 pr_allow_all |= allow_flag; 5302 /* 5303 * prison0 always has permission for the new parameter. 5304 * Other jails must have it granted to them. 5305 */ 5306 prison0.pr_allow |= allow_flag; 5307 /* The flag indicates a valid entry, so make sure it is set last. */ 5308 atomic_store_rel_int(&bf->flag, allow_flag); 5309 mtx_unlock(&prison0.pr_mtx); 5310 5311 /* 5312 * Create sysctls for the parameter, and the back-compat global 5313 * permission. 5314 */ 5315 parent = prefix 5316 ? SYSCTL_ADD_NODE(NULL, 5317 SYSCTL_CHILDREN(&sysctl___security_jail_param_allow), 5318 OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr) 5319 : &sysctl___security_jail_param_allow; 5320 (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO, 5321 name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 5322 NULL, 0, sysctl_jail_param, "B", descr); 5323 if ((prefix 5324 ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name) 5325 : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) { 5326 #ifndef NO_SYSCTL_DESCR 5327 (void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)", 5328 descr); 5329 #endif 5330 (void)SYSCTL_ADD_PROC(NULL, 5331 SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed, 5332 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag, 5333 sysctl_jail_default_allow, "I", descr_deprecated); 5334 #ifndef NO_SYSCTL_DESCR 5335 free(descr_deprecated, M_TEMP); 5336 #endif 5337 free(allowed, M_TEMP); 5338 } 5339 return allow_flag; 5340 5341 no_add: 5342 mtx_unlock(&prison0.pr_mtx); 5343 free(allow_name, M_PRISON); 5344 free(allow_noname, M_PRISON); 5345 return allow_flag; 5346 } 5347 5348 /* 5349 * The VFS system will register jail-aware filesystems here. They each get 5350 * a parameter allow.mount.xxxfs and a flag to check when a jailed user 5351 * attempts to mount. 5352 */ 5353 void 5354 prison_add_vfs(struct vfsconf *vfsp) 5355 { 5356 #ifdef NO_SYSCTL_DESCR 5357 5358 vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, 5359 NULL, NULL); 5360 #else 5361 char *descr; 5362 5363 (void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system", 5364 vfsp->vfc_name); 5365 vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, 5366 NULL, descr); 5367 free(descr, M_TEMP); 5368 #endif 5369 } 5370 5371 #ifdef RACCT 5372 void 5373 prison_racct_foreach(void (*callback)(struct racct *racct, 5374 void *arg2, void *arg3), void (*pre)(void), void (*post)(void), 5375 void *arg2, void *arg3) 5376 { 5377 struct prison_racct *prr; 5378 5379 ASSERT_RACCT_ENABLED(); 5380 5381 sx_slock(&allprison_lock); 5382 if (pre != NULL) 5383 (pre)(); 5384 LIST_FOREACH(prr, &allprison_racct, prr_next) 5385 (callback)(prr->prr_racct, arg2, arg3); 5386 if (post != NULL) 5387 (post)(); 5388 sx_sunlock(&allprison_lock); 5389 } 5390 5391 static struct prison_racct * 5392 prison_racct_find_locked(const char *name) 5393 { 5394 struct prison_racct *prr; 5395 5396 ASSERT_RACCT_ENABLED(); 5397 sx_assert(&allprison_lock, SA_XLOCKED); 5398 5399 if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN) 5400 return (NULL); 5401 5402 LIST_FOREACH(prr, &allprison_racct, prr_next) { 5403 if (strcmp(name, prr->prr_name) != 0) 5404 continue; 5405 5406 /* Found prison_racct with a matching name? */ 5407 prison_racct_hold(prr); 5408 return (prr); 5409 } 5410 5411 /* Add new prison_racct. */ 5412 prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK); 5413 racct_create(&prr->prr_racct); 5414 5415 strcpy(prr->prr_name, name); 5416 refcount_init(&prr->prr_refcount, 1); 5417 LIST_INSERT_HEAD(&allprison_racct, prr, prr_next); 5418 5419 return (prr); 5420 } 5421 5422 struct prison_racct * 5423 prison_racct_find(const char *name) 5424 { 5425 struct prison_racct *prr; 5426 5427 ASSERT_RACCT_ENABLED(); 5428 5429 sx_xlock(&allprison_lock); 5430 prr = prison_racct_find_locked(name); 5431 sx_xunlock(&allprison_lock); 5432 return (prr); 5433 } 5434 5435 void 5436 prison_racct_hold(struct prison_racct *prr) 5437 { 5438 5439 ASSERT_RACCT_ENABLED(); 5440 5441 refcount_acquire(&prr->prr_refcount); 5442 } 5443 5444 static void 5445 prison_racct_free_locked(struct prison_racct *prr) 5446 { 5447 5448 ASSERT_RACCT_ENABLED(); 5449 sx_assert(&allprison_lock, SA_XLOCKED); 5450 5451 if (refcount_release(&prr->prr_refcount)) { 5452 racct_destroy(&prr->prr_racct); 5453 LIST_REMOVE(prr, prr_next); 5454 free(prr, M_PRISON_RACCT); 5455 } 5456 } 5457 5458 void 5459 prison_racct_free(struct prison_racct *prr) 5460 { 5461 5462 ASSERT_RACCT_ENABLED(); 5463 sx_assert(&allprison_lock, SA_UNLOCKED); 5464 5465 if (refcount_release_if_not_last(&prr->prr_refcount)) 5466 return; 5467 5468 sx_xlock(&allprison_lock); 5469 prison_racct_free_locked(prr); 5470 sx_xunlock(&allprison_lock); 5471 } 5472 5473 static void 5474 prison_racct_attach(struct prison *pr) 5475 { 5476 struct prison_racct *prr; 5477 5478 ASSERT_RACCT_ENABLED(); 5479 sx_assert(&allprison_lock, SA_XLOCKED); 5480 5481 prr = prison_racct_find_locked(pr->pr_name); 5482 KASSERT(prr != NULL, ("cannot find prison_racct")); 5483 5484 pr->pr_prison_racct = prr; 5485 } 5486 5487 /* 5488 * Handle jail renaming. From the racct point of view, renaming means 5489 * moving from one prison_racct to another. 5490 */ 5491 static void 5492 prison_racct_modify(struct prison *pr) 5493 { 5494 #ifdef RCTL 5495 struct proc *p; 5496 struct ucred *cred; 5497 #endif 5498 struct prison_racct *oldprr; 5499 5500 ASSERT_RACCT_ENABLED(); 5501 5502 sx_slock(&allproc_lock); 5503 sx_xlock(&allprison_lock); 5504 5505 if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { 5506 sx_xunlock(&allprison_lock); 5507 sx_sunlock(&allproc_lock); 5508 return; 5509 } 5510 5511 oldprr = pr->pr_prison_racct; 5512 pr->pr_prison_racct = NULL; 5513 5514 prison_racct_attach(pr); 5515 5516 /* 5517 * Move resource utilisation records. 5518 */ 5519 racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct); 5520 5521 #ifdef RCTL 5522 /* 5523 * Force rctl to reattach rules to processes. 5524 */ 5525 FOREACH_PROC_IN_SYSTEM(p) { 5526 PROC_LOCK(p); 5527 cred = crhold(p->p_ucred); 5528 PROC_UNLOCK(p); 5529 rctl_proc_ucred_changed(p, cred); 5530 crfree(cred); 5531 } 5532 #endif 5533 5534 sx_sunlock(&allproc_lock); 5535 prison_racct_free_locked(oldprr); 5536 sx_xunlock(&allprison_lock); 5537 } 5538 5539 static void 5540 prison_racct_detach(struct prison *pr) 5541 { 5542 5543 ASSERT_RACCT_ENABLED(); 5544 sx_assert(&allprison_lock, SA_UNLOCKED); 5545 5546 if (pr->pr_prison_racct == NULL) 5547 return; 5548 prison_racct_free(pr->pr_prison_racct); 5549 pr->pr_prison_racct = NULL; 5550 } 5551 #endif /* RACCT */ 5552 5553 /* 5554 * Submit a knote for a prison, locking if necessary. 5555 */ 5556 static void 5557 prison_knote(struct prison *pr, long hint) 5558 { 5559 int locked; 5560 5561 locked = mtx_owned(&pr->pr_mtx); 5562 if (!locked) 5563 mtx_lock(&pr->pr_mtx); 5564 KNOTE_LOCKED(pr->pr_klist, hint); 5565 jaildesc_knote(pr, hint); 5566 if (!locked) 5567 mtx_unlock(&pr->pr_mtx); 5568 } 5569 5570 #ifdef DDB 5571 5572 static void 5573 db_show_prison(struct prison *pr) 5574 { 5575 struct bool_flags *bf; 5576 struct jailsys_flags *jsf; 5577 #if defined(INET) || defined(INET6) 5578 int ii; 5579 struct prison_ip *pip; 5580 #endif 5581 unsigned f; 5582 #ifdef INET 5583 char ip4buf[INET_ADDRSTRLEN]; 5584 #endif 5585 #ifdef INET6 5586 char ip6buf[INET6_ADDRSTRLEN]; 5587 #endif 5588 5589 db_printf("prison %p:\n", pr); 5590 db_printf(" jid = %d\n", pr->pr_id); 5591 db_printf(" name = %s\n", pr->pr_name); 5592 db_printf(" parent = %p\n", pr->pr_parent); 5593 db_printf(" ref = %d\n", pr->pr_ref); 5594 db_printf(" uref = %d\n", pr->pr_uref); 5595 db_printf(" state = %s\n", 5596 pr->pr_state == PRISON_STATE_ALIVE ? "alive" : 5597 pr->pr_state == PRISON_STATE_DYING ? "dying" : 5598 "invalid"); 5599 db_printf(" path = %s\n", pr->pr_path); 5600 db_printf(" cpuset = %d\n", pr->pr_cpuset 5601 ? pr->pr_cpuset->cs_id : -1); 5602 #ifdef VIMAGE 5603 db_printf(" vnet = %p\n", pr->pr_vnet); 5604 #endif 5605 db_printf(" root = %p\n", pr->pr_root); 5606 db_printf(" securelevel = %d\n", pr->pr_securelevel); 5607 db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum); 5608 db_printf(" children.max = %d\n", pr->pr_childmax); 5609 db_printf(" children.cur = %d\n", pr->pr_childcount); 5610 db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children)); 5611 db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling)); 5612 db_printf(" flags = 0x%x", pr->pr_flags); 5613 for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) 5614 if (pr->pr_flags & bf->flag) 5615 db_printf(" %s", bf->name); 5616 for (jsf = pr_flag_jailsys; 5617 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); 5618 jsf++) { 5619 f = pr->pr_flags & (jsf->disable | jsf->new); 5620 db_printf(" %-16s= %s\n", jsf->name, 5621 (f != 0 && f == jsf->disable) ? "disable" 5622 : (f == jsf->new) ? "new" 5623 : "inherit"); 5624 } 5625 db_printf(" allow = 0x%x", pr->pr_allow); 5626 for (bf = pr_flag_allow; 5627 bf < pr_flag_allow + nitems(pr_flag_allow) && 5628 atomic_load_int(&bf->flag) != 0; 5629 bf++) 5630 if (pr->pr_allow & bf->flag) 5631 db_printf(" %s", bf->name); 5632 db_printf("\n"); 5633 db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs); 5634 db_printf(" host.hostname = %s\n", pr->pr_hostname); 5635 db_printf(" host.domainname = %s\n", pr->pr_domainname); 5636 db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid); 5637 db_printf(" host.hostid = %lu\n", pr->pr_hostid); 5638 #ifdef INET 5639 if ((pip = pr->pr_addrs[PR_INET]) != NULL) { 5640 db_printf(" ip4s = %d\n", pip->ips); 5641 for (ii = 0; ii < pip->ips; ii++) 5642 db_printf(" %s %s\n", 5643 ii == 0 ? "ip4.addr =" : " ", 5644 inet_ntoa_r( 5645 *(const struct in_addr *)PR_IP(pip, PR_INET, ii), 5646 ip4buf)); 5647 } 5648 #endif 5649 #ifdef INET6 5650 if ((pip = pr->pr_addrs[PR_INET6]) != NULL) { 5651 db_printf(" ip6s = %d\n", pip->ips); 5652 for (ii = 0; ii < pip->ips; ii++) 5653 db_printf(" %s %s\n", 5654 ii == 0 ? "ip6.addr =" : " ", 5655 ip6_sprintf(ip6buf, 5656 (const struct in6_addr *)PR_IP(pip, PR_INET6, ii))); 5657 } 5658 #endif 5659 } 5660 5661 DB_SHOW_COMMAND(prison, db_show_prison_command) 5662 { 5663 struct prison *pr; 5664 5665 if (!have_addr) { 5666 /* 5667 * Show all prisons in the list, and prison0 which is not 5668 * listed. 5669 */ 5670 db_show_prison(&prison0); 5671 if (!db_pager_quit) { 5672 TAILQ_FOREACH(pr, &allprison, pr_list) { 5673 db_show_prison(pr); 5674 if (db_pager_quit) 5675 break; 5676 } 5677 } 5678 return; 5679 } 5680 5681 if (addr == 0) 5682 pr = &prison0; 5683 else { 5684 /* Look for a prison with the ID and with references. */ 5685 TAILQ_FOREACH(pr, &allprison, pr_list) 5686 if (pr->pr_id == addr && pr->pr_ref > 0) 5687 break; 5688 if (pr == NULL) 5689 /* Look again, without requiring a reference. */ 5690 TAILQ_FOREACH(pr, &allprison, pr_list) 5691 if (pr->pr_id == addr) 5692 break; 5693 if (pr == NULL) 5694 /* Assume address points to a valid prison. */ 5695 pr = (struct prison *)addr; 5696 } 5697 db_show_prison(pr); 5698 } 5699 5700 #endif /* DDB */ 5701