1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 1999 Poul-Henning Kamp. 5 * Copyright (c) 2008 Bjoern A. Zeeb. 6 * Copyright (c) 2009 James Gritton. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 
29 */ 30 31 #include <sys/cdefs.h> 32 #include "opt_ddb.h" 33 #include "opt_inet.h" 34 #include "opt_inet6.h" 35 #include "opt_nfs.h" 36 37 #include <sys/param.h> 38 #include <sys/types.h> 39 #include <sys/kernel.h> 40 #include <sys/systm.h> 41 #include <sys/errno.h> 42 #include <sys/file.h> 43 #include <sys/sysproto.h> 44 #include <sys/malloc.h> 45 #include <sys/osd.h> 46 #include <sys/priv.h> 47 #include <sys/proc.h> 48 #include <sys/epoch.h> 49 #include <sys/event.h> 50 #include <sys/taskqueue.h> 51 #include <sys/fcntl.h> 52 #include <sys/jail.h> 53 #include <sys/jaildesc.h> 54 #include <sys/linker.h> 55 #include <sys/lock.h> 56 #include <sys/mman.h> 57 #include <sys/mutex.h> 58 #include <sys/racct.h> 59 #include <sys/rctl.h> 60 #include <sys/refcount.h> 61 #include <sys/sx.h> 62 #include <sys/sysent.h> 63 #include <sys/namei.h> 64 #include <sys/mount.h> 65 #include <sys/queue.h> 66 #include <sys/socket.h> 67 #include <sys/syscallsubr.h> 68 #include <sys/sysctl.h> 69 #include <sys/uuid.h> 70 #include <sys/vnode.h> 71 72 #include <net/if.h> 73 #include <net/vnet.h> 74 75 #include <netinet/in.h> 76 77 #ifdef DDB 78 #include <ddb/ddb.h> 79 #endif /* DDB */ 80 81 #include <security/mac/mac_framework.h> 82 83 #define PRISON0_HOSTUUID_MODULE "hostuuid" 84 85 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); 86 static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures"); 87 88 /* Keep struct prison prison0 and some code in kern_jail_set() readable. */ 89 #ifdef INET 90 #ifdef INET6 91 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL 92 #else 93 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL 94 #endif 95 #else /* !INET */ 96 #ifdef INET6 97 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL 98 #else 99 #define _PR_IP_SADDRSEL 0 100 #endif 101 #endif 102 103 /* prison0 describes what is "real" about the system. 
/*
 * prison0 describes what is "real" about the system.
 *
 * Only the members that can be expressed as compile-time constants are set
 * here; prison0_init() fills in the cpuset, OS release data and any
 * preloaded host UUID at boot.
 */
struct prison prison0 = {
	.pr_id		= 0,
	.pr_name	= "0",
	.pr_ref		= 1,
	.pr_uref	= 1,
	.pr_path	= "/",
	.pr_securelevel	= -1,
	.pr_devfs_rsnum = 0,
	.pr_state	= PRISON_STATE_ALIVE,
	.pr_childmax	= JAIL_MAX,
	.pr_hostuuid	= DEFAULT_HOSTUUID,
	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
#ifdef VIMAGE
	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
#else
	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
#endif
	.pr_allow	= PR_ALLOW_PRISON0,
};
/* Every permission granted to prison0 must be one of the static bits. */
_Static_assert((PR_ALLOW_PRISON0 & ~PR_ALLOW_ALL_STATIC) == 0,
    "Bits enabled in PR_ALLOW_PRISON0 that are not statically reserved");

MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);

/*
 * A boolean jail parameter: "name" is the option that selects the flag and
 * "noname" is its negated spelling.  "flag" is the bit the pair controls;
 * the pr_flag_allow walkers read it with atomic_load_int() and treat a zero
 * value as the end of the populated entries.
 */
struct bool_flags {
	const char	*name;
	const char	*noname;
	volatile u_int	 flag;
};
/*
 * A tri-state "jailsys" parameter.  kern_jail_set() ORs "disable" into the
 * jail flags for JAIL_SYS_DISABLE and "new" for JAIL_SYS_NEW; a zero
 * "disable" means the parameter cannot be disabled (EINVAL).
 */
struct jailsys_flags {
	const char	*name;
	unsigned	 disable;
	unsigned	 new;
};

/*
 * Handle jail teardown in a dedicated thread to avoid deadlocks from
 * vnet_destroy().
 */
TASKQUEUE_DEFINE_THREAD(jail_remove);
/*
 * allprison, allprison_racct and lastprid are protected by allprison_lock.
 * NOTE(review): lastdeadid is declared next to lastprid and is presumably
 * covered by the same lock -- confirm against its users.
 */
struct sx allprison_lock;
SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
LIST_HEAD(, prison_racct) allprison_racct;
int	lastprid = 0;
int	lastdeadid = 0;

/* Forward declarations of file-local helpers defined later in this file. */
static int get_next_prid(struct prison **insprp);
static int get_next_deadid(struct prison **insprp);
static int do_jail_attach(struct thread *td, struct prison *pr, int drflags);
static void prison_complete(void *context, int pending);
static void prison_deref(struct prison *pr, int flags);
static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison);
static int prison_lock_xlock(struct prison *pr, int flags);
static void prison_cleanup_locked(struct prison *pr);
static void prison_cleanup_unlocked(struct prison *pr);
static void prison_free_not_last(struct prison *pr);
static void prison_proc_free_not_last(struct prison *pr);
static void prison_proc_relink(struct prison *opr, struct prison *npr,
    struct proc *p);
static void prison_set_allow_locked(struct prison *pr, unsigned flag,
    int enable);
static char *prison_path(struct prison *pr1, struct prison *pr2);
#ifdef RACCT
static void prison_racct_attach(struct prison *pr);
static void prison_racct_modify(struct prison *pr);
static void prison_racct_detach(struct prison *pr);
#endif
static void prison_knote(struct prison *pr, long hint);

/*
 * Flags for prison_deref.  The low bits (PD_OP_FLAGS) select what to do and
 * the 0x70 bits (PD_LOCK_FLAGS) describe which locks the caller holds.
 */
#define	PD_DEREF	0x01	/* Decrement pr_ref */
#define	PD_DEUREF	0x02	/* Decrement pr_uref */
#define	PD_KILL		0x04	/* Remove jail, kill processes, etc */
#define	PD_LOCKED	0x10	/* pr_mtx is held */
#define	PD_LIST_SLOCKED	0x20	/* allprison_lock is held shared */
#define	PD_LIST_XLOCKED	0x40	/* allprison_lock is held exclusive */
#define	PD_OP_FLAGS	0x07	/* Operation flags */
#define	PD_LOCK_FLAGS	0x70	/* Lock status flags */
/*
 * Parameter names corresponding to PR_* flag values.  Size values are for kvm
 * as we cannot figure out the size of a sparse array, or an array without a
 * terminating entry.  Note that the exported *_size values are byte sizes
 * (sizeof the array), not element counts.
 */
static struct bool_flags pr_flag_bool[] = {
	{"persist", "nopersist", PR_PERSIST},
#ifdef INET
	{"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL},
#endif
#ifdef INET6
	{"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL},
#endif
};
const size_t pr_flag_bool_size = sizeof(pr_flag_bool);

/*
 * Tri-state (disable/new/inherit) parameters.  Entries whose "disable"
 * member is 0 ("host", "vnet") reject JAIL_SYS_DISABLE in kern_jail_set().
 */
static struct jailsys_flags pr_flag_jailsys[] = {
	{"host", 0, PR_HOST},
#ifdef VIMAGE
	{"vnet", 0, PR_VNET},
#endif
#ifdef INET
	{"ip4", PR_IP4_USER, PR_IP4_USER},
#endif
#ifdef INET6
	{"ip6", PR_IP6_USER, PR_IP6_USER},
#endif
};
const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
/*
 * Make this array full-size so dynamic parameters can be added.
 * It is protected by prison0.pr_mtx, but lockless reading is allowed
 * with an atomic check of the flag values: readers walk the array until
 * they reach the end or an entry whose flag reads as 0.
 */
static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
	{"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME},
	{"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC},
	{"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS},
	{"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS},
	{"allow.mount", "allow.nomount", PR_ALLOW_MOUNT},
	{"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS},
	{"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF},
	{"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK},
	{"allow.reserved_ports", "allow.noreserved_ports",
	 PR_ALLOW_RESERVED_PORTS},
	{"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF},
	{"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug",
	 PR_ALLOW_UNPRIV_DEBUG},
	{"allow.suser", "allow.nosuser", PR_ALLOW_SUSER},
#ifdef VIMAGE
	{"allow.nfsd", "allow.nonfsd", PR_ALLOW_NFSD},
#endif
	{"allow.extattr", "allow.noextattr", PR_ALLOW_EXTATTR},
	{"allow.adjtime", "allow.noadjtime", PR_ALLOW_ADJTIME},
	{"allow.settime", "allow.nosettime", PR_ALLOW_SETTIME},
	{"allow.routing", "allow.norouting", PR_ALLOW_ROUTING},
	{"allow.unprivileged_parent_tampering",
	 "allow.nounprivileged_parent_tampering",
	 PR_ALLOW_UNPRIV_PARENT_TAMPER},
};
static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC;
/* Byte size of the whole array, not the number of populated entries. */
const size_t pr_flag_allow_size = sizeof(pr_flag_allow);

/*
 * Defaults that kern_jail() applies when a non-jailed caller creates a jail
 * through the legacy jail(2) interface.
 * NOTE(review): these are presumably tunable through sysctls defined
 * elsewhere in the file -- confirm.
 */
#define	JAIL_DEFAULT_ALLOW		(PR_ALLOW_SET_HOSTNAME | \
					 PR_ALLOW_RESERVED_PORTS | \
					 PR_ALLOW_UNPRIV_DEBUG | \
					 PR_ALLOW_SUSER)
#define	JAIL_DEFAULT_ENFORCE_STATFS	2
#define	JAIL_DEFAULT_DEVFS_RSNUM	0
static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
#if defined(INET) || defined(INET6)
/* Upper bound on the per-family address count accepted by kern_jail(). */
static unsigned jail_max_af_ips = 255;
#endif
parts of prison0 that can't be static-initialized with 265 * constants. This is called from proc0_init() after creating thread0 cpuset. 266 */ 267 void 268 prison0_init(void) 269 { 270 uint8_t *file, *data; 271 size_t size; 272 char buf[sizeof(prison0.pr_hostuuid)]; 273 bool valid; 274 275 prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset); 276 prison0.pr_osreldate = osreldate; 277 strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease)); 278 279 /* If we have a preloaded hostuuid, use it. */ 280 file = preload_search_by_type(PRISON0_HOSTUUID_MODULE); 281 if (file != NULL) { 282 data = preload_fetch_addr(file); 283 size = preload_fetch_size(file); 284 if (data != NULL) { 285 /* 286 * The preloaded data may include trailing whitespace, almost 287 * certainly a newline; skip over any whitespace or 288 * non-printable characters to be safe. 289 */ 290 while (size > 0 && data[size - 1] <= 0x20) { 291 size--; 292 } 293 294 valid = false; 295 296 /* 297 * Not NUL-terminated when passed from loader, but 298 * validate_uuid requires that due to using sscanf (as 299 * does the subsequent strlcpy, since it still reads 300 * past the given size to return the true length); 301 * bounce to a temporary buffer to fix. 
302 */ 303 if (size >= sizeof(buf)) 304 goto done; 305 306 memcpy(buf, data, size); 307 buf[size] = '\0'; 308 309 if (validate_uuid(buf, size, NULL, 0) != 0) 310 goto done; 311 312 valid = true; 313 (void)strlcpy(prison0.pr_hostuuid, buf, 314 sizeof(prison0.pr_hostuuid)); 315 316 done: 317 if (bootverbose && !valid) { 318 printf("hostuuid: preload data malformed: '%.*s'\n", 319 (int)size, data); 320 } 321 } 322 } 323 if (bootverbose) 324 printf("hostuuid: using %s\n", prison0.pr_hostuuid); 325 } 326 327 /* 328 * struct jail_args { 329 * struct jail *jail; 330 * }; 331 */ 332 int 333 sys_jail(struct thread *td, struct jail_args *uap) 334 { 335 uint32_t version; 336 int error; 337 struct jail j; 338 339 error = copyin(uap->jail, &version, sizeof(uint32_t)); 340 if (error) 341 return (error); 342 343 switch (version) { 344 case 0: 345 { 346 struct jail_v0 j0; 347 348 /* FreeBSD single IPv4 jails. */ 349 bzero(&j, sizeof(struct jail)); 350 error = copyin(uap->jail, &j0, sizeof(struct jail_v0)); 351 if (error) 352 return (error); 353 j.version = j0.version; 354 j.path = j0.path; 355 j.hostname = j0.hostname; 356 j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */ 357 break; 358 } 359 360 case 1: 361 /* 362 * Version 1 was used by multi-IPv4 jail implementations 363 * that never made it into the official kernel. 364 */ 365 return (EINVAL); 366 367 case 2: /* JAIL_API_VERSION */ 368 /* FreeBSD multi-IPv4/IPv6,noIP jails. */ 369 error = copyin(uap->jail, &j, sizeof(struct jail)); 370 if (error) 371 return (error); 372 break; 373 374 default: 375 /* Sci-Fi jails are not supported, sorry. 
*/ 376 return (EINVAL); 377 } 378 return (kern_jail(td, &j)); 379 } 380 381 int 382 kern_jail(struct thread *td, struct jail *j) 383 { 384 struct iovec optiov[2 * (4 + nitems(pr_flag_allow) 385 #ifdef INET 386 + 1 387 #endif 388 #ifdef INET6 389 + 1 390 #endif 391 )]; 392 struct uio opt; 393 char *u_path, *u_hostname, *u_name; 394 struct bool_flags *bf; 395 #ifdef INET 396 uint32_t ip4s; 397 struct in_addr *u_ip4; 398 #endif 399 #ifdef INET6 400 struct in6_addr *u_ip6; 401 #endif 402 size_t tmplen; 403 int error, enforce_statfs; 404 405 bzero(&optiov, sizeof(optiov)); 406 opt.uio_iov = optiov; 407 opt.uio_iovcnt = 0; 408 opt.uio_offset = -1; 409 opt.uio_resid = -1; 410 opt.uio_segflg = UIO_SYSSPACE; 411 opt.uio_rw = UIO_READ; 412 opt.uio_td = td; 413 414 /* Set permissions for top-level jails from sysctls. */ 415 if (!jailed(td->td_ucred)) { 416 for (bf = pr_flag_allow; 417 bf < pr_flag_allow + nitems(pr_flag_allow) && 418 atomic_load_int(&bf->flag) != 0; 419 bf++) { 420 optiov[opt.uio_iovcnt].iov_base = __DECONST(char *, 421 (jail_default_allow & bf->flag) 422 ? bf->name : bf->noname); 423 optiov[opt.uio_iovcnt].iov_len = 424 strlen(optiov[opt.uio_iovcnt].iov_base) + 1; 425 opt.uio_iovcnt += 2; 426 } 427 optiov[opt.uio_iovcnt].iov_base = "enforce_statfs"; 428 optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs"); 429 opt.uio_iovcnt++; 430 enforce_statfs = jail_default_enforce_statfs; 431 optiov[opt.uio_iovcnt].iov_base = &enforce_statfs; 432 optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs); 433 opt.uio_iovcnt++; 434 } 435 436 tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; 437 #ifdef INET 438 ip4s = (j->version == 0) ? 
1 : j->ip4s; 439 if (ip4s > jail_max_af_ips) 440 return (EINVAL); 441 tmplen += ip4s * sizeof(struct in_addr); 442 #else 443 if (j->ip4s > 0) 444 return (EINVAL); 445 #endif 446 #ifdef INET6 447 if (j->ip6s > jail_max_af_ips) 448 return (EINVAL); 449 tmplen += j->ip6s * sizeof(struct in6_addr); 450 #else 451 if (j->ip6s > 0) 452 return (EINVAL); 453 #endif 454 u_path = malloc(tmplen, M_TEMP, M_WAITOK); 455 u_hostname = u_path + MAXPATHLEN; 456 u_name = u_hostname + MAXHOSTNAMELEN; 457 #ifdef INET 458 u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); 459 #endif 460 #ifdef INET6 461 #ifdef INET 462 u_ip6 = (struct in6_addr *)(u_ip4 + ip4s); 463 #else 464 u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); 465 #endif 466 #endif 467 optiov[opt.uio_iovcnt].iov_base = "path"; 468 optiov[opt.uio_iovcnt].iov_len = sizeof("path"); 469 opt.uio_iovcnt++; 470 optiov[opt.uio_iovcnt].iov_base = u_path; 471 error = copyinstr(j->path, u_path, MAXPATHLEN, 472 &optiov[opt.uio_iovcnt].iov_len); 473 if (error) { 474 free(u_path, M_TEMP); 475 return (error); 476 } 477 opt.uio_iovcnt++; 478 optiov[opt.uio_iovcnt].iov_base = "host.hostname"; 479 optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname"); 480 opt.uio_iovcnt++; 481 optiov[opt.uio_iovcnt].iov_base = u_hostname; 482 error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN, 483 &optiov[opt.uio_iovcnt].iov_len); 484 if (error) { 485 free(u_path, M_TEMP); 486 return (error); 487 } 488 opt.uio_iovcnt++; 489 if (j->jailname != NULL) { 490 optiov[opt.uio_iovcnt].iov_base = "name"; 491 optiov[opt.uio_iovcnt].iov_len = sizeof("name"); 492 opt.uio_iovcnt++; 493 optiov[opt.uio_iovcnt].iov_base = u_name; 494 error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN, 495 &optiov[opt.uio_iovcnt].iov_len); 496 if (error) { 497 free(u_path, M_TEMP); 498 return (error); 499 } 500 opt.uio_iovcnt++; 501 } 502 #ifdef INET 503 optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; 504 optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); 505 
opt.uio_iovcnt++; 506 optiov[opt.uio_iovcnt].iov_base = u_ip4; 507 optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr); 508 if (j->version == 0) 509 u_ip4->s_addr = j->ip4s; 510 else { 511 error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); 512 if (error) { 513 free(u_path, M_TEMP); 514 return (error); 515 } 516 } 517 opt.uio_iovcnt++; 518 #endif 519 #ifdef INET6 520 optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; 521 optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr"); 522 opt.uio_iovcnt++; 523 optiov[opt.uio_iovcnt].iov_base = u_ip6; 524 optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr); 525 error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); 526 if (error) { 527 free(u_path, M_TEMP); 528 return (error); 529 } 530 opt.uio_iovcnt++; 531 #endif 532 KASSERT(opt.uio_iovcnt <= nitems(optiov), 533 ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt)); 534 error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); 535 free(u_path, M_TEMP); 536 return (error); 537 } 538 539 /* 540 * struct jail_set_args { 541 * struct iovec *iovp; 542 * unsigned int iovcnt; 543 * int flags; 544 * }; 545 */ 546 int 547 sys_jail_set(struct thread *td, struct jail_set_args *uap) 548 { 549 struct uio *auio; 550 int error; 551 552 /* Check that we have an even number of iovecs. 
*/ 553 if (uap->iovcnt & 1) 554 return (EINVAL); 555 556 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 557 if (error) 558 return (error); 559 error = kern_jail_set(td, auio, uap->flags); 560 freeuio(auio); 561 return (error); 562 } 563 564 #if defined(INET) || defined(INET6) 565 typedef int prison_addr_cmp_t(const void *, const void *); 566 typedef bool prison_addr_valid_t(const void *); 567 static const struct pr_family { 568 size_t size; 569 prison_addr_cmp_t *cmp; 570 prison_addr_valid_t *valid; 571 int ip_flag; 572 } pr_families[PR_FAMILY_MAX] = { 573 #ifdef INET 574 [PR_INET] = { 575 .size = sizeof(struct in_addr), 576 .cmp = prison_qcmp_v4, 577 .valid = prison_valid_v4, 578 .ip_flag = PR_IP4_USER, 579 }, 580 #endif 581 #ifdef INET6 582 [PR_INET6] = { 583 .size = sizeof(struct in6_addr), 584 .cmp = prison_qcmp_v6, 585 .valid = prison_valid_v6, 586 .ip_flag = PR_IP6_USER, 587 }, 588 #endif 589 }; 590 591 /* 592 * Network address lists (pr_addrs) allocation for jails. The addresses 593 * are accessed locklessly by the network stack, thus need to be protected by 594 * the network epoch. 595 */ 596 struct prison_ip { 597 struct epoch_context ctx; 598 uint32_t ips; 599 #ifdef FUTURE_C 600 /* 601 * XXX Variable-length automatic arrays in union may be 602 * supported in future C. 
#if defined(INET) || defined(INET6)
/*
 * Return a pointer to address number idx of the list, using the element
 * size of family af to index the untyped storage.
 */
static char *
PR_IP(struct prison_ip *pip, const pr_family_t af, int idx)
{
	size_t off;

	MPASS(pip);
	MPASS(af < PR_FAMILY_MAX);
	MPASS(idx >= 0 && idx < pip->ips);

	off = pr_families[af].size * idx;
	return (&pip->pr_ip[off]);
}

/*
 * Allocate an address list with room for cnt addresses of family af.
 * "flags" are malloc(9) flags; the result is NULL only if the allocation
 * itself fails.  The address storage is left uninitialized.
 */
static struct prison_ip *
prison_ip_alloc(const pr_family_t af, uint32_t cnt, int flags)
{
	struct prison_ip *pip;
	size_t bytes;

	bytes = sizeof(*pip) + cnt * pr_families[af].size;
	pip = malloc(bytes, M_PRISON, flags);
	if (pip == NULL)
		return (NULL);
	pip->ips = cnt;
	return (pip);
}

/*
 * Build an address list from the cnt addresses at op, sorting and
 * validating them.  Returns NULL when an address is invalid or duplicated.
 * kern_jail_set() helper.
 */
static struct prison_ip *
prison_ip_copyin(const pr_family_t af, void *op, uint32_t cnt)
{
	prison_addr_cmp_t *const cmp = pr_families[af].cmp;
	prison_addr_valid_t *const valid = pr_families[af].valid;
	const size_t size = pr_families[af].size;
	struct prison_ip *pip;
	int i;

	pip = prison_ip_alloc(af, cnt, M_WAITOK);
	bcopy(op, pip->pr_ip, cnt * size);

	/*
	 * Entry 0 is the primary address and keeps the position userland
	 * gave it: it is what unbound outgoing connections and "loopback"
	 * traffic fall back to when source address selection finds nothing
	 * better.  Everything after it is kept sorted.
	 */
	if (cnt > 1)
		qsort(PR_IP(pip, af, 1), cnt - 1, size, cmp);

	/*
	 * Reject the family's invalid addresses (zero, broadcast and the
	 * like) and duplicates.  Other bogus addresses are the user's
	 * problem.  Each entry is compared with its successor, and the
	 * successor also with the unsorted primary.
	 */
	for (i = 0; i < cnt; i++) {
		if (!valid(PR_IP(pip, af, i)))
			goto bad;
		if (i + 1 >= cnt)
			continue;
		if (cmp(PR_IP(pip, af, 0), PR_IP(pip, af, i + 1)) == 0 ||
		    cmp(PR_IP(pip, af, i), PR_IP(pip, af, i + 1)) == 0)
			goto bad;
	}
	return (pip);

bad:
	free(pip, M_PRISON);
	return (NULL);
}
#endif /* defined(INET) || defined(INET6) */
#if defined(INET) || defined(INET6)
/*
 * Give pr its own copy of the parent's address list for family af, if the
 * parent has one.
 * kern_jail_set() helper.
 */
static void
prison_ip_dup(struct prison *ppr, struct prison *pr, const pr_family_t af)
{
	const struct prison_ip *src = ppr->pr_addrs[af];
	struct prison_ip *copy;

	if (src == NULL)
		return;

	copy = prison_ip_alloc(af, src->ips, M_WAITOK);
	bcopy(src->pr_ip, copy->pr_ip, copy->ips * pr_families[af].size);
	pr->pr_addrs[af] = copy;
}

/*
 * Check that every address in pip also appears in the parent's list ppip.
 * Don't worry about the parent being unlocked, as any setting is done with
 * allprison_lock held.
 * kern_jail_set() helper.
 */
static bool
prison_ip_parent_match(struct prison_ip *ppip, struct prison_ip *pip,
    const pr_family_t af)
{
	prison_addr_cmp_t *const cmp = pr_families[af].cmp;
	int i, j;

	if (ppip == NULL)
		return (false);

	/* The primary address may sit anywhere in the parent's list. */
	i = 0;
	while (i < ppip->ips &&
	    cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, i)) != 0)
		i++;
	if (i == ppip->ips)
		/* Main address not present in parent. */
		return (false);

	if (pip->ips <= 1)
		return (true);

	/*
	 * Both tails are sorted, so walk them in step: j only ever moves
	 * forward through the parent's sorted entries.
	 */
	j = 1;
	for (i = 1; i < pip->ips; i++) {
		if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0)
			/* Equals to parent primary address. */
			continue;
		while (j < ppip->ips &&
		    cmp(PR_IP(pip, af, i), PR_IP(ppip, af, j)) != 0)
			j++;
		if (j == ppip->ips)
			break;
	}
	/* Running off the end of the parent means an address is missing. */
	return (j != ppip->ips);
}
#endif /* defined(INET) || defined(INET6) */
#if defined(INET) || defined(INET6)
/*
 * Check for conflicting IP addresses.  We permit them if there is no more
 * than one IP on each jail.  If there is a duplicate on a jail with more
 * than one IP stop checking and return error (false).
 * kern_jail_set() helper.
 */
static bool
prison_ip_conflict_check(const struct prison *ppr, const struct prison *pr,
    struct prison_ip *pip, pr_family_t af)
{
	const struct prison *top, *cur;
	const struct prison_ip *theirs;
	int descend, i;

	/* Start from the jail that owns the network stack in use. */
#ifdef VIMAGE
	top = ppr;
	while (top != &prison0 && !(top->pr_flags & PR_VNET))
		top = top->pr_parent;
#else
	top = &prison0;
#endif
	FOREACH_PRISON_DESCENDANT(top, cur, descend) {
		/* Skip ourselves, other vnets and dying jails entirely. */
		if (cur == pr ||
#ifdef VIMAGE
		    (cur != top && (cur->pr_flags & PR_VNET)) ||
#endif
		    !prison_isalive(cur)) {
			descend = 0;
			continue;
		}
		/* Jails without their own addresses: look at children. */
		if ((cur->pr_flags & pr_families[af].ip_flag) == 0)
			continue;
		descend = 0;
		theirs = cur->pr_addrs[af];
		if (theirs == NULL)
			continue;
		if (pip->ips == 1 && theirs->ips == 1)
			continue;
		for (i = 0; i < pip->ips; i++) {
			if (prison_ip_check(cur, af, PR_IP(pip, af, i)) == 0)
				return (false);
		}
	}

	return (true);
}

/* The deferred free below passes the epoch context straight to free(). */
_Static_assert(offsetof(struct prison_ip, ctx) == 0,
    "prison must start with epoch context");
/* Epoch callback: release an address list once no reader can see it. */
static void
prison_ip_free_deferred(epoch_context_t ctx)
{

	free(ctx, M_PRISON);
}

/* Schedule an address list (if any) to be freed after the net epoch. */
static void
prison_ip_free(struct prison_ip *pip)
{

	if (pip == NULL)
		return;
	NET_EPOCH_CALL(prison_ip_free_deferred, &pip->ctx);
}

/*
 * Publish "new" as pr's address list for family af and retire the previous
 * list.  The prison mutex must be held.
 */
static void
prison_ip_set(struct prison *pr, const pr_family_t af, struct prison_ip *new)
{
	struct prison_ip **slot, *prev;

	mtx_assert(&pr->pr_mtx, MA_OWNED);

	slot = &pr->pr_addrs[af];
	prev = *slot;
	atomic_store_ptr(slot, new);
	prison_ip_free(prev);
}
#endif /* defined(INET) || defined(INET6) */
#if defined(INET) || defined(INET6)
/*
 * Restrict a prison's IP address list with its parent's, possibly replacing
 * it.
 *
 * pr	- the prison to restrict; its mutex must be held.
 * af	- address family to operate on.
 * newp	- optional pointer to a preallocated list to use instead of
 *	  allocating here.  *newp is set to NULL only when that list was
 *	  actually installed in the prison; otherwise the caller still owns
 *	  it.
 *
 * Returns true on success.  Returns false when a list was needed but could
 * not be allocated without sleeping; the caller should then allocate one
 * and redo the call.
 * kern_jail_set() helper.
 */
static bool
prison_ip_restrict(struct prison *pr, const pr_family_t af,
    struct prison_ip **newp)
{
	struct prison_ip *ppip = pr->pr_parent->pr_addrs[af];
	struct prison_ip *pip = pr->pr_addrs[af];
	int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
	const size_t size = pr_families[af].size;
	struct prison_ip *new = newp != NULL ? *newp : NULL;
	uint32_t ips;

	mtx_assert(&pr->pr_mtx, MA_OWNED);

	/*
	 * Due to epoch-synchronized access to the IP address lists we always
	 * allocate a new list even if the old one has enough space.  We could
	 * atomically update an IPv4 address inside a list, but that would
	 * screw up sorting, and in case of IPv6 we can't even atomically write
	 * one.
	 */
	if (ppip == NULL) {
		/* The parent has no addresses, so neither may we. */
		if (pip != NULL)
			prison_ip_set(pr, af, NULL);
		return (true);
	}

	if (!(pr->pr_flags & pr_families[af].ip_flag)) {
		if (new == NULL) {
			new = prison_ip_alloc(af, ppip->ips, M_NOWAIT);
			if (new == NULL)
				return (false); /* Redo */
		}
		/* This has no user settings, so just copy the parent's list. */
		MPASS(new->ips == ppip->ips);
		bcopy(ppip->pr_ip, new->pr_ip, ppip->ips * size);
		prison_ip_set(pr, af, new);
		if (newp != NULL)
			*newp = NULL; /* Used */
	} else if (pip != NULL) {
		/* Remove addresses that aren't in the parent. */
		int i;

		i = 0;		/* index in pip */
		ips = 0;	/* index in new */

		if (new == NULL) {
			new = prison_ip_alloc(af, pip->ips, M_NOWAIT);
			if (new == NULL)
				return (false); /* Redo */
		}

		for (int pi = 0; pi < ppip->ips; pi++)
			if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, pi)) == 0) {
				/* Found our primary address in parent. */
				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
				    size);
				i++;
				ips++;
				break;
			}
		/*
		 * The primary is not in the parent: drop it here.  It is
		 * the one unsorted entry, so letting it take part in the
		 * sorted merge below could push the parent cursor past
		 * entries that later (valid) addresses still need.
		 */
		if (i == 0)
			i = 1;
		for (int pi = 1; i < pip->ips; ) {
			/* Check against primary, which is unsorted. */
			if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0) {
				/* Matches parent's primary address. */
				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
				    size);
				i++;
				ips++;
				continue;
			}
			/* The rest are sorted. */
			switch (pi >= ppip->ips ? -1 :
				cmp(PR_IP(pip, af, i), PR_IP(ppip, af, pi))) {
			case -1:
				/* Not in the parent: skip it. */
				i++;
				break;
			case 0:
				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
				    size);
				i++;
				pi++;
				ips++;
				break;
			case 1:
				/* Parent entry we do not have: move on. */
				pi++;
				break;
			}
		}
		if (ips == 0) {
			/*
			 * Nothing survived.  Free the list only if it was
			 * allocated here; a caller-supplied list stays with
			 * the caller.
			 */
			if (newp == NULL || *newp == NULL)
				prison_ip_free(new);
			new = NULL;
		} else {
			/* Shrink to real size */
			KASSERT((new->ips >= ips),
			    ("Out-of-bounds write to prison_ip %p", new));
			new->ips = ips;
		}
		prison_ip_set(pr, af, new);
		/*
		 * Only report the caller's list as consumed when it really
		 * was installed; clearing *newp otherwise would leak it.
		 */
		if (new != NULL && newp != NULL)
			*newp = NULL; /* Used */
	}
	return (true);
}
#endif /* defined(INET) || defined(INET6) */
#if defined(INET) || defined(INET6)
/*
 * Fast-path check if an address belongs to a prison.  Returns 0 when it
 * does, EAFNOSUPPORT when the prison has no list for the family, and
 * EADDRNOTAVAIL when the address is not in the list.
 */
int
prison_ip_check(const struct prison *pr, const pr_family_t af,
    const void *addr)
{
	int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
	struct prison_ip *pip;
	int lo, hi, mid, order;

	MPASS(mtx_owned(&pr->pr_mtx) ||
	    in_epoch(net_epoch_preempt) ||
	    sx_xlocked(&allprison_lock));

	pip = atomic_load_ptr(&pr->pr_addrs[af]);
	if (__predict_false(pip == NULL))
		return (EAFNOSUPPORT);

	/* Check the primary IP. */
	if (cmp(PR_IP(pip, af, 0), addr) == 0)
		return (0);

	/*
	 * All the other IPs are sorted so we can do a binary search over
	 * list positions 1 .. ips - 1.
	 */
	lo = 1;
	hi = pip->ips - 1;
	while (lo <= hi) {
		mid = (lo + hi) / 2;
		order = cmp(PR_IP(pip, af, mid), addr);
		if (order == 0)
			return (0);
		if (order > 0)
			hi = mid - 1;
		else
			lo = mid + 1;
	}

	return (EADDRNOTAVAIL);
}

/*
 * Grab primary IP.  Historically required mutex, but nothing prevents
 * us from supporting epoch-protected access.  Is it used in fast path?
 * in{6}_jail.c helper
 */
const void *
prison_ip_get0(const struct prison *pr, const pr_family_t af)
{
	const struct prison_ip *pip;

	mtx_assert(&pr->pr_mtx, MA_OWNED);
	pip = pr->pr_addrs[af];
	MPASS(pip);

	return (pip->pr_ip);
}

/*
 * Return the number of addresses of family af configured on the prison.
 * The prison must have an address list for that family.
 */
u_int
prison_ip_cnt(const struct prison *pr, const pr_family_t af)
{
	const struct prison_ip *pip = pr->pr_addrs[af];

	return (pip->ips);
}
#endif /* defined(INET) || defined(INET6) */
1029 bool maybe_changed; 1030 uint64_t pr_allow, ch_allow, pr_flags, ch_flags; 1031 uint64_t pr_allow_diff; 1032 unsigned tallow; 1033 char numbuf[12]; 1034 1035 mypr = td->td_ucred->cr_prison; 1036 if (((flags & (JAIL_CREATE | JAIL_AT_DESC)) == JAIL_CREATE) 1037 && mypr->pr_childmax == 0) 1038 return (EPERM); 1039 if (flags & ~JAIL_SET_MASK) 1040 return (EINVAL); 1041 if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) 1042 == (JAIL_USE_DESC | JAIL_AT_DESC)) 1043 return (EINVAL); 1044 prison_hold(mypr); 1045 1046 #ifdef INET 1047 ip4 = NULL; 1048 #endif 1049 #ifdef INET6 1050 ip6 = NULL; 1051 #endif 1052 g_path = NULL; 1053 jfp_out = NULL; 1054 jfd_out = -1; 1055 /* 1056 * Check all the parameters before committing to anything. Not all 1057 * errors can be caught early, but we may as well try. Also, this 1058 * takes care of some expensive stuff (path lookup) before getting 1059 * the allprison lock. 1060 * 1061 * XXX Jails are not filesystems, and jail parameters are not mount 1062 * options. But it makes more sense to re-use the vfsopt code 1063 * than duplicate it under a different name. 1064 */ 1065 error = vfs_buildopts(optuio, &opts); 1066 if (error) 1067 goto done_free; 1068 1069 cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); 1070 if (!cuflags) { 1071 error = EINVAL; 1072 vfs_opterror(opts, "no valid operation (create or update)"); 1073 goto done_errmsg; 1074 } 1075 1076 error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in)); 1077 if (error == ENOENT) { 1078 if (flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | 1079 JAIL_OWN_DESC)) { 1080 vfs_opterror(opts, "missing desc"); 1081 goto done_errmsg; 1082 } 1083 jfd_in = -1; 1084 } else if (error != 0) 1085 goto done_free; 1086 else { 1087 if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | 1088 JAIL_OWN_DESC))) { 1089 vfs_opterror(opts, "unexpected desc"); 1090 goto done_errmsg; 1091 } 1092 if (flags & JAIL_AT_DESC) { 1093 /* 1094 * Look up and create jails based on the 1095 * descriptor's prison. 
1096 */ 1097 prison_free(mypr); 1098 error = jaildesc_find(td, jfd_in, &desc_in, &mypr, 1099 NULL); 1100 if (error != 0) { 1101 vfs_opterror(opts, error == ENOENT 1102 ? "descriptor to dead jail" 1103 : "not a jail descriptor"); 1104 goto done_errmsg; 1105 } 1106 /* 1107 * Check file permissions using the current 1108 * credentials, and operation permissions 1109 * using the descriptor's credentials. 1110 */ 1111 error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid, 1112 desc_in->jd_gid, VEXEC, td->td_ucred); 1113 JAILDESC_UNLOCK(desc_in); 1114 if (error != 0) 1115 goto done_free; 1116 if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) { 1117 error = EPERM; 1118 goto done_free; 1119 } 1120 } 1121 if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) { 1122 /* Allocate a jail descriptor to return later. */ 1123 error = jaildesc_alloc(td, &jfp_out, &jfd_out, 1124 flags & JAIL_OWN_DESC); 1125 if (error) 1126 goto done_free; 1127 } 1128 } 1129 1130 /* 1131 * Delay the permission check if using a jail descriptor, 1132 * until we get the descriptor's credentials. 
1133 */ 1134 if (!(flags & JAIL_USE_DESC)) { 1135 error = priv_check(td, PRIV_JAIL_SET); 1136 if (error == 0 && (flags & JAIL_ATTACH)) 1137 error = priv_check(td, PRIV_JAIL_ATTACH); 1138 if (error) 1139 goto done_free; 1140 } 1141 1142 error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); 1143 if (error == ENOENT) 1144 jid = 0; 1145 else if (error != 0) 1146 goto done_free; 1147 1148 error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel)); 1149 if (error == ENOENT) 1150 gotslevel = 0; 1151 else if (error != 0) 1152 goto done_free; 1153 else 1154 gotslevel = 1; 1155 1156 error = 1157 vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax)); 1158 if (error == ENOENT) 1159 gotchildmax = 0; 1160 else if (error != 0) 1161 goto done_free; 1162 else 1163 gotchildmax = 1; 1164 1165 error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); 1166 if (error == ENOENT) 1167 gotenforce = 0; 1168 else if (error != 0) 1169 goto done_free; 1170 else if (enforce < 0 || enforce > 2) { 1171 error = EINVAL; 1172 goto done_free; 1173 } else 1174 gotenforce = 1; 1175 1176 error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum)); 1177 if (error == ENOENT) 1178 gotrsnum = 0; 1179 else if (error != 0) 1180 goto done_free; 1181 else 1182 gotrsnum = 1; 1183 1184 pr_flags = ch_flags = 0; 1185 for (bf = pr_flag_bool; 1186 bf < pr_flag_bool + nitems(pr_flag_bool); 1187 bf++) { 1188 vfs_flagopt(opts, bf->name, &pr_flags, bf->flag); 1189 vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag); 1190 } 1191 ch_flags |= pr_flags; 1192 for (jsf = pr_flag_jailsys; 1193 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); 1194 jsf++) { 1195 error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys)); 1196 if (error == ENOENT) 1197 continue; 1198 if (error != 0) 1199 goto done_free; 1200 switch (jsys) { 1201 case JAIL_SYS_DISABLE: 1202 if (!jsf->disable) { 1203 error = EINVAL; 1204 goto done_free; 1205 } 1206 pr_flags |= jsf->disable; 1207 break; 1208 case JAIL_SYS_NEW: 1209 
pr_flags |= jsf->new; 1210 break; 1211 case JAIL_SYS_INHERIT: 1212 break; 1213 default: 1214 error = EINVAL; 1215 goto done_free; 1216 } 1217 ch_flags |= jsf->new | jsf->disable; 1218 } 1219 if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE 1220 && !(pr_flags & PR_PERSIST)) { 1221 error = EINVAL; 1222 vfs_opterror(opts, "new jail must persist or attach"); 1223 goto done_errmsg; 1224 } 1225 #ifdef VIMAGE 1226 if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) { 1227 error = EINVAL; 1228 vfs_opterror(opts, "vnet cannot be changed after creation"); 1229 goto done_errmsg; 1230 } 1231 #endif 1232 #ifdef INET 1233 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { 1234 error = EINVAL; 1235 vfs_opterror(opts, "ip4 cannot be changed after creation"); 1236 goto done_errmsg; 1237 } 1238 #endif 1239 #ifdef INET6 1240 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) { 1241 error = EINVAL; 1242 vfs_opterror(opts, "ip6 cannot be changed after creation"); 1243 goto done_errmsg; 1244 } 1245 #endif 1246 1247 pr_allow = ch_allow = 0; 1248 for (bf = pr_flag_allow; 1249 bf < pr_flag_allow + nitems(pr_flag_allow) && 1250 atomic_load_int(&bf->flag) != 0; 1251 bf++) { 1252 vfs_flagopt(opts, bf->name, &pr_allow, bf->flag); 1253 vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag); 1254 } 1255 ch_allow |= pr_allow; 1256 1257 error = vfs_getopt(opts, "name", (void **)&name, &len); 1258 if (error == ENOENT) 1259 name = NULL; 1260 else if (error != 0) 1261 goto done_free; 1262 else { 1263 if (len == 0 || name[len - 1] != '\0') { 1264 error = EINVAL; 1265 goto done_free; 1266 } 1267 if (len > MAXHOSTNAMELEN) { 1268 error = ENAMETOOLONG; 1269 goto done_free; 1270 } 1271 } 1272 1273 error = vfs_getopt(opts, "host.hostname", (void **)&host, &len); 1274 if (error == ENOENT) 1275 host = NULL; 1276 else if (error != 0) 1277 goto done_free; 1278 else { 1279 ch_flags |= PR_HOST; 1280 pr_flags |= PR_HOST; 1281 if (len == 0 || host[len - 1] != '\0') { 1282 error = EINVAL; 1283 goto 
done_free; 1284 } 1285 if (len > MAXHOSTNAMELEN) { 1286 error = ENAMETOOLONG; 1287 goto done_free; 1288 } 1289 } 1290 1291 error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len); 1292 if (error == ENOENT) 1293 domain = NULL; 1294 else if (error != 0) 1295 goto done_free; 1296 else { 1297 ch_flags |= PR_HOST; 1298 pr_flags |= PR_HOST; 1299 if (len == 0 || domain[len - 1] != '\0') { 1300 error = EINVAL; 1301 goto done_free; 1302 } 1303 if (len > MAXHOSTNAMELEN) { 1304 error = ENAMETOOLONG; 1305 goto done_free; 1306 } 1307 } 1308 1309 error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len); 1310 if (error == ENOENT) 1311 uuid = NULL; 1312 else if (error != 0) 1313 goto done_free; 1314 else { 1315 ch_flags |= PR_HOST; 1316 pr_flags |= PR_HOST; 1317 if (len == 0 || uuid[len - 1] != '\0') { 1318 error = EINVAL; 1319 goto done_free; 1320 } 1321 if (len > HOSTUUIDLEN) { 1322 error = ENAMETOOLONG; 1323 goto done_free; 1324 } 1325 } 1326 1327 #ifdef COMPAT_FREEBSD32 1328 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { 1329 uint32_t hid32; 1330 1331 error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32)); 1332 hid = hid32; 1333 } else 1334 #endif 1335 error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid)); 1336 if (error == ENOENT) 1337 gothid = 0; 1338 else if (error != 0) 1339 goto done_free; 1340 else { 1341 gothid = 1; 1342 ch_flags |= PR_HOST; 1343 pr_flags |= PR_HOST; 1344 } 1345 1346 #ifdef INET 1347 error = vfs_getopt(opts, "ip4.addr", &op, &ip4s); 1348 if (error == ENOENT) 1349 ip4s = 0; 1350 else if (error != 0) 1351 goto done_free; 1352 else if (ip4s & (sizeof(struct in_addr) - 1)) { 1353 error = EINVAL; 1354 goto done_free; 1355 } else { 1356 ch_flags |= PR_IP4_USER; 1357 pr_flags |= PR_IP4_USER; 1358 if (ip4s > 0) { 1359 ip4s /= sizeof(struct in_addr); 1360 if (ip4s > jail_max_af_ips) { 1361 error = EINVAL; 1362 vfs_opterror(opts, "too many IPv4 addresses"); 1363 goto done_errmsg; 1364 } 1365 ip4 = prison_ip_copyin(PR_INET, op, ip4s); 
1366 if (ip4 == NULL) { 1367 error = EINVAL; 1368 goto done_free; 1369 } 1370 } 1371 } 1372 #endif 1373 1374 #ifdef INET6 1375 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s); 1376 if (error == ENOENT) 1377 ip6s = 0; 1378 else if (error != 0) 1379 goto done_free; 1380 else if (ip6s & (sizeof(struct in6_addr) - 1)) { 1381 error = EINVAL; 1382 goto done_free; 1383 } else { 1384 ch_flags |= PR_IP6_USER; 1385 pr_flags |= PR_IP6_USER; 1386 if (ip6s > 0) { 1387 ip6s /= sizeof(struct in6_addr); 1388 if (ip6s > jail_max_af_ips) { 1389 error = EINVAL; 1390 vfs_opterror(opts, "too many IPv6 addresses"); 1391 goto done_errmsg; 1392 } 1393 ip6 = prison_ip_copyin(PR_INET6, op, ip6s); 1394 if (ip6 == NULL) { 1395 error = EINVAL; 1396 goto done_free; 1397 } 1398 } 1399 } 1400 #endif 1401 1402 #if defined(VIMAGE) && (defined(INET) || defined(INET6)) 1403 if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { 1404 error = EINVAL; 1405 vfs_opterror(opts, 1406 "vnet jails cannot have IP address restrictions"); 1407 goto done_errmsg; 1408 } 1409 #endif 1410 1411 error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len); 1412 if (error == ENOENT) 1413 osrelstr = NULL; 1414 else if (error != 0) 1415 goto done_free; 1416 else { 1417 if (flags & JAIL_UPDATE) { 1418 error = EINVAL; 1419 vfs_opterror(opts, 1420 "osrelease cannot be changed after creation"); 1421 goto done_errmsg; 1422 } 1423 if (len == 0 || osrelstr[len - 1] != '\0') { 1424 error = EINVAL; 1425 goto done_free; 1426 } 1427 if (len >= OSRELEASELEN) { 1428 error = ENAMETOOLONG; 1429 vfs_opterror(opts, 1430 "osrelease string must be 1-%d bytes long", 1431 OSRELEASELEN - 1); 1432 goto done_errmsg; 1433 } 1434 } 1435 1436 error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt)); 1437 if (error == ENOENT) 1438 osreldt = 0; 1439 else if (error != 0) 1440 goto done_free; 1441 else { 1442 if (flags & JAIL_UPDATE) { 1443 error = EINVAL; 1444 vfs_opterror(opts, 1445 "osreldate cannot be changed after 
creation"); 1446 goto done_errmsg; 1447 } 1448 if (osreldt == 0) { 1449 error = EINVAL; 1450 vfs_opterror(opts, "osreldate cannot be 0"); 1451 goto done_errmsg; 1452 } 1453 } 1454 1455 root = NULL; 1456 error = vfs_getopt(opts, "path", (void **)&path, &len); 1457 if (error == ENOENT) 1458 path = NULL; 1459 else if (error != 0) 1460 goto done_free; 1461 else { 1462 if (flags & JAIL_UPDATE) { 1463 error = EINVAL; 1464 vfs_opterror(opts, 1465 "path cannot be changed after creation"); 1466 goto done_errmsg; 1467 } 1468 if (len == 0 || path[len - 1] != '\0') { 1469 error = EINVAL; 1470 goto done_free; 1471 } 1472 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path); 1473 error = namei(&nd); 1474 if (error) 1475 goto done_free; 1476 root = nd.ni_vp; 1477 NDFREE_PNBUF(&nd); 1478 g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 1479 strlcpy(g_path, path, MAXPATHLEN); 1480 error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN); 1481 if (error == 0) { 1482 path = g_path; 1483 } else { 1484 /* exit on other errors */ 1485 goto done_free; 1486 } 1487 if (root->v_type != VDIR) { 1488 error = ENOTDIR; 1489 vput(root); 1490 goto done_free; 1491 } 1492 VOP_UNLOCK(root); 1493 } 1494 1495 /* 1496 * Find the specified jail, or at least its parent. 1497 * This abuses the file error codes ENOENT and EEXIST. 1498 */ 1499 pr = NULL; 1500 inspr = NULL; 1501 deadpr = NULL; 1502 maybe_changed = false; 1503 if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { 1504 namelc = strrchr(name, '.'); 1505 jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); 1506 if (*p != '\0') 1507 jid = 0; 1508 } 1509 sx_xlock(&allprison_lock); 1510 drflags = PD_LIST_XLOCKED; 1511 ppr = mypr; 1512 if (!prison_isalive(ppr)) { 1513 /* This jail is dying. This process will surely follow. */ 1514 error = EAGAIN; 1515 goto done_deref; 1516 } 1517 if (flags & JAIL_USE_DESC) { 1518 /* Get the jail from its descriptor. 
*/ 1519 error = jaildesc_find(td, jfd_in, &desc_in, &pr, &jdcred); 1520 if (error) { 1521 vfs_opterror(opts, error == ENOENT 1522 ? "descriptor to dead jail" 1523 : "not a jail descriptor"); 1524 goto done_deref; 1525 } 1526 drflags |= PD_DEREF; 1527 /* 1528 * Check file permissions using the current credentials, 1529 * and operation permissions using the descriptor's 1530 * credentials. 1531 */ 1532 error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid, 1533 desc_in->jd_gid, VWRITE, td->td_ucred); 1534 if (error == 0 && (flags & JAIL_ATTACH)) 1535 error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid, 1536 desc_in->jd_gid, VEXEC, td->td_ucred); 1537 JAILDESC_UNLOCK(desc_in); 1538 if (error == 0) 1539 error = priv_check_cred(jdcred, PRIV_JAIL_SET); 1540 if (error == 0 && (flags & JAIL_ATTACH)) 1541 error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH); 1542 crfree(jdcred); 1543 if (error) 1544 goto done_deref; 1545 mtx_lock(&pr->pr_mtx); 1546 drflags |= PD_LOCKED; 1547 if (cuflags == JAIL_CREATE) { 1548 error = EEXIST; 1549 vfs_opterror(opts, "jail %d already exists", 1550 pr->pr_id); 1551 goto done_deref; 1552 } 1553 if (!prison_isalive(pr)) { 1554 /* While a jid can be resurrected, the prison 1555 * itself cannot. 1556 */ 1557 error = ENOENT; 1558 vfs_opterror(opts, "jail %d is dying", pr->pr_id); 1559 goto done_deref; 1560 } 1561 if (jid != 0 && jid != pr->pr_id) { 1562 error = EINVAL; 1563 vfs_opterror(opts, "cannot change jid"); 1564 goto done_deref; 1565 } 1566 jid = pr->pr_id; 1567 } else if (jid != 0) { 1568 if (jid < 0) { 1569 error = EINVAL; 1570 vfs_opterror(opts, "negative jid"); 1571 goto done_deref; 1572 } 1573 /* 1574 * See if a requested jid already exists. Keep track of 1575 * where it can be inserted later. 
1576 */ 1577 TAILQ_FOREACH(inspr, &allprison, pr_list) { 1578 if (inspr->pr_id < jid) 1579 continue; 1580 if (inspr->pr_id > jid) 1581 break; 1582 if (prison_isalive(inspr)) { 1583 pr = inspr; 1584 mtx_lock(&pr->pr_mtx); 1585 drflags |= PD_LOCKED; 1586 } else { 1587 /* Note a dying jail to handle later. */ 1588 deadpr = inspr; 1589 } 1590 inspr = NULL; 1591 break; 1592 } 1593 if (cuflags == JAIL_CREATE && pr != NULL) { 1594 /* 1595 * Even creators that cannot see the jail will 1596 * get EEXIST. 1597 */ 1598 error = EEXIST; 1599 vfs_opterror(opts, "jail %d already exists", jid); 1600 goto done_deref; 1601 } 1602 if ((pr == NULL) 1603 ? cuflags == JAIL_UPDATE 1604 : !prison_ischild(mypr, pr)) { 1605 /* 1606 * Updaters get ENOENT for nonexistent jails, 1607 * or for jails they cannot see. The latter 1608 * case is true even for CREATE | UPDATE, 1609 * which normally cannot give this error. 1610 */ 1611 error = ENOENT; 1612 vfs_opterror(opts, "jail %d not found", jid); 1613 goto done_deref; 1614 } 1615 } 1616 /* 1617 * If the caller provided a name, look for a jail by that name. 1618 * This has different semantics for creates and updates keyed by jid 1619 * (where the name must not already exist in a different jail), 1620 * and updates keyed by the name itself (where the name must exist 1621 * because that is the jail being updated). 1622 */ 1623 namelc = NULL; 1624 if (name != NULL) { 1625 namelc = strrchr(name, '.'); 1626 if (namelc == NULL) 1627 namelc = name; 1628 else { 1629 /* 1630 * This is a hierarchical name. Split it into the 1631 * parent and child names, and make sure the parent 1632 * exists or matches an already found jail. 
1633 */ 1634 if (pr != NULL) { 1635 if (strncmp(name, ppr->pr_name, namelc - name) 1636 || ppr->pr_name[namelc - name] != '\0') { 1637 error = EINVAL; 1638 vfs_opterror(opts, 1639 "cannot change jail's parent"); 1640 goto done_deref; 1641 } 1642 } else { 1643 *namelc = '\0'; 1644 ppr = prison_find_name(mypr, name); 1645 if (ppr == NULL) { 1646 error = ENOENT; 1647 vfs_opterror(opts, 1648 "jail \"%s\" not found", name); 1649 goto done_deref; 1650 } 1651 mtx_unlock(&ppr->pr_mtx); 1652 if (!prison_isalive(ppr)) { 1653 error = ENOENT; 1654 vfs_opterror(opts, 1655 "jail \"%s\" is dying", name); 1656 goto done_deref; 1657 } 1658 *namelc = '.'; 1659 } 1660 namelc++; 1661 } 1662 if (namelc[0] != '\0') { 1663 pnamelen = 1664 (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; 1665 FOREACH_PRISON_CHILD(ppr, tpr) { 1666 if (tpr == pr || !prison_isalive(tpr) || 1667 strcmp(tpr->pr_name + pnamelen, namelc)) 1668 continue; 1669 if (cuflags == JAIL_CREATE || pr != NULL) { 1670 /* 1671 * Create, or update(jid): name must 1672 * not exist in an active sibling jail. 1673 */ 1674 error = EEXIST; 1675 vfs_opterror(opts, 1676 "jail \"%s\" already exists", name); 1677 goto done_deref; 1678 } 1679 /* Use this jail for updates. */ 1680 pr = tpr; 1681 mtx_lock(&pr->pr_mtx); 1682 drflags |= PD_LOCKED; 1683 break; 1684 } 1685 /* 1686 * Update: name must exist if no jid is specified. 1687 * As with the jid case, the jail must be currently 1688 * visible, or else even CREATE | UPDATE will get 1689 * an error. 1690 */ 1691 if ((pr == NULL) 1692 ? cuflags == JAIL_UPDATE 1693 : !prison_isalive(pr)) { 1694 error = ENOENT; 1695 vfs_opterror(opts, "jail \"%s\" not found", 1696 name); 1697 goto done_deref; 1698 } 1699 } 1700 } 1701 /* Update: must provide a desc, jid, or name. 
*/ 1702 else if (cuflags == JAIL_UPDATE && pr == NULL) { 1703 error = ENOENT; 1704 vfs_opterror(opts, "update specified no jail"); 1705 goto done_deref; 1706 } 1707 1708 /* If there's no prison to update, create a new one and link it in. */ 1709 created = pr == NULL; 1710 if (created) { 1711 for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent) 1712 if (tpr->pr_childcount >= tpr->pr_childmax) { 1713 error = EPERM; 1714 vfs_opterror(opts, "prison limit exceeded"); 1715 goto done_deref; 1716 } 1717 1718 if (deadpr != NULL) { 1719 /* 1720 * The prison being created has the same ID as a dying 1721 * one. Handle this by giving the dying jail a new ID. 1722 * This may cause some confusion to user space, but 1723 * only to those listing dying jails. 1724 */ 1725 deadid = get_next_deadid(&dinspr); 1726 if (deadid == 0) { 1727 error = EAGAIN; 1728 vfs_opterror(opts, "no available jail IDs"); 1729 goto done_deref; 1730 } 1731 mtx_lock(&deadpr->pr_mtx); 1732 deadpr->pr_id = deadid; 1733 mtx_unlock(&deadpr->pr_mtx); 1734 if (dinspr == deadpr) 1735 inspr = deadpr; 1736 else { 1737 inspr = TAILQ_NEXT(deadpr, pr_list); 1738 TAILQ_REMOVE(&allprison, deadpr, pr_list); 1739 if (dinspr != NULL) 1740 TAILQ_INSERT_AFTER(&allprison, dinspr, 1741 deadpr, pr_list); 1742 else 1743 TAILQ_INSERT_HEAD(&allprison, deadpr, 1744 pr_list); 1745 } 1746 } 1747 if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) { 1748 error = EAGAIN; 1749 vfs_opterror(opts, "no available jail IDs"); 1750 goto done_deref; 1751 } 1752 1753 pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); 1754 pr->pr_state = PRISON_STATE_INVALID; 1755 refcount_init(&pr->pr_ref, 1); 1756 refcount_init(&pr->pr_uref, 0); 1757 drflags |= PD_DEREF; 1758 LIST_INIT(&pr->pr_children); 1759 mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); 1760 TASK_INIT(&pr->pr_task, 0, prison_complete, pr); 1761 1762 pr->pr_id = jid; 1763 if (inspr != NULL) 1764 TAILQ_INSERT_BEFORE(inspr, pr, pr_list); 1765 else 1766 
TAILQ_INSERT_TAIL(&allprison, pr, pr_list); 1767 1768 pr->pr_parent = ppr; 1769 prison_hold(ppr); 1770 prison_proc_hold(ppr); 1771 LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); 1772 for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) 1773 tpr->pr_childcount++; 1774 pr->pr_klist = knlist_alloc(&pr->pr_mtx); 1775 1776 /* Set some default values, and inherit some from the parent. */ 1777 if (namelc == NULL) 1778 namelc = ""; 1779 if (path == NULL) { 1780 path = "/"; 1781 root = mypr->pr_root; 1782 vref(root); 1783 } 1784 strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN); 1785 pr->pr_flags |= PR_HOST; 1786 #if defined(INET) || defined(INET6) 1787 #ifdef VIMAGE 1788 if (!(pr_flags & PR_VNET)) 1789 #endif 1790 { 1791 #ifdef INET 1792 if (!(ch_flags & PR_IP4_USER)) 1793 pr->pr_flags |= PR_IP4 | PR_IP4_USER; 1794 else if (!(pr_flags & PR_IP4_USER)) { 1795 pr->pr_flags |= ppr->pr_flags & PR_IP4; 1796 prison_ip_dup(ppr, pr, PR_INET); 1797 } 1798 #endif 1799 #ifdef INET6 1800 if (!(ch_flags & PR_IP6_USER)) 1801 pr->pr_flags |= PR_IP6 | PR_IP6_USER; 1802 else if (!(pr_flags & PR_IP6_USER)) { 1803 pr->pr_flags |= ppr->pr_flags & PR_IP6; 1804 prison_ip_dup(ppr, pr, PR_INET6); 1805 } 1806 #endif 1807 } 1808 #endif 1809 /* Source address selection is always on by default. */ 1810 pr->pr_flags |= _PR_IP_SADDRSEL; 1811 1812 pr->pr_securelevel = ppr->pr_securelevel; 1813 pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow; 1814 pr->pr_enforce_statfs = jail_default_enforce_statfs; 1815 pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum; 1816 1817 pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate; 1818 if (osrelstr == NULL) 1819 strlcpy(pr->pr_osrelease, ppr->pr_osrelease, 1820 sizeof(pr->pr_osrelease)); 1821 else 1822 strlcpy(pr->pr_osrelease, osrelstr, 1823 sizeof(pr->pr_osrelease)); 1824 1825 #ifdef VIMAGE 1826 /* 1827 * Allocate a new vnet if specified. 1828 * 1829 * Set PR_VNET now if so, so that the vnet is disposed of 1830 * properly when the jail is destroyed. 
1831 */ 1832 if (pr_flags & PR_VNET) { 1833 pr->pr_flags |= PR_VNET; 1834 pr->pr_vnet = vnet_alloc(); 1835 } else { 1836 pr->pr_vnet = ppr->pr_vnet; 1837 } 1838 #endif 1839 /* 1840 * Allocate a dedicated cpuset for each jail. 1841 * Unlike other initial settings, this may return an error. 1842 */ 1843 error = cpuset_create_root(ppr, &pr->pr_cpuset); 1844 if (error) 1845 goto done_deref; 1846 1847 mtx_lock(&pr->pr_mtx); 1848 drflags |= PD_LOCKED; 1849 } else { 1850 /* 1851 * Grab a reference for existing prisons, to ensure they 1852 * continue to exist for the duration of the call. 1853 */ 1854 if (!(drflags & PD_DEREF)) { 1855 prison_hold(pr); 1856 drflags |= PD_DEREF; 1857 } 1858 #if defined(VIMAGE) && (defined(INET) || defined(INET6)) 1859 if ((pr->pr_flags & PR_VNET) && 1860 (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { 1861 error = EINVAL; 1862 vfs_opterror(opts, 1863 "vnet jails cannot have IP address restrictions"); 1864 goto done_deref; 1865 } 1866 #endif 1867 #ifdef INET 1868 if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { 1869 error = EINVAL; 1870 vfs_opterror(opts, 1871 "ip4 cannot be changed after creation"); 1872 goto done_deref; 1873 } 1874 #endif 1875 #ifdef INET6 1876 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { 1877 error = EINVAL; 1878 vfs_opterror(opts, 1879 "ip6 cannot be changed after creation"); 1880 goto done_deref; 1881 } 1882 #endif 1883 } 1884 1885 /* Do final error checking before setting anything. 
*/ 1886 if (gotslevel) { 1887 if (slevel < ppr->pr_securelevel) { 1888 error = EPERM; 1889 goto done_deref; 1890 } 1891 } 1892 if (gotchildmax) { 1893 if (childmax >= ppr->pr_childmax) { 1894 error = EPERM; 1895 goto done_deref; 1896 } 1897 } 1898 if (gotenforce) { 1899 if (enforce < ppr->pr_enforce_statfs) { 1900 error = EPERM; 1901 goto done_deref; 1902 } 1903 } 1904 if (gotrsnum) { 1905 /* 1906 * devfs_rsnum is a uint16_t 1907 */ 1908 if (rsnum < 0 || rsnum > 65535) { 1909 error = EINVAL; 1910 goto done_deref; 1911 } 1912 /* 1913 * Nested jails always inherit parent's devfs ruleset 1914 */ 1915 if (jailed(td->td_ucred)) { 1916 if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) { 1917 error = EPERM; 1918 goto done_deref; 1919 } else 1920 rsnum = ppr->pr_devfs_rsnum; 1921 } 1922 } 1923 #ifdef INET 1924 if (ip4s > 0) { 1925 if ((ppr->pr_flags & PR_IP4) && 1926 !prison_ip_parent_match(ppr->pr_addrs[PR_INET], ip4, 1927 PR_INET)) { 1928 error = EPERM; 1929 goto done_deref; 1930 } 1931 if (!prison_ip_conflict_check(ppr, pr, ip4, PR_INET)) { 1932 error = EADDRINUSE; 1933 vfs_opterror(opts, "IPv4 addresses clash"); 1934 goto done_deref; 1935 } 1936 } 1937 #endif 1938 #ifdef INET6 1939 if (ip6s > 0) { 1940 if ((ppr->pr_flags & PR_IP6) && 1941 !prison_ip_parent_match(ppr->pr_addrs[PR_INET6], ip6, 1942 PR_INET6)) { 1943 error = EPERM; 1944 goto done_deref; 1945 } 1946 if (!prison_ip_conflict_check(ppr, pr, ip6, PR_INET6)) { 1947 error = EADDRINUSE; 1948 vfs_opterror(opts, "IPv6 addresses clash"); 1949 goto done_deref; 1950 } 1951 } 1952 #endif 1953 onamelen = namelen = 0; 1954 if (namelc != NULL) { 1955 /* Give a default name of the jid. Also allow the name to be 1956 * explicitly the jid - but not any other number, and only in 1957 * normal form (no leading zero/etc). 
1958 */ 1959 if (namelc[0] == '\0') 1960 snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid); 1961 else if ((strtoul(namelc, &p, 10) != jid || 1962 namelc[0] < '1' || namelc[0] > '9') && *p == '\0') { 1963 error = EINVAL; 1964 vfs_opterror(opts, 1965 "name cannot be numeric (unless it is the jid)"); 1966 goto done_deref; 1967 } 1968 /* 1969 * Make sure the name isn't too long for the prison or its 1970 * children. 1971 */ 1972 pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; 1973 onamelen = strlen(pr->pr_name + pnamelen); 1974 namelen = strlen(namelc); 1975 if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) { 1976 error = ENAMETOOLONG; 1977 goto done_deref; 1978 } 1979 FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { 1980 if (strlen(tpr->pr_name) + (namelen - onamelen) >= 1981 sizeof(pr->pr_name)) { 1982 error = ENAMETOOLONG; 1983 goto done_deref; 1984 } 1985 } 1986 } 1987 pr_allow_diff = pr_allow & ~ppr->pr_allow; 1988 if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) { 1989 error = EPERM; 1990 goto done_deref; 1991 } 1992 1993 /* 1994 * Let modules check their parameters. This requires unlocking and 1995 * then re-locking the prison, but this is still a valid state as long 1996 * as allprison_lock remains xlocked. 1997 */ 1998 mtx_unlock(&pr->pr_mtx); 1999 drflags &= ~PD_LOCKED; 2000 error = osd_jail_call(pr, PR_METHOD_CHECK, opts); 2001 if (error != 0) 2002 goto done_deref; 2003 mtx_lock(&pr->pr_mtx); 2004 drflags |= PD_LOCKED; 2005 2006 /* At this point, all valid parameters should have been noted. */ 2007 TAILQ_FOREACH(opt, opts, link) { 2008 if (!opt->seen && strcmp(opt->name, "errmsg")) { 2009 error = EINVAL; 2010 vfs_opterror(opts, "unknown parameter: %s", opt->name); 2011 goto done_deref; 2012 } 2013 } 2014 maybe_changed = true; 2015 2016 /* Set the parameters of the prison. 
*/ 2017 #ifdef INET 2018 redo_ip4 = false; 2019 if (pr_flags & PR_IP4_USER) { 2020 pr->pr_flags |= PR_IP4; 2021 prison_ip_set(pr, PR_INET, ip4); 2022 ip4 = NULL; 2023 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 2024 #ifdef VIMAGE 2025 if (tpr->pr_flags & PR_VNET) { 2026 descend = 0; 2027 continue; 2028 } 2029 #endif 2030 if (!prison_ip_restrict(tpr, PR_INET, NULL)) { 2031 redo_ip4 = true; 2032 descend = 0; 2033 } 2034 } 2035 } 2036 #endif 2037 #ifdef INET6 2038 redo_ip6 = false; 2039 if (pr_flags & PR_IP6_USER) { 2040 pr->pr_flags |= PR_IP6; 2041 prison_ip_set(pr, PR_INET6, ip6); 2042 ip6 = NULL; 2043 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 2044 #ifdef VIMAGE 2045 if (tpr->pr_flags & PR_VNET) { 2046 descend = 0; 2047 continue; 2048 } 2049 #endif 2050 if (!prison_ip_restrict(tpr, PR_INET6, NULL)) { 2051 redo_ip6 = true; 2052 descend = 0; 2053 } 2054 } 2055 } 2056 #endif 2057 if (gotslevel) { 2058 pr->pr_securelevel = slevel; 2059 /* Set all child jails to be at least this level. */ 2060 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) 2061 if (tpr->pr_securelevel < slevel) 2062 tpr->pr_securelevel = slevel; 2063 } 2064 if (gotchildmax) { 2065 pr->pr_childmax = childmax; 2066 /* Set all child jails to under this limit. */ 2067 FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level) 2068 if (tpr->pr_childmax > childmax - level) 2069 tpr->pr_childmax = childmax > level 2070 ? childmax - level : 0; 2071 } 2072 if (gotenforce) { 2073 pr->pr_enforce_statfs = enforce; 2074 /* Pass this restriction on to the children. */ 2075 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) 2076 if (tpr->pr_enforce_statfs < enforce) 2077 tpr->pr_enforce_statfs = enforce; 2078 } 2079 if (gotrsnum) { 2080 pr->pr_devfs_rsnum = rsnum; 2081 /* Pass this restriction on to the children. 
*/ 2082 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) 2083 tpr->pr_devfs_rsnum = rsnum; 2084 } 2085 if (namelc != NULL) { 2086 if (ppr == &prison0) 2087 strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name)); 2088 else 2089 snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", 2090 ppr->pr_name, namelc); 2091 /* Change this component of child names. */ 2092 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 2093 bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, 2094 strlen(tpr->pr_name + onamelen) + 1); 2095 bcopy(pr->pr_name, tpr->pr_name, namelen); 2096 } 2097 } 2098 if (path != NULL) { 2099 /* Try to keep a real-rooted full pathname. */ 2100 strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); 2101 pr->pr_root = root; 2102 root = NULL; 2103 } 2104 if (PR_HOST & ch_flags & ~pr_flags) { 2105 if (pr->pr_flags & PR_HOST) { 2106 /* 2107 * Copy the parent's host info. As with pr_ip4 above, 2108 * the lack of a lock on the parent is not a problem; 2109 * it is always set with allprison_lock at least 2110 * shared, and is held exclusively here. 2111 */ 2112 strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname, 2113 sizeof(pr->pr_hostname)); 2114 strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname, 2115 sizeof(pr->pr_domainname)); 2116 strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid, 2117 sizeof(pr->pr_hostuuid)); 2118 pr->pr_hostid = pr->pr_parent->pr_hostid; 2119 } 2120 } else if (host != NULL || domain != NULL || uuid != NULL || gothid) { 2121 /* Set this prison, and any descendants without PR_HOST. 
*/ 2122 if (host != NULL) 2123 strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname)); 2124 if (domain != NULL) 2125 strlcpy(pr->pr_domainname, domain, 2126 sizeof(pr->pr_domainname)); 2127 if (uuid != NULL) 2128 strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid)); 2129 if (gothid) 2130 pr->pr_hostid = hid; 2131 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 2132 if (tpr->pr_flags & PR_HOST) 2133 descend = 0; 2134 else { 2135 if (host != NULL) 2136 strlcpy(tpr->pr_hostname, 2137 pr->pr_hostname, 2138 sizeof(tpr->pr_hostname)); 2139 if (domain != NULL) 2140 strlcpy(tpr->pr_domainname, 2141 pr->pr_domainname, 2142 sizeof(tpr->pr_domainname)); 2143 if (uuid != NULL) 2144 strlcpy(tpr->pr_hostuuid, 2145 pr->pr_hostuuid, 2146 sizeof(tpr->pr_hostuuid)); 2147 if (gothid) 2148 tpr->pr_hostid = hid; 2149 } 2150 } 2151 } 2152 pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow; 2153 if ((tallow = ch_allow & ~pr_allow)) 2154 prison_set_allow_locked(pr, tallow, 0); 2155 /* 2156 * Persistent prisons get an extra reference, and prisons losing their 2157 * persist flag lose that reference. 2158 */ 2159 if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) { 2160 if (pr_flags & PR_PERSIST) { 2161 prison_hold(pr); 2162 /* 2163 * This may be a new prison's first user reference, 2164 * but wait to call it alive until after OSD calls 2165 * have had a chance to run (and perhaps to fail). 2166 */ 2167 refcount_acquire(&pr->pr_uref); 2168 } else { 2169 drflags |= PD_DEUREF; 2170 prison_free_not_last(pr); 2171 } 2172 } 2173 pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; 2174 mtx_unlock(&pr->pr_mtx); 2175 drflags &= ~PD_LOCKED; 2176 /* 2177 * Any errors past this point will need to de-persist newly created 2178 * prisons, as well as call remove methods. 
2179 */ 2180 if (created) 2181 drflags |= PD_KILL; 2182 2183 #ifdef RACCT 2184 if (racct_enable && created) 2185 prison_racct_attach(pr); 2186 #endif 2187 2188 /* Locks may have prevented a complete restriction of child IP 2189 * addresses. If so, allocate some more memory and try again. 2190 */ 2191 #ifdef INET 2192 while (redo_ip4) { 2193 ip4s = pr->pr_addrs[PR_INET]->ips; 2194 MPASS(ip4 == NULL); 2195 ip4 = prison_ip_alloc(PR_INET, ip4s, M_WAITOK); 2196 mtx_lock(&pr->pr_mtx); 2197 redo_ip4 = false; 2198 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 2199 #ifdef VIMAGE 2200 if (tpr->pr_flags & PR_VNET) { 2201 descend = 0; 2202 continue; 2203 } 2204 #endif 2205 if (!prison_ip_restrict(tpr, PR_INET, &ip4)) 2206 redo_ip4 = true; 2207 } 2208 mtx_unlock(&pr->pr_mtx); 2209 } 2210 #endif 2211 #ifdef INET6 2212 while (redo_ip6) { 2213 ip6s = pr->pr_addrs[PR_INET6]->ips; 2214 MPASS(ip6 == NULL); 2215 ip6 = prison_ip_alloc(PR_INET6, ip6s, M_WAITOK); 2216 mtx_lock(&pr->pr_mtx); 2217 redo_ip6 = false; 2218 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 2219 #ifdef VIMAGE 2220 if (tpr->pr_flags & PR_VNET) { 2221 descend = 0; 2222 continue; 2223 } 2224 #endif 2225 if (!prison_ip_restrict(tpr, PR_INET6, &ip6)) 2226 redo_ip6 = true; 2227 } 2228 mtx_unlock(&pr->pr_mtx); 2229 } 2230 #endif 2231 2232 /* Let the modules do their work. */ 2233 if (created) { 2234 error = osd_jail_call(pr, PR_METHOD_CREATE, opts); 2235 if (error) 2236 goto done_deref; 2237 } 2238 error = osd_jail_call(pr, PR_METHOD_SET, opts); 2239 if (error) 2240 goto done_deref; 2241 2242 /* 2243 * A new prison is now ready to be seen; either it has gained a user 2244 * reference via persistence, or is about to gain one via attachment. 
2245 */ 2246 if (created) { 2247 sx_assert(&allprison_lock, SX_XLOCKED); 2248 mtx_lock(&ppr->pr_mtx); 2249 knote_fork(ppr->pr_klist, pr->pr_id); 2250 mtx_unlock(&ppr->pr_mtx); 2251 mtx_lock(&pr->pr_mtx); 2252 drflags |= PD_LOCKED; 2253 pr->pr_state = PRISON_STATE_ALIVE; 2254 } 2255 2256 /* Attach this process to the prison if requested. */ 2257 if (flags & JAIL_ATTACH) { 2258 error = do_jail_attach(td, pr, 2259 prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS)); 2260 drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED); 2261 if (error) { 2262 vfs_opterror(opts, "attach failed"); 2263 goto done_deref; 2264 } 2265 } 2266 2267 #ifdef RACCT 2268 if (racct_enable && !created) { 2269 if (drflags & PD_LOCKED) { 2270 mtx_unlock(&pr->pr_mtx); 2271 drflags &= ~PD_LOCKED; 2272 } 2273 if (drflags & PD_LIST_XLOCKED) { 2274 sx_xunlock(&allprison_lock); 2275 drflags &= ~PD_LIST_XLOCKED; 2276 } 2277 prison_racct_modify(pr); 2278 } 2279 #endif 2280 2281 if (created && pr != &prison0 && (pr->pr_allow & PR_ALLOW_NFSD) != 0 && 2282 (pr->pr_root->v_vflag & VV_ROOT) == 0) 2283 printf("Warning jail jid=%d: mountd/nfsd requires a separate" 2284 " file system\n", pr->pr_id); 2285 2286 /* 2287 * Now that the prison is fully created without error, set the 2288 * jail descriptor if one was requested. This is the only 2289 * parameter that is returned to the caller (except the error 2290 * message). 2291 */ 2292 if (jfd_out >= 0) { 2293 if (!(drflags & PD_LOCKED)) { 2294 mtx_lock(&pr->pr_mtx); 2295 drflags |= PD_LOCKED; 2296 } 2297 jfd_pos = 2 * vfs_getopt_pos(opts, "desc") + 1; 2298 if (optuio->uio_segflg == UIO_SYSSPACE) 2299 *(int*)optuio->uio_iov[jfd_pos].iov_base = jfd_out; 2300 else 2301 (void)copyout(&jfd_out, 2302 optuio->uio_iov[jfd_pos].iov_base, sizeof(jfd_out)); 2303 jaildesc_set_prison(jfp_out, pr); 2304 } 2305 2306 drflags &= ~PD_KILL; 2307 td->td_retval[0] = pr->pr_id; 2308 2309 done_deref: 2310 /* 2311 * Report changes to kevent. 
This can happen even if the 2312 * system call fails, as changes might have been made before 2313 * the failure. 2314 */ 2315 if (maybe_changed && !created) 2316 prison_knote(pr, NOTE_JAIL_SET); 2317 /* Release any temporary prison holds and/or locks. */ 2318 if (pr != NULL) 2319 prison_deref(pr, drflags); 2320 else if (drflags & PD_LIST_SLOCKED) 2321 sx_sunlock(&allprison_lock); 2322 else if (drflags & PD_LIST_XLOCKED) 2323 sx_xunlock(&allprison_lock); 2324 if (root != NULL) 2325 vrele(root); 2326 done_errmsg: 2327 if (error) { 2328 /* Write the error message back to userspace. */ 2329 if (vfs_getopt(opts, "errmsg", (void **)&errmsg, 2330 &errmsg_len) == 0 && errmsg_len > 0) { 2331 errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1; 2332 if (optuio->uio_segflg == UIO_SYSSPACE) 2333 bcopy(errmsg, 2334 optuio->uio_iov[errmsg_pos].iov_base, 2335 errmsg_len); 2336 else 2337 (void)copyout(errmsg, 2338 optuio->uio_iov[errmsg_pos].iov_base, 2339 errmsg_len); 2340 } 2341 } 2342 done_free: 2343 /* Clean up other resources. */ 2344 #ifdef INET 2345 prison_ip_free(ip4); 2346 #endif 2347 #ifdef INET6 2348 prison_ip_free(ip6); 2349 #endif 2350 if (jfp_out != NULL) 2351 fdrop(jfp_out, td); 2352 if (error && jfd_out >= 0) 2353 (void)kern_close(td, jfd_out); 2354 if (g_path != NULL) 2355 free(g_path, M_TEMP); 2356 vfs_freeopts(opts); 2357 prison_free(mypr); 2358 return (error); 2359 } 2360 2361 /* 2362 * Find the next available prison ID. Return the ID on success, or zero 2363 * on failure. Also set a pointer to the allprison list entry the prison 2364 * should be inserted before. 
2365 */ 2366 static int 2367 get_next_prid(struct prison **insprp) 2368 { 2369 struct prison *inspr; 2370 int jid, maxid; 2371 2372 jid = lastprid % JAIL_MAX + 1; 2373 if (TAILQ_EMPTY(&allprison) || 2374 TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) { 2375 /* 2376 * A common case is for all jails to be implicitly numbered, 2377 * which means they'll go on the end of the list, at least 2378 * for the first JAIL_MAX times. 2379 */ 2380 inspr = NULL; 2381 } else { 2382 /* 2383 * Take two passes through the allprison list: first starting 2384 * with the proposed jid, then ending with it. 2385 */ 2386 for (maxid = JAIL_MAX; maxid != 0; ) { 2387 TAILQ_FOREACH(inspr, &allprison, pr_list) { 2388 if (inspr->pr_id < jid) 2389 continue; 2390 if (inspr->pr_id > jid) { 2391 /* Found an opening. */ 2392 maxid = 0; 2393 break; 2394 } 2395 if (++jid > maxid) { 2396 if (lastprid == maxid || lastprid == 0) 2397 { 2398 /* 2399 * The entire legal range 2400 * has been traversed 2401 */ 2402 return 0; 2403 } 2404 /* Try again from the start. */ 2405 jid = 1; 2406 maxid = lastprid; 2407 break; 2408 } 2409 } 2410 if (inspr == NULL) { 2411 /* Found room at the end of the list. */ 2412 break; 2413 } 2414 } 2415 } 2416 *insprp = inspr; 2417 lastprid = jid; 2418 return (jid); 2419 } 2420 2421 /* 2422 * Find the next available ID for a renumbered dead prison. This is the same 2423 * as get_next_prid, but counting backward from the end of the range. 2424 */ 2425 static int 2426 get_next_deadid(struct prison **dinsprp) 2427 { 2428 struct prison *dinspr; 2429 int deadid, minid; 2430 2431 deadid = lastdeadid ? lastdeadid - 1 : JAIL_MAX; 2432 /* 2433 * Take two reverse passes through the allprison list: first 2434 * starting with the proposed deadid, then ending with it. 2435 */ 2436 for (minid = 1; minid != 0; ) { 2437 TAILQ_FOREACH_REVERSE(dinspr, &allprison, prisonlist, pr_list) { 2438 if (dinspr->pr_id > deadid) 2439 continue; 2440 if (dinspr->pr_id < deadid) { 2441 /* Found an opening. 
*/ 2442 minid = 0; 2443 break; 2444 } 2445 if (--deadid < minid) { 2446 if (lastdeadid == minid || lastdeadid == 0) 2447 { 2448 /* 2449 * The entire legal range 2450 * has been traversed 2451 */ 2452 return 0; 2453 } 2454 /* Try again from the end. */ 2455 deadid = JAIL_MAX; 2456 minid = lastdeadid; 2457 break; 2458 } 2459 } 2460 if (dinspr == NULL) { 2461 /* Found room at the beginning of the list. */ 2462 break; 2463 } 2464 } 2465 *dinsprp = dinspr; 2466 lastdeadid = deadid; 2467 return (deadid); 2468 } 2469 2470 /* 2471 * struct jail_get_args { 2472 * struct iovec *iovp; 2473 * unsigned int iovcnt; 2474 * int flags; 2475 * }; 2476 */ 2477 int 2478 sys_jail_get(struct thread *td, struct jail_get_args *uap) 2479 { 2480 struct uio *auio; 2481 int error; 2482 2483 /* Check that we have an even number of iovecs. */ 2484 if (uap->iovcnt & 1) 2485 return (EINVAL); 2486 2487 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 2488 if (error) 2489 return (error); 2490 error = kern_jail_get(td, auio, uap->flags); 2491 if (error == 0) 2492 error = copyout(auio->uio_iov, uap->iovp, 2493 uap->iovcnt * sizeof(struct iovec)); 2494 freeuio(auio); 2495 return (error); 2496 } 2497 2498 int 2499 kern_jail_get(struct thread *td, struct uio *optuio, int flags) 2500 { 2501 struct bool_flags *bf; 2502 struct file *jfp_out; 2503 struct jaildesc *desc_in; 2504 struct jailsys_flags *jsf; 2505 struct prison *pr, *mypr; 2506 struct vfsopt *opt; 2507 struct vfsoptlist *opts; 2508 char *errmsg, *name; 2509 int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos; 2510 int jfd_in, jfd_out; 2511 unsigned f; 2512 2513 if (flags & ~JAIL_GET_MASK) 2514 return (EINVAL); 2515 if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) 2516 == (JAIL_USE_DESC | JAIL_AT_DESC)) 2517 return (EINVAL); 2518 2519 /* Get the parameter list. 
*/ 2520 error = vfs_buildopts(optuio, &opts); 2521 if (error) 2522 return (error); 2523 errmsg_pos = vfs_getopt_pos(opts, "errmsg"); 2524 mypr = td->td_ucred->cr_prison; 2525 prison_hold(mypr); 2526 pr = NULL; 2527 jfp_out = NULL; 2528 jfd_out = -1; 2529 2530 /* 2531 * Find the prison specified by one of: desc, lastjid, jid, name. 2532 */ 2533 sx_slock(&allprison_lock); 2534 drflags = PD_LIST_SLOCKED; 2535 2536 error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in)); 2537 if (error == ENOENT) { 2538 if (flags & (JAIL_AT_DESC | JAIL_GET_DESC | JAIL_OWN_DESC)) { 2539 vfs_opterror(opts, "missing desc"); 2540 goto done; 2541 } 2542 } else if (error == 0) { 2543 if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | 2544 JAIL_OWN_DESC))) { 2545 vfs_opterror(opts, "unexpected desc"); 2546 goto done; 2547 } 2548 if (flags & JAIL_USE_DESC) { 2549 /* Get the jail from its descriptor. */ 2550 error = jaildesc_find(td, jfd_in, &desc_in, &pr, NULL); 2551 if (error) { 2552 vfs_opterror(opts, error == ENOENT 2553 ? "descriptor to dead jail" 2554 : "not a jail descriptor"); 2555 goto done; 2556 } 2557 drflags |= PD_DEREF; 2558 error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid, 2559 desc_in->jd_gid, VREAD, td->td_ucred); 2560 JAILDESC_UNLOCK(desc_in); 2561 if (error != 0) 2562 goto done; 2563 mtx_lock(&pr->pr_mtx); 2564 drflags |= PD_LOCKED; 2565 if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { 2566 error = ENOENT; 2567 vfs_opterror(opts, "jail %d is dying", 2568 pr->pr_id); 2569 goto done; 2570 } 2571 goto found_prison; 2572 } 2573 if (flags & JAIL_AT_DESC) { 2574 /* Look up jails based on the descriptor's prison. */ 2575 prison_free(mypr); 2576 error = jaildesc_find(td, jfd_in, &desc_in, &mypr, 2577 NULL); 2578 if (error != 0) { 2579 vfs_opterror(opts, error == ENOENT 2580 ? 
"descriptor to dead jail" 2581 : "not a jail descriptor"); 2582 goto done; 2583 } 2584 error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid, 2585 desc_in->jd_gid, VEXEC, td->td_ucred); 2586 JAILDESC_UNLOCK(desc_in); 2587 if (error != 0) 2588 goto done; 2589 } 2590 if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) { 2591 /* Allocate a jail descriptor to return later. */ 2592 error = jaildesc_alloc(td, &jfp_out, &jfd_out, 2593 flags & JAIL_OWN_DESC); 2594 if (error) 2595 goto done; 2596 } 2597 } else 2598 goto done; 2599 2600 error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); 2601 if (error == 0) { 2602 TAILQ_FOREACH(pr, &allprison, pr_list) { 2603 if (pr->pr_id > jid && 2604 ((flags & JAIL_DYING) || prison_isalive(pr)) && 2605 prison_ischild(mypr, pr)) { 2606 mtx_lock(&pr->pr_mtx); 2607 drflags |= PD_LOCKED; 2608 goto found_prison; 2609 } 2610 } 2611 error = ENOENT; 2612 vfs_opterror(opts, "no jail after %d", jid); 2613 goto done; 2614 } else if (error != ENOENT) 2615 goto done; 2616 2617 error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); 2618 if (error == 0) { 2619 if (jid != 0) { 2620 pr = prison_find_child(mypr, jid); 2621 if (pr != NULL) { 2622 drflags |= PD_LOCKED; 2623 if (!(prison_isalive(pr) || 2624 (flags & JAIL_DYING))) { 2625 error = ENOENT; 2626 vfs_opterror(opts, "jail %d is dying", 2627 jid); 2628 goto done; 2629 } 2630 goto found_prison; 2631 } 2632 error = ENOENT; 2633 vfs_opterror(opts, "jail %d not found", jid); 2634 goto done; 2635 } 2636 } else if (error != ENOENT) 2637 goto done; 2638 2639 error = vfs_getopt(opts, "name", (void **)&name, &len); 2640 if (error == 0) { 2641 if (len == 0 || name[len - 1] != '\0') { 2642 error = EINVAL; 2643 goto done; 2644 } 2645 pr = prison_find_name(mypr, name); 2646 if (pr != NULL) { 2647 drflags |= PD_LOCKED; 2648 if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { 2649 error = ENOENT; 2650 vfs_opterror(opts, "jail \"%s\" is dying", 2651 name); 2652 goto done; 2653 } 2654 goto found_prison; 2655 } 2656 
error = ENOENT; 2657 vfs_opterror(opts, "jail \"%s\" not found", name); 2658 goto done; 2659 } else if (error != ENOENT) 2660 goto done; 2661 2662 vfs_opterror(opts, "no jail specified"); 2663 error = ENOENT; 2664 goto done; 2665 2666 found_prison: 2667 /* Get the parameters of the prison. */ 2668 if (!(drflags & PD_DEREF)) { 2669 prison_hold(pr); 2670 drflags |= PD_DEREF; 2671 } 2672 td->td_retval[0] = pr->pr_id; 2673 if (jfd_out >= 0) { 2674 error = vfs_setopt(opts, "desc", &jfd_out, sizeof(jfd_out)); 2675 if (error != 0 && error != ENOENT) 2676 goto done; 2677 jaildesc_set_prison(jfp_out, pr); 2678 } 2679 error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); 2680 if (error != 0 && error != ENOENT) 2681 goto done; 2682 i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id; 2683 error = vfs_setopt(opts, "parent", &i, sizeof(i)); 2684 if (error != 0 && error != ENOENT) 2685 goto done; 2686 error = vfs_setopts(opts, "name", prison_name(mypr, pr)); 2687 if (error != 0 && error != ENOENT) 2688 goto done; 2689 error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id, 2690 sizeof(pr->pr_cpuset->cs_id)); 2691 if (error != 0 && error != ENOENT) 2692 goto done; 2693 error = vfs_setopts(opts, "path", prison_path(mypr, pr)); 2694 if (error != 0 && error != ENOENT) 2695 goto done; 2696 #ifdef INET 2697 error = vfs_setopt_part(opts, "ip4.addr", pr->pr_addrs[PR_INET]->pr_ip, 2698 pr->pr_addrs[PR_INET] ? pr->pr_addrs[PR_INET]->ips * 2699 pr_families[PR_INET].size : 0 ); 2700 if (error != 0 && error != ENOENT) 2701 goto done; 2702 #endif 2703 #ifdef INET6 2704 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_addrs[PR_INET6]->pr_ip, 2705 pr->pr_addrs[PR_INET6] ? 
pr->pr_addrs[PR_INET6]->ips * 2706 pr_families[PR_INET6].size : 0 ); 2707 if (error != 0 && error != ENOENT) 2708 goto done; 2709 #endif 2710 error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel, 2711 sizeof(pr->pr_securelevel)); 2712 if (error != 0 && error != ENOENT) 2713 goto done; 2714 error = vfs_setopt(opts, "children.cur", &pr->pr_childcount, 2715 sizeof(pr->pr_childcount)); 2716 if (error != 0 && error != ENOENT) 2717 goto done; 2718 error = vfs_setopt(opts, "children.max", &pr->pr_childmax, 2719 sizeof(pr->pr_childmax)); 2720 if (error != 0 && error != ENOENT) 2721 goto done; 2722 error = vfs_setopts(opts, "host.hostname", pr->pr_hostname); 2723 if (error != 0 && error != ENOENT) 2724 goto done; 2725 error = vfs_setopts(opts, "host.domainname", pr->pr_domainname); 2726 if (error != 0 && error != ENOENT) 2727 goto done; 2728 error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid); 2729 if (error != 0 && error != ENOENT) 2730 goto done; 2731 #ifdef COMPAT_FREEBSD32 2732 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { 2733 uint32_t hid32 = pr->pr_hostid; 2734 2735 error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32)); 2736 } else 2737 #endif 2738 error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid, 2739 sizeof(pr->pr_hostid)); 2740 if (error != 0 && error != ENOENT) 2741 goto done; 2742 error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs, 2743 sizeof(pr->pr_enforce_statfs)); 2744 if (error != 0 && error != ENOENT) 2745 goto done; 2746 error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum, 2747 sizeof(pr->pr_devfs_rsnum)); 2748 if (error != 0 && error != ENOENT) 2749 goto done; 2750 for (bf = pr_flag_bool; 2751 bf < pr_flag_bool + nitems(pr_flag_bool); 2752 bf++) { 2753 i = (pr->pr_flags & bf->flag) ? 
1 : 0; 2754 error = vfs_setopt(opts, bf->name, &i, sizeof(i)); 2755 if (error != 0 && error != ENOENT) 2756 goto done; 2757 i = !i; 2758 error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); 2759 if (error != 0 && error != ENOENT) 2760 goto done; 2761 } 2762 for (jsf = pr_flag_jailsys; 2763 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); 2764 jsf++) { 2765 f = pr->pr_flags & (jsf->disable | jsf->new); 2766 i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE 2767 : (f == jsf->new) ? JAIL_SYS_NEW 2768 : JAIL_SYS_INHERIT; 2769 error = vfs_setopt(opts, jsf->name, &i, sizeof(i)); 2770 if (error != 0 && error != ENOENT) 2771 goto done; 2772 } 2773 for (bf = pr_flag_allow; 2774 bf < pr_flag_allow + nitems(pr_flag_allow) && 2775 atomic_load_int(&bf->flag) != 0; 2776 bf++) { 2777 i = (pr->pr_allow & bf->flag) ? 1 : 0; 2778 error = vfs_setopt(opts, bf->name, &i, sizeof(i)); 2779 if (error != 0 && error != ENOENT) 2780 goto done; 2781 i = !i; 2782 error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); 2783 if (error != 0 && error != ENOENT) 2784 goto done; 2785 } 2786 i = !prison_isalive(pr); 2787 error = vfs_setopt(opts, "dying", &i, sizeof(i)); 2788 if (error != 0 && error != ENOENT) 2789 goto done; 2790 i = !i; 2791 error = vfs_setopt(opts, "nodying", &i, sizeof(i)); 2792 if (error != 0 && error != ENOENT) 2793 goto done; 2794 error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate, 2795 sizeof(pr->pr_osreldate)); 2796 if (error != 0 && error != ENOENT) 2797 goto done; 2798 error = vfs_setopts(opts, "osrelease", pr->pr_osrelease); 2799 if (error != 0 && error != ENOENT) 2800 goto done; 2801 2802 /* Get the module parameters. */ 2803 mtx_unlock(&pr->pr_mtx); 2804 drflags &= ~PD_LOCKED; 2805 error = osd_jail_call(pr, PR_METHOD_GET, opts); 2806 if (error) 2807 goto done; 2808 prison_deref(pr, drflags); 2809 pr = NULL; 2810 drflags = 0; 2811 2812 /* By now, all parameters should have been noted. 
*/ 2813 TAILQ_FOREACH(opt, opts, link) { 2814 if (!opt->seen && 2815 (strstr(opt->name, JAIL_META_PRIVATE ".") == opt->name || 2816 strstr(opt->name, JAIL_META_SHARED ".") == opt->name)) { 2817 /* Communicate back a missing key. */ 2818 free(opt->value, M_MOUNT); 2819 opt->value = NULL; 2820 opt->len = 0; 2821 continue; 2822 } 2823 if (!opt->seen && strcmp(opt->name, "errmsg")) { 2824 error = EINVAL; 2825 vfs_opterror(opts, "unknown parameter: %s", opt->name); 2826 goto done; 2827 } 2828 } 2829 2830 /* Write the fetched parameters back to userspace. */ 2831 error = 0; 2832 TAILQ_FOREACH(opt, opts, link) { 2833 if (opt->pos >= 0 && opt->pos != errmsg_pos) { 2834 pos = 2 * opt->pos + 1; 2835 optuio->uio_iov[pos].iov_len = opt->len; 2836 if (opt->value != NULL) { 2837 if (optuio->uio_segflg == UIO_SYSSPACE) { 2838 bcopy(opt->value, 2839 optuio->uio_iov[pos].iov_base, 2840 opt->len); 2841 } else { 2842 error = copyout(opt->value, 2843 optuio->uio_iov[pos].iov_base, 2844 opt->len); 2845 if (error) 2846 break; 2847 } 2848 } 2849 } 2850 } 2851 2852 done: 2853 /* Release any temporary prison holds and/or locks. */ 2854 if (pr != NULL) 2855 prison_deref(pr, drflags); 2856 else if (drflags & PD_LIST_SLOCKED) 2857 sx_sunlock(&allprison_lock); 2858 else if (drflags & PD_LIST_XLOCKED) 2859 sx_xunlock(&allprison_lock); 2860 /* Clean up other resources. */ 2861 if (jfp_out != NULL) 2862 (void)fdrop(jfp_out, td); 2863 if (error && jfd_out >= 0) 2864 (void)kern_close(td, jfd_out); 2865 if (error && errmsg_pos >= 0) { 2866 /* Write the error message back to userspace. 
*/ 2867 vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); 2868 errmsg_pos = 2 * errmsg_pos + 1; 2869 if (errmsg_len > 0) { 2870 if (optuio->uio_segflg == UIO_SYSSPACE) 2871 bcopy(errmsg, 2872 optuio->uio_iov[errmsg_pos].iov_base, 2873 errmsg_len); 2874 else 2875 (void)copyout(errmsg, 2876 optuio->uio_iov[errmsg_pos].iov_base, 2877 errmsg_len); 2878 } 2879 } 2880 vfs_freeopts(opts); 2881 prison_free(mypr); 2882 return (error); 2883 } 2884 2885 /* 2886 * struct jail_remove_args { 2887 * int jid; 2888 * }; 2889 */ 2890 int 2891 sys_jail_remove(struct thread *td, struct jail_remove_args *uap) 2892 { 2893 struct prison *pr; 2894 int error; 2895 2896 error = priv_check(td, PRIV_JAIL_REMOVE); 2897 if (error) 2898 return (error); 2899 2900 sx_xlock(&allprison_lock); 2901 pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); 2902 if (pr == NULL) { 2903 sx_xunlock(&allprison_lock); 2904 return (EINVAL); 2905 } 2906 prison_hold(pr); 2907 prison_remove(pr); 2908 return (0); 2909 } 2910 2911 /* 2912 * struct jail_remove_jd_args { 2913 * int fd; 2914 * }; 2915 */ 2916 int 2917 sys_jail_remove_jd(struct thread *td, struct jail_remove_jd_args *uap) 2918 { 2919 struct jaildesc *jd; 2920 struct prison *pr; 2921 struct ucred *jdcred; 2922 int error; 2923 2924 error = jaildesc_find(td, uap->fd, &jd, &pr, &jdcred); 2925 if (error) 2926 return (error); 2927 /* 2928 * Check file permissions using the current credentials, and 2929 * operation permissions using the descriptor's credentials. 2930 */ 2931 error = vaccess(VREG, jd->jd_mode, jd->jd_uid, jd->jd_gid, VWRITE, 2932 td->td_ucred); 2933 JAILDESC_UNLOCK(jd); 2934 if (error == 0) 2935 error = priv_check_cred(jdcred, PRIV_JAIL_REMOVE); 2936 crfree(jdcred); 2937 if (error) { 2938 prison_free(pr); 2939 return (error); 2940 } 2941 sx_xlock(&allprison_lock); 2942 mtx_lock(&pr->pr_mtx); 2943 prison_remove(pr); 2944 return (0); 2945 } 2946 2947 /* 2948 * Begin the removal process for a prison. 
The allprison lock should 2949 * be held exclusively, and the prison should be both locked and held. 2950 */ 2951 void 2952 prison_remove(struct prison *pr) 2953 { 2954 sx_assert(&allprison_lock, SA_XLOCKED); 2955 mtx_assert(&pr->pr_mtx, MA_OWNED); 2956 if (!prison_isalive(pr)) { 2957 /* Silently ignore already-dying prisons. */ 2958 mtx_unlock(&pr->pr_mtx); 2959 sx_xunlock(&allprison_lock); 2960 return; 2961 } 2962 prison_deref(pr, PD_KILL | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); 2963 } 2964 2965 /* 2966 * struct jail_attach_args { 2967 * int jid; 2968 * }; 2969 */ 2970 int 2971 sys_jail_attach(struct thread *td, struct jail_attach_args *uap) 2972 { 2973 struct prison *pr; 2974 int error; 2975 2976 error = priv_check(td, PRIV_JAIL_ATTACH); 2977 if (error) 2978 return (error); 2979 2980 sx_slock(&allprison_lock); 2981 pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); 2982 if (pr == NULL) { 2983 sx_sunlock(&allprison_lock); 2984 return (EINVAL); 2985 } 2986 2987 /* Do not allow a process to attach to a prison that is not alive. */ 2988 if (!prison_isalive(pr)) { 2989 mtx_unlock(&pr->pr_mtx); 2990 sx_sunlock(&allprison_lock); 2991 return (EINVAL); 2992 } 2993 2994 return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED)); 2995 } 2996 2997 /* 2998 * struct jail_attach_jd_args { 2999 * int fd; 3000 * }; 3001 */ 3002 int 3003 sys_jail_attach_jd(struct thread *td, struct jail_attach_jd_args *uap) 3004 { 3005 struct jaildesc *jd; 3006 struct prison *pr; 3007 struct ucred *jdcred; 3008 int drflags, error; 3009 3010 sx_slock(&allprison_lock); 3011 drflags = PD_LIST_SLOCKED; 3012 error = jaildesc_find(td, uap->fd, &jd, &pr, &jdcred); 3013 if (error) 3014 goto fail; 3015 drflags |= PD_DEREF; 3016 /* 3017 * Check file permissions using the current credentials, and 3018 * operation permissions using the descriptor's credentials. 
3019 */ 3020 error = vaccess(VREG, jd->jd_mode, jd->jd_uid, jd->jd_gid, VEXEC, 3021 td->td_ucred); 3022 JAILDESC_UNLOCK(jd); 3023 if (error == 0) 3024 error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH); 3025 crfree(jdcred); 3026 if (error) 3027 goto fail; 3028 mtx_lock(&pr->pr_mtx); 3029 drflags |= PD_LOCKED; 3030 3031 /* Do not allow a process to attach to a prison that is not alive. */ 3032 if (!prison_isalive(pr)) { 3033 error = EINVAL; 3034 goto fail; 3035 } 3036 3037 return (do_jail_attach(td, pr, drflags)); 3038 3039 fail: 3040 prison_deref(pr, drflags); 3041 return (error); 3042 } 3043 3044 static int 3045 do_jail_attach(struct thread *td, struct prison *pr, int drflags) 3046 { 3047 struct proc *p; 3048 struct ucred *newcred, *oldcred; 3049 int error; 3050 3051 mtx_assert(&pr->pr_mtx, MA_OWNED); 3052 sx_assert(&allprison_lock, SX_LOCKED); 3053 drflags &= PD_LOCK_FLAGS; 3054 /* 3055 * XXX: Note that there is a slight race here if two threads 3056 * in the same privileged process attempt to attach to two 3057 * different jails at the same time. It is important for 3058 * user processes not to do this, or they might end up with 3059 * a process root from one prison, but attached to the jail 3060 * of another. 3061 */ 3062 if (!(drflags & PD_DEREF)) { 3063 prison_hold(pr); 3064 drflags |= PD_DEREF; 3065 } 3066 refcount_acquire(&pr->pr_uref); 3067 drflags |= PD_DEUREF; 3068 mtx_unlock(&pr->pr_mtx); 3069 drflags &= ~PD_LOCKED; 3070 3071 /* Let modules do whatever they need to prepare for attaching. */ 3072 error = osd_jail_call(pr, PR_METHOD_ATTACH, td); 3073 if (error) { 3074 prison_deref(pr, drflags); 3075 return (error); 3076 } 3077 sx_unlock(&allprison_lock); 3078 drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED); 3079 3080 /* 3081 * Reparent the newly attached process to this jail. 
3082 */ 3083 p = td->td_proc; 3084 error = cpuset_setproc_update_set(p, pr->pr_cpuset); 3085 if (error) 3086 goto e_revert_osd; 3087 3088 vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); 3089 if ((error = change_dir(pr->pr_root, td)) != 0) 3090 goto e_unlock; 3091 #ifdef MAC 3092 if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) 3093 goto e_unlock; 3094 #endif 3095 VOP_UNLOCK(pr->pr_root); 3096 if ((error = pwd_chroot_chdir(td, pr->pr_root))) 3097 goto e_revert_osd; 3098 3099 newcred = crget(); 3100 PROC_LOCK(p); 3101 oldcred = crcopysafe(p, newcred); 3102 newcred->cr_prison = pr; 3103 proc_set_cred(p, newcred); 3104 setsugid(p); 3105 #ifdef RACCT 3106 racct_proc_ucred_changed(p, oldcred, newcred); 3107 crhold(newcred); 3108 #endif 3109 PROC_UNLOCK(p); 3110 #ifdef RCTL 3111 rctl_proc_ucred_changed(p, newcred); 3112 crfree(newcred); 3113 #endif 3114 prison_proc_relink(oldcred->cr_prison, pr, p); 3115 prison_deref(oldcred->cr_prison, drflags); 3116 crfree(oldcred); 3117 prison_knote(pr, NOTE_JAIL_ATTACH | td->td_proc->p_pid); 3118 3119 /* 3120 * If the prison was killed while changing credentials, die along 3121 * with it. 3122 */ 3123 if (!prison_isalive(pr)) { 3124 PROC_LOCK(p); 3125 kern_psignal(p, SIGKILL); 3126 PROC_UNLOCK(p); 3127 } 3128 3129 return (0); 3130 3131 e_unlock: 3132 VOP_UNLOCK(pr->pr_root); 3133 e_revert_osd: 3134 /* Tell modules this thread is still in its old jail after all. */ 3135 sx_slock(&allprison_lock); 3136 drflags |= PD_LIST_SLOCKED; 3137 (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td); 3138 prison_deref(pr, drflags); 3139 return (error); 3140 } 3141 3142 /* 3143 * Returns a locked prison instance, or NULL on failure. 
3144 */ 3145 struct prison * 3146 prison_find(int prid) 3147 { 3148 struct prison *pr; 3149 3150 sx_assert(&allprison_lock, SX_LOCKED); 3151 TAILQ_FOREACH(pr, &allprison, pr_list) { 3152 if (pr->pr_id < prid) 3153 continue; 3154 if (pr->pr_id > prid) 3155 break; 3156 KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); 3157 mtx_lock(&pr->pr_mtx); 3158 return (pr); 3159 } 3160 return (NULL); 3161 } 3162 3163 /* 3164 * Find a prison that is a descendant of mypr. Returns a locked prison or NULL. 3165 */ 3166 struct prison * 3167 prison_find_child(struct prison *mypr, int prid) 3168 { 3169 struct prison *pr; 3170 int descend; 3171 3172 sx_assert(&allprison_lock, SX_LOCKED); 3173 FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { 3174 if (pr->pr_id == prid) { 3175 KASSERT(prison_isvalid(pr), 3176 ("Found invalid prison %p", pr)); 3177 mtx_lock(&pr->pr_mtx); 3178 return (pr); 3179 } 3180 } 3181 return (NULL); 3182 } 3183 3184 /* 3185 * Look for the name relative to mypr. Returns a locked prison or NULL. 3186 */ 3187 struct prison * 3188 prison_find_name(struct prison *mypr, const char *name) 3189 { 3190 struct prison *pr, *deadpr; 3191 size_t mylen; 3192 int descend; 3193 3194 sx_assert(&allprison_lock, SX_LOCKED); 3195 mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1; 3196 deadpr = NULL; 3197 FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { 3198 if (!strcmp(pr->pr_name + mylen, name)) { 3199 KASSERT(prison_isvalid(pr), 3200 ("Found invalid prison %p", pr)); 3201 if (prison_isalive(pr)) { 3202 mtx_lock(&pr->pr_mtx); 3203 return (pr); 3204 } 3205 deadpr = pr; 3206 } 3207 } 3208 /* There was no valid prison - perhaps there was a dying one. */ 3209 if (deadpr != NULL) 3210 mtx_lock(&deadpr->pr_mtx); 3211 return (deadpr); 3212 } 3213 3214 /* 3215 * See if a prison has the specific flag set. 
The prison should be locked, 3216 * unless checking for flags that are only set at jail creation (such as 3217 * PR_IP4 and PR_IP6), or only the single bit is examined, without regard 3218 * to any other prison data. 3219 */ 3220 bool 3221 prison_flag(struct ucred *cred, unsigned flag) 3222 { 3223 3224 return ((cred->cr_prison->pr_flags & flag) != 0); 3225 } 3226 3227 /* 3228 * See if a prison has the specific allow flag set. 3229 * The prison *should* be locked, or only a single bit is examined, without 3230 * regard to any other prison data. 3231 */ 3232 bool 3233 prison_allow(struct ucred *cred, unsigned flag) 3234 { 3235 3236 return ((cred->cr_prison->pr_allow & flag) != 0); 3237 } 3238 3239 /* 3240 * Hold a prison reference, by incrementing pr_ref. It is generally 3241 * an error to hold a prison that does not already have a reference. 3242 * A prison record will remain valid as long as it has at least one 3243 * reference, and will not be removed as long as either the prison 3244 * mutex or the allprison lock is held (allprison_lock may be shared). 3245 */ 3246 void 3247 prison_hold_locked(struct prison *pr) 3248 { 3249 3250 /* Locking is no longer required. */ 3251 prison_hold(pr); 3252 } 3253 3254 void 3255 prison_hold(struct prison *pr) 3256 { 3257 #ifdef INVARIANTS 3258 int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref); 3259 3260 KASSERT(was_valid, 3261 ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id)); 3262 #else 3263 refcount_acquire(&pr->pr_ref); 3264 #endif 3265 } 3266 3267 /* 3268 * Remove a prison reference. If that was the last reference, the 3269 * prison will be removed (at a later time). 3270 */ 3271 void 3272 prison_free_locked(struct prison *pr) 3273 { 3274 3275 mtx_assert(&pr->pr_mtx, MA_OWNED); 3276 /* 3277 * Locking is no longer required, but unlock because the caller 3278 * expects it. 
3279 */ 3280 mtx_unlock(&pr->pr_mtx); 3281 prison_free(pr); 3282 } 3283 3284 void 3285 prison_free(struct prison *pr) 3286 { 3287 3288 KASSERT(refcount_load(&pr->pr_ref) > 0, 3289 ("Trying to free dead prison %p (jid=%d).", 3290 pr, pr->pr_id)); 3291 if (!refcount_release_if_not_last(&pr->pr_ref)) { 3292 /* 3293 * Don't remove the last reference in this context, 3294 * in case there are locks held. 3295 */ 3296 taskqueue_enqueue(taskqueue_jail_remove, &pr->pr_task); 3297 } 3298 } 3299 3300 static void 3301 prison_free_not_last(struct prison *pr) 3302 { 3303 #ifdef INVARIANTS 3304 int lastref; 3305 3306 KASSERT(refcount_load(&pr->pr_ref) > 0, 3307 ("Trying to free dead prison %p (jid=%d).", 3308 pr, pr->pr_id)); 3309 lastref = refcount_release(&pr->pr_ref); 3310 KASSERT(!lastref, 3311 ("prison_free_not_last freed last ref on prison %p (jid=%d).", 3312 pr, pr->pr_id)); 3313 #else 3314 refcount_release(&pr->pr_ref); 3315 #endif 3316 } 3317 3318 /* 3319 * Hold a prison for user visibility, by incrementing pr_uref. 3320 * It is generally an error to hold a prison that isn't already 3321 * user-visible, except through the jail system calls. It is also 3322 * an error to hold an invalid prison. A prison record will remain 3323 * alive as long as it has at least one user reference, and will not 3324 * be set to the dying state until the prison mutex and allprison_lock 3325 * are both freed. 3326 */ 3327 void 3328 prison_proc_hold(struct prison *pr) 3329 { 3330 #ifdef INVARIANTS 3331 int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref); 3332 3333 KASSERT(was_alive, 3334 ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id)); 3335 #else 3336 refcount_acquire(&pr->pr_uref); 3337 #endif 3338 } 3339 3340 /* 3341 * Remove a prison user reference. If it was the last reference, the 3342 * prison will be considered "dying", and may be removed once all of 3343 * its references are dropped. 
3344 */ 3345 void 3346 prison_proc_free(struct prison *pr) 3347 { 3348 3349 /* 3350 * Locking is only required when releasing the last reference. 3351 * This allows assurance that a locked prison will remain alive 3352 * until it is unlocked. 3353 */ 3354 KASSERT(refcount_load(&pr->pr_uref) > 0, 3355 ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); 3356 if (!refcount_release_if_not_last(&pr->pr_uref)) { 3357 /* 3358 * Don't remove the last user reference in this context, 3359 * which is expected to be a process that is not only locked, 3360 * but also half dead. Add a reference so any calls to 3361 * prison_free() won't re-submit the task. 3362 */ 3363 prison_hold(pr); 3364 mtx_lock(&pr->pr_mtx); 3365 KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC), 3366 ("Redundant last reference in prison_proc_free (jid=%d)", 3367 pr->pr_id)); 3368 pr->pr_flags |= PR_COMPLETE_PROC; 3369 mtx_unlock(&pr->pr_mtx); 3370 taskqueue_enqueue(taskqueue_jail_remove, &pr->pr_task); 3371 } 3372 } 3373 3374 static void 3375 prison_proc_free_not_last(struct prison *pr) 3376 { 3377 #ifdef INVARIANTS 3378 int lastref; 3379 3380 KASSERT(refcount_load(&pr->pr_uref) > 0, 3381 ("Trying to free dead prison %p (jid=%d).", 3382 pr, pr->pr_id)); 3383 lastref = refcount_release(&pr->pr_uref); 3384 KASSERT(!lastref, 3385 ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).", 3386 pr, pr->pr_id)); 3387 #else 3388 refcount_release(&pr->pr_uref); 3389 #endif 3390 } 3391 3392 void 3393 prison_proc_link(struct prison *pr, struct proc *p) 3394 { 3395 3396 sx_assert(&allproc_lock, SA_XLOCKED); 3397 LIST_INSERT_HEAD(&pr->pr_proclist, p, p_jaillist); 3398 } 3399 3400 void 3401 prison_proc_unlink(struct prison *pr, struct proc *p) 3402 { 3403 3404 sx_assert(&allproc_lock, SA_XLOCKED); 3405 LIST_REMOVE(p, p_jaillist); 3406 } 3407 3408 static void 3409 prison_proc_relink(struct prison *opr, struct prison *npr, struct proc *p) 3410 { 3411 3412 sx_xlock(&allproc_lock); 3413 
prison_proc_unlink(opr, p); 3414 prison_proc_link(npr, p); 3415 sx_xunlock(&allproc_lock); 3416 } 3417 3418 /* 3419 * Complete a call to either prison_free or prison_proc_free. 3420 */ 3421 static void 3422 prison_complete(void *context, int pending) 3423 { 3424 struct prison *pr = context; 3425 int drflags; 3426 3427 /* 3428 * This could be called to release the last reference, or the last 3429 * user reference (plus the reference held in prison_proc_free). 3430 */ 3431 drflags = prison_lock_xlock(pr, PD_DEREF); 3432 if (pr->pr_flags & PR_COMPLETE_PROC) { 3433 pr->pr_flags &= ~PR_COMPLETE_PROC; 3434 drflags |= PD_DEUREF; 3435 } 3436 prison_deref(pr, drflags); 3437 } 3438 3439 static void 3440 prison_kill_processes_cb(struct proc *p, void *arg __unused) 3441 { 3442 3443 kern_psignal(p, SIGKILL); 3444 } 3445 3446 /* 3447 * Note the iteration does not guarantee acting on all processes. 3448 * Most notably there may be fork or jail_attach in progress. 3449 */ 3450 void 3451 prison_proc_iterate(struct prison *pr, void (*cb)(struct proc *, void *), 3452 void *cbarg) 3453 { 3454 struct prison *ppr; 3455 struct proc *p; 3456 3457 if (atomic_load_int(&pr->pr_childcount) == 0) { 3458 sx_slock(&allproc_lock); 3459 LIST_FOREACH(p, &pr->pr_proclist, p_jaillist) { 3460 if (p->p_state == PRS_NEW) 3461 continue; 3462 PROC_LOCK(p); 3463 cb(p, cbarg); 3464 PROC_UNLOCK(p); 3465 } 3466 sx_sunlock(&allproc_lock); 3467 if (atomic_load_int(&pr->pr_childcount) == 0) 3468 return; 3469 /* 3470 * Some jails popped up during the iteration, fall through to a 3471 * system-wide search. 
3472 */ 3473 } 3474 3475 sx_slock(&allproc_lock); 3476 FOREACH_PROC_IN_SYSTEM(p) { 3477 PROC_LOCK(p); 3478 if (p->p_state != PRS_NEW && p->p_ucred != NULL) { 3479 for (ppr = p->p_ucred->cr_prison; ppr != NULL; 3480 ppr = ppr->pr_parent) { 3481 if (ppr == pr) { 3482 cb(p, cbarg); 3483 break; 3484 } 3485 } 3486 } 3487 PROC_UNLOCK(p); 3488 } 3489 sx_sunlock(&allproc_lock); 3490 } 3491 3492 /* 3493 * Remove a prison reference and/or user reference (usually). 3494 * This assumes context that allows sleeping (for allprison_lock), 3495 * with no non-sleeping locks held, except perhaps the prison itself. 3496 * If there are no more references, release and delist the prison. 3497 * On completion, the prison lock and the allprison lock are both 3498 * unlocked. 3499 */ 3500 static void 3501 prison_deref(struct prison *pr, int flags) 3502 { 3503 struct prisonlist freeprison; 3504 struct prison *killpr, *rpr, *ppr, *tpr; 3505 3506 killpr = NULL; 3507 TAILQ_INIT(&freeprison); 3508 /* 3509 * Release this prison as requested, which may cause its parent 3510 * to be released, and then maybe its grandparent, etc. 3511 */ 3512 for (;;) { 3513 if (flags & PD_KILL) { 3514 /* Kill the prison and its descendents. */ 3515 KASSERT(pr != &prison0, 3516 ("prison_deref trying to kill prison0")); 3517 if (!(flags & PD_DEREF)) { 3518 prison_hold(pr); 3519 flags |= PD_DEREF; 3520 } 3521 flags = prison_lock_xlock(pr, flags); 3522 prison_deref_kill(pr, &freeprison); 3523 } 3524 if (flags & PD_DEUREF) { 3525 /* Drop a user reference. 
*/ 3526 KASSERT(refcount_load(&pr->pr_uref) > 0, 3527 ("prison_deref PD_DEUREF on a dead prison (jid=%d)", 3528 pr->pr_id)); 3529 if (!refcount_release_if_not_last(&pr->pr_uref)) { 3530 if (!(flags & PD_DEREF)) { 3531 prison_hold(pr); 3532 flags |= PD_DEREF; 3533 } 3534 flags = prison_lock_xlock(pr, flags); 3535 if (refcount_release(&pr->pr_uref) && 3536 pr->pr_state == PRISON_STATE_ALIVE) { 3537 /* 3538 * When the last user references goes, 3539 * this becomes a dying prison. 3540 */ 3541 KASSERT( 3542 refcount_load(&prison0.pr_uref) > 0, 3543 ("prison0 pr_uref=0")); 3544 pr->pr_state = PRISON_STATE_DYING; 3545 prison_cleanup_locked(pr); 3546 mtx_unlock(&pr->pr_mtx); 3547 flags &= ~PD_LOCKED; 3548 prison_cleanup_unlocked(pr); 3549 } 3550 } 3551 } 3552 if (flags & PD_KILL) { 3553 /* 3554 * Any remaining user references are probably processes 3555 * that need to be killed, either in this prison or its 3556 * descendants. 3557 */ 3558 if (refcount_load(&pr->pr_uref) > 0) 3559 killpr = pr; 3560 /* Make sure the parent prison doesn't get killed. */ 3561 flags &= ~PD_KILL; 3562 } 3563 if (flags & PD_DEREF) { 3564 /* Drop a reference. */ 3565 KASSERT(refcount_load(&pr->pr_ref) > 0, 3566 ("prison_deref PD_DEREF on a dead prison (jid=%d)", 3567 pr->pr_id)); 3568 if (!refcount_release_if_not_last(&pr->pr_ref)) { 3569 flags = prison_lock_xlock(pr, flags); 3570 if (refcount_release(&pr->pr_ref)) { 3571 /* 3572 * When the last reference goes, 3573 * unlink the prison and set it aside. 
3574 */ 3575 KASSERT( 3576 refcount_load(&pr->pr_uref) == 0, 3577 ("prison_deref: last ref, " 3578 "but still has %d urefs (jid=%d)", 3579 pr->pr_uref, pr->pr_id)); 3580 KASSERT( 3581 refcount_load(&prison0.pr_ref) != 0, 3582 ("prison0 pr_ref=0")); 3583 pr->pr_state = PRISON_STATE_INVALID; 3584 TAILQ_REMOVE(&allprison, pr, pr_list); 3585 LIST_REMOVE(pr, pr_sibling); 3586 TAILQ_INSERT_TAIL(&freeprison, pr, 3587 pr_list); 3588 for (ppr = pr->pr_parent; 3589 ppr != NULL; 3590 ppr = ppr->pr_parent) 3591 ppr->pr_childcount--; 3592 /* 3593 * Removing a prison frees references 3594 * from its parent. 3595 */ 3596 ppr = pr->pr_parent; 3597 pr->pr_parent = NULL; 3598 mtx_unlock(&pr->pr_mtx); 3599 3600 pr = ppr; 3601 flags &= ~PD_LOCKED; 3602 flags |= PD_DEREF | PD_DEUREF; 3603 continue; 3604 } 3605 } 3606 } 3607 break; 3608 } 3609 3610 /* Release all the prison locks. */ 3611 if (flags & PD_LOCKED) 3612 mtx_unlock(&pr->pr_mtx); 3613 if (flags & PD_LIST_SLOCKED) 3614 sx_sunlock(&allprison_lock); 3615 else if (flags & PD_LIST_XLOCKED) 3616 sx_xunlock(&allprison_lock); 3617 3618 /* Kill any processes attached to a killed prison. */ 3619 if (killpr != NULL) 3620 prison_proc_iterate(killpr, prison_kill_processes_cb, NULL); 3621 3622 /* 3623 * Finish removing any unreferenced prisons, which couldn't happen 3624 * while allprison_lock was held (to avoid a LOR on vrele). 
3625 */ 3626 TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) { 3627 #ifdef VIMAGE 3628 if (rpr->pr_flags & PR_VNET) 3629 vnet_destroy(rpr->pr_vnet); 3630 #endif 3631 if (rpr->pr_root != NULL) 3632 vrele(rpr->pr_root); 3633 mtx_destroy(&rpr->pr_mtx); 3634 #ifdef INET 3635 prison_ip_free(rpr->pr_addrs[PR_INET]); 3636 #endif 3637 #ifdef INET6 3638 prison_ip_free(rpr->pr_addrs[PR_INET6]); 3639 #endif 3640 if (rpr->pr_cpuset != NULL) 3641 cpuset_rel(rpr->pr_cpuset); 3642 osd_jail_exit(rpr); 3643 #ifdef RACCT 3644 if (racct_enable) 3645 prison_racct_detach(rpr); 3646 #endif 3647 TAILQ_REMOVE(&freeprison, rpr, pr_list); 3648 free(rpr, M_PRISON); 3649 } 3650 } 3651 3652 /* 3653 * Kill the prison and its descendants. Mark them as dying, clear the 3654 * persist flag, and call module remove methods. 3655 */ 3656 static void 3657 prison_deref_kill(struct prison *pr, struct prisonlist *freeprison) 3658 { 3659 struct prison *cpr, *ppr, *rpr; 3660 bool descend; 3661 3662 /* 3663 * Unlike the descendants, the target prison can be killed 3664 * even if it is currently dying. This is useful for failed 3665 * creation in jail_set(2). 
3666 */ 3667 KASSERT(refcount_load(&pr->pr_ref) > 0, 3668 ("Trying to kill dead prison %p (jid=%d).", 3669 pr, pr->pr_id)); 3670 refcount_acquire(&pr->pr_uref); 3671 pr->pr_state = PRISON_STATE_DYING; 3672 mtx_unlock(&pr->pr_mtx); 3673 3674 rpr = NULL; 3675 FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) { 3676 if (descend) { 3677 if (!prison_isalive(cpr)) { 3678 descend = false; 3679 continue; 3680 } 3681 prison_hold(cpr); 3682 prison_proc_hold(cpr); 3683 mtx_lock(&cpr->pr_mtx); 3684 cpr->pr_state = PRISON_STATE_DYING; 3685 cpr->pr_flags |= PR_REMOVE; 3686 mtx_unlock(&cpr->pr_mtx); 3687 continue; 3688 } 3689 if (!(cpr->pr_flags & PR_REMOVE)) 3690 continue; 3691 prison_cleanup_unlocked(cpr); 3692 mtx_lock(&cpr->pr_mtx); 3693 prison_cleanup_locked(cpr); 3694 cpr->pr_flags &= ~PR_REMOVE; 3695 if (cpr->pr_flags & PR_PERSIST) { 3696 cpr->pr_flags &= ~PR_PERSIST; 3697 prison_proc_free_not_last(cpr); 3698 prison_free_not_last(cpr); 3699 } 3700 (void)refcount_release(&cpr->pr_uref); 3701 if (refcount_release(&cpr->pr_ref)) { 3702 /* 3703 * When the last reference goes, unlink the prison 3704 * and set it aside for prison_deref() to handle. 3705 * Delay unlinking the sibling list to keep the loop 3706 * safe. 3707 */ 3708 if (rpr != NULL) 3709 LIST_REMOVE(rpr, pr_sibling); 3710 rpr = cpr; 3711 rpr->pr_state = PRISON_STATE_INVALID; 3712 TAILQ_REMOVE(&allprison, rpr, pr_list); 3713 TAILQ_INSERT_TAIL(freeprison, rpr, pr_list); 3714 /* 3715 * Removing a prison frees references from its parent. 
3716 */ 3717 ppr = rpr->pr_parent; 3718 prison_proc_free_not_last(ppr); 3719 prison_free_not_last(ppr); 3720 for (; ppr != NULL; ppr = ppr->pr_parent) 3721 ppr->pr_childcount--; 3722 } 3723 mtx_unlock(&cpr->pr_mtx); 3724 } 3725 if (rpr != NULL) 3726 LIST_REMOVE(rpr, pr_sibling); 3727 3728 prison_cleanup_unlocked(pr); 3729 mtx_lock(&pr->pr_mtx); 3730 prison_cleanup_locked(pr); 3731 if (pr->pr_flags & PR_PERSIST) { 3732 pr->pr_flags &= ~PR_PERSIST; 3733 prison_proc_free_not_last(pr); 3734 prison_free_not_last(pr); 3735 } 3736 (void)refcount_release(&pr->pr_uref); 3737 } 3738 3739 /* 3740 * Given the current locking state in the flags, make sure allprison_lock 3741 * is held exclusive, and the prison is locked. Return flags indicating 3742 * the new state. 3743 */ 3744 static int 3745 prison_lock_xlock(struct prison *pr, int flags) 3746 { 3747 3748 if (!(flags & PD_LIST_XLOCKED)) { 3749 /* 3750 * Get allprison_lock, which may be an upgrade, 3751 * and may require unlocking the prison. 3752 */ 3753 if (flags & PD_LOCKED) { 3754 mtx_unlock(&pr->pr_mtx); 3755 flags &= ~PD_LOCKED; 3756 } 3757 if (flags & PD_LIST_SLOCKED) { 3758 if (!sx_try_upgrade(&allprison_lock)) { 3759 sx_sunlock(&allprison_lock); 3760 sx_xlock(&allprison_lock); 3761 } 3762 flags &= ~PD_LIST_SLOCKED; 3763 } else 3764 sx_xlock(&allprison_lock); 3765 flags |= PD_LIST_XLOCKED; 3766 } 3767 if (!(flags & PD_LOCKED)) { 3768 /* Lock the prison mutex. */ 3769 mtx_lock(&pr->pr_mtx); 3770 flags |= PD_LOCKED; 3771 } 3772 return flags; 3773 } 3774 3775 /* 3776 * Release a prison's resources when it starts dying (when the last user 3777 * reference is dropped, or when it is killed). Two functions are called, 3778 * for work that requires a locked prison or an unlocked one. 
3779 */ 3780 static void 3781 prison_cleanup_locked(struct prison *pr) 3782 { 3783 sx_assert(&allprison_lock, SA_XLOCKED); 3784 mtx_assert(&pr->pr_mtx, MA_OWNED); 3785 prison_knote(pr, NOTE_JAIL_REMOVE); 3786 knlist_detach(pr->pr_klist); 3787 jaildesc_prison_cleanup(pr); 3788 pr->pr_klist = NULL; 3789 } 3790 3791 static void 3792 prison_cleanup_unlocked(struct prison *pr) 3793 { 3794 sx_assert(&allprison_lock, SA_XLOCKED); 3795 mtx_assert(&pr->pr_mtx, MA_NOTOWNED); 3796 vfs_exjail_delete(pr); 3797 shm_remove_prison(pr); 3798 (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); 3799 } 3800 3801 /* 3802 * Set or clear a permission bit in the pr_allow field, passing restrictions 3803 * (cleared permission) down to child jails. 3804 */ 3805 void 3806 prison_set_allow(struct ucred *cred, unsigned flag, int enable) 3807 { 3808 struct prison *pr; 3809 3810 pr = cred->cr_prison; 3811 sx_slock(&allprison_lock); 3812 mtx_lock(&pr->pr_mtx); 3813 prison_set_allow_locked(pr, flag, enable); 3814 mtx_unlock(&pr->pr_mtx); 3815 sx_sunlock(&allprison_lock); 3816 } 3817 3818 static void 3819 prison_set_allow_locked(struct prison *pr, unsigned flag, int enable) 3820 { 3821 struct prison *cpr; 3822 int descend; 3823 3824 if (enable != 0) 3825 pr->pr_allow |= flag; 3826 else { 3827 pr->pr_allow &= ~flag; 3828 FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) 3829 cpr->pr_allow &= ~flag; 3830 } 3831 } 3832 3833 /* 3834 * Check if a jail supports the given address family. 3835 * 3836 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT 3837 * if not. 3838 */ 3839 int 3840 prison_check_af(struct ucred *cred, int af) 3841 { 3842 struct prison *pr; 3843 int error; 3844 3845 KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); 3846 3847 pr = cred->cr_prison; 3848 #ifdef VIMAGE 3849 /* Prisons with their own network stack are not limited. 
*/ 3850 if (prison_owns_vnet(pr)) 3851 return (0); 3852 #endif 3853 3854 error = 0; 3855 switch (af) 3856 { 3857 #ifdef INET 3858 case AF_INET: 3859 if (pr->pr_flags & PR_IP4) 3860 { 3861 mtx_lock(&pr->pr_mtx); 3862 if ((pr->pr_flags & PR_IP4) && 3863 pr->pr_addrs[PR_INET] == NULL) 3864 error = EAFNOSUPPORT; 3865 mtx_unlock(&pr->pr_mtx); 3866 } 3867 break; 3868 #endif 3869 #ifdef INET6 3870 case AF_INET6: 3871 if (pr->pr_flags & PR_IP6) 3872 { 3873 mtx_lock(&pr->pr_mtx); 3874 if ((pr->pr_flags & PR_IP6) && 3875 pr->pr_addrs[PR_INET6] == NULL) 3876 error = EAFNOSUPPORT; 3877 mtx_unlock(&pr->pr_mtx); 3878 } 3879 break; 3880 #endif 3881 case AF_LOCAL: 3882 case AF_ROUTE: 3883 case AF_NETLINK: 3884 break; 3885 default: 3886 if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF)) 3887 error = EAFNOSUPPORT; 3888 } 3889 return (error); 3890 } 3891 3892 /* 3893 * Check if given address belongs to the jail referenced by cred (wrapper to 3894 * prison_check_ip[46]). 3895 * 3896 * Returns 0 if jail doesn't restrict the address family or if address belongs 3897 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if 3898 * the jail doesn't allow the address family. IPv4 Address passed in in NBO. 
3899 */ 3900 int 3901 prison_if(struct ucred *cred, const struct sockaddr *sa) 3902 { 3903 #ifdef INET 3904 const struct sockaddr_in *sai; 3905 #endif 3906 #ifdef INET6 3907 const struct sockaddr_in6 *sai6; 3908 #endif 3909 int error; 3910 3911 KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); 3912 KASSERT(sa != NULL, ("%s: sa is NULL", __func__)); 3913 3914 #ifdef VIMAGE 3915 if (prison_owns_vnet(cred->cr_prison)) 3916 return (0); 3917 #endif 3918 3919 error = 0; 3920 switch (sa->sa_family) 3921 { 3922 #ifdef INET 3923 case AF_INET: 3924 sai = (const struct sockaddr_in *)sa; 3925 error = prison_check_ip4(cred, &sai->sin_addr); 3926 break; 3927 #endif 3928 #ifdef INET6 3929 case AF_INET6: 3930 sai6 = (const struct sockaddr_in6 *)sa; 3931 error = prison_check_ip6(cred, &sai6->sin6_addr); 3932 break; 3933 #endif 3934 default: 3935 if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF)) 3936 error = EAFNOSUPPORT; 3937 } 3938 return (error); 3939 } 3940 3941 /* 3942 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. 3943 */ 3944 int 3945 prison_check(struct ucred *cred1, struct ucred *cred2) 3946 { 3947 3948 return ((cred1->cr_prison == cred2->cr_prison || 3949 prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH); 3950 } 3951 3952 /* 3953 * For mountd/nfsd to run within a prison, it must be: 3954 * - A vnet prison. 3955 * - PR_ALLOW_NFSD must be set on it. 3956 * - The root directory (pr_root) of the prison must be 3957 * a file system mount point, so the mountd can hang 3958 * export information on it. 3959 * - The prison's enforce_statfs cannot be 0, so that 3960 * mountd(8) can do exports. 
3961 */ 3962 bool 3963 prison_check_nfsd(struct ucred *cred) 3964 { 3965 3966 if (jailed_without_vnet(cred)) 3967 return (false); 3968 if (!prison_allow(cred, PR_ALLOW_NFSD)) 3969 return (false); 3970 if ((cred->cr_prison->pr_root->v_vflag & VV_ROOT) == 0) 3971 return (false); 3972 if (cred->cr_prison->pr_enforce_statfs == 0) 3973 return (false); 3974 return (true); 3975 } 3976 3977 /* 3978 * Return true if p2 is a child of p1, otherwise false. 3979 */ 3980 bool 3981 prison_ischild(struct prison *pr1, struct prison *pr2) 3982 { 3983 3984 for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent) 3985 if (pr1 == pr2) 3986 return (true); 3987 return (false); 3988 } 3989 3990 /* 3991 * Return true if the prison is currently alive. A prison is alive if it 3992 * holds user references and it isn't being removed. 3993 */ 3994 bool 3995 prison_isalive(const struct prison *pr) 3996 { 3997 3998 if (__predict_false(pr->pr_state != PRISON_STATE_ALIVE)) 3999 return (false); 4000 return (true); 4001 } 4002 4003 /* 4004 * Return true if the prison is currently valid. A prison is valid if it has 4005 * been fully created, and is not being destroyed. Note that dying prisons 4006 * are still considered valid. Invalid prisons won't be found under normal 4007 * circumstances, as they're only put in that state by functions that have 4008 * an exclusive hold on allprison_lock. 4009 */ 4010 bool 4011 prison_isvalid(struct prison *pr) 4012 { 4013 4014 if (__predict_false(pr->pr_state == PRISON_STATE_INVALID)) 4015 return (false); 4016 if (__predict_false(refcount_load(&pr->pr_ref) == 0)) 4017 return (false); 4018 return (true); 4019 } 4020 4021 /* 4022 * Return true if the passed credential is in a jail and that jail does not 4023 * have its own virtual network stack, otherwise false. 
4024 */ 4025 bool 4026 jailed_without_vnet(struct ucred *cred) 4027 { 4028 4029 if (!jailed(cred)) 4030 return (false); 4031 #ifdef VIMAGE 4032 if (prison_owns_vnet(cred->cr_prison)) 4033 return (false); 4034 #endif 4035 4036 return (true); 4037 } 4038 4039 /* 4040 * Return the correct hostname (domainname, et al) for the passed credential. 4041 */ 4042 void 4043 getcredhostname(struct ucred *cred, char *buf, size_t size) 4044 { 4045 struct prison *pr; 4046 4047 /* 4048 * A NULL credential can be used to shortcut to the physical 4049 * system's hostname. 4050 */ 4051 pr = (cred != NULL) ? cred->cr_prison : &prison0; 4052 mtx_lock(&pr->pr_mtx); 4053 strlcpy(buf, pr->pr_hostname, size); 4054 mtx_unlock(&pr->pr_mtx); 4055 } 4056 4057 void 4058 getcreddomainname(struct ucred *cred, char *buf, size_t size) 4059 { 4060 4061 mtx_lock(&cred->cr_prison->pr_mtx); 4062 strlcpy(buf, cred->cr_prison->pr_domainname, size); 4063 mtx_unlock(&cred->cr_prison->pr_mtx); 4064 } 4065 4066 void 4067 getcredhostuuid(struct ucred *cred, char *buf, size_t size) 4068 { 4069 4070 mtx_lock(&cred->cr_prison->pr_mtx); 4071 strlcpy(buf, cred->cr_prison->pr_hostuuid, size); 4072 mtx_unlock(&cred->cr_prison->pr_mtx); 4073 } 4074 4075 void 4076 getcredhostid(struct ucred *cred, unsigned long *hostid) 4077 { 4078 4079 mtx_lock(&cred->cr_prison->pr_mtx); 4080 *hostid = cred->cr_prison->pr_hostid; 4081 mtx_unlock(&cred->cr_prison->pr_mtx); 4082 } 4083 4084 void 4085 getjailname(struct ucred *cred, char *name, size_t len) 4086 { 4087 4088 mtx_lock(&cred->cr_prison->pr_mtx); 4089 strlcpy(name, cred->cr_prison->pr_name, len); 4090 mtx_unlock(&cred->cr_prison->pr_mtx); 4091 } 4092 4093 #ifdef VIMAGE 4094 /* 4095 * Determine whether the prison owns its VNET. 4096 */ 4097 bool 4098 prison_owns_vnet(struct prison *pr) 4099 { 4100 4101 /* 4102 * vnets cannot be added/removed after jail creation, 4103 * so no need to lock here. 
4104 */ 4105 return ((pr->pr_flags & PR_VNET) != 0); 4106 } 4107 #endif 4108 4109 /* 4110 * Determine whether the subject represented by cred can "see" 4111 * status of a mount point. 4112 * Returns: 0 for permitted, ENOENT otherwise. 4113 * XXX: This function should be called cr_canseemount() and should be 4114 * placed in kern_prot.c. 4115 */ 4116 int 4117 prison_canseemount(struct ucred *cred, struct mount *mp) 4118 { 4119 struct prison *pr; 4120 struct statfs *sp; 4121 size_t len; 4122 4123 pr = cred->cr_prison; 4124 if (pr->pr_enforce_statfs == 0) 4125 return (0); 4126 if (pr->pr_root->v_mount == mp) 4127 return (0); 4128 if (pr->pr_enforce_statfs == 2) 4129 return (ENOENT); 4130 /* 4131 * If jail's chroot directory is set to "/" we should be able to see 4132 * all mount-points from inside a jail. 4133 * This is ugly check, but this is the only situation when jail's 4134 * directory ends with '/'. 4135 */ 4136 if (strcmp(pr->pr_path, "/") == 0) 4137 return (0); 4138 len = strlen(pr->pr_path); 4139 sp = &mp->mnt_stat; 4140 if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) 4141 return (ENOENT); 4142 /* 4143 * Be sure that we don't have situation where jail's root directory 4144 * is "/some/path" and mount point is "/some/pathpath". 4145 */ 4146 if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') 4147 return (ENOENT); 4148 return (0); 4149 } 4150 4151 void 4152 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) 4153 { 4154 char jpath[MAXPATHLEN]; 4155 struct prison *pr; 4156 size_t len; 4157 4158 pr = cred->cr_prison; 4159 if (pr->pr_enforce_statfs == 0) 4160 return; 4161 if (prison_canseemount(cred, mp) != 0) { 4162 bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); 4163 strlcpy(sp->f_mntonname, "[restricted]", 4164 sizeof(sp->f_mntonname)); 4165 return; 4166 } 4167 if (pr->pr_root->v_mount == mp) { 4168 /* 4169 * Clear current buffer data, so we are sure nothing from 4170 * the valid path left there. 
4171 */ 4172 bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); 4173 *sp->f_mntonname = '/'; 4174 return; 4175 } 4176 /* 4177 * If jail's chroot directory is set to "/" we should be able to see 4178 * all mount-points from inside a jail. 4179 */ 4180 if (strcmp(pr->pr_path, "/") == 0) 4181 return; 4182 len = strlen(pr->pr_path); 4183 strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); 4184 /* 4185 * Clear current buffer data, so we are sure nothing from 4186 * the valid path left there. 4187 */ 4188 bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); 4189 if (*jpath == '\0') { 4190 /* Should never happen. */ 4191 *sp->f_mntonname = '/'; 4192 } else { 4193 strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); 4194 } 4195 } 4196 4197 /* 4198 * Check with permission for a specific privilege is granted within jail. We 4199 * have a specific list of accepted privileges; the rest are denied. 4200 */ 4201 int 4202 prison_priv_check(struct ucred *cred, int priv) 4203 { 4204 struct prison *pr; 4205 int error; 4206 4207 /* 4208 * Some policies have custom handlers. This routine should not be 4209 * called for them. See priv_check_cred(). 4210 */ 4211 switch (priv) { 4212 case PRIV_VFS_LOOKUP: 4213 case PRIV_VFS_GENERATION: 4214 KASSERT(0, ("prison_priv_check instead of a custom handler " 4215 "called for %d\n", priv)); 4216 } 4217 4218 if (!jailed(cred)) 4219 return (0); 4220 4221 #ifdef VIMAGE 4222 /* 4223 * Privileges specific to prisons with a virtual network stack. 4224 * There might be a duplicate entry here in case the privilege 4225 * is only granted conditionally in the legacy jail case. 4226 */ 4227 switch (priv) { 4228 /* 4229 * NFS-specific privileges. 4230 */ 4231 case PRIV_NFS_DAEMON: 4232 case PRIV_VFS_GETFH: 4233 case PRIV_VFS_MOUNT_EXPORTED: 4234 if (!prison_check_nfsd(cred)) 4235 return (EPERM); 4236 #ifdef notyet 4237 case PRIV_NFS_LOCKD: 4238 #endif 4239 /* 4240 * Network stack privileges. 
4241 */ 4242 case PRIV_NET_BRIDGE: 4243 case PRIV_NET_GRE: 4244 case PRIV_NET_BPF: 4245 case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */ 4246 case PRIV_NET_ROUTE: 4247 case PRIV_NET_TAP: 4248 case PRIV_NET_SETIFMTU: 4249 case PRIV_NET_SETIFFLAGS: 4250 case PRIV_NET_SETIFCAP: 4251 case PRIV_NET_SETIFDESCR: 4252 case PRIV_NET_SETIFNAME : 4253 case PRIV_NET_SETIFMETRIC: 4254 case PRIV_NET_SETIFPHYS: 4255 case PRIV_NET_SETIFMAC: 4256 case PRIV_NET_SETLANPCP: 4257 case PRIV_NET_ADDMULTI: 4258 case PRIV_NET_DELMULTI: 4259 case PRIV_NET_HWIOCTL: 4260 case PRIV_NET_SETLLADDR: 4261 case PRIV_NET_ADDIFGROUP: 4262 case PRIV_NET_DELIFGROUP: 4263 case PRIV_NET_IFCREATE: 4264 case PRIV_NET_IFDESTROY: 4265 case PRIV_NET_ADDIFADDR: 4266 case PRIV_NET_DELIFADDR: 4267 case PRIV_NET_LAGG: 4268 case PRIV_NET_GIF: 4269 case PRIV_NET_SETIFVNET: 4270 case PRIV_NET_SETIFFIB: 4271 case PRIV_NET_OVPN: 4272 case PRIV_NET_ME: 4273 case PRIV_NET_WG: 4274 4275 /* 4276 * 802.11-related privileges. 4277 */ 4278 case PRIV_NET80211_VAP_GETKEY: 4279 case PRIV_NET80211_VAP_MANAGE: 4280 4281 #ifdef notyet 4282 /* 4283 * ATM privileges. 4284 */ 4285 case PRIV_NETATM_CFG: 4286 case PRIV_NETATM_ADD: 4287 case PRIV_NETATM_DEL: 4288 case PRIV_NETATM_SET: 4289 4290 /* 4291 * Bluetooth privileges. 4292 */ 4293 case PRIV_NETBLUETOOTH_RAW: 4294 #endif 4295 4296 /* 4297 * Netgraph and netgraph module privileges. 4298 */ 4299 case PRIV_NETGRAPH_CONTROL: 4300 #ifdef notyet 4301 case PRIV_NETGRAPH_TTY: 4302 #endif 4303 4304 /* 4305 * IPv4 and IPv6 privileges. 
4306 */ 4307 case PRIV_NETINET_IPFW: 4308 case PRIV_NETINET_DIVERT: 4309 case PRIV_NETINET_PF: 4310 case PRIV_NETINET_DUMMYNET: 4311 case PRIV_NETINET_CARP: 4312 case PRIV_NETINET_MROUTE: 4313 case PRIV_NETINET_RAW: 4314 case PRIV_NETINET_ADDRCTRL6: 4315 case PRIV_NETINET_ND6: 4316 case PRIV_NETINET_SCOPE6: 4317 case PRIV_NETINET_ALIFETIME6: 4318 case PRIV_NETINET_IPSEC: 4319 case PRIV_NETINET_BINDANY: 4320 4321 #ifdef notyet 4322 /* 4323 * NCP privileges. 4324 */ 4325 case PRIV_NETNCP: 4326 4327 /* 4328 * SMB privileges. 4329 */ 4330 case PRIV_NETSMB: 4331 #endif 4332 4333 /* 4334 * No default: or deny here. 4335 * In case of no permit fall through to next switch(). 4336 */ 4337 if (cred->cr_prison->pr_flags & PR_VNET) 4338 return (0); 4339 } 4340 #endif /* VIMAGE */ 4341 4342 switch (priv) { 4343 /* 4344 * Allow ktrace privileges for root in jail. 4345 */ 4346 case PRIV_KTRACE: 4347 4348 #if 0 4349 /* 4350 * Allow jailed processes to configure audit identity and 4351 * submit audit records (login, etc). In the future we may 4352 * want to further refine the relationship between audit and 4353 * jail. 4354 */ 4355 case PRIV_AUDIT_GETAUDIT: 4356 case PRIV_AUDIT_SETAUDIT: 4357 case PRIV_AUDIT_SUBMIT: 4358 #endif 4359 4360 /* 4361 * Allow jailed processes to manipulate process UNIX 4362 * credentials in any way they see fit. 4363 */ 4364 case PRIV_CRED_SETCRED: 4365 case PRIV_CRED_SETUID: 4366 case PRIV_CRED_SETEUID: 4367 case PRIV_CRED_SETGID: 4368 case PRIV_CRED_SETEGID: 4369 case PRIV_CRED_SETGROUPS: 4370 case PRIV_CRED_SETREUID: 4371 case PRIV_CRED_SETREGID: 4372 case PRIV_CRED_SETRESUID: 4373 case PRIV_CRED_SETRESGID: 4374 4375 /* 4376 * Jail implements visibility constraints already, so allow 4377 * jailed root to override uid/gid-based constraints. 
4378 */ 4379 case PRIV_SEEOTHERGIDS: 4380 case PRIV_SEEOTHERUIDS: 4381 case PRIV_SEEJAILPROC: 4382 4383 /* 4384 * Jail implements inter-process debugging limits already, so 4385 * allow jailed root various debugging privileges. 4386 */ 4387 case PRIV_DEBUG_DIFFCRED: 4388 case PRIV_DEBUG_SUGID: 4389 case PRIV_DEBUG_UNPRIV: 4390 case PRIV_DEBUG_DIFFJAIL: 4391 4392 /* 4393 * Allow jail to set various resource limits and login 4394 * properties, and for now, exceed process resource limits. 4395 */ 4396 case PRIV_PROC_LIMIT: 4397 case PRIV_PROC_SETLOGIN: 4398 case PRIV_PROC_SETRLIMIT: 4399 4400 /* 4401 * Debuggers should work in jails. 4402 */ 4403 case PRIV_PROC_MEM_WRITE: 4404 4405 /* 4406 * System V and POSIX IPC privileges are granted in jail. 4407 */ 4408 case PRIV_IPC_READ: 4409 case PRIV_IPC_WRITE: 4410 case PRIV_IPC_ADMIN: 4411 case PRIV_IPC_MSGSIZE: 4412 case PRIV_MQ_ADMIN: 4413 4414 /* 4415 * Jail operations within a jail work on child jails. 4416 */ 4417 case PRIV_JAIL_ATTACH: 4418 case PRIV_JAIL_SET: 4419 case PRIV_JAIL_REMOVE: 4420 4421 /* 4422 * Jail implements its own inter-process limits, so allow 4423 * root processes in jail to change scheduling on other 4424 * processes in the same jail. Likewise for signalling. 4425 */ 4426 case PRIV_SCHED_DIFFCRED: 4427 case PRIV_SCHED_CPUSET: 4428 case PRIV_SCHED_DIFFJAIL: 4429 case PRIV_SIGNAL_DIFFCRED: 4430 case PRIV_SIGNAL_SUGID: 4431 case PRIV_SIGNAL_DIFFJAIL: 4432 4433 /* 4434 * Allow jailed processes to write to sysctls marked as jail 4435 * writable. 4436 */ 4437 case PRIV_SYSCTL_WRITEJAIL: 4438 4439 /* 4440 * Allow root in jail to manage a variety of quota 4441 * properties. These should likely be conditional on a 4442 * configuration option. 4443 */ 4444 case PRIV_VFS_GETQUOTA: 4445 case PRIV_VFS_SETQUOTA: 4446 4447 /* 4448 * Since Jail relies on chroot() to implement file system 4449 * protections, grant many VFS privileges to root in jail. 
4450 * Be careful to exclude mount-related and NFS-related 4451 * privileges. 4452 */ 4453 case PRIV_VFS_READ: 4454 case PRIV_VFS_WRITE: 4455 case PRIV_VFS_ADMIN: 4456 case PRIV_VFS_EXEC: 4457 case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */ 4458 case PRIV_VFS_CHFLAGS_DEV: 4459 case PRIV_VFS_CHOWN: 4460 case PRIV_VFS_CHROOT: 4461 case PRIV_VFS_RETAINSUGID: 4462 case PRIV_VFS_FCHROOT: 4463 case PRIV_VFS_LINK: 4464 case PRIV_VFS_SETGID: 4465 case PRIV_VFS_STAT: 4466 case PRIV_VFS_STICKYFILE: 4467 4468 /* 4469 * As in the non-jail case, non-root users are expected to be 4470 * able to read kernel/physical memory (provided /dev/[k]mem 4471 * exists in the jail and they have permission to access it). 4472 */ 4473 case PRIV_KMEM_READ: 4474 return (0); 4475 4476 /* 4477 * Depending on the global setting, allow privilege of 4478 * setting system flags. 4479 */ 4480 case PRIV_VFS_SYSFLAGS: 4481 if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS) 4482 return (0); 4483 else 4484 return (EPERM); 4485 4486 /* 4487 * Depending on the global setting, allow privilege of 4488 * mounting/unmounting file systems. 4489 */ 4490 case PRIV_VFS_MOUNT: 4491 case PRIV_VFS_UNMOUNT: 4492 case PRIV_VFS_MOUNT_NONUSER: 4493 case PRIV_VFS_MOUNT_OWNER: 4494 pr = cred->cr_prison; 4495 prison_lock(pr); 4496 if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2) 4497 error = 0; 4498 else 4499 error = EPERM; 4500 prison_unlock(pr); 4501 return (error); 4502 4503 /* 4504 * Jails should hold no disposition on the PRIV_VFS_READ_DIR 4505 * policy. priv_check_cred will not specifically allow it, and 4506 * we may want a MAC policy to allow it. 4507 */ 4508 case PRIV_VFS_READ_DIR: 4509 return (0); 4510 4511 /* 4512 * Conditionally allow privileged process in the jail to 4513 * manipulate filesystem extended attributes in the system 4514 * namespace. 
4515 */ 4516 case PRIV_VFS_EXTATTR_SYSTEM: 4517 if ((cred->cr_prison->pr_allow & PR_ALLOW_EXTATTR) != 0) 4518 return (0); 4519 else 4520 return (EPERM); 4521 4522 /* 4523 * Conditionnaly allow locking (unlocking) physical pages 4524 * in memory. 4525 */ 4526 case PRIV_VM_MLOCK: 4527 case PRIV_VM_MUNLOCK: 4528 if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK) 4529 return (0); 4530 else 4531 return (EPERM); 4532 4533 /* 4534 * Conditionally allow jailed root to bind reserved ports. 4535 */ 4536 case PRIV_NETINET_RESERVEDPORT: 4537 if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS) 4538 return (0); 4539 else 4540 return (EPERM); 4541 4542 /* 4543 * Allow jailed root to reuse in-use ports. 4544 */ 4545 case PRIV_NETINET_REUSEPORT: 4546 return (0); 4547 4548 /* 4549 * Allow jailed root to set certain IPv4/6 (option) headers. 4550 */ 4551 case PRIV_NETINET_SETHDROPTS: 4552 return (0); 4553 4554 /* 4555 * Conditionally allow creating raw sockets in jail. 4556 */ 4557 case PRIV_NETINET_RAW: 4558 if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS) 4559 return (0); 4560 else 4561 return (EPERM); 4562 4563 /* 4564 * Since jail implements its own visibility limits on netstat 4565 * sysctls, allow getcred. This allows identd to work in 4566 * jail. 4567 */ 4568 case PRIV_NETINET_GETCRED: 4569 return (0); 4570 4571 /* 4572 * Allow jailed root to set loginclass. 4573 */ 4574 case PRIV_PROC_SETLOGINCLASS: 4575 return (0); 4576 4577 /* 4578 * Do not allow a process inside a jail to read the kernel 4579 * message buffer unless explicitly permitted. 4580 */ 4581 case PRIV_MSGBUF: 4582 if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF) 4583 return (0); 4584 return (EPERM); 4585 4586 /* 4587 * Conditionally allow privileged process in the jail adjust 4588 * machine time. 
4589 */ 4590 case PRIV_ADJTIME: 4591 case PRIV_NTP_ADJTIME: 4592 if (cred->cr_prison->pr_allow & 4593 (PR_ALLOW_ADJTIME | PR_ALLOW_SETTIME)) { 4594 return (0); 4595 } 4596 return (EPERM); 4597 4598 /* 4599 * Conditionally allow privileged process in the jail set 4600 * machine time. 4601 */ 4602 case PRIV_SETTIMEOFDAY: 4603 case PRIV_CLOCK_SETTIME: 4604 if (cred->cr_prison->pr_allow & PR_ALLOW_SETTIME) 4605 return (0); 4606 else 4607 return (EPERM); 4608 4609 /* 4610 * Conditionally allow privileged process in the jail to modify 4611 * the routing table. 4612 */ 4613 case PRIV_NET_ROUTE: 4614 if (cred->cr_prison->pr_allow & PR_ALLOW_ROUTING) 4615 return (0); 4616 else 4617 return (EPERM); 4618 4619 default: 4620 /* 4621 * In all remaining cases, deny the privilege request. This 4622 * includes almost all network privileges, many system 4623 * configuration privileges. 4624 */ 4625 return (EPERM); 4626 } 4627 } 4628 4629 /* 4630 * Return the part of pr2's name that is relative to pr1, or the whole name 4631 * if it does not directly follow. 4632 */ 4633 4634 char * 4635 prison_name(struct prison *pr1, struct prison *pr2) 4636 { 4637 char *name; 4638 4639 /* Jails see themselves as "0" (if they see themselves at all). */ 4640 if (pr1 == pr2) 4641 return "0"; 4642 name = pr2->pr_name; 4643 if (prison_ischild(pr1, pr2)) { 4644 /* 4645 * pr1 isn't locked (and allprison_lock may not be either) 4646 * so its length can't be counted on. But the number of dots 4647 * can be counted on - and counted. 4648 */ 4649 for (; pr1 != &prison0; pr1 = pr1->pr_parent) 4650 name = strchr(name, '.') + 1; 4651 } 4652 return (name); 4653 } 4654 4655 /* 4656 * Return the part of pr2's path that is relative to pr1, or the whole path 4657 * if it does not directly follow. 
4658 */ 4659 static char * 4660 prison_path(struct prison *pr1, struct prison *pr2) 4661 { 4662 char *path1, *path2; 4663 int len1; 4664 4665 path1 = pr1->pr_path; 4666 path2 = pr2->pr_path; 4667 if (!strcmp(path1, "/")) 4668 return (path2); 4669 len1 = strlen(path1); 4670 if (strncmp(path1, path2, len1)) 4671 return (path2); 4672 if (path2[len1] == '\0') 4673 return "/"; 4674 if (path2[len1] == '/') 4675 return (path2 + len1); 4676 return (path2); 4677 } 4678 4679 /* 4680 * Jail-related sysctls. 4681 */ 4682 SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 4683 "Jails"); 4684 4685 #if defined(INET) || defined(INET6) 4686 /* 4687 * Copy address array to memory that would be then SYSCTL_OUT-ed. 4688 * sysctl_jail_list() helper. 4689 */ 4690 static void 4691 prison_ip_copyout(struct prison *pr, const pr_family_t af, void **out, int *len) 4692 { 4693 const struct prison_ip *pip; 4694 const size_t size = pr_families[af].size; 4695 4696 again: 4697 mtx_assert(&pr->pr_mtx, MA_OWNED); 4698 if ((pip = pr->pr_addrs[af]) != NULL) { 4699 if (*len < pip->ips) { 4700 *len = pip->ips; 4701 mtx_unlock(&pr->pr_mtx); 4702 *out = realloc(*out, *len * size, M_TEMP, M_WAITOK); 4703 mtx_lock(&pr->pr_mtx); 4704 goto again; 4705 } 4706 bcopy(pip->pr_ip, *out, pip->ips * size); 4707 } 4708 } 4709 #endif 4710 4711 static int 4712 sysctl_jail_list(SYSCTL_HANDLER_ARGS) 4713 { 4714 struct xprison *xp; 4715 struct prison *pr, *cpr; 4716 #ifdef INET 4717 struct in_addr *ip4 = NULL; 4718 int ip4s = 0; 4719 #endif 4720 #ifdef INET6 4721 struct in6_addr *ip6 = NULL; 4722 int ip6s = 0; 4723 #endif 4724 int descend, error; 4725 4726 xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK); 4727 pr = req->td->td_ucred->cr_prison; 4728 error = 0; 4729 sx_slock(&allprison_lock); 4730 FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { 4731 mtx_lock(&cpr->pr_mtx); 4732 #ifdef INET 4733 prison_ip_copyout(cpr, PR_INET, (void **)&ip4, &ip4s); 4734 #endif 4735 #ifdef INET6 4736 prison_ip_copyout(cpr, 
PR_INET6, (void **)&ip6, &ip6s); 4737 #endif 4738 bzero(xp, sizeof(*xp)); 4739 xp->pr_version = XPRISON_VERSION; 4740 xp->pr_id = cpr->pr_id; 4741 xp->pr_state = cpr->pr_state; 4742 strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path)); 4743 strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host)); 4744 strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name)); 4745 #ifdef INET 4746 xp->pr_ip4s = ip4s; 4747 #endif 4748 #ifdef INET6 4749 xp->pr_ip6s = ip6s; 4750 #endif 4751 mtx_unlock(&cpr->pr_mtx); 4752 error = SYSCTL_OUT(req, xp, sizeof(*xp)); 4753 if (error) 4754 break; 4755 #ifdef INET 4756 if (xp->pr_ip4s > 0) { 4757 error = SYSCTL_OUT(req, ip4, 4758 xp->pr_ip4s * sizeof(struct in_addr)); 4759 if (error) 4760 break; 4761 } 4762 #endif 4763 #ifdef INET6 4764 if (xp->pr_ip6s > 0) { 4765 error = SYSCTL_OUT(req, ip6, 4766 xp->pr_ip6s * sizeof(struct in6_addr)); 4767 if (error) 4768 break; 4769 } 4770 #endif 4771 } 4772 sx_sunlock(&allprison_lock); 4773 free(xp, M_TEMP); 4774 #ifdef INET 4775 free(ip4, M_TEMP); 4776 #endif 4777 #ifdef INET6 4778 free(ip6, M_TEMP); 4779 #endif 4780 return (error); 4781 } 4782 4783 SYSCTL_OID(_security_jail, OID_AUTO, list, 4784 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 4785 sysctl_jail_list, "S", "List of active jails"); 4786 4787 static int 4788 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) 4789 { 4790 int error, injail; 4791 4792 injail = jailed(req->td->td_ucred); 4793 error = SYSCTL_OUT(req, &injail, sizeof(injail)); 4794 4795 return (error); 4796 } 4797 4798 SYSCTL_PROC(_security_jail, OID_AUTO, jailed, 4799 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 4800 sysctl_jail_jailed, "I", "Process in jail?"); 4801 4802 static int 4803 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS) 4804 { 4805 int error, havevnet; 4806 #ifdef VIMAGE 4807 struct ucred *cred = req->td->td_ucred; 4808 4809 havevnet = jailed(cred) && prison_owns_vnet(cred->cr_prison); 4810 #else 4811 havevnet = 0; 4812 #endif 4813 error = 
SYSCTL_OUT(req, &havevnet, sizeof(havevnet)); 4814 4815 return (error); 4816 } 4817 4818 SYSCTL_PROC(_security_jail, OID_AUTO, vnet, 4819 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 4820 sysctl_jail_vnet, "I", "Jail owns vnet?"); 4821 4822 #if defined(INET) || defined(INET6) 4823 SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, 4824 &jail_max_af_ips, 0, 4825 "Number of IP addresses a jail may have at most per address family (deprecated)"); 4826 #endif 4827 4828 /* 4829 * Default parameters for jail(2) compatibility. For historical reasons, 4830 * the sysctl names have varying similarity to the parameter names. Prisons 4831 * just see their own parameters, and can't change them. 4832 */ 4833 static int 4834 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS) 4835 { 4836 int error, i; 4837 4838 /* Get the current flag value, and convert it to a boolean. */ 4839 if (req->td->td_ucred->cr_prison == &prison0) { 4840 mtx_lock(&prison0.pr_mtx); 4841 i = (jail_default_allow & arg2) != 0; 4842 mtx_unlock(&prison0.pr_mtx); 4843 } else 4844 i = prison_allow(req->td->td_ucred, arg2); 4845 4846 if (arg1 != NULL) 4847 i = !i; 4848 error = sysctl_handle_int(oidp, &i, 0, req); 4849 if (error || !req->newptr) 4850 return (error); 4851 i = i ? arg2 : 0; 4852 if (arg1 != NULL) 4853 i ^= arg2; 4854 /* 4855 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0 4856 * for writing. 
4857 */ 4858 mtx_lock(&prison0.pr_mtx); 4859 jail_default_allow = (jail_default_allow & ~arg2) | i; 4860 mtx_unlock(&prison0.pr_mtx); 4861 return (0); 4862 } 4863 4864 SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed, 4865 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 4866 NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I", 4867 "Processes in jail can set their hostnames (deprecated)"); 4868 SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only, 4869 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 4870 (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I", 4871 "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)"); 4872 SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed, 4873 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 4874 NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I", 4875 "Processes in jail can use System V IPC primitives (deprecated)"); 4876 SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets, 4877 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 4878 NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I", 4879 "Prison root can create raw sockets (deprecated)"); 4880 SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed, 4881 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 4882 NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I", 4883 "Processes in jail can alter system file flags (deprecated)"); 4884 SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed, 4885 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 4886 NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I", 4887 "Processes in jail can mount/unmount jail-friendly file systems (deprecated)"); 4888 SYSCTL_PROC(_security_jail, OID_AUTO, mlock_allowed, 4889 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 4890 NULL, PR_ALLOW_MLOCK, sysctl_jail_default_allow, "I", 4891 "Processes in jail can lock/unlock physical pages in memory"); 4892 4893 static int 4894 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS) 4895 { 4896 struct prison *pr; 4897 int level, 
error; 4898 4899 pr = req->td->td_ucred->cr_prison; 4900 level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2); 4901 error = sysctl_handle_int(oidp, &level, 0, req); 4902 if (error || !req->newptr) 4903 return (error); 4904 *(int *)arg1 = level; 4905 return (0); 4906 } 4907 4908 SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs, 4909 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 4910 &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs), 4911 sysctl_jail_default_level, "I", 4912 "Processes in jail cannot see all mounted file systems (deprecated)"); 4913 4914 SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset, 4915 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 4916 &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum), 4917 sysctl_jail_default_level, "I", 4918 "Ruleset for the devfs filesystem in jail (deprecated)"); 4919 4920 SYSCTL_NODE(_security_jail, OID_AUTO, children, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 4921 "Limits and stats of child jails"); 4922 4923 static int 4924 sysctl_jail_children(SYSCTL_HANDLER_ARGS) 4925 { 4926 struct prison *pr; 4927 int i; 4928 4929 pr = req->td->td_ucred->cr_prison; 4930 4931 switch (oidp->oid_kind & CTLTYPE) { 4932 case CTLTYPE_INT: 4933 i = *(int *)((char *)pr + arg2); 4934 return (SYSCTL_OUT(req, &i, sizeof(i))); 4935 } 4936 4937 return (0); 4938 } 4939 4940 SYSCTL_PROC(_security_jail_children, OID_AUTO, max, 4941 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 4942 NULL, offsetof(struct prison, pr_childmax), sysctl_jail_children, 4943 "I", "Maximum number of child jails"); 4944 SYSCTL_PROC(_security_jail_children, OID_AUTO, cur, 4945 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 4946 NULL, offsetof(struct prison, pr_childcount), sysctl_jail_children, 4947 "I", "Current number of child jails"); 4948 4949 /* 4950 * Nodes to describe jail parameters. 
Maximum length of string parameters 4951 * is returned in the string itself, and the other parameters exist merely 4952 * to make themselves and their types known. 4953 */ 4954 SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 4955 "Jail parameters"); 4956 4957 int 4958 sysctl_jail_param(SYSCTL_HANDLER_ARGS) 4959 { 4960 int i; 4961 long l; 4962 size_t s; 4963 char numbuf[12]; 4964 4965 switch (oidp->oid_kind & CTLTYPE) 4966 { 4967 case CTLTYPE_LONG: 4968 case CTLTYPE_ULONG: 4969 l = 0; 4970 #ifdef SCTL_MASK32 4971 if (!(req->flags & SCTL_MASK32)) 4972 #endif 4973 return (SYSCTL_OUT(req, &l, sizeof(l))); 4974 case CTLTYPE_INT: 4975 case CTLTYPE_UINT: 4976 i = 0; 4977 return (SYSCTL_OUT(req, &i, sizeof(i))); 4978 case CTLTYPE_STRING: 4979 snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2); 4980 return 4981 (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); 4982 case CTLTYPE_STRUCT: 4983 s = (size_t)arg2; 4984 return (SYSCTL_OUT(req, &s, sizeof(s))); 4985 } 4986 return (0); 4987 } 4988 4989 /* 4990 * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at 4991 * jail creation time but cannot be changed in an existing jail. 
4992 */ 4993 SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); 4994 SYSCTL_JAIL_PARAM(, desc, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail descriptor"); 4995 SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); 4996 SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); 4997 SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); 4998 SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, 4999 "I", "Jail secure level"); 5000 SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", 5001 "Jail value for kern.osreldate and uname -K"); 5002 SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, 5003 "Jail value for kern.osrelease and uname -r"); 5004 SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW, 5005 "I", "Jail cannot see all mounted file systems"); 5006 SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW, 5007 "I", "Ruleset for in-jail devfs mounts"); 5008 SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, 5009 "B", "Jail persistence"); 5010 #ifdef VIMAGE 5011 SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, 5012 "E,jailsys", "Virtual network stack"); 5013 #endif 5014 SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, 5015 "B", "Jail is in the process of shutting down"); 5016 5017 SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails"); 5018 SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD, 5019 "I", "Current number of child jails"); 5020 SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW, 5021 "I", "Maximum number of child jails"); 5022 5023 SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info"); 5024 SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, 5025 "Jail hostname"); 5026 SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN, 5027 "Jail NIS domainname"); 5028 SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN, 5029 "Jail host UUID"); 5030 
SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW, 5031 "LU", "Jail host ID"); 5032 5033 SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); 5034 SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); 5035 5036 #ifdef INET 5037 SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN, 5038 "Jail IPv4 address virtualization"); 5039 SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), 5040 "S,in_addr,a", "Jail IPv4 addresses"); 5041 SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW, 5042 "B", "Do (not) use IPv4 source address selection rather than the " 5043 "primary jail IPv4 address."); 5044 #endif 5045 #ifdef INET6 5046 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN, 5047 "Jail IPv6 address virtualization"); 5048 SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), 5049 "S,in6_addr,a", "Jail IPv6 addresses"); 5050 SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW, 5051 "B", "Do (not) use IPv6 source address selection rather than the " 5052 "primary jail IPv6 address."); 5053 #endif 5054 5055 SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags"); 5056 SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW, 5057 "B", "Jail may set hostname"); 5058 SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW, 5059 "B", "Jail may use SYSV IPC"); 5060 SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW, 5061 "B", "Jail may create raw sockets"); 5062 SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW, 5063 "B", "Jail may alter system file flags"); 5064 SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW, 5065 "B", "Jail may set file quotas"); 5066 SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, 5067 "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); 5068 SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW, 5069 "B", "Jail may lock (unlock) physical pages in memory"); 5070 SYSCTL_JAIL_PARAM(_allow, reserved_ports, 
CTLTYPE_INT | CTLFLAG_RW, 5071 "B", "Jail may bind sockets to reserved ports"); 5072 SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW, 5073 "B", "Jail may read the kernel message buffer"); 5074 SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW, 5075 "B", "Unprivileged processes may use process debugging facilities"); 5076 SYSCTL_JAIL_PARAM(_allow, unprivileged_parent_tampering, 5077 CTLTYPE_INT | CTLFLAG_RW, "B", 5078 "Unprivileged parent jail processes may tamper with same-uid processes" 5079 " (signal/debug/cpuset)"); 5080 SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW, 5081 "B", "Processes in jail with uid 0 have privilege"); 5082 #ifdef VIMAGE 5083 SYSCTL_JAIL_PARAM(_allow, nfsd, CTLTYPE_INT | CTLFLAG_RW, 5084 "B", "Mountd/nfsd may run in the jail"); 5085 #endif 5086 SYSCTL_JAIL_PARAM(_allow, extattr, CTLTYPE_INT | CTLFLAG_RW, 5087 "B", "Jail may set system-level filesystem extended attributes"); 5088 SYSCTL_JAIL_PARAM(_allow, adjtime, CTLTYPE_INT | CTLFLAG_RW, 5089 "B", "Jail may adjust system time"); 5090 SYSCTL_JAIL_PARAM(_allow, settime, CTLTYPE_INT | CTLFLAG_RW, 5091 "B", "Jail may set system time"); 5092 SYSCTL_JAIL_PARAM(_allow, routing, CTLTYPE_INT | CTLFLAG_RW, 5093 "B", "Jail may modify routing table"); 5094 5095 SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags"); 5096 SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW, 5097 "B", "Jail may mount/unmount jail-friendly file systems in general"); 5098 5099 /* 5100 * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>. Return 5101 * its associated bit in the pr_allow bitmask, or zero if the parameter was 5102 * not created. 
5103 */ 5104 unsigned 5105 prison_add_allow(const char *prefix, const char *name, const char *prefix_descr, 5106 const char *descr) 5107 { 5108 struct bool_flags *bf; 5109 struct sysctl_oid *parent; 5110 char *allow_name, *allow_noname, *allowed; 5111 #ifndef NO_SYSCTL_DESCR 5112 char *descr_deprecated; 5113 #endif 5114 u_int allow_flag; 5115 5116 if (prefix 5117 ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name) 5118 < 0 || 5119 asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name) 5120 < 0 5121 : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 || 5122 asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) { 5123 free(allow_name, M_PRISON); 5124 return 0; 5125 } 5126 5127 /* 5128 * See if this parameter has already beed added, i.e. a module was 5129 * previously loaded/unloaded. 5130 */ 5131 mtx_lock(&prison0.pr_mtx); 5132 for (bf = pr_flag_allow; 5133 bf < pr_flag_allow + nitems(pr_flag_allow) && 5134 atomic_load_int(&bf->flag) != 0; 5135 bf++) { 5136 if (strcmp(bf->name, allow_name) == 0) { 5137 allow_flag = bf->flag; 5138 goto no_add; 5139 } 5140 } 5141 5142 /* 5143 * Find a free bit in pr_allow_all, failing if there are none 5144 * (which shouldn't happen as long as we keep track of how many 5145 * potential dynamic flags exist). 5146 */ 5147 for (allow_flag = 1;; allow_flag <<= 1) { 5148 if (allow_flag == 0) 5149 goto no_add; 5150 if ((pr_allow_all & allow_flag) == 0) 5151 break; 5152 } 5153 5154 /* Note the parameter in the next open slot in pr_flag_allow. */ 5155 for (bf = pr_flag_allow; ; bf++) { 5156 if (bf == pr_flag_allow + nitems(pr_flag_allow)) { 5157 /* This should never happen, but is not fatal. */ 5158 allow_flag = 0; 5159 goto no_add; 5160 } 5161 if (atomic_load_int(&bf->flag) == 0) 5162 break; 5163 } 5164 bf->name = allow_name; 5165 bf->noname = allow_noname; 5166 pr_allow_all |= allow_flag; 5167 /* 5168 * prison0 always has permission for the new parameter. 5169 * Other jails must have it granted to them. 
5170 */ 5171 prison0.pr_allow |= allow_flag; 5172 /* The flag indicates a valid entry, so make sure it is set last. */ 5173 atomic_store_rel_int(&bf->flag, allow_flag); 5174 mtx_unlock(&prison0.pr_mtx); 5175 5176 /* 5177 * Create sysctls for the parameter, and the back-compat global 5178 * permission. 5179 */ 5180 parent = prefix 5181 ? SYSCTL_ADD_NODE(NULL, 5182 SYSCTL_CHILDREN(&sysctl___security_jail_param_allow), 5183 OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr) 5184 : &sysctl___security_jail_param_allow; 5185 (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO, 5186 name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 5187 NULL, 0, sysctl_jail_param, "B", descr); 5188 if ((prefix 5189 ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name) 5190 : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) { 5191 #ifndef NO_SYSCTL_DESCR 5192 (void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)", 5193 descr); 5194 #endif 5195 (void)SYSCTL_ADD_PROC(NULL, 5196 SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed, 5197 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag, 5198 sysctl_jail_default_allow, "I", descr_deprecated); 5199 #ifndef NO_SYSCTL_DESCR 5200 free(descr_deprecated, M_TEMP); 5201 #endif 5202 free(allowed, M_TEMP); 5203 } 5204 return allow_flag; 5205 5206 no_add: 5207 mtx_unlock(&prison0.pr_mtx); 5208 free(allow_name, M_PRISON); 5209 free(allow_noname, M_PRISON); 5210 return allow_flag; 5211 } 5212 5213 /* 5214 * The VFS system will register jail-aware filesystems here. They each get 5215 * a parameter allow.mount.xxxfs and a flag to check when a jailed user 5216 * attempts to mount. 
5217 */ 5218 void 5219 prison_add_vfs(struct vfsconf *vfsp) 5220 { 5221 #ifdef NO_SYSCTL_DESCR 5222 5223 vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, 5224 NULL, NULL); 5225 #else 5226 char *descr; 5227 5228 (void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system", 5229 vfsp->vfc_name); 5230 vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, 5231 NULL, descr); 5232 free(descr, M_TEMP); 5233 #endif 5234 } 5235 5236 #ifdef RACCT 5237 void 5238 prison_racct_foreach(void (*callback)(struct racct *racct, 5239 void *arg2, void *arg3), void (*pre)(void), void (*post)(void), 5240 void *arg2, void *arg3) 5241 { 5242 struct prison_racct *prr; 5243 5244 ASSERT_RACCT_ENABLED(); 5245 5246 sx_slock(&allprison_lock); 5247 if (pre != NULL) 5248 (pre)(); 5249 LIST_FOREACH(prr, &allprison_racct, prr_next) 5250 (callback)(prr->prr_racct, arg2, arg3); 5251 if (post != NULL) 5252 (post)(); 5253 sx_sunlock(&allprison_lock); 5254 } 5255 5256 static struct prison_racct * 5257 prison_racct_find_locked(const char *name) 5258 { 5259 struct prison_racct *prr; 5260 5261 ASSERT_RACCT_ENABLED(); 5262 sx_assert(&allprison_lock, SA_XLOCKED); 5263 5264 if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN) 5265 return (NULL); 5266 5267 LIST_FOREACH(prr, &allprison_racct, prr_next) { 5268 if (strcmp(name, prr->prr_name) != 0) 5269 continue; 5270 5271 /* Found prison_racct with a matching name? */ 5272 prison_racct_hold(prr); 5273 return (prr); 5274 } 5275 5276 /* Add new prison_racct. 
*/ 5277 prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK); 5278 racct_create(&prr->prr_racct); 5279 5280 strcpy(prr->prr_name, name); 5281 refcount_init(&prr->prr_refcount, 1); 5282 LIST_INSERT_HEAD(&allprison_racct, prr, prr_next); 5283 5284 return (prr); 5285 } 5286 5287 struct prison_racct * 5288 prison_racct_find(const char *name) 5289 { 5290 struct prison_racct *prr; 5291 5292 ASSERT_RACCT_ENABLED(); 5293 5294 sx_xlock(&allprison_lock); 5295 prr = prison_racct_find_locked(name); 5296 sx_xunlock(&allprison_lock); 5297 return (prr); 5298 } 5299 5300 void 5301 prison_racct_hold(struct prison_racct *prr) 5302 { 5303 5304 ASSERT_RACCT_ENABLED(); 5305 5306 refcount_acquire(&prr->prr_refcount); 5307 } 5308 5309 static void 5310 prison_racct_free_locked(struct prison_racct *prr) 5311 { 5312 5313 ASSERT_RACCT_ENABLED(); 5314 sx_assert(&allprison_lock, SA_XLOCKED); 5315 5316 if (refcount_release(&prr->prr_refcount)) { 5317 racct_destroy(&prr->prr_racct); 5318 LIST_REMOVE(prr, prr_next); 5319 free(prr, M_PRISON_RACCT); 5320 } 5321 } 5322 5323 void 5324 prison_racct_free(struct prison_racct *prr) 5325 { 5326 5327 ASSERT_RACCT_ENABLED(); 5328 sx_assert(&allprison_lock, SA_UNLOCKED); 5329 5330 if (refcount_release_if_not_last(&prr->prr_refcount)) 5331 return; 5332 5333 sx_xlock(&allprison_lock); 5334 prison_racct_free_locked(prr); 5335 sx_xunlock(&allprison_lock); 5336 } 5337 5338 static void 5339 prison_racct_attach(struct prison *pr) 5340 { 5341 struct prison_racct *prr; 5342 5343 ASSERT_RACCT_ENABLED(); 5344 sx_assert(&allprison_lock, SA_XLOCKED); 5345 5346 prr = prison_racct_find_locked(pr->pr_name); 5347 KASSERT(prr != NULL, ("cannot find prison_racct")); 5348 5349 pr->pr_prison_racct = prr; 5350 } 5351 5352 /* 5353 * Handle jail renaming. From the racct point of view, renaming means 5354 * moving from one prison_racct to another. 
5355 */ 5356 static void 5357 prison_racct_modify(struct prison *pr) 5358 { 5359 #ifdef RCTL 5360 struct proc *p; 5361 struct ucred *cred; 5362 #endif 5363 struct prison_racct *oldprr; 5364 5365 ASSERT_RACCT_ENABLED(); 5366 5367 sx_slock(&allproc_lock); 5368 sx_xlock(&allprison_lock); 5369 5370 if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { 5371 sx_xunlock(&allprison_lock); 5372 sx_sunlock(&allproc_lock); 5373 return; 5374 } 5375 5376 oldprr = pr->pr_prison_racct; 5377 pr->pr_prison_racct = NULL; 5378 5379 prison_racct_attach(pr); 5380 5381 /* 5382 * Move resource utilisation records. 5383 */ 5384 racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct); 5385 5386 #ifdef RCTL 5387 /* 5388 * Force rctl to reattach rules to processes. 5389 */ 5390 FOREACH_PROC_IN_SYSTEM(p) { 5391 PROC_LOCK(p); 5392 cred = crhold(p->p_ucred); 5393 PROC_UNLOCK(p); 5394 rctl_proc_ucred_changed(p, cred); 5395 crfree(cred); 5396 } 5397 #endif 5398 5399 sx_sunlock(&allproc_lock); 5400 prison_racct_free_locked(oldprr); 5401 sx_xunlock(&allprison_lock); 5402 } 5403 5404 static void 5405 prison_racct_detach(struct prison *pr) 5406 { 5407 5408 ASSERT_RACCT_ENABLED(); 5409 sx_assert(&allprison_lock, SA_UNLOCKED); 5410 5411 if (pr->pr_prison_racct == NULL) 5412 return; 5413 prison_racct_free(pr->pr_prison_racct); 5414 pr->pr_prison_racct = NULL; 5415 } 5416 #endif /* RACCT */ 5417 5418 /* 5419 * Submit a knote for a prison, locking if necessary. 
5420 */ 5421 static void 5422 prison_knote(struct prison *pr, long hint) 5423 { 5424 int locked; 5425 5426 locked = mtx_owned(&pr->pr_mtx); 5427 if (!locked) 5428 mtx_lock(&pr->pr_mtx); 5429 KNOTE_LOCKED(pr->pr_klist, hint); 5430 if (!locked) 5431 mtx_unlock(&pr->pr_mtx); 5432 } 5433 5434 #ifdef DDB 5435 5436 static void 5437 db_show_prison(struct prison *pr) 5438 { 5439 struct bool_flags *bf; 5440 struct jailsys_flags *jsf; 5441 #if defined(INET) || defined(INET6) 5442 int ii; 5443 struct prison_ip *pip; 5444 #endif 5445 unsigned f; 5446 #ifdef INET 5447 char ip4buf[INET_ADDRSTRLEN]; 5448 #endif 5449 #ifdef INET6 5450 char ip6buf[INET6_ADDRSTRLEN]; 5451 #endif 5452 5453 db_printf("prison %p:\n", pr); 5454 db_printf(" jid = %d\n", pr->pr_id); 5455 db_printf(" name = %s\n", pr->pr_name); 5456 db_printf(" parent = %p\n", pr->pr_parent); 5457 db_printf(" ref = %d\n", pr->pr_ref); 5458 db_printf(" uref = %d\n", pr->pr_uref); 5459 db_printf(" state = %s\n", 5460 pr->pr_state == PRISON_STATE_ALIVE ? "alive" : 5461 pr->pr_state == PRISON_STATE_DYING ? "dying" : 5462 "invalid"); 5463 db_printf(" path = %s\n", pr->pr_path); 5464 db_printf(" cpuset = %d\n", pr->pr_cpuset 5465 ? 
pr->pr_cpuset->cs_id : -1); 5466 #ifdef VIMAGE 5467 db_printf(" vnet = %p\n", pr->pr_vnet); 5468 #endif 5469 db_printf(" root = %p\n", pr->pr_root); 5470 db_printf(" securelevel = %d\n", pr->pr_securelevel); 5471 db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum); 5472 db_printf(" children.max = %d\n", pr->pr_childmax); 5473 db_printf(" children.cur = %d\n", pr->pr_childcount); 5474 db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children)); 5475 db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling)); 5476 db_printf(" flags = 0x%x", pr->pr_flags); 5477 for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) 5478 if (pr->pr_flags & bf->flag) 5479 db_printf(" %s", bf->name); 5480 for (jsf = pr_flag_jailsys; 5481 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); 5482 jsf++) { 5483 f = pr->pr_flags & (jsf->disable | jsf->new); 5484 db_printf(" %-16s= %s\n", jsf->name, 5485 (f != 0 && f == jsf->disable) ? "disable" 5486 : (f == jsf->new) ? "new" 5487 : "inherit"); 5488 } 5489 db_printf(" allow = 0x%x", pr->pr_allow); 5490 for (bf = pr_flag_allow; 5491 bf < pr_flag_allow + nitems(pr_flag_allow) && 5492 atomic_load_int(&bf->flag) != 0; 5493 bf++) 5494 if (pr->pr_allow & bf->flag) 5495 db_printf(" %s", bf->name); 5496 db_printf("\n"); 5497 db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs); 5498 db_printf(" host.hostname = %s\n", pr->pr_hostname); 5499 db_printf(" host.domainname = %s\n", pr->pr_domainname); 5500 db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid); 5501 db_printf(" host.hostid = %lu\n", pr->pr_hostid); 5502 #ifdef INET 5503 if ((pip = pr->pr_addrs[PR_INET]) != NULL) { 5504 db_printf(" ip4s = %d\n", pip->ips); 5505 for (ii = 0; ii < pip->ips; ii++) 5506 db_printf(" %s %s\n", 5507 ii == 0 ? 
"ip4.addr =" : " ", 5508 inet_ntoa_r( 5509 *(const struct in_addr *)PR_IP(pip, PR_INET, ii), 5510 ip4buf)); 5511 } 5512 #endif 5513 #ifdef INET6 5514 if ((pip = pr->pr_addrs[PR_INET6]) != NULL) { 5515 db_printf(" ip6s = %d\n", pip->ips); 5516 for (ii = 0; ii < pip->ips; ii++) 5517 db_printf(" %s %s\n", 5518 ii == 0 ? "ip6.addr =" : " ", 5519 ip6_sprintf(ip6buf, 5520 (const struct in6_addr *)PR_IP(pip, PR_INET6, ii))); 5521 } 5522 #endif 5523 } 5524 5525 DB_SHOW_COMMAND(prison, db_show_prison_command) 5526 { 5527 struct prison *pr; 5528 5529 if (!have_addr) { 5530 /* 5531 * Show all prisons in the list, and prison0 which is not 5532 * listed. 5533 */ 5534 db_show_prison(&prison0); 5535 if (!db_pager_quit) { 5536 TAILQ_FOREACH(pr, &allprison, pr_list) { 5537 db_show_prison(pr); 5538 if (db_pager_quit) 5539 break; 5540 } 5541 } 5542 return; 5543 } 5544 5545 if (addr == 0) 5546 pr = &prison0; 5547 else { 5548 /* Look for a prison with the ID and with references. */ 5549 TAILQ_FOREACH(pr, &allprison, pr_list) 5550 if (pr->pr_id == addr && pr->pr_ref > 0) 5551 break; 5552 if (pr == NULL) 5553 /* Look again, without requiring a reference. */ 5554 TAILQ_FOREACH(pr, &allprison, pr_list) 5555 if (pr->pr_id == addr) 5556 break; 5557 if (pr == NULL) 5558 /* Assume address points to a valid prison. */ 5559 pr = (struct prison *)addr; 5560 } 5561 db_show_prison(pr); 5562 } 5563 5564 #endif /* DDB */ 5565