1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 1999 Poul-Henning Kamp. 5 * Copyright (c) 2008 Bjoern A. Zeeb. 6 * Copyright (c) 2009 James Gritton. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <sys/cdefs.h> 32 __FBSDID("$FreeBSD$"); 33 34 #include "opt_ddb.h" 35 #include "opt_inet.h" 36 #include "opt_inet6.h" 37 38 #include <sys/param.h> 39 #include <sys/types.h> 40 #include <sys/kernel.h> 41 #include <sys/systm.h> 42 #include <sys/errno.h> 43 #include <sys/sysproto.h> 44 #include <sys/malloc.h> 45 #include <sys/osd.h> 46 #include <sys/priv.h> 47 #include <sys/proc.h> 48 #include <sys/taskqueue.h> 49 #include <sys/fcntl.h> 50 #include <sys/jail.h> 51 #include <sys/linker.h> 52 #include <sys/lock.h> 53 #include <sys/mutex.h> 54 #include <sys/racct.h> 55 #include <sys/rctl.h> 56 #include <sys/refcount.h> 57 #include <sys/sx.h> 58 #include <sys/sysent.h> 59 #include <sys/namei.h> 60 #include <sys/mount.h> 61 #include <sys/queue.h> 62 #include <sys/socket.h> 63 #include <sys/syscallsubr.h> 64 #include <sys/sysctl.h> 65 #include <sys/uuid.h> 66 #include <sys/vnode.h> 67 68 #include <net/if.h> 69 #include <net/vnet.h> 70 71 #include <netinet/in.h> 72 73 #ifdef DDB 74 #include <ddb/ddb.h> 75 #endif /* DDB */ 76 77 #include <security/mac/mac_framework.h> 78 79 #define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000" 80 #define PRISON0_HOSTUUID_MODULE "hostuuid" 81 82 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); 83 static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures"); 84 85 /* Keep struct prison prison0 and some code in kern_jail_set() readable. */ 86 #ifdef INET 87 #ifdef INET6 88 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL 89 #else 90 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL 91 #endif 92 #else /* !INET */ 93 #ifdef INET6 94 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL 95 #else 96 #define _PR_IP_SADDRSEL 0 97 #endif 98 #endif 99 100 /* prison0 describes what is "real" about the system. */ 101 struct prison prison0 = { 102 .pr_id = 0, 103 .pr_name = "0", 104 .pr_ref = 1, 105 .pr_uref = 1, 106 .pr_path = "/", 107 .pr_securelevel = -1, 108 .pr_devfs_rsnum = 0, 109 .pr_childmax = JAIL_MAX, 110 .pr_hostuuid = DEFAULT_HOSTUUID, 111 .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), 112 #ifdef VIMAGE 113 .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, 114 #else 115 .pr_flags = PR_HOST|_PR_IP_SADDRSEL, 116 #endif 117 .pr_allow = PR_ALLOW_ALL_STATIC, 118 }; 119 MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF); 120 121 struct bool_flags { 122 const char *name; 123 const char *noname; 124 unsigned flag; 125 }; 126 struct jailsys_flags { 127 const char *name; 128 unsigned disable; 129 unsigned new; 130 }; 131 132 /* allprison, allprison_racct and lastprid are protected by allprison_lock. */ 133 struct sx allprison_lock; 134 SX_SYSINIT(allprison_lock, &allprison_lock, "allprison"); 135 struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); 136 LIST_HEAD(, prison_racct) allprison_racct; 137 int lastprid = 0; 138 139 static int do_jail_attach(struct thread *td, struct prison *pr); 140 static void prison_complete(void *context, int pending); 141 static void prison_deref(struct prison *pr, int flags); 142 static char *prison_path(struct prison *pr1, struct prison *pr2); 143 static void prison_remove_one(struct prison *pr); 144 #ifdef RACCT 145 static void prison_racct_attach(struct prison *pr); 146 static void prison_racct_modify(struct prison *pr); 147 static void prison_racct_detach(struct prison *pr); 148 #endif 149 150 /* Flags for prison_deref */ 151 #define PD_DEREF 0x01 152 #define PD_DEUREF 0x02 153 #define PD_LOCKED 0x04 154 #define PD_LIST_SLOCKED 0x08 155 #define PD_LIST_XLOCKED 0x10 156 157 /* 158 * Parameter names corresponding to PR_* flag values. Size values are for kvm 159 * as we cannot figure out the size of a sparse array, or an array without a 160 * terminating entry. 161 */ 162 static struct bool_flags pr_flag_bool[] = { 163 {"persist", "nopersist", PR_PERSIST}, 164 #ifdef INET 165 {"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL}, 166 #endif 167 #ifdef INET6 168 {"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL}, 169 #endif 170 }; 171 const size_t pr_flag_bool_size = sizeof(pr_flag_bool); 172 173 static struct jailsys_flags pr_flag_jailsys[] = { 174 {"host", 0, PR_HOST}, 175 #ifdef VIMAGE 176 {"vnet", 0, PR_VNET}, 177 #endif 178 #ifdef INET 179 {"ip4", PR_IP4_USER, PR_IP4_USER}, 180 #endif 181 #ifdef INET6 182 {"ip6", PR_IP6_USER, PR_IP6_USER}, 183 #endif 184 }; 185 const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys); 186 187 /* Make this array full-size so dynamic parameters can be added. */ 188 static struct bool_flags pr_flag_allow[NBBY * NBPW] = { 189 {"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME}, 190 {"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC}, 191 {"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS}, 192 {"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS}, 193 {"allow.mount", "allow.nomount", PR_ALLOW_MOUNT}, 194 {"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS}, 195 {"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF}, 196 {"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK}, 197 {"allow.reserved_ports", "allow.noreserved_ports", 198 PR_ALLOW_RESERVED_PORTS}, 199 {"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF}, 200 {"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug", 201 PR_ALLOW_UNPRIV_DEBUG}, 202 {"allow.suser", "allow.nosuser", PR_ALLOW_SUSER}, 203 }; 204 const size_t pr_flag_allow_size = sizeof(pr_flag_allow); 205 206 #define JAIL_DEFAULT_ALLOW (PR_ALLOW_SET_HOSTNAME | \ 207 PR_ALLOW_RESERVED_PORTS | \ 208 PR_ALLOW_UNPRIV_DEBUG | \ 209 PR_ALLOW_SUSER) 210 #define JAIL_DEFAULT_ENFORCE_STATFS 2 211 #define JAIL_DEFAULT_DEVFS_RSNUM 0 212 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW; 213 static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS; 214 static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM; 215 #if defined(INET) || defined(INET6) 216 static unsigned jail_max_af_ips = 255; 217 #endif 218 219 /* 220 * Initialize the parts of prison0 that can't be static-initialized with 221 * constants. This is called from proc0_init() after creating thread0 cpuset. 222 */ 223 void 224 prison0_init(void) 225 { 226 uint8_t *file, *data; 227 size_t size; 228 229 prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset); 230 prison0.pr_osreldate = osreldate; 231 strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease)); 232 233 /* If we have a preloaded hostuuid, use it. */ 234 file = preload_search_by_type(PRISON0_HOSTUUID_MODULE); 235 if (file != NULL) { 236 data = preload_fetch_addr(file); 237 size = preload_fetch_size(file); 238 if (data != NULL) { 239 /* 240 * The preloaded data may include trailing whitespace, almost 241 * certainly a newline; skip over any whitespace or 242 * non-printable characters to be safe. 243 */ 244 while (size > 0 && data[size - 1] <= 0x20) { 245 data[size--] = '\0'; 246 } 247 if (validate_uuid(data, size, NULL, 0) == 0) { 248 (void)strlcpy(prison0.pr_hostuuid, data, 249 size + 1); 250 } else if (bootverbose) { 251 printf("hostuuid: preload data malformed: '%s'", 252 data); 253 } 254 } 255 } 256 if (bootverbose) 257 printf("hostuuid: using %s\n", prison0.pr_hostuuid); 258 } 259 260 /* 261 * struct jail_args { 262 * struct jail *jail; 263 * }; 264 */ 265 int 266 sys_jail(struct thread *td, struct jail_args *uap) 267 { 268 uint32_t version; 269 int error; 270 struct jail j; 271 272 error = copyin(uap->jail, &version, sizeof(uint32_t)); 273 if (error) 274 return (error); 275 276 switch (version) { 277 case 0: 278 { 279 struct jail_v0 j0; 280 281 /* FreeBSD single IPv4 jails. */ 282 bzero(&j, sizeof(struct jail)); 283 error = copyin(uap->jail, &j0, sizeof(struct jail_v0)); 284 if (error) 285 return (error); 286 j.version = j0.version; 287 j.path = j0.path; 288 j.hostname = j0.hostname; 289 j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */ 290 break; 291 } 292 293 case 1: 294 /* 295 * Version 1 was used by multi-IPv4 jail implementations 296 * that never made it into the official kernel. 297 */ 298 return (EINVAL); 299 300 case 2: /* JAIL_API_VERSION */ 301 /* FreeBSD multi-IPv4/IPv6,noIP jails. */ 302 error = copyin(uap->jail, &j, sizeof(struct jail)); 303 if (error) 304 return (error); 305 break; 306 307 default: 308 /* Sci-Fi jails are not supported, sorry. */ 309 return (EINVAL); 310 } 311 return (kern_jail(td, &j)); 312 } 313 314 int 315 kern_jail(struct thread *td, struct jail *j) 316 { 317 struct iovec optiov[2 * (4 + nitems(pr_flag_allow) 318 #ifdef INET 319 + 1 320 #endif 321 #ifdef INET6 322 + 1 323 #endif 324 )]; 325 struct uio opt; 326 char *u_path, *u_hostname, *u_name; 327 struct bool_flags *bf; 328 #ifdef INET 329 uint32_t ip4s; 330 struct in_addr *u_ip4; 331 #endif 332 #ifdef INET6 333 struct in6_addr *u_ip6; 334 #endif 335 size_t tmplen; 336 int error, enforce_statfs; 337 338 bzero(&optiov, sizeof(optiov)); 339 opt.uio_iov = optiov; 340 opt.uio_iovcnt = 0; 341 opt.uio_offset = -1; 342 opt.uio_resid = -1; 343 opt.uio_segflg = UIO_SYSSPACE; 344 opt.uio_rw = UIO_READ; 345 opt.uio_td = td; 346 347 /* Set permissions for top-level jails from sysctls. */ 348 if (!jailed(td->td_ucred)) { 349 for (bf = pr_flag_allow; 350 bf < pr_flag_allow + nitems(pr_flag_allow) && 351 bf->flag != 0; 352 bf++) { 353 optiov[opt.uio_iovcnt].iov_base = __DECONST(char *, 354 (jail_default_allow & bf->flag) 355 ? bf->name : bf->noname); 356 optiov[opt.uio_iovcnt].iov_len = 357 strlen(optiov[opt.uio_iovcnt].iov_base) + 1; 358 opt.uio_iovcnt += 2; 359 } 360 optiov[opt.uio_iovcnt].iov_base = "enforce_statfs"; 361 optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs"); 362 opt.uio_iovcnt++; 363 enforce_statfs = jail_default_enforce_statfs; 364 optiov[opt.uio_iovcnt].iov_base = &enforce_statfs; 365 optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs); 366 opt.uio_iovcnt++; 367 } 368 369 tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; 370 #ifdef INET 371 ip4s = (j->version == 0) ? 1 : j->ip4s; 372 if (ip4s > jail_max_af_ips) 373 return (EINVAL); 374 tmplen += ip4s * sizeof(struct in_addr); 375 #else 376 if (j->ip4s > 0) 377 return (EINVAL); 378 #endif 379 #ifdef INET6 380 if (j->ip6s > jail_max_af_ips) 381 return (EINVAL); 382 tmplen += j->ip6s * sizeof(struct in6_addr); 383 #else 384 if (j->ip6s > 0) 385 return (EINVAL); 386 #endif 387 u_path = malloc(tmplen, M_TEMP, M_WAITOK); 388 u_hostname = u_path + MAXPATHLEN; 389 u_name = u_hostname + MAXHOSTNAMELEN; 390 #ifdef INET 391 u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); 392 #endif 393 #ifdef INET6 394 #ifdef INET 395 u_ip6 = (struct in6_addr *)(u_ip4 + ip4s); 396 #else 397 u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); 398 #endif 399 #endif 400 optiov[opt.uio_iovcnt].iov_base = "path"; 401 optiov[opt.uio_iovcnt].iov_len = sizeof("path"); 402 opt.uio_iovcnt++; 403 optiov[opt.uio_iovcnt].iov_base = u_path; 404 error = copyinstr(j->path, u_path, MAXPATHLEN, 405 &optiov[opt.uio_iovcnt].iov_len); 406 if (error) { 407 free(u_path, M_TEMP); 408 return (error); 409 } 410 opt.uio_iovcnt++; 411 optiov[opt.uio_iovcnt].iov_base = "host.hostname"; 412 optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname"); 413 opt.uio_iovcnt++; 414 optiov[opt.uio_iovcnt].iov_base = u_hostname; 415 error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN, 416 &optiov[opt.uio_iovcnt].iov_len); 417 if (error) { 418 free(u_path, M_TEMP); 419 return (error); 420 } 421 opt.uio_iovcnt++; 422 if (j->jailname != NULL) { 423 optiov[opt.uio_iovcnt].iov_base = "name"; 424 optiov[opt.uio_iovcnt].iov_len = sizeof("name"); 425 opt.uio_iovcnt++; 426 optiov[opt.uio_iovcnt].iov_base = u_name; 427 error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN, 428 &optiov[opt.uio_iovcnt].iov_len); 429 if (error) { 430 free(u_path, M_TEMP); 431 return (error); 432 } 433 opt.uio_iovcnt++; 434 } 435 #ifdef INET 436 optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; 437 optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); 438 opt.uio_iovcnt++; 439 optiov[opt.uio_iovcnt].iov_base = u_ip4; 440 optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr); 441 if (j->version == 0) 442 u_ip4->s_addr = j->ip4s; 443 else { 444 error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); 445 if (error) { 446 free(u_path, M_TEMP); 447 return (error); 448 } 449 } 450 opt.uio_iovcnt++; 451 #endif 452 #ifdef INET6 453 optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; 454 optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr"); 455 opt.uio_iovcnt++; 456 optiov[opt.uio_iovcnt].iov_base = u_ip6; 457 optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr); 458 error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); 459 if (error) { 460 free(u_path, M_TEMP); 461 return (error); 462 } 463 opt.uio_iovcnt++; 464 #endif 465 KASSERT(opt.uio_iovcnt <= nitems(optiov), 466 ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt)); 467 error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); 468 free(u_path, M_TEMP); 469 return (error); 470 } 471 472 /* 473 * struct jail_set_args { 474 * struct iovec *iovp; 475 * unsigned int iovcnt; 476 * int flags; 477 * }; 478 */ 479 int 480 sys_jail_set(struct thread *td, struct jail_set_args *uap) 481 { 482 struct uio *auio; 483 int error; 484 485 /* Check that we have an even number of iovecs. */ 486 if (uap->iovcnt & 1) 487 return (EINVAL); 488 489 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 490 if (error) 491 return (error); 492 error = kern_jail_set(td, auio, uap->flags); 493 free(auio, M_IOV); 494 return (error); 495 } 496 497 int 498 kern_jail_set(struct thread *td, struct uio *optuio, int flags) 499 { 500 struct nameidata nd; 501 #ifdef INET 502 struct in_addr *ip4; 503 #endif 504 #ifdef INET6 505 struct in6_addr *ip6; 506 #endif 507 struct vfsopt *opt; 508 struct vfsoptlist *opts; 509 struct prison *pr, *deadpr, *mypr, *ppr, *tpr; 510 struct vnode *root; 511 char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; 512 char *g_path, *osrelstr; 513 struct bool_flags *bf; 514 struct jailsys_flags *jsf; 515 #if defined(INET) || defined(INET6) 516 struct prison *tppr; 517 void *op; 518 #endif 519 unsigned long hid; 520 size_t namelen, onamelen, pnamelen; 521 int born, created, cuflags, descend, enforce; 522 int error, errmsg_len, errmsg_pos; 523 int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; 524 int jid, jsys, len, level; 525 int childmax, osreldt, rsnum, slevel; 526 #if defined(INET) || defined(INET6) 527 int ii, ij; 528 #endif 529 #ifdef INET 530 int ip4s, redo_ip4; 531 #endif 532 #ifdef INET6 533 int ip6s, redo_ip6; 534 #endif 535 uint64_t pr_allow, ch_allow, pr_flags, ch_flags; 536 uint64_t pr_allow_diff; 537 unsigned tallow; 538 char numbuf[12]; 539 540 error = priv_check(td, PRIV_JAIL_SET); 541 if (!error && (flags & JAIL_ATTACH)) 542 error = priv_check(td, PRIV_JAIL_ATTACH); 543 if (error) 544 return (error); 545 mypr = td->td_ucred->cr_prison; 546 if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) 547 return (EPERM); 548 if (flags & ~JAIL_SET_MASK) 549 return (EINVAL); 550 551 /* 552 * Check all the parameters before committing to anything. Not all 553 * errors can be caught early, but we may as well try. Also, this 554 * takes care of some expensive stuff (path lookup) before getting 555 * the allprison lock. 556 * 557 * XXX Jails are not filesystems, and jail parameters are not mount 558 * options. But it makes more sense to re-use the vfsopt code 559 * than duplicate it under a different name. 560 */ 561 error = vfs_buildopts(optuio, &opts); 562 if (error) 563 return (error); 564 #ifdef INET 565 ip4 = NULL; 566 #endif 567 #ifdef INET6 568 ip6 = NULL; 569 #endif 570 g_path = NULL; 571 572 cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); 573 if (!cuflags) { 574 error = EINVAL; 575 vfs_opterror(opts, "no valid operation (create or update)"); 576 goto done_errmsg; 577 } 578 579 error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); 580 if (error == ENOENT) 581 jid = 0; 582 else if (error != 0) 583 goto done_free; 584 585 error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel)); 586 if (error == ENOENT) 587 gotslevel = 0; 588 else if (error != 0) 589 goto done_free; 590 else 591 gotslevel = 1; 592 593 error = 594 vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax)); 595 if (error == ENOENT) 596 gotchildmax = 0; 597 else if (error != 0) 598 goto done_free; 599 else 600 gotchildmax = 1; 601 602 error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); 603 if (error == ENOENT) 604 gotenforce = 0; 605 else if (error != 0) 606 goto done_free; 607 else if (enforce < 0 || enforce > 2) { 608 error = EINVAL; 609 goto done_free; 610 } else 611 gotenforce = 1; 612 613 error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum)); 614 if (error == ENOENT) 615 gotrsnum = 0; 616 else if (error != 0) 617 goto done_free; 618 else 619 gotrsnum = 1; 620 621 pr_flags = ch_flags = 0; 622 for (bf = pr_flag_bool; 623 bf < pr_flag_bool + nitems(pr_flag_bool); 624 bf++) { 625 vfs_flagopt(opts, bf->name, &pr_flags, bf->flag); 626 vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag); 627 } 628 ch_flags |= pr_flags; 629 for (jsf = pr_flag_jailsys; 630 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); 631 jsf++) { 632 error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys)); 633 if (error == ENOENT) 634 continue; 635 if (error != 0) 636 goto done_free; 637 switch (jsys) { 638 case JAIL_SYS_DISABLE: 639 if (!jsf->disable) { 640 error = EINVAL; 641 goto done_free; 642 } 643 pr_flags |= jsf->disable; 644 break; 645 case JAIL_SYS_NEW: 646 pr_flags |= jsf->new; 647 break; 648 case JAIL_SYS_INHERIT: 649 break; 650 default: 651 error = EINVAL; 652 goto done_free; 653 } 654 ch_flags |= jsf->new | jsf->disable; 655 } 656 if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE 657 && !(pr_flags & PR_PERSIST)) { 658 error = EINVAL; 659 vfs_opterror(opts, "new jail must persist or attach"); 660 goto done_errmsg; 661 } 662 #ifdef VIMAGE 663 if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) { 664 error = EINVAL; 665 vfs_opterror(opts, "vnet cannot be changed after creation"); 666 goto done_errmsg; 667 } 668 #endif 669 #ifdef INET 670 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { 671 error = EINVAL; 672 vfs_opterror(opts, "ip4 cannot be changed after creation"); 673 goto done_errmsg; 674 } 675 #endif 676 #ifdef INET6 677 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) { 678 error = EINVAL; 679 vfs_opterror(opts, "ip6 cannot be changed after creation"); 680 goto done_errmsg; 681 } 682 #endif 683 684 pr_allow = ch_allow = 0; 685 for (bf = pr_flag_allow; 686 bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0; 687 bf++) { 688 vfs_flagopt(opts, bf->name, &pr_allow, bf->flag); 689 vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag); 690 } 691 ch_allow |= pr_allow; 692 693 error = vfs_getopt(opts, "name", (void **)&name, &len); 694 if (error == ENOENT) 695 name = NULL; 696 else if (error != 0) 697 goto done_free; 698 else { 699 if (len == 0 || name[len - 1] != '\0') { 700 error = EINVAL; 701 goto done_free; 702 } 703 if (len > MAXHOSTNAMELEN) { 704 error = ENAMETOOLONG; 705 goto done_free; 706 } 707 } 708 709 error = vfs_getopt(opts, "host.hostname", (void **)&host, &len); 710 if (error == ENOENT) 711 host = NULL; 712 else if (error != 0) 713 goto done_free; 714 else { 715 ch_flags |= PR_HOST; 716 pr_flags |= PR_HOST; 717 if (len == 0 || host[len - 1] != '\0') { 718 error = EINVAL; 719 goto done_free; 720 } 721 if (len > MAXHOSTNAMELEN) { 722 error = ENAMETOOLONG; 723 goto done_free; 724 } 725 } 726 727 error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len); 728 if (error == ENOENT) 729 domain = NULL; 730 else if (error != 0) 731 goto done_free; 732 else { 733 ch_flags |= PR_HOST; 734 pr_flags |= PR_HOST; 735 if (len == 0 || domain[len - 1] != '\0') { 736 error = EINVAL; 737 goto done_free; 738 } 739 if (len > MAXHOSTNAMELEN) { 740 error = ENAMETOOLONG; 741 goto done_free; 742 } 743 } 744 745 error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len); 746 if (error == ENOENT) 747 uuid = NULL; 748 else if (error != 0) 749 goto done_free; 750 else { 751 ch_flags |= PR_HOST; 752 pr_flags |= PR_HOST; 753 if (len == 0 || uuid[len - 1] != '\0') { 754 error = EINVAL; 755 goto done_free; 756 } 757 if (len > HOSTUUIDLEN) { 758 error = ENAMETOOLONG; 759 goto done_free; 760 } 761 } 762 763 #ifdef COMPAT_FREEBSD32 764 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { 765 uint32_t hid32; 766 767 error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32)); 768 hid = hid32; 769 } else 770 #endif 771 error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid)); 772 if (error == ENOENT) 773 gothid = 0; 774 else if (error != 0) 775 goto done_free; 776 else { 777 gothid = 1; 778 ch_flags |= PR_HOST; 779 pr_flags |= PR_HOST; 780 } 781 782 #ifdef INET 783 error = vfs_getopt(opts, "ip4.addr", &op, &ip4s); 784 if (error == ENOENT) 785 ip4s = 0; 786 else if (error != 0) 787 goto done_free; 788 else if (ip4s & (sizeof(*ip4) - 1)) { 789 error = EINVAL; 790 goto done_free; 791 } else { 792 ch_flags |= PR_IP4_USER; 793 pr_flags |= PR_IP4_USER; 794 if (ip4s > 0) { 795 ip4s /= sizeof(*ip4); 796 if (ip4s > jail_max_af_ips) { 797 error = EINVAL; 798 vfs_opterror(opts, "too many IPv4 addresses"); 799 goto done_errmsg; 800 } 801 ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); 802 bcopy(op, ip4, ip4s * sizeof(*ip4)); 803 /* 804 * IP addresses are all sorted but ip[0] to preserve 805 * the primary IP address as given from userland. 806 * This special IP is used for unbound outgoing 807 * connections as well for "loopback" traffic in case 808 * source address selection cannot find any more fitting 809 * address to connect from. 810 */ 811 if (ip4s > 1) 812 qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), 813 prison_qcmp_v4); 814 /* 815 * Check for duplicate addresses and do some simple 816 * zero and broadcast checks. If users give other bogus 817 * addresses it is their problem. 818 * 819 * We do not have to care about byte order for these 820 * checks so we will do them in NBO. 821 */ 822 for (ii = 0; ii < ip4s; ii++) { 823 if (ip4[ii].s_addr == INADDR_ANY || 824 ip4[ii].s_addr == INADDR_BROADCAST) { 825 error = EINVAL; 826 goto done_free; 827 } 828 if ((ii+1) < ip4s && 829 (ip4[0].s_addr == ip4[ii+1].s_addr || 830 ip4[ii].s_addr == ip4[ii+1].s_addr)) { 831 error = EINVAL; 832 goto done_free; 833 } 834 } 835 } 836 } 837 #endif 838 839 #ifdef INET6 840 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s); 841 if (error == ENOENT) 842 ip6s = 0; 843 else if (error != 0) 844 goto done_free; 845 else if (ip6s & (sizeof(*ip6) - 1)) { 846 error = EINVAL; 847 goto done_free; 848 } else { 849 ch_flags |= PR_IP6_USER; 850 pr_flags |= PR_IP6_USER; 851 if (ip6s > 0) { 852 ip6s /= sizeof(*ip6); 853 if (ip6s > jail_max_af_ips) { 854 error = EINVAL; 855 vfs_opterror(opts, "too many IPv6 addresses"); 856 goto done_errmsg; 857 } 858 ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); 859 bcopy(op, ip6, ip6s * sizeof(*ip6)); 860 if (ip6s > 1) 861 qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), 862 prison_qcmp_v6); 863 for (ii = 0; ii < ip6s; ii++) { 864 if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) { 865 error = EINVAL; 866 goto done_free; 867 } 868 if ((ii+1) < ip6s && 869 (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) || 870 IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1]))) 871 { 872 error = EINVAL; 873 goto done_free; 874 } 875 } 876 } 877 } 878 #endif 879 880 #if defined(VIMAGE) && (defined(INET) || defined(INET6)) 881 if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { 882 error = EINVAL; 883 vfs_opterror(opts, 884 "vnet jails cannot have IP address restrictions"); 885 goto done_errmsg; 886 } 887 #endif 888 889 error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len); 890 if (error == ENOENT) 891 osrelstr = NULL; 892 else if (error != 0) 893 goto done_free; 894 else { 895 if (flags & JAIL_UPDATE) { 896 error = EINVAL; 897 vfs_opterror(opts, 898 "osrelease cannot be changed after creation"); 899 goto done_errmsg; 900 } 901 if (len == 0 || osrelstr[len - 1] != '\0') { 902 error = EINVAL; 903 goto done_free; 904 } 905 if (len >= OSRELEASELEN) { 906 error = ENAMETOOLONG; 907 vfs_opterror(opts, 908 "osrelease string must be 1-%d bytes long", 909 OSRELEASELEN - 1); 910 goto done_errmsg; 911 } 912 } 913 914 error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt)); 915 if (error == ENOENT) 916 osreldt = 0; 917 else if (error != 0) 918 goto done_free; 919 else { 920 if (flags & JAIL_UPDATE) { 921 error = EINVAL; 922 vfs_opterror(opts, 923 "osreldate cannot be changed after creation"); 924 goto done_errmsg; 925 } 926 if (osreldt == 0) { 927 error = EINVAL; 928 vfs_opterror(opts, "osreldate cannot be 0"); 929 goto done_errmsg; 930 } 931 } 932 933 root = NULL; 934 error = vfs_getopt(opts, "path", (void **)&path, &len); 935 if (error == ENOENT) 936 path = NULL; 937 else if (error != 0) 938 goto done_free; 939 else { 940 if (flags & JAIL_UPDATE) { 941 error = EINVAL; 942 vfs_opterror(opts, 943 "path cannot be changed after creation"); 944 goto done_errmsg; 945 } 946 if (len == 0 || path[len - 1] != '\0') { 947 error = EINVAL; 948 goto done_free; 949 } 950 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, 951 path, td); 952 error = namei(&nd); 953 if (error) 954 goto done_free; 955 root = nd.ni_vp; 956 NDFREE(&nd, NDF_ONLY_PNBUF); 957 g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); 958 strlcpy(g_path, path, MAXPATHLEN); 959 error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN); 960 if (error == 0) { 961 path = g_path; 962 } else { 963 /* exit on other errors */ 964 goto done_free; 965 } 966 if (root->v_type != VDIR) { 967 error = ENOTDIR; 968 vput(root); 969 goto done_free; 970 } 971 VOP_UNLOCK(root); 972 } 973 974 /* 975 * Find the specified jail, or at least its parent. 976 * This abuses the file error codes ENOENT and EEXIST. 977 */ 978 pr = NULL; 979 ppr = mypr; 980 if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { 981 namelc = strrchr(name, '.'); 982 jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); 983 if (*p != '\0') 984 jid = 0; 985 } 986 sx_xlock(&allprison_lock); 987 if (jid != 0) { 988 /* 989 * See if a requested jid already exists. There is an 990 * information leak here if the jid exists but is not within 991 * the caller's jail hierarchy. Jail creators will get EEXIST 992 * even though they cannot see the jail, and CREATE | UPDATE 993 * will return ENOENT which is not normally a valid error. 994 */ 995 if (jid < 0) { 996 error = EINVAL; 997 vfs_opterror(opts, "negative jid"); 998 goto done_unlock_list; 999 } 1000 pr = prison_find(jid); 1001 if (pr != NULL) { 1002 ppr = pr->pr_parent; 1003 /* Create: jid must not exist. */ 1004 if (cuflags == JAIL_CREATE) { 1005 mtx_unlock(&pr->pr_mtx); 1006 error = EEXIST; 1007 vfs_opterror(opts, "jail %d already exists", 1008 jid); 1009 goto done_unlock_list; 1010 } 1011 if (!prison_ischild(mypr, pr)) { 1012 mtx_unlock(&pr->pr_mtx); 1013 pr = NULL; 1014 } else if (pr->pr_uref == 0) { 1015 if (!(flags & JAIL_DYING)) { 1016 mtx_unlock(&pr->pr_mtx); 1017 error = ENOENT; 1018 vfs_opterror(opts, "jail %d is dying", 1019 jid); 1020 goto done_unlock_list; 1021 } else if ((flags & JAIL_ATTACH) || 1022 (pr_flags & PR_PERSIST)) { 1023 /* 1024 * A dying jail might be resurrected 1025 * (via attach or persist), but first 1026 * it must determine if another jail 1027 * has claimed its name. Accomplish 1028 * this by implicitly re-setting the 1029 * name. 1030 */ 1031 if (name == NULL) 1032 name = prison_name(mypr, pr); 1033 } 1034 } 1035 } 1036 if (pr == NULL) { 1037 /* Update: jid must exist. */ 1038 if (cuflags == JAIL_UPDATE) { 1039 error = ENOENT; 1040 vfs_opterror(opts, "jail %d not found", jid); 1041 goto done_unlock_list; 1042 } 1043 } 1044 } 1045 /* 1046 * If the caller provided a name, look for a jail by that name. 1047 * This has different semantics for creates and updates keyed by jid 1048 * (where the name must not already exist in a different jail), 1049 * and updates keyed by the name itself (where the name must exist 1050 * because that is the jail being updated). 1051 */ 1052 namelc = NULL; 1053 if (name != NULL) { 1054 namelc = strrchr(name, '.'); 1055 if (namelc == NULL) 1056 namelc = name; 1057 else { 1058 /* 1059 * This is a hierarchical name. Split it into the 1060 * parent and child names, and make sure the parent 1061 * exists or matches an already found jail. 1062 */ 1063 if (pr != NULL) { 1064 if (strncmp(name, ppr->pr_name, namelc - name) 1065 || ppr->pr_name[namelc - name] != '\0') { 1066 mtx_unlock(&pr->pr_mtx); 1067 error = EINVAL; 1068 vfs_opterror(opts, 1069 "cannot change jail's parent"); 1070 goto done_unlock_list; 1071 } 1072 } else { 1073 *namelc = '\0'; 1074 ppr = prison_find_name(mypr, name); 1075 if (ppr == NULL) { 1076 error = ENOENT; 1077 vfs_opterror(opts, 1078 "jail \"%s\" not found", name); 1079 goto done_unlock_list; 1080 } 1081 mtx_unlock(&ppr->pr_mtx); 1082 *namelc = '.'; 1083 } 1084 namelc++; 1085 } 1086 if (namelc[0] != '\0') { 1087 pnamelen = 1088 (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; 1089 name_again: 1090 deadpr = NULL; 1091 FOREACH_PRISON_CHILD(ppr, tpr) { 1092 if (tpr != pr && tpr->pr_ref > 0 && 1093 !strcmp(tpr->pr_name + pnamelen, namelc)) { 1094 if (pr == NULL && 1095 cuflags != JAIL_CREATE) { 1096 mtx_lock(&tpr->pr_mtx); 1097 if (tpr->pr_ref > 0) { 1098 /* 1099 * Use this jail 1100 * for updates. 1101 */ 1102 if (tpr->pr_uref > 0) { 1103 pr = tpr; 1104 break; 1105 } 1106 deadpr = tpr; 1107 } 1108 mtx_unlock(&tpr->pr_mtx); 1109 } else if (tpr->pr_uref > 0) { 1110 /* 1111 * Create, or update(jid): 1112 * name must not exist in an 1113 * active sibling jail. 1114 */ 1115 error = EEXIST; 1116 if (pr != NULL) 1117 mtx_unlock(&pr->pr_mtx); 1118 vfs_opterror(opts, 1119 "jail \"%s\" already exists", 1120 name); 1121 goto done_unlock_list; 1122 } 1123 } 1124 } 1125 /* If no active jail is found, use a dying one. */ 1126 if (deadpr != NULL && pr == NULL) { 1127 if (flags & JAIL_DYING) { 1128 mtx_lock(&deadpr->pr_mtx); 1129 if (deadpr->pr_ref == 0) { 1130 mtx_unlock(&deadpr->pr_mtx); 1131 goto name_again; 1132 } 1133 pr = deadpr; 1134 } else if (cuflags == JAIL_UPDATE) { 1135 error = ENOENT; 1136 vfs_opterror(opts, 1137 "jail \"%s\" is dying", name); 1138 goto done_unlock_list; 1139 } 1140 } 1141 /* Update: name must exist if no jid. */ 1142 else if (cuflags == JAIL_UPDATE && pr == NULL) { 1143 error = ENOENT; 1144 vfs_opterror(opts, "jail \"%s\" not found", 1145 name); 1146 goto done_unlock_list; 1147 } 1148 } 1149 } 1150 /* Update: must provide a jid or name. */ 1151 else if (cuflags == JAIL_UPDATE && pr == NULL) { 1152 error = ENOENT; 1153 vfs_opterror(opts, "update specified no jail"); 1154 goto done_unlock_list; 1155 } 1156 1157 /* If there's no prison to update, create a new one and link it in. */ 1158 if (pr == NULL) { 1159 for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent) 1160 if (tpr->pr_childcount >= tpr->pr_childmax) { 1161 error = EPERM; 1162 vfs_opterror(opts, "prison limit exceeded"); 1163 goto done_unlock_list; 1164 } 1165 created = 1; 1166 mtx_lock(&ppr->pr_mtx); 1167 if (ppr->pr_ref == 0) { 1168 mtx_unlock(&ppr->pr_mtx); 1169 error = ENOENT; 1170 vfs_opterror(opts, "jail \"%s\" not found", 1171 prison_name(mypr, ppr)); 1172 goto done_unlock_list; 1173 } 1174 ppr->pr_ref++; 1175 ppr->pr_uref++; 1176 mtx_unlock(&ppr->pr_mtx); 1177 pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); 1178 if (jid == 0) { 1179 /* Find the next free jid. */ 1180 jid = lastprid + 1; 1181 findnext: 1182 if (jid == JAIL_MAX) 1183 jid = 1; 1184 TAILQ_FOREACH(tpr, &allprison, pr_list) { 1185 if (tpr->pr_id < jid) 1186 continue; 1187 if (tpr->pr_id > jid || tpr->pr_ref == 0) { 1188 TAILQ_INSERT_BEFORE(tpr, pr, pr_list); 1189 break; 1190 } 1191 if (jid == lastprid) { 1192 error = EAGAIN; 1193 vfs_opterror(opts, 1194 "no available jail IDs"); 1195 free(pr, M_PRISON); 1196 prison_deref(ppr, PD_DEREF | 1197 PD_DEUREF | PD_LIST_XLOCKED); 1198 goto done_releroot; 1199 } 1200 jid++; 1201 goto findnext; 1202 } 1203 lastprid = jid; 1204 } else { 1205 /* 1206 * The jail already has a jid (that did not yet exist), 1207 * so just find where to insert it. 1208 */ 1209 TAILQ_FOREACH(tpr, &allprison, pr_list) 1210 if (tpr->pr_id >= jid) { 1211 TAILQ_INSERT_BEFORE(tpr, pr, pr_list); 1212 break; 1213 } 1214 } 1215 if (tpr == NULL) 1216 TAILQ_INSERT_TAIL(&allprison, pr, pr_list); 1217 LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); 1218 for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) 1219 tpr->pr_childcount++; 1220 1221 pr->pr_parent = ppr; 1222 pr->pr_id = jid; 1223 1224 /* Set some default values, and inherit some from the parent. */ 1225 if (namelc == NULL) 1226 namelc = ""; 1227 if (path == NULL) { 1228 path = "/"; 1229 root = mypr->pr_root; 1230 vref(root); 1231 } 1232 strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN); 1233 pr->pr_flags |= PR_HOST; 1234 #if defined(INET) || defined(INET6) 1235 #ifdef VIMAGE 1236 if (!(pr_flags & PR_VNET)) 1237 #endif 1238 { 1239 #ifdef INET 1240 if (!(ch_flags & PR_IP4_USER)) 1241 pr->pr_flags |= PR_IP4 | PR_IP4_USER; 1242 else if (!(pr_flags & PR_IP4_USER)) { 1243 pr->pr_flags |= ppr->pr_flags & PR_IP4; 1244 if (ppr->pr_ip4 != NULL) { 1245 pr->pr_ip4s = ppr->pr_ip4s; 1246 pr->pr_ip4 = malloc(pr->pr_ip4s * 1247 sizeof(struct in_addr), M_PRISON, 1248 M_WAITOK); 1249 bcopy(ppr->pr_ip4, pr->pr_ip4, 1250 pr->pr_ip4s * sizeof(*pr->pr_ip4)); 1251 } 1252 } 1253 #endif 1254 #ifdef INET6 1255 if (!(ch_flags & PR_IP6_USER)) 1256 pr->pr_flags |= PR_IP6 | PR_IP6_USER; 1257 else if (!(pr_flags & PR_IP6_USER)) { 1258 pr->pr_flags |= ppr->pr_flags & PR_IP6; 1259 if (ppr->pr_ip6 != NULL) { 1260 pr->pr_ip6s = ppr->pr_ip6s; 1261 pr->pr_ip6 = malloc(pr->pr_ip6s * 1262 sizeof(struct in6_addr), M_PRISON, 1263 M_WAITOK); 1264 bcopy(ppr->pr_ip6, pr->pr_ip6, 1265 pr->pr_ip6s * sizeof(*pr->pr_ip6)); 1266 } 1267 } 1268 #endif 1269 } 1270 #endif 1271 /* Source address selection is always on by default. */ 1272 pr->pr_flags |= _PR_IP_SADDRSEL; 1273 1274 pr->pr_securelevel = ppr->pr_securelevel; 1275 pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow; 1276 pr->pr_enforce_statfs = jail_default_enforce_statfs; 1277 pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum; 1278 1279 pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate; 1280 if (osrelstr == NULL) 1281 strlcpy(pr->pr_osrelease, ppr->pr_osrelease, 1282 sizeof(pr->pr_osrelease)); 1283 else 1284 strlcpy(pr->pr_osrelease, osrelstr, 1285 sizeof(pr->pr_osrelease)); 1286 1287 LIST_INIT(&pr->pr_children); 1288 mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); 1289 TASK_INIT(&pr->pr_task, 0, prison_complete, pr); 1290 1291 #ifdef VIMAGE 1292 /* Allocate a new vnet if specified. */ 1293 pr->pr_vnet = (pr_flags & PR_VNET) 1294 ? vnet_alloc() : ppr->pr_vnet; 1295 #endif 1296 /* 1297 * Allocate a dedicated cpuset for each jail. 1298 * Unlike other initial settings, this may return an erorr. 1299 */ 1300 error = cpuset_create_root(ppr, &pr->pr_cpuset); 1301 if (error) { 1302 prison_deref(pr, PD_LIST_XLOCKED); 1303 goto done_releroot; 1304 } 1305 1306 mtx_lock(&pr->pr_mtx); 1307 /* 1308 * New prisons do not yet have a reference, because we do not 1309 * want others to see the incomplete prison once the 1310 * allprison_lock is downgraded. 1311 */ 1312 } else { 1313 created = 0; 1314 /* 1315 * Grab a reference for existing prisons, to ensure they 1316 * continue to exist for the duration of the call. 1317 */ 1318 pr->pr_ref++; 1319 #if defined(VIMAGE) && (defined(INET) || defined(INET6)) 1320 if ((pr->pr_flags & PR_VNET) && 1321 (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { 1322 error = EINVAL; 1323 vfs_opterror(opts, 1324 "vnet jails cannot have IP address restrictions"); 1325 goto done_deref_locked; 1326 } 1327 #endif 1328 #ifdef INET 1329 if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { 1330 error = EINVAL; 1331 vfs_opterror(opts, 1332 "ip4 cannot be changed after creation"); 1333 goto done_deref_locked; 1334 } 1335 #endif 1336 #ifdef INET6 1337 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { 1338 error = EINVAL; 1339 vfs_opterror(opts, 1340 "ip6 cannot be changed after creation"); 1341 goto done_deref_locked; 1342 } 1343 #endif 1344 } 1345 1346 /* Do final error checking before setting anything. */ 1347 if (gotslevel) { 1348 if (slevel < ppr->pr_securelevel) { 1349 error = EPERM; 1350 goto done_deref_locked; 1351 } 1352 } 1353 if (gotchildmax) { 1354 if (childmax >= ppr->pr_childmax) { 1355 error = EPERM; 1356 goto done_deref_locked; 1357 } 1358 } 1359 if (gotenforce) { 1360 if (enforce < ppr->pr_enforce_statfs) { 1361 error = EPERM; 1362 goto done_deref_locked; 1363 } 1364 } 1365 if (gotrsnum) { 1366 /* 1367 * devfs_rsnum is a uint16_t 1368 */ 1369 if (rsnum < 0 || rsnum > 65535) { 1370 error = EINVAL; 1371 goto done_deref_locked; 1372 } 1373 /* 1374 * Nested jails always inherit parent's devfs ruleset 1375 */ 1376 if (jailed(td->td_ucred)) { 1377 if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) { 1378 error = EPERM; 1379 goto done_deref_locked; 1380 } else 1381 rsnum = ppr->pr_devfs_rsnum; 1382 } 1383 } 1384 #ifdef INET 1385 if (ip4s > 0) { 1386 if (ppr->pr_flags & PR_IP4) { 1387 /* 1388 * Make sure the new set of IP addresses is a 1389 * subset of the parent's list. Don't worry 1390 * about the parent being unlocked, as any 1391 * setting is done with allprison_lock held. 1392 */ 1393 for (ij = 0; ij < ppr->pr_ip4s; ij++) 1394 if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) 1395 break; 1396 if (ij == ppr->pr_ip4s) { 1397 error = EPERM; 1398 goto done_deref_locked; 1399 } 1400 if (ip4s > 1) { 1401 for (ii = ij = 1; ii < ip4s; ii++) { 1402 if (ip4[ii].s_addr == 1403 ppr->pr_ip4[0].s_addr) 1404 continue; 1405 for (; ij < ppr->pr_ip4s; ij++) 1406 if (ip4[ii].s_addr == 1407 ppr->pr_ip4[ij].s_addr) 1408 break; 1409 if (ij == ppr->pr_ip4s) 1410 break; 1411 } 1412 if (ij == ppr->pr_ip4s) { 1413 error = EPERM; 1414 goto done_deref_locked; 1415 } 1416 } 1417 } 1418 /* 1419 * Check for conflicting IP addresses. We permit them 1420 * if there is no more than one IP on each jail. If 1421 * there is a duplicate on a jail with more than one 1422 * IP stop checking and return error. 1423 */ 1424 #ifdef VIMAGE 1425 for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent) 1426 if (tppr->pr_flags & PR_VNET) 1427 break; 1428 #else 1429 tppr = &prison0; 1430 #endif 1431 FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { 1432 if (tpr == pr || 1433 #ifdef VIMAGE 1434 (tpr != tppr && (tpr->pr_flags & PR_VNET)) || 1435 #endif 1436 tpr->pr_uref == 0) { 1437 descend = 0; 1438 continue; 1439 } 1440 if (!(tpr->pr_flags & PR_IP4_USER)) 1441 continue; 1442 descend = 0; 1443 if (tpr->pr_ip4 == NULL || 1444 (ip4s == 1 && tpr->pr_ip4s == 1)) 1445 continue; 1446 for (ii = 0; ii < ip4s; ii++) { 1447 if (prison_check_ip4_locked(tpr, &ip4[ii]) == 1448 0) { 1449 error = EADDRINUSE; 1450 vfs_opterror(opts, 1451 "IPv4 addresses clash"); 1452 goto done_deref_locked; 1453 } 1454 } 1455 } 1456 } 1457 #endif 1458 #ifdef INET6 1459 if (ip6s > 0) { 1460 if (ppr->pr_flags & PR_IP6) { 1461 /* 1462 * Make sure the new set of IP addresses is a 1463 * subset of the parent's list. 1464 */ 1465 for (ij = 0; ij < ppr->pr_ip6s; ij++) 1466 if (IN6_ARE_ADDR_EQUAL(&ip6[0], 1467 &ppr->pr_ip6[ij])) 1468 break; 1469 if (ij == ppr->pr_ip6s) { 1470 error = EPERM; 1471 goto done_deref_locked; 1472 } 1473 if (ip6s > 1) { 1474 for (ii = ij = 1; ii < ip6s; ii++) { 1475 if (IN6_ARE_ADDR_EQUAL(&ip6[ii], 1476 &ppr->pr_ip6[0])) 1477 continue; 1478 for (; ij < ppr->pr_ip6s; ij++) 1479 if (IN6_ARE_ADDR_EQUAL( 1480 &ip6[ii], &ppr->pr_ip6[ij])) 1481 break; 1482 if (ij == ppr->pr_ip6s) 1483 break; 1484 } 1485 if (ij == ppr->pr_ip6s) { 1486 error = EPERM; 1487 goto done_deref_locked; 1488 } 1489 } 1490 } 1491 /* Check for conflicting IP addresses. */ 1492 #ifdef VIMAGE 1493 for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent) 1494 if (tppr->pr_flags & PR_VNET) 1495 break; 1496 #else 1497 tppr = &prison0; 1498 #endif 1499 FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { 1500 if (tpr == pr || 1501 #ifdef VIMAGE 1502 (tpr != tppr && (tpr->pr_flags & PR_VNET)) || 1503 #endif 1504 tpr->pr_uref == 0) { 1505 descend = 0; 1506 continue; 1507 } 1508 if (!(tpr->pr_flags & PR_IP6_USER)) 1509 continue; 1510 descend = 0; 1511 if (tpr->pr_ip6 == NULL || 1512 (ip6s == 1 && tpr->pr_ip6s == 1)) 1513 continue; 1514 for (ii = 0; ii < ip6s; ii++) { 1515 if (prison_check_ip6_locked(tpr, &ip6[ii]) == 1516 0) { 1517 error = EADDRINUSE; 1518 vfs_opterror(opts, 1519 "IPv6 addresses clash"); 1520 goto done_deref_locked; 1521 } 1522 } 1523 } 1524 } 1525 #endif 1526 onamelen = namelen = 0; 1527 if (namelc != NULL) { 1528 /* Give a default name of the jid. Also allow the name to be 1529 * explicitly the jid - but not any other number, and only in 1530 * normal form (no leading zero/etc). 1531 */ 1532 if (namelc[0] == '\0') 1533 snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid); 1534 else if ((strtoul(namelc, &p, 10) != jid || 1535 namelc[0] < '1' || namelc[0] > '9') && *p == '\0') { 1536 error = EINVAL; 1537 vfs_opterror(opts, 1538 "name cannot be numeric (unless it is the jid)"); 1539 goto done_deref_locked; 1540 } 1541 /* 1542 * Make sure the name isn't too long for the prison or its 1543 * children. 1544 */ 1545 pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; 1546 onamelen = strlen(pr->pr_name + pnamelen); 1547 namelen = strlen(namelc); 1548 if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) { 1549 error = ENAMETOOLONG; 1550 goto done_deref_locked; 1551 } 1552 FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { 1553 if (strlen(tpr->pr_name) + (namelen - onamelen) >= 1554 sizeof(pr->pr_name)) { 1555 error = ENAMETOOLONG; 1556 goto done_deref_locked; 1557 } 1558 } 1559 } 1560 pr_allow_diff = pr_allow & ~ppr->pr_allow; 1561 if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) { 1562 error = EPERM; 1563 goto done_deref_locked; 1564 } 1565 1566 /* 1567 * Let modules check their parameters. This requires unlocking and 1568 * then re-locking the prison, but this is still a valid state as long 1569 * as allprison_lock remains xlocked. 1570 */ 1571 mtx_unlock(&pr->pr_mtx); 1572 error = osd_jail_call(pr, PR_METHOD_CHECK, opts); 1573 if (error != 0) { 1574 prison_deref(pr, created 1575 ? PD_LIST_XLOCKED 1576 : PD_DEREF | PD_LIST_XLOCKED); 1577 goto done_releroot; 1578 } 1579 mtx_lock(&pr->pr_mtx); 1580 1581 /* At this point, all valid parameters should have been noted. */ 1582 TAILQ_FOREACH(opt, opts, link) { 1583 if (!opt->seen && strcmp(opt->name, "errmsg")) { 1584 error = EINVAL; 1585 vfs_opterror(opts, "unknown parameter: %s", opt->name); 1586 goto done_deref_locked; 1587 } 1588 } 1589 1590 /* Set the parameters of the prison. */ 1591 #ifdef INET 1592 redo_ip4 = 0; 1593 if (pr_flags & PR_IP4_USER) { 1594 pr->pr_flags |= PR_IP4; 1595 free(pr->pr_ip4, M_PRISON); 1596 pr->pr_ip4s = ip4s; 1597 pr->pr_ip4 = ip4; 1598 ip4 = NULL; 1599 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 1600 #ifdef VIMAGE 1601 if (tpr->pr_flags & PR_VNET) { 1602 descend = 0; 1603 continue; 1604 } 1605 #endif 1606 if (prison_restrict_ip4(tpr, NULL)) { 1607 redo_ip4 = 1; 1608 descend = 0; 1609 } 1610 } 1611 } 1612 #endif 1613 #ifdef INET6 1614 redo_ip6 = 0; 1615 if (pr_flags & PR_IP6_USER) { 1616 pr->pr_flags |= PR_IP6; 1617 free(pr->pr_ip6, M_PRISON); 1618 pr->pr_ip6s = ip6s; 1619 pr->pr_ip6 = ip6; 1620 ip6 = NULL; 1621 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 1622 #ifdef VIMAGE 1623 if (tpr->pr_flags & PR_VNET) { 1624 descend = 0; 1625 continue; 1626 } 1627 #endif 1628 if (prison_restrict_ip6(tpr, NULL)) { 1629 redo_ip6 = 1; 1630 descend = 0; 1631 } 1632 } 1633 } 1634 #endif 1635 if (gotslevel) { 1636 pr->pr_securelevel = slevel; 1637 /* Set all child jails to be at least this level. */ 1638 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) 1639 if (tpr->pr_securelevel < slevel) 1640 tpr->pr_securelevel = slevel; 1641 } 1642 if (gotchildmax) { 1643 pr->pr_childmax = childmax; 1644 /* Set all child jails to under this limit. */ 1645 FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level) 1646 if (tpr->pr_childmax > childmax - level) 1647 tpr->pr_childmax = childmax > level 1648 ? childmax - level : 0; 1649 } 1650 if (gotenforce) { 1651 pr->pr_enforce_statfs = enforce; 1652 /* Pass this restriction on to the children. */ 1653 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) 1654 if (tpr->pr_enforce_statfs < enforce) 1655 tpr->pr_enforce_statfs = enforce; 1656 } 1657 if (gotrsnum) { 1658 pr->pr_devfs_rsnum = rsnum; 1659 /* Pass this restriction on to the children. */ 1660 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) 1661 tpr->pr_devfs_rsnum = rsnum; 1662 } 1663 if (namelc != NULL) { 1664 if (ppr == &prison0) 1665 strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name)); 1666 else 1667 snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", 1668 ppr->pr_name, namelc); 1669 /* Change this component of child names. */ 1670 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 1671 bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, 1672 strlen(tpr->pr_name + onamelen) + 1); 1673 bcopy(pr->pr_name, tpr->pr_name, namelen); 1674 } 1675 } 1676 if (path != NULL) { 1677 /* Try to keep a real-rooted full pathname. */ 1678 strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); 1679 pr->pr_root = root; 1680 } 1681 if (PR_HOST & ch_flags & ~pr_flags) { 1682 if (pr->pr_flags & PR_HOST) { 1683 /* 1684 * Copy the parent's host info. As with pr_ip4 above, 1685 * the lack of a lock on the parent is not a problem; 1686 * it is always set with allprison_lock at least 1687 * shared, and is held exclusively here. 1688 */ 1689 strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname, 1690 sizeof(pr->pr_hostname)); 1691 strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname, 1692 sizeof(pr->pr_domainname)); 1693 strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid, 1694 sizeof(pr->pr_hostuuid)); 1695 pr->pr_hostid = pr->pr_parent->pr_hostid; 1696 } 1697 } else if (host != NULL || domain != NULL || uuid != NULL || gothid) { 1698 /* Set this prison, and any descendants without PR_HOST. */ 1699 if (host != NULL) 1700 strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname)); 1701 if (domain != NULL) 1702 strlcpy(pr->pr_domainname, domain, 1703 sizeof(pr->pr_domainname)); 1704 if (uuid != NULL) 1705 strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid)); 1706 if (gothid) 1707 pr->pr_hostid = hid; 1708 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 1709 if (tpr->pr_flags & PR_HOST) 1710 descend = 0; 1711 else { 1712 if (host != NULL) 1713 strlcpy(tpr->pr_hostname, 1714 pr->pr_hostname, 1715 sizeof(tpr->pr_hostname)); 1716 if (domain != NULL) 1717 strlcpy(tpr->pr_domainname, 1718 pr->pr_domainname, 1719 sizeof(tpr->pr_domainname)); 1720 if (uuid != NULL) 1721 strlcpy(tpr->pr_hostuuid, 1722 pr->pr_hostuuid, 1723 sizeof(tpr->pr_hostuuid)); 1724 if (gothid) 1725 tpr->pr_hostid = hid; 1726 } 1727 } 1728 } 1729 if ((tallow = ch_allow & ~pr_allow)) { 1730 /* Clear allow bits in all children. */ 1731 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) 1732 tpr->pr_allow &= ~tallow; 1733 } 1734 pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow; 1735 /* 1736 * Persistent prisons get an extra reference, and prisons losing their 1737 * persist flag lose that reference. Only do this for existing prisons 1738 * for now, so new ones will remain unseen until after the module 1739 * handlers have completed. 1740 */ 1741 born = pr->pr_uref == 0; 1742 if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) { 1743 if (pr_flags & PR_PERSIST) { 1744 pr->pr_ref++; 1745 pr->pr_uref++; 1746 } else { 1747 pr->pr_ref--; 1748 pr->pr_uref--; 1749 } 1750 } 1751 pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; 1752 mtx_unlock(&pr->pr_mtx); 1753 1754 #ifdef RACCT 1755 if (racct_enable && created) 1756 prison_racct_attach(pr); 1757 #endif 1758 1759 /* Locks may have prevented a complete restriction of child IP 1760 * addresses. If so, allocate some more memory and try again. 1761 */ 1762 #ifdef INET 1763 while (redo_ip4) { 1764 ip4s = pr->pr_ip4s; 1765 ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); 1766 mtx_lock(&pr->pr_mtx); 1767 redo_ip4 = 0; 1768 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 1769 #ifdef VIMAGE 1770 if (tpr->pr_flags & PR_VNET) { 1771 descend = 0; 1772 continue; 1773 } 1774 #endif 1775 if (prison_restrict_ip4(tpr, ip4)) { 1776 if (ip4 != NULL) 1777 ip4 = NULL; 1778 else 1779 redo_ip4 = 1; 1780 } 1781 } 1782 mtx_unlock(&pr->pr_mtx); 1783 } 1784 #endif 1785 #ifdef INET6 1786 while (redo_ip6) { 1787 ip6s = pr->pr_ip6s; 1788 ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); 1789 mtx_lock(&pr->pr_mtx); 1790 redo_ip6 = 0; 1791 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { 1792 #ifdef VIMAGE 1793 if (tpr->pr_flags & PR_VNET) { 1794 descend = 0; 1795 continue; 1796 } 1797 #endif 1798 if (prison_restrict_ip6(tpr, ip6)) { 1799 if (ip6 != NULL) 1800 ip6 = NULL; 1801 else 1802 redo_ip6 = 1; 1803 } 1804 } 1805 mtx_unlock(&pr->pr_mtx); 1806 } 1807 #endif 1808 1809 /* Let the modules do their work. */ 1810 sx_downgrade(&allprison_lock); 1811 if (born) { 1812 error = osd_jail_call(pr, PR_METHOD_CREATE, opts); 1813 if (error) { 1814 (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); 1815 prison_deref(pr, created 1816 ? PD_LIST_SLOCKED 1817 : PD_DEREF | PD_LIST_SLOCKED); 1818 goto done_errmsg; 1819 } 1820 } 1821 error = osd_jail_call(pr, PR_METHOD_SET, opts); 1822 if (error) { 1823 if (born) 1824 (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); 1825 prison_deref(pr, created 1826 ? PD_LIST_SLOCKED 1827 : PD_DEREF | PD_LIST_SLOCKED); 1828 goto done_errmsg; 1829 } 1830 1831 /* Attach this process to the prison if requested. */ 1832 if (flags & JAIL_ATTACH) { 1833 mtx_lock(&pr->pr_mtx); 1834 error = do_jail_attach(td, pr); 1835 if (error) { 1836 vfs_opterror(opts, "attach failed"); 1837 if (!created) 1838 prison_deref(pr, PD_DEREF); 1839 goto done_errmsg; 1840 } 1841 } 1842 1843 #ifdef RACCT 1844 if (racct_enable && !created) { 1845 if (!(flags & JAIL_ATTACH)) 1846 sx_sunlock(&allprison_lock); 1847 prison_racct_modify(pr); 1848 if (!(flags & JAIL_ATTACH)) 1849 sx_slock(&allprison_lock); 1850 } 1851 #endif 1852 1853 td->td_retval[0] = pr->pr_id; 1854 1855 /* 1856 * Now that it is all there, drop the temporary reference from existing 1857 * prisons. Or add a reference to newly created persistent prisons 1858 * (which was not done earlier so that the prison would not be publicly 1859 * visible). 1860 */ 1861 if (!created) { 1862 prison_deref(pr, (flags & JAIL_ATTACH) 1863 ? PD_DEREF 1864 : PD_DEREF | PD_LIST_SLOCKED); 1865 } else { 1866 if (pr_flags & PR_PERSIST) { 1867 mtx_lock(&pr->pr_mtx); 1868 pr->pr_ref++; 1869 pr->pr_uref++; 1870 mtx_unlock(&pr->pr_mtx); 1871 } 1872 if (!(flags & JAIL_ATTACH)) 1873 sx_sunlock(&allprison_lock); 1874 } 1875 1876 goto done_free; 1877 1878 done_deref_locked: 1879 prison_deref(pr, created 1880 ? PD_LOCKED | PD_LIST_XLOCKED 1881 : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); 1882 goto done_releroot; 1883 done_unlock_list: 1884 sx_xunlock(&allprison_lock); 1885 done_releroot: 1886 if (root != NULL) 1887 vrele(root); 1888 done_errmsg: 1889 if (error) { 1890 if (vfs_getopt(opts, "errmsg", (void **)&errmsg, 1891 &errmsg_len) == 0 && errmsg_len > 0) { 1892 errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1; 1893 if (optuio->uio_segflg == UIO_SYSSPACE) 1894 bcopy(errmsg, 1895 optuio->uio_iov[errmsg_pos].iov_base, 1896 errmsg_len); 1897 else 1898 copyout(errmsg, 1899 optuio->uio_iov[errmsg_pos].iov_base, 1900 errmsg_len); 1901 } 1902 } 1903 done_free: 1904 #ifdef INET 1905 free(ip4, M_PRISON); 1906 #endif 1907 #ifdef INET6 1908 free(ip6, M_PRISON); 1909 #endif 1910 if (g_path != NULL) 1911 free(g_path, M_TEMP); 1912 vfs_freeopts(opts); 1913 return (error); 1914 } 1915 1916 /* 1917 * struct jail_get_args { 1918 * struct iovec *iovp; 1919 * unsigned int iovcnt; 1920 * int flags; 1921 * }; 1922 */ 1923 int 1924 sys_jail_get(struct thread *td, struct jail_get_args *uap) 1925 { 1926 struct uio *auio; 1927 int error; 1928 1929 /* Check that we have an even number of iovecs. */ 1930 if (uap->iovcnt & 1) 1931 return (EINVAL); 1932 1933 error = copyinuio(uap->iovp, uap->iovcnt, &auio); 1934 if (error) 1935 return (error); 1936 error = kern_jail_get(td, auio, uap->flags); 1937 if (error == 0) 1938 error = copyout(auio->uio_iov, uap->iovp, 1939 uap->iovcnt * sizeof (struct iovec)); 1940 free(auio, M_IOV); 1941 return (error); 1942 } 1943 1944 int 1945 kern_jail_get(struct thread *td, struct uio *optuio, int flags) 1946 { 1947 struct bool_flags *bf; 1948 struct jailsys_flags *jsf; 1949 struct prison *pr, *mypr; 1950 struct vfsopt *opt; 1951 struct vfsoptlist *opts; 1952 char *errmsg, *name; 1953 int error, errmsg_len, errmsg_pos, i, jid, len, locked, pos; 1954 unsigned f; 1955 1956 if (flags & ~JAIL_GET_MASK) 1957 return (EINVAL); 1958 1959 /* Get the parameter list. */ 1960 error = vfs_buildopts(optuio, &opts); 1961 if (error) 1962 return (error); 1963 errmsg_pos = vfs_getopt_pos(opts, "errmsg"); 1964 mypr = td->td_ucred->cr_prison; 1965 1966 /* 1967 * Find the prison specified by one of: lastjid, jid, name. 1968 */ 1969 sx_slock(&allprison_lock); 1970 error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); 1971 if (error == 0) { 1972 TAILQ_FOREACH(pr, &allprison, pr_list) { 1973 if (pr->pr_id > jid && prison_ischild(mypr, pr)) { 1974 mtx_lock(&pr->pr_mtx); 1975 if (pr->pr_ref > 0 && 1976 (pr->pr_uref > 0 || (flags & JAIL_DYING))) 1977 break; 1978 mtx_unlock(&pr->pr_mtx); 1979 } 1980 } 1981 if (pr != NULL) 1982 goto found_prison; 1983 error = ENOENT; 1984 vfs_opterror(opts, "no jail after %d", jid); 1985 goto done_unlock_list; 1986 } else if (error != ENOENT) 1987 goto done_unlock_list; 1988 1989 error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); 1990 if (error == 0) { 1991 if (jid != 0) { 1992 pr = prison_find_child(mypr, jid); 1993 if (pr != NULL) { 1994 if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { 1995 mtx_unlock(&pr->pr_mtx); 1996 error = ENOENT; 1997 vfs_opterror(opts, "jail %d is dying", 1998 jid); 1999 goto done_unlock_list; 2000 } 2001 goto found_prison; 2002 } 2003 error = ENOENT; 2004 vfs_opterror(opts, "jail %d not found", jid); 2005 goto done_unlock_list; 2006 } 2007 } else if (error != ENOENT) 2008 goto done_unlock_list; 2009 2010 error = vfs_getopt(opts, "name", (void **)&name, &len); 2011 if (error == 0) { 2012 if (len == 0 || name[len - 1] != '\0') { 2013 error = EINVAL; 2014 goto done_unlock_list; 2015 } 2016 pr = prison_find_name(mypr, name); 2017 if (pr != NULL) { 2018 if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { 2019 mtx_unlock(&pr->pr_mtx); 2020 error = ENOENT; 2021 vfs_opterror(opts, "jail \"%s\" is dying", 2022 name); 2023 goto done_unlock_list; 2024 } 2025 goto found_prison; 2026 } 2027 error = ENOENT; 2028 vfs_opterror(opts, "jail \"%s\" not found", name); 2029 goto done_unlock_list; 2030 } else if (error != ENOENT) 2031 goto done_unlock_list; 2032 2033 vfs_opterror(opts, "no jail specified"); 2034 error = ENOENT; 2035 goto done_unlock_list; 2036 2037 found_prison: 2038 /* Get the parameters of the prison. */ 2039 pr->pr_ref++; 2040 locked = PD_LOCKED; 2041 td->td_retval[0] = pr->pr_id; 2042 error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); 2043 if (error != 0 && error != ENOENT) 2044 goto done_deref; 2045 i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id; 2046 error = vfs_setopt(opts, "parent", &i, sizeof(i)); 2047 if (error != 0 && error != ENOENT) 2048 goto done_deref; 2049 error = vfs_setopts(opts, "name", prison_name(mypr, pr)); 2050 if (error != 0 && error != ENOENT) 2051 goto done_deref; 2052 error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id, 2053 sizeof(pr->pr_cpuset->cs_id)); 2054 if (error != 0 && error != ENOENT) 2055 goto done_deref; 2056 error = vfs_setopts(opts, "path", prison_path(mypr, pr)); 2057 if (error != 0 && error != ENOENT) 2058 goto done_deref; 2059 #ifdef INET 2060 error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4, 2061 pr->pr_ip4s * sizeof(*pr->pr_ip4)); 2062 if (error != 0 && error != ENOENT) 2063 goto done_deref; 2064 #endif 2065 #ifdef INET6 2066 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6, 2067 pr->pr_ip6s * sizeof(*pr->pr_ip6)); 2068 if (error != 0 && error != ENOENT) 2069 goto done_deref; 2070 #endif 2071 error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel, 2072 sizeof(pr->pr_securelevel)); 2073 if (error != 0 && error != ENOENT) 2074 goto done_deref; 2075 error = vfs_setopt(opts, "children.cur", &pr->pr_childcount, 2076 sizeof(pr->pr_childcount)); 2077 if (error != 0 && error != ENOENT) 2078 goto done_deref; 2079 error = vfs_setopt(opts, "children.max", &pr->pr_childmax, 2080 sizeof(pr->pr_childmax)); 2081 if (error != 0 && error != ENOENT) 2082 goto done_deref; 2083 error = vfs_setopts(opts, "host.hostname", pr->pr_hostname); 2084 if (error != 0 && error != ENOENT) 2085 goto done_deref; 2086 error = vfs_setopts(opts, "host.domainname", pr->pr_domainname); 2087 if (error != 0 && error != ENOENT) 2088 goto done_deref; 2089 error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid); 2090 if (error != 0 && error != ENOENT) 2091 goto done_deref; 2092 #ifdef COMPAT_FREEBSD32 2093 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { 2094 uint32_t hid32 = pr->pr_hostid; 2095 2096 error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32)); 2097 } else 2098 #endif 2099 error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid, 2100 sizeof(pr->pr_hostid)); 2101 if (error != 0 && error != ENOENT) 2102 goto done_deref; 2103 error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs, 2104 sizeof(pr->pr_enforce_statfs)); 2105 if (error != 0 && error != ENOENT) 2106 goto done_deref; 2107 error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum, 2108 sizeof(pr->pr_devfs_rsnum)); 2109 if (error != 0 && error != ENOENT) 2110 goto done_deref; 2111 for (bf = pr_flag_bool; 2112 bf < pr_flag_bool + nitems(pr_flag_bool); 2113 bf++) { 2114 i = (pr->pr_flags & bf->flag) ? 1 : 0; 2115 error = vfs_setopt(opts, bf->name, &i, sizeof(i)); 2116 if (error != 0 && error != ENOENT) 2117 goto done_deref; 2118 i = !i; 2119 error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); 2120 if (error != 0 && error != ENOENT) 2121 goto done_deref; 2122 } 2123 for (jsf = pr_flag_jailsys; 2124 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); 2125 jsf++) { 2126 f = pr->pr_flags & (jsf->disable | jsf->new); 2127 i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE 2128 : (f == jsf->new) ? JAIL_SYS_NEW 2129 : JAIL_SYS_INHERIT; 2130 error = vfs_setopt(opts, jsf->name, &i, sizeof(i)); 2131 if (error != 0 && error != ENOENT) 2132 goto done_deref; 2133 } 2134 for (bf = pr_flag_allow; 2135 bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0; 2136 bf++) { 2137 i = (pr->pr_allow & bf->flag) ? 1 : 0; 2138 error = vfs_setopt(opts, bf->name, &i, sizeof(i)); 2139 if (error != 0 && error != ENOENT) 2140 goto done_deref; 2141 i = !i; 2142 error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); 2143 if (error != 0 && error != ENOENT) 2144 goto done_deref; 2145 } 2146 i = (pr->pr_uref == 0); 2147 error = vfs_setopt(opts, "dying", &i, sizeof(i)); 2148 if (error != 0 && error != ENOENT) 2149 goto done_deref; 2150 i = !i; 2151 error = vfs_setopt(opts, "nodying", &i, sizeof(i)); 2152 if (error != 0 && error != ENOENT) 2153 goto done_deref; 2154 error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate, 2155 sizeof(pr->pr_osreldate)); 2156 if (error != 0 && error != ENOENT) 2157 goto done_deref; 2158 error = vfs_setopts(opts, "osrelease", pr->pr_osrelease); 2159 if (error != 0 && error != ENOENT) 2160 goto done_deref; 2161 2162 /* Get the module parameters. */ 2163 mtx_unlock(&pr->pr_mtx); 2164 locked = 0; 2165 error = osd_jail_call(pr, PR_METHOD_GET, opts); 2166 if (error) 2167 goto done_deref; 2168 prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED); 2169 2170 /* By now, all parameters should have been noted. */ 2171 TAILQ_FOREACH(opt, opts, link) { 2172 if (!opt->seen && strcmp(opt->name, "errmsg")) { 2173 error = EINVAL; 2174 vfs_opterror(opts, "unknown parameter: %s", opt->name); 2175 goto done_errmsg; 2176 } 2177 } 2178 2179 /* Write the fetched parameters back to userspace. */ 2180 error = 0; 2181 TAILQ_FOREACH(opt, opts, link) { 2182 if (opt->pos >= 0 && opt->pos != errmsg_pos) { 2183 pos = 2 * opt->pos + 1; 2184 optuio->uio_iov[pos].iov_len = opt->len; 2185 if (opt->value != NULL) { 2186 if (optuio->uio_segflg == UIO_SYSSPACE) { 2187 bcopy(opt->value, 2188 optuio->uio_iov[pos].iov_base, 2189 opt->len); 2190 } else { 2191 error = copyout(opt->value, 2192 optuio->uio_iov[pos].iov_base, 2193 opt->len); 2194 if (error) 2195 break; 2196 } 2197 } 2198 } 2199 } 2200 goto done_errmsg; 2201 2202 done_deref: 2203 prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED); 2204 goto done_errmsg; 2205 2206 done_unlock_list: 2207 sx_sunlock(&allprison_lock); 2208 done_errmsg: 2209 if (error && errmsg_pos >= 0) { 2210 vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); 2211 errmsg_pos = 2 * errmsg_pos + 1; 2212 if (errmsg_len > 0) { 2213 if (optuio->uio_segflg == UIO_SYSSPACE) 2214 bcopy(errmsg, 2215 optuio->uio_iov[errmsg_pos].iov_base, 2216 errmsg_len); 2217 else 2218 copyout(errmsg, 2219 optuio->uio_iov[errmsg_pos].iov_base, 2220 errmsg_len); 2221 } 2222 } 2223 vfs_freeopts(opts); 2224 return (error); 2225 } 2226 2227 /* 2228 * struct jail_remove_args { 2229 * int jid; 2230 * }; 2231 */ 2232 int 2233 sys_jail_remove(struct thread *td, struct jail_remove_args *uap) 2234 { 2235 struct prison *pr, *cpr, *lpr, *tpr; 2236 int descend, error; 2237 2238 error = priv_check(td, PRIV_JAIL_REMOVE); 2239 if (error) 2240 return (error); 2241 2242 sx_xlock(&allprison_lock); 2243 pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); 2244 if (pr == NULL) { 2245 sx_xunlock(&allprison_lock); 2246 return (EINVAL); 2247 } 2248 2249 /* Remove all descendants of this prison, then remove this prison. */ 2250 pr->pr_ref++; 2251 if (!LIST_EMPTY(&pr->pr_children)) { 2252 mtx_unlock(&pr->pr_mtx); 2253 lpr = NULL; 2254 FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { 2255 mtx_lock(&cpr->pr_mtx); 2256 if (cpr->pr_ref > 0) { 2257 tpr = cpr; 2258 cpr->pr_ref++; 2259 } else { 2260 /* Already removed - do not do it again. */ 2261 tpr = NULL; 2262 } 2263 mtx_unlock(&cpr->pr_mtx); 2264 if (lpr != NULL) { 2265 mtx_lock(&lpr->pr_mtx); 2266 prison_remove_one(lpr); 2267 sx_xlock(&allprison_lock); 2268 } 2269 lpr = tpr; 2270 } 2271 if (lpr != NULL) { 2272 mtx_lock(&lpr->pr_mtx); 2273 prison_remove_one(lpr); 2274 sx_xlock(&allprison_lock); 2275 } 2276 mtx_lock(&pr->pr_mtx); 2277 } 2278 prison_remove_one(pr); 2279 return (0); 2280 } 2281 2282 static void 2283 prison_remove_one(struct prison *pr) 2284 { 2285 struct proc *p; 2286 int deuref; 2287 2288 /* If the prison was persistent, it is not anymore. */ 2289 deuref = 0; 2290 if (pr->pr_flags & PR_PERSIST) { 2291 pr->pr_ref--; 2292 deuref = PD_DEUREF; 2293 pr->pr_flags &= ~PR_PERSIST; 2294 } 2295 2296 /* 2297 * jail_remove added a reference. If that's the only one, remove 2298 * the prison now. 2299 */ 2300 KASSERT(pr->pr_ref > 0, 2301 ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id)); 2302 if (pr->pr_ref == 1) { 2303 prison_deref(pr, 2304 deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); 2305 return; 2306 } 2307 2308 mtx_unlock(&pr->pr_mtx); 2309 sx_xunlock(&allprison_lock); 2310 /* 2311 * Kill all processes unfortunate enough to be attached to this prison. 2312 */ 2313 sx_slock(&allproc_lock); 2314 FOREACH_PROC_IN_SYSTEM(p) { 2315 PROC_LOCK(p); 2316 if (p->p_state != PRS_NEW && p->p_ucred && 2317 p->p_ucred->cr_prison == pr) 2318 kern_psignal(p, SIGKILL); 2319 PROC_UNLOCK(p); 2320 } 2321 sx_sunlock(&allproc_lock); 2322 /* Remove the temporary reference added by jail_remove. */ 2323 prison_deref(pr, deuref | PD_DEREF); 2324 } 2325 2326 /* 2327 * struct jail_attach_args { 2328 * int jid; 2329 * }; 2330 */ 2331 int 2332 sys_jail_attach(struct thread *td, struct jail_attach_args *uap) 2333 { 2334 struct prison *pr; 2335 int error; 2336 2337 error = priv_check(td, PRIV_JAIL_ATTACH); 2338 if (error) 2339 return (error); 2340 2341 /* 2342 * Start with exclusive hold on allprison_lock to ensure that a possible 2343 * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove. 2344 * But then immediately downgrade it since we don't need to stop 2345 * readers. 2346 */ 2347 sx_xlock(&allprison_lock); 2348 sx_downgrade(&allprison_lock); 2349 pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); 2350 if (pr == NULL) { 2351 sx_sunlock(&allprison_lock); 2352 return (EINVAL); 2353 } 2354 2355 /* 2356 * Do not allow a process to attach to a prison that is not 2357 * considered to be "alive". 2358 */ 2359 if (pr->pr_uref == 0) { 2360 mtx_unlock(&pr->pr_mtx); 2361 sx_sunlock(&allprison_lock); 2362 return (EINVAL); 2363 } 2364 2365 return (do_jail_attach(td, pr)); 2366 } 2367 2368 static int 2369 do_jail_attach(struct thread *td, struct prison *pr) 2370 { 2371 struct proc *p; 2372 struct ucred *newcred, *oldcred; 2373 int error; 2374 2375 /* 2376 * XXX: Note that there is a slight race here if two threads 2377 * in the same privileged process attempt to attach to two 2378 * different jails at the same time. It is important for 2379 * user processes not to do this, or they might end up with 2380 * a process root from one prison, but attached to the jail 2381 * of another. 2382 */ 2383 pr->pr_ref++; 2384 pr->pr_uref++; 2385 mtx_unlock(&pr->pr_mtx); 2386 2387 /* Let modules do whatever they need to prepare for attaching. */ 2388 error = osd_jail_call(pr, PR_METHOD_ATTACH, td); 2389 if (error) { 2390 prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED); 2391 return (error); 2392 } 2393 sx_sunlock(&allprison_lock); 2394 2395 /* 2396 * Reparent the newly attached process to this jail. 2397 */ 2398 p = td->td_proc; 2399 error = cpuset_setproc_update_set(p, pr->pr_cpuset); 2400 if (error) 2401 goto e_revert_osd; 2402 2403 vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); 2404 if ((error = change_dir(pr->pr_root, td)) != 0) 2405 goto e_unlock; 2406 #ifdef MAC 2407 if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) 2408 goto e_unlock; 2409 #endif 2410 VOP_UNLOCK(pr->pr_root); 2411 if ((error = pwd_chroot(td, pr->pr_root))) 2412 goto e_revert_osd; 2413 2414 newcred = crget(); 2415 PROC_LOCK(p); 2416 oldcred = crcopysafe(p, newcred); 2417 newcred->cr_prison = pr; 2418 proc_set_cred(p, newcred); 2419 setsugid(p); 2420 #ifdef RACCT 2421 racct_proc_ucred_changed(p, oldcred, newcred); 2422 crhold(newcred); 2423 #endif 2424 PROC_UNLOCK(p); 2425 #ifdef RCTL 2426 rctl_proc_ucred_changed(p, newcred); 2427 crfree(newcred); 2428 #endif 2429 prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF); 2430 crfree(oldcred); 2431 return (0); 2432 2433 e_unlock: 2434 VOP_UNLOCK(pr->pr_root); 2435 e_revert_osd: 2436 /* Tell modules this thread is still in its old jail after all. */ 2437 (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td); 2438 prison_deref(pr, PD_DEREF | PD_DEUREF); 2439 return (error); 2440 } 2441 2442 /* 2443 * Returns a locked prison instance, or NULL on failure. 2444 */ 2445 struct prison * 2446 prison_find(int prid) 2447 { 2448 struct prison *pr; 2449 2450 sx_assert(&allprison_lock, SX_LOCKED); 2451 TAILQ_FOREACH(pr, &allprison, pr_list) { 2452 if (pr->pr_id == prid) { 2453 mtx_lock(&pr->pr_mtx); 2454 if (pr->pr_ref > 0) 2455 return (pr); 2456 mtx_unlock(&pr->pr_mtx); 2457 } 2458 } 2459 return (NULL); 2460 } 2461 2462 /* 2463 * Find a prison that is a descendant of mypr. Returns a locked prison or NULL. 2464 */ 2465 struct prison * 2466 prison_find_child(struct prison *mypr, int prid) 2467 { 2468 struct prison *pr; 2469 int descend; 2470 2471 sx_assert(&allprison_lock, SX_LOCKED); 2472 FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { 2473 if (pr->pr_id == prid) { 2474 mtx_lock(&pr->pr_mtx); 2475 if (pr->pr_ref > 0) 2476 return (pr); 2477 mtx_unlock(&pr->pr_mtx); 2478 } 2479 } 2480 return (NULL); 2481 } 2482 2483 /* 2484 * Look for the name relative to mypr. Returns a locked prison or NULL. 2485 */ 2486 struct prison * 2487 prison_find_name(struct prison *mypr, const char *name) 2488 { 2489 struct prison *pr, *deadpr; 2490 size_t mylen; 2491 int descend; 2492 2493 sx_assert(&allprison_lock, SX_LOCKED); 2494 mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1; 2495 again: 2496 deadpr = NULL; 2497 FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { 2498 if (!strcmp(pr->pr_name + mylen, name)) { 2499 mtx_lock(&pr->pr_mtx); 2500 if (pr->pr_ref > 0) { 2501 if (pr->pr_uref > 0) 2502 return (pr); 2503 deadpr = pr; 2504 } 2505 mtx_unlock(&pr->pr_mtx); 2506 } 2507 } 2508 /* There was no valid prison - perhaps there was a dying one. */ 2509 if (deadpr != NULL) { 2510 mtx_lock(&deadpr->pr_mtx); 2511 if (deadpr->pr_ref == 0) { 2512 mtx_unlock(&deadpr->pr_mtx); 2513 goto again; 2514 } 2515 } 2516 return (deadpr); 2517 } 2518 2519 /* 2520 * See if a prison has the specific flag set. 2521 */ 2522 int 2523 prison_flag(struct ucred *cred, unsigned flag) 2524 { 2525 2526 /* This is an atomic read, so no locking is necessary. */ 2527 return (cred->cr_prison->pr_flags & flag); 2528 } 2529 2530 int 2531 prison_allow(struct ucred *cred, unsigned flag) 2532 { 2533 2534 /* This is an atomic read, so no locking is necessary. */ 2535 return (cred->cr_prison->pr_allow & flag); 2536 } 2537 2538 /* 2539 * Remove a prison reference. If that was the last reference, remove the 2540 * prison itself - but not in this context in case there are locks held. 2541 */ 2542 void 2543 prison_free_locked(struct prison *pr) 2544 { 2545 int ref; 2546 2547 mtx_assert(&pr->pr_mtx, MA_OWNED); 2548 ref = --pr->pr_ref; 2549 mtx_unlock(&pr->pr_mtx); 2550 if (ref == 0) 2551 taskqueue_enqueue(taskqueue_thread, &pr->pr_task); 2552 } 2553 2554 void 2555 prison_free(struct prison *pr) 2556 { 2557 2558 mtx_lock(&pr->pr_mtx); 2559 prison_free_locked(pr); 2560 } 2561 2562 /* 2563 * Complete a call to either prison_free or prison_proc_free. 2564 */ 2565 static void 2566 prison_complete(void *context, int pending) 2567 { 2568 struct prison *pr = context; 2569 2570 sx_xlock(&allprison_lock); 2571 mtx_lock(&pr->pr_mtx); 2572 prison_deref(pr, pr->pr_uref 2573 ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED 2574 : PD_LOCKED | PD_LIST_XLOCKED); 2575 } 2576 2577 /* 2578 * Remove a prison reference (usually). This internal version assumes no 2579 * mutexes are held, except perhaps the prison itself. If there are no more 2580 * references, release and delist the prison. On completion, the prison lock 2581 * and the allprison lock are both unlocked. 2582 */ 2583 static void 2584 prison_deref(struct prison *pr, int flags) 2585 { 2586 struct prison *ppr, *tpr; 2587 int ref, lasturef; 2588 2589 if (!(flags & PD_LOCKED)) 2590 mtx_lock(&pr->pr_mtx); 2591 for (;;) { 2592 if (flags & PD_DEUREF) { 2593 KASSERT(pr->pr_uref > 0, 2594 ("prison_deref PD_DEUREF on a dead prison (jid=%d)", 2595 pr->pr_id)); 2596 pr->pr_uref--; 2597 lasturef = pr->pr_uref == 0; 2598 if (lasturef) 2599 pr->pr_ref++; 2600 KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0")); 2601 } else 2602 lasturef = 0; 2603 if (flags & PD_DEREF) { 2604 KASSERT(pr->pr_ref > 0, 2605 ("prison_deref PD_DEREF on a dead prison (jid=%d)", 2606 pr->pr_id)); 2607 pr->pr_ref--; 2608 } 2609 ref = pr->pr_ref; 2610 mtx_unlock(&pr->pr_mtx); 2611 2612 /* 2613 * Tell the modules if the last user reference was removed 2614 * (even it sticks around in dying state). 2615 */ 2616 if (lasturef) { 2617 if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) { 2618 sx_xlock(&allprison_lock); 2619 flags |= PD_LIST_XLOCKED; 2620 } 2621 (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); 2622 mtx_lock(&pr->pr_mtx); 2623 ref = --pr->pr_ref; 2624 mtx_unlock(&pr->pr_mtx); 2625 } 2626 2627 /* If the prison still has references, nothing else to do. */ 2628 if (ref > 0) { 2629 if (flags & PD_LIST_SLOCKED) 2630 sx_sunlock(&allprison_lock); 2631 else if (flags & PD_LIST_XLOCKED) 2632 sx_xunlock(&allprison_lock); 2633 return; 2634 } 2635 2636 if (flags & PD_LIST_SLOCKED) { 2637 if (!sx_try_upgrade(&allprison_lock)) { 2638 sx_sunlock(&allprison_lock); 2639 sx_xlock(&allprison_lock); 2640 } 2641 } else if (!(flags & PD_LIST_XLOCKED)) 2642 sx_xlock(&allprison_lock); 2643 2644 TAILQ_REMOVE(&allprison, pr, pr_list); 2645 LIST_REMOVE(pr, pr_sibling); 2646 ppr = pr->pr_parent; 2647 for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) 2648 tpr->pr_childcount--; 2649 sx_xunlock(&allprison_lock); 2650 2651 #ifdef VIMAGE 2652 if (pr->pr_vnet != ppr->pr_vnet) 2653 vnet_destroy(pr->pr_vnet); 2654 #endif 2655 if (pr->pr_root != NULL) 2656 vrele(pr->pr_root); 2657 mtx_destroy(&pr->pr_mtx); 2658 #ifdef INET 2659 free(pr->pr_ip4, M_PRISON); 2660 #endif 2661 #ifdef INET6 2662 free(pr->pr_ip6, M_PRISON); 2663 #endif 2664 if (pr->pr_cpuset != NULL) 2665 cpuset_rel(pr->pr_cpuset); 2666 osd_jail_exit(pr); 2667 #ifdef RACCT 2668 if (racct_enable) 2669 prison_racct_detach(pr); 2670 #endif 2671 free(pr, M_PRISON); 2672 2673 /* Removing a prison frees a reference on its parent. */ 2674 pr = ppr; 2675 mtx_lock(&pr->pr_mtx); 2676 flags = PD_DEREF | PD_DEUREF; 2677 } 2678 } 2679 2680 void 2681 prison_hold_locked(struct prison *pr) 2682 { 2683 2684 mtx_assert(&pr->pr_mtx, MA_OWNED); 2685 KASSERT(pr->pr_ref > 0, 2686 ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id)); 2687 pr->pr_ref++; 2688 } 2689 2690 void 2691 prison_hold(struct prison *pr) 2692 { 2693 2694 mtx_lock(&pr->pr_mtx); 2695 prison_hold_locked(pr); 2696 mtx_unlock(&pr->pr_mtx); 2697 } 2698 2699 void 2700 prison_proc_hold(struct prison *pr) 2701 { 2702 2703 mtx_lock(&pr->pr_mtx); 2704 KASSERT(pr->pr_uref > 0, 2705 ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id)); 2706 pr->pr_uref++; 2707 mtx_unlock(&pr->pr_mtx); 2708 } 2709 2710 void 2711 prison_proc_free(struct prison *pr) 2712 { 2713 2714 mtx_lock(&pr->pr_mtx); 2715 KASSERT(pr->pr_uref > 0, 2716 ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); 2717 if (pr->pr_uref > 1) 2718 pr->pr_uref--; 2719 else { 2720 /* 2721 * Don't remove the last user reference in this context, which 2722 * is expected to be a process that is not only locked, but 2723 * also half dead. 2724 */ 2725 pr->pr_ref++; 2726 mtx_unlock(&pr->pr_mtx); 2727 taskqueue_enqueue(taskqueue_thread, &pr->pr_task); 2728 return; 2729 } 2730 mtx_unlock(&pr->pr_mtx); 2731 } 2732 2733 /* 2734 * Check if a jail supports the given address family. 2735 * 2736 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT 2737 * if not. 2738 */ 2739 int 2740 prison_check_af(struct ucred *cred, int af) 2741 { 2742 struct prison *pr; 2743 int error; 2744 2745 KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); 2746 2747 pr = cred->cr_prison; 2748 #ifdef VIMAGE 2749 /* Prisons with their own network stack are not limited. */ 2750 if (prison_owns_vnet(cred)) 2751 return (0); 2752 #endif 2753 2754 error = 0; 2755 switch (af) 2756 { 2757 #ifdef INET 2758 case AF_INET: 2759 if (pr->pr_flags & PR_IP4) 2760 { 2761 mtx_lock(&pr->pr_mtx); 2762 if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL) 2763 error = EAFNOSUPPORT; 2764 mtx_unlock(&pr->pr_mtx); 2765 } 2766 break; 2767 #endif 2768 #ifdef INET6 2769 case AF_INET6: 2770 if (pr->pr_flags & PR_IP6) 2771 { 2772 mtx_lock(&pr->pr_mtx); 2773 if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL) 2774 error = EAFNOSUPPORT; 2775 mtx_unlock(&pr->pr_mtx); 2776 } 2777 break; 2778 #endif 2779 case AF_LOCAL: 2780 case AF_ROUTE: 2781 break; 2782 default: 2783 if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF)) 2784 error = EAFNOSUPPORT; 2785 } 2786 return (error); 2787 } 2788 2789 /* 2790 * Check if given address belongs to the jail referenced by cred (wrapper to 2791 * prison_check_ip[46]). 2792 * 2793 * Returns 0 if jail doesn't restrict the address family or if address belongs 2794 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if 2795 * the jail doesn't allow the address family. IPv4 Address passed in in NBO. 2796 */ 2797 int 2798 prison_if(struct ucred *cred, const struct sockaddr *sa) 2799 { 2800 #ifdef INET 2801 const struct sockaddr_in *sai; 2802 #endif 2803 #ifdef INET6 2804 const struct sockaddr_in6 *sai6; 2805 #endif 2806 int error; 2807 2808 KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); 2809 KASSERT(sa != NULL, ("%s: sa is NULL", __func__)); 2810 2811 #ifdef VIMAGE 2812 if (prison_owns_vnet(cred)) 2813 return (0); 2814 #endif 2815 2816 error = 0; 2817 switch (sa->sa_family) 2818 { 2819 #ifdef INET 2820 case AF_INET: 2821 sai = (const struct sockaddr_in *)sa; 2822 error = prison_check_ip4(cred, &sai->sin_addr); 2823 break; 2824 #endif 2825 #ifdef INET6 2826 case AF_INET6: 2827 sai6 = (const struct sockaddr_in6 *)sa; 2828 error = prison_check_ip6(cred, &sai6->sin6_addr); 2829 break; 2830 #endif 2831 default: 2832 if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF)) 2833 error = EAFNOSUPPORT; 2834 } 2835 return (error); 2836 } 2837 2838 /* 2839 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. 2840 */ 2841 int 2842 prison_check(struct ucred *cred1, struct ucred *cred2) 2843 { 2844 2845 return ((cred1->cr_prison == cred2->cr_prison || 2846 prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH); 2847 } 2848 2849 /* 2850 * Return 1 if p2 is a child of p1, otherwise 0. 2851 */ 2852 int 2853 prison_ischild(struct prison *pr1, struct prison *pr2) 2854 { 2855 2856 for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent) 2857 if (pr1 == pr2) 2858 return (1); 2859 return (0); 2860 } 2861 2862 /* 2863 * Return 1 if the passed credential is in a jail and that jail does not 2864 * have its own virtual network stack, otherwise 0. 2865 */ 2866 int 2867 jailed_without_vnet(struct ucred *cred) 2868 { 2869 2870 if (!jailed(cred)) 2871 return (0); 2872 #ifdef VIMAGE 2873 if (prison_owns_vnet(cred)) 2874 return (0); 2875 #endif 2876 2877 return (1); 2878 } 2879 2880 /* 2881 * Return the correct hostname (domainname, et al) for the passed credential. 2882 */ 2883 void 2884 getcredhostname(struct ucred *cred, char *buf, size_t size) 2885 { 2886 struct prison *pr; 2887 2888 /* 2889 * A NULL credential can be used to shortcut to the physical 2890 * system's hostname. 2891 */ 2892 pr = (cred != NULL) ? cred->cr_prison : &prison0; 2893 mtx_lock(&pr->pr_mtx); 2894 strlcpy(buf, pr->pr_hostname, size); 2895 mtx_unlock(&pr->pr_mtx); 2896 } 2897 2898 void 2899 getcreddomainname(struct ucred *cred, char *buf, size_t size) 2900 { 2901 2902 mtx_lock(&cred->cr_prison->pr_mtx); 2903 strlcpy(buf, cred->cr_prison->pr_domainname, size); 2904 mtx_unlock(&cred->cr_prison->pr_mtx); 2905 } 2906 2907 void 2908 getcredhostuuid(struct ucred *cred, char *buf, size_t size) 2909 { 2910 2911 mtx_lock(&cred->cr_prison->pr_mtx); 2912 strlcpy(buf, cred->cr_prison->pr_hostuuid, size); 2913 mtx_unlock(&cred->cr_prison->pr_mtx); 2914 } 2915 2916 void 2917 getcredhostid(struct ucred *cred, unsigned long *hostid) 2918 { 2919 2920 mtx_lock(&cred->cr_prison->pr_mtx); 2921 *hostid = cred->cr_prison->pr_hostid; 2922 mtx_unlock(&cred->cr_prison->pr_mtx); 2923 } 2924 2925 void 2926 getjailname(struct ucred *cred, char *name, size_t len) 2927 { 2928 2929 mtx_lock(&cred->cr_prison->pr_mtx); 2930 strlcpy(name, cred->cr_prison->pr_name, len); 2931 mtx_unlock(&cred->cr_prison->pr_mtx); 2932 } 2933 2934 #ifdef VIMAGE 2935 /* 2936 * Determine whether the prison represented by cred owns 2937 * its vnet rather than having it inherited. 2938 * 2939 * Returns 1 in case the prison owns the vnet, 0 otherwise. 2940 */ 2941 int 2942 prison_owns_vnet(struct ucred *cred) 2943 { 2944 2945 /* 2946 * vnets cannot be added/removed after jail creation, 2947 * so no need to lock here. 2948 */ 2949 return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0); 2950 } 2951 #endif 2952 2953 /* 2954 * Determine whether the subject represented by cred can "see" 2955 * status of a mount point. 2956 * Returns: 0 for permitted, ENOENT otherwise. 2957 * XXX: This function should be called cr_canseemount() and should be 2958 * placed in kern_prot.c. 2959 */ 2960 int 2961 prison_canseemount(struct ucred *cred, struct mount *mp) 2962 { 2963 struct prison *pr; 2964 struct statfs *sp; 2965 size_t len; 2966 2967 pr = cred->cr_prison; 2968 if (pr->pr_enforce_statfs == 0) 2969 return (0); 2970 if (pr->pr_root->v_mount == mp) 2971 return (0); 2972 if (pr->pr_enforce_statfs == 2) 2973 return (ENOENT); 2974 /* 2975 * If jail's chroot directory is set to "/" we should be able to see 2976 * all mount-points from inside a jail. 2977 * This is ugly check, but this is the only situation when jail's 2978 * directory ends with '/'. 2979 */ 2980 if (strcmp(pr->pr_path, "/") == 0) 2981 return (0); 2982 len = strlen(pr->pr_path); 2983 sp = &mp->mnt_stat; 2984 if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) 2985 return (ENOENT); 2986 /* 2987 * Be sure that we don't have situation where jail's root directory 2988 * is "/some/path" and mount point is "/some/pathpath". 2989 */ 2990 if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') 2991 return (ENOENT); 2992 return (0); 2993 } 2994 2995 void 2996 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) 2997 { 2998 char jpath[MAXPATHLEN]; 2999 struct prison *pr; 3000 size_t len; 3001 3002 pr = cred->cr_prison; 3003 if (pr->pr_enforce_statfs == 0) 3004 return; 3005 if (prison_canseemount(cred, mp) != 0) { 3006 bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); 3007 strlcpy(sp->f_mntonname, "[restricted]", 3008 sizeof(sp->f_mntonname)); 3009 return; 3010 } 3011 if (pr->pr_root->v_mount == mp) { 3012 /* 3013 * Clear current buffer data, so we are sure nothing from 3014 * the valid path left there. 3015 */ 3016 bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); 3017 *sp->f_mntonname = '/'; 3018 return; 3019 } 3020 /* 3021 * If jail's chroot directory is set to "/" we should be able to see 3022 * all mount-points from inside a jail. 3023 */ 3024 if (strcmp(pr->pr_path, "/") == 0) 3025 return; 3026 len = strlen(pr->pr_path); 3027 strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); 3028 /* 3029 * Clear current buffer data, so we are sure nothing from 3030 * the valid path left there. 3031 */ 3032 bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); 3033 if (*jpath == '\0') { 3034 /* Should never happen. */ 3035 *sp->f_mntonname = '/'; 3036 } else { 3037 strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); 3038 } 3039 } 3040 3041 /* 3042 * Check with permission for a specific privilege is granted within jail. We 3043 * have a specific list of accepted privileges; the rest are denied. 3044 */ 3045 int 3046 prison_priv_check(struct ucred *cred, int priv) 3047 { 3048 3049 /* 3050 * Some policies have custom handlers. This routine should not be 3051 * called for them. See priv_check_cred(). 3052 */ 3053 switch (priv) { 3054 case PRIV_VFS_LOOKUP: 3055 case PRIV_VFS_GENERATION: 3056 KASSERT(0, ("prison_priv_check instead of a custom handler " 3057 "called for %d\n", priv)); 3058 } 3059 3060 if (!jailed(cred)) 3061 return (0); 3062 3063 #ifdef VIMAGE 3064 /* 3065 * Privileges specific to prisons with a virtual network stack. 3066 * There might be a duplicate entry here in case the privilege 3067 * is only granted conditionally in the legacy jail case. 3068 */ 3069 switch (priv) { 3070 #ifdef notyet 3071 /* 3072 * NFS-specific privileges. 3073 */ 3074 case PRIV_NFS_DAEMON: 3075 case PRIV_NFS_LOCKD: 3076 #endif 3077 /* 3078 * Network stack privileges. 3079 */ 3080 case PRIV_NET_BRIDGE: 3081 case PRIV_NET_GRE: 3082 case PRIV_NET_BPF: 3083 case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */ 3084 case PRIV_NET_ROUTE: 3085 case PRIV_NET_TAP: 3086 case PRIV_NET_SETIFMTU: 3087 case PRIV_NET_SETIFFLAGS: 3088 case PRIV_NET_SETIFCAP: 3089 case PRIV_NET_SETIFDESCR: 3090 case PRIV_NET_SETIFNAME : 3091 case PRIV_NET_SETIFMETRIC: 3092 case PRIV_NET_SETIFPHYS: 3093 case PRIV_NET_SETIFMAC: 3094 case PRIV_NET_SETLANPCP: 3095 case PRIV_NET_ADDMULTI: 3096 case PRIV_NET_DELMULTI: 3097 case PRIV_NET_HWIOCTL: 3098 case PRIV_NET_SETLLADDR: 3099 case PRIV_NET_ADDIFGROUP: 3100 case PRIV_NET_DELIFGROUP: 3101 case PRIV_NET_IFCREATE: 3102 case PRIV_NET_IFDESTROY: 3103 case PRIV_NET_ADDIFADDR: 3104 case PRIV_NET_DELIFADDR: 3105 case PRIV_NET_LAGG: 3106 case PRIV_NET_GIF: 3107 case PRIV_NET_SETIFVNET: 3108 case PRIV_NET_SETIFFIB: 3109 3110 /* 3111 * 802.11-related privileges. 3112 */ 3113 case PRIV_NET80211_VAP_GETKEY: 3114 case PRIV_NET80211_VAP_MANAGE: 3115 3116 #ifdef notyet 3117 /* 3118 * ATM privileges. 3119 */ 3120 case PRIV_NETATM_CFG: 3121 case PRIV_NETATM_ADD: 3122 case PRIV_NETATM_DEL: 3123 case PRIV_NETATM_SET: 3124 3125 /* 3126 * Bluetooth privileges. 3127 */ 3128 case PRIV_NETBLUETOOTH_RAW: 3129 #endif 3130 3131 /* 3132 * Netgraph and netgraph module privileges. 3133 */ 3134 case PRIV_NETGRAPH_CONTROL: 3135 #ifdef notyet 3136 case PRIV_NETGRAPH_TTY: 3137 #endif 3138 3139 /* 3140 * IPv4 and IPv6 privileges. 3141 */ 3142 case PRIV_NETINET_IPFW: 3143 case PRIV_NETINET_DIVERT: 3144 case PRIV_NETINET_PF: 3145 case PRIV_NETINET_DUMMYNET: 3146 case PRIV_NETINET_CARP: 3147 case PRIV_NETINET_MROUTE: 3148 case PRIV_NETINET_RAW: 3149 case PRIV_NETINET_ADDRCTRL6: 3150 case PRIV_NETINET_ND6: 3151 case PRIV_NETINET_SCOPE6: 3152 case PRIV_NETINET_ALIFETIME6: 3153 case PRIV_NETINET_IPSEC: 3154 case PRIV_NETINET_BINDANY: 3155 3156 #ifdef notyet 3157 /* 3158 * NCP privileges. 3159 */ 3160 case PRIV_NETNCP: 3161 3162 /* 3163 * SMB privileges. 3164 */ 3165 case PRIV_NETSMB: 3166 #endif 3167 3168 /* 3169 * No default: or deny here. 3170 * In case of no permit fall through to next switch(). 3171 */ 3172 if (cred->cr_prison->pr_flags & PR_VNET) 3173 return (0); 3174 } 3175 #endif /* VIMAGE */ 3176 3177 switch (priv) { 3178 /* 3179 * Allow ktrace privileges for root in jail. 3180 */ 3181 case PRIV_KTRACE: 3182 3183 #if 0 3184 /* 3185 * Allow jailed processes to configure audit identity and 3186 * submit audit records (login, etc). In the future we may 3187 * want to further refine the relationship between audit and 3188 * jail. 3189 */ 3190 case PRIV_AUDIT_GETAUDIT: 3191 case PRIV_AUDIT_SETAUDIT: 3192 case PRIV_AUDIT_SUBMIT: 3193 #endif 3194 3195 /* 3196 * Allow jailed processes to manipulate process UNIX 3197 * credentials in any way they see fit. 3198 */ 3199 case PRIV_CRED_SETUID: 3200 case PRIV_CRED_SETEUID: 3201 case PRIV_CRED_SETGID: 3202 case PRIV_CRED_SETEGID: 3203 case PRIV_CRED_SETGROUPS: 3204 case PRIV_CRED_SETREUID: 3205 case PRIV_CRED_SETREGID: 3206 case PRIV_CRED_SETRESUID: 3207 case PRIV_CRED_SETRESGID: 3208 3209 /* 3210 * Jail implements visibility constraints already, so allow 3211 * jailed root to override uid/gid-based constraints. 3212 */ 3213 case PRIV_SEEOTHERGIDS: 3214 case PRIV_SEEOTHERUIDS: 3215 3216 /* 3217 * Jail implements inter-process debugging limits already, so 3218 * allow jailed root various debugging privileges. 3219 */ 3220 case PRIV_DEBUG_DIFFCRED: 3221 case PRIV_DEBUG_SUGID: 3222 case PRIV_DEBUG_UNPRIV: 3223 3224 /* 3225 * Allow jail to set various resource limits and login 3226 * properties, and for now, exceed process resource limits. 3227 */ 3228 case PRIV_PROC_LIMIT: 3229 case PRIV_PROC_SETLOGIN: 3230 case PRIV_PROC_SETRLIMIT: 3231 3232 /* 3233 * System V and POSIX IPC privileges are granted in jail. 3234 */ 3235 case PRIV_IPC_READ: 3236 case PRIV_IPC_WRITE: 3237 case PRIV_IPC_ADMIN: 3238 case PRIV_IPC_MSGSIZE: 3239 case PRIV_MQ_ADMIN: 3240 3241 /* 3242 * Jail operations within a jail work on child jails. 3243 */ 3244 case PRIV_JAIL_ATTACH: 3245 case PRIV_JAIL_SET: 3246 case PRIV_JAIL_REMOVE: 3247 3248 /* 3249 * Jail implements its own inter-process limits, so allow 3250 * root processes in jail to change scheduling on other 3251 * processes in the same jail. Likewise for signalling. 3252 */ 3253 case PRIV_SCHED_DIFFCRED: 3254 case PRIV_SCHED_CPUSET: 3255 case PRIV_SIGNAL_DIFFCRED: 3256 case PRIV_SIGNAL_SUGID: 3257 3258 /* 3259 * Allow jailed processes to write to sysctls marked as jail 3260 * writable. 3261 */ 3262 case PRIV_SYSCTL_WRITEJAIL: 3263 3264 /* 3265 * Allow root in jail to manage a variety of quota 3266 * properties. These should likely be conditional on a 3267 * configuration option. 3268 */ 3269 case PRIV_VFS_GETQUOTA: 3270 case PRIV_VFS_SETQUOTA: 3271 3272 /* 3273 * Since Jail relies on chroot() to implement file system 3274 * protections, grant many VFS privileges to root in jail. 3275 * Be careful to exclude mount-related and NFS-related 3276 * privileges. 3277 */ 3278 case PRIV_VFS_READ: 3279 case PRIV_VFS_WRITE: 3280 case PRIV_VFS_ADMIN: 3281 case PRIV_VFS_EXEC: 3282 case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */ 3283 case PRIV_VFS_CHFLAGS_DEV: 3284 case PRIV_VFS_CHOWN: 3285 case PRIV_VFS_CHROOT: 3286 case PRIV_VFS_RETAINSUGID: 3287 case PRIV_VFS_FCHROOT: 3288 case PRIV_VFS_LINK: 3289 case PRIV_VFS_SETGID: 3290 case PRIV_VFS_STAT: 3291 case PRIV_VFS_STICKYFILE: 3292 3293 /* 3294 * As in the non-jail case, non-root users are expected to be 3295 * able to read kernel/phyiscal memory (provided /dev/[k]mem 3296 * exists in the jail and they have permission to access it). 3297 */ 3298 case PRIV_KMEM_READ: 3299 return (0); 3300 3301 /* 3302 * Depending on the global setting, allow privilege of 3303 * setting system flags. 3304 */ 3305 case PRIV_VFS_SYSFLAGS: 3306 if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS) 3307 return (0); 3308 else 3309 return (EPERM); 3310 3311 /* 3312 * Depending on the global setting, allow privilege of 3313 * mounting/unmounting file systems. 3314 */ 3315 case PRIV_VFS_MOUNT: 3316 case PRIV_VFS_UNMOUNT: 3317 case PRIV_VFS_MOUNT_NONUSER: 3318 case PRIV_VFS_MOUNT_OWNER: 3319 if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT && 3320 cred->cr_prison->pr_enforce_statfs < 2) 3321 return (0); 3322 else 3323 return (EPERM); 3324 3325 /* 3326 * Jails should hold no disposition on the PRIV_VFS_READ_DIR 3327 * policy. priv_check_cred will not specifically allow it, and 3328 * we may want a MAC policy to allow it. 3329 */ 3330 case PRIV_VFS_READ_DIR: 3331 return (0); 3332 3333 /* 3334 * Conditionnaly allow locking (unlocking) physical pages 3335 * in memory. 3336 */ 3337 case PRIV_VM_MLOCK: 3338 case PRIV_VM_MUNLOCK: 3339 if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK) 3340 return (0); 3341 else 3342 return (EPERM); 3343 3344 /* 3345 * Conditionally allow jailed root to bind reserved ports. 3346 */ 3347 case PRIV_NETINET_RESERVEDPORT: 3348 if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS) 3349 return (0); 3350 else 3351 return (EPERM); 3352 3353 /* 3354 * Allow jailed root to reuse in-use ports. 3355 */ 3356 case PRIV_NETINET_REUSEPORT: 3357 return (0); 3358 3359 /* 3360 * Allow jailed root to set certain IPv4/6 (option) headers. 3361 */ 3362 case PRIV_NETINET_SETHDROPTS: 3363 return (0); 3364 3365 /* 3366 * Conditionally allow creating raw sockets in jail. 3367 */ 3368 case PRIV_NETINET_RAW: 3369 if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS) 3370 return (0); 3371 else 3372 return (EPERM); 3373 3374 /* 3375 * Since jail implements its own visibility limits on netstat 3376 * sysctls, allow getcred. This allows identd to work in 3377 * jail. 3378 */ 3379 case PRIV_NETINET_GETCRED: 3380 return (0); 3381 3382 /* 3383 * Allow jailed root to set loginclass. 3384 */ 3385 case PRIV_PROC_SETLOGINCLASS: 3386 return (0); 3387 3388 /* 3389 * Do not allow a process inside a jail to read the kernel 3390 * message buffer unless explicitly permitted. 3391 */ 3392 case PRIV_MSGBUF: 3393 if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF) 3394 return (0); 3395 return (EPERM); 3396 3397 default: 3398 /* 3399 * In all remaining cases, deny the privilege request. This 3400 * includes almost all network privileges, many system 3401 * configuration privileges. 3402 */ 3403 return (EPERM); 3404 } 3405 } 3406 3407 /* 3408 * Return the part of pr2's name that is relative to pr1, or the whole name 3409 * if it does not directly follow. 3410 */ 3411 3412 char * 3413 prison_name(struct prison *pr1, struct prison *pr2) 3414 { 3415 char *name; 3416 3417 /* Jails see themselves as "0" (if they see themselves at all). */ 3418 if (pr1 == pr2) 3419 return "0"; 3420 name = pr2->pr_name; 3421 if (prison_ischild(pr1, pr2)) { 3422 /* 3423 * pr1 isn't locked (and allprison_lock may not be either) 3424 * so its length can't be counted on. But the number of dots 3425 * can be counted on - and counted. 3426 */ 3427 for (; pr1 != &prison0; pr1 = pr1->pr_parent) 3428 name = strchr(name, '.') + 1; 3429 } 3430 return (name); 3431 } 3432 3433 /* 3434 * Return the part of pr2's path that is relative to pr1, or the whole path 3435 * if it does not directly follow. 3436 */ 3437 static char * 3438 prison_path(struct prison *pr1, struct prison *pr2) 3439 { 3440 char *path1, *path2; 3441 int len1; 3442 3443 path1 = pr1->pr_path; 3444 path2 = pr2->pr_path; 3445 if (!strcmp(path1, "/")) 3446 return (path2); 3447 len1 = strlen(path1); 3448 if (strncmp(path1, path2, len1)) 3449 return (path2); 3450 if (path2[len1] == '\0') 3451 return "/"; 3452 if (path2[len1] == '/') 3453 return (path2 + len1); 3454 return (path2); 3455 } 3456 3457 /* 3458 * Jail-related sysctls. 3459 */ 3460 static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 3461 "Jails"); 3462 3463 static int 3464 sysctl_jail_list(SYSCTL_HANDLER_ARGS) 3465 { 3466 struct xprison *xp; 3467 struct prison *pr, *cpr; 3468 #ifdef INET 3469 struct in_addr *ip4 = NULL; 3470 int ip4s = 0; 3471 #endif 3472 #ifdef INET6 3473 struct in6_addr *ip6 = NULL; 3474 int ip6s = 0; 3475 #endif 3476 int descend, error; 3477 3478 xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK); 3479 pr = req->td->td_ucred->cr_prison; 3480 error = 0; 3481 sx_slock(&allprison_lock); 3482 FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { 3483 #if defined(INET) || defined(INET6) 3484 again: 3485 #endif 3486 mtx_lock(&cpr->pr_mtx); 3487 #ifdef INET 3488 if (cpr->pr_ip4s > 0) { 3489 if (ip4s < cpr->pr_ip4s) { 3490 ip4s = cpr->pr_ip4s; 3491 mtx_unlock(&cpr->pr_mtx); 3492 ip4 = realloc(ip4, ip4s * 3493 sizeof(struct in_addr), M_TEMP, M_WAITOK); 3494 goto again; 3495 } 3496 bcopy(cpr->pr_ip4, ip4, 3497 cpr->pr_ip4s * sizeof(struct in_addr)); 3498 } 3499 #endif 3500 #ifdef INET6 3501 if (cpr->pr_ip6s > 0) { 3502 if (ip6s < cpr->pr_ip6s) { 3503 ip6s = cpr->pr_ip6s; 3504 mtx_unlock(&cpr->pr_mtx); 3505 ip6 = realloc(ip6, ip6s * 3506 sizeof(struct in6_addr), M_TEMP, M_WAITOK); 3507 goto again; 3508 } 3509 bcopy(cpr->pr_ip6, ip6, 3510 cpr->pr_ip6s * sizeof(struct in6_addr)); 3511 } 3512 #endif 3513 if (cpr->pr_ref == 0) { 3514 mtx_unlock(&cpr->pr_mtx); 3515 continue; 3516 } 3517 bzero(xp, sizeof(*xp)); 3518 xp->pr_version = XPRISON_VERSION; 3519 xp->pr_id = cpr->pr_id; 3520 xp->pr_state = cpr->pr_uref > 0 3521 ? PRISON_STATE_ALIVE : PRISON_STATE_DYING; 3522 strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path)); 3523 strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host)); 3524 strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name)); 3525 #ifdef INET 3526 xp->pr_ip4s = cpr->pr_ip4s; 3527 #endif 3528 #ifdef INET6 3529 xp->pr_ip6s = cpr->pr_ip6s; 3530 #endif 3531 mtx_unlock(&cpr->pr_mtx); 3532 error = SYSCTL_OUT(req, xp, sizeof(*xp)); 3533 if (error) 3534 break; 3535 #ifdef INET 3536 if (xp->pr_ip4s > 0) { 3537 error = SYSCTL_OUT(req, ip4, 3538 xp->pr_ip4s * sizeof(struct in_addr)); 3539 if (error) 3540 break; 3541 } 3542 #endif 3543 #ifdef INET6 3544 if (xp->pr_ip6s > 0) { 3545 error = SYSCTL_OUT(req, ip6, 3546 xp->pr_ip6s * sizeof(struct in6_addr)); 3547 if (error) 3548 break; 3549 } 3550 #endif 3551 } 3552 sx_sunlock(&allprison_lock); 3553 free(xp, M_TEMP); 3554 #ifdef INET 3555 free(ip4, M_TEMP); 3556 #endif 3557 #ifdef INET6 3558 free(ip6, M_TEMP); 3559 #endif 3560 return (error); 3561 } 3562 3563 SYSCTL_OID(_security_jail, OID_AUTO, list, 3564 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 3565 sysctl_jail_list, "S", "List of active jails"); 3566 3567 static int 3568 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) 3569 { 3570 int error, injail; 3571 3572 injail = jailed(req->td->td_ucred); 3573 error = SYSCTL_OUT(req, &injail, sizeof(injail)); 3574 3575 return (error); 3576 } 3577 3578 SYSCTL_PROC(_security_jail, OID_AUTO, jailed, 3579 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 3580 sysctl_jail_jailed, "I", "Process in jail?"); 3581 3582 static int 3583 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS) 3584 { 3585 int error, havevnet; 3586 #ifdef VIMAGE 3587 struct ucred *cred = req->td->td_ucred; 3588 3589 havevnet = jailed(cred) && prison_owns_vnet(cred); 3590 #else 3591 havevnet = 0; 3592 #endif 3593 error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet)); 3594 3595 return (error); 3596 } 3597 3598 SYSCTL_PROC(_security_jail, OID_AUTO, vnet, 3599 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 3600 sysctl_jail_vnet, "I", "Jail owns vnet?"); 3601 3602 #if defined(INET) || defined(INET6) 3603 SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, 3604 &jail_max_af_ips, 0, 3605 "Number of IP addresses a jail may have at most per address family (deprecated)"); 3606 #endif 3607 3608 /* 3609 * Default parameters for jail(2) compatibility. For historical reasons, 3610 * the sysctl names have varying similarity to the parameter names. Prisons 3611 * just see their own parameters, and can't change them. 3612 */ 3613 static int 3614 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS) 3615 { 3616 struct prison *pr; 3617 int allow, error, i; 3618 3619 pr = req->td->td_ucred->cr_prison; 3620 allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow; 3621 3622 /* Get the current flag value, and convert it to a boolean. */ 3623 i = (allow & arg2) ? 1 : 0; 3624 if (arg1 != NULL) 3625 i = !i; 3626 error = sysctl_handle_int(oidp, &i, 0, req); 3627 if (error || !req->newptr) 3628 return (error); 3629 i = i ? arg2 : 0; 3630 if (arg1 != NULL) 3631 i ^= arg2; 3632 /* 3633 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0 3634 * for writing. 3635 */ 3636 mtx_lock(&prison0.pr_mtx); 3637 jail_default_allow = (jail_default_allow & ~arg2) | i; 3638 mtx_unlock(&prison0.pr_mtx); 3639 return (0); 3640 } 3641 3642 SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed, 3643 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 3644 NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I", 3645 "Processes in jail can set their hostnames (deprecated)"); 3646 SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only, 3647 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 3648 (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I", 3649 "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)"); 3650 SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed, 3651 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 3652 NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I", 3653 "Processes in jail can use System V IPC primitives (deprecated)"); 3654 SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets, 3655 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 3656 NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I", 3657 "Prison root can create raw sockets (deprecated)"); 3658 SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed, 3659 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 3660 NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I", 3661 "Processes in jail can alter system file flags (deprecated)"); 3662 SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed, 3663 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 3664 NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I", 3665 "Processes in jail can mount/unmount jail-friendly file systems (deprecated)"); 3666 3667 static int 3668 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS) 3669 { 3670 struct prison *pr; 3671 int level, error; 3672 3673 pr = req->td->td_ucred->cr_prison; 3674 level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2); 3675 error = sysctl_handle_int(oidp, &level, 0, req); 3676 if (error || !req->newptr) 3677 return (error); 3678 *(int *)arg1 = level; 3679 return (0); 3680 } 3681 3682 SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs, 3683 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 3684 &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs), 3685 sysctl_jail_default_level, "I", 3686 "Processes in jail cannot see all mounted file systems (deprecated)"); 3687 3688 SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset, 3689 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 3690 &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum), 3691 sysctl_jail_default_level, "I", 3692 "Ruleset for the devfs filesystem in jail (deprecated)"); 3693 3694 /* 3695 * Nodes to describe jail parameters. Maximum length of string parameters 3696 * is returned in the string itself, and the other parameters exist merely 3697 * to make themselves and their types known. 3698 */ 3699 SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 3700 "Jail parameters"); 3701 3702 int 3703 sysctl_jail_param(SYSCTL_HANDLER_ARGS) 3704 { 3705 int i; 3706 long l; 3707 size_t s; 3708 char numbuf[12]; 3709 3710 switch (oidp->oid_kind & CTLTYPE) 3711 { 3712 case CTLTYPE_LONG: 3713 case CTLTYPE_ULONG: 3714 l = 0; 3715 #ifdef SCTL_MASK32 3716 if (!(req->flags & SCTL_MASK32)) 3717 #endif 3718 return (SYSCTL_OUT(req, &l, sizeof(l))); 3719 case CTLTYPE_INT: 3720 case CTLTYPE_UINT: 3721 i = 0; 3722 return (SYSCTL_OUT(req, &i, sizeof(i))); 3723 case CTLTYPE_STRING: 3724 snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2); 3725 return 3726 (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); 3727 case CTLTYPE_STRUCT: 3728 s = (size_t)arg2; 3729 return (SYSCTL_OUT(req, &s, sizeof(s))); 3730 } 3731 return (0); 3732 } 3733 3734 /* 3735 * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at 3736 * jail creation time but cannot be changed in an existing jail. 3737 */ 3738 SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); 3739 SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); 3740 SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); 3741 SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); 3742 SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, 3743 "I", "Jail secure level"); 3744 SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", 3745 "Jail value for kern.osreldate and uname -K"); 3746 SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, 3747 "Jail value for kern.osrelease and uname -r"); 3748 SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW, 3749 "I", "Jail cannot see all mounted file systems"); 3750 SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW, 3751 "I", "Ruleset for in-jail devfs mounts"); 3752 SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, 3753 "B", "Jail persistence"); 3754 #ifdef VIMAGE 3755 SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, 3756 "E,jailsys", "Virtual network stack"); 3757 #endif 3758 SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, 3759 "B", "Jail is in the process of shutting down"); 3760 3761 SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails"); 3762 SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD, 3763 "I", "Current number of child jails"); 3764 SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW, 3765 "I", "Maximum number of child jails"); 3766 3767 SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info"); 3768 SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, 3769 "Jail hostname"); 3770 SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN, 3771 "Jail NIS domainname"); 3772 SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN, 3773 "Jail host UUID"); 3774 SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW, 3775 "LU", "Jail host ID"); 3776 3777 SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); 3778 SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); 3779 3780 #ifdef INET 3781 SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN, 3782 "Jail IPv4 address virtualization"); 3783 SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), 3784 "S,in_addr,a", "Jail IPv4 addresses"); 3785 SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW, 3786 "B", "Do (not) use IPv4 source address selection rather than the " 3787 "primary jail IPv4 address."); 3788 #endif 3789 #ifdef INET6 3790 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN, 3791 "Jail IPv6 address virtualization"); 3792 SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), 3793 "S,in6_addr,a", "Jail IPv6 addresses"); 3794 SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW, 3795 "B", "Do (not) use IPv6 source address selection rather than the " 3796 "primary jail IPv6 address."); 3797 #endif 3798 3799 SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags"); 3800 SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW, 3801 "B", "Jail may set hostname"); 3802 SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW, 3803 "B", "Jail may use SYSV IPC"); 3804 SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW, 3805 "B", "Jail may create raw sockets"); 3806 SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW, 3807 "B", "Jail may alter system file flags"); 3808 SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW, 3809 "B", "Jail may set file quotas"); 3810 SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, 3811 "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); 3812 SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW, 3813 "B", "Jail may lock (unlock) physical pages in memory"); 3814 SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW, 3815 "B", "Jail may bind sockets to reserved ports"); 3816 SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW, 3817 "B", "Jail may read the kernel message buffer"); 3818 SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW, 3819 "B", "Unprivileged processes may use process debugging facilities"); 3820 SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW, 3821 "B", "Processes in jail with uid 0 have privilege"); 3822 3823 SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags"); 3824 SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW, 3825 "B", "Jail may mount/unmount jail-friendly file systems in general"); 3826 3827 /* 3828 * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>. Return 3829 * its associated bit in the pr_allow bitmask, or zero if the parameter was 3830 * not created. 3831 */ 3832 unsigned 3833 prison_add_allow(const char *prefix, const char *name, const char *prefix_descr, 3834 const char *descr) 3835 { 3836 struct bool_flags *bf; 3837 struct sysctl_oid *parent; 3838 char *allow_name, *allow_noname, *allowed; 3839 #ifndef NO_SYSCTL_DESCR 3840 char *descr_deprecated; 3841 #endif 3842 unsigned allow_flag; 3843 3844 if (prefix 3845 ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name) 3846 < 0 || 3847 asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name) 3848 < 0 3849 : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 || 3850 asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) { 3851 free(allow_name, M_PRISON); 3852 return 0; 3853 } 3854 3855 /* 3856 * See if this parameter has already beed added, i.e. a module was 3857 * previously loaded/unloaded. 3858 */ 3859 mtx_lock(&prison0.pr_mtx); 3860 for (bf = pr_flag_allow; 3861 bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0; 3862 bf++) { 3863 if (strcmp(bf->name, allow_name) == 0) { 3864 allow_flag = bf->flag; 3865 goto no_add; 3866 } 3867 } 3868 3869 /* 3870 * Find a free bit in prison0's pr_allow, failing if there are none 3871 * (which shouldn't happen as long as we keep track of how many 3872 * potential dynamic flags exist). 3873 * 3874 * Due to per-jail unprivileged process debugging support 3875 * using pr_allow, also verify against PR_ALLOW_ALL_STATIC. 3876 * prison0 may have unprivileged process debugging unset. 3877 */ 3878 for (allow_flag = 1;; allow_flag <<= 1) { 3879 if (allow_flag == 0) 3880 goto no_add; 3881 if (allow_flag & PR_ALLOW_ALL_STATIC) 3882 continue; 3883 if ((prison0.pr_allow & allow_flag) == 0) 3884 break; 3885 } 3886 3887 /* 3888 * Note the parameter in the next open slot in pr_flag_allow. 3889 * Set the flag last so code that checks pr_flag_allow can do so 3890 * without locking. 3891 */ 3892 for (bf = pr_flag_allow; bf->flag != 0; bf++) 3893 if (bf == pr_flag_allow + nitems(pr_flag_allow)) { 3894 /* This should never happen, but is not fatal. */ 3895 allow_flag = 0; 3896 goto no_add; 3897 } 3898 prison0.pr_allow |= allow_flag; 3899 bf->name = allow_name; 3900 bf->noname = allow_noname; 3901 bf->flag = allow_flag; 3902 mtx_unlock(&prison0.pr_mtx); 3903 3904 /* 3905 * Create sysctls for the paramter, and the back-compat global 3906 * permission. 3907 */ 3908 parent = prefix 3909 ? SYSCTL_ADD_NODE(NULL, 3910 SYSCTL_CHILDREN(&sysctl___security_jail_param_allow), 3911 OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr) 3912 : &sysctl___security_jail_param_allow; 3913 (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO, 3914 name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 3915 NULL, 0, sysctl_jail_param, "B", descr); 3916 if ((prefix 3917 ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name) 3918 : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) { 3919 #ifndef NO_SYSCTL_DESCR 3920 (void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)", 3921 descr); 3922 #endif 3923 (void)SYSCTL_ADD_PROC(NULL, 3924 SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed, 3925 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag, 3926 sysctl_jail_default_allow, "I", descr_deprecated); 3927 #ifndef NO_SYSCTL_DESCR 3928 free(descr_deprecated, M_TEMP); 3929 #endif 3930 free(allowed, M_TEMP); 3931 } 3932 return allow_flag; 3933 3934 no_add: 3935 mtx_unlock(&prison0.pr_mtx); 3936 free(allow_name, M_PRISON); 3937 free(allow_noname, M_PRISON); 3938 return allow_flag; 3939 } 3940 3941 /* 3942 * The VFS system will register jail-aware filesystems here. They each get 3943 * a parameter allow.mount.xxxfs and a flag to check when a jailed user 3944 * attempts to mount. 3945 */ 3946 void 3947 prison_add_vfs(struct vfsconf *vfsp) 3948 { 3949 #ifdef NO_SYSCTL_DESCR 3950 3951 vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, 3952 NULL, NULL); 3953 #else 3954 char *descr; 3955 3956 (void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system", 3957 vfsp->vfc_name); 3958 vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, 3959 NULL, descr); 3960 free(descr, M_TEMP); 3961 #endif 3962 } 3963 3964 #ifdef RACCT 3965 void 3966 prison_racct_foreach(void (*callback)(struct racct *racct, 3967 void *arg2, void *arg3), void (*pre)(void), void (*post)(void), 3968 void *arg2, void *arg3) 3969 { 3970 struct prison_racct *prr; 3971 3972 ASSERT_RACCT_ENABLED(); 3973 3974 sx_slock(&allprison_lock); 3975 if (pre != NULL) 3976 (pre)(); 3977 LIST_FOREACH(prr, &allprison_racct, prr_next) 3978 (callback)(prr->prr_racct, arg2, arg3); 3979 if (post != NULL) 3980 (post)(); 3981 sx_sunlock(&allprison_lock); 3982 } 3983 3984 static struct prison_racct * 3985 prison_racct_find_locked(const char *name) 3986 { 3987 struct prison_racct *prr; 3988 3989 ASSERT_RACCT_ENABLED(); 3990 sx_assert(&allprison_lock, SA_XLOCKED); 3991 3992 if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN) 3993 return (NULL); 3994 3995 LIST_FOREACH(prr, &allprison_racct, prr_next) { 3996 if (strcmp(name, prr->prr_name) != 0) 3997 continue; 3998 3999 /* Found prison_racct with a matching name? */ 4000 prison_racct_hold(prr); 4001 return (prr); 4002 } 4003 4004 /* Add new prison_racct. */ 4005 prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK); 4006 racct_create(&prr->prr_racct); 4007 4008 strcpy(prr->prr_name, name); 4009 refcount_init(&prr->prr_refcount, 1); 4010 LIST_INSERT_HEAD(&allprison_racct, prr, prr_next); 4011 4012 return (prr); 4013 } 4014 4015 struct prison_racct * 4016 prison_racct_find(const char *name) 4017 { 4018 struct prison_racct *prr; 4019 4020 ASSERT_RACCT_ENABLED(); 4021 4022 sx_xlock(&allprison_lock); 4023 prr = prison_racct_find_locked(name); 4024 sx_xunlock(&allprison_lock); 4025 return (prr); 4026 } 4027 4028 void 4029 prison_racct_hold(struct prison_racct *prr) 4030 { 4031 4032 ASSERT_RACCT_ENABLED(); 4033 4034 refcount_acquire(&prr->prr_refcount); 4035 } 4036 4037 static void 4038 prison_racct_free_locked(struct prison_racct *prr) 4039 { 4040 4041 ASSERT_RACCT_ENABLED(); 4042 sx_assert(&allprison_lock, SA_XLOCKED); 4043 4044 if (refcount_release(&prr->prr_refcount)) { 4045 racct_destroy(&prr->prr_racct); 4046 LIST_REMOVE(prr, prr_next); 4047 free(prr, M_PRISON_RACCT); 4048 } 4049 } 4050 4051 void 4052 prison_racct_free(struct prison_racct *prr) 4053 { 4054 4055 ASSERT_RACCT_ENABLED(); 4056 sx_assert(&allprison_lock, SA_UNLOCKED); 4057 4058 if (refcount_release_if_not_last(&prr->prr_refcount)) 4059 return; 4060 4061 sx_xlock(&allprison_lock); 4062 prison_racct_free_locked(prr); 4063 sx_xunlock(&allprison_lock); 4064 } 4065 4066 static void 4067 prison_racct_attach(struct prison *pr) 4068 { 4069 struct prison_racct *prr; 4070 4071 ASSERT_RACCT_ENABLED(); 4072 sx_assert(&allprison_lock, SA_XLOCKED); 4073 4074 prr = prison_racct_find_locked(pr->pr_name); 4075 KASSERT(prr != NULL, ("cannot find prison_racct")); 4076 4077 pr->pr_prison_racct = prr; 4078 } 4079 4080 /* 4081 * Handle jail renaming. From the racct point of view, renaming means 4082 * moving from one prison_racct to another. 4083 */ 4084 static void 4085 prison_racct_modify(struct prison *pr) 4086 { 4087 #ifdef RCTL 4088 struct proc *p; 4089 struct ucred *cred; 4090 #endif 4091 struct prison_racct *oldprr; 4092 4093 ASSERT_RACCT_ENABLED(); 4094 4095 sx_slock(&allproc_lock); 4096 sx_xlock(&allprison_lock); 4097 4098 if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { 4099 sx_xunlock(&allprison_lock); 4100 sx_sunlock(&allproc_lock); 4101 return; 4102 } 4103 4104 oldprr = pr->pr_prison_racct; 4105 pr->pr_prison_racct = NULL; 4106 4107 prison_racct_attach(pr); 4108 4109 /* 4110 * Move resource utilisation records. 4111 */ 4112 racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct); 4113 4114 #ifdef RCTL 4115 /* 4116 * Force rctl to reattach rules to processes. 4117 */ 4118 FOREACH_PROC_IN_SYSTEM(p) { 4119 PROC_LOCK(p); 4120 cred = crhold(p->p_ucred); 4121 PROC_UNLOCK(p); 4122 rctl_proc_ucred_changed(p, cred); 4123 crfree(cred); 4124 } 4125 #endif 4126 4127 sx_sunlock(&allproc_lock); 4128 prison_racct_free_locked(oldprr); 4129 sx_xunlock(&allprison_lock); 4130 } 4131 4132 static void 4133 prison_racct_detach(struct prison *pr) 4134 { 4135 4136 ASSERT_RACCT_ENABLED(); 4137 sx_assert(&allprison_lock, SA_UNLOCKED); 4138 4139 if (pr->pr_prison_racct == NULL) 4140 return; 4141 prison_racct_free(pr->pr_prison_racct); 4142 pr->pr_prison_racct = NULL; 4143 } 4144 #endif /* RACCT */ 4145 4146 #ifdef DDB 4147 4148 static void 4149 db_show_prison(struct prison *pr) 4150 { 4151 struct bool_flags *bf; 4152 struct jailsys_flags *jsf; 4153 #if defined(INET) || defined(INET6) 4154 int ii; 4155 #endif 4156 unsigned f; 4157 #ifdef INET 4158 char ip4buf[INET_ADDRSTRLEN]; 4159 #endif 4160 #ifdef INET6 4161 char ip6buf[INET6_ADDRSTRLEN]; 4162 #endif 4163 4164 db_printf("prison %p:\n", pr); 4165 db_printf(" jid = %d\n", pr->pr_id); 4166 db_printf(" name = %s\n", pr->pr_name); 4167 db_printf(" parent = %p\n", pr->pr_parent); 4168 db_printf(" ref = %d\n", pr->pr_ref); 4169 db_printf(" uref = %d\n", pr->pr_uref); 4170 db_printf(" path = %s\n", pr->pr_path); 4171 db_printf(" cpuset = %d\n", pr->pr_cpuset 4172 ? pr->pr_cpuset->cs_id : -1); 4173 #ifdef VIMAGE 4174 db_printf(" vnet = %p\n", pr->pr_vnet); 4175 #endif 4176 db_printf(" root = %p\n", pr->pr_root); 4177 db_printf(" securelevel = %d\n", pr->pr_securelevel); 4178 db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum); 4179 db_printf(" children.max = %d\n", pr->pr_childmax); 4180 db_printf(" children.cur = %d\n", pr->pr_childcount); 4181 db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children)); 4182 db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling)); 4183 db_printf(" flags = 0x%x", pr->pr_flags); 4184 for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) 4185 if (pr->pr_flags & bf->flag) 4186 db_printf(" %s", bf->name); 4187 for (jsf = pr_flag_jailsys; 4188 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); 4189 jsf++) { 4190 f = pr->pr_flags & (jsf->disable | jsf->new); 4191 db_printf(" %-16s= %s\n", jsf->name, 4192 (f != 0 && f == jsf->disable) ? "disable" 4193 : (f == jsf->new) ? "new" 4194 : "inherit"); 4195 } 4196 db_printf(" allow = 0x%x", pr->pr_allow); 4197 for (bf = pr_flag_allow; 4198 bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0; 4199 bf++) 4200 if (pr->pr_allow & bf->flag) 4201 db_printf(" %s", bf->name); 4202 db_printf("\n"); 4203 db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs); 4204 db_printf(" host.hostname = %s\n", pr->pr_hostname); 4205 db_printf(" host.domainname = %s\n", pr->pr_domainname); 4206 db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid); 4207 db_printf(" host.hostid = %lu\n", pr->pr_hostid); 4208 #ifdef INET 4209 db_printf(" ip4s = %d\n", pr->pr_ip4s); 4210 for (ii = 0; ii < pr->pr_ip4s; ii++) 4211 db_printf(" %s %s\n", 4212 ii == 0 ? "ip4.addr =" : " ", 4213 inet_ntoa_r(pr->pr_ip4[ii], ip4buf)); 4214 #endif 4215 #ifdef INET6 4216 db_printf(" ip6s = %d\n", pr->pr_ip6s); 4217 for (ii = 0; ii < pr->pr_ip6s; ii++) 4218 db_printf(" %s %s\n", 4219 ii == 0 ? "ip6.addr =" : " ", 4220 ip6_sprintf(ip6buf, &pr->pr_ip6[ii])); 4221 #endif 4222 } 4223 4224 DB_SHOW_COMMAND(prison, db_show_prison_command) 4225 { 4226 struct prison *pr; 4227 4228 if (!have_addr) { 4229 /* 4230 * Show all prisons in the list, and prison0 which is not 4231 * listed. 4232 */ 4233 db_show_prison(&prison0); 4234 if (!db_pager_quit) { 4235 TAILQ_FOREACH(pr, &allprison, pr_list) { 4236 db_show_prison(pr); 4237 if (db_pager_quit) 4238 break; 4239 } 4240 } 4241 return; 4242 } 4243 4244 if (addr == 0) 4245 pr = &prison0; 4246 else { 4247 /* Look for a prison with the ID and with references. */ 4248 TAILQ_FOREACH(pr, &allprison, pr_list) 4249 if (pr->pr_id == addr && pr->pr_ref > 0) 4250 break; 4251 if (pr == NULL) 4252 /* Look again, without requiring a reference. */ 4253 TAILQ_FOREACH(pr, &allprison, pr_list) 4254 if (pr->pr_id == addr) 4255 break; 4256 if (pr == NULL) 4257 /* Assume address points to a valid prison. */ 4258 pr = (struct prison *)addr; 4259 } 4260 db_show_prison(pr); 4261 } 4262 4263 #endif /* DDB */ 4264