/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Zones
 *
 *   A zone is a named collection of processes, namespace constraints,
 *   and other system resources which comprise a secure and manageable
 *   application containment facility.
 *
 *   Zones (represented by the reference counted zone_t) are tracked in
 *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 *   (zoneid_t) are used to track zone association.  Zone IDs are
 *   dynamically generated when the zone is created; if a persistent
 *   identifier is needed (core files, accounting logs, audit trail,
 *   etc.), the zone name should be used.
 *
 *
 *   Global Zone:
 *
 *   The global zone (zoneid 0) is automatically associated with all
 *   system resources that have not been bound to a user-created zone.
 *   This means that even systems where zones are not in active use
 *   have a global zone, and all processes, mounts, etc. are
 *   associated with that zone.  The global zone is generally
 *   unconstrained in terms of privileges and access, though the usual
 *   credential and privilege based restrictions apply.
 *
 *
 *   Zone States:
 *
 *   The states a zone may be in, and the transitions between them, are
 *   as follows:
 *
 *   ZONE_IS_UNINITIALIZED: primordial state for a zone.  The partially
 *   initialized zone is added to the list of active zones on the system but
 *   isn't accessible.
 *
 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 *   ready.  The zone is made visible after the ZSD constructor callbacks are
 *   executed.  A zone remains in this state until it transitions into
 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 *   state.
 *
 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
 *   successfully started init.  A zone remains in this state until
 *   zone_shutdown() is called.
 *
 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
 *   killing all processes running in the zone.  The zone remains
 *   in this state until there are no more user processes running in the zone.
 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 *   Since zone_shutdown() is restartable, it may be called successfully
 *   multiple times for the same zone_t.
 *   Setting of the zone's state to ZONE_IS_SHUTTING_DOWN is synchronized
 *   with mounts, so VOP_MOUNT() may check the zone's status without
 *   worrying about it being a moving target.
 *
 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 *   are no more user processes in the zone.  The zone remains in this
 *   state until there are no more kernel threads associated with the
 *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 *   fail.
 *
 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 *   join the zone or create kernel threads therein.
 *
 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; the zone
 *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 *   return NULL from now on.
 *
 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 *   processes or threads doing work on behalf of the zone.  The zone is
 *   removed from the list of active zones.  zone_destroy() returns, and
 *   the zone can be recreated.
 *
 *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 *   callbacks are executed, and all memory associated with the zone is
 *   freed.
 *
 *   Threads can wait for the zone to enter a requested state by using
 *   zone_status_wait() or zone_status_timedwait() with the desired
 *   state passed in as an argument.  Zone state transitions are
 *   uni-directional; it is not possible to move back to an earlier state.
 *
 *
 *   Zone-Specific Data:
 *
 *   Subsystems needing to maintain zone-specific data can store that
 *   data using the ZSD mechanism.  This provides a zone-specific data
 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 *   to register callbacks to be invoked when a zone is created, shut
 *   down, or destroyed.  This can be used to initialize zone-specific
 *   data for new zones and to clean up when zones go away.
 *
 *
 *   Data Structures:
 *
 *   The per-zone structure (zone_t) is reference counted, and freed
 *   when all references are released.  zone_hold and zone_rele can be
 *   used to adjust the reference count.  In addition, reference counts
 *   associated with the cred_t structure are tracked separately using
 *   zone_cred_hold and zone_cred_rele.
 *
 *   Pointers to active zone_t's are stored in two hash tables; one
 *   for searching by id, the other for searching by name.  Lookups
 *   can be performed on either basis, using zone_find_by_id and
 *   zone_find_by_name.  Both return zone_t pointers with the zone
 *   held, so zone_rele should be called when the pointer is no longer
 *   needed.  Zones can also be searched by path; zone_find_by_path
 *   returns the zone with which a path name is associated (global
 *   zone if the path is not within some other zone's file system
 *   hierarchy).  This currently requires iterating through each zone,
 *   so it is slower than an id or name search via a hash table.
 *
 *
 *   Locking:
 *
 *   zonehash_lock: This is a top-level global lock used to protect the
 *       zone hash tables and lists.  Zones cannot be created or destroyed
 *       while this lock is held.
 *   zone_status_lock: This is a global lock protecting zone state.
 *       Zones cannot change state while this lock is held.
 *       It also protects the list of kernel threads associated with a zone.
 *   zone_lock: This is a per-zone lock used to protect several fields of
 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 *       this lock means that the zone cannot go away.
 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *       list (a list of zones in the ZONE_IS_DEAD state).
 *
 *   Ordering requirements:
 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 *   Blocking memory allocations are permitted while holding any of the
 *   zone locks.
 *
 *
 *   System Call Interface:
 *
 *   The zone subsystem can be managed and queried from user level with
 *   the following system calls (all subcodes of the primary "zone"
 *   system call):
 *   - zone_create: creates a zone with selected attributes (name,
 *     root path, privileges, resource controls, ZFS datasets)
 *   - zone_enter: allows the current process to enter a zone
 *   - zone_getattr: reports attributes of a zone
 *   - zone_list: lists all zones active in the system
 *   - zone_lookup: looks up zone id based on name
 *   - zone_shutdown: initiates shutdown process (see states above)
 *   - zone_destroy: completes shutdown process (see states above)
 *
 */

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/log.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/fs/snode.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/zone.h>

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;	/* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
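 * Each zone's zone_zsd list holds the per-zone instances of these
 * entries; they are created either when the zone is created (by
 * zone_zsd_configure()) or lazily by zone_setspecific().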
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;	/* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_nlwps;
/*
 * Synchronization primitives used to synchronize between mounts and zone
 * creation/destruction.
 */
static int mounts_in_progress;
static kcondvar_t mount_cv;
static kmutex_t mount_lock;

const char * const zone_initname = "/sbin/init";

static int zone_shutdown(zoneid_t zoneid);

/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure, and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 */
static const int ZONE_SYSCALL_API_VERSION = 3;

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created such that
 * nfs_mount() thinks it is in the global zone, while by the time it
 * gets added to the list of mounted zones, it ends up on zoneA's mount
 * list.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone creation and new mounts.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone creations, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is unfair bias towards the
 * "current" operation.
 * This means that zone creations may starve if there is a rapid
 * succession of new mounts coming in to the system, or there is a
 * remote possibility that zones will be created at such a rate that
 * new mounts will not be able to proceed.
 */
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(void)
{
	int retval = 0;

	/*
	 * Since it may block for a long time, block_mounts() shouldn't be
	 * called with zonehash_lock held.
	 */
	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
	mutex_enter(&mount_lock);
	while (mounts_in_progress > 0) {
		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
			goto signaled;
	}
	/*
	 * A negative value of mounts_in_progress indicates that mounts
	 * have been blocked by (-mounts_in_progress) different callers.
	 */
	mounts_in_progress--;
	retval = 1;
signaled:
	mutex_exit(&mount_lock);
	return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(void)
{
	mutex_enter(&mount_lock);
	if (++mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * The VFS layer is busy with a mount; zones should wait until all
 * mounts complete before proceeding.
 */
void
mount_in_progress(void)
{
	mutex_enter(&mount_lock);
	while (mounts_in_progress < 0)
		cv_wait(&mount_cv, &mount_lock);
	mounts_in_progress++;
	mutex_exit(&mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(void)
{
	mutex_enter(&mount_lock);
	if (--mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shut down, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  The
 * constructor callback is called immediately on all existing zones, and a
 * copy of the ZSD entry added to the per-zone zone_zsd list (protected by
 * zone_lock).  As this operation requires the list of zones, the list of
 * registered keys, and the per-zone list of ZSD entries to remain constant
 * throughout the entire operation, it must grab zonehash_lock, zone_lock for
 * all existing zones, and zsd_key_lock, in that order.  Similar locking is
 * needed when zone_key_delete() is called.  It is thus sufficient to hold
 * zsd_key_lock *or* zone_lock to prevent additions to or removals from the
 * per-zone zone_zsd list.
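 *
 * For example, a subsystem would typically use ZSD as follows (an
 * illustrative sketch only; the "foo" names are hypothetical and not
 * part of this file):
 *
 *	static zone_key_t foo_zone_key;
 *
 *	static void *
 *	foo_zone_create(zoneid_t zoneid)
 *	{
 *		return (kmem_zalloc(sizeof (struct foo_state), KM_SLEEP));
 *	}
 *
 *	static void
 *	foo_zone_destroy(zoneid_t zoneid, void *data)
 *	{
 *		if (data != NULL)	// destructors may be passed NULL data
 *			kmem_free(data, sizeof (struct foo_state));
 *	}
 *
 *	...
 *	zone_key_create(&foo_zone_key, foo_zone_create, NULL,
 *	    foo_zone_destroy);
 *	...
 *	struct foo_state *fsp = zone_getspecific(foo_zone_key, zone);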
 *
 * Note that this implementation does not make a copy of the ZSD entry if a
 * constructor callback is not provided.  A zone_getspecific() on such an
 * uninitialized ZSD entry will return NULL.
 *
 * When new zones are created, constructor callbacks for all registered ZSD
 * entries will be called.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */
void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	struct zone *zone;

	zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP);
	zsdp->zsd_data = NULL;
	zsdp->zsd_create = create;
	zsdp->zsd_shutdown = shutdown;
	zsdp->zsd_destroy = destroy;

	mutex_enter(&zonehash_lock);	/* stop the world */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_enter(&zone->zone_lock);	/* lock all zones */

	mutex_enter(&zsd_key_lock);
	*keyp = zsdp->zsd_key = ++zsd_keyval;
	ASSERT(zsd_keyval != 0);
	list_insert_tail(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	if (create != NULL) {
		for (zone = list_head(&zone_active); zone != NULL;
		    zone = list_next(&zone_active, zone)) {
			t = kmem_alloc(sizeof (*t), KM_SLEEP);
			t->zsd_key = *keyp;
			t->zsd_data = (*create)(zone->zone_id);
			t->zsd_create = create;
			t->zsd_shutdown = shutdown;
			t->zsd_destroy = destroy;
			list_insert_tail(&zone->zone_zsd, t);
		}
	}
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_exit(&zone->zone_lock);
	mutex_exit(&zonehash_lock);
}

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
	struct zsd_entry *zsd;

	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
		if (zsd->zsd_key == key) {
			/*
			 * Move to head of list to keep list in MRU order.
			 */
			if (zsd != list_head(l)) {
				list_remove(l, zsd);
				list_insert_head(l, zsd);
			}
			return (zsd);
		}
	}
	return (NULL);
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
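 *
 * A module would typically pair this with the zone_key_create() call
 * made when it was loaded, e.g. in its _fini() routine (hypothetical
 * sketch, continuing the "foo" example above):
 *
 *	(void) zone_key_delete(foo_zone_key);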
 */
int
zone_key_delete(zone_key_t key)
{
	struct zsd_entry *zsdp = NULL;
	zone_t *zone;

	mutex_enter(&zonehash_lock);	/* Zone create/delete waits for us */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_enter(&zone->zone_lock);	/* lock all zones */

	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find(&zsd_registered_keys, key);
	if (zsdp == NULL)
		goto notfound;
	list_remove(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		struct zsd_entry *del;
		void *data;

		if (!(zone->zone_flags & ZF_DESTROYED)) {
			del = zsd_find(&zone->zone_zsd, key);
			if (del != NULL) {
				data = del->zsd_data;
				ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
				ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
				list_remove(&zone->zone_zsd, del);
				kmem_free(del, sizeof (*del));
			} else {
				data = NULL;
			}
			if (zsdp->zsd_shutdown)
				zsdp->zsd_shutdown(zone->zone_id, data);
			if (zsdp->zsd_destroy)
				zsdp->zsd_destroy(zone->zone_id, data);
		}
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);
	kmem_free(zsdp, sizeof (*zsdp));
	return (0);

notfound:
	mutex_exit(&zsd_key_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_exit(&zone->zone_lock);
	mutex_exit(&zonehash_lock);
	return (-1);
}

/*
 * ZSD counterpart of pthread_setspecific().
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
	struct zsd_entry *t;
	struct zsd_entry *zsdp = NULL;

	mutex_enter(&zone->zone_lock);
	t = zsd_find(&zone->zone_zsd, key);
	if (t != NULL) {
		/*
		 * Replace old value with new
		 */
		t->zsd_data = (void *)data;
		mutex_exit(&zone->zone_lock);
		return (0);
	}
	/*
	 * If there was no previous value, go through the list of registered
	 * keys.
	 *
	 * We avoid grabbing zsd_key_lock until we are sure we need it; this is
	 * necessary for shutdown callbacks to be able to execute without fear
	 * of deadlock.
	 */
	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find(&zsd_registered_keys, key);
	if (zsdp == NULL) {	/* Key was not registered */
		mutex_exit(&zsd_key_lock);
		mutex_exit(&zone->zone_lock);
		return (-1);
	}

	/*
	 * Add a zsd_entry to this zone, using the template we just retrieved
	 * to initialize the constructor and destructor(s).
	 */
	t = kmem_alloc(sizeof (*t), KM_SLEEP);
	t->zsd_key = key;
	t->zsd_data = (void *)data;
	t->zsd_create = zsdp->zsd_create;
	t->zsd_shutdown = zsdp->zsd_shutdown;
	t->zsd_destroy = zsdp->zsd_destroy;
	list_insert_tail(&zone->zone_zsd, t);
	mutex_exit(&zsd_key_lock);
	mutex_exit(&zone->zone_lock);
	return (0);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
	struct zsd_entry *t;
	void *data;

	mutex_enter(&zone->zone_lock);
	t = zsd_find(&zone->zone_zsd, key);
	data = (t == NULL ? NULL : t->zsd_data);
	mutex_exit(&zone->zone_lock);
	return (data);
}

/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created.
 * The callbacks are initialized from the template list
 * (zsd_registered_keys), and the constructor callback executed (if
 * one exists).
 *
 * This is called before the zone is made publicly available, hence no
 * need to grab zone_lock.
 *
 * Although we grab and release zsd_key_lock, new entries cannot be
 * added to or removed from the zsd_registered_keys list until we
 * release zonehash_lock, so there isn't a window for a
 * zone_key_create() to come in after we've dropped zsd_key_lock but
 * before the zone is added to the zone list, such that the constructor
 * callbacks aren't executed for the new zone.
 */
static void
zone_zsd_configure(zone_t *zone)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	zoneid_t zoneid = zone->zone_id;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	ASSERT(list_head(&zone->zone_zsd) == NULL);
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		if (zsdp->zsd_create != NULL) {
			t = kmem_alloc(sizeof (*t), KM_SLEEP);
			t->zsd_key = zsdp->zsd_key;
			t->zsd_create = zsdp->zsd_create;
			t->zsd_data = (*t->zsd_create)(zoneid);
			t->zsd_shutdown = zsdp->zsd_shutdown;
			t->zsd_destroy = zsdp->zsd_destroy;
			list_insert_tail(&zone->zone_zsd, t);
		}
	}
	mutex_exit(&zsd_key_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	zoneid_t zoneid = zone->zone_id;

	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

	mutex_enter(&zone->zone_lock);
	if (ct == ZSD_DESTROY) {
		if (zone->zone_flags & ZF_DESTROYED) {
			/*
			 * Make sure destructors are only called once.
			 */
			mutex_exit(&zone->zone_lock);
			return;
		}
		zone->zone_flags |= ZF_DESTROYED;
	}
	mutex_exit(&zone->zone_lock);

	/*
	 * Both zsd_key_lock and zone_lock need to be held in order to add or
	 * remove a ZSD key (either globally as part of
	 * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is
	 * possible through zone_setspecific()), so it's sufficient to hold
	 * zsd_key_lock here.
	 *
	 * This is a good thing, since we don't want to recursively try to grab
	 * zone_lock if a callback attempts to do something like a crfree() or
	 * zone_rele().
	 */
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		zone_key_t key = zsdp->zsd_key;

		/* Skip if no callbacks registered */
		if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL)
			continue;
		if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL)
			continue;
		/*
		 * Call the callback with the zone-specific data if we can find
		 * any, otherwise with NULL.
		 */
		t = zsd_find(&zone->zone_zsd, key);
		if (t != NULL) {
			if (ct == ZSD_SHUTDOWN) {
				t->zsd_shutdown(zoneid, t->zsd_data);
			} else {
				ASSERT(ct == ZSD_DESTROY);
				t->zsd_destroy(zoneid, t->zsd_data);
			}
		} else {
			if (ct == ZSD_SHUTDOWN) {
				zsdp->zsd_shutdown(zoneid, NULL);
			} else {
				ASSERT(ct == ZSD_DESTROY);
				zsdp->zsd_destroy(zoneid, NULL);
			}
		}
	}
	mutex_exit(&zsd_key_lock);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
	struct zsd_entry *t, *next;

	/*
	 * Free all the zsd_entry's we had on this zone.
	 */
	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
		next = list_next(&zone->zone_zsd, t);
		list_remove(&zone->zone_zsd, t);
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_zsd);
}

/*
 * Frees memory associated with the zone dataset list.
 */
static void
zone_free_datasets(zone_t *zone)
{
	zone_dataset_t *t, *next;

	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
		next = list_next(&zone->zone_datasets, t);
		list_remove(&zone->zone_datasets, t);
		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_datasets);
}

/*
 * zone.cpu-shares resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	return (p->p_zone->zone_shares);
}

/*ARGSUSED*/
static int
zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);

	e->rcep_p.zone->zone_shares = nv;
	return (0);
}

static rctl_ops_t zone_cpu_shares_ops = {
	rcop_no_action,
	zone_cpu_shares_usage,
	zone_cpu_shares_set,
	rcop_no_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
	rctl_qty_t nlwps;
	zone_t *zone = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));

	mutex_enter(&zone->zone_nlwps_lock);
	nlwps = zone->zone_nlwps;
	mutex_exit(&zone->zone_nlwps_lock);

	return (nlwps);
}

/*ARGSUSED*/
static int
zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t nlwps;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
	nlwps = e->rcep_p.zone->zone_nlwps;

	if (nlwps + incr > rcntl->rcv_value)
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_nlwps_ctl = nv;
	return (0);
}

static rctl_ops_t zone_lwps_ops = {
	rcop_no_action,
	zone_lwps_usage,
	zone_lwps_set,
	zone_lwps_test
};

/*
 * Helper function to brand the zone with a unique ID.
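 * Unlike zone IDs, which are allocated from zoneid_space and freed back
 * to it in zone_free() (and hence may be reused), the unique ID simply
 * increments for each zone created, so it can distinguish successive
 * incarnations of a zone that was destroyed and recreated.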
 */
static void
zone_uniqid(zone_t *zone)
{
	static uint64_t uniqid = 0;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	zone->zone_uniqid = uniqid++;
}

/*
 * Returns a held pointer to the "kcred" for the specified zone.
 */
struct cred *
zone_get_kcred(zoneid_t zoneid)
{
	zone_t *zone;
	cred_t *cr;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (NULL);
	cr = zone->zone_kcred;
	crhold(cr);
	zone_rele(zone);
	return (cr);
}

/*
 * Called very early on in boot to initialize the ZSD list so that
 * zone_key_create() can be called before zone_init().  It also initializes
 * portions of zone0 which may be used before zone_init() is called.  The
 * variable "global_zone" will be set when zone0 is fully initialized by
 * zone_init().
 */
void
zone_zsd_init(void)
{
	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_create(&zone_active, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));
	list_create(&zone_deathrow, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));

	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
	zone0.zone_shares = 1;
	zone0.zone_nlwps_ctl = INT_MAX;
	zone0.zone_name = GLOBAL_ZONENAME;
	zone0.zone_nodename = utsname.nodename;
	zone0.zone_domain = srpc_domain;
	zone0.zone_ref = 1;
	zone0.zone_id = GLOBAL_ZONEID;
	zone0.zone_status = ZONE_IS_RUNNING;
	zone0.zone_rootpath = "/";
	zone0.zone_rootpathlen = 2;
	zone0.zone_psetid = ZONE_PS_INVAL;
	zone0.zone_ncpus = 0;
	zone0.zone_ncpus_online = 0;
	zone0.zone_proc_initpid = 1;
	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_insert_head(&zone_active, &zone0);

	/*
	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
	 * to anything meaningful.  It is assigned to be 'rootdir' in
	 * vfs_mountroot().
	 */
	zone0.zone_rootvp = NULL;
	zone0.zone_vfslist = NULL;
	zone0.zone_bootargs = NULL;
	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
	/*
	 * The global zone has all privileges
	 */
	priv_fillset(zone0.zone_privset);
	/*
	 * Add p0 to the global zone
	 */
	zone0.zone_zsched = &p0;
	p0.p_zone = &zone0;
}

/*
 * Called by main() to initialize the zones framework.
 */
void
zone_init(void)
{
	rctl_dict_entry_t *rde;
	rctl_val_t *dval;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;

	ASSERT(curproc == &p0);

	/*
	 * Create ID space for zone IDs.  ID 0 is reserved for the
	 * global zone.
	 */
	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);

	/*
	 * Initialize generic zone resource controls, if any.
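	 * (zone.cpu-shares is the zone's number of FSS CPU shares;
	 * zone.max-lwps caps the number of lwps in the zone.  From
	 * userland such controls would typically be tuned with, e.g.,
	 * prctl(1) -- "prctl -n zone.cpu-shares -v 10 -r -i zone myzone",
	 * where "myzone" is a hypothetical zone name.)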
	 */
	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, FSS_MAXSHARES,
	    FSS_MAXSHARES, &zone_cpu_shares_ops);

	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_lwps_ops);
	/*
	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
	 */
	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
	bzero(dval, sizeof (rctl_val_t));
	dval->rcv_value = 1;
	dval->rcv_privilege = RCPRIV_PRIVILEGED;
	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
	dval->rcv_action_recip_pid = -1;

	rde = rctl_dict_lookup("zone.cpu-shares");
	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);

	/*
	 * Initialize the ``global zone''.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&p0.p_lock);
	e.rcep_p.zone = &zone0;
	e.rcep_t = RCENTITY_ZONE;
	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set, gp);

	zone0.zone_nlwps = p0.p_lwpcnt;
	zone0.zone_ntasks = 1;
	mutex_exit(&p0.p_lock);
	rctl_prealloc_destroy(gp);
	/*
	 * pool_default hasn't been initialized yet, so we let pool_init()
	 * take care of making sure the global zone is in the default pool.
	 */
	mutex_enter(&zonehash_lock);
	zone_uniqid(&zone0);
	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
	mutex_exit(&zonehash_lock);
	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
	    mod_hash_null_valdtor);
	zonehashbyname = mod_hash_create_strhash("zone_by_name",
	    zone_hash_size, mod_hash_null_valdtor);
	zonecount = 1;

	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
	    (mod_hash_val_t)&zone0);
	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
	    (mod_hash_val_t)&zone0);
	/*
	 * We avoid setting zone_kcred until now, since kcred is initialized
	 * sometime after zone_zsd_init() and before zone_init().
	 */
	zone0.zone_kcred = kcred;
	/*
	 * The global zone is fully initialized (except for zone_rootvp which
	 * will be set when the root filesystem is mounted).
	 */
	global_zone = &zone0;
}

static void
zone_free(zone_t *zone)
{
	ASSERT(zone != global_zone);
	ASSERT(zone->zone_ntasks == 0);
	ASSERT(zone->zone_nlwps == 0);
	ASSERT(zone->zone_cred_ref == 0);
	ASSERT(zone->zone_kcred == NULL);
	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);

	/* remove from deathrow list */
	if (zone_status_get(zone) == ZONE_IS_DEAD) {
		ASSERT(zone->zone_ref == 0);
		mutex_enter(&zone_deathrow_lock);
		list_remove(&zone_deathrow, zone);
		mutex_exit(&zone_deathrow_lock);
	}

	zone_free_zsd(zone);
	zone_free_datasets(zone);

	if (zone->zone_rootvp != NULL)
		VN_RELE(zone->zone_rootvp);
	if (zone->zone_rootpath)
		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
	if (zone->zone_name != NULL)
		kmem_free(zone->zone_name, ZONENAME_MAX);
	if (zone->zone_nodename != NULL)
		kmem_free(zone->zone_nodename, _SYS_NMLN);
	if (zone->zone_domain != NULL)
		kmem_free(zone->zone_domain, _SYS_NMLN);
	if (zone->zone_privset != NULL)
		kmem_free(zone->zone_privset, sizeof (priv_set_t));
	if (zone->zone_rctls != NULL)
		rctl_set_free(zone->zone_rctls);
	if (zone->zone_bootargs != NULL)
		kmem_free(zone->zone_bootargs, ZONEBOOTARGS_MAX);
	id_free(zoneid_space, zone->zone_id);
	mutex_destroy(&zone->zone_lock);
	cv_destroy(&zone->zone_cv);
	kmem_free(zone, sizeof (zone_t));
}

/*
 * See block comment at the top of this file for information about zone
 * status values.
 */
/*
 * Convenience function for setting zone status.
 */
static void
zone_status_set(zone_t *zone, zone_status_t status)
{
	ASSERT(MUTEX_HELD(&zone_status_lock));
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
	    status >= zone_status_get(zone));
	zone->zone_status = status;
	cv_broadcast(&zone->zone_cv);
}

/*
 * Public function to retrieve the zone status.  The zone status may
 * change after it is retrieved.
 */
zone_status_t
zone_status_get(zone_t *zone)
{
	return (zone->zone_status);
}

static int
zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
{
	char *bootargs = kmem_zalloc(ZONEBOOTARGS_MAX, KM_SLEEP);
	size_t len;
	int err;

	err = copyinstr(zone_bootargs, bootargs, ZONEBOOTARGS_MAX - 1, &len);
	if (err != 0) {
		kmem_free(bootargs, ZONEBOOTARGS_MAX);
		return (err);	/* EFAULT or ENAMETOOLONG */
	}
	bootargs[len] = '\0';

	ASSERT(zone->zone_bootargs == NULL);
	zone->zone_bootargs = bootargs;
	return (0);
}

/*
 * Block indefinitely waiting for (zone_status >= status)
 */
void
zone_status_wait(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		cv_wait(&zone->zone_cv, &zone_status_lock);
	}
	mutex_exit(&zone_status_lock);
}

/*
 * Private CPR-safe version of zone_status_wait().
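 * ("CPR" is checkpoint/resume, i.e. system suspend/resume; the
 * CALLB_CPR_SAFE_BEGIN/END bracketing below marks this thread as safe
 * to suspend while it is blocked on the cv.)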
 */
static void
zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
{
	callb_cpr_t cprinfo;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr, str);
	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&zone->zone_cv, &zone_status_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
	}
	/*
	 * zone_status_lock is implicitly released by the following.
	 */
	CALLB_CPR_EXIT(&cprinfo);
}

/*
 * Block until zone enters requested state or signal is received.  Return (0)
 * if signaled, non-zero otherwise.
 */
int
zone_status_wait_sig(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
			mutex_exit(&zone_status_lock);
			return (0);
		}
	}
	mutex_exit(&zone_status_lock);
	return (1);
}

/*
 * Block until the zone enters the requested state or the timeout expires,
 * whichever happens first.  Return (-1) if operation timed out, time remaining
 * otherwise.
 */
clock_t
zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
{
	clock_t timeleft = 0;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status && timeleft != -1) {
		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
	}
	mutex_exit(&zone_status_lock);
	return (timeleft);
}

/*
 * Block until the zone enters the requested state, the current process is
 * signaled, or the timeout expires, whichever happens first.  Return (-1) if
 * operation timed out, 0 if signaled, time remaining otherwise.
 */
clock_t
zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
{
	clock_t timeleft = tim - lbolt;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
		    tim);
		if (timeleft <= 0)
			break;
	}
	mutex_exit(&zone_status_lock);
	return (timeleft);
}

/*
 * Zones have two reference counts: one for references from credential
 * structures (zone_cred_ref), and one (zone_ref) for everything else.
 * This is so we can allow a zone to be rebooted while there are still
 * outstanding cred references, since certain drivers cache dblks (which
 * implicitly results in cached creds).  We wait for zone_ref to drop to
 * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
 * later freed when the zone_cred_ref drops to 0, though nothing other
 * than the zone id and privilege set should be accessed once the zone
 * is "dead".
 *
 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
 * to force halt/reboot to block waiting for the zone_cred_ref to drop
 * to 0.  This can be useful to flush out other sources of cached creds
 * that may be less innocuous than the driver case.
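 *
 * The typical lookup pattern is thus (illustrative sketch):
 *
 *	zone_t *zone;
 *
 *	if ((zone = zone_find_by_id(zoneid)) != NULL) {
 *		... use zone ...
 *		zone_rele(zone);
 *	}
 *
 * zone_find_by_id() returns the zone held, so it cannot be freed out
 * from under the caller until the matching zone_rele().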
 */

int zone_wait_for_cred = 0;

static void
zone_hold_locked(zone_t *z)
{
	ASSERT(MUTEX_HELD(&z->zone_lock));
	z->zone_ref++;
	ASSERT(z->zone_ref != 0);
}

void
zone_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	zone_hold_locked(z);
	mutex_exit(&z->zone_lock);
}

/*
 * If the non-cred ref count drops to 1 and either the cred ref count
 * is 0 or we aren't waiting for cred references, the zone is ready to
 * be destroyed.
 */
#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
	(!zone_wait_for_cred || (zone)->zone_cred_ref == 0))

void
zone_rele(zone_t *z)
{
	boolean_t wakeup;

	mutex_enter(&z->zone_lock);
	ASSERT(z->zone_ref != 0);
	z->zone_ref--;
	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
		/* no more refs, free the structure */
		mutex_exit(&z->zone_lock);
		zone_free(z);
		return;
	}
	/* signal zone_destroy so the zone can finish halting */
	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
	mutex_exit(&z->zone_lock);

	if (wakeup) {
		/*
		 * Grabbing zonehash_lock here effectively synchronizes with
		 * zone_destroy() to avoid missed signals.
		 */
		mutex_enter(&zonehash_lock);
		cv_broadcast(&zone_destroy_cv);
		mutex_exit(&zonehash_lock);
	}
}

void
zone_cred_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	z->zone_cred_ref++;
	ASSERT(z->zone_cred_ref != 0);
	mutex_exit(&z->zone_lock);
}

void
zone_cred_rele(zone_t *z)
{
	boolean_t wakeup;

	mutex_enter(&z->zone_lock);
	ASSERT(z->zone_cred_ref != 0);
	z->zone_cred_ref--;
	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
		/* no more refs, free the structure */
		mutex_exit(&z->zone_lock);
		zone_free(z);
		return;
	}
	/*
	 * If zone_destroy is waiting for the cred references to drain
	 * out, and they have, signal it.
	 */
	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
	    zone_status_get(z) >= ZONE_IS_DEAD);
	mutex_exit(&z->zone_lock);

	if (wakeup) {
		/*
		 * Grabbing zonehash_lock here effectively synchronizes with
		 * zone_destroy() to avoid missed signals.
		 */
		mutex_enter(&zonehash_lock);
		cv_broadcast(&zone_destroy_cv);
		mutex_exit(&zonehash_lock);
	}
}

void
zone_task_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	z->zone_ntasks++;
	ASSERT(z->zone_ntasks != 0);
	mutex_exit(&z->zone_lock);
}

void
zone_task_rele(zone_t *zone)
{
	uint_t refcnt;

	mutex_enter(&zone->zone_lock);
	ASSERT(zone->zone_ntasks != 0);
	refcnt = --zone->zone_ntasks;
	if (refcnt > 1) {	/* Common case */
		mutex_exit(&zone->zone_lock);
		return;
	}
	zone_hold_locked(zone);	/* so we can use the zone_t later */
	mutex_exit(&zone->zone_lock);
	if (refcnt == 1) {
		/*
		 * See if the zone is shutting down.
		 */
		mutex_enter(&zone_status_lock);
		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
			goto out;
		}

		/*
		 * Make sure the ntasks didn't change since we
		 * dropped zone_lock.
		 */
		mutex_enter(&zone->zone_lock);
		if (refcnt != zone->zone_ntasks) {
			mutex_exit(&zone->zone_lock);
			goto out;
		}
		mutex_exit(&zone->zone_lock);

		/*
		 * No more user processes in the zone.  The zone is empty.
		 */
		zone_status_set(zone, ZONE_IS_EMPTY);
		goto out;
	}

	ASSERT(refcnt == 0);
	/*
	 * zsched has exited; the zone is dead.
	 */
	zone->zone_zsched = NULL;	/* paranoia */
	mutex_enter(&zone_status_lock);
	zone_status_set(zone, ZONE_IS_DEAD);
out:
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
}

zoneid_t
getzoneid(void)
{
	return (curproc->p_zone->zone_id);
}

/*
 * Internal versions of zone_find_by_*().  These don't zone_hold() or
 * check the validity of a zone's state.
 */
static zone_t *
zone_find_all_by_id(zoneid_t zoneid)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	if (mod_hash_find(zonehashbyid,
	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

static zone_t *
zone_find_all_by_name(char *name)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

/*
 * Public interface for looking up a zone by zoneid.  Only returns the zone if
 * it is fully initialized, and has not yet begun the zone_destroy() sequence.
 * Caller must call zone_rele() once it is done with the zone.
 *
 * The zone may begin the zone_destroy() sequence immediately after this
 * function returns, but may be safely used until zone_rele() is called.
 */
zone_t *
zone_find_by_id(zoneid_t zoneid)
{
	zone_t *zone;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	status = zone_status_get(zone);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * For all practical purposes the zone doesn't exist.
		 */
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}

/*
 * Similar to zone_find_by_id, but using zone name as the key.
 */
zone_t *
zone_find_by_name(char *name)
{
	zone_t *zone;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_name(name)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	status = zone_status_get(zone);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * For all practical purposes the zone doesn't exist.
		 */
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}

/*
 * Similar to zone_find_by_id(), using the path as a key.  For instance,
 * if there is a zone "foo" rooted at /foo/root, and the path argument
 * is "/foo/root/proc", it will return the held zone_t corresponding to
 * zone "foo".
 *
 * zone_find_by_path() always returns a non-NULL value, since at the
 * very least every path will be contained in the global zone.
 *
 * As with the other zone_find_by_*() functions, the caller is
 * responsible for zone_rele()ing the return value of this function.
 */
zone_t *
zone_find_by_path(const char *path)
{
	zone_t *zone;
	zone_t *zret = NULL;
	zone_status_t status;

	if (path == NULL) {
		/*
		 * Call from rootconf().
		 */
		zone_hold(global_zone);
		return (global_zone);
	}
	ASSERT(*path == '/');
	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		if (ZONE_PATH_VISIBLE(path, zone))
			zret = zone;
	}
	ASSERT(zret != NULL);
	status = zone_status_get(zret);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * Zone practically doesn't exist.
		 */
		zret = global_zone;
	}
	zone_hold(zret);
	mutex_exit(&zonehash_lock);
	return (zret);
}

/*
 * Get the number of cpus visible to this zone.  The system-wide global
 * 'ncpus' is returned if pools are disabled, the caller is in the
 * global zone, or a NULL zone argument is passed in.
 */
int
zone_ncpus_get(zone_t *zone)
{
	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;

	return (myncpus != 0 ? myncpus : ncpus);
}

/*
 * Get the number of online cpus visible to this zone.  The system-wide
 * global 'ncpus_online' is returned if pools are disabled, the caller
 * is in the global zone, or a NULL zone argument is passed in.
 */
int
zone_ncpus_online_get(zone_t *zone)
{
	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;

	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
}

/*
 * Return the pool to which the zone is currently bound.
 */
pool_t *
zone_pool_get(zone_t *zone)
{
	ASSERT(pool_lock_held());

	return (zone->zone_pool);
}

/*
 * Set the zone's pool pointer and update the zone's visibility to match
 * the resources in the new pool.
 */
void
zone_pool_set(zone_t *zone, pool_t *pool)
{
	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	zone->zone_pool = pool;
	zone_pset_set(zone, pool->pool_pset->pset_id);
}

/*
 * Return the cached value of the id of the processor set to which the
 * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
 * facility is disabled.
 */
psetid_t
zone_pset_get(zone_t *zone)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	return (zone->zone_psetid);
}

/*
 * Set the cached value of the id of the processor set to which the zone
 * is currently bound.  Also update the zone's visibility to match the
 * resources in the new processor set.
 */
void
zone_pset_set(zone_t *zone, psetid_t newpsetid)
{
	psetid_t oldpsetid;

	ASSERT(MUTEX_HELD(&cpu_lock));
	oldpsetid = zone_pset_get(zone);

	if (oldpsetid == newpsetid)
		return;
	/*
	 * Global zone sees all.
	 */
	if (zone != global_zone) {
		zone->zone_psetid = newpsetid;
		if (newpsetid != ZONE_PS_INVAL)
			pool_pset_visibility_add(newpsetid, zone);
		if (oldpsetid != ZONE_PS_INVAL)
			pool_pset_visibility_remove(oldpsetid, zone);
	}
	/*
	 * Disabling pools, so we should start using the global values
	 * for ncpus and ncpus_online.
	 */
	if (newpsetid == ZONE_PS_INVAL) {
		zone->zone_ncpus = 0;
		zone->zone_ncpus_online = 0;
	}
}

/*
 * Walk the list of active zones and issue the provided callback for
 * each of them.
 *
 * Caller must not be holding any locks that may be acquired under
 * zonehash_lock.  See comment at the beginning of the file for a list of
 * common locks and their interactions with zones.
 */
int
zone_walk(int (*cb)(zone_t *, void *), void *data)
{
	zone_t *zone;
	int ret = 0;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		/*
		 * Skip zones that shouldn't be externally visible.
		 */
		status = zone_status_get(zone);
		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
			continue;
		/*
		 * Bail immediately if any callback invocation returns a
		 * non-zero value.
		 */
		ret = (*cb)(zone, data);
		if (ret != 0)
			break;
	}
	mutex_exit(&zonehash_lock);
	return (ret);
}

static int
zone_set_root(zone_t *zone, const char *upath)
{
	vnode_t *vp;
	int trycount;
	int error = 0;
	char *path;
	struct pathname upn, pn;
	size_t pathlen;

	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
		return (error);

	pn_alloc(&pn);

	/* prevent infinite loop */
	trycount = 10;
	for (;;) {
		if (--trycount <= 0) {
			error = ESTALE;
			goto out;
		}

		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
			/*
			 * VOP_ACCESS() may cover 'vp' with a new
			 * filesystem, if 'vp' is an autoFS vnode.
			 * Get the new 'vp' if so.
			 */
			if ((error = VOP_ACCESS(vp, VEXEC, 0, CRED())) == 0 &&
			    (vp->v_vfsmountedhere == NULL ||
			    (error = traverse(&vp)) == 0)) {
				pathlen = pn.pn_pathlen + 2;
				path = kmem_alloc(pathlen, KM_SLEEP);
				(void) strncpy(path, pn.pn_path,
				    pn.pn_pathlen + 1);
				path[pathlen - 2] = '/';
				path[pathlen - 1] = '\0';
				pn_free(&pn);
				pn_free(&upn);

				/* Success! */
				break;
			}
			VN_RELE(vp);
		}
		if (error != ESTALE)
			goto out;
	}

	ASSERT(error == 0);
	zone->zone_rootvp = vp;		/* we hold a reference to vp */
	zone->zone_rootpath = path;
	zone->zone_rootpathlen = pathlen;
	return (0);

out:
	pn_free(&pn);
	pn_free(&upn);
	return (error);
}

#define	isalnum(c)	(((c) >= '0' && (c) <= '9') || \
			((c) >= 'a' && (c) <= 'z') || \
			((c) >= 'A' && (c) <= 'Z'))

static int
zone_set_name(zone_t *zone, const char *uname)
{
	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
	size_t len;
	int i, err;

	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
		kmem_free(kname, ZONENAME_MAX);
		return (err);	/* EFAULT or ENAMETOOLONG */
	}

	/* must be less than ZONENAME_MAX */
	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
		kmem_free(kname, ZONENAME_MAX);
		return (EINVAL);
	}

	/*
	 * Name must start with an alphanumeric and must contain only
	 * alphanumerics, '-', '_' and '.'.
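	 * For example, "zone1", "web-frontend" and "db.prod_2" would be
	 * accepted, while "-zone" or a name containing spaces would be
	 * rejected with EINVAL.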
	 */
	if (!isalnum(kname[0])) {
		kmem_free(kname, ZONENAME_MAX);
		return (EINVAL);
	}
	for (i = 1; i < len - 1; i++) {
		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
		    kname[i] != '.') {
			kmem_free(kname, ZONENAME_MAX);
			return (EINVAL);
		}
	}

	zone->zone_name = kname;
	return (0);
}

/*
 * Similar to thread_create(), but makes sure the thread is in the appropriate
 * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
 */
/*ARGSUSED*/
kthread_t *
zthread_create(
    caddr_t stk,
    size_t stksize,
    void (*proc)(),
    void *arg,
    size_t len,
    pri_t pri)
{
	kthread_t *t;
	zone_t *zone = curproc->p_zone;
	proc_t *pp = zone->zone_zsched;

	zone_hold(zone);	/* Reference to be dropped when thread exits */

	/*
	 * No one should be trying to create threads if the zone is shutting
	 * down and there aren't any kernel threads around.  See comment
	 * in zthread_exit().
	 */
	ASSERT(!(zone->zone_kthreads == NULL &&
	    zone_status_get(zone) >= ZONE_IS_EMPTY));
	/*
	 * Create a thread, but don't let it run until we've finished setting
	 * things up.
	 */
	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
	ASSERT(t->t_forw == NULL);
	mutex_enter(&zone_status_lock);
	if (zone->zone_kthreads == NULL) {
		t->t_forw = t->t_back = t;
	} else {
		kthread_t *tx = zone->zone_kthreads;

		t->t_forw = tx;
		t->t_back = tx->t_back;
		tx->t_back->t_forw = t;
		tx->t_back = t;
	}
	zone->zone_kthreads = t;
	mutex_exit(&zone_status_lock);

	mutex_enter(&pp->p_lock);
	t->t_proc_flag |= TP_ZTHREAD;
	project_rele(t->t_proj);
	t->t_proj = project_hold(pp->p_task->tk_proj);

	/*
	 * Setup complete, let it run.
	 */
	thread_lock(t);
	t->t_schedflag |= TS_ALLSTART;
	setrun_locked(t);
	thread_unlock(t);

	mutex_exit(&pp->p_lock);

	return (t);
}

/*
 * Similar to thread_exit().  Must be called by threads created via
 * zthread_create().
 */
void
zthread_exit(void)
{
	kthread_t *t = curthread;
	proc_t *pp = curproc;
	zone_t *zone = pp->p_zone;

	mutex_enter(&zone_status_lock);

	/*
	 * Reparent to p0
	 */
	kpreempt_disable();
	mutex_enter(&pp->p_lock);
	t->t_proc_flag &= ~TP_ZTHREAD;
	t->t_procp = &p0;
	hat_thread_exit(t);
	mutex_exit(&pp->p_lock);
	kpreempt_enable();

	if (t->t_back == t) {
		ASSERT(t->t_forw == t);
		/*
		 * If the zone is empty, once the thread count
		 * goes to zero no further kernel threads can be
		 * created.  This is because if the creator is a process
		 * in the zone, then it must have exited before the zone
		 * state could be set to ZONE_IS_EMPTY.
		 * Otherwise, if the creator is a kernel thread in the
		 * zone, the thread count is non-zero.
		 *
		 * This really means that non-zone kernel threads should
		 * not create zone kernel threads.
1920 */ 1921 zone->zone_kthreads = NULL; 1922 if (zone_status_get(zone) == ZONE_IS_EMPTY) { 1923 zone_status_set(zone, ZONE_IS_DOWN); 1924 } 1925 } else { 1926 t->t_forw->t_back = t->t_back; 1927 t->t_back->t_forw = t->t_forw; 1928 if (zone->zone_kthreads == t) 1929 zone->zone_kthreads = t->t_forw; 1930 } 1931 mutex_exit(&zone_status_lock); 1932 zone_rele(zone); 1933 thread_exit(); 1934 /* NOTREACHED */ 1935 } 1936 1937 static void 1938 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp) 1939 { 1940 vnode_t *oldvp; 1941 1942 /* we're going to hold a reference here to the directory */ 1943 VN_HOLD(vp); 1944 1945 #ifdef C2_AUDIT 1946 if (audit_active) /* update abs cwd/root path see c2audit.c */ 1947 audit_chdirec(vp, vpp); 1948 #endif 1949 1950 mutex_enter(&pp->p_lock); 1951 oldvp = *vpp; 1952 *vpp = vp; 1953 mutex_exit(&pp->p_lock); 1954 if (oldvp != NULL) 1955 VN_RELE(oldvp); 1956 } 1957 1958 /* 1959 * Convert an rctl value represented by an nvlist_t into an rctl_val_t. 1960 */ 1961 static int 1962 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv) 1963 { 1964 nvpair_t *nvp = NULL; 1965 boolean_t priv_set = B_FALSE; 1966 boolean_t limit_set = B_FALSE; 1967 boolean_t action_set = B_FALSE; 1968 1969 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 1970 const char *name; 1971 uint64_t ui64; 1972 1973 name = nvpair_name(nvp); 1974 if (nvpair_type(nvp) != DATA_TYPE_UINT64) 1975 return (EINVAL); 1976 (void) nvpair_value_uint64(nvp, &ui64); 1977 if (strcmp(name, "privilege") == 0) { 1978 /* 1979 * Currently only privileged values are allowed, but 1980 * this may change in the future. 1981 */ 1982 if (ui64 != RCPRIV_PRIVILEGED) 1983 return (EINVAL); 1984 rv->rcv_privilege = ui64; 1985 priv_set = B_TRUE; 1986 } else if (strcmp(name, "limit") == 0) { 1987 rv->rcv_value = ui64; 1988 limit_set = B_TRUE; 1989 } else if (strcmp(name, "action") == 0) { 1990 if (ui64 != RCTL_LOCAL_NOACTION && 1991 ui64 != RCTL_LOCAL_DENY) 1992 return (EINVAL); 1993 rv->rcv_flagaction = ui64; 1994 action_set = B_TRUE; 1995 } else { 1996 return (EINVAL); 1997 } 1998 } 1999 2000 if (!(priv_set && limit_set && action_set)) 2001 return (EINVAL); 2002 rv->rcv_action_signal = 0; 2003 rv->rcv_action_recipient = NULL; 2004 rv->rcv_action_recip_pid = -1; 2005 rv->rcv_firing_time = 0; 2006 2007 return (0); 2008 } 2009 2010 void 2011 zone_icode(void) 2012 { 2013 proc_t *p = ttoproc(curthread); 2014 struct core_globals *cg; 2015 2016 /* 2017 * For all purposes (ZONE_ATTR_INITPID and restart_init), 2018 * storing just the pid of init is sufficient. 
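 *
 * (Consumer-side sketch: the pid stored here is what a later
 * zone_getattr(zoneid, ZONE_ATTR_INITPID, &pid, sizeof (pid)) call
 * hands back to userland.)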
2019 */ 2020 p->p_zone->zone_proc_initpid = p->p_pid; 2021 2022 /* 2023 * Allocate user address space and stack segment 2024 */ 2025 2026 p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0; 2027 p->p_usrstack = (caddr_t)USRSTACK32; 2028 p->p_model = DATAMODEL_ILP32; 2029 p->p_stkprot = PROT_ZFOD & ~PROT_EXEC; 2030 p->p_datprot = PROT_ZFOD & ~PROT_EXEC; 2031 p->p_stk_ctl = INT32_MAX; 2032 2033 p->p_as = as_alloc(); 2034 p->p_as->a_userlimit = (caddr_t)USERLIMIT32; 2035 (void) hat_setup(p->p_as->a_hat, HAT_INIT); 2036 2037 cg = zone_getspecific(core_zone_key, p->p_zone); 2038 ASSERT(cg != NULL); 2039 corectl_path_hold(cg->core_default_path); 2040 corectl_content_hold(cg->core_default_content); 2041 p->p_corefile = cg->core_default_path; 2042 p->p_content = cg->core_default_content; 2043 2044 init_mstate(curthread, LMS_SYSTEM); 2045 2046 p->p_zone->zone_boot_err = exec_init(zone_initname, 0, 2047 p->p_zone->zone_bootargs); 2048 2049 mutex_enter(&zone_status_lock); 2050 if (p->p_zone->zone_boot_err != 0) { 2051 /* 2052 * Make sure we are still in the booting state-- we could have 2053 * raced and already be shutting down, or even further along. 2054 */ 2055 if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING) 2056 zone_status_set(p->p_zone, ZONE_IS_SHUTTING_DOWN); 2057 mutex_exit(&zone_status_lock); 2058 /* It's gone bad, dispose of the process */ 2059 if (proc_exit(CLD_EXITED, p->p_zone->zone_boot_err) != 0) { 2060 mutex_enter(&p->p_lock); 2061 ASSERT(p->p_flag & SEXITLWPS); 2062 lwp_exit(); 2063 } 2064 } else { 2065 if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING) 2066 zone_status_set(p->p_zone, ZONE_IS_RUNNING); 2067 mutex_exit(&zone_status_lock); 2068 /* cause the process to return to userland. */ 2069 lwp_rtt(); 2070 } 2071 } 2072 2073 struct zsched_arg { 2074 zone_t *zone; 2075 nvlist_t *nvlist; 2076 }; 2077 2078 /* 2079 * Per-zone "sched" workalike. The similarity to "sched" doesn't have 2080 * anything to do with scheduling, but rather with the fact that 2081 * per-zone kernel threads are parented to zsched, just like regular 2082 * kernel threads are parented to sched (p0). 2083 * 2084 * zsched is also responsible for launching init for the zone. 2085 */ 2086 static void 2087 zsched(void *arg) 2088 { 2089 struct zsched_arg *za = arg; 2090 proc_t *pp = curproc; 2091 proc_t *initp = proc_init; 2092 zone_t *zone = za->zone; 2093 cred_t *cr, *oldcred; 2094 rctl_set_t *set; 2095 rctl_alloc_gp_t *gp; 2096 contract_t *ct = NULL; 2097 task_t *tk, *oldtk; 2098 rctl_entity_p_t e; 2099 kproject_t *pj; 2100 2101 nvlist_t *nvl = za->nvlist; 2102 nvpair_t *nvp = NULL; 2103 2104 bcopy("zsched", u.u_psargs, sizeof ("zsched")); 2105 bcopy("zsched", u.u_comm, sizeof ("zsched")); 2106 u.u_argc = 0; 2107 u.u_argv = NULL; 2108 u.u_envp = NULL; 2109 closeall(P_FINFO(pp)); 2110 2111 /* 2112 * We are this zone's "zsched" process. As the zone isn't generally 2113 * visible yet we don't need to grab any locks before initializing its 2114 * zone_proc pointer. 2115 */ 2116 zone_hold(zone); /* this hold is released by zone_destroy() */ 2117 zone->zone_zsched = pp; 2118 mutex_enter(&pp->p_lock); 2119 pp->p_zone = zone; 2120 mutex_exit(&pp->p_lock); 2121 2122 /* 2123 * Disassociate process from its 'parent'; parent ourselves to init 2124 * (pid 1) and change other values as needed. 
2125 */ 2126 sess_create(); 2127 2128 mutex_enter(&pidlock); 2129 proc_detach(pp); 2130 pp->p_ppid = 1; 2131 pp->p_flag |= SZONETOP; 2132 pp->p_ancpid = 1; 2133 pp->p_parent = initp; 2134 pp->p_psibling = NULL; 2135 if (initp->p_child) 2136 initp->p_child->p_psibling = pp; 2137 pp->p_sibling = initp->p_child; 2138 initp->p_child = pp; 2139 2140 /* Decrement what newproc() incremented. */ 2141 upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID); 2142 /* 2143 * Our credentials are about to become kcred-like, so we don't care 2144 * about the caller's ruid. 2145 */ 2146 upcount_inc(crgetruid(kcred), zone->zone_id); 2147 mutex_exit(&pidlock); 2148 2149 /* 2150 * getting out of global zone, so decrement lwp counts 2151 */ 2152 pj = pp->p_task->tk_proj; 2153 mutex_enter(&global_zone->zone_nlwps_lock); 2154 pj->kpj_nlwps -= pp->p_lwpcnt; 2155 global_zone->zone_nlwps -= pp->p_lwpcnt; 2156 mutex_exit(&global_zone->zone_nlwps_lock); 2157 2158 /* 2159 * Create and join a new task in project '0' of this zone. 2160 * 2161 * We don't need to call holdlwps() since we know we're the only lwp in 2162 * this process. 2163 * 2164 * task_join() returns with p_lock held. 2165 */ 2166 tk = task_create(0, zone); 2167 mutex_enter(&cpu_lock); 2168 oldtk = task_join(tk, 0); 2169 mutex_exit(&curproc->p_lock); 2170 mutex_exit(&cpu_lock); 2171 task_rele(oldtk); 2172 2173 /* 2174 * add lwp counts to zsched's zone, and increment project's task count 2175 * due to the task created by the task_create()/task_join() calls above 2176 */ 2177 pj = pp->p_task->tk_proj; 2178 mutex_enter(&zone->zone_nlwps_lock); 2179 pj->kpj_nlwps += pp->p_lwpcnt; 2180 pj->kpj_ntasks += 1; 2181 zone->zone_nlwps += pp->p_lwpcnt; 2182 mutex_exit(&zone->zone_nlwps_lock); 2183 2184 /* 2185 * The process was created by a process in the global zone, hence the 2186 * credentials are wrong. We might as well have kcred-ish credentials. 2187 */ 2188 cr = zone->zone_kcred; 2189 crhold(cr); 2190 mutex_enter(&pp->p_crlock); 2191 oldcred = pp->p_cred; 2192 pp->p_cred = cr; 2193 mutex_exit(&pp->p_crlock); 2194 crfree(oldcred); 2195 2196 /* 2197 * Hold credentials again (for thread) 2198 */ 2199 crhold(cr); 2200 2201 /* 2202 * p_lwpcnt can't change since this is a kernel process. 2203 */ 2204 crset(pp, cr); 2205 2206 /* 2207 * Chroot 2208 */ 2209 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp); 2210 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp); 2211 2212 /* 2213 * Initialize zone's rctl set. 2214 */ 2215 set = rctl_set_create(); 2216 gp = rctl_set_init_prealloc(RCENTITY_ZONE); 2217 mutex_enter(&pp->p_lock); 2218 e.rcep_p.zone = zone; 2219 e.rcep_t = RCENTITY_ZONE; 2220 zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp); 2221 mutex_exit(&pp->p_lock); 2222 rctl_prealloc_destroy(gp); 2223 2224 /* 2225 * Apply the rctls passed in to zone_create(). This is basically a list 2226 * assignment: all of the old values are removed and the new ones 2227 * inserted. That is, if an empty list is passed in, all values are 2228 * removed.
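 *
 * For example, if the incoming list carries a single entry for
 * "zone.cpu-shares", every pre-existing privileged value of that rctl
 * is deleted before the new value is inserted; only system values
 * survive the reset.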
2229 */ 2230 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 2231 rctl_dict_entry_t *rde; 2232 rctl_hndl_t hndl; 2233 char *name; 2234 nvlist_t **nvlarray; 2235 uint_t i, nelem; 2236 int error; /* For ASSERT()s */ 2237 2238 name = nvpair_name(nvp); 2239 hndl = rctl_hndl_lookup(name); 2240 ASSERT(hndl != -1); 2241 rde = rctl_dict_lookup_hndl(hndl); 2242 ASSERT(rde != NULL); 2243 2244 for (; /* ever */; ) { 2245 rctl_val_t oval; 2246 2247 mutex_enter(&pp->p_lock); 2248 error = rctl_local_get(hndl, NULL, &oval, pp); 2249 mutex_exit(&pp->p_lock); 2250 ASSERT(error == 0); /* Can't fail for RCTL_FIRST */ 2251 ASSERT(oval.rcv_privilege != RCPRIV_BASIC); 2252 if (oval.rcv_privilege == RCPRIV_SYSTEM) 2253 break; 2254 mutex_enter(&pp->p_lock); 2255 error = rctl_local_delete(hndl, &oval, pp); 2256 mutex_exit(&pp->p_lock); 2257 ASSERT(error == 0); 2258 } 2259 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 2260 ASSERT(error == 0); 2261 for (i = 0; i < nelem; i++) { 2262 rctl_val_t *nvalp; 2263 2264 nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); 2265 error = nvlist2rctlval(nvlarray[i], nvalp); 2266 ASSERT(error == 0); 2267 /* 2268 * rctl_local_insert can fail if the value being 2269 * inserted is a duplicate; this is OK. 2270 */ 2271 mutex_enter(&pp->p_lock); 2272 if (rctl_local_insert(hndl, nvalp, pp) != 0) 2273 kmem_cache_free(rctl_val_cache, nvalp); 2274 mutex_exit(&pp->p_lock); 2275 } 2276 } 2277 /* 2278 * Tell the world that we're done setting up. 2279 * 2280 * At this point we want to set the zone status to ZONE_IS_READY 2281 * and atomically set the zone's processor set visibility. Once 2282 * we drop pool_lock() this zone will automatically get updated 2283 * to reflect any future changes to the pools configuration. 2284 */ 2285 pool_lock(); 2286 mutex_enter(&cpu_lock); 2287 mutex_enter(&zonehash_lock); 2288 zone_uniqid(zone); 2289 zone_zsd_configure(zone); 2290 if (pool_state == POOL_ENABLED) 2291 zone_pset_set(zone, pool_default->pool_pset->pset_id); 2292 mutex_enter(&zone_status_lock); 2293 ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED); 2294 zone_status_set(zone, ZONE_IS_READY); 2295 mutex_exit(&zone_status_lock); 2296 mutex_exit(&zonehash_lock); 2297 mutex_exit(&cpu_lock); 2298 pool_unlock(); 2299 2300 /* 2301 * Once we see the zone transition to the ZONE_IS_BOOTING state, 2302 * we launch init, and set the state to running. 2303 */ 2304 zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched"); 2305 2306 if (zone_status_get(zone) == ZONE_IS_BOOTING) { 2307 id_t cid; 2308 2309 /* 2310 * Ok, this is a little complicated. We need to grab the 2311 * zone's pool's scheduling class ID; note that by now, we 2312 * are already bound to a pool if we need to be (zoneadmd 2313 * will have done that to us while we're in the READY 2314 * state). *But* the scheduling class for the zone's 'init' 2315 * must be explicitly passed to newproc, which doesn't 2316 * respect pool bindings. 2317 * 2318 * We hold the pool_lock across the call to newproc() to 2319 * close the obvious race: the pool's scheduling class 2320 * could change before we manage to create the LWP with 2321 * classid 'cid'. 2322 */ 2323 pool_lock(); 2324 cid = pool_get_class(zone->zone_pool); 2325 if (cid == -1) 2326 cid = defaultcid; 2327 2328 /* 2329 * If this fails, zone_boot will ultimately fail. The 2330 * state of the zone will be set to SHUTTING_DOWN-- userland 2331 * will have to tear down the zone, and fail, or try again. 
2332 */ 2333 if ((zone->zone_boot_err = newproc(zone_icode, NULL, cid, 2334 minclsyspri - 1, &ct)) != 0) { 2335 mutex_enter(&zone_status_lock); 2336 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); 2337 mutex_exit(&zone_status_lock); 2338 } 2339 pool_unlock(); 2340 } 2341 2342 /* 2343 * Wait for zone_destroy() to be called. This is what we spend 2344 * most of our life doing. 2345 */ 2346 zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched"); 2347 2348 if (ct) 2349 /* 2350 * At this point the process contract should be empty. 2351 * (Though if it isn't, it's not the end of the world.) 2352 */ 2353 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0); 2354 2355 /* 2356 * Allow kcred to be freed when all referring processes 2357 * (including this one) go away. We can't just do this in 2358 * zone_free because we need to wait for the zone_cred_ref to 2359 * drop to 0 before calling zone_free, and the existence of 2360 * zone_kcred will prevent that. Thus, we call crfree here to 2361 * balance the crdup in zone_create. The crhold calls earlier 2362 * in zsched will be dropped when the thread and process exit. 2363 */ 2364 crfree(zone->zone_kcred); 2365 zone->zone_kcred = NULL; 2366 2367 exit(CLD_EXITED, 0); 2368 } 2369 2370 /* 2371 * Helper function to determine if there are any submounts of the 2372 * provided path. Used to make sure the zone doesn't "inherit" any 2373 * mounts from before it is created. 2374 */ 2375 static uint_t 2376 zone_mount_count(const char *rootpath) 2377 { 2378 vfs_t *vfsp; 2379 uint_t count = 0; 2380 size_t rootpathlen = strlen(rootpath); 2381 2382 /* 2383 * Holding zonehash_lock prevents race conditions with 2384 * vfs_list_add()/vfs_list_remove() since we serialize with 2385 * zone_find_by_path(). 2386 */ 2387 ASSERT(MUTEX_HELD(&zonehash_lock)); 2388 /* 2389 * The rootpath must end with a '/' 2390 */ 2391 ASSERT(rootpath[rootpathlen - 1] == '/'); 2392 2393 /* 2394 * This intentionally does not count the rootpath itself if that 2395 * happens to be a mount point. 2396 */ 2397 vfs_list_read_lock(); 2398 vfsp = rootvfs; 2399 do { 2400 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt), 2401 rootpathlen) == 0) 2402 count++; 2403 vfsp = vfsp->vfs_next; 2404 } while (vfsp != rootvfs); 2405 vfs_list_unlock(); 2406 return (count); 2407 } 2408 2409 /* 2410 * Helper function to make sure that a zone created on 'rootpath' 2411 * wouldn't end up containing other zones' rootpaths. 2412 */ 2413 static boolean_t 2414 zone_is_nested(const char *rootpath) 2415 { 2416 zone_t *zone; 2417 size_t rootpathlen = strlen(rootpath); 2418 size_t len; 2419 2420 ASSERT(MUTEX_HELD(&zonehash_lock)); 2421 2422 for (zone = list_head(&zone_active); zone != NULL; 2423 zone = list_next(&zone_active, zone)) { 2424 if (zone == global_zone) 2425 continue; 2426 len = strlen(zone->zone_rootpath); 2427 if (strncmp(rootpath, zone->zone_rootpath, 2428 MIN(rootpathlen, len)) == 0) 2429 return (B_TRUE); 2430 } 2431 return (B_FALSE); 2432 } 2433 2434 static int 2435 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs, 2436 size_t zone_privssz) 2437 { 2438 priv_set_t *privs; 2439 2440 if (zone_privssz < sizeof (priv_set_t)) 2441 return (ENOMEM); 2442 2443 privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP); 2444 if (copyin(zone_privs, privs, sizeof (priv_set_t))) { 2445 kmem_free(privs, sizeof (priv_set_t)); 2446 return (EFAULT); 2447 } 2448 zone->zone_privset = privs; 2449 return (0); 2450 } 2451 2452 /* 2453 * We make creative use of nvlists to pass in rctls from userland.
The list is 2454 * a list of the following structures: 2455 * 2456 * (name = rctl_name, value = nvpair_list_array) 2457 * 2458 * Where each element of the nvpair_list_array is of the form: 2459 * 2460 * [(name = "privilege", value = RCPRIV_PRIVILEGED), 2461 * (name = "limit", value = uint64_t), 2462 * (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))] 2463 */ 2464 static int 2465 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) 2466 { 2467 nvpair_t *nvp = NULL; 2468 nvlist_t *nvl = NULL; 2469 char *kbuf; 2470 int error; 2471 rctl_val_t rv; 2472 2473 *nvlp = NULL; 2474 2475 if (buflen == 0) 2476 return (0); 2477 2478 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 2479 return (ENOMEM); 2480 if (copyin(ubuf, kbuf, buflen)) { 2481 error = EFAULT; 2482 goto out; 2483 } 2484 if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) { 2485 /* 2486 * nvl may have been allocated/free'd, but the value set to 2487 * non-NULL, so we reset it here. 2488 */ 2489 nvl = NULL; 2490 error = EINVAL; 2491 goto out; 2492 } 2493 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 2494 rctl_dict_entry_t *rde; 2495 rctl_hndl_t hndl; 2496 nvlist_t **nvlarray; 2497 uint_t i, nelem; 2498 char *name; 2499 2500 error = EINVAL; 2501 name = nvpair_name(nvp); 2502 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) 2503 != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { 2504 goto out; 2505 } 2506 if ((hndl = rctl_hndl_lookup(name)) == -1) { 2507 goto out; 2508 } 2509 rde = rctl_dict_lookup_hndl(hndl); 2510 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 2511 ASSERT(error == 0); 2512 for (i = 0; i < nelem; i++) { 2513 if (error = nvlist2rctlval(nvlarray[i], &rv)) 2514 goto out; 2515 } 2516 if (rctl_invalid_value(rde, &rv)) { 2517 error = EINVAL; 2518 goto out; 2519 } 2520 } 2521 error = 0; 2522 *nvlp = nvl; 2523 out: 2524 kmem_free(kbuf, buflen); 2525 if (error && nvl != NULL) 2526 nvlist_free(nvl); 2527 return (error); 2528 } 2529 2530 int 2531 zone_create_error(int er_error, int er_ext, int *er_out) { 2532 if (er_out != NULL) { 2533 if (copyout(&er_ext, er_out, sizeof (int))) { 2534 return (set_errno(EFAULT)); 2535 } 2536 } 2537 return (set_errno(er_error)); 2538 } 2539 2540 /* 2541 * Parses a comma-separated list of ZFS datasets into a per-zone dictionary. 2542 */ 2543 static int 2544 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen) 2545 { 2546 char *kbuf; 2547 char *dataset, *next; 2548 zone_dataset_t *zd; 2549 size_t len; 2550 2551 if (ubuf == NULL || buflen == 0) 2552 return (0); 2553 2554 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 2555 return (ENOMEM); 2556 2557 if (copyin(ubuf, kbuf, buflen) != 0) { 2558 kmem_free(kbuf, buflen); 2559 return (EFAULT); 2560 } 2561 2562 dataset = next = kbuf; 2563 for (;;) { 2564 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP); 2565 2566 next = strchr(dataset, ','); 2567 2568 if (next == NULL) 2569 len = strlen(dataset); 2570 else 2571 len = next - dataset; 2572 2573 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP); 2574 bcopy(dataset, zd->zd_dataset, len); 2575 zd->zd_dataset[len] = '\0'; 2576 2577 list_insert_head(&zone->zone_datasets, zd); 2578 2579 if (next == NULL) 2580 break; 2581 2582 dataset = next + 1; 2583 } 2584 2585 kmem_free(kbuf, buflen); 2586 return (0); 2587 } 2588 2589 /* 2590 * System call to create/initialize a new zone named 'zone_name', rooted 2591 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs', 2592 * and initialized with the zone-wide rctls described in 'rctlbuf'. 
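 *
 * As a hedged sketch (the rctl name and values are illustrative only),
 * a caller might assemble 'rctlbuf' with libnvpair roughly as follows:
 *
 *	nvlist_t *nvl, *val;
 *	char *buf = NULL;
 *	size_t buflen = 0;
 *
 *	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_alloc(&val, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_uint64(val, "privilege", RCPRIV_PRIVILEGED);
 *	(void) nvlist_add_uint64(val, "limit", 10);
 *	(void) nvlist_add_uint64(val, "action", RCTL_LOCAL_NOACTION);
 *	(void) nvlist_add_nvlist_array(nvl, "zone.cpu-shares", &val, 1);
 *	(void) nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_NATIVE, 0);
 *	... pass 'buf' and 'buflen' as 'rctlbuf'/'rctlbufsz' ...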
2593 * 2594 * If extended error is non-null, we may use it to return more detailed 2595 * error information. 2596 */ 2597 static zoneid_t 2598 zone_create(const char *zone_name, const char *zone_root, 2599 const priv_set_t *zone_privs, size_t zone_privssz, 2600 caddr_t rctlbuf, size_t rctlbufsz, 2601 caddr_t zfsbuf, size_t zfsbufsz, int *extended_error) 2602 { 2603 struct zsched_arg zarg; 2604 nvlist_t *rctls = NULL; 2605 proc_t *pp = curproc; 2606 zone_t *zone, *ztmp; 2607 zoneid_t zoneid; 2608 int error; 2609 int error2 = 0; 2610 char *str; 2611 cred_t *zkcr; 2612 2613 if (secpolicy_zone_config(CRED()) != 0) 2614 return (set_errno(EPERM)); 2615 2616 /* can't boot zone from within chroot environment */ 2617 if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir) 2618 return (zone_create_error(ENOTSUP, ZE_CHROOTED, 2619 extended_error)); 2620 2621 zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); 2622 zoneid = zone->zone_id = id_alloc(zoneid_space); 2623 zone->zone_status = ZONE_IS_UNINITIALIZED; 2624 zone->zone_pool = pool_default; 2625 zone->zone_pool_mod = gethrtime(); 2626 zone->zone_psetid = ZONE_PS_INVAL; 2627 zone->zone_ncpus = 0; 2628 zone->zone_ncpus_online = 0; 2629 mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); 2630 mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); 2631 cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); 2632 list_create(&zone->zone_zsd, sizeof (struct zsd_entry), 2633 offsetof(struct zsd_entry, zsd_linkage)); 2634 list_create(&zone->zone_datasets, sizeof (zone_dataset_t), 2635 offsetof(zone_dataset_t, zd_linkage)); 2636 2637 if ((error = zone_set_name(zone, zone_name)) != 0) { 2638 zone_free(zone); 2639 return (zone_create_error(error, 0, extended_error)); 2640 } 2641 2642 if ((error = zone_set_root(zone, zone_root)) != 0) { 2643 zone_free(zone); 2644 return (zone_create_error(error, 0, extended_error)); 2645 } 2646 if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) { 2647 zone_free(zone); 2648 return (zone_create_error(error, 0, extended_error)); 2649 } 2650 2651 /* initialize node name to be the same as zone name */ 2652 zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP); 2653 (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN); 2654 zone->zone_nodename[_SYS_NMLN - 1] = '\0'; 2655 2656 zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP); 2657 zone->zone_domain[0] = '\0'; 2658 zone->zone_shares = 1; 2659 zone->zone_bootargs = NULL; 2660 2661 /* 2662 * Zsched initializes the rctls. 2663 */ 2664 zone->zone_rctls = NULL; 2665 2666 if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { 2667 zone_free(zone); 2668 return (zone_create_error(error, 0, extended_error)); 2669 } 2670 2671 if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) { 2672 zone_free(zone); 2673 return (set_errno(error)); 2674 } 2675 2676 /* 2677 * Stop all lwps since that's what normally happens as part of fork(). 2678 * This needs to happen before we grab any locks to avoid deadlock 2679 * (another lwp in the process could be waiting for the held lock). 
2680 */ 2681 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) { 2682 zone_free(zone); 2683 if (rctls) 2684 nvlist_free(rctls); 2685 return (zone_create_error(EINTR, 0, extended_error)); 2686 } 2687 2688 if (block_mounts() == 0) { 2689 mutex_enter(&pp->p_lock); 2690 if (curthread != pp->p_agenttp) 2691 continuelwps(pp); 2692 mutex_exit(&pp->p_lock); 2693 zone_free(zone); 2694 if (rctls) 2695 nvlist_free(rctls); 2696 return (zone_create_error(EINTR, 0, extended_error)); 2697 } 2698 2699 /* 2700 * Set up credential for kernel access. After this, any errors 2701 * should go through the dance in errout rather than calling 2702 * zone_free directly. 2703 */ 2704 zone->zone_kcred = crdup(kcred); 2705 crsetzone(zone->zone_kcred, zone); 2706 priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred)); 2707 priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred)); 2708 priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred)); 2709 priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred)); 2710 2711 mutex_enter(&zonehash_lock); 2712 /* 2713 * Make sure zone doesn't already exist. 2714 */ 2715 if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL) { 2716 zone_status_t status; 2717 2718 status = zone_status_get(ztmp); 2719 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING) 2720 error = EEXIST; 2721 else 2722 error = EBUSY; 2723 goto errout; 2724 } 2725 2726 /* 2727 * Don't allow zone creations which would cause one zone's rootpath to 2728 * be accessible from that of another (non-global) zone. 2729 */ 2730 if (zone_is_nested(zone->zone_rootpath)) { 2731 error = EBUSY; 2732 goto errout; 2733 } 2734 2735 ASSERT(zonecount != 0); /* check for leaks */ 2736 if (zonecount + 1 > maxzones) { 2737 error = ENOMEM; 2738 goto errout; 2739 } 2740 2741 if (zone_mount_count(zone->zone_rootpath) != 0) { 2742 error = EBUSY; 2743 error2 = ZE_AREMOUNTS; 2744 goto errout; 2745 } 2746 2747 /* 2748 * Zone is still incomplete, but we need to drop all locks while 2749 * zsched() initializes this zone's kernel process. We 2750 * optimistically add the zone to the hashtable and associated 2751 * lists so a parallel zone_create() doesn't try to create the 2752 * same zone. 2753 */ 2754 zonecount++; 2755 (void) mod_hash_insert(zonehashbyid, 2756 (mod_hash_key_t)(uintptr_t)zone->zone_id, 2757 (mod_hash_val_t)(uintptr_t)zone); 2758 str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP); 2759 (void) strcpy(str, zone->zone_name); 2760 (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str, 2761 (mod_hash_val_t)(uintptr_t)zone); 2762 /* 2763 * Insert into active list. At this point there are no 'hold's 2764 * on the zone, but everyone else knows not to use it, so we can 2765 * continue to use it. zsched() will do a zone_hold() if the 2766 * newproc() is successful. 2767 */ 2768 list_insert_tail(&zone_active, zone); 2769 mutex_exit(&zonehash_lock); 2770 2771 zarg.zone = zone; 2772 zarg.nvlist = rctls; 2773 /* 2774 * The process, task, and project rctls are probably wrong; 2775 * we need an interface to get the default values of all rctls, 2776 * and initialize zsched appropriately. I'm not sure that that 2777 * makes much of a difference, though. 2778 */ 2779 if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) { 2780 /* 2781 * We need to undo all globally visible state.
2782 */ 2783 mutex_enter(&zonehash_lock); 2784 list_remove(&zone_active, zone); 2785 (void) mod_hash_destroy(zonehashbyname, 2786 (mod_hash_key_t)(uintptr_t)zone->zone_name); 2787 (void) mod_hash_destroy(zonehashbyid, 2788 (mod_hash_key_t)(uintptr_t)zone->zone_id); 2789 ASSERT(zonecount > 1); 2790 zonecount--; 2791 goto errout; 2792 } 2793 2794 /* 2795 * Zone creation can't fail from now on. 2796 */ 2797 2798 /* 2799 * Let the other lwps continue. 2800 */ 2801 mutex_enter(&pp->p_lock); 2802 if (curthread != pp->p_agenttp) 2803 continuelwps(pp); 2804 mutex_exit(&pp->p_lock); 2805 2806 /* 2807 * Wait for zsched to finish initializing the zone. 2808 */ 2809 zone_status_wait(zone, ZONE_IS_READY); 2810 /* 2811 * The zone is fully visible, so we can let mounts progress. 2812 */ 2813 resume_mounts(); 2814 if (rctls) 2815 nvlist_free(rctls); 2816 2817 return (zoneid); 2818 2819 errout: 2820 mutex_exit(&zonehash_lock); 2821 /* 2822 * Let the other lwps continue. 2823 */ 2824 mutex_enter(&pp->p_lock); 2825 if (curthread != pp->p_agenttp) 2826 continuelwps(pp); 2827 mutex_exit(&pp->p_lock); 2828 2829 resume_mounts(); 2830 if (rctls) 2831 nvlist_free(rctls); 2832 /* 2833 * There is currently one reference to the zone, a cred_ref from 2834 * zone_kcred. To free the zone, we call crfree, which will call 2835 * zone_cred_rele, which will call zone_free. 2836 */ 2837 ASSERT(zone->zone_cred_ref == 1); /* for zone_kcred */ 2838 ASSERT(zone->zone_kcred->cr_ref == 1); 2839 ASSERT(zone->zone_ref == 0); 2840 zkcr = zone->zone_kcred; 2841 zone->zone_kcred = NULL; 2842 crfree(zkcr); /* triggers call to zone_free */ 2843 return (zone_create_error(error, error2, extended_error)); 2844 } 2845 2846 /* 2847 * Cause the zone to boot. This is pretty simple, since we let zoneadmd do 2848 * the heavy lifting. 2849 */ 2850 static int 2851 zone_boot(zoneid_t zoneid, const char *bootargs) 2852 { 2853 int err; 2854 zone_t *zone; 2855 2856 if (secpolicy_zone_config(CRED()) != 0) 2857 return (set_errno(EPERM)); 2858 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 2859 return (set_errno(EINVAL)); 2860 2861 mutex_enter(&zonehash_lock); 2862 /* 2863 * Look for zone under hash lock to prevent races with calls to 2864 * zone_shutdown, zone_destroy, etc. 2865 */ 2866 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 2867 mutex_exit(&zonehash_lock); 2868 return (set_errno(EINVAL)); 2869 } 2870 2871 if ((err = zone_set_bootargs(zone, bootargs)) != 0) { 2872 mutex_exit(&zonehash_lock); 2873 return (set_errno(err)); 2874 } 2875 2876 mutex_enter(&zone_status_lock); 2877 if (zone_status_get(zone) != ZONE_IS_READY) { 2878 mutex_exit(&zone_status_lock); 2879 mutex_exit(&zonehash_lock); 2880 return (set_errno(EINVAL)); 2881 } 2882 zone_status_set(zone, ZONE_IS_BOOTING); 2883 mutex_exit(&zone_status_lock); 2884 2885 zone_hold(zone); /* so we can use the zone_t later */ 2886 mutex_exit(&zonehash_lock); 2887 2888 if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) { 2889 zone_rele(zone); 2890 return (set_errno(EINTR)); 2891 } 2892 2893 /* 2894 * Boot (starting init) might have failed, in which case the zone 2895 * will go to the SHUTTING_DOWN state; an appropriate errno will 2896 * be placed in zone->zone_boot_err, and so we return that. 2897 */ 2898 err = zone->zone_boot_err; 2899 zone_rele(zone); 2900 return (err ? set_errno(err) : 0); 2901 } 2902 2903 /* 2904 * Kills all user processes in the zone, waiting for them all to exit 2905 * before returning. 
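 *
 * The kill is retried: processes that linger (or that are created
 * while the zone is being emptied) are signalled again roughly once a
 * second until the transition to ZONE_IS_EMPTY is observed.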
2906 */ 2907 static int 2908 zone_empty(zone_t *zone) 2909 { 2910 int waitstatus; 2911 2912 /* 2913 * We need to drop zonehash_lock before killing all 2914 * processes, otherwise we'll deadlock with zone_find_* 2915 * which can be called from the exit path. 2916 */ 2917 ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); 2918 while ((waitstatus = zone_status_timedwait_sig(zone, lbolt + hz, 2919 ZONE_IS_EMPTY)) == -1) { 2920 killall(zone->zone_id); 2921 } 2922 /* 2923 * return EINTR if we were signaled 2924 */ 2925 if (waitstatus == 0) 2926 return (EINTR); 2927 return (0); 2928 } 2929 2930 /* 2931 * Systemcall to start the zone's halt sequence. By the time this 2932 * function successfully returns, all user processes and kernel threads 2933 * executing in it will have exited, ZSD shutdown callbacks executed, 2934 * and the zone status set to ZONE_IS_DOWN. 2935 * 2936 * It is possible that the call will interrupt itself if the caller is the 2937 * parent of any process running in the zone, and doesn't have SIGCHLD blocked. 2938 */ 2939 static int 2940 zone_shutdown(zoneid_t zoneid) 2941 { 2942 int error; 2943 zone_t *zone; 2944 zone_status_t status; 2945 2946 if (secpolicy_zone_config(CRED()) != 0) 2947 return (set_errno(EPERM)); 2948 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 2949 return (set_errno(EINVAL)); 2950 2951 /* 2952 * Block mounts so that VFS_MOUNT() can get an accurate view of 2953 * the zone's status with regard to ZONE_IS_SHUTTING_DOWN. 2954 * 2955 * e.g. NFS can fail the mount if it determines that the zone 2956 * has already begun the shutdown sequence. 2957 */ 2958 if (block_mounts() == 0) 2959 return (set_errno(EINTR)); 2960 mutex_enter(&zonehash_lock); 2961 /* 2962 * Look for zone under hash lock to prevent races with other 2963 * calls to zone_shutdown and zone_destroy. 2964 */ 2965 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 2966 mutex_exit(&zonehash_lock); 2967 resume_mounts(); 2968 return (set_errno(EINVAL)); 2969 } 2970 mutex_enter(&zone_status_lock); 2971 status = zone_status_get(zone); 2972 /* 2973 * Fail if the zone isn't fully initialized yet. 2974 */ 2975 if (status < ZONE_IS_READY) { 2976 mutex_exit(&zone_status_lock); 2977 mutex_exit(&zonehash_lock); 2978 resume_mounts(); 2979 return (set_errno(EINVAL)); 2980 } 2981 /* 2982 * If conditions required for zone_shutdown() to return have been met, 2983 * return success. 2984 */ 2985 if (status >= ZONE_IS_DOWN) { 2986 mutex_exit(&zone_status_lock); 2987 mutex_exit(&zonehash_lock); 2988 resume_mounts(); 2989 return (0); 2990 } 2991 /* 2992 * If zone_shutdown() hasn't been called before, go through the motions. 2993 * If it has, there's nothing to do but wait for the kernel threads to 2994 * drain. 2995 */ 2996 if (status < ZONE_IS_EMPTY) { 2997 uint_t ntasks; 2998 2999 mutex_enter(&zone->zone_lock); 3000 if ((ntasks = zone->zone_ntasks) != 1) { 3001 /* 3002 * There's still stuff running. 3003 */ 3004 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); 3005 } 3006 mutex_exit(&zone->zone_lock); 3007 if (ntasks == 1) { 3008 /* 3009 * The only way to create another task is through 3010 * zone_enter(), which will block until we drop 3011 * zonehash_lock. The zone is empty.
3012 */ 3013 if (zone->zone_kthreads == NULL) { 3014 /* 3015 * Skip ahead to ZONE_IS_DOWN 3016 */ 3017 zone_status_set(zone, ZONE_IS_DOWN); 3018 } else { 3019 zone_status_set(zone, ZONE_IS_EMPTY); 3020 } 3021 } 3022 } 3023 zone_hold(zone); /* so we can use the zone_t later */ 3024 mutex_exit(&zone_status_lock); 3025 mutex_exit(&zonehash_lock); 3026 resume_mounts(); 3027 3028 if (error = zone_empty(zone)) { 3029 zone_rele(zone); 3030 return (set_errno(error)); 3031 } 3032 /* 3033 * After the zone status goes to ZONE_IS_DOWN this zone will no 3034 * longer be notified of changes to the pools configuration, so 3035 * in order to not end up with a stale pool pointer, we point 3036 * ourselves at the default pool and remove all resource 3037 * visibility. This is especially important as the zone_t may 3038 * languish on the deathrow for a very long time waiting for 3039 * creds to drain out. 3040 * 3041 * This rebinding of the zone can happen multiple times 3042 * (presumably due to interrupted or parallel system calls) 3043 * without any adverse effects. 3044 */ 3045 if (pool_lock_intr() != 0) { 3046 zone_rele(zone); 3047 return (set_errno(EINTR)); 3048 } 3049 if (pool_state == POOL_ENABLED) { 3050 mutex_enter(&cpu_lock); 3051 zone_pool_set(zone, pool_default); 3052 /* 3053 * The zone no longer needs to be able to see any cpus. 3054 */ 3055 zone_pset_set(zone, ZONE_PS_INVAL); 3056 mutex_exit(&cpu_lock); 3057 } 3058 pool_unlock(); 3059 3060 /* 3061 * ZSD shutdown callbacks can be executed multiple times, hence 3062 * it is safe to not be holding any locks across this call. 3063 */ 3064 zone_zsd_callbacks(zone, ZSD_SHUTDOWN); 3065 3066 mutex_enter(&zone_status_lock); 3067 if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN) 3068 zone_status_set(zone, ZONE_IS_DOWN); 3069 mutex_exit(&zone_status_lock); 3070 3071 /* 3072 * Wait for kernel threads to drain. 3073 */ 3074 if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) { 3075 zone_rele(zone); 3076 return (set_errno(EINTR)); 3077 } 3078 zone_rele(zone); 3079 return (0); 3080 } 3081 3082 /* 3083 * Systemcall entry point to finalize the zone halt process. The caller 3084 * must have already successfully called zone_shutdown(). 3085 * 3086 * Upon successful completion, the zone will have been fully destroyed: 3087 * zsched will have exited, destructor callbacks executed, and the zone 3088 * removed from the list of active zones. 3089 */ 3090 static int 3091 zone_destroy(zoneid_t zoneid) 3092 { 3093 uint64_t uniqid; 3094 zone_t *zone; 3095 zone_status_t status; 3096 3097 if (secpolicy_zone_config(CRED()) != 0) 3098 return (set_errno(EPERM)); 3099 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 3100 return (set_errno(EINVAL)); 3101 3102 mutex_enter(&zonehash_lock); 3103 /* 3104 * Look for zone under hash lock to prevent races with other 3105 * calls to zone_destroy.
3106 */ 3107 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 3108 mutex_exit(&zonehash_lock); 3109 return (set_errno(EINVAL)); 3110 } 3111 3112 if (zone_mount_count(zone->zone_rootpath) != 0) { 3113 mutex_exit(&zonehash_lock); 3114 return (set_errno(EBUSY)); 3115 } 3116 mutex_enter(&zone_status_lock); 3117 status = zone_status_get(zone); 3118 if (status < ZONE_IS_DOWN) { 3119 mutex_exit(&zone_status_lock); 3120 mutex_exit(&zonehash_lock); 3121 return (set_errno(EBUSY)); 3122 } else if (status == ZONE_IS_DOWN) { 3123 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */ 3124 } 3125 mutex_exit(&zone_status_lock); 3126 zone_hold(zone); 3127 mutex_exit(&zonehash_lock); 3128 3129 /* 3130 * wait for zsched to exit 3131 */ 3132 zone_status_wait(zone, ZONE_IS_DEAD); 3133 zone_zsd_callbacks(zone, ZSD_DESTROY); 3134 uniqid = zone->zone_uniqid; 3135 zone_rele(zone); 3136 zone = NULL; /* potentially free'd */ 3137 3138 mutex_enter(&zonehash_lock); 3139 for (; /* ever */; ) { 3140 boolean_t unref; 3141 3142 if ((zone = zone_find_all_by_id(zoneid)) == NULL || 3143 zone->zone_uniqid != uniqid) { 3144 /* 3145 * The zone has gone away. Necessary conditions 3146 * are met, so we return success. 3147 */ 3148 mutex_exit(&zonehash_lock); 3149 return (0); 3150 } 3151 mutex_enter(&zone->zone_lock); 3152 unref = ZONE_IS_UNREF(zone); 3153 mutex_exit(&zone->zone_lock); 3154 if (unref) { 3155 /* 3156 * There is only one reference to the zone -- that 3157 * added when the zone was added to the hashtables -- 3158 * and things will remain this way until we drop 3159 * zonehash_lock... we can go ahead and cleanup the 3160 * zone. 3161 */ 3162 break; 3163 } 3164 3165 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) { 3166 /* Signaled */ 3167 mutex_exit(&zonehash_lock); 3168 return (set_errno(EINTR)); 3169 } 3170 3171 } 3172 3173 /* 3174 * It is now safe to let the zone be recreated; remove it from the 3175 * lists. The memory will not be freed until the last cred 3176 * reference goes away. 3177 */ 3178 ASSERT(zonecount > 1); /* must be > 1; can't destroy global zone */ 3179 zonecount--; 3180 /* remove from active list and hash tables */ 3181 list_remove(&zone_active, zone); 3182 (void) mod_hash_destroy(zonehashbyname, 3183 (mod_hash_key_t)zone->zone_name); 3184 (void) mod_hash_destroy(zonehashbyid, 3185 (mod_hash_key_t)(uintptr_t)zone->zone_id); 3186 mutex_exit(&zonehash_lock); 3187 3188 /* 3189 * Release the root vnode; we're not using it anymore. Nor should any 3190 * other thread that might access it exist. 3191 */ 3192 if (zone->zone_rootvp != NULL) { 3193 VN_RELE(zone->zone_rootvp); 3194 zone->zone_rootvp = NULL; 3195 } 3196 3197 /* add to deathrow list */ 3198 mutex_enter(&zone_deathrow_lock); 3199 list_insert_tail(&zone_deathrow, zone); 3200 mutex_exit(&zone_deathrow_lock); 3201 3202 /* 3203 * Drop last reference (which was added by zsched()), this will 3204 * free the zone unless there are outstanding cred references. 3205 */ 3206 zone_rele(zone); 3207 return (0); 3208 } 3209 3210 /* 3211 * Systemcall entry point for zone_getattr(2). 
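 *
 * A sketch of typical use from the global zone (buffer size
 * illustrative; error handling elided):
 *
 *	char name[ZONENAME_MAX];
 *	if (zone_getattr(zid, ZONE_ATTR_NAME, name, sizeof (name)) == -1)
 *		... inspect errno ...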
3212 */ 3213 static ssize_t 3214 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) 3215 { 3216 size_t size; 3217 int error = 0, err; 3218 zone_t *zone; 3219 char *zonepath; 3220 zone_status_t zone_status; 3221 pid_t initpid; 3222 boolean_t global = (curproc->p_zone == global_zone); 3223 3224 mutex_enter(&zonehash_lock); 3225 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 3226 mutex_exit(&zonehash_lock); 3227 return (set_errno(EINVAL)); 3228 } 3229 zone_status = zone_status_get(zone); 3230 if (zone_status < ZONE_IS_READY) { 3231 mutex_exit(&zonehash_lock); 3232 return (set_errno(EINVAL)); 3233 } 3234 zone_hold(zone); 3235 mutex_exit(&zonehash_lock); 3236 3237 /* 3238 * If not in the global zone, don't show information about other zones. 3239 */ 3240 if (!global && curproc->p_zone != zone) { 3241 zone_rele(zone); 3242 return (set_errno(EINVAL)); 3243 } 3244 3245 switch (attr) { 3246 case ZONE_ATTR_ROOT: 3247 if (global) { 3248 /* 3249 * Copy the path to trim the trailing "/" (except for 3250 * the global zone). 3251 */ 3252 if (zone != global_zone) 3253 size = zone->zone_rootpathlen - 1; 3254 else 3255 size = zone->zone_rootpathlen; 3256 zonepath = kmem_alloc(size, KM_SLEEP); 3257 bcopy(zone->zone_rootpath, zonepath, size); 3258 zonepath[size - 1] = '\0'; 3259 } else { 3260 /* 3261 * Caller is not in the global zone, just return 3262 * faked-up path for current zone. 3263 */ 3264 zonepath = "/"; 3265 size = 2; 3266 } 3267 if (bufsize > size) 3268 bufsize = size; 3269 if (buf != NULL) { 3270 err = copyoutstr(zonepath, buf, bufsize, NULL); 3271 if (err != 0 && err != ENAMETOOLONG) 3272 error = EFAULT; 3273 } 3274 if (global) 3275 kmem_free(zonepath, size); 3276 break; 3277 3278 case ZONE_ATTR_NAME: 3279 size = strlen(zone->zone_name) + 1; 3280 if (bufsize > size) 3281 bufsize = size; 3282 if (buf != NULL) { 3283 err = copyoutstr(zone->zone_name, buf, bufsize, NULL); 3284 if (err != 0 && err != ENAMETOOLONG) 3285 error = EFAULT; 3286 } 3287 break; 3288 3289 case ZONE_ATTR_STATUS: 3290 /* 3291 * Since we're not holding zonehash_lock, the zone status 3292 * may be anything; leave it up to userland to sort it out. 
3293 */ 3294 size = sizeof (zone_status); 3295 if (bufsize > size) 3296 bufsize = size; 3297 zone_status = zone_status_get(zone); 3298 if (buf != NULL && 3299 copyout(&zone_status, buf, bufsize) != 0) 3300 error = EFAULT; 3301 break; 3302 case ZONE_ATTR_PRIVSET: 3303 size = sizeof (priv_set_t); 3304 if (bufsize > size) 3305 bufsize = size; 3306 if (buf != NULL && 3307 copyout(zone->zone_privset, buf, bufsize) != 0) 3308 error = EFAULT; 3309 break; 3310 case ZONE_ATTR_UNIQID: 3311 size = sizeof (zone->zone_uniqid); 3312 if (bufsize > size) 3313 bufsize = size; 3314 if (buf != NULL && 3315 copyout(&zone->zone_uniqid, buf, bufsize) != 0) 3316 error = EFAULT; 3317 break; 3318 case ZONE_ATTR_POOLID: 3319 { 3320 pool_t *pool; 3321 poolid_t poolid; 3322 3323 if (pool_lock_intr() != 0) { 3324 error = EINTR; 3325 break; 3326 } 3327 pool = zone_pool_get(zone); 3328 poolid = pool->pool_id; 3329 pool_unlock(); 3330 size = sizeof (poolid); 3331 if (bufsize > size) 3332 bufsize = size; 3333 if (buf != NULL && copyout(&poolid, buf, size) != 0) 3334 error = EFAULT; 3335 } 3336 break; 3337 case ZONE_ATTR_INITPID: 3338 size = sizeof (initpid); 3339 if (bufsize > size) 3340 bufsize = size; 3341 initpid = zone->zone_proc_initpid; 3342 if (initpid == -1) { 3343 error = ESRCH; 3344 break; 3345 } 3346 if (buf != NULL && 3347 copyout(&initpid, buf, bufsize) != 0) 3348 error = EFAULT; 3349 break; 3350 default: 3351 error = EINVAL; 3352 } 3353 zone_rele(zone); 3354 3355 if (error) 3356 return (set_errno(error)); 3357 return ((ssize_t)size); 3358 } 3359 3360 /* 3361 * Return zero if the process has at least one vnode mapped in to its 3362 * address space which shouldn't be allowed to change zones. 3363 */ 3364 static int 3365 as_can_change_zones(void) 3366 { 3367 proc_t *pp = curproc; 3368 struct seg *seg; 3369 struct as *as = pp->p_as; 3370 vnode_t *vp; 3371 int allow = 1; 3372 3373 ASSERT(pp->p_as != &kas); 3374 AS_LOCK_ENTER(&as, &as->a_lock, RW_READER); 3375 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 3376 /* 3377 * if we can't get a backing vnode for this segment then skip 3378 * it. 3379 */ 3380 vp = NULL; 3381 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL) 3382 continue; 3383 if (!vn_can_change_zones(vp)) { /* bail on first match */ 3384 allow = 0; 3385 break; 3386 } 3387 } 3388 AS_LOCK_EXIT(&as, &as->a_lock); 3389 return (allow); 3390 } 3391 3392 /* 3393 * Systemcall entry point for zone_enter(). 3394 * 3395 * The current process is injected into said zone. In the process 3396 * it will change its project membership, privileges, rootdir/cwd, 3397 * zone-wide rctls, and pool association to match those of the zone. 3398 * 3399 * The first zone_enter() called while the zone is in the ZONE_IS_READY 3400 * state will transition it to ZONE_IS_RUNNING. Processes may only 3401 * enter a zone that is "ready" or "running". 3402 */ 3403 static int 3404 zone_enter(zoneid_t zoneid) 3405 { 3406 zone_t *zone; 3407 vnode_t *vp; 3408 proc_t *pp = curproc; 3409 contract_t *ct; 3410 cont_process_t *ctp; 3411 task_t *tk, *oldtk; 3412 kproject_t *zone_proj0; 3413 cred_t *cr, *newcr; 3414 pool_t *oldpool, *newpool; 3415 sess_t *sp; 3416 uid_t uid; 3417 zone_status_t status; 3418 int err = 0; 3419 rctl_entity_p_t e; 3420 3421 if (secpolicy_zone_config(CRED()) != 0) 3422 return (set_errno(EPERM)); 3423 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 3424 return (set_errno(EINVAL)); 3425 3426 /* 3427 * Stop all lwps so we don't need to hold a lock to look at 3428 * curproc->p_zone. 
This needs to happen before we grab any 3429 * locks to avoid deadlock (another lwp in the process could 3430 * be waiting for the held lock). 3431 */ 3432 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) 3433 return (set_errno(EINTR)); 3434 3435 /* 3436 * Make sure we're not changing zones with files open or mapped in 3437 * to our address space which shouldn't be changing zones. 3438 */ 3439 if (!files_can_change_zones()) { 3440 err = EBADF; 3441 goto out; 3442 } 3443 if (!as_can_change_zones()) { 3444 err = EFAULT; 3445 goto out; 3446 } 3447 3448 mutex_enter(&zonehash_lock); 3449 if (pp->p_zone != global_zone) { 3450 mutex_exit(&zonehash_lock); 3451 err = EINVAL; 3452 goto out; 3453 } 3454 3455 zone = zone_find_all_by_id(zoneid); 3456 if (zone == NULL) { 3457 mutex_exit(&zonehash_lock); 3458 err = EINVAL; 3459 goto out; 3460 } 3461 3462 /* 3463 * To prevent processes in a zone from holding contracts on 3464 * extrazonal resources, and to avoid process contract 3465 * memberships which span zones, contract holders and processes 3466 * which aren't the sole members of their encapsulating process 3467 * contracts are not allowed to zone_enter. 3468 */ 3469 ctp = pp->p_ct_process; 3470 ct = &ctp->conp_contract; 3471 mutex_enter(&ct->ct_lock); 3472 mutex_enter(&pp->p_lock); 3473 if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) { 3474 mutex_exit(&pp->p_lock); 3475 mutex_exit(&ct->ct_lock); 3476 mutex_exit(&zonehash_lock); 3477 3478 err = EINVAL; 3479 goto out; 3480 } 3481 3482 /* 3483 * Moreover, we don't allow processes whose encapsulating 3484 * process contracts have inherited extrazonal contracts. 3485 * While it would be easier to eliminate all process contracts 3486 * with inherited contracts, we need to be able to give a 3487 * restarted init (or other zone-penetrating process) its 3488 * predecessor's contracts. 3489 */ 3490 if (ctp->conp_ninherited != 0) { 3491 contract_t *next; 3492 for (next = list_head(&ctp->conp_inherited); next; 3493 next = list_next(&ctp->conp_inherited, next)) { 3494 if (contract_getzuniqid(next) != zone->zone_uniqid) { 3495 mutex_exit(&pp->p_lock); 3496 mutex_exit(&ct->ct_lock); 3497 mutex_exit(&zonehash_lock); 3498 3499 err = EINVAL; 3500 goto out; 3501 } 3502 } 3503 } 3504 mutex_exit(&pp->p_lock); 3505 mutex_exit(&ct->ct_lock); 3506 3507 status = zone_status_get(zone); 3508 if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) { 3509 /* 3510 * Can't join 3511 */ 3512 mutex_exit(&zonehash_lock); 3513 err = EINVAL; 3514 goto out; 3515 } 3516 3517 /* 3518 * Make sure new priv set is within the permitted set for caller 3519 */ 3520 if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) { 3521 mutex_exit(&zonehash_lock); 3522 err = EPERM; 3523 goto out; 3524 } 3525 /* 3526 * We want to momentarily drop zonehash_lock while we optimistically 3527 * bind curproc to the pool it should be running in. This is safe 3528 * since the zone can't disappear (we have a hold on it). 3529 */ 3530 zone_hold(zone); 3531 mutex_exit(&zonehash_lock); 3532 3533 /* 3534 * Grab pool_lock to keep the pools configuration from changing 3535 * and to stop ourselves from getting rebound to another pool 3536 * until we join the zone. 3537 */ 3538 if (pool_lock_intr() != 0) { 3539 zone_rele(zone); 3540 err = EINTR; 3541 goto out; 3542 } 3543 ASSERT(secpolicy_pool(CRED()) == 0); 3544 /* 3545 * Bind ourselves to the pool currently associated with the zone.
3546 */ 3547 oldpool = curproc->p_pool; 3548 newpool = zone_pool_get(zone); 3549 if (pool_state == POOL_ENABLED && newpool != oldpool && 3550 (err = pool_do_bind(newpool, P_PID, P_MYID, 3551 POOL_BIND_ALL)) != 0) { 3552 pool_unlock(); 3553 zone_rele(zone); 3554 goto out; 3555 } 3556 3557 /* 3558 * Grab cpu_lock now; we'll need it later when we call 3559 * task_join(). 3560 */ 3561 mutex_enter(&cpu_lock); 3562 mutex_enter(&zonehash_lock); 3563 /* 3564 * Make sure the zone hasn't moved on since we dropped zonehash_lock. 3565 */ 3566 if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) { 3567 /* 3568 * Can't join anymore. 3569 */ 3570 mutex_exit(&zonehash_lock); 3571 mutex_exit(&cpu_lock); 3572 if (pool_state == POOL_ENABLED && 3573 newpool != oldpool) 3574 (void) pool_do_bind(oldpool, P_PID, P_MYID, 3575 POOL_BIND_ALL); 3576 pool_unlock(); 3577 zone_rele(zone); 3578 err = EINVAL; 3579 goto out; 3580 } 3581 3582 mutex_enter(&pp->p_lock); 3583 zone_proj0 = zone->zone_zsched->p_task->tk_proj; 3584 /* verify that we do not exceed any task or lwp limits */ 3585 mutex_enter(&zone->zone_nlwps_lock); 3586 /* add new lwps to zone and zone's proj0 */ 3587 zone_proj0->kpj_nlwps += pp->p_lwpcnt; 3588 zone->zone_nlwps += pp->p_lwpcnt; 3589 /* add 1 task to zone's proj0 */ 3590 zone_proj0->kpj_ntasks += 1; 3591 mutex_exit(&pp->p_lock); 3592 mutex_exit(&zone->zone_nlwps_lock); 3593 3594 /* remove lwps from proc's old zone and old project */ 3595 mutex_enter(&pp->p_zone->zone_nlwps_lock); 3596 pp->p_zone->zone_nlwps -= pp->p_lwpcnt; 3597 pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt; 3598 mutex_exit(&pp->p_zone->zone_nlwps_lock); 3599 3600 /* 3601 * Joining the zone cannot fail from now on. 3602 * 3603 * This means that a lot of the following code can be commonized and 3604 * shared with zsched(). 3605 */ 3606 3607 /* 3608 * Reset the encapsulating process contract's zone. 3609 */ 3610 ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID); 3611 contract_setzuniqid(ct, zone->zone_uniqid); 3612 3613 /* 3614 * Create a new task and associate the process with the project keyed 3615 * by (projid,zoneid). 3616 * 3617 * We might as well be in project 0; the global zone's projid doesn't 3618 * make much sense in a zone anyhow. 3619 * 3620 * This also increments zone_ntasks, and returns with p_lock held. 3621 */ 3622 tk = task_create(0, zone); 3623 oldtk = task_join(tk, 0); 3624 mutex_exit(&cpu_lock); 3625 3626 pp->p_flag |= SZONETOP; 3627 pp->p_zone = zone; 3628 3629 /* 3630 * call RCTLOP_SET functions on this proc 3631 */ 3632 e.rcep_p.zone = zone; 3633 e.rcep_t = RCENTITY_ZONE; 3634 (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL, 3635 RCD_CALLBACK); 3636 mutex_exit(&pp->p_lock); 3637 3638 /* 3639 * We don't need to hold any of zsched's locks here; not only do we know 3640 * the process and zone aren't going away, we know its session isn't 3641 * changing either. 3642 * 3643 * By joining zsched's session here, we mimic the behavior in the 3644 * global zone of init's sid being the pid of sched. We extend this 3645 * to all zlogin-like zone_enter()'ing processes as well. 3646 */ 3647 mutex_enter(&pidlock); 3648 sp = zone->zone_zsched->p_sessp; 3649 SESS_HOLD(sp); 3650 mutex_enter(&pp->p_lock); 3651 pgexit(pp); 3652 SESS_RELE(pp->p_sessp); 3653 pp->p_sessp = sp; 3654 pgjoin(pp, zone->zone_zsched->p_pidp); 3655 mutex_exit(&pp->p_lock); 3656 mutex_exit(&pidlock); 3657 3658 mutex_exit(&zonehash_lock); 3659 /* 3660 * We're firmly in the zone; let pools progress.
3661 */ 3662 pool_unlock(); 3663 task_rele(oldtk); 3664 /* 3665 * We don't need to retain a hold on the zone since we already 3666 * incremented zone_ntasks, so the zone isn't going anywhere. 3667 */ 3668 zone_rele(zone); 3669 3670 /* 3671 * Chroot 3672 */ 3673 vp = zone->zone_rootvp; 3674 zone_chdir(vp, &PTOU(pp)->u_cdir, pp); 3675 zone_chdir(vp, &PTOU(pp)->u_rdir, pp); 3676 3677 /* 3678 * Change process credentials 3679 */ 3680 newcr = cralloc(); 3681 mutex_enter(&pp->p_crlock); 3682 cr = pp->p_cred; 3683 crcopy_to(cr, newcr); 3684 crsetzone(newcr, zone); 3685 pp->p_cred = newcr; 3686 3687 /* 3688 * Restrict all process privilege sets to zone limit 3689 */ 3690 priv_intersect(zone->zone_privset, &CR_PPRIV(newcr)); 3691 priv_intersect(zone->zone_privset, &CR_EPRIV(newcr)); 3692 priv_intersect(zone->zone_privset, &CR_IPRIV(newcr)); 3693 priv_intersect(zone->zone_privset, &CR_LPRIV(newcr)); 3694 mutex_exit(&pp->p_crlock); 3695 crset(pp, newcr); 3696 3697 /* 3698 * Adjust upcount to reflect zone entry. 3699 */ 3700 uid = crgetruid(newcr); 3701 mutex_enter(&pidlock); 3702 upcount_dec(uid, GLOBAL_ZONEID); 3703 upcount_inc(uid, zoneid); 3704 mutex_exit(&pidlock); 3705 3706 /* 3707 * Set up core file path and content. 3708 */ 3709 set_core_defaults(); 3710 3711 out: 3712 /* 3713 * Let the other lwps continue. 3714 */ 3715 mutex_enter(&pp->p_lock); 3716 if (curthread != pp->p_agenttp) 3717 continuelwps(pp); 3718 mutex_exit(&pp->p_lock); 3719 3720 return (err != 0 ? set_errno(err) : 0); 3721 } 3722 3723 /* 3724 * Systemcall entry point for zone_list(2). 3725 * 3726 * Processes running in a (non-global) zone only see themselves. 3727 */ 3728 static int 3729 zone_list(zoneid_t *zoneidlist, uint_t *numzones) 3730 { 3731 zoneid_t *zoneids; 3732 zone_t *zone; 3733 uint_t user_nzones, real_nzones; 3734 int error = 0; 3735 uint_t i; 3736 3737 if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0) 3738 return (set_errno(EFAULT)); 3739 3740 if (curproc->p_zone != global_zone) { 3741 /* just return current zone */ 3742 real_nzones = 1; 3743 zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP); 3744 zoneids[0] = curproc->p_zone->zone_id; 3745 } else { 3746 mutex_enter(&zonehash_lock); 3747 real_nzones = zonecount; 3748 if (real_nzones) { 3749 zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t), 3750 KM_SLEEP); 3751 i = 0; 3752 for (zone = list_head(&zone_active); zone != NULL; 3753 zone = list_next(&zone_active, zone)) 3754 zoneids[i++] = zone->zone_id; 3755 ASSERT(i == real_nzones); 3756 } 3757 mutex_exit(&zonehash_lock); 3758 } 3759 3760 if (user_nzones > real_nzones) 3761 user_nzones = real_nzones; 3762 3763 if (copyout(&real_nzones, numzones, sizeof (uint_t)) != 0) 3764 error = EFAULT; 3765 else if (zoneidlist != NULL && user_nzones != 0) { 3766 if (copyout(zoneids, zoneidlist, 3767 user_nzones * sizeof (zoneid_t)) != 0) 3768 error = EFAULT; 3769 } 3770 3771 if (real_nzones) 3772 kmem_free(zoneids, real_nzones * sizeof (zoneid_t)); 3773 3774 if (error) 3775 return (set_errno(error)); 3776 else 3777 return (0); 3778 } 3779 3780 /* 3781 * Systemcall entry point for zone_lookup(2). 3782 * 3783 * Non-global zones are only able to see themselves. 
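 *
 * For example, looking up "web01" from the global zone returns that
 * zone's id once it is at least ZONE_IS_READY; the same call from any
 * other zone fails with EINVAL, since a non-global zone may only look
 * up its own name.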
3784 */ 3785 static zoneid_t 3786 zone_lookup(const char *zone_name) 3787 { 3788 char *kname; 3789 zone_t *zone; 3790 zoneid_t zoneid; 3791 int err; 3792 3793 if (zone_name == NULL) { 3794 /* return caller's zone id */ 3795 return (getzoneid()); 3796 } 3797 3798 kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP); 3799 if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) { 3800 kmem_free(kname, ZONENAME_MAX); 3801 return (set_errno(err)); 3802 } 3803 3804 mutex_enter(&zonehash_lock); 3805 zone = zone_find_all_by_name(kname); 3806 kmem_free(kname, ZONENAME_MAX); 3807 if (zone == NULL || zone_status_get(zone) < ZONE_IS_READY || 3808 (curproc->p_zone != global_zone && curproc->p_zone != zone)) { 3809 /* in non-global zone, can only lookup own name */ 3810 mutex_exit(&zonehash_lock); 3811 return (set_errno(EINVAL)); 3812 } 3813 zoneid = zone->zone_id; 3814 mutex_exit(&zonehash_lock); 3815 return (zoneid); 3816 } 3817 3818 static int 3819 zone_version(int *version_arg) 3820 { 3821 int version = ZONE_SYSCALL_API_VERSION; 3822 3823 if (copyout(&version, version_arg, sizeof (int)) != 0) 3824 return (set_errno(EFAULT)); 3825 return (0); 3826 } 3827 3828 /* ARGSUSED */ 3829 long 3830 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) 3831 { 3832 zone_def zs; 3833 3834 switch (cmd) { 3835 case ZONE_CREATE: 3836 if (get_udatamodel() == DATAMODEL_NATIVE) { 3837 if (copyin(arg1, &zs, sizeof (zone_def))) { 3838 return (set_errno(EFAULT)); 3839 } 3840 } else { 3841 #ifdef _SYSCALL32_IMPL 3842 zone_def32 zs32; 3843 3844 if (copyin(arg1, &zs32, sizeof (zone_def32))) { 3845 return (set_errno(EFAULT)); 3846 } 3847 zs.zone_name = 3848 (const char *)(unsigned long)zs32.zone_name; 3849 zs.zone_root = 3850 (const char *)(unsigned long)zs32.zone_root; 3851 zs.zone_privs = 3852 (const struct priv_set *) 3853 (unsigned long)zs32.zone_privs; 3854 zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf; 3855 zs.rctlbufsz = zs32.rctlbufsz; 3856 zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf; 3857 zs.zfsbufsz = zs32.zfsbufsz; 3858 zs.extended_error = 3859 (int *)(unsigned long)zs32.extended_error; 3860 #else 3861 panic("get_udatamodel() returned bogus result\n"); 3862 #endif 3863 } 3864 3865 return (zone_create(zs.zone_name, zs.zone_root, 3866 zs.zone_privs, zs.zone_privssz, 3867 (caddr_t)zs.rctlbuf, zs.rctlbufsz, 3868 (caddr_t)zs.zfsbuf, zs.zfsbufsz, 3869 zs.extended_error)); 3870 case ZONE_BOOT: 3871 return (zone_boot((zoneid_t)(uintptr_t)arg1, 3872 (const char *)arg2)); 3873 case ZONE_DESTROY: 3874 return (zone_destroy((zoneid_t)(uintptr_t)arg1)); 3875 case ZONE_GETATTR: 3876 return (zone_getattr((zoneid_t)(uintptr_t)arg1, 3877 (int)(uintptr_t)arg2, arg3, (size_t)arg4)); 3878 case ZONE_ENTER: 3879 return (zone_enter((zoneid_t)(uintptr_t)arg1)); 3880 case ZONE_LIST: 3881 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2)); 3882 case ZONE_SHUTDOWN: 3883 return (zone_shutdown((zoneid_t)(uintptr_t)arg1)); 3884 case ZONE_LOOKUP: 3885 return (zone_lookup((const char *)arg1)); 3886 case ZONE_VERSION: 3887 return (zone_version((int *)arg1)); 3888 default: 3889 return (set_errno(EINVAL)); 3890 } 3891 } 3892 3893 struct zarg { 3894 zone_t *zone; 3895 zone_cmd_arg_t arg; 3896 }; 3897 3898 static int 3899 zone_lookup_door(const char *zone_name, door_handle_t *doorp) 3900 { 3901 char *buf; 3902 size_t buflen; 3903 int error; 3904 3905 buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name); 3906 buf = kmem_alloc(buflen, KM_SLEEP); 3907 (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name); 3908 error = 
static int
zone_lookup_door(const char *zone_name, door_handle_t *doorp)
{
	char *buf;
	size_t buflen;
	int error;

	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
	buf = kmem_alloc(buflen, KM_SLEEP);
	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
	error = door_ki_open(buf, doorp);
	kmem_free(buf, buflen);
	return (error);
}

static void
zone_release_door(door_handle_t *doorp)
{
	door_ki_rele(*doorp);
	*doorp = NULL;
}

static void
zone_ki_call_zoneadmd(struct zarg *zargp)
{
	door_handle_t door = NULL;
	door_arg_t darg, save_arg;
	char *zone_name;
	size_t zone_namelen;
	zoneid_t zoneid;
	zone_t *zone;
	zone_cmd_arg_t arg;
	uint64_t uniqid;
	size_t size;
	int error;
	int retry;

	zone = zargp->zone;
	arg = zargp->arg;
	kmem_free(zargp, sizeof (*zargp));

	zone_namelen = strlen(zone->zone_name) + 1;
	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
	bcopy(zone->zone_name, zone_name, zone_namelen);
	zoneid = zone->zone_id;
	uniqid = zone->zone_uniqid;
	/*
	 * zoneadmd may be down, but at least we can empty out the zone.
	 * We can ignore the return value of zone_empty() since we're called
	 * from a kernel thread and know we won't be delivered any signals.
	 */
	ASSERT(curproc == &p0);
	(void) zone_empty(zone);
	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
	zone_rele(zone);

	size = sizeof (arg);
	darg.rbuf = (char *)&arg;
	darg.data_ptr = (char *)&arg;
	darg.rsize = size;
	darg.data_size = size;
	darg.desc_ptr = NULL;
	darg.desc_num = 0;

	save_arg = darg;
	/*
	 * Since we're not holding a reference to the zone, any number of
	 * things can go wrong, including the zone disappearing before we get a
	 * chance to talk to zoneadmd.
	 */
	for (retry = 0; /* forever */; retry++) {
		if (door == NULL &&
		    (error = zone_lookup_door(zone_name, &door)) != 0) {
			goto next;
		}
		ASSERT(door != NULL);

		if ((error = door_ki_upcall(door, &darg)) == 0) {
			break;
		}
		switch (error) {
		case EINTR:
			/* FALLTHROUGH */
		case EAGAIN:	/* process may be forking */
			/*
			 * Back off for a bit
			 */
			break;
		case EBADF:
			zone_release_door(&door);
			if (zone_lookup_door(zone_name, &door) != 0) {
				/*
				 * zoneadmd may be dead, but it may come back
				 * to life later.
				 */
				break;
			}
			break;
		default:
			cmn_err(CE_WARN,
			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
			    error);
			goto out;
		}
next:
		/*
		 * If this isn't the same zone_t that we originally had in
		 * mind, then this is the same as if two kadmin requests come
		 * in at the same time: the first one wins.  This means we
		 * lose, so we bail.
		 */
		if ((zone = zone_find_by_id(zoneid)) == NULL) {
			/*
			 * Problem is solved.
			 */
			break;
		}
		if (zone->zone_uniqid != uniqid) {
			/*
			 * zoneid recycled
			 */
			zone_rele(zone);
			break;
		}
		/*
		 * We could zone_status_timedwait(), but there doesn't seem to
		 * be much point in doing that (plus, it would mean that
		 * zone_free() isn't called until this thread exits).
		 */
		zone_rele(zone);
		delay(hz);
		darg = save_arg;
	}
out:
	if (door != NULL) {
		zone_release_door(&door);
	}
	kmem_free(zone_name, zone_namelen);
	thread_exit();
}
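/*
 * Illustrative sketch (not compiled into the kernel): a user-level analogue
 * of the retry loop in zone_ki_call_zoneadmd() above, using door_call(3C)
 * in place of door_ki_upcall().  The error handling follows the same shape:
 * back off on EINTR/EAGAIN, re-open the door on EBADF (the server may have
 * restarted), and give up on anything else.  The path and door_arg_t setup
 * are the caller's responsibility; this is a pattern, not an API.
 */
#if 0
#include <door.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>

static int
door_call_retry(const char *path, door_arg_t *dargp, const door_arg_t *savep)
{
	int fd = -1;

	for (;;) {
		if (fd == -1 && (fd = open(path, O_RDONLY)) == -1)
			return (-1);
		if (door_call(fd, dargp) == 0) {
			(void) close(fd);
			return (0);
		}
		switch (errno) {
		case EINTR:
		case EAGAIN:
			(void) sleep(1);	/* server busy; back off */
			break;
		case EBADF:
			(void) close(fd);	/* stale door; re-open it */
			fd = -1;
			break;
		default:
			(void) close(fd);
			return (-1);
		}
		*dargp = *savep;	/* reset the arg, as save_arg above */
	}
}
#endif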
/*
 * Entry point for uadmin() to tell the zone to go away or reboot.  The
 * caller is a process in the zone to be modified.
 *
 * In order to shut down the zone, we will hand off control to zoneadmd
 * (running in the global zone) via a door.  We do a half-hearted job of
 * killing all processes in the zone, create a kernel thread to contact
 * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
 * a form of generation number used to let zoneadmd (as well as
 * zone_destroy()) know exactly which zone they're talking about.
 */
int
zone_uadmin(int cmd, int fcn, cred_t *credp)
{
	struct zarg *zargp;
	zone_cmd_t zcmd;
	zone_t *zone;

	zone = curproc->p_zone;
	ASSERT(getzoneid() != GLOBAL_ZONEID);

	switch (cmd) {
	case A_SHUTDOWN:
		switch (fcn) {
		case AD_HALT:
		case AD_POWEROFF:
			zcmd = Z_HALT;
			break;
		case AD_BOOT:
			zcmd = Z_REBOOT;
			break;
		case AD_IBOOT:
		case AD_SBOOT:
		case AD_SIBOOT:
		case AD_NOSYNC:
			return (ENOTSUP);
		default:
			return (EINVAL);
		}
		break;
	case A_REBOOT:
		zcmd = Z_REBOOT;
		break;
	case A_FTRACE:
	case A_REMOUNT:
	case A_FREEZE:
	case A_DUMP:
		return (ENOTSUP);
	default:
		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
		return (EINVAL);
	}

	if (secpolicy_zone_admin(credp, B_FALSE))
		return (EPERM);
	mutex_enter(&zone_status_lock);
	/*
	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
	 * is in the zone.
	 */
	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
		/*
		 * This zone is already on its way down.
		 */
		mutex_exit(&zone_status_lock);
		return (0);
	}
	/*
	 * Prevent future zone_enter()s
	 */
	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
	mutex_exit(&zone_status_lock);

	/*
	 * Kill everyone now and call zoneadmd later.
	 * zone_ki_call_zoneadmd() will do a more thorough job of this
	 * later.
	 */
	killall(zone->zone_id);
	/*
	 * Now, create the thread to contact zoneadmd and do the rest of the
	 * work.  This thread can't be created in our zone otherwise
	 * zone_destroy() would deadlock.
	 */
	zargp = kmem_alloc(sizeof (*zargp), KM_SLEEP);
	zargp->arg.cmd = zcmd;
	zargp->arg.uniqid = zone->zone_uniqid;
	(void) strcpy(zargp->arg.locale, "C");
	zone_hold(zargp->zone = zone);

	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
	    TS_RUN, minclsyspri);
	exit(CLD_EXITED, 0);

	return (EINVAL);
}

/*
 * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
 * status to ZONE_IS_SHUTTING_DOWN.
 */
void
zone_shutdown_global(void)
{
	ASSERT(curproc->p_zone == global_zone);

	mutex_enter(&zone_status_lock);
	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
	mutex_exit(&zone_status_lock);
}
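/*
 * Illustrative sketch (not compiled into the kernel): what reaches
 * zone_uadmin() above.  A sufficiently privileged process inside a
 * non-global zone calling uadmin(2) reboots or halts only its own zone;
 * the same call in the global zone takes down the machine.  Assumes the
 * standard userland <sys/uadmin.h> interface.
 */
#if 0
#include <sys/uadmin.h>
#include <stdio.h>

int
main(void)
{
	/*
	 * Maps to zcmd = Z_REBOOT in zone_uadmin(); requires the
	 * privilege checked by secpolicy_zone_admin().
	 */
	if (uadmin(A_SHUTDOWN, AD_BOOT, 0) == -1) {
		perror("uadmin");
		return (1);
	}
	return (0);	/* not reached; zone_uadmin() exits the caller */
}
#endif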
/*
 * Returns true if the named dataset is visible in the current zone.
 * The 'write' parameter is set to 1 if the dataset is also writable.
 */
int
zone_dataset_visible(const char *dataset, int *write)
{
	zone_dataset_t *zd;
	size_t len;
	zone_t *zone = curproc->p_zone;

	if (dataset[0] == '\0')
		return (0);

	/*
	 * Walk the list once, looking for datasets which match exactly, or
	 * specify a dataset underneath an exported dataset.  If found,
	 * return true and note that it is writable.
	 */
	for (zd = list_head(&zone->zone_datasets); zd != NULL;
	    zd = list_next(&zone->zone_datasets, zd)) {

		len = strlen(zd->zd_dataset);
		if (strlen(dataset) >= len &&
		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
		    (dataset[len] == '\0' || dataset[len] == '/' ||
		    dataset[len] == '@')) {
			if (write)
				*write = 1;
			return (1);
		}
	}

	/*
	 * Walk the list a second time, searching for datasets which are
	 * parents of exported datasets.  These should be visible, but
	 * read-only.
	 *
	 * Note that we also have to support forms such as 'pool/dataset/',
	 * with a trailing slash.
	 */
	for (zd = list_head(&zone->zone_datasets); zd != NULL;
	    zd = list_next(&zone->zone_datasets, zd)) {

		len = strlen(dataset);
		if (dataset[len - 1] == '/')
			len--;	/* Ignore trailing slash */
		if (len < strlen(zd->zd_dataset) &&
		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
		    zd->zd_dataset[len] == '/') {
			if (write)
				*write = 0;
			return (1);
		}
	}

	return (0);
}
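/*
 * Illustrative sketch (not compiled into the kernel): a self-contained
 * user-level model of the matching rules above, handy for reasoning about
 * them.  Given an exported dataset "tank/zone":
 *
 *	tank/zone	visible, writable	(exact match)
 *	tank/zone/fs	visible, writable	(descendant)
 *	tank/zone@snap	visible, writable	(snapshot)
 *	tank		visible, read-only	(ancestor)
 *	tank/zonex	not visible		('/' or '@' boundary required)
 */
#if 0
#include <string.h>

static int
dataset_visible_model(const char *exported, const char *dataset, int *write)
{
	size_t elen = strlen(exported);
	size_t dlen = strlen(dataset);

	if (dlen == 0)
		return (0);
	if (dataset[dlen - 1] == '/')
		dlen--;			/* ignore trailing slash */

	/* Exact match, descendant, or snapshot: visible and writable. */
	if (dlen >= elen && strncmp(dataset, exported, elen) == 0 &&
	    (dataset[elen] == '\0' || dataset[elen] == '/' ||
	    dataset[elen] == '@')) {
		if (write)
			*write = 1;
		return (1);
	}
	/* Ancestor of an exported dataset: visible but read-only. */
	if (dlen < elen && strncmp(dataset, exported, dlen) == 0 &&
	    exported[dlen] == '/') {
		if (write)
			*write = 0;
		return (1);
	}
	return (0);
}
#endif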