/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Zones
 *
 * A zone is a named collection of processes, namespace constraints,
 * and other system resources which comprise a secure and manageable
 * application containment facility.
 *
 * Zones (represented by the reference counted zone_t) are tracked in
 * the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 * (zoneid_t) are used to track zone association.  Zone IDs are
 * dynamically generated when the zone is created; if a persistent
 * identifier is needed (core files, accounting logs, audit trail,
 * etc.), the zone name should be used.
 *
 *
 * Global Zone:
 *
 * The global zone (zoneid 0) is automatically associated with all
 * system resources that have not been bound to a user-created zone.
 * This means that even systems where zones are not in active use
 * have a global zone, and all processes, mounts, etc. are
 * associated with that zone.  The global zone is generally
 * unconstrained in terms of privileges and access, though the usual
 * credential and privilege based restrictions apply.
 *
 *
 * Zone States:
 *
 * The states a zone may be in, and the transitions between them, are as
 * follows:
 *
 * ZONE_IS_UNINITIALIZED: primordial state for a zone.  The partially
 * initialized zone is added to the list of active zones on the system but
 * isn't accessible.
 *
 * ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 * ready.  The zone is made visible after the ZSD constructor callbacks are
 * executed.  A zone remains in this state until it transitions into
 * the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 * ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 * init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 * state.
 *
 * ZONE_IS_RUNNING: The zone is open for business: zsched has
 * successfully started init.  A zone remains in this state until
 * zone_shutdown() is called.
 *
 * ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
 * killing all processes running in the zone.  The zone remains
 * in this state until there are no more user processes running in the zone.
 * zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 * Since zone_shutdown() is restartable, it may be called successfully
 * multiple times for the same zone_t.  Setting of the zone's state to
 * ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
 * the zone's status without worrying about it being a moving target.
 *
 * ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 * are no more user processes in the zone.  The zone remains in this
 * state until there are no more kernel threads associated with the
 * zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 * fail.
 *
 * ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 * have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 * join the zone or create kernel threads therein.
 *
 * ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 * remains in this state until zsched exits.  Calls to zone_find_by_*()
 * return NULL from now on.
 *
 * ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 * processes or threads doing work on behalf of the zone.  The zone is
 * removed from the list of active zones.  zone_destroy() returns, and
 * the zone can be recreated.
 *
 * ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 * callbacks are executed, and all memory associated with the zone is
 * freed.
 *
 * Threads can wait for the zone to enter a requested state by using
 * zone_status_wait() or zone_status_timedwait() with the desired
 * state passed in as an argument.  Zone state transitions are
 * uni-directional; it is not possible to move back to an earlier state.
 *
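 * As an illustrative sketch (not taken from any particular caller), a
 * kernel thread that must not proceed until a zone has finished booting
 * might block as follows; "zp" is assumed to be a held zone_t pointer:
 *
 *	zone_status_wait(zp, ZONE_IS_RUNNING);
 *	ASSERT(zone_status_get(zp) >= ZONE_IS_RUNNING);
 *
 * Since transitions are one-way, the assertion can only strengthen over
 * time; the zone may already be further along (e.g. shutting down).
 *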
 *
 * Zone-Specific Data:
 *
 * Subsystems needing to maintain zone-specific data can store that
 * data using the ZSD mechanism.  This provides a zone-specific data
 * store, similar to thread-specific data (see pthread_getspecific(3C)
 * or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 * to register callbacks to be invoked when a zone is created, shut
 * down, or destroyed.  This can be used to initialize zone-specific
 * data for new zones and to clean up when zones go away.
 *
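 * For example, a hypothetical subsystem "foo" (names here are
 * illustrative only) would typically register a key once at load time:
 *
 *	static zone_key_t foo_zone_key;
 *
 *	static void *
 *	foo_zone_init(zoneid_t zoneid)
 *	{
 *		return (kmem_zalloc(sizeof (foo_zone_data_t), KM_SLEEP));
 *	}
 *
 *	static void
 *	foo_zone_fini(zoneid_t zoneid, void *data)
 *	{
 *		kmem_free(data, sizeof (foo_zone_data_t));
 *	}
 *
 *	zone_key_create(&foo_zone_key, foo_zone_init, NULL, foo_zone_fini);
 *
 * and later retrieve its per-zone state with
 * zone_getspecific(foo_zone_key, zone).
 *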
 *
 * Data Structures:
 *
 * The per-zone structure (zone_t) is reference counted, and freed
 * when all references are released.  zone_hold and zone_rele can be
 * used to adjust the reference count.  In addition, reference counts
 * associated with the cred_t structure are tracked separately using
 * zone_cred_hold and zone_cred_rele.
 *
 * Pointers to active zone_t's are stored in two hash tables; one
 * for searching by id, the other for searching by name.  Lookups
 * can be performed on either basis, using zone_find_by_id and
 * zone_find_by_name.  Both return zone_t pointers with the zone
 * held, so zone_rele should be called when the pointer is no longer
 * needed.  Zones can also be searched by path; zone_find_by_path
 * returns the zone with which a path name is associated (global
 * zone if the path is not within some other zone's file system
 * hierarchy).  This currently requires iterating through each zone,
 * so it is slower than an id or name search via a hash table.
 *
 *
 * Locking:
 *
 *   zonehash_lock: This is a top-level global lock used to protect the
 *       zone hash tables and lists.  Zones cannot be created or destroyed
 *       while this lock is held.
 *   zone_status_lock: This is a global lock protecting zone state.
 *       Zones cannot change state while this lock is held.  It also
 *       protects the list of kernel threads associated with a zone.
 *   zone_lock: This is a per-zone lock used to protect several fields of
 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 *       this lock means that the zone cannot go away.
 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *       list (a list of zones in the ZONE_IS_DEAD state).
 *
 *   Ordering requirements:
 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 *   Blocking memory allocations are permitted while holding any of the
 *   zone locks.
 *
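 *   As a sketch of what the ordering above permits (hypothetical code,
 *   not from an actual caller): a thread may acquire zonehash_lock and
 *   then a zone's zone_lock, but never the reverse:
 *
 *	mutex_enter(&zonehash_lock);
 *	zone = zone_find_all_by_id(zoneid);
 *	mutex_enter(&zone->zone_lock);
 *	(... manipulate zone_t fields ...)
 *	mutex_exit(&zone->zone_lock);
 *	mutex_exit(&zonehash_lock);
 *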
 *
 * System Call Interface:
 *
 * The zone subsystem can be managed and queried from user level with
 * the following system calls (all subcodes of the primary "zone"
 * system call):
 *   - zone_create: creates a zone with selected attributes (name,
 *     root path, privileges, resource controls)
 *   - zone_enter: allows the current process to enter a zone
 *   - zone_getattr: reports attributes of a zone
 *   - zone_list: lists all zones active in the system
 *   - zone_lookup: looks up zone id based on name
 *   - zone_shutdown: initiates shutdown process (see states above)
 *   - zone_destroy: completes shutdown process (see states above)
 *
 */

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/log.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/fs/snode.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/zone.h>

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;	/* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except for by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;	/* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_nlwps;
/*
 * Synchronization primitives used to synchronize between mounts and zone
 * creation/destruction.
 */
static int mounts_in_progress;
static kcondvar_t mount_cv;
static kmutex_t mount_lock;

const char * const zone_initname = "/sbin/init";

static int zone_shutdown(zoneid_t zoneid);

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created such that
 * nfs_mount() thinks it is in the global zone, while by the time it
 * gets added to the list of mounted zones, it ends up on zoneA's mount
 * list.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone creation and new mounts.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone creations, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is unfair bias towards the
 * "current" operation.  This means that zone creations may starve if
 * there is a rapid succession of new mounts coming in to the system, or
 * there is a remote possibility that zones will be created at such a
 * rate that new mounts will not be able to proceed.
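 *
 * Illustrative pairing (a sketch, not lifted verbatim from the actual
 * callers).  In the VFS layer, each mount would be bracketed by:
 *
 *	mount_in_progress();
 *	error = VFS_MOUNT(vfsp, mvp, uap, credp);
 *	mount_completed();
 *
 * while zone creation brackets its critical section with:
 *
 *	if (block_mounts() == 0)
 *		return (EINTR);		(interrupted by a signal)
 *	(... install the new zone ...)
 *	resume_mounts();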
 */
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(void)
{
	int retval = 0;

	/*
	 * Since it may block for a long time, block_mounts() shouldn't be
	 * called with zonehash_lock held.
	 */
	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
	mutex_enter(&mount_lock);
	while (mounts_in_progress > 0) {
		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
			goto signaled;
	}
	/*
	 * A negative value of mounts_in_progress indicates that mounts
	 * have been blocked by (-mounts_in_progress) different callers.
	 */
	mounts_in_progress--;
	retval = 1;
signaled:
	mutex_exit(&mount_lock);
	return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(void)
{
	mutex_enter(&mount_lock);
	if (++mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * The VFS layer is busy with a mount; zones should wait until all
 * mounts are completed to progress.
 */
void
mount_in_progress(void)
{
	mutex_enter(&mount_lock);
	while (mounts_in_progress < 0)
		cv_wait(&mount_cv, &mount_lock);
	mounts_in_progress++;
	mutex_exit(&mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(void)
{
	mutex_enter(&mount_lock);
	if (--mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shutdown, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  The
 * constructor callback is called immediately on all existing zones, and a
 * copy of the ZSD entry added to the per-zone zone_zsd list (protected by
 * zone_lock).  As this operation requires the list of zones, the list of
 * registered keys, and the per-zone list of ZSD entries to remain constant
 * throughout the entire operation, it must grab zonehash_lock, zone_lock for
 * all existing zones, and zsd_key_lock, in that order.  Similar locking is
 * needed when zone_key_delete() is called.  It is thus sufficient to hold
 * zsd_key_lock *or* zone_lock to prevent additions to or removals from the
 * per-zone zone_zsd list.
 *
 * Note that this implementation does not make a copy of the ZSD entry if a
 * constructor callback is not provided.  A zone_getspecific() on such an
 * uninitialized ZSD entry will return NULL.
 *
 * When new zones are created constructor callbacks for all registered ZSD
 * entries will be called.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
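 *
 * For instance, a caller that wants "create the per-zone data on first
 * use" semantics would need a sketch like the following (foo_lock and
 * foo_make_data() are hypothetical):
 *
 *	mutex_enter(&foo_lock);
 *	if ((data = zone_getspecific(foo_zone_key, zone)) == NULL) {
 *		data = foo_make_data(zone->zone_id);
 *		(void) zone_setspecific(foo_zone_key, zone, data);
 *	}
 *	mutex_exit(&foo_lock);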
 */
void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	struct zone *zone;

	zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP);
	zsdp->zsd_data = NULL;
	zsdp->zsd_create = create;
	zsdp->zsd_shutdown = shutdown;
	zsdp->zsd_destroy = destroy;

	mutex_enter(&zonehash_lock);	/* stop the world */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_enter(&zone->zone_lock);	/* lock all zones */

	mutex_enter(&zsd_key_lock);
	*keyp = zsdp->zsd_key = ++zsd_keyval;
	ASSERT(zsd_keyval != 0);
	list_insert_tail(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	if (create != NULL) {
		for (zone = list_head(&zone_active); zone != NULL;
		    zone = list_next(&zone_active, zone)) {
			t = kmem_alloc(sizeof (*t), KM_SLEEP);
			t->zsd_key = *keyp;
			t->zsd_data = (*create)(zone->zone_id);
			t->zsd_create = create;
			t->zsd_shutdown = shutdown;
			t->zsd_destroy = destroy;
			list_insert_tail(&zone->zone_zsd, t);
		}
	}
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_exit(&zone->zone_lock);
	mutex_exit(&zonehash_lock);
}

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
	struct zsd_entry *zsd;

	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
		if (zsd->zsd_key == key) {
			/*
			 * Move to head of list to keep list in MRU order.
			 */
			if (zsd != list_head(l)) {
				list_remove(l, zsd);
				list_insert_head(l, zsd);
			}
			return (zsd);
		}
	}
	return (NULL);
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
 */
int
zone_key_delete(zone_key_t key)
{
	struct zsd_entry *zsdp = NULL;
	zone_t *zone;

	mutex_enter(&zonehash_lock);	/* Zone create/delete waits for us */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_enter(&zone->zone_lock);	/* lock all zones */

	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find(&zsd_registered_keys, key);
	if (zsdp == NULL)
		goto notfound;
	list_remove(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		struct zsd_entry *del;
		void *data;

		if (!(zone->zone_flags & ZF_DESTROYED)) {
			del = zsd_find(&zone->zone_zsd, key);
			if (del != NULL) {
				data = del->zsd_data;
				ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
				ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
				list_remove(&zone->zone_zsd, del);
				kmem_free(del, sizeof (*del));
			} else {
				data = NULL;
			}
			if (zsdp->zsd_shutdown)
				zsdp->zsd_shutdown(zone->zone_id, data);
			if (zsdp->zsd_destroy)
				zsdp->zsd_destroy(zone->zone_id, data);
		}
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);
	kmem_free(zsdp, sizeof (*zsdp));
	return (0);

notfound:
	mutex_exit(&zsd_key_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_exit(&zone->zone_lock);
	mutex_exit(&zonehash_lock);
	return (-1);
}
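
/*
 * A module using ZSD would typically tear its key down from its _fini()
 * routine.  A hedged sketch (the "foo" names are hypothetical and match
 * the registration example above):
 *
 *	int
 *	_fini(void)
 *	{
 *		int error;
 *
 *		if ((error = mod_remove(&modlinkage)) == 0)
 *			(void) zone_key_delete(foo_zone_key);
 *		return (error);
 *	}
 *
 * zone_key_delete() runs the shutdown and destroy callbacks for every
 * zone not already destroyed, so the module's per-zone state is
 * reclaimed even for zones that outlive the module.
 */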

/*
 * ZSD counterpart of pthread_setspecific().
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
	struct zsd_entry *t;
	struct zsd_entry *zsdp = NULL;

	mutex_enter(&zone->zone_lock);
	t = zsd_find(&zone->zone_zsd, key);
	if (t != NULL) {
		/*
		 * Replace old value with new
		 */
		t->zsd_data = (void *)data;
		mutex_exit(&zone->zone_lock);
		return (0);
	}
	/*
	 * If there was no previous value, go through the list of registered
	 * keys.
	 *
	 * We avoid grabbing zsd_key_lock until we are sure we need it; this is
	 * necessary for shutdown callbacks to be able to execute without fear
	 * of deadlock.
	 */
	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find(&zsd_registered_keys, key);
	if (zsdp == NULL) {	/* Key was not registered */
		mutex_exit(&zsd_key_lock);
		mutex_exit(&zone->zone_lock);
		return (-1);
	}

	/*
	 * Add a zsd_entry to this zone, using the template we just retrieved
	 * to initialize the constructor and destructor(s).
	 */
	t = kmem_alloc(sizeof (*t), KM_SLEEP);
	t->zsd_key = key;
	t->zsd_data = (void *)data;
	t->zsd_create = zsdp->zsd_create;
	t->zsd_shutdown = zsdp->zsd_shutdown;
	t->zsd_destroy = zsdp->zsd_destroy;
	list_insert_tail(&zone->zone_zsd, t);
	mutex_exit(&zsd_key_lock);
	mutex_exit(&zone->zone_lock);
	return (0);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
	struct zsd_entry *t;
	void *data;

	mutex_enter(&zone->zone_lock);
	t = zsd_find(&zone->zone_zsd, key);
	data = (t == NULL ? NULL : t->zsd_data);
	mutex_exit(&zone->zone_lock);
	return (data);
}
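
/*
 * Typical consumer pattern (a sketch; the "foo" names are hypothetical):
 * a driver entry point fetches its per-zone state for the caller's zone
 * and falls back gracefully if no constructor ever ran for this key:
 *
 *	foo_zone_data_t *fzd;
 *
 *	fzd = zone_getspecific(foo_zone_key, curproc->p_zone);
 *	if (fzd == NULL)
 *		return (ENXIO);
 *
 * The returned pointer is not reference counted; its lifetime is only as
 * long as the zone's, so callers should hold the zone if they cache it.
 */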

/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created.  The callbacks are initialized from
 * the template list (zsd_registered_keys), and the constructor
 * callback executed (if one exists).
 *
 * This is called before the zone is made publicly available, hence no
 * need to grab zone_lock.
 *
 * Although we grab and release zsd_key_lock, new entries cannot be
 * added to or removed from the zsd_registered_keys list until we
 * release zonehash_lock, so there isn't a window for a
 * zone_key_create() to come in after we've dropped zsd_key_lock but
 * before the zone is added to the zone list, such that the constructor
 * callbacks aren't executed for the new zone.
 */
static void
zone_zsd_configure(zone_t *zone)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	zoneid_t zoneid = zone->zone_id;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	ASSERT(list_head(&zone->zone_zsd) == NULL);
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		if (zsdp->zsd_create != NULL) {
			t = kmem_alloc(sizeof (*t), KM_SLEEP);
			t->zsd_key = zsdp->zsd_key;
			t->zsd_create = zsdp->zsd_create;
			t->zsd_data = (*t->zsd_create)(zoneid);
			t->zsd_shutdown = zsdp->zsd_shutdown;
			t->zsd_destroy = zsdp->zsd_destroy;
			list_insert_tail(&zone->zone_zsd, t);
		}
	}
	mutex_exit(&zsd_key_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	zoneid_t zoneid = zone->zone_id;

	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

	mutex_enter(&zone->zone_lock);
	if (ct == ZSD_DESTROY) {
		if (zone->zone_flags & ZF_DESTROYED) {
			/*
			 * Make sure destructors are only called once.
			 */
			mutex_exit(&zone->zone_lock);
			return;
		}
		zone->zone_flags |= ZF_DESTROYED;
	}
	mutex_exit(&zone->zone_lock);

	/*
	 * Both zsd_key_lock and zone_lock need to be held in order to add or
	 * remove a ZSD key, (either globally as part of
	 * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is
	 * possible through zone_setspecific()), so it's sufficient to hold
	 * zsd_key_lock here.
	 *
	 * This is a good thing, since we don't want to recursively try to grab
	 * zone_lock if a callback attempts to do something like a crfree() or
	 * zone_rele().
	 */
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		zone_key_t key = zsdp->zsd_key;

		/* Skip if no callbacks registered */
		if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL)
			continue;
		if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL)
			continue;
		/*
		 * Call the callback with the zone-specific data if we can find
		 * any, otherwise with NULL.
		 */
		t = zsd_find(&zone->zone_zsd, key);
		if (t != NULL) {
			if (ct == ZSD_SHUTDOWN) {
				t->zsd_shutdown(zoneid, t->zsd_data);
			} else {
				ASSERT(ct == ZSD_DESTROY);
				t->zsd_destroy(zoneid, t->zsd_data);
			}
		} else {
			if (ct == ZSD_SHUTDOWN) {
				zsdp->zsd_shutdown(zoneid, NULL);
			} else {
				ASSERT(ct == ZSD_DESTROY);
				zsdp->zsd_destroy(zoneid, NULL);
			}
		}
	}
	mutex_exit(&zsd_key_lock);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
	struct zsd_entry *t, *next;

	/*
	 * Free all the zsd_entry's we had on this zone.
	 */
	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
		next = list_next(&zone->zone_zsd, t);
		list_remove(&zone->zone_zsd, t);
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_zsd);
}

/*
 * zone.cpu-shares resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	return (p->p_zone->zone_shares);
}

/*ARGSUSED*/
static int
zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);

	e->rcep_p.zone->zone_shares = nv;
	return (0);
}

static rctl_ops_t zone_cpu_shares_ops = {
	rcop_no_action,
	zone_cpu_shares_usage,
	zone_cpu_shares_set,
	rcop_no_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
	rctl_qty_t nlwps;
	zone_t *zone = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));

	mutex_enter(&zone->zone_nlwps_lock);
	nlwps = zone->zone_nlwps;
	mutex_exit(&zone->zone_nlwps_lock);

	return (nlwps);
}

/*ARGSUSED*/
static int
zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t nlwps;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
	nlwps = e->rcep_p.zone->zone_nlwps;

	if (nlwps + incr > rcntl->rcv_value)
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_nlwps_ctl = nv;
	return (0);
}

static rctl_ops_t zone_lwps_ops = {
	rcop_no_action,
	zone_lwps_usage,
	zone_lwps_set,
	zone_lwps_test,
};

/*
 * Helper function to brand the zone with a unique ID.
 */
static void
zone_uniqid(zone_t *zone)
{
	static uint64_t uniqid = 0;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	zone->zone_uniqid = uniqid++;
}

/*
 * Returns a held pointer to the "kcred" for the specified zone.
 */
struct cred *
zone_get_kcred(zoneid_t zoneid)
{
	zone_t *zone;
	cred_t *cr;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (NULL);
	cr = zone->zone_kcred;
	crhold(cr);
	zone_rele(zone);
	return (cr);
}

/*
 * Called very early on in boot to initialize the ZSD list so that
 * zone_key_create() can be called before zone_init().  It also initializes
 * portions of zone0 which may be used before zone_init() is called.  The
 * variable "global_zone" will be set when zone0 is fully initialized by
 * zone_init().
 */
void
zone_zsd_init(void)
{
	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_create(&zone_active, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));
	list_create(&zone_deathrow, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));

	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
	zone0.zone_shares = 1;
	zone0.zone_nlwps_ctl = INT_MAX;
	zone0.zone_name = GLOBAL_ZONENAME;
	zone0.zone_nodename = utsname.nodename;
	zone0.zone_domain = srpc_domain;
	zone0.zone_ref = 1;
	zone0.zone_id = GLOBAL_ZONEID;
	zone0.zone_status = ZONE_IS_RUNNING;
	zone0.zone_rootpath = "/";
	zone0.zone_rootpathlen = 2;
	zone0.zone_psetid = ZONE_PS_INVAL;
	zone0.zone_ncpus = 0;
	zone0.zone_ncpus_online = 0;
	zone0.zone_proc_initpid = 1;
	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_insert_head(&zone_active, &zone0);

	/*
	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
	 * to anything meaningful.  It is assigned to be 'rootdir' in
	 * vfs_mountroot().
	 */
	zone0.zone_rootvp = NULL;
	zone0.zone_vfslist = NULL;
	zone0.zone_bootargs = NULL;
	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
	/*
	 * The global zone has all privileges
	 */
	priv_fillset(zone0.zone_privset);
	/*
	 * Add p0 to the global zone
	 */
	zone0.zone_zsched = &p0;
	p0.p_zone = &zone0;
}

/*
 * Called by main() to initialize the zones framework.
 */
void
zone_init(void)
{
	rctl_dict_entry_t *rde;
	rctl_val_t *dval;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;

	ASSERT(curproc == &p0);

	/*
	 * Create ID space for zone IDs.  ID 0 is reserved for the
	 * global zone.
	 */
	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);

	/*
	 * Initialize generic zone resource controls, if any.
	 */
	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);

	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_lwps_ops);
	/*
	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
	 */
	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
	bzero(dval, sizeof (rctl_val_t));
	dval->rcv_value = 1;
	dval->rcv_privilege = RCPRIV_PRIVILEGED;
	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
	dval->rcv_action_recip_pid = -1;

	rde = rctl_dict_lookup("zone.cpu-shares");
	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);

	/*
	 * Initialize the ``global zone''.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&p0.p_lock);
	e.rcep_p.zone = &zone0;
	e.rcep_t = RCENTITY_ZONE;
	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
	    gp);

	zone0.zone_nlwps = p0.p_lwpcnt;
	zone0.zone_ntasks = 1;
	mutex_exit(&p0.p_lock);
	rctl_prealloc_destroy(gp);
	/*
	 * pool_default hasn't been initialized yet, so we let pool_init()
	 * take care of making sure the global zone is in the default pool.
	 */
	mutex_enter(&zonehash_lock);
	zone_uniqid(&zone0);
	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
	mutex_exit(&zonehash_lock);
	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
	    mod_hash_null_valdtor);
	zonehashbyname = mod_hash_create_strhash("zone_by_name",
	    zone_hash_size, mod_hash_null_valdtor);
	zonecount = 1;

	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
	    (mod_hash_val_t)&zone0);
	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
	    (mod_hash_val_t)&zone0);
	/*
	 * We avoid setting zone_kcred until now, since kcred is initialized
	 * sometime after zone_zsd_init() and before zone_init().
	 */
	zone0.zone_kcred = kcred;
	/*
	 * The global zone is fully initialized (except for zone_rootvp which
	 * will be set when the root filesystem is mounted).
	 */
	global_zone = &zone0;
}

static void
zone_free(zone_t *zone)
{
	ASSERT(zone != global_zone);
	ASSERT(zone->zone_ntasks == 0);
	ASSERT(zone->zone_nlwps == 0);
	ASSERT(zone->zone_cred_ref == 0);
	ASSERT(zone->zone_kcred == NULL);
	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);

	/* remove from deathrow list */
	if (zone_status_get(zone) == ZONE_IS_DEAD) {
		ASSERT(zone->zone_ref == 0);
		mutex_enter(&zone_deathrow_lock);
		list_remove(&zone_deathrow, zone);
		mutex_exit(&zone_deathrow_lock);
	}

	zone_free_zsd(zone);

	if (zone->zone_rootvp != NULL)
		VN_RELE(zone->zone_rootvp);
	if (zone->zone_rootpath)
		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
	if (zone->zone_name != NULL)
		kmem_free(zone->zone_name, ZONENAME_MAX);
	if (zone->zone_nodename != NULL)
		kmem_free(zone->zone_nodename, _SYS_NMLN);
	if (zone->zone_domain != NULL)
		kmem_free(zone->zone_domain, _SYS_NMLN);
	if (zone->zone_privset != NULL)
		kmem_free(zone->zone_privset, sizeof (priv_set_t));
	if (zone->zone_rctls != NULL)
		rctl_set_free(zone->zone_rctls);
	if (zone->zone_bootargs != NULL)
		kmem_free(zone->zone_bootargs, ZONEBOOTARGS_MAX);
	id_free(zoneid_space, zone->zone_id);
	mutex_destroy(&zone->zone_lock);
	cv_destroy(&zone->zone_cv);
	kmem_free(zone, sizeof (zone_t));
}

/*
 * See block comment at the top of this file for information about zone
 * status values.
 */
/*
 * Convenience function for setting zone status.
 */
static void
zone_status_set(zone_t *zone, zone_status_t status)
{
	ASSERT(MUTEX_HELD(&zone_status_lock));
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
	    status >= zone_status_get(zone));
	zone->zone_status = status;
	cv_broadcast(&zone->zone_cv);
}

/*
 * Public function to retrieve the zone status.  The zone status may
 * change after it is retrieved.
 */
zone_status_t
zone_status_get(zone_t *zone)
{
	return (zone->zone_status);
}

static int
zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
{
	char *bootargs = kmem_zalloc(ZONEBOOTARGS_MAX, KM_SLEEP);
	size_t len;
	int err;

	err = copyinstr(zone_bootargs, bootargs, ZONEBOOTARGS_MAX - 1, &len);
	if (err != 0) {
		kmem_free(bootargs, ZONEBOOTARGS_MAX);
		return (err);	/* EFAULT or ENAMETOOLONG */
	}
	bootargs[len] = '\0';

	ASSERT(zone->zone_bootargs == NULL);
	zone->zone_bootargs = bootargs;
	return (0);
}

/*
 * Block indefinitely waiting for (zone_status >= status)
 */
void
zone_status_wait(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		cv_wait(&zone->zone_cv, &zone_status_lock);
	}
	mutex_exit(&zone_status_lock);
}

/*
 * Private CPR-safe version of zone_status_wait().
 */
static void
zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
{
	callb_cpr_t cprinfo;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
	    str);
	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&zone->zone_cv, &zone_status_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
	}
	/*
	 * zone_status_lock is implicitly released by the following.
	 */
	CALLB_CPR_EXIT(&cprinfo);
}

/*
 * Block until zone enters requested state or signal is received.  Return (0)
 * if signaled, non-zero otherwise.
 */
int
zone_status_wait_sig(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
			mutex_exit(&zone_status_lock);
			return (0);
		}
	}
	mutex_exit(&zone_status_lock);
	return (1);
}

/*
 * Block until the zone enters the requested state or the timeout expires,
 * whichever happens first.  Return (-1) if operation timed out, time remaining
 * otherwise.
 */
clock_t
zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
{
	clock_t timeleft = 0;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status && timeleft != -1) {
		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
	}
	mutex_exit(&zone_status_lock);
	return (timeleft);
}

/*
 * Block until the zone enters the requested state, the current process is
 * signaled, or the timeout expires, whichever happens first.  Return (-1) if
 * operation timed out, 0 if signaled, time remaining otherwise.
 */
clock_t
zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
{
	clock_t timeleft = tim - lbolt;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
		    tim);
		if (timeleft <= 0)
			break;
	}
	mutex_exit(&zone_status_lock);
	return (timeleft);
}

/*
 * Zones have two reference counts: one for references from credential
 * structures (zone_cred_ref), and one (zone_ref) for everything else.
 * This is so we can allow a zone to be rebooted while there are still
 * outstanding cred references, since certain drivers cache dblks (which
 * implicitly results in cached creds).  We wait for zone_ref to drop to
 * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
 * later freed when the zone_cred_ref drops to 0, though nothing other
 * than the zone id and privilege set should be accessed once the zone
 * is "dead".
 *
 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
 * to force halt/reboot to block waiting for the zone_cred_ref to drop
 * to 0.  This can be useful to flush out other sources of cached creds
 * that may be less innocuous than the driver case.
 */

int zone_wait_for_cred = 0;
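
/*
 * A sketch of the usual hold protocol (illustrative, not from a specific
 * caller): any code that stashes a zone_t pointer must balance the hold
 * with a release once it is done:
 *
 *	zone_hold(zone);
 *	(... use zone, possibly blocking ...)
 *	zone_rele(zone);
 *
 * Code that caches a cred (and hence, implicitly, the cred's zone) uses
 * zone_cred_hold()/zone_cred_rele() instead, so that an exiting zone can
 * be observed as "unreferenced" even while such cached creds linger.
 */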

static void
zone_hold_locked(zone_t *z)
{
	ASSERT(MUTEX_HELD(&z->zone_lock));
	z->zone_ref++;
	ASSERT(z->zone_ref != 0);
}

void
zone_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	zone_hold_locked(z);
	mutex_exit(&z->zone_lock);
}

/*
 * If the non-cred ref count drops to 1 and either the cred ref count
 * is 0 or we aren't waiting for cred references, the zone is ready to
 * be destroyed.
 */
#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
	(!zone_wait_for_cred || (zone)->zone_cred_ref == 0))

void
zone_rele(zone_t *z)
{
	boolean_t wakeup;

	mutex_enter(&z->zone_lock);
	ASSERT(z->zone_ref != 0);
	z->zone_ref--;
	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
		/* no more refs, free the structure */
		mutex_exit(&z->zone_lock);
		zone_free(z);
		return;
	}
	/* signal zone_destroy so the zone can finish halting */
	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
	mutex_exit(&z->zone_lock);

	if (wakeup) {
		/*
		 * Grabbing zonehash_lock here effectively synchronizes with
		 * zone_destroy() to avoid missed signals.
		 */
		mutex_enter(&zonehash_lock);
		cv_broadcast(&zone_destroy_cv);
		mutex_exit(&zonehash_lock);
	}
}

void
zone_cred_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	z->zone_cred_ref++;
	ASSERT(z->zone_cred_ref != 0);
	mutex_exit(&z->zone_lock);
}

void
zone_cred_rele(zone_t *z)
{
	boolean_t wakeup;

	mutex_enter(&z->zone_lock);
	ASSERT(z->zone_cred_ref != 0);
	z->zone_cred_ref--;
	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
		/* no more refs, free the structure */
		mutex_exit(&z->zone_lock);
		zone_free(z);
		return;
	}
	/*
	 * If zone_destroy is waiting for the cred references to drain
	 * out, and they have, signal it.
	 */
	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
	    zone_status_get(z) >= ZONE_IS_DEAD);
	mutex_exit(&z->zone_lock);

	if (wakeup) {
		/*
		 * Grabbing zonehash_lock here effectively synchronizes with
		 * zone_destroy() to avoid missed signals.
		 */
		mutex_enter(&zonehash_lock);
		cv_broadcast(&zone_destroy_cv);
		mutex_exit(&zonehash_lock);
	}
}

void
zone_task_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	z->zone_ntasks++;
	ASSERT(z->zone_ntasks != 0);
	mutex_exit(&z->zone_lock);
}

void
zone_task_rele(zone_t *zone)
{
	uint_t refcnt;

	mutex_enter(&zone->zone_lock);
	ASSERT(zone->zone_ntasks != 0);
	refcnt = --zone->zone_ntasks;
	if (refcnt > 1) {	/* Common case */
		mutex_exit(&zone->zone_lock);
		return;
	}
	zone_hold_locked(zone);	/* so we can use the zone_t later */
	mutex_exit(&zone->zone_lock);
	if (refcnt == 1) {
		/*
		 * See if the zone is shutting down.
		 */
		mutex_enter(&zone_status_lock);
		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
			goto out;
		}

		/*
		 * Make sure the ntasks didn't change since we
		 * dropped zone_lock.
		 */
		mutex_enter(&zone->zone_lock);
		if (refcnt != zone->zone_ntasks) {
			mutex_exit(&zone->zone_lock);
			goto out;
		}
		mutex_exit(&zone->zone_lock);

		/*
		 * No more user processes in the zone.  The zone is empty.
		 */
		zone_status_set(zone, ZONE_IS_EMPTY);
		goto out;
	}

	ASSERT(refcnt == 0);
	/*
	 * zsched has exited; the zone is dead.
	 */
	zone->zone_zsched = NULL;	/* paranoia */
	mutex_enter(&zone_status_lock);
	zone_status_set(zone, ZONE_IS_DEAD);
out:
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
}

zoneid_t
getzoneid(void)
{
	return (curproc->p_zone->zone_id);
}

/*
 * Internal versions of zone_find_by_*().  These don't zone_hold() or
 * check the validity of a zone's state.
 */
static zone_t *
zone_find_all_by_id(zoneid_t zoneid)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	if (mod_hash_find(zonehashbyid,
	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

static zone_t *
zone_find_all_by_name(char *name)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

/*
 * Public interface for looking up a zone by zoneid.  Only returns the zone if
 * it is fully initialized, and has not yet begun the zone_destroy() sequence.
 * Caller must call zone_rele() once it is done with the zone.
 *
 * The zone may begin the zone_destroy() sequence immediately after this
 * function returns, but may be safely used until zone_rele() is called.
 */
zone_t *
zone_find_by_id(zoneid_t zoneid)
{
	zone_t *zone;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	status = zone_status_get(zone);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * For all practical purposes the zone doesn't exist.
		 */
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}
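
/*
 * Usage sketch (illustrative only, mirroring zone_get_kcred() above):
 * callers treat a NULL return as "no such zone" and must drop the hold
 * when done:
 *
 *	zone_t *zone;
 *
 *	if ((zone = zone_find_by_id(zoneid)) == NULL)
 *		return (ESRCH);
 *	(... use zone; it cannot be freed out from under us ...)
 *	zone_rele(zone);
 */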

/*
 * Similar to zone_find_by_id, but using zone name as the key.
 */
zone_t *
zone_find_by_name(char *name)
{
	zone_t *zone;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_name(name)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	status = zone_status_get(zone);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * For all practical purposes the zone doesn't exist.
		 */
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}

/*
 * Similar to zone_find_by_id(), using the path as a key.  For instance,
 * if there is a zone "foo" rooted at /foo/root, and the path argument
 * is "/foo/root/proc", it will return the held zone_t corresponding to
 * zone "foo".
 *
 * zone_find_by_path() always returns a non-NULL value, since at the
 * very least every path will be contained in the global zone.
 *
 * As with the other zone_find_by_*() functions, the caller is
 * responsible for zone_rele()ing the return value of this function.
 */
zone_t *
zone_find_by_path(const char *path)
{
	zone_t *zone;
	zone_t *zret = NULL;
	zone_status_t status;

	if (path == NULL) {
		/*
		 * Call from rootconf().
		 */
		zone_hold(global_zone);
		return (global_zone);
	}
	ASSERT(*path == '/');
	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		if (ZONE_PATH_VISIBLE(path, zone))
			zret = zone;
	}
	ASSERT(zret != NULL);
	status = zone_status_get(zret);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * Zone practically doesn't exist.
		 */
		zret = global_zone;
	}
	zone_hold(zret);
	mutex_exit(&zonehash_lock);
	return (zret);
}

/*
 * Get the number of cpus visible to this zone.  The system-wide global
 * 'ncpus' is returned if pools are disabled, the caller is in the
 * global zone, or a NULL zone argument is passed in.
 */
int
zone_ncpus_get(zone_t *zone)
{
	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;

	return (myncpus != 0 ? myncpus : ncpus);
}

/*
 * Get the number of online cpus visible to this zone.  The system-wide
 * global 'ncpus_online' is returned if pools are disabled, the caller
 * is in the global zone, or a NULL zone argument is passed in.
 */
int
zone_ncpus_online_get(zone_t *zone)
{
	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;

	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
}

/*
 * Return the pool to which the zone is currently bound.
 */
pool_t *
zone_pool_get(zone_t *zone)
{
	ASSERT(pool_lock_held());

	return (zone->zone_pool);
}

/*
 * Set the zone's pool pointer and update the zone's visibility to match
 * the resources in the new pool.
 */
void
zone_pool_set(zone_t *zone, pool_t *pool)
{
	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	zone->zone_pool = pool;
	zone_pset_set(zone, pool->pool_pset->pset_id);
}

/*
 * Return the cached value of the id of the processor set to which the
 * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
 * facility is disabled.
 */
psetid_t
zone_pset_get(zone_t *zone)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	return (zone->zone_psetid);
}

/*
 * Set the cached value of the id of the processor set to which the zone
 * is currently bound.  Also update the zone's visibility to match the
 * resources in the new processor set.
 */
void
zone_pset_set(zone_t *zone, psetid_t newpsetid)
{
	psetid_t oldpsetid;

	ASSERT(MUTEX_HELD(&cpu_lock));
	oldpsetid = zone_pset_get(zone);

	if (oldpsetid == newpsetid)
		return;
	/*
	 * Global zone sees all.
	 */
	if (zone != global_zone) {
		zone->zone_psetid = newpsetid;
		if (newpsetid != ZONE_PS_INVAL)
			pool_pset_visibility_add(newpsetid, zone);
		if (oldpsetid != ZONE_PS_INVAL)
			pool_pset_visibility_remove(oldpsetid, zone);
	}
	/*
	 * Disabling pools, so we should start using the global values
	 * for ncpus and ncpus_online.
	 */
	if (newpsetid == ZONE_PS_INVAL) {
		zone->zone_ncpus = 0;
		zone->zone_ncpus_online = 0;
	}
}

/*
 * Walk the list of active zones and issue the provided callback for
 * each of them.
 *
 * Caller must not be holding any locks that may be acquired under
 * zonehash_lock.  See comment at the beginning of the file for a list of
 * common locks and their interactions with zones.
 */
int
zone_walk(int (*cb)(zone_t *, void *), void *data)
{
	zone_t *zone;
	int ret = 0;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		/*
		 * Skip zones that shouldn't be externally visible.
		 */
		status = zone_status_get(zone);
		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
			continue;
		/*
		 * Bail immediately if any callback invocation returns a
		 * non-zero value.
		 */
		ret = (*cb)(zone, data);
		if (ret != 0)
			break;
	}
	mutex_exit(&zonehash_lock);
	return (ret);
}
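
/*
 * Example callback (a sketch, not from an actual consumer): counting the
 * externally visible zones.  Returning 0 keeps the walk going:
 *
 *	static int
 *	count_zones_cb(zone_t *zone, void *arg)
 *	{
 *		(*(uint_t *)arg)++;
 *		return (0);
 *	}
 *
 *	uint_t nzones = 0;
 *	(void) zone_walk(count_zones_cb, &nzones);
 *
 * Note that the callback runs with zonehash_lock held, so it must obey
 * the lock ordering described at the top of this file.
 */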

static int
zone_set_root(zone_t *zone, const char *upath)
{
	vnode_t *vp;
	int trycount;
	int error = 0;
	char *path;
	struct pathname upn, pn;
	size_t pathlen;

	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
		return (error);

	pn_alloc(&pn);

	/* prevent infinite loop */
	trycount = 10;
	for (;;) {
		if (--trycount <= 0) {
			error = ESTALE;
			goto out;
		}

		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
			/*
			 * VOP_ACCESS() may cover 'vp' with a new
			 * filesystem, if 'vp' is an autoFS vnode.
			 * Get the new 'vp' if so.
			 */
			if ((error = VOP_ACCESS(vp, VEXEC, 0, CRED())) == 0 &&
			    (vp->v_vfsmountedhere == NULL ||
			    (error = traverse(&vp)) == 0)) {
				pathlen = pn.pn_pathlen + 2;
				path = kmem_alloc(pathlen, KM_SLEEP);
				(void) strncpy(path, pn.pn_path,
				    pn.pn_pathlen + 1);
				path[pathlen - 2] = '/';
				path[pathlen - 1] = '\0';
				pn_free(&pn);
				pn_free(&upn);

				/* Success! */
				break;
			}
			VN_RELE(vp);
		}
		if (error != ESTALE)
			goto out;
	}

	ASSERT(error == 0);
	zone->zone_rootvp = vp;		/* we hold a reference to vp */
	zone->zone_rootpath = path;
	zone->zone_rootpathlen = pathlen;
	return (0);

out:
	pn_free(&pn);
	pn_free(&upn);
	return (error);
}

#define	isalnum(c)	(((c) >= '0' && (c) <= '9') || \
	((c) >= 'a' && (c) <= 'z') || \
	((c) >= 'A' && (c) <= 'Z'))

static int
zone_set_name(zone_t *zone, const char *uname)
{
	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
	size_t len;
	int i, err;

	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
		kmem_free(kname, ZONENAME_MAX);
		return (err);	/* EFAULT or ENAMETOOLONG */
	}

	/* must be less than ZONENAME_MAX */
	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
		kmem_free(kname, ZONENAME_MAX);
		return (EINVAL);
	}

	/*
	 * Name must start with an alphanumeric and must contain only
	 * alphanumerics, '-', '_' and '.'.
	 */
	if (!isalnum(kname[0])) {
		kmem_free(kname, ZONENAME_MAX);
		return (EINVAL);
	}
	for (i = 1; i < len - 1; i++) {
		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
		    kname[i] != '.') {
			kmem_free(kname, ZONENAME_MAX);
			return (EINVAL);
		}
	}

	zone->zone_name = kname;
	return (0);
}

/*
 * Similar to thread_create(), but makes sure the thread is in the appropriate
 * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
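 *
 * A hedged usage sketch (hypothetical worker): threads created this way
 * must exit via zthread_exit() rather than thread_exit(), so that the
 * zone's kernel-thread list and reference count stay balanced:
 *
 *	static void
 *	foo_worker(void *arg)
 *	{
 *		(... do per-zone work ...)
 *		zthread_exit();
 *	}
 *
 *	(void) zthread_create(NULL, 0, foo_worker, arg, 0, minclsyspri);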
 */
/*ARGSUSED*/
kthread_t *
zthread_create(
    caddr_t stk,
    size_t stksize,
    void (*proc)(),
    void *arg,
    size_t len,
    pri_t pri)
{
	kthread_t *t;
	zone_t *zone = curproc->p_zone;
	proc_t *pp = zone->zone_zsched;

	zone_hold(zone);	/* Reference to be dropped when thread exits */

	/*
	 * No-one should be trying to create threads if the zone is shutting
	 * down and there aren't any kernel threads around.  See comment
	 * in zthread_exit().
	 */
	ASSERT(!(zone->zone_kthreads == NULL &&
	    zone_status_get(zone) >= ZONE_IS_EMPTY));
	/*
	 * Create a thread, but don't let it run until we've finished setting
	 * things up.
	 */
	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
	ASSERT(t->t_forw == NULL);
	mutex_enter(&zone_status_lock);
	if (zone->zone_kthreads == NULL) {
		t->t_forw = t->t_back = t;
	} else {
		kthread_t *tx = zone->zone_kthreads;

		t->t_forw = tx;
		t->t_back = tx->t_back;
		tx->t_back->t_forw = t;
		tx->t_back = t;
	}
	zone->zone_kthreads = t;
	mutex_exit(&zone_status_lock);

	mutex_enter(&pp->p_lock);
	t->t_proc_flag |= TP_ZTHREAD;
	project_rele(t->t_proj);
	t->t_proj = project_hold(pp->p_task->tk_proj);

	/*
	 * Setup complete, let it run.
	 */
	thread_lock(t);
	t->t_schedflag |= TS_ALLSTART;
	setrun_locked(t);
	thread_unlock(t);

	mutex_exit(&pp->p_lock);

	return (t);
}

/*
 * Similar to thread_exit().  Must be called by threads created via
 * zthread_create().
 */
void
zthread_exit(void)
{
	kthread_t *t = curthread;
	proc_t *pp = curproc;
	zone_t *zone = pp->p_zone;

	mutex_enter(&zone_status_lock);

	/*
	 * Reparent to p0
	 */
	mutex_enter(&pp->p_lock);
	t->t_proc_flag &= ~TP_ZTHREAD;
	t->t_procp = &p0;
	hat_thread_exit(t);
	mutex_exit(&pp->p_lock);

	if (t->t_back == t) {
		ASSERT(t->t_forw == t);
		/*
		 * If the zone is empty, once the thread count
		 * goes to zero no further kernel threads can be
		 * created.  This is because if the creator is a process
		 * in the zone, then it must have exited before the zone
		 * state could be set to ZONE_IS_EMPTY.
		 * Otherwise, if the creator is a kernel thread in the
		 * zone, the thread count is non-zero.
		 *
		 * This really means that non-zone kernel threads should
		 * not create zone kernel threads.
1886 */ 1887 zone->zone_kthreads = NULL; 1888 if (zone_status_get(zone) == ZONE_IS_EMPTY) { 1889 zone_status_set(zone, ZONE_IS_DOWN); 1890 } 1891 } else { 1892 t->t_forw->t_back = t->t_back; 1893 t->t_back->t_forw = t->t_forw; 1894 if (zone->zone_kthreads == t) 1895 zone->zone_kthreads = t->t_forw; 1896 } 1897 mutex_exit(&zone_status_lock); 1898 zone_rele(zone); 1899 thread_exit(); 1900 /* NOTREACHED */ 1901 } 1902 1903 static void 1904 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp) 1905 { 1906 vnode_t *oldvp; 1907 1908 /* we're going to hold a reference here to the directory */ 1909 VN_HOLD(vp); 1910 1911 #ifdef C2_AUDIT 1912 if (audit_active) /* update abs cwd/root path see c2audit.c */ 1913 audit_chdirec(vp, vpp); 1914 #endif 1915 1916 mutex_enter(&pp->p_lock); 1917 oldvp = *vpp; 1918 *vpp = vp; 1919 mutex_exit(&pp->p_lock); 1920 if (oldvp != NULL) 1921 VN_RELE(oldvp); 1922 } 1923 1924 /* 1925 * Convert an rctl value represented by an nvlist_t into an rctl_val_t. 1926 */ 1927 static int 1928 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv) 1929 { 1930 nvpair_t *nvp = NULL; 1931 boolean_t priv_set = B_FALSE; 1932 boolean_t limit_set = B_FALSE; 1933 boolean_t action_set = B_FALSE; 1934 1935 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 1936 const char *name; 1937 uint64_t ui64; 1938 1939 name = nvpair_name(nvp); 1940 if (nvpair_type(nvp) != DATA_TYPE_UINT64) 1941 return (EINVAL); 1942 (void) nvpair_value_uint64(nvp, &ui64); 1943 if (strcmp(name, "privilege") == 0) { 1944 /* 1945 * Currently only privileged values are allowed, but 1946 * this may change in the future. 1947 */ 1948 if (ui64 != RCPRIV_PRIVILEGED) 1949 return (EINVAL); 1950 rv->rcv_privilege = ui64; 1951 priv_set = B_TRUE; 1952 } else if (strcmp(name, "limit") == 0) { 1953 rv->rcv_value = ui64; 1954 limit_set = B_TRUE; 1955 } else if (strcmp(name, "action") == 0) { 1956 if (ui64 != RCTL_LOCAL_NOACTION && 1957 ui64 != RCTL_LOCAL_DENY) 1958 return (EINVAL); 1959 rv->rcv_flagaction = ui64; 1960 action_set = B_TRUE; 1961 } else { 1962 return (EINVAL); 1963 } 1964 } 1965 1966 if (!(priv_set && limit_set && action_set)) 1967 return (EINVAL); 1968 rv->rcv_action_signal = 0; 1969 rv->rcv_action_recipient = NULL; 1970 rv->rcv_action_recip_pid = -1; 1971 rv->rcv_firing_time = 0; 1972 1973 return (0); 1974 } 1975 1976 void 1977 zone_icode(void) 1978 { 1979 proc_t *p = ttoproc(curthread); 1980 struct core_globals *cg; 1981 1982 /* 1983 * For all purposes (ZONE_ATTR_INITPID and restart_init), 1984 * storing just the pid of init is sufficient. 
1985 */ 1986 p->p_zone->zone_proc_initpid = p->p_pid; 1987 1988 /* 1989 * Allocate user address space and stack segment 1990 */ 1991 1992 p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0; 1993 p->p_usrstack = (caddr_t)USRSTACK32; 1994 p->p_model = DATAMODEL_ILP32; 1995 p->p_stkprot = PROT_ZFOD & ~PROT_EXEC; 1996 p->p_datprot = PROT_ZFOD & ~PROT_EXEC; 1997 p->p_stk_ctl = INT32_MAX; 1998 1999 p->p_as = as_alloc(); 2000 p->p_as->a_userlimit = (caddr_t)USERLIMIT32; 2001 (void) hat_setup(p->p_as->a_hat, HAT_INIT); 2002 2003 cg = zone_getspecific(core_zone_key, p->p_zone); 2004 ASSERT(cg != NULL); 2005 corectl_path_hold(cg->core_default_path); 2006 corectl_content_hold(cg->core_default_content); 2007 p->p_corefile = cg->core_default_path; 2008 p->p_content = cg->core_default_content; 2009 2010 init_mstate(curthread, LMS_SYSTEM); 2011 2012 p->p_zone->zone_boot_err = exec_init(zone_initname, 0, 2013 p->p_zone->zone_bootargs); 2014 2015 mutex_enter(&zone_status_lock); 2016 if (p->p_zone->zone_boot_err != 0) { 2017 /* 2018 * Make sure we are still in the booting state-- we could have 2019 * raced and already be shutting down, or even further along. 2020 */ 2021 if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING) 2022 zone_status_set(p->p_zone, ZONE_IS_SHUTTING_DOWN); 2023 mutex_exit(&zone_status_lock); 2024 /* It's gone bad, dispose of the process */ 2025 if (proc_exit(CLD_EXITED, p->p_zone->zone_boot_err) != 0) { 2026 mutex_enter(&p->p_lock); 2027 ASSERT(p->p_flag & SEXITLWPS); 2028 lwp_exit(); 2029 } 2030 } else { 2031 if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING) 2032 zone_status_set(p->p_zone, ZONE_IS_RUNNING); 2033 mutex_exit(&zone_status_lock); 2034 /* cause the process to return to userland. */ 2035 lwp_rtt(); 2036 } 2037 } 2038 2039 struct zsched_arg { 2040 zone_t *zone; 2041 nvlist_t *nvlist; 2042 }; 2043 2044 /* 2045 * Per-zone "sched" workalike. The similarity to "sched" doesn't have 2046 * anything to do with scheduling, but rather with the fact that 2047 * per-zone kernel threads are parented to zsched, just like regular 2048 * kernel threads are parented to sched (p0). 2049 * 2050 * zsched is also responsible for launching init for the zone. 2051 */ 2052 static void 2053 zsched(void *arg) 2054 { 2055 struct zsched_arg *za = arg; 2056 proc_t *pp = curproc; 2057 proc_t *initp = proc_init; 2058 zone_t *zone = za->zone; 2059 cred_t *cr, *oldcred; 2060 rctl_set_t *set; 2061 rctl_alloc_gp_t *gp; 2062 contract_t *ct = NULL; 2063 task_t *tk, *oldtk; 2064 rctl_entity_p_t e; 2065 kproject_t *pj; 2066 2067 nvlist_t *nvl = za->nvlist; 2068 nvpair_t *nvp = NULL; 2069 2070 bcopy("zsched", u.u_psargs, sizeof ("zsched")); 2071 bcopy("zsched", u.u_comm, sizeof ("zsched")); 2072 u.u_argc = 0; 2073 u.u_argv = NULL; 2074 u.u_envp = NULL; 2075 closeall(P_FINFO(pp)); 2076 2077 /* 2078 * We are this zone's "zsched" process. As the zone isn't generally 2079 * visible yet we don't need to grab any locks before initializing its 2080 * zone_proc pointer. 2081 */ 2082 zone_hold(zone); /* this hold is released by zone_destroy() */ 2083 zone->zone_zsched = pp; 2084 mutex_enter(&pp->p_lock); 2085 pp->p_zone = zone; 2086 mutex_exit(&pp->p_lock); 2087 2088 /* 2089 * Disassociate process from its 'parent'; parent ourselves to init 2090 * (pid 1) and change other values as needed. 
2091 */ 2092 sess_create(); 2093 2094 mutex_enter(&pidlock); 2095 proc_detach(pp); 2096 pp->p_ppid = 1; 2097 pp->p_flag |= SZONETOP; 2098 pp->p_ancpid = 1; 2099 pp->p_parent = initp; 2100 pp->p_psibling = NULL; 2101 if (initp->p_child) 2102 initp->p_child->p_psibling = pp; 2103 pp->p_sibling = initp->p_child; 2104 initp->p_child = pp; 2105 2106 /* Decrement what newproc() incremented. */ 2107 upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID); 2108 /* 2109 * Our credentials are about to become kcred-like, so we don't care 2110 * about the caller's ruid. 2111 */ 2112 upcount_inc(crgetruid(kcred), zone->zone_id); 2113 mutex_exit(&pidlock); 2114 2115 /* 2116 * getting out of global zone, so decrement lwp counts 2117 */ 2118 pj = pp->p_task->tk_proj; 2119 mutex_enter(&global_zone->zone_nlwps_lock); 2120 pj->kpj_nlwps -= pp->p_lwpcnt; 2121 global_zone->zone_nlwps -= pp->p_lwpcnt; 2122 mutex_exit(&global_zone->zone_nlwps_lock); 2123 2124 /* 2125 * Create and join a new task in project '0' of this zone. 2126 * 2127 * We don't need to call holdlwps() since we know we're the only lwp in 2128 * this process. 2129 * 2130 * task_join() returns with p_lock held. 2131 */ 2132 tk = task_create(0, zone); 2133 mutex_enter(&cpu_lock); 2134 oldtk = task_join(tk, 0); 2135 mutex_exit(&curproc->p_lock); 2136 mutex_exit(&cpu_lock); 2137 task_rele(oldtk); 2138 2139 /* 2140 * add lwp counts to zsched's zone, and increment project's task count 2141 * due to the task created by task_create()/task_join() above 2142 */ 2143 pj = pp->p_task->tk_proj; 2144 mutex_enter(&zone->zone_nlwps_lock); 2145 pj->kpj_nlwps += pp->p_lwpcnt; 2146 pj->kpj_ntasks += 1; 2147 zone->zone_nlwps += pp->p_lwpcnt; 2148 mutex_exit(&zone->zone_nlwps_lock); 2149 2150 /* 2151 * The process was created by a process in the global zone, hence the 2152 * credentials are wrong. We might as well have kcred-ish credentials. 2153 */ 2154 cr = zone->zone_kcred; 2155 crhold(cr); 2156 mutex_enter(&pp->p_crlock); 2157 oldcred = pp->p_cred; 2158 pp->p_cred = cr; 2159 mutex_exit(&pp->p_crlock); 2160 crfree(oldcred); 2161 2162 /* 2163 * Hold credentials again (for thread) 2164 */ 2165 crhold(cr); 2166 2167 /* 2168 * p_lwpcnt can't change since this is a kernel process. 2169 */ 2170 crset(pp, cr); 2171 2172 /* 2173 * Chroot 2174 */ 2175 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp); 2176 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp); 2177 2178 /* 2179 * Initialize zone's rctl set. 2180 */ 2181 set = rctl_set_create(); 2182 gp = rctl_set_init_prealloc(RCENTITY_ZONE); 2183 mutex_enter(&pp->p_lock); 2184 e.rcep_p.zone = zone; 2185 e.rcep_t = RCENTITY_ZONE; 2186 zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp); 2187 mutex_exit(&pp->p_lock); 2188 rctl_prealloc_destroy(gp); 2189 2190 /* 2191 * Apply the rctls passed in to zone_create(). This is basically a list 2192 * assignment: all of the old values are removed and the new ones 2193 * inserted. That is, if an empty list is passed in, all values are 2194 * removed.
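 *
 * For example (illustrative): if the incoming list carries a single
 * privileged value for "zone.cpu-shares", the loop below first
 * deletes every existing non-system "zone.cpu-shares" value and then
 * inserts the new one, so the new setting replaces, rather than
 * augments, whatever was there before.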
2195 */ 2196 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 2197 rctl_dict_entry_t *rde; 2198 rctl_hndl_t hndl; 2199 char *name; 2200 nvlist_t **nvlarray; 2201 uint_t i, nelem; 2202 int error; /* For ASSERT()s */ 2203 2204 name = nvpair_name(nvp); 2205 hndl = rctl_hndl_lookup(name); 2206 ASSERT(hndl != -1); 2207 rde = rctl_dict_lookup_hndl(hndl); 2208 ASSERT(rde != NULL); 2209 2210 for (; /* ever */; ) { 2211 rctl_val_t oval; 2212 2213 mutex_enter(&pp->p_lock); 2214 error = rctl_local_get(hndl, NULL, &oval, pp); 2215 mutex_exit(&pp->p_lock); 2216 ASSERT(error == 0); /* Can't fail for RCTL_FIRST */ 2217 ASSERT(oval.rcv_privilege != RCPRIV_BASIC); 2218 if (oval.rcv_privilege == RCPRIV_SYSTEM) 2219 break; 2220 mutex_enter(&pp->p_lock); 2221 error = rctl_local_delete(hndl, &oval, pp); 2222 mutex_exit(&pp->p_lock); 2223 ASSERT(error == 0); 2224 } 2225 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 2226 ASSERT(error == 0); 2227 for (i = 0; i < nelem; i++) { 2228 rctl_val_t *nvalp; 2229 2230 nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); 2231 error = nvlist2rctlval(nvlarray[i], nvalp); 2232 ASSERT(error == 0); 2233 /* 2234 * rctl_local_insert can fail if the value being 2235 * inserted is a duplicate; this is OK. 2236 */ 2237 mutex_enter(&pp->p_lock); 2238 if (rctl_local_insert(hndl, nvalp, pp) != 0) 2239 kmem_cache_free(rctl_val_cache, nvalp); 2240 mutex_exit(&pp->p_lock); 2241 } 2242 } 2243 /* 2244 * Tell the world that we're done setting up. 2245 * 2246 * At this point we want to set the zone status to ZONE_IS_READY 2247 * and atomically set the zone's processor set visibility. Once 2248 * we drop pool_lock() this zone will automatically get updated 2249 * to reflect any future changes to the pools configuration. 2250 */ 2251 pool_lock(); 2252 mutex_enter(&cpu_lock); 2253 mutex_enter(&zonehash_lock); 2254 zone_uniqid(zone); 2255 zone_zsd_configure(zone); 2256 if (pool_state == POOL_ENABLED) 2257 zone_pset_set(zone, pool_default->pool_pset->pset_id); 2258 mutex_enter(&zone_status_lock); 2259 ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED); 2260 zone_status_set(zone, ZONE_IS_READY); 2261 mutex_exit(&zone_status_lock); 2262 mutex_exit(&zonehash_lock); 2263 mutex_exit(&cpu_lock); 2264 pool_unlock(); 2265 2266 /* 2267 * Once we see the zone transition to the ZONE_IS_BOOTING state, 2268 * we launch init, and set the state to running. 2269 */ 2270 zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched"); 2271 2272 if (zone_status_get(zone) == ZONE_IS_BOOTING) { 2273 id_t cid; 2274 2275 /* 2276 * Ok, this is a little complicated. We need to grab the 2277 * zone's pool's scheduling class ID; note that by now, we 2278 * are already bound to a pool if we need to be (zoneadmd 2279 * will have done that to us while we're in the READY 2280 * state). *But* the scheduling class for the zone's 'init' 2281 * must be explicitly passed to newproc, which doesn't 2282 * respect pool bindings. 2283 * 2284 * We hold the pool_lock across the call to newproc() to 2285 * close the obvious race: the pool's scheduling class 2286 * could change before we manage to create the LWP with 2287 * classid 'cid'. 2288 */ 2289 pool_lock(); 2290 cid = pool_get_class(zone->zone_pool); 2291 if (cid == -1) 2292 cid = defaultcid; 2293 2294 /* 2295 * If this fails, zone_boot will ultimately fail. The 2296 * state of the zone will be set to SHUTTING_DOWN-- userland 2297 * will have to tear down the zone, and fail, or try again. 
2298 */ 2299 if ((zone->zone_boot_err = newproc(zone_icode, NULL, cid, 2300 minclsyspri - 1, &ct)) != 0) { 2301 mutex_enter(&zone_status_lock); 2302 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); 2303 mutex_exit(&zone_status_lock); 2304 } 2305 pool_unlock(); 2306 } 2307 2308 /* 2309 * Wait for zone_destroy() to be called. This is what we spend 2310 * most of our life doing. 2311 */ 2312 zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched"); 2313 2314 if (ct) 2315 /* 2316 * At this point the process contract should be empty. 2317 * (Though if it isn't, it's not the end of the world.) 2318 */ 2319 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0); 2320 2321 /* 2322 * Allow kcred to be freed when all referring processes 2323 * (including this one) go away. We can't just do this in 2324 * zone_free because we need to wait for the zone_cred_ref to 2325 * drop to 0 before calling zone_free, and the existence of 2326 * zone_kcred will prevent that. Thus, we call crfree here to 2327 * balance the crdup in zone_create. The crhold calls earlier 2328 * in zsched will be dropped when the thread and process exit. 2329 */ 2330 crfree(zone->zone_kcred); 2331 zone->zone_kcred = NULL; 2332 2333 exit(CLD_EXITED, 0); 2334 } 2335 2336 /* 2337 * Helper function to determine if there are any submounts of the 2338 * provided path. Used to make sure the zone doesn't "inherit" any 2339 * mounts from before it is created. 2340 */ 2341 static uint_t 2342 zone_mount_count(const char *rootpath) 2343 { 2344 vfs_t *vfsp; 2345 uint_t count = 0; 2346 size_t rootpathlen = strlen(rootpath); 2347 2348 /* 2349 * Holding zonehash_lock prevents race conditions with 2350 * vfs_list_add()/vfs_list_remove() since we serialize with 2351 * zone_find_by_path(). 2352 */ 2353 ASSERT(MUTEX_HELD(&zonehash_lock)); 2354 /* 2355 * The rootpath must end with a '/' 2356 */ 2357 ASSERT(rootpath[rootpathlen - 1] == '/'); 2358 2359 /* 2360 * This intentionally does not count the rootpath itself if that 2361 * happens to be a mount point. 2362 */ 2363 vfs_list_read_lock(); 2364 vfsp = rootvfs; 2365 do { 2366 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt), 2367 rootpathlen) == 0) 2368 count++; 2369 vfsp = vfsp->vfs_next; 2370 } while (vfsp != rootvfs); 2371 vfs_list_unlock(); 2372 return (count); 2373 } 2374 2375 /* 2376 * Helper function to make sure that a zone created on 'rootpath' 2377 * wouldn't end up containing other zones' rootpaths. 2378 */ 2379 static boolean_t 2380 zone_is_nested(const char *rootpath) 2381 { 2382 zone_t *zone; 2383 size_t rootpathlen = strlen(rootpath); 2384 size_t len; 2385 2386 ASSERT(MUTEX_HELD(&zonehash_lock)); 2387 2388 for (zone = list_head(&zone_active); zone != NULL; 2389 zone = list_next(&zone_active, zone)) { 2390 if (zone == global_zone) 2391 continue; 2392 len = strlen(zone->zone_rootpath); 2393 if (strncmp(rootpath, zone->zone_rootpath, 2394 MIN(rootpathlen, len)) == 0) 2395 return (B_TRUE); 2396 } 2397 return (B_FALSE); 2398 } 2399 2400 static int 2401 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs) 2402 { 2403 priv_set_t *privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP); 2404 2405 if (copyin(zone_privs, privs, sizeof (priv_set_t))) { 2406 kmem_free(privs, sizeof (priv_set_t)); 2407 return (EFAULT); 2408 } 2409 2410 zone->zone_privset = privs; 2411 return (0); 2412 } 2413 2414 /* 2415 * We make creative use of nvlists to pass in rctls from userland. 
The list is 2416 * a list of the following structures: 2417 * 2418 * (name = rctl_name, value = nvpair_list_array) 2419 * 2420 * Where each element of the nvpair_list_array is of the form: 2421 * 2422 * [(name = "privilege", value = RCPRIV_PRIVILEGED), 2423 * (name = "limit", value = uint64_t), 2424 * (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))] 2425 */ 2426 static int 2427 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) 2428 { 2429 nvpair_t *nvp = NULL; 2430 nvlist_t *nvl = NULL; 2431 char *kbuf; 2432 int error; 2433 rctl_val_t rv; 2434 2435 *nvlp = NULL; 2436 2437 if (buflen == 0) 2438 return (0); 2439 2440 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 2441 return (ENOMEM); 2442 if (copyin(ubuf, kbuf, buflen)) { 2443 error = EFAULT; 2444 goto out; 2445 } 2446 if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) { 2447 /* 2448 * nvl may have been allocated/free'd, but the value set to 2449 * non-NULL, so we reset it here. 2450 */ 2451 nvl = NULL; 2452 error = EINVAL; 2453 goto out; 2454 } 2455 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 2456 rctl_dict_entry_t *rde; 2457 rctl_hndl_t hndl; 2458 nvlist_t **nvlarray; 2459 uint_t i, nelem; 2460 char *name; 2461 2462 error = EINVAL; 2463 name = nvpair_name(nvp); 2464 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) 2465 != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { 2466 goto out; 2467 } 2468 if ((hndl = rctl_hndl_lookup(name)) == -1) { 2469 goto out; 2470 } 2471 rde = rctl_dict_lookup_hndl(hndl); 2472 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 2473 ASSERT(error == 0); 2474 for (i = 0; i < nelem; i++) { 2475 if (error = nvlist2rctlval(nvlarray[i], &rv)) 2476 goto out; 2477 } 2478 if (rctl_invalid_value(rde, &rv)) { 2479 error = EINVAL; 2480 goto out; 2481 } 2482 } 2483 error = 0; 2484 *nvlp = nvl; 2485 out: 2486 kmem_free(kbuf, buflen); 2487 if (error && nvl != NULL) 2488 nvlist_free(nvl); 2489 return (error); 2490 } 2491 2492 int 2493 zone_create_error(int er_error, int er_ext, int *er_out) { 2494 if (er_out != NULL) { 2495 if (copyout(&er_ext, er_out, sizeof (int))) { 2496 return (set_errno(EFAULT)); 2497 } 2498 } 2499 return (set_errno(er_error)); 2500 } 2501 2502 /* 2503 * System call to create/initialize a new zone named 'zone_name', rooted 2504 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs', 2505 * and initialized with the zone-wide rctls described in 'rctlbuf'. 2506 * 2507 * If extended error is non-null, we may use it to return more detailed 2508 * error information. 
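 *
 * As a hedged illustration of the rctlbuf encoding documented above
 * parse_rctls() (the specific rctl and values here are examples, not
 * requirements), userland might build the buffer with libnvpair
 * roughly as follows:
 *
 *	nvlist_t *nvl, *val;
 *	char *buf = NULL;
 *	size_t buflen = 0;
 *
 *	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_alloc(&val, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_uint64(val, "privilege", RCPRIV_PRIVILEGED);
 *	(void) nvlist_add_uint64(val, "limit", 10);
 *	(void) nvlist_add_uint64(val, "action", RCTL_LOCAL_NOACTION);
 *	(void) nvlist_add_nvlist_array(nvl, "zone.cpu-shares", &val, 1);
 *	(void) nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_NATIVE, 0);
 *
 * and then pass 'buf' and 'buflen' as rctlbuf/rctlbufsz.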
2509 */ 2510 static zoneid_t 2511 zone_create(const char *zone_name, const char *zone_root, 2512 const priv_set_t *zone_privs, caddr_t rctlbuf, size_t rctlbufsz, 2513 int *extended_error) 2514 { 2515 struct zsched_arg zarg; 2516 nvlist_t *rctls = NULL; 2517 proc_t *pp = curproc; 2518 zone_t *zone, *ztmp; 2519 zoneid_t zoneid; 2520 int error; 2521 int error2 = 0; 2522 char *str; 2523 cred_t *zkcr; 2524 2525 if (secpolicy_zone_config(CRED()) != 0) 2526 return (set_errno(EPERM)); 2527 2528 /* can't boot zone from within chroot environment */ 2529 if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir) 2530 return (zone_create_error(ENOTSUP, ZE_CHROOTED, 2531 extended_error)); 2532 2533 zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); 2534 zoneid = zone->zone_id = id_alloc(zoneid_space); 2535 zone->zone_status = ZONE_IS_UNINITIALIZED; 2536 zone->zone_pool = pool_default; 2537 zone->zone_pool_mod = gethrtime(); 2538 zone->zone_psetid = ZONE_PS_INVAL; 2539 zone->zone_ncpus = 0; 2540 zone->zone_ncpus_online = 0; 2541 mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); 2542 mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); 2543 cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); 2544 list_create(&zone->zone_zsd, sizeof (struct zsd_entry), 2545 offsetof(struct zsd_entry, zsd_linkage)); 2546 2547 if ((error = zone_set_name(zone, zone_name)) != 0) { 2548 zone_free(zone); 2549 return (zone_create_error(error, 0, extended_error)); 2550 } 2551 2552 if ((error = zone_set_root(zone, zone_root)) != 0) { 2553 zone_free(zone); 2554 return (zone_create_error(error, 0, extended_error)); 2555 } 2556 if ((error = zone_set_privset(zone, zone_privs)) != 0) { 2557 zone_free(zone); 2558 return (zone_create_error(error, 0, extended_error)); 2559 } 2560 2561 /* initialize node name to be the same as zone name */ 2562 zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP); 2563 (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN); 2564 zone->zone_nodename[_SYS_NMLN - 1] = '\0'; 2565 2566 zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP); 2567 zone->zone_domain[0] = '\0'; 2568 zone->zone_shares = 1; 2569 zone->zone_bootargs = NULL; 2570 2571 /* 2572 * Zsched initializes the rctls. 2573 */ 2574 zone->zone_rctls = NULL; 2575 2576 if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { 2577 zone_free(zone); 2578 return (zone_create_error(error, 0, extended_error)); 2579 } 2580 2581 /* 2582 * Stop all lwps since that's what normally happens as part of fork(). 2583 * This needs to happen before we grab any locks to avoid deadlock 2584 * (another lwp in the process could be waiting for the held lock). 2585 */ 2586 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) { 2587 zone_free(zone); 2588 if (rctls) 2589 nvlist_free(rctls); 2590 return (zone_create_error(EINTR, 0, extended_error)); 2591 } 2592 2593 if (block_mounts() == 0) { 2594 mutex_enter(&pp->p_lock); 2595 if (curthread != pp->p_agenttp) 2596 continuelwps(pp); 2597 mutex_exit(&pp->p_lock); 2598 zone_free(zone); 2599 if (rctls) 2600 nvlist_free(rctls); 2601 return (zone_create_error(EINTR, 0, extended_error)); 2602 } 2603 2604 /* 2605 * Set up credential for kernel access. After this, any errors 2606 * should go through the dance in errout rather than calling 2607 * zone_free directly.
2608 */ 2609 zone->zone_kcred = crdup(kcred); 2610 crsetzone(zone->zone_kcred, zone); 2611 priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred)); 2612 priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred)); 2613 priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred)); 2614 priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred)); 2615 2616 mutex_enter(&zonehash_lock); 2617 /* 2618 * Make sure zone doesn't already exist. 2619 */ 2620 if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL) { 2621 zone_status_t status; 2622 2623 status = zone_status_get(ztmp); 2624 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING) 2625 error = EEXIST; 2626 else 2627 error = EBUSY; 2628 goto errout; 2629 } 2630 2631 /* 2632 * Don't allow zone creations which would cause one zone's rootpath to 2633 * be accessible from that of another (non-global) zone. 2634 */ 2635 if (zone_is_nested(zone->zone_rootpath)) { 2636 error = EBUSY; 2637 goto errout; 2638 } 2639 2640 ASSERT(zonecount != 0); /* check for leaks */ 2641 if (zonecount + 1 > maxzones) { 2642 error = ENOMEM; 2643 goto errout; 2644 } 2645 2646 if (zone_mount_count(zone->zone_rootpath) != 0) { 2647 error = EBUSY; 2648 error2 = ZE_AREMOUNTS; 2649 goto errout; 2650 } 2651 2652 /* 2653 * Zone is still incomplete, but we need to drop all locks while 2654 * zsched() initializes this zone's kernel process. We 2655 * optimistically add the zone to the hashtable and associated 2656 * lists so a parallel zone_create() doesn't try to create the 2657 * same zone. 2658 */ 2659 zonecount++; 2660 (void) mod_hash_insert(zonehashbyid, 2661 (mod_hash_key_t)(uintptr_t)zone->zone_id, 2662 (mod_hash_val_t)(uintptr_t)zone); 2663 str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP); 2664 (void) strcpy(str, zone->zone_name); 2665 (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str, 2666 (mod_hash_val_t)(uintptr_t)zone); 2667 /* 2668 * Insert into active list. At this point there are no 'hold's 2669 * on the zone, but everyone else knows not to use it, so we can 2670 * continue to use it. zsched() will do a zone_hold() if the 2671 * newproc() is successful. 2672 */ 2673 list_insert_tail(&zone_active, zone); 2674 mutex_exit(&zonehash_lock); 2675 2676 zarg.zone = zone; 2677 zarg.nvlist = rctls; 2678 /* 2679 * The process, task, and project rctls are probably wrong; 2680 * we need an interface to get the default values of all rctls, 2681 * and initialize zsched appropriately. I'm not sure that that 2682 * makes much of a difference, though. 2683 */ 2684 if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) { 2685 /* 2686 * We need to undo all globally visible state. 2687 */ 2688 mutex_enter(&zonehash_lock); 2689 list_remove(&zone_active, zone); 2690 (void) mod_hash_destroy(zonehashbyname, 2691 (mod_hash_key_t)(uintptr_t)zone->zone_name); 2692 (void) mod_hash_destroy(zonehashbyid, 2693 (mod_hash_key_t)(uintptr_t)zone->zone_id); 2694 ASSERT(zonecount > 1); 2695 zonecount--; 2696 goto errout; 2697 } 2698 2699 /* 2700 * Zone creation can't fail from now on. 2701 */ 2702 2703 /* 2704 * Let the other lwps continue. 2705 */ 2706 mutex_enter(&pp->p_lock); 2707 if (curthread != pp->p_agenttp) 2708 continuelwps(pp); 2709 mutex_exit(&pp->p_lock); 2710 2711 /* 2712 * Wait for zsched to finish initializing the zone. 2713 */ 2714 zone_status_wait(zone, ZONE_IS_READY); 2715 /* 2716 * The zone is fully visible, so we can let mounts progress. 
2717 */ 2718 resume_mounts(); 2719 if (rctls) 2720 nvlist_free(rctls); 2721 2722 return (zoneid); 2723 2724 errout: 2725 mutex_exit(&zonehash_lock); 2726 /* 2727 * Let the other lwps continue. 2728 */ 2729 mutex_enter(&pp->p_lock); 2730 if (curthread != pp->p_agenttp) 2731 continuelwps(pp); 2732 mutex_exit(&pp->p_lock); 2733 2734 resume_mounts(); 2735 if (rctls) 2736 nvlist_free(rctls); 2737 /* 2738 * There is currently one reference to the zone, a cred_ref from 2739 * zone_kcred. To free the zone, we call crfree, which will call 2740 * zone_cred_rele, which will call zone_free. 2741 */ 2742 ASSERT(zone->zone_cred_ref == 1); /* for zone_kcred */ 2743 ASSERT(zone->zone_kcred->cr_ref == 1); 2744 ASSERT(zone->zone_ref == 0); 2745 zkcr = zone->zone_kcred; 2746 zone->zone_kcred = NULL; 2747 crfree(zkcr); /* triggers call to zone_free */ 2748 return (zone_create_error(error, error2, extended_error)); 2749 } 2750 2751 /* 2752 * Cause the zone to boot. This is pretty simple, since we let zoneadmd do 2753 * the heavy lifting. 2754 */ 2755 static int 2756 zone_boot(zoneid_t zoneid, const char *bootargs) 2757 { 2758 int err; 2759 zone_t *zone; 2760 2761 if (secpolicy_zone_config(CRED()) != 0) 2762 return (set_errno(EPERM)); 2763 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 2764 return (set_errno(EINVAL)); 2765 2766 mutex_enter(&zonehash_lock); 2767 /* 2768 * Look for zone under hash lock to prevent races with calls to 2769 * zone_shutdown, zone_destroy, etc. 2770 */ 2771 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 2772 mutex_exit(&zonehash_lock); 2773 return (set_errno(EINVAL)); 2774 } 2775 2776 if ((err = zone_set_bootargs(zone, bootargs)) != 0) { 2777 mutex_exit(&zonehash_lock); 2778 return (set_errno(err)); 2779 } 2780 2781 mutex_enter(&zone_status_lock); 2782 if (zone_status_get(zone) != ZONE_IS_READY) { 2783 mutex_exit(&zone_status_lock); 2784 mutex_exit(&zonehash_lock); 2785 return (set_errno(EINVAL)); 2786 } 2787 zone_status_set(zone, ZONE_IS_BOOTING); 2788 mutex_exit(&zone_status_lock); 2789 2790 zone_hold(zone); /* so we can use the zone_t later */ 2791 mutex_exit(&zonehash_lock); 2792 2793 if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) { 2794 zone_rele(zone); 2795 return (set_errno(EINTR)); 2796 } 2797 2798 /* 2799 * Boot (starting init) might have failed, in which case the zone 2800 * will go to the SHUTTING_DOWN state; an appropriate errno will 2801 * be placed in zone->zone_boot_err, and so we return that. 2802 */ 2803 err = zone->zone_boot_err; 2804 zone_rele(zone); 2805 return (err ? set_errno(err) : 0); 2806 } 2807 2808 /* 2809 * Kills all user processes in the zone, waiting for them all to exit 2810 * before returning. 2811 */ 2812 static int 2813 zone_empty(zone_t *zone) 2814 { 2815 int waitstatus; 2816 2817 /* 2818 * We need to drop zonehash_lock before killing all 2819 * processes, otherwise we'll deadlock with zone_find_* 2820 * which can be called from the exit path. 2821 */ 2822 ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); 2823 while ((waitstatus = zone_status_timedwait_sig(zone, lbolt + hz, 2824 ZONE_IS_EMPTY)) == -1) { 2825 killall(zone->zone_id); 2826 } 2827 /* 2828 * return EINTR if we were signaled 2829 */ 2830 if (waitstatus == 0) 2831 return (EINTR); 2832 return (0); 2833 } 2834 2835 /* 2836 * Systemcall to start the zone's halt sequence. By the time this 2837 * function successfully returns, all user processes and kernel threads 2838 * executing in it will have exited, ZSD shutdown callbacks executed, 2839 * and the zone status set to ZONE_IS_DOWN. 
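 *
 * (Illustrative note: a typical administrative sequence is to call
 * zone_shutdown() and, only once it has returned successfully,
 * zone_destroy(); as described below, repeating zone_shutdown() on
 * the same zone simply waits for the remaining threads to drain.)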
2840 * 2841 * It is possible that the call will interrupt itself if the caller is the 2842 * parent of any process running in the zone, and doesn't have SIGCHLD blocked. 2843 */ 2844 static int 2845 zone_shutdown(zoneid_t zoneid) 2846 { 2847 int error; 2848 zone_t *zone; 2849 zone_status_t status; 2850 2851 if (secpolicy_zone_config(CRED()) != 0) 2852 return (set_errno(EPERM)); 2853 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 2854 return (set_errno(EINVAL)); 2855 2856 /* 2857 * Block mounts so that VFS_MOUNT() can get an accurate view of 2858 * the zone's status with regard to ZONE_IS_SHUTTING_DOWN. 2859 * 2860 * e.g. NFS can fail the mount if it determines that the zone 2861 * has already begun the shutdown sequence. 2862 */ 2863 if (block_mounts() == 0) 2864 return (set_errno(EINTR)); 2865 mutex_enter(&zonehash_lock); 2866 /* 2867 * Look for zone under hash lock to prevent races with other 2868 * calls to zone_shutdown and zone_destroy. 2869 */ 2870 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 2871 mutex_exit(&zonehash_lock); 2872 resume_mounts(); 2873 return (set_errno(EINVAL)); 2874 } 2875 mutex_enter(&zone_status_lock); 2876 status = zone_status_get(zone); 2877 /* 2878 * Fail if the zone isn't fully initialized yet. 2879 */ 2880 if (status < ZONE_IS_READY) { 2881 mutex_exit(&zone_status_lock); 2882 mutex_exit(&zonehash_lock); 2883 resume_mounts(); 2884 return (set_errno(EINVAL)); 2885 } 2886 /* 2887 * If conditions required for zone_shutdown() to return have been met, 2888 * return success. 2889 */ 2890 if (status >= ZONE_IS_DOWN) { 2891 mutex_exit(&zone_status_lock); 2892 mutex_exit(&zonehash_lock); 2893 resume_mounts(); 2894 return (0); 2895 } 2896 /* 2897 * If zone_shutdown() hasn't been called before, go through the motions. 2898 * If it has, there's nothing to do but wait for the kernel threads to 2899 * drain. 2900 */ 2901 if (status < ZONE_IS_EMPTY) { 2902 uint_t ntasks; 2903 2904 mutex_enter(&zone->zone_lock); 2905 if ((ntasks = zone->zone_ntasks) != 1) { 2906 /* 2907 * There's still stuff running. 2908 */ 2909 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); 2910 } 2911 mutex_exit(&zone->zone_lock); 2912 if (ntasks == 1) { 2913 /* 2914 * The only way to create another task is through 2915 * zone_enter(), which will block until we drop 2916 * zonehash_lock. The zone is empty. 2917 */ 2918 if (zone->zone_kthreads == NULL) { 2919 /* 2920 * Skip ahead to ZONE_IS_DOWN 2921 */ 2922 zone_status_set(zone, ZONE_IS_DOWN); 2923 } else { 2924 zone_status_set(zone, ZONE_IS_EMPTY); 2925 } 2926 } 2927 } 2928 zone_hold(zone); /* so we can use the zone_t later */ 2929 mutex_exit(&zone_status_lock); 2930 mutex_exit(&zonehash_lock); 2931 resume_mounts(); 2932 2933 if (error = zone_empty(zone)) { 2934 zone_rele(zone); 2935 return (set_errno(error)); 2936 } 2937 /* 2938 * After the zone status goes to ZONE_IS_DOWN this zone will no 2939 * longer be notified of changes to the pools configuration, so 2940 * in order to not end up with a stale pool pointer, we point 2941 * ourselves at the default pool and remove all resource 2942 * visibility. This is especially important as the zone_t may 2943 * languish on the deathrow for a very long time waiting for 2944 * cred's to drain out. 2945 * 2946 * This rebinding of the zone can happen multiple times 2947 * (presumably due to interrupted or parallel system calls) 2948 * without any adverse effects.
2949 */ 2950 if (pool_lock_intr() != 0) { 2951 zone_rele(zone); 2952 return (set_errno(EINTR)); 2953 } 2954 if (pool_state == POOL_ENABLED) { 2955 mutex_enter(&cpu_lock); 2956 zone_pool_set(zone, pool_default); 2957 /* 2958 * The zone no longer needs to be able to see any cpus. 2959 */ 2960 zone_pset_set(zone, ZONE_PS_INVAL); 2961 mutex_exit(&cpu_lock); 2962 } 2963 pool_unlock(); 2964 2965 /* 2966 * ZSD shutdown callbacks can be executed multiple times, hence 2967 * it is safe to not be holding any locks across this call. 2968 */ 2969 zone_zsd_callbacks(zone, ZSD_SHUTDOWN); 2970 2971 mutex_enter(&zone_status_lock); 2972 if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN) 2973 zone_status_set(zone, ZONE_IS_DOWN); 2974 mutex_exit(&zone_status_lock); 2975 2976 /* 2977 * Wait for kernel threads to drain. 2978 */ 2979 if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) { 2980 zone_rele(zone); 2981 return (set_errno(EINTR)); 2982 } 2983 zone_rele(zone); 2984 return (0); 2985 } 2986 2987 /* 2988 * Systemcall entry point to finalize the zone halt process. The caller 2989 * must have already successfully called zone_shutdown(). 2990 * 2991 * Upon successful completion, the zone will have been fully destroyed: 2992 * zsched will have exited, destructor callbacks executed, and the zone 2993 * removed from the list of active zones. 2994 */ 2995 static int 2996 zone_destroy(zoneid_t zoneid) 2997 { 2998 uint64_t uniqid; 2999 zone_t *zone; 3000 zone_status_t status; 3001 3002 if (secpolicy_zone_config(CRED()) != 0) 3003 return (set_errno(EPERM)); 3004 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 3005 return (set_errno(EINVAL)); 3006 3007 mutex_enter(&zonehash_lock); 3008 /* 3009 * Look for zone under hash lock to prevent races with other 3010 * calls to zone_destroy. 3011 */ 3012 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 3013 mutex_exit(&zonehash_lock); 3014 return (set_errno(EINVAL)); 3015 } 3016 3017 if (zone_mount_count(zone->zone_rootpath) != 0) { 3018 mutex_exit(&zonehash_lock); 3019 return (set_errno(EBUSY)); 3020 } 3021 mutex_enter(&zone_status_lock); 3022 status = zone_status_get(zone); 3023 if (status < ZONE_IS_DOWN) { 3024 mutex_exit(&zone_status_lock); 3025 mutex_exit(&zonehash_lock); 3026 return (set_errno(EBUSY)); 3027 } else if (status == ZONE_IS_DOWN) { 3028 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */ 3029 } 3030 mutex_exit(&zone_status_lock); 3031 zone_hold(zone); 3032 mutex_exit(&zonehash_lock); 3033 3034 /* 3035 * wait for zsched to exit 3036 */ 3037 zone_status_wait(zone, ZONE_IS_DEAD); 3038 zone_zsd_callbacks(zone, ZSD_DESTROY); 3039 uniqid = zone->zone_uniqid; 3040 zone_rele(zone); 3041 zone = NULL; /* potentially free'd */ 3042 3043 mutex_enter(&zonehash_lock); 3044 for (; /* ever */; ) { 3045 boolean_t unref; 3046 3047 if ((zone = zone_find_all_by_id(zoneid)) == NULL || 3048 zone->zone_uniqid != uniqid) { 3049 /* 3050 * The zone has gone away. Necessary conditions 3051 * are met, so we return success. 3052 */ 3053 mutex_exit(&zonehash_lock); 3054 return (0); 3055 } 3056 mutex_enter(&zone->zone_lock); 3057 unref = ZONE_IS_UNREF(zone); 3058 mutex_exit(&zone->zone_lock); 3059 if (unref) { 3060 /* 3061 * There is only one reference to the zone -- that 3062 * added when the zone was added to the hashtables -- 3063 * and things will remain this way until we drop 3064 * zonehash_lock... we can go ahead and clean up the 3065 * zone.
3066 */ 3067 break; 3068 } 3069 3070 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) { 3071 /* Signaled */ 3072 mutex_exit(&zonehash_lock); 3073 return (set_errno(EINTR)); 3074 } 3075 3076 } 3077 3078 /* 3079 * It is now safe to let the zone be recreated; remove it from the 3080 * lists. The memory will not be freed until the last cred 3081 * reference goes away. 3082 */ 3083 ASSERT(zonecount > 1); /* must be > 1; can't destroy global zone */ 3084 zonecount--; 3085 /* remove from active list and hash tables */ 3086 list_remove(&zone_active, zone); 3087 (void) mod_hash_destroy(zonehashbyname, 3088 (mod_hash_key_t)zone->zone_name); 3089 (void) mod_hash_destroy(zonehashbyid, 3090 (mod_hash_key_t)(uintptr_t)zone->zone_id); 3091 mutex_exit(&zonehash_lock); 3092 3093 /* add to deathrow list */ 3094 mutex_enter(&zone_deathrow_lock); 3095 list_insert_tail(&zone_deathrow, zone); 3096 mutex_exit(&zone_deathrow_lock); 3097 3098 /* 3099 * Drop last reference (which was added by zsched()), this will 3100 * free the zone unless there are outstanding cred references. 3101 */ 3102 zone_rele(zone); 3103 return (0); 3104 } 3105 3106 /* 3107 * Systemcall entry point for zone_getattr(2). 3108 */ 3109 static ssize_t 3110 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) 3111 { 3112 size_t size; 3113 int error = 0, err; 3114 zone_t *zone; 3115 char *zonepath; 3116 zone_status_t zone_status; 3117 pid_t initpid; 3118 boolean_t global = (curproc->p_zone == global_zone); 3119 3120 mutex_enter(&zonehash_lock); 3121 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 3122 mutex_exit(&zonehash_lock); 3123 return (set_errno(EINVAL)); 3124 } 3125 zone_status = zone_status_get(zone); 3126 if (zone_status < ZONE_IS_READY) { 3127 mutex_exit(&zonehash_lock); 3128 return (set_errno(EINVAL)); 3129 } 3130 zone_hold(zone); 3131 mutex_exit(&zonehash_lock); 3132 3133 /* 3134 * If not in the global zone, don't show information about other zones. 3135 */ 3136 if (!global && curproc->p_zone != zone) { 3137 zone_rele(zone); 3138 return (set_errno(EINVAL)); 3139 } 3140 3141 switch (attr) { 3142 case ZONE_ATTR_ROOT: 3143 if (global) { 3144 /* 3145 * Copy the path to trim the trailing "/" (except for 3146 * the global zone). 3147 */ 3148 if (zone != global_zone) 3149 size = zone->zone_rootpathlen - 1; 3150 else 3151 size = zone->zone_rootpathlen; 3152 zonepath = kmem_alloc(size, KM_SLEEP); 3153 bcopy(zone->zone_rootpath, zonepath, size); 3154 zonepath[size - 1] = '\0'; 3155 } else { 3156 /* 3157 * Caller is not in the global zone, just return 3158 * faked-up path for current zone. 3159 */ 3160 zonepath = "/"; 3161 size = 2; 3162 } 3163 if (bufsize > size) 3164 bufsize = size; 3165 if (buf != NULL) { 3166 err = copyoutstr(zonepath, buf, bufsize, NULL); 3167 if (err != 0 && err != ENAMETOOLONG) 3168 error = EFAULT; 3169 } 3170 if (global) 3171 kmem_free(zonepath, size); 3172 break; 3173 3174 case ZONE_ATTR_NAME: 3175 size = strlen(zone->zone_name) + 1; 3176 if (bufsize > size) 3177 bufsize = size; 3178 if (buf != NULL) { 3179 err = copyoutstr(zone->zone_name, buf, bufsize, NULL); 3180 if (err != 0 && err != ENAMETOOLONG) 3181 error = EFAULT; 3182 } 3183 break; 3184 3185 case ZONE_ATTR_STATUS: 3186 /* 3187 * Since we're not holding zonehash_lock, the zone status 3188 * may be anything; leave it up to userland to sort it out. 
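 *
 * For example, a caller could observe ZONE_IS_RUNNING here and have
 * the zone transition to ZONE_IS_SHUTTING_DOWN before the copyout
 * below completes, so the returned status is inherently advisory.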
3189 */ 3190 size = sizeof (zone_status); 3191 if (bufsize > size) 3192 bufsize = size; 3193 zone_status = zone_status_get(zone); 3194 if (buf != NULL && 3195 copyout(&zone_status, buf, bufsize) != 0) 3196 error = EFAULT; 3197 break; 3198 case ZONE_ATTR_PRIVSET: 3199 size = sizeof (priv_set_t); 3200 if (bufsize > size) 3201 bufsize = size; 3202 if (buf != NULL && 3203 copyout(zone->zone_privset, buf, bufsize) != 0) 3204 error = EFAULT; 3205 break; 3206 case ZONE_ATTR_UNIQID: 3207 size = sizeof (zone->zone_uniqid); 3208 if (bufsize > size) 3209 bufsize = size; 3210 if (buf != NULL && 3211 copyout(&zone->zone_uniqid, buf, bufsize) != 0) 3212 error = EFAULT; 3213 break; 3214 case ZONE_ATTR_POOLID: 3215 { 3216 pool_t *pool; 3217 poolid_t poolid; 3218 3219 if (pool_lock_intr() != 0) { 3220 error = EINTR; 3221 break; 3222 } 3223 pool = zone_pool_get(zone); 3224 poolid = pool->pool_id; 3225 pool_unlock(); 3226 size = sizeof (poolid); 3227 if (bufsize > size) 3228 bufsize = size; 3229 if (buf != NULL && copyout(&poolid, buf, bufsize) != 0) 3230 error = EFAULT; 3231 } 3232 break; 3233 case ZONE_ATTR_INITPID: 3234 size = sizeof (initpid); 3235 if (bufsize > size) 3236 bufsize = size; 3237 initpid = zone->zone_proc_initpid; 3238 if (initpid == -1) { 3239 error = ESRCH; 3240 break; 3241 } 3242 if (buf != NULL && 3243 copyout(&initpid, buf, bufsize) != 0) 3244 error = EFAULT; 3245 break; 3246 default: 3247 error = EINVAL; 3248 } 3249 zone_rele(zone); 3250 3251 if (error) 3252 return (set_errno(error)); 3253 return ((ssize_t)size); 3254 } 3255 3256 /* 3257 * Return zero if the process has at least one vnode mapped in to its 3258 * address space which shouldn't be allowed to change zones. 3259 */ 3260 static int 3261 as_can_change_zones(void) 3262 { 3263 proc_t *pp = curproc; 3264 struct seg *seg; 3265 struct as *as = pp->p_as; 3266 vnode_t *vp; 3267 int allow = 1; 3268 3269 ASSERT(pp->p_as != &kas); 3270 AS_LOCK_ENTER(&as, &as->a_lock, RW_READER); 3271 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 3272 /* 3273 * if we can't get a backing vnode for this segment then skip 3274 * it. 3275 */ 3276 vp = NULL; 3277 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL) 3278 continue; 3279 if (!vn_can_change_zones(vp)) { /* bail on first match */ 3280 allow = 0; 3281 break; 3282 } 3283 } 3284 AS_LOCK_EXIT(&as, &as->a_lock); 3285 return (allow); 3286 } 3287 3288 /* 3289 * Systemcall entry point for zone_enter(). 3290 * 3291 * The current process is injected into said zone. In the process 3292 * it will change its project membership, privileges, rootdir/cwd, 3293 * zone-wide rctls, and pool association to match those of the zone. 3294 * 3295 * The first zone_enter() called while the zone is in the ZONE_IS_READY 3296 * state will transition it to ZONE_IS_RUNNING. Processes may only 3297 * enter a zone that is "ready" or "running". 3298 */ 3299 static int 3300 zone_enter(zoneid_t zoneid) 3301 { 3302 zone_t *zone; 3303 vnode_t *vp; 3304 proc_t *pp = curproc; 3305 contract_t *ct; 3306 cont_process_t *ctp; 3307 task_t *tk, *oldtk; 3308 kproject_t *zone_proj0; 3309 cred_t *cr, *newcr; 3310 pool_t *oldpool, *newpool; 3311 sess_t *sp; 3312 uid_t uid; 3313 zone_status_t status; 3314 int err = 0; 3315 rctl_entity_p_t e; 3316 3317 if (secpolicy_zone_config(CRED()) != 0) 3318 return (set_errno(EPERM)); 3319 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 3320 return (set_errno(EINVAL)); 3321 3322 /* 3323 * Stop all lwps so we don't need to hold a lock to look at 3324 * curproc->p_zone.
This needs to happen before we grab any 3325 * locks to avoid deadlock (another lwp in the process could 3326 * be waiting for the held lock). 3327 */ 3328 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) 3329 return (set_errno(EINTR)); 3330 3331 /* 3332 * Make sure we're not changing zones with files open or mapped in 3333 * to our address space which shouldn't be changing zones. 3334 */ 3335 if (!files_can_change_zones()) { 3336 err = EBADF; 3337 goto out; 3338 } 3339 if (!as_can_change_zones()) { 3340 err = EFAULT; 3341 goto out; 3342 } 3343 3344 mutex_enter(&zonehash_lock); 3345 if (pp->p_zone != global_zone) { 3346 mutex_exit(&zonehash_lock); 3347 err = EINVAL; 3348 goto out; 3349 } 3350 3351 zone = zone_find_all_by_id(zoneid); 3352 if (zone == NULL) { 3353 mutex_exit(&zonehash_lock); 3354 err = EINVAL; 3355 goto out; 3356 } 3357 3358 /* 3359 * To prevent processes in a zone from holding contracts on 3360 * extrazonal resources, and to avoid process contract 3361 * memberships which span zones, contract holders and processes 3362 * which aren't the sole members of their encapsulating process 3363 * contracts are not allowed to zone_enter. 3364 */ 3365 ctp = pp->p_ct_process; 3366 ct = &ctp->conp_contract; 3367 mutex_enter(&ct->ct_lock); 3368 mutex_enter(&pp->p_lock); 3369 if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) { 3370 mutex_exit(&pp->p_lock); 3371 mutex_exit(&ct->ct_lock); 3372 mutex_exit(&zonehash_lock); 3373 /* no pool_unlock(): pool_lock isn't taken until later on */ 3374 err = EINVAL; 3375 goto out; 3376 } 3377 3378 /* 3379 * Moreover, we don't allow processes whose encapsulating 3380 * process contracts have inherited extrazonal contracts. 3381 * While it would be easier to eliminate all process contracts 3382 * with inherited contracts, we need to be able to give a 3383 * restarted init (or other zone-penetrating process) its 3384 * predecessor's contracts. 3385 */ 3386 if (ctp->conp_ninherited != 0) { 3387 contract_t *next; 3388 for (next = list_head(&ctp->conp_inherited); next; 3389 next = list_next(&ctp->conp_inherited, next)) { 3390 if (contract_getzuniqid(next) != zone->zone_uniqid) { 3391 mutex_exit(&pp->p_lock); 3392 mutex_exit(&ct->ct_lock); 3393 mutex_exit(&zonehash_lock); 3394 /* likewise, pool_lock isn't held here */ 3395 err = EINVAL; 3396 goto out; 3397 } 3398 } 3399 } 3400 mutex_exit(&pp->p_lock); 3401 mutex_exit(&ct->ct_lock); 3402 3403 status = zone_status_get(zone); 3404 if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) { 3405 /* 3406 * Can't join 3407 */ 3408 mutex_exit(&zonehash_lock); 3409 err = EINVAL; 3410 goto out; 3411 } 3412 3413 /* 3414 * Make sure new priv set is within the permitted set for caller 3415 */ 3416 if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) { 3417 mutex_exit(&zonehash_lock); 3418 err = EPERM; 3419 goto out; 3420 } 3421 /* 3422 * We want to momentarily drop zonehash_lock while we optimistically 3423 * bind curproc to the pool it should be running in. This is safe 3424 * since the zone can't disappear (we have a hold on it). 3425 */ 3426 zone_hold(zone); 3427 mutex_exit(&zonehash_lock); 3428 3429 /* 3430 * Grab pool_lock to keep the pools configuration from changing 3431 * and to stop ourselves from getting rebound to another pool 3432 * until we join the zone. 3433 */ 3434 if (pool_lock_intr() != 0) { 3435 zone_rele(zone); 3436 err = EINTR; 3437 goto out; 3438 } 3439 ASSERT(secpolicy_pool(CRED()) == 0); 3440 /* 3441 * Bind ourselves to the pool currently associated with the zone.
3442 */ 3443 oldpool = curproc->p_pool; 3444 newpool = zone_pool_get(zone); 3445 if (pool_state == POOL_ENABLED && newpool != oldpool && 3446 (err = pool_do_bind(newpool, P_PID, P_MYID, 3447 POOL_BIND_ALL)) != 0) { 3448 pool_unlock(); 3449 zone_rele(zone); 3450 goto out; 3451 } 3452 3453 /* 3454 * Grab cpu_lock now; we'll need it later when we call 3455 * task_join(). 3456 */ 3457 mutex_enter(&cpu_lock); 3458 mutex_enter(&zonehash_lock); 3459 /* 3460 * Make sure the zone hasn't moved on since we dropped zonehash_lock. 3461 */ 3462 if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) { 3463 /* 3464 * Can't join anymore. 3465 */ 3466 mutex_exit(&zonehash_lock); 3467 mutex_exit(&cpu_lock); 3468 if (pool_state == POOL_ENABLED && 3469 newpool != oldpool) 3470 (void) pool_do_bind(oldpool, P_PID, P_MYID, 3471 POOL_BIND_ALL); 3472 pool_unlock(); 3473 zone_rele(zone); 3474 err = EINVAL; 3475 goto out; 3476 } 3477 3478 mutex_enter(&pp->p_lock); 3479 zone_proj0 = zone->zone_zsched->p_task->tk_proj; 3480 /* verify that we do not exceed any task or lwp limits */ 3481 mutex_enter(&zone->zone_nlwps_lock); 3482 /* add new lwps to zone and zone's proj0 */ 3483 zone_proj0->kpj_nlwps += pp->p_lwpcnt; 3484 zone->zone_nlwps += pp->p_lwpcnt; 3485 /* add 1 task to zone's proj0 */ 3486 zone_proj0->kpj_ntasks += 1; 3487 mutex_exit(&pp->p_lock); 3488 mutex_exit(&zone->zone_nlwps_lock); 3489 3490 /* remove lwps from proc's old zone and old project */ 3491 mutex_enter(&pp->p_zone->zone_nlwps_lock); 3492 pp->p_zone->zone_nlwps -= pp->p_lwpcnt; 3493 pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt; 3494 mutex_exit(&pp->p_zone->zone_nlwps_lock); 3495 3496 /* 3497 * Joining the zone cannot fail from now on. 3498 * 3499 * This means that a lot of the following code can be commonized and 3500 * shared with zsched(). 3501 */ 3502 3503 /* 3504 * Reset the encapsulating process contract's zone. 3505 */ 3506 ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID); 3507 contract_setzuniqid(ct, zone->zone_uniqid); 3508 3509 /* 3510 * Create a new task and associate the process with the project keyed 3511 * by (projid,zoneid). 3512 * 3513 * We might as well be in project 0; the global zone's projid doesn't 3514 * make much sense in a zone anyhow. 3515 * 3516 * This also increments zone_ntasks, and returns with p_lock held. 3517 */ 3518 tk = task_create(0, zone); 3519 oldtk = task_join(tk, 0); 3520 mutex_exit(&cpu_lock); 3521 3522 pp->p_flag |= SZONETOP; 3523 pp->p_zone = zone; 3524 3525 /* 3526 * call RCTLOP_SET functions on this proc 3527 */ 3528 e.rcep_p.zone = zone; 3529 e.rcep_t = RCENTITY_ZONE; 3530 (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL, 3531 RCD_CALLBACK); 3532 mutex_exit(&pp->p_lock); 3533 3534 /* 3535 * We don't need to hold any of zsched's locks here; not only do we know 3536 * the process and zone aren't going away, we know its session isn't 3537 * changing either. 3538 * 3539 * By joining zsched's session here, we mimic the behavior in the 3540 * global zone of init's sid being the pid of sched. We extend this 3541 * to all zlogin-like zone_enter()'ing processes as well. 3542 */ 3543 mutex_enter(&pidlock); 3544 sp = zone->zone_zsched->p_sessp; 3545 SESS_HOLD(sp); 3546 mutex_enter(&pp->p_lock); 3547 pgexit(pp); 3548 SESS_RELE(pp->p_sessp); 3549 pp->p_sessp = sp; 3550 pgjoin(pp, zone->zone_zsched->p_pidp); 3551 mutex_exit(&pp->p_lock); 3552 mutex_exit(&pidlock); 3553 3554 mutex_exit(&zonehash_lock); 3555 /* 3556 * We're firmly in the zone; let pools progress.
3557 */ 3558 pool_unlock(); 3559 task_rele(oldtk); 3560 /* 3561 * We don't need to retain a hold on the zone since we already 3562 * incremented zone_ntasks, so the zone isn't going anywhere. 3563 */ 3564 zone_rele(zone); 3565 3566 /* 3567 * Chroot 3568 */ 3569 vp = zone->zone_rootvp; 3570 zone_chdir(vp, &PTOU(pp)->u_cdir, pp); 3571 zone_chdir(vp, &PTOU(pp)->u_rdir, pp); 3572 3573 /* 3574 * Change process credentials 3575 */ 3576 newcr = cralloc(); 3577 mutex_enter(&pp->p_crlock); 3578 cr = pp->p_cred; 3579 crcopy_to(cr, newcr); 3580 crsetzone(newcr, zone); 3581 pp->p_cred = newcr; 3582 3583 /* 3584 * Restrict all process privilege sets to zone limit 3585 */ 3586 priv_intersect(zone->zone_privset, &CR_PPRIV(newcr)); 3587 priv_intersect(zone->zone_privset, &CR_EPRIV(newcr)); 3588 priv_intersect(zone->zone_privset, &CR_IPRIV(newcr)); 3589 priv_intersect(zone->zone_privset, &CR_LPRIV(newcr)); 3590 mutex_exit(&pp->p_crlock); 3591 crset(pp, newcr); 3592 3593 /* 3594 * Adjust upcount to reflect zone entry. 3595 */ 3596 uid = crgetruid(newcr); 3597 mutex_enter(&pidlock); 3598 upcount_dec(uid, GLOBAL_ZONEID); 3599 upcount_inc(uid, zoneid); 3600 mutex_exit(&pidlock); 3601 3602 /* 3603 * Set up core file path and content. 3604 */ 3605 set_core_defaults(); 3606 3607 out: 3608 /* 3609 * Let the other lwps continue. 3610 */ 3611 mutex_enter(&pp->p_lock); 3612 if (curthread != pp->p_agenttp) 3613 continuelwps(pp); 3614 mutex_exit(&pp->p_lock); 3615 3616 return (err != 0 ? set_errno(err) : 0); 3617 } 3618 3619 /* 3620 * Systemcall entry point for zone_list(2). 3621 * 3622 * Processes running in a (non-global) zone only see themselves. 3623 */ 3624 static int 3625 zone_list(zoneid_t *zoneidlist, uint_t *numzones) 3626 { 3627 zoneid_t *zoneids; 3628 zone_t *zone; 3629 uint_t user_nzones, real_nzones; 3630 int error = 0; 3631 uint_t i; 3632 3633 if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0) 3634 return (set_errno(EFAULT)); 3635 3636 if (curproc->p_zone != global_zone) { 3637 /* just return current zone */ 3638 real_nzones = 1; 3639 zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP); 3640 zoneids[0] = curproc->p_zone->zone_id; 3641 } else { 3642 mutex_enter(&zonehash_lock); 3643 real_nzones = zonecount; 3644 if (real_nzones) { 3645 zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t), 3646 KM_SLEEP); 3647 i = 0; 3648 for (zone = list_head(&zone_active); zone != NULL; 3649 zone = list_next(&zone_active, zone)) 3650 zoneids[i++] = zone->zone_id; 3651 ASSERT(i == real_nzones); 3652 } 3653 mutex_exit(&zonehash_lock); 3654 } 3655 3656 if (user_nzones > real_nzones) 3657 user_nzones = real_nzones; 3658 3659 if (copyout(&real_nzones, numzones, sizeof (uint_t)) != 0) 3660 error = EFAULT; 3661 else if (zoneidlist != NULL && user_nzones != 0) { 3662 if (copyout(zoneids, zoneidlist, 3663 user_nzones * sizeof (zoneid_t)) != 0) 3664 error = EFAULT; 3665 } 3666 3667 if (real_nzones) 3668 kmem_free(zoneids, real_nzones * sizeof (zoneid_t)); 3669 3670 if (error) 3671 return (set_errno(error)); 3672 else 3673 return (0); 3674 } 3675 3676 /* 3677 * Systemcall entry point for zone_lookup(2). 3678 * 3679 * Non-global zones are only able to see themselves. 
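 *
 * For example (illustrative): zone_lookup("myzone") from the global
 * zone returns the zone's id once "myzone" is at least ZONE_IS_READY;
 * the same call from some other non-global zone fails with EINVAL,
 * and zone_lookup(NULL) always returns the caller's own zone id.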
3680 */ 3681 static zoneid_t 3682 zone_lookup(const char *zone_name) 3683 { 3684 char *kname; 3685 zone_t *zone; 3686 zoneid_t zoneid; 3687 int err; 3688 3689 if (zone_name == NULL) { 3690 /* return caller's zone id */ 3691 return (getzoneid()); 3692 } 3693 3694 kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP); 3695 if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) { 3696 kmem_free(kname, ZONENAME_MAX); 3697 return (set_errno(err)); 3698 } 3699 3700 mutex_enter(&zonehash_lock); 3701 zone = zone_find_all_by_name(kname); 3702 kmem_free(kname, ZONENAME_MAX); 3703 if (zone == NULL || zone_status_get(zone) < ZONE_IS_READY || 3704 (curproc->p_zone != global_zone && curproc->p_zone != zone)) { 3705 /* in non-global zone, can only lookup own name */ 3706 mutex_exit(&zonehash_lock); 3707 return (set_errno(EINVAL)); 3708 } 3709 zoneid = zone->zone_id; 3710 mutex_exit(&zonehash_lock); 3711 return (zoneid); 3712 } 3713 3714 /* ARGSUSED */ 3715 long 3716 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4, void *arg5) 3717 { 3718 zone_def zs; 3719 3720 switch (cmd) { 3721 case ZONE_CREATE: 3722 if (get_udatamodel() == DATAMODEL_NATIVE) { 3723 if (copyin(arg1, &zs, sizeof (zone_def))) { 3724 return (set_errno(EFAULT)); 3725 } 3726 } else { 3727 #ifdef _SYSCALL32_IMPL 3728 zone_def32 zs32; 3729 3730 if (copyin(arg1, &zs32, sizeof (zone_def32))) { 3731 return (set_errno(EFAULT)); 3732 } 3733 zs.zone_name = 3734 (const char *)(unsigned long)zs32.zone_name; 3735 zs.zone_root = 3736 (const char *)(unsigned long)zs32.zone_root; 3737 zs.zone_privs = 3738 (const struct priv_set *) 3739 (unsigned long)zs32.zone_privs; 3740 zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf; 3741 zs.rctlbufsz = zs32.rctlbufsz; 3742 zs.extended_error = 3743 (int *)(unsigned long)zs32.extended_error; 3744 #else 3745 panic("get_udatamodel() returned bogus result\n"); 3746 #endif 3747 } 3748 3749 return (zone_create(zs.zone_name, zs.zone_root, 3750 zs.zone_privs, (caddr_t)zs.rctlbuf, zs.rctlbufsz, 3751 zs.extended_error)); 3752 case ZONE_BOOT: 3753 return (zone_boot((zoneid_t)(uintptr_t)arg1, 3754 (const char *)arg2)); 3755 case ZONE_DESTROY: 3756 return (zone_destroy((zoneid_t)(uintptr_t)arg1)); 3757 case ZONE_GETATTR: 3758 return (zone_getattr((zoneid_t)(uintptr_t)arg1, 3759 (int)(uintptr_t)arg2, arg3, (size_t)arg4)); 3760 case ZONE_ENTER: 3761 return (zone_enter((zoneid_t)(uintptr_t)arg1)); 3762 case ZONE_LIST: 3763 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2)); 3764 case ZONE_SHUTDOWN: 3765 return (zone_shutdown((zoneid_t)(uintptr_t)arg1)); 3766 case ZONE_LOOKUP: 3767 return (zone_lookup((const char *)arg1)); 3768 default: 3769 return (set_errno(EINVAL)); 3770 } 3771 } 3772 3773 struct zarg { 3774 zone_t *zone; 3775 zone_cmd_arg_t arg; 3776 }; 3777 3778 static int 3779 zone_lookup_door(const char *zone_name, door_handle_t *doorp) 3780 { 3781 char *buf; 3782 size_t buflen; 3783 int error; 3784 3785 buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name); 3786 buf = kmem_alloc(buflen, KM_SLEEP); 3787 (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name); 3788 error = door_ki_open(buf, doorp); 3789 kmem_free(buf, buflen); 3790 return (error); 3791 } 3792 3793 static void 3794 zone_release_door(door_handle_t *doorp) 3795 { 3796 door_ki_rele(*doorp); 3797 *doorp = NULL; 3798 } 3799 3800 static void 3801 zone_ki_call_zoneadmd(struct zarg *zargp) 3802 { 3803 door_handle_t door = NULL; 3804 door_arg_t darg, save_arg; 3805 char *zone_name; 3806 size_t zone_namelen; 3807 zoneid_t zoneid; 3808 zone_t *zone; 3809 
zone_cmd_arg_t arg; 3810 uint64_t uniqid; 3811 size_t size; 3812 int error; 3813 int retry; 3814 3815 zone = zargp->zone; 3816 arg = zargp->arg; 3817 kmem_free(zargp, sizeof (*zargp)); 3818 3819 zone_namelen = strlen(zone->zone_name) + 1; 3820 zone_name = kmem_alloc(zone_namelen, KM_SLEEP); 3821 bcopy(zone->zone_name, zone_name, zone_namelen); 3822 zoneid = zone->zone_id; 3823 uniqid = zone->zone_uniqid; 3824 /* 3825 * zoneadmd may be down, but at least we can empty out the zone. 3826 * We can ignore the return value of zone_empty() since we're called 3827 * from a kernel thread and know we won't be delivered any signals. 3828 */ 3829 ASSERT(curproc == &p0); 3830 (void) zone_empty(zone); 3831 ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY); 3832 zone_rele(zone); 3833 3834 size = sizeof (arg); 3835 darg.rbuf = (char *)&arg; 3836 darg.data_ptr = (char *)&arg; 3837 darg.rsize = size; 3838 darg.data_size = size; 3839 darg.desc_ptr = NULL; 3840 darg.desc_num = 0; 3841 3842 save_arg = darg; 3843 /* 3844 * Since we're not holding a reference to the zone, any number of 3845 * things can go wrong, including the zone disappearing before we get a 3846 * chance to talk to zoneadmd. 3847 */ 3848 for (retry = 0; /* forever */; retry++) { 3849 if (door == NULL && 3850 (error = zone_lookup_door(zone_name, &door)) != 0) { 3851 goto next; 3852 } 3853 ASSERT(door != NULL); 3854 3855 if ((error = door_ki_upcall(door, &darg)) == 0) { 3856 break; 3857 } 3858 switch (error) { 3859 case EINTR: 3860 /* FALLTHROUGH */ 3861 case EAGAIN: /* process may be forking */ 3862 /* 3863 * Back off for a bit 3864 */ 3865 break; 3866 case EBADF: 3867 zone_release_door(&door); 3868 if (zone_lookup_door(zone_name, &door) != 0) { 3869 /* 3870 * zoneadmd may be dead, but it may come back to 3871 * life later. 3872 */ 3873 break; 3874 } 3875 break; 3876 default: 3877 cmn_err(CE_WARN, 3878 "zone_ki_call_zoneadmd: door_ki_upcall error %d\n", 3879 error); 3880 goto out; 3881 } 3882 next: 3883 /* 3884 * If this isn't the same zone_t that we originally had in mind, 3885 * then this is the same as if two kadmin requests come in at 3886 * the same time: the first one wins. This means we lose, so we 3887 * bail. 3888 */ 3889 if ((zone = zone_find_by_id(zoneid)) == NULL) { 3890 /* 3891 * Problem is solved. 3892 */ 3893 break; 3894 } 3895 if (zone->zone_uniqid != uniqid) { 3896 /* 3897 * zoneid recycled 3898 */ 3899 zone_rele(zone); 3900 break; 3901 } 3902 /* 3903 * We could zone_status_timedwait(), but there doesn't seem to 3904 * be much point in doing that (plus, it would mean that 3905 * zone_free() isn't called until this thread exits). 3906 */ 3907 zone_rele(zone); 3908 delay(hz); 3909 darg = save_arg; 3910 } 3911 out: 3912 if (door != NULL) { 3913 zone_release_door(&door); 3914 } 3915 kmem_free(zone_name, zone_namelen); 3916 thread_exit(); 3917 } 3918 3919 /* 3920 * Entry point for uadmin() to tell the zone to go away or reboot. The caller 3921 * is a process in the zone to be modified. 3922 * 3923 * In order to shut down the zone, we will hand off control to zoneadmd 3924 * (running in the global zone) via a door. We do a half-hearted job of 3925 * killing all processes in the zone, create a kernel thread to contact 3926 * zoneadmd, and make note of the "uniqid" of the zone. The uniqid is 3927 * a form of generation number used to let zoneadmd (as well as 3928 * zone_destroy()) know exactly which zone they're talking about.
3929 */ 3930 int 3931 zone_uadmin(int cmd, int fcn, cred_t *credp) 3932 { 3933 struct zarg *zargp; 3934 zone_cmd_t zcmd; 3935 zone_t *zone; 3936 3937 zone = curproc->p_zone; 3938 ASSERT(getzoneid() != GLOBAL_ZONEID); 3939 3940 switch (cmd) { 3941 case A_SHUTDOWN: 3942 switch (fcn) { 3943 case AD_HALT: 3944 case AD_POWEROFF: 3945 zcmd = Z_HALT; 3946 break; 3947 case AD_BOOT: 3948 zcmd = Z_REBOOT; 3949 break; 3950 case AD_IBOOT: 3951 case AD_SBOOT: 3952 case AD_SIBOOT: 3953 case AD_NOSYNC: 3954 return (ENOTSUP); 3955 default: 3956 return (EINVAL); 3957 } 3958 break; 3959 case A_REBOOT: 3960 zcmd = Z_REBOOT; 3961 break; 3962 case A_FTRACE: 3963 case A_REMOUNT: 3964 case A_FREEZE: 3965 case A_DUMP: 3966 return (ENOTSUP); 3967 default: 3968 ASSERT(cmd != A_SWAPCTL); /* handled by uadmin() */ 3969 return (EINVAL); 3970 } 3971 3972 if (secpolicy_zone_admin(credp, B_FALSE)) 3973 return (EPERM); 3974 mutex_enter(&zone_status_lock); 3975 /* 3976 * zone_status can't be ZONE_IS_EMPTY or higher since curproc 3977 * is in the zone. 3978 */ 3979 ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY); 3980 if (zone_status_get(zone) > ZONE_IS_RUNNING) { 3981 /* 3982 * This zone is already on its way down. 3983 */ 3984 mutex_exit(&zone_status_lock); 3985 return (0); 3986 } 3987 /* 3988 * Prevent future zone_enter()s 3989 */ 3990 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); 3991 mutex_exit(&zone_status_lock); 3992 3993 /* 3994 * Kill everyone now and call zoneadmd later. 3995 * zone_ki_call_zoneadmd() will do a more thorough job of this 3996 * later. 3997 */ 3998 killall(zone->zone_id); 3999 /* 4000 * Now, create the thread to contact zoneadmd and do the rest of the 4001 * work. This thread can't be created in our zone otherwise 4002 * zone_destroy() would deadlock. 4003 */ 4004 zargp = kmem_alloc(sizeof (*zargp), KM_SLEEP); 4005 zargp->arg.cmd = zcmd; 4006 zargp->arg.uniqid = zone->zone_uniqid; 4007 (void) strcpy(zargp->arg.locale, "C"); 4008 zone_hold(zargp->zone = zone); 4009 4010 (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0, 4011 TS_RUN, minclsyspri); 4012 exit(CLD_EXITED, 0); 4013 4014 return (EINVAL); 4015 } 4016 4017 /* 4018 * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's 4019 * status to ZONE_IS_SHUTTING_DOWN. 4020 */ 4021 void 4022 zone_shutdown_global(void) 4023 { 4024 ASSERT(curproc->p_zone == global_zone); 4025 4026 mutex_enter(&zone_status_lock); 4027 ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING); 4028 zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN); 4029 mutex_exit(&zone_status_lock); 4030 } 4031