/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Zones
 *
 *   A zone is a named collection of processes, namespace constraints,
 *   and other system resources which comprise a secure and manageable
 *   application containment facility.
 *
 *   Zones (represented by the reference-counted zone_t) are tracked in
 *   the kernel in the zonehash. Elsewhere in the kernel, Zone IDs
 *   (zoneid_t) are used to track zone association. Zone IDs are
 *   dynamically generated when the zone is created; if a persistent
 *   identifier is needed (core files, accounting logs, audit trail,
 *   etc.), the zone name should be used.
 *
 *
 *   Global Zone:
 *
 *   The global zone (zoneid 0) is automatically associated with all
 *   system resources that have not been bound to a user-created zone.
 *   This means that even systems where zones are not in active use
 *   have a global zone, and all processes, mounts, etc. are
 *   associated with that zone. The global zone is generally
 *   unconstrained in terms of privileges and access, though the usual
 *   credential and privilege based restrictions apply.
 *
 *
 *   Zone States:
 *
 *   The states a zone may be in, and the transitions between them, are
 *   as follows:
 *
 *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
 *   initialized zone is added to the list of active zones on the system but
 *   isn't accessible.
 *
 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 *   ready. The zone is made visible after the ZSD constructor callbacks are
 *   executed. A zone remains in this state until it transitions into
 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 *   init. Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 *   state.
 *
 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
 *   successfully started init. A zone remains in this state until
 *   zone_shutdown() is called.
 *
 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
 *   killing all processes running in the zone. The zone remains
 *   in this state until there are no more user processes running in the zone.
 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 *   Since zone_shutdown() is restartable, it may be called successfully
 *   multiple times for the same zone_t.
 *   Setting of the zone's state to ZONE_IS_SHUTTING_DOWN is synchronized
 *   with mounts, so VOP_MOUNT() may check the zone's status without
 *   worrying about it being a moving target.
 *
 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 *   are no more user processes in the zone. The zone remains in this
 *   state until there are no more kernel threads associated with the
 *   zone. zone_create(), zone_enter(), and zone_destroy() on this zone will
 *   fail.
 *
 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 *   have exited. zone_shutdown() returns. Henceforth it is not possible to
 *   join the zone or create kernel threads therein.
 *
 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 *   remains in this state until zsched exits. Calls to zone_find_by_*()
 *   return NULL from now on.
 *
 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0). There are no
 *   processes or threads doing work on behalf of the zone. The zone is
 *   removed from the list of active zones. zone_destroy() returns, and
 *   the zone can be recreated.
 *
 *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 *   callbacks are executed, and all memory associated with the zone is
 *   freed.
 *
 *   Threads can wait for the zone to enter a requested state by using
 *   zone_status_wait() or zone_status_timedwait() with the desired
 *   state passed in as an argument. Zone state transitions are
 *   uni-directional; it is not possible to move back to an earlier state.
 *
 *
 *   Zone-Specific Data:
 *
 *   Subsystems needing to maintain zone-specific data can store that
 *   data using the ZSD mechanism. This provides a zone-specific data
 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c). Also, ZSD can be used
 *   to register callbacks to be invoked when a zone is created, shut
 *   down, or destroyed. This can be used to initialize zone-specific
 *   data for new zones and to clean up when zones go away.
 *
 *
 *   Data Structures:
 *
 *   The per-zone structure (zone_t) is reference counted, and freed
 *   when all references are released. zone_hold and zone_rele can be
 *   used to adjust the reference count. In addition, reference counts
 *   associated with the cred_t structure are tracked separately using
 *   zone_cred_hold and zone_cred_rele.
 *
 *   Pointers to active zone_t's are stored in two hash tables; one
 *   for searching by id, the other for searching by name. Lookups
 *   can be performed on either basis, using zone_find_by_id and
 *   zone_find_by_name. Both return zone_t pointers with the zone
 *   held, so zone_rele should be called when the pointer is no longer
 *   needed. Zones can also be searched by path; zone_find_by_path
 *   returns the zone with which a path name is associated (global
 *   zone if the path is not within some other zone's file system
 *   hierarchy). This currently requires iterating through each zone,
 *   so it is slower than an id or name search via a hash table.
 *
 *
 *   Locking:
 *
 *   zonehash_lock: This is a top-level global lock used to protect the
 *       zone hash tables and lists. Zones cannot be created or destroyed
 *       while this lock is held.
 *   zone_status_lock: This is a global lock protecting zone state.
 *       Zones cannot change state while this lock is held.
 *       It also protects the list of kernel threads associated with a zone.
 *   zone_lock: This is a per-zone lock used to protect several fields of
 *       the zone_t (see <sys/zone.h> for details). In addition, holding
 *       this lock means that the zone cannot go away.
 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *       list (a list of zones in the ZONE_IS_DEAD state).
 *
 *   Ordering requirements:
 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 *   Blocking memory allocations are permitted while holding any of the
 *   zone locks.
 *
 *
 *   System Call Interface:
 *
 *   The zone subsystem can be managed and queried from user level with
 *   the following system calls (all subcodes of the primary "zone"
 *   system call):
 *   - zone_create: creates a zone with selected attributes (name,
 *     root path, privileges, resource controls)
 *   - zone_enter: allows the current process to enter a zone
 *   - zone_getattr: reports attributes of a zone
 *   - zone_list: lists all zones active in the system
 *   - zone_lookup: looks up zone id based on name
 *   - zone_shutdown: initiates shutdown process (see states above)
 *   - zone_destroy: completes shutdown process (see states above)
 *
 */

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/log.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/fs/snode.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/zone.h>

/*
 * cv used to signal that all references to the zone have been released. This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv. This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;	/* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys. We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early on in boot,
 * before it is fully initialized. All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;	/* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging. Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_nlwps;
/*
 * Synchronization primitives used to synchronize between mounts and zone
 * creation/destruction.
 */
static int mounts_in_progress;
static kcondvar_t mount_cv;
static kmutex_t mount_lock;

const char * const zone_initname = "/sbin/init";

static int zone_shutdown(zoneid_t zoneid);

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in. Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created such that
 * nfs_mount() thinks it is in the global zone, while by the time it
 * gets added to the list of mounted zones, it ends up on zoneA's mount
 * list.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone creation and new mounts.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone creations, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is unfair bias towards the
 * "current" operation. This means that zone creations may starve if
 * there is a rapid succession of new mounts coming in to the system, or
 * there is a remote possibility that zones will be created at such a
 * rate that new mounts will not be able to proceed.
 */
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT(). If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(void)
{
	int retval = 0;

	/*
	 * Since it may block for a long time, block_mounts() shouldn't be
	 * called with zonehash_lock held.
	 */
	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
	mutex_enter(&mount_lock);
	while (mounts_in_progress > 0) {
		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
			goto signaled;
	}
	/*
	 * A negative value of mounts_in_progress indicates that mounts
	 * have been blocked by (-mounts_in_progress) different callers.
	 */
	mounts_in_progress--;
	retval = 1;
signaled:
	mutex_exit(&mount_lock);
	return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(void)
{
	mutex_enter(&mount_lock);
	if (++mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * The VFS layer is busy with a mount; zones should wait until all
 * mounts are completed to progress.
 */
void
mount_in_progress(void)
{
	mutex_enter(&mount_lock);
	while (mounts_in_progress < 0)
		cv_wait(&mount_cv, &mount_lock);
	mounts_in_progress++;
	mutex_exit(&mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(void)
{
	mutex_enter(&mount_lock);
	if (--mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}
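
/*
 * Illustrative sketch (not part of the original file): the intended
 * pairing of the primitives above. A filesystem caller brackets its
 * VFS_MOUNT() work with mount_in_progress()/mount_completed(), while
 * zone creation brackets its critical region with block_mounts()/
 * resume_mounts(). fs_example_mount() and zone_example_create() are
 * hypothetical names.
 */
#if 0
static int
fs_example_mount(void)
{
	int error;

	mount_in_progress();	/* waits while mounts are blocked */
	error = 0;		/* ... the actual VFS_MOUNT() work ... */
	mount_completed();	/* wakes waiting block_mounts() callers */
	return (error);
}

static int
zone_example_create(void)
{
	if (block_mounts() == 0)
		return (EINTR);	/* interrupted by a signal */
	/* ... work that must not race with in-flight mounts ... */
	resume_mounts();
	return (0);
}
#endif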
/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shutdown, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock. The
 * constructor callback is called immediately on all existing zones, and a
 * copy of the ZSD entry added to the per-zone zone_zsd list (protected by
 * zone_lock). As this operation requires the list of zones, the list of
 * registered keys, and the per-zone list of ZSD entries to remain constant
 * throughout the entire operation, it must grab zonehash_lock, zone_lock for
 * all existing zones, and zsd_key_lock, in that order. Similar locking is
 * needed when zone_key_delete() is called. It is thus sufficient to hold
 * zsd_key_lock *or* zone_lock to prevent additions to or removals from the
 * per-zone zone_zsd list.
 *
 * Note that this implementation does not make a copy of the ZSD entry if a
 * constructor callback is not provided. A zone_getspecific() on such an
 * uninitialized ZSD entry will return NULL.
 *
 * When new zones are created, constructor callbacks for all registered ZSD
 * entries will be called.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */
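
/*
 * Illustrative sketch (not part of the original file): a subsystem
 * registering per-zone data. struct foo_data, foo_key, and the
 * callbacks are hypothetical; the create callback's return value
 * becomes the zone-specific data later passed to the shutdown and
 * destroy callbacks.
 */
#if 0
struct foo_data {
	uint_t foo_count;
};

static zone_key_t foo_key;

/*ARGSUSED*/
static void *
foo_zone_create(zoneid_t zoneid)
{
	return (kmem_zalloc(sizeof (struct foo_data), KM_SLEEP));
}

/*ARGSUSED*/
static void
foo_zone_destroy(zoneid_t zoneid, void *data)
{
	if (data != NULL)	/* destroy callbacks may see NULL data */
		kmem_free(data, sizeof (struct foo_data));
}

void
foo_init(void)
{
	zone_key_create(&foo_key, foo_zone_create, NULL, foo_zone_destroy);
}
#endif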
void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	struct zone *zone;

	zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP);
	zsdp->zsd_data = NULL;
	zsdp->zsd_create = create;
	zsdp->zsd_shutdown = shutdown;
	zsdp->zsd_destroy = destroy;

	mutex_enter(&zonehash_lock);	/* stop the world */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_enter(&zone->zone_lock);	/* lock all zones */

	mutex_enter(&zsd_key_lock);
	*keyp = zsdp->zsd_key = ++zsd_keyval;
	ASSERT(zsd_keyval != 0);
	list_insert_tail(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	if (create != NULL) {
		for (zone = list_head(&zone_active); zone != NULL;
		    zone = list_next(&zone_active, zone)) {
			t = kmem_alloc(sizeof (*t), KM_SLEEP);
			t->zsd_key = *keyp;
			t->zsd_data = (*create)(zone->zone_id);
			t->zsd_create = create;
			t->zsd_shutdown = shutdown;
			t->zsd_destroy = destroy;
			list_insert_tail(&zone->zone_zsd, t);
		}
	}
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_exit(&zone->zone_lock);
	mutex_exit(&zonehash_lock);
}

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
	struct zsd_entry *zsd;

	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
		if (zsd->zsd_key == key) {
			/*
			 * Move to head of list to keep list in MRU order.
			 */
			if (zsd != list_head(l)) {
				list_remove(l, zsd);
				list_insert_head(l, zsd);
			}
			return (zsd);
		}
	}
	return (NULL);
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
 */
int
zone_key_delete(zone_key_t key)
{
	struct zsd_entry *zsdp = NULL;
	zone_t *zone;

	mutex_enter(&zonehash_lock);	/* Zone create/delete waits for us */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_enter(&zone->zone_lock);	/* lock all zones */

	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find(&zsd_registered_keys, key);
	if (zsdp == NULL)
		goto notfound;
	list_remove(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		struct zsd_entry *del;
		void *data;

		if (!(zone->zone_flags & ZF_DESTROYED)) {
			del = zsd_find(&zone->zone_zsd, key);
			if (del != NULL) {
				data = del->zsd_data;
				ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
				ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
				list_remove(&zone->zone_zsd, del);
				kmem_free(del, sizeof (*del));
			} else {
				data = NULL;
			}
			if (zsdp->zsd_shutdown)
				zsdp->zsd_shutdown(zone->zone_id, data);
			if (zsdp->zsd_destroy)
				zsdp->zsd_destroy(zone->zone_id, data);
		}
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);
	kmem_free(zsdp, sizeof (*zsdp));
	return (0);

notfound:
	mutex_exit(&zsd_key_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_exit(&zone->zone_lock);
	mutex_exit(&zonehash_lock);
	return (-1);
}
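
/*
 * Illustrative sketch (not part of the original file): the counterpart
 * to the registration sketch above; a module would typically call this
 * from its _fini() routine. foo_key and foo_fini() are hypothetical.
 */
#if 0
void
foo_fini(void)
{
	(void) zone_key_delete(foo_key);	/* runs remaining callbacks */
}
#endif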
/*
 * ZSD counterpart of pthread_setspecific().
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
	struct zsd_entry *t;
	struct zsd_entry *zsdp = NULL;

	mutex_enter(&zone->zone_lock);
	t = zsd_find(&zone->zone_zsd, key);
	if (t != NULL) {
		/*
		 * Replace old value with new
		 */
		t->zsd_data = (void *)data;
		mutex_exit(&zone->zone_lock);
		return (0);
	}
	/*
	 * If there was no previous value, go through the list of registered
	 * keys.
	 *
	 * We avoid grabbing zsd_key_lock until we are sure we need it; this is
	 * necessary for shutdown callbacks to be able to execute without fear
	 * of deadlock.
	 */
	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find(&zsd_registered_keys, key);
	if (zsdp == NULL) {	/* Key was not registered */
		mutex_exit(&zsd_key_lock);
		mutex_exit(&zone->zone_lock);
		return (-1);
	}

	/*
	 * Add a zsd_entry to this zone, using the template we just retrieved
	 * to initialize the constructor and destructor(s).
	 */
	t = kmem_alloc(sizeof (*t), KM_SLEEP);
	t->zsd_key = key;
	t->zsd_data = (void *)data;
	t->zsd_create = zsdp->zsd_create;
	t->zsd_shutdown = zsdp->zsd_shutdown;
	t->zsd_destroy = zsdp->zsd_destroy;
	list_insert_tail(&zone->zone_zsd, t);
	mutex_exit(&zsd_key_lock);
	mutex_exit(&zone->zone_lock);
	return (0);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
	struct zsd_entry *t;
	void *data;

	mutex_enter(&zone->zone_lock);
	t = zsd_find(&zone->zone_zsd, key);
	data = (t == NULL ? NULL : t->zsd_data);
	mutex_exit(&zone->zone_lock);
	return (data);
}
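
/*
 * Illustrative sketch (not part of the original file): retrieving and
 * replacing the per-zone data for a registered key. foo_key and struct
 * foo_data are the hypothetical names from the sketches above; note the
 * block comment earlier about the lack of "test-and-set" atomicity
 * between the two calls.
 */
#if 0
static struct foo_data *
foo_get(zone_t *zone)
{
	return (zone_getspecific(foo_key, zone));
}

static int
foo_replace(zone_t *zone, struct foo_data *new)
{
	return (zone_setspecific(foo_key, zone, new));
}
#endif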
/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created. The callbacks are initialized from
 * the template list (zsd_registered_keys), and the constructor
 * callback executed (if one exists).
 *
 * This is called before the zone is made publicly available, hence no
 * need to grab zone_lock.
 *
 * Although we grab and release zsd_key_lock, new entries cannot be
 * added to or removed from the zsd_registered_keys list until we
 * release zonehash_lock, so there isn't a window for a
 * zone_key_create() to come in after we've dropped zsd_key_lock but
 * before the zone is added to the zone list, such that the constructor
 * callbacks aren't executed for the new zone.
 */
static void
zone_zsd_configure(zone_t *zone)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	zoneid_t zoneid = zone->zone_id;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	ASSERT(list_head(&zone->zone_zsd) == NULL);
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		if (zsdp->zsd_create != NULL) {
			t = kmem_alloc(sizeof (*t), KM_SLEEP);
			t->zsd_key = zsdp->zsd_key;
			t->zsd_create = zsdp->zsd_create;
			t->zsd_data = (*t->zsd_create)(zoneid);
			t->zsd_shutdown = zsdp->zsd_shutdown;
			t->zsd_destroy = zsdp->zsd_destroy;
			list_insert_tail(&zone->zone_zsd, t);
		}
	}
	mutex_exit(&zsd_key_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	zoneid_t zoneid = zone->zone_id;

	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

	mutex_enter(&zone->zone_lock);
	if (ct == ZSD_DESTROY) {
		if (zone->zone_flags & ZF_DESTROYED) {
			/*
			 * Make sure destructors are only called once.
			 */
			mutex_exit(&zone->zone_lock);
			return;
		}
		zone->zone_flags |= ZF_DESTROYED;
	}
	mutex_exit(&zone->zone_lock);

	/*
	 * Both zsd_key_lock and zone_lock need to be held in order to add or
	 * remove a ZSD key (either globally as part of
	 * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is
	 * possible through zone_setspecific()), so it's sufficient to hold
	 * zsd_key_lock here.
	 *
	 * This is a good thing, since we don't want to recursively try to grab
	 * zone_lock if a callback attempts to do something like a crfree() or
	 * zone_rele().
	 */
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		zone_key_t key = zsdp->zsd_key;

		/* Skip if no callbacks registered */
		if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL)
			continue;
		if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL)
			continue;
		/*
		 * Call the callback with the zone-specific data if we can find
		 * any, otherwise with NULL.
		 */
		t = zsd_find(&zone->zone_zsd, key);
		if (t != NULL) {
			if (ct == ZSD_SHUTDOWN) {
				t->zsd_shutdown(zoneid, t->zsd_data);
			} else {
				ASSERT(ct == ZSD_DESTROY);
				t->zsd_destroy(zoneid, t->zsd_data);
			}
		} else {
			if (ct == ZSD_SHUTDOWN) {
				zsdp->zsd_shutdown(zoneid, NULL);
			} else {
				ASSERT(ct == ZSD_DESTROY);
				zsdp->zsd_destroy(zoneid, NULL);
			}
		}
	}
	mutex_exit(&zsd_key_lock);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
	struct zsd_entry *t, *next;

	/*
	 * Free all the zsd_entry's we had on this zone.
	 */
	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
		next = list_next(&zone->zone_zsd, t);
		list_remove(&zone->zone_zsd, t);
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_zsd);
}

/*
 * zone.cpu-shares resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	return (p->p_zone->zone_shares);
}

/*ARGSUSED*/
static int
zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);

	e->rcep_p.zone->zone_shares = nv;
	return (0);
}

static rctl_ops_t zone_cpu_shares_ops = {
	rcop_no_action,
	zone_cpu_shares_usage,
	zone_cpu_shares_set,
	rcop_no_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
	rctl_qty_t nlwps;
	zone_t *zone = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));

	mutex_enter(&zone->zone_nlwps_lock);
	nlwps = zone->zone_nlwps;
	mutex_exit(&zone->zone_nlwps_lock);

	return (nlwps);
}

/*ARGSUSED*/
static int
zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t nlwps;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
	nlwps = e->rcep_p.zone->zone_nlwps;

	if (nlwps + incr > rcntl->rcv_value)
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_nlwps_ctl = nv;
	return (0);
}

static rctl_ops_t zone_lwps_ops = {
	rcop_no_action,
	zone_lwps_usage,
	zone_lwps_set,
	zone_lwps_test,
};

/*
 * Helper function to brand the zone with a unique ID.
 */
static void
zone_uniqid(zone_t *zone)
{
	static uint64_t uniqid = 0;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	zone->zone_uniqid = uniqid++;
}

/*
 * Returns a held pointer to the "kcred" for the specified zone.
 */
struct cred *
zone_get_kcred(zoneid_t zoneid)
{
	zone_t *zone;
	cred_t *cr;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (NULL);
	cr = zone->zone_kcred;
	crhold(cr);
	zone_rele(zone);
	return (cr);
}
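
/*
 * Illustrative sketch (not part of the original file): a caller that
 * needs to act with a zone's kernel credentials. The hold taken by
 * zone_get_kcred() must be dropped with crfree().
 * zone_example_kcred_user() is a hypothetical name.
 */
#if 0
static void
zone_example_kcred_user(zoneid_t zoneid)
{
	cred_t *cr;

	if ((cr = zone_get_kcred(zoneid)) == NULL)
		return;		/* no such (visible) zone */
	/* ... use cr for credential checks or I/O on the zone's behalf ... */
	crfree(cr);
}
#endif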
/*
 * Called very early on in boot to initialize the ZSD list so that
 * zone_key_create() can be called before zone_init(). It also initializes
 * portions of zone0 which may be used before zone_init() is called. The
 * variable "global_zone" will be set when zone0 is fully initialized by
 * zone_init().
 */
void
zone_zsd_init(void)
{
	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_create(&zone_active, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));
	list_create(&zone_deathrow, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));

	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
	zone0.zone_shares = 1;
	zone0.zone_nlwps_ctl = INT_MAX;
	zone0.zone_name = GLOBAL_ZONENAME;
	zone0.zone_nodename = utsname.nodename;
	zone0.zone_domain = srpc_domain;
	zone0.zone_ref = 1;
	zone0.zone_id = GLOBAL_ZONEID;
	zone0.zone_status = ZONE_IS_RUNNING;
	zone0.zone_rootpath = "/";
	zone0.zone_rootpathlen = 2;
	zone0.zone_psetid = ZONE_PS_INVAL;
	zone0.zone_ncpus = 0;
	zone0.zone_ncpus_online = 0;
	zone0.zone_proc_initpid = 1;
	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_insert_head(&zone_active, &zone0);

	/*
	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
	 * to anything meaningful. It is assigned to be 'rootdir' in
	 * vfs_mountroot().
	 */
	zone0.zone_rootvp = NULL;
	zone0.zone_vfslist = NULL;
	zone0.zone_bootargs = NULL;
	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
	/*
	 * The global zone has all privileges
	 */
	priv_fillset(zone0.zone_privset);
	/*
	 * Add p0 to the global zone
	 */
	zone0.zone_zsched = &p0;
	p0.p_zone = &zone0;
}

/*
 * Called by main() to initialize the zones framework.
 */
void
zone_init(void)
{
	rctl_dict_entry_t *rde;
	rctl_val_t *dval;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;

	ASSERT(curproc == &p0);

	/*
	 * Create ID space for zone IDs. ID 0 is reserved for the
	 * global zone.
	 */
	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);

	/*
	 * Initialize generic zone resource controls, if any.
	 */
	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, FSS_MAXSHARES, FSS_MAXSHARES,
	    &zone_cpu_shares_ops);

	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_lwps_ops);
	/*
	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach
	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
	 */
	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
	bzero(dval, sizeof (rctl_val_t));
	dval->rcv_value = 1;
	dval->rcv_privilege = RCPRIV_PRIVILEGED;
	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
	dval->rcv_action_recip_pid = -1;

	rde = rctl_dict_lookup("zone.cpu-shares");
	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);

	/*
	 * Initialize the ``global zone''.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&p0.p_lock);
	e.rcep_p.zone = &zone0;
	e.rcep_t = RCENTITY_ZONE;
	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
	    gp);

	zone0.zone_nlwps = p0.p_lwpcnt;
	zone0.zone_ntasks = 1;
	mutex_exit(&p0.p_lock);
	rctl_prealloc_destroy(gp);
	/*
	 * pool_default hasn't been initialized yet, so we let pool_init()
	 * take care of making sure the global zone is in the default pool.
	 */
	mutex_enter(&zonehash_lock);
	zone_uniqid(&zone0);
	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
	mutex_exit(&zonehash_lock);
	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
	    mod_hash_null_valdtor);
	zonehashbyname = mod_hash_create_strhash("zone_by_name",
	    zone_hash_size, mod_hash_null_valdtor);
	zonecount = 1;

	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
	    (mod_hash_val_t)&zone0);
	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
	    (mod_hash_val_t)&zone0);
	/*
	 * We avoid setting zone_kcred until now, since kcred is initialized
	 * sometime after zone_zsd_init() and before zone_init().
	 */
	zone0.zone_kcred = kcred;
	/*
	 * The global zone is fully initialized (except for zone_rootvp which
	 * will be set when the root filesystem is mounted).
	 */
	global_zone = &zone0;
}

static void
zone_free(zone_t *zone)
{
	ASSERT(zone != global_zone);
	ASSERT(zone->zone_ntasks == 0);
	ASSERT(zone->zone_nlwps == 0);
	ASSERT(zone->zone_cred_ref == 0);
	ASSERT(zone->zone_kcred == NULL);
	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);

	/* remove from deathrow list */
	if (zone_status_get(zone) == ZONE_IS_DEAD) {
		ASSERT(zone->zone_ref == 0);
		mutex_enter(&zone_deathrow_lock);
		list_remove(&zone_deathrow, zone);
		mutex_exit(&zone_deathrow_lock);
	}

	zone_free_zsd(zone);

	if (zone->zone_rootvp != NULL)
		VN_RELE(zone->zone_rootvp);
	if (zone->zone_rootpath)
		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
	if (zone->zone_name != NULL)
		kmem_free(zone->zone_name, ZONENAME_MAX);
	if (zone->zone_nodename != NULL)
		kmem_free(zone->zone_nodename, _SYS_NMLN);
	if (zone->zone_domain != NULL)
		kmem_free(zone->zone_domain, _SYS_NMLN);
	if (zone->zone_privset != NULL)
		kmem_free(zone->zone_privset, sizeof (priv_set_t));
	if (zone->zone_rctls != NULL)
		rctl_set_free(zone->zone_rctls);
	if (zone->zone_bootargs != NULL)
		kmem_free(zone->zone_bootargs, ZONEBOOTARGS_MAX);
	id_free(zoneid_space, zone->zone_id);
	mutex_destroy(&zone->zone_lock);
	cv_destroy(&zone->zone_cv);
	kmem_free(zone, sizeof (zone_t));
}

/*
 * See block comment at the top of this file for information about zone
 * status values.
 */
/*
 * Convenience function for setting zone status.
 */
static void
zone_status_set(zone_t *zone, zone_status_t status)
{
	ASSERT(MUTEX_HELD(&zone_status_lock));
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
	    status >= zone_status_get(zone));
	zone->zone_status = status;
	cv_broadcast(&zone->zone_cv);
}

/*
 * Public function to retrieve the zone status. The zone status may
 * change after it is retrieved.
 */
zone_status_t
zone_status_get(zone_t *zone)
{
	return (zone->zone_status);
}

static int
zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
{
	char *bootargs = kmem_zalloc(ZONEBOOTARGS_MAX, KM_SLEEP);
	size_t len;
	int err;

	err = copyinstr(zone_bootargs, bootargs, ZONEBOOTARGS_MAX - 1, &len);
	if (err != 0) {
		kmem_free(bootargs, ZONEBOOTARGS_MAX);
		return (err);	/* EFAULT or ENAMETOOLONG */
	}
	bootargs[len] = '\0';

	ASSERT(zone->zone_bootargs == NULL);
	zone->zone_bootargs = bootargs;
	return (0);
}

/*
 * Block indefinitely waiting for (zone_status >= status)
 */
void
zone_status_wait(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		cv_wait(&zone->zone_cv, &zone_status_lock);
	}
	mutex_exit(&zone_status_lock);
}

/*
 * Private CPR-safe version of zone_status_wait().
 */
static void
zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
{
	callb_cpr_t cprinfo;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
	    str);
	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&zone->zone_cv, &zone_status_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
	}
	/*
	 * zone_status_lock is implicitly released by the following.
	 */
	CALLB_CPR_EXIT(&cprinfo);
}

/*
 * Block until zone enters requested state or signal is received. Return (0)
 * if signaled, non-zero otherwise.
 */
int
zone_status_wait_sig(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
			mutex_exit(&zone_status_lock);
			return (0);
		}
	}
	mutex_exit(&zone_status_lock);
	return (1);
}

/*
 * Block until the zone enters the requested state or the timeout expires,
 * whichever happens first. Return (-1) if operation timed out, time remaining
 * otherwise.
 */
clock_t
zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
{
	clock_t timeleft = 0;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status && timeleft != -1) {
		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
	}
	mutex_exit(&zone_status_lock);
	return (timeleft);
}
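
/*
 * Illustrative sketch (not part of the original file): blocking until a
 * zone has finished booting, with a timeout. The five-second deadline
 * and zone_example_wait_running() are hypothetical; note that the 'tim'
 * argument is an absolute time expressed in ticks.
 */
#if 0
static int
zone_example_wait_running(zone_t *zone)
{
	if (zone_status_timedwait(zone, lbolt + 5 * hz,
	    ZONE_IS_RUNNING) == -1)
		return (ETIME);	/* zone did not come up in time */
	return (0);
}
#endif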
/*
 * Block until the zone enters the requested state, the current process is
 * signaled, or the timeout expires, whichever happens first. Return (-1) if
 * operation timed out, 0 if signaled, time remaining otherwise.
 */
clock_t
zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
{
	clock_t timeleft = tim - lbolt;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
		    tim);
		if (timeleft <= 0)
			break;
	}
	mutex_exit(&zone_status_lock);
	return (timeleft);
}

/*
 * Zones have two reference counts: one for references from credential
 * structures (zone_cred_ref), and one (zone_ref) for everything else.
 * This is so we can allow a zone to be rebooted while there are still
 * outstanding cred references, since certain drivers cache dblks (which
 * implicitly results in cached creds). We wait for zone_ref to drop to
 * 0 (actually 1), but not zone_cred_ref. The zone structure itself is
 * later freed when the zone_cred_ref drops to 0, though nothing other
 * than the zone id and privilege set should be accessed once the zone
 * is "dead".
 *
 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
 * to force halt/reboot to block waiting for the zone_cred_ref to drop
 * to 0. This can be useful to flush out other sources of cached creds
 * that may be less innocuous than the driver case.
 */

int zone_wait_for_cred = 0;

static void
zone_hold_locked(zone_t *z)
{
	ASSERT(MUTEX_HELD(&z->zone_lock));
	z->zone_ref++;
	ASSERT(z->zone_ref != 0);
}

void
zone_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	zone_hold_locked(z);
	mutex_exit(&z->zone_lock);
}

/*
 * If the non-cred ref count drops to 1 and either the cred ref count
 * is 0 or we aren't waiting for cred references, the zone is ready to
 * be destroyed.
 */
#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
	(!zone_wait_for_cred || (zone)->zone_cred_ref == 0))

void
zone_rele(zone_t *z)
{
	boolean_t wakeup;

	mutex_enter(&z->zone_lock);
	ASSERT(z->zone_ref != 0);
	z->zone_ref--;
	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
		/* no more refs, free the structure */
		mutex_exit(&z->zone_lock);
		zone_free(z);
		return;
	}
	/* signal zone_destroy so the zone can finish halting */
	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
	mutex_exit(&z->zone_lock);

	if (wakeup) {
		/*
		 * Grabbing zonehash_lock here effectively synchronizes with
		 * zone_destroy() to avoid missed signals.
		 */
		mutex_enter(&zonehash_lock);
		cv_broadcast(&zone_destroy_cv);
		mutex_exit(&zonehash_lock);
	}
}

void
zone_cred_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	z->zone_cred_ref++;
	ASSERT(z->zone_cred_ref != 0);
	mutex_exit(&z->zone_lock);
}

void
zone_cred_rele(zone_t *z)
{
	boolean_t wakeup;

	mutex_enter(&z->zone_lock);
	ASSERT(z->zone_cred_ref != 0);
	z->zone_cred_ref--;
	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
		/* no more refs, free the structure */
		mutex_exit(&z->zone_lock);
		zone_free(z);
		return;
	}
	/*
	 * If zone_destroy is waiting for the cred references to drain
	 * out, and they have, signal it.
	 */
	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
	    zone_status_get(z) >= ZONE_IS_DEAD);
	mutex_exit(&z->zone_lock);

	if (wakeup) {
		/*
		 * Grabbing zonehash_lock here effectively synchronizes with
		 * zone_destroy() to avoid missed signals.
		 */
		mutex_enter(&zonehash_lock);
		cv_broadcast(&zone_destroy_cv);
		mutex_exit(&zonehash_lock);
	}
}

void
zone_task_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	z->zone_ntasks++;
	ASSERT(z->zone_ntasks != 0);
	mutex_exit(&z->zone_lock);
}

void
zone_task_rele(zone_t *zone)
{
	uint_t refcnt;

	mutex_enter(&zone->zone_lock);
	ASSERT(zone->zone_ntasks != 0);
	refcnt = --zone->zone_ntasks;
	if (refcnt > 1) {	/* Common case */
		mutex_exit(&zone->zone_lock);
		return;
	}
	zone_hold_locked(zone);	/* so we can use the zone_t later */
	mutex_exit(&zone->zone_lock);
	if (refcnt == 1) {
		/*
		 * See if the zone is shutting down.
		 */
		mutex_enter(&zone_status_lock);
		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
			goto out;
		}

		/*
		 * Make sure the ntasks didn't change since we
		 * dropped zone_lock.
		 */
		mutex_enter(&zone->zone_lock);
		if (refcnt != zone->zone_ntasks) {
			mutex_exit(&zone->zone_lock);
			goto out;
		}
		mutex_exit(&zone->zone_lock);

		/*
		 * No more user processes in the zone. The zone is empty.
		 */
		zone_status_set(zone, ZONE_IS_EMPTY);
		goto out;
	}

	ASSERT(refcnt == 0);
	/*
	 * zsched has exited; the zone is dead.
	 */
	zone->zone_zsched = NULL;	/* paranoia */
	mutex_enter(&zone_status_lock);
	zone_status_set(zone, ZONE_IS_DEAD);
out:
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
}

zoneid_t
getzoneid(void)
{
	return (curproc->p_zone->zone_id);
}

/*
 * Internal versions of zone_find_by_*(). These don't zone_hold() or
 * check the validity of a zone's state.
 */
static zone_t *
zone_find_all_by_id(zoneid_t zoneid)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	if (mod_hash_find(zonehashbyid,
	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

static zone_t *
zone_find_all_by_name(char *name)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

/*
 * Public interface for looking up a zone by zoneid. Only returns the zone if
 * it is fully initialized, and has not yet begun the zone_destroy() sequence.
 * Caller must call zone_rele() once it is done with the zone.
 *
 * The zone may begin the zone_destroy() sequence immediately after this
 * function returns, but may be safely used until zone_rele() is called.
 */
zone_t *
zone_find_by_id(zoneid_t zoneid)
{
	zone_t *zone;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	status = zone_status_get(zone);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * For all practical purposes the zone doesn't exist.
		 */
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}

/*
 * Similar to zone_find_by_id, but using zone name as the key.
 */
zone_t *
zone_find_by_name(char *name)
{
	zone_t *zone;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_name(name)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	status = zone_status_get(zone);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * For all practical purposes the zone doesn't exist.
		 */
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}
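
/*
 * Illustrative sketch (not part of the original file): the expected
 * lookup pattern. Both lookup functions return the zone held, so the
 * caller must drop the hold with zone_rele() when done.
 * zone_example_lookup() is a hypothetical name.
 */
#if 0
static void
zone_example_lookup(zoneid_t zoneid)
{
	zone_t *zone;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return;		/* no such (visible) zone */
	/* ... the zone_t cannot be freed until the hold is dropped ... */
	zone_rele(zone);
}
#endif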
/*
 * Similar to zone_find_by_id(), using the path as a key. For instance,
 * if there is a zone "foo" rooted at /foo/root, and the path argument
 * is "/foo/root/proc", it will return the held zone_t corresponding to
 * zone "foo".
 *
 * zone_find_by_path() always returns a non-NULL value, since at the
 * very least every path will be contained in the global zone.
 *
 * As with the other zone_find_by_*() functions, the caller is
 * responsible for zone_rele()ing the return value of this function.
 */
zone_t *
zone_find_by_path(const char *path)
{
	zone_t *zone;
	zone_t *zret = NULL;
	zone_status_t status;

	if (path == NULL) {
		/*
		 * Call from rootconf().
		 */
		zone_hold(global_zone);
		return (global_zone);
	}
	ASSERT(*path == '/');
	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		if (ZONE_PATH_VISIBLE(path, zone))
			zret = zone;
	}
	ASSERT(zret != NULL);
	status = zone_status_get(zret);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * Zone practically doesn't exist.
		 */
		zret = global_zone;
	}
	zone_hold(zret);
	mutex_exit(&zonehash_lock);
	return (zret);
}

/*
 * Get the number of cpus visible to this zone. The system-wide global
 * 'ncpus' is returned if pools are disabled, the caller is in the
 * global zone, or a NULL zone argument is passed in.
 */
int
zone_ncpus_get(zone_t *zone)
{
	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;

	return (myncpus != 0 ? myncpus : ncpus);
}

/*
 * Get the number of online cpus visible to this zone. The system-wide
 * global 'ncpus_online' is returned if pools are disabled, the caller
 * is in the global zone, or a NULL zone argument is passed in.
 */
int
zone_ncpus_online_get(zone_t *zone)
{
	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;

	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
}

/*
 * Return the pool to which the zone is currently bound.
 */
pool_t *
zone_pool_get(zone_t *zone)
{
	ASSERT(pool_lock_held());

	return (zone->zone_pool);
}

/*
 * Set the zone's pool pointer and update the zone's visibility to match
 * the resources in the new pool.
 */
void
zone_pool_set(zone_t *zone, pool_t *pool)
{
	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	zone->zone_pool = pool;
	zone_pset_set(zone, pool->pool_pset->pset_id);
}

/*
 * Return the cached value of the id of the processor set to which the
 * zone is currently bound. The value will be ZONE_PS_INVAL if the pools
 * facility is disabled.
 */
psetid_t
zone_pset_get(zone_t *zone)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	return (zone->zone_psetid);
}

/*
 * Set the cached value of the id of the processor set to which the zone
 * is currently bound. Also update the zone's visibility to match the
 * resources in the new processor set.
 */
void
zone_pset_set(zone_t *zone, psetid_t newpsetid)
{
	psetid_t oldpsetid;

	ASSERT(MUTEX_HELD(&cpu_lock));
	oldpsetid = zone_pset_get(zone);

	if (oldpsetid == newpsetid)
		return;
	/*
	 * Global zone sees all.
	 */
	if (zone != global_zone) {
		zone->zone_psetid = newpsetid;
		if (newpsetid != ZONE_PS_INVAL)
			pool_pset_visibility_add(newpsetid, zone);
		if (oldpsetid != ZONE_PS_INVAL)
			pool_pset_visibility_remove(oldpsetid, zone);
	}
	/*
	 * Disabling pools, so we should start using the global values
	 * for ncpus and ncpus_online.
	 */
	if (newpsetid == ZONE_PS_INVAL) {
		zone->zone_ncpus = 0;
		zone->zone_ncpus_online = 0;
	}
}

/*
 * Walk the list of active zones and issue the provided callback for
 * each of them.
 *
 * Caller must not be holding any locks that may be acquired under
 * zonehash_lock. See comment at the beginning of the file for a list of
 * common locks and their interactions with zones.
 */
int
zone_walk(int (*cb)(zone_t *, void *), void *data)
{
	zone_t *zone;
	int ret = 0;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		/*
		 * Skip zones that shouldn't be externally visible.
		 */
		status = zone_status_get(zone);
		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
			continue;
		/*
		 * Bail immediately if any callback invocation returns a
		 * non-zero value.
		 */
		ret = (*cb)(zone, data);
		if (ret != 0)
			break;
	}
	mutex_exit(&zonehash_lock);
	return (ret);
}
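
/*
 * Illustrative sketch (not part of the original file): a zone_walk()
 * callback that counts the externally visible zones.
 * zone_example_count_cb() and zone_example_count() are hypothetical.
 */
#if 0
/*ARGSUSED*/
static int
zone_example_count_cb(zone_t *zone, void *arg)
{
	(*(uint_t *)arg)++;
	return (0);		/* a non-zero return would end the walk */
}

static uint_t
zone_example_count(void)
{
	uint_t count = 0;

	(void) zone_walk(zone_example_count_cb, &count);
	return (count);
}
#endif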
static int
zone_set_root(zone_t *zone, const char *upath)
{
	vnode_t *vp;
	int trycount;
	int error = 0;
	char *path;
	struct pathname upn, pn;
	size_t pathlen;

	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
		return (error);

	pn_alloc(&pn);

	/* prevent infinite loop */
	trycount = 10;
	for (;;) {
		if (--trycount <= 0) {
			error = ESTALE;
			goto out;
		}

		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
			/*
			 * VOP_ACCESS() may cover 'vp' with a new
			 * filesystem, if 'vp' is an autoFS vnode.
			 * Get the new 'vp' if so.
			 */
			if ((error = VOP_ACCESS(vp, VEXEC, 0, CRED())) == 0 &&
			    (vp->v_vfsmountedhere == NULL ||
			    (error = traverse(&vp)) == 0)) {
				pathlen = pn.pn_pathlen + 2;
				path = kmem_alloc(pathlen, KM_SLEEP);
				(void) strncpy(path, pn.pn_path,
				    pn.pn_pathlen + 1);
				path[pathlen - 2] = '/';
				path[pathlen - 1] = '\0';
				pn_free(&pn);
				pn_free(&upn);

				/* Success! */
				break;
			}
			VN_RELE(vp);
		}
		if (error != ESTALE)
			goto out;
	}

	ASSERT(error == 0);
	zone->zone_rootvp = vp;		/* we hold a reference to vp */
	zone->zone_rootpath = path;
	zone->zone_rootpathlen = pathlen;
	return (0);

out:
	pn_free(&pn);
	pn_free(&upn);
	return (error);
}

#define	isalnum(c)	(((c) >= '0' && (c) <= '9') || \
	((c) >= 'a' && (c) <= 'z') || \
	((c) >= 'A' && (c) <= 'Z'))

static int
zone_set_name(zone_t *zone, const char *uname)
{
	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
	size_t len;
	int i, err;

	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
		kmem_free(kname, ZONENAME_MAX);
		return (err);	/* EFAULT or ENAMETOOLONG */
	}

	/* must be less than ZONENAME_MAX */
	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
		kmem_free(kname, ZONENAME_MAX);
		return (EINVAL);
	}

	/*
	 * Name must start with an alphanumeric and must contain only
	 * alphanumerics, '-', '_' and '.'.
	 */
	if (!isalnum(kname[0])) {
		kmem_free(kname, ZONENAME_MAX);
		return (EINVAL);
	}
	for (i = 1; i < len - 1; i++) {
		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
		    kname[i] != '.') {
			kmem_free(kname, ZONENAME_MAX);
			return (EINVAL);
		}
	}

	zone->zone_name = kname;
	return (0);
}

/*
 * Similar to thread_create(), but makes sure the thread is in the appropriate
 * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
 */
/*ARGSUSED*/
kthread_t *
zthread_create(
    caddr_t stk,
    size_t stksize,
    void (*proc)(),
    void *arg,
    size_t len,
    pri_t pri)
{
	kthread_t *t;
	zone_t *zone = curproc->p_zone;
	proc_t *pp = zone->zone_zsched;

	zone_hold(zone);	/* Reference to be dropped when thread exits */

	/*
	 * No-one should be trying to create threads if the zone is shutting
	 * down and there aren't any kernel threads around. See comment
	 * in zthread_exit().
	 */
	ASSERT(!(zone->zone_kthreads == NULL &&
	    zone_status_get(zone) >= ZONE_IS_EMPTY));
	/*
	 * Create a thread, but don't let it run until we've finished setting
	 * things up.
	 */
	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
	ASSERT(t->t_forw == NULL);
	mutex_enter(&zone_status_lock);
	if (zone->zone_kthreads == NULL) {
		t->t_forw = t->t_back = t;
	} else {
		kthread_t *tx = zone->zone_kthreads;

		t->t_forw = tx;
		t->t_back = tx->t_back;
		tx->t_back->t_forw = t;
		tx->t_back = t;
	}
	zone->zone_kthreads = t;
	mutex_exit(&zone_status_lock);

	mutex_enter(&pp->p_lock);
	t->t_proc_flag |= TP_ZTHREAD;
	project_rele(t->t_proj);
	t->t_proj = project_hold(pp->p_task->tk_proj);

	/*
	 * Setup complete, let it run.
	 */
	thread_lock(t);
	t->t_schedflag |= TS_ALLSTART;
	setrun_locked(t);
	thread_unlock(t);

	mutex_exit(&pp->p_lock);

	return (t);
}
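
/*
 * Illustrative sketch (not part of the original file): a per-zone
 * kernel thread. Threads made with zthread_create() must exit via
 * zthread_exit() so the zone's kernel thread list and reference counts
 * stay consistent. zone_example_worker() and zone_example_start() are
 * hypothetical; a real worker would block on a condition variable
 * rather than poll, and zthread_create() must be called from a thread
 * already running inside the zone.
 */
#if 0
static void
zone_example_worker(void *arg)
{
	zone_t *zone = arg;

	while (zone_status_get(zone) < ZONE_IS_SHUTTING_DOWN) {
		/* ... perform work on behalf of the zone ... */
	}
	zthread_exit();		/* never returns */
}

static void
zone_example_start(zone_t *zone)
{
	(void) zthread_create(NULL, 0, zone_example_worker, zone, 0,
	    minclsyspri);
}
#endif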
1885 */ 1886 zone->zone_kthreads = NULL; 1887 if (zone_status_get(zone) == ZONE_IS_EMPTY) { 1888 zone_status_set(zone, ZONE_IS_DOWN); 1889 } 1890 } else { 1891 t->t_forw->t_back = t->t_back; 1892 t->t_back->t_forw = t->t_forw; 1893 if (zone->zone_kthreads == t) 1894 zone->zone_kthreads = t->t_forw; 1895 } 1896 mutex_exit(&zone_status_lock); 1897 zone_rele(zone); 1898 thread_exit(); 1899 /* NOTREACHED */ 1900 } 1901 1902 static void 1903 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp) 1904 { 1905 vnode_t *oldvp; 1906 1907 /* we're going to hold a reference here to the directory */ 1908 VN_HOLD(vp); 1909 1910 #ifdef C2_AUDIT 1911 if (audit_active) /* update abs cwd/root path see c2audit.c */ 1912 audit_chdirec(vp, vpp); 1913 #endif 1914 1915 mutex_enter(&pp->p_lock); 1916 oldvp = *vpp; 1917 *vpp = vp; 1918 mutex_exit(&pp->p_lock); 1919 if (oldvp != NULL) 1920 VN_RELE(oldvp); 1921 } 1922 1923 /* 1924 * Convert an rctl value represented by an nvlist_t into an rctl_val_t. 1925 */ 1926 static int 1927 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv) 1928 { 1929 nvpair_t *nvp = NULL; 1930 boolean_t priv_set = B_FALSE; 1931 boolean_t limit_set = B_FALSE; 1932 boolean_t action_set = B_FALSE; 1933 1934 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 1935 const char *name; 1936 uint64_t ui64; 1937 1938 name = nvpair_name(nvp); 1939 if (nvpair_type(nvp) != DATA_TYPE_UINT64) 1940 return (EINVAL); 1941 (void) nvpair_value_uint64(nvp, &ui64); 1942 if (strcmp(name, "privilege") == 0) { 1943 /* 1944 * Currently only privileged values are allowed, but 1945 * this may change in the future. 1946 */ 1947 if (ui64 != RCPRIV_PRIVILEGED) 1948 return (EINVAL); 1949 rv->rcv_privilege = ui64; 1950 priv_set = B_TRUE; 1951 } else if (strcmp(name, "limit") == 0) { 1952 rv->rcv_value = ui64; 1953 limit_set = B_TRUE; 1954 } else if (strcmp(name, "action") == 0) { 1955 if (ui64 != RCTL_LOCAL_NOACTION && 1956 ui64 != RCTL_LOCAL_DENY) 1957 return (EINVAL); 1958 rv->rcv_flagaction = ui64; 1959 action_set = B_TRUE; 1960 } else { 1961 return (EINVAL); 1962 } 1963 } 1964 1965 if (!(priv_set && limit_set && action_set)) 1966 return (EINVAL); 1967 rv->rcv_action_signal = 0; 1968 rv->rcv_action_recipient = NULL; 1969 rv->rcv_action_recip_pid = -1; 1970 rv->rcv_firing_time = 0; 1971 1972 return (0); 1973 } 1974 1975 void 1976 zone_icode(void) 1977 { 1978 proc_t *p = ttoproc(curthread); 1979 struct core_globals *cg; 1980 1981 /* 1982 * For all purposes (ZONE_ATTR_INITPID and restart_init), 1983 * storing just the pid of init is sufficient. 
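 *
 * For illustration: the pid stored here is what a later
 * ZONE_GETATTR request for ZONE_ATTR_INITPID reports, sketched
 * below with an assumed libc zone_getattr() wrapper (see
 * zone_getattr() further down):
 *
 *	pid_t initpid;
 *
 *	if (zone_getattr(zoneid, ZONE_ATTR_INITPID, &initpid,
 *	    sizeof (initpid)) < 0)
 *		... zone has not booted init, or other error ...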
1984 */ 1985 p->p_zone->zone_proc_initpid = p->p_pid; 1986 1987 /* 1988 * Allocate user address space and stack segment 1989 */ 1990 1991 p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0; 1992 p->p_usrstack = (caddr_t)USRSTACK32; 1993 p->p_model = DATAMODEL_ILP32; 1994 p->p_stkprot = PROT_ZFOD & ~PROT_EXEC; 1995 p->p_datprot = PROT_ZFOD & ~PROT_EXEC; 1996 p->p_stk_ctl = INT32_MAX; 1997 1998 p->p_as = as_alloc(); 1999 p->p_as->a_userlimit = (caddr_t)USERLIMIT32; 2000 (void) hat_setup(p->p_as->a_hat, HAT_INIT); 2001 2002 cg = zone_getspecific(core_zone_key, p->p_zone); 2003 ASSERT(cg != NULL); 2004 corectl_path_hold(cg->core_default_path); 2005 corectl_content_hold(cg->core_default_content); 2006 p->p_corefile = cg->core_default_path; 2007 p->p_content = cg->core_default_content; 2008 2009 init_mstate(curthread, LMS_SYSTEM); 2010 2011 p->p_zone->zone_boot_err = exec_init(zone_initname, 0, 2012 p->p_zone->zone_bootargs); 2013 2014 mutex_enter(&zone_status_lock); 2015 if (p->p_zone->zone_boot_err != 0) { 2016 /* 2017 * Make sure we are still in the booting state-- we could have 2018 * raced and already be shutting down, or even further along. 2019 */ 2020 if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING) 2021 zone_status_set(p->p_zone, ZONE_IS_SHUTTING_DOWN); 2022 mutex_exit(&zone_status_lock); 2023 /* It's gone bad, dispose of the process */ 2024 if (proc_exit(CLD_EXITED, p->p_zone->zone_boot_err) != 0) { 2025 mutex_enter(&curproc->p_lock); 2026 ASSERT(curproc->p_flag & SEXITLWPS); 2027 lwp_exit(); 2028 } 2029 } else { 2030 if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING) 2031 zone_status_set(p->p_zone, ZONE_IS_RUNNING); 2032 mutex_exit(&zone_status_lock); 2033 /* cause the process to return to userland. */ 2034 lwp_rtt(); 2035 } 2036 } 2037 2038 struct zsched_arg { 2039 zone_t *zone; 2040 nvlist_t *nvlist; 2041 }; 2042 2043 /* 2044 * Per-zone "sched" workalike. The similarity to "sched" doesn't have 2045 * anything to do with scheduling, but rather with the fact that 2046 * per-zone kernel threads are parented to zsched, just like regular 2047 * kernel threads are parented to sched (p0). 2048 * 2049 * zsched is also responsible for launching init for the zone. 2050 */ 2051 static void 2052 zsched(void *arg) 2053 { 2054 struct zsched_arg *za = arg; 2055 proc_t *pp = curproc; 2056 proc_t *initp = proc_init; 2057 zone_t *zone = za->zone; 2058 cred_t *cr, *oldcred; 2059 rctl_set_t *set; 2060 rctl_alloc_gp_t *gp; 2061 contract_t *ct = NULL; 2062 task_t *tk, *oldtk; 2063 rctl_entity_p_t e; 2064 kproject_t *pj; 2065 2066 nvlist_t *nvl = za->nvlist; 2067 nvpair_t *nvp = NULL; 2068 2069 bcopy("zsched", u.u_psargs, sizeof ("zsched")); 2070 bcopy("zsched", u.u_comm, sizeof ("zsched")); 2071 u.u_argc = 0; 2072 u.u_argv = NULL; 2073 u.u_envp = NULL; 2074 closeall(P_FINFO(pp)); 2075 2076 /* 2077 * We are this zone's "zsched" process. As the zone isn't generally 2078 * visible yet we don't need to grab any locks before initializing its 2079 * zone_proc pointer. 2080 */ 2081 zone_hold(zone); /* this hold is released by zone_destroy() */ 2082 zone->zone_zsched = pp; 2083 mutex_enter(&pp->p_lock); 2084 pp->p_zone = zone; 2085 mutex_exit(&pp->p_lock); 2086 2087 /* 2088 * Disassociate process from its 'parent'; parent ourselves to init 2089 * (pid 1) and change other values as needed. 
2090 */
2091 sess_create();
2092
2093 mutex_enter(&pidlock);
2094 proc_detach(pp);
2095 pp->p_ppid = 1;
2096 pp->p_flag |= SZONETOP;
2097 pp->p_ancpid = 1;
2098 pp->p_parent = initp;
2099 pp->p_psibling = NULL;
2100 if (initp->p_child)
2101 initp->p_child->p_psibling = pp;
2102 pp->p_sibling = initp->p_child;
2103 initp->p_child = pp;
2104
2105 /* Decrement what newproc() incremented. */
2106 upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
2107 /*
2108 * Our credentials are about to become kcred-like, so we don't care
2109 * about the caller's ruid.
2110 */
2111 upcount_inc(crgetruid(kcred), zone->zone_id);
2112 mutex_exit(&pidlock);
2113
2114 /*
2115 * getting out of global zone, so decrement lwp counts
2116 */
2117 pj = pp->p_task->tk_proj;
2118 mutex_enter(&global_zone->zone_nlwps_lock);
2119 pj->kpj_nlwps -= pp->p_lwpcnt;
2120 global_zone->zone_nlwps -= pp->p_lwpcnt;
2121 mutex_exit(&global_zone->zone_nlwps_lock);
2122
2123 /*
2124 * Create and join a new task in project '0' of this zone.
2125 *
2126 * We don't need to call holdlwps() since we know we're the only lwp in
2127 * this process.
2128 *
2129 * task_join() returns with p_lock held.
2130 */
2131 tk = task_create(0, zone);
2132 mutex_enter(&cpu_lock);
2133 oldtk = task_join(tk, 0);
2134 mutex_exit(&curproc->p_lock);
2135 mutex_exit(&cpu_lock);
2136 task_rele(oldtk);
2137
2138 /*
2139 * add lwp counts to zsched's zone, and increment the project's task
2140 * count due to the task created by task_create() above
2141 */
2142 pj = pp->p_task->tk_proj;
2143 mutex_enter(&zone->zone_nlwps_lock);
2144 pj->kpj_nlwps += pp->p_lwpcnt;
2145 pj->kpj_ntasks += 1;
2146 zone->zone_nlwps += pp->p_lwpcnt;
2147 mutex_exit(&zone->zone_nlwps_lock);
2148
2149 /*
2150 * The process was created by a process in the global zone, hence the
2151 * credentials are wrong. We might as well have kcred-ish credentials.
2152 */
2153 cr = zone->zone_kcred;
2154 crhold(cr);
2155 mutex_enter(&pp->p_crlock);
2156 oldcred = pp->p_cred;
2157 pp->p_cred = cr;
2158 mutex_exit(&pp->p_crlock);
2159 crfree(oldcred);
2160
2161 /*
2162 * Hold credentials again (for thread)
2163 */
2164 crhold(cr);
2165
2166 /*
2167 * p_lwpcnt can't change since this is a kernel process.
2168 */
2169 crset(pp, cr);
2170
2171 /*
2172 * Chroot
2173 */
2174 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
2175 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
2176
2177 /*
2178 * Initialize zone's rctl set.
2179 */
2180 set = rctl_set_create();
2181 gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2182 mutex_enter(&pp->p_lock);
2183 e.rcep_p.zone = zone;
2184 e.rcep_t = RCENTITY_ZONE;
2185 zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
2186 mutex_exit(&pp->p_lock);
2187 rctl_prealloc_destroy(gp);
2188
2189 /*
2190 * Apply the rctls passed in to zone_create(). This is basically a list
2191 * assignment: all of the old values are removed and the new ones
2192 * inserted. That is, if an empty list is passed in, all values are
2193 * removed.
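 *
 * For example (illustrative): if the zone already carries a
 * privileged value on "zone.cpu-shares" and the incoming list holds
 * one new value for it, the loop below deletes the old value and
 * inserts the new one; an empty incoming array would simply delete
 * the old value.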
2194 */ 2195 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 2196 rctl_dict_entry_t *rde; 2197 rctl_hndl_t hndl; 2198 char *name; 2199 nvlist_t **nvlarray; 2200 uint_t i, nelem; 2201 int error; /* For ASSERT()s */ 2202 2203 name = nvpair_name(nvp); 2204 hndl = rctl_hndl_lookup(name); 2205 ASSERT(hndl != -1); 2206 rde = rctl_dict_lookup_hndl(hndl); 2207 ASSERT(rde != NULL); 2208 2209 for (; /* ever */; ) { 2210 rctl_val_t oval; 2211 2212 mutex_enter(&pp->p_lock); 2213 error = rctl_local_get(hndl, NULL, &oval, pp); 2214 mutex_exit(&pp->p_lock); 2215 ASSERT(error == 0); /* Can't fail for RCTL_FIRST */ 2216 ASSERT(oval.rcv_privilege != RCPRIV_BASIC); 2217 if (oval.rcv_privilege == RCPRIV_SYSTEM) 2218 break; 2219 mutex_enter(&pp->p_lock); 2220 error = rctl_local_delete(hndl, &oval, pp); 2221 mutex_exit(&pp->p_lock); 2222 ASSERT(error == 0); 2223 } 2224 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 2225 ASSERT(error == 0); 2226 for (i = 0; i < nelem; i++) { 2227 rctl_val_t *nvalp; 2228 2229 nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); 2230 error = nvlist2rctlval(nvlarray[i], nvalp); 2231 ASSERT(error == 0); 2232 /* 2233 * rctl_local_insert can fail if the value being 2234 * inserted is a duplicate; this is OK. 2235 */ 2236 mutex_enter(&pp->p_lock); 2237 if (rctl_local_insert(hndl, nvalp, pp) != 0) 2238 kmem_cache_free(rctl_val_cache, nvalp); 2239 mutex_exit(&pp->p_lock); 2240 } 2241 } 2242 /* 2243 * Tell the world that we're done setting up. 2244 * 2245 * At this point we want to set the zone status to ZONE_IS_READY 2246 * and atomically set the zone's processor set visibility. Once 2247 * we drop pool_lock() this zone will automatically get updated 2248 * to reflect any future changes to the pools configuration. 2249 */ 2250 pool_lock(); 2251 mutex_enter(&cpu_lock); 2252 mutex_enter(&zonehash_lock); 2253 zone_uniqid(zone); 2254 zone_zsd_configure(zone); 2255 if (pool_state == POOL_ENABLED) 2256 zone_pset_set(zone, pool_default->pool_pset->pset_id); 2257 mutex_enter(&zone_status_lock); 2258 ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED); 2259 zone_status_set(zone, ZONE_IS_READY); 2260 mutex_exit(&zone_status_lock); 2261 mutex_exit(&zonehash_lock); 2262 mutex_exit(&cpu_lock); 2263 pool_unlock(); 2264 2265 /* 2266 * Once we see the zone transition to the ZONE_IS_BOOTING state, 2267 * we launch init, and set the state to running. 2268 */ 2269 zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched"); 2270 2271 if (zone_status_get(zone) == ZONE_IS_BOOTING) { 2272 id_t cid; 2273 2274 /* 2275 * Ok, this is a little complicated. We need to grab the 2276 * zone's pool's scheduling class ID; note that by now, we 2277 * are already bound to a pool if we need to be (zoneadmd 2278 * will have done that to us while we're in the READY 2279 * state). *But* the scheduling class for the zone's 'init' 2280 * must be explicitly passed to newproc, which doesn't 2281 * respect pool bindings. 2282 * 2283 * We hold the pool_lock across the call to newproc() to 2284 * close the obvious race: the pool's scheduling class 2285 * could change before we manage to create the LWP with 2286 * classid 'cid'. 2287 */ 2288 pool_lock(); 2289 cid = pool_get_class(zone->zone_pool); 2290 if (cid == -1) 2291 cid = defaultcid; 2292 2293 /* 2294 * If this fails, zone_boot will ultimately fail. The 2295 * state of the zone will be set to SHUTTING_DOWN-- userland 2296 * will have to tear down the zone, and fail, or try again. 
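 *
 * (zone_boot() later picks this up from zone_boot_err and returns
 * it to its caller once the zone has left the BOOTING state.)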
2297 */ 2298 if ((zone->zone_boot_err = newproc(zone_icode, NULL, cid, 2299 minclsyspri - 1, &ct)) != 0) { 2300 mutex_enter(&zone_status_lock); 2301 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); 2302 mutex_exit(&zone_status_lock); 2303 } 2304 pool_unlock(); 2305 } 2306 2307 /* 2308 * Wait for zone_destroy() to be called. This is what we spend 2309 * most of our life doing. 2310 */ 2311 zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched"); 2312 2313 if (ct) 2314 /* 2315 * At this point the process contract should be empty. 2316 * (Though if it isn't, it's not the end of the world.) 2317 */ 2318 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0); 2319 2320 /* 2321 * Allow kcred to be freed when all referring processes 2322 * (including this one) go away. We can't just do this in 2323 * zone_free because we need to wait for the zone_cred_ref to 2324 * drop to 0 before calling zone_free, and the existence of 2325 * zone_kcred will prevent that. Thus, we call crfree here to 2326 * balance the crdup in zone_create. The crhold calls earlier 2327 * in zsched will be dropped when the thread and process exit. 2328 */ 2329 crfree(zone->zone_kcred); 2330 zone->zone_kcred = NULL; 2331 2332 exit(CLD_EXITED, 0); 2333 } 2334 2335 /* 2336 * Helper function to determine if there are any submounts of the 2337 * provided path. Used to make sure the zone doesn't "inherit" any 2338 * mounts from before it is created. 2339 */ 2340 static uint_t 2341 zone_mount_count(const char *rootpath) 2342 { 2343 vfs_t *vfsp; 2344 uint_t count = 0; 2345 size_t rootpathlen = strlen(rootpath); 2346 2347 /* 2348 * Holding zonehash_lock prevents race conditions with 2349 * vfs_list_add()/vfs_list_remove() since we serialize with 2350 * zone_find_by_path(). 2351 */ 2352 ASSERT(MUTEX_HELD(&zonehash_lock)); 2353 /* 2354 * The rootpath must end with a '/' 2355 */ 2356 ASSERT(rootpath[rootpathlen - 1] == '/'); 2357 2358 /* 2359 * This intentionally does not count the rootpath itself if that 2360 * happens to be a mount point. 2361 */ 2362 vfs_list_read_lock(); 2363 vfsp = rootvfs; 2364 do { 2365 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt), 2366 rootpathlen) == 0) 2367 count++; 2368 vfsp = vfsp->vfs_next; 2369 } while (vfsp != rootvfs); 2370 vfs_list_unlock(); 2371 return (count); 2372 } 2373 2374 /* 2375 * Helper function to make sure that a zone created on 'rootpath' 2376 * wouldn't end up containing other zones' rootpaths. 2377 */ 2378 static boolean_t 2379 zone_is_nested(const char *rootpath) 2380 { 2381 zone_t *zone; 2382 size_t rootpathlen = strlen(rootpath); 2383 size_t len; 2384 2385 ASSERT(MUTEX_HELD(&zonehash_lock)); 2386 2387 for (zone = list_head(&zone_active); zone != NULL; 2388 zone = list_next(&zone_active, zone)) { 2389 if (zone == global_zone) 2390 continue; 2391 len = strlen(zone->zone_rootpath); 2392 if (strncmp(rootpath, zone->zone_rootpath, 2393 MIN(rootpathlen, len)) == 0) 2394 return (B_TRUE); 2395 } 2396 return (B_FALSE); 2397 } 2398 2399 static int 2400 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs) 2401 { 2402 priv_set_t *privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP); 2403 2404 if (copyin(zone_privs, privs, sizeof (priv_set_t))) { 2405 kmem_free(privs, sizeof (priv_set_t)); 2406 return (EFAULT); 2407 } 2408 2409 zone->zone_privset = privs; 2410 return (0); 2411 } 2412 2413 /* 2414 * We make creative use of nvlists to pass in rctls from userland. 
The list is 2415 * a list of the following structures: 2416 * 2417 * (name = rctl_name, value = nvpair_list_array) 2418 * 2419 * Where each element of the nvpair_list_array is of the form: 2420 * 2421 * [(name = "privilege", value = RCPRIV_PRIVILEGED), 2422 * (name = "limit", value = uint64_t), 2423 * (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))] 2424 */ 2425 static int 2426 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) 2427 { 2428 nvpair_t *nvp = NULL; 2429 nvlist_t *nvl = NULL; 2430 char *kbuf; 2431 int error; 2432 rctl_val_t rv; 2433 2434 *nvlp = NULL; 2435 2436 if (buflen == 0) 2437 return (0); 2438 2439 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 2440 return (ENOMEM); 2441 if (copyin(ubuf, kbuf, buflen)) { 2442 error = EFAULT; 2443 goto out; 2444 } 2445 if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) { 2446 /* 2447 * nvl may have been allocated/free'd, but the value set to 2448 * non-NULL, so we reset it here. 2449 */ 2450 nvl = NULL; 2451 error = EINVAL; 2452 goto out; 2453 } 2454 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 2455 rctl_dict_entry_t *rde; 2456 rctl_hndl_t hndl; 2457 nvlist_t **nvlarray; 2458 uint_t i, nelem; 2459 char *name; 2460 2461 error = EINVAL; 2462 name = nvpair_name(nvp); 2463 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) 2464 != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { 2465 goto out; 2466 } 2467 if ((hndl = rctl_hndl_lookup(name)) == -1) { 2468 goto out; 2469 } 2470 rde = rctl_dict_lookup_hndl(hndl); 2471 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 2472 ASSERT(error == 0); 2473 for (i = 0; i < nelem; i++) { 2474 if (error = nvlist2rctlval(nvlarray[i], &rv)) 2475 goto out; 2476 } 2477 if (rctl_invalid_value(rde, &rv)) { 2478 error = EINVAL; 2479 goto out; 2480 } 2481 } 2482 error = 0; 2483 *nvlp = nvl; 2484 out: 2485 kmem_free(kbuf, buflen); 2486 if (error && nvl != NULL) 2487 nvlist_free(nvl); 2488 return (error); 2489 } 2490 2491 int 2492 zone_create_error(int er_error, int er_ext, int *er_out) { 2493 if (er_out != NULL) { 2494 if (copyout(&er_ext, er_out, sizeof (int))) { 2495 return (set_errno(EFAULT)); 2496 } 2497 } 2498 return (set_errno(er_error)); 2499 } 2500 2501 /* 2502 * System call to create/initialize a new zone named 'zone_name', rooted 2503 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs', 2504 * and initialized with the zone-wide rctls described in 'rctlbuf'. 2505 * 2506 * If extended error is non-null, we may use it to return more detailed 2507 * error information. 
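 *
 * A sketch of the userland side (illustrative; in practice libc's
 * zone_create() wrapper fills in the zone_def and issues the
 * ZONE_CREATE subcode of zone(), below):
 *
 *	zone_def zd;
 *
 *	zd.zone_name = "myzone";
 *	zd.zone_root = "/zones/myzone/root";
 *	zd.zone_privs = privs;
 *	zd.rctlbuf = packed_rctl_nvlist;
 *	zd.rctlbufsz = packed_rctl_nvlist_size;
 *	zd.extended_error = &xerr;
 *	zoneid = syscall(SYS_zone, ZONE_CREATE, &zd);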
2508 */ 2509 static zoneid_t 2510 zone_create(const char *zone_name, const char *zone_root, 2511 const priv_set_t *zone_privs, caddr_t rctlbuf, size_t rctlbufsz, 2512 int *extended_error) 2513 { 2514 struct zsched_arg zarg; 2515 nvlist_t *rctls = NULL; 2516 proc_t *pp = curproc; 2517 zone_t *zone, *ztmp; 2518 zoneid_t zoneid; 2519 int error; 2520 int error2 = 0; 2521 char *str; 2522 cred_t *zkcr; 2523 2524 if (secpolicy_zone_config(CRED()) != 0) 2525 return (set_errno(EPERM)); 2526 2527 /* can't boot zone from within chroot environment */ 2528 if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir) 2529 return (zone_create_error(ENOTSUP, ZE_CHROOTED, 2530 extended_error)); 2531 2532 zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); 2533 zoneid = zone->zone_id = id_alloc(zoneid_space); 2534 zone->zone_status = ZONE_IS_UNINITIALIZED; 2535 zone->zone_pool = pool_default; 2536 zone->zone_pool_mod = gethrtime(); 2537 zone->zone_psetid = ZONE_PS_INVAL; 2538 zone->zone_ncpus = 0; 2539 zone->zone_ncpus_online = 0; 2540 mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); 2541 mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); 2542 cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); 2543 list_create(&zone->zone_zsd, sizeof (struct zsd_entry), 2544 offsetof(struct zsd_entry, zsd_linkage)); 2545 2546 if ((error = zone_set_name(zone, zone_name)) != 0) { 2547 zone_free(zone); 2548 return (zone_create_error(error, 0, extended_error)); 2549 } 2550 2551 if ((error = zone_set_root(zone, zone_root)) != 0) { 2552 zone_free(zone); 2553 return (zone_create_error(error, 0, extended_error)); 2554 } 2555 if ((error = zone_set_privset(zone, zone_privs)) != 0) { 2556 zone_free(zone); 2557 return (zone_create_error(error, 0, extended_error)); 2558 } 2559 2560 /* initialize node name to be the same as zone name */ 2561 zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP); 2562 (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN); 2563 zone->zone_nodename[_SYS_NMLN - 1] = '\0'; 2564 2565 zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP); 2566 zone->zone_domain[0] = '\0'; 2567 zone->zone_shares = 1; 2568 zone->zone_bootargs = NULL; 2569 2570 /* 2571 * Zsched initializes the rctls. 2572 */ 2573 zone->zone_rctls = NULL; 2574 2575 if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { 2576 zone_free(zone); 2577 return (zone_create_error(error, 0, extended_error)); 2578 } 2579 2580 /* 2581 * Stop all lwps since that's what normally happens as part of fork(). 2582 * This needs to happen before we grab any locks to avoid deadlock 2583 * (another lwp in the process could be waiting for the held lock). 2584 */ 2585 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) { 2586 zone_free(zone); 2587 if (rctls) 2588 nvlist_free(rctls); 2589 return (zone_create_error(error, 0, extended_error)); 2590 } 2591 2592 if (block_mounts() == 0) { 2593 mutex_enter(&pp->p_lock); 2594 if (curthread != pp->p_agenttp) 2595 continuelwps(pp); 2596 mutex_exit(&pp->p_lock); 2597 zone_free(zone); 2598 if (rctls) 2599 nvlist_free(rctls); 2600 return (zone_create_error(error, 0, extended_error)); 2601 } 2602 2603 /* 2604 * Set up credential for kernel access. After this, any errors 2605 * should go through the dance in errout rather than calling 2606 * zone_free directly. 
2607 */ 2608 zone->zone_kcred = crdup(kcred); 2609 crsetzone(zone->zone_kcred, zone); 2610 priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred)); 2611 priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred)); 2612 priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred)); 2613 priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred)); 2614 2615 mutex_enter(&zonehash_lock); 2616 /* 2617 * Make sure zone doesn't already exist. 2618 */ 2619 if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL) { 2620 zone_status_t status; 2621 2622 status = zone_status_get(ztmp); 2623 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING) 2624 error = EEXIST; 2625 else 2626 error = EBUSY; 2627 goto errout; 2628 } 2629 2630 /* 2631 * Don't allow zone creations which would cause one zone's rootpath to 2632 * be accessible from that of another (non-global) zone. 2633 */ 2634 if (zone_is_nested(zone->zone_rootpath)) { 2635 error = EBUSY; 2636 goto errout; 2637 } 2638 2639 ASSERT(zonecount != 0); /* check for leaks */ 2640 if (zonecount + 1 > maxzones) { 2641 error = ENOMEM; 2642 goto errout; 2643 } 2644 2645 if (zone_mount_count(zone->zone_rootpath) != 0) { 2646 error = EBUSY; 2647 error2 = ZE_AREMOUNTS; 2648 goto errout; 2649 } 2650 2651 /* 2652 * Zone is still incomplete, but we need to drop all locks while 2653 * zsched() initializes this zone's kernel process. We 2654 * optimistically add the zone to the hashtable and associated 2655 * lists so a parallel zone_create() doesn't try to create the 2656 * same zone. 2657 */ 2658 zonecount++; 2659 (void) mod_hash_insert(zonehashbyid, 2660 (mod_hash_key_t)(uintptr_t)zone->zone_id, 2661 (mod_hash_val_t)(uintptr_t)zone); 2662 str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP); 2663 (void) strcpy(str, zone->zone_name); 2664 (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str, 2665 (mod_hash_val_t)(uintptr_t)zone); 2666 /* 2667 * Insert into active list. At this point there are no 'hold's 2668 * on the zone, but everyone else knows not to use it, so we can 2669 * continue to use it. zsched() will do a zone_hold() if the 2670 * newproc() is successful. 2671 */ 2672 list_insert_tail(&zone_active, zone); 2673 mutex_exit(&zonehash_lock); 2674 2675 zarg.zone = zone; 2676 zarg.nvlist = rctls; 2677 /* 2678 * The process, task, and project rctls are probably wrong; 2679 * we need an interface to get the default values of all rctls, 2680 * and initialize zsched appropriately. I'm not sure that that 2681 * makes much of a difference, though. 2682 */ 2683 if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) { 2684 /* 2685 * We need to undo all globally visible state. 2686 */ 2687 mutex_enter(&zonehash_lock); 2688 list_remove(&zone_active, zone); 2689 (void) mod_hash_destroy(zonehashbyname, 2690 (mod_hash_key_t)(uintptr_t)zone->zone_name); 2691 (void) mod_hash_destroy(zonehashbyid, 2692 (mod_hash_key_t)(uintptr_t)zone->zone_id); 2693 ASSERT(zonecount > 1); 2694 zonecount--; 2695 goto errout; 2696 } 2697 2698 /* 2699 * Zone creation can't fail from now on. 2700 */ 2701 2702 /* 2703 * Let the other lwps continue. 2704 */ 2705 mutex_enter(&pp->p_lock); 2706 if (curthread != pp->p_agenttp) 2707 continuelwps(pp); 2708 mutex_exit(&pp->p_lock); 2709 2710 /* 2711 * Wait for zsched to finish initializing the zone. 2712 */ 2713 zone_status_wait(zone, ZONE_IS_READY); 2714 /* 2715 * The zone is fully visible, so we can let mounts progress. 
2716 */ 2717 resume_mounts(); 2718 if (rctls) 2719 nvlist_free(rctls); 2720 2721 return (zoneid); 2722 2723 errout: 2724 mutex_exit(&zonehash_lock); 2725 /* 2726 * Let the other lwps continue. 2727 */ 2728 mutex_enter(&pp->p_lock); 2729 if (curthread != pp->p_agenttp) 2730 continuelwps(pp); 2731 mutex_exit(&pp->p_lock); 2732 2733 resume_mounts(); 2734 if (rctls) 2735 nvlist_free(rctls); 2736 /* 2737 * There is currently one reference to the zone, a cred_ref from 2738 * zone_kcred. To free the zone, we call crfree, which will call 2739 * zone_cred_rele, which will call zone_free. 2740 */ 2741 ASSERT(zone->zone_cred_ref == 1); /* for zone_kcred */ 2742 ASSERT(zone->zone_kcred->cr_ref == 1); 2743 ASSERT(zone->zone_ref == 0); 2744 zkcr = zone->zone_kcred; 2745 zone->zone_kcred = NULL; 2746 crfree(zkcr); /* triggers call to zone_free */ 2747 return (zone_create_error(error, error2, extended_error)); 2748 } 2749 2750 /* 2751 * Cause the zone to boot. This is pretty simple, since we let zoneadmd do 2752 * the heavy lifting. 2753 */ 2754 static int 2755 zone_boot(zoneid_t zoneid, const char *bootargs) 2756 { 2757 int err; 2758 zone_t *zone; 2759 2760 if (secpolicy_zone_config(CRED()) != 0) 2761 return (set_errno(EPERM)); 2762 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 2763 return (set_errno(EINVAL)); 2764 2765 mutex_enter(&zonehash_lock); 2766 /* 2767 * Look for zone under hash lock to prevent races with calls to 2768 * zone_shutdown, zone_destroy, etc. 2769 */ 2770 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 2771 mutex_exit(&zonehash_lock); 2772 return (set_errno(EINVAL)); 2773 } 2774 2775 if ((err = zone_set_bootargs(zone, bootargs)) != 0) { 2776 mutex_exit(&zonehash_lock); 2777 return (set_errno(err)); 2778 } 2779 2780 mutex_enter(&zone_status_lock); 2781 if (zone_status_get(zone) != ZONE_IS_READY) { 2782 mutex_exit(&zone_status_lock); 2783 mutex_exit(&zonehash_lock); 2784 return (set_errno(EINVAL)); 2785 } 2786 zone_status_set(zone, ZONE_IS_BOOTING); 2787 mutex_exit(&zone_status_lock); 2788 2789 zone_hold(zone); /* so we can use the zone_t later */ 2790 mutex_exit(&zonehash_lock); 2791 2792 if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) { 2793 zone_rele(zone); 2794 return (set_errno(EINTR)); 2795 } 2796 2797 /* 2798 * Boot (starting init) might have failed, in which case the zone 2799 * will go to the SHUTTING_DOWN state; an appropriate errno will 2800 * be placed in zone->zone_boot_err, and so we return that. 2801 */ 2802 err = zone->zone_boot_err; 2803 zone_rele(zone); 2804 return (err ? set_errno(err) : 0); 2805 } 2806 2807 /* 2808 * Kills all user processes in the zone, waiting for them all to exit 2809 * before returning. 2810 */ 2811 static int 2812 zone_empty(zone_t *zone) 2813 { 2814 int waitstatus; 2815 2816 /* 2817 * We need to drop zonehash_lock before killing all 2818 * processes, otherwise we'll deadlock with zone_find_* 2819 * which can be called from the exit path. 2820 */ 2821 ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); 2822 while ((waitstatus = zone_status_timedwait_sig(zone, lbolt + hz, 2823 ZONE_IS_EMPTY)) == -1) { 2824 killall(zone->zone_id); 2825 } 2826 /* 2827 * return EINTR if we were signaled 2828 */ 2829 if (waitstatus == 0) 2830 return (EINTR); 2831 return (0); 2832 } 2833 2834 /* 2835 * Systemcall to start the zone's halt sequence. By the time this 2836 * function successfully returns, all user processes and kernel threads 2837 * executing in it will have exited, ZSD shutdown callbacks executed, 2838 * and the zone status set to ZONE_IS_DOWN. 
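 *
 * In outline, the sequence is: kill off the user processes
 * (zone_empty()), rebind the zone to the default pool, run the ZSD
 * shutdown callbacks, and wait for the zone's kernel threads to
 * drain.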
2839 *
2840 * It is possible that the call will interrupt itself if the caller is the
2841 * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
2842 */
2843 static int
2844 zone_shutdown(zoneid_t zoneid)
2845 {
2846 int error;
2847 zone_t *zone;
2848 zone_status_t status;
2849
2850 if (secpolicy_zone_config(CRED()) != 0)
2851 return (set_errno(EPERM));
2852 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
2853 return (set_errno(EINVAL));
2854
2855 /*
2856 * Block mounts so that VFS_MOUNT() can get an accurate view of
2857 * the zone's status with regard to ZONE_IS_SHUTTING_DOWN.
2858 *
2859 * e.g. NFS can fail the mount if it determines that the zone
2860 * has already begun the shutdown sequence.
2861 */
2862 if (block_mounts() == 0)
2863 return (set_errno(EINTR));
2864 mutex_enter(&zonehash_lock);
2865 /*
2866 * Look for zone under hash lock to prevent races with other
2867 * calls to zone_shutdown and zone_destroy.
2868 */
2869 if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2870 mutex_exit(&zonehash_lock);
2871 resume_mounts();
2872 return (set_errno(EINVAL));
2873 }
2874 mutex_enter(&zone_status_lock);
2875 status = zone_status_get(zone);
2876 /*
2877 * Fail if the zone isn't fully initialized yet.
2878 */
2879 if (status < ZONE_IS_READY) {
2880 mutex_exit(&zone_status_lock);
2881 mutex_exit(&zonehash_lock);
2882 resume_mounts();
2883 return (set_errno(EINVAL));
2884 }
2885 /*
2886 * If conditions required for zone_shutdown() to return have been met,
2887 * return success.
2888 */
2889 if (status >= ZONE_IS_DOWN) {
2890 mutex_exit(&zone_status_lock);
2891 mutex_exit(&zonehash_lock);
2892 resume_mounts();
2893 return (0);
2894 }
2895 /*
2896 * If zone_shutdown() hasn't been called before, go through the motions.
2897 * If it has, there's nothing to do but wait for the kernel threads to
2898 * drain.
2899 */
2900 if (status < ZONE_IS_EMPTY) {
2901 uint_t ntasks;
2902
2903 mutex_enter(&zone->zone_lock);
2904 if ((ntasks = zone->zone_ntasks) != 1) {
2905 /*
2906 * There's still stuff running.
2907 */
2908 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
2909 }
2910 mutex_exit(&zone->zone_lock);
2911 if (ntasks == 1) {
2912 /*
2913 * The only way to create another task is through
2914 * zone_enter(), which will block until we drop
2915 * zonehash_lock. The zone is empty.
2916 */
2917 if (zone->zone_kthreads == NULL) {
2918 /*
2919 * Skip ahead to ZONE_IS_DOWN
2920 */
2921 zone_status_set(zone, ZONE_IS_DOWN);
2922 } else {
2923 zone_status_set(zone, ZONE_IS_EMPTY);
2924 }
2925 }
2926 }
2927 zone_hold(zone); /* so we can use the zone_t later */
2928 mutex_exit(&zone_status_lock);
2929 mutex_exit(&zonehash_lock);
2930 resume_mounts();
2931
2932 if (error = zone_empty(zone)) {
2933 zone_rele(zone);
2934 return (set_errno(error));
2935 }
2936 /*
2937 * After the zone status goes to ZONE_IS_DOWN this zone will no
2938 * longer be notified of changes to the pools configuration, so
2939 * in order to not end up with a stale pool pointer, we point
2940 * ourselves at the default pool and remove all resource
2941 * visibility. This is especially important as the zone_t may
2942 * languish on the deathrow for a very long time waiting for
2943 * creds to drain out.
2944 *
2945 * This rebinding of the zone can happen multiple times
2946 * (presumably due to interrupted or parallel system calls)
2947 * without any adverse effects.
2948 */
2949 if (pool_lock_intr() != 0) {
2950 zone_rele(zone);
2951 return (set_errno(EINTR));
2952 }
2953 if (pool_state == POOL_ENABLED) {
2954 mutex_enter(&cpu_lock);
2955 zone_pool_set(zone, pool_default);
2956 /*
2957 * The zone no longer needs to be able to see any cpus.
2958 */
2959 zone_pset_set(zone, ZONE_PS_INVAL);
2960 mutex_exit(&cpu_lock);
2961 }
2962 pool_unlock();
2963
2964 /*
2965 * ZSD shutdown callbacks can be executed multiple times, hence
2966 * it is safe to not be holding any locks across this call.
2967 */
2968 zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
2969
2970 mutex_enter(&zone_status_lock);
2971 if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
2972 zone_status_set(zone, ZONE_IS_DOWN);
2973 mutex_exit(&zone_status_lock);
2974
2975 /*
2976 * Wait for kernel threads to drain.
2977 */
2978 if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
2979 zone_rele(zone);
2980 return (set_errno(EINTR));
2981 }
2982 zone_rele(zone);
2983 return (0);
2984 }
2985
2986 /*
2987 * Systemcall entry point to finalize the zone halt process. The caller
2988 * must have already successfully called zone_shutdown().
2989 *
2990 * Upon successful completion, the zone will have been fully destroyed:
2991 * zsched will have exited, destructor callbacks executed, and the zone
2992 * removed from the list of active zones.
2993 */
2994 static int
2995 zone_destroy(zoneid_t zoneid)
2996 {
2997 uint64_t uniqid;
2998 zone_t *zone;
2999 zone_status_t status;
3000
3001 if (secpolicy_zone_config(CRED()) != 0)
3002 return (set_errno(EPERM));
3003 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3004 return (set_errno(EINVAL));
3005
3006 mutex_enter(&zonehash_lock);
3007 /*
3008 * Look for zone under hash lock to prevent races with other
3009 * calls to zone_destroy.
3010 */
3011 if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3012 mutex_exit(&zonehash_lock);
3013 return (set_errno(EINVAL));
3014 }
3015
3016 if (zone_mount_count(zone->zone_rootpath) != 0) {
3017 mutex_exit(&zonehash_lock);
3018 return (set_errno(EBUSY));
3019 }
3020 mutex_enter(&zone_status_lock);
3021 status = zone_status_get(zone);
3022 if (status < ZONE_IS_DOWN) {
3023 mutex_exit(&zone_status_lock);
3024 mutex_exit(&zonehash_lock);
3025 return (set_errno(EBUSY));
3026 } else if (status == ZONE_IS_DOWN) {
3027 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
3028 }
3029 mutex_exit(&zone_status_lock);
3030 zone_hold(zone);
3031 mutex_exit(&zonehash_lock);
3032
3033 /*
3034 * wait for zsched to exit
3035 */
3036 zone_status_wait(zone, ZONE_IS_DEAD);
3037 zone_zsd_callbacks(zone, ZSD_DESTROY);
3038 uniqid = zone->zone_uniqid;
3039 zone_rele(zone);
3040 zone = NULL; /* potentially free'd */
3041
3042 mutex_enter(&zonehash_lock);
3043 for (; /* ever */; ) {
3044 boolean_t unref;
3045
3046 if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
3047 zone->zone_uniqid != uniqid) {
3048 /*
3049 * The zone has gone away. Necessary conditions
3050 * are met, so we return success.
3051 */
3052 mutex_exit(&zonehash_lock);
3053 return (0);
3054 }
3055 mutex_enter(&zone->zone_lock);
3056 unref = ZONE_IS_UNREF(zone);
3057 mutex_exit(&zone->zone_lock);
3058 if (unref) {
3059 /*
3060 * There is only one reference to the zone -- that
3061 * added when the zone was added to the hashtables --
3062 * and things will remain this way until we drop
3063 * zonehash_lock... we can go ahead and clean up the
3064 * zone.
3065 */ 3066 break; 3067 } 3068 3069 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) { 3070 /* Signaled */ 3071 mutex_exit(&zonehash_lock); 3072 return (set_errno(EINTR)); 3073 } 3074 3075 } 3076 3077 /* 3078 * It is now safe to let the zone be recreated; remove it from the 3079 * lists. The memory will not be freed until the last cred 3080 * reference goes away. 3081 */ 3082 ASSERT(zonecount > 1); /* must be > 1; can't destroy global zone */ 3083 zonecount--; 3084 /* remove from active list and hash tables */ 3085 list_remove(&zone_active, zone); 3086 (void) mod_hash_destroy(zonehashbyname, 3087 (mod_hash_key_t)zone->zone_name); 3088 (void) mod_hash_destroy(zonehashbyid, 3089 (mod_hash_key_t)(uintptr_t)zone->zone_id); 3090 mutex_exit(&zonehash_lock); 3091 3092 /* add to deathrow list */ 3093 mutex_enter(&zone_deathrow_lock); 3094 list_insert_tail(&zone_deathrow, zone); 3095 mutex_exit(&zone_deathrow_lock); 3096 3097 /* 3098 * Drop last reference (which was added by zsched()), this will 3099 * free the zone unless there are outstanding cred references. 3100 */ 3101 zone_rele(zone); 3102 return (0); 3103 } 3104 3105 /* 3106 * Systemcall entry point for zone_getattr(2). 3107 */ 3108 static ssize_t 3109 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) 3110 { 3111 size_t size; 3112 int error = 0, err; 3113 zone_t *zone; 3114 char *zonepath; 3115 zone_status_t zone_status; 3116 pid_t initpid; 3117 boolean_t global = (curproc->p_zone == global_zone); 3118 3119 mutex_enter(&zonehash_lock); 3120 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 3121 mutex_exit(&zonehash_lock); 3122 return (set_errno(EINVAL)); 3123 } 3124 zone_status = zone_status_get(zone); 3125 if (zone_status < ZONE_IS_READY) { 3126 mutex_exit(&zonehash_lock); 3127 return (set_errno(EINVAL)); 3128 } 3129 zone_hold(zone); 3130 mutex_exit(&zonehash_lock); 3131 3132 /* 3133 * If not in the global zone, don't show information about other zones. 3134 */ 3135 if (!global && curproc->p_zone != zone) { 3136 zone_rele(zone); 3137 return (set_errno(EINVAL)); 3138 } 3139 3140 switch (attr) { 3141 case ZONE_ATTR_ROOT: 3142 if (global) { 3143 /* 3144 * Copy the path to trim the trailing "/" (except for 3145 * the global zone). 3146 */ 3147 if (zone != global_zone) 3148 size = zone->zone_rootpathlen - 1; 3149 else 3150 size = zone->zone_rootpathlen; 3151 zonepath = kmem_alloc(size, KM_SLEEP); 3152 bcopy(zone->zone_rootpath, zonepath, size); 3153 zonepath[size - 1] = '\0'; 3154 } else { 3155 /* 3156 * Caller is not in the global zone, just return 3157 * faked-up path for current zone. 3158 */ 3159 zonepath = "/"; 3160 size = 2; 3161 } 3162 if (bufsize > size) 3163 bufsize = size; 3164 if (buf != NULL) { 3165 err = copyoutstr(zonepath, buf, bufsize, NULL); 3166 if (err != 0 && err != ENAMETOOLONG) 3167 error = EFAULT; 3168 } 3169 if (global) 3170 kmem_free(zonepath, size); 3171 break; 3172 3173 case ZONE_ATTR_NAME: 3174 size = strlen(zone->zone_name) + 1; 3175 if (bufsize > size) 3176 bufsize = size; 3177 if (buf != NULL) { 3178 err = copyoutstr(zone->zone_name, buf, bufsize, NULL); 3179 if (err != 0 && err != ENAMETOOLONG) 3180 error = EFAULT; 3181 } 3182 break; 3183 3184 case ZONE_ATTR_STATUS: 3185 /* 3186 * Since we're not holding zonehash_lock, the zone status 3187 * may be anything; leave it up to userland to sort it out. 
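 *
 * (As with the other attributes, the caller may pass a NULL buf to
 * discover the required buffer size from the return value.)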
3188 */ 3189 size = sizeof (zone_status); 3190 if (bufsize > size) 3191 bufsize = size; 3192 zone_status = zone_status_get(zone); 3193 if (buf != NULL && 3194 copyout(&zone_status, buf, bufsize) != 0) 3195 error = EFAULT; 3196 break; 3197 case ZONE_ATTR_PRIVSET: 3198 size = sizeof (priv_set_t); 3199 if (bufsize > size) 3200 bufsize = size; 3201 if (buf != NULL && 3202 copyout(zone->zone_privset, buf, bufsize) != 0) 3203 error = EFAULT; 3204 break; 3205 case ZONE_ATTR_UNIQID: 3206 size = sizeof (zone->zone_uniqid); 3207 if (bufsize > size) 3208 bufsize = size; 3209 if (buf != NULL && 3210 copyout(&zone->zone_uniqid, buf, bufsize) != 0) 3211 error = EFAULT; 3212 break; 3213 case ZONE_ATTR_POOLID: 3214 { 3215 pool_t *pool; 3216 poolid_t poolid; 3217 3218 if (pool_lock_intr() != 0) { 3219 error = EINTR; 3220 break; 3221 } 3222 pool = zone_pool_get(zone); 3223 poolid = pool->pool_id; 3224 pool_unlock(); 3225 size = sizeof (poolid); 3226 if (bufsize > size) 3227 bufsize = size; 3228 if (buf != NULL && copyout(&poolid, buf, size) != 0) 3229 error = EFAULT; 3230 } 3231 break; 3232 case ZONE_ATTR_INITPID: 3233 size = sizeof (initpid); 3234 if (bufsize > size) 3235 bufsize = size; 3236 initpid = zone->zone_proc_initpid; 3237 if (initpid == -1) { 3238 error = ESRCH; 3239 break; 3240 } 3241 if (buf != NULL && 3242 copyout(&initpid, buf, bufsize) != 0) 3243 error = EFAULT; 3244 break; 3245 default: 3246 error = EINVAL; 3247 } 3248 zone_rele(zone); 3249 3250 if (error) 3251 return (set_errno(error)); 3252 return ((ssize_t)size); 3253 } 3254 3255 /* 3256 * Return zero if the process has at least one vnode mapped in to its 3257 * address space which shouldn't be allowed to change zones. 3258 */ 3259 static int 3260 as_can_change_zones(void) 3261 { 3262 proc_t *pp = curproc; 3263 struct seg *seg; 3264 struct as *as = pp->p_as; 3265 vnode_t *vp; 3266 int allow = 1; 3267 3268 ASSERT(pp->p_as != &kas); 3269 AS_LOCK_ENTER(&as, &as->a_lock, RW_READER); 3270 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 3271 /* 3272 * if we can't get a backing vnode for this segment then skip 3273 * it. 3274 */ 3275 vp = NULL; 3276 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL) 3277 continue; 3278 if (!vn_can_change_zones(vp)) { /* bail on first match */ 3279 allow = 0; 3280 break; 3281 } 3282 } 3283 AS_LOCK_EXIT(&as, &as->a_lock); 3284 return (allow); 3285 } 3286 3287 /* 3288 * Systemcall entry point for zone_enter(). 3289 * 3290 * The current process is injected into said zone. In the process 3291 * it will change its project membership, privileges, rootdir/cwd, 3292 * zone-wide rctls, and pool association to match those of the zone. 3293 * 3294 * The first zone_enter() called while the zone is in the ZONE_IS_READY 3295 * state will transition it to ZONE_IS_RUNNING. Processes may only 3296 * enter a zone that is "ready" or "running". 3297 */ 3298 static int 3299 zone_enter(zoneid_t zoneid) 3300 { 3301 zone_t *zone; 3302 vnode_t *vp; 3303 proc_t *pp = curproc; 3304 contract_t *ct; 3305 cont_process_t *ctp; 3306 task_t *tk, *oldtk; 3307 kproject_t *zone_proj0; 3308 cred_t *cr, *newcr; 3309 pool_t *oldpool, *newpool; 3310 sess_t *sp; 3311 uid_t uid; 3312 zone_status_t status; 3313 int err = 0; 3314 rctl_entity_p_t e; 3315 3316 if (secpolicy_zone_config(CRED()) != 0) 3317 return (set_errno(EPERM)); 3318 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 3319 return (set_errno(EINVAL)); 3320 3321 /* 3322 * Stop all lwps so we don't need to hold a lock to look at 3323 * curproc->p_zone. 
This needs to happen before we grab any 3324 * locks to avoid deadlock (another lwp in the process could 3325 * be waiting for the held lock). 3326 */ 3327 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) 3328 return (set_errno(EINTR)); 3329 3330 /* 3331 * Make sure we're not changing zones with files open or mapped in 3332 * to our address space which shouldn't be changing zones. 3333 */ 3334 if (!files_can_change_zones()) { 3335 err = EBADF; 3336 goto out; 3337 } 3338 if (!as_can_change_zones()) { 3339 err = EFAULT; 3340 goto out; 3341 } 3342 3343 mutex_enter(&zonehash_lock); 3344 if (pp->p_zone != global_zone) { 3345 mutex_exit(&zonehash_lock); 3346 err = EINVAL; 3347 goto out; 3348 } 3349 3350 zone = zone_find_all_by_id(zoneid); 3351 if (zone == NULL) { 3352 mutex_exit(&zonehash_lock); 3353 err = EINVAL; 3354 goto out; 3355 } 3356 3357 /* 3358 * To prevent processes in a zone from holding contracts on 3359 * extrazonal resources, and to avoid process contract 3360 * memberships which span zones, contract holders and processes 3361 * which aren't the sole members of their encapsulating process 3362 * contracts are not allowed to zone_enter. 3363 */ 3364 ctp = pp->p_ct_process; 3365 ct = &ctp->conp_contract; 3366 mutex_enter(&ct->ct_lock); 3367 mutex_enter(&pp->p_lock); 3368 if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) { 3369 mutex_exit(&pp->p_lock); 3370 mutex_exit(&ct->ct_lock); 3371 mutex_exit(&zonehash_lock); 3372 pool_unlock(); 3373 err = EINVAL; 3374 goto out; 3375 } 3376 3377 /* 3378 * Moreover, we don't allow processes whose encapsulating 3379 * process contracts have inherited extrazonal contracts. 3380 * While it would be easier to eliminate all process contracts 3381 * with inherited contracts, we need to be able to give a 3382 * restarted init (or other zone-penetrating process) its 3383 * predecessor's contracts. 3384 */ 3385 if (ctp->conp_ninherited != 0) { 3386 contract_t *next; 3387 for (next = list_head(&ctp->conp_inherited); next; 3388 next = list_next(&ctp->conp_inherited, next)) { 3389 if (contract_getzuniqid(next) != zone->zone_uniqid) { 3390 mutex_exit(&pp->p_lock); 3391 mutex_exit(&ct->ct_lock); 3392 mutex_exit(&zonehash_lock); 3393 pool_unlock(); 3394 err = EINVAL; 3395 goto out; 3396 } 3397 } 3398 } 3399 mutex_exit(&pp->p_lock); 3400 mutex_exit(&ct->ct_lock); 3401 3402 status = zone_status_get(zone); 3403 if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) { 3404 /* 3405 * Can't join 3406 */ 3407 mutex_exit(&zonehash_lock); 3408 err = EINVAL; 3409 goto out; 3410 } 3411 3412 /* 3413 * Make sure new priv set is within the permitted set for caller 3414 */ 3415 if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) { 3416 mutex_exit(&zonehash_lock); 3417 err = EPERM; 3418 goto out; 3419 } 3420 /* 3421 * We want to momentarily drop zonehash_lock while we optimistically 3422 * bind curproc to the pool it should be running in. This is safe 3423 * since the zone can't disappear (we have a hold on it). 3424 */ 3425 zone_hold(zone); 3426 mutex_exit(&zonehash_lock); 3427 3428 /* 3429 * Grab pool_lock to keep the pools configuration from changing 3430 * and to stop ourselves from getting rebound to another pool 3431 * until we join the zone. 3432 */ 3433 if (pool_lock_intr() != 0) { 3434 zone_rele(zone); 3435 err = EINTR; 3436 goto out; 3437 } 3438 ASSERT(secpolicy_pool(CRED()) == 0); 3439 /* 3440 * Bind ourselves to the pool currently associated with the zone. 
3441 */
3442 oldpool = curproc->p_pool;
3443 newpool = zone_pool_get(zone);
3444 if (pool_state == POOL_ENABLED && newpool != oldpool &&
3445 (err = pool_do_bind(newpool, P_PID, P_MYID,
3446 POOL_BIND_ALL)) != 0) {
3447 pool_unlock();
3448 zone_rele(zone);
3449 goto out;
3450 }
3451
3452 /*
3453 * Grab cpu_lock now; we'll need it later when we call
3454 * task_join().
3455 */
3456 mutex_enter(&cpu_lock);
3457 mutex_enter(&zonehash_lock);
3458 /*
3459 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
3460 */
3461 if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
3462 /*
3463 * Can't join anymore.
3464 */
3465 mutex_exit(&zonehash_lock);
3466 mutex_exit(&cpu_lock);
3467 if (pool_state == POOL_ENABLED &&
3468 newpool != oldpool)
3469 (void) pool_do_bind(oldpool, P_PID, P_MYID,
3470 POOL_BIND_ALL);
3471 pool_unlock();
3472 zone_rele(zone);
3473 err = EINVAL;
3474 goto out;
3475 }
3476
3477 mutex_enter(&pp->p_lock);
3478 zone_proj0 = zone->zone_zsched->p_task->tk_proj;
3479 /* verify that we do not exceed any task or lwp limits */
3480 mutex_enter(&zone->zone_nlwps_lock);
3481 /* add new lwps to zone and zone's proj0 */
3482 zone_proj0->kpj_nlwps += pp->p_lwpcnt;
3483 zone->zone_nlwps += pp->p_lwpcnt;
3484 /* add 1 task to zone's proj0 */
3485 zone_proj0->kpj_ntasks += 1;
3486 mutex_exit(&pp->p_lock);
3487 mutex_exit(&zone->zone_nlwps_lock);
3488
3489 /* remove lwps from proc's old zone and old project */
3490 mutex_enter(&pp->p_zone->zone_nlwps_lock);
3491 pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
3492 pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
3493 mutex_exit(&pp->p_zone->zone_nlwps_lock);
3494
3495 /*
3496 * Joining the zone cannot fail from now on.
3497 *
3498 * This means that a lot of the following code can be commonized and
3499 * shared with zsched().
3500 */
3501
3502 /*
3503 * Reset the encapsulating process contract's zone.
3504 */
3505 ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
3506 contract_setzuniqid(ct, zone->zone_uniqid);
3507
3508 /*
3509 * Create a new task and associate the process with the project keyed
3510 * by (projid,zoneid).
3511 *
3512 * We might as well be in project 0; the global zone's projid doesn't
3513 * make much sense in a zone anyhow.
3514 *
3515 * This also increments zone_ntasks, and returns with p_lock held.
3516 */
3517 tk = task_create(0, zone);
3518 oldtk = task_join(tk, 0);
3519 mutex_exit(&cpu_lock);
3520
3521 pp->p_flag |= SZONETOP;
3522 pp->p_zone = zone;
3523
3524 /*
3525 * call RCTLOP_SET functions on this proc
3526 */
3527 e.rcep_p.zone = zone;
3528 e.rcep_t = RCENTITY_ZONE;
3529 (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
3530 RCD_CALLBACK);
3531 mutex_exit(&pp->p_lock);
3532
3533 /*
3534 * We don't need to hold any of zsched's locks here; not only do we know
3535 * the process and zone aren't going away, we know its session isn't
3536 * changing either.
3537 *
3538 * By joining zsched's session here, we mimic the behavior in the
3539 * global zone of init's sid being the pid of sched. We extend this
3540 * to all zlogin-like zone_enter()'ing processes as well.
3541 */
3542 mutex_enter(&pidlock);
3543 sp = zone->zone_zsched->p_sessp;
3544 SESS_HOLD(sp);
3545 mutex_enter(&pp->p_lock);
3546 pgexit(pp);
3547 SESS_RELE(pp->p_sessp);
3548 pp->p_sessp = sp;
3549 pgjoin(pp, zone->zone_zsched->p_pidp);
3550 mutex_exit(&pp->p_lock);
3551 mutex_exit(&pidlock);
3552
3553 mutex_exit(&zonehash_lock);
3554 /*
3555 * We're firmly in the zone; let pools progress.
3556 */ 3557 pool_unlock(); 3558 task_rele(oldtk); 3559 /* 3560 * We don't need to retain a hold on the zone since we already 3561 * incremented zone_ntasks, so the zone isn't going anywhere. 3562 */ 3563 zone_rele(zone); 3564 3565 /* 3566 * Chroot 3567 */ 3568 vp = zone->zone_rootvp; 3569 zone_chdir(vp, &PTOU(pp)->u_cdir, pp); 3570 zone_chdir(vp, &PTOU(pp)->u_rdir, pp); 3571 3572 /* 3573 * Change process credentials 3574 */ 3575 newcr = cralloc(); 3576 mutex_enter(&pp->p_crlock); 3577 cr = pp->p_cred; 3578 crcopy_to(cr, newcr); 3579 crsetzone(newcr, zone); 3580 pp->p_cred = newcr; 3581 3582 /* 3583 * Restrict all process privilege sets to zone limit 3584 */ 3585 priv_intersect(zone->zone_privset, &CR_PPRIV(newcr)); 3586 priv_intersect(zone->zone_privset, &CR_EPRIV(newcr)); 3587 priv_intersect(zone->zone_privset, &CR_IPRIV(newcr)); 3588 priv_intersect(zone->zone_privset, &CR_LPRIV(newcr)); 3589 mutex_exit(&pp->p_crlock); 3590 crset(pp, newcr); 3591 3592 /* 3593 * Adjust upcount to reflect zone entry. 3594 */ 3595 uid = crgetruid(newcr); 3596 mutex_enter(&pidlock); 3597 upcount_dec(uid, GLOBAL_ZONEID); 3598 upcount_inc(uid, zoneid); 3599 mutex_exit(&pidlock); 3600 3601 /* 3602 * Set up core file path and content. 3603 */ 3604 set_core_defaults(); 3605 3606 out: 3607 /* 3608 * Let the other lwps continue. 3609 */ 3610 mutex_enter(&pp->p_lock); 3611 if (curthread != pp->p_agenttp) 3612 continuelwps(pp); 3613 mutex_exit(&pp->p_lock); 3614 3615 return (err != 0 ? set_errno(err) : 0); 3616 } 3617 3618 /* 3619 * Systemcall entry point for zone_list(2). 3620 * 3621 * Processes running in a (non-global) zone only see themselves. 3622 */ 3623 static int 3624 zone_list(zoneid_t *zoneidlist, uint_t *numzones) 3625 { 3626 zoneid_t *zoneids; 3627 zone_t *zone; 3628 uint_t user_nzones, real_nzones; 3629 int error = 0; 3630 uint_t i; 3631 3632 if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0) 3633 return (set_errno(EFAULT)); 3634 3635 if (curproc->p_zone != global_zone) { 3636 /* just return current zone */ 3637 real_nzones = 1; 3638 zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP); 3639 zoneids[0] = curproc->p_zone->zone_id; 3640 } else { 3641 mutex_enter(&zonehash_lock); 3642 real_nzones = zonecount; 3643 if (real_nzones) { 3644 zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t), 3645 KM_SLEEP); 3646 i = 0; 3647 for (zone = list_head(&zone_active); zone != NULL; 3648 zone = list_next(&zone_active, zone)) 3649 zoneids[i++] = zone->zone_id; 3650 ASSERT(i == real_nzones); 3651 } 3652 mutex_exit(&zonehash_lock); 3653 } 3654 3655 if (user_nzones > real_nzones) 3656 user_nzones = real_nzones; 3657 3658 if (copyout(&real_nzones, numzones, sizeof (uint_t)) != 0) 3659 error = EFAULT; 3660 else if (zoneidlist != NULL && user_nzones != 0) { 3661 if (copyout(zoneids, zoneidlist, 3662 user_nzones * sizeof (zoneid_t)) != 0) 3663 error = EFAULT; 3664 } 3665 3666 if (real_nzones) 3667 kmem_free(zoneids, real_nzones * sizeof (zoneid_t)); 3668 3669 if (error) 3670 return (set_errno(error)); 3671 else 3672 return (0); 3673 } 3674 3675 /* 3676 * Systemcall entry point for zone_lookup(2). 3677 * 3678 * Non-global zones are only able to see themselves. 
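 *
 * Behavior, by way of illustration: in the global zone,
 * zone_lookup("myzone") returns the id of a ready or running zone
 * named "myzone"; in a non-global zone only the zone's own name (or
 * a NULL argument, meaning the caller's own zone id) succeeds, and
 * anything else fails with EINVAL.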
3679 */ 3680 static zoneid_t 3681 zone_lookup(const char *zone_name) 3682 { 3683 char *kname; 3684 zone_t *zone; 3685 zoneid_t zoneid; 3686 int err; 3687 3688 if (zone_name == NULL) { 3689 /* return caller's zone id */ 3690 return (getzoneid()); 3691 } 3692 3693 kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP); 3694 if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) { 3695 kmem_free(kname, ZONENAME_MAX); 3696 return (set_errno(err)); 3697 } 3698 3699 mutex_enter(&zonehash_lock); 3700 zone = zone_find_all_by_name(kname); 3701 kmem_free(kname, ZONENAME_MAX); 3702 if (zone == NULL || zone_status_get(zone) < ZONE_IS_READY || 3703 (curproc->p_zone != global_zone && curproc->p_zone != zone)) { 3704 /* in non-global zone, can only lookup own name */ 3705 mutex_exit(&zonehash_lock); 3706 return (set_errno(EINVAL)); 3707 } 3708 zoneid = zone->zone_id; 3709 mutex_exit(&zonehash_lock); 3710 return (zoneid); 3711 } 3712 3713 /* ARGSUSED */ 3714 long 3715 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4, void *arg5) 3716 { 3717 zone_def zs; 3718 3719 switch (cmd) { 3720 case ZONE_CREATE: 3721 if (get_udatamodel() == DATAMODEL_NATIVE) { 3722 if (copyin(arg1, &zs, sizeof (zone_def))) { 3723 return (set_errno(EFAULT)); 3724 } 3725 } else { 3726 #ifdef _SYSCALL32_IMPL 3727 zone_def32 zs32; 3728 3729 if (copyin(arg1, &zs32, sizeof (zone_def32))) { 3730 return (set_errno(EFAULT)); 3731 } 3732 zs.zone_name = 3733 (const char *)(unsigned long)zs32.zone_name; 3734 zs.zone_root = 3735 (const char *)(unsigned long)zs32.zone_root; 3736 zs.zone_privs = 3737 (const struct priv_set *) 3738 (unsigned long)zs32.zone_privs; 3739 zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf; 3740 zs.rctlbufsz = zs32.rctlbufsz; 3741 zs.extended_error = 3742 (int *)(unsigned long)zs32.extended_error; 3743 #else 3744 panic("get_udatamodel() returned bogus result\n"); 3745 #endif 3746 } 3747 3748 return (zone_create(zs.zone_name, zs.zone_root, 3749 zs.zone_privs, (caddr_t)zs.rctlbuf, zs.rctlbufsz, 3750 zs.extended_error)); 3751 case ZONE_BOOT: 3752 return (zone_boot((zoneid_t)(uintptr_t)arg1, 3753 (const char *)arg2)); 3754 case ZONE_DESTROY: 3755 return (zone_destroy((zoneid_t)(uintptr_t)arg1)); 3756 case ZONE_GETATTR: 3757 return (zone_getattr((zoneid_t)(uintptr_t)arg1, 3758 (int)(uintptr_t)arg2, arg3, (size_t)arg4)); 3759 case ZONE_ENTER: 3760 return (zone_enter((zoneid_t)(uintptr_t)arg1)); 3761 case ZONE_LIST: 3762 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2)); 3763 case ZONE_SHUTDOWN: 3764 return (zone_shutdown((zoneid_t)(uintptr_t)arg1)); 3765 case ZONE_LOOKUP: 3766 return (zone_lookup((const char *)arg1)); 3767 default: 3768 return (set_errno(EINVAL)); 3769 } 3770 } 3771 3772 struct zarg { 3773 zone_t *zone; 3774 zone_cmd_arg_t arg; 3775 }; 3776 3777 static int 3778 zone_lookup_door(const char *zone_name, door_handle_t *doorp) 3779 { 3780 char *buf; 3781 size_t buflen; 3782 int error; 3783 3784 buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name); 3785 buf = kmem_alloc(buflen, KM_SLEEP); 3786 (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name); 3787 error = door_ki_open(buf, doorp); 3788 kmem_free(buf, buflen); 3789 return (error); 3790 } 3791 3792 static void 3793 zone_release_door(door_handle_t *doorp) 3794 { 3795 door_ki_rele(*doorp); 3796 *doorp = NULL; 3797 } 3798 3799 static void 3800 zone_ki_call_zoneadmd(struct zarg *zargp) 3801 { 3802 door_handle_t door = NULL; 3803 door_arg_t darg, save_arg; 3804 char *zone_name; 3805 size_t zone_namelen; 3806 zoneid_t zoneid; 3807 zone_t *zone; 3808 
zone_cmd_arg_t arg;
3809 uint64_t uniqid;
3810 size_t size;
3811 int error;
3812 int retry;
3813
3814 zone = zargp->zone;
3815 arg = zargp->arg;
3816 kmem_free(zargp, sizeof (*zargp));
3817
3818 zone_namelen = strlen(zone->zone_name) + 1;
3819 zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
3820 bcopy(zone->zone_name, zone_name, zone_namelen);
3821 zoneid = zone->zone_id;
3822 uniqid = zone->zone_uniqid;
3823 /*
3824 * zoneadmd may be down, but at least we can empty out the zone.
3825 * We can ignore the return value of zone_empty() since we're called
3826 * from a kernel thread and know we won't be delivered any signals.
3827 */
3828 ASSERT(curproc == &p0);
3829 (void) zone_empty(zone);
3830 ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
3831 zone_rele(zone);
3832
3833 size = sizeof (arg);
3834 darg.rbuf = (char *)&arg;
3835 darg.data_ptr = (char *)&arg;
3836 darg.rsize = size;
3837 darg.data_size = size;
3838 darg.desc_ptr = NULL;
3839 darg.desc_num = 0;
3840
3841 save_arg = darg;
3842 /*
3843 * Since we're not holding a reference to the zone, any number of
3844 * things can go wrong, including the zone disappearing before we get a
3845 * chance to talk to zoneadmd.
3846 */
3847 for (retry = 0; /* forever */; retry++) {
3848 if (door == NULL &&
3849 (error = zone_lookup_door(zone_name, &door)) != 0) {
3850 goto next;
3851 }
3852 ASSERT(door != NULL);
3853
3854 if ((error = door_ki_upcall(door, &darg)) == 0) {
3855 break;
3856 }
3857 switch (error) {
3858 case EINTR:
3859 /* FALLTHROUGH */
3860 case EAGAIN: /* process may be forking */
3861 /*
3862 * Back off for a bit
3863 */
3864 break;
3865 case EBADF:
3866 zone_release_door(&door);
3867 if (zone_lookup_door(zone_name, &door) != 0) {
3868 /*
3869 * zoneadmd may be dead, but it may come back to
3870 * life later.
3871 */
3872 break;
3873 }
3874 break;
3875 default:
3876 cmn_err(CE_WARN,
3877 "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
3878 error);
3879 goto out;
3880 }
3881 next:
3882 /*
3883 * If this isn't the same zone_t that we originally had in mind,
3884 * then this is the same as if two kadmin requests come in at
3885 * the same time: the first one wins. This means we lose, so we
3886 * bail.
3887 */
3888 if ((zone = zone_find_by_id(zoneid)) == NULL) {
3889 /*
3890 * Problem is solved.
3891 */
3892 break;
3893 }
3894 if (zone->zone_uniqid != uniqid) {
3895 /*
3896 * zoneid recycled
3897 */
3898 zone_rele(zone);
3899 break;
3900 }
3901 /*
3902 * We could zone_status_timedwait(), but there doesn't seem to
3903 * be much point in doing that (plus, it would mean that
3904 * zone_free() isn't called until this thread exits).
3905 */
3906 zone_rele(zone);
3907 delay(hz);
3908 darg = save_arg;
3909 }
3910 out:
3911 if (door != NULL) {
3912 zone_release_door(&door);
3913 }
3914 kmem_free(zone_name, zone_namelen);
3915 thread_exit();
3916 }
3917
3918 /*
3919 * Entry point for uadmin() to tell the zone to go away or reboot. The caller
3920 * is a process in the zone to be modified.
3921 *
3922 * In order to shut down the zone, we will hand off control to zoneadmd
3923 * (running in the global zone) via a door. We do a half-hearted job of
3924 * killing all processes in the zone, create a kernel thread to contact
3925 * zoneadmd, and make note of the "uniqid" of the zone. The uniqid is
3926 * a form of generation number used to let zoneadmd (as well as
3927 * zone_destroy()) know exactly which zone they're talking about.
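 *
 * (Illustratively, a halt(1M) or reboot(1M) issued inside a
 * non-global zone arrives here via uadmin(2).)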
3928 */ 3929 int 3930 zone_uadmin(int cmd, int fcn, cred_t *credp) 3931 { 3932 struct zarg *zargp; 3933 zone_cmd_t zcmd; 3934 zone_t *zone; 3935 3936 zone = curproc->p_zone; 3937 ASSERT(getzoneid() != GLOBAL_ZONEID); 3938 3939 switch (cmd) { 3940 case A_SHUTDOWN: 3941 switch (fcn) { 3942 case AD_HALT: 3943 case AD_POWEROFF: 3944 zcmd = Z_HALT; 3945 break; 3946 case AD_BOOT: 3947 zcmd = Z_REBOOT; 3948 break; 3949 case AD_IBOOT: 3950 case AD_SBOOT: 3951 case AD_SIBOOT: 3952 case AD_NOSYNC: 3953 return (ENOTSUP); 3954 default: 3955 return (EINVAL); 3956 } 3957 break; 3958 case A_REBOOT: 3959 zcmd = Z_REBOOT; 3960 break; 3961 case A_FTRACE: 3962 case A_REMOUNT: 3963 case A_FREEZE: 3964 case A_DUMP: 3965 return (ENOTSUP); 3966 default: 3967 ASSERT(cmd != A_SWAPCTL); /* handled by uadmin() */ 3968 return (EINVAL); 3969 } 3970 3971 if (secpolicy_zone_admin(credp, B_FALSE)) 3972 return (EPERM); 3973 mutex_enter(&zone_status_lock); 3974 /* 3975 * zone_status can't be ZONE_IS_EMPTY or higher since curproc 3976 * is in the zone. 3977 */ 3978 ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY); 3979 if (zone_status_get(zone) > ZONE_IS_RUNNING) { 3980 /* 3981 * This zone is already on its way down. 3982 */ 3983 mutex_exit(&zone_status_lock); 3984 return (0); 3985 } 3986 /* 3987 * Prevent future zone_enter()s 3988 */ 3989 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); 3990 mutex_exit(&zone_status_lock); 3991 3992 /* 3993 * Kill everyone now and call zoneadmd later. 3994 * zone_ki_call_zoneadmd() will do a more thorough job of this 3995 * later. 3996 */ 3997 killall(zone->zone_id); 3998 /* 3999 * Now, create the thread to contact zoneadmd and do the rest of the 4000 * work. This thread can't be created in our zone otherwise 4001 * zone_destroy() would deadlock. 4002 */ 4003 zargp = kmem_alloc(sizeof (*zargp), KM_SLEEP); 4004 zargp->arg.cmd = zcmd; 4005 zargp->arg.uniqid = zone->zone_uniqid; 4006 (void) strcpy(zargp->arg.locale, "C"); 4007 zone_hold(zargp->zone = zone); 4008 4009 (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0, 4010 TS_RUN, minclsyspri); 4011 exit(CLD_EXITED, 0); 4012 4013 return (EINVAL); 4014 } 4015 4016 /* 4017 * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's 4018 * status to ZONE_IS_SHUTTING_DOWN. 4019 */ 4020 void 4021 zone_shutdown_global(void) 4022 { 4023 ASSERT(curproc->p_zone == global_zone); 4024 4025 mutex_enter(&zone_status_lock); 4026 ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING); 4027 zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN); 4028 mutex_exit(&zone_status_lock); 4029 } 4030