/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Zones
 *
 * A zone is a named collection of processes, namespace constraints,
 * and other system resources which comprise a secure and manageable
 * application containment facility.
 *
 * Zones (represented by the reference counted zone_t) are tracked in
 * the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 * (zoneid_t) are used to track zone association.  Zone IDs are
 * dynamically generated when the zone is created; if a persistent
 * identifier is needed (core files, accounting logs, audit trail,
 * etc.), the zone name should be used.
 *
 *
 * Global Zone:
 *
 * The global zone (zoneid 0) is automatically associated with all
 * system resources that have not been bound to a user-created zone.
 * This means that even systems where zones are not in active use
 * have a global zone, and all processes, mounts, etc. are
 * associated with that zone.  The global zone is generally
 * unconstrained in terms of privileges and access, though the usual
 * credential and privilege based restrictions apply.
 *
 *
 * Zone States:
 *
 * The states a zone may be in, and the transitions between them, are
 * as follows:
 *
 * ZONE_IS_UNINITIALIZED: primordial state for a zone.  The partially
 * initialized zone is added to the list of active zones on the system but
 * isn't accessible.
 *
 * ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 * ready.  The zone is made visible after the ZSD constructor callbacks are
 * executed.  A zone remains in this state until it transitions into
 * the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 * ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 * init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 * state.
 *
 * ZONE_IS_RUNNING: The zone is open for business: zsched has
 * successfully started init.  A zone remains in this state until
 * zone_shutdown() is called.
 *
 * ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
 * killing all processes running in the zone.  The zone remains
 * in this state until there are no more user processes running in the zone.
 * zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 * Since zone_shutdown() is restartable, it may be called successfully
 * multiple times for the same zone_t.
 * Setting of the zone's state to ZONE_IS_SHUTTING_DOWN is synchronized
 * with mounts, so VOP_MOUNT() may check the zone's status without
 * worrying about it being a moving target.
 *
 * ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 * are no more user processes in the zone.  The zone remains in this
 * state until there are no more kernel threads associated with the
 * zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 * fail.
 *
 * ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 * have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 * join the zone or create kernel threads therein.
 *
 * ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 * remains in this state until zsched exits.  Calls to zone_find_by_*()
 * return NULL from now on.
 *
 * ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 * processes or threads doing work on behalf of the zone.  The zone is
 * removed from the list of active zones.  zone_destroy() returns, and
 * the zone can be recreated.
 *
 * ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 * callbacks are executed, and all memory associated with the zone is
 * freed.
 *
 * Threads can wait for the zone to enter a requested state by using
 * zone_status_wait() or zone_status_timedwait() with the desired
 * state passed in as an argument.  Zone state transitions are
 * uni-directional; it is not possible to move back to an earlier state.
 *
 *
 * Zone-Specific Data:
 *
 * Subsystems needing to maintain zone-specific data can store that
 * data using the ZSD mechanism.  This provides a zone-specific data
 * store, similar to thread-specific data (see pthread_getspecific(3C)
 * or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 * to register callbacks to be invoked when a zone is created, shut
 * down, or destroyed.  This can be used to initialize zone-specific
 * data for new zones and to clean up when zones go away.
 *
 *
 * Data Structures:
 *
 * The per-zone structure (zone_t) is reference counted, and freed
 * when all references are released.  zone_hold and zone_rele can be
 * used to adjust the reference count.  In addition, reference counts
 * associated with the cred_t structure are tracked separately using
 * zone_cred_hold and zone_cred_rele.
 *
 * Pointers to active zone_t's are stored in two hash tables; one
 * for searching by id, the other for searching by name.  Lookups
 * can be performed on either basis, using zone_find_by_id and
 * zone_find_by_name.  Both return zone_t pointers with the zone
 * held, so zone_rele should be called when the pointer is no longer
 * needed.  Zones can also be searched by path; zone_find_by_path
 * returns the zone with which a path name is associated (global
 * zone if the path is not within some other zone's file system
 * hierarchy).  This currently requires iterating through each zone,
 * so it is slower than an id or name search via a hash table.
 *
 *
 * Locking:
 *
 * zonehash_lock: This is a top-level global lock used to protect the
 * zone hash tables and lists.  Zones cannot be created or destroyed
 * while this lock is held.
 * zone_status_lock: This is a global lock protecting zone state.
 * Zones cannot change state while this lock is held.
 * It also protects the list of kernel threads associated with a zone.
 * zone_lock: This is a per-zone lock used to protect several fields of
 * the zone_t (see <sys/zone.h> for details).  In addition, holding
 * this lock means that the zone cannot go away.
 * zsd_key_lock: This is a global lock protecting the key state for ZSD.
 * zone_deathrow_lock: This is a global lock protecting the "deathrow"
 * list (a list of zones in the ZONE_IS_DEAD state).
 *
 * Ordering requirements:
 * pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 * zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 * Blocking memory allocations are permitted while holding any of the
 * zone locks.
 *
 *
 * System Call Interface:
 *
 * The zone subsystem can be managed and queried from user level with
 * the following system calls (all subcodes of the primary "zone"
 * system call):
 * - zone_create: creates a zone with selected attributes (name,
 *   root path, privileges, resource controls, ZFS datasets)
 * - zone_enter: allows the current process to enter a zone
 * - zone_getattr: reports attributes of a zone
 * - zone_list: lists all zones active in the system
 * - zone_lookup: looks up zone id based on name
 * - zone_shutdown: initiates shutdown process (see states above)
 * - zone_destroy: completes shutdown process (see states above)
 *
 */

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/log.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/fs/snode.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/zone.h>

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;	/* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except for by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;	/* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_nlwps;
/*
 * Synchronization primitives used to synchronize between mounts and zone
 * creation/destruction.
 */
static int mounts_in_progress;
static kcondvar_t mount_cv;
static kmutex_t mount_lock;

const char * const zone_initname = "/sbin/init";

static int zone_shutdown(zoneid_t zoneid);

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created such that
 * nfs_mount() thinks it is in the global zone, while by the time it
 * gets added to the list of mounted zones, it ends up on zoneA's mount
 * list.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone creation and new mounts.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone creations, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is unfair bias towards the
 * "current" operation.  This means that zone creations may starve if
 * there is a rapid succession of new mounts coming in to the system, or
 * there is a remote possibility that zones will be created at such a
 * rate that new mounts will not be able to proceed.
 */
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(void)
{
	int retval = 0;

	/*
	 * Since it may block for a long time, block_mounts() shouldn't be
	 * called with zonehash_lock held.
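	 *
	 * As an illustrative (hypothetical) calling pattern, not taken from
	 * this file: zone creation brackets the work that must not race with
	 * VOP_MOUNT(), and a zero return value from block_mounts() means the
	 * wait was interrupted by a signal:
	 *
	 *	if (block_mounts() == 0)
	 *		return (EINTR);
	 *	... create the zone, add it to the hash tables ...
	 *	resume_mounts();
	 *
	 * The VFS layer brackets each mount with mount_in_progress() and
	 * mount_completed() in the same fashion.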
	 */
	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
	mutex_enter(&mount_lock);
	while (mounts_in_progress > 0) {
		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
			goto signaled;
	}
	/*
	 * A negative value of mounts_in_progress indicates that mounts
	 * have been blocked by (-mounts_in_progress) different callers.
	 */
	mounts_in_progress--;
	retval = 1;
signaled:
	mutex_exit(&mount_lock);
	return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(void)
{
	mutex_enter(&mount_lock);
	if (++mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * The VFS layer is busy with a mount; zones should wait until all
 * mounts are completed to progress.
 */
void
mount_in_progress(void)
{
	mutex_enter(&mount_lock);
	while (mounts_in_progress < 0)
		cv_wait(&mount_cv, &mount_lock);
	mounts_in_progress++;
	mutex_exit(&mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(void)
{
	mutex_enter(&mount_lock);
	if (--mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shutdown, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  The
 * constructor callback is called immediately on all existing zones, and a
 * copy of the ZSD entry added to the per-zone zone_zsd list (protected by
 * zone_lock).  As this operation requires the list of zones, the list of
 * registered keys, and the per-zone list of ZSD entries to remain constant
 * throughout the entire operation, it must grab zonehash_lock, zone_lock for
 * all existing zones, and zsd_key_lock, in that order.  Similar locking is
 * needed when zone_key_delete() is called.  It is thus sufficient to hold
 * zsd_key_lock *or* zone_lock to prevent additions to or removals from the
 * per-zone zone_zsd list.
 *
 * Note that this implementation does not make a copy of the ZSD entry if a
 * constructor callback is not provided.  A zone_getspecific() on such an
 * uninitialized ZSD entry will return NULL.
 *
 * When new zones are created constructor callbacks for all registered ZSD
 * entries will be called.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
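 *
 * As a brief illustration (the subsystem, key, state structure, and callback
 * names here are hypothetical; only zone_key_create() and zone_getspecific()
 * are real interfaces), a subsystem would typically register its key once at
 * load time:
 *
 *	static zone_key_t foo_zone_key;
 *
 *	static void *
 *	foo_zone_init(zoneid_t zoneid)
 *	{
 *		return (kmem_zalloc(sizeof (struct foo_zone_state), KM_SLEEP));
 *	}
 *
 *	static void
 *	foo_zone_fini(zoneid_t zoneid, void *data)
 *	{
 *		kmem_free(data, sizeof (struct foo_zone_state));
 *	}
 *
 *	zone_key_create(&foo_zone_key, foo_zone_init, NULL, foo_zone_fini);
 *
 * and later retrieve its per-zone state with
 * zone_getspecific(foo_zone_key, zone).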
 */
void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	struct zone *zone;

	zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP);
	zsdp->zsd_data = NULL;
	zsdp->zsd_create = create;
	zsdp->zsd_shutdown = shutdown;
	zsdp->zsd_destroy = destroy;

	mutex_enter(&zonehash_lock);	/* stop the world */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_enter(&zone->zone_lock);	/* lock all zones */

	mutex_enter(&zsd_key_lock);
	*keyp = zsdp->zsd_key = ++zsd_keyval;
	ASSERT(zsd_keyval != 0);
	list_insert_tail(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	if (create != NULL) {
		for (zone = list_head(&zone_active); zone != NULL;
		    zone = list_next(&zone_active, zone)) {
			t = kmem_alloc(sizeof (*t), KM_SLEEP);
			t->zsd_key = *keyp;
			t->zsd_data = (*create)(zone->zone_id);
			t->zsd_create = create;
			t->zsd_shutdown = shutdown;
			t->zsd_destroy = destroy;
			list_insert_tail(&zone->zone_zsd, t);
		}
	}
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_exit(&zone->zone_lock);
	mutex_exit(&zonehash_lock);
}

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
	struct zsd_entry *zsd;

	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
		if (zsd->zsd_key == key) {
			/*
			 * Move to head of list to keep list in MRU order.
			 */
			if (zsd != list_head(l)) {
				list_remove(l, zsd);
				list_insert_head(l, zsd);
			}
			return (zsd);
		}
	}
	return (NULL);
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
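 *
 * Continuing the hypothetical example sketched above the ZSD routines, a
 * module's unload path would undo its registration with something like:
 *
 *	(void) zone_key_delete(foo_zone_key);
 *
 * which runs the shutdown and destroy callbacks for every extant zone
 * before the key is torn down.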
515 */ 516 int 517 zone_key_delete(zone_key_t key) 518 { 519 struct zsd_entry *zsdp = NULL; 520 zone_t *zone; 521 522 mutex_enter(&zonehash_lock); /* Zone create/delete waits for us */ 523 for (zone = list_head(&zone_active); zone != NULL; 524 zone = list_next(&zone_active, zone)) 525 mutex_enter(&zone->zone_lock); /* lock all zones */ 526 527 mutex_enter(&zsd_key_lock); 528 zsdp = zsd_find(&zsd_registered_keys, key); 529 if (zsdp == NULL) 530 goto notfound; 531 list_remove(&zsd_registered_keys, zsdp); 532 mutex_exit(&zsd_key_lock); 533 534 for (zone = list_head(&zone_active); zone != NULL; 535 zone = list_next(&zone_active, zone)) { 536 struct zsd_entry *del; 537 void *data; 538 539 if (!(zone->zone_flags & ZF_DESTROYED)) { 540 del = zsd_find(&zone->zone_zsd, key); 541 if (del != NULL) { 542 data = del->zsd_data; 543 ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown); 544 ASSERT(del->zsd_destroy == zsdp->zsd_destroy); 545 list_remove(&zone->zone_zsd, del); 546 kmem_free(del, sizeof (*del)); 547 } else { 548 data = NULL; 549 } 550 if (zsdp->zsd_shutdown) 551 zsdp->zsd_shutdown(zone->zone_id, data); 552 if (zsdp->zsd_destroy) 553 zsdp->zsd_destroy(zone->zone_id, data); 554 } 555 mutex_exit(&zone->zone_lock); 556 } 557 mutex_exit(&zonehash_lock); 558 kmem_free(zsdp, sizeof (*zsdp)); 559 return (0); 560 561 notfound: 562 mutex_exit(&zsd_key_lock); 563 for (zone = list_head(&zone_active); zone != NULL; 564 zone = list_next(&zone_active, zone)) 565 mutex_exit(&zone->zone_lock); 566 mutex_exit(&zonehash_lock); 567 return (-1); 568 } 569 570 /* 571 * ZSD counterpart of pthread_setspecific(). 572 */ 573 int 574 zone_setspecific(zone_key_t key, zone_t *zone, const void *data) 575 { 576 struct zsd_entry *t; 577 struct zsd_entry *zsdp = NULL; 578 579 mutex_enter(&zone->zone_lock); 580 t = zsd_find(&zone->zone_zsd, key); 581 if (t != NULL) { 582 /* 583 * Replace old value with new 584 */ 585 t->zsd_data = (void *)data; 586 mutex_exit(&zone->zone_lock); 587 return (0); 588 } 589 /* 590 * If there was no previous value, go through the list of registered 591 * keys. 592 * 593 * We avoid grabbing zsd_key_lock until we are sure we need it; this is 594 * necessary for shutdown callbacks to be able to execute without fear 595 * of deadlock. 596 */ 597 mutex_enter(&zsd_key_lock); 598 zsdp = zsd_find(&zsd_registered_keys, key); 599 if (zsdp == NULL) { /* Key was not registered */ 600 mutex_exit(&zsd_key_lock); 601 mutex_exit(&zone->zone_lock); 602 return (-1); 603 } 604 605 /* 606 * Add a zsd_entry to this zone, using the template we just retrieved 607 * to initialize the constructor and destructor(s). 608 */ 609 t = kmem_alloc(sizeof (*t), KM_SLEEP); 610 t->zsd_key = key; 611 t->zsd_data = (void *)data; 612 t->zsd_create = zsdp->zsd_create; 613 t->zsd_shutdown = zsdp->zsd_shutdown; 614 t->zsd_destroy = zsdp->zsd_destroy; 615 list_insert_tail(&zone->zone_zsd, t); 616 mutex_exit(&zsd_key_lock); 617 mutex_exit(&zone->zone_lock); 618 return (0); 619 } 620 621 /* 622 * ZSD counterpart of pthread_getspecific(). 623 */ 624 void * 625 zone_getspecific(zone_key_t key, zone_t *zone) 626 { 627 struct zsd_entry *t; 628 void *data; 629 630 mutex_enter(&zone->zone_lock); 631 t = zsd_find(&zone->zone_zsd, key); 632 data = (t == NULL ? NULL : t->zsd_data); 633 mutex_exit(&zone->zone_lock); 634 return (data); 635 } 636 637 /* 638 * Function used to initialize a zone's list of ZSD callbacks and data 639 * when the zone is being created. 
The callbacks are initialized from 640 * the template list (zsd_registered_keys), and the constructor 641 * callback executed (if one exists). 642 * 643 * This is called before the zone is made publicly available, hence no 644 * need to grab zone_lock. 645 * 646 * Although we grab and release zsd_key_lock, new entries cannot be 647 * added to or removed from the zsd_registered_keys list until we 648 * release zonehash_lock, so there isn't a window for a 649 * zone_key_create() to come in after we've dropped zsd_key_lock but 650 * before the zone is added to the zone list, such that the constructor 651 * callbacks aren't executed for the new zone. 652 */ 653 static void 654 zone_zsd_configure(zone_t *zone) 655 { 656 struct zsd_entry *zsdp; 657 struct zsd_entry *t; 658 zoneid_t zoneid = zone->zone_id; 659 660 ASSERT(MUTEX_HELD(&zonehash_lock)); 661 ASSERT(list_head(&zone->zone_zsd) == NULL); 662 mutex_enter(&zsd_key_lock); 663 for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL; 664 zsdp = list_next(&zsd_registered_keys, zsdp)) { 665 if (zsdp->zsd_create != NULL) { 666 t = kmem_alloc(sizeof (*t), KM_SLEEP); 667 t->zsd_key = zsdp->zsd_key; 668 t->zsd_create = zsdp->zsd_create; 669 t->zsd_data = (*t->zsd_create)(zoneid); 670 t->zsd_shutdown = zsdp->zsd_shutdown; 671 t->zsd_destroy = zsdp->zsd_destroy; 672 list_insert_tail(&zone->zone_zsd, t); 673 } 674 } 675 mutex_exit(&zsd_key_lock); 676 } 677 678 enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY }; 679 680 /* 681 * Helper function to execute shutdown or destructor callbacks. 682 */ 683 static void 684 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct) 685 { 686 struct zsd_entry *zsdp; 687 struct zsd_entry *t; 688 zoneid_t zoneid = zone->zone_id; 689 690 ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY); 691 ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY); 692 ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN); 693 694 mutex_enter(&zone->zone_lock); 695 if (ct == ZSD_DESTROY) { 696 if (zone->zone_flags & ZF_DESTROYED) { 697 /* 698 * Make sure destructors are only called once. 699 */ 700 mutex_exit(&zone->zone_lock); 701 return; 702 } 703 zone->zone_flags |= ZF_DESTROYED; 704 } 705 mutex_exit(&zone->zone_lock); 706 707 /* 708 * Both zsd_key_lock and zone_lock need to be held in order to add or 709 * remove a ZSD key, (either globally as part of 710 * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is 711 * possible through zone_setspecific()), so it's sufficient to hold 712 * zsd_key_lock here. 713 * 714 * This is a good thing, since we don't want to recursively try to grab 715 * zone_lock if a callback attempts to do something like a crfree() or 716 * zone_rele(). 717 */ 718 mutex_enter(&zsd_key_lock); 719 for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL; 720 zsdp = list_next(&zsd_registered_keys, zsdp)) { 721 zone_key_t key = zsdp->zsd_key; 722 723 /* Skip if no callbacks registered */ 724 if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL) 725 continue; 726 if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL) 727 continue; 728 /* 729 * Call the callback with the zone-specific data if we can find 730 * any, otherwise with NULL. 
731 */ 732 t = zsd_find(&zone->zone_zsd, key); 733 if (t != NULL) { 734 if (ct == ZSD_SHUTDOWN) { 735 t->zsd_shutdown(zoneid, t->zsd_data); 736 } else { 737 ASSERT(ct == ZSD_DESTROY); 738 t->zsd_destroy(zoneid, t->zsd_data); 739 } 740 } else { 741 if (ct == ZSD_SHUTDOWN) { 742 zsdp->zsd_shutdown(zoneid, NULL); 743 } else { 744 ASSERT(ct == ZSD_DESTROY); 745 zsdp->zsd_destroy(zoneid, NULL); 746 } 747 } 748 } 749 mutex_exit(&zsd_key_lock); 750 } 751 752 /* 753 * Called when the zone is going away; free ZSD-related memory, and 754 * destroy the zone_zsd list. 755 */ 756 static void 757 zone_free_zsd(zone_t *zone) 758 { 759 struct zsd_entry *t, *next; 760 761 /* 762 * Free all the zsd_entry's we had on this zone. 763 */ 764 for (t = list_head(&zone->zone_zsd); t != NULL; t = next) { 765 next = list_next(&zone->zone_zsd, t); 766 list_remove(&zone->zone_zsd, t); 767 kmem_free(t, sizeof (*t)); 768 } 769 list_destroy(&zone->zone_zsd); 770 } 771 772 /* 773 * Frees memory associated with the zone dataset list. 774 */ 775 static void 776 zone_free_datasets(zone_t *zone) 777 { 778 zone_dataset_t *t, *next; 779 780 for (t = list_head(&zone->zone_datasets); t != NULL; t = next) { 781 next = list_next(&zone->zone_datasets, t); 782 list_remove(&zone->zone_datasets, t); 783 kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1); 784 kmem_free(t, sizeof (*t)); 785 } 786 list_destroy(&zone->zone_datasets); 787 } 788 789 /* 790 * zone.cpu-shares resource control support. 791 */ 792 /*ARGSUSED*/ 793 static rctl_qty_t 794 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p) 795 { 796 ASSERT(MUTEX_HELD(&p->p_lock)); 797 return (p->p_zone->zone_shares); 798 } 799 800 /*ARGSUSED*/ 801 static int 802 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, 803 rctl_qty_t nv) 804 { 805 ASSERT(MUTEX_HELD(&p->p_lock)); 806 ASSERT(e->rcep_t == RCENTITY_ZONE); 807 if (e->rcep_p.zone == NULL) 808 return (0); 809 810 e->rcep_p.zone->zone_shares = nv; 811 return (0); 812 } 813 814 static rctl_ops_t zone_cpu_shares_ops = { 815 rcop_no_action, 816 zone_cpu_shares_usage, 817 zone_cpu_shares_set, 818 rcop_no_test 819 }; 820 821 /*ARGSUSED*/ 822 static rctl_qty_t 823 zone_lwps_usage(rctl_t *r, proc_t *p) 824 { 825 rctl_qty_t nlwps; 826 zone_t *zone = p->p_zone; 827 828 ASSERT(MUTEX_HELD(&p->p_lock)); 829 830 mutex_enter(&zone->zone_nlwps_lock); 831 nlwps = zone->zone_nlwps; 832 mutex_exit(&zone->zone_nlwps_lock); 833 834 return (nlwps); 835 } 836 837 /*ARGSUSED*/ 838 static int 839 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl, 840 rctl_qty_t incr, uint_t flags) 841 { 842 rctl_qty_t nlwps; 843 844 ASSERT(MUTEX_HELD(&p->p_lock)); 845 ASSERT(e->rcep_t == RCENTITY_ZONE); 846 if (e->rcep_p.zone == NULL) 847 return (0); 848 ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock))); 849 nlwps = e->rcep_p.zone->zone_nlwps; 850 851 if (nlwps + incr > rcntl->rcv_value) 852 return (1); 853 854 return (0); 855 } 856 857 /*ARGSUSED*/ 858 static int 859 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv) { 860 861 ASSERT(MUTEX_HELD(&p->p_lock)); 862 ASSERT(e->rcep_t == RCENTITY_ZONE); 863 if (e->rcep_p.zone == NULL) 864 return (0); 865 e->rcep_p.zone->zone_nlwps_ctl = nv; 866 return (0); 867 } 868 869 static rctl_ops_t zone_lwps_ops = { 870 rcop_no_action, 871 zone_lwps_usage, 872 zone_lwps_set, 873 zone_lwps_test, 874 }; 875 876 /* 877 * Helper function to brand the zone with a unique ID. 
878 */ 879 static void 880 zone_uniqid(zone_t *zone) 881 { 882 static uint64_t uniqid = 0; 883 884 ASSERT(MUTEX_HELD(&zonehash_lock)); 885 zone->zone_uniqid = uniqid++; 886 } 887 888 /* 889 * Returns a held pointer to the "kcred" for the specified zone. 890 */ 891 struct cred * 892 zone_get_kcred(zoneid_t zoneid) 893 { 894 zone_t *zone; 895 cred_t *cr; 896 897 if ((zone = zone_find_by_id(zoneid)) == NULL) 898 return (NULL); 899 cr = zone->zone_kcred; 900 crhold(cr); 901 zone_rele(zone); 902 return (cr); 903 } 904 905 /* 906 * Called very early on in boot to initialize the ZSD list so that 907 * zone_key_create() can be called before zone_init(). It also initializes 908 * portions of zone0 which may be used before zone_init() is called. The 909 * variable "global_zone" will be set when zone0 is fully initialized by 910 * zone_init(). 911 */ 912 void 913 zone_zsd_init(void) 914 { 915 mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL); 916 mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL); 917 list_create(&zsd_registered_keys, sizeof (struct zsd_entry), 918 offsetof(struct zsd_entry, zsd_linkage)); 919 list_create(&zone_active, sizeof (zone_t), 920 offsetof(zone_t, zone_linkage)); 921 list_create(&zone_deathrow, sizeof (zone_t), 922 offsetof(zone_t, zone_linkage)); 923 924 mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL); 925 mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); 926 zone0.zone_shares = 1; 927 zone0.zone_nlwps_ctl = INT_MAX; 928 zone0.zone_name = GLOBAL_ZONENAME; 929 zone0.zone_nodename = utsname.nodename; 930 zone0.zone_domain = srpc_domain; 931 zone0.zone_ref = 1; 932 zone0.zone_id = GLOBAL_ZONEID; 933 zone0.zone_status = ZONE_IS_RUNNING; 934 zone0.zone_rootpath = "/"; 935 zone0.zone_rootpathlen = 2; 936 zone0.zone_psetid = ZONE_PS_INVAL; 937 zone0.zone_ncpus = 0; 938 zone0.zone_ncpus_online = 0; 939 zone0.zone_proc_initpid = 1; 940 list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), 941 offsetof(struct zsd_entry, zsd_linkage)); 942 list_insert_head(&zone_active, &zone0); 943 944 /* 945 * The root filesystem is not mounted yet, so zone_rootvp cannot be set 946 * to anything meaningful. It is assigned to be 'rootdir' in 947 * vfs_mountroot(). 948 */ 949 zone0.zone_rootvp = NULL; 950 zone0.zone_vfslist = NULL; 951 zone0.zone_bootargs = NULL; 952 zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP); 953 /* 954 * The global zone has all privileges 955 */ 956 priv_fillset(zone0.zone_privset); 957 /* 958 * Add p0 to the global zone 959 */ 960 zone0.zone_zsched = &p0; 961 p0.p_zone = &zone0; 962 } 963 964 /* 965 * Called by main() to initialize the zones framework. 966 */ 967 void 968 zone_init(void) 969 { 970 rctl_dict_entry_t *rde; 971 rctl_val_t *dval; 972 rctl_set_t *set; 973 rctl_alloc_gp_t *gp; 974 rctl_entity_p_t e; 975 976 ASSERT(curproc == &p0); 977 978 /* 979 * Create ID space for zone IDs. ID 0 is reserved for the 980 * global zone. 981 */ 982 zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID); 983 984 /* 985 * Initialize generic zone resource controls, if any. 
986 */ 987 rc_zone_cpu_shares = rctl_register("zone.cpu-shares", 988 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | 989 RCTL_GLOBAL_NOBASIC | 990 RCTL_GLOBAL_COUNT, FSS_MAXSHARES, FSS_MAXSHARES, 991 &zone_cpu_shares_ops); 992 993 rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, 994 RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, 995 INT_MAX, INT_MAX, &zone_lwps_ops); 996 /* 997 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach 998 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''. 999 */ 1000 dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); 1001 bzero(dval, sizeof (rctl_val_t)); 1002 dval->rcv_value = 1; 1003 dval->rcv_privilege = RCPRIV_PRIVILEGED; 1004 dval->rcv_flagaction = RCTL_LOCAL_NOACTION; 1005 dval->rcv_action_recip_pid = -1; 1006 1007 rde = rctl_dict_lookup("zone.cpu-shares"); 1008 (void) rctl_val_list_insert(&rde->rcd_default_value, dval); 1009 1010 /* 1011 * Initialize the ``global zone''. 1012 */ 1013 set = rctl_set_create(); 1014 gp = rctl_set_init_prealloc(RCENTITY_ZONE); 1015 mutex_enter(&p0.p_lock); 1016 e.rcep_p.zone = &zone0; 1017 e.rcep_t = RCENTITY_ZONE; 1018 zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set, 1019 gp); 1020 1021 zone0.zone_nlwps = p0.p_lwpcnt; 1022 zone0.zone_ntasks = 1; 1023 mutex_exit(&p0.p_lock); 1024 rctl_prealloc_destroy(gp); 1025 /* 1026 * pool_default hasn't been initialized yet, so we let pool_init() take 1027 * care of making the global zone is in the default pool. 1028 */ 1029 mutex_enter(&zonehash_lock); 1030 zone_uniqid(&zone0); 1031 ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID); 1032 mutex_exit(&zonehash_lock); 1033 zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size, 1034 mod_hash_null_valdtor); 1035 zonehashbyname = mod_hash_create_strhash("zone_by_name", 1036 zone_hash_size, mod_hash_null_valdtor); 1037 zonecount = 1; 1038 1039 (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID, 1040 (mod_hash_val_t)&zone0); 1041 (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name, 1042 (mod_hash_val_t)&zone0); 1043 /* 1044 * We avoid setting zone_kcred until now, since kcred is initialized 1045 * sometime after zone_zsd_init() and before zone_init(). 1046 */ 1047 zone0.zone_kcred = kcred; 1048 /* 1049 * The global zone is fully initialized (except for zone_rootvp which 1050 * will be set when the root filesystem is mounted). 
1051 */ 1052 global_zone = &zone0; 1053 } 1054 1055 static void 1056 zone_free(zone_t *zone) 1057 { 1058 ASSERT(zone != global_zone); 1059 ASSERT(zone->zone_ntasks == 0); 1060 ASSERT(zone->zone_nlwps == 0); 1061 ASSERT(zone->zone_cred_ref == 0); 1062 ASSERT(zone->zone_kcred == NULL); 1063 ASSERT(zone_status_get(zone) == ZONE_IS_DEAD || 1064 zone_status_get(zone) == ZONE_IS_UNINITIALIZED); 1065 1066 /* remove from deathrow list */ 1067 if (zone_status_get(zone) == ZONE_IS_DEAD) { 1068 ASSERT(zone->zone_ref == 0); 1069 mutex_enter(&zone_deathrow_lock); 1070 list_remove(&zone_deathrow, zone); 1071 mutex_exit(&zone_deathrow_lock); 1072 } 1073 1074 zone_free_zsd(zone); 1075 zone_free_datasets(zone); 1076 1077 if (zone->zone_rootvp != NULL) 1078 VN_RELE(zone->zone_rootvp); 1079 if (zone->zone_rootpath) 1080 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen); 1081 if (zone->zone_name != NULL) 1082 kmem_free(zone->zone_name, ZONENAME_MAX); 1083 if (zone->zone_nodename != NULL) 1084 kmem_free(zone->zone_nodename, _SYS_NMLN); 1085 if (zone->zone_domain != NULL) 1086 kmem_free(zone->zone_domain, _SYS_NMLN); 1087 if (zone->zone_privset != NULL) 1088 kmem_free(zone->zone_privset, sizeof (priv_set_t)); 1089 if (zone->zone_rctls != NULL) 1090 rctl_set_free(zone->zone_rctls); 1091 if (zone->zone_bootargs != NULL) 1092 kmem_free(zone->zone_bootargs, ZONEBOOTARGS_MAX); 1093 id_free(zoneid_space, zone->zone_id); 1094 mutex_destroy(&zone->zone_lock); 1095 cv_destroy(&zone->zone_cv); 1096 kmem_free(zone, sizeof (zone_t)); 1097 } 1098 1099 /* 1100 * See block comment at the top of this file for information about zone 1101 * status values. 1102 */ 1103 /* 1104 * Convenience function for setting zone status. 1105 */ 1106 static void 1107 zone_status_set(zone_t *zone, zone_status_t status) 1108 { 1109 ASSERT(MUTEX_HELD(&zone_status_lock)); 1110 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && 1111 status >= zone_status_get(zone)); 1112 zone->zone_status = status; 1113 cv_broadcast(&zone->zone_cv); 1114 } 1115 1116 /* 1117 * Public function to retrieve the zone status. The zone status may 1118 * change after it is retrieved. 1119 */ 1120 zone_status_t 1121 zone_status_get(zone_t *zone) 1122 { 1123 return (zone->zone_status); 1124 } 1125 1126 static int 1127 zone_set_bootargs(zone_t *zone, const char *zone_bootargs) 1128 { 1129 char *bootargs = kmem_zalloc(ZONEBOOTARGS_MAX, KM_SLEEP); 1130 size_t len; 1131 int err; 1132 1133 err = copyinstr(zone_bootargs, bootargs, ZONEBOOTARGS_MAX - 1, &len); 1134 if (err != 0) { 1135 kmem_free(bootargs, ZONEBOOTARGS_MAX); 1136 return (err); /* EFAULT or ENAMETOOLONG */ 1137 } 1138 bootargs[len] = '\0'; 1139 1140 ASSERT(zone->zone_bootargs == NULL); 1141 zone->zone_bootargs = bootargs; 1142 return (0); 1143 } 1144 1145 /* 1146 * Block indefinitely waiting for (zone_status >= status) 1147 */ 1148 void 1149 zone_status_wait(zone_t *zone, zone_status_t status) 1150 { 1151 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); 1152 1153 mutex_enter(&zone_status_lock); 1154 while (zone->zone_status < status) { 1155 cv_wait(&zone->zone_cv, &zone_status_lock); 1156 } 1157 mutex_exit(&zone_status_lock); 1158 } 1159 1160 /* 1161 * Private CPR-safe version of zone_status_wait(). 
1162 */ 1163 static void 1164 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str) 1165 { 1166 callb_cpr_t cprinfo; 1167 1168 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); 1169 1170 CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr, 1171 str); 1172 mutex_enter(&zone_status_lock); 1173 while (zone->zone_status < status) { 1174 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1175 cv_wait(&zone->zone_cv, &zone_status_lock); 1176 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock); 1177 } 1178 /* 1179 * zone_status_lock is implicitly released by the following. 1180 */ 1181 CALLB_CPR_EXIT(&cprinfo); 1182 } 1183 1184 /* 1185 * Block until zone enters requested state or signal is received. Return (0) 1186 * if signaled, non-zero otherwise. 1187 */ 1188 int 1189 zone_status_wait_sig(zone_t *zone, zone_status_t status) 1190 { 1191 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); 1192 1193 mutex_enter(&zone_status_lock); 1194 while (zone->zone_status < status) { 1195 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) { 1196 mutex_exit(&zone_status_lock); 1197 return (0); 1198 } 1199 } 1200 mutex_exit(&zone_status_lock); 1201 return (1); 1202 } 1203 1204 /* 1205 * Block until the zone enters the requested state or the timeout expires, 1206 * whichever happens first. Return (-1) if operation timed out, time remaining 1207 * otherwise. 1208 */ 1209 clock_t 1210 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status) 1211 { 1212 clock_t timeleft = 0; 1213 1214 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); 1215 1216 mutex_enter(&zone_status_lock); 1217 while (zone->zone_status < status && timeleft != -1) { 1218 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim); 1219 } 1220 mutex_exit(&zone_status_lock); 1221 return (timeleft); 1222 } 1223 1224 /* 1225 * Block until the zone enters the requested state, the current process is 1226 * signaled, or the timeout expires, whichever happens first. Return (-1) if 1227 * operation timed out, 0 if signaled, time remaining otherwise. 1228 */ 1229 clock_t 1230 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status) 1231 { 1232 clock_t timeleft = tim - lbolt; 1233 1234 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); 1235 1236 mutex_enter(&zone_status_lock); 1237 while (zone->zone_status < status) { 1238 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock, 1239 tim); 1240 if (timeleft <= 0) 1241 break; 1242 } 1243 mutex_exit(&zone_status_lock); 1244 return (timeleft); 1245 } 1246 1247 /* 1248 * Zones have two reference counts: one for references from credential 1249 * structures (zone_cred_ref), and one (zone_ref) for everything else. 1250 * This is so we can allow a zone to be rebooted while there are still 1251 * outstanding cred references, since certain drivers cache dblks (which 1252 * implicitly results in cached creds). We wait for zone_ref to drop to 1253 * 0 (actually 1), but not zone_cred_ref. The zone structure itself is 1254 * later freed when the zone_cred_ref drops to 0, though nothing other 1255 * than the zone id and privilege set should be accessed once the zone 1256 * is "dead". 1257 * 1258 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value 1259 * to force halt/reboot to block waiting for the zone_cred_ref to drop 1260 * to 0. This can be useful to flush out other sources of cached creds 1261 * that may be less innocuous than the driver case. 
1262 */ 1263 1264 int zone_wait_for_cred = 0; 1265 1266 static void 1267 zone_hold_locked(zone_t *z) 1268 { 1269 ASSERT(MUTEX_HELD(&z->zone_lock)); 1270 z->zone_ref++; 1271 ASSERT(z->zone_ref != 0); 1272 } 1273 1274 void 1275 zone_hold(zone_t *z) 1276 { 1277 mutex_enter(&z->zone_lock); 1278 zone_hold_locked(z); 1279 mutex_exit(&z->zone_lock); 1280 } 1281 1282 /* 1283 * If the non-cred ref count drops to 1 and either the cred ref count 1284 * is 0 or we aren't waiting for cred references, the zone is ready to 1285 * be destroyed. 1286 */ 1287 #define ZONE_IS_UNREF(zone) ((zone)->zone_ref == 1 && \ 1288 (!zone_wait_for_cred || (zone)->zone_cred_ref == 0)) 1289 1290 void 1291 zone_rele(zone_t *z) 1292 { 1293 boolean_t wakeup; 1294 1295 mutex_enter(&z->zone_lock); 1296 ASSERT(z->zone_ref != 0); 1297 z->zone_ref--; 1298 if (z->zone_ref == 0 && z->zone_cred_ref == 0) { 1299 /* no more refs, free the structure */ 1300 mutex_exit(&z->zone_lock); 1301 zone_free(z); 1302 return; 1303 } 1304 /* signal zone_destroy so the zone can finish halting */ 1305 wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD); 1306 mutex_exit(&z->zone_lock); 1307 1308 if (wakeup) { 1309 /* 1310 * Grabbing zonehash_lock here effectively synchronizes with 1311 * zone_destroy() to avoid missed signals. 1312 */ 1313 mutex_enter(&zonehash_lock); 1314 cv_broadcast(&zone_destroy_cv); 1315 mutex_exit(&zonehash_lock); 1316 } 1317 } 1318 1319 void 1320 zone_cred_hold(zone_t *z) 1321 { 1322 mutex_enter(&z->zone_lock); 1323 z->zone_cred_ref++; 1324 ASSERT(z->zone_cred_ref != 0); 1325 mutex_exit(&z->zone_lock); 1326 } 1327 1328 void 1329 zone_cred_rele(zone_t *z) 1330 { 1331 boolean_t wakeup; 1332 1333 mutex_enter(&z->zone_lock); 1334 ASSERT(z->zone_cred_ref != 0); 1335 z->zone_cred_ref--; 1336 if (z->zone_ref == 0 && z->zone_cred_ref == 0) { 1337 /* no more refs, free the structure */ 1338 mutex_exit(&z->zone_lock); 1339 zone_free(z); 1340 return; 1341 } 1342 /* 1343 * If zone_destroy is waiting for the cred references to drain 1344 * out, and they have, signal it. 1345 */ 1346 wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) && 1347 zone_status_get(z) >= ZONE_IS_DEAD); 1348 mutex_exit(&z->zone_lock); 1349 1350 if (wakeup) { 1351 /* 1352 * Grabbing zonehash_lock here effectively synchronizes with 1353 * zone_destroy() to avoid missed signals. 1354 */ 1355 mutex_enter(&zonehash_lock); 1356 cv_broadcast(&zone_destroy_cv); 1357 mutex_exit(&zonehash_lock); 1358 } 1359 } 1360 1361 void 1362 zone_task_hold(zone_t *z) 1363 { 1364 mutex_enter(&z->zone_lock); 1365 z->zone_ntasks++; 1366 ASSERT(z->zone_ntasks != 0); 1367 mutex_exit(&z->zone_lock); 1368 } 1369 1370 void 1371 zone_task_rele(zone_t *zone) 1372 { 1373 uint_t refcnt; 1374 1375 mutex_enter(&zone->zone_lock); 1376 ASSERT(zone->zone_ntasks != 0); 1377 refcnt = --zone->zone_ntasks; 1378 if (refcnt > 1) { /* Common case */ 1379 mutex_exit(&zone->zone_lock); 1380 return; 1381 } 1382 zone_hold_locked(zone); /* so we can use the zone_t later */ 1383 mutex_exit(&zone->zone_lock); 1384 if (refcnt == 1) { 1385 /* 1386 * See if the zone is shutting down. 1387 */ 1388 mutex_enter(&zone_status_lock); 1389 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) { 1390 goto out; 1391 } 1392 1393 /* 1394 * Make sure the ntasks didn't change since we 1395 * dropped zone_lock. 
1396 */ 1397 mutex_enter(&zone->zone_lock); 1398 if (refcnt != zone->zone_ntasks) { 1399 mutex_exit(&zone->zone_lock); 1400 goto out; 1401 } 1402 mutex_exit(&zone->zone_lock); 1403 1404 /* 1405 * No more user processes in the zone. The zone is empty. 1406 */ 1407 zone_status_set(zone, ZONE_IS_EMPTY); 1408 goto out; 1409 } 1410 1411 ASSERT(refcnt == 0); 1412 /* 1413 * zsched has exited; the zone is dead. 1414 */ 1415 zone->zone_zsched = NULL; /* paranoia */ 1416 mutex_enter(&zone_status_lock); 1417 zone_status_set(zone, ZONE_IS_DEAD); 1418 out: 1419 mutex_exit(&zone_status_lock); 1420 zone_rele(zone); 1421 } 1422 1423 zoneid_t 1424 getzoneid(void) 1425 { 1426 return (curproc->p_zone->zone_id); 1427 } 1428 1429 /* 1430 * Internal versions of zone_find_by_*(). These don't zone_hold() or 1431 * check the validity of a zone's state. 1432 */ 1433 static zone_t * 1434 zone_find_all_by_id(zoneid_t zoneid) 1435 { 1436 mod_hash_val_t hv; 1437 zone_t *zone = NULL; 1438 1439 ASSERT(MUTEX_HELD(&zonehash_lock)); 1440 1441 if (mod_hash_find(zonehashbyid, 1442 (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0) 1443 zone = (zone_t *)hv; 1444 return (zone); 1445 } 1446 1447 static zone_t * 1448 zone_find_all_by_name(char *name) 1449 { 1450 mod_hash_val_t hv; 1451 zone_t *zone = NULL; 1452 1453 ASSERT(MUTEX_HELD(&zonehash_lock)); 1454 1455 if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0) 1456 zone = (zone_t *)hv; 1457 return (zone); 1458 } 1459 1460 /* 1461 * Public interface for looking up a zone by zoneid. Only returns the zone if 1462 * it is fully initialized, and has not yet begun the zone_destroy() sequence. 1463 * Caller must call zone_rele() once it is done with the zone. 1464 * 1465 * The zone may begin the zone_destroy() sequence immediately after this 1466 * function returns, but may be safely used until zone_rele() is called. 1467 */ 1468 zone_t * 1469 zone_find_by_id(zoneid_t zoneid) 1470 { 1471 zone_t *zone; 1472 zone_status_t status; 1473 1474 mutex_enter(&zonehash_lock); 1475 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 1476 mutex_exit(&zonehash_lock); 1477 return (NULL); 1478 } 1479 status = zone_status_get(zone); 1480 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) { 1481 /* 1482 * For all practical purposes the zone doesn't exist. 1483 */ 1484 mutex_exit(&zonehash_lock); 1485 return (NULL); 1486 } 1487 zone_hold(zone); 1488 mutex_exit(&zonehash_lock); 1489 return (zone); 1490 } 1491 1492 /* 1493 * Similar to zone_find_by_id, but using zone name as the key. 1494 */ 1495 zone_t * 1496 zone_find_by_name(char *name) 1497 { 1498 zone_t *zone; 1499 zone_status_t status; 1500 1501 mutex_enter(&zonehash_lock); 1502 if ((zone = zone_find_all_by_name(name)) == NULL) { 1503 mutex_exit(&zonehash_lock); 1504 return (NULL); 1505 } 1506 status = zone_status_get(zone); 1507 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) { 1508 /* 1509 * For all practical purposes the zone doesn't exist. 1510 */ 1511 mutex_exit(&zonehash_lock); 1512 return (NULL); 1513 } 1514 zone_hold(zone); 1515 mutex_exit(&zonehash_lock); 1516 return (zone); 1517 } 1518 1519 /* 1520 * Similar to zone_find_by_id(), using the path as a key. For instance, 1521 * if there is a zone "foo" rooted at /foo/root, and the path argument 1522 * is "/foo/root/proc", it will return the held zone_t corresponding to 1523 * zone "foo". 1524 * 1525 * zone_find_by_path() always returns a non-NULL value, since at the 1526 * very least every path will be contained in the global zone. 
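 *
 * A minimal sketch of the lookup/release discipline shared by these
 * functions (the surrounding caller code is hypothetical):
 *
 *	zone_t *zp;
 *
 *	if ((zp = zone_find_by_id(zoneid)) == NULL)
 *		return (EINVAL);
 *	... use zp; the zone cannot be freed while the hold is active ...
 *	zone_rele(zp);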
1527 * 1528 * As with the other zone_find_by_*() functions, the caller is 1529 * responsible for zone_rele()ing the return value of this function. 1530 */ 1531 zone_t * 1532 zone_find_by_path(const char *path) 1533 { 1534 zone_t *zone; 1535 zone_t *zret = NULL; 1536 zone_status_t status; 1537 1538 if (path == NULL) { 1539 /* 1540 * Call from rootconf(). 1541 */ 1542 zone_hold(global_zone); 1543 return (global_zone); 1544 } 1545 ASSERT(*path == '/'); 1546 mutex_enter(&zonehash_lock); 1547 for (zone = list_head(&zone_active); zone != NULL; 1548 zone = list_next(&zone_active, zone)) { 1549 if (ZONE_PATH_VISIBLE(path, zone)) 1550 zret = zone; 1551 } 1552 ASSERT(zret != NULL); 1553 status = zone_status_get(zret); 1554 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) { 1555 /* 1556 * Zone practically doesn't exist. 1557 */ 1558 zret = global_zone; 1559 } 1560 zone_hold(zret); 1561 mutex_exit(&zonehash_lock); 1562 return (zret); 1563 } 1564 1565 /* 1566 * Get the number of cpus visible to this zone. The system-wide global 1567 * 'ncpus' is returned if pools are disabled, the caller is in the 1568 * global zone, or a NULL zone argument is passed in. 1569 */ 1570 int 1571 zone_ncpus_get(zone_t *zone) 1572 { 1573 int myncpus = zone == NULL ? 0 : zone->zone_ncpus; 1574 1575 return (myncpus != 0 ? myncpus : ncpus); 1576 } 1577 1578 /* 1579 * Get the number of online cpus visible to this zone. The system-wide 1580 * global 'ncpus_online' is returned if pools are disabled, the caller 1581 * is in the global zone, or a NULL zone argument is passed in. 1582 */ 1583 int 1584 zone_ncpus_online_get(zone_t *zone) 1585 { 1586 int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online; 1587 1588 return (myncpus_online != 0 ? myncpus_online : ncpus_online); 1589 } 1590 1591 /* 1592 * Return the pool to which the zone is currently bound. 1593 */ 1594 pool_t * 1595 zone_pool_get(zone_t *zone) 1596 { 1597 ASSERT(pool_lock_held()); 1598 1599 return (zone->zone_pool); 1600 } 1601 1602 /* 1603 * Set the zone's pool pointer and update the zone's visibility to match 1604 * the resources in the new pool. 1605 */ 1606 void 1607 zone_pool_set(zone_t *zone, pool_t *pool) 1608 { 1609 ASSERT(pool_lock_held()); 1610 ASSERT(MUTEX_HELD(&cpu_lock)); 1611 1612 zone->zone_pool = pool; 1613 zone_pset_set(zone, pool->pool_pset->pset_id); 1614 } 1615 1616 /* 1617 * Return the cached value of the id of the processor set to which the 1618 * zone is currently bound. The value will be ZONE_PS_INVAL if the pools 1619 * facility is disabled. 1620 */ 1621 psetid_t 1622 zone_pset_get(zone_t *zone) 1623 { 1624 ASSERT(MUTEX_HELD(&cpu_lock)); 1625 1626 return (zone->zone_psetid); 1627 } 1628 1629 /* 1630 * Set the cached value of the id of the processor set to which the zone 1631 * is currently bound. Also update the zone's visibility to match the 1632 * resources in the new processor set. 1633 */ 1634 void 1635 zone_pset_set(zone_t *zone, psetid_t newpsetid) 1636 { 1637 psetid_t oldpsetid; 1638 1639 ASSERT(MUTEX_HELD(&cpu_lock)); 1640 oldpsetid = zone_pset_get(zone); 1641 1642 if (oldpsetid == newpsetid) 1643 return; 1644 /* 1645 * Global zone sees all. 1646 */ 1647 if (zone != global_zone) { 1648 zone->zone_psetid = newpsetid; 1649 if (newpsetid != ZONE_PS_INVAL) 1650 pool_pset_visibility_add(newpsetid, zone); 1651 if (oldpsetid != ZONE_PS_INVAL) 1652 pool_pset_visibility_remove(oldpsetid, zone); 1653 } 1654 /* 1655 * Disabling pools, so we should start using the global values 1656 * for ncpus and ncpus_online. 
1657 */ 1658 if (newpsetid == ZONE_PS_INVAL) { 1659 zone->zone_ncpus = 0; 1660 zone->zone_ncpus_online = 0; 1661 } 1662 } 1663 1664 /* 1665 * Walk the list of active zones and issue the provided callback for 1666 * each of them. 1667 * 1668 * Caller must not be holding any locks that may be acquired under 1669 * zonehash_lock. See comment at the beginning of the file for a list of 1670 * common locks and their interactions with zones. 1671 */ 1672 int 1673 zone_walk(int (*cb)(zone_t *, void *), void *data) 1674 { 1675 zone_t *zone; 1676 int ret = 0; 1677 zone_status_t status; 1678 1679 mutex_enter(&zonehash_lock); 1680 for (zone = list_head(&zone_active); zone != NULL; 1681 zone = list_next(&zone_active, zone)) { 1682 /* 1683 * Skip zones that shouldn't be externally visible. 1684 */ 1685 status = zone_status_get(zone); 1686 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) 1687 continue; 1688 /* 1689 * Bail immediately if any callback invocation returns a 1690 * non-zero value. 1691 */ 1692 ret = (*cb)(zone, data); 1693 if (ret != 0) 1694 break; 1695 } 1696 mutex_exit(&zonehash_lock); 1697 return (ret); 1698 } 1699 1700 static int 1701 zone_set_root(zone_t *zone, const char *upath) 1702 { 1703 vnode_t *vp; 1704 int trycount; 1705 int error = 0; 1706 char *path; 1707 struct pathname upn, pn; 1708 size_t pathlen; 1709 1710 if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0) 1711 return (error); 1712 1713 pn_alloc(&pn); 1714 1715 /* prevent infinite loop */ 1716 trycount = 10; 1717 for (;;) { 1718 if (--trycount <= 0) { 1719 error = ESTALE; 1720 goto out; 1721 } 1722 1723 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) { 1724 /* 1725 * VOP_ACCESS() may cover 'vp' with a new 1726 * filesystem, if 'vp' is an autoFS vnode. 1727 * Get the new 'vp' if so. 1728 */ 1729 if ((error = VOP_ACCESS(vp, VEXEC, 0, CRED())) == 0 && 1730 (vp->v_vfsmountedhere == NULL || 1731 (error = traverse(&vp)) == 0)) { 1732 pathlen = pn.pn_pathlen + 2; 1733 path = kmem_alloc(pathlen, KM_SLEEP); 1734 (void) strncpy(path, pn.pn_path, 1735 pn.pn_pathlen + 1); 1736 path[pathlen - 2] = '/'; 1737 path[pathlen - 1] = '\0'; 1738 pn_free(&pn); 1739 pn_free(&upn); 1740 1741 /* Success! */ 1742 break; 1743 } 1744 VN_RELE(vp); 1745 } 1746 if (error != ESTALE) 1747 goto out; 1748 } 1749 1750 ASSERT(error == 0); 1751 zone->zone_rootvp = vp; /* we hold a reference to vp */ 1752 zone->zone_rootpath = path; 1753 zone->zone_rootpathlen = pathlen; 1754 return (0); 1755 1756 out: 1757 pn_free(&pn); 1758 pn_free(&upn); 1759 return (error); 1760 } 1761 1762 #define isalnum(c) (((c) >= '0' && (c) <= '9') || \ 1763 ((c) >= 'a' && (c) <= 'z') || \ 1764 ((c) >= 'A' && (c) <= 'Z')) 1765 1766 static int 1767 zone_set_name(zone_t *zone, const char *uname) 1768 { 1769 char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP); 1770 size_t len; 1771 int i, err; 1772 1773 if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) { 1774 kmem_free(kname, ZONENAME_MAX); 1775 return (err); /* EFAULT or ENAMETOOLONG */ 1776 } 1777 1778 /* must be less than ZONENAME_MAX */ 1779 if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') { 1780 kmem_free(kname, ZONENAME_MAX); 1781 return (EINVAL); 1782 } 1783 1784 /* 1785 * Name must start with an alphanumeric and must contain only 1786 * alphanumerics, '-', '_' and '.'. 
1787 */ 1788 if (!isalnum(kname[0])) { 1789 kmem_free(kname, ZONENAME_MAX); 1790 return (EINVAL); 1791 } 1792 for (i = 1; i < len - 1; i++) { 1793 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' && 1794 kname[i] != '.') { 1795 kmem_free(kname, ZONENAME_MAX); 1796 return (EINVAL); 1797 } 1798 } 1799 1800 zone->zone_name = kname; 1801 return (0); 1802 } 1803 1804 /* 1805 * Similar to thread_create(), but makes sure the thread is in the appropriate 1806 * zone's zsched process (curproc->p_zone->zone_zsched) before returning. 1807 */ 1808 /*ARGSUSED*/ 1809 kthread_t * 1810 zthread_create( 1811 caddr_t stk, 1812 size_t stksize, 1813 void (*proc)(), 1814 void *arg, 1815 size_t len, 1816 pri_t pri) 1817 { 1818 kthread_t *t; 1819 zone_t *zone = curproc->p_zone; 1820 proc_t *pp = zone->zone_zsched; 1821 1822 zone_hold(zone); /* Reference to be dropped when thread exits */ 1823 1824 /* 1825 * No-one should be trying to create threads if the zone is shutting 1826 * down and there aren't any kernel threads around. See comment 1827 * in zthread_exit(). 1828 */ 1829 ASSERT(!(zone->zone_kthreads == NULL && 1830 zone_status_get(zone) >= ZONE_IS_EMPTY)); 1831 /* 1832 * Create a thread, but don't let it run until we've finished setting 1833 * things up. 1834 */ 1835 t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri); 1836 ASSERT(t->t_forw == NULL); 1837 mutex_enter(&zone_status_lock); 1838 if (zone->zone_kthreads == NULL) { 1839 t->t_forw = t->t_back = t; 1840 } else { 1841 kthread_t *tx = zone->zone_kthreads; 1842 1843 t->t_forw = tx; 1844 t->t_back = tx->t_back; 1845 tx->t_back->t_forw = t; 1846 tx->t_back = t; 1847 } 1848 zone->zone_kthreads = t; 1849 mutex_exit(&zone_status_lock); 1850 1851 mutex_enter(&pp->p_lock); 1852 t->t_proc_flag |= TP_ZTHREAD; 1853 project_rele(t->t_proj); 1854 t->t_proj = project_hold(pp->p_task->tk_proj); 1855 1856 /* 1857 * Setup complete, let it run. 1858 */ 1859 thread_lock(t); 1860 t->t_schedflag |= TS_ALLSTART; 1861 setrun_locked(t); 1862 thread_unlock(t); 1863 1864 mutex_exit(&pp->p_lock); 1865 1866 return (t); 1867 } 1868 1869 /* 1870 * Similar to thread_exit(). Must be called by threads created via 1871 * zthread_exit(). 1872 */ 1873 void 1874 zthread_exit(void) 1875 { 1876 kthread_t *t = curthread; 1877 proc_t *pp = curproc; 1878 zone_t *zone = pp->p_zone; 1879 1880 mutex_enter(&zone_status_lock); 1881 1882 /* 1883 * Reparent to p0 1884 */ 1885 mutex_enter(&pp->p_lock); 1886 t->t_proc_flag &= ~TP_ZTHREAD; 1887 t->t_procp = &p0; 1888 hat_thread_exit(t); 1889 mutex_exit(&pp->p_lock); 1890 1891 if (t->t_back == t) { 1892 ASSERT(t->t_forw == t); 1893 /* 1894 * If the zone is empty, once the thread count 1895 * goes to zero no further kernel threads can be 1896 * created. This is because if the creator is a process 1897 * in the zone, then it must have exited before the zone 1898 * state could be set to ZONE_IS_EMPTY. 1899 * Otherwise, if the creator is a kernel thread in the 1900 * zone, the thread count is non-zero. 1901 * 1902 * This really means that non-zone kernel threads should 1903 * not create zone kernel threads. 
1904 */ 1905 zone->zone_kthreads = NULL; 1906 if (zone_status_get(zone) == ZONE_IS_EMPTY) { 1907 zone_status_set(zone, ZONE_IS_DOWN); 1908 } 1909 } else { 1910 t->t_forw->t_back = t->t_back; 1911 t->t_back->t_forw = t->t_forw; 1912 if (zone->zone_kthreads == t) 1913 zone->zone_kthreads = t->t_forw; 1914 } 1915 mutex_exit(&zone_status_lock); 1916 zone_rele(zone); 1917 thread_exit(); 1918 /* NOTREACHED */ 1919 } 1920 1921 static void 1922 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp) 1923 { 1924 vnode_t *oldvp; 1925 1926 /* we're going to hold a reference here to the directory */ 1927 VN_HOLD(vp); 1928 1929 #ifdef C2_AUDIT 1930 if (audit_active) /* update abs cwd/root path see c2audit.c */ 1931 audit_chdirec(vp, vpp); 1932 #endif 1933 1934 mutex_enter(&pp->p_lock); 1935 oldvp = *vpp; 1936 *vpp = vp; 1937 mutex_exit(&pp->p_lock); 1938 if (oldvp != NULL) 1939 VN_RELE(oldvp); 1940 } 1941 1942 /* 1943 * Convert an rctl value represented by an nvlist_t into an rctl_val_t. 1944 */ 1945 static int 1946 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv) 1947 { 1948 nvpair_t *nvp = NULL; 1949 boolean_t priv_set = B_FALSE; 1950 boolean_t limit_set = B_FALSE; 1951 boolean_t action_set = B_FALSE; 1952 1953 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 1954 const char *name; 1955 uint64_t ui64; 1956 1957 name = nvpair_name(nvp); 1958 if (nvpair_type(nvp) != DATA_TYPE_UINT64) 1959 return (EINVAL); 1960 (void) nvpair_value_uint64(nvp, &ui64); 1961 if (strcmp(name, "privilege") == 0) { 1962 /* 1963 * Currently only privileged values are allowed, but 1964 * this may change in the future. 1965 */ 1966 if (ui64 != RCPRIV_PRIVILEGED) 1967 return (EINVAL); 1968 rv->rcv_privilege = ui64; 1969 priv_set = B_TRUE; 1970 } else if (strcmp(name, "limit") == 0) { 1971 rv->rcv_value = ui64; 1972 limit_set = B_TRUE; 1973 } else if (strcmp(name, "action") == 0) { 1974 if (ui64 != RCTL_LOCAL_NOACTION && 1975 ui64 != RCTL_LOCAL_DENY) 1976 return (EINVAL); 1977 rv->rcv_flagaction = ui64; 1978 action_set = B_TRUE; 1979 } else { 1980 return (EINVAL); 1981 } 1982 } 1983 1984 if (!(priv_set && limit_set && action_set)) 1985 return (EINVAL); 1986 rv->rcv_action_signal = 0; 1987 rv->rcv_action_recipient = NULL; 1988 rv->rcv_action_recip_pid = -1; 1989 rv->rcv_firing_time = 0; 1990 1991 return (0); 1992 } 1993 1994 void 1995 zone_icode(void) 1996 { 1997 proc_t *p = ttoproc(curthread); 1998 struct core_globals *cg; 1999 2000 /* 2001 * For all purposes (ZONE_ATTR_INITPID and restart_init), 2002 * storing just the pid of init is sufficient. 
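 *
 * (This is the value that zone_getattr(ZONE_ATTR_INITPID), later in this
 * file, copies back out to userland.)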
2003 */ 2004 p->p_zone->zone_proc_initpid = p->p_pid; 2005 2006 /* 2007 * Allocate user address space and stack segment 2008 */ 2009 2010 p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0; 2011 p->p_usrstack = (caddr_t)USRSTACK32; 2012 p->p_model = DATAMODEL_ILP32; 2013 p->p_stkprot = PROT_ZFOD & ~PROT_EXEC; 2014 p->p_datprot = PROT_ZFOD & ~PROT_EXEC; 2015 p->p_stk_ctl = INT32_MAX; 2016 2017 p->p_as = as_alloc(); 2018 p->p_as->a_userlimit = (caddr_t)USERLIMIT32; 2019 (void) hat_setup(p->p_as->a_hat, HAT_INIT); 2020 2021 cg = zone_getspecific(core_zone_key, p->p_zone); 2022 ASSERT(cg != NULL); 2023 corectl_path_hold(cg->core_default_path); 2024 corectl_content_hold(cg->core_default_content); 2025 p->p_corefile = cg->core_default_path; 2026 p->p_content = cg->core_default_content; 2027 2028 init_mstate(curthread, LMS_SYSTEM); 2029 2030 p->p_zone->zone_boot_err = exec_init(zone_initname, 0, 2031 p->p_zone->zone_bootargs); 2032 2033 mutex_enter(&zone_status_lock); 2034 if (p->p_zone->zone_boot_err != 0) { 2035 /* 2036 * Make sure we are still in the booting state-- we could have 2037 * raced and already be shutting down, or even further along. 2038 */ 2039 if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING) 2040 zone_status_set(p->p_zone, ZONE_IS_SHUTTING_DOWN); 2041 mutex_exit(&zone_status_lock); 2042 /* It's gone bad, dispose of the process */ 2043 if (proc_exit(CLD_EXITED, p->p_zone->zone_boot_err) != 0) { 2044 mutex_enter(&p->p_lock); 2045 ASSERT(p->p_flag & SEXITLWPS); 2046 lwp_exit(); 2047 } 2048 } else { 2049 if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING) 2050 zone_status_set(p->p_zone, ZONE_IS_RUNNING); 2051 mutex_exit(&zone_status_lock); 2052 /* cause the process to return to userland. */ 2053 lwp_rtt(); 2054 } 2055 } 2056 2057 struct zsched_arg { 2058 zone_t *zone; 2059 nvlist_t *nvlist; 2060 }; 2061 2062 /* 2063 * Per-zone "sched" workalike. The similarity to "sched" doesn't have 2064 * anything to do with scheduling, but rather with the fact that 2065 * per-zone kernel threads are parented to zsched, just like regular 2066 * kernel threads are parented to sched (p0). 2067 * 2068 * zsched is also responsible for launching init for the zone. 2069 */ 2070 static void 2071 zsched(void *arg) 2072 { 2073 struct zsched_arg *za = arg; 2074 proc_t *pp = curproc; 2075 proc_t *initp = proc_init; 2076 zone_t *zone = za->zone; 2077 cred_t *cr, *oldcred; 2078 rctl_set_t *set; 2079 rctl_alloc_gp_t *gp; 2080 contract_t *ct = NULL; 2081 task_t *tk, *oldtk; 2082 rctl_entity_p_t e; 2083 kproject_t *pj; 2084 2085 nvlist_t *nvl = za->nvlist; 2086 nvpair_t *nvp = NULL; 2087 2088 bcopy("zsched", u.u_psargs, sizeof ("zsched")); 2089 bcopy("zsched", u.u_comm, sizeof ("zsched")); 2090 u.u_argc = 0; 2091 u.u_argv = NULL; 2092 u.u_envp = NULL; 2093 closeall(P_FINFO(pp)); 2094 2095 /* 2096 * We are this zone's "zsched" process. As the zone isn't generally 2097 * visible yet we don't need to grab any locks before initializing its 2098 * zone_proc pointer. 2099 */ 2100 zone_hold(zone); /* this hold is released by zone_destroy() */ 2101 zone->zone_zsched = pp; 2102 mutex_enter(&pp->p_lock); 2103 pp->p_zone = zone; 2104 mutex_exit(&pp->p_lock); 2105 2106 /* 2107 * Disassociate process from its 'parent'; parent ourselves to init 2108 * (pid 1) and change other values as needed. 
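 *
 * (The "init" referred to here is the global zone's init, proc_init; the
 * zone's own init does not exist yet and will itself be created by this
 * process via newproc() further below.)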
2109 */ 2110 sess_create(); 2111 2112 mutex_enter(&pidlock); 2113 proc_detach(pp); 2114 pp->p_ppid = 1; 2115 pp->p_flag |= SZONETOP; 2116 pp->p_ancpid = 1; 2117 pp->p_parent = initp; 2118 pp->p_psibling = NULL; 2119 if (initp->p_child) 2120 initp->p_child->p_psibling = pp; 2121 pp->p_sibling = initp->p_child; 2122 initp->p_child = pp; 2123 2124 /* Decrement what newproc() incremented. */ 2125 upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID); 2126 /* 2127 * Our credentials are about to become kcred-like, so we don't care 2128 * about the caller's ruid. 2129 */ 2130 upcount_inc(crgetruid(kcred), zone->zone_id); 2131 mutex_exit(&pidlock); 2132 2133 /* 2134 * getting out of global zone, so decrement lwp counts 2135 */ 2136 pj = pp->p_task->tk_proj; 2137 mutex_enter(&global_zone->zone_nlwps_lock); 2138 pj->kpj_nlwps -= pp->p_lwpcnt; 2139 global_zone->zone_nlwps -= pp->p_lwpcnt; 2140 mutex_exit(&global_zone->zone_nlwps_lock); 2141 2142 /* 2143 * Create and join a new task in project '0' of this zone. 2144 * 2145 * We don't need to call holdlwps() since we know we're the only lwp in 2146 * this process. 2147 * 2148 * task_join() returns with p_lock held. 2149 */ 2150 tk = task_create(0, zone); 2151 mutex_enter(&cpu_lock); 2152 oldtk = task_join(tk, 0); 2153 mutex_exit(&curproc->p_lock); 2154 mutex_exit(&cpu_lock); 2155 task_rele(oldtk); 2156 2157 /* 2158 * add lwp counts to zsched's zone, and increment project's task count 2159 * due to the task created in the above tasksys_settaskid 2160 */ 2161 pj = pp->p_task->tk_proj; 2162 mutex_enter(&zone->zone_nlwps_lock); 2163 pj->kpj_nlwps += pp->p_lwpcnt; 2164 pj->kpj_ntasks += 1; 2165 zone->zone_nlwps += pp->p_lwpcnt; 2166 mutex_exit(&zone->zone_nlwps_lock); 2167 2168 /* 2169 * The process was created by a process in the global zone, hence the 2170 * credentials are wrong. We might as well have kcred-ish credentials. 2171 */ 2172 cr = zone->zone_kcred; 2173 crhold(cr); 2174 mutex_enter(&pp->p_crlock); 2175 oldcred = pp->p_cred; 2176 pp->p_cred = cr; 2177 mutex_exit(&pp->p_crlock); 2178 crfree(oldcred); 2179 2180 /* 2181 * Hold credentials again (for thread) 2182 */ 2183 crhold(cr); 2184 2185 /* 2186 * p_lwpcnt can't change since this is a kernel process. 2187 */ 2188 crset(pp, cr); 2189 2190 /* 2191 * Chroot 2192 */ 2193 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp); 2194 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp); 2195 2196 /* 2197 * Initialize zone's rctl set. 2198 */ 2199 set = rctl_set_create(); 2200 gp = rctl_set_init_prealloc(RCENTITY_ZONE); 2201 mutex_enter(&pp->p_lock); 2202 e.rcep_p.zone = zone; 2203 e.rcep_t = RCENTITY_ZONE; 2204 zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp); 2205 mutex_exit(&pp->p_lock); 2206 rctl_prealloc_destroy(gp); 2207 2208 /* 2209 * Apply the rctls passed in to zone_create(). This is basically a list 2210 * assignment: all of the old values are removed and the new ones 2211 * inserted. That is, if an empty list is passed in, all values are 2212 * removed. 
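 *
 * For illustration (the rctl name and values are made up): if the zone
 * rctl "zone.cpu-shares" currently holds the privileged value 10 and the
 * incoming list carries 20 and 30, the delete loop below strips 10
 * (stopping at the read-only system value) and the insert loop then adds
 * 20 and 30.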
2213 */ 2214 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 2215 rctl_dict_entry_t *rde; 2216 rctl_hndl_t hndl; 2217 char *name; 2218 nvlist_t **nvlarray; 2219 uint_t i, nelem; 2220 int error; /* For ASSERT()s */ 2221 2222 name = nvpair_name(nvp); 2223 hndl = rctl_hndl_lookup(name); 2224 ASSERT(hndl != -1); 2225 rde = rctl_dict_lookup_hndl(hndl); 2226 ASSERT(rde != NULL); 2227 2228 for (; /* ever */; ) { 2229 rctl_val_t oval; 2230 2231 mutex_enter(&pp->p_lock); 2232 error = rctl_local_get(hndl, NULL, &oval, pp); 2233 mutex_exit(&pp->p_lock); 2234 ASSERT(error == 0); /* Can't fail for RCTL_FIRST */ 2235 ASSERT(oval.rcv_privilege != RCPRIV_BASIC); 2236 if (oval.rcv_privilege == RCPRIV_SYSTEM) 2237 break; 2238 mutex_enter(&pp->p_lock); 2239 error = rctl_local_delete(hndl, &oval, pp); 2240 mutex_exit(&pp->p_lock); 2241 ASSERT(error == 0); 2242 } 2243 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 2244 ASSERT(error == 0); 2245 for (i = 0; i < nelem; i++) { 2246 rctl_val_t *nvalp; 2247 2248 nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); 2249 error = nvlist2rctlval(nvlarray[i], nvalp); 2250 ASSERT(error == 0); 2251 /* 2252 * rctl_local_insert can fail if the value being 2253 * inserted is a duplicate; this is OK. 2254 */ 2255 mutex_enter(&pp->p_lock); 2256 if (rctl_local_insert(hndl, nvalp, pp) != 0) 2257 kmem_cache_free(rctl_val_cache, nvalp); 2258 mutex_exit(&pp->p_lock); 2259 } 2260 } 2261 /* 2262 * Tell the world that we're done setting up. 2263 * 2264 * At this point we want to set the zone status to ZONE_IS_READY 2265 * and atomically set the zone's processor set visibility. Once 2266 * we drop pool_lock() this zone will automatically get updated 2267 * to reflect any future changes to the pools configuration. 2268 */ 2269 pool_lock(); 2270 mutex_enter(&cpu_lock); 2271 mutex_enter(&zonehash_lock); 2272 zone_uniqid(zone); 2273 zone_zsd_configure(zone); 2274 if (pool_state == POOL_ENABLED) 2275 zone_pset_set(zone, pool_default->pool_pset->pset_id); 2276 mutex_enter(&zone_status_lock); 2277 ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED); 2278 zone_status_set(zone, ZONE_IS_READY); 2279 mutex_exit(&zone_status_lock); 2280 mutex_exit(&zonehash_lock); 2281 mutex_exit(&cpu_lock); 2282 pool_unlock(); 2283 2284 /* 2285 * Once we see the zone transition to the ZONE_IS_BOOTING state, 2286 * we launch init, and set the state to running. 2287 */ 2288 zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched"); 2289 2290 if (zone_status_get(zone) == ZONE_IS_BOOTING) { 2291 id_t cid; 2292 2293 /* 2294 * Ok, this is a little complicated. We need to grab the 2295 * zone's pool's scheduling class ID; note that by now, we 2296 * are already bound to a pool if we need to be (zoneadmd 2297 * will have done that to us while we're in the READY 2298 * state). *But* the scheduling class for the zone's 'init' 2299 * must be explicitly passed to newproc, which doesn't 2300 * respect pool bindings. 2301 * 2302 * We hold the pool_lock across the call to newproc() to 2303 * close the obvious race: the pool's scheduling class 2304 * could change before we manage to create the LWP with 2305 * classid 'cid'. 2306 */ 2307 pool_lock(); 2308 cid = pool_get_class(zone->zone_pool); 2309 if (cid == -1) 2310 cid = defaultcid; 2311 2312 /* 2313 * If this fails, zone_boot will ultimately fail. The 2314 * state of the zone will be set to SHUTTING_DOWN-- userland 2315 * will have to tear down the zone, and fail, or try again. 
2316 */ 2317 if ((zone->zone_boot_err = newproc(zone_icode, NULL, cid, 2318 minclsyspri - 1, &ct)) != 0) { 2319 mutex_enter(&zone_status_lock); 2320 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); 2321 mutex_exit(&zone_status_lock); 2322 } 2323 pool_unlock(); 2324 } 2325 2326 /* 2327 * Wait for zone_destroy() to be called. This is what we spend 2328 * most of our life doing. 2329 */ 2330 zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched"); 2331 2332 if (ct) 2333 /* 2334 * At this point the process contract should be empty. 2335 * (Though if it isn't, it's not the end of the world.) 2336 */ 2337 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0); 2338 2339 /* 2340 * Allow kcred to be freed when all referring processes 2341 * (including this one) go away. We can't just do this in 2342 * zone_free because we need to wait for the zone_cred_ref to 2343 * drop to 0 before calling zone_free, and the existence of 2344 * zone_kcred will prevent that. Thus, we call crfree here to 2345 * balance the crdup in zone_create. The crhold calls earlier 2346 * in zsched will be dropped when the thread and process exit. 2347 */ 2348 crfree(zone->zone_kcred); 2349 zone->zone_kcred = NULL; 2350 2351 exit(CLD_EXITED, 0); 2352 } 2353 2354 /* 2355 * Helper function to determine if there are any submounts of the 2356 * provided path. Used to make sure the zone doesn't "inherit" any 2357 * mounts from before it is created. 2358 */ 2359 static uint_t 2360 zone_mount_count(const char *rootpath) 2361 { 2362 vfs_t *vfsp; 2363 uint_t count = 0; 2364 size_t rootpathlen = strlen(rootpath); 2365 2366 /* 2367 * Holding zonehash_lock prevents race conditions with 2368 * vfs_list_add()/vfs_list_remove() since we serialize with 2369 * zone_find_by_path(). 2370 */ 2371 ASSERT(MUTEX_HELD(&zonehash_lock)); 2372 /* 2373 * The rootpath must end with a '/' 2374 */ 2375 ASSERT(rootpath[rootpathlen - 1] == '/'); 2376 2377 /* 2378 * This intentionally does not count the rootpath itself if that 2379 * happens to be a mount point. 2380 */ 2381 vfs_list_read_lock(); 2382 vfsp = rootvfs; 2383 do { 2384 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt), 2385 rootpathlen) == 0) 2386 count++; 2387 vfsp = vfsp->vfs_next; 2388 } while (vfsp != rootvfs); 2389 vfs_list_unlock(); 2390 return (count); 2391 } 2392 2393 /* 2394 * Helper function to make sure that a zone created on 'rootpath' 2395 * wouldn't end up containing other zones' rootpaths. 2396 */ 2397 static boolean_t 2398 zone_is_nested(const char *rootpath) 2399 { 2400 zone_t *zone; 2401 size_t rootpathlen = strlen(rootpath); 2402 size_t len; 2403 2404 ASSERT(MUTEX_HELD(&zonehash_lock)); 2405 2406 for (zone = list_head(&zone_active); zone != NULL; 2407 zone = list_next(&zone_active, zone)) { 2408 if (zone == global_zone) 2409 continue; 2410 len = strlen(zone->zone_rootpath); 2411 if (strncmp(rootpath, zone->zone_rootpath, 2412 MIN(rootpathlen, len)) == 0) 2413 return (B_TRUE); 2414 } 2415 return (B_FALSE); 2416 } 2417 2418 static int 2419 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs) 2420 { 2421 priv_set_t *privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP); 2422 2423 if (copyin(zone_privs, privs, sizeof (priv_set_t))) { 2424 kmem_free(privs, sizeof (priv_set_t)); 2425 return (EFAULT); 2426 } 2427 2428 zone->zone_privset = privs; 2429 return (0); 2430 } 2431 2432 /* 2433 * We make creative use of nvlists to pass in rctls from userland. 
The list is 2434 * a list of the following structures: 2435 * 2436 * (name = rctl_name, value = nvpair_list_array) 2437 * 2438 * Where each element of the nvpair_list_array is of the form: 2439 * 2440 * [(name = "privilege", value = RCPRIV_PRIVILEGED), 2441 * (name = "limit", value = uint64_t), 2442 * (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))] 2443 */ 2444 static int 2445 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) 2446 { 2447 nvpair_t *nvp = NULL; 2448 nvlist_t *nvl = NULL; 2449 char *kbuf; 2450 int error; 2451 rctl_val_t rv; 2452 2453 *nvlp = NULL; 2454 2455 if (buflen == 0) 2456 return (0); 2457 2458 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 2459 return (ENOMEM); 2460 if (copyin(ubuf, kbuf, buflen)) { 2461 error = EFAULT; 2462 goto out; 2463 } 2464 if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) { 2465 /* 2466 * nvl may have been allocated/free'd, but the value set to 2467 * non-NULL, so we reset it here. 2468 */ 2469 nvl = NULL; 2470 error = EINVAL; 2471 goto out; 2472 } 2473 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 2474 rctl_dict_entry_t *rde; 2475 rctl_hndl_t hndl; 2476 nvlist_t **nvlarray; 2477 uint_t i, nelem; 2478 char *name; 2479 2480 error = EINVAL; 2481 name = nvpair_name(nvp); 2482 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) 2483 != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { 2484 goto out; 2485 } 2486 if ((hndl = rctl_hndl_lookup(name)) == -1) { 2487 goto out; 2488 } 2489 rde = rctl_dict_lookup_hndl(hndl); 2490 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 2491 ASSERT(error == 0); 2492 for (i = 0; i < nelem; i++) { 2493 if (error = nvlist2rctlval(nvlarray[i], &rv)) 2494 goto out; 2495 } 2496 if (rctl_invalid_value(rde, &rv)) { 2497 error = EINVAL; 2498 goto out; 2499 } 2500 } 2501 error = 0; 2502 *nvlp = nvl; 2503 out: 2504 kmem_free(kbuf, buflen); 2505 if (error && nvl != NULL) 2506 nvlist_free(nvl); 2507 return (error); 2508 } 2509 2510 int 2511 zone_create_error(int er_error, int er_ext, int *er_out) { 2512 if (er_out != NULL) { 2513 if (copyout(&er_ext, er_out, sizeof (int))) { 2514 return (set_errno(EFAULT)); 2515 } 2516 } 2517 return (set_errno(er_error)); 2518 } 2519 2520 /* 2521 * Parses a comma-separated list of ZFS datasets into a per-zone dictionary. 2522 */ 2523 static int 2524 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen) 2525 { 2526 char *kbuf; 2527 char *dataset, *next; 2528 zone_dataset_t *zd; 2529 size_t len; 2530 2531 if (ubuf == NULL || buflen == 0) 2532 return (0); 2533 2534 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 2535 return (ENOMEM); 2536 2537 if (copyin(ubuf, kbuf, buflen) != 0) { 2538 kmem_free(kbuf, buflen); 2539 return (EFAULT); 2540 } 2541 2542 dataset = next = kbuf; 2543 for (;;) { 2544 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP); 2545 2546 next = strchr(dataset, ','); 2547 2548 if (next == NULL) 2549 len = strlen(dataset); 2550 else 2551 len = next - dataset; 2552 2553 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP); 2554 bcopy(dataset, zd->zd_dataset, len); 2555 zd->zd_dataset[len] = '\0'; 2556 2557 list_insert_head(&zone->zone_datasets, zd); 2558 2559 if (next == NULL) 2560 break; 2561 2562 dataset = next + 1; 2563 } 2564 2565 kmem_free(kbuf, buflen); 2566 return (0); 2567 } 2568 2569 /* 2570 * System call to create/initialize a new zone named 'zone_name', rooted 2571 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs', 2572 * and initialized with the zone-wide rctls described in 'rctlbuf'. 
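 *
 * For illustration, a caller (zoneadmd, in practice) might build 'rctlbuf'
 * with libnvpair roughly as follows; this is only a sketch, with error
 * handling omitted and the rctl name and limit chosen arbitrarily:
 *
 *	nvlist_t *rctls, *val;
 *	char *buf = NULL;
 *	size_t buflen = 0;
 *
 *	(void) nvlist_alloc(&val, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_uint64(val, "privilege", RCPRIV_PRIVILEGED);
 *	(void) nvlist_add_uint64(val, "limit", 10);
 *	(void) nvlist_add_uint64(val, "action", RCTL_LOCAL_NOACTION);
 *	(void) nvlist_alloc(&rctls, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_nvlist_array(rctls, "zone.cpu-shares", &val, 1);
 *	(void) nvlist_pack(rctls, &buf, &buflen, NV_ENCODE_NATIVE, 0);
 *
 * 'buf' and 'buflen' then become 'rctlbuf' and 'rctlbufsz', and
 * parse_rctls() above unpacks and validates exactly this layout.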
2573 * 2574 * If extended error is non-null, we may use it to return more detailed 2575 * error information. 2576 */ 2577 static zoneid_t 2578 zone_create(const char *zone_name, const char *zone_root, 2579 const priv_set_t *zone_privs, caddr_t rctlbuf, size_t rctlbufsz, 2580 caddr_t zfsbuf, size_t zfsbufsz, int *extended_error) 2581 { 2582 struct zsched_arg zarg; 2583 nvlist_t *rctls = NULL; 2584 proc_t *pp = curproc; 2585 zone_t *zone, *ztmp; 2586 zoneid_t zoneid; 2587 int error; 2588 int error2 = 0; 2589 char *str; 2590 cred_t *zkcr; 2591 2592 if (secpolicy_zone_config(CRED()) != 0) 2593 return (set_errno(EPERM)); 2594 2595 /* can't boot zone from within chroot environment */ 2596 if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir) 2597 return (zone_create_error(ENOTSUP, ZE_CHROOTED, 2598 extended_error)); 2599 2600 zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); 2601 zoneid = zone->zone_id = id_alloc(zoneid_space); 2602 zone->zone_status = ZONE_IS_UNINITIALIZED; 2603 zone->zone_pool = pool_default; 2604 zone->zone_pool_mod = gethrtime(); 2605 zone->zone_psetid = ZONE_PS_INVAL; 2606 zone->zone_ncpus = 0; 2607 zone->zone_ncpus_online = 0; 2608 mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); 2609 mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); 2610 cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); 2611 list_create(&zone->zone_zsd, sizeof (struct zsd_entry), 2612 offsetof(struct zsd_entry, zsd_linkage)); 2613 list_create(&zone->zone_datasets, sizeof (zone_dataset_t), 2614 offsetof(zone_dataset_t, zd_linkage)); 2615 2616 if ((error = zone_set_name(zone, zone_name)) != 0) { 2617 zone_free(zone); 2618 return (zone_create_error(error, 0, extended_error)); 2619 } 2620 2621 if ((error = zone_set_root(zone, zone_root)) != 0) { 2622 zone_free(zone); 2623 return (zone_create_error(error, 0, extended_error)); 2624 } 2625 if ((error = zone_set_privset(zone, zone_privs)) != 0) { 2626 zone_free(zone); 2627 return (zone_create_error(error, 0, extended_error)); 2628 } 2629 2630 /* initialize node name to be the same as zone name */ 2631 zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP); 2632 (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN); 2633 zone->zone_nodename[_SYS_NMLN - 1] = '\0'; 2634 2635 zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP); 2636 zone->zone_domain[0] = '\0'; 2637 zone->zone_shares = 1; 2638 zone->zone_bootargs = NULL; 2639 2640 /* 2641 * Zsched initializes the rctls. 2642 */ 2643 zone->zone_rctls = NULL; 2644 2645 if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { 2646 zone_free(zone); 2647 return (zone_create_error(error, 0, extended_error)); 2648 } 2649 2650 if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) { 2651 zone_free(zone); 2652 return (set_errno(error)); 2653 } 2654 2655 /* 2656 * Stop all lwps since that's what normally happens as part of fork(). 2657 * This needs to happen before we grab any locks to avoid deadlock 2658 * (another lwp in the process could be waiting for the held lock). 
2659 */ 2660 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) { 2661 zone_free(zone); 2662 if (rctls) 2663 nvlist_free(rctls); 2664 return (zone_create_error(error, 0, extended_error)); 2665 } 2666 2667 if (block_mounts() == 0) { 2668 mutex_enter(&pp->p_lock); 2669 if (curthread != pp->p_agenttp) 2670 continuelwps(pp); 2671 mutex_exit(&pp->p_lock); 2672 zone_free(zone); 2673 if (rctls) 2674 nvlist_free(rctls); 2675 return (zone_create_error(error, 0, extended_error)); 2676 } 2677 2678 /* 2679 * Set up credential for kernel access. After this, any errors 2680 * should go through the dance in errout rather than calling 2681 * zone_free directly. 2682 */ 2683 zone->zone_kcred = crdup(kcred); 2684 crsetzone(zone->zone_kcred, zone); 2685 priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred)); 2686 priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred)); 2687 priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred)); 2688 priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred)); 2689 2690 mutex_enter(&zonehash_lock); 2691 /* 2692 * Make sure zone doesn't already exist. 2693 */ 2694 if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL) { 2695 zone_status_t status; 2696 2697 status = zone_status_get(ztmp); 2698 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING) 2699 error = EEXIST; 2700 else 2701 error = EBUSY; 2702 goto errout; 2703 } 2704 2705 /* 2706 * Don't allow zone creations which would cause one zone's rootpath to 2707 * be accessible from that of another (non-global) zone. 2708 */ 2709 if (zone_is_nested(zone->zone_rootpath)) { 2710 error = EBUSY; 2711 goto errout; 2712 } 2713 2714 ASSERT(zonecount != 0); /* check for leaks */ 2715 if (zonecount + 1 > maxzones) { 2716 error = ENOMEM; 2717 goto errout; 2718 } 2719 2720 if (zone_mount_count(zone->zone_rootpath) != 0) { 2721 error = EBUSY; 2722 error2 = ZE_AREMOUNTS; 2723 goto errout; 2724 } 2725 2726 /* 2727 * Zone is still incomplete, but we need to drop all locks while 2728 * zsched() initializes this zone's kernel process. We 2729 * optimistically add the zone to the hashtable and associated 2730 * lists so a parallel zone_create() doesn't try to create the 2731 * same zone. 2732 */ 2733 zonecount++; 2734 (void) mod_hash_insert(zonehashbyid, 2735 (mod_hash_key_t)(uintptr_t)zone->zone_id, 2736 (mod_hash_val_t)(uintptr_t)zone); 2737 str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP); 2738 (void) strcpy(str, zone->zone_name); 2739 (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str, 2740 (mod_hash_val_t)(uintptr_t)zone); 2741 /* 2742 * Insert into active list. At this point there are no 'hold's 2743 * on the zone, but everyone else knows not to use it, so we can 2744 * continue to use it. zsched() will do a zone_hold() if the 2745 * newproc() is successful. 2746 */ 2747 list_insert_tail(&zone_active, zone); 2748 mutex_exit(&zonehash_lock); 2749 2750 zarg.zone = zone; 2751 zarg.nvlist = rctls; 2752 /* 2753 * The process, task, and project rctls are probably wrong; 2754 * we need an interface to get the default values of all rctls, 2755 * and initialize zsched appropriately. I'm not sure that that 2756 * makes much of a difference, though. 2757 */ 2758 if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) { 2759 /* 2760 * We need to undo all globally visible state. 
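 *
 * (That is: take the zone back off the active list and out of both hash
 * tables, decrement zonecount, and fall through to the common errout
 * path.)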
2761 */ 2762 mutex_enter(&zonehash_lock); 2763 list_remove(&zone_active, zone); 2764 (void) mod_hash_destroy(zonehashbyname, 2765 (mod_hash_key_t)(uintptr_t)zone->zone_name); 2766 (void) mod_hash_destroy(zonehashbyid, 2767 (mod_hash_key_t)(uintptr_t)zone->zone_id); 2768 ASSERT(zonecount > 1); 2769 zonecount--; 2770 goto errout; 2771 } 2772 2773 /* 2774 * Zone creation can't fail from now on. 2775 */ 2776 2777 /* 2778 * Let the other lwps continue. 2779 */ 2780 mutex_enter(&pp->p_lock); 2781 if (curthread != pp->p_agenttp) 2782 continuelwps(pp); 2783 mutex_exit(&pp->p_lock); 2784 2785 /* 2786 * Wait for zsched to finish initializing the zone. 2787 */ 2788 zone_status_wait(zone, ZONE_IS_READY); 2789 /* 2790 * The zone is fully visible, so we can let mounts progress. 2791 */ 2792 resume_mounts(); 2793 if (rctls) 2794 nvlist_free(rctls); 2795 2796 return (zoneid); 2797 2798 errout: 2799 mutex_exit(&zonehash_lock); 2800 /* 2801 * Let the other lwps continue. 2802 */ 2803 mutex_enter(&pp->p_lock); 2804 if (curthread != pp->p_agenttp) 2805 continuelwps(pp); 2806 mutex_exit(&pp->p_lock); 2807 2808 resume_mounts(); 2809 if (rctls) 2810 nvlist_free(rctls); 2811 /* 2812 * There is currently one reference to the zone, a cred_ref from 2813 * zone_kcred. To free the zone, we call crfree, which will call 2814 * zone_cred_rele, which will call zone_free. 2815 */ 2816 ASSERT(zone->zone_cred_ref == 1); /* for zone_kcred */ 2817 ASSERT(zone->zone_kcred->cr_ref == 1); 2818 ASSERT(zone->zone_ref == 0); 2819 zkcr = zone->zone_kcred; 2820 zone->zone_kcred = NULL; 2821 crfree(zkcr); /* triggers call to zone_free */ 2822 return (zone_create_error(error, error2, extended_error)); 2823 } 2824 2825 /* 2826 * Cause the zone to boot. This is pretty simple, since we let zoneadmd do 2827 * the heavy lifting. 2828 */ 2829 static int 2830 zone_boot(zoneid_t zoneid, const char *bootargs) 2831 { 2832 int err; 2833 zone_t *zone; 2834 2835 if (secpolicy_zone_config(CRED()) != 0) 2836 return (set_errno(EPERM)); 2837 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 2838 return (set_errno(EINVAL)); 2839 2840 mutex_enter(&zonehash_lock); 2841 /* 2842 * Look for zone under hash lock to prevent races with calls to 2843 * zone_shutdown, zone_destroy, etc. 2844 */ 2845 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 2846 mutex_exit(&zonehash_lock); 2847 return (set_errno(EINVAL)); 2848 } 2849 2850 if ((err = zone_set_bootargs(zone, bootargs)) != 0) { 2851 mutex_exit(&zonehash_lock); 2852 return (set_errno(err)); 2853 } 2854 2855 mutex_enter(&zone_status_lock); 2856 if (zone_status_get(zone) != ZONE_IS_READY) { 2857 mutex_exit(&zone_status_lock); 2858 mutex_exit(&zonehash_lock); 2859 return (set_errno(EINVAL)); 2860 } 2861 zone_status_set(zone, ZONE_IS_BOOTING); 2862 mutex_exit(&zone_status_lock); 2863 2864 zone_hold(zone); /* so we can use the zone_t later */ 2865 mutex_exit(&zonehash_lock); 2866 2867 if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) { 2868 zone_rele(zone); 2869 return (set_errno(EINTR)); 2870 } 2871 2872 /* 2873 * Boot (starting init) might have failed, in which case the zone 2874 * will go to the SHUTTING_DOWN state; an appropriate errno will 2875 * be placed in zone->zone_boot_err, and so we return that. 2876 */ 2877 err = zone->zone_boot_err; 2878 zone_rele(zone); 2879 return (err ? set_errno(err) : 0); 2880 } 2881 2882 /* 2883 * Kills all user processes in the zone, waiting for them all to exit 2884 * before returning. 
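 *
 * (The loop below re-issues killall() roughly once per second until the
 * zone reaches ZONE_IS_EMPTY or the caller is signaled, presumably so
 * that processes created after an earlier pass are caught as well.)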
2885 */ 2886 static int 2887 zone_empty(zone_t *zone) 2888 { 2889 int waitstatus; 2890 2891 /* 2892 * We need to drop zonehash_lock before killing all 2893 * processes, otherwise we'll deadlock with zone_find_* 2894 * which can be called from the exit path. 2895 */ 2896 ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); 2897 while ((waitstatus = zone_status_timedwait_sig(zone, lbolt + hz, 2898 ZONE_IS_EMPTY)) == -1) { 2899 killall(zone->zone_id); 2900 } 2901 /* 2902 * return EINTR if we were signaled 2903 */ 2904 if (waitstatus == 0) 2905 return (EINTR); 2906 return (0); 2907 } 2908 2909 /* 2910 * Systemcall to start the zone's halt sequence. By the time this 2911 * function successfully returns, all user processes and kernel threads 2912 * executing in it will have exited, ZSD shutdown callbacks executed, 2913 * and the zone status set to ZONE_IS_DOWN. 2914 * 2915 * It is possible that the call will interrupt itself if the caller is the 2916 * parent of any process running in the zone, and doesn't have SIGCHLD blocked. 2917 */ 2918 static int 2919 zone_shutdown(zoneid_t zoneid) 2920 { 2921 int error; 2922 zone_t *zone; 2923 zone_status_t status; 2924 2925 if (secpolicy_zone_config(CRED()) != 0) 2926 return (set_errno(EPERM)); 2927 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 2928 return (set_errno(EINVAL)); 2929 2930 /* 2931 * Block mounts so that VFS_MOUNT() can get an accurate view of 2932 * the zone's status with regards to ZONE_IS_SHUTTING down. 2933 * 2934 * e.g. NFS can fail the mount if it determines that the zone 2935 * has already begun the shutdown sequence. 2936 */ 2937 if (block_mounts() == 0) 2938 return (set_errno(EINTR)); 2939 mutex_enter(&zonehash_lock); 2940 /* 2941 * Look for zone under hash lock to prevent races with other 2942 * calls to zone_shutdown and zone_destroy. 2943 */ 2944 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 2945 mutex_exit(&zonehash_lock); 2946 resume_mounts(); 2947 return (set_errno(EINVAL)); 2948 } 2949 mutex_enter(&zone_status_lock); 2950 status = zone_status_get(zone); 2951 /* 2952 * Fail if the zone isn't fully initialized yet. 2953 */ 2954 if (status < ZONE_IS_READY) { 2955 mutex_exit(&zone_status_lock); 2956 mutex_exit(&zonehash_lock); 2957 resume_mounts(); 2958 return (set_errno(EINVAL)); 2959 } 2960 /* 2961 * If conditions required for zone_shutdown() to return have been met, 2962 * return success. 2963 */ 2964 if (status >= ZONE_IS_DOWN) { 2965 mutex_exit(&zone_status_lock); 2966 mutex_exit(&zonehash_lock); 2967 resume_mounts(); 2968 return (0); 2969 } 2970 /* 2971 * If zone_shutdown() hasn't been called before, go through the motions. 2972 * If it has, there's nothing to do but wait for the kernel threads to 2973 * drain. 2974 */ 2975 if (status < ZONE_IS_EMPTY) { 2976 uint_t ntasks; 2977 2978 mutex_enter(&zone->zone_lock); 2979 if ((ntasks = zone->zone_ntasks) != 1) { 2980 /* 2981 * There's still stuff running. 2982 */ 2983 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); 2984 } 2985 mutex_exit(&zone->zone_lock); 2986 if (ntasks == 1) { 2987 /* 2988 * The only way to create another task is through 2989 * zone_enter(), which will block until we drop 2990 * zonehash_lock. The zone is empty. 
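 *
 * (The one surviving task is the task zsched() created for itself via
 * task_create(), so a count of exactly one means it is the only task
 * left in the zone.)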
2991 */ 2992 if (zone->zone_kthreads == NULL) { 2993 /* 2994 * Skip ahead to ZONE_IS_DOWN 2995 */ 2996 zone_status_set(zone, ZONE_IS_DOWN); 2997 } else { 2998 zone_status_set(zone, ZONE_IS_EMPTY); 2999 } 3000 } 3001 } 3002 zone_hold(zone); /* so we can use the zone_t later */ 3003 mutex_exit(&zone_status_lock); 3004 mutex_exit(&zonehash_lock); 3005 resume_mounts(); 3006 3007 if (error = zone_empty(zone)) { 3008 zone_rele(zone); 3009 return (set_errno(error)); 3010 } 3011 /* 3012 * After the zone status goes to ZONE_IS_DOWN this zone will no 3013 * longer be notified of changes to the pools configuration, so 3014 * in order to not end up with a stale pool pointer, we point 3015 * ourselves at the default pool and remove all resource 3016 * visibility. This is especially important as the zone_t may 3017 * languish on the deathrow for a very long time waiting for 3018 * cred's to drain out. 3019 * 3020 * This rebinding of the zone can happen multiple times 3021 * (presumably due to interrupted or parallel systemcalls) 3022 * without any adverse effects. 3023 */ 3024 if (pool_lock_intr() != 0) { 3025 zone_rele(zone); 3026 return (set_errno(EINTR)); 3027 } 3028 if (pool_state == POOL_ENABLED) { 3029 mutex_enter(&cpu_lock); 3030 zone_pool_set(zone, pool_default); 3031 /* 3032 * The zone no longer needs to be able to see any cpus. 3033 */ 3034 zone_pset_set(zone, ZONE_PS_INVAL); 3035 mutex_exit(&cpu_lock); 3036 } 3037 pool_unlock(); 3038 3039 /* 3040 * ZSD shutdown callbacks can be executed multiple times, hence 3041 * it is safe to not be holding any locks across this call. 3042 */ 3043 zone_zsd_callbacks(zone, ZSD_SHUTDOWN); 3044 3045 mutex_enter(&zone_status_lock); 3046 if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN) 3047 zone_status_set(zone, ZONE_IS_DOWN); 3048 mutex_exit(&zone_status_lock); 3049 3050 /* 3051 * Wait for kernel threads to drain. 3052 */ 3053 if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) { 3054 zone_rele(zone); 3055 return (set_errno(EINTR)); 3056 } 3057 zone_rele(zone); 3058 return (0); 3059 } 3060 3061 /* 3062 * Systemcall entry point to finalize the zone halt process. The caller 3063 * must have already successfully called zone_shutdown(). 3064 * 3065 * Upon successful completion, the zone will have been fully destroyed: 3066 * zsched will have exited, destructor callbacks executed, and the zone 3067 * removed from the list of active zones. 3068 */ 3069 static int 3070 zone_destroy(zoneid_t zoneid) 3071 { 3072 uint64_t uniqid; 3073 zone_t *zone; 3074 zone_status_t status; 3075 3076 if (secpolicy_zone_config(CRED()) != 0) 3077 return (set_errno(EPERM)); 3078 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 3079 return (set_errno(EINVAL)); 3080 3081 mutex_enter(&zonehash_lock); 3082 /* 3083 * Look for zone under hash lock to prevent races with other 3084 * calls to zone_destroy.
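 *
 * (The uniqid captured below serves the same purpose once zonehash_lock
 * has been dropped: it lets us tell this zone_t apart from a later zone
 * that happens to be created with the same zoneid.)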
3085 */ 3086 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 3087 mutex_exit(&zonehash_lock); 3088 return (set_errno(EINVAL)); 3089 } 3090 3091 if (zone_mount_count(zone->zone_rootpath) != 0) { 3092 mutex_exit(&zonehash_lock); 3093 return (set_errno(EBUSY)); 3094 } 3095 mutex_enter(&zone_status_lock); 3096 status = zone_status_get(zone); 3097 if (status < ZONE_IS_DOWN) { 3098 mutex_exit(&zone_status_lock); 3099 mutex_exit(&zonehash_lock); 3100 return (set_errno(EBUSY)); 3101 } else if (status == ZONE_IS_DOWN) { 3102 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */ 3103 } 3104 mutex_exit(&zone_status_lock); 3105 zone_hold(zone); 3106 mutex_exit(&zonehash_lock); 3107 3108 /* 3109 * wait for zsched to exit 3110 */ 3111 zone_status_wait(zone, ZONE_IS_DEAD); 3112 zone_zsd_callbacks(zone, ZSD_DESTROY); 3113 uniqid = zone->zone_uniqid; 3114 zone_rele(zone); 3115 zone = NULL; /* potentially free'd */ 3116 3117 mutex_enter(&zonehash_lock); 3118 for (; /* ever */; ) { 3119 boolean_t unref; 3120 3121 if ((zone = zone_find_all_by_id(zoneid)) == NULL || 3122 zone->zone_uniqid != uniqid) { 3123 /* 3124 * The zone has gone away. Necessary conditions 3125 * are met, so we return success. 3126 */ 3127 mutex_exit(&zonehash_lock); 3128 return (0); 3129 } 3130 mutex_enter(&zone->zone_lock); 3131 unref = ZONE_IS_UNREF(zone); 3132 mutex_exit(&zone->zone_lock); 3133 if (unref) { 3134 /* 3135 * There is only one reference to the zone -- that 3136 * added when the zone was added to the hashtables -- 3137 * and things will remain this way until we drop 3138 * zonehash_lock... we can go ahead and cleanup the 3139 * zone. 3140 */ 3141 break; 3142 } 3143 3144 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) { 3145 /* Signaled */ 3146 mutex_exit(&zonehash_lock); 3147 return (set_errno(EINTR)); 3148 } 3149 3150 } 3151 3152 /* 3153 * It is now safe to let the zone be recreated; remove it from the 3154 * lists. The memory will not be freed until the last cred 3155 * reference goes away. 3156 */ 3157 ASSERT(zonecount > 1); /* must be > 1; can't destroy global zone */ 3158 zonecount--; 3159 /* remove from active list and hash tables */ 3160 list_remove(&zone_active, zone); 3161 (void) mod_hash_destroy(zonehashbyname, 3162 (mod_hash_key_t)zone->zone_name); 3163 (void) mod_hash_destroy(zonehashbyid, 3164 (mod_hash_key_t)(uintptr_t)zone->zone_id); 3165 mutex_exit(&zonehash_lock); 3166 3167 /* 3168 * Release the root vnode; we're not using it anymore. Nor should any 3169 * other thread that might access it exist. 3170 */ 3171 if (zone->zone_rootvp != NULL) { 3172 VN_RELE(zone->zone_rootvp); 3173 zone->zone_rootvp = NULL; 3174 } 3175 3176 /* add to deathrow list */ 3177 mutex_enter(&zone_deathrow_lock); 3178 list_insert_tail(&zone_deathrow, zone); 3179 mutex_exit(&zone_deathrow_lock); 3180 3181 /* 3182 * Drop last reference (which was added by zsched()), this will 3183 * free the zone unless there are outstanding cred references. 3184 */ 3185 zone_rele(zone); 3186 return (0); 3187 } 3188 3189 /* 3190 * Systemcall entry point for zone_getattr(2). 
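 *
 * For illustration, a process might fetch its own zone's name through the
 * (private) libc wrapper of the same name, along these lines (sketch only;
 * error handling omitted):
 *
 *	char name[ZONENAME_MAX];
 *
 *	(void) zone_getattr(getzoneid(), ZONE_ATTR_NAME, name, sizeof (name));
 *
 * The return value is the full size of the attribute, so a caller may pass
 * a NULL (or short) buffer first to learn how much space it needs.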
3191 */ 3192 static ssize_t 3193 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) 3194 { 3195 size_t size; 3196 int error = 0, err; 3197 zone_t *zone; 3198 char *zonepath; 3199 zone_status_t zone_status; 3200 pid_t initpid; 3201 boolean_t global = (curproc->p_zone == global_zone); 3202 3203 mutex_enter(&zonehash_lock); 3204 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 3205 mutex_exit(&zonehash_lock); 3206 return (set_errno(EINVAL)); 3207 } 3208 zone_status = zone_status_get(zone); 3209 if (zone_status < ZONE_IS_READY) { 3210 mutex_exit(&zonehash_lock); 3211 return (set_errno(EINVAL)); 3212 } 3213 zone_hold(zone); 3214 mutex_exit(&zonehash_lock); 3215 3216 /* 3217 * If not in the global zone, don't show information about other zones. 3218 */ 3219 if (!global && curproc->p_zone != zone) { 3220 zone_rele(zone); 3221 return (set_errno(EINVAL)); 3222 } 3223 3224 switch (attr) { 3225 case ZONE_ATTR_ROOT: 3226 if (global) { 3227 /* 3228 * Copy the path to trim the trailing "/" (except for 3229 * the global zone). 3230 */ 3231 if (zone != global_zone) 3232 size = zone->zone_rootpathlen - 1; 3233 else 3234 size = zone->zone_rootpathlen; 3235 zonepath = kmem_alloc(size, KM_SLEEP); 3236 bcopy(zone->zone_rootpath, zonepath, size); 3237 zonepath[size - 1] = '\0'; 3238 } else { 3239 /* 3240 * Caller is not in the global zone, just return 3241 * faked-up path for current zone. 3242 */ 3243 zonepath = "/"; 3244 size = 2; 3245 } 3246 if (bufsize > size) 3247 bufsize = size; 3248 if (buf != NULL) { 3249 err = copyoutstr(zonepath, buf, bufsize, NULL); 3250 if (err != 0 && err != ENAMETOOLONG) 3251 error = EFAULT; 3252 } 3253 if (global) 3254 kmem_free(zonepath, size); 3255 break; 3256 3257 case ZONE_ATTR_NAME: 3258 size = strlen(zone->zone_name) + 1; 3259 if (bufsize > size) 3260 bufsize = size; 3261 if (buf != NULL) { 3262 err = copyoutstr(zone->zone_name, buf, bufsize, NULL); 3263 if (err != 0 && err != ENAMETOOLONG) 3264 error = EFAULT; 3265 } 3266 break; 3267 3268 case ZONE_ATTR_STATUS: 3269 /* 3270 * Since we're not holding zonehash_lock, the zone status 3271 * may be anything; leave it up to userland to sort it out. 
3272 */ 3273 size = sizeof (zone_status); 3274 if (bufsize > size) 3275 bufsize = size; 3276 zone_status = zone_status_get(zone); 3277 if (buf != NULL && 3278 copyout(&zone_status, buf, bufsize) != 0) 3279 error = EFAULT; 3280 break; 3281 case ZONE_ATTR_PRIVSET: 3282 size = sizeof (priv_set_t); 3283 if (bufsize > size) 3284 bufsize = size; 3285 if (buf != NULL && 3286 copyout(zone->zone_privset, buf, bufsize) != 0) 3287 error = EFAULT; 3288 break; 3289 case ZONE_ATTR_UNIQID: 3290 size = sizeof (zone->zone_uniqid); 3291 if (bufsize > size) 3292 bufsize = size; 3293 if (buf != NULL && 3294 copyout(&zone->zone_uniqid, buf, bufsize) != 0) 3295 error = EFAULT; 3296 break; 3297 case ZONE_ATTR_POOLID: 3298 { 3299 pool_t *pool; 3300 poolid_t poolid; 3301 3302 if (pool_lock_intr() != 0) { 3303 error = EINTR; 3304 break; 3305 } 3306 pool = zone_pool_get(zone); 3307 poolid = pool->pool_id; 3308 pool_unlock(); 3309 size = sizeof (poolid); 3310 if (bufsize > size) 3311 bufsize = size; 3312 if (buf != NULL && copyout(&poolid, buf, size) != 0) 3313 error = EFAULT; 3314 } 3315 break; 3316 case ZONE_ATTR_INITPID: 3317 size = sizeof (initpid); 3318 if (bufsize > size) 3319 bufsize = size; 3320 initpid = zone->zone_proc_initpid; 3321 if (initpid == -1) { 3322 error = ESRCH; 3323 break; 3324 } 3325 if (buf != NULL && 3326 copyout(&initpid, buf, bufsize) != 0) 3327 error = EFAULT; 3328 break; 3329 default: 3330 error = EINVAL; 3331 } 3332 zone_rele(zone); 3333 3334 if (error) 3335 return (set_errno(error)); 3336 return ((ssize_t)size); 3337 } 3338 3339 /* 3340 * Return zero if the process has at least one vnode mapped in to its 3341 * address space which shouldn't be allowed to change zones. 3342 */ 3343 static int 3344 as_can_change_zones(void) 3345 { 3346 proc_t *pp = curproc; 3347 struct seg *seg; 3348 struct as *as = pp->p_as; 3349 vnode_t *vp; 3350 int allow = 1; 3351 3352 ASSERT(pp->p_as != &kas); 3353 AS_LOCK_ENTER(&as, &as->a_lock, RW_READER); 3354 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 3355 /* 3356 * if we can't get a backing vnode for this segment then skip 3357 * it. 3358 */ 3359 vp = NULL; 3360 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL) 3361 continue; 3362 if (!vn_can_change_zones(vp)) { /* bail on first match */ 3363 allow = 0; 3364 break; 3365 } 3366 } 3367 AS_LOCK_EXIT(&as, &as->a_lock); 3368 return (allow); 3369 } 3370 3371 /* 3372 * Systemcall entry point for zone_enter(). 3373 * 3374 * The current process is injected into said zone. In the process 3375 * it will change its project membership, privileges, rootdir/cwd, 3376 * zone-wide rctls, and pool association to match those of the zone. 3377 * 3378 * The first zone_enter() called while the zone is in the ZONE_IS_READY 3379 * state will transition it to ZONE_IS_RUNNING. Processes may only 3380 * enter a zone that is "ready" or "running". 3381 */ 3382 static int 3383 zone_enter(zoneid_t zoneid) 3384 { 3385 zone_t *zone; 3386 vnode_t *vp; 3387 proc_t *pp = curproc; 3388 contract_t *ct; 3389 cont_process_t *ctp; 3390 task_t *tk, *oldtk; 3391 kproject_t *zone_proj0; 3392 cred_t *cr, *newcr; 3393 pool_t *oldpool, *newpool; 3394 sess_t *sp; 3395 uid_t uid; 3396 zone_status_t status; 3397 int err = 0; 3398 rctl_entity_p_t e; 3399 3400 if (secpolicy_zone_config(CRED()) != 0) 3401 return (set_errno(EPERM)); 3402 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 3403 return (set_errno(EINVAL)); 3404 3405 /* 3406 * Stop all lwps so we don't need to hold a lock to look at 3407 * curproc->p_zone. 
This needs to happen before we grab any 3408 * locks to avoid deadlock (another lwp in the process could 3409 * be waiting for the held lock). 3410 */ 3411 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) 3412 return (set_errno(EINTR)); 3413 3414 /* 3415 * Make sure we're not changing zones with files open or mapped in 3416 * to our address space which shouldn't be changing zones. 3417 */ 3418 if (!files_can_change_zones()) { 3419 err = EBADF; 3420 goto out; 3421 } 3422 if (!as_can_change_zones()) { 3423 err = EFAULT; 3424 goto out; 3425 } 3426 3427 mutex_enter(&zonehash_lock); 3428 if (pp->p_zone != global_zone) { 3429 mutex_exit(&zonehash_lock); 3430 err = EINVAL; 3431 goto out; 3432 } 3433 3434 zone = zone_find_all_by_id(zoneid); 3435 if (zone == NULL) { 3436 mutex_exit(&zonehash_lock); 3437 err = EINVAL; 3438 goto out; 3439 } 3440 3441 /* 3442 * To prevent processes in a zone from holding contracts on 3443 * extrazonal resources, and to avoid process contract 3444 * memberships which span zones, contract holders and processes 3445 * which aren't the sole members of their encapsulating process 3446 * contracts are not allowed to zone_enter. 3447 */ 3448 ctp = pp->p_ct_process; 3449 ct = &ctp->conp_contract; 3450 mutex_enter(&ct->ct_lock); 3451 mutex_enter(&pp->p_lock); 3452 if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) { 3453 mutex_exit(&pp->p_lock); 3454 mutex_exit(&ct->ct_lock); 3455 mutex_exit(&zonehash_lock); 3456 pool_unlock(); 3457 err = EINVAL; 3458 goto out; 3459 } 3460 3461 /* 3462 * Moreover, we don't allow processes whose encapsulating 3463 * process contracts have inherited extrazonal contracts. 3464 * While it would be easier to eliminate all process contracts 3465 * with inherited contracts, we need to be able to give a 3466 * restarted init (or other zone-penetrating process) its 3467 * predecessor's contracts. 3468 */ 3469 if (ctp->conp_ninherited != 0) { 3470 contract_t *next; 3471 for (next = list_head(&ctp->conp_inherited); next; 3472 next = list_next(&ctp->conp_inherited, next)) { 3473 if (contract_getzuniqid(next) != zone->zone_uniqid) { 3474 mutex_exit(&pp->p_lock); 3475 mutex_exit(&ct->ct_lock); 3476 mutex_exit(&zonehash_lock); 3477 pool_unlock(); 3478 err = EINVAL; 3479 goto out; 3480 } 3481 } 3482 } 3483 mutex_exit(&pp->p_lock); 3484 mutex_exit(&ct->ct_lock); 3485 3486 status = zone_status_get(zone); 3487 if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) { 3488 /* 3489 * Can't join 3490 */ 3491 mutex_exit(&zonehash_lock); 3492 err = EINVAL; 3493 goto out; 3494 } 3495 3496 /* 3497 * Make sure new priv set is within the permitted set for caller 3498 */ 3499 if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) { 3500 mutex_exit(&zonehash_lock); 3501 err = EPERM; 3502 goto out; 3503 } 3504 /* 3505 * We want to momentarily drop zonehash_lock while we optimistically 3506 * bind curproc to the pool it should be running in. This is safe 3507 * since the zone can't disappear (we have a hold on it). 3508 */ 3509 zone_hold(zone); 3510 mutex_exit(&zonehash_lock); 3511 3512 /* 3513 * Grab pool_lock to keep the pools configuration from changing 3514 * and to stop ourselves from getting rebound to another pool 3515 * until we join the zone. 3516 */ 3517 if (pool_lock_intr() != 0) { 3518 zone_rele(zone); 3519 err = EINTR; 3520 goto out; 3521 } 3522 ASSERT(secpolicy_pool(CRED()) == 0); 3523 /* 3524 * Bind ourselves to the pool currently associated with the zone. 
3525 */ 3526 oldpool = curproc->p_pool; 3527 newpool = zone_pool_get(zone); 3528 if (pool_state == POOL_ENABLED && newpool != oldpool && 3529 (err = pool_do_bind(newpool, P_PID, P_MYID, 3530 POOL_BIND_ALL)) != 0) { 3531 pool_unlock(); 3532 zone_rele(zone); 3533 goto out; 3534 } 3535 3536 /* 3537 * Grab cpu_lock now; we'll need it later when we call 3538 * task_join(). 3539 */ 3540 mutex_enter(&cpu_lock); 3541 mutex_enter(&zonehash_lock); 3542 /* 3543 * Make sure the zone hasn't moved on since we dropped zonehash_lock. 3544 */ 3545 if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) { 3546 /* 3547 * Can't join anymore. 3548 */ 3549 mutex_exit(&zonehash_lock); 3550 mutex_exit(&cpu_lock); 3551 if (pool_state == POOL_ENABLED && 3552 newpool != oldpool) 3553 (void) pool_do_bind(oldpool, P_PID, P_MYID, 3554 POOL_BIND_ALL); 3555 pool_unlock(); 3556 zone_rele(zone); 3557 err = EINVAL; 3558 goto out; 3559 } 3560 3561 mutex_enter(&pp->p_lock); 3562 zone_proj0 = zone->zone_zsched->p_task->tk_proj; 3563 /* verify that we do not exceed any task or lwp limits */ 3564 mutex_enter(&zone->zone_nlwps_lock); 3565 /* add new lwps to zone and zone's proj0 */ 3566 zone_proj0->kpj_nlwps += pp->p_lwpcnt; 3567 zone->zone_nlwps += pp->p_lwpcnt; 3568 /* add 1 task to zone's proj0 */ 3569 zone_proj0->kpj_ntasks += 1; 3570 mutex_exit(&pp->p_lock); 3571 mutex_exit(&zone->zone_nlwps_lock); 3572 3573 /* remove lwps from proc's old zone and old project */ 3574 mutex_enter(&pp->p_zone->zone_nlwps_lock); 3575 pp->p_zone->zone_nlwps -= pp->p_lwpcnt; 3576 pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt; 3577 mutex_exit(&pp->p_zone->zone_nlwps_lock); 3578 3579 /* 3580 * Joining the zone cannot fail from now on. 3581 * 3582 * This means that a lot of the following code can be commonized and 3583 * shared with zsched(). 3584 */ 3585 3586 /* 3587 * Reset the encapsulating process contract's zone. 3588 */ 3589 ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID); 3590 contract_setzuniqid(ct, zone->zone_uniqid); 3591 3592 /* 3593 * Create a new task and associate the process with the project keyed 3594 * by (projid,zoneid). 3595 * 3596 * We might as well be in project 0; the global zone's projid doesn't 3597 * make much sense in a zone anyhow. 3598 * 3599 * This also increments zone_ntasks, and returns with p_lock held. 3600 */ 3601 tk = task_create(0, zone); 3602 oldtk = task_join(tk, 0); 3603 mutex_exit(&cpu_lock); 3604 3605 pp->p_flag |= SZONETOP; 3606 pp->p_zone = zone; 3607 3608 /* 3609 * call RCTLOP_SET functions on this proc 3610 */ 3611 e.rcep_p.zone = zone; 3612 e.rcep_t = RCENTITY_ZONE; 3613 (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL, 3614 RCD_CALLBACK); 3615 mutex_exit(&pp->p_lock); 3616 3617 /* 3618 * We don't need to hold any of zsched's locks here; not only do we know 3619 * the process and zone aren't going away, we know its session isn't 3620 * changing either. 3621 * 3622 * By joining zsched's session here, we mimic the behavior in the 3623 * global zone of init's sid being the pid of sched. We extend this 3624 * to all zlogin-like zone_enter()'ing processes as well. 3625 */ 3626 mutex_enter(&pidlock); 3627 sp = zone->zone_zsched->p_sessp; 3628 SESS_HOLD(sp); 3629 mutex_enter(&pp->p_lock); 3630 pgexit(pp); 3631 SESS_RELE(pp->p_sessp); 3632 pp->p_sessp = sp; 3633 pgjoin(pp, zone->zone_zsched->p_pidp); 3634 mutex_exit(&pp->p_lock); 3635 mutex_exit(&pidlock); 3636 3637 mutex_exit(&zonehash_lock); 3638 /* 3639 * We're firmly in the zone; let pools progress.
3640 */ 3641 pool_unlock(); 3642 task_rele(oldtk); 3643 /* 3644 * We don't need to retain a hold on the zone since we already 3645 * incremented zone_ntasks, so the zone isn't going anywhere. 3646 */ 3647 zone_rele(zone); 3648 3649 /* 3650 * Chroot 3651 */ 3652 vp = zone->zone_rootvp; 3653 zone_chdir(vp, &PTOU(pp)->u_cdir, pp); 3654 zone_chdir(vp, &PTOU(pp)->u_rdir, pp); 3655 3656 /* 3657 * Change process credentials 3658 */ 3659 newcr = cralloc(); 3660 mutex_enter(&pp->p_crlock); 3661 cr = pp->p_cred; 3662 crcopy_to(cr, newcr); 3663 crsetzone(newcr, zone); 3664 pp->p_cred = newcr; 3665 3666 /* 3667 * Restrict all process privilege sets to zone limit 3668 */ 3669 priv_intersect(zone->zone_privset, &CR_PPRIV(newcr)); 3670 priv_intersect(zone->zone_privset, &CR_EPRIV(newcr)); 3671 priv_intersect(zone->zone_privset, &CR_IPRIV(newcr)); 3672 priv_intersect(zone->zone_privset, &CR_LPRIV(newcr)); 3673 mutex_exit(&pp->p_crlock); 3674 crset(pp, newcr); 3675 3676 /* 3677 * Adjust upcount to reflect zone entry. 3678 */ 3679 uid = crgetruid(newcr); 3680 mutex_enter(&pidlock); 3681 upcount_dec(uid, GLOBAL_ZONEID); 3682 upcount_inc(uid, zoneid); 3683 mutex_exit(&pidlock); 3684 3685 /* 3686 * Set up core file path and content. 3687 */ 3688 set_core_defaults(); 3689 3690 out: 3691 /* 3692 * Let the other lwps continue. 3693 */ 3694 mutex_enter(&pp->p_lock); 3695 if (curthread != pp->p_agenttp) 3696 continuelwps(pp); 3697 mutex_exit(&pp->p_lock); 3698 3699 return (err != 0 ? set_errno(err) : 0); 3700 } 3701 3702 /* 3703 * Systemcall entry point for zone_list(2). 3704 * 3705 * Processes running in a (non-global) zone only see themselves. 3706 */ 3707 static int 3708 zone_list(zoneid_t *zoneidlist, uint_t *numzones) 3709 { 3710 zoneid_t *zoneids; 3711 zone_t *zone; 3712 uint_t user_nzones, real_nzones; 3713 int error = 0; 3714 uint_t i; 3715 3716 if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0) 3717 return (set_errno(EFAULT)); 3718 3719 if (curproc->p_zone != global_zone) { 3720 /* just return current zone */ 3721 real_nzones = 1; 3722 zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP); 3723 zoneids[0] = curproc->p_zone->zone_id; 3724 } else { 3725 mutex_enter(&zonehash_lock); 3726 real_nzones = zonecount; 3727 if (real_nzones) { 3728 zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t), 3729 KM_SLEEP); 3730 i = 0; 3731 for (zone = list_head(&zone_active); zone != NULL; 3732 zone = list_next(&zone_active, zone)) 3733 zoneids[i++] = zone->zone_id; 3734 ASSERT(i == real_nzones); 3735 } 3736 mutex_exit(&zonehash_lock); 3737 } 3738 3739 if (user_nzones > real_nzones) 3740 user_nzones = real_nzones; 3741 3742 if (copyout(&real_nzones, numzones, sizeof (uint_t)) != 0) 3743 error = EFAULT; 3744 else if (zoneidlist != NULL && user_nzones != 0) { 3745 if (copyout(zoneids, zoneidlist, 3746 user_nzones * sizeof (zoneid_t)) != 0) 3747 error = EFAULT; 3748 } 3749 3750 if (real_nzones) 3751 kmem_free(zoneids, real_nzones * sizeof (zoneid_t)); 3752 3753 if (error) 3754 return (set_errno(error)); 3755 else 3756 return (0); 3757 } 3758 3759 /* 3760 * Systemcall entry point for zone_lookup(2). 3761 * 3762 * Non-global zones are only able to see themselves. 
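 *
 * For illustration, getzoneidbyname(3C) is the userland interface that
 * corresponds to this entry point ("web01" below is just a made-up name):
 *
 *	zoneid_t id;
 *
 *	if ((id = getzoneidbyname("web01")) == -1)
 *		;	/* no such zone, or it is not visible to this caller */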
3763 */ 3764 static zoneid_t 3765 zone_lookup(const char *zone_name) 3766 { 3767 char *kname; 3768 zone_t *zone; 3769 zoneid_t zoneid; 3770 int err; 3771 3772 if (zone_name == NULL) { 3773 /* return caller's zone id */ 3774 return (getzoneid()); 3775 } 3776 3777 kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP); 3778 if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) { 3779 kmem_free(kname, ZONENAME_MAX); 3780 return (set_errno(err)); 3781 } 3782 3783 mutex_enter(&zonehash_lock); 3784 zone = zone_find_all_by_name(kname); 3785 kmem_free(kname, ZONENAME_MAX); 3786 if (zone == NULL || zone_status_get(zone) < ZONE_IS_READY || 3787 (curproc->p_zone != global_zone && curproc->p_zone != zone)) { 3788 /* in non-global zone, can only lookup own name */ 3789 mutex_exit(&zonehash_lock); 3790 return (set_errno(EINVAL)); 3791 } 3792 zoneid = zone->zone_id; 3793 mutex_exit(&zonehash_lock); 3794 return (zoneid); 3795 } 3796 3797 /* ARGSUSED */ 3798 long 3799 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) 3800 { 3801 zone_def zs; 3802 3803 switch (cmd) { 3804 case ZONE_CREATE: 3805 if (get_udatamodel() == DATAMODEL_NATIVE) { 3806 if (copyin(arg1, &zs, sizeof (zone_def))) { 3807 return (set_errno(EFAULT)); 3808 } 3809 } else { 3810 #ifdef _SYSCALL32_IMPL 3811 zone_def32 zs32; 3812 3813 if (copyin(arg1, &zs32, sizeof (zone_def32))) { 3814 return (set_errno(EFAULT)); 3815 } 3816 zs.zone_name = 3817 (const char *)(unsigned long)zs32.zone_name; 3818 zs.zone_root = 3819 (const char *)(unsigned long)zs32.zone_root; 3820 zs.zone_privs = 3821 (const struct priv_set *) 3822 (unsigned long)zs32.zone_privs; 3823 zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf; 3824 zs.rctlbufsz = zs32.rctlbufsz; 3825 zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf; 3826 zs.zfsbufsz = zs32.zfsbufsz; 3827 zs.extended_error = 3828 (int *)(unsigned long)zs32.extended_error; 3829 #else 3830 panic("get_udatamodel() returned bogus result\n"); 3831 #endif 3832 } 3833 3834 return (zone_create(zs.zone_name, zs.zone_root, 3835 zs.zone_privs, (caddr_t)zs.rctlbuf, zs.rctlbufsz, 3836 (caddr_t)zs.zfsbuf, zs.zfsbufsz, 3837 zs.extended_error)); 3838 case ZONE_BOOT: 3839 return (zone_boot((zoneid_t)(uintptr_t)arg1, 3840 (const char *)arg2)); 3841 case ZONE_DESTROY: 3842 return (zone_destroy((zoneid_t)(uintptr_t)arg1)); 3843 case ZONE_GETATTR: 3844 return (zone_getattr((zoneid_t)(uintptr_t)arg1, 3845 (int)(uintptr_t)arg2, arg3, (size_t)arg4)); 3846 case ZONE_ENTER: 3847 return (zone_enter((zoneid_t)(uintptr_t)arg1)); 3848 case ZONE_LIST: 3849 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2)); 3850 case ZONE_SHUTDOWN: 3851 return (zone_shutdown((zoneid_t)(uintptr_t)arg1)); 3852 case ZONE_LOOKUP: 3853 return (zone_lookup((const char *)arg1)); 3854 default: 3855 return (set_errno(EINVAL)); 3856 } 3857 } 3858 3859 struct zarg { 3860 zone_t *zone; 3861 zone_cmd_arg_t arg; 3862 }; 3863 3864 static int 3865 zone_lookup_door(const char *zone_name, door_handle_t *doorp) 3866 { 3867 char *buf; 3868 size_t buflen; 3869 int error; 3870 3871 buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name); 3872 buf = kmem_alloc(buflen, KM_SLEEP); 3873 (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name); 3874 error = door_ki_open(buf, doorp); 3875 kmem_free(buf, buflen); 3876 return (error); 3877 } 3878 3879 static void 3880 zone_release_door(door_handle_t *doorp) 3881 { 3882 door_ki_rele(*doorp); 3883 *doorp = NULL; 3884 } 3885 3886 static void 3887 zone_ki_call_zoneadmd(struct zarg *zargp) 3888 { 3889 door_handle_t door = NULL; 3890 door_arg_t 
static void
zone_ki_call_zoneadmd(struct zarg *zargp)
{
	door_handle_t door = NULL;
	door_arg_t darg, save_arg;
	char *zone_name;
	size_t zone_namelen;
	zoneid_t zoneid;
	zone_t *zone;
	zone_cmd_arg_t arg;
	uint64_t uniqid;
	size_t size;
	int error;
	int retry;

	zone = zargp->zone;
	arg = zargp->arg;
	kmem_free(zargp, sizeof (*zargp));

	zone_namelen = strlen(zone->zone_name) + 1;
	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
	bcopy(zone->zone_name, zone_name, zone_namelen);
	zoneid = zone->zone_id;
	uniqid = zone->zone_uniqid;
	/*
	 * zoneadmd may be down, but at least we can empty out the zone.
	 * We can ignore the return value of zone_empty() since we're called
	 * from a kernel thread and know we won't be delivered any signals.
	 */
	ASSERT(curproc == &p0);
	(void) zone_empty(zone);
	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
	zone_rele(zone);

	size = sizeof (arg);
	darg.rbuf = (char *)&arg;
	darg.data_ptr = (char *)&arg;
	darg.rsize = size;
	darg.data_size = size;
	darg.desc_ptr = NULL;
	darg.desc_num = 0;

	save_arg = darg;
	/*
	 * Since we're not holding a reference to the zone, any number of
	 * things can go wrong, including the zone disappearing before we get
	 * a chance to talk to zoneadmd.
	 */
	for (retry = 0; /* forever */; retry++) {
		if (door == NULL &&
		    (error = zone_lookup_door(zone_name, &door)) != 0) {
			goto next;
		}
		ASSERT(door != NULL);

		if ((error = door_ki_upcall(door, &darg)) == 0) {
			break;
		}
		switch (error) {
		case EINTR:
			/* FALLTHROUGH */
		case EAGAIN:	/* process may be forking */
			/*
			 * Back off for a bit
			 */
			break;
		case EBADF:
			zone_release_door(&door);
			if (zone_lookup_door(zone_name, &door) != 0) {
				/*
				 * zoneadmd may be dead, but it may come back
				 * to life later.
				 */
				break;
			}
			break;
		default:
			cmn_err(CE_WARN,
			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
			    error);
			goto out;
		}
next:
		/*
		 * If this isn't the same zone_t that we originally had in
		 * mind, then this is the same as if two kadmin requests come
		 * in at the same time: the first one wins.  This means we
		 * lose, so we bail.
		 */
		if ((zone = zone_find_by_id(zoneid)) == NULL) {
			/*
			 * Problem is solved.
			 */
			break;
		}
		if (zone->zone_uniqid != uniqid) {
			/*
			 * zoneid recycled
			 */
			zone_rele(zone);
			break;
		}
		/*
		 * We could zone_status_timedwait(), but there doesn't seem to
		 * be much point in doing that (plus, it would mean that
		 * zone_free() isn't called until this thread exits).
		 */
		zone_rele(zone);
		delay(hz);
		darg = save_arg;
	}
out:
	if (door != NULL) {
		zone_release_door(&door);
	}
	kmem_free(zone_name, zone_namelen);
	thread_exit();
}
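/*
 * Illustrative sketch (not compiled): the retry loop above uses zone_uniqid
 * as a generation number, because a zoneid can be recycled once a zone is
 * destroyed and a new one created.  The condensed pattern, with a
 * hypothetical helper name, looks like this:
 */
#if 0
static boolean_t
zone_same_incarnation(zoneid_t zoneid, uint64_t uniqid)
{
	zone_t *zone;
	boolean_t same;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (B_FALSE);		/* zone is gone entirely */
	same = (zone->zone_uniqid == uniqid);	/* same incarnation of the id? */
	zone_rele(zone);			/* drop hold from zone_find_by_id() */
	return (same);
}
#endif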
/*
 * Entry point for uadmin() to tell the zone to go away or reboot.  The
 * caller is a process in the zone to be modified.
 *
 * In order to shut down the zone, we will hand off control to zoneadmd
 * (running in the global zone) via a door.  We do a half-hearted job of
 * killing all processes in the zone, create a kernel thread to contact
 * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
 * a form of generation number used to let zoneadmd (as well as
 * zone_destroy()) know exactly which zone they're talking about.
 */
int
zone_uadmin(int cmd, int fcn, cred_t *credp)
{
	struct zarg *zargp;
	zone_cmd_t zcmd;
	zone_t *zone;

	zone = curproc->p_zone;
	ASSERT(getzoneid() != GLOBAL_ZONEID);

	switch (cmd) {
	case A_SHUTDOWN:
		switch (fcn) {
		case AD_HALT:
		case AD_POWEROFF:
			zcmd = Z_HALT;
			break;
		case AD_BOOT:
			zcmd = Z_REBOOT;
			break;
		case AD_IBOOT:
		case AD_SBOOT:
		case AD_SIBOOT:
		case AD_NOSYNC:
			return (ENOTSUP);
		default:
			return (EINVAL);
		}
		break;
	case A_REBOOT:
		zcmd = Z_REBOOT;
		break;
	case A_FTRACE:
	case A_REMOUNT:
	case A_FREEZE:
	case A_DUMP:
		return (ENOTSUP);
	default:
		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
		return (EINVAL);
	}

	if (secpolicy_zone_admin(credp, B_FALSE))
		return (EPERM);
	mutex_enter(&zone_status_lock);
	/*
	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
	 * is in the zone.
	 */
	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
		/*
		 * This zone is already on its way down.
		 */
		mutex_exit(&zone_status_lock);
		return (0);
	}
	/*
	 * Prevent future zone_enter()s
	 */
	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
	mutex_exit(&zone_status_lock);

	/*
	 * Kill everyone now and call zoneadmd later.
	 * zone_ki_call_zoneadmd() will do a more thorough job of this
	 * later.
	 */
	killall(zone->zone_id);
	/*
	 * Now, create the thread to contact zoneadmd and do the rest of the
	 * work.  This thread can't be created in our zone otherwise
	 * zone_destroy() would deadlock.
	 */
	zargp = kmem_alloc(sizeof (*zargp), KM_SLEEP);
	zargp->arg.cmd = zcmd;
	zargp->arg.uniqid = zone->zone_uniqid;
	(void) strcpy(zargp->arg.locale, "C");
	zone_hold(zargp->zone = zone);

	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
	    TS_RUN, minclsyspri);
	exit(CLD_EXITED, 0);

	return (EINVAL);
}

/*
 * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
 * status to ZONE_IS_SHUTTING_DOWN.
 */
void
zone_shutdown_global(void)
{
	ASSERT(curproc->p_zone == global_zone);

	mutex_enter(&zone_status_lock);
	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
	mutex_exit(&zone_status_lock);
}
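/*
 * Illustrative sketch (not compiled): zone_uadmin() above is reached through
 * the ordinary uadmin(2) interface once the caller is known not to be in the
 * global zone.  A suitably privileged process rebooting its own zone might
 * look like the hypothetical userland fragment below.
 */
#if 0
#include <sys/uadmin.h>

int
reboot_my_zone(void)
{
	/* Handled in the kernel by zone_uadmin(A_SHUTDOWN, AD_BOOT, ...). */
	return (uadmin(A_SHUTDOWN, AD_BOOT, 0));
}
#endif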
/*
 * Returns true if the named dataset is visible in the current zone.
 * The 'write' parameter is set to 1 if the dataset is also writable.
 */
int
zone_dataset_visible(const char *dataset, int *write)
{
	zone_dataset_t *zd;
	size_t len;
	zone_t *zone = curproc->p_zone;

	if (dataset[0] == '\0')
		return (0);

	/*
	 * Walk the list once, looking for datasets which match exactly, or
	 * specify a dataset underneath an exported dataset.  If found,
	 * return true and note that it is writable.
	 */
	for (zd = list_head(&zone->zone_datasets); zd != NULL;
	    zd = list_next(&zone->zone_datasets, zd)) {

		len = strlen(zd->zd_dataset);
		if (strlen(dataset) >= len &&
		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
		    (zd->zd_dataset[len - 1] == '/' ||
		    dataset[len] == '\0' || dataset[len] == '/')) {
			if (write)
				*write = 1;
			return (1);
		}
	}

	/*
	 * Walk the list a second time, searching for datasets which are
	 * parents of exported datasets.  These should be visible, but
	 * read-only.
	 *
	 * Note that we also have to support forms such as 'pool/dataset/',
	 * with a trailing slash.
	 */
	for (zd = list_head(&zone->zone_datasets); zd != NULL;
	    zd = list_next(&zone->zone_datasets, zd)) {

		len = strlen(dataset);
		if (dataset[len - 1] == '/')
			len--;	/* Ignore trailing slash */
		if (len < strlen(zd->zd_dataset) &&
		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
		    zd->zd_dataset[len] == '/') {
			if (write)
				*write = 0;
			return (1);
		}
	}

	return (0);
}
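/*
 * Worked example (illustrative only): if a zone's zone_datasets list
 * contains the single, hypothetical entry "tank/zones/web", then
 * zone_dataset_visible() behaves as follows:
 *
 *	"tank/zones/web"	-> visible, *write = 1	(exact match)
 *	"tank/zones/web/data"	-> visible, *write = 1	(child of export)
 *	"tank/zones"		-> visible, *write = 0	(parent of export)
 *	"tank/zones/"		-> visible, *write = 0	(trailing slash OK)
 *	"tank/other"		-> not visible		(returns 0)
 */
#if 0
/* Hypothetical caller pattern for a dataset name supplied by a zone: */
static int
check_dataset_access(const char *dsname)
{
	int writable;

	if (!zone_dataset_visible(dsname, &writable))
		return (ENOENT);	/* not visible in this zone */
	if (!writable)
		return (EROFS);		/* visible, but read-only here */
	return (0);
}
#endif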