/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Zones
 *
 *   A zone is a named collection of processes, namespace constraints,
 *   and other system resources which comprise a secure and manageable
 *   application containment facility.
 *
 *   Zones (represented by the reference counted zone_t) are tracked in
 *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 *   (zoneid_t) are used to track zone association.  Zone IDs are
 *   dynamically generated when the zone is created; if a persistent
 *   identifier is needed (core files, accounting logs, audit trail,
 *   etc.), the zone name should be used.
 *
 *
 *   Global Zone:
 *
 *   The global zone (zoneid 0) is automatically associated with all
 *   system resources that have not been bound to a user-created zone.
 *   This means that even systems where zones are not in active use
 *   have a global zone, and all processes, mounts, etc. are
 *   associated with that zone.  The global zone is generally
 *   unconstrained in terms of privileges and access, though the usual
 *   credential and privilege based restrictions apply.
 *
 *
 *   Zone States:
 *
 *   The states a zone may be in, and the transitions between them, are
 *   as follows:
 *
 *   ZONE_IS_UNINITIALIZED: primordial state for a zone.  The partially
 *   initialized zone is added to the list of active zones on the system but
 *   isn't accessible.
 *
 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 *   ready.  The zone is made visible after the ZSD constructor callbacks are
 *   executed.  A zone remains in this state until it transitions into
 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 *   state.
 *
 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
 *   successfully started init.  A zone remains in this state until
 *   zone_shutdown() is called.
 *
 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
 *   killing all processes running in the zone.  The zone remains
 *   in this state until there are no more user processes running in the zone.
 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 *   Since zone_shutdown() is restartable, it may be called successfully
 *   multiple times for the same zone_t.  Setting of the zone's state to
 *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may
 *   check the zone's status without worrying about it being a moving target.
 *
 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 *   are no more user processes in the zone.  The zone remains in this
 *   state until there are no more kernel threads associated with the
 *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 *   fail.
 *
 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 *   join the zone or create kernel threads therein.
 *
 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 *   return NULL from now on.
 *
 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 *   processes or threads doing work on behalf of the zone.  The zone is
 *   removed from the list of active zones.  zone_destroy() returns, and
 *   the zone can be recreated.
 *
 *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 *   callbacks are executed, and all memory associated with the zone is
 *   freed.
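 *
 *   The normal progression through these states is thus:
 *
 *	ZONE_IS_UNINITIALIZED --> ZONE_IS_READY --> ZONE_IS_BOOTING -->
 *	ZONE_IS_RUNNING --> ZONE_IS_SHUTTING_DOWN --> ZONE_IS_EMPTY -->
 *	ZONE_IS_DOWN --> ZONE_IS_DYING --> ZONE_IS_DEAD --> (ZONE_IS_FREE)
 *
 *   with the one shortcut noted above: a zone whose init cannot be started
 *   moves directly from ZONE_IS_BOOTING to ZONE_IS_SHUTTING_DOWN.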
 *
 *   Threads can wait for the zone to enter a requested state by using
 *   zone_status_wait() or zone_status_timedwait() with the desired
 *   state passed in as an argument.  Zone state transitions are
 *   uni-directional; it is not possible to move back to an earlier state.
 *
 *
 *   Zone-Specific Data:
 *
 *   Subsystems needing to maintain zone-specific data can store that
 *   data using the ZSD mechanism.  This provides a zone-specific data
 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 *   to register callbacks to be invoked when a zone is created, shut
 *   down, or destroyed.  This can be used to initialize zone-specific
 *   data for new zones and to clean up when zones go away.
 *
 *
 *   Data Structures:
 *
 *   The per-zone structure (zone_t) is reference counted, and freed
 *   when all references are released.  zone_hold and zone_rele can be
 *   used to adjust the reference count.  In addition, reference counts
 *   associated with the cred_t structure are tracked separately using
 *   zone_cred_hold and zone_cred_rele.
 *
 *   Pointers to active zone_t's are stored in two hash tables; one
 *   for searching by id, the other for searching by name.  Lookups
 *   can be performed on either basis, using zone_find_by_id and
 *   zone_find_by_name.  Both return zone_t pointers with the zone
 *   held, so zone_rele should be called when the pointer is no longer
 *   needed.  Zones can also be searched by path; zone_find_by_path
 *   returns the zone with which a path name is associated (global
 *   zone if the path is not within some other zone's file system
 *   hierarchy).  This currently requires iterating through each zone,
 *   so it is slower than an id or name search via a hash table.
 *
 *
 *   Locking:
 *
 *   zonehash_lock: This is a top-level global lock used to protect the
 *       zone hash tables and lists.  Zones cannot be created or destroyed
 *       while this lock is held.
 *   zone_status_lock: This is a global lock protecting zone state.
 *       Zones cannot change state while this lock is held.
 *       It also protects the list of kernel threads associated with a zone.
 *   zone_lock: This is a per-zone lock used to protect several fields of
 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 *       this lock means that the zone cannot go away.
 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *       list (a list of zones in the ZONE_IS_DEAD state).
 *
 *   Ordering requirements:
 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *	 zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 *   Blocking memory allocations are permitted while holding any of the
 *   zone locks.
 *
 *
 *   System Call Interface:
 *
 *   The zone subsystem can be managed and queried from user level with
 *   the following system calls (all subcodes of the primary "zone"
 *   system call):
 *   - zone_create: creates a zone with selected attributes (name,
 *     root path, privileges, resource controls, ZFS datasets)
 *   - zone_enter: allows the current process to enter a zone
 *   - zone_getattr: reports attributes of a zone
 *   - zone_list: lists all zones active in the system
 *   - zone_lookup: looks up zone id based on name
 *   - zone_shutdown: initiates shutdown process (see states above)
 *   - zone_destroy: completes shutdown process (see states above)
 *
 */

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/log.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/fs/snode.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/zone.h>

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;	/* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except for by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;	/* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_nlwps;
/*
 * Synchronization primitives used to synchronize between mounts and zone
 * creation/destruction.
 */
static int mounts_in_progress;
static kcondvar_t mount_cv;
static kmutex_t mount_lock;

const char * const zone_initname = "/sbin/init";

static int zone_shutdown(zoneid_t zoneid);

/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure; and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 */
static const int ZONE_SYSCALL_API_VERSION = 3;

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created such that
 * nfs_mount() thinks it is in the global zone, while by the time it
 * gets added to the list of mounted zones, it ends up on zoneA's mount
 * list.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone creation and new mounts.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone creations, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is unfair bias towards the
 * "current" operation.
 * This means that zone creations may starve if there is a rapid
 * succession of new mounts coming in to the system, or there is a
 * remote possibility that zones will be created at such a rate that
 * new mounts will not be able to proceed.
 */
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(void)
{
	int retval = 0;

	/*
	 * Since it may block for a long time, block_mounts() shouldn't be
	 * called with zonehash_lock held.
	 */
	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
	mutex_enter(&mount_lock);
	while (mounts_in_progress > 0) {
		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
			goto signaled;
	}
	/*
	 * A negative value of mounts_in_progress indicates that mounts
	 * have been blocked by (-mounts_in_progress) different callers.
	 */
	mounts_in_progress--;
	retval = 1;
signaled:
	mutex_exit(&mount_lock);
	return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(void)
{
	mutex_enter(&mount_lock);
	if (++mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * The VFS layer is busy with a mount; zones should wait until all
 * mounts are completed to progress.
 */
void
mount_in_progress(void)
{
	mutex_enter(&mount_lock);
	while (mounts_in_progress < 0)
		cv_wait(&mount_cv, &mount_lock);
	mounts_in_progress++;
	mutex_exit(&mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(void)
{
	mutex_enter(&mount_lock);
	if (--mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}
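/*
 * A minimal sketch (illustrative only; "example_fs_mount" and
 * "example_do_mount" are hypothetical names, not part of this file) of
 * how a VFS-level caller is expected to bracket a mount with the pair
 * above, so that it cannot interleave with the section of zone creation
 * bracketed by block_mounts()/resume_mounts():
 */
static int
example_fs_mount(vfs_t *vfsp, cred_t *cr)
{
	int error;

	mount_in_progress();		/* waits out any block_mounts() */
	error = example_do_mount(vfsp, cr);	/* hypothetical worker */
	mount_completed();		/* may wake a block_mounts() caller */
	return (error);
}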
/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shutdown, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  The
 * constructor callback is called immediately on all existing zones, and a
 * copy of the ZSD entry added to the per-zone zone_zsd list (protected by
 * zone_lock).  As this operation requires the list of zones, the list of
 * registered keys, and the per-zone list of ZSD entries to remain constant
 * throughout the entire operation, it must grab zonehash_lock, zone_lock for
 * all existing zones, and zsd_key_lock, in that order.  Similar locking is
 * needed when zone_key_delete() is called.  It is thus sufficient to hold
 * zsd_key_lock *or* zone_lock to prevent additions to or removals from the
 * per-zone zone_zsd list.
 *
 * Note that this implementation does not make a copy of the ZSD entry if a
 * constructor callback is not provided.  A zone_getspecific() on such an
 * uninitialized ZSD entry will return NULL.
 *
 * When new zones are created constructor callbacks for all registered ZSD
 * entries will be called.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */
void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	struct zone *zone;

	zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP);
	zsdp->zsd_data = NULL;
	zsdp->zsd_create = create;
	zsdp->zsd_shutdown = shutdown;
	zsdp->zsd_destroy = destroy;

	mutex_enter(&zonehash_lock);	/* stop the world */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_enter(&zone->zone_lock);	/* lock all zones */

	mutex_enter(&zsd_key_lock);
	*keyp = zsdp->zsd_key = ++zsd_keyval;
	ASSERT(zsd_keyval != 0);
	list_insert_tail(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	if (create != NULL) {
		for (zone = list_head(&zone_active); zone != NULL;
		    zone = list_next(&zone_active, zone)) {
			t = kmem_alloc(sizeof (*t), KM_SLEEP);
			t->zsd_key = *keyp;
			t->zsd_data = (*create)(zone->zone_id);
			t->zsd_create = create;
			t->zsd_shutdown = shutdown;
			t->zsd_destroy = destroy;
			list_insert_tail(&zone->zone_zsd, t);
		}
	}
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_exit(&zone->zone_lock);
	mutex_exit(&zonehash_lock);
}
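/*
 * A minimal sketch of how a subsystem registers for ZSD; the "foo_*"
 * names are hypothetical, not part of this file.  foo_zone_init() is
 * run for every existing zone and for each zone subsequently created;
 * foo_zone_fini() must tolerate a NULL data pointer (see the block
 * comment above).
 */
static zone_key_t foo_zone_key;		/* hypothetical */

static void *
foo_zone_init(zoneid_t zoneid)
{
	/* allocate this zone's private "foo" state */
	return (kmem_zalloc(sizeof (uint64_t), KM_SLEEP));
}

static void
foo_zone_fini(zoneid_t zoneid, void *data)
{
	if (data != NULL)
		kmem_free(data, sizeof (uint64_t));
}

static void
foo_init(void)
{
	zone_key_create(&foo_zone_key, foo_zone_init, NULL, foo_zone_fini);
}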
/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
	struct zsd_entry *zsd;

	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
		if (zsd->zsd_key == key) {
			/*
			 * Move to head of list to keep list in MRU order.
			 */
			if (zsd != list_head(l)) {
				list_remove(l, zsd);
				list_insert_head(l, zsd);
			}
			return (zsd);
		}
	}
	return (NULL);
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
 */
int
zone_key_delete(zone_key_t key)
{
	struct zsd_entry *zsdp = NULL;
	zone_t *zone;

	mutex_enter(&zonehash_lock);	/* Zone create/delete waits for us */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_enter(&zone->zone_lock);	/* lock all zones */

	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find(&zsd_registered_keys, key);
	if (zsdp == NULL)
		goto notfound;
	list_remove(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		struct zsd_entry *del;
		void *data;

		if (!(zone->zone_flags & ZF_DESTROYED)) {
			del = zsd_find(&zone->zone_zsd, key);
			if (del != NULL) {
				data = del->zsd_data;
				ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
				ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
				list_remove(&zone->zone_zsd, del);
				kmem_free(del, sizeof (*del));
			} else {
				data = NULL;
			}
			if (zsdp->zsd_shutdown)
				zsdp->zsd_shutdown(zone->zone_id, data);
			if (zsdp->zsd_destroy)
				zsdp->zsd_destroy(zone->zone_id, data);
		}
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);
	kmem_free(zsdp, sizeof (*zsdp));
	return (0);

notfound:
	mutex_exit(&zsd_key_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_exit(&zone->zone_lock);
	mutex_exit(&zonehash_lock);
	return (-1);
}

/*
 * ZSD counterpart of pthread_setspecific().
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
	struct zsd_entry *t;
	struct zsd_entry *zsdp = NULL;

	mutex_enter(&zone->zone_lock);
	t = zsd_find(&zone->zone_zsd, key);
	if (t != NULL) {
		/*
		 * Replace old value with new
		 */
		t->zsd_data = (void *)data;
		mutex_exit(&zone->zone_lock);
		return (0);
	}
	/*
	 * If there was no previous value, go through the list of registered
	 * keys.
	 *
	 * We avoid grabbing zsd_key_lock until we are sure we need it; this is
	 * necessary for shutdown callbacks to be able to execute without fear
	 * of deadlock.
	 */
	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find(&zsd_registered_keys, key);
	if (zsdp == NULL) {	/* Key was not registered */
		mutex_exit(&zsd_key_lock);
		mutex_exit(&zone->zone_lock);
		return (-1);
	}

	/*
	 * Add a zsd_entry to this zone, using the template we just retrieved
	 * to initialize the constructor and destructor(s).
	 */
	t = kmem_alloc(sizeof (*t), KM_SLEEP);
	t->zsd_key = key;
	t->zsd_data = (void *)data;
	t->zsd_create = zsdp->zsd_create;
	t->zsd_shutdown = zsdp->zsd_shutdown;
	t->zsd_destroy = zsdp->zsd_destroy;
	list_insert_tail(&zone->zone_zsd, t);
	mutex_exit(&zsd_key_lock);
	mutex_exit(&zone->zone_lock);
	return (0);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
	struct zsd_entry *t;
	void *data;

	mutex_enter(&zone->zone_lock);
	t = zsd_find(&zone->zone_zsd, key);
	data = (t == NULL ? NULL : t->zsd_data);
	mutex_exit(&zone->zone_lock);
	return (data);
}
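/*
 * As noted above, the framework provides no atomicity across a
 * zone_getspecific()/zone_setspecific() pair, so "create on first use"
 * callers need their own serialization.  A sketch, reusing the
 * hypothetical "foo" subsystem from the example above (foo_lazy_lock is
 * assumed to be initialized in foo_init(), not shown):
 */
static kmutex_t foo_lazy_lock;		/* hypothetical */

static void *
foo_get_or_create(zone_t *zone)
{
	void *data;

	mutex_enter(&foo_lazy_lock);
	if ((data = zone_getspecific(foo_zone_key, zone)) == NULL) {
		data = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		(void) zone_setspecific(foo_zone_key, zone, data);
	}
	mutex_exit(&foo_lazy_lock);
	return (data);
}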
/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created.  The callbacks are initialized from
 * the template list (zsd_registered_keys), and the constructor
 * callback executed (if one exists).
 *
 * This is called before the zone is made publicly available, hence no
 * need to grab zone_lock.
 *
 * Although we grab and release zsd_key_lock, new entries cannot be
 * added to or removed from the zsd_registered_keys list until we
 * release zonehash_lock, so there isn't a window for a
 * zone_key_create() to come in after we've dropped zsd_key_lock but
 * before the zone is added to the zone list, such that the constructor
 * callbacks aren't executed for the new zone.
 */
static void
zone_zsd_configure(zone_t *zone)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	zoneid_t zoneid = zone->zone_id;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	ASSERT(list_head(&zone->zone_zsd) == NULL);
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		if (zsdp->zsd_create != NULL) {
			t = kmem_alloc(sizeof (*t), KM_SLEEP);
			t->zsd_key = zsdp->zsd_key;
			t->zsd_create = zsdp->zsd_create;
			t->zsd_data = (*t->zsd_create)(zoneid);
			t->zsd_shutdown = zsdp->zsd_shutdown;
			t->zsd_destroy = zsdp->zsd_destroy;
			list_insert_tail(&zone->zone_zsd, t);
		}
	}
	mutex_exit(&zsd_key_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	zoneid_t zoneid = zone->zone_id;

	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

	mutex_enter(&zone->zone_lock);
	if (ct == ZSD_DESTROY) {
		if (zone->zone_flags & ZF_DESTROYED) {
			/*
			 * Make sure destructors are only called once.
			 */
			mutex_exit(&zone->zone_lock);
			return;
		}
		zone->zone_flags |= ZF_DESTROYED;
	}
	mutex_exit(&zone->zone_lock);

	/*
	 * Both zsd_key_lock and zone_lock need to be held in order to add or
	 * remove a ZSD key, (either globally as part of
	 * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is
	 * possible through zone_setspecific()), so it's sufficient to hold
	 * zsd_key_lock here.
	 *
	 * This is a good thing, since we don't want to recursively try to grab
	 * zone_lock if a callback attempts to do something like a crfree() or
	 * zone_rele().
	 */
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		zone_key_t key = zsdp->zsd_key;

		/* Skip if no callbacks registered */
		if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL)
			continue;
		if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL)
			continue;
		/*
		 * Call the callback with the zone-specific data if we can find
		 * any, otherwise with NULL.
		 */
		t = zsd_find(&zone->zone_zsd, key);
		if (t != NULL) {
			if (ct == ZSD_SHUTDOWN) {
				t->zsd_shutdown(zoneid, t->zsd_data);
			} else {
				ASSERT(ct == ZSD_DESTROY);
				t->zsd_destroy(zoneid, t->zsd_data);
			}
		} else {
			if (ct == ZSD_SHUTDOWN) {
				zsdp->zsd_shutdown(zoneid, NULL);
			} else {
				ASSERT(ct == ZSD_DESTROY);
				zsdp->zsd_destroy(zoneid, NULL);
			}
		}
	}
	mutex_exit(&zsd_key_lock);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
	struct zsd_entry *t, *next;

	/*
	 * Free all the zsd_entry's we had on this zone.
	 */
	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
		next = list_next(&zone->zone_zsd, t);
		list_remove(&zone->zone_zsd, t);
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_zsd);
}

/*
 * Frees memory associated with the zone dataset list.
 */
static void
zone_free_datasets(zone_t *zone)
{
	zone_dataset_t *t, *next;

	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
		next = list_next(&zone->zone_datasets, t);
		list_remove(&zone->zone_datasets, t);
		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_datasets);
}

/*
 * zone.cpu-shares resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	return (p->p_zone->zone_shares);
}

/*ARGSUSED*/
static int
zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);

	e->rcep_p.zone->zone_shares = nv;
	return (0);
}

static rctl_ops_t zone_cpu_shares_ops = {
	rcop_no_action,
	zone_cpu_shares_usage,
	zone_cpu_shares_set,
	rcop_no_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
	rctl_qty_t nlwps;
	zone_t *zone = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));

	mutex_enter(&zone->zone_nlwps_lock);
	nlwps = zone->zone_nlwps;
	mutex_exit(&zone->zone_nlwps_lock);

	return (nlwps);
}

/*ARGSUSED*/
static int
zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t nlwps;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
	nlwps = e->rcep_p.zone->zone_nlwps;

	if (nlwps + incr > rcntl->rcv_value)
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_nlwps_ctl = nv;
	return (0);
}

static rctl_ops_t zone_lwps_ops = {
	rcop_no_action,
	zone_lwps_usage,
	zone_lwps_set,
	zone_lwps_test,
};
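/*
 * Note that zone_lwps_test() ASSERTs that its caller already holds the
 * zone's zone_nlwps_lock, so the limit check and the caller's
 * subsequent update of zone_nlwps can be made atomic with respect to
 * each other.  Roughly (a simplified, hypothetical sketch of the
 * caller's side):
 *
 *	mutex_enter(&zone->zone_nlwps_lock);
 *	if (the zone.max-lwps test (zone_lwps_test() above) denies)
 *		fail the lwp creation;
 *	else
 *		zone->zone_nlwps++;
 *	mutex_exit(&zone->zone_nlwps_lock);
 */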
/*
 * Helper function to brand the zone with a unique ID.
 */
static void
zone_uniqid(zone_t *zone)
{
	static uint64_t uniqid = 0;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	zone->zone_uniqid = uniqid++;
}

/*
 * Returns a held pointer to the "kcred" for the specified zone.
 */
struct cred *
zone_get_kcred(zoneid_t zoneid)
{
	zone_t *zone;
	cred_t *cr;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (NULL);
	cr = zone->zone_kcred;
	crhold(cr);
	zone_rele(zone);
	return (cr);
}

/*
 * Called very early on in boot to initialize the ZSD list so that
 * zone_key_create() can be called before zone_init().  It also initializes
 * portions of zone0 which may be used before zone_init() is called.  The
 * variable "global_zone" will be set when zone0 is fully initialized by
 * zone_init().
 */
void
zone_zsd_init(void)
{
	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_create(&zone_active, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));
	list_create(&zone_deathrow, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));

	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
	zone0.zone_shares = 1;
	zone0.zone_nlwps_ctl = INT_MAX;
	zone0.zone_name = GLOBAL_ZONENAME;
	zone0.zone_nodename = utsname.nodename;
	zone0.zone_domain = srpc_domain;
	zone0.zone_ref = 1;
	zone0.zone_id = GLOBAL_ZONEID;
	zone0.zone_status = ZONE_IS_RUNNING;
	zone0.zone_rootpath = "/";
	zone0.zone_rootpathlen = 2;
	zone0.zone_psetid = ZONE_PS_INVAL;
	zone0.zone_ncpus = 0;
	zone0.zone_ncpus_online = 0;
	zone0.zone_proc_initpid = 1;
	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_insert_head(&zone_active, &zone0);

	/*
	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
	 * to anything meaningful.  It is assigned to be 'rootdir' in
	 * vfs_mountroot().
	 */
	zone0.zone_rootvp = NULL;
	zone0.zone_vfslist = NULL;
	zone0.zone_bootargs = NULL;
	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
	/*
	 * The global zone has all privileges
	 */
	priv_fillset(zone0.zone_privset);
	/*
	 * Add p0 to the global zone
	 */
	zone0.zone_zsched = &p0;
	p0.p_zone = &zone0;
}

/*
 * Called by main() to initialize the zones framework.
 */
void
zone_init(void)
{
	rctl_dict_entry_t *rde;
	rctl_val_t *dval;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;

	ASSERT(curproc == &p0);

	/*
	 * Create ID space for zone IDs.  ID 0 is reserved for the
	 * global zone.
	 */
	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);

	/*
	 * Initialize generic zone resource controls, if any.
	 */
	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    FSS_MAXSHARES, FSS_MAXSHARES,
	    &zone_cpu_shares_ops);

	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_lwps_ops);
	/*
	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
	 */
	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
	bzero(dval, sizeof (rctl_val_t));
	dval->rcv_value = 1;
	dval->rcv_privilege = RCPRIV_PRIVILEGED;
	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
	dval->rcv_action_recip_pid = -1;

	rde = rctl_dict_lookup("zone.cpu-shares");
	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);

	/*
	 * Initialize the ``global zone''.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&p0.p_lock);
	e.rcep_p.zone = &zone0;
	e.rcep_t = RCENTITY_ZONE;
	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set, gp);

	zone0.zone_nlwps = p0.p_lwpcnt;
	zone0.zone_ntasks = 1;
	mutex_exit(&p0.p_lock);
	rctl_prealloc_destroy(gp);
	/*
	 * pool_default hasn't been initialized yet, so we let pool_init()
	 * take care of making sure the global zone is in the default pool.
	 */
	mutex_enter(&zonehash_lock);
	zone_uniqid(&zone0);
	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
	mutex_exit(&zonehash_lock);
	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
	    mod_hash_null_valdtor);
	zonehashbyname = mod_hash_create_strhash("zone_by_name",
	    zone_hash_size, mod_hash_null_valdtor);
	zonecount = 1;

	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
	    (mod_hash_val_t)&zone0);
	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
	    (mod_hash_val_t)&zone0);
	/*
	 * We avoid setting zone_kcred until now, since kcred is initialized
	 * sometime after zone_zsd_init() and before zone_init().
	 */
	zone0.zone_kcred = kcred;
	/*
	 * The global zone is fully initialized (except for zone_rootvp which
	 * will be set when the root filesystem is mounted).
	 */
	global_zone = &zone0;
}

static void
zone_free(zone_t *zone)
{
	ASSERT(zone != global_zone);
	ASSERT(zone->zone_ntasks == 0);
	ASSERT(zone->zone_nlwps == 0);
	ASSERT(zone->zone_cred_ref == 0);
	ASSERT(zone->zone_kcred == NULL);
	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);

	/* remove from deathrow list */
	if (zone_status_get(zone) == ZONE_IS_DEAD) {
		ASSERT(zone->zone_ref == 0);
		mutex_enter(&zone_deathrow_lock);
		list_remove(&zone_deathrow, zone);
		mutex_exit(&zone_deathrow_lock);
	}

	zone_free_zsd(zone);
	zone_free_datasets(zone);

	if (zone->zone_rootvp != NULL)
		VN_RELE(zone->zone_rootvp);
	if (zone->zone_rootpath)
		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
	if (zone->zone_name != NULL)
		kmem_free(zone->zone_name, ZONENAME_MAX);
	if (zone->zone_nodename != NULL)
		kmem_free(zone->zone_nodename, _SYS_NMLN);
	if (zone->zone_domain != NULL)
		kmem_free(zone->zone_domain, _SYS_NMLN);
	if (zone->zone_privset != NULL)
		kmem_free(zone->zone_privset, sizeof (priv_set_t));
	if (zone->zone_rctls != NULL)
		rctl_set_free(zone->zone_rctls);
	if (zone->zone_bootargs != NULL)
		kmem_free(zone->zone_bootargs, ZONEBOOTARGS_MAX);
	id_free(zoneid_space, zone->zone_id);
	mutex_destroy(&zone->zone_lock);
	cv_destroy(&zone->zone_cv);
	kmem_free(zone, sizeof (zone_t));
}

/*
 * See block comment at the top of this file for information about zone
 * status values.
 */
/*
 * Convenience function for setting zone status.
 */
static void
zone_status_set(zone_t *zone, zone_status_t status)
{
	ASSERT(MUTEX_HELD(&zone_status_lock));
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
	    status >= zone_status_get(zone));
	zone->zone_status = status;
	cv_broadcast(&zone->zone_cv);
}

/*
 * Public function to retrieve the zone status.  The zone status may
 * change after it is retrieved.
 */
zone_status_t
zone_status_get(zone_t *zone)
{
	return (zone->zone_status);
}

static int
zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
{
	char *bootargs = kmem_zalloc(ZONEBOOTARGS_MAX, KM_SLEEP);
	size_t len;
	int err;

	err = copyinstr(zone_bootargs, bootargs, ZONEBOOTARGS_MAX - 1, &len);
	if (err != 0) {
		kmem_free(bootargs, ZONEBOOTARGS_MAX);
		return (err);	/* EFAULT or ENAMETOOLONG */
	}
	bootargs[len] = '\0';

	ASSERT(zone->zone_bootargs == NULL);
	zone->zone_bootargs = bootargs;
	return (0);
}

/*
 * Block indefinitely waiting for (zone_status >= status)
 */
void
zone_status_wait(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		cv_wait(&zone->zone_cv, &zone_status_lock);
	}
	mutex_exit(&zone_status_lock);
}
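/*
 * For example, a hypothetical caller that must not proceed until a held
 * zone has at least finished booting could do:
 *
 *	zone_status_wait(zone, ZONE_IS_RUNNING);
 *
 * On return the zone's status is ZONE_IS_RUNNING *or later*; since
 * transitions are one-way, a zone whose boot failed satisfies the wait
 * by passing through ZONE_IS_SHUTTING_DOWN, so callers must re-check
 * the status rather than assume the zone is actually running.
 */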
/*
 * Private CPR-safe version of zone_status_wait().
 */
static void
zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
{
	callb_cpr_t cprinfo;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr, str);
	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&zone->zone_cv, &zone_status_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
	}
	/*
	 * zone_status_lock is implicitly released by the following.
	 */
	CALLB_CPR_EXIT(&cprinfo);
}

/*
 * Block until zone enters requested state or signal is received.  Return (0)
 * if signaled, non-zero otherwise.
 */
int
zone_status_wait_sig(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
			mutex_exit(&zone_status_lock);
			return (0);
		}
	}
	mutex_exit(&zone_status_lock);
	return (1);
}

/*
 * Block until the zone enters the requested state or the timeout expires,
 * whichever happens first.  Return (-1) if operation timed out, time remaining
 * otherwise.
 */
clock_t
zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
{
	clock_t timeleft = 0;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status && timeleft != -1) {
		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock,
		    tim);
	}
	mutex_exit(&zone_status_lock);
	return (timeleft);
}

/*
 * Block until the zone enters the requested state, the current process is
 * signaled, or the timeout expires, whichever happens first.  Return (-1) if
 * operation timed out, 0 if signaled, time remaining otherwise.
 */
clock_t
zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
{
	clock_t timeleft = tim - lbolt;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
		    tim);
		if (timeleft <= 0)
			break;
	}
	mutex_exit(&zone_status_lock);
	return (timeleft);
}

/*
 * Zones have two reference counts: one for references from credential
 * structures (zone_cred_ref), and one (zone_ref) for everything else.
 * This is so we can allow a zone to be rebooted while there are still
 * outstanding cred references, since certain drivers cache dblks (which
 * implicitly results in cached creds).  We wait for zone_ref to drop to
 * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
 * later freed when the zone_cred_ref drops to 0, though nothing other
 * than the zone id and privilege set should be accessed once the zone
 * is "dead".
 *
 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
 * to force halt/reboot to block waiting for the zone_cred_ref to drop
 * to 0.  This can be useful to flush out other sources of cached creds
 * that may be less innocuous than the driver case.
 */

int zone_wait_for_cred = 0;

static void
zone_hold_locked(zone_t *z)
{
	ASSERT(MUTEX_HELD(&z->zone_lock));
	z->zone_ref++;
	ASSERT(z->zone_ref != 0);
}

void
zone_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	zone_hold_locked(z);
	mutex_exit(&z->zone_lock);
}

/*
 * If the non-cred ref count drops to 1 and either the cred ref count
 * is 0 or we aren't waiting for cred references, the zone is ready to
 * be destroyed.
 */
#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
	(!zone_wait_for_cred || (zone)->zone_cred_ref == 0))

void
zone_rele(zone_t *z)
{
	boolean_t wakeup;

	mutex_enter(&z->zone_lock);
	ASSERT(z->zone_ref != 0);
	z->zone_ref--;
	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
		/* no more refs, free the structure */
		mutex_exit(&z->zone_lock);
		zone_free(z);
		return;
	}
	/* signal zone_destroy so the zone can finish halting */
	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
	mutex_exit(&z->zone_lock);

	if (wakeup) {
		/*
		 * Grabbing zonehash_lock here effectively synchronizes with
		 * zone_destroy() to avoid missed signals.
		 */
		mutex_enter(&zonehash_lock);
		cv_broadcast(&zone_destroy_cv);
		mutex_exit(&zonehash_lock);
	}
}

void
zone_cred_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	z->zone_cred_ref++;
	ASSERT(z->zone_cred_ref != 0);
	mutex_exit(&z->zone_lock);
}

void
zone_cred_rele(zone_t *z)
{
	boolean_t wakeup;

	mutex_enter(&z->zone_lock);
	ASSERT(z->zone_cred_ref != 0);
	z->zone_cred_ref--;
	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
		/* no more refs, free the structure */
		mutex_exit(&z->zone_lock);
		zone_free(z);
		return;
	}
	/*
	 * If zone_destroy is waiting for the cred references to drain
	 * out, and they have, signal it.
	 */
	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
	    zone_status_get(z) >= ZONE_IS_DEAD);
	mutex_exit(&z->zone_lock);

	if (wakeup) {
		/*
		 * Grabbing zonehash_lock here effectively synchronizes with
		 * zone_destroy() to avoid missed signals.
		 */
		mutex_enter(&zonehash_lock);
		cv_broadcast(&zone_destroy_cv);
		mutex_exit(&zonehash_lock);
	}
}
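/*
 * The usual pattern (a sketch): a caller takes a hold while the zone_t
 * must remain valid across a blocking operation, and drops it when
 * done; the final zone_rele()/zone_cred_rele() frees the structure:
 *
 *	zone_hold(zone);	now the zone_t cannot be freed
 *	... blocking work that dereferences zone ...
 *	zone_rele(zone);	may be the last reference
 */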
void
zone_task_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	z->zone_ntasks++;
	ASSERT(z->zone_ntasks != 0);
	mutex_exit(&z->zone_lock);
}

void
zone_task_rele(zone_t *zone)
{
	uint_t refcnt;

	mutex_enter(&zone->zone_lock);
	ASSERT(zone->zone_ntasks != 0);
	refcnt = --zone->zone_ntasks;
	if (refcnt > 1) {	/* Common case */
		mutex_exit(&zone->zone_lock);
		return;
	}
	zone_hold_locked(zone);	/* so we can use the zone_t later */
	mutex_exit(&zone->zone_lock);
	if (refcnt == 1) {
		/*
		 * See if the zone is shutting down.
		 */
		mutex_enter(&zone_status_lock);
		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
			goto out;
		}

		/*
		 * Make sure the ntasks didn't change since we
		 * dropped zone_lock.
		 */
		mutex_enter(&zone->zone_lock);
		if (refcnt != zone->zone_ntasks) {
			mutex_exit(&zone->zone_lock);
			goto out;
		}
		mutex_exit(&zone->zone_lock);

		/*
		 * No more user processes in the zone.  The zone is empty.
		 */
		zone_status_set(zone, ZONE_IS_EMPTY);
		goto out;
	}

	ASSERT(refcnt == 0);
	/*
	 * zsched has exited; the zone is dead.
	 */
	zone->zone_zsched = NULL;	/* paranoia */
	mutex_enter(&zone_status_lock);
	zone_status_set(zone, ZONE_IS_DEAD);
out:
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
}

zoneid_t
getzoneid(void)
{
	return (curproc->p_zone->zone_id);
}

/*
 * Internal versions of zone_find_by_*().  These don't zone_hold() or
 * check the validity of a zone's state.
 */
static zone_t *
zone_find_all_by_id(zoneid_t zoneid)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	if (mod_hash_find(zonehashbyid,
	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

static zone_t *
zone_find_all_by_name(char *name)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

/*
 * Public interface for looking up a zone by zoneid.  Only returns the zone if
 * it is fully initialized, and has not yet begun the zone_destroy() sequence.
 * Caller must call zone_rele() once it is done with the zone.
 *
 * The zone may begin the zone_destroy() sequence immediately after this
 * function returns, but may be safely used until zone_rele() is called.
 */
zone_t *
zone_find_by_id(zoneid_t zoneid)
{
	zone_t *zone;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	status = zone_status_get(zone);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * For all practical purposes the zone doesn't exist.
		 */
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}

/*
 * Similar to zone_find_by_id, but using zone name as the key.
 */
zone_t *
zone_find_by_name(char *name)
{
	zone_t *zone;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_name(name)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	status = zone_status_get(zone);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * For all practical purposes the zone doesn't exist.
		 */
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}
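/*
 * A minimal lookup sketch ("example_zone_shares" is a hypothetical
 * name, not part of this file): every successful zone_find_by_*() call
 * returns with a hold that the caller must pair with zone_rele().
 */
static uint64_t
example_zone_shares(zoneid_t zoneid)
{
	zone_t *zone;
	uint64_t shares;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (0);	/* no such (visible) zone */
	shares = zone->zone_shares;
	zone_rele(zone);	/* drop the hold from the lookup */
	return (shares);
}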
/*
 * Similar to zone_find_by_id(), using the path as a key.  For instance,
 * if there is a zone "foo" rooted at /foo/root, and the path argument
 * is "/foo/root/proc", it will return the held zone_t corresponding to
 * zone "foo".
 *
 * zone_find_by_path() always returns a non-NULL value, since at the
 * very least every path will be contained in the global zone.
 *
 * As with the other zone_find_by_*() functions, the caller is
 * responsible for zone_rele()ing the return value of this function.
 */
zone_t *
zone_find_by_path(const char *path)
{
	zone_t *zone;
	zone_t *zret = NULL;
	zone_status_t status;

	if (path == NULL) {
		/*
		 * Call from rootconf().
		 */
		zone_hold(global_zone);
		return (global_zone);
	}
	ASSERT(*path == '/');
	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		if (ZONE_PATH_VISIBLE(path, zone))
			zret = zone;
	}
	ASSERT(zret != NULL);
	status = zone_status_get(zret);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * Zone practically doesn't exist.
		 */
		zret = global_zone;
	}
	zone_hold(zret);
	mutex_exit(&zonehash_lock);
	return (zret);
}

/*
 * Get the number of cpus visible to this zone.  The system-wide global
 * 'ncpus' is returned if pools are disabled, the caller is in the
 * global zone, or a NULL zone argument is passed in.
 */
int
zone_ncpus_get(zone_t *zone)
{
	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;

	return (myncpus != 0 ? myncpus : ncpus);
}

/*
 * Get the number of online cpus visible to this zone.  The system-wide
 * global 'ncpus_online' is returned if pools are disabled, the caller
 * is in the global zone, or a NULL zone argument is passed in.
 */
int
zone_ncpus_online_get(zone_t *zone)
{
	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;

	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
}

/*
 * Return the pool to which the zone is currently bound.
 */
pool_t *
zone_pool_get(zone_t *zone)
{
	ASSERT(pool_lock_held());

	return (zone->zone_pool);
}

/*
 * Set the zone's pool pointer and update the zone's visibility to match
 * the resources in the new pool.
 */
void
zone_pool_set(zone_t *zone, pool_t *pool)
{
	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	zone->zone_pool = pool;
	zone_pset_set(zone, pool->pool_pset->pset_id);
}

/*
 * Return the cached value of the id of the processor set to which the
 * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
 * facility is disabled.
 */
psetid_t
zone_pset_get(zone_t *zone)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	return (zone->zone_psetid);
}

/*
 * Set the cached value of the id of the processor set to which the zone
 * is currently bound.  Also update the zone's visibility to match the
 * resources in the new processor set.
 */
void
zone_pset_set(zone_t *zone, psetid_t newpsetid)
{
	psetid_t oldpsetid;

	ASSERT(MUTEX_HELD(&cpu_lock));
	oldpsetid = zone_pset_get(zone);

	if (oldpsetid == newpsetid)
		return;
	/*
	 * Global zone sees all.
	 */
	if (zone != global_zone) {
		zone->zone_psetid = newpsetid;
		if (newpsetid != ZONE_PS_INVAL)
			pool_pset_visibility_add(newpsetid, zone);
		if (oldpsetid != ZONE_PS_INVAL)
			pool_pset_visibility_remove(oldpsetid, zone);
	}
	/*
	 * Disabling pools, so we should start using the global values
	 * for ncpus and ncpus_online.
	 */
	if (newpsetid == ZONE_PS_INVAL) {
		zone->zone_ncpus = 0;
		zone->zone_ncpus_online = 0;
	}
}

/*
 * Walk the list of active zones and issue the provided callback for
 * each of them.
 *
 * Caller must not be holding any locks that may be acquired under
 * zonehash_lock.  See comment at the beginning of the file for a list of
 * common locks and their interactions with zones.
 */
int
zone_walk(int (*cb)(zone_t *, void *), void *data)
{
	zone_t *zone;
	int ret = 0;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		/*
		 * Skip zones that shouldn't be externally visible.
		 */
		status = zone_status_get(zone);
		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
			continue;
		/*
		 * Bail immediately if any callback invocation returns a
		 * non-zero value.
		 */
		ret = (*cb)(zone, data);
		if (ret != 0)
			break;
	}
	mutex_exit(&zonehash_lock);
	return (ret);
}
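/*
 * A sketch of a zone_walk() consumer (the "example_*" names are
 * hypothetical): the callback returns 0 to continue the walk and
 * non-zero to terminate it early.
 */
/*ARGSUSED*/
static int
example_count_cb(zone_t *zone, void *arg)
{
	(*(uint_t *)arg)++;
	return (0);	/* keep walking */
}

static uint_t
example_count_zones(void)
{
	uint_t count = 0;

	(void) zone_walk(example_count_cb, &count);
	return (count);
}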
static int
zone_set_root(zone_t *zone, const char *upath)
{
	vnode_t *vp;
	int trycount;
	int error = 0;
	char *path;
	struct pathname upn, pn;
	size_t pathlen;

	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
		return (error);

	pn_alloc(&pn);

	/* prevent infinite loop */
	trycount = 10;
	for (;;) {
		if (--trycount <= 0) {
			error = ESTALE;
			goto out;
		}

		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
			/*
			 * VOP_ACCESS() may cover 'vp' with a new
			 * filesystem, if 'vp' is an autoFS vnode.
			 * Get the new 'vp' if so.
			 */
			if ((error = VOP_ACCESS(vp, VEXEC, 0, CRED())) == 0 &&
			    (vp->v_vfsmountedhere == NULL ||
			    (error = traverse(&vp)) == 0)) {
				pathlen = pn.pn_pathlen + 2;
				path = kmem_alloc(pathlen, KM_SLEEP);
				(void) strncpy(path, pn.pn_path,
				    pn.pn_pathlen + 1);
				path[pathlen - 2] = '/';
				path[pathlen - 1] = '\0';
				pn_free(&pn);
				pn_free(&upn);

				/* Success! */
				break;
			}
			VN_RELE(vp);
		}
		if (error != ESTALE)
			goto out;
	}

	ASSERT(error == 0);
	zone->zone_rootvp = vp;		/* we hold a reference to vp */
	zone->zone_rootpath = path;
	zone->zone_rootpathlen = pathlen;
	return (0);

out:
	pn_free(&pn);
	pn_free(&upn);
	return (error);
}

#define	isalnum(c)	(((c) >= '0' && (c) <= '9') || \
			((c) >= 'a' && (c) <= 'z') || \
			((c) >= 'A' && (c) <= 'Z'))

static int
zone_set_name(zone_t *zone, const char *uname)
{
	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
	size_t len;
	int i, err;

	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
		kmem_free(kname, ZONENAME_MAX);
		return (err);	/* EFAULT or ENAMETOOLONG */
	}

	/* must be less than ZONENAME_MAX */
	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
		kmem_free(kname, ZONENAME_MAX);
		return (EINVAL);
	}

	/*
	 * Name must start with an alphanumeric and must contain only
	 * alphanumerics, '-', '_' and '.'.
	 */
	if (!isalnum(kname[0])) {
		kmem_free(kname, ZONENAME_MAX);
		return (EINVAL);
	}
	for (i = 1; i < len - 1; i++) {
		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
		    kname[i] != '.') {
			kmem_free(kname, ZONENAME_MAX);
			return (EINVAL);
		}
	}

	zone->zone_name = kname;
	return (0);
}

/*
 * Similar to thread_create(), but makes sure the thread is in the appropriate
 * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
 */
/*ARGSUSED*/
kthread_t *
zthread_create(
    caddr_t stk,
    size_t stksize,
    void (*proc)(),
    void *arg,
    size_t len,
    pri_t pri)
{
	kthread_t *t;
	zone_t *zone = curproc->p_zone;
	proc_t *pp = zone->zone_zsched;

	zone_hold(zone);	/* Reference to be dropped when thread exits */

	/*
	 * No-one should be trying to create threads if the zone is shutting
	 * down and there aren't any kernel threads around.  See comment
	 * in zthread_exit().
	 */
	ASSERT(!(zone->zone_kthreads == NULL &&
	    zone_status_get(zone) >= ZONE_IS_EMPTY));
	/*
	 * Create a thread, but don't let it run until we've finished setting
	 * things up.
	 */
	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
	ASSERT(t->t_forw == NULL);
	mutex_enter(&zone_status_lock);
	if (zone->zone_kthreads == NULL) {
		t->t_forw = t->t_back = t;
	} else {
		kthread_t *tx = zone->zone_kthreads;

		t->t_forw = tx;
		t->t_back = tx->t_back;
		tx->t_back->t_forw = t;
		tx->t_back = t;
	}
	zone->zone_kthreads = t;
	mutex_exit(&zone_status_lock);

	mutex_enter(&pp->p_lock);
	t->t_proc_flag |= TP_ZTHREAD;
	project_rele(t->t_proj);
	t->t_proj = project_hold(pp->p_task->tk_proj);

	/*
	 * Setup complete, let it run.
	 */
	thread_lock(t);
	t->t_schedflag |= TS_ALLSTART;
	setrun_locked(t);
	thread_unlock(t);

	mutex_exit(&pp->p_lock);

	return (t);
}
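/*
 * Sketch of the expected pairing (the "example_zone_worker" name is
 * hypothetical): a thread started via zthread_create() must terminate
 * itself with zthread_exit() below, never plain thread_exit(), so that
 * the per-zone thread list and the zone hold taken above are unwound:
 *
 *	static void
 *	example_zone_worker(void *arg)
 *	{
 *		... do work on behalf of curproc->p_zone ...
 *		zthread_exit();
 *	}
 *
 *	(void) zthread_create(NULL, 0, example_zone_worker, NULL, 0, pri);
 */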
1918 */ 1919 zone->zone_kthreads = NULL; 1920 if (zone_status_get(zone) == ZONE_IS_EMPTY) { 1921 zone_status_set(zone, ZONE_IS_DOWN); 1922 } 1923 } else { 1924 t->t_forw->t_back = t->t_back; 1925 t->t_back->t_forw = t->t_forw; 1926 if (zone->zone_kthreads == t) 1927 zone->zone_kthreads = t->t_forw; 1928 } 1929 mutex_exit(&zone_status_lock); 1930 zone_rele(zone); 1931 thread_exit(); 1932 /* NOTREACHED */ 1933 } 1934 1935 static void 1936 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp) 1937 { 1938 vnode_t *oldvp; 1939 1940 /* we're going to hold a reference here to the directory */ 1941 VN_HOLD(vp); 1942 1943 #ifdef C2_AUDIT 1944 if (audit_active) /* update abs cwd/root path see c2audit.c */ 1945 audit_chdirec(vp, vpp); 1946 #endif 1947 1948 mutex_enter(&pp->p_lock); 1949 oldvp = *vpp; 1950 *vpp = vp; 1951 mutex_exit(&pp->p_lock); 1952 if (oldvp != NULL) 1953 VN_RELE(oldvp); 1954 } 1955 1956 /* 1957 * Convert an rctl value represented by an nvlist_t into an rctl_val_t. 1958 */ 1959 static int 1960 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv) 1961 { 1962 nvpair_t *nvp = NULL; 1963 boolean_t priv_set = B_FALSE; 1964 boolean_t limit_set = B_FALSE; 1965 boolean_t action_set = B_FALSE; 1966 1967 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 1968 const char *name; 1969 uint64_t ui64; 1970 1971 name = nvpair_name(nvp); 1972 if (nvpair_type(nvp) != DATA_TYPE_UINT64) 1973 return (EINVAL); 1974 (void) nvpair_value_uint64(nvp, &ui64); 1975 if (strcmp(name, "privilege") == 0) { 1976 /* 1977 * Currently only privileged values are allowed, but 1978 * this may change in the future. 1979 */ 1980 if (ui64 != RCPRIV_PRIVILEGED) 1981 return (EINVAL); 1982 rv->rcv_privilege = ui64; 1983 priv_set = B_TRUE; 1984 } else if (strcmp(name, "limit") == 0) { 1985 rv->rcv_value = ui64; 1986 limit_set = B_TRUE; 1987 } else if (strcmp(name, "action") == 0) { 1988 if (ui64 != RCTL_LOCAL_NOACTION && 1989 ui64 != RCTL_LOCAL_DENY) 1990 return (EINVAL); 1991 rv->rcv_flagaction = ui64; 1992 action_set = B_TRUE; 1993 } else { 1994 return (EINVAL); 1995 } 1996 } 1997 1998 if (!(priv_set && limit_set && action_set)) 1999 return (EINVAL); 2000 rv->rcv_action_signal = 0; 2001 rv->rcv_action_recipient = NULL; 2002 rv->rcv_action_recip_pid = -1; 2003 rv->rcv_firing_time = 0; 2004 2005 return (0); 2006 } 2007 2008 void 2009 zone_icode(void) 2010 { 2011 proc_t *p = ttoproc(curthread); 2012 struct core_globals *cg; 2013 2014 /* 2015 * For all purposes (ZONE_ATTR_INITPID and restart_init), 2016 * storing just the pid of init is sufficient. 
2017 */ 2018 p->p_zone->zone_proc_initpid = p->p_pid; 2019 2020 /* 2021 * Allocate user address space and stack segment 2022 */ 2023 2024 p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0; 2025 p->p_usrstack = (caddr_t)USRSTACK32; 2026 p->p_model = DATAMODEL_ILP32; 2027 p->p_stkprot = PROT_ZFOD & ~PROT_EXEC; 2028 p->p_datprot = PROT_ZFOD & ~PROT_EXEC; 2029 p->p_stk_ctl = INT32_MAX; 2030 2031 p->p_as = as_alloc(); 2032 p->p_as->a_userlimit = (caddr_t)USERLIMIT32; 2033 (void) hat_setup(p->p_as->a_hat, HAT_INIT); 2034 2035 cg = zone_getspecific(core_zone_key, p->p_zone); 2036 ASSERT(cg != NULL); 2037 corectl_path_hold(cg->core_default_path); 2038 corectl_content_hold(cg->core_default_content); 2039 p->p_corefile = cg->core_default_path; 2040 p->p_content = cg->core_default_content; 2041 2042 init_mstate(curthread, LMS_SYSTEM); 2043 2044 p->p_zone->zone_boot_err = exec_init(zone_initname, 0, 2045 p->p_zone->zone_bootargs); 2046 2047 mutex_enter(&zone_status_lock); 2048 if (p->p_zone->zone_boot_err != 0) { 2049 /* 2050 * Make sure we are still in the booting state-- we could have 2051 * raced and already be shutting down, or even further along. 2052 */ 2053 if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING) 2054 zone_status_set(p->p_zone, ZONE_IS_SHUTTING_DOWN); 2055 mutex_exit(&zone_status_lock); 2056 /* It's gone bad, dispose of the process */ 2057 if (proc_exit(CLD_EXITED, p->p_zone->zone_boot_err) != 0) { 2058 mutex_enter(&p->p_lock); 2059 ASSERT(p->p_flag & SEXITLWPS); 2060 lwp_exit(); 2061 } 2062 } else { 2063 if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING) 2064 zone_status_set(p->p_zone, ZONE_IS_RUNNING); 2065 mutex_exit(&zone_status_lock); 2066 /* cause the process to return to userland. */ 2067 lwp_rtt(); 2068 } 2069 } 2070 2071 struct zsched_arg { 2072 zone_t *zone; 2073 nvlist_t *nvlist; 2074 }; 2075 2076 /* 2077 * Per-zone "sched" workalike. The similarity to "sched" doesn't have 2078 * anything to do with scheduling, but rather with the fact that 2079 * per-zone kernel threads are parented to zsched, just like regular 2080 * kernel threads are parented to sched (p0). 2081 * 2082 * zsched is also responsible for launching init for the zone. 2083 */ 2084 static void 2085 zsched(void *arg) 2086 { 2087 struct zsched_arg *za = arg; 2088 proc_t *pp = curproc; 2089 proc_t *initp = proc_init; 2090 zone_t *zone = za->zone; 2091 cred_t *cr, *oldcred; 2092 rctl_set_t *set; 2093 rctl_alloc_gp_t *gp; 2094 contract_t *ct = NULL; 2095 task_t *tk, *oldtk; 2096 rctl_entity_p_t e; 2097 kproject_t *pj; 2098 2099 nvlist_t *nvl = za->nvlist; 2100 nvpair_t *nvp = NULL; 2101 2102 bcopy("zsched", u.u_psargs, sizeof ("zsched")); 2103 bcopy("zsched", u.u_comm, sizeof ("zsched")); 2104 u.u_argc = 0; 2105 u.u_argv = NULL; 2106 u.u_envp = NULL; 2107 closeall(P_FINFO(pp)); 2108 2109 /* 2110 * We are this zone's "zsched" process. As the zone isn't generally 2111 * visible yet we don't need to grab any locks before initializing its 2112 * zone_proc pointer. 2113 */ 2114 zone_hold(zone); /* this hold is released by zone_destroy() */ 2115 zone->zone_zsched = pp; 2116 mutex_enter(&pp->p_lock); 2117 pp->p_zone = zone; 2118 mutex_exit(&pp->p_lock); 2119 2120 /* 2121 * Disassociate process from its 'parent'; parent ourselves to init 2122 * (pid 1) and change other values as needed. 
2123 	 */
2124 	sess_create();
2125 
2126 	mutex_enter(&pidlock);
2127 	proc_detach(pp);
2128 	pp->p_ppid = 1;
2129 	pp->p_flag |= SZONETOP;
2130 	pp->p_ancpid = 1;
2131 	pp->p_parent = initp;
2132 	pp->p_psibling = NULL;
2133 	if (initp->p_child)
2134 		initp->p_child->p_psibling = pp;
2135 	pp->p_sibling = initp->p_child;
2136 	initp->p_child = pp;
2137 
2138 	/* Decrement what newproc() incremented. */
2139 	upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
2140 	/*
2141 	 * Our credentials are about to become kcred-like, so we don't care
2142 	 * about the caller's ruid.
2143 	 */
2144 	upcount_inc(crgetruid(kcred), zone->zone_id);
2145 	mutex_exit(&pidlock);
2146 
2147 	/*
2148 	 * getting out of global zone, so decrement lwp counts
2149 	 */
2150 	pj = pp->p_task->tk_proj;
2151 	mutex_enter(&global_zone->zone_nlwps_lock);
2152 	pj->kpj_nlwps -= pp->p_lwpcnt;
2153 	global_zone->zone_nlwps -= pp->p_lwpcnt;
2154 	mutex_exit(&global_zone->zone_nlwps_lock);
2155 
2156 	/*
2157 	 * Create and join a new task in project '0' of this zone.
2158 	 *
2159 	 * We don't need to call holdlwps() since we know we're the only lwp in
2160 	 * this process.
2161 	 *
2162 	 * task_join() returns with p_lock held.
2163 	 */
2164 	tk = task_create(0, zone);
2165 	mutex_enter(&cpu_lock);
2166 	oldtk = task_join(tk, 0);
2167 	mutex_exit(&curproc->p_lock);
2168 	mutex_exit(&cpu_lock);
2169 	task_rele(oldtk);
2170 
2171 	/*
2172 	 * add lwp counts to zsched's zone, and increment project's task count
2173 	 * due to the task created by the task_create()/task_join() above
2174 	 */
2175 	pj = pp->p_task->tk_proj;
2176 	mutex_enter(&zone->zone_nlwps_lock);
2177 	pj->kpj_nlwps += pp->p_lwpcnt;
2178 	pj->kpj_ntasks += 1;
2179 	zone->zone_nlwps += pp->p_lwpcnt;
2180 	mutex_exit(&zone->zone_nlwps_lock);
2181 
2182 	/*
2183 	 * The process was created by a process in the global zone, hence the
2184 	 * credentials are wrong.  We might as well have kcred-ish credentials.
2185 	 */
2186 	cr = zone->zone_kcred;
2187 	crhold(cr);
2188 	mutex_enter(&pp->p_crlock);
2189 	oldcred = pp->p_cred;
2190 	pp->p_cred = cr;
2191 	mutex_exit(&pp->p_crlock);
2192 	crfree(oldcred);
2193 
2194 	/*
2195 	 * Hold credentials again (for thread)
2196 	 */
2197 	crhold(cr);
2198 
2199 	/*
2200 	 * p_lwpcnt can't change since this is a kernel process.
2201 	 */
2202 	crset(pp, cr);
2203 
2204 	/*
2205 	 * Chroot
2206 	 */
2207 	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
2208 	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
2209 
2210 	/*
2211 	 * Initialize zone's rctl set.
2212 	 */
2213 	set = rctl_set_create();
2214 	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2215 	mutex_enter(&pp->p_lock);
2216 	e.rcep_p.zone = zone;
2217 	e.rcep_t = RCENTITY_ZONE;
2218 	zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
2219 	mutex_exit(&pp->p_lock);
2220 	rctl_prealloc_destroy(gp);
2221 
2222 	/*
2223 	 * Apply the rctls passed in to zone_create().  This is basically a list
2224 	 * assignment: all of the old values are removed and the new ones
2225 	 * inserted.  That is, if an empty list is passed in, all values are
2226 	 * removed.
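	 *
	 * For example, passing "zone.cpu-shares" with an empty value
	 * array removes any previously set values for that rctl, while
	 * passing it with a single new value replaces whatever was set
	 * before.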
2227 */ 2228 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 2229 rctl_dict_entry_t *rde; 2230 rctl_hndl_t hndl; 2231 char *name; 2232 nvlist_t **nvlarray; 2233 uint_t i, nelem; 2234 int error; /* For ASSERT()s */ 2235 2236 name = nvpair_name(nvp); 2237 hndl = rctl_hndl_lookup(name); 2238 ASSERT(hndl != -1); 2239 rde = rctl_dict_lookup_hndl(hndl); 2240 ASSERT(rde != NULL); 2241 2242 for (; /* ever */; ) { 2243 rctl_val_t oval; 2244 2245 mutex_enter(&pp->p_lock); 2246 error = rctl_local_get(hndl, NULL, &oval, pp); 2247 mutex_exit(&pp->p_lock); 2248 ASSERT(error == 0); /* Can't fail for RCTL_FIRST */ 2249 ASSERT(oval.rcv_privilege != RCPRIV_BASIC); 2250 if (oval.rcv_privilege == RCPRIV_SYSTEM) 2251 break; 2252 mutex_enter(&pp->p_lock); 2253 error = rctl_local_delete(hndl, &oval, pp); 2254 mutex_exit(&pp->p_lock); 2255 ASSERT(error == 0); 2256 } 2257 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 2258 ASSERT(error == 0); 2259 for (i = 0; i < nelem; i++) { 2260 rctl_val_t *nvalp; 2261 2262 nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); 2263 error = nvlist2rctlval(nvlarray[i], nvalp); 2264 ASSERT(error == 0); 2265 /* 2266 * rctl_local_insert can fail if the value being 2267 * inserted is a duplicate; this is OK. 2268 */ 2269 mutex_enter(&pp->p_lock); 2270 if (rctl_local_insert(hndl, nvalp, pp) != 0) 2271 kmem_cache_free(rctl_val_cache, nvalp); 2272 mutex_exit(&pp->p_lock); 2273 } 2274 } 2275 /* 2276 * Tell the world that we're done setting up. 2277 * 2278 * At this point we want to set the zone status to ZONE_IS_READY 2279 * and atomically set the zone's processor set visibility. Once 2280 * we drop pool_lock() this zone will automatically get updated 2281 * to reflect any future changes to the pools configuration. 2282 */ 2283 pool_lock(); 2284 mutex_enter(&cpu_lock); 2285 mutex_enter(&zonehash_lock); 2286 zone_uniqid(zone); 2287 zone_zsd_configure(zone); 2288 if (pool_state == POOL_ENABLED) 2289 zone_pset_set(zone, pool_default->pool_pset->pset_id); 2290 mutex_enter(&zone_status_lock); 2291 ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED); 2292 zone_status_set(zone, ZONE_IS_READY); 2293 mutex_exit(&zone_status_lock); 2294 mutex_exit(&zonehash_lock); 2295 mutex_exit(&cpu_lock); 2296 pool_unlock(); 2297 2298 /* 2299 * Once we see the zone transition to the ZONE_IS_BOOTING state, 2300 * we launch init, and set the state to running. 2301 */ 2302 zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched"); 2303 2304 if (zone_status_get(zone) == ZONE_IS_BOOTING) { 2305 id_t cid; 2306 2307 /* 2308 * Ok, this is a little complicated. We need to grab the 2309 * zone's pool's scheduling class ID; note that by now, we 2310 * are already bound to a pool if we need to be (zoneadmd 2311 * will have done that to us while we're in the READY 2312 * state). *But* the scheduling class for the zone's 'init' 2313 * must be explicitly passed to newproc, which doesn't 2314 * respect pool bindings. 2315 * 2316 * We hold the pool_lock across the call to newproc() to 2317 * close the obvious race: the pool's scheduling class 2318 * could change before we manage to create the LWP with 2319 * classid 'cid'. 2320 */ 2321 pool_lock(); 2322 cid = pool_get_class(zone->zone_pool); 2323 if (cid == -1) 2324 cid = defaultcid; 2325 2326 /* 2327 * If this fails, zone_boot will ultimately fail. The 2328 * state of the zone will be set to SHUTTING_DOWN-- userland 2329 * will have to tear down the zone, and fail, or try again. 
2330 		 */
2331 		if ((zone->zone_boot_err = newproc(zone_icode, NULL, cid,
2332 		    minclsyspri - 1, &ct)) != 0) {
2333 			mutex_enter(&zone_status_lock);
2334 			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
2335 			mutex_exit(&zone_status_lock);
2336 		}
2337 		pool_unlock();
2338 	}
2339 
2340 	/*
2341 	 * Wait for zone_destroy() to be called.  This is what we spend
2342 	 * most of our life doing.
2343 	 */
2344 	zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
2345 
2346 	if (ct)
2347 		/*
2348 		 * At this point the process contract should be empty.
2349 		 * (Though if it isn't, it's not the end of the world.)
2350 		 */
2351 		VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
2352 
2353 	/*
2354 	 * Allow kcred to be freed when all referring processes
2355 	 * (including this one) go away.  We can't just do this in
2356 	 * zone_free because we need to wait for the zone_cred_ref to
2357 	 * drop to 0 before calling zone_free, and the existence of
2358 	 * zone_kcred will prevent that.  Thus, we call crfree here to
2359 	 * balance the crdup in zone_create.  The crhold calls earlier
2360 	 * in zsched will be dropped when the thread and process exit.
2361 	 */
2362 	crfree(zone->zone_kcred);
2363 	zone->zone_kcred = NULL;
2364 
2365 	exit(CLD_EXITED, 0);
2366 }
2367 
2368 /*
2369  * Helper function to determine if there are any submounts of the
2370  * provided path.  Used to make sure the zone doesn't "inherit" any
2371  * mounts from before it is created.
2372  */
2373 static uint_t
2374 zone_mount_count(const char *rootpath)
2375 {
2376 	vfs_t *vfsp;
2377 	uint_t count = 0;
2378 	size_t rootpathlen = strlen(rootpath);
2379 
2380 	/*
2381 	 * Holding zonehash_lock prevents race conditions with
2382 	 * vfs_list_add()/vfs_list_remove() since we serialize with
2383 	 * zone_find_by_path().
2384 	 */
2385 	ASSERT(MUTEX_HELD(&zonehash_lock));
2386 	/*
2387 	 * The rootpath must end with a '/'
2388 	 */
2389 	ASSERT(rootpath[rootpathlen - 1] == '/');
2390 
2391 	/*
2392 	 * This intentionally does not count the rootpath itself if that
2393 	 * happens to be a mount point.
2394 	 */
2395 	vfs_list_read_lock();
2396 	vfsp = rootvfs;
2397 	do {
2398 		if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
2399 		    rootpathlen) == 0)
2400 			count++;
2401 		vfsp = vfsp->vfs_next;
2402 	} while (vfsp != rootvfs);
2403 	vfs_list_unlock();
2404 	return (count);
2405 }
2406 
2407 /*
2408  * Helper function to make sure that a zone created on 'rootpath'
2409  * wouldn't end up containing other zones' rootpaths.
2410  */
2411 static boolean_t
2412 zone_is_nested(const char *rootpath)
2413 {
2414 	zone_t *zone;
2415 	size_t rootpathlen = strlen(rootpath);
2416 	size_t len;
2417 
2418 	ASSERT(MUTEX_HELD(&zonehash_lock));
2419 
2420 	for (zone = list_head(&zone_active); zone != NULL;
2421 	    zone = list_next(&zone_active, zone)) {
2422 		if (zone == global_zone)
2423 			continue;
2424 		len = strlen(zone->zone_rootpath);
2425 		if (strncmp(rootpath, zone->zone_rootpath,
2426 		    MIN(rootpathlen, len)) == 0)
2427 			return (B_TRUE);
2428 	}
2429 	return (B_FALSE);
2430 }
2431 
2432 static int
2433 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
2434     size_t zone_privssz)
2435 {
2436 	priv_set_t *privs;
2437 
2438 	if (zone_privssz < sizeof (priv_set_t))
2439 		return (set_errno(ENOMEM));
2440 	privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2441 	if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
2442 		kmem_free(privs, sizeof (priv_set_t));
2443 		return (EFAULT);
2444 	}
2445 
2446 	zone->zone_privset = privs;
2447 	return (0);
2448 }
2449 
2450 /*
2451  * We make creative use of nvlists to pass in rctls from userland.
The list is 2452 * a list of the following structures: 2453 * 2454 * (name = rctl_name, value = nvpair_list_array) 2455 * 2456 * Where each element of the nvpair_list_array is of the form: 2457 * 2458 * [(name = "privilege", value = RCPRIV_PRIVILEGED), 2459 * (name = "limit", value = uint64_t), 2460 * (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))] 2461 */ 2462 static int 2463 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) 2464 { 2465 nvpair_t *nvp = NULL; 2466 nvlist_t *nvl = NULL; 2467 char *kbuf; 2468 int error; 2469 rctl_val_t rv; 2470 2471 *nvlp = NULL; 2472 2473 if (buflen == 0) 2474 return (0); 2475 2476 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 2477 return (ENOMEM); 2478 if (copyin(ubuf, kbuf, buflen)) { 2479 error = EFAULT; 2480 goto out; 2481 } 2482 if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) { 2483 /* 2484 * nvl may have been allocated/free'd, but the value set to 2485 * non-NULL, so we reset it here. 2486 */ 2487 nvl = NULL; 2488 error = EINVAL; 2489 goto out; 2490 } 2491 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 2492 rctl_dict_entry_t *rde; 2493 rctl_hndl_t hndl; 2494 nvlist_t **nvlarray; 2495 uint_t i, nelem; 2496 char *name; 2497 2498 error = EINVAL; 2499 name = nvpair_name(nvp); 2500 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) 2501 != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { 2502 goto out; 2503 } 2504 if ((hndl = rctl_hndl_lookup(name)) == -1) { 2505 goto out; 2506 } 2507 rde = rctl_dict_lookup_hndl(hndl); 2508 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 2509 ASSERT(error == 0); 2510 for (i = 0; i < nelem; i++) { 2511 if (error = nvlist2rctlval(nvlarray[i], &rv)) 2512 goto out; 2513 } 2514 if (rctl_invalid_value(rde, &rv)) { 2515 error = EINVAL; 2516 goto out; 2517 } 2518 } 2519 error = 0; 2520 *nvlp = nvl; 2521 out: 2522 kmem_free(kbuf, buflen); 2523 if (error && nvl != NULL) 2524 nvlist_free(nvl); 2525 return (error); 2526 } 2527 2528 int 2529 zone_create_error(int er_error, int er_ext, int *er_out) { 2530 if (er_out != NULL) { 2531 if (copyout(&er_ext, er_out, sizeof (int))) { 2532 return (set_errno(EFAULT)); 2533 } 2534 } 2535 return (set_errno(er_error)); 2536 } 2537 2538 /* 2539 * Parses a comma-separated list of ZFS datasets into a per-zone dictionary. 2540 */ 2541 static int 2542 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen) 2543 { 2544 char *kbuf; 2545 char *dataset, *next; 2546 zone_dataset_t *zd; 2547 size_t len; 2548 2549 if (ubuf == NULL || buflen == 0) 2550 return (0); 2551 2552 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 2553 return (ENOMEM); 2554 2555 if (copyin(ubuf, kbuf, buflen) != 0) { 2556 kmem_free(kbuf, buflen); 2557 return (EFAULT); 2558 } 2559 2560 dataset = next = kbuf; 2561 for (;;) { 2562 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP); 2563 2564 next = strchr(dataset, ','); 2565 2566 if (next == NULL) 2567 len = strlen(dataset); 2568 else 2569 len = next - dataset; 2570 2571 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP); 2572 bcopy(dataset, zd->zd_dataset, len); 2573 zd->zd_dataset[len] = '\0'; 2574 2575 list_insert_head(&zone->zone_datasets, zd); 2576 2577 if (next == NULL) 2578 break; 2579 2580 dataset = next + 1; 2581 } 2582 2583 kmem_free(kbuf, buflen); 2584 return (0); 2585 } 2586 2587 /* 2588 * System call to create/initialize a new zone named 'zone_name', rooted 2589 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs', 2590 * and initialized with the zone-wide rctls described in 'rctlbuf'. 
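 *
 * The rctl buffer itself is a packed nvlist in the format accepted by
 * parse_rctls() above.  As an illustrative userland sketch (the names
 * and values here are only examples), one entry might be built as:
 *
 *	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_alloc(&val, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_uint64(val, "privilege", RCPRIV_PRIVILEGED);
 *	(void) nvlist_add_uint64(val, "limit", 20);
 *	(void) nvlist_add_uint64(val, "action", RCTL_LOCAL_DENY);
 *	(void) nvlist_add_nvlist_array(nvl, "zone.cpu-shares", &val, 1);
 *	(void) nvlist_pack(nvl, &rctlbuf, &rctlbufsz, NV_ENCODE_NATIVE, 0);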
2591 * 2592 * If extended error is non-null, we may use it to return more detailed 2593 * error information. 2594 */ 2595 static zoneid_t 2596 zone_create(const char *zone_name, const char *zone_root, 2597 const priv_set_t *zone_privs, size_t zone_privssz, 2598 caddr_t rctlbuf, size_t rctlbufsz, 2599 caddr_t zfsbuf, size_t zfsbufsz, int *extended_error) 2600 { 2601 struct zsched_arg zarg; 2602 nvlist_t *rctls = NULL; 2603 proc_t *pp = curproc; 2604 zone_t *zone, *ztmp; 2605 zoneid_t zoneid; 2606 int error; 2607 int error2 = 0; 2608 char *str; 2609 cred_t *zkcr; 2610 2611 if (secpolicy_zone_config(CRED()) != 0) 2612 return (set_errno(EPERM)); 2613 2614 /* can't boot zone from within chroot environment */ 2615 if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir) 2616 return (zone_create_error(ENOTSUP, ZE_CHROOTED, 2617 extended_error)); 2618 2619 zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); 2620 zoneid = zone->zone_id = id_alloc(zoneid_space); 2621 zone->zone_status = ZONE_IS_UNINITIALIZED; 2622 zone->zone_pool = pool_default; 2623 zone->zone_pool_mod = gethrtime(); 2624 zone->zone_psetid = ZONE_PS_INVAL; 2625 zone->zone_ncpus = 0; 2626 zone->zone_ncpus_online = 0; 2627 mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); 2628 mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); 2629 cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); 2630 list_create(&zone->zone_zsd, sizeof (struct zsd_entry), 2631 offsetof(struct zsd_entry, zsd_linkage)); 2632 list_create(&zone->zone_datasets, sizeof (zone_dataset_t), 2633 offsetof(zone_dataset_t, zd_linkage)); 2634 2635 if ((error = zone_set_name(zone, zone_name)) != 0) { 2636 zone_free(zone); 2637 return (zone_create_error(error, 0, extended_error)); 2638 } 2639 2640 if ((error = zone_set_root(zone, zone_root)) != 0) { 2641 zone_free(zone); 2642 return (zone_create_error(error, 0, extended_error)); 2643 } 2644 if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) { 2645 zone_free(zone); 2646 return (zone_create_error(error, 0, extended_error)); 2647 } 2648 2649 /* initialize node name to be the same as zone name */ 2650 zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP); 2651 (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN); 2652 zone->zone_nodename[_SYS_NMLN - 1] = '\0'; 2653 2654 zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP); 2655 zone->zone_domain[0] = '\0'; 2656 zone->zone_shares = 1; 2657 zone->zone_bootargs = NULL; 2658 2659 /* 2660 * Zsched initializes the rctls. 2661 */ 2662 zone->zone_rctls = NULL; 2663 2664 if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { 2665 zone_free(zone); 2666 return (zone_create_error(error, 0, extended_error)); 2667 } 2668 2669 if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) { 2670 zone_free(zone); 2671 return (set_errno(error)); 2672 } 2673 2674 /* 2675 * Stop all lwps since that's what normally happens as part of fork(). 2676 * This needs to happen before we grab any locks to avoid deadlock 2677 * (another lwp in the process could be waiting for the held lock). 
2678 	 */
2679 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
2680 		zone_free(zone);
2681 		if (rctls)
2682 			nvlist_free(rctls);
2683 		return (zone_create_error(EINTR, 0, extended_error));
2684 	}
2685 
2686 	if (block_mounts() == 0) {
2687 		mutex_enter(&pp->p_lock);
2688 		if (curthread != pp->p_agenttp)
2689 			continuelwps(pp);
2690 		mutex_exit(&pp->p_lock);
2691 		zone_free(zone);
2692 		if (rctls)
2693 			nvlist_free(rctls);
2694 		return (zone_create_error(EINTR, 0, extended_error));
2695 	}
2696 
2697 	/*
2698 	 * Set up credential for kernel access.  After this, any errors
2699 	 * should go through the dance in errout rather than calling
2700 	 * zone_free directly.
2701 	 */
2702 	zone->zone_kcred = crdup(kcred);
2703 	crsetzone(zone->zone_kcred, zone);
2704 	priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
2705 	priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
2706 	priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
2707 	priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
2708 
2709 	mutex_enter(&zonehash_lock);
2710 	/*
2711 	 * Make sure zone doesn't already exist.
2712 	 */
2713 	if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL) {
2714 		zone_status_t status;
2715 
2716 		status = zone_status_get(ztmp);
2717 		if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
2718 			error = EEXIST;
2719 		else
2720 			error = EBUSY;
2721 		goto errout;
2722 	}
2723 
2724 	/*
2725 	 * Don't allow zone creations which would cause one zone's rootpath to
2726 	 * be accessible from that of another (non-global) zone.
2727 	 */
2728 	if (zone_is_nested(zone->zone_rootpath)) {
2729 		error = EBUSY;
2730 		goto errout;
2731 	}
2732 
2733 	ASSERT(zonecount != 0);		/* check for leaks */
2734 	if (zonecount + 1 > maxzones) {
2735 		error = ENOMEM;
2736 		goto errout;
2737 	}
2738 
2739 	if (zone_mount_count(zone->zone_rootpath) != 0) {
2740 		error = EBUSY;
2741 		error2 = ZE_AREMOUNTS;
2742 		goto errout;
2743 	}
2744 
2745 	/*
2746 	 * Zone is still incomplete, but we need to drop all locks while
2747 	 * zsched() initializes this zone's kernel process.  We
2748 	 * optimistically add the zone to the hashtable and associated
2749 	 * lists so a parallel zone_create() doesn't try to create the
2750 	 * same zone.
2751 	 */
2752 	zonecount++;
2753 	(void) mod_hash_insert(zonehashbyid,
2754 	    (mod_hash_key_t)(uintptr_t)zone->zone_id,
2755 	    (mod_hash_val_t)(uintptr_t)zone);
2756 	str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
2757 	(void) strcpy(str, zone->zone_name);
2758 	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
2759 	    (mod_hash_val_t)(uintptr_t)zone);
2760 	/*
2761 	 * Insert into active list.  At this point there are no 'hold's
2762 	 * on the zone, but everyone else knows not to use it, so we can
2763 	 * continue to use it.  zsched() will do a zone_hold() if the
2764 	 * newproc() is successful.
2765 	 */
2766 	list_insert_tail(&zone_active, zone);
2767 	mutex_exit(&zonehash_lock);
2768 
2769 	zarg.zone = zone;
2770 	zarg.nvlist = rctls;
2771 	/*
2772 	 * The process, task, and project rctls are probably wrong;
2773 	 * we need an interface to get the default values of all rctls,
2774 	 * and initialize zsched appropriately.  I'm not sure that that
2775 	 * makes much of a difference, though.
2776 	 */
2777 	if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) {
2778 		/*
2779 		 * We need to undo all globally visible state.
2780 */ 2781 mutex_enter(&zonehash_lock); 2782 list_remove(&zone_active, zone); 2783 (void) mod_hash_destroy(zonehashbyname, 2784 (mod_hash_key_t)(uintptr_t)zone->zone_name); 2785 (void) mod_hash_destroy(zonehashbyid, 2786 (mod_hash_key_t)(uintptr_t)zone->zone_id); 2787 ASSERT(zonecount > 1); 2788 zonecount--; 2789 goto errout; 2790 } 2791 2792 /* 2793 * Zone creation can't fail from now on. 2794 */ 2795 2796 /* 2797 * Let the other lwps continue. 2798 */ 2799 mutex_enter(&pp->p_lock); 2800 if (curthread != pp->p_agenttp) 2801 continuelwps(pp); 2802 mutex_exit(&pp->p_lock); 2803 2804 /* 2805 * Wait for zsched to finish initializing the zone. 2806 */ 2807 zone_status_wait(zone, ZONE_IS_READY); 2808 /* 2809 * The zone is fully visible, so we can let mounts progress. 2810 */ 2811 resume_mounts(); 2812 if (rctls) 2813 nvlist_free(rctls); 2814 2815 return (zoneid); 2816 2817 errout: 2818 mutex_exit(&zonehash_lock); 2819 /* 2820 * Let the other lwps continue. 2821 */ 2822 mutex_enter(&pp->p_lock); 2823 if (curthread != pp->p_agenttp) 2824 continuelwps(pp); 2825 mutex_exit(&pp->p_lock); 2826 2827 resume_mounts(); 2828 if (rctls) 2829 nvlist_free(rctls); 2830 /* 2831 * There is currently one reference to the zone, a cred_ref from 2832 * zone_kcred. To free the zone, we call crfree, which will call 2833 * zone_cred_rele, which will call zone_free. 2834 */ 2835 ASSERT(zone->zone_cred_ref == 1); /* for zone_kcred */ 2836 ASSERT(zone->zone_kcred->cr_ref == 1); 2837 ASSERT(zone->zone_ref == 0); 2838 zkcr = zone->zone_kcred; 2839 zone->zone_kcred = NULL; 2840 crfree(zkcr); /* triggers call to zone_free */ 2841 return (zone_create_error(error, error2, extended_error)); 2842 } 2843 2844 /* 2845 * Cause the zone to boot. This is pretty simple, since we let zoneadmd do 2846 * the heavy lifting. 2847 */ 2848 static int 2849 zone_boot(zoneid_t zoneid, const char *bootargs) 2850 { 2851 int err; 2852 zone_t *zone; 2853 2854 if (secpolicy_zone_config(CRED()) != 0) 2855 return (set_errno(EPERM)); 2856 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 2857 return (set_errno(EINVAL)); 2858 2859 mutex_enter(&zonehash_lock); 2860 /* 2861 * Look for zone under hash lock to prevent races with calls to 2862 * zone_shutdown, zone_destroy, etc. 2863 */ 2864 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 2865 mutex_exit(&zonehash_lock); 2866 return (set_errno(EINVAL)); 2867 } 2868 2869 if ((err = zone_set_bootargs(zone, bootargs)) != 0) { 2870 mutex_exit(&zonehash_lock); 2871 return (set_errno(err)); 2872 } 2873 2874 mutex_enter(&zone_status_lock); 2875 if (zone_status_get(zone) != ZONE_IS_READY) { 2876 mutex_exit(&zone_status_lock); 2877 mutex_exit(&zonehash_lock); 2878 return (set_errno(EINVAL)); 2879 } 2880 zone_status_set(zone, ZONE_IS_BOOTING); 2881 mutex_exit(&zone_status_lock); 2882 2883 zone_hold(zone); /* so we can use the zone_t later */ 2884 mutex_exit(&zonehash_lock); 2885 2886 if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) { 2887 zone_rele(zone); 2888 return (set_errno(EINTR)); 2889 } 2890 2891 /* 2892 * Boot (starting init) might have failed, in which case the zone 2893 * will go to the SHUTTING_DOWN state; an appropriate errno will 2894 * be placed in zone->zone_boot_err, and so we return that. 2895 */ 2896 err = zone->zone_boot_err; 2897 zone_rele(zone); 2898 return (err ? set_errno(err) : 0); 2899 } 2900 2901 /* 2902 * Kills all user processes in the zone, waiting for them all to exit 2903 * before returning. 
2904  */
2905 static int
2906 zone_empty(zone_t *zone)
2907 {
2908 	int waitstatus;
2909 
2910 	/*
2911 	 * We need to drop zonehash_lock before killing all
2912 	 * processes, otherwise we'll deadlock with zone_find_*
2913 	 * which can be called from the exit path.
2914 	 */
2915 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
2916 	while ((waitstatus = zone_status_timedwait_sig(zone, lbolt + hz,
2917 	    ZONE_IS_EMPTY)) == -1) {
2918 		killall(zone->zone_id);
2919 	}
2920 	/*
2921 	 * return EINTR if we were signaled
2922 	 */
2923 	if (waitstatus == 0)
2924 		return (EINTR);
2925 	return (0);
2926 }
2927 
2928 /*
2929  * Systemcall to start the zone's halt sequence.  By the time this
2930  * function successfully returns, all user processes and kernel threads
2931  * executing in it will have exited, ZSD shutdown callbacks executed,
2932  * and the zone status set to ZONE_IS_DOWN.
2933  *
2934  * It is possible that the call will interrupt itself if the caller is the
2935  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
2936  */
2937 static int
2938 zone_shutdown(zoneid_t zoneid)
2939 {
2940 	int error;
2941 	zone_t *zone;
2942 	zone_status_t status;
2943 
2944 	if (secpolicy_zone_config(CRED()) != 0)
2945 		return (set_errno(EPERM));
2946 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
2947 		return (set_errno(EINVAL));
2948 
2949 	/*
2950 	 * Block mounts so that VFS_MOUNT() can get an accurate view of
2951 	 * the zone's status with regards to ZONE_IS_SHUTTING_DOWN.
2952 	 *
2953 	 * e.g. NFS can fail the mount if it determines that the zone
2954 	 * has already begun the shutdown sequence.
2955 	 */
2956 	if (block_mounts() == 0)
2957 		return (set_errno(EINTR));
2958 	mutex_enter(&zonehash_lock);
2959 	/*
2960 	 * Look for zone under hash lock to prevent races with other
2961 	 * calls to zone_shutdown and zone_destroy.
2962 	 */
2963 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2964 		mutex_exit(&zonehash_lock);
2965 		resume_mounts();
2966 		return (set_errno(EINVAL));
2967 	}
2968 	mutex_enter(&zone_status_lock);
2969 	status = zone_status_get(zone);
2970 	/*
2971 	 * Fail if the zone isn't fully initialized yet.
2972 	 */
2973 	if (status < ZONE_IS_READY) {
2974 		mutex_exit(&zone_status_lock);
2975 		mutex_exit(&zonehash_lock);
2976 		resume_mounts();
2977 		return (set_errno(EINVAL));
2978 	}
2979 	/*
2980 	 * If conditions required for zone_shutdown() to return have been met,
2981 	 * return success.
2982 	 */
2983 	if (status >= ZONE_IS_DOWN) {
2984 		mutex_exit(&zone_status_lock);
2985 		mutex_exit(&zonehash_lock);
2986 		resume_mounts();
2987 		return (0);
2988 	}
2989 	/*
2990 	 * If zone_shutdown() hasn't been called before, go through the motions.
2991 	 * If it has, there's nothing to do but wait for the kernel threads to
2992 	 * drain.
2993 	 */
2994 	if (status < ZONE_IS_EMPTY) {
2995 		uint_t ntasks;
2996 
2997 		mutex_enter(&zone->zone_lock);
2998 		if ((ntasks = zone->zone_ntasks) != 1) {
2999 			/*
3000 			 * There's still stuff running.
3001 			 */
3002 			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
3003 		}
3004 		mutex_exit(&zone->zone_lock);
3005 		if (ntasks == 1) {
3006 			/*
3007 			 * The only way to create another task is through
3008 			 * zone_enter(), which will block until we drop
3009 			 * zonehash_lock.  The zone is empty.
3010 			 */
3011 			if (zone->zone_kthreads == NULL) {
3012 				/*
3013 				 * Skip ahead to ZONE_IS_DOWN
3014 				 */
3015 				zone_status_set(zone, ZONE_IS_DOWN);
3016 			} else {
3017 				zone_status_set(zone, ZONE_IS_EMPTY);
3018 			}
3019 		}
3020 	}
3021 	zone_hold(zone);	/* so we can use the zone_t later */
3022 	mutex_exit(&zone_status_lock);
3023 	mutex_exit(&zonehash_lock);
3024 	resume_mounts();
3025 
3026 	if (error = zone_empty(zone)) {
3027 		zone_rele(zone);
3028 		return (set_errno(error));
3029 	}
3030 	/*
3031 	 * After the zone status goes to ZONE_IS_DOWN this zone will no
3032 	 * longer be notified of changes to the pools configuration, so
3033 	 * in order to not end up with a stale pool pointer, we point
3034 	 * ourselves at the default pool and remove all resource
3035 	 * visibility.  This is especially important as the zone_t may
3036 	 * languish on the deathrow for a very long time waiting for
3037 	 * cred's to drain out.
3038 	 *
3039 	 * This rebinding of the zone can happen multiple times
3040 	 * (presumably due to interrupted or parallel systemcalls)
3041 	 * without any adverse effects.
3042 	 */
3043 	if (pool_lock_intr() != 0) {
3044 		zone_rele(zone);
3045 		return (set_errno(EINTR));
3046 	}
3047 	if (pool_state == POOL_ENABLED) {
3048 		mutex_enter(&cpu_lock);
3049 		zone_pool_set(zone, pool_default);
3050 		/*
3051 		 * The zone no longer needs to be able to see any cpus.
3052 		 */
3053 		zone_pset_set(zone, ZONE_PS_INVAL);
3054 		mutex_exit(&cpu_lock);
3055 	}
3056 	pool_unlock();
3057 
3058 	/*
3059 	 * ZSD shutdown callbacks can be executed multiple times, hence
3060 	 * it is safe to not be holding any locks across this call.
3061 	 */
3062 	zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
3063 
3064 	mutex_enter(&zone_status_lock);
3065 	if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
3066 		zone_status_set(zone, ZONE_IS_DOWN);
3067 	mutex_exit(&zone_status_lock);
3068 
3069 	/*
3070 	 * Wait for kernel threads to drain.
3071 	 */
3072 	if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
3073 		zone_rele(zone);
3074 		return (set_errno(EINTR));
3075 	}
3076 	zone_rele(zone);
3077 	return (0);
3078 }
3079 
3080 /*
3081  * Systemcall entry point to finalize the zone halt process.  The caller
3082  * must have already successfully called zone_shutdown().
3083  *
3084  * Upon successful completion, the zone will have been fully destroyed:
3085  * zsched will have exited, destructor callbacks executed, and the zone
3086  * removed from the list of active zones.
3087  */
3088 static int
3089 zone_destroy(zoneid_t zoneid)
3090 {
3091 	uint64_t uniqid;
3092 	zone_t *zone;
3093 	zone_status_t status;
3094 
3095 	if (secpolicy_zone_config(CRED()) != 0)
3096 		return (set_errno(EPERM));
3097 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3098 		return (set_errno(EINVAL));
3099 
3100 	mutex_enter(&zonehash_lock);
3101 	/*
3102 	 * Look for zone under hash lock to prevent races with other
3103 	 * calls to zone_destroy.
3104 */ 3105 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 3106 mutex_exit(&zonehash_lock); 3107 return (set_errno(EINVAL)); 3108 } 3109 3110 if (zone_mount_count(zone->zone_rootpath) != 0) { 3111 mutex_exit(&zonehash_lock); 3112 return (set_errno(EBUSY)); 3113 } 3114 mutex_enter(&zone_status_lock); 3115 status = zone_status_get(zone); 3116 if (status < ZONE_IS_DOWN) { 3117 mutex_exit(&zone_status_lock); 3118 mutex_exit(&zonehash_lock); 3119 return (set_errno(EBUSY)); 3120 } else if (status == ZONE_IS_DOWN) { 3121 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */ 3122 } 3123 mutex_exit(&zone_status_lock); 3124 zone_hold(zone); 3125 mutex_exit(&zonehash_lock); 3126 3127 /* 3128 * wait for zsched to exit 3129 */ 3130 zone_status_wait(zone, ZONE_IS_DEAD); 3131 zone_zsd_callbacks(zone, ZSD_DESTROY); 3132 uniqid = zone->zone_uniqid; 3133 zone_rele(zone); 3134 zone = NULL; /* potentially free'd */ 3135 3136 mutex_enter(&zonehash_lock); 3137 for (; /* ever */; ) { 3138 boolean_t unref; 3139 3140 if ((zone = zone_find_all_by_id(zoneid)) == NULL || 3141 zone->zone_uniqid != uniqid) { 3142 /* 3143 * The zone has gone away. Necessary conditions 3144 * are met, so we return success. 3145 */ 3146 mutex_exit(&zonehash_lock); 3147 return (0); 3148 } 3149 mutex_enter(&zone->zone_lock); 3150 unref = ZONE_IS_UNREF(zone); 3151 mutex_exit(&zone->zone_lock); 3152 if (unref) { 3153 /* 3154 * There is only one reference to the zone -- that 3155 * added when the zone was added to the hashtables -- 3156 * and things will remain this way until we drop 3157 * zonehash_lock... we can go ahead and cleanup the 3158 * zone. 3159 */ 3160 break; 3161 } 3162 3163 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) { 3164 /* Signaled */ 3165 mutex_exit(&zonehash_lock); 3166 return (set_errno(EINTR)); 3167 } 3168 3169 } 3170 3171 /* 3172 * It is now safe to let the zone be recreated; remove it from the 3173 * lists. The memory will not be freed until the last cred 3174 * reference goes away. 3175 */ 3176 ASSERT(zonecount > 1); /* must be > 1; can't destroy global zone */ 3177 zonecount--; 3178 /* remove from active list and hash tables */ 3179 list_remove(&zone_active, zone); 3180 (void) mod_hash_destroy(zonehashbyname, 3181 (mod_hash_key_t)zone->zone_name); 3182 (void) mod_hash_destroy(zonehashbyid, 3183 (mod_hash_key_t)(uintptr_t)zone->zone_id); 3184 mutex_exit(&zonehash_lock); 3185 3186 /* 3187 * Release the root vnode; we're not using it anymore. Nor should any 3188 * other thread that might access it exist. 3189 */ 3190 if (zone->zone_rootvp != NULL) { 3191 VN_RELE(zone->zone_rootvp); 3192 zone->zone_rootvp = NULL; 3193 } 3194 3195 /* add to deathrow list */ 3196 mutex_enter(&zone_deathrow_lock); 3197 list_insert_tail(&zone_deathrow, zone); 3198 mutex_exit(&zone_deathrow_lock); 3199 3200 /* 3201 * Drop last reference (which was added by zsched()), this will 3202 * free the zone unless there are outstanding cred references. 3203 */ 3204 zone_rele(zone); 3205 return (0); 3206 } 3207 3208 /* 3209 * Systemcall entry point for zone_getattr(2). 
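 *
 * For example, zone_getattr(id, ZONE_ATTR_NAME, buf, bufsize) copies
 * out up to bufsize bytes of the zone's name and returns the full size
 * of the attribute, letting callers detect truncation.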
3210 */ 3211 static ssize_t 3212 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) 3213 { 3214 size_t size; 3215 int error = 0, err; 3216 zone_t *zone; 3217 char *zonepath; 3218 zone_status_t zone_status; 3219 pid_t initpid; 3220 boolean_t global = (curproc->p_zone == global_zone); 3221 3222 mutex_enter(&zonehash_lock); 3223 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 3224 mutex_exit(&zonehash_lock); 3225 return (set_errno(EINVAL)); 3226 } 3227 zone_status = zone_status_get(zone); 3228 if (zone_status < ZONE_IS_READY) { 3229 mutex_exit(&zonehash_lock); 3230 return (set_errno(EINVAL)); 3231 } 3232 zone_hold(zone); 3233 mutex_exit(&zonehash_lock); 3234 3235 /* 3236 * If not in the global zone, don't show information about other zones. 3237 */ 3238 if (!global && curproc->p_zone != zone) { 3239 zone_rele(zone); 3240 return (set_errno(EINVAL)); 3241 } 3242 3243 switch (attr) { 3244 case ZONE_ATTR_ROOT: 3245 if (global) { 3246 /* 3247 * Copy the path to trim the trailing "/" (except for 3248 * the global zone). 3249 */ 3250 if (zone != global_zone) 3251 size = zone->zone_rootpathlen - 1; 3252 else 3253 size = zone->zone_rootpathlen; 3254 zonepath = kmem_alloc(size, KM_SLEEP); 3255 bcopy(zone->zone_rootpath, zonepath, size); 3256 zonepath[size - 1] = '\0'; 3257 } else { 3258 /* 3259 * Caller is not in the global zone, just return 3260 * faked-up path for current zone. 3261 */ 3262 zonepath = "/"; 3263 size = 2; 3264 } 3265 if (bufsize > size) 3266 bufsize = size; 3267 if (buf != NULL) { 3268 err = copyoutstr(zonepath, buf, bufsize, NULL); 3269 if (err != 0 && err != ENAMETOOLONG) 3270 error = EFAULT; 3271 } 3272 if (global) 3273 kmem_free(zonepath, size); 3274 break; 3275 3276 case ZONE_ATTR_NAME: 3277 size = strlen(zone->zone_name) + 1; 3278 if (bufsize > size) 3279 bufsize = size; 3280 if (buf != NULL) { 3281 err = copyoutstr(zone->zone_name, buf, bufsize, NULL); 3282 if (err != 0 && err != ENAMETOOLONG) 3283 error = EFAULT; 3284 } 3285 break; 3286 3287 case ZONE_ATTR_STATUS: 3288 /* 3289 * Since we're not holding zonehash_lock, the zone status 3290 * may be anything; leave it up to userland to sort it out. 
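		 * For example, a zone observed as ZONE_IS_RUNNING here
		 * may already be ZONE_IS_SHUTTING_DOWN by the time the
		 * value reaches the caller.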
3291 */ 3292 size = sizeof (zone_status); 3293 if (bufsize > size) 3294 bufsize = size; 3295 zone_status = zone_status_get(zone); 3296 if (buf != NULL && 3297 copyout(&zone_status, buf, bufsize) != 0) 3298 error = EFAULT; 3299 break; 3300 case ZONE_ATTR_PRIVSET: 3301 size = sizeof (priv_set_t); 3302 if (bufsize > size) 3303 bufsize = size; 3304 if (buf != NULL && 3305 copyout(zone->zone_privset, buf, bufsize) != 0) 3306 error = EFAULT; 3307 break; 3308 case ZONE_ATTR_UNIQID: 3309 size = sizeof (zone->zone_uniqid); 3310 if (bufsize > size) 3311 bufsize = size; 3312 if (buf != NULL && 3313 copyout(&zone->zone_uniqid, buf, bufsize) != 0) 3314 error = EFAULT; 3315 break; 3316 case ZONE_ATTR_POOLID: 3317 { 3318 pool_t *pool; 3319 poolid_t poolid; 3320 3321 if (pool_lock_intr() != 0) { 3322 error = EINTR; 3323 break; 3324 } 3325 pool = zone_pool_get(zone); 3326 poolid = pool->pool_id; 3327 pool_unlock(); 3328 size = sizeof (poolid); 3329 if (bufsize > size) 3330 bufsize = size; 3331 if (buf != NULL && copyout(&poolid, buf, size) != 0) 3332 error = EFAULT; 3333 } 3334 break; 3335 case ZONE_ATTR_INITPID: 3336 size = sizeof (initpid); 3337 if (bufsize > size) 3338 bufsize = size; 3339 initpid = zone->zone_proc_initpid; 3340 if (initpid == -1) { 3341 error = ESRCH; 3342 break; 3343 } 3344 if (buf != NULL && 3345 copyout(&initpid, buf, bufsize) != 0) 3346 error = EFAULT; 3347 break; 3348 default: 3349 error = EINVAL; 3350 } 3351 zone_rele(zone); 3352 3353 if (error) 3354 return (set_errno(error)); 3355 return ((ssize_t)size); 3356 } 3357 3358 /* 3359 * Return zero if the process has at least one vnode mapped in to its 3360 * address space which shouldn't be allowed to change zones. 3361 */ 3362 static int 3363 as_can_change_zones(void) 3364 { 3365 proc_t *pp = curproc; 3366 struct seg *seg; 3367 struct as *as = pp->p_as; 3368 vnode_t *vp; 3369 int allow = 1; 3370 3371 ASSERT(pp->p_as != &kas); 3372 AS_LOCK_ENTER(&as, &as->a_lock, RW_READER); 3373 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 3374 /* 3375 * if we can't get a backing vnode for this segment then skip 3376 * it. 3377 */ 3378 vp = NULL; 3379 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL) 3380 continue; 3381 if (!vn_can_change_zones(vp)) { /* bail on first match */ 3382 allow = 0; 3383 break; 3384 } 3385 } 3386 AS_LOCK_EXIT(&as, &as->a_lock); 3387 return (allow); 3388 } 3389 3390 /* 3391 * Systemcall entry point for zone_enter(). 3392 * 3393 * The current process is injected into said zone. In the process 3394 * it will change its project membership, privileges, rootdir/cwd, 3395 * zone-wide rctls, and pool association to match those of the zone. 3396 * 3397 * The first zone_enter() called while the zone is in the ZONE_IS_READY 3398 * state will transition it to ZONE_IS_RUNNING. Processes may only 3399 * enter a zone that is "ready" or "running". 3400 */ 3401 static int 3402 zone_enter(zoneid_t zoneid) 3403 { 3404 zone_t *zone; 3405 vnode_t *vp; 3406 proc_t *pp = curproc; 3407 contract_t *ct; 3408 cont_process_t *ctp; 3409 task_t *tk, *oldtk; 3410 kproject_t *zone_proj0; 3411 cred_t *cr, *newcr; 3412 pool_t *oldpool, *newpool; 3413 sess_t *sp; 3414 uid_t uid; 3415 zone_status_t status; 3416 int err = 0; 3417 rctl_entity_p_t e; 3418 3419 if (secpolicy_zone_config(CRED()) != 0) 3420 return (set_errno(EPERM)); 3421 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 3422 return (set_errno(EINVAL)); 3423 3424 /* 3425 * Stop all lwps so we don't need to hold a lock to look at 3426 * curproc->p_zone. 
This needs to happen before we grab any
3427 	 * locks to avoid deadlock (another lwp in the process could
3428 	 * be waiting for the held lock).
3429 	 */
3430 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
3431 		return (set_errno(EINTR));
3432 
3433 	/*
3434 	 * Make sure we're not changing zones with files open or mapped in
3435 	 * to our address space which shouldn't be changing zones.
3436 	 */
3437 	if (!files_can_change_zones()) {
3438 		err = EBADF;
3439 		goto out;
3440 	}
3441 	if (!as_can_change_zones()) {
3442 		err = EFAULT;
3443 		goto out;
3444 	}
3445 
3446 	mutex_enter(&zonehash_lock);
3447 	if (pp->p_zone != global_zone) {
3448 		mutex_exit(&zonehash_lock);
3449 		err = EINVAL;
3450 		goto out;
3451 	}
3452 
3453 	zone = zone_find_all_by_id(zoneid);
3454 	if (zone == NULL) {
3455 		mutex_exit(&zonehash_lock);
3456 		err = EINVAL;
3457 		goto out;
3458 	}
3459 
3460 	/*
3461 	 * To prevent processes in a zone from holding contracts on
3462 	 * extrazonal resources, and to avoid process contract
3463 	 * memberships which span zones, contract holders and processes
3464 	 * which aren't the sole members of their encapsulating process
3465 	 * contracts are not allowed to zone_enter.
3466 	 */
3467 	ctp = pp->p_ct_process;
3468 	ct = &ctp->conp_contract;
3469 	mutex_enter(&ct->ct_lock);
3470 	mutex_enter(&pp->p_lock);
3471 	if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
3472 		mutex_exit(&pp->p_lock);
3473 		mutex_exit(&ct->ct_lock);
3474 		mutex_exit(&zonehash_lock);
3476 		err = EINVAL;
3477 		goto out;
3478 	}
3479 
3480 	/*
3481 	 * Moreover, we don't allow processes whose encapsulating
3482 	 * process contracts have inherited extrazonal contracts.
3483 	 * While it would be easier to eliminate all process contracts
3484 	 * with inherited contracts, we need to be able to give a
3485 	 * restarted init (or other zone-penetrating process) its
3486 	 * predecessor's contracts.
3487 	 */
3488 	if (ctp->conp_ninherited != 0) {
3489 		contract_t *next;
3490 		for (next = list_head(&ctp->conp_inherited); next;
3491 		    next = list_next(&ctp->conp_inherited, next)) {
3492 			if (contract_getzuniqid(next) != zone->zone_uniqid) {
3493 				mutex_exit(&pp->p_lock);
3494 				mutex_exit(&ct->ct_lock);
3495 				mutex_exit(&zonehash_lock);
3497 				err = EINVAL;
3498 				goto out;
3499 			}
3500 		}
3501 	}
3502 	mutex_exit(&pp->p_lock);
3503 	mutex_exit(&ct->ct_lock);
3504 
3505 	status = zone_status_get(zone);
3506 	if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
3507 		/*
3508 		 * Can't join
3509 		 */
3510 		mutex_exit(&zonehash_lock);
3511 		err = EINVAL;
3512 		goto out;
3513 	}
3514 
3515 	/*
3516 	 * Make sure new priv set is within the permitted set for caller
3517 	 */
3518 	if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
3519 		mutex_exit(&zonehash_lock);
3520 		err = EPERM;
3521 		goto out;
3522 	}
3523 	/*
3524 	 * We want to momentarily drop zonehash_lock while we optimistically
3525 	 * bind curproc to the pool it should be running in.  This is safe
3526 	 * since the zone can't disappear (we have a hold on it).
3527 	 */
3528 	zone_hold(zone);
3529 	mutex_exit(&zonehash_lock);
3530 
3531 	/*
3532 	 * Grab pool_lock to keep the pools configuration from changing
3533 	 * and to stop ourselves from getting rebound to another pool
3534 	 * until we join the zone.
3535 	 */
3536 	if (pool_lock_intr() != 0) {
3537 		zone_rele(zone);
3538 		err = EINTR;
3539 		goto out;
3540 	}
3541 	ASSERT(secpolicy_pool(CRED()) == 0);
3542 	/*
3543 	 * Bind ourselves to the pool currently associated with the zone.
3544 	 */
3545 	oldpool = curproc->p_pool;
3546 	newpool = zone_pool_get(zone);
3547 	if (pool_state == POOL_ENABLED && newpool != oldpool &&
3548 	    (err = pool_do_bind(newpool, P_PID, P_MYID,
3549 	    POOL_BIND_ALL)) != 0) {
3550 		pool_unlock();
3551 		zone_rele(zone);
3552 		goto out;
3553 	}
3554 
3555 	/*
3556 	 * Grab cpu_lock now; we'll need it later when we call
3557 	 * task_join().
3558 	 */
3559 	mutex_enter(&cpu_lock);
3560 	mutex_enter(&zonehash_lock);
3561 	/*
3562 	 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
3563 	 */
3564 	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
3565 		/*
3566 		 * Can't join anymore.
3567 		 */
3568 		mutex_exit(&zonehash_lock);
3569 		mutex_exit(&cpu_lock);
3570 		if (pool_state == POOL_ENABLED &&
3571 		    newpool != oldpool)
3572 			(void) pool_do_bind(oldpool, P_PID, P_MYID,
3573 			    POOL_BIND_ALL);
3574 		pool_unlock();
3575 		zone_rele(zone);
3576 		err = EINVAL;
3577 		goto out;
3578 	}
3579 
3580 	mutex_enter(&pp->p_lock);
3581 	zone_proj0 = zone->zone_zsched->p_task->tk_proj;
3582 	/* verify that we do not exceed any task or lwp limits */
3583 	mutex_enter(&zone->zone_nlwps_lock);
3584 	/* add new lwps to zone and zone's proj0 */
3585 	zone_proj0->kpj_nlwps += pp->p_lwpcnt;
3586 	zone->zone_nlwps += pp->p_lwpcnt;
3587 	/* add 1 task to zone's proj0 */
3588 	zone_proj0->kpj_ntasks += 1;
3589 	mutex_exit(&pp->p_lock);
3590 	mutex_exit(&zone->zone_nlwps_lock);
3591 
3592 	/* remove lwps from proc's old zone and old project */
3593 	mutex_enter(&pp->p_zone->zone_nlwps_lock);
3594 	pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
3595 	pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
3596 	mutex_exit(&pp->p_zone->zone_nlwps_lock);
3597 
3598 	/*
3599 	 * Joining the zone cannot fail from now on.
3600 	 *
3601 	 * This means that a lot of the following code can be commonized and
3602 	 * shared with zsched().
3603 	 */
3604 
3605 	/*
3606 	 * Reset the encapsulating process contract's zone.
3607 	 */
3608 	ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
3609 	contract_setzuniqid(ct, zone->zone_uniqid);
3610 
3611 	/*
3612 	 * Create a new task and associate the process with the project keyed
3613 	 * by (projid,zoneid).
3614 	 *
3615 	 * We might as well be in project 0; the global zone's projid doesn't
3616 	 * make much sense in a zone anyhow.
3617 	 *
3618 	 * This also increments zone_ntasks, and returns with p_lock held.
3619 	 */
3620 	tk = task_create(0, zone);
3621 	oldtk = task_join(tk, 0);
3622 	mutex_exit(&cpu_lock);
3623 
3624 	pp->p_flag |= SZONETOP;
3625 	pp->p_zone = zone;
3626 
3627 	/*
3628 	 * call RCTLOP_SET functions on this proc
3629 	 */
3630 	e.rcep_p.zone = zone;
3631 	e.rcep_t = RCENTITY_ZONE;
3632 	(void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
3633 	    RCD_CALLBACK);
3634 	mutex_exit(&pp->p_lock);
3635 
3636 	/*
3637 	 * We don't need to hold any of zsched's locks here; not only do we know
3638 	 * the process and zone aren't going away, we know its session isn't
3639 	 * changing either.
3640 	 *
3641 	 * By joining zsched's session here, we mimic the behavior in the
3642 	 * global zone of init's sid being the pid of sched.  We extend this
3643 	 * to all zlogin-like zone_enter()'ing processes as well.
3644 	 */
3645 	mutex_enter(&pidlock);
3646 	sp = zone->zone_zsched->p_sessp;
3647 	SESS_HOLD(sp);
3648 	mutex_enter(&pp->p_lock);
3649 	pgexit(pp);
3650 	SESS_RELE(pp->p_sessp);
3651 	pp->p_sessp = sp;
3652 	pgjoin(pp, zone->zone_zsched->p_pidp);
3653 	mutex_exit(&pp->p_lock);
3654 	mutex_exit(&pidlock);
3655 
3656 	mutex_exit(&zonehash_lock);
3657 	/*
3658 	 * We're firmly in the zone; let pools progress.
3659 */ 3660 pool_unlock(); 3661 task_rele(oldtk); 3662 /* 3663 * We don't need to retain a hold on the zone since we already 3664 * incremented zone_ntasks, so the zone isn't going anywhere. 3665 */ 3666 zone_rele(zone); 3667 3668 /* 3669 * Chroot 3670 */ 3671 vp = zone->zone_rootvp; 3672 zone_chdir(vp, &PTOU(pp)->u_cdir, pp); 3673 zone_chdir(vp, &PTOU(pp)->u_rdir, pp); 3674 3675 /* 3676 * Change process credentials 3677 */ 3678 newcr = cralloc(); 3679 mutex_enter(&pp->p_crlock); 3680 cr = pp->p_cred; 3681 crcopy_to(cr, newcr); 3682 crsetzone(newcr, zone); 3683 pp->p_cred = newcr; 3684 3685 /* 3686 * Restrict all process privilege sets to zone limit 3687 */ 3688 priv_intersect(zone->zone_privset, &CR_PPRIV(newcr)); 3689 priv_intersect(zone->zone_privset, &CR_EPRIV(newcr)); 3690 priv_intersect(zone->zone_privset, &CR_IPRIV(newcr)); 3691 priv_intersect(zone->zone_privset, &CR_LPRIV(newcr)); 3692 mutex_exit(&pp->p_crlock); 3693 crset(pp, newcr); 3694 3695 /* 3696 * Adjust upcount to reflect zone entry. 3697 */ 3698 uid = crgetruid(newcr); 3699 mutex_enter(&pidlock); 3700 upcount_dec(uid, GLOBAL_ZONEID); 3701 upcount_inc(uid, zoneid); 3702 mutex_exit(&pidlock); 3703 3704 /* 3705 * Set up core file path and content. 3706 */ 3707 set_core_defaults(); 3708 3709 out: 3710 /* 3711 * Let the other lwps continue. 3712 */ 3713 mutex_enter(&pp->p_lock); 3714 if (curthread != pp->p_agenttp) 3715 continuelwps(pp); 3716 mutex_exit(&pp->p_lock); 3717 3718 return (err != 0 ? set_errno(err) : 0); 3719 } 3720 3721 /* 3722 * Systemcall entry point for zone_list(2). 3723 * 3724 * Processes running in a (non-global) zone only see themselves. 3725 */ 3726 static int 3727 zone_list(zoneid_t *zoneidlist, uint_t *numzones) 3728 { 3729 zoneid_t *zoneids; 3730 zone_t *zone; 3731 uint_t user_nzones, real_nzones; 3732 int error = 0; 3733 uint_t i; 3734 3735 if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0) 3736 return (set_errno(EFAULT)); 3737 3738 if (curproc->p_zone != global_zone) { 3739 /* just return current zone */ 3740 real_nzones = 1; 3741 zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP); 3742 zoneids[0] = curproc->p_zone->zone_id; 3743 } else { 3744 mutex_enter(&zonehash_lock); 3745 real_nzones = zonecount; 3746 if (real_nzones) { 3747 zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t), 3748 KM_SLEEP); 3749 i = 0; 3750 for (zone = list_head(&zone_active); zone != NULL; 3751 zone = list_next(&zone_active, zone)) 3752 zoneids[i++] = zone->zone_id; 3753 ASSERT(i == real_nzones); 3754 } 3755 mutex_exit(&zonehash_lock); 3756 } 3757 3758 if (user_nzones > real_nzones) 3759 user_nzones = real_nzones; 3760 3761 if (copyout(&real_nzones, numzones, sizeof (uint_t)) != 0) 3762 error = EFAULT; 3763 else if (zoneidlist != NULL && user_nzones != 0) { 3764 if (copyout(zoneids, zoneidlist, 3765 user_nzones * sizeof (zoneid_t)) != 0) 3766 error = EFAULT; 3767 } 3768 3769 if (real_nzones) 3770 kmem_free(zoneids, real_nzones * sizeof (zoneid_t)); 3771 3772 if (error) 3773 return (set_errno(error)); 3774 else 3775 return (0); 3776 } 3777 3778 /* 3779 * Systemcall entry point for zone_lookup(2). 3780 * 3781 * Non-global zones are only able to see themselves. 
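 *
 * For example, zone_lookup(NULL) returns the caller's own zone id,
 * while looking up another zone's name from within a non-global zone
 * fails with EINVAL.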
3782  */
3783 static zoneid_t
3784 zone_lookup(const char *zone_name)
3785 {
3786 	char *kname;
3787 	zone_t *zone;
3788 	zoneid_t zoneid;
3789 	int err;
3790 
3791 	if (zone_name == NULL) {
3792 		/* return caller's zone id */
3793 		return (getzoneid());
3794 	}
3795 
3796 	kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3797 	if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
3798 		kmem_free(kname, ZONENAME_MAX);
3799 		return (set_errno(err));
3800 	}
3801 
3802 	mutex_enter(&zonehash_lock);
3803 	zone = zone_find_all_by_name(kname);
3804 	kmem_free(kname, ZONENAME_MAX);
3805 	if (zone == NULL || zone_status_get(zone) < ZONE_IS_READY ||
3806 	    (curproc->p_zone != global_zone && curproc->p_zone != zone)) {
3807 		/* in non-global zone, can only lookup own name */
3808 		mutex_exit(&zonehash_lock);
3809 		return (set_errno(EINVAL));
3810 	}
3811 	zoneid = zone->zone_id;
3812 	mutex_exit(&zonehash_lock);
3813 	return (zoneid);
3814 }
3815 
3816 static int
3817 zone_version(int *version_arg)
3818 {
3819 	int version = ZONE_SYSCALL_API_VERSION;
3820 
3821 	if (copyout(&version, version_arg, sizeof (int)) != 0)
3822 		return (set_errno(EFAULT));
3823 	return (0);
3824 }
3825 
3826 /* ARGSUSED */
3827 long
3828 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
3829 {
3830 	zone_def zs;
3831 
3832 	switch (cmd) {
3833 	case ZONE_CREATE:
3834 		if (get_udatamodel() == DATAMODEL_NATIVE) {
3835 			if (copyin(arg1, &zs, sizeof (zone_def))) {
3836 				return (set_errno(EFAULT));
3837 			}
3838 		} else {
3839 #ifdef _SYSCALL32_IMPL
3840 			zone_def32 zs32;
3841 
3842 			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
3843 				return (set_errno(EFAULT));
3844 			}
3845 			zs.zone_name =
3846 			    (const char *)(unsigned long)zs32.zone_name;
3847 			zs.zone_root =
3848 			    (const char *)(unsigned long)zs32.zone_root;
3849 			zs.zone_privs =
3850 			    (const struct priv_set *)
3851 			    (unsigned long)zs32.zone_privs;
			zs.zone_privssz = zs32.zone_privssz;
3852 			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
3853 			zs.rctlbufsz = zs32.rctlbufsz;
3854 			zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
3855 			zs.zfsbufsz = zs32.zfsbufsz;
3856 			zs.extended_error =
3857 			    (int *)(unsigned long)zs32.extended_error;
3858 #else
3859 			panic("get_udatamodel() returned bogus result\n");
3860 #endif
3861 		}
3862 
3863 		return (zone_create(zs.zone_name, zs.zone_root,
3864 		    zs.zone_privs, zs.zone_privssz,
3865 		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,
3866 		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,
3867 		    zs.extended_error));
3868 	case ZONE_BOOT:
3869 		return (zone_boot((zoneid_t)(uintptr_t)arg1,
3870 		    (const char *)arg2));
3871 	case ZONE_DESTROY:
3872 		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
3873 	case ZONE_GETATTR:
3874 		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
3875 		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
3876 	case ZONE_ENTER:
3877 		return (zone_enter((zoneid_t)(uintptr_t)arg1));
3878 	case ZONE_LIST:
3879 		return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
3880 	case ZONE_SHUTDOWN:
3881 		return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
3882 	case ZONE_LOOKUP:
3883 		return (zone_lookup((const char *)arg1));
3884 	case ZONE_VERSION:
3885 		return (zone_version((int *)arg1));
3886 	default:
3887 		return (set_errno(EINVAL));
3888 	}
3889 }
3890 
3891 struct zarg {
3892 	zone_t *zone;
3893 	zone_cmd_arg_t arg;
3894 };
3895 
3896 static int
3897 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
3898 {
3899 	char *buf;
3900 	size_t buflen;
3901 	int error;
3902 
3903 	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
3904 	buf = kmem_alloc(buflen, KM_SLEEP);
3905 	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
3906 	error =

static int
zone_lookup_door(const char *zone_name, door_handle_t *doorp)
{
	char *buf;
	size_t buflen;
	int error;

	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
	buf = kmem_alloc(buflen, KM_SLEEP);
	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
	error = door_ki_open(buf, doorp);
	kmem_free(buf, buflen);
	return (error);
}

static void
zone_release_door(door_handle_t *doorp)
{
	door_ki_rele(*doorp);
	*doorp = NULL;
}

static void
zone_ki_call_zoneadmd(struct zarg *zargp)
{
	door_handle_t door = NULL;
	door_arg_t darg, save_arg;
	char *zone_name;
	size_t zone_namelen;
	zoneid_t zoneid;
	zone_t *zone;
	zone_cmd_arg_t arg;
	uint64_t uniqid;
	size_t size;
	int error;
	int retry;

	zone = zargp->zone;
	arg = zargp->arg;
	kmem_free(zargp, sizeof (*zargp));

	zone_namelen = strlen(zone->zone_name) + 1;
	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
	bcopy(zone->zone_name, zone_name, zone_namelen);
	zoneid = zone->zone_id;
	uniqid = zone->zone_uniqid;
	/*
	 * zoneadmd may be down, but at least we can empty out the zone.
	 * We can ignore the return value of zone_empty() since we're called
	 * from a kernel thread and know we won't be delivered any signals.
	 */
	ASSERT(curproc == &p0);
	(void) zone_empty(zone);
	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
	zone_rele(zone);

	size = sizeof (arg);
	darg.rbuf = (char *)&arg;
	darg.data_ptr = (char *)&arg;
	darg.rsize = size;
	darg.data_size = size;
	darg.desc_ptr = NULL;
	darg.desc_num = 0;

	save_arg = darg;
	/*
	 * Since we're not holding a reference to the zone, any number of
	 * things can go wrong, including the zone disappearing before we get a
	 * chance to talk to zoneadmd.
	 */
	for (retry = 0; /* forever */; retry++) {
		if (door == NULL &&
		    (error = zone_lookup_door(zone_name, &door)) != 0) {
			goto next;
		}
		ASSERT(door != NULL);

		if ((error = door_ki_upcall(door, &darg)) == 0) {
			break;
		}
		switch (error) {
		case EINTR:
			/* FALLTHROUGH */
		case EAGAIN:	/* process may be forking */
			/*
			 * Back off for a bit
			 */
			break;
		case EBADF:
			zone_release_door(&door);
			if (zone_lookup_door(zone_name, &door) != 0) {
				/*
				 * zoneadmd may be dead, but it may come back
				 * to life later.
				 */
				break;
			}
			break;
		default:
			cmn_err(CE_WARN,
			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
			    error);
			goto out;
		}
next:
		/*
		 * If this isn't the same zone_t that we originally had in
		 * mind, then this is the same as if two kadmin requests come
		 * in at the same time: the first one wins.  This means we
		 * lose, so we bail.
		 */
		if ((zone = zone_find_by_id(zoneid)) == NULL) {
			/*
			 * Problem is solved.
			 */
			break;
		}
		if (zone->zone_uniqid != uniqid) {
			/*
			 * zoneid recycled
			 */
			zone_rele(zone);
			break;
		}
		/*
		 * We could zone_status_timedwait(), but there doesn't seem to
		 * be much point in doing that (plus, it would mean that
		 * zone_free() isn't called until this thread exits).
		 */
		zone_rele(zone);
		delay(hz);
		darg = save_arg;
	}
out:
	if (door != NULL) {
		zone_release_door(&door);
	}
	kmem_free(zone_name, zone_namelen);
	thread_exit();
}
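
/*
 * Illustrative sketch (not part of the original source): the
 * (zoneid, uniqid) revalidation in the retry loop above is an instance
 * of the general generation-number pattern for safely re-finding an
 * object after all holds on it have been dropped.  In outline, using
 * the hypothetical names obj_find() and obj_rele():
 *
 *	uint64_t gen = obj->gen;
 *	obj_rele(obj);
 *	...
 *	if ((obj = obj_find(id)) == NULL)
 *		return;
 *	if (obj->gen != gen) {
 *		obj_rele(obj);
 *		return;
 *	}
 *
 * The generation number is snapshotted while a hold is still in place;
 * after the release the object may die and its id may be recycled, so
 * a successful re-lookup alone proves nothing.  Only a matching
 * generation shows it is the same incarnation.  Above,
 * zone_find_by_id() and zone_uniqid play the obj_find()/gen roles.
 */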
/*
 * Entry point for uadmin() to tell the zone to go away or reboot.  The
 * caller is a process in the zone to be modified.
 *
 * In order to shut down the zone, we will hand off control to zoneadmd
 * (running in the global zone) via a door.  We do a half-hearted job of
 * killing all processes in the zone, create a kernel thread to contact
 * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
 * a form of generation number used to let zoneadmd (as well as
 * zone_destroy()) know exactly which zone they're talking about.
 */
int
zone_uadmin(int cmd, int fcn, cred_t *credp)
{
	struct zarg *zargp;
	zone_cmd_t zcmd;
	zone_t *zone;

	zone = curproc->p_zone;
	ASSERT(getzoneid() != GLOBAL_ZONEID);

	switch (cmd) {
	case A_SHUTDOWN:
		switch (fcn) {
		case AD_HALT:
		case AD_POWEROFF:
			zcmd = Z_HALT;
			break;
		case AD_BOOT:
			zcmd = Z_REBOOT;
			break;
		case AD_IBOOT:
		case AD_SBOOT:
		case AD_SIBOOT:
		case AD_NOSYNC:
			return (ENOTSUP);
		default:
			return (EINVAL);
		}
		break;
	case A_REBOOT:
		zcmd = Z_REBOOT;
		break;
	case A_FTRACE:
	case A_REMOUNT:
	case A_FREEZE:
	case A_DUMP:
		return (ENOTSUP);
	default:
		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
		return (EINVAL);
	}

	if (secpolicy_zone_admin(credp, B_FALSE))
		return (EPERM);
	mutex_enter(&zone_status_lock);
	/*
	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
	 * is in the zone.
	 */
	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
		/*
		 * This zone is already on its way down.
		 */
		mutex_exit(&zone_status_lock);
		return (0);
	}
	/*
	 * Prevent future zone_enter()s
	 */
	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
	mutex_exit(&zone_status_lock);

	/*
	 * Kill everyone now and call zoneadmd later.
	 * zone_ki_call_zoneadmd() will do a more thorough job of this
	 * later.
	 */
	killall(zone->zone_id);
	/*
	 * Now, create the thread to contact zoneadmd and do the rest of the
	 * work.  This thread can't be created in our zone otherwise
	 * zone_destroy() would deadlock.
	 */
	zargp = kmem_alloc(sizeof (*zargp), KM_SLEEP);
	zargp->arg.cmd = zcmd;
	zargp->arg.uniqid = zone->zone_uniqid;
	(void) strcpy(zargp->arg.locale, "C");
	zone_hold(zargp->zone = zone);

	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
	    TS_RUN, minclsyspri);
	exit(CLD_EXITED, 0);

	return (EINVAL);
}

/*
 * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
 * status to ZONE_IS_SHUTTING_DOWN.
 */
void
zone_shutdown_global(void)
{
	ASSERT(curproc->p_zone == global_zone);

	mutex_enter(&zone_status_lock);
	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
	mutex_exit(&zone_status_lock);
}
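
/*
 * Illustrative sketch (not part of the original source): a sufficiently
 * privileged process inside a non-global zone halts or reboots its own
 * zone through the ordinary uadmin(2) interface, which the kernel
 * routes to zone_uadmin() above.  On success the calling process exits
 * rather than returning, per the exit(CLD_EXITED, 0) at the end of
 * zone_uadmin():
 *
 *	#include <sys/uadmin.h>
 *
 *	if (uadmin(A_SHUTDOWN, AD_BOOT, 0) == -1)
 *		perror("uadmin");
 *
 * The third (mdep) argument is unused by zone_uadmin(), hence the 0.
 */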

/*
 * Returns true if the named dataset is visible in the current zone.
 * The 'write' parameter is set to 1 if the dataset is also writable.
 */
int
zone_dataset_visible(const char *dataset, int *write)
{
	zone_dataset_t *zd;
	size_t len;
	zone_t *zone = curproc->p_zone;

	if (dataset[0] == '\0')
		return (0);

	/*
	 * Walk the list once, looking for datasets which match exactly, or
	 * specify a dataset underneath an exported dataset.  If found,
	 * return true and note that it is writable.
	 */
	for (zd = list_head(&zone->zone_datasets); zd != NULL;
	    zd = list_next(&zone->zone_datasets, zd)) {

		len = strlen(zd->zd_dataset);
		if (strlen(dataset) >= len &&
		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
		    (dataset[len] == '\0' || dataset[len] == '/' ||
		    dataset[len] == '@')) {
			if (write)
				*write = 1;
			return (1);
		}
	}

	/*
	 * Walk the list a second time, searching for datasets which are
	 * parents of exported datasets.  These should be visible, but
	 * read-only.
	 *
	 * Note that we also have to support forms such as 'pool/dataset/',
	 * with a trailing slash.
	 */
	for (zd = list_head(&zone->zone_datasets); zd != NULL;
	    zd = list_next(&zone->zone_datasets, zd)) {

		len = strlen(dataset);
		if (dataset[len - 1] == '/')
			len--;	/* Ignore trailing slash */
		if (len < strlen(zd->zd_dataset) &&
		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
		    zd->zd_dataset[len] == '/') {
			if (write)
				*write = 0;
			return (1);
		}
	}

	return (0);
}
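
/*
 * Illustrative examples (not part of the original source): assuming a
 * zone that exports the single dataset "tank/zone1", the two passes in
 * zone_dataset_visible() classify names as follows:
 *
 *	"tank/zone1"		visible, writable  (exact match)
 *	"tank/zone1/fs"		visible, writable  (underneath the export)
 *	"tank/zone1@snap"	visible, writable  (snapshot of the export)
 *	"tank" or "tank/"	visible, read-only (parent of the export)
 *	"tank/other"		not visible
 *	"tank/zone1xyz"		not visible
 *
 * The last case shows why the prefix comparison alone is not enough:
 * the byte following the matched prefix must be '\0', '/' or '@', or
 * an unrelated sibling dataset would leak into the zone.
 */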