/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Zones
 *
 * A zone is a named collection of processes, namespace constraints,
 * and other system resources which comprise a secure and manageable
 * application containment facility.
 *
 * Zones (represented by the reference counted zone_t) are tracked in
 * the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 * (zoneid_t) are used to track zone association.  Zone IDs are
 * dynamically generated when the zone is created; if a persistent
 * identifier is needed (core files, accounting logs, audit trail,
 * etc.), the zone name should be used.
 *
 *
 * Global Zone:
 *
 * The global zone (zoneid 0) is automatically associated with all
 * system resources that have not been bound to a user-created zone.
 * This means that even systems where zones are not in active use
 * have a global zone, and all processes, mounts, etc. are
 * associated with that zone.  The global zone is generally
 * unconstrained in terms of privileges and access, though the usual
 * credential and privilege based restrictions apply.
 *
 *
 * Zone States:
 *
 * The states a zone may be in, and the transitions between them, are
 * as follows:
 *
 * ZONE_IS_UNINITIALIZED: primordial state for a zone.  The partially
 * initialized zone is added to the list of active zones on the system but
 * isn't accessible.
 *
 * ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 * ready.  The zone is made visible after the ZSD constructor callbacks are
 * executed.  A zone remains in this state until it transitions into
 * the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 * ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 * init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 * state.
 *
 * ZONE_IS_RUNNING: The zone is open for business: zsched has
 * successfully started init.  A zone remains in this state until
 * zone_shutdown() is called.
 *
 * ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
 * killing all processes running in the zone.  The zone remains
 * in this state until there are no more user processes running in the zone.
 * zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 * Since zone_shutdown() is restartable, it may be called successfully
 * multiple times for the same zone_t.
 * Setting of the zone's state to ZONE_IS_SHUTTING_DOWN is synchronized
 * with mounts, so VOP_MOUNT() may check the zone's status without
 * worrying about it being a moving target.
 *
 * ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 * are no more user processes in the zone.  The zone remains in this
 * state until there are no more kernel threads associated with the
 * zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 * fail.
 *
 * ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 * have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 * join the zone or create kernel threads therein.
 *
 * ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 * remains in this state until zsched exits.  Calls to zone_find_by_*()
 * return NULL from now on.
 *
 * ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 * processes or threads doing work on behalf of the zone.  The zone is
 * removed from the list of active zones.  zone_destroy() returns, and
 * the zone can be recreated.
 *
 * ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 * callbacks are executed, and all memory associated with the zone is
 * freed.
 *
 * Threads can wait for the zone to enter a requested state by using
 * zone_status_wait() or zone_status_timedwait() with the desired
 * state passed in as an argument.  Zone state transitions are
 * uni-directional; it is not possible to move back to an earlier state.
 *
 *
 * Zone-Specific Data:
 *
 * Subsystems needing to maintain zone-specific data can store that
 * data using the ZSD mechanism.  This provides a zone-specific data
 * store, similar to thread-specific data (see pthread_getspecific(3C)
 * or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 * to register callbacks to be invoked when a zone is created, shut
 * down, or destroyed.  This can be used to initialize zone-specific
 * data for new zones and to clean up when zones go away.
 *
 *
 * Data Structures:
 *
 * The per-zone structure (zone_t) is reference counted, and freed
 * when all references are released.  zone_hold and zone_rele can be
 * used to adjust the reference count.  In addition, reference counts
 * associated with the cred_t structure are tracked separately using
 * zone_cred_hold and zone_cred_rele.
 *
 * Pointers to active zone_t's are stored in two hash tables; one
 * for searching by id, the other for searching by name.  Lookups
 * can be performed on either basis, using zone_find_by_id and
 * zone_find_by_name.  Both return zone_t pointers with the zone
 * held, so zone_rele should be called when the pointer is no longer
 * needed.  Zones can also be searched by path; zone_find_by_path
 * returns the zone with which a path name is associated (global
 * zone if the path is not within some other zone's file system
 * hierarchy).  This currently requires iterating through each zone,
 * so it is slower than an id or name search via a hash table.
 *
 *
 * Locking:
 *
 * zonehash_lock: This is a top-level global lock used to protect the
 * zone hash tables and lists.  Zones cannot be created or destroyed
 * while this lock is held.
 * zone_status_lock: This is a global lock protecting zone state.
 * Zones cannot change state while this lock is held.  It also
 * protects the list of kernel threads associated with a zone.
 * zone_lock: This is a per-zone lock used to protect several fields of
 * the zone_t (see <sys/zone.h> for details).  In addition, holding
 * this lock means that the zone cannot go away.
 * zsd_key_lock: This is a global lock protecting the key state for ZSD.
 * zone_deathrow_lock: This is a global lock protecting the "deathrow"
 * list (a list of zones in the ZONE_IS_DEAD state).
 *
 * Ordering requirements:
 * pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 * zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 * Blocking memory allocations are permitted while holding any of the
 * zone locks.
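 *
 * For example (an illustrative sketch, not code from this file), a
 * caller needing both the zone list and a zone's state must take the
 * locks in the order given above:
 *
 *	mutex_enter(&zonehash_lock);
 *	mutex_enter(&zone_status_lock);
 *	status = zone_status_get(zone);
 *	mutex_exit(&zone_status_lock);
 *	mutex_exit(&zonehash_lock);
 *
 * Taking them in the opposite order risks deadlock against the zone
 * creation and destruction paths, which acquire zonehash_lock first.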
 *
 *
 * System Call Interface:
 *
 * The zone subsystem can be managed and queried from user level with
 * the following system calls (all subcodes of the primary "zone"
 * system call):
 * - zone_create: creates a zone with selected attributes (name,
 *   root path, privileges, resource controls, ZFS datasets)
 * - zone_enter: allows the current process to enter a zone
 * - zone_getattr: reports attributes of a zone
 * - zone_setattr: sets attributes of a zone
 * - zone_boot: sets 'init' running for the zone
 * - zone_list: lists all zones active in the system
 * - zone_lookup: looks up zone id based on name
 * - zone_shutdown: initiates shutdown process (see states above)
 * - zone_destroy: completes shutdown process (see states above)
 *
 */

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/sysevent.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>

#include <sys/door.h>
#include <sys/cpuvar.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/sunddi.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/zone.h>
#include <sys/tsol/label.h>

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;	/* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;	/* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/* Event channel to send zone state change notifications */
evchan_t *zone_event_chan;

/*
 * This table holds the mapping from kernel zone states to
 * states visible in the state notification API.
 * The idea is that we only expose "obvious" states and
 * do not expose states which are just implementation details.
 */
const char *zone_status_table[] = {
	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
	ZONE_EVENT_READY,		/* ready */
	ZONE_EVENT_READY,		/* booting */
	ZONE_EVENT_RUNNING,		/* running */
	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
	ZONE_EVENT_SHUTTING_DOWN,	/* empty */
	ZONE_EVENT_SHUTTING_DOWN,	/* down */
	ZONE_EVENT_SHUTTING_DOWN,	/* dying */
	ZONE_EVENT_UNINITIALIZED,	/* dead */
};

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_nlwps;
/*
 * Synchronization primitives used to synchronize between mounts and zone
 * creation/destruction.
 */
static int mounts_in_progress;
static kcondvar_t mount_cv;
static kmutex_t mount_lock;

const char * const zone_default_initname = "/sbin/init";
static char * const zone_prefix = "/zone/";

static int zone_shutdown(zoneid_t zoneid);

/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure, and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 */
static const int ZONE_SYSCALL_API_VERSION = 5;

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created such that
 * nfs_mount() thinks it is in the global zone, while by the time it
 * gets added to the list of mounted zones, it ends up on zoneA's mount
 * list.
 *
 * The functions block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer, respectively, to synchronize zone creation and new mounts.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone creations, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is unfair bias towards the
 * "current" operation.  This means that zone creations may starve if
 * there is a rapid succession of new mounts coming in to the system, or
 * there is a remote possibility that zones will be created at such a
 * rate that new mounts will not be able to proceed.
 */
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(void)
{
	int retval = 0;

	/*
	 * Since it may block for a long time, block_mounts() shouldn't be
	 * called with zonehash_lock held.
	 */
	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
	mutex_enter(&mount_lock);
	while (mounts_in_progress > 0) {
		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
			goto signaled;
	}
	/*
	 * A negative value of mounts_in_progress indicates that mounts
	 * have been blocked by (-mounts_in_progress) different callers.
	 */
	mounts_in_progress--;
	retval = 1;
signaled:
	mutex_exit(&mount_lock);
	return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(void)
{
	mutex_enter(&mount_lock);
	if (++mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * The VFS layer is busy with a mount; zones should wait until all
 * mounts are completed to progress.
 */
void
mount_in_progress(void)
{
	mutex_enter(&mount_lock);
	while (mounts_in_progress < 0)
		cv_wait(&mount_cv, &mount_lock);
	mounts_in_progress++;
	mutex_exit(&mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(void)
{
	mutex_enter(&mount_lock);
	if (--mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}
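
/*
 * Illustrative sketch (not part of the original file): the VFS layer's
 * mount path is expected to bracket its VFS_MOUNT() work with the pair
 * above so that zone creation cannot race with the mount.  The
 * my_fs_mount()/my_do_mount() names below are hypothetical.
 */
#if 0
static int
my_fs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
{
	int error;

	mount_in_progress();	/* blocks while block_mounts() is active */
	error = my_do_mount(vfsp, mvp, uap, cr);
	mount_completed();	/* may wake a waiting block_mounts() */
	return (error);
}
#endif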

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shut down, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture are as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  The
 * constructor callback is called immediately on all existing zones, and a
 * copy of the ZSD entry added to the per-zone zone_zsd list (protected by
 * zone_lock).  As this operation requires the list of zones, the list of
 * registered keys, and the per-zone list of ZSD entries to remain constant
 * throughout the entire operation, it must grab zonehash_lock, zone_lock for
 * all existing zones, and zsd_key_lock, in that order.  Similar locking is
 * needed when zone_key_delete() is called.  It is thus sufficient to hold
 * zsd_key_lock *or* zone_lock to prevent additions to or removals from the
 * per-zone zone_zsd list.
 *
 * Note that this implementation does not make a copy of the ZSD entry if a
 * constructor callback is not provided.  A zone_getspecific() on such an
 * uninitialized ZSD entry will return NULL.
 *
 * When new zones are created constructor callbacks for all registered ZSD
 * entries will be called.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */
void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	struct zone *zone;

	zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP);
	zsdp->zsd_data = NULL;
	zsdp->zsd_create = create;
	zsdp->zsd_shutdown = shutdown;
	zsdp->zsd_destroy = destroy;

	mutex_enter(&zonehash_lock);	/* stop the world */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_enter(&zone->zone_lock);	/* lock all zones */

	mutex_enter(&zsd_key_lock);
	*keyp = zsdp->zsd_key = ++zsd_keyval;
	ASSERT(zsd_keyval != 0);
	list_insert_tail(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	if (create != NULL) {
		for (zone = list_head(&zone_active); zone != NULL;
		    zone = list_next(&zone_active, zone)) {
			t = kmem_alloc(sizeof (*t), KM_SLEEP);
			t->zsd_key = *keyp;
			t->zsd_data = (*create)(zone->zone_id);
			t->zsd_create = create;
			t->zsd_shutdown = shutdown;
			t->zsd_destroy = destroy;
			list_insert_tail(&zone->zone_zsd, t);
		}
	}
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_exit(&zone->zone_lock);
	mutex_exit(&zonehash_lock);
}
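
/*
 * Illustrative sketch (not from the original source): a subsystem
 * typically registers its ZSD callbacks once at module load time.  All
 * "mysub" names below, including mysub_state_t, are hypothetical.
 */
#if 0
typedef struct mysub_state { int ms_dummy; } mysub_state_t; /* hypothetical */

static zone_key_t mysub_zone_key;

static void *
mysub_zone_init(zoneid_t zoneid)
{
	/* Allocate per-zone state; the returned pointer is the ZSD value */
	return (kmem_zalloc(sizeof (mysub_state_t), KM_SLEEP));
}

static void
mysub_zone_fini(zoneid_t zoneid, void *data)
{
	/* Destructors run even if data is NULL; see the comment above */
	if (data != NULL)
		kmem_free(data, sizeof (mysub_state_t));
}

void
mysub_init(void)
{
	zone_key_create(&mysub_zone_key, mysub_zone_init, NULL,
	    mysub_zone_fini);
}
#endif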

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
	struct zsd_entry *zsd;

	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
		if (zsd->zsd_key == key) {
			/*
			 * Move to head of list to keep list in MRU order.
			 */
			if (zsd != list_head(l)) {
				list_remove(l, zsd);
				list_insert_head(l, zsd);
			}
			return (zsd);
		}
	}
	return (NULL);
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
 */
int
zone_key_delete(zone_key_t key)
{
	struct zsd_entry *zsdp = NULL;
	zone_t *zone;

	mutex_enter(&zonehash_lock);	/* Zone create/delete waits for us */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_enter(&zone->zone_lock);	/* lock all zones */

	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find(&zsd_registered_keys, key);
	if (zsdp == NULL)
		goto notfound;
	list_remove(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		struct zsd_entry *del;
		void *data;

		if (!(zone->zone_flags & ZF_DESTROYED)) {
			del = zsd_find(&zone->zone_zsd, key);
			if (del != NULL) {
				data = del->zsd_data;
				ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
				ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
				list_remove(&zone->zone_zsd, del);
				kmem_free(del, sizeof (*del));
			} else {
				data = NULL;
			}
			if (zsdp->zsd_shutdown)
				zsdp->zsd_shutdown(zone->zone_id, data);
			if (zsdp->zsd_destroy)
				zsdp->zsd_destroy(zone->zone_id, data);
		}
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);
	kmem_free(zsdp, sizeof (*zsdp));
	return (0);

notfound:
	mutex_exit(&zsd_key_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone))
		mutex_exit(&zone->zone_lock);
	mutex_exit(&zonehash_lock);
	return (-1);
}

/*
 * ZSD counterpart of pthread_setspecific().
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
	struct zsd_entry *t;
	struct zsd_entry *zsdp = NULL;

	mutex_enter(&zone->zone_lock);
	t = zsd_find(&zone->zone_zsd, key);
	if (t != NULL) {
		/*
		 * Replace old value with new
		 */
		t->zsd_data = (void *)data;
		mutex_exit(&zone->zone_lock);
		return (0);
	}
	/*
	 * If there was no previous value, go through the list of registered
	 * keys.
	 *
	 * We avoid grabbing zsd_key_lock until we are sure we need it; this is
	 * necessary for shutdown callbacks to be able to execute without fear
	 * of deadlock.
	 */
	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find(&zsd_registered_keys, key);
	if (zsdp == NULL) {	/* Key was not registered */
		mutex_exit(&zsd_key_lock);
		mutex_exit(&zone->zone_lock);
		return (-1);
	}

	/*
	 * Add a zsd_entry to this zone, using the template we just retrieved
	 * to initialize the constructor and destructor(s).
	 */
	t = kmem_alloc(sizeof (*t), KM_SLEEP);
	t->zsd_key = key;
	t->zsd_data = (void *)data;
	t->zsd_create = zsdp->zsd_create;
	t->zsd_shutdown = zsdp->zsd_shutdown;
	t->zsd_destroy = zsdp->zsd_destroy;
	list_insert_tail(&zone->zone_zsd, t);
	mutex_exit(&zsd_key_lock);
	mutex_exit(&zone->zone_lock);
	return (0);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
	struct zsd_entry *t;
	void *data;

	mutex_enter(&zone->zone_lock);
	t = zsd_find(&zone->zone_zsd, key);
	data = (t == NULL ? NULL : t->zsd_data);
	mutex_exit(&zone->zone_lock);
	return (data);
}
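
/*
 * Illustrative sketch (using the hypothetical mysub names from the
 * sketch above): once the key is registered, per-zone state can be
 * fetched from any context that holds a zone reference.  Note there is
 * no atomicity between a get and a subsequent set, as described in the
 * ZSD comment; callers wanting test-and-set semantics must add their
 * own locking.
 */
#if 0
static mysub_state_t *
mysub_state_for(zone_t *zone)
{
	return (zone_getspecific(mysub_zone_key, zone));
}
#endif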

/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created.  The callbacks are initialized from
 * the template list (zsd_registered_keys), and the constructor
 * callback executed (if one exists).
 *
 * This is called before the zone is made publicly available, hence no
 * need to grab zone_lock.
 *
 * Although we grab and release zsd_key_lock, new entries cannot be
 * added to or removed from the zsd_registered_keys list until we
 * release zonehash_lock, so there isn't a window for a
 * zone_key_create() to come in after we've dropped zsd_key_lock but
 * before the zone is added to the zone list, such that the constructor
 * callbacks aren't executed for the new zone.
 */
static void
zone_zsd_configure(zone_t *zone)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	zoneid_t zoneid = zone->zone_id;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	ASSERT(list_head(&zone->zone_zsd) == NULL);
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		if (zsdp->zsd_create != NULL) {
			t = kmem_alloc(sizeof (*t), KM_SLEEP);
			t->zsd_key = zsdp->zsd_key;
			t->zsd_create = zsdp->zsd_create;
			t->zsd_data = (*t->zsd_create)(zoneid);
			t->zsd_shutdown = zsdp->zsd_shutdown;
			t->zsd_destroy = zsdp->zsd_destroy;
			list_insert_tail(&zone->zone_zsd, t);
		}
	}
	mutex_exit(&zsd_key_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	zoneid_t zoneid = zone->zone_id;

	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

	mutex_enter(&zone->zone_lock);
	if (ct == ZSD_DESTROY) {
		if (zone->zone_flags & ZF_DESTROYED) {
			/*
			 * Make sure destructors are only called once.
			 */
			mutex_exit(&zone->zone_lock);
			return;
		}
		zone->zone_flags |= ZF_DESTROYED;
	}
	mutex_exit(&zone->zone_lock);

	/*
	 * Both zsd_key_lock and zone_lock need to be held in order to add or
	 * remove a ZSD key (either globally as part of
	 * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is
	 * possible through zone_setspecific()), so it's sufficient to hold
	 * zsd_key_lock here.
	 *
	 * This is a good thing, since we don't want to recursively try to grab
	 * zone_lock if a callback attempts to do something like a crfree() or
	 * zone_rele().
	 */
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		zone_key_t key = zsdp->zsd_key;

		/* Skip if no callbacks registered */
		if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL)
			continue;
		if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL)
			continue;
		/*
		 * Call the callback with the zone-specific data if we can find
		 * any, otherwise with NULL.
		 */
		t = zsd_find(&zone->zone_zsd, key);
		if (t != NULL) {
			if (ct == ZSD_SHUTDOWN) {
				t->zsd_shutdown(zoneid, t->zsd_data);
			} else {
				ASSERT(ct == ZSD_DESTROY);
				t->zsd_destroy(zoneid, t->zsd_data);
			}
		} else {
			if (ct == ZSD_SHUTDOWN) {
				zsdp->zsd_shutdown(zoneid, NULL);
			} else {
				ASSERT(ct == ZSD_DESTROY);
				zsdp->zsd_destroy(zoneid, NULL);
			}
		}
	}
	mutex_exit(&zsd_key_lock);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
	struct zsd_entry *t, *next;

	/*
	 * Free all the zsd_entry's we had on this zone.
	 */
	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
		next = list_next(&zone->zone_zsd, t);
		list_remove(&zone->zone_zsd, t);
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_zsd);
}

/*
 * Frees memory associated with the zone dataset list.
 */
static void
zone_free_datasets(zone_t *zone)
{
	zone_dataset_t *t, *next;

	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
		next = list_next(&zone->zone_datasets, t);
		list_remove(&zone->zone_datasets, t);
		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_datasets);
}

/*
 * zone.cpu-shares resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	return (p->p_zone->zone_shares);
}

/*ARGSUSED*/
static int
zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);

	e->rcep_p.zone->zone_shares = nv;
	return (0);
}

static rctl_ops_t zone_cpu_shares_ops = {
	rcop_no_action,
	zone_cpu_shares_usage,
	zone_cpu_shares_set,
	rcop_no_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
	rctl_qty_t nlwps;
	zone_t *zone = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));

	mutex_enter(&zone->zone_nlwps_lock);
	nlwps = zone->zone_nlwps;
	mutex_exit(&zone->zone_nlwps_lock);

	return (nlwps);
}

/*ARGSUSED*/
static int
zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t nlwps;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
	nlwps = e->rcep_p.zone->zone_nlwps;

	if (nlwps + incr > rcntl->rcv_value)
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_nlwps_ctl = nv;
	return (0);
}

static rctl_ops_t zone_lwps_ops = {
	rcop_no_action,
	zone_lwps_usage,
	zone_lwps_set,
	zone_lwps_test,
};
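
/*
 * Illustrative sketch: a new zone-scoped resource control would supply
 * an rctl_ops_t vector like zone_lwps_ops above and register it at boot,
 * mirroring the zone.max-lwps registration done in zone_init() below.
 * The "zone.max-widgets" name, its limits, and zone_widgets_ops (which
 * would be defined like zone_lwps_ops) are all hypothetical.
 */
#if 0
static rctl_hndl_t rc_zone_widgets;

void
zone_widgets_rctl_init(void)
{
	rc_zone_widgets = rctl_register("zone.max-widgets", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_widgets_ops);
}
#endif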

/*
 * Helper function to brand the zone with a unique ID.
 */
static void
zone_uniqid(zone_t *zone)
{
	static uint64_t uniqid = 0;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	zone->zone_uniqid = uniqid++;
}

/*
 * Returns a held pointer to the "kcred" for the specified zone.
 */
struct cred *
zone_get_kcred(zoneid_t zoneid)
{
	zone_t *zone;
	cred_t *cr;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (NULL);
	cr = zone->zone_kcred;
	crhold(cr);
	zone_rele(zone);
	return (cr);
}
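
/*
 * Illustrative sketch: a driver needing to perform an operation with a
 * zone's credentials can use zone_get_kcred(); the returned cred is
 * held and must be released with crfree().  The my_zone_op() name is
 * hypothetical.
 */
#if 0
static int
my_zone_op(zoneid_t zoneid)
{
	cred_t *cr;

	if ((cr = zone_get_kcred(zoneid)) == NULL)
		return (EINVAL);	/* no such (visible) zone */
	/* ... perform credential-based work with cr ... */
	crfree(cr);
	return (0);
}
#endif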

/*
 * Called very early on in boot to initialize the ZSD list so that
 * zone_key_create() can be called before zone_init().  It also initializes
 * portions of zone0 which may be used before zone_init() is called.  The
 * variable "global_zone" will be set when zone0 is fully initialized by
 * zone_init().
 */
void
zone_zsd_init(void)
{
	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_create(&zone_active, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));
	list_create(&zone_deathrow, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));

	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
	zone0.zone_shares = 1;
	zone0.zone_nlwps_ctl = INT_MAX;
	zone0.zone_name = GLOBAL_ZONENAME;
	zone0.zone_nodename = utsname.nodename;
	zone0.zone_domain = srpc_domain;
	zone0.zone_ref = 1;
	zone0.zone_id = GLOBAL_ZONEID;
	zone0.zone_status = ZONE_IS_RUNNING;
	zone0.zone_rootpath = "/";
	zone0.zone_rootpathlen = 2;
	zone0.zone_psetid = ZONE_PS_INVAL;
	zone0.zone_ncpus = 0;
	zone0.zone_ncpus_online = 0;
	zone0.zone_proc_initpid = 1;
	zone0.zone_initname = initname;
	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_insert_head(&zone_active, &zone0);

	/*
	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
	 * to anything meaningful.  It is assigned to be 'rootdir' in
	 * vfs_mountroot().
	 */
	zone0.zone_rootvp = NULL;
	zone0.zone_vfslist = NULL;
	zone0.zone_bootargs = initargs;
	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
	/*
	 * The global zone has all privileges
	 */
	priv_fillset(zone0.zone_privset);
	/*
	 * Add p0 to the global zone
	 */
	zone0.zone_zsched = &p0;
	p0.p_zone = &zone0;
}

/*
 * Compute a hash value based on the contents of the label and the DOI.  The
 * hash algorithm is somewhat arbitrary, but is based on the observation that
 * humans will likely pick labels that differ by amounts that work out to be
 * multiples of the number of hash chains, and thus stirring in some primes
 * should help.
 */
static uint_t
hash_bylabel(void *hdata, mod_hash_key_t key)
{
	const ts_label_t *lab = (ts_label_t *)key;
	const uint32_t *up, *ue;
	uint_t hash;
	int i;

	_NOTE(ARGUNUSED(hdata));

	hash = lab->tsl_doi + (lab->tsl_doi << 1);
	/* we depend on alignment of label, but not representation */
	up = (const uint32_t *)&lab->tsl_label;
	ue = up + sizeof (lab->tsl_label) / sizeof (*up);
	i = 1;
	while (up < ue) {
		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
		hash += *up + (*up << ((i % 16) + 1));
		up++;
		i++;
	}
	return (hash);
}

/*
 * All that mod_hash cares about here is zero (equal) versus non-zero (not
 * equal).  This may need to be changed if less than / greater than is ever
 * needed.
 */
static int
hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
{
	ts_label_t *lab1 = (ts_label_t *)key1;
	ts_label_t *lab2 = (ts_label_t *)key2;

	return (label_equal(lab1, lab2) ? 0 : 1);
}

/*
 * Called by main() to initialize the zones framework.
 */
void
zone_init(void)
{
	rctl_dict_entry_t *rde;
	rctl_val_t *dval;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;
	int res;

	ASSERT(curproc == &p0);

	/*
	 * Create ID space for zone IDs.  ID 0 is reserved for the
	 * global zone.
	 */
	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);

	/*
	 * Initialize generic zone resource controls, if any.
	 */
	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
	    FSS_MAXSHARES, FSS_MAXSHARES,
	    &zone_cpu_shares_ops);

	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_lwps_ops);
	/*
	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
	 */
	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
	bzero(dval, sizeof (rctl_val_t));
	dval->rcv_value = 1;
	dval->rcv_privilege = RCPRIV_PRIVILEGED;
	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
	dval->rcv_action_recip_pid = -1;

	rde = rctl_dict_lookup("zone.cpu-shares");
	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);

	/*
	 * Initialize the ``global zone''.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&p0.p_lock);
	e.rcep_p.zone = &zone0;
	e.rcep_t = RCENTITY_ZONE;
	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
	    gp);

	zone0.zone_nlwps = p0.p_lwpcnt;
	zone0.zone_ntasks = 1;
	mutex_exit(&p0.p_lock);
	rctl_prealloc_destroy(gp);
	/*
	 * pool_default hasn't been initialized yet, so we let pool_init()
	 * take care of making sure the global zone is in the default pool.
	 */

	/*
	 * Initialize zone label.
	 * MLPs are initialized when tnzonecfg is loaded.
	 */
	zone0.zone_slabel = l_admin_low;
	rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
	label_hold(l_admin_low);

	mutex_enter(&zonehash_lock);
	zone_uniqid(&zone0);
	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);

	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
	    mod_hash_null_valdtor);
	zonehashbyname = mod_hash_create_strhash("zone_by_name",
	    zone_hash_size, mod_hash_null_valdtor);
	/*
	 * maintain zonehashbylabel only for labeled systems
	 */
	if (is_system_labeled())
		zonehashbylabel = mod_hash_create_extended("zone_by_label",
		    zone_hash_size, mod_hash_null_keydtor,
		    mod_hash_null_valdtor, hash_bylabel, NULL,
		    hash_labelkey_cmp, KM_SLEEP);
	zonecount = 1;

	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
	    (mod_hash_val_t)&zone0);
	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
	    (mod_hash_val_t)&zone0);
	if (is_system_labeled()) {
		zone0.zone_flags |= ZF_HASHED_LABEL;
		(void) mod_hash_insert(zonehashbylabel,
		    (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
	}
	mutex_exit(&zonehash_lock);

	/*
	 * We avoid setting zone_kcred until now, since kcred is initialized
	 * sometime after zone_zsd_init() and before zone_init().
	 */
	zone0.zone_kcred = kcred;
	/*
	 * The global zone is fully initialized (except for zone_rootvp which
	 * will be set when the root filesystem is mounted).
	 */
	global_zone = &zone0;

	/*
	 * Setup an event channel to send zone status change notifications on
	 */
	res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
	    EVCH_CREAT);

	if (res)
		panic("Sysevent_evc_bind failed during zone setup.\n");
}

static void
zone_free(zone_t *zone)
{
	ASSERT(zone != global_zone);
	ASSERT(zone->zone_ntasks == 0);
	ASSERT(zone->zone_nlwps == 0);
	ASSERT(zone->zone_cred_ref == 0);
	ASSERT(zone->zone_kcred == NULL);
	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);

	/* remove from deathrow list */
	if (zone_status_get(zone) == ZONE_IS_DEAD) {
		ASSERT(zone->zone_ref == 0);
		mutex_enter(&zone_deathrow_lock);
		list_remove(&zone_deathrow, zone);
		mutex_exit(&zone_deathrow_lock);
	}

	zone_free_zsd(zone);
	zone_free_datasets(zone);

	if (zone->zone_rootvp != NULL)
		VN_RELE(zone->zone_rootvp);
	if (zone->zone_rootpath)
		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
	if (zone->zone_name != NULL)
		kmem_free(zone->zone_name, ZONENAME_MAX);
	if (zone->zone_slabel != NULL)
		label_rele(zone->zone_slabel);
	if (zone->zone_nodename != NULL)
		kmem_free(zone->zone_nodename, _SYS_NMLN);
	if (zone->zone_domain != NULL)
		kmem_free(zone->zone_domain, _SYS_NMLN);
	if (zone->zone_privset != NULL)
		kmem_free(zone->zone_privset, sizeof (priv_set_t));
	if (zone->zone_rctls != NULL)
		rctl_set_free(zone->zone_rctls);
	if (zone->zone_bootargs != NULL)
		kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1);
	if (zone->zone_initname != NULL)
		kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1);
	id_free(zoneid_space, zone->zone_id);
	mutex_destroy(&zone->zone_lock);
	cv_destroy(&zone->zone_cv);
	rw_destroy(&zone->zone_mlps.mlpl_rwlock);
	kmem_free(zone, sizeof (zone_t));
}

/*
 * See block comment at the top of this file for information about zone
 * status values.
 */
/*
 * Convenience function for setting zone status.
 */
static void
zone_status_set(zone_t *zone, zone_status_t status)
{
	nvlist_t *nvl = NULL;

	ASSERT(MUTEX_HELD(&zone_status_lock));
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
	    status >= zone_status_get(zone));

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
	    nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
	    zone_status_table[status]) ||
	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
	    zone_status_table[zone->zone_status]) ||
	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
#ifdef DEBUG
		(void) printf(
		    "Failed to allocate and send zone state change event.\n");
#endif
	}
	nvlist_free(nvl);

	zone->zone_status = status;

	cv_broadcast(&zone->zone_cv);
}

/*
 * Public function to retrieve the zone status.  The zone status may
 * change after it is retrieved.
 */
zone_status_t
zone_status_get(zone_t *zone)
{
	return (zone->zone_status);
}

static int
zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
{
	char *bootargs = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
	int err = 0;

	ASSERT(zone != global_zone);
	if ((err = copyinstr(zone_bootargs, bootargs, BOOTARGS_MAX, NULL)) != 0)
		goto done;	/* EFAULT or ENAMETOOLONG */

	if (zone->zone_bootargs != NULL)
		kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1);

	zone->zone_bootargs = kmem_alloc(strlen(bootargs) + 1, KM_SLEEP);
	(void) strcpy(zone->zone_bootargs, bootargs);

done:
	kmem_free(bootargs, BOOTARGS_MAX);
	return (err);
}

static int
zone_set_initname(zone_t *zone, const char *zone_initname)
{
	char initname[INITNAME_SZ];
	size_t len;
	int err = 0;

	ASSERT(zone != global_zone);
	if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
		return (err);	/* EFAULT or ENAMETOOLONG */

	if (zone->zone_initname != NULL)
		kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1);

	zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
	(void) strcpy(zone->zone_initname, initname);
	return (0);
}

/*
 * Block indefinitely waiting for (zone_status >= status)
 */
void
zone_status_wait(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		cv_wait(&zone->zone_cv, &zone_status_lock);
	}
	mutex_exit(&zone_status_lock);
}
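
/*
 * Illustrative sketch: a caller holding a zone reference can block until
 * the zone has at least reached a given state; since state transitions
 * are one-way, the condition remains true once the wait returns.  The
 * wait_until_running() helper name is hypothetical.
 */
#if 0
static void
wait_until_running(zone_t *zone)
{
	zone_status_wait(zone, ZONE_IS_RUNNING);
	ASSERT(zone_status_get(zone) >= ZONE_IS_RUNNING);
}
#endif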

/*
 * Private CPR-safe version of zone_status_wait().
 */
static void
zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
{
	callb_cpr_t cprinfo;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
	    str);
	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&zone->zone_cv, &zone_status_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
	}
	/*
	 * zone_status_lock is implicitly released by the following.
	 */
	CALLB_CPR_EXIT(&cprinfo);
}

/*
 * Block until zone enters requested state or signal is received.  Return (0)
 * if signaled, non-zero otherwise.
 */
int
zone_status_wait_sig(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
			mutex_exit(&zone_status_lock);
			return (0);
		}
	}
	mutex_exit(&zone_status_lock);
	return (1);
}

/*
 * Block until the zone enters the requested state or the timeout expires,
 * whichever happens first.  Return (-1) if operation timed out, time remaining
 * otherwise.
 */
clock_t
zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
{
	clock_t timeleft = 0;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status && timeleft != -1) {
		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
	}
	mutex_exit(&zone_status_lock);
	return (timeleft);
}

/*
 * Block until the zone enters the requested state, the current process is
 * signaled, or the timeout expires, whichever happens first.  Return (-1) if
 * operation timed out, 0 if signaled, time remaining otherwise.
 */
clock_t
zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
{
	clock_t timeleft = tim - lbolt;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
		    tim);
		if (timeleft <= 0)
			break;
	}
	mutex_exit(&zone_status_lock);
	return (timeleft);
}

/*
 * Zones have two reference counts: one for references from credential
 * structures (zone_cred_ref), and one (zone_ref) for everything else.
 * This is so we can allow a zone to be rebooted while there are still
 * outstanding cred references, since certain drivers cache dblks (which
 * implicitly results in cached creds).  We wait for zone_ref to drop to
 * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
 * later freed when the zone_cred_ref drops to 0, though nothing other
 * than the zone id and privilege set should be accessed once the zone
 * is "dead".
 *
 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
 * to force halt/reboot to block waiting for the zone_cred_ref to drop
 * to 0.  This can be useful to flush out other sources of cached creds
 * that may be less innocuous than the driver case.
 */

int zone_wait_for_cred = 0;

static void
zone_hold_locked(zone_t *z)
{
	ASSERT(MUTEX_HELD(&z->zone_lock));
	z->zone_ref++;
	ASSERT(z->zone_ref != 0);
}

void
zone_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	zone_hold_locked(z);
	mutex_exit(&z->zone_lock);
}

/*
 * If the non-cred ref count drops to 1 and either the cred ref count
 * is 0 or we aren't waiting for cred references, the zone is ready to
 * be destroyed.
 */
#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
	(!zone_wait_for_cred || (zone)->zone_cred_ref == 0))

void
zone_rele(zone_t *z)
{
	boolean_t wakeup;

	mutex_enter(&z->zone_lock);
	ASSERT(z->zone_ref != 0);
	z->zone_ref--;
	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
		/* no more refs, free the structure */
		mutex_exit(&z->zone_lock);
		zone_free(z);
		return;
	}
	/* signal zone_destroy so the zone can finish halting */
	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
	mutex_exit(&z->zone_lock);

	if (wakeup) {
		/*
		 * Grabbing zonehash_lock here effectively synchronizes with
		 * zone_destroy() to avoid missed signals.
		 */
		mutex_enter(&zonehash_lock);
		cv_broadcast(&zone_destroy_cv);
		mutex_exit(&zonehash_lock);
	}
}

void
zone_cred_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	z->zone_cred_ref++;
	ASSERT(z->zone_cred_ref != 0);
	mutex_exit(&z->zone_lock);
}

void
zone_cred_rele(zone_t *z)
{
	boolean_t wakeup;

	mutex_enter(&z->zone_lock);
	ASSERT(z->zone_cred_ref != 0);
	z->zone_cred_ref--;
	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
		/* no more refs, free the structure */
		mutex_exit(&z->zone_lock);
		zone_free(z);
		return;
	}
	/*
	 * If zone_destroy is waiting for the cred references to drain
	 * out, and they have, signal it.
	 */
	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
	    zone_status_get(z) >= ZONE_IS_DEAD);
	mutex_exit(&z->zone_lock);

	if (wakeup) {
		/*
		 * Grabbing zonehash_lock here effectively synchronizes with
		 * zone_destroy() to avoid missed signals.
		 */
		mutex_enter(&zonehash_lock);
		cv_broadcast(&zone_destroy_cv);
		mutex_exit(&zonehash_lock);
	}
}
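
/*
 * Illustrative sketch: the hold/rele pair is the usual way to keep a
 * zone_t from being freed across a blocking operation.  The
 * my_blocking_work() name is hypothetical.
 */
#if 0
	zone_hold(zone);	/* the zone_t cannot be freed ... */
	my_blocking_work(zone);	/* ... while we sleep here ... */
	zone_rele(zone);	/* ... but may go away after this */
#endif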

void
zone_task_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	z->zone_ntasks++;
	ASSERT(z->zone_ntasks != 0);
	mutex_exit(&z->zone_lock);
}

void
zone_task_rele(zone_t *zone)
{
	uint_t refcnt;

	mutex_enter(&zone->zone_lock);
	ASSERT(zone->zone_ntasks != 0);
	refcnt = --zone->zone_ntasks;
	if (refcnt > 1) {	/* Common case */
		mutex_exit(&zone->zone_lock);
		return;
	}
	zone_hold_locked(zone);	/* so we can use the zone_t later */
	mutex_exit(&zone->zone_lock);
	if (refcnt == 1) {
		/*
		 * See if the zone is shutting down.
		 */
		mutex_enter(&zone_status_lock);
		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
			goto out;
		}

		/*
		 * Make sure the ntasks didn't change since we
		 * dropped zone_lock.
		 */
		mutex_enter(&zone->zone_lock);
		if (refcnt != zone->zone_ntasks) {
			mutex_exit(&zone->zone_lock);
			goto out;
		}
		mutex_exit(&zone->zone_lock);

		/*
		 * No more user processes in the zone.  The zone is empty.
		 */
		zone_status_set(zone, ZONE_IS_EMPTY);
		goto out;
	}

	ASSERT(refcnt == 0);
	/*
	 * zsched has exited; the zone is dead.
	 */
	zone->zone_zsched = NULL;	/* paranoia */
	mutex_enter(&zone_status_lock);
	zone_status_set(zone, ZONE_IS_DEAD);
out:
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
}

zoneid_t
getzoneid(void)
{
	return (curproc->p_zone->zone_id);
}

/*
 * Internal versions of zone_find_by_*().  These don't zone_hold() or
 * check the validity of a zone's state.
 */
static zone_t *
zone_find_all_by_id(zoneid_t zoneid)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	if (mod_hash_find(zonehashbyid,
	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

static zone_t *
zone_find_all_by_label(const ts_label_t *label)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	/*
	 * zonehashbylabel is not maintained for unlabeled systems
	 */
	if (!is_system_labeled())
		return (NULL);
	if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

static zone_t *
zone_find_all_by_name(char *name)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

/*
 * Public interface for looking up a zone by zoneid.  Only returns the zone if
 * it is fully initialized, and has not yet begun the zone_destroy() sequence.
 * Caller must call zone_rele() once it is done with the zone.
 *
 * The zone may begin the zone_destroy() sequence immediately after this
 * function returns, but may be safely used until zone_rele() is called.
 */
zone_t *
zone_find_by_id(zoneid_t zoneid)
{
	zone_t *zone;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	status = zone_status_get(zone);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * For all practical purposes the zone doesn't exist.
		 */
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}
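
/*
 * Illustrative sketch: the standard lookup pattern.  The zone returned
 * by zone_find_by_id() is held, so it remains safe to use until
 * zone_rele() even if zone_destroy() begins in the meantime.
 */
#if 0
	zone_t *zone;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (ESRCH);	/* no such (visible) zone */
	/* ... use zone ... */
	zone_rele(zone);
#endif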

/*
 * Similar to zone_find_by_id, but using zone label as the key.
 */
zone_t *
zone_find_by_label(const ts_label_t *label)
{
	zone_t *zone;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_label(label)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (NULL);
	}

	status = zone_status_get(zone);
	if (status > ZONE_IS_DOWN) {
		/*
		 * For all practical purposes the zone doesn't exist.
		 */
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}

/*
 * Similar to zone_find_by_id, but using zone name as the key.
 */
zone_t *
zone_find_by_name(char *name)
{
	zone_t *zone;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_name(name)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	status = zone_status_get(zone);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * For all practical purposes the zone doesn't exist.
		 */
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}

/*
 * Similar to zone_find_by_id(), using the path as a key.  For instance,
 * if there is a zone "foo" rooted at /foo/root, and the path argument
 * is "/foo/root/proc", it will return the held zone_t corresponding to
 * zone "foo".
 *
 * zone_find_by_path() always returns a non-NULL value, since at the
 * very least every path will be contained in the global zone.
 *
 * As with the other zone_find_by_*() functions, the caller is
 * responsible for zone_rele()ing the return value of this function.
 */
zone_t *
zone_find_by_path(const char *path)
{
	zone_t *zone;
	zone_t *zret = NULL;
	zone_status_t status;

	if (path == NULL) {
		/*
		 * Call from rootconf().
		 */
		zone_hold(global_zone);
		return (global_zone);
	}
	ASSERT(*path == '/');
	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		if (ZONE_PATH_VISIBLE(path, zone))
			zret = zone;
	}
	ASSERT(zret != NULL);
	status = zone_status_get(zret);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * Zone practically doesn't exist.
		 */
		zret = global_zone;
	}
	zone_hold(zret);
	mutex_exit(&zonehash_lock);
	return (zret);
}

/*
 * Get the number of cpus visible to this zone.  The system-wide global
 * 'ncpus' is returned if pools are disabled, the caller is in the
 * global zone, or a NULL zone argument is passed in.
 */
int
zone_ncpus_get(zone_t *zone)
{
	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;

	return (myncpus != 0 ? myncpus : ncpus);
}

/*
 * Get the number of online cpus visible to this zone.  The system-wide
 * global 'ncpus_online' is returned if pools are disabled, the caller
 * is in the global zone, or a NULL zone argument is passed in.
 */
int
zone_ncpus_online_get(zone_t *zone)
{
	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;

	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
}

/*
 * Return the pool to which the zone is currently bound.
 */
pool_t *
zone_pool_get(zone_t *zone)
{
	ASSERT(pool_lock_held());

	return (zone->zone_pool);
}

/*
 * Set the zone's pool pointer and update the zone's visibility to match
 * the resources in the new pool.
1820 */ 1821 void 1822 zone_pool_set(zone_t *zone, pool_t *pool) 1823 { 1824 ASSERT(pool_lock_held()); 1825 ASSERT(MUTEX_HELD(&cpu_lock)); 1826 1827 zone->zone_pool = pool; 1828 zone_pset_set(zone, pool->pool_pset->pset_id); 1829 } 1830 1831 /* 1832 * Return the cached value of the id of the processor set to which the 1833 * zone is currently bound. The value will be ZONE_PS_INVAL if the pools 1834 * facility is disabled. 1835 */ 1836 psetid_t 1837 zone_pset_get(zone_t *zone) 1838 { 1839 ASSERT(MUTEX_HELD(&cpu_lock)); 1840 1841 return (zone->zone_psetid); 1842 } 1843 1844 /* 1845 * Set the cached value of the id of the processor set to which the zone 1846 * is currently bound. Also update the zone's visibility to match the 1847 * resources in the new processor set. 1848 */ 1849 void 1850 zone_pset_set(zone_t *zone, psetid_t newpsetid) 1851 { 1852 psetid_t oldpsetid; 1853 1854 ASSERT(MUTEX_HELD(&cpu_lock)); 1855 oldpsetid = zone_pset_get(zone); 1856 1857 if (oldpsetid == newpsetid) 1858 return; 1859 /* 1860 * Global zone sees all. 1861 */ 1862 if (zone != global_zone) { 1863 zone->zone_psetid = newpsetid; 1864 if (newpsetid != ZONE_PS_INVAL) 1865 pool_pset_visibility_add(newpsetid, zone); 1866 if (oldpsetid != ZONE_PS_INVAL) 1867 pool_pset_visibility_remove(oldpsetid, zone); 1868 } 1869 /* 1870 * Disabling pools, so we should start using the global values 1871 * for ncpus and ncpus_online. 1872 */ 1873 if (newpsetid == ZONE_PS_INVAL) { 1874 zone->zone_ncpus = 0; 1875 zone->zone_ncpus_online = 0; 1876 } 1877 } 1878 1879 /* 1880 * Walk the list of active zones and issue the provided callback for 1881 * each of them. 1882 * 1883 * Caller must not be holding any locks that may be acquired under 1884 * zonehash_lock. See comment at the beginning of the file for a list of 1885 * common locks and their interactions with zones. 1886 */ 1887 int 1888 zone_walk(int (*cb)(zone_t *, void *), void *data) 1889 { 1890 zone_t *zone; 1891 int ret = 0; 1892 zone_status_t status; 1893 1894 mutex_enter(&zonehash_lock); 1895 for (zone = list_head(&zone_active); zone != NULL; 1896 zone = list_next(&zone_active, zone)) { 1897 /* 1898 * Skip zones that shouldn't be externally visible. 1899 */ 1900 status = zone_status_get(zone); 1901 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) 1902 continue; 1903 /* 1904 * Bail immediately if any callback invocation returns a 1905 * non-zero value. 1906 */ 1907 ret = (*cb)(zone, data); 1908 if (ret != 0) 1909 break; 1910 } 1911 mutex_exit(&zonehash_lock); 1912 return (ret); 1913 } 1914 1915 static int 1916 zone_set_root(zone_t *zone, const char *upath) 1917 { 1918 vnode_t *vp; 1919 int trycount; 1920 int error = 0; 1921 char *path; 1922 struct pathname upn, pn; 1923 size_t pathlen; 1924 1925 if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0) 1926 return (error); 1927 1928 pn_alloc(&pn); 1929 1930 /* prevent infinite loop */ 1931 trycount = 10; 1932 for (;;) { 1933 if (--trycount <= 0) { 1934 error = ESTALE; 1935 goto out; 1936 } 1937 1938 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) { 1939 /* 1940 * VOP_ACCESS() may cover 'vp' with a new 1941 * filesystem, if 'vp' is an autoFS vnode. 1942 * Get the new 'vp' if so. 
1943 */ 1944 if ((error = VOP_ACCESS(vp, VEXEC, 0, CRED())) == 0 && 1945 (vp->v_vfsmountedhere == NULL || 1946 (error = traverse(&vp)) == 0)) { 1947 pathlen = pn.pn_pathlen + 2; 1948 path = kmem_alloc(pathlen, KM_SLEEP); 1949 (void) strncpy(path, pn.pn_path, 1950 pn.pn_pathlen + 1); 1951 path[pathlen - 2] = '/'; 1952 path[pathlen - 1] = '\0'; 1953 pn_free(&pn); 1954 pn_free(&upn); 1955 1956 /* Success! */ 1957 break; 1958 } 1959 VN_RELE(vp); 1960 } 1961 if (error != ESTALE) 1962 goto out; 1963 } 1964 1965 ASSERT(error == 0); 1966 zone->zone_rootvp = vp; /* we hold a reference to vp */ 1967 zone->zone_rootpath = path; 1968 zone->zone_rootpathlen = pathlen; 1969 if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0) 1970 zone->zone_flags |= ZF_IS_SCRATCH; 1971 return (0); 1972 1973 out: 1974 pn_free(&pn); 1975 pn_free(&upn); 1976 return (error); 1977 } 1978 1979 #define isalnum(c) (((c) >= '0' && (c) <= '9') || \ 1980 ((c) >= 'a' && (c) <= 'z') || \ 1981 ((c) >= 'A' && (c) <= 'Z')) 1982 1983 static int 1984 zone_set_name(zone_t *zone, const char *uname) 1985 { 1986 char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP); 1987 size_t len; 1988 int i, err; 1989 1990 if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) { 1991 kmem_free(kname, ZONENAME_MAX); 1992 return (err); /* EFAULT or ENAMETOOLONG */ 1993 } 1994 1995 /* must be less than ZONENAME_MAX */ 1996 if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') { 1997 kmem_free(kname, ZONENAME_MAX); 1998 return (EINVAL); 1999 } 2000 2001 /* 2002 * Name must start with an alphanumeric and must contain only 2003 * alphanumerics, '-', '_' and '.'. 2004 */ 2005 if (!isalnum(kname[0])) { 2006 kmem_free(kname, ZONENAME_MAX); 2007 return (EINVAL); 2008 } 2009 for (i = 1; i < len - 1; i++) { 2010 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' && 2011 kname[i] != '.') { 2012 kmem_free(kname, ZONENAME_MAX); 2013 return (EINVAL); 2014 } 2015 } 2016 2017 zone->zone_name = kname; 2018 return (0); 2019 } 2020 2021 /* 2022 * Similar to thread_create(), but makes sure the thread is in the appropriate 2023 * zone's zsched process (curproc->p_zone->zone_zsched) before returning. 2024 */ 2025 /*ARGSUSED*/ 2026 kthread_t * 2027 zthread_create( 2028 caddr_t stk, 2029 size_t stksize, 2030 void (*proc)(), 2031 void *arg, 2032 size_t len, 2033 pri_t pri) 2034 { 2035 kthread_t *t; 2036 zone_t *zone = curproc->p_zone; 2037 proc_t *pp = zone->zone_zsched; 2038 2039 zone_hold(zone); /* Reference to be dropped when thread exits */ 2040 2041 /* 2042 * No-one should be trying to create threads if the zone is shutting 2043 * down and there aren't any kernel threads around. See comment 2044 * in zthread_exit(). 2045 */ 2046 ASSERT(!(zone->zone_kthreads == NULL && 2047 zone_status_get(zone) >= ZONE_IS_EMPTY)); 2048 /* 2049 * Create a thread, but don't let it run until we've finished setting 2050 * things up. 
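 * (The thread is created TS_STOPPED, linked onto zone_kthreads and
 * moved into zsched's project below, and only then made runnable via
 * setrun_locked().) A hypothetical consumer, where my_worker() and
 * my_arg are illustrative names and my_worker() must finish by
 * calling zthread_exit():
 *
 *	t = zthread_create(NULL, 0, my_worker, my_arg, 0, minclsyspri);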
2051 */
2052 t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
2053 ASSERT(t->t_forw == NULL);
2054 mutex_enter(&zone_status_lock);
2055 if (zone->zone_kthreads == NULL) {
2056 t->t_forw = t->t_back = t;
2057 } else {
2058 kthread_t *tx = zone->zone_kthreads;
2059
2060 t->t_forw = tx;
2061 t->t_back = tx->t_back;
2062 tx->t_back->t_forw = t;
2063 tx->t_back = t;
2064 }
2065 zone->zone_kthreads = t;
2066 mutex_exit(&zone_status_lock);
2067
2068 mutex_enter(&pp->p_lock);
2069 t->t_proc_flag |= TP_ZTHREAD;
2070 project_rele(t->t_proj);
2071 t->t_proj = project_hold(pp->p_task->tk_proj);
2072
2073 /*
2074 * Setup complete, let it run.
2075 */
2076 thread_lock(t);
2077 t->t_schedflag |= TS_ALLSTART;
2078 setrun_locked(t);
2079 thread_unlock(t);
2080
2081 mutex_exit(&pp->p_lock);
2082
2083 return (t);
2084 }
2085
2086 /*
2087 * Similar to thread_exit(). Must be called by threads created via
2088 * zthread_create().
2089 */
2090 void
2091 zthread_exit(void)
2092 {
2093 kthread_t *t = curthread;
2094 proc_t *pp = curproc;
2095 zone_t *zone = pp->p_zone;
2096
2097 mutex_enter(&zone_status_lock);
2098
2099 /*
2100 * Reparent to p0
2101 */
2102 kpreempt_disable();
2103 mutex_enter(&pp->p_lock);
2104 t->t_proc_flag &= ~TP_ZTHREAD;
2105 t->t_procp = &p0;
2106 hat_thread_exit(t);
2107 mutex_exit(&pp->p_lock);
2108 kpreempt_enable();
2109
2110 if (t->t_back == t) {
2111 ASSERT(t->t_forw == t);
2112 /*
2113 * If the zone is empty, once the thread count
2114 * goes to zero no further kernel threads can be
2115 * created. This is because if the creator is a process
2116 * in the zone, then it must have exited before the zone
2117 * state could be set to ZONE_IS_EMPTY.
2118 * Otherwise, if the creator is a kernel thread in the
2119 * zone, the thread count is non-zero.
2120 *
2121 * This really means that non-zone kernel threads should
2122 * not create zone kernel threads.
2123 */
2124 zone->zone_kthreads = NULL;
2125 if (zone_status_get(zone) == ZONE_IS_EMPTY) {
2126 zone_status_set(zone, ZONE_IS_DOWN);
2127 }
2128 } else {
2129 t->t_forw->t_back = t->t_back;
2130 t->t_back->t_forw = t->t_forw;
2131 if (zone->zone_kthreads == t)
2132 zone->zone_kthreads = t->t_forw;
2133 }
2134 mutex_exit(&zone_status_lock);
2135 zone_rele(zone);
2136 thread_exit();
2137 /* NOTREACHED */
2138 }
2139
2140 static void
2141 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
2142 {
2143 vnode_t *oldvp;
2144
2145 /* we're going to hold a reference here to the directory */
2146 VN_HOLD(vp);
2147
2148 #ifdef C2_AUDIT
2149 if (audit_active) /* update abs cwd/root path see c2audit.c */
2150 audit_chdirec(vp, vpp);
2151 #endif
2152
2153 mutex_enter(&pp->p_lock);
2154 oldvp = *vpp;
2155 *vpp = vp;
2156 mutex_exit(&pp->p_lock);
2157 if (oldvp != NULL)
2158 VN_RELE(oldvp);
2159 }
2160
2161 /*
2162 * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
2163 */
2164 static int
2165 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
2166 {
2167 nvpair_t *nvp = NULL;
2168 boolean_t priv_set = B_FALSE;
2169 boolean_t limit_set = B_FALSE;
2170 boolean_t action_set = B_FALSE;
2171
2172 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
2173 const char *name;
2174 uint64_t ui64;
2175
2176 name = nvpair_name(nvp);
2177 if (nvpair_type(nvp) != DATA_TYPE_UINT64)
2178 return (EINVAL);
2179 (void) nvpair_value_uint64(nvp, &ui64);
2180 if (strcmp(name, "privilege") == 0) {
2181 /*
2182 * Currently only privileged values are allowed, but
2183 * this may change in the future.
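 * A value nvlist accepted here carries exactly these three uint64
 * pairs (sketch; the limit shown is illustrative):
 *
 *	privilege = RCPRIV_PRIVILEGED
 *	limit     = 100
 *	action    = RCTL_LOCAL_DENY (or RCTL_LOCAL_NOACTION)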
2184 */ 2185 if (ui64 != RCPRIV_PRIVILEGED) 2186 return (EINVAL); 2187 rv->rcv_privilege = ui64; 2188 priv_set = B_TRUE; 2189 } else if (strcmp(name, "limit") == 0) { 2190 rv->rcv_value = ui64; 2191 limit_set = B_TRUE; 2192 } else if (strcmp(name, "action") == 0) { 2193 if (ui64 != RCTL_LOCAL_NOACTION && 2194 ui64 != RCTL_LOCAL_DENY) 2195 return (EINVAL); 2196 rv->rcv_flagaction = ui64; 2197 action_set = B_TRUE; 2198 } else { 2199 return (EINVAL); 2200 } 2201 } 2202 2203 if (!(priv_set && limit_set && action_set)) 2204 return (EINVAL); 2205 rv->rcv_action_signal = 0; 2206 rv->rcv_action_recipient = NULL; 2207 rv->rcv_action_recip_pid = -1; 2208 rv->rcv_firing_time = 0; 2209 2210 return (0); 2211 } 2212 2213 /* 2214 * Non-global zone version of start_init. 2215 */ 2216 void 2217 zone_start_init(void) 2218 { 2219 proc_t *p = ttoproc(curthread); 2220 2221 ASSERT(!INGLOBALZONE(curproc)); 2222 2223 /* 2224 * We maintain zone_boot_err so that we can return the cause of the 2225 * failure back to the caller of the zone_boot syscall. 2226 */ 2227 p->p_zone->zone_boot_err = start_init_common(); 2228 2229 mutex_enter(&zone_status_lock); 2230 if (p->p_zone->zone_boot_err != 0) { 2231 /* 2232 * Make sure we are still in the booting state-- we could have 2233 * raced and already be shutting down, or even further along. 2234 */ 2235 if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING) 2236 zone_status_set(p->p_zone, ZONE_IS_SHUTTING_DOWN); 2237 mutex_exit(&zone_status_lock); 2238 /* It's gone bad, dispose of the process */ 2239 if (proc_exit(CLD_EXITED, p->p_zone->zone_boot_err) != 0) { 2240 mutex_enter(&p->p_lock); 2241 ASSERT(p->p_flag & SEXITLWPS); 2242 lwp_exit(); 2243 } 2244 } else { 2245 if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING) 2246 zone_status_set(p->p_zone, ZONE_IS_RUNNING); 2247 mutex_exit(&zone_status_lock); 2248 /* cause the process to return to userland. */ 2249 lwp_rtt(); 2250 } 2251 } 2252 2253 struct zsched_arg { 2254 zone_t *zone; 2255 nvlist_t *nvlist; 2256 }; 2257 2258 /* 2259 * Per-zone "sched" workalike. The similarity to "sched" doesn't have 2260 * anything to do with scheduling, but rather with the fact that 2261 * per-zone kernel threads are parented to zsched, just like regular 2262 * kernel threads are parented to sched (p0). 2263 * 2264 * zsched is also responsible for launching init for the zone. 2265 */ 2266 static void 2267 zsched(void *arg) 2268 { 2269 struct zsched_arg *za = arg; 2270 proc_t *pp = curproc; 2271 proc_t *initp = proc_init; 2272 zone_t *zone = za->zone; 2273 cred_t *cr, *oldcred; 2274 rctl_set_t *set; 2275 rctl_alloc_gp_t *gp; 2276 contract_t *ct = NULL; 2277 task_t *tk, *oldtk; 2278 rctl_entity_p_t e; 2279 kproject_t *pj; 2280 2281 nvlist_t *nvl = za->nvlist; 2282 nvpair_t *nvp = NULL; 2283 2284 bcopy("zsched", u.u_psargs, sizeof ("zsched")); 2285 bcopy("zsched", u.u_comm, sizeof ("zsched")); 2286 u.u_argc = 0; 2287 u.u_argv = NULL; 2288 u.u_envp = NULL; 2289 closeall(P_FINFO(pp)); 2290 2291 /* 2292 * We are this zone's "zsched" process. As the zone isn't generally 2293 * visible yet we don't need to grab any locks before initializing its 2294 * zone_proc pointer. 2295 */ 2296 zone_hold(zone); /* this hold is released by zone_destroy() */ 2297 zone->zone_zsched = pp; 2298 mutex_enter(&pp->p_lock); 2299 pp->p_zone = zone; 2300 mutex_exit(&pp->p_lock); 2301 2302 /* 2303 * Disassociate process from its 'parent'; parent ourselves to init 2304 * (pid 1) and change other values as needed. 
2305 */
2306 sess_create();
2307
2308 mutex_enter(&pidlock);
2309 proc_detach(pp);
2310 pp->p_ppid = 1;
2311 pp->p_flag |= SZONETOP;
2312 pp->p_ancpid = 1;
2313 pp->p_parent = initp;
2314 pp->p_psibling = NULL;
2315 if (initp->p_child)
2316 initp->p_child->p_psibling = pp;
2317 pp->p_sibling = initp->p_child;
2318 initp->p_child = pp;
2319
2320 /* Decrement what newproc() incremented. */
2321 upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
2322 /*
2323 * Our credentials are about to become kcred-like, so we don't care
2324 * about the caller's ruid.
2325 */
2326 upcount_inc(crgetruid(kcred), zone->zone_id);
2327 mutex_exit(&pidlock);
2328
2329 /*
2330 * getting out of global zone, so decrement lwp counts
2331 */
2332 pj = pp->p_task->tk_proj;
2333 mutex_enter(&global_zone->zone_nlwps_lock);
2334 pj->kpj_nlwps -= pp->p_lwpcnt;
2335 global_zone->zone_nlwps -= pp->p_lwpcnt;
2336 mutex_exit(&global_zone->zone_nlwps_lock);
2337
2338 /*
2339 * Create and join a new task in project '0' of this zone.
2340 *
2341 * We don't need to call holdlwps() since we know we're the only lwp in
2342 * this process.
2343 *
2344 * task_join() returns with p_lock held.
2345 */
2346 tk = task_create(0, zone);
2347 mutex_enter(&cpu_lock);
2348 oldtk = task_join(tk, 0);
2349 mutex_exit(&curproc->p_lock);
2350 mutex_exit(&cpu_lock);
2351 task_rele(oldtk);
2352
2353 /*
2354 * add lwp counts to zsched's zone, and increment the project's task
2355 * count for the task created by task_create()/task_join() above
2356 */
2357 pj = pp->p_task->tk_proj;
2358 mutex_enter(&zone->zone_nlwps_lock);
2359 pj->kpj_nlwps += pp->p_lwpcnt;
2360 pj->kpj_ntasks += 1;
2361 zone->zone_nlwps += pp->p_lwpcnt;
2362 mutex_exit(&zone->zone_nlwps_lock);
2363
2364 /*
2365 * The process was created by a process in the global zone, hence the
2366 * credentials are wrong. We might as well have kcred-ish credentials.
2367 */
2368 cr = zone->zone_kcred;
2369 crhold(cr);
2370 mutex_enter(&pp->p_crlock);
2371 oldcred = pp->p_cred;
2372 pp->p_cred = cr;
2373 mutex_exit(&pp->p_crlock);
2374 crfree(oldcred);
2375
2376 /*
2377 * Hold credentials again (for thread)
2378 */
2379 crhold(cr);
2380
2381 /*
2382 * p_lwpcnt can't change since this is a kernel process.
2383 */
2384 crset(pp, cr);
2385
2386 /*
2387 * Chroot
2388 */
2389 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
2390 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
2391
2392 /*
2393 * Initialize zone's rctl set.
2394 */
2395 set = rctl_set_create();
2396 gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2397 mutex_enter(&pp->p_lock);
2398 e.rcep_p.zone = zone;
2399 e.rcep_t = RCENTITY_ZONE;
2400 zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
2401 mutex_exit(&pp->p_lock);
2402 rctl_prealloc_destroy(gp);
2403
2404 /*
2405 * Apply the rctls passed in to zone_create(). This is basically a list
2406 * assignment: all of the old values are removed and the new ones
2407 * inserted. That is, if an empty list is passed in, all values are
2408 * removed.
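 *
 * For example (hypothetical): an entry for "zone.cpu-shares" whose
 * value array is empty deletes every privileged value on that rctl
 * and inserts nothing; system values (RCPRIV_SYSTEM) always survive,
 * and rctls not named in the nvlist are left untouched.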
2409 */ 2410 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 2411 rctl_dict_entry_t *rde; 2412 rctl_hndl_t hndl; 2413 char *name; 2414 nvlist_t **nvlarray; 2415 uint_t i, nelem; 2416 int error; /* For ASSERT()s */ 2417 2418 name = nvpair_name(nvp); 2419 hndl = rctl_hndl_lookup(name); 2420 ASSERT(hndl != -1); 2421 rde = rctl_dict_lookup_hndl(hndl); 2422 ASSERT(rde != NULL); 2423 2424 for (; /* ever */; ) { 2425 rctl_val_t oval; 2426 2427 mutex_enter(&pp->p_lock); 2428 error = rctl_local_get(hndl, NULL, &oval, pp); 2429 mutex_exit(&pp->p_lock); 2430 ASSERT(error == 0); /* Can't fail for RCTL_FIRST */ 2431 ASSERT(oval.rcv_privilege != RCPRIV_BASIC); 2432 if (oval.rcv_privilege == RCPRIV_SYSTEM) 2433 break; 2434 mutex_enter(&pp->p_lock); 2435 error = rctl_local_delete(hndl, &oval, pp); 2436 mutex_exit(&pp->p_lock); 2437 ASSERT(error == 0); 2438 } 2439 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 2440 ASSERT(error == 0); 2441 for (i = 0; i < nelem; i++) { 2442 rctl_val_t *nvalp; 2443 2444 nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); 2445 error = nvlist2rctlval(nvlarray[i], nvalp); 2446 ASSERT(error == 0); 2447 /* 2448 * rctl_local_insert can fail if the value being 2449 * inserted is a duplicate; this is OK. 2450 */ 2451 mutex_enter(&pp->p_lock); 2452 if (rctl_local_insert(hndl, nvalp, pp) != 0) 2453 kmem_cache_free(rctl_val_cache, nvalp); 2454 mutex_exit(&pp->p_lock); 2455 } 2456 } 2457 /* 2458 * Tell the world that we're done setting up. 2459 * 2460 * At this point we want to set the zone status to ZONE_IS_READY 2461 * and atomically set the zone's processor set visibility. Once 2462 * we drop pool_lock() this zone will automatically get updated 2463 * to reflect any future changes to the pools configuration. 2464 */ 2465 pool_lock(); 2466 mutex_enter(&cpu_lock); 2467 mutex_enter(&zonehash_lock); 2468 zone_uniqid(zone); 2469 zone_zsd_configure(zone); 2470 if (pool_state == POOL_ENABLED) 2471 zone_pset_set(zone, pool_default->pool_pset->pset_id); 2472 mutex_enter(&zone_status_lock); 2473 ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED); 2474 zone_status_set(zone, ZONE_IS_READY); 2475 mutex_exit(&zone_status_lock); 2476 mutex_exit(&zonehash_lock); 2477 mutex_exit(&cpu_lock); 2478 pool_unlock(); 2479 2480 /* 2481 * Once we see the zone transition to the ZONE_IS_BOOTING state, 2482 * we launch init, and set the state to running. 2483 */ 2484 zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched"); 2485 2486 if (zone_status_get(zone) == ZONE_IS_BOOTING) { 2487 id_t cid; 2488 2489 /* 2490 * Ok, this is a little complicated. We need to grab the 2491 * zone's pool's scheduling class ID; note that by now, we 2492 * are already bound to a pool if we need to be (zoneadmd 2493 * will have done that to us while we're in the READY 2494 * state). *But* the scheduling class for the zone's 'init' 2495 * must be explicitly passed to newproc, which doesn't 2496 * respect pool bindings. 2497 * 2498 * We hold the pool_lock across the call to newproc() to 2499 * close the obvious race: the pool's scheduling class 2500 * could change before we manage to create the LWP with 2501 * classid 'cid'. 2502 */ 2503 pool_lock(); 2504 cid = pool_get_class(zone->zone_pool); 2505 if (cid == -1) 2506 cid = defaultcid; 2507 2508 /* 2509 * If this fails, zone_boot will ultimately fail. The 2510 * state of the zone will be set to SHUTTING_DOWN-- userland 2511 * will have to tear down the zone, and fail, or try again. 
2512 */
2513 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
2514 minclsyspri - 1, &ct)) != 0) {
2515 mutex_enter(&zone_status_lock);
2516 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
2517 mutex_exit(&zone_status_lock);
2518 }
2519 pool_unlock();
2520 }
2521
2522 /*
2523 * Wait for zone_destroy() to be called. This is what we spend
2524 * most of our life doing.
2525 */
2526 zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
2527
2528 if (ct)
2529 /*
2530 * At this point the process contract should be empty.
2531 * (Though if it isn't, it's not the end of the world.)
2532 */
2533 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
2534
2535 /*
2536 * Allow kcred to be freed when all referring processes
2537 * (including this one) go away. We can't just do this in
2538 * zone_free because we need to wait for the zone_cred_ref to
2539 * drop to 0 before calling zone_free, and the existence of
2540 * zone_kcred will prevent that. Thus, we call crfree here to
2541 * balance the crdup in zone_create. The crhold calls earlier
2542 * in zsched will be dropped when the thread and process exit.
2543 */
2544 crfree(zone->zone_kcred);
2545 zone->zone_kcred = NULL;
2546
2547 exit(CLD_EXITED, 0);
2548 }
2549
2550 /*
2551 * Helper function to determine if there are any submounts of the
2552 * provided path. Used to make sure the zone doesn't "inherit" any
2553 * mounts from before it is created.
2554 */
2555 static uint_t
2556 zone_mount_count(const char *rootpath)
2557 {
2558 vfs_t *vfsp;
2559 uint_t count = 0;
2560 size_t rootpathlen = strlen(rootpath);
2561
2562 /*
2563 * Holding zonehash_lock prevents race conditions with
2564 * vfs_list_add()/vfs_list_remove() since we serialize with
2565 * zone_find_by_path().
2566 */
2567 ASSERT(MUTEX_HELD(&zonehash_lock));
2568 /*
2569 * The rootpath must end with a '/'
2570 */
2571 ASSERT(rootpath[rootpathlen - 1] == '/');
2572
2573 /*
2574 * This intentionally does not count the rootpath itself if that
2575 * happens to be a mount point.
2576 */
2577 vfs_list_read_lock();
2578 vfsp = rootvfs;
2579 do {
2580 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
2581 rootpathlen) == 0)
2582 count++;
2583 vfsp = vfsp->vfs_next;
2584 } while (vfsp != rootvfs);
2585 vfs_list_unlock();
2586 return (count);
2587 }
2588
2589 /*
2590 * Helper function to make sure that a zone created on 'rootpath'
2591 * wouldn't end up containing other zones' rootpaths.
2592 */
2593 static boolean_t
2594 zone_is_nested(const char *rootpath)
2595 {
2596 zone_t *zone;
2597 size_t rootpathlen = strlen(rootpath);
2598 size_t len;
2599
2600 ASSERT(MUTEX_HELD(&zonehash_lock));
2601
2602 for (zone = list_head(&zone_active); zone != NULL;
2603 zone = list_next(&zone_active, zone)) {
2604 if (zone == global_zone)
2605 continue;
2606 len = strlen(zone->zone_rootpath);
2607 if (strncmp(rootpath, zone->zone_rootpath,
2608 MIN(rootpathlen, len)) == 0)
2609 return (B_TRUE);
2610 }
2611 return (B_FALSE);
2612 }
2613
2614 static int
2615 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
2616 size_t zone_privssz)
2617 {
2618 priv_set_t *privs;
2619
2620 if (zone_privssz < sizeof (priv_set_t))
2621 return (ENOMEM);
2622 privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2623 if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
2624 kmem_free(privs, sizeof (priv_set_t));
2625 return (EFAULT);
2626 }
2627
2628 zone->zone_privset = privs;
2629 return (0);
2630 }
2631
2632 /*
2633 * We make creative use of nvlists to pass in rctls from userland.
The list is 2634 * a list of the following structures: 2635 * 2636 * (name = rctl_name, value = nvpair_list_array) 2637 * 2638 * Where each element of the nvpair_list_array is of the form: 2639 * 2640 * [(name = "privilege", value = RCPRIV_PRIVILEGED), 2641 * (name = "limit", value = uint64_t), 2642 * (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))] 2643 */ 2644 static int 2645 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) 2646 { 2647 nvpair_t *nvp = NULL; 2648 nvlist_t *nvl = NULL; 2649 char *kbuf; 2650 int error; 2651 rctl_val_t rv; 2652 2653 *nvlp = NULL; 2654 2655 if (buflen == 0) 2656 return (0); 2657 2658 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 2659 return (ENOMEM); 2660 if (copyin(ubuf, kbuf, buflen)) { 2661 error = EFAULT; 2662 goto out; 2663 } 2664 if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) { 2665 /* 2666 * nvl may have been allocated/free'd, but the value set to 2667 * non-NULL, so we reset it here. 2668 */ 2669 nvl = NULL; 2670 error = EINVAL; 2671 goto out; 2672 } 2673 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 2674 rctl_dict_entry_t *rde; 2675 rctl_hndl_t hndl; 2676 nvlist_t **nvlarray; 2677 uint_t i, nelem; 2678 char *name; 2679 2680 error = EINVAL; 2681 name = nvpair_name(nvp); 2682 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) 2683 != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { 2684 goto out; 2685 } 2686 if ((hndl = rctl_hndl_lookup(name)) == -1) { 2687 goto out; 2688 } 2689 rde = rctl_dict_lookup_hndl(hndl); 2690 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 2691 ASSERT(error == 0); 2692 for (i = 0; i < nelem; i++) { 2693 if (error = nvlist2rctlval(nvlarray[i], &rv)) 2694 goto out; 2695 } 2696 if (rctl_invalid_value(rde, &rv)) { 2697 error = EINVAL; 2698 goto out; 2699 } 2700 } 2701 error = 0; 2702 *nvlp = nvl; 2703 out: 2704 kmem_free(kbuf, buflen); 2705 if (error && nvl != NULL) 2706 nvlist_free(nvl); 2707 return (error); 2708 } 2709 2710 int 2711 zone_create_error(int er_error, int er_ext, int *er_out) { 2712 if (er_out != NULL) { 2713 if (copyout(&er_ext, er_out, sizeof (int))) { 2714 return (set_errno(EFAULT)); 2715 } 2716 } 2717 return (set_errno(er_error)); 2718 } 2719 2720 static int 2721 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi) 2722 { 2723 ts_label_t *tsl; 2724 bslabel_t blab; 2725 2726 /* Get label from user */ 2727 if (copyin(lab, &blab, sizeof (blab)) != 0) 2728 return (EFAULT); 2729 tsl = labelalloc(&blab, doi, KM_NOSLEEP); 2730 if (tsl == NULL) 2731 return (ENOMEM); 2732 2733 zone->zone_slabel = tsl; 2734 return (0); 2735 } 2736 2737 /* 2738 * Parses a comma-separated list of ZFS datasets into a per-zone dictionary. 
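 *
 * For example, a buffer of "tank/zones/foo,tank/shared" (hypothetical
 * dataset names) yields two zone_dataset_t entries on zone_datasets:
 * "tank/zones/foo" and "tank/shared".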
2739 */ 2740 static int 2741 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen) 2742 { 2743 char *kbuf; 2744 char *dataset, *next; 2745 zone_dataset_t *zd; 2746 size_t len; 2747 2748 if (ubuf == NULL || buflen == 0) 2749 return (0); 2750 2751 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 2752 return (ENOMEM); 2753 2754 if (copyin(ubuf, kbuf, buflen) != 0) { 2755 kmem_free(kbuf, buflen); 2756 return (EFAULT); 2757 } 2758 2759 dataset = next = kbuf; 2760 for (;;) { 2761 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP); 2762 2763 next = strchr(dataset, ','); 2764 2765 if (next == NULL) 2766 len = strlen(dataset); 2767 else 2768 len = next - dataset; 2769 2770 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP); 2771 bcopy(dataset, zd->zd_dataset, len); 2772 zd->zd_dataset[len] = '\0'; 2773 2774 list_insert_head(&zone->zone_datasets, zd); 2775 2776 if (next == NULL) 2777 break; 2778 2779 dataset = next + 1; 2780 } 2781 2782 kmem_free(kbuf, buflen); 2783 return (0); 2784 } 2785 2786 /* 2787 * System call to create/initialize a new zone named 'zone_name', rooted 2788 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs', 2789 * and initialized with the zone-wide rctls described in 'rctlbuf', and 2790 * with labeling set by 'match', 'doi', and 'label'. 2791 * 2792 * If extended error is non-null, we may use it to return more detailed 2793 * error information. 2794 */ 2795 static zoneid_t 2796 zone_create(const char *zone_name, const char *zone_root, 2797 const priv_set_t *zone_privs, size_t zone_privssz, 2798 caddr_t rctlbuf, size_t rctlbufsz, 2799 caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, 2800 int match, uint32_t doi, const bslabel_t *label) 2801 { 2802 struct zsched_arg zarg; 2803 nvlist_t *rctls = NULL; 2804 proc_t *pp = curproc; 2805 zone_t *zone, *ztmp; 2806 zoneid_t zoneid; 2807 int error; 2808 int error2 = 0; 2809 char *str; 2810 cred_t *zkcr; 2811 boolean_t insert_label_hash; 2812 2813 if (secpolicy_zone_config(CRED()) != 0) 2814 return (set_errno(EPERM)); 2815 2816 /* can't boot zone from within chroot environment */ 2817 if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir) 2818 return (zone_create_error(ENOTSUP, ZE_CHROOTED, 2819 extended_error)); 2820 2821 zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); 2822 zoneid = zone->zone_id = id_alloc(zoneid_space); 2823 zone->zone_status = ZONE_IS_UNINITIALIZED; 2824 zone->zone_pool = pool_default; 2825 zone->zone_pool_mod = gethrtime(); 2826 zone->zone_psetid = ZONE_PS_INVAL; 2827 zone->zone_ncpus = 0; 2828 zone->zone_ncpus_online = 0; 2829 mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); 2830 mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); 2831 cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); 2832 list_create(&zone->zone_zsd, sizeof (struct zsd_entry), 2833 offsetof(struct zsd_entry, zsd_linkage)); 2834 list_create(&zone->zone_datasets, sizeof (zone_dataset_t), 2835 offsetof(zone_dataset_t, zd_linkage)); 2836 rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL); 2837 2838 if ((error = zone_set_name(zone, zone_name)) != 0) { 2839 zone_free(zone); 2840 return (zone_create_error(error, 0, extended_error)); 2841 } 2842 2843 if ((error = zone_set_root(zone, zone_root)) != 0) { 2844 zone_free(zone); 2845 return (zone_create_error(error, 0, extended_error)); 2846 } 2847 if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) { 2848 zone_free(zone); 2849 return (zone_create_error(error, 0, extended_error)); 2850 } 2851 2852 /* initialize node name to be the same as 
zone name */
2853 zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
2854 (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
2855 zone->zone_nodename[_SYS_NMLN - 1] = '\0';
2856
2857 zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
2858 zone->zone_domain[0] = '\0';
2859 zone->zone_shares = 1;
2860 zone->zone_bootargs = NULL;
2861 zone->zone_initname =
2862 kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
2863 (void) strcpy(zone->zone_initname, zone_default_initname);
2864
2865 /*
2866 * Zsched initializes the rctls.
2867 */
2868 zone->zone_rctls = NULL;
2869
2870 if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
2871 zone_free(zone);
2872 return (zone_create_error(error, 0, extended_error));
2873 }
2874
2875 if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
2876 zone_free(zone);
2877 return (set_errno(error));
2878 }
2879
2880 /*
2881 * Read in the trusted system parameters:
2882 * match flag and sensitivity label.
2883 */
2884 zone->zone_match = match;
2885 if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
2886 error = zone_set_label(zone, label, doi);
2887 if (error != 0) {
2888 zone_free(zone);
2889 return (set_errno(error));
2890 }
2891 insert_label_hash = B_TRUE;
2892 } else {
2893 /* all zones get an admin_low label if system is not labeled */
2894 zone->zone_slabel = l_admin_low;
2895 label_hold(l_admin_low);
2896 insert_label_hash = B_FALSE;
2897 }
2898
2899 /*
2900 * Stop all lwps since that's what normally happens as part of fork().
2901 * This needs to happen before we grab any locks to avoid deadlock
2902 * (another lwp in the process could be waiting for the held lock).
2903 */
2904 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
2905 zone_free(zone);
2906 if (rctls)
2907 nvlist_free(rctls);
2908 return (zone_create_error(EINTR, 0, extended_error));
2909 }
2910
2911 if (block_mounts() == 0) {
2912 mutex_enter(&pp->p_lock);
2913 if (curthread != pp->p_agenttp)
2914 continuelwps(pp);
2915 mutex_exit(&pp->p_lock);
2916 zone_free(zone);
2917 if (rctls)
2918 nvlist_free(rctls);
2919 return (zone_create_error(EINTR, 0, extended_error));
2920 }
2921
2922 /*
2923 * Set up credential for kernel access. After this, any errors
2924 * should go through the dance in errout rather than calling
2925 * zone_free directly.
2926 */
2927 zone->zone_kcred = crdup(kcred);
2928 crsetzone(zone->zone_kcred, zone);
2929 priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
2930 priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
2931 priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
2932 priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
2933
2934 mutex_enter(&zonehash_lock);
2935 /*
2936 * Make sure zone doesn't already exist.
2937 *
2938 * If the system and zone are labeled,
2939 * make sure no other zone exists that has the same label.
2940 */
2941 if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
2942 (insert_label_hash &&
2943 (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
2944 zone_status_t status;
2945
2946 status = zone_status_get(ztmp);
2947 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
2948 error = EEXIST;
2949 else
2950 error = EBUSY;
2951 goto errout;
2952 }
2953
2954 /*
2955 * Don't allow zone creations which would cause one zone's rootpath to
2956 * be accessible from that of another (non-global) zone.
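 *
 * For example (hypothetical paths): with an existing zone rooted at
 * /export/foo/root, attempts to create a new zone rooted at either
 * /export/foo/root/sub or /export/foo fail zone_is_nested() below
 * and return EBUSY.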
2957 */ 2958 if (zone_is_nested(zone->zone_rootpath)) { 2959 error = EBUSY; 2960 goto errout; 2961 } 2962 2963 ASSERT(zonecount != 0); /* check for leaks */ 2964 if (zonecount + 1 > maxzones) { 2965 error = ENOMEM; 2966 goto errout; 2967 } 2968 2969 if (zone_mount_count(zone->zone_rootpath) != 0) { 2970 error = EBUSY; 2971 error2 = ZE_AREMOUNTS; 2972 goto errout; 2973 } 2974 2975 /* 2976 * Zone is still incomplete, but we need to drop all locks while 2977 * zsched() initializes this zone's kernel process. We 2978 * optimistically add the zone to the hashtable and associated 2979 * lists so a parallel zone_create() doesn't try to create the 2980 * same zone. 2981 */ 2982 zonecount++; 2983 (void) mod_hash_insert(zonehashbyid, 2984 (mod_hash_key_t)(uintptr_t)zone->zone_id, 2985 (mod_hash_val_t)(uintptr_t)zone); 2986 str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP); 2987 (void) strcpy(str, zone->zone_name); 2988 (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str, 2989 (mod_hash_val_t)(uintptr_t)zone); 2990 if (insert_label_hash) { 2991 (void) mod_hash_insert(zonehashbylabel, 2992 (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone); 2993 zone->zone_flags |= ZF_HASHED_LABEL; 2994 } 2995 2996 /* 2997 * Insert into active list. At this point there are no 'hold's 2998 * on the zone, but everyone else knows not to use it, so we can 2999 * continue to use it. zsched() will do a zone_hold() if the 3000 * newproc() is successful. 3001 */ 3002 list_insert_tail(&zone_active, zone); 3003 mutex_exit(&zonehash_lock); 3004 3005 zarg.zone = zone; 3006 zarg.nvlist = rctls; 3007 /* 3008 * The process, task, and project rctls are probably wrong; 3009 * we need an interface to get the default values of all rctls, 3010 * and initialize zsched appropriately. I'm not sure that that 3011 * makes much of a difference, though. 3012 */ 3013 if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) { 3014 /* 3015 * We need to undo all globally visible state. 3016 */ 3017 mutex_enter(&zonehash_lock); 3018 list_remove(&zone_active, zone); 3019 if (zone->zone_flags & ZF_HASHED_LABEL) { 3020 ASSERT(zone->zone_slabel != NULL); 3021 (void) mod_hash_destroy(zonehashbylabel, 3022 (mod_hash_key_t)zone->zone_slabel); 3023 } 3024 (void) mod_hash_destroy(zonehashbyname, 3025 (mod_hash_key_t)(uintptr_t)zone->zone_name); 3026 (void) mod_hash_destroy(zonehashbyid, 3027 (mod_hash_key_t)(uintptr_t)zone->zone_id); 3028 ASSERT(zonecount > 1); 3029 zonecount--; 3030 goto errout; 3031 } 3032 3033 /* 3034 * Zone creation can't fail from now on. 3035 */ 3036 3037 /* 3038 * Let the other lwps continue. 3039 */ 3040 mutex_enter(&pp->p_lock); 3041 if (curthread != pp->p_agenttp) 3042 continuelwps(pp); 3043 mutex_exit(&pp->p_lock); 3044 3045 /* 3046 * Wait for zsched to finish initializing the zone. 3047 */ 3048 zone_status_wait(zone, ZONE_IS_READY); 3049 /* 3050 * The zone is fully visible, so we can let mounts progress. 3051 */ 3052 resume_mounts(); 3053 if (rctls) 3054 nvlist_free(rctls); 3055 3056 return (zoneid); 3057 3058 errout: 3059 mutex_exit(&zonehash_lock); 3060 /* 3061 * Let the other lwps continue. 3062 */ 3063 mutex_enter(&pp->p_lock); 3064 if (curthread != pp->p_agenttp) 3065 continuelwps(pp); 3066 mutex_exit(&pp->p_lock); 3067 3068 resume_mounts(); 3069 if (rctls) 3070 nvlist_free(rctls); 3071 /* 3072 * There is currently one reference to the zone, a cred_ref from 3073 * zone_kcred. To free the zone, we call crfree, which will call 3074 * zone_cred_rele, which will call zone_free. 
3075 */
3076 ASSERT(zone->zone_cred_ref == 1); /* for zone_kcred */
3077 ASSERT(zone->zone_kcred->cr_ref == 1);
3078 ASSERT(zone->zone_ref == 0);
3079 zkcr = zone->zone_kcred;
3080 zone->zone_kcred = NULL;
3081 crfree(zkcr); /* triggers call to zone_free */
3082 return (zone_create_error(error, error2, extended_error));
3083 }
3084
3085 /*
3086 * Cause the zone to boot. This is pretty simple, since we let zoneadmd do
3087 * the heavy lifting. The program launched at the "top" of the zone is
3088 * named by zone_initname (settable via ZONE_ATTR_INITNAME), which
3089 * defaults to zone_default_initname.
3090 */
3091 static int
3092 zone_boot(zoneid_t zoneid)
3093 {
3094 int err;
3095 zone_t *zone;
3096
3097 if (secpolicy_zone_config(CRED()) != 0)
3098 return (set_errno(EPERM));
3099 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3100 return (set_errno(EINVAL));
3101
3102 mutex_enter(&zonehash_lock);
3103 /*
3104 * Look for zone under hash lock to prevent races with calls to
3105 * zone_shutdown, zone_destroy, etc.
3106 */
3107 if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3108 mutex_exit(&zonehash_lock);
3109 return (set_errno(EINVAL));
3110 }
3111
3112 mutex_enter(&zone_status_lock);
3113 if (zone_status_get(zone) != ZONE_IS_READY) {
3114 mutex_exit(&zone_status_lock);
3115 mutex_exit(&zonehash_lock);
3116 return (set_errno(EINVAL));
3117 }
3118 zone_status_set(zone, ZONE_IS_BOOTING);
3119 mutex_exit(&zone_status_lock);
3120
3121 zone_hold(zone); /* so we can use the zone_t later */
3122 mutex_exit(&zonehash_lock);
3123
3124 if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
3125 zone_rele(zone);
3126 return (set_errno(EINTR));
3127 }
3128
3129 /*
3130 * Boot (starting init) might have failed, in which case the zone
3131 * will go to the SHUTTING_DOWN state; an appropriate errno will
3132 * be placed in zone->zone_boot_err, and so we return that.
3133 */
3134 err = zone->zone_boot_err;
3135 zone_rele(zone);
3136 return (err ? set_errno(err) : 0);
3137 }
3138
3139 /*
3140 * Kills all user processes in the zone, waiting for them all to exit
3141 * before returning.
3142 */
3143 static int
3144 zone_empty(zone_t *zone)
3145 {
3146 int waitstatus;
3147
3148 /*
3149 * We need to drop zonehash_lock before killing all
3150 * processes, otherwise we'll deadlock with zone_find_*
3151 * which can be called from the exit path.
3152 */
3153 ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
3154 while ((waitstatus = zone_status_timedwait_sig(zone, lbolt + hz,
3155 ZONE_IS_EMPTY)) == -1) {
3156 killall(zone->zone_id);
3157 }
3158 /*
3159 * return EINTR if we were signaled
3160 */
3161 if (waitstatus == 0)
3162 return (EINTR);
3163 return (0);
3164 }
3165
3166 /*
3167 * This function implements the policy for zone visibility.
3168 *
3169 * In standard Solaris, a non-global zone can only see itself.
3170 *
3171 * In Trusted Extensions, a labeled zone can look up any zone whose label
3172 * it dominates. For this test, the label of the global zone is treated as
3173 * admin_high so it is special-cased instead of being checked for dominance.
3174 *
3175 * Returns true if zone attributes are viewable, false otherwise.
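 *
 * For example (hypothetical labels): on a labeled system a zone at
 * label "CONFIDENTIAL" may look up a zone at "PUBLIC" if the former
 * dominates the latter, but not vice versa; on an unlabeled system
 * this reduces to "the global zone sees all, other zones see only
 * themselves".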
3176 */
3177 static boolean_t
3178 zone_list_access(zone_t *zone)
3179 {
3180
3181 if (curproc->p_zone == global_zone ||
3182 curproc->p_zone == zone) {
3183 return (B_TRUE);
3184 } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
3185 bslabel_t *curproc_label;
3186 bslabel_t *zone_label;
3187
3188 curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
3189 zone_label = label2bslabel(zone->zone_slabel);
3190
3191 if (zone->zone_id != GLOBAL_ZONEID &&
3192 bldominates(curproc_label, zone_label)) {
3193 return (B_TRUE);
3194 } else {
3195 return (B_FALSE);
3196 }
3197 } else {
3198 return (B_FALSE);
3199 }
3200 }
3201
3202 /*
3203 * Systemcall to start the zone's halt sequence. By the time this
3204 * function successfully returns, all user processes and kernel threads
3205 * executing in it will have exited, ZSD shutdown callbacks executed,
3206 * and the zone status set to ZONE_IS_DOWN.
3207 *
3208 * It is possible that the call will interrupt itself if the caller is the
3209 * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
3210 */
3211 static int
3212 zone_shutdown(zoneid_t zoneid)
3213 {
3214 int error;
3215 zone_t *zone;
3216 zone_status_t status;
3217
3218 if (secpolicy_zone_config(CRED()) != 0)
3219 return (set_errno(EPERM));
3220 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3221 return (set_errno(EINVAL));
3222
3223 /*
3224 * Block mounts so that VFS_MOUNT() can get an accurate view of
3225 * the zone's status with regard to ZONE_IS_SHUTTING_DOWN.
3226 *
3227 * e.g. NFS can fail the mount if it determines that the zone
3228 * has already begun the shutdown sequence.
3229 */
3230 if (block_mounts() == 0)
3231 return (set_errno(EINTR));
3232 mutex_enter(&zonehash_lock);
3233 /*
3234 * Look for zone under hash lock to prevent races with other
3235 * calls to zone_shutdown and zone_destroy.
3236 */
3237 if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3238 mutex_exit(&zonehash_lock);
3239 resume_mounts();
3240 return (set_errno(EINVAL));
3241 }
3242 mutex_enter(&zone_status_lock);
3243 status = zone_status_get(zone);
3244 /*
3245 * Fail if the zone isn't fully initialized yet.
3246 */
3247 if (status < ZONE_IS_READY) {
3248 mutex_exit(&zone_status_lock);
3249 mutex_exit(&zonehash_lock);
3250 resume_mounts();
3251 return (set_errno(EINVAL));
3252 }
3253 /*
3254 * If conditions required for zone_shutdown() to return have been met,
3255 * return success.
3256 */
3257 if (status >= ZONE_IS_DOWN) {
3258 mutex_exit(&zone_status_lock);
3259 mutex_exit(&zonehash_lock);
3260 resume_mounts();
3261 return (0);
3262 }
3263 /*
3264 * If zone_shutdown() hasn't been called before, go through the motions.
3265 * If it has, there's nothing to do but wait for the kernel threads to
3266 * drain.
3267 */
3268 if (status < ZONE_IS_EMPTY) {
3269 uint_t ntasks;
3270
3271 mutex_enter(&zone->zone_lock);
3272 if ((ntasks = zone->zone_ntasks) != 1) {
3273 /*
3274 * There's still stuff running.
3275 */
3276 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
3277 }
3278 mutex_exit(&zone->zone_lock);
3279 if (ntasks == 1) {
3280 /*
3281 * Only zsched's own task remains, so there are no user
3282 * processes left; the zone is empty. The only way to create
3283 * another task is through zone_enter(), which will block until we drop zonehash_lock.
3284 */
3285 if (zone->zone_kthreads == NULL) {
3286 /*
3287 * Skip ahead to ZONE_IS_DOWN
3288 */
3289 zone_status_set(zone, ZONE_IS_DOWN);
3290 } else {
3291 zone_status_set(zone, ZONE_IS_EMPTY);
3292 }
3293 }
3294 }
3295 zone_hold(zone); /* so we can use the zone_t later */
3296 mutex_exit(&zone_status_lock);
3297 mutex_exit(&zonehash_lock);
3298 resume_mounts();
3299
3300 if (error = zone_empty(zone)) {
3301 zone_rele(zone);
3302 return (set_errno(error));
3303 }
3304 /*
3305 * After the zone status goes to ZONE_IS_DOWN this zone will no
3306 * longer be notified of changes to the pools configuration, so
3307 * in order to not end up with a stale pool pointer, we point
3308 * ourselves at the default pool and remove all resource
3309 * visibility. This is especially important as the zone_t may
3310 * languish on the deathrow for a very long time waiting for
3311 * cred's to drain out.
3312 *
3313 * This rebinding of the zone can happen multiple times
3314 * (presumably due to interrupted or parallel systemcalls)
3315 * without any adverse effects.
3316 */
3317 if (pool_lock_intr() != 0) {
3318 zone_rele(zone);
3319 return (set_errno(EINTR));
3320 }
3321 if (pool_state == POOL_ENABLED) {
3322 mutex_enter(&cpu_lock);
3323 zone_pool_set(zone, pool_default);
3324 /*
3325 * The zone no longer needs to be able to see any cpus.
3326 */
3327 zone_pset_set(zone, ZONE_PS_INVAL);
3328 mutex_exit(&cpu_lock);
3329 }
3330 pool_unlock();
3331
3332 /*
3333 * ZSD shutdown callbacks can be executed multiple times, hence
3334 * it is safe to not be holding any locks across this call.
3335 */
3336 zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
3337
3338 mutex_enter(&zone_status_lock);
3339 if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
3340 zone_status_set(zone, ZONE_IS_DOWN);
3341 mutex_exit(&zone_status_lock);
3342
3343 /*
3344 * Wait for kernel threads to drain.
3345 */
3346 if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
3347 zone_rele(zone);
3348 return (set_errno(EINTR));
3349 }
3350 zone_rele(zone);
3351 return (0);
3352 }
3353
3354 /*
3355 * Systemcall entry point to finalize the zone halt process. The caller
3356 * must have already successfully called zone_shutdown().
3357 *
3358 * Upon successful completion, the zone will have been fully destroyed:
3359 * zsched will have exited, destructor callbacks executed, and the zone
3360 * removed from the list of active zones.
3361 */
3362 static int
3363 zone_destroy(zoneid_t zoneid)
3364 {
3365 uint64_t uniqid;
3366 zone_t *zone;
3367 zone_status_t status;
3368
3369 if (secpolicy_zone_config(CRED()) != 0)
3370 return (set_errno(EPERM));
3371 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3372 return (set_errno(EINVAL));
3373
3374 mutex_enter(&zonehash_lock);
3375 /*
3376 * Look for zone under hash lock to prevent races with other
3377 * calls to zone_destroy.
3378 */ 3379 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 3380 mutex_exit(&zonehash_lock); 3381 return (set_errno(EINVAL)); 3382 } 3383 3384 if (zone_mount_count(zone->zone_rootpath) != 0) { 3385 mutex_exit(&zonehash_lock); 3386 return (set_errno(EBUSY)); 3387 } 3388 mutex_enter(&zone_status_lock); 3389 status = zone_status_get(zone); 3390 if (status < ZONE_IS_DOWN) { 3391 mutex_exit(&zone_status_lock); 3392 mutex_exit(&zonehash_lock); 3393 return (set_errno(EBUSY)); 3394 } else if (status == ZONE_IS_DOWN) { 3395 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */ 3396 } 3397 mutex_exit(&zone_status_lock); 3398 zone_hold(zone); 3399 mutex_exit(&zonehash_lock); 3400 3401 /* 3402 * wait for zsched to exit 3403 */ 3404 zone_status_wait(zone, ZONE_IS_DEAD); 3405 zone_zsd_callbacks(zone, ZSD_DESTROY); 3406 uniqid = zone->zone_uniqid; 3407 zone_rele(zone); 3408 zone = NULL; /* potentially free'd */ 3409 3410 mutex_enter(&zonehash_lock); 3411 for (; /* ever */; ) { 3412 boolean_t unref; 3413 3414 if ((zone = zone_find_all_by_id(zoneid)) == NULL || 3415 zone->zone_uniqid != uniqid) { 3416 /* 3417 * The zone has gone away. Necessary conditions 3418 * are met, so we return success. 3419 */ 3420 mutex_exit(&zonehash_lock); 3421 return (0); 3422 } 3423 mutex_enter(&zone->zone_lock); 3424 unref = ZONE_IS_UNREF(zone); 3425 mutex_exit(&zone->zone_lock); 3426 if (unref) { 3427 /* 3428 * There is only one reference to the zone -- that 3429 * added when the zone was added to the hashtables -- 3430 * and things will remain this way until we drop 3431 * zonehash_lock... we can go ahead and cleanup the 3432 * zone. 3433 */ 3434 break; 3435 } 3436 3437 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) { 3438 /* Signaled */ 3439 mutex_exit(&zonehash_lock); 3440 return (set_errno(EINTR)); 3441 } 3442 3443 } 3444 3445 /* 3446 * It is now safe to let the zone be recreated; remove it from the 3447 * lists. The memory will not be freed until the last cred 3448 * reference goes away. 3449 */ 3450 ASSERT(zonecount > 1); /* must be > 1; can't destroy global zone */ 3451 zonecount--; 3452 /* remove from active list and hash tables */ 3453 list_remove(&zone_active, zone); 3454 (void) mod_hash_destroy(zonehashbyname, 3455 (mod_hash_key_t)zone->zone_name); 3456 (void) mod_hash_destroy(zonehashbyid, 3457 (mod_hash_key_t)(uintptr_t)zone->zone_id); 3458 if (zone->zone_flags & ZF_HASHED_LABEL) 3459 (void) mod_hash_destroy(zonehashbylabel, 3460 (mod_hash_key_t)zone->zone_slabel); 3461 mutex_exit(&zonehash_lock); 3462 3463 /* 3464 * Release the root vnode; we're not using it anymore. Nor should any 3465 * other thread that might access it exist. 3466 */ 3467 if (zone->zone_rootvp != NULL) { 3468 VN_RELE(zone->zone_rootvp); 3469 zone->zone_rootvp = NULL; 3470 } 3471 3472 /* add to deathrow list */ 3473 mutex_enter(&zone_deathrow_lock); 3474 list_insert_tail(&zone_deathrow, zone); 3475 mutex_exit(&zone_deathrow_lock); 3476 3477 /* 3478 * Drop last reference (which was added by zsched()), this will 3479 * free the zone unless there are outstanding cred references. 3480 */ 3481 zone_rele(zone); 3482 return (0); 3483 } 3484 3485 /* 3486 * Systemcall entry point for zone_getattr(2). 
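 *
 * The (non-error) return value is the attribute's full size, so a
 * hypothetical userland consumer can size its buffer in two calls
 * (sketch; error handling omitted):
 *
 *	ssize_t sz = zone_getattr(zid, ZONE_ATTR_NAME, NULL, 0);
 *	char *name = malloc(sz);
 *	(void) zone_getattr(zid, ZONE_ATTR_NAME, name, sz);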
3487 */ 3488 static ssize_t 3489 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) 3490 { 3491 size_t size; 3492 int error = 0, err; 3493 zone_t *zone; 3494 char *zonepath; 3495 char *outstr; 3496 zone_status_t zone_status; 3497 pid_t initpid; 3498 boolean_t global = (curproc->p_zone == global_zone); 3499 boolean_t curzone = (curproc->p_zone->zone_id == zoneid); 3500 3501 mutex_enter(&zonehash_lock); 3502 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 3503 mutex_exit(&zonehash_lock); 3504 return (set_errno(EINVAL)); 3505 } 3506 zone_status = zone_status_get(zone); 3507 if (zone_status < ZONE_IS_READY) { 3508 mutex_exit(&zonehash_lock); 3509 return (set_errno(EINVAL)); 3510 } 3511 zone_hold(zone); 3512 mutex_exit(&zonehash_lock); 3513 3514 /* 3515 * If not in the global zone, don't show information about other zones, 3516 * unless the system is labeled and the local zone's label dominates 3517 * the other zone. 3518 */ 3519 if (!zone_list_access(zone)) { 3520 zone_rele(zone); 3521 return (set_errno(EINVAL)); 3522 } 3523 3524 switch (attr) { 3525 case ZONE_ATTR_ROOT: 3526 if (global) { 3527 /* 3528 * Copy the path to trim the trailing "/" (except for 3529 * the global zone). 3530 */ 3531 if (zone != global_zone) 3532 size = zone->zone_rootpathlen - 1; 3533 else 3534 size = zone->zone_rootpathlen; 3535 zonepath = kmem_alloc(size, KM_SLEEP); 3536 bcopy(zone->zone_rootpath, zonepath, size); 3537 zonepath[size - 1] = '\0'; 3538 } else { 3539 if (curzone || !is_system_labeled()) { 3540 /* 3541 * Caller is not in the global zone. 3542 * if the query is on the current zone 3543 * or the system is not labeled, 3544 * just return faked-up path for current zone. 3545 */ 3546 zonepath = "/"; 3547 size = 2; 3548 } else { 3549 /* 3550 * Return related path for current zone. 3551 */ 3552 int prefix_len = strlen(zone_prefix); 3553 int zname_len = strlen(zone->zone_name); 3554 3555 size = prefix_len + zname_len + 1; 3556 zonepath = kmem_alloc(size, KM_SLEEP); 3557 bcopy(zone_prefix, zonepath, prefix_len); 3558 bcopy(zone->zone_name, zonepath + 3559 prefix_len, zname_len); 3560 zonepath[size - 1] = '\0'; 3561 } 3562 } 3563 if (bufsize > size) 3564 bufsize = size; 3565 if (buf != NULL) { 3566 err = copyoutstr(zonepath, buf, bufsize, NULL); 3567 if (err != 0 && err != ENAMETOOLONG) 3568 error = EFAULT; 3569 } 3570 if (global || (is_system_labeled() && !curzone)) 3571 kmem_free(zonepath, size); 3572 break; 3573 3574 case ZONE_ATTR_NAME: 3575 size = strlen(zone->zone_name) + 1; 3576 if (bufsize > size) 3577 bufsize = size; 3578 if (buf != NULL) { 3579 err = copyoutstr(zone->zone_name, buf, bufsize, NULL); 3580 if (err != 0 && err != ENAMETOOLONG) 3581 error = EFAULT; 3582 } 3583 break; 3584 3585 case ZONE_ATTR_STATUS: 3586 /* 3587 * Since we're not holding zonehash_lock, the zone status 3588 * may be anything; leave it up to userland to sort it out. 
3589 */ 3590 size = sizeof (zone_status); 3591 if (bufsize > size) 3592 bufsize = size; 3593 zone_status = zone_status_get(zone); 3594 if (buf != NULL && 3595 copyout(&zone_status, buf, bufsize) != 0) 3596 error = EFAULT; 3597 break; 3598 case ZONE_ATTR_PRIVSET: 3599 size = sizeof (priv_set_t); 3600 if (bufsize > size) 3601 bufsize = size; 3602 if (buf != NULL && 3603 copyout(zone->zone_privset, buf, bufsize) != 0) 3604 error = EFAULT; 3605 break; 3606 case ZONE_ATTR_UNIQID: 3607 size = sizeof (zone->zone_uniqid); 3608 if (bufsize > size) 3609 bufsize = size; 3610 if (buf != NULL && 3611 copyout(&zone->zone_uniqid, buf, bufsize) != 0) 3612 error = EFAULT; 3613 break; 3614 case ZONE_ATTR_POOLID: 3615 { 3616 pool_t *pool; 3617 poolid_t poolid; 3618 3619 if (pool_lock_intr() != 0) { 3620 error = EINTR; 3621 break; 3622 } 3623 pool = zone_pool_get(zone); 3624 poolid = pool->pool_id; 3625 pool_unlock(); 3626 size = sizeof (poolid); 3627 if (bufsize > size) 3628 bufsize = size; 3629 if (buf != NULL && copyout(&poolid, buf, size) != 0) 3630 error = EFAULT; 3631 } 3632 break; 3633 case ZONE_ATTR_SLBL: 3634 size = sizeof (bslabel_t); 3635 if (bufsize > size) 3636 bufsize = size; 3637 if (zone->zone_slabel == NULL) 3638 error = EINVAL; 3639 else if (buf != NULL && 3640 copyout(label2bslabel(zone->zone_slabel), buf, 3641 bufsize) != 0) 3642 error = EFAULT; 3643 break; 3644 case ZONE_ATTR_INITPID: 3645 size = sizeof (initpid); 3646 if (bufsize > size) 3647 bufsize = size; 3648 initpid = zone->zone_proc_initpid; 3649 if (initpid == -1) { 3650 error = ESRCH; 3651 break; 3652 } 3653 if (buf != NULL && 3654 copyout(&initpid, buf, bufsize) != 0) 3655 error = EFAULT; 3656 break; 3657 case ZONE_ATTR_INITNAME: 3658 size = strlen(zone->zone_initname) + 1; 3659 if (bufsize > size) 3660 bufsize = size; 3661 if (buf != NULL) { 3662 err = copyoutstr(zone->zone_initname, buf, bufsize, 3663 NULL); 3664 if (err != 0 && err != ENAMETOOLONG) 3665 error = EFAULT; 3666 } 3667 break; 3668 case ZONE_ATTR_BOOTARGS: 3669 if (zone->zone_bootargs == NULL) 3670 outstr = ""; 3671 else 3672 outstr = zone->zone_bootargs; 3673 size = strlen(outstr) + 1; 3674 if (bufsize > size) 3675 bufsize = size; 3676 if (buf != NULL) { 3677 err = copyoutstr(outstr, buf, bufsize, NULL); 3678 if (err != 0 && err != ENAMETOOLONG) 3679 error = EFAULT; 3680 } 3681 break; 3682 default: 3683 error = EINVAL; 3684 } 3685 zone_rele(zone); 3686 3687 if (error) 3688 return (set_errno(error)); 3689 return ((ssize_t)size); 3690 } 3691 3692 /* 3693 * Systemcall entry point for zone_setattr(2). 3694 */ 3695 /*ARGSUSED*/ 3696 static int 3697 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) 3698 { 3699 zone_t *zone; 3700 zone_status_t zone_status; 3701 int err; 3702 3703 if (secpolicy_zone_config(CRED()) != 0) 3704 return (set_errno(EPERM)); 3705 3706 /* 3707 * At present, attributes can only be set on non-running, 3708 * non-global zones. 
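 *
 * E.g., a hypothetical manager could point a not-yet-booted zone at
 * a different init with (sketch; "/sbin/myinit" is illustrative):
 *
 *	(void) zone_setattr(zid, ZONE_ATTR_INITNAME, "/sbin/myinit", 0);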
3709 */
3710 if (zoneid == GLOBAL_ZONEID) {
3711 return (set_errno(EINVAL));
3712 }
3713
3714 mutex_enter(&zonehash_lock);
3715 if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3716 mutex_exit(&zonehash_lock);
3717 return (set_errno(EINVAL));
3718 }
3719 zone_hold(zone);
3720 mutex_exit(&zonehash_lock);
3721
3722 zone_status = zone_status_get(zone);
3723 if (zone_status > ZONE_IS_READY) {
3724 err = EINVAL;
3725 goto done;
3726 }
3727 switch (attr) {
3728 case ZONE_ATTR_INITNAME:
3729 err = zone_set_initname(zone, (const char *)buf);
3730 break;
3731 case ZONE_ATTR_BOOTARGS:
3732 err = zone_set_bootargs(zone, (const char *)buf);
3733 break;
3734 default:
3735 err = EINVAL;
3736 }
3737 done:
3738 zone_rele(zone);
3739 return (err != 0 ? set_errno(err) : 0);
3740 }
3741
3742 /*
3743 * Return zero if the process has at least one vnode mapped in to its
3744 * address space which shouldn't be allowed to change zones.
3745 */
3746 static int
3747 as_can_change_zones(void)
3748 {
3749 proc_t *pp = curproc;
3750 struct seg *seg;
3751 struct as *as = pp->p_as;
3752 vnode_t *vp;
3753 int allow = 1;
3754
3755 ASSERT(pp->p_as != &kas);
3756 AS_LOCK_ENTER(&as, &as->a_lock, RW_READER);
3757 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
3758 /*
3759 * if we can't get a backing vnode for this segment then skip
3760 * it.
3761 */
3762 vp = NULL;
3763 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
3764 continue;
3765 if (!vn_can_change_zones(vp)) { /* bail on first match */
3766 allow = 0;
3767 break;
3768 }
3769 }
3770 AS_LOCK_EXIT(&as, &as->a_lock);
3771 return (allow);
3772 }
3773
3774 /*
3775 * Systemcall entry point for zone_enter().
3776 *
3777 * The current process is injected into said zone. In the process
3778 * it will change its project membership, privileges, rootdir/cwd,
3779 * zone-wide rctls, and pool association to match those of the zone.
3780 *
3781 * The first zone_enter() called while the zone is in the ZONE_IS_READY
3782 * state will transition it to ZONE_IS_RUNNING. Processes may only
3783 * enter a zone that is "ready" or "running".
3784 */
3785 static int
3786 zone_enter(zoneid_t zoneid)
3787 {
3788 zone_t *zone;
3789 vnode_t *vp;
3790 proc_t *pp = curproc;
3791 contract_t *ct;
3792 cont_process_t *ctp;
3793 task_t *tk, *oldtk;
3794 kproject_t *zone_proj0;
3795 cred_t *cr, *newcr;
3796 pool_t *oldpool, *newpool;
3797 sess_t *sp;
3798 uid_t uid;
3799 zone_status_t status;
3800 int err = 0;
3801 rctl_entity_p_t e;
3802
3803 if (secpolicy_zone_config(CRED()) != 0)
3804 return (set_errno(EPERM));
3805 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3806 return (set_errno(EINVAL));
3807
3808 /*
3809 * Stop all lwps so we don't need to hold a lock to look at
3810 * curproc->p_zone. This needs to happen before we grab any
3811 * locks to avoid deadlock (another lwp in the process could
3812 * be waiting for the held lock).
3813 */
3814 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
3815 return (set_errno(EINTR));
3816
3817 /*
3818 * Make sure we're not changing zones with files open or mapped in
3819 * to our address space which shouldn't be changing zones.
/*
 * Return zero if the process has at least one vnode mapped into its
 * address space which shouldn't be allowed to change zones.
 */
static int
as_can_change_zones(void)
{
	proc_t *pp = curproc;
	struct seg *seg;
	struct as *as = pp->p_as;
	vnode_t *vp;
	int allow = 1;

	ASSERT(pp->p_as != &kas);
	AS_LOCK_ENTER(&as, &as->a_lock, RW_READER);
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		/*
		 * If we can't get a backing vnode for this segment then skip
		 * it.
		 */
		vp = NULL;
		if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
			continue;
		if (!vn_can_change_zones(vp)) { /* bail on first match */
			allow = 0;
			break;
		}
	}
	AS_LOCK_EXIT(&as, &as->a_lock);
	return (allow);
}

/*
 * System call entry point for zone_enter().
 *
 * The current process is injected into said zone.  In the process
 * it will change its project membership, privileges, rootdir/cwd,
 * zone-wide rctls, and pool association to match those of the zone.
 *
 * The first zone_enter() called while the zone is in the ZONE_IS_READY
 * state will transition it to ZONE_IS_RUNNING.  Processes may only
 * enter a zone that is "ready" or "running".
 */
static int
zone_enter(zoneid_t zoneid)
{
	zone_t *zone;
	vnode_t *vp;
	proc_t *pp = curproc;
	contract_t *ct;
	cont_process_t *ctp;
	task_t *tk, *oldtk;
	kproject_t *zone_proj0;
	cred_t *cr, *newcr;
	pool_t *oldpool, *newpool;
	sess_t *sp;
	uid_t uid;
	zone_status_t status;
	int err = 0;
	rctl_entity_p_t e;

	if (secpolicy_zone_config(CRED()) != 0)
		return (set_errno(EPERM));
	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
		return (set_errno(EINVAL));

	/*
	 * Stop all lwps so we don't need to hold a lock to look at
	 * curproc->p_zone.  This needs to happen before we grab any
	 * locks to avoid deadlock (another lwp in the process could
	 * be waiting for the held lock).
	 */
	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
		return (set_errno(EINTR));

	/*
	 * Make sure we're not changing zones with files open or mapped
	 * into our address space which shouldn't be changing zones.
	 */
	if (!files_can_change_zones()) {
		err = EBADF;
		goto out;
	}
	if (!as_can_change_zones()) {
		err = EFAULT;
		goto out;
	}

	mutex_enter(&zonehash_lock);
	if (pp->p_zone != global_zone) {
		mutex_exit(&zonehash_lock);
		err = EINVAL;
		goto out;
	}

	zone = zone_find_all_by_id(zoneid);
	if (zone == NULL) {
		mutex_exit(&zonehash_lock);
		err = EINVAL;
		goto out;
	}

	/*
	 * To prevent processes in a zone from holding contracts on
	 * extrazonal resources, and to avoid process contract
	 * memberships which span zones, contract holders and processes
	 * which aren't the sole members of their encapsulating process
	 * contracts are not allowed to zone_enter.
	 */
	ctp = pp->p_ct_process;
	ct = &ctp->conp_contract;
	mutex_enter(&ct->ct_lock);
	mutex_enter(&pp->p_lock);
	if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
		mutex_exit(&pp->p_lock);
		mutex_exit(&ct->ct_lock);
		mutex_exit(&zonehash_lock);
		err = EINVAL;
		goto out;
	}

	/*
	 * Moreover, we don't allow processes whose encapsulating
	 * process contracts have inherited extrazonal contracts.
	 * While it would be easier to eliminate all process contracts
	 * with inherited contracts, we need to be able to give a
	 * restarted init (or other zone-penetrating process) its
	 * predecessor's contracts.
	 */
	if (ctp->conp_ninherited != 0) {
		contract_t *next;
		for (next = list_head(&ctp->conp_inherited); next;
		    next = list_next(&ctp->conp_inherited, next)) {
			if (contract_getzuniqid(next) != zone->zone_uniqid) {
				mutex_exit(&pp->p_lock);
				mutex_exit(&ct->ct_lock);
				mutex_exit(&zonehash_lock);
				err = EINVAL;
				goto out;
			}
		}
	}
	mutex_exit(&pp->p_lock);
	mutex_exit(&ct->ct_lock);

	status = zone_status_get(zone);
	if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
		/*
		 * Can't join
		 */
		mutex_exit(&zonehash_lock);
		err = EINVAL;
		goto out;
	}

	/*
	 * Make sure new priv set is within the permitted set for caller
	 */
	if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
		mutex_exit(&zonehash_lock);
		err = EPERM;
		goto out;
	}
	/*
	 * We want to momentarily drop zonehash_lock while we optimistically
	 * bind curproc to the pool it should be running in.  This is safe
	 * since the zone can't disappear (we have a hold on it).
	 */
	zone_hold(zone);
	mutex_exit(&zonehash_lock);

	/*
	 * Grab pool_lock to keep the pools configuration from changing
	 * and to stop ourselves from getting rebound to another pool
	 * until we join the zone.
	 */
	if (pool_lock_intr() != 0) {
		zone_rele(zone);
		err = EINTR;
		goto out;
	}
	ASSERT(secpolicy_pool(CRED()) == 0);
	/*
	 * Bind ourselves to the pool currently associated with the zone.
	 */
	oldpool = curproc->p_pool;
	newpool = zone_pool_get(zone);
	if (pool_state == POOL_ENABLED && newpool != oldpool &&
	    (err = pool_do_bind(newpool, P_PID, P_MYID,
	    POOL_BIND_ALL)) != 0) {
		pool_unlock();
		zone_rele(zone);
		goto out;
	}

	/*
	 * Grab cpu_lock now; we'll need it later when we call
	 * task_join().
	 */
	mutex_enter(&cpu_lock);
	mutex_enter(&zonehash_lock);
	/*
	 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
	 */
	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
		/*
		 * Can't join anymore.
		 */
		mutex_exit(&zonehash_lock);
		mutex_exit(&cpu_lock);
		if (pool_state == POOL_ENABLED &&
		    newpool != oldpool)
			(void) pool_do_bind(oldpool, P_PID, P_MYID,
			    POOL_BIND_ALL);
		pool_unlock();
		zone_rele(zone);
		err = EINVAL;
		goto out;
	}

	mutex_enter(&pp->p_lock);
	zone_proj0 = zone->zone_zsched->p_task->tk_proj;
	/* update the zone's and its proj0's lwp and task accounting */
	mutex_enter(&zone->zone_nlwps_lock);
	/* add new lwps to zone and zone's proj0 */
	zone_proj0->kpj_nlwps += pp->p_lwpcnt;
	zone->zone_nlwps += pp->p_lwpcnt;
	/* add 1 task to zone's proj0 */
	zone_proj0->kpj_ntasks += 1;
	mutex_exit(&pp->p_lock);
	mutex_exit(&zone->zone_nlwps_lock);

	/* remove lwps from proc's old zone and old project */
	mutex_enter(&pp->p_zone->zone_nlwps_lock);
	pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
	pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
	mutex_exit(&pp->p_zone->zone_nlwps_lock);

	/*
	 * Joining the zone cannot fail from now on.
	 *
	 * This means that a lot of the following code can be commonized and
	 * shared with zsched().
	 */

	/*
	 * Reset the encapsulating process contract's zone.
	 */
	ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
	contract_setzuniqid(ct, zone->zone_uniqid);

	/*
	 * Create a new task and associate the process with the project keyed
	 * by (projid, zoneid).
	 *
	 * We might as well be in project 0; the global zone's projid doesn't
	 * make much sense in a zone anyhow.
	 *
	 * This also increments zone_ntasks, and returns with p_lock held.
	 */
	tk = task_create(0, zone);
	oldtk = task_join(tk, 0);
	mutex_exit(&cpu_lock);

	pp->p_flag |= SZONETOP;
	pp->p_zone = zone;

	/*
	 * Call RCTLOP_SET functions on this proc.
	 */
	e.rcep_p.zone = zone;
	e.rcep_t = RCENTITY_ZONE;
	(void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
	    RCD_CALLBACK);
	mutex_exit(&pp->p_lock);

	/*
	 * We don't need to hold any of zsched's locks here; not only do we
	 * know the process and zone aren't going away, we know its session
	 * isn't changing either.
	 *
	 * By joining zsched's session here, we mimic the behavior in the
	 * global zone of init's sid being the pid of sched.  We extend this
	 * to all zlogin-like zone_enter()'ing processes as well.
	 */
	mutex_enter(&pidlock);
	sp = zone->zone_zsched->p_sessp;
	SESS_HOLD(sp);
	mutex_enter(&pp->p_lock);
	pgexit(pp);
	SESS_RELE(pp->p_sessp);
	pp->p_sessp = sp;
	pgjoin(pp, zone->zone_zsched->p_pidp);
	mutex_exit(&pp->p_lock);
	mutex_exit(&pidlock);

	mutex_exit(&zonehash_lock);
	/*
	 * We're firmly in the zone; let pools progress.
	 */
	pool_unlock();
	task_rele(oldtk);
	/*
	 * We don't need to retain a hold on the zone since we already
	 * incremented zone_ntasks, so the zone isn't going anywhere.
	 */
	zone_rele(zone);

	/*
	 * Chroot
	 */
	vp = zone->zone_rootvp;
	zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
	zone_chdir(vp, &PTOU(pp)->u_rdir, pp);

	/*
	 * Change process credentials
	 */
	newcr = cralloc();
	mutex_enter(&pp->p_crlock);
	cr = pp->p_cred;
	crcopy_to(cr, newcr);
	crsetzone(newcr, zone);
	pp->p_cred = newcr;

	/*
	 * Restrict all process privilege sets to zone limit
	 */
	priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
	priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
	priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
	priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
	mutex_exit(&pp->p_crlock);
	crset(pp, newcr);

	/*
	 * Adjust upcount to reflect zone entry.
	 */
	uid = crgetruid(newcr);
	mutex_enter(&pidlock);
	upcount_dec(uid, GLOBAL_ZONEID);
	upcount_inc(uid, zoneid);
	mutex_exit(&pidlock);

	/*
	 * Set up core file path and content.
	 */
	set_core_defaults();

out:
	/*
	 * Let the other lwps continue.
	 */
	mutex_enter(&pp->p_lock);
	if (curthread != pp->p_agenttp)
		continuelwps(pp);
	mutex_exit(&pp->p_lock);

	return (err != 0 ? set_errno(err) : 0);
}
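
/*
 * A minimal usage sketch (illustrative): zone_enter() is how zlogin-like
 * utilities place a process inside a zone.  Entry is one-way, so it is
 * normally followed by exec'ing the target command; this assumes the
 * userland zone_enter() wrapper from <zone.h>:
 *
 *	if (zone_enter(zoneid) == -1) {
 *		perror("zone_enter");
 *		exit(1);
 *	}
 *	(void) execl("/sbin/sh", "sh", (char *)NULL);
 */
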
/*
 * System call entry point for zone_list(2).
 *
 * Processes running in a (non-global) zone only see themselves.
 * On labeled systems, they see all zones whose label they dominate.
 */
static int
zone_list(zoneid_t *zoneidlist, uint_t *numzones)
{
	zoneid_t *zoneids;
	zone_t *zone, *myzone;
	uint_t user_nzones, real_nzones;
	uint_t domi_nzones;
	int error;

	if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
		return (set_errno(EFAULT));

	myzone = curproc->p_zone;
	if (myzone != global_zone) {
		bslabel_t *mybslab;

		if (!is_system_labeled()) {
			/* just return current zone */
			real_nzones = domi_nzones = 1;
			zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
			zoneids[0] = myzone->zone_id;
		} else {
			/* return all zones that are dominated */
			mutex_enter(&zonehash_lock);
			real_nzones = zonecount;
			domi_nzones = 0;
			if (real_nzones > 0) {
				zoneids = kmem_alloc(real_nzones *
				    sizeof (zoneid_t), KM_SLEEP);
				mybslab = label2bslabel(myzone->zone_slabel);
				for (zone = list_head(&zone_active);
				    zone != NULL;
				    zone = list_next(&zone_active, zone)) {
					if (zone->zone_id == GLOBAL_ZONEID)
						continue;
					if (zone != myzone &&
					    (zone->zone_flags & ZF_IS_SCRATCH))
						continue;
					/*
					 * Note that a label always dominates
					 * itself, so myzone is always included
					 * in the list.
					 */
					if (bldominates(mybslab,
					    label2bslabel(zone->zone_slabel))) {
						zoneids[domi_nzones++] =
						    zone->zone_id;
					}
				}
			}
			mutex_exit(&zonehash_lock);
		}
	} else {
		mutex_enter(&zonehash_lock);
		real_nzones = zonecount;
		domi_nzones = 0;
		if (real_nzones > 0) {
			zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
			    KM_SLEEP);
			for (zone = list_head(&zone_active); zone != NULL;
			    zone = list_next(&zone_active, zone))
				zoneids[domi_nzones++] = zone->zone_id;
			ASSERT(domi_nzones == real_nzones);
		}
		mutex_exit(&zonehash_lock);
	}

	/*
	 * If the caller has allocated space for fewer entries than we
	 * found, return only up to that limit.  Either way, tell the
	 * caller exactly how many we found.
	 */
	if (domi_nzones < user_nzones)
		user_nzones = domi_nzones;
	error = 0;
	if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
		error = EFAULT;
	} else if (zoneidlist != NULL && user_nzones != 0) {
		if (copyout(zoneids, zoneidlist,
		    user_nzones * sizeof (zoneid_t)) != 0)
			error = EFAULT;
	}

	if (real_nzones > 0)
		kmem_free(zoneids, real_nzones * sizeof (zoneid_t));

	if (error != 0)
		return (set_errno(error));
	else
		return (0);
}

/*
 * System call entry point for zone_lookup(2).
 *
 * Non-global zones are only able to see themselves and (on labeled systems)
 * the zones they dominate.
 */
static zoneid_t
zone_lookup(const char *zone_name)
{
	char *kname;
	zone_t *zone;
	zoneid_t zoneid;
	int err;

	if (zone_name == NULL) {
		/* return caller's zone id */
		return (getzoneid());
	}

	kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
	if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
		kmem_free(kname, ZONENAME_MAX);
		return (set_errno(err));
	}

	mutex_enter(&zonehash_lock);
	zone = zone_find_all_by_name(kname);
	kmem_free(kname, ZONENAME_MAX);
	/*
	 * In a non-global zone, can only lookup global and own name.
	 * In Trusted Extensions zone label dominance rules apply.
	 */
	if (zone == NULL ||
	    zone_status_get(zone) < ZONE_IS_READY ||
	    !zone_list_access(zone)) {
		mutex_exit(&zonehash_lock);
		return (set_errno(EINVAL));
	} else {
		zoneid = zone->zone_id;
		mutex_exit(&zonehash_lock);
		return (zoneid);
	}
}
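
/*
 * A minimal usage sketch (illustrative): getzoneidbyname(3C) is the
 * usual userland consumer of this entry point; a NULL name yields the
 * caller's own zone id, matching the zone_name == NULL case above:
 *
 *	zoneid_t zid;
 *
 *	if ((zid = getzoneidbyname("myzone")) == -1)
 *		perror("getzoneidbyname");
 */
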
static int
zone_version(int *version_arg)
{
	int version = ZONE_SYSCALL_API_VERSION;

	if (copyout(&version, version_arg, sizeof (int)) != 0)
		return (set_errno(EFAULT));
	return (0);
}

/* ARGSUSED */
long
zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
{
	zone_def zs;

	switch (cmd) {
	case ZONE_CREATE:
		if (get_udatamodel() == DATAMODEL_NATIVE) {
			if (copyin(arg1, &zs, sizeof (zone_def))) {
				return (set_errno(EFAULT));
			}
		} else {
#ifdef _SYSCALL32_IMPL
			zone_def32 zs32;

			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
				return (set_errno(EFAULT));
			}
			zs.zone_name =
			    (const char *)(unsigned long)zs32.zone_name;
			zs.zone_root =
			    (const char *)(unsigned long)zs32.zone_root;
			zs.zone_privs =
			    (const struct priv_set *)
			    (unsigned long)zs32.zone_privs;
			zs.zone_privssz = zs32.zone_privssz;
			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
			zs.rctlbufsz = zs32.rctlbufsz;
			zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
			zs.zfsbufsz = zs32.zfsbufsz;
			zs.extended_error =
			    (int *)(unsigned long)zs32.extended_error;
			zs.match = zs32.match;
			zs.doi = zs32.doi;
			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
#else
			panic("get_udatamodel() returned bogus result\n");
#endif
		}

		return (zone_create(zs.zone_name, zs.zone_root,
		    zs.zone_privs, zs.zone_privssz,
		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,
		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,
		    zs.extended_error, zs.match, zs.doi,
		    zs.label));
	case ZONE_BOOT:
		return (zone_boot((zoneid_t)(uintptr_t)arg1));
	case ZONE_DESTROY:
		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
	case ZONE_GETATTR:
		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
	case ZONE_SETATTR:
		return (zone_setattr((zoneid_t)(uintptr_t)arg1,
		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
	case ZONE_ENTER:
		return (zone_enter((zoneid_t)(uintptr_t)arg1));
	case ZONE_LIST:
		return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
	case ZONE_SHUTDOWN:
		return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
	case ZONE_LOOKUP:
		return (zone_lookup((const char *)arg1));
	case ZONE_VERSION:
		return (zone_version((int *)arg1));
	default:
		return (set_errno(EINVAL));
	}
}

struct zarg {
	zone_t *zone;
	zone_cmd_arg_t arg;
};

static int
zone_lookup_door(const char *zone_name, door_handle_t *doorp)
{
	char *buf;
	size_t buflen;
	int error;

	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
	buf = kmem_alloc(buflen, KM_SLEEP);
	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
	error = door_ki_open(buf, doorp);
	kmem_free(buf, buflen);
	return (error);
}

static void
zone_release_door(door_handle_t *doorp)
{
	door_ki_rele(*doorp);
	*doorp = NULL;
}
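
/*
 * Illustrative note: ZONE_DOOR_PATH is a printf-style template, so if it
 * were, say, "/var/run/zones/%s.zoneadmd_door", the snprintf() above
 * would produce /var/run/zones/myzone.zoneadmd_door for a zone named
 * "myzone".  door_ki_open() fails if zoneadmd is not currently serving
 * that door, which the retry loop below is prepared for.
 */
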
static void
zone_ki_call_zoneadmd(struct zarg *zargp)
{
	door_handle_t door = NULL;
	door_arg_t darg, save_arg;
	char *zone_name;
	size_t zone_namelen;
	zoneid_t zoneid;
	zone_t *zone;
	zone_cmd_arg_t arg;
	uint64_t uniqid;
	size_t size;
	int error;
	int retry;

	zone = zargp->zone;
	arg = zargp->arg;
	kmem_free(zargp, sizeof (*zargp));

	zone_namelen = strlen(zone->zone_name) + 1;
	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
	bcopy(zone->zone_name, zone_name, zone_namelen);
	zoneid = zone->zone_id;
	uniqid = zone->zone_uniqid;
	/*
	 * zoneadmd may be down, but at least we can empty out the zone.
	 * We can ignore the return value of zone_empty() since we're called
	 * from a kernel thread and know we won't be delivered any signals.
	 */
	ASSERT(curproc == &p0);
	(void) zone_empty(zone);
	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
	zone_rele(zone);

	size = sizeof (arg);
	darg.rbuf = (char *)&arg;
	darg.data_ptr = (char *)&arg;
	darg.rsize = size;
	darg.data_size = size;
	darg.desc_ptr = NULL;
	darg.desc_num = 0;

	save_arg = darg;
	/*
	 * Since we're not holding a reference to the zone, any number of
	 * things can go wrong, including the zone disappearing before we get a
	 * chance to talk to zoneadmd.
	 */
	for (retry = 0; /* forever */; retry++) {
		if (door == NULL &&
		    (error = zone_lookup_door(zone_name, &door)) != 0) {
			goto next;
		}
		ASSERT(door != NULL);

		if ((error = door_ki_upcall(door, &darg)) == 0) {
			break;
		}
		switch (error) {
		case EINTR:
			/* FALLTHROUGH */
		case EAGAIN:	/* process may be forking */
			/*
			 * Back off for a bit
			 */
			break;
		case EBADF:
			zone_release_door(&door);
			if (zone_lookup_door(zone_name, &door) != 0) {
				/*
				 * zoneadmd may be dead, but it may come back
				 * to life later.
				 */
				break;
			}
			break;
		default:
			cmn_err(CE_WARN,
			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
			    error);
			goto out;
		}
next:
		/*
		 * If this isn't the same zone_t that we originally had in
		 * mind, then this is the same as if two kadmin requests come
		 * in at the same time: the first one wins.  This means we
		 * lose, so we bail.
		 */
		if ((zone = zone_find_by_id(zoneid)) == NULL) {
			/*
			 * Problem is solved.
			 */
			break;
		}
		if (zone->zone_uniqid != uniqid) {
			/*
			 * zoneid recycled
			 */
			zone_rele(zone);
			break;
		}
		/*
		 * We could zone_status_timedwait(), but there doesn't seem to
		 * be much point in doing that (plus, it would mean that
		 * zone_free() isn't called until this thread exits).
		 */
		zone_rele(zone);
		delay(hz);
		darg = save_arg;
	}
out:
	if (door != NULL) {
		zone_release_door(&door);
	}
	kmem_free(zone_name, zone_namelen);
	thread_exit();
}

/*
 * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
 * kadmin().  The caller is a process in the zone.
 *
 * In order to shut down the zone, we will hand off control to zoneadmd
 * (running in the global zone) via a door.  We do a half-hearted job of
 * killing all processes in the zone, create a kernel thread to contact
 * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
 * a form of generation number used to let zoneadmd (as well as
 * zone_destroy()) know exactly which zone they're talking about.
 */
int
zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
{
	struct zarg *zargp;
	zone_cmd_t zcmd;
	zone_t *zone;

	zone = curproc->p_zone;
	ASSERT(getzoneid() != GLOBAL_ZONEID);

	switch (cmd) {
	case A_SHUTDOWN:
		switch (fcn) {
		case AD_HALT:
		case AD_POWEROFF:
			zcmd = Z_HALT;
			break;
		case AD_BOOT:
			zcmd = Z_REBOOT;
			break;
		case AD_IBOOT:
		case AD_SBOOT:
		case AD_SIBOOT:
		case AD_NOSYNC:
			return (ENOTSUP);
		default:
			return (EINVAL);
		}
		break;
	case A_REBOOT:
		zcmd = Z_REBOOT;
		break;
	case A_FTRACE:
	case A_REMOUNT:
	case A_FREEZE:
	case A_DUMP:
		return (ENOTSUP);
	default:
		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
		return (EINVAL);
	}

	if (secpolicy_zone_admin(credp, B_FALSE))
		return (EPERM);
	mutex_enter(&zone_status_lock);

	/*
	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
	 * is in the zone.
	 */
	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
		/*
		 * This zone is already on its way down.
		 */
		mutex_exit(&zone_status_lock);
		return (0);
	}
	/*
	 * Prevent future zone_enter()s
	 */
	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
	mutex_exit(&zone_status_lock);

	/*
	 * Kill everyone now and call zoneadmd later.
	 * zone_ki_call_zoneadmd() will do a more thorough job of this
	 * later.
	 */
	killall(zone->zone_id);
	/*
	 * Now, create the thread to contact zoneadmd and do the rest of the
	 * work.  This thread can't be created in our zone otherwise
	 * zone_destroy() would deadlock.
	 */
	zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
	zargp->arg.cmd = zcmd;
	zargp->arg.uniqid = zone->zone_uniqid;
	zargp->zone = zone;
	(void) strcpy(zargp->arg.locale, "C");
	/* mdep was already copied in for us by uadmin */
	if (mdep != NULL)
		(void) strlcpy(zargp->arg.bootbuf, mdep,
		    sizeof (zargp->arg.bootbuf));
	zone_hold(zone);

	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
	    TS_RUN, minclsyspri);
	exit(CLD_EXITED, 0);

	return (EINVAL);
}
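
/*
 * A minimal usage sketch (illustrative): from inside a non-global zone,
 * a sufficiently privileged process's reboot request reaches
 * zone_kadmin() via uadmin(2) and affects only the calling zone:
 *
 *	#include <sys/uadmin.h>
 *
 *	if (uadmin(A_SHUTDOWN, AD_BOOT, 0) == -1)
 *		perror("uadmin");
 */
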
/*
 * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
 * status to ZONE_IS_SHUTTING_DOWN.
 */
void
zone_shutdown_global(void)
{
	ASSERT(curproc->p_zone == global_zone);

	mutex_enter(&zone_status_lock);
	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
	mutex_exit(&zone_status_lock);
}

/*
 * Returns true if the named dataset is visible in the current zone.
 * The 'write' parameter is set to 1 if the dataset is also writable.
 */
int
zone_dataset_visible(const char *dataset, int *write)
{
	zone_dataset_t *zd;
	size_t len;
	zone_t *zone = curproc->p_zone;

	if (dataset[0] == '\0')
		return (0);

	/*
	 * Walk the list once, looking for datasets which match exactly, or
	 * specify a dataset underneath an exported dataset.  If found,
	 * return true and note that it is writable.
	 */
	for (zd = list_head(&zone->zone_datasets); zd != NULL;
	    zd = list_next(&zone->zone_datasets, zd)) {

		len = strlen(zd->zd_dataset);
		if (strlen(dataset) >= len &&
		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
		    (dataset[len] == '\0' || dataset[len] == '/' ||
		    dataset[len] == '@')) {
			if (write)
				*write = 1;
			return (1);
		}
	}

	/*
	 * Walk the list a second time, searching for datasets which are
	 * parents of exported datasets.  These should be visible, but
	 * read-only.
	 *
	 * Note that we also have to support forms such as 'pool/dataset/',
	 * with a trailing slash.
	 */
	for (zd = list_head(&zone->zone_datasets); zd != NULL;
	    zd = list_next(&zone->zone_datasets, zd)) {

		len = strlen(dataset);
		if (dataset[len - 1] == '/')
			len--;	/* Ignore trailing slash */
		if (len < strlen(zd->zd_dataset) &&
		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
		    zd->zd_dataset[len] == '/') {
			if (write)
				*write = 0;
			return (1);
		}
	}

	return (0);
}
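
/*
 * Illustrative examples of the rules above: if "tank/zones/foo" is the
 * only dataset exported to the zone, then (with int w):
 *
 *	zone_dataset_visible("tank/zones/foo", &w)	returns 1, w == 1
 *	zone_dataset_visible("tank/zones/foo/a", &w)	returns 1, w == 1
 *	zone_dataset_visible("tank/zones", &w)		returns 1, w == 0
 *	zone_dataset_visible("tank", &w)		returns 1, w == 0
 *	zone_dataset_visible("tank/other", &w)		returns 0
 */
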
/*
 * zone_find_by_any_path() -
 *
 * kernel-private routine similar to zone_find_by_path(), but which
 * effectively compares against zone paths rather than zonerootpath
 * (i.e., the last component of zonerootpaths, which should be "root/",
 * is not compared).  This is done in order to accurately identify all
 * paths, whether zone-visible or not, including those which are parallel
 * to /root/, such as /dev/, /home/, etc...
 *
 * If the specified path does not fall under any zone path then the
 * global zone is returned.
 *
 * The treat_abs parameter indicates whether the path should be treated as
 * an absolute path even though it does not begin with "/".  (This supports
 * nfs mount syntax such as host:any/path.)
 *
 * The caller is responsible for zone_rele of the returned zone.
 */
zone_t *
zone_find_by_any_path(const char *path, boolean_t treat_abs)
{
	zone_t *zone;
	int path_offset = 0;

	if (path == NULL) {
		zone_hold(global_zone);
		return (global_zone);
	}

	if (*path != '/') {
		ASSERT(treat_abs);
		path_offset = 1;
	}

	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		char *c;
		size_t pathlen;
		char *rootpath_start;

		if (zone == global_zone)	/* skip global zone */
			continue;

		/* scan backwards to find start of last component */
		c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
		do {
			c--;
		} while (*c != '/');

		pathlen = c - zone->zone_rootpath + 1 - path_offset;
		rootpath_start = (zone->zone_rootpath + path_offset);
		if (strncmp(path, rootpath_start, pathlen) == 0)
			break;
	}
	if (zone == NULL)
		zone = global_zone;
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}
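
/*
 * A minimal kernel-side usage sketch (illustrative): resolving a path
 * that is parallel to a zone's /root/, assuming a zone rooted at
 * /export/foo/root:
 *
 *	zone_t *zp;
 *
 *	zp = zone_find_by_any_path("/export/foo/dev/pts/0", B_FALSE);
 *	...
 *	zone_rele(zp);
 */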