1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Zones 31 * 32 * A zone is a named collection of processes, namespace constraints, 33 * and other system resources which comprise a secure and manageable 34 * application containment facility. 35 * 36 * Zones (represented by the reference counted zone_t) are tracked in 37 * the kernel in the zonehash. Elsewhere in the kernel, Zone IDs 38 * (zoneid_t) are used to track zone association. Zone IDs are 39 * dynamically generated when the zone is created; if a persistent 40 * identifier is needed (core files, accounting logs, audit trail, 41 * etc.), the zone name should be used. 42 * 43 * 44 * Global Zone: 45 * 46 * The global zone (zoneid 0) is automatically associated with all 47 * system resources that have not been bound to a user-created zone. 48 * This means that even systems where zones are not in active use 49 * have a global zone, and all processes, mounts, etc. are 50 * associated with that zone. 
The global zone is generally 51 * unconstrained in terms of privileges and access, though the usual 52 * credential and privilege based restrictions apply. 53 * 54 * 55 * Zone States: 56 * 57 * The states in which a zone may be, and the transitions between 58 * them, are as follows: 59 * 60 * ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially 61 * initialized zone is added to the list of active zones on the system but 62 * isn't accessible. 63 * 64 * ZONE_IS_READY: zsched (the kernel dummy process for a zone) is 65 * ready. The zone is made visible after the ZSD constructor callbacks are 66 * executed. A zone remains in this state until it transitions into 67 * the ZONE_IS_BOOTING state as a result of a call to zone_boot(). 68 * 69 * ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start 70 * init. Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN 71 * state. 72 * 73 * ZONE_IS_RUNNING: The zone is open for business: zsched has 74 * successfully started init. A zone remains in this state until 75 * zone_shutdown() is called. 76 * 77 * ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is 78 * killing all processes running in the zone. The zone remains 79 * in this state until there are no more user processes running in the zone. 80 * zone_create(), zone_enter(), and zone_destroy() on this zone will fail. 81 * Since zone_shutdown() is restartable, it may be called successfully 82 * multiple times for the same zone_t. Setting of the zone's state to 83 * ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check 84 * the zone's status without worrying about it being a moving target. 85 * 86 * ZONE_IS_EMPTY: zone_shutdown() has been called, and there 87 * are no more user processes in the zone. The zone remains in this 88 * state until there are no more kernel threads associated with the 89 * zone. zone_create(), zone_enter(), and zone_destroy() on this zone will 90 * fail. 
91 * 92 * ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone 93 * have exited. zone_shutdown() returns. Henceforth it is not possible to 94 * join the zone or create kernel threads therein. 95 * 96 * ZONE_IS_DYING: zone_destroy() has been called on the zone; zone 97 * remains in this state until zsched exits. Calls to zone_find_by_*() 98 * return NULL from now on. 99 * 100 * ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0). There are no 101 * processes or threads doing work on behalf of the zone. The zone is 102 * removed from the list of active zones. zone_destroy() returns, and 103 * the zone can be recreated. 104 * 105 * ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor 106 * callbacks are executed, and all memory associated with the zone is 107 * freed. 108 * 109 * Threads can wait for the zone to enter a requested state by using 110 * zone_status_wait() or zone_status_timedwait() with the desired 111 * state passed in as an argument. Zone state transitions are 112 * uni-directional; it is not possible to move back to an earlier state. 113 * 114 * 115 * Zone-Specific Data: 116 * 117 * Subsystems needing to maintain zone-specific data can store that 118 * data using the ZSD mechanism. This provides a zone-specific data 119 * store, similar to thread-specific data (see pthread_getspecific(3C) 120 * or the TSD code in uts/common/disp/thread.c). Also, ZSD can be used 121 * to register callbacks to be invoked when a zone is created, shut 122 * down, or destroyed. This can be used to initialize zone-specific 123 * data for new zones and to clean up when zones go away. 124 * 125 * 126 * Data Structures: 127 * 128 * The per-zone structure (zone_t) is reference counted, and freed 129 * when all references are released. zone_hold and zone_rele can be 130 * used to adjust the reference count. In addition, reference counts 131 * associated with the cred_t structure are tracked separately using 132 * zone_cred_hold and zone_cred_rele. 
133 * 134 * Pointers to active zone_t's are stored in two hash tables; one 135 * for searching by id, the other for searching by name. Lookups 136 * can be performed on either basis, using zone_find_by_id and 137 * zone_find_by_name. Both return zone_t pointers with the zone 138 * held, so zone_rele should be called when the pointer is no longer 139 * needed. Zones can also be searched by path; zone_find_by_path 140 * returns the zone with which a path name is associated (global 141 * zone if the path is not within some other zone's file system 142 * hierarchy). This currently requires iterating through each zone, 143 * so it is slower than an id or name search via a hash table. 144 * 145 * 146 * Locking: 147 * 148 * zonehash_lock: This is a top-level global lock used to protect the 149 * zone hash tables and lists. Zones cannot be created or destroyed 150 * while this lock is held. 151 * zone_status_lock: This is a global lock protecting zone state. 152 * Zones cannot change state while this lock is held. It also 153 * protects the list of kernel threads associated with a zone. 154 * zone_lock: This is a per-zone lock used to protect several fields of 155 * the zone_t (see <sys/zone.h> for details). In addition, holding 156 * this lock means that the zone cannot go away. 157 * zone_nlwps_lock: This is a per-zone lock used to protect the fields 158 * related to the zone.max-lwps rctl. 159 * zone_mem_lock: This is a per-zone lock used to protect the fields 160 * related to the zone.max-locked-memory and zone.max-swap rctls. 161 * zsd_key_lock: This is a global lock protecting the key state for ZSD. 162 * zone_deathrow_lock: This is a global lock protecting the "deathrow" 163 * list (a list of zones in the ZONE_IS_DEAD state). 
164 * 165 * Ordering requirements: 166 * pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock --> 167 * zone_lock --> zsd_key_lock --> pidlock --> p_lock 168 * 169 * When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is: 170 * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock 171 * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock 172 * 173 * Blocking memory allocations are permitted while holding any of the 174 * zone locks. 175 * 176 * 177 * System Call Interface: 178 * 179 * The zone subsystem can be managed and queried from user level with 180 * the following system calls (all subcodes of the primary "zone" 181 * system call): 182 * - zone_create: creates a zone with selected attributes (name, 183 * root path, privileges, resource controls, ZFS datasets) 184 * - zone_enter: allows the current process to enter a zone 185 * - zone_getattr: reports attributes of a zone 186 * - zone_setattr: set attributes of a zone 187 * - zone_boot: set 'init' running for the zone 188 * - zone_list: lists all zones active in the system 189 * - zone_lookup: looks up zone id based on name 190 * - zone_shutdown: initiates shutdown process (see states above) 191 * - zone_destroy: completes shutdown process (see states above) 192 * 193 */ 194 195 #include <sys/priv_impl.h> 196 #include <sys/cred.h> 197 #include <c2/audit.h> 198 #include <sys/debug.h> 199 #include <sys/file.h> 200 #include <sys/kmem.h> 201 #include <sys/kstat.h> 202 #include <sys/mutex.h> 203 #include <sys/note.h> 204 #include <sys/pathname.h> 205 #include <sys/proc.h> 206 #include <sys/project.h> 207 #include <sys/sysevent.h> 208 #include <sys/task.h> 209 #include <sys/systm.h> 210 #include <sys/types.h> 211 #include <sys/utsname.h> 212 #include <sys/vnode.h> 213 #include <sys/vfs.h> 214 #include <sys/systeminfo.h> 215 #include <sys/policy.h> 216 #include <sys/cred_impl.h> 217 #include <sys/contract_impl.h> 218 #include <sys/contract/process_impl.h> 219 #include 
<sys/class.h> 220 #include <sys/pool.h> 221 #include <sys/pool_pset.h> 222 #include <sys/pset.h> 223 #include <sys/sysmacros.h> 224 #include <sys/callb.h> 225 #include <sys/vmparam.h> 226 #include <sys/corectl.h> 227 #include <sys/ipc_impl.h> 228 229 #include <sys/door.h> 230 #include <sys/cpuvar.h> 231 232 #include <sys/uadmin.h> 233 #include <sys/session.h> 234 #include <sys/cmn_err.h> 235 #include <sys/modhash.h> 236 #include <sys/sunddi.h> 237 #include <sys/nvpair.h> 238 #include <sys/rctl.h> 239 #include <sys/fss.h> 240 #include <sys/brand.h> 241 #include <sys/zone.h> 242 #include <net/if.h> 243 #include <vm/seg.h> 244 245 /* 246 * cv used to signal that all references to the zone have been released. This 247 * needs to be global since there may be multiple waiters, and the first to 248 * wake up will free the zone_t, hence we cannot use zone->zone_cv. 249 */ 250 static kcondvar_t zone_destroy_cv; 251 /* 252 * Lock used to serialize access to zone_cv. This could have been per-zone, 253 * but then we'd need another lock for zone_destroy_cv, and why bother? 254 */ 255 static kmutex_t zone_status_lock; 256 257 /* 258 * ZSD-related global variables. 259 */ 260 static kmutex_t zsd_key_lock; /* protects the following two */ 261 /* 262 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval. 263 */ 264 static zone_key_t zsd_keyval = 0; 265 /* 266 * Global list of registered keys. We use this when a new zone is created. 267 */ 268 static list_t zsd_registered_keys; 269 270 int zone_hash_size = 256; 271 static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel; 272 static kmutex_t zonehash_lock; 273 static uint_t zonecount; 274 static id_space_t *zoneid_space; 275 276 /* 277 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the 278 * kernel proper runs, and which manages all other zones. 
279 * 280 * Although not declared as static, the variable "zone0" should not be used 281 * except for by code that needs to reference the global zone early on in boot, 282 * before it is fully initialized. All other consumers should use 283 * 'global_zone'. 284 */ 285 zone_t zone0; 286 zone_t *global_zone = NULL; /* Set when the global zone is initialized */ 287 288 /* 289 * List of active zones, protected by zonehash_lock. 290 */ 291 static list_t zone_active; 292 293 /* 294 * List of destroyed zones that still have outstanding cred references. 295 * Used for debugging. Uses a separate lock to avoid lock ordering 296 * problems in zone_free. 297 */ 298 static list_t zone_deathrow; 299 static kmutex_t zone_deathrow_lock; 300 301 /* number of zones is limited by virtual interface limit in IP */ 302 uint_t maxzones = 8192; 303 304 /* Event channel used to send zone state change notifications */ 305 evchan_t *zone_event_chan; 306 307 /* 308 * This table holds the mapping from kernel zone states to 309 * states visible in the state notification API. 310 * The idea is that we only expose "obvious" states and 311 * do not expose states which are just implementation details. 312 */ 313 const char *zone_status_table[] = { 314 ZONE_EVENT_UNINITIALIZED, /* uninitialized */ 315 ZONE_EVENT_READY, /* ready */ 316 ZONE_EVENT_READY, /* booting */ 317 ZONE_EVENT_RUNNING, /* running */ 318 ZONE_EVENT_SHUTTING_DOWN, /* shutting_down */ 319 ZONE_EVENT_SHUTTING_DOWN, /* empty */ 320 ZONE_EVENT_SHUTTING_DOWN, /* down */ 321 ZONE_EVENT_SHUTTING_DOWN, /* dying */ 322 ZONE_EVENT_UNINITIALIZED, /* dead */ 323 }; 324 325 /* 326 * This isn't static so lint doesn't complain. 
327 */ 328 rctl_hndl_t rc_zone_cpu_shares; 329 rctl_hndl_t rc_zone_locked_mem; 330 rctl_hndl_t rc_zone_max_swap; 331 rctl_hndl_t rc_zone_nlwps; 332 rctl_hndl_t rc_zone_shmmax; 333 rctl_hndl_t rc_zone_shmmni; 334 rctl_hndl_t rc_zone_semmni; 335 rctl_hndl_t rc_zone_msgmni; 336 /* 337 * Synchronization primitives used to synchronize between mounts and zone 338 * creation/destruction. mounts_in_progress is positive while mounts are in progress and negative while block_mounts() callers are holding mounts off; it is read and written, and mount_cv is waited on, with mount_lock held. 339 */ 340 static int mounts_in_progress; 341 static kcondvar_t mount_cv; 342 static kmutex_t mount_lock; 343 344 const char * const zone_default_initname = "/sbin/init"; 345 static char * const zone_prefix = "/zone/"; 346 static int zone_shutdown(zoneid_t zoneid); 347 static int zone_add_datalink(zoneid_t, char *); 348 static int zone_remove_datalink(zoneid_t, char *); 349 static int zone_check_datalink(zoneid_t *, char *); 350 static int zone_list_datalink(zoneid_t, int *, char *); 351 352 /* 353 * Bump this number when you alter the zone syscall interfaces; this is 354 * because we need to have support for previous API versions in libc 355 * to support patching; libc calls into the kernel to determine this number. 356 * 357 * Version 1 of the API is the version originally shipped with Solaris 10 358 * Version 2 alters the zone_create system call in order to support more 359 * arguments by moving the args into a structure; and to do better 360 * error reporting when zone_create() fails. 361 * Version 3 alters the zone_create system call in order to support the 362 * import of ZFS datasets to zones. 363 * Version 4 alters the zone_create system call in order to support 364 * Trusted Extensions. 365 * Version 5 alters the zone_boot system call, and converts its old 366 * bootargs parameter to be set by the zone_setattr API instead. 367 * Version 6 adds the flag argument to zone_create. 
368 */ 369 static const int ZONE_SYSCALL_API_VERSION = 6; 370 371 /* 372 * Certain filesystems (such as NFS and autofs) need to know which zone 373 * the mount is being placed in. 
Because of this, we need to be able to 374 * ensure that a zone isn't in the process of being created such that 375 * nfs_mount() thinks it is in the global zone, while by the time it 376 * gets added the list of mounted zones, it ends up on zoneA's mount 377 * list. 378 * 379 * The following functions: block_mounts()/resume_mounts() and 380 * mount_in_progress()/mount_completed() are used by zones and the VFS 381 * layer (respectively) to synchronize zone creation and new mounts. 382 * 383 * The semantics are like a reader-reader lock such that there may 384 * either be multiple mounts (or zone creations, if that weren't 385 * serialized by zonehash_lock) in progress at the same time, but not 386 * both. 387 * 388 * We use cv's so the user can ctrl-C out of the operation if it's 389 * taking too long. 390 * 391 * The semantics are such that there is unfair bias towards the 392 * "current" operation. This means that zone creations may starve if 393 * there is a rapid succession of new mounts coming in to the system, or 394 * there is a remote possibility that zones will be created at such a 395 * rate that new mounts will not be able to proceed. 396 */ 397 /* 398 * Prevent new mounts from progressing to the point of calling 399 * VFS_MOUNT(). If there are already mounts in this "region", wait for 400 * them to complete. 401 */ 402 static int 403 block_mounts(void) 404 { 405 int retval = 0; 406 407 /* 408 * Since it may block for a long time, block_mounts() shouldn't be 409 * called with zonehash_lock held. 410 */ 411 ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); 412 mutex_enter(&mount_lock); 413 while (mounts_in_progress > 0) { 414 if (cv_wait_sig(&mount_cv, &mount_lock) == 0) 415 goto signaled; 416 } 417 /* 418 * A negative value of mounts_in_progress indicates that mounts 419 * have been blocked by (-mounts_in_progress) different callers. 
420 */ 421 mounts_in_progress--; 422 retval = 1; 423 signaled: 424 mutex_exit(&mount_lock); 425 return (retval); 426 } 427 428 /* 429 * The VFS layer may progress with new mounts as far as we're concerned. 430 * Allow them to progress if we were the last obstacle. 431 */ 432 static void 433 resume_mounts(void) 434 { 435 mutex_enter(&mount_lock); 436 if (++mounts_in_progress == 0) 437 cv_broadcast(&mount_cv); 438 mutex_exit(&mount_lock); 439 } 440 441 /* 442 * The VFS layer is busy with a mount; zones should wait until all 443 * mounts are completed to progress. 444 */ 445 void 446 mount_in_progress(void) 447 { 448 mutex_enter(&mount_lock); 449 while (mounts_in_progress < 0) 450 cv_wait(&mount_cv, &mount_lock); 451 mounts_in_progress++; 452 mutex_exit(&mount_lock); 453 } 454 455 /* 456 * VFS is done with one mount; wake up any waiting block_mounts() 457 * callers if this is the last mount. 458 */ 459 void 460 mount_completed(void) 461 { 462 mutex_enter(&mount_lock); 463 if (--mounts_in_progress == 0) 464 cv_broadcast(&mount_cv); 465 mutex_exit(&mount_lock); 466 } 467 468 /* 469 * ZSD routines. 470 * 471 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as 472 * defined by the pthread_key_create() and related interfaces. 473 * 474 * Kernel subsystems may register one or more data items and/or 475 * callbacks to be executed when a zone is created, shutdown, or 476 * destroyed. 477 * 478 * Unlike the thread counterpart, destructor callbacks will be executed 479 * even if the data pointer is NULL and/or there are no constructor 480 * callbacks, so it is the responsibility of such callbacks to check for 481 * NULL data values if necessary. 482 * 483 * The locking strategy and overall picture is as follows: 484 * 485 * When someone calls zone_key_create(), a template ZSD entry is added to the 486 * global list "zsd_registered_keys", protected by zsd_key_lock. 
The 487 * constructor callback is called immediately on all existing zones, and a 488 * copy of the ZSD entry added to the per-zone zone_zsd list (protected by 489 * zone_lock). As this operation requires the list of zones, the list of 490 * registered keys, and the per-zone list of ZSD entries to remain constant 491 * throughout the entire operation, it must grab zonehash_lock, zone_lock for 492 * all existing zones, and zsd_key_lock, in that order. Similar locking is 493 * needed when zone_key_delete() is called. It is thus sufficient to hold 494 * zsd_key_lock *or* zone_lock to prevent additions to or removals from the 495 * per-zone zone_zsd list. 496 * 497 * Note that this implementation does not make a copy of the ZSD entry if a 498 * constructor callback is not provided. A zone_getspecific() on such an 499 * uninitialized ZSD entry will return NULL. 500 * 501 * When new zones are created constructor callbacks for all registered ZSD 502 * entries will be called. 503 * 504 * The framework does not provide any locking around zone_getspecific() and 505 * zone_setspecific() apart from that needed for internal consistency, so 506 * callers interested in atomic "test-and-set" semantics will need to provide 507 * their own locking. 
508 */ 509 void 510 zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t), 511 void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *)) 512 { 513 struct zsd_entry *zsdp; 514 struct zsd_entry *t; 515 struct zone *zone; 516 517 zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP); 518 zsdp->zsd_data = NULL; 519 zsdp->zsd_create = create; 520 zsdp->zsd_shutdown = shutdown; 521 zsdp->zsd_destroy = destroy; 522 523 mutex_enter(&zonehash_lock); /* stop the world */ 524 for (zone = list_head(&zone_active); zone != NULL; 525 zone = list_next(&zone_active, zone)) 526 mutex_enter(&zone->zone_lock); /* lock all zones */ 527 528 mutex_enter(&zsd_key_lock); 529 *keyp = zsdp->zsd_key = ++zsd_keyval; 530 ASSERT(zsd_keyval != 0); 531 list_insert_tail(&zsd_registered_keys, zsdp); 532 mutex_exit(&zsd_key_lock); 533 534 if (create != NULL) { 535 for (zone = list_head(&zone_active); zone != NULL; 536 zone = list_next(&zone_active, zone)) { 537 t = kmem_alloc(sizeof (*t), KM_SLEEP); 538 t->zsd_key = *keyp; 539 t->zsd_data = (*create)(zone->zone_id); 540 t->zsd_create = create; 541 t->zsd_shutdown = shutdown; 542 t->zsd_destroy = destroy; 543 list_insert_tail(&zone->zone_zsd, t); 544 } 545 } 546 for (zone = list_head(&zone_active); zone != NULL; 547 zone = list_next(&zone_active, zone)) 548 mutex_exit(&zone->zone_lock); 549 mutex_exit(&zonehash_lock); 550 } 551 552 /* 553 * Helper function to find the zsd_entry associated with the key in the 554 * given list. 555 */ 556 static struct zsd_entry * 557 zsd_find(list_t *l, zone_key_t key) 558 { 559 struct zsd_entry *zsd; 560 561 for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) { 562 if (zsd->zsd_key == key) { 563 /* 564 * Move to head of list to keep list in MRU order. 
565 */ 566 if (zsd != list_head(l)) { 567 list_remove(l, zsd); 568 list_insert_head(l, zsd); 569 } 570 return (zsd); 571 } 572 } 573 return (NULL); 574 } 575 576 /* 577 * Function called when a module is being unloaded, or otherwise wishes 578 * to unregister its ZSD key and callbacks. 579 */ 580 int 581 zone_key_delete(zone_key_t key) 582 { 583 struct zsd_entry *zsdp = NULL; 584 zone_t *zone; 585 586 mutex_enter(&zonehash_lock); /* Zone create/delete waits for us */ 587 for (zone = list_head(&zone_active); zone != NULL; 588 zone = list_next(&zone_active, zone)) 589 mutex_enter(&zone->zone_lock); /* lock all zones */ 590 591 mutex_enter(&zsd_key_lock); 592 zsdp = zsd_find(&zsd_registered_keys, key); 593 if (zsdp == NULL) 594 goto notfound; 595 list_remove(&zsd_registered_keys, zsdp); 596 mutex_exit(&zsd_key_lock); 597 598 for (zone = list_head(&zone_active); zone != NULL; 599 zone = list_next(&zone_active, zone)) { 600 struct zsd_entry *del; 601 void *data; 602 603 if (!(zone->zone_flags & ZF_DESTROYED)) { 604 del = zsd_find(&zone->zone_zsd, key); 605 if (del != NULL) { 606 data = del->zsd_data; 607 ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown); 608 ASSERT(del->zsd_destroy == zsdp->zsd_destroy); 609 list_remove(&zone->zone_zsd, del); 610 kmem_free(del, sizeof (*del)); 611 } else { 612 data = NULL; 613 } 614 if (zsdp->zsd_shutdown) 615 zsdp->zsd_shutdown(zone->zone_id, data); 616 if (zsdp->zsd_destroy) 617 zsdp->zsd_destroy(zone->zone_id, data); 618 } 619 mutex_exit(&zone->zone_lock); 620 } 621 mutex_exit(&zonehash_lock); 622 kmem_free(zsdp, sizeof (*zsdp)); 623 return (0); 624 625 notfound: 626 mutex_exit(&zsd_key_lock); 627 for (zone = list_head(&zone_active); zone != NULL; 628 zone = list_next(&zone_active, zone)) 629 mutex_exit(&zone->zone_lock); 630 mutex_exit(&zonehash_lock); 631 return (-1); 632 } 633 634 /* 635 * ZSD counterpart of pthread_setspecific(). 
636 */ 637 int 638 zone_setspecific(zone_key_t key, zone_t *zone, const void *data) 639 { 640 struct zsd_entry *t; 641 struct zsd_entry *zsdp = NULL; 642 643 mutex_enter(&zone->zone_lock); 644 t = zsd_find(&zone->zone_zsd, key); 645 if (t != NULL) { 646 /* 647 * Replace old value with new 648 */ 649 t->zsd_data = (void *)data; 650 mutex_exit(&zone->zone_lock); 651 return (0); 652 } 653 /* 654 * If there was no previous value, go through the list of registered 655 * keys. 656 * 657 * We avoid grabbing zsd_key_lock until we are sure we need it; this is 658 * necessary for shutdown callbacks to be able to execute without fear 659 * of deadlock. 660 */ 661 mutex_enter(&zsd_key_lock); 662 zsdp = zsd_find(&zsd_registered_keys, key); 663 if (zsdp == NULL) { /* Key was not registered */ 664 mutex_exit(&zsd_key_lock); 665 mutex_exit(&zone->zone_lock); 666 return (-1); 667 } 668 669 /* 670 * Add a zsd_entry to this zone, using the template we just retrieved 671 * to initialize the constructor and destructor(s). 672 */ 673 t = kmem_alloc(sizeof (*t), KM_SLEEP); 674 t->zsd_key = key; 675 t->zsd_data = (void *)data; 676 t->zsd_create = zsdp->zsd_create; 677 t->zsd_shutdown = zsdp->zsd_shutdown; 678 t->zsd_destroy = zsdp->zsd_destroy; 679 list_insert_tail(&zone->zone_zsd, t); 680 mutex_exit(&zsd_key_lock); 681 mutex_exit(&zone->zone_lock); 682 return (0); 683 } 684 685 /* 686 * ZSD counterpart of pthread_getspecific(). 687 */ 688 void * 689 zone_getspecific(zone_key_t key, zone_t *zone) 690 { 691 struct zsd_entry *t; 692 void *data; 693 694 mutex_enter(&zone->zone_lock); 695 t = zsd_find(&zone->zone_zsd, key); 696 data = (t == NULL ? NULL : t->zsd_data); 697 mutex_exit(&zone->zone_lock); 698 return (data); 699 } 700 701 /* 702 * Function used to initialize a zone's list of ZSD callbacks and data 703 * when the zone is being created. The callbacks are initialized from 704 * the template list (zsd_registered_keys), and the constructor 705 * callback executed (if one exists). 
706 * 707 * This is called before the zone is made publicly available, hence no 708 * need to grab zone_lock. 709 * 710 * Although we grab and release zsd_key_lock, new entries cannot be 711 * added to or removed from the zsd_registered_keys list until we 712 * release zonehash_lock, so there isn't a window for a 713 * zone_key_create() to come in after we've dropped zsd_key_lock but 714 * before the zone is added to the zone list, such that the constructor 715 * callbacks aren't executed for the new zone. 716 */ 717 static void 718 zone_zsd_configure(zone_t *zone) 719 { 720 struct zsd_entry *zsdp; 721 struct zsd_entry *t; 722 zoneid_t zoneid = zone->zone_id; 723 724 ASSERT(MUTEX_HELD(&zonehash_lock)); 725 ASSERT(list_head(&zone->zone_zsd) == NULL); 726 mutex_enter(&zsd_key_lock); 727 for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL; 728 zsdp = list_next(&zsd_registered_keys, zsdp)) { 729 if (zsdp->zsd_create != NULL) { 730 t = kmem_alloc(sizeof (*t), KM_SLEEP); 731 t->zsd_key = zsdp->zsd_key; 732 t->zsd_create = zsdp->zsd_create; 733 t->zsd_data = (*t->zsd_create)(zoneid); 734 t->zsd_shutdown = zsdp->zsd_shutdown; 735 t->zsd_destroy = zsdp->zsd_destroy; 736 list_insert_tail(&zone->zone_zsd, t); 737 } 738 } 739 mutex_exit(&zsd_key_lock); 740 } 741 742 enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY }; 743 744 /* 745 * Helper function to execute shutdown or destructor callbacks. 746 */ 747 static void 748 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct) 749 { 750 struct zsd_entry *zsdp; 751 struct zsd_entry *t; 752 zoneid_t zoneid = zone->zone_id; 753 754 ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY); 755 ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY); 756 ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN); 757 758 mutex_enter(&zone->zone_lock); 759 if (ct == ZSD_DESTROY) { 760 if (zone->zone_flags & ZF_DESTROYED) { 761 /* 762 * Make sure destructors are only called once. 
763 */ 764 mutex_exit(&zone->zone_lock); 765 return; 766 } 767 zone->zone_flags |= ZF_DESTROYED; 768 } 769 mutex_exit(&zone->zone_lock); 770 771 /* 772 * Both zsd_key_lock and zone_lock need to be held in order to add or 773 * remove a ZSD key, (either globally as part of 774 * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is 775 * possible through zone_setspecific()), so it's sufficient to hold 776 * zsd_key_lock here. 777 * 778 * This is a good thing, since we don't want to recursively try to grab 779 * zone_lock if a callback attempts to do something like a crfree() or 780 * zone_rele(). 781 */ 782 mutex_enter(&zsd_key_lock); 783 for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL; 784 zsdp = list_next(&zsd_registered_keys, zsdp)) { 785 zone_key_t key = zsdp->zsd_key; 786 787 /* Skip if no callbacks registered */ 788 if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL) 789 continue; 790 if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL) 791 continue; 792 /* 793 * Call the callback with the zone-specific data if we can find 794 * any, otherwise with NULL. 795 */ 796 t = zsd_find(&zone->zone_zsd, key); 797 if (t != NULL) { 798 if (ct == ZSD_SHUTDOWN) { 799 t->zsd_shutdown(zoneid, t->zsd_data); 800 } else { 801 ASSERT(ct == ZSD_DESTROY); 802 t->zsd_destroy(zoneid, t->zsd_data); 803 } 804 } else { 805 if (ct == ZSD_SHUTDOWN) { 806 zsdp->zsd_shutdown(zoneid, NULL); 807 } else { 808 ASSERT(ct == ZSD_DESTROY); 809 zsdp->zsd_destroy(zoneid, NULL); 810 } 811 } 812 } 813 mutex_exit(&zsd_key_lock); 814 } 815 816 /* 817 * Called when the zone is going away; free ZSD-related memory, and 818 * destroy the zone_zsd list. 819 */ 820 static void 821 zone_free_zsd(zone_t *zone) 822 { 823 struct zsd_entry *t, *next; 824 825 /* 826 * Free all the zsd_entry's we had on this zone. 
827 */ 828 for (t = list_head(&zone->zone_zsd); t != NULL; t = next) { 829 next = list_next(&zone->zone_zsd, t); 830 list_remove(&zone->zone_zsd, t); 831 kmem_free(t, sizeof (*t)); 832 } 833 list_destroy(&zone->zone_zsd); 834 } 835 836 /* 837 * Frees memory associated with the zone dataset list. 838 */ 839 static void 840 zone_free_datasets(zone_t *zone) 841 { 842 zone_dataset_t *t, *next; 843 844 for (t = list_head(&zone->zone_datasets); t != NULL; t = next) { 845 next = list_next(&zone->zone_datasets, t); 846 list_remove(&zone->zone_datasets, t); 847 kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1); 848 kmem_free(t, sizeof (*t)); 849 } 850 list_destroy(&zone->zone_datasets); 851 } 852 853 /* 854 * zone.cpu-shares resource control support. 855 */ 856 /*ARGSUSED*/ 857 static rctl_qty_t 858 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p) 859 { 860 ASSERT(MUTEX_HELD(&p->p_lock)); 861 return (p->p_zone->zone_shares); 862 } 863 864 /*ARGSUSED*/ 865 static int 866 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, 867 rctl_qty_t nv) 868 { 869 ASSERT(MUTEX_HELD(&p->p_lock)); 870 ASSERT(e->rcep_t == RCENTITY_ZONE); 871 if (e->rcep_p.zone == NULL) 872 return (0); 873 874 e->rcep_p.zone->zone_shares = nv; 875 return (0); 876 } 877 878 static rctl_ops_t zone_cpu_shares_ops = { 879 rcop_no_action, 880 zone_cpu_shares_usage, 881 zone_cpu_shares_set, 882 rcop_no_test 883 }; 884 885 /*ARGSUSED*/ 886 static rctl_qty_t 887 zone_lwps_usage(rctl_t *r, proc_t *p) 888 { 889 rctl_qty_t nlwps; 890 zone_t *zone = p->p_zone; 891 892 ASSERT(MUTEX_HELD(&p->p_lock)); 893 894 mutex_enter(&zone->zone_nlwps_lock); 895 nlwps = zone->zone_nlwps; 896 mutex_exit(&zone->zone_nlwps_lock); 897 898 return (nlwps); 899 } 900 901 /*ARGSUSED*/ 902 static int 903 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl, 904 rctl_qty_t incr, uint_t flags) 905 { 906 rctl_qty_t nlwps; 907 908 ASSERT(MUTEX_HELD(&p->p_lock)); 909 ASSERT(e->rcep_t == RCENTITY_ZONE); 
910 if (e->rcep_p.zone == NULL) 911 return (0); 912 ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock))); 913 nlwps = e->rcep_p.zone->zone_nlwps; 914 915 if (nlwps + incr > rcntl->rcv_value) 916 return (1); 917 918 return (0); 919 } 920 921 /*ARGSUSED*/ 922 static int 923 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv) 924 { 925 ASSERT(MUTEX_HELD(&p->p_lock)); 926 ASSERT(e->rcep_t == RCENTITY_ZONE); 927 if (e->rcep_p.zone == NULL) 928 return (0); 929 e->rcep_p.zone->zone_nlwps_ctl = nv; 930 return (0); 931 } 932 933 static rctl_ops_t zone_lwps_ops = { 934 rcop_no_action, 935 zone_lwps_usage, 936 zone_lwps_set, 937 zone_lwps_test, 938 }; 939 940 /*ARGSUSED*/ 941 static int 942 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, 943 rctl_qty_t incr, uint_t flags) 944 { 945 rctl_qty_t v; 946 ASSERT(MUTEX_HELD(&p->p_lock)); 947 ASSERT(e->rcep_t == RCENTITY_ZONE); 948 v = e->rcep_p.zone->zone_shmmax + incr; 949 if (v > rval->rcv_value) 950 return (1); 951 return (0); 952 } 953 954 static rctl_ops_t zone_shmmax_ops = { 955 rcop_no_action, 956 rcop_no_usage, 957 rcop_no_set, 958 zone_shmmax_test 959 }; 960 961 /*ARGSUSED*/ 962 static int 963 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, 964 rctl_qty_t incr, uint_t flags) 965 { 966 rctl_qty_t v; 967 ASSERT(MUTEX_HELD(&p->p_lock)); 968 ASSERT(e->rcep_t == RCENTITY_ZONE); 969 v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr; 970 if (v > rval->rcv_value) 971 return (1); 972 return (0); 973 } 974 975 static rctl_ops_t zone_shmmni_ops = { 976 rcop_no_action, 977 rcop_no_usage, 978 rcop_no_set, 979 zone_shmmni_test 980 }; 981 982 /*ARGSUSED*/ 983 static int 984 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, 985 rctl_qty_t incr, uint_t flags) 986 { 987 rctl_qty_t v; 988 ASSERT(MUTEX_HELD(&p->p_lock)); 989 ASSERT(e->rcep_t == RCENTITY_ZONE); 990 v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr; 991 if (v > 
rval->rcv_value) 992 return (1); 993 return (0); 994 } 995 996 static rctl_ops_t zone_semmni_ops = { 997 rcop_no_action, 998 rcop_no_usage, 999 rcop_no_set, 1000 zone_semmni_test 1001 }; 1002 1003 /*ARGSUSED*/ 1004 static int 1005 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, 1006 rctl_qty_t incr, uint_t flags) 1007 { 1008 rctl_qty_t v; 1009 ASSERT(MUTEX_HELD(&p->p_lock)); 1010 ASSERT(e->rcep_t == RCENTITY_ZONE); 1011 v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr; 1012 if (v > rval->rcv_value) 1013 return (1); 1014 return (0); 1015 } 1016 1017 static rctl_ops_t zone_msgmni_ops = { 1018 rcop_no_action, 1019 rcop_no_usage, 1020 rcop_no_set, 1021 zone_msgmni_test 1022 }; 1023 1024 /*ARGSUSED*/ 1025 static rctl_qty_t 1026 zone_locked_mem_usage(rctl_t *rctl, struct proc *p) 1027 { 1028 rctl_qty_t q; 1029 ASSERT(MUTEX_HELD(&p->p_lock)); 1030 mutex_enter(&p->p_zone->zone_mem_lock); 1031 q = p->p_zone->zone_locked_mem; 1032 mutex_exit(&p->p_zone->zone_mem_lock); 1033 return (q); 1034 } 1035 1036 /*ARGSUSED*/ 1037 static int 1038 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, 1039 rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags) 1040 { 1041 rctl_qty_t q; 1042 zone_t *z; 1043 1044 z = e->rcep_p.zone; 1045 ASSERT(MUTEX_HELD(&p->p_lock)); 1046 ASSERT(MUTEX_HELD(&z->zone_mem_lock)); 1047 q = z->zone_locked_mem; 1048 if (q + incr > rcntl->rcv_value) 1049 return (1); 1050 return (0); 1051 } 1052 1053 /*ARGSUSED*/ 1054 static int 1055 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, 1056 rctl_qty_t nv) 1057 { 1058 ASSERT(MUTEX_HELD(&p->p_lock)); 1059 ASSERT(e->rcep_t == RCENTITY_ZONE); 1060 if (e->rcep_p.zone == NULL) 1061 return (0); 1062 e->rcep_p.zone->zone_locked_mem_ctl = nv; 1063 return (0); 1064 } 1065 1066 static rctl_ops_t zone_locked_mem_ops = { 1067 rcop_no_action, 1068 zone_locked_mem_usage, 1069 zone_locked_mem_set, 1070 zone_locked_mem_test 1071 }; 1072 1073 /*ARGSUSED*/ 1074 static rctl_qty_t 
1075 zone_max_swap_usage(rctl_t *rctl, struct proc *p) 1076 { 1077 rctl_qty_t q; 1078 zone_t *z = p->p_zone; 1079 1080 ASSERT(MUTEX_HELD(&p->p_lock)); 1081 mutex_enter(&z->zone_mem_lock); 1082 q = z->zone_max_swap; 1083 mutex_exit(&z->zone_mem_lock); 1084 return (q); 1085 } 1086 1087 /*ARGSUSED*/ 1088 static int 1089 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, 1090 rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags) 1091 { 1092 rctl_qty_t q; 1093 zone_t *z; 1094 1095 z = e->rcep_p.zone; 1096 ASSERT(MUTEX_HELD(&p->p_lock)); 1097 ASSERT(MUTEX_HELD(&z->zone_mem_lock)); 1098 q = z->zone_max_swap; 1099 if (q + incr > rcntl->rcv_value) 1100 return (1); 1101 return (0); 1102 } 1103 1104 /*ARGSUSED*/ 1105 static int 1106 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, 1107 rctl_qty_t nv) 1108 { 1109 ASSERT(MUTEX_HELD(&p->p_lock)); 1110 ASSERT(e->rcep_t == RCENTITY_ZONE); 1111 if (e->rcep_p.zone == NULL) 1112 return (0); 1113 e->rcep_p.zone->zone_max_swap_ctl = nv; 1114 return (0); 1115 } 1116 1117 static rctl_ops_t zone_max_swap_ops = { 1118 rcop_no_action, 1119 zone_max_swap_usage, 1120 zone_max_swap_set, 1121 zone_max_swap_test 1122 }; 1123 1124 /* 1125 * Helper function to brand the zone with a unique ID. 1126 */ 1127 static void 1128 zone_uniqid(zone_t *zone) 1129 { 1130 static uint64_t uniqid = 0; 1131 1132 ASSERT(MUTEX_HELD(&zonehash_lock)); 1133 zone->zone_uniqid = uniqid++; 1134 } 1135 1136 /* 1137 * Returns a held pointer to the "kcred" for the specified zone. 
1138 */ 1139 struct cred * 1140 zone_get_kcred(zoneid_t zoneid) 1141 { 1142 zone_t *zone; 1143 cred_t *cr; 1144 1145 if ((zone = zone_find_by_id(zoneid)) == NULL) 1146 return (NULL); 1147 cr = zone->zone_kcred; 1148 crhold(cr); 1149 zone_rele(zone); 1150 return (cr); 1151 } 1152 1153 static int 1154 zone_lockedmem_kstat_update(kstat_t *ksp, int rw) 1155 { 1156 zone_t *zone = ksp->ks_private; 1157 zone_kstat_t *zk = ksp->ks_data; 1158 1159 if (rw == KSTAT_WRITE) 1160 return (EACCES); 1161 1162 zk->zk_usage.value.ui64 = zone->zone_locked_mem; 1163 zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl; 1164 return (0); 1165 } 1166 1167 static int 1168 zone_swapresv_kstat_update(kstat_t *ksp, int rw) 1169 { 1170 zone_t *zone = ksp->ks_private; 1171 zone_kstat_t *zk = ksp->ks_data; 1172 1173 if (rw == KSTAT_WRITE) 1174 return (EACCES); 1175 1176 zk->zk_usage.value.ui64 = zone->zone_max_swap; 1177 zk->zk_value.value.ui64 = zone->zone_max_swap_ctl; 1178 return (0); 1179 } 1180 1181 static void 1182 zone_kstat_create(zone_t *zone) 1183 { 1184 kstat_t *ksp; 1185 zone_kstat_t *zk; 1186 1187 ksp = rctl_kstat_create_zone(zone, "lockedmem", KSTAT_TYPE_NAMED, 1188 sizeof (zone_kstat_t) / sizeof (kstat_named_t), 1189 KSTAT_FLAG_VIRTUAL); 1190 1191 if (ksp == NULL) 1192 return; 1193 1194 zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP); 1195 ksp->ks_data_size += strlen(zone->zone_name) + 1; 1196 kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING); 1197 kstat_named_setstr(&zk->zk_zonename, zone->zone_name); 1198 kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64); 1199 kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64); 1200 ksp->ks_update = zone_lockedmem_kstat_update; 1201 ksp->ks_private = zone; 1202 kstat_install(ksp); 1203 1204 zone->zone_lockedmem_kstat = ksp; 1205 1206 ksp = rctl_kstat_create_zone(zone, "swapresv", KSTAT_TYPE_NAMED, 1207 sizeof (zone_kstat_t) / sizeof (kstat_named_t), 1208 KSTAT_FLAG_VIRTUAL); 1209 1210 if (ksp == 
NULL) 1211 return; 1212 1213 zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP); 1214 ksp->ks_data_size += strlen(zone->zone_name) + 1; 1215 kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING); 1216 kstat_named_setstr(&zk->zk_zonename, zone->zone_name); 1217 kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64); 1218 kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64); 1219 ksp->ks_update = zone_swapresv_kstat_update; 1220 ksp->ks_private = zone; 1221 kstat_install(ksp); 1222 1223 zone->zone_swapresv_kstat = ksp; 1224 } 1225 1226 static void 1227 zone_kstat_delete(zone_t *zone) 1228 { 1229 void *data; 1230 1231 if (zone->zone_lockedmem_kstat != NULL) { 1232 data = zone->zone_lockedmem_kstat->ks_data; 1233 kstat_delete(zone->zone_lockedmem_kstat); 1234 kmem_free(data, sizeof (zone_kstat_t)); 1235 } 1236 if (zone->zone_swapresv_kstat != NULL) { 1237 data = zone->zone_swapresv_kstat->ks_data; 1238 kstat_delete(zone->zone_swapresv_kstat); 1239 kmem_free(data, sizeof (zone_kstat_t)); 1240 } 1241 } 1242 1243 /* 1244 * Called very early on in boot to initialize the ZSD list so that 1245 * zone_key_create() can be called before zone_init(). It also initializes 1246 * portions of zone0 which may be used before zone_init() is called. The 1247 * variable "global_zone" will be set when zone0 is fully initialized by 1248 * zone_init(). 
1249 */ 1250 void 1251 zone_zsd_init(void) 1252 { 1253 mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL); 1254 mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL); 1255 list_create(&zsd_registered_keys, sizeof (struct zsd_entry), 1256 offsetof(struct zsd_entry, zsd_linkage)); 1257 list_create(&zone_active, sizeof (zone_t), 1258 offsetof(zone_t, zone_linkage)); 1259 list_create(&zone_deathrow, sizeof (zone_t), 1260 offsetof(zone_t, zone_linkage)); 1261 1262 mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL); 1263 mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); 1264 mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); 1265 zone0.zone_shares = 1; 1266 zone0.zone_nlwps = 0; 1267 zone0.zone_nlwps_ctl = INT_MAX; 1268 zone0.zone_locked_mem = 0; 1269 zone0.zone_locked_mem_ctl = UINT64_MAX; 1270 ASSERT(zone0.zone_max_swap == 0); 1271 zone0.zone_max_swap_ctl = UINT64_MAX; 1272 zone0.zone_shmmax = 0; 1273 zone0.zone_ipc.ipcq_shmmni = 0; 1274 zone0.zone_ipc.ipcq_semmni = 0; 1275 zone0.zone_ipc.ipcq_msgmni = 0; 1276 zone0.zone_name = GLOBAL_ZONENAME; 1277 zone0.zone_nodename = utsname.nodename; 1278 zone0.zone_domain = srpc_domain; 1279 zone0.zone_ref = 1; 1280 zone0.zone_id = GLOBAL_ZONEID; 1281 zone0.zone_status = ZONE_IS_RUNNING; 1282 zone0.zone_rootpath = "/"; 1283 zone0.zone_rootpathlen = 2; 1284 zone0.zone_psetid = ZONE_PS_INVAL; 1285 zone0.zone_ncpus = 0; 1286 zone0.zone_ncpus_online = 0; 1287 zone0.zone_proc_initpid = 1; 1288 zone0.zone_initname = initname; 1289 zone0.zone_lockedmem_kstat = NULL; 1290 zone0.zone_swapresv_kstat = NULL; 1291 list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), 1292 offsetof(struct zsd_entry, zsd_linkage)); 1293 list_insert_head(&zone_active, &zone0); 1294 1295 /* 1296 * The root filesystem is not mounted yet, so zone_rootvp cannot be set 1297 * to anything meaningful. It is assigned to be 'rootdir' in 1298 * vfs_mountroot(). 
1299 */ 1300 zone0.zone_rootvp = NULL; 1301 zone0.zone_vfslist = NULL; 1302 zone0.zone_bootargs = initargs; 1303 zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP); 1304 /* 1305 * The global zone has all privileges 1306 */ 1307 priv_fillset(zone0.zone_privset); 1308 /* 1309 * Add p0 to the global zone 1310 */ 1311 zone0.zone_zsched = &p0; 1312 p0.p_zone = &zone0; 1313 } 1314 1315 /* 1316 * Compute a hash value based on the contents of the label and the DOI. The 1317 * hash algorithm is somewhat arbitrary, but is based on the observation that 1318 * humans will likely pick labels that differ by amounts that work out to be 1319 * multiples of the number of hash chains, and thus stirring in some primes 1320 * should help. 1321 */ 1322 static uint_t 1323 hash_bylabel(void *hdata, mod_hash_key_t key) 1324 { 1325 const ts_label_t *lab = (ts_label_t *)key; 1326 const uint32_t *up, *ue; 1327 uint_t hash; 1328 int i; 1329 1330 _NOTE(ARGUNUSED(hdata)); 1331 1332 hash = lab->tsl_doi + (lab->tsl_doi << 1); 1333 /* we depend on alignment of label, but not representation */ 1334 up = (const uint32_t *)&lab->tsl_label; 1335 ue = up + sizeof (lab->tsl_label) / sizeof (*up); 1336 i = 1; 1337 while (up < ue) { 1338 /* using 2^n + 1, 1 <= n <= 16 as source of many primes */ 1339 hash += *up + (*up << ((i % 16) + 1)); 1340 up++; 1341 i++; 1342 } 1343 return (hash); 1344 } 1345 1346 /* 1347 * All that mod_hash cares about here is zero (equal) versus non-zero (not 1348 * equal). This may need to be changed if less than / greater than is ever 1349 * needed. 1350 */ 1351 static int 1352 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1353 { 1354 ts_label_t *lab1 = (ts_label_t *)key1; 1355 ts_label_t *lab2 = (ts_label_t *)key2; 1356 1357 return (label_equal(lab1, lab2) ? 0 : 1); 1358 } 1359 1360 /* 1361 * Called by main() to initialize the zones framework. 
1362 */ 1363 void 1364 zone_init(void) 1365 { 1366 rctl_dict_entry_t *rde; 1367 rctl_val_t *dval; 1368 rctl_set_t *set; 1369 rctl_alloc_gp_t *gp; 1370 rctl_entity_p_t e; 1371 int res; 1372 1373 ASSERT(curproc == &p0); 1374 1375 /* 1376 * Create ID space for zone IDs. ID 0 is reserved for the 1377 * global zone. 1378 */ 1379 zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID); 1380 1381 /* 1382 * Initialize generic zone resource controls, if any. 1383 */ 1384 rc_zone_cpu_shares = rctl_register("zone.cpu-shares", 1385 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | 1386 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, 1387 FSS_MAXSHARES, FSS_MAXSHARES, 1388 &zone_cpu_shares_ops); 1389 1390 rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, 1391 RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, 1392 INT_MAX, INT_MAX, &zone_lwps_ops); 1393 /* 1394 * System V IPC resource controls 1395 */ 1396 rc_zone_msgmni = rctl_register("zone.max-msg-ids", 1397 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC | 1398 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops); 1399 1400 rc_zone_semmni = rctl_register("zone.max-sem-ids", 1401 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC | 1402 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops); 1403 1404 rc_zone_shmmni = rctl_register("zone.max-shm-ids", 1405 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC | 1406 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops); 1407 1408 rc_zone_shmmax = rctl_register("zone.max-shm-memory", 1409 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC | 1410 RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops); 1411 1412 /* 1413 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach 1414 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''. 
1415 */ 1416 dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); 1417 bzero(dval, sizeof (rctl_val_t)); 1418 dval->rcv_value = 1; 1419 dval->rcv_privilege = RCPRIV_PRIVILEGED; 1420 dval->rcv_flagaction = RCTL_LOCAL_NOACTION; 1421 dval->rcv_action_recip_pid = -1; 1422 1423 rde = rctl_dict_lookup("zone.cpu-shares"); 1424 (void) rctl_val_list_insert(&rde->rcd_default_value, dval); 1425 1426 rc_zone_locked_mem = rctl_register("zone.max-locked-memory", 1427 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | 1428 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, 1429 &zone_locked_mem_ops); 1430 1431 rc_zone_max_swap = rctl_register("zone.max-swap", 1432 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | 1433 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, 1434 &zone_max_swap_ops); 1435 1436 /* 1437 * Initialize the ``global zone''. 1438 */ 1439 set = rctl_set_create(); 1440 gp = rctl_set_init_prealloc(RCENTITY_ZONE); 1441 mutex_enter(&p0.p_lock); 1442 e.rcep_p.zone = &zone0; 1443 e.rcep_t = RCENTITY_ZONE; 1444 zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set, 1445 gp); 1446 1447 zone0.zone_nlwps = p0.p_lwpcnt; 1448 zone0.zone_ntasks = 1; 1449 mutex_exit(&p0.p_lock); 1450 zone0.zone_restart_init = B_TRUE; 1451 zone0.zone_brand = &native_brand; 1452 rctl_prealloc_destroy(gp); 1453 /* 1454 * pool_default hasn't been initialized yet, so we let pool_init() 1455 * take care of making sure the global zone is in the default pool. 1456 */ 1457 1458 /* 1459 * Initialize global zone kstats 1460 */ 1461 zone_kstat_create(&zone0); 1462 1463 /* 1464 * Initialize zone label. 1465 * mlp are initialized when tnzonecfg is loaded. 
1466 */ 1467 zone0.zone_slabel = l_admin_low; 1468 rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL); 1469 label_hold(l_admin_low); 1470 1471 mutex_enter(&zonehash_lock); 1472 zone_uniqid(&zone0); 1473 ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID); 1474 1475 zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size, 1476 mod_hash_null_valdtor); 1477 zonehashbyname = mod_hash_create_strhash("zone_by_name", 1478 zone_hash_size, mod_hash_null_valdtor); 1479 /* 1480 * maintain zonehashbylabel only for labeled systems 1481 */ 1482 if (is_system_labeled()) 1483 zonehashbylabel = mod_hash_create_extended("zone_by_label", 1484 zone_hash_size, mod_hash_null_keydtor, 1485 mod_hash_null_valdtor, hash_bylabel, NULL, 1486 hash_labelkey_cmp, KM_SLEEP); 1487 zonecount = 1; 1488 1489 (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID, 1490 (mod_hash_val_t)&zone0); 1491 (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name, 1492 (mod_hash_val_t)&zone0); 1493 if (is_system_labeled()) { 1494 zone0.zone_flags |= ZF_HASHED_LABEL; 1495 (void) mod_hash_insert(zonehashbylabel, 1496 (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0); 1497 } 1498 mutex_exit(&zonehash_lock); 1499 1500 /* 1501 * We avoid setting zone_kcred until now, since kcred is initialized 1502 * sometime after zone_zsd_init() and before zone_init(). 1503 */ 1504 zone0.zone_kcred = kcred; 1505 /* 1506 * The global zone is fully initialized (except for zone_rootvp which 1507 * will be set when the root filesystem is mounted). 
1508 */ 1509 global_zone = &zone0; 1510 1511 /* 1512 * Setup an event channel to send zone status change notifications on 1513 */ 1514 res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan, 1515 EVCH_CREAT); 1516 1517 if (res) 1518 panic("Sysevent_evc_bind failed during zone setup.\n"); 1519 1520 } 1521 1522 static void 1523 zone_free(zone_t *zone) 1524 { 1525 ASSERT(zone != global_zone); 1526 ASSERT(zone->zone_ntasks == 0); 1527 ASSERT(zone->zone_nlwps == 0); 1528 ASSERT(zone->zone_cred_ref == 0); 1529 ASSERT(zone->zone_kcred == NULL); 1530 ASSERT(zone_status_get(zone) == ZONE_IS_DEAD || 1531 zone_status_get(zone) == ZONE_IS_UNINITIALIZED); 1532 1533 /* remove from deathrow list */ 1534 if (zone_status_get(zone) == ZONE_IS_DEAD) { 1535 ASSERT(zone->zone_ref == 0); 1536 mutex_enter(&zone_deathrow_lock); 1537 list_remove(&zone_deathrow, zone); 1538 mutex_exit(&zone_deathrow_lock); 1539 } 1540 1541 zone_free_zsd(zone); 1542 zone_free_datasets(zone); 1543 1544 if (zone->zone_rootvp != NULL) 1545 VN_RELE(zone->zone_rootvp); 1546 if (zone->zone_rootpath) 1547 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen); 1548 if (zone->zone_name != NULL) 1549 kmem_free(zone->zone_name, ZONENAME_MAX); 1550 if (zone->zone_slabel != NULL) 1551 label_rele(zone->zone_slabel); 1552 if (zone->zone_nodename != NULL) 1553 kmem_free(zone->zone_nodename, _SYS_NMLN); 1554 if (zone->zone_domain != NULL) 1555 kmem_free(zone->zone_domain, _SYS_NMLN); 1556 if (zone->zone_privset != NULL) 1557 kmem_free(zone->zone_privset, sizeof (priv_set_t)); 1558 if (zone->zone_rctls != NULL) 1559 rctl_set_free(zone->zone_rctls); 1560 if (zone->zone_bootargs != NULL) 1561 kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1); 1562 if (zone->zone_initname != NULL) 1563 kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1); 1564 id_free(zoneid_space, zone->zone_id); 1565 mutex_destroy(&zone->zone_lock); 1566 cv_destroy(&zone->zone_cv); 1567 rw_destroy(&zone->zone_mlps.mlpl_rwlock); 
1568 kmem_free(zone, sizeof (zone_t)); 1569 } 1570 1571 /* 1572 * See block comment at the top of this file for information about zone 1573 * status values. 1574 */ 1575 /* 1576 * Convenience function for setting zone status. 1577 */ 1578 static void 1579 zone_status_set(zone_t *zone, zone_status_t status) 1580 { 1581 1582 nvlist_t *nvl = NULL; 1583 ASSERT(MUTEX_HELD(&zone_status_lock)); 1584 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && 1585 status >= zone_status_get(zone)); 1586 1587 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) || 1588 nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || 1589 nvlist_add_string(nvl, ZONE_CB_NEWSTATE, 1590 zone_status_table[status]) || 1591 nvlist_add_string(nvl, ZONE_CB_OLDSTATE, 1592 zone_status_table[zone->zone_status]) || 1593 nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || 1594 nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || 1595 sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS, 1596 ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) { 1597 #ifdef DEBUG 1598 (void) printf( 1599 "Failed to allocate and send zone state change event.\n"); 1600 #endif 1601 } 1602 nvlist_free(nvl); 1603 1604 zone->zone_status = status; 1605 1606 cv_broadcast(&zone->zone_cv); 1607 } 1608 1609 /* 1610 * Public function to retrieve the zone status. The zone status may 1611 * change after it is retrieved. 
1612 */ 1613 zone_status_t 1614 zone_status_get(zone_t *zone) 1615 { 1616 return (zone->zone_status); 1617 } 1618 1619 static int 1620 zone_set_bootargs(zone_t *zone, const char *zone_bootargs) 1621 { 1622 char *bootargs = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP); 1623 int err = 0; 1624 1625 ASSERT(zone != global_zone); 1626 if ((err = copyinstr(zone_bootargs, bootargs, BOOTARGS_MAX, NULL)) != 0) 1627 goto done; /* EFAULT or ENAMETOOLONG */ 1628 1629 if (zone->zone_bootargs != NULL) 1630 kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1); 1631 1632 zone->zone_bootargs = kmem_alloc(strlen(bootargs) + 1, KM_SLEEP); 1633 (void) strcpy(zone->zone_bootargs, bootargs); 1634 1635 done: 1636 kmem_free(bootargs, BOOTARGS_MAX); 1637 return (err); 1638 } 1639 1640 static int 1641 zone_set_initname(zone_t *zone, const char *zone_initname) 1642 { 1643 char initname[INITNAME_SZ]; 1644 size_t len; 1645 int err = 0; 1646 1647 ASSERT(zone != global_zone); 1648 if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0) 1649 return (err); /* EFAULT or ENAMETOOLONG */ 1650 1651 if (zone->zone_initname != NULL) 1652 kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1); 1653 1654 zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP); 1655 (void) strcpy(zone->zone_initname, initname); 1656 return (0); 1657 } 1658 1659 static int 1660 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) 1661 { 1662 uint64_t mcap; 1663 int err = 0; 1664 1665 if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) 1666 zone->zone_phys_mcap = mcap; 1667 1668 return (err); 1669 } 1670 1671 static int 1672 zone_set_sched_class(zone_t *zone, const char *new_class) 1673 { 1674 char sched_class[PC_CLNMSZ]; 1675 id_t classid; 1676 int err; 1677 1678 ASSERT(zone != global_zone); 1679 if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0) 1680 return (err); /* EFAULT or ENAMETOOLONG */ 1681 1682 if (getcid(sched_class, &classid) != 0 || classid == 
syscid) 1683 return (set_errno(EINVAL)); 1684 zone->zone_defaultcid = classid; 1685 ASSERT(zone->zone_defaultcid > 0 && 1686 zone->zone_defaultcid < loaded_classes); 1687 1688 return (0); 1689 } 1690 1691 /* 1692 * Block indefinitely waiting for (zone_status >= status) 1693 */ 1694 void 1695 zone_status_wait(zone_t *zone, zone_status_t status) 1696 { 1697 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); 1698 1699 mutex_enter(&zone_status_lock); 1700 while (zone->zone_status < status) { 1701 cv_wait(&zone->zone_cv, &zone_status_lock); 1702 } 1703 mutex_exit(&zone_status_lock); 1704 } 1705 1706 /* 1707 * Private CPR-safe version of zone_status_wait(). 1708 */ 1709 static void 1710 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str) 1711 { 1712 callb_cpr_t cprinfo; 1713 1714 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); 1715 1716 CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr, 1717 str); 1718 mutex_enter(&zone_status_lock); 1719 while (zone->zone_status < status) { 1720 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1721 cv_wait(&zone->zone_cv, &zone_status_lock); 1722 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock); 1723 } 1724 /* 1725 * zone_status_lock is implicitly released by the following. 1726 */ 1727 CALLB_CPR_EXIT(&cprinfo); 1728 } 1729 1730 /* 1731 * Block until zone enters requested state or signal is received. Return (0) 1732 * if signaled, non-zero otherwise. 1733 */ 1734 int 1735 zone_status_wait_sig(zone_t *zone, zone_status_t status) 1736 { 1737 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); 1738 1739 mutex_enter(&zone_status_lock); 1740 while (zone->zone_status < status) { 1741 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) { 1742 mutex_exit(&zone_status_lock); 1743 return (0); 1744 } 1745 } 1746 mutex_exit(&zone_status_lock); 1747 return (1); 1748 } 1749 1750 /* 1751 * Block until the zone enters the requested state or the timeout expires, 1752 * whichever happens first. 
Return (-1) if operation timed out, time remaining 1753 * otherwise. 1754 */ 1755 clock_t 1756 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status) 1757 { 1758 clock_t timeleft = 0; 1759 1760 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); 1761 1762 mutex_enter(&zone_status_lock); 1763 while (zone->zone_status < status && timeleft != -1) { 1764 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim); 1765 } 1766 mutex_exit(&zone_status_lock); 1767 return (timeleft); 1768 } 1769 1770 /* 1771 * Block until the zone enters the requested state, the current process is 1772 * signaled, or the timeout expires, whichever happens first. Return (-1) if 1773 * operation timed out, 0 if signaled, time remaining otherwise. 1774 */ 1775 clock_t 1776 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status) 1777 { 1778 clock_t timeleft = tim - lbolt; 1779 1780 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); 1781 1782 mutex_enter(&zone_status_lock); 1783 while (zone->zone_status < status) { 1784 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock, 1785 tim); 1786 if (timeleft <= 0) 1787 break; 1788 } 1789 mutex_exit(&zone_status_lock); 1790 return (timeleft); 1791 } 1792 1793 /* 1794 * Zones have two reference counts: one for references from credential 1795 * structures (zone_cred_ref), and one (zone_ref) for everything else. 1796 * This is so we can allow a zone to be rebooted while there are still 1797 * outstanding cred references, since certain drivers cache dblks (which 1798 * implicitly results in cached creds). We wait for zone_ref to drop to 1799 * 0 (actually 1), but not zone_cred_ref. The zone structure itself is 1800 * later freed when the zone_cred_ref drops to 0, though nothing other 1801 * than the zone id and privilege set should be accessed once the zone 1802 * is "dead". 
1803 * 1804 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value 1805 * to force halt/reboot to block waiting for the zone_cred_ref to drop 1806 * to 0. This can be useful to flush out other sources of cached creds 1807 * that may be less innocuous than the driver case. 1808 */ 1809 1810 int zone_wait_for_cred = 0; 1811 1812 static void 1813 zone_hold_locked(zone_t *z) 1814 { 1815 ASSERT(MUTEX_HELD(&z->zone_lock)); 1816 z->zone_ref++; 1817 ASSERT(z->zone_ref != 0); 1818 } 1819 1820 void 1821 zone_hold(zone_t *z) 1822 { 1823 mutex_enter(&z->zone_lock); 1824 zone_hold_locked(z); 1825 mutex_exit(&z->zone_lock); 1826 } 1827 1828 /* 1829 * If the non-cred ref count drops to 1 and either the cred ref count 1830 * is 0 or we aren't waiting for cred references, the zone is ready to 1831 * be destroyed. 1832 */ 1833 #define ZONE_IS_UNREF(zone) ((zone)->zone_ref == 1 && \ 1834 (!zone_wait_for_cred || (zone)->zone_cred_ref == 0)) 1835 1836 void 1837 zone_rele(zone_t *z) 1838 { 1839 boolean_t wakeup; 1840 1841 mutex_enter(&z->zone_lock); 1842 ASSERT(z->zone_ref != 0); 1843 z->zone_ref--; 1844 if (z->zone_ref == 0 && z->zone_cred_ref == 0) { 1845 /* no more refs, free the structure */ 1846 mutex_exit(&z->zone_lock); 1847 zone_free(z); 1848 return; 1849 } 1850 /* signal zone_destroy so the zone can finish halting */ 1851 wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD); 1852 mutex_exit(&z->zone_lock); 1853 1854 if (wakeup) { 1855 /* 1856 * Grabbing zonehash_lock here effectively synchronizes with 1857 * zone_destroy() to avoid missed signals. 
1858 */ 1859 mutex_enter(&zonehash_lock); 1860 cv_broadcast(&zone_destroy_cv); 1861 mutex_exit(&zonehash_lock); 1862 } 1863 } 1864 1865 void 1866 zone_cred_hold(zone_t *z) 1867 { 1868 mutex_enter(&z->zone_lock); 1869 z->zone_cred_ref++; 1870 ASSERT(z->zone_cred_ref != 0); 1871 mutex_exit(&z->zone_lock); 1872 } 1873 1874 void 1875 zone_cred_rele(zone_t *z) 1876 { 1877 boolean_t wakeup; 1878 1879 mutex_enter(&z->zone_lock); 1880 ASSERT(z->zone_cred_ref != 0); 1881 z->zone_cred_ref--; 1882 if (z->zone_ref == 0 && z->zone_cred_ref == 0) { 1883 /* no more refs, free the structure */ 1884 mutex_exit(&z->zone_lock); 1885 zone_free(z); 1886 return; 1887 } 1888 /* 1889 * If zone_destroy is waiting for the cred references to drain 1890 * out, and they have, signal it. 1891 */ 1892 wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) && 1893 zone_status_get(z) >= ZONE_IS_DEAD); 1894 mutex_exit(&z->zone_lock); 1895 1896 if (wakeup) { 1897 /* 1898 * Grabbing zonehash_lock here effectively synchronizes with 1899 * zone_destroy() to avoid missed signals. 1900 */ 1901 mutex_enter(&zonehash_lock); 1902 cv_broadcast(&zone_destroy_cv); 1903 mutex_exit(&zonehash_lock); 1904 } 1905 } 1906 1907 void 1908 zone_task_hold(zone_t *z) 1909 { 1910 mutex_enter(&z->zone_lock); 1911 z->zone_ntasks++; 1912 ASSERT(z->zone_ntasks != 0); 1913 mutex_exit(&z->zone_lock); 1914 } 1915 1916 void 1917 zone_task_rele(zone_t *zone) 1918 { 1919 uint_t refcnt; 1920 1921 mutex_enter(&zone->zone_lock); 1922 ASSERT(zone->zone_ntasks != 0); 1923 refcnt = --zone->zone_ntasks; 1924 if (refcnt > 1) { /* Common case */ 1925 mutex_exit(&zone->zone_lock); 1926 return; 1927 } 1928 zone_hold_locked(zone); /* so we can use the zone_t later */ 1929 mutex_exit(&zone->zone_lock); 1930 if (refcnt == 1) { 1931 /* 1932 * See if the zone is shutting down. 
1933 */ 1934 mutex_enter(&zone_status_lock); 1935 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) { 1936 goto out; 1937 } 1938 1939 /* 1940 * Make sure the ntasks didn't change since we 1941 * dropped zone_lock. 1942 */ 1943 mutex_enter(&zone->zone_lock); 1944 if (refcnt != zone->zone_ntasks) { 1945 mutex_exit(&zone->zone_lock); 1946 goto out; 1947 } 1948 mutex_exit(&zone->zone_lock); 1949 1950 /* 1951 * No more user processes in the zone. The zone is empty. 1952 */ 1953 zone_status_set(zone, ZONE_IS_EMPTY); 1954 goto out; 1955 } 1956 1957 ASSERT(refcnt == 0); 1958 /* 1959 * zsched has exited; the zone is dead. 1960 */ 1961 zone->zone_zsched = NULL; /* paranoia */ 1962 mutex_enter(&zone_status_lock); 1963 zone_status_set(zone, ZONE_IS_DEAD); 1964 out: 1965 mutex_exit(&zone_status_lock); 1966 zone_rele(zone); 1967 } 1968 1969 zoneid_t 1970 getzoneid(void) 1971 { 1972 return (curproc->p_zone->zone_id); 1973 } 1974 1975 /* 1976 * Internal versions of zone_find_by_*(). These don't zone_hold() or 1977 * check the validity of a zone's state. 
1978 */ 1979 static zone_t * 1980 zone_find_all_by_id(zoneid_t zoneid) 1981 { 1982 mod_hash_val_t hv; 1983 zone_t *zone = NULL; 1984 1985 ASSERT(MUTEX_HELD(&zonehash_lock)); 1986 1987 if (mod_hash_find(zonehashbyid, 1988 (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0) 1989 zone = (zone_t *)hv; 1990 return (zone); 1991 } 1992 1993 static zone_t * 1994 zone_find_all_by_label(const ts_label_t *label) 1995 { 1996 mod_hash_val_t hv; 1997 zone_t *zone = NULL; 1998 1999 ASSERT(MUTEX_HELD(&zonehash_lock)); 2000 2001 /* 2002 * zonehashbylabel is not maintained for unlabeled systems 2003 */ 2004 if (!is_system_labeled()) 2005 return (NULL); 2006 if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0) 2007 zone = (zone_t *)hv; 2008 return (zone); 2009 } 2010 2011 static zone_t * 2012 zone_find_all_by_name(char *name) 2013 { 2014 mod_hash_val_t hv; 2015 zone_t *zone = NULL; 2016 2017 ASSERT(MUTEX_HELD(&zonehash_lock)); 2018 2019 if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0) 2020 zone = (zone_t *)hv; 2021 return (zone); 2022 } 2023 2024 /* 2025 * Public interface for looking up a zone by zoneid. Only returns the zone if 2026 * it is fully initialized, and has not yet begun the zone_destroy() sequence. 2027 * Caller must call zone_rele() once it is done with the zone. 2028 * 2029 * The zone may begin the zone_destroy() sequence immediately after this 2030 * function returns, but may be safely used until zone_rele() is called. 2031 */ 2032 zone_t * 2033 zone_find_by_id(zoneid_t zoneid) 2034 { 2035 zone_t *zone; 2036 zone_status_t status; 2037 2038 mutex_enter(&zonehash_lock); 2039 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 2040 mutex_exit(&zonehash_lock); 2041 return (NULL); 2042 } 2043 status = zone_status_get(zone); 2044 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) { 2045 /* 2046 * For all practical purposes the zone doesn't exist. 
2047 */ 2048 mutex_exit(&zonehash_lock); 2049 return (NULL); 2050 } 2051 zone_hold(zone); 2052 mutex_exit(&zonehash_lock); 2053 return (zone); 2054 } 2055 2056 /* 2057 * Similar to zone_find_by_id, but using zone label as the key. 2058 */ 2059 zone_t * 2060 zone_find_by_label(const ts_label_t *label) 2061 { 2062 zone_t *zone; 2063 zone_status_t status; 2064 2065 mutex_enter(&zonehash_lock); 2066 if ((zone = zone_find_all_by_label(label)) == NULL) { 2067 mutex_exit(&zonehash_lock); 2068 return (NULL); 2069 } 2070 2071 status = zone_status_get(zone); 2072 if (status > ZONE_IS_DOWN) { 2073 /* 2074 * For all practical purposes the zone doesn't exist. 2075 */ 2076 mutex_exit(&zonehash_lock); 2077 return (NULL); 2078 } 2079 zone_hold(zone); 2080 mutex_exit(&zonehash_lock); 2081 return (zone); 2082 } 2083 2084 /* 2085 * Similar to zone_find_by_id, but using zone name as the key. 2086 */ 2087 zone_t * 2088 zone_find_by_name(char *name) 2089 { 2090 zone_t *zone; 2091 zone_status_t status; 2092 2093 mutex_enter(&zonehash_lock); 2094 if ((zone = zone_find_all_by_name(name)) == NULL) { 2095 mutex_exit(&zonehash_lock); 2096 return (NULL); 2097 } 2098 status = zone_status_get(zone); 2099 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) { 2100 /* 2101 * For all practical purposes the zone doesn't exist. 2102 */ 2103 mutex_exit(&zonehash_lock); 2104 return (NULL); 2105 } 2106 zone_hold(zone); 2107 mutex_exit(&zonehash_lock); 2108 return (zone); 2109 } 2110 2111 /* 2112 * Similar to zone_find_by_id(), using the path as a key. For instance, 2113 * if there is a zone "foo" rooted at /foo/root, and the path argument 2114 * is "/foo/root/proc", it will return the held zone_t corresponding to 2115 * zone "foo". 2116 * 2117 * zone_find_by_path() always returns a non-NULL value, since at the 2118 * very least every path will be contained in the global zone. 
2119 * 2120 * As with the other zone_find_by_*() functions, the caller is 2121 * responsible for zone_rele()ing the return value of this function. 2122 */ 2123 zone_t * 2124 zone_find_by_path(const char *path) 2125 { 2126 zone_t *zone; 2127 zone_t *zret = NULL; 2128 zone_status_t status; 2129 2130 if (path == NULL) { 2131 /* 2132 * Call from rootconf(). 2133 */ 2134 zone_hold(global_zone); 2135 return (global_zone); 2136 } 2137 ASSERT(*path == '/'); 2138 mutex_enter(&zonehash_lock); 2139 for (zone = list_head(&zone_active); zone != NULL; 2140 zone = list_next(&zone_active, zone)) { 2141 if (ZONE_PATH_VISIBLE(path, zone)) 2142 zret = zone; 2143 } 2144 ASSERT(zret != NULL); 2145 status = zone_status_get(zret); 2146 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) { 2147 /* 2148 * Zone practically doesn't exist. 2149 */ 2150 zret = global_zone; 2151 } 2152 zone_hold(zret); 2153 mutex_exit(&zonehash_lock); 2154 return (zret); 2155 } 2156 2157 /* 2158 * Get the number of cpus visible to this zone. The system-wide global 2159 * 'ncpus' is returned if pools are disabled, the caller is in the 2160 * global zone, or a NULL zone argument is passed in. 2161 */ 2162 int 2163 zone_ncpus_get(zone_t *zone) 2164 { 2165 int myncpus = zone == NULL ? 0 : zone->zone_ncpus; 2166 2167 return (myncpus != 0 ? myncpus : ncpus); 2168 } 2169 2170 /* 2171 * Get the number of online cpus visible to this zone. The system-wide 2172 * global 'ncpus_online' is returned if pools are disabled, the caller 2173 * is in the global zone, or a NULL zone argument is passed in. 2174 */ 2175 int 2176 zone_ncpus_online_get(zone_t *zone) 2177 { 2178 int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online; 2179 2180 return (myncpus_online != 0 ? myncpus_online : ncpus_online); 2181 } 2182 2183 /* 2184 * Return the pool to which the zone is currently bound. 
2185 */ 2186 pool_t * 2187 zone_pool_get(zone_t *zone) 2188 { 2189 ASSERT(pool_lock_held()); 2190 2191 return (zone->zone_pool); 2192 } 2193 2194 /* 2195 * Set the zone's pool pointer and update the zone's visibility to match 2196 * the resources in the new pool. 2197 */ 2198 void 2199 zone_pool_set(zone_t *zone, pool_t *pool) 2200 { 2201 ASSERT(pool_lock_held()); 2202 ASSERT(MUTEX_HELD(&cpu_lock)); 2203 2204 zone->zone_pool = pool; 2205 zone_pset_set(zone, pool->pool_pset->pset_id); 2206 } 2207 2208 /* 2209 * Return the cached value of the id of the processor set to which the 2210 * zone is currently bound. The value will be ZONE_PS_INVAL if the pools 2211 * facility is disabled. 2212 */ 2213 psetid_t 2214 zone_pset_get(zone_t *zone) 2215 { 2216 ASSERT(MUTEX_HELD(&cpu_lock)); 2217 2218 return (zone->zone_psetid); 2219 } 2220 2221 /* 2222 * Set the cached value of the id of the processor set to which the zone 2223 * is currently bound. Also update the zone's visibility to match the 2224 * resources in the new processor set. 2225 */ 2226 void 2227 zone_pset_set(zone_t *zone, psetid_t newpsetid) 2228 { 2229 psetid_t oldpsetid; 2230 2231 ASSERT(MUTEX_HELD(&cpu_lock)); 2232 oldpsetid = zone_pset_get(zone); 2233 2234 if (oldpsetid == newpsetid) 2235 return; 2236 /* 2237 * Global zone sees all. 2238 */ 2239 if (zone != global_zone) { 2240 zone->zone_psetid = newpsetid; 2241 if (newpsetid != ZONE_PS_INVAL) 2242 pool_pset_visibility_add(newpsetid, zone); 2243 if (oldpsetid != ZONE_PS_INVAL) 2244 pool_pset_visibility_remove(oldpsetid, zone); 2245 } 2246 /* 2247 * Disabling pools, so we should start using the global values 2248 * for ncpus and ncpus_online. 2249 */ 2250 if (newpsetid == ZONE_PS_INVAL) { 2251 zone->zone_ncpus = 0; 2252 zone->zone_ncpus_online = 0; 2253 } 2254 } 2255 2256 /* 2257 * Walk the list of active zones and issue the provided callback for 2258 * each of them. 
2259 * 2260 * Caller must not be holding any locks that may be acquired under 2261 * zonehash_lock. See comment at the beginning of the file for a list of 2262 * common locks and their interactions with zones. 2263 */ 2264 int 2265 zone_walk(int (*cb)(zone_t *, void *), void *data) 2266 { 2267 zone_t *zone; 2268 int ret = 0; 2269 zone_status_t status; 2270 2271 mutex_enter(&zonehash_lock); 2272 for (zone = list_head(&zone_active); zone != NULL; 2273 zone = list_next(&zone_active, zone)) { 2274 /* 2275 * Skip zones that shouldn't be externally visible. 2276 */ 2277 status = zone_status_get(zone); 2278 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) 2279 continue; 2280 /* 2281 * Bail immediately if any callback invocation returns a 2282 * non-zero value. 2283 */ 2284 ret = (*cb)(zone, data); 2285 if (ret != 0) 2286 break; 2287 } 2288 mutex_exit(&zonehash_lock); 2289 return (ret); 2290 } 2291 2292 static int 2293 zone_set_root(zone_t *zone, const char *upath) 2294 { 2295 vnode_t *vp; 2296 int trycount; 2297 int error = 0; 2298 char *path; 2299 struct pathname upn, pn; 2300 size_t pathlen; 2301 2302 if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0) 2303 return (error); 2304 2305 pn_alloc(&pn); 2306 2307 /* prevent infinite loop */ 2308 trycount = 10; 2309 for (;;) { 2310 if (--trycount <= 0) { 2311 error = ESTALE; 2312 goto out; 2313 } 2314 2315 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) { 2316 /* 2317 * VOP_ACCESS() may cover 'vp' with a new 2318 * filesystem, if 'vp' is an autoFS vnode. 2319 * Get the new 'vp' if so. 2320 */ 2321 if ((error = VOP_ACCESS(vp, VEXEC, 0, CRED())) == 0 && 2322 (vp->v_vfsmountedhere == NULL || 2323 (error = traverse(&vp)) == 0)) { 2324 pathlen = pn.pn_pathlen + 2; 2325 path = kmem_alloc(pathlen, KM_SLEEP); 2326 (void) strncpy(path, pn.pn_path, 2327 pn.pn_pathlen + 1); 2328 path[pathlen - 2] = '/'; 2329 path[pathlen - 1] = '\0'; 2330 pn_free(&pn); 2331 pn_free(&upn); 2332 2333 /* Success! 
*/ 2334 break; 2335 } 2336 VN_RELE(vp); 2337 } 2338 if (error != ESTALE) 2339 goto out; 2340 } 2341 2342 ASSERT(error == 0); 2343 zone->zone_rootvp = vp; /* we hold a reference to vp */ 2344 zone->zone_rootpath = path; 2345 zone->zone_rootpathlen = pathlen; 2346 if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0) 2347 zone->zone_flags |= ZF_IS_SCRATCH; 2348 return (0); 2349 2350 out: 2351 pn_free(&pn); 2352 pn_free(&upn); 2353 return (error); 2354 } 2355 2356 #define isalnum(c) (((c) >= '0' && (c) <= '9') || \ 2357 ((c) >= 'a' && (c) <= 'z') || \ 2358 ((c) >= 'A' && (c) <= 'Z')) 2359 2360 static int 2361 zone_set_name(zone_t *zone, const char *uname) 2362 { 2363 char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP); 2364 size_t len; 2365 int i, err; 2366 2367 if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) { 2368 kmem_free(kname, ZONENAME_MAX); 2369 return (err); /* EFAULT or ENAMETOOLONG */ 2370 } 2371 2372 /* must be less than ZONENAME_MAX */ 2373 if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') { 2374 kmem_free(kname, ZONENAME_MAX); 2375 return (EINVAL); 2376 } 2377 2378 /* 2379 * Name must start with an alphanumeric and must contain only 2380 * alphanumerics, '-', '_' and '.'. 2381 */ 2382 if (!isalnum(kname[0])) { 2383 kmem_free(kname, ZONENAME_MAX); 2384 return (EINVAL); 2385 } 2386 for (i = 1; i < len - 1; i++) { 2387 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' && 2388 kname[i] != '.') { 2389 kmem_free(kname, ZONENAME_MAX); 2390 return (EINVAL); 2391 } 2392 } 2393 2394 zone->zone_name = kname; 2395 return (0); 2396 } 2397 2398 /* 2399 * Similar to thread_create(), but makes sure the thread is in the appropriate 2400 * zone's zsched process (curproc->p_zone->zone_zsched) before returning. 
2401 */ 2402 /*ARGSUSED*/ 2403 kthread_t * 2404 zthread_create( 2405 caddr_t stk, 2406 size_t stksize, 2407 void (*proc)(), 2408 void *arg, 2409 size_t len, 2410 pri_t pri) 2411 { 2412 kthread_t *t; 2413 zone_t *zone = curproc->p_zone; 2414 proc_t *pp = zone->zone_zsched; 2415 2416 zone_hold(zone); /* Reference to be dropped when thread exits */ 2417 2418 /* 2419 * No-one should be trying to create threads if the zone is shutting 2420 * down and there aren't any kernel threads around. See comment 2421 * in zthread_exit(). 2422 */ 2423 ASSERT(!(zone->zone_kthreads == NULL && 2424 zone_status_get(zone) >= ZONE_IS_EMPTY)); 2425 /* 2426 * Create a thread, but don't let it run until we've finished setting 2427 * things up. 2428 */ 2429 t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri); 2430 ASSERT(t->t_forw == NULL); 2431 mutex_enter(&zone_status_lock); 2432 if (zone->zone_kthreads == NULL) { 2433 t->t_forw = t->t_back = t; 2434 } else { 2435 kthread_t *tx = zone->zone_kthreads; 2436 2437 t->t_forw = tx; 2438 t->t_back = tx->t_back; 2439 tx->t_back->t_forw = t; 2440 tx->t_back = t; 2441 } 2442 zone->zone_kthreads = t; 2443 mutex_exit(&zone_status_lock); 2444 2445 mutex_enter(&pp->p_lock); 2446 t->t_proc_flag |= TP_ZTHREAD; 2447 project_rele(t->t_proj); 2448 t->t_proj = project_hold(pp->p_task->tk_proj); 2449 2450 /* 2451 * Setup complete, let it run. 2452 */ 2453 thread_lock(t); 2454 t->t_schedflag |= TS_ALLSTART; 2455 setrun_locked(t); 2456 thread_unlock(t); 2457 2458 mutex_exit(&pp->p_lock); 2459 2460 return (t); 2461 } 2462 2463 /* 2464 * Similar to thread_exit(). Must be called by threads created via 2465 * zthread_exit(). 
2466 */ 2467 void 2468 zthread_exit(void) 2469 { 2470 kthread_t *t = curthread; 2471 proc_t *pp = curproc; 2472 zone_t *zone = pp->p_zone; 2473 2474 mutex_enter(&zone_status_lock); 2475 2476 /* 2477 * Reparent to p0 2478 */ 2479 kpreempt_disable(); 2480 mutex_enter(&pp->p_lock); 2481 t->t_proc_flag &= ~TP_ZTHREAD; 2482 t->t_procp = &p0; 2483 hat_thread_exit(t); 2484 mutex_exit(&pp->p_lock); 2485 kpreempt_enable(); 2486 2487 if (t->t_back == t) { 2488 ASSERT(t->t_forw == t); 2489 /* 2490 * If the zone is empty, once the thread count 2491 * goes to zero no further kernel threads can be 2492 * created. This is because if the creator is a process 2493 * in the zone, then it must have exited before the zone 2494 * state could be set to ZONE_IS_EMPTY. 2495 * Otherwise, if the creator is a kernel thread in the 2496 * zone, the thread count is non-zero. 2497 * 2498 * This really means that non-zone kernel threads should 2499 * not create zone kernel threads. 2500 */ 2501 zone->zone_kthreads = NULL; 2502 if (zone_status_get(zone) == ZONE_IS_EMPTY) { 2503 zone_status_set(zone, ZONE_IS_DOWN); 2504 } 2505 } else { 2506 t->t_forw->t_back = t->t_back; 2507 t->t_back->t_forw = t->t_forw; 2508 if (zone->zone_kthreads == t) 2509 zone->zone_kthreads = t->t_forw; 2510 } 2511 mutex_exit(&zone_status_lock); 2512 zone_rele(zone); 2513 thread_exit(); 2514 /* NOTREACHED */ 2515 } 2516 2517 static void 2518 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp) 2519 { 2520 vnode_t *oldvp; 2521 2522 /* we're going to hold a reference here to the directory */ 2523 VN_HOLD(vp); 2524 2525 #ifdef C2_AUDIT 2526 if (audit_active) /* update abs cwd/root path see c2audit.c */ 2527 audit_chdirec(vp, vpp); 2528 #endif 2529 2530 mutex_enter(&pp->p_lock); 2531 oldvp = *vpp; 2532 *vpp = vp; 2533 mutex_exit(&pp->p_lock); 2534 if (oldvp != NULL) 2535 VN_RELE(oldvp); 2536 } 2537 2538 /* 2539 * Convert an rctl value represented by an nvlist_t into an rctl_val_t. 
2540 */ 2541 static int 2542 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv) 2543 { 2544 nvpair_t *nvp = NULL; 2545 boolean_t priv_set = B_FALSE; 2546 boolean_t limit_set = B_FALSE; 2547 boolean_t action_set = B_FALSE; 2548 2549 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 2550 const char *name; 2551 uint64_t ui64; 2552 2553 name = nvpair_name(nvp); 2554 if (nvpair_type(nvp) != DATA_TYPE_UINT64) 2555 return (EINVAL); 2556 (void) nvpair_value_uint64(nvp, &ui64); 2557 if (strcmp(name, "privilege") == 0) { 2558 /* 2559 * Currently only privileged values are allowed, but 2560 * this may change in the future. 2561 */ 2562 if (ui64 != RCPRIV_PRIVILEGED) 2563 return (EINVAL); 2564 rv->rcv_privilege = ui64; 2565 priv_set = B_TRUE; 2566 } else if (strcmp(name, "limit") == 0) { 2567 rv->rcv_value = ui64; 2568 limit_set = B_TRUE; 2569 } else if (strcmp(name, "action") == 0) { 2570 if (ui64 != RCTL_LOCAL_NOACTION && 2571 ui64 != RCTL_LOCAL_DENY) 2572 return (EINVAL); 2573 rv->rcv_flagaction = ui64; 2574 action_set = B_TRUE; 2575 } else { 2576 return (EINVAL); 2577 } 2578 } 2579 2580 if (!(priv_set && limit_set && action_set)) 2581 return (EINVAL); 2582 rv->rcv_action_signal = 0; 2583 rv->rcv_action_recipient = NULL; 2584 rv->rcv_action_recip_pid = -1; 2585 rv->rcv_firing_time = 0; 2586 2587 return (0); 2588 } 2589 2590 /* 2591 * Non-global zone version of start_init. 2592 */ 2593 void 2594 zone_start_init(void) 2595 { 2596 proc_t *p = ttoproc(curthread); 2597 zone_t *z = p->p_zone; 2598 2599 ASSERT(!INGLOBALZONE(curproc)); 2600 2601 /* 2602 * For all purposes (ZONE_ATTR_INITPID and restart_init), 2603 * storing just the pid of init is sufficient. 2604 */ 2605 z->zone_proc_initpid = p->p_pid; 2606 2607 /* 2608 * We maintain zone_boot_err so that we can return the cause of the 2609 * failure back to the caller of the zone_boot syscall. 
2610 */ 2611 p->p_zone->zone_boot_err = start_init_common(); 2612 2613 mutex_enter(&zone_status_lock); 2614 if (z->zone_boot_err != 0) { 2615 /* 2616 * Make sure we are still in the booting state-- we could have 2617 * raced and already be shutting down, or even further along. 2618 */ 2619 if (zone_status_get(z) == ZONE_IS_BOOTING) 2620 zone_status_set(z, ZONE_IS_SHUTTING_DOWN); 2621 mutex_exit(&zone_status_lock); 2622 /* It's gone bad, dispose of the process */ 2623 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) { 2624 mutex_enter(&p->p_lock); 2625 ASSERT(p->p_flag & SEXITLWPS); 2626 lwp_exit(); 2627 } 2628 } else { 2629 if (zone_status_get(z) == ZONE_IS_BOOTING) 2630 zone_status_set(z, ZONE_IS_RUNNING); 2631 mutex_exit(&zone_status_lock); 2632 /* cause the process to return to userland. */ 2633 lwp_rtt(); 2634 } 2635 } 2636 2637 struct zsched_arg { 2638 zone_t *zone; 2639 nvlist_t *nvlist; 2640 }; 2641 2642 /* 2643 * Per-zone "sched" workalike. The similarity to "sched" doesn't have 2644 * anything to do with scheduling, but rather with the fact that 2645 * per-zone kernel threads are parented to zsched, just like regular 2646 * kernel threads are parented to sched (p0). 2647 * 2648 * zsched is also responsible for launching init for the zone. 2649 */ 2650 static void 2651 zsched(void *arg) 2652 { 2653 struct zsched_arg *za = arg; 2654 proc_t *pp = curproc; 2655 proc_t *initp = proc_init; 2656 zone_t *zone = za->zone; 2657 cred_t *cr, *oldcred; 2658 rctl_set_t *set; 2659 rctl_alloc_gp_t *gp; 2660 contract_t *ct = NULL; 2661 task_t *tk, *oldtk; 2662 rctl_entity_p_t e; 2663 kproject_t *pj; 2664 2665 nvlist_t *nvl = za->nvlist; 2666 nvpair_t *nvp = NULL; 2667 2668 bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched")); 2669 bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched")); 2670 PTOU(pp)->u_argc = 0; 2671 PTOU(pp)->u_argv = NULL; 2672 PTOU(pp)->u_envp = NULL; 2673 closeall(P_FINFO(pp)); 2674 2675 /* 2676 * We are this zone's "zsched" process. 
As the zone isn't generally 2677 * visible yet we don't need to grab any locks before initializing its 2678 * zone_proc pointer. 2679 */ 2680 zone_hold(zone); /* this hold is released by zone_destroy() */ 2681 zone->zone_zsched = pp; 2682 mutex_enter(&pp->p_lock); 2683 pp->p_zone = zone; 2684 mutex_exit(&pp->p_lock); 2685 2686 /* 2687 * Disassociate process from its 'parent'; parent ourselves to init 2688 * (pid 1) and change other values as needed. 2689 */ 2690 sess_create(); 2691 2692 mutex_enter(&pidlock); 2693 proc_detach(pp); 2694 pp->p_ppid = 1; 2695 pp->p_flag |= SZONETOP; 2696 pp->p_ancpid = 1; 2697 pp->p_parent = initp; 2698 pp->p_psibling = NULL; 2699 if (initp->p_child) 2700 initp->p_child->p_psibling = pp; 2701 pp->p_sibling = initp->p_child; 2702 initp->p_child = pp; 2703 2704 /* Decrement what newproc() incremented. */ 2705 upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID); 2706 /* 2707 * Our credentials are about to become kcred-like, so we don't care 2708 * about the caller's ruid. 2709 */ 2710 upcount_inc(crgetruid(kcred), zone->zone_id); 2711 mutex_exit(&pidlock); 2712 2713 /* 2714 * getting out of global zone, so decrement lwp counts 2715 */ 2716 pj = pp->p_task->tk_proj; 2717 mutex_enter(&global_zone->zone_nlwps_lock); 2718 pj->kpj_nlwps -= pp->p_lwpcnt; 2719 global_zone->zone_nlwps -= pp->p_lwpcnt; 2720 mutex_exit(&global_zone->zone_nlwps_lock); 2721 2722 /* 2723 * Decrement locked memory counts on old zone and project. 2724 */ 2725 mutex_enter(&global_zone->zone_mem_lock); 2726 global_zone->zone_locked_mem -= pp->p_locked_mem; 2727 pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem; 2728 mutex_exit(&global_zone->zone_mem_lock); 2729 2730 /* 2731 * Create and join a new task in project '0' of this zone. 2732 * 2733 * We don't need to call holdlwps() since we know we're the only lwp in 2734 * this process. 2735 * 2736 * task_join() returns with p_lock held. 
2737 */ 2738 tk = task_create(0, zone); 2739 mutex_enter(&cpu_lock); 2740 oldtk = task_join(tk, 0); 2741 2742 pj = pp->p_task->tk_proj; 2743 2744 mutex_enter(&zone->zone_mem_lock); 2745 zone->zone_locked_mem += pp->p_locked_mem; 2746 pj->kpj_data.kpd_locked_mem += pp->p_locked_mem; 2747 mutex_exit(&zone->zone_mem_lock); 2748 2749 /* 2750 * add lwp counts to zsched's zone, and increment project's task count 2751 * due to the task created in the above tasksys_settaskid 2752 */ 2753 2754 mutex_enter(&zone->zone_nlwps_lock); 2755 pj->kpj_nlwps += pp->p_lwpcnt; 2756 pj->kpj_ntasks += 1; 2757 zone->zone_nlwps += pp->p_lwpcnt; 2758 mutex_exit(&zone->zone_nlwps_lock); 2759 2760 mutex_exit(&curproc->p_lock); 2761 mutex_exit(&cpu_lock); 2762 task_rele(oldtk); 2763 2764 /* 2765 * The process was created by a process in the global zone, hence the 2766 * credentials are wrong. We might as well have kcred-ish credentials. 2767 */ 2768 cr = zone->zone_kcred; 2769 crhold(cr); 2770 mutex_enter(&pp->p_crlock); 2771 oldcred = pp->p_cred; 2772 pp->p_cred = cr; 2773 mutex_exit(&pp->p_crlock); 2774 crfree(oldcred); 2775 2776 /* 2777 * Hold credentials again (for thread) 2778 */ 2779 crhold(cr); 2780 2781 /* 2782 * p_lwpcnt can't change since this is a kernel process. 2783 */ 2784 crset(pp, cr); 2785 2786 /* 2787 * Chroot 2788 */ 2789 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp); 2790 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp); 2791 2792 /* 2793 * Initialize zone's rctl set. 2794 */ 2795 set = rctl_set_create(); 2796 gp = rctl_set_init_prealloc(RCENTITY_ZONE); 2797 mutex_enter(&pp->p_lock); 2798 e.rcep_p.zone = zone; 2799 e.rcep_t = RCENTITY_ZONE; 2800 zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp); 2801 mutex_exit(&pp->p_lock); 2802 rctl_prealloc_destroy(gp); 2803 2804 /* 2805 * Apply the rctls passed in to zone_create(). This is basically a list 2806 * assignment: all of the old values are removed and the new ones 2807 * inserted. 
That is, if an empty list is passed in, all values are 2808 * removed. 2809 */ 2810 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 2811 rctl_dict_entry_t *rde; 2812 rctl_hndl_t hndl; 2813 char *name; 2814 nvlist_t **nvlarray; 2815 uint_t i, nelem; 2816 int error; /* For ASSERT()s */ 2817 2818 name = nvpair_name(nvp); 2819 hndl = rctl_hndl_lookup(name); 2820 ASSERT(hndl != -1); 2821 rde = rctl_dict_lookup_hndl(hndl); 2822 ASSERT(rde != NULL); 2823 2824 for (; /* ever */; ) { 2825 rctl_val_t oval; 2826 2827 mutex_enter(&pp->p_lock); 2828 error = rctl_local_get(hndl, NULL, &oval, pp); 2829 mutex_exit(&pp->p_lock); 2830 ASSERT(error == 0); /* Can't fail for RCTL_FIRST */ 2831 ASSERT(oval.rcv_privilege != RCPRIV_BASIC); 2832 if (oval.rcv_privilege == RCPRIV_SYSTEM) 2833 break; 2834 mutex_enter(&pp->p_lock); 2835 error = rctl_local_delete(hndl, &oval, pp); 2836 mutex_exit(&pp->p_lock); 2837 ASSERT(error == 0); 2838 } 2839 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 2840 ASSERT(error == 0); 2841 for (i = 0; i < nelem; i++) { 2842 rctl_val_t *nvalp; 2843 2844 nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); 2845 error = nvlist2rctlval(nvlarray[i], nvalp); 2846 ASSERT(error == 0); 2847 /* 2848 * rctl_local_insert can fail if the value being 2849 * inserted is a duplicate; this is OK. 2850 */ 2851 mutex_enter(&pp->p_lock); 2852 if (rctl_local_insert(hndl, nvalp, pp) != 0) 2853 kmem_cache_free(rctl_val_cache, nvalp); 2854 mutex_exit(&pp->p_lock); 2855 } 2856 } 2857 /* 2858 * Tell the world that we're done setting up. 2859 * 2860 * At this point we want to set the zone status to ZONE_IS_READY 2861 * and atomically set the zone's processor set visibility. Once 2862 * we drop pool_lock() this zone will automatically get updated 2863 * to reflect any future changes to the pools configuration. 
2864 */ 2865 pool_lock(); 2866 mutex_enter(&cpu_lock); 2867 mutex_enter(&zonehash_lock); 2868 zone_uniqid(zone); 2869 zone_zsd_configure(zone); 2870 if (pool_state == POOL_ENABLED) 2871 zone_pset_set(zone, pool_default->pool_pset->pset_id); 2872 mutex_enter(&zone_status_lock); 2873 ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED); 2874 zone_status_set(zone, ZONE_IS_READY); 2875 mutex_exit(&zone_status_lock); 2876 mutex_exit(&zonehash_lock); 2877 mutex_exit(&cpu_lock); 2878 pool_unlock(); 2879 2880 /* 2881 * Once we see the zone transition to the ZONE_IS_BOOTING state, 2882 * we launch init, and set the state to running. 2883 */ 2884 zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched"); 2885 2886 if (zone_status_get(zone) == ZONE_IS_BOOTING) { 2887 id_t cid; 2888 2889 /* 2890 * Ok, this is a little complicated. We need to grab the 2891 * zone's pool's scheduling class ID; note that by now, we 2892 * are already bound to a pool if we need to be (zoneadmd 2893 * will have done that to us while we're in the READY 2894 * state). *But* the scheduling class for the zone's 'init' 2895 * must be explicitly passed to newproc, which doesn't 2896 * respect pool bindings. 2897 * 2898 * We hold the pool_lock across the call to newproc() to 2899 * close the obvious race: the pool's scheduling class 2900 * could change before we manage to create the LWP with 2901 * classid 'cid'. 2902 */ 2903 pool_lock(); 2904 if (zone->zone_defaultcid > 0) 2905 cid = zone->zone_defaultcid; 2906 else 2907 cid = pool_get_class(zone->zone_pool); 2908 if (cid == -1) 2909 cid = defaultcid; 2910 2911 /* 2912 * If this fails, zone_boot will ultimately fail. The 2913 * state of the zone will be set to SHUTTING_DOWN-- userland 2914 * will have to tear down the zone, and fail, or try again. 
2915 */ 2916 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid, 2917 minclsyspri - 1, &ct)) != 0) { 2918 mutex_enter(&zone_status_lock); 2919 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); 2920 mutex_exit(&zone_status_lock); 2921 } 2922 pool_unlock(); 2923 } 2924 2925 /* 2926 * Wait for zone_destroy() to be called. This is what we spend 2927 * most of our life doing. 2928 */ 2929 zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched"); 2930 2931 if (ct) 2932 /* 2933 * At this point the process contract should be empty. 2934 * (Though if it isn't, it's not the end of the world.) 2935 */ 2936 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0); 2937 2938 /* 2939 * Allow kcred to be freed when all referring processes 2940 * (including this one) go away. We can't just do this in 2941 * zone_free because we need to wait for the zone_cred_ref to 2942 * drop to 0 before calling zone_free, and the existence of 2943 * zone_kcred will prevent that. Thus, we call crfree here to 2944 * balance the crdup in zone_create. The crhold calls earlier 2945 * in zsched will be dropped when the thread and process exit. 2946 */ 2947 crfree(zone->zone_kcred); 2948 zone->zone_kcred = NULL; 2949 2950 exit(CLD_EXITED, 0); 2951 } 2952 2953 /* 2954 * Helper function to determine if there are any submounts of the 2955 * provided path. Used to make sure the zone doesn't "inherit" any 2956 * mounts from before it is created. 2957 */ 2958 static uint_t 2959 zone_mount_count(const char *rootpath) 2960 { 2961 vfs_t *vfsp; 2962 uint_t count = 0; 2963 size_t rootpathlen = strlen(rootpath); 2964 2965 /* 2966 * Holding zonehash_lock prevents race conditions with 2967 * vfs_list_add()/vfs_list_remove() since we serialize with 2968 * zone_find_by_path(). 
2969 */ 2970 ASSERT(MUTEX_HELD(&zonehash_lock)); 2971 /* 2972 * The rootpath must end with a '/' 2973 */ 2974 ASSERT(rootpath[rootpathlen - 1] == '/'); 2975 2976 /* 2977 * This intentionally does not count the rootpath itself if that 2978 * happens to be a mount point. 2979 */ 2980 vfs_list_read_lock(); 2981 vfsp = rootvfs; 2982 do { 2983 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt), 2984 rootpathlen) == 0) 2985 count++; 2986 vfsp = vfsp->vfs_next; 2987 } while (vfsp != rootvfs); 2988 vfs_list_unlock(); 2989 return (count); 2990 } 2991 2992 /* 2993 * Helper function to make sure that a zone created on 'rootpath' 2994 * wouldn't end up containing other zones' rootpaths. 2995 */ 2996 static boolean_t 2997 zone_is_nested(const char *rootpath) 2998 { 2999 zone_t *zone; 3000 size_t rootpathlen = strlen(rootpath); 3001 size_t len; 3002 3003 ASSERT(MUTEX_HELD(&zonehash_lock)); 3004 3005 for (zone = list_head(&zone_active); zone != NULL; 3006 zone = list_next(&zone_active, zone)) { 3007 if (zone == global_zone) 3008 continue; 3009 len = strlen(zone->zone_rootpath); 3010 if (strncmp(rootpath, zone->zone_rootpath, 3011 MIN(rootpathlen, len)) == 0) 3012 return (B_TRUE); 3013 } 3014 return (B_FALSE); 3015 } 3016 3017 static int 3018 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs, 3019 size_t zone_privssz) 3020 { 3021 priv_set_t *privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP); 3022 3023 if (zone_privssz < sizeof (priv_set_t)) 3024 return (set_errno(ENOMEM)); 3025 3026 if (copyin(zone_privs, privs, sizeof (priv_set_t))) { 3027 kmem_free(privs, sizeof (priv_set_t)); 3028 return (EFAULT); 3029 } 3030 3031 zone->zone_privset = privs; 3032 return (0); 3033 } 3034 3035 /* 3036 * We make creative use of nvlists to pass in rctls from userland. 
The list is 3037 * a list of the following structures: 3038 * 3039 * (name = rctl_name, value = nvpair_list_array) 3040 * 3041 * Where each element of the nvpair_list_array is of the form: 3042 * 3043 * [(name = "privilege", value = RCPRIV_PRIVILEGED), 3044 * (name = "limit", value = uint64_t), 3045 * (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))] 3046 */ 3047 static int 3048 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) 3049 { 3050 nvpair_t *nvp = NULL; 3051 nvlist_t *nvl = NULL; 3052 char *kbuf; 3053 int error; 3054 rctl_val_t rv; 3055 3056 *nvlp = NULL; 3057 3058 if (buflen == 0) 3059 return (0); 3060 3061 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 3062 return (ENOMEM); 3063 if (copyin(ubuf, kbuf, buflen)) { 3064 error = EFAULT; 3065 goto out; 3066 } 3067 if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) { 3068 /* 3069 * nvl may have been allocated/free'd, but the value set to 3070 * non-NULL, so we reset it here. 3071 */ 3072 nvl = NULL; 3073 error = EINVAL; 3074 goto out; 3075 } 3076 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 3077 rctl_dict_entry_t *rde; 3078 rctl_hndl_t hndl; 3079 nvlist_t **nvlarray; 3080 uint_t i, nelem; 3081 char *name; 3082 3083 error = EINVAL; 3084 name = nvpair_name(nvp); 3085 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) 3086 != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { 3087 goto out; 3088 } 3089 if ((hndl = rctl_hndl_lookup(name)) == -1) { 3090 goto out; 3091 } 3092 rde = rctl_dict_lookup_hndl(hndl); 3093 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 3094 ASSERT(error == 0); 3095 for (i = 0; i < nelem; i++) { 3096 if (error = nvlist2rctlval(nvlarray[i], &rv)) 3097 goto out; 3098 } 3099 if (rctl_invalid_value(rde, &rv)) { 3100 error = EINVAL; 3101 goto out; 3102 } 3103 } 3104 error = 0; 3105 *nvlp = nvl; 3106 out: 3107 kmem_free(kbuf, buflen); 3108 if (error && nvl != NULL) 3109 nvlist_free(nvl); 3110 return (error); 3111 } 3112 3113 int 
3114 zone_create_error(int er_error, int er_ext, int *er_out) { 3115 if (er_out != NULL) { 3116 if (copyout(&er_ext, er_out, sizeof (int))) { 3117 return (set_errno(EFAULT)); 3118 } 3119 } 3120 return (set_errno(er_error)); 3121 } 3122 3123 static int 3124 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi) 3125 { 3126 ts_label_t *tsl; 3127 bslabel_t blab; 3128 3129 /* Get label from user */ 3130 if (copyin(lab, &blab, sizeof (blab)) != 0) 3131 return (EFAULT); 3132 tsl = labelalloc(&blab, doi, KM_NOSLEEP); 3133 if (tsl == NULL) 3134 return (ENOMEM); 3135 3136 zone->zone_slabel = tsl; 3137 return (0); 3138 } 3139 3140 /* 3141 * Parses a comma-separated list of ZFS datasets into a per-zone dictionary. 3142 */ 3143 static int 3144 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen) 3145 { 3146 char *kbuf; 3147 char *dataset, *next; 3148 zone_dataset_t *zd; 3149 size_t len; 3150 3151 if (ubuf == NULL || buflen == 0) 3152 return (0); 3153 3154 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 3155 return (ENOMEM); 3156 3157 if (copyin(ubuf, kbuf, buflen) != 0) { 3158 kmem_free(kbuf, buflen); 3159 return (EFAULT); 3160 } 3161 3162 dataset = next = kbuf; 3163 for (;;) { 3164 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP); 3165 3166 next = strchr(dataset, ','); 3167 3168 if (next == NULL) 3169 len = strlen(dataset); 3170 else 3171 len = next - dataset; 3172 3173 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP); 3174 bcopy(dataset, zd->zd_dataset, len); 3175 zd->zd_dataset[len] = '\0'; 3176 3177 list_insert_head(&zone->zone_datasets, zd); 3178 3179 if (next == NULL) 3180 break; 3181 3182 dataset = next + 1; 3183 } 3184 3185 kmem_free(kbuf, buflen); 3186 return (0); 3187 } 3188 3189 /* 3190 * System call to create/initialize a new zone named 'zone_name', rooted 3191 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs', 3192 * and initialized with the zone-wide rctls described in 'rctlbuf', and 3193 * with labeling set by 'match', 'doi', 
and 'label'. 3194 * 3195 * If extended error is non-null, we may use it to return more detailed 3196 * error information. 3197 */ 3198 static zoneid_t 3199 zone_create(const char *zone_name, const char *zone_root, 3200 const priv_set_t *zone_privs, size_t zone_privssz, 3201 caddr_t rctlbuf, size_t rctlbufsz, 3202 caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, 3203 int match, uint32_t doi, const bslabel_t *label, 3204 int flags) 3205 { 3206 struct zsched_arg zarg; 3207 nvlist_t *rctls = NULL; 3208 proc_t *pp = curproc; 3209 zone_t *zone, *ztmp; 3210 zoneid_t zoneid; 3211 int error; 3212 int error2 = 0; 3213 char *str; 3214 cred_t *zkcr; 3215 boolean_t insert_label_hash; 3216 3217 if (secpolicy_zone_config(CRED()) != 0) 3218 return (set_errno(EPERM)); 3219 3220 /* can't boot zone from within chroot environment */ 3221 if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir) 3222 return (zone_create_error(ENOTSUP, ZE_CHROOTED, 3223 extended_error)); 3224 3225 zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); 3226 zoneid = zone->zone_id = id_alloc(zoneid_space); 3227 zone->zone_status = ZONE_IS_UNINITIALIZED; 3228 zone->zone_pool = pool_default; 3229 zone->zone_pool_mod = gethrtime(); 3230 zone->zone_psetid = ZONE_PS_INVAL; 3231 zone->zone_ncpus = 0; 3232 zone->zone_ncpus_online = 0; 3233 zone->zone_restart_init = B_TRUE; 3234 zone->zone_brand = &native_brand; 3235 zone->zone_initname = NULL; 3236 mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); 3237 mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); 3238 mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); 3239 cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); 3240 list_create(&zone->zone_zsd, sizeof (struct zsd_entry), 3241 offsetof(struct zsd_entry, zsd_linkage)); 3242 list_create(&zone->zone_datasets, sizeof (zone_dataset_t), 3243 offsetof(zone_dataset_t, zd_linkage)); 3244 rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL); 3245 3246 if (flags & ZCF_NET_EXCL) { 3247 
zone->zone_flags |= ZF_NET_EXCL; 3248 } 3249 3250 if ((error = zone_set_name(zone, zone_name)) != 0) { 3251 zone_free(zone); 3252 return (zone_create_error(error, 0, extended_error)); 3253 } 3254 3255 if ((error = zone_set_root(zone, zone_root)) != 0) { 3256 zone_free(zone); 3257 return (zone_create_error(error, 0, extended_error)); 3258 } 3259 if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) { 3260 zone_free(zone); 3261 return (zone_create_error(error, 0, extended_error)); 3262 } 3263 3264 /* initialize node name to be the same as zone name */ 3265 zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP); 3266 (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN); 3267 zone->zone_nodename[_SYS_NMLN - 1] = '\0'; 3268 3269 zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP); 3270 zone->zone_domain[0] = '\0'; 3271 zone->zone_shares = 1; 3272 zone->zone_shmmax = 0; 3273 zone->zone_ipc.ipcq_shmmni = 0; 3274 zone->zone_ipc.ipcq_semmni = 0; 3275 zone->zone_ipc.ipcq_msgmni = 0; 3276 zone->zone_bootargs = NULL; 3277 zone->zone_initname = 3278 kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP); 3279 (void) strcpy(zone->zone_initname, zone_default_initname); 3280 zone->zone_nlwps = 0; 3281 zone->zone_nlwps_ctl = INT_MAX; 3282 zone->zone_locked_mem = 0; 3283 zone->zone_locked_mem_ctl = UINT64_MAX; 3284 zone->zone_max_swap = 0; 3285 zone->zone_max_swap_ctl = UINT64_MAX; 3286 zone0.zone_lockedmem_kstat = NULL; 3287 zone0.zone_swapresv_kstat = NULL; 3288 3289 /* 3290 * Zsched initializes the rctls. 3291 */ 3292 zone->zone_rctls = NULL; 3293 3294 if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { 3295 zone_free(zone); 3296 return (zone_create_error(error, 0, extended_error)); 3297 } 3298 3299 if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) { 3300 zone_free(zone); 3301 return (set_errno(error)); 3302 } 3303 3304 /* 3305 * Read in the trusted system parameters: 3306 * match flag and sensitivity label. 
3307 */ 3308 zone->zone_match = match; 3309 if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) { 3310 error = zone_set_label(zone, label, doi); 3311 if (error != 0) { 3312 zone_free(zone); 3313 return (set_errno(error)); 3314 } 3315 insert_label_hash = B_TRUE; 3316 } else { 3317 /* all zones get an admin_low label if system is not labeled */ 3318 zone->zone_slabel = l_admin_low; 3319 label_hold(l_admin_low); 3320 insert_label_hash = B_FALSE; 3321 } 3322 3323 /* 3324 * Stop all lwps since that's what normally happens as part of fork(). 3325 * This needs to happen before we grab any locks to avoid deadlock 3326 * (another lwp in the process could be waiting for the held lock). 3327 */ 3328 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) { 3329 zone_free(zone); 3330 if (rctls) 3331 nvlist_free(rctls); 3332 return (zone_create_error(error, 0, extended_error)); 3333 } 3334 3335 if (block_mounts() == 0) { 3336 mutex_enter(&pp->p_lock); 3337 if (curthread != pp->p_agenttp) 3338 continuelwps(pp); 3339 mutex_exit(&pp->p_lock); 3340 zone_free(zone); 3341 if (rctls) 3342 nvlist_free(rctls); 3343 return (zone_create_error(error, 0, extended_error)); 3344 } 3345 3346 /* 3347 * Set up credential for kernel access. After this, any errors 3348 * should go through the dance in errout rather than calling 3349 * zone_free directly. 3350 */ 3351 zone->zone_kcred = crdup(kcred); 3352 crsetzone(zone->zone_kcred, zone); 3353 priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred)); 3354 priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred)); 3355 priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred)); 3356 priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred)); 3357 3358 mutex_enter(&zonehash_lock); 3359 /* 3360 * Make sure zone doesn't already exist. 3361 * 3362 * If the system and zone are labeled, 3363 * make sure no other zone exists that has the same label. 
3364 */ 3365 if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL || 3366 (insert_label_hash && 3367 (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) { 3368 zone_status_t status; 3369 3370 status = zone_status_get(ztmp); 3371 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING) 3372 error = EEXIST; 3373 else 3374 error = EBUSY; 3375 goto errout; 3376 } 3377 3378 /* 3379 * Don't allow zone creations which would cause one zone's rootpath to 3380 * be accessible from that of another (non-global) zone. 3381 */ 3382 if (zone_is_nested(zone->zone_rootpath)) { 3383 error = EBUSY; 3384 goto errout; 3385 } 3386 3387 ASSERT(zonecount != 0); /* check for leaks */ 3388 if (zonecount + 1 > maxzones) { 3389 error = ENOMEM; 3390 goto errout; 3391 } 3392 3393 if (zone_mount_count(zone->zone_rootpath) != 0) { 3394 error = EBUSY; 3395 error2 = ZE_AREMOUNTS; 3396 goto errout; 3397 } 3398 3399 /* 3400 * Zone is still incomplete, but we need to drop all locks while 3401 * zsched() initializes this zone's kernel process. We 3402 * optimistically add the zone to the hashtable and associated 3403 * lists so a parallel zone_create() doesn't try to create the 3404 * same zone. 3405 */ 3406 zonecount++; 3407 (void) mod_hash_insert(zonehashbyid, 3408 (mod_hash_key_t)(uintptr_t)zone->zone_id, 3409 (mod_hash_val_t)(uintptr_t)zone); 3410 str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP); 3411 (void) strcpy(str, zone->zone_name); 3412 (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str, 3413 (mod_hash_val_t)(uintptr_t)zone); 3414 if (insert_label_hash) { 3415 (void) mod_hash_insert(zonehashbylabel, 3416 (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone); 3417 zone->zone_flags |= ZF_HASHED_LABEL; 3418 } 3419 3420 /* 3421 * Insert into active list. At this point there are no 'hold's 3422 * on the zone, but everyone else knows not to use it, so we can 3423 * continue to use it. zsched() will do a zone_hold() if the 3424 * newproc() is successful. 
3425 */ 3426 list_insert_tail(&zone_active, zone); 3427 mutex_exit(&zonehash_lock); 3428 3429 zarg.zone = zone; 3430 zarg.nvlist = rctls; 3431 /* 3432 * The process, task, and project rctls are probably wrong; 3433 * we need an interface to get the default values of all rctls, 3434 * and initialize zsched appropriately. I'm not sure that that 3435 * makes much of a difference, though. 3436 */ 3437 if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) { 3438 /* 3439 * We need to undo all globally visible state. 3440 */ 3441 mutex_enter(&zonehash_lock); 3442 list_remove(&zone_active, zone); 3443 if (zone->zone_flags & ZF_HASHED_LABEL) { 3444 ASSERT(zone->zone_slabel != NULL); 3445 (void) mod_hash_destroy(zonehashbylabel, 3446 (mod_hash_key_t)zone->zone_slabel); 3447 } 3448 (void) mod_hash_destroy(zonehashbyname, 3449 (mod_hash_key_t)(uintptr_t)zone->zone_name); 3450 (void) mod_hash_destroy(zonehashbyid, 3451 (mod_hash_key_t)(uintptr_t)zone->zone_id); 3452 ASSERT(zonecount > 1); 3453 zonecount--; 3454 goto errout; 3455 } 3456 3457 /* 3458 * Zone creation can't fail from now on. 3459 */ 3460 3461 /* 3462 * Create zone kstats 3463 */ 3464 zone_kstat_create(zone); 3465 3466 /* 3467 * Let the other lwps continue. 3468 */ 3469 mutex_enter(&pp->p_lock); 3470 if (curthread != pp->p_agenttp) 3471 continuelwps(pp); 3472 mutex_exit(&pp->p_lock); 3473 3474 /* 3475 * Wait for zsched to finish initializing the zone. 3476 */ 3477 zone_status_wait(zone, ZONE_IS_READY); 3478 /* 3479 * The zone is fully visible, so we can let mounts progress. 3480 */ 3481 resume_mounts(); 3482 if (rctls) 3483 nvlist_free(rctls); 3484 3485 return (zoneid); 3486 3487 errout: 3488 mutex_exit(&zonehash_lock); 3489 /* 3490 * Let the other lwps continue. 
3491 */ 3492 mutex_enter(&pp->p_lock); 3493 if (curthread != pp->p_agenttp) 3494 continuelwps(pp); 3495 mutex_exit(&pp->p_lock); 3496 3497 resume_mounts(); 3498 if (rctls) 3499 nvlist_free(rctls); 3500 /* 3501 * There is currently one reference to the zone, a cred_ref from 3502 * zone_kcred. To free the zone, we call crfree, which will call 3503 * zone_cred_rele, which will call zone_free. 3504 */ 3505 ASSERT(zone->zone_cred_ref == 1); /* for zone_kcred */ 3506 ASSERT(zone->zone_kcred->cr_ref == 1); 3507 ASSERT(zone->zone_ref == 0); 3508 zkcr = zone->zone_kcred; 3509 zone->zone_kcred = NULL; 3510 crfree(zkcr); /* triggers call to zone_free */ 3511 return (zone_create_error(error, error2, extended_error)); 3512 } 3513 3514 /* 3515 * Cause the zone to boot. This is pretty simple, since we let zoneadmd do 3516 * the heavy lifting. initname is the path to the program to launch 3517 * at the "top" of the zone; if this is NULL, we use the system default, 3518 * which is stored at zone_default_initname. 3519 */ 3520 static int 3521 zone_boot(zoneid_t zoneid) 3522 { 3523 int err; 3524 zone_t *zone; 3525 3526 if (secpolicy_zone_config(CRED()) != 0) 3527 return (set_errno(EPERM)); 3528 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 3529 return (set_errno(EINVAL)); 3530 3531 mutex_enter(&zonehash_lock); 3532 /* 3533 * Look for zone under hash lock to prevent races with calls to 3534 * zone_shutdown, zone_destroy, etc. 
3535 */ 3536 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 3537 mutex_exit(&zonehash_lock); 3538 return (set_errno(EINVAL)); 3539 } 3540 3541 mutex_enter(&zone_status_lock); 3542 if (zone_status_get(zone) != ZONE_IS_READY) { 3543 mutex_exit(&zone_status_lock); 3544 mutex_exit(&zonehash_lock); 3545 return (set_errno(EINVAL)); 3546 } 3547 zone_status_set(zone, ZONE_IS_BOOTING); 3548 mutex_exit(&zone_status_lock); 3549 3550 zone_hold(zone); /* so we can use the zone_t later */ 3551 mutex_exit(&zonehash_lock); 3552 3553 if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) { 3554 zone_rele(zone); 3555 return (set_errno(EINTR)); 3556 } 3557 3558 /* 3559 * Boot (starting init) might have failed, in which case the zone 3560 * will go to the SHUTTING_DOWN state; an appropriate errno will 3561 * be placed in zone->zone_boot_err, and so we return that. 3562 */ 3563 err = zone->zone_boot_err; 3564 zone_rele(zone); 3565 return (err ? set_errno(err) : 0); 3566 } 3567 3568 /* 3569 * Kills all user processes in the zone, waiting for them all to exit 3570 * before returning. 3571 */ 3572 static int 3573 zone_empty(zone_t *zone) 3574 { 3575 int waitstatus; 3576 3577 /* 3578 * We need to drop zonehash_lock before killing all 3579 * processes, otherwise we'll deadlock with zone_find_* 3580 * which can be called from the exit path. 3581 */ 3582 ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); 3583 while ((waitstatus = zone_status_timedwait_sig(zone, lbolt + hz, 3584 ZONE_IS_EMPTY)) == -1) { 3585 killall(zone->zone_id); 3586 } 3587 /* 3588 * return EINTR if we were signaled 3589 */ 3590 if (waitstatus == 0) 3591 return (EINTR); 3592 return (0); 3593 } 3594 3595 /* 3596 * This function implements the policy for zone visibility. 3597 * 3598 * In standard Solaris, a non-global zone can only see itself. 3599 * 3600 * In Trusted Extensions, a labeled zone can lookup any zone whose label 3601 * it dominates. 
For this test, the label of the global zone is treated as 3602 * admin_high so it is special-cased instead of being checked for dominance. 3603 * 3604 * Returns true if zone attributes are viewable, false otherwise. 3605 */ 3606 static boolean_t 3607 zone_list_access(zone_t *zone) 3608 { 3609 3610 if (curproc->p_zone == global_zone || 3611 curproc->p_zone == zone) { 3612 return (B_TRUE); 3613 } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) { 3614 bslabel_t *curproc_label; 3615 bslabel_t *zone_label; 3616 3617 curproc_label = label2bslabel(curproc->p_zone->zone_slabel); 3618 zone_label = label2bslabel(zone->zone_slabel); 3619 3620 if (zone->zone_id != GLOBAL_ZONEID && 3621 bldominates(curproc_label, zone_label)) { 3622 return (B_TRUE); 3623 } else { 3624 return (B_FALSE); 3625 } 3626 } else { 3627 return (B_FALSE); 3628 } 3629 } 3630 3631 /* 3632 * Systemcall to start the zone's halt sequence. By the time this 3633 * function successfully returns, all user processes and kernel threads 3634 * executing in it will have exited, ZSD shutdown callbacks executed, 3635 * and the zone status set to ZONE_IS_DOWN. 3636 * 3637 * It is possible that the call will interrupt itself if the caller is the 3638 * parent of any process running in the zone, and doesn't have SIGCHLD blocked. 3639 */ 3640 static int 3641 zone_shutdown(zoneid_t zoneid) 3642 { 3643 int error; 3644 zone_t *zone; 3645 zone_status_t status; 3646 3647 if (secpolicy_zone_config(CRED()) != 0) 3648 return (set_errno(EPERM)); 3649 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 3650 return (set_errno(EINVAL)); 3651 3652 /* 3653 * Block mounts so that VFS_MOUNT() can get an accurate view of 3654 * the zone's status with regards to ZONE_IS_SHUTTING down. 3655 * 3656 * e.g. NFS can fail the mount if it determines that the zone 3657 * has already begun the shutdown sequence. 
/*
 * Systemcall to start the zone's halt sequence.  By the time this
 * function successfully returns, all user processes and kernel threads
 * executing in it will have exited, ZSD shutdown callbacks executed,
 * and the zone status set to ZONE_IS_DOWN.
 *
 * It is possible that the call will interrupt itself if the caller is the
 * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
 *
 * Returns 0 on success (including when the zone is already at or past
 * ZONE_IS_DOWN).  Otherwise the result of set_errno() with:
 *	EPERM	caller fails secpolicy_zone_config()
 *	EINVAL	zoneid out of range, zone not found, or zone not yet READY
 *	EINTR	one of the interruptible waits (block_mounts(), zone_empty(),
 *		pool_lock_intr(), zone_status_wait_sig()) was interrupted
 */
static int
zone_shutdown(zoneid_t zoneid)
{
	int error;
	zone_t *zone;
	zone_status_t status;

	if (secpolicy_zone_config(CRED()) != 0)
		return (set_errno(EPERM));
	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
		return (set_errno(EINVAL));

	/*
	 * Block mounts so that VFS_MOUNT() can get an accurate view of
	 * the zone's status with regards to ZONE_IS_SHUTTING down.
	 *
	 * e.g. NFS can fail the mount if it determines that the zone
	 * has already begun the shutdown sequence.
	 */
	if (block_mounts() == 0)
		return (set_errno(EINTR));
	mutex_enter(&zonehash_lock);
	/*
	 * Look for zone under hash lock to prevent races with other
	 * calls to zone_shutdown and zone_destroy.
	 */
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		resume_mounts();
		return (set_errno(EINVAL));
	}
	mutex_enter(&zone_status_lock);
	status = zone_status_get(zone);
	/*
	 * Fail if the zone isn't fully initialized yet.
	 */
	if (status < ZONE_IS_READY) {
		mutex_exit(&zone_status_lock);
		mutex_exit(&zonehash_lock);
		resume_mounts();
		return (set_errno(EINVAL));
	}
	/*
	 * If conditions required for zone_shutdown() to return have been met,
	 * return success.
	 */
	if (status >= ZONE_IS_DOWN) {
		mutex_exit(&zone_status_lock);
		mutex_exit(&zonehash_lock);
		resume_mounts();
		return (0);
	}
	/*
	 * If zone_shutdown() hasn't been called before, go through the motions.
	 * If it has, there's nothing to do but wait for the kernel threads to
	 * drain.
	 */
	if (status < ZONE_IS_EMPTY) {
		uint_t ntasks;

		/* Sample the task count under zone_lock. */
		mutex_enter(&zone->zone_lock);
		if ((ntasks = zone->zone_ntasks) != 1) {
			/*
			 * There's still stuff running.
			 */
			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
		}
		mutex_exit(&zone->zone_lock);
		if (ntasks == 1) {
			/*
			 * The only way to create another task is through
			 * zone_enter(), which will block until we drop
			 * zonehash_lock.  The zone is empty.
			 */
			if (zone->zone_kthreads == NULL) {
				/*
				 * Skip ahead to ZONE_IS_DOWN
				 */
				zone_status_set(zone, ZONE_IS_DOWN);
			} else {
				zone_status_set(zone, ZONE_IS_EMPTY);
			}
		}
	}
	zone_hold(zone);	/* so we can use the zone_t later */
	/* Locks are released in the reverse of the order they were taken. */
	mutex_exit(&zone_status_lock);
	mutex_exit(&zonehash_lock);
	resume_mounts();

	/* Kill user processes and wait for ZONE_IS_EMPTY; may return EINTR. */
	if (error = zone_empty(zone)) {
		zone_rele(zone);
		return (set_errno(error));
	}
	/*
	 * After the zone status goes to ZONE_IS_DOWN this zone will no
	 * longer be notified of changes to the pools configuration, so
	 * in order to not end up with a stale pool pointer, we point
	 * ourselves at the default pool and remove all resource
	 * visibility.  This is especially important as the zone_t may
	 * languish on the deathrow for a very long time waiting for
	 * cred's to drain out.
	 *
	 * This rebinding of the zone can happen multiple times
	 * (presumably due to interrupted or parallel systemcalls)
	 * without any adverse effects.
	 */
	if (pool_lock_intr() != 0) {
		zone_rele(zone);
		return (set_errno(EINTR));
	}
	if (pool_state == POOL_ENABLED) {
		mutex_enter(&cpu_lock);
		zone_pool_set(zone, pool_default);
		/*
		 * The zone no longer needs to be able to see any cpus.
		 */
		zone_pset_set(zone, ZONE_PS_INVAL);
		mutex_exit(&cpu_lock);
	}
	pool_unlock();

	/*
	 * ZSD shutdown callbacks can be executed multiple times, hence
	 * it is safe to not be holding any locks across this call.
	 */
	zone_zsd_callbacks(zone, ZSD_SHUTDOWN);

	/* With no kernel threads left, move straight to ZONE_IS_DOWN. */
	mutex_enter(&zone_status_lock);
	if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
		zone_status_set(zone, ZONE_IS_DOWN);
	mutex_exit(&zone_status_lock);

	/*
	 * Wait for kernel threads to drain.
	 */
	if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
		zone_rele(zone);
		return (set_errno(EINTR));
	}

	brand_unregister_zone(zone->zone_brand);

	/* Drop the hold taken above. */
	zone_rele(zone);
	return (0);
}
/*
 * Systemcall entry point to finalize the zone halt process.  The caller
 * must have already successfully called zone_shutdown().
 *
 * Upon successful completion, the zone will have been fully destroyed:
 * zsched will have exited, destructor callbacks executed, and the zone
 * removed from the list of active zones.
 *
 * Returns 0 on success (including when the zone has already gone away by
 * the time it is looked up again).  Otherwise the result of set_errno()
 * with:
 *	EPERM	caller fails secpolicy_zone_config()
 *	EINVAL	zoneid out of range or zone not found
 *	EBUSY	zone_mount_count() is non-zero for the zone's root path, or
 *		the zone has not yet reached ZONE_IS_DOWN
 *	EINTR	signaled while waiting on zone_destroy_cv
 */
static int
zone_destroy(zoneid_t zoneid)
{
	uint64_t uniqid;
	zone_t *zone;
	zone_status_t status;

	if (secpolicy_zone_config(CRED()) != 0)
		return (set_errno(EPERM));
	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
		return (set_errno(EINVAL));

	mutex_enter(&zonehash_lock);
	/*
	 * Look for zone under hash lock to prevent races with other
	 * calls to zone_destroy.
	 */
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (set_errno(EINVAL));
	}

	if (zone_mount_count(zone->zone_rootpath) != 0) {
		mutex_exit(&zonehash_lock);
		return (set_errno(EBUSY));
	}
	mutex_enter(&zone_status_lock);
	status = zone_status_get(zone);
	if (status < ZONE_IS_DOWN) {
		mutex_exit(&zone_status_lock);
		mutex_exit(&zonehash_lock);
		return (set_errno(EBUSY));
	} else if (status == ZONE_IS_DOWN) {
		zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
	}
	mutex_exit(&zone_status_lock);
	/* Hold the zone so it can be used after zonehash_lock is dropped. */
	zone_hold(zone);
	mutex_exit(&zonehash_lock);

	/*
	 * wait for zsched to exit
	 */
	zone_status_wait(zone, ZONE_IS_DEAD);
	zone_zsd_callbacks(zone, ZSD_DESTROY);
	zone->zone_netstack = NULL;
	/*
	 * Remember the unique id: after the zone_rele() below the zone_t
	 * may be gone, and the id is how the lookup loop recognizes it.
	 */
	uniqid = zone->zone_uniqid;
	zone_rele(zone);
	zone = NULL;	/* potentially free'd */

	mutex_enter(&zonehash_lock);
	for (; /* ever */; ) {
		boolean_t unref;

		if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
		    zone->zone_uniqid != uniqid) {
			/*
			 * The zone has gone away.  Necessary conditions
			 * are met, so we return success.
			 */
			mutex_exit(&zonehash_lock);
			return (0);
		}
		mutex_enter(&zone->zone_lock);
		unref = ZONE_IS_UNREF(zone);
		mutex_exit(&zone->zone_lock);
		if (unref) {
			/*
			 * There is only one reference to the zone -- that
			 * added when the zone was added to the hashtables --
			 * and things will remain this way until we drop
			 * zonehash_lock... we can go ahead and cleanup the
			 * zone.
			 */
			break;
		}

		/* Sleep until zone_destroy_cv is signalled, then re-check. */
		if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
			/* Signaled */
			mutex_exit(&zonehash_lock);
			return (set_errno(EINTR));
		}

	}

	/* Get rid of the zone's kstats */
	zone_kstat_delete(zone);

	/*
	 * It is now safe to let the zone be recreated; remove it from the
	 * lists.  The memory will not be freed until the last cred
	 * reference goes away.
	 */
	ASSERT(zonecount > 1);	/* must be > 1; can't destroy global zone */
	zonecount--;
	/* remove from active list and hash tables */
	list_remove(&zone_active, zone);
	(void) mod_hash_destroy(zonehashbyname,
	    (mod_hash_key_t)zone->zone_name);
	(void) mod_hash_destroy(zonehashbyid,
	    (mod_hash_key_t)(uintptr_t)zone->zone_id);
	if (zone->zone_flags & ZF_HASHED_LABEL)
		(void) mod_hash_destroy(zonehashbylabel,
		    (mod_hash_key_t)zone->zone_slabel);
	mutex_exit(&zonehash_lock);

	/*
	 * Release the root vnode; we're not using it anymore.  Nor should any
	 * other thread that might access it exist.
	 */
	if (zone->zone_rootvp != NULL) {
		VN_RELE(zone->zone_rootvp);
		zone->zone_rootvp = NULL;
	}

	/* add to deathrow list */
	mutex_enter(&zone_deathrow_lock);
	list_insert_tail(&zone_deathrow, zone);
	mutex_exit(&zone_deathrow_lock);

	/*
	 * Drop last reference (which was added by zsched()), this will
	 * free the zone unless there are outstanding cred references.
	 */
	zone_rele(zone);
	return (0);
}
3923 */ 3924 static ssize_t 3925 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) 3926 { 3927 size_t size; 3928 int error = 0, err; 3929 zone_t *zone; 3930 char *zonepath; 3931 char *outstr; 3932 zone_status_t zone_status; 3933 pid_t initpid; 3934 boolean_t global = (curproc->p_zone == global_zone); 3935 boolean_t curzone = (curproc->p_zone->zone_id == zoneid); 3936 ushort_t flags; 3937 3938 mutex_enter(&zonehash_lock); 3939 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 3940 mutex_exit(&zonehash_lock); 3941 return (set_errno(EINVAL)); 3942 } 3943 zone_status = zone_status_get(zone); 3944 if (zone_status < ZONE_IS_READY) { 3945 mutex_exit(&zonehash_lock); 3946 return (set_errno(EINVAL)); 3947 } 3948 zone_hold(zone); 3949 mutex_exit(&zonehash_lock); 3950 3951 /* 3952 * If not in the global zone, don't show information about other zones, 3953 * unless the system is labeled and the local zone's label dominates 3954 * the other zone. 3955 */ 3956 if (!zone_list_access(zone)) { 3957 zone_rele(zone); 3958 return (set_errno(EINVAL)); 3959 } 3960 3961 switch (attr) { 3962 case ZONE_ATTR_ROOT: 3963 if (global) { 3964 /* 3965 * Copy the path to trim the trailing "/" (except for 3966 * the global zone). 3967 */ 3968 if (zone != global_zone) 3969 size = zone->zone_rootpathlen - 1; 3970 else 3971 size = zone->zone_rootpathlen; 3972 zonepath = kmem_alloc(size, KM_SLEEP); 3973 bcopy(zone->zone_rootpath, zonepath, size); 3974 zonepath[size - 1] = '\0'; 3975 } else { 3976 if (curzone || !is_system_labeled()) { 3977 /* 3978 * Caller is not in the global zone. 3979 * if the query is on the current zone 3980 * or the system is not labeled, 3981 * just return faked-up path for current zone. 3982 */ 3983 zonepath = "/"; 3984 size = 2; 3985 } else { 3986 /* 3987 * Return related path for current zone. 
3988 */ 3989 int prefix_len = strlen(zone_prefix); 3990 int zname_len = strlen(zone->zone_name); 3991 3992 size = prefix_len + zname_len + 1; 3993 zonepath = kmem_alloc(size, KM_SLEEP); 3994 bcopy(zone_prefix, zonepath, prefix_len); 3995 bcopy(zone->zone_name, zonepath + 3996 prefix_len, zname_len); 3997 zonepath[size - 1] = '\0'; 3998 } 3999 } 4000 if (bufsize > size) 4001 bufsize = size; 4002 if (buf != NULL) { 4003 err = copyoutstr(zonepath, buf, bufsize, NULL); 4004 if (err != 0 && err != ENAMETOOLONG) 4005 error = EFAULT; 4006 } 4007 if (global || (is_system_labeled() && !curzone)) 4008 kmem_free(zonepath, size); 4009 break; 4010 4011 case ZONE_ATTR_NAME: 4012 size = strlen(zone->zone_name) + 1; 4013 if (bufsize > size) 4014 bufsize = size; 4015 if (buf != NULL) { 4016 err = copyoutstr(zone->zone_name, buf, bufsize, NULL); 4017 if (err != 0 && err != ENAMETOOLONG) 4018 error = EFAULT; 4019 } 4020 break; 4021 4022 case ZONE_ATTR_STATUS: 4023 /* 4024 * Since we're not holding zonehash_lock, the zone status 4025 * may be anything; leave it up to userland to sort it out. 
4026 */ 4027 size = sizeof (zone_status); 4028 if (bufsize > size) 4029 bufsize = size; 4030 zone_status = zone_status_get(zone); 4031 if (buf != NULL && 4032 copyout(&zone_status, buf, bufsize) != 0) 4033 error = EFAULT; 4034 break; 4035 case ZONE_ATTR_FLAGS: 4036 size = sizeof (zone->zone_flags); 4037 if (bufsize > size) 4038 bufsize = size; 4039 flags = zone->zone_flags; 4040 if (buf != NULL && 4041 copyout(&flags, buf, bufsize) != 0) 4042 error = EFAULT; 4043 break; 4044 case ZONE_ATTR_PRIVSET: 4045 size = sizeof (priv_set_t); 4046 if (bufsize > size) 4047 bufsize = size; 4048 if (buf != NULL && 4049 copyout(zone->zone_privset, buf, bufsize) != 0) 4050 error = EFAULT; 4051 break; 4052 case ZONE_ATTR_UNIQID: 4053 size = sizeof (zone->zone_uniqid); 4054 if (bufsize > size) 4055 bufsize = size; 4056 if (buf != NULL && 4057 copyout(&zone->zone_uniqid, buf, bufsize) != 0) 4058 error = EFAULT; 4059 break; 4060 case ZONE_ATTR_POOLID: 4061 { 4062 pool_t *pool; 4063 poolid_t poolid; 4064 4065 if (pool_lock_intr() != 0) { 4066 error = EINTR; 4067 break; 4068 } 4069 pool = zone_pool_get(zone); 4070 poolid = pool->pool_id; 4071 pool_unlock(); 4072 size = sizeof (poolid); 4073 if (bufsize > size) 4074 bufsize = size; 4075 if (buf != NULL && copyout(&poolid, buf, size) != 0) 4076 error = EFAULT; 4077 } 4078 break; 4079 case ZONE_ATTR_SLBL: 4080 size = sizeof (bslabel_t); 4081 if (bufsize > size) 4082 bufsize = size; 4083 if (zone->zone_slabel == NULL) 4084 error = EINVAL; 4085 else if (buf != NULL && 4086 copyout(label2bslabel(zone->zone_slabel), buf, 4087 bufsize) != 0) 4088 error = EFAULT; 4089 break; 4090 case ZONE_ATTR_INITPID: 4091 size = sizeof (initpid); 4092 if (bufsize > size) 4093 bufsize = size; 4094 initpid = zone->zone_proc_initpid; 4095 if (initpid == -1) { 4096 error = ESRCH; 4097 break; 4098 } 4099 if (buf != NULL && 4100 copyout(&initpid, buf, bufsize) != 0) 4101 error = EFAULT; 4102 break; 4103 case ZONE_ATTR_BRAND: 4104 size = 
strlen(zone->zone_brand->b_name) + 1; 4105 4106 if (bufsize > size) 4107 bufsize = size; 4108 if (buf != NULL) { 4109 err = copyoutstr(zone->zone_brand->b_name, buf, 4110 bufsize, NULL); 4111 if (err != 0 && err != ENAMETOOLONG) 4112 error = EFAULT; 4113 } 4114 break; 4115 case ZONE_ATTR_INITNAME: 4116 size = strlen(zone->zone_initname) + 1; 4117 if (bufsize > size) 4118 bufsize = size; 4119 if (buf != NULL) { 4120 err = copyoutstr(zone->zone_initname, buf, bufsize, 4121 NULL); 4122 if (err != 0 && err != ENAMETOOLONG) 4123 error = EFAULT; 4124 } 4125 break; 4126 case ZONE_ATTR_BOOTARGS: 4127 if (zone->zone_bootargs == NULL) 4128 outstr = ""; 4129 else 4130 outstr = zone->zone_bootargs; 4131 size = strlen(outstr) + 1; 4132 if (bufsize > size) 4133 bufsize = size; 4134 if (buf != NULL) { 4135 err = copyoutstr(outstr, buf, bufsize, NULL); 4136 if (err != 0 && err != ENAMETOOLONG) 4137 error = EFAULT; 4138 } 4139 break; 4140 case ZONE_ATTR_PHYS_MCAP: 4141 size = sizeof (zone->zone_phys_mcap); 4142 if (bufsize > size) 4143 bufsize = size; 4144 if (buf != NULL && 4145 copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) 4146 error = EFAULT; 4147 break; 4148 case ZONE_ATTR_SCHED_CLASS: 4149 mutex_enter(&class_lock); 4150 4151 if (zone->zone_defaultcid >= loaded_classes) 4152 outstr = ""; 4153 else 4154 outstr = sclass[zone->zone_defaultcid].cl_name; 4155 size = strlen(outstr) + 1; 4156 if (bufsize > size) 4157 bufsize = size; 4158 if (buf != NULL) { 4159 err = copyoutstr(outstr, buf, bufsize, NULL); 4160 if (err != 0 && err != ENAMETOOLONG) 4161 error = EFAULT; 4162 } 4163 4164 mutex_exit(&class_lock); 4165 break; 4166 default: 4167 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { 4168 size = bufsize; 4169 error = ZBROP(zone)->b_getattr(zone, attr, buf, &size); 4170 } else { 4171 error = EINVAL; 4172 } 4173 } 4174 zone_rele(zone); 4175 4176 if (error) 4177 return (set_errno(error)); 4178 return ((ssize_t)size); 4179 } 4180 4181 /* 4182 * Systemcall entry 
point for zone_setattr(2). 4183 */ 4184 /*ARGSUSED*/ 4185 static int 4186 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) 4187 { 4188 zone_t *zone; 4189 zone_status_t zone_status; 4190 struct brand_attr *attrp; 4191 int err; 4192 4193 if (secpolicy_zone_config(CRED()) != 0) 4194 return (set_errno(EPERM)); 4195 4196 /* 4197 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the 4198 * global zone. 4199 */ 4200 if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { 4201 return (set_errno(EINVAL)); 4202 } 4203 4204 mutex_enter(&zonehash_lock); 4205 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 4206 mutex_exit(&zonehash_lock); 4207 return (set_errno(EINVAL)); 4208 } 4209 zone_hold(zone); 4210 mutex_exit(&zonehash_lock); 4211 4212 /* 4213 * At present most attributes can only be set on non-running, 4214 * non-global zones. 4215 */ 4216 zone_status = zone_status_get(zone); 4217 if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) 4218 goto done; 4219 4220 switch (attr) { 4221 case ZONE_ATTR_INITNAME: 4222 err = zone_set_initname(zone, (const char *)buf); 4223 break; 4224 case ZONE_ATTR_BOOTARGS: 4225 err = zone_set_bootargs(zone, (const char *)buf); 4226 break; 4227 case ZONE_ATTR_BRAND: 4228 ASSERT(!ZONE_IS_BRANDED(zone)); 4229 err = 0; 4230 attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP); 4231 if ((buf == NULL) || 4232 (copyin(buf, attrp, sizeof (struct brand_attr)) != 0)) { 4233 kmem_free(attrp, sizeof (struct brand_attr)); 4234 err = EFAULT; 4235 break; 4236 } 4237 4238 if (is_system_labeled() && strncmp(attrp->ba_brandname, 4239 NATIVE_BRAND_NAME, MAXNAMELEN) != 0) { 4240 err = EPERM; 4241 break; 4242 } 4243 4244 zone->zone_brand = brand_register_zone(attrp); 4245 kmem_free(attrp, sizeof (struct brand_attr)); 4246 if (zone->zone_brand == NULL) 4247 err = EINVAL; 4248 break; 4249 case ZONE_ATTR_PHYS_MCAP: 4250 err = zone_set_phys_mcap(zone, (const uint64_t *)buf); 4251 break; 4252 case ZONE_ATTR_SCHED_CLASS: 
4253 err = zone_set_sched_class(zone, (const char *)buf); 4254 break; 4255 default: 4256 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) 4257 err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); 4258 else 4259 err = EINVAL; 4260 } 4261 4262 done: 4263 zone_rele(zone); 4264 return (err != 0 ? set_errno(err) : 0); 4265 } 4266 4267 /* 4268 * Return zero if the process has at least one vnode mapped in to its 4269 * address space which shouldn't be allowed to change zones. 4270 * 4271 * Also return zero if the process has any shared mappings which reserve 4272 * swap. This is because the counting for zone.max-swap does not allow swap 4273 * revervation to be shared between zones. zone swap reservation is counted 4274 * on zone->zone_max_swap. 4275 */ 4276 static int 4277 as_can_change_zones(void) 4278 { 4279 proc_t *pp = curproc; 4280 struct seg *seg; 4281 struct as *as = pp->p_as; 4282 vnode_t *vp; 4283 int allow = 1; 4284 4285 ASSERT(pp->p_as != &kas); 4286 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 4287 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 4288 4289 /* 4290 * Cannot enter zone with shared anon memory which 4291 * reserves swap. See comment above. 4292 */ 4293 if (seg_can_change_zones(seg) == B_FALSE) { 4294 allow = 0; 4295 break; 4296 } 4297 /* 4298 * if we can't get a backing vnode for this segment then skip 4299 * it. 
4300 */ 4301 vp = NULL; 4302 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL) 4303 continue; 4304 if (!vn_can_change_zones(vp)) { /* bail on first match */ 4305 allow = 0; 4306 break; 4307 } 4308 } 4309 AS_LOCK_EXIT(as, &as->a_lock); 4310 return (allow); 4311 } 4312 4313 /* 4314 * Count swap reserved by curproc's address space 4315 */ 4316 static size_t 4317 as_swresv(void) 4318 { 4319 proc_t *pp = curproc; 4320 struct seg *seg; 4321 struct as *as = pp->p_as; 4322 size_t swap = 0; 4323 4324 ASSERT(pp->p_as != &kas); 4325 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 4326 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) 4327 swap += seg_swresv(seg); 4328 4329 return (swap); 4330 } 4331 4332 /* 4333 * Systemcall entry point for zone_enter(). 4334 * 4335 * The current process is injected into said zone. In the process 4336 * it will change its project membership, privileges, rootdir/cwd, 4337 * zone-wide rctls, and pool association to match those of the zone. 4338 * 4339 * The first zone_enter() called while the zone is in the ZONE_IS_READY 4340 * state will transition it to ZONE_IS_RUNNING. Processes may only 4341 * enter a zone that is "ready" or "running". 4342 */ 4343 static int 4344 zone_enter(zoneid_t zoneid) 4345 { 4346 zone_t *zone; 4347 vnode_t *vp; 4348 proc_t *pp = curproc; 4349 contract_t *ct; 4350 cont_process_t *ctp; 4351 task_t *tk, *oldtk; 4352 kproject_t *zone_proj0; 4353 cred_t *cr, *newcr; 4354 pool_t *oldpool, *newpool; 4355 sess_t *sp; 4356 uid_t uid; 4357 zone_status_t status; 4358 int err = 0; 4359 rctl_entity_p_t e; 4360 size_t swap; 4361 4362 if (secpolicy_zone_config(CRED()) != 0) 4363 return (set_errno(EPERM)); 4364 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 4365 return (set_errno(EINVAL)); 4366 4367 /* 4368 * Stop all lwps so we don't need to hold a lock to look at 4369 * curproc->p_zone. 
This needs to happen before we grab any 4370 * locks to avoid deadlock (another lwp in the process could 4371 * be waiting for the held lock). 4372 */ 4373 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) 4374 return (set_errno(EINTR)); 4375 4376 /* 4377 * Make sure we're not changing zones with files open or mapped in 4378 * to our address space which shouldn't be changing zones. 4379 */ 4380 if (!files_can_change_zones()) { 4381 err = EBADF; 4382 goto out; 4383 } 4384 if (!as_can_change_zones()) { 4385 err = EFAULT; 4386 goto out; 4387 } 4388 4389 mutex_enter(&zonehash_lock); 4390 if (pp->p_zone != global_zone) { 4391 mutex_exit(&zonehash_lock); 4392 err = EINVAL; 4393 goto out; 4394 } 4395 4396 zone = zone_find_all_by_id(zoneid); 4397 if (zone == NULL) { 4398 mutex_exit(&zonehash_lock); 4399 err = EINVAL; 4400 goto out; 4401 } 4402 4403 /* 4404 * To prevent processes in a zone from holding contracts on 4405 * extrazonal resources, and to avoid process contract 4406 * memberships which span zones, contract holders and processes 4407 * which aren't the sole members of their encapsulating process 4408 * contracts are not allowed to zone_enter. 4409 */ 4410 ctp = pp->p_ct_process; 4411 ct = &ctp->conp_contract; 4412 mutex_enter(&ct->ct_lock); 4413 mutex_enter(&pp->p_lock); 4414 if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) { 4415 mutex_exit(&pp->p_lock); 4416 mutex_exit(&ct->ct_lock); 4417 mutex_exit(&zonehash_lock); 4418 pool_unlock(); 4419 err = EINVAL; 4420 goto out; 4421 } 4422 4423 /* 4424 * Moreover, we don't allow processes whose encapsulating 4425 * process contracts have inherited extrazonal contracts. 4426 * While it would be easier to eliminate all process contracts 4427 * with inherited contracts, we need to be able to give a 4428 * restarted init (or other zone-penetrating process) its 4429 * predecessor's contracts. 
4430 */ 4431 if (ctp->conp_ninherited != 0) { 4432 contract_t *next; 4433 for (next = list_head(&ctp->conp_inherited); next; 4434 next = list_next(&ctp->conp_inherited, next)) { 4435 if (contract_getzuniqid(next) != zone->zone_uniqid) { 4436 mutex_exit(&pp->p_lock); 4437 mutex_exit(&ct->ct_lock); 4438 mutex_exit(&zonehash_lock); 4439 pool_unlock(); 4440 err = EINVAL; 4441 goto out; 4442 } 4443 } 4444 } 4445 mutex_exit(&pp->p_lock); 4446 mutex_exit(&ct->ct_lock); 4447 4448 status = zone_status_get(zone); 4449 if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) { 4450 /* 4451 * Can't join 4452 */ 4453 mutex_exit(&zonehash_lock); 4454 err = EINVAL; 4455 goto out; 4456 } 4457 4458 /* 4459 * Make sure new priv set is within the permitted set for caller 4460 */ 4461 if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) { 4462 mutex_exit(&zonehash_lock); 4463 err = EPERM; 4464 goto out; 4465 } 4466 /* 4467 * We want to momentarily drop zonehash_lock while we optimistically 4468 * bind curproc to the pool it should be running in. This is safe 4469 * since the zone can't disappear (we have a hold on it). 4470 */ 4471 zone_hold(zone); 4472 mutex_exit(&zonehash_lock); 4473 4474 /* 4475 * Grab pool_lock to keep the pools configuration from changing 4476 * and to stop ourselves from getting rebound to another pool 4477 * until we join the zone. 4478 */ 4479 if (pool_lock_intr() != 0) { 4480 zone_rele(zone); 4481 err = EINTR; 4482 goto out; 4483 } 4484 ASSERT(secpolicy_pool(CRED()) == 0); 4485 /* 4486 * Bind ourselves to the pool currently associated with the zone. 4487 */ 4488 oldpool = curproc->p_pool; 4489 newpool = zone_pool_get(zone); 4490 if (pool_state == POOL_ENABLED && newpool != oldpool && 4491 (err = pool_do_bind(newpool, P_PID, P_MYID, 4492 POOL_BIND_ALL)) != 0) { 4493 pool_unlock(); 4494 zone_rele(zone); 4495 goto out; 4496 } 4497 4498 /* 4499 * Grab cpu_lock now; we'll need it later when we call 4500 * task_join(). 
4501 */ 4502 mutex_enter(&cpu_lock); 4503 mutex_enter(&zonehash_lock); 4504 /* 4505 * Make sure the zone hasn't moved on since we dropped zonehash_lock. 4506 */ 4507 if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) { 4508 /* 4509 * Can't join anymore. 4510 */ 4511 mutex_exit(&zonehash_lock); 4512 mutex_exit(&cpu_lock); 4513 if (pool_state == POOL_ENABLED && 4514 newpool != oldpool) 4515 (void) pool_do_bind(oldpool, P_PID, P_MYID, 4516 POOL_BIND_ALL); 4517 pool_unlock(); 4518 zone_rele(zone); 4519 err = EINVAL; 4520 goto out; 4521 } 4522 4523 /* 4524 * a_lock must be held while transfering locked memory and swap 4525 * reservation from the global zone to the non global zone because 4526 * asynchronous faults on the processes' address space can lock 4527 * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE 4528 * segments respectively. 4529 */ 4530 AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER); 4531 swap = as_swresv(); 4532 mutex_enter(&pp->p_lock); 4533 zone_proj0 = zone->zone_zsched->p_task->tk_proj; 4534 /* verify that we do not exceed and task or lwp limits */ 4535 mutex_enter(&zone->zone_nlwps_lock); 4536 /* add new lwps to zone and zone's proj0 */ 4537 zone_proj0->kpj_nlwps += pp->p_lwpcnt; 4538 zone->zone_nlwps += pp->p_lwpcnt; 4539 /* add 1 task to zone's proj0 */ 4540 zone_proj0->kpj_ntasks += 1; 4541 mutex_exit(&zone->zone_nlwps_lock); 4542 4543 mutex_enter(&zone->zone_mem_lock); 4544 zone->zone_locked_mem += pp->p_locked_mem; 4545 zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem; 4546 zone->zone_max_swap += swap; 4547 mutex_exit(&zone->zone_mem_lock); 4548 4549 /* remove lwps from proc's old zone and old project */ 4550 mutex_enter(&pp->p_zone->zone_nlwps_lock); 4551 pp->p_zone->zone_nlwps -= pp->p_lwpcnt; 4552 pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt; 4553 mutex_exit(&pp->p_zone->zone_nlwps_lock); 4554 4555 mutex_enter(&pp->p_zone->zone_mem_lock); 4556 pp->p_zone->zone_locked_mem -= pp->p_locked_mem; 4557 
pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem; 4558 pp->p_zone->zone_max_swap -= swap; 4559 mutex_exit(&pp->p_zone->zone_mem_lock); 4560 4561 mutex_exit(&pp->p_lock); 4562 AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock); 4563 4564 /* 4565 * Joining the zone cannot fail from now on. 4566 * 4567 * This means that a lot of the following code can be commonized and 4568 * shared with zsched(). 4569 */ 4570 4571 /* 4572 * Reset the encapsulating process contract's zone. 4573 */ 4574 ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID); 4575 contract_setzuniqid(ct, zone->zone_uniqid); 4576 4577 /* 4578 * Create a new task and associate the process with the project keyed 4579 * by (projid,zoneid). 4580 * 4581 * We might as well be in project 0; the global zone's projid doesn't 4582 * make much sense in a zone anyhow. 4583 * 4584 * This also increments zone_ntasks, and returns with p_lock held. 4585 */ 4586 tk = task_create(0, zone); 4587 oldtk = task_join(tk, 0); 4588 mutex_exit(&cpu_lock); 4589 4590 pp->p_flag |= SZONETOP; 4591 pp->p_zone = zone; 4592 4593 /* 4594 * call RCTLOP_SET functions on this proc 4595 */ 4596 e.rcep_p.zone = zone; 4597 e.rcep_t = RCENTITY_ZONE; 4598 (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL, 4599 RCD_CALLBACK); 4600 mutex_exit(&pp->p_lock); 4601 4602 /* 4603 * We don't need to hold any of zsched's locks here; not only do we know 4604 * the process and zone aren't going away, we know its session isn't 4605 * changing either. 4606 * 4607 * By joining zsched's session here, we mimic the behavior in the 4608 * global zone of init's sid being the pid of sched. We extend this 4609 * to all zlogin-like zone_enter()'ing processes as well. 
4610 */ 4611 mutex_enter(&pidlock); 4612 sp = zone->zone_zsched->p_sessp; 4613 sess_hold(zone->zone_zsched); 4614 mutex_enter(&pp->p_lock); 4615 pgexit(pp); 4616 sess_rele(pp->p_sessp, B_TRUE); 4617 pp->p_sessp = sp; 4618 pgjoin(pp, zone->zone_zsched->p_pidp); 4619 4620 /* 4621 * If there is a default scheduling class for the zone and it is not 4622 * the class we are currently in, change all of the threads in the 4623 * process to the new class. We need to be holding pidlock & p_lock 4624 * when we call parmsset so this is a good place to do it. 4625 */ 4626 if (zone->zone_defaultcid > 0 && 4627 zone->zone_defaultcid != curthread->t_cid) { 4628 pcparms_t pcparms; 4629 kthread_id_t t; 4630 4631 pcparms.pc_cid = zone->zone_defaultcid; 4632 pcparms.pc_clparms[0] = 0; 4633 4634 /* 4635 * If setting the class fails, we still want to enter the zone. 4636 */ 4637 if ((t = pp->p_tlist) != NULL) { 4638 do { 4639 (void) parmsset(&pcparms, t); 4640 } while ((t = t->t_forw) != pp->p_tlist); 4641 } 4642 } 4643 4644 mutex_exit(&pp->p_lock); 4645 mutex_exit(&pidlock); 4646 4647 mutex_exit(&zonehash_lock); 4648 /* 4649 * We're firmly in the zone; let pools progress. 4650 */ 4651 pool_unlock(); 4652 task_rele(oldtk); 4653 /* 4654 * We don't need to retain a hold on the zone since we already 4655 * incremented zone_ntasks, so the zone isn't going anywhere. 
4656 */ 4657 zone_rele(zone); 4658 4659 /* 4660 * Chroot 4661 */ 4662 vp = zone->zone_rootvp; 4663 zone_chdir(vp, &PTOU(pp)->u_cdir, pp); 4664 zone_chdir(vp, &PTOU(pp)->u_rdir, pp); 4665 4666 /* 4667 * Change process credentials 4668 */ 4669 newcr = cralloc(); 4670 mutex_enter(&pp->p_crlock); 4671 cr = pp->p_cred; 4672 crcopy_to(cr, newcr); 4673 crsetzone(newcr, zone); 4674 pp->p_cred = newcr; 4675 4676 /* 4677 * Restrict all process privilege sets to zone limit 4678 */ 4679 priv_intersect(zone->zone_privset, &CR_PPRIV(newcr)); 4680 priv_intersect(zone->zone_privset, &CR_EPRIV(newcr)); 4681 priv_intersect(zone->zone_privset, &CR_IPRIV(newcr)); 4682 priv_intersect(zone->zone_privset, &CR_LPRIV(newcr)); 4683 mutex_exit(&pp->p_crlock); 4684 crset(pp, newcr); 4685 4686 /* 4687 * Adjust upcount to reflect zone entry. 4688 */ 4689 uid = crgetruid(newcr); 4690 mutex_enter(&pidlock); 4691 upcount_dec(uid, GLOBAL_ZONEID); 4692 upcount_inc(uid, zoneid); 4693 mutex_exit(&pidlock); 4694 4695 /* 4696 * Set up core file path and content. 4697 */ 4698 set_core_defaults(); 4699 4700 out: 4701 /* 4702 * Let the other lwps continue. 4703 */ 4704 mutex_enter(&pp->p_lock); 4705 if (curthread != pp->p_agenttp) 4706 continuelwps(pp); 4707 mutex_exit(&pp->p_lock); 4708 4709 return (err != 0 ? set_errno(err) : 0); 4710 } 4711 4712 /* 4713 * Systemcall entry point for zone_list(2). 4714 * 4715 * Processes running in a (non-global) zone only see themselves. 4716 * On labeled systems, they see all zones whose label they dominate. 
4717 */ 4718 static int 4719 zone_list(zoneid_t *zoneidlist, uint_t *numzones) 4720 { 4721 zoneid_t *zoneids; 4722 zone_t *zone, *myzone; 4723 uint_t user_nzones, real_nzones; 4724 uint_t domi_nzones; 4725 int error; 4726 4727 if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0) 4728 return (set_errno(EFAULT)); 4729 4730 myzone = curproc->p_zone; 4731 if (myzone != global_zone) { 4732 bslabel_t *mybslab; 4733 4734 if (!is_system_labeled()) { 4735 /* just return current zone */ 4736 real_nzones = domi_nzones = 1; 4737 zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP); 4738 zoneids[0] = myzone->zone_id; 4739 } else { 4740 /* return all zones that are dominated */ 4741 mutex_enter(&zonehash_lock); 4742 real_nzones = zonecount; 4743 domi_nzones = 0; 4744 if (real_nzones > 0) { 4745 zoneids = kmem_alloc(real_nzones * 4746 sizeof (zoneid_t), KM_SLEEP); 4747 mybslab = label2bslabel(myzone->zone_slabel); 4748 for (zone = list_head(&zone_active); 4749 zone != NULL; 4750 zone = list_next(&zone_active, zone)) { 4751 if (zone->zone_id == GLOBAL_ZONEID) 4752 continue; 4753 if (zone != myzone && 4754 (zone->zone_flags & ZF_IS_SCRATCH)) 4755 continue; 4756 /* 4757 * Note that a label always dominates 4758 * itself, so myzone is always included 4759 * in the list. 
4760 */ 4761 if (bldominates(mybslab, 4762 label2bslabel(zone->zone_slabel))) { 4763 zoneids[domi_nzones++] = 4764 zone->zone_id; 4765 } 4766 } 4767 } 4768 mutex_exit(&zonehash_lock); 4769 } 4770 } else { 4771 mutex_enter(&zonehash_lock); 4772 real_nzones = zonecount; 4773 domi_nzones = 0; 4774 if (real_nzones > 0) { 4775 zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t), 4776 KM_SLEEP); 4777 for (zone = list_head(&zone_active); zone != NULL; 4778 zone = list_next(&zone_active, zone)) 4779 zoneids[domi_nzones++] = zone->zone_id; 4780 ASSERT(domi_nzones == real_nzones); 4781 } 4782 mutex_exit(&zonehash_lock); 4783 } 4784 4785 /* 4786 * If user has allocated space for fewer entries than we found, then 4787 * return only up to his limit. Either way, tell him exactly how many 4788 * we found. 4789 */ 4790 if (domi_nzones < user_nzones) 4791 user_nzones = domi_nzones; 4792 error = 0; 4793 if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) { 4794 error = EFAULT; 4795 } else if (zoneidlist != NULL && user_nzones != 0) { 4796 if (copyout(zoneids, zoneidlist, 4797 user_nzones * sizeof (zoneid_t)) != 0) 4798 error = EFAULT; 4799 } 4800 4801 if (real_nzones > 0) 4802 kmem_free(zoneids, real_nzones * sizeof (zoneid_t)); 4803 4804 if (error != 0) 4805 return (set_errno(error)); 4806 else 4807 return (0); 4808 } 4809 4810 /* 4811 * Systemcall entry point for zone_lookup(2). 4812 * 4813 * Non-global zones are only able to see themselves and (on labeled systems) 4814 * the zones they dominate. 
4815 */ 4816 static zoneid_t 4817 zone_lookup(const char *zone_name) 4818 { 4819 char *kname; 4820 zone_t *zone; 4821 zoneid_t zoneid; 4822 int err; 4823 4824 if (zone_name == NULL) { 4825 /* return caller's zone id */ 4826 return (getzoneid()); 4827 } 4828 4829 kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP); 4830 if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) { 4831 kmem_free(kname, ZONENAME_MAX); 4832 return (set_errno(err)); 4833 } 4834 4835 mutex_enter(&zonehash_lock); 4836 zone = zone_find_all_by_name(kname); 4837 kmem_free(kname, ZONENAME_MAX); 4838 /* 4839 * In a non-global zone, can only lookup global and own name. 4840 * In Trusted Extensions zone label dominance rules apply. 4841 */ 4842 if (zone == NULL || 4843 zone_status_get(zone) < ZONE_IS_READY || 4844 !zone_list_access(zone)) { 4845 mutex_exit(&zonehash_lock); 4846 return (set_errno(EINVAL)); 4847 } else { 4848 zoneid = zone->zone_id; 4849 mutex_exit(&zonehash_lock); 4850 return (zoneid); 4851 } 4852 } 4853 4854 static int 4855 zone_version(int *version_arg) 4856 { 4857 int version = ZONE_SYSCALL_API_VERSION; 4858 4859 if (copyout(&version, version_arg, sizeof (int)) != 0) 4860 return (set_errno(EFAULT)); 4861 return (0); 4862 } 4863 4864 /* ARGSUSED */ 4865 long 4866 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) 4867 { 4868 zone_def zs; 4869 4870 switch (cmd) { 4871 case ZONE_CREATE: 4872 if (get_udatamodel() == DATAMODEL_NATIVE) { 4873 if (copyin(arg1, &zs, sizeof (zone_def))) { 4874 return (set_errno(EFAULT)); 4875 } 4876 } else { 4877 #ifdef _SYSCALL32_IMPL 4878 zone_def32 zs32; 4879 4880 if (copyin(arg1, &zs32, sizeof (zone_def32))) { 4881 return (set_errno(EFAULT)); 4882 } 4883 zs.zone_name = 4884 (const char *)(unsigned long)zs32.zone_name; 4885 zs.zone_root = 4886 (const char *)(unsigned long)zs32.zone_root; 4887 zs.zone_privs = 4888 (const struct priv_set *) 4889 (unsigned long)zs32.zone_privs; 4890 zs.zone_privssz = zs32.zone_privssz; 4891 zs.rctlbuf = 
(caddr_t)(unsigned long)zs32.rctlbuf; 4892 zs.rctlbufsz = zs32.rctlbufsz; 4893 zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf; 4894 zs.zfsbufsz = zs32.zfsbufsz; 4895 zs.extended_error = 4896 (int *)(unsigned long)zs32.extended_error; 4897 zs.match = zs32.match; 4898 zs.doi = zs32.doi; 4899 zs.label = (const bslabel_t *)(uintptr_t)zs32.label; 4900 zs.flags = zs32.flags; 4901 #else 4902 panic("get_udatamodel() returned bogus result\n"); 4903 #endif 4904 } 4905 4906 return (zone_create(zs.zone_name, zs.zone_root, 4907 zs.zone_privs, zs.zone_privssz, 4908 (caddr_t)zs.rctlbuf, zs.rctlbufsz, 4909 (caddr_t)zs.zfsbuf, zs.zfsbufsz, 4910 zs.extended_error, zs.match, zs.doi, 4911 zs.label, zs.flags)); 4912 case ZONE_BOOT: 4913 return (zone_boot((zoneid_t)(uintptr_t)arg1)); 4914 case ZONE_DESTROY: 4915 return (zone_destroy((zoneid_t)(uintptr_t)arg1)); 4916 case ZONE_GETATTR: 4917 return (zone_getattr((zoneid_t)(uintptr_t)arg1, 4918 (int)(uintptr_t)arg2, arg3, (size_t)arg4)); 4919 case ZONE_SETATTR: 4920 return (zone_setattr((zoneid_t)(uintptr_t)arg1, 4921 (int)(uintptr_t)arg2, arg3, (size_t)arg4)); 4922 case ZONE_ENTER: 4923 return (zone_enter((zoneid_t)(uintptr_t)arg1)); 4924 case ZONE_LIST: 4925 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2)); 4926 case ZONE_SHUTDOWN: 4927 return (zone_shutdown((zoneid_t)(uintptr_t)arg1)); 4928 case ZONE_LOOKUP: 4929 return (zone_lookup((const char *)arg1)); 4930 case ZONE_VERSION: 4931 return (zone_version((int *)arg1)); 4932 case ZONE_ADD_DATALINK: 4933 return (zone_add_datalink((zoneid_t)(uintptr_t)arg1, 4934 (char *)arg2)); 4935 case ZONE_DEL_DATALINK: 4936 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1, 4937 (char *)arg2)); 4938 case ZONE_CHECK_DATALINK: 4939 return (zone_check_datalink((zoneid_t *)arg1, (char *)arg2)); 4940 case ZONE_LIST_DATALINK: 4941 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1, 4942 (int *)arg2, (char *)arg3)); 4943 default: 4944 return (set_errno(EINVAL)); 4945 } 4946 } 4947 4948 struct zarg 
{ 4949 zone_t *zone; 4950 zone_cmd_arg_t arg; 4951 }; 4952 4953 static int 4954 zone_lookup_door(const char *zone_name, door_handle_t *doorp) 4955 { 4956 char *buf; 4957 size_t buflen; 4958 int error; 4959 4960 buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name); 4961 buf = kmem_alloc(buflen, KM_SLEEP); 4962 (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name); 4963 error = door_ki_open(buf, doorp); 4964 kmem_free(buf, buflen); 4965 return (error); 4966 } 4967 4968 static void 4969 zone_release_door(door_handle_t *doorp) 4970 { 4971 door_ki_rele(*doorp); 4972 *doorp = NULL; 4973 } 4974 4975 static void 4976 zone_ki_call_zoneadmd(struct zarg *zargp) 4977 { 4978 door_handle_t door = NULL; 4979 door_arg_t darg, save_arg; 4980 char *zone_name; 4981 size_t zone_namelen; 4982 zoneid_t zoneid; 4983 zone_t *zone; 4984 zone_cmd_arg_t arg; 4985 uint64_t uniqid; 4986 size_t size; 4987 int error; 4988 int retry; 4989 4990 zone = zargp->zone; 4991 arg = zargp->arg; 4992 kmem_free(zargp, sizeof (*zargp)); 4993 4994 zone_namelen = strlen(zone->zone_name) + 1; 4995 zone_name = kmem_alloc(zone_namelen, KM_SLEEP); 4996 bcopy(zone->zone_name, zone_name, zone_namelen); 4997 zoneid = zone->zone_id; 4998 uniqid = zone->zone_uniqid; 4999 /* 5000 * zoneadmd may be down, but at least we can empty out the zone. 5001 * We can ignore the return value of zone_empty() since we're called 5002 * from a kernel thread and know we won't be delivered any signals. 
5003 */ 5004 ASSERT(curproc == &p0); 5005 (void) zone_empty(zone); 5006 ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY); 5007 zone_rele(zone); 5008 5009 size = sizeof (arg); 5010 darg.rbuf = (char *)&arg; 5011 darg.data_ptr = (char *)&arg; 5012 darg.rsize = size; 5013 darg.data_size = size; 5014 darg.desc_ptr = NULL; 5015 darg.desc_num = 0; 5016 5017 save_arg = darg; 5018 /* 5019 * Since we're not holding a reference to the zone, any number of 5020 * things can go wrong, including the zone disappearing before we get a 5021 * chance to talk to zoneadmd. 5022 */ 5023 for (retry = 0; /* forever */; retry++) { 5024 if (door == NULL && 5025 (error = zone_lookup_door(zone_name, &door)) != 0) { 5026 goto next; 5027 } 5028 ASSERT(door != NULL); 5029 5030 if ((error = door_ki_upcall(door, &darg)) == 0) { 5031 break; 5032 } 5033 switch (error) { 5034 case EINTR: 5035 /* FALLTHROUGH */ 5036 case EAGAIN: /* process may be forking */ 5037 /* 5038 * Back off for a bit 5039 */ 5040 break; 5041 case EBADF: 5042 zone_release_door(&door); 5043 if (zone_lookup_door(zone_name, &door) != 0) { 5044 /* 5045 * zoneadmd may be dead, but it may come back to 5046 * life later. 5047 */ 5048 break; 5049 } 5050 break; 5051 default: 5052 cmn_err(CE_WARN, 5053 "zone_ki_call_zoneadmd: door_ki_upcall error %d\n", 5054 error); 5055 goto out; 5056 } 5057 next: 5058 /* 5059 * If this isn't the same zone_t that we originally had in mind, 5060 * then this is the same as if two kadmin requests come in at 5061 * the same time: the first one wins. This means we lose, so we 5062 * bail. 5063 */ 5064 if ((zone = zone_find_by_id(zoneid)) == NULL) { 5065 /* 5066 * Problem is solved. 
5067 */ 5068 break; 5069 } 5070 if (zone->zone_uniqid != uniqid) { 5071 /* 5072 * zoneid recycled 5073 */ 5074 zone_rele(zone); 5075 break; 5076 } 5077 /* 5078 * We could zone_status_timedwait(), but there doesn't seem to 5079 * be much point in doing that (plus, it would mean that 5080 * zone_free() isn't called until this thread exits). 5081 */ 5082 zone_rele(zone); 5083 delay(hz); 5084 darg = save_arg; 5085 } 5086 out: 5087 if (door != NULL) { 5088 zone_release_door(&door); 5089 } 5090 kmem_free(zone_name, zone_namelen); 5091 thread_exit(); 5092 } 5093 5094 /* 5095 * Entry point for uadmin() to tell the zone to go away or reboot. Analog to 5096 * kadmin(). The caller is a process in the zone. 5097 * 5098 * In order to shutdown the zone, we will hand off control to zoneadmd 5099 * (running in the global zone) via a door. We do a half-hearted job at 5100 * killing all processes in the zone, create a kernel thread to contact 5101 * zoneadmd, and make note of the "uniqid" of the zone. The uniqid is 5102 * a form of generation number used to let zoneadmd (as well as 5103 * zone_destroy()) know exactly which zone they're re talking about. 
5104 */ 5105 int 5106 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) 5107 { 5108 struct zarg *zargp; 5109 zone_cmd_t zcmd; 5110 zone_t *zone; 5111 5112 zone = curproc->p_zone; 5113 ASSERT(getzoneid() != GLOBAL_ZONEID); 5114 5115 switch (cmd) { 5116 case A_SHUTDOWN: 5117 switch (fcn) { 5118 case AD_HALT: 5119 case AD_POWEROFF: 5120 zcmd = Z_HALT; 5121 break; 5122 case AD_BOOT: 5123 zcmd = Z_REBOOT; 5124 break; 5125 case AD_IBOOT: 5126 case AD_SBOOT: 5127 case AD_SIBOOT: 5128 case AD_NOSYNC: 5129 return (ENOTSUP); 5130 default: 5131 return (EINVAL); 5132 } 5133 break; 5134 case A_REBOOT: 5135 zcmd = Z_REBOOT; 5136 break; 5137 case A_FTRACE: 5138 case A_REMOUNT: 5139 case A_FREEZE: 5140 case A_DUMP: 5141 return (ENOTSUP); 5142 default: 5143 ASSERT(cmd != A_SWAPCTL); /* handled by uadmin() */ 5144 return (EINVAL); 5145 } 5146 5147 if (secpolicy_zone_admin(credp, B_FALSE)) 5148 return (EPERM); 5149 mutex_enter(&zone_status_lock); 5150 5151 /* 5152 * zone_status can't be ZONE_IS_EMPTY or higher since curproc 5153 * is in the zone. 5154 */ 5155 ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY); 5156 if (zone_status_get(zone) > ZONE_IS_RUNNING) { 5157 /* 5158 * This zone is already on its way down. 5159 */ 5160 mutex_exit(&zone_status_lock); 5161 return (0); 5162 } 5163 /* 5164 * Prevent future zone_enter()s 5165 */ 5166 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); 5167 mutex_exit(&zone_status_lock); 5168 5169 /* 5170 * Kill everyone now and call zoneadmd later. 5171 * zone_ki_call_zoneadmd() will do a more thorough job of this 5172 * later. 5173 */ 5174 killall(zone->zone_id); 5175 /* 5176 * Now, create the thread to contact zoneadmd and do the rest of the 5177 * work. This thread can't be created in our zone otherwise 5178 * zone_destroy() would deadlock. 
5179 */ 5180 zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP); 5181 zargp->arg.cmd = zcmd; 5182 zargp->arg.uniqid = zone->zone_uniqid; 5183 zargp->zone = zone; 5184 (void) strcpy(zargp->arg.locale, "C"); 5185 /* mdep was already copied in for us by uadmin */ 5186 if (mdep != NULL) 5187 (void) strlcpy(zargp->arg.bootbuf, mdep, 5188 sizeof (zargp->arg.bootbuf)); 5189 zone_hold(zone); 5190 5191 (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0, 5192 TS_RUN, minclsyspri); 5193 exit(CLD_EXITED, 0); 5194 5195 return (EINVAL); 5196 } 5197 5198 /* 5199 * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's 5200 * status to ZONE_IS_SHUTTING_DOWN. 5201 */ 5202 void 5203 zone_shutdown_global(void) 5204 { 5205 ASSERT(curproc->p_zone == global_zone); 5206 5207 mutex_enter(&zone_status_lock); 5208 ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING); 5209 zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN); 5210 mutex_exit(&zone_status_lock); 5211 } 5212 5213 /* 5214 * Returns true if the named dataset is visible in the current zone. 5215 * The 'write' parameter is set to 1 if the dataset is also writable. 5216 */ 5217 int 5218 zone_dataset_visible(const char *dataset, int *write) 5219 { 5220 zone_dataset_t *zd; 5221 size_t len; 5222 zone_t *zone = curproc->p_zone; 5223 5224 if (dataset[0] == '\0') 5225 return (0); 5226 5227 /* 5228 * Walk the list once, looking for datasets which match exactly, or 5229 * specify a dataset underneath an exported dataset. If found, return 5230 * true and note that it is writable. 
5231 */ 5232 for (zd = list_head(&zone->zone_datasets); zd != NULL; 5233 zd = list_next(&zone->zone_datasets, zd)) { 5234 5235 len = strlen(zd->zd_dataset); 5236 if (strlen(dataset) >= len && 5237 bcmp(dataset, zd->zd_dataset, len) == 0 && 5238 (dataset[len] == '\0' || dataset[len] == '/' || 5239 dataset[len] == '@')) { 5240 if (write) 5241 *write = 1; 5242 return (1); 5243 } 5244 } 5245 5246 /* 5247 * Walk the list a second time, searching for datasets which are parents 5248 * of exported datasets. These should be visible, but read-only. 5249 * 5250 * Note that we also have to support forms such as 'pool/dataset/', with 5251 * a trailing slash. 5252 */ 5253 for (zd = list_head(&zone->zone_datasets); zd != NULL; 5254 zd = list_next(&zone->zone_datasets, zd)) { 5255 5256 len = strlen(dataset); 5257 if (dataset[len - 1] == '/') 5258 len--; /* Ignore trailing slash */ 5259 if (len < strlen(zd->zd_dataset) && 5260 bcmp(dataset, zd->zd_dataset, len) == 0 && 5261 zd->zd_dataset[len] == '/') { 5262 if (write) 5263 *write = 0; 5264 return (1); 5265 } 5266 } 5267 5268 return (0); 5269 } 5270 5271 /* 5272 * zone_find_by_any_path() - 5273 * 5274 * kernel-private routine similar to zone_find_by_path(), but which 5275 * effectively compares against zone paths rather than zonerootpath 5276 * (i.e., the last component of zonerootpaths, which should be "root/", 5277 * are not compared.) This is done in order to accurately identify all 5278 * paths, whether zone-visible or not, including those which are parallel 5279 * to /root/, such as /dev/, /home/, etc... 5280 * 5281 * If the specified path does not fall under any zone path then global 5282 * zone is returned. 5283 * 5284 * The treat_abs parameter indicates whether the path should be treated as 5285 * an absolute path although it does not begin with "/". (This supports 5286 * nfs mount syntax such as host:any/path.) 5287 * 5288 * The caller is responsible for zone_rele of the returned zone. 
5289 */ 5290 zone_t * 5291 zone_find_by_any_path(const char *path, boolean_t treat_abs) 5292 { 5293 zone_t *zone; 5294 int path_offset = 0; 5295 5296 if (path == NULL) { 5297 zone_hold(global_zone); 5298 return (global_zone); 5299 } 5300 5301 if (*path != '/') { 5302 ASSERT(treat_abs); 5303 path_offset = 1; 5304 } 5305 5306 mutex_enter(&zonehash_lock); 5307 for (zone = list_head(&zone_active); zone != NULL; 5308 zone = list_next(&zone_active, zone)) { 5309 char *c; 5310 size_t pathlen; 5311 char *rootpath_start; 5312 5313 if (zone == global_zone) /* skip global zone */ 5314 continue; 5315 5316 /* scan backwards to find start of last component */ 5317 c = zone->zone_rootpath + zone->zone_rootpathlen - 2; 5318 do { 5319 c--; 5320 } while (*c != '/'); 5321 5322 pathlen = c - zone->zone_rootpath + 1 - path_offset; 5323 rootpath_start = (zone->zone_rootpath + path_offset); 5324 if (strncmp(path, rootpath_start, pathlen) == 0) 5325 break; 5326 } 5327 if (zone == NULL) 5328 zone = global_zone; 5329 zone_hold(zone); 5330 mutex_exit(&zonehash_lock); 5331 return (zone); 5332 } 5333 5334 /* List of data link names which are accessible from the zone */ 5335 struct dlnamelist { 5336 char dlnl_name[LIFNAMSIZ]; 5337 struct dlnamelist *dlnl_next; 5338 }; 5339 5340 5341 /* 5342 * Check whether the datalink name (dlname) itself is present. 5343 * Return true if found. 5344 */ 5345 static boolean_t 5346 zone_dlname(zone_t *zone, char *dlname) 5347 { 5348 struct dlnamelist *dlnl; 5349 boolean_t found = B_FALSE; 5350 5351 mutex_enter(&zone->zone_lock); 5352 for (dlnl = zone->zone_dl_list; dlnl != NULL; dlnl = dlnl->dlnl_next) { 5353 if (strncmp(dlnl->dlnl_name, dlname, LIFNAMSIZ) == 0) { 5354 found = B_TRUE; 5355 break; 5356 } 5357 } 5358 mutex_exit(&zone->zone_lock); 5359 return (found); 5360 } 5361 5362 /* 5363 * Add an data link name for the zone. Does not check for duplicates. 
5364 */ 5365 static int 5366 zone_add_datalink(zoneid_t zoneid, char *dlname) 5367 { 5368 struct dlnamelist *dlnl; 5369 zone_t *zone; 5370 zone_t *thiszone; 5371 int err; 5372 5373 dlnl = kmem_zalloc(sizeof (struct dlnamelist), KM_SLEEP); 5374 if ((err = copyinstr(dlname, dlnl->dlnl_name, LIFNAMSIZ, NULL)) != 0) { 5375 kmem_free(dlnl, sizeof (struct dlnamelist)); 5376 return (set_errno(err)); 5377 } 5378 5379 thiszone = zone_find_by_id(zoneid); 5380 if (thiszone == NULL) { 5381 kmem_free(dlnl, sizeof (struct dlnamelist)); 5382 return (set_errno(ENXIO)); 5383 } 5384 5385 /* 5386 * Verify that the datalink name isn't already used by a different 5387 * zone while allowing duplicate entries for the same zone (e.g. due 5388 * to both using IPv4 and IPv6 on an interface) 5389 */ 5390 mutex_enter(&zonehash_lock); 5391 for (zone = list_head(&zone_active); zone != NULL; 5392 zone = list_next(&zone_active, zone)) { 5393 if (zone->zone_id == zoneid) 5394 continue; 5395 5396 if (zone_dlname(zone, dlnl->dlnl_name)) { 5397 mutex_exit(&zonehash_lock); 5398 zone_rele(thiszone); 5399 kmem_free(dlnl, sizeof (struct dlnamelist)); 5400 return (set_errno(EPERM)); 5401 } 5402 } 5403 mutex_enter(&thiszone->zone_lock); 5404 dlnl->dlnl_next = thiszone->zone_dl_list; 5405 thiszone->zone_dl_list = dlnl; 5406 mutex_exit(&thiszone->zone_lock); 5407 mutex_exit(&zonehash_lock); 5408 zone_rele(thiszone); 5409 return (0); 5410 } 5411 5412 static int 5413 zone_remove_datalink(zoneid_t zoneid, char *dlname) 5414 { 5415 struct dlnamelist *dlnl, *odlnl, **dlnlp; 5416 zone_t *zone; 5417 int err; 5418 5419 dlnl = kmem_zalloc(sizeof (struct dlnamelist), KM_SLEEP); 5420 if ((err = copyinstr(dlname, dlnl->dlnl_name, LIFNAMSIZ, NULL)) != 0) { 5421 kmem_free(dlnl, sizeof (struct dlnamelist)); 5422 return (set_errno(err)); 5423 } 5424 zone = zone_find_by_id(zoneid); 5425 if (zone == NULL) { 5426 kmem_free(dlnl, sizeof (struct dlnamelist)); 5427 return (set_errno(EINVAL)); 5428 } 5429 5430 
mutex_enter(&zone->zone_lock); 5431 /* Look for match */ 5432 dlnlp = &zone->zone_dl_list; 5433 while (*dlnlp != NULL) { 5434 if (strncmp(dlnl->dlnl_name, (*dlnlp)->dlnl_name, 5435 LIFNAMSIZ) == 0) 5436 goto found; 5437 dlnlp = &((*dlnlp)->dlnl_next); 5438 } 5439 mutex_exit(&zone->zone_lock); 5440 zone_rele(zone); 5441 kmem_free(dlnl, sizeof (struct dlnamelist)); 5442 return (set_errno(ENXIO)); 5443 5444 found: 5445 odlnl = *dlnlp; 5446 *dlnlp = (*dlnlp)->dlnl_next; 5447 kmem_free(odlnl, sizeof (struct dlnamelist)); 5448 5449 mutex_exit(&zone->zone_lock); 5450 zone_rele(zone); 5451 kmem_free(dlnl, sizeof (struct dlnamelist)); 5452 return (0); 5453 } 5454 5455 /* 5456 * Using the zoneidp as ALL_ZONES, we can lookup which zone is using datalink 5457 * name (dlname); otherwise we just check if the specified zoneidp has access 5458 * to the datalink name. 5459 */ 5460 static int 5461 zone_check_datalink(zoneid_t *zoneidp, char *dlname) 5462 { 5463 zoneid_t id; 5464 char *dln; 5465 zone_t *zone; 5466 int err = 0; 5467 boolean_t allzones = B_FALSE; 5468 5469 if (copyin(zoneidp, &id, sizeof (id)) != 0) { 5470 return (set_errno(EFAULT)); 5471 } 5472 dln = kmem_zalloc(LIFNAMSIZ, KM_SLEEP); 5473 if ((err = copyinstr(dlname, dln, LIFNAMSIZ, NULL)) != 0) { 5474 kmem_free(dln, LIFNAMSIZ); 5475 return (set_errno(err)); 5476 } 5477 5478 if (id == ALL_ZONES) 5479 allzones = B_TRUE; 5480 5481 /* 5482 * Check whether datalink name is already used. 5483 */ 5484 mutex_enter(&zonehash_lock); 5485 for (zone = list_head(&zone_active); zone != NULL; 5486 zone = list_next(&zone_active, zone)) { 5487 if (allzones || (id == zone->zone_id)) { 5488 if (!zone_dlname(zone, dln)) 5489 continue; 5490 if (allzones) 5491 err = copyout(&zone->zone_id, zoneidp, 5492 sizeof (*zoneidp)); 5493 5494 mutex_exit(&zonehash_lock); 5495 kmem_free(dln, LIFNAMSIZ); 5496 return (err ? set_errno(EFAULT) : 0); 5497 } 5498 } 5499 5500 /* datalink name is not found in any active zone. 
*/ 5501 mutex_exit(&zonehash_lock); 5502 kmem_free(dln, LIFNAMSIZ); 5503 return (set_errno(ENXIO)); 5504 } 5505 5506 /* 5507 * Get the names of the datalinks assigned to a zone. 5508 * Here *nump is the number of datalinks, and the assumption 5509 * is that the caller will gurantee that the the supplied buffer is 5510 * big enough to hold at least #*nump datalink names, that is, 5511 * LIFNAMSIZ X *nump 5512 * On return, *nump will be the "new" number of datalinks, if it 5513 * ever changed. 5514 */ 5515 static int 5516 zone_list_datalink(zoneid_t zoneid, int *nump, char *buf) 5517 { 5518 int num, dlcount; 5519 zone_t *zone; 5520 struct dlnamelist *dlnl; 5521 char *ptr; 5522 5523 if (copyin(nump, &dlcount, sizeof (dlcount)) != 0) 5524 return (set_errno(EFAULT)); 5525 5526 zone = zone_find_by_id(zoneid); 5527 if (zone == NULL) { 5528 return (set_errno(ENXIO)); 5529 } 5530 5531 num = 0; 5532 mutex_enter(&zone->zone_lock); 5533 ptr = buf; 5534 for (dlnl = zone->zone_dl_list; dlnl != NULL; dlnl = dlnl->dlnl_next) { 5535 /* 5536 * If the list changed and the new number is bigger 5537 * than what the caller supplied, just count, don't 5538 * do copyout 5539 */ 5540 if (++num > dlcount) 5541 continue; 5542 if (copyout(dlnl->dlnl_name, ptr, LIFNAMSIZ) != 0) { 5543 mutex_exit(&zone->zone_lock); 5544 zone_rele(zone); 5545 return (set_errno(EFAULT)); 5546 } 5547 ptr += LIFNAMSIZ; 5548 } 5549 mutex_exit(&zone->zone_lock); 5550 zone_rele(zone); 5551 5552 /* Increased or decreased, caller should be notified. */ 5553 if (num != dlcount) { 5554 if (copyout(&num, nump, sizeof (num)) != 0) { 5555 return (set_errno(EFAULT)); 5556 } 5557 } 5558 return (0); 5559 } 5560 5561 /* 5562 * Public interface for looking up a zone by zoneid. It's a customized version 5563 * for netstack_zone_create(), it: 5564 * 1. Doesn't acquire the zonehash_lock, since it is called from 5565 * zone_key_create() or zone_zsd_configure(), lock already held. 5566 * 2. Doesn't check the status of the zone. 
5567 * 3. It will be called even before zone_init is called, in that case the 5568 * address of zone0 is returned directly, and netstack_zone_create() 5569 * will only assign a value to zone0.zone_netstack, won't break anything. 5570 */ 5571 zone_t * 5572 zone_find_by_id_nolock(zoneid_t zoneid) 5573 { 5574 ASSERT(MUTEX_HELD(&zonehash_lock)); 5575 5576 if (zonehashbyid == NULL) 5577 return (&zone0); 5578 else 5579 return (zone_find_all_by_id(zoneid)); 5580 } 5581