/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Zones
 *
 *   A zone is a named collection of processes, namespace constraints,
 *   and other system resources which comprise a secure and manageable
 *   application containment facility.
 *
 *   Zones (represented by the reference-counted zone_t) are tracked in
 *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 *   (zoneid_t) are used to track zone association.  Zone IDs are
 *   dynamically generated when the zone is created; if a persistent
 *   identifier is needed (core files, accounting logs, audit trail,
 *   etc.), the zone name should be used.
 *
 *
 *   Global Zone:
 *
 *   The global zone (zoneid 0) is automatically associated with all
 *   system resources that have not been bound to a user-created zone.
 *   This means that even systems where zones are not in active use
 *   have a global zone, and all processes, mounts, etc. are
 *   associated with that zone.  The global zone is generally
 *   unconstrained in terms of privileges and access, though the usual
 *   credential and privilege based restrictions apply.
 *
 *
 *   Zone States:
 *
 *   The states a zone may be in, and the transitions between them, are
 *   as follows:
 *
 *   ZONE_IS_UNINITIALIZED: primordial state for a zone.  The partially
 *   initialized zone is added to the list of active zones on the system but
 *   isn't accessible.
 *
 *   ZONE_IS_INITIALIZED: Initialization is complete except that the ZSD
 *   callbacks have not yet run.  It is not possible to enter the zone, but
 *   attributes can be retrieved.
 *
 *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 *   ready.  The zone is made visible after the ZSD constructor callbacks are
 *   executed.  A zone remains in this state until it transitions into
 *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 *   state.
 *
 *   ZONE_IS_RUNNING: The zone is open for business: zsched has
 *   successfully started init.  A zone remains in this state until
 *   zone_shutdown() is called.
 *
 *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, and the system
 *   is killing all processes running in the zone.  The zone remains
 *   in this state until there are no more user processes running in the zone.
 *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 *   Since zone_shutdown() is restartable, it may be called successfully
 *   multiple times for the same zone_t.  Setting of the zone's state to
 *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may
 *   check the zone's status without worrying about it being a moving target.
 *
 *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 *   are no more user processes in the zone.  The zone remains in this
 *   state until there are no more kernel threads associated with the
 *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 *   fail.
 *
 *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 *   join the zone or create kernel threads therein.
 *
 *   ZONE_IS_DYING: zone_destroy() has been called on the zone; the zone
 *   remains in this state until zsched exits.  Calls to zone_find_by_*()
 *   return NULL from now on.
 *
 *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 *   processes or threads doing work on behalf of the zone.  The zone is
 *   removed from the list of active zones.  zone_destroy() returns, and
 *   the zone can be recreated.
 *
 *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 *   callbacks are executed, and all memory associated with the zone is
 *   freed.
 *
 *   Threads can wait for the zone to enter a requested state by using
 *   zone_status_wait() or zone_status_timedwait() with the desired
 *   state passed in as an argument.  Zone state transitions are
 *   uni-directional; it is not possible to move back to an earlier state.
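 *
 *   Illustrative sketch only (the surrounding code is hypothetical, but
 *   zone_status_wait() and zone_rele() are the interfaces described in
 *   this comment): a thread that holds a zone reference, e.g. obtained
 *   from zone_find_by_id(), can block until the zone has reached at
 *   least a given state:
 *
 *	zone_status_wait(zone, ZONE_IS_RUNNING);
 *	... the zone is now in ZONE_IS_RUNNING or a later state ...
 *	zone_rele(zone);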
 *
 *
 *   Zone-Specific Data:
 *
 *   Subsystems needing to maintain zone-specific data can store that
 *   data using the ZSD mechanism.  This provides a zone-specific data
 *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 *   to register callbacks to be invoked when a zone is created, shut
 *   down, or destroyed.  This can be used to initialize zone-specific
 *   data for new zones and to clean up when zones go away.
 *
 *
 *   Data Structures:
 *
 *   The per-zone structure (zone_t) is reference counted, and freed
 *   when all references are released.  zone_hold and zone_rele can be
 *   used to adjust the reference count.  In addition, reference counts
 *   associated with the cred_t structure are tracked separately using
 *   zone_cred_hold and zone_cred_rele.
 *
 *   Pointers to active zone_t's are stored in two hash tables; one
 *   for searching by id, the other for searching by name.  Lookups
 *   can be performed on either basis, using zone_find_by_id and
 *   zone_find_by_name.  Both return zone_t pointers with the zone
 *   held, so zone_rele should be called when the pointer is no longer
 *   needed; the sketch below illustrates the pattern.  Zones can also
 *   be searched by path; zone_find_by_path returns the zone with which
 *   a path name is associated (the global zone if the path is not
 *   within some other zone's file system hierarchy).  This currently
 *   requires iterating through each zone, so it is slower than an id
 *   or name search via a hash table.
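 *
 *   Illustrative sketch only (error handling and the surrounding code
 *   are hypothetical):
 *
 *	zone_t *zone;
 *
 *	if ((zone = zone_find_by_id(zoneid)) == NULL)
 *		return (EINVAL);
 *	... use zone; the hold prevents it from going away ...
 *	zone_rele(zone);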
 *
 *
 *   Locking:
 *
 *   zonehash_lock: This is a top-level global lock used to protect the
 *       zone hash tables and lists.  Zones cannot be created or destroyed
 *       while this lock is held.
 *   zone_status_lock: This is a global lock protecting zone state.
 *       Zones cannot change state while this lock is held.  It also
 *       protects the list of kernel threads associated with a zone.
 *   zone_lock: This is a per-zone lock used to protect several fields of
 *       the zone_t (see <sys/zone.h> for details).  In addition, holding
 *       this lock means that the zone cannot go away.
 *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-lwps rctl.
 *   zone_mem_lock: This is a per-zone lock used to protect the fields
 *       related to the zone.max-locked-memory and zone.max-swap rctls.
 *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 *       currently just max_lofi.
 *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
 *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *       list (a list of zones in the ZONE_IS_DEAD state).
 *
 *   Ordering requirements:
 *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *       zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 *       zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 *       zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 *
 *   Blocking memory allocations are permitted while holding any of the
 *   zone locks.
 *
 *
 *   System Call Interface:
 *
 *   The zone subsystem can be managed and queried from user level with
 *   the following system calls (all subcodes of the primary "zone"
 *   system call):
 *   - zone_create: creates a zone with selected attributes (name,
 *     root path, privileges, resource controls, ZFS datasets)
 *   - zone_enter: allows the current process to enter a zone
 *   - zone_getattr: reports attributes of a zone
 *   - zone_setattr: sets attributes of a zone
 *   - zone_boot: sets 'init' running for the zone
 *   - zone_list: lists all zones active in the system
 *   - zone_lookup: looks up a zone id based on its name
 *   - zone_shutdown: initiates the shutdown process (see states above)
 *   - zone_destroy: completes the shutdown process (see states above)
 */
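
/*
 * Illustrative sketch only: userland normally reaches the lookup
 * subcodes above through the libc wrappers declared in <zone.h>.  A
 * hypothetical consumer mapping a zone name ("myzone" is invented) to
 * its non-persistent zone ID might look like:
 *
 *	#include <zone.h>
 *
 *	zoneid_t zid = getzoneidbyname("myzone");
 *	if (zid == -1)
 *		... no such zone, or it is not running ...
 */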

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/mutex.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/sysevent.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>
#include <sys/ipc_impl.h>
#include <sys/klpd.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/sdt.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/sunddi.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/brand.h>
#include <sys/zone.h>
#include <net/if.h>
#include <sys/cpucaps.h>
#include <vm/seg.h>
#include <sys/mac.h>

/* List of data link IDs which are accessible from the zone */
typedef struct zone_dl {
	datalink_id_t	zdl_id;
	nvlist_t	*zdl_net;
	list_node_t	zdl_linkage;
} zone_dl_t;

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;	/* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;	/* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/* Event channel to send zone state change notifications */
evchan_t *zone_event_chan;

/*
 * This table holds the mapping from kernel zone states to
 * states visible in the state notification API.
 * The idea is that we only expose "obvious" states and
 * do not expose states which are just implementation details.
 */
const char  *zone_status_table[] = {
	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
	ZONE_EVENT_INITIALIZED,		/* initialized */
	ZONE_EVENT_READY,		/* ready */
	ZONE_EVENT_READY,		/* booting */
	ZONE_EVENT_RUNNING,		/* running */
	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
	ZONE_EVENT_SHUTTING_DOWN,	/* empty */
	ZONE_EVENT_SHUTTING_DOWN,	/* down */
	ZONE_EVENT_SHUTTING_DOWN,	/* dying */
	ZONE_EVENT_UNINITIALIZED,	/* dead */
};
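
/*
 * Illustrative sketch only: the table above is indexed by the kernel
 * zone_status_t, so a notification producer can translate a zone's
 * current state to its externally visible name with something like:
 *
 *	const char *evstate = zone_status_table[zone_status_get(zone)];
 */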

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;
rctl_hndl_t rc_zone_msgmni;

/*
 * Synchronization primitives used to synchronize between mounts and zone
 * creation/destruction.
 */
static int mounts_in_progress;
static kcondvar_t mount_cv;
static kmutex_t mount_lock;

const char * const zone_default_initname = "/sbin/init";
static char * const zone_prefix = "/zone/";
static int zone_shutdown(zoneid_t zoneid);
static int zone_add_datalink(zoneid_t, datalink_id_t);
static int zone_remove_datalink(zoneid_t, datalink_id_t);
static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
static int zone_set_network(zoneid_t, zone_net_data_t *);
static int zone_get_network(zoneid_t, zone_net_data_t *);

typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);

static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
    zone_key_t);
static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *,
    zone_key_t);
static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
    kmutex_t *);
static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
    kmutex_t *);

/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure; and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 * Version 6 adds the flag argument to zone_create.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created such that
 * nfs_mount() thinks it is in the global zone, while by the time it
 * gets added to the zone's list of mounts, it ends up on zoneA's mount
 * list.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone creation and new mounts.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone creations, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is an unfair bias towards the
 * "current" operation.  This means that zone creations may starve if
 * there is a rapid succession of new mounts coming in to the system, or
 * there is a remote possibility that zones will be created at such a
 * rate that new mounts will not be able to proceed.
 */
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(void)
{
	int retval = 0;

	/*
	 * Since it may block for a long time, block_mounts() shouldn't be
	 * called with zonehash_lock held.
	 */
	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
	mutex_enter(&mount_lock);
	while (mounts_in_progress > 0) {
		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
			goto signaled;
	}
	/*
	 * A negative value of mounts_in_progress indicates that mounts
	 * have been blocked by (-mounts_in_progress) different callers.
	 */
	mounts_in_progress--;
	retval = 1;
signaled:
	mutex_exit(&mount_lock);
	return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(void)
{
	mutex_enter(&mount_lock);
	if (++mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * The VFS layer is busy with a mount; zones should wait until all
 * mounts are completed to progress.
 */
void
mount_in_progress(void)
{
	mutex_enter(&mount_lock);
	while (mounts_in_progress < 0)
		cv_wait(&mount_cv, &mount_lock);
	mounts_in_progress++;
	mutex_exit(&mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(void)
{
	mutex_enter(&mount_lock);
	if (--mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}
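
/*
 * Illustrative sketch only (the actual caller lives in the VFS layer):
 * a mount path would bracket the mount proper with the pair above, so
 * that a zone creation cannot observe a half-completed mount:
 *
 *	mount_in_progress();
 *	error = VFS_MOUNT(vfsp, mvp, uap, credp);
 *	mount_completed();
 */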

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shutdown, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 * holding that lock all the existing zones are marked as
 * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 * zone_zsd list (protected by zone_lock).  The global list is updated first
 * (under zsd_key_lock) to make sure that newly created zones use the
 * most recent list of keys.  Then under zonehash_lock we walk the zones
 * and mark them.  Similar locking is used in zone_key_delete().
 *
 * The actual create, shutdown, and destroy callbacks are done without
 * holding any lock.  And zsd_flags are used to ensure that the operations
 * completed so that when zone_key_create (and zone_create) is done, as well as
 * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 * are completed.
 *
 * When new zones are created constructor callbacks for all registered ZSD
 * entries will be called.  That also uses the above two phases of marking
 * what needs to be done, and then running the callbacks without holding
 * any locks.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */
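
/*
 * Illustrative sketch only: a hypothetical subsystem "foo" using ZSD.
 * The names foo_key, foo_zone, foo_zone_init and foo_zone_fini are
 * invented for this example; only the zone_key_create() and
 * zone_getspecific() calls are the real interfaces described above.
 *
 *	static zone_key_t foo_key;
 *
 *	static void *
 *	foo_zone_init(zoneid_t zoneid)
 *	{
 *		return (kmem_zalloc(sizeof (struct foo_zone), KM_SLEEP));
 *	}
 *
 *	static void
 *	foo_zone_fini(zoneid_t zoneid, void *data)
 *	{
 *		kmem_free(data, sizeof (struct foo_zone));
 *	}
 *
 *	...
 *	zone_key_create(&foo_key, foo_zone_init, NULL, foo_zone_fini);
 *	...
 *	struct foo_zone *fz = zone_getspecific(foo_key, curproc->p_zone);
 */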

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
	struct zsd_entry *zsd;

	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
		if (zsd->zsd_key == key) {
			return (zsd);
		}
	}
	return (NULL);
}

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.  Move it to the front of the list.
 */
static struct zsd_entry *
zsd_find_mru(list_t *l, zone_key_t key)
{
	struct zsd_entry *zsd;

	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
		if (zsd->zsd_key == key) {
			/*
			 * Move to head of list to keep list in MRU order.
			 */
			if (zsd != list_head(l)) {
				list_remove(l, zsd);
				list_insert_head(l, zsd);
			}
			return (zsd);
		}
	}
	return (NULL);
}

void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	struct zone *zone;
	zone_key_t key;

	zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
	zsdp->zsd_data = NULL;
	zsdp->zsd_create = create;
	zsdp->zsd_shutdown = shutdown;
	zsdp->zsd_destroy = destroy;

	/*
	 * Insert in global list of callbacks. Makes future zone creations
	 * see it.
	 */
	mutex_enter(&zsd_key_lock);
	key = zsdp->zsd_key = ++zsd_keyval;
	ASSERT(zsd_keyval != 0);
	list_insert_tail(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	/*
	 * Insert for all existing zones and mark them as needing
	 * a create callback.
	 */
	mutex_enter(&zonehash_lock);	/* stop the world */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		zone_status_t status;

		mutex_enter(&zone->zone_lock);

		/* Skip zones that are on the way down or not yet up */
		status = zone_status_get(zone);
		if (status >= ZONE_IS_DOWN ||
		    status == ZONE_IS_UNINITIALIZED) {
			mutex_exit(&zone->zone_lock);
			continue;
		}

		t = zsd_find_mru(&zone->zone_zsd, key);
		if (t != NULL) {
			/*
			 * A zsd_configure already inserted it after
			 * we dropped zsd_key_lock above.
			 */
			mutex_exit(&zone->zone_lock);
			continue;
		}
		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
		t->zsd_key = key;
		t->zsd_create = create;
		t->zsd_shutdown = shutdown;
		t->zsd_destroy = destroy;
		if (create != NULL) {
			t->zsd_flags = ZSD_CREATE_NEEDED;
			DTRACE_PROBE2(zsd__create__needed,
			    zone_t *, zone, zone_key_t, key);
		}
		list_insert_tail(&zone->zone_zsd, t);
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);

	if (create != NULL) {
		/* Now call the create callback for this key */
		zsd_apply_all_zones(zsd_apply_create, key);
	}
	/*
	 * It is safe for consumers to use the key now, make it
	 * globally visible.  Specifically zone_getspecific() will
	 * always successfully return the zone specific data associated
	 * with the key.
	 */
	*keyp = key;
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
 *
 * Remove from the global list and determine the functions that need to
 * be called under a global lock.  Then call the functions without
 * holding any locks.  Finally free up the zone_zsd entries.  (The apply
 * functions need to access the zone_zsd entries to find zsd_data etc.)
 */
int
zone_key_delete(zone_key_t key)
{
	struct zsd_entry *zsdp = NULL;
	zone_t *zone;

	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find_mru(&zsd_registered_keys, key);
	if (zsdp == NULL) {
		mutex_exit(&zsd_key_lock);
		return (-1);
	}
	list_remove(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		struct zsd_entry *del;

		mutex_enter(&zone->zone_lock);
		del = zsd_find_mru(&zone->zone_zsd, key);
		if (del == NULL) {
			/*
			 * Somebody else got here first, e.g. the zone going
			 * away.
			 */
			mutex_exit(&zone->zone_lock);
			continue;
		}
		ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
		ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
		if (del->zsd_shutdown != NULL &&
		    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
			del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
			DTRACE_PROBE2(zsd__shutdown__needed,
			    zone_t *, zone, zone_key_t, key);
		}
		if (del->zsd_destroy != NULL &&
		    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
			del->zsd_flags |= ZSD_DESTROY_NEEDED;
			DTRACE_PROBE2(zsd__destroy__needed,
			    zone_t *, zone, zone_key_t, key);
		}
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);
	kmem_free(zsdp, sizeof (*zsdp));

	/* Now call the shutdown and destroy callbacks for this key */
	zsd_apply_all_zones(zsd_apply_shutdown, key);
	zsd_apply_all_zones(zsd_apply_destroy, key);

	/* Now we can free up the zsdp structures in each zone */
	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		struct zsd_entry *del;

		mutex_enter(&zone->zone_lock);
		del = zsd_find(&zone->zone_zsd, key);
		if (del != NULL) {
			list_remove(&zone->zone_zsd, del);
			ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
			kmem_free(del, sizeof (*del));
		}
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);

	return (0);
}
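
/*
 * Illustrative sketch only: a loadable module that registers a ZSD key
 * would typically pair the calls above with its module entry points.
 * foo_key, foo_zone_init and foo_zone_fini are the invented names from
 * the sketch following the ZSD overview comment; modlinkage is the
 * module's own (hypothetical) linkage structure.
 *
 *	int
 *	_init(void)
 *	{
 *		zone_key_create(&foo_key, foo_zone_init, NULL,
 *		    foo_zone_fini);
 *		return (mod_install(&modlinkage));
 *	}
 *
 *	int
 *	_fini(void)
 *	{
 *		int err;
 *
 *		if ((err = mod_remove(&modlinkage)) == 0)
 *			(void) zone_key_delete(foo_key);
 *		return (err);
 *	}
 */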

/*
 * ZSD counterpart of pthread_setspecific().
 *
 * Since all zsd callbacks, including those with no create function,
 * have an entry in zone_zsd, if the key is registered it is part of
 * the zone_zsd list.
 * Return an error if the key wasn't registered.
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
	struct zsd_entry *t;

	mutex_enter(&zone->zone_lock);
	t = zsd_find_mru(&zone->zone_zsd, key);
	if (t != NULL) {
		/*
		 * Replace old value with new
		 */
		t->zsd_data = (void *)data;
		mutex_exit(&zone->zone_lock);
		return (0);
	}
	mutex_exit(&zone->zone_lock);
	return (-1);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
	struct zsd_entry *t;
	void *data;

	mutex_enter(&zone->zone_lock);
	t = zsd_find_mru(&zone->zone_zsd, key);
	data = (t == NULL ? NULL : t->zsd_data);
	mutex_exit(&zone->zone_lock);
	return (data);
}

/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created.  The callbacks are initialized from
 * the template list (zsd_registered_keys).  The constructor callback is
 * executed later (once the zone exists and with locks dropped).
 */
static void
zone_zsd_configure(zone_t *zone)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	ASSERT(list_head(&zone->zone_zsd) == NULL);
	mutex_enter(&zone->zone_lock);
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		/*
		 * Since this zone is ZONE_IS_UNINITIALIZED, zone_key_create
		 * should not have added anything to it.
		 */
		ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);

		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
		t->zsd_key = zsdp->zsd_key;
		t->zsd_create = zsdp->zsd_create;
		t->zsd_shutdown = zsdp->zsd_shutdown;
		t->zsd_destroy = zsdp->zsd_destroy;
		if (zsdp->zsd_create != NULL) {
			t->zsd_flags = ZSD_CREATE_NEEDED;
			DTRACE_PROBE2(zsd__create__needed,
			    zone_t *, zone, zone_key_t, zsdp->zsd_key);
		}
		list_insert_tail(&zone->zone_zsd, t);
	}
	mutex_exit(&zsd_key_lock);
	mutex_exit(&zone->zone_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
	struct zsd_entry *t;

	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

	/*
	 * Run the callback solely based on what is registered for the zone
	 * in zone_zsd.  The global list can change independently of this
	 * as keys are registered and unregistered, and we don't register new
	 * callbacks for a zone that is in the process of going away.
	 */
	mutex_enter(&zone->zone_lock);
	for (t = list_head(&zone->zone_zsd); t != NULL;
	    t = list_next(&zone->zone_zsd, t)) {
		zone_key_t key = t->zsd_key;

		/* Skip if no callbacks registered */

		if (ct == ZSD_SHUTDOWN) {
			if (t->zsd_shutdown != NULL &&
			    (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
				t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
				DTRACE_PROBE2(zsd__shutdown__needed,
				    zone_t *, zone, zone_key_t, key);
			}
		} else {
			if (t->zsd_destroy != NULL &&
			    (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
				t->zsd_flags |= ZSD_DESTROY_NEEDED;
				DTRACE_PROBE2(zsd__destroy__needed,
				    zone_t *, zone, zone_key_t, key);
			}
		}
	}
	mutex_exit(&zone->zone_lock);

	/* Now call the shutdown and destroy callbacks for this zone */
	zsd_apply_all_keys(zsd_apply_shutdown, zone);
	zsd_apply_all_keys(zsd_apply_destroy, zone);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
	struct zsd_entry *t, *next;

	/*
	 * Free all the zsd_entry's we had on this zone.
	 */
	mutex_enter(&zone->zone_lock);
	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
		next = list_next(&zone->zone_zsd, t);
		list_remove(&zone->zone_zsd, t);
		ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_zsd);
	mutex_exit(&zone->zone_lock);
}

/*
 * Apply a function to all zones for a particular key value.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards.  This is
 * because we want the design to allow for the list of zones
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2) time.
 */
static void
zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
{
	zone_t *zone;

	mutex_enter(&zonehash_lock);
	zone = list_head(&zone_active);
	while (zone != NULL) {
		if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
			/* Lock dropped - restart at head */
			zone = list_head(&zone_active);
		} else {
			zone = list_next(&zone_active, zone);
		}
	}
	mutex_exit(&zonehash_lock);
}

/*
 * Apply a function to all keys for a particular zone.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards.  This is
 * because we want the design to allow for the list of zsd callbacks
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2) time.
 */
static void
zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
{
	struct zsd_entry *t;

	mutex_enter(&zone->zone_lock);
	t = list_head(&zone->zone_zsd);
	while (t != NULL) {
		if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
			/* Lock dropped - restart at head */
			t = list_head(&zone->zone_zsd);
		} else {
			t = list_next(&zone->zone_zsd, t);
		}
	}
	mutex_exit(&zone->zone_lock);
}

/*
 * Call the create function for the zone and key if CREATE_NEEDED
 * is set.
 * If some other thread gets here first and sets CREATE_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the create function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
	void *result;
	struct zsd_entry *t;
	boolean_t dropped;

	if (lockp != NULL) {
		ASSERT(MUTEX_HELD(lockp));
	}
	if (zone_lock_held) {
		ASSERT(MUTEX_HELD(&zone->zone_lock));
	} else {
		mutex_enter(&zone->zone_lock);
	}

	t = zsd_find(&zone->zone_zsd, key);
	if (t == NULL) {
		/*
		 * Somebody else got here first, e.g. the zone going
		 * away.
		 */
		if (!zone_lock_held)
			mutex_exit(&zone->zone_lock);
		return (B_FALSE);
	}
	dropped = B_FALSE;
	if (zsd_wait_for_inprogress(zone, t, lockp))
		dropped = B_TRUE;

	if (t->zsd_flags & ZSD_CREATE_NEEDED) {
		t->zsd_flags &= ~ZSD_CREATE_NEEDED;
		t->zsd_flags |= ZSD_CREATE_INPROGRESS;
		DTRACE_PROBE2(zsd__create__inprogress,
		    zone_t *, zone, zone_key_t, key);
		mutex_exit(&zone->zone_lock);
		if (lockp != NULL)
			mutex_exit(lockp);

		dropped = B_TRUE;
		ASSERT(t->zsd_create != NULL);
		DTRACE_PROBE2(zsd__create__start,
		    zone_t *, zone, zone_key_t, key);

		result = (*t->zsd_create)(zone->zone_id);

		DTRACE_PROBE2(zsd__create__end,
		    zone_t *, zone, void *, result);

		ASSERT(result != NULL);
		if (lockp != NULL)
			mutex_enter(lockp);
		mutex_enter(&zone->zone_lock);
		t->zsd_data = result;
		t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
		t->zsd_flags |= ZSD_CREATE_COMPLETED;
		cv_broadcast(&t->zsd_cv);
		DTRACE_PROBE2(zsd__create__completed,
		    zone_t *, zone, zone_key_t, key);
	}
	if (!zone_lock_held)
		mutex_exit(&zone->zone_lock);
	return (dropped);
}

/*
 * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the shutdown function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
	struct zsd_entry *t;
	void *data;
	boolean_t dropped;

	if (lockp != NULL) {
		ASSERT(MUTEX_HELD(lockp));
	}
	if (zone_lock_held) {
		ASSERT(MUTEX_HELD(&zone->zone_lock));
	} else {
		mutex_enter(&zone->zone_lock);
	}

	t = zsd_find(&zone->zone_zsd, key);
	if (t == NULL) {
		/*
		 * Somebody else got here first, e.g. the zone going
		 * away.
		 */
		if (!zone_lock_held)
			mutex_exit(&zone->zone_lock);
		return (B_FALSE);
	}
	dropped = B_FALSE;
	if (zsd_wait_for_creator(zone, t, lockp))
		dropped = B_TRUE;

	if (zsd_wait_for_inprogress(zone, t, lockp))
		dropped = B_TRUE;

	if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
		t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
		t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
		DTRACE_PROBE2(zsd__shutdown__inprogress,
		    zone_t *, zone, zone_key_t, key);
		mutex_exit(&zone->zone_lock);
		if (lockp != NULL)
			mutex_exit(lockp);
		dropped = B_TRUE;

		ASSERT(t->zsd_shutdown != NULL);
		data = t->zsd_data;

		DTRACE_PROBE2(zsd__shutdown__start,
		    zone_t *, zone, zone_key_t, key);

		(t->zsd_shutdown)(zone->zone_id, data);
		DTRACE_PROBE2(zsd__shutdown__end,
		    zone_t *, zone, zone_key_t, key);

		if (lockp != NULL)
			mutex_enter(lockp);
		mutex_enter(&zone->zone_lock);
		t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
		t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
		cv_broadcast(&t->zsd_cv);
		DTRACE_PROBE2(zsd__shutdown__completed,
		    zone_t *, zone, zone_key_t, key);
	}
	if (!zone_lock_held)
		mutex_exit(&zone->zone_lock);
	return (dropped);
}

/*
 * Call the destroy function for the zone and key if DESTROY_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the destroy function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
	struct zsd_entry *t;
	void *data;
	boolean_t dropped;

	if (lockp != NULL) {
		ASSERT(MUTEX_HELD(lockp));
	}
	if (zone_lock_held) {
		ASSERT(MUTEX_HELD(&zone->zone_lock));
	} else {
		mutex_enter(&zone->zone_lock);
	}

	t = zsd_find(&zone->zone_zsd, key);
	if (t == NULL) {
		/*
		 * Somebody else got here first, e.g. the zone going
		 * away.
		 */
		if (!zone_lock_held)
			mutex_exit(&zone->zone_lock);
		return (B_FALSE);
	}
	dropped = B_FALSE;
	if (zsd_wait_for_creator(zone, t, lockp))
		dropped = B_TRUE;

	if (zsd_wait_for_inprogress(zone, t, lockp))
		dropped = B_TRUE;

	if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
		t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
		t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
		DTRACE_PROBE2(zsd__destroy__inprogress,
		    zone_t *, zone, zone_key_t, key);
		mutex_exit(&zone->zone_lock);
		if (lockp != NULL)
			mutex_exit(lockp);
		dropped = B_TRUE;

		ASSERT(t->zsd_destroy != NULL);
		data = t->zsd_data;
		DTRACE_PROBE2(zsd__destroy__start,
		    zone_t *, zone, zone_key_t, key);

		(t->zsd_destroy)(zone->zone_id, data);
		DTRACE_PROBE2(zsd__destroy__end,
		    zone_t *, zone, zone_key_t, key);

		if (lockp != NULL)
			mutex_enter(lockp);
		mutex_enter(&zone->zone_lock);
		t->zsd_data = NULL;
		t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
		t->zsd_flags |= ZSD_DESTROY_COMPLETED;
		cv_broadcast(&t->zsd_cv);
		DTRACE_PROBE2(zsd__destroy__completed,
		    zone_t *, zone, zone_key_t, key);
	}
	if (!zone_lock_held)
		mutex_exit(&zone->zone_lock);
	return (dropped);
}

/*
 * Wait for any CREATE_NEEDED flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
	boolean_t dropped = B_FALSE;

	while (t->zsd_flags & ZSD_CREATE_NEEDED) {
		DTRACE_PROBE2(zsd__wait__for__creator,
		    zone_t *, zone, struct zsd_entry *, t);
		if (lockp != NULL) {
			dropped = B_TRUE;
			mutex_exit(lockp);
		}
		cv_wait(&t->zsd_cv, &zone->zone_lock);
		if (lockp != NULL) {
			/* First drop zone_lock to preserve order */
			mutex_exit(&zone->zone_lock);
			mutex_enter(lockp);
			mutex_enter(&zone->zone_lock);
		}
	}
	return (dropped);
}

/*
 * Wait for any INPROGRESS flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
	boolean_t dropped = B_FALSE;

	while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
		DTRACE_PROBE2(zsd__wait__for__inprogress,
		    zone_t *, zone, struct zsd_entry *, t);
		if (lockp != NULL) {
			dropped = B_TRUE;
			mutex_exit(lockp);
		}
		cv_wait(&t->zsd_cv, &zone->zone_lock);
		if (lockp != NULL) {
			/* First drop zone_lock to preserve order */
			mutex_exit(&zone->zone_lock);
			mutex_enter(lockp);
			mutex_enter(&zone->zone_lock);
		}
	}
	return (dropped);
}

/*
 * Frees memory associated with the zone dataset list.
 */
static void
zone_free_datasets(zone_t *zone)
{
	zone_dataset_t *t, *next;

	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
		next = list_next(&zone->zone_datasets, t);
		list_remove(&zone->zone_datasets, t);
		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_datasets);
}

/*
 * zone.cpu-shares resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	return (p->p_zone->zone_shares);
}

/*ARGSUSED*/
static int
zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);

	e->rcep_p.zone->zone_shares = nv;
	return (0);
}

static rctl_ops_t zone_cpu_shares_ops = {
	rcop_no_action,
	zone_cpu_shares_usage,
	zone_cpu_shares_set,
	rcop_no_test
};

/*
 * zone.cpu-cap resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	return (cpucaps_zone_get(p->p_zone));
}

/*ARGSUSED*/
static int
zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	zone_t *zone = e->rcep_p.zone;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);

	if (zone == NULL)
		return (0);

	/*
	 * set cap to the new value.
	 */
	return (cpucaps_zone_set(zone, nv));
}

static rctl_ops_t zone_cpu_cap_ops = {
	rcop_no_action,
	zone_cpu_cap_get,
	zone_cpu_cap_set,
	rcop_no_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
	rctl_qty_t nlwps;
	zone_t *zone = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));

	mutex_enter(&zone->zone_nlwps_lock);
	nlwps = zone->zone_nlwps;
	mutex_exit(&zone->zone_nlwps_lock);

	return (nlwps);
}

/*ARGSUSED*/
static int
zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t nlwps;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
	nlwps = e->rcep_p.zone->zone_nlwps;

	if (nlwps + incr > rcntl->rcv_value)
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_nlwps_ctl = nv;
	return (0);
}

static rctl_ops_t zone_lwps_ops = {
	rcop_no_action,
	zone_lwps_usage,
	zone_lwps_set,
	zone_lwps_test,
};

/*ARGSUSED*/
static rctl_qty_t
zone_procs_usage(rctl_t *r, proc_t *p)
{
	rctl_qty_t nprocs;
	zone_t *zone = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));

	mutex_enter(&zone->zone_nlwps_lock);
	nprocs = zone->zone_nprocs;
	mutex_exit(&zone->zone_nlwps_lock);

	return (nprocs);
}

/*ARGSUSED*/
static int
zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t nprocs;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
	nprocs = e->rcep_p.zone->zone_nprocs;

	if (nprocs + incr > rcntl->rcv_value)
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_nprocs_ctl = nv;
	return (0);
}

static rctl_ops_t zone_procs_ops = {
	rcop_no_action,
	zone_procs_usage,
	zone_procs_set,
	zone_procs_test,
};

/*ARGSUSED*/
static int
zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t v;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	v = e->rcep_p.zone->zone_shmmax + incr;
	if (v > rval->rcv_value)
		return (1);
	return (0);
}

static rctl_ops_t zone_shmmax_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_shmmax_test
};

/*ARGSUSED*/
static int
zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t v;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
	if (v > rval->rcv_value)
		return (1);
	return (0);
}

static rctl_ops_t zone_shmmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_shmmni_test
};

/*ARGSUSED*/
static int
zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t v;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
	if (v > rval->rcv_value)
		return (1);
	return (0);
}

static rctl_ops_t zone_semmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_semmni_test
};

/*ARGSUSED*/
static int
zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t v;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
	if (v > rval->rcv_value)
		return (1);
	return (0);
}

static rctl_ops_t zone_msgmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_msgmni_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
{
	rctl_qty_t q;
	ASSERT(MUTEX_HELD(&p->p_lock));
	mutex_enter(&p->p_zone->zone_mem_lock);
	q = p->p_zone->zone_locked_mem;
	mutex_exit(&p->p_zone->zone_mem_lock);
	return (q);
}

/*ARGSUSED*/
static int
zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t q;
	zone_t *z;

	z = e->rcep_p.zone;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
	q = z->zone_locked_mem;
	if (q + incr > rcntl->rcv_value)
		return (1);
	return (0);
}

/*ARGSUSED*/
static int
zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_locked_mem_ctl = nv;
	return (0);
}

static rctl_ops_t zone_locked_mem_ops = {
	rcop_no_action,
	zone_locked_mem_usage,
	zone_locked_mem_set,
	zone_locked_mem_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_max_swap_usage(rctl_t *rctl, struct proc *p)
{
	rctl_qty_t q;
	zone_t *z = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));
	mutex_enter(&z->zone_mem_lock);
	q = z->zone_max_swap;
	mutex_exit(&z->zone_mem_lock);
	return (q);
}

/*ARGSUSED*/
static int
zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t q;
	zone_t *z;

	z = e->rcep_p.zone;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
	q = z->zone_max_swap;
	if (q + incr > rcntl->rcv_value)
		return (1);
	return (0);
}

/*ARGSUSED*/
static int
zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_max_swap_ctl = nv;
	return (0);
}

static rctl_ops_t zone_max_swap_ops = {
	rcop_no_action,
	zone_max_swap_usage,
	zone_max_swap_set,
	zone_max_swap_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
{
	rctl_qty_t q;
	zone_t *z = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));
	mutex_enter(&z->zone_rctl_lock);
	q = z->zone_max_lofi;
	mutex_exit(&z->zone_rctl_lock);
	return (q);
}

/*ARGSUSED*/
static int
zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t q;
	zone_t *z;

	z = e->rcep_p.zone;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
	q = z->zone_max_lofi;
	if (q + incr > rcntl->rcv_value)
		return (1);
	return (0);
}

/*ARGSUSED*/
static int
zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_max_lofi_ctl = nv;
	return (0);
}

static rctl_ops_t zone_max_lofi_ops = {
	rcop_no_action,
	zone_max_lofi_usage,
	zone_max_lofi_set,
	zone_max_lofi_test
};
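
/*
 * Illustrative sketch only: wiring up a new zone rctl follows the same
 * pattern as the controls above.  "zone.max-widgets", zone_widgets_*
 * and rc_zone_widgets are invented names; rctl_register() and the
 * rcop_no_* stubs are real (see zone_init() below for the actual
 * registrations).
 *
 *	static rctl_ops_t zone_widgets_ops = {
 *		rcop_no_action,		(no action callback)
 *		zone_widgets_usage,	(report current usage)
 *		zone_widgets_set,	(cache the control value in the zone)
 *		zone_widgets_test	(deny if usage + incr > limit)
 *	};
 *
 *	rc_zone_widgets = rctl_register("zone.max-widgets", RCENTITY_ZONE,
 *	    RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
 *	    UINT64_MAX, UINT64_MAX, &zone_widgets_ops);
 */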

/*
 * Helper function to brand the zone with a unique ID.
 */
static void
zone_uniqid(zone_t *zone)
{
	static uint64_t uniqid = 0;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	zone->zone_uniqid = uniqid++;
}

/*
 * Returns a held pointer to the "kcred" for the specified zone.
 */
struct cred *
zone_get_kcred(zoneid_t zoneid)
{
	zone_t *zone;
	cred_t *cr;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (NULL);
	cr = zone->zone_kcred;
	crhold(cr);
	zone_rele(zone);
	return (cr);
}

static int
zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
{
	zone_t *zone = ksp->ks_private;
	zone_kstat_t *zk = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	zk->zk_usage.value.ui64 = zone->zone_locked_mem;
	zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
	return (0);
}

static int
zone_nprocs_kstat_update(kstat_t *ksp, int rw)
{
	zone_t *zone = ksp->ks_private;
	zone_kstat_t *zk = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	zk->zk_usage.value.ui64 = zone->zone_nprocs;
	zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
	return (0);
}

static int
zone_swapresv_kstat_update(kstat_t *ksp, int rw)
{
	zone_t *zone = ksp->ks_private;
	zone_kstat_t *zk = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	zk->zk_usage.value.ui64 = zone->zone_max_swap;
	zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
	return (0);
}

static kstat_t *
zone_kstat_create_common(zone_t *zone, char *name,
    int (*updatefunc) (kstat_t *, int))
{
	kstat_t *ksp;
	zone_kstat_t *zk;

	ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (ksp == NULL)
		return (NULL);

	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
	ksp->ks_data_size += strlen(zone->zone_name) + 1;
	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
	ksp->ks_update = updatefunc;
	ksp->ks_private = zone;
	kstat_install(ksp);
	return (ksp);
}

static void
zone_kstat_create(zone_t *zone)
{
	zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
	    "lockedmem", zone_lockedmem_kstat_update);
	zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
	    "swapresv", zone_swapresv_kstat_update);
	zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
	    "nprocs", zone_nprocs_kstat_update);
}

static void
zone_kstat_delete_common(kstat_t **pkstat)
{
	void *data;

	if (*pkstat != NULL) {
		data = (*pkstat)->ks_data;
		kstat_delete(*pkstat);
		kmem_free(data, sizeof (zone_kstat_t));
		*pkstat = NULL;
	}
}

static void
zone_kstat_delete(zone_t *zone)
{
	zone_kstat_delete_common(&zone->zone_lockedmem_kstat);
	zone_kstat_delete_common(&zone->zone_swapresv_kstat);
	zone_kstat_delete_common(&zone->zone_nprocs_kstat);
}
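
/*
 * Illustrative sketch only: these kstats can be read from userland with
 * libkstat.  The module/name layout shown ("caps", "swapresv_zone_0")
 * is an assumption based on rctl_kstat_create_zone(); verify against
 * that function before relying on it.
 *
 *	#include <kstat.h>
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "caps", 0, "swapresv_zone_0");
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		kstat_named_t *kn = kstat_data_lookup(ksp, "usage");
 *		... kn->value.ui64 is the zone's swap reservation ...
 *	}
 *	(void) kstat_close(kc);
 */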
1835 */ 1836 void 1837 zone_zsd_init(void) 1838 { 1839 mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL); 1840 mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL); 1841 list_create(&zsd_registered_keys, sizeof (struct zsd_entry), 1842 offsetof(struct zsd_entry, zsd_linkage)); 1843 list_create(&zone_active, sizeof (zone_t), 1844 offsetof(zone_t, zone_linkage)); 1845 list_create(&zone_deathrow, sizeof (zone_t), 1846 offsetof(zone_t, zone_linkage)); 1847 1848 mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL); 1849 mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); 1850 mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); 1851 zone0.zone_shares = 1; 1852 zone0.zone_nlwps = 0; 1853 zone0.zone_nlwps_ctl = INT_MAX; 1854 zone0.zone_nprocs = 0; 1855 zone0.zone_nprocs_ctl = INT_MAX; 1856 zone0.zone_locked_mem = 0; 1857 zone0.zone_locked_mem_ctl = UINT64_MAX; 1858 ASSERT(zone0.zone_max_swap == 0); 1859 zone0.zone_max_swap_ctl = UINT64_MAX; 1860 zone0.zone_max_lofi = 0; 1861 zone0.zone_max_lofi_ctl = UINT64_MAX; 1862 zone0.zone_shmmax = 0; 1863 zone0.zone_ipc.ipcq_shmmni = 0; 1864 zone0.zone_ipc.ipcq_semmni = 0; 1865 zone0.zone_ipc.ipcq_msgmni = 0; 1866 zone0.zone_name = GLOBAL_ZONENAME; 1867 zone0.zone_nodename = utsname.nodename; 1868 zone0.zone_domain = srpc_domain; 1869 zone0.zone_hostid = HW_INVALID_HOSTID; 1870 zone0.zone_fs_allowed = NULL; 1871 zone0.zone_ref = 1; 1872 zone0.zone_id = GLOBAL_ZONEID; 1873 zone0.zone_status = ZONE_IS_RUNNING; 1874 zone0.zone_rootpath = "/"; 1875 zone0.zone_rootpathlen = 2; 1876 zone0.zone_psetid = ZONE_PS_INVAL; 1877 zone0.zone_ncpus = 0; 1878 zone0.zone_ncpus_online = 0; 1879 zone0.zone_proc_initpid = 1; 1880 zone0.zone_initname = initname; 1881 zone0.zone_lockedmem_kstat = NULL; 1882 zone0.zone_swapresv_kstat = NULL; 1883 zone0.zone_nprocs_kstat = NULL; 1884 list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), 1885 offsetof(struct zsd_entry, zsd_linkage)); 1886 list_insert_head(&zone_active, &zone0); 1887 1888 /* 1889 * The root filesystem is not mounted yet, so zone_rootvp cannot be set 1890 * to anything meaningful. It is assigned to be 'rootdir' in 1891 * vfs_mountroot(). 1892 */ 1893 zone0.zone_rootvp = NULL; 1894 zone0.zone_vfslist = NULL; 1895 zone0.zone_bootargs = initargs; 1896 zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP); 1897 /* 1898 * The global zone has all privileges 1899 */ 1900 priv_fillset(zone0.zone_privset); 1901 /* 1902 * Add p0 to the global zone 1903 */ 1904 zone0.zone_zsched = &p0; 1905 p0.p_zone = &zone0; 1906 } 1907 1908 /* 1909 * Compute a hash value based on the contents of the label and the DOI. The 1910 * hash algorithm is somewhat arbitrary, but is based on the observation that 1911 * humans will likely pick labels that differ by amounts that work out to be 1912 * multiples of the number of hash chains, and thus stirring in some primes 1913 * should help. 
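 *
 * (Concretely: the per-word step below multiplies by 2^n + 1 for some
 * 1 <= n <= 16, and 2^n + 1 is prime for n = 1, 2, 4, 8 and 16 -- giving
 * 3, 5, 17, 257 and 65537 -- which is the sense in which primes get
 * stirred in.)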
1914 */ 1915 static uint_t 1916 hash_bylabel(void *hdata, mod_hash_key_t key) 1917 { 1918 const ts_label_t *lab = (ts_label_t *)key; 1919 const uint32_t *up, *ue; 1920 uint_t hash; 1921 int i; 1922 1923 _NOTE(ARGUNUSED(hdata)); 1924 1925 hash = lab->tsl_doi + (lab->tsl_doi << 1); 1926 /* we depend on alignment of label, but not representation */ 1927 up = (const uint32_t *)&lab->tsl_label; 1928 ue = up + sizeof (lab->tsl_label) / sizeof (*up); 1929 i = 1; 1930 while (up < ue) { 1931 /* using 2^n + 1, 1 <= n <= 16 as source of many primes */ 1932 hash += *up + (*up << ((i % 16) + 1)); 1933 up++; 1934 i++; 1935 } 1936 return (hash); 1937 } 1938 1939 /* 1940 * All that mod_hash cares about here is zero (equal) versus non-zero (not 1941 * equal). This may need to be changed if less than / greater than is ever 1942 * needed. 1943 */ 1944 static int 1945 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1946 { 1947 ts_label_t *lab1 = (ts_label_t *)key1; 1948 ts_label_t *lab2 = (ts_label_t *)key2; 1949 1950 return (label_equal(lab1, lab2) ? 0 : 1); 1951 } 1952 1953 /* 1954 * Called by main() to initialize the zones framework. 1955 */ 1956 void 1957 zone_init(void) 1958 { 1959 rctl_dict_entry_t *rde; 1960 rctl_val_t *dval; 1961 rctl_set_t *set; 1962 rctl_alloc_gp_t *gp; 1963 rctl_entity_p_t e; 1964 int res; 1965 1966 ASSERT(curproc == &p0); 1967 1968 /* 1969 * Create ID space for zone IDs. ID 0 is reserved for the 1970 * global zone. 1971 */ 1972 zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID); 1973 1974 /* 1975 * Initialize generic zone resource controls, if any. 1976 */ 1977 rc_zone_cpu_shares = rctl_register("zone.cpu-shares", 1978 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | 1979 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, 1980 FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops); 1981 1982 rc_zone_cpu_cap = rctl_register("zone.cpu-cap", 1983 RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS | 1984 RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER | 1985 RCTL_GLOBAL_INFINITE, 1986 MAXCAP, MAXCAP, &zone_cpu_cap_ops); 1987 1988 rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, 1989 RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, 1990 INT_MAX, INT_MAX, &zone_lwps_ops); 1991 1992 rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE, 1993 RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, 1994 INT_MAX, INT_MAX, &zone_procs_ops); 1995 1996 /* 1997 * System V IPC resource controls 1998 */ 1999 rc_zone_msgmni = rctl_register("zone.max-msg-ids", 2000 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC | 2001 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops); 2002 2003 rc_zone_semmni = rctl_register("zone.max-sem-ids", 2004 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC | 2005 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops); 2006 2007 rc_zone_shmmni = rctl_register("zone.max-shm-ids", 2008 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC | 2009 RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops); 2010 2011 rc_zone_shmmax = rctl_register("zone.max-shm-memory", 2012 RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC | 2013 RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops); 2014 2015 /* 2016 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach 2017 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''. 
2018 */ 2019 dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); 2020 bzero(dval, sizeof (rctl_val_t)); 2021 dval->rcv_value = 1; 2022 dval->rcv_privilege = RCPRIV_PRIVILEGED; 2023 dval->rcv_flagaction = RCTL_LOCAL_NOACTION; 2024 dval->rcv_action_recip_pid = -1; 2025 2026 rde = rctl_dict_lookup("zone.cpu-shares"); 2027 (void) rctl_val_list_insert(&rde->rcd_default_value, dval); 2028 2029 rc_zone_locked_mem = rctl_register("zone.max-locked-memory", 2030 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | 2031 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, 2032 &zone_locked_mem_ops); 2033 2034 rc_zone_max_swap = rctl_register("zone.max-swap", 2035 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | 2036 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, 2037 &zone_max_swap_ops); 2038 2039 rc_zone_max_lofi = rctl_register("zone.max-lofi", 2040 RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | 2041 RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, 2042 &zone_max_lofi_ops); 2043 2044 /* 2045 * Initialize the ``global zone''. 2046 */ 2047 set = rctl_set_create(); 2048 gp = rctl_set_init_prealloc(RCENTITY_ZONE); 2049 mutex_enter(&p0.p_lock); 2050 e.rcep_p.zone = &zone0; 2051 e.rcep_t = RCENTITY_ZONE; 2052 zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set, 2053 gp); 2054 2055 zone0.zone_nlwps = p0.p_lwpcnt; 2056 zone0.zone_nprocs = 1; 2057 zone0.zone_ntasks = 1; 2058 mutex_exit(&p0.p_lock); 2059 zone0.zone_restart_init = B_TRUE; 2060 zone0.zone_brand = &native_brand; 2061 rctl_prealloc_destroy(gp); 2062 /* 2063 * pool_default hasn't been initialized yet, so we let pool_init() 2064 * take care of making sure the global zone is in the default pool. 2065 */ 2066 2067 /* 2068 * Initialize global zone kstats 2069 */ 2070 zone_kstat_create(&zone0); 2071 2072 /* 2073 * Initialize zone label. 2074 * mlp are initialized when tnzonecfg is loaded. 2075 */ 2076 zone0.zone_slabel = l_admin_low; 2077 rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL); 2078 label_hold(l_admin_low); 2079 2080 /* 2081 * Initialize the lock for the database structure used by mntfs. 2082 */ 2083 rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL); 2084 2085 mutex_enter(&zonehash_lock); 2086 zone_uniqid(&zone0); 2087 ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID); 2088 2089 zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size, 2090 mod_hash_null_valdtor); 2091 zonehashbyname = mod_hash_create_strhash("zone_by_name", 2092 zone_hash_size, mod_hash_null_valdtor); 2093 /* 2094 * maintain zonehashbylabel only for labeled systems 2095 */ 2096 if (is_system_labeled()) 2097 zonehashbylabel = mod_hash_create_extended("zone_by_label", 2098 zone_hash_size, mod_hash_null_keydtor, 2099 mod_hash_null_valdtor, hash_bylabel, NULL, 2100 hash_labelkey_cmp, KM_SLEEP); 2101 zonecount = 1; 2102 2103 (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID, 2104 (mod_hash_val_t)&zone0); 2105 (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name, 2106 (mod_hash_val_t)&zone0); 2107 if (is_system_labeled()) { 2108 zone0.zone_flags |= ZF_HASHED_LABEL; 2109 (void) mod_hash_insert(zonehashbylabel, 2110 (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0); 2111 } 2112 mutex_exit(&zonehash_lock); 2113 2114 /* 2115 * We avoid setting zone_kcred until now, since kcred is initialized 2116 * sometime after zone_zsd_init() and before zone_init().
2117 */ 2118 zone0.zone_kcred = kcred; 2119 /* 2120 * The global zone is fully initialized (except for zone_rootvp which 2121 * will be set when the root filesystem is mounted). 2122 */ 2123 global_zone = &zone0; 2124 2125 /* 2126 * Setup an event channel to send zone status change notifications on 2127 */ 2128 res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan, 2129 EVCH_CREAT); 2130 2131 if (res) 2132 panic("Sysevent_evc_bind failed during zone setup.\n"); 2133 2134 } 2135 2136 static void 2137 zone_free(zone_t *zone) 2138 { 2139 ASSERT(zone != global_zone); 2140 ASSERT(zone->zone_ntasks == 0); 2141 ASSERT(zone->zone_nlwps == 0); 2142 ASSERT(zone->zone_nprocs == 0); 2143 ASSERT(zone->zone_cred_ref == 0); 2144 ASSERT(zone->zone_kcred == NULL); 2145 ASSERT(zone_status_get(zone) == ZONE_IS_DEAD || 2146 zone_status_get(zone) == ZONE_IS_UNINITIALIZED); 2147 2148 /* 2149 * Remove any zone caps. 2150 */ 2151 cpucaps_zone_remove(zone); 2152 2153 ASSERT(zone->zone_cpucap == NULL); 2154 2155 /* remove from deathrow list */ 2156 if (zone_status_get(zone) == ZONE_IS_DEAD) { 2157 ASSERT(zone->zone_ref == 0); 2158 mutex_enter(&zone_deathrow_lock); 2159 list_remove(&zone_deathrow, zone); 2160 mutex_exit(&zone_deathrow_lock); 2161 } 2162 2163 zone_free_zsd(zone); 2164 zone_free_datasets(zone); 2165 list_destroy(&zone->zone_dl_list); 2166 2167 if (zone->zone_rootvp != NULL) 2168 VN_RELE(zone->zone_rootvp); 2169 if (zone->zone_rootpath) 2170 kmem_free(zone->zone_rootpath, zone->zone_rootpathlen); 2171 if (zone->zone_name != NULL) 2172 kmem_free(zone->zone_name, ZONENAME_MAX); 2173 if (zone->zone_slabel != NULL) 2174 label_rele(zone->zone_slabel); 2175 if (zone->zone_nodename != NULL) 2176 kmem_free(zone->zone_nodename, _SYS_NMLN); 2177 if (zone->zone_domain != NULL) 2178 kmem_free(zone->zone_domain, _SYS_NMLN); 2179 if (zone->zone_privset != NULL) 2180 kmem_free(zone->zone_privset, sizeof (priv_set_t)); 2181 if (zone->zone_rctls != NULL) 2182 rctl_set_free(zone->zone_rctls); 2183 if (zone->zone_bootargs != NULL) 2184 strfree(zone->zone_bootargs); 2185 if (zone->zone_initname != NULL) 2186 strfree(zone->zone_initname); 2187 if (zone->zone_fs_allowed != NULL) 2188 strfree(zone->zone_fs_allowed); 2189 if (zone->zone_pfexecd != NULL) 2190 klpd_freelist(&zone->zone_pfexecd); 2191 id_free(zoneid_space, zone->zone_id); 2192 mutex_destroy(&zone->zone_lock); 2193 cv_destroy(&zone->zone_cv); 2194 rw_destroy(&zone->zone_mlps.mlpl_rwlock); 2195 rw_destroy(&zone->zone_mntfs_db_lock); 2196 kmem_free(zone, sizeof (zone_t)); 2197 } 2198 2199 /* 2200 * See block comment at the top of this file for information about zone 2201 * status values. 2202 */ 2203 /* 2204 * Convenience function for setting zone status. 
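 * As a side effect, each successful transition is published as a
 * sysevent on the zone event channel, carrying the zone name, zone id,
 * the old and new state strings, and a timestamp; user-level consumers
 * can subscribe to ZONE_EVENT_CHANNEL to watch state transitions.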
2205 */ 2206 static void 2207 zone_status_set(zone_t *zone, zone_status_t status) 2208 { 2209 2210 nvlist_t *nvl = NULL; 2211 ASSERT(MUTEX_HELD(&zone_status_lock)); 2212 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && 2213 status >= zone_status_get(zone)); 2214 2215 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) || 2216 nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || 2217 nvlist_add_string(nvl, ZONE_CB_NEWSTATE, 2218 zone_status_table[status]) || 2219 nvlist_add_string(nvl, ZONE_CB_OLDSTATE, 2220 zone_status_table[zone->zone_status]) || 2221 nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || 2222 nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || 2223 sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS, 2224 ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) { 2225 #ifdef DEBUG 2226 (void) printf( 2227 "Failed to allocate and send zone state change event.\n"); 2228 #endif 2229 } 2230 nvlist_free(nvl); 2231 2232 zone->zone_status = status; 2233 2234 cv_broadcast(&zone->zone_cv); 2235 } 2236 2237 /* 2238 * Public function to retrieve the zone status. The zone status may 2239 * change after it is retrieved. 2240 */ 2241 zone_status_t 2242 zone_status_get(zone_t *zone) 2243 { 2244 return (zone->zone_status); 2245 } 2246 2247 static int 2248 zone_set_bootargs(zone_t *zone, const char *zone_bootargs) 2249 { 2250 char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP); 2251 int err = 0; 2252 2253 ASSERT(zone != global_zone); 2254 if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0) 2255 goto done; /* EFAULT or ENAMETOOLONG */ 2256 2257 if (zone->zone_bootargs != NULL) 2258 strfree(zone->zone_bootargs); 2259 2260 zone->zone_bootargs = strdup(buf); 2261 2262 done: 2263 kmem_free(buf, BOOTARGS_MAX); 2264 return (err); 2265 } 2266 2267 static int 2268 zone_set_brand(zone_t *zone, const char *brand) 2269 { 2270 struct brand_attr *attrp; 2271 brand_t *bp; 2272 2273 attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP); 2274 if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) { 2275 kmem_free(attrp, sizeof (struct brand_attr)); 2276 return (EFAULT); 2277 } 2278 2279 bp = brand_register_zone(attrp); 2280 kmem_free(attrp, sizeof (struct brand_attr)); 2281 if (bp == NULL) 2282 return (EINVAL); 2283 2284 /* 2285 * This is the only place where a zone can change its brand. 2286 * We already need to hold zone_status_lock to check the zone 2287 * status, so we'll just use that lock to serialize zone 2288 * branding requests as well.
2289 */ 2290 mutex_enter(&zone_status_lock); 2291 2292 /* Re-Branding is not allowed and the zone can't be booted yet */ 2293 if ((ZONE_IS_BRANDED(zone)) || 2294 (zone_status_get(zone) >= ZONE_IS_BOOTING)) { 2295 mutex_exit(&zone_status_lock); 2296 brand_unregister_zone(bp); 2297 return (EINVAL); 2298 } 2299 2300 /* set up the brand specific data */ 2301 zone->zone_brand = bp; 2302 ZBROP(zone)->b_init_brand_data(zone); 2303 2304 mutex_exit(&zone_status_lock); 2305 return (0); 2306 } 2307 2308 static int 2309 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed) 2310 { 2311 char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP); 2312 int err = 0; 2313 2314 ASSERT(zone != global_zone); 2315 if ((err = copyinstr(zone_fs_allowed, buf, 2316 ZONE_FS_ALLOWED_MAX, NULL)) != 0) 2317 goto done; 2318 2319 if (zone->zone_fs_allowed != NULL) 2320 strfree(zone->zone_fs_allowed); 2321 2322 zone->zone_fs_allowed = strdup(buf); 2323 2324 done: 2325 kmem_free(buf, ZONE_FS_ALLOWED_MAX); 2326 return (err); 2327 } 2328 2329 static int 2330 zone_set_initname(zone_t *zone, const char *zone_initname) 2331 { 2332 char initname[INITNAME_SZ]; 2333 size_t len; 2334 int err = 0; 2335 2336 ASSERT(zone != global_zone); 2337 if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0) 2338 return (err); /* EFAULT or ENAMETOOLONG */ 2339 2340 if (zone->zone_initname != NULL) 2341 strfree(zone->zone_initname); 2342 2343 zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP); 2344 (void) strcpy(zone->zone_initname, initname); 2345 return (0); 2346 } 2347 2348 static int 2349 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) 2350 { 2351 uint64_t mcap; 2352 int err = 0; 2353 2354 if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) 2355 zone->zone_phys_mcap = mcap; 2356 2357 return (err); 2358 } 2359 2360 static int 2361 zone_set_sched_class(zone_t *zone, const char *new_class) 2362 { 2363 char sched_class[PC_CLNMSZ]; 2364 id_t classid; 2365 int err; 2366 2367 ASSERT(zone != global_zone); 2368 if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0) 2369 return (err); /* EFAULT or ENAMETOOLONG */ 2370 2371 if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid)) 2372 return (set_errno(EINVAL)); 2373 zone->zone_defaultcid = classid; 2374 ASSERT(zone->zone_defaultcid > 0 && 2375 zone->zone_defaultcid < loaded_classes); 2376 2377 return (0); 2378 } 2379 2380 /* 2381 * Block indefinitely waiting for (zone_status >= status) 2382 */ 2383 void 2384 zone_status_wait(zone_t *zone, zone_status_t status) 2385 { 2386 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); 2387 2388 mutex_enter(&zone_status_lock); 2389 while (zone->zone_status < status) { 2390 cv_wait(&zone->zone_cv, &zone_status_lock); 2391 } 2392 mutex_exit(&zone_status_lock); 2393 } 2394 2395 /* 2396 * Private CPR-safe version of zone_status_wait(). 2397 */ 2398 static void 2399 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str) 2400 { 2401 callb_cpr_t cprinfo; 2402 2403 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); 2404 2405 CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr, 2406 str); 2407 mutex_enter(&zone_status_lock); 2408 while (zone->zone_status < status) { 2409 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2410 cv_wait(&zone->zone_cv, &zone_status_lock); 2411 CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock); 2412 } 2413 /* 2414 * zone_status_lock is implicitly released by the following. 
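 * (CALLB_CPR_EXIT() is defined to drop the mutex that was registered
 * with CALLB_CPR_INIT(), which here is zone_status_lock.)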
2415 */ 2416 CALLB_CPR_EXIT(&cprinfo); 2417 } 2418 2419 /* 2420 * Block until zone enters requested state or signal is received. Return (0) 2421 * if signaled, non-zero otherwise. 2422 */ 2423 int 2424 zone_status_wait_sig(zone_t *zone, zone_status_t status) 2425 { 2426 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); 2427 2428 mutex_enter(&zone_status_lock); 2429 while (zone->zone_status < status) { 2430 if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) { 2431 mutex_exit(&zone_status_lock); 2432 return (0); 2433 } 2434 } 2435 mutex_exit(&zone_status_lock); 2436 return (1); 2437 } 2438 2439 /* 2440 * Block until the zone enters the requested state or the timeout expires, 2441 * whichever happens first. Return (-1) if operation timed out, time remaining 2442 * otherwise. 2443 */ 2444 clock_t 2445 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status) 2446 { 2447 clock_t timeleft = 0; 2448 2449 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); 2450 2451 mutex_enter(&zone_status_lock); 2452 while (zone->zone_status < status && timeleft != -1) { 2453 timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim); 2454 } 2455 mutex_exit(&zone_status_lock); 2456 return (timeleft); 2457 } 2458 2459 /* 2460 * Block until the zone enters the requested state, the current process is 2461 * signaled, or the timeout expires, whichever happens first. Return (-1) if 2462 * operation timed out, 0 if signaled, time remaining otherwise. 2463 */ 2464 clock_t 2465 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status) 2466 { 2467 clock_t timeleft = tim - ddi_get_lbolt(); 2468 2469 ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); 2470 2471 mutex_enter(&zone_status_lock); 2472 while (zone->zone_status < status) { 2473 timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock, 2474 tim); 2475 if (timeleft <= 0) 2476 break; 2477 } 2478 mutex_exit(&zone_status_lock); 2479 return (timeleft); 2480 } 2481 2482 /* 2483 * Zones have two reference counts: one for references from credential 2484 * structures (zone_cred_ref), and one (zone_ref) for everything else. 2485 * This is so we can allow a zone to be rebooted while there are still 2486 * outstanding cred references, since certain drivers cache dblks (which 2487 * implicitly results in cached creds). We wait for zone_ref to drop to 2488 * 0 (actually 1), but not zone_cred_ref. The zone structure itself is 2489 * later freed when the zone_cred_ref drops to 0, though nothing other 2490 * than the zone id and privilege set should be accessed once the zone 2491 * is "dead". 2492 * 2493 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value 2494 * to force halt/reboot to block waiting for the zone_cred_ref to drop 2495 * to 0. This can be useful to flush out other sources of cached creds 2496 * that may be less innocuous than the driver case. 2497 */ 2498 2499 int zone_wait_for_cred = 0; 2500 2501 static void 2502 zone_hold_locked(zone_t *z) 2503 { 2504 ASSERT(MUTEX_HELD(&z->zone_lock)); 2505 z->zone_ref++; 2506 ASSERT(z->zone_ref != 0); 2507 } 2508 2509 void 2510 zone_hold(zone_t *z) 2511 { 2512 mutex_enter(&z->zone_lock); 2513 zone_hold_locked(z); 2514 mutex_exit(&z->zone_lock); 2515 } 2516 2517 /* 2518 * If the non-cred ref count drops to 1 and either the cred ref count 2519 * is 0 or we aren't waiting for cred references, the zone is ready to 2520 * be destroyed. 
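 *
 * (A sketch of why 1 rather than 0: the one remaining reference is the
 * hold taken for zsched when the zone was set up, and zone_destroy()
 * itself releases that hold, so a count of 1 means "no outside users".)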
2521 */ 2522 #define ZONE_IS_UNREF(zone) ((zone)->zone_ref == 1 && \ 2523 (!zone_wait_for_cred || (zone)->zone_cred_ref == 0)) 2524 2525 void 2526 zone_rele(zone_t *z) 2527 { 2528 boolean_t wakeup; 2529 2530 mutex_enter(&z->zone_lock); 2531 ASSERT(z->zone_ref != 0); 2532 z->zone_ref--; 2533 if (z->zone_ref == 0 && z->zone_cred_ref == 0) { 2534 /* no more refs, free the structure */ 2535 mutex_exit(&z->zone_lock); 2536 zone_free(z); 2537 return; 2538 } 2539 /* signal zone_destroy so the zone can finish halting */ 2540 wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD); 2541 mutex_exit(&z->zone_lock); 2542 2543 if (wakeup) { 2544 /* 2545 * Grabbing zonehash_lock here effectively synchronizes with 2546 * zone_destroy() to avoid missed signals. 2547 */ 2548 mutex_enter(&zonehash_lock); 2549 cv_broadcast(&zone_destroy_cv); 2550 mutex_exit(&zonehash_lock); 2551 } 2552 } 2553 2554 void 2555 zone_cred_hold(zone_t *z) 2556 { 2557 mutex_enter(&z->zone_lock); 2558 z->zone_cred_ref++; 2559 ASSERT(z->zone_cred_ref != 0); 2560 mutex_exit(&z->zone_lock); 2561 } 2562 2563 void 2564 zone_cred_rele(zone_t *z) 2565 { 2566 boolean_t wakeup; 2567 2568 mutex_enter(&z->zone_lock); 2569 ASSERT(z->zone_cred_ref != 0); 2570 z->zone_cred_ref--; 2571 if (z->zone_ref == 0 && z->zone_cred_ref == 0) { 2572 /* no more refs, free the structure */ 2573 mutex_exit(&z->zone_lock); 2574 zone_free(z); 2575 return; 2576 } 2577 /* 2578 * If zone_destroy is waiting for the cred references to drain 2579 * out, and they have, signal it. 2580 */ 2581 wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) && 2582 zone_status_get(z) >= ZONE_IS_DEAD); 2583 mutex_exit(&z->zone_lock); 2584 2585 if (wakeup) { 2586 /* 2587 * Grabbing zonehash_lock here effectively synchronizes with 2588 * zone_destroy() to avoid missed signals. 2589 */ 2590 mutex_enter(&zonehash_lock); 2591 cv_broadcast(&zone_destroy_cv); 2592 mutex_exit(&zonehash_lock); 2593 } 2594 } 2595 2596 void 2597 zone_task_hold(zone_t *z) 2598 { 2599 mutex_enter(&z->zone_lock); 2600 z->zone_ntasks++; 2601 ASSERT(z->zone_ntasks != 0); 2602 mutex_exit(&z->zone_lock); 2603 } 2604 2605 void 2606 zone_task_rele(zone_t *zone) 2607 { 2608 uint_t refcnt; 2609 2610 mutex_enter(&zone->zone_lock); 2611 ASSERT(zone->zone_ntasks != 0); 2612 refcnt = --zone->zone_ntasks; 2613 if (refcnt > 1) { /* Common case */ 2614 mutex_exit(&zone->zone_lock); 2615 return; 2616 } 2617 zone_hold_locked(zone); /* so we can use the zone_t later */ 2618 mutex_exit(&zone->zone_lock); 2619 if (refcnt == 1) { 2620 /* 2621 * See if the zone is shutting down. 2622 */ 2623 mutex_enter(&zone_status_lock); 2624 if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) { 2625 goto out; 2626 } 2627 2628 /* 2629 * Make sure the ntasks didn't change since we 2630 * dropped zone_lock. 2631 */ 2632 mutex_enter(&zone->zone_lock); 2633 if (refcnt != zone->zone_ntasks) { 2634 mutex_exit(&zone->zone_lock); 2635 goto out; 2636 } 2637 mutex_exit(&zone->zone_lock); 2638 2639 /* 2640 * No more user processes in the zone. The zone is empty. 2641 */ 2642 zone_status_set(zone, ZONE_IS_EMPTY); 2643 goto out; 2644 } 2645 2646 ASSERT(refcnt == 0); 2647 /* 2648 * zsched has exited; the zone is dead. 
2649 */ 2650 zone->zone_zsched = NULL; /* paranoia */ 2651 mutex_enter(&zone_status_lock); 2652 zone_status_set(zone, ZONE_IS_DEAD); 2653 out: 2654 mutex_exit(&zone_status_lock); 2655 zone_rele(zone); 2656 } 2657 2658 zoneid_t 2659 getzoneid(void) 2660 { 2661 return (curproc->p_zone->zone_id); 2662 } 2663 2664 /* 2665 * Internal versions of zone_find_by_*(). These don't zone_hold() or 2666 * check the validity of a zone's state. 2667 */ 2668 static zone_t * 2669 zone_find_all_by_id(zoneid_t zoneid) 2670 { 2671 mod_hash_val_t hv; 2672 zone_t *zone = NULL; 2673 2674 ASSERT(MUTEX_HELD(&zonehash_lock)); 2675 2676 if (mod_hash_find(zonehashbyid, 2677 (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0) 2678 zone = (zone_t *)hv; 2679 return (zone); 2680 } 2681 2682 static zone_t * 2683 zone_find_all_by_label(const ts_label_t *label) 2684 { 2685 mod_hash_val_t hv; 2686 zone_t *zone = NULL; 2687 2688 ASSERT(MUTEX_HELD(&zonehash_lock)); 2689 2690 /* 2691 * zonehashbylabel is not maintained for unlabeled systems 2692 */ 2693 if (!is_system_labeled()) 2694 return (NULL); 2695 if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0) 2696 zone = (zone_t *)hv; 2697 return (zone); 2698 } 2699 2700 static zone_t * 2701 zone_find_all_by_name(char *name) 2702 { 2703 mod_hash_val_t hv; 2704 zone_t *zone = NULL; 2705 2706 ASSERT(MUTEX_HELD(&zonehash_lock)); 2707 2708 if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0) 2709 zone = (zone_t *)hv; 2710 return (zone); 2711 } 2712 2713 /* 2714 * Public interface for looking up a zone by zoneid. Only returns the zone if 2715 * it is fully initialized, and has not yet begun the zone_destroy() sequence. 2716 * Caller must call zone_rele() once it is done with the zone. 2717 * 2718 * The zone may begin the zone_destroy() sequence immediately after this 2719 * function returns, but may be safely used until zone_rele() is called. 2720 */ 2721 zone_t * 2722 zone_find_by_id(zoneid_t zoneid) 2723 { 2724 zone_t *zone; 2725 zone_status_t status; 2726 2727 mutex_enter(&zonehash_lock); 2728 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 2729 mutex_exit(&zonehash_lock); 2730 return (NULL); 2731 } 2732 status = zone_status_get(zone); 2733 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) { 2734 /* 2735 * For all practical purposes the zone doesn't exist. 2736 */ 2737 mutex_exit(&zonehash_lock); 2738 return (NULL); 2739 } 2740 zone_hold(zone); 2741 mutex_exit(&zonehash_lock); 2742 return (zone); 2743 } 2744 2745 /* 2746 * Similar to zone_find_by_id, but using zone label as the key. 2747 */ 2748 zone_t * 2749 zone_find_by_label(const ts_label_t *label) 2750 { 2751 zone_t *zone; 2752 zone_status_t status; 2753 2754 mutex_enter(&zonehash_lock); 2755 if ((zone = zone_find_all_by_label(label)) == NULL) { 2756 mutex_exit(&zonehash_lock); 2757 return (NULL); 2758 } 2759 2760 status = zone_status_get(zone); 2761 if (status > ZONE_IS_DOWN) { 2762 /* 2763 * For all practical purposes the zone doesn't exist. 2764 */ 2765 mutex_exit(&zonehash_lock); 2766 return (NULL); 2767 } 2768 zone_hold(zone); 2769 mutex_exit(&zonehash_lock); 2770 return (zone); 2771 } 2772 2773 /* 2774 * Similar to zone_find_by_id, but using zone name as the key. 
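 *
 * Illustrative lookup/release pairing (a sketch; "myzone" is a made-up
 * name -- the hold returned must be dropped with zone_rele()):
 *
 *	zone_t *zp;
 *
 *	if ((zp = zone_find_by_name("myzone")) != NULL) {
 *		... use zp; the hold keeps the zone_t from being freed ...
 *		zone_rele(zp);
 *	}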
2775 */ 2776 zone_t * 2777 zone_find_by_name(char *name) 2778 { 2779 zone_t *zone; 2780 zone_status_t status; 2781 2782 mutex_enter(&zonehash_lock); 2783 if ((zone = zone_find_all_by_name(name)) == NULL) { 2784 mutex_exit(&zonehash_lock); 2785 return (NULL); 2786 } 2787 status = zone_status_get(zone); 2788 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) { 2789 /* 2790 * For all practical purposes the zone doesn't exist. 2791 */ 2792 mutex_exit(&zonehash_lock); 2793 return (NULL); 2794 } 2795 zone_hold(zone); 2796 mutex_exit(&zonehash_lock); 2797 return (zone); 2798 } 2799 2800 /* 2801 * Similar to zone_find_by_id(), using the path as a key. For instance, 2802 * if there is a zone "foo" rooted at /foo/root, and the path argument 2803 * is "/foo/root/proc", it will return the held zone_t corresponding to 2804 * zone "foo". 2805 * 2806 * zone_find_by_path() always returns a non-NULL value, since at the 2807 * very least every path will be contained in the global zone. 2808 * 2809 * As with the other zone_find_by_*() functions, the caller is 2810 * responsible for zone_rele()ing the return value of this function. 2811 */ 2812 zone_t * 2813 zone_find_by_path(const char *path) 2814 { 2815 zone_t *zone; 2816 zone_t *zret = NULL; 2817 zone_status_t status; 2818 2819 if (path == NULL) { 2820 /* 2821 * Call from rootconf(). 2822 */ 2823 zone_hold(global_zone); 2824 return (global_zone); 2825 } 2826 ASSERT(*path == '/'); 2827 mutex_enter(&zonehash_lock); 2828 for (zone = list_head(&zone_active); zone != NULL; 2829 zone = list_next(&zone_active, zone)) { 2830 if (ZONE_PATH_VISIBLE(path, zone)) 2831 zret = zone; 2832 } 2833 ASSERT(zret != NULL); 2834 status = zone_status_get(zret); 2835 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) { 2836 /* 2837 * Zone practically doesn't exist. 2838 */ 2839 zret = global_zone; 2840 } 2841 zone_hold(zret); 2842 mutex_exit(&zonehash_lock); 2843 return (zret); 2844 } 2845 2846 /* 2847 * Get the number of cpus visible to this zone. The system-wide global 2848 * 'ncpus' is returned if pools are disabled, the caller is in the 2849 * global zone, or a NULL zone argument is passed in. 2850 */ 2851 int 2852 zone_ncpus_get(zone_t *zone) 2853 { 2854 int myncpus = zone == NULL ? 0 : zone->zone_ncpus; 2855 2856 return (myncpus != 0 ? myncpus : ncpus); 2857 } 2858 2859 /* 2860 * Get the number of online cpus visible to this zone. The system-wide 2861 * global 'ncpus_online' is returned if pools are disabled, the caller 2862 * is in the global zone, or a NULL zone argument is passed in. 2863 */ 2864 int 2865 zone_ncpus_online_get(zone_t *zone) 2866 { 2867 int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online; 2868 2869 return (myncpus_online != 0 ? myncpus_online : ncpus_online); 2870 } 2871 2872 /* 2873 * Return the pool to which the zone is currently bound. 2874 */ 2875 pool_t * 2876 zone_pool_get(zone_t *zone) 2877 { 2878 ASSERT(pool_lock_held()); 2879 2880 return (zone->zone_pool); 2881 } 2882 2883 /* 2884 * Set the zone's pool pointer and update the zone's visibility to match 2885 * the resources in the new pool. 2886 */ 2887 void 2888 zone_pool_set(zone_t *zone, pool_t *pool) 2889 { 2890 ASSERT(pool_lock_held()); 2891 ASSERT(MUTEX_HELD(&cpu_lock)); 2892 2893 zone->zone_pool = pool; 2894 zone_pset_set(zone, pool->pool_pset->pset_id); 2895 } 2896 2897 /* 2898 * Return the cached value of the id of the processor set to which the 2899 * zone is currently bound. The value will be ZONE_PS_INVAL if the pools 2900 * facility is disabled. 
2901 */ 2902 psetid_t 2903 zone_pset_get(zone_t *zone) 2904 { 2905 ASSERT(MUTEX_HELD(&cpu_lock)); 2906 2907 return (zone->zone_psetid); 2908 } 2909 2910 /* 2911 * Set the cached value of the id of the processor set to which the zone 2912 * is currently bound. Also update the zone's visibility to match the 2913 * resources in the new processor set. 2914 */ 2915 void 2916 zone_pset_set(zone_t *zone, psetid_t newpsetid) 2917 { 2918 psetid_t oldpsetid; 2919 2920 ASSERT(MUTEX_HELD(&cpu_lock)); 2921 oldpsetid = zone_pset_get(zone); 2922 2923 if (oldpsetid == newpsetid) 2924 return; 2925 /* 2926 * Global zone sees all. 2927 */ 2928 if (zone != global_zone) { 2929 zone->zone_psetid = newpsetid; 2930 if (newpsetid != ZONE_PS_INVAL) 2931 pool_pset_visibility_add(newpsetid, zone); 2932 if (oldpsetid != ZONE_PS_INVAL) 2933 pool_pset_visibility_remove(oldpsetid, zone); 2934 } 2935 /* 2936 * Disabling pools, so we should start using the global values 2937 * for ncpus and ncpus_online. 2938 */ 2939 if (newpsetid == ZONE_PS_INVAL) { 2940 zone->zone_ncpus = 0; 2941 zone->zone_ncpus_online = 0; 2942 } 2943 } 2944 2945 /* 2946 * Walk the list of active zones and issue the provided callback for 2947 * each of them. 2948 * 2949 * Caller must not be holding any locks that may be acquired under 2950 * zonehash_lock. See comment at the beginning of the file for a list of 2951 * common locks and their interactions with zones. 2952 */ 2953 int 2954 zone_walk(int (*cb)(zone_t *, void *), void *data) 2955 { 2956 zone_t *zone; 2957 int ret = 0; 2958 zone_status_t status; 2959 2960 mutex_enter(&zonehash_lock); 2961 for (zone = list_head(&zone_active); zone != NULL; 2962 zone = list_next(&zone_active, zone)) { 2963 /* 2964 * Skip zones that shouldn't be externally visible. 2965 */ 2966 status = zone_status_get(zone); 2967 if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) 2968 continue; 2969 /* 2970 * Bail immediately if any callback invocation returns a 2971 * non-zero value. 2972 */ 2973 ret = (*cb)(zone, data); 2974 if (ret != 0) 2975 break; 2976 } 2977 mutex_exit(&zonehash_lock); 2978 return (ret); 2979 } 2980 2981 static int 2982 zone_set_root(zone_t *zone, const char *upath) 2983 { 2984 vnode_t *vp; 2985 int trycount; 2986 int error = 0; 2987 char *path; 2988 struct pathname upn, pn; 2989 size_t pathlen; 2990 2991 if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0) 2992 return (error); 2993 2994 pn_alloc(&pn); 2995 2996 /* prevent infinite loop */ 2997 trycount = 10; 2998 for (;;) { 2999 if (--trycount <= 0) { 3000 error = ESTALE; 3001 goto out; 3002 } 3003 3004 if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) { 3005 /* 3006 * VOP_ACCESS() may cover 'vp' with a new 3007 * filesystem, if 'vp' is an autoFS vnode. 3008 * Get the new 'vp' if so. 3009 */ 3010 if ((error = 3011 VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 && 3012 (!vn_ismntpt(vp) || 3013 (error = traverse(&vp)) == 0)) { 3014 pathlen = pn.pn_pathlen + 2; 3015 path = kmem_alloc(pathlen, KM_SLEEP); 3016 (void) strncpy(path, pn.pn_path, 3017 pn.pn_pathlen + 1); 3018 path[pathlen - 2] = '/'; 3019 path[pathlen - 1] = '\0'; 3020 pn_free(&pn); 3021 pn_free(&upn); 3022 3023 /* Success! 
*/ 3024 break; 3025 } 3026 VN_RELE(vp); 3027 } 3028 if (error != ESTALE) 3029 goto out; 3030 } 3031 3032 ASSERT(error == 0); 3033 zone->zone_rootvp = vp; /* we hold a reference to vp */ 3034 zone->zone_rootpath = path; 3035 zone->zone_rootpathlen = pathlen; 3036 if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0) 3037 zone->zone_flags |= ZF_IS_SCRATCH; 3038 return (0); 3039 3040 out: 3041 pn_free(&pn); 3042 pn_free(&upn); 3043 return (error); 3044 } 3045 3046 #define isalnum(c) (((c) >= '0' && (c) <= '9') || \ 3047 ((c) >= 'a' && (c) <= 'z') || \ 3048 ((c) >= 'A' && (c) <= 'Z')) 3049 3050 static int 3051 zone_set_name(zone_t *zone, const char *uname) 3052 { 3053 char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP); 3054 size_t len; 3055 int i, err; 3056 3057 if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) { 3058 kmem_free(kname, ZONENAME_MAX); 3059 return (err); /* EFAULT or ENAMETOOLONG */ 3060 } 3061 3062 /* must be less than ZONENAME_MAX */ 3063 if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') { 3064 kmem_free(kname, ZONENAME_MAX); 3065 return (EINVAL); 3066 } 3067 3068 /* 3069 * Name must start with an alphanumeric and must contain only 3070 * alphanumerics, '-', '_' and '.'. 3071 */ 3072 if (!isalnum(kname[0])) { 3073 kmem_free(kname, ZONENAME_MAX); 3074 return (EINVAL); 3075 } 3076 for (i = 1; i < len - 1; i++) { 3077 if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' && 3078 kname[i] != '.') { 3079 kmem_free(kname, ZONENAME_MAX); 3080 return (EINVAL); 3081 } 3082 } 3083 3084 zone->zone_name = kname; 3085 return (0); 3086 } 3087 3088 /* 3089 * Gets the 32-bit hostid of the specified zone as an unsigned int. If 'zonep' 3090 * is NULL or it points to a zone with no hostid emulation, then the machine's 3091 * hostid (i.e., the global zone's hostid) is returned. This function returns 3092 * zero if neither the zone nor the host machine (global zone) have hostids. It 3093 * returns HW_INVALID_HOSTID if the function attempts to return the machine's 3094 * hostid and the machine's hostid is invalid. 3095 */ 3096 uint32_t 3097 zone_get_hostid(zone_t *zonep) 3098 { 3099 unsigned long machine_hostid; 3100 3101 if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) { 3102 if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0) 3103 return (HW_INVALID_HOSTID); 3104 return ((uint32_t)machine_hostid); 3105 } 3106 return (zonep->zone_hostid); 3107 } 3108 3109 /* 3110 * Similar to thread_create(), but makes sure the thread is in the appropriate 3111 * zone's zsched process (curproc->p_zone->zone_zsched) before returning. 3112 */ 3113 /*ARGSUSED*/ 3114 kthread_t * 3115 zthread_create( 3116 caddr_t stk, 3117 size_t stksize, 3118 void (*proc)(), 3119 void *arg, 3120 size_t len, 3121 pri_t pri) 3122 { 3123 kthread_t *t; 3124 zone_t *zone = curproc->p_zone; 3125 proc_t *pp = zone->zone_zsched; 3126 3127 zone_hold(zone); /* Reference to be dropped when thread exits */ 3128 3129 /* 3130 * No-one should be trying to create threads if the zone is shutting 3131 * down and there aren't any kernel threads around. See comment 3132 * in zthread_exit(). 3133 */ 3134 ASSERT(!(zone->zone_kthreads == NULL && 3135 zone_status_get(zone) >= ZONE_IS_EMPTY)); 3136 /* 3137 * Create a thread, but don't let it run until we've finished setting 3138 * things up. 
3139 */ 3140 t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri); 3141 ASSERT(t->t_forw == NULL); 3142 mutex_enter(&zone_status_lock); 3143 if (zone->zone_kthreads == NULL) { 3144 t->t_forw = t->t_back = t; 3145 } else { 3146 kthread_t *tx = zone->zone_kthreads; 3147 3148 t->t_forw = tx; 3149 t->t_back = tx->t_back; 3150 tx->t_back->t_forw = t; 3151 tx->t_back = t; 3152 } 3153 zone->zone_kthreads = t; 3154 mutex_exit(&zone_status_lock); 3155 3156 mutex_enter(&pp->p_lock); 3157 t->t_proc_flag |= TP_ZTHREAD; 3158 project_rele(t->t_proj); 3159 t->t_proj = project_hold(pp->p_task->tk_proj); 3160 3161 /* 3162 * Setup complete, let it run. 3163 */ 3164 thread_lock(t); 3165 t->t_schedflag |= TS_ALLSTART; 3166 setrun_locked(t); 3167 thread_unlock(t); 3168 3169 mutex_exit(&pp->p_lock); 3170 3171 return (t); 3172 } 3173 3174 /* 3175 * Similar to thread_exit(). Must be called by threads created via 3176 * zthread_create(). 3177 */ 3178 void 3179 zthread_exit(void) 3180 { 3181 kthread_t *t = curthread; 3182 proc_t *pp = curproc; 3183 zone_t *zone = pp->p_zone; 3184 3185 mutex_enter(&zone_status_lock); 3186 3187 /* 3188 * Reparent to p0 3189 */ 3190 kpreempt_disable(); 3191 mutex_enter(&pp->p_lock); 3192 t->t_proc_flag &= ~TP_ZTHREAD; 3193 t->t_procp = &p0; 3194 hat_thread_exit(t); 3195 mutex_exit(&pp->p_lock); 3196 kpreempt_enable(); 3197 3198 if (t->t_back == t) { 3199 ASSERT(t->t_forw == t); 3200 /* 3201 * If the zone is empty, once the thread count 3202 * goes to zero no further kernel threads can be 3203 * created. This is because if the creator is a process 3204 * in the zone, then it must have exited before the zone 3205 * state could be set to ZONE_IS_EMPTY. 3206 * Otherwise, if the creator is a kernel thread in the 3207 * zone, the thread count is non-zero. 3208 * 3209 * This really means that non-zone kernel threads should 3210 * not create zone kernel threads. 3211 */ 3212 zone->zone_kthreads = NULL; 3213 if (zone_status_get(zone) == ZONE_IS_EMPTY) { 3214 zone_status_set(zone, ZONE_IS_DOWN); 3215 /* 3216 * Remove any CPU caps on this zone. 3217 */ 3218 cpucaps_zone_remove(zone); 3219 } 3220 } else { 3221 t->t_forw->t_back = t->t_back; 3222 t->t_back->t_forw = t->t_forw; 3223 if (zone->zone_kthreads == t) 3224 zone->zone_kthreads = t->t_forw; 3225 } 3226 mutex_exit(&zone_status_lock); 3227 zone_rele(zone); 3228 thread_exit(); 3229 /* NOTREACHED */ 3230 } 3231 3232 static void 3233 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp) 3234 { 3235 vnode_t *oldvp; 3236 3237 /* we're going to hold a reference here to the directory */ 3238 VN_HOLD(vp); 3239 3240 /* update abs cwd/root path see c2/audit.c */ 3241 if (AU_AUDITING()) 3242 audit_chdirec(vp, vpp); 3243 3244 mutex_enter(&pp->p_lock); 3245 oldvp = *vpp; 3246 *vpp = vp; 3247 mutex_exit(&pp->p_lock); 3248 if (oldvp != NULL) 3249 VN_RELE(oldvp); 3250 } 3251 3252 /* 3253 * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
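 *
 * For example (illustrative values only), a single privileged deny limit
 * of 1024 would arrive as the three uint64 pairs:
 *
 *	("privilege", RCPRIV_PRIVILEGED)
 *	("limit", 1024)
 *	("action", RCTL_LOCAL_DENY)
 *
 * Any other pair name or type, or a missing member, is rejected below
 * with EINVAL.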
3254 */ 3255 static int 3256 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv) 3257 { 3258 nvpair_t *nvp = NULL; 3259 boolean_t priv_set = B_FALSE; 3260 boolean_t limit_set = B_FALSE; 3261 boolean_t action_set = B_FALSE; 3262 3263 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 3264 const char *name; 3265 uint64_t ui64; 3266 3267 name = nvpair_name(nvp); 3268 if (nvpair_type(nvp) != DATA_TYPE_UINT64) 3269 return (EINVAL); 3270 (void) nvpair_value_uint64(nvp, &ui64); 3271 if (strcmp(name, "privilege") == 0) { 3272 /* 3273 * Currently only privileged values are allowed, but 3274 * this may change in the future. 3275 */ 3276 if (ui64 != RCPRIV_PRIVILEGED) 3277 return (EINVAL); 3278 rv->rcv_privilege = ui64; 3279 priv_set = B_TRUE; 3280 } else if (strcmp(name, "limit") == 0) { 3281 rv->rcv_value = ui64; 3282 limit_set = B_TRUE; 3283 } else if (strcmp(name, "action") == 0) { 3284 if (ui64 != RCTL_LOCAL_NOACTION && 3285 ui64 != RCTL_LOCAL_DENY) 3286 return (EINVAL); 3287 rv->rcv_flagaction = ui64; 3288 action_set = B_TRUE; 3289 } else { 3290 return (EINVAL); 3291 } 3292 } 3293 3294 if (!(priv_set && limit_set && action_set)) 3295 return (EINVAL); 3296 rv->rcv_action_signal = 0; 3297 rv->rcv_action_recipient = NULL; 3298 rv->rcv_action_recip_pid = -1; 3299 rv->rcv_firing_time = 0; 3300 3301 return (0); 3302 } 3303 3304 /* 3305 * Non-global zone version of start_init. 3306 */ 3307 void 3308 zone_start_init(void) 3309 { 3310 proc_t *p = ttoproc(curthread); 3311 zone_t *z = p->p_zone; 3312 3313 ASSERT(!INGLOBALZONE(curproc)); 3314 3315 /* 3316 * For all purposes (ZONE_ATTR_INITPID and restart_init), 3317 * storing just the pid of init is sufficient. 3318 */ 3319 z->zone_proc_initpid = p->p_pid; 3320 3321 /* 3322 * We maintain zone_boot_err so that we can return the cause of the 3323 * failure back to the caller of the zone_boot syscall. 3324 */ 3325 p->p_zone->zone_boot_err = start_init_common(); 3326 3327 /* 3328 * We will prevent booting zones from becoming running zones if the 3329 * global zone is shutting down. 3330 */ 3331 mutex_enter(&zone_status_lock); 3332 if (z->zone_boot_err != 0 || zone_status_get(global_zone) >= 3333 ZONE_IS_SHUTTING_DOWN) { 3334 /* 3335 * Make sure we are still in the booting state-- we could have 3336 * raced and already be shutting down, or even further along. 3337 */ 3338 if (zone_status_get(z) == ZONE_IS_BOOTING) { 3339 zone_status_set(z, ZONE_IS_SHUTTING_DOWN); 3340 } 3341 mutex_exit(&zone_status_lock); 3342 /* It's gone bad, dispose of the process */ 3343 if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) { 3344 mutex_enter(&p->p_lock); 3345 ASSERT(p->p_flag & SEXITLWPS); 3346 lwp_exit(); 3347 } 3348 } else { 3349 if (zone_status_get(z) == ZONE_IS_BOOTING) 3350 zone_status_set(z, ZONE_IS_RUNNING); 3351 mutex_exit(&zone_status_lock); 3352 /* cause the process to return to userland. */ 3353 lwp_rtt(); 3354 } 3355 } 3356 3357 struct zsched_arg { 3358 zone_t *zone; 3359 nvlist_t *nvlist; 3360 }; 3361 3362 /* 3363 * Per-zone "sched" workalike. The similarity to "sched" doesn't have 3364 * anything to do with scheduling, but rather with the fact that 3365 * per-zone kernel threads are parented to zsched, just like regular 3366 * kernel threads are parented to sched (p0). 3367 * 3368 * zsched is also responsible for launching init for the zone. 
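 *
 * Pictorially (an illustrative sketch of the parenting relationships):
 *
 *	sched (p0)	parents ordinary kernel threads
 *	zsched		parents that zone's kernel threads (zthread_create())
 *
 * and since the zone's init is started from zsched context, the zone's
 * process tree hangs off zsched as well.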
3369 */ 3370 static void 3371 zsched(void *arg) 3372 { 3373 struct zsched_arg *za = arg; 3374 proc_t *pp = curproc; 3375 proc_t *initp = proc_init; 3376 zone_t *zone = za->zone; 3377 cred_t *cr, *oldcred; 3378 rctl_set_t *set; 3379 rctl_alloc_gp_t *gp; 3380 contract_t *ct = NULL; 3381 task_t *tk, *oldtk; 3382 rctl_entity_p_t e; 3383 kproject_t *pj; 3384 3385 nvlist_t *nvl = za->nvlist; 3386 nvpair_t *nvp = NULL; 3387 3388 bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched")); 3389 bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched")); 3390 PTOU(pp)->u_argc = 0; 3391 PTOU(pp)->u_argv = NULL; 3392 PTOU(pp)->u_envp = NULL; 3393 closeall(P_FINFO(pp)); 3394 3395 /* 3396 * We are this zone's "zsched" process. As the zone isn't generally 3397 * visible yet we don't need to grab any locks before initializing its 3398 * zone_proc pointer. 3399 */ 3400 zone_hold(zone); /* this hold is released by zone_destroy() */ 3401 zone->zone_zsched = pp; 3402 mutex_enter(&pp->p_lock); 3403 pp->p_zone = zone; 3404 mutex_exit(&pp->p_lock); 3405 3406 /* 3407 * Disassociate process from its 'parent'; parent ourselves to init 3408 * (pid 1) and change other values as needed. 3409 */ 3410 sess_create(); 3411 3412 mutex_enter(&pidlock); 3413 proc_detach(pp); 3414 pp->p_ppid = 1; 3415 pp->p_flag |= SZONETOP; 3416 pp->p_ancpid = 1; 3417 pp->p_parent = initp; 3418 pp->p_psibling = NULL; 3419 if (initp->p_child) 3420 initp->p_child->p_psibling = pp; 3421 pp->p_sibling = initp->p_child; 3422 initp->p_child = pp; 3423 3424 /* Decrement what newproc() incremented. */ 3425 upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID); 3426 /* 3427 * Our credentials are about to become kcred-like, so we don't care 3428 * about the caller's ruid. 3429 */ 3430 upcount_inc(crgetruid(kcred), zone->zone_id); 3431 mutex_exit(&pidlock); 3432 3433 /* 3434 * getting out of global zone, so decrement lwp and process counts 3435 */ 3436 pj = pp->p_task->tk_proj; 3437 mutex_enter(&global_zone->zone_nlwps_lock); 3438 pj->kpj_nlwps -= pp->p_lwpcnt; 3439 global_zone->zone_nlwps -= pp->p_lwpcnt; 3440 pj->kpj_nprocs--; 3441 global_zone->zone_nprocs--; 3442 mutex_exit(&global_zone->zone_nlwps_lock); 3443 3444 /* 3445 * Decrement locked memory counts on old zone and project. 3446 */ 3447 mutex_enter(&global_zone->zone_mem_lock); 3448 global_zone->zone_locked_mem -= pp->p_locked_mem; 3449 pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem; 3450 mutex_exit(&global_zone->zone_mem_lock); 3451 3452 /* 3453 * Create and join a new task in project '0' of this zone. 3454 * 3455 * We don't need to call holdlwps() since we know we're the only lwp in 3456 * this process. 3457 * 3458 * task_join() returns with p_lock held. 3459 */ 3460 tk = task_create(0, zone); 3461 mutex_enter(&cpu_lock); 3462 oldtk = task_join(tk, 0); 3463 3464 pj = pp->p_task->tk_proj; 3465 3466 mutex_enter(&zone->zone_mem_lock); 3467 zone->zone_locked_mem += pp->p_locked_mem; 3468 pj->kpj_data.kpd_locked_mem += pp->p_locked_mem; 3469 mutex_exit(&zone->zone_mem_lock); 3470 3471 /* 3472 * add lwp and process counts to zsched's zone, and increment 3473 * project's task and process count due to the task created in 3474 * the above task_create. 
3475 */ 3476 mutex_enter(&zone->zone_nlwps_lock); 3477 pj->kpj_nlwps += pp->p_lwpcnt; 3478 pj->kpj_ntasks += 1; 3479 zone->zone_nlwps += pp->p_lwpcnt; 3480 pj->kpj_nprocs++; 3481 zone->zone_nprocs++; 3482 mutex_exit(&zone->zone_nlwps_lock); 3483 3484 mutex_exit(&curproc->p_lock); 3485 mutex_exit(&cpu_lock); 3486 task_rele(oldtk); 3487 3488 /* 3489 * The process was created by a process in the global zone, hence the 3490 * credentials are wrong. We might as well have kcred-ish credentials. 3491 */ 3492 cr = zone->zone_kcred; 3493 crhold(cr); 3494 mutex_enter(&pp->p_crlock); 3495 oldcred = pp->p_cred; 3496 pp->p_cred = cr; 3497 mutex_exit(&pp->p_crlock); 3498 crfree(oldcred); 3499 3500 /* 3501 * Hold credentials again (for thread) 3502 */ 3503 crhold(cr); 3504 3505 /* 3506 * p_lwpcnt can't change since this is a kernel process. 3507 */ 3508 crset(pp, cr); 3509 3510 /* 3511 * Chroot 3512 */ 3513 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp); 3514 zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp); 3515 3516 /* 3517 * Initialize zone's rctl set. 3518 */ 3519 set = rctl_set_create(); 3520 gp = rctl_set_init_prealloc(RCENTITY_ZONE); 3521 mutex_enter(&pp->p_lock); 3522 e.rcep_p.zone = zone; 3523 e.rcep_t = RCENTITY_ZONE; 3524 zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp); 3525 mutex_exit(&pp->p_lock); 3526 rctl_prealloc_destroy(gp); 3527 3528 /* 3529 * Apply the rctls passed in to zone_create(). This is basically a list 3530 * assignment: all of the old values are removed and the new ones 3531 * inserted. That is, if an empty list is passed in, all values are 3532 * removed. 3533 */ 3534 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 3535 rctl_dict_entry_t *rde; 3536 rctl_hndl_t hndl; 3537 char *name; 3538 nvlist_t **nvlarray; 3539 uint_t i, nelem; 3540 int error; /* For ASSERT()s */ 3541 3542 name = nvpair_name(nvp); 3543 hndl = rctl_hndl_lookup(name); 3544 ASSERT(hndl != -1); 3545 rde = rctl_dict_lookup_hndl(hndl); 3546 ASSERT(rde != NULL); 3547 3548 for (; /* ever */; ) { 3549 rctl_val_t oval; 3550 3551 mutex_enter(&pp->p_lock); 3552 error = rctl_local_get(hndl, NULL, &oval, pp); 3553 mutex_exit(&pp->p_lock); 3554 ASSERT(error == 0); /* Can't fail for RCTL_FIRST */ 3555 ASSERT(oval.rcv_privilege != RCPRIV_BASIC); 3556 if (oval.rcv_privilege == RCPRIV_SYSTEM) 3557 break; 3558 mutex_enter(&pp->p_lock); 3559 error = rctl_local_delete(hndl, &oval, pp); 3560 mutex_exit(&pp->p_lock); 3561 ASSERT(error == 0); 3562 } 3563 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 3564 ASSERT(error == 0); 3565 for (i = 0; i < nelem; i++) { 3566 rctl_val_t *nvalp; 3567 3568 nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); 3569 error = nvlist2rctlval(nvlarray[i], nvalp); 3570 ASSERT(error == 0); 3571 /* 3572 * rctl_local_insert can fail if the value being 3573 * inserted is a duplicate; this is OK. 3574 */ 3575 mutex_enter(&pp->p_lock); 3576 if (rctl_local_insert(hndl, nvalp, pp) != 0) 3577 kmem_cache_free(rctl_val_cache, nvalp); 3578 mutex_exit(&pp->p_lock); 3579 } 3580 } 3581 /* 3582 * Tell the world that we're done setting up. 3583 * 3584 * At this point we want to set the zone status to ZONE_IS_INITIALIZED 3585 * and atomically set the zone's processor set visibility. Once 3586 * we drop pool_lock() this zone will automatically get updated 3587 * to reflect any future changes to the pools configuration. 
3588 * 3589 * Note that after we drop the locks below (zonehash_lock in 3590 * particular) other operations such as a zone_getattr call can 3591 * now proceed and observe the zone. That is the reason for doing a 3592 * state transition to the INITIALIZED state. 3593 */ 3594 pool_lock(); 3595 mutex_enter(&cpu_lock); 3596 mutex_enter(&zonehash_lock); 3597 zone_uniqid(zone); 3598 zone_zsd_configure(zone); 3599 if (pool_state == POOL_ENABLED) 3600 zone_pset_set(zone, pool_default->pool_pset->pset_id); 3601 mutex_enter(&zone_status_lock); 3602 ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED); 3603 zone_status_set(zone, ZONE_IS_INITIALIZED); 3604 mutex_exit(&zone_status_lock); 3605 mutex_exit(&zonehash_lock); 3606 mutex_exit(&cpu_lock); 3607 pool_unlock(); 3608 3609 /* Now call the create callback for this key */ 3610 zsd_apply_all_keys(zsd_apply_create, zone); 3611 3612 /* The callbacks are complete. Mark ZONE_IS_READY */ 3613 mutex_enter(&zone_status_lock); 3614 ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED); 3615 zone_status_set(zone, ZONE_IS_READY); 3616 mutex_exit(&zone_status_lock); 3617 3618 /* 3619 * Once we see the zone transition to the ZONE_IS_BOOTING state, 3620 * we launch init, and set the state to running. 3621 */ 3622 zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched"); 3623 3624 if (zone_status_get(zone) == ZONE_IS_BOOTING) { 3625 id_t cid; 3626 3627 /* 3628 * Ok, this is a little complicated. We need to grab the 3629 * zone's pool's scheduling class ID; note that by now, we 3630 * are already bound to a pool if we need to be (zoneadmd 3631 * will have done that to us while we're in the READY 3632 * state). *But* the scheduling class for the zone's 'init' 3633 * must be explicitly passed to newproc, which doesn't 3634 * respect pool bindings. 3635 * 3636 * We hold the pool_lock across the call to newproc() to 3637 * close the obvious race: the pool's scheduling class 3638 * could change before we manage to create the LWP with 3639 * classid 'cid'. 3640 */ 3641 pool_lock(); 3642 if (zone->zone_defaultcid > 0) 3643 cid = zone->zone_defaultcid; 3644 else 3645 cid = pool_get_class(zone->zone_pool); 3646 if (cid == -1) 3647 cid = defaultcid; 3648 3649 /* 3650 * If this fails, zone_boot will ultimately fail. The 3651 * state of the zone will be set to SHUTTING_DOWN-- userland 3652 * will have to tear down the zone, and fail, or try again. 3653 */ 3654 if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid, 3655 minclsyspri - 1, &ct, 0)) != 0) { 3656 mutex_enter(&zone_status_lock); 3657 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); 3658 mutex_exit(&zone_status_lock); 3659 } 3660 pool_unlock(); 3661 } 3662 3663 /* 3664 * Wait for zone_destroy() to be called. This is what we spend 3665 * most of our life doing. 3666 */ 3667 zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched"); 3668 3669 if (ct) 3670 /* 3671 * At this point the process contract should be empty. 3672 * (Though if it isn't, it's not the end of the world.) 3673 */ 3674 VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0); 3675 3676 /* 3677 * Allow kcred to be freed when all referring processes 3678 * (including this one) go away. We can't just do this in 3679 * zone_free because we need to wait for the zone_cred_ref to 3680 * drop to 0 before calling zone_free, and the existence of 3681 * zone_kcred will prevent that. Thus, we call crfree here to 3682 * balance the crdup in zone_create. The crhold calls earlier 3683 * in zsched will be dropped when the thread and process exit. 
3684 */ 3685 crfree(zone->zone_kcred); 3686 zone->zone_kcred = NULL; 3687 3688 exit(CLD_EXITED, 0); 3689 } 3690 3691 /* 3692 * Helper function to determine if there are any submounts of the 3693 * provided path. Used to make sure the zone doesn't "inherit" any 3694 * mounts from before it is created. 3695 */ 3696 static uint_t 3697 zone_mount_count(const char *rootpath) 3698 { 3699 vfs_t *vfsp; 3700 uint_t count = 0; 3701 size_t rootpathlen = strlen(rootpath); 3702 3703 /* 3704 * Holding zonehash_lock prevents race conditions with 3705 * vfs_list_add()/vfs_list_remove() since we serialize with 3706 * zone_find_by_path(). 3707 */ 3708 ASSERT(MUTEX_HELD(&zonehash_lock)); 3709 /* 3710 * The rootpath must end with a '/' 3711 */ 3712 ASSERT(rootpath[rootpathlen - 1] == '/'); 3713 3714 /* 3715 * This intentionally does not count the rootpath itself if that 3716 * happens to be a mount point. 3717 */ 3718 vfs_list_read_lock(); 3719 vfsp = rootvfs; 3720 do { 3721 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt), 3722 rootpathlen) == 0) 3723 count++; 3724 vfsp = vfsp->vfs_next; 3725 } while (vfsp != rootvfs); 3726 vfs_list_unlock(); 3727 return (count); 3728 } 3729 3730 /* 3731 * Helper function to make sure that a zone created on 'rootpath' 3732 * wouldn't end up containing other zones' rootpaths. 3733 */ 3734 static boolean_t 3735 zone_is_nested(const char *rootpath) 3736 { 3737 zone_t *zone; 3738 size_t rootpathlen = strlen(rootpath); 3739 size_t len; 3740 3741 ASSERT(MUTEX_HELD(&zonehash_lock)); 3742 3743 /* 3744 * zone_set_root() appended '/' and '\0' at the end of rootpath 3745 */ 3746 if ((rootpathlen <= 3) && (rootpath[0] == '/') && 3747 (rootpath[1] == '/') && (rootpath[2] == '\0')) 3748 return (B_TRUE); 3749 3750 for (zone = list_head(&zone_active); zone != NULL; 3751 zone = list_next(&zone_active, zone)) { 3752 if (zone == global_zone) 3753 continue; 3754 len = strlen(zone->zone_rootpath); 3755 if (strncmp(rootpath, zone->zone_rootpath, 3756 MIN(rootpathlen, len)) == 0) 3757 return (B_TRUE); 3758 } 3759 return (B_FALSE); 3760 } 3761 3762 static int 3763 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs, 3764 size_t zone_privssz) 3765 { 3766 priv_set_t *privs; 3767 3768 /* validate the size before allocating, so a short buffer can't leak */ 3769 if (zone_privssz < sizeof (priv_set_t)) 3770 return (ENOMEM); 3771 privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP); 3772 if (copyin(zone_privs, privs, sizeof (priv_set_t))) { 3773 kmem_free(privs, sizeof (priv_set_t)); 3774 return (EFAULT); 3775 } 3776 3777 zone->zone_privset = privs; 3778 return (0); 3779 } 3780 /* 3781 * We make creative use of nvlists to pass in rctls from userland.
The list is 3782 * a list of the following structures: 3783 * 3784 * (name = rctl_name, value = nvpair_list_array) 3785 * 3786 * Where each element of the nvpair_list_array is of the form: 3787 * 3788 * [(name = "privilege", value = RCPRIV_PRIVILEGED), 3789 * (name = "limit", value = uint64_t), 3790 * (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))] 3791 */ 3792 static int 3793 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) 3794 { 3795 nvpair_t *nvp = NULL; 3796 nvlist_t *nvl = NULL; 3797 char *kbuf; 3798 int error; 3799 rctl_val_t rv; 3800 3801 *nvlp = NULL; 3802 3803 if (buflen == 0) 3804 return (0); 3805 3806 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 3807 return (ENOMEM); 3808 if (copyin(ubuf, kbuf, buflen)) { 3809 error = EFAULT; 3810 goto out; 3811 } 3812 if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) { 3813 /* 3814 * nvl may have been allocated/free'd, but the value set to 3815 * non-NULL, so we reset it here. 3816 */ 3817 nvl = NULL; 3818 error = EINVAL; 3819 goto out; 3820 } 3821 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 3822 rctl_dict_entry_t *rde; 3823 rctl_hndl_t hndl; 3824 nvlist_t **nvlarray; 3825 uint_t i, nelem; 3826 char *name; 3827 3828 error = EINVAL; 3829 name = nvpair_name(nvp); 3830 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) 3831 != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { 3832 goto out; 3833 } 3834 if ((hndl = rctl_hndl_lookup(name)) == -1) { 3835 goto out; 3836 } 3837 rde = rctl_dict_lookup_hndl(hndl); 3838 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 3839 ASSERT(error == 0); 3840 for (i = 0; i < nelem; i++) { 3841 if (error = nvlist2rctlval(nvlarray[i], &rv)) 3842 goto out; 3843 } 3844 if (rctl_invalid_value(rde, &rv)) { 3845 error = EINVAL; 3846 goto out; 3847 } 3848 } 3849 error = 0; 3850 *nvlp = nvl; 3851 out: 3852 kmem_free(kbuf, buflen); 3853 if (error && nvl != NULL) 3854 nvlist_free(nvl); 3855 return (error); 3856 } 3857 3858 int 3859 zone_create_error(int er_error, int er_ext, int *er_out) { 3860 if (er_out != NULL) { 3861 if (copyout(&er_ext, er_out, sizeof (int))) { 3862 return (set_errno(EFAULT)); 3863 } 3864 } 3865 return (set_errno(er_error)); 3866 } 3867 3868 static int 3869 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi) 3870 { 3871 ts_label_t *tsl; 3872 bslabel_t blab; 3873 3874 /* Get label from user */ 3875 if (copyin(lab, &blab, sizeof (blab)) != 0) 3876 return (EFAULT); 3877 tsl = labelalloc(&blab, doi, KM_NOSLEEP); 3878 if (tsl == NULL) 3879 return (ENOMEM); 3880 3881 zone->zone_slabel = tsl; 3882 return (0); 3883 } 3884 3885 /* 3886 * Parses a comma-separated list of ZFS datasets into a per-zone dictionary. 
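 * For example, the buffer "tank/zone1,tank/shared" yields two
 * zone_dataset_t entries, "tank/zone1" and "tank/shared", on
 * zone->zone_datasets (in reverse order, since each entry is inserted
 * at the head of the list).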
3887 */ 3888 static int 3889 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen) 3890 { 3891 char *kbuf; 3892 char *dataset, *next; 3893 zone_dataset_t *zd; 3894 size_t len; 3895 3896 if (ubuf == NULL || buflen == 0) 3897 return (0); 3898 3899 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 3900 return (ENOMEM); 3901 3902 if (copyin(ubuf, kbuf, buflen) != 0) { 3903 kmem_free(kbuf, buflen); 3904 return (EFAULT); 3905 } 3906 3907 dataset = next = kbuf; 3908 for (;;) { 3909 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP); 3910 3911 next = strchr(dataset, ','); 3912 3913 if (next == NULL) 3914 len = strlen(dataset); 3915 else 3916 len = next - dataset; 3917 3918 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP); 3919 bcopy(dataset, zd->zd_dataset, len); 3920 zd->zd_dataset[len] = '\0'; 3921 3922 list_insert_head(&zone->zone_datasets, zd); 3923 3924 if (next == NULL) 3925 break; 3926 3927 dataset = next + 1; 3928 } 3929 3930 kmem_free(kbuf, buflen); 3931 return (0); 3932 } 3933 3934 /* 3935 * System call to create/initialize a new zone named 'zone_name', rooted 3936 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs', 3937 * and initialized with the zone-wide rctls described in 'rctlbuf', and 3938 * with labeling set by 'match', 'doi', and 'label'. 3939 * 3940 * If extended error is non-null, we may use it to return more detailed 3941 * error information. 3942 */ 3943 static zoneid_t 3944 zone_create(const char *zone_name, const char *zone_root, 3945 const priv_set_t *zone_privs, size_t zone_privssz, 3946 caddr_t rctlbuf, size_t rctlbufsz, 3947 caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, 3948 int match, uint32_t doi, const bslabel_t *label, 3949 int flags) 3950 { 3951 struct zsched_arg zarg; 3952 nvlist_t *rctls = NULL; 3953 proc_t *pp = curproc; 3954 zone_t *zone, *ztmp; 3955 zoneid_t zoneid; 3956 int error; 3957 int error2 = 0; 3958 char *str; 3959 cred_t *zkcr; 3960 boolean_t insert_label_hash; 3961 3962 if (secpolicy_zone_config(CRED()) != 0) 3963 return (set_errno(EPERM)); 3964 3965 /* can't boot zone from within chroot environment */ 3966 if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir) 3967 return (zone_create_error(ENOTSUP, ZE_CHROOTED, 3968 extended_error)); 3969 3970 zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); 3971 zoneid = zone->zone_id = id_alloc(zoneid_space); 3972 zone->zone_status = ZONE_IS_UNINITIALIZED; 3973 zone->zone_pool = pool_default; 3974 zone->zone_pool_mod = gethrtime(); 3975 zone->zone_psetid = ZONE_PS_INVAL; 3976 zone->zone_ncpus = 0; 3977 zone->zone_ncpus_online = 0; 3978 zone->zone_restart_init = B_TRUE; 3979 zone->zone_brand = &native_brand; 3980 zone->zone_initname = NULL; 3981 mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); 3982 mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); 3983 mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); 3984 cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); 3985 list_create(&zone->zone_zsd, sizeof (struct zsd_entry), 3986 offsetof(struct zsd_entry, zsd_linkage)); 3987 list_create(&zone->zone_datasets, sizeof (zone_dataset_t), 3988 offsetof(zone_dataset_t, zd_linkage)); 3989 list_create(&zone->zone_dl_list, sizeof (zone_dl_t), 3990 offsetof(zone_dl_t, zdl_linkage)); 3991 rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL); 3992 rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL); 3993 3994 if (flags & ZCF_NET_EXCL) { 3995 zone->zone_flags |= ZF_NET_EXCL; 3996 } 3997 3998 if ((error = zone_set_name(zone, zone_name)) != 0) { 3999 
zone_free(zone); 4000 return (zone_create_error(error, 0, extended_error)); 4001 } 4002 4003 if ((error = zone_set_root(zone, zone_root)) != 0) { 4004 zone_free(zone); 4005 return (zone_create_error(error, 0, extended_error)); 4006 } 4007 if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) { 4008 zone_free(zone); 4009 return (zone_create_error(error, 0, extended_error)); 4010 } 4011 4012 /* initialize node name to be the same as zone name */ 4013 zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP); 4014 (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN); 4015 zone->zone_nodename[_SYS_NMLN - 1] = '\0'; 4016 4017 zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP); 4018 zone->zone_domain[0] = '\0'; 4019 zone->zone_hostid = HW_INVALID_HOSTID; 4020 zone->zone_shares = 1; 4021 zone->zone_shmmax = 0; 4022 zone->zone_ipc.ipcq_shmmni = 0; 4023 zone->zone_ipc.ipcq_semmni = 0; 4024 zone->zone_ipc.ipcq_msgmni = 0; 4025 zone->zone_bootargs = NULL; 4026 zone->zone_fs_allowed = NULL; 4027 zone->zone_initname = 4028 kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP); 4029 (void) strcpy(zone->zone_initname, zone_default_initname); 4030 zone->zone_nlwps = 0; 4031 zone->zone_nlwps_ctl = INT_MAX; 4032 zone->zone_nprocs = 0; 4033 zone->zone_nprocs_ctl = INT_MAX; 4034 zone->zone_locked_mem = 0; 4035 zone->zone_locked_mem_ctl = UINT64_MAX; 4036 zone->zone_max_swap = 0; 4037 zone->zone_max_swap_ctl = UINT64_MAX; 4038 zone->zone_max_lofi = 0; 4039 zone->zone_max_lofi_ctl = UINT64_MAX; 4040 zone0.zone_lockedmem_kstat = NULL; 4041 zone0.zone_swapresv_kstat = NULL; 4042 4043 /* 4044 * Zsched initializes the rctls. 4045 */ 4046 zone->zone_rctls = NULL; 4047 4048 if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { 4049 zone_free(zone); 4050 return (zone_create_error(error, 0, extended_error)); 4051 } 4052 4053 if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) { 4054 zone_free(zone); 4055 return (set_errno(error)); 4056 } 4057 4058 /* 4059 * Read in the trusted system parameters: 4060 * match flag and sensitivity label. 4061 */ 4062 zone->zone_match = match; 4063 if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) { 4064 /* Fail if requested to set doi to anything but system's doi */ 4065 if (doi != 0 && doi != default_doi) { 4066 zone_free(zone); 4067 return (set_errno(EINVAL)); 4068 } 4069 /* Always apply system's doi to the zone */ 4070 error = zone_set_label(zone, label, default_doi); 4071 if (error != 0) { 4072 zone_free(zone); 4073 return (set_errno(error)); 4074 } 4075 insert_label_hash = B_TRUE; 4076 } else { 4077 /* all zones get an admin_low label if system is not labeled */ 4078 zone->zone_slabel = l_admin_low; 4079 label_hold(l_admin_low); 4080 insert_label_hash = B_FALSE; 4081 } 4082 4083 /* 4084 * Stop all lwps since that's what normally happens as part of fork(). 4085 * This needs to happen before we grab any locks to avoid deadlock 4086 * (another lwp in the process could be waiting for the held lock). 4087 */ 4088 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) { 4089 zone_free(zone); 4090 if (rctls) 4091 nvlist_free(rctls); 4092 return (zone_create_error(error, 0, extended_error)); 4093 } 4094 4095 if (block_mounts() == 0) { 4096 mutex_enter(&pp->p_lock); 4097 if (curthread != pp->p_agenttp) 4098 continuelwps(pp); 4099 mutex_exit(&pp->p_lock); 4100 zone_free(zone); 4101 if (rctls) 4102 nvlist_free(rctls); 4103 return (zone_create_error(error, 0, extended_error)); 4104 } 4105 4106 /* 4107 * Set up credential for kernel access. 
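 * The zone_kcred is a zone-limited copy of kcred: its privilege sets are
 * intersected with the zone's limit set just below, so kernel threads
 * acting on the zone's behalf can never exceed the zone's privileges.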
After this, any errors 4108 * should go through the dance in errout rather than calling 4109 * zone_free directly. 4110 */ 4111 zone->zone_kcred = crdup(kcred); 4112 crsetzone(zone->zone_kcred, zone); 4113 priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred)); 4114 priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred)); 4115 priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred)); 4116 priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred)); 4117 4118 mutex_enter(&zonehash_lock); 4119 /* 4120 * Make sure zone doesn't already exist. 4121 * 4122 * If the system and zone are labeled, 4123 * make sure no other zone exists that has the same label. 4124 */ 4125 if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL || 4126 (insert_label_hash && 4127 (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) { 4128 zone_status_t status; 4129 4130 status = zone_status_get(ztmp); 4131 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING) 4132 error = EEXIST; 4133 else 4134 error = EBUSY; 4135 4136 if (insert_label_hash) 4137 error2 = ZE_LABELINUSE; 4138 4139 goto errout; 4140 } 4141 4142 /* 4143 * Don't allow zone creations which would cause one zone's rootpath to 4144 * be accessible from that of another (non-global) zone. 4145 */ 4146 if (zone_is_nested(zone->zone_rootpath)) { 4147 error = EBUSY; 4148 goto errout; 4149 } 4150 4151 ASSERT(zonecount != 0); /* check for leaks */ 4152 if (zonecount + 1 > maxzones) { 4153 error = ENOMEM; 4154 goto errout; 4155 } 4156 4157 if (zone_mount_count(zone->zone_rootpath) != 0) { 4158 error = EBUSY; 4159 error2 = ZE_AREMOUNTS; 4160 goto errout; 4161 } 4162 4163 /* 4164 * Zone is still incomplete, but we need to drop all locks while 4165 * zsched() initializes this zone's kernel process. We 4166 * optimistically add the zone to the hashtable and associated 4167 * lists so a parallel zone_create() doesn't try to create the 4168 * same zone. 4169 */ 4170 zonecount++; 4171 (void) mod_hash_insert(zonehashbyid, 4172 (mod_hash_key_t)(uintptr_t)zone->zone_id, 4173 (mod_hash_val_t)(uintptr_t)zone); 4174 str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP); 4175 (void) strcpy(str, zone->zone_name); 4176 (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str, 4177 (mod_hash_val_t)(uintptr_t)zone); 4178 if (insert_label_hash) { 4179 (void) mod_hash_insert(zonehashbylabel, 4180 (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone); 4181 zone->zone_flags |= ZF_HASHED_LABEL; 4182 } 4183 4184 /* 4185 * Insert into active list. At this point there are no 'hold's 4186 * on the zone, but everyone else knows not to use it, so we can 4187 * continue to use it. zsched() will do a zone_hold() if the 4188 * newproc() is successful. 4189 */ 4190 list_insert_tail(&zone_active, zone); 4191 mutex_exit(&zonehash_lock); 4192 4193 zarg.zone = zone; 4194 zarg.nvlist = rctls; 4195 /* 4196 * The process, task, and project rctls are probably wrong; 4197 * we need an interface to get the default values of all rctls, 4198 * and initialize zsched appropriately. I'm not sure that that 4199 * makes much of a difference, though. 4200 */ 4201 error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0); 4202 if (error != 0) { 4203 /* 4204 * We need to undo all globally visible state. 
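 * That is, take the zone back out of both hash tables and the active
 * list, and give back its slot in zonecount, before bailing out through
 * errout.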
4205 */ 4206 mutex_enter(&zonehash_lock); 4207 list_remove(&zone_active, zone); 4208 if (zone->zone_flags & ZF_HASHED_LABEL) { 4209 ASSERT(zone->zone_slabel != NULL); 4210 (void) mod_hash_destroy(zonehashbylabel, 4211 (mod_hash_key_t)zone->zone_slabel); 4212 } 4213 (void) mod_hash_destroy(zonehashbyname, 4214 (mod_hash_key_t)(uintptr_t)zone->zone_name); 4215 (void) mod_hash_destroy(zonehashbyid, 4216 (mod_hash_key_t)(uintptr_t)zone->zone_id); 4217 ASSERT(zonecount > 1); 4218 zonecount--; 4219 goto errout; 4220 } 4221 4222 /* 4223 * Zone creation can't fail from now on. 4224 */ 4225 4226 /* 4227 * Create zone kstats 4228 */ 4229 zone_kstat_create(zone); 4230 4231 /* 4232 * Let the other lwps continue. 4233 */ 4234 mutex_enter(&pp->p_lock); 4235 if (curthread != pp->p_agenttp) 4236 continuelwps(pp); 4237 mutex_exit(&pp->p_lock); 4238 4239 /* 4240 * Wait for zsched to finish initializing the zone. 4241 */ 4242 zone_status_wait(zone, ZONE_IS_READY); 4243 /* 4244 * The zone is fully visible, so we can let mounts progress. 4245 */ 4246 resume_mounts(); 4247 if (rctls) 4248 nvlist_free(rctls); 4249 4250 return (zoneid); 4251 4252 errout: 4253 mutex_exit(&zonehash_lock); 4254 /* 4255 * Let the other lwps continue. 4256 */ 4257 mutex_enter(&pp->p_lock); 4258 if (curthread != pp->p_agenttp) 4259 continuelwps(pp); 4260 mutex_exit(&pp->p_lock); 4261 4262 resume_mounts(); 4263 if (rctls) 4264 nvlist_free(rctls); 4265 /* 4266 * There is currently one reference to the zone, a cred_ref from 4267 * zone_kcred. To free the zone, we call crfree, which will call 4268 * zone_cred_rele, which will call zone_free. 4269 */ 4270 ASSERT(zone->zone_cred_ref == 1); /* for zone_kcred */ 4271 ASSERT(zone->zone_kcred->cr_ref == 1); 4272 ASSERT(zone->zone_ref == 0); 4273 zkcr = zone->zone_kcred; 4274 zone->zone_kcred = NULL; 4275 crfree(zkcr); /* triggers call to zone_free */ 4276 return (zone_create_error(error, error2, extended_error)); 4277 } 4278 4279 /* 4280 * Cause the zone to boot. This is pretty simple, since we let zoneadmd do 4281 * the heavy lifting. initname is the path to the program to launch 4282 * at the "top" of the zone; if this is NULL, we use the system default, 4283 * which is stored at zone_default_initname. 4284 */ 4285 static int 4286 zone_boot(zoneid_t zoneid) 4287 { 4288 int err; 4289 zone_t *zone; 4290 4291 if (secpolicy_zone_config(CRED()) != 0) 4292 return (set_errno(EPERM)); 4293 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 4294 return (set_errno(EINVAL)); 4295 4296 mutex_enter(&zonehash_lock); 4297 /* 4298 * Look for zone under hash lock to prevent races with calls to 4299 * zone_shutdown, zone_destroy, etc. 4300 */ 4301 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 4302 mutex_exit(&zonehash_lock); 4303 return (set_errno(EINVAL)); 4304 } 4305 4306 mutex_enter(&zone_status_lock); 4307 if (zone_status_get(zone) != ZONE_IS_READY) { 4308 mutex_exit(&zone_status_lock); 4309 mutex_exit(&zonehash_lock); 4310 return (set_errno(EINVAL)); 4311 } 4312 zone_status_set(zone, ZONE_IS_BOOTING); 4313 mutex_exit(&zone_status_lock); 4314 4315 zone_hold(zone); /* so we can use the zone_t later */ 4316 mutex_exit(&zonehash_lock); 4317 4318 if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) { 4319 zone_rele(zone); 4320 return (set_errno(EINTR)); 4321 } 4322 4323 /* 4324 * Boot (starting init) might have failed, in which case the zone 4325 * will go to the SHUTTING_DOWN state; an appropriate errno will 4326 * be placed in zone->zone_boot_err, and so we return that. 
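 * A zone_boot_err of zero means init was launched successfully and the
 * zone made it to the ZONE_IS_RUNNING state.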
4327 */
4328 err = zone->zone_boot_err;
4329 zone_rele(zone);
4330 return (err ? set_errno(err) : 0);
4331 }
4332 
4333 /*
4334 * Kills all user processes in the zone, waiting for them all to exit
4335 * before returning.
4336 */
4337 static int
4338 zone_empty(zone_t *zone)
4339 {
4340 int waitstatus;
4341 
4342 /*
4343 * We need to drop zonehash_lock before killing all
4344 * processes, otherwise we'll deadlock with zone_find_*
4345 * which can be called from the exit path.
4346 */
4347 ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4348 while ((waitstatus = zone_status_timedwait_sig(zone,
4349 ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4350 killall(zone->zone_id);
4351 }
4352 /*
4353 * return EINTR if we were signaled
4354 */
4355 if (waitstatus == 0)
4356 return (EINTR);
4357 return (0);
4358 }
4359 
4360 /*
4361 * This function implements the policy for zone visibility.
4362 *
4363 * In standard Solaris, a non-global zone can only see itself.
4364 *
4365 * In Trusted Extensions, a labeled zone can look up any zone whose label
4366 * it dominates. For this test, the label of the global zone is treated as
4367 * admin_high so it is special-cased instead of being checked for dominance.
4368 *
4369 * Returns true if zone attributes are viewable, false otherwise.
4370 */
4371 static boolean_t
4372 zone_list_access(zone_t *zone)
4373 {
4374 
4375 if (curproc->p_zone == global_zone ||
4376 curproc->p_zone == zone) {
4377 return (B_TRUE);
4378 } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4379 bslabel_t *curproc_label;
4380 bslabel_t *zone_label;
4381 
4382 curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4383 zone_label = label2bslabel(zone->zone_slabel);
4384 
4385 if (zone->zone_id != GLOBAL_ZONEID &&
4386 bldominates(curproc_label, zone_label)) {
4387 return (B_TRUE);
4388 } else {
4389 return (B_FALSE);
4390 }
4391 } else {
4392 return (B_FALSE);
4393 }
4394 }
4395 
4396 /*
4397 * Systemcall to start the zone's halt sequence. By the time this
4398 * function successfully returns, all user processes and kernel threads
4399 * executing in it will have exited, ZSD shutdown callbacks executed,
4400 * and the zone status set to ZONE_IS_DOWN.
4401 *
4402 * It is possible that the call will interrupt itself if the caller is the
4403 * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4404 */
4405 static int
4406 zone_shutdown(zoneid_t zoneid)
4407 {
4408 int error;
4409 zone_t *zone;
4410 zone_status_t status;
4411 
4412 if (secpolicy_zone_config(CRED()) != 0)
4413 return (set_errno(EPERM));
4414 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4415 return (set_errno(EINVAL));
4416 
4417 /*
4418 * Block mounts so that VFS_MOUNT() can get an accurate view of
4419 * the zone's status with regard to ZONE_IS_SHUTTING_DOWN.
4420 *
4421 * e.g. NFS can fail the mount if it determines that the zone
4422 * has already begun the shutdown sequence.
4423 */
4424 if (block_mounts() == 0)
4425 return (set_errno(EINTR));
4426 mutex_enter(&zonehash_lock);
4427 /*
4428 * Look for zone under hash lock to prevent races with other
4429 * calls to zone_shutdown and zone_destroy.
4430 */
4431 if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4432 mutex_exit(&zonehash_lock);
4433 resume_mounts();
4434 return (set_errno(EINVAL));
4435 }
4436 mutex_enter(&zone_status_lock);
4437 status = zone_status_get(zone);
4438 /*
4439 * Fail if the zone isn't fully initialized yet.
4440 */
4441 if (status < ZONE_IS_READY) {
4442 mutex_exit(&zone_status_lock);
4443 mutex_exit(&zonehash_lock);
4444 resume_mounts();
4445 return (set_errno(EINVAL));
4446 }
4447 /*
4448 * If conditions required for zone_shutdown() to return have been met,
4449 * return success.
4450 */
4451 if (status >= ZONE_IS_DOWN) {
4452 mutex_exit(&zone_status_lock);
4453 mutex_exit(&zonehash_lock);
4454 resume_mounts();
4455 return (0);
4456 }
4457 /*
4458 * If zone_shutdown() hasn't been called before, go through the motions.
4459 * If it has, there's nothing to do but wait for the kernel threads to
4460 * drain.
4461 */
4462 if (status < ZONE_IS_EMPTY) {
4463 uint_t ntasks;
4464 
4465 mutex_enter(&zone->zone_lock);
4466 if ((ntasks = zone->zone_ntasks) != 1) {
4467 /*
4468 * There's still stuff running.
4469 */
4470 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4471 }
4472 mutex_exit(&zone->zone_lock);
4473 if (ntasks == 1) {
4474 /*
4475 * The only way to create another task is through
4476 * zone_enter(), which will block until we drop
4477 * zonehash_lock. The zone is empty.
4478 */
4479 if (zone->zone_kthreads == NULL) {
4480 /*
4481 * Skip ahead to ZONE_IS_DOWN
4482 */
4483 zone_status_set(zone, ZONE_IS_DOWN);
4484 } else {
4485 zone_status_set(zone, ZONE_IS_EMPTY);
4486 }
4487 }
4488 }
4489 zone_hold(zone); /* so we can use the zone_t later */
4490 mutex_exit(&zone_status_lock);
4491 mutex_exit(&zonehash_lock);
4492 resume_mounts();
4493 
4494 if ((error = zone_empty(zone)) != 0) {
4495 zone_rele(zone);
4496 return (set_errno(error));
4497 }
4498 /*
4499 * After the zone status goes to ZONE_IS_DOWN this zone will no
4500 * longer be notified of changes to the pools configuration, so
4501 * in order to not end up with a stale pool pointer, we point
4502 * ourselves at the default pool and remove all resource
4503 * visibility. This is especially important as the zone_t may
4504 * languish on the deathrow for a very long time waiting for
4505 * creds to drain out.
4506 *
4507 * This rebinding of the zone can happen multiple times
4508 * (presumably due to interrupted or parallel system calls)
4509 * without any adverse effects.
4510 */
4511 if (pool_lock_intr() != 0) {
4512 zone_rele(zone);
4513 return (set_errno(EINTR));
4514 }
4515 if (pool_state == POOL_ENABLED) {
4516 mutex_enter(&cpu_lock);
4517 zone_pool_set(zone, pool_default);
4518 /*
4519 * The zone no longer needs to be able to see any cpus.
4520 */
4521 zone_pset_set(zone, ZONE_PS_INVAL);
4522 mutex_exit(&cpu_lock);
4523 }
4524 pool_unlock();
4525 
4526 /*
4527 * ZSD shutdown callbacks can be executed multiple times, hence
4528 * it is safe to not be holding any locks across this call.
4529 */
4530 zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
4531 
4532 mutex_enter(&zone_status_lock);
4533 if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
4534 zone_status_set(zone, ZONE_IS_DOWN);
4535 mutex_exit(&zone_status_lock);
4536 
4537 /*
4538 * Wait for kernel threads to drain.
4539 */
4540 if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
4541 zone_rele(zone);
4542 return (set_errno(EINTR));
4543 }
4544 
4545 /*
4546 * The zone can become down/destroyable even if the above wait
4547 * returns EINTR, so any code added here may never execute.
4548 * (i.e. don't add code here)
4549 */
4550 
4551 zone_rele(zone);
4552 return (0);
4553 }
4554 
4555 /*
4556 * Systemcall entry point to finalize the zone halt process. The caller
4557 * must have already successfully called zone_shutdown().
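 * For reference, the full lifecycle as driven from the global zone is,
 * in sketch (in practice zoneadmd performs these steps):
 *
 *	zoneid = zone_create(...);	zone is ZONE_IS_READY
 *	(void) zone_boot(zoneid);	zone is ZONE_IS_RUNNING
 *	(void) zone_shutdown(zoneid);	zone is ZONE_IS_DOWN
 *	(void) zone_destroy(zoneid);	zone is gone, id reusable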
4558 * 4559 * Upon successful completion, the zone will have been fully destroyed: 4560 * zsched will have exited, destructor callbacks executed, and the zone 4561 * removed from the list of active zones. 4562 */ 4563 static int 4564 zone_destroy(zoneid_t zoneid) 4565 { 4566 uint64_t uniqid; 4567 zone_t *zone; 4568 zone_status_t status; 4569 4570 if (secpolicy_zone_config(CRED()) != 0) 4571 return (set_errno(EPERM)); 4572 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 4573 return (set_errno(EINVAL)); 4574 4575 mutex_enter(&zonehash_lock); 4576 /* 4577 * Look for zone under hash lock to prevent races with other 4578 * calls to zone_destroy. 4579 */ 4580 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 4581 mutex_exit(&zonehash_lock); 4582 return (set_errno(EINVAL)); 4583 } 4584 4585 if (zone_mount_count(zone->zone_rootpath) != 0) { 4586 mutex_exit(&zonehash_lock); 4587 return (set_errno(EBUSY)); 4588 } 4589 mutex_enter(&zone_status_lock); 4590 status = zone_status_get(zone); 4591 if (status < ZONE_IS_DOWN) { 4592 mutex_exit(&zone_status_lock); 4593 mutex_exit(&zonehash_lock); 4594 return (set_errno(EBUSY)); 4595 } else if (status == ZONE_IS_DOWN) { 4596 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */ 4597 } 4598 mutex_exit(&zone_status_lock); 4599 zone_hold(zone); 4600 mutex_exit(&zonehash_lock); 4601 4602 /* 4603 * wait for zsched to exit 4604 */ 4605 zone_status_wait(zone, ZONE_IS_DEAD); 4606 zone_zsd_callbacks(zone, ZSD_DESTROY); 4607 zone->zone_netstack = NULL; 4608 uniqid = zone->zone_uniqid; 4609 zone_rele(zone); 4610 zone = NULL; /* potentially free'd */ 4611 4612 mutex_enter(&zonehash_lock); 4613 for (; /* ever */; ) { 4614 boolean_t unref; 4615 4616 if ((zone = zone_find_all_by_id(zoneid)) == NULL || 4617 zone->zone_uniqid != uniqid) { 4618 /* 4619 * The zone has gone away. Necessary conditions 4620 * are met, so we return success. 4621 */ 4622 mutex_exit(&zonehash_lock); 4623 return (0); 4624 } 4625 mutex_enter(&zone->zone_lock); 4626 unref = ZONE_IS_UNREF(zone); 4627 mutex_exit(&zone->zone_lock); 4628 if (unref) { 4629 /* 4630 * There is only one reference to the zone -- that 4631 * added when the zone was added to the hashtables -- 4632 * and things will remain this way until we drop 4633 * zonehash_lock... we can go ahead and cleanup the 4634 * zone. 4635 */ 4636 break; 4637 } 4638 4639 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) { 4640 /* Signaled */ 4641 mutex_exit(&zonehash_lock); 4642 return (set_errno(EINTR)); 4643 } 4644 4645 } 4646 4647 /* 4648 * Remove CPU cap for this zone now since we're not going to 4649 * fail below this point. 4650 */ 4651 cpucaps_zone_remove(zone); 4652 4653 /* Get rid of the zone's kstats */ 4654 zone_kstat_delete(zone); 4655 4656 /* remove the pfexecd doors */ 4657 if (zone->zone_pfexecd != NULL) { 4658 klpd_freelist(&zone->zone_pfexecd); 4659 zone->zone_pfexecd = NULL; 4660 } 4661 4662 /* free brand specific data */ 4663 if (ZONE_IS_BRANDED(zone)) 4664 ZBROP(zone)->b_free_brand_data(zone); 4665 4666 /* Say goodbye to brand framework. */ 4667 brand_unregister_zone(zone->zone_brand); 4668 4669 /* 4670 * It is now safe to let the zone be recreated; remove it from the 4671 * lists. The memory will not be freed until the last cred 4672 * reference goes away. 
4673 */ 4674 ASSERT(zonecount > 1); /* must be > 1; can't destroy global zone */ 4675 zonecount--; 4676 /* remove from active list and hash tables */ 4677 list_remove(&zone_active, zone); 4678 (void) mod_hash_destroy(zonehashbyname, 4679 (mod_hash_key_t)zone->zone_name); 4680 (void) mod_hash_destroy(zonehashbyid, 4681 (mod_hash_key_t)(uintptr_t)zone->zone_id); 4682 if (zone->zone_flags & ZF_HASHED_LABEL) 4683 (void) mod_hash_destroy(zonehashbylabel, 4684 (mod_hash_key_t)zone->zone_slabel); 4685 mutex_exit(&zonehash_lock); 4686 4687 /* 4688 * Release the root vnode; we're not using it anymore. Nor should any 4689 * other thread that might access it exist. 4690 */ 4691 if (zone->zone_rootvp != NULL) { 4692 VN_RELE(zone->zone_rootvp); 4693 zone->zone_rootvp = NULL; 4694 } 4695 4696 /* add to deathrow list */ 4697 mutex_enter(&zone_deathrow_lock); 4698 list_insert_tail(&zone_deathrow, zone); 4699 mutex_exit(&zone_deathrow_lock); 4700 4701 /* 4702 * Drop last reference (which was added by zsched()), this will 4703 * free the zone unless there are outstanding cred references. 4704 */ 4705 zone_rele(zone); 4706 return (0); 4707 } 4708 4709 /* 4710 * Systemcall entry point for zone_getattr(2). 4711 */ 4712 static ssize_t 4713 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) 4714 { 4715 size_t size; 4716 int error = 0, err; 4717 zone_t *zone; 4718 char *zonepath; 4719 char *outstr; 4720 zone_status_t zone_status; 4721 pid_t initpid; 4722 boolean_t global = (curzone == global_zone); 4723 boolean_t inzone = (curzone->zone_id == zoneid); 4724 ushort_t flags; 4725 zone_net_data_t *zbuf; 4726 4727 mutex_enter(&zonehash_lock); 4728 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 4729 mutex_exit(&zonehash_lock); 4730 return (set_errno(EINVAL)); 4731 } 4732 zone_status = zone_status_get(zone); 4733 if (zone_status < ZONE_IS_INITIALIZED) { 4734 mutex_exit(&zonehash_lock); 4735 return (set_errno(EINVAL)); 4736 } 4737 zone_hold(zone); 4738 mutex_exit(&zonehash_lock); 4739 4740 /* 4741 * If not in the global zone, don't show information about other zones, 4742 * unless the system is labeled and the local zone's label dominates 4743 * the other zone. 4744 */ 4745 if (!zone_list_access(zone)) { 4746 zone_rele(zone); 4747 return (set_errno(EINVAL)); 4748 } 4749 4750 switch (attr) { 4751 case ZONE_ATTR_ROOT: 4752 if (global) { 4753 /* 4754 * Copy the path to trim the trailing "/" (except for 4755 * the global zone). 4756 */ 4757 if (zone != global_zone) 4758 size = zone->zone_rootpathlen - 1; 4759 else 4760 size = zone->zone_rootpathlen; 4761 zonepath = kmem_alloc(size, KM_SLEEP); 4762 bcopy(zone->zone_rootpath, zonepath, size); 4763 zonepath[size - 1] = '\0'; 4764 } else { 4765 if (inzone || !is_system_labeled()) { 4766 /* 4767 * Caller is not in the global zone. 4768 * if the query is on the current zone 4769 * or the system is not labeled, 4770 * just return faked-up path for current zone. 4771 */ 4772 zonepath = "/"; 4773 size = 2; 4774 } else { 4775 /* 4776 * Return related path for current zone. 
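 * (i.e. the zone_prefix string followed by the zone name, which is where
 * this zone's root appears to a labeled zone that dominates it).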
4777 */ 4778 int prefix_len = strlen(zone_prefix); 4779 int zname_len = strlen(zone->zone_name); 4780 4781 size = prefix_len + zname_len + 1; 4782 zonepath = kmem_alloc(size, KM_SLEEP); 4783 bcopy(zone_prefix, zonepath, prefix_len); 4784 bcopy(zone->zone_name, zonepath + 4785 prefix_len, zname_len); 4786 zonepath[size - 1] = '\0'; 4787 } 4788 } 4789 if (bufsize > size) 4790 bufsize = size; 4791 if (buf != NULL) { 4792 err = copyoutstr(zonepath, buf, bufsize, NULL); 4793 if (err != 0 && err != ENAMETOOLONG) 4794 error = EFAULT; 4795 } 4796 if (global || (is_system_labeled() && !inzone)) 4797 kmem_free(zonepath, size); 4798 break; 4799 4800 case ZONE_ATTR_NAME: 4801 size = strlen(zone->zone_name) + 1; 4802 if (bufsize > size) 4803 bufsize = size; 4804 if (buf != NULL) { 4805 err = copyoutstr(zone->zone_name, buf, bufsize, NULL); 4806 if (err != 0 && err != ENAMETOOLONG) 4807 error = EFAULT; 4808 } 4809 break; 4810 4811 case ZONE_ATTR_STATUS: 4812 /* 4813 * Since we're not holding zonehash_lock, the zone status 4814 * may be anything; leave it up to userland to sort it out. 4815 */ 4816 size = sizeof (zone_status); 4817 if (bufsize > size) 4818 bufsize = size; 4819 zone_status = zone_status_get(zone); 4820 if (buf != NULL && 4821 copyout(&zone_status, buf, bufsize) != 0) 4822 error = EFAULT; 4823 break; 4824 case ZONE_ATTR_FLAGS: 4825 size = sizeof (zone->zone_flags); 4826 if (bufsize > size) 4827 bufsize = size; 4828 flags = zone->zone_flags; 4829 if (buf != NULL && 4830 copyout(&flags, buf, bufsize) != 0) 4831 error = EFAULT; 4832 break; 4833 case ZONE_ATTR_PRIVSET: 4834 size = sizeof (priv_set_t); 4835 if (bufsize > size) 4836 bufsize = size; 4837 if (buf != NULL && 4838 copyout(zone->zone_privset, buf, bufsize) != 0) 4839 error = EFAULT; 4840 break; 4841 case ZONE_ATTR_UNIQID: 4842 size = sizeof (zone->zone_uniqid); 4843 if (bufsize > size) 4844 bufsize = size; 4845 if (buf != NULL && 4846 copyout(&zone->zone_uniqid, buf, bufsize) != 0) 4847 error = EFAULT; 4848 break; 4849 case ZONE_ATTR_POOLID: 4850 { 4851 pool_t *pool; 4852 poolid_t poolid; 4853 4854 if (pool_lock_intr() != 0) { 4855 error = EINTR; 4856 break; 4857 } 4858 pool = zone_pool_get(zone); 4859 poolid = pool->pool_id; 4860 pool_unlock(); 4861 size = sizeof (poolid); 4862 if (bufsize > size) 4863 bufsize = size; 4864 if (buf != NULL && copyout(&poolid, buf, size) != 0) 4865 error = EFAULT; 4866 } 4867 break; 4868 case ZONE_ATTR_SLBL: 4869 size = sizeof (bslabel_t); 4870 if (bufsize > size) 4871 bufsize = size; 4872 if (zone->zone_slabel == NULL) 4873 error = EINVAL; 4874 else if (buf != NULL && 4875 copyout(label2bslabel(zone->zone_slabel), buf, 4876 bufsize) != 0) 4877 error = EFAULT; 4878 break; 4879 case ZONE_ATTR_INITPID: 4880 size = sizeof (initpid); 4881 if (bufsize > size) 4882 bufsize = size; 4883 initpid = zone->zone_proc_initpid; 4884 if (initpid == -1) { 4885 error = ESRCH; 4886 break; 4887 } 4888 if (buf != NULL && 4889 copyout(&initpid, buf, bufsize) != 0) 4890 error = EFAULT; 4891 break; 4892 case ZONE_ATTR_BRAND: 4893 size = strlen(zone->zone_brand->b_name) + 1; 4894 4895 if (bufsize > size) 4896 bufsize = size; 4897 if (buf != NULL) { 4898 err = copyoutstr(zone->zone_brand->b_name, buf, 4899 bufsize, NULL); 4900 if (err != 0 && err != ENAMETOOLONG) 4901 error = EFAULT; 4902 } 4903 break; 4904 case ZONE_ATTR_INITNAME: 4905 size = strlen(zone->zone_initname) + 1; 4906 if (bufsize > size) 4907 bufsize = size; 4908 if (buf != NULL) { 4909 err = copyoutstr(zone->zone_initname, buf, bufsize, 4910 NULL); 4911 if (err 
!= 0 && err != ENAMETOOLONG) 4912 error = EFAULT; 4913 } 4914 break; 4915 case ZONE_ATTR_BOOTARGS: 4916 if (zone->zone_bootargs == NULL) 4917 outstr = ""; 4918 else 4919 outstr = zone->zone_bootargs; 4920 size = strlen(outstr) + 1; 4921 if (bufsize > size) 4922 bufsize = size; 4923 if (buf != NULL) { 4924 err = copyoutstr(outstr, buf, bufsize, NULL); 4925 if (err != 0 && err != ENAMETOOLONG) 4926 error = EFAULT; 4927 } 4928 break; 4929 case ZONE_ATTR_PHYS_MCAP: 4930 size = sizeof (zone->zone_phys_mcap); 4931 if (bufsize > size) 4932 bufsize = size; 4933 if (buf != NULL && 4934 copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) 4935 error = EFAULT; 4936 break; 4937 case ZONE_ATTR_SCHED_CLASS: 4938 mutex_enter(&class_lock); 4939 4940 if (zone->zone_defaultcid >= loaded_classes) 4941 outstr = ""; 4942 else 4943 outstr = sclass[zone->zone_defaultcid].cl_name; 4944 size = strlen(outstr) + 1; 4945 if (bufsize > size) 4946 bufsize = size; 4947 if (buf != NULL) { 4948 err = copyoutstr(outstr, buf, bufsize, NULL); 4949 if (err != 0 && err != ENAMETOOLONG) 4950 error = EFAULT; 4951 } 4952 4953 mutex_exit(&class_lock); 4954 break; 4955 case ZONE_ATTR_HOSTID: 4956 if (zone->zone_hostid != HW_INVALID_HOSTID && 4957 bufsize == sizeof (zone->zone_hostid)) { 4958 size = sizeof (zone->zone_hostid); 4959 if (buf != NULL && copyout(&zone->zone_hostid, buf, 4960 bufsize) != 0) 4961 error = EFAULT; 4962 } else { 4963 error = EINVAL; 4964 } 4965 break; 4966 case ZONE_ATTR_FS_ALLOWED: 4967 if (zone->zone_fs_allowed == NULL) 4968 outstr = ""; 4969 else 4970 outstr = zone->zone_fs_allowed; 4971 size = strlen(outstr) + 1; 4972 if (bufsize > size) 4973 bufsize = size; 4974 if (buf != NULL) { 4975 err = copyoutstr(outstr, buf, bufsize, NULL); 4976 if (err != 0 && err != ENAMETOOLONG) 4977 error = EFAULT; 4978 } 4979 break; 4980 case ZONE_ATTR_NETWORK: 4981 zbuf = kmem_alloc(bufsize, KM_SLEEP); 4982 if (copyin(buf, zbuf, bufsize) != 0) { 4983 error = EFAULT; 4984 } else { 4985 error = zone_get_network(zoneid, zbuf); 4986 if (error == 0 && copyout(zbuf, buf, bufsize) != 0) 4987 error = EFAULT; 4988 } 4989 kmem_free(zbuf, bufsize); 4990 break; 4991 default: 4992 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { 4993 size = bufsize; 4994 error = ZBROP(zone)->b_getattr(zone, attr, buf, &size); 4995 } else { 4996 error = EINVAL; 4997 } 4998 } 4999 zone_rele(zone); 5000 5001 if (error) 5002 return (set_errno(error)); 5003 return ((ssize_t)size); 5004 } 5005 5006 /* 5007 * Systemcall entry point for zone_setattr(2). 5008 */ 5009 /*ARGSUSED*/ 5010 static int 5011 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) 5012 { 5013 zone_t *zone; 5014 zone_status_t zone_status; 5015 int err; 5016 zone_net_data_t *zbuf; 5017 5018 if (secpolicy_zone_config(CRED()) != 0) 5019 return (set_errno(EPERM)); 5020 5021 /* 5022 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the 5023 * global zone. 5024 */ 5025 if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { 5026 return (set_errno(EINVAL)); 5027 } 5028 5029 mutex_enter(&zonehash_lock); 5030 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 5031 mutex_exit(&zonehash_lock); 5032 return (set_errno(EINVAL)); 5033 } 5034 zone_hold(zone); 5035 mutex_exit(&zonehash_lock); 5036 5037 /* 5038 * At present most attributes can only be set on non-running, 5039 * non-global zones. 
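 * ZONE_ATTR_PHYS_MCAP is the one exception: it may be set while the zone
 * is running, and (as checked above) it is also the only attribute that
 * may be set on the global zone.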
5040 */
5041 zone_status = zone_status_get(zone);
5042 if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
5043 err = EINVAL;
5044 goto done;
5045 }
5046 switch (attr) {
5047 case ZONE_ATTR_INITNAME:
5048 err = zone_set_initname(zone, (const char *)buf);
5049 break;
5050 case ZONE_ATTR_BOOTARGS:
5051 err = zone_set_bootargs(zone, (const char *)buf);
5052 break;
5053 case ZONE_ATTR_BRAND:
5054 err = zone_set_brand(zone, (const char *)buf);
5055 break;
5056 case ZONE_ATTR_FS_ALLOWED:
5057 err = zone_set_fs_allowed(zone, (const char *)buf);
5058 break;
5059 case ZONE_ATTR_PHYS_MCAP:
5060 err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5061 break;
5062 case ZONE_ATTR_SCHED_CLASS:
5063 err = zone_set_sched_class(zone, (const char *)buf);
5064 break;
5065 case ZONE_ATTR_HOSTID:
5066 if (bufsize == sizeof (zone->zone_hostid)) {
5067 if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5068 err = 0;
5069 else
5070 err = EFAULT;
5071 } else {
5072 err = EINVAL;
5073 }
5074 break;
5075 case ZONE_ATTR_NETWORK:
5076 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
5077 err = EINVAL;
5078 goto done;
5079 }
5080 zbuf = kmem_alloc(bufsize, KM_SLEEP);
5081 if (copyin(buf, zbuf, bufsize) != 0) {
5082 kmem_free(zbuf, bufsize);
5083 err = EFAULT;
5084 goto done;
5085 }
5086 err = zone_set_network(zoneid, zbuf);
5087 kmem_free(zbuf, bufsize);
5088 break;
5089 default:
5090 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5091 err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5092 else
5093 err = EINVAL;
5094 }
5095 done:
5096 zone_rele(zone);
5097 return (err != 0 ? set_errno(err) : 0);
5098 }
5099 
5100 /*
5101 * Return zero if the process has at least one vnode mapped into its
5102 * address space which shouldn't be allowed to change zones.
5103 *
5104 * Also return zero if the process has any shared mappings which reserve
5105 * swap, since the counting for zone.max-swap (zone->zone_max_swap) does
5106 * not allow a swap reservation to be shared between zones.
5107 */
5108 static int
5109 as_can_change_zones(void)
5110 {
5111 proc_t *pp = curproc;
5112 struct seg *seg;
5113 struct as *as = pp->p_as;
5114 vnode_t *vp;
5115 int allow = 1;
5116 
5117 ASSERT(pp->p_as != &kas);
5118 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
5119 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5120 
5121 /*
5122 * Cannot enter zone with shared anon memory which
5123 * reserves swap. See comment above.
5124 */
5125 if (seg_can_change_zones(seg) == B_FALSE) {
5126 allow = 0;
5127 break;
5128 }
5129 /*
5130 * If we can't get a backing vnode for this segment, then skip
5131 * it.
5132 */
5133 vp = NULL;
5134 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5135 continue;
5136 if (!vn_can_change_zones(vp)) { /* bail on first match */
5137 allow = 0;
5138 break;
5139 }
5140 }
5141 AS_LOCK_EXIT(as, &as->a_lock);
5142 return (allow);
5143 }
5144 
5145 /*
5146 * Count swap reserved by curproc's address space
5147 */
5148 static size_t
5149 as_swresv(void)
5150 {
5151 proc_t *pp = curproc;
5152 struct seg *seg;
5153 struct as *as = pp->p_as;
5154 size_t swap = 0;
5155 
5156 ASSERT(pp->p_as != &kas);
5157 ASSERT(AS_WRITE_HELD(as, &as->a_lock));
5158 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5159 swap += seg_swresv(seg);
5160 
5161 return (swap);
5162 }
5163 
5164 /*
5165 * Systemcall entry point for zone_enter().
5166 *
5167 * The current process is injected into said zone.
In the process 5168 * it will change its project membership, privileges, rootdir/cwd, 5169 * zone-wide rctls, and pool association to match those of the zone. 5170 * 5171 * The first zone_enter() called while the zone is in the ZONE_IS_READY 5172 * state will transition it to ZONE_IS_RUNNING. Processes may only 5173 * enter a zone that is "ready" or "running". 5174 */ 5175 static int 5176 zone_enter(zoneid_t zoneid) 5177 { 5178 zone_t *zone; 5179 vnode_t *vp; 5180 proc_t *pp = curproc; 5181 contract_t *ct; 5182 cont_process_t *ctp; 5183 task_t *tk, *oldtk; 5184 kproject_t *zone_proj0; 5185 cred_t *cr, *newcr; 5186 pool_t *oldpool, *newpool; 5187 sess_t *sp; 5188 uid_t uid; 5189 zone_status_t status; 5190 int err = 0; 5191 rctl_entity_p_t e; 5192 size_t swap; 5193 kthread_id_t t; 5194 5195 if (secpolicy_zone_config(CRED()) != 0) 5196 return (set_errno(EPERM)); 5197 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 5198 return (set_errno(EINVAL)); 5199 5200 /* 5201 * Stop all lwps so we don't need to hold a lock to look at 5202 * curproc->p_zone. This needs to happen before we grab any 5203 * locks to avoid deadlock (another lwp in the process could 5204 * be waiting for the held lock). 5205 */ 5206 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) 5207 return (set_errno(EINTR)); 5208 5209 /* 5210 * Make sure we're not changing zones with files open or mapped in 5211 * to our address space which shouldn't be changing zones. 5212 */ 5213 if (!files_can_change_zones()) { 5214 err = EBADF; 5215 goto out; 5216 } 5217 if (!as_can_change_zones()) { 5218 err = EFAULT; 5219 goto out; 5220 } 5221 5222 mutex_enter(&zonehash_lock); 5223 if (pp->p_zone != global_zone) { 5224 mutex_exit(&zonehash_lock); 5225 err = EINVAL; 5226 goto out; 5227 } 5228 5229 zone = zone_find_all_by_id(zoneid); 5230 if (zone == NULL) { 5231 mutex_exit(&zonehash_lock); 5232 err = EINVAL; 5233 goto out; 5234 } 5235 5236 /* 5237 * To prevent processes in a zone from holding contracts on 5238 * extrazonal resources, and to avoid process contract 5239 * memberships which span zones, contract holders and processes 5240 * which aren't the sole members of their encapsulating process 5241 * contracts are not allowed to zone_enter. 5242 */ 5243 ctp = pp->p_ct_process; 5244 ct = &ctp->conp_contract; 5245 mutex_enter(&ct->ct_lock); 5246 mutex_enter(&pp->p_lock); 5247 if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) { 5248 mutex_exit(&pp->p_lock); 5249 mutex_exit(&ct->ct_lock); 5250 mutex_exit(&zonehash_lock); 5251 err = EINVAL; 5252 goto out; 5253 } 5254 5255 /* 5256 * Moreover, we don't allow processes whose encapsulating 5257 * process contracts have inherited extrazonal contracts. 5258 * While it would be easier to eliminate all process contracts 5259 * with inherited contracts, we need to be able to give a 5260 * restarted init (or other zone-penetrating process) its 5261 * predecessor's contracts. 
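 * Hence the weaker test below: entry fails only if some inherited
 * contract belongs to a zone other than the one being entered.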
5262 */ 5263 if (ctp->conp_ninherited != 0) { 5264 contract_t *next; 5265 for (next = list_head(&ctp->conp_inherited); next; 5266 next = list_next(&ctp->conp_inherited, next)) { 5267 if (contract_getzuniqid(next) != zone->zone_uniqid) { 5268 mutex_exit(&pp->p_lock); 5269 mutex_exit(&ct->ct_lock); 5270 mutex_exit(&zonehash_lock); 5271 err = EINVAL; 5272 goto out; 5273 } 5274 } 5275 } 5276 5277 mutex_exit(&pp->p_lock); 5278 mutex_exit(&ct->ct_lock); 5279 5280 status = zone_status_get(zone); 5281 if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) { 5282 /* 5283 * Can't join 5284 */ 5285 mutex_exit(&zonehash_lock); 5286 err = EINVAL; 5287 goto out; 5288 } 5289 5290 /* 5291 * Make sure new priv set is within the permitted set for caller 5292 */ 5293 if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) { 5294 mutex_exit(&zonehash_lock); 5295 err = EPERM; 5296 goto out; 5297 } 5298 /* 5299 * We want to momentarily drop zonehash_lock while we optimistically 5300 * bind curproc to the pool it should be running in. This is safe 5301 * since the zone can't disappear (we have a hold on it). 5302 */ 5303 zone_hold(zone); 5304 mutex_exit(&zonehash_lock); 5305 5306 /* 5307 * Grab pool_lock to keep the pools configuration from changing 5308 * and to stop ourselves from getting rebound to another pool 5309 * until we join the zone. 5310 */ 5311 if (pool_lock_intr() != 0) { 5312 zone_rele(zone); 5313 err = EINTR; 5314 goto out; 5315 } 5316 ASSERT(secpolicy_pool(CRED()) == 0); 5317 /* 5318 * Bind ourselves to the pool currently associated with the zone. 5319 */ 5320 oldpool = curproc->p_pool; 5321 newpool = zone_pool_get(zone); 5322 if (pool_state == POOL_ENABLED && newpool != oldpool && 5323 (err = pool_do_bind(newpool, P_PID, P_MYID, 5324 POOL_BIND_ALL)) != 0) { 5325 pool_unlock(); 5326 zone_rele(zone); 5327 goto out; 5328 } 5329 5330 /* 5331 * Grab cpu_lock now; we'll need it later when we call 5332 * task_join(). 5333 */ 5334 mutex_enter(&cpu_lock); 5335 mutex_enter(&zonehash_lock); 5336 /* 5337 * Make sure the zone hasn't moved on since we dropped zonehash_lock. 5338 */ 5339 if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) { 5340 /* 5341 * Can't join anymore. 5342 */ 5343 mutex_exit(&zonehash_lock); 5344 mutex_exit(&cpu_lock); 5345 if (pool_state == POOL_ENABLED && 5346 newpool != oldpool) 5347 (void) pool_do_bind(oldpool, P_PID, P_MYID, 5348 POOL_BIND_ALL); 5349 pool_unlock(); 5350 zone_rele(zone); 5351 err = EINVAL; 5352 goto out; 5353 } 5354 5355 /* 5356 * a_lock must be held while transfering locked memory and swap 5357 * reservation from the global zone to the non global zone because 5358 * asynchronous faults on the processes' address space can lock 5359 * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE 5360 * segments respectively. 
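 * Holding a_lock as writer across the transfer keeps p_locked_mem and
 * the swap reservation stable, so the accounting moved between the two
 * zones below cannot change out from under us.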
5361 */ 5362 AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER); 5363 swap = as_swresv(); 5364 mutex_enter(&pp->p_lock); 5365 zone_proj0 = zone->zone_zsched->p_task->tk_proj; 5366 /* verify that we do not exceed and task or lwp limits */ 5367 mutex_enter(&zone->zone_nlwps_lock); 5368 /* add new lwps to zone and zone's proj0 */ 5369 zone_proj0->kpj_nlwps += pp->p_lwpcnt; 5370 zone->zone_nlwps += pp->p_lwpcnt; 5371 /* add 1 task to zone's proj0 */ 5372 zone_proj0->kpj_ntasks += 1; 5373 5374 zone_proj0->kpj_nprocs++; 5375 zone->zone_nprocs++; 5376 mutex_exit(&zone->zone_nlwps_lock); 5377 5378 mutex_enter(&zone->zone_mem_lock); 5379 zone->zone_locked_mem += pp->p_locked_mem; 5380 zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem; 5381 zone->zone_max_swap += swap; 5382 mutex_exit(&zone->zone_mem_lock); 5383 5384 mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock)); 5385 zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem; 5386 mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock)); 5387 5388 /* remove lwps and process from proc's old zone and old project */ 5389 mutex_enter(&pp->p_zone->zone_nlwps_lock); 5390 pp->p_zone->zone_nlwps -= pp->p_lwpcnt; 5391 pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt; 5392 pp->p_task->tk_proj->kpj_nprocs--; 5393 pp->p_zone->zone_nprocs--; 5394 mutex_exit(&pp->p_zone->zone_nlwps_lock); 5395 5396 mutex_enter(&pp->p_zone->zone_mem_lock); 5397 pp->p_zone->zone_locked_mem -= pp->p_locked_mem; 5398 pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem; 5399 pp->p_zone->zone_max_swap -= swap; 5400 mutex_exit(&pp->p_zone->zone_mem_lock); 5401 5402 mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock)); 5403 pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem; 5404 mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock)); 5405 5406 pp->p_flag |= SZONETOP; 5407 pp->p_zone = zone; 5408 mutex_exit(&pp->p_lock); 5409 AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock); 5410 5411 /* 5412 * Joining the zone cannot fail from now on. 5413 * 5414 * This means that a lot of the following code can be commonized and 5415 * shared with zsched(). 5416 */ 5417 5418 /* 5419 * If the process contract fmri was inherited, we need to 5420 * flag this so that any contract status will not leak 5421 * extra zone information, svc_fmri in this case 5422 */ 5423 if (ctp->conp_svc_ctid != ct->ct_id) { 5424 mutex_enter(&ct->ct_lock); 5425 ctp->conp_svc_zone_enter = ct->ct_id; 5426 mutex_exit(&ct->ct_lock); 5427 } 5428 5429 /* 5430 * Reset the encapsulating process contract's zone. 5431 */ 5432 ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID); 5433 contract_setzuniqid(ct, zone->zone_uniqid); 5434 5435 /* 5436 * Create a new task and associate the process with the project keyed 5437 * by (projid,zoneid). 5438 * 5439 * We might as well be in project 0; the global zone's projid doesn't 5440 * make much sense in a zone anyhow. 5441 * 5442 * This also increments zone_ntasks, and returns with p_lock held. 5443 */ 5444 tk = task_create(0, zone); 5445 oldtk = task_join(tk, 0); 5446 mutex_exit(&cpu_lock); 5447 5448 /* 5449 * call RCTLOP_SET functions on this proc 5450 */ 5451 e.rcep_p.zone = zone; 5452 e.rcep_t = RCENTITY_ZONE; 5453 (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL, 5454 RCD_CALLBACK); 5455 mutex_exit(&pp->p_lock); 5456 5457 /* 5458 * We don't need to hold any of zsched's locks here; not only do we know 5459 * the process and zone aren't going away, we know its session isn't 5460 * changing either. 
5461 * 5462 * By joining zsched's session here, we mimic the behavior in the 5463 * global zone of init's sid being the pid of sched. We extend this 5464 * to all zlogin-like zone_enter()'ing processes as well. 5465 */ 5466 mutex_enter(&pidlock); 5467 sp = zone->zone_zsched->p_sessp; 5468 sess_hold(zone->zone_zsched); 5469 mutex_enter(&pp->p_lock); 5470 pgexit(pp); 5471 sess_rele(pp->p_sessp, B_TRUE); 5472 pp->p_sessp = sp; 5473 pgjoin(pp, zone->zone_zsched->p_pidp); 5474 5475 /* 5476 * If any threads are scheduled to be placed on zone wait queue they 5477 * should abandon the idea since the wait queue is changing. 5478 * We need to be holding pidlock & p_lock to do this. 5479 */ 5480 if ((t = pp->p_tlist) != NULL) { 5481 do { 5482 thread_lock(t); 5483 /* 5484 * Kick this thread so that he doesn't sit 5485 * on a wrong wait queue. 5486 */ 5487 if (ISWAITING(t)) 5488 setrun_locked(t); 5489 5490 if (t->t_schedflag & TS_ANYWAITQ) 5491 t->t_schedflag &= ~ TS_ANYWAITQ; 5492 5493 thread_unlock(t); 5494 } while ((t = t->t_forw) != pp->p_tlist); 5495 } 5496 5497 /* 5498 * If there is a default scheduling class for the zone and it is not 5499 * the class we are currently in, change all of the threads in the 5500 * process to the new class. We need to be holding pidlock & p_lock 5501 * when we call parmsset so this is a good place to do it. 5502 */ 5503 if (zone->zone_defaultcid > 0 && 5504 zone->zone_defaultcid != curthread->t_cid) { 5505 pcparms_t pcparms; 5506 5507 pcparms.pc_cid = zone->zone_defaultcid; 5508 pcparms.pc_clparms[0] = 0; 5509 5510 /* 5511 * If setting the class fails, we still want to enter the zone. 5512 */ 5513 if ((t = pp->p_tlist) != NULL) { 5514 do { 5515 (void) parmsset(&pcparms, t); 5516 } while ((t = t->t_forw) != pp->p_tlist); 5517 } 5518 } 5519 5520 mutex_exit(&pp->p_lock); 5521 mutex_exit(&pidlock); 5522 5523 mutex_exit(&zonehash_lock); 5524 /* 5525 * We're firmly in the zone; let pools progress. 5526 */ 5527 pool_unlock(); 5528 task_rele(oldtk); 5529 /* 5530 * We don't need to retain a hold on the zone since we already 5531 * incremented zone_ntasks, so the zone isn't going anywhere. 5532 */ 5533 zone_rele(zone); 5534 5535 /* 5536 * Chroot 5537 */ 5538 vp = zone->zone_rootvp; 5539 zone_chdir(vp, &PTOU(pp)->u_cdir, pp); 5540 zone_chdir(vp, &PTOU(pp)->u_rdir, pp); 5541 5542 /* 5543 * Change process credentials 5544 */ 5545 newcr = cralloc(); 5546 mutex_enter(&pp->p_crlock); 5547 cr = pp->p_cred; 5548 crcopy_to(cr, newcr); 5549 crsetzone(newcr, zone); 5550 pp->p_cred = newcr; 5551 5552 /* 5553 * Restrict all process privilege sets to zone limit 5554 */ 5555 priv_intersect(zone->zone_privset, &CR_PPRIV(newcr)); 5556 priv_intersect(zone->zone_privset, &CR_EPRIV(newcr)); 5557 priv_intersect(zone->zone_privset, &CR_IPRIV(newcr)); 5558 priv_intersect(zone->zone_privset, &CR_LPRIV(newcr)); 5559 mutex_exit(&pp->p_crlock); 5560 crset(pp, newcr); 5561 5562 /* 5563 * Adjust upcount to reflect zone entry. 5564 */ 5565 uid = crgetruid(newcr); 5566 mutex_enter(&pidlock); 5567 upcount_dec(uid, GLOBAL_ZONEID); 5568 upcount_inc(uid, zoneid); 5569 mutex_exit(&pidlock); 5570 5571 /* 5572 * Set up core file path and content. 5573 */ 5574 set_core_defaults(); 5575 5576 out: 5577 /* 5578 * Let the other lwps continue. 5579 */ 5580 mutex_enter(&pp->p_lock); 5581 if (curthread != pp->p_agenttp) 5582 continuelwps(pp); 5583 mutex_exit(&pp->p_lock); 5584 5585 return (err != 0 ? set_errno(err) : 0); 5586 } 5587 5588 /* 5589 * Systemcall entry point for zone_list(2). 
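 * A caller that doesn't know how many zones are visible can size its
 * buffer with two calls, in sketch:
 *
 *	uint_t nzones = 0;
 *	(void) zone_list(NULL, &nzones);	(fetches the count)
 *	ids = malloc(nzones * sizeof (zoneid_t));
 *	(void) zone_list(ids, &nzones);		(fills in up to nzones IDs)
 *
 * keeping in mind that the set of visible zones can change between the
 * two calls.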
5590 * 5591 * Processes running in a (non-global) zone only see themselves. 5592 * On labeled systems, they see all zones whose label they dominate. 5593 */ 5594 static int 5595 zone_list(zoneid_t *zoneidlist, uint_t *numzones) 5596 { 5597 zoneid_t *zoneids; 5598 zone_t *zone, *myzone; 5599 uint_t user_nzones, real_nzones; 5600 uint_t domi_nzones; 5601 int error; 5602 5603 if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0) 5604 return (set_errno(EFAULT)); 5605 5606 myzone = curproc->p_zone; 5607 if (myzone != global_zone) { 5608 bslabel_t *mybslab; 5609 5610 if (!is_system_labeled()) { 5611 /* just return current zone */ 5612 real_nzones = domi_nzones = 1; 5613 zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP); 5614 zoneids[0] = myzone->zone_id; 5615 } else { 5616 /* return all zones that are dominated */ 5617 mutex_enter(&zonehash_lock); 5618 real_nzones = zonecount; 5619 domi_nzones = 0; 5620 if (real_nzones > 0) { 5621 zoneids = kmem_alloc(real_nzones * 5622 sizeof (zoneid_t), KM_SLEEP); 5623 mybslab = label2bslabel(myzone->zone_slabel); 5624 for (zone = list_head(&zone_active); 5625 zone != NULL; 5626 zone = list_next(&zone_active, zone)) { 5627 if (zone->zone_id == GLOBAL_ZONEID) 5628 continue; 5629 if (zone != myzone && 5630 (zone->zone_flags & ZF_IS_SCRATCH)) 5631 continue; 5632 /* 5633 * Note that a label always dominates 5634 * itself, so myzone is always included 5635 * in the list. 5636 */ 5637 if (bldominates(mybslab, 5638 label2bslabel(zone->zone_slabel))) { 5639 zoneids[domi_nzones++] = 5640 zone->zone_id; 5641 } 5642 } 5643 } 5644 mutex_exit(&zonehash_lock); 5645 } 5646 } else { 5647 mutex_enter(&zonehash_lock); 5648 real_nzones = zonecount; 5649 domi_nzones = 0; 5650 if (real_nzones > 0) { 5651 zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t), 5652 KM_SLEEP); 5653 for (zone = list_head(&zone_active); zone != NULL; 5654 zone = list_next(&zone_active, zone)) 5655 zoneids[domi_nzones++] = zone->zone_id; 5656 ASSERT(domi_nzones == real_nzones); 5657 } 5658 mutex_exit(&zonehash_lock); 5659 } 5660 5661 /* 5662 * If user has allocated space for fewer entries than we found, then 5663 * return only up to his limit. Either way, tell him exactly how many 5664 * we found. 5665 */ 5666 if (domi_nzones < user_nzones) 5667 user_nzones = domi_nzones; 5668 error = 0; 5669 if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) { 5670 error = EFAULT; 5671 } else if (zoneidlist != NULL && user_nzones != 0) { 5672 if (copyout(zoneids, zoneidlist, 5673 user_nzones * sizeof (zoneid_t)) != 0) 5674 error = EFAULT; 5675 } 5676 5677 if (real_nzones > 0) 5678 kmem_free(zoneids, real_nzones * sizeof (zoneid_t)); 5679 5680 if (error != 0) 5681 return (set_errno(error)); 5682 else 5683 return (0); 5684 } 5685 5686 /* 5687 * Systemcall entry point for zone_lookup(2). 5688 * 5689 * Non-global zones are only able to see themselves and (on labeled systems) 5690 * the zones they dominate. 
5691 */ 5692 static zoneid_t 5693 zone_lookup(const char *zone_name) 5694 { 5695 char *kname; 5696 zone_t *zone; 5697 zoneid_t zoneid; 5698 int err; 5699 5700 if (zone_name == NULL) { 5701 /* return caller's zone id */ 5702 return (getzoneid()); 5703 } 5704 5705 kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP); 5706 if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) { 5707 kmem_free(kname, ZONENAME_MAX); 5708 return (set_errno(err)); 5709 } 5710 5711 mutex_enter(&zonehash_lock); 5712 zone = zone_find_all_by_name(kname); 5713 kmem_free(kname, ZONENAME_MAX); 5714 /* 5715 * In a non-global zone, can only lookup global and own name. 5716 * In Trusted Extensions zone label dominance rules apply. 5717 */ 5718 if (zone == NULL || 5719 zone_status_get(zone) < ZONE_IS_READY || 5720 !zone_list_access(zone)) { 5721 mutex_exit(&zonehash_lock); 5722 return (set_errno(EINVAL)); 5723 } else { 5724 zoneid = zone->zone_id; 5725 mutex_exit(&zonehash_lock); 5726 return (zoneid); 5727 } 5728 } 5729 5730 static int 5731 zone_version(int *version_arg) 5732 { 5733 int version = ZONE_SYSCALL_API_VERSION; 5734 5735 if (copyout(&version, version_arg, sizeof (int)) != 0) 5736 return (set_errno(EFAULT)); 5737 return (0); 5738 } 5739 5740 /* ARGSUSED */ 5741 long 5742 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) 5743 { 5744 zone_def zs; 5745 int err; 5746 5747 switch (cmd) { 5748 case ZONE_CREATE: 5749 if (get_udatamodel() == DATAMODEL_NATIVE) { 5750 if (copyin(arg1, &zs, sizeof (zone_def))) { 5751 return (set_errno(EFAULT)); 5752 } 5753 } else { 5754 #ifdef _SYSCALL32_IMPL 5755 zone_def32 zs32; 5756 5757 if (copyin(arg1, &zs32, sizeof (zone_def32))) { 5758 return (set_errno(EFAULT)); 5759 } 5760 zs.zone_name = 5761 (const char *)(unsigned long)zs32.zone_name; 5762 zs.zone_root = 5763 (const char *)(unsigned long)zs32.zone_root; 5764 zs.zone_privs = 5765 (const struct priv_set *) 5766 (unsigned long)zs32.zone_privs; 5767 zs.zone_privssz = zs32.zone_privssz; 5768 zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf; 5769 zs.rctlbufsz = zs32.rctlbufsz; 5770 zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf; 5771 zs.zfsbufsz = zs32.zfsbufsz; 5772 zs.extended_error = 5773 (int *)(unsigned long)zs32.extended_error; 5774 zs.match = zs32.match; 5775 zs.doi = zs32.doi; 5776 zs.label = (const bslabel_t *)(uintptr_t)zs32.label; 5777 zs.flags = zs32.flags; 5778 #else 5779 panic("get_udatamodel() returned bogus result\n"); 5780 #endif 5781 } 5782 5783 return (zone_create(zs.zone_name, zs.zone_root, 5784 zs.zone_privs, zs.zone_privssz, 5785 (caddr_t)zs.rctlbuf, zs.rctlbufsz, 5786 (caddr_t)zs.zfsbuf, zs.zfsbufsz, 5787 zs.extended_error, zs.match, zs.doi, 5788 zs.label, zs.flags)); 5789 case ZONE_BOOT: 5790 return (zone_boot((zoneid_t)(uintptr_t)arg1)); 5791 case ZONE_DESTROY: 5792 return (zone_destroy((zoneid_t)(uintptr_t)arg1)); 5793 case ZONE_GETATTR: 5794 return (zone_getattr((zoneid_t)(uintptr_t)arg1, 5795 (int)(uintptr_t)arg2, arg3, (size_t)arg4)); 5796 case ZONE_SETATTR: 5797 return (zone_setattr((zoneid_t)(uintptr_t)arg1, 5798 (int)(uintptr_t)arg2, arg3, (size_t)arg4)); 5799 case ZONE_ENTER: 5800 return (zone_enter((zoneid_t)(uintptr_t)arg1)); 5801 case ZONE_LIST: 5802 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2)); 5803 case ZONE_SHUTDOWN: 5804 return (zone_shutdown((zoneid_t)(uintptr_t)arg1)); 5805 case ZONE_LOOKUP: 5806 return (zone_lookup((const char *)arg1)); 5807 case ZONE_VERSION: 5808 return (zone_version((int *)arg1)); 5809 case ZONE_ADD_DATALINK: 5810 return 
/* ARGSUSED */
long
zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
{
	zone_def zs;
	int err;

	switch (cmd) {
	case ZONE_CREATE:
		if (get_udatamodel() == DATAMODEL_NATIVE) {
			if (copyin(arg1, &zs, sizeof (zone_def))) {
				return (set_errno(EFAULT));
			}
		} else {
#ifdef _SYSCALL32_IMPL
			zone_def32 zs32;

			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
				return (set_errno(EFAULT));
			}
			zs.zone_name =
			    (const char *)(unsigned long)zs32.zone_name;
			zs.zone_root =
			    (const char *)(unsigned long)zs32.zone_root;
			zs.zone_privs =
			    (const struct priv_set *)
			    (unsigned long)zs32.zone_privs;
			zs.zone_privssz = zs32.zone_privssz;
			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
			zs.rctlbufsz = zs32.rctlbufsz;
			zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
			zs.zfsbufsz = zs32.zfsbufsz;
			zs.extended_error =
			    (int *)(unsigned long)zs32.extended_error;
			zs.match = zs32.match;
			zs.doi = zs32.doi;
			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
			zs.flags = zs32.flags;
#else
			panic("get_udatamodel() returned bogus result\n");
#endif
		}

		return (zone_create(zs.zone_name, zs.zone_root,
		    zs.zone_privs, zs.zone_privssz,
		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,
		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,
		    zs.extended_error, zs.match, zs.doi,
		    zs.label, zs.flags));
	case ZONE_BOOT:
		return (zone_boot((zoneid_t)(uintptr_t)arg1));
	case ZONE_DESTROY:
		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
	case ZONE_GETATTR:
		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
	case ZONE_SETATTR:
		return (zone_setattr((zoneid_t)(uintptr_t)arg1,
		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
	case ZONE_ENTER:
		return (zone_enter((zoneid_t)(uintptr_t)arg1));
	case ZONE_LIST:
		return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
	case ZONE_SHUTDOWN:
		return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
	case ZONE_LOOKUP:
		return (zone_lookup((const char *)arg1));
	case ZONE_VERSION:
		return (zone_version((int *)arg1));
	case ZONE_ADD_DATALINK:
		return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
		    (datalink_id_t)(uintptr_t)arg2));
	case ZONE_DEL_DATALINK:
		return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
		    (datalink_id_t)(uintptr_t)arg2));
	case ZONE_CHECK_DATALINK: {
		zoneid_t zoneid;
		boolean_t need_copyout;

		if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
			return (set_errno(EFAULT));
		need_copyout = (zoneid == ALL_ZONES);
		err = zone_check_datalink(&zoneid,
		    (datalink_id_t)(uintptr_t)arg2);
		if (err == 0 && need_copyout) {
			if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
				err = EFAULT;
		}
		return (err == 0 ? 0 : set_errno(err));
	}
	case ZONE_LIST_DATALINK:
		return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
		    (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
	default:
		return (set_errno(EINVAL));
	}
}

struct zarg {
	zone_t *zone;
	zone_cmd_arg_t arg;
};

static int
zone_lookup_door(const char *zone_name, door_handle_t *doorp)
{
	char *buf;
	size_t buflen;
	int error;

	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
	buf = kmem_alloc(buflen, KM_SLEEP);
	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
	error = door_ki_open(buf, doorp);
	kmem_free(buf, buflen);
	return (error);
}

static void
zone_release_door(door_handle_t *doorp)
{
	door_ki_rele(*doorp);
	*doorp = NULL;
}
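/*
 * Editorial sketch (not part of the original source): zone_lookup_door()
 * expands ZONE_DOOR_PATH with the zone name to find zoneadmd's rendezvous
 * door.  The literal path format below is an assumption for illustration;
 * the real definition lives in <sys/zone.h>.
 */
#ifdef	ZONE_EXAMPLE_SKETCHES
#include <stdio.h>
#include <fcntl.h>
#include <sys/param.h>

static int
example_open_zone_door(const char *zone_name)
{
	char path[MAXPATHLEN];

	/* Hypothetical expansion of ZONE_DOOR_PATH. */
	(void) snprintf(path, sizeof (path),
	    "/var/run/zones/%s.zoneadmd_door", zone_name);
	/* The returned fd could then be passed to door_call(3C). */
	return (open(path, O_RDONLY));
}
#endif	/* ZONE_EXAMPLE_SKETCHES */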
static void
zone_ki_call_zoneadmd(struct zarg *zargp)
{
	door_handle_t door = NULL;
	door_arg_t darg, save_arg;
	char *zone_name;
	size_t zone_namelen;
	zoneid_t zoneid;
	zone_t *zone;
	zone_cmd_arg_t arg;
	uint64_t uniqid;
	size_t size;
	int error;
	int retry;

	zone = zargp->zone;
	arg = zargp->arg;
	kmem_free(zargp, sizeof (*zargp));

	zone_namelen = strlen(zone->zone_name) + 1;
	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
	bcopy(zone->zone_name, zone_name, zone_namelen);
	zoneid = zone->zone_id;
	uniqid = zone->zone_uniqid;
	/*
	 * zoneadmd may be down, but at least we can empty out the zone.
	 * We can ignore the return value of zone_empty() since we're called
	 * from a kernel thread and know we won't be delivered any signals.
	 */
	ASSERT(curproc == &p0);
	(void) zone_empty(zone);
	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
	zone_rele(zone);

	size = sizeof (arg);
	darg.rbuf = (char *)&arg;
	darg.data_ptr = (char *)&arg;
	darg.rsize = size;
	darg.data_size = size;
	darg.desc_ptr = NULL;
	darg.desc_num = 0;

	save_arg = darg;
	/*
	 * Since we're not holding a reference to the zone, any number of
	 * things can go wrong, including the zone disappearing before we get
	 * a chance to talk to zoneadmd.
	 */
	for (retry = 0; /* forever */; retry++) {
		if (door == NULL &&
		    (error = zone_lookup_door(zone_name, &door)) != 0) {
			goto next;
		}
		ASSERT(door != NULL);

		if ((error = door_ki_upcall_limited(door, &darg, NULL,
		    SIZE_MAX, 0)) == 0) {
			break;
		}
		switch (error) {
		case EINTR:
			/* FALLTHROUGH */
		case EAGAIN:	/* process may be forking */
			/*
			 * Back off for a bit
			 */
			break;
		case EBADF:
			zone_release_door(&door);
			if (zone_lookup_door(zone_name, &door) != 0) {
				/*
				 * zoneadmd may be dead, but it may come back
				 * to life later.
				 */
				break;
			}
			break;
		default:
			cmn_err(CE_WARN,
			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
			    error);
			goto out;
		}
next:
		/*
		 * If this isn't the same zone_t that we originally had in
		 * mind, then this is the same as if two kadmin requests come
		 * in at the same time: the first one wins.  This means we
		 * lose, so we bail.
		 */
		if ((zone = zone_find_by_id(zoneid)) == NULL) {
			/*
			 * Problem is solved.
			 */
			break;
		}
		if (zone->zone_uniqid != uniqid) {
			/*
			 * zoneid recycled
			 */
			zone_rele(zone);
			break;
		}
		/*
		 * We could zone_status_timedwait(), but there doesn't seem
		 * to be much point in doing that (plus, it would mean that
		 * zone_free() isn't called until this thread exits).
		 */
		zone_rele(zone);
		delay(hz);
		darg = save_arg;
	}
out:
	if (door != NULL) {
		zone_release_door(&door);
	}
	kmem_free(zone_name, zone_namelen);
	thread_exit();
}

/*
 * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
 * kadmin().  The caller is a process in the zone.
 *
 * In order to shut down the zone, we will hand off control to zoneadmd
 * (running in the global zone) via a door.  We do a half-hearted job of
 * killing all processes in the zone, create a kernel thread to contact
 * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
 * a form of generation number used to let zoneadmd (as well as
 * zone_destroy()) know exactly which zone they're talking about.
 */
int
zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
{
	struct zarg *zargp;
	zone_cmd_t zcmd;
	zone_t *zone;

	zone = curproc->p_zone;
	ASSERT(getzoneid() != GLOBAL_ZONEID);

	switch (cmd) {
	case A_SHUTDOWN:
		switch (fcn) {
		case AD_HALT:
		case AD_POWEROFF:
			zcmd = Z_HALT;
			break;
		case AD_BOOT:
			zcmd = Z_REBOOT;
			break;
		case AD_IBOOT:
		case AD_SBOOT:
		case AD_SIBOOT:
		case AD_NOSYNC:
			return (ENOTSUP);
		default:
			return (EINVAL);
		}
		break;
	case A_REBOOT:
		zcmd = Z_REBOOT;
		break;
	case A_FTRACE:
	case A_REMOUNT:
	case A_FREEZE:
	case A_DUMP:
	case A_CONFIG:
		return (ENOTSUP);
	default:
		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
		return (EINVAL);
	}

	if (secpolicy_zone_admin(credp, B_FALSE))
		return (EPERM);
	mutex_enter(&zone_status_lock);

	/*
	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
	 * is in the zone.
	 */
	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
		/*
		 * This zone is already on its way down.
		 */
		mutex_exit(&zone_status_lock);
		return (0);
	}
	/*
	 * Prevent future zone_enter()s
	 */
	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
	mutex_exit(&zone_status_lock);

	/*
	 * Kill everyone now and call zoneadmd later.
	 * zone_ki_call_zoneadmd() will do a more thorough job of this
	 * later.
	 */
	killall(zone->zone_id);
	/*
	 * Now, create the thread to contact zoneadmd and do the rest of the
	 * work.  This thread can't be created in our zone otherwise
	 * zone_destroy() would deadlock.
	 */
	zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
	zargp->arg.cmd = zcmd;
	zargp->arg.uniqid = zone->zone_uniqid;
	zargp->zone = zone;
	(void) strcpy(zargp->arg.locale, "C");
	/* mdep was already copied in for us by uadmin */
	if (mdep != NULL)
		(void) strlcpy(zargp->arg.bootbuf, mdep,
		    sizeof (zargp->arg.bootbuf));
	zone_hold(zone);

	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
	    TS_RUN, minclsyspri);
	exit(CLD_EXITED, 0);

	return (EINVAL);
}
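/*
 * Editorial sketch (not part of the original source): a privileged process
 * inside a non-global zone reaches zone_kadmin() through uadmin(2).  The
 * call below maps to A_SHUTDOWN/AD_BOOT, i.e. Z_REBOOT, and reboots only
 * the calling zone, never the machine.
 */
#ifdef	ZONE_EXAMPLE_SKETCHES
#include <sys/uadmin.h>

static void
example_reboot_my_zone(void)
{
	/* Does not return on success: zone_kadmin() calls exit() for us. */
	(void) uadmin(A_SHUTDOWN, AD_BOOT, 0);
}
#endif	/* ZONE_EXAMPLE_SKETCHES */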
/*
 * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
 * status to ZONE_IS_SHUTTING_DOWN.
 *
 * This function also shuts down all running zones to ensure that they won't
 * fork new processes.
 */
void
zone_shutdown_global(void)
{
	zone_t *current_zonep;

	ASSERT(INGLOBALZONE(curproc));
	mutex_enter(&zonehash_lock);
	mutex_enter(&zone_status_lock);

	/* Modify the global zone's status first. */
	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);

	/*
	 * Now change the states of all running zones to
	 * ZONE_IS_SHUTTING_DOWN.  We don't mark all zones with
	 * ZONE_IS_SHUTTING_DOWN because doing so could cause assertions to
	 * fail (e.g., assertions about a zone's state during initialization,
	 * readying, or booting) or produce races.  We'll let threads
	 * continue to initialize and ready new zones: they'll fail to boot
	 * the new zones when they see that the global zone is shutting down.
	 */
	for (current_zonep = list_head(&zone_active); current_zonep != NULL;
	    current_zonep = list_next(&zone_active, current_zonep)) {
		if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
			zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
	}
	mutex_exit(&zone_status_lock);
	mutex_exit(&zonehash_lock);
}

/*
 * Returns true if the named dataset is visible in the current zone.
 * The 'write' parameter is set to 1 if the dataset is also writable.
 */
int
zone_dataset_visible(const char *dataset, int *write)
{
	static int zfstype = -1;
	zone_dataset_t *zd;
	size_t len;
	zone_t *zone = curproc->p_zone;
	const char *name = NULL;
	vfs_t *vfsp = NULL;

	if (dataset[0] == '\0')
		return (0);

	/*
	 * Walk the list once, looking for datasets which match exactly, or
	 * specify a dataset underneath an exported dataset.  If found,
	 * return true and note that it is writable.
	 */
	for (zd = list_head(&zone->zone_datasets); zd != NULL;
	    zd = list_next(&zone->zone_datasets, zd)) {

		len = strlen(zd->zd_dataset);
		if (strlen(dataset) >= len &&
		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
		    (dataset[len] == '\0' || dataset[len] == '/' ||
		    dataset[len] == '@')) {
			if (write)
				*write = 1;
			return (1);
		}
	}

	/*
	 * Walk the list a second time, searching for datasets which are
	 * parents of exported datasets.  These should be visible, but
	 * read-only.
	 *
	 * Note that we also have to support forms such as 'pool/dataset/',
	 * with a trailing slash.
	 */
	for (zd = list_head(&zone->zone_datasets); zd != NULL;
	    zd = list_next(&zone->zone_datasets, zd)) {

		len = strlen(dataset);
		if (dataset[len - 1] == '/')
			len--;	/* Ignore trailing slash */
		if (len < strlen(zd->zd_dataset) &&
		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
		    zd->zd_dataset[len] == '/') {
			if (write)
				*write = 0;
			return (1);
		}
	}

	/*
	 * We reach here if the given dataset is not found in the
	 * zone_dataset list.  Check if this dataset was added as a
	 * filesystem (i.e., "add fs") instead of by delegation.  For this
	 * we search for the dataset in the zone_vfslist of this zone.  If
	 * found, return true and note that it is not writable.
	 */

	/*
	 * Initialize zfstype if it is not initialized yet.
	 */
	if (zfstype == -1) {
		struct vfssw *vswp = vfs_getvfssw("zfs");
		zfstype = vswp - vfssw;
		vfs_unrefvfssw(vswp);
	}

	vfs_list_read_lock();
	vfsp = zone->zone_vfslist;
	do {
		ASSERT(vfsp);
		if (vfsp->vfs_fstype == zfstype) {
			name = refstr_value(vfsp->vfs_resource);

			/*
			 * Check if we have an exact match.
			 */
			if (strcmp(dataset, name) == 0) {
				vfs_list_unlock();
				if (write)
					*write = 0;
				return (1);
			}
			/*
			 * We need to check if we are looking for parents of
			 * a dataset.  These should be visible, but read-only.
			 */
			len = strlen(dataset);
			if (dataset[len - 1] == '/')
				len--;

			if (len < strlen(name) &&
			    bcmp(dataset, name, len) == 0 &&
			    name[len] == '/') {
				vfs_list_unlock();
				if (write)
					*write = 0;
				return (1);
			}
		}
		vfsp = vfsp->vfs_zone_next;
	} while (vfsp != zone->zone_vfslist);

	vfs_list_unlock();
	return (0);
}
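/*
 * Editorial worked example (not part of the original source): for a zone
 * with the single delegated dataset "tank/zones/z1" (a hypothetical name),
 * the walks above classify candidate datasets as follows:
 *
 *	tank/zones/z1		visible, writable (exact match)
 *	tank/zones/z1/home	visible, writable (beneath the delegation)
 *	tank/zones/z1@snap	visible, writable (snapshot of it)
 *	tank/zones		visible, read-only (parent of the delegation)
 *	tank/elsewhere		not visible
 */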
/*
 * zone_find_by_any_path() -
 *
 * kernel-private routine similar to zone_find_by_path(), but which
 * effectively compares against zone paths rather than zonerootpaths
 * (i.e., the last component of zonerootpaths, which should be "root/",
 * is not compared.)  This is done in order to accurately identify all
 * paths, whether zone-visible or not, including those which are parallel
 * to /root/, such as /dev/, /home/, etc...
 *
 * If the specified path does not fall under any zone path then the global
 * zone is returned.
 *
 * The treat_abs parameter indicates whether the path should be treated as
 * an absolute path although it does not begin with "/".  (This supports
 * nfs mount syntax such as host:any/path.)
 *
 * The caller is responsible for zone_rele of the returned zone.
 */
zone_t *
zone_find_by_any_path(const char *path, boolean_t treat_abs)
{
	zone_t *zone;
	int path_offset = 0;

	if (path == NULL) {
		zone_hold(global_zone);
		return (global_zone);
	}

	if (*path != '/') {
		ASSERT(treat_abs);
		path_offset = 1;
	}

	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		char *c;
		size_t pathlen;
		char *rootpath_start;

		if (zone == global_zone)	/* skip global zone */
			continue;

		/* scan backwards to find start of last component */
		c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
		do {
			c--;
		} while (*c != '/');

		pathlen = c - zone->zone_rootpath + 1 - path_offset;
		rootpath_start = (zone->zone_rootpath + path_offset);
		if (strncmp(path, rootpath_start, pathlen) == 0)
			break;
	}
	if (zone == NULL)
		zone = global_zone;
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}
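/*
 * Editorial worked example (not part of the original source): for a zone
 * whose zone_rootpath is "/export/z1/root/" (hypothetical), the prefix
 * compared above is the zonepath "/export/z1/", so "/export/z1/dev/null"
 * resolves to that zone even though it lies outside the zone-visible
 * "root/" subtree, while "/export/other/file" falls through to the
 * global zone.
 */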
/*
 * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
 * zone_dl_t pointer if found, and NULL otherwise.
 */
static zone_dl_t *
zone_find_dl(zone_t *zone, datalink_id_t linkid)
{
	zone_dl_t *zdl;

	ASSERT(mutex_owned(&zone->zone_lock));
	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
	    zdl = list_next(&zone->zone_dl_list, zdl)) {
		if (zdl->zdl_id == linkid)
			break;
	}
	return (zdl);
}

static boolean_t
zone_dl_exists(zone_t *zone, datalink_id_t linkid)
{
	boolean_t exists;

	mutex_enter(&zone->zone_lock);
	exists = (zone_find_dl(zone, linkid) != NULL);
	mutex_exit(&zone->zone_lock);
	return (exists);
}

/*
 * Add a datalink to the zone.  A link may belong to at most one zone at a
 * time.
 */
static int
zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
{
	zone_dl_t *zdl;
	zone_t *zone;
	zone_t *thiszone;

	if ((thiszone = zone_find_by_id(zoneid)) == NULL)
		return (set_errno(ENXIO));

	/* Verify that the datalink ID doesn't already belong to a zone. */
	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		if (zone_dl_exists(zone, linkid)) {
			mutex_exit(&zonehash_lock);
			zone_rele(thiszone);
			return (set_errno((zone == thiszone) ? EEXIST :
			    EPERM));
		}
	}

	zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
	zdl->zdl_id = linkid;
	zdl->zdl_net = NULL;
	mutex_enter(&thiszone->zone_lock);
	list_insert_head(&thiszone->zone_dl_list, zdl);
	mutex_exit(&thiszone->zone_lock);
	mutex_exit(&zonehash_lock);
	zone_rele(thiszone);
	return (0);
}

static int
zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
{
	zone_dl_t *zdl;
	zone_t *zone;
	int err = 0;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (set_errno(EINVAL));

	mutex_enter(&zone->zone_lock);
	if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
		err = ENXIO;
	} else {
		list_remove(&zone->zone_dl_list, zdl);
		if (zdl->zdl_net != NULL)
			nvlist_free(zdl->zdl_net);
		kmem_free(zdl, sizeof (zone_dl_t));
	}
	mutex_exit(&zone->zone_lock);
	zone_rele(zone);
	return (err == 0 ? 0 : set_errno(err));
}
/*
 * If *zoneidp is ALL_ZONES, look up which zone (if any) has been assigned
 * the linkid.  Otherwise, just check whether the specified zone has been
 * assigned the supplied linkid.
 */
int
zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
{
	zone_t *zone;
	int err = ENXIO;

	if (*zoneidp != ALL_ZONES) {
		if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
			if (zone_dl_exists(zone, linkid))
				err = 0;
			zone_rele(zone);
		}
		return (err);
	}

	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		if (zone_dl_exists(zone, linkid)) {
			*zoneidp = zone->zone_id;
			err = 0;
			break;
		}
	}
	mutex_exit(&zonehash_lock);
	return (err);
}

/*
 * Get the list of datalink IDs assigned to a zone.
 *
 * On input, *nump is the number of datalink IDs that can fit in the
 * supplied idarray.  Upon return, *nump is either set to the number of
 * datalink IDs that were placed in the array if the array was large
 * enough, or to the number of datalink IDs that the function needs to
 * place in the array if the array is too small.
 */
static int
zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
{
	uint_t num, dlcount;
	zone_t *zone;
	zone_dl_t *zdl;
	datalink_id_t *idptr = idarray;

	if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
		return (set_errno(EFAULT));
	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (set_errno(ENXIO));

	num = 0;
	mutex_enter(&zone->zone_lock);
	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
	    zdl = list_next(&zone->zone_dl_list, zdl)) {
		/*
		 * If the list is bigger than what the caller supplied, just
		 * count, don't do copyout.
		 */
		if (++num > dlcount)
			continue;
		if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
			mutex_exit(&zone->zone_lock);
			zone_rele(zone);
			return (set_errno(EFAULT));
		}
		idptr++;
	}
	mutex_exit(&zone->zone_lock);
	zone_rele(zone);

	/* If the count differs from what the caller passed in, say so. */
	if (num != dlcount) {
		if (copyout(&num, nump, sizeof (num)) != 0)
			return (set_errno(EFAULT));
	}
	return (0);
}
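/*
 * Editorial sketch (not part of the original source): ZONE_LIST_DATALINK
 * uses the same two-call sizing protocol as zone_list().  The userland
 * zone_list_datalink() wrapper, and the header providing datalink_id_t,
 * are assumptions here.
 */
#ifdef	ZONE_EXAMPLE_SKETCHES
#include <zone.h>
#include <stdlib.h>
#include <sys/dls_mgmt.h>	/* assumed home of datalink_id_t */

extern int zone_list_datalink(zoneid_t, int *, datalink_id_t *);

static datalink_id_t *
example_list_links(zoneid_t zid, int *countp)
{
	int n = 0;
	datalink_id_t *ids;

	/* With a zero count, the kernel only copies out the total. */
	if (zone_list_datalink(zid, &n, NULL) != 0)
		return (NULL);
	if ((ids = malloc(n * sizeof (datalink_id_t))) == NULL)
		return (NULL);
	if (zone_list_datalink(zid, &n, ids) != 0) {
		free(ids);
		return (NULL);
	}
	*countp = n;
	return (ids);
}
#endif	/* ZONE_EXAMPLE_SKETCHES */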
/*
 * Public interface for looking up a zone by zoneid.  It's a customized
 * version for netstack_zone_create(), and may only be called from the ZSD
 * create callbacks: it does not take a reference on the zone structure,
 * so if it were called elsewhere the zone could disappear after
 * zonehash_lock is dropped.
 *
 * Furthermore it
 * 1. Doesn't check the status of the zone.
 * 2. It will be called even before zone_init is called; in that case the
 *    address of zone0 is returned directly, and netstack_zone_create()
 *    will only assign a value to zone0.zone_netstack, won't break anything.
 * 3. Returns without the zone being held.
 */
zone_t *
zone_find_by_id_nolock(zoneid_t zoneid)
{
	zone_t *zone;

	mutex_enter(&zonehash_lock);
	if (zonehashbyid == NULL)
		zone = &zone0;
	else
		zone = zone_find_all_by_id(zoneid);
	mutex_exit(&zonehash_lock);
	return (zone);
}

/*
 * Walk the datalinks for a given zone
 */
int
zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
    void *data)
{
	zone_t *zone;
	zone_dl_t *zdl;
	datalink_id_t *idarray;
	uint_t idcount = 0;
	int i, ret = 0;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (ENOENT);

	/*
	 * We first build an array of linkids so that we can walk these and
	 * execute the callback with the zone_lock dropped.
	 */
	mutex_enter(&zone->zone_lock);
	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
	    zdl = list_next(&zone->zone_dl_list, zdl)) {
		idcount++;
	}

	if (idcount == 0) {
		mutex_exit(&zone->zone_lock);
		zone_rele(zone);
		return (0);
	}

	idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
	if (idarray == NULL) {
		mutex_exit(&zone->zone_lock);
		zone_rele(zone);
		return (ENOMEM);
	}

	for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
	    i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
		idarray[i] = zdl->zdl_id;
	}

	mutex_exit(&zone->zone_lock);

	for (i = 0; i < idcount && ret == 0; i++) {
		if ((ret = (*cb)(idarray[i], data)) != 0)
			break;
	}

	zone_rele(zone);
	kmem_free(idarray, sizeof (datalink_id_t) * idcount);
	return (ret);
}

static char *
zone_net_type2name(int type)
{
	switch (type) {
	case ZONE_NETWORK_ADDRESS:
		return (ZONE_NET_ADDRNAME);
	case ZONE_NETWORK_DEFROUTER:
		return (ZONE_NET_RTRNAME);
	default:
		return (NULL);
	}
}

static int
zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf)
{
	zone_t *zone;
	zone_dl_t *zdl;
	nvlist_t *nvl;
	int err = 0;
	uint8_t *new = NULL;
	char *nvname;
	int bufsize;
	datalink_id_t linkid = znbuf->zn_linkid;

	if (secpolicy_zone_config(CRED()) != 0)
		return (set_errno(EPERM));

	if (zoneid == GLOBAL_ZONEID)
		return (set_errno(EINVAL));

	nvname = zone_net_type2name(znbuf->zn_type);
	bufsize = znbuf->zn_len;
	new = znbuf->zn_val;
	if (nvname == NULL)
		return (set_errno(EINVAL));

	if ((zone = zone_find_by_id(zoneid)) == NULL) {
		return (set_errno(EINVAL));
	}

	mutex_enter(&zone->zone_lock);
	if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
		err = ENXIO;
		goto done;
	}
	if ((nvl = zdl->zdl_net) == NULL) {
		if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) {
			err = ENOMEM;
			goto done;
		} else {
			zdl->zdl_net = nvl;
		}
	}
	if (nvlist_exists(nvl, nvname)) {
		err = EINVAL;
		goto done;
	}
	err = nvlist_add_uint8_array(nvl, nvname, new, bufsize);
	ASSERT(err == 0);
done:
	mutex_exit(&zone->zone_lock);
	zone_rele(zone);
	if (err != 0)
		return (set_errno(err));
	else
		return (0);
}
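/*
 * Editorial note (not part of the original source): zone_set_network()
 * stores each value as a uint8_t array in the per-link nvlist, keyed by
 * ZONE_NET_ADDRNAME or ZONE_NET_RTRNAME, and is deliberately set-once: a
 * second call for the same key fails with EINVAL instead of overwriting
 * the stored value.
 */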
static int
zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf)
{
	zone_t *zone;
	zone_dl_t *zdl;
	nvlist_t *nvl;
	uint8_t *ptr;
	uint_t psize;
	int err = 0;
	char *nvname;
	int bufsize;
	void *buf;
	datalink_id_t linkid = znbuf->zn_linkid;

	if (zoneid == GLOBAL_ZONEID)
		return (set_errno(EINVAL));

	nvname = zone_net_type2name(znbuf->zn_type);
	bufsize = znbuf->zn_len;
	buf = znbuf->zn_val;

	if (nvname == NULL)
		return (set_errno(EINVAL));
	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (set_errno(EINVAL));

	mutex_enter(&zone->zone_lock);
	if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
		err = ENXIO;
		goto done;
	}
	if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) {
		err = ENOENT;
		goto done;
	}
	err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize);
	ASSERT(err == 0);

	if (psize > bufsize) {
		err = ENOBUFS;
		goto done;
	}
	znbuf->zn_len = psize;
	bcopy(ptr, buf, psize);
done:
	mutex_exit(&zone->zone_lock);
	zone_rele(zone);
	if (err != 0)
		return (set_errno(err));
	else
		return (0);
}
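/*
 * Editorial note (not part of the original source): zone_get_network()
 * has no size-query mode.  An undersized buffer fails with ENOBUFS and
 * zn_len is not updated, so callers (presumably arriving via
 * zone_getattr() with a network attribute) should supply a buffer sized
 * for the largest value they stored with zone_set_network().
 */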