/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Zones
 *
 * A zone is a named collection of processes, namespace constraints,
 * and other system resources which comprise a secure and manageable
 * application containment facility.
 *
 * Zones (represented by the reference counted zone_t) are tracked in
 * the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
 * (zoneid_t) are used to track zone association.  Zone IDs are
 * dynamically generated when the zone is created; if a persistent
 * identifier is needed (core files, accounting logs, audit trail,
 * etc.), the zone name should be used.
 *
 *
 * Global Zone:
 *
 * The global zone (zoneid 0) is automatically associated with all
 * system resources that have not been bound to a user-created zone.
 * This means that even systems where zones are not in active use
 * have a global zone, and all processes, mounts, etc. are
 * associated with that zone.  The global zone is generally
 * unconstrained in terms of privileges and access, though the usual
 * credential and privilege based restrictions apply.
 *
 *
 * Zone States:
 *
 * The states a zone may be in, and the transitions between them, are
 * as follows:
 *
 * ZONE_IS_UNINITIALIZED: primordial state for a zone.  The partially
 * initialized zone is added to the list of active zones on the system but
 * isn't accessible.
 *
 * ZONE_IS_INITIALIZED: Initialization is complete except that the ZSD
 * callbacks have not yet run.  Not possible to enter the zone, but
 * attributes can be retrieved.
 *
 * ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
 * ready.  The zone is made visible after the ZSD constructor callbacks are
 * executed.  A zone remains in this state until it transitions into
 * the ZONE_IS_BOOTING state as a result of a call to zone_boot().
 *
 * ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
 * init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
 * state.
 *
 * ZONE_IS_RUNNING: The zone is open for business: zsched has
 * successfully started init.  A zone remains in this state until
 * zone_shutdown() is called.
 *
 * ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
 * killing all processes running in the zone.  The zone remains
 * in this state until there are no more user processes running in the zone.
 * zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
 * Since zone_shutdown() is restartable, it may be called successfully
 * multiple times for the same zone_t.  Setting of the zone's state to
 * ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
 * the zone's status without worrying about it being a moving target.
 *
 * ZONE_IS_EMPTY: zone_shutdown() has been called, and there
 * are no more user processes in the zone.  The zone remains in this
 * state until there are no more kernel threads associated with the
 * zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
 * fail.
 *
 * ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
 * have exited.  zone_shutdown() returns.  Henceforth it is not possible to
 * join the zone or create kernel threads therein.
 *
 * ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
 * remains in this state until zsched exits.  Calls to zone_find_by_*()
 * return NULL from now on.
 *
 * ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
 * processes or threads doing work on behalf of the zone.  The zone is
 * removed from the list of active zones.  zone_destroy() returns, and
 * the zone can be recreated.
 *
 * ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
 * callbacks are executed, and all memory associated with the zone is
 * freed.
 *
 * Threads can wait for the zone to enter a requested state by using
 * zone_status_wait() or zone_status_timedwait() with the desired
 * state passed in as an argument.  Zone state transitions are
 * uni-directional; it is not possible to move back to an earlier state.
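 *
 * For example (a sketch; see the definitions of these functions later
 * in this file for the exact semantics), a thread that must not proceed
 * until a zone has finished shutting down could wait for ZONE_IS_DOWN:
 *
 *	zone_status_wait(zone, ZONE_IS_DOWN);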
 *
 *
 * Zone-Specific Data:
 *
 * Subsystems needing to maintain zone-specific data can store that
 * data using the ZSD mechanism.  This provides a zone-specific data
 * store, similar to thread-specific data (see pthread_getspecific(3C)
 * or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
 * to register callbacks to be invoked when a zone is created, shut
 * down, or destroyed.  This can be used to initialize zone-specific
 * data for new zones and to clean up when zones go away.
 *
 *
 * Data Structures:
 *
 * The per-zone structure (zone_t) is reference counted, and freed
 * when all references are released.  zone_hold and zone_rele can be
 * used to adjust the reference count.  In addition, reference counts
 * associated with the cred_t structure are tracked separately using
 * zone_cred_hold and zone_cred_rele.
 *
 * Pointers to active zone_t's are stored in two hash tables; one
 * for searching by id, the other for searching by name.  Lookups
 * can be performed on either basis, using zone_find_by_id and
 * zone_find_by_name.  Both return zone_t pointers with the zone
 * held, so zone_rele should be called when the pointer is no longer
 * needed.  Zones can also be searched by path; zone_find_by_path
 * returns the zone with which a path name is associated (global
 * zone if the path is not within some other zone's file system
 * hierarchy).  This currently requires iterating through each zone,
 * so it is slower than an id or name search via a hash table.
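 *
 * For example (a sketch of the hold/release discipline):
 *
 *	zone_t *zone;
 *
 *	if ((zone = zone_find_by_id(zoneid)) != NULL) {
 *		... use zone ...
 *		zone_rele(zone);
 *	}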
 *
 *
 * Locking:
 *
 * zonehash_lock: This is a top-level global lock used to protect the
 *	zone hash tables and lists.  Zones cannot be created or destroyed
 *	while this lock is held.
 * zone_status_lock: This is a global lock protecting zone state.
 *	Zones cannot change state while this lock is held.  It also
 *	protects the list of kernel threads associated with a zone.
 * zone_lock: This is a per-zone lock used to protect several fields of
 *	the zone_t (see <sys/zone.h> for details).  In addition, holding
 *	this lock means that the zone cannot go away.
 * zone_nlwps_lock: This is a per-zone lock used to protect the fields
 *	related to the zone.max-lwps rctl.
 * zone_mem_lock: This is a per-zone lock used to protect the fields
 *	related to the zone.max-locked-memory and zone.max-swap rctls.
 * zone_rctl_lock: This is a per-zone lock used to protect other rctls,
 *	currently just max_lofi.
 * zsd_key_lock: This is a global lock protecting the key state for ZSD.
 * zone_deathrow_lock: This is a global lock protecting the "deathrow"
 *	list (a list of zones in the ZONE_IS_DEAD state).
 *
 * Ordering requirements:
 *	pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
 *	zone_lock --> zsd_key_lock --> pidlock --> p_lock
 *
 * When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
 *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
 *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
 *
 * Blocking memory allocations are permitted while holding any of the
 * zone locks.
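 *
 * So, for example (a sketch), code that needs both the zone hash and a
 * zone's own lock must take the locks in the order given above:
 *
 *	mutex_enter(&zonehash_lock);
 *	mutex_enter(&zone->zone_lock);
 *	... examine or update the zone ...
 *	mutex_exit(&zone->zone_lock);
 *	mutex_exit(&zonehash_lock);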
 *
 *
 * System Call Interface:
 *
 * The zone subsystem can be managed and queried from user level with
 * the following system calls (all subcodes of the primary "zone"
 * system call):
 * - zone_create: creates a zone with selected attributes (name,
 *   root path, privileges, resource controls, ZFS datasets)
 * - zone_enter: allows the current process to enter a zone
 * - zone_getattr: reports attributes of a zone
 * - zone_setattr: sets attributes of a zone
 * - zone_boot: sets 'init' running for the zone
 * - zone_list: lists all zones active in the system
 * - zone_lookup: looks up zone id based on name
 * - zone_shutdown: initiates shutdown process (see states above)
 * - zone_destroy: completes shutdown process (see states above)
 *
 */

#include <sys/priv_impl.h>
#include <sys/cred.h>
#include <c2/audit.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/mutex.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/sysevent.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/class.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
#include <sys/corectl.h>
#include <sys/ipc_impl.h>
#include <sys/klpd.h>

#include <sys/door.h>
#include <sys/cpuvar.h>
#include <sys/sdt.h>

#include <sys/uadmin.h>
#include <sys/session.h>
#include <sys/cmn_err.h>
#include <sys/modhash.h>
#include <sys/sunddi.h>
#include <sys/nvpair.h>
#include <sys/rctl.h>
#include <sys/fss.h>
#include <sys/brand.h>
#include <sys/zone.h>
#include <net/if.h>
#include <sys/cpucaps.h>
#include <vm/seg.h>
#include <sys/mac.h>

/* List of data link IDs which are accessible from the zone */
typedef struct zone_dl {
	datalink_id_t	zdl_id;
	nvlist_t	*zdl_net;
	list_node_t	zdl_linkage;
} zone_dl_t;

/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;	/* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;

/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;	/* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/* Event channel to send zone state change notifications */
evchan_t *zone_event_chan;

/*
 * This table holds the mapping from kernel zone states to
 * states visible in the state notification API.
 * The idea is that we only expose "obvious" states and
 * do not expose states which are just implementation details.
 */
const char *zone_status_table[] = {
	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
	ZONE_EVENT_INITIALIZED,		/* initialized */
	ZONE_EVENT_READY,		/* ready */
	ZONE_EVENT_READY,		/* booting */
	ZONE_EVENT_RUNNING,		/* running */
	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
	ZONE_EVENT_SHUTTING_DOWN,	/* empty */
	ZONE_EVENT_SHUTTING_DOWN,	/* down */
	ZONE_EVENT_SHUTTING_DOWN,	/* dying */
	ZONE_EVENT_UNINITIALIZED,	/* dead */
};

/*
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_max_lofi;
rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_nprocs;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;
rctl_hndl_t rc_zone_msgmni;

/*
 * Synchronization primitives used to synchronize between mounts and zone
 * creation/destruction.
 */
static int mounts_in_progress;
static kcondvar_t mount_cv;
static kmutex_t mount_lock;

const char * const zone_default_initname = "/sbin/init";
static char * const zone_prefix = "/zone/";
static int zone_shutdown(zoneid_t zoneid);
static int zone_add_datalink(zoneid_t, datalink_id_t);
static int zone_remove_datalink(zoneid_t, datalink_id_t);
static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
static int zone_set_network(zoneid_t, zone_net_data_t *);
static int zone_get_network(zoneid_t, zone_net_data_t *);

typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);

static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
    zone_key_t);
static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
    kmutex_t *);
static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
    kmutex_t *);

/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure, and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 * Version 6 adds the flag argument to zone_create.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;

/*
 * Certain filesystems (such as NFS and autofs) need to know which zone
 * the mount is being placed in.  Because of this, we need to be able to
 * ensure that a zone isn't in the process of being created such that
 * nfs_mount() thinks it is in the global zone, while by the time it
 * gets added to the list of mounted zones, it ends up on zoneA's mount
 * list.
 *
 * The following functions: block_mounts()/resume_mounts() and
 * mount_in_progress()/mount_completed() are used by zones and the VFS
 * layer (respectively) to synchronize zone creation and new mounts.
 *
 * The semantics are like a reader-reader lock such that there may
 * either be multiple mounts (or zone creations, if that weren't
 * serialized by zonehash_lock) in progress at the same time, but not
 * both.
 *
 * We use cv's so the user can ctrl-C out of the operation if it's
 * taking too long.
 *
 * The semantics are such that there is unfair bias towards the
 * "current" operation.  This means that zone creations may starve if
 * there is a rapid succession of new mounts coming in to the system, or
 * there is a remote possibility that zones will be created at such a
 * rate that new mounts will not be able to proceed.
 */
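/*
 * Sketch of how a VFS-layer caller pairs these calls around the actual
 * mount (argument details elided; the exact call site lives in the VFS
 * code, not in this file):
 *
 *	mount_in_progress();
 *	error = VFS_MOUNT(...);
 *	mount_completed();
 */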
/*
 * Prevent new mounts from progressing to the point of calling
 * VFS_MOUNT().  If there are already mounts in this "region", wait for
 * them to complete.
 */
static int
block_mounts(void)
{
	int retval = 0;

	/*
	 * Since it may block for a long time, block_mounts() shouldn't be
	 * called with zonehash_lock held.
	 */
	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
	mutex_enter(&mount_lock);
	while (mounts_in_progress > 0) {
		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
			goto signaled;
	}
	/*
	 * A negative value of mounts_in_progress indicates that mounts
	 * have been blocked by (-mounts_in_progress) different callers.
	 */
	mounts_in_progress--;
	retval = 1;
signaled:
	mutex_exit(&mount_lock);
	return (retval);
}

/*
 * The VFS layer may progress with new mounts as far as we're concerned.
 * Allow them to progress if we were the last obstacle.
 */
static void
resume_mounts(void)
{
	mutex_enter(&mount_lock);
	if (++mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * The VFS layer is busy with a mount; zones should wait until all
 * mounts are completed to progress.
 */
void
mount_in_progress(void)
{
	mutex_enter(&mount_lock);
	while (mounts_in_progress < 0)
		cv_wait(&mount_cv, &mount_lock);
	mounts_in_progress++;
	mutex_exit(&mount_lock);
}

/*
 * VFS is done with one mount; wake up any waiting block_mounts()
 * callers if this is the last mount.
 */
void
mount_completed(void)
{
	mutex_enter(&mount_lock);
	if (--mounts_in_progress == 0)
		cv_broadcast(&mount_cv);
	mutex_exit(&mount_lock);
}

/*
 * ZSD routines.
 *
 * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
 * defined by the pthread_key_create() and related interfaces.
 *
 * Kernel subsystems may register one or more data items and/or
 * callbacks to be executed when a zone is created, shutdown, or
 * destroyed.
 *
 * Unlike the thread counterpart, destructor callbacks will be executed
 * even if the data pointer is NULL and/or there are no constructor
 * callbacks, so it is the responsibility of such callbacks to check for
 * NULL data values if necessary.
 *
 * The locking strategy and overall picture is as follows:
 *
 * When someone calls zone_key_create(), a template ZSD entry is added to the
 * global list "zsd_registered_keys", protected by zsd_key_lock.  While
 * holding that lock all the existing zones are marked as
 * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
 * zone_zsd list (protected by zone_lock).  The global list is updated first
 * (under zsd_key_lock) to make sure that newly created zones use the
 * most recent list of keys.  Then under zonehash_lock we walk the zones
 * and mark them.  Similar locking is used in zone_key_delete().
 *
 * The actual create, shutdown, and destroy callbacks are done without
 * holding any lock.  And zsd_flags are used to ensure that the operations
 * completed so that when zone_key_create (and zone_create) is done, as well as
 * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
 * are completed.
 *
 * When new zones are created constructor callbacks for all registered ZSD
 * entries will be called.  That also uses the above two phases of marking
 * what needs to be done, and then running the callbacks without holding
 * any locks.
 *
 * The framework does not provide any locking around zone_getspecific() and
 * zone_setspecific() apart from that needed for internal consistency, so
 * callers interested in atomic "test-and-set" semantics will need to provide
 * their own locking.
 */
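
/*
 * Example of typical ZSD usage, a sketch for a hypothetical subsystem
 * "foo" (the foo_* names below are illustrative, not part of this file):
 *
 *	static zone_key_t foo_zone_key;
 *
 *	static void *
 *	foo_zone_init(zoneid_t zoneid)
 *	{
 *		return (kmem_zalloc(sizeof (foo_zone_data_t), KM_SLEEP));
 *	}
 *
 *	static void
 *	foo_zone_fini(zoneid_t zoneid, void *data)
 *	{
 *		kmem_free(data, sizeof (foo_zone_data_t));
 *	}
 *
 *	void
 *	foo_init(void)
 *	{
 *		zone_key_create(&foo_zone_key, foo_zone_init, NULL,
 *		    foo_zone_fini);
 *	}
 *
 * after which foo can retrieve its per-zone data with
 * zone_getspecific(foo_zone_key, zone).
 */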

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.
 */
static struct zsd_entry *
zsd_find(list_t *l, zone_key_t key)
{
	struct zsd_entry *zsd;

	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
		if (zsd->zsd_key == key) {
			return (zsd);
		}
	}
	return (NULL);
}

/*
 * Helper function to find the zsd_entry associated with the key in the
 * given list.  Move it to the front of the list.
 */
static struct zsd_entry *
zsd_find_mru(list_t *l, zone_key_t key)
{
	struct zsd_entry *zsd;

	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
		if (zsd->zsd_key == key) {
			/*
			 * Move to head of list to keep list in MRU order.
			 */
			if (zsd != list_head(l)) {
				list_remove(l, zsd);
				list_insert_head(l, zsd);
			}
			return (zsd);
		}
	}
	return (NULL);
}

void
zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;
	struct zone *zone;
	zone_key_t key;

	zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
	zsdp->zsd_data = NULL;
	zsdp->zsd_create = create;
	zsdp->zsd_shutdown = shutdown;
	zsdp->zsd_destroy = destroy;

	/*
	 * Insert in global list of callbacks.  Makes future zone creations
	 * see it.
	 */
	mutex_enter(&zsd_key_lock);
	key = zsdp->zsd_key = ++zsd_keyval;
	ASSERT(zsd_keyval != 0);
	list_insert_tail(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	/*
	 * Insert for all existing zones and mark them as needing
	 * a create callback.
	 */
	mutex_enter(&zonehash_lock);	/* stop the world */
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		zone_status_t status;

		mutex_enter(&zone->zone_lock);

		/* Skip zones that are on the way down or not yet up */
		status = zone_status_get(zone);
		if (status >= ZONE_IS_DOWN ||
		    status == ZONE_IS_UNINITIALIZED) {
			mutex_exit(&zone->zone_lock);
			continue;
		}

		t = zsd_find_mru(&zone->zone_zsd, key);
		if (t != NULL) {
			/*
			 * A zone_zsd_configure already inserted it after
			 * we dropped zsd_key_lock above.
			 */
			mutex_exit(&zone->zone_lock);
			continue;
		}
		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
		t->zsd_key = key;
		t->zsd_create = create;
		t->zsd_shutdown = shutdown;
		t->zsd_destroy = destroy;
		if (create != NULL) {
			t->zsd_flags = ZSD_CREATE_NEEDED;
			DTRACE_PROBE2(zsd__create__needed,
			    zone_t *, zone, zone_key_t, key);
		}
		list_insert_tail(&zone->zone_zsd, t);
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);

	if (create != NULL) {
		/* Now call the create callback for this key */
		zsd_apply_all_zones(zsd_apply_create, key);
	}
	/*
	 * It is safe for consumers to use the key now, make it
	 * globally visible.  Specifically zone_getspecific() will
	 * always successfully return the zone specific data associated
	 * with the key.
	 */
	*keyp = key;
}

/*
 * Function called when a module is being unloaded, or otherwise wishes
 * to unregister its ZSD key and callbacks.
 *
 * Remove from the global list and determine the functions that need to
 * be called under a global lock.  Then call the functions without
 * holding any locks.  Finally free up the zone_zsd entries.  (The apply
 * functions need to access the zone_zsd entries to find zsd_data etc.)
 */
int
zone_key_delete(zone_key_t key)
{
	struct zsd_entry *zsdp = NULL;
	zone_t *zone;

	mutex_enter(&zsd_key_lock);
	zsdp = zsd_find_mru(&zsd_registered_keys, key);
	if (zsdp == NULL) {
		mutex_exit(&zsd_key_lock);
		return (-1);
	}
	list_remove(&zsd_registered_keys, zsdp);
	mutex_exit(&zsd_key_lock);

	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		struct zsd_entry *del;

		mutex_enter(&zone->zone_lock);
		del = zsd_find_mru(&zone->zone_zsd, key);
		if (del == NULL) {
			/*
			 * Somebody else got here first, e.g. the zone going
			 * away.
			 */
			mutex_exit(&zone->zone_lock);
			continue;
		}
		ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
		ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
		if (del->zsd_shutdown != NULL &&
		    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
			del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
			DTRACE_PROBE2(zsd__shutdown__needed,
			    zone_t *, zone, zone_key_t, key);
		}
		if (del->zsd_destroy != NULL &&
		    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
			del->zsd_flags |= ZSD_DESTROY_NEEDED;
			DTRACE_PROBE2(zsd__destroy__needed,
			    zone_t *, zone, zone_key_t, key);
		}
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);
	kmem_free(zsdp, sizeof (*zsdp));

	/* Now call the shutdown and destroy callbacks for this key */
	zsd_apply_all_zones(zsd_apply_shutdown, key);
	zsd_apply_all_zones(zsd_apply_destroy, key);

	/* Now we can free up the zsdp structures in each zone */
	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		struct zsd_entry *del;

		mutex_enter(&zone->zone_lock);
		del = zsd_find(&zone->zone_zsd, key);
		if (del != NULL) {
			list_remove(&zone->zone_zsd, del);
			ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
			kmem_free(del, sizeof (*del));
		}
		mutex_exit(&zone->zone_lock);
	}
	mutex_exit(&zonehash_lock);

	return (0);
}

/*
 * ZSD counterpart of pthread_setspecific().
 *
 * Since all zsd callbacks, including those with no create function,
 * have an entry in zone_zsd, if the key is registered it is part of
 * the zone_zsd list.
 * Return an error if the key wasn't registered.
 */
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
	struct zsd_entry *t;

	mutex_enter(&zone->zone_lock);
	t = zsd_find_mru(&zone->zone_zsd, key);
	if (t != NULL) {
		/*
		 * Replace old value with new
		 */
		t->zsd_data = (void *)data;
		mutex_exit(&zone->zone_lock);
		return (0);
	}
	mutex_exit(&zone->zone_lock);
	return (-1);
}

/*
 * ZSD counterpart of pthread_getspecific().
 */
void *
zone_getspecific(zone_key_t key, zone_t *zone)
{
	struct zsd_entry *t;
	void *data;

	mutex_enter(&zone->zone_lock);
	t = zsd_find_mru(&zone->zone_zsd, key);
	data = (t == NULL ? NULL : t->zsd_data);
	mutex_exit(&zone->zone_lock);
	return (data);
}
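
/*
 * As noted above, no atomicity is provided across a zone_getspecific()/
 * zone_setspecific() pair; a caller wanting "test-and-set" behavior
 * supplies its own lock, e.g. (a sketch; foo_lock, foo_zone_key, and
 * data are hypothetical):
 *
 *	mutex_enter(&foo_lock);
 *	if (zone_getspecific(foo_zone_key, zone) == NULL)
 *		(void) zone_setspecific(foo_zone_key, zone, data);
 *	mutex_exit(&foo_lock);
 */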

/*
 * Function used to initialize a zone's list of ZSD callbacks and data
 * when the zone is being created.  The callbacks are initialized from
 * the template list (zsd_registered_keys).  The constructor callback is
 * executed later (once the zone exists and with locks dropped).
 */
static void
zone_zsd_configure(zone_t *zone)
{
	struct zsd_entry *zsdp;
	struct zsd_entry *t;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	ASSERT(list_head(&zone->zone_zsd) == NULL);
	mutex_enter(&zone->zone_lock);
	mutex_enter(&zsd_key_lock);
	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
		/*
		 * Since this zone is ZONE_IS_UNINITIALIZED, zone_key_create
		 * should not have added anything to it.
		 */
		ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);

		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
		t->zsd_key = zsdp->zsd_key;
		t->zsd_create = zsdp->zsd_create;
		t->zsd_shutdown = zsdp->zsd_shutdown;
		t->zsd_destroy = zsdp->zsd_destroy;
		if (zsdp->zsd_create != NULL) {
			t->zsd_flags = ZSD_CREATE_NEEDED;
			DTRACE_PROBE2(zsd__create__needed,
			    zone_t *, zone, zone_key_t, zsdp->zsd_key);
		}
		list_insert_tail(&zone->zone_zsd, t);
	}
	mutex_exit(&zsd_key_lock);
	mutex_exit(&zone->zone_lock);
}

enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };

/*
 * Helper function to execute shutdown or destructor callbacks.
 */
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
	struct zsd_entry *t;

	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);

	/*
	 * Run the callback solely based on what is registered for the zone
	 * in zone_zsd.  The global list can change independently of this
	 * as keys are registered and unregistered and we don't register new
	 * callbacks for a zone that is in the process of going away.
	 */
	mutex_enter(&zone->zone_lock);
	for (t = list_head(&zone->zone_zsd); t != NULL;
	    t = list_next(&zone->zone_zsd, t)) {
		zone_key_t key = t->zsd_key;

		/* Skip if no callbacks registered */

		if (ct == ZSD_SHUTDOWN) {
			if (t->zsd_shutdown != NULL &&
			    (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
				t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
				DTRACE_PROBE2(zsd__shutdown__needed,
				    zone_t *, zone, zone_key_t, key);
			}
		} else {
			if (t->zsd_destroy != NULL &&
			    (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
				t->zsd_flags |= ZSD_DESTROY_NEEDED;
				DTRACE_PROBE2(zsd__destroy__needed,
				    zone_t *, zone, zone_key_t, key);
			}
		}
	}
	mutex_exit(&zone->zone_lock);

	/* Now call the shutdown and destroy callbacks for this zone */
	zsd_apply_all_keys(zsd_apply_shutdown, zone);
	zsd_apply_all_keys(zsd_apply_destroy, zone);
}

/*
 * Called when the zone is going away; free ZSD-related memory, and
 * destroy the zone_zsd list.
 */
static void
zone_free_zsd(zone_t *zone)
{
	struct zsd_entry *t, *next;

	/*
	 * Free all the zsd_entry's we had on this zone.
	 */
	mutex_enter(&zone->zone_lock);
	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
		next = list_next(&zone->zone_zsd, t);
		list_remove(&zone->zone_zsd, t);
		ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_zsd);
	mutex_exit(&zone->zone_lock);
}

/*
 * Apply a function to all zones for a particular key value.
 *
 * The applyfn has to drop zonehash_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards.  This is
 * because we want the design to allow for the list of zones
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2).
 */
static void
zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
{
	zone_t *zone;

	mutex_enter(&zonehash_lock);
	zone = list_head(&zone_active);
	while (zone != NULL) {
		if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
			/* Lock dropped - restart at head */
			zone = list_head(&zone_active);
		} else {
			zone = list_next(&zone_active, zone);
		}
	}
	mutex_exit(&zonehash_lock);
}

/*
 * Apply a function to all keys for a particular zone.
 *
 * The applyfn has to drop zone_lock if it does some work, and
 * then reacquire it before it returns.
 * When the lock is dropped we don't follow list_next even
 * if it is possible to do so without any hazards.  This is
 * because we want the design to allow for the list of zsd callbacks
 * to change in any arbitrary way during the time the
 * lock was dropped.
 *
 * It is safe to restart the loop at list_head since the applyfn
 * changes the zsd_flags as it does work, so a subsequent
 * pass through will have no effect in applyfn, hence the loop will terminate
 * in at worst O(N^2).
 */
static void
zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
{
	struct zsd_entry *t;

	mutex_enter(&zone->zone_lock);
	t = list_head(&zone->zone_zsd);
	while (t != NULL) {
		if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
			/* Lock dropped - restart at head */
			t = list_head(&zone->zone_zsd);
		} else {
			t = list_next(&zone->zone_zsd, t);
		}
	}
	mutex_exit(&zone->zone_lock);
}

/*
 * Call the create function for the zone and key if CREATE_NEEDED
 * is set.
 * If some other thread gets here first and sets CREATE_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the create function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
	void *result;
	struct zsd_entry *t;
	boolean_t dropped;

	if (lockp != NULL) {
		ASSERT(MUTEX_HELD(lockp));
	}
	if (zone_lock_held) {
		ASSERT(MUTEX_HELD(&zone->zone_lock));
	} else {
		mutex_enter(&zone->zone_lock);
	}

	t = zsd_find(&zone->zone_zsd, key);
	if (t == NULL) {
		/*
		 * Somebody else got here first, e.g. the zone going
		 * away.
		 */
		if (!zone_lock_held)
			mutex_exit(&zone->zone_lock);
		return (B_FALSE);
	}
	dropped = B_FALSE;
	if (zsd_wait_for_inprogress(zone, t, lockp))
		dropped = B_TRUE;

	if (t->zsd_flags & ZSD_CREATE_NEEDED) {
		t->zsd_flags &= ~ZSD_CREATE_NEEDED;
		t->zsd_flags |= ZSD_CREATE_INPROGRESS;
		DTRACE_PROBE2(zsd__create__inprogress,
		    zone_t *, zone, zone_key_t, key);
		mutex_exit(&zone->zone_lock);
		if (lockp != NULL)
			mutex_exit(lockp);

		dropped = B_TRUE;
		ASSERT(t->zsd_create != NULL);
		DTRACE_PROBE2(zsd__create__start,
		    zone_t *, zone, zone_key_t, key);

		result = (*t->zsd_create)(zone->zone_id);

		DTRACE_PROBE2(zsd__create__end,
		    zone_t *, zone, void *, result);

		ASSERT(result != NULL);
		if (lockp != NULL)
			mutex_enter(lockp);
		mutex_enter(&zone->zone_lock);
		t->zsd_data = result;
		t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
		t->zsd_flags |= ZSD_CREATE_COMPLETED;
		cv_broadcast(&t->zsd_cv);
		DTRACE_PROBE2(zsd__create__completed,
		    zone_t *, zone, zone_key_t, key);
	}
	if (!zone_lock_held)
		mutex_exit(&zone->zone_lock);
	return (dropped);
}

/*
 * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the shutdown function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
	struct zsd_entry *t;
	void *data;
	boolean_t dropped;

	if (lockp != NULL) {
		ASSERT(MUTEX_HELD(lockp));
	}
	if (zone_lock_held) {
		ASSERT(MUTEX_HELD(&zone->zone_lock));
	} else {
		mutex_enter(&zone->zone_lock);
	}

	t = zsd_find(&zone->zone_zsd, key);
	if (t == NULL) {
		/*
		 * Somebody else got here first, e.g. the zone going
		 * away.
		 */
		if (!zone_lock_held)
			mutex_exit(&zone->zone_lock);
		return (B_FALSE);
	}
	dropped = B_FALSE;
	if (zsd_wait_for_creator(zone, t, lockp))
		dropped = B_TRUE;

	if (zsd_wait_for_inprogress(zone, t, lockp))
		dropped = B_TRUE;

	if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
		t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
		t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
		DTRACE_PROBE2(zsd__shutdown__inprogress,
		    zone_t *, zone, zone_key_t, key);
		mutex_exit(&zone->zone_lock);
		if (lockp != NULL)
			mutex_exit(lockp);
		dropped = B_TRUE;

		ASSERT(t->zsd_shutdown != NULL);
		data = t->zsd_data;

		DTRACE_PROBE2(zsd__shutdown__start,
		    zone_t *, zone, zone_key_t, key);

		(t->zsd_shutdown)(zone->zone_id, data);
		DTRACE_PROBE2(zsd__shutdown__end,
		    zone_t *, zone, zone_key_t, key);

		if (lockp != NULL)
			mutex_enter(lockp);
		mutex_enter(&zone->zone_lock);
		t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
		t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
		cv_broadcast(&t->zsd_cv);
		DTRACE_PROBE2(zsd__shutdown__completed,
		    zone_t *, zone, zone_key_t, key);
	}
	if (!zone_lock_held)
		mutex_exit(&zone->zone_lock);
	return (dropped);
}

/*
 * Call the destroy function for the zone and key if DESTROY_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the destroy function, we drop the global lock held by the
 * caller, and return true to tell the caller it needs to re-evaluate the
 * state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 */
static boolean_t
zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
	struct zsd_entry *t;
	void *data;
	boolean_t dropped;

	if (lockp != NULL) {
		ASSERT(MUTEX_HELD(lockp));
	}
	if (zone_lock_held) {
		ASSERT(MUTEX_HELD(&zone->zone_lock));
	} else {
		mutex_enter(&zone->zone_lock);
	}

	t = zsd_find(&zone->zone_zsd, key);
	if (t == NULL) {
		/*
		 * Somebody else got here first, e.g. the zone going
		 * away.
		 */
		if (!zone_lock_held)
			mutex_exit(&zone->zone_lock);
		return (B_FALSE);
	}
	dropped = B_FALSE;
	if (zsd_wait_for_creator(zone, t, lockp))
		dropped = B_TRUE;

	if (zsd_wait_for_inprogress(zone, t, lockp))
		dropped = B_TRUE;

	if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
		t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
		t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
		DTRACE_PROBE2(zsd__destroy__inprogress,
		    zone_t *, zone, zone_key_t, key);
		mutex_exit(&zone->zone_lock);
		if (lockp != NULL)
			mutex_exit(lockp);
		dropped = B_TRUE;

		ASSERT(t->zsd_destroy != NULL);
		data = t->zsd_data;
		DTRACE_PROBE2(zsd__destroy__start,
		    zone_t *, zone, zone_key_t, key);

		(t->zsd_destroy)(zone->zone_id, data);
		DTRACE_PROBE2(zsd__destroy__end,
		    zone_t *, zone, zone_key_t, key);

		if (lockp != NULL)
			mutex_enter(lockp);
		mutex_enter(&zone->zone_lock);
		t->zsd_data = NULL;
		t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
		t->zsd_flags |= ZSD_DESTROY_COMPLETED;
		cv_broadcast(&t->zsd_cv);
		DTRACE_PROBE2(zsd__destroy__completed,
		    zone_t *, zone, zone_key_t, key);
	}
	if (!zone_lock_held)
		mutex_exit(&zone->zone_lock);
	return (dropped);
}

/*
 * Wait for any CREATE_NEEDED flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
	boolean_t dropped = B_FALSE;

	while (t->zsd_flags & ZSD_CREATE_NEEDED) {
		DTRACE_PROBE2(zsd__wait__for__creator,
		    zone_t *, zone, struct zsd_entry *, t);
		if (lockp != NULL) {
			dropped = B_TRUE;
			mutex_exit(lockp);
		}
		cv_wait(&t->zsd_cv, &zone->zone_lock);
		if (lockp != NULL) {
			/* First drop zone_lock to preserve order */
			mutex_exit(&zone->zone_lock);
			mutex_enter(lockp);
			mutex_enter(&zone->zone_lock);
		}
	}
	return (dropped);
}

/*
 * Wait for any INPROGRESS flag to be cleared.
 * Returns true if lockp was temporarily dropped while waiting.
 */
static boolean_t
zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
{
	boolean_t dropped = B_FALSE;

	while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
		DTRACE_PROBE2(zsd__wait__for__inprogress,
		    zone_t *, zone, struct zsd_entry *, t);
		if (lockp != NULL) {
			dropped = B_TRUE;
			mutex_exit(lockp);
		}
		cv_wait(&t->zsd_cv, &zone->zone_lock);
		if (lockp != NULL) {
			/* First drop zone_lock to preserve order */
			mutex_exit(&zone->zone_lock);
			mutex_enter(lockp);
			mutex_enter(&zone->zone_lock);
		}
	}
	return (dropped);
}

/*
 * Frees memory associated with the zone dataset list.
 */
static void
zone_free_datasets(zone_t *zone)
{
	zone_dataset_t *t, *next;

	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
		next = list_next(&zone->zone_datasets, t);
		list_remove(&zone->zone_datasets, t);
		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
		kmem_free(t, sizeof (*t));
	}
	list_destroy(&zone->zone_datasets);
}

/*
 * zone.cpu-shares resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	return (p->p_zone->zone_shares);
}

/*ARGSUSED*/
static int
zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);

	e->rcep_p.zone->zone_shares = nv;
	return (0);
}

static rctl_ops_t zone_cpu_shares_ops = {
	rcop_no_action,
	zone_cpu_shares_usage,
	zone_cpu_shares_set,
	rcop_no_test
};

/*
 * zone.cpu-cap resource control support.
 */
/*ARGSUSED*/
static rctl_qty_t
zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	return (cpucaps_zone_get(p->p_zone));
}

/*ARGSUSED*/
static int
zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	zone_t *zone = e->rcep_p.zone;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);

	if (zone == NULL)
		return (0);

	/*
	 * set cap to the new value.
	 */
	return (cpucaps_zone_set(zone, nv));
}

static rctl_ops_t zone_cpu_cap_ops = {
	rcop_no_action,
	zone_cpu_cap_get,
	zone_cpu_cap_set,
	rcop_no_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_lwps_usage(rctl_t *r, proc_t *p)
{
	rctl_qty_t nlwps;
	zone_t *zone = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));

	mutex_enter(&zone->zone_nlwps_lock);
	nlwps = zone->zone_nlwps;
	mutex_exit(&zone->zone_nlwps_lock);

	return (nlwps);
}

/*ARGSUSED*/
static int
zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t nlwps;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
	nlwps = e->rcep_p.zone->zone_nlwps;

	if (nlwps + incr > rcntl->rcv_value)
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_nlwps_ctl = nv;
	return (0);
}

static rctl_ops_t zone_lwps_ops = {
	rcop_no_action,
	zone_lwps_usage,
	zone_lwps_set,
	zone_lwps_test,
};

/*ARGSUSED*/
static rctl_qty_t
zone_procs_usage(rctl_t *r, proc_t *p)
{
	rctl_qty_t nprocs;
	zone_t *zone = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));

	mutex_enter(&zone->zone_nlwps_lock);
	nprocs = zone->zone_nprocs;
	mutex_exit(&zone->zone_nlwps_lock);

	return (nprocs);
}

/*ARGSUSED*/
static int
zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t nprocs;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
	nprocs = e->rcep_p.zone->zone_nprocs;

	if (nprocs + incr > rcntl->rcv_value)
		return (1);

	return (0);
}

/*ARGSUSED*/
static int
zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_nprocs_ctl = nv;
	return (0);
}

static rctl_ops_t zone_procs_ops = {
	rcop_no_action,
	zone_procs_usage,
	zone_procs_set,
	zone_procs_test,
};

/*ARGSUSED*/
static int
zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t v;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	v = e->rcep_p.zone->zone_shmmax + incr;
	if (v > rval->rcv_value)
		return (1);
	return (0);
}

static rctl_ops_t zone_shmmax_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_shmmax_test
};

/*ARGSUSED*/
static int
zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t v;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
	if (v > rval->rcv_value)
		return (1);
	return (0);
}

static rctl_ops_t zone_shmmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_shmmni_test
};

/*ARGSUSED*/
static int
zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t v;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
	if (v > rval->rcv_value)
		return (1);
	return (0);
}

static rctl_ops_t zone_semmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_semmni_test
};

/*ARGSUSED*/
static int
zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
    rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t v;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
	if (v > rval->rcv_value)
		return (1);
	return (0);
}

static rctl_ops_t zone_msgmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_msgmni_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
{
	rctl_qty_t q;
	ASSERT(MUTEX_HELD(&p->p_lock));
	mutex_enter(&p->p_zone->zone_mem_lock);
	q = p->p_zone->zone_locked_mem;
	mutex_exit(&p->p_zone->zone_mem_lock);
	return (q);
}

/*ARGSUSED*/
static int
zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t q;
	zone_t *z;

	z = e->rcep_p.zone;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
	q = z->zone_locked_mem;
	if (q + incr > rcntl->rcv_value)
		return (1);
	return (0);
}

/*ARGSUSED*/
static int
zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_locked_mem_ctl = nv;
	return (0);
}

static rctl_ops_t zone_locked_mem_ops = {
	rcop_no_action,
	zone_locked_mem_usage,
	zone_locked_mem_set,
	zone_locked_mem_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_max_swap_usage(rctl_t *rctl, struct proc *p)
{
	rctl_qty_t q;
	zone_t *z = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));
	mutex_enter(&z->zone_mem_lock);
	q = z->zone_max_swap;
	mutex_exit(&z->zone_mem_lock);
	return (q);
}

/*ARGSUSED*/
static int
zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t q;
	zone_t *z;

	z = e->rcep_p.zone;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
	q = z->zone_max_swap;
	if (q + incr > rcntl->rcv_value)
		return (1);
	return (0);
}

/*ARGSUSED*/
static int
zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_max_swap_ctl = nv;
	return (0);
}

static rctl_ops_t zone_max_swap_ops = {
	rcop_no_action,
	zone_max_swap_usage,
	zone_max_swap_set,
	zone_max_swap_test
};

/*ARGSUSED*/
static rctl_qty_t
zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
{
	rctl_qty_t q;
	zone_t *z = p->p_zone;

	ASSERT(MUTEX_HELD(&p->p_lock));
	mutex_enter(&z->zone_rctl_lock);
	q = z->zone_max_lofi;
	mutex_exit(&z->zone_rctl_lock);
	return (q);
}

/*ARGSUSED*/
static int
zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
    rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
{
	rctl_qty_t q;
	zone_t *z;

	z = e->rcep_p.zone;
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
	q = z->zone_max_lofi;
	if (q + incr > rcntl->rcv_value)
		return (1);
	return (0);
}

/*ARGSUSED*/
static int
zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
    rctl_qty_t nv)
{
	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(e->rcep_t == RCENTITY_ZONE);
	if (e->rcep_p.zone == NULL)
		return (0);
	e->rcep_p.zone->zone_max_lofi_ctl = nv;
	return (0);
}

static rctl_ops_t zone_max_lofi_ops = {
	rcop_no_action,
	zone_max_lofi_usage,
	zone_max_lofi_set,
	zone_max_lofi_test
};

/*
 * Helper function to brand the zone with a unique ID.
 */
static void
zone_uniqid(zone_t *zone)
{
	static uint64_t uniqid = 0;

	ASSERT(MUTEX_HELD(&zonehash_lock));
	zone->zone_uniqid = uniqid++;
}

/*
 * Returns a held pointer to the "kcred" for the specified zone.
 */
struct cred *
zone_get_kcred(zoneid_t zoneid)
{
	zone_t *zone;
	cred_t *cr;

	if ((zone = zone_find_by_id(zoneid)) == NULL)
		return (NULL);
	cr = zone->zone_kcred;
	crhold(cr);
	zone_rele(zone);
	return (cr);
}

static int
zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
{
	zone_t *zone = ksp->ks_private;
	zone_kstat_t *zk = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	zk->zk_usage.value.ui64 = zone->zone_locked_mem;
	zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
	return (0);
}

static int
zone_nprocs_kstat_update(kstat_t *ksp, int rw)
{
	zone_t *zone = ksp->ks_private;
	zone_kstat_t *zk = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	zk->zk_usage.value.ui64 = zone->zone_nprocs;
	zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
	return (0);
}

static int
zone_swapresv_kstat_update(kstat_t *ksp, int rw)
{
	zone_t *zone = ksp->ks_private;
	zone_kstat_t *zk = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	zk->zk_usage.value.ui64 = zone->zone_max_swap;
	zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
	return (0);
}

static kstat_t *
zone_kstat_create_common(zone_t *zone, char *name,
    int (*updatefunc) (kstat_t *, int))
{
	kstat_t *ksp;
	zone_kstat_t *zk;

	ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (ksp == NULL)
		return (NULL);

	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
	ksp->ks_data_size += strlen(zone->zone_name) + 1;
	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
	ksp->ks_update = updatefunc;
	ksp->ks_private = zone;
	kstat_install(ksp);
	return (ksp);
}

static void
zone_kstat_create(zone_t *zone)
{
	zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
	    "lockedmem", zone_lockedmem_kstat_update);
	zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
	    "swapresv", zone_swapresv_kstat_update);
	zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
	    "nprocs", zone_nprocs_kstat_update);
}

static void
zone_kstat_delete_common(kstat_t **pkstat)
{
	void *data;

	if (*pkstat != NULL) {
		data = (*pkstat)->ks_data;
		kstat_delete(*pkstat);
		kmem_free(data, sizeof (zone_kstat_t));
		*pkstat = NULL;
	}
}

static void
zone_kstat_delete(zone_t *zone)
{
	zone_kstat_delete_common(&zone->zone_lockedmem_kstat);
	zone_kstat_delete_common(&zone->zone_swapresv_kstat);
	zone_kstat_delete_common(&zone->zone_nprocs_kstat);
}

/*
 * Called very early on in boot to initialize the ZSD list so that
 * zone_key_create() can be called before zone_init().  It also initializes
 * portions of zone0 which may be used before zone_init() is called.  The
 * variable "global_zone" will be set when zone0 is fully initialized by
 * zone_init().
 */
void
zone_zsd_init(void)
{
	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_create(&zone_active, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));
	list_create(&zone_deathrow, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));

	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
	zone0.zone_shares = 1;
	zone0.zone_nlwps = 0;
	zone0.zone_nlwps_ctl = INT_MAX;
	zone0.zone_nprocs = 0;
	zone0.zone_nprocs_ctl = INT_MAX;
	zone0.zone_locked_mem = 0;
	zone0.zone_locked_mem_ctl = UINT64_MAX;
	ASSERT(zone0.zone_max_swap == 0);
	zone0.zone_max_swap_ctl = UINT64_MAX;
	zone0.zone_max_lofi = 0;
	zone0.zone_max_lofi_ctl = UINT64_MAX;
	zone0.zone_shmmax = 0;
	zone0.zone_ipc.ipcq_shmmni = 0;
	zone0.zone_ipc.ipcq_semmni = 0;
	zone0.zone_ipc.ipcq_msgmni = 0;
	zone0.zone_name = GLOBAL_ZONENAME;
	zone0.zone_nodename = utsname.nodename;
	zone0.zone_domain = srpc_domain;
	zone0.zone_hostid = HW_INVALID_HOSTID;
	zone0.zone_fs_allowed = NULL;
	zone0.zone_ref = 1;
	zone0.zone_id = GLOBAL_ZONEID;
	zone0.zone_status = ZONE_IS_RUNNING;
	zone0.zone_rootpath = "/";
	zone0.zone_rootpathlen = 2;
	zone0.zone_psetid = ZONE_PS_INVAL;
	zone0.zone_ncpus = 0;
	zone0.zone_ncpus_online = 0;
	zone0.zone_proc_initpid = 1;
	zone0.zone_initname = initname;
	zone0.zone_lockedmem_kstat = NULL;
	zone0.zone_swapresv_kstat = NULL;
	zone0.zone_nprocs_kstat = NULL;
	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_insert_head(&zone_active, &zone0);

	/*
	 * The root filesystem is not mounted yet, so zone_rootvp cannot be
	 * set to anything meaningful.  It is assigned to be 'rootdir' in
	 * vfs_mountroot().
	 */
	zone0.zone_rootvp = NULL;
	zone0.zone_vfslist = NULL;
	zone0.zone_bootargs = initargs;
	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
	/*
	 * The global zone has all privileges.
	 */
	priv_fillset(zone0.zone_privset);
	/*
	 * Add p0 to the global zone.
	 */
	zone0.zone_zsched = &p0;
	p0.p_zone = &zone0;
}

/*
 * Compute a hash value based on the contents of the label and the DOI.  The
 * hash algorithm is somewhat arbitrary, but is based on the observation that
 * humans will likely pick labels that differ by amounts that work out to be
 * multiples of the number of hash chains, and thus stirring in some primes
 * should help.
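 *
 * Note that each step of the loop below computes hash += *up + (*up << n),
 * which is simply hash += *up * (2^n + 1); with n cycling over 1..16 the
 * successive words are multiplied by 3, 5, 9, 17, ..., 65537, many of
 * which are prime.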
 */
static uint_t
hash_bylabel(void *hdata, mod_hash_key_t key)
{
	const ts_label_t *lab = (ts_label_t *)key;
	const uint32_t *up, *ue;
	uint_t hash;
	int i;

	_NOTE(ARGUNUSED(hdata));

	hash = lab->tsl_doi + (lab->tsl_doi << 1);
	/* we depend on alignment of label, but not representation */
	up = (const uint32_t *)&lab->tsl_label;
	ue = up + sizeof (lab->tsl_label) / sizeof (*up);
	i = 1;
	while (up < ue) {
		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
		hash += *up + (*up << ((i % 16) + 1));
		up++;
		i++;
	}
	return (hash);
}

/*
 * All that mod_hash cares about here is zero (equal) versus non-zero (not
 * equal).  This may need to be changed if less than / greater than is ever
 * needed.
 */
static int
hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
{
	ts_label_t *lab1 = (ts_label_t *)key1;
	ts_label_t *lab2 = (ts_label_t *)key2;

	return (label_equal(lab1, lab2) ? 0 : 1);
}

/*
 * Called by main() to initialize the zones framework.
 */
void
zone_init(void)
{
	rctl_dict_entry_t *rde;
	rctl_val_t *dval;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;
	int res;

	ASSERT(curproc == &p0);

	/*
	 * Create ID space for zone IDs.  ID 0 is reserved for the
	 * global zone.
	 */
	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);

	/*
	 * Initialize generic zone resource controls, if any.
	 */
	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
	    FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);

	rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER |
	    RCTL_GLOBAL_INFINITE,
	    MAXCAP, MAXCAP, &zone_cpu_cap_ops);

	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_lwps_ops);

	rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_procs_ops);

	/*
	 * System V IPC resource controls
	 */
	rc_zone_msgmni = rctl_register("zone.max-msg-ids",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);

	rc_zone_semmni = rctl_register("zone.max-sem-ids",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);

	rc_zone_shmmni = rctl_register("zone.max-shm-ids",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);

	rc_zone_shmmax = rctl_register("zone.max-shm-memory",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);

	/*
	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
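	 *
	 * The effect is that every zone gets a default of one CPU share
	 * without explicit configuration.  For illustration, from userland
	 * this default can be observed, and replaced, roughly as follows
	 * (the zone name "myzone" is hypothetical):
	 *
	 *	prctl -n zone.cpu-shares -i zone myzone
	 *	prctl -n zone.cpu-shares -r -v 10 -i zone myzone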
	 */
	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
	bzero(dval, sizeof (rctl_val_t));
	dval->rcv_value = 1;
	dval->rcv_privilege = RCPRIV_PRIVILEGED;
	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
	dval->rcv_action_recip_pid = -1;

	rde = rctl_dict_lookup("zone.cpu-shares");
	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);

	rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
	    &zone_locked_mem_ops);

	rc_zone_max_swap = rctl_register("zone.max-swap",
	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
	    &zone_max_swap_ops);

	rc_zone_max_lofi = rctl_register("zone.max-lofi",
	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
	    &zone_max_lofi_ops);

	/*
	 * Initialize the ``global zone''.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&p0.p_lock);
	e.rcep_p.zone = &zone0;
	e.rcep_t = RCENTITY_ZONE;
	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set, gp);

	zone0.zone_nlwps = p0.p_lwpcnt;
	zone0.zone_nprocs = 1;
	zone0.zone_ntasks = 1;
	mutex_exit(&p0.p_lock);
	zone0.zone_restart_init = B_TRUE;
	zone0.zone_brand = &native_brand;
	rctl_prealloc_destroy(gp);
	/*
	 * pool_default hasn't been initialized yet, so we let pool_init()
	 * take care of making sure the global zone is in the default pool.
	 */

	/*
	 * Initialize global zone kstats.
	 */
	zone_kstat_create(&zone0);

	/*
	 * Initialize zone label.
	 * mlp are initialized when tnzonecfg is loaded.
	 */
	zone0.zone_slabel = l_admin_low;
	rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
	label_hold(l_admin_low);

	/*
	 * Initialize the lock for the database structure used by mntfs.
	 */
	rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);

	mutex_enter(&zonehash_lock);
	zone_uniqid(&zone0);
	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);

	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
	    mod_hash_null_valdtor);
	zonehashbyname = mod_hash_create_strhash("zone_by_name",
	    zone_hash_size, mod_hash_null_valdtor);
	/*
	 * maintain zonehashbylabel only for labeled systems
	 */
	if (is_system_labeled())
		zonehashbylabel = mod_hash_create_extended("zone_by_label",
		    zone_hash_size, mod_hash_null_keydtor,
		    mod_hash_null_valdtor, hash_bylabel, NULL,
		    hash_labelkey_cmp, KM_SLEEP);
	zonecount = 1;

	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
	    (mod_hash_val_t)&zone0);
	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
	    (mod_hash_val_t)&zone0);
	if (is_system_labeled()) {
		zone0.zone_flags |= ZF_HASHED_LABEL;
		(void) mod_hash_insert(zonehashbylabel,
		    (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
	}
	mutex_exit(&zonehash_lock);

	/*
	 * We avoid setting zone_kcred until now, since kcred is initialized
	 * sometime after zone_zsd_init() and before zone_init().
	 */
	zone0.zone_kcred = kcred;
	/*
	 * The global zone is fully initialized (except for zone_rootvp which
	 * will be set when the root filesystem is mounted).
	 */
	global_zone = &zone0;

	/*
	 * Set up an event channel on which to send zone status change
	 * notifications.
	 */
	res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
	    EVCH_CREAT);

	if (res)
		panic("Sysevent_evc_bind failed during zone setup.\n");
}

static void
zone_free(zone_t *zone)
{
	ASSERT(zone != global_zone);
	ASSERT(zone->zone_ntasks == 0);
	ASSERT(zone->zone_nlwps == 0);
	ASSERT(zone->zone_nprocs == 0);
	ASSERT(zone->zone_cred_ref == 0);
	ASSERT(zone->zone_kcred == NULL);
	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);

	/*
	 * Remove any zone caps.
	 */
	cpucaps_zone_remove(zone);

	ASSERT(zone->zone_cpucap == NULL);

	/* remove from deathrow list */
	if (zone_status_get(zone) == ZONE_IS_DEAD) {
		ASSERT(zone->zone_ref == 0);
		mutex_enter(&zone_deathrow_lock);
		list_remove(&zone_deathrow, zone);
		mutex_exit(&zone_deathrow_lock);
	}

	zone_free_zsd(zone);
	zone_free_datasets(zone);
	list_destroy(&zone->zone_dl_list);

	if (zone->zone_rootvp != NULL)
		VN_RELE(zone->zone_rootvp);
	if (zone->zone_rootpath)
		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
	if (zone->zone_name != NULL)
		kmem_free(zone->zone_name, ZONENAME_MAX);
	if (zone->zone_slabel != NULL)
		label_rele(zone->zone_slabel);
	if (zone->zone_nodename != NULL)
		kmem_free(zone->zone_nodename, _SYS_NMLN);
	if (zone->zone_domain != NULL)
		kmem_free(zone->zone_domain, _SYS_NMLN);
	if (zone->zone_privset != NULL)
		kmem_free(zone->zone_privset, sizeof (priv_set_t));
	if (zone->zone_rctls != NULL)
		rctl_set_free(zone->zone_rctls);
	if (zone->zone_bootargs != NULL)
		strfree(zone->zone_bootargs);
	if (zone->zone_initname != NULL)
		strfree(zone->zone_initname);
	if (zone->zone_fs_allowed != NULL)
		strfree(zone->zone_fs_allowed);
	if (zone->zone_pfexecd != NULL)
		klpd_freelist(&zone->zone_pfexecd);
	id_free(zoneid_space, zone->zone_id);
	mutex_destroy(&zone->zone_lock);
	cv_destroy(&zone->zone_cv);
	rw_destroy(&zone->zone_mlps.mlpl_rwlock);
	rw_destroy(&zone->zone_mntfs_db_lock);
	kmem_free(zone, sizeof (zone_t));
}

/*
 * Convenience function for setting zone status.  See the block comment at
 * the top of this file for information about zone status values.
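 *
 * As a side effect, zone_status_set() publishes a sysevent on
 * ZONE_EVENT_CHANNEL carrying the zone name (ZONE_CB_NAME), the old and new
 * states (ZONE_CB_OLDSTATE/ZONE_CB_NEWSTATE), the zone id (ZONE_CB_ZONEID)
 * and a timestamp (ZONE_CB_TIMESTAMP); userland consumers can subscribe to
 * the channel via libsysevent.  See the nvlist construction below.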
 */
static void
zone_status_set(zone_t *zone, zone_status_t status)
{
	nvlist_t *nvl = NULL;

	ASSERT(MUTEX_HELD(&zone_status_lock));
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
	    status >= zone_status_get(zone));

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
	    nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
	    zone_status_table[status]) ||
	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
	    zone_status_table[zone->zone_status]) ||
	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
#ifdef DEBUG
		(void) printf(
		    "Failed to allocate and send zone state change event.\n");
#endif
	}
	nvlist_free(nvl);

	zone->zone_status = status;

	cv_broadcast(&zone->zone_cv);
}

/*
 * Public function to retrieve the zone status.  The zone status may
 * change after it is retrieved.
 */
zone_status_t
zone_status_get(zone_t *zone)
{
	return (zone->zone_status);
}

static int
zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
{
	char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
	int err = 0;

	ASSERT(zone != global_zone);
	if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
		goto done;	/* EFAULT or ENAMETOOLONG */

	if (zone->zone_bootargs != NULL)
		strfree(zone->zone_bootargs);

	zone->zone_bootargs = strdup(buf);

done:
	kmem_free(buf, BOOTARGS_MAX);
	return (err);
}

static int
zone_set_brand(zone_t *zone, const char *brand)
{
	struct brand_attr *attrp;
	brand_t *bp;

	attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
	if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
		kmem_free(attrp, sizeof (struct brand_attr));
		return (EFAULT);
	}

	bp = brand_register_zone(attrp);
	kmem_free(attrp, sizeof (struct brand_attr));
	if (bp == NULL)
		return (EINVAL);

	/*
	 * This is the only place where a zone can change its brand.
	 * We already need to hold zone_status_lock to check the zone
	 * status, so we'll just use that lock to serialize zone
	 * branding requests as well.
	 */
	mutex_enter(&zone_status_lock);

	/* Re-branding is not allowed and the zone can't be booted yet */
	if ((ZONE_IS_BRANDED(zone)) ||
	    (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
		mutex_exit(&zone_status_lock);
		brand_unregister_zone(bp);
		return (EINVAL);
	}

	/* set up the brand specific data */
	zone->zone_brand = bp;
	ZBROP(zone)->b_init_brand_data(zone);

	mutex_exit(&zone_status_lock);
	return (0);
}

static int
zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
{
	char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
	int err = 0;

	ASSERT(zone != global_zone);
	if ((err = copyinstr(zone_fs_allowed, buf,
	    ZONE_FS_ALLOWED_MAX, NULL)) != 0)
		goto done;

	if (zone->zone_fs_allowed != NULL)
		strfree(zone->zone_fs_allowed);

	zone->zone_fs_allowed = strdup(buf);

done:
	kmem_free(buf, ZONE_FS_ALLOWED_MAX);
	return (err);
}

static int
zone_set_initname(zone_t *zone, const char *zone_initname)
{
	char initname[INITNAME_SZ];
	size_t len;
	int err = 0;

	ASSERT(zone != global_zone);
	if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
		return (err);	/* EFAULT or ENAMETOOLONG */

	if (zone->zone_initname != NULL)
		strfree(zone->zone_initname);

	zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
	(void) strcpy(zone->zone_initname, initname);
	return (0);
}

static int
zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
{
	uint64_t mcap;
	int err = 0;

	if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
		zone->zone_phys_mcap = mcap;

	return (err);
}

static int
zone_set_sched_class(zone_t *zone, const char *new_class)
{
	char sched_class[PC_CLNMSZ];
	id_t classid;
	int err;

	ASSERT(zone != global_zone);
	if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
		return (err);	/* EFAULT or ENAMETOOLONG */

	if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
		return (set_errno(EINVAL));
	zone->zone_defaultcid = classid;
	ASSERT(zone->zone_defaultcid > 0 &&
	    zone->zone_defaultcid < loaded_classes);

	return (0);
}

/*
 * Block indefinitely waiting for (zone_status >= status)
 */
void
zone_status_wait(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		cv_wait(&zone->zone_cv, &zone_status_lock);
	}
	mutex_exit(&zone_status_lock);
}

/*
 * Private CPR-safe version of zone_status_wait().
 */
static void
zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
{
	callb_cpr_t cprinfo;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr, str);
	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&zone->zone_cv, &zone_status_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
	}
	/*
	 * zone_status_lock is implicitly released by the following.
	 */
	CALLB_CPR_EXIT(&cprinfo);
}

/*
 * Block until zone enters requested state or signal is received.  Return (0)
 * if signaled, non-zero otherwise.
 */
int
zone_status_wait_sig(zone_t *zone, zone_status_t status)
{
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
			mutex_exit(&zone_status_lock);
			return (0);
		}
	}
	mutex_exit(&zone_status_lock);
	return (1);
}

/*
 * Block until the zone enters the requested state or the timeout expires,
 * whichever happens first.  Return (-1) if operation timed out, time remaining
 * otherwise.
 */
clock_t
zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
{
	clock_t timeleft = 0;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status && timeleft != -1) {
		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
	}
	mutex_exit(&zone_status_lock);
	return (timeleft);
}

/*
 * Block until the zone enters the requested state, the current process is
 * signaled, or the timeout expires, whichever happens first.  Return (-1) if
 * operation timed out, 0 if signaled, time remaining otherwise.
 */
clock_t
zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
{
	clock_t timeleft = tim - ddi_get_lbolt();

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
		    tim);
		if (timeleft <= 0)
			break;
	}
	mutex_exit(&zone_status_lock);
	return (timeleft);
}

/*
 * Zones have two reference counts: one for references from credential
 * structures (zone_cred_ref), and one (zone_ref) for everything else.
 * This is so we can allow a zone to be rebooted while there are still
 * outstanding cred references, since certain drivers cache dblks (which
 * implicitly results in cached creds).  We wait for zone_ref to drop to
 * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
 * later freed when the zone_cred_ref drops to 0, though nothing other
 * than the zone id and privilege set should be accessed once the zone
 * is "dead".
 *
 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
 * to force halt/reboot to block waiting for the zone_cred_ref to drop
 * to 0.  This can be useful to flush out other sources of cached creds
 * that may be less innocuous than the driver case.
 */

int zone_wait_for_cred = 0;

static void
zone_hold_locked(zone_t *z)
{
	ASSERT(MUTEX_HELD(&z->zone_lock));
	z->zone_ref++;
	ASSERT(z->zone_ref != 0);
}

void
zone_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	zone_hold_locked(z);
	mutex_exit(&z->zone_lock);
}

/*
 * If the non-cred ref count drops to 1 and either the cred ref count
 * is 0 or we aren't waiting for cred references, the zone is ready to
 * be destroyed.
 */
#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
	(!zone_wait_for_cred || (zone)->zone_cred_ref == 0))

void
zone_rele(zone_t *z)
{
	boolean_t wakeup;

	mutex_enter(&z->zone_lock);
	ASSERT(z->zone_ref != 0);
	z->zone_ref--;
	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
		/* no more refs, free the structure */
		mutex_exit(&z->zone_lock);
		zone_free(z);
		return;
	}
	/* signal zone_destroy so the zone can finish halting */
	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
	mutex_exit(&z->zone_lock);

	if (wakeup) {
		/*
		 * Grabbing zonehash_lock here effectively synchronizes with
		 * zone_destroy() to avoid missed signals.
		 */
		mutex_enter(&zonehash_lock);
		cv_broadcast(&zone_destroy_cv);
		mutex_exit(&zonehash_lock);
	}
}

void
zone_cred_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	z->zone_cred_ref++;
	ASSERT(z->zone_cred_ref != 0);
	mutex_exit(&z->zone_lock);
}

void
zone_cred_rele(zone_t *z)
{
	boolean_t wakeup;

	mutex_enter(&z->zone_lock);
	ASSERT(z->zone_cred_ref != 0);
	z->zone_cred_ref--;
	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
		/* no more refs, free the structure */
		mutex_exit(&z->zone_lock);
		zone_free(z);
		return;
	}
	/*
	 * If zone_destroy is waiting for the cred references to drain
	 * out, and they have, signal it.
	 */
	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
	    zone_status_get(z) >= ZONE_IS_DEAD);
	mutex_exit(&z->zone_lock);

	if (wakeup) {
		/*
		 * Grabbing zonehash_lock here effectively synchronizes with
		 * zone_destroy() to avoid missed signals.
		 */
		mutex_enter(&zonehash_lock);
		cv_broadcast(&zone_destroy_cv);
		mutex_exit(&zonehash_lock);
	}
}

void
zone_task_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	z->zone_ntasks++;
	ASSERT(z->zone_ntasks != 0);
	mutex_exit(&z->zone_lock);
}

void
zone_task_rele(zone_t *zone)
{
	uint_t refcnt;

	mutex_enter(&zone->zone_lock);
	ASSERT(zone->zone_ntasks != 0);
	refcnt = --zone->zone_ntasks;
	if (refcnt > 1) {	/* Common case */
		mutex_exit(&zone->zone_lock);
		return;
	}
	zone_hold_locked(zone);	/* so we can use the zone_t later */
	mutex_exit(&zone->zone_lock);
	if (refcnt == 1) {
		/*
		 * See if the zone is shutting down.
		 */
		mutex_enter(&zone_status_lock);
		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
			goto out;
		}

		/*
		 * Make sure the ntasks didn't change since we
		 * dropped zone_lock.
		 */
		mutex_enter(&zone->zone_lock);
		if (refcnt != zone->zone_ntasks) {
			mutex_exit(&zone->zone_lock);
			goto out;
		}
		mutex_exit(&zone->zone_lock);

		/*
		 * No more user processes in the zone.  The zone is empty.
		 */
		zone_status_set(zone, ZONE_IS_EMPTY);
		goto out;
	}

	ASSERT(refcnt == 0);
	/*
	 * zsched has exited; the zone is dead.
	 */
	zone->zone_zsched = NULL;		/* paranoia */
	mutex_enter(&zone_status_lock);
	zone_status_set(zone, ZONE_IS_DEAD);
out:
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
}

zoneid_t
getzoneid(void)
{
	return (curproc->p_zone->zone_id);
}

/*
 * Internal versions of zone_find_by_*().  These don't zone_hold() or
 * check the validity of a zone's state.
 */
static zone_t *
zone_find_all_by_id(zoneid_t zoneid)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	if (mod_hash_find(zonehashbyid,
	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

static zone_t *
zone_find_all_by_label(const ts_label_t *label)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	/*
	 * zonehashbylabel is not maintained for unlabeled systems
	 */
	if (!is_system_labeled())
		return (NULL);
	if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

static zone_t *
zone_find_all_by_name(char *name)
{
	mod_hash_val_t hv;
	zone_t *zone = NULL;

	ASSERT(MUTEX_HELD(&zonehash_lock));

	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
		zone = (zone_t *)hv;
	return (zone);
}

/*
 * Public interface for looking up a zone by zoneid.  Only returns the zone if
 * it is fully initialized, and has not yet begun the zone_destroy() sequence.
 * Caller must call zone_rele() once it is done with the zone.
 *
 * The zone may begin the zone_destroy() sequence immediately after this
 * function returns, but may be safely used until zone_rele() is called.
 */
zone_t *
zone_find_by_id(zoneid_t zoneid)
{
	zone_t *zone;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	status = zone_status_get(zone);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * For all practical purposes the zone doesn't exist.
		 */
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}

/*
 * Similar to zone_find_by_id, but using zone label as the key.
 */
zone_t *
zone_find_by_label(const ts_label_t *label)
{
	zone_t *zone;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_label(label)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (NULL);
	}

	status = zone_status_get(zone);
	if (status > ZONE_IS_DOWN) {
		/*
		 * For all practical purposes the zone doesn't exist.
		 */
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}

/*
 * Similar to zone_find_by_id, but using zone name as the key.
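 *
 * Illustrative lookup pattern (the zone name is hypothetical).  The
 * returned zone is held, so a matching zone_rele() is required:
 *
 *	zone_t *zp;
 *
 *	if ((zp = zone_find_by_name("myzone")) != NULL) {
 *		... zp cannot be freed out from under us while held ...
 *		zone_rele(zp);
 *	}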
 */
zone_t *
zone_find_by_name(char *name)
{
	zone_t *zone;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	if ((zone = zone_find_all_by_name(name)) == NULL) {
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	status = zone_status_get(zone);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * For all practical purposes the zone doesn't exist.
		 */
		mutex_exit(&zonehash_lock);
		return (NULL);
	}
	zone_hold(zone);
	mutex_exit(&zonehash_lock);
	return (zone);
}

/*
 * Similar to zone_find_by_id(), using the path as a key.  For instance,
 * if there is a zone "foo" rooted at /foo/root, and the path argument
 * is "/foo/root/proc", it will return the held zone_t corresponding to
 * zone "foo".
 *
 * zone_find_by_path() always returns a non-NULL value, since at the
 * very least every path will be contained in the global zone.
 *
 * As with the other zone_find_by_*() functions, the caller is
 * responsible for zone_rele()ing the return value of this function.
 */
zone_t *
zone_find_by_path(const char *path)
{
	zone_t *zone;
	zone_t *zret = NULL;
	zone_status_t status;

	if (path == NULL) {
		/*
		 * Call from rootconf().
		 */
		zone_hold(global_zone);
		return (global_zone);
	}
	ASSERT(*path == '/');
	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		if (ZONE_PATH_VISIBLE(path, zone))
			zret = zone;
	}
	ASSERT(zret != NULL);
	status = zone_status_get(zret);
	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
		/*
		 * Zone practically doesn't exist.
		 */
		zret = global_zone;
	}
	zone_hold(zret);
	mutex_exit(&zonehash_lock);
	return (zret);
}

/*
 * Get the number of cpus visible to this zone.  The system-wide global
 * 'ncpus' is returned if pools are disabled, the caller is in the
 * global zone, or a NULL zone argument is passed in.
 */
int
zone_ncpus_get(zone_t *zone)
{
	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;

	return (myncpus != 0 ? myncpus : ncpus);
}

/*
 * Get the number of online cpus visible to this zone.  The system-wide
 * global 'ncpus_online' is returned if pools are disabled, the caller
 * is in the global zone, or a NULL zone argument is passed in.
 */
int
zone_ncpus_online_get(zone_t *zone)
{
	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;

	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
}

/*
 * Return the pool to which the zone is currently bound.
 */
pool_t *
zone_pool_get(zone_t *zone)
{
	ASSERT(pool_lock_held());

	return (zone->zone_pool);
}

/*
 * Set the zone's pool pointer and update the zone's visibility to match
 * the resources in the new pool.
 */
void
zone_pool_set(zone_t *zone, pool_t *pool)
{
	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	zone->zone_pool = pool;
	zone_pset_set(zone, pool->pool_pset->pset_id);
}

/*
 * Return the cached value of the id of the processor set to which the
 * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
 * facility is disabled.
 */
psetid_t
zone_pset_get(zone_t *zone)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	return (zone->zone_psetid);
}

/*
 * Set the cached value of the id of the processor set to which the zone
 * is currently bound.  Also update the zone's visibility to match the
 * resources in the new processor set.
 */
void
zone_pset_set(zone_t *zone, psetid_t newpsetid)
{
	psetid_t oldpsetid;

	ASSERT(MUTEX_HELD(&cpu_lock));
	oldpsetid = zone_pset_get(zone);

	if (oldpsetid == newpsetid)
		return;
	/*
	 * Global zone sees all.
	 */
	if (zone != global_zone) {
		zone->zone_psetid = newpsetid;
		if (newpsetid != ZONE_PS_INVAL)
			pool_pset_visibility_add(newpsetid, zone);
		if (oldpsetid != ZONE_PS_INVAL)
			pool_pset_visibility_remove(oldpsetid, zone);
	}
	/*
	 * Disabling pools, so we should start using the global values
	 * for ncpus and ncpus_online.
	 */
	if (newpsetid == ZONE_PS_INVAL) {
		zone->zone_ncpus = 0;
		zone->zone_ncpus_online = 0;
	}
}

/*
 * Walk the list of active zones and issue the provided callback for
 * each of them.
 *
 * Caller must not be holding any locks that may be acquired under
 * zonehash_lock.  See comment at the beginning of the file for a list of
 * common locks and their interactions with zones.
 */
int
zone_walk(int (*cb)(zone_t *, void *), void *data)
{
	zone_t *zone;
	int ret = 0;
	zone_status_t status;

	mutex_enter(&zonehash_lock);
	for (zone = list_head(&zone_active); zone != NULL;
	    zone = list_next(&zone_active, zone)) {
		/*
		 * Skip zones that shouldn't be externally visible.
		 */
		status = zone_status_get(zone);
		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
			continue;
		/*
		 * Bail immediately if any callback invocation returns a
		 * non-zero value.
		 */
		ret = (*cb)(zone, data);
		if (ret != 0)
			break;
	}
	mutex_exit(&zonehash_lock);
	return (ret);
}

static int
zone_set_root(zone_t *zone, const char *upath)
{
	vnode_t *vp;
	int trycount;
	int error = 0;
	char *path;
	struct pathname upn, pn;
	size_t pathlen;

	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
		return (error);

	pn_alloc(&pn);

	/* prevent infinite loop */
	trycount = 10;
	for (;;) {
		if (--trycount <= 0) {
			error = ESTALE;
			goto out;
		}

		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
			/*
			 * VOP_ACCESS() may cover 'vp' with a new
			 * filesystem, if 'vp' is an autoFS vnode.
			 * Get the new 'vp' if so.
			 */
			if ((error =
			    VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
			    (!vn_ismntpt(vp) ||
			    (error = traverse(&vp)) == 0)) {
				pathlen = pn.pn_pathlen + 2;
				path = kmem_alloc(pathlen, KM_SLEEP);
				(void) strncpy(path, pn.pn_path,
				    pn.pn_pathlen + 1);
				path[pathlen - 2] = '/';
				path[pathlen - 1] = '\0';
				pn_free(&pn);
				pn_free(&upn);

				/* Success! */
				break;
			}
			VN_RELE(vp);
		}
		if (error != ESTALE)
			goto out;
	}

	ASSERT(error == 0);
	zone->zone_rootvp = vp;		/* we hold a reference to vp */
	zone->zone_rootpath = path;
	zone->zone_rootpathlen = pathlen;
	if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
		zone->zone_flags |= ZF_IS_SCRATCH;
	return (0);

out:
	pn_free(&pn);
	pn_free(&upn);
	return (error);
}

#define	isalnum(c)	(((c) >= '0' && (c) <= '9') || \
			((c) >= 'a' && (c) <= 'z') || \
			((c) >= 'A' && (c) <= 'Z'))

static int
zone_set_name(zone_t *zone, const char *uname)
{
	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
	size_t len;
	int i, err;

	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
		kmem_free(kname, ZONENAME_MAX);
		return (err);	/* EFAULT or ENAMETOOLONG */
	}

	/* must be less than ZONENAME_MAX */
	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
		kmem_free(kname, ZONENAME_MAX);
		return (EINVAL);
	}

	/*
	 * Name must start with an alphanumeric and must contain only
	 * alphanumerics, '-', '_' and '.'.
	 */
	if (!isalnum(kname[0])) {
		kmem_free(kname, ZONENAME_MAX);
		return (EINVAL);
	}
	for (i = 1; i < len - 1; i++) {
		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
		    kname[i] != '.') {
			kmem_free(kname, ZONENAME_MAX);
			return (EINVAL);
		}
	}

	zone->zone_name = kname;
	return (0);
}

/*
 * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
 * is NULL or it points to a zone with no hostid emulation, then the machine's
 * hostid (i.e., the global zone's hostid) is returned.  This function returns
 * zero if neither the zone nor the host machine (global zone) have hostids.
 * It returns HW_INVALID_HOSTID if the function attempts to return the
 * machine's hostid and the machine's hostid is invalid.
 */
uint32_t
zone_get_hostid(zone_t *zonep)
{
	unsigned long machine_hostid;

	if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
		if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
			return (HW_INVALID_HOSTID);
		return ((uint32_t)machine_hostid);
	}
	return (zonep->zone_hostid);
}

/*
 * Similar to thread_create(), but makes sure the thread is in the appropriate
 * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
 */
/*ARGSUSED*/
kthread_t *
zthread_create(
    caddr_t stk,
    size_t stksize,
    void (*proc)(),
    void *arg,
    size_t len,
    pri_t pri)
{
	kthread_t *t;
	zone_t *zone = curproc->p_zone;
	proc_t *pp = zone->zone_zsched;

	zone_hold(zone);	/* Reference to be dropped when thread exits */

	/*
	 * No-one should be trying to create threads if the zone is shutting
	 * down and there aren't any kernel threads around.  See comment
	 * in zthread_exit().
	 */
	ASSERT(!(zone->zone_kthreads == NULL &&
	    zone_status_get(zone) >= ZONE_IS_EMPTY));
	/*
	 * Create a thread, but don't let it run until we've finished setting
	 * things up.
	 */
	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
	ASSERT(t->t_forw == NULL);
	mutex_enter(&zone_status_lock);
	if (zone->zone_kthreads == NULL) {
		t->t_forw = t->t_back = t;
	} else {
		kthread_t *tx = zone->zone_kthreads;

		t->t_forw = tx;
		t->t_back = tx->t_back;
		tx->t_back->t_forw = t;
		tx->t_back = t;
	}
	zone->zone_kthreads = t;
	mutex_exit(&zone_status_lock);

	mutex_enter(&pp->p_lock);
	t->t_proc_flag |= TP_ZTHREAD;
	project_rele(t->t_proj);
	t->t_proj = project_hold(pp->p_task->tk_proj);

	/*
	 * Setup complete, let it run.
	 */
	thread_lock(t);
	t->t_schedflag |= TS_ALLSTART;
	setrun_locked(t);
	thread_unlock(t);

	mutex_exit(&pp->p_lock);

	return (t);
}

/*
 * Similar to thread_exit().  Must be called by threads created via
 * zthread_create().
 */
void
zthread_exit(void)
{
	kthread_t *t = curthread;
	proc_t *pp = curproc;
	zone_t *zone = pp->p_zone;

	mutex_enter(&zone_status_lock);

	/*
	 * Reparent to p0
	 */
	kpreempt_disable();
	mutex_enter(&pp->p_lock);
	t->t_proc_flag &= ~TP_ZTHREAD;
	t->t_procp = &p0;
	hat_thread_exit(t);
	mutex_exit(&pp->p_lock);
	kpreempt_enable();

	if (t->t_back == t) {
		ASSERT(t->t_forw == t);
		/*
		 * If the zone is empty, once the thread count
		 * goes to zero no further kernel threads can be
		 * created.  This is because if the creator is a process
		 * in the zone, then it must have exited before the zone
		 * state could be set to ZONE_IS_EMPTY.
		 * Otherwise, if the creator is a kernel thread in the
		 * zone, the thread count is non-zero.
		 *
		 * This really means that non-zone kernel threads should
		 * not create zone kernel threads.
		 */
		zone->zone_kthreads = NULL;
		if (zone_status_get(zone) == ZONE_IS_EMPTY) {
			zone_status_set(zone, ZONE_IS_DOWN);
			/*
			 * Remove any CPU caps on this zone.
			 */
			cpucaps_zone_remove(zone);
		}
	} else {
		t->t_forw->t_back = t->t_back;
		t->t_back->t_forw = t->t_forw;
		if (zone->zone_kthreads == t)
			zone->zone_kthreads = t->t_forw;
	}
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
	thread_exit();
	/* NOTREACHED */
}

static void
zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
{
	vnode_t *oldvp;

	/* we're going to hold a reference here to the directory */
	VN_HOLD(vp);

	/* update abs cwd/root path see c2/audit.c */
	if (AU_AUDITING())
		audit_chdirec(vp, vpp);

	mutex_enter(&pp->p_lock);
	oldvp = *vpp;
	*vpp = vp;
	mutex_exit(&pp->p_lock);
	if (oldvp != NULL)
		VN_RELE(oldvp);
}

/*
 * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
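 *
 * The nvlist is expected to carry exactly three uint64 pairs; an
 * illustrative value (the limit shown is arbitrary) might look like:
 *
 *	privilege = RCPRIV_PRIVILEGED
 *	limit = 1000
 *	action = RCTL_LOCAL_DENY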
 */
static int
nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
{
	nvpair_t *nvp = NULL;
	boolean_t priv_set = B_FALSE;
	boolean_t limit_set = B_FALSE;
	boolean_t action_set = B_FALSE;

	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		const char *name;
		uint64_t ui64;

		name = nvpair_name(nvp);
		if (nvpair_type(nvp) != DATA_TYPE_UINT64)
			return (EINVAL);
		(void) nvpair_value_uint64(nvp, &ui64);
		if (strcmp(name, "privilege") == 0) {
			/*
			 * Currently only privileged values are allowed, but
			 * this may change in the future.
			 */
			if (ui64 != RCPRIV_PRIVILEGED)
				return (EINVAL);
			rv->rcv_privilege = ui64;
			priv_set = B_TRUE;
		} else if (strcmp(name, "limit") == 0) {
			rv->rcv_value = ui64;
			limit_set = B_TRUE;
		} else if (strcmp(name, "action") == 0) {
			if (ui64 != RCTL_LOCAL_NOACTION &&
			    ui64 != RCTL_LOCAL_DENY)
				return (EINVAL);
			rv->rcv_flagaction = ui64;
			action_set = B_TRUE;
		} else {
			return (EINVAL);
		}
	}

	if (!(priv_set && limit_set && action_set))
		return (EINVAL);
	rv->rcv_action_signal = 0;
	rv->rcv_action_recipient = NULL;
	rv->rcv_action_recip_pid = -1;
	rv->rcv_firing_time = 0;

	return (0);
}

/*
 * Non-global zone version of start_init.
 */
void
zone_start_init(void)
{
	proc_t *p = ttoproc(curthread);
	zone_t *z = p->p_zone;

	ASSERT(!INGLOBALZONE(curproc));

	/*
	 * For all purposes (ZONE_ATTR_INITPID and restart_init),
	 * storing just the pid of init is sufficient.
	 */
	z->zone_proc_initpid = p->p_pid;

	/*
	 * We maintain zone_boot_err so that we can return the cause of the
	 * failure back to the caller of the zone_boot syscall.
	 */
	p->p_zone->zone_boot_err = start_init_common();

	/*
	 * We will prevent booting zones from becoming running zones if the
	 * global zone is shutting down.
	 */
	mutex_enter(&zone_status_lock);
	if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
	    ZONE_IS_SHUTTING_DOWN) {
		/*
		 * Make sure we are still in the booting state-- we could have
		 * raced and already be shutting down, or even further along.
		 */
		if (zone_status_get(z) == ZONE_IS_BOOTING) {
			zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
		}
		mutex_exit(&zone_status_lock);
		/* It's gone bad, dispose of the process */
		if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
			mutex_enter(&p->p_lock);
			ASSERT(p->p_flag & SEXITLWPS);
			lwp_exit();
		}
	} else {
		if (zone_status_get(z) == ZONE_IS_BOOTING)
			zone_status_set(z, ZONE_IS_RUNNING);
		mutex_exit(&zone_status_lock);
		/* cause the process to return to userland. */
		lwp_rtt();
	}
}

struct zsched_arg {
	zone_t *zone;
	nvlist_t *nvlist;
};

/*
 * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
 * anything to do with scheduling, but rather with the fact that
 * per-zone kernel threads are parented to zsched, just like regular
 * kernel threads are parented to sched (p0).
 *
 * zsched is also responsible for launching init for the zone.
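 *
 * For illustration, a per-zone kernel thread would be created and torn
 * down roughly as follows ("my_worker" is a hypothetical function, and the
 * creating thread must already be executing in the context of a process
 * in the zone):
 *
 *	static void
 *	my_worker(void)
 *	{
 *		... do work on behalf of the zone ...
 *		zthread_exit();
 *	}
 *
 *	(void) zthread_create(NULL, 0, my_worker, NULL, 0, minclsyspri);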
 */
static void
zsched(void *arg)
{
	struct zsched_arg *za = arg;
	proc_t *pp = curproc;
	proc_t *initp = proc_init;
	zone_t *zone = za->zone;
	cred_t *cr, *oldcred;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	contract_t *ct = NULL;
	task_t *tk, *oldtk;
	rctl_entity_p_t e;
	kproject_t *pj;

	nvlist_t *nvl = za->nvlist;
	nvpair_t *nvp = NULL;

	bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
	bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
	PTOU(pp)->u_argc = 0;
	PTOU(pp)->u_argv = NULL;
	PTOU(pp)->u_envp = NULL;
	closeall(P_FINFO(pp));

	/*
	 * We are this zone's "zsched" process.  As the zone isn't generally
	 * visible yet we don't need to grab any locks before initializing its
	 * zone_proc pointer.
	 */
	zone_hold(zone);  /* this hold is released by zone_destroy() */
	zone->zone_zsched = pp;
	mutex_enter(&pp->p_lock);
	pp->p_zone = zone;
	mutex_exit(&pp->p_lock);

	/*
	 * Disassociate process from its 'parent'; parent ourselves to init
	 * (pid 1) and change other values as needed.
	 */
	sess_create();

	mutex_enter(&pidlock);
	proc_detach(pp);
	pp->p_ppid = 1;
	pp->p_flag |= SZONETOP;
	pp->p_ancpid = 1;
	pp->p_parent = initp;
	pp->p_psibling = NULL;
	if (initp->p_child)
		initp->p_child->p_psibling = pp;
	pp->p_sibling = initp->p_child;
	initp->p_child = pp;

	/* Decrement what newproc() incremented. */
	upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
	/*
	 * Our credentials are about to become kcred-like, so we don't care
	 * about the caller's ruid.
	 */
	upcount_inc(crgetruid(kcred), zone->zone_id);
	mutex_exit(&pidlock);

	/*
	 * getting out of global zone, so decrement lwp and process counts
	 */
	pj = pp->p_task->tk_proj;
	mutex_enter(&global_zone->zone_nlwps_lock);
	pj->kpj_nlwps -= pp->p_lwpcnt;
	global_zone->zone_nlwps -= pp->p_lwpcnt;
	pj->kpj_nprocs--;
	global_zone->zone_nprocs--;
	mutex_exit(&global_zone->zone_nlwps_lock);

	/*
	 * Decrement locked memory counts on old zone and project.
	 */
	mutex_enter(&global_zone->zone_mem_lock);
	global_zone->zone_locked_mem -= pp->p_locked_mem;
	pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
	mutex_exit(&global_zone->zone_mem_lock);

	/*
	 * Create and join a new task in project '0' of this zone.
	 *
	 * We don't need to call holdlwps() since we know we're the only lwp in
	 * this process.
	 *
	 * task_join() returns with p_lock held.
	 */
	tk = task_create(0, zone);
	mutex_enter(&cpu_lock);
	oldtk = task_join(tk, 0);

	pj = pp->p_task->tk_proj;

	mutex_enter(&zone->zone_mem_lock);
	zone->zone_locked_mem += pp->p_locked_mem;
	pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
	mutex_exit(&zone->zone_mem_lock);

	/*
	 * add lwp and process counts to zsched's zone, and increment
	 * project's task and process count due to the task created in
	 * the above task_create.
	 */
	mutex_enter(&zone->zone_nlwps_lock);
	pj->kpj_nlwps += pp->p_lwpcnt;
	pj->kpj_ntasks += 1;
	zone->zone_nlwps += pp->p_lwpcnt;
	pj->kpj_nprocs++;
	zone->zone_nprocs++;
	mutex_exit(&zone->zone_nlwps_lock);

	mutex_exit(&curproc->p_lock);
	mutex_exit(&cpu_lock);
	task_rele(oldtk);

	/*
	 * The process was created by a process in the global zone, hence the
	 * credentials are wrong.  We might as well have kcred-ish credentials.
	 */
	cr = zone->zone_kcred;
	crhold(cr);
	mutex_enter(&pp->p_crlock);
	oldcred = pp->p_cred;
	pp->p_cred = cr;
	mutex_exit(&pp->p_crlock);
	crfree(oldcred);

	/*
	 * Hold credentials again (for thread)
	 */
	crhold(cr);

	/*
	 * p_lwpcnt can't change since this is a kernel process.
	 */
	crset(pp, cr);

	/*
	 * Chroot
	 */
	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);

	/*
	 * Initialize zone's rctl set.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&pp->p_lock);
	e.rcep_p.zone = zone;
	e.rcep_t = RCENTITY_ZONE;
	zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
	mutex_exit(&pp->p_lock);
	rctl_prealloc_destroy(gp);

	/*
	 * Apply the rctls passed in to zone_create().  This is basically a list
	 * assignment: all of the old values are removed and the new ones
	 * inserted.  That is, if an empty list is passed in, all values are
	 * removed.
	 */
	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		rctl_dict_entry_t *rde;
		rctl_hndl_t hndl;
		char *name;
		nvlist_t **nvlarray;
		uint_t i, nelem;
		int error;	/* For ASSERT()s */

		name = nvpair_name(nvp);
		hndl = rctl_hndl_lookup(name);
		ASSERT(hndl != -1);
		rde = rctl_dict_lookup_hndl(hndl);
		ASSERT(rde != NULL);

		for (; /* ever */; ) {
			rctl_val_t oval;

			mutex_enter(&pp->p_lock);
			error = rctl_local_get(hndl, NULL, &oval, pp);
			mutex_exit(&pp->p_lock);
			ASSERT(error == 0);	/* Can't fail for RCTL_FIRST */
			ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
			if (oval.rcv_privilege == RCPRIV_SYSTEM)
				break;
			mutex_enter(&pp->p_lock);
			error = rctl_local_delete(hndl, &oval, pp);
			mutex_exit(&pp->p_lock);
			ASSERT(error == 0);
		}
		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
		ASSERT(error == 0);
		for (i = 0; i < nelem; i++) {
			rctl_val_t *nvalp;

			nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
			error = nvlist2rctlval(nvlarray[i], nvalp);
			ASSERT(error == 0);
			/*
			 * rctl_local_insert can fail if the value being
			 * inserted is a duplicate; this is OK.
			 */
			mutex_enter(&pp->p_lock);
			if (rctl_local_insert(hndl, nvalp, pp) != 0)
				kmem_cache_free(rctl_val_cache, nvalp);
			mutex_exit(&pp->p_lock);
		}
	}
	/*
	 * Tell the world that we're done setting up.
	 *
	 * At this point we want to set the zone status to ZONE_IS_INITIALIZED
	 * and atomically set the zone's processor set visibility.  Once
	 * we drop pool_lock() this zone will automatically get updated
	 * to reflect any future changes to the pools configuration.
	 *
	 * Note that after we drop the locks below (zonehash_lock in
	 * particular) other operations such as a zone_getattr call can
	 * now proceed and observe the zone.  That is the reason for doing a
	 * state transition to the INITIALIZED state.
	 */
	pool_lock();
	mutex_enter(&cpu_lock);
	mutex_enter(&zonehash_lock);
	zone_uniqid(zone);
	zone_zsd_configure(zone);
	if (pool_state == POOL_ENABLED)
		zone_pset_set(zone, pool_default->pool_pset->pset_id);
	mutex_enter(&zone_status_lock);
	ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
	zone_status_set(zone, ZONE_IS_INITIALIZED);
	mutex_exit(&zone_status_lock);
	mutex_exit(&zonehash_lock);
	mutex_exit(&cpu_lock);
	pool_unlock();

	/* Now call the create callback for this key */
	zsd_apply_all_keys(zsd_apply_create, zone);

	/* The callbacks are complete.  Mark ZONE_IS_READY */
	mutex_enter(&zone_status_lock);
	ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
	zone_status_set(zone, ZONE_IS_READY);
	mutex_exit(&zone_status_lock);

	/*
	 * Once we see the zone transition to the ZONE_IS_BOOTING state,
	 * we launch init, and set the state to running.
	 */
	zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");

	if (zone_status_get(zone) == ZONE_IS_BOOTING) {
		id_t cid;

		/*
		 * Ok, this is a little complicated.  We need to grab the
		 * zone's pool's scheduling class ID; note that by now, we
		 * are already bound to a pool if we need to be (zoneadmd
		 * will have done that to us while we're in the READY
		 * state).  *But* the scheduling class for the zone's 'init'
		 * must be explicitly passed to newproc, which doesn't
		 * respect pool bindings.
		 *
		 * We hold the pool_lock across the call to newproc() to
		 * close the obvious race: the pool's scheduling class
		 * could change before we manage to create the LWP with
		 * classid 'cid'.
		 */
		pool_lock();
		if (zone->zone_defaultcid > 0)
			cid = zone->zone_defaultcid;
		else
			cid = pool_get_class(zone->zone_pool);
		if (cid == -1)
			cid = defaultcid;

		/*
		 * If this fails, zone_boot will ultimately fail.  The
		 * state of the zone will be set to SHUTTING_DOWN-- userland
		 * will have to tear down the zone, and fail, or try again.
		 */
		if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
		    minclsyspri - 1, &ct, 0)) != 0) {
			mutex_enter(&zone_status_lock);
			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
			mutex_exit(&zone_status_lock);
		}
		pool_unlock();
	}

	/*
	 * Wait for zone_destroy() to be called.  This is what we spend
	 * most of our life doing.
	 */
	zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");

	if (ct)
		/*
		 * At this point the process contract should be empty.
		 * (Though if it isn't, it's not the end of the world.)
		 */
		VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);

	/*
	 * Allow kcred to be freed when all referring processes
	 * (including this one) go away.  We can't just do this in
	 * zone_free because we need to wait for the zone_cred_ref to
	 * drop to 0 before calling zone_free, and the existence of
	 * zone_kcred will prevent that.  Thus, we call crfree here to
	 * balance the crdup in zone_create.  The crhold calls earlier
	 * in zsched will be dropped when the thread and process exit.
3684 */ 3685 crfree(zone->zone_kcred); 3686 zone->zone_kcred = NULL; 3687 3688 exit(CLD_EXITED, 0); 3689 } 3690 3691 /* 3692 * Helper function to determine if there are any submounts of the 3693 * provided path. Used to make sure the zone doesn't "inherit" any 3694 * mounts from before it is created. 3695 */ 3696 static uint_t 3697 zone_mount_count(const char *rootpath) 3698 { 3699 vfs_t *vfsp; 3700 uint_t count = 0; 3701 size_t rootpathlen = strlen(rootpath); 3702 3703 /* 3704 * Holding zonehash_lock prevents race conditions with 3705 * vfs_list_add()/vfs_list_remove() since we serialize with 3706 * zone_find_by_path(). 3707 */ 3708 ASSERT(MUTEX_HELD(&zonehash_lock)); 3709 /* 3710 * The rootpath must end with a '/' 3711 */ 3712 ASSERT(rootpath[rootpathlen - 1] == '/'); 3713 3714 /* 3715 * This intentionally does not count the rootpath itself if that 3716 * happens to be a mount point. 3717 */ 3718 vfs_list_read_lock(); 3719 vfsp = rootvfs; 3720 do { 3721 if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt), 3722 rootpathlen) == 0) 3723 count++; 3724 vfsp = vfsp->vfs_next; 3725 } while (vfsp != rootvfs); 3726 vfs_list_unlock(); 3727 return (count); 3728 } 3729 3730 /* 3731 * Helper function to make sure that a zone created on 'rootpath' 3732 * wouldn't end up containing other zones' rootpaths. 3733 */ 3734 static boolean_t 3735 zone_is_nested(const char *rootpath) 3736 { 3737 zone_t *zone; 3738 size_t rootpathlen = strlen(rootpath); 3739 size_t len; 3740 3741 ASSERT(MUTEX_HELD(&zonehash_lock)); 3742 3743 /* 3744 * zone_set_root() appended '/' and '\0' at the end of rootpath 3745 */ 3746 if ((rootpathlen <= 3) && (rootpath[0] == '/') && 3747 (rootpath[1] == '/') && (rootpath[2] == '\0')) 3748 return (B_TRUE); 3749 3750 for (zone = list_head(&zone_active); zone != NULL; 3751 zone = list_next(&zone_active, zone)) { 3752 if (zone == global_zone) 3753 continue; 3754 len = strlen(zone->zone_rootpath); 3755 if (strncmp(rootpath, zone->zone_rootpath, 3756 MIN(rootpathlen, len)) == 0) 3757 return (B_TRUE); 3758 } 3759 return (B_FALSE); 3760 } 3761 3762 static int 3763 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs, 3764 size_t zone_privssz) 3765 { 3766 priv_set_t *privs; 3767 3768 if (zone_privssz < sizeof (priv_set_t)) 3769 return (ENOMEM); 3770 3771 privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP); 3772 3773 if (copyin(zone_privs, privs, sizeof (priv_set_t))) { 3774 kmem_free(privs, sizeof (priv_set_t)); 3775 return (EFAULT); 3776 } 3777 3778 zone->zone_privset = privs; 3779 return (0); 3780 } 3781 3782 /* 3783 * We make creative use of nvlists to pass in rctls from userland. 
The list is 3784 * a list of the following structures: 3785 * 3786 * (name = rctl_name, value = nvpair_list_array) 3787 * 3788 * Where each element of the nvpair_list_array is of the form: 3789 * 3790 * [(name = "privilege", value = RCPRIV_PRIVILEGED), 3791 * (name = "limit", value = uint64_t), 3792 * (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))] 3793 */ 3794 static int 3795 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) 3796 { 3797 nvpair_t *nvp = NULL; 3798 nvlist_t *nvl = NULL; 3799 char *kbuf; 3800 int error; 3801 rctl_val_t rv; 3802 3803 *nvlp = NULL; 3804 3805 if (buflen == 0) 3806 return (0); 3807 3808 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 3809 return (ENOMEM); 3810 if (copyin(ubuf, kbuf, buflen)) { 3811 error = EFAULT; 3812 goto out; 3813 } 3814 if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) { 3815 /* 3816 * nvlist_unpack() may have allocated and freed nvl while 3817 * leaving the pointer non-NULL, so we reset it here. 3818 */ 3819 nvl = NULL; 3820 error = EINVAL; 3821 goto out; 3822 } 3823 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 3824 rctl_dict_entry_t *rde; 3825 rctl_hndl_t hndl; 3826 nvlist_t **nvlarray; 3827 uint_t i, nelem; 3828 char *name; 3829 3830 error = EINVAL; 3831 name = nvpair_name(nvp); 3832 if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) 3833 != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { 3834 goto out; 3835 } 3836 if ((hndl = rctl_hndl_lookup(name)) == -1) { 3837 goto out; 3838 } 3839 rde = rctl_dict_lookup_hndl(hndl); 3840 error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem); 3841 ASSERT(error == 0); 3842 for (i = 0; i < nelem; i++) { 3843 if (error = nvlist2rctlval(nvlarray[i], &rv)) 3844 goto out; 3845 } 3846 if (rctl_invalid_value(rde, &rv)) { 3847 error = EINVAL; 3848 goto out; 3849 } 3850 } 3851 error = 0; 3852 *nvlp = nvl; 3853 out: 3854 kmem_free(kbuf, buflen); 3855 if (error && nvl != NULL) 3856 nvlist_free(nvl); 3857 return (error); 3858 } 3859 3860 int 3861 zone_create_error(int er_error, int er_ext, int *er_out) { 3862 if (er_out != NULL) { 3863 if (copyout(&er_ext, er_out, sizeof (int))) { 3864 return (set_errno(EFAULT)); 3865 } 3866 } 3867 return (set_errno(er_error)); 3868 } 3869 3870 static int 3871 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi) 3872 { 3873 ts_label_t *tsl; 3874 bslabel_t blab; 3875 3876 /* Get label from user */ 3877 if (copyin(lab, &blab, sizeof (blab)) != 0) 3878 return (EFAULT); 3879 tsl = labelalloc(&blab, doi, KM_NOSLEEP); 3880 if (tsl == NULL) 3881 return (ENOMEM); 3882 3883 zone->zone_slabel = tsl; 3884 return (0); 3885 } 3886
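/*
 * For example, the rctl buffer consumed by parse_rctls() above is built in
 * userland with libnvpair.  A minimal sketch (illustrative only; error
 * handling omitted, and "zone.max-lwps" is merely a sample rctl name):
 *
 *	nvlist_t *nvl, *val;
 *	char *buf = NULL;
 *	size_t buflen = 0;
 *
 *	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_alloc(&val, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_uint64(val, "privilege", RCPRIV_PRIVILEGED);
 *	(void) nvlist_add_uint64(val, "limit", 1000);
 *	(void) nvlist_add_uint64(val, "action", RCTL_LOCAL_DENY);
 *	(void) nvlist_add_nvlist_array(nvl, "zone.max-lwps", &val, 1);
 *	(void) nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_NATIVE, 0);
 *
 * buf and buflen then travel to zone_create() as rctlbuf and rctlbufsz.
 */
3887 /* 3888 * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.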
3889 */ 3890 static int 3891 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen) 3892 { 3893 char *kbuf; 3894 char *dataset, *next; 3895 zone_dataset_t *zd; 3896 size_t len; 3897 3898 if (ubuf == NULL || buflen == 0) 3899 return (0); 3900 3901 if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL) 3902 return (ENOMEM); 3903 3904 if (copyin(ubuf, kbuf, buflen) != 0) { 3905 kmem_free(kbuf, buflen); 3906 return (EFAULT); 3907 } 3908 3909 dataset = next = kbuf; 3910 for (;;) { 3911 zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP); 3912 3913 next = strchr(dataset, ','); 3914 3915 if (next == NULL) 3916 len = strlen(dataset); 3917 else 3918 len = next - dataset; 3919 3920 zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP); 3921 bcopy(dataset, zd->zd_dataset, len); 3922 zd->zd_dataset[len] = '\0'; 3923 3924 list_insert_head(&zone->zone_datasets, zd); 3925 3926 if (next == NULL) 3927 break; 3928 3929 dataset = next + 1; 3930 } 3931 3932 kmem_free(kbuf, buflen); 3933 return (0); 3934 } 3935 3936 /* 3937 * System call to create/initialize a new zone named 'zone_name', rooted 3938 * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs', 3939 * and initialized with the zone-wide rctls described in 'rctlbuf', and 3940 * with labeling set by 'match', 'doi', and 'label'. 3941 * 3942 * If extended error is non-null, we may use it to return more detailed 3943 * error information. 3944 */ 3945 static zoneid_t 3946 zone_create(const char *zone_name, const char *zone_root, 3947 const priv_set_t *zone_privs, size_t zone_privssz, 3948 caddr_t rctlbuf, size_t rctlbufsz, 3949 caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, 3950 int match, uint32_t doi, const bslabel_t *label, 3951 int flags) 3952 { 3953 struct zsched_arg zarg; 3954 nvlist_t *rctls = NULL; 3955 proc_t *pp = curproc; 3956 zone_t *zone, *ztmp; 3957 zoneid_t zoneid; 3958 int error; 3959 int error2 = 0; 3960 char *str; 3961 cred_t *zkcr; 3962 boolean_t insert_label_hash; 3963 3964 if (secpolicy_zone_config(CRED()) != 0) 3965 return (set_errno(EPERM)); 3966 3967 /* can't boot zone from within chroot environment */ 3968 if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir) 3969 return (zone_create_error(ENOTSUP, ZE_CHROOTED, 3970 extended_error)); 3971 3972 zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); 3973 zoneid = zone->zone_id = id_alloc(zoneid_space); 3974 zone->zone_status = ZONE_IS_UNINITIALIZED; 3975 zone->zone_pool = pool_default; 3976 zone->zone_pool_mod = gethrtime(); 3977 zone->zone_psetid = ZONE_PS_INVAL; 3978 zone->zone_ncpus = 0; 3979 zone->zone_ncpus_online = 0; 3980 zone->zone_restart_init = B_TRUE; 3981 zone->zone_brand = &native_brand; 3982 zone->zone_initname = NULL; 3983 mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); 3984 mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); 3985 mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); 3986 cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); 3987 list_create(&zone->zone_zsd, sizeof (struct zsd_entry), 3988 offsetof(struct zsd_entry, zsd_linkage)); 3989 list_create(&zone->zone_datasets, sizeof (zone_dataset_t), 3990 offsetof(zone_dataset_t, zd_linkage)); 3991 list_create(&zone->zone_dl_list, sizeof (zone_dl_t), 3992 offsetof(zone_dl_t, zdl_linkage)); 3993 rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL); 3994 rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL); 3995 3996 if (flags & ZCF_NET_EXCL) { 3997 zone->zone_flags |= ZF_NET_EXCL; 3998 } 3999 4000 if ((error = zone_set_name(zone, zone_name)) != 0) { 4001 
zone_free(zone); 4002 return (zone_create_error(error, 0, extended_error)); 4003 } 4004 4005 if ((error = zone_set_root(zone, zone_root)) != 0) { 4006 zone_free(zone); 4007 return (zone_create_error(error, 0, extended_error)); 4008 } 4009 if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) { 4010 zone_free(zone); 4011 return (zone_create_error(error, 0, extended_error)); 4012 } 4013 4014 /* initialize node name to be the same as zone name */ 4015 zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP); 4016 (void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN); 4017 zone->zone_nodename[_SYS_NMLN - 1] = '\0'; 4018 4019 zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP); 4020 zone->zone_domain[0] = '\0'; 4021 zone->zone_hostid = HW_INVALID_HOSTID; 4022 zone->zone_shares = 1; 4023 zone->zone_shmmax = 0; 4024 zone->zone_ipc.ipcq_shmmni = 0; 4025 zone->zone_ipc.ipcq_semmni = 0; 4026 zone->zone_ipc.ipcq_msgmni = 0; 4027 zone->zone_bootargs = NULL; 4028 zone->zone_fs_allowed = NULL; 4029 zone->zone_initname = 4030 kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP); 4031 (void) strcpy(zone->zone_initname, zone_default_initname); 4032 zone->zone_nlwps = 0; 4033 zone->zone_nlwps_ctl = INT_MAX; 4034 zone->zone_nprocs = 0; 4035 zone->zone_nprocs_ctl = INT_MAX; 4036 zone->zone_locked_mem = 0; 4037 zone->zone_locked_mem_ctl = UINT64_MAX; 4038 zone->zone_max_swap = 0; 4039 zone->zone_max_swap_ctl = UINT64_MAX; 4040 zone->zone_max_lofi = 0; 4041 zone->zone_max_lofi_ctl = UINT64_MAX; 4042 zone->zone_lockedmem_kstat = NULL; 4043 zone->zone_swapresv_kstat = NULL; 4044 4045 /* 4046 * Zsched initializes the rctls. 4047 */ 4048 zone->zone_rctls = NULL; 4049 4050 if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { 4051 zone_free(zone); 4052 return (zone_create_error(error, 0, extended_error)); 4053 } 4054 4055 if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) { 4056 zone_free(zone); 4057 return (set_errno(error)); 4058 } 4059 4060 /* 4061 * Read in the trusted system parameters: 4062 * match flag and sensitivity label. 4063 */ 4064 zone->zone_match = match; 4065 if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) { 4066 /* Fail if requested to set doi to anything but system's doi */ 4067 if (doi != 0 && doi != default_doi) { 4068 zone_free(zone); 4069 return (set_errno(EINVAL)); 4070 } 4071 /* Always apply system's doi to the zone */ 4072 error = zone_set_label(zone, label, default_doi); 4073 if (error != 0) { 4074 zone_free(zone); 4075 return (set_errno(error)); 4076 } 4077 insert_label_hash = B_TRUE; 4078 } else { 4079 /* all zones get an admin_low label if system is not labeled */ 4080 zone->zone_slabel = l_admin_low; 4081 label_hold(l_admin_low); 4082 insert_label_hash = B_FALSE; 4083 } 4084 4085 /* 4086 * Stop all lwps since that's what normally happens as part of fork(). 4087 * This needs to happen before we grab any locks to avoid deadlock 4088 * (another lwp in the process could be waiting for the held lock). 4089 */ 4090 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) { 4091 zone_free(zone); 4092 if (rctls) 4093 nvlist_free(rctls); 4094 return (zone_create_error(EINTR, 0, extended_error)); 4095 } 4096 4097 if (block_mounts() == 0) { 4098 mutex_enter(&pp->p_lock); 4099 if (curthread != pp->p_agenttp) 4100 continuelwps(pp); 4101 mutex_exit(&pp->p_lock); 4102 zone_free(zone); 4103 if (rctls) 4104 nvlist_free(rctls); 4105 return (zone_create_error(EINTR, 0, extended_error)); 4106 } 4107 4108 /* 4109 * Set up credential for kernel access.
After this, any errors 4110 * should go through the dance in errout rather than calling 4111 * zone_free directly. 4112 */ 4113 zone->zone_kcred = crdup(kcred); 4114 crsetzone(zone->zone_kcred, zone); 4115 priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred)); 4116 priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred)); 4117 priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred)); 4118 priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred)); 4119 4120 mutex_enter(&zonehash_lock); 4121 /* 4122 * Make sure zone doesn't already exist. 4123 * 4124 * If the system and zone are labeled, 4125 * make sure no other zone exists that has the same label. 4126 */ 4127 if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL || 4128 (insert_label_hash && 4129 (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) { 4130 zone_status_t status; 4131 4132 status = zone_status_get(ztmp); 4133 if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING) 4134 error = EEXIST; 4135 else 4136 error = EBUSY; 4137 4138 if (insert_label_hash) 4139 error2 = ZE_LABELINUSE; 4140 4141 goto errout; 4142 } 4143 4144 /* 4145 * Don't allow zone creations which would cause one zone's rootpath to 4146 * be accessible from that of another (non-global) zone. 4147 */ 4148 if (zone_is_nested(zone->zone_rootpath)) { 4149 error = EBUSY; 4150 goto errout; 4151 } 4152 4153 ASSERT(zonecount != 0); /* check for leaks */ 4154 if (zonecount + 1 > maxzones) { 4155 error = ENOMEM; 4156 goto errout; 4157 } 4158 4159 if (zone_mount_count(zone->zone_rootpath) != 0) { 4160 error = EBUSY; 4161 error2 = ZE_AREMOUNTS; 4162 goto errout; 4163 } 4164 4165 /* 4166 * Zone is still incomplete, but we need to drop all locks while 4167 * zsched() initializes this zone's kernel process. We 4168 * optimistically add the zone to the hashtable and associated 4169 * lists so a parallel zone_create() doesn't try to create the 4170 * same zone. 4171 */ 4172 zonecount++; 4173 (void) mod_hash_insert(zonehashbyid, 4174 (mod_hash_key_t)(uintptr_t)zone->zone_id, 4175 (mod_hash_val_t)(uintptr_t)zone); 4176 str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP); 4177 (void) strcpy(str, zone->zone_name); 4178 (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str, 4179 (mod_hash_val_t)(uintptr_t)zone); 4180 if (insert_label_hash) { 4181 (void) mod_hash_insert(zonehashbylabel, 4182 (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone); 4183 zone->zone_flags |= ZF_HASHED_LABEL; 4184 } 4185 4186 /* 4187 * Insert into active list. At this point there are no 'hold's 4188 * on the zone, but everyone else knows not to use it, so we can 4189 * continue to use it. zsched() will do a zone_hold() if the 4190 * newproc() is successful. 4191 */ 4192 list_insert_tail(&zone_active, zone); 4193 mutex_exit(&zonehash_lock); 4194 4195 zarg.zone = zone; 4196 zarg.nvlist = rctls; 4197 /* 4198 * The process, task, and project rctls are probably wrong; 4199 * we need an interface to get the default values of all rctls, 4200 * and initialize zsched appropriately. I'm not sure that that 4201 * makes much of a difference, though. 4202 */ 4203 error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0); 4204 if (error != 0) { 4205 /* 4206 * We need to undo all globally visible state. 
4207 */ 4208 mutex_enter(&zonehash_lock); 4209 list_remove(&zone_active, zone); 4210 if (zone->zone_flags & ZF_HASHED_LABEL) { 4211 ASSERT(zone->zone_slabel != NULL); 4212 (void) mod_hash_destroy(zonehashbylabel, 4213 (mod_hash_key_t)zone->zone_slabel); 4214 } 4215 (void) mod_hash_destroy(zonehashbyname, 4216 (mod_hash_key_t)(uintptr_t)zone->zone_name); 4217 (void) mod_hash_destroy(zonehashbyid, 4218 (mod_hash_key_t)(uintptr_t)zone->zone_id); 4219 ASSERT(zonecount > 1); 4220 zonecount--; 4221 goto errout; 4222 } 4223 4224 /* 4225 * Zone creation can't fail from now on. 4226 */ 4227 4228 /* 4229 * Create zone kstats 4230 */ 4231 zone_kstat_create(zone); 4232 4233 /* 4234 * Let the other lwps continue. 4235 */ 4236 mutex_enter(&pp->p_lock); 4237 if (curthread != pp->p_agenttp) 4238 continuelwps(pp); 4239 mutex_exit(&pp->p_lock); 4240 4241 /* 4242 * Wait for zsched to finish initializing the zone. 4243 */ 4244 zone_status_wait(zone, ZONE_IS_READY); 4245 /* 4246 * The zone is fully visible, so we can let mounts progress. 4247 */ 4248 resume_mounts(); 4249 if (rctls) 4250 nvlist_free(rctls); 4251 4252 return (zoneid); 4253 4254 errout: 4255 mutex_exit(&zonehash_lock); 4256 /* 4257 * Let the other lwps continue. 4258 */ 4259 mutex_enter(&pp->p_lock); 4260 if (curthread != pp->p_agenttp) 4261 continuelwps(pp); 4262 mutex_exit(&pp->p_lock); 4263 4264 resume_mounts(); 4265 if (rctls) 4266 nvlist_free(rctls); 4267 /* 4268 * There is currently one reference to the zone, a cred_ref from 4269 * zone_kcred. To free the zone, we call crfree, which will call 4270 * zone_cred_rele, which will call zone_free. 4271 */ 4272 ASSERT(zone->zone_cred_ref == 1); /* for zone_kcred */ 4273 ASSERT(zone->zone_kcred->cr_ref == 1); 4274 ASSERT(zone->zone_ref == 0); 4275 zkcr = zone->zone_kcred; 4276 zone->zone_kcred = NULL; 4277 crfree(zkcr); /* triggers call to zone_free */ 4278 return (zone_create_error(error, error2, extended_error)); 4279 } 4280 4281 /* 4282 * Cause the zone to boot. This is pretty simple, since we let zoneadmd do 4283 * the heavy lifting. The program launched at the "top" of the zone is 4284 * named by zone_initname (settable via ZONE_ATTR_INITNAME before boot); 4285 * zone_create() presets it to the system default, zone_default_initname. 4286 */ 4287 static int 4288 zone_boot(zoneid_t zoneid) 4289 { 4290 int err; 4291 zone_t *zone; 4292 4293 if (secpolicy_zone_config(CRED()) != 0) 4294 return (set_errno(EPERM)); 4295 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 4296 return (set_errno(EINVAL)); 4297 4298 mutex_enter(&zonehash_lock); 4299 /* 4300 * Look for zone under hash lock to prevent races with calls to 4301 * zone_shutdown, zone_destroy, etc. 4302 */ 4303 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 4304 mutex_exit(&zonehash_lock); 4305 return (set_errno(EINVAL)); 4306 } 4307 4308 mutex_enter(&zone_status_lock); 4309 if (zone_status_get(zone) != ZONE_IS_READY) { 4310 mutex_exit(&zone_status_lock); 4311 mutex_exit(&zonehash_lock); 4312 return (set_errno(EINVAL)); 4313 } 4314 zone_status_set(zone, ZONE_IS_BOOTING); 4315 mutex_exit(&zone_status_lock); 4316 4317 zone_hold(zone); /* so we can use the zone_t later */ 4318 mutex_exit(&zonehash_lock); 4319 4320 if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) { 4321 zone_rele(zone); 4322 return (set_errno(EINTR)); 4323 } 4324 4325 /* 4326 * Boot (starting init) might have failed, in which case the zone 4327 * will go to the SHUTTING_DOWN state; an appropriate errno will 4328 * be placed in zone->zone_boot_err, and so we return that.
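 *
 * An illustrative caller-side sketch (assumes the private libc wrapper
 * zone_boot(zoneid); names outside this file are assumptions):
 *
 *	if (zone_boot(zoneid) != 0) {
 *		if (errno == EINTR)
 *			...		interrupted while waiting
 *		else
 *			...		init failed; tear the zone down
 *	}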
4329 */ 4330 err = zone->zone_boot_err; 4331 zone_rele(zone); 4332 return (err ? set_errno(err) : 0); 4333 } 4334 4335 /* 4336 * Kills all user processes in the zone, waiting for them all to exit 4337 * before returning. 4338 */ 4339 static int 4340 zone_empty(zone_t *zone) 4341 { 4342 int waitstatus; 4343 4344 /* 4345 * We need to drop zonehash_lock before killing all 4346 * processes, otherwise we'll deadlock with zone_find_* 4347 * which can be called from the exit path. 4348 */ 4349 ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); 4350 while ((waitstatus = zone_status_timedwait_sig(zone, 4351 ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { 4352 killall(zone->zone_id); 4353 } 4354 /* 4355 * return EINTR if we were signaled 4356 */ 4357 if (waitstatus == 0) 4358 return (EINTR); 4359 return (0); 4360 } 4361 4362 /* 4363 * This function implements the policy for zone visibility. 4364 * 4365 * In standard Solaris, a non-global zone can only see itself. 4366 * 4367 * In Trusted Extensions, a labeled zone can lookup any zone whose label 4368 * it dominates. For this test, the label of the global zone is treated as 4369 * admin_high so it is special-cased instead of being checked for dominance. 4370 * 4371 * Returns true if zone attributes are viewable, false otherwise. 4372 */ 4373 static boolean_t 4374 zone_list_access(zone_t *zone) 4375 { 4376 4377 if (curproc->p_zone == global_zone || 4378 curproc->p_zone == zone) { 4379 return (B_TRUE); 4380 } else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) { 4381 bslabel_t *curproc_label; 4382 bslabel_t *zone_label; 4383 4384 curproc_label = label2bslabel(curproc->p_zone->zone_slabel); 4385 zone_label = label2bslabel(zone->zone_slabel); 4386 4387 if (zone->zone_id != GLOBAL_ZONEID && 4388 bldominates(curproc_label, zone_label)) { 4389 return (B_TRUE); 4390 } else { 4391 return (B_FALSE); 4392 } 4393 } else { 4394 return (B_FALSE); 4395 } 4396 } 4397 4398 /* 4399 * Systemcall to start the zone's halt sequence. By the time this 4400 * function successfully returns, all user processes and kernel threads 4401 * executing in it will have exited, ZSD shutdown callbacks executed, 4402 * and the zone status set to ZONE_IS_DOWN. 4403 * 4404 * It is possible that the call will interrupt itself if the caller is the 4405 * parent of any process running in the zone, and doesn't have SIGCHLD blocked. 4406 */ 4407 static int 4408 zone_shutdown(zoneid_t zoneid) 4409 { 4410 int error; 4411 zone_t *zone; 4412 zone_status_t status; 4413 4414 if (secpolicy_zone_config(CRED()) != 0) 4415 return (set_errno(EPERM)); 4416 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 4417 return (set_errno(EINVAL)); 4418 4419 /* 4420 * Block mounts so that VFS_MOUNT() can get an accurate view of 4421 * the zone's status with regard to ZONE_IS_SHUTTING_DOWN. 4422 * 4423 * e.g. NFS can fail the mount if it determines that the zone 4424 * has already begun the shutdown sequence. 4425 */ 4426 if (block_mounts() == 0) 4427 return (set_errno(EINTR)); 4428 mutex_enter(&zonehash_lock); 4429 /* 4430 * Look for zone under hash lock to prevent races with other 4431 * calls to zone_shutdown and zone_destroy. 4432 */ 4433 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 4434 mutex_exit(&zonehash_lock); 4435 resume_mounts(); 4436 return (set_errno(EINVAL)); 4437 } 4438 mutex_enter(&zone_status_lock); 4439 status = zone_status_get(zone); 4440 /* 4441 * Fail if the zone isn't fully initialized yet.
4442 */ 4443 if (status < ZONE_IS_READY) { 4444 mutex_exit(&zone_status_lock); 4445 mutex_exit(&zonehash_lock); 4446 resume_mounts(); 4447 return (set_errno(EINVAL)); 4448 } 4449 /* 4450 * If conditions required for zone_shutdown() to return have been met, 4451 * return success. 4452 */ 4453 if (status >= ZONE_IS_DOWN) { 4454 mutex_exit(&zone_status_lock); 4455 mutex_exit(&zonehash_lock); 4456 resume_mounts(); 4457 return (0); 4458 } 4459 /* 4460 * If zone_shutdown() hasn't been called before, go through the motions. 4461 * If it has, there's nothing to do but wait for the kernel threads to 4462 * drain. 4463 */ 4464 if (status < ZONE_IS_EMPTY) { 4465 uint_t ntasks; 4466 4467 mutex_enter(&zone->zone_lock); 4468 if ((ntasks = zone->zone_ntasks) != 1) { 4469 /* 4470 * There's still stuff running. 4471 */ 4472 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); 4473 } 4474 mutex_exit(&zone->zone_lock); 4475 if (ntasks == 1) { 4476 /* 4477 * The only way to create another task is through 4478 * zone_enter(), which will block until we drop 4479 * zonehash_lock. The zone is empty. 4480 */ 4481 if (zone->zone_kthreads == NULL) { 4482 /* 4483 * Skip ahead to ZONE_IS_DOWN 4484 */ 4485 zone_status_set(zone, ZONE_IS_DOWN); 4486 } else { 4487 zone_status_set(zone, ZONE_IS_EMPTY); 4488 } 4489 } 4490 } 4491 zone_hold(zone); /* so we can use the zone_t later */ 4492 mutex_exit(&zone_status_lock); 4493 mutex_exit(&zonehash_lock); 4494 resume_mounts(); 4495 4496 if (error = zone_empty(zone)) { 4497 zone_rele(zone); 4498 return (set_errno(error)); 4499 } 4500 /* 4501 * After the zone status goes to ZONE_IS_DOWN this zone will no 4502 * longer be notified of changes to the pools configuration, so 4503 * in order to not end up with a stale pool pointer, we point 4504 * ourselves at the default pool and remove all resource 4505 * visibility. This is especially important as the zone_t may 4506 * languish on the deathrow for a very long time waiting for 4507 * creds to drain out. 4508 * 4509 * This rebinding of the zone can happen multiple times 4510 * (presumably due to interrupted or parallel systemcalls) 4511 * without any adverse effects. 4512 */ 4513 if (pool_lock_intr() != 0) { 4514 zone_rele(zone); 4515 return (set_errno(EINTR)); 4516 } 4517 if (pool_state == POOL_ENABLED) { 4518 mutex_enter(&cpu_lock); 4519 zone_pool_set(zone, pool_default); 4520 /* 4521 * The zone no longer needs to be able to see any cpus. 4522 */ 4523 zone_pset_set(zone, ZONE_PS_INVAL); 4524 mutex_exit(&cpu_lock); 4525 } 4526 pool_unlock(); 4527 4528 /* 4529 * ZSD shutdown callbacks can be executed multiple times, hence 4530 * it is safe to not be holding any locks across this call. 4531 */ 4532 zone_zsd_callbacks(zone, ZSD_SHUTDOWN); 4533 4534 mutex_enter(&zone_status_lock); 4535 if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN) 4536 zone_status_set(zone, ZONE_IS_DOWN); 4537 mutex_exit(&zone_status_lock); 4538 4539 /* 4540 * Wait for kernel threads to drain. 4541 */ 4542 if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) { 4543 zone_rele(zone); 4544 return (set_errno(EINTR)); 4545 } 4546 4547 /* 4548 * The zone can become down/destroyable even if the above wait 4549 * returns EINTR, so any code added here may never execute. 4550 * (i.e. don't add code here) 4551 */ 4552 4553 zone_rele(zone); 4554 return (0); 4555 } 4556 4557 /* 4558 * Systemcall entry point to finalize the zone halt process. The caller 4559 * must have already successfully called zone_shutdown().
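 *
 * For example (illustrative only), the usual userland halt sequence is:
 *
 *	while (zone_shutdown(zoneid) != 0) {
 *		if (errno != EINTR)
 *			return (-1);	 real failure; give up
 *	}
 *	if (zone_destroy(zoneid) != 0)
 *		return (-1);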
4560 * 4561 * Upon successful completion, the zone will have been fully destroyed: 4562 * zsched will have exited, destructor callbacks executed, and the zone 4563 * removed from the list of active zones. 4564 */ 4565 static int 4566 zone_destroy(zoneid_t zoneid) 4567 { 4568 uint64_t uniqid; 4569 zone_t *zone; 4570 zone_status_t status; 4571 4572 if (secpolicy_zone_config(CRED()) != 0) 4573 return (set_errno(EPERM)); 4574 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 4575 return (set_errno(EINVAL)); 4576 4577 mutex_enter(&zonehash_lock); 4578 /* 4579 * Look for zone under hash lock to prevent races with other 4580 * calls to zone_destroy. 4581 */ 4582 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 4583 mutex_exit(&zonehash_lock); 4584 return (set_errno(EINVAL)); 4585 } 4586 4587 if (zone_mount_count(zone->zone_rootpath) != 0) { 4588 mutex_exit(&zonehash_lock); 4589 return (set_errno(EBUSY)); 4590 } 4591 mutex_enter(&zone_status_lock); 4592 status = zone_status_get(zone); 4593 if (status < ZONE_IS_DOWN) { 4594 mutex_exit(&zone_status_lock); 4595 mutex_exit(&zonehash_lock); 4596 return (set_errno(EBUSY)); 4597 } else if (status == ZONE_IS_DOWN) { 4598 zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */ 4599 } 4600 mutex_exit(&zone_status_lock); 4601 zone_hold(zone); 4602 mutex_exit(&zonehash_lock); 4603 4604 /* 4605 * wait for zsched to exit 4606 */ 4607 zone_status_wait(zone, ZONE_IS_DEAD); 4608 zone_zsd_callbacks(zone, ZSD_DESTROY); 4609 zone->zone_netstack = NULL; 4610 uniqid = zone->zone_uniqid; 4611 zone_rele(zone); 4612 zone = NULL; /* potentially free'd */ 4613 4614 mutex_enter(&zonehash_lock); 4615 for (; /* ever */; ) { 4616 boolean_t unref; 4617 4618 if ((zone = zone_find_all_by_id(zoneid)) == NULL || 4619 zone->zone_uniqid != uniqid) { 4620 /* 4621 * The zone has gone away. Necessary conditions 4622 * are met, so we return success. 4623 */ 4624 mutex_exit(&zonehash_lock); 4625 return (0); 4626 } 4627 mutex_enter(&zone->zone_lock); 4628 unref = ZONE_IS_UNREF(zone); 4629 mutex_exit(&zone->zone_lock); 4630 if (unref) { 4631 /* 4632 * There is only one reference to the zone -- that 4633 * added when the zone was added to the hashtables -- 4634 * and things will remain this way until we drop 4635 * zonehash_lock... we can go ahead and cleanup the 4636 * zone. 4637 */ 4638 break; 4639 } 4640 4641 if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) { 4642 /* Signaled */ 4643 mutex_exit(&zonehash_lock); 4644 return (set_errno(EINTR)); 4645 } 4646 4647 } 4648 4649 /* 4650 * Remove CPU cap for this zone now since we're not going to 4651 * fail below this point. 4652 */ 4653 cpucaps_zone_remove(zone); 4654 4655 /* Get rid of the zone's kstats */ 4656 zone_kstat_delete(zone); 4657 4658 /* remove the pfexecd doors */ 4659 if (zone->zone_pfexecd != NULL) { 4660 klpd_freelist(&zone->zone_pfexecd); 4661 zone->zone_pfexecd = NULL; 4662 } 4663 4664 /* free brand specific data */ 4665 if (ZONE_IS_BRANDED(zone)) 4666 ZBROP(zone)->b_free_brand_data(zone); 4667 4668 /* Say goodbye to brand framework. */ 4669 brand_unregister_zone(zone->zone_brand); 4670 4671 /* 4672 * It is now safe to let the zone be recreated; remove it from the 4673 * lists. The memory will not be freed until the last cred 4674 * reference goes away. 
4675 */ 4676 ASSERT(zonecount > 1); /* must be > 1; can't destroy global zone */ 4677 zonecount--; 4678 /* remove from active list and hash tables */ 4679 list_remove(&zone_active, zone); 4680 (void) mod_hash_destroy(zonehashbyname, 4681 (mod_hash_key_t)zone->zone_name); 4682 (void) mod_hash_destroy(zonehashbyid, 4683 (mod_hash_key_t)(uintptr_t)zone->zone_id); 4684 if (zone->zone_flags & ZF_HASHED_LABEL) 4685 (void) mod_hash_destroy(zonehashbylabel, 4686 (mod_hash_key_t)zone->zone_slabel); 4687 mutex_exit(&zonehash_lock); 4688 4689 /* 4690 * Release the root vnode; we're not using it anymore. Nor should any 4691 * other thread that might access it exist. 4692 */ 4693 if (zone->zone_rootvp != NULL) { 4694 VN_RELE(zone->zone_rootvp); 4695 zone->zone_rootvp = NULL; 4696 } 4697 4698 /* add to deathrow list */ 4699 mutex_enter(&zone_deathrow_lock); 4700 list_insert_tail(&zone_deathrow, zone); 4701 mutex_exit(&zone_deathrow_lock); 4702 4703 /* 4704 * Drop last reference (which was added by zsched()), this will 4705 * free the zone unless there are outstanding cred references. 4706 */ 4707 zone_rele(zone); 4708 return (0); 4709 } 4710 4711 /* 4712 * Systemcall entry point for zone_getattr(2). 4713 */ 4714 static ssize_t 4715 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) 4716 { 4717 size_t size; 4718 int error = 0, err; 4719 zone_t *zone; 4720 char *zonepath; 4721 char *outstr; 4722 zone_status_t zone_status; 4723 pid_t initpid; 4724 boolean_t global = (curzone == global_zone); 4725 boolean_t inzone = (curzone->zone_id == zoneid); 4726 ushort_t flags; 4727 zone_net_data_t *zbuf; 4728 4729 mutex_enter(&zonehash_lock); 4730 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 4731 mutex_exit(&zonehash_lock); 4732 return (set_errno(EINVAL)); 4733 } 4734 zone_status = zone_status_get(zone); 4735 if (zone_status < ZONE_IS_INITIALIZED) { 4736 mutex_exit(&zonehash_lock); 4737 return (set_errno(EINVAL)); 4738 } 4739 zone_hold(zone); 4740 mutex_exit(&zonehash_lock); 4741 4742 /* 4743 * If not in the global zone, don't show information about other zones, 4744 * unless the system is labeled and the local zone's label dominates 4745 * the other zone. 4746 */ 4747 if (!zone_list_access(zone)) { 4748 zone_rele(zone); 4749 return (set_errno(EINVAL)); 4750 } 4751 4752 switch (attr) { 4753 case ZONE_ATTR_ROOT: 4754 if (global) { 4755 /* 4756 * Copy the path to trim the trailing "/" (except for 4757 * the global zone). 4758 */ 4759 if (zone != global_zone) 4760 size = zone->zone_rootpathlen - 1; 4761 else 4762 size = zone->zone_rootpathlen; 4763 zonepath = kmem_alloc(size, KM_SLEEP); 4764 bcopy(zone->zone_rootpath, zonepath, size); 4765 zonepath[size - 1] = '\0'; 4766 } else { 4767 if (inzone || !is_system_labeled()) { 4768 /* 4769 * Caller is not in the global zone. 4770 * if the query is on the current zone 4771 * or the system is not labeled, 4772 * just return faked-up path for current zone. 4773 */ 4774 zonepath = "/"; 4775 size = 2; 4776 } else { 4777 /* 4778 * Return related path for current zone. 
4779 */ 4780 int prefix_len = strlen(zone_prefix); 4781 int zname_len = strlen(zone->zone_name); 4782 4783 size = prefix_len + zname_len + 1; 4784 zonepath = kmem_alloc(size, KM_SLEEP); 4785 bcopy(zone_prefix, zonepath, prefix_len); 4786 bcopy(zone->zone_name, zonepath + 4787 prefix_len, zname_len); 4788 zonepath[size - 1] = '\0'; 4789 } 4790 } 4791 if (bufsize > size) 4792 bufsize = size; 4793 if (buf != NULL) { 4794 err = copyoutstr(zonepath, buf, bufsize, NULL); 4795 if (err != 0 && err != ENAMETOOLONG) 4796 error = EFAULT; 4797 } 4798 if (global || (is_system_labeled() && !inzone)) 4799 kmem_free(zonepath, size); 4800 break; 4801 4802 case ZONE_ATTR_NAME: 4803 size = strlen(zone->zone_name) + 1; 4804 if (bufsize > size) 4805 bufsize = size; 4806 if (buf != NULL) { 4807 err = copyoutstr(zone->zone_name, buf, bufsize, NULL); 4808 if (err != 0 && err != ENAMETOOLONG) 4809 error = EFAULT; 4810 } 4811 break; 4812 4813 case ZONE_ATTR_STATUS: 4814 /* 4815 * Since we're not holding zonehash_lock, the zone status 4816 * may be anything; leave it up to userland to sort it out. 4817 */ 4818 size = sizeof (zone_status); 4819 if (bufsize > size) 4820 bufsize = size; 4821 zone_status = zone_status_get(zone); 4822 if (buf != NULL && 4823 copyout(&zone_status, buf, bufsize) != 0) 4824 error = EFAULT; 4825 break; 4826 case ZONE_ATTR_FLAGS: 4827 size = sizeof (zone->zone_flags); 4828 if (bufsize > size) 4829 bufsize = size; 4830 flags = zone->zone_flags; 4831 if (buf != NULL && 4832 copyout(&flags, buf, bufsize) != 0) 4833 error = EFAULT; 4834 break; 4835 case ZONE_ATTR_PRIVSET: 4836 size = sizeof (priv_set_t); 4837 if (bufsize > size) 4838 bufsize = size; 4839 if (buf != NULL && 4840 copyout(zone->zone_privset, buf, bufsize) != 0) 4841 error = EFAULT; 4842 break; 4843 case ZONE_ATTR_UNIQID: 4844 size = sizeof (zone->zone_uniqid); 4845 if (bufsize > size) 4846 bufsize = size; 4847 if (buf != NULL && 4848 copyout(&zone->zone_uniqid, buf, bufsize) != 0) 4849 error = EFAULT; 4850 break; 4851 case ZONE_ATTR_POOLID: 4852 { 4853 pool_t *pool; 4854 poolid_t poolid; 4855 4856 if (pool_lock_intr() != 0) { 4857 error = EINTR; 4858 break; 4859 } 4860 pool = zone_pool_get(zone); 4861 poolid = pool->pool_id; 4862 pool_unlock(); 4863 size = sizeof (poolid); 4864 if (bufsize > size) 4865 bufsize = size; 4866 if (buf != NULL && copyout(&poolid, buf, size) != 0) 4867 error = EFAULT; 4868 } 4869 break; 4870 case ZONE_ATTR_SLBL: 4871 size = sizeof (bslabel_t); 4872 if (bufsize > size) 4873 bufsize = size; 4874 if (zone->zone_slabel == NULL) 4875 error = EINVAL; 4876 else if (buf != NULL && 4877 copyout(label2bslabel(zone->zone_slabel), buf, 4878 bufsize) != 0) 4879 error = EFAULT; 4880 break; 4881 case ZONE_ATTR_INITPID: 4882 size = sizeof (initpid); 4883 if (bufsize > size) 4884 bufsize = size; 4885 initpid = zone->zone_proc_initpid; 4886 if (initpid == -1) { 4887 error = ESRCH; 4888 break; 4889 } 4890 if (buf != NULL && 4891 copyout(&initpid, buf, bufsize) != 0) 4892 error = EFAULT; 4893 break; 4894 case ZONE_ATTR_BRAND: 4895 size = strlen(zone->zone_brand->b_name) + 1; 4896 4897 if (bufsize > size) 4898 bufsize = size; 4899 if (buf != NULL) { 4900 err = copyoutstr(zone->zone_brand->b_name, buf, 4901 bufsize, NULL); 4902 if (err != 0 && err != ENAMETOOLONG) 4903 error = EFAULT; 4904 } 4905 break; 4906 case ZONE_ATTR_INITNAME: 4907 size = strlen(zone->zone_initname) + 1; 4908 if (bufsize > size) 4909 bufsize = size; 4910 if (buf != NULL) { 4911 err = copyoutstr(zone->zone_initname, buf, bufsize, 4912 NULL); 4913 if (err 
!= 0 && err != ENAMETOOLONG) 4914 error = EFAULT; 4915 } 4916 break; 4917 case ZONE_ATTR_BOOTARGS: 4918 if (zone->zone_bootargs == NULL) 4919 outstr = ""; 4920 else 4921 outstr = zone->zone_bootargs; 4922 size = strlen(outstr) + 1; 4923 if (bufsize > size) 4924 bufsize = size; 4925 if (buf != NULL) { 4926 err = copyoutstr(outstr, buf, bufsize, NULL); 4927 if (err != 0 && err != ENAMETOOLONG) 4928 error = EFAULT; 4929 } 4930 break; 4931 case ZONE_ATTR_PHYS_MCAP: 4932 size = sizeof (zone->zone_phys_mcap); 4933 if (bufsize > size) 4934 bufsize = size; 4935 if (buf != NULL && 4936 copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) 4937 error = EFAULT; 4938 break; 4939 case ZONE_ATTR_SCHED_CLASS: 4940 mutex_enter(&class_lock); 4941 4942 if (zone->zone_defaultcid >= loaded_classes) 4943 outstr = ""; 4944 else 4945 outstr = sclass[zone->zone_defaultcid].cl_name; 4946 size = strlen(outstr) + 1; 4947 if (bufsize > size) 4948 bufsize = size; 4949 if (buf != NULL) { 4950 err = copyoutstr(outstr, buf, bufsize, NULL); 4951 if (err != 0 && err != ENAMETOOLONG) 4952 error = EFAULT; 4953 } 4954 4955 mutex_exit(&class_lock); 4956 break; 4957 case ZONE_ATTR_HOSTID: 4958 if (zone->zone_hostid != HW_INVALID_HOSTID && 4959 bufsize == sizeof (zone->zone_hostid)) { 4960 size = sizeof (zone->zone_hostid); 4961 if (buf != NULL && copyout(&zone->zone_hostid, buf, 4962 bufsize) != 0) 4963 error = EFAULT; 4964 } else { 4965 error = EINVAL; 4966 } 4967 break; 4968 case ZONE_ATTR_FS_ALLOWED: 4969 if (zone->zone_fs_allowed == NULL) 4970 outstr = ""; 4971 else 4972 outstr = zone->zone_fs_allowed; 4973 size = strlen(outstr) + 1; 4974 if (bufsize > size) 4975 bufsize = size; 4976 if (buf != NULL) { 4977 err = copyoutstr(outstr, buf, bufsize, NULL); 4978 if (err != 0 && err != ENAMETOOLONG) 4979 error = EFAULT; 4980 } 4981 break; 4982 case ZONE_ATTR_NETWORK: 4983 zbuf = kmem_alloc(bufsize, KM_SLEEP); 4984 if (copyin(buf, zbuf, bufsize) != 0) { 4985 error = EFAULT; 4986 } else { 4987 error = zone_get_network(zoneid, zbuf); 4988 if (error == 0 && copyout(zbuf, buf, bufsize) != 0) 4989 error = EFAULT; 4990 } 4991 kmem_free(zbuf, bufsize); 4992 break; 4993 default: 4994 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { 4995 size = bufsize; 4996 error = ZBROP(zone)->b_getattr(zone, attr, buf, &size); 4997 } else { 4998 error = EINVAL; 4999 } 5000 } 5001 zone_rele(zone); 5002 5003 if (error) 5004 return (set_errno(error)); 5005 return ((ssize_t)size); 5006 } 5007 5008 /* 5009 * Systemcall entry point for zone_setattr(2). 5010 */ 5011 /*ARGSUSED*/ 5012 static int 5013 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) 5014 { 5015 zone_t *zone; 5016 zone_status_t zone_status; 5017 int err = -1; 5018 zone_net_data_t *zbuf; 5019 5020 if (secpolicy_zone_config(CRED()) != 0) 5021 return (set_errno(EPERM)); 5022 5023 /* 5024 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the 5025 * global zone. 5026 */ 5027 if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { 5028 return (set_errno(EINVAL)); 5029 } 5030 5031 mutex_enter(&zonehash_lock); 5032 if ((zone = zone_find_all_by_id(zoneid)) == NULL) { 5033 mutex_exit(&zonehash_lock); 5034 return (set_errno(EINVAL)); 5035 } 5036 zone_hold(zone); 5037 mutex_exit(&zonehash_lock); 5038 5039 /* 5040 * At present most attributes can only be set on non-running, 5041 * non-global zones. 
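 *
 * For example (illustrative; the libc zone_setattr() wrapper is assumed),
 * a zoneadmd-style caller configures a zone between zone_create() and
 * zone_boot():
 *
 *	(void) zone_setattr(zoneid, ZONE_ATTR_INITNAME, "/sbin/init", 0);
 *
 * (String-valued attributes are copied in with copyinstr() by the
 * individual zone_set_*() routines, so bufsize is not used for them.)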
5042 */ 5043 zone_status = zone_status_get(zone); 5044 if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { 5045 err = EINVAL; 5046 goto done; 5047 } 5048 5049 switch (attr) { 5050 case ZONE_ATTR_INITNAME: 5051 err = zone_set_initname(zone, (const char *)buf); 5052 break; 5053 case ZONE_ATTR_BOOTARGS: 5054 err = zone_set_bootargs(zone, (const char *)buf); 5055 break; 5056 case ZONE_ATTR_BRAND: 5057 err = zone_set_brand(zone, (const char *)buf); 5058 break; 5059 case ZONE_ATTR_FS_ALLOWED: 5060 err = zone_set_fs_allowed(zone, (const char *)buf); 5061 break; 5062 case ZONE_ATTR_PHYS_MCAP: 5063 err = zone_set_phys_mcap(zone, (const uint64_t *)buf); 5064 break; 5065 case ZONE_ATTR_SCHED_CLASS: 5066 err = zone_set_sched_class(zone, (const char *)buf); 5067 break; 5068 case ZONE_ATTR_HOSTID: 5069 if (bufsize == sizeof (zone->zone_hostid)) { 5070 if (copyin(buf, &zone->zone_hostid, bufsize) == 0) 5071 err = 0; 5072 else 5073 err = EFAULT; 5074 } else { 5075 err = EINVAL; 5076 } 5077 break; 5078 case ZONE_ATTR_NETWORK: 5079 if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) { 5080 err = EINVAL; 5081 break; 5082 } 5083 zbuf = kmem_alloc(bufsize, KM_SLEEP); 5084 if (copyin(buf, zbuf, bufsize) != 0) { 5085 kmem_free(zbuf, bufsize); 5086 err = EFAULT; 5087 break; 5088 } 5089 err = zone_set_network(zoneid, zbuf); 5090 kmem_free(zbuf, bufsize); 5091 break; 5092 default: 5093 if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) 5094 err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); 5095 else 5096 err = EINVAL; 5097 } 5098 5099 done: 5100 zone_rele(zone); 5101 ASSERT(err != -1); 5102 return (err != 0 ? set_errno(err) : 0); 5103 } 5104 5105 /* 5106 * Return zero if the process has at least one vnode mapped in to its 5107 * address space which shouldn't be allowed to change zones. 5108 * 5109 * Also return zero if the process has any shared mappings which reserve 5110 * swap. This is because the counting for zone.max-swap does not allow swap 5111 * reservation to be shared between zones. zone swap reservation is counted 5112 * on zone->zone_max_swap. 5113 */ 5114 static int 5115 as_can_change_zones(void) 5116 { 5117 proc_t *pp = curproc; 5118 struct seg *seg; 5119 struct as *as = pp->p_as; 5120 vnode_t *vp; 5121 int allow = 1; 5122 5123 ASSERT(pp->p_as != &kas); 5124 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 5125 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 5126 5127 /* 5128 * Cannot enter zone with shared anon memory which 5129 * reserves swap. See comment above. 5130 */ 5131 if (seg_can_change_zones(seg) == B_FALSE) { 5132 allow = 0; 5133 break; 5134 } 5135 /* 5136 * if we can't get a backing vnode for this segment then skip 5137 * it. 5138 */ 5139 vp = NULL; 5140 if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL) 5141 continue; 5142 if (!vn_can_change_zones(vp)) { /* bail on first match */ 5143 allow = 0; 5144 break; 5145 } 5146 } 5147 AS_LOCK_EXIT(as, &as->a_lock); 5148 return (allow); 5149 } 5150 5151 /* 5152 * Count swap reserved by curproc's address space 5153 */ 5154 static size_t 5155 as_swresv(void) 5156 { 5157 proc_t *pp = curproc; 5158 struct seg *seg; 5159 struct as *as = pp->p_as; 5160 size_t swap = 0; 5161 5162 ASSERT(pp->p_as != &kas); 5163 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 5164 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) 5165 swap += seg_swresv(seg); 5166 5167 return (swap); 5168 } 5169 5170 /* 5171 * Systemcall entry point for zone_enter(). 
5172 * 5173 * The current process is injected into said zone. In the process 5174 * it will change its project membership, privileges, rootdir/cwd, 5175 * zone-wide rctls, and pool association to match those of the zone. 5176 * 5177 * The first zone_enter() called while the zone is in the ZONE_IS_READY 5178 * state will transition it to ZONE_IS_RUNNING. Processes may only 5179 * enter a zone that is "ready" or "running". 5180 */ 5181 static int 5182 zone_enter(zoneid_t zoneid) 5183 { 5184 zone_t *zone; 5185 vnode_t *vp; 5186 proc_t *pp = curproc; 5187 contract_t *ct; 5188 cont_process_t *ctp; 5189 task_t *tk, *oldtk; 5190 kproject_t *zone_proj0; 5191 cred_t *cr, *newcr; 5192 pool_t *oldpool, *newpool; 5193 sess_t *sp; 5194 uid_t uid; 5195 zone_status_t status; 5196 int err = 0; 5197 rctl_entity_p_t e; 5198 size_t swap; 5199 kthread_id_t t; 5200 5201 if (secpolicy_zone_config(CRED()) != 0) 5202 return (set_errno(EPERM)); 5203 if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID) 5204 return (set_errno(EINVAL)); 5205 5206 /* 5207 * Stop all lwps so we don't need to hold a lock to look at 5208 * curproc->p_zone. This needs to happen before we grab any 5209 * locks to avoid deadlock (another lwp in the process could 5210 * be waiting for the held lock). 5211 */ 5212 if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) 5213 return (set_errno(EINTR)); 5214 5215 /* 5216 * Make sure we're not changing zones with files open or mapped in 5217 * to our address space which shouldn't be changing zones. 5218 */ 5219 if (!files_can_change_zones()) { 5220 err = EBADF; 5221 goto out; 5222 } 5223 if (!as_can_change_zones()) { 5224 err = EFAULT; 5225 goto out; 5226 } 5227 5228 mutex_enter(&zonehash_lock); 5229 if (pp->p_zone != global_zone) { 5230 mutex_exit(&zonehash_lock); 5231 err = EINVAL; 5232 goto out; 5233 } 5234 5235 zone = zone_find_all_by_id(zoneid); 5236 if (zone == NULL) { 5237 mutex_exit(&zonehash_lock); 5238 err = EINVAL; 5239 goto out; 5240 } 5241 5242 /* 5243 * To prevent processes in a zone from holding contracts on 5244 * extrazonal resources, and to avoid process contract 5245 * memberships which span zones, contract holders and processes 5246 * which aren't the sole members of their encapsulating process 5247 * contracts are not allowed to zone_enter. 5248 */ 5249 ctp = pp->p_ct_process; 5250 ct = &ctp->conp_contract; 5251 mutex_enter(&ct->ct_lock); 5252 mutex_enter(&pp->p_lock); 5253 if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) { 5254 mutex_exit(&pp->p_lock); 5255 mutex_exit(&ct->ct_lock); 5256 mutex_exit(&zonehash_lock); 5257 err = EINVAL; 5258 goto out; 5259 } 5260 5261 /* 5262 * Moreover, we don't allow processes whose encapsulating 5263 * process contracts have inherited extrazonal contracts. 5264 * While it would be easier to eliminate all process contracts 5265 * with inherited contracts, we need to be able to give a 5266 * restarted init (or other zone-penetrating process) its 5267 * predecessor's contracts. 
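 *
 * For example (illustrative), a zone_enter()ing utility such as zlogin
 * first makes itself the sole member of a fresh process contract:
 *
 *	int tmpl = open64(CTFS_ROOT "/process/template", O_RDWR);
 *	(void) ct_tmpl_activate(tmpl);
 *	if (fork() == 0)
 *		(void) zone_enter(zoneid);	 child: new, empty contract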
5268 */ 5269 if (ctp->conp_ninherited != 0) { 5270 contract_t *next; 5271 for (next = list_head(&ctp->conp_inherited); next; 5272 next = list_next(&ctp->conp_inherited, next)) { 5273 if (contract_getzuniqid(next) != zone->zone_uniqid) { 5274 mutex_exit(&pp->p_lock); 5275 mutex_exit(&ct->ct_lock); 5276 mutex_exit(&zonehash_lock); 5277 err = EINVAL; 5278 goto out; 5279 } 5280 } 5281 } 5282 5283 mutex_exit(&pp->p_lock); 5284 mutex_exit(&ct->ct_lock); 5285 5286 status = zone_status_get(zone); 5287 if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) { 5288 /* 5289 * Can't join 5290 */ 5291 mutex_exit(&zonehash_lock); 5292 err = EINVAL; 5293 goto out; 5294 } 5295 5296 /* 5297 * Make sure new priv set is within the permitted set for caller 5298 */ 5299 if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) { 5300 mutex_exit(&zonehash_lock); 5301 err = EPERM; 5302 goto out; 5303 } 5304 /* 5305 * We want to momentarily drop zonehash_lock while we optimistically 5306 * bind curproc to the pool it should be running in. This is safe 5307 * since the zone can't disappear (we have a hold on it). 5308 */ 5309 zone_hold(zone); 5310 mutex_exit(&zonehash_lock); 5311 5312 /* 5313 * Grab pool_lock to keep the pools configuration from changing 5314 * and to stop ourselves from getting rebound to another pool 5315 * until we join the zone. 5316 */ 5317 if (pool_lock_intr() != 0) { 5318 zone_rele(zone); 5319 err = EINTR; 5320 goto out; 5321 } 5322 ASSERT(secpolicy_pool(CRED()) == 0); 5323 /* 5324 * Bind ourselves to the pool currently associated with the zone. 5325 */ 5326 oldpool = curproc->p_pool; 5327 newpool = zone_pool_get(zone); 5328 if (pool_state == POOL_ENABLED && newpool != oldpool && 5329 (err = pool_do_bind(newpool, P_PID, P_MYID, 5330 POOL_BIND_ALL)) != 0) { 5331 pool_unlock(); 5332 zone_rele(zone); 5333 goto out; 5334 } 5335 5336 /* 5337 * Grab cpu_lock now; we'll need it later when we call 5338 * task_join(). 5339 */ 5340 mutex_enter(&cpu_lock); 5341 mutex_enter(&zonehash_lock); 5342 /* 5343 * Make sure the zone hasn't moved on since we dropped zonehash_lock. 5344 */ 5345 if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) { 5346 /* 5347 * Can't join anymore. 5348 */ 5349 mutex_exit(&zonehash_lock); 5350 mutex_exit(&cpu_lock); 5351 if (pool_state == POOL_ENABLED && 5352 newpool != oldpool) 5353 (void) pool_do_bind(oldpool, P_PID, P_MYID, 5354 POOL_BIND_ALL); 5355 pool_unlock(); 5356 zone_rele(zone); 5357 err = EINVAL; 5358 goto out; 5359 } 5360 5361 /* 5362 * a_lock must be held while transferring locked memory and swap 5363 * reservation from the global zone to the non-global zone because 5364 * asynchronous faults on the process's address space can lock 5365 * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE 5366 * segments respectively.
5367 */ 5368 AS_LOCK_ENTER(pp->p_as, &pp->p_as->a_lock, RW_WRITER); 5369 swap = as_swresv(); 5370 mutex_enter(&pp->p_lock); 5371 zone_proj0 = zone->zone_zsched->p_task->tk_proj; 5372 /* verify that we do not exceed any task or lwp limits */ 5373 mutex_enter(&zone->zone_nlwps_lock); 5374 /* add new lwps to zone and zone's proj0 */ 5375 zone_proj0->kpj_nlwps += pp->p_lwpcnt; 5376 zone->zone_nlwps += pp->p_lwpcnt; 5377 /* add 1 task to zone's proj0 */ 5378 zone_proj0->kpj_ntasks += 1; 5379 5380 zone_proj0->kpj_nprocs++; 5381 zone->zone_nprocs++; 5382 mutex_exit(&zone->zone_nlwps_lock); 5383 5384 mutex_enter(&zone->zone_mem_lock); 5385 zone->zone_locked_mem += pp->p_locked_mem; 5386 zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem; 5387 zone->zone_max_swap += swap; 5388 mutex_exit(&zone->zone_mem_lock); 5389 5390 mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock)); 5391 zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem; 5392 mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock)); 5393 5394 /* remove lwps and process from proc's old zone and old project */ 5395 mutex_enter(&pp->p_zone->zone_nlwps_lock); 5396 pp->p_zone->zone_nlwps -= pp->p_lwpcnt; 5397 pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt; 5398 pp->p_task->tk_proj->kpj_nprocs--; 5399 pp->p_zone->zone_nprocs--; 5400 mutex_exit(&pp->p_zone->zone_nlwps_lock); 5401 5402 mutex_enter(&pp->p_zone->zone_mem_lock); 5403 pp->p_zone->zone_locked_mem -= pp->p_locked_mem; 5404 pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem; 5405 pp->p_zone->zone_max_swap -= swap; 5406 mutex_exit(&pp->p_zone->zone_mem_lock); 5407 5408 mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock)); 5409 pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem; 5410 mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock)); 5411 5412 pp->p_flag |= SZONETOP; 5413 pp->p_zone = zone; 5414 mutex_exit(&pp->p_lock); 5415 AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock); 5416 5417 /* 5418 * Joining the zone cannot fail from now on. 5419 * 5420 * This means that a lot of the following code can be commonized and 5421 * shared with zsched(). 5422 */ 5423 5424 /* 5425 * If the process contract fmri was inherited, we need to 5426 * flag this so that any contract status will not leak 5427 * extra zone information, svc_fmri in this case 5428 */ 5429 if (ctp->conp_svc_ctid != ct->ct_id) { 5430 mutex_enter(&ct->ct_lock); 5431 ctp->conp_svc_zone_enter = ct->ct_id; 5432 mutex_exit(&ct->ct_lock); 5433 } 5434 5435 /* 5436 * Reset the encapsulating process contract's zone. 5437 */ 5438 ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID); 5439 contract_setzuniqid(ct, zone->zone_uniqid); 5440 5441 /* 5442 * Create a new task and associate the process with the project keyed 5443 * by (projid,zoneid). 5444 * 5445 * We might as well be in project 0; the global zone's projid doesn't 5446 * make much sense in a zone anyhow. 5447 * 5448 * This also increments zone_ntasks, and returns with p_lock held. 5449 */ 5450 tk = task_create(0, zone); 5451 oldtk = task_join(tk, 0); 5452 mutex_exit(&cpu_lock); 5453 5454 /* 5455 * call RCTLOP_SET functions on this proc 5456 */ 5457 e.rcep_p.zone = zone; 5458 e.rcep_t = RCENTITY_ZONE; 5459 (void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL, 5460 RCD_CALLBACK); 5461 mutex_exit(&pp->p_lock); 5462 5463 /* 5464 * We don't need to hold any of zsched's locks here; not only do we know 5465 * the process and zone aren't going away, we know its session isn't 5466 * changing either.
5467 * 5468 * By joining zsched's session here, we mimic the behavior in the 5469 * global zone of init's sid being the pid of sched. We extend this 5470 * to all zlogin-like zone_enter()'ing processes as well. 5471 */ 5472 mutex_enter(&pidlock); 5473 sp = zone->zone_zsched->p_sessp; 5474 sess_hold(zone->zone_zsched); 5475 mutex_enter(&pp->p_lock); 5476 pgexit(pp); 5477 sess_rele(pp->p_sessp, B_TRUE); 5478 pp->p_sessp = sp; 5479 pgjoin(pp, zone->zone_zsched->p_pidp); 5480 5481 /* 5482 * If any threads are scheduled to be placed on the zone's wait queue 5483 * they should abandon the idea since the wait queue is changing. 5484 * We need to be holding pidlock & p_lock to do this. 5485 */ 5486 if ((t = pp->p_tlist) != NULL) { 5487 do { 5488 thread_lock(t); 5489 /* 5490 * Kick this thread so that it doesn't sit 5491 * on the wrong wait queue. 5492 */ 5493 if (ISWAITING(t)) 5494 setrun_locked(t); 5495 5496 if (t->t_schedflag & TS_ANYWAITQ) 5497 t->t_schedflag &= ~ TS_ANYWAITQ; 5498 5499 thread_unlock(t); 5500 } while ((t = t->t_forw) != pp->p_tlist); 5501 } 5502 5503 /* 5504 * If there is a default scheduling class for the zone and it is not 5505 * the class we are currently in, change all of the threads in the 5506 * process to the new class. We need to be holding pidlock & p_lock 5507 * when we call parmsset so this is a good place to do it. 5508 */ 5509 if (zone->zone_defaultcid > 0 && 5510 zone->zone_defaultcid != curthread->t_cid) { 5511 pcparms_t pcparms; 5512 5513 pcparms.pc_cid = zone->zone_defaultcid; 5514 pcparms.pc_clparms[0] = 0; 5515 5516 /* 5517 * If setting the class fails, we still want to enter the zone. 5518 */ 5519 if ((t = pp->p_tlist) != NULL) { 5520 do { 5521 (void) parmsset(&pcparms, t); 5522 } while ((t = t->t_forw) != pp->p_tlist); 5523 } 5524 } 5525 5526 mutex_exit(&pp->p_lock); 5527 mutex_exit(&pidlock); 5528 5529 mutex_exit(&zonehash_lock); 5530 /* 5531 * We're firmly in the zone; let pools progress. 5532 */ 5533 pool_unlock(); 5534 task_rele(oldtk); 5535 /* 5536 * We don't need to retain a hold on the zone since we already 5537 * incremented zone_ntasks, so the zone isn't going anywhere. 5538 */ 5539 zone_rele(zone); 5540 5541 /* 5542 * Chroot 5543 */ 5544 vp = zone->zone_rootvp; 5545 zone_chdir(vp, &PTOU(pp)->u_cdir, pp); 5546 zone_chdir(vp, &PTOU(pp)->u_rdir, pp); 5547 5548 /* 5549 * Change process credentials 5550 */ 5551 newcr = cralloc(); 5552 mutex_enter(&pp->p_crlock); 5553 cr = pp->p_cred; 5554 crcopy_to(cr, newcr); 5555 crsetzone(newcr, zone); 5556 pp->p_cred = newcr; 5557 5558 /* 5559 * Restrict all process privilege sets to zone limit 5560 */ 5561 priv_intersect(zone->zone_privset, &CR_PPRIV(newcr)); 5562 priv_intersect(zone->zone_privset, &CR_EPRIV(newcr)); 5563 priv_intersect(zone->zone_privset, &CR_IPRIV(newcr)); 5564 priv_intersect(zone->zone_privset, &CR_LPRIV(newcr)); 5565 mutex_exit(&pp->p_crlock); 5566 crset(pp, newcr); 5567 5568 /* 5569 * Adjust upcount to reflect zone entry. 5570 */ 5571 uid = crgetruid(newcr); 5572 mutex_enter(&pidlock); 5573 upcount_dec(uid, GLOBAL_ZONEID); 5574 upcount_inc(uid, zoneid); 5575 mutex_exit(&pidlock); 5576 5577 /* 5578 * Set up core file path and content. 5579 */ 5580 set_core_defaults(); 5581 5582 out: 5583 /* 5584 * Let the other lwps continue. 5585 */ 5586 mutex_enter(&pp->p_lock); 5587 if (curthread != pp->p_agenttp) 5588 continuelwps(pp); 5589 mutex_exit(&pp->p_lock); 5590 5591 return (err != 0 ? set_errno(err) : 0); 5592 } 5593 5594 /* 5595 * Systemcall entry point for zone_list(2).
5596 * 5597 * Processes running in a (non-global) zone only see themselves. 5598 * On labeled systems, they see all zones whose label they dominate. 5599 */ 5600 static int 5601 zone_list(zoneid_t *zoneidlist, uint_t *numzones) 5602 { 5603 zoneid_t *zoneids; 5604 zone_t *zone, *myzone; 5605 uint_t user_nzones, real_nzones; 5606 uint_t domi_nzones; 5607 int error; 5608 5609 if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0) 5610 return (set_errno(EFAULT)); 5611 5612 myzone = curproc->p_zone; 5613 if (myzone != global_zone) { 5614 bslabel_t *mybslab; 5615 5616 if (!is_system_labeled()) { 5617 /* just return current zone */ 5618 real_nzones = domi_nzones = 1; 5619 zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP); 5620 zoneids[0] = myzone->zone_id; 5621 } else { 5622 /* return all zones that are dominated */ 5623 mutex_enter(&zonehash_lock); 5624 real_nzones = zonecount; 5625 domi_nzones = 0; 5626 if (real_nzones > 0) { 5627 zoneids = kmem_alloc(real_nzones * 5628 sizeof (zoneid_t), KM_SLEEP); 5629 mybslab = label2bslabel(myzone->zone_slabel); 5630 for (zone = list_head(&zone_active); 5631 zone != NULL; 5632 zone = list_next(&zone_active, zone)) { 5633 if (zone->zone_id == GLOBAL_ZONEID) 5634 continue; 5635 if (zone != myzone && 5636 (zone->zone_flags & ZF_IS_SCRATCH)) 5637 continue; 5638 /* 5639 * Note that a label always dominates 5640 * itself, so myzone is always included 5641 * in the list. 5642 */ 5643 if (bldominates(mybslab, 5644 label2bslabel(zone->zone_slabel))) { 5645 zoneids[domi_nzones++] = 5646 zone->zone_id; 5647 } 5648 } 5649 } 5650 mutex_exit(&zonehash_lock); 5651 } 5652 } else { 5653 mutex_enter(&zonehash_lock); 5654 real_nzones = zonecount; 5655 domi_nzones = 0; 5656 if (real_nzones > 0) { 5657 zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t), 5658 KM_SLEEP); 5659 for (zone = list_head(&zone_active); zone != NULL; 5660 zone = list_next(&zone_active, zone)) 5661 zoneids[domi_nzones++] = zone->zone_id; 5662 ASSERT(domi_nzones == real_nzones); 5663 } 5664 mutex_exit(&zonehash_lock); 5665 } 5666 5667 /* 5668 * If user has allocated space for fewer entries than we found, then 5669 * return only up to his limit. Either way, tell him exactly how many 5670 * we found. 5671 */ 5672 if (domi_nzones < user_nzones) 5673 user_nzones = domi_nzones; 5674 error = 0; 5675 if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) { 5676 error = EFAULT; 5677 } else if (zoneidlist != NULL && user_nzones != 0) { 5678 if (copyout(zoneids, zoneidlist, 5679 user_nzones * sizeof (zoneid_t)) != 0) 5680 error = EFAULT; 5681 } 5682 5683 if (real_nzones > 0) 5684 kmem_free(zoneids, real_nzones * sizeof (zoneid_t)); 5685 5686 if (error != 0) 5687 return (set_errno(error)); 5688 else 5689 return (0); 5690 } 5691 5692 /* 5693 * Systemcall entry point for zone_lookup(2). 5694 * 5695 * Non-global zones are only able to see themselves and (on labeled systems) 5696 * the zones they dominate. 
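 *
 * For example (illustrative), this is what backs the libc
 * getzoneidbyname(3C) wrapper:
 *
 *	zoneid_t zid = getzoneidbyname("myzone");
 *	if (zid == (zoneid_t)-1)
 *		...	zone invisible, not ready, or nonexistent (EINVAL)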
5697 */ 5698 static zoneid_t 5699 zone_lookup(const char *zone_name) 5700 { 5701 char *kname; 5702 zone_t *zone; 5703 zoneid_t zoneid; 5704 int err; 5705 5706 if (zone_name == NULL) { 5707 /* return caller's zone id */ 5708 return (getzoneid()); 5709 } 5710 5711 kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP); 5712 if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) { 5713 kmem_free(kname, ZONENAME_MAX); 5714 return (set_errno(err)); 5715 } 5716 5717 mutex_enter(&zonehash_lock); 5718 zone = zone_find_all_by_name(kname); 5719 kmem_free(kname, ZONENAME_MAX); 5720 /* 5721 * In a non-global zone, can only lookup global and own name. 5722 * In Trusted Extensions zone label dominance rules apply. 5723 */ 5724 if (zone == NULL || 5725 zone_status_get(zone) < ZONE_IS_READY || 5726 !zone_list_access(zone)) { 5727 mutex_exit(&zonehash_lock); 5728 return (set_errno(EINVAL)); 5729 } else { 5730 zoneid = zone->zone_id; 5731 mutex_exit(&zonehash_lock); 5732 return (zoneid); 5733 } 5734 } 5735 5736 static int 5737 zone_version(int *version_arg) 5738 { 5739 int version = ZONE_SYSCALL_API_VERSION; 5740 5741 if (copyout(&version, version_arg, sizeof (int)) != 0) 5742 return (set_errno(EFAULT)); 5743 return (0); 5744 } 5745 5746 /* ARGSUSED */ 5747 long 5748 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) 5749 { 5750 zone_def zs; 5751 int err; 5752 5753 switch (cmd) { 5754 case ZONE_CREATE: 5755 if (get_udatamodel() == DATAMODEL_NATIVE) { 5756 if (copyin(arg1, &zs, sizeof (zone_def))) { 5757 return (set_errno(EFAULT)); 5758 } 5759 } else { 5760 #ifdef _SYSCALL32_IMPL 5761 zone_def32 zs32; 5762 5763 if (copyin(arg1, &zs32, sizeof (zone_def32))) { 5764 return (set_errno(EFAULT)); 5765 } 5766 zs.zone_name = 5767 (const char *)(unsigned long)zs32.zone_name; 5768 zs.zone_root = 5769 (const char *)(unsigned long)zs32.zone_root; 5770 zs.zone_privs = 5771 (const struct priv_set *) 5772 (unsigned long)zs32.zone_privs; 5773 zs.zone_privssz = zs32.zone_privssz; 5774 zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf; 5775 zs.rctlbufsz = zs32.rctlbufsz; 5776 zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf; 5777 zs.zfsbufsz = zs32.zfsbufsz; 5778 zs.extended_error = 5779 (int *)(unsigned long)zs32.extended_error; 5780 zs.match = zs32.match; 5781 zs.doi = zs32.doi; 5782 zs.label = (const bslabel_t *)(uintptr_t)zs32.label; 5783 zs.flags = zs32.flags; 5784 #else 5785 panic("get_udatamodel() returned bogus result\n"); 5786 #endif 5787 } 5788 5789 return (zone_create(zs.zone_name, zs.zone_root, 5790 zs.zone_privs, zs.zone_privssz, 5791 (caddr_t)zs.rctlbuf, zs.rctlbufsz, 5792 (caddr_t)zs.zfsbuf, zs.zfsbufsz, 5793 zs.extended_error, zs.match, zs.doi, 5794 zs.label, zs.flags)); 5795 case ZONE_BOOT: 5796 return (zone_boot((zoneid_t)(uintptr_t)arg1)); 5797 case ZONE_DESTROY: 5798 return (zone_destroy((zoneid_t)(uintptr_t)arg1)); 5799 case ZONE_GETATTR: 5800 return (zone_getattr((zoneid_t)(uintptr_t)arg1, 5801 (int)(uintptr_t)arg2, arg3, (size_t)arg4)); 5802 case ZONE_SETATTR: 5803 return (zone_setattr((zoneid_t)(uintptr_t)arg1, 5804 (int)(uintptr_t)arg2, arg3, (size_t)arg4)); 5805 case ZONE_ENTER: 5806 return (zone_enter((zoneid_t)(uintptr_t)arg1)); 5807 case ZONE_LIST: 5808 return (zone_list((zoneid_t *)arg1, (uint_t *)arg2)); 5809 case ZONE_SHUTDOWN: 5810 return (zone_shutdown((zoneid_t)(uintptr_t)arg1)); 5811 case ZONE_LOOKUP: 5812 return (zone_lookup((const char *)arg1)); 5813 case ZONE_VERSION: 5814 return (zone_version((int *)arg1)); 5815 case ZONE_ADD_DATALINK: 5816 return 
(zone_add_datalink((zoneid_t)(uintptr_t)arg1, 5817 (datalink_id_t)(uintptr_t)arg2)); 5818 case ZONE_DEL_DATALINK: 5819 return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1, 5820 (datalink_id_t)(uintptr_t)arg2)); 5821 case ZONE_CHECK_DATALINK: { 5822 zoneid_t zoneid; 5823 boolean_t need_copyout; 5824 5825 if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0) 5826 return (set_errno(EFAULT)); 5827 need_copyout = (zoneid == ALL_ZONES); 5828 err = zone_check_datalink(&zoneid, 5829 (datalink_id_t)(uintptr_t)arg2); 5830 if (err == 0 && need_copyout) { 5831 if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0) 5832 err = EFAULT; 5833 } 5834 return (err == 0 ? 0 : set_errno(err)); 5835 } 5836 case ZONE_LIST_DATALINK: 5837 return (zone_list_datalink((zoneid_t)(uintptr_t)arg1, 5838 (int *)arg2, (datalink_id_t *)(uintptr_t)arg3)); 5839 default: 5840 return (set_errno(EINVAL)); 5841 } 5842 } 5843 5844 struct zarg { 5845 zone_t *zone; 5846 zone_cmd_arg_t arg; 5847 }; 5848 5849 static int 5850 zone_lookup_door(const char *zone_name, door_handle_t *doorp) 5851 { 5852 char *buf; 5853 size_t buflen; 5854 int error; 5855 5856 buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name); 5857 buf = kmem_alloc(buflen, KM_SLEEP); 5858 (void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name); 5859 error = door_ki_open(buf, doorp); 5860 kmem_free(buf, buflen); 5861 return (error); 5862 } 5863 5864 static void 5865 zone_release_door(door_handle_t *doorp) 5866 { 5867 door_ki_rele(*doorp); 5868 *doorp = NULL; 5869 } 5870 5871 static void 5872 zone_ki_call_zoneadmd(struct zarg *zargp) 5873 { 5874 door_handle_t door = NULL; 5875 door_arg_t darg, save_arg; 5876 char *zone_name; 5877 size_t zone_namelen; 5878 zoneid_t zoneid; 5879 zone_t *zone; 5880 zone_cmd_arg_t arg; 5881 uint64_t uniqid; 5882 size_t size; 5883 int error; 5884 int retry; 5885 5886 zone = zargp->zone; 5887 arg = zargp->arg; 5888 kmem_free(zargp, sizeof (*zargp)); 5889 5890 zone_namelen = strlen(zone->zone_name) + 1; 5891 zone_name = kmem_alloc(zone_namelen, KM_SLEEP); 5892 bcopy(zone->zone_name, zone_name, zone_namelen); 5893 zoneid = zone->zone_id; 5894 uniqid = zone->zone_uniqid; 5895 /* 5896 * zoneadmd may be down, but at least we can empty out the zone. 5897 * We can ignore the return value of zone_empty() since we're called 5898 * from a kernel thread and know we won't be delivered any signals. 5899 */ 5900 ASSERT(curproc == &p0); 5901 (void) zone_empty(zone); 5902 ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY); 5903 zone_rele(zone); 5904 5905 size = sizeof (arg); 5906 darg.rbuf = (char *)&arg; 5907 darg.data_ptr = (char *)&arg; 5908 darg.rsize = size; 5909 darg.data_size = size; 5910 darg.desc_ptr = NULL; 5911 darg.desc_num = 0; 5912 5913 save_arg = darg; 5914 /* 5915 * Since we're not holding a reference to the zone, any number of 5916 * things can go wrong, including the zone disappearing before we get a 5917 * chance to talk to zoneadmd. 5918 */ 5919 for (retry = 0; /* forever */; retry++) { 5920 if (door == NULL && 5921 (error = zone_lookup_door(zone_name, &door)) != 0) { 5922 goto next; 5923 } 5924 ASSERT(door != NULL); 5925 5926 if ((error = door_ki_upcall_limited(door, &darg, NULL, 5927 SIZE_MAX, 0)) == 0) { 5928 break; 5929 } 5930 switch (error) { 5931 case EINTR: 5932 /* FALLTHROUGH */ 5933 case EAGAIN: /* process may be forking */ 5934 /* 5935 * Back off for a bit 5936 */ 5937 break; 5938 case EBADF: 5939 zone_release_door(&door); 5940 if (zone_lookup_door(zone_name, &door) != 0) { 5941 /* 5942 * zoneadmd may be dead, but it may come back to 5943 * life later.
5944 */ 5945 break; 5946 } 5947 break; 5948 default: 5949 cmn_err(CE_WARN, 5950 "zone_ki_call_zoneadmd: door_ki_upcall error %d\n", 5951 error); 5952 goto out; 5953 } 5954 next: 5955 /* 5956 * If this isn't the same zone_t that we originally had in mind, 5957 * then this is the same as if two kadmin requests come in at 5958 * the same time: the first one wins. This means we lose, so we 5959 * bail. 5960 */ 5961 if ((zone = zone_find_by_id(zoneid)) == NULL) { 5962 /* 5963 * Problem is solved. 5964 */ 5965 break; 5966 } 5967 if (zone->zone_uniqid != uniqid) { 5968 /* 5969 * zoneid recycled 5970 */ 5971 zone_rele(zone); 5972 break; 5973 } 5974 /* 5975 * We could zone_status_timedwait(), but there doesn't seem to 5976 * be much point in doing that (plus, it would mean that 5977 * zone_free() isn't called until this thread exits). 5978 */ 5979 zone_rele(zone); 5980 delay(hz); 5981 darg = save_arg; 5982 } 5983 out: 5984 if (door != NULL) { 5985 zone_release_door(&door); 5986 } 5987 kmem_free(zone_name, zone_namelen); 5988 thread_exit(); 5989 } 5990 5991 /* 5992 * Entry point for uadmin() to tell the zone to go away or reboot. Analogous to 5993 * kadmin(). The caller is a process in the zone. 5994 * 5995 * In order to shut down the zone, we will hand off control to zoneadmd 5996 * (running in the global zone) via a door. We do a half-hearted job of 5997 * killing all processes in the zone, create a kernel thread to contact 5998 * zoneadmd, and make note of the "uniqid" of the zone. The uniqid is 5999 * a form of generation number used to let zoneadmd (as well as 6000 * zone_destroy()) know exactly which zone they're talking about. 6001 */ 6002 int 6003 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) 6004 { 6005 struct zarg *zargp; 6006 zone_cmd_t zcmd; 6007 zone_t *zone; 6008 6009 zone = curproc->p_zone; 6010 ASSERT(getzoneid() != GLOBAL_ZONEID); 6011 6012 switch (cmd) { 6013 case A_SHUTDOWN: 6014 switch (fcn) { 6015 case AD_HALT: 6016 case AD_POWEROFF: 6017 zcmd = Z_HALT; 6018 break; 6019 case AD_BOOT: 6020 zcmd = Z_REBOOT; 6021 break; 6022 case AD_IBOOT: 6023 case AD_SBOOT: 6024 case AD_SIBOOT: 6025 case AD_NOSYNC: 6026 return (ENOTSUP); 6027 default: 6028 return (EINVAL); 6029 } 6030 break; 6031 case A_REBOOT: 6032 zcmd = Z_REBOOT; 6033 break; 6034 case A_FTRACE: 6035 case A_REMOUNT: 6036 case A_FREEZE: 6037 case A_DUMP: 6038 case A_CONFIG: 6039 return (ENOTSUP); 6040 default: 6041 ASSERT(cmd != A_SWAPCTL); /* handled by uadmin() */ 6042 return (EINVAL); 6043 } 6044 6045 if (secpolicy_zone_admin(credp, B_FALSE)) 6046 return (EPERM); 6047 mutex_enter(&zone_status_lock); 6048 6049 /* 6050 * zone_status can't be ZONE_IS_EMPTY or higher since curproc 6051 * is in the zone. 6052 */ 6053 ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY); 6054 if (zone_status_get(zone) > ZONE_IS_RUNNING) { 6055 /* 6056 * This zone is already on its way down. 6057 */ 6058 mutex_exit(&zone_status_lock); 6059 return (0); 6060 } 6061 /* 6062 * Prevent future zone_enter()s 6063 */ 6064 zone_status_set(zone, ZONE_IS_SHUTTING_DOWN); 6065 mutex_exit(&zone_status_lock); 6066 6067 /* 6068 * Kill everyone now and call zoneadmd later. 6069 * zone_ki_call_zoneadmd() will do a more thorough job of this 6070 * later. 6071 */ 6072 killall(zone->zone_id); 6073 /* 6074 * Now, create the thread to contact zoneadmd and do the rest of the 6075 * work. This thread can't be created in our zone; otherwise 6076 * zone_destroy() would deadlock.
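 * (Were this thread created inside the dying zone, it would count among
 * the zone's kernel threads and block the transition to ZONE_IS_DOWN
 * that zone_destroy() waits for.)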
6077 */ 6078 zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP); 6079 zargp->arg.cmd = zcmd; 6080 zargp->arg.uniqid = zone->zone_uniqid; 6081 zargp->zone = zone; 6082 (void) strcpy(zargp->arg.locale, "C"); 6083 /* mdep was already copied in for us by uadmin */ 6084 if (mdep != NULL) 6085 (void) strlcpy(zargp->arg.bootbuf, mdep, 6086 sizeof (zargp->arg.bootbuf)); 6087 zone_hold(zone); 6088 6089 (void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0, 6090 TS_RUN, minclsyspri); 6091 exit(CLD_EXITED, 0); 6092 6093 return (EINVAL); 6094 } 6095 6096 /* 6097 * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's 6098 * status to ZONE_IS_SHUTTING_DOWN. 6099 * 6100 * This function also shuts down all running zones to ensure that they won't 6101 * fork new processes. 6102 */ 6103 void 6104 zone_shutdown_global(void) 6105 { 6106 zone_t *current_zonep; 6107 6108 ASSERT(INGLOBALZONE(curproc)); 6109 mutex_enter(&zonehash_lock); 6110 mutex_enter(&zone_status_lock); 6111 6112 /* Modify the global zone's status first. */ 6113 ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING); 6114 zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN); 6115 6116 /* 6117 * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN. 6118 * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so 6119 * could cause assertions to fail (e.g., assertions about a zone's 6120 * state during initialization, readying, or booting) or produce races. 6121 * We'll let threads continue to initialize and ready new zones: they'll 6122 * fail to boot the new zones when they see that the global zone is 6123 * shutting down. 6124 */ 6125 for (current_zonep = list_head(&zone_active); current_zonep != NULL; 6126 current_zonep = list_next(&zone_active, current_zonep)) { 6127 if (zone_status_get(current_zonep) == ZONE_IS_RUNNING) 6128 zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN); 6129 } 6130 mutex_exit(&zone_status_lock); 6131 mutex_exit(&zonehash_lock); 6132 } 6133 6134 /* 6135 * Returns true if the named dataset is visible in the current zone. 6136 * The 'write' parameter is set to 1 if the dataset is also writable. 6137 */ 6138 int 6139 zone_dataset_visible(const char *dataset, int *write) 6140 { 6141 static int zfstype = -1; 6142 zone_dataset_t *zd; 6143 size_t len; 6144 zone_t *zone = curproc->p_zone; 6145 const char *name = NULL; 6146 vfs_t *vfsp = NULL; 6147 6148 if (dataset[0] == '\0') 6149 return (0); 6150 6151 /* 6152 * Walk the list once, looking for datasets which match exactly, or 6153 * specify a dataset underneath an exported dataset. If found, return 6154 * true and note that it is writable. 6155 */ 6156 for (zd = list_head(&zone->zone_datasets); zd != NULL; 6157 zd = list_next(&zone->zone_datasets, zd)) { 6158 6159 len = strlen(zd->zd_dataset); 6160 if (strlen(dataset) >= len && 6161 bcmp(dataset, zd->zd_dataset, len) == 0 && 6162 (dataset[len] == '\0' || dataset[len] == '/' || 6163 dataset[len] == '@')) { 6164 if (write) 6165 *write = 1; 6166 return (1); 6167 } 6168 } 6169 6170 /* 6171 * Walk the list a second time, searching for datasets which are parents 6172 * of exported datasets. These should be visible, but read-only. 6173 * 6174 * Note that we also have to support forms such as 'pool/dataset/', with 6175 * a trailing slash. 
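 *
 * For example (dataset names illustrative): with "tank/zone/a" delegated
 * to the zone, the first pass above accepts "tank/zone/a" and
 * "tank/zone/a/fs" as writable, while this pass accepts the parents
 * "tank" and "tank/zone/" as visible but read-only.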
6176 */ 6177 for (zd = list_head(&zone->zone_datasets); zd != NULL; 6178 zd = list_next(&zone->zone_datasets, zd)) { 6179 6180 len = strlen(dataset); 6181 if (dataset[len - 1] == '/') 6182 len--; /* Ignore trailing slash */ 6183 if (len < strlen(zd->zd_dataset) && 6184 bcmp(dataset, zd->zd_dataset, len) == 0 && 6185 zd->zd_dataset[len] == '/') { 6186 if (write) 6187 *write = 0; 6188 return (1); 6189 } 6190 } 6191 6192 /* 6193 * We reach here if the given dataset is not found in the zone_dataset 6194 * list. Check if this dataset was added as a filesystem (i.e., "add fs") 6195 * instead of by delegation. For this, we search for the dataset in the 6196 * zone_vfslist of this zone. If found, return true and note that it is 6197 * not writable. 6198 */ 6199 6200 /* 6201 * Initialize zfstype if it is not initialized yet. 6202 */ 6203 if (zfstype == -1) { 6204 struct vfssw *vswp = vfs_getvfssw("zfs"); 6205 zfstype = vswp - vfssw; 6206 vfs_unrefvfssw(vswp); 6207 } 6208 6209 vfs_list_read_lock(); 6210 vfsp = zone->zone_vfslist; 6211 do { 6212 ASSERT(vfsp); 6213 if (vfsp->vfs_fstype == zfstype) { 6214 name = refstr_value(vfsp->vfs_resource); 6215 6216 /* 6217 * Check if we have an exact match. 6218 */ 6219 if (strcmp(dataset, name) == 0) { 6220 vfs_list_unlock(); 6221 if (write) 6222 *write = 0; 6223 return (1); 6224 } 6225 /* 6226 * We need to check if the requested dataset is a parent 6227 * of the mounted dataset. These should be visible, but read-only. 6228 */ 6229 len = strlen(dataset); 6230 if (dataset[len - 1] == '/') 6231 len--; 6232 6233 if (len < strlen(name) && 6234 bcmp(dataset, name, len) == 0 && name[len] == '/') { 6235 vfs_list_unlock(); 6236 if (write) 6237 *write = 0; 6238 return (1); 6239 } 6240 } 6241 vfsp = vfsp->vfs_zone_next; 6242 } while (vfsp != zone->zone_vfslist); 6243 6244 vfs_list_unlock(); 6245 return (0); 6246 } 6247 6248 /* 6249 * zone_find_by_any_path() - 6250 * 6251 * kernel-private routine similar to zone_find_by_path(), but which 6252 * effectively compares against zone paths rather than zonerootpath 6253 * (i.e., the last components of zonerootpaths, which should be "root/", 6254 * are not compared.) This is done in order to accurately identify all 6255 * paths, whether zone-visible or not, including those which are parallel 6256 * to /root/, such as /dev/, /home/, etc. 6257 * 6258 * If the specified path does not fall under any zone path, then the global 6259 * zone is returned. 6260 * 6261 * The treat_abs parameter indicates whether the path should be treated as 6262 * an absolute path even though it does not begin with "/". (This supports 6263 * nfs mount syntax such as host:any/path.) 6264 * 6265 * The caller is responsible for zone_rele of the returned zone.
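 *
 * For example (hypothetical zonepath): for a zone whose zonerootpath is
 * "/zones/foo/root/", only the "/zones/foo/" prefix is compared, so
 * "/zones/foo/dev/..." and "/zones/foo/root/usr" both resolve to that
 * zone.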
6266 */ 6267 zone_t * 6268 zone_find_by_any_path(const char *path, boolean_t treat_abs) 6269 { 6270 zone_t *zone; 6271 int path_offset = 0; 6272 6273 if (path == NULL) { 6274 zone_hold(global_zone); 6275 return (global_zone); 6276 } 6277 6278 if (*path != '/') { 6279 ASSERT(treat_abs); 6280 path_offset = 1; 6281 } 6282 6283 mutex_enter(&zonehash_lock); 6284 for (zone = list_head(&zone_active); zone != NULL; 6285 zone = list_next(&zone_active, zone)) { 6286 char *c; 6287 size_t pathlen; 6288 char *rootpath_start; 6289 6290 if (zone == global_zone) /* skip global zone */ 6291 continue; 6292 6293 /* scan backwards to find start of last component */ 6294 c = zone->zone_rootpath + zone->zone_rootpathlen - 2; 6295 do { 6296 c--; 6297 } while (*c != '/'); 6298 6299 pathlen = c - zone->zone_rootpath + 1 - path_offset; 6300 rootpath_start = (zone->zone_rootpath + path_offset); 6301 if (strncmp(path, rootpath_start, pathlen) == 0) 6302 break; 6303 } 6304 if (zone == NULL) 6305 zone = global_zone; 6306 zone_hold(zone); 6307 mutex_exit(&zonehash_lock); 6308 return (zone); 6309 } 6310 6311 /* 6312 * Finds a zone_dl_t with the given linkid in the given zone. Returns the 6313 * zone_dl_t pointer if found, and NULL otherwise. 6314 */ 6315 static zone_dl_t * 6316 zone_find_dl(zone_t *zone, datalink_id_t linkid) 6317 { 6318 zone_dl_t *zdl; 6319 6320 ASSERT(mutex_owned(&zone->zone_lock)); 6321 for (zdl = list_head(&zone->zone_dl_list); zdl != NULL; 6322 zdl = list_next(&zone->zone_dl_list, zdl)) { 6323 if (zdl->zdl_id == linkid) 6324 break; 6325 } 6326 return (zdl); 6327 } 6328 6329 static boolean_t 6330 zone_dl_exists(zone_t *zone, datalink_id_t linkid) 6331 { 6332 boolean_t exists; 6333 6334 mutex_enter(&zone->zone_lock); 6335 exists = (zone_find_dl(zone, linkid) != NULL); 6336 mutex_exit(&zone->zone_lock); 6337 return (exists); 6338 } 6339 6340 /* 6341 * Add a datalink name to the zone. 6342 */ 6343 static int 6344 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid) 6345 { 6346 zone_dl_t *zdl; 6347 zone_t *zone; 6348 zone_t *thiszone; 6349 6350 if ((thiszone = zone_find_by_id(zoneid)) == NULL) 6351 return (set_errno(ENXIO)); 6352 6353 /* Verify that the datalink ID doesn't already belong to a zone. */ 6354 mutex_enter(&zonehash_lock); 6355 for (zone = list_head(&zone_active); zone != NULL; 6356 zone = list_next(&zone_active, zone)) { 6357 if (zone_dl_exists(zone, linkid)) { 6358 mutex_exit(&zonehash_lock); 6359 zone_rele(thiszone); 6360 return (set_errno((zone == thiszone) ? EEXIST : EPERM)); 6361 } 6362 } 6363 6364 zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP); 6365 zdl->zdl_id = linkid; 6366 zdl->zdl_net = NULL; 6367 mutex_enter(&thiszone->zone_lock); 6368 list_insert_head(&thiszone->zone_dl_list, zdl); 6369 mutex_exit(&thiszone->zone_lock); 6370 mutex_exit(&zonehash_lock); 6371 zone_rele(thiszone); 6372 return (0); 6373 } 6374 6375 static int 6376 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid) 6377 { 6378 zone_dl_t *zdl; 6379 zone_t *zone; 6380 int err = 0; 6381 6382 if ((zone = zone_find_by_id(zoneid)) == NULL) 6383 return (set_errno(EINVAL)); 6384 6385 mutex_enter(&zone->zone_lock); 6386 if ((zdl = zone_find_dl(zone, linkid)) == NULL) { 6387 err = ENXIO; 6388 } else { 6389 list_remove(&zone->zone_dl_list, zdl); 6390 if (zdl->zdl_net != NULL) 6391 nvlist_free(zdl->zdl_net); 6392 kmem_free(zdl, sizeof (zone_dl_t)); 6393 } 6394 mutex_exit(&zone->zone_lock); 6395 zone_rele(zone); 6396 return (err == 0 ?
0 : set_errno(err)); 6397 } 6398 6399 /* 6400 * If *zoneidp is set to ALL_ZONES, we look up which zone has been assigned 6401 * the linkid. Otherwise we just check whether the zone specified by *zoneidp 6402 * has been assigned the supplied linkid. 6403 */ 6404 int 6405 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid) 6406 { 6407 zone_t *zone; 6408 int err = ENXIO; 6409 6410 if (*zoneidp != ALL_ZONES) { 6411 if ((zone = zone_find_by_id(*zoneidp)) != NULL) { 6412 if (zone_dl_exists(zone, linkid)) 6413 err = 0; 6414 zone_rele(zone); 6415 } 6416 return (err); 6417 } 6418 6419 mutex_enter(&zonehash_lock); 6420 for (zone = list_head(&zone_active); zone != NULL; 6421 zone = list_next(&zone_active, zone)) { 6422 if (zone_dl_exists(zone, linkid)) { 6423 *zoneidp = zone->zone_id; 6424 err = 0; 6425 break; 6426 } 6427 } 6428 mutex_exit(&zonehash_lock); 6429 return (err); 6430 } 6431 6432 /* 6433 * Get the list of datalink IDs assigned to a zone. 6434 * 6435 * On input, *nump is the number of datalink IDs that can fit in the supplied 6436 * idarray. Upon return, *nump is either set to the number of datalink IDs 6437 * that were placed in the array if the array was large enough, or to the 6438 * number of datalink IDs that the function needs to place in the array if the 6439 * array is too small. 6440 */ 6441 static int 6442 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray) 6443 { 6444 uint_t num, dlcount; 6445 zone_t *zone; 6446 zone_dl_t *zdl; 6447 datalink_id_t *idptr = idarray; 6448 6449 if (copyin(nump, &dlcount, sizeof (dlcount)) != 0) 6450 return (set_errno(EFAULT)); 6451 if ((zone = zone_find_by_id(zoneid)) == NULL) 6452 return (set_errno(ENXIO)); 6453 6454 num = 0; 6455 mutex_enter(&zone->zone_lock); 6456 for (zdl = list_head(&zone->zone_dl_list); zdl != NULL; 6457 zdl = list_next(&zone->zone_dl_list, zdl)) { 6458 /* 6459 * If the list is bigger than what the caller supplied, just 6460 * count; don't do the copyout. 6461 */ 6462 if (++num > dlcount) 6463 continue; 6464 if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) { 6465 mutex_exit(&zone->zone_lock); 6466 zone_rele(zone); 6467 return (set_errno(EFAULT)); 6468 } 6469 idptr++; 6470 } 6471 mutex_exit(&zone->zone_lock); 6472 zone_rele(zone); 6473 6474 /* Whether the count increased or decreased, the caller should be notified. */ 6475 if (num != dlcount) { 6476 if (copyout(&num, nump, sizeof (num)) != 0) 6477 return (set_errno(EFAULT)); 6478 } 6479 return (0); 6480 } 6481 6482 /* 6483 * Public interface for looking up a zone by zoneid. It's a version customized 6484 * for netstack_zone_create(). It can only be called from the ZSD create 6485 * callbacks, since it doesn't take a reference on the zone structure; hence, 6486 * if it is called elsewhere, the zone could disappear after the zonehash_lock 6487 * is dropped. 6488 * 6489 * Furthermore it: 6490 * 1. Doesn't check the status of the zone. 6491 * 2. May be called even before zone_init is called; in that case the 6492 * address of zone0 is returned directly, and netstack_zone_create() 6493 * will only assign a value to zone0.zone_netstack, which won't break anything. 6494 * 3. Returns without the zone being held.
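 *
 * Any other consumer should use zone_find_by_id(), which checks the
 * zone's status and returns with the zone held.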
6495 */ 6496 zone_t * 6497 zone_find_by_id_nolock(zoneid_t zoneid) 6498 { 6499 zone_t *zone; 6500 6501 mutex_enter(&zonehash_lock); 6502 if (zonehashbyid == NULL) 6503 zone = &zone0; 6504 else 6505 zone = zone_find_all_by_id(zoneid); 6506 mutex_exit(&zonehash_lock); 6507 return (zone); 6508 } 6509 6510 /* 6511 * Walk the datalinks for a given zone. 6512 */ 6513 int 6514 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *), 6515 void *data) 6516 { 6517 zone_t *zone; 6518 zone_dl_t *zdl; 6519 datalink_id_t *idarray; 6520 uint_t idcount = 0; 6521 int i, ret = 0; 6522 6523 if ((zone = zone_find_by_id(zoneid)) == NULL) 6524 return (ENOENT); 6525 6526 /* 6527 * We first build an array of linkids so that we can walk these and 6528 * execute the callback with the zone_lock dropped. 6529 */ 6530 mutex_enter(&zone->zone_lock); 6531 for (zdl = list_head(&zone->zone_dl_list); zdl != NULL; 6532 zdl = list_next(&zone->zone_dl_list, zdl)) { 6533 idcount++; 6534 } 6535 6536 if (idcount == 0) { 6537 mutex_exit(&zone->zone_lock); 6538 zone_rele(zone); 6539 return (0); 6540 } 6541 6542 idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP); 6543 if (idarray == NULL) { 6544 mutex_exit(&zone->zone_lock); 6545 zone_rele(zone); 6546 return (ENOMEM); 6547 } 6548 6549 for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL; 6550 i++, zdl = list_next(&zone->zone_dl_list, zdl)) { 6551 idarray[i] = zdl->zdl_id; 6552 } 6553 6554 mutex_exit(&zone->zone_lock); 6555 6556 for (i = 0; i < idcount && ret == 0; i++) { 6557 if ((ret = (*cb)(idarray[i], data)) != 0) 6558 break; 6559 } 6560 6561 zone_rele(zone); 6562 kmem_free(idarray, sizeof (datalink_id_t) * idcount); 6563 return (ret); 6564 } 6565 6566 static char * 6567 zone_net_type2name(int type) 6568 { 6569 switch (type) { 6570 case ZONE_NETWORK_ADDRESS: 6571 return (ZONE_NET_ADDRNAME); 6572 case ZONE_NETWORK_DEFROUTER: 6573 return (ZONE_NET_RTRNAME); 6574 default: 6575 return (NULL); 6576 } 6577 } 6578 6579 static int 6580 zone_set_network(zoneid_t zoneid, zone_net_data_t *znbuf) 6581 { 6582 zone_t *zone; 6583 zone_dl_t *zdl; 6584 nvlist_t *nvl; 6585 int err = 0; 6586 uint8_t *new = NULL; 6587 char *nvname; 6588 int bufsize; 6589 datalink_id_t linkid = znbuf->zn_linkid; 6590 6591 if (secpolicy_zone_config(CRED()) != 0) 6592 return (set_errno(EPERM)); 6593 6594 if (zoneid == GLOBAL_ZONEID) 6595 return (set_errno(EINVAL)); 6596 6597 nvname = zone_net_type2name(znbuf->zn_type); 6598 bufsize = znbuf->zn_len; 6599 new = znbuf->zn_val; 6600 if (nvname == NULL) 6601 return (set_errno(EINVAL)); 6602 6603 if ((zone = zone_find_by_id(zoneid)) == NULL) { 6604 return (set_errno(EINVAL)); 6605 } 6606 6607 mutex_enter(&zone->zone_lock); 6608 if ((zdl = zone_find_dl(zone, linkid)) == NULL) { 6609 err = ENXIO; 6610 goto done; 6611 } 6612 if ((nvl = zdl->zdl_net) == NULL) { 6613 if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)) { 6614 err = ENOMEM; 6615 goto done; 6616 } else { 6617 zdl->zdl_net = nvl; 6618 } 6619 } 6620 if (nvlist_exists(nvl, nvname)) { 6621 err = EINVAL; 6622 goto done; 6623 } 6624 err = nvlist_add_uint8_array(nvl, nvname, new, bufsize); 6625 ASSERT(err == 0); 6626 done: 6627 mutex_exit(&zone->zone_lock); 6628 zone_rele(zone); 6629 if (err != 0) 6630 return (set_errno(err)); 6631 else 6632 return (0); 6633 } 6634 6635 static int 6636 zone_get_network(zoneid_t zoneid, zone_net_data_t *znbuf) 6637 { 6638 zone_t *zone; 6639 zone_dl_t *zdl; 6640 nvlist_t *nvl; 6641 uint8_t *ptr; 6642 uint_t psize; 6643 int err = 0; 6644 char *nvname; 6645
int bufsize; 6646 void *buf; 6647 datalink_id_t linkid = znbuf->zn_linkid; 6648 6649 if (zoneid == GLOBAL_ZONEID) 6650 return (set_errno(EINVAL)); 6651 6652 nvname = zone_net_type2name(znbuf->zn_type); 6653 bufsize = znbuf->zn_len; 6654 buf = znbuf->zn_val; 6655 6656 if (nvname == NULL) 6657 return (set_errno(EINVAL)); 6658 if ((zone = zone_find_by_id(zoneid)) == NULL) 6659 return (set_errno(EINVAL)); 6660 6661 mutex_enter(&zone->zone_lock); 6662 if ((zdl = zone_find_dl(zone, linkid)) == NULL) { 6663 err = ENXIO; 6664 goto done; 6665 } 6666 if ((nvl = zdl->zdl_net) == NULL || !nvlist_exists(nvl, nvname)) { 6667 err = ENOENT; 6668 goto done; 6669 } 6670 err = nvlist_lookup_uint8_array(nvl, nvname, &ptr, &psize); 6671 ASSERT(err == 0); 6672 6673 if (psize > bufsize) { 6674 err = ENOBUFS; 6675 goto done; 6676 } 6677 znbuf->zn_len = psize; 6678 bcopy(ptr, buf, psize); 6679 done: 6680 mutex_exit(&zone->zone_lock); 6681 zone_rele(zone); 6682 if (err != 0) 6683 return (set_errno(err)); 6684 else 6685 return (0); 6686 } 6687