xref: /titanic_44/usr/src/uts/common/os/zone.c (revision b64bfe7dc77dc5c5561cdcd10c80b0b550701a24)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * Zones
28  *
29  *   A zone is a named collection of processes, namespace constraints,
30  *   and other system resources which comprise a secure and manageable
31  *   application containment facility.
32  *
33  *   Zones (represented by the reference counted zone_t) are tracked in
34  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
35  *   (zoneid_t) are used to track zone association.  Zone IDs are
36  *   dynamically generated when the zone is created; if a persistent
37  *   identifier is needed (core files, accounting logs, audit trail,
38  *   etc.), the zone name should be used.
39  *
40  *
41  *   Global Zone:
42  *
43  *   The global zone (zoneid 0) is automatically associated with all
44  *   system resources that have not been bound to a user-created zone.
45  *   This means that even systems where zones are not in active use
46  *   have a global zone, and all processes, mounts, etc. are
47  *   associated with that zone.  The global zone is generally
48  *   unconstrained in terms of privileges and access, though the usual
49  *   credential and privilege based restrictions apply.
50  *
51  *
52  *   Zone States:
53  *
 *   The states a zone may be in, and the transitions between them, are
 *   as follows:
56  *
57  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
58  *   initialized zone is added to the list of active zones on the system but
59  *   isn't accessible.
60  *
61  *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
62  *   not yet completed. Not possible to enter the zone, but attributes can
63  *   be retrieved.
64  *
65  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
66  *   ready.  The zone is made visible after the ZSD constructor callbacks are
67  *   executed.  A zone remains in this state until it transitions into
68  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
69  *
70  *   ZONE_IS_BOOTING: in this shortlived-state, zsched attempts to start
71  *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
72  *   state.
73  *
74  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
75  *   successfully started init.   A zone remains in this state until
76  *   zone_shutdown() is called.
77  *
78  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
79  *   killing all processes running in the zone. The zone remains
80  *   in this state until there are no more user processes running in the zone.
81  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
82  *   Since zone_shutdown() is restartable, it may be called successfully
83  *   multiple times for the same zone_t.  Setting of the zone's state to
84  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
85  *   the zone's status without worrying about it being a moving target.
86  *
87  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
88  *   are no more user processes in the zone.  The zone remains in this
89  *   state until there are no more kernel threads associated with the
90  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
91  *   fail.
92  *
93  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
94  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
95  *   join the zone or create kernel threads therein.
96  *
97  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
98  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
99  *   return NULL from now on.
100  *
101  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
102  *   processes or threads doing work on behalf of the zone.  The zone is
103  *   removed from the list of active zones.  zone_destroy() returns, and
104  *   the zone can be recreated.
105  *
106  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
107  *   callbacks are executed, and all memory associated with the zone is
108  *   freed.
109  *
110  *   Threads can wait for the zone to enter a requested state by using
111  *   zone_status_wait() or zone_status_timedwait() with the desired
112  *   state passed in as an argument.  Zone state transitions are
113  *   uni-directional; it is not possible to move back to an earlier state.
114  *
115  *
116  *   Zone-Specific Data:
117  *
118  *   Subsystems needing to maintain zone-specific data can store that
119  *   data using the ZSD mechanism.  This provides a zone-specific data
120  *   store, similar to thread-specific data (see pthread_getspecific(3C)
 *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
122  *   to register callbacks to be invoked when a zone is created, shut
123  *   down, or destroyed.  This can be used to initialize zone-specific
124  *   data for new zones and to clean up when zones go away.
125  *
126  *
127  *   Data Structures:
128  *
129  *   The per-zone structure (zone_t) is reference counted, and freed
130  *   when all references are released.  zone_hold and zone_rele can be
131  *   used to adjust the reference count.  In addition, reference counts
132  *   associated with the cred_t structure are tracked separately using
133  *   zone_cred_hold and zone_cred_rele.
134  *
135  *   Pointers to active zone_t's are stored in two hash tables; one
136  *   for searching by id, the other for searching by name.  Lookups
137  *   can be performed on either basis, using zone_find_by_id and
138  *   zone_find_by_name.  Both return zone_t pointers with the zone
139  *   held, so zone_rele should be called when the pointer is no longer
140  *   needed.  Zones can also be searched by path; zone_find_by_path
141  *   returns the zone with which a path name is associated (global
142  *   zone if the path is not within some other zone's file system
143  *   hierarchy).  This currently requires iterating through each zone,
144  *   so it is slower than an id or name search via a hash table.
145  *
146  *
147  *   Locking:
148  *
149  *   zonehash_lock: This is a top-level global lock used to protect the
150  *       zone hash tables and lists.  Zones cannot be created or destroyed
151  *       while this lock is held.
152  *   zone_status_lock: This is a global lock protecting zone state.
153  *       Zones cannot change state while this lock is held.  It also
154  *       protects the list of kernel threads associated with a zone.
155  *   zone_lock: This is a per-zone lock used to protect several fields of
156  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
157  *       this lock means that the zone cannot go away.
158  *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
159  *	 related to the zone.max-lwps rctl.
160  *   zone_mem_lock: This is a per-zone lock used to protect the fields
161  *	 related to the zone.max-locked-memory and zone.max-swap rctls.
162  *   zone_rctl_lock: This is a per-zone lock used to protect other rctls,
163  *       currently just max_lofi
164  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
165  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
166  *       list (a list of zones in the ZONE_IS_DEAD state).
167  *
168  *   Ordering requirements:
169  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
170  *       	zone_lock --> zsd_key_lock --> pidlock --> p_lock
171  *
172  *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
173  *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
174  *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
175  *
176  *   Blocking memory allocations are permitted while holding any of the
177  *   zone locks.
178  *
179  *
180  *   System Call Interface:
181  *
182  *   The zone subsystem can be managed and queried from user level with
183  *   the following system calls (all subcodes of the primary "zone"
184  *   system call):
185  *   - zone_create: creates a zone with selected attributes (name,
186  *     root path, privileges, resource controls, ZFS datasets)
187  *   - zone_enter: allows the current process to enter a zone
188  *   - zone_getattr: reports attributes of a zone
189  *   - zone_setattr: set attributes of a zone
190  *   - zone_boot: set 'init' running for the zone
191  *   - zone_list: lists all zones active in the system
192  *   - zone_lookup: looks up zone id based on name
193  *   - zone_shutdown: initiates shutdown process (see states above)
194  *   - zone_destroy: completes shutdown process (see states above)
195  *
196  */
197 
198 #include <sys/priv_impl.h>
199 #include <sys/cred.h>
200 #include <c2/audit.h>
201 #include <sys/debug.h>
202 #include <sys/file.h>
203 #include <sys/kmem.h>
204 #include <sys/kstat.h>
205 #include <sys/mutex.h>
206 #include <sys/note.h>
207 #include <sys/pathname.h>
208 #include <sys/proc.h>
209 #include <sys/project.h>
210 #include <sys/sysevent.h>
211 #include <sys/task.h>
212 #include <sys/systm.h>
213 #include <sys/types.h>
214 #include <sys/utsname.h>
215 #include <sys/vnode.h>
216 #include <sys/vfs.h>
217 #include <sys/systeminfo.h>
218 #include <sys/policy.h>
219 #include <sys/cred_impl.h>
220 #include <sys/contract_impl.h>
221 #include <sys/contract/process_impl.h>
222 #include <sys/class.h>
223 #include <sys/pool.h>
224 #include <sys/pool_pset.h>
225 #include <sys/pset.h>
226 #include <sys/sysmacros.h>
227 #include <sys/callb.h>
228 #include <sys/vmparam.h>
229 #include <sys/corectl.h>
230 #include <sys/ipc_impl.h>
231 #include <sys/klpd.h>
232 
233 #include <sys/door.h>
234 #include <sys/cpuvar.h>
235 #include <sys/sdt.h>
236 
237 #include <sys/uadmin.h>
238 #include <sys/session.h>
239 #include <sys/cmn_err.h>
240 #include <sys/modhash.h>
241 #include <sys/sunddi.h>
242 #include <sys/nvpair.h>
243 #include <sys/rctl.h>
244 #include <sys/fss.h>
245 #include <sys/brand.h>
246 #include <sys/zone.h>
247 #include <net/if.h>
248 #include <sys/cpucaps.h>
249 #include <vm/seg.h>
250 #include <sys/mac.h>
251 
252 /* List of data link IDs which are accessible from the zone */
253 typedef struct zone_dl {
254 	datalink_id_t	zdl_id;
255 	list_node_t	zdl_linkage;
256 } zone_dl_t;
257 
258 /*
259  * cv used to signal that all references to the zone have been released.  This
260  * needs to be global since there may be multiple waiters, and the first to
261  * wake up will free the zone_t, hence we cannot use zone->zone_cv.
262  */
263 static kcondvar_t zone_destroy_cv;
264 /*
265  * Lock used to serialize access to zone_cv.  This could have been per-zone,
266  * but then we'd need another lock for zone_destroy_cv, and why bother?
267  */
268 static kmutex_t zone_status_lock;
269 
270 /*
271  * ZSD-related global variables.
272  */
273 static kmutex_t zsd_key_lock;	/* protects the following two */
274 /*
275  * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
276  */
277 static zone_key_t zsd_keyval = 0;
278 /*
279  * Global list of registered keys.  We use this when a new zone is created.
280  */
281 static list_t zsd_registered_keys;
282 
283 int zone_hash_size = 256;
284 static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
285 static kmutex_t zonehash_lock;
286 static uint_t zonecount;
287 static id_space_t *zoneid_space;
288 
289 /*
290  * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
291  * kernel proper runs, and which manages all other zones.
292  *
293  * Although not declared as static, the variable "zone0" should not be used
294  * except for by code that needs to reference the global zone early on in boot,
295  * before it is fully initialized.  All other consumers should use
296  * 'global_zone'.
297  */
298 zone_t zone0;
299 zone_t *global_zone = NULL;	/* Set when the global zone is initialized */
300 
301 /*
302  * List of active zones, protected by zonehash_lock.
303  */
304 static list_t zone_active;
305 
306 /*
307  * List of destroyed zones that still have outstanding cred references.
308  * Used for debugging.  Uses a separate lock to avoid lock ordering
309  * problems in zone_free.
310  */
311 static list_t zone_deathrow;
312 static kmutex_t zone_deathrow_lock;
313 
314 /* number of zones is limited by virtual interface limit in IP */
315 uint_t maxzones = 8192;
316 
317 /* Event channel to sent zone state change notifications */
318 evchan_t *zone_event_chan;
319 
320 /*
321  * This table holds the mapping from kernel zone states to
322  * states visible in the state notification API.
323  * The idea is that we only expose "obvious" states and
324  * do not expose states which are just implementation details.
325  */
326 const char  *zone_status_table[] = {
327 	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
328 	ZONE_EVENT_INITIALIZED,		/* initialized */
329 	ZONE_EVENT_READY,		/* ready */
330 	ZONE_EVENT_READY,		/* booting */
331 	ZONE_EVENT_RUNNING,		/* running */
332 	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
333 	ZONE_EVENT_SHUTTING_DOWN,	/* empty */
334 	ZONE_EVENT_SHUTTING_DOWN,	/* down */
335 	ZONE_EVENT_SHUTTING_DOWN,	/* dying */
336 	ZONE_EVENT_UNINITIALIZED,	/* dead */
337 };
338 
339 /*
340  * This isn't static so lint doesn't complain.
341  */
342 rctl_hndl_t rc_zone_cpu_shares;
343 rctl_hndl_t rc_zone_locked_mem;
344 rctl_hndl_t rc_zone_max_swap;
345 rctl_hndl_t rc_zone_max_lofi;
346 rctl_hndl_t rc_zone_cpu_cap;
347 rctl_hndl_t rc_zone_nlwps;
348 rctl_hndl_t rc_zone_nprocs;
349 rctl_hndl_t rc_zone_shmmax;
350 rctl_hndl_t rc_zone_shmmni;
351 rctl_hndl_t rc_zone_semmni;
352 rctl_hndl_t rc_zone_msgmni;
353 /*
354  * Synchronization primitives used to synchronize between mounts and zone
355  * creation/destruction.
356  */
357 static int mounts_in_progress;
358 static kcondvar_t mount_cv;
359 static kmutex_t mount_lock;
360 
/* Default path of the init program for a zone. */
const char * const zone_default_initname = "/sbin/init";

/* Path prefix string "/zone/". */
static char * const zone_prefix = "/zone/";
363 static int zone_shutdown(zoneid_t zoneid);
364 static int zone_add_datalink(zoneid_t, datalink_id_t);
365 static int zone_remove_datalink(zoneid_t, datalink_id_t);
366 static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);
367 
368 typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
369 
370 static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
371 static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
372 static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
373 static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
374     zone_key_t);
375 static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
376 static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
377     kmutex_t *);
378 static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
379     kmutex_t *);
380 
/*
 * Version of the zone system call interface.  Bump it whenever the zone
 * syscall interfaces are altered: libc has to keep supporting previous
 * API versions for patching purposes, and it calls into the kernel to
 * find out this number.
 *
 * History:
 *   1 - the API as originally shipped with Solaris 10.
 *   2 - zone_create takes its arguments in a structure, so that more
 *       arguments can be supported, and reports errors better when it
 *       fails.
 *   3 - zone_create supports the import of ZFS datasets to zones.
 *   4 - zone_create supports Trusted Extensions.
 *   5 - zone_boot changes; its old bootargs parameter is set through the
 *       zone_setattr API instead.
 *   6 - zone_create gains the flag argument.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;
399 
400 /*
401  * Certain filesystems (such as NFS and autofs) need to know which zone
402  * the mount is being placed in.  Because of this, we need to be able to
403  * ensure that a zone isn't in the process of being created such that
404  * nfs_mount() thinks it is in the global zone, while by the time it
 * gets added to the list of mounted zones, it ends up on zoneA's mount
406  * list.
407  *
408  * The following functions: block_mounts()/resume_mounts() and
409  * mount_in_progress()/mount_completed() are used by zones and the VFS
410  * layer (respectively) to synchronize zone creation and new mounts.
411  *
412  * The semantics are like a reader-reader lock such that there may
413  * either be multiple mounts (or zone creations, if that weren't
414  * serialized by zonehash_lock) in progress at the same time, but not
415  * both.
416  *
417  * We use cv's so the user can ctrl-C out of the operation if it's
418  * taking too long.
419  *
420  * The semantics are such that there is unfair bias towards the
421  * "current" operation.  This means that zone creations may starve if
422  * there is a rapid succession of new mounts coming in to the system, or
423  * there is a remote possibility that zones will be created at such a
424  * rate that new mounts will not be able to proceed.
425  */
426 /*
427  * Prevent new mounts from progressing to the point of calling
428  * VFS_MOUNT().  If there are already mounts in this "region", wait for
429  * them to complete.
430  */
431 static int
432 block_mounts(void)
433 {
434 	int retval = 0;
435 
436 	/*
437 	 * Since it may block for a long time, block_mounts() shouldn't be
438 	 * called with zonehash_lock held.
439 	 */
440 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
441 	mutex_enter(&mount_lock);
442 	while (mounts_in_progress > 0) {
443 		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
444 			goto signaled;
445 	}
446 	/*
447 	 * A negative value of mounts_in_progress indicates that mounts
448 	 * have been blocked by (-mounts_in_progress) different callers.
449 	 */
450 	mounts_in_progress--;
451 	retval = 1;
452 signaled:
453 	mutex_exit(&mount_lock);
454 	return (retval);
455 }
456 
457 /*
458  * The VFS layer may progress with new mounts as far as we're concerned.
459  * Allow them to progress if we were the last obstacle.
460  */
461 static void
462 resume_mounts(void)
463 {
464 	mutex_enter(&mount_lock);
465 	if (++mounts_in_progress == 0)
466 		cv_broadcast(&mount_cv);
467 	mutex_exit(&mount_lock);
468 }
469 
470 /*
471  * The VFS layer is busy with a mount; zones should wait until all
472  * mounts are completed to progress.
473  */
474 void
475 mount_in_progress(void)
476 {
477 	mutex_enter(&mount_lock);
478 	while (mounts_in_progress < 0)
479 		cv_wait(&mount_cv, &mount_lock);
480 	mounts_in_progress++;
481 	mutex_exit(&mount_lock);
482 }
483 
484 /*
485  * VFS is done with one mount; wake up any waiting block_mounts()
486  * callers if this is the last mount.
487  */
488 void
489 mount_completed(void)
490 {
491 	mutex_enter(&mount_lock);
492 	if (--mounts_in_progress == 0)
493 		cv_broadcast(&mount_cv);
494 	mutex_exit(&mount_lock);
495 }
496 
497 /*
498  * ZSD routines.
499  *
500  * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
501  * defined by the pthread_key_create() and related interfaces.
502  *
503  * Kernel subsystems may register one or more data items and/or
504  * callbacks to be executed when a zone is created, shutdown, or
505  * destroyed.
506  *
507  * Unlike the thread counterpart, destructor callbacks will be executed
508  * even if the data pointer is NULL and/or there are no constructor
509  * callbacks, so it is the responsibility of such callbacks to check for
510  * NULL data values if necessary.
511  *
512  * The locking strategy and overall picture is as follows:
513  *
514  * When someone calls zone_key_create(), a template ZSD entry is added to the
515  * global list "zsd_registered_keys", protected by zsd_key_lock.  While
516  * holding that lock all the existing zones are marked as
517  * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
518  * zone_zsd list (protected by zone_lock). The global list is updated first
 * (under zsd_key_lock) to make sure that newly created zones use the
520  * most recent list of keys. Then under zonehash_lock we walk the zones
521  * and mark them.  Similar locking is used in zone_key_delete().
522  *
523  * The actual create, shutdown, and destroy callbacks are done without
524  * holding any lock. And zsd_flags are used to ensure that the operations
525  * completed so that when zone_key_create (and zone_create) is done, as well as
526  * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
527  * are completed.
528  *
529  * When new zones are created constructor callbacks for all registered ZSD
530  * entries will be called. That also uses the above two phases of marking
531  * what needs to be done, and then running the callbacks without holding
532  * any locks.
533  *
534  * The framework does not provide any locking around zone_getspecific() and
535  * zone_setspecific() apart from that needed for internal consistency, so
536  * callers interested in atomic "test-and-set" semantics will need to provide
537  * their own locking.
538  */
539 
540 /*
541  * Helper function to find the zsd_entry associated with the key in the
542  * given list.
543  */
544 static struct zsd_entry *
545 zsd_find(list_t *l, zone_key_t key)
546 {
547 	struct zsd_entry *zsd;
548 
549 	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
550 		if (zsd->zsd_key == key) {
551 			return (zsd);
552 		}
553 	}
554 	return (NULL);
555 }
556 
557 /*
558  * Helper function to find the zsd_entry associated with the key in the
559  * given list. Move it to the front of the list.
560  */
561 static struct zsd_entry *
562 zsd_find_mru(list_t *l, zone_key_t key)
563 {
564 	struct zsd_entry *zsd;
565 
566 	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
567 		if (zsd->zsd_key == key) {
568 			/*
569 			 * Move to head of list to keep list in MRU order.
570 			 */
571 			if (zsd != list_head(l)) {
572 				list_remove(l, zsd);
573 				list_insert_head(l, zsd);
574 			}
575 			return (zsd);
576 		}
577 	}
578 	return (NULL);
579 }
580 
581 void
582 zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
583     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
584 {
585 	struct zsd_entry *zsdp;
586 	struct zsd_entry *t;
587 	struct zone *zone;
588 	zone_key_t  key;
589 
590 	zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
591 	zsdp->zsd_data = NULL;
592 	zsdp->zsd_create = create;
593 	zsdp->zsd_shutdown = shutdown;
594 	zsdp->zsd_destroy = destroy;
595 
596 	/*
597 	 * Insert in global list of callbacks. Makes future zone creations
598 	 * see it.
599 	 */
600 	mutex_enter(&zsd_key_lock);
601 	key = zsdp->zsd_key = ++zsd_keyval;
602 	ASSERT(zsd_keyval != 0);
603 	list_insert_tail(&zsd_registered_keys, zsdp);
604 	mutex_exit(&zsd_key_lock);
605 
606 	/*
607 	 * Insert for all existing zones and mark them as needing
608 	 * a create callback.
609 	 */
610 	mutex_enter(&zonehash_lock);	/* stop the world */
611 	for (zone = list_head(&zone_active); zone != NULL;
612 	    zone = list_next(&zone_active, zone)) {
613 		zone_status_t status;
614 
615 		mutex_enter(&zone->zone_lock);
616 
617 		/* Skip zones that are on the way down or not yet up */
618 		status = zone_status_get(zone);
619 		if (status >= ZONE_IS_DOWN ||
620 		    status == ZONE_IS_UNINITIALIZED) {
621 			mutex_exit(&zone->zone_lock);
622 			continue;
623 		}
624 
625 		t = zsd_find_mru(&zone->zone_zsd, key);
626 		if (t != NULL) {
627 			/*
628 			 * A zsd_configure already inserted it after
629 			 * we dropped zsd_key_lock above.
630 			 */
631 			mutex_exit(&zone->zone_lock);
632 			continue;
633 		}
634 		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
635 		t->zsd_key = key;
636 		t->zsd_create = create;
637 		t->zsd_shutdown = shutdown;
638 		t->zsd_destroy = destroy;
639 		if (create != NULL) {
640 			t->zsd_flags = ZSD_CREATE_NEEDED;
641 			DTRACE_PROBE2(zsd__create__needed,
642 			    zone_t *, zone, zone_key_t, key);
643 		}
644 		list_insert_tail(&zone->zone_zsd, t);
645 		mutex_exit(&zone->zone_lock);
646 	}
647 	mutex_exit(&zonehash_lock);
648 
649 	if (create != NULL) {
650 		/* Now call the create callback for this key */
651 		zsd_apply_all_zones(zsd_apply_create, key);
652 	}
653 	/*
654 	 * It is safe for consumers to use the key now, make it
655 	 * globally visible. Specifically zone_getspecific() will
656 	 * always successfully return the zone specific data associated
657 	 * with the key.
658 	 */
659 	*keyp = key;
660 
661 }
662 
663 /*
664  * Function called when a module is being unloaded, or otherwise wishes
665  * to unregister its ZSD key and callbacks.
666  *
667  * Remove from the global list and determine the functions that need to
668  * be called under a global lock. Then call the functions without
669  * holding any locks. Finally free up the zone_zsd entries. (The apply
670  * functions need to access the zone_zsd entries to find zsd_data etc.)
671  */
672 int
673 zone_key_delete(zone_key_t key)
674 {
675 	struct zsd_entry *zsdp = NULL;
676 	zone_t *zone;
677 
678 	mutex_enter(&zsd_key_lock);
679 	zsdp = zsd_find_mru(&zsd_registered_keys, key);
680 	if (zsdp == NULL) {
681 		mutex_exit(&zsd_key_lock);
682 		return (-1);
683 	}
684 	list_remove(&zsd_registered_keys, zsdp);
685 	mutex_exit(&zsd_key_lock);
686 
687 	mutex_enter(&zonehash_lock);
688 	for (zone = list_head(&zone_active); zone != NULL;
689 	    zone = list_next(&zone_active, zone)) {
690 		struct zsd_entry *del;
691 
692 		mutex_enter(&zone->zone_lock);
693 		del = zsd_find_mru(&zone->zone_zsd, key);
694 		if (del == NULL) {
695 			/*
696 			 * Somebody else got here first e.g the zone going
697 			 * away.
698 			 */
699 			mutex_exit(&zone->zone_lock);
700 			continue;
701 		}
702 		ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
703 		ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
704 		if (del->zsd_shutdown != NULL &&
705 		    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
706 			del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
707 			DTRACE_PROBE2(zsd__shutdown__needed,
708 			    zone_t *, zone, zone_key_t, key);
709 		}
710 		if (del->zsd_destroy != NULL &&
711 		    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
712 			del->zsd_flags |= ZSD_DESTROY_NEEDED;
713 			DTRACE_PROBE2(zsd__destroy__needed,
714 			    zone_t *, zone, zone_key_t, key);
715 		}
716 		mutex_exit(&zone->zone_lock);
717 	}
718 	mutex_exit(&zonehash_lock);
719 	kmem_free(zsdp, sizeof (*zsdp));
720 
721 	/* Now call the shutdown and destroy callback for this key */
722 	zsd_apply_all_zones(zsd_apply_shutdown, key);
723 	zsd_apply_all_zones(zsd_apply_destroy, key);
724 
725 	/* Now we can free up the zsdp structures in each zone */
726 	mutex_enter(&zonehash_lock);
727 	for (zone = list_head(&zone_active); zone != NULL;
728 	    zone = list_next(&zone_active, zone)) {
729 		struct zsd_entry *del;
730 
731 		mutex_enter(&zone->zone_lock);
732 		del = zsd_find(&zone->zone_zsd, key);
733 		if (del != NULL) {
734 			list_remove(&zone->zone_zsd, del);
735 			ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
736 			kmem_free(del, sizeof (*del));
737 		}
738 		mutex_exit(&zone->zone_lock);
739 	}
740 	mutex_exit(&zonehash_lock);
741 
742 	return (0);
743 }
744 
745 /*
746  * ZSD counterpart of pthread_setspecific().
747  *
748  * Since all zsd callbacks, including those with no create function,
749  * have an entry in zone_zsd, if the key is registered it is part of
750  * the zone_zsd list.
751  * Return an error if the key wasn't registerd.
752  */
753 int
754 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
755 {
756 	struct zsd_entry *t;
757 
758 	mutex_enter(&zone->zone_lock);
759 	t = zsd_find_mru(&zone->zone_zsd, key);
760 	if (t != NULL) {
761 		/*
762 		 * Replace old value with new
763 		 */
764 		t->zsd_data = (void *)data;
765 		mutex_exit(&zone->zone_lock);
766 		return (0);
767 	}
768 	mutex_exit(&zone->zone_lock);
769 	return (-1);
770 }
771 
772 /*
773  * ZSD counterpart of pthread_getspecific().
774  */
775 void *
776 zone_getspecific(zone_key_t key, zone_t *zone)
777 {
778 	struct zsd_entry *t;
779 	void *data;
780 
781 	mutex_enter(&zone->zone_lock);
782 	t = zsd_find_mru(&zone->zone_zsd, key);
783 	data = (t == NULL ? NULL : t->zsd_data);
784 	mutex_exit(&zone->zone_lock);
785 	return (data);
786 }
787 
788 /*
789  * Function used to initialize a zone's list of ZSD callbacks and data
790  * when the zone is being created.  The callbacks are initialized from
791  * the template list (zsd_registered_keys). The constructor callback is
792  * executed later (once the zone exists and with locks dropped).
793  */
794 static void
795 zone_zsd_configure(zone_t *zone)
796 {
797 	struct zsd_entry *zsdp;
798 	struct zsd_entry *t;
799 
800 	ASSERT(MUTEX_HELD(&zonehash_lock));
801 	ASSERT(list_head(&zone->zone_zsd) == NULL);
802 	mutex_enter(&zone->zone_lock);
803 	mutex_enter(&zsd_key_lock);
804 	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
805 	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
806 		/*
807 		 * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
808 		 * should not have added anything to it.
809 		 */
810 		ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
811 
812 		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
813 		t->zsd_key = zsdp->zsd_key;
814 		t->zsd_create = zsdp->zsd_create;
815 		t->zsd_shutdown = zsdp->zsd_shutdown;
816 		t->zsd_destroy = zsdp->zsd_destroy;
817 		if (zsdp->zsd_create != NULL) {
818 			t->zsd_flags = ZSD_CREATE_NEEDED;
819 			DTRACE_PROBE2(zsd__create__needed,
820 			    zone_t *, zone, zone_key_t, zsdp->zsd_key);
821 		}
822 		list_insert_tail(&zone->zone_zsd, t);
823 	}
824 	mutex_exit(&zsd_key_lock);
825 	mutex_exit(&zone->zone_lock);
826 }
827 
/* Kinds of ZSD callback. */
enum zsd_callback_type {
	ZSD_CREATE,
	ZSD_SHUTDOWN,
	ZSD_DESTROY
};
829 
830 /*
831  * Helper function to execute shutdown or destructor callbacks.
832  */
833 static void
834 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
835 {
836 	struct zsd_entry *t;
837 
838 	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
839 	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
840 	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
841 
842 	/*
843 	 * Run the callback solely based on what is registered for the zone
844 	 * in zone_zsd. The global list can change independently of this
845 	 * as keys are registered and unregistered and we don't register new
846 	 * callbacks for a zone that is in the process of going away.
847 	 */
848 	mutex_enter(&zone->zone_lock);
849 	for (t = list_head(&zone->zone_zsd); t != NULL;
850 	    t = list_next(&zone->zone_zsd, t)) {
851 		zone_key_t key = t->zsd_key;
852 
853 		/* Skip if no callbacks registered */
854 
855 		if (ct == ZSD_SHUTDOWN) {
856 			if (t->zsd_shutdown != NULL &&
857 			    (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
858 				t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
859 				DTRACE_PROBE2(zsd__shutdown__needed,
860 				    zone_t *, zone, zone_key_t, key);
861 			}
862 		} else {
863 			if (t->zsd_destroy != NULL &&
864 			    (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
865 				t->zsd_flags |= ZSD_DESTROY_NEEDED;
866 				DTRACE_PROBE2(zsd__destroy__needed,
867 				    zone_t *, zone, zone_key_t, key);
868 			}
869 		}
870 	}
871 	mutex_exit(&zone->zone_lock);
872 
873 	/* Now call the shutdown and destroy callback for this key */
874 	zsd_apply_all_keys(zsd_apply_shutdown, zone);
875 	zsd_apply_all_keys(zsd_apply_destroy, zone);
876 
877 }
878 
879 /*
880  * Called when the zone is going away; free ZSD-related memory, and
881  * destroy the zone_zsd list.
882  */
883 static void
884 zone_free_zsd(zone_t *zone)
885 {
886 	struct zsd_entry *t, *next;
887 
888 	/*
889 	 * Free all the zsd_entry's we had on this zone.
890 	 */
891 	mutex_enter(&zone->zone_lock);
892 	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
893 		next = list_next(&zone->zone_zsd, t);
894 		list_remove(&zone->zone_zsd, t);
895 		ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
896 		kmem_free(t, sizeof (*t));
897 	}
898 	list_destroy(&zone->zone_zsd);
899 	mutex_exit(&zone->zone_lock);
900 
901 }
902 
903 /*
904  * Apply a function to all zones for particular key value.
905  *
906  * The applyfn has to drop zonehash_lock if it does some work, and
907  * then reacquire it before it returns.
908  * When the lock is dropped we don't follow list_next even
909  * if it is possible to do so without any hazards. This is
910  * because we want the design to allow for the list of zones
911  * to change in any arbitrary way during the time the
912  * lock was dropped.
913  *
914  * It is safe to restart the loop at list_head since the applyfn
915  * changes the zsd_flags as it does work, so a subsequent
916  * pass through will have no effect in applyfn, hence the loop will terminate
917  * in at worst O(N^2).
918  */
919 static void
920 zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
921 {
922 	zone_t *zone;
923 
924 	mutex_enter(&zonehash_lock);
925 	zone = list_head(&zone_active);
926 	while (zone != NULL) {
927 		if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
928 			/* Lock dropped - restart at head */
929 			zone = list_head(&zone_active);
930 		} else {
931 			zone = list_next(&zone_active, zone);
932 		}
933 	}
934 	mutex_exit(&zonehash_lock);
935 }
936 
937 /*
938  * Apply a function to all keys for a particular zone.
939  *
940  * The applyfn has to drop zonehash_lock if it does some work, and
941  * then reacquire it before it returns.
942  * When the lock is dropped we don't follow list_next even
943  * if it is possible to do so without any hazards. This is
944  * because we want the design to allow for the list of zsd callbacks
945  * to change in any arbitrary way during the time the
946  * lock was dropped.
947  *
948  * It is safe to restart the loop at list_head since the applyfn
949  * changes the zsd_flags as it does work, so a subsequent
950  * pass through will have no effect in applyfn, hence the loop will terminate
951  * in at worst O(N^2).
952  */
953 static void
954 zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
955 {
956 	struct zsd_entry *t;
957 
958 	mutex_enter(&zone->zone_lock);
959 	t = list_head(&zone->zone_zsd);
960 	while (t != NULL) {
961 		if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
962 			/* Lock dropped - restart at head */
963 			t = list_head(&zone->zone_zsd);
964 		} else {
965 			t = list_next(&zone->zone_zsd, t);
966 		}
967 	}
968 	mutex_exit(&zone->zone_lock);
969 }
970 
971 /*
972  * Call the create function for the zone and key if CREATE_NEEDED
973  * is set.
974  * If some other thread gets here first and sets CREATE_INPROGRESS, then
975  * we wait for that thread to complete so that we can ensure that
976  * all the callbacks are done when we've looped over all zones/keys.
977  *
978  * When we call the create function, we drop the global held by the
979  * caller, and return true to tell the caller it needs to re-evalute the
980  * state.
981  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
982  * remains held on exit.
983  */
984 static boolean_t
985 zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
986     zone_t *zone, zone_key_t key)
987 {
988 	void *result;
989 	struct zsd_entry *t;
990 	boolean_t dropped;
991 
992 	if (lockp != NULL) {
993 		ASSERT(MUTEX_HELD(lockp));
994 	}
995 	if (zone_lock_held) {
996 		ASSERT(MUTEX_HELD(&zone->zone_lock));
997 	} else {
998 		mutex_enter(&zone->zone_lock);
999 	}
1000 
1001 	t = zsd_find(&zone->zone_zsd, key);
1002 	if (t == NULL) {
1003 		/*
1004 		 * Somebody else got here first e.g the zone going
1005 		 * away.
1006 		 */
1007 		if (!zone_lock_held)
1008 			mutex_exit(&zone->zone_lock);
1009 		return (B_FALSE);
1010 	}
1011 	dropped = B_FALSE;
1012 	if (zsd_wait_for_inprogress(zone, t, lockp))
1013 		dropped = B_TRUE;
1014 
1015 	if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1016 		t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1017 		t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1018 		DTRACE_PROBE2(zsd__create__inprogress,
1019 		    zone_t *, zone, zone_key_t, key);
1020 		mutex_exit(&zone->zone_lock);
1021 		if (lockp != NULL)
1022 			mutex_exit(lockp);
1023 
1024 		dropped = B_TRUE;
1025 		ASSERT(t->zsd_create != NULL);
1026 		DTRACE_PROBE2(zsd__create__start,
1027 		    zone_t *, zone, zone_key_t, key);
1028 
1029 		result = (*t->zsd_create)(zone->zone_id);
1030 
1031 		DTRACE_PROBE2(zsd__create__end,
1032 		    zone_t *, zone, voidn *, result);
1033 
1034 		ASSERT(result != NULL);
1035 		if (lockp != NULL)
1036 			mutex_enter(lockp);
1037 		mutex_enter(&zone->zone_lock);
1038 		t->zsd_data = result;
1039 		t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1040 		t->zsd_flags |= ZSD_CREATE_COMPLETED;
1041 		cv_broadcast(&t->zsd_cv);
1042 		DTRACE_PROBE2(zsd__create__completed,
1043 		    zone_t *, zone, zone_key_t, key);
1044 	}
1045 	if (!zone_lock_held)
1046 		mutex_exit(&zone->zone_lock);
1047 	return (dropped);
1048 }
1049 
1050 /*
1051  * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
1052  * is set.
1053  * If some other thread gets here first and sets *_INPROGRESS, then
1054  * we wait for that thread to complete so that we can ensure that
1055  * all the callbacks are done when we've looped over all zones/keys.
1056  *
1057  * When we call the shutdown function, we drop the global held by the
1058  * caller, and return true to tell the caller it needs to re-evalute the
1059  * state.
1060  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1061  * remains held on exit.
1062  */
1063 static boolean_t
1064 zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
1065     zone_t *zone, zone_key_t key)
1066 {
1067 	struct zsd_entry *t;
1068 	void *data;
1069 	boolean_t dropped;
1070 
1071 	if (lockp != NULL) {
1072 		ASSERT(MUTEX_HELD(lockp));
1073 	}
1074 	if (zone_lock_held) {
1075 		ASSERT(MUTEX_HELD(&zone->zone_lock));
1076 	} else {
1077 		mutex_enter(&zone->zone_lock);
1078 	}
1079 
1080 	t = zsd_find(&zone->zone_zsd, key);
1081 	if (t == NULL) {
1082 		/*
1083 		 * Somebody else got here first e.g the zone going
1084 		 * away.
1085 		 */
1086 		if (!zone_lock_held)
1087 			mutex_exit(&zone->zone_lock);
1088 		return (B_FALSE);
1089 	}
1090 	dropped = B_FALSE;
1091 	if (zsd_wait_for_creator(zone, t, lockp))
1092 		dropped = B_TRUE;
1093 
1094 	if (zsd_wait_for_inprogress(zone, t, lockp))
1095 		dropped = B_TRUE;
1096 
1097 	if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
1098 		t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
1099 		t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
1100 		DTRACE_PROBE2(zsd__shutdown__inprogress,
1101 		    zone_t *, zone, zone_key_t, key);
1102 		mutex_exit(&zone->zone_lock);
1103 		if (lockp != NULL)
1104 			mutex_exit(lockp);
1105 		dropped = B_TRUE;
1106 
1107 		ASSERT(t->zsd_shutdown != NULL);
1108 		data = t->zsd_data;
1109 
1110 		DTRACE_PROBE2(zsd__shutdown__start,
1111 		    zone_t *, zone, zone_key_t, key);
1112 
1113 		(t->zsd_shutdown)(zone->zone_id, data);
1114 		DTRACE_PROBE2(zsd__shutdown__end,
1115 		    zone_t *, zone, zone_key_t, key);
1116 
1117 		if (lockp != NULL)
1118 			mutex_enter(lockp);
1119 		mutex_enter(&zone->zone_lock);
1120 		t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
1121 		t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
1122 		cv_broadcast(&t->zsd_cv);
1123 		DTRACE_PROBE2(zsd__shutdown__completed,
1124 		    zone_t *, zone, zone_key_t, key);
1125 	}
1126 	if (!zone_lock_held)
1127 		mutex_exit(&zone->zone_lock);
1128 	return (dropped);
1129 }
1130 
1131 /*
1132  * Call the destroy function for the zone and key if DESTROY_NEEDED
1133  * is set.
1134  * If some other thread gets here first and sets *_INPROGRESS, then
1135  * we wait for that thread to complete so that we can ensure that
1136  * all the callbacks are done when we've looped over all zones/keys.
1137  *
1138  * When we call the destroy function, we drop the global held by the
1139  * caller, and return true to tell the caller it needs to re-evalute the
1140  * state.
1141  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
1142  * remains held on exit.
1143  */
1144 static boolean_t
1145 zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
1146     zone_t *zone, zone_key_t key)
1147 {
1148 	struct zsd_entry *t;
1149 	void *data;
1150 	boolean_t dropped;
1151 
1152 	if (lockp != NULL) {
1153 		ASSERT(MUTEX_HELD(lockp));
1154 	}
1155 	if (zone_lock_held) {
1156 		ASSERT(MUTEX_HELD(&zone->zone_lock));
1157 	} else {
1158 		mutex_enter(&zone->zone_lock);
1159 	}
1160 
1161 	t = zsd_find(&zone->zone_zsd, key);
1162 	if (t == NULL) {
1163 		/*
1164 		 * Somebody else got here first e.g the zone going
1165 		 * away.
1166 		 */
1167 		if (!zone_lock_held)
1168 			mutex_exit(&zone->zone_lock);
1169 		return (B_FALSE);
1170 	}
1171 	dropped = B_FALSE;
1172 	if (zsd_wait_for_creator(zone, t, lockp))
1173 		dropped = B_TRUE;
1174 
1175 	if (zsd_wait_for_inprogress(zone, t, lockp))
1176 		dropped = B_TRUE;
1177 
1178 	if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
1179 		t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
1180 		t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
1181 		DTRACE_PROBE2(zsd__destroy__inprogress,
1182 		    zone_t *, zone, zone_key_t, key);
1183 		mutex_exit(&zone->zone_lock);
1184 		if (lockp != NULL)
1185 			mutex_exit(lockp);
1186 		dropped = B_TRUE;
1187 
1188 		ASSERT(t->zsd_destroy != NULL);
1189 		data = t->zsd_data;
1190 		DTRACE_PROBE2(zsd__destroy__start,
1191 		    zone_t *, zone, zone_key_t, key);
1192 
1193 		(t->zsd_destroy)(zone->zone_id, data);
1194 		DTRACE_PROBE2(zsd__destroy__end,
1195 		    zone_t *, zone, zone_key_t, key);
1196 
1197 		if (lockp != NULL)
1198 			mutex_enter(lockp);
1199 		mutex_enter(&zone->zone_lock);
1200 		t->zsd_data = NULL;
1201 		t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
1202 		t->zsd_flags |= ZSD_DESTROY_COMPLETED;
1203 		cv_broadcast(&t->zsd_cv);
1204 		DTRACE_PROBE2(zsd__destroy__completed,
1205 		    zone_t *, zone, zone_key_t, key);
1206 	}
1207 	if (!zone_lock_held)
1208 		mutex_exit(&zone->zone_lock);
1209 	return (dropped);
1210 }
1211 
1212 /*
1213  * Wait for any CREATE_NEEDED flag to be cleared.
1214  * Returns true if lockp was temporarily dropped while waiting.
1215  */
1216 static boolean_t
1217 zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1218 {
1219 	boolean_t dropped = B_FALSE;
1220 
1221 	while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1222 		DTRACE_PROBE2(zsd__wait__for__creator,
1223 		    zone_t *, zone, struct zsd_entry *, t);
1224 		if (lockp != NULL) {
1225 			dropped = B_TRUE;
1226 			mutex_exit(lockp);
1227 		}
1228 		cv_wait(&t->zsd_cv, &zone->zone_lock);
1229 		if (lockp != NULL) {
1230 			/* First drop zone_lock to preserve order */
1231 			mutex_exit(&zone->zone_lock);
1232 			mutex_enter(lockp);
1233 			mutex_enter(&zone->zone_lock);
1234 		}
1235 	}
1236 	return (dropped);
1237 }
1238 
1239 /*
1240  * Wait for any INPROGRESS flag to be cleared.
1241  * Returns true if lockp was temporarily dropped while waiting.
1242  */
1243 static boolean_t
1244 zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1245 {
1246 	boolean_t dropped = B_FALSE;
1247 
1248 	while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1249 		DTRACE_PROBE2(zsd__wait__for__inprogress,
1250 		    zone_t *, zone, struct zsd_entry *, t);
1251 		if (lockp != NULL) {
1252 			dropped = B_TRUE;
1253 			mutex_exit(lockp);
1254 		}
1255 		cv_wait(&t->zsd_cv, &zone->zone_lock);
1256 		if (lockp != NULL) {
1257 			/* First drop zone_lock to preserve order */
1258 			mutex_exit(&zone->zone_lock);
1259 			mutex_enter(lockp);
1260 			mutex_enter(&zone->zone_lock);
1261 		}
1262 	}
1263 	return (dropped);
1264 }
1265 
1266 /*
1267  * Frees memory associated with the zone dataset list.
1268  */
1269 static void
1270 zone_free_datasets(zone_t *zone)
1271 {
1272 	zone_dataset_t *t, *next;
1273 
1274 	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1275 		next = list_next(&zone->zone_datasets, t);
1276 		list_remove(&zone->zone_datasets, t);
1277 		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1278 		kmem_free(t, sizeof (*t));
1279 	}
1280 	list_destroy(&zone->zone_datasets);
1281 }
1282 
1283 /*
1284  * zone.cpu-shares resource control support.
1285  */
1286 /*ARGSUSED*/
1287 static rctl_qty_t
1288 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1289 {
1290 	ASSERT(MUTEX_HELD(&p->p_lock));
1291 	return (p->p_zone->zone_shares);
1292 }
1293 
1294 /*ARGSUSED*/
1295 static int
1296 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1297     rctl_qty_t nv)
1298 {
1299 	ASSERT(MUTEX_HELD(&p->p_lock));
1300 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1301 	if (e->rcep_p.zone == NULL)
1302 		return (0);
1303 
1304 	e->rcep_p.zone->zone_shares = nv;
1305 	return (0);
1306 }
1307 
/*
 * Operations vector for the zone.cpu-shares resource control.  Judging by
 * the rcop_no_* placeholder names, the slots are (in order) action, usage,
 * set and test; only usage and set are provided here.
 */
static rctl_ops_t zone_cpu_shares_ops = {
	rcop_no_action,
	zone_cpu_shares_usage,
	zone_cpu_shares_set,
	rcop_no_test
};
1314 
1315 /*
1316  * zone.cpu-cap resource control support.
1317  */
1318 /*ARGSUSED*/
1319 static rctl_qty_t
1320 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1321 {
1322 	ASSERT(MUTEX_HELD(&p->p_lock));
1323 	return (cpucaps_zone_get(p->p_zone));
1324 }
1325 
1326 /*ARGSUSED*/
1327 static int
1328 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1329     rctl_qty_t nv)
1330 {
1331 	zone_t *zone = e->rcep_p.zone;
1332 
1333 	ASSERT(MUTEX_HELD(&p->p_lock));
1334 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1335 
1336 	if (zone == NULL)
1337 		return (0);
1338 
1339 	/*
1340 	 * set cap to the new value.
1341 	 */
1342 	return (cpucaps_zone_set(zone, nv));
1343 }
1344 
/*
 * Operations vector for the zone.cpu-cap resource control.  Judging by
 * the rcop_no_* placeholder names, the slots are (in order) action, usage,
 * set and test; only usage and set are provided here.
 */
static rctl_ops_t zone_cpu_cap_ops = {
	rcop_no_action,
	zone_cpu_cap_get,
	zone_cpu_cap_set,
	rcop_no_test
};
1351 
1352 /*ARGSUSED*/
1353 static rctl_qty_t
1354 zone_lwps_usage(rctl_t *r, proc_t *p)
1355 {
1356 	rctl_qty_t nlwps;
1357 	zone_t *zone = p->p_zone;
1358 
1359 	ASSERT(MUTEX_HELD(&p->p_lock));
1360 
1361 	mutex_enter(&zone->zone_nlwps_lock);
1362 	nlwps = zone->zone_nlwps;
1363 	mutex_exit(&zone->zone_nlwps_lock);
1364 
1365 	return (nlwps);
1366 }
1367 
1368 /*ARGSUSED*/
1369 static int
1370 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1371     rctl_qty_t incr, uint_t flags)
1372 {
1373 	rctl_qty_t nlwps;
1374 
1375 	ASSERT(MUTEX_HELD(&p->p_lock));
1376 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1377 	if (e->rcep_p.zone == NULL)
1378 		return (0);
1379 	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1380 	nlwps = e->rcep_p.zone->zone_nlwps;
1381 
1382 	if (nlwps + incr > rcntl->rcv_value)
1383 		return (1);
1384 
1385 	return (0);
1386 }
1387 
1388 /*ARGSUSED*/
1389 static int
1390 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1391 {
1392 	ASSERT(MUTEX_HELD(&p->p_lock));
1393 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1394 	if (e->rcep_p.zone == NULL)
1395 		return (0);
1396 	e->rcep_p.zone->zone_nlwps_ctl = nv;
1397 	return (0);
1398 }
1399 
/*
 * Operations vector built from the zone_lwps_* handlers above.  Judging
 * by the rcop_no_* placeholder names, the slots are (in order) action,
 * usage, set and test; usage, set and test are all provided.
 */
static rctl_ops_t zone_lwps_ops = {
	rcop_no_action,
	zone_lwps_usage,
	zone_lwps_set,
	zone_lwps_test,
};
1406 
1407 /*ARGSUSED*/
1408 static rctl_qty_t
1409 zone_procs_usage(rctl_t *r, proc_t *p)
1410 {
1411 	rctl_qty_t nprocs;
1412 	zone_t *zone = p->p_zone;
1413 
1414 	ASSERT(MUTEX_HELD(&p->p_lock));
1415 
1416 	mutex_enter(&zone->zone_nlwps_lock);
1417 	nprocs = zone->zone_nprocs;
1418 	mutex_exit(&zone->zone_nlwps_lock);
1419 
1420 	return (nprocs);
1421 }
1422 
1423 /*ARGSUSED*/
1424 static int
1425 zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1426     rctl_qty_t incr, uint_t flags)
1427 {
1428 	rctl_qty_t nprocs;
1429 
1430 	ASSERT(MUTEX_HELD(&p->p_lock));
1431 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1432 	if (e->rcep_p.zone == NULL)
1433 		return (0);
1434 	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1435 	nprocs = e->rcep_p.zone->zone_nprocs;
1436 
1437 	if (nprocs + incr > rcntl->rcv_value)
1438 		return (1);
1439 
1440 	return (0);
1441 }
1442 
1443 /*ARGSUSED*/
1444 static int
1445 zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1446 {
1447 	ASSERT(MUTEX_HELD(&p->p_lock));
1448 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1449 	if (e->rcep_p.zone == NULL)
1450 		return (0);
1451 	e->rcep_p.zone->zone_nprocs_ctl = nv;
1452 	return (0);
1453 }
1454 
/*
 * Operations vector built from the zone_procs_* handlers above.  Judging
 * by the rcop_no_* placeholder names, the slots are (in order) action,
 * usage, set and test; usage, set and test are all provided.
 */
static rctl_ops_t zone_procs_ops = {
	rcop_no_action,
	zone_procs_usage,
	zone_procs_set,
	zone_procs_test,
};
1461 
1462 /*ARGSUSED*/
1463 static int
1464 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1465     rctl_qty_t incr, uint_t flags)
1466 {
1467 	rctl_qty_t v;
1468 	ASSERT(MUTEX_HELD(&p->p_lock));
1469 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1470 	v = e->rcep_p.zone->zone_shmmax + incr;
1471 	if (v > rval->rcv_value)
1472 		return (1);
1473 	return (0);
1474 }
1475 
/*
 * Operations vector that supplies only a test handler
 * (zone_shmmax_test); the other three slots use the rcop_no_*
 * placeholders.
 */
static rctl_ops_t zone_shmmax_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_shmmax_test
};
1482 
1483 /*ARGSUSED*/
1484 static int
1485 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1486     rctl_qty_t incr, uint_t flags)
1487 {
1488 	rctl_qty_t v;
1489 	ASSERT(MUTEX_HELD(&p->p_lock));
1490 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1491 	v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1492 	if (v > rval->rcv_value)
1493 		return (1);
1494 	return (0);
1495 }
1496 
/*
 * Operations vector that supplies only a test handler
 * (zone_shmmni_test); the other three slots use the rcop_no_*
 * placeholders.
 */
static rctl_ops_t zone_shmmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_shmmni_test
};
1503 
1504 /*ARGSUSED*/
1505 static int
1506 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1507     rctl_qty_t incr, uint_t flags)
1508 {
1509 	rctl_qty_t v;
1510 	ASSERT(MUTEX_HELD(&p->p_lock));
1511 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1512 	v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1513 	if (v > rval->rcv_value)
1514 		return (1);
1515 	return (0);
1516 }
1517 
/*
 * Operations vector that supplies only a test handler
 * (zone_semmni_test); the other three slots use the rcop_no_*
 * placeholders.
 */
static rctl_ops_t zone_semmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_semmni_test
};
1524 
1525 /*ARGSUSED*/
1526 static int
1527 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1528     rctl_qty_t incr, uint_t flags)
1529 {
1530 	rctl_qty_t v;
1531 	ASSERT(MUTEX_HELD(&p->p_lock));
1532 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1533 	v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1534 	if (v > rval->rcv_value)
1535 		return (1);
1536 	return (0);
1537 }
1538 
/*
 * Operations vector that supplies only a test handler
 * (zone_msgmni_test); the other three slots use the rcop_no_*
 * placeholders.
 */
static rctl_ops_t zone_msgmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_msgmni_test
};
1545 
1546 /*ARGSUSED*/
1547 static rctl_qty_t
1548 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1549 {
1550 	rctl_qty_t q;
1551 	ASSERT(MUTEX_HELD(&p->p_lock));
1552 	mutex_enter(&p->p_zone->zone_mem_lock);
1553 	q = p->p_zone->zone_locked_mem;
1554 	mutex_exit(&p->p_zone->zone_mem_lock);
1555 	return (q);
1556 }
1557 
1558 /*ARGSUSED*/
1559 static int
1560 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1561     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1562 {
1563 	rctl_qty_t q;
1564 	zone_t *z;
1565 
1566 	z = e->rcep_p.zone;
1567 	ASSERT(MUTEX_HELD(&p->p_lock));
1568 	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1569 	q = z->zone_locked_mem;
1570 	if (q + incr > rcntl->rcv_value)
1571 		return (1);
1572 	return (0);
1573 }
1574 
1575 /*ARGSUSED*/
1576 static int
1577 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1578     rctl_qty_t nv)
1579 {
1580 	ASSERT(MUTEX_HELD(&p->p_lock));
1581 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1582 	if (e->rcep_p.zone == NULL)
1583 		return (0);
1584 	e->rcep_p.zone->zone_locked_mem_ctl = nv;
1585 	return (0);
1586 }
1587 
/*
 * Operations vector built from the zone_locked_mem_* handlers above.
 * Judging by the rcop_no_* placeholder names, the slots are (in order)
 * action, usage, set and test; usage, set and test are all provided.
 */
static rctl_ops_t zone_locked_mem_ops = {
	rcop_no_action,
	zone_locked_mem_usage,
	zone_locked_mem_set,
	zone_locked_mem_test
};
1594 
1595 /*ARGSUSED*/
1596 static rctl_qty_t
1597 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1598 {
1599 	rctl_qty_t q;
1600 	zone_t *z = p->p_zone;
1601 
1602 	ASSERT(MUTEX_HELD(&p->p_lock));
1603 	mutex_enter(&z->zone_mem_lock);
1604 	q = z->zone_max_swap;
1605 	mutex_exit(&z->zone_mem_lock);
1606 	return (q);
1607 }
1608 
1609 /*ARGSUSED*/
1610 static int
1611 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1612     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1613 {
1614 	rctl_qty_t q;
1615 	zone_t *z;
1616 
1617 	z = e->rcep_p.zone;
1618 	ASSERT(MUTEX_HELD(&p->p_lock));
1619 	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1620 	q = z->zone_max_swap;
1621 	if (q + incr > rcntl->rcv_value)
1622 		return (1);
1623 	return (0);
1624 }
1625 
1626 /*ARGSUSED*/
1627 static int
1628 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1629     rctl_qty_t nv)
1630 {
1631 	ASSERT(MUTEX_HELD(&p->p_lock));
1632 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1633 	if (e->rcep_p.zone == NULL)
1634 		return (0);
1635 	e->rcep_p.zone->zone_max_swap_ctl = nv;
1636 	return (0);
1637 }
1638 
/*
 * Operations vector built from the zone_max_swap_* handlers above.
 * Judging by the rcop_no_* placeholder names, the slots are (in order)
 * action, usage, set and test; usage, set and test are all provided.
 */
static rctl_ops_t zone_max_swap_ops = {
	rcop_no_action,
	zone_max_swap_usage,
	zone_max_swap_set,
	zone_max_swap_test
};
1645 
1646 /*ARGSUSED*/
1647 static rctl_qty_t
1648 zone_max_lofi_usage(rctl_t *rctl, struct proc *p)
1649 {
1650 	rctl_qty_t q;
1651 	zone_t *z = p->p_zone;
1652 
1653 	ASSERT(MUTEX_HELD(&p->p_lock));
1654 	mutex_enter(&z->zone_rctl_lock);
1655 	q = z->zone_max_lofi;
1656 	mutex_exit(&z->zone_rctl_lock);
1657 	return (q);
1658 }
1659 
1660 /*ARGSUSED*/
1661 static int
1662 zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1663     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1664 {
1665 	rctl_qty_t q;
1666 	zone_t *z;
1667 
1668 	z = e->rcep_p.zone;
1669 	ASSERT(MUTEX_HELD(&p->p_lock));
1670 	ASSERT(MUTEX_HELD(&z->zone_rctl_lock));
1671 	q = z->zone_max_lofi;
1672 	if (q + incr > rcntl->rcv_value)
1673 		return (1);
1674 	return (0);
1675 }
1676 
1677 /*ARGSUSED*/
1678 static int
1679 zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1680     rctl_qty_t nv)
1681 {
1682 	ASSERT(MUTEX_HELD(&p->p_lock));
1683 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1684 	if (e->rcep_p.zone == NULL)
1685 		return (0);
1686 	e->rcep_p.zone->zone_max_lofi_ctl = nv;
1687 	return (0);
1688 }
1689 
/*
 * Operations vector built from the zone_max_lofi_* handlers above.
 * Judging by the rcop_no_* placeholder names, the slots are (in order)
 * action, usage, set and test; usage, set and test are all provided.
 */
static rctl_ops_t zone_max_lofi_ops = {
	rcop_no_action,
	zone_max_lofi_usage,
	zone_max_lofi_set,
	zone_max_lofi_test
};
1696 
1697 /*
1698  * Helper function to brand the zone with a unique ID.
1699  */
1700 static void
1701 zone_uniqid(zone_t *zone)
1702 {
1703 	static uint64_t uniqid = 0;
1704 
1705 	ASSERT(MUTEX_HELD(&zonehash_lock));
1706 	zone->zone_uniqid = uniqid++;
1707 }
1708 
1709 /*
1710  * Returns a held pointer to the "kcred" for the specified zone.
1711  */
1712 struct cred *
1713 zone_get_kcred(zoneid_t zoneid)
1714 {
1715 	zone_t *zone;
1716 	cred_t *cr;
1717 
1718 	if ((zone = zone_find_by_id(zoneid)) == NULL)
1719 		return (NULL);
1720 	cr = zone->zone_kcred;
1721 	crhold(cr);
1722 	zone_rele(zone);
1723 	return (cr);
1724 }
1725 
1726 static int
1727 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1728 {
1729 	zone_t *zone = ksp->ks_private;
1730 	zone_kstat_t *zk = ksp->ks_data;
1731 
1732 	if (rw == KSTAT_WRITE)
1733 		return (EACCES);
1734 
1735 	zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1736 	zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1737 	return (0);
1738 }
1739 
1740 static int
1741 zone_nprocs_kstat_update(kstat_t *ksp, int rw)
1742 {
1743 	zone_t *zone = ksp->ks_private;
1744 	zone_kstat_t *zk = ksp->ks_data;
1745 
1746 	if (rw == KSTAT_WRITE)
1747 		return (EACCES);
1748 
1749 	zk->zk_usage.value.ui64 = zone->zone_nprocs;
1750 	zk->zk_value.value.ui64 = zone->zone_nprocs_ctl;
1751 	return (0);
1752 }
1753 
1754 static int
1755 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1756 {
1757 	zone_t *zone = ksp->ks_private;
1758 	zone_kstat_t *zk = ksp->ks_data;
1759 
1760 	if (rw == KSTAT_WRITE)
1761 		return (EACCES);
1762 
1763 	zk->zk_usage.value.ui64 = zone->zone_max_swap;
1764 	zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1765 	return (0);
1766 }
1767 
1768 static kstat_t *
1769 zone_kstat_create_common(zone_t *zone, char *name,
1770     int (*updatefunc) (kstat_t *, int))
1771 {
1772 	kstat_t *ksp;
1773 	zone_kstat_t *zk;
1774 
1775 	ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED,
1776 	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1777 	    KSTAT_FLAG_VIRTUAL);
1778 
1779 	if (ksp == NULL)
1780 		return (NULL);
1781 
1782 	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1783 	ksp->ks_data_size += strlen(zone->zone_name) + 1;
1784 	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1785 	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1786 	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1787 	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1788 	ksp->ks_update = updatefunc;
1789 	ksp->ks_private = zone;
1790 	kstat_install(ksp);
1791 	return (ksp);
1792 }
1793 
1794 static void
1795 zone_kstat_create(zone_t *zone)
1796 {
1797 	zone->zone_lockedmem_kstat = zone_kstat_create_common(zone,
1798 	    "lockedmem", zone_lockedmem_kstat_update);
1799 	zone->zone_swapresv_kstat = zone_kstat_create_common(zone,
1800 	    "swapresv", zone_swapresv_kstat_update);
1801 	zone->zone_nprocs_kstat = zone_kstat_create_common(zone,
1802 	    "nprocs", zone_nprocs_kstat_update);
1803 }
1804 
1805 static void
1806 zone_kstat_delete_common(kstat_t **pkstat)
1807 {
1808 	void *data;
1809 
1810 	if (*pkstat != NULL) {
1811 		data = (*pkstat)->ks_data;
1812 		kstat_delete(*pkstat);
1813 		kmem_free(data, sizeof (zone_kstat_t));
1814 		*pkstat = NULL;
1815 	}
1816 }
1817 
1818 static void
1819 zone_kstat_delete(zone_t *zone)
1820 {
1821 	zone_kstat_delete_common(&zone->zone_lockedmem_kstat);
1822 	zone_kstat_delete_common(&zone->zone_swapresv_kstat);
1823 	zone_kstat_delete_common(&zone->zone_nprocs_kstat);
1824 }
1825 
1826 /*
1827  * Called very early on in boot to initialize the ZSD list so that
1828  * zone_key_create() can be called before zone_init().  It also initializes
1829  * portions of zone0 which may be used before zone_init() is called.  The
1830  * variable "global_zone" will be set when zone0 is fully initialized by
1831  * zone_init().
1832  */
1833 void
1834 zone_zsd_init(void)
1835 {
1836 	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
1837 	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
1838 	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
1839 	    offsetof(struct zsd_entry, zsd_linkage));
1840 	list_create(&zone_active, sizeof (zone_t),
1841 	    offsetof(zone_t, zone_linkage));
1842 	list_create(&zone_deathrow, sizeof (zone_t),
1843 	    offsetof(zone_t, zone_linkage));
1844 
1845 	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
1846 	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
1847 	mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
1848 	zone0.zone_shares = 1;
1849 	zone0.zone_nlwps = 0;
1850 	zone0.zone_nlwps_ctl = INT_MAX;
1851 	zone0.zone_nprocs = 0;
1852 	zone0.zone_nprocs_ctl = INT_MAX;
1853 	zone0.zone_locked_mem = 0;
1854 	zone0.zone_locked_mem_ctl = UINT64_MAX;
1855 	ASSERT(zone0.zone_max_swap == 0);
1856 	zone0.zone_max_swap_ctl = UINT64_MAX;
1857 	zone0.zone_max_lofi = 0;
1858 	zone0.zone_max_lofi_ctl = UINT64_MAX;
1859 	zone0.zone_shmmax = 0;
1860 	zone0.zone_ipc.ipcq_shmmni = 0;
1861 	zone0.zone_ipc.ipcq_semmni = 0;
1862 	zone0.zone_ipc.ipcq_msgmni = 0;
1863 	zone0.zone_name = GLOBAL_ZONENAME;
1864 	zone0.zone_nodename = utsname.nodename;
1865 	zone0.zone_domain = srpc_domain;
1866 	zone0.zone_hostid = HW_INVALID_HOSTID;
1867 	zone0.zone_fs_allowed = NULL;
1868 	zone0.zone_ref = 1;
1869 	zone0.zone_id = GLOBAL_ZONEID;
1870 	zone0.zone_status = ZONE_IS_RUNNING;
1871 	zone0.zone_rootpath = "/";
1872 	zone0.zone_rootpathlen = 2;
1873 	zone0.zone_psetid = ZONE_PS_INVAL;
1874 	zone0.zone_ncpus = 0;
1875 	zone0.zone_ncpus_online = 0;
1876 	zone0.zone_proc_initpid = 1;
1877 	zone0.zone_initname = initname;
1878 	zone0.zone_lockedmem_kstat = NULL;
1879 	zone0.zone_swapresv_kstat = NULL;
1880 	zone0.zone_nprocs_kstat = NULL;
1881 	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
1882 	    offsetof(struct zsd_entry, zsd_linkage));
1883 	list_insert_head(&zone_active, &zone0);
1884 
1885 	/*
1886 	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
1887 	 * to anything meaningful.  It is assigned to be 'rootdir' in
1888 	 * vfs_mountroot().
1889 	 */
1890 	zone0.zone_rootvp = NULL;
1891 	zone0.zone_vfslist = NULL;
1892 	zone0.zone_bootargs = initargs;
1893 	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
1894 	/*
1895 	 * The global zone has all privileges
1896 	 */
1897 	priv_fillset(zone0.zone_privset);
1898 	/*
1899 	 * Add p0 to the global zone
1900 	 */
1901 	zone0.zone_zsched = &p0;
1902 	p0.p_zone = &zone0;
1903 }
1904 
1905 /*
1906  * Compute a hash value based on the contents of the label and the DOI.  The
1907  * hash algorithm is somewhat arbitrary, but is based on the observation that
1908  * humans will likely pick labels that differ by amounts that work out to be
1909  * multiples of the number of hash chains, and thus stirring in some primes
1910  * should help.
1911  */
1912 static uint_t
1913 hash_bylabel(void *hdata, mod_hash_key_t key)
1914 {
1915 	const ts_label_t *lab = (ts_label_t *)key;
1916 	const uint32_t *up, *ue;
1917 	uint_t hash;
1918 	int i;
1919 
1920 	_NOTE(ARGUNUSED(hdata));
1921 
1922 	hash = lab->tsl_doi + (lab->tsl_doi << 1);
1923 	/* we depend on alignment of label, but not representation */
1924 	up = (const uint32_t *)&lab->tsl_label;
1925 	ue = up + sizeof (lab->tsl_label) / sizeof (*up);
1926 	i = 1;
1927 	while (up < ue) {
1928 		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
1929 		hash += *up + (*up << ((i % 16) + 1));
1930 		up++;
1931 		i++;
1932 	}
1933 	return (hash);
1934 }
1935 
1936 /*
1937  * All that mod_hash cares about here is zero (equal) versus non-zero (not
1938  * equal).  This may need to be changed if less than / greater than is ever
1939  * needed.
1940  */
1941 static int
1942 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1943 {
1944 	ts_label_t *lab1 = (ts_label_t *)key1;
1945 	ts_label_t *lab2 = (ts_label_t *)key2;
1946 
1947 	return (label_equal(lab1, lab2) ? 0 : 1);
1948 }
1949 
/*
 * Called by main() to initialize the zones framework.
 *
 * Must run in the context of p0 (asserted below).  In order, this routine:
 *   - creates the ID space from which non-global zone IDs are drawn;
 *   - registers the zone.* resource controls;
 *   - builds the global zone's rctl set and seeds its lwp/proc/task counts;
 *   - creates the global zone's kstats, label reference and locks;
 *   - creates the zone lookup hashes and inserts zone0 into each of them;
 *   - publishes zone0 through 'global_zone';
 *   - binds the sysevent channel used for zone status notifications.
 *
 * Panics if the event channel cannot be bound.
 */
void
zone_init(void)
{
	rctl_dict_entry_t *rde;
	rctl_val_t *dval;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;
	int res;

	ASSERT(curproc == &p0);

	/*
	 * Create ID space for zone IDs.  ID 0 is reserved for the
	 * global zone.
	 */
	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);

	/*
	 * Initialize generic zone resource controls, if any.
	 */
	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
	    FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);

	rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
	    RCTL_GLOBAL_INFINITE,
	    MAXCAP, MAXCAP, &zone_cpu_cap_ops);

	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_lwps_ops);

	rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_procs_ops);

	/*
	 * System V IPC resource controls
	 */
	rc_zone_msgmni = rctl_register("zone.max-msg-ids",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);

	rc_zone_semmni = rctl_register("zone.max-sem-ids",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);

	rc_zone_shmmni = rctl_register("zone.max-shm-ids",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);

	rc_zone_shmmax = rctl_register("zone.max-shm-memory",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);

	/*
	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
	 */
	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
	bzero(dval, sizeof (rctl_val_t));
	dval->rcv_value = 1;
	dval->rcv_privilege = RCPRIV_PRIVILEGED;
	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
	dval->rcv_action_recip_pid = -1;

	/* "zone.cpu-shares" was registered above, so it is looked up by name */
	rde = rctl_dict_lookup("zone.cpu-shares");
	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);

	rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
	    &zone_locked_mem_ops);

	rc_zone_max_swap = rctl_register("zone.max-swap",
	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
	    &zone_max_swap_ops);

	rc_zone_max_lofi = rctl_register("zone.max-lofi",
	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |
	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
	    &zone_max_lofi_ops);

	/*
	 * Initialize the ``global zone''.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	/* the rctl set and the initial counts are set up under p0's p_lock */
	mutex_enter(&p0.p_lock);
	e.rcep_p.zone = &zone0;
	e.rcep_t = RCENTITY_ZONE;
	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
	    gp);

	/* p0 is the only process so far: one process, one task, its lwps */
	zone0.zone_nlwps = p0.p_lwpcnt;
	zone0.zone_nprocs = 1;
	zone0.zone_ntasks = 1;
	mutex_exit(&p0.p_lock);
	zone0.zone_restart_init = B_TRUE;
	zone0.zone_brand = &native_brand;
	/* the preallocation group is no longer needed once the set exists */
	rctl_prealloc_destroy(gp);
	/*
	 * pool_default hasn't been initialized yet, so we let pool_init()
	 * take care of making sure the global zone is in the default pool.
	 */

	/*
	 * Initialize global zone kstats
	 */
	zone_kstat_create(&zone0);

	/*
	 * Initialize zone label.
	 * mlp are initialized when tnzonecfg is loaded.
	 */
	zone0.zone_slabel = l_admin_low;
	rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
	/* zone0 now references l_admin_low, so take a hold on it */
	label_hold(l_admin_low);

	/*
	 * Initialise the lock for the database structure used by mntfs.
	 */
	rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);

	/*
	 * Assign the global zone's unique ID and enter it into the lookup
	 * hashes, all under zonehash_lock.
	 */
	mutex_enter(&zonehash_lock);
	zone_uniqid(&zone0);
	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);

	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
	    mod_hash_null_valdtor);
	zonehashbyname = mod_hash_create_strhash("zone_by_name",
	    zone_hash_size, mod_hash_null_valdtor);
	/*
	 * maintain zonehashbylabel only for labeled systems
	 */
	if (is_system_labeled())
		zonehashbylabel = mod_hash_create_extended("zone_by_label",
		    zone_hash_size, mod_hash_null_keydtor,
		    mod_hash_null_valdtor, hash_bylabel, NULL,
		    hash_labelkey_cmp, KM_SLEEP);
	zonecount = 1;

	/* the insert results are deliberately ignored */
	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
	    (mod_hash_val_t)&zone0);
	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
	    (mod_hash_val_t)&zone0);
	if (is_system_labeled()) {
		/* record that zone0 is present in the label hash */
		zone0.zone_flags |= ZF_HASHED_LABEL;
		(void) mod_hash_insert(zonehashbylabel,
		    (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
	}
	mutex_exit(&zonehash_lock);

	/*
	 * We avoid setting zone_kcred until now, since kcred is initialized
	 * sometime after zone_zsd_init() and before zone_init().
	 */
	zone0.zone_kcred = kcred;
	/*
	 * The global zone is fully initialized (except for zone_rootvp which
	 * will be set when the root filesystem is mounted).
	 */
	global_zone = &zone0;

	/*
	 * Setup an event channel to send zone status change notifications on
	 */
	res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
	    EVCH_CREAT);

	/* zone_status_set() publishes on this channel, so failure is fatal */
	if (res)
		panic("Sysevent_evc_bind failed during zone setup.\n");

}
2132 
2133 static void
2134 zone_free(zone_t *zone)
2135 {
2136 	ASSERT(zone != global_zone);
2137 	ASSERT(zone->zone_ntasks == 0);
2138 	ASSERT(zone->zone_nlwps == 0);
2139 	ASSERT(zone->zone_nprocs == 0);
2140 	ASSERT(zone->zone_cred_ref == 0);
2141 	ASSERT(zone->zone_kcred == NULL);
2142 	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
2143 	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2144 
2145 	/*
2146 	 * Remove any zone caps.
2147 	 */
2148 	cpucaps_zone_remove(zone);
2149 
2150 	ASSERT(zone->zone_cpucap == NULL);
2151 
2152 	/* remove from deathrow list */
2153 	if (zone_status_get(zone) == ZONE_IS_DEAD) {
2154 		ASSERT(zone->zone_ref == 0);
2155 		mutex_enter(&zone_deathrow_lock);
2156 		list_remove(&zone_deathrow, zone);
2157 		mutex_exit(&zone_deathrow_lock);
2158 	}
2159 
2160 	zone_free_zsd(zone);
2161 	zone_free_datasets(zone);
2162 	list_destroy(&zone->zone_dl_list);
2163 
2164 	if (zone->zone_rootvp != NULL)
2165 		VN_RELE(zone->zone_rootvp);
2166 	if (zone->zone_rootpath)
2167 		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
2168 	if (zone->zone_name != NULL)
2169 		kmem_free(zone->zone_name, ZONENAME_MAX);
2170 	if (zone->zone_slabel != NULL)
2171 		label_rele(zone->zone_slabel);
2172 	if (zone->zone_nodename != NULL)
2173 		kmem_free(zone->zone_nodename, _SYS_NMLN);
2174 	if (zone->zone_domain != NULL)
2175 		kmem_free(zone->zone_domain, _SYS_NMLN);
2176 	if (zone->zone_privset != NULL)
2177 		kmem_free(zone->zone_privset, sizeof (priv_set_t));
2178 	if (zone->zone_rctls != NULL)
2179 		rctl_set_free(zone->zone_rctls);
2180 	if (zone->zone_bootargs != NULL)
2181 		strfree(zone->zone_bootargs);
2182 	if (zone->zone_initname != NULL)
2183 		strfree(zone->zone_initname);
2184 	if (zone->zone_fs_allowed != NULL)
2185 		strfree(zone->zone_fs_allowed);
2186 	if (zone->zone_pfexecd != NULL)
2187 		klpd_freelist(&zone->zone_pfexecd);
2188 	id_free(zoneid_space, zone->zone_id);
2189 	mutex_destroy(&zone->zone_lock);
2190 	cv_destroy(&zone->zone_cv);
2191 	rw_destroy(&zone->zone_mlps.mlpl_rwlock);
2192 	rw_destroy(&zone->zone_mntfs_db_lock);
2193 	kmem_free(zone, sizeof (zone_t));
2194 }
2195 
2196 /*
2197  * See block comment at the top of this file for information about zone
2198  * status values.
2199  */
2200 /*
2201  * Convenience function for setting zone status.
2202  */
2203 static void
2204 zone_status_set(zone_t *zone, zone_status_t status)
2205 {
2206 
2207 	nvlist_t *nvl = NULL;
2208 	ASSERT(MUTEX_HELD(&zone_status_lock));
2209 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2210 	    status >= zone_status_get(zone));
2211 
2212 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2213 	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2214 	    nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2215 	    zone_status_table[status]) ||
2216 	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2217 	    zone_status_table[zone->zone_status]) ||
2218 	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2219 	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2220 	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2221 	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2222 #ifdef DEBUG
2223 		(void) printf(
2224 		    "Failed to allocate and send zone state change event.\n");
2225 #endif
2226 	}
2227 	nvlist_free(nvl);
2228 
2229 	zone->zone_status = status;
2230 
2231 	cv_broadcast(&zone->zone_cv);
2232 }
2233 
2234 /*
2235  * Public function to retrieve the zone status.  The zone status may
2236  * change after it is retrieved.
2237  */
2238 zone_status_t
2239 zone_status_get(zone_t *zone)
2240 {
2241 	return (zone->zone_status);
2242 }
2243 
2244 static int
2245 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2246 {
2247 	char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2248 	int err = 0;
2249 
2250 	ASSERT(zone != global_zone);
2251 	if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0)
2252 		goto done;	/* EFAULT or ENAMETOOLONG */
2253 
2254 	if (zone->zone_bootargs != NULL)
2255 		strfree(zone->zone_bootargs);
2256 
2257 	zone->zone_bootargs = strdup(buf);
2258 
2259 done:
2260 	kmem_free(buf, BOOTARGS_MAX);
2261 	return (err);
2262 }
2263 
2264 static int
2265 zone_set_brand(zone_t *zone, const char *brand)
2266 {
2267 	struct brand_attr *attrp;
2268 	brand_t *bp;
2269 
2270 	attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2271 	if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2272 		kmem_free(attrp, sizeof (struct brand_attr));
2273 		return (EFAULT);
2274 	}
2275 
2276 	bp = brand_register_zone(attrp);
2277 	kmem_free(attrp, sizeof (struct brand_attr));
2278 	if (bp == NULL)
2279 		return (EINVAL);
2280 
2281 	/*
2282 	 * This is the only place where a zone can change it's brand.
2283 	 * We already need to hold zone_status_lock to check the zone
2284 	 * status, so we'll just use that lock to serialize zone
2285 	 * branding requests as well.
2286 	 */
2287 	mutex_enter(&zone_status_lock);
2288 
2289 	/* Re-Branding is not allowed and the zone can't be booted yet */
2290 	if ((ZONE_IS_BRANDED(zone)) ||
2291 	    (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2292 		mutex_exit(&zone_status_lock);
2293 		brand_unregister_zone(bp);
2294 		return (EINVAL);
2295 	}
2296 
2297 	/* set up the brand specific data */
2298 	zone->zone_brand = bp;
2299 	ZBROP(zone)->b_init_brand_data(zone);
2300 
2301 	mutex_exit(&zone_status_lock);
2302 	return (0);
2303 }
2304 
2305 static int
2306 zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed)
2307 {
2308 	char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP);
2309 	int err = 0;
2310 
2311 	ASSERT(zone != global_zone);
2312 	if ((err = copyinstr(zone_fs_allowed, buf,
2313 	    ZONE_FS_ALLOWED_MAX, NULL)) != 0)
2314 		goto done;
2315 
2316 	if (zone->zone_fs_allowed != NULL)
2317 		strfree(zone->zone_fs_allowed);
2318 
2319 	zone->zone_fs_allowed = strdup(buf);
2320 
2321 done:
2322 	kmem_free(buf, ZONE_FS_ALLOWED_MAX);
2323 	return (err);
2324 }
2325 
2326 static int
2327 zone_set_initname(zone_t *zone, const char *zone_initname)
2328 {
2329 	char initname[INITNAME_SZ];
2330 	size_t len;
2331 	int err = 0;
2332 
2333 	ASSERT(zone != global_zone);
2334 	if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2335 		return (err);	/* EFAULT or ENAMETOOLONG */
2336 
2337 	if (zone->zone_initname != NULL)
2338 		strfree(zone->zone_initname);
2339 
2340 	zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2341 	(void) strcpy(zone->zone_initname, initname);
2342 	return (0);
2343 }
2344 
2345 static int
2346 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2347 {
2348 	uint64_t mcap;
2349 	int err = 0;
2350 
2351 	if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2352 		zone->zone_phys_mcap = mcap;
2353 
2354 	return (err);
2355 }
2356 
2357 static int
2358 zone_set_sched_class(zone_t *zone, const char *new_class)
2359 {
2360 	char sched_class[PC_CLNMSZ];
2361 	id_t classid;
2362 	int err;
2363 
2364 	ASSERT(zone != global_zone);
2365 	if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2366 		return (err);	/* EFAULT or ENAMETOOLONG */
2367 
2368 	if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid))
2369 		return (set_errno(EINVAL));
2370 	zone->zone_defaultcid = classid;
2371 	ASSERT(zone->zone_defaultcid > 0 &&
2372 	    zone->zone_defaultcid < loaded_classes);
2373 
2374 	return (0);
2375 }
2376 
2377 /*
2378  * Block indefinitely waiting for (zone_status >= status)
2379  */
2380 void
2381 zone_status_wait(zone_t *zone, zone_status_t status)
2382 {
2383 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2384 
2385 	mutex_enter(&zone_status_lock);
2386 	while (zone->zone_status < status) {
2387 		cv_wait(&zone->zone_cv, &zone_status_lock);
2388 	}
2389 	mutex_exit(&zone_status_lock);
2390 }
2391 
2392 /*
2393  * Private CPR-safe version of zone_status_wait().
2394  */
2395 static void
2396 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
2397 {
2398 	callb_cpr_t cprinfo;
2399 
2400 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2401 
2402 	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
2403 	    str);
2404 	mutex_enter(&zone_status_lock);
2405 	while (zone->zone_status < status) {
2406 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2407 		cv_wait(&zone->zone_cv, &zone_status_lock);
2408 		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
2409 	}
2410 	/*
2411 	 * zone_status_lock is implicitly released by the following.
2412 	 */
2413 	CALLB_CPR_EXIT(&cprinfo);
2414 }
2415 
2416 /*
2417  * Block until zone enters requested state or signal is received.  Return (0)
2418  * if signaled, non-zero otherwise.
2419  */
2420 int
2421 zone_status_wait_sig(zone_t *zone, zone_status_t status)
2422 {
2423 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2424 
2425 	mutex_enter(&zone_status_lock);
2426 	while (zone->zone_status < status) {
2427 		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2428 			mutex_exit(&zone_status_lock);
2429 			return (0);
2430 		}
2431 	}
2432 	mutex_exit(&zone_status_lock);
2433 	return (1);
2434 }
2435 
2436 /*
2437  * Block until the zone enters the requested state or the timeout expires,
2438  * whichever happens first.  Return (-1) if operation timed out, time remaining
2439  * otherwise.
2440  */
2441 clock_t
2442 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2443 {
2444 	clock_t timeleft = 0;
2445 
2446 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2447 
2448 	mutex_enter(&zone_status_lock);
2449 	while (zone->zone_status < status && timeleft != -1) {
2450 		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2451 	}
2452 	mutex_exit(&zone_status_lock);
2453 	return (timeleft);
2454 }
2455 
2456 /*
2457  * Block until the zone enters the requested state, the current process is
2458  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2459  * operation timed out, 0 if signaled, time remaining otherwise.
2460  */
2461 clock_t
2462 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2463 {
2464 	clock_t timeleft = tim - ddi_get_lbolt();
2465 
2466 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2467 
2468 	mutex_enter(&zone_status_lock);
2469 	while (zone->zone_status < status) {
2470 		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2471 		    tim);
2472 		if (timeleft <= 0)
2473 			break;
2474 	}
2475 	mutex_exit(&zone_status_lock);
2476 	return (timeleft);
2477 }
2478 
/*
 * Zones have two reference counts: one for references from credential
 * structures (zone_cred_ref), and one (zone_ref) for everything else.
 * This is so we can allow a zone to be rebooted while there are still
 * outstanding cred references, since certain drivers cache dblks (which
 * implicitly results in cached creds).  We wait for zone_ref to drop to
 * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
 * later freed when the zone_cred_ref drops to 0, though nothing other
 * than the zone id and privilege set should be accessed once the zone
 * is "dead".
 *
 * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
 * to force halt/reboot to block waiting for the zone_cred_ref to drop
 * to 0.  This can be useful to flush out other sources of cached creds
 * that may be less innocuous than the driver case.  The flag is consulted
 * by ZONE_IS_UNREF() and by zone_cred_rele() when deciding whether to wake
 * zone_destroy().
 */

int zone_wait_for_cred = 0;
2497 
2498 static void
2499 zone_hold_locked(zone_t *z)
2500 {
2501 	ASSERT(MUTEX_HELD(&z->zone_lock));
2502 	z->zone_ref++;
2503 	ASSERT(z->zone_ref != 0);
2504 }
2505 
2506 void
2507 zone_hold(zone_t *z)
2508 {
2509 	mutex_enter(&z->zone_lock);
2510 	zone_hold_locked(z);
2511 	mutex_exit(&z->zone_lock);
2512 }
2513 
/*
 * A zone is ready to be destroyed once its non-cred reference count is
 * down to 1 and, when zone_wait_for_cred is set, its cred reference count
 * has also reached 0.
 */
#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
	    (zone_wait_for_cred == 0 || (zone)->zone_cred_ref == 0))
2521 
2522 void
2523 zone_rele(zone_t *z)
2524 {
2525 	boolean_t wakeup;
2526 
2527 	mutex_enter(&z->zone_lock);
2528 	ASSERT(z->zone_ref != 0);
2529 	z->zone_ref--;
2530 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2531 		/* no more refs, free the structure */
2532 		mutex_exit(&z->zone_lock);
2533 		zone_free(z);
2534 		return;
2535 	}
2536 	/* signal zone_destroy so the zone can finish halting */
2537 	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2538 	mutex_exit(&z->zone_lock);
2539 
2540 	if (wakeup) {
2541 		/*
2542 		 * Grabbing zonehash_lock here effectively synchronizes with
2543 		 * zone_destroy() to avoid missed signals.
2544 		 */
2545 		mutex_enter(&zonehash_lock);
2546 		cv_broadcast(&zone_destroy_cv);
2547 		mutex_exit(&zonehash_lock);
2548 	}
2549 }
2550 
2551 void
2552 zone_cred_hold(zone_t *z)
2553 {
2554 	mutex_enter(&z->zone_lock);
2555 	z->zone_cred_ref++;
2556 	ASSERT(z->zone_cred_ref != 0);
2557 	mutex_exit(&z->zone_lock);
2558 }
2559 
2560 void
2561 zone_cred_rele(zone_t *z)
2562 {
2563 	boolean_t wakeup;
2564 
2565 	mutex_enter(&z->zone_lock);
2566 	ASSERT(z->zone_cred_ref != 0);
2567 	z->zone_cred_ref--;
2568 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2569 		/* no more refs, free the structure */
2570 		mutex_exit(&z->zone_lock);
2571 		zone_free(z);
2572 		return;
2573 	}
2574 	/*
2575 	 * If zone_destroy is waiting for the cred references to drain
2576 	 * out, and they have, signal it.
2577 	 */
2578 	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2579 	    zone_status_get(z) >= ZONE_IS_DEAD);
2580 	mutex_exit(&z->zone_lock);
2581 
2582 	if (wakeup) {
2583 		/*
2584 		 * Grabbing zonehash_lock here effectively synchronizes with
2585 		 * zone_destroy() to avoid missed signals.
2586 		 */
2587 		mutex_enter(&zonehash_lock);
2588 		cv_broadcast(&zone_destroy_cv);
2589 		mutex_exit(&zonehash_lock);
2590 	}
2591 }
2592 
2593 void
2594 zone_task_hold(zone_t *z)
2595 {
2596 	mutex_enter(&z->zone_lock);
2597 	z->zone_ntasks++;
2598 	ASSERT(z->zone_ntasks != 0);
2599 	mutex_exit(&z->zone_lock);
2600 }
2601 
/*
 * Account for one task leaving the zone.
 *
 * While more than one task remains this only decrements zone_ntasks.
 * When exactly one task remains and the zone is shutting down, the zone is
 * moved to ZONE_IS_EMPTY.  When no task remains, the zone is moved to
 * ZONE_IS_DEAD.  For the last two cases a temporary zone reference is held
 * across the status update and dropped before returning.
 */
void
zone_task_rele(zone_t *zone)
{
	uint_t refcnt;

	mutex_enter(&zone->zone_lock);
	ASSERT(zone->zone_ntasks != 0);
	refcnt = --zone->zone_ntasks;
	if (refcnt > 1)	{	/* Common case */
		mutex_exit(&zone->zone_lock);
		return;
	}
	zone_hold_locked(zone);	/* so we can use the zone_t later */
	mutex_exit(&zone->zone_lock);
	if (refcnt == 1) {
		/*
		 * See if the zone is shutting down.
		 */
		mutex_enter(&zone_status_lock);
		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
			goto out;
		}

		/*
		 * Make sure the ntasks didn't change since we
		 * dropped zone_lock.
		 */
		mutex_enter(&zone->zone_lock);
		if (refcnt != zone->zone_ntasks) {
			mutex_exit(&zone->zone_lock);
			goto out;
		}
		mutex_exit(&zone->zone_lock);

		/*
		 * No more user processes in the zone.  The zone is empty.
		 */
		zone_status_set(zone, ZONE_IS_EMPTY);
		goto out;
	}

	ASSERT(refcnt == 0);
	/*
	 * zsched has exited; the zone is dead.
	 */
	zone->zone_zsched = NULL;		/* paranoia */
	mutex_enter(&zone_status_lock);
	zone_status_set(zone, ZONE_IS_DEAD);
out:
	/* every path reaching here holds zone_status_lock and our zone ref */
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
}
2654 
2655 zoneid_t
2656 getzoneid(void)
2657 {
2658 	return (curproc->p_zone->zone_id);
2659 }
2660 
2661 /*
2662  * Internal versions of zone_find_by_*().  These don't zone_hold() or
2663  * check the validity of a zone's state.
2664  */
2665 static zone_t *
2666 zone_find_all_by_id(zoneid_t zoneid)
2667 {
2668 	mod_hash_val_t hv;
2669 	zone_t *zone = NULL;
2670 
2671 	ASSERT(MUTEX_HELD(&zonehash_lock));
2672 
2673 	if (mod_hash_find(zonehashbyid,
2674 	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
2675 		zone = (zone_t *)hv;
2676 	return (zone);
2677 }
2678 
2679 static zone_t *
2680 zone_find_all_by_label(const ts_label_t *label)
2681 {
2682 	mod_hash_val_t hv;
2683 	zone_t *zone = NULL;
2684 
2685 	ASSERT(MUTEX_HELD(&zonehash_lock));
2686 
2687 	/*
2688 	 * zonehashbylabel is not maintained for unlabeled systems
2689 	 */
2690 	if (!is_system_labeled())
2691 		return (NULL);
2692 	if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
2693 		zone = (zone_t *)hv;
2694 	return (zone);
2695 }
2696 
2697 static zone_t *
2698 zone_find_all_by_name(char *name)
2699 {
2700 	mod_hash_val_t hv;
2701 	zone_t *zone = NULL;
2702 
2703 	ASSERT(MUTEX_HELD(&zonehash_lock));
2704 
2705 	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
2706 		zone = (zone_t *)hv;
2707 	return (zone);
2708 }
2709 
2710 /*
2711  * Public interface for looking up a zone by zoneid.  Only returns the zone if
2712  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
2713  * Caller must call zone_rele() once it is done with the zone.
2714  *
2715  * The zone may begin the zone_destroy() sequence immediately after this
2716  * function returns, but may be safely used until zone_rele() is called.
2717  */
2718 zone_t *
2719 zone_find_by_id(zoneid_t zoneid)
2720 {
2721 	zone_t *zone;
2722 	zone_status_t status;
2723 
2724 	mutex_enter(&zonehash_lock);
2725 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2726 		mutex_exit(&zonehash_lock);
2727 		return (NULL);
2728 	}
2729 	status = zone_status_get(zone);
2730 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2731 		/*
2732 		 * For all practical purposes the zone doesn't exist.
2733 		 */
2734 		mutex_exit(&zonehash_lock);
2735 		return (NULL);
2736 	}
2737 	zone_hold(zone);
2738 	mutex_exit(&zonehash_lock);
2739 	return (zone);
2740 }
2741 
2742 /*
2743  * Similar to zone_find_by_id, but using zone label as the key.
2744  */
2745 zone_t *
2746 zone_find_by_label(const ts_label_t *label)
2747 {
2748 	zone_t *zone;
2749 	zone_status_t status;
2750 
2751 	mutex_enter(&zonehash_lock);
2752 	if ((zone = zone_find_all_by_label(label)) == NULL) {
2753 		mutex_exit(&zonehash_lock);
2754 		return (NULL);
2755 	}
2756 
2757 	status = zone_status_get(zone);
2758 	if (status > ZONE_IS_DOWN) {
2759 		/*
2760 		 * For all practical purposes the zone doesn't exist.
2761 		 */
2762 		mutex_exit(&zonehash_lock);
2763 		return (NULL);
2764 	}
2765 	zone_hold(zone);
2766 	mutex_exit(&zonehash_lock);
2767 	return (zone);
2768 }
2769 
2770 /*
2771  * Similar to zone_find_by_id, but using zone name as the key.
2772  */
2773 zone_t *
2774 zone_find_by_name(char *name)
2775 {
2776 	zone_t *zone;
2777 	zone_status_t status;
2778 
2779 	mutex_enter(&zonehash_lock);
2780 	if ((zone = zone_find_all_by_name(name)) == NULL) {
2781 		mutex_exit(&zonehash_lock);
2782 		return (NULL);
2783 	}
2784 	status = zone_status_get(zone);
2785 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2786 		/*
2787 		 * For all practical purposes the zone doesn't exist.
2788 		 */
2789 		mutex_exit(&zonehash_lock);
2790 		return (NULL);
2791 	}
2792 	zone_hold(zone);
2793 	mutex_exit(&zonehash_lock);
2794 	return (zone);
2795 }
2796 
2797 /*
2798  * Similar to zone_find_by_id(), using the path as a key.  For instance,
2799  * if there is a zone "foo" rooted at /foo/root, and the path argument
2800  * is "/foo/root/proc", it will return the held zone_t corresponding to
2801  * zone "foo".
2802  *
2803  * zone_find_by_path() always returns a non-NULL value, since at the
2804  * very least every path will be contained in the global zone.
2805  *
2806  * As with the other zone_find_by_*() functions, the caller is
2807  * responsible for zone_rele()ing the return value of this function.
2808  */
2809 zone_t *
2810 zone_find_by_path(const char *path)
2811 {
2812 	zone_t *zone;
2813 	zone_t *zret = NULL;
2814 	zone_status_t status;
2815 
2816 	if (path == NULL) {
2817 		/*
2818 		 * Call from rootconf().
2819 		 */
2820 		zone_hold(global_zone);
2821 		return (global_zone);
2822 	}
2823 	ASSERT(*path == '/');
2824 	mutex_enter(&zonehash_lock);
2825 	for (zone = list_head(&zone_active); zone != NULL;
2826 	    zone = list_next(&zone_active, zone)) {
2827 		if (ZONE_PATH_VISIBLE(path, zone))
2828 			zret = zone;
2829 	}
2830 	ASSERT(zret != NULL);
2831 	status = zone_status_get(zret);
2832 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2833 		/*
2834 		 * Zone practically doesn't exist.
2835 		 */
2836 		zret = global_zone;
2837 	}
2838 	zone_hold(zret);
2839 	mutex_exit(&zonehash_lock);
2840 	return (zret);
2841 }
2842 
2843 /*
2844  * Get the number of cpus visible to this zone.  The system-wide global
2845  * 'ncpus' is returned if pools are disabled, the caller is in the
2846  * global zone, or a NULL zone argument is passed in.
2847  */
2848 int
2849 zone_ncpus_get(zone_t *zone)
2850 {
2851 	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
2852 
2853 	return (myncpus != 0 ? myncpus : ncpus);
2854 }
2855 
2856 /*
2857  * Get the number of online cpus visible to this zone.  The system-wide
2858  * global 'ncpus_online' is returned if pools are disabled, the caller
2859  * is in the global zone, or a NULL zone argument is passed in.
2860  */
2861 int
2862 zone_ncpus_online_get(zone_t *zone)
2863 {
2864 	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
2865 
2866 	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
2867 }
2868 
2869 /*
2870  * Return the pool to which the zone is currently bound.
2871  */
2872 pool_t *
2873 zone_pool_get(zone_t *zone)
2874 {
2875 	ASSERT(pool_lock_held());
2876 
2877 	return (zone->zone_pool);
2878 }
2879 
2880 /*
2881  * Set the zone's pool pointer and update the zone's visibility to match
2882  * the resources in the new pool.
2883  */
2884 void
2885 zone_pool_set(zone_t *zone, pool_t *pool)
2886 {
2887 	ASSERT(pool_lock_held());
2888 	ASSERT(MUTEX_HELD(&cpu_lock));
2889 
2890 	zone->zone_pool = pool;
2891 	zone_pset_set(zone, pool->pool_pset->pset_id);
2892 }
2893 
2894 /*
2895  * Return the cached value of the id of the processor set to which the
2896  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
2897  * facility is disabled.
2898  */
2899 psetid_t
2900 zone_pset_get(zone_t *zone)
2901 {
2902 	ASSERT(MUTEX_HELD(&cpu_lock));
2903 
2904 	return (zone->zone_psetid);
2905 }
2906 
2907 /*
2908  * Set the cached value of the id of the processor set to which the zone
2909  * is currently bound.  Also update the zone's visibility to match the
2910  * resources in the new processor set.
2911  */
2912 void
2913 zone_pset_set(zone_t *zone, psetid_t newpsetid)
2914 {
2915 	psetid_t oldpsetid;
2916 
2917 	ASSERT(MUTEX_HELD(&cpu_lock));
2918 	oldpsetid = zone_pset_get(zone);
2919 
2920 	if (oldpsetid == newpsetid)
2921 		return;
2922 	/*
2923 	 * Global zone sees all.
2924 	 */
2925 	if (zone != global_zone) {
2926 		zone->zone_psetid = newpsetid;
2927 		if (newpsetid != ZONE_PS_INVAL)
2928 			pool_pset_visibility_add(newpsetid, zone);
2929 		if (oldpsetid != ZONE_PS_INVAL)
2930 			pool_pset_visibility_remove(oldpsetid, zone);
2931 	}
2932 	/*
2933 	 * Disabling pools, so we should start using the global values
2934 	 * for ncpus and ncpus_online.
2935 	 */
2936 	if (newpsetid == ZONE_PS_INVAL) {
2937 		zone->zone_ncpus = 0;
2938 		zone->zone_ncpus_online = 0;
2939 	}
2940 }
2941 
2942 /*
2943  * Walk the list of active zones and issue the provided callback for
2944  * each of them.
2945  *
2946  * Caller must not be holding any locks that may be acquired under
2947  * zonehash_lock.  See comment at the beginning of the file for a list of
2948  * common locks and their interactions with zones.
2949  */
2950 int
2951 zone_walk(int (*cb)(zone_t *, void *), void *data)
2952 {
2953 	zone_t *zone;
2954 	int ret = 0;
2955 	zone_status_t status;
2956 
2957 	mutex_enter(&zonehash_lock);
2958 	for (zone = list_head(&zone_active); zone != NULL;
2959 	    zone = list_next(&zone_active, zone)) {
2960 		/*
2961 		 * Skip zones that shouldn't be externally visible.
2962 		 */
2963 		status = zone_status_get(zone);
2964 		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
2965 			continue;
2966 		/*
2967 		 * Bail immediately if any callback invocation returns a
2968 		 * non-zero value.
2969 		 */
2970 		ret = (*cb)(zone, data);
2971 		if (ret != 0)
2972 			break;
2973 	}
2974 	mutex_exit(&zonehash_lock);
2975 	return (ret);
2976 }
2977 
/*
 * Set the zone's root from the user-supplied path 'upath'.
 *
 * The path is looked up (following symlinks) and checked for VEXEC access.
 * On success the zone keeps the held vnode in zone_rootvp, a kmem-allocated
 * copy of the path from 'pn' with a trailing '/' appended in zone_rootpath,
 * and the size of that allocation in zone_rootpathlen.  ZF_IS_SCRATCH is
 * set when the stored path ends in "/lu/".
 *
 * Returns 0 on success, otherwise the error from pn_get(), lookuppn(),
 * VOP_ACCESS() or traverse().  ESTALE results are retried a bounded number
 * of times before ESTALE itself is returned.
 */
static int
zone_set_root(zone_t *zone, const char *upath)
{
	vnode_t *vp;
	int trycount;
	int error = 0;
	char *path;
	struct pathname upn, pn;
	size_t pathlen;

	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
		return (error);

	pn_alloc(&pn);

	/* prevent infinite loop */
	trycount = 10;
	for (;;) {
		if (--trycount <= 0) {
			error = ESTALE;
			goto out;
		}

		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
			/*
			 * VOP_ACCESS() may cover 'vp' with a new
			 * filesystem, if 'vp' is an autoFS vnode.
			 * Get the new 'vp' if so.
			 */
			if ((error =
			    VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
			    (!vn_ismntpt(vp) ||
			    (error = traverse(&vp)) == 0)) {
				/* room for the path, a trailing '/', and NUL */
				pathlen = pn.pn_pathlen + 2;
				path = kmem_alloc(pathlen, KM_SLEEP);
				(void) strncpy(path, pn.pn_path,
				    pn.pn_pathlen + 1);
				path[pathlen - 2] = '/';
				path[pathlen - 1] = '\0';
				pn_free(&pn);
				pn_free(&upn);

				/* Success! */
				break;
			}
			/* access check or traverse failed; drop the hold */
			VN_RELE(vp);
		}
		/* only ESTALE is worth another attempt */
		if (error != ESTALE)
			goto out;
	}

	ASSERT(error == 0);
	zone->zone_rootvp = vp;		/* we hold a reference to vp */
	zone->zone_rootpath = path;
	zone->zone_rootpathlen = pathlen;
	/* pathlen counts the NUL, so the last four characters start at -5 */
	if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
		zone->zone_flags |= ZF_IS_SCRATCH;
	return (0);

out:
	/* failure: both pathname buffers are still allocated here */
	pn_free(&pn);
	pn_free(&upn);
	return (error);
}
3042 
3043 #define	isalnum(c)	(((c) >= '0' && (c) <= '9') || \
3044 			((c) >= 'a' && (c) <= 'z') || \
3045 			((c) >= 'A' && (c) <= 'Z'))
3046 
3047 static int
3048 zone_set_name(zone_t *zone, const char *uname)
3049 {
3050 	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3051 	size_t len;
3052 	int i, err;
3053 
3054 	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
3055 		kmem_free(kname, ZONENAME_MAX);
3056 		return (err);	/* EFAULT or ENAMETOOLONG */
3057 	}
3058 
3059 	/* must be less than ZONENAME_MAX */
3060 	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
3061 		kmem_free(kname, ZONENAME_MAX);
3062 		return (EINVAL);
3063 	}
3064 
3065 	/*
3066 	 * Name must start with an alphanumeric and must contain only
3067 	 * alphanumerics, '-', '_' and '.'.
3068 	 */
3069 	if (!isalnum(kname[0])) {
3070 		kmem_free(kname, ZONENAME_MAX);
3071 		return (EINVAL);
3072 	}
3073 	for (i = 1; i < len - 1; i++) {
3074 		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
3075 		    kname[i] != '.') {
3076 			kmem_free(kname, ZONENAME_MAX);
3077 			return (EINVAL);
3078 		}
3079 	}
3080 
3081 	zone->zone_name = kname;
3082 	return (0);
3083 }
3084 
3085 /*
3086  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
3087  * is NULL or it points to a zone with no hostid emulation, then the machine's
3088  * hostid (i.e., the global zone's hostid) is returned.  This function returns
3089  * zero if neither the zone nor the host machine (global zone) have hostids.  It
3090  * returns HW_INVALID_HOSTID if the function attempts to return the machine's
3091  * hostid and the machine's hostid is invalid.
3092  */
3093 uint32_t
3094 zone_get_hostid(zone_t *zonep)
3095 {
3096 	unsigned long machine_hostid;
3097 
3098 	if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
3099 		if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
3100 			return (HW_INVALID_HOSTID);
3101 		return ((uint32_t)machine_hostid);
3102 	}
3103 	return (zonep->zone_hostid);
3104 }
3105 
/*
 * Similar to thread_create(), but makes sure the thread is in the appropriate
 * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
 *
 * The arguments are handed to thread_create() unchanged; the owning process
 * is always the calling process's zone's zsched and the thread is created
 * stopped, then started here once it has been linked into the zone.
 * Returns the new thread.  A hold is taken on the zone on the thread's
 * behalf.
 */
/*ARGSUSED*/
kthread_t *
zthread_create(
    caddr_t stk,
    size_t stksize,
    void (*proc)(),
    void *arg,
    size_t len,
    pri_t pri)
{
	kthread_t *t;
	zone_t *zone = curproc->p_zone;
	proc_t *pp = zone->zone_zsched;

	zone_hold(zone);	/* Reference to be dropped when thread exits */

	/*
	 * No-one should be trying to create threads if the zone is shutting
	 * down and there aren't any kernel threads around.  See comment
	 * in zthread_exit().
	 */
	ASSERT(!(zone->zone_kthreads == NULL &&
	    zone_status_get(zone) >= ZONE_IS_EMPTY));
	/*
	 * Create a thread, but don't let it run until we've finished setting
	 * things up.
	 */
	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
	ASSERT(t->t_forw == NULL);
	/*
	 * Link the new thread into the zone's circular, doubly-linked list
	 * of kernel threads (via t_forw/t_back) and make it the list head.
	 */
	mutex_enter(&zone_status_lock);
	if (zone->zone_kthreads == NULL) {
		/* First kernel thread in the zone: a ring of one. */
		t->t_forw = t->t_back = t;
	} else {
		kthread_t *tx = zone->zone_kthreads;

		/* Insert just before the current head. */
		t->t_forw = tx;
		t->t_back = tx->t_back;
		tx->t_back->t_forw = t;
		tx->t_back = t;
	}
	zone->zone_kthreads = t;
	mutex_exit(&zone_status_lock);

	mutex_enter(&pp->p_lock);
	t->t_proc_flag |= TP_ZTHREAD;
	/* Swap the thread's project for the one zsched's task belongs to. */
	project_rele(t->t_proj);
	t->t_proj = project_hold(pp->p_task->tk_proj);

	/*
	 * Setup complete, let it run.
	 */
	thread_lock(t);
	t->t_schedflag |= TS_ALLSTART;
	setrun_locked(t);
	thread_unlock(t);

	mutex_exit(&pp->p_lock);

	return (t);
}
3170 
/*
 * Similar to thread_exit().  Must be called by threads created via
 * zthread_create().
 *
 * Reparents the calling thread to p0, unlinks it from the zone's list of
 * kernel threads, drops the zone hold taken in zthread_create(), and then
 * exits the thread.  If this was the last kernel thread of a zone that is
 * already ZONE_IS_EMPTY, the zone is moved to ZONE_IS_DOWN.
 */
void
zthread_exit(void)
{
	kthread_t *t = curthread;
	proc_t *pp = curproc;
	zone_t *zone = pp->p_zone;

	/* Held across both the reparenting and the list manipulation. */
	mutex_enter(&zone_status_lock);

	/*
	 * Reparent to p0
	 */
	kpreempt_disable();
	mutex_enter(&pp->p_lock);
	t->t_proc_flag &= ~TP_ZTHREAD;
	t->t_procp = &p0;
	hat_thread_exit(t);
	mutex_exit(&pp->p_lock);
	kpreempt_enable();

	/* A thread that is its own neighbour is the only one on the ring. */
	if (t->t_back == t) {
		ASSERT(t->t_forw == t);
		/*
		 * If the zone is empty, once the thread count
		 * goes to zero no further kernel threads can be
		 * created.  This is because if the creator is a process
		 * in the zone, then it must have exited before the zone
		 * state could be set to ZONE_IS_EMPTY.
		 * Otherwise, if the creator is a kernel thread in the
		 * zone, the thread count is non-zero.
		 *
		 * This really means that non-zone kernel threads should
		 * not create zone kernel threads.
		 */
		zone->zone_kthreads = NULL;
		if (zone_status_get(zone) == ZONE_IS_EMPTY) {
			zone_status_set(zone, ZONE_IS_DOWN);
			/*
			 * Remove any CPU caps on this zone.
			 */
			cpucaps_zone_remove(zone);
		}
	} else {
		/* Unlink from the ring, moving the head if it was us. */
		t->t_forw->t_back = t->t_back;
		t->t_back->t_forw = t->t_forw;
		if (zone->zone_kthreads == t)
			zone->zone_kthreads = t->t_forw;
	}
	mutex_exit(&zone_status_lock);
	/* Drop the reference taken by zthread_create(). */
	zone_rele(zone);
	thread_exit();
	/* NOTREACHED */
}
3228 
3229 static void
3230 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3231 {
3232 	vnode_t *oldvp;
3233 
3234 	/* we're going to hold a reference here to the directory */
3235 	VN_HOLD(vp);
3236 
3237 	/* update abs cwd/root path see c2/audit.c */
3238 	if (AU_AUDITING())
3239 		audit_chdirec(vp, vpp);
3240 
3241 	mutex_enter(&pp->p_lock);
3242 	oldvp = *vpp;
3243 	*vpp = vp;
3244 	mutex_exit(&pp->p_lock);
3245 	if (oldvp != NULL)
3246 		VN_RELE(oldvp);
3247 }
3248 
3249 /*
3250  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3251  */
3252 static int
3253 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3254 {
3255 	nvpair_t *nvp = NULL;
3256 	boolean_t priv_set = B_FALSE;
3257 	boolean_t limit_set = B_FALSE;
3258 	boolean_t action_set = B_FALSE;
3259 
3260 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3261 		const char *name;
3262 		uint64_t ui64;
3263 
3264 		name = nvpair_name(nvp);
3265 		if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3266 			return (EINVAL);
3267 		(void) nvpair_value_uint64(nvp, &ui64);
3268 		if (strcmp(name, "privilege") == 0) {
3269 			/*
3270 			 * Currently only privileged values are allowed, but
3271 			 * this may change in the future.
3272 			 */
3273 			if (ui64 != RCPRIV_PRIVILEGED)
3274 				return (EINVAL);
3275 			rv->rcv_privilege = ui64;
3276 			priv_set = B_TRUE;
3277 		} else if (strcmp(name, "limit") == 0) {
3278 			rv->rcv_value = ui64;
3279 			limit_set = B_TRUE;
3280 		} else if (strcmp(name, "action") == 0) {
3281 			if (ui64 != RCTL_LOCAL_NOACTION &&
3282 			    ui64 != RCTL_LOCAL_DENY)
3283 				return (EINVAL);
3284 			rv->rcv_flagaction = ui64;
3285 			action_set = B_TRUE;
3286 		} else {
3287 			return (EINVAL);
3288 		}
3289 	}
3290 
3291 	if (!(priv_set && limit_set && action_set))
3292 		return (EINVAL);
3293 	rv->rcv_action_signal = 0;
3294 	rv->rcv_action_recipient = NULL;
3295 	rv->rcv_action_recip_pid = -1;
3296 	rv->rcv_firing_time = 0;
3297 
3298 	return (0);
3299 }
3300 
/*
 * Non-global zone version of start_init.
 *
 * Runs in the context of the zone's new init process: records its pid in
 * the zone, runs start_init_common(), and then either moves a BOOTING zone
 * to RUNNING and returns to userland, or (on failure, or if the global
 * zone is shutting down) moves a BOOTING zone to SHUTTING_DOWN and
 * disposes of the process.
 */
void
zone_start_init(void)
{
	proc_t *p = ttoproc(curthread);
	zone_t *z = p->p_zone;

	ASSERT(!INGLOBALZONE(curproc));

	/*
	 * For all purposes (ZONE_ATTR_INITPID and restart_init),
	 * storing just the pid of init is sufficient.
	 */
	z->zone_proc_initpid = p->p_pid;

	/*
	 * We maintain zone_boot_err so that we can return the cause of the
	 * failure back to the caller of the zone_boot syscall.
	 */
	p->p_zone->zone_boot_err = start_init_common();

	/*
	 * We will prevent booting zones from becoming running zones if the
	 * global zone is shutting down.
	 */
	mutex_enter(&zone_status_lock);
	if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
	    ZONE_IS_SHUTTING_DOWN) {
		/*
		 * Make sure we are still in the booting state-- we could have
		 * raced and already be shutting down, or even further along.
		 */
		if (zone_status_get(z) == ZONE_IS_BOOTING) {
			zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
		}
		mutex_exit(&zone_status_lock);
		/* It's gone bad, dispose of the process */
		if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
			/*
			 * proc_exit() did not complete the exit; fall back
			 * to exiting just this lwp (p_lock is taken before
			 * the call to lwp_exit()).
			 */
			mutex_enter(&p->p_lock);
			ASSERT(p->p_flag & SEXITLWPS);
			lwp_exit();
		}
	} else {
		/* Only promote to RUNNING if nobody changed the state. */
		if (zone_status_get(z) == ZONE_IS_BOOTING)
			zone_status_set(z, ZONE_IS_RUNNING);
		mutex_exit(&zone_status_lock);
		/* cause the process to return to userland. */
		lwp_rtt();
	}
}
3353 
/*
 * Argument bundle handed to zsched().
 */
struct zsched_arg {
	zone_t *zone;		/* zone this zsched process belongs to */
	nvlist_t *nvlist;	/* rctls to apply to the zone */
};
3358 
/*
 * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
 * anything to do with scheduling, but rather with the fact that
 * per-zone kernel threads are parented to zsched, just like regular
 * kernel threads are parented to sched (p0).
 *
 * zsched is also responsible for launching init for the zone.
 *
 * 'arg' is a struct zsched_arg carrying the zone and the nvlist of rctls
 * to apply.  The function moves the calling process into the zone, sets up
 * its task, credentials, root directory and rctls, walks the zone through
 * the INITIALIZED and READY states, starts init once the zone is BOOTING,
 * and then waits for the zone to be DYING before exiting.
 */
static void
zsched(void *arg)
{
	struct zsched_arg *za = arg;
	proc_t *pp = curproc;
	proc_t *initp = proc_init;
	zone_t *zone = za->zone;
	cred_t *cr, *oldcred;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	contract_t *ct = NULL;
	task_t *tk, *oldtk;
	rctl_entity_p_t e;
	kproject_t *pj;

	nvlist_t *nvl = za->nvlist;
	nvpair_t *nvp = NULL;

	/* Present ourselves as "zsched" with no arguments or open files. */
	bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
	bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
	PTOU(pp)->u_argc = 0;
	PTOU(pp)->u_argv = NULL;
	PTOU(pp)->u_envp = NULL;
	closeall(P_FINFO(pp));

	/*
	 * We are this zone's "zsched" process.  As the zone isn't generally
	 * visible yet we don't need to grab any locks before initializing its
	 * zone_zsched pointer.
	 */
	zone_hold(zone);  /* this hold is released by zone_destroy() */
	zone->zone_zsched = pp;
	mutex_enter(&pp->p_lock);
	pp->p_zone = zone;
	mutex_exit(&pp->p_lock);

	/*
	 * Disassociate process from its 'parent'; parent ourselves to init
	 * (pid 1) and change other values as needed.
	 */
	sess_create();

	mutex_enter(&pidlock);
	proc_detach(pp);
	pp->p_ppid = 1;
	pp->p_flag |= SZONETOP;
	pp->p_ancpid = 1;
	pp->p_parent = initp;
	/* Insert ourselves at the head of init's list of children. */
	pp->p_psibling = NULL;
	if (initp->p_child)
		initp->p_child->p_psibling = pp;
	pp->p_sibling = initp->p_child;
	initp->p_child = pp;

	/* Decrement what newproc() incremented. */
	upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
	/*
	 * Our credentials are about to become kcred-like, so we don't care
	 * about the caller's ruid.
	 */
	upcount_inc(crgetruid(kcred), zone->zone_id);
	mutex_exit(&pidlock);

	/*
	 * getting out of global zone, so decrement lwp and process counts
	 */
	pj = pp->p_task->tk_proj;
	mutex_enter(&global_zone->zone_nlwps_lock);
	pj->kpj_nlwps -= pp->p_lwpcnt;
	global_zone->zone_nlwps -= pp->p_lwpcnt;
	pj->kpj_nprocs--;
	global_zone->zone_nprocs--;
	mutex_exit(&global_zone->zone_nlwps_lock);

	/*
	 * Decrement locked memory counts on old zone and project.
	 */
	mutex_enter(&global_zone->zone_mem_lock);
	global_zone->zone_locked_mem -= pp->p_locked_mem;
	pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
	mutex_exit(&global_zone->zone_mem_lock);

	/*
	 * Create and join a new task in project '0' of this zone.
	 *
	 * We don't need to call holdlwps() since we know we're the only lwp in
	 * this process.
	 *
	 * task_join() returns with p_lock held.
	 */
	tk = task_create(0, zone);
	mutex_enter(&cpu_lock);
	oldtk = task_join(tk, 0);

	/* From here on 'pj' is the project of the newly joined task. */
	pj = pp->p_task->tk_proj;

	mutex_enter(&zone->zone_mem_lock);
	zone->zone_locked_mem += pp->p_locked_mem;
	pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
	mutex_exit(&zone->zone_mem_lock);

	/*
	 * add lwp and process counts to zsched's zone, and increment
	 * project's task and process count due to the task created in
	 * the above task_create.
	 */
	mutex_enter(&zone->zone_nlwps_lock);
	pj->kpj_nlwps += pp->p_lwpcnt;
	pj->kpj_ntasks += 1;
	zone->zone_nlwps += pp->p_lwpcnt;
	pj->kpj_nprocs++;
	zone->zone_nprocs++;
	mutex_exit(&zone->zone_nlwps_lock);

	/* Drop the p_lock that task_join() returned with, then cpu_lock. */
	mutex_exit(&curproc->p_lock);
	mutex_exit(&cpu_lock);
	task_rele(oldtk);

	/*
	 * The process was created by a process in the global zone, hence the
	 * credentials are wrong.  We might as well have kcred-ish credentials.
	 */
	cr = zone->zone_kcred;
	crhold(cr);
	mutex_enter(&pp->p_crlock);
	oldcred = pp->p_cred;
	pp->p_cred = cr;
	mutex_exit(&pp->p_crlock);
	crfree(oldcred);

	/*
	 * Hold credentials again (for thread)
	 */
	crhold(cr);

	/*
	 * p_lwpcnt can't change since this is a kernel process.
	 */
	crset(pp, cr);

	/*
	 * Chroot
	 */
	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);

	/*
	 * Initialize zone's rctl set.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&pp->p_lock);
	e.rcep_p.zone = zone;
	e.rcep_t = RCENTITY_ZONE;
	zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
	mutex_exit(&pp->p_lock);
	rctl_prealloc_destroy(gp);

	/*
	 * Apply the rctls passed in to zone_create().  This is basically a list
	 * assignment: all of the old values are removed and the new ones
	 * inserted.  That is, if an empty list is passed in, all values are
	 * removed.
	 */
	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		rctl_dict_entry_t *rde;
		rctl_hndl_t hndl;
		char *name;
		nvlist_t **nvlarray;
		uint_t i, nelem;
		int error;	/* For ASSERT()s */

		name = nvpair_name(nvp);
		hndl = rctl_hndl_lookup(name);
		ASSERT(hndl != -1);
		rde = rctl_dict_lookup_hndl(hndl);
		ASSERT(rde != NULL);

		/*
		 * Delete existing values for this rctl one at a time until
		 * the first value is the RCPRIV_SYSTEM one.
		 */
		for (; /* ever */; ) {
			rctl_val_t oval;

			mutex_enter(&pp->p_lock);
			error = rctl_local_get(hndl, NULL, &oval, pp);
			mutex_exit(&pp->p_lock);
			ASSERT(error == 0);	/* Can't fail for RCTL_FIRST */
			ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
			if (oval.rcv_privilege == RCPRIV_SYSTEM)
				break;
			mutex_enter(&pp->p_lock);
			error = rctl_local_delete(hndl, &oval, pp);
			mutex_exit(&pp->p_lock);
			ASSERT(error == 0);
		}
		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
		ASSERT(error == 0);
		/* Insert each of the new values supplied for this rctl. */
		for (i = 0; i < nelem; i++) {
			rctl_val_t *nvalp;

			nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
			error = nvlist2rctlval(nvlarray[i], nvalp);
			ASSERT(error == 0);
			/*
			 * rctl_local_insert can fail if the value being
			 * inserted is a duplicate; this is OK.
			 */
			mutex_enter(&pp->p_lock);
			if (rctl_local_insert(hndl, nvalp, pp) != 0)
				kmem_cache_free(rctl_val_cache, nvalp);
			mutex_exit(&pp->p_lock);
		}
	}
	/*
	 * Tell the world that we're done setting up.
	 *
	 * At this point we want to set the zone status to ZONE_IS_INITIALIZED
	 * and atomically set the zone's processor set visibility.  Once
	 * we drop pool_lock() this zone will automatically get updated
	 * to reflect any future changes to the pools configuration.
	 *
	 * Note that after we drop the locks below (zonehash_lock in
	 * particular) other operations such as a zone_getattr call can
	 * now proceed and observe the zone. That is the reason for doing a
	 * state transition to the INITIALIZED state.
	 */
	pool_lock();
	mutex_enter(&cpu_lock);
	mutex_enter(&zonehash_lock);
	zone_uniqid(zone);
	zone_zsd_configure(zone);
	if (pool_state == POOL_ENABLED)
		zone_pset_set(zone, pool_default->pool_pset->pset_id);
	mutex_enter(&zone_status_lock);
	ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
	zone_status_set(zone, ZONE_IS_INITIALIZED);
	mutex_exit(&zone_status_lock);
	mutex_exit(&zonehash_lock);
	mutex_exit(&cpu_lock);
	pool_unlock();

	/* Now call the create callback for this key */
	zsd_apply_all_keys(zsd_apply_create, zone);

	/* The callbacks are complete. Mark ZONE_IS_READY */
	mutex_enter(&zone_status_lock);
	ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
	zone_status_set(zone, ZONE_IS_READY);
	mutex_exit(&zone_status_lock);

	/*
	 * Once we see the zone transition to the ZONE_IS_BOOTING state,
	 * we launch init, and set the state to running.
	 */
	zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");

	/*
	 * The status is re-checked after the wait; init is only launched
	 * if the zone is exactly BOOTING at this point.
	 */
	if (zone_status_get(zone) == ZONE_IS_BOOTING) {
		id_t cid;

		/*
		 * Ok, this is a little complicated.  We need to grab the
		 * zone's pool's scheduling class ID; note that by now, we
		 * are already bound to a pool if we need to be (zoneadmd
		 * will have done that to us while we're in the READY
		 * state).  *But* the scheduling class for the zone's 'init'
		 * must be explicitly passed to newproc, which doesn't
		 * respect pool bindings.
		 *
		 * We hold the pool_lock across the call to newproc() to
		 * close the obvious race: the pool's scheduling class
		 * could change before we manage to create the LWP with
		 * classid 'cid'.
		 */
		pool_lock();
		/* A zone-specific default class overrides the pool's. */
		if (zone->zone_defaultcid > 0)
			cid = zone->zone_defaultcid;
		else
			cid = pool_get_class(zone->zone_pool);
		if (cid == -1)
			cid = defaultcid;

		/*
		 * If this fails, zone_boot will ultimately fail.  The
		 * state of the zone will be set to SHUTTING_DOWN-- userland
		 * will have to tear down the zone, and fail, or try again.
		 */
		if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
		    minclsyspri - 1, &ct, 0)) != 0) {
			mutex_enter(&zone_status_lock);
			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
			mutex_exit(&zone_status_lock);
		}
		pool_unlock();
	}

	/*
	 * Wait for zone_destroy() to be called.  This is what we spend
	 * most of our life doing.
	 */
	zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");

	if (ct)
		/*
		 * At this point the process contract should be empty.
		 * (Though if it isn't, it's not the end of the world.)
		 */
		VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);

	/*
	 * Allow kcred to be freed when all referring processes
	 * (including this one) go away.  We can't just do this in
	 * zone_free because we need to wait for the zone_cred_ref to
	 * drop to 0 before calling zone_free, and the existence of
	 * zone_kcred will prevent that.  Thus, we call crfree here to
	 * balance the crdup in zone_create.  The crhold calls earlier
	 * in zsched will be dropped when the thread and process exit.
	 */
	crfree(zone->zone_kcred);
	zone->zone_kcred = NULL;

	exit(CLD_EXITED, 0);
}
3687 
3688 /*
3689  * Helper function to determine if there are any submounts of the
3690  * provided path.  Used to make sure the zone doesn't "inherit" any
3691  * mounts from before it is created.
3692  */
3693 static uint_t
3694 zone_mount_count(const char *rootpath)
3695 {
3696 	vfs_t *vfsp;
3697 	uint_t count = 0;
3698 	size_t rootpathlen = strlen(rootpath);
3699 
3700 	/*
3701 	 * Holding zonehash_lock prevents race conditions with
3702 	 * vfs_list_add()/vfs_list_remove() since we serialize with
3703 	 * zone_find_by_path().
3704 	 */
3705 	ASSERT(MUTEX_HELD(&zonehash_lock));
3706 	/*
3707 	 * The rootpath must end with a '/'
3708 	 */
3709 	ASSERT(rootpath[rootpathlen - 1] == '/');
3710 
3711 	/*
3712 	 * This intentionally does not count the rootpath itself if that
3713 	 * happens to be a mount point.
3714 	 */
3715 	vfs_list_read_lock();
3716 	vfsp = rootvfs;
3717 	do {
3718 		if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
3719 		    rootpathlen) == 0)
3720 			count++;
3721 		vfsp = vfsp->vfs_next;
3722 	} while (vfsp != rootvfs);
3723 	vfs_list_unlock();
3724 	return (count);
3725 }
3726 
3727 /*
3728  * Helper function to make sure that a zone created on 'rootpath'
3729  * wouldn't end up containing other zones' rootpaths.
3730  */
3731 static boolean_t
3732 zone_is_nested(const char *rootpath)
3733 {
3734 	zone_t *zone;
3735 	size_t rootpathlen = strlen(rootpath);
3736 	size_t len;
3737 
3738 	ASSERT(MUTEX_HELD(&zonehash_lock));
3739 
3740 	/*
3741 	 * zone_set_root() appended '/' and '\0' at the end of rootpath
3742 	 */
3743 	if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
3744 	    (rootpath[1] == '/') && (rootpath[2] == '\0'))
3745 		return (B_TRUE);
3746 
3747 	for (zone = list_head(&zone_active); zone != NULL;
3748 	    zone = list_next(&zone_active, zone)) {
3749 		if (zone == global_zone)
3750 			continue;
3751 		len = strlen(zone->zone_rootpath);
3752 		if (strncmp(rootpath, zone->zone_rootpath,
3753 		    MIN(rootpathlen, len)) == 0)
3754 			return (B_TRUE);
3755 	}
3756 	return (B_FALSE);
3757 }
3758 
3759 static int
3760 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
3761     size_t zone_privssz)
3762 {
3763 	priv_set_t *privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
3764 
3765 	if (zone_privssz < sizeof (priv_set_t))
3766 		return (set_errno(ENOMEM));
3767 
3768 	if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
3769 		kmem_free(privs, sizeof (priv_set_t));
3770 		return (EFAULT);
3771 	}
3772 
3773 	zone->zone_privset = privs;
3774 	return (0);
3775 }
3776 
3777 /*
3778  * We make creative use of nvlists to pass in rctls from userland.  The list is
3779  * a list of the following structures:
3780  *
3781  * (name = rctl_name, value = nvpair_list_array)
3782  *
3783  * Where each element of the nvpair_list_array is of the form:
3784  *
3785  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
3786  * 	(name = "limit", value = uint64_t),
3787  * 	(name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
3788  */
3789 static int
3790 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
3791 {
3792 	nvpair_t *nvp = NULL;
3793 	nvlist_t *nvl = NULL;
3794 	char *kbuf;
3795 	int error;
3796 	rctl_val_t rv;
3797 
3798 	*nvlp = NULL;
3799 
3800 	if (buflen == 0)
3801 		return (0);
3802 
3803 	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
3804 		return (ENOMEM);
3805 	if (copyin(ubuf, kbuf, buflen)) {
3806 		error = EFAULT;
3807 		goto out;
3808 	}
3809 	if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
3810 		/*
3811 		 * nvl may have been allocated/free'd, but the value set to
3812 		 * non-NULL, so we reset it here.
3813 		 */
3814 		nvl = NULL;
3815 		error = EINVAL;
3816 		goto out;
3817 	}
3818 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3819 		rctl_dict_entry_t *rde;
3820 		rctl_hndl_t hndl;
3821 		nvlist_t **nvlarray;
3822 		uint_t i, nelem;
3823 		char *name;
3824 
3825 		error = EINVAL;
3826 		name = nvpair_name(nvp);
3827 		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
3828 		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
3829 			goto out;
3830 		}
3831 		if ((hndl = rctl_hndl_lookup(name)) == -1) {
3832 			goto out;
3833 		}
3834 		rde = rctl_dict_lookup_hndl(hndl);
3835 		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3836 		ASSERT(error == 0);
3837 		for (i = 0; i < nelem; i++) {
3838 			if (error = nvlist2rctlval(nvlarray[i], &rv))
3839 				goto out;
3840 		}
3841 		if (rctl_invalid_value(rde, &rv)) {
3842 			error = EINVAL;
3843 			goto out;
3844 		}
3845 	}
3846 	error = 0;
3847 	*nvlp = nvl;
3848 out:
3849 	kmem_free(kbuf, buflen);
3850 	if (error && nvl != NULL)
3851 		nvlist_free(nvl);
3852 	return (error);
3853 }
3854 
3855 int
3856 zone_create_error(int er_error, int er_ext, int *er_out) {
3857 	if (er_out != NULL) {
3858 		if (copyout(&er_ext, er_out, sizeof (int))) {
3859 			return (set_errno(EFAULT));
3860 		}
3861 	}
3862 	return (set_errno(er_error));
3863 }
3864 
3865 static int
3866 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
3867 {
3868 	ts_label_t *tsl;
3869 	bslabel_t blab;
3870 
3871 	/* Get label from user */
3872 	if (copyin(lab, &blab, sizeof (blab)) != 0)
3873 		return (EFAULT);
3874 	tsl = labelalloc(&blab, doi, KM_NOSLEEP);
3875 	if (tsl == NULL)
3876 		return (ENOMEM);
3877 
3878 	zone->zone_slabel = tsl;
3879 	return (0);
3880 }
3881 
3882 /*
3883  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
3884  */
3885 static int
3886 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
3887 {
3888 	char *kbuf;
3889 	char *dataset, *next;
3890 	zone_dataset_t *zd;
3891 	size_t len;
3892 
3893 	if (ubuf == NULL || buflen == 0)
3894 		return (0);
3895 
3896 	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
3897 		return (ENOMEM);
3898 
3899 	if (copyin(ubuf, kbuf, buflen) != 0) {
3900 		kmem_free(kbuf, buflen);
3901 		return (EFAULT);
3902 	}
3903 
3904 	dataset = next = kbuf;
3905 	for (;;) {
3906 		zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
3907 
3908 		next = strchr(dataset, ',');
3909 
3910 		if (next == NULL)
3911 			len = strlen(dataset);
3912 		else
3913 			len = next - dataset;
3914 
3915 		zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
3916 		bcopy(dataset, zd->zd_dataset, len);
3917 		zd->zd_dataset[len] = '\0';
3918 
3919 		list_insert_head(&zone->zone_datasets, zd);
3920 
3921 		if (next == NULL)
3922 			break;
3923 
3924 		dataset = next + 1;
3925 	}
3926 
3927 	kmem_free(kbuf, buflen);
3928 	return (0);
3929 }
3930 
3931 /*
3932  * System call to create/initialize a new zone named 'zone_name', rooted
3933  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
3934  * and initialized with the zone-wide rctls described in 'rctlbuf', and
3935  * with labeling set by 'match', 'doi', and 'label'.
3936  *
3937  * If extended error is non-null, we may use it to return more detailed
3938  * error information.
3939  */
3940 static zoneid_t
3941 zone_create(const char *zone_name, const char *zone_root,
3942     const priv_set_t *zone_privs, size_t zone_privssz,
3943     caddr_t rctlbuf, size_t rctlbufsz,
3944     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
3945     int match, uint32_t doi, const bslabel_t *label,
3946     int flags)
3947 {
3948 	struct zsched_arg zarg;
3949 	nvlist_t *rctls = NULL;
3950 	proc_t *pp = curproc;
3951 	zone_t *zone, *ztmp;
3952 	zoneid_t zoneid;
3953 	int error;
3954 	int error2 = 0;
3955 	char *str;
3956 	cred_t *zkcr;
3957 	boolean_t insert_label_hash;
3958 
3959 	if (secpolicy_zone_config(CRED()) != 0)
3960 		return (set_errno(EPERM));
3961 
3962 	/* can't boot zone from within chroot environment */
3963 	if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
3964 		return (zone_create_error(ENOTSUP, ZE_CHROOTED,
3965 		    extended_error));
3966 
3967 	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
3968 	zoneid = zone->zone_id = id_alloc(zoneid_space);
3969 	zone->zone_status = ZONE_IS_UNINITIALIZED;
3970 	zone->zone_pool = pool_default;
3971 	zone->zone_pool_mod = gethrtime();
3972 	zone->zone_psetid = ZONE_PS_INVAL;
3973 	zone->zone_ncpus = 0;
3974 	zone->zone_ncpus_online = 0;
3975 	zone->zone_restart_init = B_TRUE;
3976 	zone->zone_brand = &native_brand;
3977 	zone->zone_initname = NULL;
3978 	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
3979 	mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
3980 	mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
3981 	cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
3982 	list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
3983 	    offsetof(struct zsd_entry, zsd_linkage));
3984 	list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
3985 	    offsetof(zone_dataset_t, zd_linkage));
3986 	list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
3987 	    offsetof(zone_dl_t, zdl_linkage));
3988 	rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
3989 	rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
3990 
3991 	if (flags & ZCF_NET_EXCL) {
3992 		zone->zone_flags |= ZF_NET_EXCL;
3993 	}
3994 
3995 	if ((error = zone_set_name(zone, zone_name)) != 0) {
3996 		zone_free(zone);
3997 		return (zone_create_error(error, 0, extended_error));
3998 	}
3999 
4000 	if ((error = zone_set_root(zone, zone_root)) != 0) {
4001 		zone_free(zone);
4002 		return (zone_create_error(error, 0, extended_error));
4003 	}
4004 	if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
4005 		zone_free(zone);
4006 		return (zone_create_error(error, 0, extended_error));
4007 	}
4008 
4009 	/* initialize node name to be the same as zone name */
4010 	zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4011 	(void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
4012 	zone->zone_nodename[_SYS_NMLN - 1] = '\0';
4013 
4014 	zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
4015 	zone->zone_domain[0] = '\0';
4016 	zone->zone_hostid = HW_INVALID_HOSTID;
4017 	zone->zone_shares = 1;
4018 	zone->zone_shmmax = 0;
4019 	zone->zone_ipc.ipcq_shmmni = 0;
4020 	zone->zone_ipc.ipcq_semmni = 0;
4021 	zone->zone_ipc.ipcq_msgmni = 0;
4022 	zone->zone_bootargs = NULL;
4023 	zone->zone_fs_allowed = NULL;
4024 	zone->zone_initname =
4025 	    kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
4026 	(void) strcpy(zone->zone_initname, zone_default_initname);
4027 	zone->zone_nlwps = 0;
4028 	zone->zone_nlwps_ctl = INT_MAX;
4029 	zone->zone_nprocs = 0;
4030 	zone->zone_nprocs_ctl = INT_MAX;
4031 	zone->zone_locked_mem = 0;
4032 	zone->zone_locked_mem_ctl = UINT64_MAX;
4033 	zone->zone_max_swap = 0;
4034 	zone->zone_max_swap_ctl = UINT64_MAX;
4035 	zone->zone_max_lofi = 0;
4036 	zone->zone_max_lofi_ctl = UINT64_MAX;
4037 	zone0.zone_lockedmem_kstat = NULL;
4038 	zone0.zone_swapresv_kstat = NULL;
4039 
4040 	/*
4041 	 * Zsched initializes the rctls.
4042 	 */
4043 	zone->zone_rctls = NULL;
4044 
4045 	if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
4046 		zone_free(zone);
4047 		return (zone_create_error(error, 0, extended_error));
4048 	}
4049 
4050 	if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
4051 		zone_free(zone);
4052 		return (set_errno(error));
4053 	}
4054 
4055 	/*
4056 	 * Read in the trusted system parameters:
4057 	 * match flag and sensitivity label.
4058 	 */
4059 	zone->zone_match = match;
4060 	if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4061 		/* Fail if requested to set doi to anything but system's doi */
4062 		if (doi != 0 && doi != default_doi) {
4063 			zone_free(zone);
4064 			return (set_errno(EINVAL));
4065 		}
4066 		/* Always apply system's doi to the zone */
4067 		error = zone_set_label(zone, label, default_doi);
4068 		if (error != 0) {
4069 			zone_free(zone);
4070 			return (set_errno(error));
4071 		}
4072 		insert_label_hash = B_TRUE;
4073 	} else {
4074 		/* all zones get an admin_low label if system is not labeled */
4075 		zone->zone_slabel = l_admin_low;
4076 		label_hold(l_admin_low);
4077 		insert_label_hash = B_FALSE;
4078 	}
4079 
4080 	/*
4081 	 * Stop all lwps since that's what normally happens as part of fork().
4082 	 * This needs to happen before we grab any locks to avoid deadlock
4083 	 * (another lwp in the process could be waiting for the held lock).
4084 	 */
4085 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
4086 		zone_free(zone);
4087 		if (rctls)
4088 			nvlist_free(rctls);
4089 		return (zone_create_error(error, 0, extended_error));
4090 	}
4091 
4092 	if (block_mounts() == 0) {
4093 		mutex_enter(&pp->p_lock);
4094 		if (curthread != pp->p_agenttp)
4095 			continuelwps(pp);
4096 		mutex_exit(&pp->p_lock);
4097 		zone_free(zone);
4098 		if (rctls)
4099 			nvlist_free(rctls);
4100 		return (zone_create_error(error, 0, extended_error));
4101 	}
4102 
4103 	/*
4104 	 * Set up credential for kernel access.  After this, any errors
4105 	 * should go through the dance in errout rather than calling
4106 	 * zone_free directly.
4107 	 */
4108 	zone->zone_kcred = crdup(kcred);
4109 	crsetzone(zone->zone_kcred, zone);
4110 	priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
4111 	priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
4112 	priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
4113 	priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
4114 
4115 	mutex_enter(&zonehash_lock);
4116 	/*
4117 	 * Make sure zone doesn't already exist.
4118 	 *
4119 	 * If the system and zone are labeled,
4120 	 * make sure no other zone exists that has the same label.
4121 	 */
4122 	if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
4123 	    (insert_label_hash &&
4124 	    (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
4125 		zone_status_t status;
4126 
4127 		status = zone_status_get(ztmp);
4128 		if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
4129 			error = EEXIST;
4130 		else
4131 			error = EBUSY;
4132 
4133 		if (insert_label_hash)
4134 			error2 = ZE_LABELINUSE;
4135 
4136 		goto errout;
4137 	}
4138 
4139 	/*
4140 	 * Don't allow zone creations which would cause one zone's rootpath to
4141 	 * be accessible from that of another (non-global) zone.
4142 	 */
4143 	if (zone_is_nested(zone->zone_rootpath)) {
4144 		error = EBUSY;
4145 		goto errout;
4146 	}
4147 
4148 	ASSERT(zonecount != 0);		/* check for leaks */
4149 	if (zonecount + 1 > maxzones) {
4150 		error = ENOMEM;
4151 		goto errout;
4152 	}
4153 
4154 	if (zone_mount_count(zone->zone_rootpath) != 0) {
4155 		error = EBUSY;
4156 		error2 = ZE_AREMOUNTS;
4157 		goto errout;
4158 	}
4159 
4160 	/*
4161 	 * Zone is still incomplete, but we need to drop all locks while
4162 	 * zsched() initializes this zone's kernel process.  We
4163 	 * optimistically add the zone to the hashtable and associated
4164 	 * lists so a parallel zone_create() doesn't try to create the
4165 	 * same zone.
4166 	 */
4167 	zonecount++;
4168 	(void) mod_hash_insert(zonehashbyid,
4169 	    (mod_hash_key_t)(uintptr_t)zone->zone_id,
4170 	    (mod_hash_val_t)(uintptr_t)zone);
4171 	str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
4172 	(void) strcpy(str, zone->zone_name);
4173 	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
4174 	    (mod_hash_val_t)(uintptr_t)zone);
4175 	if (insert_label_hash) {
4176 		(void) mod_hash_insert(zonehashbylabel,
4177 		    (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
4178 		zone->zone_flags |= ZF_HASHED_LABEL;
4179 	}
4180 
4181 	/*
4182 	 * Insert into active list.  At this point there are no 'hold's
4183 	 * on the zone, but everyone else knows not to use it, so we can
4184 	 * continue to use it.  zsched() will do a zone_hold() if the
4185 	 * newproc() is successful.
4186 	 */
4187 	list_insert_tail(&zone_active, zone);
4188 	mutex_exit(&zonehash_lock);
4189 
4190 	zarg.zone = zone;
4191 	zarg.nvlist = rctls;
4192 	/*
4193 	 * The process, task, and project rctls are probably wrong;
4194 	 * we need an interface to get the default values of all rctls,
4195 	 * and initialize zsched appropriately.  I'm not sure that that
4196 	 * makes much of a difference, though.
4197 	 */
4198 	error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0);
4199 	if (error != 0) {
4200 		/*
4201 		 * We need to undo all globally visible state.
4202 		 */
4203 		mutex_enter(&zonehash_lock);
4204 		list_remove(&zone_active, zone);
4205 		if (zone->zone_flags & ZF_HASHED_LABEL) {
4206 			ASSERT(zone->zone_slabel != NULL);
4207 			(void) mod_hash_destroy(zonehashbylabel,
4208 			    (mod_hash_key_t)zone->zone_slabel);
4209 		}
4210 		(void) mod_hash_destroy(zonehashbyname,
4211 		    (mod_hash_key_t)(uintptr_t)zone->zone_name);
4212 		(void) mod_hash_destroy(zonehashbyid,
4213 		    (mod_hash_key_t)(uintptr_t)zone->zone_id);
4214 		ASSERT(zonecount > 1);
4215 		zonecount--;
4216 		goto errout;
4217 	}
4218 
4219 	/*
4220 	 * Zone creation can't fail from now on.
4221 	 */
4222 
4223 	/*
4224 	 * Create zone kstats
4225 	 */
4226 	zone_kstat_create(zone);
4227 
4228 	/*
4229 	 * Let the other lwps continue.
4230 	 */
4231 	mutex_enter(&pp->p_lock);
4232 	if (curthread != pp->p_agenttp)
4233 		continuelwps(pp);
4234 	mutex_exit(&pp->p_lock);
4235 
4236 	/*
4237 	 * Wait for zsched to finish initializing the zone.
4238 	 */
4239 	zone_status_wait(zone, ZONE_IS_READY);
4240 	/*
4241 	 * The zone is fully visible, so we can let mounts progress.
4242 	 */
4243 	resume_mounts();
4244 	if (rctls)
4245 		nvlist_free(rctls);
4246 
4247 	return (zoneid);
4248 
4249 errout:
4250 	mutex_exit(&zonehash_lock);
4251 	/*
4252 	 * Let the other lwps continue.
4253 	 */
4254 	mutex_enter(&pp->p_lock);
4255 	if (curthread != pp->p_agenttp)
4256 		continuelwps(pp);
4257 	mutex_exit(&pp->p_lock);
4258 
4259 	resume_mounts();
4260 	if (rctls)
4261 		nvlist_free(rctls);
4262 	/*
4263 	 * There is currently one reference to the zone, a cred_ref from
4264 	 * zone_kcred.  To free the zone, we call crfree, which will call
4265 	 * zone_cred_rele, which will call zone_free.
4266 	 */
4267 	ASSERT(zone->zone_cred_ref == 1);	/* for zone_kcred */
4268 	ASSERT(zone->zone_kcred->cr_ref == 1);
4269 	ASSERT(zone->zone_ref == 0);
4270 	zkcr = zone->zone_kcred;
4271 	zone->zone_kcred = NULL;
4272 	crfree(zkcr);				/* triggers call to zone_free */
4273 	return (zone_create_error(error, error2, extended_error));
4274 }
4275 
4276 /*
4277  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4278  * the heavy lifting.  initname is the path to the program to launch
4279  * at the "top" of the zone; if this is NULL, we use the system default,
4280  * which is stored at zone_default_initname.
4281  */
4282 static int
4283 zone_boot(zoneid_t zoneid)
4284 {
4285 	int err;
4286 	zone_t *zone;
4287 
4288 	if (secpolicy_zone_config(CRED()) != 0)
4289 		return (set_errno(EPERM));
4290 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4291 		return (set_errno(EINVAL));
4292 
4293 	mutex_enter(&zonehash_lock);
4294 	/*
4295 	 * Look for zone under hash lock to prevent races with calls to
4296 	 * zone_shutdown, zone_destroy, etc.
4297 	 */
4298 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4299 		mutex_exit(&zonehash_lock);
4300 		return (set_errno(EINVAL));
4301 	}
4302 
4303 	mutex_enter(&zone_status_lock);
4304 	if (zone_status_get(zone) != ZONE_IS_READY) {
4305 		mutex_exit(&zone_status_lock);
4306 		mutex_exit(&zonehash_lock);
4307 		return (set_errno(EINVAL));
4308 	}
4309 	zone_status_set(zone, ZONE_IS_BOOTING);
4310 	mutex_exit(&zone_status_lock);
4311 
4312 	zone_hold(zone);	/* so we can use the zone_t later */
4313 	mutex_exit(&zonehash_lock);
4314 
4315 	if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4316 		zone_rele(zone);
4317 		return (set_errno(EINTR));
4318 	}
4319 
4320 	/*
4321 	 * Boot (starting init) might have failed, in which case the zone
4322 	 * will go to the SHUTTING_DOWN state; an appropriate errno will
4323 	 * be placed in zone->zone_boot_err, and so we return that.
4324 	 */
4325 	err = zone->zone_boot_err;
4326 	zone_rele(zone);
4327 	return (err ? set_errno(err) : 0);
4328 }
4329 
4330 /*
4331  * Kills all user processes in the zone, waiting for them all to exit
4332  * before returning.
4333  */
4334 static int
4335 zone_empty(zone_t *zone)
4336 {
4337 	int waitstatus;
4338 
4339 	/*
4340 	 * We need to drop zonehash_lock before killing all
4341 	 * processes, otherwise we'll deadlock with zone_find_*
4342 	 * which can be called from the exit path.
4343 	 */
4344 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4345 	while ((waitstatus = zone_status_timedwait_sig(zone,
4346 	    ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
4347 		killall(zone->zone_id);
4348 	}
4349 	/*
4350 	 * return EINTR if we were signaled
4351 	 */
4352 	if (waitstatus == 0)
4353 		return (EINTR);
4354 	return (0);
4355 }
4356 
4357 /*
4358  * This function implements the policy for zone visibility.
4359  *
4360  * In standard Solaris, a non-global zone can only see itself.
4361  *
4362  * In Trusted Extensions, a labeled zone can lookup any zone whose label
4363  * it dominates. For this test, the label of the global zone is treated as
4364  * admin_high so it is special-cased instead of being checked for dominance.
4365  *
4366  * Returns true if zone attributes are viewable, false otherwise.
4367  */
4368 static boolean_t
4369 zone_list_access(zone_t *zone)
4370 {
4371 
4372 	if (curproc->p_zone == global_zone ||
4373 	    curproc->p_zone == zone) {
4374 		return (B_TRUE);
4375 	} else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4376 		bslabel_t *curproc_label;
4377 		bslabel_t *zone_label;
4378 
4379 		curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4380 		zone_label = label2bslabel(zone->zone_slabel);
4381 
4382 		if (zone->zone_id != GLOBAL_ZONEID &&
4383 		    bldominates(curproc_label, zone_label)) {
4384 			return (B_TRUE);
4385 		} else {
4386 			return (B_FALSE);
4387 		}
4388 	} else {
4389 		return (B_FALSE);
4390 	}
4391 }
4392 
4393 /*
4394  * Systemcall to start the zone's halt sequence.  By the time this
4395  * function successfully returns, all user processes and kernel threads
4396  * executing in it will have exited, ZSD shutdown callbacks executed,
4397  * and the zone status set to ZONE_IS_DOWN.
4398  *
4399  * It is possible that the call will interrupt itself if the caller is the
4400  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4401  */
4402 static int
4403 zone_shutdown(zoneid_t zoneid)
4404 {
4405 	int error;
4406 	zone_t *zone;
4407 	zone_status_t status;
4408 
4409 	if (secpolicy_zone_config(CRED()) != 0)
4410 		return (set_errno(EPERM));
4411 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4412 		return (set_errno(EINVAL));
4413 
4414 	/*
4415 	 * Block mounts so that VFS_MOUNT() can get an accurate view of
4416 	 * the zone's status with regards to ZONE_IS_SHUTTING down.
4417 	 *
4418 	 * e.g. NFS can fail the mount if it determines that the zone
4419 	 * has already begun the shutdown sequence.
4420 	 */
4421 	if (block_mounts() == 0)
4422 		return (set_errno(EINTR));
4423 	mutex_enter(&zonehash_lock);
4424 	/*
4425 	 * Look for zone under hash lock to prevent races with other
4426 	 * calls to zone_shutdown and zone_destroy.
4427 	 */
4428 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4429 		mutex_exit(&zonehash_lock);
4430 		resume_mounts();
4431 		return (set_errno(EINVAL));
4432 	}
4433 	mutex_enter(&zone_status_lock);
4434 	status = zone_status_get(zone);
4435 	/*
4436 	 * Fail if the zone isn't fully initialized yet.
4437 	 */
4438 	if (status < ZONE_IS_READY) {
4439 		mutex_exit(&zone_status_lock);
4440 		mutex_exit(&zonehash_lock);
4441 		resume_mounts();
4442 		return (set_errno(EINVAL));
4443 	}
4444 	/*
4445 	 * If conditions required for zone_shutdown() to return have been met,
4446 	 * return success.
4447 	 */
4448 	if (status >= ZONE_IS_DOWN) {
4449 		mutex_exit(&zone_status_lock);
4450 		mutex_exit(&zonehash_lock);
4451 		resume_mounts();
4452 		return (0);
4453 	}
4454 	/*
4455 	 * If zone_shutdown() hasn't been called before, go through the motions.
4456 	 * If it has, there's nothing to do but wait for the kernel threads to
4457 	 * drain.
4458 	 */
4459 	if (status < ZONE_IS_EMPTY) {
4460 		uint_t ntasks;
4461 
4462 		mutex_enter(&zone->zone_lock);
4463 		if ((ntasks = zone->zone_ntasks) != 1) {
4464 			/*
4465 			 * There's still stuff running.
4466 			 */
4467 			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4468 		}
4469 		mutex_exit(&zone->zone_lock);
4470 		if (ntasks == 1) {
4471 			/*
4472 			 * The only way to create another task is through
4473 			 * zone_enter(), which will block until we drop
4474 			 * zonehash_lock.  The zone is empty.
4475 			 */
4476 			if (zone->zone_kthreads == NULL) {
4477 				/*
4478 				 * Skip ahead to ZONE_IS_DOWN
4479 				 */
4480 				zone_status_set(zone, ZONE_IS_DOWN);
4481 			} else {
4482 				zone_status_set(zone, ZONE_IS_EMPTY);
4483 			}
4484 		}
4485 	}
4486 	zone_hold(zone);	/* so we can use the zone_t later */
4487 	mutex_exit(&zone_status_lock);
4488 	mutex_exit(&zonehash_lock);
4489 	resume_mounts();
4490 
4491 	if (error = zone_empty(zone)) {
4492 		zone_rele(zone);
4493 		return (set_errno(error));
4494 	}
4495 	/*
4496 	 * After the zone status goes to ZONE_IS_DOWN this zone will no
4497 	 * longer be notified of changes to the pools configuration, so
4498 	 * in order to not end up with a stale pool pointer, we point
4499 	 * ourselves at the default pool and remove all resource
4500 	 * visibility.  This is especially important as the zone_t may
4501 	 * languish on the deathrow for a very long time waiting for
4502 	 * cred's to drain out.
4503 	 *
4504 	 * This rebinding of the zone can happen multiple times
4505 	 * (presumably due to interrupted or parallel systemcalls)
4506 	 * without any adverse effects.
4507 	 */
4508 	if (pool_lock_intr() != 0) {
4509 		zone_rele(zone);
4510 		return (set_errno(EINTR));
4511 	}
4512 	if (pool_state == POOL_ENABLED) {
4513 		mutex_enter(&cpu_lock);
4514 		zone_pool_set(zone, pool_default);
4515 		/*
4516 		 * The zone no longer needs to be able to see any cpus.
4517 		 */
4518 		zone_pset_set(zone, ZONE_PS_INVAL);
4519 		mutex_exit(&cpu_lock);
4520 	}
4521 	pool_unlock();
4522 
4523 	/*
4524 	 * ZSD shutdown callbacks can be executed multiple times, hence
4525 	 * it is safe to not be holding any locks across this call.
4526 	 */
4527 	zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
4528 
4529 	mutex_enter(&zone_status_lock);
4530 	if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
4531 		zone_status_set(zone, ZONE_IS_DOWN);
4532 	mutex_exit(&zone_status_lock);
4533 
4534 	/*
4535 	 * Wait for kernel threads to drain.
4536 	 */
4537 	if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
4538 		zone_rele(zone);
4539 		return (set_errno(EINTR));
4540 	}
4541 
4542 	/*
4543 	 * Zone can be become down/destroyable even if the above wait
4544 	 * returns EINTR, so any code added here may never execute.
4545 	 * (i.e. don't add code here)
4546 	 */
4547 
4548 	zone_rele(zone);
4549 	return (0);
4550 }
4551 
4552 /*
4553  * Systemcall entry point to finalize the zone halt process.  The caller
4554  * must have already successfully called zone_shutdown().
4555  *
4556  * Upon successful completion, the zone will have been fully destroyed:
4557  * zsched will have exited, destructor callbacks executed, and the zone
4558  * removed from the list of active zones.
4559  */
4560 static int
4561 zone_destroy(zoneid_t zoneid)
4562 {
4563 	uint64_t uniqid;
4564 	zone_t *zone;
4565 	zone_status_t status;
4566 
4567 	if (secpolicy_zone_config(CRED()) != 0)
4568 		return (set_errno(EPERM));
4569 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4570 		return (set_errno(EINVAL));
4571 
4572 	mutex_enter(&zonehash_lock);
4573 	/*
4574 	 * Look for zone under hash lock to prevent races with other
4575 	 * calls to zone_destroy.
4576 	 */
4577 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4578 		mutex_exit(&zonehash_lock);
4579 		return (set_errno(EINVAL));
4580 	}
4581 
4582 	if (zone_mount_count(zone->zone_rootpath) != 0) {
4583 		mutex_exit(&zonehash_lock);
4584 		return (set_errno(EBUSY));
4585 	}
4586 	mutex_enter(&zone_status_lock);
4587 	status = zone_status_get(zone);
4588 	if (status < ZONE_IS_DOWN) {
4589 		mutex_exit(&zone_status_lock);
4590 		mutex_exit(&zonehash_lock);
4591 		return (set_errno(EBUSY));
4592 	} else if (status == ZONE_IS_DOWN) {
4593 		zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
4594 	}
4595 	mutex_exit(&zone_status_lock);
4596 	zone_hold(zone);
4597 	mutex_exit(&zonehash_lock);
4598 
4599 	/*
4600 	 * wait for zsched to exit
4601 	 */
4602 	zone_status_wait(zone, ZONE_IS_DEAD);
4603 	zone_zsd_callbacks(zone, ZSD_DESTROY);
4604 	zone->zone_netstack = NULL;
4605 	uniqid = zone->zone_uniqid;
4606 	zone_rele(zone);
4607 	zone = NULL;	/* potentially free'd */
4608 
4609 	mutex_enter(&zonehash_lock);
4610 	for (; /* ever */; ) {
4611 		boolean_t unref;
4612 
4613 		if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
4614 		    zone->zone_uniqid != uniqid) {
4615 			/*
4616 			 * The zone has gone away.  Necessary conditions
4617 			 * are met, so we return success.
4618 			 */
4619 			mutex_exit(&zonehash_lock);
4620 			return (0);
4621 		}
4622 		mutex_enter(&zone->zone_lock);
4623 		unref = ZONE_IS_UNREF(zone);
4624 		mutex_exit(&zone->zone_lock);
4625 		if (unref) {
4626 			/*
4627 			 * There is only one reference to the zone -- that
4628 			 * added when the zone was added to the hashtables --
4629 			 * and things will remain this way until we drop
4630 			 * zonehash_lock... we can go ahead and cleanup the
4631 			 * zone.
4632 			 */
4633 			break;
4634 		}
4635 
4636 		if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
4637 			/* Signaled */
4638 			mutex_exit(&zonehash_lock);
4639 			return (set_errno(EINTR));
4640 		}
4641 
4642 	}
4643 
4644 	/*
4645 	 * Remove CPU cap for this zone now since we're not going to
4646 	 * fail below this point.
4647 	 */
4648 	cpucaps_zone_remove(zone);
4649 
4650 	/* Get rid of the zone's kstats */
4651 	zone_kstat_delete(zone);
4652 
4653 	/* remove the pfexecd doors */
4654 	if (zone->zone_pfexecd != NULL) {
4655 		klpd_freelist(&zone->zone_pfexecd);
4656 		zone->zone_pfexecd = NULL;
4657 	}
4658 
4659 	/* free brand specific data */
4660 	if (ZONE_IS_BRANDED(zone))
4661 		ZBROP(zone)->b_free_brand_data(zone);
4662 
4663 	/* Say goodbye to brand framework. */
4664 	brand_unregister_zone(zone->zone_brand);
4665 
4666 	/*
4667 	 * It is now safe to let the zone be recreated; remove it from the
4668 	 * lists.  The memory will not be freed until the last cred
4669 	 * reference goes away.
4670 	 */
4671 	ASSERT(zonecount > 1);	/* must be > 1; can't destroy global zone */
4672 	zonecount--;
4673 	/* remove from active list and hash tables */
4674 	list_remove(&zone_active, zone);
4675 	(void) mod_hash_destroy(zonehashbyname,
4676 	    (mod_hash_key_t)zone->zone_name);
4677 	(void) mod_hash_destroy(zonehashbyid,
4678 	    (mod_hash_key_t)(uintptr_t)zone->zone_id);
4679 	if (zone->zone_flags & ZF_HASHED_LABEL)
4680 		(void) mod_hash_destroy(zonehashbylabel,
4681 		    (mod_hash_key_t)zone->zone_slabel);
4682 	mutex_exit(&zonehash_lock);
4683 
4684 	/*
4685 	 * Release the root vnode; we're not using it anymore.  Nor should any
4686 	 * other thread that might access it exist.
4687 	 */
4688 	if (zone->zone_rootvp != NULL) {
4689 		VN_RELE(zone->zone_rootvp);
4690 		zone->zone_rootvp = NULL;
4691 	}
4692 
4693 	/* add to deathrow list */
4694 	mutex_enter(&zone_deathrow_lock);
4695 	list_insert_tail(&zone_deathrow, zone);
4696 	mutex_exit(&zone_deathrow_lock);
4697 
4698 	/*
4699 	 * Drop last reference (which was added by zsched()), this will
4700 	 * free the zone unless there are outstanding cred references.
4701 	 */
4702 	zone_rele(zone);
4703 	return (0);
4704 }
4705 
4706 /*
4707  * Systemcall entry point for zone_getattr(2).
4708  */
4709 static ssize_t
4710 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
4711 {
4712 	size_t size;
4713 	int error = 0, err;
4714 	zone_t *zone;
4715 	char *zonepath;
4716 	char *outstr;
4717 	zone_status_t zone_status;
4718 	pid_t initpid;
4719 	boolean_t global = (curzone == global_zone);
4720 	boolean_t inzone = (curzone->zone_id == zoneid);
4721 	ushort_t flags;
4722 
4723 	mutex_enter(&zonehash_lock);
4724 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4725 		mutex_exit(&zonehash_lock);
4726 		return (set_errno(EINVAL));
4727 	}
4728 	zone_status = zone_status_get(zone);
4729 	if (zone_status < ZONE_IS_INITIALIZED) {
4730 		mutex_exit(&zonehash_lock);
4731 		return (set_errno(EINVAL));
4732 	}
4733 	zone_hold(zone);
4734 	mutex_exit(&zonehash_lock);
4735 
4736 	/*
4737 	 * If not in the global zone, don't show information about other zones,
4738 	 * unless the system is labeled and the local zone's label dominates
4739 	 * the other zone.
4740 	 */
4741 	if (!zone_list_access(zone)) {
4742 		zone_rele(zone);
4743 		return (set_errno(EINVAL));
4744 	}
4745 
4746 	switch (attr) {
4747 	case ZONE_ATTR_ROOT:
4748 		if (global) {
4749 			/*
4750 			 * Copy the path to trim the trailing "/" (except for
4751 			 * the global zone).
4752 			 */
4753 			if (zone != global_zone)
4754 				size = zone->zone_rootpathlen - 1;
4755 			else
4756 				size = zone->zone_rootpathlen;
4757 			zonepath = kmem_alloc(size, KM_SLEEP);
4758 			bcopy(zone->zone_rootpath, zonepath, size);
4759 			zonepath[size - 1] = '\0';
4760 		} else {
4761 			if (inzone || !is_system_labeled()) {
4762 				/*
4763 				 * Caller is not in the global zone.
4764 				 * if the query is on the current zone
4765 				 * or the system is not labeled,
4766 				 * just return faked-up path for current zone.
4767 				 */
4768 				zonepath = "/";
4769 				size = 2;
4770 			} else {
4771 				/*
4772 				 * Return related path for current zone.
4773 				 */
4774 				int prefix_len = strlen(zone_prefix);
4775 				int zname_len = strlen(zone->zone_name);
4776 
4777 				size = prefix_len + zname_len + 1;
4778 				zonepath = kmem_alloc(size, KM_SLEEP);
4779 				bcopy(zone_prefix, zonepath, prefix_len);
4780 				bcopy(zone->zone_name, zonepath +
4781 				    prefix_len, zname_len);
4782 				zonepath[size - 1] = '\0';
4783 			}
4784 		}
4785 		if (bufsize > size)
4786 			bufsize = size;
4787 		if (buf != NULL) {
4788 			err = copyoutstr(zonepath, buf, bufsize, NULL);
4789 			if (err != 0 && err != ENAMETOOLONG)
4790 				error = EFAULT;
4791 		}
4792 		if (global || (is_system_labeled() && !inzone))
4793 			kmem_free(zonepath, size);
4794 		break;
4795 
4796 	case ZONE_ATTR_NAME:
4797 		size = strlen(zone->zone_name) + 1;
4798 		if (bufsize > size)
4799 			bufsize = size;
4800 		if (buf != NULL) {
4801 			err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
4802 			if (err != 0 && err != ENAMETOOLONG)
4803 				error = EFAULT;
4804 		}
4805 		break;
4806 
4807 	case ZONE_ATTR_STATUS:
4808 		/*
4809 		 * Since we're not holding zonehash_lock, the zone status
4810 		 * may be anything; leave it up to userland to sort it out.
4811 		 */
4812 		size = sizeof (zone_status);
4813 		if (bufsize > size)
4814 			bufsize = size;
4815 		zone_status = zone_status_get(zone);
4816 		if (buf != NULL &&
4817 		    copyout(&zone_status, buf, bufsize) != 0)
4818 			error = EFAULT;
4819 		break;
4820 	case ZONE_ATTR_FLAGS:
4821 		size = sizeof (zone->zone_flags);
4822 		if (bufsize > size)
4823 			bufsize = size;
4824 		flags = zone->zone_flags;
4825 		if (buf != NULL &&
4826 		    copyout(&flags, buf, bufsize) != 0)
4827 			error = EFAULT;
4828 		break;
4829 	case ZONE_ATTR_PRIVSET:
4830 		size = sizeof (priv_set_t);
4831 		if (bufsize > size)
4832 			bufsize = size;
4833 		if (buf != NULL &&
4834 		    copyout(zone->zone_privset, buf, bufsize) != 0)
4835 			error = EFAULT;
4836 		break;
4837 	case ZONE_ATTR_UNIQID:
4838 		size = sizeof (zone->zone_uniqid);
4839 		if (bufsize > size)
4840 			bufsize = size;
4841 		if (buf != NULL &&
4842 		    copyout(&zone->zone_uniqid, buf, bufsize) != 0)
4843 			error = EFAULT;
4844 		break;
4845 	case ZONE_ATTR_POOLID:
4846 		{
4847 			pool_t *pool;
4848 			poolid_t poolid;
4849 
4850 			if (pool_lock_intr() != 0) {
4851 				error = EINTR;
4852 				break;
4853 			}
4854 			pool = zone_pool_get(zone);
4855 			poolid = pool->pool_id;
4856 			pool_unlock();
4857 			size = sizeof (poolid);
4858 			if (bufsize > size)
4859 				bufsize = size;
4860 			if (buf != NULL && copyout(&poolid, buf, size) != 0)
4861 				error = EFAULT;
4862 		}
4863 		break;
4864 	case ZONE_ATTR_SLBL:
4865 		size = sizeof (bslabel_t);
4866 		if (bufsize > size)
4867 			bufsize = size;
4868 		if (zone->zone_slabel == NULL)
4869 			error = EINVAL;
4870 		else if (buf != NULL &&
4871 		    copyout(label2bslabel(zone->zone_slabel), buf,
4872 		    bufsize) != 0)
4873 			error = EFAULT;
4874 		break;
4875 	case ZONE_ATTR_INITPID:
4876 		size = sizeof (initpid);
4877 		if (bufsize > size)
4878 			bufsize = size;
4879 		initpid = zone->zone_proc_initpid;
4880 		if (initpid == -1) {
4881 			error = ESRCH;
4882 			break;
4883 		}
4884 		if (buf != NULL &&
4885 		    copyout(&initpid, buf, bufsize) != 0)
4886 			error = EFAULT;
4887 		break;
4888 	case ZONE_ATTR_BRAND:
4889 		size = strlen(zone->zone_brand->b_name) + 1;
4890 
4891 		if (bufsize > size)
4892 			bufsize = size;
4893 		if (buf != NULL) {
4894 			err = copyoutstr(zone->zone_brand->b_name, buf,
4895 			    bufsize, NULL);
4896 			if (err != 0 && err != ENAMETOOLONG)
4897 				error = EFAULT;
4898 		}
4899 		break;
4900 	case ZONE_ATTR_INITNAME:
4901 		size = strlen(zone->zone_initname) + 1;
4902 		if (bufsize > size)
4903 			bufsize = size;
4904 		if (buf != NULL) {
4905 			err = copyoutstr(zone->zone_initname, buf, bufsize,
4906 			    NULL);
4907 			if (err != 0 && err != ENAMETOOLONG)
4908 				error = EFAULT;
4909 		}
4910 		break;
4911 	case ZONE_ATTR_BOOTARGS:
4912 		if (zone->zone_bootargs == NULL)
4913 			outstr = "";
4914 		else
4915 			outstr = zone->zone_bootargs;
4916 		size = strlen(outstr) + 1;
4917 		if (bufsize > size)
4918 			bufsize = size;
4919 		if (buf != NULL) {
4920 			err = copyoutstr(outstr, buf, bufsize, NULL);
4921 			if (err != 0 && err != ENAMETOOLONG)
4922 				error = EFAULT;
4923 		}
4924 		break;
4925 	case ZONE_ATTR_PHYS_MCAP:
4926 		size = sizeof (zone->zone_phys_mcap);
4927 		if (bufsize > size)
4928 			bufsize = size;
4929 		if (buf != NULL &&
4930 		    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
4931 			error = EFAULT;
4932 		break;
4933 	case ZONE_ATTR_SCHED_CLASS:
4934 		mutex_enter(&class_lock);
4935 
4936 		if (zone->zone_defaultcid >= loaded_classes)
4937 			outstr = "";
4938 		else
4939 			outstr = sclass[zone->zone_defaultcid].cl_name;
4940 		size = strlen(outstr) + 1;
4941 		if (bufsize > size)
4942 			bufsize = size;
4943 		if (buf != NULL) {
4944 			err = copyoutstr(outstr, buf, bufsize, NULL);
4945 			if (err != 0 && err != ENAMETOOLONG)
4946 				error = EFAULT;
4947 		}
4948 
4949 		mutex_exit(&class_lock);
4950 		break;
4951 	case ZONE_ATTR_HOSTID:
4952 		if (zone->zone_hostid != HW_INVALID_HOSTID &&
4953 		    bufsize == sizeof (zone->zone_hostid)) {
4954 			size = sizeof (zone->zone_hostid);
4955 			if (buf != NULL && copyout(&zone->zone_hostid, buf,
4956 			    bufsize) != 0)
4957 				error = EFAULT;
4958 		} else {
4959 			error = EINVAL;
4960 		}
4961 		break;
4962 	case ZONE_ATTR_FS_ALLOWED:
4963 		if (zone->zone_fs_allowed == NULL)
4964 			outstr = "";
4965 		else
4966 			outstr = zone->zone_fs_allowed;
4967 		size = strlen(outstr) + 1;
4968 		if (bufsize > size)
4969 			bufsize = size;
4970 		if (buf != NULL) {
4971 			err = copyoutstr(outstr, buf, bufsize, NULL);
4972 			if (err != 0 && err != ENAMETOOLONG)
4973 				error = EFAULT;
4974 		}
4975 		break;
4976 	default:
4977 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
4978 			size = bufsize;
4979 			error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
4980 		} else {
4981 			error = EINVAL;
4982 		}
4983 	}
4984 	zone_rele(zone);
4985 
4986 	if (error)
4987 		return (set_errno(error));
4988 	return ((ssize_t)size);
4989 }
4990 
4991 /*
4992  * Systemcall entry point for zone_setattr(2).
4993  */
4994 /*ARGSUSED*/
4995 static int
4996 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
4997 {
4998 	zone_t *zone;
4999 	zone_status_t zone_status;
5000 	int err;
5001 
5002 	if (secpolicy_zone_config(CRED()) != 0)
5003 		return (set_errno(EPERM));
5004 
5005 	/*
5006 	 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
5007 	 * global zone.
5008 	 */
5009 	if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
5010 		return (set_errno(EINVAL));
5011 	}
5012 
5013 	mutex_enter(&zonehash_lock);
5014 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
5015 		mutex_exit(&zonehash_lock);
5016 		return (set_errno(EINVAL));
5017 	}
5018 	zone_hold(zone);
5019 	mutex_exit(&zonehash_lock);
5020 
5021 	/*
5022 	 * At present most attributes can only be set on non-running,
5023 	 * non-global zones.
5024 	 */
5025 	zone_status = zone_status_get(zone);
5026 	if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY)
5027 		goto done;
5028 
5029 	switch (attr) {
5030 	case ZONE_ATTR_INITNAME:
5031 		err = zone_set_initname(zone, (const char *)buf);
5032 		break;
5033 	case ZONE_ATTR_BOOTARGS:
5034 		err = zone_set_bootargs(zone, (const char *)buf);
5035 		break;
5036 	case ZONE_ATTR_BRAND:
5037 		err = zone_set_brand(zone, (const char *)buf);
5038 		break;
5039 	case ZONE_ATTR_FS_ALLOWED:
5040 		err = zone_set_fs_allowed(zone, (const char *)buf);
5041 		break;
5042 	case ZONE_ATTR_PHYS_MCAP:
5043 		err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
5044 		break;
5045 	case ZONE_ATTR_SCHED_CLASS:
5046 		err = zone_set_sched_class(zone, (const char *)buf);
5047 		break;
5048 	case ZONE_ATTR_HOSTID:
5049 		if (bufsize == sizeof (zone->zone_hostid)) {
5050 			if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
5051 				err = 0;
5052 			else
5053 				err = EFAULT;
5054 		} else {
5055 			err = EINVAL;
5056 		}
5057 		break;
5058 	default:
5059 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
5060 			err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
5061 		else
5062 			err = EINVAL;
5063 	}
5064 
5065 done:
5066 	zone_rele(zone);
5067 	return (err != 0 ? set_errno(err) : 0);
5068 }
5069 
5070 /*
5071  * Return zero if the process has at least one vnode mapped in to its
5072  * address space which shouldn't be allowed to change zones.
5073  *
5074  * Also return zero if the process has any shared mappings which reserve
5075  * swap.  This is because the counting for zone.max-swap does not allow swap
5076  * reservation to be shared between zones.  zone swap reservation is counted
5077  * on zone->zone_max_swap.
5078  */
5079 static int
5080 as_can_change_zones(void)
5081 {
5082 	proc_t *pp = curproc;
5083 	struct seg *seg;
5084 	struct as *as = pp->p_as;
5085 	vnode_t *vp;
5086 	int allow = 1;
5087 
5088 	ASSERT(pp->p_as != &kas);
5089 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
5090 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
5091 
5092 		/*
5093 		 * Cannot enter zone with shared anon memory which
5094 		 * reserves swap.  See comment above.
5095 		 */
5096 		if (seg_can_change_zones(seg) == B_FALSE) {
5097 			allow = 0;
5098 			break;
5099 		}
5100 		/*
5101 		 * if we can't get a backing vnode for this segment then skip
5102 		 * it.
5103 		 */
5104 		vp = NULL;
5105 		if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
5106 			continue;
5107 		if (!vn_can_change_zones(vp)) { /* bail on first match */
5108 			allow = 0;
5109 			break;
5110 		}
5111 	}
5112 	AS_LOCK_EXIT(as, &as->a_lock);
5113 	return (allow);
5114 }
5115 
5116 /*
5117  * Count swap reserved by curproc's address space
5118  */
5119 static size_t
5120 as_swresv(void)
5121 {
5122 	proc_t *pp = curproc;
5123 	struct seg *seg;
5124 	struct as *as = pp->p_as;
5125 	size_t swap = 0;
5126 
5127 	ASSERT(pp->p_as != &kas);
5128 	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
5129 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
5130 		swap += seg_swresv(seg);
5131 
5132 	return (swap);
5133 }
5134 
5135 /*
5136  * Systemcall entry point for zone_enter().
5137  *
5138  * The current process is injected into said zone.  In the process
5139  * it will change its project membership, privileges, rootdir/cwd,
5140  * zone-wide rctls, and pool association to match those of the zone.
5141  *
5142  * The first zone_enter() called while the zone is in the ZONE_IS_READY
5143  * state will transition it to ZONE_IS_RUNNING.  Processes may only
5144  * enter a zone that is "ready" or "running".
5145  */
5146 static int
5147 zone_enter(zoneid_t zoneid)
5148 {
5149 	zone_t *zone;
5150 	vnode_t *vp;
5151 	proc_t *pp = curproc;
5152 	contract_t *ct;
5153 	cont_process_t *ctp;
5154 	task_t *tk, *oldtk;
5155 	kproject_t *zone_proj0;
5156 	cred_t *cr, *newcr;
5157 	pool_t *oldpool, *newpool;
5158 	sess_t *sp;
5159 	uid_t uid;
5160 	zone_status_t status;
5161 	int err = 0;
5162 	rctl_entity_p_t e;
5163 	size_t swap;
5164 	kthread_id_t t;
5165 
5166 	if (secpolicy_zone_config(CRED()) != 0)
5167 		return (set_errno(EPERM));
5168 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
5169 		return (set_errno(EINVAL));
5170 
5171 	/*
5172 	 * Stop all lwps so we don't need to hold a lock to look at
5173 	 * curproc->p_zone.  This needs to happen before we grab any
5174 	 * locks to avoid deadlock (another lwp in the process could
5175 	 * be waiting for the held lock).
5176 	 */
5177 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
5178 		return (set_errno(EINTR));
5179 
5180 	/*
5181 	 * Make sure we're not changing zones with files open or mapped in
5182 	 * to our address space which shouldn't be changing zones.
5183 	 */
5184 	if (!files_can_change_zones()) {
5185 		err = EBADF;
5186 		goto out;
5187 	}
5188 	if (!as_can_change_zones()) {
5189 		err = EFAULT;
5190 		goto out;
5191 	}
5192 
5193 	mutex_enter(&zonehash_lock);
5194 	if (pp->p_zone != global_zone) {
5195 		mutex_exit(&zonehash_lock);
5196 		err = EINVAL;
5197 		goto out;
5198 	}
5199 
5200 	zone = zone_find_all_by_id(zoneid);
5201 	if (zone == NULL) {
5202 		mutex_exit(&zonehash_lock);
5203 		err = EINVAL;
5204 		goto out;
5205 	}
5206 
5207 	/*
5208 	 * To prevent processes in a zone from holding contracts on
5209 	 * extrazonal resources, and to avoid process contract
5210 	 * memberships which span zones, contract holders and processes
5211 	 * which aren't the sole members of their encapsulating process
5212 	 * contracts are not allowed to zone_enter.
5213 	 */
5214 	ctp = pp->p_ct_process;
5215 	ct = &ctp->conp_contract;
5216 	mutex_enter(&ct->ct_lock);
5217 	mutex_enter(&pp->p_lock);
5218 	if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
5219 		mutex_exit(&pp->p_lock);
5220 		mutex_exit(&ct->ct_lock);
5221 		mutex_exit(&zonehash_lock);
5222 		err = EINVAL;
5223 		goto out;
5224 	}
5225 
5226 	/*
5227 	 * Moreover, we don't allow processes whose encapsulating
5228 	 * process contracts have inherited extrazonal contracts.
5229 	 * While it would be easier to eliminate all process contracts
5230 	 * with inherited contracts, we need to be able to give a
5231 	 * restarted init (or other zone-penetrating process) its
5232 	 * predecessor's contracts.
5233 	 */
5234 	if (ctp->conp_ninherited != 0) {
5235 		contract_t *next;
5236 		for (next = list_head(&ctp->conp_inherited); next;
5237 		    next = list_next(&ctp->conp_inherited, next)) {
5238 			if (contract_getzuniqid(next) != zone->zone_uniqid) {
5239 				mutex_exit(&pp->p_lock);
5240 				mutex_exit(&ct->ct_lock);
5241 				mutex_exit(&zonehash_lock);
5242 				err = EINVAL;
5243 				goto out;
5244 			}
5245 		}
5246 	}
5247 
5248 	mutex_exit(&pp->p_lock);
5249 	mutex_exit(&ct->ct_lock);
5250 
5251 	status = zone_status_get(zone);
5252 	if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
5253 		/*
5254 		 * Can't join
5255 		 */
5256 		mutex_exit(&zonehash_lock);
5257 		err = EINVAL;
5258 		goto out;
5259 	}
5260 
5261 	/*
5262 	 * Make sure new priv set is within the permitted set for caller
5263 	 */
5264 	if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
5265 		mutex_exit(&zonehash_lock);
5266 		err = EPERM;
5267 		goto out;
5268 	}
5269 	/*
5270 	 * We want to momentarily drop zonehash_lock while we optimistically
5271 	 * bind curproc to the pool it should be running in.  This is safe
5272 	 * since the zone can't disappear (we have a hold on it).
5273 	 */
5274 	zone_hold(zone);
5275 	mutex_exit(&zonehash_lock);
5276 
5277 	/*
5278 	 * Grab pool_lock to keep the pools configuration from changing
5279 	 * and to stop ourselves from getting rebound to another pool
5280 	 * until we join the zone.
5281 	 */
5282 	if (pool_lock_intr() != 0) {
5283 		zone_rele(zone);
5284 		err = EINTR;
5285 		goto out;
5286 	}
5287 	ASSERT(secpolicy_pool(CRED()) == 0);
5288 	/*
5289 	 * Bind ourselves to the pool currently associated with the zone.
5290 	 */
5291 	oldpool = curproc->p_pool;
5292 	newpool = zone_pool_get(zone);
5293 	if (pool_state == POOL_ENABLED && newpool != oldpool &&
5294 	    (err = pool_do_bind(newpool, P_PID, P_MYID,
5295 	    POOL_BIND_ALL)) != 0) {
5296 		pool_unlock();
5297 		zone_rele(zone);
5298 		goto out;
5299 	}
5300 
5301 	/*
5302 	 * Grab cpu_lock now; we'll need it later when we call
5303 	 * task_join().
5304 	 */
5305 	mutex_enter(&cpu_lock);
5306 	mutex_enter(&zonehash_lock);
5307 	/*
5308 	 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
5309 	 */
5310 	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
5311 		/*
5312 		 * Can't join anymore.
5313 		 */
5314 		mutex_exit(&zonehash_lock);
5315 		mutex_exit(&cpu_lock);
5316 		if (pool_state == POOL_ENABLED &&
5317 		    newpool != oldpool)
5318 			(void) pool_do_bind(oldpool, P_PID, P_MYID,
5319 			    POOL_BIND_ALL);
5320 		pool_unlock();
5321 		zone_rele(zone);
5322 		err = EINVAL;
5323 		goto out;
5324 	}
5325 
5326 	/*
5327 	 * a_lock must be held while transfering locked memory and swap
5328 	 * reservation from the global zone to the non global zone because
5329 	 * asynchronous faults on the processes' address space can lock
5330 	 * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
5331 	 * segments respectively.
5332 	 */
5333 	AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER);
5334 	swap = as_swresv();
5335 	mutex_enter(&pp->p_lock);
5336 	zone_proj0 = zone->zone_zsched->p_task->tk_proj;
5337 	/* verify that we do not exceed and task or lwp limits */
5338 	mutex_enter(&zone->zone_nlwps_lock);
5339 	/* add new lwps to zone and zone's proj0 */
5340 	zone_proj0->kpj_nlwps += pp->p_lwpcnt;
5341 	zone->zone_nlwps += pp->p_lwpcnt;
5342 	/* add 1 task to zone's proj0 */
5343 	zone_proj0->kpj_ntasks += 1;
5344 
5345 	zone_proj0->kpj_nprocs++;
5346 	zone->zone_nprocs++;
5347 	mutex_exit(&zone->zone_nlwps_lock);
5348 
5349 	mutex_enter(&zone->zone_mem_lock);
5350 	zone->zone_locked_mem += pp->p_locked_mem;
5351 	zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
5352 	zone->zone_max_swap += swap;
5353 	mutex_exit(&zone->zone_mem_lock);
5354 
5355 	mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
5356 	zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
5357 	mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
5358 
5359 	/* remove lwps and process from proc's old zone and old project */
5360 	mutex_enter(&pp->p_zone->zone_nlwps_lock);
5361 	pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
5362 	pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
5363 	pp->p_task->tk_proj->kpj_nprocs--;
5364 	pp->p_zone->zone_nprocs--;
5365 	mutex_exit(&pp->p_zone->zone_nlwps_lock);
5366 
5367 	mutex_enter(&pp->p_zone->zone_mem_lock);
5368 	pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
5369 	pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
5370 	pp->p_zone->zone_max_swap -= swap;
5371 	mutex_exit(&pp->p_zone->zone_mem_lock);
5372 
5373 	mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5374 	pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
5375 	mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5376 
5377 	pp->p_flag |= SZONETOP;
5378 	pp->p_zone = zone;
5379 	mutex_exit(&pp->p_lock);
5380 	AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock);
5381 
5382 	/*
5383 	 * Joining the zone cannot fail from now on.
5384 	 *
5385 	 * This means that a lot of the following code can be commonized and
5386 	 * shared with zsched().
5387 	 */
5388 
5389 	/*
5390 	 * If the process contract fmri was inherited, we need to
5391 	 * flag this so that any contract status will not leak
5392 	 * extra zone information, svc_fmri in this case
5393 	 */
5394 	if (ctp->conp_svc_ctid != ct->ct_id) {
5395 		mutex_enter(&ct->ct_lock);
5396 		ctp->conp_svc_zone_enter = ct->ct_id;
5397 		mutex_exit(&ct->ct_lock);
5398 	}
5399 
5400 	/*
5401 	 * Reset the encapsulating process contract's zone.
5402 	 */
5403 	ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
5404 	contract_setzuniqid(ct, zone->zone_uniqid);
5405 
5406 	/*
5407 	 * Create a new task and associate the process with the project keyed
5408 	 * by (projid,zoneid).
5409 	 *
5410 	 * We might as well be in project 0; the global zone's projid doesn't
5411 	 * make much sense in a zone anyhow.
5412 	 *
5413 	 * This also increments zone_ntasks, and returns with p_lock held.
5414 	 */
5415 	tk = task_create(0, zone);
5416 	oldtk = task_join(tk, 0);
5417 	mutex_exit(&cpu_lock);
5418 
5419 	/*
5420 	 * call RCTLOP_SET functions on this proc
5421 	 */
5422 	e.rcep_p.zone = zone;
5423 	e.rcep_t = RCENTITY_ZONE;
5424 	(void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
5425 	    RCD_CALLBACK);
5426 	mutex_exit(&pp->p_lock);
5427 
5428 	/*
5429 	 * We don't need to hold any of zsched's locks here; not only do we know
5430 	 * the process and zone aren't going away, we know its session isn't
5431 	 * changing either.
5432 	 *
5433 	 * By joining zsched's session here, we mimic the behavior in the
5434 	 * global zone of init's sid being the pid of sched.  We extend this
5435 	 * to all zlogin-like zone_enter()'ing processes as well.
5436 	 */
5437 	mutex_enter(&pidlock);
5438 	sp = zone->zone_zsched->p_sessp;
5439 	sess_hold(zone->zone_zsched);
5440 	mutex_enter(&pp->p_lock);
5441 	pgexit(pp);
5442 	sess_rele(pp->p_sessp, B_TRUE);
5443 	pp->p_sessp = sp;
5444 	pgjoin(pp, zone->zone_zsched->p_pidp);
5445 
5446 	/*
5447 	 * If any threads are scheduled to be placed on zone wait queue they
5448 	 * should abandon the idea since the wait queue is changing.
5449 	 * We need to be holding pidlock & p_lock to do this.
5450 	 */
5451 	if ((t = pp->p_tlist) != NULL) {
5452 		do {
5453 			thread_lock(t);
5454 			/*
5455 			 * Kick this thread so that he doesn't sit
5456 			 * on a wrong wait queue.
5457 			 */
5458 			if (ISWAITING(t))
5459 				setrun_locked(t);
5460 
5461 			if (t->t_schedflag & TS_ANYWAITQ)
5462 				t->t_schedflag &= ~ TS_ANYWAITQ;
5463 
5464 			thread_unlock(t);
5465 		} while ((t = t->t_forw) != pp->p_tlist);
5466 	}
5467 
5468 	/*
5469 	 * If there is a default scheduling class for the zone and it is not
5470 	 * the class we are currently in, change all of the threads in the
5471 	 * process to the new class.  We need to be holding pidlock & p_lock
5472 	 * when we call parmsset so this is a good place to do it.
5473 	 */
5474 	if (zone->zone_defaultcid > 0 &&
5475 	    zone->zone_defaultcid != curthread->t_cid) {
5476 		pcparms_t pcparms;
5477 
5478 		pcparms.pc_cid = zone->zone_defaultcid;
5479 		pcparms.pc_clparms[0] = 0;
5480 
5481 		/*
5482 		 * If setting the class fails, we still want to enter the zone.
5483 		 */
5484 		if ((t = pp->p_tlist) != NULL) {
5485 			do {
5486 				(void) parmsset(&pcparms, t);
5487 			} while ((t = t->t_forw) != pp->p_tlist);
5488 		}
5489 	}
5490 
5491 	mutex_exit(&pp->p_lock);
5492 	mutex_exit(&pidlock);
5493 
5494 	mutex_exit(&zonehash_lock);
5495 	/*
5496 	 * We're firmly in the zone; let pools progress.
5497 	 */
5498 	pool_unlock();
5499 	task_rele(oldtk);
5500 	/*
5501 	 * We don't need to retain a hold on the zone since we already
5502 	 * incremented zone_ntasks, so the zone isn't going anywhere.
5503 	 */
5504 	zone_rele(zone);
5505 
5506 	/*
5507 	 * Chroot
5508 	 */
5509 	vp = zone->zone_rootvp;
5510 	zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
5511 	zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
5512 
5513 	/*
5514 	 * Change process credentials
5515 	 */
5516 	newcr = cralloc();
5517 	mutex_enter(&pp->p_crlock);
5518 	cr = pp->p_cred;
5519 	crcopy_to(cr, newcr);
5520 	crsetzone(newcr, zone);
5521 	pp->p_cred = newcr;
5522 
5523 	/*
5524 	 * Restrict all process privilege sets to zone limit
5525 	 */
5526 	priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
5527 	priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
5528 	priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
5529 	priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
5530 	mutex_exit(&pp->p_crlock);
5531 	crset(pp, newcr);
5532 
5533 	/*
5534 	 * Adjust upcount to reflect zone entry.
5535 	 */
5536 	uid = crgetruid(newcr);
5537 	mutex_enter(&pidlock);
5538 	upcount_dec(uid, GLOBAL_ZONEID);
5539 	upcount_inc(uid, zoneid);
5540 	mutex_exit(&pidlock);
5541 
5542 	/*
5543 	 * Set up core file path and content.
5544 	 */
5545 	set_core_defaults();
5546 
5547 out:
5548 	/*
5549 	 * Let the other lwps continue.
5550 	 */
5551 	mutex_enter(&pp->p_lock);
5552 	if (curthread != pp->p_agenttp)
5553 		continuelwps(pp);
5554 	mutex_exit(&pp->p_lock);
5555 
5556 	return (err != 0 ? set_errno(err) : 0);
5557 }
5558 
5559 /*
5560  * Systemcall entry point for zone_list(2).
5561  *
5562  * Processes running in a (non-global) zone only see themselves.
5563  * On labeled systems, they see all zones whose label they dominate.
5564  */
5565 static int
5566 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
5567 {
5568 	zoneid_t *zoneids;
5569 	zone_t *zone, *myzone;
5570 	uint_t user_nzones, real_nzones;
5571 	uint_t domi_nzones;
5572 	int error;
5573 
5574 	if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
5575 		return (set_errno(EFAULT));
5576 
5577 	myzone = curproc->p_zone;
5578 	if (myzone != global_zone) {
5579 		bslabel_t *mybslab;
5580 
5581 		if (!is_system_labeled()) {
5582 			/* just return current zone */
5583 			real_nzones = domi_nzones = 1;
5584 			zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
5585 			zoneids[0] = myzone->zone_id;
5586 		} else {
5587 			/* return all zones that are dominated */
5588 			mutex_enter(&zonehash_lock);
5589 			real_nzones = zonecount;
5590 			domi_nzones = 0;
5591 			if (real_nzones > 0) {
5592 				zoneids = kmem_alloc(real_nzones *
5593 				    sizeof (zoneid_t), KM_SLEEP);
5594 				mybslab = label2bslabel(myzone->zone_slabel);
5595 				for (zone = list_head(&zone_active);
5596 				    zone != NULL;
5597 				    zone = list_next(&zone_active, zone)) {
5598 					if (zone->zone_id == GLOBAL_ZONEID)
5599 						continue;
5600 					if (zone != myzone &&
5601 					    (zone->zone_flags & ZF_IS_SCRATCH))
5602 						continue;
5603 					/*
5604 					 * Note that a label always dominates
5605 					 * itself, so myzone is always included
5606 					 * in the list.
5607 					 */
5608 					if (bldominates(mybslab,
5609 					    label2bslabel(zone->zone_slabel))) {
5610 						zoneids[domi_nzones++] =
5611 						    zone->zone_id;
5612 					}
5613 				}
5614 			}
5615 			mutex_exit(&zonehash_lock);
5616 		}
5617 	} else {
5618 		mutex_enter(&zonehash_lock);
5619 		real_nzones = zonecount;
5620 		domi_nzones = 0;
5621 		if (real_nzones > 0) {
5622 			zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
5623 			    KM_SLEEP);
5624 			for (zone = list_head(&zone_active); zone != NULL;
5625 			    zone = list_next(&zone_active, zone))
5626 				zoneids[domi_nzones++] = zone->zone_id;
5627 			ASSERT(domi_nzones == real_nzones);
5628 		}
5629 		mutex_exit(&zonehash_lock);
5630 	}
5631 
5632 	/*
5633 	 * If user has allocated space for fewer entries than we found, then
5634 	 * return only up to his limit.  Either way, tell him exactly how many
5635 	 * we found.
5636 	 */
5637 	if (domi_nzones < user_nzones)
5638 		user_nzones = domi_nzones;
5639 	error = 0;
5640 	if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
5641 		error = EFAULT;
5642 	} else if (zoneidlist != NULL && user_nzones != 0) {
5643 		if (copyout(zoneids, zoneidlist,
5644 		    user_nzones * sizeof (zoneid_t)) != 0)
5645 			error = EFAULT;
5646 	}
5647 
5648 	if (real_nzones > 0)
5649 		kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
5650 
5651 	if (error != 0)
5652 		return (set_errno(error));
5653 	else
5654 		return (0);
5655 }
5656 
5657 /*
5658  * Systemcall entry point for zone_lookup(2).
5659  *
5660  * Non-global zones are only able to see themselves and (on labeled systems)
5661  * the zones they dominate.
5662  */
5663 static zoneid_t
5664 zone_lookup(const char *zone_name)
5665 {
5666 	char *kname;
5667 	zone_t *zone;
5668 	zoneid_t zoneid;
5669 	int err;
5670 
5671 	if (zone_name == NULL) {
5672 		/* return caller's zone id */
5673 		return (getzoneid());
5674 	}
5675 
5676 	kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
5677 	if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
5678 		kmem_free(kname, ZONENAME_MAX);
5679 		return (set_errno(err));
5680 	}
5681 
5682 	mutex_enter(&zonehash_lock);
5683 	zone = zone_find_all_by_name(kname);
5684 	kmem_free(kname, ZONENAME_MAX);
5685 	/*
5686 	 * In a non-global zone, can only lookup global and own name.
5687 	 * In Trusted Extensions zone label dominance rules apply.
5688 	 */
5689 	if (zone == NULL ||
5690 	    zone_status_get(zone) < ZONE_IS_READY ||
5691 	    !zone_list_access(zone)) {
5692 		mutex_exit(&zonehash_lock);
5693 		return (set_errno(EINVAL));
5694 	} else {
5695 		zoneid = zone->zone_id;
5696 		mutex_exit(&zonehash_lock);
5697 		return (zoneid);
5698 	}
5699 }
5700 
5701 static int
5702 zone_version(int *version_arg)
5703 {
5704 	int version = ZONE_SYSCALL_API_VERSION;
5705 
5706 	if (copyout(&version, version_arg, sizeof (int)) != 0)
5707 		return (set_errno(EFAULT));
5708 	return (0);
5709 }
5710 
5711 /* ARGSUSED */
5712 long
5713 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
5714 {
5715 	zone_def zs;
5716 	int err;
5717 
5718 	switch (cmd) {
5719 	case ZONE_CREATE:
5720 		if (get_udatamodel() == DATAMODEL_NATIVE) {
5721 			if (copyin(arg1, &zs, sizeof (zone_def))) {
5722 				return (set_errno(EFAULT));
5723 			}
5724 		} else {
5725 #ifdef _SYSCALL32_IMPL
5726 			zone_def32 zs32;
5727 
5728 			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
5729 				return (set_errno(EFAULT));
5730 			}
5731 			zs.zone_name =
5732 			    (const char *)(unsigned long)zs32.zone_name;
5733 			zs.zone_root =
5734 			    (const char *)(unsigned long)zs32.zone_root;
5735 			zs.zone_privs =
5736 			    (const struct priv_set *)
5737 			    (unsigned long)zs32.zone_privs;
5738 			zs.zone_privssz = zs32.zone_privssz;
5739 			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
5740 			zs.rctlbufsz = zs32.rctlbufsz;
5741 			zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
5742 			zs.zfsbufsz = zs32.zfsbufsz;
5743 			zs.extended_error =
5744 			    (int *)(unsigned long)zs32.extended_error;
5745 			zs.match = zs32.match;
5746 			zs.doi = zs32.doi;
5747 			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
5748 			zs.flags = zs32.flags;
5749 #else
5750 			panic("get_udatamodel() returned bogus result\n");
5751 #endif
5752 		}
5753 
5754 		return (zone_create(zs.zone_name, zs.zone_root,
5755 		    zs.zone_privs, zs.zone_privssz,
5756 		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,
5757 		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,
5758 		    zs.extended_error, zs.match, zs.doi,
5759 		    zs.label, zs.flags));
5760 	case ZONE_BOOT:
5761 		return (zone_boot((zoneid_t)(uintptr_t)arg1));
5762 	case ZONE_DESTROY:
5763 		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
5764 	case ZONE_GETATTR:
5765 		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
5766 		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
5767 	case ZONE_SETATTR:
5768 		return (zone_setattr((zoneid_t)(uintptr_t)arg1,
5769 		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
5770 	case ZONE_ENTER:
5771 		return (zone_enter((zoneid_t)(uintptr_t)arg1));
5772 	case ZONE_LIST:
5773 		return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
5774 	case ZONE_SHUTDOWN:
5775 		return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
5776 	case ZONE_LOOKUP:
5777 		return (zone_lookup((const char *)arg1));
5778 	case ZONE_VERSION:
5779 		return (zone_version((int *)arg1));
5780 	case ZONE_ADD_DATALINK:
5781 		return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
5782 		    (datalink_id_t)(uintptr_t)arg2));
5783 	case ZONE_DEL_DATALINK:
5784 		return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
5785 		    (datalink_id_t)(uintptr_t)arg2));
5786 	case ZONE_CHECK_DATALINK: {
5787 		zoneid_t	zoneid;
5788 		boolean_t	need_copyout;
5789 
5790 		if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
5791 			return (EFAULT);
5792 		need_copyout = (zoneid == ALL_ZONES);
5793 		err = zone_check_datalink(&zoneid,
5794 		    (datalink_id_t)(uintptr_t)arg2);
5795 		if (err == 0 && need_copyout) {
5796 			if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
5797 				err = EFAULT;
5798 		}
5799 		return (err == 0 ? 0 : set_errno(err));
5800 	}
5801 	case ZONE_LIST_DATALINK:
5802 		return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
5803 		    (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
5804 	default:
5805 		return (set_errno(EINVAL));
5806 	}
5807 }
5808 
/*
 * Argument bundle which zone_kadmin() allocates and hands to the
 * zone_ki_call_zoneadmd() kernel thread; that thread frees it.
 */
struct zarg {
	zone_t *zone;		/* target zone; zone_kadmin() takes a hold */
	zone_cmd_arg_t arg;	/* command passed to the zoneadmd door call */
};
5813 
5814 static int
5815 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
5816 {
5817 	char *buf;
5818 	size_t buflen;
5819 	int error;
5820 
5821 	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
5822 	buf = kmem_alloc(buflen, KM_SLEEP);
5823 	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
5824 	error = door_ki_open(buf, doorp);
5825 	kmem_free(buf, buflen);
5826 	return (error);
5827 }
5828 
5829 static void
5830 zone_release_door(door_handle_t *doorp)
5831 {
5832 	door_ki_rele(*doorp);
5833 	*doorp = NULL;
5834 }
5835 
5836 static void
5837 zone_ki_call_zoneadmd(struct zarg *zargp)
5838 {
5839 	door_handle_t door = NULL;
5840 	door_arg_t darg, save_arg;
5841 	char *zone_name;
5842 	size_t zone_namelen;
5843 	zoneid_t zoneid;
5844 	zone_t *zone;
5845 	zone_cmd_arg_t arg;
5846 	uint64_t uniqid;
5847 	size_t size;
5848 	int error;
5849 	int retry;
5850 
5851 	zone = zargp->zone;
5852 	arg = zargp->arg;
5853 	kmem_free(zargp, sizeof (*zargp));
5854 
5855 	zone_namelen = strlen(zone->zone_name) + 1;
5856 	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
5857 	bcopy(zone->zone_name, zone_name, zone_namelen);
5858 	zoneid = zone->zone_id;
5859 	uniqid = zone->zone_uniqid;
5860 	/*
5861 	 * zoneadmd may be down, but at least we can empty out the zone.
5862 	 * We can ignore the return value of zone_empty() since we're called
5863 	 * from a kernel thread and know we won't be delivered any signals.
5864 	 */
5865 	ASSERT(curproc == &p0);
5866 	(void) zone_empty(zone);
5867 	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
5868 	zone_rele(zone);
5869 
5870 	size = sizeof (arg);
5871 	darg.rbuf = (char *)&arg;
5872 	darg.data_ptr = (char *)&arg;
5873 	darg.rsize = size;
5874 	darg.data_size = size;
5875 	darg.desc_ptr = NULL;
5876 	darg.desc_num = 0;
5877 
5878 	save_arg = darg;
5879 	/*
5880 	 * Since we're not holding a reference to the zone, any number of
5881 	 * things can go wrong, including the zone disappearing before we get a
5882 	 * chance to talk to zoneadmd.
5883 	 */
5884 	for (retry = 0; /* forever */; retry++) {
5885 		if (door == NULL &&
5886 		    (error = zone_lookup_door(zone_name, &door)) != 0) {
5887 			goto next;
5888 		}
5889 		ASSERT(door != NULL);
5890 
5891 		if ((error = door_ki_upcall_limited(door, &darg, NULL,
5892 		    SIZE_MAX, 0)) == 0) {
5893 			break;
5894 		}
5895 		switch (error) {
5896 		case EINTR:
5897 			/* FALLTHROUGH */
5898 		case EAGAIN:	/* process may be forking */
5899 			/*
5900 			 * Back off for a bit
5901 			 */
5902 			break;
5903 		case EBADF:
5904 			zone_release_door(&door);
5905 			if (zone_lookup_door(zone_name, &door) != 0) {
5906 				/*
5907 				 * zoneadmd may be dead, but it may come back to
5908 				 * life later.
5909 				 */
5910 				break;
5911 			}
5912 			break;
5913 		default:
5914 			cmn_err(CE_WARN,
5915 			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
5916 			    error);
5917 			goto out;
5918 		}
5919 next:
5920 		/*
5921 		 * If this isn't the same zone_t that we originally had in mind,
5922 		 * then this is the same as if two kadmin requests come in at
5923 		 * the same time: the first one wins.  This means we lose, so we
5924 		 * bail.
5925 		 */
5926 		if ((zone = zone_find_by_id(zoneid)) == NULL) {
5927 			/*
5928 			 * Problem is solved.
5929 			 */
5930 			break;
5931 		}
5932 		if (zone->zone_uniqid != uniqid) {
5933 			/*
5934 			 * zoneid recycled
5935 			 */
5936 			zone_rele(zone);
5937 			break;
5938 		}
5939 		/*
5940 		 * We could zone_status_timedwait(), but there doesn't seem to
5941 		 * be much point in doing that (plus, it would mean that
5942 		 * zone_free() isn't called until this thread exits).
5943 		 */
5944 		zone_rele(zone);
5945 		delay(hz);
5946 		darg = save_arg;
5947 	}
5948 out:
5949 	if (door != NULL) {
5950 		zone_release_door(&door);
5951 	}
5952 	kmem_free(zone_name, zone_namelen);
5953 	thread_exit();
5954 }
5955 
5956 /*
5957  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
5958  * kadmin().  The caller is a process in the zone.
5959  *
5960  * In order to shutdown the zone, we will hand off control to zoneadmd
5961  * (running in the global zone) via a door.  We do a half-hearted job at
5962  * killing all processes in the zone, create a kernel thread to contact
5963  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
5964  * a form of generation number used to let zoneadmd (as well as
5965  * zone_destroy()) know exactly which zone they're re talking about.
5966  */
5967 int
5968 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
5969 {
5970 	struct zarg *zargp;
5971 	zone_cmd_t zcmd;
5972 	zone_t *zone;
5973 
5974 	zone = curproc->p_zone;
5975 	ASSERT(getzoneid() != GLOBAL_ZONEID);
5976 
5977 	switch (cmd) {
5978 	case A_SHUTDOWN:
5979 		switch (fcn) {
5980 		case AD_HALT:
5981 		case AD_POWEROFF:
5982 			zcmd = Z_HALT;
5983 			break;
5984 		case AD_BOOT:
5985 			zcmd = Z_REBOOT;
5986 			break;
5987 		case AD_IBOOT:
5988 		case AD_SBOOT:
5989 		case AD_SIBOOT:
5990 		case AD_NOSYNC:
5991 			return (ENOTSUP);
5992 		default:
5993 			return (EINVAL);
5994 		}
5995 		break;
5996 	case A_REBOOT:
5997 		zcmd = Z_REBOOT;
5998 		break;
5999 	case A_FTRACE:
6000 	case A_REMOUNT:
6001 	case A_FREEZE:
6002 	case A_DUMP:
6003 	case A_CONFIG:
6004 		return (ENOTSUP);
6005 	default:
6006 		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
6007 		return (EINVAL);
6008 	}
6009 
6010 	if (secpolicy_zone_admin(credp, B_FALSE))
6011 		return (EPERM);
6012 	mutex_enter(&zone_status_lock);
6013 
6014 	/*
6015 	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
6016 	 * is in the zone.
6017 	 */
6018 	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
6019 	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
6020 		/*
6021 		 * This zone is already on its way down.
6022 		 */
6023 		mutex_exit(&zone_status_lock);
6024 		return (0);
6025 	}
6026 	/*
6027 	 * Prevent future zone_enter()s
6028 	 */
6029 	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
6030 	mutex_exit(&zone_status_lock);
6031 
6032 	/*
6033 	 * Kill everyone now and call zoneadmd later.
6034 	 * zone_ki_call_zoneadmd() will do a more thorough job of this
6035 	 * later.
6036 	 */
6037 	killall(zone->zone_id);
6038 	/*
6039 	 * Now, create the thread to contact zoneadmd and do the rest of the
6040 	 * work.  This thread can't be created in our zone otherwise
6041 	 * zone_destroy() would deadlock.
6042 	 */
6043 	zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
6044 	zargp->arg.cmd = zcmd;
6045 	zargp->arg.uniqid = zone->zone_uniqid;
6046 	zargp->zone = zone;
6047 	(void) strcpy(zargp->arg.locale, "C");
6048 	/* mdep was already copied in for us by uadmin */
6049 	if (mdep != NULL)
6050 		(void) strlcpy(zargp->arg.bootbuf, mdep,
6051 		    sizeof (zargp->arg.bootbuf));
6052 	zone_hold(zone);
6053 
6054 	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
6055 	    TS_RUN, minclsyspri);
6056 	exit(CLD_EXITED, 0);
6057 
6058 	return (EINVAL);
6059 }
6060 
6061 /*
6062  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
6063  * status to ZONE_IS_SHUTTING_DOWN.
6064  *
6065  * This function also shuts down all running zones to ensure that they won't
6066  * fork new processes.
6067  */
6068 void
6069 zone_shutdown_global(void)
6070 {
6071 	zone_t *current_zonep;
6072 
6073 	ASSERT(INGLOBALZONE(curproc));
6074 	mutex_enter(&zonehash_lock);
6075 	mutex_enter(&zone_status_lock);
6076 
6077 	/* Modify the global zone's status first. */
6078 	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
6079 	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
6080 
6081 	/*
6082 	 * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
6083 	 * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
6084 	 * could cause assertions to fail (e.g., assertions about a zone's
6085 	 * state during initialization, readying, or booting) or produce races.
6086 	 * We'll let threads continue to initialize and ready new zones: they'll
6087 	 * fail to boot the new zones when they see that the global zone is
6088 	 * shutting down.
6089 	 */
6090 	for (current_zonep = list_head(&zone_active); current_zonep != NULL;
6091 	    current_zonep = list_next(&zone_active, current_zonep)) {
6092 		if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
6093 			zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
6094 	}
6095 	mutex_exit(&zone_status_lock);
6096 	mutex_exit(&zonehash_lock);
6097 }
6098 
6099 /*
6100  * Returns true if the named dataset is visible in the current zone.
6101  * The 'write' parameter is set to 1 if the dataset is also writable.
6102  */
6103 int
6104 zone_dataset_visible(const char *dataset, int *write)
6105 {
6106 	static int zfstype = -1;
6107 	zone_dataset_t *zd;
6108 	size_t len;
6109 	zone_t *zone = curproc->p_zone;
6110 	const char *name = NULL;
6111 	vfs_t *vfsp = NULL;
6112 
6113 	if (dataset[0] == '\0')
6114 		return (0);
6115 
6116 	/*
6117 	 * Walk the list once, looking for datasets which match exactly, or
6118 	 * specify a dataset underneath an exported dataset.  If found, return
6119 	 * true and note that it is writable.
6120 	 */
6121 	for (zd = list_head(&zone->zone_datasets); zd != NULL;
6122 	    zd = list_next(&zone->zone_datasets, zd)) {
6123 
6124 		len = strlen(zd->zd_dataset);
6125 		if (strlen(dataset) >= len &&
6126 		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
6127 		    (dataset[len] == '\0' || dataset[len] == '/' ||
6128 		    dataset[len] == '@')) {
6129 			if (write)
6130 				*write = 1;
6131 			return (1);
6132 		}
6133 	}
6134 
6135 	/*
6136 	 * Walk the list a second time, searching for datasets which are parents
6137 	 * of exported datasets.  These should be visible, but read-only.
6138 	 *
6139 	 * Note that we also have to support forms such as 'pool/dataset/', with
6140 	 * a trailing slash.
6141 	 */
6142 	for (zd = list_head(&zone->zone_datasets); zd != NULL;
6143 	    zd = list_next(&zone->zone_datasets, zd)) {
6144 
6145 		len = strlen(dataset);
6146 		if (dataset[len - 1] == '/')
6147 			len--;	/* Ignore trailing slash */
6148 		if (len < strlen(zd->zd_dataset) &&
6149 		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
6150 		    zd->zd_dataset[len] == '/') {
6151 			if (write)
6152 				*write = 0;
6153 			return (1);
6154 		}
6155 	}
6156 
6157 	/*
6158 	 * We reach here if the given dataset is not found in the zone_dataset
6159 	 * list. Check if this dataset was added as a filesystem (ie. "add fs")
6160 	 * instead of delegation. For this we search for the dataset in the
6161 	 * zone_vfslist of this zone. If found, return true and note that it is
6162 	 * not writable.
6163 	 */
6164 
6165 	/*
6166 	 * Initialize zfstype if it is not initialized yet.
6167 	 */
6168 	if (zfstype == -1) {
6169 		struct vfssw *vswp = vfs_getvfssw("zfs");
6170 		zfstype = vswp - vfssw;
6171 		vfs_unrefvfssw(vswp);
6172 	}
6173 
6174 	vfs_list_read_lock();
6175 	vfsp = zone->zone_vfslist;
6176 	do {
6177 		ASSERT(vfsp);
6178 		if (vfsp->vfs_fstype == zfstype) {
6179 			name = refstr_value(vfsp->vfs_resource);
6180 
6181 			/*
6182 			 * Check if we have an exact match.
6183 			 */
6184 			if (strcmp(dataset, name) == 0) {
6185 				vfs_list_unlock();
6186 				if (write)
6187 					*write = 0;
6188 				return (1);
6189 			}
6190 			/*
6191 			 * We need to check if we are looking for parents of
6192 			 * a dataset. These should be visible, but read-only.
6193 			 */
6194 			len = strlen(dataset);
6195 			if (dataset[len - 1] == '/')
6196 				len--;
6197 
6198 			if (len < strlen(name) &&
6199 			    bcmp(dataset, name, len) == 0 && name[len] == '/') {
6200 				vfs_list_unlock();
6201 				if (write)
6202 					*write = 0;
6203 				return (1);
6204 			}
6205 		}
6206 		vfsp = vfsp->vfs_zone_next;
6207 	} while (vfsp != zone->zone_vfslist);
6208 
6209 	vfs_list_unlock();
6210 	return (0);
6211 }
6212 
6213 /*
6214  * zone_find_by_any_path() -
6215  *
6216  * kernel-private routine similar to zone_find_by_path(), but which
6217  * effectively compares against zone paths rather than zonerootpath
6218  * (i.e., the last component of zonerootpaths, which should be "root/",
6219  * are not compared.)  This is done in order to accurately identify all
6220  * paths, whether zone-visible or not, including those which are parallel
6221  * to /root/, such as /dev/, /home/, etc...
6222  *
6223  * If the specified path does not fall under any zone path then global
6224  * zone is returned.
6225  *
6226  * The treat_abs parameter indicates whether the path should be treated as
6227  * an absolute path although it does not begin with "/".  (This supports
6228  * nfs mount syntax such as host:any/path.)
6229  *
6230  * The caller is responsible for zone_rele of the returned zone.
6231  */
6232 zone_t *
6233 zone_find_by_any_path(const char *path, boolean_t treat_abs)
6234 {
6235 	zone_t *zone;
6236 	int path_offset = 0;
6237 
6238 	if (path == NULL) {
6239 		zone_hold(global_zone);
6240 		return (global_zone);
6241 	}
6242 
6243 	if (*path != '/') {
6244 		ASSERT(treat_abs);
6245 		path_offset = 1;
6246 	}
6247 
6248 	mutex_enter(&zonehash_lock);
6249 	for (zone = list_head(&zone_active); zone != NULL;
6250 	    zone = list_next(&zone_active, zone)) {
6251 		char	*c;
6252 		size_t	pathlen;
6253 		char *rootpath_start;
6254 
6255 		if (zone == global_zone)	/* skip global zone */
6256 			continue;
6257 
6258 		/* scan backwards to find start of last component */
6259 		c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
6260 		do {
6261 			c--;
6262 		} while (*c != '/');
6263 
6264 		pathlen = c - zone->zone_rootpath + 1 - path_offset;
6265 		rootpath_start = (zone->zone_rootpath + path_offset);
6266 		if (strncmp(path, rootpath_start, pathlen) == 0)
6267 			break;
6268 	}
6269 	if (zone == NULL)
6270 		zone = global_zone;
6271 	zone_hold(zone);
6272 	mutex_exit(&zonehash_lock);
6273 	return (zone);
6274 }
6275 
6276 /*
6277  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
6278  * zone_dl_t pointer if found, and NULL otherwise.
6279  */
6280 static zone_dl_t *
6281 zone_find_dl(zone_t *zone, datalink_id_t linkid)
6282 {
6283 	zone_dl_t *zdl;
6284 
6285 	ASSERT(mutex_owned(&zone->zone_lock));
6286 	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6287 	    zdl = list_next(&zone->zone_dl_list, zdl)) {
6288 		if (zdl->zdl_id == linkid)
6289 			break;
6290 	}
6291 	return (zdl);
6292 }
6293 
6294 static boolean_t
6295 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
6296 {
6297 	boolean_t exists;
6298 
6299 	mutex_enter(&zone->zone_lock);
6300 	exists = (zone_find_dl(zone, linkid) != NULL);
6301 	mutex_exit(&zone->zone_lock);
6302 	return (exists);
6303 }
6304 
6305 /*
6306  * Add an data link name for the zone.
6307  */
6308 static int
6309 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
6310 {
6311 	zone_dl_t *zdl;
6312 	zone_t *zone;
6313 	zone_t *thiszone;
6314 
6315 	if ((thiszone = zone_find_by_id(zoneid)) == NULL)
6316 		return (set_errno(ENXIO));
6317 
6318 	/* Verify that the datalink ID doesn't already belong to a zone. */
6319 	mutex_enter(&zonehash_lock);
6320 	for (zone = list_head(&zone_active); zone != NULL;
6321 	    zone = list_next(&zone_active, zone)) {
6322 		if (zone_dl_exists(zone, linkid)) {
6323 			mutex_exit(&zonehash_lock);
6324 			zone_rele(thiszone);
6325 			return (set_errno((zone == thiszone) ? EEXIST : EPERM));
6326 		}
6327 	}
6328 
6329 	zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
6330 	zdl->zdl_id = linkid;
6331 	mutex_enter(&thiszone->zone_lock);
6332 	list_insert_head(&thiszone->zone_dl_list, zdl);
6333 	mutex_exit(&thiszone->zone_lock);
6334 	mutex_exit(&zonehash_lock);
6335 	zone_rele(thiszone);
6336 	return (0);
6337 }
6338 
6339 static int
6340 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
6341 {
6342 	zone_dl_t *zdl;
6343 	zone_t *zone;
6344 	int err = 0;
6345 
6346 	if ((zone = zone_find_by_id(zoneid)) == NULL)
6347 		return (set_errno(EINVAL));
6348 
6349 	mutex_enter(&zone->zone_lock);
6350 	if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
6351 		err = ENXIO;
6352 	} else {
6353 		list_remove(&zone->zone_dl_list, zdl);
6354 		kmem_free(zdl, sizeof (zone_dl_t));
6355 	}
6356 	mutex_exit(&zone->zone_lock);
6357 	zone_rele(zone);
6358 	return (err == 0 ? 0 : set_errno(err));
6359 }
6360 
6361 /*
6362  * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
6363  * the linkid.  Otherwise we just check if the specified zoneidp has been
6364  * assigned the supplied linkid.
6365  */
6366 int
6367 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
6368 {
6369 	zone_t *zone;
6370 	int err = ENXIO;
6371 
6372 	if (*zoneidp != ALL_ZONES) {
6373 		if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
6374 			if (zone_dl_exists(zone, linkid))
6375 				err = 0;
6376 			zone_rele(zone);
6377 		}
6378 		return (err);
6379 	}
6380 
6381 	mutex_enter(&zonehash_lock);
6382 	for (zone = list_head(&zone_active); zone != NULL;
6383 	    zone = list_next(&zone_active, zone)) {
6384 		if (zone_dl_exists(zone, linkid)) {
6385 			*zoneidp = zone->zone_id;
6386 			err = 0;
6387 			break;
6388 		}
6389 	}
6390 	mutex_exit(&zonehash_lock);
6391 	return (err);
6392 }
6393 
6394 /*
6395  * Get the list of datalink IDs assigned to a zone.
6396  *
6397  * On input, *nump is the number of datalink IDs that can fit in the supplied
6398  * idarray.  Upon return, *nump is either set to the number of datalink IDs
6399  * that were placed in the array if the array was large enough, or to the
6400  * number of datalink IDs that the function needs to place in the array if the
6401  * array is too small.
6402  */
6403 static int
6404 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
6405 {
6406 	uint_t num, dlcount;
6407 	zone_t *zone;
6408 	zone_dl_t *zdl;
6409 	datalink_id_t *idptr = idarray;
6410 
6411 	if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
6412 		return (set_errno(EFAULT));
6413 	if ((zone = zone_find_by_id(zoneid)) == NULL)
6414 		return (set_errno(ENXIO));
6415 
6416 	num = 0;
6417 	mutex_enter(&zone->zone_lock);
6418 	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6419 	    zdl = list_next(&zone->zone_dl_list, zdl)) {
6420 		/*
6421 		 * If the list is bigger than what the caller supplied, just
6422 		 * count, don't do copyout.
6423 		 */
6424 		if (++num > dlcount)
6425 			continue;
6426 		if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
6427 			mutex_exit(&zone->zone_lock);
6428 			zone_rele(zone);
6429 			return (set_errno(EFAULT));
6430 		}
6431 		idptr++;
6432 	}
6433 	mutex_exit(&zone->zone_lock);
6434 	zone_rele(zone);
6435 
6436 	/* Increased or decreased, caller should be notified. */
6437 	if (num != dlcount) {
6438 		if (copyout(&num, nump, sizeof (num)) != 0)
6439 			return (set_errno(EFAULT));
6440 	}
6441 	return (0);
6442 }
6443 
6444 /*
6445  * Public interface for looking up a zone by zoneid. It's a customized version
6446  * for netstack_zone_create(). It can only be called from the zsd create
6447  * callbacks, since it doesn't have reference on the zone structure hence if
6448  * it is called elsewhere the zone could disappear after the zonehash_lock
6449  * is dropped.
6450  *
6451  * Furthermore it
6452  * 1. Doesn't check the status of the zone.
6453  * 2. It will be called even before zone_init is called, in that case the
6454  *    address of zone0 is returned directly, and netstack_zone_create()
6455  *    will only assign a value to zone0.zone_netstack, won't break anything.
6456  * 3. Returns without the zone being held.
6457  */
6458 zone_t *
6459 zone_find_by_id_nolock(zoneid_t zoneid)
6460 {
6461 	zone_t *zone;
6462 
6463 	mutex_enter(&zonehash_lock);
6464 	if (zonehashbyid == NULL)
6465 		zone = &zone0;
6466 	else
6467 		zone = zone_find_all_by_id(zoneid);
6468 	mutex_exit(&zonehash_lock);
6469 	return (zone);
6470 }
6471 
6472 /*
6473  * Walk the datalinks for a given zone
6474  */
6475 int
6476 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
6477     void *data)
6478 {
6479 	zone_t		*zone;
6480 	zone_dl_t	*zdl;
6481 	datalink_id_t	*idarray;
6482 	uint_t		idcount = 0;
6483 	int		i, ret = 0;
6484 
6485 	if ((zone = zone_find_by_id(zoneid)) == NULL)
6486 		return (ENOENT);
6487 
6488 	/*
6489 	 * We first build an array of linkid's so that we can walk these and
6490 	 * execute the callback with the zone_lock dropped.
6491 	 */
6492 	mutex_enter(&zone->zone_lock);
6493 	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6494 	    zdl = list_next(&zone->zone_dl_list, zdl)) {
6495 		idcount++;
6496 	}
6497 
6498 	if (idcount == 0) {
6499 		mutex_exit(&zone->zone_lock);
6500 		zone_rele(zone);
6501 		return (0);
6502 	}
6503 
6504 	idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
6505 	if (idarray == NULL) {
6506 		mutex_exit(&zone->zone_lock);
6507 		zone_rele(zone);
6508 		return (ENOMEM);
6509 	}
6510 
6511 	for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
6512 	    i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
6513 		idarray[i] = zdl->zdl_id;
6514 	}
6515 
6516 	mutex_exit(&zone->zone_lock);
6517 
6518 	for (i = 0; i < idcount && ret == 0; i++) {
6519 		if ((ret = (*cb)(idarray[i], data)) != 0)
6520 			break;
6521 	}
6522 
6523 	zone_rele(zone);
6524 	kmem_free(idarray, sizeof (datalink_id_t) * idcount);
6525 	return (ret);
6526 }
6527