xref: /titanic_52/usr/src/uts/common/os/zone.c (revision 4bff34e37def8a90f9194d81bc345c52ba20086a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Zones
31  *
32  *   A zone is a named collection of processes, namespace constraints,
33  *   and other system resources which comprise a secure and manageable
34  *   application containment facility.
35  *
36  *   Zones (represented by the reference counted zone_t) are tracked in
37  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
38  *   (zoneid_t) are used to track zone association.  Zone IDs are
39  *   dynamically generated when the zone is created; if a persistent
40  *   identifier is needed (core files, accounting logs, audit trail,
41  *   etc.), the zone name should be used.
42  *
43  *
44  *   Global Zone:
45  *
46  *   The global zone (zoneid 0) is automatically associated with all
47  *   system resources that have not been bound to a user-created zone.
48  *   This means that even systems where zones are not in active use
49  *   have a global zone, and all processes, mounts, etc. are
50  *   associated with that zone.  The global zone is generally
51  *   unconstrained in terms of privileges and access, though the usual
52  *   credential and privilege based restrictions apply.
53  *
54  *
55  *   Zone States:
56  *
57  *   The states a zone may be in, and the transitions between them, are as
58  *   follows:
59  *
60  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
61  *   initialized zone is added to the list of active zones on the system but
62  *   isn't accessible.
63  *
64  *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
65  *   not yet completed. Not possible to enter the zone, but attributes can
66  *   be retrieved.
67  *
68  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
69  *   ready.  The zone is made visible after the ZSD constructor callbacks are
70  *   executed.  A zone remains in this state until it transitions into
71  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
72  *
73  *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
74  *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
75  *   state.
76  *
77  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
78  *   successfully started init.   A zone remains in this state until
79  *   zone_shutdown() is called.
80  *
81  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
82  *   killing all processes running in the zone. The zone remains
83  *   in this state until there are no more user processes running in the zone.
84  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
85  *   Since zone_shutdown() is restartable, it may be called successfully
86  *   multiple times for the same zone_t.  Setting of the zone's state to
87  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
88  *   the zone's status without worrying about it being a moving target.
89  *
90  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
91  *   are no more user processes in the zone.  The zone remains in this
92  *   state until there are no more kernel threads associated with the
93  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
94  *   fail.
95  *
96  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
97  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
98  *   join the zone or create kernel threads therein.
99  *
100  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
101  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
102  *   return NULL from now on.
103  *
104  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
105  *   processes or threads doing work on behalf of the zone.  The zone is
106  *   removed from the list of active zones.  zone_destroy() returns, and
107  *   the zone can be recreated.
108  *
109  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
110  *   callbacks are executed, and all memory associated with the zone is
111  *   freed.
112  *
113  *   Threads can wait for the zone to enter a requested state by using
114  *   zone_status_wait() or zone_status_timedwait() with the desired
115  *   state passed in as an argument.  Zone state transitions are
116  *   uni-directional; it is not possible to move back to an earlier state.
117  *
118  *
119  *   Zone-Specific Data:
120  *
121  *   Subsystems needing to maintain zone-specific data can store that
122  *   data using the ZSD mechanism.  This provides a zone-specific data
123  *   store, similar to thread-specific data (see pthread_getspecific(3C)
124  *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
125  *   to register callbacks to be invoked when a zone is created, shut
126  *   down, or destroyed.  This can be used to initialize zone-specific
127  *   data for new zones and to clean up when zones go away.
128  *
129  *
130  *   Data Structures:
131  *
132  *   The per-zone structure (zone_t) is reference counted, and freed
133  *   when all references are released.  zone_hold and zone_rele can be
134  *   used to adjust the reference count.  In addition, reference counts
135  *   associated with the cred_t structure are tracked separately using
136  *   zone_cred_hold and zone_cred_rele.
137  *
138  *   Pointers to active zone_t's are stored in two hash tables; one
139  *   for searching by id, the other for searching by name.  Lookups
140  *   can be performed on either basis, using zone_find_by_id and
141  *   zone_find_by_name.  Both return zone_t pointers with the zone
142  *   held, so zone_rele should be called when the pointer is no longer
143  *   needed.  Zones can also be searched by path; zone_find_by_path
144  *   returns the zone with which a path name is associated (global
145  *   zone if the path is not within some other zone's file system
146  *   hierarchy).  This currently requires iterating through each zone,
147  *   so it is slower than an id or name search via a hash table.
148  *
149  *
150  *   Locking:
151  *
152  *   zonehash_lock: This is a top-level global lock used to protect the
153  *       zone hash tables and lists.  Zones cannot be created or destroyed
154  *       while this lock is held.
155  *   zone_status_lock: This is a global lock protecting zone state.
156  *       Zones cannot change state while this lock is held.  It also
157  *       protects the list of kernel threads associated with a zone.
158  *   zone_lock: This is a per-zone lock used to protect several fields of
159  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
160  *       this lock means that the zone cannot go away.
161  *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
162  *	 related to the zone.max-lwps rctl.
163  *   zone_mem_lock: This is a per-zone lock used to protect the fields
164  *	 related to the zone.max-locked-memory and zone.max-swap rctls.
165  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
166  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
167  *       list (a list of zones in the ZONE_IS_DEAD state).
168  *
169  *   Ordering requirements:
170  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
171  *       	zone_lock --> zsd_key_lock --> pidlock --> p_lock
172  *
173  *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
174  *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
175  *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
176  *
177  *   Blocking memory allocations are permitted while holding any of the
178  *   zone locks.
179  *
180  *
181  *   System Call Interface:
182  *
183  *   The zone subsystem can be managed and queried from user level with
184  *   the following system calls (all subcodes of the primary "zone"
185  *   system call):
186  *   - zone_create: creates a zone with selected attributes (name,
187  *     root path, privileges, resource controls, ZFS datasets)
188  *   - zone_enter: allows the current process to enter a zone
189  *   - zone_getattr: reports attributes of a zone
190  *   - zone_setattr: set attributes of a zone
191  *   - zone_boot: set 'init' running for the zone
192  *   - zone_list: lists all zones active in the system
193  *   - zone_lookup: looks up zone id based on name
194  *   - zone_shutdown: initiates shutdown process (see states above)
195  *   - zone_destroy: completes shutdown process (see states above)
196  *
197  */
198 
199 #include <sys/priv_impl.h>
200 #include <sys/cred.h>
201 #include <c2/audit.h>
202 #include <sys/debug.h>
203 #include <sys/file.h>
204 #include <sys/kmem.h>
205 #include <sys/kstat.h>
206 #include <sys/mutex.h>
207 #include <sys/note.h>
208 #include <sys/pathname.h>
209 #include <sys/proc.h>
210 #include <sys/project.h>
211 #include <sys/sysevent.h>
212 #include <sys/task.h>
213 #include <sys/systm.h>
214 #include <sys/types.h>
215 #include <sys/utsname.h>
216 #include <sys/vnode.h>
217 #include <sys/vfs.h>
218 #include <sys/systeminfo.h>
219 #include <sys/policy.h>
220 #include <sys/cred_impl.h>
221 #include <sys/contract_impl.h>
222 #include <sys/contract/process_impl.h>
223 #include <sys/class.h>
224 #include <sys/pool.h>
225 #include <sys/pool_pset.h>
226 #include <sys/pset.h>
227 #include <sys/sysmacros.h>
228 #include <sys/callb.h>
229 #include <sys/vmparam.h>
230 #include <sys/corectl.h>
231 #include <sys/ipc_impl.h>
232 
233 #include <sys/door.h>
234 #include <sys/cpuvar.h>
235 #include <sys/sdt.h>
236 
237 #include <sys/uadmin.h>
238 #include <sys/session.h>
239 #include <sys/cmn_err.h>
240 #include <sys/modhash.h>
241 #include <sys/sunddi.h>
242 #include <sys/nvpair.h>
243 #include <sys/rctl.h>
244 #include <sys/fss.h>
245 #include <sys/brand.h>
246 #include <sys/zone.h>
247 #include <net/if.h>
248 #include <sys/cpucaps.h>
249 #include <vm/seg.h>
250 
/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 * Per the "Locking" notes at the top of this file, this is the global lock
 * protecting zone state.
 */
static kmutex_t zone_status_lock;

/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;	/* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 */
static list_t zsd_registered_keys;

/*
 * Zone lookup state.  zonehash_lock is the top-level lock protecting the
 * zone hash tables and lists (see "Locking" at the top of this file).
 *
 * NOTE(review): zone_hash_size is presumably the size used when creating the
 * three hash tables, zonecount the number of zones currently in existence,
 * and zoneid_space the ID space from which zone IDs are allocated.  None of
 * their uses are visible in this part of the file -- confirm.
 */
int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;
281 
/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except for by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 */
zone_t zone0;
zone_t *global_zone = NULL;	/* Set when the global zone is initialized */

/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;

/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/* Event channel used to send zone state change notifications */
evchan_t *zone_event_chan;
312 
/*
 * This table holds the mapping from kernel zone states to
 * states visible in the state notification API.
 * The idea is that we only expose "obvious" states and
 * do not expose states which are just implementation details.
 *
 * The trailing comment on each entry names the kernel state it stands for.
 * Several kernel states share one visible state: "booting" is reported as
 * ready, "empty", "down" and "dying" are reported as shutting down, and
 * "dead" is reported as uninitialized.
 *
 * NOTE(review): presumably indexed by zone_status_t value, so the entry order
 * must match that enum -- confirm against <sys/zone.h>.
 */
const char  *zone_status_table[] = {
	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
	ZONE_EVENT_INITIALIZED,		/* initialized */
	ZONE_EVENT_READY,		/* ready */
	ZONE_EVENT_READY,		/* booting */
	ZONE_EVENT_RUNNING,		/* running */
	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
	ZONE_EVENT_SHUTTING_DOWN,	/* empty */
	ZONE_EVENT_SHUTTING_DOWN,	/* down */
	ZONE_EVENT_SHUTTING_DOWN,	/* dying */
	ZONE_EVENT_UNINITIALIZED,	/* dead */
};
331 
/*
 * Resource control handles, one per rctl_hndl_t variable below.
 * NOTE(review): each presumably refers to the zone-wide rctl its name
 * suggests (e.g. rc_zone_nlwps for zone.max-lwps) -- the registration code
 * is not visible here; confirm.
 *
 * This isn't static so lint doesn't complain.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;
rctl_hndl_t rc_zone_msgmni;
/*
 * Synchronization primitives used to synchronize between mounts and zone
 * creation/destruction.
 *
 * mounts_in_progress is a signed counter protected by mount_lock: it is
 * incremented by mount_in_progress() and decremented by mount_completed(),
 * while block_mounts() decrements it below zero and resume_mounts()
 * increments it back.  A positive value therefore counts mounts in flight and
 * a negative value counts callers that have blocked mounts.  mount_cv is
 * broadcast whenever the counter returns to zero.
 */
static int mounts_in_progress;
static kcondvar_t mount_cv;
static kmutex_t mount_lock;

/*
 * NOTE(review): judging by the names, the default path of a zone's init
 * binary and a "/zone/" path prefix; their uses are outside this view.
 */
const char * const zone_default_initname = "/sbin/init";
static char * const zone_prefix = "/zone/";

/* Forward declarations for functions defined later in this file. */
static int zone_shutdown(zoneid_t zoneid);
static int zone_add_datalink(zoneid_t, char *);
static int zone_remove_datalink(zoneid_t, char *);
static int zone_check_datalink(zoneid_t *, char *);
static int zone_list_datalink(zoneid_t, int *, char *);

/*
 * Signature of the functions applied by zsd_apply_all_zones() and
 * zsd_apply_all_keys() to one (zone, key) pair.  The first argument is a lock
 * the function may drop while it works; zsd_apply_all_zones() treats a
 * B_TRUE return as "the lock was dropped" and restarts its walk.
 * NOTE(review): the boolean_t argument presumably tells the function whether
 * zone_lock is already held by the caller -- confirm in the apply functions.
 */
typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);

static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
    zone_key_t);
static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
    kmutex_t *);
static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
    kmutex_t *);
372 
/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10.
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure; and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 * Version 6 adds the flag argument to zone_create.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;
391 
392 /*
393  * Certain filesystems (such as NFS and autofs) need to know which zone
394  * the mount is being placed in.  Because of this, we need to be able to
395  * ensure that a zone isn't in the process of being created such that
396  * nfs_mount() thinks it is in the global zone, while by the time it
397  * gets added to the list of mounted zones, it ends up on zoneA's mount
398  * list.
399  *
400  * The following functions: block_mounts()/resume_mounts() and
401  * mount_in_progress()/mount_completed() are used by zones and the VFS
402  * layer (respectively) to synchronize zone creation and new mounts.
403  *
404  * The semantics are like a reader-reader lock such that there may
405  * either be multiple mounts (or zone creations, if that weren't
406  * serialized by zonehash_lock) in progress at the same time, but not
407  * both.
408  *
409  * We use cv's so the user can ctrl-C out of the operation if it's
410  * taking too long.
411  *
412  * The semantics are such that there is unfair bias towards the
413  * "current" operation.  This means that zone creations may starve if
414  * there is a rapid succession of new mounts coming in to the system, or
415  * there is a remote possibility that zones will be created at such a
416  * rate that new mounts will not be able to proceed.
417  */
418 /*
419  * Prevent new mounts from progressing to the point of calling
420  * VFS_MOUNT().  If there are already mounts in this "region", wait for
421  * them to complete.
422  */
423 static int
424 block_mounts(void)
425 {
426 	int retval = 0;
427 
428 	/*
429 	 * Since it may block for a long time, block_mounts() shouldn't be
430 	 * called with zonehash_lock held.
431 	 */
432 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
433 	mutex_enter(&mount_lock);
434 	while (mounts_in_progress > 0) {
435 		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
436 			goto signaled;
437 	}
438 	/*
439 	 * A negative value of mounts_in_progress indicates that mounts
440 	 * have been blocked by (-mounts_in_progress) different callers.
441 	 */
442 	mounts_in_progress--;
443 	retval = 1;
444 signaled:
445 	mutex_exit(&mount_lock);
446 	return (retval);
447 }
448 
449 /*
450  * The VFS layer may progress with new mounts as far as we're concerned.
451  * Allow them to progress if we were the last obstacle.
452  */
453 static void
454 resume_mounts(void)
455 {
456 	mutex_enter(&mount_lock);
457 	if (++mounts_in_progress == 0)
458 		cv_broadcast(&mount_cv);
459 	mutex_exit(&mount_lock);
460 }
461 
462 /*
463  * The VFS layer is busy with a mount; zones should wait until all
464  * mounts are completed to progress.
465  */
466 void
467 mount_in_progress(void)
468 {
469 	mutex_enter(&mount_lock);
470 	while (mounts_in_progress < 0)
471 		cv_wait(&mount_cv, &mount_lock);
472 	mounts_in_progress++;
473 	mutex_exit(&mount_lock);
474 }
475 
476 /*
477  * VFS is done with one mount; wake up any waiting block_mounts()
478  * callers if this is the last mount.
479  */
480 void
481 mount_completed(void)
482 {
483 	mutex_enter(&mount_lock);
484 	if (--mounts_in_progress == 0)
485 		cv_broadcast(&mount_cv);
486 	mutex_exit(&mount_lock);
487 }
488 
489 /*
490  * ZSD routines.
491  *
492  * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
493  * defined by the pthread_key_create() and related interfaces.
494  *
495  * Kernel subsystems may register one or more data items and/or
496  * callbacks to be executed when a zone is created, shutdown, or
497  * destroyed.
498  *
499  * Unlike the thread counterpart, destructor callbacks will be executed
500  * even if the data pointer is NULL and/or there are no constructor
501  * callbacks, so it is the responsibility of such callbacks to check for
502  * NULL data values if necessary.
503  *
504  * The locking strategy and overall picture is as follows:
505  *
506  * When someone calls zone_key_create(), a template ZSD entry is added to the
507  * global list "zsd_registered_keys", protected by zsd_key_lock.  While
508  * holding that lock all the existing zones are marked as
509  * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
510  * zone_zsd list (protected by zone_lock). The global list is updated first
511  * (under zsd_key_lock) to make sure that newly created zones use the
512  * most recent list of keys. Then under zonehash_lock we walk the zones
513  * and mark them.  Similar locking is used in zone_key_delete().
514  *
515  * The actual create, shutdown, and destroy callbacks are done without
516  * holding any lock. And zsd_flags are used to ensure that the operations
517  * completed so that when zone_key_create (and zone_create) is done, as well as
518  * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
519  * are completed.
520  *
521  * When new zones are created constructor callbacks for all registered ZSD
522  * entries will be called. That also uses the above two phases of marking
523  * what needs to be done, and then running the callbacks without holding
524  * any locks.
525  *
526  * The framework does not provide any locking around zone_getspecific() and
527  * zone_setspecific() apart from that needed for internal consistency, so
528  * callers interested in atomic "test-and-set" semantics will need to provide
529  * their own locking.
530  */
531 
532 /*
533  * Helper function to find the zsd_entry associated with the key in the
534  * given list.
535  */
536 static struct zsd_entry *
537 zsd_find(list_t *l, zone_key_t key)
538 {
539 	struct zsd_entry *zsd;
540 
541 	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
542 		if (zsd->zsd_key == key) {
543 			return (zsd);
544 		}
545 	}
546 	return (NULL);
547 }
548 
549 /*
550  * Helper function to find the zsd_entry associated with the key in the
551  * given list. Move it to the front of the list.
552  */
553 static struct zsd_entry *
554 zsd_find_mru(list_t *l, zone_key_t key)
555 {
556 	struct zsd_entry *zsd;
557 
558 	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
559 		if (zsd->zsd_key == key) {
560 			/*
561 			 * Move to head of list to keep list in MRU order.
562 			 */
563 			if (zsd != list_head(l)) {
564 				list_remove(l, zsd);
565 				list_insert_head(l, zsd);
566 			}
567 			return (zsd);
568 		}
569 	}
570 	return (NULL);
571 }
572 
573 void
574 zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
575     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
576 {
577 	struct zsd_entry *zsdp;
578 	struct zsd_entry *t;
579 	struct zone *zone;
580 	zone_key_t  key;
581 
582 	zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
583 	zsdp->zsd_data = NULL;
584 	zsdp->zsd_create = create;
585 	zsdp->zsd_shutdown = shutdown;
586 	zsdp->zsd_destroy = destroy;
587 
588 	/*
589 	 * Insert in global list of callbacks. Makes future zone creations
590 	 * see it.
591 	 */
592 	mutex_enter(&zsd_key_lock);
593 	*keyp = key = zsdp->zsd_key = ++zsd_keyval;
594 	ASSERT(zsd_keyval != 0);
595 	list_insert_tail(&zsd_registered_keys, zsdp);
596 	mutex_exit(&zsd_key_lock);
597 
598 	/*
599 	 * Insert for all existing zones and mark them as needing
600 	 * a create callback.
601 	 */
602 	mutex_enter(&zonehash_lock);	/* stop the world */
603 	for (zone = list_head(&zone_active); zone != NULL;
604 	    zone = list_next(&zone_active, zone)) {
605 		zone_status_t status;
606 
607 		mutex_enter(&zone->zone_lock);
608 
609 		/* Skip zones that are on the way down or not yet up */
610 		status = zone_status_get(zone);
611 		if (status >= ZONE_IS_DOWN ||
612 		    status == ZONE_IS_UNINITIALIZED) {
613 			mutex_exit(&zone->zone_lock);
614 			continue;
615 		}
616 
617 		t = zsd_find_mru(&zone->zone_zsd, key);
618 		if (t != NULL) {
619 			/*
620 			 * A zsd_configure already inserted it after
621 			 * we dropped zsd_key_lock above.
622 			 */
623 			mutex_exit(&zone->zone_lock);
624 			continue;
625 		}
626 		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
627 		t->zsd_key = key;
628 		t->zsd_create = create;
629 		t->zsd_shutdown = shutdown;
630 		t->zsd_destroy = destroy;
631 		if (create != NULL) {
632 			t->zsd_flags = ZSD_CREATE_NEEDED;
633 			DTRACE_PROBE2(zsd__create__needed,
634 			    zone_t *, zone, zone_key_t, key);
635 		}
636 		list_insert_tail(&zone->zone_zsd, t);
637 		mutex_exit(&zone->zone_lock);
638 	}
639 	mutex_exit(&zonehash_lock);
640 
641 	if (create != NULL) {
642 		/* Now call the create callback for this key */
643 		zsd_apply_all_zones(zsd_apply_create, key);
644 	}
645 }
646 
647 /*
648  * Function called when a module is being unloaded, or otherwise wishes
649  * to unregister its ZSD key and callbacks.
650  *
651  * Remove from the global list and determine the functions that need to
652  * be called under a global lock. Then call the functions without
653  * holding any locks. Finally free up the zone_zsd entries. (The apply
654  * functions need to access the zone_zsd entries to find zsd_data etc.)
655  */
656 int
657 zone_key_delete(zone_key_t key)
658 {
659 	struct zsd_entry *zsdp = NULL;
660 	zone_t *zone;
661 
662 	mutex_enter(&zsd_key_lock);
663 	zsdp = zsd_find_mru(&zsd_registered_keys, key);
664 	if (zsdp == NULL) {
665 		mutex_exit(&zsd_key_lock);
666 		return (-1);
667 	}
668 	list_remove(&zsd_registered_keys, zsdp);
669 	mutex_exit(&zsd_key_lock);
670 
671 	mutex_enter(&zonehash_lock);
672 	for (zone = list_head(&zone_active); zone != NULL;
673 	    zone = list_next(&zone_active, zone)) {
674 		struct zsd_entry *del;
675 
676 		mutex_enter(&zone->zone_lock);
677 		del = zsd_find_mru(&zone->zone_zsd, key);
678 		if (del == NULL) {
679 			/*
680 			 * Somebody else got here first e.g the zone going
681 			 * away.
682 			 */
683 			mutex_exit(&zone->zone_lock);
684 			continue;
685 		}
686 		ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
687 		ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
688 		if (del->zsd_shutdown != NULL &&
689 		    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
690 			del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
691 			DTRACE_PROBE2(zsd__shutdown__needed,
692 			    zone_t *, zone, zone_key_t, key);
693 		}
694 		if (del->zsd_destroy != NULL &&
695 		    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
696 			del->zsd_flags |= ZSD_DESTROY_NEEDED;
697 			DTRACE_PROBE2(zsd__destroy__needed,
698 			    zone_t *, zone, zone_key_t, key);
699 		}
700 		mutex_exit(&zone->zone_lock);
701 	}
702 	mutex_exit(&zonehash_lock);
703 	kmem_free(zsdp, sizeof (*zsdp));
704 
705 	/* Now call the shutdown and destroy callback for this key */
706 	zsd_apply_all_zones(zsd_apply_shutdown, key);
707 	zsd_apply_all_zones(zsd_apply_destroy, key);
708 
709 	/* Now we can free up the zsdp structures in each zone */
710 	mutex_enter(&zonehash_lock);
711 	for (zone = list_head(&zone_active); zone != NULL;
712 	    zone = list_next(&zone_active, zone)) {
713 		struct zsd_entry *del;
714 
715 		mutex_enter(&zone->zone_lock);
716 		del = zsd_find(&zone->zone_zsd, key);
717 		if (del != NULL) {
718 			list_remove(&zone->zone_zsd, del);
719 			ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
720 			kmem_free(del, sizeof (*del));
721 		}
722 		mutex_exit(&zone->zone_lock);
723 	}
724 	mutex_exit(&zonehash_lock);
725 
726 	return (0);
727 }
728 
729 /*
730  * ZSD counterpart of pthread_setspecific().
731  *
732  * Since all zsd callbacks, including those with no create function,
733  * have an entry in zone_zsd, if the key is registered it is part of
734  * the zone_zsd list.
735  * Return an error if the key wasn't registerd.
736  */
737 int
738 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
739 {
740 	struct zsd_entry *t;
741 
742 	mutex_enter(&zone->zone_lock);
743 	t = zsd_find_mru(&zone->zone_zsd, key);
744 	if (t != NULL) {
745 		/*
746 		 * Replace old value with new
747 		 */
748 		t->zsd_data = (void *)data;
749 		mutex_exit(&zone->zone_lock);
750 		return (0);
751 	}
752 	mutex_exit(&zone->zone_lock);
753 	return (-1);
754 }
755 
756 /*
757  * ZSD counterpart of pthread_getspecific().
758  */
759 void *
760 zone_getspecific(zone_key_t key, zone_t *zone)
761 {
762 	struct zsd_entry *t;
763 	void *data;
764 
765 	mutex_enter(&zone->zone_lock);
766 	t = zsd_find_mru(&zone->zone_zsd, key);
767 	data = (t == NULL ? NULL : t->zsd_data);
768 	mutex_exit(&zone->zone_lock);
769 	return (data);
770 }
771 
772 /*
773  * Function used to initialize a zone's list of ZSD callbacks and data
774  * when the zone is being created.  The callbacks are initialized from
775  * the template list (zsd_registered_keys). The constructor callback is
776  * executed later (once the zone exists and with locks dropped).
777  */
778 static void
779 zone_zsd_configure(zone_t *zone)
780 {
781 	struct zsd_entry *zsdp;
782 	struct zsd_entry *t;
783 
784 	ASSERT(MUTEX_HELD(&zonehash_lock));
785 	ASSERT(list_head(&zone->zone_zsd) == NULL);
786 	mutex_enter(&zone->zone_lock);
787 	mutex_enter(&zsd_key_lock);
788 	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
789 	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
790 		/*
791 		 * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
792 		 * should not have added anything to it.
793 		 */
794 		ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
795 
796 		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
797 		t->zsd_key = zsdp->zsd_key;
798 		t->zsd_create = zsdp->zsd_create;
799 		t->zsd_shutdown = zsdp->zsd_shutdown;
800 		t->zsd_destroy = zsdp->zsd_destroy;
801 		if (zsdp->zsd_create != NULL) {
802 			t->zsd_flags = ZSD_CREATE_NEEDED;
803 			DTRACE_PROBE2(zsd__create__needed,
804 			    zone_t *, zone, zone_key_t, zsdp->zsd_key);
805 		}
806 		list_insert_tail(&zone->zone_zsd, t);
807 	}
808 	mutex_exit(&zsd_key_lock);
809 	mutex_exit(&zone->zone_lock);
810 }
811 
/*
 * The kinds of ZSD callback.  zone_zsd_callbacks() asserts that it is only
 * ever asked for ZSD_SHUTDOWN or ZSD_DESTROY.
 */
enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
813 
814 /*
815  * Helper function to execute shutdown or destructor callbacks.
816  */
817 static void
818 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
819 {
820 	struct zsd_entry *t;
821 
822 	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
823 	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
824 	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
825 
826 	/*
827 	 * Run the callback solely based on what is registered for the zone
828 	 * in zone_zsd. The global list can change independently of this
829 	 * as keys are registered and unregistered and we don't register new
830 	 * callbacks for a zone that is in the process of going away.
831 	 */
832 	mutex_enter(&zone->zone_lock);
833 	for (t = list_head(&zone->zone_zsd); t != NULL;
834 	    t = list_next(&zone->zone_zsd, t)) {
835 		zone_key_t key = t->zsd_key;
836 
837 		/* Skip if no callbacks registered */
838 
839 		if (ct == ZSD_SHUTDOWN) {
840 			if (t->zsd_shutdown != NULL &&
841 			    (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
842 				t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
843 				DTRACE_PROBE2(zsd__shutdown__needed,
844 				    zone_t *, zone, zone_key_t, key);
845 			}
846 		} else {
847 			if (t->zsd_destroy != NULL &&
848 			    (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
849 				t->zsd_flags |= ZSD_DESTROY_NEEDED;
850 				DTRACE_PROBE2(zsd__destroy__needed,
851 				    zone_t *, zone, zone_key_t, key);
852 			}
853 		}
854 	}
855 	mutex_exit(&zone->zone_lock);
856 
857 	/* Now call the shutdown and destroy callback for this key */
858 	zsd_apply_all_keys(zsd_apply_shutdown, zone);
859 	zsd_apply_all_keys(zsd_apply_destroy, zone);
860 
861 }
862 
863 /*
864  * Called when the zone is going away; free ZSD-related memory, and
865  * destroy the zone_zsd list.
866  */
867 static void
868 zone_free_zsd(zone_t *zone)
869 {
870 	struct zsd_entry *t, *next;
871 
872 	/*
873 	 * Free all the zsd_entry's we had on this zone.
874 	 */
875 	mutex_enter(&zone->zone_lock);
876 	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
877 		next = list_next(&zone->zone_zsd, t);
878 		list_remove(&zone->zone_zsd, t);
879 		ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
880 		kmem_free(t, sizeof (*t));
881 	}
882 	list_destroy(&zone->zone_zsd);
883 	mutex_exit(&zone->zone_lock);
884 
885 }
886 
887 /*
888  * Apply a function to all zones for particular key value.
889  *
890  * The applyfn has to drop zonehash_lock if it does some work, and
891  * then reacquire it before it returns.
892  * When the lock is dropped we don't follow list_next even
893  * if it is possible to do so without any hazards. This is
894  * because we want the design to allow for the list of zones
895  * to change in any arbitrary way during the time the
896  * lock was dropped.
897  *
898  * It is safe to restart the loop at list_head since the applyfn
899  * changes the zsd_flags as it does work, so a subsequent
900  * pass through will have no effect in applyfn, hence the loop will terminate
901  * in at worst O(N^2).
902  */
903 static void
904 zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
905 {
906 	zone_t *zone;
907 
908 	mutex_enter(&zonehash_lock);
909 	zone = list_head(&zone_active);
910 	while (zone != NULL) {
911 		if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
912 			/* Lock dropped - restart at head */
913 			zone = list_head(&zone_active);
914 		} else {
915 			zone = list_next(&zone_active, zone);
916 		}
917 	}
918 	mutex_exit(&zonehash_lock);
919 }
920 
921 /*
922  * Apply a function to all keys for a particular zone.
923  *
924  * The applyfn has to drop zonehash_lock if it does some work, and
925  * then reacquire it before it returns.
926  * When the lock is dropped we don't follow list_next even
927  * if it is possible to do so without any hazards. This is
928  * because we want the design to allow for the list of zsd callbacks
929  * to change in any arbitrary way during the time the
930  * lock was dropped.
931  *
932  * It is safe to restart the loop at list_head since the applyfn
933  * changes the zsd_flags as it does work, so a subsequent
934  * pass through will have no effect in applyfn, hence the loop will terminate
935  * in at worst O(N^2).
936  */
937 static void
938 zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
939 {
940 	struct zsd_entry *t;
941 
942 	mutex_enter(&zone->zone_lock);
943 	t = list_head(&zone->zone_zsd);
944 	while (t != NULL) {
945 		if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
946 			/* Lock dropped - restart at head */
947 			t = list_head(&zone->zone_zsd);
948 		} else {
949 			t = list_next(&zone->zone_zsd, t);
950 		}
951 	}
952 	mutex_exit(&zone->zone_lock);
953 }
954 
955 /*
956  * Call the create function for the zone and key if CREATE_NEEDED
957  * is set.
958  * If some other thread gets here first and sets CREATE_INPROGRESS, then
959  * we wait for that thread to complete so that we can ensure that
960  * all the callbacks are done when we've looped over all zones/keys.
961  *
962  * When we call the create function, we drop the global held by the
963  * caller, and return true to tell the caller it needs to re-evalute the
964  * state.
965  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
966  * remains held on exit.
967  */
968 static boolean_t
969 zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
970     zone_t *zone, zone_key_t key)
971 {
972 	void *result;
973 	struct zsd_entry *t;
974 	boolean_t dropped;
975 
976 	if (lockp != NULL) {
977 		ASSERT(MUTEX_HELD(lockp));
978 	}
979 	if (zone_lock_held) {
980 		ASSERT(MUTEX_HELD(&zone->zone_lock));
981 	} else {
982 		mutex_enter(&zone->zone_lock);
983 	}
984 
985 	t = zsd_find(&zone->zone_zsd, key);
986 	if (t == NULL) {
987 		/*
988 		 * Somebody else got here first e.g the zone going
989 		 * away.
990 		 */
991 		if (!zone_lock_held)
992 			mutex_exit(&zone->zone_lock);
993 		return (B_FALSE);
994 	}
995 	dropped = B_FALSE;
996 	if (zsd_wait_for_inprogress(zone, t, lockp))
997 		dropped = B_TRUE;
998 
999 	if (t->zsd_flags & ZSD_CREATE_NEEDED) {
1000 		t->zsd_flags &= ~ZSD_CREATE_NEEDED;
1001 		t->zsd_flags |= ZSD_CREATE_INPROGRESS;
1002 		DTRACE_PROBE2(zsd__create__inprogress,
1003 		    zone_t *, zone, zone_key_t, key);
1004 		mutex_exit(&zone->zone_lock);
1005 		if (lockp != NULL)
1006 			mutex_exit(lockp);
1007 
1008 		dropped = B_TRUE;
1009 		ASSERT(t->zsd_create != NULL);
1010 		DTRACE_PROBE2(zsd__create__start,
1011 		    zone_t *, zone, zone_key_t, key);
1012 
1013 		result = (*t->zsd_create)(zone->zone_id);
1014 
1015 		DTRACE_PROBE2(zsd__create__end,
1016 		    zone_t *, zone, voidn *, result);
1017 
1018 		ASSERT(result != NULL);
1019 		if (lockp != NULL)
1020 			mutex_enter(lockp);
1021 		mutex_enter(&zone->zone_lock);
1022 		t->zsd_data = result;
1023 		t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
1024 		t->zsd_flags |= ZSD_CREATE_COMPLETED;
1025 		cv_broadcast(&t->zsd_cv);
1026 		DTRACE_PROBE2(zsd__create__completed,
1027 		    zone_t *, zone, zone_key_t, key);
1028 	}
1029 	if (!zone_lock_held)
1030 		mutex_exit(&zone->zone_lock);
1031 	return (dropped);
1032 }
1033 
/*
 * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the shutdown function, we drop the global lock (lockp, if
 * non-NULL) held by the caller as well as zone_lock, and return true to
 * tell the caller it needs to re-evaluate the state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 *
 * Returns B_TRUE if a lock was dropped at any point (while waiting or
 * while running the callback), B_FALSE otherwise, including when the zone
 * has no entry for the key.
 */
static boolean_t
zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
	struct zsd_entry *t;
	void *data;
	boolean_t dropped;

	if (lockp != NULL) {
		ASSERT(MUTEX_HELD(lockp));
	}
	if (zone_lock_held) {
		ASSERT(MUTEX_HELD(&zone->zone_lock));
	} else {
		mutex_enter(&zone->zone_lock);
	}

	t = zsd_find(&zone->zone_zsd, key);
	if (t == NULL) {
		/*
		 * Somebody else got here first e.g the zone going
		 * away.
		 */
		if (!zone_lock_held)
			mutex_exit(&zone->zone_lock);
		return (B_FALSE);
	}
	dropped = B_FALSE;
	/* A pending create must have been picked up before we shut down. */
	if (zsd_wait_for_creator(zone, t, lockp))
		dropped = B_TRUE;

	/* Likewise wait out any callback currently running for this entry. */
	if (zsd_wait_for_inprogress(zone, t, lockp))
		dropped = B_TRUE;

	if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
		/*
		 * Claim the callback while zone_lock is still held so that
		 * other threads see INPROGRESS and wait for us.
		 */
		t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
		t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
		DTRACE_PROBE2(zsd__shutdown__inprogress,
		    zone_t *, zone, zone_key_t, key);
		/* The callback is run with neither lock held. */
		mutex_exit(&zone->zone_lock);
		if (lockp != NULL)
			mutex_exit(lockp);
		dropped = B_TRUE;

		ASSERT(t->zsd_shutdown != NULL);
		data = t->zsd_data;

		DTRACE_PROBE2(zsd__shutdown__start,
		    zone_t *, zone, zone_key_t, key);

		(t->zsd_shutdown)(zone->zone_id, data);
		DTRACE_PROBE2(zsd__shutdown__end,
		    zone_t *, zone, zone_key_t, key);

		/* Reacquire in lock order: lockp first, then zone_lock. */
		if (lockp != NULL)
			mutex_enter(lockp);
		mutex_enter(&zone->zone_lock);
		t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
		t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
		/* Wake any thread waiting on this entry's state. */
		cv_broadcast(&t->zsd_cv);
		DTRACE_PROBE2(zsd__shutdown__completed,
		    zone_t *, zone, zone_key_t, key);
	}
	if (!zone_lock_held)
		mutex_exit(&zone->zone_lock);
	return (dropped);
}
1114 
/*
 * Call the destroy function for the zone and key if DESTROY_NEEDED
 * is set.
 * If some other thread gets here first and sets *_INPROGRESS, then
 * we wait for that thread to complete so that we can ensure that
 * all the callbacks are done when we've looped over all zones/keys.
 *
 * When we call the destroy function, we drop the global lock (lockp, if
 * non-NULL) held by the caller as well as zone_lock, and return true to
 * tell the caller it needs to re-evaluate the state.
 * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
 * remains held on exit.
 *
 * Returns B_TRUE if a lock was dropped at any point (while waiting or
 * while running the callback), B_FALSE otherwise, including when the zone
 * has no entry for the key.
 */
static boolean_t
zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
    zone_t *zone, zone_key_t key)
{
	struct zsd_entry *t;
	void *data;
	boolean_t dropped;

	if (lockp != NULL) {
		ASSERT(MUTEX_HELD(lockp));
	}
	if (zone_lock_held) {
		ASSERT(MUTEX_HELD(&zone->zone_lock));
	} else {
		mutex_enter(&zone->zone_lock);
	}

	t = zsd_find(&zone->zone_zsd, key);
	if (t == NULL) {
		/*
		 * Somebody else got here first e.g the zone going
		 * away.
		 */
		if (!zone_lock_held)
			mutex_exit(&zone->zone_lock);
		return (B_FALSE);
	}
	dropped = B_FALSE;
	/* A pending create must have been picked up before we destroy. */
	if (zsd_wait_for_creator(zone, t, lockp))
		dropped = B_TRUE;

	/* Likewise wait out any callback currently running for this entry. */
	if (zsd_wait_for_inprogress(zone, t, lockp))
		dropped = B_TRUE;

	if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
		/*
		 * Claim the callback while zone_lock is still held so that
		 * other threads see INPROGRESS and wait for us.
		 */
		t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
		t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
		DTRACE_PROBE2(zsd__destroy__inprogress,
		    zone_t *, zone, zone_key_t, key);
		/* The callback is run with neither lock held. */
		mutex_exit(&zone->zone_lock);
		if (lockp != NULL)
			mutex_exit(lockp);
		dropped = B_TRUE;

		ASSERT(t->zsd_destroy != NULL);
		data = t->zsd_data;
		DTRACE_PROBE2(zsd__destroy__start,
		    zone_t *, zone, zone_key_t, key);

		(t->zsd_destroy)(zone->zone_id, data);
		DTRACE_PROBE2(zsd__destroy__end,
		    zone_t *, zone, zone_key_t, key);

		/* Reacquire in lock order: lockp first, then zone_lock. */
		if (lockp != NULL)
			mutex_enter(lockp);
		mutex_enter(&zone->zone_lock);
		/* The data was handed to the destructor; forget it. */
		t->zsd_data = NULL;
		t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
		t->zsd_flags |= ZSD_DESTROY_COMPLETED;
		/* Wake any thread waiting on this entry's state. */
		cv_broadcast(&t->zsd_cv);
		DTRACE_PROBE2(zsd__destroy__completed,
		    zone_t *, zone, zone_key_t, key);
	}
	if (!zone_lock_held)
		mutex_exit(&zone->zone_lock);
	return (dropped);
}
1195 
1196 /*
1197  * Wait for any CREATE_NEEDED flag to be cleared.
1198  * Returns true if lockp was temporarily dropped while waiting.
1199  */
1200 static boolean_t
1201 zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1202 {
1203 	boolean_t dropped = B_FALSE;
1204 
1205 	while (t->zsd_flags & ZSD_CREATE_NEEDED) {
1206 		DTRACE_PROBE2(zsd__wait__for__creator,
1207 		    zone_t *, zone, struct zsd_entry *, t);
1208 		if (lockp != NULL) {
1209 			dropped = B_TRUE;
1210 			mutex_exit(lockp);
1211 		}
1212 		cv_wait(&t->zsd_cv, &zone->zone_lock);
1213 		if (lockp != NULL) {
1214 			/* First drop zone_lock to preserve order */
1215 			mutex_exit(&zone->zone_lock);
1216 			mutex_enter(lockp);
1217 			mutex_enter(&zone->zone_lock);
1218 		}
1219 	}
1220 	return (dropped);
1221 }
1222 
1223 /*
1224  * Wait for any INPROGRESS flag to be cleared.
1225  * Returns true if lockp was temporarily dropped while waiting.
1226  */
1227 static boolean_t
1228 zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
1229 {
1230 	boolean_t dropped = B_FALSE;
1231 
1232 	while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
1233 		DTRACE_PROBE2(zsd__wait__for__inprogress,
1234 		    zone_t *, zone, struct zsd_entry *, t);
1235 		if (lockp != NULL) {
1236 			dropped = B_TRUE;
1237 			mutex_exit(lockp);
1238 		}
1239 		cv_wait(&t->zsd_cv, &zone->zone_lock);
1240 		if (lockp != NULL) {
1241 			/* First drop zone_lock to preserve order */
1242 			mutex_exit(&zone->zone_lock);
1243 			mutex_enter(lockp);
1244 			mutex_enter(&zone->zone_lock);
1245 		}
1246 	}
1247 	return (dropped);
1248 }
1249 
1250 /*
1251  * Frees memory associated with the zone dataset list.
1252  */
1253 static void
1254 zone_free_datasets(zone_t *zone)
1255 {
1256 	zone_dataset_t *t, *next;
1257 
1258 	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
1259 		next = list_next(&zone->zone_datasets, t);
1260 		list_remove(&zone->zone_datasets, t);
1261 		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
1262 		kmem_free(t, sizeof (*t));
1263 	}
1264 	list_destroy(&zone->zone_datasets);
1265 }
1266 
1267 /*
1268  * zone.cpu-shares resource control support.
1269  */
1270 /*ARGSUSED*/
1271 static rctl_qty_t
1272 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
1273 {
1274 	ASSERT(MUTEX_HELD(&p->p_lock));
1275 	return (p->p_zone->zone_shares);
1276 }
1277 
1278 /*ARGSUSED*/
1279 static int
1280 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1281     rctl_qty_t nv)
1282 {
1283 	ASSERT(MUTEX_HELD(&p->p_lock));
1284 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1285 	if (e->rcep_p.zone == NULL)
1286 		return (0);
1287 
1288 	e->rcep_p.zone->zone_shares = nv;
1289 	return (0);
1290 }
1291 
/*
 * Operations vector passed to rctl_register() for "zone.cpu-shares" in
 * zone_init().  The initializer is positional; the rcop_no_* entries look
 * like the rctl framework's default stubs -- confirm against the
 * definition of rctl_ops_t.
 */
static rctl_ops_t zone_cpu_shares_ops = {
	rcop_no_action,
	zone_cpu_shares_usage,
	zone_cpu_shares_set,
	rcop_no_test
};
1298 
1299 /*
1300  * zone.cpu-cap resource control support.
1301  */
1302 /*ARGSUSED*/
1303 static rctl_qty_t
1304 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
1305 {
1306 	ASSERT(MUTEX_HELD(&p->p_lock));
1307 	return (cpucaps_zone_get(p->p_zone));
1308 }
1309 
1310 /*ARGSUSED*/
1311 static int
1312 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1313     rctl_qty_t nv)
1314 {
1315 	zone_t *zone = e->rcep_p.zone;
1316 
1317 	ASSERT(MUTEX_HELD(&p->p_lock));
1318 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1319 
1320 	if (zone == NULL)
1321 		return (0);
1322 
1323 	/*
1324 	 * set cap to the new value.
1325 	 */
1326 	return (cpucaps_zone_set(zone, nv));
1327 }
1328 
/*
 * Operations vector passed to rctl_register() for "zone.cpu-cap" in
 * zone_init().  The initializer is positional; the rcop_no_* entries look
 * like the rctl framework's default stubs -- confirm against the
 * definition of rctl_ops_t.
 */
static rctl_ops_t zone_cpu_cap_ops = {
	rcop_no_action,
	zone_cpu_cap_get,
	zone_cpu_cap_set,
	rcop_no_test
};
1335 
1336 /*ARGSUSED*/
1337 static rctl_qty_t
1338 zone_lwps_usage(rctl_t *r, proc_t *p)
1339 {
1340 	rctl_qty_t nlwps;
1341 	zone_t *zone = p->p_zone;
1342 
1343 	ASSERT(MUTEX_HELD(&p->p_lock));
1344 
1345 	mutex_enter(&zone->zone_nlwps_lock);
1346 	nlwps = zone->zone_nlwps;
1347 	mutex_exit(&zone->zone_nlwps_lock);
1348 
1349 	return (nlwps);
1350 }
1351 
1352 /*ARGSUSED*/
1353 static int
1354 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
1355     rctl_qty_t incr, uint_t flags)
1356 {
1357 	rctl_qty_t nlwps;
1358 
1359 	ASSERT(MUTEX_HELD(&p->p_lock));
1360 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1361 	if (e->rcep_p.zone == NULL)
1362 		return (0);
1363 	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
1364 	nlwps = e->rcep_p.zone->zone_nlwps;
1365 
1366 	if (nlwps + incr > rcntl->rcv_value)
1367 		return (1);
1368 
1369 	return (0);
1370 }
1371 
1372 /*ARGSUSED*/
1373 static int
1374 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
1375 {
1376 	ASSERT(MUTEX_HELD(&p->p_lock));
1377 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1378 	if (e->rcep_p.zone == NULL)
1379 		return (0);
1380 	e->rcep_p.zone->zone_nlwps_ctl = nv;
1381 	return (0);
1382 }
1383 
/*
 * Operations vector passed to rctl_register() for "zone.max-lwps" in
 * zone_init().  The initializer is positional; rcop_no_action looks like
 * the rctl framework's default stub -- confirm against the definition of
 * rctl_ops_t.
 */
static rctl_ops_t zone_lwps_ops = {
	rcop_no_action,
	zone_lwps_usage,
	zone_lwps_set,
	zone_lwps_test,
};
1390 
1391 /*ARGSUSED*/
1392 static int
1393 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1394     rctl_qty_t incr, uint_t flags)
1395 {
1396 	rctl_qty_t v;
1397 	ASSERT(MUTEX_HELD(&p->p_lock));
1398 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1399 	v = e->rcep_p.zone->zone_shmmax + incr;
1400 	if (v > rval->rcv_value)
1401 		return (1);
1402 	return (0);
1403 }
1404 
/*
 * Operations vector passed to rctl_register() for "zone.max-shm-memory"
 * in zone_init().  Only the test entry is specific to this control; the
 * rcop_no_* entries look like the rctl framework's default stubs --
 * confirm against the definition of rctl_ops_t.
 */
static rctl_ops_t zone_shmmax_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_shmmax_test
};
1411 
1412 /*ARGSUSED*/
1413 static int
1414 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1415     rctl_qty_t incr, uint_t flags)
1416 {
1417 	rctl_qty_t v;
1418 	ASSERT(MUTEX_HELD(&p->p_lock));
1419 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1420 	v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
1421 	if (v > rval->rcv_value)
1422 		return (1);
1423 	return (0);
1424 }
1425 
/*
 * Operations vector passed to rctl_register() for "zone.max-shm-ids" in
 * zone_init().  Only the test entry is specific to this control; the
 * rcop_no_* entries look like the rctl framework's default stubs --
 * confirm against the definition of rctl_ops_t.
 */
static rctl_ops_t zone_shmmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_shmmni_test
};
1432 
1433 /*ARGSUSED*/
1434 static int
1435 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1436     rctl_qty_t incr, uint_t flags)
1437 {
1438 	rctl_qty_t v;
1439 	ASSERT(MUTEX_HELD(&p->p_lock));
1440 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1441 	v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
1442 	if (v > rval->rcv_value)
1443 		return (1);
1444 	return (0);
1445 }
1446 
/*
 * Operations vector passed to rctl_register() for "zone.max-sem-ids" in
 * zone_init().  Only the test entry is specific to this control; the
 * rcop_no_* entries look like the rctl framework's default stubs --
 * confirm against the definition of rctl_ops_t.
 */
static rctl_ops_t zone_semmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_semmni_test
};
1453 
1454 /*ARGSUSED*/
1455 static int
1456 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1457     rctl_qty_t incr, uint_t flags)
1458 {
1459 	rctl_qty_t v;
1460 	ASSERT(MUTEX_HELD(&p->p_lock));
1461 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1462 	v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1463 	if (v > rval->rcv_value)
1464 		return (1);
1465 	return (0);
1466 }
1467 
/*
 * Operations vector passed to rctl_register() for "zone.max-msg-ids" in
 * zone_init().  Only the test entry is specific to this control; the
 * rcop_no_* entries look like the rctl framework's default stubs --
 * confirm against the definition of rctl_ops_t.
 */
static rctl_ops_t zone_msgmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_msgmni_test
};
1474 
1475 /*ARGSUSED*/
1476 static rctl_qty_t
1477 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1478 {
1479 	rctl_qty_t q;
1480 	ASSERT(MUTEX_HELD(&p->p_lock));
1481 	mutex_enter(&p->p_zone->zone_mem_lock);
1482 	q = p->p_zone->zone_locked_mem;
1483 	mutex_exit(&p->p_zone->zone_mem_lock);
1484 	return (q);
1485 }
1486 
1487 /*ARGSUSED*/
1488 static int
1489 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1490     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1491 {
1492 	rctl_qty_t q;
1493 	zone_t *z;
1494 
1495 	z = e->rcep_p.zone;
1496 	ASSERT(MUTEX_HELD(&p->p_lock));
1497 	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1498 	q = z->zone_locked_mem;
1499 	if (q + incr > rcntl->rcv_value)
1500 		return (1);
1501 	return (0);
1502 }
1503 
1504 /*ARGSUSED*/
1505 static int
1506 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1507     rctl_qty_t nv)
1508 {
1509 	ASSERT(MUTEX_HELD(&p->p_lock));
1510 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1511 	if (e->rcep_p.zone == NULL)
1512 		return (0);
1513 	e->rcep_p.zone->zone_locked_mem_ctl = nv;
1514 	return (0);
1515 }
1516 
/*
 * Operations vector passed to rctl_register() for
 * "zone.max-locked-memory" in zone_init().  The initializer is
 * positional; rcop_no_action looks like the rctl framework's default stub
 * -- confirm against the definition of rctl_ops_t.
 */
static rctl_ops_t zone_locked_mem_ops = {
	rcop_no_action,
	zone_locked_mem_usage,
	zone_locked_mem_set,
	zone_locked_mem_test
};
1523 
1524 /*ARGSUSED*/
1525 static rctl_qty_t
1526 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1527 {
1528 	rctl_qty_t q;
1529 	zone_t *z = p->p_zone;
1530 
1531 	ASSERT(MUTEX_HELD(&p->p_lock));
1532 	mutex_enter(&z->zone_mem_lock);
1533 	q = z->zone_max_swap;
1534 	mutex_exit(&z->zone_mem_lock);
1535 	return (q);
1536 }
1537 
1538 /*ARGSUSED*/
1539 static int
1540 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1541     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1542 {
1543 	rctl_qty_t q;
1544 	zone_t *z;
1545 
1546 	z = e->rcep_p.zone;
1547 	ASSERT(MUTEX_HELD(&p->p_lock));
1548 	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1549 	q = z->zone_max_swap;
1550 	if (q + incr > rcntl->rcv_value)
1551 		return (1);
1552 	return (0);
1553 }
1554 
1555 /*ARGSUSED*/
1556 static int
1557 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1558     rctl_qty_t nv)
1559 {
1560 	ASSERT(MUTEX_HELD(&p->p_lock));
1561 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1562 	if (e->rcep_p.zone == NULL)
1563 		return (0);
1564 	e->rcep_p.zone->zone_max_swap_ctl = nv;
1565 	return (0);
1566 }
1567 
/*
 * Operations vector passed to rctl_register() for "zone.max-swap" in
 * zone_init().  The initializer is positional; rcop_no_action looks like
 * the rctl framework's default stub -- confirm against the definition of
 * rctl_ops_t.
 */
static rctl_ops_t zone_max_swap_ops = {
	rcop_no_action,
	zone_max_swap_usage,
	zone_max_swap_set,
	zone_max_swap_test
};
1574 
1575 /*
1576  * Helper function to brand the zone with a unique ID.
1577  */
1578 static void
1579 zone_uniqid(zone_t *zone)
1580 {
1581 	static uint64_t uniqid = 0;
1582 
1583 	ASSERT(MUTEX_HELD(&zonehash_lock));
1584 	zone->zone_uniqid = uniqid++;
1585 }
1586 
1587 /*
1588  * Returns a held pointer to the "kcred" for the specified zone.
1589  */
1590 struct cred *
1591 zone_get_kcred(zoneid_t zoneid)
1592 {
1593 	zone_t *zone;
1594 	cred_t *cr;
1595 
1596 	if ((zone = zone_find_by_id(zoneid)) == NULL)
1597 		return (NULL);
1598 	cr = zone->zone_kcred;
1599 	crhold(cr);
1600 	zone_rele(zone);
1601 	return (cr);
1602 }
1603 
1604 static int
1605 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1606 {
1607 	zone_t *zone = ksp->ks_private;
1608 	zone_kstat_t *zk = ksp->ks_data;
1609 
1610 	if (rw == KSTAT_WRITE)
1611 		return (EACCES);
1612 
1613 	zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1614 	zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1615 	return (0);
1616 }
1617 
1618 static int
1619 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1620 {
1621 	zone_t *zone = ksp->ks_private;
1622 	zone_kstat_t *zk = ksp->ks_data;
1623 
1624 	if (rw == KSTAT_WRITE)
1625 		return (EACCES);
1626 
1627 	zk->zk_usage.value.ui64 = zone->zone_max_swap;
1628 	zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1629 	return (0);
1630 }
1631 
1632 static void
1633 zone_kstat_create(zone_t *zone)
1634 {
1635 	kstat_t *ksp;
1636 	zone_kstat_t *zk;
1637 
1638 	ksp = rctl_kstat_create_zone(zone, "lockedmem", KSTAT_TYPE_NAMED,
1639 	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1640 	    KSTAT_FLAG_VIRTUAL);
1641 
1642 	if (ksp == NULL)
1643 		return;
1644 
1645 	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1646 	ksp->ks_data_size += strlen(zone->zone_name) + 1;
1647 	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1648 	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1649 	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1650 	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1651 	ksp->ks_update = zone_lockedmem_kstat_update;
1652 	ksp->ks_private = zone;
1653 	kstat_install(ksp);
1654 
1655 	zone->zone_lockedmem_kstat = ksp;
1656 
1657 	ksp = rctl_kstat_create_zone(zone, "swapresv", KSTAT_TYPE_NAMED,
1658 	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1659 	    KSTAT_FLAG_VIRTUAL);
1660 
1661 	if (ksp == NULL)
1662 		return;
1663 
1664 	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1665 	ksp->ks_data_size += strlen(zone->zone_name) + 1;
1666 	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1667 	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1668 	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1669 	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1670 	ksp->ks_update = zone_swapresv_kstat_update;
1671 	ksp->ks_private = zone;
1672 	kstat_install(ksp);
1673 
1674 	zone->zone_swapresv_kstat = ksp;
1675 }
1676 
1677 static void
1678 zone_kstat_delete(zone_t *zone)
1679 {
1680 	void *data;
1681 
1682 	if (zone->zone_lockedmem_kstat != NULL) {
1683 		data = zone->zone_lockedmem_kstat->ks_data;
1684 		kstat_delete(zone->zone_lockedmem_kstat);
1685 		kmem_free(data, sizeof (zone_kstat_t));
1686 	}
1687 	if (zone->zone_swapresv_kstat != NULL) {
1688 		data = zone->zone_swapresv_kstat->ks_data;
1689 		kstat_delete(zone->zone_swapresv_kstat);
1690 		kmem_free(data, sizeof (zone_kstat_t));
1691 	}
1692 }
1693 
/*
 * Called very early on in boot to initialize the ZSD list so that
 * zone_key_create() can be called before zone_init().  It also initializes
 * portions of zone0 which may be used before zone_init() is called.  The
 * variable "global_zone" will be set when zone0 is fully initialized by
 * zone_init().
 */
void
zone_zsd_init(void)
{
	/* Global locks and lists used by the zone and ZSD code. */
	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_create(&zone_active, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));
	list_create(&zone_deathrow, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));

	/* Locks and initial field values for zone0. */
	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
	zone0.zone_shares = 1;
	zone0.zone_nlwps = 0;
	zone0.zone_nlwps_ctl = INT_MAX;
	zone0.zone_locked_mem = 0;
	zone0.zone_locked_mem_ctl = UINT64_MAX;
	ASSERT(zone0.zone_max_swap == 0);
	zone0.zone_max_swap_ctl = UINT64_MAX;
	zone0.zone_shmmax = 0;
	zone0.zone_ipc.ipcq_shmmni = 0;
	zone0.zone_ipc.ipcq_semmni = 0;
	zone0.zone_ipc.ipcq_msgmni = 0;
	zone0.zone_name = GLOBAL_ZONENAME;
	zone0.zone_nodename = utsname.nodename;
	zone0.zone_domain = srpc_domain;
	zone0.zone_ref = 1;
	zone0.zone_id = GLOBAL_ZONEID;
	zone0.zone_status = ZONE_IS_RUNNING;
	zone0.zone_rootpath = "/";
	/* presumably counts the terminating NUL of "/" -- confirm */
	zone0.zone_rootpathlen = 2;
	zone0.zone_psetid = ZONE_PS_INVAL;
	zone0.zone_ncpus = 0;
	zone0.zone_ncpus_online = 0;
	zone0.zone_proc_initpid = 1;
	zone0.zone_initname = initname;
	/* The kstats are created later, by zone_init(). */
	zone0.zone_lockedmem_kstat = NULL;
	zone0.zone_swapresv_kstat = NULL;
	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	/* zone0 goes on the active-zone list right away. */
	list_insert_head(&zone_active, &zone0);

	/*
	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
	 * to anything meaningful.  It is assigned to be 'rootdir' in
	 * vfs_mountroot().
	 */
	zone0.zone_rootvp = NULL;
	zone0.zone_vfslist = NULL;
	zone0.zone_bootargs = initargs;
	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
	/*
	 * The global zone has all privileges
	 */
	priv_fillset(zone0.zone_privset);
	/*
	 * Add p0 to the global zone
	 */
	zone0.zone_zsched = &p0;
	p0.p_zone = &zone0;
}
1765 
1766 /*
1767  * Compute a hash value based on the contents of the label and the DOI.  The
1768  * hash algorithm is somewhat arbitrary, but is based on the observation that
1769  * humans will likely pick labels that differ by amounts that work out to be
1770  * multiples of the number of hash chains, and thus stirring in some primes
1771  * should help.
1772  */
1773 static uint_t
1774 hash_bylabel(void *hdata, mod_hash_key_t key)
1775 {
1776 	const ts_label_t *lab = (ts_label_t *)key;
1777 	const uint32_t *up, *ue;
1778 	uint_t hash;
1779 	int i;
1780 
1781 	_NOTE(ARGUNUSED(hdata));
1782 
1783 	hash = lab->tsl_doi + (lab->tsl_doi << 1);
1784 	/* we depend on alignment of label, but not representation */
1785 	up = (const uint32_t *)&lab->tsl_label;
1786 	ue = up + sizeof (lab->tsl_label) / sizeof (*up);
1787 	i = 1;
1788 	while (up < ue) {
1789 		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
1790 		hash += *up + (*up << ((i % 16) + 1));
1791 		up++;
1792 		i++;
1793 	}
1794 	return (hash);
1795 }
1796 
1797 /*
1798  * All that mod_hash cares about here is zero (equal) versus non-zero (not
1799  * equal).  This may need to be changed if less than / greater than is ever
1800  * needed.
1801  */
1802 static int
1803 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1804 {
1805 	ts_label_t *lab1 = (ts_label_t *)key1;
1806 	ts_label_t *lab2 = (ts_label_t *)key2;
1807 
1808 	return (label_equal(lab1, lab2) ? 0 : 1);
1809 }
1810 
1811 /*
1812  * Called by main() to initialize the zones framework.
1813  */
1814 void
1815 zone_init(void)
1816 {
1817 	rctl_dict_entry_t *rde;
1818 	rctl_val_t *dval;
1819 	rctl_set_t *set;
1820 	rctl_alloc_gp_t *gp;
1821 	rctl_entity_p_t e;
1822 	int res;
1823 
1824 	ASSERT(curproc == &p0);
1825 
1826 	/*
1827 	 * Create ID space for zone IDs.  ID 0 is reserved for the
1828 	 * global zone.
1829 	 */
1830 	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
1831 
1832 	/*
1833 	 * Initialize generic zone resource controls, if any.
1834 	 */
1835 	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
1836 	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
1837 	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
1838 	    FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
1839 
1840 	rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
1841 	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
1842 	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
1843 	    RCTL_GLOBAL_INFINITE,
1844 	    MAXCAP, MAXCAP, &zone_cpu_cap_ops);
1845 
1846 	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
1847 	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
1848 	    INT_MAX, INT_MAX, &zone_lwps_ops);
1849 	/*
1850 	 * System V IPC resource controls
1851 	 */
1852 	rc_zone_msgmni = rctl_register("zone.max-msg-ids",
1853 	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
1854 	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
1855 
1856 	rc_zone_semmni = rctl_register("zone.max-sem-ids",
1857 	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
1858 	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
1859 
1860 	rc_zone_shmmni = rctl_register("zone.max-shm-ids",
1861 	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
1862 	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
1863 
1864 	rc_zone_shmmax = rctl_register("zone.max-shm-memory",
1865 	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
1866 	    RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
1867 
1868 	/*
1869 	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
1870 	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
1871 	 */
1872 	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
1873 	bzero(dval, sizeof (rctl_val_t));
1874 	dval->rcv_value = 1;
1875 	dval->rcv_privilege = RCPRIV_PRIVILEGED;
1876 	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
1877 	dval->rcv_action_recip_pid = -1;
1878 
1879 	rde = rctl_dict_lookup("zone.cpu-shares");
1880 	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
1881 
1882 	rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
1883 	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
1884 	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
1885 	    &zone_locked_mem_ops);
1886 
1887 	rc_zone_max_swap = rctl_register("zone.max-swap",
1888 	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
1889 	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
1890 	    &zone_max_swap_ops);
1891 
1892 	/*
1893 	 * Initialize the ``global zone''.
1894 	 */
1895 	set = rctl_set_create();
1896 	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
1897 	mutex_enter(&p0.p_lock);
1898 	e.rcep_p.zone = &zone0;
1899 	e.rcep_t = RCENTITY_ZONE;
1900 	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
1901 	    gp);
1902 
1903 	zone0.zone_nlwps = p0.p_lwpcnt;
1904 	zone0.zone_ntasks = 1;
1905 	mutex_exit(&p0.p_lock);
1906 	zone0.zone_restart_init = B_TRUE;
1907 	zone0.zone_brand = &native_brand;
1908 	rctl_prealloc_destroy(gp);
1909 	/*
1910 	 * pool_default hasn't been initialized yet, so we let pool_init()
1911 	 * take care of making sure the global zone is in the default pool.
1912 	 */
1913 
1914 	/*
1915 	 * Initialize global zone kstats
1916 	 */
1917 	zone_kstat_create(&zone0);
1918 
1919 	/*
1920 	 * Initialize zone label.
1921 	 * mlp are initialized when tnzonecfg is loaded.
1922 	 */
1923 	zone0.zone_slabel = l_admin_low;
1924 	rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
1925 	label_hold(l_admin_low);
1926 
1927 	mutex_enter(&zonehash_lock);
1928 	zone_uniqid(&zone0);
1929 	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
1930 
1931 	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
1932 	    mod_hash_null_valdtor);
1933 	zonehashbyname = mod_hash_create_strhash("zone_by_name",
1934 	    zone_hash_size, mod_hash_null_valdtor);
1935 	/*
1936 	 * maintain zonehashbylabel only for labeled systems
1937 	 */
1938 	if (is_system_labeled())
1939 		zonehashbylabel = mod_hash_create_extended("zone_by_label",
1940 		    zone_hash_size, mod_hash_null_keydtor,
1941 		    mod_hash_null_valdtor, hash_bylabel, NULL,
1942 		    hash_labelkey_cmp, KM_SLEEP);
1943 	zonecount = 1;
1944 
1945 	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
1946 	    (mod_hash_val_t)&zone0);
1947 	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
1948 	    (mod_hash_val_t)&zone0);
1949 	if (is_system_labeled()) {
1950 		zone0.zone_flags |= ZF_HASHED_LABEL;
1951 		(void) mod_hash_insert(zonehashbylabel,
1952 		    (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
1953 	}
1954 	mutex_exit(&zonehash_lock);
1955 
1956 	/*
1957 	 * We avoid setting zone_kcred until now, since kcred is initialized
1958 	 * sometime after zone_zsd_init() and before zone_init().
1959 	 */
1960 	zone0.zone_kcred = kcred;
1961 	/*
1962 	 * The global zone is fully initialized (except for zone_rootvp which
1963 	 * will be set when the root filesystem is mounted).
1964 	 */
1965 	global_zone = &zone0;
1966 
1967 	/*
1968 	 * Setup an event channel to send zone status change notifications on
1969 	 */
1970 	res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
1971 	    EVCH_CREAT);
1972 
1973 	if (res)
1974 		panic("Sysevent_evc_bind failed during zone setup.\n");
1975 
1976 }
1977 
/*
 * Tear down a non-global zone_t and return its memory.
 *
 * The ASSERTs below spell out the preconditions: the zone is not the
 * global zone, it has no tasks, LWPs or credential references left, its
 * kcred has been cleared, and its status is either ZONE_IS_DEAD or
 * ZONE_IS_UNINITIALIZED.
 */
static void
zone_free(zone_t *zone)
{
	ASSERT(zone != global_zone);
	ASSERT(zone->zone_ntasks == 0);
	ASSERT(zone->zone_nlwps == 0);
	ASSERT(zone->zone_cred_ref == 0);
	ASSERT(zone->zone_kcred == NULL);
	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);

	/*
	 * Remove any zone caps.
	 */
	cpucaps_zone_remove(zone);

	ASSERT(zone->zone_cpucap == NULL);

	/* remove from deathrow list */
	if (zone_status_get(zone) == ZONE_IS_DEAD) {
		ASSERT(zone->zone_ref == 0);
		mutex_enter(&zone_deathrow_lock);
		list_remove(&zone_deathrow, zone);
		mutex_exit(&zone_deathrow_lock);
	}

	zone_free_zsd(zone);
	zone_free_datasets(zone);

	/*
	 * Each member below is released only if it was set.  The sizes
	 * passed to kmem_free() are the fixed sizes used here for the name
	 * buffers and privilege set; the boot arguments and init name are
	 * freed by their string length plus the terminating NUL.
	 */
	if (zone->zone_rootvp != NULL)
		VN_RELE(zone->zone_rootvp);
	if (zone->zone_rootpath)
		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
	if (zone->zone_name != NULL)
		kmem_free(zone->zone_name, ZONENAME_MAX);
	if (zone->zone_slabel != NULL)
		label_rele(zone->zone_slabel);
	if (zone->zone_nodename != NULL)
		kmem_free(zone->zone_nodename, _SYS_NMLN);
	if (zone->zone_domain != NULL)
		kmem_free(zone->zone_domain, _SYS_NMLN);
	if (zone->zone_privset != NULL)
		kmem_free(zone->zone_privset, sizeof (priv_set_t));
	if (zone->zone_rctls != NULL)
		rctl_set_free(zone->zone_rctls);
	if (zone->zone_bootargs != NULL)
		kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1);
	if (zone->zone_initname != NULL)
		kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1);
	/* Give the zone ID back, then destroy the locks and the zone_t. */
	id_free(zoneid_space, zone->zone_id);
	mutex_destroy(&zone->zone_lock);
	cv_destroy(&zone->zone_cv);
	rw_destroy(&zone->zone_mlps.mlpl_rwlock);
	kmem_free(zone, sizeof (zone_t));
}
2033 
2034 /*
2035  * See block comment at the top of this file for information about zone
2036  * status values.
2037  */
2038 /*
2039  * Convenience function for setting zone status.
2040  */
2041 static void
2042 zone_status_set(zone_t *zone, zone_status_t status)
2043 {
2044 
2045 	nvlist_t *nvl = NULL;
2046 	ASSERT(MUTEX_HELD(&zone_status_lock));
2047 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
2048 	    status >= zone_status_get(zone));
2049 
2050 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
2051 	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
2052 	    nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
2053 	    zone_status_table[status]) ||
2054 	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
2055 	    zone_status_table[zone->zone_status]) ||
2056 	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
2057 	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
2058 	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
2059 	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
2060 #ifdef DEBUG
2061 		(void) printf(
2062 		    "Failed to allocate and send zone state change event.\n");
2063 #endif
2064 	}
2065 	nvlist_free(nvl);
2066 
2067 	zone->zone_status = status;
2068 
2069 	cv_broadcast(&zone->zone_cv);
2070 }
2071 
2072 /*
2073  * Public function to retrieve the zone status.  The zone status may
2074  * change after it is retrieved.
2075  */
2076 zone_status_t
2077 zone_status_get(zone_t *zone)
2078 {
2079 	return (zone->zone_status);
2080 }
2081 
2082 static int
2083 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
2084 {
2085 	char *bootargs = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
2086 	int err = 0;
2087 
2088 	ASSERT(zone != global_zone);
2089 	if ((err = copyinstr(zone_bootargs, bootargs, BOOTARGS_MAX, NULL)) != 0)
2090 		goto done;	/* EFAULT or ENAMETOOLONG */
2091 
2092 	if (zone->zone_bootargs != NULL)
2093 		kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1);
2094 
2095 	zone->zone_bootargs = kmem_alloc(strlen(bootargs) + 1, KM_SLEEP);
2096 	(void) strcpy(zone->zone_bootargs, bootargs);
2097 
2098 done:
2099 	kmem_free(bootargs, BOOTARGS_MAX);
2100 	return (err);
2101 }
2102 
2103 static int
2104 zone_set_brand(zone_t *zone, const char *brand)
2105 {
2106 	struct brand_attr *attrp;
2107 	brand_t *bp;
2108 
2109 	attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
2110 	if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
2111 		kmem_free(attrp, sizeof (struct brand_attr));
2112 		return (EFAULT);
2113 	}
2114 
2115 	bp = brand_register_zone(attrp);
2116 	kmem_free(attrp, sizeof (struct brand_attr));
2117 	if (bp == NULL)
2118 		return (EINVAL);
2119 
2120 	/*
2121 	 * This is the only place where a zone can change it's brand.
2122 	 * We already need to hold zone_status_lock to check the zone
2123 	 * status, so we'll just use that lock to serialize zone
2124 	 * branding requests as well.
2125 	 */
2126 	mutex_enter(&zone_status_lock);
2127 
2128 	/* Re-Branding is not allowed and the zone can't be booted yet */
2129 	if ((ZONE_IS_BRANDED(zone)) ||
2130 	    (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
2131 		mutex_exit(&zone_status_lock);
2132 		brand_unregister_zone(bp);
2133 		return (EINVAL);
2134 	}
2135 
2136 	if (is_system_labeled() &&
2137 	    strncmp(attrp->ba_brandname, NATIVE_BRAND_NAME, MAXNAMELEN) != 0) {
2138 		mutex_exit(&zone_status_lock);
2139 		brand_unregister_zone(bp);
2140 		return (EPERM);
2141 	}
2142 
2143 	/* set up the brand specific data */
2144 	zone->zone_brand = bp;
2145 	ZBROP(zone)->b_init_brand_data(zone);
2146 
2147 	mutex_exit(&zone_status_lock);
2148 	return (0);
2149 }
2150 
2151 static int
2152 zone_set_initname(zone_t *zone, const char *zone_initname)
2153 {
2154 	char initname[INITNAME_SZ];
2155 	size_t len;
2156 	int err = 0;
2157 
2158 	ASSERT(zone != global_zone);
2159 	if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
2160 		return (err);	/* EFAULT or ENAMETOOLONG */
2161 
2162 	if (zone->zone_initname != NULL)
2163 		kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1);
2164 
2165 	zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
2166 	(void) strcpy(zone->zone_initname, initname);
2167 	return (0);
2168 }
2169 
2170 static int
2171 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
2172 {
2173 	uint64_t mcap;
2174 	int err = 0;
2175 
2176 	if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
2177 		zone->zone_phys_mcap = mcap;
2178 
2179 	return (err);
2180 }
2181 
2182 static int
2183 zone_set_sched_class(zone_t *zone, const char *new_class)
2184 {
2185 	char sched_class[PC_CLNMSZ];
2186 	id_t classid;
2187 	int err;
2188 
2189 	ASSERT(zone != global_zone);
2190 	if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
2191 		return (err);	/* EFAULT or ENAMETOOLONG */
2192 
2193 	if (getcid(sched_class, &classid) != 0 || classid == syscid)
2194 		return (set_errno(EINVAL));
2195 	zone->zone_defaultcid = classid;
2196 	ASSERT(zone->zone_defaultcid > 0 &&
2197 	    zone->zone_defaultcid < loaded_classes);
2198 
2199 	return (0);
2200 }
2201 
2202 /*
2203  * Block indefinitely waiting for (zone_status >= status)
2204  */
2205 void
2206 zone_status_wait(zone_t *zone, zone_status_t status)
2207 {
2208 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2209 
2210 	mutex_enter(&zone_status_lock);
2211 	while (zone->zone_status < status) {
2212 		cv_wait(&zone->zone_cv, &zone_status_lock);
2213 	}
2214 	mutex_exit(&zone_status_lock);
2215 }
2216 
/*
 * Private CPR-safe version of zone_status_wait().
 *
 * Blocks until (zone_status >= status).  'str' is handed to
 * CALLB_CPR_INIT() together with zone_status_lock and
 * callb_generic_cpr.
 */
static void
zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
{
	callb_cpr_t cprinfo;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
	    str);
	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		/* Each sleep is bracketed by the CPR safe begin/end pair. */
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&zone->zone_cv, &zone_status_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
	}
	/*
	 * zone_status_lock is implicitly released by the following.
	 */
	CALLB_CPR_EXIT(&cprinfo);
}
2240 
2241 /*
2242  * Block until zone enters requested state or signal is received.  Return (0)
2243  * if signaled, non-zero otherwise.
2244  */
2245 int
2246 zone_status_wait_sig(zone_t *zone, zone_status_t status)
2247 {
2248 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2249 
2250 	mutex_enter(&zone_status_lock);
2251 	while (zone->zone_status < status) {
2252 		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
2253 			mutex_exit(&zone_status_lock);
2254 			return (0);
2255 		}
2256 	}
2257 	mutex_exit(&zone_status_lock);
2258 	return (1);
2259 }
2260 
2261 /*
2262  * Block until the zone enters the requested state or the timeout expires,
2263  * whichever happens first.  Return (-1) if operation timed out, time remaining
2264  * otherwise.
2265  */
2266 clock_t
2267 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
2268 {
2269 	clock_t timeleft = 0;
2270 
2271 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2272 
2273 	mutex_enter(&zone_status_lock);
2274 	while (zone->zone_status < status && timeleft != -1) {
2275 		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
2276 	}
2277 	mutex_exit(&zone_status_lock);
2278 	return (timeleft);
2279 }
2280 
2281 /*
2282  * Block until the zone enters the requested state, the current process is
2283  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
2284  * operation timed out, 0 if signaled, time remaining otherwise.
2285  */
2286 clock_t
2287 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
2288 {
2289 	clock_t timeleft = tim - lbolt;
2290 
2291 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
2292 
2293 	mutex_enter(&zone_status_lock);
2294 	while (zone->zone_status < status) {
2295 		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
2296 		    tim);
2297 		if (timeleft <= 0)
2298 			break;
2299 	}
2300 	mutex_exit(&zone_status_lock);
2301 	return (timeleft);
2302 }
2303 
2304 /*
2305  * Zones have two reference counts: one for references from credential
2306  * structures (zone_cred_ref), and one (zone_ref) for everything else.
2307  * This is so we can allow a zone to be rebooted while there are still
2308  * outstanding cred references, since certain drivers cache dblks (which
2309  * implicitly results in cached creds).  We wait for zone_ref to drop to
2310  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
2311  * later freed when the zone_cred_ref drops to 0, though nothing other
2312  * than the zone id and privilege set should be accessed once the zone
2313  * is "dead".
2314  *
2315  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
2316  * to force halt/reboot to block waiting for the zone_cred_ref to drop
2317  * to 0.  This can be useful to flush out other sources of cached creds
2318  * that may be less innocuous than the driver case.
2319  */
2320 
/*
 * Debugging flag, off by default.  When non-zero, ZONE_IS_UNREF() also
 * requires zone_cred_ref to be 0 before the zone counts as unreferenced.
 */
int zone_wait_for_cred = 0;
2322 
2323 static void
2324 zone_hold_locked(zone_t *z)
2325 {
2326 	ASSERT(MUTEX_HELD(&z->zone_lock));
2327 	z->zone_ref++;
2328 	ASSERT(z->zone_ref != 0);
2329 }
2330 
2331 void
2332 zone_hold(zone_t *z)
2333 {
2334 	mutex_enter(&z->zone_lock);
2335 	zone_hold_locked(z);
2336 	mutex_exit(&z->zone_lock);
2337 }
2338 
/*
 * If the non-cred ref count drops to 1 and either the cred ref count
 * is 0 or we aren't waiting for cred references, the zone is ready to
 * be destroyed.
 *
 * The macro reads zone_ref and zone_cred_ref directly; the callers in
 * zone_rele() and zone_cred_rele() evaluate it with zone_lock held.
 */
#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
	    (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
2346 
2347 void
2348 zone_rele(zone_t *z)
2349 {
2350 	boolean_t wakeup;
2351 
2352 	mutex_enter(&z->zone_lock);
2353 	ASSERT(z->zone_ref != 0);
2354 	z->zone_ref--;
2355 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2356 		/* no more refs, free the structure */
2357 		mutex_exit(&z->zone_lock);
2358 		zone_free(z);
2359 		return;
2360 	}
2361 	/* signal zone_destroy so the zone can finish halting */
2362 	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
2363 	mutex_exit(&z->zone_lock);
2364 
2365 	if (wakeup) {
2366 		/*
2367 		 * Grabbing zonehash_lock here effectively synchronizes with
2368 		 * zone_destroy() to avoid missed signals.
2369 		 */
2370 		mutex_enter(&zonehash_lock);
2371 		cv_broadcast(&zone_destroy_cv);
2372 		mutex_exit(&zonehash_lock);
2373 	}
2374 }
2375 
2376 void
2377 zone_cred_hold(zone_t *z)
2378 {
2379 	mutex_enter(&z->zone_lock);
2380 	z->zone_cred_ref++;
2381 	ASSERT(z->zone_cred_ref != 0);
2382 	mutex_exit(&z->zone_lock);
2383 }
2384 
2385 void
2386 zone_cred_rele(zone_t *z)
2387 {
2388 	boolean_t wakeup;
2389 
2390 	mutex_enter(&z->zone_lock);
2391 	ASSERT(z->zone_cred_ref != 0);
2392 	z->zone_cred_ref--;
2393 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
2394 		/* no more refs, free the structure */
2395 		mutex_exit(&z->zone_lock);
2396 		zone_free(z);
2397 		return;
2398 	}
2399 	/*
2400 	 * If zone_destroy is waiting for the cred references to drain
2401 	 * out, and they have, signal it.
2402 	 */
2403 	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
2404 	    zone_status_get(z) >= ZONE_IS_DEAD);
2405 	mutex_exit(&z->zone_lock);
2406 
2407 	if (wakeup) {
2408 		/*
2409 		 * Grabbing zonehash_lock here effectively synchronizes with
2410 		 * zone_destroy() to avoid missed signals.
2411 		 */
2412 		mutex_enter(&zonehash_lock);
2413 		cv_broadcast(&zone_destroy_cv);
2414 		mutex_exit(&zonehash_lock);
2415 	}
2416 }
2417 
2418 void
2419 zone_task_hold(zone_t *z)
2420 {
2421 	mutex_enter(&z->zone_lock);
2422 	z->zone_ntasks++;
2423 	ASSERT(z->zone_ntasks != 0);
2424 	mutex_exit(&z->zone_lock);
2425 }
2426 
/*
 * Account for one task leaving the zone and advance the zone's status
 * when the task count reaches 1 or 0.
 *
 * More than one task left: nothing else to do.
 * Exactly one task left: if the zone is in ZONE_IS_SHUTTING_DOWN and the
 * count has not changed in the meantime, the zone becomes ZONE_IS_EMPTY.
 * No tasks left: the zone becomes ZONE_IS_DEAD.
 */
void
zone_task_rele(zone_t *zone)
{
	uint_t refcnt;

	mutex_enter(&zone->zone_lock);
	ASSERT(zone->zone_ntasks != 0);
	refcnt = --zone->zone_ntasks;
	if (refcnt > 1)	{	/* Common case */
		mutex_exit(&zone->zone_lock);
		return;
	}
	zone_hold_locked(zone);	/* so we can use the zone_t later */
	mutex_exit(&zone->zone_lock);
	if (refcnt == 1) {
		/*
		 * See if the zone is shutting down.
		 */
		mutex_enter(&zone_status_lock);
		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
			goto out;
		}

		/*
		 * Make sure the ntasks didn't change since we
		 * dropped zone_lock.
		 */
		mutex_enter(&zone->zone_lock);
		if (refcnt != zone->zone_ntasks) {
			mutex_exit(&zone->zone_lock);
			goto out;
		}
		mutex_exit(&zone->zone_lock);

		/*
		 * No more user processes in the zone.  The zone is empty.
		 */
		zone_status_set(zone, ZONE_IS_EMPTY);
		goto out;
	}

	ASSERT(refcnt == 0);
	/*
	 * zsched has exited; the zone is dead.
	 */
	zone->zone_zsched = NULL;		/* paranoia */
	mutex_enter(&zone_status_lock);
	zone_status_set(zone, ZONE_IS_DEAD);
out:
	/* Every path reaching here holds zone_status_lock and a zone ref. */
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
}
2479 
2480 zoneid_t
2481 getzoneid(void)
2482 {
2483 	return (curproc->p_zone->zone_id);
2484 }
2485 
2486 /*
2487  * Internal versions of zone_find_by_*().  These don't zone_hold() or
2488  * check the validity of a zone's state.
2489  */
2490 static zone_t *
2491 zone_find_all_by_id(zoneid_t zoneid)
2492 {
2493 	mod_hash_val_t hv;
2494 	zone_t *zone = NULL;
2495 
2496 	ASSERT(MUTEX_HELD(&zonehash_lock));
2497 
2498 	if (mod_hash_find(zonehashbyid,
2499 	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
2500 		zone = (zone_t *)hv;
2501 	return (zone);
2502 }
2503 
2504 static zone_t *
2505 zone_find_all_by_label(const ts_label_t *label)
2506 {
2507 	mod_hash_val_t hv;
2508 	zone_t *zone = NULL;
2509 
2510 	ASSERT(MUTEX_HELD(&zonehash_lock));
2511 
2512 	/*
2513 	 * zonehashbylabel is not maintained for unlabeled systems
2514 	 */
2515 	if (!is_system_labeled())
2516 		return (NULL);
2517 	if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
2518 		zone = (zone_t *)hv;
2519 	return (zone);
2520 }
2521 
2522 static zone_t *
2523 zone_find_all_by_name(char *name)
2524 {
2525 	mod_hash_val_t hv;
2526 	zone_t *zone = NULL;
2527 
2528 	ASSERT(MUTEX_HELD(&zonehash_lock));
2529 
2530 	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
2531 		zone = (zone_t *)hv;
2532 	return (zone);
2533 }
2534 
2535 /*
2536  * Public interface for looking up a zone by zoneid.  Only returns the zone if
2537  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
2538  * Caller must call zone_rele() once it is done with the zone.
2539  *
2540  * The zone may begin the zone_destroy() sequence immediately after this
2541  * function returns, but may be safely used until zone_rele() is called.
2542  */
2543 zone_t *
2544 zone_find_by_id(zoneid_t zoneid)
2545 {
2546 	zone_t *zone;
2547 	zone_status_t status;
2548 
2549 	mutex_enter(&zonehash_lock);
2550 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2551 		mutex_exit(&zonehash_lock);
2552 		return (NULL);
2553 	}
2554 	status = zone_status_get(zone);
2555 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2556 		/*
2557 		 * For all practical purposes the zone doesn't exist.
2558 		 */
2559 		mutex_exit(&zonehash_lock);
2560 		return (NULL);
2561 	}
2562 	zone_hold(zone);
2563 	mutex_exit(&zonehash_lock);
2564 	return (zone);
2565 }
2566 
2567 /*
2568  * Similar to zone_find_by_id, but using zone label as the key.
2569  */
2570 zone_t *
2571 zone_find_by_label(const ts_label_t *label)
2572 {
2573 	zone_t *zone;
2574 	zone_status_t status;
2575 
2576 	mutex_enter(&zonehash_lock);
2577 	if ((zone = zone_find_all_by_label(label)) == NULL) {
2578 		mutex_exit(&zonehash_lock);
2579 		return (NULL);
2580 	}
2581 
2582 	status = zone_status_get(zone);
2583 	if (status > ZONE_IS_DOWN) {
2584 		/*
2585 		 * For all practical purposes the zone doesn't exist.
2586 		 */
2587 		mutex_exit(&zonehash_lock);
2588 		return (NULL);
2589 	}
2590 	zone_hold(zone);
2591 	mutex_exit(&zonehash_lock);
2592 	return (zone);
2593 }
2594 
2595 /*
2596  * Similar to zone_find_by_id, but using zone name as the key.
2597  */
2598 zone_t *
2599 zone_find_by_name(char *name)
2600 {
2601 	zone_t *zone;
2602 	zone_status_t status;
2603 
2604 	mutex_enter(&zonehash_lock);
2605 	if ((zone = zone_find_all_by_name(name)) == NULL) {
2606 		mutex_exit(&zonehash_lock);
2607 		return (NULL);
2608 	}
2609 	status = zone_status_get(zone);
2610 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2611 		/*
2612 		 * For all practical purposes the zone doesn't exist.
2613 		 */
2614 		mutex_exit(&zonehash_lock);
2615 		return (NULL);
2616 	}
2617 	zone_hold(zone);
2618 	mutex_exit(&zonehash_lock);
2619 	return (zone);
2620 }
2621 
2622 /*
2623  * Similar to zone_find_by_id(), using the path as a key.  For instance,
2624  * if there is a zone "foo" rooted at /foo/root, and the path argument
2625  * is "/foo/root/proc", it will return the held zone_t corresponding to
2626  * zone "foo".
2627  *
2628  * zone_find_by_path() always returns a non-NULL value, since at the
2629  * very least every path will be contained in the global zone.
2630  *
2631  * As with the other zone_find_by_*() functions, the caller is
2632  * responsible for zone_rele()ing the return value of this function.
2633  */
2634 zone_t *
2635 zone_find_by_path(const char *path)
2636 {
2637 	zone_t *zone;
2638 	zone_t *zret = NULL;
2639 	zone_status_t status;
2640 
2641 	if (path == NULL) {
2642 		/*
2643 		 * Call from rootconf().
2644 		 */
2645 		zone_hold(global_zone);
2646 		return (global_zone);
2647 	}
2648 	ASSERT(*path == '/');
2649 	mutex_enter(&zonehash_lock);
2650 	for (zone = list_head(&zone_active); zone != NULL;
2651 	    zone = list_next(&zone_active, zone)) {
2652 		if (ZONE_PATH_VISIBLE(path, zone))
2653 			zret = zone;
2654 	}
2655 	ASSERT(zret != NULL);
2656 	status = zone_status_get(zret);
2657 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2658 		/*
2659 		 * Zone practically doesn't exist.
2660 		 */
2661 		zret = global_zone;
2662 	}
2663 	zone_hold(zret);
2664 	mutex_exit(&zonehash_lock);
2665 	return (zret);
2666 }
2667 
2668 /*
2669  * Get the number of cpus visible to this zone.  The system-wide global
2670  * 'ncpus' is returned if pools are disabled, the caller is in the
2671  * global zone, or a NULL zone argument is passed in.
2672  */
2673 int
2674 zone_ncpus_get(zone_t *zone)
2675 {
2676 	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
2677 
2678 	return (myncpus != 0 ? myncpus : ncpus);
2679 }
2680 
2681 /*
2682  * Get the number of online cpus visible to this zone.  The system-wide
2683  * global 'ncpus_online' is returned if pools are disabled, the caller
2684  * is in the global zone, or a NULL zone argument is passed in.
2685  */
2686 int
2687 zone_ncpus_online_get(zone_t *zone)
2688 {
2689 	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
2690 
2691 	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
2692 }
2693 
2694 /*
2695  * Return the pool to which the zone is currently bound.
2696  */
2697 pool_t *
2698 zone_pool_get(zone_t *zone)
2699 {
2700 	ASSERT(pool_lock_held());
2701 
2702 	return (zone->zone_pool);
2703 }
2704 
2705 /*
2706  * Set the zone's pool pointer and update the zone's visibility to match
2707  * the resources in the new pool.
2708  */
2709 void
2710 zone_pool_set(zone_t *zone, pool_t *pool)
2711 {
2712 	ASSERT(pool_lock_held());
2713 	ASSERT(MUTEX_HELD(&cpu_lock));
2714 
2715 	zone->zone_pool = pool;
2716 	zone_pset_set(zone, pool->pool_pset->pset_id);
2717 }
2718 
2719 /*
2720  * Return the cached value of the id of the processor set to which the
2721  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
2722  * facility is disabled.
2723  */
2724 psetid_t
2725 zone_pset_get(zone_t *zone)
2726 {
2727 	ASSERT(MUTEX_HELD(&cpu_lock));
2728 
2729 	return (zone->zone_psetid);
2730 }
2731 
2732 /*
2733  * Set the cached value of the id of the processor set to which the zone
2734  * is currently bound.  Also update the zone's visibility to match the
2735  * resources in the new processor set.
2736  */
2737 void
2738 zone_pset_set(zone_t *zone, psetid_t newpsetid)
2739 {
2740 	psetid_t oldpsetid;
2741 
2742 	ASSERT(MUTEX_HELD(&cpu_lock));
2743 	oldpsetid = zone_pset_get(zone);
2744 
2745 	if (oldpsetid == newpsetid)
2746 		return;
2747 	/*
2748 	 * Global zone sees all.
2749 	 */
2750 	if (zone != global_zone) {
2751 		zone->zone_psetid = newpsetid;
2752 		if (newpsetid != ZONE_PS_INVAL)
2753 			pool_pset_visibility_add(newpsetid, zone);
2754 		if (oldpsetid != ZONE_PS_INVAL)
2755 			pool_pset_visibility_remove(oldpsetid, zone);
2756 	}
2757 	/*
2758 	 * Disabling pools, so we should start using the global values
2759 	 * for ncpus and ncpus_online.
2760 	 */
2761 	if (newpsetid == ZONE_PS_INVAL) {
2762 		zone->zone_ncpus = 0;
2763 		zone->zone_ncpus_online = 0;
2764 	}
2765 }
2766 
2767 /*
2768  * Walk the list of active zones and issue the provided callback for
2769  * each of them.
2770  *
2771  * Caller must not be holding any locks that may be acquired under
2772  * zonehash_lock.  See comment at the beginning of the file for a list of
2773  * common locks and their interactions with zones.
2774  */
2775 int
2776 zone_walk(int (*cb)(zone_t *, void *), void *data)
2777 {
2778 	zone_t *zone;
2779 	int ret = 0;
2780 	zone_status_t status;
2781 
2782 	mutex_enter(&zonehash_lock);
2783 	for (zone = list_head(&zone_active); zone != NULL;
2784 	    zone = list_next(&zone_active, zone)) {
2785 		/*
2786 		 * Skip zones that shouldn't be externally visible.
2787 		 */
2788 		status = zone_status_get(zone);
2789 		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
2790 			continue;
2791 		/*
2792 		 * Bail immediately if any callback invocation returns a
2793 		 * non-zero value.
2794 		 */
2795 		ret = (*cb)(zone, data);
2796 		if (ret != 0)
2797 			break;
2798 	}
2799 	mutex_exit(&zonehash_lock);
2800 	return (ret);
2801 }
2802 
/*
 * Resolve the user-space path 'upath' and record it as the zone's root.
 *
 * On success returns 0 with zone_rootvp set to the held vnode,
 * zone_rootpath set to a kmem-allocated copy of the resolved path with
 * a trailing '/' appended, and zone_rootpathlen set to the size of that
 * buffer.  ZF_IS_SCRATCH is set when the stored path ends in "/lu/".
 *
 * On failure returns the error from pn_get(), lookuppn(), VOP_ACCESS()
 * or traverse(); ESTALE from the lookup sequence is retried a bounded
 * number of times before being returned.
 */
static int
zone_set_root(zone_t *zone, const char *upath)
{
	vnode_t *vp;
	int trycount;
	int error = 0;
	char *path;
	struct pathname upn, pn;
	size_t pathlen;

	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
		return (error);

	pn_alloc(&pn);

	/* prevent infinite loop */
	trycount = 10;
	for (;;) {
		if (--trycount <= 0) {
			error = ESTALE;
			goto out;
		}

		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
			/*
			 * VOP_ACCESS() may cover 'vp' with a new
			 * filesystem, if 'vp' is an autoFS vnode.
			 * Get the new 'vp' if so.
			 */
			if ((error =
			    VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
			    (!vn_ismntpt(vp) ||
			    (error = traverse(&vp)) == 0)) {
				/*
				 * Room for the resolved path, one extra
				 * '/' and the terminating NUL.
				 */
				pathlen = pn.pn_pathlen + 2;
				path = kmem_alloc(pathlen, KM_SLEEP);
				(void) strncpy(path, pn.pn_path,
				    pn.pn_pathlen + 1);
				path[pathlen - 2] = '/';
				path[pathlen - 1] = '\0';
				pn_free(&pn);
				pn_free(&upn);

				/* Success! */
				break;
			}
			/* Access check or traverse failed; drop the vnode. */
			VN_RELE(vp);
		}
		if (error != ESTALE)
			goto out;
	}

	ASSERT(error == 0);
	zone->zone_rootvp = vp;		/* we hold a reference to vp */
	zone->zone_rootpath = path;
	zone->zone_rootpathlen = pathlen;
	/* pathlen counts the NUL, so the last four characters start at -5. */
	if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
		zone->zone_flags |= ZF_IS_SCRATCH;
	return (0);

out:
	pn_free(&pn);
	pn_free(&upn);
	return (error);
}
2867 
/*
 * True if 'c' is one of the characters '0'-'9', 'a'-'z' or 'A'-'Z'.
 * The argument is evaluated more than once, so it must not have side
 * effects.
 */
#define	isalnum(c)	(((c) >= '0' && (c) <= '9') || \
			((c) >= 'a' && (c) <= 'z') || \
			((c) >= 'A' && (c) <= 'Z'))
2871 
2872 static int
2873 zone_set_name(zone_t *zone, const char *uname)
2874 {
2875 	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
2876 	size_t len;
2877 	int i, err;
2878 
2879 	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
2880 		kmem_free(kname, ZONENAME_MAX);
2881 		return (err);	/* EFAULT or ENAMETOOLONG */
2882 	}
2883 
2884 	/* must be less than ZONENAME_MAX */
2885 	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
2886 		kmem_free(kname, ZONENAME_MAX);
2887 		return (EINVAL);
2888 	}
2889 
2890 	/*
2891 	 * Name must start with an alphanumeric and must contain only
2892 	 * alphanumerics, '-', '_' and '.'.
2893 	 */
2894 	if (!isalnum(kname[0])) {
2895 		kmem_free(kname, ZONENAME_MAX);
2896 		return (EINVAL);
2897 	}
2898 	for (i = 1; i < len - 1; i++) {
2899 		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
2900 		    kname[i] != '.') {
2901 			kmem_free(kname, ZONENAME_MAX);
2902 			return (EINVAL);
2903 		}
2904 	}
2905 
2906 	zone->zone_name = kname;
2907 	return (0);
2908 }
2909 
/*
 * Similar to thread_create(), but makes sure the thread is in the appropriate
 * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
 *
 * The new thread is created stopped, linked into the zone's
 * zone_kthreads list, switched to zsched's project, and only then set
 * running.  A zone reference is taken on its behalf here and dropped
 * when the thread exits.
 */
/*ARGSUSED*/
kthread_t *
zthread_create(
    caddr_t stk,
    size_t stksize,
    void (*proc)(),
    void *arg,
    size_t len,
    pri_t pri)
{
	kthread_t *t;
	zone_t *zone = curproc->p_zone;
	proc_t *pp = zone->zone_zsched;

	zone_hold(zone);	/* Reference to be dropped when thread exits */

	/*
	 * No-one should be trying to create threads if the zone is shutting
	 * down and there aren't any kernel threads around.  See comment
	 * in zthread_exit().
	 */
	ASSERT(!(zone->zone_kthreads == NULL &&
	    zone_status_get(zone) >= ZONE_IS_EMPTY));
	/*
	 * Create a thread, but don't let it run until we've finished setting
	 * things up.
	 */
	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
	ASSERT(t->t_forw == NULL);
	/*
	 * zone_kthreads is a circular, doubly linked list threaded through
	 * t_forw/t_back and updated under zone_status_lock.  The new thread
	 * is linked in just before the current head and then becomes the
	 * head itself.
	 */
	mutex_enter(&zone_status_lock);
	if (zone->zone_kthreads == NULL) {
		t->t_forw = t->t_back = t;
	} else {
		kthread_t *tx = zone->zone_kthreads;

		t->t_forw = tx;
		t->t_back = tx->t_back;
		tx->t_back->t_forw = t;
		tx->t_back = t;
	}
	zone->zone_kthreads = t;
	mutex_exit(&zone_status_lock);

	mutex_enter(&pp->p_lock);
	t->t_proc_flag |= TP_ZTHREAD;
	/* Swap the thread's project for the one zsched's task belongs to. */
	project_rele(t->t_proj);
	t->t_proj = project_hold(pp->p_task->tk_proj);

	/*
	 * Setup complete, let it run.
	 */
	thread_lock(t);
	t->t_schedflag |= TS_ALLSTART;
	setrun_locked(t);
	thread_unlock(t);

	mutex_exit(&pp->p_lock);

	return (t);
}
2974 
/*
 * Similar to thread_exit().  Must be called by threads created via
 * zthread_create().
 *
 * The thread is reparented to p0, unlinked from the zone's
 * zone_kthreads list, and the zone reference taken by zthread_create()
 * is dropped.  If this was the last zone kernel thread and the zone is
 * ZONE_IS_EMPTY, the zone moves to ZONE_IS_DOWN.
 */
void
zthread_exit(void)
{
	kthread_t *t = curthread;
	proc_t *pp = curproc;
	zone_t *zone = pp->p_zone;

	mutex_enter(&zone_status_lock);

	/*
	 * Reparent to p0
	 */
	kpreempt_disable();
	mutex_enter(&pp->p_lock);
	t->t_proc_flag &= ~TP_ZTHREAD;
	t->t_procp = &p0;
	hat_thread_exit(t);
	mutex_exit(&pp->p_lock);
	kpreempt_enable();

	/* A thread linked to itself is the only one left on the list. */
	if (t->t_back == t) {
		ASSERT(t->t_forw == t);
		/*
		 * If the zone is empty, once the thread count
		 * goes to zero no further kernel threads can be
		 * created.  This is because if the creator is a process
		 * in the zone, then it must have exited before the zone
		 * state could be set to ZONE_IS_EMPTY.
		 * Otherwise, if the creator is a kernel thread in the
		 * zone, the thread count is non-zero.
		 *
		 * This really means that non-zone kernel threads should
		 * not create zone kernel threads.
		 */
		zone->zone_kthreads = NULL;
		if (zone_status_get(zone) == ZONE_IS_EMPTY) {
			zone_status_set(zone, ZONE_IS_DOWN);
			/*
			 * Remove any CPU caps on this zone.
			 */
			cpucaps_zone_remove(zone);
		}
	} else {
		/* Unlink t; move the list head along if it pointed at t. */
		t->t_forw->t_back = t->t_back;
		t->t_back->t_forw = t->t_forw;
		if (zone->zone_kthreads == t)
			zone->zone_kthreads = t->t_forw;
	}
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
	thread_exit();
	/* NOTREACHED */
}
3032 
3033 static void
3034 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
3035 {
3036 	vnode_t *oldvp;
3037 
3038 	/* we're going to hold a reference here to the directory */
3039 	VN_HOLD(vp);
3040 
3041 	if (audit_active)	/* update abs cwd/root path see c2audit.c */
3042 		audit_chdirec(vp, vpp);
3043 
3044 	mutex_enter(&pp->p_lock);
3045 	oldvp = *vpp;
3046 	*vpp = vp;
3047 	mutex_exit(&pp->p_lock);
3048 	if (oldvp != NULL)
3049 		VN_RELE(oldvp);
3050 }
3051 
3052 /*
3053  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
3054  */
3055 static int
3056 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
3057 {
3058 	nvpair_t *nvp = NULL;
3059 	boolean_t priv_set = B_FALSE;
3060 	boolean_t limit_set = B_FALSE;
3061 	boolean_t action_set = B_FALSE;
3062 
3063 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3064 		const char *name;
3065 		uint64_t ui64;
3066 
3067 		name = nvpair_name(nvp);
3068 		if (nvpair_type(nvp) != DATA_TYPE_UINT64)
3069 			return (EINVAL);
3070 		(void) nvpair_value_uint64(nvp, &ui64);
3071 		if (strcmp(name, "privilege") == 0) {
3072 			/*
3073 			 * Currently only privileged values are allowed, but
3074 			 * this may change in the future.
3075 			 */
3076 			if (ui64 != RCPRIV_PRIVILEGED)
3077 				return (EINVAL);
3078 			rv->rcv_privilege = ui64;
3079 			priv_set = B_TRUE;
3080 		} else if (strcmp(name, "limit") == 0) {
3081 			rv->rcv_value = ui64;
3082 			limit_set = B_TRUE;
3083 		} else if (strcmp(name, "action") == 0) {
3084 			if (ui64 != RCTL_LOCAL_NOACTION &&
3085 			    ui64 != RCTL_LOCAL_DENY)
3086 				return (EINVAL);
3087 			rv->rcv_flagaction = ui64;
3088 			action_set = B_TRUE;
3089 		} else {
3090 			return (EINVAL);
3091 		}
3092 	}
3093 
3094 	if (!(priv_set && limit_set && action_set))
3095 		return (EINVAL);
3096 	rv->rcv_action_signal = 0;
3097 	rv->rcv_action_recipient = NULL;
3098 	rv->rcv_action_recip_pid = -1;
3099 	rv->rcv_firing_time = 0;
3100 
3101 	return (0);
3102 }
3103 
3104 /*
3105  * Non-global zone version of start_init.
3106  */
3107 void
3108 zone_start_init(void)
3109 {
3110 	proc_t *p = ttoproc(curthread);
3111 	zone_t *z = p->p_zone;
3112 
3113 	ASSERT(!INGLOBALZONE(curproc));
3114 
3115 	/*
3116 	 * For all purposes (ZONE_ATTR_INITPID and restart_init),
3117 	 * storing just the pid of init is sufficient.
3118 	 */
3119 	z->zone_proc_initpid = p->p_pid;
3120 
3121 	/*
3122 	 * We maintain zone_boot_err so that we can return the cause of the
3123 	 * failure back to the caller of the zone_boot syscall.
3124 	 */
3125 	p->p_zone->zone_boot_err = start_init_common();
3126 
3127 	mutex_enter(&zone_status_lock);
3128 	if (z->zone_boot_err != 0) {
3129 		/*
3130 		 * Make sure we are still in the booting state-- we could have
3131 		 * raced and already be shutting down, or even further along.
3132 		 */
3133 		if (zone_status_get(z) == ZONE_IS_BOOTING) {
3134 			zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
3135 		}
3136 		mutex_exit(&zone_status_lock);
3137 		/* It's gone bad, dispose of the process */
3138 		if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
3139 			mutex_enter(&p->p_lock);
3140 			ASSERT(p->p_flag & SEXITLWPS);
3141 			lwp_exit();
3142 		}
3143 	} else {
3144 		if (zone_status_get(z) == ZONE_IS_BOOTING)
3145 			zone_status_set(z, ZONE_IS_RUNNING);
3146 		mutex_exit(&zone_status_lock);
3147 		/* cause the process to return to userland. */
3148 		lwp_rtt();
3149 	}
3150 }
3151 
/*
 * Argument bundle passed to zsched() through newproc() by zone_create().
 */
struct zsched_arg {
	zone_t *zone;		/* zone the new zsched process will serve */
	nvlist_t *nvlist;	/* parsed zone rctls to apply; may be NULL */
};
3156 
/*
 * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
 * anything to do with scheduling, but rather with the fact that
 * per-zone kernel threads are parented to zsched, just like regular
 * kernel threads are parented to sched (p0).
 *
 * zsched is also responsible for launching init for the zone.
 *
 * 'arg' points to a struct zsched_arg naming the zone and the (possibly
 * NULL) nvlist of rctls to apply.  In order, this function: attaches the
 * current process to the zone and reparents it to init, moves its lwp
 * and locked-memory accounting from the global zone to the new zone,
 * joins a new task in the zone, adopts the zone's kcred, sets cwd/root
 * to the zone root, builds the zone rctl set, walks the zone through
 * ZONE_IS_INITIALIZED and ZONE_IS_READY, starts init once the zone is
 * ZONE_IS_BOOTING, and finally waits for ZONE_IS_DYING before calling
 * exit().
 */
static void
zsched(void *arg)
{
	struct zsched_arg *za = arg;
	proc_t *pp = curproc;
	proc_t *initp = proc_init;
	zone_t *zone = za->zone;
	cred_t *cr, *oldcred;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	contract_t *ct = NULL;
	task_t *tk, *oldtk;
	rctl_entity_p_t e;
	kproject_t *pj;

	nvlist_t *nvl = za->nvlist;
	nvpair_t *nvp = NULL;

	/*
	 * Label the process "zsched", clear its argument/environment
	 * pointers and call closeall() on its file info.
	 */
	bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
	bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
	PTOU(pp)->u_argc = 0;
	PTOU(pp)->u_argv = NULL;
	PTOU(pp)->u_envp = NULL;
	closeall(P_FINFO(pp));

	/*
	 * We are this zone's "zsched" process.  As the zone isn't generally
	 * visible yet we don't need to grab any locks before initializing its
	 * zone_proc pointer.
	 */
	zone_hold(zone);  /* this hold is released by zone_destroy() */
	zone->zone_zsched = pp;
	mutex_enter(&pp->p_lock);
	pp->p_zone = zone;
	mutex_exit(&pp->p_lock);

	/*
	 * Disassociate process from its 'parent'; parent ourselves to init
	 * (pid 1) and change other values as needed.
	 */
	sess_create();

	mutex_enter(&pidlock);
	proc_detach(pp);
	pp->p_ppid = 1;
	pp->p_flag |= SZONETOP;
	pp->p_ancpid = 1;
	pp->p_parent = initp;
	/* Push ourselves onto the head of initp's child list. */
	pp->p_psibling = NULL;
	if (initp->p_child)
		initp->p_child->p_psibling = pp;
	pp->p_sibling = initp->p_child;
	initp->p_child = pp;

	/* Decrement what newproc() incremented. */
	upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
	/*
	 * Our credentials are about to become kcred-like, so we don't care
	 * about the caller's ruid.
	 */
	upcount_inc(crgetruid(kcred), zone->zone_id);
	mutex_exit(&pidlock);

	/*
	 * getting out of global zone, so decrement lwp counts
	 */
	pj = pp->p_task->tk_proj;
	mutex_enter(&global_zone->zone_nlwps_lock);
	pj->kpj_nlwps -= pp->p_lwpcnt;
	global_zone->zone_nlwps -= pp->p_lwpcnt;
	mutex_exit(&global_zone->zone_nlwps_lock);

	/*
	 * Decrement locked memory counts on old zone and project.
	 */
	mutex_enter(&global_zone->zone_mem_lock);
	global_zone->zone_locked_mem -= pp->p_locked_mem;
	pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
	mutex_exit(&global_zone->zone_mem_lock);

	/*
	 * Create and join a new task in project '0' of this zone.
	 *
	 * We don't need to call holdlwps() since we know we're the only lwp in
	 * this process.
	 *
	 * task_join() returns with p_lock held.
	 */
	tk = task_create(0, zone);
	mutex_enter(&cpu_lock);
	oldtk = task_join(tk, 0);

	/* Re-read the project: p_task now refers to the task just joined. */
	pj = pp->p_task->tk_proj;

	mutex_enter(&zone->zone_mem_lock);
	zone->zone_locked_mem += pp->p_locked_mem;
	pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
	mutex_exit(&zone->zone_mem_lock);

	/*
	 * add lwp counts to zsched's zone, and increment project's task count
	 * due to the task created by task_create() above
	 */

	mutex_enter(&zone->zone_nlwps_lock);
	pj->kpj_nlwps += pp->p_lwpcnt;
	pj->kpj_ntasks += 1;
	zone->zone_nlwps += pp->p_lwpcnt;
	mutex_exit(&zone->zone_nlwps_lock);

	/* Drop the p_lock that task_join() left held, then cpu_lock. */
	mutex_exit(&curproc->p_lock);
	mutex_exit(&cpu_lock);
	task_rele(oldtk);

	/*
	 * The process was created by a process in the global zone, hence the
	 * credentials are wrong.  We might as well have kcred-ish credentials.
	 */
	cr = zone->zone_kcred;
	crhold(cr);
	mutex_enter(&pp->p_crlock);
	oldcred = pp->p_cred;
	pp->p_cred = cr;
	mutex_exit(&pp->p_crlock);
	crfree(oldcred);

	/*
	 * Hold credentials again (for thread)
	 */
	crhold(cr);

	/*
	 * p_lwpcnt can't change since this is a kernel process.
	 */
	crset(pp, cr);

	/*
	 * Chroot
	 */
	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);

	/*
	 * Initialize zone's rctl set.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&pp->p_lock);
	e.rcep_p.zone = zone;
	e.rcep_t = RCENTITY_ZONE;
	zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
	mutex_exit(&pp->p_lock);
	rctl_prealloc_destroy(gp);

	/*
	 * Apply the rctls passed in to zone_create().  This is basically a list
	 * assignment: all of the old values are removed and the new ones
	 * inserted.  That is, if an empty list is passed in, all values are
	 * removed.
	 */
	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		rctl_dict_entry_t *rde;
		rctl_hndl_t hndl;
		char *name;
		nvlist_t **nvlarray;
		uint_t i, nelem;
		int error;	/* For ASSERT()s */

		name = nvpair_name(nvp);
		hndl = rctl_hndl_lookup(name);
		ASSERT(hndl != -1);
		rde = rctl_dict_lookup_hndl(hndl);
		ASSERT(rde != NULL);

		/*
		 * Delete the first local value repeatedly until the first
		 * value is the RCPRIV_SYSTEM one.
		 */
		for (; /* ever */; ) {
			rctl_val_t oval;

			mutex_enter(&pp->p_lock);
			error = rctl_local_get(hndl, NULL, &oval, pp);
			mutex_exit(&pp->p_lock);
			ASSERT(error == 0);	/* Can't fail for RCTL_FIRST */
			ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
			if (oval.rcv_privilege == RCPRIV_SYSTEM)
				break;
			mutex_enter(&pp->p_lock);
			error = rctl_local_delete(hndl, &oval, pp);
			mutex_exit(&pp->p_lock);
			ASSERT(error == 0);
		}
		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
		ASSERT(error == 0);
		/* Convert and insert each value supplied for this rctl. */
		for (i = 0; i < nelem; i++) {
			rctl_val_t *nvalp;

			nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
			error = nvlist2rctlval(nvlarray[i], nvalp);
			ASSERT(error == 0);
			/*
			 * rctl_local_insert can fail if the value being
			 * inserted is a duplicate; this is OK.
			 */
			mutex_enter(&pp->p_lock);
			if (rctl_local_insert(hndl, nvalp, pp) != 0)
				kmem_cache_free(rctl_val_cache, nvalp);
			mutex_exit(&pp->p_lock);
		}
	}
	/*
	 * Tell the world that we're done setting up.
	 *
	 * At this point we want to set the zone status to ZONE_IS_INITIALIZED
	 * and atomically set the zone's processor set visibility.  Once
	 * we drop pool_lock() this zone will automatically get updated
	 * to reflect any future changes to the pools configuration.
	 *
	 * Note that after we drop the locks below (zonehash_lock in
	 * particular) other operations such as a zone_getattr call can
	 * now proceed and observe the zone. That is the reason for doing a
	 * state transition to the INITIALIZED state.
	 */
	pool_lock();
	mutex_enter(&cpu_lock);
	mutex_enter(&zonehash_lock);
	zone_uniqid(zone);
	zone_zsd_configure(zone);
	if (pool_state == POOL_ENABLED)
		zone_pset_set(zone, pool_default->pool_pset->pset_id);
	mutex_enter(&zone_status_lock);
	ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
	zone_status_set(zone, ZONE_IS_INITIALIZED);
	mutex_exit(&zone_status_lock);
	mutex_exit(&zonehash_lock);
	mutex_exit(&cpu_lock);
	pool_unlock();

	/* Now call the create callbacks for all keys */
	zsd_apply_all_keys(zsd_apply_create, zone);

	/* The callbacks are complete. Mark ZONE_IS_READY */
	mutex_enter(&zone_status_lock);
	ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
	zone_status_set(zone, ZONE_IS_READY);
	mutex_exit(&zone_status_lock);

	/*
	 * Once we see the zone transition to the ZONE_IS_BOOTING state,
	 * we launch init, and set the state to running.
	 */
	zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");

	/*
	 * init is only launched if the zone is exactly ZONE_IS_BOOTING
	 * when the wait above returns.
	 */
	if (zone_status_get(zone) == ZONE_IS_BOOTING) {
		id_t cid;

		/*
		 * Ok, this is a little complicated.  We need to grab the
		 * zone's pool's scheduling class ID; note that by now, we
		 * are already bound to a pool if we need to be (zoneadmd
		 * will have done that to us while we're in the READY
		 * state).  *But* the scheduling class for the zone's 'init'
		 * must be explicitly passed to newproc, which doesn't
		 * respect pool bindings.
		 *
		 * We hold the pool_lock across the call to newproc() to
		 * close the obvious race: the pool's scheduling class
		 * could change before we manage to create the LWP with
		 * classid 'cid'.
		 */
		pool_lock();
		if (zone->zone_defaultcid > 0)
			cid = zone->zone_defaultcid;
		else
			cid = pool_get_class(zone->zone_pool);
		if (cid == -1)
			cid = defaultcid;

		/*
		 * If this fails, zone_boot will ultimately fail.  The
		 * state of the zone will be set to SHUTTING_DOWN-- userland
		 * will have to tear down the zone, and fail, or try again.
		 */
		if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
		    minclsyspri - 1, &ct)) != 0) {
			mutex_enter(&zone_status_lock);
			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
			mutex_exit(&zone_status_lock);
		}
		pool_unlock();
	}

	/*
	 * Wait for zone_destroy() to be called.  This is what we spend
	 * most of our life doing.
	 */
	zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");

	/* ct is only non-NULL if the newproc() call above set it. */
	if (ct)
		/*
		 * At this point the process contract should be empty.
		 * (Though if it isn't, it's not the end of the world.)
		 */
		VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);

	/*
	 * Allow kcred to be freed when all referring processes
	 * (including this one) go away.  We can't just do this in
	 * zone_free because we need to wait for the zone_cred_ref to
	 * drop to 0 before calling zone_free, and the existence of
	 * zone_kcred will prevent that.  Thus, we call crfree here to
	 * balance the crdup in zone_create.  The crhold calls earlier
	 * in zsched will be dropped when the thread and process exit.
	 */
	crfree(zone->zone_kcred);
	zone->zone_kcred = NULL;

	exit(CLD_EXITED, 0);
}
3481 
3482 /*
3483  * Helper function to determine if there are any submounts of the
3484  * provided path.  Used to make sure the zone doesn't "inherit" any
3485  * mounts from before it is created.
3486  */
3487 static uint_t
3488 zone_mount_count(const char *rootpath)
3489 {
3490 	vfs_t *vfsp;
3491 	uint_t count = 0;
3492 	size_t rootpathlen = strlen(rootpath);
3493 
3494 	/*
3495 	 * Holding zonehash_lock prevents race conditions with
3496 	 * vfs_list_add()/vfs_list_remove() since we serialize with
3497 	 * zone_find_by_path().
3498 	 */
3499 	ASSERT(MUTEX_HELD(&zonehash_lock));
3500 	/*
3501 	 * The rootpath must end with a '/'
3502 	 */
3503 	ASSERT(rootpath[rootpathlen - 1] == '/');
3504 
3505 	/*
3506 	 * This intentionally does not count the rootpath itself if that
3507 	 * happens to be a mount point.
3508 	 */
3509 	vfs_list_read_lock();
3510 	vfsp = rootvfs;
3511 	do {
3512 		if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
3513 		    rootpathlen) == 0)
3514 			count++;
3515 		vfsp = vfsp->vfs_next;
3516 	} while (vfsp != rootvfs);
3517 	vfs_list_unlock();
3518 	return (count);
3519 }
3520 
3521 /*
3522  * Helper function to make sure that a zone created on 'rootpath'
3523  * wouldn't end up containing other zones' rootpaths.
3524  */
3525 static boolean_t
3526 zone_is_nested(const char *rootpath)
3527 {
3528 	zone_t *zone;
3529 	size_t rootpathlen = strlen(rootpath);
3530 	size_t len;
3531 
3532 	ASSERT(MUTEX_HELD(&zonehash_lock));
3533 
3534 	for (zone = list_head(&zone_active); zone != NULL;
3535 	    zone = list_next(&zone_active, zone)) {
3536 		if (zone == global_zone)
3537 			continue;
3538 		len = strlen(zone->zone_rootpath);
3539 		if (strncmp(rootpath, zone->zone_rootpath,
3540 		    MIN(rootpathlen, len)) == 0)
3541 			return (B_TRUE);
3542 	}
3543 	return (B_FALSE);
3544 }
3545 
3546 static int
3547 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
3548     size_t zone_privssz)
3549 {
3550 	priv_set_t *privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
3551 
3552 	if (zone_privssz < sizeof (priv_set_t))
3553 		return (set_errno(ENOMEM));
3554 
3555 	if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
3556 		kmem_free(privs, sizeof (priv_set_t));
3557 		return (EFAULT);
3558 	}
3559 
3560 	zone->zone_privset = privs;
3561 	return (0);
3562 }
3563 
3564 /*
3565  * We make creative use of nvlists to pass in rctls from userland.  The list is
3566  * a list of the following structures:
3567  *
3568  * (name = rctl_name, value = nvpair_list_array)
3569  *
3570  * Where each element of the nvpair_list_array is of the form:
3571  *
3572  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
3573  * 	(name = "limit", value = uint64_t),
3574  * 	(name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
3575  */
3576 static int
3577 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
3578 {
3579 	nvpair_t *nvp = NULL;
3580 	nvlist_t *nvl = NULL;
3581 	char *kbuf;
3582 	int error;
3583 	rctl_val_t rv;
3584 
3585 	*nvlp = NULL;
3586 
3587 	if (buflen == 0)
3588 		return (0);
3589 
3590 	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
3591 		return (ENOMEM);
3592 	if (copyin(ubuf, kbuf, buflen)) {
3593 		error = EFAULT;
3594 		goto out;
3595 	}
3596 	if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
3597 		/*
3598 		 * nvl may have been allocated/free'd, but the value set to
3599 		 * non-NULL, so we reset it here.
3600 		 */
3601 		nvl = NULL;
3602 		error = EINVAL;
3603 		goto out;
3604 	}
3605 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3606 		rctl_dict_entry_t *rde;
3607 		rctl_hndl_t hndl;
3608 		nvlist_t **nvlarray;
3609 		uint_t i, nelem;
3610 		char *name;
3611 
3612 		error = EINVAL;
3613 		name = nvpair_name(nvp);
3614 		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
3615 		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
3616 			goto out;
3617 		}
3618 		if ((hndl = rctl_hndl_lookup(name)) == -1) {
3619 			goto out;
3620 		}
3621 		rde = rctl_dict_lookup_hndl(hndl);
3622 		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3623 		ASSERT(error == 0);
3624 		for (i = 0; i < nelem; i++) {
3625 			if (error = nvlist2rctlval(nvlarray[i], &rv))
3626 				goto out;
3627 		}
3628 		if (rctl_invalid_value(rde, &rv)) {
3629 			error = EINVAL;
3630 			goto out;
3631 		}
3632 	}
3633 	error = 0;
3634 	*nvlp = nvl;
3635 out:
3636 	kmem_free(kbuf, buflen);
3637 	if (error && nvl != NULL)
3638 		nvlist_free(nvl);
3639 	return (error);
3640 }
3641 
3642 int
3643 zone_create_error(int er_error, int er_ext, int *er_out) {
3644 	if (er_out != NULL) {
3645 		if (copyout(&er_ext, er_out, sizeof (int))) {
3646 			return (set_errno(EFAULT));
3647 		}
3648 	}
3649 	return (set_errno(er_error));
3650 }
3651 
3652 static int
3653 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
3654 {
3655 	ts_label_t *tsl;
3656 	bslabel_t blab;
3657 
3658 	/* Get label from user */
3659 	if (copyin(lab, &blab, sizeof (blab)) != 0)
3660 		return (EFAULT);
3661 	tsl = labelalloc(&blab, doi, KM_NOSLEEP);
3662 	if (tsl == NULL)
3663 		return (ENOMEM);
3664 
3665 	zone->zone_slabel = tsl;
3666 	return (0);
3667 }
3668 
3669 /*
3670  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
3671  */
3672 static int
3673 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
3674 {
3675 	char *kbuf;
3676 	char *dataset, *next;
3677 	zone_dataset_t *zd;
3678 	size_t len;
3679 
3680 	if (ubuf == NULL || buflen == 0)
3681 		return (0);
3682 
3683 	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
3684 		return (ENOMEM);
3685 
3686 	if (copyin(ubuf, kbuf, buflen) != 0) {
3687 		kmem_free(kbuf, buflen);
3688 		return (EFAULT);
3689 	}
3690 
3691 	dataset = next = kbuf;
3692 	for (;;) {
3693 		zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
3694 
3695 		next = strchr(dataset, ',');
3696 
3697 		if (next == NULL)
3698 			len = strlen(dataset);
3699 		else
3700 			len = next - dataset;
3701 
3702 		zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
3703 		bcopy(dataset, zd->zd_dataset, len);
3704 		zd->zd_dataset[len] = '\0';
3705 
3706 		list_insert_head(&zone->zone_datasets, zd);
3707 
3708 		if (next == NULL)
3709 			break;
3710 
3711 		dataset = next + 1;
3712 	}
3713 
3714 	kmem_free(kbuf, buflen);
3715 	return (0);
3716 }
3717 
3718 /*
3719  * System call to create/initialize a new zone named 'zone_name', rooted
3720  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
3721  * and initialized with the zone-wide rctls described in 'rctlbuf', and
3722  * with labeling set by 'match', 'doi', and 'label'.
3723  *
3724  * If extended error is non-null, we may use it to return more detailed
3725  * error information.
3726  */
3727 static zoneid_t
3728 zone_create(const char *zone_name, const char *zone_root,
3729     const priv_set_t *zone_privs, size_t zone_privssz,
3730     caddr_t rctlbuf, size_t rctlbufsz,
3731     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
3732     int match, uint32_t doi, const bslabel_t *label,
3733     int flags)
3734 {
3735 	struct zsched_arg zarg;
3736 	nvlist_t *rctls = NULL;
3737 	proc_t *pp = curproc;
3738 	zone_t *zone, *ztmp;
3739 	zoneid_t zoneid;
3740 	int error;
3741 	int error2 = 0;
3742 	char *str;
3743 	cred_t *zkcr;
3744 	boolean_t insert_label_hash;
3745 
3746 	if (secpolicy_zone_config(CRED()) != 0)
3747 		return (set_errno(EPERM));
3748 
3749 	/* can't boot zone from within chroot environment */
3750 	if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
3751 		return (zone_create_error(ENOTSUP, ZE_CHROOTED,
3752 		    extended_error));
3753 
3754 	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
3755 	zoneid = zone->zone_id = id_alloc(zoneid_space);
3756 	zone->zone_status = ZONE_IS_UNINITIALIZED;
3757 	zone->zone_pool = pool_default;
3758 	zone->zone_pool_mod = gethrtime();
3759 	zone->zone_psetid = ZONE_PS_INVAL;
3760 	zone->zone_ncpus = 0;
3761 	zone->zone_ncpus_online = 0;
3762 	zone->zone_restart_init = B_TRUE;
3763 	zone->zone_brand = &native_brand;
3764 	zone->zone_initname = NULL;
3765 	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
3766 	mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
3767 	mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
3768 	cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
3769 	list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
3770 	    offsetof(struct zsd_entry, zsd_linkage));
3771 	list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
3772 	    offsetof(zone_dataset_t, zd_linkage));
3773 	rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
3774 
3775 	if (flags & ZCF_NET_EXCL) {
3776 		zone->zone_flags |= ZF_NET_EXCL;
3777 	}
3778 
3779 	if ((error = zone_set_name(zone, zone_name)) != 0) {
3780 		zone_free(zone);
3781 		return (zone_create_error(error, 0, extended_error));
3782 	}
3783 
3784 	if ((error = zone_set_root(zone, zone_root)) != 0) {
3785 		zone_free(zone);
3786 		return (zone_create_error(error, 0, extended_error));
3787 	}
3788 	if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
3789 		zone_free(zone);
3790 		return (zone_create_error(error, 0, extended_error));
3791 	}
3792 
3793 	/* initialize node name to be the same as zone name */
3794 	zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
3795 	(void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
3796 	zone->zone_nodename[_SYS_NMLN - 1] = '\0';
3797 
3798 	zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
3799 	zone->zone_domain[0] = '\0';
3800 	zone->zone_shares = 1;
3801 	zone->zone_shmmax = 0;
3802 	zone->zone_ipc.ipcq_shmmni = 0;
3803 	zone->zone_ipc.ipcq_semmni = 0;
3804 	zone->zone_ipc.ipcq_msgmni = 0;
3805 	zone->zone_bootargs = NULL;
3806 	zone->zone_initname =
3807 	    kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
3808 	(void) strcpy(zone->zone_initname, zone_default_initname);
3809 	zone->zone_nlwps = 0;
3810 	zone->zone_nlwps_ctl = INT_MAX;
3811 	zone->zone_locked_mem = 0;
3812 	zone->zone_locked_mem_ctl = UINT64_MAX;
3813 	zone->zone_max_swap = 0;
3814 	zone->zone_max_swap_ctl = UINT64_MAX;
3815 	zone0.zone_lockedmem_kstat = NULL;
3816 	zone0.zone_swapresv_kstat = NULL;
3817 
3818 	/*
3819 	 * Zsched initializes the rctls.
3820 	 */
3821 	zone->zone_rctls = NULL;
3822 
3823 	if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
3824 		zone_free(zone);
3825 		return (zone_create_error(error, 0, extended_error));
3826 	}
3827 
3828 	if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
3829 		zone_free(zone);
3830 		return (set_errno(error));
3831 	}
3832 
3833 	/*
3834 	 * Read in the trusted system parameters:
3835 	 * match flag and sensitivity label.
3836 	 */
3837 	zone->zone_match = match;
3838 	if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
3839 		/* Fail if requested to set doi to anything but system's doi */
3840 		if (doi != 0 && doi != default_doi) {
3841 			zone_free(zone);
3842 			return (set_errno(EINVAL));
3843 		}
3844 		/* Always apply system's doi to the zone */
3845 		error = zone_set_label(zone, label, default_doi);
3846 		if (error != 0) {
3847 			zone_free(zone);
3848 			return (set_errno(error));
3849 		}
3850 		insert_label_hash = B_TRUE;
3851 	} else {
3852 		/* all zones get an admin_low label if system is not labeled */
3853 		zone->zone_slabel = l_admin_low;
3854 		label_hold(l_admin_low);
3855 		insert_label_hash = B_FALSE;
3856 	}
3857 
3858 	/*
3859 	 * Stop all lwps since that's what normally happens as part of fork().
3860 	 * This needs to happen before we grab any locks to avoid deadlock
3861 	 * (another lwp in the process could be waiting for the held lock).
3862 	 */
3863 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
3864 		zone_free(zone);
3865 		if (rctls)
3866 			nvlist_free(rctls);
3867 		return (zone_create_error(error, 0, extended_error));
3868 	}
3869 
3870 	if (block_mounts() == 0) {
3871 		mutex_enter(&pp->p_lock);
3872 		if (curthread != pp->p_agenttp)
3873 			continuelwps(pp);
3874 		mutex_exit(&pp->p_lock);
3875 		zone_free(zone);
3876 		if (rctls)
3877 			nvlist_free(rctls);
3878 		return (zone_create_error(error, 0, extended_error));
3879 	}
3880 
3881 	/*
3882 	 * Set up credential for kernel access.  After this, any errors
3883 	 * should go through the dance in errout rather than calling
3884 	 * zone_free directly.
3885 	 */
3886 	zone->zone_kcred = crdup(kcred);
3887 	crsetzone(zone->zone_kcred, zone);
3888 	priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
3889 	priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
3890 	priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
3891 	priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
3892 
3893 	mutex_enter(&zonehash_lock);
3894 	/*
3895 	 * Make sure zone doesn't already exist.
3896 	 *
3897 	 * If the system and zone are labeled,
3898 	 * make sure no other zone exists that has the same label.
3899 	 */
3900 	if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
3901 	    (insert_label_hash &&
3902 	    (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
3903 		zone_status_t status;
3904 
3905 		status = zone_status_get(ztmp);
3906 		if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
3907 			error = EEXIST;
3908 		else
3909 			error = EBUSY;
3910 
3911 		if (insert_label_hash)
3912 			error2 = ZE_LABELINUSE;
3913 
3914 		goto errout;
3915 	}
3916 
3917 	/*
3918 	 * Don't allow zone creations which would cause one zone's rootpath to
3919 	 * be accessible from that of another (non-global) zone.
3920 	 */
3921 	if (zone_is_nested(zone->zone_rootpath)) {
3922 		error = EBUSY;
3923 		goto errout;
3924 	}
3925 
3926 	ASSERT(zonecount != 0);		/* check for leaks */
3927 	if (zonecount + 1 > maxzones) {
3928 		error = ENOMEM;
3929 		goto errout;
3930 	}
3931 
3932 	if (zone_mount_count(zone->zone_rootpath) != 0) {
3933 		error = EBUSY;
3934 		error2 = ZE_AREMOUNTS;
3935 		goto errout;
3936 	}
3937 
3938 	/*
3939 	 * Zone is still incomplete, but we need to drop all locks while
3940 	 * zsched() initializes this zone's kernel process.  We
3941 	 * optimistically add the zone to the hashtable and associated
3942 	 * lists so a parallel zone_create() doesn't try to create the
3943 	 * same zone.
3944 	 */
3945 	zonecount++;
3946 	(void) mod_hash_insert(zonehashbyid,
3947 	    (mod_hash_key_t)(uintptr_t)zone->zone_id,
3948 	    (mod_hash_val_t)(uintptr_t)zone);
3949 	str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
3950 	(void) strcpy(str, zone->zone_name);
3951 	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
3952 	    (mod_hash_val_t)(uintptr_t)zone);
3953 	if (insert_label_hash) {
3954 		(void) mod_hash_insert(zonehashbylabel,
3955 		    (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
3956 		zone->zone_flags |= ZF_HASHED_LABEL;
3957 	}
3958 
3959 	/*
3960 	 * Insert into active list.  At this point there are no 'hold's
3961 	 * on the zone, but everyone else knows not to use it, so we can
3962 	 * continue to use it.  zsched() will do a zone_hold() if the
3963 	 * newproc() is successful.
3964 	 */
3965 	list_insert_tail(&zone_active, zone);
3966 	mutex_exit(&zonehash_lock);
3967 
3968 	zarg.zone = zone;
3969 	zarg.nvlist = rctls;
3970 	/*
3971 	 * The process, task, and project rctls are probably wrong;
3972 	 * we need an interface to get the default values of all rctls,
3973 	 * and initialize zsched appropriately.  I'm not sure that that
3974 	 * makes much of a difference, though.
3975 	 */
3976 	if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) {
3977 		/*
3978 		 * We need to undo all globally visible state.
3979 		 */
3980 		mutex_enter(&zonehash_lock);
3981 		list_remove(&zone_active, zone);
3982 		if (zone->zone_flags & ZF_HASHED_LABEL) {
3983 			ASSERT(zone->zone_slabel != NULL);
3984 			(void) mod_hash_destroy(zonehashbylabel,
3985 			    (mod_hash_key_t)zone->zone_slabel);
3986 		}
3987 		(void) mod_hash_destroy(zonehashbyname,
3988 		    (mod_hash_key_t)(uintptr_t)zone->zone_name);
3989 		(void) mod_hash_destroy(zonehashbyid,
3990 		    (mod_hash_key_t)(uintptr_t)zone->zone_id);
3991 		ASSERT(zonecount > 1);
3992 		zonecount--;
3993 		goto errout;
3994 	}
3995 
3996 	/*
3997 	 * Zone creation can't fail from now on.
3998 	 */
3999 
4000 	/*
4001 	 * Create zone kstats
4002 	 */
4003 	zone_kstat_create(zone);
4004 
4005 	/*
4006 	 * Let the other lwps continue.
4007 	 */
4008 	mutex_enter(&pp->p_lock);
4009 	if (curthread != pp->p_agenttp)
4010 		continuelwps(pp);
4011 	mutex_exit(&pp->p_lock);
4012 
4013 	/*
4014 	 * Wait for zsched to finish initializing the zone.
4015 	 */
4016 	zone_status_wait(zone, ZONE_IS_READY);
4017 	/*
4018 	 * The zone is fully visible, so we can let mounts progress.
4019 	 */
4020 	resume_mounts();
4021 	if (rctls)
4022 		nvlist_free(rctls);
4023 
4024 	return (zoneid);
4025 
4026 errout:
4027 	mutex_exit(&zonehash_lock);
4028 	/*
4029 	 * Let the other lwps continue.
4030 	 */
4031 	mutex_enter(&pp->p_lock);
4032 	if (curthread != pp->p_agenttp)
4033 		continuelwps(pp);
4034 	mutex_exit(&pp->p_lock);
4035 
4036 	resume_mounts();
4037 	if (rctls)
4038 		nvlist_free(rctls);
4039 	/*
4040 	 * There is currently one reference to the zone, a cred_ref from
4041 	 * zone_kcred.  To free the zone, we call crfree, which will call
4042 	 * zone_cred_rele, which will call zone_free.
4043 	 */
4044 	ASSERT(zone->zone_cred_ref == 1);	/* for zone_kcred */
4045 	ASSERT(zone->zone_kcred->cr_ref == 1);
4046 	ASSERT(zone->zone_ref == 0);
4047 	zkcr = zone->zone_kcred;
4048 	zone->zone_kcred = NULL;
4049 	crfree(zkcr);				/* triggers call to zone_free */
4050 	return (zone_create_error(error, error2, extended_error));
4051 }
4052 
4053 /*
4054  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
4055  * the heavy lifting.  initname is the path to the program to launch
4056  * at the "top" of the zone; if this is NULL, we use the system default,
4057  * which is stored at zone_default_initname.
4058  */
4059 static int
4060 zone_boot(zoneid_t zoneid)
4061 {
4062 	int err;
4063 	zone_t *zone;
4064 
4065 	if (secpolicy_zone_config(CRED()) != 0)
4066 		return (set_errno(EPERM));
4067 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4068 		return (set_errno(EINVAL));
4069 
4070 	mutex_enter(&zonehash_lock);
4071 	/*
4072 	 * Look for zone under hash lock to prevent races with calls to
4073 	 * zone_shutdown, zone_destroy, etc.
4074 	 */
4075 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4076 		mutex_exit(&zonehash_lock);
4077 		return (set_errno(EINVAL));
4078 	}
4079 
4080 	mutex_enter(&zone_status_lock);
4081 	if (zone_status_get(zone) != ZONE_IS_READY) {
4082 		mutex_exit(&zone_status_lock);
4083 		mutex_exit(&zonehash_lock);
4084 		return (set_errno(EINVAL));
4085 	}
4086 	zone_status_set(zone, ZONE_IS_BOOTING);
4087 	mutex_exit(&zone_status_lock);
4088 
4089 	zone_hold(zone);	/* so we can use the zone_t later */
4090 	mutex_exit(&zonehash_lock);
4091 
4092 	if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
4093 		zone_rele(zone);
4094 		return (set_errno(EINTR));
4095 	}
4096 
4097 	/*
4098 	 * Boot (starting init) might have failed, in which case the zone
4099 	 * will go to the SHUTTING_DOWN state; an appropriate errno will
4100 	 * be placed in zone->zone_boot_err, and so we return that.
4101 	 */
4102 	err = zone->zone_boot_err;
4103 	zone_rele(zone);
4104 	return (err ? set_errno(err) : 0);
4105 }
4106 
4107 /*
4108  * Kills all user processes in the zone, waiting for them all to exit
4109  * before returning.
4110  */
4111 static int
4112 zone_empty(zone_t *zone)
4113 {
4114 	int waitstatus;
4115 
4116 	/*
4117 	 * We need to drop zonehash_lock before killing all
4118 	 * processes, otherwise we'll deadlock with zone_find_*
4119 	 * which can be called from the exit path.
4120 	 */
4121 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
4122 	while ((waitstatus = zone_status_timedwait_sig(zone, lbolt + hz,
4123 	    ZONE_IS_EMPTY)) == -1) {
4124 		killall(zone->zone_id);
4125 	}
4126 	/*
4127 	 * return EINTR if we were signaled
4128 	 */
4129 	if (waitstatus == 0)
4130 		return (EINTR);
4131 	return (0);
4132 }
4133 
4134 /*
4135  * This function implements the policy for zone visibility.
4136  *
4137  * In standard Solaris, a non-global zone can only see itself.
4138  *
4139  * In Trusted Extensions, a labeled zone can lookup any zone whose label
4140  * it dominates. For this test, the label of the global zone is treated as
4141  * admin_high so it is special-cased instead of being checked for dominance.
4142  *
4143  * Returns true if zone attributes are viewable, false otherwise.
4144  */
4145 static boolean_t
4146 zone_list_access(zone_t *zone)
4147 {
4148 
4149 	if (curproc->p_zone == global_zone ||
4150 	    curproc->p_zone == zone) {
4151 		return (B_TRUE);
4152 	} else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
4153 		bslabel_t *curproc_label;
4154 		bslabel_t *zone_label;
4155 
4156 		curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
4157 		zone_label = label2bslabel(zone->zone_slabel);
4158 
4159 		if (zone->zone_id != GLOBAL_ZONEID &&
4160 		    bldominates(curproc_label, zone_label)) {
4161 			return (B_TRUE);
4162 		} else {
4163 			return (B_FALSE);
4164 		}
4165 	} else {
4166 		return (B_FALSE);
4167 	}
4168 }
4169 
4170 /*
4171  * Systemcall to start the zone's halt sequence.  By the time this
4172  * function successfully returns, all user processes and kernel threads
4173  * executing in it will have exited, ZSD shutdown callbacks executed,
4174  * and the zone status set to ZONE_IS_DOWN.
4175  *
4176  * It is possible that the call will interrupt itself if the caller is the
4177  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
4178  */
4179 static int
4180 zone_shutdown(zoneid_t zoneid)
4181 {
4182 	int error;
4183 	zone_t *zone;
4184 	zone_status_t status;
4185 
4186 	if (secpolicy_zone_config(CRED()) != 0)
4187 		return (set_errno(EPERM));
4188 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4189 		return (set_errno(EINVAL));
4190 
4191 	/*
4192 	 * Block mounts so that VFS_MOUNT() can get an accurate view of
4193 	 * the zone's status with regards to ZONE_IS_SHUTTING down.
4194 	 *
4195 	 * e.g. NFS can fail the mount if it determines that the zone
4196 	 * has already begun the shutdown sequence.
4197 	 */
4198 	if (block_mounts() == 0)
4199 		return (set_errno(EINTR));
4200 	mutex_enter(&zonehash_lock);
4201 	/*
4202 	 * Look for zone under hash lock to prevent races with other
4203 	 * calls to zone_shutdown and zone_destroy.
4204 	 */
4205 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4206 		mutex_exit(&zonehash_lock);
4207 		resume_mounts();
4208 		return (set_errno(EINVAL));
4209 	}
4210 	mutex_enter(&zone_status_lock);
4211 	status = zone_status_get(zone);
4212 	/*
4213 	 * Fail if the zone isn't fully initialized yet.
4214 	 */
4215 	if (status < ZONE_IS_READY) {
4216 		mutex_exit(&zone_status_lock);
4217 		mutex_exit(&zonehash_lock);
4218 		resume_mounts();
4219 		return (set_errno(EINVAL));
4220 	}
4221 	/*
4222 	 * If conditions required for zone_shutdown() to return have been met,
4223 	 * return success.
4224 	 */
4225 	if (status >= ZONE_IS_DOWN) {
4226 		mutex_exit(&zone_status_lock);
4227 		mutex_exit(&zonehash_lock);
4228 		resume_mounts();
4229 		return (0);
4230 	}
4231 	/*
4232 	 * If zone_shutdown() hasn't been called before, go through the motions.
4233 	 * If it has, there's nothing to do but wait for the kernel threads to
4234 	 * drain.
4235 	 */
4236 	if (status < ZONE_IS_EMPTY) {
4237 		uint_t ntasks;
4238 
4239 		mutex_enter(&zone->zone_lock);
4240 		if ((ntasks = zone->zone_ntasks) != 1) {
4241 			/*
4242 			 * There's still stuff running.
4243 			 */
4244 			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
4245 		}
4246 		mutex_exit(&zone->zone_lock);
4247 		if (ntasks == 1) {
4248 			/*
4249 			 * The only way to create another task is through
4250 			 * zone_enter(), which will block until we drop
4251 			 * zonehash_lock.  The zone is empty.
4252 			 */
4253 			if (zone->zone_kthreads == NULL) {
4254 				/*
4255 				 * Skip ahead to ZONE_IS_DOWN
4256 				 */
4257 				zone_status_set(zone, ZONE_IS_DOWN);
4258 			} else {
4259 				zone_status_set(zone, ZONE_IS_EMPTY);
4260 			}
4261 		}
4262 	}
4263 	zone_hold(zone);	/* so we can use the zone_t later */
4264 	mutex_exit(&zone_status_lock);
4265 	mutex_exit(&zonehash_lock);
4266 	resume_mounts();
4267 
4268 	if (error = zone_empty(zone)) {
4269 		zone_rele(zone);
4270 		return (set_errno(error));
4271 	}
4272 	/*
4273 	 * After the zone status goes to ZONE_IS_DOWN this zone will no
4274 	 * longer be notified of changes to the pools configuration, so
4275 	 * in order to not end up with a stale pool pointer, we point
4276 	 * ourselves at the default pool and remove all resource
4277 	 * visibility.  This is especially important as the zone_t may
4278 	 * languish on the deathrow for a very long time waiting for
4279 	 * cred's to drain out.
4280 	 *
4281 	 * This rebinding of the zone can happen multiple times
4282 	 * (presumably due to interrupted or parallel systemcalls)
4283 	 * without any adverse effects.
4284 	 */
4285 	if (pool_lock_intr() != 0) {
4286 		zone_rele(zone);
4287 		return (set_errno(EINTR));
4288 	}
4289 	if (pool_state == POOL_ENABLED) {
4290 		mutex_enter(&cpu_lock);
4291 		zone_pool_set(zone, pool_default);
4292 		/*
4293 		 * The zone no longer needs to be able to see any cpus.
4294 		 */
4295 		zone_pset_set(zone, ZONE_PS_INVAL);
4296 		mutex_exit(&cpu_lock);
4297 	}
4298 	pool_unlock();
4299 
4300 	/*
4301 	 * ZSD shutdown callbacks can be executed multiple times, hence
4302 	 * it is safe to not be holding any locks across this call.
4303 	 */
4304 	zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
4305 
4306 	mutex_enter(&zone_status_lock);
4307 	if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
4308 		zone_status_set(zone, ZONE_IS_DOWN);
4309 	mutex_exit(&zone_status_lock);
4310 
4311 	/*
4312 	 * Wait for kernel threads to drain.
4313 	 */
4314 	if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
4315 		zone_rele(zone);
4316 		return (set_errno(EINTR));
4317 	}
4318 
4319 	/*
4320 	 * Zone can be become down/destroyable even if the above wait
4321 	 * returns EINTR, so any code added here may never execute.
4322 	 * (i.e. don't add code here)
4323 	 */
4324 
4325 	zone_rele(zone);
4326 	return (0);
4327 }
4328 
4329 /*
4330  * Systemcall entry point to finalize the zone halt process.  The caller
4331  * must have already successfully called zone_shutdown().
4332  *
4333  * Upon successful completion, the zone will have been fully destroyed:
4334  * zsched will have exited, destructor callbacks executed, and the zone
4335  * removed from the list of active zones.
4336  */
4337 static int
4338 zone_destroy(zoneid_t zoneid)
4339 {
4340 	uint64_t uniqid;
4341 	zone_t *zone;
4342 	zone_status_t status;
4343 
4344 	if (secpolicy_zone_config(CRED()) != 0)
4345 		return (set_errno(EPERM));
4346 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4347 		return (set_errno(EINVAL));
4348 
4349 	mutex_enter(&zonehash_lock);
4350 	/*
4351 	 * Look for zone under hash lock to prevent races with other
4352 	 * calls to zone_destroy.
4353 	 */
4354 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4355 		mutex_exit(&zonehash_lock);
4356 		return (set_errno(EINVAL));
4357 	}
4358 
4359 	if (zone_mount_count(zone->zone_rootpath) != 0) {
4360 		mutex_exit(&zonehash_lock);
4361 		return (set_errno(EBUSY));
4362 	}
4363 	mutex_enter(&zone_status_lock);
4364 	status = zone_status_get(zone);
4365 	if (status < ZONE_IS_DOWN) {
4366 		mutex_exit(&zone_status_lock);
4367 		mutex_exit(&zonehash_lock);
4368 		return (set_errno(EBUSY));
4369 	} else if (status == ZONE_IS_DOWN) {
4370 		zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
4371 	}
4372 	mutex_exit(&zone_status_lock);
4373 	zone_hold(zone);
4374 	mutex_exit(&zonehash_lock);
4375 
4376 	/*
4377 	 * wait for zsched to exit
4378 	 */
4379 	zone_status_wait(zone, ZONE_IS_DEAD);
4380 	zone_zsd_callbacks(zone, ZSD_DESTROY);
4381 	zone->zone_netstack = NULL;
4382 	uniqid = zone->zone_uniqid;
4383 	zone_rele(zone);
4384 	zone = NULL;	/* potentially free'd */
4385 
4386 	mutex_enter(&zonehash_lock);
4387 	for (; /* ever */; ) {
4388 		boolean_t unref;
4389 
4390 		if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
4391 		    zone->zone_uniqid != uniqid) {
4392 			/*
4393 			 * The zone has gone away.  Necessary conditions
4394 			 * are met, so we return success.
4395 			 */
4396 			mutex_exit(&zonehash_lock);
4397 			return (0);
4398 		}
4399 		mutex_enter(&zone->zone_lock);
4400 		unref = ZONE_IS_UNREF(zone);
4401 		mutex_exit(&zone->zone_lock);
4402 		if (unref) {
4403 			/*
4404 			 * There is only one reference to the zone -- that
4405 			 * added when the zone was added to the hashtables --
4406 			 * and things will remain this way until we drop
4407 			 * zonehash_lock... we can go ahead and cleanup the
4408 			 * zone.
4409 			 */
4410 			break;
4411 		}
4412 
4413 		if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
4414 			/* Signaled */
4415 			mutex_exit(&zonehash_lock);
4416 			return (set_errno(EINTR));
4417 		}
4418 
4419 	}
4420 
4421 	/*
4422 	 * Remove CPU cap for this zone now since we're not going to
4423 	 * fail below this point.
4424 	 */
4425 	cpucaps_zone_remove(zone);
4426 
4427 	/* Get rid of the zone's kstats */
4428 	zone_kstat_delete(zone);
4429 
4430 	/* free brand specific data */
4431 	if (ZONE_IS_BRANDED(zone))
4432 		ZBROP(zone)->b_free_brand_data(zone);
4433 
4434 	/* Say goodbye to brand framework. */
4435 	brand_unregister_zone(zone->zone_brand);
4436 
4437 	/*
4438 	 * It is now safe to let the zone be recreated; remove it from the
4439 	 * lists.  The memory will not be freed until the last cred
4440 	 * reference goes away.
4441 	 */
4442 	ASSERT(zonecount > 1);	/* must be > 1; can't destroy global zone */
4443 	zonecount--;
4444 	/* remove from active list and hash tables */
4445 	list_remove(&zone_active, zone);
4446 	(void) mod_hash_destroy(zonehashbyname,
4447 	    (mod_hash_key_t)zone->zone_name);
4448 	(void) mod_hash_destroy(zonehashbyid,
4449 	    (mod_hash_key_t)(uintptr_t)zone->zone_id);
4450 	if (zone->zone_flags & ZF_HASHED_LABEL)
4451 		(void) mod_hash_destroy(zonehashbylabel,
4452 		    (mod_hash_key_t)zone->zone_slabel);
4453 	mutex_exit(&zonehash_lock);
4454 
4455 	/*
4456 	 * Release the root vnode; we're not using it anymore.  Nor should any
4457 	 * other thread that might access it exist.
4458 	 */
4459 	if (zone->zone_rootvp != NULL) {
4460 		VN_RELE(zone->zone_rootvp);
4461 		zone->zone_rootvp = NULL;
4462 	}
4463 
4464 	/* add to deathrow list */
4465 	mutex_enter(&zone_deathrow_lock);
4466 	list_insert_tail(&zone_deathrow, zone);
4467 	mutex_exit(&zone_deathrow_lock);
4468 
4469 	/*
4470 	 * Drop last reference (which was added by zsched()), this will
4471 	 * free the zone unless there are outstanding cred references.
4472 	 */
4473 	zone_rele(zone);
4474 	return (0);
4475 }
4476 
4477 /*
4478  * Systemcall entry point for zone_getattr(2).
4479  */
4480 static ssize_t
4481 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
4482 {
4483 	size_t size;
4484 	int error = 0, err;
4485 	zone_t *zone;
4486 	char *zonepath;
4487 	char *outstr;
4488 	zone_status_t zone_status;
4489 	pid_t initpid;
4490 	boolean_t global = (curzone == global_zone);
4491 	boolean_t inzone = (curzone->zone_id == zoneid);
4492 	ushort_t flags;
4493 
4494 	mutex_enter(&zonehash_lock);
4495 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4496 		mutex_exit(&zonehash_lock);
4497 		return (set_errno(EINVAL));
4498 	}
4499 	zone_status = zone_status_get(zone);
4500 	if (zone_status < ZONE_IS_INITIALIZED) {
4501 		mutex_exit(&zonehash_lock);
4502 		return (set_errno(EINVAL));
4503 	}
4504 	zone_hold(zone);
4505 	mutex_exit(&zonehash_lock);
4506 
4507 	/*
4508 	 * If not in the global zone, don't show information about other zones,
4509 	 * unless the system is labeled and the local zone's label dominates
4510 	 * the other zone.
4511 	 */
4512 	if (!zone_list_access(zone)) {
4513 		zone_rele(zone);
4514 		return (set_errno(EINVAL));
4515 	}
4516 
4517 	switch (attr) {
4518 	case ZONE_ATTR_ROOT:
4519 		if (global) {
4520 			/*
4521 			 * Copy the path to trim the trailing "/" (except for
4522 			 * the global zone).
4523 			 */
4524 			if (zone != global_zone)
4525 				size = zone->zone_rootpathlen - 1;
4526 			else
4527 				size = zone->zone_rootpathlen;
4528 			zonepath = kmem_alloc(size, KM_SLEEP);
4529 			bcopy(zone->zone_rootpath, zonepath, size);
4530 			zonepath[size - 1] = '\0';
4531 		} else {
4532 			if (inzone || !is_system_labeled()) {
4533 				/*
4534 				 * Caller is not in the global zone.
4535 				 * if the query is on the current zone
4536 				 * or the system is not labeled,
4537 				 * just return faked-up path for current zone.
4538 				 */
4539 				zonepath = "/";
4540 				size = 2;
4541 			} else {
4542 				/*
4543 				 * Return related path for current zone.
4544 				 */
4545 				int prefix_len = strlen(zone_prefix);
4546 				int zname_len = strlen(zone->zone_name);
4547 
4548 				size = prefix_len + zname_len + 1;
4549 				zonepath = kmem_alloc(size, KM_SLEEP);
4550 				bcopy(zone_prefix, zonepath, prefix_len);
4551 				bcopy(zone->zone_name, zonepath +
4552 				    prefix_len, zname_len);
4553 				zonepath[size - 1] = '\0';
4554 			}
4555 		}
4556 		if (bufsize > size)
4557 			bufsize = size;
4558 		if (buf != NULL) {
4559 			err = copyoutstr(zonepath, buf, bufsize, NULL);
4560 			if (err != 0 && err != ENAMETOOLONG)
4561 				error = EFAULT;
4562 		}
4563 		if (global || (is_system_labeled() && !inzone))
4564 			kmem_free(zonepath, size);
4565 		break;
4566 
4567 	case ZONE_ATTR_NAME:
4568 		size = strlen(zone->zone_name) + 1;
4569 		if (bufsize > size)
4570 			bufsize = size;
4571 		if (buf != NULL) {
4572 			err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
4573 			if (err != 0 && err != ENAMETOOLONG)
4574 				error = EFAULT;
4575 		}
4576 		break;
4577 
4578 	case ZONE_ATTR_STATUS:
4579 		/*
4580 		 * Since we're not holding zonehash_lock, the zone status
4581 		 * may be anything; leave it up to userland to sort it out.
4582 		 */
4583 		size = sizeof (zone_status);
4584 		if (bufsize > size)
4585 			bufsize = size;
4586 		zone_status = zone_status_get(zone);
4587 		if (buf != NULL &&
4588 		    copyout(&zone_status, buf, bufsize) != 0)
4589 			error = EFAULT;
4590 		break;
4591 	case ZONE_ATTR_FLAGS:
4592 		size = sizeof (zone->zone_flags);
4593 		if (bufsize > size)
4594 			bufsize = size;
4595 		flags = zone->zone_flags;
4596 		if (buf != NULL &&
4597 		    copyout(&flags, buf, bufsize) != 0)
4598 			error = EFAULT;
4599 		break;
4600 	case ZONE_ATTR_PRIVSET:
4601 		size = sizeof (priv_set_t);
4602 		if (bufsize > size)
4603 			bufsize = size;
4604 		if (buf != NULL &&
4605 		    copyout(zone->zone_privset, buf, bufsize) != 0)
4606 			error = EFAULT;
4607 		break;
4608 	case ZONE_ATTR_UNIQID:
4609 		size = sizeof (zone->zone_uniqid);
4610 		if (bufsize > size)
4611 			bufsize = size;
4612 		if (buf != NULL &&
4613 		    copyout(&zone->zone_uniqid, buf, bufsize) != 0)
4614 			error = EFAULT;
4615 		break;
4616 	case ZONE_ATTR_POOLID:
4617 		{
4618 			pool_t *pool;
4619 			poolid_t poolid;
4620 
4621 			if (pool_lock_intr() != 0) {
4622 				error = EINTR;
4623 				break;
4624 			}
4625 			pool = zone_pool_get(zone);
4626 			poolid = pool->pool_id;
4627 			pool_unlock();
4628 			size = sizeof (poolid);
4629 			if (bufsize > size)
4630 				bufsize = size;
4631 			if (buf != NULL && copyout(&poolid, buf, size) != 0)
4632 				error = EFAULT;
4633 		}
4634 		break;
4635 	case ZONE_ATTR_SLBL:
4636 		size = sizeof (bslabel_t);
4637 		if (bufsize > size)
4638 			bufsize = size;
4639 		if (zone->zone_slabel == NULL)
4640 			error = EINVAL;
4641 		else if (buf != NULL &&
4642 		    copyout(label2bslabel(zone->zone_slabel), buf,
4643 		    bufsize) != 0)
4644 			error = EFAULT;
4645 		break;
4646 	case ZONE_ATTR_INITPID:
4647 		size = sizeof (initpid);
4648 		if (bufsize > size)
4649 			bufsize = size;
4650 		initpid = zone->zone_proc_initpid;
4651 		if (initpid == -1) {
4652 			error = ESRCH;
4653 			break;
4654 		}
4655 		if (buf != NULL &&
4656 		    copyout(&initpid, buf, bufsize) != 0)
4657 			error = EFAULT;
4658 		break;
4659 	case ZONE_ATTR_BRAND:
4660 		size = strlen(zone->zone_brand->b_name) + 1;
4661 
4662 		if (bufsize > size)
4663 			bufsize = size;
4664 		if (buf != NULL) {
4665 			err = copyoutstr(zone->zone_brand->b_name, buf,
4666 			    bufsize, NULL);
4667 			if (err != 0 && err != ENAMETOOLONG)
4668 				error = EFAULT;
4669 		}
4670 		break;
4671 	case ZONE_ATTR_INITNAME:
4672 		size = strlen(zone->zone_initname) + 1;
4673 		if (bufsize > size)
4674 			bufsize = size;
4675 		if (buf != NULL) {
4676 			err = copyoutstr(zone->zone_initname, buf, bufsize,
4677 			    NULL);
4678 			if (err != 0 && err != ENAMETOOLONG)
4679 				error = EFAULT;
4680 		}
4681 		break;
4682 	case ZONE_ATTR_BOOTARGS:
4683 		if (zone->zone_bootargs == NULL)
4684 			outstr = "";
4685 		else
4686 			outstr = zone->zone_bootargs;
4687 		size = strlen(outstr) + 1;
4688 		if (bufsize > size)
4689 			bufsize = size;
4690 		if (buf != NULL) {
4691 			err = copyoutstr(outstr, buf, bufsize, NULL);
4692 			if (err != 0 && err != ENAMETOOLONG)
4693 				error = EFAULT;
4694 		}
4695 		break;
4696 	case ZONE_ATTR_PHYS_MCAP:
4697 		size = sizeof (zone->zone_phys_mcap);
4698 		if (bufsize > size)
4699 			bufsize = size;
4700 		if (buf != NULL &&
4701 		    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
4702 			error = EFAULT;
4703 		break;
4704 	case ZONE_ATTR_SCHED_CLASS:
4705 		mutex_enter(&class_lock);
4706 
4707 		if (zone->zone_defaultcid >= loaded_classes)
4708 			outstr = "";
4709 		else
4710 			outstr = sclass[zone->zone_defaultcid].cl_name;
4711 		size = strlen(outstr) + 1;
4712 		if (bufsize > size)
4713 			bufsize = size;
4714 		if (buf != NULL) {
4715 			err = copyoutstr(outstr, buf, bufsize, NULL);
4716 			if (err != 0 && err != ENAMETOOLONG)
4717 				error = EFAULT;
4718 		}
4719 
4720 		mutex_exit(&class_lock);
4721 		break;
4722 	default:
4723 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
4724 			size = bufsize;
4725 			error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
4726 		} else {
4727 			error = EINVAL;
4728 		}
4729 	}
4730 	zone_rele(zone);
4731 
4732 	if (error)
4733 		return (set_errno(error));
4734 	return ((ssize_t)size);
4735 }
4736 
4737 /*
4738  * Systemcall entry point for zone_setattr(2).
4739  */
4740 /*ARGSUSED*/
4741 static int
4742 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
4743 {
4744 	zone_t *zone;
4745 	zone_status_t zone_status;
4746 	int err;
4747 
4748 	if (secpolicy_zone_config(CRED()) != 0)
4749 		return (set_errno(EPERM));
4750 
4751 	/*
4752 	 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
4753 	 * global zone.
4754 	 */
4755 	if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
4756 		return (set_errno(EINVAL));
4757 	}
4758 
4759 	mutex_enter(&zonehash_lock);
4760 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4761 		mutex_exit(&zonehash_lock);
4762 		return (set_errno(EINVAL));
4763 	}
4764 	zone_hold(zone);
4765 	mutex_exit(&zonehash_lock);
4766 
4767 	/*
4768 	 * At present most attributes can only be set on non-running,
4769 	 * non-global zones.
4770 	 */
4771 	zone_status = zone_status_get(zone);
4772 	if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY)
4773 		goto done;
4774 
4775 	switch (attr) {
4776 	case ZONE_ATTR_INITNAME:
4777 		err = zone_set_initname(zone, (const char *)buf);
4778 		break;
4779 	case ZONE_ATTR_BOOTARGS:
4780 		err = zone_set_bootargs(zone, (const char *)buf);
4781 		break;
4782 	case ZONE_ATTR_BRAND:
4783 		err = zone_set_brand(zone, (const char *)buf);
4784 		break;
4785 	case ZONE_ATTR_PHYS_MCAP:
4786 		err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
4787 		break;
4788 	case ZONE_ATTR_SCHED_CLASS:
4789 		err = zone_set_sched_class(zone, (const char *)buf);
4790 		break;
4791 	default:
4792 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
4793 			err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
4794 		else
4795 			err = EINVAL;
4796 	}
4797 
4798 done:
4799 	zone_rele(zone);
4800 	return (err != 0 ? set_errno(err) : 0);
4801 }
4802 
4803 /*
4804  * Return zero if the process has at least one vnode mapped in to its
4805  * address space which shouldn't be allowed to change zones.
4806  *
4807  * Also return zero if the process has any shared mappings which reserve
4808  * swap.  This is because the counting for zone.max-swap does not allow swap
4809  * reservation to be shared between zones.  zone swap reservation is counted
4810  * on zone->zone_max_swap.
4811  */
4812 static int
4813 as_can_change_zones(void)
4814 {
4815 	proc_t *pp = curproc;
4816 	struct seg *seg;
4817 	struct as *as = pp->p_as;
4818 	vnode_t *vp;
4819 	int allow = 1;
4820 
4821 	ASSERT(pp->p_as != &kas);
4822 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
4823 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
4824 
4825 		/*
4826 		 * Cannot enter zone with shared anon memory which
4827 		 * reserves swap.  See comment above.
4828 		 */
4829 		if (seg_can_change_zones(seg) == B_FALSE) {
4830 			allow = 0;
4831 			break;
4832 		}
4833 		/*
4834 		 * if we can't get a backing vnode for this segment then skip
4835 		 * it.
4836 		 */
4837 		vp = NULL;
4838 		if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
4839 			continue;
4840 		if (!vn_can_change_zones(vp)) { /* bail on first match */
4841 			allow = 0;
4842 			break;
4843 		}
4844 	}
4845 	AS_LOCK_EXIT(as, &as->a_lock);
4846 	return (allow);
4847 }
4848 
4849 /*
4850  * Count swap reserved by curproc's address space
4851  */
4852 static size_t
4853 as_swresv(void)
4854 {
4855 	proc_t *pp = curproc;
4856 	struct seg *seg;
4857 	struct as *as = pp->p_as;
4858 	size_t swap = 0;
4859 
4860 	ASSERT(pp->p_as != &kas);
4861 	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
4862 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
4863 		swap += seg_swresv(seg);
4864 
4865 	return (swap);
4866 }
4867 
4868 /*
4869  * Systemcall entry point for zone_enter().
4870  *
4871  * The current process is injected into said zone.  In the process
4872  * it will change its project membership, privileges, rootdir/cwd,
4873  * zone-wide rctls, and pool association to match those of the zone.
4874  *
4875  * The first zone_enter() called while the zone is in the ZONE_IS_READY
4876  * state will transition it to ZONE_IS_RUNNING.  Processes may only
4877  * enter a zone that is "ready" or "running".
4878  */
4879 static int
4880 zone_enter(zoneid_t zoneid)
4881 {
4882 	zone_t *zone;
4883 	vnode_t *vp;
4884 	proc_t *pp = curproc;
4885 	contract_t *ct;
4886 	cont_process_t *ctp;
4887 	task_t *tk, *oldtk;
4888 	kproject_t *zone_proj0;
4889 	cred_t *cr, *newcr;
4890 	pool_t *oldpool, *newpool;
4891 	sess_t *sp;
4892 	uid_t uid;
4893 	zone_status_t status;
4894 	int err = 0;
4895 	rctl_entity_p_t e;
4896 	size_t swap;
4897 	kthread_id_t t;
4898 
4899 	if (secpolicy_zone_config(CRED()) != 0)
4900 		return (set_errno(EPERM));
4901 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4902 		return (set_errno(EINVAL));
4903 
4904 	/*
4905 	 * Stop all lwps so we don't need to hold a lock to look at
4906 	 * curproc->p_zone.  This needs to happen before we grab any
4907 	 * locks to avoid deadlock (another lwp in the process could
4908 	 * be waiting for the held lock).
4909 	 */
4910 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
4911 		return (set_errno(EINTR));
4912 
4913 	/*
4914 	 * Make sure we're not changing zones with files open or mapped in
4915 	 * to our address space which shouldn't be changing zones.
4916 	 */
4917 	if (!files_can_change_zones()) {
4918 		err = EBADF;
4919 		goto out;
4920 	}
4921 	if (!as_can_change_zones()) {
4922 		err = EFAULT;
4923 		goto out;
4924 	}
4925 
4926 	mutex_enter(&zonehash_lock);
4927 	if (pp->p_zone != global_zone) {
4928 		mutex_exit(&zonehash_lock);
4929 		err = EINVAL;
4930 		goto out;
4931 	}
4932 
4933 	zone = zone_find_all_by_id(zoneid);
4934 	if (zone == NULL) {
4935 		mutex_exit(&zonehash_lock);
4936 		err = EINVAL;
4937 		goto out;
4938 	}
4939 
4940 	/*
4941 	 * To prevent processes in a zone from holding contracts on
4942 	 * extrazonal resources, and to avoid process contract
4943 	 * memberships which span zones, contract holders and processes
4944 	 * which aren't the sole members of their encapsulating process
4945 	 * contracts are not allowed to zone_enter.
4946 	 */
4947 	ctp = pp->p_ct_process;
4948 	ct = &ctp->conp_contract;
4949 	mutex_enter(&ct->ct_lock);
4950 	mutex_enter(&pp->p_lock);
4951 	if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
4952 		mutex_exit(&pp->p_lock);
4953 		mutex_exit(&ct->ct_lock);
4954 		mutex_exit(&zonehash_lock);
4955 		err = EINVAL;
4956 		goto out;
4957 	}
4958 
4959 	/*
4960 	 * Moreover, we don't allow processes whose encapsulating
4961 	 * process contracts have inherited extrazonal contracts.
4962 	 * While it would be easier to eliminate all process contracts
4963 	 * with inherited contracts, we need to be able to give a
4964 	 * restarted init (or other zone-penetrating process) its
4965 	 * predecessor's contracts.
4966 	 */
4967 	if (ctp->conp_ninherited != 0) {
4968 		contract_t *next;
4969 		for (next = list_head(&ctp->conp_inherited); next;
4970 		    next = list_next(&ctp->conp_inherited, next)) {
4971 			if (contract_getzuniqid(next) != zone->zone_uniqid) {
4972 				mutex_exit(&pp->p_lock);
4973 				mutex_exit(&ct->ct_lock);
4974 				mutex_exit(&zonehash_lock);
4975 				err = EINVAL;
4976 				goto out;
4977 			}
4978 		}
4979 	}
4980 	mutex_exit(&pp->p_lock);
4981 	mutex_exit(&ct->ct_lock);
4982 
4983 	status = zone_status_get(zone);
4984 	if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
4985 		/*
4986 		 * Can't join
4987 		 */
4988 		mutex_exit(&zonehash_lock);
4989 		err = EINVAL;
4990 		goto out;
4991 	}
4992 
4993 	/*
4994 	 * Make sure new priv set is within the permitted set for caller
4995 	 */
4996 	if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
4997 		mutex_exit(&zonehash_lock);
4998 		err = EPERM;
4999 		goto out;
5000 	}
5001 	/*
5002 	 * We want to momentarily drop zonehash_lock while we optimistically
5003 	 * bind curproc to the pool it should be running in.  This is safe
5004 	 * since the zone can't disappear (we have a hold on it).
5005 	 */
5006 	zone_hold(zone);
5007 	mutex_exit(&zonehash_lock);
5008 
5009 	/*
5010 	 * Grab pool_lock to keep the pools configuration from changing
5011 	 * and to stop ourselves from getting rebound to another pool
5012 	 * until we join the zone.
5013 	 */
5014 	if (pool_lock_intr() != 0) {
5015 		zone_rele(zone);
5016 		err = EINTR;
5017 		goto out;
5018 	}
5019 	ASSERT(secpolicy_pool(CRED()) == 0);
5020 	/*
5021 	 * Bind ourselves to the pool currently associated with the zone.
5022 	 */
5023 	oldpool = curproc->p_pool;
5024 	newpool = zone_pool_get(zone);
5025 	if (pool_state == POOL_ENABLED && newpool != oldpool &&
5026 	    (err = pool_do_bind(newpool, P_PID, P_MYID,
5027 	    POOL_BIND_ALL)) != 0) {
5028 		pool_unlock();
5029 		zone_rele(zone);
5030 		goto out;
5031 	}
5032 
5033 	/*
5034 	 * Grab cpu_lock now; we'll need it later when we call
5035 	 * task_join().
5036 	 */
5037 	mutex_enter(&cpu_lock);
5038 	mutex_enter(&zonehash_lock);
5039 	/*
5040 	 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
5041 	 */
5042 	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
5043 		/*
5044 		 * Can't join anymore.
5045 		 */
5046 		mutex_exit(&zonehash_lock);
5047 		mutex_exit(&cpu_lock);
5048 		if (pool_state == POOL_ENABLED &&
5049 		    newpool != oldpool)
5050 			(void) pool_do_bind(oldpool, P_PID, P_MYID,
5051 			    POOL_BIND_ALL);
5052 		pool_unlock();
5053 		zone_rele(zone);
5054 		err = EINVAL;
5055 		goto out;
5056 	}
5057 
5058 	/*
5059 	 * a_lock must be held while transfering locked memory and swap
5060 	 * reservation from the global zone to the non global zone because
5061 	 * asynchronous faults on the processes' address space can lock
5062 	 * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
5063 	 * segments respectively.
5064 	 */
5065 	AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER);
5066 	swap = as_swresv();
5067 	mutex_enter(&pp->p_lock);
5068 	zone_proj0 = zone->zone_zsched->p_task->tk_proj;
5069 	/* verify that we do not exceed and task or lwp limits */
5070 	mutex_enter(&zone->zone_nlwps_lock);
5071 	/* add new lwps to zone and zone's proj0 */
5072 	zone_proj0->kpj_nlwps += pp->p_lwpcnt;
5073 	zone->zone_nlwps += pp->p_lwpcnt;
5074 	/* add 1 task to zone's proj0 */
5075 	zone_proj0->kpj_ntasks += 1;
5076 	mutex_exit(&zone->zone_nlwps_lock);
5077 
5078 	mutex_enter(&zone->zone_mem_lock);
5079 	zone->zone_locked_mem += pp->p_locked_mem;
5080 	zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
5081 	zone->zone_max_swap += swap;
5082 	mutex_exit(&zone->zone_mem_lock);
5083 
5084 	mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
5085 	zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
5086 	mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
5087 
5088 	/* remove lwps from proc's old zone and old project */
5089 	mutex_enter(&pp->p_zone->zone_nlwps_lock);
5090 	pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
5091 	pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
5092 	mutex_exit(&pp->p_zone->zone_nlwps_lock);
5093 
5094 	mutex_enter(&pp->p_zone->zone_mem_lock);
5095 	pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
5096 	pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
5097 	pp->p_zone->zone_max_swap -= swap;
5098 	mutex_exit(&pp->p_zone->zone_mem_lock);
5099 
5100 	mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5101 	pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
5102 	mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
5103 
5104 	mutex_exit(&pp->p_lock);
5105 	AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock);
5106 
5107 	/*
5108 	 * Joining the zone cannot fail from now on.
5109 	 *
5110 	 * This means that a lot of the following code can be commonized and
5111 	 * shared with zsched().
5112 	 */
5113 
5114 	/*
5115 	 * Reset the encapsulating process contract's zone.
5116 	 */
5117 	ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
5118 	contract_setzuniqid(ct, zone->zone_uniqid);
5119 
5120 	/*
5121 	 * Create a new task and associate the process with the project keyed
5122 	 * by (projid,zoneid).
5123 	 *
5124 	 * We might as well be in project 0; the global zone's projid doesn't
5125 	 * make much sense in a zone anyhow.
5126 	 *
5127 	 * This also increments zone_ntasks, and returns with p_lock held.
5128 	 */
5129 	tk = task_create(0, zone);
5130 	oldtk = task_join(tk, 0);
5131 	mutex_exit(&cpu_lock);
5132 
5133 	pp->p_flag |= SZONETOP;
5134 	pp->p_zone = zone;
5135 
5136 	/*
5137 	 * call RCTLOP_SET functions on this proc
5138 	 */
5139 	e.rcep_p.zone = zone;
5140 	e.rcep_t = RCENTITY_ZONE;
5141 	(void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
5142 	    RCD_CALLBACK);
5143 	mutex_exit(&pp->p_lock);
5144 
5145 	/*
5146 	 * We don't need to hold any of zsched's locks here; not only do we know
5147 	 * the process and zone aren't going away, we know its session isn't
5148 	 * changing either.
5149 	 *
5150 	 * By joining zsched's session here, we mimic the behavior in the
5151 	 * global zone of init's sid being the pid of sched.  We extend this
5152 	 * to all zlogin-like zone_enter()'ing processes as well.
5153 	 */
5154 	mutex_enter(&pidlock);
5155 	sp = zone->zone_zsched->p_sessp;
5156 	sess_hold(zone->zone_zsched);
5157 	mutex_enter(&pp->p_lock);
5158 	pgexit(pp);
5159 	sess_rele(pp->p_sessp, B_TRUE);
5160 	pp->p_sessp = sp;
5161 	pgjoin(pp, zone->zone_zsched->p_pidp);
5162 
5163 	/*
5164 	 * If any threads are scheduled to be placed on zone wait queue they
5165 	 * should abandon the idea since the wait queue is changing.
5166 	 * We need to be holding pidlock & p_lock to do this.
5167 	 */
5168 	if ((t = pp->p_tlist) != NULL) {
5169 		do {
5170 			thread_lock(t);
5171 			/*
5172 			 * Kick this thread so that he doesn't sit
5173 			 * on a wrong wait queue.
5174 			 */
5175 			if (ISWAITING(t))
5176 				setrun_locked(t);
5177 
5178 			if (t->t_schedflag & TS_ANYWAITQ)
5179 				t->t_schedflag &= ~ TS_ANYWAITQ;
5180 
5181 			thread_unlock(t);
5182 		} while ((t = t->t_forw) != pp->p_tlist);
5183 	}
5184 
5185 	/*
5186 	 * If there is a default scheduling class for the zone and it is not
5187 	 * the class we are currently in, change all of the threads in the
5188 	 * process to the new class.  We need to be holding pidlock & p_lock
5189 	 * when we call parmsset so this is a good place to do it.
5190 	 */
5191 	if (zone->zone_defaultcid > 0 &&
5192 	    zone->zone_defaultcid != curthread->t_cid) {
5193 		pcparms_t pcparms;
5194 
5195 		pcparms.pc_cid = zone->zone_defaultcid;
5196 		pcparms.pc_clparms[0] = 0;
5197 
5198 		/*
5199 		 * If setting the class fails, we still want to enter the zone.
5200 		 */
5201 		if ((t = pp->p_tlist) != NULL) {
5202 			do {
5203 				(void) parmsset(&pcparms, t);
5204 			} while ((t = t->t_forw) != pp->p_tlist);
5205 		}
5206 	}
5207 
5208 	mutex_exit(&pp->p_lock);
5209 	mutex_exit(&pidlock);
5210 
5211 	mutex_exit(&zonehash_lock);
5212 	/*
5213 	 * We're firmly in the zone; let pools progress.
5214 	 */
5215 	pool_unlock();
5216 	task_rele(oldtk);
5217 	/*
5218 	 * We don't need to retain a hold on the zone since we already
5219 	 * incremented zone_ntasks, so the zone isn't going anywhere.
5220 	 */
5221 	zone_rele(zone);
5222 
5223 	/*
5224 	 * Chroot
5225 	 */
5226 	vp = zone->zone_rootvp;
5227 	zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
5228 	zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
5229 
5230 	/*
5231 	 * Change process credentials
5232 	 */
5233 	newcr = cralloc();
5234 	mutex_enter(&pp->p_crlock);
5235 	cr = pp->p_cred;
5236 	crcopy_to(cr, newcr);
5237 	crsetzone(newcr, zone);
5238 	pp->p_cred = newcr;
5239 
5240 	/*
5241 	 * Restrict all process privilege sets to zone limit
5242 	 */
5243 	priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
5244 	priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
5245 	priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
5246 	priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
5247 	mutex_exit(&pp->p_crlock);
5248 	crset(pp, newcr);
5249 
5250 	/*
5251 	 * Adjust upcount to reflect zone entry.
5252 	 */
5253 	uid = crgetruid(newcr);
5254 	mutex_enter(&pidlock);
5255 	upcount_dec(uid, GLOBAL_ZONEID);
5256 	upcount_inc(uid, zoneid);
5257 	mutex_exit(&pidlock);
5258 
5259 	/*
5260 	 * Set up core file path and content.
5261 	 */
5262 	set_core_defaults();
5263 
5264 out:
5265 	/*
5266 	 * Let the other lwps continue.
5267 	 */
5268 	mutex_enter(&pp->p_lock);
5269 	if (curthread != pp->p_agenttp)
5270 		continuelwps(pp);
5271 	mutex_exit(&pp->p_lock);
5272 
5273 	return (err != 0 ? set_errno(err) : 0);
5274 }
5275 
5276 /*
5277  * Systemcall entry point for zone_list(2).
5278  *
5279  * Processes running in a (non-global) zone only see themselves.
5280  * On labeled systems, they see all zones whose label they dominate.
5281  */
5282 static int
5283 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
5284 {
5285 	zoneid_t *zoneids;
5286 	zone_t *zone, *myzone;
5287 	uint_t user_nzones, real_nzones;
5288 	uint_t domi_nzones;
5289 	int error;
5290 
5291 	if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
5292 		return (set_errno(EFAULT));
5293 
5294 	myzone = curproc->p_zone;
5295 	if (myzone != global_zone) {
5296 		bslabel_t *mybslab;
5297 
5298 		if (!is_system_labeled()) {
5299 			/* just return current zone */
5300 			real_nzones = domi_nzones = 1;
5301 			zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
5302 			zoneids[0] = myzone->zone_id;
5303 		} else {
5304 			/* return all zones that are dominated */
5305 			mutex_enter(&zonehash_lock);
5306 			real_nzones = zonecount;
5307 			domi_nzones = 0;
5308 			if (real_nzones > 0) {
5309 				zoneids = kmem_alloc(real_nzones *
5310 				    sizeof (zoneid_t), KM_SLEEP);
5311 				mybslab = label2bslabel(myzone->zone_slabel);
5312 				for (zone = list_head(&zone_active);
5313 				    zone != NULL;
5314 				    zone = list_next(&zone_active, zone)) {
5315 					if (zone->zone_id == GLOBAL_ZONEID)
5316 						continue;
5317 					if (zone != myzone &&
5318 					    (zone->zone_flags & ZF_IS_SCRATCH))
5319 						continue;
5320 					/*
5321 					 * Note that a label always dominates
5322 					 * itself, so myzone is always included
5323 					 * in the list.
5324 					 */
5325 					if (bldominates(mybslab,
5326 					    label2bslabel(zone->zone_slabel))) {
5327 						zoneids[domi_nzones++] =
5328 						    zone->zone_id;
5329 					}
5330 				}
5331 			}
5332 			mutex_exit(&zonehash_lock);
5333 		}
5334 	} else {
5335 		mutex_enter(&zonehash_lock);
5336 		real_nzones = zonecount;
5337 		domi_nzones = 0;
5338 		if (real_nzones > 0) {
5339 			zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
5340 			    KM_SLEEP);
5341 			for (zone = list_head(&zone_active); zone != NULL;
5342 			    zone = list_next(&zone_active, zone))
5343 				zoneids[domi_nzones++] = zone->zone_id;
5344 			ASSERT(domi_nzones == real_nzones);
5345 		}
5346 		mutex_exit(&zonehash_lock);
5347 	}
5348 
5349 	/*
5350 	 * If user has allocated space for fewer entries than we found, then
5351 	 * return only up to his limit.  Either way, tell him exactly how many
5352 	 * we found.
5353 	 */
5354 	if (domi_nzones < user_nzones)
5355 		user_nzones = domi_nzones;
5356 	error = 0;
5357 	if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
5358 		error = EFAULT;
5359 	} else if (zoneidlist != NULL && user_nzones != 0) {
5360 		if (copyout(zoneids, zoneidlist,
5361 		    user_nzones * sizeof (zoneid_t)) != 0)
5362 			error = EFAULT;
5363 	}
5364 
5365 	if (real_nzones > 0)
5366 		kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
5367 
5368 	if (error != 0)
5369 		return (set_errno(error));
5370 	else
5371 		return (0);
5372 }
5373 
5374 /*
5375  * Systemcall entry point for zone_lookup(2).
5376  *
5377  * Non-global zones are only able to see themselves and (on labeled systems)
5378  * the zones they dominate.
5379  */
5380 static zoneid_t
5381 zone_lookup(const char *zone_name)
5382 {
5383 	char *kname;
5384 	zone_t *zone;
5385 	zoneid_t zoneid;
5386 	int err;
5387 
5388 	if (zone_name == NULL) {
5389 		/* return caller's zone id */
5390 		return (getzoneid());
5391 	}
5392 
5393 	kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
5394 	if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
5395 		kmem_free(kname, ZONENAME_MAX);
5396 		return (set_errno(err));
5397 	}
5398 
5399 	mutex_enter(&zonehash_lock);
5400 	zone = zone_find_all_by_name(kname);
5401 	kmem_free(kname, ZONENAME_MAX);
5402 	/*
5403 	 * In a non-global zone, can only lookup global and own name.
5404 	 * In Trusted Extensions zone label dominance rules apply.
5405 	 */
5406 	if (zone == NULL ||
5407 	    zone_status_get(zone) < ZONE_IS_READY ||
5408 	    !zone_list_access(zone)) {
5409 		mutex_exit(&zonehash_lock);
5410 		return (set_errno(EINVAL));
5411 	} else {
5412 		zoneid = zone->zone_id;
5413 		mutex_exit(&zonehash_lock);
5414 		return (zoneid);
5415 	}
5416 }
5417 
5418 static int
5419 zone_version(int *version_arg)
5420 {
5421 	int version = ZONE_SYSCALL_API_VERSION;
5422 
5423 	if (copyout(&version, version_arg, sizeof (int)) != 0)
5424 		return (set_errno(EFAULT));
5425 	return (0);
5426 }
5427 
5428 /* ARGSUSED */
5429 long
5430 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
5431 {
5432 	zone_def zs;
5433 
5434 	switch (cmd) {
5435 	case ZONE_CREATE:
5436 		if (get_udatamodel() == DATAMODEL_NATIVE) {
5437 			if (copyin(arg1, &zs, sizeof (zone_def))) {
5438 				return (set_errno(EFAULT));
5439 			}
5440 		} else {
5441 #ifdef _SYSCALL32_IMPL
5442 			zone_def32 zs32;
5443 
5444 			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
5445 				return (set_errno(EFAULT));
5446 			}
5447 			zs.zone_name =
5448 			    (const char *)(unsigned long)zs32.zone_name;
5449 			zs.zone_root =
5450 			    (const char *)(unsigned long)zs32.zone_root;
5451 			zs.zone_privs =
5452 			    (const struct priv_set *)
5453 			    (unsigned long)zs32.zone_privs;
5454 			zs.zone_privssz = zs32.zone_privssz;
5455 			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
5456 			zs.rctlbufsz = zs32.rctlbufsz;
5457 			zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
5458 			zs.zfsbufsz = zs32.zfsbufsz;
5459 			zs.extended_error =
5460 			    (int *)(unsigned long)zs32.extended_error;
5461 			zs.match = zs32.match;
5462 			zs.doi = zs32.doi;
5463 			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
5464 			zs.flags = zs32.flags;
5465 #else
5466 			panic("get_udatamodel() returned bogus result\n");
5467 #endif
5468 		}
5469 
5470 		return (zone_create(zs.zone_name, zs.zone_root,
5471 		    zs.zone_privs, zs.zone_privssz,
5472 		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,
5473 		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,
5474 		    zs.extended_error, zs.match, zs.doi,
5475 		    zs.label, zs.flags));
5476 	case ZONE_BOOT:
5477 		return (zone_boot((zoneid_t)(uintptr_t)arg1));
5478 	case ZONE_DESTROY:
5479 		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
5480 	case ZONE_GETATTR:
5481 		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
5482 		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
5483 	case ZONE_SETATTR:
5484 		return (zone_setattr((zoneid_t)(uintptr_t)arg1,
5485 		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
5486 	case ZONE_ENTER:
5487 		return (zone_enter((zoneid_t)(uintptr_t)arg1));
5488 	case ZONE_LIST:
5489 		return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
5490 	case ZONE_SHUTDOWN:
5491 		return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
5492 	case ZONE_LOOKUP:
5493 		return (zone_lookup((const char *)arg1));
5494 	case ZONE_VERSION:
5495 		return (zone_version((int *)arg1));
5496 	case ZONE_ADD_DATALINK:
5497 		return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
5498 		    (char *)arg2));
5499 	case ZONE_DEL_DATALINK:
5500 		return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
5501 		    (char *)arg2));
5502 	case ZONE_CHECK_DATALINK:
5503 		return (zone_check_datalink((zoneid_t *)arg1, (char *)arg2));
5504 	case ZONE_LIST_DATALINK:
5505 		return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
5506 		    (int *)arg2, (char *)arg3));
5507 	default:
5508 		return (set_errno(EINVAL));
5509 	}
5510 }
5511 
/*
 * Argument bundle handed from zone_kadmin() to the kernel thread running
 * zone_ki_call_zoneadmd().  It is allocated by zone_kadmin() and freed by
 * zone_ki_call_zoneadmd() once its contents have been copied.
 */
struct zarg {
	/* Zone to shut down; zone_kadmin() takes a hold, the thread drops it */
	zone_t *zone;
	/* Command (cmd, uniqid, locale, bootbuf) sent to zoneadmd's door */
	zone_cmd_arg_t arg;
};
5516 
5517 static int
5518 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
5519 {
5520 	char *buf;
5521 	size_t buflen;
5522 	int error;
5523 
5524 	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
5525 	buf = kmem_alloc(buflen, KM_SLEEP);
5526 	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
5527 	error = door_ki_open(buf, doorp);
5528 	kmem_free(buf, buflen);
5529 	return (error);
5530 }
5531 
5532 static void
5533 zone_release_door(door_handle_t *doorp)
5534 {
5535 	door_ki_rele(*doorp);
5536 	*doorp = NULL;
5537 }
5538 
5539 static void
5540 zone_ki_call_zoneadmd(struct zarg *zargp)
5541 {
5542 	door_handle_t door = NULL;
5543 	door_arg_t darg, save_arg;
5544 	char *zone_name;
5545 	size_t zone_namelen;
5546 	zoneid_t zoneid;
5547 	zone_t *zone;
5548 	zone_cmd_arg_t arg;
5549 	uint64_t uniqid;
5550 	size_t size;
5551 	int error;
5552 	int retry;
5553 
5554 	zone = zargp->zone;
5555 	arg = zargp->arg;
5556 	kmem_free(zargp, sizeof (*zargp));
5557 
5558 	zone_namelen = strlen(zone->zone_name) + 1;
5559 	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
5560 	bcopy(zone->zone_name, zone_name, zone_namelen);
5561 	zoneid = zone->zone_id;
5562 	uniqid = zone->zone_uniqid;
5563 	/*
5564 	 * zoneadmd may be down, but at least we can empty out the zone.
5565 	 * We can ignore the return value of zone_empty() since we're called
5566 	 * from a kernel thread and know we won't be delivered any signals.
5567 	 */
5568 	ASSERT(curproc == &p0);
5569 	(void) zone_empty(zone);
5570 	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
5571 	zone_rele(zone);
5572 
5573 	size = sizeof (arg);
5574 	darg.rbuf = (char *)&arg;
5575 	darg.data_ptr = (char *)&arg;
5576 	darg.rsize = size;
5577 	darg.data_size = size;
5578 	darg.desc_ptr = NULL;
5579 	darg.desc_num = 0;
5580 
5581 	save_arg = darg;
5582 	/*
5583 	 * Since we're not holding a reference to the zone, any number of
5584 	 * things can go wrong, including the zone disappearing before we get a
5585 	 * chance to talk to zoneadmd.
5586 	 */
5587 	for (retry = 0; /* forever */; retry++) {
5588 		if (door == NULL &&
5589 		    (error = zone_lookup_door(zone_name, &door)) != 0) {
5590 			goto next;
5591 		}
5592 		ASSERT(door != NULL);
5593 
5594 		if ((error = door_ki_upcall(door, &darg)) == 0) {
5595 			break;
5596 		}
5597 		switch (error) {
5598 		case EINTR:
5599 			/* FALLTHROUGH */
5600 		case EAGAIN:	/* process may be forking */
5601 			/*
5602 			 * Back off for a bit
5603 			 */
5604 			break;
5605 		case EBADF:
5606 			zone_release_door(&door);
5607 			if (zone_lookup_door(zone_name, &door) != 0) {
5608 				/*
5609 				 * zoneadmd may be dead, but it may come back to
5610 				 * life later.
5611 				 */
5612 				break;
5613 			}
5614 			break;
5615 		default:
5616 			cmn_err(CE_WARN,
5617 			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
5618 			    error);
5619 			goto out;
5620 		}
5621 next:
5622 		/*
5623 		 * If this isn't the same zone_t that we originally had in mind,
5624 		 * then this is the same as if two kadmin requests come in at
5625 		 * the same time: the first one wins.  This means we lose, so we
5626 		 * bail.
5627 		 */
5628 		if ((zone = zone_find_by_id(zoneid)) == NULL) {
5629 			/*
5630 			 * Problem is solved.
5631 			 */
5632 			break;
5633 		}
5634 		if (zone->zone_uniqid != uniqid) {
5635 			/*
5636 			 * zoneid recycled
5637 			 */
5638 			zone_rele(zone);
5639 			break;
5640 		}
5641 		/*
5642 		 * We could zone_status_timedwait(), but there doesn't seem to
5643 		 * be much point in doing that (plus, it would mean that
5644 		 * zone_free() isn't called until this thread exits).
5645 		 */
5646 		zone_rele(zone);
5647 		delay(hz);
5648 		darg = save_arg;
5649 	}
5650 out:
5651 	if (door != NULL) {
5652 		zone_release_door(&door);
5653 	}
5654 	kmem_free(zone_name, zone_namelen);
5655 	thread_exit();
5656 }
5657 
5658 /*
5659  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
5660  * kadmin().  The caller is a process in the zone.
5661  *
5662  * In order to shutdown the zone, we will hand off control to zoneadmd
5663  * (running in the global zone) via a door.  We do a half-hearted job at
5664  * killing all processes in the zone, create a kernel thread to contact
5665  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
5666  * a form of generation number used to let zoneadmd (as well as
5667  * zone_destroy()) know exactly which zone they're re talking about.
5668  */
5669 int
5670 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
5671 {
5672 	struct zarg *zargp;
5673 	zone_cmd_t zcmd;
5674 	zone_t *zone;
5675 
5676 	zone = curproc->p_zone;
5677 	ASSERT(getzoneid() != GLOBAL_ZONEID);
5678 
5679 	switch (cmd) {
5680 	case A_SHUTDOWN:
5681 		switch (fcn) {
5682 		case AD_HALT:
5683 		case AD_POWEROFF:
5684 			zcmd = Z_HALT;
5685 			break;
5686 		case AD_BOOT:
5687 			zcmd = Z_REBOOT;
5688 			break;
5689 		case AD_IBOOT:
5690 		case AD_SBOOT:
5691 		case AD_SIBOOT:
5692 		case AD_NOSYNC:
5693 			return (ENOTSUP);
5694 		default:
5695 			return (EINVAL);
5696 		}
5697 		break;
5698 	case A_REBOOT:
5699 		zcmd = Z_REBOOT;
5700 		break;
5701 	case A_FTRACE:
5702 	case A_REMOUNT:
5703 	case A_FREEZE:
5704 	case A_DUMP:
5705 		return (ENOTSUP);
5706 	default:
5707 		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
5708 		return (EINVAL);
5709 	}
5710 
5711 	if (secpolicy_zone_admin(credp, B_FALSE))
5712 		return (EPERM);
5713 	mutex_enter(&zone_status_lock);
5714 
5715 	/*
5716 	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
5717 	 * is in the zone.
5718 	 */
5719 	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
5720 	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
5721 		/*
5722 		 * This zone is already on its way down.
5723 		 */
5724 		mutex_exit(&zone_status_lock);
5725 		return (0);
5726 	}
5727 	/*
5728 	 * Prevent future zone_enter()s
5729 	 */
5730 	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
5731 	mutex_exit(&zone_status_lock);
5732 
5733 	/*
5734 	 * Kill everyone now and call zoneadmd later.
5735 	 * zone_ki_call_zoneadmd() will do a more thorough job of this
5736 	 * later.
5737 	 */
5738 	killall(zone->zone_id);
5739 	/*
5740 	 * Now, create the thread to contact zoneadmd and do the rest of the
5741 	 * work.  This thread can't be created in our zone otherwise
5742 	 * zone_destroy() would deadlock.
5743 	 */
5744 	zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
5745 	zargp->arg.cmd = zcmd;
5746 	zargp->arg.uniqid = zone->zone_uniqid;
5747 	zargp->zone = zone;
5748 	(void) strcpy(zargp->arg.locale, "C");
5749 	/* mdep was already copied in for us by uadmin */
5750 	if (mdep != NULL)
5751 		(void) strlcpy(zargp->arg.bootbuf, mdep,
5752 		    sizeof (zargp->arg.bootbuf));
5753 	zone_hold(zone);
5754 
5755 	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
5756 	    TS_RUN, minclsyspri);
5757 	exit(CLD_EXITED, 0);
5758 
5759 	return (EINVAL);
5760 }
5761 
5762 /*
5763  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
5764  * status to ZONE_IS_SHUTTING_DOWN.
5765  */
5766 void
5767 zone_shutdown_global(void)
5768 {
5769 	ASSERT(curproc->p_zone == global_zone);
5770 
5771 	mutex_enter(&zone_status_lock);
5772 	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
5773 	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
5774 	mutex_exit(&zone_status_lock);
5775 }
5776 
5777 /*
5778  * Returns true if the named dataset is visible in the current zone.
5779  * The 'write' parameter is set to 1 if the dataset is also writable.
5780  */
5781 int
5782 zone_dataset_visible(const char *dataset, int *write)
5783 {
5784 	zone_dataset_t *zd;
5785 	size_t len;
5786 	zone_t *zone = curproc->p_zone;
5787 
5788 	if (dataset[0] == '\0')
5789 		return (0);
5790 
5791 	/*
5792 	 * Walk the list once, looking for datasets which match exactly, or
5793 	 * specify a dataset underneath an exported dataset.  If found, return
5794 	 * true and note that it is writable.
5795 	 */
5796 	for (zd = list_head(&zone->zone_datasets); zd != NULL;
5797 	    zd = list_next(&zone->zone_datasets, zd)) {
5798 
5799 		len = strlen(zd->zd_dataset);
5800 		if (strlen(dataset) >= len &&
5801 		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
5802 		    (dataset[len] == '\0' || dataset[len] == '/' ||
5803 		    dataset[len] == '@')) {
5804 			if (write)
5805 				*write = 1;
5806 			return (1);
5807 		}
5808 	}
5809 
5810 	/*
5811 	 * Walk the list a second time, searching for datasets which are parents
5812 	 * of exported datasets.  These should be visible, but read-only.
5813 	 *
5814 	 * Note that we also have to support forms such as 'pool/dataset/', with
5815 	 * a trailing slash.
5816 	 */
5817 	for (zd = list_head(&zone->zone_datasets); zd != NULL;
5818 	    zd = list_next(&zone->zone_datasets, zd)) {
5819 
5820 		len = strlen(dataset);
5821 		if (dataset[len - 1] == '/')
5822 			len--;	/* Ignore trailing slash */
5823 		if (len < strlen(zd->zd_dataset) &&
5824 		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
5825 		    zd->zd_dataset[len] == '/') {
5826 			if (write)
5827 				*write = 0;
5828 			return (1);
5829 		}
5830 	}
5831 
5832 	return (0);
5833 }
5834 
5835 /*
5836  * zone_find_by_any_path() -
5837  *
5838  * kernel-private routine similar to zone_find_by_path(), but which
5839  * effectively compares against zone paths rather than zonerootpath
5840  * (i.e., the last component of zonerootpaths, which should be "root/",
5841  * are not compared.)  This is done in order to accurately identify all
5842  * paths, whether zone-visible or not, including those which are parallel
5843  * to /root/, such as /dev/, /home/, etc...
5844  *
5845  * If the specified path does not fall under any zone path then global
5846  * zone is returned.
5847  *
5848  * The treat_abs parameter indicates whether the path should be treated as
5849  * an absolute path although it does not begin with "/".  (This supports
5850  * nfs mount syntax such as host:any/path.)
5851  *
5852  * The caller is responsible for zone_rele of the returned zone.
5853  */
5854 zone_t *
5855 zone_find_by_any_path(const char *path, boolean_t treat_abs)
5856 {
5857 	zone_t *zone;
5858 	int path_offset = 0;
5859 
5860 	if (path == NULL) {
5861 		zone_hold(global_zone);
5862 		return (global_zone);
5863 	}
5864 
5865 	if (*path != '/') {
5866 		ASSERT(treat_abs);
5867 		path_offset = 1;
5868 	}
5869 
5870 	mutex_enter(&zonehash_lock);
5871 	for (zone = list_head(&zone_active); zone != NULL;
5872 	    zone = list_next(&zone_active, zone)) {
5873 		char	*c;
5874 		size_t	pathlen;
5875 		char *rootpath_start;
5876 
5877 		if (zone == global_zone)	/* skip global zone */
5878 			continue;
5879 
5880 		/* scan backwards to find start of last component */
5881 		c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
5882 		do {
5883 			c--;
5884 		} while (*c != '/');
5885 
5886 		pathlen = c - zone->zone_rootpath + 1 - path_offset;
5887 		rootpath_start = (zone->zone_rootpath + path_offset);
5888 		if (strncmp(path, rootpath_start, pathlen) == 0)
5889 			break;
5890 	}
5891 	if (zone == NULL)
5892 		zone = global_zone;
5893 	zone_hold(zone);
5894 	mutex_exit(&zonehash_lock);
5895 	return (zone);
5896 }
5897 
/*
 * List of data link names which are accessible from the zone.
 *
 * Each node is one entry in the singly linked list headed by a zone's
 * zone_dl_list.  The functions in this file that walk or modify the list
 * do so while holding that zone's zone_lock.
 */
struct dlnamelist {
	/* Link name, filled in by copyinstr() with a LIFNAMSIZ bound */
	char			dlnl_name[LIFNAMSIZ];
	/* Next entry, or NULL at the end of the list */
	struct dlnamelist	*dlnl_next;
};
5903 
5904 
5905 /*
5906  * Check whether the datalink name (dlname) itself is present.
5907  * Return true if found.
5908  */
5909 static boolean_t
5910 zone_dlname(zone_t *zone, char *dlname)
5911 {
5912 	struct dlnamelist *dlnl;
5913 	boolean_t found = B_FALSE;
5914 
5915 	mutex_enter(&zone->zone_lock);
5916 	for (dlnl = zone->zone_dl_list; dlnl != NULL; dlnl = dlnl->dlnl_next) {
5917 		if (strncmp(dlnl->dlnl_name, dlname, LIFNAMSIZ) == 0) {
5918 			found = B_TRUE;
5919 			break;
5920 		}
5921 	}
5922 	mutex_exit(&zone->zone_lock);
5923 	return (found);
5924 }
5925 
5926 /*
5927  * Add an data link name for the zone. Does not check for duplicates.
5928  */
5929 static int
5930 zone_add_datalink(zoneid_t zoneid, char *dlname)
5931 {
5932 	struct dlnamelist *dlnl;
5933 	zone_t *zone;
5934 	zone_t *thiszone;
5935 	int err;
5936 
5937 	dlnl = kmem_zalloc(sizeof (struct dlnamelist), KM_SLEEP);
5938 	if ((err = copyinstr(dlname, dlnl->dlnl_name, LIFNAMSIZ, NULL)) != 0) {
5939 		kmem_free(dlnl, sizeof (struct dlnamelist));
5940 		return (set_errno(err));
5941 	}
5942 
5943 	thiszone = zone_find_by_id(zoneid);
5944 	if (thiszone == NULL) {
5945 		kmem_free(dlnl, sizeof (struct dlnamelist));
5946 		return (set_errno(ENXIO));
5947 	}
5948 
5949 	/*
5950 	 * Verify that the datalink name isn't already used by a different
5951 	 * zone while allowing duplicate entries for the same zone (e.g. due
5952 	 * to both using IPv4 and IPv6 on an interface)
5953 	 */
5954 	mutex_enter(&zonehash_lock);
5955 	for (zone = list_head(&zone_active); zone != NULL;
5956 	    zone = list_next(&zone_active, zone)) {
5957 		if (zone->zone_id == zoneid)
5958 			continue;
5959 
5960 		if (zone_dlname(zone, dlnl->dlnl_name)) {
5961 			mutex_exit(&zonehash_lock);
5962 			zone_rele(thiszone);
5963 			kmem_free(dlnl, sizeof (struct dlnamelist));
5964 			return (set_errno(EPERM));
5965 		}
5966 	}
5967 	mutex_enter(&thiszone->zone_lock);
5968 	dlnl->dlnl_next = thiszone->zone_dl_list;
5969 	thiszone->zone_dl_list = dlnl;
5970 	mutex_exit(&thiszone->zone_lock);
5971 	mutex_exit(&zonehash_lock);
5972 	zone_rele(thiszone);
5973 	return (0);
5974 }
5975 
5976 static int
5977 zone_remove_datalink(zoneid_t zoneid, char *dlname)
5978 {
5979 	struct dlnamelist *dlnl, *odlnl, **dlnlp;
5980 	zone_t *zone;
5981 	int err;
5982 
5983 	dlnl = kmem_zalloc(sizeof (struct dlnamelist), KM_SLEEP);
5984 	if ((err = copyinstr(dlname, dlnl->dlnl_name, LIFNAMSIZ, NULL)) != 0) {
5985 		kmem_free(dlnl, sizeof (struct dlnamelist));
5986 		return (set_errno(err));
5987 	}
5988 	zone = zone_find_by_id(zoneid);
5989 	if (zone == NULL) {
5990 		kmem_free(dlnl, sizeof (struct dlnamelist));
5991 		return (set_errno(EINVAL));
5992 	}
5993 
5994 	mutex_enter(&zone->zone_lock);
5995 	/* Look for match */
5996 	dlnlp = &zone->zone_dl_list;
5997 	while (*dlnlp != NULL) {
5998 		if (strncmp(dlnl->dlnl_name, (*dlnlp)->dlnl_name,
5999 		    LIFNAMSIZ) == 0)
6000 			goto found;
6001 		dlnlp = &((*dlnlp)->dlnl_next);
6002 	}
6003 	mutex_exit(&zone->zone_lock);
6004 	zone_rele(zone);
6005 	kmem_free(dlnl, sizeof (struct dlnamelist));
6006 	return (set_errno(ENXIO));
6007 
6008 found:
6009 	odlnl = *dlnlp;
6010 	*dlnlp = (*dlnlp)->dlnl_next;
6011 	kmem_free(odlnl, sizeof (struct dlnamelist));
6012 
6013 	mutex_exit(&zone->zone_lock);
6014 	zone_rele(zone);
6015 	kmem_free(dlnl, sizeof (struct dlnamelist));
6016 	return (0);
6017 }
6018 
6019 /*
6020  * Using the zoneidp as ALL_ZONES, we can lookup which zone is using datalink
6021  * name (dlname); otherwise we just check if the specified zoneidp has access
6022  * to the datalink name.
6023  */
6024 static int
6025 zone_check_datalink(zoneid_t *zoneidp, char *dlname)
6026 {
6027 	zoneid_t id;
6028 	char *dln;
6029 	zone_t *zone;
6030 	int err = 0;
6031 	boolean_t allzones = B_FALSE;
6032 
6033 	if (copyin(zoneidp, &id, sizeof (id)) != 0) {
6034 		return (set_errno(EFAULT));
6035 	}
6036 	dln = kmem_zalloc(LIFNAMSIZ, KM_SLEEP);
6037 	if ((err = copyinstr(dlname, dln, LIFNAMSIZ, NULL)) != 0) {
6038 		kmem_free(dln, LIFNAMSIZ);
6039 		return (set_errno(err));
6040 	}
6041 
6042 	if (id == ALL_ZONES)
6043 		allzones = B_TRUE;
6044 
6045 	/*
6046 	 * Check whether datalink name is already used.
6047 	 */
6048 	mutex_enter(&zonehash_lock);
6049 	for (zone = list_head(&zone_active); zone != NULL;
6050 	    zone = list_next(&zone_active, zone)) {
6051 		if (allzones || (id == zone->zone_id)) {
6052 			if (!zone_dlname(zone, dln))
6053 				continue;
6054 			if (allzones)
6055 				err = copyout(&zone->zone_id, zoneidp,
6056 				    sizeof (*zoneidp));
6057 
6058 			mutex_exit(&zonehash_lock);
6059 			kmem_free(dln, LIFNAMSIZ);
6060 			return (err ? set_errno(EFAULT) : 0);
6061 		}
6062 	}
6063 
6064 	/* datalink name is not found in any active zone. */
6065 	mutex_exit(&zonehash_lock);
6066 	kmem_free(dln, LIFNAMSIZ);
6067 	return (set_errno(ENXIO));
6068 }
6069 
6070 /*
6071  * Get the names of the datalinks assigned to a zone.
6072  * Here *nump is the number of datalinks, and the assumption
6073  * is that the caller will guarantee that the the supplied buffer is
6074  * big enough to hold at least #*nump datalink names, that is,
6075  * LIFNAMSIZ X *nump
6076  * On return, *nump will be the "new" number of datalinks, if it
6077  * ever changed.
6078  */
6079 static int
6080 zone_list_datalink(zoneid_t zoneid, int *nump, char *buf)
6081 {
6082 	int num, dlcount;
6083 	zone_t *zone;
6084 	struct dlnamelist *dlnl;
6085 	char *ptr;
6086 
6087 	if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
6088 		return (set_errno(EFAULT));
6089 
6090 	zone = zone_find_by_id(zoneid);
6091 	if (zone == NULL) {
6092 		return (set_errno(ENXIO));
6093 	}
6094 
6095 	num = 0;
6096 	mutex_enter(&zone->zone_lock);
6097 	ptr = buf;
6098 	for (dlnl = zone->zone_dl_list; dlnl != NULL; dlnl = dlnl->dlnl_next) {
6099 		/*
6100 		 * If the list changed and the new number is bigger
6101 		 * than what the caller supplied, just count, don't
6102 		 * do copyout
6103 		 */
6104 		if (++num > dlcount)
6105 			continue;
6106 		if (copyout(dlnl->dlnl_name, ptr, LIFNAMSIZ) != 0) {
6107 			mutex_exit(&zone->zone_lock);
6108 			zone_rele(zone);
6109 			return (set_errno(EFAULT));
6110 		}
6111 		ptr += LIFNAMSIZ;
6112 	}
6113 	mutex_exit(&zone->zone_lock);
6114 	zone_rele(zone);
6115 
6116 	/* Increased or decreased, caller should be notified. */
6117 	if (num != dlcount) {
6118 		if (copyout(&num, nump, sizeof (num)) != 0) {
6119 			return (set_errno(EFAULT));
6120 		}
6121 	}
6122 	return (0);
6123 }
6124 
6125 /*
6126  * Public interface for looking up a zone by zoneid. It's a customized version
6127  * for netstack_zone_create(). It can only be called from the zsd create
6128  * callbacks, since it doesn't have reference on the zone structure hence if
6129  * it is called elsewhere the zone could disappear after the zonehash_lock
6130  * is dropped.
6131  *
6132  * Furthermore it
6133  * 1. Doesn't check the status of the zone.
6134  * 2. It will be called even before zone_init is called, in that case the
6135  *    address of zone0 is returned directly, and netstack_zone_create()
6136  *    will only assign a value to zone0.zone_netstack, won't break anything.
6137  * 3. Returns without the zone being held.
6138  */
6139 zone_t *
6140 zone_find_by_id_nolock(zoneid_t zoneid)
6141 {
6142 	zone_t *zone;
6143 
6144 	mutex_enter(&zonehash_lock);
6145 	if (zonehashbyid == NULL)
6146 		zone = &zone0;
6147 	else
6148 		zone = zone_find_all_by_id(zoneid);
6149 	mutex_exit(&zonehash_lock);
6150 	return (zone);
6151 }
6152 
6153 /*
6154  * Walk the datalinks for a given zone
6155  */
6156 int
6157 zone_datalink_walk(zoneid_t zoneid, int (*cb)(const char *, void *), void *data)
6158 {
6159 	zone_t *zone;
6160 	struct dlnamelist *dlnl;
6161 	int ret = 0;
6162 
6163 	if ((zone = zone_find_by_id(zoneid)) == NULL)
6164 		return (ENOENT);
6165 
6166 	mutex_enter(&zone->zone_lock);
6167 	for (dlnl = zone->zone_dl_list; dlnl != NULL; dlnl = dlnl->dlnl_next) {
6168 		if ((ret = (*cb)(dlnl->dlnl_name, data)) != 0)
6169 			break;
6170 	}
6171 	mutex_exit(&zone->zone_lock);
6172 	zone_rele(zone);
6173 	return (ret);
6174 }
6175