xref: /titanic_41/usr/src/uts/common/os/zone.c (revision 058561cbaa119a6f2659bc27ef343e1b47266bb2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Zones
31  *
32  *   A zone is a named collection of processes, namespace constraints,
33  *   and other system resources which comprise a secure and manageable
34  *   application containment facility.
35  *
36  *   Zones (represented by the reference counted zone_t) are tracked in
37  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
38  *   (zoneid_t) are used to track zone association.  Zone IDs are
39  *   dynamically generated when the zone is created; if a persistent
40  *   identifier is needed (core files, accounting logs, audit trail,
41  *   etc.), the zone name should be used.
42  *
43  *
44  *   Global Zone:
45  *
46  *   The global zone (zoneid 0) is automatically associated with all
47  *   system resources that have not been bound to a user-created zone.
48  *   This means that even systems where zones are not in active use
49  *   have a global zone, and all processes, mounts, etc. are
50  *   associated with that zone.  The global zone is generally
51  *   unconstrained in terms of privileges and access, though the usual
52  *   credential and privilege based restrictions apply.
53  *
54  *
55  *   Zone States:
56  *
57  *   The states a zone may be in, and the transitions between them, are
58  *   as follows:
59  *
60  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
61  *   initialized zone is added to the list of active zones on the system but
62  *   isn't accessible.
63  *
64  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
65  *   ready.  The zone is made visible after the ZSD constructor callbacks are
66  *   executed.  A zone remains in this state until it transitions into
67  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
68  *
69  *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
70  *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
71  *   state.
72  *
73  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
74  *   successfully started init.   A zone remains in this state until
75  *   zone_shutdown() is called.
76  *
77  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
78  *   killing all processes running in the zone. The zone remains
79  *   in this state until there are no more user processes running in the zone.
80  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
81  *   Since zone_shutdown() is restartable, it may be called successfully
82  *   multiple times for the same zone_t.  Setting of the zone's state to
83  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
84  *   the zone's status without worrying about it being a moving target.
85  *
86  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
87  *   are no more user processes in the zone.  The zone remains in this
88  *   state until there are no more kernel threads associated with the
89  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
90  *   fail.
91  *
92  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
93  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
94  *   join the zone or create kernel threads therein.
95  *
96  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
97  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
98  *   return NULL from now on.
99  *
100  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
101  *   processes or threads doing work on behalf of the zone.  The zone is
102  *   removed from the list of active zones.  zone_destroy() returns, and
103  *   the zone can be recreated.
104  *
105  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
106  *   callbacks are executed, and all memory associated with the zone is
107  *   freed.
108  *
109  *   Threads can wait for the zone to enter a requested state by using
110  *   zone_status_wait() or zone_status_timedwait() with the desired
111  *   state passed in as an argument.  Zone state transitions are
112  *   uni-directional; it is not possible to move back to an earlier state.
113  *
114  *
115  *   Zone-Specific Data:
116  *
117  *   Subsystems needing to maintain zone-specific data can store that
118  *   data using the ZSD mechanism.  This provides a zone-specific data
119  *   store, similar to thread-specific data (see pthread_getspecific(3C)
120  *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
121  *   to register callbacks to be invoked when a zone is created, shut
122  *   down, or destroyed.  This can be used to initialize zone-specific
123  *   data for new zones and to clean up when zones go away.
124  *
125  *
126  *   Data Structures:
127  *
128  *   The per-zone structure (zone_t) is reference counted, and freed
129  *   when all references are released.  zone_hold and zone_rele can be
130  *   used to adjust the reference count.  In addition, reference counts
131  *   associated with the cred_t structure are tracked separately using
132  *   zone_cred_hold and zone_cred_rele.
133  *
134  *   Pointers to active zone_t's are stored in two hash tables; one
135  *   for searching by id, the other for searching by name.  Lookups
136  *   can be performed on either basis, using zone_find_by_id and
137  *   zone_find_by_name.  Both return zone_t pointers with the zone
138  *   held, so zone_rele should be called when the pointer is no longer
139  *   needed.  Zones can also be searched by path; zone_find_by_path
140  *   returns the zone with which a path name is associated (global
141  *   zone if the path is not within some other zone's file system
142  *   hierarchy).  This currently requires iterating through each zone,
143  *   so it is slower than an id or name search via a hash table.
144  *
145  *
146  *   Locking:
147  *
148  *   zonehash_lock: This is a top-level global lock used to protect the
149  *       zone hash tables and lists.  Zones cannot be created or destroyed
150  *       while this lock is held.
151  *   zone_status_lock: This is a global lock protecting zone state.
152  *       Zones cannot change state while this lock is held.  It also
153  *       protects the list of kernel threads associated with a zone.
154  *   zone_lock: This is a per-zone lock used to protect several fields of
155  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
156  *       this lock means that the zone cannot go away.
157  *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
158  *	 related to the zone.max-lwps rctl.
159  *   zone_mem_lock: This is a per-zone lock used to protect the fields
160  *	 related to the zone.max-locked-memory and zone.max-swap rctls.
161  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
162  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
163  *       list (a list of zones in the ZONE_IS_DEAD state).
164  *
165  *   Ordering requirements:
166  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
167  *       	zone_lock --> zsd_key_lock --> pidlock --> p_lock
168  *
169  *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
170  *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
171  *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
172  *
173  *   Blocking memory allocations are permitted while holding any of the
174  *   zone locks.
175  *
176  *
177  *   System Call Interface:
178  *
179  *   The zone subsystem can be managed and queried from user level with
180  *   the following system calls (all subcodes of the primary "zone"
181  *   system call):
182  *   - zone_create: creates a zone with selected attributes (name,
183  *     root path, privileges, resource controls, ZFS datasets)
184  *   - zone_enter: allows the current process to enter a zone
185  *   - zone_getattr: reports attributes of a zone
186  *   - zone_setattr: set attributes of a zone
187  *   - zone_boot: set 'init' running for the zone
188  *   - zone_list: lists all zones active in the system
189  *   - zone_lookup: looks up zone id based on name
190  *   - zone_shutdown: initiates shutdown process (see states above)
191  *   - zone_destroy: completes shutdown process (see states above)
192  *
193  */
194 
195 #include <sys/priv_impl.h>
196 #include <sys/cred.h>
197 #include <c2/audit.h>
198 #include <sys/debug.h>
199 #include <sys/file.h>
200 #include <sys/kmem.h>
201 #include <sys/kstat.h>
202 #include <sys/mutex.h>
203 #include <sys/note.h>
204 #include <sys/pathname.h>
205 #include <sys/proc.h>
206 #include <sys/project.h>
207 #include <sys/sysevent.h>
208 #include <sys/task.h>
209 #include <sys/systm.h>
210 #include <sys/types.h>
211 #include <sys/utsname.h>
212 #include <sys/vnode.h>
213 #include <sys/vfs.h>
214 #include <sys/systeminfo.h>
215 #include <sys/policy.h>
216 #include <sys/cred_impl.h>
217 #include <sys/contract_impl.h>
218 #include <sys/contract/process_impl.h>
219 #include <sys/class.h>
220 #include <sys/pool.h>
221 #include <sys/pool_pset.h>
222 #include <sys/pset.h>
223 #include <sys/sysmacros.h>
224 #include <sys/callb.h>
225 #include <sys/vmparam.h>
226 #include <sys/corectl.h>
227 #include <sys/ipc_impl.h>
228 
229 #include <sys/door.h>
230 #include <sys/cpuvar.h>
231 
232 #include <sys/uadmin.h>
233 #include <sys/session.h>
234 #include <sys/cmn_err.h>
235 #include <sys/modhash.h>
236 #include <sys/sunddi.h>
237 #include <sys/nvpair.h>
238 #include <sys/rctl.h>
239 #include <sys/fss.h>
240 #include <sys/brand.h>
241 #include <sys/zone.h>
242 #include <net/if.h>
243 #include <vm/seg.h>
244 
245 /*
246  * cv used to signal that all references to the zone have been released.  This
247  * needs to be global since there may be multiple waiters, and the first to
248  * wake up will free the zone_t, hence we cannot use zone->zone_cv.
249  */
250 static kcondvar_t zone_destroy_cv;
251 /*
252  * Lock used to serialize access to zone_cv.  This could have been per-zone,
253  * but then we'd need another lock for zone_destroy_cv, and why bother?
254  */
255 static kmutex_t zone_status_lock;
256 
257 /*
258  * ZSD-related global variables.
259  */
260 static kmutex_t zsd_key_lock;	/* protects the following two */
261 /*
262  * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
263  */
264 static zone_key_t zsd_keyval = 0;
265 /*
266  * Global list of registered keys.  We use this when a new zone is created.
267  */
268 static list_t zsd_registered_keys;
269 
270 int zone_hash_size = 256;
271 static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
272 static kmutex_t zonehash_lock;
273 static uint_t zonecount;
274 static id_space_t *zoneid_space;
275 
276 /*
277  * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
278  * kernel proper runs, and which manages all other zones.
279  *
280  * Although not declared as static, the variable "zone0" should not be used
281  * except for by code that needs to reference the global zone early on in boot,
282  * before it is fully initialized.  All other consumers should use
283  * 'global_zone'.
284  */
285 zone_t zone0;
286 zone_t *global_zone = NULL;	/* Set when the global zone is initialized */
287 
288 /*
289  * List of active zones, protected by zonehash_lock.
290  */
291 static list_t zone_active;
292 
293 /*
294  * List of destroyed zones that still have outstanding cred references.
295  * Used for debugging.  Uses a separate lock to avoid lock ordering
296  * problems in zone_free.
297  */
298 static list_t zone_deathrow;
299 static kmutex_t zone_deathrow_lock;
300 
301 /* number of zones is limited by virtual interface limit in IP */
302 uint_t maxzones = 8192;
303 
304 /* Event channel to sent zone state change notifications */
305 evchan_t *zone_event_chan;
306 
307 /*
308  * This table holds the mapping from kernel zone states to
309  * states visible in the state notification API.
310  * The idea is that we only expose "obvious" states and
311  * do not expose states which are just implementation details.
312  */
313 const char  *zone_status_table[] = {
314 	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
315 	ZONE_EVENT_READY,		/* ready */
316 	ZONE_EVENT_READY,		/* booting */
317 	ZONE_EVENT_RUNNING,		/* running */
318 	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
319 	ZONE_EVENT_SHUTTING_DOWN,	/* empty */
320 	ZONE_EVENT_SHUTTING_DOWN,	/* down */
321 	ZONE_EVENT_SHUTTING_DOWN,	/* dying */
322 	ZONE_EVENT_UNINITIALIZED,	/* dead */
323 };
324 
325 /*
326  * This isn't static so lint doesn't complain.
327  */
328 rctl_hndl_t rc_zone_cpu_shares;
329 rctl_hndl_t rc_zone_locked_mem;
330 rctl_hndl_t rc_zone_max_swap;
331 rctl_hndl_t rc_zone_nlwps;
332 rctl_hndl_t rc_zone_shmmax;
333 rctl_hndl_t rc_zone_shmmni;
334 rctl_hndl_t rc_zone_semmni;
335 rctl_hndl_t rc_zone_msgmni;
336 /*
337  * Synchronization primitives used to synchronize between mounts and zone
338  * creation/destruction.
339  */
340 static int mounts_in_progress;
341 static kcondvar_t mount_cv;
342 static kmutex_t mount_lock;
343 
344 const char * const zone_default_initname = "/sbin/init";
345 static char * const zone_prefix = "/zone/";
346 static int zone_shutdown(zoneid_t zoneid);
347 static int zone_add_datalink(zoneid_t, char *);
348 static int zone_remove_datalink(zoneid_t, char *);
349 static int zone_check_datalink(zoneid_t *, char *);
350 static int zone_list_datalink(zoneid_t, int *, char *);
351 
/*
 * Version of the zone system call interface.  Increment it whenever the
 * zone syscall interfaces change: libc must keep supporting earlier API
 * versions for the sake of patching, and it asks the kernel for this number.
 *
 * History:
 *   1 - the API as originally shipped with Solaris 10.
 *   2 - zone_create takes its arguments in a structure so that more of them
 *       can be supported, and reports failures in more detail.
 *   3 - zone_create supports importing ZFS datasets into zones.
 *   4 - zone_create supports Trusted Extensions.
 *   5 - zone_boot changes; its old bootargs parameter is now set through
 *       the zone_setattr API instead.
 *   6 - zone_create gains the flag argument.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;
370 
371 /*
372  * Certain filesystems (such as NFS and autofs) need to know which zone
373  * the mount is being placed in.  Because of this, we need to be able to
374  * ensure that a zone isn't in the process of being created such that
375  * nfs_mount() thinks it is in the global zone, while by the time it
376  * gets added to the list of mounted zones, it ends up on zoneA's mount
377  * list.
378  *
379  * The following functions: block_mounts()/resume_mounts() and
380  * mount_in_progress()/mount_completed() are used by zones and the VFS
381  * layer (respectively) to synchronize zone creation and new mounts.
382  *
383  * The semantics are like a reader-reader lock such that there may
384  * either be multiple mounts (or zone creations, if that weren't
385  * serialized by zonehash_lock) in progress at the same time, but not
386  * both.
387  *
388  * We use cv's so the user can ctrl-C out of the operation if it's
389  * taking too long.
390  *
391  * The semantics are such that there is unfair bias towards the
392  * "current" operation.  This means that zone creations may starve if
393  * there is a rapid succession of new mounts coming in to the system, or
394  * there is a remote possibility that zones will be created at such a
395  * rate that new mounts will not be able to proceed.
396  */
397 /*
398  * Prevent new mounts from progressing to the point of calling
399  * VFS_MOUNT().  If there are already mounts in this "region", wait for
400  * them to complete.
401  */
402 static int
403 block_mounts(void)
404 {
405 	int retval = 0;
406 
407 	/*
408 	 * Since it may block for a long time, block_mounts() shouldn't be
409 	 * called with zonehash_lock held.
410 	 */
411 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
412 	mutex_enter(&mount_lock);
413 	while (mounts_in_progress > 0) {
414 		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
415 			goto signaled;
416 	}
417 	/*
418 	 * A negative value of mounts_in_progress indicates that mounts
419 	 * have been blocked by (-mounts_in_progress) different callers.
420 	 */
421 	mounts_in_progress--;
422 	retval = 1;
423 signaled:
424 	mutex_exit(&mount_lock);
425 	return (retval);
426 }
427 
428 /*
429  * The VFS layer may progress with new mounts as far as we're concerned.
430  * Allow them to progress if we were the last obstacle.
431  */
432 static void
433 resume_mounts(void)
434 {
435 	mutex_enter(&mount_lock);
436 	if (++mounts_in_progress == 0)
437 		cv_broadcast(&mount_cv);
438 	mutex_exit(&mount_lock);
439 }
440 
441 /*
442  * The VFS layer is busy with a mount; zones should wait until all
443  * mounts are completed to progress.
444  */
445 void
446 mount_in_progress(void)
447 {
448 	mutex_enter(&mount_lock);
449 	while (mounts_in_progress < 0)
450 		cv_wait(&mount_cv, &mount_lock);
451 	mounts_in_progress++;
452 	mutex_exit(&mount_lock);
453 }
454 
455 /*
456  * VFS is done with one mount; wake up any waiting block_mounts()
457  * callers if this is the last mount.
458  */
459 void
460 mount_completed(void)
461 {
462 	mutex_enter(&mount_lock);
463 	if (--mounts_in_progress == 0)
464 		cv_broadcast(&mount_cv);
465 	mutex_exit(&mount_lock);
466 }
467 
468 /*
469  * ZSD routines.
470  *
471  * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
472  * defined by the pthread_key_create() and related interfaces.
473  *
474  * Kernel subsystems may register one or more data items and/or
475  * callbacks to be executed when a zone is created, shutdown, or
476  * destroyed.
477  *
478  * Unlike the thread counterpart, destructor callbacks will be executed
479  * even if the data pointer is NULL and/or there are no constructor
480  * callbacks, so it is the responsibility of such callbacks to check for
481  * NULL data values if necessary.
482  *
483  * The locking strategy and overall picture is as follows:
484  *
485  * When someone calls zone_key_create(), a template ZSD entry is added to the
486  * global list "zsd_registered_keys", protected by zsd_key_lock.  The
487  * constructor callback is called immediately on all existing zones, and a
488  * copy of the ZSD entry added to the per-zone zone_zsd list (protected by
489  * zone_lock).  As this operation requires the list of zones, the list of
490  * registered keys, and the per-zone list of ZSD entries to remain constant
491  * throughout the entire operation, it must grab zonehash_lock, zone_lock for
492  * all existing zones, and zsd_key_lock, in that order.  Similar locking is
493  * needed when zone_key_delete() is called.  It is thus sufficient to hold
494  * zsd_key_lock *or* zone_lock to prevent additions to or removals from the
495  * per-zone zone_zsd list.
496  *
497  * Note that this implementation does not make a copy of the ZSD entry if a
498  * constructor callback is not provided.  A zone_getspecific() on such an
499  * uninitialized ZSD entry will return NULL.
500  *
501  * When new zones are created constructor callbacks for all registered ZSD
502  * entries will be called.
503  *
504  * The framework does not provide any locking around zone_getspecific() and
505  * zone_setspecific() apart from that needed for internal consistency, so
506  * callers interested in atomic "test-and-set" semantics will need to provide
507  * their own locking.
508  */
509 void
510 zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
511     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
512 {
513 	struct zsd_entry *zsdp;
514 	struct zsd_entry *t;
515 	struct zone *zone;
516 
517 	zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP);
518 	zsdp->zsd_data = NULL;
519 	zsdp->zsd_create = create;
520 	zsdp->zsd_shutdown = shutdown;
521 	zsdp->zsd_destroy = destroy;
522 
523 	mutex_enter(&zonehash_lock);	/* stop the world */
524 	for (zone = list_head(&zone_active); zone != NULL;
525 	    zone = list_next(&zone_active, zone))
526 		mutex_enter(&zone->zone_lock);	/* lock all zones */
527 
528 	mutex_enter(&zsd_key_lock);
529 	*keyp = zsdp->zsd_key = ++zsd_keyval;
530 	ASSERT(zsd_keyval != 0);
531 	list_insert_tail(&zsd_registered_keys, zsdp);
532 	mutex_exit(&zsd_key_lock);
533 
534 	if (create != NULL) {
535 		for (zone = list_head(&zone_active); zone != NULL;
536 		    zone = list_next(&zone_active, zone)) {
537 			t = kmem_alloc(sizeof (*t), KM_SLEEP);
538 			t->zsd_key = *keyp;
539 			t->zsd_data = (*create)(zone->zone_id);
540 			t->zsd_create = create;
541 			t->zsd_shutdown = shutdown;
542 			t->zsd_destroy = destroy;
543 			list_insert_tail(&zone->zone_zsd, t);
544 		}
545 	}
546 	for (zone = list_head(&zone_active); zone != NULL;
547 	    zone = list_next(&zone_active, zone))
548 		mutex_exit(&zone->zone_lock);
549 	mutex_exit(&zonehash_lock);
550 }
551 
552 /*
553  * Helper function to find the zsd_entry associated with the key in the
554  * given list.
555  */
556 static struct zsd_entry *
557 zsd_find(list_t *l, zone_key_t key)
558 {
559 	struct zsd_entry *zsd;
560 
561 	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
562 		if (zsd->zsd_key == key) {
563 			/*
564 			 * Move to head of list to keep list in MRU order.
565 			 */
566 			if (zsd != list_head(l)) {
567 				list_remove(l, zsd);
568 				list_insert_head(l, zsd);
569 			}
570 			return (zsd);
571 		}
572 	}
573 	return (NULL);
574 }
575 
576 /*
577  * Function called when a module is being unloaded, or otherwise wishes
578  * to unregister its ZSD key and callbacks.
579  */
580 int
581 zone_key_delete(zone_key_t key)
582 {
583 	struct zsd_entry *zsdp = NULL;
584 	zone_t *zone;
585 
586 	mutex_enter(&zonehash_lock);	/* Zone create/delete waits for us */
587 	for (zone = list_head(&zone_active); zone != NULL;
588 	    zone = list_next(&zone_active, zone))
589 		mutex_enter(&zone->zone_lock);	/* lock all zones */
590 
591 	mutex_enter(&zsd_key_lock);
592 	zsdp = zsd_find(&zsd_registered_keys, key);
593 	if (zsdp == NULL)
594 		goto notfound;
595 	list_remove(&zsd_registered_keys, zsdp);
596 	mutex_exit(&zsd_key_lock);
597 
598 	for (zone = list_head(&zone_active); zone != NULL;
599 	    zone = list_next(&zone_active, zone)) {
600 		struct zsd_entry *del;
601 		void *data;
602 
603 		if (!(zone->zone_flags & ZF_DESTROYED)) {
604 			del = zsd_find(&zone->zone_zsd, key);
605 			if (del != NULL) {
606 				data = del->zsd_data;
607 				ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
608 				ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
609 				list_remove(&zone->zone_zsd, del);
610 				kmem_free(del, sizeof (*del));
611 			} else {
612 				data = NULL;
613 			}
614 			if (zsdp->zsd_shutdown)
615 				zsdp->zsd_shutdown(zone->zone_id, data);
616 			if (zsdp->zsd_destroy)
617 				zsdp->zsd_destroy(zone->zone_id, data);
618 		}
619 		mutex_exit(&zone->zone_lock);
620 	}
621 	mutex_exit(&zonehash_lock);
622 	kmem_free(zsdp, sizeof (*zsdp));
623 	return (0);
624 
625 notfound:
626 	mutex_exit(&zsd_key_lock);
627 	for (zone = list_head(&zone_active); zone != NULL;
628 	    zone = list_next(&zone_active, zone))
629 		mutex_exit(&zone->zone_lock);
630 	mutex_exit(&zonehash_lock);
631 	return (-1);
632 }
633 
634 /*
635  * ZSD counterpart of pthread_setspecific().
636  */
637 int
638 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
639 {
640 	struct zsd_entry *t;
641 	struct zsd_entry *zsdp = NULL;
642 
643 	mutex_enter(&zone->zone_lock);
644 	t = zsd_find(&zone->zone_zsd, key);
645 	if (t != NULL) {
646 		/*
647 		 * Replace old value with new
648 		 */
649 		t->zsd_data = (void *)data;
650 		mutex_exit(&zone->zone_lock);
651 		return (0);
652 	}
653 	/*
654 	 * If there was no previous value, go through the list of registered
655 	 * keys.
656 	 *
657 	 * We avoid grabbing zsd_key_lock until we are sure we need it; this is
658 	 * necessary for shutdown callbacks to be able to execute without fear
659 	 * of deadlock.
660 	 */
661 	mutex_enter(&zsd_key_lock);
662 	zsdp = zsd_find(&zsd_registered_keys, key);
663 	if (zsdp == NULL) { 	/* Key was not registered */
664 		mutex_exit(&zsd_key_lock);
665 		mutex_exit(&zone->zone_lock);
666 		return (-1);
667 	}
668 
669 	/*
670 	 * Add a zsd_entry to this zone, using the template we just retrieved
671 	 * to initialize the constructor and destructor(s).
672 	 */
673 	t = kmem_alloc(sizeof (*t), KM_SLEEP);
674 	t->zsd_key = key;
675 	t->zsd_data = (void *)data;
676 	t->zsd_create = zsdp->zsd_create;
677 	t->zsd_shutdown = zsdp->zsd_shutdown;
678 	t->zsd_destroy = zsdp->zsd_destroy;
679 	list_insert_tail(&zone->zone_zsd, t);
680 	mutex_exit(&zsd_key_lock);
681 	mutex_exit(&zone->zone_lock);
682 	return (0);
683 }
684 
685 /*
686  * ZSD counterpart of pthread_getspecific().
687  */
688 void *
689 zone_getspecific(zone_key_t key, zone_t *zone)
690 {
691 	struct zsd_entry *t;
692 	void *data;
693 
694 	mutex_enter(&zone->zone_lock);
695 	t = zsd_find(&zone->zone_zsd, key);
696 	data = (t == NULL ? NULL : t->zsd_data);
697 	mutex_exit(&zone->zone_lock);
698 	return (data);
699 }
700 
701 /*
702  * Function used to initialize a zone's list of ZSD callbacks and data
703  * when the zone is being created.  The callbacks are initialized from
704  * the template list (zsd_registered_keys), and the constructor
705  * callback executed (if one exists).
706  *
707  * This is called before the zone is made publicly available, hence no
708  * need to grab zone_lock.
709  *
710  * Although we grab and release zsd_key_lock, new entries cannot be
711  * added to or removed from the zsd_registered_keys list until we
712  * release zonehash_lock, so there isn't a window for a
713  * zone_key_create() to come in after we've dropped zsd_key_lock but
714  * before the zone is added to the zone list, such that the constructor
715  * callbacks aren't executed for the new zone.
716  */
717 static void
718 zone_zsd_configure(zone_t *zone)
719 {
720 	struct zsd_entry *zsdp;
721 	struct zsd_entry *t;
722 	zoneid_t zoneid = zone->zone_id;
723 
724 	ASSERT(MUTEX_HELD(&zonehash_lock));
725 	ASSERT(list_head(&zone->zone_zsd) == NULL);
726 	mutex_enter(&zsd_key_lock);
727 	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
728 	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
729 		if (zsdp->zsd_create != NULL) {
730 			t = kmem_alloc(sizeof (*t), KM_SLEEP);
731 			t->zsd_key = zsdp->zsd_key;
732 			t->zsd_create = zsdp->zsd_create;
733 			t->zsd_data = (*t->zsd_create)(zoneid);
734 			t->zsd_shutdown = zsdp->zsd_shutdown;
735 			t->zsd_destroy = zsdp->zsd_destroy;
736 			list_insert_tail(&zone->zone_zsd, t);
737 		}
738 	}
739 	mutex_exit(&zsd_key_lock);
740 }
741 
/* The kinds of ZSD callback. */
enum zsd_callback_type {
	ZSD_CREATE,
	ZSD_SHUTDOWN,
	ZSD_DESTROY
};
743 
744 /*
745  * Helper function to execute shutdown or destructor callbacks.
746  */
747 static void
748 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
749 {
750 	struct zsd_entry *zsdp;
751 	struct zsd_entry *t;
752 	zoneid_t zoneid = zone->zone_id;
753 
754 	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
755 	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
756 	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
757 
758 	mutex_enter(&zone->zone_lock);
759 	if (ct == ZSD_DESTROY) {
760 		if (zone->zone_flags & ZF_DESTROYED) {
761 			/*
762 			 * Make sure destructors are only called once.
763 			 */
764 			mutex_exit(&zone->zone_lock);
765 			return;
766 		}
767 		zone->zone_flags |= ZF_DESTROYED;
768 	}
769 	mutex_exit(&zone->zone_lock);
770 
771 	/*
772 	 * Both zsd_key_lock and zone_lock need to be held in order to add or
773 	 * remove a ZSD key, (either globally as part of
774 	 * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is
775 	 * possible through zone_setspecific()), so it's sufficient to hold
776 	 * zsd_key_lock here.
777 	 *
778 	 * This is a good thing, since we don't want to recursively try to grab
779 	 * zone_lock if a callback attempts to do something like a crfree() or
780 	 * zone_rele().
781 	 */
782 	mutex_enter(&zsd_key_lock);
783 	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
784 	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
785 		zone_key_t key = zsdp->zsd_key;
786 
787 		/* Skip if no callbacks registered */
788 		if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL)
789 			continue;
790 		if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL)
791 			continue;
792 		/*
793 		 * Call the callback with the zone-specific data if we can find
794 		 * any, otherwise with NULL.
795 		 */
796 		t = zsd_find(&zone->zone_zsd, key);
797 		if (t != NULL) {
798 			if (ct == ZSD_SHUTDOWN) {
799 				t->zsd_shutdown(zoneid, t->zsd_data);
800 			} else {
801 				ASSERT(ct == ZSD_DESTROY);
802 				t->zsd_destroy(zoneid, t->zsd_data);
803 			}
804 		} else {
805 			if (ct == ZSD_SHUTDOWN) {
806 				zsdp->zsd_shutdown(zoneid, NULL);
807 			} else {
808 				ASSERT(ct == ZSD_DESTROY);
809 				zsdp->zsd_destroy(zoneid, NULL);
810 			}
811 		}
812 	}
813 	mutex_exit(&zsd_key_lock);
814 }
815 
816 /*
817  * Called when the zone is going away; free ZSD-related memory, and
818  * destroy the zone_zsd list.
819  */
820 static void
821 zone_free_zsd(zone_t *zone)
822 {
823 	struct zsd_entry *t, *next;
824 
825 	/*
826 	 * Free all the zsd_entry's we had on this zone.
827 	 */
828 	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
829 		next = list_next(&zone->zone_zsd, t);
830 		list_remove(&zone->zone_zsd, t);
831 		kmem_free(t, sizeof (*t));
832 	}
833 	list_destroy(&zone->zone_zsd);
834 }
835 
836 /*
837  * Frees memory associated with the zone dataset list.
838  */
839 static void
840 zone_free_datasets(zone_t *zone)
841 {
842 	zone_dataset_t *t, *next;
843 
844 	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
845 		next = list_next(&zone->zone_datasets, t);
846 		list_remove(&zone->zone_datasets, t);
847 		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
848 		kmem_free(t, sizeof (*t));
849 	}
850 	list_destroy(&zone->zone_datasets);
851 }
852 
853 /*
854  * zone.cpu-shares resource control support.
855  */
856 /*ARGSUSED*/
857 static rctl_qty_t
858 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
859 {
860 	ASSERT(MUTEX_HELD(&p->p_lock));
861 	return (p->p_zone->zone_shares);
862 }
863 
864 /*ARGSUSED*/
865 static int
866 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
867     rctl_qty_t nv)
868 {
869 	ASSERT(MUTEX_HELD(&p->p_lock));
870 	ASSERT(e->rcep_t == RCENTITY_ZONE);
871 	if (e->rcep_p.zone == NULL)
872 		return (0);
873 
874 	e->rcep_p.zone->zone_shares = nv;
875 	return (0);
876 }
877 
/*
 * Operations vector registered for the zone.cpu-shares resource control.
 * NOTE(review): the slot roles noted below are inferred from the handler
 * names; confirm against the rctl_ops_t definition.
 */
static rctl_ops_t zone_cpu_shares_ops = {
	rcop_no_action,		/* action slot: stock rcop_no_* handler */
	zone_cpu_shares_usage,	/* usage slot: reports zone_shares */
	zone_cpu_shares_set,	/* set slot: stores the new share count */
	rcop_no_test		/* test slot: stock rcop_no_* handler */
};
884 
885 /*ARGSUSED*/
886 static rctl_qty_t
887 zone_lwps_usage(rctl_t *r, proc_t *p)
888 {
889 	rctl_qty_t nlwps;
890 	zone_t *zone = p->p_zone;
891 
892 	ASSERT(MUTEX_HELD(&p->p_lock));
893 
894 	mutex_enter(&zone->zone_nlwps_lock);
895 	nlwps = zone->zone_nlwps;
896 	mutex_exit(&zone->zone_nlwps_lock);
897 
898 	return (nlwps);
899 }
900 
901 /*ARGSUSED*/
902 static int
903 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
904     rctl_qty_t incr, uint_t flags)
905 {
906 	rctl_qty_t nlwps;
907 
908 	ASSERT(MUTEX_HELD(&p->p_lock));
909 	ASSERT(e->rcep_t == RCENTITY_ZONE);
910 	if (e->rcep_p.zone == NULL)
911 		return (0);
912 	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
913 	nlwps = e->rcep_p.zone->zone_nlwps;
914 
915 	if (nlwps + incr > rcntl->rcv_value)
916 		return (1);
917 
918 	return (0);
919 }
920 
921 /*ARGSUSED*/
922 static int
923 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
924 {
925 	ASSERT(MUTEX_HELD(&p->p_lock));
926 	ASSERT(e->rcep_t == RCENTITY_ZONE);
927 	if (e->rcep_p.zone == NULL)
928 		return (0);
929 	e->rcep_p.zone->zone_nlwps_ctl = nv;
930 	return (0);
931 }
932 
/*
 * Operations vector registered for the zone.max-lwps resource control.
 * NOTE(review): the slot roles noted below are inferred from the handler
 * names; confirm against the rctl_ops_t definition.
 */
static rctl_ops_t zone_lwps_ops = {
	rcop_no_action,		/* action slot: stock rcop_no_* handler */
	zone_lwps_usage,	/* usage slot: reports zone_nlwps */
	zone_lwps_set,		/* set slot: stores zone_nlwps_ctl */
	zone_lwps_test,		/* test slot: checks nlwps + incr vs. limit */
};
939 
940 /*ARGSUSED*/
941 static int
942 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
943     rctl_qty_t incr, uint_t flags)
944 {
945 	rctl_qty_t v;
946 	ASSERT(MUTEX_HELD(&p->p_lock));
947 	ASSERT(e->rcep_t == RCENTITY_ZONE);
948 	v = e->rcep_p.zone->zone_shmmax + incr;
949 	if (v > rval->rcv_value)
950 		return (1);
951 	return (0);
952 }
953 
/*
 * Operations vector registered for the zone.max-shm-memory resource
 * control; only a test handler is supplied, the other slots use the stock
 * rcop_no_* handlers.
 * NOTE(review): slot roles are inferred from the handler names; confirm
 * against the rctl_ops_t definition.
 */
static rctl_ops_t zone_shmmax_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_shmmax_test	/* checks zone_shmmax + incr vs. limit */
};
960 
961 /*ARGSUSED*/
962 static int
963 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
964     rctl_qty_t incr, uint_t flags)
965 {
966 	rctl_qty_t v;
967 	ASSERT(MUTEX_HELD(&p->p_lock));
968 	ASSERT(e->rcep_t == RCENTITY_ZONE);
969 	v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
970 	if (v > rval->rcv_value)
971 		return (1);
972 	return (0);
973 }
974 
/*
 * Operations vector registered for the zone.max-shm-ids resource control;
 * only a test handler is supplied, the other slots use the stock
 * rcop_no_* handlers.
 * NOTE(review): slot roles are inferred from the handler names; confirm
 * against the rctl_ops_t definition.
 */
static rctl_ops_t zone_shmmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_shmmni_test	/* checks ipcq_shmmni + incr vs. limit */
};
981 
982 /*ARGSUSED*/
983 static int
984 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
985     rctl_qty_t incr, uint_t flags)
986 {
987 	rctl_qty_t v;
988 	ASSERT(MUTEX_HELD(&p->p_lock));
989 	ASSERT(e->rcep_t == RCENTITY_ZONE);
990 	v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
991 	if (v > rval->rcv_value)
992 		return (1);
993 	return (0);
994 }
995 
/*
 * Operations vector registered for the zone.max-sem-ids resource control;
 * only a test handler is supplied, the other slots use the stock
 * rcop_no_* handlers.
 * NOTE(review): slot roles are inferred from the handler names; confirm
 * against the rctl_ops_t definition.
 */
static rctl_ops_t zone_semmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_semmni_test	/* checks ipcq_semmni + incr vs. limit */
};
1002 
1003 /*ARGSUSED*/
1004 static int
1005 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
1006     rctl_qty_t incr, uint_t flags)
1007 {
1008 	rctl_qty_t v;
1009 	ASSERT(MUTEX_HELD(&p->p_lock));
1010 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1011 	v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
1012 	if (v > rval->rcv_value)
1013 		return (1);
1014 	return (0);
1015 }
1016 
/*
 * Operations vector registered for the zone.max-msg-ids resource control;
 * only a test handler is supplied, the other slots use the stock
 * rcop_no_* handlers.
 * NOTE(review): slot roles are inferred from the handler names; confirm
 * against the rctl_ops_t definition.
 */
static rctl_ops_t zone_msgmni_ops = {
	rcop_no_action,
	rcop_no_usage,
	rcop_no_set,
	zone_msgmni_test	/* checks ipcq_msgmni + incr vs. limit */
};
1023 
1024 /*ARGSUSED*/
1025 static rctl_qty_t
1026 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
1027 {
1028 	rctl_qty_t q;
1029 	ASSERT(MUTEX_HELD(&p->p_lock));
1030 	mutex_enter(&p->p_zone->zone_mem_lock);
1031 	q = p->p_zone->zone_locked_mem;
1032 	mutex_exit(&p->p_zone->zone_mem_lock);
1033 	return (q);
1034 }
1035 
1036 /*ARGSUSED*/
1037 static int
1038 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1039     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1040 {
1041 	rctl_qty_t q;
1042 	zone_t *z;
1043 
1044 	z = e->rcep_p.zone;
1045 	ASSERT(MUTEX_HELD(&p->p_lock));
1046 	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1047 	q = z->zone_locked_mem;
1048 	if (q + incr > rcntl->rcv_value)
1049 		return (1);
1050 	return (0);
1051 }
1052 
1053 /*ARGSUSED*/
1054 static int
1055 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1056     rctl_qty_t nv)
1057 {
1058 	ASSERT(MUTEX_HELD(&p->p_lock));
1059 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1060 	if (e->rcep_p.zone == NULL)
1061 		return (0);
1062 	e->rcep_p.zone->zone_locked_mem_ctl = nv;
1063 	return (0);
1064 }
1065 
/*
 * Operations vector registered for the zone.max-locked-memory resource
 * control.
 * NOTE(review): the slot roles noted below are inferred from the handler
 * names; confirm against the rctl_ops_t definition.
 */
static rctl_ops_t zone_locked_mem_ops = {
	rcop_no_action,		/* action slot: stock rcop_no_* handler */
	zone_locked_mem_usage,	/* usage slot: reports zone_locked_mem */
	zone_locked_mem_set,	/* set slot: stores zone_locked_mem_ctl */
	zone_locked_mem_test	/* test slot: checks usage + incr vs. limit */
};
1072 
1073 /*ARGSUSED*/
1074 static rctl_qty_t
1075 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
1076 {
1077 	rctl_qty_t q;
1078 	zone_t *z = p->p_zone;
1079 
1080 	ASSERT(MUTEX_HELD(&p->p_lock));
1081 	mutex_enter(&z->zone_mem_lock);
1082 	q = z->zone_max_swap;
1083 	mutex_exit(&z->zone_mem_lock);
1084 	return (q);
1085 }
1086 
1087 /*ARGSUSED*/
1088 static int
1089 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
1090     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
1091 {
1092 	rctl_qty_t q;
1093 	zone_t *z;
1094 
1095 	z = e->rcep_p.zone;
1096 	ASSERT(MUTEX_HELD(&p->p_lock));
1097 	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
1098 	q = z->zone_max_swap;
1099 	if (q + incr > rcntl->rcv_value)
1100 		return (1);
1101 	return (0);
1102 }
1103 
1104 /*ARGSUSED*/
1105 static int
1106 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
1107     rctl_qty_t nv)
1108 {
1109 	ASSERT(MUTEX_HELD(&p->p_lock));
1110 	ASSERT(e->rcep_t == RCENTITY_ZONE);
1111 	if (e->rcep_p.zone == NULL)
1112 		return (0);
1113 	e->rcep_p.zone->zone_max_swap_ctl = nv;
1114 	return (0);
1115 }
1116 
/*
 * Operations vector registered for the zone.max-swap resource control.
 * NOTE(review): the slot roles noted below are inferred from the handler
 * names; confirm against the rctl_ops_t definition.
 */
static rctl_ops_t zone_max_swap_ops = {
	rcop_no_action,		/* action slot: stock rcop_no_* handler */
	zone_max_swap_usage,	/* usage slot: reports zone_max_swap */
	zone_max_swap_set,	/* set slot: stores zone_max_swap_ctl */
	zone_max_swap_test	/* test slot: checks usage + incr vs. limit */
};
1123 
1124 /*
1125  * Helper function to brand the zone with a unique ID.
1126  */
1127 static void
1128 zone_uniqid(zone_t *zone)
1129 {
1130 	static uint64_t uniqid = 0;
1131 
1132 	ASSERT(MUTEX_HELD(&zonehash_lock));
1133 	zone->zone_uniqid = uniqid++;
1134 }
1135 
1136 /*
1137  * Returns a held pointer to the "kcred" for the specified zone.
1138  */
1139 struct cred *
1140 zone_get_kcred(zoneid_t zoneid)
1141 {
1142 	zone_t *zone;
1143 	cred_t *cr;
1144 
1145 	if ((zone = zone_find_by_id(zoneid)) == NULL)
1146 		return (NULL);
1147 	cr = zone->zone_kcred;
1148 	crhold(cr);
1149 	zone_rele(zone);
1150 	return (cr);
1151 }
1152 
1153 static int
1154 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
1155 {
1156 	zone_t *zone = ksp->ks_private;
1157 	zone_kstat_t *zk = ksp->ks_data;
1158 
1159 	if (rw == KSTAT_WRITE)
1160 		return (EACCES);
1161 
1162 	zk->zk_usage.value.ui64 = zone->zone_locked_mem;
1163 	zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
1164 	return (0);
1165 }
1166 
1167 static int
1168 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
1169 {
1170 	zone_t *zone = ksp->ks_private;
1171 	zone_kstat_t *zk = ksp->ks_data;
1172 
1173 	if (rw == KSTAT_WRITE)
1174 		return (EACCES);
1175 
1176 	zk->zk_usage.value.ui64 = zone->zone_max_swap;
1177 	zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
1178 	return (0);
1179 }
1180 
1181 static void
1182 zone_kstat_create(zone_t *zone)
1183 {
1184 	kstat_t *ksp;
1185 	zone_kstat_t *zk;
1186 
1187 	ksp = rctl_kstat_create_zone(zone, "lockedmem", KSTAT_TYPE_NAMED,
1188 	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1189 	    KSTAT_FLAG_VIRTUAL);
1190 
1191 	if (ksp == NULL)
1192 		return;
1193 
1194 	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1195 	ksp->ks_data_size += strlen(zone->zone_name) + 1;
1196 	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1197 	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1198 	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1199 	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1200 	ksp->ks_update = zone_lockedmem_kstat_update;
1201 	ksp->ks_private = zone;
1202 	kstat_install(ksp);
1203 
1204 	zone->zone_lockedmem_kstat = ksp;
1205 
1206 	ksp = rctl_kstat_create_zone(zone, "swapresv", KSTAT_TYPE_NAMED,
1207 	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
1208 	    KSTAT_FLAG_VIRTUAL);
1209 
1210 	if (ksp == NULL)
1211 		return;
1212 
1213 	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
1214 	ksp->ks_data_size += strlen(zone->zone_name) + 1;
1215 	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
1216 	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
1217 	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
1218 	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
1219 	ksp->ks_update = zone_swapresv_kstat_update;
1220 	ksp->ks_private = zone;
1221 	kstat_install(ksp);
1222 
1223 	zone->zone_swapresv_kstat = ksp;
1224 }
1225 
1226 static void
1227 zone_kstat_delete(zone_t *zone)
1228 {
1229 	void *data;
1230 
1231 	if (zone->zone_lockedmem_kstat != NULL) {
1232 		data = zone->zone_lockedmem_kstat->ks_data;
1233 		kstat_delete(zone->zone_lockedmem_kstat);
1234 		kmem_free(data, sizeof (zone_kstat_t));
1235 	}
1236 	if (zone->zone_swapresv_kstat != NULL) {
1237 		data = zone->zone_swapresv_kstat->ks_data;
1238 		kstat_delete(zone->zone_swapresv_kstat);
1239 		kmem_free(data, sizeof (zone_kstat_t));
1240 	}
1241 }
1242 
/*
 * Called very early on in boot to initialize the ZSD list so that
 * zone_key_create() can be called before zone_init().  It also initializes
 * portions of zone0 which may be used before zone_init() is called.  The
 * variable "global_zone" will be set when zone0 is fully initialized by
 * zone_init().
 *
 * In order, this sets up the global locks and lists (registered ZSD keys,
 * active zones, deathrow), fills in zone0's locks, resource-control
 * counters, identity and root path, links zone0 onto zone_active, gives it
 * a full privilege set, and associates p0 with zone0.
 */
void
zone_zsd_init(void)
{
	/* Global locks and lists used by the zones framework. */
	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_create(&zone_active, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));
	list_create(&zone_deathrow, sizeof (zone_t),
	    offsetof(zone_t, zone_linkage));

	/* zone0's own locks, then its resource usage counters and limits. */
	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
	zone0.zone_shares = 1;
	zone0.zone_nlwps = 0;
	zone0.zone_nlwps_ctl = INT_MAX;
	zone0.zone_locked_mem = 0;
	zone0.zone_locked_mem_ctl = UINT64_MAX;
	/* zone_max_swap is expected to be zero already; it is not reset. */
	ASSERT(zone0.zone_max_swap == 0);
	zone0.zone_max_swap_ctl = UINT64_MAX;
	zone0.zone_shmmax = 0;
	zone0.zone_ipc.ipcq_shmmni = 0;
	zone0.zone_ipc.ipcq_semmni = 0;
	zone0.zone_ipc.ipcq_msgmni = 0;
	/* Identity and state of the global zone. */
	zone0.zone_name = GLOBAL_ZONENAME;
	zone0.zone_nodename = utsname.nodename;
	zone0.zone_domain = srpc_domain;
	zone0.zone_ref = 1;
	zone0.zone_id = GLOBAL_ZONEID;
	zone0.zone_status = ZONE_IS_RUNNING;
	zone0.zone_rootpath = "/";
	zone0.zone_rootpathlen = 2;	/* "/" plus its terminating NUL */
	zone0.zone_psetid = ZONE_PS_INVAL;
	zone0.zone_ncpus = 0;
	zone0.zone_ncpus_online = 0;
	zone0.zone_proc_initpid = 1;
	zone0.zone_initname = initname;
	/* The kstats are created later, by zone_kstat_create(). */
	zone0.zone_lockedmem_kstat = NULL;
	zone0.zone_swapresv_kstat = NULL;
	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
	    offsetof(struct zsd_entry, zsd_linkage));
	list_insert_head(&zone_active, &zone0);

	/*
	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
	 * to anything meaningful.  It is assigned to be 'rootdir' in
	 * vfs_mountroot().
	 */
	zone0.zone_rootvp = NULL;
	zone0.zone_vfslist = NULL;
	zone0.zone_bootargs = initargs;
	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
	/*
	 * The global zone has all privileges
	 */
	priv_fillset(zone0.zone_privset);
	/*
	 * Add p0 to the global zone
	 */
	zone0.zone_zsched = &p0;
	p0.p_zone = &zone0;
}
1314 
1315 /*
1316  * Compute a hash value based on the contents of the label and the DOI.  The
1317  * hash algorithm is somewhat arbitrary, but is based on the observation that
1318  * humans will likely pick labels that differ by amounts that work out to be
1319  * multiples of the number of hash chains, and thus stirring in some primes
1320  * should help.
1321  */
1322 static uint_t
1323 hash_bylabel(void *hdata, mod_hash_key_t key)
1324 {
1325 	const ts_label_t *lab = (ts_label_t *)key;
1326 	const uint32_t *up, *ue;
1327 	uint_t hash;
1328 	int i;
1329 
1330 	_NOTE(ARGUNUSED(hdata));
1331 
1332 	hash = lab->tsl_doi + (lab->tsl_doi << 1);
1333 	/* we depend on alignment of label, but not representation */
1334 	up = (const uint32_t *)&lab->tsl_label;
1335 	ue = up + sizeof (lab->tsl_label) / sizeof (*up);
1336 	i = 1;
1337 	while (up < ue) {
1338 		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
1339 		hash += *up + (*up << ((i % 16) + 1));
1340 		up++;
1341 		i++;
1342 	}
1343 	return (hash);
1344 }
1345 
1346 /*
1347  * All that mod_hash cares about here is zero (equal) versus non-zero (not
1348  * equal).  This may need to be changed if less than / greater than is ever
1349  * needed.
1350  */
1351 static int
1352 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1353 {
1354 	ts_label_t *lab1 = (ts_label_t *)key1;
1355 	ts_label_t *lab2 = (ts_label_t *)key2;
1356 
1357 	return (label_equal(lab1, lab2) ? 0 : 1);
1358 }
1359 
/*
 * Called by main() to initialize the zones framework.
 *
 * Runs in the context of p0.  It creates the zone ID space, registers the
 * zone.* resource controls, finishes initializing zone0 (rctl set, LWP and
 * task counts, brand, kstats, label), creates the zone lookup hashes and
 * enters zone0 into them, publishes zone0 as global_zone, and binds the
 * sysevent channel used for zone status notifications.  Panics if the
 * channel cannot be bound.
 */
void
zone_init(void)
{
	rctl_dict_entry_t *rde;
	rctl_val_t *dval;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;
	int res;

	ASSERT(curproc == &p0);

	/*
	 * Create ID space for zone IDs.  ID 0 is reserved for the
	 * global zone.
	 */
	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);

	/*
	 * Initialize generic zone resource controls, if any.
	 */
	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
	    FSS_MAXSHARES, FSS_MAXSHARES,
	    &zone_cpu_shares_ops);

	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_lwps_ops);
	/*
	 * System V IPC resource controls
	 */
	rc_zone_msgmni = rctl_register("zone.max-msg-ids",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);

	rc_zone_semmni = rctl_register("zone.max-sem-ids",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);

	rc_zone_shmmni = rctl_register("zone.max-shm-ids",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);

	rc_zone_shmmax = rctl_register("zone.max-shm-memory",
	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);

	/*
	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
	 */
	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
	bzero(dval, sizeof (rctl_val_t));
	dval->rcv_value = 1;
	dval->rcv_privilege = RCPRIV_PRIVILEGED;
	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
	dval->rcv_action_recip_pid = -1;

	rde = rctl_dict_lookup("zone.cpu-shares");
	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);

	rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
	    &zone_locked_mem_ops);

	rc_zone_max_swap = rctl_register("zone.max-swap",
	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
	    &zone_max_swap_ops);

	/*
	 * Initialize the ``global zone''.
	 *
	 * The rctl set for zone0 is built with p0.p_lock held; the same
	 * critical section seeds zone0's LWP count from p0 and sets its
	 * task count to 1.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&p0.p_lock);
	e.rcep_p.zone = &zone0;
	e.rcep_t = RCENTITY_ZONE;
	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
	    gp);

	zone0.zone_nlwps = p0.p_lwpcnt;
	zone0.zone_ntasks = 1;
	mutex_exit(&p0.p_lock);
	zone0.zone_restart_init = B_TRUE;
	zone0.zone_brand = &native_brand;
	rctl_prealloc_destroy(gp);
	/*
	 * pool_default hasn't been initialized yet, so we let pool_init()
	 * take care of making sure the global zone is in the default pool.
	 */

	/*
	 * Initialize global zone kstats
	 */
	zone_kstat_create(&zone0);

	/*
	 * Initialize zone label.
	 * mlp are initialized when tnzonecfg is loaded.
	 */
	zone0.zone_slabel = l_admin_low;
	rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
	label_hold(l_admin_low);

	/*
	 * With zonehash_lock held: assign zone0 its unique ID, create the
	 * lookup hashes and insert zone0 into each of them.
	 */
	mutex_enter(&zonehash_lock);
	zone_uniqid(&zone0);
	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);

	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
	    mod_hash_null_valdtor);
	zonehashbyname = mod_hash_create_strhash("zone_by_name",
	    zone_hash_size, mod_hash_null_valdtor);
	/*
	 * maintain zonehashbylabel only for labeled systems
	 */
	if (is_system_labeled())
		zonehashbylabel = mod_hash_create_extended("zone_by_label",
		    zone_hash_size, mod_hash_null_keydtor,
		    mod_hash_null_valdtor, hash_bylabel, NULL,
		    hash_labelkey_cmp, KM_SLEEP);
	zonecount = 1;

	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
	    (mod_hash_val_t)&zone0);
	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
	    (mod_hash_val_t)&zone0);
	if (is_system_labeled()) {
		zone0.zone_flags |= ZF_HASHED_LABEL;
		(void) mod_hash_insert(zonehashbylabel,
		    (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
	}
	mutex_exit(&zonehash_lock);

	/*
	 * We avoid setting zone_kcred until now, since kcred is initialized
	 * sometime after zone_zsd_init() and before zone_init().
	 */
	zone0.zone_kcred = kcred;
	/*
	 * The global zone is fully initialized (except for zone_rootvp which
	 * will be set when the root filesystem is mounted).
	 */
	global_zone = &zone0;

	/*
	 * Setup an event channel to send zone status change notifications on
	 */
	res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
	    EVCH_CREAT);

	if (res)
		panic("Sysevent_evc_bind failed during zone setup.\n");

}
1521 
/*
 * Free a zone_t and everything hanging off it.
 *
 * The zone must not be the global zone, must have no tasks, LWPs or cred
 * references left, must already have dropped its kcred, and must be either
 * ZONE_IS_DEAD or ZONE_IS_UNINITIALIZED.  A dead zone is first taken off
 * the deathrow list.  Each optional member is released only if it is set,
 * then the zone ID is returned to zoneid_space, the zone's lock, condition
 * variable and MLP rwlock are destroyed, and the structure itself is freed.
 */
static void
zone_free(zone_t *zone)
{
	ASSERT(zone != global_zone);
	ASSERT(zone->zone_ntasks == 0);
	ASSERT(zone->zone_nlwps == 0);
	ASSERT(zone->zone_cred_ref == 0);
	ASSERT(zone->zone_kcred == NULL);
	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);

	/* remove from deathrow list */
	if (zone_status_get(zone) == ZONE_IS_DEAD) {
		ASSERT(zone->zone_ref == 0);
		mutex_enter(&zone_deathrow_lock);
		list_remove(&zone_deathrow, zone);
		mutex_exit(&zone_deathrow_lock);
	}

	zone_free_zsd(zone);
	zone_free_datasets(zone);

	if (zone->zone_rootvp != NULL)
		VN_RELE(zone->zone_rootvp);
	/* The root path is freed using its recorded length. */
	if (zone->zone_rootpath)
		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
	/* Name, nodename and domain are freed as fixed-size buffers. */
	if (zone->zone_name != NULL)
		kmem_free(zone->zone_name, ZONENAME_MAX);
	if (zone->zone_slabel != NULL)
		label_rele(zone->zone_slabel);
	if (zone->zone_nodename != NULL)
		kmem_free(zone->zone_nodename, _SYS_NMLN);
	if (zone->zone_domain != NULL)
		kmem_free(zone->zone_domain, _SYS_NMLN);
	if (zone->zone_privset != NULL)
		kmem_free(zone->zone_privset, sizeof (priv_set_t));
	if (zone->zone_rctls != NULL)
		rctl_set_free(zone->zone_rctls);
	/* bootargs and initname are freed as exact-length strings. */
	if (zone->zone_bootargs != NULL)
		kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1);
	if (zone->zone_initname != NULL)
		kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1);
	id_free(zoneid_space, zone->zone_id);
	mutex_destroy(&zone->zone_lock);
	cv_destroy(&zone->zone_cv);
	rw_destroy(&zone->zone_mlps.mlpl_rwlock);
	kmem_free(zone, sizeof (zone_t));
}
1570 
1571 /*
1572  * See block comment at the top of this file for information about zone
1573  * status values.
1574  */
1575 /*
1576  * Convenience function for setting zone status.
1577  */
1578 static void
1579 zone_status_set(zone_t *zone, zone_status_t status)
1580 {
1581 
1582 	nvlist_t *nvl = NULL;
1583 	ASSERT(MUTEX_HELD(&zone_status_lock));
1584 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
1585 	    status >= zone_status_get(zone));
1586 
1587 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
1588 	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
1589 	    nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
1590 	    zone_status_table[status]) ||
1591 	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
1592 	    zone_status_table[zone->zone_status]) ||
1593 	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
1594 	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
1595 	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
1596 	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
1597 #ifdef DEBUG
1598 		(void) printf(
1599 		    "Failed to allocate and send zone state change event.\n");
1600 #endif
1601 	}
1602 	nvlist_free(nvl);
1603 
1604 	zone->zone_status = status;
1605 
1606 	cv_broadcast(&zone->zone_cv);
1607 }
1608 
1609 /*
1610  * Public function to retrieve the zone status.  The zone status may
1611  * change after it is retrieved.
1612  */
1613 zone_status_t
1614 zone_status_get(zone_t *zone)
1615 {
1616 	return (zone->zone_status);
1617 }
1618 
1619 static int
1620 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
1621 {
1622 	char *bootargs = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
1623 	int err = 0;
1624 
1625 	ASSERT(zone != global_zone);
1626 	if ((err = copyinstr(zone_bootargs, bootargs, BOOTARGS_MAX, NULL)) != 0)
1627 		goto done;	/* EFAULT or ENAMETOOLONG */
1628 
1629 	if (zone->zone_bootargs != NULL)
1630 		kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1);
1631 
1632 	zone->zone_bootargs = kmem_alloc(strlen(bootargs) + 1, KM_SLEEP);
1633 	(void) strcpy(zone->zone_bootargs, bootargs);
1634 
1635 done:
1636 	kmem_free(bootargs, BOOTARGS_MAX);
1637 	return (err);
1638 }
1639 
1640 static int
1641 zone_set_initname(zone_t *zone, const char *zone_initname)
1642 {
1643 	char initname[INITNAME_SZ];
1644 	size_t len;
1645 	int err = 0;
1646 
1647 	ASSERT(zone != global_zone);
1648 	if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
1649 		return (err);	/* EFAULT or ENAMETOOLONG */
1650 
1651 	if (zone->zone_initname != NULL)
1652 		kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1);
1653 
1654 	zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
1655 	(void) strcpy(zone->zone_initname, initname);
1656 	return (0);
1657 }
1658 
1659 static int
1660 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
1661 {
1662 	uint64_t mcap;
1663 	int err = 0;
1664 
1665 	if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
1666 		zone->zone_phys_mcap = mcap;
1667 
1668 	return (err);
1669 }
1670 
1671 static int
1672 zone_set_sched_class(zone_t *zone, const char *new_class)
1673 {
1674 	char sched_class[PC_CLNMSZ];
1675 	id_t classid;
1676 	int err;
1677 
1678 	ASSERT(zone != global_zone);
1679 	if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
1680 		return (err);	/* EFAULT or ENAMETOOLONG */
1681 
1682 	if (getcid(sched_class, &classid) != 0 || classid == syscid)
1683 		return (set_errno(EINVAL));
1684 	zone->zone_defaultcid = classid;
1685 	ASSERT(zone->zone_defaultcid > 0 &&
1686 	    zone->zone_defaultcid < loaded_classes);
1687 
1688 	return (0);
1689 }
1690 
1691 /*
1692  * Block indefinitely waiting for (zone_status >= status)
1693  */
1694 void
1695 zone_status_wait(zone_t *zone, zone_status_t status)
1696 {
1697 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1698 
1699 	mutex_enter(&zone_status_lock);
1700 	while (zone->zone_status < status) {
1701 		cv_wait(&zone->zone_cv, &zone_status_lock);
1702 	}
1703 	mutex_exit(&zone_status_lock);
1704 }
1705 
/*
 * Private CPR-safe version of zone_status_wait().
 *
 * Blocks until (zone_status >= status), bracketing each cv_wait() with
 * CALLB_CPR_SAFE_BEGIN/END.  'str' is handed to CALLB_CPR_INIT();
 * NOTE(review): presumably a name identifying the waiting thread to the
 * CPR framework -- confirm against the callb documentation.
 */
static void
zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
{
	callb_cpr_t cprinfo;

	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);

	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
	    str);
	mutex_enter(&zone_status_lock);
	while (zone->zone_status < status) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&zone->zone_cv, &zone_status_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
	}
	/*
	 * zone_status_lock is implicitly released by the following.
	 */
	CALLB_CPR_EXIT(&cprinfo);
}
1729 
1730 /*
1731  * Block until zone enters requested state or signal is received.  Return (0)
1732  * if signaled, non-zero otherwise.
1733  */
1734 int
1735 zone_status_wait_sig(zone_t *zone, zone_status_t status)
1736 {
1737 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1738 
1739 	mutex_enter(&zone_status_lock);
1740 	while (zone->zone_status < status) {
1741 		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
1742 			mutex_exit(&zone_status_lock);
1743 			return (0);
1744 		}
1745 	}
1746 	mutex_exit(&zone_status_lock);
1747 	return (1);
1748 }
1749 
1750 /*
1751  * Block until the zone enters the requested state or the timeout expires,
1752  * whichever happens first.  Return (-1) if operation timed out, time remaining
1753  * otherwise.
1754  */
1755 clock_t
1756 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
1757 {
1758 	clock_t timeleft = 0;
1759 
1760 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1761 
1762 	mutex_enter(&zone_status_lock);
1763 	while (zone->zone_status < status && timeleft != -1) {
1764 		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
1765 	}
1766 	mutex_exit(&zone_status_lock);
1767 	return (timeleft);
1768 }
1769 
1770 /*
1771  * Block until the zone enters the requested state, the current process is
1772  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
1773  * operation timed out, 0 if signaled, time remaining otherwise.
1774  */
1775 clock_t
1776 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
1777 {
1778 	clock_t timeleft = tim - lbolt;
1779 
1780 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1781 
1782 	mutex_enter(&zone_status_lock);
1783 	while (zone->zone_status < status) {
1784 		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
1785 		    tim);
1786 		if (timeleft <= 0)
1787 			break;
1788 	}
1789 	mutex_exit(&zone_status_lock);
1790 	return (timeleft);
1791 }
1792 
1793 /*
1794  * Zones have two reference counts: one for references from credential
1795  * structures (zone_cred_ref), and one (zone_ref) for everything else.
1796  * This is so we can allow a zone to be rebooted while there are still
1797  * outstanding cred references, since certain drivers cache dblks (which
1798  * implicitly results in cached creds).  We wait for zone_ref to drop to
1799  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
1800  * later freed when the zone_cred_ref drops to 0, though nothing other
1801  * than the zone id and privilege set should be accessed once the zone
1802  * is "dead".
1803  *
1804  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
1805  * to force halt/reboot to block waiting for the zone_cred_ref to drop
1806  * to 0.  This can be useful to flush out other sources of cached creds
1807  * that may be less innocuous than the driver case.
1808  */
1809 
/* Debug tunable: non-zero makes ZONE_IS_UNREF() also require zone_cred_ref == 0. */
int zone_wait_for_cred = 0;
1811 
1812 static void
1813 zone_hold_locked(zone_t *z)
1814 {
1815 	ASSERT(MUTEX_HELD(&z->zone_lock));
1816 	z->zone_ref++;
1817 	ASSERT(z->zone_ref != 0);
1818 }
1819 
/*
 * Take a (non-cred) reference on the zone.  Acquires z->zone_lock around
 * the increment done by zone_hold_locked(); the reference is dropped with
 * zone_rele().
 */
void
zone_hold(zone_t *z)
{
	mutex_enter(&z->zone_lock);
	zone_hold_locked(z);
	mutex_exit(&z->zone_lock);
}
1827 
/*
 * If the non-cred ref count drops to 1 and either the cred ref count
 * is 0 or we aren't waiting for cred references, the zone is ready to
 * be destroyed.
 *
 * The macro reads zone_ref and zone_cred_ref without taking any lock
 * itself; zone_rele() and zone_cred_rele() evaluate it while holding
 * zone_lock.  Note that 'zone' is evaluated more than once.
 */
#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
	    (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
1835 
1836 void
1837 zone_rele(zone_t *z)
1838 {
1839 	boolean_t wakeup;
1840 
1841 	mutex_enter(&z->zone_lock);
1842 	ASSERT(z->zone_ref != 0);
1843 	z->zone_ref--;
1844 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
1845 		/* no more refs, free the structure */
1846 		mutex_exit(&z->zone_lock);
1847 		zone_free(z);
1848 		return;
1849 	}
1850 	/* signal zone_destroy so the zone can finish halting */
1851 	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
1852 	mutex_exit(&z->zone_lock);
1853 
1854 	if (wakeup) {
1855 		/*
1856 		 * Grabbing zonehash_lock here effectively synchronizes with
1857 		 * zone_destroy() to avoid missed signals.
1858 		 */
1859 		mutex_enter(&zonehash_lock);
1860 		cv_broadcast(&zone_destroy_cv);
1861 		mutex_exit(&zonehash_lock);
1862 	}
1863 }
1864 
1865 void
1866 zone_cred_hold(zone_t *z)
1867 {
1868 	mutex_enter(&z->zone_lock);
1869 	z->zone_cred_ref++;
1870 	ASSERT(z->zone_cred_ref != 0);
1871 	mutex_exit(&z->zone_lock);
1872 }
1873 
1874 void
1875 zone_cred_rele(zone_t *z)
1876 {
1877 	boolean_t wakeup;
1878 
1879 	mutex_enter(&z->zone_lock);
1880 	ASSERT(z->zone_cred_ref != 0);
1881 	z->zone_cred_ref--;
1882 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
1883 		/* no more refs, free the structure */
1884 		mutex_exit(&z->zone_lock);
1885 		zone_free(z);
1886 		return;
1887 	}
1888 	/*
1889 	 * If zone_destroy is waiting for the cred references to drain
1890 	 * out, and they have, signal it.
1891 	 */
1892 	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
1893 	    zone_status_get(z) >= ZONE_IS_DEAD);
1894 	mutex_exit(&z->zone_lock);
1895 
1896 	if (wakeup) {
1897 		/*
1898 		 * Grabbing zonehash_lock here effectively synchronizes with
1899 		 * zone_destroy() to avoid missed signals.
1900 		 */
1901 		mutex_enter(&zonehash_lock);
1902 		cv_broadcast(&zone_destroy_cv);
1903 		mutex_exit(&zonehash_lock);
1904 	}
1905 }
1906 
1907 void
1908 zone_task_hold(zone_t *z)
1909 {
1910 	mutex_enter(&z->zone_lock);
1911 	z->zone_ntasks++;
1912 	ASSERT(z->zone_ntasks != 0);
1913 	mutex_exit(&z->zone_lock);
1914 }
1915 
/*
 * Drop one task from the zone's task count (zone_ntasks) and advance the
 * zone's state when the count reaches 1 or 0:
 *
 *   - count drops to 1 while the zone is ZONE_IS_SHUTTING_DOWN (and the
 *     count is still 1 when re-checked): the zone becomes ZONE_IS_EMPTY.
 *   - count drops to 0: zone_zsched is cleared and the zone becomes
 *     ZONE_IS_DEAD.
 *
 * A temporary zone hold is taken for the slow paths and released before
 * returning.
 */
void
zone_task_rele(zone_t *zone)
{
	uint_t refcnt;

	mutex_enter(&zone->zone_lock);
	ASSERT(zone->zone_ntasks != 0);
	refcnt = --zone->zone_ntasks;
	if (refcnt > 1)	{	/* Common case */
		mutex_exit(&zone->zone_lock);
		return;
	}
	zone_hold_locked(zone);	/* so we can use the zone_t later */
	mutex_exit(&zone->zone_lock);
	if (refcnt == 1) {
		/*
		 * See if the zone is shutting down.
		 */
		mutex_enter(&zone_status_lock);
		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
			goto out;
		}

		/*
		 * Make sure the ntasks didn't change since we
		 * dropped zone_lock.
		 */
		mutex_enter(&zone->zone_lock);
		if (refcnt != zone->zone_ntasks) {
			mutex_exit(&zone->zone_lock);
			goto out;
		}
		mutex_exit(&zone->zone_lock);

		/*
		 * No more user processes in the zone.  The zone is empty.
		 */
		zone_status_set(zone, ZONE_IS_EMPTY);
		goto out;
	}

	ASSERT(refcnt == 0);
	/*
	 * zsched has exited; the zone is dead.
	 */
	zone->zone_zsched = NULL;		/* paranoia */
	mutex_enter(&zone_status_lock);
	zone_status_set(zone, ZONE_IS_DEAD);
out:
	/*
	 * Every path that reaches this label holds zone_status_lock.
	 * Drop it, then release the hold taken by zone_hold_locked() above.
	 */
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
}
1968 
1969 zoneid_t
1970 getzoneid(void)
1971 {
1972 	return (curproc->p_zone->zone_id);
1973 }
1974 
1975 /*
1976  * Internal versions of zone_find_by_*().  These don't zone_hold() or
1977  * check the validity of a zone's state.
1978  */
1979 static zone_t *
1980 zone_find_all_by_id(zoneid_t zoneid)
1981 {
1982 	mod_hash_val_t hv;
1983 	zone_t *zone = NULL;
1984 
1985 	ASSERT(MUTEX_HELD(&zonehash_lock));
1986 
1987 	if (mod_hash_find(zonehashbyid,
1988 	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
1989 		zone = (zone_t *)hv;
1990 	return (zone);
1991 }
1992 
1993 static zone_t *
1994 zone_find_all_by_label(const ts_label_t *label)
1995 {
1996 	mod_hash_val_t hv;
1997 	zone_t *zone = NULL;
1998 
1999 	ASSERT(MUTEX_HELD(&zonehash_lock));
2000 
2001 	/*
2002 	 * zonehashbylabel is not maintained for unlabeled systems
2003 	 */
2004 	if (!is_system_labeled())
2005 		return (NULL);
2006 	if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
2007 		zone = (zone_t *)hv;
2008 	return (zone);
2009 }
2010 
2011 static zone_t *
2012 zone_find_all_by_name(char *name)
2013 {
2014 	mod_hash_val_t hv;
2015 	zone_t *zone = NULL;
2016 
2017 	ASSERT(MUTEX_HELD(&zonehash_lock));
2018 
2019 	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
2020 		zone = (zone_t *)hv;
2021 	return (zone);
2022 }
2023 
2024 /*
2025  * Public interface for looking up a zone by zoneid.  Only returns the zone if
2026  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
2027  * Caller must call zone_rele() once it is done with the zone.
2028  *
2029  * The zone may begin the zone_destroy() sequence immediately after this
2030  * function returns, but may be safely used until zone_rele() is called.
2031  */
2032 zone_t *
2033 zone_find_by_id(zoneid_t zoneid)
2034 {
2035 	zone_t *zone;
2036 	zone_status_t status;
2037 
2038 	mutex_enter(&zonehash_lock);
2039 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2040 		mutex_exit(&zonehash_lock);
2041 		return (NULL);
2042 	}
2043 	status = zone_status_get(zone);
2044 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2045 		/*
2046 		 * For all practical purposes the zone doesn't exist.
2047 		 */
2048 		mutex_exit(&zonehash_lock);
2049 		return (NULL);
2050 	}
2051 	zone_hold(zone);
2052 	mutex_exit(&zonehash_lock);
2053 	return (zone);
2054 }
2055 
2056 /*
2057  * Similar to zone_find_by_id, but using zone label as the key.
2058  */
2059 zone_t *
2060 zone_find_by_label(const ts_label_t *label)
2061 {
2062 	zone_t *zone;
2063 	zone_status_t status;
2064 
2065 	mutex_enter(&zonehash_lock);
2066 	if ((zone = zone_find_all_by_label(label)) == NULL) {
2067 		mutex_exit(&zonehash_lock);
2068 		return (NULL);
2069 	}
2070 
2071 	status = zone_status_get(zone);
2072 	if (status > ZONE_IS_DOWN) {
2073 		/*
2074 		 * For all practical purposes the zone doesn't exist.
2075 		 */
2076 		mutex_exit(&zonehash_lock);
2077 		return (NULL);
2078 	}
2079 	zone_hold(zone);
2080 	mutex_exit(&zonehash_lock);
2081 	return (zone);
2082 }
2083 
2084 /*
2085  * Similar to zone_find_by_id, but using zone name as the key.
2086  */
2087 zone_t *
2088 zone_find_by_name(char *name)
2089 {
2090 	zone_t *zone;
2091 	zone_status_t status;
2092 
2093 	mutex_enter(&zonehash_lock);
2094 	if ((zone = zone_find_all_by_name(name)) == NULL) {
2095 		mutex_exit(&zonehash_lock);
2096 		return (NULL);
2097 	}
2098 	status = zone_status_get(zone);
2099 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2100 		/*
2101 		 * For all practical purposes the zone doesn't exist.
2102 		 */
2103 		mutex_exit(&zonehash_lock);
2104 		return (NULL);
2105 	}
2106 	zone_hold(zone);
2107 	mutex_exit(&zonehash_lock);
2108 	return (zone);
2109 }
2110 
2111 /*
2112  * Similar to zone_find_by_id(), using the path as a key.  For instance,
2113  * if there is a zone "foo" rooted at /foo/root, and the path argument
2114  * is "/foo/root/proc", it will return the held zone_t corresponding to
2115  * zone "foo".
2116  *
2117  * zone_find_by_path() always returns a non-NULL value, since at the
2118  * very least every path will be contained in the global zone.
2119  *
2120  * As with the other zone_find_by_*() functions, the caller is
2121  * responsible for zone_rele()ing the return value of this function.
2122  */
2123 zone_t *
2124 zone_find_by_path(const char *path)
2125 {
2126 	zone_t *zone;
2127 	zone_t *zret = NULL;
2128 	zone_status_t status;
2129 
2130 	if (path == NULL) {
2131 		/*
2132 		 * Call from rootconf().
2133 		 */
2134 		zone_hold(global_zone);
2135 		return (global_zone);
2136 	}
2137 	ASSERT(*path == '/');
2138 	mutex_enter(&zonehash_lock);
2139 	for (zone = list_head(&zone_active); zone != NULL;
2140 	    zone = list_next(&zone_active, zone)) {
2141 		if (ZONE_PATH_VISIBLE(path, zone))
2142 			zret = zone;
2143 	}
2144 	ASSERT(zret != NULL);
2145 	status = zone_status_get(zret);
2146 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
2147 		/*
2148 		 * Zone practically doesn't exist.
2149 		 */
2150 		zret = global_zone;
2151 	}
2152 	zone_hold(zret);
2153 	mutex_exit(&zonehash_lock);
2154 	return (zret);
2155 }
2156 
2157 /*
2158  * Get the number of cpus visible to this zone.  The system-wide global
2159  * 'ncpus' is returned if pools are disabled, the caller is in the
2160  * global zone, or a NULL zone argument is passed in.
2161  */
2162 int
2163 zone_ncpus_get(zone_t *zone)
2164 {
2165 	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
2166 
2167 	return (myncpus != 0 ? myncpus : ncpus);
2168 }
2169 
2170 /*
2171  * Get the number of online cpus visible to this zone.  The system-wide
2172  * global 'ncpus_online' is returned if pools are disabled, the caller
2173  * is in the global zone, or a NULL zone argument is passed in.
2174  */
2175 int
2176 zone_ncpus_online_get(zone_t *zone)
2177 {
2178 	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
2179 
2180 	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
2181 }
2182 
2183 /*
2184  * Return the pool to which the zone is currently bound.
2185  */
2186 pool_t *
2187 zone_pool_get(zone_t *zone)
2188 {
2189 	ASSERT(pool_lock_held());
2190 
2191 	return (zone->zone_pool);
2192 }
2193 
2194 /*
2195  * Set the zone's pool pointer and update the zone's visibility to match
2196  * the resources in the new pool.
2197  */
2198 void
2199 zone_pool_set(zone_t *zone, pool_t *pool)
2200 {
2201 	ASSERT(pool_lock_held());
2202 	ASSERT(MUTEX_HELD(&cpu_lock));
2203 
2204 	zone->zone_pool = pool;
2205 	zone_pset_set(zone, pool->pool_pset->pset_id);
2206 }
2207 
2208 /*
2209  * Return the cached value of the id of the processor set to which the
2210  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
2211  * facility is disabled.
2212  */
2213 psetid_t
2214 zone_pset_get(zone_t *zone)
2215 {
2216 	ASSERT(MUTEX_HELD(&cpu_lock));
2217 
2218 	return (zone->zone_psetid);
2219 }
2220 
2221 /*
2222  * Set the cached value of the id of the processor set to which the zone
2223  * is currently bound.  Also update the zone's visibility to match the
2224  * resources in the new processor set.
2225  */
2226 void
2227 zone_pset_set(zone_t *zone, psetid_t newpsetid)
2228 {
2229 	psetid_t oldpsetid;
2230 
2231 	ASSERT(MUTEX_HELD(&cpu_lock));
2232 	oldpsetid = zone_pset_get(zone);
2233 
2234 	if (oldpsetid == newpsetid)
2235 		return;
2236 	/*
2237 	 * Global zone sees all.
2238 	 */
2239 	if (zone != global_zone) {
2240 		zone->zone_psetid = newpsetid;
2241 		if (newpsetid != ZONE_PS_INVAL)
2242 			pool_pset_visibility_add(newpsetid, zone);
2243 		if (oldpsetid != ZONE_PS_INVAL)
2244 			pool_pset_visibility_remove(oldpsetid, zone);
2245 	}
2246 	/*
2247 	 * Disabling pools, so we should start using the global values
2248 	 * for ncpus and ncpus_online.
2249 	 */
2250 	if (newpsetid == ZONE_PS_INVAL) {
2251 		zone->zone_ncpus = 0;
2252 		zone->zone_ncpus_online = 0;
2253 	}
2254 }
2255 
2256 /*
2257  * Walk the list of active zones and issue the provided callback for
2258  * each of them.
2259  *
2260  * Caller must not be holding any locks that may be acquired under
2261  * zonehash_lock.  See comment at the beginning of the file for a list of
2262  * common locks and their interactions with zones.
2263  */
2264 int
2265 zone_walk(int (*cb)(zone_t *, void *), void *data)
2266 {
2267 	zone_t *zone;
2268 	int ret = 0;
2269 	zone_status_t status;
2270 
2271 	mutex_enter(&zonehash_lock);
2272 	for (zone = list_head(&zone_active); zone != NULL;
2273 	    zone = list_next(&zone_active, zone)) {
2274 		/*
2275 		 * Skip zones that shouldn't be externally visible.
2276 		 */
2277 		status = zone_status_get(zone);
2278 		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
2279 			continue;
2280 		/*
2281 		 * Bail immediately if any callback invocation returns a
2282 		 * non-zero value.
2283 		 */
2284 		ret = (*cb)(zone, data);
2285 		if (ret != 0)
2286 			break;
2287 	}
2288 	mutex_exit(&zonehash_lock);
2289 	return (ret);
2290 }
2291 
/*
 * Resolve the user-supplied path 'upath' and record it as the zone's root.
 *
 * On success the zone keeps a hold on the resolved vnode (zone_rootvp),
 * and zone_rootpath points to a kmem_alloc()ed copy of the resolved path
 * with a trailing '/' appended; zone_rootpathlen is the size of that
 * buffer, including the terminating NUL.  ZF_IS_SCRATCH is set when the
 * stored path ends in "/lu/".
 *
 * Returns 0 on success or an errno value.  Lookups that fail with ESTALE
 * are retried a bounded number of times before ESTALE is returned.
 */
static int
zone_set_root(zone_t *zone, const char *upath)
{
	vnode_t *vp;
	int trycount;
	int error = 0;
	char *path;
	struct pathname upn, pn;
	size_t pathlen;

	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
		return (error);

	pn_alloc(&pn);

	/* prevent infinite loop */
	trycount = 10;
	for (;;) {
		if (--trycount <= 0) {
			error = ESTALE;
			goto out;
		}

		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
			/*
			 * VOP_ACCESS() may cover 'vp' with a new
			 * filesystem, if 'vp' is an autoFS vnode.
			 * Get the new 'vp' if so.
			 */
			if ((error = VOP_ACCESS(vp, VEXEC, 0, CRED())) == 0 &&
			    (vp->v_vfsmountedhere == NULL ||
			    (error = traverse(&vp)) == 0)) {
				/* room for the path, a trailing '/' and NUL */
				pathlen = pn.pn_pathlen + 2;
				path = kmem_alloc(pathlen, KM_SLEEP);
				(void) strncpy(path, pn.pn_path,
				    pn.pn_pathlen + 1);
				path[pathlen - 2] = '/';
				path[pathlen - 1] = '\0';
				pn_free(&pn);
				pn_free(&upn);

				/* Success! */
				break;
			}
			/* access check or traverse failed; drop the vnode */
			VN_RELE(vp);
		}
		/* only ESTALE is worth another attempt */
		if (error != ESTALE)
			goto out;
	}

	ASSERT(error == 0);
	zone->zone_rootvp = vp;		/* we hold a reference to vp */
	zone->zone_rootpath = path;
	zone->zone_rootpathlen = pathlen;
	/* pathlen counts the NUL, so this compares the last four characters */
	if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
		zone->zone_flags |= ZF_IS_SCRATCH;
	return (0);

out:
	/* failure: the pathname buffers have not been freed yet */
	pn_free(&pn);
	pn_free(&upn);
	return (error);
}
2355 
/* True when c is an ASCII digit or an ASCII letter of either case. */
#define	isalnum(c)	(('0' <= (c) && (c) <= '9') || \
			('a' <= (c) && (c) <= 'z') || \
			('A' <= (c) && (c) <= 'Z'))
2359 
2360 static int
2361 zone_set_name(zone_t *zone, const char *uname)
2362 {
2363 	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
2364 	size_t len;
2365 	int i, err;
2366 
2367 	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
2368 		kmem_free(kname, ZONENAME_MAX);
2369 		return (err);	/* EFAULT or ENAMETOOLONG */
2370 	}
2371 
2372 	/* must be less than ZONENAME_MAX */
2373 	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
2374 		kmem_free(kname, ZONENAME_MAX);
2375 		return (EINVAL);
2376 	}
2377 
2378 	/*
2379 	 * Name must start with an alphanumeric and must contain only
2380 	 * alphanumerics, '-', '_' and '.'.
2381 	 */
2382 	if (!isalnum(kname[0])) {
2383 		kmem_free(kname, ZONENAME_MAX);
2384 		return (EINVAL);
2385 	}
2386 	for (i = 1; i < len - 1; i++) {
2387 		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
2388 		    kname[i] != '.') {
2389 			kmem_free(kname, ZONENAME_MAX);
2390 			return (EINVAL);
2391 		}
2392 	}
2393 
2394 	zone->zone_name = kname;
2395 	return (0);
2396 }
2397 
/*
 * Similar to thread_create(), but makes sure the thread is in the appropriate
 * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
 *
 * The arguments are handed to thread_create() unchanged, except that the
 * owning process is always the zone's zsched and the thread is created
 * TS_STOPPED so it cannot run until setup is finished.  A zone hold is
 * taken on behalf of the new thread, the thread is linked into the zone's
 * zone_kthreads list, flagged TP_ZTHREAD, moved to zsched's project and
 * then made runnable.  Returns the new thread.
 */
/*ARGSUSED*/
kthread_t *
zthread_create(
    caddr_t stk,
    size_t stksize,
    void (*proc)(),
    void *arg,
    size_t len,
    pri_t pri)
{
	kthread_t *t;
	zone_t *zone = curproc->p_zone;
	proc_t *pp = zone->zone_zsched;

	zone_hold(zone);	/* Reference to be dropped when thread exits */

	/*
	 * No-one should be trying to create threads if the zone is shutting
	 * down and there aren't any kernel threads around.  See comment
	 * in zthread_exit().
	 */
	ASSERT(!(zone->zone_kthreads == NULL &&
	    zone_status_get(zone) >= ZONE_IS_EMPTY));
	/*
	 * Create a thread, but don't let it run until we've finished setting
	 * things up.
	 */
	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
	ASSERT(t->t_forw == NULL);
	/*
	 * Link the thread into the zone's circular, doubly-linked list of
	 * kernel threads (via t_forw/t_back) and make it the list head.
	 * The list is manipulated under zone_status_lock.
	 */
	mutex_enter(&zone_status_lock);
	if (zone->zone_kthreads == NULL) {
		t->t_forw = t->t_back = t;
	} else {
		kthread_t *tx = zone->zone_kthreads;

		t->t_forw = tx;
		t->t_back = tx->t_back;
		tx->t_back->t_forw = t;
		tx->t_back = t;
	}
	zone->zone_kthreads = t;
	mutex_exit(&zone_status_lock);

	mutex_enter(&pp->p_lock);
	t->t_proc_flag |= TP_ZTHREAD;
	/* swap the thread's project hold for one on zsched's project */
	project_rele(t->t_proj);
	t->t_proj = project_hold(pp->p_task->tk_proj);

	/*
	 * Setup complete, let it run.
	 */
	thread_lock(t);
	t->t_schedflag |= TS_ALLSTART;
	setrun_locked(t);
	thread_unlock(t);

	mutex_exit(&pp->p_lock);

	return (t);
}
2462 
/*
 * Similar to thread_exit().  Must be called by threads created via
 * zthread_create().
 *
 * The calling thread is reparented to p0, unlinked from its zone's
 * zone_kthreads list and drops its zone hold before calling
 * thread_exit().  If it was the last kernel thread of a zone that is
 * ZONE_IS_EMPTY, the zone is moved to ZONE_IS_DOWN.
 */
void
zthread_exit(void)
{
	kthread_t *t = curthread;
	proc_t *pp = curproc;
	zone_t *zone = pp->p_zone;

	/* held across the reparenting and the list update below */
	mutex_enter(&zone_status_lock);

	/*
	 * Reparent to p0
	 */
	kpreempt_disable();
	mutex_enter(&pp->p_lock);
	t->t_proc_flag &= ~TP_ZTHREAD;
	t->t_procp = &p0;
	hat_thread_exit(t);
	mutex_exit(&pp->p_lock);
	kpreempt_enable();

	if (t->t_back == t) {
		/* this thread is the only one on the zone's list */
		ASSERT(t->t_forw == t);
		/*
		 * If the zone is empty, once the thread count
		 * goes to zero no further kernel threads can be
		 * created.  This is because if the creator is a process
		 * in the zone, then it must have exited before the zone
		 * state could be set to ZONE_IS_EMPTY.
		 * Otherwise, if the creator is a kernel thread in the
		 * zone, the thread count is non-zero.
		 *
		 * This really means that non-zone kernel threads should
		 * not create zone kernel threads.
		 */
		zone->zone_kthreads = NULL;
		if (zone_status_get(zone) == ZONE_IS_EMPTY) {
			zone_status_set(zone, ZONE_IS_DOWN);
		}
	} else {
		/* unlink from the circular list, moving the head if needed */
		t->t_forw->t_back = t->t_back;
		t->t_back->t_forw = t->t_forw;
		if (zone->zone_kthreads == t)
			zone->zone_kthreads = t->t_forw;
	}
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
	thread_exit();
	/* NOTREACHED */
}
2516 
2517 static void
2518 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
2519 {
2520 	vnode_t *oldvp;
2521 
2522 	/* we're going to hold a reference here to the directory */
2523 	VN_HOLD(vp);
2524 
2525 #ifdef C2_AUDIT
2526 	if (audit_active)	/* update abs cwd/root path see c2audit.c */
2527 		audit_chdirec(vp, vpp);
2528 #endif
2529 
2530 	mutex_enter(&pp->p_lock);
2531 	oldvp = *vpp;
2532 	*vpp = vp;
2533 	mutex_exit(&pp->p_lock);
2534 	if (oldvp != NULL)
2535 		VN_RELE(oldvp);
2536 }
2537 
2538 /*
2539  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
2540  */
2541 static int
2542 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
2543 {
2544 	nvpair_t *nvp = NULL;
2545 	boolean_t priv_set = B_FALSE;
2546 	boolean_t limit_set = B_FALSE;
2547 	boolean_t action_set = B_FALSE;
2548 
2549 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
2550 		const char *name;
2551 		uint64_t ui64;
2552 
2553 		name = nvpair_name(nvp);
2554 		if (nvpair_type(nvp) != DATA_TYPE_UINT64)
2555 			return (EINVAL);
2556 		(void) nvpair_value_uint64(nvp, &ui64);
2557 		if (strcmp(name, "privilege") == 0) {
2558 			/*
2559 			 * Currently only privileged values are allowed, but
2560 			 * this may change in the future.
2561 			 */
2562 			if (ui64 != RCPRIV_PRIVILEGED)
2563 				return (EINVAL);
2564 			rv->rcv_privilege = ui64;
2565 			priv_set = B_TRUE;
2566 		} else if (strcmp(name, "limit") == 0) {
2567 			rv->rcv_value = ui64;
2568 			limit_set = B_TRUE;
2569 		} else if (strcmp(name, "action") == 0) {
2570 			if (ui64 != RCTL_LOCAL_NOACTION &&
2571 			    ui64 != RCTL_LOCAL_DENY)
2572 				return (EINVAL);
2573 			rv->rcv_flagaction = ui64;
2574 			action_set = B_TRUE;
2575 		} else {
2576 			return (EINVAL);
2577 		}
2578 	}
2579 
2580 	if (!(priv_set && limit_set && action_set))
2581 		return (EINVAL);
2582 	rv->rcv_action_signal = 0;
2583 	rv->rcv_action_recipient = NULL;
2584 	rv->rcv_action_recip_pid = -1;
2585 	rv->rcv_firing_time = 0;
2586 
2587 	return (0);
2588 }
2589 
/*
 * Non-global zone version of start_init.
 *
 * Records the calling process's pid as the zone's init pid, runs
 * start_init_common() and stores its result in zone_boot_err.  On
 * failure a zone that is still ZONE_IS_BOOTING is moved to
 * ZONE_IS_SHUTTING_DOWN and the process is disposed of; on success a
 * zone that is still ZONE_IS_BOOTING is moved to ZONE_IS_RUNNING and the
 * lwp returns to userland.
 */
void
zone_start_init(void)
{
	proc_t *p = ttoproc(curthread);
	zone_t *z = p->p_zone;

	ASSERT(!INGLOBALZONE(curproc));

	/*
	 * For all purposes (ZONE_ATTR_INITPID and restart_init),
	 * storing just the pid of init is sufficient.
	 */
	z->zone_proc_initpid = p->p_pid;

	/*
	 * We maintain zone_boot_err so that we can return the cause of the
	 * failure back to the caller of the zone_boot syscall.
	 */
	p->p_zone->zone_boot_err = start_init_common();

	mutex_enter(&zone_status_lock);
	if (z->zone_boot_err != 0) {
		/*
		 * Make sure we are still in the booting state-- we could have
		 * raced and already be shutting down, or even further along.
		 */
		if (zone_status_get(z) == ZONE_IS_BOOTING)
			zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
		mutex_exit(&zone_status_lock);
		/* It's gone bad, dispose of the process */
		if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
			/* proc_exit() did not take the process down; exit this lwp */
			mutex_enter(&p->p_lock);
			ASSERT(p->p_flag & SEXITLWPS);
			lwp_exit();
		}
	} else {
		/* same race as above: only advance if still booting */
		if (zone_status_get(z) == ZONE_IS_BOOTING)
			zone_status_set(z, ZONE_IS_RUNNING);
		mutex_exit(&zone_status_lock);
		/* cause the process to return to userland. */
		lwp_rtt();
	}
}
2636 
/*
 * Argument bundle handed to zsched().
 */
struct zsched_arg {
	zone_t *zone;		/* zone that zsched() sets up and serves */
	nvlist_t *nvlist;	/* rctl values that zsched() applies to the zone */
};
2641 
/*
 * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
 * anything to do with scheduling, but rather with the fact that
 * per-zone kernel threads are parented to zsched, just like regular
 * kernel threads are parented to sched (p0).
 *
 * zsched is also responsible for launching init for the zone.
 *
 * 'arg' points to a struct zsched_arg naming the zone and the nvlist of
 * rctls to apply.  In order, this function: moves the calling process
 * into the zone (parent, counts, task, credentials, root and cwd),
 * initializes and populates the zone's rctl set, marks the zone
 * ZONE_IS_READY, launches init once the zone is ZONE_IS_BOOTING, waits
 * for ZONE_IS_DYING and finally exits.
 */
static void
zsched(void *arg)
{
	struct zsched_arg *za = arg;
	proc_t *pp = curproc;
	proc_t *initp = proc_init;
	zone_t *zone = za->zone;
	cred_t *cr, *oldcred;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	contract_t *ct = NULL;
	task_t *tk, *oldtk;
	rctl_entity_p_t e;
	kproject_t *pj;

	nvlist_t *nvl = za->nvlist;
	nvpair_t *nvp = NULL;

	/* present this process as "zsched", with no arguments or open files */
	bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
	bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
	PTOU(pp)->u_argc = 0;
	PTOU(pp)->u_argv = NULL;
	PTOU(pp)->u_envp = NULL;
	closeall(P_FINFO(pp));

	/*
	 * We are this zone's "zsched" process.  As the zone isn't generally
	 * visible yet we don't need to grab any locks before initializing its
	 * zone_zsched pointer.
	 */
	zone_hold(zone);  /* this hold is released by zone_destroy() */
	zone->zone_zsched = pp;
	mutex_enter(&pp->p_lock);
	pp->p_zone = zone;
	mutex_exit(&pp->p_lock);

	/*
	 * Disassociate process from its 'parent'; parent ourselves to init
	 * (pid 1) and change other values as needed.
	 */
	sess_create();

	mutex_enter(&pidlock);
	proc_detach(pp);
	pp->p_ppid = 1;
	pp->p_flag |= SZONETOP;
	pp->p_ancpid = 1;
	pp->p_parent = initp;
	/* insert ourselves at the head of init's child list */
	pp->p_psibling = NULL;
	if (initp->p_child)
		initp->p_child->p_psibling = pp;
	pp->p_sibling = initp->p_child;
	initp->p_child = pp;

	/* Decrement what newproc() incremented. */
	upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
	/*
	 * Our credentials are about to become kcred-like, so we don't care
	 * about the caller's ruid.
	 */
	upcount_inc(crgetruid(kcred), zone->zone_id);
	mutex_exit(&pidlock);

	/*
	 * getting out of global zone, so decrement lwp counts
	 */
	pj = pp->p_task->tk_proj;
	mutex_enter(&global_zone->zone_nlwps_lock);
	pj->kpj_nlwps -= pp->p_lwpcnt;
	global_zone->zone_nlwps -= pp->p_lwpcnt;
	mutex_exit(&global_zone->zone_nlwps_lock);

	/*
	 * Decrement locked memory counts on old zone and project.
	 */
	mutex_enter(&global_zone->zone_mem_lock);
	global_zone->zone_locked_mem -= pp->p_locked_mem;
	pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
	mutex_exit(&global_zone->zone_mem_lock);

	/*
	 * Create and join a new task in project '0' of this zone.
	 *
	 * We don't need to call holdlwps() since we know we're the only lwp in
	 * this process.
	 *
	 * task_join() returns with p_lock held.
	 */
	tk = task_create(0, zone);
	mutex_enter(&cpu_lock);
	oldtk = task_join(tk, 0);

	/* re-read the project: it is now the one of the task just joined */
	pj = pp->p_task->tk_proj;

	mutex_enter(&zone->zone_mem_lock);
	zone->zone_locked_mem += pp->p_locked_mem;
	pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
	mutex_exit(&zone->zone_mem_lock);

	/*
	 * add lwp counts to zsched's zone, and increment project's task count
	 * due to the task created by task_create() above
	 */

	mutex_enter(&zone->zone_nlwps_lock);
	pj->kpj_nlwps += pp->p_lwpcnt;
	pj->kpj_ntasks += 1;
	zone->zone_nlwps += pp->p_lwpcnt;
	mutex_exit(&zone->zone_nlwps_lock);

	/* drop the p_lock that task_join() returned with */
	mutex_exit(&curproc->p_lock);
	mutex_exit(&cpu_lock);
	task_rele(oldtk);

	/*
	 * The process was created by a process in the global zone, hence the
	 * credentials are wrong.  We might as well have kcred-ish credentials.
	 */
	cr = zone->zone_kcred;
	crhold(cr);
	mutex_enter(&pp->p_crlock);
	oldcred = pp->p_cred;
	pp->p_cred = cr;
	mutex_exit(&pp->p_crlock);
	crfree(oldcred);

	/*
	 * Hold credentials again (for thread)
	 */
	crhold(cr);

	/*
	 * p_lwpcnt can't change since this is a kernel process.
	 */
	crset(pp, cr);

	/*
	 * Chroot
	 */
	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);

	/*
	 * Initialize zone's rctl set.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&pp->p_lock);
	e.rcep_p.zone = zone;
	e.rcep_t = RCENTITY_ZONE;
	zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
	mutex_exit(&pp->p_lock);
	rctl_prealloc_destroy(gp);

	/*
	 * Apply the rctls passed in to zone_create().  This is basically a list
	 * assignment: all of the old values are removed and the new ones
	 * inserted.  That is, if an empty list is passed in, all values are
	 * removed.
	 */
	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		rctl_dict_entry_t *rde;
		rctl_hndl_t hndl;
		char *name;
		nvlist_t **nvlarray;
		uint_t i, nelem;
		int error;	/* For ASSERT()s */

		name = nvpair_name(nvp);
		hndl = rctl_hndl_lookup(name);
		ASSERT(hndl != -1);
		rde = rctl_dict_lookup_hndl(hndl);
		ASSERT(rde != NULL);

		/*
		 * Delete existing values one at a time until the first
		 * remaining value is the RCPRIV_SYSTEM one.
		 */
		for (; /* ever */; ) {
			rctl_val_t oval;

			mutex_enter(&pp->p_lock);
			error = rctl_local_get(hndl, NULL, &oval, pp);
			mutex_exit(&pp->p_lock);
			ASSERT(error == 0);	/* Can't fail for RCTL_FIRST */
			ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
			if (oval.rcv_privilege == RCPRIV_SYSTEM)
				break;
			mutex_enter(&pp->p_lock);
			error = rctl_local_delete(hndl, &oval, pp);
			mutex_exit(&pp->p_lock);
			ASSERT(error == 0);
		}
		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
		ASSERT(error == 0);
		for (i = 0; i < nelem; i++) {
			rctl_val_t *nvalp;

			nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
			error = nvlist2rctlval(nvlarray[i], nvalp);
			ASSERT(error == 0);
			/*
			 * rctl_local_insert can fail if the value being
			 * inserted is a duplicate; this is OK.
			 */
			mutex_enter(&pp->p_lock);
			if (rctl_local_insert(hndl, nvalp, pp) != 0)
				kmem_cache_free(rctl_val_cache, nvalp);
			mutex_exit(&pp->p_lock);
		}
	}
	/*
	 * Tell the world that we're done setting up.
	 *
	 * At this point we want to set the zone status to ZONE_IS_READY
	 * and atomically set the zone's processor set visibility.  Once
	 * we drop pool_lock() this zone will automatically get updated
	 * to reflect any future changes to the pools configuration.
	 */
	pool_lock();
	mutex_enter(&cpu_lock);
	mutex_enter(&zonehash_lock);
	zone_uniqid(zone);
	zone_zsd_configure(zone);
	if (pool_state == POOL_ENABLED)
		zone_pset_set(zone, pool_default->pool_pset->pset_id);
	mutex_enter(&zone_status_lock);
	ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
	zone_status_set(zone, ZONE_IS_READY);
	mutex_exit(&zone_status_lock);
	mutex_exit(&zonehash_lock);
	mutex_exit(&cpu_lock);
	pool_unlock();

	/*
	 * Once we see the zone transition to the ZONE_IS_BOOTING state,
	 * we launch init, and set the state to running.
	 */
	zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");

	/* the wait may return with the zone already past ZONE_IS_BOOTING */
	if (zone_status_get(zone) == ZONE_IS_BOOTING) {
		id_t cid;

		/*
		 * Ok, this is a little complicated.  We need to grab the
		 * zone's pool's scheduling class ID; note that by now, we
		 * are already bound to a pool if we need to be (zoneadmd
		 * will have done that to us while we're in the READY
		 * state).  *But* the scheduling class for the zone's 'init'
		 * must be explicitly passed to newproc, which doesn't
		 * respect pool bindings.
		 *
		 * We hold the pool_lock across the call to newproc() to
		 * close the obvious race: the pool's scheduling class
		 * could change before we manage to create the LWP with
		 * classid 'cid'.
		 */
		pool_lock();
		if (zone->zone_defaultcid > 0)
			cid = zone->zone_defaultcid;
		else
			cid = pool_get_class(zone->zone_pool);
		if (cid == -1)
			cid = defaultcid;

		/*
		 * If this fails, zone_boot will ultimately fail.  The
		 * state of the zone will be set to SHUTTING_DOWN-- userland
		 * will have to tear down the zone, and fail, or try again.
		 */
		if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
		    minclsyspri - 1, &ct)) != 0) {
			mutex_enter(&zone_status_lock);
			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
			mutex_exit(&zone_status_lock);
		}
		pool_unlock();
	}

	/*
	 * Wait for zone_destroy() to be called.  This is what we spend
	 * most of our life doing.
	 */
	zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");

	if (ct)
		/*
		 * At this point the process contract should be empty.
		 * (Though if it isn't, it's not the end of the world.)
		 */
		VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);

	/*
	 * Allow kcred to be freed when all referring processes
	 * (including this one) go away.  We can't just do this in
	 * zone_free because we need to wait for the zone_cred_ref to
	 * drop to 0 before calling zone_free, and the existence of
	 * zone_kcred will prevent that.  Thus, we call crfree here to
	 * balance the crdup in zone_create.  The crhold calls earlier
	 * in zsched will be dropped when the thread and process exit.
	 */
	crfree(zone->zone_kcred);
	zone->zone_kcred = NULL;

	exit(CLD_EXITED, 0);
}
2952 
2953 /*
2954  * Helper function to determine if there are any submounts of the
2955  * provided path.  Used to make sure the zone doesn't "inherit" any
2956  * mounts from before it is created.
2957  */
2958 static uint_t
2959 zone_mount_count(const char *rootpath)
2960 {
2961 	vfs_t *vfsp;
2962 	uint_t count = 0;
2963 	size_t rootpathlen = strlen(rootpath);
2964 
2965 	/*
2966 	 * Holding zonehash_lock prevents race conditions with
2967 	 * vfs_list_add()/vfs_list_remove() since we serialize with
2968 	 * zone_find_by_path().
2969 	 */
2970 	ASSERT(MUTEX_HELD(&zonehash_lock));
2971 	/*
2972 	 * The rootpath must end with a '/'
2973 	 */
2974 	ASSERT(rootpath[rootpathlen - 1] == '/');
2975 
2976 	/*
2977 	 * This intentionally does not count the rootpath itself if that
2978 	 * happens to be a mount point.
2979 	 */
2980 	vfs_list_read_lock();
2981 	vfsp = rootvfs;
2982 	do {
2983 		if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
2984 		    rootpathlen) == 0)
2985 			count++;
2986 		vfsp = vfsp->vfs_next;
2987 	} while (vfsp != rootvfs);
2988 	vfs_list_unlock();
2989 	return (count);
2990 }
2991 
2992 /*
2993  * Helper function to make sure that a zone created on 'rootpath'
2994  * wouldn't end up containing other zones' rootpaths.
2995  */
2996 static boolean_t
2997 zone_is_nested(const char *rootpath)
2998 {
2999 	zone_t *zone;
3000 	size_t rootpathlen = strlen(rootpath);
3001 	size_t len;
3002 
3003 	ASSERT(MUTEX_HELD(&zonehash_lock));
3004 
3005 	for (zone = list_head(&zone_active); zone != NULL;
3006 	    zone = list_next(&zone_active, zone)) {
3007 		if (zone == global_zone)
3008 			continue;
3009 		len = strlen(zone->zone_rootpath);
3010 		if (strncmp(rootpath, zone->zone_rootpath,
3011 		    MIN(rootpathlen, len)) == 0)
3012 			return (B_TRUE);
3013 	}
3014 	return (B_FALSE);
3015 }
3016 
3017 static int
3018 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
3019     size_t zone_privssz)
3020 {
3021 	priv_set_t *privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
3022 
3023 	if (zone_privssz < sizeof (priv_set_t))
3024 		return (set_errno(ENOMEM));
3025 
3026 	if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
3027 		kmem_free(privs, sizeof (priv_set_t));
3028 		return (EFAULT);
3029 	}
3030 
3031 	zone->zone_privset = privs;
3032 	return (0);
3033 }
3034 
3035 /*
3036  * We make creative use of nvlists to pass in rctls from userland.  The list is
3037  * a list of the following structures:
3038  *
3039  * (name = rctl_name, value = nvpair_list_array)
3040  *
3041  * Where each element of the nvpair_list_array is of the form:
3042  *
3043  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
3044  * 	(name = "limit", value = uint64_t),
3045  * 	(name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
3046  */
3047 static int
3048 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
3049 {
3050 	nvpair_t *nvp = NULL;
3051 	nvlist_t *nvl = NULL;
3052 	char *kbuf;
3053 	int error;
3054 	rctl_val_t rv;
3055 
3056 	*nvlp = NULL;
3057 
3058 	if (buflen == 0)
3059 		return (0);
3060 
3061 	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
3062 		return (ENOMEM);
3063 	if (copyin(ubuf, kbuf, buflen)) {
3064 		error = EFAULT;
3065 		goto out;
3066 	}
3067 	if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
3068 		/*
3069 		 * nvl may have been allocated/free'd, but the value set to
3070 		 * non-NULL, so we reset it here.
3071 		 */
3072 		nvl = NULL;
3073 		error = EINVAL;
3074 		goto out;
3075 	}
3076 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
3077 		rctl_dict_entry_t *rde;
3078 		rctl_hndl_t hndl;
3079 		nvlist_t **nvlarray;
3080 		uint_t i, nelem;
3081 		char *name;
3082 
3083 		error = EINVAL;
3084 		name = nvpair_name(nvp);
3085 		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
3086 		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
3087 			goto out;
3088 		}
3089 		if ((hndl = rctl_hndl_lookup(name)) == -1) {
3090 			goto out;
3091 		}
3092 		rde = rctl_dict_lookup_hndl(hndl);
3093 		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
3094 		ASSERT(error == 0);
3095 		for (i = 0; i < nelem; i++) {
3096 			if (error = nvlist2rctlval(nvlarray[i], &rv))
3097 				goto out;
3098 		}
3099 		if (rctl_invalid_value(rde, &rv)) {
3100 			error = EINVAL;
3101 			goto out;
3102 		}
3103 	}
3104 	error = 0;
3105 	*nvlp = nvl;
3106 out:
3107 	kmem_free(kbuf, buflen);
3108 	if (error && nvl != NULL)
3109 		nvlist_free(nvl);
3110 	return (error);
3111 }
3112 
3113 int
3114 zone_create_error(int er_error, int er_ext, int *er_out) {
3115 	if (er_out != NULL) {
3116 		if (copyout(&er_ext, er_out, sizeof (int))) {
3117 			return (set_errno(EFAULT));
3118 		}
3119 	}
3120 	return (set_errno(er_error));
3121 }
3122 
3123 static int
3124 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
3125 {
3126 	ts_label_t *tsl;
3127 	bslabel_t blab;
3128 
3129 	/* Get label from user */
3130 	if (copyin(lab, &blab, sizeof (blab)) != 0)
3131 		return (EFAULT);
3132 	tsl = labelalloc(&blab, doi, KM_NOSLEEP);
3133 	if (tsl == NULL)
3134 		return (ENOMEM);
3135 
3136 	zone->zone_slabel = tsl;
3137 	return (0);
3138 }
3139 
3140 /*
3141  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
3142  */
3143 static int
3144 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
3145 {
3146 	char *kbuf;
3147 	char *dataset, *next;
3148 	zone_dataset_t *zd;
3149 	size_t len;
3150 
3151 	if (ubuf == NULL || buflen == 0)
3152 		return (0);
3153 
3154 	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
3155 		return (ENOMEM);
3156 
3157 	if (copyin(ubuf, kbuf, buflen) != 0) {
3158 		kmem_free(kbuf, buflen);
3159 		return (EFAULT);
3160 	}
3161 
3162 	dataset = next = kbuf;
3163 	for (;;) {
3164 		zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
3165 
3166 		next = strchr(dataset, ',');
3167 
3168 		if (next == NULL)
3169 			len = strlen(dataset);
3170 		else
3171 			len = next - dataset;
3172 
3173 		zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
3174 		bcopy(dataset, zd->zd_dataset, len);
3175 		zd->zd_dataset[len] = '\0';
3176 
3177 		list_insert_head(&zone->zone_datasets, zd);
3178 
3179 		if (next == NULL)
3180 			break;
3181 
3182 		dataset = next + 1;
3183 	}
3184 
3185 	kmem_free(kbuf, buflen);
3186 	return (0);
3187 }
3188 
3189 /*
3190  * System call to create/initialize a new zone named 'zone_name', rooted
3191  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
3192  * and initialized with the zone-wide rctls described in 'rctlbuf', and
3193  * with labeling set by 'match', 'doi', and 'label'.
3194  *
3195  * If extended error is non-null, we may use it to return more detailed
3196  * error information.
3197  */
3198 static zoneid_t
3199 zone_create(const char *zone_name, const char *zone_root,
3200     const priv_set_t *zone_privs, size_t zone_privssz,
3201     caddr_t rctlbuf, size_t rctlbufsz,
3202     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
3203     int match, uint32_t doi, const bslabel_t *label,
3204     int flags)
3205 {
3206 	struct zsched_arg zarg;
3207 	nvlist_t *rctls = NULL;
3208 	proc_t *pp = curproc;
3209 	zone_t *zone, *ztmp;
3210 	zoneid_t zoneid;
3211 	int error;
3212 	int error2 = 0;
3213 	char *str;
3214 	cred_t *zkcr;
3215 	boolean_t insert_label_hash;
3216 
3217 	if (secpolicy_zone_config(CRED()) != 0)
3218 		return (set_errno(EPERM));
3219 
3220 	/* can't boot zone from within chroot environment */
3221 	if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
3222 		return (zone_create_error(ENOTSUP, ZE_CHROOTED,
3223 		    extended_error));
3224 
3225 	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
3226 	zoneid = zone->zone_id = id_alloc(zoneid_space);
3227 	zone->zone_status = ZONE_IS_UNINITIALIZED;
3228 	zone->zone_pool = pool_default;
3229 	zone->zone_pool_mod = gethrtime();
3230 	zone->zone_psetid = ZONE_PS_INVAL;
3231 	zone->zone_ncpus = 0;
3232 	zone->zone_ncpus_online = 0;
3233 	zone->zone_restart_init = B_TRUE;
3234 	zone->zone_brand = &native_brand;
3235 	zone->zone_initname = NULL;
3236 	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
3237 	mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
3238 	mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
3239 	cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
3240 	list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
3241 	    offsetof(struct zsd_entry, zsd_linkage));
3242 	list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
3243 	    offsetof(zone_dataset_t, zd_linkage));
3244 	rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
3245 
3246 	if (flags & ZCF_NET_EXCL) {
3247 		zone->zone_flags |= ZF_NET_EXCL;
3248 	}
3249 
3250 	if ((error = zone_set_name(zone, zone_name)) != 0) {
3251 		zone_free(zone);
3252 		return (zone_create_error(error, 0, extended_error));
3253 	}
3254 
3255 	if ((error = zone_set_root(zone, zone_root)) != 0) {
3256 		zone_free(zone);
3257 		return (zone_create_error(error, 0, extended_error));
3258 	}
3259 	if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
3260 		zone_free(zone);
3261 		return (zone_create_error(error, 0, extended_error));
3262 	}
3263 
3264 	/* initialize node name to be the same as zone name */
3265 	zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
3266 	(void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
3267 	zone->zone_nodename[_SYS_NMLN - 1] = '\0';
3268 
3269 	zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
3270 	zone->zone_domain[0] = '\0';
3271 	zone->zone_shares = 1;
3272 	zone->zone_shmmax = 0;
3273 	zone->zone_ipc.ipcq_shmmni = 0;
3274 	zone->zone_ipc.ipcq_semmni = 0;
3275 	zone->zone_ipc.ipcq_msgmni = 0;
3276 	zone->zone_bootargs = NULL;
3277 	zone->zone_initname =
3278 	    kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
3279 	(void) strcpy(zone->zone_initname, zone_default_initname);
3280 	zone->zone_nlwps = 0;
3281 	zone->zone_nlwps_ctl = INT_MAX;
3282 	zone->zone_locked_mem = 0;
3283 	zone->zone_locked_mem_ctl = UINT64_MAX;
3284 	zone->zone_max_swap = 0;
3285 	zone->zone_max_swap_ctl = UINT64_MAX;
3286 	zone0.zone_lockedmem_kstat = NULL;
3287 	zone0.zone_swapresv_kstat = NULL;
3288 
3289 	/*
3290 	 * Zsched initializes the rctls.
3291 	 */
3292 	zone->zone_rctls = NULL;
3293 
3294 	if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
3295 		zone_free(zone);
3296 		return (zone_create_error(error, 0, extended_error));
3297 	}
3298 
3299 	if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
3300 		zone_free(zone);
3301 		return (set_errno(error));
3302 	}
3303 
3304 	/*
3305 	 * Read in the trusted system parameters:
3306 	 * match flag and sensitivity label.
3307 	 */
3308 	zone->zone_match = match;
3309 	if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
3310 		error = zone_set_label(zone, label, doi);
3311 		if (error != 0) {
3312 			zone_free(zone);
3313 			return (set_errno(error));
3314 		}
3315 		insert_label_hash = B_TRUE;
3316 	} else {
3317 		/* all zones get an admin_low label if system is not labeled */
3318 		zone->zone_slabel = l_admin_low;
3319 		label_hold(l_admin_low);
3320 		insert_label_hash = B_FALSE;
3321 	}
3322 
3323 	/*
3324 	 * Stop all lwps since that's what normally happens as part of fork().
3325 	 * This needs to happen before we grab any locks to avoid deadlock
3326 	 * (another lwp in the process could be waiting for the held lock).
3327 	 */
3328 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
3329 		zone_free(zone);
3330 		if (rctls)
3331 			nvlist_free(rctls);
3332 		return (zone_create_error(error, 0, extended_error));
3333 	}
3334 
3335 	if (block_mounts() == 0) {
3336 		mutex_enter(&pp->p_lock);
3337 		if (curthread != pp->p_agenttp)
3338 			continuelwps(pp);
3339 		mutex_exit(&pp->p_lock);
3340 		zone_free(zone);
3341 		if (rctls)
3342 			nvlist_free(rctls);
3343 		return (zone_create_error(error, 0, extended_error));
3344 	}
3345 
3346 	/*
3347 	 * Set up credential for kernel access.  After this, any errors
3348 	 * should go through the dance in errout rather than calling
3349 	 * zone_free directly.
3350 	 */
3351 	zone->zone_kcred = crdup(kcred);
3352 	crsetzone(zone->zone_kcred, zone);
3353 	priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
3354 	priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
3355 	priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
3356 	priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
3357 
3358 	mutex_enter(&zonehash_lock);
3359 	/*
3360 	 * Make sure zone doesn't already exist.
3361 	 *
3362 	 * If the system and zone are labeled,
3363 	 * make sure no other zone exists that has the same label.
3364 	 */
3365 	if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
3366 	    (insert_label_hash &&
3367 	    (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
3368 		zone_status_t status;
3369 
3370 		status = zone_status_get(ztmp);
3371 		if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
3372 			error = EEXIST;
3373 		else
3374 			error = EBUSY;
3375 		goto errout;
3376 	}
3377 
3378 	/*
3379 	 * Don't allow zone creations which would cause one zone's rootpath to
3380 	 * be accessible from that of another (non-global) zone.
3381 	 */
3382 	if (zone_is_nested(zone->zone_rootpath)) {
3383 		error = EBUSY;
3384 		goto errout;
3385 	}
3386 
3387 	ASSERT(zonecount != 0);		/* check for leaks */
3388 	if (zonecount + 1 > maxzones) {
3389 		error = ENOMEM;
3390 		goto errout;
3391 	}
3392 
3393 	if (zone_mount_count(zone->zone_rootpath) != 0) {
3394 		error = EBUSY;
3395 		error2 = ZE_AREMOUNTS;
3396 		goto errout;
3397 	}
3398 
3399 	/*
3400 	 * Zone is still incomplete, but we need to drop all locks while
3401 	 * zsched() initializes this zone's kernel process.  We
3402 	 * optimistically add the zone to the hashtable and associated
3403 	 * lists so a parallel zone_create() doesn't try to create the
3404 	 * same zone.
3405 	 */
3406 	zonecount++;
3407 	(void) mod_hash_insert(zonehashbyid,
3408 	    (mod_hash_key_t)(uintptr_t)zone->zone_id,
3409 	    (mod_hash_val_t)(uintptr_t)zone);
3410 	str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
3411 	(void) strcpy(str, zone->zone_name);
3412 	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
3413 	    (mod_hash_val_t)(uintptr_t)zone);
3414 	if (insert_label_hash) {
3415 		(void) mod_hash_insert(zonehashbylabel,
3416 		    (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
3417 		zone->zone_flags |= ZF_HASHED_LABEL;
3418 	}
3419 
3420 	/*
3421 	 * Insert into active list.  At this point there are no 'hold's
3422 	 * on the zone, but everyone else knows not to use it, so we can
3423 	 * continue to use it.  zsched() will do a zone_hold() if the
3424 	 * newproc() is successful.
3425 	 */
3426 	list_insert_tail(&zone_active, zone);
3427 	mutex_exit(&zonehash_lock);
3428 
3429 	zarg.zone = zone;
3430 	zarg.nvlist = rctls;
3431 	/*
3432 	 * The process, task, and project rctls are probably wrong;
3433 	 * we need an interface to get the default values of all rctls,
3434 	 * and initialize zsched appropriately.  I'm not sure that that
3435 	 * makes much of a difference, though.
3436 	 */
3437 	if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) {
3438 		/*
3439 		 * We need to undo all globally visible state.
3440 		 */
3441 		mutex_enter(&zonehash_lock);
3442 		list_remove(&zone_active, zone);
3443 		if (zone->zone_flags & ZF_HASHED_LABEL) {
3444 			ASSERT(zone->zone_slabel != NULL);
3445 			(void) mod_hash_destroy(zonehashbylabel,
3446 			    (mod_hash_key_t)zone->zone_slabel);
3447 		}
3448 		(void) mod_hash_destroy(zonehashbyname,
3449 		    (mod_hash_key_t)(uintptr_t)zone->zone_name);
3450 		(void) mod_hash_destroy(zonehashbyid,
3451 		    (mod_hash_key_t)(uintptr_t)zone->zone_id);
3452 		ASSERT(zonecount > 1);
3453 		zonecount--;
3454 		goto errout;
3455 	}
3456 
3457 	/*
3458 	 * Zone creation can't fail from now on.
3459 	 */
3460 
3461 	/*
3462 	 * Create zone kstats
3463 	 */
3464 	zone_kstat_create(zone);
3465 
3466 	/*
3467 	 * Let the other lwps continue.
3468 	 */
3469 	mutex_enter(&pp->p_lock);
3470 	if (curthread != pp->p_agenttp)
3471 		continuelwps(pp);
3472 	mutex_exit(&pp->p_lock);
3473 
3474 	/*
3475 	 * Wait for zsched to finish initializing the zone.
3476 	 */
3477 	zone_status_wait(zone, ZONE_IS_READY);
3478 	/*
3479 	 * The zone is fully visible, so we can let mounts progress.
3480 	 */
3481 	resume_mounts();
3482 	if (rctls)
3483 		nvlist_free(rctls);
3484 
3485 	return (zoneid);
3486 
3487 errout:
3488 	mutex_exit(&zonehash_lock);
3489 	/*
3490 	 * Let the other lwps continue.
3491 	 */
3492 	mutex_enter(&pp->p_lock);
3493 	if (curthread != pp->p_agenttp)
3494 		continuelwps(pp);
3495 	mutex_exit(&pp->p_lock);
3496 
3497 	resume_mounts();
3498 	if (rctls)
3499 		nvlist_free(rctls);
3500 	/*
3501 	 * There is currently one reference to the zone, a cred_ref from
3502 	 * zone_kcred.  To free the zone, we call crfree, which will call
3503 	 * zone_cred_rele, which will call zone_free.
3504 	 */
3505 	ASSERT(zone->zone_cred_ref == 1);	/* for zone_kcred */
3506 	ASSERT(zone->zone_kcred->cr_ref == 1);
3507 	ASSERT(zone->zone_ref == 0);
3508 	zkcr = zone->zone_kcred;
3509 	zone->zone_kcred = NULL;
3510 	crfree(zkcr);				/* triggers call to zone_free */
3511 	return (zone_create_error(error, error2, extended_error));
3512 }
3513 
3514 /*
3515  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
3516  * the heavy lifting.  initname is the path to the program to launch
3517  * at the "top" of the zone; if this is NULL, we use the system default,
3518  * which is stored at zone_default_initname.
3519  */
3520 static int
3521 zone_boot(zoneid_t zoneid)
3522 {
3523 	int err;
3524 	zone_t *zone;
3525 
3526 	if (secpolicy_zone_config(CRED()) != 0)
3527 		return (set_errno(EPERM));
3528 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3529 		return (set_errno(EINVAL));
3530 
3531 	mutex_enter(&zonehash_lock);
3532 	/*
3533 	 * Look for zone under hash lock to prevent races with calls to
3534 	 * zone_shutdown, zone_destroy, etc.
3535 	 */
3536 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3537 		mutex_exit(&zonehash_lock);
3538 		return (set_errno(EINVAL));
3539 	}
3540 
3541 	mutex_enter(&zone_status_lock);
3542 	if (zone_status_get(zone) != ZONE_IS_READY) {
3543 		mutex_exit(&zone_status_lock);
3544 		mutex_exit(&zonehash_lock);
3545 		return (set_errno(EINVAL));
3546 	}
3547 	zone_status_set(zone, ZONE_IS_BOOTING);
3548 	mutex_exit(&zone_status_lock);
3549 
3550 	zone_hold(zone);	/* so we can use the zone_t later */
3551 	mutex_exit(&zonehash_lock);
3552 
3553 	if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
3554 		zone_rele(zone);
3555 		return (set_errno(EINTR));
3556 	}
3557 
3558 	/*
3559 	 * Boot (starting init) might have failed, in which case the zone
3560 	 * will go to the SHUTTING_DOWN state; an appropriate errno will
3561 	 * be placed in zone->zone_boot_err, and so we return that.
3562 	 */
3563 	err = zone->zone_boot_err;
3564 	zone_rele(zone);
3565 	return (err ? set_errno(err) : 0);
3566 }
3567 
3568 /*
3569  * Kills all user processes in the zone, waiting for them all to exit
3570  * before returning.
3571  */
3572 static int
3573 zone_empty(zone_t *zone)
3574 {
3575 	int waitstatus;
3576 
3577 	/*
3578 	 * We need to drop zonehash_lock before killing all
3579 	 * processes, otherwise we'll deadlock with zone_find_*
3580 	 * which can be called from the exit path.
3581 	 */
3582 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
3583 	while ((waitstatus = zone_status_timedwait_sig(zone, lbolt + hz,
3584 	    ZONE_IS_EMPTY)) == -1) {
3585 		killall(zone->zone_id);
3586 	}
3587 	/*
3588 	 * return EINTR if we were signaled
3589 	 */
3590 	if (waitstatus == 0)
3591 		return (EINTR);
3592 	return (0);
3593 }
3594 
3595 /*
3596  * This function implements the policy for zone visibility.
3597  *
3598  * In standard Solaris, a non-global zone can only see itself.
3599  *
3600  * In Trusted Extensions, a labeled zone can lookup any zone whose label
3601  * it dominates. For this test, the label of the global zone is treated as
3602  * admin_high so it is special-cased instead of being checked for dominance.
3603  *
3604  * Returns true if zone attributes are viewable, false otherwise.
3605  */
3606 static boolean_t
3607 zone_list_access(zone_t *zone)
3608 {
3609 
3610 	if (curproc->p_zone == global_zone ||
3611 	    curproc->p_zone == zone) {
3612 		return (B_TRUE);
3613 	} else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
3614 		bslabel_t *curproc_label;
3615 		bslabel_t *zone_label;
3616 
3617 		curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
3618 		zone_label = label2bslabel(zone->zone_slabel);
3619 
3620 		if (zone->zone_id != GLOBAL_ZONEID &&
3621 		    bldominates(curproc_label, zone_label)) {
3622 			return (B_TRUE);
3623 		} else {
3624 			return (B_FALSE);
3625 		}
3626 	} else {
3627 		return (B_FALSE);
3628 	}
3629 }
3630 
3631 /*
3632  * Systemcall to start the zone's halt sequence.  By the time this
3633  * function successfully returns, all user processes and kernel threads
3634  * executing in it will have exited, ZSD shutdown callbacks executed,
3635  * and the zone status set to ZONE_IS_DOWN.
3636  *
3637  * It is possible that the call will interrupt itself if the caller is the
3638  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
3639  */
3640 static int
3641 zone_shutdown(zoneid_t zoneid)
3642 {
3643 	int error;
3644 	zone_t *zone;
3645 	zone_status_t status;
3646 
3647 	if (secpolicy_zone_config(CRED()) != 0)
3648 		return (set_errno(EPERM));
3649 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3650 		return (set_errno(EINVAL));
3651 
3652 	/*
3653 	 * Block mounts so that VFS_MOUNT() can get an accurate view of
3654 	 * the zone's status with regards to ZONE_IS_SHUTTING down.
3655 	 *
3656 	 * e.g. NFS can fail the mount if it determines that the zone
3657 	 * has already begun the shutdown sequence.
3658 	 */
3659 	if (block_mounts() == 0)
3660 		return (set_errno(EINTR));
3661 	mutex_enter(&zonehash_lock);
3662 	/*
3663 	 * Look for zone under hash lock to prevent races with other
3664 	 * calls to zone_shutdown and zone_destroy.
3665 	 */
3666 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3667 		mutex_exit(&zonehash_lock);
3668 		resume_mounts();
3669 		return (set_errno(EINVAL));
3670 	}
3671 	mutex_enter(&zone_status_lock);
3672 	status = zone_status_get(zone);
3673 	/*
3674 	 * Fail if the zone isn't fully initialized yet.
3675 	 */
3676 	if (status < ZONE_IS_READY) {
3677 		mutex_exit(&zone_status_lock);
3678 		mutex_exit(&zonehash_lock);
3679 		resume_mounts();
3680 		return (set_errno(EINVAL));
3681 	}
3682 	/*
3683 	 * If conditions required for zone_shutdown() to return have been met,
3684 	 * return success.
3685 	 */
3686 	if (status >= ZONE_IS_DOWN) {
3687 		mutex_exit(&zone_status_lock);
3688 		mutex_exit(&zonehash_lock);
3689 		resume_mounts();
3690 		return (0);
3691 	}
3692 	/*
3693 	 * If zone_shutdown() hasn't been called before, go through the motions.
3694 	 * If it has, there's nothing to do but wait for the kernel threads to
3695 	 * drain.
3696 	 */
3697 	if (status < ZONE_IS_EMPTY) {
3698 		uint_t ntasks;
3699 
3700 		mutex_enter(&zone->zone_lock);
3701 		if ((ntasks = zone->zone_ntasks) != 1) {
3702 			/*
3703 			 * There's still stuff running.
3704 			 */
3705 			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
3706 		}
3707 		mutex_exit(&zone->zone_lock);
3708 		if (ntasks == 1) {
3709 			/*
3710 			 * The only way to create another task is through
3711 			 * zone_enter(), which will block until we drop
3712 			 * zonehash_lock.  The zone is empty.
3713 			 */
3714 			if (zone->zone_kthreads == NULL) {
3715 				/*
3716 				 * Skip ahead to ZONE_IS_DOWN
3717 				 */
3718 				zone_status_set(zone, ZONE_IS_DOWN);
3719 			} else {
3720 				zone_status_set(zone, ZONE_IS_EMPTY);
3721 			}
3722 		}
3723 	}
3724 	zone_hold(zone);	/* so we can use the zone_t later */
3725 	mutex_exit(&zone_status_lock);
3726 	mutex_exit(&zonehash_lock);
3727 	resume_mounts();
3728 
3729 	if (error = zone_empty(zone)) {
3730 		zone_rele(zone);
3731 		return (set_errno(error));
3732 	}
3733 	/*
3734 	 * After the zone status goes to ZONE_IS_DOWN this zone will no
3735 	 * longer be notified of changes to the pools configuration, so
3736 	 * in order to not end up with a stale pool pointer, we point
3737 	 * ourselves at the default pool and remove all resource
3738 	 * visibility.  This is especially important as the zone_t may
3739 	 * languish on the deathrow for a very long time waiting for
3740 	 * cred's to drain out.
3741 	 *
3742 	 * This rebinding of the zone can happen multiple times
3743 	 * (presumably due to interrupted or parallel systemcalls)
3744 	 * without any adverse effects.
3745 	 */
3746 	if (pool_lock_intr() != 0) {
3747 		zone_rele(zone);
3748 		return (set_errno(EINTR));
3749 	}
3750 	if (pool_state == POOL_ENABLED) {
3751 		mutex_enter(&cpu_lock);
3752 		zone_pool_set(zone, pool_default);
3753 		/*
3754 		 * The zone no longer needs to be able to see any cpus.
3755 		 */
3756 		zone_pset_set(zone, ZONE_PS_INVAL);
3757 		mutex_exit(&cpu_lock);
3758 	}
3759 	pool_unlock();
3760 
3761 	/*
3762 	 * ZSD shutdown callbacks can be executed multiple times, hence
3763 	 * it is safe to not be holding any locks across this call.
3764 	 */
3765 	zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
3766 
3767 	mutex_enter(&zone_status_lock);
3768 	if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
3769 		zone_status_set(zone, ZONE_IS_DOWN);
3770 	mutex_exit(&zone_status_lock);
3771 
3772 	/*
3773 	 * Wait for kernel threads to drain.
3774 	 */
3775 	if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
3776 		zone_rele(zone);
3777 		return (set_errno(EINTR));
3778 	}
3779 
3780 	brand_unregister_zone(zone->zone_brand);
3781 
3782 	zone_rele(zone);
3783 	return (0);
3784 }
3785 
3786 /*
3787  * Systemcall entry point to finalize the zone halt process.  The caller
3788  * must have already successfully called zone_shutdown().
3789  *
3790  * Upon successful completion, the zone will have been fully destroyed:
3791  * zsched will have exited, destructor callbacks executed, and the zone
3792  * removed from the list of active zones.
3793  */
3794 static int
3795 zone_destroy(zoneid_t zoneid)
3796 {
3797 	uint64_t uniqid;
3798 	zone_t *zone;
3799 	zone_status_t status;
3800 
3801 	if (secpolicy_zone_config(CRED()) != 0)
3802 		return (set_errno(EPERM));
3803 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3804 		return (set_errno(EINVAL));
3805 
3806 	mutex_enter(&zonehash_lock);
3807 	/*
3808 	 * Look for zone under hash lock to prevent races with other
3809 	 * calls to zone_destroy.
3810 	 */
3811 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3812 		mutex_exit(&zonehash_lock);
3813 		return (set_errno(EINVAL));
3814 	}
3815 
3816 	if (zone_mount_count(zone->zone_rootpath) != 0) {
3817 		mutex_exit(&zonehash_lock);
3818 		return (set_errno(EBUSY));
3819 	}
3820 	mutex_enter(&zone_status_lock);
3821 	status = zone_status_get(zone);
3822 	if (status < ZONE_IS_DOWN) {
3823 		mutex_exit(&zone_status_lock);
3824 		mutex_exit(&zonehash_lock);
3825 		return (set_errno(EBUSY));
3826 	} else if (status == ZONE_IS_DOWN) {
3827 		zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
3828 	}
3829 	mutex_exit(&zone_status_lock);
3830 	zone_hold(zone);
3831 	mutex_exit(&zonehash_lock);
3832 
3833 	/*
3834 	 * wait for zsched to exit
3835 	 */
3836 	zone_status_wait(zone, ZONE_IS_DEAD);
3837 	zone_zsd_callbacks(zone, ZSD_DESTROY);
3838 	zone->zone_netstack = NULL;
3839 	uniqid = zone->zone_uniqid;
3840 	zone_rele(zone);
3841 	zone = NULL;	/* potentially free'd */
3842 
3843 	mutex_enter(&zonehash_lock);
3844 	for (; /* ever */; ) {
3845 		boolean_t unref;
3846 
3847 		if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
3848 		    zone->zone_uniqid != uniqid) {
3849 			/*
3850 			 * The zone has gone away.  Necessary conditions
3851 			 * are met, so we return success.
3852 			 */
3853 			mutex_exit(&zonehash_lock);
3854 			return (0);
3855 		}
3856 		mutex_enter(&zone->zone_lock);
3857 		unref = ZONE_IS_UNREF(zone);
3858 		mutex_exit(&zone->zone_lock);
3859 		if (unref) {
3860 			/*
3861 			 * There is only one reference to the zone -- that
3862 			 * added when the zone was added to the hashtables --
3863 			 * and things will remain this way until we drop
3864 			 * zonehash_lock... we can go ahead and cleanup the
3865 			 * zone.
3866 			 */
3867 			break;
3868 		}
3869 
3870 		if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
3871 			/* Signaled */
3872 			mutex_exit(&zonehash_lock);
3873 			return (set_errno(EINTR));
3874 		}
3875 
3876 	}
3877 
3878 	/* Get rid of the zone's kstats */
3879 	zone_kstat_delete(zone);
3880 
3881 	/*
3882 	 * It is now safe to let the zone be recreated; remove it from the
3883 	 * lists.  The memory will not be freed until the last cred
3884 	 * reference goes away.
3885 	 */
3886 	ASSERT(zonecount > 1);	/* must be > 1; can't destroy global zone */
3887 	zonecount--;
3888 	/* remove from active list and hash tables */
3889 	list_remove(&zone_active, zone);
3890 	(void) mod_hash_destroy(zonehashbyname,
3891 	    (mod_hash_key_t)zone->zone_name);
3892 	(void) mod_hash_destroy(zonehashbyid,
3893 	    (mod_hash_key_t)(uintptr_t)zone->zone_id);
3894 	if (zone->zone_flags & ZF_HASHED_LABEL)
3895 		(void) mod_hash_destroy(zonehashbylabel,
3896 		    (mod_hash_key_t)zone->zone_slabel);
3897 	mutex_exit(&zonehash_lock);
3898 
3899 	/*
3900 	 * Release the root vnode; we're not using it anymore.  Nor should any
3901 	 * other thread that might access it exist.
3902 	 */
3903 	if (zone->zone_rootvp != NULL) {
3904 		VN_RELE(zone->zone_rootvp);
3905 		zone->zone_rootvp = NULL;
3906 	}
3907 
3908 	/* add to deathrow list */
3909 	mutex_enter(&zone_deathrow_lock);
3910 	list_insert_tail(&zone_deathrow, zone);
3911 	mutex_exit(&zone_deathrow_lock);
3912 
3913 	/*
3914 	 * Drop last reference (which was added by zsched()), this will
3915 	 * free the zone unless there are outstanding cred references.
3916 	 */
3917 	zone_rele(zone);
3918 	return (0);
3919 }
3920 
3921 /*
3922  * Systemcall entry point for zone_getattr(2).
3923  */
3924 static ssize_t
3925 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
3926 {
3927 	size_t size;
3928 	int error = 0, err;
3929 	zone_t *zone;
3930 	char *zonepath;
3931 	char *outstr;
3932 	zone_status_t zone_status;
3933 	pid_t initpid;
3934 	boolean_t global = (curproc->p_zone == global_zone);
3935 	boolean_t curzone = (curproc->p_zone->zone_id == zoneid);
3936 	ushort_t flags;
3937 
3938 	mutex_enter(&zonehash_lock);
3939 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3940 		mutex_exit(&zonehash_lock);
3941 		return (set_errno(EINVAL));
3942 	}
3943 	zone_status = zone_status_get(zone);
3944 	if (zone_status < ZONE_IS_READY) {
3945 		mutex_exit(&zonehash_lock);
3946 		return (set_errno(EINVAL));
3947 	}
3948 	zone_hold(zone);
3949 	mutex_exit(&zonehash_lock);
3950 
3951 	/*
3952 	 * If not in the global zone, don't show information about other zones,
3953 	 * unless the system is labeled and the local zone's label dominates
3954 	 * the other zone.
3955 	 */
3956 	if (!zone_list_access(zone)) {
3957 		zone_rele(zone);
3958 		return (set_errno(EINVAL));
3959 	}
3960 
3961 	switch (attr) {
3962 	case ZONE_ATTR_ROOT:
3963 		if (global) {
3964 			/*
3965 			 * Copy the path to trim the trailing "/" (except for
3966 			 * the global zone).
3967 			 */
3968 			if (zone != global_zone)
3969 				size = zone->zone_rootpathlen - 1;
3970 			else
3971 				size = zone->zone_rootpathlen;
3972 			zonepath = kmem_alloc(size, KM_SLEEP);
3973 			bcopy(zone->zone_rootpath, zonepath, size);
3974 			zonepath[size - 1] = '\0';
3975 		} else {
3976 			if (curzone || !is_system_labeled()) {
3977 				/*
3978 				 * Caller is not in the global zone.
3979 				 * if the query is on the current zone
3980 				 * or the system is not labeled,
3981 				 * just return faked-up path for current zone.
3982 				 */
3983 				zonepath = "/";
3984 				size = 2;
3985 			} else {
3986 				/*
3987 				 * Return related path for current zone.
3988 				 */
3989 				int prefix_len = strlen(zone_prefix);
3990 				int zname_len = strlen(zone->zone_name);
3991 
3992 				size = prefix_len + zname_len + 1;
3993 				zonepath = kmem_alloc(size, KM_SLEEP);
3994 				bcopy(zone_prefix, zonepath, prefix_len);
3995 				bcopy(zone->zone_name, zonepath +
3996 				    prefix_len, zname_len);
3997 				zonepath[size - 1] = '\0';
3998 			}
3999 		}
4000 		if (bufsize > size)
4001 			bufsize = size;
4002 		if (buf != NULL) {
4003 			err = copyoutstr(zonepath, buf, bufsize, NULL);
4004 			if (err != 0 && err != ENAMETOOLONG)
4005 				error = EFAULT;
4006 		}
4007 		if (global || (is_system_labeled() && !curzone))
4008 			kmem_free(zonepath, size);
4009 		break;
4010 
4011 	case ZONE_ATTR_NAME:
4012 		size = strlen(zone->zone_name) + 1;
4013 		if (bufsize > size)
4014 			bufsize = size;
4015 		if (buf != NULL) {
4016 			err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
4017 			if (err != 0 && err != ENAMETOOLONG)
4018 				error = EFAULT;
4019 		}
4020 		break;
4021 
4022 	case ZONE_ATTR_STATUS:
4023 		/*
4024 		 * Since we're not holding zonehash_lock, the zone status
4025 		 * may be anything; leave it up to userland to sort it out.
4026 		 */
4027 		size = sizeof (zone_status);
4028 		if (bufsize > size)
4029 			bufsize = size;
4030 		zone_status = zone_status_get(zone);
4031 		if (buf != NULL &&
4032 		    copyout(&zone_status, buf, bufsize) != 0)
4033 			error = EFAULT;
4034 		break;
4035 	case ZONE_ATTR_FLAGS:
4036 		size = sizeof (zone->zone_flags);
4037 		if (bufsize > size)
4038 			bufsize = size;
4039 		flags = zone->zone_flags;
4040 		if (buf != NULL &&
4041 		    copyout(&flags, buf, bufsize) != 0)
4042 			error = EFAULT;
4043 		break;
4044 	case ZONE_ATTR_PRIVSET:
4045 		size = sizeof (priv_set_t);
4046 		if (bufsize > size)
4047 			bufsize = size;
4048 		if (buf != NULL &&
4049 		    copyout(zone->zone_privset, buf, bufsize) != 0)
4050 			error = EFAULT;
4051 		break;
4052 	case ZONE_ATTR_UNIQID:
4053 		size = sizeof (zone->zone_uniqid);
4054 		if (bufsize > size)
4055 			bufsize = size;
4056 		if (buf != NULL &&
4057 		    copyout(&zone->zone_uniqid, buf, bufsize) != 0)
4058 			error = EFAULT;
4059 		break;
4060 	case ZONE_ATTR_POOLID:
4061 		{
4062 			pool_t *pool;
4063 			poolid_t poolid;
4064 
4065 			if (pool_lock_intr() != 0) {
4066 				error = EINTR;
4067 				break;
4068 			}
4069 			pool = zone_pool_get(zone);
4070 			poolid = pool->pool_id;
4071 			pool_unlock();
4072 			size = sizeof (poolid);
4073 			if (bufsize > size)
4074 				bufsize = size;
4075 			if (buf != NULL && copyout(&poolid, buf, size) != 0)
4076 				error = EFAULT;
4077 		}
4078 		break;
4079 	case ZONE_ATTR_SLBL:
4080 		size = sizeof (bslabel_t);
4081 		if (bufsize > size)
4082 			bufsize = size;
4083 		if (zone->zone_slabel == NULL)
4084 			error = EINVAL;
4085 		else if (buf != NULL &&
4086 		    copyout(label2bslabel(zone->zone_slabel), buf,
4087 		    bufsize) != 0)
4088 			error = EFAULT;
4089 		break;
4090 	case ZONE_ATTR_INITPID:
4091 		size = sizeof (initpid);
4092 		if (bufsize > size)
4093 			bufsize = size;
4094 		initpid = zone->zone_proc_initpid;
4095 		if (initpid == -1) {
4096 			error = ESRCH;
4097 			break;
4098 		}
4099 		if (buf != NULL &&
4100 		    copyout(&initpid, buf, bufsize) != 0)
4101 			error = EFAULT;
4102 		break;
4103 	case ZONE_ATTR_BRAND:
4104 		size = strlen(zone->zone_brand->b_name) + 1;
4105 
4106 		if (bufsize > size)
4107 			bufsize = size;
4108 		if (buf != NULL) {
4109 			err = copyoutstr(zone->zone_brand->b_name, buf,
4110 			    bufsize, NULL);
4111 			if (err != 0 && err != ENAMETOOLONG)
4112 				error = EFAULT;
4113 		}
4114 		break;
4115 	case ZONE_ATTR_INITNAME:
4116 		size = strlen(zone->zone_initname) + 1;
4117 		if (bufsize > size)
4118 			bufsize = size;
4119 		if (buf != NULL) {
4120 			err = copyoutstr(zone->zone_initname, buf, bufsize,
4121 			    NULL);
4122 			if (err != 0 && err != ENAMETOOLONG)
4123 				error = EFAULT;
4124 		}
4125 		break;
4126 	case ZONE_ATTR_BOOTARGS:
4127 		if (zone->zone_bootargs == NULL)
4128 			outstr = "";
4129 		else
4130 			outstr = zone->zone_bootargs;
4131 		size = strlen(outstr) + 1;
4132 		if (bufsize > size)
4133 			bufsize = size;
4134 		if (buf != NULL) {
4135 			err = copyoutstr(outstr, buf, bufsize, NULL);
4136 			if (err != 0 && err != ENAMETOOLONG)
4137 				error = EFAULT;
4138 		}
4139 		break;
4140 	case ZONE_ATTR_PHYS_MCAP:
4141 		size = sizeof (zone->zone_phys_mcap);
4142 		if (bufsize > size)
4143 			bufsize = size;
4144 		if (buf != NULL &&
4145 		    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
4146 			error = EFAULT;
4147 		break;
4148 	case ZONE_ATTR_SCHED_CLASS:
4149 		mutex_enter(&class_lock);
4150 
4151 		if (zone->zone_defaultcid >= loaded_classes)
4152 			outstr = "";
4153 		else
4154 			outstr = sclass[zone->zone_defaultcid].cl_name;
4155 		size = strlen(outstr) + 1;
4156 		if (bufsize > size)
4157 			bufsize = size;
4158 		if (buf != NULL) {
4159 			err = copyoutstr(outstr, buf, bufsize, NULL);
4160 			if (err != 0 && err != ENAMETOOLONG)
4161 				error = EFAULT;
4162 		}
4163 
4164 		mutex_exit(&class_lock);
4165 		break;
4166 	default:
4167 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
4168 			size = bufsize;
4169 			error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
4170 		} else {
4171 			error = EINVAL;
4172 		}
4173 	}
4174 	zone_rele(zone);
4175 
4176 	if (error)
4177 		return (set_errno(error));
4178 	return ((ssize_t)size);
4179 }
4180 
4181 /*
4182  * Systemcall entry point for zone_setattr(2).
4183  */
4184 /*ARGSUSED*/
4185 static int
4186 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
4187 {
4188 	zone_t *zone;
4189 	zone_status_t zone_status;
4190 	struct brand_attr *attrp;
4191 	int err;
4192 
4193 	if (secpolicy_zone_config(CRED()) != 0)
4194 		return (set_errno(EPERM));
4195 
4196 	/*
4197 	 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
4198 	 * global zone.
4199 	 */
4200 	if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
4201 		return (set_errno(EINVAL));
4202 	}
4203 
4204 	mutex_enter(&zonehash_lock);
4205 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
4206 		mutex_exit(&zonehash_lock);
4207 		return (set_errno(EINVAL));
4208 	}
4209 	zone_hold(zone);
4210 	mutex_exit(&zonehash_lock);
4211 
4212 	/*
4213 	 * At present most attributes can only be set on non-running,
4214 	 * non-global zones.
4215 	 */
4216 	zone_status = zone_status_get(zone);
4217 	if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY)
4218 		goto done;
4219 
4220 	switch (attr) {
4221 	case ZONE_ATTR_INITNAME:
4222 		err = zone_set_initname(zone, (const char *)buf);
4223 		break;
4224 	case ZONE_ATTR_BOOTARGS:
4225 		err = zone_set_bootargs(zone, (const char *)buf);
4226 		break;
4227 	case ZONE_ATTR_BRAND:
4228 		ASSERT(!ZONE_IS_BRANDED(zone));
4229 		err = 0;
4230 		attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
4231 		if ((buf == NULL) ||
4232 		    (copyin(buf, attrp, sizeof (struct brand_attr)) != 0)) {
4233 			kmem_free(attrp, sizeof (struct brand_attr));
4234 			err = EFAULT;
4235 			break;
4236 		}
4237 
4238 		if (is_system_labeled() && strncmp(attrp->ba_brandname,
4239 		    NATIVE_BRAND_NAME, MAXNAMELEN) != 0) {
4240 			err = EPERM;
4241 			break;
4242 		}
4243 
4244 		zone->zone_brand = brand_register_zone(attrp);
4245 		kmem_free(attrp, sizeof (struct brand_attr));
4246 		if (zone->zone_brand == NULL)
4247 			err = EINVAL;
4248 		break;
4249 	case ZONE_ATTR_PHYS_MCAP:
4250 		err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
4251 		break;
4252 	case ZONE_ATTR_SCHED_CLASS:
4253 		err = zone_set_sched_class(zone, (const char *)buf);
4254 		break;
4255 	default:
4256 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
4257 			err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
4258 		else
4259 			err = EINVAL;
4260 	}
4261 
4262 done:
4263 	zone_rele(zone);
4264 	return (err != 0 ? set_errno(err) : 0);
4265 }
4266 
4267 /*
4268  * Return zero if the process has at least one vnode mapped in to its
4269  * address space which shouldn't be allowed to change zones.
4270  *
4271  * Also return zero if the process has any shared mappings which reserve
4272  * swap.  This is because the counting for zone.max-swap does not allow swap
4273  * revervation to be shared between zones.  zone swap reservation is counted
4274  * on zone->zone_max_swap.
4275  */
4276 static int
4277 as_can_change_zones(void)
4278 {
4279 	proc_t *pp = curproc;
4280 	struct seg *seg;
4281 	struct as *as = pp->p_as;
4282 	vnode_t *vp;
4283 	int allow = 1;
4284 
4285 	ASSERT(pp->p_as != &kas);
4286 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
4287 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
4288 
4289 		/*
4290 		 * Cannot enter zone with shared anon memory which
4291 		 * reserves swap.  See comment above.
4292 		 */
4293 		if (seg_can_change_zones(seg) == B_FALSE) {
4294 			allow = 0;
4295 			break;
4296 		}
4297 		/*
4298 		 * if we can't get a backing vnode for this segment then skip
4299 		 * it.
4300 		 */
4301 		vp = NULL;
4302 		if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
4303 			continue;
4304 		if (!vn_can_change_zones(vp)) { /* bail on first match */
4305 			allow = 0;
4306 			break;
4307 		}
4308 	}
4309 	AS_LOCK_EXIT(as, &as->a_lock);
4310 	return (allow);
4311 }
4312 
4313 /*
4314  * Count swap reserved by curproc's address space
4315  */
4316 static size_t
4317 as_swresv(void)
4318 {
4319 	proc_t *pp = curproc;
4320 	struct seg *seg;
4321 	struct as *as = pp->p_as;
4322 	size_t swap = 0;
4323 
4324 	ASSERT(pp->p_as != &kas);
4325 	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
4326 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
4327 		swap += seg_swresv(seg);
4328 
4329 	return (swap);
4330 }
4331 
4332 /*
4333  * Systemcall entry point for zone_enter().
4334  *
4335  * The current process is injected into said zone.  In the process
4336  * it will change its project membership, privileges, rootdir/cwd,
4337  * zone-wide rctls, and pool association to match those of the zone.
4338  *
4339  * The first zone_enter() called while the zone is in the ZONE_IS_READY
4340  * state will transition it to ZONE_IS_RUNNING.  Processes may only
4341  * enter a zone that is "ready" or "running".
4342  */
4343 static int
4344 zone_enter(zoneid_t zoneid)
4345 {
4346 	zone_t *zone;
4347 	vnode_t *vp;
4348 	proc_t *pp = curproc;
4349 	contract_t *ct;
4350 	cont_process_t *ctp;
4351 	task_t *tk, *oldtk;
4352 	kproject_t *zone_proj0;
4353 	cred_t *cr, *newcr;
4354 	pool_t *oldpool, *newpool;
4355 	sess_t *sp;
4356 	uid_t uid;
4357 	zone_status_t status;
4358 	int err = 0;
4359 	rctl_entity_p_t e;
4360 	size_t swap;
4361 
4362 	if (secpolicy_zone_config(CRED()) != 0)
4363 		return (set_errno(EPERM));
4364 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
4365 		return (set_errno(EINVAL));
4366 
4367 	/*
4368 	 * Stop all lwps so we don't need to hold a lock to look at
4369 	 * curproc->p_zone.  This needs to happen before we grab any
4370 	 * locks to avoid deadlock (another lwp in the process could
4371 	 * be waiting for the held lock).
4372 	 */
4373 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
4374 		return (set_errno(EINTR));
4375 
4376 	/*
4377 	 * Make sure we're not changing zones with files open or mapped in
4378 	 * to our address space which shouldn't be changing zones.
4379 	 */
4380 	if (!files_can_change_zones()) {
4381 		err = EBADF;
4382 		goto out;
4383 	}
4384 	if (!as_can_change_zones()) {
4385 		err = EFAULT;
4386 		goto out;
4387 	}
4388 
4389 	mutex_enter(&zonehash_lock);
4390 	if (pp->p_zone != global_zone) {
4391 		mutex_exit(&zonehash_lock);
4392 		err = EINVAL;
4393 		goto out;
4394 	}
4395 
4396 	zone = zone_find_all_by_id(zoneid);
4397 	if (zone == NULL) {
4398 		mutex_exit(&zonehash_lock);
4399 		err = EINVAL;
4400 		goto out;
4401 	}
4402 
4403 	/*
4404 	 * To prevent processes in a zone from holding contracts on
4405 	 * extrazonal resources, and to avoid process contract
4406 	 * memberships which span zones, contract holders and processes
4407 	 * which aren't the sole members of their encapsulating process
4408 	 * contracts are not allowed to zone_enter.
4409 	 */
4410 	ctp = pp->p_ct_process;
4411 	ct = &ctp->conp_contract;
4412 	mutex_enter(&ct->ct_lock);
4413 	mutex_enter(&pp->p_lock);
4414 	if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
4415 		mutex_exit(&pp->p_lock);
4416 		mutex_exit(&ct->ct_lock);
4417 		mutex_exit(&zonehash_lock);
4418 		pool_unlock();
4419 		err = EINVAL;
4420 		goto out;
4421 	}
4422 
4423 	/*
4424 	 * Moreover, we don't allow processes whose encapsulating
4425 	 * process contracts have inherited extrazonal contracts.
4426 	 * While it would be easier to eliminate all process contracts
4427 	 * with inherited contracts, we need to be able to give a
4428 	 * restarted init (or other zone-penetrating process) its
4429 	 * predecessor's contracts.
4430 	 */
4431 	if (ctp->conp_ninherited != 0) {
4432 		contract_t *next;
4433 		for (next = list_head(&ctp->conp_inherited); next;
4434 		    next = list_next(&ctp->conp_inherited, next)) {
4435 			if (contract_getzuniqid(next) != zone->zone_uniqid) {
4436 				mutex_exit(&pp->p_lock);
4437 				mutex_exit(&ct->ct_lock);
4438 				mutex_exit(&zonehash_lock);
4439 				pool_unlock();
4440 				err = EINVAL;
4441 				goto out;
4442 			}
4443 		}
4444 	}
4445 	mutex_exit(&pp->p_lock);
4446 	mutex_exit(&ct->ct_lock);
4447 
4448 	status = zone_status_get(zone);
4449 	if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
4450 		/*
4451 		 * Can't join
4452 		 */
4453 		mutex_exit(&zonehash_lock);
4454 		err = EINVAL;
4455 		goto out;
4456 	}
4457 
4458 	/*
4459 	 * Make sure new priv set is within the permitted set for caller
4460 	 */
4461 	if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
4462 		mutex_exit(&zonehash_lock);
4463 		err = EPERM;
4464 		goto out;
4465 	}
4466 	/*
4467 	 * We want to momentarily drop zonehash_lock while we optimistically
4468 	 * bind curproc to the pool it should be running in.  This is safe
4469 	 * since the zone can't disappear (we have a hold on it).
4470 	 */
4471 	zone_hold(zone);
4472 	mutex_exit(&zonehash_lock);
4473 
4474 	/*
4475 	 * Grab pool_lock to keep the pools configuration from changing
4476 	 * and to stop ourselves from getting rebound to another pool
4477 	 * until we join the zone.
4478 	 */
4479 	if (pool_lock_intr() != 0) {
4480 		zone_rele(zone);
4481 		err = EINTR;
4482 		goto out;
4483 	}
4484 	ASSERT(secpolicy_pool(CRED()) == 0);
4485 	/*
4486 	 * Bind ourselves to the pool currently associated with the zone.
4487 	 */
4488 	oldpool = curproc->p_pool;
4489 	newpool = zone_pool_get(zone);
4490 	if (pool_state == POOL_ENABLED && newpool != oldpool &&
4491 	    (err = pool_do_bind(newpool, P_PID, P_MYID,
4492 	    POOL_BIND_ALL)) != 0) {
4493 		pool_unlock();
4494 		zone_rele(zone);
4495 		goto out;
4496 	}
4497 
4498 	/*
4499 	 * Grab cpu_lock now; we'll need it later when we call
4500 	 * task_join().
4501 	 */
4502 	mutex_enter(&cpu_lock);
4503 	mutex_enter(&zonehash_lock);
4504 	/*
4505 	 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
4506 	 */
4507 	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
4508 		/*
4509 		 * Can't join anymore.
4510 		 */
4511 		mutex_exit(&zonehash_lock);
4512 		mutex_exit(&cpu_lock);
4513 		if (pool_state == POOL_ENABLED &&
4514 		    newpool != oldpool)
4515 			(void) pool_do_bind(oldpool, P_PID, P_MYID,
4516 			    POOL_BIND_ALL);
4517 		pool_unlock();
4518 		zone_rele(zone);
4519 		err = EINVAL;
4520 		goto out;
4521 	}
4522 
4523 	/*
4524 	 * a_lock must be held while transfering locked memory and swap
4525 	 * reservation from the global zone to the non global zone because
4526 	 * asynchronous faults on the processes' address space can lock
4527 	 * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
4528 	 * segments respectively.
4529 	 */
4530 	AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER);
4531 	swap = as_swresv();
4532 	mutex_enter(&pp->p_lock);
4533 	zone_proj0 = zone->zone_zsched->p_task->tk_proj;
4534 	/* verify that we do not exceed and task or lwp limits */
4535 	mutex_enter(&zone->zone_nlwps_lock);
4536 	/* add new lwps to zone and zone's proj0 */
4537 	zone_proj0->kpj_nlwps += pp->p_lwpcnt;
4538 	zone->zone_nlwps += pp->p_lwpcnt;
4539 	/* add 1 task to zone's proj0 */
4540 	zone_proj0->kpj_ntasks += 1;
4541 	mutex_exit(&zone->zone_nlwps_lock);
4542 
4543 	mutex_enter(&zone->zone_mem_lock);
4544 	zone->zone_locked_mem += pp->p_locked_mem;
4545 	zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
4546 	zone->zone_max_swap += swap;
4547 	mutex_exit(&zone->zone_mem_lock);
4548 
4549 	/* remove lwps from proc's old zone and old project */
4550 	mutex_enter(&pp->p_zone->zone_nlwps_lock);
4551 	pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
4552 	pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
4553 	mutex_exit(&pp->p_zone->zone_nlwps_lock);
4554 
4555 	mutex_enter(&pp->p_zone->zone_mem_lock);
4556 	pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
4557 	pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
4558 	pp->p_zone->zone_max_swap -= swap;
4559 	mutex_exit(&pp->p_zone->zone_mem_lock);
4560 
4561 	mutex_exit(&pp->p_lock);
4562 	AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock);
4563 
4564 	/*
4565 	 * Joining the zone cannot fail from now on.
4566 	 *
4567 	 * This means that a lot of the following code can be commonized and
4568 	 * shared with zsched().
4569 	 */
4570 
4571 	/*
4572 	 * Reset the encapsulating process contract's zone.
4573 	 */
4574 	ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
4575 	contract_setzuniqid(ct, zone->zone_uniqid);
4576 
4577 	/*
4578 	 * Create a new task and associate the process with the project keyed
4579 	 * by (projid,zoneid).
4580 	 *
4581 	 * We might as well be in project 0; the global zone's projid doesn't
4582 	 * make much sense in a zone anyhow.
4583 	 *
4584 	 * This also increments zone_ntasks, and returns with p_lock held.
4585 	 */
4586 	tk = task_create(0, zone);
4587 	oldtk = task_join(tk, 0);
4588 	mutex_exit(&cpu_lock);
4589 
4590 	pp->p_flag |= SZONETOP;
4591 	pp->p_zone = zone;
4592 
4593 	/*
4594 	 * call RCTLOP_SET functions on this proc
4595 	 */
4596 	e.rcep_p.zone = zone;
4597 	e.rcep_t = RCENTITY_ZONE;
4598 	(void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
4599 	    RCD_CALLBACK);
4600 	mutex_exit(&pp->p_lock);
4601 
4602 	/*
4603 	 * We don't need to hold any of zsched's locks here; not only do we know
4604 	 * the process and zone aren't going away, we know its session isn't
4605 	 * changing either.
4606 	 *
4607 	 * By joining zsched's session here, we mimic the behavior in the
4608 	 * global zone of init's sid being the pid of sched.  We extend this
4609 	 * to all zlogin-like zone_enter()'ing processes as well.
4610 	 */
4611 	mutex_enter(&pidlock);
4612 	sp = zone->zone_zsched->p_sessp;
4613 	sess_hold(zone->zone_zsched);
4614 	mutex_enter(&pp->p_lock);
4615 	pgexit(pp);
4616 	sess_rele(pp->p_sessp, B_TRUE);
4617 	pp->p_sessp = sp;
4618 	pgjoin(pp, zone->zone_zsched->p_pidp);
4619 
4620 	/*
4621 	 * If there is a default scheduling class for the zone and it is not
4622 	 * the class we are currently in, change all of the threads in the
4623 	 * process to the new class.  We need to be holding pidlock & p_lock
4624 	 * when we call parmsset so this is a good place to do it.
4625 	 */
4626 	if (zone->zone_defaultcid > 0 &&
4627 	    zone->zone_defaultcid != curthread->t_cid) {
4628 		pcparms_t pcparms;
4629 		kthread_id_t t;
4630 
4631 		pcparms.pc_cid = zone->zone_defaultcid;
4632 		pcparms.pc_clparms[0] = 0;
4633 
4634 		/*
4635 		 * If setting the class fails, we still want to enter the zone.
4636 		 */
4637 		if ((t = pp->p_tlist) != NULL) {
4638 			do {
4639 				(void) parmsset(&pcparms, t);
4640 			} while ((t = t->t_forw) != pp->p_tlist);
4641 		}
4642 	}
4643 
4644 	mutex_exit(&pp->p_lock);
4645 	mutex_exit(&pidlock);
4646 
4647 	mutex_exit(&zonehash_lock);
4648 	/*
4649 	 * We're firmly in the zone; let pools progress.
4650 	 */
4651 	pool_unlock();
4652 	task_rele(oldtk);
4653 	/*
4654 	 * We don't need to retain a hold on the zone since we already
4655 	 * incremented zone_ntasks, so the zone isn't going anywhere.
4656 	 */
4657 	zone_rele(zone);
4658 
4659 	/*
4660 	 * Chroot
4661 	 */
4662 	vp = zone->zone_rootvp;
4663 	zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
4664 	zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
4665 
4666 	/*
4667 	 * Change process credentials
4668 	 */
4669 	newcr = cralloc();
4670 	mutex_enter(&pp->p_crlock);
4671 	cr = pp->p_cred;
4672 	crcopy_to(cr, newcr);
4673 	crsetzone(newcr, zone);
4674 	pp->p_cred = newcr;
4675 
4676 	/*
4677 	 * Restrict all process privilege sets to zone limit
4678 	 */
4679 	priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
4680 	priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
4681 	priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
4682 	priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
4683 	mutex_exit(&pp->p_crlock);
4684 	crset(pp, newcr);
4685 
4686 	/*
4687 	 * Adjust upcount to reflect zone entry.
4688 	 */
4689 	uid = crgetruid(newcr);
4690 	mutex_enter(&pidlock);
4691 	upcount_dec(uid, GLOBAL_ZONEID);
4692 	upcount_inc(uid, zoneid);
4693 	mutex_exit(&pidlock);
4694 
4695 	/*
4696 	 * Set up core file path and content.
4697 	 */
4698 	set_core_defaults();
4699 
4700 out:
4701 	/*
4702 	 * Let the other lwps continue.
4703 	 */
4704 	mutex_enter(&pp->p_lock);
4705 	if (curthread != pp->p_agenttp)
4706 		continuelwps(pp);
4707 	mutex_exit(&pp->p_lock);
4708 
4709 	return (err != 0 ? set_errno(err) : 0);
4710 }
4711 
4712 /*
4713  * Systemcall entry point for zone_list(2).
4714  *
4715  * Processes running in a (non-global) zone only see themselves.
4716  * On labeled systems, they see all zones whose label they dominate.
4717  */
4718 static int
4719 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
4720 {
4721 	zoneid_t *zoneids;
4722 	zone_t *zone, *myzone;
4723 	uint_t user_nzones, real_nzones;
4724 	uint_t domi_nzones;
4725 	int error;
4726 
4727 	if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
4728 		return (set_errno(EFAULT));
4729 
4730 	myzone = curproc->p_zone;
4731 	if (myzone != global_zone) {
4732 		bslabel_t *mybslab;
4733 
4734 		if (!is_system_labeled()) {
4735 			/* just return current zone */
4736 			real_nzones = domi_nzones = 1;
4737 			zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
4738 			zoneids[0] = myzone->zone_id;
4739 		} else {
4740 			/* return all zones that are dominated */
4741 			mutex_enter(&zonehash_lock);
4742 			real_nzones = zonecount;
4743 			domi_nzones = 0;
4744 			if (real_nzones > 0) {
4745 				zoneids = kmem_alloc(real_nzones *
4746 				    sizeof (zoneid_t), KM_SLEEP);
4747 				mybslab = label2bslabel(myzone->zone_slabel);
4748 				for (zone = list_head(&zone_active);
4749 				    zone != NULL;
4750 				    zone = list_next(&zone_active, zone)) {
4751 					if (zone->zone_id == GLOBAL_ZONEID)
4752 						continue;
4753 					if (zone != myzone &&
4754 					    (zone->zone_flags & ZF_IS_SCRATCH))
4755 						continue;
4756 					/*
4757 					 * Note that a label always dominates
4758 					 * itself, so myzone is always included
4759 					 * in the list.
4760 					 */
4761 					if (bldominates(mybslab,
4762 					    label2bslabel(zone->zone_slabel))) {
4763 						zoneids[domi_nzones++] =
4764 						    zone->zone_id;
4765 					}
4766 				}
4767 			}
4768 			mutex_exit(&zonehash_lock);
4769 		}
4770 	} else {
4771 		mutex_enter(&zonehash_lock);
4772 		real_nzones = zonecount;
4773 		domi_nzones = 0;
4774 		if (real_nzones > 0) {
4775 			zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
4776 			    KM_SLEEP);
4777 			for (zone = list_head(&zone_active); zone != NULL;
4778 			    zone = list_next(&zone_active, zone))
4779 				zoneids[domi_nzones++] = zone->zone_id;
4780 			ASSERT(domi_nzones == real_nzones);
4781 		}
4782 		mutex_exit(&zonehash_lock);
4783 	}
4784 
4785 	/*
4786 	 * If user has allocated space for fewer entries than we found, then
4787 	 * return only up to his limit.  Either way, tell him exactly how many
4788 	 * we found.
4789 	 */
4790 	if (domi_nzones < user_nzones)
4791 		user_nzones = domi_nzones;
4792 	error = 0;
4793 	if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
4794 		error = EFAULT;
4795 	} else if (zoneidlist != NULL && user_nzones != 0) {
4796 		if (copyout(zoneids, zoneidlist,
4797 		    user_nzones * sizeof (zoneid_t)) != 0)
4798 			error = EFAULT;
4799 	}
4800 
4801 	if (real_nzones > 0)
4802 		kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
4803 
4804 	if (error != 0)
4805 		return (set_errno(error));
4806 	else
4807 		return (0);
4808 }
4809 
4810 /*
4811  * Systemcall entry point for zone_lookup(2).
4812  *
4813  * Non-global zones are only able to see themselves and (on labeled systems)
4814  * the zones they dominate.
4815  */
4816 static zoneid_t
4817 zone_lookup(const char *zone_name)
4818 {
4819 	char *kname;
4820 	zone_t *zone;
4821 	zoneid_t zoneid;
4822 	int err;
4823 
4824 	if (zone_name == NULL) {
4825 		/* return caller's zone id */
4826 		return (getzoneid());
4827 	}
4828 
4829 	kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
4830 	if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
4831 		kmem_free(kname, ZONENAME_MAX);
4832 		return (set_errno(err));
4833 	}
4834 
4835 	mutex_enter(&zonehash_lock);
4836 	zone = zone_find_all_by_name(kname);
4837 	kmem_free(kname, ZONENAME_MAX);
4838 	/*
4839 	 * In a non-global zone, can only lookup global and own name.
4840 	 * In Trusted Extensions zone label dominance rules apply.
4841 	 */
4842 	if (zone == NULL ||
4843 	    zone_status_get(zone) < ZONE_IS_READY ||
4844 	    !zone_list_access(zone)) {
4845 		mutex_exit(&zonehash_lock);
4846 		return (set_errno(EINVAL));
4847 	} else {
4848 		zoneid = zone->zone_id;
4849 		mutex_exit(&zonehash_lock);
4850 		return (zoneid);
4851 	}
4852 }
4853 
4854 static int
4855 zone_version(int *version_arg)
4856 {
4857 	int version = ZONE_SYSCALL_API_VERSION;
4858 
4859 	if (copyout(&version, version_arg, sizeof (int)) != 0)
4860 		return (set_errno(EFAULT));
4861 	return (0);
4862 }
4863 
4864 /* ARGSUSED */
4865 long
4866 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
4867 {
4868 	zone_def zs;
4869 
4870 	switch (cmd) {
4871 	case ZONE_CREATE:
4872 		if (get_udatamodel() == DATAMODEL_NATIVE) {
4873 			if (copyin(arg1, &zs, sizeof (zone_def))) {
4874 				return (set_errno(EFAULT));
4875 			}
4876 		} else {
4877 #ifdef _SYSCALL32_IMPL
4878 			zone_def32 zs32;
4879 
4880 			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
4881 				return (set_errno(EFAULT));
4882 			}
4883 			zs.zone_name =
4884 			    (const char *)(unsigned long)zs32.zone_name;
4885 			zs.zone_root =
4886 			    (const char *)(unsigned long)zs32.zone_root;
4887 			zs.zone_privs =
4888 			    (const struct priv_set *)
4889 			    (unsigned long)zs32.zone_privs;
4890 			zs.zone_privssz = zs32.zone_privssz;
4891 			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
4892 			zs.rctlbufsz = zs32.rctlbufsz;
4893 			zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
4894 			zs.zfsbufsz = zs32.zfsbufsz;
4895 			zs.extended_error =
4896 			    (int *)(unsigned long)zs32.extended_error;
4897 			zs.match = zs32.match;
4898 			zs.doi = zs32.doi;
4899 			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
4900 			zs.flags = zs32.flags;
4901 #else
4902 			panic("get_udatamodel() returned bogus result\n");
4903 #endif
4904 		}
4905 
4906 		return (zone_create(zs.zone_name, zs.zone_root,
4907 		    zs.zone_privs, zs.zone_privssz,
4908 		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,
4909 		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,
4910 		    zs.extended_error, zs.match, zs.doi,
4911 		    zs.label, zs.flags));
4912 	case ZONE_BOOT:
4913 		return (zone_boot((zoneid_t)(uintptr_t)arg1));
4914 	case ZONE_DESTROY:
4915 		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
4916 	case ZONE_GETATTR:
4917 		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
4918 		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
4919 	case ZONE_SETATTR:
4920 		return (zone_setattr((zoneid_t)(uintptr_t)arg1,
4921 		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
4922 	case ZONE_ENTER:
4923 		return (zone_enter((zoneid_t)(uintptr_t)arg1));
4924 	case ZONE_LIST:
4925 		return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
4926 	case ZONE_SHUTDOWN:
4927 		return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
4928 	case ZONE_LOOKUP:
4929 		return (zone_lookup((const char *)arg1));
4930 	case ZONE_VERSION:
4931 		return (zone_version((int *)arg1));
4932 	case ZONE_ADD_DATALINK:
4933 		return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
4934 		    (char *)arg2));
4935 	case ZONE_DEL_DATALINK:
4936 		return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
4937 		    (char *)arg2));
4938 	case ZONE_CHECK_DATALINK:
4939 		return (zone_check_datalink((zoneid_t *)arg1, (char *)arg2));
4940 	case ZONE_LIST_DATALINK:
4941 		return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
4942 		    (int *)arg2, (char *)arg3));
4943 	default:
4944 		return (set_errno(EINVAL));
4945 	}
4946 }
4947 
4948 struct zarg {	/* hand-off arguments read and freed by zone_ki_call_zoneadmd() */
4949 	zone_t *zone;		/* zone to empty; the consumer drops one hold on it */
4950 	zone_cmd_arg_t arg;	/* request used as the door upcall data/result buffer */
4951 };
4952 
4953 static int
4954 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
4955 {
4956 	char *buf;
4957 	size_t buflen;
4958 	int error;
4959 
4960 	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
4961 	buf = kmem_alloc(buflen, KM_SLEEP);
4962 	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
4963 	error = door_ki_open(buf, doorp);
4964 	kmem_free(buf, buflen);
4965 	return (error);
4966 }
4967 
4968 static void
4969 zone_release_door(door_handle_t *doorp)
4970 {
4971 	door_ki_rele(*doorp);
4972 	*doorp = NULL;
4973 }
4974 
4975 static void
4976 zone_ki_call_zoneadmd(struct zarg *zargp)
4977 {
4978 	door_handle_t door = NULL;
4979 	door_arg_t darg, save_arg;
4980 	char *zone_name;
4981 	size_t zone_namelen;
4982 	zoneid_t zoneid;
4983 	zone_t *zone;
4984 	zone_cmd_arg_t arg;
4985 	uint64_t uniqid;
4986 	size_t size;
4987 	int error;
4988 	int retry;
4989 
4990 	zone = zargp->zone;
4991 	arg = zargp->arg;
4992 	kmem_free(zargp, sizeof (*zargp));
4993 
4994 	zone_namelen = strlen(zone->zone_name) + 1;
4995 	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
4996 	bcopy(zone->zone_name, zone_name, zone_namelen);
4997 	zoneid = zone->zone_id;
4998 	uniqid = zone->zone_uniqid;
4999 	/*
5000 	 * zoneadmd may be down, but at least we can empty out the zone.
5001 	 * We can ignore the return value of zone_empty() since we're called
5002 	 * from a kernel thread and know we won't be delivered any signals.
5003 	 */
5004 	ASSERT(curproc == &p0);
5005 	(void) zone_empty(zone);
5006 	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
5007 	zone_rele(zone);
5008 
5009 	size = sizeof (arg);
5010 	darg.rbuf = (char *)&arg;
5011 	darg.data_ptr = (char *)&arg;
5012 	darg.rsize = size;
5013 	darg.data_size = size;
5014 	darg.desc_ptr = NULL;
5015 	darg.desc_num = 0;
5016 
5017 	save_arg = darg;
5018 	/*
5019 	 * Since we're not holding a reference to the zone, any number of
5020 	 * things can go wrong, including the zone disappearing before we get a
5021 	 * chance to talk to zoneadmd.
5022 	 */
5023 	for (retry = 0; /* forever */; retry++) {
5024 		if (door == NULL &&
5025 		    (error = zone_lookup_door(zone_name, &door)) != 0) {
5026 			goto next;
5027 		}
5028 		ASSERT(door != NULL);
5029 
5030 		if ((error = door_ki_upcall(door, &darg)) == 0) {
5031 			break;
5032 		}
5033 		switch (error) {
5034 		case EINTR:
5035 			/* FALLTHROUGH */
5036 		case EAGAIN:	/* process may be forking */
5037 			/*
5038 			 * Back off for a bit
5039 			 */
5040 			break;
5041 		case EBADF:
5042 			zone_release_door(&door);
5043 			if (zone_lookup_door(zone_name, &door) != 0) {
5044 				/*
5045 				 * zoneadmd may be dead, but it may come back to
5046 				 * life later.
5047 				 */
5048 				break;
5049 			}
5050 			break;
5051 		default:
5052 			cmn_err(CE_WARN,
5053 			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
5054 			    error);
5055 			goto out;
5056 		}
5057 next:
5058 		/*
5059 		 * If this isn't the same zone_t that we originally had in mind,
5060 		 * then this is the same as if two kadmin requests come in at
5061 		 * the same time: the first one wins.  This means we lose, so we
5062 		 * bail.
5063 		 */
5064 		if ((zone = zone_find_by_id(zoneid)) == NULL) {
5065 			/*
5066 			 * Problem is solved.
5067 			 */
5068 			break;
5069 		}
5070 		if (zone->zone_uniqid != uniqid) {
5071 			/*
5072 			 * zoneid recycled
5073 			 */
5074 			zone_rele(zone);
5075 			break;
5076 		}
5077 		/*
5078 		 * We could zone_status_timedwait(), but there doesn't seem to
5079 		 * be much point in doing that (plus, it would mean that
5080 		 * zone_free() isn't called until this thread exits).
5081 		 */
5082 		zone_rele(zone);
5083 		delay(hz);
5084 		darg = save_arg;
5085 	}
5086 out:
5087 	if (door != NULL) {
5088 		zone_release_door(&door);
5089 	}
5090 	kmem_free(zone_name, zone_namelen);
5091 	thread_exit();
5092 }
5093 
5094 /*
5095  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
5096  * kadmin().  The caller is a process in the zone.
5097  *
5098  * In order to shutdown the zone, we will hand off control to zoneadmd
5099  * (running in the global zone) via a door.  We do a half-hearted job at
5100  * killing all processes in the zone, create a kernel thread to contact
5101  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
5102  * a form of generation number used to let zoneadmd (as well as
5103  * zone_destroy()) know exactly which zone they're re talking about.
5104  */
5105 int
5106 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
5107 {
5108 	struct zarg *zargp;
5109 	zone_cmd_t zcmd;
5110 	zone_t *zone;
5111 
5112 	zone = curproc->p_zone;
5113 	ASSERT(getzoneid() != GLOBAL_ZONEID);
5114 
5115 	switch (cmd) {
5116 	case A_SHUTDOWN:
5117 		switch (fcn) {
5118 		case AD_HALT:
5119 		case AD_POWEROFF:
5120 			zcmd = Z_HALT;
5121 			break;
5122 		case AD_BOOT:
5123 			zcmd = Z_REBOOT;
5124 			break;
5125 		case AD_IBOOT:
5126 		case AD_SBOOT:
5127 		case AD_SIBOOT:
5128 		case AD_NOSYNC:
5129 			return (ENOTSUP);
5130 		default:
5131 			return (EINVAL);
5132 		}
5133 		break;
5134 	case A_REBOOT:
5135 		zcmd = Z_REBOOT;
5136 		break;
5137 	case A_FTRACE:
5138 	case A_REMOUNT:
5139 	case A_FREEZE:
5140 	case A_DUMP:
5141 		return (ENOTSUP);
5142 	default:
5143 		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
5144 		return (EINVAL);
5145 	}
5146 
5147 	if (secpolicy_zone_admin(credp, B_FALSE))
5148 		return (EPERM);
5149 	mutex_enter(&zone_status_lock);
5150 
5151 	/*
5152 	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
5153 	 * is in the zone.
5154 	 */
5155 	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
5156 	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
5157 		/*
5158 		 * This zone is already on its way down.
5159 		 */
5160 		mutex_exit(&zone_status_lock);
5161 		return (0);
5162 	}
5163 	/*
5164 	 * Prevent future zone_enter()s
5165 	 */
5166 	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
5167 	mutex_exit(&zone_status_lock);
5168 
5169 	/*
5170 	 * Kill everyone now and call zoneadmd later.
5171 	 * zone_ki_call_zoneadmd() will do a more thorough job of this
5172 	 * later.
5173 	 */
5174 	killall(zone->zone_id);
5175 	/*
5176 	 * Now, create the thread to contact zoneadmd and do the rest of the
5177 	 * work.  This thread can't be created in our zone otherwise
5178 	 * zone_destroy() would deadlock.
5179 	 */
5180 	zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
5181 	zargp->arg.cmd = zcmd;
5182 	zargp->arg.uniqid = zone->zone_uniqid;
5183 	zargp->zone = zone;
5184 	(void) strcpy(zargp->arg.locale, "C");
5185 	/* mdep was already copied in for us by uadmin */
5186 	if (mdep != NULL)
5187 		(void) strlcpy(zargp->arg.bootbuf, mdep,
5188 		    sizeof (zargp->arg.bootbuf));
5189 	zone_hold(zone);
5190 
5191 	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
5192 	    TS_RUN, minclsyspri);
5193 	exit(CLD_EXITED, 0);
5194 
5195 	return (EINVAL);
5196 }
5197 
5198 /*
5199  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
5200  * status to ZONE_IS_SHUTTING_DOWN.
5201  */
5202 void
5203 zone_shutdown_global(void)
5204 {
5205 	ASSERT(curproc->p_zone == global_zone);
5206 
5207 	mutex_enter(&zone_status_lock);
5208 	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
5209 	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
5210 	mutex_exit(&zone_status_lock);
5211 }
5212 
5213 /*
5214  * Returns true if the named dataset is visible in the current zone.
5215  * The 'write' parameter is set to 1 if the dataset is also writable.
5216  */
5217 int
5218 zone_dataset_visible(const char *dataset, int *write)
5219 {
5220 	zone_dataset_t *zd;
5221 	size_t len;
5222 	zone_t *zone = curproc->p_zone;
5223 
5224 	if (dataset[0] == '\0')
5225 		return (0);
5226 
5227 	/*
5228 	 * Walk the list once, looking for datasets which match exactly, or
5229 	 * specify a dataset underneath an exported dataset.  If found, return
5230 	 * true and note that it is writable.
5231 	 */
5232 	for (zd = list_head(&zone->zone_datasets); zd != NULL;
5233 	    zd = list_next(&zone->zone_datasets, zd)) {
5234 
5235 		len = strlen(zd->zd_dataset);
5236 		if (strlen(dataset) >= len &&
5237 		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
5238 		    (dataset[len] == '\0' || dataset[len] == '/' ||
5239 		    dataset[len] == '@')) {
5240 			if (write)
5241 				*write = 1;
5242 			return (1);
5243 		}
5244 	}
5245 
5246 	/*
5247 	 * Walk the list a second time, searching for datasets which are parents
5248 	 * of exported datasets.  These should be visible, but read-only.
5249 	 *
5250 	 * Note that we also have to support forms such as 'pool/dataset/', with
5251 	 * a trailing slash.
5252 	 */
5253 	for (zd = list_head(&zone->zone_datasets); zd != NULL;
5254 	    zd = list_next(&zone->zone_datasets, zd)) {
5255 
5256 		len = strlen(dataset);
5257 		if (dataset[len - 1] == '/')
5258 			len--;	/* Ignore trailing slash */
5259 		if (len < strlen(zd->zd_dataset) &&
5260 		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
5261 		    zd->zd_dataset[len] == '/') {
5262 			if (write)
5263 				*write = 0;
5264 			return (1);
5265 		}
5266 	}
5267 
5268 	return (0);
5269 }
5270 
5271 /*
5272  * zone_find_by_any_path() -
5273  *
5274  * kernel-private routine similar to zone_find_by_path(), but which
5275  * effectively compares against zone paths rather than zonerootpath
5276  * (i.e., the last component of zonerootpaths, which should be "root/",
5277  * are not compared.)  This is done in order to accurately identify all
5278  * paths, whether zone-visible or not, including those which are parallel
5279  * to /root/, such as /dev/, /home/, etc...
5280  *
5281  * If the specified path does not fall under any zone path then global
5282  * zone is returned.
5283  *
5284  * The treat_abs parameter indicates whether the path should be treated as
5285  * an absolute path although it does not begin with "/".  (This supports
5286  * nfs mount syntax such as host:any/path.)
5287  *
5288  * The caller is responsible for zone_rele of the returned zone.
5289  */
5290 zone_t *
5291 zone_find_by_any_path(const char *path, boolean_t treat_abs)
5292 {
5293 	zone_t *zone;
5294 	int path_offset = 0;
5295 
5296 	if (path == NULL) {
5297 		zone_hold(global_zone);
5298 		return (global_zone);
5299 	}
5300 
5301 	if (*path != '/') {
5302 		ASSERT(treat_abs);
5303 		path_offset = 1;
5304 	}
5305 
5306 	mutex_enter(&zonehash_lock);
5307 	for (zone = list_head(&zone_active); zone != NULL;
5308 	    zone = list_next(&zone_active, zone)) {
5309 		char	*c;
5310 		size_t	pathlen;
5311 		char *rootpath_start;
5312 
5313 		if (zone == global_zone)	/* skip global zone */
5314 			continue;
5315 
5316 		/* scan backwards to find start of last component */
5317 		c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
5318 		do {
5319 			c--;
5320 		} while (*c != '/');
5321 
5322 		pathlen = c - zone->zone_rootpath + 1 - path_offset;
5323 		rootpath_start = (zone->zone_rootpath + path_offset);
5324 		if (strncmp(path, rootpath_start, pathlen) == 0)
5325 			break;
5326 	}
5327 	if (zone == NULL)
5328 		zone = global_zone;
5329 	zone_hold(zone);
5330 	mutex_exit(&zonehash_lock);
5331 	return (zone);
5332 }
5333 
5334 /* List of data link names which are accessible from the zone */
5335 struct dlnamelist {
5336 	char			dlnl_name[LIFNAMSIZ];
5337 	struct dlnamelist	*dlnl_next;
5338 };
5339 
5340 
5341 /*
5342  * Check whether the datalink name (dlname) itself is present.
5343  * Return true if found.
5344  */
5345 static boolean_t
5346 zone_dlname(zone_t *zone, char *dlname)
5347 {
5348 	struct dlnamelist *dlnl;
5349 	boolean_t found = B_FALSE;
5350 
5351 	mutex_enter(&zone->zone_lock);
5352 	for (dlnl = zone->zone_dl_list; dlnl != NULL; dlnl = dlnl->dlnl_next) {
5353 		if (strncmp(dlnl->dlnl_name, dlname, LIFNAMSIZ) == 0) {
5354 			found = B_TRUE;
5355 			break;
5356 		}
5357 	}
5358 	mutex_exit(&zone->zone_lock);
5359 	return (found);
5360 }
5361 
5362 /*
5363  * Add an data link name for the zone. Does not check for duplicates.
5364  */
5365 static int
5366 zone_add_datalink(zoneid_t zoneid, char *dlname)
5367 {
5368 	struct dlnamelist *dlnl;
5369 	zone_t *zone;
5370 	zone_t *thiszone;
5371 	int err;
5372 
5373 	dlnl = kmem_zalloc(sizeof (struct dlnamelist), KM_SLEEP);
5374 	if ((err = copyinstr(dlname, dlnl->dlnl_name, LIFNAMSIZ, NULL)) != 0) {
5375 		kmem_free(dlnl, sizeof (struct dlnamelist));
5376 		return (set_errno(err));
5377 	}
5378 
5379 	thiszone = zone_find_by_id(zoneid);
5380 	if (thiszone == NULL) {
5381 		kmem_free(dlnl, sizeof (struct dlnamelist));
5382 		return (set_errno(ENXIO));
5383 	}
5384 
5385 	/*
5386 	 * Verify that the datalink name isn't already used by a different
5387 	 * zone while allowing duplicate entries for the same zone (e.g. due
5388 	 * to both using IPv4 and IPv6 on an interface)
5389 	 */
5390 	mutex_enter(&zonehash_lock);
5391 	for (zone = list_head(&zone_active); zone != NULL;
5392 	    zone = list_next(&zone_active, zone)) {
5393 		if (zone->zone_id == zoneid)
5394 			continue;
5395 
5396 		if (zone_dlname(zone, dlnl->dlnl_name)) {
5397 			mutex_exit(&zonehash_lock);
5398 			zone_rele(thiszone);
5399 			kmem_free(dlnl, sizeof (struct dlnamelist));
5400 			return (set_errno(EPERM));
5401 		}
5402 	}
5403 	mutex_enter(&thiszone->zone_lock);
5404 	dlnl->dlnl_next = thiszone->zone_dl_list;
5405 	thiszone->zone_dl_list = dlnl;
5406 	mutex_exit(&thiszone->zone_lock);
5407 	mutex_exit(&zonehash_lock);
5408 	zone_rele(thiszone);
5409 	return (0);
5410 }
5411 
5412 static int
5413 zone_remove_datalink(zoneid_t zoneid, char *dlname)
5414 {
5415 	struct dlnamelist *dlnl, *odlnl, **dlnlp;
5416 	zone_t *zone;
5417 	int err;
5418 
5419 	dlnl = kmem_zalloc(sizeof (struct dlnamelist), KM_SLEEP);
5420 	if ((err = copyinstr(dlname, dlnl->dlnl_name, LIFNAMSIZ, NULL)) != 0) {
5421 		kmem_free(dlnl, sizeof (struct dlnamelist));
5422 		return (set_errno(err));
5423 	}
5424 	zone = zone_find_by_id(zoneid);
5425 	if (zone == NULL) {
5426 		kmem_free(dlnl, sizeof (struct dlnamelist));
5427 		return (set_errno(EINVAL));
5428 	}
5429 
5430 	mutex_enter(&zone->zone_lock);
5431 	/* Look for match */
5432 	dlnlp = &zone->zone_dl_list;
5433 	while (*dlnlp != NULL) {
5434 		if (strncmp(dlnl->dlnl_name, (*dlnlp)->dlnl_name,
5435 		    LIFNAMSIZ) == 0)
5436 			goto found;
5437 		dlnlp = &((*dlnlp)->dlnl_next);
5438 	}
5439 	mutex_exit(&zone->zone_lock);
5440 	zone_rele(zone);
5441 	kmem_free(dlnl, sizeof (struct dlnamelist));
5442 	return (set_errno(ENXIO));
5443 
5444 found:
5445 	odlnl = *dlnlp;
5446 	*dlnlp = (*dlnlp)->dlnl_next;
5447 	kmem_free(odlnl, sizeof (struct dlnamelist));
5448 
5449 	mutex_exit(&zone->zone_lock);
5450 	zone_rele(zone);
5451 	kmem_free(dlnl, sizeof (struct dlnamelist));
5452 	return (0);
5453 }
5454 
5455 /*
5456  * Using the zoneidp as ALL_ZONES, we can lookup which zone is using datalink
5457  * name (dlname); otherwise we just check if the specified zoneidp has access
5458  * to the datalink name.
5459  */
5460 static int
5461 zone_check_datalink(zoneid_t *zoneidp, char *dlname)
5462 {
5463 	zoneid_t id;
5464 	char *dln;
5465 	zone_t *zone;
5466 	int err = 0;
5467 	boolean_t allzones = B_FALSE;
5468 
5469 	if (copyin(zoneidp, &id, sizeof (id)) != 0) {
5470 		return (set_errno(EFAULT));
5471 	}
5472 	dln = kmem_zalloc(LIFNAMSIZ, KM_SLEEP);
5473 	if ((err = copyinstr(dlname, dln, LIFNAMSIZ, NULL)) != 0) {
5474 		kmem_free(dln, LIFNAMSIZ);
5475 		return (set_errno(err));
5476 	}
5477 
5478 	if (id == ALL_ZONES)
5479 		allzones = B_TRUE;
5480 
5481 	/*
5482 	 * Check whether datalink name is already used.
5483 	 */
5484 	mutex_enter(&zonehash_lock);
5485 	for (zone = list_head(&zone_active); zone != NULL;
5486 	    zone = list_next(&zone_active, zone)) {
5487 		if (allzones || (id == zone->zone_id)) {
5488 			if (!zone_dlname(zone, dln))
5489 				continue;
5490 			if (allzones)
5491 				err = copyout(&zone->zone_id, zoneidp,
5492 				    sizeof (*zoneidp));
5493 
5494 			mutex_exit(&zonehash_lock);
5495 			kmem_free(dln, LIFNAMSIZ);
5496 			return (err ? set_errno(EFAULT) : 0);
5497 		}
5498 	}
5499 
5500 	/* datalink name is not found in any active zone. */
5501 	mutex_exit(&zonehash_lock);
5502 	kmem_free(dln, LIFNAMSIZ);
5503 	return (set_errno(ENXIO));
5504 }
5505 
5506 /*
5507  * Get the names of the datalinks assigned to a zone.
5508  * Here *nump is the number of datalinks, and the assumption
5509  * is that the caller will gurantee that the the supplied buffer is
5510  * big enough to hold at least #*nump datalink names, that is,
5511  * LIFNAMSIZ X *nump
5512  * On return, *nump will be the "new" number of datalinks, if it
5513  * ever changed.
5514  */
5515 static int
5516 zone_list_datalink(zoneid_t zoneid, int *nump, char *buf)
5517 {
5518 	int num, dlcount;
5519 	zone_t *zone;
5520 	struct dlnamelist *dlnl;
5521 	char *ptr;
5522 
5523 	if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
5524 		return (set_errno(EFAULT));
5525 
5526 	zone = zone_find_by_id(zoneid);
5527 	if (zone == NULL) {
5528 		return (set_errno(ENXIO));
5529 	}
5530 
5531 	num = 0;
5532 	mutex_enter(&zone->zone_lock);
5533 	ptr = buf;
5534 	for (dlnl = zone->zone_dl_list; dlnl != NULL; dlnl = dlnl->dlnl_next) {
5535 		/*
5536 		 * If the list changed and the new number is bigger
5537 		 * than what the caller supplied, just count, don't
5538 		 * do copyout
5539 		 */
5540 		if (++num > dlcount)
5541 			continue;
5542 		if (copyout(dlnl->dlnl_name, ptr, LIFNAMSIZ) != 0) {
5543 			mutex_exit(&zone->zone_lock);
5544 			zone_rele(zone);
5545 			return (set_errno(EFAULT));
5546 		}
5547 		ptr += LIFNAMSIZ;
5548 	}
5549 	mutex_exit(&zone->zone_lock);
5550 	zone_rele(zone);
5551 
5552 	/* Increased or decreased, caller should be notified. */
5553 	if (num != dlcount) {
5554 		if (copyout(&num, nump, sizeof (num)) != 0) {
5555 			return (set_errno(EFAULT));
5556 		}
5557 	}
5558 	return (0);
5559 }
5560 
5561 /*
5562  * Public interface for looking up a zone by zoneid. It's a customized version
5563  * for netstack_zone_create(), it:
5564  * 1. Doesn't acquire the zonehash_lock, since it is called from
5565  *    zone_key_create() or zone_zsd_configure(), lock already held.
5566  * 2. Doesn't check the status of the zone.
5567  * 3. It will be called even before zone_init is called, in that case the
5568  *    address of zone0 is returned directly, and netstack_zone_create()
5569  *    will only assign a value to zone0.zone_netstack, won't break anything.
5570  */
5571 zone_t *
5572 zone_find_by_id_nolock(zoneid_t zoneid)
5573 {
5574 	ASSERT(MUTEX_HELD(&zonehash_lock));
5575 
5576 	if (zonehashbyid == NULL)
5577 		return (&zone0);
5578 	else
5579 		return (zone_find_all_by_id(zoneid));
5580 }
5581