xref: /titanic_51/usr/src/uts/common/os/zone.c (revision ac4d633f367252125bb35e97c5725d2aa68c1291)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Zones
31  *
32  *   A zone is a named collection of processes, namespace constraints,
33  *   and other system resources which comprise a secure and manageable
34  *   application containment facility.
35  *
36  *   Zones (represented by the reference counted zone_t) are tracked in
37  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
38  *   (zoneid_t) are used to track zone association.  Zone IDs are
39  *   dynamically generated when the zone is created; if a persistent
40  *   identifier is needed (core files, accounting logs, audit trail,
41  *   etc.), the zone name should be used.
42  *
43  *
44  *   Global Zone:
45  *
46  *   The global zone (zoneid 0) is automatically associated with all
47  *   system resources that have not been bound to a user-created zone.
48  *   This means that even systems where zones are not in active use
49  *   have a global zone, and all processes, mounts, etc. are
50  *   associated with that zone.  The global zone is generally
51  *   unconstrained in terms of privileges and access, though the usual
52  *   credential and privilege based restrictions apply.
53  *
54  *
55  *   Zone States:
56  *
57  *   The states a zone may be in, and the transitions between them, are as
58  *   follows:
59  *
60  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
61  *   initialized zone is added to the list of active zones on the system but
62  *   isn't accessible.
63  *
64  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
65  *   ready.  The zone is made visible after the ZSD constructor callbacks are
66  *   executed.  A zone remains in this state until it transitions into
67  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
68  *
69  *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
70  *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
71  *   state.
72  *
73  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
74  *   successfully started init.   A zone remains in this state until
75  *   zone_shutdown() is called.
76  *
77  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
78  *   killing all processes running in the zone. The zone remains
79  *   in this state until there are no more user processes running in the zone.
80  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
81  *   Since zone_shutdown() is restartable, it may be called successfully
82  *   multiple times for the same zone_t.  Setting of the zone's state to
83  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
84  *   the zone's status without worrying about it being a moving target.
85  *
86  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
87  *   are no more user processes in the zone.  The zone remains in this
88  *   state until there are no more kernel threads associated with the
89  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
90  *   fail.
91  *
92  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
93  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
94  *   join the zone or create kernel threads therein.
95  *
96  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
97  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
98  *   return NULL from now on.
99  *
100  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
101  *   processes or threads doing work on behalf of the zone.  The zone is
102  *   removed from the list of active zones.  zone_destroy() returns, and
103  *   the zone can be recreated.
104  *
105  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
106  *   callbacks are executed, and all memory associated with the zone is
107  *   freed.
108  *
109  *   Threads can wait for the zone to enter a requested state by using
110  *   zone_status_wait() or zone_status_timedwait() with the desired
111  *   state passed in as an argument.  Zone state transitions are
112  *   uni-directional; it is not possible to move back to an earlier state.
113  *
114  *
115  *   Zone-Specific Data:
116  *
117  *   Subsystems needing to maintain zone-specific data can store that
118  *   data using the ZSD mechanism.  This provides a zone-specific data
119  *   store, similar to thread-specific data (see pthread_getspecific(3C)
120  *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
121  *   to register callbacks to be invoked when a zone is created, shut
122  *   down, or destroyed.  This can be used to initialize zone-specific
123  *   data for new zones and to clean up when zones go away.
124  *
125  *
126  *   Data Structures:
127  *
128  *   The per-zone structure (zone_t) is reference counted, and freed
129  *   when all references are released.  zone_hold and zone_rele can be
130  *   used to adjust the reference count.  In addition, reference counts
131  *   associated with the cred_t structure are tracked separately using
132  *   zone_cred_hold and zone_cred_rele.
133  *
134  *   Pointers to active zone_t's are stored in two hash tables; one
135  *   for searching by id, the other for searching by name.  Lookups
136  *   can be performed on either basis, using zone_find_by_id and
137  *   zone_find_by_name.  Both return zone_t pointers with the zone
138  *   held, so zone_rele should be called when the pointer is no longer
139  *   needed.  Zones can also be searched by path; zone_find_by_path
140  *   returns the zone with which a path name is associated (global
141  *   zone if the path is not within some other zone's file system
142  *   hierarchy).  This currently requires iterating through each zone,
143  *   so it is slower than an id or name search via a hash table.
144  *
145  *
146  *   Locking:
147  *
148  *   zonehash_lock: This is a top-level global lock used to protect the
149  *       zone hash tables and lists.  Zones cannot be created or destroyed
150  *       while this lock is held.
151  *   zone_status_lock: This is a global lock protecting zone state.
152  *       Zones cannot change state while this lock is held.  It also
153  *       protects the list of kernel threads associated with a zone.
154  *   zone_lock: This is a per-zone lock used to protect several fields of
155  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
156  *       this lock means that the zone cannot go away.
157  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
158  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
159  *       list (a list of zones in the ZONE_IS_DEAD state).
160  *
161  *   Ordering requirements:
162  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
163  *       	zone_lock --> zsd_key_lock --> pidlock --> p_lock
164  *
165  *   Blocking memory allocations are permitted while holding any of the
166  *   zone locks.
167  *
168  *
169  *   System Call Interface:
170  *
171  *   The zone subsystem can be managed and queried from user level with
172  *   the following system calls (all subcodes of the primary "zone"
173  *   system call):
174  *   - zone_create: creates a zone with selected attributes (name,
175  *     root path, privileges, resource controls, ZFS datasets)
176  *   - zone_enter: allows the current process to enter a zone
177  *   - zone_getattr: reports attributes of a zone
178  *   - zone_setattr: set attributes of a zone
179  *   - zone_boot: set 'init' running for the zone
180  *   - zone_list: lists all zones active in the system
181  *   - zone_lookup: looks up zone id based on name
182  *   - zone_shutdown: initiates shutdown process (see states above)
183  *   - zone_destroy: completes shutdown process (see states above)
184  *
185  */
186 
187 #include <sys/priv_impl.h>
188 #include <sys/cred.h>
189 #include <c2/audit.h>
190 #include <sys/debug.h>
191 #include <sys/file.h>
192 #include <sys/kmem.h>
193 #include <sys/mutex.h>
194 #include <sys/note.h>
195 #include <sys/pathname.h>
196 #include <sys/proc.h>
197 #include <sys/project.h>
198 #include <sys/sysevent.h>
199 #include <sys/task.h>
200 #include <sys/systm.h>
201 #include <sys/types.h>
202 #include <sys/utsname.h>
203 #include <sys/vnode.h>
204 #include <sys/vfs.h>
205 #include <sys/systeminfo.h>
206 #include <sys/policy.h>
207 #include <sys/cred_impl.h>
208 #include <sys/contract_impl.h>
209 #include <sys/contract/process_impl.h>
210 #include <sys/class.h>
211 #include <sys/pool.h>
212 #include <sys/pool_pset.h>
213 #include <sys/pset.h>
214 #include <sys/sysmacros.h>
215 #include <sys/callb.h>
216 #include <sys/vmparam.h>
217 #include <sys/corectl.h>
218 
219 #include <sys/door.h>
220 #include <sys/cpuvar.h>
221 
222 #include <sys/uadmin.h>
223 #include <sys/session.h>
224 #include <sys/cmn_err.h>
225 #include <sys/modhash.h>
226 #include <sys/sunddi.h>
227 #include <sys/nvpair.h>
228 #include <sys/rctl.h>
229 #include <sys/fss.h>
230 #include <sys/zone.h>
231 #include <sys/tsol/label.h>
232 
233 /*
234  * cv used to signal that all references to the zone have been released.  This
235  * needs to be global since there may be multiple waiters, and the first to
236  * wake up will free the zone_t, hence we cannot use zone->zone_cv.
237  */
238 static kcondvar_t zone_destroy_cv;
239 /*
240  * Lock used to serialize access to zone_cv.  This could have been per-zone,
241  * but then we'd need another lock for zone_destroy_cv, and why bother?
242  */
243 static kmutex_t zone_status_lock;
244 
245 /*
246  * ZSD-related global variables.
247  */
248 static kmutex_t zsd_key_lock;	/* protects the following two */
249 /*
250  * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
251  */
252 static zone_key_t zsd_keyval = 0;
253 /*
254  * Global list of registered keys.  We use this when a new zone is created.
255  */
256 static list_t zsd_registered_keys;
257 
/*
 * Bookkeeping for the set of zones known to the kernel.  Per the "Locking"
 * notes in the header comment of this file, zonehash_lock is the top-level
 * lock protecting the zone hash tables and lists.
 */
258 int zone_hash_size = 256;	/* NOTE(review): presumably hash sizing */
259 static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
260 static kmutex_t zonehash_lock;	/* protects zone hashes and zone lists */
261 static uint_t zonecount;	/* NOTE(review): presumably # of zones */
262 static id_space_t *zoneid_space; /* NOTE(review): presumably id allocator */
263 
264 /*
265  * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
266  * kernel proper runs, and which manages all other zones.
267  *
268  * Although not declared as static, the variable "zone0" should not be used
269  * except for by code that needs to reference the global zone early on in boot,
270  * before it is fully initialized.  All other consumers should use
271  * 'global_zone'.
272  */
273 zone_t zone0;
274 zone_t *global_zone = NULL;	/* Set when the global zone is initialized */
275 
276 /*
277  * List of active zones, protected by zonehash_lock.
278  */
279 static list_t zone_active;
280 
281 /*
282  * List of destroyed zones that still have outstanding cred references.
283  * Used for debugging.  Uses a separate lock to avoid lock ordering
284  * problems in zone_free.
285  */
286 static list_t zone_deathrow;
287 static kmutex_t zone_deathrow_lock;
288 
289 /* number of zones is limited by virtual interface limit in IP */
290 uint_t maxzones = 8192;
291 
292 /* Event channel to sent zone state change notifications */
293 evchan_t *zone_event_chan;
294 
295 /*
296  * This table holds the mapping from kernel zone states to
297  * states visible in the state notification API.
298  * The idea is that we only expose "obvious" states and
299  * do not expose states which are just implementation details.
300  */
301 const char  *zone_status_table[] = {
302 	ZONE_EVENT_UNINITIALIZED,	/* ZONE_IS_UNINITIALIZED */
303 	ZONE_EVENT_READY,		/* ZONE_IS_READY */
304 	ZONE_EVENT_READY,		/* ZONE_IS_BOOTING: reported as ready */
305 	ZONE_EVENT_RUNNING,		/* ZONE_IS_RUNNING */
306 	ZONE_EVENT_SHUTTING_DOWN,	/* ZONE_IS_SHUTTING_DOWN */
307 	ZONE_EVENT_SHUTTING_DOWN,	/* ZONE_IS_EMPTY: still shutting down */
308 	ZONE_EVENT_SHUTTING_DOWN,	/* ZONE_IS_DOWN: still shutting down */
309 	ZONE_EVENT_SHUTTING_DOWN,	/* ZONE_IS_DYING: still shutting down */
310 	ZONE_EVENT_UNINITIALIZED,	/* ZONE_IS_DEAD: back to uninitialized */
311 };
312 
313 /*
314  * This isn't static so lint doesn't complain.
315  */
316 rctl_hndl_t rc_zone_cpu_shares;
317 rctl_hndl_t rc_zone_nlwps;
318 /*
319  * Synchronization primitives used to synchronize between mounts and zone
320  * creation/destruction.
321  */
/*
 * mounts_in_progress is a signed counter shared by both sides:
 *   > 0	that many mounts are between mount_in_progress() and
 *		mount_completed();
 *   < 0	mounts are blocked by (-mounts_in_progress) callers of
 *		block_mounts();
 *   == 0	neither side is active.
 * mount_cv is broadcast whenever the counter returns to zero, and every
 * access to the counter is made with mount_lock held.
 */
322 static int mounts_in_progress;
323 static kcondvar_t mount_cv;
324 static kmutex_t mount_lock;
325 
326 const char * const zone_default_initname = "/sbin/init";
327 static char * const zone_prefix = "/zone/";
328 
329 static int zone_shutdown(zoneid_t zoneid);
330 
331 /*
332  * Bump this number when you alter the zone syscall interfaces; this is
333  * because we need to have support for previous API versions in libc
334  * to support patching; libc calls into the kernel to determine this number.
335  *
336  * Version 1 of the API is the version originally shipped with Solaris 10
337  * Version 2 alters the zone_create system call in order to support more
338  *     arguments by moving the args into a structure; and to do better
339  *     error reporting when zone_create() fails.
340  * Version 3 alters the zone_create system call in order to support the
341  *     import of ZFS datasets to zones.
342  * Version 4 alters the zone_create system call in order to support
343  *     Trusted Extensions.
344  * Version 5 alters the zone_boot system call, and converts its old
345  *     bootargs parameter to be set by the zone_setattr API instead.
346  */
347 static const int ZONE_SYSCALL_API_VERSION = 5;
348 
349 /*
350  * Certain filesystems (such as NFS and autofs) need to know which zone
351  * the mount is being placed in.  Because of this, we need to be able to
352  * ensure that a zone isn't in the process of being created such that
353  * nfs_mount() thinks it is in the global zone, while by the time it
354  * gets added to the list of mounted zones, it ends up on zoneA's mount
355  * list.
356  *
357  * The following functions: block_mounts()/resume_mounts() and
358  * mount_in_progress()/mount_completed() are used by zones and the VFS
359  * layer (respectively) to synchronize zone creation and new mounts.
360  *
361  * The semantics are like a reader-reader lock such that there may
362  * either be multiple mounts (or zone creations, if that weren't
363  * serialized by zonehash_lock) in progress at the same time, but not
364  * both.
365  *
366  * We use cv's so the user can ctrl-C out of the operation if it's
367  * taking too long.
368  *
369  * The semantics are such that there is unfair bias towards the
370  * "current" operation.  This means that zone creations may starve if
371  * there is a rapid succession of new mounts coming in to the system, or
372  * there is a remote possibility that zones will be created at such a
373  * rate that new mounts will not be able to proceed.
374  */
375 /*
376  * Prevent new mounts from progressing to the point of calling
377  * VFS_MOUNT().  If there are already mounts in this "region", wait for
378  * them to complete.
379  */
380 static int
381 block_mounts(void)
382 {
383 	int retval = 0;
384 
385 	/*
386 	 * Since it may block for a long time, block_mounts() shouldn't be
387 	 * called with zonehash_lock held.
388 	 */
389 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
390 	mutex_enter(&mount_lock);
391 	while (mounts_in_progress > 0) {
392 		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
393 			goto signaled;
394 	}
395 	/*
396 	 * A negative value of mounts_in_progress indicates that mounts
397 	 * have been blocked by (-mounts_in_progress) different callers.
398 	 */
399 	mounts_in_progress--;
400 	retval = 1;
401 signaled:
402 	mutex_exit(&mount_lock);
403 	return (retval);
404 }
405 
406 /*
407  * The VFS layer may progress with new mounts as far as we're concerned.
408  * Allow them to progress if we were the last obstacle.
409  */
410 static void
411 resume_mounts(void)
412 {
413 	mutex_enter(&mount_lock);
414 	if (++mounts_in_progress == 0)
415 		cv_broadcast(&mount_cv);
416 	mutex_exit(&mount_lock);
417 }
418 
419 /*
420  * The VFS layer is busy with a mount; zones should wait until all
421  * mounts are completed to progress.
422  */
423 void
424 mount_in_progress(void)
425 {
426 	mutex_enter(&mount_lock);
427 	while (mounts_in_progress < 0)
428 		cv_wait(&mount_cv, &mount_lock);
429 	mounts_in_progress++;
430 	mutex_exit(&mount_lock);
431 }
432 
433 /*
434  * VFS is done with one mount; wake up any waiting block_mounts()
435  * callers if this is the last mount.
436  */
437 void
438 mount_completed(void)
439 {
440 	mutex_enter(&mount_lock);
441 	if (--mounts_in_progress == 0)
442 		cv_broadcast(&mount_cv);
443 	mutex_exit(&mount_lock);
444 }
445 
446 /*
447  * ZSD routines.
448  *
449  * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
450  * defined by the pthread_key_create() and related interfaces.
451  *
452  * Kernel subsystems may register one or more data items and/or
453  * callbacks to be executed when a zone is created, shutdown, or
454  * destroyed.
455  *
456  * Unlike the thread counterpart, destructor callbacks will be executed
457  * even if the data pointer is NULL and/or there are no constructor
458  * callbacks, so it is the responsibility of such callbacks to check for
459  * NULL data values if necessary.
460  *
461  * The locking strategy and overall picture is as follows:
462  *
463  * When someone calls zone_key_create(), a template ZSD entry is added to the
464  * global list "zsd_registered_keys", protected by zsd_key_lock.  The
465  * constructor callback is called immediately on all existing zones, and a
466  * copy of the ZSD entry added to the per-zone zone_zsd list (protected by
467  * zone_lock).  As this operation requires the list of zones, the list of
468  * registered keys, and the per-zone list of ZSD entries to remain constant
469  * throughout the entire operation, it must grab zonehash_lock, zone_lock for
470  * all existing zones, and zsd_key_lock, in that order.  Similar locking is
471  * needed when zone_key_delete() is called.  It is thus sufficient to hold
472  * zsd_key_lock *or* zone_lock to prevent additions to or removals from the
473  * per-zone zone_zsd list.
474  *
475  * Note that this implementation does not make a copy of the ZSD entry if a
476  * constructor callback is not provided.  A zone_getspecific() on such an
477  * uninitialized ZSD entry will return NULL.
478  *
479  * When new zones are created constructor callbacks for all registered ZSD
480  * entries will be called.
481  *
482  * The framework does not provide any locking around zone_getspecific() and
483  * zone_setspecific() apart from that needed for internal consistency, so
484  * callers interested in atomic "test-and-set" semantics will need to provide
485  * their own locking.
486  */
487 void
488 zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
489     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
490 {
491 	struct zsd_entry *zsdp;
492 	struct zsd_entry *t;
493 	struct zone *zone;
494 
495 	zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP);
496 	zsdp->zsd_data = NULL;
497 	zsdp->zsd_create = create;
498 	zsdp->zsd_shutdown = shutdown;
499 	zsdp->zsd_destroy = destroy;
500 
501 	mutex_enter(&zonehash_lock);	/* stop the world */
502 	for (zone = list_head(&zone_active); zone != NULL;
503 	    zone = list_next(&zone_active, zone))
504 		mutex_enter(&zone->zone_lock);	/* lock all zones */
505 
506 	mutex_enter(&zsd_key_lock);
507 	*keyp = zsdp->zsd_key = ++zsd_keyval;
508 	ASSERT(zsd_keyval != 0);
509 	list_insert_tail(&zsd_registered_keys, zsdp);
510 	mutex_exit(&zsd_key_lock);
511 
512 	if (create != NULL) {
513 		for (zone = list_head(&zone_active); zone != NULL;
514 		    zone = list_next(&zone_active, zone)) {
515 			t = kmem_alloc(sizeof (*t), KM_SLEEP);
516 			t->zsd_key = *keyp;
517 			t->zsd_data = (*create)(zone->zone_id);
518 			t->zsd_create = create;
519 			t->zsd_shutdown = shutdown;
520 			t->zsd_destroy = destroy;
521 			list_insert_tail(&zone->zone_zsd, t);
522 		}
523 	}
524 	for (zone = list_head(&zone_active); zone != NULL;
525 	    zone = list_next(&zone_active, zone))
526 		mutex_exit(&zone->zone_lock);
527 	mutex_exit(&zonehash_lock);
528 }
529 
530 /*
531  * Helper function to find the zsd_entry associated with the key in the
532  * given list.
533  */
534 static struct zsd_entry *
535 zsd_find(list_t *l, zone_key_t key)
536 {
537 	struct zsd_entry *zsd;
538 
539 	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
540 		if (zsd->zsd_key == key) {
541 			/*
542 			 * Move to head of list to keep list in MRU order.
543 			 */
544 			if (zsd != list_head(l)) {
545 				list_remove(l, zsd);
546 				list_insert_head(l, zsd);
547 			}
548 			return (zsd);
549 		}
550 	}
551 	return (NULL);
552 }
553 
554 /*
555  * Function called when a module is being unloaded, or otherwise wishes
556  * to unregister its ZSD key and callbacks.
557  */
558 int
559 zone_key_delete(zone_key_t key)
560 {
561 	struct zsd_entry *zsdp = NULL;
562 	zone_t *zone;
563 
564 	mutex_enter(&zonehash_lock);	/* Zone create/delete waits for us */
565 	for (zone = list_head(&zone_active); zone != NULL;
566 	    zone = list_next(&zone_active, zone))
567 		mutex_enter(&zone->zone_lock);	/* lock all zones */
568 
569 	mutex_enter(&zsd_key_lock);
570 	zsdp = zsd_find(&zsd_registered_keys, key);
571 	if (zsdp == NULL)
572 		goto notfound;
573 	list_remove(&zsd_registered_keys, zsdp);
574 	mutex_exit(&zsd_key_lock);
575 
576 	for (zone = list_head(&zone_active); zone != NULL;
577 	    zone = list_next(&zone_active, zone)) {
578 		struct zsd_entry *del;
579 		void *data;
580 
581 		if (!(zone->zone_flags & ZF_DESTROYED)) {
582 			del = zsd_find(&zone->zone_zsd, key);
583 			if (del != NULL) {
584 				data = del->zsd_data;
585 				ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
586 				ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
587 				list_remove(&zone->zone_zsd, del);
588 				kmem_free(del, sizeof (*del));
589 			} else {
590 				data = NULL;
591 			}
592 			if (zsdp->zsd_shutdown)
593 				zsdp->zsd_shutdown(zone->zone_id, data);
594 			if (zsdp->zsd_destroy)
595 				zsdp->zsd_destroy(zone->zone_id, data);
596 		}
597 		mutex_exit(&zone->zone_lock);
598 	}
599 	mutex_exit(&zonehash_lock);
600 	kmem_free(zsdp, sizeof (*zsdp));
601 	return (0);
602 
603 notfound:
604 	mutex_exit(&zsd_key_lock);
605 	for (zone = list_head(&zone_active); zone != NULL;
606 	    zone = list_next(&zone_active, zone))
607 		mutex_exit(&zone->zone_lock);
608 	mutex_exit(&zonehash_lock);
609 	return (-1);
610 }
611 
612 /*
613  * ZSD counterpart of pthread_setspecific().
614  */
615 int
616 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
617 {
618 	struct zsd_entry *t;
619 	struct zsd_entry *zsdp = NULL;
620 
621 	mutex_enter(&zone->zone_lock);
622 	t = zsd_find(&zone->zone_zsd, key);
623 	if (t != NULL) {
624 		/*
625 		 * Replace old value with new
626 		 */
627 		t->zsd_data = (void *)data;
628 		mutex_exit(&zone->zone_lock);
629 		return (0);
630 	}
631 	/*
632 	 * If there was no previous value, go through the list of registered
633 	 * keys.
634 	 *
635 	 * We avoid grabbing zsd_key_lock until we are sure we need it; this is
636 	 * necessary for shutdown callbacks to be able to execute without fear
637 	 * of deadlock.
638 	 */
639 	mutex_enter(&zsd_key_lock);
640 	zsdp = zsd_find(&zsd_registered_keys, key);
641 	if (zsdp == NULL) { 	/* Key was not registered */
642 		mutex_exit(&zsd_key_lock);
643 		mutex_exit(&zone->zone_lock);
644 		return (-1);
645 	}
646 
647 	/*
648 	 * Add a zsd_entry to this zone, using the template we just retrieved
649 	 * to initialize the constructor and destructor(s).
650 	 */
651 	t = kmem_alloc(sizeof (*t), KM_SLEEP);
652 	t->zsd_key = key;
653 	t->zsd_data = (void *)data;
654 	t->zsd_create = zsdp->zsd_create;
655 	t->zsd_shutdown = zsdp->zsd_shutdown;
656 	t->zsd_destroy = zsdp->zsd_destroy;
657 	list_insert_tail(&zone->zone_zsd, t);
658 	mutex_exit(&zsd_key_lock);
659 	mutex_exit(&zone->zone_lock);
660 	return (0);
661 }
662 
663 /*
664  * ZSD counterpart of pthread_getspecific().
665  */
666 void *
667 zone_getspecific(zone_key_t key, zone_t *zone)
668 {
669 	struct zsd_entry *t;
670 	void *data;
671 
672 	mutex_enter(&zone->zone_lock);
673 	t = zsd_find(&zone->zone_zsd, key);
674 	data = (t == NULL ? NULL : t->zsd_data);
675 	mutex_exit(&zone->zone_lock);
676 	return (data);
677 }
678 
679 /*
680  * Function used to initialize a zone's list of ZSD callbacks and data
681  * when the zone is being created.  The callbacks are initialized from
682  * the template list (zsd_registered_keys), and the constructor
683  * callback executed (if one exists).
684  *
685  * This is called before the zone is made publicly available, hence no
686  * need to grab zone_lock.
687  *
688  * Although we grab and release zsd_key_lock, new entries cannot be
689  * added to or removed from the zsd_registered_keys list until we
690  * release zonehash_lock, so there isn't a window for a
691  * zone_key_create() to come in after we've dropped zsd_key_lock but
692  * before the zone is added to the zone list, such that the constructor
693  * callbacks aren't executed for the new zone.
694  */
695 static void
696 zone_zsd_configure(zone_t *zone)
697 {
698 	struct zsd_entry *zsdp;
699 	struct zsd_entry *t;
700 	zoneid_t zoneid = zone->zone_id;
701 
702 	ASSERT(MUTEX_HELD(&zonehash_lock));
703 	ASSERT(list_head(&zone->zone_zsd) == NULL);
704 	mutex_enter(&zsd_key_lock);
705 	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
706 	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
707 		if (zsdp->zsd_create != NULL) {
708 			t = kmem_alloc(sizeof (*t), KM_SLEEP);
709 			t->zsd_key = zsdp->zsd_key;
710 			t->zsd_create = zsdp->zsd_create;
711 			t->zsd_data = (*t->zsd_create)(zoneid);
712 			t->zsd_shutdown = zsdp->zsd_shutdown;
713 			t->zsd_destroy = zsdp->zsd_destroy;
714 			list_insert_tail(&zone->zone_zsd, t);
715 		}
716 	}
717 	mutex_exit(&zsd_key_lock);
718 }
719 
/*
 * Selects which kind of ZSD callback is meant.  zone_zsd_callbacks() below
 * asserts that it is only ever given ZSD_SHUTDOWN or ZSD_DESTROY.
 */
720 enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
721 
722 /*
723  * Helper function to execute shutdown or destructor callbacks.
724  */
725 static void
726 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
727 {
728 	struct zsd_entry *zsdp;
729 	struct zsd_entry *t;
730 	zoneid_t zoneid = zone->zone_id;
731 
732 	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
733 	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
734 	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
735 
736 	mutex_enter(&zone->zone_lock);
737 	if (ct == ZSD_DESTROY) {
738 		if (zone->zone_flags & ZF_DESTROYED) {
739 			/*
740 			 * Make sure destructors are only called once.
741 			 */
742 			mutex_exit(&zone->zone_lock);
743 			return;
744 		}
745 		zone->zone_flags |= ZF_DESTROYED;
746 	}
747 	mutex_exit(&zone->zone_lock);
748 
749 	/*
750 	 * Both zsd_key_lock and zone_lock need to be held in order to add or
751 	 * remove a ZSD key, (either globally as part of
752 	 * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is
753 	 * possible through zone_setspecific()), so it's sufficient to hold
754 	 * zsd_key_lock here.
755 	 *
756 	 * This is a good thing, since we don't want to recursively try to grab
757 	 * zone_lock if a callback attempts to do something like a crfree() or
758 	 * zone_rele().
759 	 */
760 	mutex_enter(&zsd_key_lock);
761 	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
762 	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
763 		zone_key_t key = zsdp->zsd_key;
764 
765 		/* Skip if no callbacks registered */
766 		if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL)
767 			continue;
768 		if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL)
769 			continue;
770 		/*
771 		 * Call the callback with the zone-specific data if we can find
772 		 * any, otherwise with NULL.
773 		 */
774 		t = zsd_find(&zone->zone_zsd, key);
775 		if (t != NULL) {
776 			if (ct == ZSD_SHUTDOWN) {
777 				t->zsd_shutdown(zoneid, t->zsd_data);
778 			} else {
779 				ASSERT(ct == ZSD_DESTROY);
780 				t->zsd_destroy(zoneid, t->zsd_data);
781 			}
782 		} else {
783 			if (ct == ZSD_SHUTDOWN) {
784 				zsdp->zsd_shutdown(zoneid, NULL);
785 			} else {
786 				ASSERT(ct == ZSD_DESTROY);
787 				zsdp->zsd_destroy(zoneid, NULL);
788 			}
789 		}
790 	}
791 	mutex_exit(&zsd_key_lock);
792 }
793 
794 /*
795  * Called when the zone is going away; free ZSD-related memory, and
796  * destroy the zone_zsd list.
797  */
798 static void
799 zone_free_zsd(zone_t *zone)
800 {
801 	struct zsd_entry *t, *next;
802 
803 	/*
804 	 * Free all the zsd_entry's we had on this zone.
805 	 */
806 	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
807 		next = list_next(&zone->zone_zsd, t);
808 		list_remove(&zone->zone_zsd, t);
809 		kmem_free(t, sizeof (*t));
810 	}
811 	list_destroy(&zone->zone_zsd);
812 }
813 
814 /*
815  * Frees memory associated with the zone dataset list.
816  */
817 static void
818 zone_free_datasets(zone_t *zone)
819 {
820 	zone_dataset_t *t, *next;
821 
822 	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
823 		next = list_next(&zone->zone_datasets, t);
824 		list_remove(&zone->zone_datasets, t);
825 		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
826 		kmem_free(t, sizeof (*t));
827 	}
828 	list_destroy(&zone->zone_datasets);
829 }
830 
831 /*
832  * zone.cpu-shares resource control support.
833  */
834 /*ARGSUSED*/
835 static rctl_qty_t
836 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
837 {
838 	ASSERT(MUTEX_HELD(&p->p_lock));
839 	return (p->p_zone->zone_shares);
840 }
841 
842 /*ARGSUSED*/
843 static int
844 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
845     rctl_qty_t nv)
846 {
847 	ASSERT(MUTEX_HELD(&p->p_lock));
848 	ASSERT(e->rcep_t == RCENTITY_ZONE);
849 	if (e->rcep_p.zone == NULL)
850 		return (0);
851 
852 	e->rcep_p.zone->zone_shares = nv;
853 	return (0);
854 }
855 
/*
 * Operations vector for the zone.cpu-shares resource control.
 * NOTE(review): the slot meanings noted below are inferred from the
 * rcop_no_* names; confirm against the rctl_ops_t definition.
 */
856 static rctl_ops_t zone_cpu_shares_ops = {
857 	rcop_no_action,		/* presumably: no action callback */
858 	zone_cpu_shares_usage,	/* report the zone's current shares */
859 	zone_cpu_shares_set,	/* store a new shares value */
860 	rcop_no_test		/* presumably: no test callback */
861 };
862 
863 /*ARGSUSED*/
864 static rctl_qty_t
865 zone_lwps_usage(rctl_t *r, proc_t *p)
866 {
867 	rctl_qty_t nlwps;
868 	zone_t *zone = p->p_zone;
869 
870 	ASSERT(MUTEX_HELD(&p->p_lock));
871 
872 	mutex_enter(&zone->zone_nlwps_lock);
873 	nlwps = zone->zone_nlwps;
874 	mutex_exit(&zone->zone_nlwps_lock);
875 
876 	return (nlwps);
877 }
878 
879 /*ARGSUSED*/
880 static int
881 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
882     rctl_qty_t incr, uint_t flags)
883 {
884 	rctl_qty_t nlwps;
885 
886 	ASSERT(MUTEX_HELD(&p->p_lock));
887 	ASSERT(e->rcep_t == RCENTITY_ZONE);
888 	if (e->rcep_p.zone == NULL)
889 		return (0);
890 	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
891 	nlwps = e->rcep_p.zone->zone_nlwps;
892 
893 	if (nlwps + incr > rcntl->rcv_value)
894 		return (1);
895 
896 	return (0);
897 }
898 
899 /*ARGSUSED*/
900 static int
901 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv) {
902 
903 	ASSERT(MUTEX_HELD(&p->p_lock));
904 	ASSERT(e->rcep_t == RCENTITY_ZONE);
905 	if (e->rcep_p.zone == NULL)
906 		return (0);
907 	e->rcep_p.zone->zone_nlwps_ctl = nv;
908 	return (0);
909 }
910 
/*
 * Operations vector handed to rctl_register() for "zone.max-lwps" in
 * zone_init().  The usage, set and test slots are zone-specific; the
 * first slot uses rcop_no_action.
 */
static rctl_ops_t zone_lwps_ops = {
	rcop_no_action,
	zone_lwps_usage,
	zone_lwps_set,
	zone_lwps_test,
};
917 
918 /*
919  * Helper function to brand the zone with a unique ID.
920  */
921 static void
922 zone_uniqid(zone_t *zone)
923 {
924 	static uint64_t uniqid = 0;
925 
926 	ASSERT(MUTEX_HELD(&zonehash_lock));
927 	zone->zone_uniqid = uniqid++;
928 }
929 
930 /*
931  * Returns a held pointer to the "kcred" for the specified zone.
932  */
933 struct cred *
934 zone_get_kcred(zoneid_t zoneid)
935 {
936 	zone_t *zone;
937 	cred_t *cr;
938 
939 	if ((zone = zone_find_by_id(zoneid)) == NULL)
940 		return (NULL);
941 	cr = zone->zone_kcred;
942 	crhold(cr);
943 	zone_rele(zone);
944 	return (cr);
945 }
946 
947 /*
948  * Called very early on in boot to initialize the ZSD list so that
949  * zone_key_create() can be called before zone_init().  It also initializes
950  * portions of zone0 which may be used before zone_init() is called.  The
951  * variable "global_zone" will be set when zone0 is fully initialized by
952  * zone_init().
953  */
954 void
955 zone_zsd_init(void)
956 {
957 	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
958 	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
959 	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
960 	    offsetof(struct zsd_entry, zsd_linkage));
961 	list_create(&zone_active, sizeof (zone_t),
962 	    offsetof(zone_t, zone_linkage));
963 	list_create(&zone_deathrow, sizeof (zone_t),
964 	    offsetof(zone_t, zone_linkage));
965 
966 	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
967 	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
968 	zone0.zone_shares = 1;
969 	zone0.zone_nlwps_ctl = INT_MAX;
970 	zone0.zone_name = GLOBAL_ZONENAME;
971 	zone0.zone_nodename = utsname.nodename;
972 	zone0.zone_domain = srpc_domain;
973 	zone0.zone_ref = 1;
974 	zone0.zone_id = GLOBAL_ZONEID;
975 	zone0.zone_status = ZONE_IS_RUNNING;
976 	zone0.zone_rootpath = "/";
977 	zone0.zone_rootpathlen = 2;
978 	zone0.zone_psetid = ZONE_PS_INVAL;
979 	zone0.zone_ncpus = 0;
980 	zone0.zone_ncpus_online = 0;
981 	zone0.zone_proc_initpid = 1;
982 	zone0.zone_initname = initname;
983 	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
984 	    offsetof(struct zsd_entry, zsd_linkage));
985 	list_insert_head(&zone_active, &zone0);
986 
987 	/*
988 	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
989 	 * to anything meaningful.  It is assigned to be 'rootdir' in
990 	 * vfs_mountroot().
991 	 */
992 	zone0.zone_rootvp = NULL;
993 	zone0.zone_vfslist = NULL;
994 	zone0.zone_bootargs = initargs;
995 	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
996 	/*
997 	 * The global zone has all privileges
998 	 */
999 	priv_fillset(zone0.zone_privset);
1000 	/*
1001 	 * Add p0 to the global zone
1002 	 */
1003 	zone0.zone_zsched = &p0;
1004 	p0.p_zone = &zone0;
1005 }
1006 
1007 /*
1008  * Compute a hash value based on the contents of the label and the DOI.  The
1009  * hash algorithm is somewhat arbitrary, but is based on the observation that
1010  * humans will likely pick labels that differ by amounts that work out to be
1011  * multiples of the number of hash chains, and thus stirring in some primes
1012  * should help.
1013  */
1014 static uint_t
1015 hash_bylabel(void *hdata, mod_hash_key_t key)
1016 {
1017 	const ts_label_t *lab = (ts_label_t *)key;
1018 	const uint32_t *up, *ue;
1019 	uint_t hash;
1020 	int i;
1021 
1022 	_NOTE(ARGUNUSED(hdata));
1023 
1024 	hash = lab->tsl_doi + (lab->tsl_doi << 1);
1025 	/* we depend on alignment of label, but not representation */
1026 	up = (const uint32_t *)&lab->tsl_label;
1027 	ue = up + sizeof (lab->tsl_label) / sizeof (*up);
1028 	i = 1;
1029 	while (up < ue) {
1030 		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
1031 		hash += *up + (*up << ((i % 16) + 1));
1032 		up++;
1033 		i++;
1034 	}
1035 	return (hash);
1036 }
1037 
1038 /*
1039  * All that mod_hash cares about here is zero (equal) versus non-zero (not
1040  * equal).  This may need to be changed if less than / greater than is ever
1041  * needed.
1042  */
1043 static int
1044 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1045 {
1046 	ts_label_t *lab1 = (ts_label_t *)key1;
1047 	ts_label_t *lab2 = (ts_label_t *)key2;
1048 
1049 	return (label_equal(lab1, lab2) ? 0 : 1);
1050 }
1051 
/*
 * Called by main() to initialize the zones framework.
 *
 * Creates the zone ID space, registers the zone.cpu-shares and
 * zone.max-lwps resource controls, finishes initializing zone0 (rctl set,
 * label, unique ID, kcred), creates the zone hash tables and enters zone0
 * into them, publishes zone0 as "global_zone", and binds the sysevent
 * channel used for zone status notifications.  Must run in the context of
 * p0.  Panics if the event channel cannot be bound.
 */
void
zone_init(void)
{
	rctl_dict_entry_t *rde;
	rctl_val_t *dval;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;
	int res;

	ASSERT(curproc == &p0);

	/*
	 * Create ID space for zone IDs.  ID 0 is reserved for the
	 * global zone.
	 */
	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);

	/*
	 * Initialize generic zone resource controls, if any.
	 */
	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
	    FSS_MAXSHARES, FSS_MAXSHARES,
	    &zone_cpu_shares_ops);

	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_lwps_ops);
	/*
	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
	 */
	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
	bzero(dval, sizeof (rctl_val_t));
	dval->rcv_value = 1;
	dval->rcv_privilege = RCPRIV_PRIVILEGED;
	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
	dval->rcv_action_recip_pid = -1;

	rde = rctl_dict_lookup("zone.cpu-shares");
	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);

	/*
	 * Initialize the ``global zone''.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&p0.p_lock);
	e.rcep_p.zone = &zone0;
	e.rcep_t = RCENTITY_ZONE;
	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
	    gp);

	/* Seed zone0's LWP and task counts from p0 while p_lock is held. */
	zone0.zone_nlwps = p0.p_lwpcnt;
	zone0.zone_ntasks = 1;
	mutex_exit(&p0.p_lock);
	rctl_prealloc_destroy(gp);
	/*
	 * pool_default hasn't been initialized yet, so we let pool_init() take
	 * care of making sure the global zone is in the default pool.
	 */

	/*
	 * Initialize zone label.
	 * mlp are initialized when tnzonecfg is loaded.
	 */
	zone0.zone_slabel = l_admin_low;
	rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
	label_hold(l_admin_low);

	mutex_enter(&zonehash_lock);
	zone_uniqid(&zone0);
	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);

	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
	    mod_hash_null_valdtor);
	zonehashbyname = mod_hash_create_strhash("zone_by_name",
	    zone_hash_size, mod_hash_null_valdtor);
	/*
	 * maintain zonehashbylabel only for labeled systems
	 */
	if (is_system_labeled())
		zonehashbylabel = mod_hash_create_extended("zone_by_label",
		    zone_hash_size, mod_hash_null_keydtor,
		    mod_hash_null_valdtor, hash_bylabel, NULL,
		    hash_labelkey_cmp, KM_SLEEP);
	zonecount = 1;

	/* Enter zone0 into each hash table that is being maintained. */
	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
	    (mod_hash_val_t)&zone0);
	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
	    (mod_hash_val_t)&zone0);
	if (is_system_labeled()) {
		zone0.zone_flags |= ZF_HASHED_LABEL;
		(void) mod_hash_insert(zonehashbylabel,
		    (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
	}
	mutex_exit(&zonehash_lock);

	/*
	 * We avoid setting zone_kcred until now, since kcred is initialized
	 * sometime after zone_zsd_init() and before zone_init().
	 */
	zone0.zone_kcred = kcred;
	/*
	 * The global zone is fully initialized (except for zone_rootvp which
	 * will be set when the root filesystem is mounted).
	 */
	global_zone = &zone0;

	/*
	 * Setup an event channel to send zone status change notifications on
	 */
	res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
	    EVCH_CREAT);

	if (res)
		panic("Sysevent_evc_bind failed during zone setup.\n");
}
1176 
/*
 * Release everything owned by a zone_t and free the structure itself.
 * Called from zone_rele() and zone_cred_rele() once both zone_ref and
 * zone_cred_ref have reached zero.  Never valid for the global zone.  The
 * zone must be either DEAD (in which case it is also taken off the
 * deathrow list) or still UNINITIALIZED.
 */
static void
zone_free(zone_t *zone)
{
	ASSERT(zone != global_zone);
	ASSERT(zone->zone_ntasks == 0);
	ASSERT(zone->zone_nlwps == 0);
	ASSERT(zone->zone_cred_ref == 0);
	ASSERT(zone->zone_kcred == NULL);
	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);

	/* remove from deathrow list */
	if (zone_status_get(zone) == ZONE_IS_DEAD) {
		ASSERT(zone->zone_ref == 0);
		mutex_enter(&zone_deathrow_lock);
		list_remove(&zone_deathrow, zone);
		mutex_exit(&zone_deathrow_lock);
	}

	zone_free_zsd(zone);
	zone_free_datasets(zone);

	/*
	 * Each optional member is released only if it was set.  The sizes
	 * passed to kmem_free() below are the sizes this function assumes
	 * the members were allocated with.
	 */
	if (zone->zone_rootvp != NULL)
		VN_RELE(zone->zone_rootvp);
	if (zone->zone_rootpath)
		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
	if (zone->zone_name != NULL)
		kmem_free(zone->zone_name, ZONENAME_MAX);
	if (zone->zone_slabel != NULL)
		label_rele(zone->zone_slabel);
	if (zone->zone_nodename != NULL)
		kmem_free(zone->zone_nodename, _SYS_NMLN);
	if (zone->zone_domain != NULL)
		kmem_free(zone->zone_domain, _SYS_NMLN);
	if (zone->zone_privset != NULL)
		kmem_free(zone->zone_privset, sizeof (priv_set_t));
	if (zone->zone_rctls != NULL)
		rctl_set_free(zone->zone_rctls);
	if (zone->zone_bootargs != NULL)
		kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1);
	if (zone->zone_initname != NULL)
		kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1);
	/* Return the zone ID to the ID space, then tear down the locks. */
	id_free(zoneid_space, zone->zone_id);
	mutex_destroy(&zone->zone_lock);
	cv_destroy(&zone->zone_cv);
	rw_destroy(&zone->zone_mlps.mlpl_rwlock);
	kmem_free(zone, sizeof (zone_t));
}
1225 
/*
 * Convenience function for setting zone status.  See the block comment at
 * the top of this file for information about zone status values.
 *
 * The caller must hold zone_status_lock.  The new status must be a valid
 * state that is not lower than the zone's current one.  A sysevent
 * describing the transition (zone name, old and new state names, zone ID,
 * timestamp) is built and published first; if any step of that fails, a
 * message is printed on DEBUG kernels only and the status is still
 * updated.  Finally all waiters on the zone's zone_cv are woken.
 */
static void
zone_status_set(zone_t *zone, zone_status_t status)
{

	nvlist_t *nvl = NULL;
	ASSERT(MUTEX_HELD(&zone_status_lock));
	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
	    status >= zone_status_get(zone));

	/*
	 * The calls are chained with || so that the first failure stops
	 * the sequence; the event is published only if every attribute
	 * was added successfully.
	 */
	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
	    nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
	    zone_status_table[status]) ||
	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
	    zone_status_table[zone->zone_status]) ||
	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
#ifdef DEBUG
		(void) printf(
		    "Failed to allocate and send zone state change event.\n");
#endif
	}
	nvlist_free(nvl);

	zone->zone_status = status;

	cv_broadcast(&zone->zone_cv);
}
1263 
1264 /*
1265  * Public function to retrieve the zone status.  The zone status may
1266  * change after it is retrieved.
1267  */
1268 zone_status_t
1269 zone_status_get(zone_t *zone)
1270 {
1271 	return (zone->zone_status);
1272 }
1273 
1274 static int
1275 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
1276 {
1277 	char *bootargs = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
1278 	int err = 0;
1279 
1280 	ASSERT(zone != global_zone);
1281 	if ((err = copyinstr(zone_bootargs, bootargs, BOOTARGS_MAX, NULL)) != 0)
1282 		goto done;	/* EFAULT or ENAMETOOLONG */
1283 
1284 	if (zone->zone_bootargs != NULL)
1285 		kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1);
1286 
1287 	zone->zone_bootargs = kmem_alloc(strlen(bootargs) + 1, KM_SLEEP);
1288 	(void) strcpy(zone->zone_bootargs, bootargs);
1289 
1290 done:
1291 	kmem_free(bootargs, BOOTARGS_MAX);
1292 	return (err);
1293 }
1294 
1295 static int
1296 zone_set_initname(zone_t *zone, const char *zone_initname)
1297 {
1298 	char initname[INITNAME_SZ];
1299 	size_t len;
1300 	int err = 0;
1301 
1302 	ASSERT(zone != global_zone);
1303 	if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
1304 		return (err);	/* EFAULT or ENAMETOOLONG */
1305 
1306 	if (zone->zone_initname != NULL)
1307 		kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1);
1308 
1309 	zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
1310 	(void) strcpy(zone->zone_initname, initname);
1311 	return (0);
1312 }
1313 
1314 /*
1315  * Block indefinitely waiting for (zone_status >= status)
1316  */
1317 void
1318 zone_status_wait(zone_t *zone, zone_status_t status)
1319 {
1320 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1321 
1322 	mutex_enter(&zone_status_lock);
1323 	while (zone->zone_status < status) {
1324 		cv_wait(&zone->zone_cv, &zone_status_lock);
1325 	}
1326 	mutex_exit(&zone_status_lock);
1327 }
1328 
1329 /*
1330  * Private CPR-safe version of zone_status_wait().
1331  */
1332 static void
1333 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
1334 {
1335 	callb_cpr_t cprinfo;
1336 
1337 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1338 
1339 	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
1340 	    str);
1341 	mutex_enter(&zone_status_lock);
1342 	while (zone->zone_status < status) {
1343 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1344 		cv_wait(&zone->zone_cv, &zone_status_lock);
1345 		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
1346 	}
1347 	/*
1348 	 * zone_status_lock is implicitly released by the following.
1349 	 */
1350 	CALLB_CPR_EXIT(&cprinfo);
1351 }
1352 
1353 /*
1354  * Block until zone enters requested state or signal is received.  Return (0)
1355  * if signaled, non-zero otherwise.
1356  */
1357 int
1358 zone_status_wait_sig(zone_t *zone, zone_status_t status)
1359 {
1360 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1361 
1362 	mutex_enter(&zone_status_lock);
1363 	while (zone->zone_status < status) {
1364 		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
1365 			mutex_exit(&zone_status_lock);
1366 			return (0);
1367 		}
1368 	}
1369 	mutex_exit(&zone_status_lock);
1370 	return (1);
1371 }
1372 
1373 /*
1374  * Block until the zone enters the requested state or the timeout expires,
1375  * whichever happens first.  Return (-1) if operation timed out, time remaining
1376  * otherwise.
1377  */
1378 clock_t
1379 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
1380 {
1381 	clock_t timeleft = 0;
1382 
1383 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1384 
1385 	mutex_enter(&zone_status_lock);
1386 	while (zone->zone_status < status && timeleft != -1) {
1387 		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
1388 	}
1389 	mutex_exit(&zone_status_lock);
1390 	return (timeleft);
1391 }
1392 
1393 /*
1394  * Block until the zone enters the requested state, the current process is
1395  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
1396  * operation timed out, 0 if signaled, time remaining otherwise.
1397  */
1398 clock_t
1399 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
1400 {
1401 	clock_t timeleft = tim - lbolt;
1402 
1403 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1404 
1405 	mutex_enter(&zone_status_lock);
1406 	while (zone->zone_status < status) {
1407 		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
1408 		    tim);
1409 		if (timeleft <= 0)
1410 			break;
1411 	}
1412 	mutex_exit(&zone_status_lock);
1413 	return (timeleft);
1414 }
1415 
1416 /*
1417  * Zones have two reference counts: one for references from credential
1418  * structures (zone_cred_ref), and one (zone_ref) for everything else.
1419  * This is so we can allow a zone to be rebooted while there are still
1420  * outstanding cred references, since certain drivers cache dblks (which
1421  * implicitly results in cached creds).  We wait for zone_ref to drop to
1422  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
1423  * later freed when the zone_cred_ref drops to 0, though nothing other
1424  * than the zone id and privilege set should be accessed once the zone
1425  * is "dead".
1426  *
1427  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
1428  * to force halt/reboot to block waiting for the zone_cred_ref to drop
1429  * to 0.  This can be useful to flush out other sources of cached creds
1430  * that may be less innocuous than the driver case.
1431  */
1432 
/* Debug flag: non-zero makes ZONE_IS_UNREF() also require zone_cred_ref == 0. */
int zone_wait_for_cred = 0;
1434 
1435 static void
1436 zone_hold_locked(zone_t *z)
1437 {
1438 	ASSERT(MUTEX_HELD(&z->zone_lock));
1439 	z->zone_ref++;
1440 	ASSERT(z->zone_ref != 0);
1441 }
1442 
1443 void
1444 zone_hold(zone_t *z)
1445 {
1446 	mutex_enter(&z->zone_lock);
1447 	zone_hold_locked(z);
1448 	mutex_exit(&z->zone_lock);
1449 }
1450 
/*
 * If the non-cred ref count drops to 1 and either the cred ref count
 * is 0 or we aren't waiting for cred references, the zone is ready to
 * be destroyed.  The macro reads zone_ref and zone_cred_ref directly;
 * its users in zone_rele() and zone_cred_rele() evaluate it with the
 * zone's zone_lock held.
 */
#define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
	    (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
1458 
1459 void
1460 zone_rele(zone_t *z)
1461 {
1462 	boolean_t wakeup;
1463 
1464 	mutex_enter(&z->zone_lock);
1465 	ASSERT(z->zone_ref != 0);
1466 	z->zone_ref--;
1467 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
1468 		/* no more refs, free the structure */
1469 		mutex_exit(&z->zone_lock);
1470 		zone_free(z);
1471 		return;
1472 	}
1473 	/* signal zone_destroy so the zone can finish halting */
1474 	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
1475 	mutex_exit(&z->zone_lock);
1476 
1477 	if (wakeup) {
1478 		/*
1479 		 * Grabbing zonehash_lock here effectively synchronizes with
1480 		 * zone_destroy() to avoid missed signals.
1481 		 */
1482 		mutex_enter(&zonehash_lock);
1483 		cv_broadcast(&zone_destroy_cv);
1484 		mutex_exit(&zonehash_lock);
1485 	}
1486 }
1487 
1488 void
1489 zone_cred_hold(zone_t *z)
1490 {
1491 	mutex_enter(&z->zone_lock);
1492 	z->zone_cred_ref++;
1493 	ASSERT(z->zone_cred_ref != 0);
1494 	mutex_exit(&z->zone_lock);
1495 }
1496 
1497 void
1498 zone_cred_rele(zone_t *z)
1499 {
1500 	boolean_t wakeup;
1501 
1502 	mutex_enter(&z->zone_lock);
1503 	ASSERT(z->zone_cred_ref != 0);
1504 	z->zone_cred_ref--;
1505 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
1506 		/* no more refs, free the structure */
1507 		mutex_exit(&z->zone_lock);
1508 		zone_free(z);
1509 		return;
1510 	}
1511 	/*
1512 	 * If zone_destroy is waiting for the cred references to drain
1513 	 * out, and they have, signal it.
1514 	 */
1515 	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
1516 	    zone_status_get(z) >= ZONE_IS_DEAD);
1517 	mutex_exit(&z->zone_lock);
1518 
1519 	if (wakeup) {
1520 		/*
1521 		 * Grabbing zonehash_lock here effectively synchronizes with
1522 		 * zone_destroy() to avoid missed signals.
1523 		 */
1524 		mutex_enter(&zonehash_lock);
1525 		cv_broadcast(&zone_destroy_cv);
1526 		mutex_exit(&zonehash_lock);
1527 	}
1528 }
1529 
1530 void
1531 zone_task_hold(zone_t *z)
1532 {
1533 	mutex_enter(&z->zone_lock);
1534 	z->zone_ntasks++;
1535 	ASSERT(z->zone_ntasks != 0);
1536 	mutex_exit(&z->zone_lock);
1537 }
1538 
/*
 * Drop one task from the zone's task count and drive the zone state
 * machine when the count gets low:
 *
 *   - more than one task left: nothing else to do;
 *   - exactly one task left: if the zone is shutting down and the count
 *     has not changed in the meantime, the zone becomes ZONE_IS_EMPTY;
 *   - no tasks left: zsched has exited and the zone becomes ZONE_IS_DEAD.
 *
 * In the last two cases a temporary zone reference is taken so that the
 * zone_t stays valid after zone_lock is dropped; it is released on the
 * way out.
 */
void
zone_task_rele(zone_t *zone)
{
	uint_t refcnt;

	mutex_enter(&zone->zone_lock);
	ASSERT(zone->zone_ntasks != 0);
	refcnt = --zone->zone_ntasks;
	if (refcnt > 1)	{	/* Common case */
		mutex_exit(&zone->zone_lock);
		return;
	}
	zone_hold_locked(zone);	/* so we can use the zone_t later */
	mutex_exit(&zone->zone_lock);
	if (refcnt == 1) {
		/*
		 * See if the zone is shutting down.
		 */
		mutex_enter(&zone_status_lock);
		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
			goto out;
		}

		/*
		 * Make sure the ntasks didn't change since we
		 * dropped zone_lock.
		 */
		mutex_enter(&zone->zone_lock);
		if (refcnt != zone->zone_ntasks) {
			mutex_exit(&zone->zone_lock);
			goto out;
		}
		mutex_exit(&zone->zone_lock);

		/*
		 * No more user processes in the zone.  The zone is empty.
		 */
		zone_status_set(zone, ZONE_IS_EMPTY);
		goto out;
	}

	ASSERT(refcnt == 0);
	/*
	 * zsched has exited; the zone is dead.
	 */
	zone->zone_zsched = NULL;		/* paranoia */
	mutex_enter(&zone_status_lock);
	zone_status_set(zone, ZONE_IS_DEAD);
out:
	/* Every path reaching 'out' holds zone_status_lock. */
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
}
1591 
1592 zoneid_t
1593 getzoneid(void)
1594 {
1595 	return (curproc->p_zone->zone_id);
1596 }
1597 
1598 /*
1599  * Internal versions of zone_find_by_*().  These don't zone_hold() or
1600  * check the validity of a zone's state.
1601  */
1602 static zone_t *
1603 zone_find_all_by_id(zoneid_t zoneid)
1604 {
1605 	mod_hash_val_t hv;
1606 	zone_t *zone = NULL;
1607 
1608 	ASSERT(MUTEX_HELD(&zonehash_lock));
1609 
1610 	if (mod_hash_find(zonehashbyid,
1611 	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
1612 		zone = (zone_t *)hv;
1613 	return (zone);
1614 }
1615 
1616 static zone_t *
1617 zone_find_all_by_label(const ts_label_t *label)
1618 {
1619 	mod_hash_val_t hv;
1620 	zone_t *zone = NULL;
1621 
1622 	ASSERT(MUTEX_HELD(&zonehash_lock));
1623 
1624 	/*
1625 	 * zonehashbylabel is not maintained for unlabeled systems
1626 	 */
1627 	if (!is_system_labeled())
1628 		return (NULL);
1629 	if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
1630 		zone = (zone_t *)hv;
1631 	return (zone);
1632 }
1633 
1634 static zone_t *
1635 zone_find_all_by_name(char *name)
1636 {
1637 	mod_hash_val_t hv;
1638 	zone_t *zone = NULL;
1639 
1640 	ASSERT(MUTEX_HELD(&zonehash_lock));
1641 
1642 	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
1643 		zone = (zone_t *)hv;
1644 	return (zone);
1645 }
1646 
1647 /*
1648  * Public interface for looking up a zone by zoneid.  Only returns the zone if
1649  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
1650  * Caller must call zone_rele() once it is done with the zone.
1651  *
1652  * The zone may begin the zone_destroy() sequence immediately after this
1653  * function returns, but may be safely used until zone_rele() is called.
1654  */
1655 zone_t *
1656 zone_find_by_id(zoneid_t zoneid)
1657 {
1658 	zone_t *zone;
1659 	zone_status_t status;
1660 
1661 	mutex_enter(&zonehash_lock);
1662 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
1663 		mutex_exit(&zonehash_lock);
1664 		return (NULL);
1665 	}
1666 	status = zone_status_get(zone);
1667 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
1668 		/*
1669 		 * For all practical purposes the zone doesn't exist.
1670 		 */
1671 		mutex_exit(&zonehash_lock);
1672 		return (NULL);
1673 	}
1674 	zone_hold(zone);
1675 	mutex_exit(&zonehash_lock);
1676 	return (zone);
1677 }
1678 
1679 /*
1680  * Similar to zone_find_by_id, but using zone label as the key.
1681  */
1682 zone_t *
1683 zone_find_by_label(const ts_label_t *label)
1684 {
1685 	zone_t *zone;
1686 	zone_status_t status;
1687 
1688 	mutex_enter(&zonehash_lock);
1689 	if ((zone = zone_find_all_by_label(label)) == NULL) {
1690 		mutex_exit(&zonehash_lock);
1691 		return (NULL);
1692 	}
1693 
1694 	status = zone_status_get(zone);
1695 	if (status > ZONE_IS_DOWN) {
1696 		/*
1697 		 * For all practical purposes the zone doesn't exist.
1698 		 */
1699 		mutex_exit(&zonehash_lock);
1700 		return (NULL);
1701 	}
1702 	zone_hold(zone);
1703 	mutex_exit(&zonehash_lock);
1704 	return (zone);
1705 }
1706 
1707 /*
1708  * Similar to zone_find_by_id, but using zone name as the key.
1709  */
1710 zone_t *
1711 zone_find_by_name(char *name)
1712 {
1713 	zone_t *zone;
1714 	zone_status_t status;
1715 
1716 	mutex_enter(&zonehash_lock);
1717 	if ((zone = zone_find_all_by_name(name)) == NULL) {
1718 		mutex_exit(&zonehash_lock);
1719 		return (NULL);
1720 	}
1721 	status = zone_status_get(zone);
1722 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
1723 		/*
1724 		 * For all practical purposes the zone doesn't exist.
1725 		 */
1726 		mutex_exit(&zonehash_lock);
1727 		return (NULL);
1728 	}
1729 	zone_hold(zone);
1730 	mutex_exit(&zonehash_lock);
1731 	return (zone);
1732 }
1733 
1734 /*
1735  * Similar to zone_find_by_id(), using the path as a key.  For instance,
1736  * if there is a zone "foo" rooted at /foo/root, and the path argument
1737  * is "/foo/root/proc", it will return the held zone_t corresponding to
1738  * zone "foo".
1739  *
1740  * zone_find_by_path() always returns a non-NULL value, since at the
1741  * very least every path will be contained in the global zone.
1742  *
1743  * As with the other zone_find_by_*() functions, the caller is
1744  * responsible for zone_rele()ing the return value of this function.
1745  */
1746 zone_t *
1747 zone_find_by_path(const char *path)
1748 {
1749 	zone_t *zone;
1750 	zone_t *zret = NULL;
1751 	zone_status_t status;
1752 
1753 	if (path == NULL) {
1754 		/*
1755 		 * Call from rootconf().
1756 		 */
1757 		zone_hold(global_zone);
1758 		return (global_zone);
1759 	}
1760 	ASSERT(*path == '/');
1761 	mutex_enter(&zonehash_lock);
1762 	for (zone = list_head(&zone_active); zone != NULL;
1763 	    zone = list_next(&zone_active, zone)) {
1764 		if (ZONE_PATH_VISIBLE(path, zone))
1765 			zret = zone;
1766 	}
1767 	ASSERT(zret != NULL);
1768 	status = zone_status_get(zret);
1769 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
1770 		/*
1771 		 * Zone practically doesn't exist.
1772 		 */
1773 		zret = global_zone;
1774 	}
1775 	zone_hold(zret);
1776 	mutex_exit(&zonehash_lock);
1777 	return (zret);
1778 }
1779 
1780 /*
1781  * Get the number of cpus visible to this zone.  The system-wide global
1782  * 'ncpus' is returned if pools are disabled, the caller is in the
1783  * global zone, or a NULL zone argument is passed in.
1784  */
1785 int
1786 zone_ncpus_get(zone_t *zone)
1787 {
1788 	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
1789 
1790 	return (myncpus != 0 ? myncpus : ncpus);
1791 }
1792 
1793 /*
1794  * Get the number of online cpus visible to this zone.  The system-wide
1795  * global 'ncpus_online' is returned if pools are disabled, the caller
1796  * is in the global zone, or a NULL zone argument is passed in.
1797  */
1798 int
1799 zone_ncpus_online_get(zone_t *zone)
1800 {
1801 	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
1802 
1803 	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
1804 }
1805 
1806 /*
1807  * Return the pool to which the zone is currently bound.
1808  */
1809 pool_t *
1810 zone_pool_get(zone_t *zone)
1811 {
1812 	ASSERT(pool_lock_held());
1813 
1814 	return (zone->zone_pool);
1815 }
1816 
1817 /*
1818  * Set the zone's pool pointer and update the zone's visibility to match
1819  * the resources in the new pool.
1820  */
1821 void
1822 zone_pool_set(zone_t *zone, pool_t *pool)
1823 {
1824 	ASSERT(pool_lock_held());
1825 	ASSERT(MUTEX_HELD(&cpu_lock));
1826 
1827 	zone->zone_pool = pool;
1828 	zone_pset_set(zone, pool->pool_pset->pset_id);
1829 }
1830 
1831 /*
1832  * Return the cached value of the id of the processor set to which the
1833  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
1834  * facility is disabled.
1835  */
1836 psetid_t
1837 zone_pset_get(zone_t *zone)
1838 {
1839 	ASSERT(MUTEX_HELD(&cpu_lock));
1840 
1841 	return (zone->zone_psetid);
1842 }
1843 
1844 /*
1845  * Set the cached value of the id of the processor set to which the zone
1846  * is currently bound.  Also update the zone's visibility to match the
1847  * resources in the new processor set.
1848  */
1849 void
1850 zone_pset_set(zone_t *zone, psetid_t newpsetid)
1851 {
1852 	psetid_t oldpsetid;
1853 
1854 	ASSERT(MUTEX_HELD(&cpu_lock));
1855 	oldpsetid = zone_pset_get(zone);
1856 
1857 	if (oldpsetid == newpsetid)
1858 		return;
1859 	/*
1860 	 * Global zone sees all.
1861 	 */
1862 	if (zone != global_zone) {
1863 		zone->zone_psetid = newpsetid;
1864 		if (newpsetid != ZONE_PS_INVAL)
1865 			pool_pset_visibility_add(newpsetid, zone);
1866 		if (oldpsetid != ZONE_PS_INVAL)
1867 			pool_pset_visibility_remove(oldpsetid, zone);
1868 	}
1869 	/*
1870 	 * Disabling pools, so we should start using the global values
1871 	 * for ncpus and ncpus_online.
1872 	 */
1873 	if (newpsetid == ZONE_PS_INVAL) {
1874 		zone->zone_ncpus = 0;
1875 		zone->zone_ncpus_online = 0;
1876 	}
1877 }
1878 
1879 /*
1880  * Walk the list of active zones and issue the provided callback for
1881  * each of them.
1882  *
1883  * Caller must not be holding any locks that may be acquired under
1884  * zonehash_lock.  See comment at the beginning of the file for a list of
1885  * common locks and their interactions with zones.
1886  */
1887 int
1888 zone_walk(int (*cb)(zone_t *, void *), void *data)
1889 {
1890 	zone_t *zone;
1891 	int ret = 0;
1892 	zone_status_t status;
1893 
1894 	mutex_enter(&zonehash_lock);
1895 	for (zone = list_head(&zone_active); zone != NULL;
1896 	    zone = list_next(&zone_active, zone)) {
1897 		/*
1898 		 * Skip zones that shouldn't be externally visible.
1899 		 */
1900 		status = zone_status_get(zone);
1901 		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
1902 			continue;
1903 		/*
1904 		 * Bail immediately if any callback invocation returns a
1905 		 * non-zero value.
1906 		 */
1907 		ret = (*cb)(zone, data);
1908 		if (ret != 0)
1909 			break;
1910 	}
1911 	mutex_exit(&zonehash_lock);
1912 	return (ret);
1913 }
1914 
/*
 * Resolve the user-supplied path 'upath' and install it as the zone's
 * root.  On success the zone keeps a reference to the root vnode in
 * zone_rootvp, and zone_rootpath holds a newly allocated copy of the
 * resolved path with a trailing '/' appended; zone_rootpathlen is the
 * allocation size, terminating NUL included.  A root path ending in
 * "/lu/" additionally marks the zone with ZF_IS_SCRATCH.
 *
 * Returns 0 on success or an errno value.  The lookup is retried while it
 * fails with ESTALE, and ESTALE is returned once the retry budget is
 * exhausted.
 */
static int
zone_set_root(zone_t *zone, const char *upath)
{
	vnode_t *vp;
	int trycount;
	int error = 0;
	char *path;
	struct pathname upn, pn;
	size_t pathlen;

	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
		return (error);

	pn_alloc(&pn);

	/* prevent infinite loop */
	trycount = 10;
	for (;;) {
		if (--trycount <= 0) {
			error = ESTALE;
			goto out;
		}

		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
			/*
			 * VOP_ACCESS() may cover 'vp' with a new
			 * filesystem, if 'vp' is an autoFS vnode.
			 * Get the new 'vp' if so.
			 */
			if ((error = VOP_ACCESS(vp, VEXEC, 0, CRED())) == 0 &&
			    (vp->v_vfsmountedhere == NULL ||
			    (error = traverse(&vp)) == 0)) {
				/* Room for the path, a trailing '/' and NUL. */
				pathlen = pn.pn_pathlen + 2;
				path = kmem_alloc(pathlen, KM_SLEEP);
				(void) strncpy(path, pn.pn_path,
				    pn.pn_pathlen + 1);
				path[pathlen - 2] = '/';
				path[pathlen - 1] = '\0';
				pn_free(&pn);
				pn_free(&upn);

				/* Success! */
				break;
			}
			VN_RELE(vp);
		}
		if (error != ESTALE)
			goto out;
	}

	ASSERT(error == 0);
	zone->zone_rootvp = vp;		/* we hold a reference to vp */
	zone->zone_rootpath = path;
	zone->zone_rootpathlen = pathlen;
	/* path + pathlen - 5 addresses the last four characters before NUL. */
	if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
		zone->zone_flags |= ZF_IS_SCRATCH;
	return (0);

out:
	/* Failure exit: both pathname buffers are still allocated here. */
	pn_free(&pn);
	pn_free(&upn);
	return (error);
}
1978 
1979 #define	isalnum(c)	(((c) >= '0' && (c) <= '9') || \
1980 			((c) >= 'a' && (c) <= 'z') || \
1981 			((c) >= 'A' && (c) <= 'Z'))
1982 
1983 static int
1984 zone_set_name(zone_t *zone, const char *uname)
1985 {
1986 	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
1987 	size_t len;
1988 	int i, err;
1989 
1990 	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
1991 		kmem_free(kname, ZONENAME_MAX);
1992 		return (err);	/* EFAULT or ENAMETOOLONG */
1993 	}
1994 
1995 	/* must be less than ZONENAME_MAX */
1996 	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
1997 		kmem_free(kname, ZONENAME_MAX);
1998 		return (EINVAL);
1999 	}
2000 
2001 	/*
2002 	 * Name must start with an alphanumeric and must contain only
2003 	 * alphanumerics, '-', '_' and '.'.
2004 	 */
2005 	if (!isalnum(kname[0])) {
2006 		kmem_free(kname, ZONENAME_MAX);
2007 		return (EINVAL);
2008 	}
2009 	for (i = 1; i < len - 1; i++) {
2010 		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
2011 		    kname[i] != '.') {
2012 			kmem_free(kname, ZONENAME_MAX);
2013 			return (EINVAL);
2014 		}
2015 	}
2016 
2017 	zone->zone_name = kname;
2018 	return (0);
2019 }
2020 
2021 /*
2022  * Similar to thread_create(), but makes sure the thread is in the appropriate
2023  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
2024  */
2025 /*ARGSUSED*/
2026 kthread_t *
2027 zthread_create(
2028     caddr_t stk,
2029     size_t stksize,
2030     void (*proc)(),
2031     void *arg,
2032     size_t len,
2033     pri_t pri)
2034 {
2035 	kthread_t *t;
2036 	zone_t *zone = curproc->p_zone;
2037 	proc_t *pp = zone->zone_zsched;
2038 
2039 	zone_hold(zone);	/* Reference to be dropped when thread exits */
2040 
2041 	/*
2042 	 * No-one should be trying to create threads if the zone is shutting
2043 	 * down and there aren't any kernel threads around.  See comment
2044 	 * in zthread_exit().
2045 	 */
2046 	ASSERT(!(zone->zone_kthreads == NULL &&
2047 	    zone_status_get(zone) >= ZONE_IS_EMPTY));
2048 	/*
2049 	 * Create a thread, but don't let it run until we've finished setting
2050 	 * things up.
2051 	 */
2052 	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
2053 	ASSERT(t->t_forw == NULL);
2054 	mutex_enter(&zone_status_lock);
2055 	if (zone->zone_kthreads == NULL) {
2056 		t->t_forw = t->t_back = t;
2057 	} else {
2058 		kthread_t *tx = zone->zone_kthreads;
2059 
2060 		t->t_forw = tx;
2061 		t->t_back = tx->t_back;
2062 		tx->t_back->t_forw = t;
2063 		tx->t_back = t;
2064 	}
2065 	zone->zone_kthreads = t;
2066 	mutex_exit(&zone_status_lock);
2067 
2068 	mutex_enter(&pp->p_lock);
2069 	t->t_proc_flag |= TP_ZTHREAD;
2070 	project_rele(t->t_proj);
2071 	t->t_proj = project_hold(pp->p_task->tk_proj);
2072 
2073 	/*
2074 	 * Setup complete, let it run.
2075 	 */
2076 	thread_lock(t);
2077 	t->t_schedflag |= TS_ALLSTART;
2078 	setrun_locked(t);
2079 	thread_unlock(t);
2080 
2081 	mutex_exit(&pp->p_lock);
2082 
2083 	return (t);
2084 }
2085 
2086 /*
2087  * Similar to thread_exit().  Must be called by threads created via
2088  * zthread_exit().
2089  */
2090 void
2091 zthread_exit(void)
2092 {
2093 	kthread_t *t = curthread;
2094 	proc_t *pp = curproc;
2095 	zone_t *zone = pp->p_zone;
2096 
2097 	mutex_enter(&zone_status_lock);
2098 
2099 	/*
2100 	 * Reparent to p0
2101 	 */
2102 	kpreempt_disable();
2103 	mutex_enter(&pp->p_lock);
2104 	t->t_proc_flag &= ~TP_ZTHREAD;
2105 	t->t_procp = &p0;
2106 	hat_thread_exit(t);
2107 	mutex_exit(&pp->p_lock);
2108 	kpreempt_enable();
2109 
2110 	if (t->t_back == t) {
2111 		ASSERT(t->t_forw == t);
2112 		/*
2113 		 * If the zone is empty, once the thread count
2114 		 * goes to zero no further kernel threads can be
2115 		 * created.  This is because if the creator is a process
2116 		 * in the zone, then it must have exited before the zone
2117 		 * state could be set to ZONE_IS_EMPTY.
2118 		 * Otherwise, if the creator is a kernel thread in the
2119 		 * zone, the thread count is non-zero.
2120 		 *
2121 		 * This really means that non-zone kernel threads should
2122 		 * not create zone kernel threads.
2123 		 */
2124 		zone->zone_kthreads = NULL;
2125 		if (zone_status_get(zone) == ZONE_IS_EMPTY) {
2126 			zone_status_set(zone, ZONE_IS_DOWN);
2127 		}
2128 	} else {
2129 		t->t_forw->t_back = t->t_back;
2130 		t->t_back->t_forw = t->t_forw;
2131 		if (zone->zone_kthreads == t)
2132 			zone->zone_kthreads = t->t_forw;
2133 	}
2134 	mutex_exit(&zone_status_lock);
2135 	zone_rele(zone);
2136 	thread_exit();
2137 	/* NOTREACHED */
2138 }
2139 
2140 static void
2141 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
2142 {
2143 	vnode_t *oldvp;
2144 
2145 	/* we're going to hold a reference here to the directory */
2146 	VN_HOLD(vp);
2147 
2148 #ifdef C2_AUDIT
2149 	if (audit_active)	/* update abs cwd/root path see c2audit.c */
2150 		audit_chdirec(vp, vpp);
2151 #endif
2152 
2153 	mutex_enter(&pp->p_lock);
2154 	oldvp = *vpp;
2155 	*vpp = vp;
2156 	mutex_exit(&pp->p_lock);
2157 	if (oldvp != NULL)
2158 		VN_RELE(oldvp);
2159 }
2160 
2161 /*
2162  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
2163  */
2164 static int
2165 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
2166 {
2167 	nvpair_t *nvp = NULL;
2168 	boolean_t priv_set = B_FALSE;
2169 	boolean_t limit_set = B_FALSE;
2170 	boolean_t action_set = B_FALSE;
2171 
2172 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
2173 		const char *name;
2174 		uint64_t ui64;
2175 
2176 		name = nvpair_name(nvp);
2177 		if (nvpair_type(nvp) != DATA_TYPE_UINT64)
2178 			return (EINVAL);
2179 		(void) nvpair_value_uint64(nvp, &ui64);
2180 		if (strcmp(name, "privilege") == 0) {
2181 			/*
2182 			 * Currently only privileged values are allowed, but
2183 			 * this may change in the future.
2184 			 */
2185 			if (ui64 != RCPRIV_PRIVILEGED)
2186 				return (EINVAL);
2187 			rv->rcv_privilege = ui64;
2188 			priv_set = B_TRUE;
2189 		} else if (strcmp(name, "limit") == 0) {
2190 			rv->rcv_value = ui64;
2191 			limit_set = B_TRUE;
2192 		} else if (strcmp(name, "action") == 0) {
2193 			if (ui64 != RCTL_LOCAL_NOACTION &&
2194 			    ui64 != RCTL_LOCAL_DENY)
2195 				return (EINVAL);
2196 			rv->rcv_flagaction = ui64;
2197 			action_set = B_TRUE;
2198 		} else {
2199 			return (EINVAL);
2200 		}
2201 	}
2202 
2203 	if (!(priv_set && limit_set && action_set))
2204 		return (EINVAL);
2205 	rv->rcv_action_signal = 0;
2206 	rv->rcv_action_recipient = NULL;
2207 	rv->rcv_action_recip_pid = -1;
2208 	rv->rcv_firing_time = 0;
2209 
2210 	return (0);
2211 }
2212 
2213 /*
2214  * Non-global zone version of start_init.
2215  */
2216 void
2217 zone_start_init(void)
2218 {
2219 	proc_t *p = ttoproc(curthread);
2220 
2221 	ASSERT(!INGLOBALZONE(curproc));
2222 
2223 	/*
2224 	 * We maintain zone_boot_err so that we can return the cause of the
2225 	 * failure back to the caller of the zone_boot syscall.
2226 	 */
2227 	p->p_zone->zone_boot_err = start_init_common();
2228 
2229 	mutex_enter(&zone_status_lock);
2230 	if (p->p_zone->zone_boot_err != 0) {
2231 		/*
2232 		 * Make sure we are still in the booting state-- we could have
2233 		 * raced and already be shutting down, or even further along.
2234 		 */
2235 		if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING)
2236 			zone_status_set(p->p_zone, ZONE_IS_SHUTTING_DOWN);
2237 		mutex_exit(&zone_status_lock);
2238 		/* It's gone bad, dispose of the process */
2239 		if (proc_exit(CLD_EXITED, p->p_zone->zone_boot_err) != 0) {
2240 			mutex_enter(&p->p_lock);
2241 			ASSERT(p->p_flag & SEXITLWPS);
2242 			lwp_exit();
2243 		}
2244 	} else {
2245 		if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING)
2246 			zone_status_set(p->p_zone, ZONE_IS_RUNNING);
2247 		mutex_exit(&zone_status_lock);
2248 		/* cause the process to return to userland. */
2249 		lwp_rtt();
2250 	}
2251 }
2252 
2253 struct zsched_arg {
2254 	zone_t *zone;
2255 	nvlist_t *nvlist;
2256 };
2257 
2258 /*
2259  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
2260  * anything to do with scheduling, but rather with the fact that
2261  * per-zone kernel threads are parented to zsched, just like regular
2262  * kernel threads are parented to sched (p0).
2263  *
2264  * zsched is also responsible for launching init for the zone.
2265  */
2266 static void
2267 zsched(void *arg)
2268 {
2269 	struct zsched_arg *za = arg;
2270 	proc_t *pp = curproc;
2271 	proc_t *initp = proc_init;
2272 	zone_t *zone = za->zone;
2273 	cred_t *cr, *oldcred;
2274 	rctl_set_t *set;
2275 	rctl_alloc_gp_t *gp;
2276 	contract_t *ct = NULL;
2277 	task_t *tk, *oldtk;
2278 	rctl_entity_p_t e;
2279 	kproject_t *pj;
2280 
2281 	nvlist_t *nvl = za->nvlist;
2282 	nvpair_t *nvp = NULL;
2283 
2284 	bcopy("zsched", u.u_psargs, sizeof ("zsched"));
2285 	bcopy("zsched", u.u_comm, sizeof ("zsched"));
2286 	u.u_argc = 0;
2287 	u.u_argv = NULL;
2288 	u.u_envp = NULL;
2289 	closeall(P_FINFO(pp));
2290 
2291 	/*
2292 	 * We are this zone's "zsched" process.  As the zone isn't generally
2293 	 * visible yet we don't need to grab any locks before initializing its
2294 	 * zone_proc pointer.
2295 	 */
2296 	zone_hold(zone);  /* this hold is released by zone_destroy() */
2297 	zone->zone_zsched = pp;
2298 	mutex_enter(&pp->p_lock);
2299 	pp->p_zone = zone;
2300 	mutex_exit(&pp->p_lock);
2301 
2302 	/*
2303 	 * Disassociate process from its 'parent'; parent ourselves to init
2304 	 * (pid 1) and change other values as needed.
2305 	 */
2306 	sess_create();
2307 
2308 	mutex_enter(&pidlock);
2309 	proc_detach(pp);
2310 	pp->p_ppid = 1;
2311 	pp->p_flag |= SZONETOP;
2312 	pp->p_ancpid = 1;
2313 	pp->p_parent = initp;
2314 	pp->p_psibling = NULL;
2315 	if (initp->p_child)
2316 		initp->p_child->p_psibling = pp;
2317 	pp->p_sibling = initp->p_child;
2318 	initp->p_child = pp;
2319 
2320 	/* Decrement what newproc() incremented. */
2321 	upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
2322 	/*
2323 	 * Our credentials are about to become kcred-like, so we don't care
2324 	 * about the caller's ruid.
2325 	 */
2326 	upcount_inc(crgetruid(kcred), zone->zone_id);
2327 	mutex_exit(&pidlock);
2328 
2329 	/*
2330 	 * getting out of global zone, so decrement lwp counts
2331 	 */
2332 	pj = pp->p_task->tk_proj;
2333 	mutex_enter(&global_zone->zone_nlwps_lock);
2334 	pj->kpj_nlwps -= pp->p_lwpcnt;
2335 	global_zone->zone_nlwps -= pp->p_lwpcnt;
2336 	mutex_exit(&global_zone->zone_nlwps_lock);
2337 
2338 	/*
2339 	 * Create and join a new task in project '0' of this zone.
2340 	 *
2341 	 * We don't need to call holdlwps() since we know we're the only lwp in
2342 	 * this process.
2343 	 *
2344 	 * task_join() returns with p_lock held.
2345 	 */
2346 	tk = task_create(0, zone);
2347 	mutex_enter(&cpu_lock);
2348 	oldtk = task_join(tk, 0);
2349 	mutex_exit(&curproc->p_lock);
2350 	mutex_exit(&cpu_lock);
2351 	task_rele(oldtk);
2352 
2353 	/*
2354 	 * add lwp counts to zsched's zone, and increment project's task count
2355 	 * due to the task created in the above tasksys_settaskid
2356 	 */
2357 	pj = pp->p_task->tk_proj;
2358 	mutex_enter(&zone->zone_nlwps_lock);
2359 	pj->kpj_nlwps += pp->p_lwpcnt;
2360 	pj->kpj_ntasks += 1;
2361 	zone->zone_nlwps += pp->p_lwpcnt;
2362 	mutex_exit(&zone->zone_nlwps_lock);
2363 
2364 	/*
2365 	 * The process was created by a process in the global zone, hence the
2366 	 * credentials are wrong.  We might as well have kcred-ish credentials.
2367 	 */
2368 	cr = zone->zone_kcred;
2369 	crhold(cr);
2370 	mutex_enter(&pp->p_crlock);
2371 	oldcred = pp->p_cred;
2372 	pp->p_cred = cr;
2373 	mutex_exit(&pp->p_crlock);
2374 	crfree(oldcred);
2375 
2376 	/*
2377 	 * Hold credentials again (for thread)
2378 	 */
2379 	crhold(cr);
2380 
2381 	/*
2382 	 * p_lwpcnt can't change since this is a kernel process.
2383 	 */
2384 	crset(pp, cr);
2385 
2386 	/*
2387 	 * Chroot
2388 	 */
2389 	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
2390 	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
2391 
2392 	/*
2393 	 * Initialize zone's rctl set.
2394 	 */
2395 	set = rctl_set_create();
2396 	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2397 	mutex_enter(&pp->p_lock);
2398 	e.rcep_p.zone = zone;
2399 	e.rcep_t = RCENTITY_ZONE;
2400 	zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
2401 	mutex_exit(&pp->p_lock);
2402 	rctl_prealloc_destroy(gp);
2403 
2404 	/*
2405 	 * Apply the rctls passed in to zone_create().  This is basically a list
2406 	 * assignment: all of the old values are removed and the new ones
2407 	 * inserted.  That is, if an empty list is passed in, all values are
2408 	 * removed.
2409 	 */
2410 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
2411 		rctl_dict_entry_t *rde;
2412 		rctl_hndl_t hndl;
2413 		char *name;
2414 		nvlist_t **nvlarray;
2415 		uint_t i, nelem;
2416 		int error;	/* For ASSERT()s */
2417 
2418 		name = nvpair_name(nvp);
2419 		hndl = rctl_hndl_lookup(name);
2420 		ASSERT(hndl != -1);
2421 		rde = rctl_dict_lookup_hndl(hndl);
2422 		ASSERT(rde != NULL);
2423 
2424 		for (; /* ever */; ) {
2425 			rctl_val_t oval;
2426 
2427 			mutex_enter(&pp->p_lock);
2428 			error = rctl_local_get(hndl, NULL, &oval, pp);
2429 			mutex_exit(&pp->p_lock);
2430 			ASSERT(error == 0);	/* Can't fail for RCTL_FIRST */
2431 			ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
2432 			if (oval.rcv_privilege == RCPRIV_SYSTEM)
2433 				break;
2434 			mutex_enter(&pp->p_lock);
2435 			error = rctl_local_delete(hndl, &oval, pp);
2436 			mutex_exit(&pp->p_lock);
2437 			ASSERT(error == 0);
2438 		}
2439 		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
2440 		ASSERT(error == 0);
2441 		for (i = 0; i < nelem; i++) {
2442 			rctl_val_t *nvalp;
2443 
2444 			nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2445 			error = nvlist2rctlval(nvlarray[i], nvalp);
2446 			ASSERT(error == 0);
2447 			/*
2448 			 * rctl_local_insert can fail if the value being
2449 			 * inserted is a duplicate; this is OK.
2450 			 */
2451 			mutex_enter(&pp->p_lock);
2452 			if (rctl_local_insert(hndl, nvalp, pp) != 0)
2453 				kmem_cache_free(rctl_val_cache, nvalp);
2454 			mutex_exit(&pp->p_lock);
2455 		}
2456 	}
2457 	/*
2458 	 * Tell the world that we're done setting up.
2459 	 *
2460 	 * At this point we want to set the zone status to ZONE_IS_READY
2461 	 * and atomically set the zone's processor set visibility.  Once
2462 	 * we drop pool_lock() this zone will automatically get updated
2463 	 * to reflect any future changes to the pools configuration.
2464 	 */
2465 	pool_lock();
2466 	mutex_enter(&cpu_lock);
2467 	mutex_enter(&zonehash_lock);
2468 	zone_uniqid(zone);
2469 	zone_zsd_configure(zone);
2470 	if (pool_state == POOL_ENABLED)
2471 		zone_pset_set(zone, pool_default->pool_pset->pset_id);
2472 	mutex_enter(&zone_status_lock);
2473 	ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2474 	zone_status_set(zone, ZONE_IS_READY);
2475 	mutex_exit(&zone_status_lock);
2476 	mutex_exit(&zonehash_lock);
2477 	mutex_exit(&cpu_lock);
2478 	pool_unlock();
2479 
2480 	/*
2481 	 * Once we see the zone transition to the ZONE_IS_BOOTING state,
2482 	 * we launch init, and set the state to running.
2483 	 */
2484 	zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
2485 
2486 	if (zone_status_get(zone) == ZONE_IS_BOOTING) {
2487 		id_t cid;
2488 
2489 		/*
2490 		 * Ok, this is a little complicated.  We need to grab the
2491 		 * zone's pool's scheduling class ID; note that by now, we
2492 		 * are already bound to a pool if we need to be (zoneadmd
2493 		 * will have done that to us while we're in the READY
2494 		 * state).  *But* the scheduling class for the zone's 'init'
2495 		 * must be explicitly passed to newproc, which doesn't
2496 		 * respect pool bindings.
2497 		 *
2498 		 * We hold the pool_lock across the call to newproc() to
2499 		 * close the obvious race: the pool's scheduling class
2500 		 * could change before we manage to create the LWP with
2501 		 * classid 'cid'.
2502 		 */
2503 		pool_lock();
2504 		cid = pool_get_class(zone->zone_pool);
2505 		if (cid == -1)
2506 			cid = defaultcid;
2507 
2508 		/*
2509 		 * If this fails, zone_boot will ultimately fail.  The
2510 		 * state of the zone will be set to SHUTTING_DOWN-- userland
2511 		 * will have to tear down the zone, and fail, or try again.
2512 		 */
2513 		if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
2514 		    minclsyspri - 1, &ct)) != 0) {
2515 			mutex_enter(&zone_status_lock);
2516 			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
2517 			mutex_exit(&zone_status_lock);
2518 		}
2519 		pool_unlock();
2520 	}
2521 
2522 	/*
2523 	 * Wait for zone_destroy() to be called.  This is what we spend
2524 	 * most of our life doing.
2525 	 */
2526 	zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
2527 
2528 	if (ct)
2529 		/*
2530 		 * At this point the process contract should be empty.
2531 		 * (Though if it isn't, it's not the end of the world.)
2532 		 */
2533 		VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
2534 
2535 	/*
2536 	 * Allow kcred to be freed when all referring processes
2537 	 * (including this one) go away.  We can't just do this in
2538 	 * zone_free because we need to wait for the zone_cred_ref to
2539 	 * drop to 0 before calling zone_free, and the existence of
2540 	 * zone_kcred will prevent that.  Thus, we call crfree here to
2541 	 * balance the crdup in zone_create.  The crhold calls earlier
2542 	 * in zsched will be dropped when the thread and process exit.
2543 	 */
2544 	crfree(zone->zone_kcred);
2545 	zone->zone_kcred = NULL;
2546 
2547 	exit(CLD_EXITED, 0);
2548 }
2549 
2550 /*
2551  * Helper function to determine if there are any submounts of the
2552  * provided path.  Used to make sure the zone doesn't "inherit" any
2553  * mounts from before it is created.
2554  */
2555 static uint_t
2556 zone_mount_count(const char *rootpath)
2557 {
2558 	vfs_t *vfsp;
2559 	uint_t count = 0;
2560 	size_t rootpathlen = strlen(rootpath);
2561 
2562 	/*
2563 	 * Holding zonehash_lock prevents race conditions with
2564 	 * vfs_list_add()/vfs_list_remove() since we serialize with
2565 	 * zone_find_by_path().
2566 	 */
2567 	ASSERT(MUTEX_HELD(&zonehash_lock));
2568 	/*
2569 	 * The rootpath must end with a '/'
2570 	 */
2571 	ASSERT(rootpath[rootpathlen - 1] == '/');
2572 
2573 	/*
2574 	 * This intentionally does not count the rootpath itself if that
2575 	 * happens to be a mount point.
2576 	 */
2577 	vfs_list_read_lock();
2578 	vfsp = rootvfs;
2579 	do {
2580 		if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
2581 		    rootpathlen) == 0)
2582 			count++;
2583 		vfsp = vfsp->vfs_next;
2584 	} while (vfsp != rootvfs);
2585 	vfs_list_unlock();
2586 	return (count);
2587 }
2588 
2589 /*
2590  * Helper function to make sure that a zone created on 'rootpath'
2591  * wouldn't end up containing other zones' rootpaths.
2592  */
2593 static boolean_t
2594 zone_is_nested(const char *rootpath)
2595 {
2596 	zone_t *zone;
2597 	size_t rootpathlen = strlen(rootpath);
2598 	size_t len;
2599 
2600 	ASSERT(MUTEX_HELD(&zonehash_lock));
2601 
2602 	for (zone = list_head(&zone_active); zone != NULL;
2603 	    zone = list_next(&zone_active, zone)) {
2604 		if (zone == global_zone)
2605 			continue;
2606 		len = strlen(zone->zone_rootpath);
2607 		if (strncmp(rootpath, zone->zone_rootpath,
2608 		    MIN(rootpathlen, len)) == 0)
2609 			return (B_TRUE);
2610 	}
2611 	return (B_FALSE);
2612 }
2613 
2614 static int
2615 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
2616     size_t zone_privssz)
2617 {
2618 	priv_set_t *privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2619 
2620 	if (zone_privssz < sizeof (priv_set_t))
2621 		return (set_errno(ENOMEM));
2622 
2623 	if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
2624 		kmem_free(privs, sizeof (priv_set_t));
2625 		return (EFAULT);
2626 	}
2627 
2628 	zone->zone_privset = privs;
2629 	return (0);
2630 }
2631 
2632 /*
2633  * We make creative use of nvlists to pass in rctls from userland.  The list is
2634  * a list of the following structures:
2635  *
2636  * (name = rctl_name, value = nvpair_list_array)
2637  *
2638  * Where each element of the nvpair_list_array is of the form:
2639  *
2640  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
2641  * 	(name = "limit", value = uint64_t),
2642  * 	(name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
2643  */
2644 static int
2645 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
2646 {
2647 	nvpair_t *nvp = NULL;
2648 	nvlist_t *nvl = NULL;
2649 	char *kbuf;
2650 	int error;
2651 	rctl_val_t rv;
2652 
2653 	*nvlp = NULL;
2654 
2655 	if (buflen == 0)
2656 		return (0);
2657 
2658 	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
2659 		return (ENOMEM);
2660 	if (copyin(ubuf, kbuf, buflen)) {
2661 		error = EFAULT;
2662 		goto out;
2663 	}
2664 	if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
2665 		/*
2666 		 * nvl may have been allocated/free'd, but the value set to
2667 		 * non-NULL, so we reset it here.
2668 		 */
2669 		nvl = NULL;
2670 		error = EINVAL;
2671 		goto out;
2672 	}
2673 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
2674 		rctl_dict_entry_t *rde;
2675 		rctl_hndl_t hndl;
2676 		nvlist_t **nvlarray;
2677 		uint_t i, nelem;
2678 		char *name;
2679 
2680 		error = EINVAL;
2681 		name = nvpair_name(nvp);
2682 		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
2683 		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
2684 			goto out;
2685 		}
2686 		if ((hndl = rctl_hndl_lookup(name)) == -1) {
2687 			goto out;
2688 		}
2689 		rde = rctl_dict_lookup_hndl(hndl);
2690 		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
2691 		ASSERT(error == 0);
2692 		for (i = 0; i < nelem; i++) {
2693 			if (error = nvlist2rctlval(nvlarray[i], &rv))
2694 				goto out;
2695 		}
2696 		if (rctl_invalid_value(rde, &rv)) {
2697 			error = EINVAL;
2698 			goto out;
2699 		}
2700 	}
2701 	error = 0;
2702 	*nvlp = nvl;
2703 out:
2704 	kmem_free(kbuf, buflen);
2705 	if (error && nvl != NULL)
2706 		nvlist_free(nvl);
2707 	return (error);
2708 }
2709 
/*
 * Fail a zone_create() call with errno 'er_error'.  If 'er_out' is
 * non-NULL, the extended error code 'er_ext' is first copied out to that
 * user address; should that copyout fail, the call fails with EFAULT
 * instead.  Returns the value of set_errno().
 */
int
zone_create_error(int er_error, int er_ext, int *er_out)
{
	if (er_out != NULL && copyout(&er_ext, er_out, sizeof (int)) != 0)
		return (set_errno(EFAULT));

	return (set_errno(er_error));
}
2719 
2720 static int
2721 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
2722 {
2723 	ts_label_t *tsl;
2724 	bslabel_t blab;
2725 
2726 	/* Get label from user */
2727 	if (copyin(lab, &blab, sizeof (blab)) != 0)
2728 		return (EFAULT);
2729 	tsl = labelalloc(&blab, doi, KM_NOSLEEP);
2730 	if (tsl == NULL)
2731 		return (ENOMEM);
2732 
2733 	zone->zone_slabel = tsl;
2734 	return (0);
2735 }
2736 
2737 /*
2738  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
2739  */
2740 static int
2741 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
2742 {
2743 	char *kbuf;
2744 	char *dataset, *next;
2745 	zone_dataset_t *zd;
2746 	size_t len;
2747 
2748 	if (ubuf == NULL || buflen == 0)
2749 		return (0);
2750 
2751 	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
2752 		return (ENOMEM);
2753 
2754 	if (copyin(ubuf, kbuf, buflen) != 0) {
2755 		kmem_free(kbuf, buflen);
2756 		return (EFAULT);
2757 	}
2758 
2759 	dataset = next = kbuf;
2760 	for (;;) {
2761 		zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
2762 
2763 		next = strchr(dataset, ',');
2764 
2765 		if (next == NULL)
2766 			len = strlen(dataset);
2767 		else
2768 			len = next - dataset;
2769 
2770 		zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
2771 		bcopy(dataset, zd->zd_dataset, len);
2772 		zd->zd_dataset[len] = '\0';
2773 
2774 		list_insert_head(&zone->zone_datasets, zd);
2775 
2776 		if (next == NULL)
2777 			break;
2778 
2779 		dataset = next + 1;
2780 	}
2781 
2782 	kmem_free(kbuf, buflen);
2783 	return (0);
2784 }
2785 
2786 /*
2787  * System call to create/initialize a new zone named 'zone_name', rooted
2788  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
2789  * and initialized with the zone-wide rctls described in 'rctlbuf', and
2790  * with labeling set by 'match', 'doi', and 'label'.
2791  *
2792  * If extended error is non-null, we may use it to return more detailed
2793  * error information.
2794  */
2795 static zoneid_t
2796 zone_create(const char *zone_name, const char *zone_root,
2797     const priv_set_t *zone_privs, size_t zone_privssz,
2798     caddr_t rctlbuf, size_t rctlbufsz,
2799     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
2800     int match, uint32_t doi, const bslabel_t *label)
2801 {
2802 	struct zsched_arg zarg;
2803 	nvlist_t *rctls = NULL;
2804 	proc_t *pp = curproc;
2805 	zone_t *zone, *ztmp;
2806 	zoneid_t zoneid;
2807 	int error;
2808 	int error2 = 0;
2809 	char *str;
2810 	cred_t *zkcr;
2811 	boolean_t insert_label_hash;
2812 
2813 	if (secpolicy_zone_config(CRED()) != 0)
2814 		return (set_errno(EPERM));
2815 
2816 	/* can't boot zone from within chroot environment */
2817 	if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
2818 		return (zone_create_error(ENOTSUP, ZE_CHROOTED,
2819 		    extended_error));
2820 
2821 	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
2822 	zoneid = zone->zone_id = id_alloc(zoneid_space);
2823 	zone->zone_status = ZONE_IS_UNINITIALIZED;
2824 	zone->zone_pool = pool_default;
2825 	zone->zone_pool_mod = gethrtime();
2826 	zone->zone_psetid = ZONE_PS_INVAL;
2827 	zone->zone_ncpus = 0;
2828 	zone->zone_ncpus_online = 0;
2829 	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
2830 	mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2831 	cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
2832 	list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
2833 	    offsetof(struct zsd_entry, zsd_linkage));
2834 	list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
2835 	    offsetof(zone_dataset_t, zd_linkage));
2836 	rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
2837 
2838 	if ((error = zone_set_name(zone, zone_name)) != 0) {
2839 		zone_free(zone);
2840 		return (zone_create_error(error, 0, extended_error));
2841 	}
2842 
2843 	if ((error = zone_set_root(zone, zone_root)) != 0) {
2844 		zone_free(zone);
2845 		return (zone_create_error(error, 0, extended_error));
2846 	}
2847 	if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
2848 		zone_free(zone);
2849 		return (zone_create_error(error, 0, extended_error));
2850 	}
2851 
2852 	/* initialize node name to be the same as zone name */
2853 	zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
2854 	(void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
2855 	zone->zone_nodename[_SYS_NMLN - 1] = '\0';
2856 
2857 	zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
2858 	zone->zone_domain[0] = '\0';
2859 	zone->zone_shares = 1;
2860 	zone->zone_bootargs = NULL;
2861 	zone->zone_initname =
2862 	    kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
2863 	(void) strcpy(zone->zone_initname, zone_default_initname);
2864 
2865 	/*
2866 	 * Zsched initializes the rctls.
2867 	 */
2868 	zone->zone_rctls = NULL;
2869 
2870 	if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
2871 		zone_free(zone);
2872 		return (zone_create_error(error, 0, extended_error));
2873 	}
2874 
2875 	if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
2876 		zone_free(zone);
2877 		return (set_errno(error));
2878 	}
2879 
2880 	/*
2881 	 * Read in the trusted system parameters:
2882 	 * match flag and sensitivity label.
2883 	 */
2884 	zone->zone_match = match;
2885 	if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
2886 		error = zone_set_label(zone, label, doi);
2887 		if (error != 0) {
2888 			zone_free(zone);
2889 			return (set_errno(error));
2890 		}
2891 		insert_label_hash = B_TRUE;
2892 	} else {
2893 		/* all zones get an admin_low label if system is not labeled */
2894 		zone->zone_slabel = l_admin_low;
2895 		label_hold(l_admin_low);
2896 		insert_label_hash = B_FALSE;
2897 	}
2898 
2899 	/*
2900 	 * Stop all lwps since that's what normally happens as part of fork().
2901 	 * This needs to happen before we grab any locks to avoid deadlock
2902 	 * (another lwp in the process could be waiting for the held lock).
2903 	 */
2904 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
2905 		zone_free(zone);
2906 		if (rctls)
2907 			nvlist_free(rctls);
2908 		return (zone_create_error(error, 0, extended_error));
2909 	}
2910 
2911 	if (block_mounts() == 0) {
2912 		mutex_enter(&pp->p_lock);
2913 		if (curthread != pp->p_agenttp)
2914 			continuelwps(pp);
2915 		mutex_exit(&pp->p_lock);
2916 		zone_free(zone);
2917 		if (rctls)
2918 			nvlist_free(rctls);
2919 		return (zone_create_error(error, 0, extended_error));
2920 	}
2921 
2922 	/*
2923 	 * Set up credential for kernel access.  After this, any errors
2924 	 * should go through the dance in errout rather than calling
2925 	 * zone_free directly.
2926 	 */
2927 	zone->zone_kcred = crdup(kcred);
2928 	crsetzone(zone->zone_kcred, zone);
2929 	priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
2930 	priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
2931 	priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
2932 	priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
2933 
2934 	mutex_enter(&zonehash_lock);
2935 	/*
2936 	 * Make sure zone doesn't already exist.
2937 	 *
2938 	 * If the system and zone are labeled,
2939 	 * make sure no other zone exists that has the same label.
2940 	 */
2941 	if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
2942 	    (insert_label_hash &&
2943 	    (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
2944 		zone_status_t status;
2945 
2946 		status = zone_status_get(ztmp);
2947 		if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
2948 			error = EEXIST;
2949 		else
2950 			error = EBUSY;
2951 		goto errout;
2952 	}
2953 
2954 	/*
2955 	 * Don't allow zone creations which would cause one zone's rootpath to
2956 	 * be accessible from that of another (non-global) zone.
2957 	 */
2958 	if (zone_is_nested(zone->zone_rootpath)) {
2959 		error = EBUSY;
2960 		goto errout;
2961 	}
2962 
2963 	ASSERT(zonecount != 0);		/* check for leaks */
2964 	if (zonecount + 1 > maxzones) {
2965 		error = ENOMEM;
2966 		goto errout;
2967 	}
2968 
2969 	if (zone_mount_count(zone->zone_rootpath) != 0) {
2970 		error = EBUSY;
2971 		error2 = ZE_AREMOUNTS;
2972 		goto errout;
2973 	}
2974 
2975 	/*
2976 	 * Zone is still incomplete, but we need to drop all locks while
2977 	 * zsched() initializes this zone's kernel process.  We
2978 	 * optimistically add the zone to the hashtable and associated
2979 	 * lists so a parallel zone_create() doesn't try to create the
2980 	 * same zone.
2981 	 */
2982 	zonecount++;
2983 	(void) mod_hash_insert(zonehashbyid,
2984 	    (mod_hash_key_t)(uintptr_t)zone->zone_id,
2985 	    (mod_hash_val_t)(uintptr_t)zone);
2986 	str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
2987 	(void) strcpy(str, zone->zone_name);
2988 	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
2989 	    (mod_hash_val_t)(uintptr_t)zone);
2990 	if (insert_label_hash) {
2991 		(void) mod_hash_insert(zonehashbylabel,
2992 		    (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
2993 		zone->zone_flags |= ZF_HASHED_LABEL;
2994 	}
2995 
2996 	/*
2997 	 * Insert into active list.  At this point there are no 'hold's
2998 	 * on the zone, but everyone else knows not to use it, so we can
2999 	 * continue to use it.  zsched() will do a zone_hold() if the
3000 	 * newproc() is successful.
3001 	 */
3002 	list_insert_tail(&zone_active, zone);
3003 	mutex_exit(&zonehash_lock);
3004 
3005 	zarg.zone = zone;
3006 	zarg.nvlist = rctls;
3007 	/*
3008 	 * The process, task, and project rctls are probably wrong;
3009 	 * we need an interface to get the default values of all rctls,
3010 	 * and initialize zsched appropriately.  I'm not sure that that
3011 	 * makes much of a difference, though.
3012 	 */
3013 	if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) {
3014 		/*
3015 		 * We need to undo all globally visible state.
3016 		 */
3017 		mutex_enter(&zonehash_lock);
3018 		list_remove(&zone_active, zone);
3019 		if (zone->zone_flags & ZF_HASHED_LABEL) {
3020 			ASSERT(zone->zone_slabel != NULL);
3021 			(void) mod_hash_destroy(zonehashbylabel,
3022 			    (mod_hash_key_t)zone->zone_slabel);
3023 		}
3024 		(void) mod_hash_destroy(zonehashbyname,
3025 		    (mod_hash_key_t)(uintptr_t)zone->zone_name);
3026 		(void) mod_hash_destroy(zonehashbyid,
3027 		    (mod_hash_key_t)(uintptr_t)zone->zone_id);
3028 		ASSERT(zonecount > 1);
3029 		zonecount--;
3030 		goto errout;
3031 	}
3032 
3033 	/*
3034 	 * Zone creation can't fail from now on.
3035 	 */
3036 
3037 	/*
3038 	 * Let the other lwps continue.
3039 	 */
3040 	mutex_enter(&pp->p_lock);
3041 	if (curthread != pp->p_agenttp)
3042 		continuelwps(pp);
3043 	mutex_exit(&pp->p_lock);
3044 
3045 	/*
3046 	 * Wait for zsched to finish initializing the zone.
3047 	 */
3048 	zone_status_wait(zone, ZONE_IS_READY);
3049 	/*
3050 	 * The zone is fully visible, so we can let mounts progress.
3051 	 */
3052 	resume_mounts();
3053 	if (rctls)
3054 		nvlist_free(rctls);
3055 
3056 	return (zoneid);
3057 
3058 errout:
3059 	mutex_exit(&zonehash_lock);
3060 	/*
3061 	 * Let the other lwps continue.
3062 	 */
3063 	mutex_enter(&pp->p_lock);
3064 	if (curthread != pp->p_agenttp)
3065 		continuelwps(pp);
3066 	mutex_exit(&pp->p_lock);
3067 
3068 	resume_mounts();
3069 	if (rctls)
3070 		nvlist_free(rctls);
3071 	/*
3072 	 * There is currently one reference to the zone, a cred_ref from
3073 	 * zone_kcred.  To free the zone, we call crfree, which will call
3074 	 * zone_cred_rele, which will call zone_free.
3075 	 */
3076 	ASSERT(zone->zone_cred_ref == 1);	/* for zone_kcred */
3077 	ASSERT(zone->zone_kcred->cr_ref == 1);
3078 	ASSERT(zone->zone_ref == 0);
3079 	zkcr = zone->zone_kcred;
3080 	zone->zone_kcred = NULL;
3081 	crfree(zkcr);				/* triggers call to zone_free */
3082 	return (zone_create_error(error, error2, extended_error));
3083 }
3084 
3085 /*
3086  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
3087  * the heavy lifting.  initname is the path to the program to launch
3088  * at the "top" of the zone; if this is NULL, we use the system default,
3089  * which is stored at zone_default_initname.
3090  */
3091 static int
3092 zone_boot(zoneid_t zoneid)
3093 {
3094 	int err;
3095 	zone_t *zone;
3096 
3097 	if (secpolicy_zone_config(CRED()) != 0)
3098 		return (set_errno(EPERM));
3099 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3100 		return (set_errno(EINVAL));
3101 
3102 	mutex_enter(&zonehash_lock);
3103 	/*
3104 	 * Look for zone under hash lock to prevent races with calls to
3105 	 * zone_shutdown, zone_destroy, etc.
3106 	 */
3107 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3108 		mutex_exit(&zonehash_lock);
3109 		return (set_errno(EINVAL));
3110 	}
3111 
3112 	mutex_enter(&zone_status_lock);
3113 	if (zone_status_get(zone) != ZONE_IS_READY) {
3114 		mutex_exit(&zone_status_lock);
3115 		mutex_exit(&zonehash_lock);
3116 		return (set_errno(EINVAL));
3117 	}
3118 	zone_status_set(zone, ZONE_IS_BOOTING);
3119 	mutex_exit(&zone_status_lock);
3120 
3121 	zone_hold(zone);	/* so we can use the zone_t later */
3122 	mutex_exit(&zonehash_lock);
3123 
3124 	if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
3125 		zone_rele(zone);
3126 		return (set_errno(EINTR));
3127 	}
3128 
3129 	/*
3130 	 * Boot (starting init) might have failed, in which case the zone
3131 	 * will go to the SHUTTING_DOWN state; an appropriate errno will
3132 	 * be placed in zone->zone_boot_err, and so we return that.
3133 	 */
3134 	err = zone->zone_boot_err;
3135 	zone_rele(zone);
3136 	return (err ? set_errno(err) : 0);
3137 }
3138 
3139 /*
3140  * Kills all user processes in the zone, waiting for them all to exit
3141  * before returning.
3142  */
3143 static int
3144 zone_empty(zone_t *zone)
3145 {
3146 	int waitstatus;
3147 
3148 	/*
3149 	 * We need to drop zonehash_lock before killing all
3150 	 * processes, otherwise we'll deadlock with zone_find_*
3151 	 * which can be called from the exit path.
3152 	 */
3153 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
3154 	while ((waitstatus = zone_status_timedwait_sig(zone, lbolt + hz,
3155 	    ZONE_IS_EMPTY)) == -1) {
3156 		killall(zone->zone_id);
3157 	}
3158 	/*
3159 	 * return EINTR if we were signaled
3160 	 */
3161 	if (waitstatus == 0)
3162 		return (EINTR);
3163 	return (0);
3164 }
3165 
3166 /*
3167  * This function implements the policy for zone visibility.
3168  *
3169  * In standard Solaris, a non-global zone can only see itself.
3170  *
3171  * In Trusted Extensions, a labeled zone can lookup any zone whose label
3172  * it dominates. For this test, the label of the global zone is treated as
3173  * admin_high so it is special-cased instead of being checked for dominance.
3174  *
3175  * Returns true if zone attributes are viewable, false otherwise.
3176  */
3177 static boolean_t
3178 zone_list_access(zone_t *zone)
3179 {
3180 
3181 	if (curproc->p_zone == global_zone ||
3182 	    curproc->p_zone == zone) {
3183 		return (B_TRUE);
3184 	} else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
3185 		bslabel_t *curproc_label;
3186 		bslabel_t *zone_label;
3187 
3188 		curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
3189 		zone_label = label2bslabel(zone->zone_slabel);
3190 
3191 		if (zone->zone_id != GLOBAL_ZONEID &&
3192 		    bldominates(curproc_label, zone_label)) {
3193 			return (B_TRUE);
3194 		} else {
3195 			return (B_FALSE);
3196 		}
3197 	} else {
3198 		return (B_FALSE);
3199 	}
3200 }
3201 
3202 /*
3203  * Systemcall to start the zone's halt sequence.  By the time this
3204  * function successfully returns, all user processes and kernel threads
3205  * executing in it will have exited, ZSD shutdown callbacks executed,
3206  * and the zone status set to ZONE_IS_DOWN.
3207  *
3208  * It is possible that the call will interrupt itself if the caller is the
3209  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
3210  */
3211 static int
3212 zone_shutdown(zoneid_t zoneid)
3213 {
3214 	int error;
3215 	zone_t *zone;
3216 	zone_status_t status;
3217 
3218 	if (secpolicy_zone_config(CRED()) != 0)
3219 		return (set_errno(EPERM));
3220 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3221 		return (set_errno(EINVAL));
3222 
3223 	/*
3224 	 * Block mounts so that VFS_MOUNT() can get an accurate view of
3225 	 * the zone's status with regards to ZONE_IS_SHUTTING down.
3226 	 *
3227 	 * e.g. NFS can fail the mount if it determines that the zone
3228 	 * has already begun the shutdown sequence.
3229 	 */
3230 	if (block_mounts() == 0)
3231 		return (set_errno(EINTR));
3232 	mutex_enter(&zonehash_lock);
3233 	/*
3234 	 * Look for zone under hash lock to prevent races with other
3235 	 * calls to zone_shutdown and zone_destroy.
3236 	 */
3237 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3238 		mutex_exit(&zonehash_lock);
3239 		resume_mounts();
3240 		return (set_errno(EINVAL));
3241 	}
3242 	mutex_enter(&zone_status_lock);
3243 	status = zone_status_get(zone);
3244 	/*
3245 	 * Fail if the zone isn't fully initialized yet.
3246 	 */
3247 	if (status < ZONE_IS_READY) {
3248 		mutex_exit(&zone_status_lock);
3249 		mutex_exit(&zonehash_lock);
3250 		resume_mounts();
3251 		return (set_errno(EINVAL));
3252 	}
3253 	/*
3254 	 * If conditions required for zone_shutdown() to return have been met,
3255 	 * return success.
3256 	 */
3257 	if (status >= ZONE_IS_DOWN) {
3258 		mutex_exit(&zone_status_lock);
3259 		mutex_exit(&zonehash_lock);
3260 		resume_mounts();
3261 		return (0);
3262 	}
3263 	/*
3264 	 * If zone_shutdown() hasn't been called before, go through the motions.
3265 	 * If it has, there's nothing to do but wait for the kernel threads to
3266 	 * drain.
3267 	 */
3268 	if (status < ZONE_IS_EMPTY) {
3269 		uint_t ntasks;
3270 
3271 		mutex_enter(&zone->zone_lock);
3272 		if ((ntasks = zone->zone_ntasks) != 1) {
3273 			/*
3274 			 * There's still stuff running.
3275 			 */
3276 			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
3277 		}
3278 		mutex_exit(&zone->zone_lock);
3279 		if (ntasks == 1) {
3280 			/*
3281 			 * The only way to create another task is through
3282 			 * zone_enter(), which will block until we drop
3283 			 * zonehash_lock.  The zone is empty.
3284 			 */
3285 			if (zone->zone_kthreads == NULL) {
3286 				/*
3287 				 * Skip ahead to ZONE_IS_DOWN
3288 				 */
3289 				zone_status_set(zone, ZONE_IS_DOWN);
3290 			} else {
3291 				zone_status_set(zone, ZONE_IS_EMPTY);
3292 			}
3293 		}
3294 	}
3295 	zone_hold(zone);	/* so we can use the zone_t later */
3296 	mutex_exit(&zone_status_lock);
3297 	mutex_exit(&zonehash_lock);
3298 	resume_mounts();
3299 
3300 	if (error = zone_empty(zone)) {
3301 		zone_rele(zone);
3302 		return (set_errno(error));
3303 	}
3304 	/*
3305 	 * After the zone status goes to ZONE_IS_DOWN this zone will no
3306 	 * longer be notified of changes to the pools configuration, so
3307 	 * in order to not end up with a stale pool pointer, we point
3308 	 * ourselves at the default pool and remove all resource
3309 	 * visibility.  This is especially important as the zone_t may
3310 	 * languish on the deathrow for a very long time waiting for
3311 	 * cred's to drain out.
3312 	 *
3313 	 * This rebinding of the zone can happen multiple times
3314 	 * (presumably due to interrupted or parallel systemcalls)
3315 	 * without any adverse effects.
3316 	 */
3317 	if (pool_lock_intr() != 0) {
3318 		zone_rele(zone);
3319 		return (set_errno(EINTR));
3320 	}
3321 	if (pool_state == POOL_ENABLED) {
3322 		mutex_enter(&cpu_lock);
3323 		zone_pool_set(zone, pool_default);
3324 		/*
3325 		 * The zone no longer needs to be able to see any cpus.
3326 		 */
3327 		zone_pset_set(zone, ZONE_PS_INVAL);
3328 		mutex_exit(&cpu_lock);
3329 	}
3330 	pool_unlock();
3331 
3332 	/*
3333 	 * ZSD shutdown callbacks can be executed multiple times, hence
3334 	 * it is safe to not be holding any locks across this call.
3335 	 */
3336 	zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
3337 
3338 	mutex_enter(&zone_status_lock);
3339 	if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
3340 		zone_status_set(zone, ZONE_IS_DOWN);
3341 	mutex_exit(&zone_status_lock);
3342 
3343 	/*
3344 	 * Wait for kernel threads to drain.
3345 	 */
3346 	if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
3347 		zone_rele(zone);
3348 		return (set_errno(EINTR));
3349 	}
3350 	zone_rele(zone);
3351 	return (0);
3352 }
3353 
3354 /*
3355  * Systemcall entry point to finalize the zone halt process.  The caller
3356  * must have already successfully callefd zone_shutdown().
3357  *
3358  * Upon successful completion, the zone will have been fully destroyed:
3359  * zsched will have exited, destructor callbacks executed, and the zone
3360  * removed from the list of active zones.
3361  */
3362 static int
3363 zone_destroy(zoneid_t zoneid)
3364 {
3365 	uint64_t uniqid;
3366 	zone_t *zone;
3367 	zone_status_t status;
3368 
3369 	if (secpolicy_zone_config(CRED()) != 0)
3370 		return (set_errno(EPERM));
3371 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3372 		return (set_errno(EINVAL));
3373 
3374 	mutex_enter(&zonehash_lock);
3375 	/*
3376 	 * Look for zone under hash lock to prevent races with other
3377 	 * calls to zone_destroy.
3378 	 */
3379 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3380 		mutex_exit(&zonehash_lock);
3381 		return (set_errno(EINVAL));
3382 	}
3383 
3384 	if (zone_mount_count(zone->zone_rootpath) != 0) {
3385 		mutex_exit(&zonehash_lock);
3386 		return (set_errno(EBUSY));
3387 	}
3388 	mutex_enter(&zone_status_lock);
3389 	status = zone_status_get(zone);
3390 	if (status < ZONE_IS_DOWN) {
3391 		mutex_exit(&zone_status_lock);
3392 		mutex_exit(&zonehash_lock);
3393 		return (set_errno(EBUSY));
3394 	} else if (status == ZONE_IS_DOWN) {
3395 		zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
3396 	}
3397 	mutex_exit(&zone_status_lock);
3398 	zone_hold(zone);
3399 	mutex_exit(&zonehash_lock);
3400 
3401 	/*
3402 	 * wait for zsched to exit
3403 	 */
3404 	zone_status_wait(zone, ZONE_IS_DEAD);
3405 	zone_zsd_callbacks(zone, ZSD_DESTROY);
3406 	uniqid = zone->zone_uniqid;
3407 	zone_rele(zone);
3408 	zone = NULL;	/* potentially free'd */
3409 
3410 	mutex_enter(&zonehash_lock);
3411 	for (; /* ever */; ) {
3412 		boolean_t unref;
3413 
3414 		if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
3415 		    zone->zone_uniqid != uniqid) {
3416 			/*
3417 			 * The zone has gone away.  Necessary conditions
3418 			 * are met, so we return success.
3419 			 */
3420 			mutex_exit(&zonehash_lock);
3421 			return (0);
3422 		}
3423 		mutex_enter(&zone->zone_lock);
3424 		unref = ZONE_IS_UNREF(zone);
3425 		mutex_exit(&zone->zone_lock);
3426 		if (unref) {
3427 			/*
3428 			 * There is only one reference to the zone -- that
3429 			 * added when the zone was added to the hashtables --
3430 			 * and things will remain this way until we drop
3431 			 * zonehash_lock... we can go ahead and cleanup the
3432 			 * zone.
3433 			 */
3434 			break;
3435 		}
3436 
3437 		if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
3438 			/* Signaled */
3439 			mutex_exit(&zonehash_lock);
3440 			return (set_errno(EINTR));
3441 		}
3442 
3443 	}
3444 
3445 	/*
3446 	 * It is now safe to let the zone be recreated; remove it from the
3447 	 * lists.  The memory will not be freed until the last cred
3448 	 * reference goes away.
3449 	 */
3450 	ASSERT(zonecount > 1);	/* must be > 1; can't destroy global zone */
3451 	zonecount--;
3452 	/* remove from active list and hash tables */
3453 	list_remove(&zone_active, zone);
3454 	(void) mod_hash_destroy(zonehashbyname,
3455 	    (mod_hash_key_t)zone->zone_name);
3456 	(void) mod_hash_destroy(zonehashbyid,
3457 	    (mod_hash_key_t)(uintptr_t)zone->zone_id);
3458 	if (zone->zone_flags & ZF_HASHED_LABEL)
3459 		(void) mod_hash_destroy(zonehashbylabel,
3460 		    (mod_hash_key_t)zone->zone_slabel);
3461 	mutex_exit(&zonehash_lock);
3462 
3463 	/*
3464 	 * Release the root vnode; we're not using it anymore.  Nor should any
3465 	 * other thread that might access it exist.
3466 	 */
3467 	if (zone->zone_rootvp != NULL) {
3468 		VN_RELE(zone->zone_rootvp);
3469 		zone->zone_rootvp = NULL;
3470 	}
3471 
3472 	/* add to deathrow list */
3473 	mutex_enter(&zone_deathrow_lock);
3474 	list_insert_tail(&zone_deathrow, zone);
3475 	mutex_exit(&zone_deathrow_lock);
3476 
3477 	/*
3478 	 * Drop last reference (which was added by zsched()), this will
3479 	 * free the zone unless there are outstanding cred references.
3480 	 */
3481 	zone_rele(zone);
3482 	return (0);
3483 }
3484 
3485 /*
3486  * Systemcall entry point for zone_getattr(2).
3487  */
3488 static ssize_t
3489 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
3490 {
3491 	size_t size;
3492 	int error = 0, err;
3493 	zone_t *zone;
3494 	char *zonepath;
3495 	char *outstr;
3496 	zone_status_t zone_status;
3497 	pid_t initpid;
3498 	boolean_t global = (curproc->p_zone == global_zone);
3499 	boolean_t curzone = (curproc->p_zone->zone_id == zoneid);
3500 
3501 	mutex_enter(&zonehash_lock);
3502 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3503 		mutex_exit(&zonehash_lock);
3504 		return (set_errno(EINVAL));
3505 	}
3506 	zone_status = zone_status_get(zone);
3507 	if (zone_status < ZONE_IS_READY) {
3508 		mutex_exit(&zonehash_lock);
3509 		return (set_errno(EINVAL));
3510 	}
3511 	zone_hold(zone);
3512 	mutex_exit(&zonehash_lock);
3513 
3514 	/*
3515 	 * If not in the global zone, don't show information about other zones,
3516 	 * unless the system is labeled and the local zone's label dominates
3517 	 * the other zone.
3518 	 */
3519 	if (!zone_list_access(zone)) {
3520 		zone_rele(zone);
3521 		return (set_errno(EINVAL));
3522 	}
3523 
3524 	switch (attr) {
3525 	case ZONE_ATTR_ROOT:
3526 		if (global) {
3527 			/*
3528 			 * Copy the path to trim the trailing "/" (except for
3529 			 * the global zone).
3530 			 */
3531 			if (zone != global_zone)
3532 				size = zone->zone_rootpathlen - 1;
3533 			else
3534 				size = zone->zone_rootpathlen;
3535 			zonepath = kmem_alloc(size, KM_SLEEP);
3536 			bcopy(zone->zone_rootpath, zonepath, size);
3537 			zonepath[size - 1] = '\0';
3538 		} else {
3539 			if (curzone || !is_system_labeled()) {
3540 				/*
3541 				 * Caller is not in the global zone.
3542 				 * if the query is on the current zone
3543 				 * or the system is not labeled,
3544 				 * just return faked-up path for current zone.
3545 				 */
3546 				zonepath = "/";
3547 				size = 2;
3548 			} else {
3549 				/*
3550 				 * Return related path for current zone.
3551 				 */
3552 				int prefix_len = strlen(zone_prefix);
3553 				int zname_len = strlen(zone->zone_name);
3554 
3555 				size = prefix_len + zname_len + 1;
3556 				zonepath = kmem_alloc(size, KM_SLEEP);
3557 				bcopy(zone_prefix, zonepath, prefix_len);
3558 				bcopy(zone->zone_name, zonepath +
3559 				    prefix_len, zname_len);
3560 				zonepath[size - 1] = '\0';
3561 			}
3562 		}
3563 		if (bufsize > size)
3564 			bufsize = size;
3565 		if (buf != NULL) {
3566 			err = copyoutstr(zonepath, buf, bufsize, NULL);
3567 			if (err != 0 && err != ENAMETOOLONG)
3568 				error = EFAULT;
3569 		}
3570 		if (global || (is_system_labeled() && !curzone))
3571 			kmem_free(zonepath, size);
3572 		break;
3573 
3574 	case ZONE_ATTR_NAME:
3575 		size = strlen(zone->zone_name) + 1;
3576 		if (bufsize > size)
3577 			bufsize = size;
3578 		if (buf != NULL) {
3579 			err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
3580 			if (err != 0 && err != ENAMETOOLONG)
3581 				error = EFAULT;
3582 		}
3583 		break;
3584 
3585 	case ZONE_ATTR_STATUS:
3586 		/*
3587 		 * Since we're not holding zonehash_lock, the zone status
3588 		 * may be anything; leave it up to userland to sort it out.
3589 		 */
3590 		size = sizeof (zone_status);
3591 		if (bufsize > size)
3592 			bufsize = size;
3593 		zone_status = zone_status_get(zone);
3594 		if (buf != NULL &&
3595 		    copyout(&zone_status, buf, bufsize) != 0)
3596 			error = EFAULT;
3597 		break;
3598 	case ZONE_ATTR_PRIVSET:
3599 		size = sizeof (priv_set_t);
3600 		if (bufsize > size)
3601 			bufsize = size;
3602 		if (buf != NULL &&
3603 		    copyout(zone->zone_privset, buf, bufsize) != 0)
3604 			error = EFAULT;
3605 		break;
3606 	case ZONE_ATTR_UNIQID:
3607 		size = sizeof (zone->zone_uniqid);
3608 		if (bufsize > size)
3609 			bufsize = size;
3610 		if (buf != NULL &&
3611 		    copyout(&zone->zone_uniqid, buf, bufsize) != 0)
3612 			error = EFAULT;
3613 		break;
3614 	case ZONE_ATTR_POOLID:
3615 		{
3616 			pool_t *pool;
3617 			poolid_t poolid;
3618 
3619 			if (pool_lock_intr() != 0) {
3620 				error = EINTR;
3621 				break;
3622 			}
3623 			pool = zone_pool_get(zone);
3624 			poolid = pool->pool_id;
3625 			pool_unlock();
3626 			size = sizeof (poolid);
3627 			if (bufsize > size)
3628 				bufsize = size;
3629 			if (buf != NULL && copyout(&poolid, buf, size) != 0)
3630 				error = EFAULT;
3631 		}
3632 		break;
3633 	case ZONE_ATTR_SLBL:
3634 		size = sizeof (bslabel_t);
3635 		if (bufsize > size)
3636 			bufsize = size;
3637 		if (zone->zone_slabel == NULL)
3638 			error = EINVAL;
3639 		else if (buf != NULL &&
3640 		    copyout(label2bslabel(zone->zone_slabel), buf,
3641 		    bufsize) != 0)
3642 			error = EFAULT;
3643 		break;
3644 	case ZONE_ATTR_INITPID:
3645 		size = sizeof (initpid);
3646 		if (bufsize > size)
3647 			bufsize = size;
3648 		initpid = zone->zone_proc_initpid;
3649 		if (initpid == -1) {
3650 			error = ESRCH;
3651 			break;
3652 		}
3653 		if (buf != NULL &&
3654 		    copyout(&initpid, buf, bufsize) != 0)
3655 			error = EFAULT;
3656 		break;
3657 	case ZONE_ATTR_INITNAME:
3658 		size = strlen(zone->zone_initname) + 1;
3659 		if (bufsize > size)
3660 			bufsize = size;
3661 		if (buf != NULL) {
3662 			err = copyoutstr(zone->zone_initname, buf, bufsize,
3663 			    NULL);
3664 			if (err != 0 && err != ENAMETOOLONG)
3665 				error = EFAULT;
3666 		}
3667 		break;
3668 	case ZONE_ATTR_BOOTARGS:
3669 		if (zone->zone_bootargs == NULL)
3670 			outstr = "";
3671 		else
3672 			outstr = zone->zone_bootargs;
3673 		size = strlen(outstr) + 1;
3674 		if (bufsize > size)
3675 			bufsize = size;
3676 		if (buf != NULL) {
3677 			err = copyoutstr(outstr, buf, bufsize, NULL);
3678 			if (err != 0 && err != ENAMETOOLONG)
3679 				error = EFAULT;
3680 		}
3681 		break;
3682 	default:
3683 		error = EINVAL;
3684 	}
3685 	zone_rele(zone);
3686 
3687 	if (error)
3688 		return (set_errno(error));
3689 	return ((ssize_t)size);
3690 }
3691 
3692 /*
3693  * Systemcall entry point for zone_setattr(2).
3694  */
3695 /*ARGSUSED*/
3696 static int
3697 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
3698 {
3699 	zone_t *zone;
3700 	zone_status_t zone_status;
3701 	int err;
3702 
3703 	if (secpolicy_zone_config(CRED()) != 0)
3704 		return (set_errno(EPERM));
3705 
3706 	/*
3707 	 * At present, attributes can only be set on non-running,
3708 	 * non-global zones.
3709 	 */
3710 	if (zoneid == GLOBAL_ZONEID) {
3711 		return (set_errno(EINVAL));
3712 	}
3713 
3714 	mutex_enter(&zonehash_lock);
3715 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3716 		mutex_exit(&zonehash_lock);
3717 		return (set_errno(EINVAL));
3718 	}
3719 	zone_hold(zone);
3720 	mutex_exit(&zonehash_lock);
3721 
3722 	zone_status = zone_status_get(zone);
3723 	if (zone_status > ZONE_IS_READY)
3724 		goto done;
3725 
3726 	switch (attr) {
3727 	case ZONE_ATTR_INITNAME:
3728 		err = zone_set_initname(zone, (const char *)buf);
3729 		break;
3730 	case ZONE_ATTR_BOOTARGS:
3731 		err = zone_set_bootargs(zone, (const char *)buf);
3732 		break;
3733 	default:
3734 		err = EINVAL;
3735 	}
3736 
3737 done:
3738 	zone_rele(zone);
3739 	return (err != 0 ? set_errno(err) : 0);
3740 }
3741 
3742 /*
3743  * Return zero if the process has at least one vnode mapped in to its
3744  * address space which shouldn't be allowed to change zones.
3745  */
3746 static int
3747 as_can_change_zones(void)
3748 {
3749 	proc_t *pp = curproc;
3750 	struct seg *seg;
3751 	struct as *as = pp->p_as;
3752 	vnode_t *vp;
3753 	int allow = 1;
3754 
3755 	ASSERT(pp->p_as != &kas);
3756 	AS_LOCK_ENTER(&as, &as->a_lock, RW_READER);
3757 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
3758 		/*
3759 		 * if we can't get a backing vnode for this segment then skip
3760 		 * it.
3761 		 */
3762 		vp = NULL;
3763 		if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
3764 			continue;
3765 		if (!vn_can_change_zones(vp)) { /* bail on first match */
3766 			allow = 0;
3767 			break;
3768 		}
3769 	}
3770 	AS_LOCK_EXIT(&as, &as->a_lock);
3771 	return (allow);
3772 }
3773 
3774 /*
3775  * Systemcall entry point for zone_enter().
3776  *
3777  * The current process is injected into said zone.  In the process
3778  * it will change its project membership, privileges, rootdir/cwd,
3779  * zone-wide rctls, and pool association to match those of the zone.
3780  *
3781  * The first zone_enter() called while the zone is in the ZONE_IS_READY
3782  * state will transition it to ZONE_IS_RUNNING.  Processes may only
3783  * enter a zone that is "ready" or "running".
3784  */
3785 static int
3786 zone_enter(zoneid_t zoneid)
3787 {
3788 	zone_t *zone;
3789 	vnode_t *vp;
3790 	proc_t *pp = curproc;
3791 	contract_t *ct;
3792 	cont_process_t *ctp;
3793 	task_t *tk, *oldtk;
3794 	kproject_t *zone_proj0;
3795 	cred_t *cr, *newcr;
3796 	pool_t *oldpool, *newpool;
3797 	sess_t *sp;
3798 	uid_t uid;
3799 	zone_status_t status;
3800 	int err = 0;
3801 	rctl_entity_p_t e;
3802 
3803 	if (secpolicy_zone_config(CRED()) != 0)
3804 		return (set_errno(EPERM));
3805 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3806 		return (set_errno(EINVAL));
3807 
3808 	/*
3809 	 * Stop all lwps so we don't need to hold a lock to look at
3810 	 * curproc->p_zone.  This needs to happen before we grab any
3811 	 * locks to avoid deadlock (another lwp in the process could
3812 	 * be waiting for the held lock).
3813 	 */
3814 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
3815 		return (set_errno(EINTR));
3816 
3817 	/*
3818 	 * Make sure we're not changing zones with files open or mapped in
3819 	 * to our address space which shouldn't be changing zones.
3820 	 */
3821 	if (!files_can_change_zones()) {
3822 		err = EBADF;
3823 		goto out;
3824 	}
3825 	if (!as_can_change_zones()) {
3826 		err = EFAULT;
3827 		goto out;
3828 	}
3829 
3830 	mutex_enter(&zonehash_lock);
3831 	if (pp->p_zone != global_zone) {
3832 		mutex_exit(&zonehash_lock);
3833 		err = EINVAL;
3834 		goto out;
3835 	}
3836 
3837 	zone = zone_find_all_by_id(zoneid);
3838 	if (zone == NULL) {
3839 		mutex_exit(&zonehash_lock);
3840 		err = EINVAL;
3841 		goto out;
3842 	}
3843 
3844 	/*
3845 	 * To prevent processes in a zone from holding contracts on
3846 	 * extrazonal resources, and to avoid process contract
3847 	 * memberships which span zones, contract holders and processes
3848 	 * which aren't the sole members of their encapsulating process
3849 	 * contracts are not allowed to zone_enter.
3850 	 */
3851 	ctp = pp->p_ct_process;
3852 	ct = &ctp->conp_contract;
3853 	mutex_enter(&ct->ct_lock);
3854 	mutex_enter(&pp->p_lock);
3855 	if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
3856 		mutex_exit(&pp->p_lock);
3857 		mutex_exit(&ct->ct_lock);
3858 		mutex_exit(&zonehash_lock);
3859 		pool_unlock();
3860 		err = EINVAL;
3861 		goto out;
3862 	}
3863 
3864 	/*
3865 	 * Moreover, we don't allow processes whose encapsulating
3866 	 * process contracts have inherited extrazonal contracts.
3867 	 * While it would be easier to eliminate all process contracts
3868 	 * with inherited contracts, we need to be able to give a
3869 	 * restarted init (or other zone-penetrating process) its
3870 	 * predecessor's contracts.
3871 	 */
3872 	if (ctp->conp_ninherited != 0) {
3873 		contract_t *next;
3874 		for (next = list_head(&ctp->conp_inherited); next;
3875 		    next = list_next(&ctp->conp_inherited, next)) {
3876 			if (contract_getzuniqid(next) != zone->zone_uniqid) {
3877 				mutex_exit(&pp->p_lock);
3878 				mutex_exit(&ct->ct_lock);
3879 				mutex_exit(&zonehash_lock);
3880 				pool_unlock();
3881 				err = EINVAL;
3882 				goto out;
3883 			}
3884 		}
3885 	}
3886 	mutex_exit(&pp->p_lock);
3887 	mutex_exit(&ct->ct_lock);
3888 
3889 	status = zone_status_get(zone);
3890 	if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
3891 		/*
3892 		 * Can't join
3893 		 */
3894 		mutex_exit(&zonehash_lock);
3895 		err = EINVAL;
3896 		goto out;
3897 	}
3898 
3899 	/*
3900 	 * Make sure new priv set is within the permitted set for caller
3901 	 */
3902 	if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
3903 		mutex_exit(&zonehash_lock);
3904 		err = EPERM;
3905 		goto out;
3906 	}
3907 	/*
3908 	 * We want to momentarily drop zonehash_lock while we optimistically
3909 	 * bind curproc to the pool it should be running in.  This is safe
3910 	 * since the zone can't disappear (we have a hold on it).
3911 	 */
3912 	zone_hold(zone);
3913 	mutex_exit(&zonehash_lock);
3914 
3915 	/*
3916 	 * Grab pool_lock to keep the pools configuration from changing
3917 	 * and to stop ourselves from getting rebound to another pool
3918 	 * until we join the zone.
3919 	 */
3920 	if (pool_lock_intr() != 0) {
3921 		zone_rele(zone);
3922 		err = EINTR;
3923 		goto out;
3924 	}
3925 	ASSERT(secpolicy_pool(CRED()) == 0);
3926 	/*
3927 	 * Bind ourselves to the pool currently associated with the zone.
3928 	 */
3929 	oldpool = curproc->p_pool;
3930 	newpool = zone_pool_get(zone);
3931 	if (pool_state == POOL_ENABLED && newpool != oldpool &&
3932 	    (err = pool_do_bind(newpool, P_PID, P_MYID,
3933 	    POOL_BIND_ALL)) != 0) {
3934 		pool_unlock();
3935 		zone_rele(zone);
3936 		goto out;
3937 	}
3938 
3939 	/*
3940 	 * Grab cpu_lock now; we'll need it later when we call
3941 	 * task_join().
3942 	 */
3943 	mutex_enter(&cpu_lock);
3944 	mutex_enter(&zonehash_lock);
3945 	/*
3946 	 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
3947 	 */
3948 	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
3949 		/*
3950 		 * Can't join anymore.
3951 		 */
3952 		mutex_exit(&zonehash_lock);
3953 		mutex_exit(&cpu_lock);
3954 		if (pool_state == POOL_ENABLED &&
3955 		    newpool != oldpool)
3956 			(void) pool_do_bind(oldpool, P_PID, P_MYID,
3957 			    POOL_BIND_ALL);
3958 		pool_unlock();
3959 		zone_rele(zone);
3960 		err = EINVAL;
3961 		goto out;
3962 	}
3963 
3964 	mutex_enter(&pp->p_lock);
3965 	zone_proj0 = zone->zone_zsched->p_task->tk_proj;
3966 	/* verify that we do not exceed and task or lwp limits */
3967 	mutex_enter(&zone->zone_nlwps_lock);
3968 	/* add new lwps to zone and zone's proj0 */
3969 	zone_proj0->kpj_nlwps += pp->p_lwpcnt;
3970 	zone->zone_nlwps += pp->p_lwpcnt;
3971 	/* add 1 task to zone's proj0 */
3972 	zone_proj0->kpj_ntasks += 1;
3973 	mutex_exit(&pp->p_lock);
3974 	mutex_exit(&zone->zone_nlwps_lock);
3975 
3976 	/* remove lwps from proc's old zone and old project */
3977 	mutex_enter(&pp->p_zone->zone_nlwps_lock);
3978 	pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
3979 	pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
3980 	mutex_exit(&pp->p_zone->zone_nlwps_lock);
3981 
3982 	/*
3983 	 * Joining the zone cannot fail from now on.
3984 	 *
3985 	 * This means that a lot of the following code can be commonized and
3986 	 * shared with zsched().
3987 	 */
3988 
3989 	/*
3990 	 * Reset the encapsulating process contract's zone.
3991 	 */
3992 	ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
3993 	contract_setzuniqid(ct, zone->zone_uniqid);
3994 
3995 	/*
3996 	 * Create a new task and associate the process with the project keyed
3997 	 * by (projid,zoneid).
3998 	 *
3999 	 * We might as well be in project 0; the global zone's projid doesn't
4000 	 * make much sense in a zone anyhow.
4001 	 *
4002 	 * This also increments zone_ntasks, and returns with p_lock held.
4003 	 */
4004 	tk = task_create(0, zone);
4005 	oldtk = task_join(tk, 0);
4006 	mutex_exit(&cpu_lock);
4007 
4008 	pp->p_flag |= SZONETOP;
4009 	pp->p_zone = zone;
4010 
4011 	/*
4012 	 * call RCTLOP_SET functions on this proc
4013 	 */
4014 	e.rcep_p.zone = zone;
4015 	e.rcep_t = RCENTITY_ZONE;
4016 	(void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
4017 	    RCD_CALLBACK);
4018 	mutex_exit(&pp->p_lock);
4019 
4020 	/*
4021 	 * We don't need to hold any of zsched's locks here; not only do we know
4022 	 * the process and zone aren't going away, we know its session isn't
4023 	 * changing either.
4024 	 *
4025 	 * By joining zsched's session here, we mimic the behavior in the
4026 	 * global zone of init's sid being the pid of sched.  We extend this
4027 	 * to all zlogin-like zone_enter()'ing processes as well.
4028 	 */
4029 	mutex_enter(&pidlock);
4030 	sp = zone->zone_zsched->p_sessp;
4031 	SESS_HOLD(sp);
4032 	mutex_enter(&pp->p_lock);
4033 	pgexit(pp);
4034 	SESS_RELE(pp->p_sessp);
4035 	pp->p_sessp = sp;
4036 	pgjoin(pp, zone->zone_zsched->p_pidp);
4037 	mutex_exit(&pp->p_lock);
4038 	mutex_exit(&pidlock);
4039 
4040 	mutex_exit(&zonehash_lock);
4041 	/*
4042 	 * We're firmly in the zone; let pools progress.
4043 	 */
4044 	pool_unlock();
4045 	task_rele(oldtk);
4046 	/*
4047 	 * We don't need to retain a hold on the zone since we already
4048 	 * incremented zone_ntasks, so the zone isn't going anywhere.
4049 	 */
4050 	zone_rele(zone);
4051 
4052 	/*
4053 	 * Chroot
4054 	 */
4055 	vp = zone->zone_rootvp;
4056 	zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
4057 	zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
4058 
4059 	/*
4060 	 * Change process credentials
4061 	 */
4062 	newcr = cralloc();
4063 	mutex_enter(&pp->p_crlock);
4064 	cr = pp->p_cred;
4065 	crcopy_to(cr, newcr);
4066 	crsetzone(newcr, zone);
4067 	pp->p_cred = newcr;
4068 
4069 	/*
4070 	 * Restrict all process privilege sets to zone limit
4071 	 */
4072 	priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
4073 	priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
4074 	priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
4075 	priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
4076 	mutex_exit(&pp->p_crlock);
4077 	crset(pp, newcr);
4078 
4079 	/*
4080 	 * Adjust upcount to reflect zone entry.
4081 	 */
4082 	uid = crgetruid(newcr);
4083 	mutex_enter(&pidlock);
4084 	upcount_dec(uid, GLOBAL_ZONEID);
4085 	upcount_inc(uid, zoneid);
4086 	mutex_exit(&pidlock);
4087 
4088 	/*
4089 	 * Set up core file path and content.
4090 	 */
4091 	set_core_defaults();
4092 
4093 out:
4094 	/*
4095 	 * Let the other lwps continue.
4096 	 */
4097 	mutex_enter(&pp->p_lock);
4098 	if (curthread != pp->p_agenttp)
4099 		continuelwps(pp);
4100 	mutex_exit(&pp->p_lock);
4101 
4102 	return (err != 0 ? set_errno(err) : 0);
4103 }
4104 
4105 /*
4106  * Systemcall entry point for zone_list(2).
4107  *
4108  * Processes running in a (non-global) zone only see themselves.
4109  * On labeled systems, they see all zones whose label they dominate.
4110  */
4111 static int
4112 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
4113 {
4114 	zoneid_t *zoneids;
4115 	zone_t *zone, *myzone;
4116 	uint_t user_nzones, real_nzones;
4117 	uint_t domi_nzones;
4118 	int error;
4119 
4120 	if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
4121 		return (set_errno(EFAULT));
4122 
4123 	myzone = curproc->p_zone;
4124 	if (myzone != global_zone) {
4125 		bslabel_t *mybslab;
4126 
4127 		if (!is_system_labeled()) {
4128 			/* just return current zone */
4129 			real_nzones = domi_nzones = 1;
4130 			zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
4131 			zoneids[0] = myzone->zone_id;
4132 		} else {
4133 			/* return all zones that are dominated */
4134 			mutex_enter(&zonehash_lock);
4135 			real_nzones = zonecount;
4136 			domi_nzones = 0;
4137 			if (real_nzones > 0) {
4138 				zoneids = kmem_alloc(real_nzones *
4139 				    sizeof (zoneid_t), KM_SLEEP);
4140 				mybslab = label2bslabel(myzone->zone_slabel);
4141 				for (zone = list_head(&zone_active);
4142 				    zone != NULL;
4143 				    zone = list_next(&zone_active, zone)) {
4144 					if (zone->zone_id == GLOBAL_ZONEID)
4145 						continue;
4146 					if (zone != myzone &&
4147 					    (zone->zone_flags & ZF_IS_SCRATCH))
4148 						continue;
4149 					/*
4150 					 * Note that a label always dominates
4151 					 * itself, so myzone is always included
4152 					 * in the list.
4153 					 */
4154 					if (bldominates(mybslab,
4155 					    label2bslabel(zone->zone_slabel))) {
4156 						zoneids[domi_nzones++] =
4157 						    zone->zone_id;
4158 					}
4159 				}
4160 			}
4161 			mutex_exit(&zonehash_lock);
4162 		}
4163 	} else {
4164 		mutex_enter(&zonehash_lock);
4165 		real_nzones = zonecount;
4166 		domi_nzones = 0;
4167 		if (real_nzones > 0) {
4168 			zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
4169 			    KM_SLEEP);
4170 			for (zone = list_head(&zone_active); zone != NULL;
4171 			    zone = list_next(&zone_active, zone))
4172 				zoneids[domi_nzones++] = zone->zone_id;
4173 			ASSERT(domi_nzones == real_nzones);
4174 		}
4175 		mutex_exit(&zonehash_lock);
4176 	}
4177 
4178 	/*
4179 	 * If user has allocated space for fewer entries than we found, then
4180 	 * return only up to his limit.  Either way, tell him exactly how many
4181 	 * we found.
4182 	 */
4183 	if (domi_nzones < user_nzones)
4184 		user_nzones = domi_nzones;
4185 	error = 0;
4186 	if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
4187 		error = EFAULT;
4188 	} else if (zoneidlist != NULL && user_nzones != 0) {
4189 		if (copyout(zoneids, zoneidlist,
4190 		    user_nzones * sizeof (zoneid_t)) != 0)
4191 			error = EFAULT;
4192 	}
4193 
4194 	if (real_nzones > 0)
4195 		kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
4196 
4197 	if (error != 0)
4198 		return (set_errno(error));
4199 	else
4200 		return (0);
4201 }
4202 
4203 /*
4204  * Systemcall entry point for zone_lookup(2).
4205  *
4206  * Non-global zones are only able to see themselves and (on labeled systems)
4207  * the zones they dominate.
4208  */
4209 static zoneid_t
4210 zone_lookup(const char *zone_name)
4211 {
4212 	char *kname;
4213 	zone_t *zone;
4214 	zoneid_t zoneid;
4215 	int err;
4216 
4217 	if (zone_name == NULL) {
4218 		/* return caller's zone id */
4219 		return (getzoneid());
4220 	}
4221 
4222 	kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
4223 	if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
4224 		kmem_free(kname, ZONENAME_MAX);
4225 		return (set_errno(err));
4226 	}
4227 
4228 	mutex_enter(&zonehash_lock);
4229 	zone = zone_find_all_by_name(kname);
4230 	kmem_free(kname, ZONENAME_MAX);
4231 	/*
4232 	 * In a non-global zone, can only lookup global and own name.
4233 	 * In Trusted Extensions zone label dominance rules apply.
4234 	 */
4235 	if (zone == NULL ||
4236 	    zone_status_get(zone) < ZONE_IS_READY ||
4237 	    !zone_list_access(zone)) {
4238 		mutex_exit(&zonehash_lock);
4239 		return (set_errno(EINVAL));
4240 	} else {
4241 		zoneid = zone->zone_id;
4242 		mutex_exit(&zonehash_lock);
4243 		return (zoneid);
4244 	}
4245 }
4246 
4247 static int
4248 zone_version(int *version_arg)
4249 {
4250 	int version = ZONE_SYSCALL_API_VERSION;
4251 
4252 	if (copyout(&version, version_arg, sizeof (int)) != 0)
4253 		return (set_errno(EFAULT));
4254 	return (0);
4255 }
4256 
/*
 * zone system call entry point.
 *
 * Dispatches on cmd to the handler for that subcommand, casting the
 * opaque arg1..arg4 values to the types the handler expects.  Which
 * arguments are meaningful depends on cmd; unused ones are ignored.
 * An unrecognized cmd fails with EINVAL via set_errno().
 */
/* ARGSUSED */
long
zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
{
	zone_def zs;

	switch (cmd) {
	case ZONE_CREATE:
		/*
		 * The creation parameters are passed as a structure in user
		 * memory (arg1) whose layout depends on the caller's data
		 * model; gather them into the native zone_def either way.
		 */
		if (get_udatamodel() == DATAMODEL_NATIVE) {
			if (copyin(arg1, &zs, sizeof (zone_def))) {
				return (set_errno(EFAULT));
			}
		} else {
#ifdef _SYSCALL32_IMPL
			zone_def32 zs32;

			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
				return (set_errno(EFAULT));
			}
			/*
			 * Convert field by field, widening each value taken
			 * from the 32-bit structure to its native type.
			 */
			zs.zone_name =
			    (const char *)(unsigned long)zs32.zone_name;
			zs.zone_root =
			    (const char *)(unsigned long)zs32.zone_root;
			zs.zone_privs =
			    (const struct priv_set *)
			    (unsigned long)zs32.zone_privs;
			zs.zone_privssz = zs32.zone_privssz;
			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
			zs.rctlbufsz = zs32.rctlbufsz;
			zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
			zs.zfsbufsz = zs32.zfsbufsz;
			zs.extended_error =
			    (int *)(unsigned long)zs32.extended_error;
			zs.match = zs32.match;
			zs.doi = zs32.doi;
			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
#else
			/*
			 * Without _SYSCALL32_IMPL no non-native data model
			 * is handled here, so reaching this point is fatal.
			 */
			panic("get_udatamodel() returned bogus result\n");
#endif
		}

		return (zone_create(zs.zone_name, zs.zone_root,
		    zs.zone_privs, zs.zone_privssz,
		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,
		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,
		    zs.extended_error, zs.match, zs.doi,
		    zs.label));
	/* For the commands below, arg1 carries a zone id by value. */
	case ZONE_BOOT:
		return (zone_boot((zoneid_t)(uintptr_t)arg1));
	case ZONE_DESTROY:
		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
	case ZONE_GETATTR:
		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
	case ZONE_SETATTR:
		return (zone_setattr((zoneid_t)(uintptr_t)arg1,
		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
	case ZONE_ENTER:
		return (zone_enter((zoneid_t)(uintptr_t)arg1));
	/* The remaining commands take user pointers (ZONE_SHUTDOWN an id). */
	case ZONE_LIST:
		return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
	case ZONE_SHUTDOWN:
		return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
	case ZONE_LOOKUP:
		return (zone_lookup((const char *)arg1));
	case ZONE_VERSION:
		return (zone_version((int *)arg1));
	default:
		return (set_errno(EINVAL));
	}
}
4328 
/*
 * Argument bundle handed from zone_kadmin() to the kernel thread running
 * zone_ki_call_zoneadmd().  It is allocated by zone_kadmin() and freed by
 * zone_ki_call_zoneadmd() once its contents have been copied out.
 */
struct zarg {
	zone_t *zone;		/* target zone; held by zone_kadmin() */
	zone_cmd_arg_t arg;	/* request sent to zoneadmd via the door */
};
4333 
4334 static int
4335 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
4336 {
4337 	char *buf;
4338 	size_t buflen;
4339 	int error;
4340 
4341 	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
4342 	buf = kmem_alloc(buflen, KM_SLEEP);
4343 	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
4344 	error = door_ki_open(buf, doorp);
4345 	kmem_free(buf, buflen);
4346 	return (error);
4347 }
4348 
4349 static void
4350 zone_release_door(door_handle_t *doorp)
4351 {
4352 	door_ki_rele(*doorp);
4353 	*doorp = NULL;
4354 }
4355 
/*
 * Body of the kernel thread created by zone_kadmin().
 *
 * Takes ownership of zargp (freeing it immediately), empties the zone,
 * drops the hold that zone_kadmin() placed on it, and then makes a door
 * upcall to deliver the saved zone_cmd_arg_t.  The upcall is retried,
 * with a delay(hz) pause between attempts, for as long as a zone with the
 * original id and uniqid still exists; any upcall error other than
 * EINTR, EAGAIN or EBADF is logged and ends the attempts.  The thread
 * exits when done and never returns to its caller.
 */
static void
zone_ki_call_zoneadmd(struct zarg *zargp)
{
	door_handle_t door = NULL;
	door_arg_t darg, save_arg;
	char *zone_name;
	size_t zone_namelen;
	zoneid_t zoneid;
	zone_t *zone;
	zone_cmd_arg_t arg;
	uint64_t uniqid;
	size_t size;
	int error;
	int retry;	/* counts loop passes; never read */

	/* Copy what we need out of the bundle, then free it. */
	zone = zargp->zone;
	arg = zargp->arg;
	kmem_free(zargp, sizeof (*zargp));

	/*
	 * Keep private copies of the name, id and uniqid: the zone hold is
	 * dropped below, after which the zone_t must not be dereferenced.
	 */
	zone_namelen = strlen(zone->zone_name) + 1;
	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
	bcopy(zone->zone_name, zone_name, zone_namelen);
	zoneid = zone->zone_id;
	uniqid = zone->zone_uniqid;
	/*
	 * zoneadmd may be down, but at least we can empty out the zone.
	 * We can ignore the return value of zone_empty() since we're called
	 * from a kernel thread and know we won't be delivered any signals.
	 */
	ASSERT(curproc == &p0);
	(void) zone_empty(zone);
	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
	zone_rele(zone);

	/* The same buffer serves as both request data and result buffer. */
	size = sizeof (arg);
	darg.rbuf = (char *)&arg;
	darg.data_ptr = (char *)&arg;
	darg.rsize = size;
	darg.data_size = size;
	darg.desc_ptr = NULL;
	darg.desc_num = 0;

	/*
	 * Remember the pristine argument so each retry starts from it;
	 * presumably door_ki_upcall() may modify darg -- TODO confirm.
	 */
	save_arg = darg;
	/*
	 * Since we're not holding a reference to the zone, any number of
	 * things can go wrong, including the zone disappearing before we get a
	 * chance to talk to zoneadmd.
	 */
	for (retry = 0; /* forever */; retry++) {
		/* (Re)open the door if we do not currently hold one. */
		if (door == NULL &&
		    (error = zone_lookup_door(zone_name, &door)) != 0) {
			goto next;
		}
		ASSERT(door != NULL);

		if ((error = door_ki_upcall(door, &darg)) == 0) {
			break;
		}
		switch (error) {
		case EINTR:
			/* FALLTHROUGH */
		case EAGAIN:	/* process may be forking */
			/*
			 * Back off for a bit
			 */
			break;
		case EBADF:
			/*
			 * The handle is stale: drop it and try to reopen.
			 * Both outcomes fall through to the retry logic
			 * below; on failure door is left NULL and the open
			 * is attempted again at the top of the loop.
			 */
			zone_release_door(&door);
			if (zone_lookup_door(zone_name, &door) != 0) {
				/*
				 * zoneadmd may be dead, but it may come back to
				 * life later.
				 */
				break;
			}
			break;
		default:
			cmn_err(CE_WARN,
			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
			    error);
			goto out;
		}
next:
		/*
		 * If this isn't the same zone_t that we originally had in mind,
		 * then this is the same as if two kadmin requests come in at
		 * the same time: the first one wins.  This means we lose, so we
		 * bail.
		 */
		if ((zone = zone_find_by_id(zoneid)) == NULL) {
			/*
			 * Problem is solved.
			 */
			break;
		}
		if (zone->zone_uniqid != uniqid) {
			/*
			 * zoneid recycled
			 */
			zone_rele(zone);
			break;
		}
		/*
		 * We could zone_status_timedwait(), but there doesn't seem to
		 * be much point in doing that (plus, it would mean that
		 * zone_free() isn't called until this thread exits).
		 */
		zone_rele(zone);
		delay(hz);
		darg = save_arg;
	}
out:
	/* Common cleanup for every way out of the loop. */
	if (door != NULL) {
		zone_release_door(&door);
	}
	kmem_free(zone_name, zone_namelen);
	thread_exit();
}
4474 
/*
 * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
 * kadmin().  The caller is a process in the zone.
 *
 * In order to shutdown the zone, we will hand off control to zoneadmd
 * (running in the global zone) via a door.  We do a half-hearted job at
 * killing all processes in the zone, create a kernel thread to contact
 * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
 * a form of generation number used to let zoneadmd (as well as
 * zone_destroy()) know exactly which zone they're talking about.
 *
 * Arguments:
 *	cmd, fcn  the uadmin command and function codes
 *	mdep	  optional boot argument string, already in kernel memory;
 *		  may be NULL
 *	credp	  credentials checked with secpolicy_zone_admin()
 *
 * Returns a plain error number (not passed through set_errno()):
 * ENOTSUP or EINVAL for commands that are not honoured here, EPERM if
 * the policy check fails, and 0 if the zone is already past
 * ZONE_IS_RUNNING.  Otherwise the calling process exits via exit().
 */
int
zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
{
	struct zarg *zargp;
	zone_cmd_t zcmd;
	zone_t *zone;

	zone = curproc->p_zone;
	ASSERT(getzoneid() != GLOBAL_ZONEID);

	/* Map the uadmin request onto a zoneadmd command (halt or reboot). */
	switch (cmd) {
	case A_SHUTDOWN:
		switch (fcn) {
		case AD_HALT:
		case AD_POWEROFF:
			zcmd = Z_HALT;
			break;
		case AD_BOOT:
			zcmd = Z_REBOOT;
			break;
		case AD_IBOOT:
		case AD_SBOOT:
		case AD_SIBOOT:
		case AD_NOSYNC:
			return (ENOTSUP);
		default:
			return (EINVAL);
		}
		break;
	case A_REBOOT:
		zcmd = Z_REBOOT;
		break;
	case A_FTRACE:
	case A_REMOUNT:
	case A_FREEZE:
	case A_DUMP:
		return (ENOTSUP);
	default:
		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
		return (EINVAL);
	}

	if (secpolicy_zone_admin(credp, B_FALSE))
		return (EPERM);
	mutex_enter(&zone_status_lock);

	/*
	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
	 * is in the zone.
	 */
	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
		/*
		 * This zone is already on its way down.
		 */
		mutex_exit(&zone_status_lock);
		return (0);
	}
	/*
	 * Prevent future zone_enter()s
	 */
	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
	mutex_exit(&zone_status_lock);

	/*
	 * Kill everyone now and call zoneadmd later.
	 * zone_ki_call_zoneadmd() will do a more thorough job of this
	 * later.
	 */
	killall(zone->zone_id);
	/*
	 * Now, create the thread to contact zoneadmd and do the rest of the
	 * work.  This thread can't be created in our zone otherwise
	 * zone_destroy() would deadlock.
	 */
	zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
	zargp->arg.cmd = zcmd;
	zargp->arg.uniqid = zone->zone_uniqid;
	zargp->zone = zone;
	(void) strcpy(zargp->arg.locale, "C");
	/* mdep was already copied in for us by uadmin */
	if (mdep != NULL)
		(void) strlcpy(zargp->arg.bootbuf, mdep,
		    sizeof (zargp->arg.bootbuf));
	/*
	 * This hold is handed to the new thread, which releases it in
	 * zone_ki_call_zoneadmd(); zargp is likewise freed there.
	 */
	zone_hold(zone);

	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
	    TS_RUN, minclsyspri);
	exit(CLD_EXITED, 0);

	/* Presumably not reached, since exit() ends the calling process. */
	return (EINVAL);
}
4578 
4579 /*
4580  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
4581  * status to ZONE_IS_SHUTTING_DOWN.
4582  */
4583 void
4584 zone_shutdown_global(void)
4585 {
4586 	ASSERT(curproc->p_zone == global_zone);
4587 
4588 	mutex_enter(&zone_status_lock);
4589 	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
4590 	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
4591 	mutex_exit(&zone_status_lock);
4592 }
4593 
4594 /*
4595  * Returns true if the named dataset is visible in the current zone.
4596  * The 'write' parameter is set to 1 if the dataset is also writable.
4597  */
4598 int
4599 zone_dataset_visible(const char *dataset, int *write)
4600 {
4601 	zone_dataset_t *zd;
4602 	size_t len;
4603 	zone_t *zone = curproc->p_zone;
4604 
4605 	if (dataset[0] == '\0')
4606 		return (0);
4607 
4608 	/*
4609 	 * Walk the list once, looking for datasets which match exactly, or
4610 	 * specify a dataset underneath an exported dataset.  If found, return
4611 	 * true and note that it is writable.
4612 	 */
4613 	for (zd = list_head(&zone->zone_datasets); zd != NULL;
4614 	    zd = list_next(&zone->zone_datasets, zd)) {
4615 
4616 		len = strlen(zd->zd_dataset);
4617 		if (strlen(dataset) >= len &&
4618 		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
4619 		    (dataset[len] == '\0' || dataset[len] == '/' ||
4620 		    dataset[len] == '@')) {
4621 			if (write)
4622 				*write = 1;
4623 			return (1);
4624 		}
4625 	}
4626 
4627 	/*
4628 	 * Walk the list a second time, searching for datasets which are parents
4629 	 * of exported datasets.  These should be visible, but read-only.
4630 	 *
4631 	 * Note that we also have to support forms such as 'pool/dataset/', with
4632 	 * a trailing slash.
4633 	 */
4634 	for (zd = list_head(&zone->zone_datasets); zd != NULL;
4635 	    zd = list_next(&zone->zone_datasets, zd)) {
4636 
4637 		len = strlen(dataset);
4638 		if (dataset[len - 1] == '/')
4639 			len--;	/* Ignore trailing slash */
4640 		if (len < strlen(zd->zd_dataset) &&
4641 		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
4642 		    zd->zd_dataset[len] == '/') {
4643 			if (write)
4644 				*write = 0;
4645 			return (1);
4646 		}
4647 	}
4648 
4649 	return (0);
4650 }
4651 
4652 /*
4653  * zone_find_by_any_path() -
4654  *
4655  * kernel-private routine similar to zone_find_by_path(), but which
4656  * effectively compares against zone paths rather than zonerootpath
4657  * (i.e., the last component of zonerootpaths, which should be "root/",
4658  * are not compared.)  This is done in order to accurately identify all
4659  * paths, whether zone-visible or not, including those which are parallel
4660  * to /root/, such as /dev/, /home/, etc...
4661  *
4662  * If the specified path does not fall under any zone path then global
4663  * zone is returned.
4664  *
4665  * The treat_abs parameter indicates whether the path should be treated as
4666  * an absolute path although it does not begin with "/".  (This supports
4667  * nfs mount syntax such as host:any/path.)
4668  *
4669  * The caller is responsible for zone_rele of the returned zone.
4670  */
4671 zone_t *
4672 zone_find_by_any_path(const char *path, boolean_t treat_abs)
4673 {
4674 	zone_t *zone;
4675 	int path_offset = 0;
4676 
4677 	if (path == NULL) {
4678 		zone_hold(global_zone);
4679 		return (global_zone);
4680 	}
4681 
4682 	if (*path != '/') {
4683 		ASSERT(treat_abs);
4684 		path_offset = 1;
4685 	}
4686 
4687 	mutex_enter(&zonehash_lock);
4688 	for (zone = list_head(&zone_active); zone != NULL;
4689 	    zone = list_next(&zone_active, zone)) {
4690 		char	*c;
4691 		size_t	pathlen;
4692 		char *rootpath_start;
4693 
4694 		if (zone == global_zone)	/* skip global zone */
4695 			continue;
4696 
4697 		/* scan backwards to find start of last component */
4698 		c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
4699 		do {
4700 			c--;
4701 		} while (*c != '/');
4702 
4703 		pathlen = c - zone->zone_rootpath + 1 - path_offset;
4704 		rootpath_start = (zone->zone_rootpath + path_offset);
4705 		if (strncmp(path, rootpath_start, pathlen) == 0)
4706 			break;
4707 	}
4708 	if (zone == NULL)
4709 		zone = global_zone;
4710 	zone_hold(zone);
4711 	mutex_exit(&zonehash_lock);
4712 	return (zone);
4713 }
4714