xref: /titanic_50/usr/src/uts/common/os/zone.c (revision 8eea8e29cc4374d1ee24c25a07f45af132db3499)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Zones
31  *
32  *   A zone is a named collection of processes, namespace constraints,
33  *   and other system resources which comprise a secure and manageable
34  *   application containment facility.
35  *
36  *   Zones (represented by the reference counted zone_t) are tracked in
37  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
38  *   (zoneid_t) are used to track zone association.  Zone IDs are
39  *   dynamically generated when the zone is created; if a persistent
40  *   identifier is needed (core files, accounting logs, audit trail,
41  *   etc.), the zone name should be used.
42  *
43  *
44  *   Global Zone:
45  *
46  *   The global zone (zoneid 0) is automatically associated with all
47  *   system resources that have not been bound to a user-created zone.
48  *   This means that even systems where zones are not in active use
49  *   have a global zone, and all processes, mounts, etc. are
50  *   associated with that zone.  The global zone is generally
51  *   unconstrained in terms of privileges and access, though the usual
52  *   credential and privilege based restrictions apply.
53  *
54  *
55  *   Zone States:
56  *
57  *   The states a zone may be in and the transitions between them are as
58  *   follows:
59  *
60  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
61  *   initialized zone is added to the list of active zones on the system but
62  *   isn't accessible.
63  *
64  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
65  *   ready.  The zone is made visible after the ZSD constructor callbacks are
66  *   executed.  A zone remains in this state until it transitions into
67  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
68  *
69  *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
70  *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
71  *   state.
72  *
73  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
74  *   successfully started init.   A zone remains in this state until
75  *   zone_shutdown() is called.
76  *
77  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
78  *   killing all processes running in the zone. The zone remains
79  *   in this state until there are no more user processes running in the zone.
80  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
81  *   Since zone_shutdown() is restartable, it may be called successfully
82  *   multiple times for the same zone_t.  Setting of the zone's state to
83  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
84  *   the zone's status without worrying about it being a moving target.
85  *
86  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
87  *   are no more user processes in the zone.  The zone remains in this
88  *   state until there are no more kernel threads associated with the
89  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
90  *   fail.
91  *
92  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
93  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
94  *   join the zone or create kernel threads therein.
95  *
96  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
97  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
98  *   return NULL from now on.
99  *
100  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
101  *   processes or threads doing work on behalf of the zone.  The zone is
102  *   removed from the list of active zones.  zone_destroy() returns, and
103  *   the zone can be recreated.
104  *
105  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
106  *   callbacks are executed, and all memory associated with the zone is
107  *   freed.
108  *
109  *   Threads can wait for the zone to enter a requested state by using
110  *   zone_status_wait() or zone_status_timedwait() with the desired
111  *   state passed in as an argument.  Zone state transitions are
112  *   uni-directional; it is not possible to move back to an earlier state.
113  *
114  *
115  *   Zone-Specific Data:
116  *
117  *   Subsystems needing to maintain zone-specific data can store that
118  *   data using the ZSD mechanism.  This provides a zone-specific data
119  *   store, similar to thread-specific data (see pthread_getspecific(3C)
120  *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
121  *   to register callbacks to be invoked when a zone is created, shut
122  *   down, or destroyed.  This can be used to initialize zone-specific
123  *   data for new zones and to clean up when zones go away.
124  *
125  *
126  *   Data Structures:
127  *
128  *   The per-zone structure (zone_t) is reference counted, and freed
129  *   when all references are released.  zone_hold and zone_rele can be
130  *   used to adjust the reference count.  In addition, reference counts
131  *   associated with the cred_t structure are tracked separately using
132  *   zone_cred_hold and zone_cred_rele.
133  *
134  *   Pointers to active zone_t's are stored in two hash tables; one
135  *   for searching by id, the other for searching by name.  Lookups
136  *   can be performed on either basis, using zone_find_by_id and
137  *   zone_find_by_name.  Both return zone_t pointers with the zone
138  *   held, so zone_rele should be called when the pointer is no longer
139  *   needed.  Zones can also be searched by path; zone_find_by_path
140  *   returns the zone with which a path name is associated (global
141  *   zone if the path is not within some other zone's file system
142  *   hierarchy).  This currently requires iterating through each zone,
143  *   so it is slower than an id or name search via a hash table.
144  *
145  *
146  *   Locking:
147  *
148  *   zonehash_lock: This is a top-level global lock used to protect the
149  *       zone hash tables and lists.  Zones cannot be created or destroyed
150  *       while this lock is held.
151  *   zone_status_lock: This is a global lock protecting zone state.
152  *       Zones cannot change state while this lock is held.  It also
153  *       protects the list of kernel threads associated with a zone.
154  *   zone_lock: This is a per-zone lock used to protect several fields of
155  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
156  *       this lock means that the zone cannot go away.
157  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
158  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
159  *       list (a list of zones in the ZONE_IS_DEAD state).
160  *
161  *   Ordering requirements:
162  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
163  *       	zone_lock --> zsd_key_lock --> pidlock --> p_lock
164  *
165  *   Blocking memory allocations are permitted while holding any of the
166  *   zone locks.
167  *
168  *
169  *   System Call Interface:
170  *
171  *   The zone subsystem can be managed and queried from user level with
172  *   the following system calls (all subcodes of the primary "zone"
173  *   system call):
174  *   - zone_create: creates a zone with selected attributes (name,
175  *     root path, privileges, resource controls)
176  *   - zone_enter: allows the current process to enter a zone
177  *   - zone_getattr: reports attributes of a zone
178  *   - zone_list: lists all zones active in the system
179  *   - zone_lookup: looks up zone id based on name
180  *   - zone_shutdown: initiates shutdown process (see states above)
181  *   - zone_destroy: completes shutdown process (see states above)
182  *
183  */
184 
185 #include <sys/priv_impl.h>
186 #include <sys/cred.h>
187 #include <c2/audit.h>
188 #include <sys/ddi.h>
189 #include <sys/debug.h>
190 #include <sys/file.h>
191 #include <sys/kmem.h>
192 #include <sys/mutex.h>
193 #include <sys/pathname.h>
194 #include <sys/proc.h>
195 #include <sys/project.h>
196 #include <sys/task.h>
197 #include <sys/systm.h>
198 #include <sys/types.h>
199 #include <sys/utsname.h>
200 #include <sys/vnode.h>
201 #include <sys/vfs.h>
202 #include <sys/systeminfo.h>
203 #include <sys/policy.h>
204 #include <sys/cred_impl.h>
205 #include <sys/contract_impl.h>
206 #include <sys/contract/process_impl.h>
207 #include <sys/class.h>
208 #include <sys/pool.h>
209 #include <sys/pool_pset.h>
210 #include <sys/pset.h>
211 #include <sys/log.h>
212 #include <sys/sysmacros.h>
213 #include <sys/callb.h>
214 #include <sys/vmparam.h>
215 #include <sys/corectl.h>
216 
217 #include <sys/door.h>
218 #include <sys/cpuvar.h>
219 #include <sys/fs/snode.h>
220 
221 #include <sys/uadmin.h>
222 #include <sys/session.h>
223 #include <sys/cmn_err.h>
224 #include <sys/modhash.h>
225 #include <sys/nvpair.h>
226 #include <sys/rctl.h>
227 #include <sys/fss.h>
228 #include <sys/zone.h>
229 
230 /*
231  * cv used to signal that all references to the zone have been released.  This
232  * needs to be global since there may be multiple waiters, and the first to
233  * wake up will free the zone_t, hence we cannot use zone->zone_cv.
234  */
235 static kcondvar_t zone_destroy_cv;
236 /*
237  * Lock used to serialize access to zone_cv.  This could have been per-zone,
238  * but then we'd need another lock for zone_destroy_cv, and why bother?
239  */
240 static kmutex_t zone_status_lock;
241 
242 /*
243  * ZSD-related global variables.
244  */
245 static kmutex_t zsd_key_lock;	/* protects the following two */
246 /*
247  * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
248  */
249 static zone_key_t zsd_keyval = 0;
250 /*
251  * Global list of registered keys.  We use this when a new zone is created.
252  */
253 static list_t zsd_registered_keys;
254 
255 int zone_hash_size = 256;
256 static mod_hash_t *zonehashbyname, *zonehashbyid;
257 static kmutex_t zonehash_lock;
258 static uint_t zonecount;
259 static id_space_t *zoneid_space;
260 
261 /*
262  * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
263  * kernel proper runs, and which manages all other zones.
264  *
265  * Although not declared as static, the variable "zone0" should not be used
266  * except for by code that needs to reference the global zone early on in boot,
267  * before it is fully initialized.  All other consumers should use
268  * 'global_zone'.
269  */
270 zone_t zone0;
271 zone_t *global_zone = NULL;	/* Set when the global zone is initialized */
272 
273 /*
274  * List of active zones, protected by zonehash_lock.
275  */
276 static list_t zone_active;
277 
278 /*
279  * List of destroyed zones that still have outstanding cred references.
280  * Used for debugging.  Uses a separate lock to avoid lock ordering
281  * problems in zone_free.
282  */
283 static list_t zone_deathrow;
284 static kmutex_t zone_deathrow_lock;
285 
286 /* number of zones is limited by virtual interface limit in IP */
287 uint_t maxzones = 8192;
288 
289 /*
290  * This isn't static so lint doesn't complain.
291  */
292 rctl_hndl_t rc_zone_cpu_shares;
293 rctl_hndl_t rc_zone_nlwps;
294 /*
295  * Synchronization primitives used to synchronize between mounts and zone
296  * creation/destruction.
297  */
298 static int mounts_in_progress;
299 static kcondvar_t mount_cv;
300 static kmutex_t mount_lock;
301 
302 const char * const zone_initname = "/sbin/init";
303 
304 static int zone_shutdown(zoneid_t zoneid);
305 
306 /*
307  * Certain filesystems (such as NFS and autofs) need to know which zone
308  * the mount is being placed in.  Because of this, we need to be able to
309  * ensure that a zone isn't in the process of being created such that
310  * nfs_mount() thinks it is in the global zone, while by the time it
311  * gets added to the list of mounted zones, it ends up on zoneA's mount
312  * list.
313  *
314  * The following functions: block_mounts()/resume_mounts() and
315  * mount_in_progress()/mount_completed() are used by zones and the VFS
316  * layer (respectively) to synchronize zone creation and new mounts.
317  *
318  * The semantics are like a reader-reader lock such that there may
319  * either be multiple mounts (or zone creations, if that weren't
320  * serialized by zonehash_lock) in progress at the same time, but not
321  * both.
322  *
323  * We use cv's so the user can ctrl-C out of the operation if it's
324  * taking too long.
325  *
326  * The semantics are such that there is unfair bias towards the
327  * "current" operation.  This means that zone creations may starve if
328  * there is a rapid succession of new mounts coming in to the system, or
329  * there is a remote possibility that zones will be created at such a
330  * rate that new mounts will not be able to proceed.
331  */
332 /*
333  * Prevent new mounts from progressing to the point of calling
334  * VFS_MOUNT().  If there are already mounts in this "region", wait for
335  * them to complete.
336  */
337 static int
338 block_mounts(void)
339 {
340 	int retval = 0;
341 
342 	/*
343 	 * Since it may block for a long time, block_mounts() shouldn't be
344 	 * called with zonehash_lock held.
345 	 */
346 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
347 	mutex_enter(&mount_lock);
348 	while (mounts_in_progress > 0) {
349 		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
350 			goto signaled;
351 	}
352 	/*
353 	 * A negative value of mounts_in_progress indicates that mounts
354 	 * have been blocked by (-mounts_in_progress) different callers.
355 	 */
356 	mounts_in_progress--;
357 	retval = 1;
358 signaled:
359 	mutex_exit(&mount_lock);
360 	return (retval);
361 }
362 
363 /*
364  * The VFS layer may progress with new mounts as far as we're concerned.
365  * Allow them to progress if we were the last obstacle.
366  */
367 static void
368 resume_mounts(void)
369 {
370 	mutex_enter(&mount_lock);
371 	if (++mounts_in_progress == 0)
372 		cv_broadcast(&mount_cv);
373 	mutex_exit(&mount_lock);
374 }
375 
376 /*
377  * The VFS layer is busy with a mount; zones should wait until all
378  * mounts are completed to progress.
379  */
380 void
381 mount_in_progress(void)
382 {
383 	mutex_enter(&mount_lock);
384 	while (mounts_in_progress < 0)
385 		cv_wait(&mount_cv, &mount_lock);
386 	mounts_in_progress++;
387 	mutex_exit(&mount_lock);
388 }
389 
390 /*
391  * VFS is done with one mount; wake up any waiting block_mounts()
392  * callers if this is the last mount.
393  */
394 void
395 mount_completed(void)
396 {
397 	mutex_enter(&mount_lock);
398 	if (--mounts_in_progress == 0)
399 		cv_broadcast(&mount_cv);
400 	mutex_exit(&mount_lock);
401 }
402 
403 /*
404  * ZSD routines.
405  *
406  * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
407  * defined by the pthread_key_create() and related interfaces.
408  *
409  * Kernel subsystems may register one or more data items and/or
410  * callbacks to be executed when a zone is created, shut down, or
411  * destroyed.
412  *
413  * Unlike the thread counterpart, destructor callbacks will be executed
414  * even if the data pointer is NULL and/or there are no constructor
415  * callbacks, so it is the responsibility of such callbacks to check for
416  * NULL data values if necessary.
417  *
418  * The locking strategy and overall picture is as follows:
419  *
420  * When someone calls zone_key_create(), a template ZSD entry is added to the
421  * global list "zsd_registered_keys", protected by zsd_key_lock.  The
422  * constructor callback is called immediately on all existing zones, and a
423  * copy of the ZSD entry added to the per-zone zone_zsd list (protected by
424  * zone_lock).  As this operation requires the list of zones, the list of
425  * registered keys, and the per-zone list of ZSD entries to remain constant
426  * throughout the entire operation, it must grab zonehash_lock, zone_lock for
427  * all existing zones, and zsd_key_lock, in that order.  Similar locking is
428  * needed when zone_key_delete() is called.  It is thus sufficient to hold
429  * zsd_key_lock *or* zone_lock to prevent additions to or removals from the
430  * per-zone zone_zsd list.
431  *
432  * Note that this implementation does not make a copy of the ZSD entry if a
433  * constructor callback is not provided.  A zone_getspecific() on such an
434  * uninitialized ZSD entry will return NULL.
435  *
436  * When new zones are created constructor callbacks for all registered ZSD
437  * entries will be called.
438  *
439  * The framework does not provide any locking around zone_getspecific() and
440  * zone_setspecific() apart from that needed for internal consistency, so
441  * callers interested in atomic "test-and-set" semantics will need to provide
442  * their own locking.
443  */
444 void
445 zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
446     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
447 {
448 	struct zsd_entry *zsdp;
449 	struct zsd_entry *t;
450 	struct zone *zone;
451 
452 	zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP);
453 	zsdp->zsd_data = NULL;
454 	zsdp->zsd_create = create;
455 	zsdp->zsd_shutdown = shutdown;
456 	zsdp->zsd_destroy = destroy;
457 
458 	mutex_enter(&zonehash_lock);	/* stop the world */
459 	for (zone = list_head(&zone_active); zone != NULL;
460 	    zone = list_next(&zone_active, zone))
461 		mutex_enter(&zone->zone_lock);	/* lock all zones */
462 
463 	mutex_enter(&zsd_key_lock);
464 	*keyp = zsdp->zsd_key = ++zsd_keyval;
465 	ASSERT(zsd_keyval != 0);
466 	list_insert_tail(&zsd_registered_keys, zsdp);
467 	mutex_exit(&zsd_key_lock);
468 
469 	if (create != NULL) {
470 		for (zone = list_head(&zone_active); zone != NULL;
471 		    zone = list_next(&zone_active, zone)) {
472 			t = kmem_alloc(sizeof (*t), KM_SLEEP);
473 			t->zsd_key = *keyp;
474 			t->zsd_data = (*create)(zone->zone_id);
475 			t->zsd_create = create;
476 			t->zsd_shutdown = shutdown;
477 			t->zsd_destroy = destroy;
478 			list_insert_tail(&zone->zone_zsd, t);
479 		}
480 	}
481 	for (zone = list_head(&zone_active); zone != NULL;
482 	    zone = list_next(&zone_active, zone))
483 		mutex_exit(&zone->zone_lock);
484 	mutex_exit(&zonehash_lock);
485 }
486 
487 /*
488  * Helper function to find the zsd_entry associated with the key in the
489  * given list.
490  */
491 static struct zsd_entry *
492 zsd_find(list_t *l, zone_key_t key)
493 {
494 	struct zsd_entry *zsd;
495 
496 	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
497 		if (zsd->zsd_key == key) {
498 			/*
499 			 * Move to head of list to keep list in MRU order.
500 			 */
501 			if (zsd != list_head(l)) {
502 				list_remove(l, zsd);
503 				list_insert_head(l, zsd);
504 			}
505 			return (zsd);
506 		}
507 	}
508 	return (NULL);
509 }
510 
511 /*
512  * Function called when a module is being unloaded, or otherwise wishes
513  * to unregister its ZSD key and callbacks.
514  */
515 int
516 zone_key_delete(zone_key_t key)
517 {
518 	struct zsd_entry *zsdp = NULL;
519 	zone_t *zone;
520 
521 	mutex_enter(&zonehash_lock);	/* Zone create/delete waits for us */
522 	for (zone = list_head(&zone_active); zone != NULL;
523 	    zone = list_next(&zone_active, zone))
524 		mutex_enter(&zone->zone_lock);	/* lock all zones */
525 
526 	mutex_enter(&zsd_key_lock);
527 	zsdp = zsd_find(&zsd_registered_keys, key);
528 	if (zsdp == NULL)
529 		goto notfound;
530 	list_remove(&zsd_registered_keys, zsdp);
531 	mutex_exit(&zsd_key_lock);
532 
533 	for (zone = list_head(&zone_active); zone != NULL;
534 	    zone = list_next(&zone_active, zone)) {
535 		struct zsd_entry *del;
536 		void *data;
537 
538 		if (!(zone->zone_flags & ZF_DESTROYED)) {
539 			del = zsd_find(&zone->zone_zsd, key);
540 			if (del != NULL) {
541 				data = del->zsd_data;
542 				ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
543 				ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
544 				list_remove(&zone->zone_zsd, del);
545 				kmem_free(del, sizeof (*del));
546 			} else {
547 				data = NULL;
548 			}
549 			if (zsdp->zsd_shutdown)
550 				zsdp->zsd_shutdown(zone->zone_id, data);
551 			if (zsdp->zsd_destroy)
552 				zsdp->zsd_destroy(zone->zone_id, data);
553 		}
554 		mutex_exit(&zone->zone_lock);
555 	}
556 	mutex_exit(&zonehash_lock);
557 	kmem_free(zsdp, sizeof (*zsdp));
558 	return (0);
559 
560 notfound:
561 	mutex_exit(&zsd_key_lock);
562 	for (zone = list_head(&zone_active); zone != NULL;
563 	    zone = list_next(&zone_active, zone))
564 		mutex_exit(&zone->zone_lock);
565 	mutex_exit(&zonehash_lock);
566 	return (-1);
567 }
568 
569 /*
570  * ZSD counterpart of pthread_setspecific().
571  */
572 int
573 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
574 {
575 	struct zsd_entry *t;
576 	struct zsd_entry *zsdp = NULL;
577 
578 	mutex_enter(&zone->zone_lock);
579 	t = zsd_find(&zone->zone_zsd, key);
580 	if (t != NULL) {
581 		/*
582 		 * Replace old value with new
583 		 */
584 		t->zsd_data = (void *)data;
585 		mutex_exit(&zone->zone_lock);
586 		return (0);
587 	}
588 	/*
589 	 * If there was no previous value, go through the list of registered
590 	 * keys.
591 	 *
592 	 * We avoid grabbing zsd_key_lock until we are sure we need it; this is
593 	 * necessary for shutdown callbacks to be able to execute without fear
594 	 * of deadlock.
595 	 */
596 	mutex_enter(&zsd_key_lock);
597 	zsdp = zsd_find(&zsd_registered_keys, key);
598 	if (zsdp == NULL) { 	/* Key was not registered */
599 		mutex_exit(&zsd_key_lock);
600 		mutex_exit(&zone->zone_lock);
601 		return (-1);
602 	}
603 
604 	/*
605 	 * Add a zsd_entry to this zone, using the template we just retrieved
606 	 * to initialize the constructor and destructor(s).
607 	 */
608 	t = kmem_alloc(sizeof (*t), KM_SLEEP);
609 	t->zsd_key = key;
610 	t->zsd_data = (void *)data;
611 	t->zsd_create = zsdp->zsd_create;
612 	t->zsd_shutdown = zsdp->zsd_shutdown;
613 	t->zsd_destroy = zsdp->zsd_destroy;
614 	list_insert_tail(&zone->zone_zsd, t);
615 	mutex_exit(&zsd_key_lock);
616 	mutex_exit(&zone->zone_lock);
617 	return (0);
618 }
619 
620 /*
621  * ZSD counterpart of pthread_getspecific().
622  */
623 void *
624 zone_getspecific(zone_key_t key, zone_t *zone)
625 {
626 	struct zsd_entry *t;
627 	void *data;
628 
629 	mutex_enter(&zone->zone_lock);
630 	t = zsd_find(&zone->zone_zsd, key);
631 	data = (t == NULL ? NULL : t->zsd_data);
632 	mutex_exit(&zone->zone_lock);
633 	return (data);
634 }
635 
636 /*
637  * Function used to initialize a zone's list of ZSD callbacks and data
638  * when the zone is being created.  The callbacks are initialized from
639  * the template list (zsd_registered_keys), and the constructor
640  * callback executed (if one exists).
641  *
642  * This is called before the zone is made publicly available, hence no
643  * need to grab zone_lock.
644  *
645  * Although we grab and release zsd_key_lock, new entries cannot be
646  * added to or removed from the zsd_registered_keys list until we
647  * release zonehash_lock, so there isn't a window for a
648  * zone_key_create() to come in after we've dropped zsd_key_lock but
649  * before the zone is added to the zone list, such that the constructor
650  * callbacks aren't executed for the new zone.
651  */
652 static void
653 zone_zsd_configure(zone_t *zone)
654 {
655 	struct zsd_entry *zsdp;
656 	struct zsd_entry *t;
657 	zoneid_t zoneid = zone->zone_id;
658 
659 	ASSERT(MUTEX_HELD(&zonehash_lock));
660 	ASSERT(list_head(&zone->zone_zsd) == NULL);
661 	mutex_enter(&zsd_key_lock);
662 	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
663 	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
664 		if (zsdp->zsd_create != NULL) {
665 			t = kmem_alloc(sizeof (*t), KM_SLEEP);
666 			t->zsd_key = zsdp->zsd_key;
667 			t->zsd_create = zsdp->zsd_create;
668 			t->zsd_data = (*t->zsd_create)(zoneid);
669 			t->zsd_shutdown = zsdp->zsd_shutdown;
670 			t->zsd_destroy = zsdp->zsd_destroy;
671 			list_insert_tail(&zone->zone_zsd, t);
672 		}
673 	}
674 	mutex_exit(&zsd_key_lock);
675 }
676 
/*
 * Kinds of ZSD callback.  zone_zsd_callbacks() accepts only ZSD_SHUTDOWN
 * and ZSD_DESTROY.
 */
enum zsd_callback_type {
	ZSD_CREATE,
	ZSD_SHUTDOWN,
	ZSD_DESTROY
};
678 
679 /*
680  * Helper function to execute shutdown or destructor callbacks.
681  */
682 static void
683 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
684 {
685 	struct zsd_entry *zsdp;
686 	struct zsd_entry *t;
687 	zoneid_t zoneid = zone->zone_id;
688 
689 	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
690 	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
691 	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
692 
693 	mutex_enter(&zone->zone_lock);
694 	if (ct == ZSD_DESTROY) {
695 		if (zone->zone_flags & ZF_DESTROYED) {
696 			/*
697 			 * Make sure destructors are only called once.
698 			 */
699 			mutex_exit(&zone->zone_lock);
700 			return;
701 		}
702 		zone->zone_flags |= ZF_DESTROYED;
703 	}
704 	mutex_exit(&zone->zone_lock);
705 
706 	/*
707 	 * Both zsd_key_lock and zone_lock need to be held in order to add or
708 	 * remove a ZSD key, (either globally as part of
709 	 * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is
710 	 * possible through zone_setspecific()), so it's sufficient to hold
711 	 * zsd_key_lock here.
712 	 *
713 	 * This is a good thing, since we don't want to recursively try to grab
714 	 * zone_lock if a callback attempts to do something like a crfree() or
715 	 * zone_rele().
716 	 */
717 	mutex_enter(&zsd_key_lock);
718 	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
719 	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
720 		zone_key_t key = zsdp->zsd_key;
721 
722 		/* Skip if no callbacks registered */
723 		if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL)
724 			continue;
725 		if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL)
726 			continue;
727 		/*
728 		 * Call the callback with the zone-specific data if we can find
729 		 * any, otherwise with NULL.
730 		 */
731 		t = zsd_find(&zone->zone_zsd, key);
732 		if (t != NULL) {
733 			if (ct == ZSD_SHUTDOWN) {
734 				t->zsd_shutdown(zoneid, t->zsd_data);
735 			} else {
736 				ASSERT(ct == ZSD_DESTROY);
737 				t->zsd_destroy(zoneid, t->zsd_data);
738 			}
739 		} else {
740 			if (ct == ZSD_SHUTDOWN) {
741 				zsdp->zsd_shutdown(zoneid, NULL);
742 			} else {
743 				ASSERT(ct == ZSD_DESTROY);
744 				zsdp->zsd_destroy(zoneid, NULL);
745 			}
746 		}
747 	}
748 	mutex_exit(&zsd_key_lock);
749 }
750 
751 /*
752  * Called when the zone is going away; free ZSD-related memory, and
753  * destroy the zone_zsd list.
754  */
755 static void
756 zone_free_zsd(zone_t *zone)
757 {
758 	struct zsd_entry *t, *next;
759 
760 	/*
761 	 * Free all the zsd_entry's we had on this zone.
762 	 */
763 	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
764 		next = list_next(&zone->zone_zsd, t);
765 		list_remove(&zone->zone_zsd, t);
766 		kmem_free(t, sizeof (*t));
767 	}
768 	list_destroy(&zone->zone_zsd);
769 }
770 
771 /*
772  * zone.cpu-shares resource control support.
773  */
774 /*ARGSUSED*/
775 static rctl_qty_t
776 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
777 {
778 	ASSERT(MUTEX_HELD(&p->p_lock));
779 	return (p->p_zone->zone_shares);
780 }
781 
782 /*ARGSUSED*/
783 static int
784 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
785     rctl_qty_t nv)
786 {
787 	ASSERT(MUTEX_HELD(&p->p_lock));
788 	ASSERT(e->rcep_t == RCENTITY_ZONE);
789 	if (e->rcep_p.zone == NULL)
790 		return (0);
791 
792 	e->rcep_p.zone->zone_shares = nv;
793 	return (0);
794 }
795 
796 static rctl_ops_t zone_cpu_shares_ops = {
797 	rcop_no_action,
798 	zone_cpu_shares_usage,
799 	zone_cpu_shares_set,
800 	rcop_no_test
801 };
802 
803 /*ARGSUSED*/
804 static rctl_qty_t
805 zone_lwps_usage(rctl_t *r, proc_t *p)
806 {
807 	rctl_qty_t nlwps;
808 	zone_t *zone = p->p_zone;
809 
810 	ASSERT(MUTEX_HELD(&p->p_lock));
811 
812 	mutex_enter(&zone->zone_nlwps_lock);
813 	nlwps = zone->zone_nlwps;
814 	mutex_exit(&zone->zone_nlwps_lock);
815 
816 	return (nlwps);
817 }
818 
819 /*ARGSUSED*/
820 static int
821 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
822     rctl_qty_t incr, uint_t flags)
823 {
824 	rctl_qty_t nlwps;
825 
826 	ASSERT(MUTEX_HELD(&p->p_lock));
827 	ASSERT(e->rcep_t == RCENTITY_ZONE);
828 	if (e->rcep_p.zone == NULL)
829 		return (0);
830 	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
831 	nlwps = e->rcep_p.zone->zone_nlwps;
832 
833 	if (nlwps + incr > rcntl->rcv_value)
834 		return (1);
835 
836 	return (0);
837 }
838 
839 /*ARGSUSED*/
840 static int
841 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv) {
842 
843 	ASSERT(MUTEX_HELD(&p->p_lock));
844 	ASSERT(e->rcep_t == RCENTITY_ZONE);
845 	if (e->rcep_p.zone == NULL)
846 		return (0);
847 	e->rcep_p.zone->zone_nlwps_ctl = nv;
848 	return (0);
849 }
850 
851 static rctl_ops_t zone_lwps_ops = {
852 	rcop_no_action,
853 	zone_lwps_usage,
854 	zone_lwps_set,
855 	zone_lwps_test,
856 };
857 
858 /*
859  * Helper function to brand the zone with a unique ID.
860  */
861 static void
862 zone_uniqid(zone_t *zone)
863 {
864 	static uint64_t uniqid = 0;
865 
866 	ASSERT(MUTEX_HELD(&zonehash_lock));
867 	zone->zone_uniqid = uniqid++;
868 }
869 
870 /*
871  * Returns a held pointer to the "kcred" for the specified zone.
872  */
873 struct cred *
874 zone_get_kcred(zoneid_t zoneid)
875 {
876 	zone_t *zone;
877 	cred_t *cr;
878 
879 	if ((zone = zone_find_by_id(zoneid)) == NULL)
880 		return (NULL);
881 	cr = zone->zone_kcred;
882 	crhold(cr);
883 	zone_rele(zone);
884 	return (cr);
885 }
886 
887 /*
888  * Called very early on in boot to initialize the ZSD list so that
889  * zone_key_create() can be called before zone_init().  It also initializes
890  * portions of zone0 which may be used before zone_init() is called.  The
891  * variable "global_zone" will be set when zone0 is fully initialized by
892  * zone_init().
893  */
894 void
895 zone_zsd_init(void)
896 {
897 	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
898 	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
899 	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
900 	    offsetof(struct zsd_entry, zsd_linkage));
901 	list_create(&zone_active, sizeof (zone_t),
902 	    offsetof(zone_t, zone_linkage));
903 	list_create(&zone_deathrow, sizeof (zone_t),
904 	    offsetof(zone_t, zone_linkage));
905 
906 	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
907 	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
908 	zone0.zone_shares = 1;
909 	zone0.zone_nlwps_ctl = INT_MAX;
910 	zone0.zone_name = GLOBAL_ZONENAME;
911 	zone0.zone_nodename = utsname.nodename;
912 	zone0.zone_domain = srpc_domain;
913 	zone0.zone_ref = 1;
914 	zone0.zone_id = GLOBAL_ZONEID;
915 	zone0.zone_status = ZONE_IS_RUNNING;
916 	zone0.zone_rootpath = "/";
917 	zone0.zone_rootpathlen = 2;
918 	zone0.zone_psetid = ZONE_PS_INVAL;
919 	zone0.zone_ncpus = 0;
920 	zone0.zone_ncpus_online = 0;
921 	zone0.zone_proc_initpid = 1;
922 	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
923 	    offsetof(struct zsd_entry, zsd_linkage));
924 	list_insert_head(&zone_active, &zone0);
925 
926 	/*
927 	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
928 	 * to anything meaningful.  It is assigned to be 'rootdir' in
929 	 * vfs_mountroot().
930 	 */
931 	zone0.zone_rootvp = NULL;
932 	zone0.zone_vfslist = NULL;
933 	zone0.zone_bootargs = NULL;
934 	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
935 	/*
936 	 * The global zone has all privileges
937 	 */
938 	priv_fillset(zone0.zone_privset);
939 	/*
940 	 * Add p0 to the global zone
941 	 */
942 	zone0.zone_zsched = &p0;
943 	p0.p_zone = &zone0;
944 }
945 
/*
 * Called by main() to initialize the zones framework.
 *
 * Must run in the context of p0 (asserted).  Creates the zone ID space,
 * registers the zone resource controls, builds the rctl set for zone0,
 * creates the by-id and by-name hashes with zone0 inserted into both, and
 * finally publishes zone0 through "global_zone".
 */
void
zone_init(void)
{
	rctl_dict_entry_t *rde;
	rctl_val_t *dval;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	rctl_entity_p_t e;

	ASSERT(curproc == &p0);

	/*
	 * Create ID space for zone IDs.  ID 0 is reserved for the
	 * global zone.
	 */
	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);

	/*
	 * Initialize generic zone resource controls, if any.
	 */
	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
	    RCTL_GLOBAL_NOBASIC |
	    RCTL_GLOBAL_COUNT, FSS_MAXSHARES, FSS_MAXSHARES,
	    &zone_cpu_shares_ops);

	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
	    INT_MAX, INT_MAX, &zone_lwps_ops);
	/*
	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
	 */
	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
	bzero(dval, sizeof (rctl_val_t));
	dval->rcv_value = 1;
	dval->rcv_privilege = RCPRIV_PRIVILEGED;
	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
	dval->rcv_action_recip_pid = -1;

	rde = rctl_dict_lookup("zone.cpu-shares");
	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);

	/*
	 * Initialize the ``global zone''.  The rctl set is built with
	 * p0.p_lock held; the same critical section seeds zone0's LWP and
	 * task counts from p0.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&p0.p_lock);
	e.rcep_p.zone = &zone0;
	e.rcep_t = RCENTITY_ZONE;
	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
	    gp);

	zone0.zone_nlwps = p0.p_lwpcnt;
	zone0.zone_ntasks = 1;
	mutex_exit(&p0.p_lock);
	rctl_prealloc_destroy(gp);
	/*
	 * pool_default hasn't been initialized yet, so we let pool_init() take
	 * care of making sure the global zone is in the default pool.
	 */
	mutex_enter(&zonehash_lock);
	zone_uniqid(&zone0);
	/* zone0 is the first zone to be branded with a unique ID. */
	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
	mutex_exit(&zonehash_lock);
	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
	    mod_hash_null_valdtor);
	zonehashbyname = mod_hash_create_strhash("zone_by_name",
	    zone_hash_size, mod_hash_null_valdtor);
	/* zone0 is the only zone entered into the hashes at this point. */
	zonecount = 1;

	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
	    (mod_hash_val_t)&zone0);
	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
	    (mod_hash_val_t)&zone0);
	/*
	 * We avoid setting zone_kcred until now, since kcred is initialized
	 * sometime after zone_zsd_init() and before zone_init().
	 */
	zone0.zone_kcred = kcred;
	/*
	 * The global zone is fully initialized (except for zone_rootvp which
	 * will be set when the root filesystem is mounted).
	 */
	global_zone = &zone0;
}
1036 
1037 static void
1038 zone_free(zone_t *zone)
1039 {
1040 	ASSERT(zone != global_zone);
1041 	ASSERT(zone->zone_ntasks == 0);
1042 	ASSERT(zone->zone_nlwps == 0);
1043 	ASSERT(zone->zone_cred_ref == 0);
1044 	ASSERT(zone->zone_kcred == NULL);
1045 	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
1046 	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
1047 
1048 	/* remove from deathrow list */
1049 	if (zone_status_get(zone) == ZONE_IS_DEAD) {
1050 		ASSERT(zone->zone_ref == 0);
1051 		mutex_enter(&zone_deathrow_lock);
1052 		list_remove(&zone_deathrow, zone);
1053 		mutex_exit(&zone_deathrow_lock);
1054 	}
1055 
1056 	zone_free_zsd(zone);
1057 
1058 	if (zone->zone_rootvp != NULL)
1059 		VN_RELE(zone->zone_rootvp);
1060 	if (zone->zone_rootpath)
1061 		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
1062 	if (zone->zone_name != NULL)
1063 		kmem_free(zone->zone_name, ZONENAME_MAX);
1064 	if (zone->zone_nodename != NULL)
1065 		kmem_free(zone->zone_nodename, _SYS_NMLN);
1066 	if (zone->zone_domain != NULL)
1067 		kmem_free(zone->zone_domain, _SYS_NMLN);
1068 	if (zone->zone_privset != NULL)
1069 		kmem_free(zone->zone_privset, sizeof (priv_set_t));
1070 	if (zone->zone_rctls != NULL)
1071 		rctl_set_free(zone->zone_rctls);
1072 	if (zone->zone_bootargs != NULL)
1073 		kmem_free(zone->zone_bootargs, ZONEBOOTARGS_MAX);
1074 	id_free(zoneid_space, zone->zone_id);
1075 	mutex_destroy(&zone->zone_lock);
1076 	cv_destroy(&zone->zone_cv);
1077 	kmem_free(zone, sizeof (zone_t));
1078 }
1079 
1080 /*
1081  * See block comment at the top of this file for information about zone
1082  * status values.
1083  */
1084 /*
1085  * Convenience function for setting zone status.
1086  */
1087 static void
1088 zone_status_set(zone_t *zone, zone_status_t status)
1089 {
1090 	ASSERT(MUTEX_HELD(&zone_status_lock));
1091 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
1092 	    status >= zone_status_get(zone));
1093 	zone->zone_status = status;
1094 	cv_broadcast(&zone->zone_cv);
1095 }
1096 
1097 /*
1098  * Public function to retrieve the zone status.  The zone status may
1099  * change after it is retrieved.
1100  */
1101 zone_status_t
1102 zone_status_get(zone_t *zone)
1103 {
1104 	return (zone->zone_status);
1105 }
1106 
1107 static int
1108 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
1109 {
1110 	char *bootargs = kmem_zalloc(ZONEBOOTARGS_MAX, KM_SLEEP);
1111 	size_t len;
1112 	int err;
1113 
1114 	err = copyinstr(zone_bootargs, bootargs, ZONEBOOTARGS_MAX - 1, &len);
1115 	if (err != 0) {
1116 		kmem_free(bootargs, ZONEBOOTARGS_MAX);
1117 		return (err);	/* EFAULT or ENAMETOOLONG */
1118 	}
1119 	bootargs[len] = '\0';
1120 
1121 	ASSERT(zone->zone_bootargs == NULL);
1122 	zone->zone_bootargs = bootargs;
1123 	return (0);
1124 }
1125 
1126 /*
1127  * Block indefinitely waiting for (zone_status >= status)
1128  */
1129 void
1130 zone_status_wait(zone_t *zone, zone_status_t status)
1131 {
1132 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1133 
1134 	mutex_enter(&zone_status_lock);
1135 	while (zone->zone_status < status) {
1136 		cv_wait(&zone->zone_cv, &zone_status_lock);
1137 	}
1138 	mutex_exit(&zone_status_lock);
1139 }
1140 
1141 /*
1142  * Private CPR-safe version of zone_status_wait().
1143  */
1144 static void
1145 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
1146 {
1147 	callb_cpr_t cprinfo;
1148 
1149 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1150 
1151 	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
1152 	    str);
1153 	mutex_enter(&zone_status_lock);
1154 	while (zone->zone_status < status) {
1155 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1156 		cv_wait(&zone->zone_cv, &zone_status_lock);
1157 		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
1158 	}
1159 	/*
1160 	 * zone_status_lock is implicitly released by the following.
1161 	 */
1162 	CALLB_CPR_EXIT(&cprinfo);
1163 }
1164 
1165 /*
1166  * Block until zone enters requested state or signal is received.  Return (0)
1167  * if signaled, non-zero otherwise.
1168  */
1169 int
1170 zone_status_wait_sig(zone_t *zone, zone_status_t status)
1171 {
1172 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1173 
1174 	mutex_enter(&zone_status_lock);
1175 	while (zone->zone_status < status) {
1176 		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
1177 			mutex_exit(&zone_status_lock);
1178 			return (0);
1179 		}
1180 	}
1181 	mutex_exit(&zone_status_lock);
1182 	return (1);
1183 }
1184 
1185 /*
1186  * Block until the zone enters the requested state or the timeout expires,
1187  * whichever happens first.  Return (-1) if operation timed out, time remaining
1188  * otherwise.
1189  */
1190 clock_t
1191 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
1192 {
1193 	clock_t timeleft = 0;
1194 
1195 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1196 
1197 	mutex_enter(&zone_status_lock);
1198 	while (zone->zone_status < status && timeleft != -1) {
1199 		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
1200 	}
1201 	mutex_exit(&zone_status_lock);
1202 	return (timeleft);
1203 }
1204 
1205 /*
1206  * Block until the zone enters the requested state, the current process is
1207  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
1208  * operation timed out, 0 if signaled, time remaining otherwise.
1209  */
1210 clock_t
1211 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
1212 {
1213 	clock_t timeleft = tim - lbolt;
1214 
1215 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
1216 
1217 	mutex_enter(&zone_status_lock);
1218 	while (zone->zone_status < status) {
1219 		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
1220 		    tim);
1221 		if (timeleft <= 0)
1222 			break;
1223 	}
1224 	mutex_exit(&zone_status_lock);
1225 	return (timeleft);
1226 }
1227 
1228 /*
1229  * Zones have two reference counts: one for references from credential
1230  * structures (zone_cred_ref), and one (zone_ref) for everything else.
1231  * This is so we can allow a zone to be rebooted while there are still
1232  * outstanding cred references, since certain drivers cache dblks (which
1233  * implicitly results in cached creds).  We wait for zone_ref to drop to
1234  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
1235  * later freed when the zone_cred_ref drops to 0, though nothing other
1236  * than the zone id and privilege set should be accessed once the zone
1237  * is "dead".
1238  *
1239  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
1240  * to force halt/reboot to block waiting for the zone_cred_ref to drop
1241  * to 0.  This can be useful to flush out other sources of cached creds
1242  * that may be less innocuous than the driver case.
1243  */
1244 
int zone_wait_for_cred = 0;	/* debug flag; see the block comment above */
1246 
1247 static void
1248 zone_hold_locked(zone_t *z)
1249 {
1250 	ASSERT(MUTEX_HELD(&z->zone_lock));
1251 	z->zone_ref++;
1252 	ASSERT(z->zone_ref != 0);
1253 }
1254 
1255 void
1256 zone_hold(zone_t *z)
1257 {
1258 	mutex_enter(&z->zone_lock);
1259 	zone_hold_locked(z);
1260 	mutex_exit(&z->zone_lock);
1261 }
1262 
/*
 * A zone is ready to be destroyed once its non-cred reference count is down
 * to 1 and, if zone_wait_for_cred is set, its cred reference count has
 * dropped to 0 as well.  With zone_wait_for_cred clear the cred count is
 * ignored.
 */
#define	ZONE_IS_UNREF(zone)	\
	((zone)->zone_ref == 1 && \
	    (zone_wait_for_cred == 0 || (zone)->zone_cred_ref == 0))
1270 
1271 void
1272 zone_rele(zone_t *z)
1273 {
1274 	boolean_t wakeup;
1275 
1276 	mutex_enter(&z->zone_lock);
1277 	ASSERT(z->zone_ref != 0);
1278 	z->zone_ref--;
1279 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
1280 		/* no more refs, free the structure */
1281 		mutex_exit(&z->zone_lock);
1282 		zone_free(z);
1283 		return;
1284 	}
1285 	/* signal zone_destroy so the zone can finish halting */
1286 	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
1287 	mutex_exit(&z->zone_lock);
1288 
1289 	if (wakeup) {
1290 		/*
1291 		 * Grabbing zonehash_lock here effectively synchronizes with
1292 		 * zone_destroy() to avoid missed signals.
1293 		 */
1294 		mutex_enter(&zonehash_lock);
1295 		cv_broadcast(&zone_destroy_cv);
1296 		mutex_exit(&zonehash_lock);
1297 	}
1298 }
1299 
1300 void
1301 zone_cred_hold(zone_t *z)
1302 {
1303 	mutex_enter(&z->zone_lock);
1304 	z->zone_cred_ref++;
1305 	ASSERT(z->zone_cred_ref != 0);
1306 	mutex_exit(&z->zone_lock);
1307 }
1308 
1309 void
1310 zone_cred_rele(zone_t *z)
1311 {
1312 	boolean_t wakeup;
1313 
1314 	mutex_enter(&z->zone_lock);
1315 	ASSERT(z->zone_cred_ref != 0);
1316 	z->zone_cred_ref--;
1317 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
1318 		/* no more refs, free the structure */
1319 		mutex_exit(&z->zone_lock);
1320 		zone_free(z);
1321 		return;
1322 	}
1323 	/*
1324 	 * If zone_destroy is waiting for the cred references to drain
1325 	 * out, and they have, signal it.
1326 	 */
1327 	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
1328 	    zone_status_get(z) >= ZONE_IS_DEAD);
1329 	mutex_exit(&z->zone_lock);
1330 
1331 	if (wakeup) {
1332 		/*
1333 		 * Grabbing zonehash_lock here effectively synchronizes with
1334 		 * zone_destroy() to avoid missed signals.
1335 		 */
1336 		mutex_enter(&zonehash_lock);
1337 		cv_broadcast(&zone_destroy_cv);
1338 		mutex_exit(&zonehash_lock);
1339 	}
1340 }
1341 
1342 void
1343 zone_task_hold(zone_t *z)
1344 {
1345 	mutex_enter(&z->zone_lock);
1346 	z->zone_ntasks++;
1347 	ASSERT(z->zone_ntasks != 0);
1348 	mutex_exit(&z->zone_lock);
1349 }
1350 
1351 void
1352 zone_task_rele(zone_t *zone)
1353 {
1354 	uint_t refcnt;
1355 
1356 	mutex_enter(&zone->zone_lock);
1357 	ASSERT(zone->zone_ntasks != 0);
1358 	refcnt = --zone->zone_ntasks;
1359 	if (refcnt > 1)	{	/* Common case */
1360 		mutex_exit(&zone->zone_lock);
1361 		return;
1362 	}
1363 	zone_hold_locked(zone);	/* so we can use the zone_t later */
1364 	mutex_exit(&zone->zone_lock);
1365 	if (refcnt == 1) {
1366 		/*
1367 		 * See if the zone is shutting down.
1368 		 */
1369 		mutex_enter(&zone_status_lock);
1370 		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
1371 			goto out;
1372 		}
1373 
1374 		/*
1375 		 * Make sure the ntasks didn't change since we
1376 		 * dropped zone_lock.
1377 		 */
1378 		mutex_enter(&zone->zone_lock);
1379 		if (refcnt != zone->zone_ntasks) {
1380 			mutex_exit(&zone->zone_lock);
1381 			goto out;
1382 		}
1383 		mutex_exit(&zone->zone_lock);
1384 
1385 		/*
1386 		 * No more user processes in the zone.  The zone is empty.
1387 		 */
1388 		zone_status_set(zone, ZONE_IS_EMPTY);
1389 		goto out;
1390 	}
1391 
1392 	ASSERT(refcnt == 0);
1393 	/*
1394 	 * zsched has exited; the zone is dead.
1395 	 */
1396 	zone->zone_zsched = NULL;		/* paranoia */
1397 	mutex_enter(&zone_status_lock);
1398 	zone_status_set(zone, ZONE_IS_DEAD);
1399 out:
1400 	mutex_exit(&zone_status_lock);
1401 	zone_rele(zone);
1402 }
1403 
1404 zoneid_t
1405 getzoneid(void)
1406 {
1407 	return (curproc->p_zone->zone_id);
1408 }
1409 
1410 /*
1411  * Internal versions of zone_find_by_*().  These don't zone_hold() or
1412  * check the validity of a zone's state.
1413  */
1414 static zone_t *
1415 zone_find_all_by_id(zoneid_t zoneid)
1416 {
1417 	mod_hash_val_t hv;
1418 	zone_t *zone = NULL;
1419 
1420 	ASSERT(MUTEX_HELD(&zonehash_lock));
1421 
1422 	if (mod_hash_find(zonehashbyid,
1423 	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
1424 		zone = (zone_t *)hv;
1425 	return (zone);
1426 }
1427 
1428 static zone_t *
1429 zone_find_all_by_name(char *name)
1430 {
1431 	mod_hash_val_t hv;
1432 	zone_t *zone = NULL;
1433 
1434 	ASSERT(MUTEX_HELD(&zonehash_lock));
1435 
1436 	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
1437 		zone = (zone_t *)hv;
1438 	return (zone);
1439 }
1440 
1441 /*
1442  * Public interface for looking up a zone by zoneid.  Only returns the zone if
1443  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
1444  * Caller must call zone_rele() once it is done with the zone.
1445  *
1446  * The zone may begin the zone_destroy() sequence immediately after this
1447  * function returns, but may be safely used until zone_rele() is called.
1448  */
1449 zone_t *
1450 zone_find_by_id(zoneid_t zoneid)
1451 {
1452 	zone_t *zone;
1453 	zone_status_t status;
1454 
1455 	mutex_enter(&zonehash_lock);
1456 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
1457 		mutex_exit(&zonehash_lock);
1458 		return (NULL);
1459 	}
1460 	status = zone_status_get(zone);
1461 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
1462 		/*
1463 		 * For all practical purposes the zone doesn't exist.
1464 		 */
1465 		mutex_exit(&zonehash_lock);
1466 		return (NULL);
1467 	}
1468 	zone_hold(zone);
1469 	mutex_exit(&zonehash_lock);
1470 	return (zone);
1471 }
1472 
1473 /*
1474  * Similar to zone_find_by_id, but using zone name as the key.
1475  */
1476 zone_t *
1477 zone_find_by_name(char *name)
1478 {
1479 	zone_t *zone;
1480 	zone_status_t status;
1481 
1482 	mutex_enter(&zonehash_lock);
1483 	if ((zone = zone_find_all_by_name(name)) == NULL) {
1484 		mutex_exit(&zonehash_lock);
1485 		return (NULL);
1486 	}
1487 	status = zone_status_get(zone);
1488 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
1489 		/*
1490 		 * For all practical purposes the zone doesn't exist.
1491 		 */
1492 		mutex_exit(&zonehash_lock);
1493 		return (NULL);
1494 	}
1495 	zone_hold(zone);
1496 	mutex_exit(&zonehash_lock);
1497 	return (zone);
1498 }
1499 
1500 /*
1501  * Similar to zone_find_by_id(), using the path as a key.  For instance,
1502  * if there is a zone "foo" rooted at /foo/root, and the path argument
1503  * is "/foo/root/proc", it will return the held zone_t corresponding to
1504  * zone "foo".
1505  *
1506  * zone_find_by_path() always returns a non-NULL value, since at the
1507  * very least every path will be contained in the global zone.
1508  *
1509  * As with the other zone_find_by_*() functions, the caller is
1510  * responsible for zone_rele()ing the return value of this function.
1511  */
1512 zone_t *
1513 zone_find_by_path(const char *path)
1514 {
1515 	zone_t *zone;
1516 	zone_t *zret = NULL;
1517 	zone_status_t status;
1518 
1519 	if (path == NULL) {
1520 		/*
1521 		 * Call from rootconf().
1522 		 */
1523 		zone_hold(global_zone);
1524 		return (global_zone);
1525 	}
1526 	ASSERT(*path == '/');
1527 	mutex_enter(&zonehash_lock);
1528 	for (zone = list_head(&zone_active); zone != NULL;
1529 	    zone = list_next(&zone_active, zone)) {
1530 		if (ZONE_PATH_VISIBLE(path, zone))
1531 			zret = zone;
1532 	}
1533 	ASSERT(zret != NULL);
1534 	status = zone_status_get(zret);
1535 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
1536 		/*
1537 		 * Zone practically doesn't exist.
1538 		 */
1539 		zret = global_zone;
1540 	}
1541 	zone_hold(zret);
1542 	mutex_exit(&zonehash_lock);
1543 	return (zret);
1544 }
1545 
1546 /*
1547  * Get the number of cpus visible to this zone.  The system-wide global
1548  * 'ncpus' is returned if pools are disabled, the caller is in the
1549  * global zone, or a NULL zone argument is passed in.
1550  */
1551 int
1552 zone_ncpus_get(zone_t *zone)
1553 {
1554 	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
1555 
1556 	return (myncpus != 0 ? myncpus : ncpus);
1557 }
1558 
1559 /*
1560  * Get the number of online cpus visible to this zone.  The system-wide
1561  * global 'ncpus_online' is returned if pools are disabled, the caller
1562  * is in the global zone, or a NULL zone argument is passed in.
1563  */
1564 int
1565 zone_ncpus_online_get(zone_t *zone)
1566 {
1567 	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
1568 
1569 	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
1570 }
1571 
1572 /*
1573  * Return the pool to which the zone is currently bound.
1574  */
1575 pool_t *
1576 zone_pool_get(zone_t *zone)
1577 {
1578 	ASSERT(pool_lock_held());
1579 
1580 	return (zone->zone_pool);
1581 }
1582 
1583 /*
1584  * Set the zone's pool pointer and update the zone's visibility to match
1585  * the resources in the new pool.
1586  */
1587 void
1588 zone_pool_set(zone_t *zone, pool_t *pool)
1589 {
1590 	ASSERT(pool_lock_held());
1591 	ASSERT(MUTEX_HELD(&cpu_lock));
1592 
1593 	zone->zone_pool = pool;
1594 	zone_pset_set(zone, pool->pool_pset->pset_id);
1595 }
1596 
1597 /*
1598  * Return the cached value of the id of the processor set to which the
1599  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
1600  * facility is disabled.
1601  */
1602 psetid_t
1603 zone_pset_get(zone_t *zone)
1604 {
1605 	ASSERT(MUTEX_HELD(&cpu_lock));
1606 
1607 	return (zone->zone_psetid);
1608 }
1609 
1610 /*
1611  * Set the cached value of the id of the processor set to which the zone
1612  * is currently bound.  Also update the zone's visibility to match the
1613  * resources in the new processor set.
1614  */
1615 void
1616 zone_pset_set(zone_t *zone, psetid_t newpsetid)
1617 {
1618 	psetid_t oldpsetid;
1619 
1620 	ASSERT(MUTEX_HELD(&cpu_lock));
1621 	oldpsetid = zone_pset_get(zone);
1622 
1623 	if (oldpsetid == newpsetid)
1624 		return;
1625 	/*
1626 	 * Global zone sees all.
1627 	 */
1628 	if (zone != global_zone) {
1629 		zone->zone_psetid = newpsetid;
1630 		if (newpsetid != ZONE_PS_INVAL)
1631 			pool_pset_visibility_add(newpsetid, zone);
1632 		if (oldpsetid != ZONE_PS_INVAL)
1633 			pool_pset_visibility_remove(oldpsetid, zone);
1634 	}
1635 	/*
1636 	 * Disabling pools, so we should start using the global values
1637 	 * for ncpus and ncpus_online.
1638 	 */
1639 	if (newpsetid == ZONE_PS_INVAL) {
1640 		zone->zone_ncpus = 0;
1641 		zone->zone_ncpus_online = 0;
1642 	}
1643 }
1644 
1645 /*
1646  * Walk the list of active zones and issue the provided callback for
1647  * each of them.
1648  *
1649  * Caller must not be holding any locks that may be acquired under
1650  * zonehash_lock.  See comment at the beginning of the file for a list of
1651  * common locks and their interactions with zones.
1652  */
1653 int
1654 zone_walk(int (*cb)(zone_t *, void *), void *data)
1655 {
1656 	zone_t *zone;
1657 	int ret = 0;
1658 	zone_status_t status;
1659 
1660 	mutex_enter(&zonehash_lock);
1661 	for (zone = list_head(&zone_active); zone != NULL;
1662 	    zone = list_next(&zone_active, zone)) {
1663 		/*
1664 		 * Skip zones that shouldn't be externally visible.
1665 		 */
1666 		status = zone_status_get(zone);
1667 		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
1668 			continue;
1669 		/*
1670 		 * Bail immediately if any callback invocation returns a
1671 		 * non-zero value.
1672 		 */
1673 		ret = (*cb)(zone, data);
1674 		if (ret != 0)
1675 			break;
1676 	}
1677 	mutex_exit(&zonehash_lock);
1678 	return (ret);
1679 }
1680 
1681 static int
1682 zone_set_root(zone_t *zone, const char *upath)
1683 {
1684 	vnode_t *vp;
1685 	int trycount;
1686 	int error = 0;
1687 	char *path;
1688 	struct pathname upn, pn;
1689 	size_t pathlen;
1690 
1691 	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
1692 		return (error);
1693 
1694 	pn_alloc(&pn);
1695 
1696 	/* prevent infinite loop */
1697 	trycount = 10;
1698 	for (;;) {
1699 		if (--trycount <= 0) {
1700 			error = ESTALE;
1701 			goto out;
1702 		}
1703 
1704 		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
1705 			/*
1706 			 * VOP_ACCESS() may cover 'vp' with a new
1707 			 * filesystem, if 'vp' is an autoFS vnode.
1708 			 * Get the new 'vp' if so.
1709 			 */
1710 			if ((error = VOP_ACCESS(vp, VEXEC, 0, CRED())) == 0 &&
1711 			    (vp->v_vfsmountedhere == NULL ||
1712 			    (error = traverse(&vp)) == 0)) {
1713 				pathlen = pn.pn_pathlen + 2;
1714 				path = kmem_alloc(pathlen, KM_SLEEP);
1715 				(void) strncpy(path, pn.pn_path,
1716 				    pn.pn_pathlen + 1);
1717 				path[pathlen - 2] = '/';
1718 				path[pathlen - 1] = '\0';
1719 				pn_free(&pn);
1720 				pn_free(&upn);
1721 
1722 				/* Success! */
1723 				break;
1724 			}
1725 			VN_RELE(vp);
1726 		}
1727 		if (error != ESTALE)
1728 			goto out;
1729 	}
1730 
1731 	ASSERT(error == 0);
1732 	zone->zone_rootvp = vp;		/* we hold a reference to vp */
1733 	zone->zone_rootpath = path;
1734 	zone->zone_rootpathlen = pathlen;
1735 	return (0);
1736 
1737 out:
1738 	pn_free(&pn);
1739 	pn_free(&upn);
1740 	return (error);
1741 }
1742 
/*
 * True for the characters 'a'-'z', 'A'-'Z' and '0'-'9'.  The argument is
 * evaluated more than once, so it must not have side effects.
 */
#define	isalnum(c)	\
	(((c) >= 'a' && (c) <= 'z') || \
	((c) >= 'A' && (c) <= 'Z') || \
	((c) >= '0' && (c) <= '9'))
1746 
1747 static int
1748 zone_set_name(zone_t *zone, const char *uname)
1749 {
1750 	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
1751 	size_t len;
1752 	int i, err;
1753 
1754 	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
1755 		kmem_free(kname, ZONENAME_MAX);
1756 		return (err);	/* EFAULT or ENAMETOOLONG */
1757 	}
1758 
1759 	/* must be less than ZONENAME_MAX */
1760 	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
1761 		kmem_free(kname, ZONENAME_MAX);
1762 		return (EINVAL);
1763 	}
1764 
1765 	/*
1766 	 * Name must start with an alphanumeric and must contain only
1767 	 * alphanumerics, '-', '_' and '.'.
1768 	 */
1769 	if (!isalnum(kname[0])) {
1770 		kmem_free(kname, ZONENAME_MAX);
1771 		return (EINVAL);
1772 	}
1773 	for (i = 1; i < len - 1; i++) {
1774 		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
1775 		    kname[i] != '.') {
1776 			kmem_free(kname, ZONENAME_MAX);
1777 			return (EINVAL);
1778 		}
1779 	}
1780 
1781 	zone->zone_name = kname;
1782 	return (0);
1783 }
1784 
/*
 * Similar to thread_create(), but makes sure the thread is in the appropriate
 * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
 *
 * The new thread is created stopped, linked into the zone's circular list of
 * kernel threads (zone_kthreads), flagged TP_ZTHREAD, switched to the project
 * of zsched's task, and only then made runnable.  A zone reference is taken
 * on its behalf; zthread_exit() drops it.
 */
/*ARGSUSED*/
kthread_t *
zthread_create(
    caddr_t stk,
    size_t stksize,
    void (*proc)(),
    void *arg,
    size_t len,
    pri_t pri)
{
	kthread_t *t;
	zone_t *zone = curproc->p_zone;
	proc_t *pp = zone->zone_zsched;

	zone_hold(zone);	/* Reference to be dropped when thread exits */

	/*
	 * No-one should be trying to create threads if the zone is shutting
	 * down and there aren't any kernel threads around.  See comment
	 * in zthread_exit().
	 */
	ASSERT(!(zone->zone_kthreads == NULL &&
	    zone_status_get(zone) >= ZONE_IS_EMPTY));
	/*
	 * Create a thread, but don't let it run until we've finished setting
	 * things up.
	 */
	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
	ASSERT(t->t_forw == NULL);
	/*
	 * Link the thread into the zone's circular, doubly-linked list of
	 * kernel threads.  The list is manipulated under zone_status_lock.
	 */
	mutex_enter(&zone_status_lock);
	if (zone->zone_kthreads == NULL) {
		/* first thread: it is its own neighbour in both directions */
		t->t_forw = t->t_back = t;
	} else {
		kthread_t *tx = zone->zone_kthreads;

		/* insert t immediately before the current head tx */
		t->t_forw = tx;
		t->t_back = tx->t_back;
		tx->t_back->t_forw = t;
		tx->t_back = t;
	}
	/* the newest thread becomes the list head */
	zone->zone_kthreads = t;
	mutex_exit(&zone_status_lock);

	mutex_enter(&pp->p_lock);
	t->t_proc_flag |= TP_ZTHREAD;
	/* swap the thread's project hold for one on zsched's task project */
	project_rele(t->t_proj);
	t->t_proj = project_hold(pp->p_task->tk_proj);

	/*
	 * Setup complete, let it run.
	 */
	thread_lock(t);
	t->t_schedflag |= TS_ALLSTART;
	setrun_locked(t);
	thread_unlock(t);

	mutex_exit(&pp->p_lock);

	return (t);
}
1849 
/*
 * Similar to thread_exit().  Must be called by threads created via
 * zthread_create().
 *
 * Reparents the calling thread to p0, unlinks it from the zone's circular
 * list of kernel threads, drops the zone reference taken by
 * zthread_create(), and exits.  If this was the zone's last kernel thread
 * and the zone is EMPTY, the zone moves on to DOWN.
 */
void
zthread_exit(void)
{
	kthread_t *t = curthread;
	proc_t *pp = curproc;
	zone_t *zone = pp->p_zone;

	/* held across both the reparenting and the list update below */
	mutex_enter(&zone_status_lock);

	/*
	 * Reparent to p0
	 */
	mutex_enter(&pp->p_lock);
	t->t_proc_flag &= ~TP_ZTHREAD;
	t->t_procp = &p0;
	hat_thread_exit(t);
	mutex_exit(&pp->p_lock);

	if (t->t_back == t) {
		/* t is the only thread on the list */
		ASSERT(t->t_forw == t);
		/*
		 * If the zone is empty, once the thread count
		 * goes to zero no further kernel threads can be
		 * created.  This is because if the creator is a process
		 * in the zone, then it must have exited before the zone
		 * state could be set to ZONE_IS_EMPTY.
		 * Otherwise, if the creator is a kernel thread in the
		 * zone, the thread count is non-zero.
		 *
		 * This really means that non-zone kernel threads should
		 * not create zone kernel threads.
		 */
		zone->zone_kthreads = NULL;
		if (zone_status_get(zone) == ZONE_IS_EMPTY) {
			zone_status_set(zone, ZONE_IS_DOWN);
		}
	} else {
		/* unlink t; move the list head along if it pointed at t */
		t->t_forw->t_back = t->t_back;
		t->t_back->t_forw = t->t_forw;
		if (zone->zone_kthreads == t)
			zone->zone_kthreads = t->t_forw;
	}
	mutex_exit(&zone_status_lock);
	/* drops the reference taken in zthread_create() */
	zone_rele(zone);
	thread_exit();
	/* NOTREACHED */
}
1901 
1902 static void
1903 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
1904 {
1905 	vnode_t *oldvp;
1906 
1907 	/* we're going to hold a reference here to the directory */
1908 	VN_HOLD(vp);
1909 
1910 #ifdef C2_AUDIT
1911 	if (audit_active)	/* update abs cwd/root path see c2audit.c */
1912 		audit_chdirec(vp, vpp);
1913 #endif
1914 
1915 	mutex_enter(&pp->p_lock);
1916 	oldvp = *vpp;
1917 	*vpp = vp;
1918 	mutex_exit(&pp->p_lock);
1919 	if (oldvp != NULL)
1920 		VN_RELE(oldvp);
1921 }
1922 
1923 /*
1924  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
1925  */
1926 static int
1927 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
1928 {
1929 	nvpair_t *nvp = NULL;
1930 	boolean_t priv_set = B_FALSE;
1931 	boolean_t limit_set = B_FALSE;
1932 	boolean_t action_set = B_FALSE;
1933 
1934 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
1935 		const char *name;
1936 		uint64_t ui64;
1937 
1938 		name = nvpair_name(nvp);
1939 		if (nvpair_type(nvp) != DATA_TYPE_UINT64)
1940 			return (EINVAL);
1941 		(void) nvpair_value_uint64(nvp, &ui64);
1942 		if (strcmp(name, "privilege") == 0) {
1943 			/*
1944 			 * Currently only privileged values are allowed, but
1945 			 * this may change in the future.
1946 			 */
1947 			if (ui64 != RCPRIV_PRIVILEGED)
1948 				return (EINVAL);
1949 			rv->rcv_privilege = ui64;
1950 			priv_set = B_TRUE;
1951 		} else if (strcmp(name, "limit") == 0) {
1952 			rv->rcv_value = ui64;
1953 			limit_set = B_TRUE;
1954 		} else if (strcmp(name, "action") == 0) {
1955 			if (ui64 != RCTL_LOCAL_NOACTION &&
1956 			    ui64 != RCTL_LOCAL_DENY)
1957 				return (EINVAL);
1958 			rv->rcv_flagaction = ui64;
1959 			action_set = B_TRUE;
1960 		} else {
1961 			return (EINVAL);
1962 		}
1963 	}
1964 
1965 	if (!(priv_set && limit_set && action_set))
1966 		return (EINVAL);
1967 	rv->rcv_action_signal = 0;
1968 	rv->rcv_action_recipient = NULL;
1969 	rv->rcv_action_recip_pid = -1;
1970 	rv->rcv_firing_time = 0;
1971 
1972 	return (0);
1973 }
1974 
/*
 * Set up the current process as the zone's init and exec the init program.
 *
 * Records the process's pid as the zone's init pid, gives the process a
 * fresh 32-bit address space, applies the zone's default core file settings,
 * and calls exec_init() with zone_initname and the zone's boot arguments.
 * If the exec succeeds and the zone is still BOOTING, the zone becomes
 * RUNNING and the process returns to userland.  If the exec fails and the
 * zone is still BOOTING, the zone is moved to SHUTTING_DOWN; in either
 * failure case the process is disposed of.
 */
void
zone_icode(void)
{
	proc_t *p = ttoproc(curthread);
	struct core_globals	*cg;

	/*
	 * For all purposes (ZONE_ATTR_INITPID and restart_init),
	 * storing just the pid of init is sufficient.
	 */
	p->p_zone->zone_proc_initpid = p->p_pid;

	/*
	 * Allocate user address space and stack segment
	 */

	p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0;
	/* the process starts out with the 32-bit (ILP32) data model */
	p->p_usrstack = (caddr_t)USRSTACK32;
	p->p_model = DATAMODEL_ILP32;
	p->p_stkprot = PROT_ZFOD & ~PROT_EXEC;
	p->p_datprot = PROT_ZFOD & ~PROT_EXEC;
	p->p_stk_ctl = INT32_MAX;

	p->p_as = as_alloc();
	p->p_as->a_userlimit = (caddr_t)USERLIMIT32;
	(void) hat_setup(p->p_as->a_hat, HAT_INIT);

	/*
	 * Take the default core path and content from the zone's core
	 * globals; each is held before being stored in the process.
	 */
	cg = zone_getspecific(core_zone_key, p->p_zone);
	ASSERT(cg != NULL);
	corectl_path_hold(cg->core_default_path);
	corectl_content_hold(cg->core_default_content);
	p->p_corefile = cg->core_default_path;
	p->p_content = cg->core_default_content;

	init_mstate(curthread, LMS_SYSTEM);

	/* the result of the exec is recorded in the zone */
	p->p_zone->zone_boot_err = exec_init(zone_initname, 0,
	    p->p_zone->zone_bootargs);

	mutex_enter(&zone_status_lock);
	if (p->p_zone->zone_boot_err != 0) {
		/*
		 * Make sure we are still in the booting state-- we could have
		 * raced and already be shutting down, or even further along.
		 */
		if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING)
			zone_status_set(p->p_zone, ZONE_IS_SHUTTING_DOWN);
		mutex_exit(&zone_status_lock);
		/* It's gone bad, dispose of the process */
		if (proc_exit(CLD_EXITED, p->p_zone->zone_boot_err) != 0) {
			mutex_enter(&curproc->p_lock);
			ASSERT(curproc->p_flag & SEXITLWPS);
			lwp_exit();
		}
	} else {
		/* same race as above: only a BOOTING zone becomes RUNNING */
		if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING)
			zone_status_set(p->p_zone, ZONE_IS_RUNNING);
		mutex_exit(&zone_status_lock);
		/* cause the process to return to userland. */
		lwp_rtt();
	}
}
2037 
2038 struct zsched_arg {
2039 	zone_t *zone;
2040 	nvlist_t *nvlist;
2041 };
2042 
2043 /*
2044  * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
2045  * anything to do with scheduling, but rather with the fact that
2046  * per-zone kernel threads are parented to zsched, just like regular
2047  * kernel threads are parented to sched (p0).
2048  *
2049  * zsched is also responsible for launching init for the zone.
2050  */
2051 static void
2052 zsched(void *arg)
2053 {
2054 	struct zsched_arg *za = arg;
2055 	proc_t *pp = curproc;
2056 	proc_t *initp = proc_init;
2057 	zone_t *zone = za->zone;
2058 	cred_t *cr, *oldcred;
2059 	rctl_set_t *set;
2060 	rctl_alloc_gp_t *gp;
2061 	contract_t *ct = NULL;
2062 	task_t *tk, *oldtk;
2063 	rctl_entity_p_t e;
2064 	kproject_t *pj;
2065 
2066 	nvlist_t *nvl = za->nvlist;
2067 	nvpair_t *nvp = NULL;
2068 
2069 	bcopy("zsched", u.u_psargs, sizeof ("zsched"));
2070 	bcopy("zsched", u.u_comm, sizeof ("zsched"));
2071 	u.u_argc = 0;
2072 	u.u_argv = NULL;
2073 	u.u_envp = NULL;
2074 	closeall(P_FINFO(pp));
2075 
2076 	/*
2077 	 * We are this zone's "zsched" process.  As the zone isn't generally
2078 	 * visible yet we don't need to grab any locks before initializing its
2079 	 * zone_proc pointer.
2080 	 */
2081 	zone_hold(zone);  /* this hold is released by zone_destroy() */
2082 	zone->zone_zsched = pp;
2083 	mutex_enter(&pp->p_lock);
2084 	pp->p_zone = zone;
2085 	mutex_exit(&pp->p_lock);
2086 
2087 	/*
2088 	 * Disassociate process from its 'parent'; parent ourselves to init
2089 	 * (pid 1) and change other values as needed.
2090 	 */
2091 	sess_create();
2092 
2093 	mutex_enter(&pidlock);
2094 	proc_detach(pp);
2095 	pp->p_ppid = 1;
2096 	pp->p_flag |= SZONETOP;
2097 	pp->p_ancpid = 1;
2098 	pp->p_parent = initp;
2099 	pp->p_psibling = NULL;
2100 	if (initp->p_child)
2101 		initp->p_child->p_psibling = pp;
2102 	pp->p_sibling = initp->p_child;
2103 	initp->p_child = pp;
2104 
2105 	/* Decrement what newproc() incremented. */
2106 	upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
2107 	/*
2108 	 * Our credentials are about to become kcred-like, so we don't care
2109 	 * about the caller's ruid.
2110 	 */
2111 	upcount_inc(crgetruid(kcred), zone->zone_id);
2112 	mutex_exit(&pidlock);
2113 
2114 	/*
2115 	 * getting out of global zone, so decrement lwp counts
2116 	 */
2117 	pj = pp->p_task->tk_proj;
2118 	mutex_enter(&global_zone->zone_nlwps_lock);
2119 	pj->kpj_nlwps -= pp->p_lwpcnt;
2120 	global_zone->zone_nlwps -= pp->p_lwpcnt;
2121 	mutex_exit(&global_zone->zone_nlwps_lock);
2122 
2123 	/*
2124 	 * Create and join a new task in project '0' of this zone.
2125 	 *
2126 	 * We don't need to call holdlwps() since we know we're the only lwp in
2127 	 * this process.
2128 	 *
2129 	 * task_join() returns with p_lock held.
2130 	 */
2131 	tk = task_create(0, zone);
2132 	mutex_enter(&cpu_lock);
2133 	oldtk = task_join(tk, 0);
2134 	mutex_exit(&curproc->p_lock);
2135 	mutex_exit(&cpu_lock);
2136 	task_rele(oldtk);
2137 
2138 	/*
2139 	 * add lwp counts to zsched's zone, and increment project's task count
2140 	 * due to the task created in the above tasksys_settaskid
2141 	 */
2142 	pj = pp->p_task->tk_proj;
2143 	mutex_enter(&zone->zone_nlwps_lock);
2144 	pj->kpj_nlwps += pp->p_lwpcnt;
2145 	pj->kpj_ntasks += 1;
2146 	zone->zone_nlwps += pp->p_lwpcnt;
2147 	mutex_exit(&zone->zone_nlwps_lock);
2148 
2149 	/*
2150 	 * The process was created by a process in the global zone, hence the
2151 	 * credentials are wrong.  We might as well have kcred-ish credentials.
2152 	 */
2153 	cr = zone->zone_kcred;
2154 	crhold(cr);
2155 	mutex_enter(&pp->p_crlock);
2156 	oldcred = pp->p_cred;
2157 	pp->p_cred = cr;
2158 	mutex_exit(&pp->p_crlock);
2159 	crfree(oldcred);
2160 
2161 	/*
2162 	 * Hold credentials again (for thread)
2163 	 */
2164 	crhold(cr);
2165 
2166 	/*
2167 	 * p_lwpcnt can't change since this is a kernel process.
2168 	 */
2169 	crset(pp, cr);
2170 
2171 	/*
2172 	 * Chroot
2173 	 */
2174 	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
2175 	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);
2176 
2177 	/*
2178 	 * Initialize zone's rctl set.
2179 	 */
2180 	set = rctl_set_create();
2181 	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
2182 	mutex_enter(&pp->p_lock);
2183 	e.rcep_p.zone = zone;
2184 	e.rcep_t = RCENTITY_ZONE;
2185 	zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
2186 	mutex_exit(&pp->p_lock);
2187 	rctl_prealloc_destroy(gp);
2188 
2189 	/*
2190 	 * Apply the rctls passed in to zone_create().  This is basically a list
2191 	 * assignment: all of the old values are removed and the new ones
2192 	 * inserted.  That is, if an empty list is passed in, all values are
2193 	 * removed.
2194 	 */
2195 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
2196 		rctl_dict_entry_t *rde;
2197 		rctl_hndl_t hndl;
2198 		char *name;
2199 		nvlist_t **nvlarray;
2200 		uint_t i, nelem;
2201 		int error;	/* For ASSERT()s */
2202 
2203 		name = nvpair_name(nvp);
2204 		hndl = rctl_hndl_lookup(name);
2205 		ASSERT(hndl != -1);
2206 		rde = rctl_dict_lookup_hndl(hndl);
2207 		ASSERT(rde != NULL);
2208 
2209 		for (; /* ever */; ) {
2210 			rctl_val_t oval;
2211 
2212 			mutex_enter(&pp->p_lock);
2213 			error = rctl_local_get(hndl, NULL, &oval, pp);
2214 			mutex_exit(&pp->p_lock);
2215 			ASSERT(error == 0);	/* Can't fail for RCTL_FIRST */
2216 			ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
2217 			if (oval.rcv_privilege == RCPRIV_SYSTEM)
2218 				break;
2219 			mutex_enter(&pp->p_lock);
2220 			error = rctl_local_delete(hndl, &oval, pp);
2221 			mutex_exit(&pp->p_lock);
2222 			ASSERT(error == 0);
2223 		}
2224 		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
2225 		ASSERT(error == 0);
2226 		for (i = 0; i < nelem; i++) {
2227 			rctl_val_t *nvalp;
2228 
2229 			nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
2230 			error = nvlist2rctlval(nvlarray[i], nvalp);
2231 			ASSERT(error == 0);
2232 			/*
2233 			 * rctl_local_insert can fail if the value being
2234 			 * inserted is a duplicate; this is OK.
2235 			 */
2236 			mutex_enter(&pp->p_lock);
2237 			if (rctl_local_insert(hndl, nvalp, pp) != 0)
2238 				kmem_cache_free(rctl_val_cache, nvalp);
2239 			mutex_exit(&pp->p_lock);
2240 		}
2241 	}
2242 	/*
2243 	 * Tell the world that we're done setting up.
2244 	 *
2245 	 * At this point we want to set the zone status to ZONE_IS_READY
2246 	 * and atomically set the zone's processor set visibility.  Once
2247 	 * we drop pool_lock() this zone will automatically get updated
2248 	 * to reflect any future changes to the pools configuration.
2249 	 */
2250 	pool_lock();
2251 	mutex_enter(&cpu_lock);
2252 	mutex_enter(&zonehash_lock);
2253 	zone_uniqid(zone);
2254 	zone_zsd_configure(zone);
2255 	if (pool_state == POOL_ENABLED)
2256 		zone_pset_set(zone, pool_default->pool_pset->pset_id);
2257 	mutex_enter(&zone_status_lock);
2258 	ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
2259 	zone_status_set(zone, ZONE_IS_READY);
2260 	mutex_exit(&zone_status_lock);
2261 	mutex_exit(&zonehash_lock);
2262 	mutex_exit(&cpu_lock);
2263 	pool_unlock();
2264 
2265 	/*
2266 	 * Once we see the zone transition to the ZONE_IS_BOOTING state,
2267 	 * we launch init, and set the state to running.
2268 	 */
2269 	zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");
2270 
2271 	if (zone_status_get(zone) == ZONE_IS_BOOTING) {
2272 		id_t cid;
2273 
2274 		/*
2275 		 * Ok, this is a little complicated.  We need to grab the
2276 		 * zone's pool's scheduling class ID; note that by now, we
2277 		 * are already bound to a pool if we need to be (zoneadmd
2278 		 * will have done that to us while we're in the READY
2279 		 * state).  *But* the scheduling class for the zone's 'init'
2280 		 * must be explicitly passed to newproc, which doesn't
2281 		 * respect pool bindings.
2282 		 *
2283 		 * We hold the pool_lock across the call to newproc() to
2284 		 * close the obvious race: the pool's scheduling class
2285 		 * could change before we manage to create the LWP with
2286 		 * classid 'cid'.
2287 		 */
2288 		pool_lock();
2289 		cid = pool_get_class(zone->zone_pool);
2290 		if (cid == -1)
2291 			cid = defaultcid;
2292 
2293 		/*
2294 		 * If this fails, zone_boot will ultimately fail.  The
2295 		 * state of the zone will be set to SHUTTING_DOWN-- userland
2296 		 * will have to tear down the zone, and fail, or try again.
2297 		 */
2298 		if ((zone->zone_boot_err = newproc(zone_icode, NULL, cid,
2299 		    minclsyspri - 1, &ct)) != 0) {
2300 			mutex_enter(&zone_status_lock);
2301 			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
2302 			mutex_exit(&zone_status_lock);
2303 		}
2304 		pool_unlock();
2305 	}
2306 
2307 	/*
2308 	 * Wait for zone_destroy() to be called.  This is what we spend
2309 	 * most of our life doing.
2310 	 */
2311 	zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");
2312 
2313 	if (ct)
2314 		/*
2315 		 * At this point the process contract should be empty.
2316 		 * (Though if it isn't, it's not the end of the world.)
2317 		 */
2318 		VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);
2319 
2320 	/*
2321 	 * Allow kcred to be freed when all referring processes
2322 	 * (including this one) go away.  We can't just do this in
2323 	 * zone_free because we need to wait for the zone_cred_ref to
2324 	 * drop to 0 before calling zone_free, and the existence of
2325 	 * zone_kcred will prevent that.  Thus, we call crfree here to
2326 	 * balance the crdup in zone_create.  The crhold calls earlier
2327 	 * in zsched will be dropped when the thread and process exit.
2328 	 */
2329 	crfree(zone->zone_kcred);
2330 	zone->zone_kcred = NULL;
2331 
2332 	exit(CLD_EXITED, 0);
2333 }
2334 
2335 /*
2336  * Helper function to determine if there are any submounts of the
2337  * provided path.  Used to make sure the zone doesn't "inherit" any
2338  * mounts from before it is created.
2339  */
2340 static uint_t
2341 zone_mount_count(const char *rootpath)
2342 {
2343 	vfs_t *vfsp;
2344 	uint_t count = 0;
2345 	size_t rootpathlen = strlen(rootpath);
2346 
2347 	/*
2348 	 * Holding zonehash_lock prevents race conditions with
2349 	 * vfs_list_add()/vfs_list_remove() since we serialize with
2350 	 * zone_find_by_path().
2351 	 */
2352 	ASSERT(MUTEX_HELD(&zonehash_lock));
2353 	/*
2354 	 * The rootpath must end with a '/'
2355 	 */
2356 	ASSERT(rootpath[rootpathlen - 1] == '/');
2357 
2358 	/*
2359 	 * This intentionally does not count the rootpath itself if that
2360 	 * happens to be a mount point.
2361 	 */
2362 	vfs_list_read_lock();
2363 	vfsp = rootvfs;
2364 	do {
2365 		if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
2366 		    rootpathlen) == 0)
2367 			count++;
2368 		vfsp = vfsp->vfs_next;
2369 	} while (vfsp != rootvfs);
2370 	vfs_list_unlock();
2371 	return (count);
2372 }
2373 
2374 /*
2375  * Helper function to make sure that a zone created on 'rootpath'
2376  * wouldn't end up containing other zones' rootpaths.
2377  */
2378 static boolean_t
2379 zone_is_nested(const char *rootpath)
2380 {
2381 	zone_t *zone;
2382 	size_t rootpathlen = strlen(rootpath);
2383 	size_t len;
2384 
2385 	ASSERT(MUTEX_HELD(&zonehash_lock));
2386 
2387 	for (zone = list_head(&zone_active); zone != NULL;
2388 	    zone = list_next(&zone_active, zone)) {
2389 		if (zone == global_zone)
2390 			continue;
2391 		len = strlen(zone->zone_rootpath);
2392 		if (strncmp(rootpath, zone->zone_rootpath,
2393 		    MIN(rootpathlen, len)) == 0)
2394 			return (B_TRUE);
2395 	}
2396 	return (B_FALSE);
2397 }
2398 
2399 static int
2400 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs)
2401 {
2402 	priv_set_t *privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
2403 
2404 	if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
2405 		kmem_free(privs, sizeof (priv_set_t));
2406 		return (EFAULT);
2407 	}
2408 
2409 	zone->zone_privset = privs;
2410 	return (0);
2411 }
2412 
2413 /*
2414  * We make creative use of nvlists to pass in rctls from userland.  The list is
2415  * a list of the following structures:
2416  *
2417  * (name = rctl_name, value = nvpair_list_array)
2418  *
2419  * Where each element of the nvpair_list_array is of the form:
2420  *
2421  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
2422  * 	(name = "limit", value = uint64_t),
2423  * 	(name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
2424  */
2425 static int
2426 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
2427 {
2428 	nvpair_t *nvp = NULL;
2429 	nvlist_t *nvl = NULL;
2430 	char *kbuf;
2431 	int error;
2432 	rctl_val_t rv;
2433 
2434 	*nvlp = NULL;
2435 
2436 	if (buflen == 0)
2437 		return (0);
2438 
2439 	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
2440 		return (ENOMEM);
2441 	if (copyin(ubuf, kbuf, buflen)) {
2442 		error = EFAULT;
2443 		goto out;
2444 	}
2445 	if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
2446 		/*
2447 		 * nvl may have been allocated/free'd, but the value set to
2448 		 * non-NULL, so we reset it here.
2449 		 */
2450 		nvl = NULL;
2451 		error = EINVAL;
2452 		goto out;
2453 	}
2454 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
2455 		rctl_dict_entry_t *rde;
2456 		rctl_hndl_t hndl;
2457 		nvlist_t **nvlarray;
2458 		uint_t i, nelem;
2459 		char *name;
2460 
2461 		error = EINVAL;
2462 		name = nvpair_name(nvp);
2463 		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
2464 		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
2465 			goto out;
2466 		}
2467 		if ((hndl = rctl_hndl_lookup(name)) == -1) {
2468 			goto out;
2469 		}
2470 		rde = rctl_dict_lookup_hndl(hndl);
2471 		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
2472 		ASSERT(error == 0);
2473 		for (i = 0; i < nelem; i++) {
2474 			if (error = nvlist2rctlval(nvlarray[i], &rv))
2475 				goto out;
2476 		}
2477 		if (rctl_invalid_value(rde, &rv)) {
2478 			error = EINVAL;
2479 			goto out;
2480 		}
2481 	}
2482 	error = 0;
2483 	*nvlp = nvl;
2484 out:
2485 	kmem_free(kbuf, buflen);
2486 	if (error && nvl != NULL)
2487 		nvlist_free(nvl);
2488 	return (error);
2489 }
2490 
2491 int
2492 zone_create_error(int er_error, int er_ext, int *er_out) {
2493 	if (er_out != NULL) {
2494 		if (copyout(&er_ext, er_out, sizeof (int))) {
2495 			return (set_errno(EFAULT));
2496 		}
2497 	}
2498 	return (set_errno(er_error));
2499 }
2500 
2501 /*
2502  * System call to create/initialize a new zone named 'zone_name', rooted
2503  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
2504  * and initialized with the zone-wide rctls described in 'rctlbuf'.
2505  *
2506  * If extended error is non-null, we may use it to return more detailed
2507  * error information.
2508  */
2509 static zoneid_t
2510 zone_create(const char *zone_name, const char *zone_root,
2511     const priv_set_t *zone_privs, caddr_t rctlbuf, size_t rctlbufsz,
2512     int *extended_error)
2513 {
2514 	struct zsched_arg zarg;
2515 	nvlist_t *rctls = NULL;
2516 	proc_t *pp = curproc;
2517 	zone_t *zone, *ztmp;
2518 	zoneid_t zoneid;
2519 	int error;
2520 	int error2 = 0;
2521 	char *str;
2522 	cred_t *zkcr;
2523 
2524 	if (secpolicy_zone_config(CRED()) != 0)
2525 		return (set_errno(EPERM));
2526 
2527 	/* can't boot zone from within chroot environment */
2528 	if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
2529 		return (zone_create_error(ENOTSUP, ZE_CHROOTED,
2530 			extended_error));
2531 
2532 	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
2533 	zoneid = zone->zone_id = id_alloc(zoneid_space);
2534 	zone->zone_status = ZONE_IS_UNINITIALIZED;
2535 	zone->zone_pool = pool_default;
2536 	zone->zone_pool_mod = gethrtime();
2537 	zone->zone_psetid = ZONE_PS_INVAL;
2538 	zone->zone_ncpus = 0;
2539 	zone->zone_ncpus_online = 0;
2540 	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
2541 	mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
2542 	cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
2543 	list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
2544 	    offsetof(struct zsd_entry, zsd_linkage));
2545 
2546 	if ((error = zone_set_name(zone, zone_name)) != 0) {
2547 		zone_free(zone);
2548 		return (zone_create_error(error, 0, extended_error));
2549 	}
2550 
2551 	if ((error = zone_set_root(zone, zone_root)) != 0) {
2552 		zone_free(zone);
2553 		return (zone_create_error(error, 0, extended_error));
2554 	}
2555 	if ((error = zone_set_privset(zone, zone_privs)) != 0) {
2556 		zone_free(zone);
2557 		return (zone_create_error(error, 0, extended_error));
2558 	}
2559 
2560 	/* initialize node name to be the same as zone name */
2561 	zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
2562 	(void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
2563 	zone->zone_nodename[_SYS_NMLN - 1] = '\0';
2564 
2565 	zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
2566 	zone->zone_domain[0] = '\0';
2567 	zone->zone_shares = 1;
2568 	zone->zone_bootargs = NULL;
2569 
2570 	/*
2571 	 * Zsched initializes the rctls.
2572 	 */
2573 	zone->zone_rctls = NULL;
2574 
2575 	if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
2576 		zone_free(zone);
2577 		return (zone_create_error(error, 0, extended_error));
2578 	}
2579 
2580 	/*
2581 	 * Stop all lwps since that's what normally happens as part of fork().
2582 	 * This needs to happen before we grab any locks to avoid deadlock
2583 	 * (another lwp in the process could be waiting for the held lock).
2584 	 */
2585 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
2586 		zone_free(zone);
2587 		if (rctls)
2588 			nvlist_free(rctls);
2589 		return (zone_create_error(error, 0, extended_error));
2590 	}
2591 
2592 	if (block_mounts() == 0) {
2593 		mutex_enter(&pp->p_lock);
2594 		if (curthread != pp->p_agenttp)
2595 			continuelwps(pp);
2596 		mutex_exit(&pp->p_lock);
2597 		zone_free(zone);
2598 		if (rctls)
2599 			nvlist_free(rctls);
2600 		return (zone_create_error(error, 0, extended_error));
2601 	}
2602 
2603 	/*
2604 	 * Set up credential for kernel access.  After this, any errors
2605 	 * should go through the dance in errout rather than calling
2606 	 * zone_free directly.
2607 	 */
2608 	zone->zone_kcred = crdup(kcred);
2609 	crsetzone(zone->zone_kcred, zone);
2610 	priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
2611 	priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
2612 	priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
2613 	priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
2614 
2615 	mutex_enter(&zonehash_lock);
2616 	/*
2617 	 * Make sure zone doesn't already exist.
2618 	 */
2619 	if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL) {
2620 		zone_status_t status;
2621 
2622 		status = zone_status_get(ztmp);
2623 		if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
2624 			error = EEXIST;
2625 		else
2626 			error = EBUSY;
2627 		goto errout;
2628 	}
2629 
2630 	/*
2631 	 * Don't allow zone creations which would cause one zone's rootpath to
2632 	 * be accessible from that of another (non-global) zone.
2633 	 */
2634 	if (zone_is_nested(zone->zone_rootpath)) {
2635 		error = EBUSY;
2636 		goto errout;
2637 	}
2638 
2639 	ASSERT(zonecount != 0);		/* check for leaks */
2640 	if (zonecount + 1 > maxzones) {
2641 		error = ENOMEM;
2642 		goto errout;
2643 	}
2644 
2645 	if (zone_mount_count(zone->zone_rootpath) != 0) {
2646 		error = EBUSY;
2647 		error2 = ZE_AREMOUNTS;
2648 		goto errout;
2649 	}
2650 
2651 	/*
2652 	 * Zone is still incomplete, but we need to drop all locks while
2653 	 * zsched() initializes this zone's kernel process.  We
2654 	 * optimistically add the zone to the hashtable and associated
2655 	 * lists so a parallel zone_create() doesn't try to create the
2656 	 * same zone.
2657 	 */
2658 	zonecount++;
2659 	(void) mod_hash_insert(zonehashbyid,
2660 	    (mod_hash_key_t)(uintptr_t)zone->zone_id,
2661 	    (mod_hash_val_t)(uintptr_t)zone);
2662 	str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
2663 	(void) strcpy(str, zone->zone_name);
2664 	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
2665 	    (mod_hash_val_t)(uintptr_t)zone);
2666 	/*
2667 	 * Insert into active list.  At this point there are no 'hold's
2668 	 * on the zone, but everyone else knows not to use it, so we can
2669 	 * continue to use it.  zsched() will do a zone_hold() if the
2670 	 * newproc() is successful.
2671 	 */
2672 	list_insert_tail(&zone_active, zone);
2673 	mutex_exit(&zonehash_lock);
2674 
2675 	zarg.zone = zone;
2676 	zarg.nvlist = rctls;
2677 	/*
2678 	 * The process, task, and project rctls are probably wrong;
2679 	 * we need an interface to get the default values of all rctls,
2680 	 * and initialize zsched appropriately.  I'm not sure that that
2681 	 * makes much of a difference, though.
2682 	 */
2683 	if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) {
2684 		/*
2685 		 * We need to undo all globally visible state.
2686 		 */
2687 		mutex_enter(&zonehash_lock);
2688 		list_remove(&zone_active, zone);
2689 		(void) mod_hash_destroy(zonehashbyname,
2690 		    (mod_hash_key_t)(uintptr_t)zone->zone_name);
2691 		(void) mod_hash_destroy(zonehashbyid,
2692 		    (mod_hash_key_t)(uintptr_t)zone->zone_id);
2693 		ASSERT(zonecount > 1);
2694 		zonecount--;
2695 		goto errout;
2696 	}
2697 
2698 	/*
2699 	 * Zone creation can't fail from now on.
2700 	 */
2701 
2702 	/*
2703 	 * Let the other lwps continue.
2704 	 */
2705 	mutex_enter(&pp->p_lock);
2706 	if (curthread != pp->p_agenttp)
2707 		continuelwps(pp);
2708 	mutex_exit(&pp->p_lock);
2709 
2710 	/*
2711 	 * Wait for zsched to finish initializing the zone.
2712 	 */
2713 	zone_status_wait(zone, ZONE_IS_READY);
2714 	/*
2715 	 * The zone is fully visible, so we can let mounts progress.
2716 	 */
2717 	resume_mounts();
2718 	if (rctls)
2719 		nvlist_free(rctls);
2720 
2721 	return (zoneid);
2722 
2723 errout:
2724 	mutex_exit(&zonehash_lock);
2725 	/*
2726 	 * Let the other lwps continue.
2727 	 */
2728 	mutex_enter(&pp->p_lock);
2729 	if (curthread != pp->p_agenttp)
2730 		continuelwps(pp);
2731 	mutex_exit(&pp->p_lock);
2732 
2733 	resume_mounts();
2734 	if (rctls)
2735 		nvlist_free(rctls);
2736 	/*
2737 	 * There is currently one reference to the zone, a cred_ref from
2738 	 * zone_kcred.  To free the zone, we call crfree, which will call
2739 	 * zone_cred_rele, which will call zone_free.
2740 	 */
2741 	ASSERT(zone->zone_cred_ref == 1);	/* for zone_kcred */
2742 	ASSERT(zone->zone_kcred->cr_ref == 1);
2743 	ASSERT(zone->zone_ref == 0);
2744 	zkcr = zone->zone_kcred;
2745 	zone->zone_kcred = NULL;
2746 	crfree(zkcr);				/* triggers call to zone_free */
2747 	return (zone_create_error(error, error2, extended_error));
2748 }
2749 
2750 /*
2751  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
2752  * the heavy lifting.
2753  */
2754 static int
2755 zone_boot(zoneid_t zoneid, const char *bootargs)
2756 {
2757 	int err;
2758 	zone_t *zone;
2759 
2760 	if (secpolicy_zone_config(CRED()) != 0)
2761 		return (set_errno(EPERM));
2762 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
2763 		return (set_errno(EINVAL));
2764 
2765 	mutex_enter(&zonehash_lock);
2766 	/*
2767 	 * Look for zone under hash lock to prevent races with calls to
2768 	 * zone_shutdown, zone_destroy, etc.
2769 	 */
2770 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2771 		mutex_exit(&zonehash_lock);
2772 		return (set_errno(EINVAL));
2773 	}
2774 
2775 	if ((err = zone_set_bootargs(zone, bootargs)) != 0) {
2776 		mutex_exit(&zonehash_lock);
2777 		return (set_errno(err));
2778 	}
2779 
2780 	mutex_enter(&zone_status_lock);
2781 	if (zone_status_get(zone) != ZONE_IS_READY) {
2782 		mutex_exit(&zone_status_lock);
2783 		mutex_exit(&zonehash_lock);
2784 		return (set_errno(EINVAL));
2785 	}
2786 	zone_status_set(zone, ZONE_IS_BOOTING);
2787 	mutex_exit(&zone_status_lock);
2788 
2789 	zone_hold(zone);	/* so we can use the zone_t later */
2790 	mutex_exit(&zonehash_lock);
2791 
2792 	if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
2793 		zone_rele(zone);
2794 		return (set_errno(EINTR));
2795 	}
2796 
2797 	/*
2798 	 * Boot (starting init) might have failed, in which case the zone
2799 	 * will go to the SHUTTING_DOWN state; an appropriate errno will
2800 	 * be placed in zone->zone_boot_err, and so we return that.
2801 	 */
2802 	err = zone->zone_boot_err;
2803 	zone_rele(zone);
2804 	return (err ? set_errno(err) : 0);
2805 }
2806 
2807 /*
2808  * Kills all user processes in the zone, waiting for them all to exit
2809  * before returning.
2810  */
2811 static int
2812 zone_empty(zone_t *zone)
2813 {
2814 	int waitstatus;
2815 
2816 	/*
2817 	 * We need to drop zonehash_lock before killing all
2818 	 * processes, otherwise we'll deadlock with zone_find_*
2819 	 * which can be called from the exit path.
2820 	 */
2821 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
2822 	while ((waitstatus = zone_status_timedwait_sig(zone, lbolt + hz,
2823 	    ZONE_IS_EMPTY)) == -1) {
2824 		killall(zone->zone_id);
2825 	}
2826 	/*
2827 	 * return EINTR if we were signaled
2828 	 */
2829 	if (waitstatus == 0)
2830 		return (EINTR);
2831 	return (0);
2832 }
2833 
2834 /*
2835  * Systemcall to start the zone's halt sequence.  By the time this
2836  * function successfully returns, all user processes and kernel threads
2837  * executing in it will have exited, ZSD shutdown callbacks executed,
2838  * and the zone status set to ZONE_IS_DOWN.
2839  *
2840  * It is possible that the call will interrupt itself if the caller is the
2841  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
2842  */
2843 static int
2844 zone_shutdown(zoneid_t zoneid)
2845 {
2846 	int error;
2847 	zone_t *zone;
2848 	zone_status_t status;
2849 
2850 	if (secpolicy_zone_config(CRED()) != 0)
2851 		return (set_errno(EPERM));
2852 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
2853 		return (set_errno(EINVAL));
2854 
2855 	/*
2856 	 * Block mounts so that VFS_MOUNT() can get an accurate view of
2857 	 * the zone's status with regards to ZONE_IS_SHUTTING down.
2858 	 *
2859 	 * e.g. NFS can fail the mount if it determines that the zone
2860 	 * has already begun the shutdown sequence.
2861 	 */
2862 	if (block_mounts() == 0)
2863 		return (set_errno(EINTR));
2864 	mutex_enter(&zonehash_lock);
2865 	/*
2866 	 * Look for zone under hash lock to prevent races with other
2867 	 * calls to zone_shutdown and zone_destroy.
2868 	 */
2869 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
2870 		mutex_exit(&zonehash_lock);
2871 		resume_mounts();
2872 		return (set_errno(EINVAL));
2873 	}
2874 	mutex_enter(&zone_status_lock);
2875 	status = zone_status_get(zone);
2876 	/*
2877 	 * Fail if the zone isn't fully initialized yet.
2878 	 */
2879 	if (status < ZONE_IS_READY) {
2880 		mutex_exit(&zone_status_lock);
2881 		mutex_exit(&zonehash_lock);
2882 		resume_mounts();
2883 		return (set_errno(EINVAL));
2884 	}
2885 	/*
2886 	 * If conditions required for zone_shutdown() to return have been met,
2887 	 * return success.
2888 	 */
2889 	if (status >= ZONE_IS_DOWN) {
2890 		mutex_exit(&zone_status_lock);
2891 		mutex_exit(&zonehash_lock);
2892 		resume_mounts();
2893 		return (0);
2894 	}
2895 	/*
2896 	 * If zone_shutdown() hasn't been called before, go through the motions.
2897 	 * If it has, there's nothing to do but wait for the kernel threads to
2898 	 * drain.
2899 	 */
2900 	if (status < ZONE_IS_EMPTY) {
2901 		uint_t ntasks;
2902 
2903 		mutex_enter(&zone->zone_lock);
2904 		if ((ntasks = zone->zone_ntasks) != 1) {
2905 			/*
2906 			 * There's still stuff running.
2907 			 */
2908 			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
2909 		}
2910 		mutex_exit(&zone->zone_lock);
2911 		if (ntasks == 1) {
2912 			/*
2913 			 * The only way to create another task is through
2914 			 * zone_enter(), which will block until we drop
2915 			 * zonehash_lock.  The zone is empty.
2916 			 */
2917 			if (zone->zone_kthreads == NULL) {
2918 				/*
2919 				 * Skip ahead to ZONE_IS_DOWN
2920 				 */
2921 				zone_status_set(zone, ZONE_IS_DOWN);
2922 			} else {
2923 				zone_status_set(zone, ZONE_IS_EMPTY);
2924 			}
2925 		}
2926 	}
2927 	zone_hold(zone);	/* so we can use the zone_t later */
2928 	mutex_exit(&zone_status_lock);
2929 	mutex_exit(&zonehash_lock);
2930 	resume_mounts();
2931 
2932 	if (error = zone_empty(zone)) {
2933 		zone_rele(zone);
2934 		return (set_errno(error));
2935 	}
2936 	/*
2937 	 * After the zone status goes to ZONE_IS_DOWN this zone will no
2938 	 * longer be notified of changes to the pools configuration, so
2939 	 * in order to not end up with a stale pool pointer, we point
2940 	 * ourselves at the default pool and remove all resource
2941 	 * visibility.  This is especially important as the zone_t may
2942 	 * languish on the deathrow for a very long time waiting for
2943 	 * cred's to drain out.
2944 	 *
2945 	 * This rebinding of the zone can happen multiple times
2946 	 * (presumably due to interrupted or parallel systemcalls)
2947 	 * without any adverse effects.
2948 	 */
2949 	if (pool_lock_intr() != 0) {
2950 		zone_rele(zone);
2951 		return (set_errno(EINTR));
2952 	}
2953 	if (pool_state == POOL_ENABLED) {
2954 		mutex_enter(&cpu_lock);
2955 		zone_pool_set(zone, pool_default);
2956 		/*
2957 		 * The zone no longer needs to be able to see any cpus.
2958 		 */
2959 		zone_pset_set(zone, ZONE_PS_INVAL);
2960 		mutex_exit(&cpu_lock);
2961 	}
2962 	pool_unlock();
2963 
2964 	/*
2965 	 * ZSD shutdown callbacks can be executed multiple times, hence
2966 	 * it is safe to not be holding any locks across this call.
2967 	 */
2968 	zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
2969 
2970 	mutex_enter(&zone_status_lock);
2971 	if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
2972 		zone_status_set(zone, ZONE_IS_DOWN);
2973 	mutex_exit(&zone_status_lock);
2974 
2975 	/*
2976 	 * Wait for kernel threads to drain.
2977 	 */
2978 	if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
2979 		zone_rele(zone);
2980 		return (set_errno(EINTR));
2981 	}
2982 	zone_rele(zone);
2983 	return (0);
2984 }
2985 
2986 /*
2987  * Systemcall entry point to finalize the zone halt process.  The caller
2988  * must have already successfully called zone_shutdown().
2989  *
2990  * Upon successful completion, the zone will have been fully destroyed:
2991  * zsched will have exited, destructor callbacks executed, and the zone
2992  * removed from the list of active zones.
2993  */
2994 static int
2995 zone_destroy(zoneid_t zoneid)
2996 {
2997 	uint64_t uniqid;
2998 	zone_t *zone;
2999 	zone_status_t status;
3000 
3001 	if (secpolicy_zone_config(CRED()) != 0)
3002 		return (set_errno(EPERM));
3003 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3004 		return (set_errno(EINVAL));
3005 
3006 	mutex_enter(&zonehash_lock);
3007 	/*
3008 	 * Look for zone under hash lock to prevent races with other
3009 	 * calls to zone_destroy.
3010 	 */
3011 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3012 		mutex_exit(&zonehash_lock);
3013 		return (set_errno(EINVAL));
3014 	}
3015 
3016 	if (zone_mount_count(zone->zone_rootpath) != 0) {
3017 		mutex_exit(&zonehash_lock);
3018 		return (set_errno(EBUSY));
3019 	}
3020 	mutex_enter(&zone_status_lock);
3021 	status = zone_status_get(zone);
3022 	if (status < ZONE_IS_DOWN) {
3023 		mutex_exit(&zone_status_lock);
3024 		mutex_exit(&zonehash_lock);
3025 		return (set_errno(EBUSY));
3026 	} else if (status == ZONE_IS_DOWN) {
3027 		zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
3028 	}
3029 	mutex_exit(&zone_status_lock);
3030 	zone_hold(zone);
3031 	mutex_exit(&zonehash_lock);
3032 
3033 	/*
3034 	 * wait for zsched to exit
3035 	 */
3036 	zone_status_wait(zone, ZONE_IS_DEAD);
3037 	zone_zsd_callbacks(zone, ZSD_DESTROY);
3038 	uniqid = zone->zone_uniqid;
3039 	zone_rele(zone);
3040 	zone = NULL;	/* potentially free'd */
3041 
3042 	mutex_enter(&zonehash_lock);
3043 	for (; /* ever */; ) {
3044 		boolean_t unref;
3045 
3046 		if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
3047 		    zone->zone_uniqid != uniqid) {
3048 			/*
3049 			 * The zone has gone away.  Necessary conditions
3050 			 * are met, so we return success.
3051 			 */
3052 			mutex_exit(&zonehash_lock);
3053 			return (0);
3054 		}
3055 		mutex_enter(&zone->zone_lock);
3056 		unref = ZONE_IS_UNREF(zone);
3057 		mutex_exit(&zone->zone_lock);
3058 		if (unref) {
3059 			/*
3060 			 * There is only one reference to the zone -- that
3061 			 * added when the zone was added to the hashtables --
3062 			 * and things will remain this way until we drop
3063 			 * zonehash_lock... we can go ahead and cleanup the
3064 			 * zone.
3065 			 */
3066 			break;
3067 		}
3068 
3069 		if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
3070 			/* Signaled */
3071 			mutex_exit(&zonehash_lock);
3072 			return (set_errno(EINTR));
3073 		}
3074 
3075 	}
3076 
3077 	/*
3078 	 * It is now safe to let the zone be recreated; remove it from the
3079 	 * lists.  The memory will not be freed until the last cred
3080 	 * reference goes away.
3081 	 */
3082 	ASSERT(zonecount > 1);	/* must be > 1; can't destroy global zone */
3083 	zonecount--;
3084 	/* remove from active list and hash tables */
3085 	list_remove(&zone_active, zone);
3086 	(void) mod_hash_destroy(zonehashbyname,
3087 	    (mod_hash_key_t)zone->zone_name);
3088 	(void) mod_hash_destroy(zonehashbyid,
3089 	    (mod_hash_key_t)(uintptr_t)zone->zone_id);
3090 	mutex_exit(&zonehash_lock);
3091 
3092 	/* add to deathrow list */
3093 	mutex_enter(&zone_deathrow_lock);
3094 	list_insert_tail(&zone_deathrow, zone);
3095 	mutex_exit(&zone_deathrow_lock);
3096 
3097 	/*
3098 	 * Drop last reference (which was added by zsched()), this will
3099 	 * free the zone unless there are outstanding cred references.
3100 	 */
3101 	zone_rele(zone);
3102 	return (0);
3103 }
3104 
3105 /*
3106  * Systemcall entry point for zone_getattr(2).
3107  */
3108 static ssize_t
3109 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
3110 {
3111 	size_t size;
3112 	int error = 0, err;
3113 	zone_t *zone;
3114 	char *zonepath;
3115 	zone_status_t zone_status;
3116 	pid_t initpid;
3117 	boolean_t global = (curproc->p_zone == global_zone);
3118 
3119 	mutex_enter(&zonehash_lock);
3120 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
3121 		mutex_exit(&zonehash_lock);
3122 		return (set_errno(EINVAL));
3123 	}
3124 	zone_status = zone_status_get(zone);
3125 	if (zone_status < ZONE_IS_READY) {
3126 		mutex_exit(&zonehash_lock);
3127 		return (set_errno(EINVAL));
3128 	}
3129 	zone_hold(zone);
3130 	mutex_exit(&zonehash_lock);
3131 
3132 	/*
3133 	 * If not in the global zone, don't show information about other zones.
3134 	 */
3135 	if (!global && curproc->p_zone != zone) {
3136 		zone_rele(zone);
3137 		return (set_errno(EINVAL));
3138 	}
3139 
3140 	switch (attr) {
3141 	case ZONE_ATTR_ROOT:
3142 		if (global) {
3143 			/*
3144 			 * Copy the path to trim the trailing "/" (except for
3145 			 * the global zone).
3146 			 */
3147 			if (zone != global_zone)
3148 				size = zone->zone_rootpathlen - 1;
3149 			else
3150 				size = zone->zone_rootpathlen;
3151 			zonepath = kmem_alloc(size, KM_SLEEP);
3152 			bcopy(zone->zone_rootpath, zonepath, size);
3153 			zonepath[size - 1] = '\0';
3154 		} else {
3155 			/*
3156 			 * Caller is not in the global zone, just return
3157 			 * faked-up path for current zone.
3158 			 */
3159 			zonepath = "/";
3160 			size = 2;
3161 		}
3162 		if (bufsize > size)
3163 			bufsize = size;
3164 		if (buf != NULL) {
3165 			err = copyoutstr(zonepath, buf, bufsize, NULL);
3166 			if (err != 0 && err != ENAMETOOLONG)
3167 				error = EFAULT;
3168 		}
3169 		if (global)
3170 			kmem_free(zonepath, size);
3171 		break;
3172 
3173 	case ZONE_ATTR_NAME:
3174 		size = strlen(zone->zone_name) + 1;
3175 		if (bufsize > size)
3176 			bufsize = size;
3177 		if (buf != NULL) {
3178 			err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
3179 			if (err != 0 && err != ENAMETOOLONG)
3180 				error = EFAULT;
3181 		}
3182 		break;
3183 
3184 	case ZONE_ATTR_STATUS:
3185 		/*
3186 		 * Since we're not holding zonehash_lock, the zone status
3187 		 * may be anything; leave it up to userland to sort it out.
3188 		 */
3189 		size = sizeof (zone_status);
3190 		if (bufsize > size)
3191 			bufsize = size;
3192 		zone_status = zone_status_get(zone);
3193 		if (buf != NULL &&
3194 		    copyout(&zone_status, buf, bufsize) != 0)
3195 			error = EFAULT;
3196 		break;
3197 	case ZONE_ATTR_PRIVSET:
3198 		size = sizeof (priv_set_t);
3199 		if (bufsize > size)
3200 			bufsize = size;
3201 		if (buf != NULL &&
3202 		    copyout(zone->zone_privset, buf, bufsize) != 0)
3203 			error = EFAULT;
3204 		break;
3205 	case ZONE_ATTR_UNIQID:
3206 		size = sizeof (zone->zone_uniqid);
3207 		if (bufsize > size)
3208 			bufsize = size;
3209 		if (buf != NULL &&
3210 		    copyout(&zone->zone_uniqid, buf, bufsize) != 0)
3211 			error = EFAULT;
3212 		break;
3213 	case ZONE_ATTR_POOLID:
3214 		{
3215 			pool_t *pool;
3216 			poolid_t poolid;
3217 
3218 			if (pool_lock_intr() != 0) {
3219 				error = EINTR;
3220 				break;
3221 			}
3222 			pool = zone_pool_get(zone);
3223 			poolid = pool->pool_id;
3224 			pool_unlock();
3225 			size = sizeof (poolid);
3226 			if (bufsize > size)
3227 				bufsize = size;
3228 			if (buf != NULL && copyout(&poolid, buf, size) != 0)
3229 				error = EFAULT;
3230 		}
3231 		break;
3232 	case ZONE_ATTR_INITPID:
3233 		size = sizeof (initpid);
3234 		if (bufsize > size)
3235 			bufsize = size;
3236 		initpid = zone->zone_proc_initpid;
3237 		if (initpid == -1) {
3238 			error = ESRCH;
3239 			break;
3240 		}
3241 		if (buf != NULL &&
3242 		    copyout(&initpid, buf, bufsize) != 0)
3243 			error = EFAULT;
3244 		break;
3245 	default:
3246 		error = EINVAL;
3247 	}
3248 	zone_rele(zone);
3249 
3250 	if (error)
3251 		return (set_errno(error));
3252 	return ((ssize_t)size);
3253 }
3254 
3255 /*
3256  * Return zero if the process has at least one vnode mapped in to its
3257  * address space which shouldn't be allowed to change zones.
3258  */
3259 static int
3260 as_can_change_zones(void)
3261 {
3262 	proc_t *pp = curproc;
3263 	struct seg *seg;
3264 	struct as *as = pp->p_as;
3265 	vnode_t *vp;
3266 	int allow = 1;
3267 
3268 	ASSERT(pp->p_as != &kas);
3269 	AS_LOCK_ENTER(&as, &as->a_lock, RW_READER);
3270 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
3271 		/*
3272 		 * if we can't get a backing vnode for this segment then skip
3273 		 * it.
3274 		 */
3275 		vp = NULL;
3276 		if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
3277 			continue;
3278 		if (!vn_can_change_zones(vp)) { /* bail on first match */
3279 			allow = 0;
3280 			break;
3281 		}
3282 	}
3283 	AS_LOCK_EXIT(&as, &as->a_lock);
3284 	return (allow);
3285 }
3286 
3287 /*
3288  * Systemcall entry point for zone_enter().
3289  *
3290  * The current process is injected into said zone.  In the process
3291  * it will change its project membership, privileges, rootdir/cwd,
3292  * zone-wide rctls, and pool association to match those of the zone.
3293  *
3294  * The first zone_enter() called while the zone is in the ZONE_IS_READY
3295  * state will transition it to ZONE_IS_RUNNING.  Processes may only
3296  * enter a zone that is "ready" or "running".
3297  */
3298 static int
3299 zone_enter(zoneid_t zoneid)
3300 {
3301 	zone_t *zone;
3302 	vnode_t *vp;
3303 	proc_t *pp = curproc;
3304 	contract_t *ct;
3305 	cont_process_t *ctp;
3306 	task_t *tk, *oldtk;
3307 	kproject_t *zone_proj0;
3308 	cred_t *cr, *newcr;
3309 	pool_t *oldpool, *newpool;
3310 	sess_t *sp;
3311 	uid_t uid;
3312 	zone_status_t status;
3313 	int err = 0;
3314 	rctl_entity_p_t e;
3315 
3316 	if (secpolicy_zone_config(CRED()) != 0)
3317 		return (set_errno(EPERM));
3318 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
3319 		return (set_errno(EINVAL));
3320 
3321 	/*
3322 	 * Stop all lwps so we don't need to hold a lock to look at
3323 	 * curproc->p_zone.  This needs to happen before we grab any
3324 	 * locks to avoid deadlock (another lwp in the process could
3325 	 * be waiting for the held lock).
3326 	 */
3327 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
3328 		return (set_errno(EINTR));
3329 
3330 	/*
3331 	 * Make sure we're not changing zones with files open or mapped in
3332 	 * to our address space which shouldn't be changing zones.
3333 	 */
3334 	if (!files_can_change_zones()) {
3335 		err = EBADF;
3336 		goto out;
3337 	}
3338 	if (!as_can_change_zones()) {
3339 		err = EFAULT;
3340 		goto out;
3341 	}
3342 
3343 	mutex_enter(&zonehash_lock);
3344 	if (pp->p_zone != global_zone) {
3345 		mutex_exit(&zonehash_lock);
3346 		err = EINVAL;
3347 		goto out;
3348 	}
3349 
3350 	zone = zone_find_all_by_id(zoneid);
3351 	if (zone == NULL) {
3352 		mutex_exit(&zonehash_lock);
3353 		err = EINVAL;
3354 		goto out;
3355 	}
3356 
3357 	/*
3358 	 * To prevent processes in a zone from holding contracts on
3359 	 * extrazonal resources, and to avoid process contract
3360 	 * memberships which span zones, contract holders and processes
3361 	 * which aren't the sole members of their encapsulating process
3362 	 * contracts are not allowed to zone_enter.
3363 	 */
3364 	ctp = pp->p_ct_process;
3365 	ct = &ctp->conp_contract;
3366 	mutex_enter(&ct->ct_lock);
3367 	mutex_enter(&pp->p_lock);
3368 	if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
3369 		mutex_exit(&pp->p_lock);
3370 		mutex_exit(&ct->ct_lock);
3371 		mutex_exit(&zonehash_lock);
3372 		pool_unlock();
3373 		err = EINVAL;
3374 		goto out;
3375 	}
3376 
3377 	/*
3378 	 * Moreover, we don't allow processes whose encapsulating
3379 	 * process contracts have inherited extrazonal contracts.
3380 	 * While it would be easier to eliminate all process contracts
3381 	 * with inherited contracts, we need to be able to give a
3382 	 * restarted init (or other zone-penetrating process) its
3383 	 * predecessor's contracts.
3384 	 */
3385 	if (ctp->conp_ninherited != 0) {
3386 		contract_t *next;
3387 		for (next = list_head(&ctp->conp_inherited); next;
3388 		    next = list_next(&ctp->conp_inherited, next)) {
3389 			if (contract_getzuniqid(next) != zone->zone_uniqid) {
3390 				mutex_exit(&pp->p_lock);
3391 				mutex_exit(&ct->ct_lock);
3392 				mutex_exit(&zonehash_lock);
3393 				pool_unlock();
3394 				err = EINVAL;
3395 				goto out;
3396 			}
3397 		}
3398 	}
3399 	mutex_exit(&pp->p_lock);
3400 	mutex_exit(&ct->ct_lock);
3401 
3402 	status = zone_status_get(zone);
3403 	if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
3404 		/*
3405 		 * Can't join
3406 		 */
3407 		mutex_exit(&zonehash_lock);
3408 		err = EINVAL;
3409 		goto out;
3410 	}
3411 
3412 	/*
3413 	 * Make sure new priv set is within the permitted set for caller
3414 	 */
3415 	if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
3416 		mutex_exit(&zonehash_lock);
3417 		err = EPERM;
3418 		goto out;
3419 	}
3420 	/*
3421 	 * We want to momentarily drop zonehash_lock while we optimistically
3422 	 * bind curproc to the pool it should be running in.  This is safe
3423 	 * since the zone can't disappear (we have a hold on it).
3424 	 */
3425 	zone_hold(zone);
3426 	mutex_exit(&zonehash_lock);
3427 
3428 	/*
3429 	 * Grab pool_lock to keep the pools configuration from changing
3430 	 * and to stop ourselves from getting rebound to another pool
3431 	 * until we join the zone.
3432 	 */
3433 	if (pool_lock_intr() != 0) {
3434 		zone_rele(zone);
3435 		err = EINTR;
3436 		goto out;
3437 	}
3438 	ASSERT(secpolicy_pool(CRED()) == 0);
3439 	/*
3440 	 * Bind ourselves to the pool currently associated with the zone.
3441 	 */
3442 	oldpool = curproc->p_pool;
3443 	newpool = zone_pool_get(zone);
3444 	if (pool_state == POOL_ENABLED && newpool != oldpool &&
3445 	    (err = pool_do_bind(newpool, P_PID, P_MYID,
3446 	    POOL_BIND_ALL)) != 0) {
3447 		pool_unlock();
3448 		zone_rele(zone);
3449 		goto out;
3450 	}
3451 
3452 	/*
3453 	 * Grab cpu_lock now; we'll need it later when we call
3454 	 * task_join().
3455 	 */
3456 	mutex_enter(&cpu_lock);
3457 	mutex_enter(&zonehash_lock);
3458 	/*
3459 	 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
3460 	 */
3461 	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
3462 		/*
3463 		 * Can't join anymore.
3464 		 */
3465 		mutex_exit(&zonehash_lock);
3466 		mutex_exit(&cpu_lock);
3467 		if (pool_state == POOL_ENABLED &&
3468 		    newpool != oldpool)
3469 			(void) pool_do_bind(oldpool, P_PID, P_MYID,
3470 			    POOL_BIND_ALL);
3471 		pool_unlock();
3472 		zone_rele(zone);
3473 		err = EINVAL;
3474 		goto out;
3475 	}
3476 
3477 	mutex_enter(&pp->p_lock);
3478 	zone_proj0 = zone->zone_zsched->p_task->tk_proj;
3479 	/* verify that we do not exceed and task or lwp limits */
3480 	mutex_enter(&zone->zone_nlwps_lock);
3481 	/* add new lwps to zone and zone's proj0 */
3482 	zone_proj0->kpj_nlwps += pp->p_lwpcnt;
3483 	zone->zone_nlwps += pp->p_lwpcnt;
3484 	/* add 1 task to zone's proj0 */
3485 	zone_proj0->kpj_ntasks += 1;
3486 	mutex_exit(&pp->p_lock);
3487 	mutex_exit(&zone->zone_nlwps_lock);
3488 
3489 	/* remove lwps from proc's old zone and old project */
3490 	mutex_enter(&pp->p_zone->zone_nlwps_lock);
3491 	pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
3492 	pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
3493 	mutex_exit(&pp->p_zone->zone_nlwps_lock);
3494 
3495 	/*
3496 	 * Joining the zone cannot fail from now on.
3497 	 *
3498 	 * This means that a lot of the following code can be commonized and
3499 	 * shared with zsched().
3500 	 */
3501 
3502 	/*
3503 	 * Reset the encapsulating process contract's zone.
3504 	 */
3505 	ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
3506 	contract_setzuniqid(ct, zone->zone_uniqid);
3507 
3508 	/*
3509 	 * Create a new task and associate the process with the project keyed
3510 	 * by (projid,zoneid).
3511 	 *
3512 	 * We might as well be in project 0; the global zone's projid doesn't
3513 	 * make much sense in a zone anyhow.
3514 	 *
3515 	 * This also increments zone_ntasks, and returns with p_lock held.
3516 	 */
3517 	tk = task_create(0, zone);
3518 	oldtk = task_join(tk, 0);
3519 	mutex_exit(&cpu_lock);
3520 
3521 	pp->p_flag |= SZONETOP;
3522 	pp->p_zone = zone;
3523 
3524 	/*
3525 	 * call RCTLOP_SET functions on this proc
3526 	 */
3527 	e.rcep_p.zone = zone;
3528 	e.rcep_t = RCENTITY_ZONE;
3529 	(void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
3530 	    RCD_CALLBACK);
3531 	mutex_exit(&pp->p_lock);
3532 
3533 	/*
3534 	 * We don't need to hold any of zsched's locks here; not only do we know
3535 	 * the process and zone aren't going away, we know its session isn't
3536 	 * changing either.
3537 	 *
3538 	 * By joining zsched's session here, we mimic the behavior in the
3539 	 * global zone of init's sid being the pid of sched.  We extend this
3540 	 * to all zlogin-like zone_enter()'ing processes as well.
3541 	 */
3542 	mutex_enter(&pidlock);
3543 	sp = zone->zone_zsched->p_sessp;
3544 	SESS_HOLD(sp);
3545 	mutex_enter(&pp->p_lock);
3546 	pgexit(pp);
3547 	SESS_RELE(pp->p_sessp);
3548 	pp->p_sessp = sp;
3549 	pgjoin(pp, zone->zone_zsched->p_pidp);
3550 	mutex_exit(&pp->p_lock);
3551 	mutex_exit(&pidlock);
3552 
3553 	mutex_exit(&zonehash_lock);
3554 	/*
3555 	 * We're firmly in the zone; let pools progress.
3556 	 */
3557 	pool_unlock();
3558 	task_rele(oldtk);
3559 	/*
3560 	 * We don't need to retain a hold on the zone since we already
3561 	 * incremented zone_ntasks, so the zone isn't going anywhere.
3562 	 */
3563 	zone_rele(zone);
3564 
3565 	/*
3566 	 * Chroot
3567 	 */
3568 	vp = zone->zone_rootvp;
3569 	zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
3570 	zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
3571 
3572 	/*
3573 	 * Change process credentials
3574 	 */
3575 	newcr = cralloc();
3576 	mutex_enter(&pp->p_crlock);
3577 	cr = pp->p_cred;
3578 	crcopy_to(cr, newcr);
3579 	crsetzone(newcr, zone);
3580 	pp->p_cred = newcr;
3581 
3582 	/*
3583 	 * Restrict all process privilege sets to zone limit
3584 	 */
3585 	priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
3586 	priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
3587 	priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
3588 	priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
3589 	mutex_exit(&pp->p_crlock);
3590 	crset(pp, newcr);
3591 
3592 	/*
3593 	 * Adjust upcount to reflect zone entry.
3594 	 */
3595 	uid = crgetruid(newcr);
3596 	mutex_enter(&pidlock);
3597 	upcount_dec(uid, GLOBAL_ZONEID);
3598 	upcount_inc(uid, zoneid);
3599 	mutex_exit(&pidlock);
3600 
3601 	/*
3602 	 * Set up core file path and content.
3603 	 */
3604 	set_core_defaults();
3605 
3606 out:
3607 	/*
3608 	 * Let the other lwps continue.
3609 	 */
3610 	mutex_enter(&pp->p_lock);
3611 	if (curthread != pp->p_agenttp)
3612 		continuelwps(pp);
3613 	mutex_exit(&pp->p_lock);
3614 
3615 	return (err != 0 ? set_errno(err) : 0);
3616 }
3617 
3618 /*
3619  * Systemcall entry point for zone_list(2).
3620  *
3621  * Processes running in a (non-global) zone only see themselves.
3622  */
3623 static int
3624 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
3625 {
3626 	zoneid_t *zoneids;
3627 	zone_t *zone;
3628 	uint_t user_nzones, real_nzones;
3629 	int error = 0;
3630 	uint_t i;
3631 
3632 	if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
3633 		return (set_errno(EFAULT));
3634 
3635 	if (curproc->p_zone != global_zone) {
3636 		/* just return current zone */
3637 		real_nzones = 1;
3638 		zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
3639 		zoneids[0] = curproc->p_zone->zone_id;
3640 	} else {
3641 		mutex_enter(&zonehash_lock);
3642 		real_nzones = zonecount;
3643 		if (real_nzones) {
3644 			zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
3645 			    KM_SLEEP);
3646 			i = 0;
3647 			for (zone = list_head(&zone_active); zone != NULL;
3648 			    zone = list_next(&zone_active, zone))
3649 				zoneids[i++] = zone->zone_id;
3650 			ASSERT(i == real_nzones);
3651 		}
3652 		mutex_exit(&zonehash_lock);
3653 	}
3654 
3655 	if (user_nzones > real_nzones)
3656 		user_nzones = real_nzones;
3657 
3658 	if (copyout(&real_nzones, numzones, sizeof (uint_t)) != 0)
3659 		error = EFAULT;
3660 	else if (zoneidlist != NULL && user_nzones != 0) {
3661 		if (copyout(zoneids, zoneidlist,
3662 		    user_nzones * sizeof (zoneid_t)) != 0)
3663 			error = EFAULT;
3664 	}
3665 
3666 	if (real_nzones)
3667 		kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
3668 
3669 	if (error)
3670 		return (set_errno(error));
3671 	else
3672 		return (0);
3673 }
3674 
3675 /*
3676  * Systemcall entry point for zone_lookup(2).
3677  *
3678  * Non-global zones are only able to see themselves.
3679  */
3680 static zoneid_t
3681 zone_lookup(const char *zone_name)
3682 {
3683 	char *kname;
3684 	zone_t *zone;
3685 	zoneid_t zoneid;
3686 	int err;
3687 
3688 	if (zone_name == NULL) {
3689 		/* return caller's zone id */
3690 		return (getzoneid());
3691 	}
3692 
3693 	kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
3694 	if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
3695 		kmem_free(kname, ZONENAME_MAX);
3696 		return (set_errno(err));
3697 	}
3698 
3699 	mutex_enter(&zonehash_lock);
3700 	zone = zone_find_all_by_name(kname);
3701 	kmem_free(kname, ZONENAME_MAX);
3702 	if (zone == NULL || zone_status_get(zone) < ZONE_IS_READY ||
3703 	    (curproc->p_zone != global_zone && curproc->p_zone != zone)) {
3704 		/* in non-global zone, can only lookup own name */
3705 		mutex_exit(&zonehash_lock);
3706 		return (set_errno(EINVAL));
3707 	}
3708 	zoneid = zone->zone_id;
3709 	mutex_exit(&zonehash_lock);
3710 	return (zoneid);
3711 }
3712 
3713 /* ARGSUSED */
3714 long
3715 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4, void *arg5)
3716 {
3717 	zone_def zs;
3718 
3719 	switch (cmd) {
3720 	case ZONE_CREATE:
3721 		if (get_udatamodel() == DATAMODEL_NATIVE) {
3722 			if (copyin(arg1, &zs, sizeof (zone_def))) {
3723 				return (set_errno(EFAULT));
3724 			}
3725 		} else {
3726 #ifdef _SYSCALL32_IMPL
3727 			zone_def32 zs32;
3728 
3729 			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
3730 				return (set_errno(EFAULT));
3731 			}
3732 			zs.zone_name =
3733 			    (const char *)(unsigned long)zs32.zone_name;
3734 			zs.zone_root =
3735 			    (const char *)(unsigned long)zs32.zone_root;
3736 			zs.zone_privs =
3737 			    (const struct priv_set *)
3738 			    (unsigned long)zs32.zone_privs;
3739 			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
3740 			zs.rctlbufsz = zs32.rctlbufsz;
3741 			zs.extended_error =
3742 			    (int *)(unsigned long)zs32.extended_error;
3743 #else
3744 			panic("get_udatamodel() returned bogus result\n");
3745 #endif
3746 		}
3747 
3748 		return (zone_create(zs.zone_name, zs.zone_root,
3749 			zs.zone_privs, (caddr_t)zs.rctlbuf, zs.rctlbufsz,
3750 			zs.extended_error));
3751 	case ZONE_BOOT:
3752 		return (zone_boot((zoneid_t)(uintptr_t)arg1,
3753 		    (const char *)arg2));
3754 	case ZONE_DESTROY:
3755 		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
3756 	case ZONE_GETATTR:
3757 		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
3758 		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
3759 	case ZONE_ENTER:
3760 		return (zone_enter((zoneid_t)(uintptr_t)arg1));
3761 	case ZONE_LIST:
3762 		return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
3763 	case ZONE_SHUTDOWN:
3764 		return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
3765 	case ZONE_LOOKUP:
3766 		return (zone_lookup((const char *)arg1));
3767 	default:
3768 		return (set_errno(EINVAL));
3769 	}
3770 }
3771 
/*
 * Argument block handed from zone_uadmin() to the zone_ki_call_zoneadmd()
 * kernel thread.  zone_uadmin() allocates it and takes a hold on the zone;
 * the thread copies both members out, frees the block, and drops the hold
 * once it has emptied the zone.
 */
struct zarg {
	zone_t *zone;		/* zone to act on; held by zone_uadmin() */
	zone_cmd_arg_t arg;	/* request sent to zoneadmd over its door */
};
3776 
3777 static int
3778 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
3779 {
3780 	char *buf;
3781 	size_t buflen;
3782 	int error;
3783 
3784 	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
3785 	buf = kmem_alloc(buflen, KM_SLEEP);
3786 	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
3787 	error = door_ki_open(buf, doorp);
3788 	kmem_free(buf, buflen);
3789 	return (error);
3790 }
3791 
3792 static void
3793 zone_release_door(door_handle_t *doorp)
3794 {
3795 	door_ki_rele(*doorp);
3796 	*doorp = NULL;
3797 }
3798 
3799 static void
3800 zone_ki_call_zoneadmd(struct zarg *zargp)
3801 {
3802 	door_handle_t door = NULL;
3803 	door_arg_t darg, save_arg;
3804 	char *zone_name;
3805 	size_t zone_namelen;
3806 	zoneid_t zoneid;
3807 	zone_t *zone;
3808 	zone_cmd_arg_t arg;
3809 	uint64_t uniqid;
3810 	size_t size;
3811 	int error;
3812 	int retry;
3813 
3814 	zone = zargp->zone;
3815 	arg = zargp->arg;
3816 	kmem_free(zargp, sizeof (*zargp));
3817 
3818 	zone_namelen = strlen(zone->zone_name) + 1;
3819 	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
3820 	bcopy(zone->zone_name, zone_name, zone_namelen);
3821 	zoneid = zone->zone_id;
3822 	uniqid = zone->zone_uniqid;
3823 	/*
3824 	 * zoneadmd may be down, but at least we can empty out the zone.
3825 	 * We can ignore the return value of zone_empty() since we're called
3826 	 * from a kernel thread and know we won't be delivered any signals.
3827 	 */
3828 	ASSERT(curproc == &p0);
3829 	(void) zone_empty(zone);
3830 	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
3831 	zone_rele(zone);
3832 
3833 	size = sizeof (arg);
3834 	darg.rbuf = (char *)&arg;
3835 	darg.data_ptr = (char *)&arg;
3836 	darg.rsize = size;
3837 	darg.data_size = size;
3838 	darg.desc_ptr = NULL;
3839 	darg.desc_num = 0;
3840 
3841 	save_arg = darg;
3842 	/*
3843 	 * Since we're not holding a reference to the zone, any number of
3844 	 * things can go wrong, including the zone disappearing before we get a
3845 	 * chance to talk to zoneadmd.
3846 	 */
3847 	for (retry = 0; /* forever */; retry++) {
3848 		if (door == NULL &&
3849 		    (error = zone_lookup_door(zone_name, &door)) != 0) {
3850 			goto next;
3851 		}
3852 		ASSERT(door != NULL);
3853 
3854 		if ((error = door_ki_upcall(door, &darg)) == 0) {
3855 			break;
3856 		}
3857 		switch (error) {
3858 		case EINTR:
3859 			/* FALLTHROUGH */
3860 		case EAGAIN:	/* process may be forking */
3861 			/*
3862 			 * Back off for a bit
3863 			 */
3864 			break;
3865 		case EBADF:
3866 			zone_release_door(&door);
3867 			if (zone_lookup_door(zone_name, &door) != 0) {
3868 				/*
3869 				 * zoneadmd may be dead, but it may come back to
3870 				 * life later.
3871 				 */
3872 				break;
3873 			}
3874 			break;
3875 		default:
3876 			cmn_err(CE_WARN,
3877 			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
3878 			    error);
3879 			goto out;
3880 		}
3881 next:
3882 		/*
3883 		 * If this isn't the same zone_t that we originally had in mind,
3884 		 * then this is the same as if two kadmin requests come in at
3885 		 * the same time: the first one wins.  This means we lose, so we
3886 		 * bail.
3887 		 */
3888 		if ((zone = zone_find_by_id(zoneid)) == NULL) {
3889 			/*
3890 			 * Problem is solved.
3891 			 */
3892 			break;
3893 		}
3894 		if (zone->zone_uniqid != uniqid) {
3895 			/*
3896 			 * zoneid recycled
3897 			 */
3898 			zone_rele(zone);
3899 			break;
3900 		}
3901 		/*
3902 		 * We could zone_status_timedwait(), but there doesn't seem to
3903 		 * be much point in doing that (plus, it would mean that
3904 		 * zone_free() isn't called until this thread exits).
3905 		 */
3906 		zone_rele(zone);
3907 		delay(hz);
3908 		darg = save_arg;
3909 	}
3910 out:
3911 	if (door != NULL) {
3912 		zone_release_door(&door);
3913 	}
3914 	kmem_free(zone_name, zone_namelen);
3915 	thread_exit();
3916 }
3917 
3918 /*
3919  * Entry point for uadmin() to tell the zone to go away or reboot.  The caller
3920  * is a process in the zone to be modified.
3921  *
3922  * In order to shutdown the zone, we will hand off control to zoneadmd
3923  * (running in the global zone) via a door.  We do a half-hearted job at
3924  * killing all processes in the zone, create a kernel thread to contact
3925  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
3926  * a form of generation number used to let zoneadmd (as well as
3927  * zone_destroy()) know exactly which zone they're re talking about.
3928  */
3929 int
3930 zone_uadmin(int cmd, int fcn, cred_t *credp)
3931 {
3932 	struct zarg *zargp;
3933 	zone_cmd_t zcmd;
3934 	zone_t *zone;
3935 
3936 	zone = curproc->p_zone;
3937 	ASSERT(getzoneid() != GLOBAL_ZONEID);
3938 
3939 	switch (cmd) {
3940 	case A_SHUTDOWN:
3941 		switch (fcn) {
3942 		case AD_HALT:
3943 		case AD_POWEROFF:
3944 			zcmd = Z_HALT;
3945 			break;
3946 		case AD_BOOT:
3947 			zcmd = Z_REBOOT;
3948 			break;
3949 		case AD_IBOOT:
3950 		case AD_SBOOT:
3951 		case AD_SIBOOT:
3952 		case AD_NOSYNC:
3953 			return (ENOTSUP);
3954 		default:
3955 			return (EINVAL);
3956 		}
3957 		break;
3958 	case A_REBOOT:
3959 		zcmd = Z_REBOOT;
3960 		break;
3961 	case A_FTRACE:
3962 	case A_REMOUNT:
3963 	case A_FREEZE:
3964 	case A_DUMP:
3965 		return (ENOTSUP);
3966 	default:
3967 		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
3968 		return (EINVAL);
3969 	}
3970 
3971 	if (secpolicy_zone_admin(credp, B_FALSE))
3972 		return (EPERM);
3973 	mutex_enter(&zone_status_lock);
3974 	/*
3975 	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
3976 	 * is in the zone.
3977 	 */
3978 	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
3979 	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
3980 		/*
3981 		 * This zone is already on its way down.
3982 		 */
3983 		mutex_exit(&zone_status_lock);
3984 		return (0);
3985 	}
3986 	/*
3987 	 * Prevent future zone_enter()s
3988 	 */
3989 	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
3990 	mutex_exit(&zone_status_lock);
3991 
3992 	/*
3993 	 * Kill everyone now and call zoneadmd later.
3994 	 * zone_ki_call_zoneadmd() will do a more thorough job of this
3995 	 * later.
3996 	 */
3997 	killall(zone->zone_id);
3998 	/*
3999 	 * Now, create the thread to contact zoneadmd and do the rest of the
4000 	 * work.  This thread can't be created in our zone otherwise
4001 	 * zone_destroy() would deadlock.
4002 	 */
4003 	zargp = kmem_alloc(sizeof (*zargp), KM_SLEEP);
4004 	zargp->arg.cmd = zcmd;
4005 	zargp->arg.uniqid = zone->zone_uniqid;
4006 	(void) strcpy(zargp->arg.locale, "C");
4007 	zone_hold(zargp->zone = zone);
4008 
4009 	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
4010 	    TS_RUN, minclsyspri);
4011 	exit(CLD_EXITED, 0);
4012 
4013 	return (EINVAL);
4014 }
4015 
4016 /*
4017  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
4018  * status to ZONE_IS_SHUTTING_DOWN.
4019  */
4020 void
4021 zone_shutdown_global(void)
4022 {
4023 	ASSERT(curproc->p_zone == global_zone);
4024 
4025 	mutex_enter(&zone_status_lock);
4026 	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
4027 	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
4028 	mutex_exit(&zone_status_lock);
4029 }
4030