xref: /titanic_44/usr/src/cmd/zoneadmd/vplat.c (revision e9dbad6f263d5570ed7ff5443ec5b958af8c24d7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * This module contains functions used to bring up and tear down the
31  * Virtual Platform: [un]mounting file-systems, [un]plumbing network
32  * interfaces, [un]configuring devices, establishing resource controls,
33  * and creating/destroying the zone in the kernel.  These actions, on
34  * the way up, ready the zone; on the way down, they halt the zone.
35  * See the much longer block comment at the beginning of zoneadmd.c
36  * for a bigger picture of how the whole program functions.
37  *
38  * This module also has primary responsibility for the layout of "scratch
39  * zones."  These are mounted, but inactive, zones that are used during
40  * operating system upgrade and potentially other administrative action.  The
41  * scratch zone environment is similar to the miniroot environment.  The zone's
42  * actual root is mounted read-write on /a, and the standard paths (/usr,
43  * /sbin, /lib) all lead to read-only copies of the running system's binaries.
44  * This allows the administrative tools to manipulate the zone using "-R /a"
45  * without relying on any binaries in the zone itself.
46  *
47  * If the scratch zone is on an alternate root (Live Upgrade [LU] boot
48  * environment), then we must resolve the lofs mounts used there to uncover
49  * writable (unshared) resources.  Shared resources, though, are always
50  * read-only.  In addition, if the "same" zone with a different root path is
51  * currently running, then "/b" inside the zone points to the running zone's
52  * root.  This allows LU to synchronize configuration files during the upgrade
53  * process.
54  *
55  * To construct this environment, this module creates a tmpfs mount on
56  * $ZONEPATH/lu.  Inside this scratch area, the miniroot-like environment as
57  * described above is constructed on the fly.  The zone is then created using
58  * $ZONEPATH/lu as the root.
59  *
60  * Note that scratch zones are inactive.  The zone's bits are not running and
61  * likely cannot be run correctly until upgrade is done.  Init is not running
62  * there, nor is SMF.  Because of this, the "mounted" state of a scratch zone
63  * is not a part of the usual halt/ready/boot state machine.
64  */
65 
66 #include <sys/param.h>
67 #include <sys/mount.h>
68 #include <sys/mntent.h>
69 #include <sys/socket.h>
70 #include <sys/utsname.h>
71 #include <sys/types.h>
72 #include <sys/stat.h>
73 #include <sys/sockio.h>
74 #include <sys/stropts.h>
75 #include <sys/conf.h>
76 
77 #include <inet/tcp.h>
78 #include <arpa/inet.h>
79 #include <netinet/in.h>
80 #include <net/route.h>
81 
82 #include <stdio.h>
83 #include <errno.h>
84 #include <fcntl.h>
85 #include <unistd.h>
86 #include <rctl.h>
87 #include <stdlib.h>
88 #include <string.h>
89 #include <strings.h>
90 #include <wait.h>
91 #include <limits.h>
92 #include <libgen.h>
93 #include <libzfs.h>
94 #include <libdevinfo.h>
95 #include <zone.h>
96 #include <assert.h>
97 #include <libcontract.h>
98 #include <libcontract_priv.h>
99 #include <uuid/uuid.h>
100 
101 #include <sys/mntio.h>
102 #include <sys/mnttab.h>
103 #include <sys/fs/autofs.h>	/* for _autofssys() */
104 #include <sys/fs/lofs_info.h>
105 #include <sys/fs/zfs.h>
106 
107 #include <pool.h>
108 #include <sys/pool.h>
109 
110 #include <libzonecfg.h>
111 #include <synch.h>
112 
113 #include "zoneadmd.h"
114 #include <tsol/label.h>
115 #include <libtsnet.h>
116 #include <sys/priv.h>
117 
118 #define	V4_ADDR_LEN	32
119 #define	V6_ADDR_LEN	128
120 
121 /* 0755 is the default directory mode. */
122 #define	DEFAULT_DIR_MODE \
123 	(S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)
124 
125 #define	IPD_DEFAULT_OPTS \
126 	MNTOPT_RO "," MNTOPT_LOFS_NOSUB "," MNTOPT_NODEVICES
127 
128 #define	DFSTYPES	"/etc/dfs/fstypes"
129 #define	MAXTNZLEN	2048
130 
131 /*
132  * This is the set of directories and devices (relative to <zone_root>/dev)
133  * which must be present in every zone.  Users can augment this list with
134  * additional device rules in their zone configuration, but at present cannot
135  * remove any of this set of standard devices.
136  */
137 static const char *standard_devs[] = {
138 	"arp",
139 	"conslog",
140 	"cpu/self/cpuid",
141 	"crypto",
142 	"cryptoadm",
143 	"dsk",
144 	"dtrace/*",
145 	"dtrace/provider/*",
146 	"fd",
147 	"kstat",
148 	"lo0",
149 	"lo1",
150 	"lo2",
151 	"lo3",
152 	"log",
153 	"logindmux",
154 	"null",
155 #ifdef __sparc
156 	"openprom",
157 #endif
158 	"poll",
159 	"pool",
160 	"ptmx",
161 	"pts/*",
162 	"random",
163 	"rdsk",
164 	"rmt",
165 	"sad/user",
166 	"swap",
167 	"sysevent",
168 	"tcp",
169 	"tcp6",
170 	"term",
171 	"ticlts",
172 	"ticots",
173 	"ticotsord",
174 	"tty",
175 	"udp",
176 	"udp6",
177 	"urandom",
178 	"zero",
179 	"zfs",
180 	NULL
181 };
182 
183 struct source_target {
184 	const char *source;
185 	const char *target;
186 };
187 
188 /*
189  * Set of symlinks (relative to <zone_root>/dev) which must be present in
190  * every zone.
191  */
192 static struct source_target standard_devlinks[] = {
193 	{ "stderr",	"./fd/2" },
194 	{ "stdin",	"./fd/0" },
195 	{ "stdout",	"./fd/1" },
196 	{ "dtremote",	"/dev/null" },
197 	{ "console",	"zconsole" },
198 	{ "syscon",	"zconsole" },
199 	{ "sysmsg",	"zconsole" },
200 	{ "systty",	"zconsole" },
201 	{ "msglog",	"zconsole" },
202 	{ NULL, NULL }
203 };
204 
205 static int vplat_mount_dev(zlog_t *);
206 
207 /* for routing socket */
208 static int rts_seqno = 0;
209 
210 /* mangled zone name when mounting in an alternate root environment */
211 static char kernzone[ZONENAME_MAX];
212 
213 /* array of cached mount entries for resolve_lofs */
214 static struct mnttab *resolve_lofs_mnts, *resolve_lofs_mnt_max;
215 
216 /* for Trusted Extensions */
217 static tsol_zcent_t *get_zone_label(zlog_t *, priv_set_t *);
218 static int tsol_mounts(zlog_t *, char *, char *);
219 static void tsol_unmounts(zlog_t *, char *);
220 static m_label_t *zlabel = NULL;
221 static m_label_t *zid_label = NULL;
222 static priv_set_t *zprivs = NULL;
223 
224 /* from libsocket, not in any header file */
225 extern int getnetmaskbyaddr(struct in_addr, struct in_addr *);
226 
227 /*
228  * An optimization for build_mnttable: reallocate (and potentially copy the
229  * data) only once every N times through the loop.
230  */
231 #define	MNTTAB_HUNK	32
232 
233 /*
234  * Private autofs system call
235  */
236 extern int _autofssys(int, void *);
237 
238 static int
239 autofs_cleanup(zoneid_t zoneid)
240 {
241 	/*
242 	 * Ask autofs to unmount all trigger nodes in the given zone.
243 	 */
244 	return (_autofssys(AUTOFS_UNMOUNTALL, (void *)zoneid));
245 }
246 
247 static void
248 free_mnttable(struct mnttab *mnt_array, uint_t nelem)
249 {
250 	uint_t i;
251 
252 	if (mnt_array == NULL)
253 		return;
254 	for (i = 0; i < nelem; i++) {
255 		free(mnt_array[i].mnt_mountp);
256 		free(mnt_array[i].mnt_fstype);
257 		free(mnt_array[i].mnt_special);
258 		free(mnt_array[i].mnt_mntopts);
259 		assert(mnt_array[i].mnt_time == NULL);
260 	}
261 	free(mnt_array);
262 }
263 
264 /*
265  * Build the mount table for the zone rooted at "zroot", storing the resulting
266  * array of struct mnttabs in "mnt_arrayp" and the number of elements in the
267  * array in "nelemp".
268  */
269 static int
270 build_mnttable(zlog_t *zlogp, const char *zroot, size_t zrootlen, FILE *mnttab,
271     struct mnttab **mnt_arrayp, uint_t *nelemp)
272 {
273 	struct mnttab mnt;
274 	struct mnttab *mnts;
275 	struct mnttab *mnp;
276 	uint_t nmnt;
277 
278 	rewind(mnttab);
279 	resetmnttab(mnttab);
280 	nmnt = 0;
281 	mnts = NULL;
282 	while (getmntent(mnttab, &mnt) == 0) {
283 		struct mnttab *tmp_array;
284 
285 		if (strncmp(mnt.mnt_mountp, zroot, zrootlen) != 0)
286 			continue;
287 		if (nmnt % MNTTAB_HUNK == 0) {
288 			tmp_array = realloc(mnts,
289 			    (nmnt + MNTTAB_HUNK) * sizeof (*mnts));
290 			if (tmp_array == NULL) {
291 				free_mnttable(mnts, nmnt);
292 				return (-1);
293 			}
294 			mnts = tmp_array;
295 		}
296 		mnp = &mnts[nmnt++];
297 
298 		/*
299 		 * Zero out any fields we're not using.
300 		 */
301 		(void) memset(mnp, 0, sizeof (*mnp));
302 
303 		if (mnt.mnt_special != NULL)
304 			mnp->mnt_special = strdup(mnt.mnt_special);
305 		if (mnt.mnt_mntopts != NULL)
306 			mnp->mnt_mntopts = strdup(mnt.mnt_mntopts);
307 		mnp->mnt_mountp = strdup(mnt.mnt_mountp);
308 		mnp->mnt_fstype = strdup(mnt.mnt_fstype);
309 		if ((mnt.mnt_special != NULL && mnp->mnt_special == NULL) ||
310 		    (mnt.mnt_mntopts != NULL && mnp->mnt_mntopts == NULL) ||
311 		    mnp->mnt_mountp == NULL || mnp->mnt_fstype == NULL) {
312 			zerror(zlogp, B_TRUE, "memory allocation failed");
313 			free_mnttable(mnts, nmnt);
314 			return (-1);
315 		}
316 	}
317 	*mnt_arrayp = mnts;
318 	*nelemp = nmnt;
319 	return (0);
320 }
321 
322 /*
323  * This is an optimization.  The resolve_lofs function is used quite frequently
324  * to manipulate file paths, and on a machine with a large number of zones,
325  * there will be a huge number of mounted file systems.  Thus, we trigger a
326  * reread of the list of mount points
327  */
328 static void
329 lofs_discard_mnttab(void)
330 {
331 	free_mnttable(resolve_lofs_mnts,
332 	    resolve_lofs_mnt_max - resolve_lofs_mnts);
333 	resolve_lofs_mnts = resolve_lofs_mnt_max = NULL;
334 }
335 
336 static int
337 lofs_read_mnttab(zlog_t *zlogp)
338 {
339 	FILE *mnttab;
340 	uint_t nmnts;
341 
342 	if ((mnttab = fopen(MNTTAB, "r")) == NULL)
343 		return (-1);
344 	if (build_mnttable(zlogp, "", 0, mnttab, &resolve_lofs_mnts,
345 	    &nmnts) == -1) {
346 		(void) fclose(mnttab);
347 		return (-1);
348 	}
349 	(void) fclose(mnttab);
350 	resolve_lofs_mnt_max = resolve_lofs_mnts + nmnts;
351 	return (0);
352 }
353 
354 /*
355  * This function loops over potential loopback mounts and symlinks in a given
356  * path and resolves them all down to an absolute path.
357  */
358 static void
359 resolve_lofs(zlog_t *zlogp, char *path, size_t pathlen)
360 {
361 	int len, arlen;
362 	const char *altroot;
363 	char tmppath[MAXPATHLEN];
364 	boolean_t outside_altroot;
365 
366 	if ((len = resolvepath(path, tmppath, sizeof (tmppath))) == -1)
367 		return;
368 	tmppath[len] = '\0';
369 	(void) strlcpy(path, tmppath, sizeof (tmppath));
370 
371 	/* This happens once per zoneadmd operation. */
372 	if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
373 		return;
374 
375 	altroot = zonecfg_get_root();
376 	arlen = strlen(altroot);
377 	outside_altroot = B_FALSE;
378 	for (;;) {
379 		struct mnttab *mnp;
380 
381 		for (mnp = resolve_lofs_mnts; mnp < resolve_lofs_mnt_max;
382 		    mnp++) {
383 			if (mnp->mnt_fstype == NULL ||
384 			    mnp->mnt_mountp == NULL ||
385 			    mnp->mnt_special == NULL ||
386 			    strcmp(mnp->mnt_fstype, MNTTYPE_LOFS) != 0)
387 				continue;
388 			len = strlen(mnp->mnt_mountp);
389 			if (strncmp(mnp->mnt_mountp, path, len) == 0 &&
390 			    (path[len] == '/' || path[len] == '\0'))
391 				break;
392 		}
393 		if (mnp >= resolve_lofs_mnt_max)
394 			break;
395 		if (outside_altroot) {
396 			char *cp;
397 			int olen = sizeof (MNTOPT_RO) - 1;
398 
399 			/*
400 			 * If we run into a read-only mount outside of the
401 			 * alternate root environment, then the user doesn't
402 			 * want this path to be made read-write.
403 			 */
404 			if (mnp->mnt_mntopts != NULL &&
405 			    (cp = strstr(mnp->mnt_mntopts, MNTOPT_RO)) !=
406 			    NULL &&
407 			    (cp == mnp->mnt_mntopts || cp[-1] == ',') &&
408 			    (cp[olen] == '\0' || cp[olen] == ',')) {
409 				break;
410 			}
411 		} else if (arlen > 0 &&
412 		    (strncmp(mnp->mnt_special, altroot, arlen) != 0 ||
413 		    (mnp->mnt_special[arlen] != '\0' &&
414 		    mnp->mnt_special[arlen] != '/'))) {
415 			outside_altroot = B_TRUE;
416 		}
417 		/* use temporary buffer because new path might be longer */
418 		(void) snprintf(tmppath, sizeof (tmppath), "%s%s",
419 		    mnp->mnt_special, path + len);
420 		if ((len = resolvepath(tmppath, path, pathlen)) == -1)
421 			break;
422 		path[len] = '\0';
423 	}
424 }
425 
426 /*
427  * For a regular mount, check if a replacement lofs mount is needed because the
428  * referenced device is already mounted somewhere.
429  */
430 static int
431 check_lofs_needed(zlog_t *zlogp, struct zone_fstab *fsptr)
432 {
433 	struct mnttab *mnp;
434 	zone_fsopt_t *optptr, *onext;
435 
436 	/* This happens once per zoneadmd operation. */
437 	if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
438 		return (-1);
439 
440 	/*
441 	 * If this special node isn't already in use, then it's ours alone;
442 	 * no need to worry about conflicting mounts.
443 	 */
444 	for (mnp = resolve_lofs_mnts; mnp < resolve_lofs_mnt_max;
445 	    mnp++) {
446 		if (strcmp(mnp->mnt_special, fsptr->zone_fs_special) == 0)
447 			break;
448 	}
449 	if (mnp >= resolve_lofs_mnt_max)
450 		return (0);
451 
452 	/*
453 	 * Convert this duplicate mount into a lofs mount.
454 	 */
455 	(void) strlcpy(fsptr->zone_fs_special, mnp->mnt_mountp,
456 	    sizeof (fsptr->zone_fs_special));
457 	(void) strlcpy(fsptr->zone_fs_type, MNTTYPE_LOFS,
458 	    sizeof (fsptr->zone_fs_type));
459 	fsptr->zone_fs_raw[0] = '\0';
460 
461 	/*
462 	 * Discard all but one of the original options and set that to be the
463 	 * same set of options used for inherit package directory resources.
464 	 */
465 	optptr = fsptr->zone_fs_options;
466 	if (optptr == NULL) {
467 		optptr = malloc(sizeof (*optptr));
468 		if (optptr == NULL) {
469 			zerror(zlogp, B_TRUE, "cannot mount %s",
470 			    fsptr->zone_fs_dir);
471 			return (-1);
472 		}
473 	} else {
474 		while ((onext = optptr->zone_fsopt_next) != NULL) {
475 			optptr->zone_fsopt_next = onext->zone_fsopt_next;
476 			free(onext);
477 		}
478 	}
479 	(void) strcpy(optptr->zone_fsopt_opt, IPD_DEFAULT_OPTS);
480 	optptr->zone_fsopt_next = NULL;
481 	fsptr->zone_fs_options = optptr;
482 	return (0);
483 }
484 
485 static int
486 make_one_dir(zlog_t *zlogp, const char *prefix, const char *subdir, mode_t mode)
487 {
488 	char path[MAXPATHLEN];
489 	struct stat st;
490 
491 	if (snprintf(path, sizeof (path), "%s%s", prefix, subdir) >
492 	    sizeof (path)) {
493 		zerror(zlogp, B_FALSE, "pathname %s%s is too long", prefix,
494 		    subdir);
495 		return (-1);
496 	}
497 
498 	if (lstat(path, &st) == 0) {
499 		/*
500 		 * We don't check the file mode since presumably the zone
501 		 * administrator may have had good reason to change the mode,
502 		 * and we don't need to second guess him.
503 		 */
504 		if (!S_ISDIR(st.st_mode)) {
505 			if (is_system_labeled() &&
506 			    S_ISREG(st.st_mode)) {
507 				/*
508 				 * The need to mount readonly copies of
509 				 * global zone /etc/ files is unique to
510 				 * Trusted Extensions.
511 				 */
512 				if (strncmp(subdir, "/etc/",
513 				    strlen("/etc/")) != 0) {
514 					zerror(zlogp, B_FALSE,
515 					    "%s is not in /etc", path);
516 					return (-1);
517 				}
518 			} else {
519 				zerror(zlogp, B_FALSE,
520 				    "%s is not a directory", path);
521 				return (-1);
522 			}
523 		}
524 	} else if (mkdirp(path, mode) != 0) {
525 		if (errno == EROFS)
526 			zerror(zlogp, B_FALSE, "Could not mkdir %s.\nIt is on "
527 			    "a read-only file system in this local zone.\nMake "
528 			    "sure %s exists in the global zone.", path, subdir);
529 		else
530 			zerror(zlogp, B_TRUE, "mkdirp of %s failed", path);
531 		return (-1);
532 	}
533 	return (0);
534 }
535 
/*
 * Free a NULL-terminated array of heap-allocated strings, then the array
 * itself.  A NULL array is a no-op.
 */
static void
free_remote_fstypes(char **types)
{
	char **tp;

	if (types == NULL)
		return;

	for (tp = types; *tp != NULL; tp++)
		free(*tp);
	free(types);
}
547 
548 static char **
549 get_remote_fstypes(zlog_t *zlogp)
550 {
551 	char **types = NULL;
552 	FILE *fp;
553 	char buf[MAXPATHLEN];
554 	char fstype[MAXPATHLEN];
555 	uint_t lines = 0;
556 	uint_t i;
557 
558 	if ((fp = fopen(DFSTYPES, "r")) == NULL) {
559 		zerror(zlogp, B_TRUE, "failed to open %s", DFSTYPES);
560 		return (NULL);
561 	}
562 	/*
563 	 * Count the number of lines
564 	 */
565 	while (fgets(buf, sizeof (buf), fp) != NULL)
566 		lines++;
567 	if (lines == 0)	/* didn't read anything; empty file */
568 		goto out;
569 	rewind(fp);
570 	/*
571 	 * Allocate enough space for a NULL-terminated array.
572 	 */
573 	types = calloc(lines + 1, sizeof (char *));
574 	if (types == NULL) {
575 		zerror(zlogp, B_TRUE, "memory allocation failed");
576 		goto out;
577 	}
578 	i = 0;
579 	while (fgets(buf, sizeof (buf), fp) != NULL) {
580 		/* LINTED - fstype is big enough to hold buf */
581 		if (sscanf(buf, "%s", fstype) == 0) {
582 			zerror(zlogp, B_FALSE, "unable to parse %s", DFSTYPES);
583 			free_remote_fstypes(types);
584 			types = NULL;
585 			goto out;
586 		}
587 		types[i] = strdup(fstype);
588 		if (types[i] == NULL) {
589 			zerror(zlogp, B_TRUE, "memory allocation failed");
590 			free_remote_fstypes(types);
591 			types = NULL;
592 			goto out;
593 		}
594 		i++;
595 	}
596 out:
597 	(void) fclose(fp);
598 	return (types);
599 }
600 
601 static boolean_t
602 is_remote_fstype(const char *fstype, char *const *remote_fstypes)
603 {
604 	uint_t i;
605 
606 	if (remote_fstypes == NULL)
607 		return (B_FALSE);
608 	for (i = 0; remote_fstypes[i] != NULL; i++) {
609 		if (strcmp(remote_fstypes[i], fstype) == 0)
610 			return (B_TRUE);
611 	}
612 	return (B_FALSE);
613 }
614 
615 /*
616  * This converts a zone root path (normally of the form .../root) to a Live
617  * Upgrade scratch zone root (of the form .../lu).
618  */
619 static void
620 root_to_lu(zlog_t *zlogp, char *zroot, size_t zrootlen, boolean_t isresolved)
621 {
622 	if (!isresolved && zonecfg_in_alt_root())
623 		resolve_lofs(zlogp, zroot, zrootlen);
624 	(void) strcpy(strrchr(zroot, '/') + 1, "lu");
625 }
626 
627 /*
628  * The general strategy for unmounting filesystems is as follows:
629  *
630  * - Remote filesystems may be dead, and attempting to contact them as
631  * part of a regular unmount may hang forever; we want to always try to
632  * forcibly unmount such filesystems and only fall back to regular
633  * unmounts if the filesystem doesn't support forced unmounts.
634  *
635  * - We don't want to unnecessarily corrupt metadata on local
636  * filesystems (ie UFS), so we want to start off with graceful unmounts,
637  * and only escalate to doing forced unmounts if we get stuck.
638  *
639  * We start off walking backwards through the mount table.  This doesn't
640  * give us strict ordering but ensures that we try to unmount submounts
641  * first.  We thus limit the number of failed umount2(2) calls.
642  *
643  * The mechanism for determining if we're stuck is to count the number
644  * of failed unmounts each iteration through the mount table.  This
645  * gives us an upper bound on the number of filesystems which remain
646  * mounted (autofs trigger nodes are dealt with separately).  If at the
647  * end of one unmount+autofs_cleanup cycle we still have the same number
648  * of mounts that we started out with, we're stuck and try a forced
649  * unmount.  If that fails (filesystem doesn't support forced unmounts)
650  * then we bail and are unable to teardown the zone.  If it succeeds,
651  * we're no longer stuck so we continue with our policy of trying
652  * graceful mounts first.
653  *
654  * Zone must be down (ie, no processes or threads active).
655  */
656 static int
657 unmount_filesystems(zlog_t *zlogp, zoneid_t zoneid, boolean_t unmount_cmd)
658 {
659 	int error = 0;
660 	FILE *mnttab;
661 	struct mnttab *mnts;
662 	uint_t nmnt;
663 	char zroot[MAXPATHLEN + 1];
664 	size_t zrootlen;
665 	uint_t oldcount = UINT_MAX;
666 	boolean_t stuck = B_FALSE;
667 	char **remote_fstypes = NULL;
668 
669 	if (zone_get_rootpath(zone_name, zroot, sizeof (zroot)) != Z_OK) {
670 		zerror(zlogp, B_FALSE, "unable to determine zone root");
671 		return (-1);
672 	}
673 	if (unmount_cmd)
674 		root_to_lu(zlogp, zroot, sizeof (zroot), B_FALSE);
675 
676 	(void) strcat(zroot, "/");
677 	zrootlen = strlen(zroot);
678 
679 	/*
680 	 * For Trusted Extensions unmount each higher level zone's mount
681 	 * of our zone's /export/home
682 	 */
683 	if (!unmount_cmd)
684 		tsol_unmounts(zlogp, zone_name);
685 
686 	if ((mnttab = fopen(MNTTAB, "r")) == NULL) {
687 		zerror(zlogp, B_TRUE, "failed to open %s", MNTTAB);
688 		return (-1);
689 	}
690 	/*
691 	 * Use our hacky mntfs ioctl so we see everything, even mounts with
692 	 * MS_NOMNTTAB.
693 	 */
694 	if (ioctl(fileno(mnttab), MNTIOC_SHOWHIDDEN, NULL) < 0) {
695 		zerror(zlogp, B_TRUE, "unable to configure %s", MNTTAB);
696 		error++;
697 		goto out;
698 	}
699 
700 	/*
701 	 * Build the list of remote fstypes so we know which ones we
702 	 * should forcibly unmount.
703 	 */
704 	remote_fstypes = get_remote_fstypes(zlogp);
705 	for (; /* ever */; ) {
706 		uint_t newcount = 0;
707 		boolean_t unmounted;
708 		struct mnttab *mnp;
709 		char *path;
710 		uint_t i;
711 
712 		mnts = NULL;
713 		nmnt = 0;
714 		/*
715 		 * MNTTAB gives us a way to walk through mounted
716 		 * filesystems; we need to be able to walk them in
717 		 * reverse order, so we build a list of all mounted
718 		 * filesystems.
719 		 */
720 		if (build_mnttable(zlogp, zroot, zrootlen, mnttab, &mnts,
721 		    &nmnt) != 0) {
722 			error++;
723 			goto out;
724 		}
725 		for (i = 0; i < nmnt; i++) {
726 			mnp = &mnts[nmnt - i - 1]; /* access in reverse order */
727 			path = mnp->mnt_mountp;
728 			unmounted = B_FALSE;
729 			/*
730 			 * Try forced unmount first for remote filesystems.
731 			 *
732 			 * Not all remote filesystems support forced unmounts,
733 			 * so if this fails (ENOTSUP) we'll continue on
734 			 * and try a regular unmount.
735 			 */
736 			if (is_remote_fstype(mnp->mnt_fstype, remote_fstypes)) {
737 				if (umount2(path, MS_FORCE) == 0)
738 					unmounted = B_TRUE;
739 			}
740 			/*
741 			 * Try forced unmount if we're stuck.
742 			 */
743 			if (stuck) {
744 				if (umount2(path, MS_FORCE) == 0) {
745 					unmounted = B_TRUE;
746 					stuck = B_FALSE;
747 				} else {
748 					/*
749 					 * The first failure indicates a
750 					 * mount we won't be able to get
751 					 * rid of automatically, so we
752 					 * bail.
753 					 */
754 					error++;
755 					zerror(zlogp, B_FALSE,
756 					    "unable to unmount '%s'", path);
757 					free_mnttable(mnts, nmnt);
758 					goto out;
759 				}
760 			}
761 			/*
762 			 * Try regular unmounts for everything else.
763 			 */
764 			if (!unmounted && umount2(path, 0) != 0)
765 				newcount++;
766 		}
767 		free_mnttable(mnts, nmnt);
768 
769 		if (newcount == 0)
770 			break;
771 		if (newcount >= oldcount) {
772 			/*
773 			 * Last round didn't unmount anything; we're stuck and
774 			 * should start trying forced unmounts.
775 			 */
776 			stuck = B_TRUE;
777 		}
778 		oldcount = newcount;
779 
780 		/*
781 		 * Autofs doesn't let you unmount its trigger nodes from
782 		 * userland so we have to tell the kernel to cleanup for us.
783 		 */
784 		if (autofs_cleanup(zoneid) != 0) {
785 			zerror(zlogp, B_TRUE, "unable to remove autofs nodes");
786 			error++;
787 			goto out;
788 		}
789 	}
790 
791 out:
792 	free_remote_fstypes(remote_fstypes);
793 	(void) fclose(mnttab);
794 	return (error ? -1 : 0);
795 }
796 
797 static int
798 fs_compare(const void *m1, const void *m2)
799 {
800 	struct zone_fstab *i = (struct zone_fstab *)m1;
801 	struct zone_fstab *j = (struct zone_fstab *)m2;
802 
803 	return (strcmp(i->zone_fs_dir, j->zone_fs_dir));
804 }
805 
806 /*
807  * Fork and exec (and wait for) the mentioned binary with the provided
808  * arguments.  Returns (-1) if something went wrong with fork(2) or exec(2),
809  * returns the exit status otherwise.
810  *
811  * If we were unable to exec the provided pathname (for whatever
812  * reason), we return the special token ZEXIT_EXEC.  The current value
813  * of ZEXIT_EXEC doesn't conflict with legitimate exit codes of the
814  * consumers of this function; any future consumers must make sure this
815  * remains the case.
816  */
817 static int
818 forkexec(zlog_t *zlogp, const char *path, char *const argv[])
819 {
820 	pid_t child_pid;
821 	int child_status = 0;
822 
823 	/*
824 	 * Do not let another thread localize a message while we are forking.
825 	 */
826 	(void) mutex_lock(&msglock);
827 	child_pid = fork();
828 	(void) mutex_unlock(&msglock);
829 	if (child_pid == -1) {
830 		zerror(zlogp, B_TRUE, "could not fork for %s", argv[0]);
831 		return (-1);
832 	} else if (child_pid == 0) {
833 		closefrom(0);
834 		/* redirect stdin, stdout & stderr to /dev/null */
835 		(void) open("/dev/null", O_RDONLY);	/* stdin */
836 		(void) open("/dev/null", O_WRONLY);	/* stdout */
837 		(void) open("/dev/null", O_WRONLY);	/* stderr */
838 		(void) execv(path, argv);
839 		/*
840 		 * Since we are in the child, there is no point calling zerror()
841 		 * since there is nobody waiting to consume it.  So exit with a
842 		 * special code that the parent will recognize and call zerror()
843 		 * accordingly.
844 		 */
845 
846 		_exit(ZEXIT_EXEC);
847 	} else {
848 		(void) waitpid(child_pid, &child_status, 0);
849 	}
850 
851 	if (WIFSIGNALED(child_status)) {
852 		zerror(zlogp, B_FALSE, "%s unexpectedly terminated due to "
853 		    "signal %d", path, WTERMSIG(child_status));
854 		return (-1);
855 	}
856 	assert(WIFEXITED(child_status));
857 	if (WEXITSTATUS(child_status) == ZEXIT_EXEC) {
858 		zerror(zlogp, B_FALSE, "failed to exec %s", path);
859 		return (-1);
860 	}
861 	return (WEXITSTATUS(child_status));
862 }
863 
864 static int
865 dofsck(zlog_t *zlogp, const char *fstype, const char *rawdev)
866 {
867 	char cmdbuf[MAXPATHLEN];
868 	char *argv[4];
869 	int status;
870 
871 	/*
872 	 * We could alternatively have called /usr/sbin/fsck -F <fstype>, but
873 	 * that would cost us an extra fork/exec without buying us anything.
874 	 */
875 	if (snprintf(cmdbuf, sizeof (cmdbuf), "/usr/lib/fs/%s/fsck", fstype)
876 	    > sizeof (cmdbuf)) {
877 		zerror(zlogp, B_FALSE, "file-system type %s too long", fstype);
878 		return (-1);
879 	}
880 
881 	argv[0] = "fsck";
882 	argv[1] = "-m";
883 	argv[2] = (char *)rawdev;
884 	argv[3] = NULL;
885 
886 	status = forkexec(zlogp, cmdbuf, argv);
887 	if (status == 0 || status == -1)
888 		return (status);
889 	zerror(zlogp, B_FALSE, "fsck of '%s' failed with exit status %d; "
890 	    "run fsck manually", rawdev, status);
891 	return (-1);
892 }
893 
894 static int
895 domount(zlog_t *zlogp, const char *fstype, const char *opts,
896     const char *special, const char *directory)
897 {
898 	char cmdbuf[MAXPATHLEN];
899 	char *argv[6];
900 	int status;
901 
902 	/*
903 	 * We could alternatively have called /usr/sbin/mount -F <fstype>, but
904 	 * that would cost us an extra fork/exec without buying us anything.
905 	 */
906 	if (snprintf(cmdbuf, sizeof (cmdbuf), "/usr/lib/fs/%s/mount", fstype)
907 	    > sizeof (cmdbuf)) {
908 		zerror(zlogp, B_FALSE, "file-system type %s too long", fstype);
909 		return (-1);
910 	}
911 	argv[0] = "mount";
912 	if (opts[0] == '\0') {
913 		argv[1] = (char *)special;
914 		argv[2] = (char *)directory;
915 		argv[3] = NULL;
916 	} else {
917 		argv[1] = "-o";
918 		argv[2] = (char *)opts;
919 		argv[3] = (char *)special;
920 		argv[4] = (char *)directory;
921 		argv[5] = NULL;
922 	}
923 
924 	status = forkexec(zlogp, cmdbuf, argv);
925 	if (status == 0 || status == -1)
926 		return (status);
927 	if (opts[0] == '\0')
928 		zerror(zlogp, B_FALSE, "\"%s %s %s\" "
929 		    "failed with exit code %d",
930 		    cmdbuf, special, directory, status);
931 	else
932 		zerror(zlogp, B_FALSE, "\"%s -o %s %s %s\" "
933 		    "failed with exit code %d",
934 		    cmdbuf, opts, special, directory, status);
935 	return (-1);
936 }
937 
938 /*
939  * Make sure if a given path exists, it is not a sym-link, and is a directory.
940  */
941 static int
942 check_path(zlog_t *zlogp, const char *path)
943 {
944 	struct stat statbuf;
945 	char respath[MAXPATHLEN];
946 	int res;
947 
948 	if (lstat(path, &statbuf) != 0) {
949 		if (errno == ENOENT)
950 			return (0);
951 		zerror(zlogp, B_TRUE, "can't stat %s", path);
952 		return (-1);
953 	}
954 	if (S_ISLNK(statbuf.st_mode)) {
955 		zerror(zlogp, B_FALSE, "%s is a symlink", path);
956 		return (-1);
957 	}
958 	if (!S_ISDIR(statbuf.st_mode)) {
959 		if (is_system_labeled() && S_ISREG(statbuf.st_mode)) {
960 			/*
961 			 * The need to mount readonly copies of
962 			 * global zone /etc/ files is unique to
963 			 * Trusted Extensions.
964 			 * The check for /etc/ via strstr() is to
965 			 * allow paths like $ZONEROOT/etc/passwd
966 			 */
967 			if (strstr(path, "/etc/") == NULL) {
968 				zerror(zlogp, B_FALSE,
969 				    "%s is not in /etc", path);
970 				return (-1);
971 			}
972 		} else {
973 			zerror(zlogp, B_FALSE, "%s is not a directory", path);
974 			return (-1);
975 		}
976 	}
977 	if ((res = resolvepath(path, respath, sizeof (respath))) == -1) {
978 		zerror(zlogp, B_TRUE, "unable to resolve path %s", path);
979 		return (-1);
980 	}
981 	respath[res] = '\0';
982 	if (strcmp(path, respath) != 0) {
983 		/*
984 		 * We don't like ".."s and "."s throwing us off
985 		 */
986 		zerror(zlogp, B_FALSE, "%s is not a canonical path", path);
987 		return (-1);
988 	}
989 	return (0);
990 }
991 
992 /*
993  * Check every component of rootpath/relpath.  If any component fails (ie,
994  * exists but isn't the canonical path to a directory), it is returned in
995  * badpath, which is assumed to be at least of size MAXPATHLEN.
996  *
997  * Relpath must begin with '/'.
998  */
999 static boolean_t
1000 valid_mount_path(zlog_t *zlogp, const char *rootpath, const char *relpath)
1001 {
1002 	char abspath[MAXPATHLEN], *slashp;
1003 
1004 	/*
1005 	 * Make sure abspath has at least one '/' after its rootpath
1006 	 * component, and ends with '/'.
1007 	 */
1008 	if (snprintf(abspath, sizeof (abspath), "%s%s/", rootpath, relpath) >
1009 	    sizeof (abspath)) {
1010 		zerror(zlogp, B_FALSE, "pathname %s%s is too long", rootpath,
1011 		    relpath);
1012 		return (B_FALSE);
1013 	}
1014 
1015 	slashp = &abspath[strlen(rootpath)];
1016 	assert(*slashp == '/');
1017 	do {
1018 		*slashp = '\0';
1019 		if (check_path(zlogp, abspath) != 0)
1020 			return (B_FALSE);
1021 		*slashp = '/';
1022 		slashp++;
1023 	} while ((slashp = strchr(slashp, '/')) != NULL);
1024 	return (B_TRUE);
1025 }
1026 
1027 static int
1028 mount_one(zlog_t *zlogp, struct zone_fstab *fsptr, const char *rootpath)
1029 {
1030 	char    path[MAXPATHLEN];
1031 	char	specpath[MAXPATHLEN];
1032 	char    optstr[MAX_MNTOPT_STR];
1033 	zone_fsopt_t *optptr;
1034 
1035 	if (!valid_mount_path(zlogp, rootpath, fsptr->zone_fs_dir)) {
1036 		zerror(zlogp, B_FALSE, "%s%s is not a valid mount point",
1037 		    rootpath, fsptr->zone_fs_dir);
1038 		return (-1);
1039 	}
1040 
1041 	if (make_one_dir(zlogp, rootpath, fsptr->zone_fs_dir,
1042 	    DEFAULT_DIR_MODE) != 0)
1043 		return (-1);
1044 
1045 	(void) snprintf(path, sizeof (path), "%s%s", rootpath,
1046 	    fsptr->zone_fs_dir);
1047 
1048 	if (strlen(fsptr->zone_fs_special) == 0) {
1049 		/*
1050 		 * A zero-length special is how we distinguish IPDs from
1051 		 * general-purpose FSs.  Make sure it mounts from a place that
1052 		 * can be seen via the alternate zone's root.
1053 		 */
1054 		if (snprintf(specpath, sizeof (specpath), "%s%s",
1055 		    zonecfg_get_root(), fsptr->zone_fs_dir) >=
1056 		    sizeof (specpath)) {
1057 			zerror(zlogp, B_FALSE, "cannot mount %s: path too "
1058 			    "long in alternate root", fsptr->zone_fs_dir);
1059 			return (-1);
1060 		}
1061 		if (zonecfg_in_alt_root())
1062 			resolve_lofs(zlogp, specpath, sizeof (specpath));
1063 		if (domount(zlogp, MNTTYPE_LOFS, IPD_DEFAULT_OPTS,
1064 		    specpath, path) != 0) {
1065 			zerror(zlogp, B_TRUE, "failed to loopback mount %s",
1066 			    specpath);
1067 			return (-1);
1068 		}
1069 		return (0);
1070 	}
1071 
1072 	/*
1073 	 * In general the strategy here is to do just as much verification as
1074 	 * necessary to avoid crashing or otherwise doing something bad; if the
1075 	 * administrator initiated the operation via zoneadm(1m), he'll get
1076 	 * auto-verification which will let him know what's wrong.  If he
1077 	 * modifies the zone configuration of a running zone and doesn't attempt
1078 	 * to verify that it's OK we won't crash but won't bother trying to be
1079 	 * too helpful either.  zoneadm verify is only a couple keystrokes away.
1080 	 */
1081 	if (!zonecfg_valid_fs_type(fsptr->zone_fs_type)) {
1082 		zerror(zlogp, B_FALSE, "cannot mount %s on %s: "
1083 		    "invalid file-system type %s", fsptr->zone_fs_special,
1084 		    fsptr->zone_fs_dir, fsptr->zone_fs_type);
1085 		return (-1);
1086 	}
1087 
1088 	/*
1089 	 * If we're looking at an alternate root environment, then construct
1090 	 * read-only loopback mounts as necessary.  For all lofs mounts, make
1091 	 * sure that the 'special' entry points inside the alternate root.  (We
1092 	 * don't do this with other mounts, as devfs isn't in the alternate
1093 	 * root, and we need to assume the device environment is roughly the
1094 	 * same.)
1095 	 */
1096 	if (zonecfg_in_alt_root()) {
1097 		struct stat64 st;
1098 
1099 		if (stat64(fsptr->zone_fs_special, &st) != -1 &&
1100 		    S_ISBLK(st.st_mode) &&
1101 		    check_lofs_needed(zlogp, fsptr) == -1)
1102 			return (-1);
1103 		if (strcmp(fsptr->zone_fs_type, MNTTYPE_LOFS) == 0) {
1104 			if (snprintf(specpath, sizeof (specpath), "%s%s",
1105 			    zonecfg_get_root(), fsptr->zone_fs_special) >=
1106 			    sizeof (specpath)) {
1107 				zerror(zlogp, B_FALSE, "cannot mount %s: path "
1108 				    "too long in alternate root",
1109 				    fsptr->zone_fs_special);
1110 				return (-1);
1111 			}
1112 			resolve_lofs(zlogp, specpath, sizeof (specpath));
1113 			(void) strlcpy(fsptr->zone_fs_special, specpath,
1114 			    sizeof (fsptr->zone_fs_special));
1115 		}
1116 	}
1117 
1118 	/*
1119 	 * Run 'fsck -m' if there's a device to fsck.
1120 	 */
1121 	if (fsptr->zone_fs_raw[0] != '\0' &&
1122 	    dofsck(zlogp, fsptr->zone_fs_type, fsptr->zone_fs_raw) != 0)
1123 		return (-1);
1124 
1125 	/*
1126 	 * Build up mount option string.
1127 	 */
1128 	optstr[0] = '\0';
1129 	if (fsptr->zone_fs_options != NULL) {
1130 		(void) strlcpy(optstr, fsptr->zone_fs_options->zone_fsopt_opt,
1131 		    sizeof (optstr));
1132 		for (optptr = fsptr->zone_fs_options->zone_fsopt_next;
1133 		    optptr != NULL; optptr = optptr->zone_fsopt_next) {
1134 			(void) strlcat(optstr, ",", sizeof (optstr));
1135 			(void) strlcat(optstr, optptr->zone_fsopt_opt,
1136 			    sizeof (optstr));
1137 		}
1138 	}
1139 	return (domount(zlogp, fsptr->zone_fs_type, optstr,
1140 	    fsptr->zone_fs_special, path));
1141 }
1142 
1143 static void
1144 free_fs_data(struct zone_fstab *fsarray, uint_t nelem)
1145 {
1146 	uint_t i;
1147 
1148 	if (fsarray == NULL)
1149 		return;
1150 	for (i = 0; i < nelem; i++)
1151 		zonecfg_free_fs_option_list(fsarray[i].zone_fs_options);
1152 	free(fsarray);
1153 }
1154 
1155 /*
1156  * This function initiates the creation of a small Solaris Environment for
1157  * scratch zone. The Environment creation process is split up into two
1158  * functions(build_mounted_pre_var() and build_mounted_post_var()). It
1159  * is done this way because:
1160  * 	We need to have both /etc and /var in the root of the scratchzone.
1161  * 	We loopback mount zone's own /etc and /var into the root of the
1162  * 	scratch zone. Unlike /etc, /var can be a seperate filesystem. So we
1163  * 	need to delay the mount of /var till the zone's root gets populated.
1164  *	So mounting of localdirs[](/etc and /var) have been moved to the
1165  * 	build_mounted_post_var() which gets called only after the zone
1166  * 	specific filesystems are mounted.
1167  */
1168 static boolean_t
1169 build_mounted_pre_var(zlog_t *zlogp, char *rootpath,
1170     size_t rootlen, const char *zonepath)
1171 {
1172 	char tmp[MAXPATHLEN], fromdir[MAXPATHLEN];
1173 	char luroot[MAXPATHLEN];
1174 	const char **cpp;
1175 	static const char *mkdirs[] = {
1176 		"/system", "/system/contract", "/system/object", "/proc",
1177 		"/dev", "/tmp", "/a", NULL
1178 	};
1179 	char *altstr;
1180 	FILE *fp;
1181 	uuid_t uuid;
1182 
1183 	resolve_lofs(zlogp, rootpath, rootlen);
1184 	(void) snprintf(luroot, sizeof (luroot), "%s/lu", zonepath);
1185 	resolve_lofs(zlogp, luroot, sizeof (luroot));
1186 	(void) snprintf(tmp, sizeof (tmp), "%s/bin", luroot);
1187 	(void) symlink("./usr/bin", tmp);
1188 
1189 	/*
1190 	 * These are mostly special mount points; not handled here.  (See
1191 	 * zone_mount_early.)
1192 	 */
1193 	for (cpp = mkdirs; *cpp != NULL; cpp++) {
1194 		(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1195 		if (mkdir(tmp, 0755) != 0) {
1196 			zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1197 			return (B_FALSE);
1198 		}
1199 	}
1200 	/*
1201 	 * This is here to support lucopy.  If there's an instance of this same
1202 	 * zone on the current running system, then we mount its root up as
1203 	 * read-only inside the scratch zone.
1204 	 */
1205 	(void) zonecfg_get_uuid(zone_name, uuid);
1206 	altstr = strdup(zonecfg_get_root());
1207 	if (altstr == NULL) {
1208 		zerror(zlogp, B_TRUE, "memory allocation failed");
1209 		return (B_FALSE);
1210 	}
1211 	zonecfg_set_root("");
1212 	(void) strlcpy(tmp, zone_name, sizeof (tmp));
1213 	(void) zonecfg_get_name_by_uuid(uuid, tmp, sizeof (tmp));
1214 	if (zone_get_rootpath(tmp, fromdir, sizeof (fromdir)) == Z_OK &&
1215 	    strcmp(fromdir, rootpath) != 0) {
1216 		(void) snprintf(tmp, sizeof (tmp), "%s/b", luroot);
1217 		if (mkdir(tmp, 0755) != 0) {
1218 			zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1219 			return (B_FALSE);
1220 		}
1221 		if (domount(zlogp, MNTTYPE_LOFS, IPD_DEFAULT_OPTS, fromdir,
1222 		    tmp) != 0) {
1223 			zerror(zlogp, B_TRUE, "cannot mount %s on %s", tmp,
1224 			    fromdir);
1225 			return (B_FALSE);
1226 		}
1227 	}
1228 	zonecfg_set_root(altstr);
1229 	free(altstr);
1230 
1231 	if ((fp = zonecfg_open_scratch(luroot, B_TRUE)) == NULL) {
1232 		zerror(zlogp, B_TRUE, "cannot open zone mapfile");
1233 		return (B_FALSE);
1234 	}
1235 	(void) ftruncate(fileno(fp), 0);
1236 	if (zonecfg_add_scratch(fp, zone_name, kernzone, "/") == -1) {
1237 		zerror(zlogp, B_TRUE, "cannot add zone mapfile entry");
1238 	}
1239 	zonecfg_close_scratch(fp);
1240 	(void) snprintf(tmp, sizeof (tmp), "%s/a", luroot);
1241 	if (domount(zlogp, MNTTYPE_LOFS, "", rootpath, tmp) != 0)
1242 		return (B_FALSE);
1243 	(void) strlcpy(rootpath, tmp, rootlen);
1244 	return (B_TRUE);
1245 }
1246 
1247 
1248 static boolean_t
1249 build_mounted_post_var(zlog_t *zlogp, char *rootpath, const char *zonepath)
1250 {
1251 	char tmp[MAXPATHLEN], fromdir[MAXPATHLEN];
1252 	char luroot[MAXPATHLEN];
1253 	const char **cpp;
1254 	static const char *localdirs[] = {
1255 		"/etc", "/var", NULL
1256 	};
1257 	static const char *loopdirs[] = {
1258 		"/etc/lib", "/etc/fs", "/lib", "/sbin", "/platform",
1259 		"/usr", NULL
1260 	};
1261 	static const char *tmpdirs[] = {
1262 		"/tmp", "/var/run", NULL
1263 	};
1264 	struct stat st;
1265 
1266 	(void) snprintf(luroot, sizeof (luroot), "%s/lu", zonepath);
1267 
1268 	/*
1269 	 * These are mounted read-write from the zone undergoing upgrade.  We
1270 	 * must be careful not to 'leak' things from the main system into the
1271 	 * zone, and this accomplishes that goal.
1272 	 */
1273 	for (cpp = localdirs; *cpp != NULL; cpp++) {
1274 		(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1275 		(void) snprintf(fromdir, sizeof (fromdir), "%s%s", rootpath,
1276 		    *cpp);
1277 		if (mkdir(tmp, 0755) != 0) {
1278 			zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1279 			return (B_FALSE);
1280 		}
1281 		if (domount(zlogp, MNTTYPE_LOFS, "", fromdir, tmp) != 0) {
1282 			zerror(zlogp, B_TRUE, "cannot mount %s on %s", tmp,
1283 			    *cpp);
1284 			return (B_FALSE);
1285 		}
1286 	}
1287 
1288 	/*
1289 	 * These are things mounted read-only from the running system because
1290 	 * they contain binaries that must match system.
1291 	 */
1292 	for (cpp = loopdirs; *cpp != NULL; cpp++) {
1293 		(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1294 		if (mkdir(tmp, 0755) != 0) {
1295 			if (errno != EEXIST) {
1296 				zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1297 				return (B_FALSE);
1298 			}
1299 			if (lstat(tmp, &st) != 0) {
1300 				zerror(zlogp, B_TRUE, "cannot stat %s", tmp);
1301 				return (B_FALSE);
1302 			}
1303 			/*
1304 			 * Ignore any non-directories encountered.  These are
1305 			 * things that have been converted into symlinks
1306 			 * (/etc/fs and /etc/lib) and no longer need a lofs
1307 			 * fixup.
1308 			 */
1309 			if (!S_ISDIR(st.st_mode))
1310 				continue;
1311 		}
1312 		if (domount(zlogp, MNTTYPE_LOFS, IPD_DEFAULT_OPTS, *cpp,
1313 		    tmp) != 0) {
1314 			zerror(zlogp, B_TRUE, "cannot mount %s on %s", tmp,
1315 			    *cpp);
1316 			return (B_FALSE);
1317 		}
1318 	}
1319 
1320 	/*
1321 	 * These are things with tmpfs mounted inside.
1322 	 */
1323 	for (cpp = tmpdirs; *cpp != NULL; cpp++) {
1324 		(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1325 		if (mkdir(tmp, 0755) != 0 && errno != EEXIST) {
1326 			zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1327 			return (B_FALSE);
1328 		}
1329 		if (domount(zlogp, MNTTYPE_TMPFS, "", "swap", tmp) != 0) {
1330 			zerror(zlogp, B_TRUE, "cannot mount swap on %s", *cpp);
1331 			return (B_FALSE);
1332 		}
1333 	}
1334 	return (B_TRUE);
1335 }
1336 
1337 static int
1338 mount_filesystems(zlog_t *zlogp, boolean_t mount_cmd)
1339 {
1340 	char	rootpath[MAXPATHLEN];
1341 	char	zonepath[MAXPATHLEN];
1342 	int	num_fs = 0, i;
1343 	struct zone_fstab fstab, *fs_ptr = NULL, *tmp_ptr;
1344 	struct zone_fstab *fsp;
1345 	zone_dochandle_t handle = NULL;
1346 	zone_state_t zstate;
1347 
1348 	if (zone_get_state(zone_name, &zstate) != Z_OK ||
1349 	    (zstate != ZONE_STATE_READY && zstate != ZONE_STATE_MOUNTED)) {
1350 		zerror(zlogp, B_FALSE,
1351 		    "zone must be in '%s' or '%s' state to mount file-systems",
1352 		    zone_state_str(ZONE_STATE_READY),
1353 		    zone_state_str(ZONE_STATE_MOUNTED));
1354 		goto bad;
1355 	}
1356 
1357 	if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
1358 		zerror(zlogp, B_TRUE, "unable to determine zone path");
1359 		goto bad;
1360 	}
1361 
1362 	if (zone_get_rootpath(zone_name, rootpath, sizeof (rootpath)) != Z_OK) {
1363 		zerror(zlogp, B_TRUE, "unable to determine zone root");
1364 		goto bad;
1365 	}
1366 
1367 	if ((handle = zonecfg_init_handle()) == NULL) {
1368 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
1369 		goto bad;
1370 	}
1371 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK ||
1372 	    zonecfg_setfsent(handle) != Z_OK) {
1373 		zerror(zlogp, B_FALSE, "invalid configuration");
1374 		goto bad;
1375 	}
1376 
1377 	/*
1378 	 * Iterate through the rest of the filesystems, first the IPDs, then
1379 	 * the general FSs.  Sort them all, then mount them in sorted order.
1380 	 * This is to make sure the higher level directories (e.g., /usr)
1381 	 * get mounted before any beneath them (e.g., /usr/local).
1382 	 */
1383 	if (zonecfg_setipdent(handle) != Z_OK) {
1384 		zerror(zlogp, B_FALSE, "invalid configuration");
1385 		goto bad;
1386 	}
1387 	while (zonecfg_getipdent(handle, &fstab) == Z_OK) {
1388 		num_fs++;
1389 		if ((tmp_ptr = realloc(fs_ptr,
1390 		    num_fs * sizeof (*tmp_ptr))) == NULL) {
1391 			zerror(zlogp, B_TRUE, "memory allocation failed");
1392 			num_fs--;
1393 			(void) zonecfg_endipdent(handle);
1394 			goto bad;
1395 		}
1396 		fs_ptr = tmp_ptr;
1397 		fsp = &fs_ptr[num_fs - 1];
1398 		/*
1399 		 * IPDs logically only have a mount point; all other properties
1400 		 * are implied.
1401 		 */
1402 		(void) strlcpy(fsp->zone_fs_dir,
1403 		    fstab.zone_fs_dir, sizeof (fsp->zone_fs_dir));
1404 		fsp->zone_fs_special[0] = '\0';
1405 		fsp->zone_fs_raw[0] = '\0';
1406 		fsp->zone_fs_type[0] = '\0';
1407 		fsp->zone_fs_options = NULL;
1408 	}
1409 	(void) zonecfg_endipdent(handle);
1410 
1411 	if (zonecfg_setfsent(handle) != Z_OK) {
1412 		zerror(zlogp, B_FALSE, "invalid configuration");
1413 		goto bad;
1414 	}
1415 	while (zonecfg_getfsent(handle, &fstab) == Z_OK) {
1416 		/*
1417 		 * ZFS filesystems will not be accessible under an alternate
1418 		 * root, since the pool will not be known.  Ignore them in this
1419 		 * case.
1420 		 */
1421 		if (mount_cmd && strcmp(fstab.zone_fs_type, MNTTYPE_ZFS) == 0)
1422 			continue;
1423 
1424 		num_fs++;
1425 		if ((tmp_ptr = realloc(fs_ptr,
1426 		    num_fs * sizeof (*tmp_ptr))) == NULL) {
1427 			zerror(zlogp, B_TRUE, "memory allocation failed");
1428 			num_fs--;
1429 			(void) zonecfg_endfsent(handle);
1430 			goto bad;
1431 		}
1432 		fs_ptr = tmp_ptr;
1433 		fsp = &fs_ptr[num_fs - 1];
1434 		(void) strlcpy(fsp->zone_fs_dir,
1435 		    fstab.zone_fs_dir, sizeof (fsp->zone_fs_dir));
1436 		(void) strlcpy(fsp->zone_fs_special, fstab.zone_fs_special,
1437 		    sizeof (fsp->zone_fs_special));
1438 		(void) strlcpy(fsp->zone_fs_raw, fstab.zone_fs_raw,
1439 		    sizeof (fsp->zone_fs_raw));
1440 		(void) strlcpy(fsp->zone_fs_type, fstab.zone_fs_type,
1441 		    sizeof (fsp->zone_fs_type));
1442 		fsp->zone_fs_options = fstab.zone_fs_options;
1443 	}
1444 	(void) zonecfg_endfsent(handle);
1445 	zonecfg_fini_handle(handle);
1446 	handle = NULL;
1447 
1448 	/*
1449 	 * When we're mounting a zone for administration, / is the
1450 	 * scratch zone and dev is mounted at /dev.  The to-be-upgraded
1451 	 * zone is mounted at /a, and we set up that environment so that
1452 	 * process can access both the running system's utilities
1453 	 * and the to-be-modified zone's files.  The only exception
1454 	 * is the zone's /dev which isn't mounted at all, which is
1455 	 * the same as global zone installation where /a/dev and
1456 	 * /a/devices are not mounted.
1457 	 * Zone mounting is done in three phases.
1458 	 *   1) Create and populate lu directory (build_mounted_pre_var()).
1459 	 *   2) Mount the required filesystems as per the zone configuration.
1460 	 *   3) Set up the rest of the scratch zone environment
1461 	 *	(build_mounted_post_var()).
1462 	 */
1463 	if (mount_cmd &&
1464 	    !build_mounted_pre_var(zlogp,
1465 	    rootpath, sizeof (rootpath), zonepath))
1466 		goto bad;
1467 
1468 	qsort(fs_ptr, num_fs, sizeof (*fs_ptr), fs_compare);
1469 	for (i = 0; i < num_fs; i++) {
1470 		if (mount_one(zlogp, &fs_ptr[i], rootpath) != 0)
1471 			goto bad;
1472 	}
1473 	if (mount_cmd &&
1474 	    !build_mounted_post_var(zlogp, rootpath, zonepath))
1475 		goto bad;
1476 
1477 	/*
1478 	 * For Trusted Extensions cross-mount each lower level /export/home
1479 	 */
1480 	if (!mount_cmd && tsol_mounts(zlogp, zone_name, rootpath) != 0)
1481 		goto bad;
1482 
1483 	free_fs_data(fs_ptr, num_fs);
1484 
1485 	/*
1486 	 * Everything looks fine.
1487 	 */
1488 	return (0);
1489 
1490 bad:
1491 	if (handle != NULL)
1492 		zonecfg_fini_handle(handle);
1493 	free_fs_data(fs_ptr, num_fs);
1494 	return (-1);
1495 }
1496 
/*
 * Convert the decimal prefix length in prefixstr into a netmask, written
 * most-significant byte first starting at maskstr.
 *
 * The caller makes sure neither pointer is NULL.  The mask bytes are OR'ed
 * into for a partial trailing byte, so the caller must hand in a buffer that
 * is already zeroed and large enough for maxprefixlen bits.
 *
 * Returns 0 on success.  Returns 1, leaving the buffer untouched, when
 * prefixstr is empty, is not a plain decimal number, or is outside the
 * range 0..maxprefixlen.
 */
static int
addr2netmask(char *prefixstr, int maxprefixlen, uchar_t *maskstr)
{
	char *endp;
	long prefixlen;

	/*
	 * strtol() rather than atoi(): atoi() quietly turns "" or "abc" into
	 * a prefix length of 0 and ignores trailing garbage.
	 */
	errno = 0;
	prefixlen = strtol(prefixstr, &endp, 10);
	if (endp == prefixstr || *endp != '\0' || errno != 0)
		return (1);
	if (prefixlen < 0 || prefixlen > maxprefixlen)
		return (1);

	/* whole bytes first */
	while (prefixlen >= 8) {
		*maskstr++ = 0xFF;
		prefixlen -= 8;
	}
	/* then the leading bits of the final, partial byte */
	while (prefixlen > 0) {
		*maskstr |= (uchar_t)(1 << (8 - (int)prefixlen));
		prefixlen--;
	}
	return (0);
}
1517 
1518 /*
1519  * Tear down all interfaces belonging to the given zone.  This should
1520  * be called with the zone in a state other than "running", so that
1521  * interfaces can't be assigned to the zone after this returns.
1522  *
1523  * If anything goes wrong, log an error message and return an error.
1524  */
1525 static int
1526 unconfigure_network_interfaces(zlog_t *zlogp, zoneid_t zone_id)
1527 {
1528 	struct lifnum lifn;
1529 	struct lifconf lifc;
1530 	struct lifreq *lifrp, lifrl;
1531 	int64_t lifc_flags = LIFC_NOXMIT | LIFC_ALLZONES;
1532 	int num_ifs, s, i, ret_code = 0;
1533 	uint_t bufsize;
1534 	char *buf = NULL;
1535 
1536 	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
1537 		zerror(zlogp, B_TRUE, "could not get socket");
1538 		ret_code = -1;
1539 		goto bad;
1540 	}
1541 	lifn.lifn_family = AF_UNSPEC;
1542 	lifn.lifn_flags = (int)lifc_flags;
1543 	if (ioctl(s, SIOCGLIFNUM, (char *)&lifn) < 0) {
1544 		zerror(zlogp, B_TRUE,
1545 		    "could not determine number of interfaces");
1546 		ret_code = -1;
1547 		goto bad;
1548 	}
1549 	num_ifs = lifn.lifn_count;
1550 	bufsize = num_ifs * sizeof (struct lifreq);
1551 	if ((buf = malloc(bufsize)) == NULL) {
1552 		zerror(zlogp, B_TRUE, "memory allocation failed");
1553 		ret_code = -1;
1554 		goto bad;
1555 	}
1556 	lifc.lifc_family = AF_UNSPEC;
1557 	lifc.lifc_flags = (int)lifc_flags;
1558 	lifc.lifc_len = bufsize;
1559 	lifc.lifc_buf = buf;
1560 	if (ioctl(s, SIOCGLIFCONF, (char *)&lifc) < 0) {
1561 		zerror(zlogp, B_TRUE, "could not get configured interfaces");
1562 		ret_code = -1;
1563 		goto bad;
1564 	}
1565 	lifrp = lifc.lifc_req;
1566 	for (i = lifc.lifc_len / sizeof (struct lifreq); i > 0; i--, lifrp++) {
1567 		(void) close(s);
1568 		if ((s = socket(lifrp->lifr_addr.ss_family, SOCK_DGRAM, 0)) <
1569 		    0) {
1570 			zerror(zlogp, B_TRUE, "%s: could not get socket",
1571 			    lifrl.lifr_name);
1572 			ret_code = -1;
1573 			continue;
1574 		}
1575 		(void) memset(&lifrl, 0, sizeof (lifrl));
1576 		(void) strncpy(lifrl.lifr_name, lifrp->lifr_name,
1577 		    sizeof (lifrl.lifr_name));
1578 		if (ioctl(s, SIOCGLIFZONE, (caddr_t)&lifrl) < 0) {
1579 			zerror(zlogp, B_TRUE,
1580 			    "%s: could not determine zone interface belongs to",
1581 			    lifrl.lifr_name);
1582 			ret_code = -1;
1583 			continue;
1584 		}
1585 		if (lifrl.lifr_zoneid == zone_id) {
1586 			if (ioctl(s, SIOCLIFREMOVEIF, (caddr_t)&lifrl) < 0) {
1587 				zerror(zlogp, B_TRUE,
1588 				    "%s: could not remove interface",
1589 				    lifrl.lifr_name);
1590 				ret_code = -1;
1591 				continue;
1592 			}
1593 		}
1594 	}
1595 bad:
1596 	if (s > 0)
1597 		(void) close(s);
1598 	if (buf)
1599 		free(buf);
1600 	return (ret_code);
1601 }
1602 
/*
 * Scratch socket addresses used by who_is_using() when it assembles its
 * RTM_GET request: so_dst carries the address being looked up and so_ifp is
 * an AF_LINK placeholder for the interface.  Being file-scope statics, they
 * keep their contents between calls.
 */
static union	sockunion {
	struct	sockaddr sa;
	struct	sockaddr_in sin;
	struct	sockaddr_dl sdl;
	struct	sockaddr_in6 sin6;
} so_dst, so_ifp;
1609 
/*
 * Routing socket message buffer shared by the request and the reply in
 * who_is_using(): the header is followed by room for the socket addresses
 * that accompany it.
 */
static struct {
	struct	rt_msghdr hdr;
	char	space[512];
} rtmsg;
1614 
1615 static int
1616 salen(struct sockaddr *sa)
1617 {
1618 	switch (sa->sa_family) {
1619 	case AF_INET:
1620 		return (sizeof (struct sockaddr_in));
1621 	case AF_LINK:
1622 		return (sizeof (struct sockaddr_dl));
1623 	case AF_INET6:
1624 		return (sizeof (struct sockaddr_in6));
1625 	default:
1626 		return (sizeof (struct sockaddr));
1627 	}
1628 }
1629 
/*
 * Round a up to the next multiple of sizeof (long); values that are not
 * positive yield sizeof (long).  The bit trick relies on sizeof (long)
 * being a power of two.  Note that the argument is evaluated more than once.
 */
#define	ROUNDUP_LONG(a) \
	((a) > 0 ? (1 + (((a) - 1) | (sizeof (long) - 1))) : sizeof (long))
1632 
1633 /*
1634  * Look up which zone is using a given IP address.  The address in question
1635  * is expected to have been stuffed into the structure to which lifr points
1636  * via a previous SIOCGLIFADDR ioctl().
1637  *
1638  * This is done using black router socket magic.
1639  *
1640  * Return the name of the zone on success or NULL on failure.
1641  *
1642  * This is a lot of code for a simple task; a new ioctl request to take care
1643  * of this might be a useful RFE.
1644  */
1645 
1646 static char *
1647 who_is_using(zlog_t *zlogp, struct lifreq *lifr)
1648 {
1649 	static char answer[ZONENAME_MAX];
1650 	pid_t pid;
1651 	int s, rlen, l, i;
1652 	char *cp = rtmsg.space;
1653 	struct sockaddr_dl *ifp = NULL;
1654 	struct sockaddr *sa;
1655 	char save_if_name[LIFNAMSIZ];
1656 
1657 	answer[0] = '\0';
1658 
1659 	pid = getpid();
1660 	if ((s = socket(PF_ROUTE, SOCK_RAW, 0)) < 0) {
1661 		zerror(zlogp, B_TRUE, "could not get routing socket");
1662 		return (NULL);
1663 	}
1664 
1665 	if (lifr->lifr_addr.ss_family == AF_INET) {
1666 		struct sockaddr_in *sin4;
1667 
1668 		so_dst.sa.sa_family = AF_INET;
1669 		sin4 = (struct sockaddr_in *)&lifr->lifr_addr;
1670 		so_dst.sin.sin_addr = sin4->sin_addr;
1671 	} else {
1672 		struct sockaddr_in6 *sin6;
1673 
1674 		so_dst.sa.sa_family = AF_INET6;
1675 		sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr;
1676 		so_dst.sin6.sin6_addr = sin6->sin6_addr;
1677 	}
1678 
1679 	so_ifp.sa.sa_family = AF_LINK;
1680 
1681 	(void) memset(&rtmsg, 0, sizeof (rtmsg));
1682 	rtmsg.hdr.rtm_type = RTM_GET;
1683 	rtmsg.hdr.rtm_flags = RTF_UP | RTF_HOST;
1684 	rtmsg.hdr.rtm_version = RTM_VERSION;
1685 	rtmsg.hdr.rtm_seq = ++rts_seqno;
1686 	rtmsg.hdr.rtm_addrs = RTA_IFP | RTA_DST;
1687 
1688 	l = ROUNDUP_LONG(salen(&so_dst.sa));
1689 	(void) memmove(cp, &(so_dst), l);
1690 	cp += l;
1691 	l = ROUNDUP_LONG(salen(&so_ifp.sa));
1692 	(void) memmove(cp, &(so_ifp), l);
1693 	cp += l;
1694 
1695 	rtmsg.hdr.rtm_msglen = l = cp - (char *)&rtmsg;
1696 
1697 	if ((rlen = write(s, &rtmsg, l)) < 0) {
1698 		zerror(zlogp, B_TRUE, "writing to routing socket");
1699 		return (NULL);
1700 	} else if (rlen < (int)rtmsg.hdr.rtm_msglen) {
1701 		zerror(zlogp, B_TRUE,
1702 		    "write to routing socket got only %d for len\n", rlen);
1703 		return (NULL);
1704 	}
1705 	do {
1706 		l = read(s, &rtmsg, sizeof (rtmsg));
1707 	} while (l > 0 && (rtmsg.hdr.rtm_seq != rts_seqno ||
1708 	    rtmsg.hdr.rtm_pid != pid));
1709 	if (l < 0) {
1710 		zerror(zlogp, B_TRUE, "reading from routing socket");
1711 		return (NULL);
1712 	}
1713 
1714 	if (rtmsg.hdr.rtm_version != RTM_VERSION) {
1715 		zerror(zlogp, B_FALSE,
1716 		    "routing message version %d not understood",
1717 		    rtmsg.hdr.rtm_version);
1718 		return (NULL);
1719 	}
1720 	if (rtmsg.hdr.rtm_msglen != (ushort_t)l) {
1721 		zerror(zlogp, B_FALSE, "message length mismatch, "
1722 		    "expected %d bytes, returned %d bytes",
1723 		    rtmsg.hdr.rtm_msglen, l);
1724 		return (NULL);
1725 	}
1726 	if (rtmsg.hdr.rtm_errno != 0)  {
1727 		errno = rtmsg.hdr.rtm_errno;
1728 		zerror(zlogp, B_TRUE, "RTM_GET routing socket message");
1729 		return (NULL);
1730 	}
1731 	if ((rtmsg.hdr.rtm_addrs & RTA_IFP) == 0) {
1732 		zerror(zlogp, B_FALSE, "interface not found");
1733 		return (NULL);
1734 	}
1735 	cp = ((char *)(&rtmsg.hdr + 1));
1736 	for (i = 1; i != 0; i <<= 1) {
1737 		/* LINTED E_BAD_PTR_CAST_ALIGN */
1738 		sa = (struct sockaddr *)cp;
1739 		if (i != RTA_IFP) {
1740 			if ((i & rtmsg.hdr.rtm_addrs) != 0)
1741 				cp += ROUNDUP_LONG(salen(sa));
1742 			continue;
1743 		}
1744 		if (sa->sa_family == AF_LINK &&
1745 		    ((struct sockaddr_dl *)sa)->sdl_nlen != 0)
1746 			ifp = (struct sockaddr_dl *)sa;
1747 		break;
1748 	}
1749 	if (ifp == NULL) {
1750 		zerror(zlogp, B_FALSE, "interface could not be determined");
1751 		return (NULL);
1752 	}
1753 
1754 	/*
1755 	 * We need to set the I/F name to what we got above, then do the
1756 	 * appropriate ioctl to get its zone name.  But lifr->lifr_name is
1757 	 * used by the calling function to do a REMOVEIF, so if we leave the
1758 	 * "good" zone's I/F name in place, *that* I/F will be removed instead
1759 	 * of the bad one.  So we save the old (bad) I/F name before over-
1760 	 * writing it and doing the ioctl, then restore it after the ioctl.
1761 	 */
1762 	(void) strlcpy(save_if_name, lifr->lifr_name, sizeof (save_if_name));
1763 	(void) strncpy(lifr->lifr_name, ifp->sdl_data, ifp->sdl_nlen);
1764 	lifr->lifr_name[ifp->sdl_nlen] = '\0';
1765 	i = ioctl(s, SIOCGLIFZONE, lifr);
1766 	(void) strlcpy(lifr->lifr_name, save_if_name, sizeof (save_if_name));
1767 	if (i < 0) {
1768 		zerror(zlogp, B_TRUE,
1769 		    "%s: could not determine the zone interface belongs to",
1770 		    lifr->lifr_name);
1771 		return (NULL);
1772 	}
1773 	if (getzonenamebyid(lifr->lifr_zoneid, answer, sizeof (answer)) < 0)
1774 		(void) snprintf(answer, sizeof (answer), "%d",
1775 		    lifr->lifr_zoneid);
1776 
1777 	if (strlen(answer) > 0)
1778 		return (answer);
1779 	return (NULL);
1780 }
1781 
/*
 * Layout of a routing socket message as used by configure_one_interface():
 * a routing message header followed by destination, gateway and netmask
 * socket addresses, in either their IPv4 or their IPv6 form.
 */
typedef struct mcast_rtmsg_s {
	struct rt_msghdr	m_rtm;
	union {
		struct {
			struct sockaddr_in	m_dst;
			struct sockaddr_in	m_gw;
			struct sockaddr_in	m_netmask;
		} m_v4;
		struct {
			struct sockaddr_in6	m_dst;
			struct sockaddr_in6	m_gw;
			struct sockaddr_in6	m_netmask;
		} m_v6;
	} m_u;
} mcast_rtmsg_t;
/* Shorthand for the per-family members of m_u. */
#define	m_dst4		m_u.m_v4.m_dst
#define	m_dst6		m_u.m_v6.m_dst
#define	m_gw4		m_u.m_v4.m_gw
#define	m_gw6		m_u.m_v6.m_gw
#define	m_netmask4	m_u.m_v4.m_netmask
#define	m_netmask6	m_u.m_v6.m_netmask
1803 
1804 /*
1805  * Configures a single interface: a new virtual interface is added, based on
1806  * the physical interface nwiftabptr->zone_nwif_physical, with the address
1807  * specified in nwiftabptr->zone_nwif_address, for zone zone_id.  Note that
1808  * the "address" can be an IPv6 address (with a /prefixlength required), an
1809  * IPv4 address (with a /prefixlength optional), or a name; for the latter,
1810  * an IPv4 name-to-address resolution will be attempted.
1811  *
1812  * A default interface route for multicast is created on the first IPv4 and
1813  * IPv6 interfaces (that have the IFF_MULTICAST flag set), respectively.
1814  * This should really be done in the init scripts if we ever allow zones to
1815  * modify the routing tables.
1816  *
1817  * If anything goes wrong, we log a detailed error message, attempt to tear
1818  * down whatever we set up and return an error.
1819  */
1820 static int
1821 configure_one_interface(zlog_t *zlogp, zoneid_t zone_id,
1822     struct zone_nwiftab *nwiftabptr, boolean_t *mcast_rt_v4_setp,
1823     boolean_t *mcast_rt_v6_setp)
1824 {
1825 	struct lifreq lifr;
1826 	struct sockaddr_in netmask4;
1827 	struct sockaddr_in6 netmask6;
1828 	struct in_addr in4;
1829 	struct in6_addr in6;
1830 	sa_family_t af;
1831 	char *slashp = strchr(nwiftabptr->zone_nwif_address, '/');
1832 	mcast_rtmsg_t mcast_rtmsg;
1833 	int s;
1834 	int rs;
1835 	int rlen;
1836 	boolean_t got_netmask = B_FALSE;
1837 	char addrstr4[INET_ADDRSTRLEN];
1838 	int res;
1839 
1840 	res = zonecfg_valid_net_address(nwiftabptr->zone_nwif_address, &lifr);
1841 	if (res != Z_OK) {
1842 		zerror(zlogp, B_FALSE, "%s: %s", zonecfg_strerror(res),
1843 		    nwiftabptr->zone_nwif_address);
1844 		return (-1);
1845 	}
1846 	af = lifr.lifr_addr.ss_family;
1847 	if (af == AF_INET)
1848 		in4 = ((struct sockaddr_in *)(&lifr.lifr_addr))->sin_addr;
1849 	else
1850 		in6 = ((struct sockaddr_in6 *)(&lifr.lifr_addr))->sin6_addr;
1851 
1852 	if ((s = socket(af, SOCK_DGRAM, 0)) < 0) {
1853 		zerror(zlogp, B_TRUE, "could not get socket");
1854 		return (-1);
1855 	}
1856 
1857 	(void) strlcpy(lifr.lifr_name, nwiftabptr->zone_nwif_physical,
1858 	    sizeof (lifr.lifr_name));
1859 	if (ioctl(s, SIOCLIFADDIF, (caddr_t)&lifr) < 0) {
1860 		/*
1861 		 * Here, we know that the interface can't be brought up.
1862 		 * A similar warning message was already printed out to
1863 		 * the console by zoneadm(1M) so instead we log the
1864 		 * message to syslog and continue.
1865 		 */
1866 		zerror(&logsys, B_TRUE, "WARNING: skipping interface "
1867 		    "'%s' which may not be present/plumbed in the "
1868 		    "global zone.", lifr.lifr_name);
1869 		(void) close(s);
1870 		return (Z_OK);
1871 	}
1872 
1873 	if (ioctl(s, SIOCSLIFADDR, (caddr_t)&lifr) < 0) {
1874 		zerror(zlogp, B_TRUE,
1875 		    "%s: could not set IP address to %s",
1876 		    lifr.lifr_name, nwiftabptr->zone_nwif_address);
1877 		goto bad;
1878 	}
1879 
1880 	/* Preserve literal IPv4 address for later potential printing. */
1881 	if (af == AF_INET)
1882 		(void) inet_ntop(AF_INET, &in4, addrstr4, INET_ADDRSTRLEN);
1883 
1884 	lifr.lifr_zoneid = zone_id;
1885 	if (ioctl(s, SIOCSLIFZONE, (caddr_t)&lifr) < 0) {
1886 		zerror(zlogp, B_TRUE, "%s: could not place interface into zone",
1887 		    lifr.lifr_name);
1888 		goto bad;
1889 	}
1890 
1891 	if (strcmp(nwiftabptr->zone_nwif_physical, "lo0") == 0) {
1892 		got_netmask = B_TRUE;	/* default setting will be correct */
1893 	} else {
1894 		if (af == AF_INET) {
1895 			/*
1896 			 * The IPv4 netmask can be determined either
1897 			 * directly if a prefix length was supplied with
1898 			 * the address or via the netmasks database.  Not
1899 			 * being able to determine it is a common failure,
1900 			 * but it often is not fatal to operation of the
1901 			 * interface.  In that case, a warning will be
1902 			 * printed after the rest of the interface's
1903 			 * parameters have been configured.
1904 			 */
1905 			(void) memset(&netmask4, 0, sizeof (netmask4));
1906 			if (slashp != NULL) {
1907 				if (addr2netmask(slashp + 1, V4_ADDR_LEN,
1908 				    (uchar_t *)&netmask4.sin_addr) != 0) {
1909 					*slashp = '/';
1910 					zerror(zlogp, B_FALSE,
1911 					    "%s: invalid prefix length in %s",
1912 					    lifr.lifr_name,
1913 					    nwiftabptr->zone_nwif_address);
1914 					goto bad;
1915 				}
1916 				got_netmask = B_TRUE;
1917 			} else if (getnetmaskbyaddr(in4,
1918 			    &netmask4.sin_addr) == 0) {
1919 				got_netmask = B_TRUE;
1920 			}
1921 			if (got_netmask) {
1922 				netmask4.sin_family = af;
1923 				(void) memcpy(&lifr.lifr_addr, &netmask4,
1924 				    sizeof (netmask4));
1925 			}
1926 		} else {
1927 			(void) memset(&netmask6, 0, sizeof (netmask6));
1928 			if (addr2netmask(slashp + 1, V6_ADDR_LEN,
1929 			    (uchar_t *)&netmask6.sin6_addr) != 0) {
1930 				*slashp = '/';
1931 				zerror(zlogp, B_FALSE,
1932 				    "%s: invalid prefix length in %s",
1933 				    lifr.lifr_name,
1934 				    nwiftabptr->zone_nwif_address);
1935 				goto bad;
1936 			}
1937 			got_netmask = B_TRUE;
1938 			netmask6.sin6_family = af;
1939 			(void) memcpy(&lifr.lifr_addr, &netmask6,
1940 			    sizeof (netmask6));
1941 		}
1942 		if (got_netmask &&
1943 		    ioctl(s, SIOCSLIFNETMASK, (caddr_t)&lifr) < 0) {
1944 			zerror(zlogp, B_TRUE, "%s: could not set netmask",
1945 			    lifr.lifr_name);
1946 			goto bad;
1947 		}
1948 
1949 		/*
1950 		 * This doesn't set the broadcast address at all. Rather, it
1951 		 * gets, then sets the interface's address, relying on the fact
1952 		 * that resetting the address will reset the broadcast address.
1953 		 */
1954 		if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) < 0) {
1955 			zerror(zlogp, B_TRUE, "%s: could not get address",
1956 			    lifr.lifr_name);
1957 			goto bad;
1958 		}
1959 		if (ioctl(s, SIOCSLIFADDR, (caddr_t)&lifr) < 0) {
1960 			zerror(zlogp, B_TRUE,
1961 			    "%s: could not reset broadcast address",
1962 			    lifr.lifr_name);
1963 			goto bad;
1964 		}
1965 	}
1966 
1967 	if (ioctl(s, SIOCGLIFFLAGS, (caddr_t)&lifr) < 0) {
1968 		zerror(zlogp, B_TRUE, "%s: could not get flags",
1969 		    lifr.lifr_name);
1970 		goto bad;
1971 	}
1972 	lifr.lifr_flags |= IFF_UP;
1973 	if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) {
1974 		int save_errno = errno;
1975 		char *zone_using;
1976 
1977 		/*
1978 		 * If we failed with something other than EADDRNOTAVAIL,
1979 		 * then skip to the end.  Otherwise, look up our address,
1980 		 * then call a function to determine which zone is already
1981 		 * using that address.
1982 		 */
1983 		if (errno != EADDRNOTAVAIL) {
1984 			zerror(zlogp, B_TRUE,
1985 			    "%s: could not bring interface up", lifr.lifr_name);
1986 			goto bad;
1987 		}
1988 		if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) < 0) {
1989 			zerror(zlogp, B_TRUE, "%s: could not get address",
1990 			    lifr.lifr_name);
1991 			goto bad;
1992 		}
1993 		zone_using = who_is_using(zlogp, &lifr);
1994 		errno = save_errno;
1995 		if (zone_using == NULL)
1996 			zerror(zlogp, B_TRUE,
1997 			    "%s: could not bring interface up", lifr.lifr_name);
1998 		else
1999 			zerror(zlogp, B_TRUE, "%s: could not bring interface "
2000 			    "up: address in use by zone '%s'", lifr.lifr_name,
2001 			    zone_using);
2002 		goto bad;
2003 	}
2004 	if ((lifr.lifr_flags & IFF_MULTICAST) && ((af == AF_INET &&
2005 	    mcast_rt_v4_setp != NULL && *mcast_rt_v4_setp == B_FALSE) ||
2006 	    (af == AF_INET6 &&
2007 	    mcast_rt_v6_setp != NULL && *mcast_rt_v6_setp == B_FALSE))) {
2008 		rs = socket(PF_ROUTE, SOCK_RAW, 0);
2009 		if (rs < 0) {
2010 			zerror(zlogp, B_TRUE, "%s: could not create "
2011 			    "routing socket", lifr.lifr_name);
2012 			goto bad;
2013 		}
2014 		(void) shutdown(rs, 0);
2015 		(void) memset((void *)&mcast_rtmsg, 0, sizeof (mcast_rtmsg_t));
2016 		mcast_rtmsg.m_rtm.rtm_msglen =  sizeof (struct rt_msghdr) +
2017 		    3 * (af == AF_INET ? sizeof (struct sockaddr_in) :
2018 		    sizeof (struct sockaddr_in6));
2019 		mcast_rtmsg.m_rtm.rtm_version = RTM_VERSION;
2020 		mcast_rtmsg.m_rtm.rtm_type = RTM_ADD;
2021 		mcast_rtmsg.m_rtm.rtm_flags = RTF_UP;
2022 		mcast_rtmsg.m_rtm.rtm_addrs =
2023 		    RTA_DST | RTA_GATEWAY | RTA_NETMASK;
2024 		mcast_rtmsg.m_rtm.rtm_seq = ++rts_seqno;
2025 		if (af == AF_INET) {
2026 			mcast_rtmsg.m_dst4.sin_family = AF_INET;
2027 			mcast_rtmsg.m_dst4.sin_addr.s_addr =
2028 			    htonl(INADDR_UNSPEC_GROUP);
2029 			mcast_rtmsg.m_gw4.sin_family = AF_INET;
2030 			mcast_rtmsg.m_gw4.sin_addr = in4;
2031 			mcast_rtmsg.m_netmask4.sin_family = AF_INET;
2032 			mcast_rtmsg.m_netmask4.sin_addr.s_addr =
2033 			    htonl(IN_CLASSD_NET);
2034 		} else {
2035 			mcast_rtmsg.m_dst6.sin6_family = AF_INET6;
2036 			mcast_rtmsg.m_dst6.sin6_addr.s6_addr[0] = 0xffU;
2037 			mcast_rtmsg.m_gw6.sin6_family = AF_INET6;
2038 			mcast_rtmsg.m_gw6.sin6_addr = in6;
2039 			mcast_rtmsg.m_netmask6.sin6_family = AF_INET6;
2040 			mcast_rtmsg.m_netmask6.sin6_addr.s6_addr[0] = 0xffU;
2041 		}
2042 		rlen = write(rs, (char *)&mcast_rtmsg,
2043 		    mcast_rtmsg.m_rtm.rtm_msglen);
2044 		/*
2045 		 * The write to the multicast socket will fail if the
2046 		 * interface belongs to a failed IPMP group. This is a
2047 		 * non-fatal error and the zone will continue booting.
2048 		 * While the zone is running, if any interface in the
2049 		 * failed IPMP group recovers, the zone will fallback to
2050 		 * using that interface.
2051 		 */
2052 		if (rlen < mcast_rtmsg.m_rtm.rtm_msglen) {
2053 			if (rlen < 0) {
2054 				zerror(zlogp, B_TRUE, "WARNING: interface "
2055 				    "'%s' not available as default for "
2056 				    "multicast.", lifr.lifr_name);
2057 			} else {
2058 				zerror(zlogp, B_FALSE, "WARNING: interface "
2059 				    "'%s' not available as default for "
2060 				    "multicast; routing socket returned "
2061 				    "unexpected %d bytes.",
2062 				    lifr.lifr_name, rlen);
2063 			}
2064 		} else {
2065 
2066 			if (af == AF_INET) {
2067 				*mcast_rt_v4_setp = B_TRUE;
2068 			} else {
2069 				*mcast_rt_v6_setp = B_TRUE;
2070 			}
2071 		}
2072 		(void) close(rs);
2073 	}
2074 
2075 	if (!got_netmask) {
2076 		/*
2077 		 * A common, but often non-fatal problem, is that the system
2078 		 * cannot find the netmask for an interface address. This is
2079 		 * often caused by it being only in /etc/inet/netmasks, but
2080 		 * /etc/nsswitch.conf says to use NIS or NIS+ and it's not
2081 		 * in that. This doesn't show up at boot because the netmask
2082 		 * is obtained from /etc/inet/netmasks when no network
2083 		 * interfaces are up, but isn't consulted when NIS/NIS+ is
2084 		 * available. We warn the user here that something like this
2085 		 * has happened and we're just running with a default and
2086 		 * possible incorrect netmask.
2087 		 */
2088 		char buffer[INET6_ADDRSTRLEN];
2089 		void  *addr;
2090 
2091 		if (af == AF_INET)
2092 			addr = &((struct sockaddr_in *)
2093 			    (&lifr.lifr_addr))->sin_addr;
2094 		else
2095 			addr = &((struct sockaddr_in6 *)
2096 			    (&lifr.lifr_addr))->sin6_addr;
2097 
2098 		/* Find out what netmask interface is going to be using */
2099 		if (ioctl(s, SIOCGLIFNETMASK, (caddr_t)&lifr) < 0 ||
2100 		    inet_ntop(af, addr, buffer, sizeof (buffer)) == NULL)
2101 			goto bad;
2102 		zerror(zlogp, B_FALSE,
2103 		    "WARNING: %s: no matching subnet found in netmasks(4) for "
2104 		    "%s; using default of %s.",
2105 		    lifr.lifr_name, addrstr4, buffer);
2106 	}
2107 
2108 	(void) close(s);
2109 	return (Z_OK);
2110 bad:
2111 	(void) ioctl(s, SIOCLIFREMOVEIF, (caddr_t)&lifr);
2112 	(void) close(s);
2113 	return (-1);
2114 }
2115 
2116 /*
2117  * Sets up network interfaces based on information from the zone configuration.
2118  * An IPv4 loopback interface is set up "for free", modeling the global system.
2119  * If any of the configuration interfaces were IPv6, then an IPv6 loopback
2120  * address is set up as well.
2121  *
2122  * If anything goes wrong, we log a general error message, attempt to tear down
2123  * whatever we set up, and return an error.
2124  */
2125 static int
2126 configure_network_interfaces(zlog_t *zlogp)
2127 {
2128 	zone_dochandle_t handle;
2129 	struct zone_nwiftab nwiftab, loopback_iftab;
2130 	boolean_t saw_v6 = B_FALSE;
2131 	boolean_t mcast_rt_v4_set = B_FALSE;
2132 	boolean_t mcast_rt_v6_set = B_FALSE;
2133 	zoneid_t zoneid;
2134 
2135 	if ((zoneid = getzoneidbyname(zone_name)) == ZONE_ID_UNDEFINED) {
2136 		zerror(zlogp, B_TRUE, "unable to get zoneid");
2137 		return (-1);
2138 	}
2139 
2140 	if ((handle = zonecfg_init_handle()) == NULL) {
2141 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2142 		return (-1);
2143 	}
2144 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2145 		zerror(zlogp, B_FALSE, "invalid configuration");
2146 		zonecfg_fini_handle(handle);
2147 		return (-1);
2148 	}
2149 	if (zonecfg_setnwifent(handle) == Z_OK) {
2150 		for (;;) {
2151 			struct in6_addr in6;
2152 
2153 			if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
2154 				break;
2155 			if (configure_one_interface(zlogp, zoneid,
2156 			    &nwiftab, &mcast_rt_v4_set, &mcast_rt_v6_set) !=
2157 			    Z_OK) {
2158 				(void) zonecfg_endnwifent(handle);
2159 				zonecfg_fini_handle(handle);
2160 				return (-1);
2161 			}
2162 			if (inet_pton(AF_INET6, nwiftab.zone_nwif_address,
2163 			    &in6) == 1)
2164 				saw_v6 = B_TRUE;
2165 		}
2166 		(void) zonecfg_endnwifent(handle);
2167 	}
2168 	zonecfg_fini_handle(handle);
2169 	(void) strlcpy(loopback_iftab.zone_nwif_physical, "lo0",
2170 	    sizeof (loopback_iftab.zone_nwif_physical));
2171 	(void) strlcpy(loopback_iftab.zone_nwif_address, "127.0.0.1",
2172 	    sizeof (loopback_iftab.zone_nwif_address));
2173 	if (configure_one_interface(zlogp, zoneid, &loopback_iftab, NULL, NULL)
2174 	    != Z_OK) {
2175 		return (-1);
2176 	}
2177 	if (saw_v6) {
2178 		(void) strlcpy(loopback_iftab.zone_nwif_address, "::1/128",
2179 		    sizeof (loopback_iftab.zone_nwif_address));
2180 		if (configure_one_interface(zlogp, zoneid,
2181 		    &loopback_iftab, NULL, NULL) != Z_OK) {
2182 			return (-1);
2183 		}
2184 	}
2185 	return (0);
2186 }
2187 
2188 static int
2189 tcp_abort_conn(zlog_t *zlogp, zoneid_t zoneid,
2190     const struct sockaddr_storage *local, const struct sockaddr_storage *remote)
2191 {
2192 	int fd;
2193 	struct strioctl ioc;
2194 	tcp_ioc_abort_conn_t conn;
2195 	int error;
2196 
2197 	conn.ac_local = *local;
2198 	conn.ac_remote = *remote;
2199 	conn.ac_start = TCPS_SYN_SENT;
2200 	conn.ac_end = TCPS_TIME_WAIT;
2201 	conn.ac_zoneid = zoneid;
2202 
2203 	ioc.ic_cmd = TCP_IOC_ABORT_CONN;
2204 	ioc.ic_timout = -1; /* infinite timeout */
2205 	ioc.ic_len = sizeof (conn);
2206 	ioc.ic_dp = (char *)&conn;
2207 
2208 	if ((fd = open("/dev/tcp", O_RDONLY)) < 0) {
2209 		zerror(zlogp, B_TRUE, "unable to open %s", "/dev/tcp");
2210 		return (-1);
2211 	}
2212 
2213 	error = ioctl(fd, I_STR, &ioc);
2214 	(void) close(fd);
2215 	if (error == 0 || errno == ENOENT)	/* ENOENT is not an error */
2216 		return (0);
2217 	return (-1);
2218 }
2219 
2220 static int
2221 tcp_abort_connections(zlog_t *zlogp, zoneid_t zoneid)
2222 {
2223 	struct sockaddr_storage l, r;
2224 	struct sockaddr_in *local, *remote;
2225 	struct sockaddr_in6 *local6, *remote6;
2226 	int error;
2227 
2228 	/*
2229 	 * Abort IPv4 connections.
2230 	 */
2231 	bzero(&l, sizeof (*local));
2232 	local = (struct sockaddr_in *)&l;
2233 	local->sin_family = AF_INET;
2234 	local->sin_addr.s_addr = INADDR_ANY;
2235 	local->sin_port = 0;
2236 
2237 	bzero(&r, sizeof (*remote));
2238 	remote = (struct sockaddr_in *)&r;
2239 	remote->sin_family = AF_INET;
2240 	remote->sin_addr.s_addr = INADDR_ANY;
2241 	remote->sin_port = 0;
2242 
2243 	if ((error = tcp_abort_conn(zlogp, zoneid, &l, &r)) != 0)
2244 		return (error);
2245 
2246 	/*
2247 	 * Abort IPv6 connections.
2248 	 */
2249 	bzero(&l, sizeof (*local6));
2250 	local6 = (struct sockaddr_in6 *)&l;
2251 	local6->sin6_family = AF_INET6;
2252 	local6->sin6_port = 0;
2253 	local6->sin6_addr = in6addr_any;
2254 
2255 	bzero(&r, sizeof (*remote6));
2256 	remote6 = (struct sockaddr_in6 *)&r;
2257 	remote6->sin6_family = AF_INET6;
2258 	remote6->sin6_port = 0;
2259 	remote6->sin6_addr = in6addr_any;
2260 
2261 	if ((error = tcp_abort_conn(zlogp, zoneid, &l, &r)) != 0)
2262 		return (error);
2263 	return (0);
2264 }
2265 
2266 static int
2267 get_privset(zlog_t *zlogp, priv_set_t *privs, boolean_t mount_cmd)
2268 {
2269 	int error = -1;
2270 	zone_dochandle_t handle;
2271 	char *privname = NULL;
2272 
2273 	if (mount_cmd) {
2274 		if (zonecfg_default_privset(privs) == Z_OK)
2275 			return (0);
2276 		zerror(zlogp, B_FALSE,
2277 		    "failed to determine the zone's default privilege set");
2278 		return (-1);
2279 	}
2280 
2281 	if ((handle = zonecfg_init_handle()) == NULL) {
2282 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2283 		return (-1);
2284 	}
2285 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2286 		zerror(zlogp, B_FALSE, "invalid configuration");
2287 		zonecfg_fini_handle(handle);
2288 		return (-1);
2289 	}
2290 
2291 	switch (zonecfg_get_privset(handle, privs, &privname)) {
2292 	case Z_OK:
2293 		error = 0;
2294 		break;
2295 	case Z_PRIV_PROHIBITED:
2296 		zerror(zlogp, B_FALSE, "privilege \"%s\" is not permitted "
2297 		    "within the zone's privilege set", privname);
2298 		break;
2299 	case Z_PRIV_REQUIRED:
2300 		zerror(zlogp, B_FALSE, "required privilege \"%s\" is missing "
2301 		    "from the zone's privilege set", privname);
2302 		break;
2303 	case Z_PRIV_UNKNOWN:
2304 		zerror(zlogp, B_FALSE, "unknown privilege \"%s\" specified "
2305 		    "in the zone's privilege set", privname);
2306 		break;
2307 	default:
2308 		zerror(zlogp, B_FALSE, "failed to determine the zone's "
2309 		    "privilege set");
2310 		break;
2311 	}
2312 
2313 	free(privname);
2314 	zonecfg_fini_handle(handle);
2315 	return (error);
2316 }
2317 
2318 static int
2319 get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep)
2320 {
2321 	nvlist_t *nvl = NULL;
2322 	char *nvl_packed = NULL;
2323 	size_t nvl_size = 0;
2324 	nvlist_t **nvlv = NULL;
2325 	int rctlcount = 0;
2326 	int error = -1;
2327 	zone_dochandle_t handle;
2328 	struct zone_rctltab rctltab;
2329 	rctlblk_t *rctlblk = NULL;
2330 
2331 	*bufp = NULL;
2332 	*bufsizep = 0;
2333 
2334 	if ((handle = zonecfg_init_handle()) == NULL) {
2335 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2336 		return (-1);
2337 	}
2338 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2339 		zerror(zlogp, B_FALSE, "invalid configuration");
2340 		zonecfg_fini_handle(handle);
2341 		return (-1);
2342 	}
2343 
2344 	rctltab.zone_rctl_valptr = NULL;
2345 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) {
2346 		zerror(zlogp, B_TRUE, "%s failed", "nvlist_alloc");
2347 		goto out;
2348 	}
2349 
2350 	if (zonecfg_setrctlent(handle) != Z_OK) {
2351 		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setrctlent");
2352 		goto out;
2353 	}
2354 
2355 	if ((rctlblk = malloc(rctlblk_size())) == NULL) {
2356 		zerror(zlogp, B_TRUE, "memory allocation failed");
2357 		goto out;
2358 	}
2359 	while (zonecfg_getrctlent(handle, &rctltab) == Z_OK) {
2360 		struct zone_rctlvaltab *rctlval;
2361 		uint_t i, count;
2362 		const char *name = rctltab.zone_rctl_name;
2363 
2364 		/* zoneadm should have already warned about unknown rctls. */
2365 		if (!zonecfg_is_rctl(name)) {
2366 			zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr);
2367 			rctltab.zone_rctl_valptr = NULL;
2368 			continue;
2369 		}
2370 		count = 0;
2371 		for (rctlval = rctltab.zone_rctl_valptr; rctlval != NULL;
2372 		    rctlval = rctlval->zone_rctlval_next) {
2373 			count++;
2374 		}
2375 		if (count == 0) {	/* ignore */
2376 			continue;	/* Nothing to free */
2377 		}
2378 		if ((nvlv = malloc(sizeof (*nvlv) * count)) == NULL)
2379 			goto out;
2380 		i = 0;
2381 		for (rctlval = rctltab.zone_rctl_valptr; rctlval != NULL;
2382 		    rctlval = rctlval->zone_rctlval_next, i++) {
2383 			if (nvlist_alloc(&nvlv[i], NV_UNIQUE_NAME, 0) != 0) {
2384 				zerror(zlogp, B_TRUE, "%s failed",
2385 				    "nvlist_alloc");
2386 				goto out;
2387 			}
2388 			if (zonecfg_construct_rctlblk(rctlval, rctlblk)
2389 			    != Z_OK) {
2390 				zerror(zlogp, B_FALSE, "invalid rctl value: "
2391 				    "(priv=%s,limit=%s,action=%s)",
2392 				    rctlval->zone_rctlval_priv,
2393 				    rctlval->zone_rctlval_limit,
2394 				    rctlval->zone_rctlval_action);
2395 				goto out;
2396 			}
2397 			if (!zonecfg_valid_rctl(name, rctlblk)) {
2398 				zerror(zlogp, B_FALSE,
2399 				    "(priv=%s,limit=%s,action=%s) is not a "
2400 				    "valid value for rctl '%s'",
2401 				    rctlval->zone_rctlval_priv,
2402 				    rctlval->zone_rctlval_limit,
2403 				    rctlval->zone_rctlval_action,
2404 				    name);
2405 				goto out;
2406 			}
2407 			if (nvlist_add_uint64(nvlv[i], "privilege",
2408 			    rctlblk_get_privilege(rctlblk)) != 0) {
2409 				zerror(zlogp, B_FALSE, "%s failed",
2410 				    "nvlist_add_uint64");
2411 				goto out;
2412 			}
2413 			if (nvlist_add_uint64(nvlv[i], "limit",
2414 			    rctlblk_get_value(rctlblk)) != 0) {
2415 				zerror(zlogp, B_FALSE, "%s failed",
2416 				    "nvlist_add_uint64");
2417 				goto out;
2418 			}
2419 			if (nvlist_add_uint64(nvlv[i], "action",
2420 			    (uint_t)rctlblk_get_local_action(rctlblk, NULL))
2421 			    != 0) {
2422 				zerror(zlogp, B_FALSE, "%s failed",
2423 				    "nvlist_add_uint64");
2424 				goto out;
2425 			}
2426 		}
2427 		zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr);
2428 		rctltab.zone_rctl_valptr = NULL;
2429 		if (nvlist_add_nvlist_array(nvl, (char *)name, nvlv, count)
2430 		    != 0) {
2431 			zerror(zlogp, B_FALSE, "%s failed",
2432 			    "nvlist_add_nvlist_array");
2433 			goto out;
2434 		}
2435 		for (i = 0; i < count; i++)
2436 			nvlist_free(nvlv[i]);
2437 		free(nvlv);
2438 		nvlv = NULL;
2439 		rctlcount++;
2440 	}
2441 	(void) zonecfg_endrctlent(handle);
2442 
2443 	if (rctlcount == 0) {
2444 		error = 0;
2445 		goto out;
2446 	}
2447 	if (nvlist_pack(nvl, &nvl_packed, &nvl_size, NV_ENCODE_NATIVE, 0)
2448 	    != 0) {
2449 		zerror(zlogp, B_FALSE, "%s failed", "nvlist_pack");
2450 		goto out;
2451 	}
2452 
2453 	error = 0;
2454 	*bufp = nvl_packed;
2455 	*bufsizep = nvl_size;
2456 
2457 out:
2458 	free(rctlblk);
2459 	zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr);
2460 	if (error && nvl_packed != NULL)
2461 		free(nvl_packed);
2462 	if (nvl != NULL)
2463 		nvlist_free(nvl);
2464 	if (nvlv != NULL)
2465 		free(nvlv);
2466 	if (handle != NULL)
2467 		zonecfg_fini_handle(handle);
2468 	return (error);
2469 }
2470 
2471 static int
2472 get_zone_pool(zlog_t *zlogp, char *poolbuf, size_t bufsz)
2473 {
2474 	zone_dochandle_t handle;
2475 	int error;
2476 
2477 	if ((handle = zonecfg_init_handle()) == NULL) {
2478 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2479 		return (Z_NOMEM);
2480 	}
2481 	error = zonecfg_get_snapshot_handle(zone_name, handle);
2482 	if (error != Z_OK) {
2483 		zerror(zlogp, B_FALSE, "invalid configuration");
2484 		zonecfg_fini_handle(handle);
2485 		return (error);
2486 	}
2487 	error = zonecfg_get_pool(handle, poolbuf, bufsz);
2488 	zonecfg_fini_handle(handle);
2489 	return (error);
2490 }
2491 
2492 static int
2493 get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
2494 {
2495 	zone_dochandle_t handle;
2496 	struct zone_dstab dstab;
2497 	size_t total, offset, len;
2498 	int error = -1;
2499 	char *str;
2500 
2501 	*bufp = NULL;
2502 	*bufsizep = 0;
2503 
2504 	if ((handle = zonecfg_init_handle()) == NULL) {
2505 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2506 		return (-1);
2507 	}
2508 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2509 		zerror(zlogp, B_FALSE, "invalid configuration");
2510 		zonecfg_fini_handle(handle);
2511 		return (-1);
2512 	}
2513 
2514 	if (zonecfg_setdsent(handle) != Z_OK) {
2515 		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent");
2516 		goto out;
2517 	}
2518 
2519 	total = 0;
2520 	while (zonecfg_getdsent(handle, &dstab) == Z_OK)
2521 		total += strlen(dstab.zone_dataset_name) + 1;
2522 	(void) zonecfg_enddsent(handle);
2523 
2524 	if (total == 0) {
2525 		error = 0;
2526 		goto out;
2527 	}
2528 
2529 	if ((str = malloc(total)) == NULL) {
2530 		zerror(zlogp, B_TRUE, "memory allocation failed");
2531 		goto out;
2532 	}
2533 
2534 	if (zonecfg_setdsent(handle) != Z_OK) {
2535 		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent");
2536 		goto out;
2537 	}
2538 	offset = 0;
2539 	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
2540 		len = strlen(dstab.zone_dataset_name);
2541 		(void) strlcpy(str + offset, dstab.zone_dataset_name,
2542 		    sizeof (dstab.zone_dataset_name) - offset);
2543 		offset += len;
2544 		if (offset != total - 1)
2545 			str[offset++] = ',';
2546 	}
2547 	(void) zonecfg_enddsent(handle);
2548 
2549 	error = 0;
2550 	*bufp = str;
2551 	*bufsizep = total;
2552 
2553 out:
2554 	if (error != 0 && str != NULL)
2555 		free(str);
2556 	if (handle != NULL)
2557 		zonecfg_fini_handle(handle);
2558 
2559 	return (error);
2560 }
2561 
2562 static int
2563 validate_datasets(zlog_t *zlogp)
2564 {
2565 	zone_dochandle_t handle;
2566 	struct zone_dstab dstab;
2567 	zfs_handle_t *zhp;
2568 	libzfs_handle_t *hdl;
2569 
2570 	if ((handle = zonecfg_init_handle()) == NULL) {
2571 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2572 		return (-1);
2573 	}
2574 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2575 		zerror(zlogp, B_FALSE, "invalid configuration");
2576 		zonecfg_fini_handle(handle);
2577 		return (-1);
2578 	}
2579 
2580 	if (zonecfg_setdsent(handle) != Z_OK) {
2581 		zerror(zlogp, B_FALSE, "invalid configuration");
2582 		zonecfg_fini_handle(handle);
2583 		return (-1);
2584 	}
2585 
2586 	if ((hdl = libzfs_init()) == NULL) {
2587 		zerror(zlogp, B_FALSE, "opening ZFS library");
2588 		zonecfg_fini_handle(handle);
2589 		return (-1);
2590 	}
2591 
2592 	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
2593 
2594 		if ((zhp = zfs_open(hdl, dstab.zone_dataset_name,
2595 		    ZFS_TYPE_FILESYSTEM)) == NULL) {
2596 			zerror(zlogp, B_FALSE, "cannot open ZFS dataset '%s'",
2597 			    dstab.zone_dataset_name);
2598 			zonecfg_fini_handle(handle);
2599 			libzfs_fini(hdl);
2600 			return (-1);
2601 		}
2602 
2603 		/*
2604 		 * Automatically set the 'zoned' property.  We check the value
2605 		 * first because we'll get EPERM if it is already set.
2606 		 */
2607 		if (!zfs_prop_get_int(zhp, ZFS_PROP_ZONED) &&
2608 		    zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_ZONED),
2609 		    "on") != 0) {
2610 			zerror(zlogp, B_FALSE, "cannot set 'zoned' "
2611 			    "property for ZFS dataset '%s'\n",
2612 			    dstab.zone_dataset_name);
2613 			zonecfg_fini_handle(handle);
2614 			zfs_close(zhp);
2615 			libzfs_fini(hdl);
2616 			return (-1);
2617 		}
2618 
2619 		zfs_close(zhp);
2620 	}
2621 	(void) zonecfg_enddsent(handle);
2622 
2623 	zonecfg_fini_handle(handle);
2624 	libzfs_fini(hdl);
2625 
2626 	return (0);
2627 }
2628 
2629 static int
2630 bind_to_pool(zlog_t *zlogp, zoneid_t zoneid)
2631 {
2632 	pool_conf_t *poolconf;
2633 	pool_t *pool;
2634 	char poolname[MAXPATHLEN];
2635 	int status;
2636 	int error;
2637 
2638 	/*
2639 	 * Find the pool mentioned in the zone configuration, and bind to it.
2640 	 */
2641 	error = get_zone_pool(zlogp, poolname, sizeof (poolname));
2642 	if (error == Z_NO_ENTRY || (error == Z_OK && strlen(poolname) == 0)) {
2643 		/*
2644 		 * The property is not set on the zone, so the pool
2645 		 * should be bound to the default pool.  But that's
2646 		 * already done by the kernel, so we can just return.
2647 		 */
2648 		return (0);
2649 	}
2650 	if (error != Z_OK) {
2651 		/*
2652 		 * Not an error, even though it shouldn't be happening.
2653 		 */
2654 		zerror(zlogp, B_FALSE,
2655 		    "WARNING: unable to retrieve default pool.");
2656 		return (0);
2657 	}
2658 	/*
2659 	 * Don't do anything if pools aren't enabled.
2660 	 */
2661 	if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED) {
2662 		zerror(zlogp, B_FALSE, "WARNING: pools facility not active; "
2663 		    "zone will not be bound to pool '%s'.", poolname);
2664 		return (0);
2665 	}
2666 	/*
2667 	 * Try to provide a sane error message if the requested pool doesn't
2668 	 * exist.
2669 	 */
2670 	if ((poolconf = pool_conf_alloc()) == NULL) {
2671 		zerror(zlogp, B_FALSE, "%s failed", "pool_conf_alloc");
2672 		return (-1);
2673 	}
2674 	if (pool_conf_open(poolconf, pool_dynamic_location(), PO_RDONLY) !=
2675 	    PO_SUCCESS) {
2676 		zerror(zlogp, B_FALSE, "%s failed", "pool_conf_open");
2677 		pool_conf_free(poolconf);
2678 		return (-1);
2679 	}
2680 	pool = pool_get_pool(poolconf, poolname);
2681 	(void) pool_conf_close(poolconf);
2682 	pool_conf_free(poolconf);
2683 	if (pool == NULL) {
2684 		zerror(zlogp, B_FALSE, "WARNING: pool '%s' not found; "
2685 		    "using default pool.", poolname);
2686 		return (0);
2687 	}
2688 	/*
2689 	 * Bind the zone to the pool.
2690 	 */
2691 	if (pool_set_binding(poolname, P_ZONEID, zoneid) != PO_SUCCESS) {
2692 		zerror(zlogp, B_FALSE, "WARNING: unable to bind to pool '%s'; "
2693 		    "using default pool.", poolname);
2694 	}
2695 	return (0);
2696 }
2697 
2698 /*
2699  * Mount lower level home directories into/from current zone
2700  * Share exported directories specified in dfstab for zone
2701  */
2702 static int
2703 tsol_mounts(zlog_t *zlogp, char *zone_name, char *rootpath)
2704 {
2705 	zoneid_t *zids = NULL;
2706 	priv_set_t *zid_privs;
2707 	const priv_impl_info_t *ip = NULL;
2708 	uint_t nzents_saved;
2709 	uint_t nzents;
2710 	int i;
2711 	char readonly[] = "ro";
2712 	struct zone_fstab lower_fstab;
2713 	char *argv[4];
2714 
2715 	if (!is_system_labeled())
2716 		return (0);
2717 
2718 	if (zid_label == NULL) {
2719 		zid_label = m_label_alloc(MAC_LABEL);
2720 		if (zid_label == NULL)
2721 			return (-1);
2722 	}
2723 
2724 	/* Make sure our zone has an /export/home dir */
2725 	(void) make_one_dir(zlogp, rootpath, "/export/home",
2726 	    DEFAULT_DIR_MODE);
2727 
2728 	lower_fstab.zone_fs_raw[0] = '\0';
2729 	(void) strlcpy(lower_fstab.zone_fs_type, MNTTYPE_LOFS,
2730 	    sizeof (lower_fstab.zone_fs_type));
2731 	lower_fstab.zone_fs_options = NULL;
2732 	(void) zonecfg_add_fs_option(&lower_fstab, readonly);
2733 
2734 	/*
2735 	 * Get the list of zones from the kernel
2736 	 */
2737 	if (zone_list(NULL, &nzents) != 0) {
2738 		zerror(zlogp, B_TRUE, "unable to list zones");
2739 		zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
2740 		return (-1);
2741 	}
2742 again:
2743 	if (nzents == 0) {
2744 		zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
2745 		return (-1);
2746 	}
2747 
2748 	zids = malloc(nzents * sizeof (zoneid_t));
2749 	if (zids == NULL) {
2750 		zerror(zlogp, B_TRUE, "memory allocation failed");
2751 		return (-1);
2752 	}
2753 	nzents_saved = nzents;
2754 
2755 	if (zone_list(zids, &nzents) != 0) {
2756 		zerror(zlogp, B_TRUE, "unable to list zones");
2757 		zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
2758 		free(zids);
2759 		return (-1);
2760 	}
2761 	if (nzents != nzents_saved) {
2762 		/* list changed, try again */
2763 		free(zids);
2764 		goto again;
2765 	}
2766 
2767 	ip = getprivimplinfo();
2768 	if ((zid_privs = priv_allocset()) == NULL) {
2769 		zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
2770 		zonecfg_free_fs_option_list(
2771 		    lower_fstab.zone_fs_options);
2772 		free(zids);
2773 		return (-1);
2774 	}
2775 
2776 	for (i = 0; i < nzents; i++) {
2777 		char zid_name[ZONENAME_MAX];
2778 		zone_state_t zid_state;
2779 		char zid_rpath[MAXPATHLEN];
2780 		struct stat stat_buf;
2781 
2782 		if (zids[i] == GLOBAL_ZONEID)
2783 			continue;
2784 
2785 		if (getzonenamebyid(zids[i], zid_name, ZONENAME_MAX) == -1)
2786 			continue;
2787 
2788 		/*
2789 		 * Do special setup for the zone we are booting
2790 		 */
2791 		if (strcmp(zid_name, zone_name) == 0) {
2792 			struct zone_fstab autofs_fstab;
2793 			char map_path[MAXPATHLEN];
2794 			int fd;
2795 
2796 			/*
2797 			 * Create auto_home_<zone> map for this zone
2798 			 * in the global zone. The local zone entry
2799 			 * will be created by automount when the zone
2800 			 * is booted.
2801 			 */
2802 
2803 			(void) snprintf(autofs_fstab.zone_fs_special,
2804 			    MAXPATHLEN, "auto_home_%s", zid_name);
2805 
2806 			(void) snprintf(autofs_fstab.zone_fs_dir, MAXPATHLEN,
2807 			    "/zone/%s/home", zid_name);
2808 
2809 			(void) snprintf(map_path, sizeof (map_path),
2810 			    "/etc/%s", autofs_fstab.zone_fs_special);
2811 			/*
2812 			 * If the map file doesn't exist create a template
2813 			 */
2814 			if ((fd = open(map_path, O_RDWR | O_CREAT | O_EXCL,
2815 			    S_IRUSR | S_IWUSR | S_IRGRP| S_IROTH)) != -1) {
2816 				int len;
2817 				char map_rec[MAXPATHLEN];
2818 
2819 				len = snprintf(map_rec, sizeof (map_rec),
2820 				    "+%s\n*\t-fstype=lofs\t:%s/export/home/&\n",
2821 				    autofs_fstab.zone_fs_special, rootpath);
2822 				(void) write(fd, map_rec, len);
2823 				(void) close(fd);
2824 			}
2825 
2826 			/*
2827 			 * Mount auto_home_<zone> in the global zone if absent.
2828 			 * If it's already of type autofs, then
2829 			 * don't mount it again.
2830 			 */
2831 			if ((stat(autofs_fstab.zone_fs_dir, &stat_buf) == -1) ||
2832 			    strcmp(stat_buf.st_fstype, MNTTYPE_AUTOFS) != 0) {
2833 				char optstr[] = "indirect,ignore,nobrowse";
2834 
2835 				(void) make_one_dir(zlogp, "",
2836 				    autofs_fstab.zone_fs_dir, DEFAULT_DIR_MODE);
2837 
2838 				/*
2839 				 * Mount will fail if automounter has already
2840 				 * processed the auto_home_<zonename> map
2841 				 */
2842 				(void) domount(zlogp, MNTTYPE_AUTOFS, optstr,
2843 				    autofs_fstab.zone_fs_special,
2844 				    autofs_fstab.zone_fs_dir);
2845 			}
2846 			continue;
2847 		}
2848 
2849 
2850 		if (zone_get_state(zid_name, &zid_state) != Z_OK ||
2851 		    (zid_state != ZONE_STATE_READY &&
2852 		    zid_state != ZONE_STATE_RUNNING))
2853 			/* Skip over zones without mounted filesystems */
2854 			continue;
2855 
2856 		if (zone_getattr(zids[i], ZONE_ATTR_SLBL, zid_label,
2857 		    sizeof (m_label_t)) < 0)
2858 			/* Skip over zones with unspecified label */
2859 			continue;
2860 
2861 		if (zone_getattr(zids[i], ZONE_ATTR_ROOT, zid_rpath,
2862 		    sizeof (zid_rpath)) == -1)
2863 			/* Skip over zones with bad path */
2864 			continue;
2865 
2866 		if (zone_getattr(zids[i], ZONE_ATTR_PRIVSET, zid_privs,
2867 		    sizeof (priv_chunk_t) * ip->priv_setsize) == -1)
2868 			/* Skip over zones with bad privs */
2869 			continue;
2870 
2871 		/*
2872 		 * Reading down is valid according to our label model
2873 		 * but some customers want to disable it because it
2874 		 * allows execute down and other possible attacks.
2875 		 * Therefore, we restrict this feature to zones that
2876 		 * have the NET_MAC_AWARE privilege which is required
2877 		 * for NFS read-down semantics.
2878 		 */
2879 		if ((bldominates(zlabel, zid_label)) &&
2880 		    (priv_ismember(zprivs, PRIV_NET_MAC_AWARE))) {
2881 			/*
2882 			 * Our zone dominates this one.
2883 			 * Create a lofs mount from lower zone's /export/home
2884 			 */
2885 			(void) snprintf(lower_fstab.zone_fs_dir, MAXPATHLEN,
2886 			    "%s/zone/%s/export/home", rootpath, zid_name);
2887 
2888 			/*
2889 			 * If the target is already an LOFS mount
2890 			 * then don't do it again.
2891 			 */
2892 			if ((stat(lower_fstab.zone_fs_dir, &stat_buf) == -1) ||
2893 			    strcmp(stat_buf.st_fstype, MNTTYPE_LOFS) != 0) {
2894 
2895 				if (snprintf(lower_fstab.zone_fs_special,
2896 				    MAXPATHLEN, "%s/export",
2897 				    zid_rpath) > MAXPATHLEN)
2898 					continue;
2899 
2900 				/*
2901 				 * Make sure the lower-level home exists
2902 				 */
2903 				if (make_one_dir(zlogp,
2904 				    lower_fstab.zone_fs_special,
2905 				    "/home", DEFAULT_DIR_MODE) != 0)
2906 					continue;
2907 
2908 				(void) strlcat(lower_fstab.zone_fs_special,
2909 				    "/home", MAXPATHLEN);
2910 
2911 				/*
2912 				 * Mount can fail because the lower-level
2913 				 * zone may have already done a mount up.
2914 				 */
2915 				(void) mount_one(zlogp, &lower_fstab, "");
2916 			}
2917 		} else if ((bldominates(zid_label, zlabel)) &&
2918 		    (priv_ismember(zid_privs, PRIV_NET_MAC_AWARE))) {
2919 			/*
2920 			 * This zone dominates our zone.
2921 			 * Create a lofs mount from our zone's /export/home
2922 			 */
2923 			if (snprintf(lower_fstab.zone_fs_dir, MAXPATHLEN,
2924 			    "%s/zone/%s/export/home", zid_rpath,
2925 			    zone_name) > MAXPATHLEN)
2926 				continue;
2927 
2928 			/*
2929 			 * If the target is already an LOFS mount
2930 			 * then don't do it again.
2931 			 */
2932 			if ((stat(lower_fstab.zone_fs_dir, &stat_buf) == -1) ||
2933 			    strcmp(stat_buf.st_fstype, MNTTYPE_LOFS) != 0) {
2934 
2935 				(void) snprintf(lower_fstab.zone_fs_special,
2936 				    MAXPATHLEN, "%s/export/home", rootpath);
2937 
2938 				/*
2939 				 * Mount can fail because the higher-level
2940 				 * zone may have already done a mount down.
2941 				 */
2942 				(void) mount_one(zlogp, &lower_fstab, "");
2943 			}
2944 		}
2945 	}
2946 	zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
2947 	priv_freeset(zid_privs);
2948 	free(zids);
2949 
2950 	/*
2951 	 * Now share any exported directories from this zone.
2952 	 * Each zone can have its own dfstab.
2953 	 */
2954 
2955 	argv[0] = "zoneshare";
2956 	argv[1] = "-z";
2957 	argv[2] = zone_name;
2958 	argv[3] = NULL;
2959 
2960 	(void) forkexec(zlogp, "/usr/lib/zones/zoneshare", argv);
2961 	/* Don't check for errors since they don't affect the zone */
2962 
2963 	return (0);
2964 }
2965 
2966 /*
2967  * Unmount lofs mounts from higher level zones
2968  * Unshare nfs exported directories
2969  */
2970 static void
2971 tsol_unmounts(zlog_t *zlogp, char *zone_name)
2972 {
2973 	zoneid_t *zids = NULL;
2974 	uint_t nzents_saved;
2975 	uint_t nzents;
2976 	int i;
2977 	char *argv[4];
2978 	char path[MAXPATHLEN];
2979 
2980 	if (!is_system_labeled())
2981 		return;
2982 
2983 	/*
2984 	 * Get the list of zones from the kernel
2985 	 */
2986 	if (zone_list(NULL, &nzents) != 0) {
2987 		return;
2988 	}
2989 
2990 	if (zid_label == NULL) {
2991 		zid_label = m_label_alloc(MAC_LABEL);
2992 		if (zid_label == NULL)
2993 			return;
2994 	}
2995 
2996 again:
2997 	if (nzents == 0)
2998 		return;
2999 
3000 	zids = malloc(nzents * sizeof (zoneid_t));
3001 	if (zids == NULL) {
3002 		zerror(zlogp, B_TRUE, "memory allocation failed");
3003 		return;
3004 	}
3005 	nzents_saved = nzents;
3006 
3007 	if (zone_list(zids, &nzents) != 0) {
3008 		free(zids);
3009 		return;
3010 	}
3011 	if (nzents != nzents_saved) {
3012 		/* list changed, try again */
3013 		free(zids);
3014 		goto again;
3015 	}
3016 
3017 	for (i = 0; i < nzents; i++) {
3018 		char zid_name[ZONENAME_MAX];
3019 		zone_state_t zid_state;
3020 		char zid_rpath[MAXPATHLEN];
3021 
3022 		if (zids[i] == GLOBAL_ZONEID)
3023 			continue;
3024 
3025 		if (getzonenamebyid(zids[i], zid_name, ZONENAME_MAX) == -1)
3026 			continue;
3027 
3028 		/*
3029 		 * Skip the zone we are halting
3030 		 */
3031 		if (strcmp(zid_name, zone_name) == 0)
3032 			continue;
3033 
3034 		if ((zone_getattr(zids[i], ZONE_ATTR_STATUS, &zid_state,
3035 		    sizeof (zid_state)) < 0) ||
3036 		    (zid_state < ZONE_IS_READY))
3037 			/* Skip over zones without mounted filesystems */
3038 			continue;
3039 
3040 		if (zone_getattr(zids[i], ZONE_ATTR_SLBL, zid_label,
3041 		    sizeof (m_label_t)) < 0)
3042 			/* Skip over zones with unspecified label */
3043 			continue;
3044 
3045 		if (zone_getattr(zids[i], ZONE_ATTR_ROOT, zid_rpath,
3046 		    sizeof (zid_rpath)) == -1)
3047 			/* Skip over zones with bad path */
3048 			continue;
3049 
3050 		if (zlabel != NULL && bldominates(zid_label, zlabel)) {
3051 			/*
3052 			 * This zone dominates our zone.
3053 			 * Unmount the lofs mount of our zone's /export/home
3054 			 */
3055 
3056 			if (snprintf(path, MAXPATHLEN,
3057 			    "%s/zone/%s/export/home", zid_rpath,
3058 			    zone_name) > MAXPATHLEN)
3059 				continue;
3060 
3061 			/* Skip over mount failures */
3062 			(void) umount(path);
3063 		}
3064 	}
3065 	free(zids);
3066 
3067 	/*
3068 	 * Unmount global zone autofs trigger for this zone
3069 	 */
3070 	(void) snprintf(path, MAXPATHLEN, "/zone/%s/home", zone_name);
3071 	/* Skip over mount failures */
3072 	(void) umount(path);
3073 
3074 	/*
3075 	 * Next unshare any exported directories from this zone.
3076 	 */
3077 
3078 	argv[0] = "zoneunshare";
3079 	argv[1] = "-z";
3080 	argv[2] = zone_name;
3081 	argv[3] = NULL;
3082 
3083 	(void) forkexec(zlogp, "/usr/lib/zones/zoneunshare", argv);
3084 	/* Don't check for errors since they don't affect the zone */
3085 
3086 	/*
3087 	 * Finally, deallocate any devices in the zone.
3088 	 */
3089 
3090 	argv[0] = "deallocate";
3091 	argv[1] = "-Isz";
3092 	argv[2] = zone_name;
3093 	argv[3] = NULL;
3094 
3095 	(void) forkexec(zlogp, "/usr/sbin/deallocate", argv);
3096 	/* Don't check for errors since they don't affect the zone */
3097 }
3098 
3099 /*
3100  * Fetch the Trusted Extensions label and multi-level ports (MLPs) for
3101  * this zone.
3102  */
3103 static tsol_zcent_t *
3104 get_zone_label(zlog_t *zlogp, priv_set_t *privs)
3105 {
3106 	FILE *fp;
3107 	tsol_zcent_t *zcent = NULL;
3108 	char line[MAXTNZLEN];
3109 
3110 	if ((fp = fopen(TNZONECFG_PATH, "r")) == NULL) {
3111 		zerror(zlogp, B_TRUE, "%s", TNZONECFG_PATH);
3112 		return (NULL);
3113 	}
3114 
3115 	while (fgets(line, sizeof (line), fp) != NULL) {
3116 		/*
3117 		 * Check for malformed database
3118 		 */
3119 		if (strlen(line) == MAXTNZLEN - 1)
3120 			break;
3121 		if ((zcent = tsol_sgetzcent(line, NULL, NULL)) == NULL)
3122 			continue;
3123 		if (strcmp(zcent->zc_name, zone_name) == 0)
3124 			break;
3125 		tsol_freezcent(zcent);
3126 		zcent = NULL;
3127 	}
3128 	(void) fclose(fp);
3129 
3130 	if (zcent == NULL) {
3131 		zerror(zlogp, B_FALSE, "zone requires a label assignment. "
3132 		    "See tnzonecfg(4)");
3133 	} else {
3134 		if (zlabel == NULL)
3135 			zlabel = m_label_alloc(MAC_LABEL);
3136 		/*
3137 		 * Save this zone's privileges for later read-down processing
3138 		 */
3139 		if ((zprivs = priv_allocset()) == NULL) {
3140 			zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
3141 			return (NULL);
3142 		} else {
3143 			priv_copyset(privs, zprivs);
3144 		}
3145 	}
3146 	return (zcent);
3147 }
3148 
3149 /*
3150  * Add the Trusted Extensions multi-level ports for this zone.
3151  */
3152 static void
3153 set_mlps(zlog_t *zlogp, zoneid_t zoneid, tsol_zcent_t *zcent)
3154 {
3155 	tsol_mlp_t *mlp;
3156 	tsol_mlpent_t tsme;
3157 
3158 	if (!is_system_labeled())
3159 		return;
3160 
3161 	tsme.tsme_zoneid = zoneid;
3162 	tsme.tsme_flags = 0;
3163 	for (mlp = zcent->zc_private_mlp; !TSOL_MLP_END(mlp); mlp++) {
3164 		tsme.tsme_mlp = *mlp;
3165 		if (tnmlp(TNDB_LOAD, &tsme) != 0) {
3166 			zerror(zlogp, B_TRUE, "cannot set zone-specific MLP "
3167 			    "on %d-%d/%d", mlp->mlp_port,
3168 			    mlp->mlp_port_upper, mlp->mlp_ipp);
3169 		}
3170 	}
3171 
3172 	tsme.tsme_flags = TSOL_MEF_SHARED;
3173 	for (mlp = zcent->zc_shared_mlp; !TSOL_MLP_END(mlp); mlp++) {
3174 		tsme.tsme_mlp = *mlp;
3175 		if (tnmlp(TNDB_LOAD, &tsme) != 0) {
3176 			zerror(zlogp, B_TRUE, "cannot set shared MLP "
3177 			    "on %d-%d/%d", mlp->mlp_port,
3178 			    mlp->mlp_port_upper, mlp->mlp_ipp);
3179 		}
3180 	}
3181 }
3182 
3183 static void
3184 remove_mlps(zlog_t *zlogp, zoneid_t zoneid)
3185 {
3186 	tsol_mlpent_t tsme;
3187 
3188 	if (!is_system_labeled())
3189 		return;
3190 
3191 	(void) memset(&tsme, 0, sizeof (tsme));
3192 	tsme.tsme_zoneid = zoneid;
3193 	if (tnmlp(TNDB_FLUSH, &tsme) != 0)
3194 		zerror(zlogp, B_TRUE, "cannot flush MLPs");
3195 }
3196 
3197 int
3198 prtmount(const char *fs, void *x) {
3199 	zerror((zlog_t *)x, B_FALSE, "  %s", fs);
3200 	return (0);
3201 }
3202 
3203 /*
3204  * Look for zones running on the main system that are using this root (or any
3205  * subdirectory of it).  Return B_TRUE and print an error if a conflicting zone
3206  * is found or if we can't tell.
3207  */
3208 static boolean_t
3209 duplicate_zone_root(zlog_t *zlogp, const char *rootpath)
3210 {
3211 	zoneid_t *zids = NULL;
3212 	uint_t nzids = 0;
3213 	boolean_t retv;
3214 	int rlen, zlen;
3215 	char zroot[MAXPATHLEN];
3216 	char zonename[ZONENAME_MAX];
3217 
3218 	for (;;) {
3219 		nzids += 10;
3220 		zids = malloc(nzids * sizeof (*zids));
3221 		if (zids == NULL) {
3222 			zerror(zlogp, B_TRUE, "memory allocation failed");
3223 			return (B_TRUE);
3224 		}
3225 		if (zone_list(zids, &nzids) == 0)
3226 			break;
3227 		free(zids);
3228 	}
3229 	retv = B_FALSE;
3230 	rlen = strlen(rootpath);
3231 	while (nzids > 0) {
3232 		/*
3233 		 * Ignore errors; they just mean that the zone has disappeared
3234 		 * while we were busy.
3235 		 */
3236 		if (zone_getattr(zids[--nzids], ZONE_ATTR_ROOT, zroot,
3237 		    sizeof (zroot)) == -1)
3238 			continue;
3239 		zlen = strlen(zroot);
3240 		if (zlen > rlen)
3241 			zlen = rlen;
3242 		if (strncmp(rootpath, zroot, zlen) == 0 &&
3243 		    (zroot[zlen] == '\0' || zroot[zlen] == '/') &&
3244 		    (rootpath[zlen] == '\0' || rootpath[zlen] == '/')) {
3245 			if (getzonenamebyid(zids[nzids], zonename,
3246 			    sizeof (zonename)) == -1)
3247 				(void) snprintf(zonename, sizeof (zonename),
3248 				    "id %d", (int)zids[nzids]);
3249 			zerror(zlogp, B_FALSE,
3250 			    "zone root %s already in use by zone %s",
3251 			    rootpath, zonename);
3252 			retv = B_TRUE;
3253 			break;
3254 		}
3255 	}
3256 	free(zids);
3257 	return (retv);
3258 }
3259 
3260 /*
3261  * Search for loopback mounts that use this same source node (same device and
3262  * inode).  Return B_TRUE if there is one or if we can't tell.
3263  */
3264 static boolean_t
3265 duplicate_reachable_path(zlog_t *zlogp, const char *rootpath)
3266 {
3267 	struct stat64 rst, zst;
3268 	struct mnttab *mnp;
3269 
3270 	if (stat64(rootpath, &rst) == -1) {
3271 		zerror(zlogp, B_TRUE, "can't stat %s", rootpath);
3272 		return (B_TRUE);
3273 	}
3274 	if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
3275 		return (B_TRUE);
3276 	for (mnp = resolve_lofs_mnts; mnp < resolve_lofs_mnt_max; mnp++) {
3277 		if (mnp->mnt_fstype == NULL ||
3278 		    strcmp(MNTTYPE_LOFS, mnp->mnt_fstype) != 0)
3279 			continue;
3280 		/* We're looking at a loopback mount.  Stat it. */
3281 		if (mnp->mnt_special != NULL &&
3282 		    stat64(mnp->mnt_special, &zst) != -1 &&
3283 		    rst.st_dev == zst.st_dev && rst.st_ino == zst.st_ino) {
3284 			zerror(zlogp, B_FALSE,
3285 			    "zone root %s is reachable through %s",
3286 			    rootpath, mnp->mnt_mountp);
3287 			return (B_TRUE);
3288 		}
3289 	}
3290 	return (B_FALSE);
3291 }
3292 
3293 zoneid_t
3294 vplat_create(zlog_t *zlogp, boolean_t mount_cmd)
3295 {
3296 	zoneid_t rval = -1;
3297 	priv_set_t *privs;
3298 	char rootpath[MAXPATHLEN];
3299 	char *rctlbuf = NULL;
3300 	size_t rctlbufsz = 0;
3301 	char *zfsbuf = NULL;
3302 	size_t zfsbufsz = 0;
3303 	zoneid_t zoneid = -1;
3304 	int xerr;
3305 	char *kzone;
3306 	FILE *fp = NULL;
3307 	tsol_zcent_t *zcent = NULL;
3308 	int match = 0;
3309 	int doi = 0;
3310 
3311 	if (zone_get_rootpath(zone_name, rootpath, sizeof (rootpath)) != Z_OK) {
3312 		zerror(zlogp, B_TRUE, "unable to determine zone root");
3313 		return (-1);
3314 	}
3315 	if (zonecfg_in_alt_root())
3316 		resolve_lofs(zlogp, rootpath, sizeof (rootpath));
3317 
3318 	if ((privs = priv_allocset()) == NULL) {
3319 		zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
3320 		return (-1);
3321 	}
3322 	priv_emptyset(privs);
3323 	if (get_privset(zlogp, privs, mount_cmd) != 0)
3324 		goto error;
3325 
3326 	if (!mount_cmd && get_rctls(zlogp, &rctlbuf, &rctlbufsz) != 0) {
3327 		zerror(zlogp, B_FALSE, "Unable to get list of rctls");
3328 		goto error;
3329 	}
3330 
3331 	if (get_datasets(zlogp, &zfsbuf, &zfsbufsz) != 0) {
3332 		zerror(zlogp, B_FALSE, "Unable to get list of ZFS datasets");
3333 		goto error;
3334 	}
3335 
3336 	if (!mount_cmd && is_system_labeled()) {
3337 		zcent = get_zone_label(zlogp, privs);
3338 		if (zcent != NULL) {
3339 			match = zcent->zc_match;
3340 			doi = zcent->zc_doi;
3341 			*zlabel = zcent->zc_label;
3342 		} else {
3343 			goto error;
3344 		}
3345 	}
3346 
3347 	kzone = zone_name;
3348 
3349 	/*
3350 	 * We must do this scan twice.  First, we look for zones running on the
3351 	 * main system that are using this root (or any subdirectory of it).
3352 	 * Next, we reduce to the shortest path and search for loopback mounts
3353 	 * that use this same source node (same device and inode).
3354 	 */
3355 	if (duplicate_zone_root(zlogp, rootpath))
3356 		goto error;
3357 	if (duplicate_reachable_path(zlogp, rootpath))
3358 		goto error;
3359 
3360 	if (mount_cmd) {
3361 		root_to_lu(zlogp, rootpath, sizeof (rootpath), B_TRUE);
3362 
3363 		/*
3364 		 * Forge up a special root for this zone.  When a zone is
3365 		 * mounted, we can't let the zone have its own root because the
3366 		 * tools that will be used in this "scratch zone" need access
3367 		 * to both the zone's resources and the running machine's
3368 		 * executables.
3369 		 *
3370 		 * Note that the mkdir here also catches read-only filesystems.
3371 		 */
3372 		if (mkdir(rootpath, 0755) != 0 && errno != EEXIST) {
3373 			zerror(zlogp, B_TRUE, "cannot create %s", rootpath);
3374 			goto error;
3375 		}
3376 		if (domount(zlogp, "tmpfs", "", "swap", rootpath) != 0)
3377 			goto error;
3378 	}
3379 
3380 	if (zonecfg_in_alt_root()) {
3381 		/*
3382 		 * If we are mounting up a zone in an alternate root partition,
3383 		 * then we have some additional work to do before starting the
3384 		 * zone.  First, resolve the root path down so that we're not
3385 		 * fooled by duplicates.  Then forge up an internal name for
3386 		 * the zone.
3387 		 */
3388 		if ((fp = zonecfg_open_scratch("", B_TRUE)) == NULL) {
3389 			zerror(zlogp, B_TRUE, "cannot open mapfile");
3390 			goto error;
3391 		}
3392 		if (zonecfg_lock_scratch(fp) != 0) {
3393 			zerror(zlogp, B_TRUE, "cannot lock mapfile");
3394 			goto error;
3395 		}
3396 		if (zonecfg_find_scratch(fp, zone_name, zonecfg_get_root(),
3397 		    NULL, 0) == 0) {
3398 			zerror(zlogp, B_FALSE, "scratch zone already running");
3399 			goto error;
3400 		}
3401 		/* This is the preferred name */
3402 		(void) snprintf(kernzone, sizeof (kernzone), "SUNWlu-%s",
3403 		    zone_name);
3404 		srandom(getpid());
3405 		while (zonecfg_reverse_scratch(fp, kernzone, NULL, 0, NULL,
3406 		    0) == 0) {
3407 			/* This is just an arbitrary name; note "." usage */
3408 			(void) snprintf(kernzone, sizeof (kernzone),
3409 			    "SUNWlu.%08lX%08lX", random(), random());
3410 		}
3411 		kzone = kernzone;
3412 	}
3413 
3414 	xerr = 0;
3415 	if ((zoneid = zone_create(kzone, rootpath, privs, rctlbuf,
3416 	    rctlbufsz, zfsbuf, zfsbufsz, &xerr, match, doi, zlabel)) == -1) {
3417 		if (xerr == ZE_AREMOUNTS) {
3418 			if (zonecfg_find_mounts(rootpath, NULL, NULL) < 1) {
3419 				zerror(zlogp, B_FALSE,
3420 				    "An unknown file-system is mounted on "
3421 				    "a subdirectory of %s", rootpath);
3422 			} else {
3423 
3424 				zerror(zlogp, B_FALSE,
3425 				    "These file-systems are mounted on "
3426 				    "subdirectories of %s:", rootpath);
3427 				(void) zonecfg_find_mounts(rootpath,
3428 				    prtmount, zlogp);
3429 			}
3430 		} else if (xerr == ZE_CHROOTED) {
3431 			zerror(zlogp, B_FALSE, "%s: "
3432 			    "cannot create a zone from a chrooted "
3433 			    "environment", "zone_create");
3434 		} else {
3435 			zerror(zlogp, B_TRUE, "%s failed", "zone_create");
3436 		}
3437 		goto error;
3438 	}
3439 
3440 	if (zonecfg_in_alt_root() &&
3441 	    zonecfg_add_scratch(fp, zone_name, kernzone,
3442 	    zonecfg_get_root()) == -1) {
3443 		zerror(zlogp, B_TRUE, "cannot add mapfile entry");
3444 		goto error;
3445 	}
3446 
3447 	/*
3448 	 * The following is a warning, not an error, and is not performed when
3449 	 * merely mounting a zone for administrative use.
3450 	 */
3451 	if (!mount_cmd && bind_to_pool(zlogp, zoneid) != 0)
3452 		zerror(zlogp, B_FALSE, "WARNING: unable to bind zone to "
3453 		    "requested pool; using default pool.");
3454 	if (!mount_cmd)
3455 		set_mlps(zlogp, zoneid, zcent);
3456 	rval = zoneid;
3457 	zoneid = -1;
3458 
3459 error:
3460 	if (zoneid != -1)
3461 		(void) zone_destroy(zoneid);
3462 	if (rctlbuf != NULL)
3463 		free(rctlbuf);
3464 	priv_freeset(privs);
3465 	if (fp != NULL)
3466 		zonecfg_close_scratch(fp);
3467 	lofs_discard_mnttab();
3468 	if (zcent != NULL)
3469 		tsol_freezcent(zcent);
3470 	return (rval);
3471 }
3472 
3473 /*
3474  * Enter the zone and write a /etc/zones/index file there.  This allows
3475  * libzonecfg (and thus zoneadm) to report the UUID and potentially other zone
3476  * details from inside the zone.
3477  */
3478 static void
3479 write_index_file(zoneid_t zoneid)
3480 {
3481 	FILE *zef;
3482 	FILE *zet;
3483 	struct zoneent *zep;
3484 	pid_t child;
3485 	int tmpl_fd;
3486 	ctid_t ct;
3487 	int fd;
3488 	char uuidstr[UUID_PRINTABLE_STRING_LENGTH];
3489 
3490 	/* Locate the zone entry in the global zone's index file */
3491 	if ((zef = setzoneent()) == NULL)
3492 		return;
3493 	while ((zep = getzoneent_private(zef)) != NULL) {
3494 		if (strcmp(zep->zone_name, zone_name) == 0)
3495 			break;
3496 		free(zep);
3497 	}
3498 	endzoneent(zef);
3499 	if (zep == NULL)
3500 		return;
3501 
3502 	if ((tmpl_fd = init_template()) == -1) {
3503 		free(zep);
3504 		return;
3505 	}
3506 
3507 	if ((child = fork()) == -1) {
3508 		(void) ct_tmpl_clear(tmpl_fd);
3509 		(void) close(tmpl_fd);
3510 		free(zep);
3511 		return;
3512 	}
3513 
3514 	/* parent waits for child to finish */
3515 	if (child != 0) {
3516 		free(zep);
3517 		if (contract_latest(&ct) == -1)
3518 			ct = -1;
3519 		(void) ct_tmpl_clear(tmpl_fd);
3520 		(void) close(tmpl_fd);
3521 		(void) waitpid(child, NULL, 0);
3522 		(void) contract_abandon_id(ct);
3523 		return;
3524 	}
3525 
3526 	/* child enters zone and sets up index file */
3527 	(void) ct_tmpl_clear(tmpl_fd);
3528 	if (zone_enter(zoneid) != -1) {
3529 		(void) mkdir(ZONE_CONFIG_ROOT, ZONE_CONFIG_MODE);
3530 		(void) chown(ZONE_CONFIG_ROOT, ZONE_CONFIG_UID,
3531 		    ZONE_CONFIG_GID);
3532 		fd = open(ZONE_INDEX_FILE, O_WRONLY|O_CREAT|O_TRUNC,
3533 		    ZONE_INDEX_MODE);
3534 		if (fd != -1 && (zet = fdopen(fd, "w")) != NULL) {
3535 			(void) fchown(fd, ZONE_INDEX_UID, ZONE_INDEX_GID);
3536 			if (uuid_is_null(zep->zone_uuid))
3537 				uuidstr[0] = '\0';
3538 			else
3539 				uuid_unparse(zep->zone_uuid, uuidstr);
3540 			(void) fprintf(zet, "%s:%s:/:%s\n", zep->zone_name,
3541 			    zone_state_str(zep->zone_state),
3542 			    uuidstr);
3543 			(void) fclose(zet);
3544 		}
3545 	}
3546 	_exit(0);
3547 }
3548 
3549 int
3550 vplat_bringup(zlog_t *zlogp, boolean_t mount_cmd, zoneid_t zoneid)
3551 {
3552 
3553 	if (!mount_cmd && validate_datasets(zlogp) != 0) {
3554 		lofs_discard_mnttab();
3555 		return (-1);
3556 	}
3557 
3558 	if (mount_filesystems(zlogp, mount_cmd) != 0) {
3559 		lofs_discard_mnttab();
3560 		return (-1);
3561 	}
3562 
3563 	/* mount /dev for zone (both normal and scratch zone) */
3564 	if (vplat_mount_dev(zlogp) != 0) {
3565 		lofs_discard_mnttab();
3566 		return (-1);
3567 	}
3568 
3569 	if (!mount_cmd && configure_network_interfaces(zlogp) != 0) {
3570 		lofs_discard_mnttab();
3571 		return (-1);
3572 	}
3573 
3574 	write_index_file(zoneid);
3575 
3576 	lofs_discard_mnttab();
3577 	return (0);
3578 }
3579 
3580 static int
3581 lu_root_teardown(zlog_t *zlogp)
3582 {
3583 	char zroot[MAXPATHLEN];
3584 
3585 	if (zone_get_rootpath(zone_name, zroot, sizeof (zroot)) != Z_OK) {
3586 		zerror(zlogp, B_FALSE, "unable to determine zone root");
3587 		return (-1);
3588 	}
3589 	root_to_lu(zlogp, zroot, sizeof (zroot), B_FALSE);
3590 
3591 	/*
3592 	 * At this point, the processes are gone, the filesystems (save the
3593 	 * root) are unmounted, and the zone is on death row.  But there may
3594 	 * still be creds floating about in the system that reference the
3595 	 * zone_t, and which pin down zone_rootvp causing this call to fail
3596 	 * with EBUSY.  Thus, we try for a little while before just giving up.
3597 	 * (How I wish this were not true, and umount2 just did the right
3598 	 * thing, or tmpfs supported MS_FORCE This is a gross hack.)
3599 	 */
3600 	if (umount2(zroot, MS_FORCE) != 0) {
3601 		if (errno == ENOTSUP && umount2(zroot, 0) == 0)
3602 			goto unmounted;
3603 		if (errno == EBUSY) {
3604 			int tries = 10;
3605 
3606 			while (--tries >= 0) {
3607 				(void) sleep(1);
3608 				if (umount2(zroot, 0) == 0)
3609 					goto unmounted;
3610 				if (errno != EBUSY)
3611 					break;
3612 			}
3613 		}
3614 		zerror(zlogp, B_TRUE, "unable to unmount '%s'", zroot);
3615 		return (-1);
3616 	}
3617 unmounted:
3618 
3619 	/*
3620 	 * Only zones in an alternate root environment have scratch zone
3621 	 * entries.
3622 	 */
3623 	if (zonecfg_in_alt_root()) {
3624 		FILE *fp;
3625 		int retv;
3626 
3627 		if ((fp = zonecfg_open_scratch("", B_FALSE)) == NULL) {
3628 			zerror(zlogp, B_TRUE, "cannot open mapfile");
3629 			return (-1);
3630 		}
3631 		retv = -1;
3632 		if (zonecfg_lock_scratch(fp) != 0)
3633 			zerror(zlogp, B_TRUE, "cannot lock mapfile");
3634 		else if (zonecfg_delete_scratch(fp, kernzone) != 0)
3635 			zerror(zlogp, B_TRUE, "cannot delete map entry");
3636 		else
3637 			retv = 0;
3638 		zonecfg_close_scratch(fp);
3639 		return (retv);
3640 	} else {
3641 		return (0);
3642 	}
3643 }
3644 
3645 int
3646 vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd)
3647 {
3648 	char *kzone;
3649 	zoneid_t zoneid;
3650 
3651 	kzone = zone_name;
3652 	if (zonecfg_in_alt_root()) {
3653 		FILE *fp;
3654 
3655 		if ((fp = zonecfg_open_scratch("", B_FALSE)) == NULL) {
3656 			zerror(zlogp, B_TRUE, "unable to open map file");
3657 			goto error;
3658 		}
3659 		if (zonecfg_find_scratch(fp, zone_name, zonecfg_get_root(),
3660 		    kernzone, sizeof (kernzone)) != 0) {
3661 			zerror(zlogp, B_FALSE, "unable to find scratch zone");
3662 			zonecfg_close_scratch(fp);
3663 			goto error;
3664 		}
3665 		zonecfg_close_scratch(fp);
3666 		kzone = kernzone;
3667 	}
3668 
3669 	if ((zoneid = getzoneidbyname(kzone)) == ZONE_ID_UNDEFINED) {
3670 		if (!bringup_failure_recovery)
3671 			zerror(zlogp, B_TRUE, "unable to get zoneid");
3672 		if (unmount_cmd)
3673 			(void) lu_root_teardown(zlogp);
3674 		goto error;
3675 	}
3676 
3677 	if (zone_shutdown(zoneid) != 0) {
3678 		zerror(zlogp, B_TRUE, "unable to shutdown zone");
3679 		goto error;
3680 	}
3681 
3682 	if (!unmount_cmd &&
3683 	    unconfigure_network_interfaces(zlogp, zoneid) != 0) {
3684 		zerror(zlogp, B_FALSE,
3685 		    "unable to unconfigure network interfaces in zone");
3686 		goto error;
3687 	}
3688 
3689 	if (!unmount_cmd && tcp_abort_connections(zlogp, zoneid) != 0) {
3690 		zerror(zlogp, B_TRUE, "unable to abort TCP connections");
3691 		goto error;
3692 	}
3693 
3694 	/* destroy zconsole before umount /dev */
3695 	if (!unmount_cmd)
3696 		destroy_console_slave();
3697 
3698 	if (unmount_filesystems(zlogp, zoneid, unmount_cmd) != 0) {
3699 		zerror(zlogp, B_FALSE,
3700 		    "unable to unmount file systems in zone");
3701 		goto error;
3702 	}
3703 
3704 	remove_mlps(zlogp, zoneid);
3705 
3706 	if (zone_destroy(zoneid) != 0) {
3707 		zerror(zlogp, B_TRUE, "unable to destroy zone");
3708 		goto error;
3709 	}
3710 
3711 	/*
3712 	 * Special teardown for alternate boot environments: remove the tmpfs
3713 	 * root for the zone and then remove it from the map file.
3714 	 */
3715 	if (unmount_cmd && lu_root_teardown(zlogp) != 0)
3716 		goto error;
3717 
3718 	lofs_discard_mnttab();
3719 	return (0);
3720 
3721 error:
3722 	lofs_discard_mnttab();
3723 	return (-1);
3724 }
3725 
3726 /*
3727  * Apply the standard lists of devices/symlinks/mappings and the user-specified
3728  * list of devices (via zonecfg) to the /dev filesystem.  The filesystem will
3729  * use these as a profile/filter to determine what exists in /dev.
3730  */
3731 static int
3732 vplat_mount_dev(zlog_t *zlogp)
3733 {
3734 	char			zonedevpath[MAXPATHLEN];
3735 	zone_dochandle_t	handle = NULL;
3736 	struct zone_devtab	ztab;
3737 	zone_fsopt_t		opt_attr;
3738 	di_prof_t		prof = NULL;
3739 	int			i, err, len;
3740 	int			retval = -1;
3741 
3742 	struct zone_fstab devtab = {
3743 		"/dev",
3744 		"/dev",
3745 		MNTTYPE_DEV,
3746 		NULL,
3747 		""
3748 	};
3749 
3750 	if (err = zone_get_devroot(zone_name, zonedevpath,
3751 	    sizeof (zonedevpath))) {
3752 		zerror(zlogp, B_FALSE, "can't get zone dev: %s",
3753 		    zonecfg_strerror(err));
3754 		return (-1);
3755 	}
3756 
3757 	/*
3758 	 * The old /dev was a lofs mount from <zonepath>/dev, with
3759 	 * dev fs, that becomes a mount on <zonepath>/root/dev.
3760 	 * However, we need to preserve device permission bits during
3761 	 * upgrade.  What we should do is migrate the attribute directory
3762 	 * on upgrade, but for now, preserve it at <zonepath>/dev.
3763 	 */
3764 	(void) strcpy(opt_attr.zone_fsopt_opt, "attrdir=");
3765 	len = strlen(opt_attr.zone_fsopt_opt);
3766 	if (err = zone_get_zonepath(zone_name,
3767 	    opt_attr.zone_fsopt_opt + len, MAX_MNTOPT_STR - len)) {
3768 		zerror(zlogp, B_FALSE, "can't get zone path: %s",
3769 		    zonecfg_strerror(err));
3770 		return (-1);
3771 	}
3772 
3773 	if (make_one_dir(zlogp, opt_attr.zone_fsopt_opt + len, "/dev",
3774 	    DEFAULT_DIR_MODE) != 0)
3775 		return (-1);
3776 
3777 	(void) strlcat(opt_attr.zone_fsopt_opt, "/dev", MAX_MNTOPT_STR);
3778 	devtab.zone_fs_options = &opt_attr;
3779 	opt_attr.zone_fsopt_next = NULL;
3780 
3781 	/* mount /dev inside the zone */
3782 	i = strlen(zonedevpath);
3783 	if (mount_one(zlogp, &devtab, zonedevpath))
3784 		return (-1);
3785 
3786 	(void) strlcat(zonedevpath, "/dev", sizeof (zonedevpath));
3787 	if (di_prof_init(zonedevpath, &prof)) {
3788 		zerror(zlogp, B_TRUE, "failed to initialize profile");
3789 		goto cleanup;
3790 	}
3791 
3792 	/* Add the standard devices and directories */
3793 	for (i = 0; standard_devs[i] != NULL; ++i) {
3794 		if (di_prof_add_dev(prof, standard_devs[i])) {
3795 			zerror(zlogp, B_TRUE, "failed to add "
3796 			    "standard device");
3797 			goto cleanup;
3798 		}
3799 	}
3800 
3801 	/* Add the standard symlinks */
3802 	for (i = 0; standard_devlinks[i].source != NULL; ++i) {
3803 		if (di_prof_add_symlink(prof,
3804 		    standard_devlinks[i].source,
3805 		    standard_devlinks[i].target)) {
3806 			zerror(zlogp, B_TRUE, "failed to add "
3807 			    "standard symlink");
3808 			goto cleanup;
3809 		}
3810 	}
3811 
3812 	/* Add user-specified devices and directories */
3813 	if ((handle = zonecfg_init_handle()) == NULL) {
3814 		zerror(zlogp, B_FALSE, "can't initialize zone handle");
3815 		goto cleanup;
3816 	}
3817 	if (err = zonecfg_get_handle(zone_name, handle)) {
3818 		zerror(zlogp, B_FALSE, "can't get handle for zone "
3819 		    "%s: %s", zone_name, zonecfg_strerror(err));
3820 		goto cleanup;
3821 	}
3822 	if (err = zonecfg_setdevent(handle)) {
3823 		zerror(zlogp, B_FALSE, "%s: %s", zone_name,
3824 		    zonecfg_strerror(err));
3825 		goto cleanup;
3826 	}
3827 	while (zonecfg_getdevent(handle, &ztab) == Z_OK) {
3828 		if (di_prof_add_dev(prof, ztab.zone_dev_match)) {
3829 			zerror(zlogp, B_TRUE, "failed to add "
3830 			    "user-specified device");
3831 			goto cleanup;
3832 		}
3833 	}
3834 	(void) zonecfg_enddevent(handle);
3835 
3836 	/* Send profile to kernel */
3837 	if (di_prof_commit(prof)) {
3838 		zerror(zlogp, B_TRUE, "failed to commit profile");
3839 		goto cleanup;
3840 	}
3841 
3842 	retval = 0;
3843 
3844 cleanup:
3845 	if (handle)
3846 		zonecfg_fini_handle(handle);
3847 	if (prof)
3848 		di_prof_fini(prof);
3849 	return (retval);
3850 }
3851