xref: /titanic_50/usr/src/cmd/zoneadmd/vplat.c (revision 3bf5ae9eedb977fad5c8a4029f296a9ec010c06e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * This module contains functions used to bring up and tear down the
31  * Virtual Platform: [un]mounting file-systems, [un]plumbing network
32  * interfaces, [un]configuring devices, establishing resource controls,
33  * and creating/destroying the zone in the kernel.  These actions, on
34  * the way up, ready the zone; on the way down, they halt the zone.
35  * See the much longer block comment at the beginning of zoneadmd.c
36  * for a bigger picture of how the whole program functions.
37  *
38  * This module also has primary responsibility for the layout of "scratch
39  * zones."  These are mounted, but inactive, zones that are used during
40  * operating system upgrade and potentially other administrative action.  The
41  * scratch zone environment is similar to the miniroot environment.  The zone's
42  * actual root is mounted read-write on /a, and the standard paths (/usr,
43  * /sbin, /lib) all lead to read-only copies of the running system's binaries.
44  * This allows the administrative tools to manipulate the zone using "-R /a"
45  * without relying on any binaries in the zone itself.
46  *
47  * If the scratch zone is on an alternate root (Live Upgrade [LU] boot
48  * environment), then we must resolve the lofs mounts used there to uncover
49  * writable (unshared) resources.  Shared resources, though, are always
50  * read-only.  In addition, if the "same" zone with a different root path is
51  * currently running, then "/b" inside the zone points to the running zone's
52  * root.  This allows LU to synchronize configuration files during the upgrade
53  * process.
54  *
55  * To construct this environment, this module creates a tmpfs mount on
56  * $ZONEPATH/lu.  Inside this scratch area, the miniroot-like environment as
57  * described above is constructed on the fly.  The zone is then created using
58  * $ZONEPATH/lu as the root.
59  *
60  * Note that scratch zones are inactive.  The zone's bits are not running and
61  * likely cannot be run correctly until upgrade is done.  Init is not running
62  * there, nor is SMF.  Because of this, the "mounted" state of a scratch zone
63  * is not a part of the usual halt/ready/boot state machine.
64  */
65 
66 #include <sys/param.h>
67 #include <sys/mount.h>
68 #include <sys/mntent.h>
69 #include <sys/socket.h>
70 #include <sys/utsname.h>
71 #include <sys/types.h>
72 #include <sys/stat.h>
73 #include <sys/sockio.h>
74 #include <sys/stropts.h>
75 #include <sys/conf.h>
76 
77 #include <inet/tcp.h>
78 #include <arpa/inet.h>
79 #include <netinet/in.h>
80 #include <net/route.h>
81 
82 #include <stdio.h>
83 #include <errno.h>
84 #include <fcntl.h>
85 #include <unistd.h>
86 #include <rctl.h>
87 #include <stdlib.h>
88 #include <string.h>
89 #include <strings.h>
90 #include <wait.h>
91 #include <limits.h>
92 #include <libgen.h>
93 #include <libzfs.h>
94 #include <libdevinfo.h>
95 #include <zone.h>
96 #include <assert.h>
97 #include <libcontract.h>
98 #include <libcontract_priv.h>
99 #include <uuid/uuid.h>
100 
101 #include <sys/mntio.h>
102 #include <sys/mnttab.h>
103 #include <sys/fs/autofs.h>	/* for _autofssys() */
104 #include <sys/fs/lofs_info.h>
105 #include <sys/fs/zfs.h>
106 
107 #include <pool.h>
108 #include <sys/pool.h>
109 
110 #include <libzonecfg.h>
111 #include <synch.h>
112 
113 #include "zoneadmd.h"
114 #include <tsol/label.h>
115 #include <libtsnet.h>
116 #include <sys/priv.h>
117 
/*
 * Widths, in bits, of IPv4 and IPv6 addresses.
 * NOTE(review): the users of these macros are not in this part of the
 * file; presumably they bound network prefix lengths -- confirm.
 */
#define	V4_ADDR_LEN	32
#define	V6_ADDR_LEN	128

/* 0755 is the default directory mode. */
#define	DEFAULT_DIR_MODE \
	(S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)

/*
 * Option string for the loopback mounts this module creates for
 * inherit-package-dir resources (see mount_one()) and for duplicate
 * mounts that check_lofs_needed() converts into lofs mounts.
 */
#define	IPD_DEFAULT_OPTS \
	MNTOPT_RO "," MNTOPT_LOFS_NOSUB "," MNTOPT_NODEVICES

/* File read by get_remote_fstypes(); the first word of each line is used. */
#define	DFSTYPES	"/etc/dfs/fstypes"
/* NOTE(review): not used in this part of the file; purpose to be confirmed. */
#define	MAXTNZLEN	2048
130 
/*
 * This is the set of directories and devices (relative to <zone_root>/dev)
 * which must be present in every zone.  Users can augment this list with
 * additional device rules in their zone configuration, but at present cannot
 * remove any of this set of standard devices.
 *
 * The array is terminated by a NULL entry.  "openprom" is only part of the
 * set when building for SPARC.
 */
static const char *standard_devs[] = {
	"arp",
	"conslog",
	"cpu/self/cpuid",
	"crypto",
	"cryptoadm",
	"dsk",
	"dtrace/*",
	"dtrace/provider/*",
	"fd",
	"kstat",
	"lo0",
	"lo1",
	"lo2",
	"lo3",
	"log",
	"logindmux",
	"null",
#ifdef __sparc
	"openprom",
#endif
	"poll",
	"pool",
	"ptmx",
	"pts/*",
	"random",
	"rdsk",
	"rmt",
	"sad/user",
	"swap",
	"sysevent",
	"tcp",
	"tcp6",
	"term",
	"ticlts",
	"ticots",
	"ticotsord",
	"tty",
	"udp",
	"udp6",
	"urandom",
	"zero",
	"zfs",
	NULL
};
182 
/*
 * A pair of strings describing one symbolic link; used as the element type
 * of standard_devlinks[].
 * NOTE(review): judging by the entries of that table, "source" is the name
 * of the link and "target" is what the link points to -- confirm against
 * the code that creates the links.
 */
struct source_target {
	const char *source;
	const char *target;
};
187 
/*
 * Set of symlinks (relative to <zone_root>/dev) which must be present in
 * every zone.
 *
 * Each entry is a { source, target } pair and the table is terminated by
 * a { NULL, NULL } entry.
 */
static struct source_target standard_devlinks[] = {
	{ "stderr",	"./fd/2" },
	{ "stdin",	"./fd/0" },
	{ "stdout",	"./fd/1" },
	{ "dtremote",	"/dev/null" },
	{ "console",	"zconsole" },
	{ "syscon",	"zconsole" },
	{ "sysmsg",	"zconsole" },
	{ "systty",	"zconsole" },
	{ "msglog",	"zconsole" },
	{ NULL, NULL }
};
204 
/* forward declaration */
static int vplat_mount_dev(zlog_t *);

/* for routing socket */
static int rts_seqno = 0;

/* mangled zone name when mounting in an alternate root environment */
static char kernzone[ZONENAME_MAX];

/*
 * array of cached mount entries for resolve_lofs: resolve_lofs_mnts is the
 * first element and resolve_lofs_mnt_max points one past the last element.
 * Both are NULL while nothing is cached.
 */
static struct mnttab *resolve_lofs_mnts, *resolve_lofs_mnt_max;

/* for Trusted Extensions */
static tsol_zcent_t *get_zone_label(zlog_t *, priv_set_t *);
static int tsol_mounts(zlog_t *, char *, char *);
static void tsol_unmounts(zlog_t *, char *);
static m_label_t *zlabel = NULL;
static m_label_t *zid_label = NULL;
static priv_set_t *zprivs = NULL;

/* from libsocket, not in any header file */
extern int getnetmaskbyaddr(struct in_addr, struct in_addr *);

/*
 * An optimization for build_mnttable: reallocate (and potentially copy the
 * data) only once every N times through the loop.
 */
#define	MNTTAB_HUNK	32

/*
 * Private autofs system call
 */
extern int _autofssys(int, void *);
237 
238 static int
239 autofs_cleanup(zoneid_t zoneid)
240 {
241 	/*
242 	 * Ask autofs to unmount all trigger nodes in the given zone.
243 	 */
244 	return (_autofssys(AUTOFS_UNMOUNTALL, (void *)zoneid));
245 }
246 
247 static void
248 free_mnttable(struct mnttab *mnt_array, uint_t nelem)
249 {
250 	uint_t i;
251 
252 	if (mnt_array == NULL)
253 		return;
254 	for (i = 0; i < nelem; i++) {
255 		free(mnt_array[i].mnt_mountp);
256 		free(mnt_array[i].mnt_fstype);
257 		free(mnt_array[i].mnt_special);
258 		free(mnt_array[i].mnt_mntopts);
259 		assert(mnt_array[i].mnt_time == NULL);
260 	}
261 	free(mnt_array);
262 }
263 
264 /*
265  * Build the mount table for the zone rooted at "zroot", storing the resulting
266  * array of struct mnttabs in "mnt_arrayp" and the number of elements in the
267  * array in "nelemp".
268  */
269 static int
270 build_mnttable(zlog_t *zlogp, const char *zroot, size_t zrootlen, FILE *mnttab,
271     struct mnttab **mnt_arrayp, uint_t *nelemp)
272 {
273 	struct mnttab mnt;
274 	struct mnttab *mnts;
275 	struct mnttab *mnp;
276 	uint_t nmnt;
277 
278 	rewind(mnttab);
279 	resetmnttab(mnttab);
280 	nmnt = 0;
281 	mnts = NULL;
282 	while (getmntent(mnttab, &mnt) == 0) {
283 		struct mnttab *tmp_array;
284 
285 		if (strncmp(mnt.mnt_mountp, zroot, zrootlen) != 0)
286 			continue;
287 		if (nmnt % MNTTAB_HUNK == 0) {
288 			tmp_array = realloc(mnts,
289 			    (nmnt + MNTTAB_HUNK) * sizeof (*mnts));
290 			if (tmp_array == NULL) {
291 				free_mnttable(mnts, nmnt);
292 				return (-1);
293 			}
294 			mnts = tmp_array;
295 		}
296 		mnp = &mnts[nmnt++];
297 
298 		/*
299 		 * Zero out any fields we're not using.
300 		 */
301 		(void) memset(mnp, 0, sizeof (*mnp));
302 
303 		if (mnt.mnt_special != NULL)
304 			mnp->mnt_special = strdup(mnt.mnt_special);
305 		if (mnt.mnt_mntopts != NULL)
306 			mnp->mnt_mntopts = strdup(mnt.mnt_mntopts);
307 		mnp->mnt_mountp = strdup(mnt.mnt_mountp);
308 		mnp->mnt_fstype = strdup(mnt.mnt_fstype);
309 		if ((mnt.mnt_special != NULL && mnp->mnt_special == NULL) ||
310 		    (mnt.mnt_mntopts != NULL && mnp->mnt_mntopts == NULL) ||
311 		    mnp->mnt_mountp == NULL || mnp->mnt_fstype == NULL) {
312 			zerror(zlogp, B_TRUE, "memory allocation failed");
313 			free_mnttable(mnts, nmnt);
314 			return (-1);
315 		}
316 	}
317 	*mnt_arrayp = mnts;
318 	*nelemp = nmnt;
319 	return (0);
320 }
321 
322 /*
323  * This is an optimization.  The resolve_lofs function is used quite frequently
324  * to manipulate file paths, and on a machine with a large number of zones,
325  * there will be a huge number of mounted file systems.  Thus, we trigger a
326  * reread of the list of mount points
327  */
328 static void
329 lofs_discard_mnttab(void)
330 {
331 	free_mnttable(resolve_lofs_mnts,
332 	    resolve_lofs_mnt_max - resolve_lofs_mnts);
333 	resolve_lofs_mnts = resolve_lofs_mnt_max = NULL;
334 }
335 
336 static int
337 lofs_read_mnttab(zlog_t *zlogp)
338 {
339 	FILE *mnttab;
340 	uint_t nmnts;
341 
342 	if ((mnttab = fopen(MNTTAB, "r")) == NULL)
343 		return (-1);
344 	if (build_mnttable(zlogp, "", 0, mnttab, &resolve_lofs_mnts,
345 	    &nmnts) == -1) {
346 		(void) fclose(mnttab);
347 		return (-1);
348 	}
349 	(void) fclose(mnttab);
350 	resolve_lofs_mnt_max = resolve_lofs_mnts + nmnts;
351 	return (0);
352 }
353 
354 /*
355  * This function loops over potential loopback mounts and symlinks in a given
356  * path and resolves them all down to an absolute path.
357  */
358 static void
359 resolve_lofs(zlog_t *zlogp, char *path, size_t pathlen)
360 {
361 	int len, arlen;
362 	const char *altroot;
363 	char tmppath[MAXPATHLEN];
364 	boolean_t outside_altroot;
365 
366 	if ((len = resolvepath(path, tmppath, sizeof (tmppath))) == -1)
367 		return;
368 	tmppath[len] = '\0';
369 	(void) strlcpy(path, tmppath, sizeof (tmppath));
370 
371 	/* This happens once per zoneadmd operation. */
372 	if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
373 		return;
374 
375 	altroot = zonecfg_get_root();
376 	arlen = strlen(altroot);
377 	outside_altroot = B_FALSE;
378 	for (;;) {
379 		struct mnttab *mnp;
380 
381 		for (mnp = resolve_lofs_mnts; mnp < resolve_lofs_mnt_max;
382 		    mnp++) {
383 			if (mnp->mnt_fstype == NULL ||
384 			    mnp->mnt_mountp == NULL ||
385 			    mnp->mnt_special == NULL ||
386 			    strcmp(mnp->mnt_fstype, MNTTYPE_LOFS) != 0)
387 				continue;
388 			len = strlen(mnp->mnt_mountp);
389 			if (strncmp(mnp->mnt_mountp, path, len) == 0 &&
390 			    (path[len] == '/' || path[len] == '\0'))
391 				break;
392 		}
393 		if (mnp >= resolve_lofs_mnt_max)
394 			break;
395 		if (outside_altroot) {
396 			char *cp;
397 			int olen = sizeof (MNTOPT_RO) - 1;
398 
399 			/*
400 			 * If we run into a read-only mount outside of the
401 			 * alternate root environment, then the user doesn't
402 			 * want this path to be made read-write.
403 			 */
404 			if (mnp->mnt_mntopts != NULL &&
405 			    (cp = strstr(mnp->mnt_mntopts, MNTOPT_RO)) !=
406 			    NULL &&
407 			    (cp == mnp->mnt_mntopts || cp[-1] == ',') &&
408 			    (cp[olen] == '\0' || cp[olen] == ',')) {
409 				break;
410 			}
411 		} else if (arlen > 0 &&
412 		    (strncmp(mnp->mnt_special, altroot, arlen) != 0 ||
413 		    (mnp->mnt_special[arlen] != '\0' &&
414 		    mnp->mnt_special[arlen] != '/'))) {
415 			outside_altroot = B_TRUE;
416 		}
417 		/* use temporary buffer because new path might be longer */
418 		(void) snprintf(tmppath, sizeof (tmppath), "%s%s",
419 		    mnp->mnt_special, path + len);
420 		if ((len = resolvepath(tmppath, path, pathlen)) == -1)
421 			break;
422 		path[len] = '\0';
423 	}
424 }
425 
426 /*
427  * For a regular mount, check if a replacement lofs mount is needed because the
428  * referenced device is already mounted somewhere.
429  */
430 static int
431 check_lofs_needed(zlog_t *zlogp, struct zone_fstab *fsptr)
432 {
433 	struct mnttab *mnp;
434 	zone_fsopt_t *optptr, *onext;
435 
436 	/* This happens once per zoneadmd operation. */
437 	if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
438 		return (-1);
439 
440 	/*
441 	 * If this special node isn't already in use, then it's ours alone;
442 	 * no need to worry about conflicting mounts.
443 	 */
444 	for (mnp = resolve_lofs_mnts; mnp < resolve_lofs_mnt_max;
445 	    mnp++) {
446 		if (strcmp(mnp->mnt_special, fsptr->zone_fs_special) == 0)
447 			break;
448 	}
449 	if (mnp >= resolve_lofs_mnt_max)
450 		return (0);
451 
452 	/*
453 	 * Convert this duplicate mount into a lofs mount.
454 	 */
455 	(void) strlcpy(fsptr->zone_fs_special, mnp->mnt_mountp,
456 	    sizeof (fsptr->zone_fs_special));
457 	(void) strlcpy(fsptr->zone_fs_type, MNTTYPE_LOFS,
458 	    sizeof (fsptr->zone_fs_type));
459 	fsptr->zone_fs_raw[0] = '\0';
460 
461 	/*
462 	 * Discard all but one of the original options and set that to be the
463 	 * same set of options used for inherit package directory resources.
464 	 */
465 	optptr = fsptr->zone_fs_options;
466 	if (optptr == NULL) {
467 		optptr = malloc(sizeof (*optptr));
468 		if (optptr == NULL) {
469 			zerror(zlogp, B_TRUE, "cannot mount %s",
470 			    fsptr->zone_fs_dir);
471 			return (-1);
472 		}
473 	} else {
474 		while ((onext = optptr->zone_fsopt_next) != NULL) {
475 			optptr->zone_fsopt_next = onext->zone_fsopt_next;
476 			free(onext);
477 		}
478 	}
479 	(void) strcpy(optptr->zone_fsopt_opt, IPD_DEFAULT_OPTS);
480 	optptr->zone_fsopt_next = NULL;
481 	fsptr->zone_fs_options = optptr;
482 	return (0);
483 }
484 
485 static int
486 make_one_dir(zlog_t *zlogp, const char *prefix, const char *subdir, mode_t mode)
487 {
488 	char path[MAXPATHLEN];
489 	struct stat st;
490 
491 	if (snprintf(path, sizeof (path), "%s%s", prefix, subdir) >
492 	    sizeof (path)) {
493 		zerror(zlogp, B_FALSE, "pathname %s%s is too long", prefix,
494 		    subdir);
495 		return (-1);
496 	}
497 
498 	if (lstat(path, &st) == 0) {
499 		/*
500 		 * We don't check the file mode since presumably the zone
501 		 * administrator may have had good reason to change the mode,
502 		 * and we don't need to second guess him.
503 		 */
504 		if (!S_ISDIR(st.st_mode)) {
505 			if (is_system_labeled() &&
506 			    S_ISREG(st.st_mode)) {
507 				/*
508 				 * The need to mount readonly copies of
509 				 * global zone /etc/ files is unique to
510 				 * Trusted Extensions.
511 				 */
512 				if (strncmp(subdir, "/etc/",
513 				    strlen("/etc/")) != 0) {
514 					zerror(zlogp, B_FALSE,
515 					    "%s is not in /etc", path);
516 					return (-1);
517 				}
518 			} else {
519 				zerror(zlogp, B_FALSE,
520 				    "%s is not a directory", path);
521 				return (-1);
522 			}
523 		}
524 	} else if (mkdirp(path, mode) != 0) {
525 		if (errno == EROFS)
526 			zerror(zlogp, B_FALSE, "Could not mkdir %s.\nIt is on "
527 			    "a read-only file system in this local zone.\nMake "
528 			    "sure %s exists in the global zone.", path, subdir);
529 		else
530 			zerror(zlogp, B_TRUE, "mkdirp of %s failed", path);
531 		return (-1);
532 	}
533 	return (0);
534 }
535 
/*
 * Free a NULL-terminated array of strings as returned by
 * get_remote_fstypes(): every string first, then the array itself.
 * A NULL array is ignored.
 */
static void
free_remote_fstypes(char **types)
{
	char **cur;

	if (types == NULL)
		return;

	cur = types;
	while (*cur != NULL) {
		free(*cur);
		cur++;
	}
	free(types);
}
547 
548 static char **
549 get_remote_fstypes(zlog_t *zlogp)
550 {
551 	char **types = NULL;
552 	FILE *fp;
553 	char buf[MAXPATHLEN];
554 	char fstype[MAXPATHLEN];
555 	uint_t lines = 0;
556 	uint_t i;
557 
558 	if ((fp = fopen(DFSTYPES, "r")) == NULL) {
559 		zerror(zlogp, B_TRUE, "failed to open %s", DFSTYPES);
560 		return (NULL);
561 	}
562 	/*
563 	 * Count the number of lines
564 	 */
565 	while (fgets(buf, sizeof (buf), fp) != NULL)
566 		lines++;
567 	if (lines == 0)	/* didn't read anything; empty file */
568 		goto out;
569 	rewind(fp);
570 	/*
571 	 * Allocate enough space for a NULL-terminated array.
572 	 */
573 	types = calloc(lines + 1, sizeof (char *));
574 	if (types == NULL) {
575 		zerror(zlogp, B_TRUE, "memory allocation failed");
576 		goto out;
577 	}
578 	i = 0;
579 	while (fgets(buf, sizeof (buf), fp) != NULL) {
580 		/* LINTED - fstype is big enough to hold buf */
581 		if (sscanf(buf, "%s", fstype) == 0) {
582 			zerror(zlogp, B_FALSE, "unable to parse %s", DFSTYPES);
583 			free_remote_fstypes(types);
584 			types = NULL;
585 			goto out;
586 		}
587 		types[i] = strdup(fstype);
588 		if (types[i] == NULL) {
589 			zerror(zlogp, B_TRUE, "memory allocation failed");
590 			free_remote_fstypes(types);
591 			types = NULL;
592 			goto out;
593 		}
594 		i++;
595 	}
596 out:
597 	(void) fclose(fp);
598 	return (types);
599 }
600 
601 static boolean_t
602 is_remote_fstype(const char *fstype, char *const *remote_fstypes)
603 {
604 	uint_t i;
605 
606 	if (remote_fstypes == NULL)
607 		return (B_FALSE);
608 	for (i = 0; remote_fstypes[i] != NULL; i++) {
609 		if (strcmp(remote_fstypes[i], fstype) == 0)
610 			return (B_TRUE);
611 	}
612 	return (B_FALSE);
613 }
614 
615 /*
616  * This converts a zone root path (normally of the form .../root) to a Live
617  * Upgrade scratch zone root (of the form .../lu).
618  */
619 static void
620 root_to_lu(zlog_t *zlogp, char *zroot, size_t zrootlen, boolean_t isresolved)
621 {
622 	if (!isresolved && zonecfg_in_alt_root())
623 		resolve_lofs(zlogp, zroot, zrootlen);
624 	(void) strcpy(strrchr(zroot, '/') + 1, "lu");
625 }
626 
627 /*
628  * The general strategy for unmounting filesystems is as follows:
629  *
630  * - Remote filesystems may be dead, and attempting to contact them as
631  * part of a regular unmount may hang forever; we want to always try to
632  * forcibly unmount such filesystems and only fall back to regular
633  * unmounts if the filesystem doesn't support forced unmounts.
634  *
635  * - We don't want to unnecessarily corrupt metadata on local
636  * filesystems (ie UFS), so we want to start off with graceful unmounts,
637  * and only escalate to doing forced unmounts if we get stuck.
638  *
639  * We start off walking backwards through the mount table.  This doesn't
640  * give us strict ordering but ensures that we try to unmount submounts
641  * first.  We thus limit the number of failed umount2(2) calls.
642  *
643  * The mechanism for determining if we're stuck is to count the number
644  * of failed unmounts each iteration through the mount table.  This
645  * gives us an upper bound on the number of filesystems which remain
646  * mounted (autofs trigger nodes are dealt with separately).  If at the
647  * end of one unmount+autofs_cleanup cycle we still have the same number
648  * of mounts that we started out with, we're stuck and try a forced
649  * unmount.  If that fails (filesystem doesn't support forced unmounts)
650  * then we bail and are unable to teardown the zone.  If it succeeds,
651  * we're no longer stuck so we continue with our policy of trying
652  * graceful mounts first.
653  *
654  * Zone must be down (ie, no processes or threads active).
655  */
656 static int
657 unmount_filesystems(zlog_t *zlogp, zoneid_t zoneid, boolean_t unmount_cmd)
658 {
659 	int error = 0;
660 	FILE *mnttab;
661 	struct mnttab *mnts;
662 	uint_t nmnt;
663 	char zroot[MAXPATHLEN + 1];
664 	size_t zrootlen;
665 	uint_t oldcount = UINT_MAX;
666 	boolean_t stuck = B_FALSE;
667 	char **remote_fstypes = NULL;
668 
669 	if (zone_get_rootpath(zone_name, zroot, sizeof (zroot)) != Z_OK) {
670 		zerror(zlogp, B_FALSE, "unable to determine zone root");
671 		return (-1);
672 	}
673 	if (unmount_cmd)
674 		root_to_lu(zlogp, zroot, sizeof (zroot), B_FALSE);
675 
676 	(void) strcat(zroot, "/");
677 	zrootlen = strlen(zroot);
678 
679 	/*
680 	 * For Trusted Extensions unmount each higher level zone's mount
681 	 * of our zone's /export/home
682 	 */
683 	if (!unmount_cmd)
684 		tsol_unmounts(zlogp, zone_name);
685 
686 	if ((mnttab = fopen(MNTTAB, "r")) == NULL) {
687 		zerror(zlogp, B_TRUE, "failed to open %s", MNTTAB);
688 		return (-1);
689 	}
690 	/*
691 	 * Use our hacky mntfs ioctl so we see everything, even mounts with
692 	 * MS_NOMNTTAB.
693 	 */
694 	if (ioctl(fileno(mnttab), MNTIOC_SHOWHIDDEN, NULL) < 0) {
695 		zerror(zlogp, B_TRUE, "unable to configure %s", MNTTAB);
696 		error++;
697 		goto out;
698 	}
699 
700 	/*
701 	 * Build the list of remote fstypes so we know which ones we
702 	 * should forcibly unmount.
703 	 */
704 	remote_fstypes = get_remote_fstypes(zlogp);
705 	for (; /* ever */; ) {
706 		uint_t newcount = 0;
707 		boolean_t unmounted;
708 		struct mnttab *mnp;
709 		char *path;
710 		uint_t i;
711 
712 		mnts = NULL;
713 		nmnt = 0;
714 		/*
715 		 * MNTTAB gives us a way to walk through mounted
716 		 * filesystems; we need to be able to walk them in
717 		 * reverse order, so we build a list of all mounted
718 		 * filesystems.
719 		 */
720 		if (build_mnttable(zlogp, zroot, zrootlen, mnttab, &mnts,
721 		    &nmnt) != 0) {
722 			error++;
723 			goto out;
724 		}
725 		for (i = 0; i < nmnt; i++) {
726 			mnp = &mnts[nmnt - i - 1]; /* access in reverse order */
727 			path = mnp->mnt_mountp;
728 			unmounted = B_FALSE;
729 			/*
730 			 * Try forced unmount first for remote filesystems.
731 			 *
732 			 * Not all remote filesystems support forced unmounts,
733 			 * so if this fails (ENOTSUP) we'll continue on
734 			 * and try a regular unmount.
735 			 */
736 			if (is_remote_fstype(mnp->mnt_fstype, remote_fstypes)) {
737 				if (umount2(path, MS_FORCE) == 0)
738 					unmounted = B_TRUE;
739 			}
740 			/*
741 			 * Try forced unmount if we're stuck.
742 			 */
743 			if (stuck) {
744 				if (umount2(path, MS_FORCE) == 0) {
745 					unmounted = B_TRUE;
746 					stuck = B_FALSE;
747 				} else {
748 					/*
749 					 * The first failure indicates a
750 					 * mount we won't be able to get
751 					 * rid of automatically, so we
752 					 * bail.
753 					 */
754 					error++;
755 					zerror(zlogp, B_FALSE,
756 					    "unable to unmount '%s'", path);
757 					free_mnttable(mnts, nmnt);
758 					goto out;
759 				}
760 			}
761 			/*
762 			 * Try regular unmounts for everything else.
763 			 */
764 			if (!unmounted && umount2(path, 0) != 0)
765 				newcount++;
766 		}
767 		free_mnttable(mnts, nmnt);
768 
769 		if (newcount == 0)
770 			break;
771 		if (newcount >= oldcount) {
772 			/*
773 			 * Last round didn't unmount anything; we're stuck and
774 			 * should start trying forced unmounts.
775 			 */
776 			stuck = B_TRUE;
777 		}
778 		oldcount = newcount;
779 
780 		/*
781 		 * Autofs doesn't let you unmount its trigger nodes from
782 		 * userland so we have to tell the kernel to cleanup for us.
783 		 */
784 		if (autofs_cleanup(zoneid) != 0) {
785 			zerror(zlogp, B_TRUE, "unable to remove autofs nodes");
786 			error++;
787 			goto out;
788 		}
789 	}
790 
791 out:
792 	free_remote_fstypes(remote_fstypes);
793 	(void) fclose(mnttab);
794 	return (error ? -1 : 0);
795 }
796 
797 static int
798 fs_compare(const void *m1, const void *m2)
799 {
800 	struct zone_fstab *i = (struct zone_fstab *)m1;
801 	struct zone_fstab *j = (struct zone_fstab *)m2;
802 
803 	return (strcmp(i->zone_fs_dir, j->zone_fs_dir));
804 }
805 
806 /*
807  * Fork and exec (and wait for) the mentioned binary with the provided
808  * arguments.  Returns (-1) if something went wrong with fork(2) or exec(2),
809  * returns the exit status otherwise.
810  *
811  * If we were unable to exec the provided pathname (for whatever
812  * reason), we return the special token ZEXIT_EXEC.  The current value
813  * of ZEXIT_EXEC doesn't conflict with legitimate exit codes of the
814  * consumers of this function; any future consumers must make sure this
815  * remains the case.
816  */
817 static int
818 forkexec(zlog_t *zlogp, const char *path, char *const argv[])
819 {
820 	pid_t child_pid;
821 	int child_status = 0;
822 
823 	/*
824 	 * Do not let another thread localize a message while we are forking.
825 	 */
826 	(void) mutex_lock(&msglock);
827 	child_pid = fork();
828 	(void) mutex_unlock(&msglock);
829 	if (child_pid == -1) {
830 		zerror(zlogp, B_TRUE, "could not fork for %s", argv[0]);
831 		return (-1);
832 	} else if (child_pid == 0) {
833 		closefrom(0);
834 		/* redirect stdin, stdout & stderr to /dev/null */
835 		(void) open("/dev/null", O_RDONLY);	/* stdin */
836 		(void) open("/dev/null", O_WRONLY);	/* stdout */
837 		(void) open("/dev/null", O_WRONLY);	/* stderr */
838 		(void) execv(path, argv);
839 		/*
840 		 * Since we are in the child, there is no point calling zerror()
841 		 * since there is nobody waiting to consume it.  So exit with a
842 		 * special code that the parent will recognize and call zerror()
843 		 * accordingly.
844 		 */
845 
846 		_exit(ZEXIT_EXEC);
847 	} else {
848 		(void) waitpid(child_pid, &child_status, 0);
849 	}
850 
851 	if (WIFSIGNALED(child_status)) {
852 		zerror(zlogp, B_FALSE, "%s unexpectedly terminated due to "
853 		    "signal %d", path, WTERMSIG(child_status));
854 		return (-1);
855 	}
856 	assert(WIFEXITED(child_status));
857 	if (WEXITSTATUS(child_status) == ZEXIT_EXEC) {
858 		zerror(zlogp, B_FALSE, "failed to exec %s", path);
859 		return (-1);
860 	}
861 	return (WEXITSTATUS(child_status));
862 }
863 
864 static int
865 dofsck(zlog_t *zlogp, const char *fstype, const char *rawdev)
866 {
867 	char cmdbuf[MAXPATHLEN];
868 	char *argv[4];
869 	int status;
870 
871 	/*
872 	 * We could alternatively have called /usr/sbin/fsck -F <fstype>, but
873 	 * that would cost us an extra fork/exec without buying us anything.
874 	 */
875 	if (snprintf(cmdbuf, sizeof (cmdbuf), "/usr/lib/fs/%s/fsck", fstype)
876 	    > sizeof (cmdbuf)) {
877 		zerror(zlogp, B_FALSE, "file-system type %s too long", fstype);
878 		return (-1);
879 	}
880 
881 	argv[0] = "fsck";
882 	argv[1] = "-m";
883 	argv[2] = (char *)rawdev;
884 	argv[3] = NULL;
885 
886 	status = forkexec(zlogp, cmdbuf, argv);
887 	if (status == 0 || status == -1)
888 		return (status);
889 	zerror(zlogp, B_FALSE, "fsck of '%s' failed with exit status %d; "
890 	    "run fsck manually", rawdev, status);
891 	return (-1);
892 }
893 
894 static int
895 domount(zlog_t *zlogp, const char *fstype, const char *opts,
896     const char *special, const char *directory)
897 {
898 	char cmdbuf[MAXPATHLEN];
899 	char *argv[6];
900 	int status;
901 
902 	/*
903 	 * We could alternatively have called /usr/sbin/mount -F <fstype>, but
904 	 * that would cost us an extra fork/exec without buying us anything.
905 	 */
906 	if (snprintf(cmdbuf, sizeof (cmdbuf), "/usr/lib/fs/%s/mount", fstype)
907 	    > sizeof (cmdbuf)) {
908 		zerror(zlogp, B_FALSE, "file-system type %s too long", fstype);
909 		return (-1);
910 	}
911 	argv[0] = "mount";
912 	if (opts[0] == '\0') {
913 		argv[1] = (char *)special;
914 		argv[2] = (char *)directory;
915 		argv[3] = NULL;
916 	} else {
917 		argv[1] = "-o";
918 		argv[2] = (char *)opts;
919 		argv[3] = (char *)special;
920 		argv[4] = (char *)directory;
921 		argv[5] = NULL;
922 	}
923 
924 	status = forkexec(zlogp, cmdbuf, argv);
925 	if (status == 0 || status == -1)
926 		return (status);
927 	if (opts[0] == '\0')
928 		zerror(zlogp, B_FALSE, "\"%s %s %s\" "
929 		    "failed with exit code %d",
930 		    cmdbuf, special, directory, status);
931 	else
932 		zerror(zlogp, B_FALSE, "\"%s -o %s %s %s\" "
933 		    "failed with exit code %d",
934 		    cmdbuf, opts, special, directory, status);
935 	return (-1);
936 }
937 
938 /*
939  * Make sure if a given path exists, it is not a sym-link, and is a directory.
940  */
941 static int
942 check_path(zlog_t *zlogp, const char *path)
943 {
944 	struct stat statbuf;
945 	char respath[MAXPATHLEN];
946 	int res;
947 
948 	if (lstat(path, &statbuf) != 0) {
949 		if (errno == ENOENT)
950 			return (0);
951 		zerror(zlogp, B_TRUE, "can't stat %s", path);
952 		return (-1);
953 	}
954 	if (S_ISLNK(statbuf.st_mode)) {
955 		zerror(zlogp, B_FALSE, "%s is a symlink", path);
956 		return (-1);
957 	}
958 	if (!S_ISDIR(statbuf.st_mode)) {
959 		if (is_system_labeled() && S_ISREG(statbuf.st_mode)) {
960 			/*
961 			 * The need to mount readonly copies of
962 			 * global zone /etc/ files is unique to
963 			 * Trusted Extensions.
964 			 * The check for /etc/ via strstr() is to
965 			 * allow paths like $ZONEROOT/etc/passwd
966 			 */
967 			if (strstr(path, "/etc/") == NULL) {
968 				zerror(zlogp, B_FALSE,
969 				    "%s is not in /etc", path);
970 				return (-1);
971 			}
972 		} else {
973 			zerror(zlogp, B_FALSE, "%s is not a directory", path);
974 			return (-1);
975 		}
976 	}
977 	if ((res = resolvepath(path, respath, sizeof (respath))) == -1) {
978 		zerror(zlogp, B_TRUE, "unable to resolve path %s", path);
979 		return (-1);
980 	}
981 	respath[res] = '\0';
982 	if (strcmp(path, respath) != 0) {
983 		/*
984 		 * We don't like ".."s and "."s throwing us off
985 		 */
986 		zerror(zlogp, B_FALSE, "%s is not a canonical path", path);
987 		return (-1);
988 	}
989 	return (0);
990 }
991 
992 /*
993  * Check every component of rootpath/relpath.  If any component fails (ie,
994  * exists but isn't the canonical path to a directory), it is returned in
995  * badpath, which is assumed to be at least of size MAXPATHLEN.
996  *
997  * Relpath must begin with '/'.
998  */
999 static boolean_t
1000 valid_mount_path(zlog_t *zlogp, const char *rootpath, const char *relpath)
1001 {
1002 	char abspath[MAXPATHLEN], *slashp;
1003 
1004 	/*
1005 	 * Make sure abspath has at least one '/' after its rootpath
1006 	 * component, and ends with '/'.
1007 	 */
1008 	if (snprintf(abspath, sizeof (abspath), "%s%s/", rootpath, relpath) >
1009 	    sizeof (abspath)) {
1010 		zerror(zlogp, B_FALSE, "pathname %s%s is too long", rootpath,
1011 		    relpath);
1012 		return (B_FALSE);
1013 	}
1014 
1015 	slashp = &abspath[strlen(rootpath)];
1016 	assert(*slashp == '/');
1017 	do {
1018 		*slashp = '\0';
1019 		if (check_path(zlogp, abspath) != 0)
1020 			return (B_FALSE);
1021 		*slashp = '/';
1022 		slashp++;
1023 	} while ((slashp = strchr(slashp, '/')) != NULL);
1024 	return (B_TRUE);
1025 }
1026 
1027 static int
1028 mount_one(zlog_t *zlogp, struct zone_fstab *fsptr, const char *rootpath)
1029 {
1030 	char    path[MAXPATHLEN];
1031 	char	specpath[MAXPATHLEN];
1032 	char    optstr[MAX_MNTOPT_STR];
1033 	zone_fsopt_t *optptr;
1034 
1035 	if (!valid_mount_path(zlogp, rootpath, fsptr->zone_fs_dir)) {
1036 		zerror(zlogp, B_FALSE, "%s%s is not a valid mount point",
1037 		    rootpath, fsptr->zone_fs_dir);
1038 		return (-1);
1039 	}
1040 
1041 	if (make_one_dir(zlogp, rootpath, fsptr->zone_fs_dir,
1042 	    DEFAULT_DIR_MODE) != 0)
1043 		return (-1);
1044 
1045 	(void) snprintf(path, sizeof (path), "%s%s", rootpath,
1046 	    fsptr->zone_fs_dir);
1047 
1048 	if (strlen(fsptr->zone_fs_special) == 0) {
1049 		/*
1050 		 * A zero-length special is how we distinguish IPDs from
1051 		 * general-purpose FSs.  Make sure it mounts from a place that
1052 		 * can be seen via the alternate zone's root.
1053 		 */
1054 		if (snprintf(specpath, sizeof (specpath), "%s%s",
1055 		    zonecfg_get_root(), fsptr->zone_fs_dir) >=
1056 		    sizeof (specpath)) {
1057 			zerror(zlogp, B_FALSE, "cannot mount %s: path too "
1058 			    "long in alternate root", fsptr->zone_fs_dir);
1059 			return (-1);
1060 		}
1061 		if (zonecfg_in_alt_root())
1062 			resolve_lofs(zlogp, specpath, sizeof (specpath));
1063 		if (domount(zlogp, MNTTYPE_LOFS, IPD_DEFAULT_OPTS,
1064 		    specpath, path) != 0) {
1065 			zerror(zlogp, B_TRUE, "failed to loopback mount %s",
1066 			    specpath);
1067 			return (-1);
1068 		}
1069 		return (0);
1070 	}
1071 
1072 	/*
1073 	 * In general the strategy here is to do just as much verification as
1074 	 * necessary to avoid crashing or otherwise doing something bad; if the
1075 	 * administrator initiated the operation via zoneadm(1m), he'll get
1076 	 * auto-verification which will let him know what's wrong.  If he
1077 	 * modifies the zone configuration of a running zone and doesn't attempt
1078 	 * to verify that it's OK we won't crash but won't bother trying to be
1079 	 * too helpful either.  zoneadm verify is only a couple keystrokes away.
1080 	 */
1081 	if (!zonecfg_valid_fs_type(fsptr->zone_fs_type)) {
1082 		zerror(zlogp, B_FALSE, "cannot mount %s on %s: "
1083 		    "invalid file-system type %s", fsptr->zone_fs_special,
1084 		    fsptr->zone_fs_dir, fsptr->zone_fs_type);
1085 		return (-1);
1086 	}
1087 
1088 	/*
1089 	 * If we're looking at an alternate root environment, then construct
1090 	 * read-only loopback mounts as necessary.  For all lofs mounts, make
1091 	 * sure that the 'special' entry points inside the alternate root.  (We
1092 	 * don't do this with other mounts, as devfs isn't in the alternate
1093 	 * root, and we need to assume the device environment is roughly the
1094 	 * same.)
1095 	 */
1096 	if (zonecfg_in_alt_root()) {
1097 		struct stat64 st;
1098 
1099 		if (stat64(fsptr->zone_fs_special, &st) != -1 &&
1100 		    S_ISBLK(st.st_mode) &&
1101 		    check_lofs_needed(zlogp, fsptr) == -1)
1102 			return (-1);
1103 		if (strcmp(fsptr->zone_fs_type, MNTTYPE_LOFS) == 0) {
1104 			if (snprintf(specpath, sizeof (specpath), "%s%s",
1105 			    zonecfg_get_root(), fsptr->zone_fs_special) >=
1106 			    sizeof (specpath)) {
1107 				zerror(zlogp, B_FALSE, "cannot mount %s: path "
1108 				    "too long in alternate root",
1109 				    fsptr->zone_fs_special);
1110 				return (-1);
1111 			}
1112 			resolve_lofs(zlogp, specpath, sizeof (specpath));
1113 			(void) strlcpy(fsptr->zone_fs_special, specpath,
1114 			    sizeof (fsptr->zone_fs_special));
1115 		}
1116 	}
1117 
1118 	/*
1119 	 * Run 'fsck -m' if there's a device to fsck.
1120 	 */
1121 	if (fsptr->zone_fs_raw[0] != '\0' &&
1122 	    dofsck(zlogp, fsptr->zone_fs_type, fsptr->zone_fs_raw) != 0)
1123 		return (-1);
1124 
1125 	/*
1126 	 * Build up mount option string.
1127 	 */
1128 	optstr[0] = '\0';
1129 	if (fsptr->zone_fs_options != NULL) {
1130 		(void) strlcpy(optstr, fsptr->zone_fs_options->zone_fsopt_opt,
1131 		    sizeof (optstr));
1132 		for (optptr = fsptr->zone_fs_options->zone_fsopt_next;
1133 		    optptr != NULL; optptr = optptr->zone_fsopt_next) {
1134 			(void) strlcat(optstr, ",", sizeof (optstr));
1135 			(void) strlcat(optstr, optptr->zone_fsopt_opt,
1136 			    sizeof (optstr));
1137 		}
1138 	}
1139 	return (domount(zlogp, fsptr->zone_fs_type, optstr,
1140 	    fsptr->zone_fs_special, path));
1141 }
1142 
1143 static void
1144 free_fs_data(struct zone_fstab *fsarray, uint_t nelem)
1145 {
1146 	uint_t i;
1147 
1148 	if (fsarray == NULL)
1149 		return;
1150 	for (i = 0; i < nelem; i++)
1151 		zonecfg_free_fs_option_list(fsarray[i].zone_fs_options);
1152 	free(fsarray);
1153 }
1154 
1155 /*
1156  * This function initiates the creation of a small Solaris Environment for
1157  * scratch zone. The Environment creation process is split up into two
1158  * functions(build_mounted_pre_var() and build_mounted_post_var()). It
1159  * is done this way because:
1160  * 	We need to have both /etc and /var in the root of the scratchzone.
1161  * 	We loopback mount zone's own /etc and /var into the root of the
1162  * 	scratch zone. Unlike /etc, /var can be a seperate filesystem. So we
1163  * 	need to delay the mount of /var till the zone's root gets populated.
1164  *	So mounting of localdirs[](/etc and /var) have been moved to the
1165  * 	build_mounted_post_var() which gets called only after the zone
1166  * 	specific filesystems are mounted.
1167  */
1168 static boolean_t
1169 build_mounted_pre_var(zlog_t *zlogp, char *rootpath,
1170     size_t rootlen, const char *zonepath)
1171 {
1172 	char tmp[MAXPATHLEN], fromdir[MAXPATHLEN];
1173 	char luroot[MAXPATHLEN];
1174 	const char **cpp;
1175 	static const char *mkdirs[] = {
1176 		"/system", "/system/contract", "/system/object", "/proc",
1177 		"/dev", "/tmp", "/a", NULL
1178 	};
1179 	char *altstr;
1180 	FILE *fp;
1181 	uuid_t uuid;
1182 
1183 	resolve_lofs(zlogp, rootpath, rootlen);
1184 	(void) snprintf(luroot, sizeof (luroot), "%s/lu", zonepath);
1185 	resolve_lofs(zlogp, luroot, sizeof (luroot));
1186 	(void) snprintf(tmp, sizeof (tmp), "%s/bin", luroot);
1187 	(void) symlink("./usr/bin", tmp);
1188 
1189 	/*
1190 	 * These are mostly special mount points; not handled here.  (See
1191 	 * zone_mount_early.)
1192 	 */
1193 	for (cpp = mkdirs; *cpp != NULL; cpp++) {
1194 		(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1195 		if (mkdir(tmp, 0755) != 0) {
1196 			zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1197 			return (B_FALSE);
1198 		}
1199 	}
1200 	/*
1201 	 * This is here to support lucopy.  If there's an instance of this same
1202 	 * zone on the current running system, then we mount its root up as
1203 	 * read-only inside the scratch zone.
1204 	 */
1205 	(void) zonecfg_get_uuid(zone_name, uuid);
1206 	altstr = strdup(zonecfg_get_root());
1207 	if (altstr == NULL) {
1208 		zerror(zlogp, B_TRUE, "memory allocation failed");
1209 		return (B_FALSE);
1210 	}
1211 	zonecfg_set_root("");
1212 	(void) strlcpy(tmp, zone_name, sizeof (tmp));
1213 	(void) zonecfg_get_name_by_uuid(uuid, tmp, sizeof (tmp));
1214 	if (zone_get_rootpath(tmp, fromdir, sizeof (fromdir)) == Z_OK &&
1215 	    strcmp(fromdir, rootpath) != 0) {
1216 		(void) snprintf(tmp, sizeof (tmp), "%s/b", luroot);
1217 		if (mkdir(tmp, 0755) != 0) {
1218 			zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1219 			return (B_FALSE);
1220 		}
1221 		if (domount(zlogp, MNTTYPE_LOFS, IPD_DEFAULT_OPTS, fromdir,
1222 		    tmp) != 0) {
1223 			zerror(zlogp, B_TRUE, "cannot mount %s on %s", tmp,
1224 			    fromdir);
1225 			return (B_FALSE);
1226 		}
1227 	}
1228 	zonecfg_set_root(altstr);
1229 	free(altstr);
1230 
1231 	if ((fp = zonecfg_open_scratch(luroot, B_TRUE)) == NULL) {
1232 		zerror(zlogp, B_TRUE, "cannot open zone mapfile");
1233 		return (B_FALSE);
1234 	}
1235 	(void) ftruncate(fileno(fp), 0);
1236 	if (zonecfg_add_scratch(fp, zone_name, kernzone, "/") == -1) {
1237 		zerror(zlogp, B_TRUE, "cannot add zone mapfile entry");
1238 	}
1239 	zonecfg_close_scratch(fp);
1240 	(void) snprintf(tmp, sizeof (tmp), "%s/a", luroot);
1241 	if (domount(zlogp, MNTTYPE_LOFS, "", rootpath, tmp) != 0)
1242 		return (B_FALSE);
1243 	(void) strlcpy(rootpath, tmp, rootlen);
1244 	return (B_TRUE);
1245 }
1246 
1247 
1248 static boolean_t
1249 build_mounted_post_var(zlog_t *zlogp, char *rootpath, const char *zonepath)
1250 {
1251 	char tmp[MAXPATHLEN], fromdir[MAXPATHLEN];
1252 	char luroot[MAXPATHLEN];
1253 	const char **cpp;
1254 	static const char *localdirs[] = {
1255 		"/etc", "/var", NULL
1256 	};
1257 	static const char *loopdirs[] = {
1258 		"/etc/lib", "/etc/fs", "/lib", "/sbin", "/platform",
1259 		"/usr", NULL
1260 	};
1261 	static const char *tmpdirs[] = {
1262 		"/tmp", "/var/run", NULL
1263 	};
1264 	struct stat st;
1265 
1266 	(void) snprintf(luroot, sizeof (luroot), "%s/lu", zonepath);
1267 
1268 	/*
1269 	 * These are mounted read-write from the zone undergoing upgrade.  We
1270 	 * must be careful not to 'leak' things from the main system into the
1271 	 * zone, and this accomplishes that goal.
1272 	 */
1273 	for (cpp = localdirs; *cpp != NULL; cpp++) {
1274 		(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1275 		(void) snprintf(fromdir, sizeof (fromdir), "%s%s", rootpath,
1276 		    *cpp);
1277 		if (mkdir(tmp, 0755) != 0) {
1278 			zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1279 			return (B_FALSE);
1280 		}
1281 		if (domount(zlogp, MNTTYPE_LOFS, "", fromdir, tmp) != 0) {
1282 			zerror(zlogp, B_TRUE, "cannot mount %s on %s", tmp,
1283 			    *cpp);
1284 			return (B_FALSE);
1285 		}
1286 	}
1287 
1288 	/*
1289 	 * These are things mounted read-only from the running system because
1290 	 * they contain binaries that must match system.
1291 	 */
1292 	for (cpp = loopdirs; *cpp != NULL; cpp++) {
1293 		(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1294 		if (mkdir(tmp, 0755) != 0) {
1295 			if (errno != EEXIST) {
1296 				zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1297 				return (B_FALSE);
1298 			}
1299 			if (lstat(tmp, &st) != 0) {
1300 				zerror(zlogp, B_TRUE, "cannot stat %s", tmp);
1301 				return (B_FALSE);
1302 			}
1303 			/*
1304 			 * Ignore any non-directories encountered.  These are
1305 			 * things that have been converted into symlinks
1306 			 * (/etc/fs and /etc/lib) and no longer need a lofs
1307 			 * fixup.
1308 			 */
1309 			if (!S_ISDIR(st.st_mode))
1310 				continue;
1311 		}
1312 		if (domount(zlogp, MNTTYPE_LOFS, IPD_DEFAULT_OPTS, *cpp,
1313 		    tmp) != 0) {
1314 			zerror(zlogp, B_TRUE, "cannot mount %s on %s", tmp,
1315 			    *cpp);
1316 			return (B_FALSE);
1317 		}
1318 	}
1319 
1320 	/*
1321 	 * These are things with tmpfs mounted inside.
1322 	 */
1323 	for (cpp = tmpdirs; *cpp != NULL; cpp++) {
1324 		(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1325 		if (mkdir(tmp, 0755) != 0 && errno != EEXIST) {
1326 			zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1327 			return (B_FALSE);
1328 		}
1329 		if (domount(zlogp, MNTTYPE_TMPFS, "", "swap", tmp) != 0) {
1330 			zerror(zlogp, B_TRUE, "cannot mount swap on %s", *cpp);
1331 			return (B_FALSE);
1332 		}
1333 	}
1334 	return (B_TRUE);
1335 }
1336 
1337 static int
1338 mount_filesystems(zlog_t *zlogp, boolean_t mount_cmd)
1339 {
1340 	char	rootpath[MAXPATHLEN];
1341 	char	zonepath[MAXPATHLEN];
1342 	int	num_fs = 0, i;
1343 	struct zone_fstab fstab, *fs_ptr = NULL, *tmp_ptr;
1344 	struct zone_fstab *fsp;
1345 	zone_dochandle_t handle = NULL;
1346 	zone_state_t zstate;
1347 
1348 	if (zone_get_state(zone_name, &zstate) != Z_OK ||
1349 	    (zstate != ZONE_STATE_READY && zstate != ZONE_STATE_MOUNTED)) {
1350 		zerror(zlogp, B_FALSE,
1351 		    "zone must be in '%s' or '%s' state to mount file-systems",
1352 		    zone_state_str(ZONE_STATE_READY),
1353 		    zone_state_str(ZONE_STATE_MOUNTED));
1354 		goto bad;
1355 	}
1356 
1357 	if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
1358 		zerror(zlogp, B_TRUE, "unable to determine zone path");
1359 		goto bad;
1360 	}
1361 
1362 	if (zone_get_rootpath(zone_name, rootpath, sizeof (rootpath)) != Z_OK) {
1363 		zerror(zlogp, B_TRUE, "unable to determine zone root");
1364 		goto bad;
1365 	}
1366 
1367 	if ((handle = zonecfg_init_handle()) == NULL) {
1368 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
1369 		goto bad;
1370 	}
1371 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK ||
1372 	    zonecfg_setfsent(handle) != Z_OK) {
1373 		zerror(zlogp, B_FALSE, "invalid configuration");
1374 		goto bad;
1375 	}
1376 
1377 	/*
1378 	 * Iterate through the rest of the filesystems, first the IPDs, then
1379 	 * the general FSs.  Sort them all, then mount them in sorted order.
1380 	 * This is to make sure the higher level directories (e.g., /usr)
1381 	 * get mounted before any beneath them (e.g., /usr/local).
1382 	 */
1383 	if (zonecfg_setipdent(handle) != Z_OK) {
1384 		zerror(zlogp, B_FALSE, "invalid configuration");
1385 		goto bad;
1386 	}
1387 	while (zonecfg_getipdent(handle, &fstab) == Z_OK) {
1388 		num_fs++;
1389 		if ((tmp_ptr = realloc(fs_ptr,
1390 		    num_fs * sizeof (*tmp_ptr))) == NULL) {
1391 			zerror(zlogp, B_TRUE, "memory allocation failed");
1392 			num_fs--;
1393 			(void) zonecfg_endipdent(handle);
1394 			goto bad;
1395 		}
1396 		fs_ptr = tmp_ptr;
1397 		fsp = &fs_ptr[num_fs - 1];
1398 		/*
1399 		 * IPDs logically only have a mount point; all other properties
1400 		 * are implied.
1401 		 */
1402 		(void) strlcpy(fsp->zone_fs_dir,
1403 		    fstab.zone_fs_dir, sizeof (fsp->zone_fs_dir));
1404 		fsp->zone_fs_special[0] = '\0';
1405 		fsp->zone_fs_raw[0] = '\0';
1406 		fsp->zone_fs_type[0] = '\0';
1407 		fsp->zone_fs_options = NULL;
1408 	}
1409 	(void) zonecfg_endipdent(handle);
1410 
1411 	if (zonecfg_setfsent(handle) != Z_OK) {
1412 		zerror(zlogp, B_FALSE, "invalid configuration");
1413 		goto bad;
1414 	}
1415 	while (zonecfg_getfsent(handle, &fstab) == Z_OK) {
1416 		/*
1417 		 * ZFS filesystems will not be accessible under an alternate
1418 		 * root, since the pool will not be known.  Ignore them in this
1419 		 * case.
1420 		 */
1421 		if (mount_cmd && strcmp(fstab.zone_fs_type, MNTTYPE_ZFS) == 0)
1422 			continue;
1423 
1424 		num_fs++;
1425 		if ((tmp_ptr = realloc(fs_ptr,
1426 		    num_fs * sizeof (*tmp_ptr))) == NULL) {
1427 			zerror(zlogp, B_TRUE, "memory allocation failed");
1428 			num_fs--;
1429 			(void) zonecfg_endfsent(handle);
1430 			goto bad;
1431 		}
1432 		fs_ptr = tmp_ptr;
1433 		fsp = &fs_ptr[num_fs - 1];
1434 		(void) strlcpy(fsp->zone_fs_dir,
1435 		    fstab.zone_fs_dir, sizeof (fsp->zone_fs_dir));
1436 		(void) strlcpy(fsp->zone_fs_special, fstab.zone_fs_special,
1437 		    sizeof (fsp->zone_fs_special));
1438 		(void) strlcpy(fsp->zone_fs_raw, fstab.zone_fs_raw,
1439 		    sizeof (fsp->zone_fs_raw));
1440 		(void) strlcpy(fsp->zone_fs_type, fstab.zone_fs_type,
1441 		    sizeof (fsp->zone_fs_type));
1442 		fsp->zone_fs_options = fstab.zone_fs_options;
1443 	}
1444 	(void) zonecfg_endfsent(handle);
1445 	zonecfg_fini_handle(handle);
1446 	handle = NULL;
1447 
1448 	/*
1449 	 * When we're mounting a zone for administration, / is the
1450 	 * scratch zone and dev is mounted at /dev.  The to-be-upgraded
1451 	 * zone is mounted at /a, and we set up that environment so that
1452 	 * process can access both the running system's utilities
1453 	 * and the to-be-modified zone's files.  The only exception
1454 	 * is the zone's /dev which isn't mounted at all, which is
1455 	 * the same as global zone installation where /a/dev and
1456 	 * /a/devices are not mounted.
1457 	 * Zone mounting is done in three phases.
1458 	 *   1) Create and populate lu directory (build_mounted_pre_var()).
1459 	 *   2) Mount the required filesystems as per the zone configuration.
1460 	 *   3) Set up the rest of the scratch zone environment
1461 	 *	(build_mounted_post_var()).
1462 	 */
1463 	if (mount_cmd &&
1464 	    !build_mounted_pre_var(zlogp,
1465 	    rootpath, sizeof (rootpath), zonepath))
1466 		goto bad;
1467 
1468 	qsort(fs_ptr, num_fs, sizeof (*fs_ptr), fs_compare);
1469 	for (i = 0; i < num_fs; i++) {
1470 		if (mount_one(zlogp, &fs_ptr[i], rootpath) != 0)
1471 			goto bad;
1472 	}
1473 	if (mount_cmd &&
1474 	    !build_mounted_post_var(zlogp, rootpath, zonepath))
1475 		goto bad;
1476 
1477 	/*
1478 	 * For Trusted Extensions cross-mount each lower level /export/home
1479 	 */
1480 	if (!mount_cmd && tsol_mounts(zlogp, zone_name, rootpath) != 0)
1481 		goto bad;
1482 
1483 	free_fs_data(fs_ptr, num_fs);
1484 
1485 	/*
1486 	 * Everything looks fine.
1487 	 */
1488 	return (0);
1489 
1490 bad:
1491 	if (handle != NULL)
1492 		zonecfg_fini_handle(handle);
1493 	free_fs_data(fs_ptr, num_fs);
1494 	return (-1);
1495 }
1496 
/*
 * Convert the decimal prefix length in prefixstr into netmask bytes.
 *
 * One 0xFF byte is stored at maskstr for every full 8 bits of prefix; any
 * remaining high-order bits are OR'ed into the following byte, so the
 * caller is expected to pass in a zeroed buffer.  Bytes beyond the prefix
 * are not touched.
 *
 * Returns 0 on success.  Returns 1, without writing to maskstr, if prefixstr
 * is not a complete decimal number (empty, non-numeric, or followed by
 * trailing characters) or lies outside the range 0..maxprefixlen.
 *
 * caller makes sure neither parameter is NULL
 */
static int
addr2netmask(char *prefixstr, int maxprefixlen, uchar_t *maskstr)
{
	char *endp;
	long prefixlen;

	/*
	 * strtol() lets us tell "no digits at all" and trailing garbage
	 * apart from a genuine prefix of 0; out-of-range input saturates
	 * to LONG_MIN/LONG_MAX and is caught by the range check.
	 */
	prefixlen = strtol(prefixstr, &endp, 10);
	if (endp == prefixstr || *endp != '\0' ||
	    prefixlen < 0 || prefixlen > maxprefixlen)
		return (1);

	while (prefixlen >= 8) {
		*maskstr++ = 0xFF;
		prefixlen -= 8;
	}
	if (prefixlen > 0)
		*maskstr |= (uchar_t)(0xFF << (8 - prefixlen));
	return (0);
}
1517 
1518 /*
1519  * Tear down all interfaces belonging to the given zone.  This should
1520  * be called with the zone in a state other than "running", so that
1521  * interfaces can't be assigned to the zone after this returns.
1522  *
1523  * If anything goes wrong, log an error message and return an error.
1524  */
1525 static int
1526 unconfigure_network_interfaces(zlog_t *zlogp, zoneid_t zone_id)
1527 {
1528 	struct lifnum lifn;
1529 	struct lifconf lifc;
1530 	struct lifreq *lifrp, lifrl;
1531 	int64_t lifc_flags = LIFC_NOXMIT | LIFC_ALLZONES;
1532 	int num_ifs, s, i, ret_code = 0;
1533 	uint_t bufsize;
1534 	char *buf = NULL;
1535 
1536 	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
1537 		zerror(zlogp, B_TRUE, "could not get socket");
1538 		ret_code = -1;
1539 		goto bad;
1540 	}
1541 	lifn.lifn_family = AF_UNSPEC;
1542 	lifn.lifn_flags = (int)lifc_flags;
1543 	if (ioctl(s, SIOCGLIFNUM, (char *)&lifn) < 0) {
1544 		zerror(zlogp, B_TRUE,
1545 		    "could not determine number of interfaces");
1546 		ret_code = -1;
1547 		goto bad;
1548 	}
1549 	num_ifs = lifn.lifn_count;
1550 	bufsize = num_ifs * sizeof (struct lifreq);
1551 	if ((buf = malloc(bufsize)) == NULL) {
1552 		zerror(zlogp, B_TRUE, "memory allocation failed");
1553 		ret_code = -1;
1554 		goto bad;
1555 	}
1556 	lifc.lifc_family = AF_UNSPEC;
1557 	lifc.lifc_flags = (int)lifc_flags;
1558 	lifc.lifc_len = bufsize;
1559 	lifc.lifc_buf = buf;
1560 	if (ioctl(s, SIOCGLIFCONF, (char *)&lifc) < 0) {
1561 		zerror(zlogp, B_TRUE, "could not get configured interfaces");
1562 		ret_code = -1;
1563 		goto bad;
1564 	}
1565 	lifrp = lifc.lifc_req;
1566 	for (i = lifc.lifc_len / sizeof (struct lifreq); i > 0; i--, lifrp++) {
1567 		(void) close(s);
1568 		if ((s = socket(lifrp->lifr_addr.ss_family, SOCK_DGRAM, 0)) <
1569 		    0) {
1570 			zerror(zlogp, B_TRUE, "%s: could not get socket",
1571 			    lifrl.lifr_name);
1572 			ret_code = -1;
1573 			continue;
1574 		}
1575 		(void) memset(&lifrl, 0, sizeof (lifrl));
1576 		(void) strncpy(lifrl.lifr_name, lifrp->lifr_name,
1577 		    sizeof (lifrl.lifr_name));
1578 		if (ioctl(s, SIOCGLIFZONE, (caddr_t)&lifrl) < 0) {
1579 			zerror(zlogp, B_TRUE,
1580 			    "%s: could not determine zone interface belongs to",
1581 			    lifrl.lifr_name);
1582 			ret_code = -1;
1583 			continue;
1584 		}
1585 		if (lifrl.lifr_zoneid == zone_id) {
1586 			if (ioctl(s, SIOCLIFREMOVEIF, (caddr_t)&lifrl) < 0) {
1587 				zerror(zlogp, B_TRUE,
1588 				    "%s: could not remove interface",
1589 				    lifrl.lifr_name);
1590 				ret_code = -1;
1591 				continue;
1592 			}
1593 		}
1594 	}
1595 bad:
1596 	if (s > 0)
1597 		(void) close(s);
1598 	if (buf)
1599 		free(buf);
1600 	return (ret_code);
1601 }
1602 
/*
 * Scratch storage for who_is_using()'s routing-socket query.  so_dst carries
 * the address being looked up and so_ifp is an AF_LINK placeholder; both are
 * copied into rtmsg.space, behind the rt_msghdr, before the message is
 * written to the routing socket.
 */
static union	sockunion {
	struct	sockaddr sa;
	struct	sockaddr_in sin;
	struct	sockaddr_dl sdl;
	struct	sockaddr_in6 sin6;
} so_dst, so_ifp;

/*
 * Routing message buffer: the header followed by room for the sockaddrs
 * that accompany it.  who_is_using() uses it for both the request it writes
 * and the reply it reads back.
 */
static struct {
	struct	rt_msghdr hdr;
	char	space[512];
} rtmsg;
1614 
/*
 * Return the size of the concrete sockaddr structure that matches
 * sa->sa_family; families other than AF_INET, AF_LINK and AF_INET6 are
 * given the size of the generic struct sockaddr.
 */
static int
salen(struct sockaddr *sa)
{
	if (sa->sa_family == AF_INET)
		return (sizeof (struct sockaddr_in));
	if (sa->sa_family == AF_LINK)
		return (sizeof (struct sockaddr_dl));
	if (sa->sa_family == AF_INET6)
		return (sizeof (struct sockaddr_in6));
	return (sizeof (struct sockaddr));
}
1629 
/*
 * Round a up to the next multiple of sizeof (long); values that are zero
 * or negative yield sizeof (long).  The bit trick relies on sizeof (long)
 * being a power of two.  Note that a is evaluated more than once.
 */
#define	ROUNDUP_LONG(a) \
	((a) > 0 ? (1 + (((a) - 1) | (sizeof (long) - 1))) : sizeof (long))
1632 
/*
 * Look up which zone is using a given IP address.  The address in question
 * is expected to have been stuffed into the structure to which lifr points
 * via a previous SIOCGLIFADDR ioctl().
 *
 * This is done using black router socket magic: an RTM_GET request for the
 * address is written to a routing socket, the interface name is taken from
 * the RTA_IFP sockaddr in the reply, and SIOCGLIFZONE is then issued for
 * that interface.
 *
 * Return the name of the zone on success or NULL on failure.  If the zone
 * ID cannot be turned into a name, the decimal zone ID is returned instead.
 * The result lives in a static buffer that is overwritten by the next call.
 * lifr->lifr_name is preserved across the call; lifr->lifr_zoneid is not.
 * The routing socket is closed on every return path.
 *
 * This is a lot of code for a simple task; a new ioctl request to take care
 * of this might be a useful RFE.
 */

static char *
who_is_using(zlog_t *zlogp, struct lifreq *lifr)
{
	static char answer[ZONENAME_MAX];
	pid_t pid;
	int s, rlen, l, i;
	char *cp = rtmsg.space;
	char *result = NULL;
	struct sockaddr_dl *ifp = NULL;
	struct sockaddr *sa;
	char save_if_name[LIFNAMSIZ];

	answer[0] = '\0';

	pid = getpid();
	if ((s = socket(PF_ROUTE, SOCK_RAW, 0)) < 0) {
		zerror(zlogp, B_TRUE, "could not get routing socket");
		return (NULL);
	}

	/*
	 * so_dst and so_ifp are file-scope and survive between calls; clear
	 * them so that no bytes of an earlier (possibly different family)
	 * address end up in this request.
	 */
	(void) memset(&so_dst, 0, sizeof (so_dst));
	(void) memset(&so_ifp, 0, sizeof (so_ifp));

	if (lifr->lifr_addr.ss_family == AF_INET) {
		struct sockaddr_in *sin4;

		so_dst.sa.sa_family = AF_INET;
		sin4 = (struct sockaddr_in *)&lifr->lifr_addr;
		so_dst.sin.sin_addr = sin4->sin_addr;
	} else {
		struct sockaddr_in6 *sin6;

		so_dst.sa.sa_family = AF_INET6;
		sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr;
		so_dst.sin6.sin6_addr = sin6->sin6_addr;
	}

	so_ifp.sa.sa_family = AF_LINK;

	(void) memset(&rtmsg, 0, sizeof (rtmsg));
	rtmsg.hdr.rtm_type = RTM_GET;
	rtmsg.hdr.rtm_flags = RTF_UP | RTF_HOST;
	rtmsg.hdr.rtm_version = RTM_VERSION;
	rtmsg.hdr.rtm_seq = ++rts_seqno;
	rtmsg.hdr.rtm_addrs = RTA_IFP | RTA_DST;

	/* Append the destination and interface sockaddrs to the header. */
	l = ROUNDUP_LONG(salen(&so_dst.sa));
	(void) memmove(cp, &(so_dst), l);
	cp += l;
	l = ROUNDUP_LONG(salen(&so_ifp.sa));
	(void) memmove(cp, &(so_ifp), l);
	cp += l;

	rtmsg.hdr.rtm_msglen = l = cp - (char *)&rtmsg;

	if ((rlen = write(s, &rtmsg, l)) < 0) {
		zerror(zlogp, B_TRUE, "writing to routing socket");
		goto out;
	} else if (rlen < (int)rtmsg.hdr.rtm_msglen) {
		zerror(zlogp, B_TRUE,
		    "write to routing socket got only %d for len\n", rlen);
		goto out;
	}
	/* Skip messages that are not the reply to our own request. */
	do {
		l = read(s, &rtmsg, sizeof (rtmsg));
	} while (l > 0 && (rtmsg.hdr.rtm_seq != rts_seqno ||
	    rtmsg.hdr.rtm_pid != pid));
	if (l < 0) {
		zerror(zlogp, B_TRUE, "reading from routing socket");
		goto out;
	}

	if (rtmsg.hdr.rtm_version != RTM_VERSION) {
		zerror(zlogp, B_FALSE,
		    "routing message version %d not understood",
		    rtmsg.hdr.rtm_version);
		goto out;
	}
	if (rtmsg.hdr.rtm_msglen != (ushort_t)l) {
		zerror(zlogp, B_FALSE, "message length mismatch, "
		    "expected %d bytes, returned %d bytes",
		    rtmsg.hdr.rtm_msglen, l);
		goto out;
	}
	if (rtmsg.hdr.rtm_errno != 0)  {
		errno = rtmsg.hdr.rtm_errno;
		zerror(zlogp, B_TRUE, "RTM_GET routing socket message");
		goto out;
	}
	if ((rtmsg.hdr.rtm_addrs & RTA_IFP) == 0) {
		zerror(zlogp, B_FALSE, "interface not found");
		goto out;
	}
	/*
	 * Walk the sockaddrs that follow the header, one per bit set in
	 * rtm_addrs, until the RTA_IFP entry is reached.
	 */
	cp = ((char *)(&rtmsg.hdr + 1));
	for (i = 1; i != 0; i <<= 1) {
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		sa = (struct sockaddr *)cp;
		if (i != RTA_IFP) {
			if ((i & rtmsg.hdr.rtm_addrs) != 0)
				cp += ROUNDUP_LONG(salen(sa));
			continue;
		}
		if (sa->sa_family == AF_LINK &&
		    ((struct sockaddr_dl *)sa)->sdl_nlen != 0)
			ifp = (struct sockaddr_dl *)sa;
		break;
	}
	if (ifp == NULL) {
		zerror(zlogp, B_FALSE, "interface could not be determined");
		goto out;
	}
	/* The name plus its terminator must fit in lifr->lifr_name. */
	if (ifp->sdl_nlen >= sizeof (lifr->lifr_name)) {
		zerror(zlogp, B_FALSE, "interface name is too long");
		goto out;
	}

	/*
	 * We need to set the I/F name to what we got above, then do the
	 * appropriate ioctl to get its zone name.  But lifr->lifr_name is
	 * used by the calling function to do a REMOVEIF, so if we leave the
	 * "good" zone's I/F name in place, *that* I/F will be removed instead
	 * of the bad one.  So we save the old (bad) I/F name before over-
	 * writing it and doing the ioctl, then restore it after the ioctl.
	 */
	(void) strlcpy(save_if_name, lifr->lifr_name, sizeof (save_if_name));
	(void) strncpy(lifr->lifr_name, ifp->sdl_data, ifp->sdl_nlen);
	lifr->lifr_name[ifp->sdl_nlen] = '\0';
	i = ioctl(s, SIOCGLIFZONE, lifr);
	(void) strlcpy(lifr->lifr_name, save_if_name, sizeof (save_if_name));
	if (i < 0) {
		zerror(zlogp, B_TRUE,
		    "%s: could not determine the zone interface belongs to",
		    lifr->lifr_name);
		goto out;
	}
	if (getzonenamebyid(lifr->lifr_zoneid, answer, sizeof (answer)) < 0)
		(void) snprintf(answer, sizeof (answer), "%d",
		    lifr->lifr_zoneid);

	if (strlen(answer) > 0)
		result = answer;

out:
	(void) close(s);
	return (result);
}
1781 
/*
 * Layout of the routing-socket message built by configure_one_interface():
 * an rt_msghdr followed by destination, gateway and netmask sockaddrs, in
 * either IPv4 (m_v4) or IPv6 (m_v6) form.  The m_*4 and m_*6 macros below
 * are shorthands for the members of the corresponding union arm.
 */
typedef struct mcast_rtmsg_s {
	struct rt_msghdr	m_rtm;
	union {
		struct {
			struct sockaddr_in	m_dst;
			struct sockaddr_in	m_gw;
			struct sockaddr_in	m_netmask;
		} m_v4;
		struct {
			struct sockaddr_in6	m_dst;
			struct sockaddr_in6	m_gw;
			struct sockaddr_in6	m_netmask;
		} m_v6;
	} m_u;
} mcast_rtmsg_t;
#define	m_dst4		m_u.m_v4.m_dst
#define	m_dst6		m_u.m_v6.m_dst
#define	m_gw4		m_u.m_v4.m_gw
#define	m_gw6		m_u.m_v6.m_gw
#define	m_netmask4	m_u.m_v4.m_netmask
#define	m_netmask6	m_u.m_v6.m_netmask
1803 
1804 /*
1805  * Configures a single interface: a new virtual interface is added, based on
1806  * the physical interface nwiftabptr->zone_nwif_physical, with the address
1807  * specified in nwiftabptr->zone_nwif_address, for zone zone_id.  Note that
1808  * the "address" can be an IPv6 address (with a /prefixlength required), an
1809  * IPv4 address (with a /prefixlength optional), or a name; for the latter,
1810  * an IPv4 name-to-address resolution will be attempted.
1811  *
1812  * A default interface route for multicast is created on the first IPv4 and
1813  * IPv6 interfaces (that have the IFF_MULTICAST flag set), respectively.
1814  * This should really be done in the init scripts if we ever allow zones to
1815  * modify the routing tables.
1816  *
 * If anything goes wrong, we log a detailed error message, attempt to tear
1818  * down whatever we set up and return an error.
1819  */
1820 static int
1821 configure_one_interface(zlog_t *zlogp, zoneid_t zone_id,
1822     struct zone_nwiftab *nwiftabptr, boolean_t *mcast_rt_v4_setp,
1823     boolean_t *mcast_rt_v6_setp)
1824 {
1825 	struct lifreq lifr;
1826 	struct sockaddr_in netmask4;
1827 	struct sockaddr_in6 netmask6;
1828 	struct in_addr in4;
1829 	struct in6_addr in6;
1830 	sa_family_t af;
1831 	char *slashp = strchr(nwiftabptr->zone_nwif_address, '/');
1832 	mcast_rtmsg_t mcast_rtmsg;
1833 	int s;
1834 	int rs;
1835 	int rlen;
1836 	boolean_t got_netmask = B_FALSE;
1837 	char addrstr4[INET_ADDRSTRLEN];
1838 	int res;
1839 
1840 	res = zonecfg_valid_net_address(nwiftabptr->zone_nwif_address, &lifr);
1841 	if (res != Z_OK) {
1842 		zerror(zlogp, B_FALSE, "%s: %s", zonecfg_strerror(res),
1843 		    nwiftabptr->zone_nwif_address);
1844 		return (-1);
1845 	}
1846 	af = lifr.lifr_addr.ss_family;
1847 	if (af == AF_INET)
1848 		in4 = ((struct sockaddr_in *)(&lifr.lifr_addr))->sin_addr;
1849 	else
1850 		in6 = ((struct sockaddr_in6 *)(&lifr.lifr_addr))->sin6_addr;
1851 
1852 	if ((s = socket(af, SOCK_DGRAM, 0)) < 0) {
1853 		zerror(zlogp, B_TRUE, "could not get socket");
1854 		return (-1);
1855 	}
1856 
1857 	(void) strlcpy(lifr.lifr_name, nwiftabptr->zone_nwif_physical,
1858 	    sizeof (lifr.lifr_name));
1859 	if (ioctl(s, SIOCLIFADDIF, (caddr_t)&lifr) < 0) {
1860 		/*
1861 		 * Here, we know that the interface can't be brought up.
1862 		 * A similar warning message was already printed out to
1863 		 * the console by zoneadm(1M) so instead we log the
1864 		 * message to syslog and continue.
1865 		 */
1866 		zerror(&logsys, B_TRUE, "WARNING: skipping interface "
1867 		    "'%s' which may not be present/plumbed in the "
1868 		    "global zone.", lifr.lifr_name);
1869 		(void) close(s);
1870 		return (Z_OK);
1871 	}
1872 
1873 	if (ioctl(s, SIOCSLIFADDR, (caddr_t)&lifr) < 0) {
1874 		zerror(zlogp, B_TRUE,
1875 		    "%s: could not set IP address to %s",
1876 		    lifr.lifr_name, nwiftabptr->zone_nwif_address);
1877 		goto bad;
1878 	}
1879 
1880 	/* Preserve literal IPv4 address for later potential printing. */
1881 	if (af == AF_INET)
1882 		(void) inet_ntop(AF_INET, &in4, addrstr4, INET_ADDRSTRLEN);
1883 
1884 	lifr.lifr_zoneid = zone_id;
1885 	if (ioctl(s, SIOCSLIFZONE, (caddr_t)&lifr) < 0) {
1886 		zerror(zlogp, B_TRUE, "%s: could not place interface into zone",
1887 		    lifr.lifr_name);
1888 		goto bad;
1889 	}
1890 
1891 	if (strcmp(nwiftabptr->zone_nwif_physical, "lo0") == 0) {
1892 		got_netmask = B_TRUE;	/* default setting will be correct */
1893 	} else {
1894 		if (af == AF_INET) {
1895 			/*
1896 			 * The IPv4 netmask can be determined either
1897 			 * directly if a prefix length was supplied with
1898 			 * the address or via the netmasks database.  Not
1899 			 * being able to determine it is a common failure,
1900 			 * but it often is not fatal to operation of the
1901 			 * interface.  In that case, a warning will be
1902 			 * printed after the rest of the interface's
1903 			 * parameters have been configured.
1904 			 */
1905 			(void) memset(&netmask4, 0, sizeof (netmask4));
1906 			if (slashp != NULL) {
1907 				if (addr2netmask(slashp + 1, V4_ADDR_LEN,
1908 				    (uchar_t *)&netmask4.sin_addr) != 0) {
1909 					*slashp = '/';
1910 					zerror(zlogp, B_FALSE,
1911 					    "%s: invalid prefix length in %s",
1912 					    lifr.lifr_name,
1913 					    nwiftabptr->zone_nwif_address);
1914 					goto bad;
1915 				}
1916 				got_netmask = B_TRUE;
1917 			} else if (getnetmaskbyaddr(in4,
1918 			    &netmask4.sin_addr) == 0) {
1919 				got_netmask = B_TRUE;
1920 			}
1921 			if (got_netmask) {
1922 				netmask4.sin_family = af;
1923 				(void) memcpy(&lifr.lifr_addr, &netmask4,
1924 				    sizeof (netmask4));
1925 			}
1926 		} else {
1927 			(void) memset(&netmask6, 0, sizeof (netmask6));
1928 			if (addr2netmask(slashp + 1, V6_ADDR_LEN,
1929 			    (uchar_t *)&netmask6.sin6_addr) != 0) {
1930 				*slashp = '/';
1931 				zerror(zlogp, B_FALSE,
1932 				    "%s: invalid prefix length in %s",
1933 				    lifr.lifr_name,
1934 				    nwiftabptr->zone_nwif_address);
1935 				goto bad;
1936 			}
1937 			got_netmask = B_TRUE;
1938 			netmask6.sin6_family = af;
1939 			(void) memcpy(&lifr.lifr_addr, &netmask6,
1940 			    sizeof (netmask6));
1941 		}
1942 		if (got_netmask &&
1943 		    ioctl(s, SIOCSLIFNETMASK, (caddr_t)&lifr) < 0) {
1944 			zerror(zlogp, B_TRUE, "%s: could not set netmask",
1945 			    lifr.lifr_name);
1946 			goto bad;
1947 		}
1948 
1949 		/*
1950 		 * This doesn't set the broadcast address at all. Rather, it
1951 		 * gets, then sets the interface's address, relying on the fact
1952 		 * that resetting the address will reset the broadcast address.
1953 		 */
1954 		if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) < 0) {
1955 			zerror(zlogp, B_TRUE, "%s: could not get address",
1956 			    lifr.lifr_name);
1957 			goto bad;
1958 		}
1959 		if (ioctl(s, SIOCSLIFADDR, (caddr_t)&lifr) < 0) {
1960 			zerror(zlogp, B_TRUE,
1961 			    "%s: could not reset broadcast address",
1962 			    lifr.lifr_name);
1963 			goto bad;
1964 		}
1965 	}
1966 
1967 	if (ioctl(s, SIOCGLIFFLAGS, (caddr_t)&lifr) < 0) {
1968 		zerror(zlogp, B_TRUE, "%s: could not get flags",
1969 		    lifr.lifr_name);
1970 		goto bad;
1971 	}
1972 	lifr.lifr_flags |= IFF_UP;
1973 	if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) {
1974 		int save_errno = errno;
1975 		char *zone_using;
1976 
1977 		/*
1978 		 * If we failed with something other than EADDRNOTAVAIL,
1979 		 * then skip to the end.  Otherwise, look up our address,
1980 		 * then call a function to determine which zone is already
1981 		 * using that address.
1982 		 */
1983 		if (errno != EADDRNOTAVAIL) {
1984 			zerror(zlogp, B_TRUE,
1985 			    "%s: could not bring interface up", lifr.lifr_name);
1986 			goto bad;
1987 		}
1988 		if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) < 0) {
1989 			zerror(zlogp, B_TRUE, "%s: could not get address",
1990 			    lifr.lifr_name);
1991 			goto bad;
1992 		}
1993 		zone_using = who_is_using(zlogp, &lifr);
1994 		errno = save_errno;
1995 		if (zone_using == NULL)
1996 			zerror(zlogp, B_TRUE,
1997 			    "%s: could not bring interface up", lifr.lifr_name);
1998 		else
1999 			zerror(zlogp, B_TRUE, "%s: could not bring interface "
2000 			    "up: address in use by zone '%s'", lifr.lifr_name,
2001 			    zone_using);
2002 		goto bad;
2003 	}
2004 	if ((lifr.lifr_flags & IFF_MULTICAST) && ((af == AF_INET &&
2005 	    mcast_rt_v4_setp != NULL && *mcast_rt_v4_setp == B_FALSE) ||
2006 	    (af == AF_INET6 &&
2007 	    mcast_rt_v6_setp != NULL && *mcast_rt_v6_setp == B_FALSE))) {
2008 		rs = socket(PF_ROUTE, SOCK_RAW, 0);
2009 		if (rs < 0) {
2010 			zerror(zlogp, B_TRUE, "%s: could not create "
2011 			    "routing socket", lifr.lifr_name);
2012 			goto bad;
2013 		}
2014 		(void) shutdown(rs, 0);
2015 		(void) memset((void *)&mcast_rtmsg, 0, sizeof (mcast_rtmsg_t));
2016 		mcast_rtmsg.m_rtm.rtm_msglen =  sizeof (struct rt_msghdr) +
2017 		    3 * (af == AF_INET ? sizeof (struct sockaddr_in) :
2018 		    sizeof (struct sockaddr_in6));
2019 		mcast_rtmsg.m_rtm.rtm_version = RTM_VERSION;
2020 		mcast_rtmsg.m_rtm.rtm_type = RTM_ADD;
2021 		mcast_rtmsg.m_rtm.rtm_flags = RTF_UP;
2022 		mcast_rtmsg.m_rtm.rtm_addrs =
2023 		    RTA_DST | RTA_GATEWAY | RTA_NETMASK;
2024 		mcast_rtmsg.m_rtm.rtm_seq = ++rts_seqno;
2025 		if (af == AF_INET) {
2026 			mcast_rtmsg.m_dst4.sin_family = AF_INET;
2027 			mcast_rtmsg.m_dst4.sin_addr.s_addr =
2028 			    htonl(INADDR_UNSPEC_GROUP);
2029 			mcast_rtmsg.m_gw4.sin_family = AF_INET;
2030 			mcast_rtmsg.m_gw4.sin_addr = in4;
2031 			mcast_rtmsg.m_netmask4.sin_family = AF_INET;
2032 			mcast_rtmsg.m_netmask4.sin_addr.s_addr =
2033 			    htonl(IN_CLASSD_NET);
2034 		} else {
2035 			mcast_rtmsg.m_dst6.sin6_family = AF_INET6;
2036 			mcast_rtmsg.m_dst6.sin6_addr.s6_addr[0] = 0xffU;
2037 			mcast_rtmsg.m_gw6.sin6_family = AF_INET6;
2038 			mcast_rtmsg.m_gw6.sin6_addr = in6;
2039 			mcast_rtmsg.m_netmask6.sin6_family = AF_INET6;
2040 			mcast_rtmsg.m_netmask6.sin6_addr.s6_addr[0] = 0xffU;
2041 		}
2042 		rlen = write(rs, (char *)&mcast_rtmsg,
2043 		    mcast_rtmsg.m_rtm.rtm_msglen);
2044 		/*
2045 		 * The write to the multicast socket will fail if the
2046 		 * interface belongs to a failed IPMP group. This is a
2047 		 * non-fatal error and the zone will continue booting.
2048 		 * While the zone is running, if any interface in the
2049 		 * failed IPMP group recovers, the zone will fallback to
2050 		 * using that interface.
2051 		 */
2052 		if (rlen < mcast_rtmsg.m_rtm.rtm_msglen) {
2053 			if (rlen < 0) {
2054 				zerror(zlogp, B_TRUE, "WARNING: interface "
2055 				    "'%s' not available as default for "
2056 				    "multicast.", lifr.lifr_name);
2057 			} else {
2058 				zerror(zlogp, B_FALSE, "WARNING: interface "
2059 				    "'%s' not available as default for "
2060 				    "multicast; routing socket returned "
2061 				    "unexpected %d bytes.",
2062 				    lifr.lifr_name, rlen);
2063 			}
2064 		} else {
2065 
2066 			if (af == AF_INET) {
2067 				*mcast_rt_v4_setp = B_TRUE;
2068 			} else {
2069 				*mcast_rt_v6_setp = B_TRUE;
2070 			}
2071 		}
2072 		(void) close(rs);
2073 	}
2074 
2075 	if (!got_netmask) {
2076 		/*
2077 		 * A common, but often non-fatal problem, is that the system
2078 		 * cannot find the netmask for an interface address. This is
2079 		 * often caused by it being only in /etc/inet/netmasks, but
2080 		 * /etc/nsswitch.conf says to use NIS or NIS+ and it's not
2081 		 * in that. This doesn't show up at boot because the netmask
2082 		 * is obtained from /etc/inet/netmasks when no network
2083 		 * interfaces are up, but isn't consulted when NIS/NIS+ is
2084 		 * available. We warn the user here that something like this
2085 		 * has happened and we're just running with a default and
2086 		 * possible incorrect netmask.
2087 		 */
2088 		char buffer[INET6_ADDRSTRLEN];
2089 		void  *addr;
2090 
2091 		if (af == AF_INET)
2092 			addr = &((struct sockaddr_in *)
2093 			    (&lifr.lifr_addr))->sin_addr;
2094 		else
2095 			addr = &((struct sockaddr_in6 *)
2096 			    (&lifr.lifr_addr))->sin6_addr;
2097 
2098 		/* Find out what netmask interface is going to be using */
2099 		if (ioctl(s, SIOCGLIFNETMASK, (caddr_t)&lifr) < 0 ||
2100 		    inet_ntop(af, addr, buffer, sizeof (buffer)) == NULL)
2101 			goto bad;
2102 		zerror(zlogp, B_FALSE,
2103 		    "WARNING: %s: no matching subnet found in netmasks(4) for "
2104 		    "%s; using default of %s.",
2105 		    lifr.lifr_name, addrstr4, buffer);
2106 	}
2107 
2108 	(void) close(s);
2109 	return (Z_OK);
2110 bad:
2111 	(void) ioctl(s, SIOCLIFREMOVEIF, (caddr_t)&lifr);
2112 	(void) close(s);
2113 	return (-1);
2114 }
2115 
2116 /*
2117  * Sets up network interfaces based on information from the zone configuration.
2118  * An IPv4 loopback interface is set up "for free", modeling the global system.
2119  * If any of the configuration interfaces were IPv6, then an IPv6 loopback
2120  * address is set up as well.
2121  *
2122  * If anything goes wrong, we log a general error message, attempt to tear down
2123  * whatever we set up, and return an error.
2124  */
2125 static int
2126 configure_network_interfaces(zlog_t *zlogp)
2127 {
2128 	zone_dochandle_t handle;
2129 	struct zone_nwiftab nwiftab, loopback_iftab;
2130 	boolean_t saw_v6 = B_FALSE;
2131 	boolean_t mcast_rt_v4_set = B_FALSE;
2132 	boolean_t mcast_rt_v6_set = B_FALSE;
2133 	zoneid_t zoneid;
2134 
2135 	if ((zoneid = getzoneidbyname(zone_name)) == ZONE_ID_UNDEFINED) {
2136 		zerror(zlogp, B_TRUE, "unable to get zoneid");
2137 		return (-1);
2138 	}
2139 
2140 	if ((handle = zonecfg_init_handle()) == NULL) {
2141 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2142 		return (-1);
2143 	}
2144 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2145 		zerror(zlogp, B_FALSE, "invalid configuration");
2146 		zonecfg_fini_handle(handle);
2147 		return (-1);
2148 	}
2149 	if (zonecfg_setnwifent(handle) == Z_OK) {
2150 		for (;;) {
2151 			struct in6_addr in6;
2152 
2153 			if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
2154 				break;
2155 			if (configure_one_interface(zlogp, zoneid,
2156 			    &nwiftab, &mcast_rt_v4_set, &mcast_rt_v6_set) !=
2157 			    Z_OK) {
2158 				(void) zonecfg_endnwifent(handle);
2159 				zonecfg_fini_handle(handle);
2160 				return (-1);
2161 			}
2162 			if (inet_pton(AF_INET6, nwiftab.zone_nwif_address,
2163 			    &in6) == 1)
2164 				saw_v6 = B_TRUE;
2165 		}
2166 		(void) zonecfg_endnwifent(handle);
2167 	}
2168 	zonecfg_fini_handle(handle);
2169 	(void) strlcpy(loopback_iftab.zone_nwif_physical, "lo0",
2170 	    sizeof (loopback_iftab.zone_nwif_physical));
2171 	(void) strlcpy(loopback_iftab.zone_nwif_address, "127.0.0.1",
2172 	    sizeof (loopback_iftab.zone_nwif_address));
2173 	if (configure_one_interface(zlogp, zoneid, &loopback_iftab, NULL, NULL)
2174 	    != Z_OK) {
2175 		return (-1);
2176 	}
2177 	if (saw_v6) {
2178 		(void) strlcpy(loopback_iftab.zone_nwif_address, "::1/128",
2179 		    sizeof (loopback_iftab.zone_nwif_address));
2180 		if (configure_one_interface(zlogp, zoneid,
2181 		    &loopback_iftab, NULL, NULL) != Z_OK) {
2182 			return (-1);
2183 		}
2184 	}
2185 	return (0);
2186 }
2187 
2188 static int
2189 tcp_abort_conn(zlog_t *zlogp, zoneid_t zoneid,
2190     const struct sockaddr_storage *local, const struct sockaddr_storage *remote)
2191 {
2192 	int fd;
2193 	struct strioctl ioc;
2194 	tcp_ioc_abort_conn_t conn;
2195 	int error;
2196 
2197 	conn.ac_local = *local;
2198 	conn.ac_remote = *remote;
2199 	conn.ac_start = TCPS_SYN_SENT;
2200 	conn.ac_end = TCPS_TIME_WAIT;
2201 	conn.ac_zoneid = zoneid;
2202 
2203 	ioc.ic_cmd = TCP_IOC_ABORT_CONN;
2204 	ioc.ic_timout = -1; /* infinite timeout */
2205 	ioc.ic_len = sizeof (conn);
2206 	ioc.ic_dp = (char *)&conn;
2207 
2208 	if ((fd = open("/dev/tcp", O_RDONLY)) < 0) {
2209 		zerror(zlogp, B_TRUE, "unable to open %s", "/dev/tcp");
2210 		return (-1);
2211 	}
2212 
2213 	error = ioctl(fd, I_STR, &ioc);
2214 	(void) close(fd);
2215 	if (error == 0 || errno == ENOENT)	/* ENOENT is not an error */
2216 		return (0);
2217 	return (-1);
2218 }
2219 
2220 static int
2221 tcp_abort_connections(zlog_t *zlogp, zoneid_t zoneid)
2222 {
2223 	struct sockaddr_storage l, r;
2224 	struct sockaddr_in *local, *remote;
2225 	struct sockaddr_in6 *local6, *remote6;
2226 	int error;
2227 
2228 	/*
2229 	 * Abort IPv4 connections.
2230 	 */
2231 	bzero(&l, sizeof (*local));
2232 	local = (struct sockaddr_in *)&l;
2233 	local->sin_family = AF_INET;
2234 	local->sin_addr.s_addr = INADDR_ANY;
2235 	local->sin_port = 0;
2236 
2237 	bzero(&r, sizeof (*remote));
2238 	remote = (struct sockaddr_in *)&r;
2239 	remote->sin_family = AF_INET;
2240 	remote->sin_addr.s_addr = INADDR_ANY;
2241 	remote->sin_port = 0;
2242 
2243 	if ((error = tcp_abort_conn(zlogp, zoneid, &l, &r)) != 0)
2244 		return (error);
2245 
2246 	/*
2247 	 * Abort IPv6 connections.
2248 	 */
2249 	bzero(&l, sizeof (*local6));
2250 	local6 = (struct sockaddr_in6 *)&l;
2251 	local6->sin6_family = AF_INET6;
2252 	local6->sin6_port = 0;
2253 	local6->sin6_addr = in6addr_any;
2254 
2255 	bzero(&r, sizeof (*remote6));
2256 	remote6 = (struct sockaddr_in6 *)&r;
2257 	remote6->sin6_family = AF_INET6;
2258 	remote6->sin6_port = 0;
2259 	remote6->sin6_addr = in6addr_any;
2260 
2261 	if ((error = tcp_abort_conn(zlogp, zoneid, &l, &r)) != 0)
2262 		return (error);
2263 	return (0);
2264 }
2265 
2266 static int
2267 get_privset(zlog_t *zlogp, priv_set_t *privs, boolean_t mount_cmd)
2268 {
2269 	int error = -1;
2270 	zone_dochandle_t handle;
2271 	char *privname = NULL;
2272 
2273 	if (mount_cmd) {
2274 		if (zonecfg_default_privset(privs) == Z_OK)
2275 			return (0);
2276 		zerror(zlogp, B_FALSE,
2277 		    "failed to determine the zone's default privilege set");
2278 		return (-1);
2279 	}
2280 
2281 	if ((handle = zonecfg_init_handle()) == NULL) {
2282 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2283 		return (-1);
2284 	}
2285 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2286 		zerror(zlogp, B_FALSE, "invalid configuration");
2287 		zonecfg_fini_handle(handle);
2288 		return (-1);
2289 	}
2290 
2291 	switch (zonecfg_get_privset(handle, privs, &privname)) {
2292 	case Z_OK:
2293 		error = 0;
2294 		break;
2295 	case Z_PRIV_PROHIBITED:
2296 		zerror(zlogp, B_FALSE, "privilege \"%s\" is not permitted "
2297 		    "within the zone's privilege set", privname);
2298 		break;
2299 	case Z_PRIV_REQUIRED:
2300 		zerror(zlogp, B_FALSE, "required privilege \"%s\" is missing "
2301 		    "from the zone's privilege set", privname);
2302 		break;
2303 	case Z_PRIV_UNKNOWN:
2304 		zerror(zlogp, B_FALSE, "unknown privilege \"%s\" specified "
2305 		    "in the zone's privilege set", privname);
2306 		break;
2307 	default:
2308 		zerror(zlogp, B_FALSE, "failed to determine the zone's "
2309 		    "privilege set");
2310 		break;
2311 	}
2312 
2313 	free(privname);
2314 	zonecfg_fini_handle(handle);
2315 	return (error);
2316 }
2317 
2318 static int
2319 get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep)
2320 {
2321 	nvlist_t *nvl = NULL;
2322 	char *nvl_packed = NULL;
2323 	size_t nvl_size = 0;
2324 	nvlist_t **nvlv = NULL;
2325 	int rctlcount = 0;
2326 	int error = -1;
2327 	zone_dochandle_t handle;
2328 	struct zone_rctltab rctltab;
2329 	rctlblk_t *rctlblk = NULL;
2330 
2331 	*bufp = NULL;
2332 	*bufsizep = 0;
2333 
2334 	if ((handle = zonecfg_init_handle()) == NULL) {
2335 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2336 		return (-1);
2337 	}
2338 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2339 		zerror(zlogp, B_FALSE, "invalid configuration");
2340 		zonecfg_fini_handle(handle);
2341 		return (-1);
2342 	}
2343 
2344 	rctltab.zone_rctl_valptr = NULL;
2345 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) {
2346 		zerror(zlogp, B_TRUE, "%s failed", "nvlist_alloc");
2347 		goto out;
2348 	}
2349 
2350 	if (zonecfg_setrctlent(handle) != Z_OK) {
2351 		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setrctlent");
2352 		goto out;
2353 	}
2354 
2355 	if ((rctlblk = malloc(rctlblk_size())) == NULL) {
2356 		zerror(zlogp, B_TRUE, "memory allocation failed");
2357 		goto out;
2358 	}
2359 	while (zonecfg_getrctlent(handle, &rctltab) == Z_OK) {
2360 		struct zone_rctlvaltab *rctlval;
2361 		uint_t i, count;
2362 		const char *name = rctltab.zone_rctl_name;
2363 
2364 		/* zoneadm should have already warned about unknown rctls. */
2365 		if (!zonecfg_is_rctl(name)) {
2366 			zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr);
2367 			rctltab.zone_rctl_valptr = NULL;
2368 			continue;
2369 		}
2370 		count = 0;
2371 		for (rctlval = rctltab.zone_rctl_valptr; rctlval != NULL;
2372 		    rctlval = rctlval->zone_rctlval_next) {
2373 			count++;
2374 		}
2375 		if (count == 0) {	/* ignore */
2376 			continue;	/* Nothing to free */
2377 		}
2378 		if ((nvlv = malloc(sizeof (*nvlv) * count)) == NULL)
2379 			goto out;
2380 		i = 0;
2381 		for (rctlval = rctltab.zone_rctl_valptr; rctlval != NULL;
2382 		    rctlval = rctlval->zone_rctlval_next, i++) {
2383 			if (nvlist_alloc(&nvlv[i], NV_UNIQUE_NAME, 0) != 0) {
2384 				zerror(zlogp, B_TRUE, "%s failed",
2385 				    "nvlist_alloc");
2386 				goto out;
2387 			}
2388 			if (zonecfg_construct_rctlblk(rctlval, rctlblk)
2389 			    != Z_OK) {
2390 				zerror(zlogp, B_FALSE, "invalid rctl value: "
2391 				    "(priv=%s,limit=%s,action=%s)",
2392 				    rctlval->zone_rctlval_priv,
2393 				    rctlval->zone_rctlval_limit,
2394 				    rctlval->zone_rctlval_action);
2395 				goto out;
2396 			}
2397 			if (!zonecfg_valid_rctl(name, rctlblk)) {
2398 				zerror(zlogp, B_FALSE,
2399 				    "(priv=%s,limit=%s,action=%s) is not a "
2400 				    "valid value for rctl '%s'",
2401 				    rctlval->zone_rctlval_priv,
2402 				    rctlval->zone_rctlval_limit,
2403 				    rctlval->zone_rctlval_action,
2404 				    name);
2405 				goto out;
2406 			}
2407 			if (nvlist_add_uint64(nvlv[i], "privilege",
2408 			    rctlblk_get_privilege(rctlblk)) != 0) {
2409 				zerror(zlogp, B_FALSE, "%s failed",
2410 				    "nvlist_add_uint64");
2411 				goto out;
2412 			}
2413 			if (nvlist_add_uint64(nvlv[i], "limit",
2414 			    rctlblk_get_value(rctlblk)) != 0) {
2415 				zerror(zlogp, B_FALSE, "%s failed",
2416 				    "nvlist_add_uint64");
2417 				goto out;
2418 			}
2419 			if (nvlist_add_uint64(nvlv[i], "action",
2420 			    (uint_t)rctlblk_get_local_action(rctlblk, NULL))
2421 			    != 0) {
2422 				zerror(zlogp, B_FALSE, "%s failed",
2423 				    "nvlist_add_uint64");
2424 				goto out;
2425 			}
2426 		}
2427 		zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr);
2428 		rctltab.zone_rctl_valptr = NULL;
2429 		if (nvlist_add_nvlist_array(nvl, (char *)name, nvlv, count)
2430 		    != 0) {
2431 			zerror(zlogp, B_FALSE, "%s failed",
2432 			    "nvlist_add_nvlist_array");
2433 			goto out;
2434 		}
2435 		for (i = 0; i < count; i++)
2436 			nvlist_free(nvlv[i]);
2437 		free(nvlv);
2438 		nvlv = NULL;
2439 		rctlcount++;
2440 	}
2441 	(void) zonecfg_endrctlent(handle);
2442 
2443 	if (rctlcount == 0) {
2444 		error = 0;
2445 		goto out;
2446 	}
2447 	if (nvlist_pack(nvl, &nvl_packed, &nvl_size, NV_ENCODE_NATIVE, 0)
2448 	    != 0) {
2449 		zerror(zlogp, B_FALSE, "%s failed", "nvlist_pack");
2450 		goto out;
2451 	}
2452 
2453 	error = 0;
2454 	*bufp = nvl_packed;
2455 	*bufsizep = nvl_size;
2456 
2457 out:
2458 	free(rctlblk);
2459 	zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr);
2460 	if (error && nvl_packed != NULL)
2461 		free(nvl_packed);
2462 	if (nvl != NULL)
2463 		nvlist_free(nvl);
2464 	if (nvlv != NULL)
2465 		free(nvlv);
2466 	if (handle != NULL)
2467 		zonecfg_fini_handle(handle);
2468 	return (error);
2469 }
2470 
2471 static int
2472 get_zone_pool(zlog_t *zlogp, char *poolbuf, size_t bufsz)
2473 {
2474 	zone_dochandle_t handle;
2475 	int error;
2476 
2477 	if ((handle = zonecfg_init_handle()) == NULL) {
2478 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2479 		return (Z_NOMEM);
2480 	}
2481 	error = zonecfg_get_snapshot_handle(zone_name, handle);
2482 	if (error != Z_OK) {
2483 		zerror(zlogp, B_FALSE, "invalid configuration");
2484 		zonecfg_fini_handle(handle);
2485 		return (error);
2486 	}
2487 	error = zonecfg_get_pool(handle, poolbuf, bufsz);
2488 	zonecfg_fini_handle(handle);
2489 	return (error);
2490 }
2491 
2492 static int
2493 get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
2494 {
2495 	zone_dochandle_t handle;
2496 	struct zone_dstab dstab;
2497 	size_t total, offset, len;
2498 	int error = -1;
2499 	char *str;
2500 
2501 	*bufp = NULL;
2502 	*bufsizep = 0;
2503 
2504 	if ((handle = zonecfg_init_handle()) == NULL) {
2505 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2506 		return (-1);
2507 	}
2508 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2509 		zerror(zlogp, B_FALSE, "invalid configuration");
2510 		zonecfg_fini_handle(handle);
2511 		return (-1);
2512 	}
2513 
2514 	if (zonecfg_setdsent(handle) != Z_OK) {
2515 		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent");
2516 		goto out;
2517 	}
2518 
2519 	total = 0;
2520 	while (zonecfg_getdsent(handle, &dstab) == Z_OK)
2521 		total += strlen(dstab.zone_dataset_name) + 1;
2522 	(void) zonecfg_enddsent(handle);
2523 
2524 	if (total == 0) {
2525 		error = 0;
2526 		goto out;
2527 	}
2528 
2529 	if ((str = malloc(total)) == NULL) {
2530 		zerror(zlogp, B_TRUE, "memory allocation failed");
2531 		goto out;
2532 	}
2533 
2534 	if (zonecfg_setdsent(handle) != Z_OK) {
2535 		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent");
2536 		goto out;
2537 	}
2538 	offset = 0;
2539 	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
2540 		len = strlen(dstab.zone_dataset_name);
2541 		(void) strlcpy(str + offset, dstab.zone_dataset_name,
2542 		    sizeof (dstab.zone_dataset_name) - offset);
2543 		offset += len;
2544 		if (offset != total - 1)
2545 			str[offset++] = ',';
2546 	}
2547 	(void) zonecfg_enddsent(handle);
2548 
2549 	error = 0;
2550 	*bufp = str;
2551 	*bufsizep = total;
2552 
2553 out:
2554 	if (error != 0 && str != NULL)
2555 		free(str);
2556 	if (handle != NULL)
2557 		zonecfg_fini_handle(handle);
2558 
2559 	return (error);
2560 }
2561 
2562 static int
2563 validate_datasets(zlog_t *zlogp)
2564 {
2565 	zone_dochandle_t handle;
2566 	struct zone_dstab dstab;
2567 	zfs_handle_t *zhp;
2568 	libzfs_handle_t *hdl;
2569 
2570 	if ((handle = zonecfg_init_handle()) == NULL) {
2571 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2572 		return (-1);
2573 	}
2574 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2575 		zerror(zlogp, B_FALSE, "invalid configuration");
2576 		zonecfg_fini_handle(handle);
2577 		return (-1);
2578 	}
2579 
2580 	if (zonecfg_setdsent(handle) != Z_OK) {
2581 		zerror(zlogp, B_FALSE, "invalid configuration");
2582 		zonecfg_fini_handle(handle);
2583 		return (-1);
2584 	}
2585 
2586 	if ((hdl = libzfs_init()) == NULL) {
2587 		zerror(zlogp, B_FALSE, "opening ZFS library");
2588 		zonecfg_fini_handle(handle);
2589 		return (-1);
2590 	}
2591 
2592 	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
2593 
2594 		if ((zhp = zfs_open(hdl, dstab.zone_dataset_name,
2595 		    ZFS_TYPE_FILESYSTEM)) == NULL) {
2596 			zerror(zlogp, B_FALSE, "cannot open ZFS dataset '%s'",
2597 			    dstab.zone_dataset_name);
2598 			zonecfg_fini_handle(handle);
2599 			libzfs_fini(hdl);
2600 			return (-1);
2601 		}
2602 
2603 		/*
2604 		 * Automatically set the 'zoned' property.  We check the value
2605 		 * first because we'll get EPERM if it is already set.
2606 		 */
2607 		if (!zfs_prop_get_int(zhp, ZFS_PROP_ZONED) &&
2608 		    zfs_prop_set(zhp, ZFS_PROP_ZONED, "on") != 0) {
2609 			zerror(zlogp, B_FALSE, "cannot set 'zoned' "
2610 			    "property for ZFS dataset '%s'\n",
2611 			    dstab.zone_dataset_name);
2612 			zonecfg_fini_handle(handle);
2613 			zfs_close(zhp);
2614 			libzfs_fini(hdl);
2615 			return (-1);
2616 		}
2617 
2618 		zfs_close(zhp);
2619 	}
2620 	(void) zonecfg_enddsent(handle);
2621 
2622 	zonecfg_fini_handle(handle);
2623 	libzfs_fini(hdl);
2624 
2625 	return (0);
2626 }
2627 
2628 static int
2629 bind_to_pool(zlog_t *zlogp, zoneid_t zoneid)
2630 {
2631 	pool_conf_t *poolconf;
2632 	pool_t *pool;
2633 	char poolname[MAXPATHLEN];
2634 	int status;
2635 	int error;
2636 
2637 	/*
2638 	 * Find the pool mentioned in the zone configuration, and bind to it.
2639 	 */
2640 	error = get_zone_pool(zlogp, poolname, sizeof (poolname));
2641 	if (error == Z_NO_ENTRY || (error == Z_OK && strlen(poolname) == 0)) {
2642 		/*
2643 		 * The property is not set on the zone, so the pool
2644 		 * should be bound to the default pool.  But that's
2645 		 * already done by the kernel, so we can just return.
2646 		 */
2647 		return (0);
2648 	}
2649 	if (error != Z_OK) {
2650 		/*
2651 		 * Not an error, even though it shouldn't be happening.
2652 		 */
2653 		zerror(zlogp, B_FALSE,
2654 		    "WARNING: unable to retrieve default pool.");
2655 		return (0);
2656 	}
2657 	/*
2658 	 * Don't do anything if pools aren't enabled.
2659 	 */
2660 	if (pool_get_status(&status) != PO_SUCCESS || status != POOL_ENABLED) {
2661 		zerror(zlogp, B_FALSE, "WARNING: pools facility not active; "
2662 		    "zone will not be bound to pool '%s'.", poolname);
2663 		return (0);
2664 	}
2665 	/*
2666 	 * Try to provide a sane error message if the requested pool doesn't
2667 	 * exist.
2668 	 */
2669 	if ((poolconf = pool_conf_alloc()) == NULL) {
2670 		zerror(zlogp, B_FALSE, "%s failed", "pool_conf_alloc");
2671 		return (-1);
2672 	}
2673 	if (pool_conf_open(poolconf, pool_dynamic_location(), PO_RDONLY) !=
2674 	    PO_SUCCESS) {
2675 		zerror(zlogp, B_FALSE, "%s failed", "pool_conf_open");
2676 		pool_conf_free(poolconf);
2677 		return (-1);
2678 	}
2679 	pool = pool_get_pool(poolconf, poolname);
2680 	(void) pool_conf_close(poolconf);
2681 	pool_conf_free(poolconf);
2682 	if (pool == NULL) {
2683 		zerror(zlogp, B_FALSE, "WARNING: pool '%s' not found; "
2684 		    "using default pool.", poolname);
2685 		return (0);
2686 	}
2687 	/*
2688 	 * Bind the zone to the pool.
2689 	 */
2690 	if (pool_set_binding(poolname, P_ZONEID, zoneid) != PO_SUCCESS) {
2691 		zerror(zlogp, B_FALSE, "WARNING: unable to bind to pool '%s'; "
2692 		    "using default pool.", poolname);
2693 	}
2694 	return (0);
2695 }
2696 
2697 /*
2698  * Mount lower level home directories into/from current zone
2699  * Share exported directories specified in dfstab for zone
2700  */
2701 static int
2702 tsol_mounts(zlog_t *zlogp, char *zone_name, char *rootpath)
2703 {
2704 	zoneid_t *zids = NULL;
2705 	priv_set_t *zid_privs;
2706 	const priv_impl_info_t *ip = NULL;
2707 	uint_t nzents_saved;
2708 	uint_t nzents;
2709 	int i;
2710 	char readonly[] = "ro";
2711 	struct zone_fstab lower_fstab;
2712 	char *argv[4];
2713 
2714 	if (!is_system_labeled())
2715 		return (0);
2716 
2717 	if (zid_label == NULL) {
2718 		zid_label = m_label_alloc(MAC_LABEL);
2719 		if (zid_label == NULL)
2720 			return (-1);
2721 	}
2722 
2723 	/* Make sure our zone has an /export/home dir */
2724 	(void) make_one_dir(zlogp, rootpath, "/export/home",
2725 	    DEFAULT_DIR_MODE);
2726 
2727 	lower_fstab.zone_fs_raw[0] = '\0';
2728 	(void) strlcpy(lower_fstab.zone_fs_type, MNTTYPE_LOFS,
2729 	    sizeof (lower_fstab.zone_fs_type));
2730 	lower_fstab.zone_fs_options = NULL;
2731 	(void) zonecfg_add_fs_option(&lower_fstab, readonly);
2732 
2733 	/*
2734 	 * Get the list of zones from the kernel
2735 	 */
2736 	if (zone_list(NULL, &nzents) != 0) {
2737 		zerror(zlogp, B_TRUE, "unable to list zones");
2738 		zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
2739 		return (-1);
2740 	}
2741 again:
2742 	if (nzents == 0) {
2743 		zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
2744 		return (-1);
2745 	}
2746 
2747 	zids = malloc(nzents * sizeof (zoneid_t));
2748 	if (zids == NULL) {
2749 		zerror(zlogp, B_TRUE, "memory allocation failed");
2750 		return (-1);
2751 	}
2752 	nzents_saved = nzents;
2753 
2754 	if (zone_list(zids, &nzents) != 0) {
2755 		zerror(zlogp, B_TRUE, "unable to list zones");
2756 		zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
2757 		free(zids);
2758 		return (-1);
2759 	}
2760 	if (nzents != nzents_saved) {
2761 		/* list changed, try again */
2762 		free(zids);
2763 		goto again;
2764 	}
2765 
2766 	ip = getprivimplinfo();
2767 	if ((zid_privs = priv_allocset()) == NULL) {
2768 		zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
2769 		zonecfg_free_fs_option_list(
2770 		    lower_fstab.zone_fs_options);
2771 		free(zids);
2772 		return (-1);
2773 	}
2774 
2775 	for (i = 0; i < nzents; i++) {
2776 		char zid_name[ZONENAME_MAX];
2777 		zone_state_t zid_state;
2778 		char zid_rpath[MAXPATHLEN];
2779 		struct stat stat_buf;
2780 
2781 		if (zids[i] == GLOBAL_ZONEID)
2782 			continue;
2783 
2784 		if (getzonenamebyid(zids[i], zid_name, ZONENAME_MAX) == -1)
2785 			continue;
2786 
2787 		/*
2788 		 * Do special setup for the zone we are booting
2789 		 */
2790 		if (strcmp(zid_name, zone_name) == 0) {
2791 			struct zone_fstab autofs_fstab;
2792 			char map_path[MAXPATHLEN];
2793 			int fd;
2794 
2795 			/*
2796 			 * Create auto_home_<zone> map for this zone
2797 			 * in the global zone. The local zone entry
2798 			 * will be created by automount when the zone
2799 			 * is booted.
2800 			 */
2801 
2802 			(void) snprintf(autofs_fstab.zone_fs_special,
2803 			    MAXPATHLEN, "auto_home_%s", zid_name);
2804 
2805 			(void) snprintf(autofs_fstab.zone_fs_dir, MAXPATHLEN,
2806 			    "/zone/%s/home", zid_name);
2807 
2808 			(void) snprintf(map_path, sizeof (map_path),
2809 			    "/etc/%s", autofs_fstab.zone_fs_special);
2810 			/*
2811 			 * If the map file doesn't exist create a template
2812 			 */
2813 			if ((fd = open(map_path, O_RDWR | O_CREAT | O_EXCL,
2814 			    S_IRUSR | S_IWUSR | S_IRGRP| S_IROTH)) != -1) {
2815 				int len;
2816 				char map_rec[MAXPATHLEN];
2817 
2818 				len = snprintf(map_rec, sizeof (map_rec),
2819 				    "+%s\n*\t-fstype=lofs\t:%s/export/home/&\n",
2820 				    autofs_fstab.zone_fs_special, rootpath);
2821 				(void) write(fd, map_rec, len);
2822 				(void) close(fd);
2823 			}
2824 
2825 			/*
2826 			 * Mount auto_home_<zone> in the global zone if absent.
2827 			 * If it's already of type autofs, then
2828 			 * don't mount it again.
2829 			 */
2830 			if ((stat(autofs_fstab.zone_fs_dir, &stat_buf) == -1) ||
2831 			    strcmp(stat_buf.st_fstype, MNTTYPE_AUTOFS) != 0) {
2832 				char optstr[] = "indirect,ignore,nobrowse";
2833 
2834 				(void) make_one_dir(zlogp, "",
2835 				    autofs_fstab.zone_fs_dir, DEFAULT_DIR_MODE);
2836 
2837 				/*
2838 				 * Mount will fail if automounter has already
2839 				 * processed the auto_home_<zonename> map
2840 				 */
2841 				(void) domount(zlogp, MNTTYPE_AUTOFS, optstr,
2842 				    autofs_fstab.zone_fs_special,
2843 				    autofs_fstab.zone_fs_dir);
2844 			}
2845 			continue;
2846 		}
2847 
2848 
2849 		if (zone_get_state(zid_name, &zid_state) != Z_OK ||
2850 		    (zid_state != ZONE_STATE_READY &&
2851 		    zid_state != ZONE_STATE_RUNNING))
2852 			/* Skip over zones without mounted filesystems */
2853 			continue;
2854 
2855 		if (zone_getattr(zids[i], ZONE_ATTR_SLBL, zid_label,
2856 		    sizeof (m_label_t)) < 0)
2857 			/* Skip over zones with unspecified label */
2858 			continue;
2859 
2860 		if (zone_getattr(zids[i], ZONE_ATTR_ROOT, zid_rpath,
2861 		    sizeof (zid_rpath)) == -1)
2862 			/* Skip over zones with bad path */
2863 			continue;
2864 
2865 		if (zone_getattr(zids[i], ZONE_ATTR_PRIVSET, zid_privs,
2866 		    sizeof (priv_chunk_t) * ip->priv_setsize) == -1)
2867 			/* Skip over zones with bad privs */
2868 			continue;
2869 
2870 		/*
2871 		 * Reading down is valid according to our label model
2872 		 * but some customers want to disable it because it
2873 		 * allows execute down and other possible attacks.
2874 		 * Therefore, we restrict this feature to zones that
2875 		 * have the NET_MAC_AWARE privilege which is required
2876 		 * for NFS read-down semantics.
2877 		 */
2878 		if ((bldominates(zlabel, zid_label)) &&
2879 		    (priv_ismember(zprivs, PRIV_NET_MAC_AWARE))) {
2880 			/*
2881 			 * Our zone dominates this one.
2882 			 * Create a lofs mount from lower zone's /export/home
2883 			 */
2884 			(void) snprintf(lower_fstab.zone_fs_dir, MAXPATHLEN,
2885 			    "%s/zone/%s/export/home", rootpath, zid_name);
2886 
2887 			/*
2888 			 * If the target is already an LOFS mount
2889 			 * then don't do it again.
2890 			 */
2891 			if ((stat(lower_fstab.zone_fs_dir, &stat_buf) == -1) ||
2892 			    strcmp(stat_buf.st_fstype, MNTTYPE_LOFS) != 0) {
2893 
2894 				if (snprintf(lower_fstab.zone_fs_special,
2895 				    MAXPATHLEN, "%s/export",
2896 				    zid_rpath) > MAXPATHLEN)
2897 					continue;
2898 
2899 				/*
2900 				 * Make sure the lower-level home exists
2901 				 */
2902 				if (make_one_dir(zlogp,
2903 				    lower_fstab.zone_fs_special,
2904 				    "/home", DEFAULT_DIR_MODE) != 0)
2905 					continue;
2906 
2907 				(void) strlcat(lower_fstab.zone_fs_special,
2908 				    "/home", MAXPATHLEN);
2909 
2910 				/*
2911 				 * Mount can fail because the lower-level
2912 				 * zone may have already done a mount up.
2913 				 */
2914 				(void) mount_one(zlogp, &lower_fstab, "");
2915 			}
2916 		} else if ((bldominates(zid_label, zlabel)) &&
2917 		    (priv_ismember(zid_privs, PRIV_NET_MAC_AWARE))) {
2918 			/*
2919 			 * This zone dominates our zone.
2920 			 * Create a lofs mount from our zone's /export/home
2921 			 */
2922 			if (snprintf(lower_fstab.zone_fs_dir, MAXPATHLEN,
2923 			    "%s/zone/%s/export/home", zid_rpath,
2924 			    zone_name) > MAXPATHLEN)
2925 				continue;
2926 
2927 			/*
2928 			 * If the target is already an LOFS mount
2929 			 * then don't do it again.
2930 			 */
2931 			if ((stat(lower_fstab.zone_fs_dir, &stat_buf) == -1) ||
2932 			    strcmp(stat_buf.st_fstype, MNTTYPE_LOFS) != 0) {
2933 
2934 				(void) snprintf(lower_fstab.zone_fs_special,
2935 				    MAXPATHLEN, "%s/export/home", rootpath);
2936 
2937 				/*
2938 				 * Mount can fail because the higher-level
2939 				 * zone may have already done a mount down.
2940 				 */
2941 				(void) mount_one(zlogp, &lower_fstab, "");
2942 			}
2943 		}
2944 	}
2945 	zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
2946 	priv_freeset(zid_privs);
2947 	free(zids);
2948 
2949 	/*
2950 	 * Now share any exported directories from this zone.
2951 	 * Each zone can have its own dfstab.
2952 	 */
2953 
2954 	argv[0] = "zoneshare";
2955 	argv[1] = "-z";
2956 	argv[2] = zone_name;
2957 	argv[3] = NULL;
2958 
2959 	(void) forkexec(zlogp, "/usr/lib/zones/zoneshare", argv);
2960 	/* Don't check for errors since they don't affect the zone */
2961 
2962 	return (0);
2963 }
2964 
2965 /*
2966  * Unmount lofs mounts from higher level zones
2967  * Unshare nfs exported directories
2968  */
2969 static void
2970 tsol_unmounts(zlog_t *zlogp, char *zone_name)
2971 {
2972 	zoneid_t *zids = NULL;
2973 	uint_t nzents_saved;
2974 	uint_t nzents;
2975 	int i;
2976 	char *argv[4];
2977 	char path[MAXPATHLEN];
2978 
2979 	if (!is_system_labeled())
2980 		return;
2981 
2982 	/*
2983 	 * Get the list of zones from the kernel
2984 	 */
2985 	if (zone_list(NULL, &nzents) != 0) {
2986 		return;
2987 	}
2988 
2989 	if (zid_label == NULL) {
2990 		zid_label = m_label_alloc(MAC_LABEL);
2991 		if (zid_label == NULL)
2992 			return;
2993 	}
2994 
2995 again:
2996 	if (nzents == 0)
2997 		return;
2998 
2999 	zids = malloc(nzents * sizeof (zoneid_t));
3000 	if (zids == NULL) {
3001 		zerror(zlogp, B_TRUE, "memory allocation failed");
3002 		return;
3003 	}
3004 	nzents_saved = nzents;
3005 
3006 	if (zone_list(zids, &nzents) != 0) {
3007 		free(zids);
3008 		return;
3009 	}
3010 	if (nzents != nzents_saved) {
3011 		/* list changed, try again */
3012 		free(zids);
3013 		goto again;
3014 	}
3015 
3016 	for (i = 0; i < nzents; i++) {
3017 		char zid_name[ZONENAME_MAX];
3018 		zone_state_t zid_state;
3019 		char zid_rpath[MAXPATHLEN];
3020 
3021 		if (zids[i] == GLOBAL_ZONEID)
3022 			continue;
3023 
3024 		if (getzonenamebyid(zids[i], zid_name, ZONENAME_MAX) == -1)
3025 			continue;
3026 
3027 		/*
3028 		 * Skip the zone we are halting
3029 		 */
3030 		if (strcmp(zid_name, zone_name) == 0)
3031 			continue;
3032 
3033 		if ((zone_getattr(zids[i], ZONE_ATTR_STATUS, &zid_state,
3034 		    sizeof (zid_state)) < 0) ||
3035 		    (zid_state < ZONE_IS_READY))
3036 			/* Skip over zones without mounted filesystems */
3037 			continue;
3038 
3039 		if (zone_getattr(zids[i], ZONE_ATTR_SLBL, zid_label,
3040 		    sizeof (m_label_t)) < 0)
3041 			/* Skip over zones with unspecified label */
3042 			continue;
3043 
3044 		if (zone_getattr(zids[i], ZONE_ATTR_ROOT, zid_rpath,
3045 		    sizeof (zid_rpath)) == -1)
3046 			/* Skip over zones with bad path */
3047 			continue;
3048 
3049 		if (zlabel != NULL && bldominates(zid_label, zlabel)) {
3050 			/*
3051 			 * This zone dominates our zone.
3052 			 * Unmount the lofs mount of our zone's /export/home
3053 			 */
3054 
3055 			if (snprintf(path, MAXPATHLEN,
3056 			    "%s/zone/%s/export/home", zid_rpath,
3057 			    zone_name) > MAXPATHLEN)
3058 				continue;
3059 
3060 			/* Skip over mount failures */
3061 			(void) umount(path);
3062 		}
3063 	}
3064 	free(zids);
3065 
3066 	/*
3067 	 * Unmount global zone autofs trigger for this zone
3068 	 */
3069 	(void) snprintf(path, MAXPATHLEN, "/zone/%s/home", zone_name);
3070 	/* Skip over mount failures */
3071 	(void) umount(path);
3072 
3073 	/*
3074 	 * Next unshare any exported directories from this zone.
3075 	 */
3076 
3077 	argv[0] = "zoneunshare";
3078 	argv[1] = "-z";
3079 	argv[2] = zone_name;
3080 	argv[3] = NULL;
3081 
3082 	(void) forkexec(zlogp, "/usr/lib/zones/zoneunshare", argv);
3083 	/* Don't check for errors since they don't affect the zone */
3084 
3085 	/*
3086 	 * Finally, deallocate any devices in the zone.
3087 	 */
3088 
3089 	argv[0] = "deallocate";
3090 	argv[1] = "-Isz";
3091 	argv[2] = zone_name;
3092 	argv[3] = NULL;
3093 
3094 	(void) forkexec(zlogp, "/usr/sbin/deallocate", argv);
3095 	/* Don't check for errors since they don't affect the zone */
3096 }
3097 
3098 /*
3099  * Fetch the Trusted Extensions label and multi-level ports (MLPs) for
3100  * this zone.
3101  */
3102 static tsol_zcent_t *
3103 get_zone_label(zlog_t *zlogp, priv_set_t *privs)
3104 {
3105 	FILE *fp;
3106 	tsol_zcent_t *zcent = NULL;
3107 	char line[MAXTNZLEN];
3108 
3109 	if ((fp = fopen(TNZONECFG_PATH, "r")) == NULL) {
3110 		zerror(zlogp, B_TRUE, "%s", TNZONECFG_PATH);
3111 		return (NULL);
3112 	}
3113 
3114 	while (fgets(line, sizeof (line), fp) != NULL) {
3115 		/*
3116 		 * Check for malformed database
3117 		 */
3118 		if (strlen(line) == MAXTNZLEN - 1)
3119 			break;
3120 		if ((zcent = tsol_sgetzcent(line, NULL, NULL)) == NULL)
3121 			continue;
3122 		if (strcmp(zcent->zc_name, zone_name) == 0)
3123 			break;
3124 		tsol_freezcent(zcent);
3125 		zcent = NULL;
3126 	}
3127 	(void) fclose(fp);
3128 
3129 	if (zcent == NULL) {
3130 		zerror(zlogp, B_FALSE, "zone requires a label assignment. "
3131 		    "See tnzonecfg(4)");
3132 	} else {
3133 		if (zlabel == NULL)
3134 			zlabel = m_label_alloc(MAC_LABEL);
3135 		/*
3136 		 * Save this zone's privileges for later read-down processing
3137 		 */
3138 		if ((zprivs = priv_allocset()) == NULL) {
3139 			zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
3140 			return (NULL);
3141 		} else {
3142 			priv_copyset(privs, zprivs);
3143 		}
3144 	}
3145 	return (zcent);
3146 }
3147 
3148 /*
3149  * Add the Trusted Extensions multi-level ports for this zone.
3150  */
3151 static void
3152 set_mlps(zlog_t *zlogp, zoneid_t zoneid, tsol_zcent_t *zcent)
3153 {
3154 	tsol_mlp_t *mlp;
3155 	tsol_mlpent_t tsme;
3156 
3157 	if (!is_system_labeled())
3158 		return;
3159 
3160 	tsme.tsme_zoneid = zoneid;
3161 	tsme.tsme_flags = 0;
3162 	for (mlp = zcent->zc_private_mlp; !TSOL_MLP_END(mlp); mlp++) {
3163 		tsme.tsme_mlp = *mlp;
3164 		if (tnmlp(TNDB_LOAD, &tsme) != 0) {
3165 			zerror(zlogp, B_TRUE, "cannot set zone-specific MLP "
3166 			    "on %d-%d/%d", mlp->mlp_port,
3167 			    mlp->mlp_port_upper, mlp->mlp_ipp);
3168 		}
3169 	}
3170 
3171 	tsme.tsme_flags = TSOL_MEF_SHARED;
3172 	for (mlp = zcent->zc_shared_mlp; !TSOL_MLP_END(mlp); mlp++) {
3173 		tsme.tsme_mlp = *mlp;
3174 		if (tnmlp(TNDB_LOAD, &tsme) != 0) {
3175 			zerror(zlogp, B_TRUE, "cannot set shared MLP "
3176 			    "on %d-%d/%d", mlp->mlp_port,
3177 			    mlp->mlp_port_upper, mlp->mlp_ipp);
3178 		}
3179 	}
3180 }
3181 
3182 static void
3183 remove_mlps(zlog_t *zlogp, zoneid_t zoneid)
3184 {
3185 	tsol_mlpent_t tsme;
3186 
3187 	if (!is_system_labeled())
3188 		return;
3189 
3190 	(void) memset(&tsme, 0, sizeof (tsme));
3191 	tsme.tsme_zoneid = zoneid;
3192 	if (tnmlp(TNDB_FLUSH, &tsme) != 0)
3193 		zerror(zlogp, B_TRUE, "cannot flush MLPs");
3194 }
3195 
3196 int
3197 prtmount(const char *fs, void *x) {
3198 	zerror((zlog_t *)x, B_FALSE, "  %s", fs);
3199 	return (0);
3200 }
3201 
3202 /*
3203  * Look for zones running on the main system that are using this root (or any
3204  * subdirectory of it).  Return B_TRUE and print an error if a conflicting zone
3205  * is found or if we can't tell.
3206  */
3207 static boolean_t
3208 duplicate_zone_root(zlog_t *zlogp, const char *rootpath)
3209 {
3210 	zoneid_t *zids = NULL;
3211 	uint_t nzids = 0;
3212 	boolean_t retv;
3213 	int rlen, zlen;
3214 	char zroot[MAXPATHLEN];
3215 	char zonename[ZONENAME_MAX];
3216 
3217 	for (;;) {
3218 		nzids += 10;
3219 		zids = malloc(nzids * sizeof (*zids));
3220 		if (zids == NULL) {
3221 			zerror(zlogp, B_TRUE, "memory allocation failed");
3222 			return (B_TRUE);
3223 		}
3224 		if (zone_list(zids, &nzids) == 0)
3225 			break;
3226 		free(zids);
3227 	}
3228 	retv = B_FALSE;
3229 	rlen = strlen(rootpath);
3230 	while (nzids > 0) {
3231 		/*
3232 		 * Ignore errors; they just mean that the zone has disappeared
3233 		 * while we were busy.
3234 		 */
3235 		if (zone_getattr(zids[--nzids], ZONE_ATTR_ROOT, zroot,
3236 		    sizeof (zroot)) == -1)
3237 			continue;
3238 		zlen = strlen(zroot);
3239 		if (zlen > rlen)
3240 			zlen = rlen;
3241 		if (strncmp(rootpath, zroot, zlen) == 0 &&
3242 		    (zroot[zlen] == '\0' || zroot[zlen] == '/') &&
3243 		    (rootpath[zlen] == '\0' || rootpath[zlen] == '/')) {
3244 			if (getzonenamebyid(zids[nzids], zonename,
3245 			    sizeof (zonename)) == -1)
3246 				(void) snprintf(zonename, sizeof (zonename),
3247 				    "id %d", (int)zids[nzids]);
3248 			zerror(zlogp, B_FALSE,
3249 			    "zone root %s already in use by zone %s",
3250 			    rootpath, zonename);
3251 			retv = B_TRUE;
3252 			break;
3253 		}
3254 	}
3255 	free(zids);
3256 	return (retv);
3257 }
3258 
3259 /*
3260  * Search for loopback mounts that use this same source node (same device and
3261  * inode).  Return B_TRUE if there is one or if we can't tell.
3262  */
3263 static boolean_t
3264 duplicate_reachable_path(zlog_t *zlogp, const char *rootpath)
3265 {
3266 	struct stat64 rst, zst;
3267 	struct mnttab *mnp;
3268 
3269 	if (stat64(rootpath, &rst) == -1) {
3270 		zerror(zlogp, B_TRUE, "can't stat %s", rootpath);
3271 		return (B_TRUE);
3272 	}
3273 	if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
3274 		return (B_TRUE);
3275 	for (mnp = resolve_lofs_mnts; mnp < resolve_lofs_mnt_max; mnp++) {
3276 		if (mnp->mnt_fstype == NULL ||
3277 		    strcmp(MNTTYPE_LOFS, mnp->mnt_fstype) != 0)
3278 			continue;
3279 		/* We're looking at a loopback mount.  Stat it. */
3280 		if (mnp->mnt_special != NULL &&
3281 		    stat64(mnp->mnt_special, &zst) != -1 &&
3282 		    rst.st_dev == zst.st_dev && rst.st_ino == zst.st_ino) {
3283 			zerror(zlogp, B_FALSE,
3284 			    "zone root %s is reachable through %s",
3285 			    rootpath, mnp->mnt_mountp);
3286 			return (B_TRUE);
3287 		}
3288 	}
3289 	return (B_FALSE);
3290 }
3291 
3292 zoneid_t
3293 vplat_create(zlog_t *zlogp, boolean_t mount_cmd)
3294 {
3295 	zoneid_t rval = -1;
3296 	priv_set_t *privs;
3297 	char rootpath[MAXPATHLEN];
3298 	char *rctlbuf = NULL;
3299 	size_t rctlbufsz = 0;
3300 	char *zfsbuf = NULL;
3301 	size_t zfsbufsz = 0;
3302 	zoneid_t zoneid = -1;
3303 	int xerr;
3304 	char *kzone;
3305 	FILE *fp = NULL;
3306 	tsol_zcent_t *zcent = NULL;
3307 	int match = 0;
3308 	int doi = 0;
3309 
3310 	if (zone_get_rootpath(zone_name, rootpath, sizeof (rootpath)) != Z_OK) {
3311 		zerror(zlogp, B_TRUE, "unable to determine zone root");
3312 		return (-1);
3313 	}
3314 	if (zonecfg_in_alt_root())
3315 		resolve_lofs(zlogp, rootpath, sizeof (rootpath));
3316 
3317 	if ((privs = priv_allocset()) == NULL) {
3318 		zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
3319 		return (-1);
3320 	}
3321 	priv_emptyset(privs);
3322 	if (get_privset(zlogp, privs, mount_cmd) != 0)
3323 		goto error;
3324 
3325 	if (!mount_cmd && get_rctls(zlogp, &rctlbuf, &rctlbufsz) != 0) {
3326 		zerror(zlogp, B_FALSE, "Unable to get list of rctls");
3327 		goto error;
3328 	}
3329 
3330 	if (get_datasets(zlogp, &zfsbuf, &zfsbufsz) != 0) {
3331 		zerror(zlogp, B_FALSE, "Unable to get list of ZFS datasets");
3332 		goto error;
3333 	}
3334 
3335 	if (!mount_cmd && is_system_labeled()) {
3336 		zcent = get_zone_label(zlogp, privs);
3337 		if (zcent != NULL) {
3338 			match = zcent->zc_match;
3339 			doi = zcent->zc_doi;
3340 			*zlabel = zcent->zc_label;
3341 		} else {
3342 			goto error;
3343 		}
3344 	}
3345 
3346 	kzone = zone_name;
3347 
3348 	/*
3349 	 * We must do this scan twice.  First, we look for zones running on the
3350 	 * main system that are using this root (or any subdirectory of it).
3351 	 * Next, we reduce to the shortest path and search for loopback mounts
3352 	 * that use this same source node (same device and inode).
3353 	 */
3354 	if (duplicate_zone_root(zlogp, rootpath))
3355 		goto error;
3356 	if (duplicate_reachable_path(zlogp, rootpath))
3357 		goto error;
3358 
3359 	if (mount_cmd) {
3360 		root_to_lu(zlogp, rootpath, sizeof (rootpath), B_TRUE);
3361 
3362 		/*
3363 		 * Forge up a special root for this zone.  When a zone is
3364 		 * mounted, we can't let the zone have its own root because the
3365 		 * tools that will be used in this "scratch zone" need access
3366 		 * to both the zone's resources and the running machine's
3367 		 * executables.
3368 		 *
3369 		 * Note that the mkdir here also catches read-only filesystems.
3370 		 */
3371 		if (mkdir(rootpath, 0755) != 0 && errno != EEXIST) {
3372 			zerror(zlogp, B_TRUE, "cannot create %s", rootpath);
3373 			goto error;
3374 		}
3375 		if (domount(zlogp, "tmpfs", "", "swap", rootpath) != 0)
3376 			goto error;
3377 	}
3378 
3379 	if (zonecfg_in_alt_root()) {
3380 		/*
3381 		 * If we are mounting up a zone in an alternate root partition,
3382 		 * then we have some additional work to do before starting the
3383 		 * zone.  First, resolve the root path down so that we're not
3384 		 * fooled by duplicates.  Then forge up an internal name for
3385 		 * the zone.
3386 		 */
3387 		if ((fp = zonecfg_open_scratch("", B_TRUE)) == NULL) {
3388 			zerror(zlogp, B_TRUE, "cannot open mapfile");
3389 			goto error;
3390 		}
3391 		if (zonecfg_lock_scratch(fp) != 0) {
3392 			zerror(zlogp, B_TRUE, "cannot lock mapfile");
3393 			goto error;
3394 		}
3395 		if (zonecfg_find_scratch(fp, zone_name, zonecfg_get_root(),
3396 		    NULL, 0) == 0) {
3397 			zerror(zlogp, B_FALSE, "scratch zone already running");
3398 			goto error;
3399 		}
3400 		/* This is the preferred name */
3401 		(void) snprintf(kernzone, sizeof (kernzone), "SUNWlu-%s",
3402 		    zone_name);
3403 		srandom(getpid());
3404 		while (zonecfg_reverse_scratch(fp, kernzone, NULL, 0, NULL,
3405 		    0) == 0) {
3406 			/* This is just an arbitrary name; note "." usage */
3407 			(void) snprintf(kernzone, sizeof (kernzone),
3408 			    "SUNWlu.%08lX%08lX", random(), random());
3409 		}
3410 		kzone = kernzone;
3411 	}
3412 
3413 	xerr = 0;
3414 	if ((zoneid = zone_create(kzone, rootpath, privs, rctlbuf,
3415 	    rctlbufsz, zfsbuf, zfsbufsz, &xerr, match, doi, zlabel)) == -1) {
3416 		if (xerr == ZE_AREMOUNTS) {
3417 			if (zonecfg_find_mounts(rootpath, NULL, NULL) < 1) {
3418 				zerror(zlogp, B_FALSE,
3419 				    "An unknown file-system is mounted on "
3420 				    "a subdirectory of %s", rootpath);
3421 			} else {
3422 
3423 				zerror(zlogp, B_FALSE,
3424 				    "These file-systems are mounted on "
3425 				    "subdirectories of %s:", rootpath);
3426 				(void) zonecfg_find_mounts(rootpath,
3427 				    prtmount, zlogp);
3428 			}
3429 		} else if (xerr == ZE_CHROOTED) {
3430 			zerror(zlogp, B_FALSE, "%s: "
3431 			    "cannot create a zone from a chrooted "
3432 			    "environment", "zone_create");
3433 		} else {
3434 			zerror(zlogp, B_TRUE, "%s failed", "zone_create");
3435 		}
3436 		goto error;
3437 	}
3438 
3439 	if (zonecfg_in_alt_root() &&
3440 	    zonecfg_add_scratch(fp, zone_name, kernzone,
3441 	    zonecfg_get_root()) == -1) {
3442 		zerror(zlogp, B_TRUE, "cannot add mapfile entry");
3443 		goto error;
3444 	}
3445 
3446 	/*
3447 	 * The following is a warning, not an error, and is not performed when
3448 	 * merely mounting a zone for administrative use.
3449 	 */
3450 	if (!mount_cmd && bind_to_pool(zlogp, zoneid) != 0)
3451 		zerror(zlogp, B_FALSE, "WARNING: unable to bind zone to "
3452 		    "requested pool; using default pool.");
3453 	if (!mount_cmd)
3454 		set_mlps(zlogp, zoneid, zcent);
3455 	rval = zoneid;
3456 	zoneid = -1;
3457 
3458 error:
3459 	if (zoneid != -1)
3460 		(void) zone_destroy(zoneid);
3461 	if (rctlbuf != NULL)
3462 		free(rctlbuf);
3463 	priv_freeset(privs);
3464 	if (fp != NULL)
3465 		zonecfg_close_scratch(fp);
3466 	lofs_discard_mnttab();
3467 	if (zcent != NULL)
3468 		tsol_freezcent(zcent);
3469 	return (rval);
3470 }
3471 
3472 /*
3473  * Enter the zone and write a /etc/zones/index file there.  This allows
3474  * libzonecfg (and thus zoneadm) to report the UUID and potentially other zone
3475  * details from inside the zone.
3476  */
3477 static void
3478 write_index_file(zoneid_t zoneid)
3479 {
3480 	FILE *zef;
3481 	FILE *zet;
3482 	struct zoneent *zep;
3483 	pid_t child;
3484 	int tmpl_fd;
3485 	ctid_t ct;
3486 	int fd;
3487 	char uuidstr[UUID_PRINTABLE_STRING_LENGTH];
3488 
3489 	/* Locate the zone entry in the global zone's index file */
3490 	if ((zef = setzoneent()) == NULL)
3491 		return;
3492 	while ((zep = getzoneent_private(zef)) != NULL) {
3493 		if (strcmp(zep->zone_name, zone_name) == 0)
3494 			break;
3495 		free(zep);
3496 	}
3497 	endzoneent(zef);
3498 	if (zep == NULL)
3499 		return;
3500 
3501 	if ((tmpl_fd = init_template()) == -1) {
3502 		free(zep);
3503 		return;
3504 	}
3505 
3506 	if ((child = fork()) == -1) {
3507 		(void) ct_tmpl_clear(tmpl_fd);
3508 		(void) close(tmpl_fd);
3509 		free(zep);
3510 		return;
3511 	}
3512 
3513 	/* parent waits for child to finish */
3514 	if (child != 0) {
3515 		free(zep);
3516 		if (contract_latest(&ct) == -1)
3517 			ct = -1;
3518 		(void) ct_tmpl_clear(tmpl_fd);
3519 		(void) close(tmpl_fd);
3520 		(void) waitpid(child, NULL, 0);
3521 		(void) contract_abandon_id(ct);
3522 		return;
3523 	}
3524 
3525 	/* child enters zone and sets up index file */
3526 	(void) ct_tmpl_clear(tmpl_fd);
3527 	if (zone_enter(zoneid) != -1) {
3528 		(void) mkdir(ZONE_CONFIG_ROOT, ZONE_CONFIG_MODE);
3529 		(void) chown(ZONE_CONFIG_ROOT, ZONE_CONFIG_UID,
3530 		    ZONE_CONFIG_GID);
3531 		fd = open(ZONE_INDEX_FILE, O_WRONLY|O_CREAT|O_TRUNC,
3532 		    ZONE_INDEX_MODE);
3533 		if (fd != -1 && (zet = fdopen(fd, "w")) != NULL) {
3534 			(void) fchown(fd, ZONE_INDEX_UID, ZONE_INDEX_GID);
3535 			if (uuid_is_null(zep->zone_uuid))
3536 				uuidstr[0] = '\0';
3537 			else
3538 				uuid_unparse(zep->zone_uuid, uuidstr);
3539 			(void) fprintf(zet, "%s:%s:/:%s\n", zep->zone_name,
3540 			    zone_state_str(zep->zone_state),
3541 			    uuidstr);
3542 			(void) fclose(zet);
3543 		}
3544 	}
3545 	_exit(0);
3546 }
3547 
3548 int
3549 vplat_bringup(zlog_t *zlogp, boolean_t mount_cmd, zoneid_t zoneid)
3550 {
3551 
3552 	if (!mount_cmd && validate_datasets(zlogp) != 0) {
3553 		lofs_discard_mnttab();
3554 		return (-1);
3555 	}
3556 
3557 	if (mount_filesystems(zlogp, mount_cmd) != 0) {
3558 		lofs_discard_mnttab();
3559 		return (-1);
3560 	}
3561 
3562 	/* mount /dev for zone (both normal and scratch zone) */
3563 	if (vplat_mount_dev(zlogp) != 0) {
3564 		lofs_discard_mnttab();
3565 		return (-1);
3566 	}
3567 
3568 	if (!mount_cmd && configure_network_interfaces(zlogp) != 0) {
3569 		lofs_discard_mnttab();
3570 		return (-1);
3571 	}
3572 
3573 	write_index_file(zoneid);
3574 
3575 	lofs_discard_mnttab();
3576 	return (0);
3577 }
3578 
3579 static int
3580 lu_root_teardown(zlog_t *zlogp)
3581 {
3582 	char zroot[MAXPATHLEN];
3583 
3584 	if (zone_get_rootpath(zone_name, zroot, sizeof (zroot)) != Z_OK) {
3585 		zerror(zlogp, B_FALSE, "unable to determine zone root");
3586 		return (-1);
3587 	}
3588 	root_to_lu(zlogp, zroot, sizeof (zroot), B_FALSE);
3589 
3590 	/*
3591 	 * At this point, the processes are gone, the filesystems (save the
3592 	 * root) are unmounted, and the zone is on death row.  But there may
3593 	 * still be creds floating about in the system that reference the
3594 	 * zone_t, and which pin down zone_rootvp causing this call to fail
3595 	 * with EBUSY.  Thus, we try for a little while before just giving up.
3596 	 * (How I wish this were not true, and umount2 just did the right
3597 	 * thing, or tmpfs supported MS_FORCE This is a gross hack.)
3598 	 */
3599 	if (umount2(zroot, MS_FORCE) != 0) {
3600 		if (errno == ENOTSUP && umount2(zroot, 0) == 0)
3601 			goto unmounted;
3602 		if (errno == EBUSY) {
3603 			int tries = 10;
3604 
3605 			while (--tries >= 0) {
3606 				(void) sleep(1);
3607 				if (umount2(zroot, 0) == 0)
3608 					goto unmounted;
3609 				if (errno != EBUSY)
3610 					break;
3611 			}
3612 		}
3613 		zerror(zlogp, B_TRUE, "unable to unmount '%s'", zroot);
3614 		return (-1);
3615 	}
3616 unmounted:
3617 
3618 	/*
3619 	 * Only zones in an alternate root environment have scratch zone
3620 	 * entries.
3621 	 */
3622 	if (zonecfg_in_alt_root()) {
3623 		FILE *fp;
3624 		int retv;
3625 
3626 		if ((fp = zonecfg_open_scratch("", B_FALSE)) == NULL) {
3627 			zerror(zlogp, B_TRUE, "cannot open mapfile");
3628 			return (-1);
3629 		}
3630 		retv = -1;
3631 		if (zonecfg_lock_scratch(fp) != 0)
3632 			zerror(zlogp, B_TRUE, "cannot lock mapfile");
3633 		else if (zonecfg_delete_scratch(fp, kernzone) != 0)
3634 			zerror(zlogp, B_TRUE, "cannot delete map entry");
3635 		else
3636 			retv = 0;
3637 		zonecfg_close_scratch(fp);
3638 		return (retv);
3639 	} else {
3640 		return (0);
3641 	}
3642 }
3643 
3644 int
3645 vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd)
3646 {
3647 	char *kzone;
3648 	zoneid_t zoneid;
3649 
3650 	kzone = zone_name;
3651 	if (zonecfg_in_alt_root()) {
3652 		FILE *fp;
3653 
3654 		if ((fp = zonecfg_open_scratch("", B_FALSE)) == NULL) {
3655 			zerror(zlogp, B_TRUE, "unable to open map file");
3656 			goto error;
3657 		}
3658 		if (zonecfg_find_scratch(fp, zone_name, zonecfg_get_root(),
3659 		    kernzone, sizeof (kernzone)) != 0) {
3660 			zerror(zlogp, B_FALSE, "unable to find scratch zone");
3661 			zonecfg_close_scratch(fp);
3662 			goto error;
3663 		}
3664 		zonecfg_close_scratch(fp);
3665 		kzone = kernzone;
3666 	}
3667 
3668 	if ((zoneid = getzoneidbyname(kzone)) == ZONE_ID_UNDEFINED) {
3669 		if (!bringup_failure_recovery)
3670 			zerror(zlogp, B_TRUE, "unable to get zoneid");
3671 		if (unmount_cmd)
3672 			(void) lu_root_teardown(zlogp);
3673 		goto error;
3674 	}
3675 
3676 	if (zone_shutdown(zoneid) != 0) {
3677 		zerror(zlogp, B_TRUE, "unable to shutdown zone");
3678 		goto error;
3679 	}
3680 
3681 	if (!unmount_cmd &&
3682 	    unconfigure_network_interfaces(zlogp, zoneid) != 0) {
3683 		zerror(zlogp, B_FALSE,
3684 		    "unable to unconfigure network interfaces in zone");
3685 		goto error;
3686 	}
3687 
3688 	if (!unmount_cmd && tcp_abort_connections(zlogp, zoneid) != 0) {
3689 		zerror(zlogp, B_TRUE, "unable to abort TCP connections");
3690 		goto error;
3691 	}
3692 
3693 	/* destroy zconsole before umount /dev */
3694 	if (!unmount_cmd)
3695 		destroy_console_slave();
3696 
3697 	if (unmount_filesystems(zlogp, zoneid, unmount_cmd) != 0) {
3698 		zerror(zlogp, B_FALSE,
3699 		    "unable to unmount file systems in zone");
3700 		goto error;
3701 	}
3702 
3703 	remove_mlps(zlogp, zoneid);
3704 
3705 	if (zone_destroy(zoneid) != 0) {
3706 		zerror(zlogp, B_TRUE, "unable to destroy zone");
3707 		goto error;
3708 	}
3709 
3710 	/*
3711 	 * Special teardown for alternate boot environments: remove the tmpfs
3712 	 * root for the zone and then remove it from the map file.
3713 	 */
3714 	if (unmount_cmd && lu_root_teardown(zlogp) != 0)
3715 		goto error;
3716 
3717 	lofs_discard_mnttab();
3718 	return (0);
3719 
3720 error:
3721 	lofs_discard_mnttab();
3722 	return (-1);
3723 }
3724 
3725 /*
3726  * Apply the standard lists of devices/symlinks/mappings and the user-specified
3727  * list of devices (via zonecfg) to the /dev filesystem.  The filesystem will
3728  * use these as a profile/filter to determine what exists in /dev.
3729  */
3730 static int
3731 vplat_mount_dev(zlog_t *zlogp)
3732 {
3733 	char			zonedevpath[MAXPATHLEN];
3734 	zone_dochandle_t	handle = NULL;
3735 	struct zone_devtab	ztab;
3736 	zone_fsopt_t		opt_attr;
3737 	di_prof_t		prof = NULL;
3738 	int			i, err, len;
3739 	int			retval = -1;
3740 
3741 	struct zone_fstab devtab = {
3742 		"/dev",
3743 		"/dev",
3744 		MNTTYPE_DEV,
3745 		NULL,
3746 		""
3747 	};
3748 
3749 	if (err = zone_get_devroot(zone_name, zonedevpath,
3750 	    sizeof (zonedevpath))) {
3751 		zerror(zlogp, B_FALSE, "can't get zone dev: %s",
3752 		    zonecfg_strerror(err));
3753 		return (-1);
3754 	}
3755 
3756 	/*
3757 	 * The old /dev was a lofs mount from <zonepath>/dev, with
3758 	 * dev fs, that becomes a mount on <zonepath>/root/dev.
3759 	 * However, we need to preserve device permission bits during
3760 	 * upgrade.  What we should do is migrate the attribute directory
3761 	 * on upgrade, but for now, preserve it at <zonepath>/dev.
3762 	 */
3763 	(void) strcpy(opt_attr.zone_fsopt_opt, "attrdir=");
3764 	len = strlen(opt_attr.zone_fsopt_opt);
3765 	if (err = zone_get_zonepath(zone_name,
3766 	    opt_attr.zone_fsopt_opt + len, MAX_MNTOPT_STR - len)) {
3767 		zerror(zlogp, B_FALSE, "can't get zone path: %s",
3768 		    zonecfg_strerror(err));
3769 		return (-1);
3770 	}
3771 
3772 	if (make_one_dir(zlogp, opt_attr.zone_fsopt_opt + len, "/dev",
3773 	    DEFAULT_DIR_MODE) != 0)
3774 		return (-1);
3775 
3776 	(void) strlcat(opt_attr.zone_fsopt_opt, "/dev", MAX_MNTOPT_STR);
3777 	devtab.zone_fs_options = &opt_attr;
3778 	opt_attr.zone_fsopt_next = NULL;
3779 
3780 	/* mount /dev inside the zone */
3781 	i = strlen(zonedevpath);
3782 	if (mount_one(zlogp, &devtab, zonedevpath))
3783 		return (-1);
3784 
3785 	(void) strlcat(zonedevpath, "/dev", sizeof (zonedevpath));
3786 	if (di_prof_init(zonedevpath, &prof)) {
3787 		zerror(zlogp, B_TRUE, "failed to initialize profile");
3788 		goto cleanup;
3789 	}
3790 
3791 	/* Add the standard devices and directories */
3792 	for (i = 0; standard_devs[i] != NULL; ++i) {
3793 		if (di_prof_add_dev(prof, standard_devs[i])) {
3794 			zerror(zlogp, B_TRUE, "failed to add "
3795 			    "standard device");
3796 			goto cleanup;
3797 		}
3798 	}
3799 
3800 	/* Add the standard symlinks */
3801 	for (i = 0; standard_devlinks[i].source != NULL; ++i) {
3802 		if (di_prof_add_symlink(prof,
3803 		    standard_devlinks[i].source,
3804 		    standard_devlinks[i].target)) {
3805 			zerror(zlogp, B_TRUE, "failed to add "
3806 			    "standard symlink");
3807 			goto cleanup;
3808 		}
3809 	}
3810 
3811 	/* Add user-specified devices and directories */
3812 	if ((handle = zonecfg_init_handle()) == NULL) {
3813 		zerror(zlogp, B_FALSE, "can't initialize zone handle");
3814 		goto cleanup;
3815 	}
3816 	if (err = zonecfg_get_handle(zone_name, handle)) {
3817 		zerror(zlogp, B_FALSE, "can't get handle for zone "
3818 		    "%s: %s", zone_name, zonecfg_strerror(err));
3819 		goto cleanup;
3820 	}
3821 	if (err = zonecfg_setdevent(handle)) {
3822 		zerror(zlogp, B_FALSE, "%s: %s", zone_name,
3823 		    zonecfg_strerror(err));
3824 		goto cleanup;
3825 	}
3826 	while (zonecfg_getdevent(handle, &ztab) == Z_OK) {
3827 		if (di_prof_add_dev(prof, ztab.zone_dev_match)) {
3828 			zerror(zlogp, B_TRUE, "failed to add "
3829 			    "user-specified device");
3830 			goto cleanup;
3831 		}
3832 	}
3833 	(void) zonecfg_enddevent(handle);
3834 
3835 	/* Send profile to kernel */
3836 	if (di_prof_commit(prof)) {
3837 		zerror(zlogp, B_TRUE, "failed to commit profile");
3838 		goto cleanup;
3839 	}
3840 
3841 	retval = 0;
3842 
3843 cleanup:
3844 	if (handle)
3845 		zonecfg_fini_handle(handle);
3846 	if (prof)
3847 		di_prof_fini(prof);
3848 	return (retval);
3849 }
3850