xref: /titanic_51/usr/src/cmd/zoneadmd/vplat.c (revision a02e811112768aaf0fd2fb84f9a8d8261a295368)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
28  */
29 
30 /*
31  * This module contains functions used to bring up and tear down the
32  * Virtual Platform: [un]mounting file-systems, [un]plumbing network
33  * interfaces, [un]configuring devices, establishing resource controls,
34  * and creating/destroying the zone in the kernel.  These actions, on
35  * the way up, ready the zone; on the way down, they halt the zone.
36  * See the much longer block comment at the beginning of zoneadmd.c
37  * for a bigger picture of how the whole program functions.
38  *
39  * This module also has primary responsibility for the layout of "scratch
40  * zones."  These are mounted, but inactive, zones that are used during
41  * operating system upgrade and potentially other administrative action.  The
42  * scratch zone environment is similar to the miniroot environment.  The zone's
43  * actual root is mounted read-write on /a, and the standard paths (/usr,
44  * /sbin, /lib) all lead to read-only copies of the running system's binaries.
45  * This allows the administrative tools to manipulate the zone using "-R /a"
46  * without relying on any binaries in the zone itself.
47  *
48  * If the scratch zone is on an alternate root (Live Upgrade [LU] boot
49  * environment), then we must resolve the lofs mounts used there to uncover
50  * writable (unshared) resources.  Shared resources, though, are always
51  * read-only.  In addition, if the "same" zone with a different root path is
52  * currently running, then "/b" inside the zone points to the running zone's
53  * root.  This allows LU to synchronize configuration files during the upgrade
54  * process.
55  *
56  * To construct this environment, this module creates a tmpfs mount on
57  * $ZONEPATH/lu.  Inside this scratch area, the miniroot-like environment as
58  * described above is constructed on the fly.  The zone is then created using
59  * $ZONEPATH/lu as the root.
60  *
61  * Note that scratch zones are inactive.  The zone's bits are not running and
62  * likely cannot be run correctly until upgrade is done.  Init is not running
63  * there, nor is SMF.  Because of this, the "mounted" state of a scratch zone
64  * is not a part of the usual halt/ready/boot state machine.
65  */
66 
67 #include <sys/param.h>
68 #include <sys/mount.h>
69 #include <sys/mntent.h>
70 #include <sys/socket.h>
71 #include <sys/utsname.h>
72 #include <sys/types.h>
73 #include <sys/stat.h>
74 #include <sys/sockio.h>
75 #include <sys/stropts.h>
76 #include <sys/conf.h>
77 #include <sys/systeminfo.h>
78 
79 #include <libdlpi.h>
80 #include <libdllink.h>
81 #include <libdlvlan.h>
82 
83 #include <inet/tcp.h>
84 #include <arpa/inet.h>
85 #include <netinet/in.h>
86 #include <net/route.h>
87 
88 #include <stdio.h>
89 #include <errno.h>
90 #include <fcntl.h>
91 #include <unistd.h>
92 #include <rctl.h>
93 #include <stdlib.h>
94 #include <string.h>
95 #include <strings.h>
96 #include <wait.h>
97 #include <limits.h>
98 #include <libgen.h>
99 #include <libzfs.h>
100 #include <libdevinfo.h>
101 #include <zone.h>
102 #include <assert.h>
103 #include <libcontract.h>
104 #include <libcontract_priv.h>
105 #include <uuid/uuid.h>
106 
107 #include <sys/mntio.h>
108 #include <sys/mnttab.h>
109 #include <sys/fs/autofs.h>	/* for _autofssys() */
110 #include <sys/fs/lofs_info.h>
111 #include <sys/fs/zfs.h>
112 
113 #include <pool.h>
114 #include <sys/pool.h>
115 #include <sys/priocntl.h>
116 
117 #include <libbrand.h>
118 #include <sys/brand.h>
119 #include <libzonecfg.h>
120 #include <synch.h>
121 
122 #include "zoneadmd.h"
123 #include <tsol/label.h>
124 #include <libtsnet.h>
125 #include <sys/priv.h>
126 #include <libinetutil.h>
127 
/*
 * NOTE(review): presumably the widths, in bits, of IPv4 and IPv6 addresses;
 * the users of these constants are outside this part of the file -- confirm.
 */
#define	V4_ADDR_LEN	32
#define	V6_ADDR_LEN	128

/*
 * Mount options applied when check_lofs_needed() converts a duplicate mount
 * into a lofs mount.
 */
#define	RESOURCE_DEFAULT_OPTS \
	MNTOPT_RO "," MNTOPT_LOFS_NOSUB "," MNTOPT_NODEVICES

/* file read by get_remote_fstypes(), one file-system type per line */
#define	DFSTYPES	"/etc/dfs/fstypes"
#define	MAXTNZLEN	2048

/* true for every mount command other than a normal boot (Z_MNT_BOOT) */
#define	ALT_MOUNT(mount_cmd) 	((mount_cmd) != Z_MNT_BOOT)

/* a reasonable estimate for the number of lwps per process */
#define	LWPS_PER_PROCESS	10
/* for routing socket */
static int rts_seqno = 0;

/* mangled zone name when mounting in an alternate root environment */
static char kernzone[ZONENAME_MAX];

/*
 * Array of cached mount entries for resolve_lofs.  resolve_lofs_mnts points
 * at the first entry and resolve_lofs_mnt_max one past the last; both are
 * NULL while nothing is cached (see lofs_read_mnttab() and
 * lofs_discard_mnttab()).
 */
static struct mnttab *resolve_lofs_mnts, *resolve_lofs_mnt_max;

/* for Trusted Extensions */
static tsol_zcent_t *get_zone_label(zlog_t *, priv_set_t *);
static int tsol_mounts(zlog_t *, char *, char *);
static void tsol_unmounts(zlog_t *, char *);

static m_label_t *zlabel = NULL;
static m_label_t *zid_label = NULL;
static priv_set_t *zprivs = NULL;

/* from libsocket, not in any header file */
extern int getnetmaskbyaddr(struct in_addr, struct in_addr *);

/* from zoneadmd */
extern char query_hook[];
165 
/*
 * For each "net" resource configured in zonecfg, we track a zone_addr_list_t
 * node in a linked list that is sorted by linkid.  The list is constructed as
 * the xml configuration file is parsed, and the information
 * contained in each node is added to the kernel before the zone is
 * booted, to be retrieved and applied from within the exclusive-IP NGZ
 * on boot.
 */
typedef struct zone_addr_list {
	struct zone_addr_list *za_next;	/* next node in the list */
	datalink_id_t za_linkid;	/* datalink_id_t of interface */
	struct zone_nwiftab za_nwiftab; /* address, defrouter properties */
} zone_addr_list_t;
179 
/*
 * An optimization for build_mnttable: reallocate (and potentially copy the
 * data) only once every N times through the loop.  The array grows by this
 * many entries each time it fills up.
 */
#define	MNTTAB_HUNK	32

/* some handy macros: view a generic socket address as an IPv4/IPv6 one */
#define	SIN(s)	((struct sockaddr_in *)s)
#define	SIN6(s)	((struct sockaddr_in6 *)s)
189 
190 /*
191  * Private autofs system call
192  */
193 extern int _autofssys(int, void *);
194 
195 static int
196 autofs_cleanup(zoneid_t zoneid)
197 {
198 	/*
199 	 * Ask autofs to unmount all trigger nodes in the given zone.
200 	 */
201 	return (_autofssys(AUTOFS_UNMOUNTALL, (void *)zoneid));
202 }
203 
204 static void
205 free_mnttable(struct mnttab *mnt_array, uint_t nelem)
206 {
207 	uint_t i;
208 
209 	if (mnt_array == NULL)
210 		return;
211 	for (i = 0; i < nelem; i++) {
212 		free(mnt_array[i].mnt_mountp);
213 		free(mnt_array[i].mnt_fstype);
214 		free(mnt_array[i].mnt_special);
215 		free(mnt_array[i].mnt_mntopts);
216 		assert(mnt_array[i].mnt_time == NULL);
217 	}
218 	free(mnt_array);
219 }
220 
221 /*
222  * Build the mount table for the zone rooted at "zroot", storing the resulting
223  * array of struct mnttabs in "mnt_arrayp" and the number of elements in the
224  * array in "nelemp".
225  */
226 static int
227 build_mnttable(zlog_t *zlogp, const char *zroot, size_t zrootlen, FILE *mnttab,
228     struct mnttab **mnt_arrayp, uint_t *nelemp)
229 {
230 	struct mnttab mnt;
231 	struct mnttab *mnts;
232 	struct mnttab *mnp;
233 	uint_t nmnt;
234 
235 	rewind(mnttab);
236 	resetmnttab(mnttab);
237 	nmnt = 0;
238 	mnts = NULL;
239 	while (getmntent(mnttab, &mnt) == 0) {
240 		struct mnttab *tmp_array;
241 
242 		if (strncmp(mnt.mnt_mountp, zroot, zrootlen) != 0)
243 			continue;
244 		if (nmnt % MNTTAB_HUNK == 0) {
245 			tmp_array = realloc(mnts,
246 			    (nmnt + MNTTAB_HUNK) * sizeof (*mnts));
247 			if (tmp_array == NULL) {
248 				free_mnttable(mnts, nmnt);
249 				return (-1);
250 			}
251 			mnts = tmp_array;
252 		}
253 		mnp = &mnts[nmnt++];
254 
255 		/*
256 		 * Zero out any fields we're not using.
257 		 */
258 		(void) memset(mnp, 0, sizeof (*mnp));
259 
260 		if (mnt.mnt_special != NULL)
261 			mnp->mnt_special = strdup(mnt.mnt_special);
262 		if (mnt.mnt_mntopts != NULL)
263 			mnp->mnt_mntopts = strdup(mnt.mnt_mntopts);
264 		mnp->mnt_mountp = strdup(mnt.mnt_mountp);
265 		mnp->mnt_fstype = strdup(mnt.mnt_fstype);
266 		if ((mnt.mnt_special != NULL && mnp->mnt_special == NULL) ||
267 		    (mnt.mnt_mntopts != NULL && mnp->mnt_mntopts == NULL) ||
268 		    mnp->mnt_mountp == NULL || mnp->mnt_fstype == NULL) {
269 			zerror(zlogp, B_TRUE, "memory allocation failed");
270 			free_mnttable(mnts, nmnt);
271 			return (-1);
272 		}
273 	}
274 	*mnt_arrayp = mnts;
275 	*nelemp = nmnt;
276 	return (0);
277 }
278 
279 /*
280  * This is an optimization.  The resolve_lofs function is used quite frequently
281  * to manipulate file paths, and on a machine with a large number of zones,
282  * there will be a huge number of mounted file systems.  Thus, we trigger a
283  * reread of the list of mount points
284  */
285 static void
286 lofs_discard_mnttab(void)
287 {
288 	free_mnttable(resolve_lofs_mnts,
289 	    resolve_lofs_mnt_max - resolve_lofs_mnts);
290 	resolve_lofs_mnts = resolve_lofs_mnt_max = NULL;
291 }
292 
293 static int
294 lofs_read_mnttab(zlog_t *zlogp)
295 {
296 	FILE *mnttab;
297 	uint_t nmnts;
298 
299 	if ((mnttab = fopen(MNTTAB, "r")) == NULL)
300 		return (-1);
301 	if (build_mnttable(zlogp, "", 0, mnttab, &resolve_lofs_mnts,
302 	    &nmnts) == -1) {
303 		(void) fclose(mnttab);
304 		return (-1);
305 	}
306 	(void) fclose(mnttab);
307 	resolve_lofs_mnt_max = resolve_lofs_mnts + nmnts;
308 	return (0);
309 }
310 
311 /*
312  * This function loops over potential loopback mounts and symlinks in a given
313  * path and resolves them all down to an absolute path.
314  */
315 void
316 resolve_lofs(zlog_t *zlogp, char *path, size_t pathlen)
317 {
318 	int len, arlen;
319 	const char *altroot;
320 	char tmppath[MAXPATHLEN];
321 	boolean_t outside_altroot;
322 
323 	if ((len = resolvepath(path, tmppath, sizeof (tmppath))) == -1)
324 		return;
325 	tmppath[len] = '\0';
326 	(void) strlcpy(path, tmppath, sizeof (tmppath));
327 
328 	/* This happens once per zoneadmd operation. */
329 	if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
330 		return;
331 
332 	altroot = zonecfg_get_root();
333 	arlen = strlen(altroot);
334 	outside_altroot = B_FALSE;
335 	for (;;) {
336 		struct mnttab *mnp;
337 
338 		/* Search in reverse order to find longest match */
339 		for (mnp = resolve_lofs_mnt_max - 1; mnp >= resolve_lofs_mnts;
340 		    mnp--) {
341 			if (mnp->mnt_fstype == NULL ||
342 			    mnp->mnt_mountp == NULL ||
343 			    mnp->mnt_special == NULL)
344 				continue;
345 			len = strlen(mnp->mnt_mountp);
346 			if (strncmp(mnp->mnt_mountp, path, len) == 0 &&
347 			    (path[len] == '/' || path[len] == '\0'))
348 				break;
349 		}
350 		if (mnp < resolve_lofs_mnts)
351 			break;
352 		/* If it's not a lofs then we're done */
353 		if (strcmp(mnp->mnt_fstype, MNTTYPE_LOFS) != 0)
354 			break;
355 		if (outside_altroot) {
356 			char *cp;
357 			int olen = sizeof (MNTOPT_RO) - 1;
358 
359 			/*
360 			 * If we run into a read-only mount outside of the
361 			 * alternate root environment, then the user doesn't
362 			 * want this path to be made read-write.
363 			 */
364 			if (mnp->mnt_mntopts != NULL &&
365 			    (cp = strstr(mnp->mnt_mntopts, MNTOPT_RO)) !=
366 			    NULL &&
367 			    (cp == mnp->mnt_mntopts || cp[-1] == ',') &&
368 			    (cp[olen] == '\0' || cp[olen] == ',')) {
369 				break;
370 			}
371 		} else if (arlen > 0 &&
372 		    (strncmp(mnp->mnt_special, altroot, arlen) != 0 ||
373 		    (mnp->mnt_special[arlen] != '\0' &&
374 		    mnp->mnt_special[arlen] != '/'))) {
375 			outside_altroot = B_TRUE;
376 		}
377 		/* use temporary buffer because new path might be longer */
378 		(void) snprintf(tmppath, sizeof (tmppath), "%s%s",
379 		    mnp->mnt_special, path + len);
380 		if ((len = resolvepath(tmppath, path, pathlen)) == -1)
381 			break;
382 		path[len] = '\0';
383 	}
384 }
385 
386 /*
387  * For a regular mount, check if a replacement lofs mount is needed because the
388  * referenced device is already mounted somewhere.
389  */
390 static int
391 check_lofs_needed(zlog_t *zlogp, struct zone_fstab *fsptr)
392 {
393 	struct mnttab *mnp;
394 	zone_fsopt_t *optptr, *onext;
395 
396 	/* This happens once per zoneadmd operation. */
397 	if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
398 		return (-1);
399 
400 	/*
401 	 * If this special node isn't already in use, then it's ours alone;
402 	 * no need to worry about conflicting mounts.
403 	 */
404 	for (mnp = resolve_lofs_mnts; mnp < resolve_lofs_mnt_max;
405 	    mnp++) {
406 		if (strcmp(mnp->mnt_special, fsptr->zone_fs_special) == 0)
407 			break;
408 	}
409 	if (mnp >= resolve_lofs_mnt_max)
410 		return (0);
411 
412 	/*
413 	 * Convert this duplicate mount into a lofs mount.
414 	 */
415 	(void) strlcpy(fsptr->zone_fs_special, mnp->mnt_mountp,
416 	    sizeof (fsptr->zone_fs_special));
417 	(void) strlcpy(fsptr->zone_fs_type, MNTTYPE_LOFS,
418 	    sizeof (fsptr->zone_fs_type));
419 	fsptr->zone_fs_raw[0] = '\0';
420 
421 	/*
422 	 * Discard all but one of the original options and set that to our
423 	 * default set of options used for resources.
424 	 */
425 	optptr = fsptr->zone_fs_options;
426 	if (optptr == NULL) {
427 		optptr = malloc(sizeof (*optptr));
428 		if (optptr == NULL) {
429 			zerror(zlogp, B_TRUE, "cannot mount %s",
430 			    fsptr->zone_fs_dir);
431 			return (-1);
432 		}
433 	} else {
434 		while ((onext = optptr->zone_fsopt_next) != NULL) {
435 			optptr->zone_fsopt_next = onext->zone_fsopt_next;
436 			free(onext);
437 		}
438 	}
439 	(void) strcpy(optptr->zone_fsopt_opt, RESOURCE_DEFAULT_OPTS);
440 	optptr->zone_fsopt_next = NULL;
441 	fsptr->zone_fs_options = optptr;
442 	return (0);
443 }
444 
445 int
446 make_one_dir(zlog_t *zlogp, const char *prefix, const char *subdir, mode_t mode,
447     uid_t userid, gid_t groupid)
448 {
449 	char path[MAXPATHLEN];
450 	struct stat st;
451 
452 	if (snprintf(path, sizeof (path), "%s%s", prefix, subdir) >
453 	    sizeof (path)) {
454 		zerror(zlogp, B_FALSE, "pathname %s%s is too long", prefix,
455 		    subdir);
456 		return (-1);
457 	}
458 
459 	if (lstat(path, &st) == 0) {
460 		/*
461 		 * We don't check the file mode since presumably the zone
462 		 * administrator may have had good reason to change the mode,
463 		 * and we don't need to second guess him.
464 		 */
465 		if (!S_ISDIR(st.st_mode)) {
466 			if (S_ISREG(st.st_mode)) {
467 				/*
468 				 * Allow readonly mounts of /etc/ files; this
469 				 * is needed most by Trusted Extensions.
470 				 */
471 				if (strncmp(subdir, "/etc/",
472 				    strlen("/etc/")) != 0) {
473 					zerror(zlogp, B_FALSE,
474 					    "%s is not in /etc", path);
475 					return (-1);
476 				}
477 			} else {
478 				zerror(zlogp, B_FALSE,
479 				    "%s is not a directory", path);
480 				return (-1);
481 			}
482 		}
483 		return (0);
484 	}
485 
486 	if (mkdirp(path, mode) != 0) {
487 		if (errno == EROFS)
488 			zerror(zlogp, B_FALSE, "Could not mkdir %s.\nIt is on "
489 			    "a read-only file system in this local zone.\nMake "
490 			    "sure %s exists in the global zone.", path, subdir);
491 		else
492 			zerror(zlogp, B_TRUE, "mkdirp of %s failed", path);
493 		return (-1);
494 	}
495 
496 	(void) chown(path, userid, groupid);
497 	return (0);
498 }
499 
/*
 * Free a NULL-terminated array of file-system type names, as returned by
 * get_remote_fstypes().  A NULL array is accepted and ignored.
 */
static void
free_remote_fstypes(char **types)
{
	char **cursor;

	if (types == NULL)
		return;
	for (cursor = types; *cursor != NULL; cursor++)
		free(*cursor);
	free(types);
}
511 
512 static char **
513 get_remote_fstypes(zlog_t *zlogp)
514 {
515 	char **types = NULL;
516 	FILE *fp;
517 	char buf[MAXPATHLEN];
518 	char fstype[MAXPATHLEN];
519 	uint_t lines = 0;
520 	uint_t i;
521 
522 	if ((fp = fopen(DFSTYPES, "r")) == NULL) {
523 		zerror(zlogp, B_TRUE, "failed to open %s", DFSTYPES);
524 		return (NULL);
525 	}
526 	/*
527 	 * Count the number of lines
528 	 */
529 	while (fgets(buf, sizeof (buf), fp) != NULL)
530 		lines++;
531 	if (lines == 0)	/* didn't read anything; empty file */
532 		goto out;
533 	rewind(fp);
534 	/*
535 	 * Allocate enough space for a NULL-terminated array.
536 	 */
537 	types = calloc(lines + 1, sizeof (char *));
538 	if (types == NULL) {
539 		zerror(zlogp, B_TRUE, "memory allocation failed");
540 		goto out;
541 	}
542 	i = 0;
543 	while (fgets(buf, sizeof (buf), fp) != NULL) {
544 		/* LINTED - fstype is big enough to hold buf */
545 		if (sscanf(buf, "%s", fstype) == 0) {
546 			zerror(zlogp, B_FALSE, "unable to parse %s", DFSTYPES);
547 			free_remote_fstypes(types);
548 			types = NULL;
549 			goto out;
550 		}
551 		types[i] = strdup(fstype);
552 		if (types[i] == NULL) {
553 			zerror(zlogp, B_TRUE, "memory allocation failed");
554 			free_remote_fstypes(types);
555 			types = NULL;
556 			goto out;
557 		}
558 		i++;
559 	}
560 out:
561 	(void) fclose(fp);
562 	return (types);
563 }
564 
565 static boolean_t
566 is_remote_fstype(const char *fstype, char *const *remote_fstypes)
567 {
568 	uint_t i;
569 
570 	if (remote_fstypes == NULL)
571 		return (B_FALSE);
572 	for (i = 0; remote_fstypes[i] != NULL; i++) {
573 		if (strcmp(remote_fstypes[i], fstype) == 0)
574 			return (B_TRUE);
575 	}
576 	return (B_FALSE);
577 }
578 
579 /*
580  * This converts a zone root path (normally of the form .../root) to a Live
581  * Upgrade scratch zone root (of the form .../lu).
582  */
583 static void
584 root_to_lu(zlog_t *zlogp, char *zroot, size_t zrootlen, boolean_t isresolved)
585 {
586 	if (!isresolved && zonecfg_in_alt_root())
587 		resolve_lofs(zlogp, zroot, zrootlen);
588 	(void) strcpy(strrchr(zroot, '/') + 1, "lu");
589 }
590 
591 /*
592  * The general strategy for unmounting filesystems is as follows:
593  *
594  * - Remote filesystems may be dead, and attempting to contact them as
595  * part of a regular unmount may hang forever; we want to always try to
596  * forcibly unmount such filesystems and only fall back to regular
597  * unmounts if the filesystem doesn't support forced unmounts.
598  *
599  * - We don't want to unnecessarily corrupt metadata on local
600  * filesystems (ie UFS), so we want to start off with graceful unmounts,
601  * and only escalate to doing forced unmounts if we get stuck.
602  *
603  * We start off walking backwards through the mount table.  This doesn't
604  * give us strict ordering but ensures that we try to unmount submounts
605  * first.  We thus limit the number of failed umount2(2) calls.
606  *
607  * The mechanism for determining if we're stuck is to count the number
608  * of failed unmounts each iteration through the mount table.  This
609  * gives us an upper bound on the number of filesystems which remain
610  * mounted (autofs trigger nodes are dealt with separately).  If at the
611  * end of one unmount+autofs_cleanup cycle we still have the same number
612  * of mounts that we started out with, we're stuck and try a forced
613  * unmount.  If that fails (filesystem doesn't support forced unmounts)
614  * then we bail and are unable to teardown the zone.  If it succeeds,
615  * we're no longer stuck so we continue with our policy of trying
616  * graceful mounts first.
617  *
618  * Zone must be down (ie, no processes or threads active).
619  */
620 static int
621 unmount_filesystems(zlog_t *zlogp, zoneid_t zoneid, boolean_t unmount_cmd)
622 {
623 	int error = 0;
624 	FILE *mnttab;
625 	struct mnttab *mnts;
626 	uint_t nmnt;
627 	char zroot[MAXPATHLEN + 1];
628 	size_t zrootlen;
629 	uint_t oldcount = UINT_MAX;
630 	boolean_t stuck = B_FALSE;
631 	char **remote_fstypes = NULL;
632 
633 	if (zone_get_rootpath(zone_name, zroot, sizeof (zroot)) != Z_OK) {
634 		zerror(zlogp, B_FALSE, "unable to determine zone root");
635 		return (-1);
636 	}
637 	if (unmount_cmd)
638 		root_to_lu(zlogp, zroot, sizeof (zroot), B_FALSE);
639 
640 	(void) strcat(zroot, "/");
641 	zrootlen = strlen(zroot);
642 
643 	/*
644 	 * For Trusted Extensions unmount each higher level zone's mount
645 	 * of our zone's /export/home
646 	 */
647 	if (!unmount_cmd)
648 		tsol_unmounts(zlogp, zone_name);
649 
650 	if ((mnttab = fopen(MNTTAB, "r")) == NULL) {
651 		zerror(zlogp, B_TRUE, "failed to open %s", MNTTAB);
652 		return (-1);
653 	}
654 	/*
655 	 * Use our hacky mntfs ioctl so we see everything, even mounts with
656 	 * MS_NOMNTTAB.
657 	 */
658 	if (ioctl(fileno(mnttab), MNTIOC_SHOWHIDDEN, NULL) < 0) {
659 		zerror(zlogp, B_TRUE, "unable to configure %s", MNTTAB);
660 		error++;
661 		goto out;
662 	}
663 
664 	/*
665 	 * Build the list of remote fstypes so we know which ones we
666 	 * should forcibly unmount.
667 	 */
668 	remote_fstypes = get_remote_fstypes(zlogp);
669 	for (; /* ever */; ) {
670 		uint_t newcount = 0;
671 		boolean_t unmounted;
672 		struct mnttab *mnp;
673 		char *path;
674 		uint_t i;
675 
676 		mnts = NULL;
677 		nmnt = 0;
678 		/*
679 		 * MNTTAB gives us a way to walk through mounted
680 		 * filesystems; we need to be able to walk them in
681 		 * reverse order, so we build a list of all mounted
682 		 * filesystems.
683 		 */
684 		if (build_mnttable(zlogp, zroot, zrootlen, mnttab, &mnts,
685 		    &nmnt) != 0) {
686 			error++;
687 			goto out;
688 		}
689 		for (i = 0; i < nmnt; i++) {
690 			mnp = &mnts[nmnt - i - 1]; /* access in reverse order */
691 			path = mnp->mnt_mountp;
692 			unmounted = B_FALSE;
693 			/*
694 			 * Try forced unmount first for remote filesystems.
695 			 *
696 			 * Not all remote filesystems support forced unmounts,
697 			 * so if this fails (ENOTSUP) we'll continue on
698 			 * and try a regular unmount.
699 			 */
700 			if (is_remote_fstype(mnp->mnt_fstype, remote_fstypes)) {
701 				if (umount2(path, MS_FORCE) == 0)
702 					unmounted = B_TRUE;
703 			}
704 			/*
705 			 * Try forced unmount if we're stuck.
706 			 */
707 			if (stuck) {
708 				if (umount2(path, MS_FORCE) == 0) {
709 					unmounted = B_TRUE;
710 					stuck = B_FALSE;
711 				} else {
712 					/*
713 					 * The first failure indicates a
714 					 * mount we won't be able to get
715 					 * rid of automatically, so we
716 					 * bail.
717 					 */
718 					error++;
719 					zerror(zlogp, B_FALSE,
720 					    "unable to unmount '%s'", path);
721 					free_mnttable(mnts, nmnt);
722 					goto out;
723 				}
724 			}
725 			/*
726 			 * Try regular unmounts for everything else.
727 			 */
728 			if (!unmounted && umount2(path, 0) != 0)
729 				newcount++;
730 		}
731 		free_mnttable(mnts, nmnt);
732 
733 		if (newcount == 0)
734 			break;
735 		if (newcount >= oldcount) {
736 			/*
737 			 * Last round didn't unmount anything; we're stuck and
738 			 * should start trying forced unmounts.
739 			 */
740 			stuck = B_TRUE;
741 		}
742 		oldcount = newcount;
743 
744 		/*
745 		 * Autofs doesn't let you unmount its trigger nodes from
746 		 * userland so we have to tell the kernel to cleanup for us.
747 		 */
748 		if (autofs_cleanup(zoneid) != 0) {
749 			zerror(zlogp, B_TRUE, "unable to remove autofs nodes");
750 			error++;
751 			goto out;
752 		}
753 	}
754 
755 out:
756 	free_remote_fstypes(remote_fstypes);
757 	(void) fclose(mnttab);
758 	return (error ? -1 : 0);
759 }
760 
761 static int
762 fs_compare(const void *m1, const void *m2)
763 {
764 	struct zone_fstab *i = (struct zone_fstab *)m1;
765 	struct zone_fstab *j = (struct zone_fstab *)m2;
766 
767 	return (strcmp(i->zone_fs_dir, j->zone_fs_dir));
768 }
769 
770 /*
771  * Fork and exec (and wait for) the mentioned binary with the provided
772  * arguments.  Returns (-1) if something went wrong with fork(2) or exec(2),
773  * returns the exit status otherwise.
774  *
775  * If we were unable to exec the provided pathname (for whatever
776  * reason), we return the special token ZEXIT_EXEC.  The current value
777  * of ZEXIT_EXEC doesn't conflict with legitimate exit codes of the
778  * consumers of this function; any future consumers must make sure this
779  * remains the case.
780  */
781 static int
782 forkexec(zlog_t *zlogp, const char *path, char *const argv[])
783 {
784 	pid_t child_pid;
785 	int child_status = 0;
786 
787 	/*
788 	 * Do not let another thread localize a message while we are forking.
789 	 */
790 	(void) mutex_lock(&msglock);
791 	child_pid = fork();
792 	(void) mutex_unlock(&msglock);
793 	if (child_pid == -1) {
794 		zerror(zlogp, B_TRUE, "could not fork for %s", argv[0]);
795 		return (-1);
796 	} else if (child_pid == 0) {
797 		closefrom(0);
798 		/* redirect stdin, stdout & stderr to /dev/null */
799 		(void) open("/dev/null", O_RDONLY);	/* stdin */
800 		(void) open("/dev/null", O_WRONLY);	/* stdout */
801 		(void) open("/dev/null", O_WRONLY);	/* stderr */
802 		(void) execv(path, argv);
803 		/*
804 		 * Since we are in the child, there is no point calling zerror()
805 		 * since there is nobody waiting to consume it.  So exit with a
806 		 * special code that the parent will recognize and call zerror()
807 		 * accordingly.
808 		 */
809 
810 		_exit(ZEXIT_EXEC);
811 	} else {
812 		(void) waitpid(child_pid, &child_status, 0);
813 	}
814 
815 	if (WIFSIGNALED(child_status)) {
816 		zerror(zlogp, B_FALSE, "%s unexpectedly terminated due to "
817 		    "signal %d", path, WTERMSIG(child_status));
818 		return (-1);
819 	}
820 	assert(WIFEXITED(child_status));
821 	if (WEXITSTATUS(child_status) == ZEXIT_EXEC) {
822 		zerror(zlogp, B_FALSE, "failed to exec %s", path);
823 		return (-1);
824 	}
825 	return (WEXITSTATUS(child_status));
826 }
827 
828 static int
829 isregfile(const char *path)
830 {
831 	struct stat64 st;
832 
833 	if (stat64(path, &st) == -1)
834 		return (-1);
835 
836 	return (S_ISREG(st.st_mode));
837 }
838 
839 static int
840 dofsck(zlog_t *zlogp, const char *fstype, const char *rawdev)
841 {
842 	char cmdbuf[MAXPATHLEN];
843 	char *argv[5];
844 	int status;
845 
846 	/*
847 	 * We could alternatively have called /usr/sbin/fsck -F <fstype>, but
848 	 * that would cost us an extra fork/exec without buying us anything.
849 	 */
850 	if (snprintf(cmdbuf, sizeof (cmdbuf), "/usr/lib/fs/%s/fsck", fstype)
851 	    >= sizeof (cmdbuf)) {
852 		zerror(zlogp, B_FALSE, "file-system type %s too long", fstype);
853 		return (-1);
854 	}
855 
856 	/*
857 	 * If it doesn't exist, that's OK: we verified this previously
858 	 * in zoneadm.
859 	 */
860 	if (isregfile(cmdbuf) == -1)
861 		return (0);
862 
863 	argv[0] = "fsck";
864 	argv[1] = "-o";
865 	argv[2] = "p";
866 	argv[3] = (char *)rawdev;
867 	argv[4] = NULL;
868 
869 	status = forkexec(zlogp, cmdbuf, argv);
870 	if (status == 0 || status == -1)
871 		return (status);
872 	zerror(zlogp, B_FALSE, "fsck of '%s' failed with exit status %d; "
873 	    "run fsck manually", rawdev, status);
874 	return (-1);
875 }
876 
877 static int
878 domount(zlog_t *zlogp, const char *fstype, const char *opts,
879     const char *special, const char *directory)
880 {
881 	char cmdbuf[MAXPATHLEN];
882 	char *argv[6];
883 	int status;
884 
885 	/*
886 	 * We could alternatively have called /usr/sbin/mount -F <fstype>, but
887 	 * that would cost us an extra fork/exec without buying us anything.
888 	 */
889 	if (snprintf(cmdbuf, sizeof (cmdbuf), "/usr/lib/fs/%s/mount", fstype)
890 	    >= sizeof (cmdbuf)) {
891 		zerror(zlogp, B_FALSE, "file-system type %s too long", fstype);
892 		return (-1);
893 	}
894 	argv[0] = "mount";
895 	if (opts[0] == '\0') {
896 		argv[1] = (char *)special;
897 		argv[2] = (char *)directory;
898 		argv[3] = NULL;
899 	} else {
900 		argv[1] = "-o";
901 		argv[2] = (char *)opts;
902 		argv[3] = (char *)special;
903 		argv[4] = (char *)directory;
904 		argv[5] = NULL;
905 	}
906 
907 	status = forkexec(zlogp, cmdbuf, argv);
908 	if (status == 0 || status == -1)
909 		return (status);
910 	if (opts[0] == '\0')
911 		zerror(zlogp, B_FALSE, "\"%s %s %s\" "
912 		    "failed with exit code %d",
913 		    cmdbuf, special, directory, status);
914 	else
915 		zerror(zlogp, B_FALSE, "\"%s -o %s %s %s\" "
916 		    "failed with exit code %d",
917 		    cmdbuf, opts, special, directory, status);
918 	return (-1);
919 }
920 
921 /*
922  * Check if a given mount point path exists.
923  * If it does, make sure it doesn't contain any symlinks.
924  * Note that if "leaf" is false we're checking an intermediate
925  * component of the mount point path, so it must be a directory.
926  * If "leaf" is true, then we're checking the entire mount point
927  * path, so the mount point itself can be anything aside from a
928  * symbolic link.
929  *
930  * If the path is invalid then a negative value is returned.  If the
931  * path exists and is a valid mount point path then 0 is returned.
932  * If the path doesn't exist return a positive value.
933  */
934 static int
935 valid_mount_point(zlog_t *zlogp, const char *path, const boolean_t leaf)
936 {
937 	struct stat statbuf;
938 	char respath[MAXPATHLEN];
939 	int res;
940 
941 	if (lstat(path, &statbuf) != 0) {
942 		if (errno == ENOENT)
943 			return (1);
944 		zerror(zlogp, B_TRUE, "can't stat %s", path);
945 		return (-1);
946 	}
947 	if (S_ISLNK(statbuf.st_mode)) {
948 		zerror(zlogp, B_FALSE, "%s is a symlink", path);
949 		return (-1);
950 	}
951 	if (!leaf && !S_ISDIR(statbuf.st_mode)) {
952 		zerror(zlogp, B_FALSE, "%s is not a directory", path);
953 		return (-1);
954 	}
955 	if ((res = resolvepath(path, respath, sizeof (respath))) == -1) {
956 		zerror(zlogp, B_TRUE, "unable to resolve path %s", path);
957 		return (-1);
958 	}
959 	respath[res] = '\0';
960 	if (strcmp(path, respath) != 0) {
961 		/*
962 		 * We don't like ".."s, "."s, or "//"s throwing us off
963 		 */
964 		zerror(zlogp, B_FALSE, "%s is not a canonical path", path);
965 		return (-1);
966 	}
967 	return (0);
968 }
969 
970 /*
971  * Validate a mount point path.  A valid mount point path is an
972  * absolute path that either doesn't exist, or, if it does exists it
973  * must be an absolute canonical path that doesn't have any symbolic
974  * links in it.  The target of a mount point path can be any filesystem
975  * object.  (Different filesystems can support different mount points,
976  * for example "lofs" and "mntfs" both support files and directories
977  * while "ufs" just supports directories.)
978  *
979  * If the path is invalid then a negative value is returned.  If the
980  * path exists and is a valid mount point path then 0 is returned.
981  * If the path doesn't exist return a positive value.
982  */
983 int
984 valid_mount_path(zlog_t *zlogp, const char *rootpath, const char *spec,
985     const char *dir, const char *fstype)
986 {
987 	char abspath[MAXPATHLEN], *slashp, *slashp_next;
988 	int rv;
989 
990 	/*
991 	 * Sanity check the target mount point path.
992 	 * It must be a non-null string that starts with a '/'.
993 	 */
994 	if (dir[0] != '/') {
995 		/* Something went wrong. */
996 		zerror(zlogp, B_FALSE, "invalid mount directory, "
997 		    "type: \"%s\", special: \"%s\", dir: \"%s\"",
998 		    fstype, spec, dir);
999 		return (-1);
1000 	}
1001 
1002 	/*
1003 	 * Join rootpath and dir.  Make sure abspath ends with '/', this
1004 	 * is added to all paths (even non-directory paths) to allow us
1005 	 * to detect the end of paths below.  If the path already ends
1006 	 * in a '/', then that's ok too (although we'll fail the
1007 	 * cannonical path check in valid_mount_point()).
1008 	 */
1009 	if (snprintf(abspath, sizeof (abspath),
1010 	    "%s%s/", rootpath, dir) >= sizeof (abspath)) {
1011 		zerror(zlogp, B_FALSE, "pathname %s%s is too long",
1012 		    rootpath, dir);
1013 		return (-1);
1014 	}
1015 
1016 	/*
1017 	 * Starting with rootpath, verify the mount path one component
1018 	 * at a time.  Continue until we've evaluated all of abspath.
1019 	 */
1020 	slashp = &abspath[strlen(rootpath)];
1021 	assert(*slashp == '/');
1022 	do {
1023 		slashp_next = strchr(slashp + 1, '/');
1024 		*slashp = '\0';
1025 		if (slashp_next != NULL) {
1026 			/* This is an intermediary mount path component. */
1027 			rv = valid_mount_point(zlogp, abspath, B_FALSE);
1028 		} else {
1029 			/* This is the last component of the mount path. */
1030 			rv = valid_mount_point(zlogp, abspath, B_TRUE);
1031 		}
1032 		if (rv < 0)
1033 			return (rv);
1034 		*slashp = '/';
1035 	} while ((slashp = slashp_next) != NULL);
1036 	return (rv);
1037 }
1038 
1039 static int
1040 mount_one_dev_device_cb(void *arg, const char *match, const char *name)
1041 {
1042 	di_prof_t prof = arg;
1043 
1044 	if (name == NULL)
1045 		return (di_prof_add_dev(prof, match));
1046 	return (di_prof_add_map(prof, match, name));
1047 }
1048 
1049 static int
1050 mount_one_dev_symlink_cb(void *arg, const char *source, const char *target)
1051 {
1052 	di_prof_t prof = arg;
1053 
1054 	return (di_prof_add_symlink(prof, source, target));
1055 }
1056 
1057 int
1058 vplat_get_iptype(zlog_t *zlogp, zone_iptype_t *iptypep)
1059 {
1060 	zone_dochandle_t handle;
1061 
1062 	if ((handle = zonecfg_init_handle()) == NULL) {
1063 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
1064 		return (-1);
1065 	}
1066 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
1067 		zerror(zlogp, B_FALSE, "invalid configuration");
1068 		zonecfg_fini_handle(handle);
1069 		return (-1);
1070 	}
1071 	if (zonecfg_get_iptype(handle, iptypep) != Z_OK) {
1072 		zerror(zlogp, B_FALSE, "invalid ip-type configuration");
1073 		zonecfg_fini_handle(handle);
1074 		return (-1);
1075 	}
1076 	zonecfg_fini_handle(handle);
1077 	return (0);
1078 }
1079 
1080 /*
1081  * Apply the standard lists of devices/symlinks/mappings and the user-specified
1082  * list of devices (via zonecfg) to the /dev filesystem.  The filesystem will
1083  * use these as a profile/filter to determine what exists in /dev.
1084  */
1085 static int
1086 mount_one_dev(zlog_t *zlogp, char *devpath, zone_mnt_t mount_cmd)
1087 {
1088 	char			brand[MAXNAMELEN];
1089 	zone_dochandle_t	handle = NULL;
1090 	brand_handle_t		bh = NULL;
1091 	struct zone_devtab	ztab;
1092 	di_prof_t		prof = NULL;
1093 	int			err;
1094 	int			retval = -1;
1095 	zone_iptype_t		iptype;
1096 	const char 		*curr_iptype;
1097 
1098 	if (di_prof_init(devpath, &prof)) {
1099 		zerror(zlogp, B_TRUE, "failed to initialize profile");
1100 		goto cleanup;
1101 	}
1102 
1103 	/*
1104 	 * Get a handle to the brand info for this zone.
1105 	 * If we are mounting the zone, then we must always use the default
1106 	 * brand device mounts.
1107 	 */
1108 	if (ALT_MOUNT(mount_cmd)) {
1109 		(void) strlcpy(brand, default_brand, sizeof (brand));
1110 	} else {
1111 		(void) strlcpy(brand, brand_name, sizeof (brand));
1112 	}
1113 
1114 	if ((bh = brand_open(brand)) == NULL) {
1115 		zerror(zlogp, B_FALSE, "unable to determine zone brand");
1116 		goto cleanup;
1117 	}
1118 
1119 	if (vplat_get_iptype(zlogp, &iptype) < 0) {
1120 		zerror(zlogp, B_TRUE, "unable to determine ip-type");
1121 		goto cleanup;
1122 	}
1123 	switch (iptype) {
1124 	case ZS_SHARED:
1125 		curr_iptype = "shared";
1126 		break;
1127 	case ZS_EXCLUSIVE:
1128 		curr_iptype = "exclusive";
1129 		break;
1130 	}
1131 
1132 	if (brand_platform_iter_devices(bh, zone_name,
1133 	    mount_one_dev_device_cb, prof, curr_iptype) != 0) {
1134 		zerror(zlogp, B_TRUE, "failed to add standard device");
1135 		goto cleanup;
1136 	}
1137 
1138 	if (brand_platform_iter_link(bh,
1139 	    mount_one_dev_symlink_cb, prof) != 0) {
1140 		zerror(zlogp, B_TRUE, "failed to add standard symlink");
1141 		goto cleanup;
1142 	}
1143 
1144 	/* Add user-specified devices and directories */
1145 	if ((handle = zonecfg_init_handle()) == NULL) {
1146 		zerror(zlogp, B_FALSE, "can't initialize zone handle");
1147 		goto cleanup;
1148 	}
1149 	if (err = zonecfg_get_handle(zone_name, handle)) {
1150 		zerror(zlogp, B_FALSE, "can't get handle for zone "
1151 		    "%s: %s", zone_name, zonecfg_strerror(err));
1152 		goto cleanup;
1153 	}
1154 	if (err = zonecfg_setdevent(handle)) {
1155 		zerror(zlogp, B_FALSE, "%s: %s", zone_name,
1156 		    zonecfg_strerror(err));
1157 		goto cleanup;
1158 	}
1159 	while (zonecfg_getdevent(handle, &ztab) == Z_OK) {
1160 		if (di_prof_add_dev(prof, ztab.zone_dev_match)) {
1161 			zerror(zlogp, B_TRUE, "failed to add "
1162 			    "user-specified device");
1163 			goto cleanup;
1164 		}
1165 	}
1166 	(void) zonecfg_enddevent(handle);
1167 
1168 	/* Send profile to kernel */
1169 	if (di_prof_commit(prof)) {
1170 		zerror(zlogp, B_TRUE, "failed to commit profile");
1171 		goto cleanup;
1172 	}
1173 
1174 	retval = 0;
1175 
1176 cleanup:
1177 	if (bh != NULL)
1178 		brand_close(bh);
1179 	if (handle != NULL)
1180 		zonecfg_fini_handle(handle);
1181 	if (prof)
1182 		di_prof_fini(prof);
1183 	return (retval);
1184 }
1185 
1186 static int
1187 mount_one(zlog_t *zlogp, struct zone_fstab *fsptr, const char *rootpath,
1188     zone_mnt_t mount_cmd)
1189 {
1190 	char path[MAXPATHLEN];
1191 	char optstr[MAX_MNTOPT_STR];
1192 	zone_fsopt_t *optptr;
1193 	int rv;
1194 
1195 	if ((rv = valid_mount_path(zlogp, rootpath, fsptr->zone_fs_special,
1196 	    fsptr->zone_fs_dir, fsptr->zone_fs_type)) < 0) {
1197 		zerror(zlogp, B_FALSE, "%s%s is not a valid mount point",
1198 		    rootpath, fsptr->zone_fs_dir);
1199 		return (-1);
1200 	} else if (rv > 0) {
1201 		/* The mount point path doesn't exist, create it now. */
1202 		if (make_one_dir(zlogp, rootpath, fsptr->zone_fs_dir,
1203 		    DEFAULT_DIR_MODE, DEFAULT_DIR_USER,
1204 		    DEFAULT_DIR_GROUP) != 0) {
1205 			zerror(zlogp, B_FALSE, "failed to create mount point");
1206 			return (-1);
1207 		}
1208 
1209 		/*
1210 		 * Now this might seem weird, but we need to invoke
1211 		 * valid_mount_path() again.  Why?  Because it checks
1212 		 * to make sure that the mount point path is canonical,
1213 		 * which it can only do if the path exists, so now that
1214 		 * we've created the path we have to verify it again.
1215 		 */
1216 		if ((rv = valid_mount_path(zlogp, rootpath,
1217 		    fsptr->zone_fs_special, fsptr->zone_fs_dir,
1218 		    fsptr->zone_fs_type)) < 0) {
1219 			zerror(zlogp, B_FALSE,
1220 			    "%s%s is not a valid mount point",
1221 			    rootpath, fsptr->zone_fs_dir);
1222 			return (-1);
1223 		}
1224 	}
1225 
1226 	(void) snprintf(path, sizeof (path), "%s%s", rootpath,
1227 	    fsptr->zone_fs_dir);
1228 
1229 	/*
1230 	 * In general the strategy here is to do just as much verification as
1231 	 * necessary to avoid crashing or otherwise doing something bad; if the
1232 	 * administrator initiated the operation via zoneadm(1m), he'll get
1233 	 * auto-verification which will let him know what's wrong.  If he
1234 	 * modifies the zone configuration of a running zone and doesn't attempt
1235 	 * to verify that it's OK we won't crash but won't bother trying to be
1236 	 * too helpful either.  zoneadm verify is only a couple keystrokes away.
1237 	 */
1238 	if (!zonecfg_valid_fs_type(fsptr->zone_fs_type)) {
1239 		zerror(zlogp, B_FALSE, "cannot mount %s on %s: "
1240 		    "invalid file-system type %s", fsptr->zone_fs_special,
1241 		    fsptr->zone_fs_dir, fsptr->zone_fs_type);
1242 		return (-1);
1243 	}
1244 
1245 	/*
1246 	 * If we're looking at an alternate root environment, then construct
1247 	 * read-only loopback mounts as necessary.  Note that any special
1248 	 * paths for lofs zone mounts in an alternate root must have
1249 	 * already been pre-pended with any alternate root path by the
1250 	 * time we get here.
1251 	 */
1252 	if (zonecfg_in_alt_root()) {
1253 		struct stat64 st;
1254 
1255 		if (stat64(fsptr->zone_fs_special, &st) != -1 &&
1256 		    S_ISBLK(st.st_mode)) {
1257 			/*
1258 			 * If we're going to mount a block device we need
1259 			 * to check if that device is already mounted
1260 			 * somewhere else, and if so, do a lofs mount
1261 			 * of the device instead of a direct mount
1262 			 */
1263 			if (check_lofs_needed(zlogp, fsptr) == -1)
1264 				return (-1);
1265 		} else if (strcmp(fsptr->zone_fs_type, MNTTYPE_LOFS) == 0) {
1266 			/*
1267 			 * For lofs mounts, the special node is inside the
1268 			 * alternate root.  We need lofs resolution for
1269 			 * this case in order to get at the underlying
1270 			 * read-write path.
1271 			 */
1272 			resolve_lofs(zlogp, fsptr->zone_fs_special,
1273 			    sizeof (fsptr->zone_fs_special));
1274 		}
1275 	}
1276 
1277 	/*
1278 	 * Run 'fsck -m' if there's a device to fsck.
1279 	 */
1280 	if (fsptr->zone_fs_raw[0] != '\0' &&
1281 	    dofsck(zlogp, fsptr->zone_fs_type, fsptr->zone_fs_raw) != 0) {
1282 		return (-1);
1283 	} else if (isregfile(fsptr->zone_fs_special) == 1 &&
1284 	    dofsck(zlogp, fsptr->zone_fs_type, fsptr->zone_fs_special) != 0) {
1285 		return (-1);
1286 	}
1287 
1288 	/*
1289 	 * Build up mount option string.
1290 	 */
1291 	optstr[0] = '\0';
1292 	if (fsptr->zone_fs_options != NULL) {
1293 		(void) strlcpy(optstr, fsptr->zone_fs_options->zone_fsopt_opt,
1294 		    sizeof (optstr));
1295 		for (optptr = fsptr->zone_fs_options->zone_fsopt_next;
1296 		    optptr != NULL; optptr = optptr->zone_fsopt_next) {
1297 			(void) strlcat(optstr, ",", sizeof (optstr));
1298 			(void) strlcat(optstr, optptr->zone_fsopt_opt,
1299 			    sizeof (optstr));
1300 		}
1301 	}
1302 
1303 	if ((rv = domount(zlogp, fsptr->zone_fs_type, optstr,
1304 	    fsptr->zone_fs_special, path)) != 0)
1305 		return (rv);
1306 
1307 	/*
1308 	 * The mount succeeded.  If this was not a mount of /dev then
1309 	 * we're done.
1310 	 */
1311 	if (strcmp(fsptr->zone_fs_type, MNTTYPE_DEV) != 0)
1312 		return (0);
1313 
1314 	/*
1315 	 * We just mounted an instance of a /dev filesystem, so now we
1316 	 * need to configure it.
1317 	 */
1318 	return (mount_one_dev(zlogp, path, mount_cmd));
1319 }
1320 
1321 static void
1322 free_fs_data(struct zone_fstab *fsarray, uint_t nelem)
1323 {
1324 	uint_t i;
1325 
1326 	if (fsarray == NULL)
1327 		return;
1328 	for (i = 0; i < nelem; i++)
1329 		zonecfg_free_fs_option_list(fsarray[i].zone_fs_options);
1330 	free(fsarray);
1331 }
1332 
1333 /*
1334  * This function initiates the creation of a small Solaris Environment for
1335  * scratch zone. The Environment creation process is split up into two
1336  * functions(build_mounted_pre_var() and build_mounted_post_var()). It
1337  * is done this way because:
1338  * 	We need to have both /etc and /var in the root of the scratchzone.
1339  * 	We loopback mount zone's own /etc and /var into the root of the
1340  * 	scratch zone. Unlike /etc, /var can be a seperate filesystem. So we
1341  * 	need to delay the mount of /var till the zone's root gets populated.
1342  *	So mounting of localdirs[](/etc and /var) have been moved to the
1343  * 	build_mounted_post_var() which gets called only after the zone
1344  * 	specific filesystems are mounted.
1345  *
1346  * Note that the scratch zone we set up for updating the zone (Z_MNT_UPDATE)
1347  * does not loopback mount the zone's own /etc and /var into the root of the
1348  * scratch zone.
1349  */
1350 static boolean_t
1351 build_mounted_pre_var(zlog_t *zlogp, char *rootpath,
1352     size_t rootlen, const char *zonepath, char *luroot, size_t lurootlen)
1353 {
1354 	char tmp[MAXPATHLEN], fromdir[MAXPATHLEN];
1355 	const char **cpp;
1356 	static const char *mkdirs[] = {
1357 		"/system", "/system/contract", "/system/object", "/proc",
1358 		"/dev", "/tmp", "/a", NULL
1359 	};
1360 	char *altstr;
1361 	FILE *fp;
1362 	uuid_t uuid;
1363 
1364 	resolve_lofs(zlogp, rootpath, rootlen);
1365 	(void) snprintf(luroot, lurootlen, "%s/lu", zonepath);
1366 	resolve_lofs(zlogp, luroot, lurootlen);
1367 	(void) snprintf(tmp, sizeof (tmp), "%s/bin", luroot);
1368 	(void) symlink("./usr/bin", tmp);
1369 
1370 	/*
1371 	 * These are mostly special mount points; not handled here.  (See
1372 	 * zone_mount_early.)
1373 	 */
1374 	for (cpp = mkdirs; *cpp != NULL; cpp++) {
1375 		(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1376 		if (mkdir(tmp, 0755) != 0) {
1377 			zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1378 			return (B_FALSE);
1379 		}
1380 	}
1381 	/*
1382 	 * This is here to support lucopy.  If there's an instance of this same
1383 	 * zone on the current running system, then we mount its root up as
1384 	 * read-only inside the scratch zone.
1385 	 */
1386 	(void) zonecfg_get_uuid(zone_name, uuid);
1387 	altstr = strdup(zonecfg_get_root());
1388 	if (altstr == NULL) {
1389 		zerror(zlogp, B_TRUE, "memory allocation failed");
1390 		return (B_FALSE);
1391 	}
1392 	zonecfg_set_root("");
1393 	(void) strlcpy(tmp, zone_name, sizeof (tmp));
1394 	(void) zonecfg_get_name_by_uuid(uuid, tmp, sizeof (tmp));
1395 	if (zone_get_rootpath(tmp, fromdir, sizeof (fromdir)) == Z_OK &&
1396 	    strcmp(fromdir, rootpath) != 0) {
1397 		(void) snprintf(tmp, sizeof (tmp), "%s/b", luroot);
1398 		if (mkdir(tmp, 0755) != 0) {
1399 			zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1400 			return (B_FALSE);
1401 		}
1402 		if (domount(zlogp, MNTTYPE_LOFS, RESOURCE_DEFAULT_OPTS, fromdir,
1403 		    tmp) != 0) {
1404 			zerror(zlogp, B_TRUE, "cannot mount %s on %s", tmp,
1405 			    fromdir);
1406 			return (B_FALSE);
1407 		}
1408 	}
1409 	zonecfg_set_root(altstr);
1410 	free(altstr);
1411 
1412 	if ((fp = zonecfg_open_scratch(luroot, B_TRUE)) == NULL) {
1413 		zerror(zlogp, B_TRUE, "cannot open zone mapfile");
1414 		return (B_FALSE);
1415 	}
1416 	(void) ftruncate(fileno(fp), 0);
1417 	if (zonecfg_add_scratch(fp, zone_name, kernzone, "/") == -1) {
1418 		zerror(zlogp, B_TRUE, "cannot add zone mapfile entry");
1419 	}
1420 	zonecfg_close_scratch(fp);
1421 	(void) snprintf(tmp, sizeof (tmp), "%s/a", luroot);
1422 	if (domount(zlogp, MNTTYPE_LOFS, "", rootpath, tmp) != 0)
1423 		return (B_FALSE);
1424 	(void) strlcpy(rootpath, tmp, rootlen);
1425 	return (B_TRUE);
1426 }
1427 
1428 
1429 static boolean_t
1430 build_mounted_post_var(zlog_t *zlogp, zone_mnt_t mount_cmd, char *rootpath,
1431     const char *luroot)
1432 {
1433 	char tmp[MAXPATHLEN], fromdir[MAXPATHLEN];
1434 	const char **cpp;
1435 	const char **loopdirs;
1436 	const char **tmpdirs;
1437 	static const char *localdirs[] = {
1438 		"/etc", "/var", NULL
1439 	};
1440 	static const char *scr_loopdirs[] = {
1441 		"/etc/lib", "/etc/fs", "/lib", "/sbin", "/platform",
1442 		"/usr", NULL
1443 	};
1444 	static const char *upd_loopdirs[] = {
1445 		"/etc", "/kernel", "/lib", "/opt", "/platform", "/sbin",
1446 		"/usr", "/var", NULL
1447 	};
1448 	static const char *scr_tmpdirs[] = {
1449 		"/tmp", "/var/run", NULL
1450 	};
1451 	static const char *upd_tmpdirs[] = {
1452 		"/tmp", "/var/run", "/var/tmp", NULL
1453 	};
1454 	struct stat st;
1455 
1456 	if (mount_cmd == Z_MNT_SCRATCH) {
1457 		/*
1458 		 * These are mounted read-write from the zone undergoing
1459 		 * upgrade.  We must be careful not to 'leak' things from the
1460 		 * main system into the zone, and this accomplishes that goal.
1461 		 */
1462 		for (cpp = localdirs; *cpp != NULL; cpp++) {
1463 			(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot,
1464 			    *cpp);
1465 			(void) snprintf(fromdir, sizeof (fromdir), "%s%s",
1466 			    rootpath, *cpp);
1467 			if (mkdir(tmp, 0755) != 0) {
1468 				zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1469 				return (B_FALSE);
1470 			}
1471 			if (domount(zlogp, MNTTYPE_LOFS, "", fromdir, tmp)
1472 			    != 0) {
1473 				zerror(zlogp, B_TRUE, "cannot mount %s on %s",
1474 				    tmp, *cpp);
1475 				return (B_FALSE);
1476 			}
1477 		}
1478 	}
1479 
1480 	if (mount_cmd == Z_MNT_UPDATE)
1481 		loopdirs = upd_loopdirs;
1482 	else
1483 		loopdirs = scr_loopdirs;
1484 
1485 	/*
1486 	 * These are things mounted read-only from the running system because
1487 	 * they contain binaries that must match system.
1488 	 */
1489 	for (cpp = loopdirs; *cpp != NULL; cpp++) {
1490 		(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1491 		if (mkdir(tmp, 0755) != 0) {
1492 			if (errno != EEXIST) {
1493 				zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1494 				return (B_FALSE);
1495 			}
1496 			if (lstat(tmp, &st) != 0) {
1497 				zerror(zlogp, B_TRUE, "cannot stat %s", tmp);
1498 				return (B_FALSE);
1499 			}
1500 			/*
1501 			 * Ignore any non-directories encountered.  These are
1502 			 * things that have been converted into symlinks
1503 			 * (/etc/fs and /etc/lib) and no longer need a lofs
1504 			 * fixup.
1505 			 */
1506 			if (!S_ISDIR(st.st_mode))
1507 				continue;
1508 		}
1509 		if (domount(zlogp, MNTTYPE_LOFS, RESOURCE_DEFAULT_OPTS, *cpp,
1510 		    tmp) != 0) {
1511 			zerror(zlogp, B_TRUE, "cannot mount %s on %s", tmp,
1512 			    *cpp);
1513 			return (B_FALSE);
1514 		}
1515 	}
1516 
1517 	if (mount_cmd == Z_MNT_UPDATE)
1518 		tmpdirs = upd_tmpdirs;
1519 	else
1520 		tmpdirs = scr_tmpdirs;
1521 
1522 	/*
1523 	 * These are things with tmpfs mounted inside.
1524 	 */
1525 	for (cpp = tmpdirs; *cpp != NULL; cpp++) {
1526 		(void) snprintf(tmp, sizeof (tmp), "%s%s", luroot, *cpp);
1527 		if (mount_cmd == Z_MNT_SCRATCH && mkdir(tmp, 0755) != 0 &&
1528 		    errno != EEXIST) {
1529 			zerror(zlogp, B_TRUE, "cannot create %s", tmp);
1530 			return (B_FALSE);
1531 		}
1532 
1533 		/*
1534 		 * We could set the mode for /tmp when we do the mkdir but
1535 		 * since that can be modified by the umask we will just set
1536 		 * the correct mode for /tmp now.
1537 		 */
1538 		if (strcmp(*cpp, "/tmp") == 0 && chmod(tmp, 01777) != 0) {
1539 			zerror(zlogp, B_TRUE, "cannot chmod %s", tmp);
1540 			return (B_FALSE);
1541 		}
1542 
1543 		if (domount(zlogp, MNTTYPE_TMPFS, "", "swap", tmp) != 0) {
1544 			zerror(zlogp, B_TRUE, "cannot mount swap on %s", *cpp);
1545 			return (B_FALSE);
1546 		}
1547 	}
1548 	return (B_TRUE);
1549 }
1550 
1551 typedef struct plat_gmount_cb_data {
1552 	zlog_t			*pgcd_zlogp;
1553 	struct zone_fstab	**pgcd_fs_tab;
1554 	int			*pgcd_num_fs;
1555 } plat_gmount_cb_data_t;
1556 
1557 /*
1558  * plat_gmount_cb() is a callback function invoked by libbrand to iterate
1559  * through all global brand platform mounts.
1560  */
1561 int
1562 plat_gmount_cb(void *data, const char *spec, const char *dir,
1563     const char *fstype, const char *opt)
1564 {
1565 	plat_gmount_cb_data_t	*cp = data;
1566 	zlog_t			*zlogp = cp->pgcd_zlogp;
1567 	struct zone_fstab	*fs_ptr = *cp->pgcd_fs_tab;
1568 	int			num_fs = *cp->pgcd_num_fs;
1569 	struct zone_fstab	*fsp, *tmp_ptr;
1570 
1571 	num_fs++;
1572 	if ((tmp_ptr = realloc(fs_ptr, num_fs * sizeof (*tmp_ptr))) == NULL) {
1573 		zerror(zlogp, B_TRUE, "memory allocation failed");
1574 		return (-1);
1575 	}
1576 
1577 	fs_ptr = tmp_ptr;
1578 	fsp = &fs_ptr[num_fs - 1];
1579 
1580 	/* update the callback struct passed in */
1581 	*cp->pgcd_fs_tab = fs_ptr;
1582 	*cp->pgcd_num_fs = num_fs;
1583 
1584 	fsp->zone_fs_raw[0] = '\0';
1585 	(void) strlcpy(fsp->zone_fs_special, spec,
1586 	    sizeof (fsp->zone_fs_special));
1587 	(void) strlcpy(fsp->zone_fs_dir, dir, sizeof (fsp->zone_fs_dir));
1588 	(void) strlcpy(fsp->zone_fs_type, fstype, sizeof (fsp->zone_fs_type));
1589 	fsp->zone_fs_options = NULL;
1590 	if ((opt != NULL) &&
1591 	    (zonecfg_add_fs_option(fsp, (char *)opt) != Z_OK)) {
1592 		zerror(zlogp, B_FALSE, "error adding property");
1593 		return (-1);
1594 	}
1595 
1596 	return (0);
1597 }
1598 
1599 static int
1600 mount_filesystems_fsent(zone_dochandle_t handle, zlog_t *zlogp,
1601     struct zone_fstab **fs_tabp, int *num_fsp, zone_mnt_t mount_cmd)
1602 {
1603 	struct zone_fstab *tmp_ptr, *fs_ptr, *fsp, fstab;
1604 	int num_fs;
1605 
1606 	num_fs = *num_fsp;
1607 	fs_ptr = *fs_tabp;
1608 
1609 	if (zonecfg_setfsent(handle) != Z_OK) {
1610 		zerror(zlogp, B_FALSE, "invalid configuration");
1611 		return (-1);
1612 	}
1613 	while (zonecfg_getfsent(handle, &fstab) == Z_OK) {
1614 		/*
1615 		 * ZFS filesystems will not be accessible under an alternate
1616 		 * root, since the pool will not be known.  Ignore them in this
1617 		 * case.
1618 		 */
1619 		if (ALT_MOUNT(mount_cmd) &&
1620 		    strcmp(fstab.zone_fs_type, MNTTYPE_ZFS) == 0)
1621 			continue;
1622 
1623 		num_fs++;
1624 		if ((tmp_ptr = realloc(fs_ptr,
1625 		    num_fs * sizeof (*tmp_ptr))) == NULL) {
1626 			zerror(zlogp, B_TRUE, "memory allocation failed");
1627 			(void) zonecfg_endfsent(handle);
1628 			return (-1);
1629 		}
1630 		/* update the pointers passed in */
1631 		*fs_tabp = tmp_ptr;
1632 		*num_fsp = num_fs;
1633 
1634 		fs_ptr = tmp_ptr;
1635 		fsp = &fs_ptr[num_fs - 1];
1636 		(void) strlcpy(fsp->zone_fs_dir,
1637 		    fstab.zone_fs_dir, sizeof (fsp->zone_fs_dir));
1638 		(void) strlcpy(fsp->zone_fs_raw, fstab.zone_fs_raw,
1639 		    sizeof (fsp->zone_fs_raw));
1640 		(void) strlcpy(fsp->zone_fs_type, fstab.zone_fs_type,
1641 		    sizeof (fsp->zone_fs_type));
1642 		fsp->zone_fs_options = fstab.zone_fs_options;
1643 
1644 		/*
1645 		 * For all lofs mounts, make sure that the 'special'
1646 		 * entry points inside the alternate root.  The
1647 		 * source path for a lofs mount in a given zone needs
1648 		 * to be relative to the root of the boot environment
1649 		 * that contains the zone.  Note that we don't do this
1650 		 * for non-lofs mounts since they will have a device
1651 		 * as a backing store and device paths must always be
1652 		 * specified relative to the current boot environment.
1653 		 */
1654 		fsp->zone_fs_special[0] = '\0';
1655 		if (strcmp(fsp->zone_fs_type, MNTTYPE_LOFS) == 0) {
1656 			(void) strlcat(fsp->zone_fs_special, zonecfg_get_root(),
1657 			    sizeof (fsp->zone_fs_special));
1658 		}
1659 		(void) strlcat(fsp->zone_fs_special, fstab.zone_fs_special,
1660 		    sizeof (fsp->zone_fs_special));
1661 	}
1662 	(void) zonecfg_endfsent(handle);
1663 	return (0);
1664 }
1665 
1666 static int
1667 mount_filesystems(zlog_t *zlogp, zone_mnt_t mount_cmd)
1668 {
1669 	char rootpath[MAXPATHLEN];
1670 	char zonepath[MAXPATHLEN];
1671 	char brand[MAXNAMELEN];
1672 	char luroot[MAXPATHLEN];
1673 	int i, num_fs = 0;
1674 	struct zone_fstab *fs_ptr = NULL;
1675 	zone_dochandle_t handle = NULL;
1676 	zone_state_t zstate;
1677 	brand_handle_t bh;
1678 	plat_gmount_cb_data_t cb;
1679 
1680 	if (zone_get_state(zone_name, &zstate) != Z_OK ||
1681 	    (zstate != ZONE_STATE_READY && zstate != ZONE_STATE_MOUNTED)) {
1682 		zerror(zlogp, B_FALSE,
1683 		    "zone must be in '%s' or '%s' state to mount file-systems",
1684 		    zone_state_str(ZONE_STATE_READY),
1685 		    zone_state_str(ZONE_STATE_MOUNTED));
1686 		goto bad;
1687 	}
1688 
1689 	if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
1690 		zerror(zlogp, B_TRUE, "unable to determine zone path");
1691 		goto bad;
1692 	}
1693 
1694 	if (zone_get_rootpath(zone_name, rootpath, sizeof (rootpath)) != Z_OK) {
1695 		zerror(zlogp, B_TRUE, "unable to determine zone root");
1696 		goto bad;
1697 	}
1698 
1699 	if ((handle = zonecfg_init_handle()) == NULL) {
1700 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
1701 		goto bad;
1702 	}
1703 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK ||
1704 	    zonecfg_setfsent(handle) != Z_OK) {
1705 		zerror(zlogp, B_FALSE, "invalid configuration");
1706 		goto bad;
1707 	}
1708 
1709 	/*
1710 	 * If we are mounting the zone, then we must always use the default
1711 	 * brand global mounts.
1712 	 */
1713 	if (ALT_MOUNT(mount_cmd)) {
1714 		(void) strlcpy(brand, default_brand, sizeof (brand));
1715 	} else {
1716 		(void) strlcpy(brand, brand_name, sizeof (brand));
1717 	}
1718 
1719 	/* Get a handle to the brand info for this zone */
1720 	if ((bh = brand_open(brand)) == NULL) {
1721 		zerror(zlogp, B_FALSE, "unable to determine zone brand");
1722 		zonecfg_fini_handle(handle);
1723 		return (-1);
1724 	}
1725 
1726 	/*
1727 	 * Get the list of global filesystems to mount from the brand
1728 	 * configuration.
1729 	 */
1730 	cb.pgcd_zlogp = zlogp;
1731 	cb.pgcd_fs_tab = &fs_ptr;
1732 	cb.pgcd_num_fs = &num_fs;
1733 	if (brand_platform_iter_gmounts(bh, zonepath,
1734 	    plat_gmount_cb, &cb) != 0) {
1735 		zerror(zlogp, B_FALSE, "unable to mount filesystems");
1736 		brand_close(bh);
1737 		zonecfg_fini_handle(handle);
1738 		return (-1);
1739 	}
1740 	brand_close(bh);
1741 
1742 	/*
1743 	 * Iterate through the rest of the filesystems. Sort them all,
1744 	 * then mount them in sorted order. This is to make sure the
1745 	 * higher level directories (e.g., /usr) get mounted before
1746 	 * any beneath them (e.g., /usr/local).
1747 	 */
1748 	if (mount_filesystems_fsent(handle, zlogp, &fs_ptr, &num_fs,
1749 	    mount_cmd) != 0)
1750 		goto bad;
1751 
1752 	zonecfg_fini_handle(handle);
1753 	handle = NULL;
1754 
1755 	/*
1756 	 * Normally when we mount a zone all the zone filesystems
1757 	 * get mounted relative to rootpath, which is usually
1758 	 * <zonepath>/root.  But when mounting a zone for administration
1759 	 * purposes via the zone "mount" state, build_mounted_pre_var()
1760 	 * updates rootpath to be <zonepath>/lu/a so we'll mount all
1761 	 * the zones filesystems there instead.
1762 	 *
1763 	 * build_mounted_pre_var() and build_mounted_post_var() will
1764 	 * also do some extra work to create directories and lofs mount
1765 	 * a bunch of global zone file system paths into <zonepath>/lu.
1766 	 *
1767 	 * This allows us to be able to enter the zone (now rooted at
1768 	 * <zonepath>/lu) and run the upgrade/patch tools that are in the
1769 	 * global zone and have them upgrade the to-be-modified zone's
1770 	 * files mounted on /a.  (Which mirrors the existing standard
1771 	 * upgrade environment.)
1772 	 *
1773 	 * There is of course one catch.  When doing the upgrade
1774 	 * we need <zoneroot>/lu/dev to be the /dev filesystem
1775 	 * for the zone and we don't want to have any /dev filesystem
1776 	 * mounted at <zoneroot>/lu/a/dev.  Since /dev is specified
1777 	 * as a normal zone filesystem by default we'll try to mount
1778 	 * it at <zoneroot>/lu/a/dev, so we have to detect this
1779 	 * case and instead mount it at <zoneroot>/lu/dev.
1780 	 *
1781 	 * All this work is done in three phases:
1782 	 *   1) Create and populate lu directory (build_mounted_pre_var()).
1783 	 *   2) Mount the required filesystems as per the zone configuration.
1784 	 *   3) Set up the rest of the scratch zone environment
1785 	 *	(build_mounted_post_var()).
1786 	 */
1787 	if (ALT_MOUNT(mount_cmd) && !build_mounted_pre_var(zlogp,
1788 	    rootpath, sizeof (rootpath), zonepath, luroot, sizeof (luroot)))
1789 		goto bad;
1790 
1791 	qsort(fs_ptr, num_fs, sizeof (*fs_ptr), fs_compare);
1792 
1793 	for (i = 0; i < num_fs; i++) {
1794 		if (ALT_MOUNT(mount_cmd) &&
1795 		    strcmp(fs_ptr[i].zone_fs_dir, "/dev") == 0) {
1796 			size_t slen = strlen(rootpath) - 2;
1797 
1798 			/*
1799 			 * By default we'll try to mount /dev as /a/dev
1800 			 * but /dev is special and always goes at the top
1801 			 * so strip the trailing '/a' from the rootpath.
1802 			 */
1803 			assert(strcmp(&rootpath[slen], "/a") == 0);
1804 			rootpath[slen] = '\0';
1805 			if (mount_one(zlogp, &fs_ptr[i], rootpath, mount_cmd)
1806 			    != 0)
1807 				goto bad;
1808 			rootpath[slen] = '/';
1809 			continue;
1810 		}
1811 		if (mount_one(zlogp, &fs_ptr[i], rootpath, mount_cmd) != 0)
1812 			goto bad;
1813 	}
1814 	if (ALT_MOUNT(mount_cmd) &&
1815 	    !build_mounted_post_var(zlogp, mount_cmd, rootpath, luroot))
1816 		goto bad;
1817 
1818 	/*
1819 	 * For Trusted Extensions cross-mount each lower level /export/home
1820 	 */
1821 	if (mount_cmd == Z_MNT_BOOT &&
1822 	    tsol_mounts(zlogp, zone_name, rootpath) != 0)
1823 		goto bad;
1824 
1825 	free_fs_data(fs_ptr, num_fs);
1826 
1827 	/*
1828 	 * Everything looks fine.
1829 	 */
1830 	return (0);
1831 
1832 bad:
1833 	if (handle != NULL)
1834 		zonecfg_fini_handle(handle);
1835 	free_fs_data(fs_ptr, num_fs);
1836 	return (-1);
1837 }
1838 
/*
 * Turn a decimal prefix length string into a netmask.
 *
 * prefixstr is parsed with atoi().  A length outside 0..maxprefixlen
 * yields 1 and leaves maskstr alone.  Otherwise each whole byte of the
 * prefix is stored as 0xFF, and any remaining bits are OR-ed into the
 * high end of the following byte, so the caller has to supply a mask
 * buffer that is already zeroed.  Bytes past the prefix are not written.
 * Returns 0 on success.
 *
 * The caller makes sure neither pointer parameter is NULL.
 */
static int
addr2netmask(char *prefixstr, int maxprefixlen, uchar_t *maskstr)
{
	int nbits = atoi(prefixstr);
	int nbytes, rem;

	if (nbits < 0 || nbits > maxprefixlen)
		return (1);

	/* Whole bytes covered by the prefix. */
	for (nbytes = nbits / 8; nbytes > 0; nbytes--)
		*maskstr++ = 0xFF;

	/* Leftover high-order bits, if the prefix is not byte aligned. */
	rem = nbits % 8;
	if (rem != 0)
		*maskstr |= (uchar_t)(0xFF << (8 - rem));

	return (0);
}
1859 
1860 /*
1861  * Tear down all interfaces belonging to the given zone.  This should
1862  * be called with the zone in a state other than "running", so that
1863  * interfaces can't be assigned to the zone after this returns.
1864  *
1865  * If anything goes wrong, log an error message and return an error.
1866  */
1867 static int
1868 unconfigure_shared_network_interfaces(zlog_t *zlogp, zoneid_t zone_id)
1869 {
1870 	struct lifnum lifn;
1871 	struct lifconf lifc;
1872 	struct lifreq *lifrp, lifrl;
1873 	int64_t lifc_flags = LIFC_NOXMIT | LIFC_ALLZONES;
1874 	int num_ifs, s, i, ret_code = 0;
1875 	uint_t bufsize;
1876 	char *buf = NULL;
1877 
1878 	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
1879 		zerror(zlogp, B_TRUE, "could not get socket");
1880 		ret_code = -1;
1881 		goto bad;
1882 	}
1883 	lifn.lifn_family = AF_UNSPEC;
1884 	lifn.lifn_flags = (int)lifc_flags;
1885 	if (ioctl(s, SIOCGLIFNUM, (char *)&lifn) < 0) {
1886 		zerror(zlogp, B_TRUE,
1887 		    "could not determine number of network interfaces");
1888 		ret_code = -1;
1889 		goto bad;
1890 	}
1891 	num_ifs = lifn.lifn_count;
1892 	bufsize = num_ifs * sizeof (struct lifreq);
1893 	if ((buf = malloc(bufsize)) == NULL) {
1894 		zerror(zlogp, B_TRUE, "memory allocation failed");
1895 		ret_code = -1;
1896 		goto bad;
1897 	}
1898 	lifc.lifc_family = AF_UNSPEC;
1899 	lifc.lifc_flags = (int)lifc_flags;
1900 	lifc.lifc_len = bufsize;
1901 	lifc.lifc_buf = buf;
1902 	if (ioctl(s, SIOCGLIFCONF, (char *)&lifc) < 0) {
1903 		zerror(zlogp, B_TRUE, "could not get configured network "
1904 		    "interfaces");
1905 		ret_code = -1;
1906 		goto bad;
1907 	}
1908 	lifrp = lifc.lifc_req;
1909 	for (i = lifc.lifc_len / sizeof (struct lifreq); i > 0; i--, lifrp++) {
1910 		(void) close(s);
1911 		if ((s = socket(lifrp->lifr_addr.ss_family, SOCK_DGRAM, 0)) <
1912 		    0) {
1913 			zerror(zlogp, B_TRUE, "%s: could not get socket",
1914 			    lifrl.lifr_name);
1915 			ret_code = -1;
1916 			continue;
1917 		}
1918 		(void) memset(&lifrl, 0, sizeof (lifrl));
1919 		(void) strncpy(lifrl.lifr_name, lifrp->lifr_name,
1920 		    sizeof (lifrl.lifr_name));
1921 		if (ioctl(s, SIOCGLIFZONE, (caddr_t)&lifrl) < 0) {
1922 			if (errno == ENXIO)
1923 				/*
1924 				 * Interface may have been removed by admin or
1925 				 * another zone halting.
1926 				 */
1927 				continue;
1928 			zerror(zlogp, B_TRUE,
1929 			    "%s: could not determine the zone to which this "
1930 			    "network interface is bound", lifrl.lifr_name);
1931 			ret_code = -1;
1932 			continue;
1933 		}
1934 		if (lifrl.lifr_zoneid == zone_id) {
1935 			if (ioctl(s, SIOCLIFREMOVEIF, (caddr_t)&lifrl) < 0) {
1936 				zerror(zlogp, B_TRUE,
1937 				    "%s: could not remove network interface",
1938 				    lifrl.lifr_name);
1939 				ret_code = -1;
1940 				continue;
1941 			}
1942 		}
1943 	}
1944 bad:
1945 	if (s > 0)
1946 		(void) close(s);
1947 	if (buf)
1948 		free(buf);
1949 	return (ret_code);
1950 }
1951 
/*
 * Scratch socket addresses used by who_is_using() while it assembles its
 * routing socket request: so_dst holds the address being looked up and
 * so_ifp holds an AF_LINK entry.  Both are copied into rtmsg.space.
 */
static union	sockunion {
	struct	sockaddr sa;
	struct	sockaddr_in sin;
	struct	sockaddr_dl sdl;
	struct	sockaddr_in6 sin6;
} so_dst, so_ifp;
1958 
/*
 * Routing socket message buffer: the fixed header followed by space for
 * the socket addresses that accompany it.  who_is_using() writes its
 * RTM_GET request from this buffer and reads the reply back into it.
 */
static struct {
	struct	rt_msghdr hdr;
	char	space[512];
} rtmsg;
1963 
/*
 * Return the size, in bytes, of the concrete socket address structure that
 * matches sa's address family.  Families other than AF_INET, AF_INET6 and
 * AF_LINK are sized as a generic struct sockaddr.
 */
static int
salen(struct sockaddr *sa)
{
	sa_family_t family = sa->sa_family;

	if (family == AF_INET)
		return (sizeof (struct sockaddr_in));
	if (family == AF_INET6)
		return (sizeof (struct sockaddr_in6));
	if (family == AF_LINK)
		return (sizeof (struct sockaddr_dl));
	return (sizeof (struct sockaddr));
}
1978 
/*
 * Round a up to the next multiple of sizeof (long).  Values that are not
 * positive yield sizeof (long).  The argument is evaluated more than once,
 * so it must not have side effects.
 */
#define	ROUNDUP_LONG(a) \
	((a) <= 0 ? sizeof (long) : \
	(((a) + (sizeof (long) - 1)) & ~(sizeof (long) - 1)))
1981 
1982 /*
1983  * Look up which zone is using a given IP address.  The address in question
1984  * is expected to have been stuffed into the structure to which lifr points
1985  * via a previous SIOCGLIFADDR ioctl().
1986  *
1987  * This is done using black router socket magic.
1988  *
1989  * Return the name of the zone on success or NULL on failure.
1990  *
1991  * This is a lot of code for a simple task; a new ioctl request to take care
1992  * of this might be a useful RFE.
1993  */
1994 
1995 static char *
1996 who_is_using(zlog_t *zlogp, struct lifreq *lifr)
1997 {
1998 	static char answer[ZONENAME_MAX];
1999 	pid_t pid;
2000 	int s, rlen, l, i;
2001 	char *cp = rtmsg.space;
2002 	struct sockaddr_dl *ifp = NULL;
2003 	struct sockaddr *sa;
2004 	char save_if_name[LIFNAMSIZ];
2005 
2006 	answer[0] = '\0';
2007 
2008 	pid = getpid();
2009 	if ((s = socket(PF_ROUTE, SOCK_RAW, 0)) < 0) {
2010 		zerror(zlogp, B_TRUE, "could not get routing socket");
2011 		return (NULL);
2012 	}
2013 
2014 	if (lifr->lifr_addr.ss_family == AF_INET) {
2015 		struct sockaddr_in *sin4;
2016 
2017 		so_dst.sa.sa_family = AF_INET;
2018 		sin4 = (struct sockaddr_in *)&lifr->lifr_addr;
2019 		so_dst.sin.sin_addr = sin4->sin_addr;
2020 	} else {
2021 		struct sockaddr_in6 *sin6;
2022 
2023 		so_dst.sa.sa_family = AF_INET6;
2024 		sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr;
2025 		so_dst.sin6.sin6_addr = sin6->sin6_addr;
2026 	}
2027 
2028 	so_ifp.sa.sa_family = AF_LINK;
2029 
2030 	(void) memset(&rtmsg, 0, sizeof (rtmsg));
2031 	rtmsg.hdr.rtm_type = RTM_GET;
2032 	rtmsg.hdr.rtm_flags = RTF_UP | RTF_HOST;
2033 	rtmsg.hdr.rtm_version = RTM_VERSION;
2034 	rtmsg.hdr.rtm_seq = ++rts_seqno;
2035 	rtmsg.hdr.rtm_addrs = RTA_IFP | RTA_DST;
2036 
2037 	l = ROUNDUP_LONG(salen(&so_dst.sa));
2038 	(void) memmove(cp, &(so_dst), l);
2039 	cp += l;
2040 	l = ROUNDUP_LONG(salen(&so_ifp.sa));
2041 	(void) memmove(cp, &(so_ifp), l);
2042 	cp += l;
2043 
2044 	rtmsg.hdr.rtm_msglen = l = cp - (char *)&rtmsg;
2045 
2046 	if ((rlen = write(s, &rtmsg, l)) < 0) {
2047 		zerror(zlogp, B_TRUE, "writing to routing socket");
2048 		return (NULL);
2049 	} else if (rlen < (int)rtmsg.hdr.rtm_msglen) {
2050 		zerror(zlogp, B_TRUE,
2051 		    "write to routing socket got only %d for len\n", rlen);
2052 		return (NULL);
2053 	}
2054 	do {
2055 		l = read(s, &rtmsg, sizeof (rtmsg));
2056 	} while (l > 0 && (rtmsg.hdr.rtm_seq != rts_seqno ||
2057 	    rtmsg.hdr.rtm_pid != pid));
2058 	if (l < 0) {
2059 		zerror(zlogp, B_TRUE, "reading from routing socket");
2060 		return (NULL);
2061 	}
2062 
2063 	if (rtmsg.hdr.rtm_version != RTM_VERSION) {
2064 		zerror(zlogp, B_FALSE,
2065 		    "routing message version %d not understood",
2066 		    rtmsg.hdr.rtm_version);
2067 		return (NULL);
2068 	}
2069 	if (rtmsg.hdr.rtm_msglen != (ushort_t)l) {
2070 		zerror(zlogp, B_FALSE, "message length mismatch, "
2071 		    "expected %d bytes, returned %d bytes",
2072 		    rtmsg.hdr.rtm_msglen, l);
2073 		return (NULL);
2074 	}
2075 	if (rtmsg.hdr.rtm_errno != 0)  {
2076 		errno = rtmsg.hdr.rtm_errno;
2077 		zerror(zlogp, B_TRUE, "RTM_GET routing socket message");
2078 		return (NULL);
2079 	}
2080 	if ((rtmsg.hdr.rtm_addrs & RTA_IFP) == 0) {
2081 		zerror(zlogp, B_FALSE, "network interface not found");
2082 		return (NULL);
2083 	}
2084 	cp = ((char *)(&rtmsg.hdr + 1));
2085 	for (i = 1; i != 0; i <<= 1) {
2086 		/* LINTED E_BAD_PTR_CAST_ALIGN */
2087 		sa = (struct sockaddr *)cp;
2088 		if (i != RTA_IFP) {
2089 			if ((i & rtmsg.hdr.rtm_addrs) != 0)
2090 				cp += ROUNDUP_LONG(salen(sa));
2091 			continue;
2092 		}
2093 		if (sa->sa_family == AF_LINK &&
2094 		    ((struct sockaddr_dl *)sa)->sdl_nlen != 0)
2095 			ifp = (struct sockaddr_dl *)sa;
2096 		break;
2097 	}
2098 	if (ifp == NULL) {
2099 		zerror(zlogp, B_FALSE, "network interface could not be "
2100 		    "determined");
2101 		return (NULL);
2102 	}
2103 
2104 	/*
2105 	 * We need to set the I/F name to what we got above, then do the
2106 	 * appropriate ioctl to get its zone name.  But lifr->lifr_name is
2107 	 * used by the calling function to do a REMOVEIF, so if we leave the
2108 	 * "good" zone's I/F name in place, *that* I/F will be removed instead
2109 	 * of the bad one.  So we save the old (bad) I/F name before over-
2110 	 * writing it and doing the ioctl, then restore it after the ioctl.
2111 	 */
2112 	(void) strlcpy(save_if_name, lifr->lifr_name, sizeof (save_if_name));
2113 	(void) strncpy(lifr->lifr_name, ifp->sdl_data, ifp->sdl_nlen);
2114 	lifr->lifr_name[ifp->sdl_nlen] = '\0';
2115 	i = ioctl(s, SIOCGLIFZONE, lifr);
2116 	(void) strlcpy(lifr->lifr_name, save_if_name, sizeof (save_if_name));
2117 	if (i < 0) {
2118 		zerror(zlogp, B_TRUE,
2119 		    "%s: could not determine the zone network interface "
2120 		    "belongs to", lifr->lifr_name);
2121 		return (NULL);
2122 	}
2123 	if (getzonenamebyid(lifr->lifr_zoneid, answer, sizeof (answer)) < 0)
2124 		(void) snprintf(answer, sizeof (answer), "%d",
2125 		    lifr->lifr_zoneid);
2126 
2127 	if (strlen(answer) > 0)
2128 		return (answer);
2129 	return (NULL);
2130 }
2131 
/*
 * Configures a single interface: a new virtual interface is added, based on
 * the physical interface nwiftabptr->zone_nwif_physical, with the address
 * specified in nwiftabptr->zone_nwif_address, for zone zone_id.  Note that
 * the "address" can be an IPv6 address (with a /prefixlength required), an
 * IPv4 address (with a /prefixlength optional), or a name; for the latter,
 * an IPv4 name-to-address resolution will be attempted.
 *
 * If anything goes wrong, we log a detailed error message, attempt to tear
 * down whatever we set up and return an error.
 *
 * Returns Z_OK on success, and also when the logical interface could not
 * be added at all (that case is only logged to syslog and the interface is
 * skipped).  Returns -1 on any other failure.
 */
static int
configure_one_interface(zlog_t *zlogp, zoneid_t zone_id,
    struct zone_nwiftab *nwiftabptr)
{
	struct lifreq lifr;
	struct sockaddr_in netmask4;
	struct sockaddr_in6 netmask6;
	struct sockaddr_storage laddr;
	struct in_addr in4;
	sa_family_t af;
	/* Located before the address string is handed to the validator. */
	char *slashp = strchr(nwiftabptr->zone_nwif_address, '/');
	int s;
	boolean_t got_netmask = B_FALSE;
	boolean_t is_loopback = B_FALSE;
	char addrstr4[INET_ADDRSTRLEN];
	int res;

	/* On success the validator leaves the parsed address in lifr. */
	res = zonecfg_valid_net_address(nwiftabptr->zone_nwif_address, &lifr);
	if (res != Z_OK) {
		zerror(zlogp, B_FALSE, "%s: %s", zonecfg_strerror(res),
		    nwiftabptr->zone_nwif_address);
		return (-1);
	}
	af = lifr.lifr_addr.ss_family;
	if (af == AF_INET)
		in4 = ((struct sockaddr_in *)(&lifr.lifr_addr))->sin_addr;
	if ((s = socket(af, SOCK_DGRAM, 0)) < 0) {
		zerror(zlogp, B_TRUE, "could not get socket");
		return (-1);
	}

	/*
	 * This is a similar kind of "hack" like in addif() to get around
	 * the problem of SIOCLIFADDIF.  The problem is that this ioctl
	 * does not include the netmask when adding a logical interface.
	 * To get around this problem, we first add the logical interface
	 * with a 0 address.  After that, we set the netmask if provided.
	 * Finally we set the interface address.
	 */
	laddr = lifr.lifr_addr;
	(void) strlcpy(lifr.lifr_name, nwiftabptr->zone_nwif_physical,
	    sizeof (lifr.lifr_name));
	(void) memset(&lifr.lifr_addr, 0, sizeof (lifr.lifr_addr));

	if (ioctl(s, SIOCLIFADDIF, (caddr_t)&lifr) < 0) {
		/*
		 * Here, we know that the interface can't be brought up.
		 * A similar warning message was already printed out to
		 * the console by zoneadm(1M) so instead we log the
		 * message to syslog and continue.
		 */
		zerror(&logsys, B_TRUE, "WARNING: skipping network interface "
		    "'%s' which may not be present/plumbed in the "
		    "global zone.", lifr.lifr_name);
		(void) close(s);
		return (Z_OK);
	}

	/* Preserve literal IPv4 address for later potential printing. */
	if (af == AF_INET)
		(void) inet_ntop(AF_INET, &in4, addrstr4, INET_ADDRSTRLEN);

	lifr.lifr_zoneid = zone_id;
	if (ioctl(s, SIOCSLIFZONE, (caddr_t)&lifr) < 0) {
		zerror(zlogp, B_TRUE, "%s: could not place network interface "
		    "into zone", lifr.lifr_name);
		goto bad;
	}

	/*
	 * Loopback interface will use the default netmask assigned, if no
	 * netmask is found.
	 */
	if (strcmp(nwiftabptr->zone_nwif_physical, "lo0") == 0) {
		is_loopback = B_TRUE;
	}
	if (af == AF_INET) {
		/*
		 * The IPv4 netmask can be determined either
		 * directly if a prefix length was supplied with
		 * the address or via the netmasks database.  Not
		 * being able to determine it is a common failure,
		 * but it often is not fatal to operation of the
		 * interface.  In that case, a warning will be
		 * printed after the rest of the interface's
		 * parameters have been configured.
		 */
		(void) memset(&netmask4, 0, sizeof (netmask4));
		if (slashp != NULL) {
			if (addr2netmask(slashp + 1, V4_ADDR_LEN,
			    (uchar_t *)&netmask4.sin_addr) != 0) {
				/*
				 * Put the '/' back so that the logged
				 * address includes the prefix length.
				 */
				*slashp = '/';
				zerror(zlogp, B_FALSE,
				    "%s: invalid prefix length in %s",
				    lifr.lifr_name,
				    nwiftabptr->zone_nwif_address);
				goto bad;
			}
			got_netmask = B_TRUE;
		} else if (getnetmaskbyaddr(in4,
		    &netmask4.sin_addr) == 0) {
			got_netmask = B_TRUE;
		}
		if (got_netmask) {
			netmask4.sin_family = af;
			(void) memcpy(&lifr.lifr_addr, &netmask4,
			    sizeof (netmask4));
		}
	} else {
		/*
		 * NOTE(review): slashp is dereferenced unconditionally here;
		 * presumably the validator rejects IPv6 addresses that lack
		 * a prefix length - confirm.
		 */
		(void) memset(&netmask6, 0, sizeof (netmask6));
		if (addr2netmask(slashp + 1, V6_ADDR_LEN,
		    (uchar_t *)&netmask6.sin6_addr) != 0) {
			*slashp = '/';
			zerror(zlogp, B_FALSE,
			    "%s: invalid prefix length in %s",
			    lifr.lifr_name,
			    nwiftabptr->zone_nwif_address);
			goto bad;
		}
		got_netmask = B_TRUE;
		netmask6.sin6_family = af;
		(void) memcpy(&lifr.lifr_addr, &netmask6,
		    sizeof (netmask6));
	}
	if (got_netmask &&
	    ioctl(s, SIOCSLIFNETMASK, (caddr_t)&lifr) < 0) {
		zerror(zlogp, B_TRUE, "%s: could not set netmask",
		    lifr.lifr_name);
		goto bad;
	}

	/* Set the interface address */
	lifr.lifr_addr = laddr;
	if (ioctl(s, SIOCSLIFADDR, (caddr_t)&lifr) < 0) {
		zerror(zlogp, B_TRUE,
		    "%s: could not set IP address to %s",
		    lifr.lifr_name, nwiftabptr->zone_nwif_address);
		goto bad;
	}

	/* Read the current flags so that only IFF_UP is added to them. */
	if (ioctl(s, SIOCGLIFFLAGS, (caddr_t)&lifr) < 0) {
		zerror(zlogp, B_TRUE, "%s: could not get flags",
		    lifr.lifr_name);
		goto bad;
	}
	lifr.lifr_flags |= IFF_UP;
	if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0) {
		int save_errno = errno;
		char *zone_using;

		/*
		 * If we failed with something other than EADDRNOTAVAIL,
		 * then skip to the end.  Otherwise, look up our address,
		 * then call a function to determine which zone is already
		 * using that address.
		 */
		if (errno != EADDRNOTAVAIL) {
			zerror(zlogp, B_TRUE,
			    "%s: could not bring network interface up",
			    lifr.lifr_name);
			goto bad;
		}
		if (ioctl(s, SIOCGLIFADDR, (caddr_t)&lifr) < 0) {
			zerror(zlogp, B_TRUE, "%s: could not get address",
			    lifr.lifr_name);
			goto bad;
		}
		zone_using = who_is_using(zlogp, &lifr);
		/* Report the SIOCSLIFFLAGS failure, not a later errno. */
		errno = save_errno;
		if (zone_using == NULL)
			zerror(zlogp, B_TRUE,
			    "%s: could not bring network interface up",
			    lifr.lifr_name);
		else
			zerror(zlogp, B_TRUE, "%s: could not bring network "
			    "interface up: address in use by zone '%s'",
			    lifr.lifr_name, zone_using);
		goto bad;
	}

	/*
	 * got_netmask is always set on the IPv6 path above, so this warning
	 * can only be reached for IPv4, where addrstr4 has been filled in.
	 */
	if (!got_netmask && !is_loopback) {
		/*
		 * A common, but often non-fatal problem, is that the system
		 * cannot find the netmask for an interface address. This is
		 * often caused by it being only in /etc/inet/netmasks, but
		 * /etc/nsswitch.conf says to use NIS or NIS+ and it's not
		 * in that. This doesn't show up at boot because the netmask
		 * is obtained from /etc/inet/netmasks when no network
		 * interfaces are up, but isn't consulted when NIS/NIS+ is
		 * available. We warn the user here that something like this
		 * has happened and we're just running with a default and
		 * possible incorrect netmask.
		 */
		char buffer[INET6_ADDRSTRLEN];
		void  *addr;
		const char *nomatch = "no matching subnet found in netmasks(4)";

		if (af == AF_INET)
			addr = &((struct sockaddr_in *)
			    (&lifr.lifr_addr))->sin_addr;
		else
			addr = &((struct sockaddr_in6 *)
			    (&lifr.lifr_addr))->sin6_addr;

		/*
		 * Find out what netmask the interface is going to be using.
		 * If we just brought up an IPMP data address on an underlying
		 * interface above, the address will have already migrated, so
		 * the SIOCGLIFNETMASK won't be able to find it (but we need
		 * to bring the address up to get the actual netmask).  Just
		 * omit printing the actual netmask in this corner-case.
		 */
		if (ioctl(s, SIOCGLIFNETMASK, (caddr_t)&lifr) < 0 ||
		    inet_ntop(af, addr, buffer, sizeof (buffer)) == NULL) {
			zerror(zlogp, B_FALSE, "WARNING: %s; using default.",
			    nomatch);
		} else {
			zerror(zlogp, B_FALSE,
			    "WARNING: %s: %s: %s; using default of %s.",
			    lifr.lifr_name, nomatch, addrstr4, buffer);
		}
	}

	/*
	 * If a default router was specified for this interface
	 * set the route now. Ignore if already set.
	 */
	if (strlen(nwiftabptr->zone_nwif_defrouter) > 0) {
		int status;
		char *argv[7];

		argv[0] = "route";
		argv[1] = "add";
		argv[2] = "-ifp";
		argv[3] = nwiftabptr->zone_nwif_physical;
		argv[4] = "default";
		argv[5] = nwiftabptr->zone_nwif_defrouter;
		argv[6] = NULL;

		/* A failure here is only logged; it does not fail the call. */
		status = forkexec(zlogp, "/usr/sbin/route", argv);
		if (status != 0 && status != EEXIST)
			zerror(zlogp, B_FALSE, "Unable to set route for "
			    "interface %s to %s\n",
			    nwiftabptr->zone_nwif_physical,
			    nwiftabptr->zone_nwif_defrouter);
	}

	(void) close(s);
	return (Z_OK);
bad:
	/* Remove the logical interface added above and release the socket. */
	(void) ioctl(s, SIOCLIFREMOVEIF, (caddr_t)&lifr);
	(void) close(s);
	return (-1);
}
2397 
2398 /*
2399  * Sets up network interfaces based on information from the zone configuration.
2400  * IPv4 and IPv6 loopback interfaces are set up "for free", modeling the global
2401  * system.
2402  *
2403  * If anything goes wrong, we log a general error message, attempt to tear down
2404  * whatever we set up, and return an error.
2405  */
2406 static int
2407 configure_shared_network_interfaces(zlog_t *zlogp)
2408 {
2409 	zone_dochandle_t handle;
2410 	struct zone_nwiftab nwiftab, loopback_iftab;
2411 	zoneid_t zoneid;
2412 
2413 	if ((zoneid = getzoneidbyname(zone_name)) == ZONE_ID_UNDEFINED) {
2414 		zerror(zlogp, B_TRUE, "unable to get zoneid");
2415 		return (-1);
2416 	}
2417 
2418 	if ((handle = zonecfg_init_handle()) == NULL) {
2419 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2420 		return (-1);
2421 	}
2422 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2423 		zerror(zlogp, B_FALSE, "invalid configuration");
2424 		zonecfg_fini_handle(handle);
2425 		return (-1);
2426 	}
2427 	if (zonecfg_setnwifent(handle) == Z_OK) {
2428 		for (;;) {
2429 			if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
2430 				break;
2431 			if (configure_one_interface(zlogp, zoneid, &nwiftab) !=
2432 			    Z_OK) {
2433 				(void) zonecfg_endnwifent(handle);
2434 				zonecfg_fini_handle(handle);
2435 				return (-1);
2436 			}
2437 		}
2438 		(void) zonecfg_endnwifent(handle);
2439 	}
2440 	zonecfg_fini_handle(handle);
2441 	if (is_system_labeled()) {
2442 		/*
2443 		 * Labeled zones share the loopback interface
2444 		 * so it is not plumbed for shared stack instances.
2445 		 */
2446 		return (0);
2447 	}
2448 	(void) strlcpy(loopback_iftab.zone_nwif_physical, "lo0",
2449 	    sizeof (loopback_iftab.zone_nwif_physical));
2450 	(void) strlcpy(loopback_iftab.zone_nwif_address, "127.0.0.1",
2451 	    sizeof (loopback_iftab.zone_nwif_address));
2452 	loopback_iftab.zone_nwif_defrouter[0] = '\0';
2453 	if (configure_one_interface(zlogp, zoneid, &loopback_iftab) != Z_OK)
2454 		return (-1);
2455 
2456 	/* Always plumb up the IPv6 loopback interface. */
2457 	(void) strlcpy(loopback_iftab.zone_nwif_address, "::1/128",
2458 	    sizeof (loopback_iftab.zone_nwif_address));
2459 	if (configure_one_interface(zlogp, zoneid, &loopback_iftab) != Z_OK)
2460 		return (-1);
2461 	return (0);
2462 }
2463 
2464 static void
2465 zdlerror(zlog_t *zlogp, dladm_status_t err, const char *dlname, const char *str)
2466 {
2467 	char errmsg[DLADM_STRSIZE];
2468 
2469 	(void) dladm_status2str(err, errmsg);
2470 	zerror(zlogp, B_FALSE, "%s '%s': %s", str, dlname, errmsg);
2471 }
2472 
2473 static int
2474 add_datalink(zlog_t *zlogp, char *zone_name, datalink_id_t linkid, char *dlname)
2475 {
2476 	dladm_status_t err;
2477 	boolean_t cpuset, poolset;
2478 	char *poolp;
2479 
2480 	/* First check if it's in use by global zone. */
2481 	if (zonecfg_ifname_exists(AF_INET, dlname) ||
2482 	    zonecfg_ifname_exists(AF_INET6, dlname)) {
2483 		zerror(zlogp, B_FALSE, "WARNING: skipping network interface "
2484 		    "'%s' which is used in the global zone", dlname);
2485 		return (-1);
2486 	}
2487 
2488 	/* Set zoneid of this link. */
2489 	err = dladm_set_linkprop(dld_handle, linkid, "zone", &zone_name, 1,
2490 	    DLADM_OPT_ACTIVE);
2491 	if (err != DLADM_STATUS_OK) {
2492 		zdlerror(zlogp, err, dlname,
2493 		    "WARNING: unable to add network interface");
2494 		return (-1);
2495 	}
2496 
2497 	/*
2498 	 * Set the pool of this link if the zone has a pool and
2499 	 * neither the cpus nor the pool datalink property is
2500 	 * already set.
2501 	 */
2502 	err = dladm_linkprop_is_set(dld_handle, linkid, DLADM_PROP_VAL_CURRENT,
2503 	    "cpus", &cpuset);
2504 	if (err != DLADM_STATUS_OK) {
2505 		zdlerror(zlogp, err, dlname,
2506 		    "WARNING: unable to check if cpus link property is set");
2507 	}
2508 	err = dladm_linkprop_is_set(dld_handle, linkid, DLADM_PROP_VAL_CURRENT,
2509 	    "pool", &poolset);
2510 	if (err != DLADM_STATUS_OK) {
2511 		zdlerror(zlogp, err, dlname,
2512 		    "WARNING: unable to check if pool link property is set");
2513 	}
2514 
2515 	if ((strlen(pool_name) != 0) && !cpuset && !poolset) {
2516 		poolp = pool_name;
2517 		err = dladm_set_linkprop(dld_handle, linkid, "pool",
2518 		    &poolp, 1, DLADM_OPT_ACTIVE);
2519 		if (err != DLADM_STATUS_OK) {
2520 			zerror(zlogp, B_FALSE, "WARNING: unable to set "
2521 			    "pool %s to datalink %s", pool_name, dlname);
2522 			bzero(pool_name, sizeof (pool_name));
2523 		}
2524 	} else {
2525 		bzero(pool_name, sizeof (pool_name));
2526 	}
2527 	return (0);
2528 }
2529 
2530 static boolean_t
2531 sockaddr_to_str(sa_family_t af, const struct sockaddr *sockaddr,
2532     char *straddr, size_t len)
2533 {
2534 	struct sockaddr_in *sin;
2535 	struct sockaddr_in6 *sin6;
2536 	const char *str = NULL;
2537 
2538 	if (af == AF_INET) {
2539 		/* LINTED E_BAD_PTR_CAST_ALIGN */
2540 		sin = SIN(sockaddr);
2541 		str = inet_ntop(AF_INET, (void *)&sin->sin_addr, straddr, len);
2542 	} else if (af == AF_INET6) {
2543 		/* LINTED E_BAD_PTR_CAST_ALIGN */
2544 		sin6 = SIN6(sockaddr);
2545 		str = inet_ntop(AF_INET6, (void *)&sin6->sin6_addr, straddr,
2546 		    len);
2547 	}
2548 
2549 	return (str != NULL);
2550 }
2551 
/*
 * Return the prefix length to use for the IPv4 address in sin.
 *
 * The netmask found by getnetmaskbyaddr() is used when there is one.
 * Otherwise the length falls back to the address class: 8 for class A,
 * 16 for class B and 24 for class C.  Returns 0 when none of these apply.
 */
static int
ipv4_prefixlen(struct sockaddr_in *sin)
{
	struct sockaddr_in *m;
	struct sockaddr_storage mask;
	in_addr_t hostaddr;

	/* Do not hand mask2plen() a partly-initialized structure. */
	(void) memset(&mask, 0, sizeof (mask));
	m = SIN(&mask);
	m->sin_family = AF_INET;
	if (getnetmaskbyaddr(sin->sin_addr, &m->sin_addr) == 0)
		return (mask2plen((struct sockaddr *)&mask));

	/*
	 * The IN_CLASS*() tests are applied to the host byte order value,
	 * so convert once with ntohl() and use it for all three.
	 */
	hostaddr = ntohl(sin->sin_addr.s_addr);
	if (IN_CLASSA(hostaddr))
		return (8);
	if (IN_CLASSB(hostaddr))
		return (16);
	if (IN_CLASSC(hostaddr))
		return (24);
	return (0);
}
2571 
2572 static int
2573 zone_setattr_network(int type, zoneid_t zoneid, datalink_id_t linkid,
2574     void *buf, size_t bufsize)
2575 {
2576 	zone_net_data_t *zndata;
2577 	size_t znsize;
2578 	int err;
2579 
2580 	znsize = sizeof (*zndata) + bufsize;
2581 	zndata = calloc(1, znsize);
2582 	if (zndata == NULL)
2583 		return (ENOMEM);
2584 	zndata->zn_type = type;
2585 	zndata->zn_len = bufsize;
2586 	zndata->zn_linkid = linkid;
2587 	bcopy(buf, zndata->zn_val, zndata->zn_len);
2588 	err = zone_setattr(zoneid, ZONE_ATTR_NETWORK, zndata, znsize);
2589 	free(zndata);
2590 	return (err);
2591 }
2592 
2593 static int
2594 add_net_for_linkid(zlog_t *zlogp, zoneid_t zoneid, zone_addr_list_t *start)
2595 {
2596 	struct lifreq lifr;
2597 	char **astr, *address;
2598 	dladm_status_t dlstatus;
2599 	char *ip_nospoof = "ip-nospoof";
2600 	int nnet, naddr, err = 0, j;
2601 	size_t zlen, cpleft;
2602 	zone_addr_list_t *ptr, *end;
2603 	char  tmp[INET6_ADDRSTRLEN], *maskstr;
2604 	char *zaddr, *cp;
2605 	struct in6_addr *routes = NULL;
2606 	boolean_t is_set;
2607 	datalink_id_t linkid;
2608 
2609 	assert(start != NULL);
2610 	naddr = 0; /* number of addresses */
2611 	nnet = 0; /* number of net resources */
2612 	linkid = start->za_linkid;
2613 	for (ptr = start; ptr != NULL && ptr->za_linkid == linkid;
2614 	    ptr = ptr->za_next) {
2615 		nnet++;
2616 	}
2617 	end = ptr;
2618 	zlen = nnet * (INET6_ADDRSTRLEN + 1);
2619 	astr = calloc(1, nnet * sizeof (uintptr_t));
2620 	zaddr = calloc(1, zlen);
2621 	if (astr == NULL || zaddr == NULL) {
2622 		err = ENOMEM;
2623 		goto done;
2624 	}
2625 	cp = zaddr;
2626 	cpleft = zlen;
2627 	j = 0;
2628 	for (ptr = start; ptr != end; ptr = ptr->za_next) {
2629 		address = ptr->za_nwiftab.zone_nwif_allowed_address;
2630 		if (address[0] == '\0')
2631 			continue;
2632 		(void) snprintf(tmp, sizeof (tmp), "%s", address);
2633 		/*
2634 		 * Validate the data. zonecfg_valid_net_address() clobbers
2635 		 * the /<mask> in the address string.
2636 		 */
2637 		if (zonecfg_valid_net_address(address, &lifr) != Z_OK) {
2638 			zerror(zlogp, B_FALSE, "invalid address [%s]\n",
2639 			    address);
2640 			err = EINVAL;
2641 			goto done;
2642 		}
2643 		/*
2644 		 * convert any hostnames to numeric address strings.
2645 		 */
2646 		if (!sockaddr_to_str(lifr.lifr_addr.ss_family,
2647 		    (const struct sockaddr *)&lifr.lifr_addr, cp, cpleft)) {
2648 			err = EINVAL;
2649 			goto done;
2650 		}
2651 		/*
2652 		 * make a copy of the numeric string for the data needed
2653 		 * by the "allowed-ips" datalink property.
2654 		 */
2655 		astr[j] = strdup(cp);
2656 		if (astr[j] == NULL) {
2657 			err = ENOMEM;
2658 			goto done;
2659 		}
2660 		j++;
2661 		/*
2662 		 * compute the default netmask from the address, if necessary
2663 		 */
2664 		if ((maskstr = strchr(tmp, '/')) == NULL) {
2665 			int prefixlen;
2666 
2667 			if (lifr.lifr_addr.ss_family == AF_INET) {
2668 				prefixlen = ipv4_prefixlen(
2669 				    SIN(&lifr.lifr_addr));
2670 			} else {
2671 				struct sockaddr_in6 *sin6;
2672 
2673 				sin6 = SIN6(&lifr.lifr_addr);
2674 				if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
2675 					prefixlen = 10;
2676 				else
2677 					prefixlen = 64;
2678 			}
2679 			(void) snprintf(tmp, sizeof (tmp), "%d", prefixlen);
2680 			maskstr = tmp;
2681 		} else {
2682 			maskstr++;
2683 		}
2684 		/* append the "/<netmask>" */
2685 		(void) strlcat(cp, "/", cpleft);
2686 		(void) strlcat(cp, maskstr, cpleft);
2687 		(void) strlcat(cp, ",", cpleft);
2688 		cp += strnlen(cp, zlen);
2689 		cpleft = &zaddr[INET6_ADDRSTRLEN] - cp;
2690 	}
2691 	naddr = j; /* the actual number of addresses in the net resource */
2692 	assert(naddr <= nnet);
2693 
2694 	/*
2695 	 * zonecfg has already verified that the defrouter property can only
2696 	 * be set if there is at least one address defined for the net resource.
2697 	 * If j is 0, there are no addresses defined, and therefore no routers
2698 	 * to configure, and we are done at that point.
2699 	 */
2700 	if (j == 0)
2701 		goto done;
2702 
2703 	/* over-write last ',' with '\0' */
2704 	zaddr[strnlen(zaddr, zlen) + 1] = '\0';
2705 
2706 	/*
2707 	 * First make sure L3 protection is not already set on the link.
2708 	 */
2709 	dlstatus = dladm_linkprop_is_set(dld_handle, linkid, DLADM_OPT_ACTIVE,
2710 	    "protection", &is_set);
2711 	if (dlstatus != DLADM_STATUS_OK) {
2712 		err = EINVAL;
2713 		zerror(zlogp, B_FALSE, "unable to check if protection is set");
2714 		goto done;
2715 	}
2716 	if (is_set) {
2717 		err = EINVAL;
2718 		zerror(zlogp, B_FALSE, "Protection is already set");
2719 		goto done;
2720 	}
2721 	dlstatus = dladm_linkprop_is_set(dld_handle, linkid, DLADM_OPT_ACTIVE,
2722 	    "allowed-ips", &is_set);
2723 	if (dlstatus != DLADM_STATUS_OK) {
2724 		err = EINVAL;
2725 		zerror(zlogp, B_FALSE, "unable to check if allowed-ips is set");
2726 		goto done;
2727 	}
2728 	if (is_set) {
2729 		zerror(zlogp, B_FALSE, "allowed-ips is already set");
2730 		err = EINVAL;
2731 		goto done;
2732 	}
2733 
2734 	/*
2735 	 * Enable ip-nospoof for the link, and add address to the allowed-ips
2736 	 * list.
2737 	 */
2738 	dlstatus = dladm_set_linkprop(dld_handle, linkid, "protection",
2739 	    &ip_nospoof, 1, DLADM_OPT_ACTIVE);
2740 	if (dlstatus != DLADM_STATUS_OK) {
2741 		zerror(zlogp, B_FALSE, "could not set protection\n");
2742 		err = EINVAL;
2743 		goto done;
2744 	}
2745 	dlstatus = dladm_set_linkprop(dld_handle, linkid, "allowed-ips",
2746 	    astr, naddr, DLADM_OPT_ACTIVE);
2747 	if (dlstatus != DLADM_STATUS_OK) {
2748 		zerror(zlogp, B_FALSE, "could not set allowed-ips\n");
2749 		err = EINVAL;
2750 		goto done;
2751 	}
2752 
2753 	/* now set the address in the data-store */
2754 	err = zone_setattr_network(ZONE_NETWORK_ADDRESS, zoneid, linkid,
2755 	    zaddr, strnlen(zaddr, zlen) + 1);
2756 	if (err != 0)
2757 		goto done;
2758 
2759 	/*
2760 	 * add the defaultrouters
2761 	 */
2762 	routes = calloc(1, nnet * sizeof (*routes));
2763 	j = 0;
2764 	for (ptr = start; ptr != end; ptr = ptr->za_next) {
2765 		address = ptr->za_nwiftab.zone_nwif_defrouter;
2766 		if (address[0] == '\0')
2767 			continue;
2768 		if (strchr(address, '/') == NULL && strchr(address, ':') != 0) {
2769 			/*
2770 			 * zonecfg_valid_net_address() expects numeric IPv6
2771 			 * addresses to have a CIDR format netmask.
2772 			 */
2773 			(void) snprintf(tmp, sizeof (tmp), "/%d", V6_ADDR_LEN);
2774 			(void) strlcat(address, tmp, INET6_ADDRSTRLEN);
2775 		}
2776 		if (zonecfg_valid_net_address(address, &lifr) != Z_OK) {
2777 			zerror(zlogp, B_FALSE,
2778 			    "invalid router [%s]\n", address);
2779 			err = EINVAL;
2780 			goto done;
2781 		}
2782 		if (lifr.lifr_addr.ss_family == AF_INET6) {
2783 			routes[j] = SIN6(&lifr.lifr_addr)->sin6_addr;
2784 		} else {
2785 			IN6_INADDR_TO_V4MAPPED(&SIN(&lifr.lifr_addr)->sin_addr,
2786 			    &routes[j]);
2787 		}
2788 		j++;
2789 	}
2790 	assert(j <= nnet);
2791 	if (j > 0) {
2792 		err = zone_setattr_network(ZONE_NETWORK_DEFROUTER, zoneid,
2793 		    linkid, routes, j * sizeof (*routes));
2794 	}
2795 done:
2796 	free(routes);
2797 	for (j = 0; j < naddr; j++)
2798 		free(astr[j]);
2799 	free(astr);
2800 	free(zaddr);
2801 	return (err);
2802 
2803 }
2804 
2805 static int
2806 add_net(zlog_t *zlogp, zoneid_t zoneid, zone_addr_list_t *zalist)
2807 {
2808 	zone_addr_list_t *ptr;
2809 	datalink_id_t linkid;
2810 	int err;
2811 
2812 	if (zalist == NULL)
2813 		return (0);
2814 
2815 	linkid = zalist->za_linkid;
2816 
2817 	err = add_net_for_linkid(zlogp, zoneid, zalist);
2818 	if (err != 0)
2819 		return (err);
2820 
2821 	for (ptr = zalist; ptr != NULL; ptr = ptr->za_next) {
2822 		if (ptr->za_linkid == linkid)
2823 			continue;
2824 		linkid = ptr->za_linkid;
2825 		err = add_net_for_linkid(zlogp, zoneid, ptr);
2826 		if (err != 0)
2827 			return (err);
2828 	}
2829 	return (0);
2830 }
2831 
2832 /*
2833  * Add "new" to the list of network interfaces to be configured  by
2834  * add_net on zone boot in "old". The list of interfaces in "old" is
2835  * sorted by datalink_id_t, with interfaces sorted FIFO for a given
2836  * datalink_id_t.
2837  *
2838  * Returns the merged list of IP interfaces containing "old" and "new"
2839  */
2840 static zone_addr_list_t *
2841 add_ip_interface(zone_addr_list_t *old, zone_addr_list_t *new)
2842 {
2843 	zone_addr_list_t *ptr, *next;
2844 	datalink_id_t linkid = new->za_linkid;
2845 
2846 	assert(old != new);
2847 
2848 	if (old == NULL)
2849 		return (new);
2850 	for (ptr = old; ptr != NULL; ptr = ptr->za_next) {
2851 		if (ptr->za_linkid == linkid)
2852 			break;
2853 	}
2854 	if (ptr == NULL) {
2855 		/* linkid does not already exist, add to the beginning */
2856 		new->za_next = old;
2857 		return (new);
2858 	}
2859 	/*
2860 	 * adding to the middle of the list; ptr points at the first
2861 	 * occurrence of linkid. Find the last occurrence.
2862 	 */
2863 	while ((next = ptr->za_next) != NULL) {
2864 		if (next->za_linkid != linkid)
2865 			break;
2866 		ptr = next;
2867 	}
2868 	/* insert new after ptr */
2869 	new->za_next = next;
2870 	ptr->za_next = new;
2871 	return (old);
2872 }
2873 
2874 void
2875 free_ip_interface(zone_addr_list_t *zalist)
2876 {
2877 	zone_addr_list_t *ptr, *new;
2878 
2879 	for (ptr = zalist; ptr != NULL; ) {
2880 		new = ptr;
2881 		ptr = ptr->za_next;
2882 		free(new);
2883 	}
2884 }
2885 
2886 /*
2887  * Add the kernel access control information for the interface names.
2888  * If anything goes wrong, we log a general error message, attempt to tear down
2889  * whatever we set up, and return an error.
2890  */
2891 static int
2892 configure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
2893 {
2894 	zone_dochandle_t handle;
2895 	struct zone_nwiftab nwiftab;
2896 	char rootpath[MAXPATHLEN];
2897 	char path[MAXPATHLEN];
2898 	datalink_id_t linkid;
2899 	di_prof_t prof = NULL;
2900 	boolean_t added = B_FALSE;
2901 	zone_addr_list_t *zalist = NULL, *new;
2902 
2903 	if ((handle = zonecfg_init_handle()) == NULL) {
2904 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
2905 		return (-1);
2906 	}
2907 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
2908 		zerror(zlogp, B_FALSE, "invalid configuration");
2909 		zonecfg_fini_handle(handle);
2910 		return (-1);
2911 	}
2912 
2913 	if (zonecfg_setnwifent(handle) != Z_OK) {
2914 		zonecfg_fini_handle(handle);
2915 		return (0);
2916 	}
2917 
2918 	for (;;) {
2919 		if (zonecfg_getnwifent(handle, &nwiftab) != Z_OK)
2920 			break;
2921 
2922 		if (prof == NULL) {
2923 			if (zone_get_devroot(zone_name, rootpath,
2924 			    sizeof (rootpath)) != Z_OK) {
2925 				(void) zonecfg_endnwifent(handle);
2926 				zonecfg_fini_handle(handle);
2927 				zerror(zlogp, B_TRUE,
2928 				    "unable to determine dev root");
2929 				return (-1);
2930 			}
2931 			(void) snprintf(path, sizeof (path), "%s%s", rootpath,
2932 			    "/dev");
2933 			if (di_prof_init(path, &prof) != 0) {
2934 				(void) zonecfg_endnwifent(handle);
2935 				zonecfg_fini_handle(handle);
2936 				zerror(zlogp, B_TRUE,
2937 				    "failed to initialize profile");
2938 				return (-1);
2939 			}
2940 		}
2941 
2942 		/*
2943 		 * Create the /dev entry for backward compatibility.
2944 		 * Only create the /dev entry if it's not in use.
2945 		 * Note that the zone still boots when the assigned
2946 		 * interface is inaccessible, used by others, etc.
2947 		 * Also, when vanity naming is used, some interface do
2948 		 * do not have corresponding /dev node names (for example,
2949 		 * vanity named aggregations).  The /dev entry is not
2950 		 * created in that case.  The /dev/net entry is always
2951 		 * accessible.
2952 		 */
2953 		if (dladm_name2info(dld_handle, nwiftab.zone_nwif_physical,
2954 		    &linkid, NULL, NULL, NULL) == DLADM_STATUS_OK &&
2955 		    add_datalink(zlogp, zone_name, linkid,
2956 		    nwiftab.zone_nwif_physical) == 0) {
2957 			added = B_TRUE;
2958 		} else {
2959 			(void) zonecfg_endnwifent(handle);
2960 			zonecfg_fini_handle(handle);
2961 			zerror(zlogp, B_TRUE, "failed to add network device");
2962 			return (-1);
2963 		}
2964 		/* set up the new IP interface, and add them all later */
2965 		new = malloc(sizeof (*new));
2966 		if (new == NULL) {
2967 			zerror(zlogp, B_TRUE, "no memory for %s",
2968 			    nwiftab.zone_nwif_physical);
2969 			zonecfg_fini_handle(handle);
2970 			free_ip_interface(zalist);
2971 		}
2972 		bzero(new, sizeof (*new));
2973 		new->za_nwiftab = nwiftab;
2974 		new->za_linkid = linkid;
2975 		zalist = add_ip_interface(zalist, new);
2976 	}
2977 	if (zalist != NULL) {
2978 		if ((errno = add_net(zlogp, zoneid, zalist)) != 0) {
2979 			(void) zonecfg_endnwifent(handle);
2980 			zonecfg_fini_handle(handle);
2981 			zerror(zlogp, B_TRUE, "failed to add address");
2982 			free_ip_interface(zalist);
2983 			return (-1);
2984 		}
2985 		free_ip_interface(zalist);
2986 	}
2987 	(void) zonecfg_endnwifent(handle);
2988 	zonecfg_fini_handle(handle);
2989 
2990 	if (prof != NULL && added) {
2991 		if (di_prof_commit(prof) != 0) {
2992 			zerror(zlogp, B_TRUE, "failed to commit profile");
2993 			return (-1);
2994 		}
2995 	}
2996 	if (prof != NULL)
2997 		di_prof_fini(prof);
2998 
2999 	return (0);
3000 }
3001 
3002 static int
3003 remove_datalink_pool(zlog_t *zlogp, zoneid_t zoneid)
3004 {
3005 	ushort_t flags;
3006 	zone_iptype_t iptype;
3007 	int i, dlnum = 0;
3008 	datalink_id_t *dllink, *dllinks = NULL;
3009 	dladm_status_t err;
3010 
3011 	if (strlen(pool_name) == 0)
3012 		return (0);
3013 
3014 	if (zone_getattr(zoneid, ZONE_ATTR_FLAGS, &flags,
3015 	    sizeof (flags)) < 0) {
3016 		if (vplat_get_iptype(zlogp, &iptype) < 0) {
3017 			zerror(zlogp, B_FALSE, "unable to determine ip-type");
3018 			return (-1);
3019 		}
3020 	} else {
3021 		if (flags & ZF_NET_EXCL)
3022 			iptype = ZS_EXCLUSIVE;
3023 		else
3024 			iptype = ZS_SHARED;
3025 	}
3026 
3027 	if (iptype == ZS_EXCLUSIVE) {
3028 		/*
3029 		 * Get the datalink count and for each datalink,
3030 		 * attempt to clear the pool property and clear
3031 		 * the pool_name.
3032 		 */
3033 		if (zone_list_datalink(zoneid, &dlnum, NULL) != 0) {
3034 			zerror(zlogp, B_TRUE, "unable to count network "
3035 			    "interfaces");
3036 			return (-1);
3037 		}
3038 
3039 		if (dlnum == 0)
3040 			return (0);
3041 
3042 		if ((dllinks = malloc(dlnum * sizeof (datalink_id_t)))
3043 		    == NULL) {
3044 			zerror(zlogp, B_TRUE, "memory allocation failed");
3045 			return (-1);
3046 		}
3047 		if (zone_list_datalink(zoneid, &dlnum, dllinks) != 0) {
3048 			zerror(zlogp, B_TRUE, "unable to list network "
3049 			    "interfaces");
3050 			return (-1);
3051 		}
3052 
3053 		bzero(pool_name, sizeof (pool_name));
3054 		for (i = 0, dllink = dllinks; i < dlnum; i++, dllink++) {
3055 			err = dladm_set_linkprop(dld_handle, *dllink, "pool",
3056 			    NULL, 0, DLADM_OPT_ACTIVE);
3057 			if (err != DLADM_STATUS_OK) {
3058 				zerror(zlogp, B_TRUE,
3059 				    "WARNING: unable to clear pool");
3060 			}
3061 		}
3062 		free(dllinks);
3063 	}
3064 	return (0);
3065 }
3066 
3067 static int
3068 remove_datalink_protect(zlog_t *zlogp, zoneid_t zoneid)
3069 {
3070 	ushort_t flags;
3071 	zone_iptype_t iptype;
3072 	int i, dlnum = 0;
3073 	dladm_status_t dlstatus;
3074 	datalink_id_t *dllink, *dllinks = NULL;
3075 
3076 	if (zone_getattr(zoneid, ZONE_ATTR_FLAGS, &flags,
3077 	    sizeof (flags)) < 0) {
3078 		if (vplat_get_iptype(zlogp, &iptype) < 0) {
3079 			zerror(zlogp, B_FALSE, "unable to determine ip-type");
3080 			return (-1);
3081 		}
3082 	} else {
3083 		if (flags & ZF_NET_EXCL)
3084 			iptype = ZS_EXCLUSIVE;
3085 		else
3086 			iptype = ZS_SHARED;
3087 	}
3088 
3089 	if (iptype != ZS_EXCLUSIVE)
3090 		return (0);
3091 
3092 	/*
3093 	 * Get the datalink count and for each datalink,
3094 	 * attempt to clear the pool property and clear
3095 	 * the pool_name.
3096 	 */
3097 	if (zone_list_datalink(zoneid, &dlnum, NULL) != 0) {
3098 		zerror(zlogp, B_TRUE, "unable to count network interfaces");
3099 		return (-1);
3100 	}
3101 
3102 	if (dlnum == 0)
3103 		return (0);
3104 
3105 	if ((dllinks = malloc(dlnum * sizeof (datalink_id_t))) == NULL) {
3106 		zerror(zlogp, B_TRUE, "memory allocation failed");
3107 		return (-1);
3108 	}
3109 	if (zone_list_datalink(zoneid, &dlnum, dllinks) != 0) {
3110 		zerror(zlogp, B_TRUE, "unable to list network interfaces");
3111 		free(dllinks);
3112 		return (-1);
3113 	}
3114 
3115 	for (i = 0, dllink = dllinks; i < dlnum; i++, dllink++) {
3116 		char dlerr[DLADM_STRSIZE];
3117 
3118 		dlstatus = dladm_set_linkprop(dld_handle, *dllink,
3119 		    "protection", NULL, 0, DLADM_OPT_ACTIVE);
3120 		if (dlstatus == DLADM_STATUS_NOTFOUND) {
3121 			/* datalink does not belong to the GZ */
3122 			continue;
3123 		}
3124 		if (dlstatus != DLADM_STATUS_OK) {
3125 			zerror(zlogp, B_FALSE,
3126 			    dladm_status2str(dlstatus, dlerr));
3127 			free(dllinks);
3128 			return (-1);
3129 		}
3130 		dlstatus = dladm_set_linkprop(dld_handle, *dllink,
3131 		    "allowed-ips", NULL, 0, DLADM_OPT_ACTIVE);
3132 		if (dlstatus != DLADM_STATUS_OK) {
3133 			zerror(zlogp, B_FALSE,
3134 			    dladm_status2str(dlstatus, dlerr));
3135 			free(dllinks);
3136 			return (-1);
3137 		}
3138 	}
3139 	free(dllinks);
3140 	return (0);
3141 }
3142 
3143 static int
3144 unconfigure_exclusive_network_interfaces(zlog_t *zlogp, zoneid_t zoneid)
3145 {
3146 	int dlnum = 0;
3147 
3148 	/*
3149 	 * The kernel shutdown callback for the dls module should have removed
3150 	 * all datalinks from this zone.  If any remain, then there's a
3151 	 * problem.
3152 	 */
3153 	if (zone_list_datalink(zoneid, &dlnum, NULL) != 0) {
3154 		zerror(zlogp, B_TRUE, "unable to list network interfaces");
3155 		return (-1);
3156 	}
3157 	if (dlnum != 0) {
3158 		zerror(zlogp, B_FALSE,
3159 		    "datalinks remain in zone after shutdown");
3160 		return (-1);
3161 	}
3162 	return (0);
3163 }
3164 
3165 static int
3166 tcp_abort_conn(zlog_t *zlogp, zoneid_t zoneid,
3167     const struct sockaddr_storage *local, const struct sockaddr_storage *remote)
3168 {
3169 	int fd;
3170 	struct strioctl ioc;
3171 	tcp_ioc_abort_conn_t conn;
3172 	int error;
3173 
3174 	conn.ac_local = *local;
3175 	conn.ac_remote = *remote;
3176 	conn.ac_start = TCPS_SYN_SENT;
3177 	conn.ac_end = TCPS_TIME_WAIT;
3178 	conn.ac_zoneid = zoneid;
3179 
3180 	ioc.ic_cmd = TCP_IOC_ABORT_CONN;
3181 	ioc.ic_timout = -1; /* infinite timeout */
3182 	ioc.ic_len = sizeof (conn);
3183 	ioc.ic_dp = (char *)&conn;
3184 
3185 	if ((fd = open("/dev/tcp", O_RDONLY)) < 0) {
3186 		zerror(zlogp, B_TRUE, "unable to open %s", "/dev/tcp");
3187 		return (-1);
3188 	}
3189 
3190 	error = ioctl(fd, I_STR, &ioc);
3191 	(void) close(fd);
3192 	if (error == 0 || errno == ENOENT)	/* ENOENT is not an error */
3193 		return (0);
3194 	return (-1);
3195 }
3196 
3197 static int
3198 tcp_abort_connections(zlog_t *zlogp, zoneid_t zoneid)
3199 {
3200 	struct sockaddr_storage l, r;
3201 	struct sockaddr_in *local, *remote;
3202 	struct sockaddr_in6 *local6, *remote6;
3203 	int error;
3204 
3205 	/*
3206 	 * Abort IPv4 connections.
3207 	 */
3208 	bzero(&l, sizeof (*local));
3209 	local = (struct sockaddr_in *)&l;
3210 	local->sin_family = AF_INET;
3211 	local->sin_addr.s_addr = INADDR_ANY;
3212 	local->sin_port = 0;
3213 
3214 	bzero(&r, sizeof (*remote));
3215 	remote = (struct sockaddr_in *)&r;
3216 	remote->sin_family = AF_INET;
3217 	remote->sin_addr.s_addr = INADDR_ANY;
3218 	remote->sin_port = 0;
3219 
3220 	if ((error = tcp_abort_conn(zlogp, zoneid, &l, &r)) != 0)
3221 		return (error);
3222 
3223 	/*
3224 	 * Abort IPv6 connections.
3225 	 */
3226 	bzero(&l, sizeof (*local6));
3227 	local6 = (struct sockaddr_in6 *)&l;
3228 	local6->sin6_family = AF_INET6;
3229 	local6->sin6_port = 0;
3230 	local6->sin6_addr = in6addr_any;
3231 
3232 	bzero(&r, sizeof (*remote6));
3233 	remote6 = (struct sockaddr_in6 *)&r;
3234 	remote6->sin6_family = AF_INET6;
3235 	remote6->sin6_port = 0;
3236 	remote6->sin6_addr = in6addr_any;
3237 
3238 	if ((error = tcp_abort_conn(zlogp, zoneid, &l, &r)) != 0)
3239 		return (error);
3240 	return (0);
3241 }
3242 
3243 static int
3244 get_privset(zlog_t *zlogp, priv_set_t *privs, zone_mnt_t mount_cmd)
3245 {
3246 	int error = -1;
3247 	zone_dochandle_t handle;
3248 	char *privname = NULL;
3249 
3250 	if ((handle = zonecfg_init_handle()) == NULL) {
3251 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
3252 		return (-1);
3253 	}
3254 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
3255 		zerror(zlogp, B_FALSE, "invalid configuration");
3256 		zonecfg_fini_handle(handle);
3257 		return (-1);
3258 	}
3259 
3260 	if (ALT_MOUNT(mount_cmd)) {
3261 		zone_iptype_t	iptype;
3262 		const char	*curr_iptype;
3263 
3264 		if (zonecfg_get_iptype(handle, &iptype) != Z_OK) {
3265 			zerror(zlogp, B_TRUE, "unable to determine ip-type");
3266 			zonecfg_fini_handle(handle);
3267 			return (-1);
3268 		}
3269 
3270 		switch (iptype) {
3271 		case ZS_SHARED:
3272 			curr_iptype = "shared";
3273 			break;
3274 		case ZS_EXCLUSIVE:
3275 			curr_iptype = "exclusive";
3276 			break;
3277 		}
3278 
3279 		if (zonecfg_default_privset(privs, curr_iptype) == Z_OK) {
3280 			zonecfg_fini_handle(handle);
3281 			return (0);
3282 		}
3283 		zerror(zlogp, B_FALSE,
3284 		    "failed to determine the zone's default privilege set");
3285 		zonecfg_fini_handle(handle);
3286 		return (-1);
3287 	}
3288 
3289 	switch (zonecfg_get_privset(handle, privs, &privname)) {
3290 	case Z_OK:
3291 		error = 0;
3292 		break;
3293 	case Z_PRIV_PROHIBITED:
3294 		zerror(zlogp, B_FALSE, "privilege \"%s\" is not permitted "
3295 		    "within the zone's privilege set", privname);
3296 		break;
3297 	case Z_PRIV_REQUIRED:
3298 		zerror(zlogp, B_FALSE, "required privilege \"%s\" is missing "
3299 		    "from the zone's privilege set", privname);
3300 		break;
3301 	case Z_PRIV_UNKNOWN:
3302 		zerror(zlogp, B_FALSE, "unknown privilege \"%s\" specified "
3303 		    "in the zone's privilege set", privname);
3304 		break;
3305 	default:
3306 		zerror(zlogp, B_FALSE, "failed to determine the zone's "
3307 		    "privilege set");
3308 		break;
3309 	}
3310 
3311 	free(privname);
3312 	zonecfg_fini_handle(handle);
3313 	return (error);
3314 }
3315 
3316 static int
3317 get_rctls(zlog_t *zlogp, char **bufp, size_t *bufsizep)
3318 {
3319 	nvlist_t *nvl = NULL;
3320 	char *nvl_packed = NULL;
3321 	size_t nvl_size = 0;
3322 	nvlist_t **nvlv = NULL;
3323 	int rctlcount = 0;
3324 	int error = -1;
3325 	zone_dochandle_t handle;
3326 	struct zone_rctltab rctltab;
3327 	rctlblk_t *rctlblk = NULL;
3328 	uint64_t maxlwps;
3329 	uint64_t maxprocs;
3330 
3331 	*bufp = NULL;
3332 	*bufsizep = 0;
3333 
3334 	if ((handle = zonecfg_init_handle()) == NULL) {
3335 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
3336 		return (-1);
3337 	}
3338 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
3339 		zerror(zlogp, B_FALSE, "invalid configuration");
3340 		zonecfg_fini_handle(handle);
3341 		return (-1);
3342 	}
3343 
3344 	rctltab.zone_rctl_valptr = NULL;
3345 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) {
3346 		zerror(zlogp, B_TRUE, "%s failed", "nvlist_alloc");
3347 		goto out;
3348 	}
3349 
3350 	/*
3351 	 * Allow the administrator to control both the maximum number of
3352 	 * process table slots and the maximum number of lwps with just the
3353 	 * max-processes property.  If only the max-processes property is set,
3354 	 * we add a max-lwps property with a limit derived from max-processes.
3355 	 */
3356 	if (zonecfg_get_aliased_rctl(handle, ALIAS_MAXPROCS, &maxprocs)
3357 	    == Z_OK &&
3358 	    zonecfg_get_aliased_rctl(handle, ALIAS_MAXLWPS, &maxlwps)
3359 	    == Z_NO_ENTRY) {
3360 		if (zonecfg_set_aliased_rctl(handle, ALIAS_MAXLWPS,
3361 		    maxprocs * LWPS_PER_PROCESS) != Z_OK) {
3362 			zerror(zlogp, B_FALSE, "unable to set max-lwps alias");
3363 			goto out;
3364 		}
3365 	}
3366 
3367 	if (zonecfg_setrctlent(handle) != Z_OK) {
3368 		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setrctlent");
3369 		goto out;
3370 	}
3371 
3372 	if ((rctlblk = malloc(rctlblk_size())) == NULL) {
3373 		zerror(zlogp, B_TRUE, "memory allocation failed");
3374 		goto out;
3375 	}
3376 	while (zonecfg_getrctlent(handle, &rctltab) == Z_OK) {
3377 		struct zone_rctlvaltab *rctlval;
3378 		uint_t i, count;
3379 		const char *name = rctltab.zone_rctl_name;
3380 
3381 		/* zoneadm should have already warned about unknown rctls. */
3382 		if (!zonecfg_is_rctl(name)) {
3383 			zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr);
3384 			rctltab.zone_rctl_valptr = NULL;
3385 			continue;
3386 		}
3387 		count = 0;
3388 		for (rctlval = rctltab.zone_rctl_valptr; rctlval != NULL;
3389 		    rctlval = rctlval->zone_rctlval_next) {
3390 			count++;
3391 		}
3392 		if (count == 0) {	/* ignore */
3393 			continue;	/* Nothing to free */
3394 		}
3395 		if ((nvlv = malloc(sizeof (*nvlv) * count)) == NULL)
3396 			goto out;
3397 		i = 0;
3398 		for (rctlval = rctltab.zone_rctl_valptr; rctlval != NULL;
3399 		    rctlval = rctlval->zone_rctlval_next, i++) {
3400 			if (nvlist_alloc(&nvlv[i], NV_UNIQUE_NAME, 0) != 0) {
3401 				zerror(zlogp, B_TRUE, "%s failed",
3402 				    "nvlist_alloc");
3403 				goto out;
3404 			}
3405 			if (zonecfg_construct_rctlblk(rctlval, rctlblk)
3406 			    != Z_OK) {
3407 				zerror(zlogp, B_FALSE, "invalid rctl value: "
3408 				    "(priv=%s,limit=%s,action=%s)",
3409 				    rctlval->zone_rctlval_priv,
3410 				    rctlval->zone_rctlval_limit,
3411 				    rctlval->zone_rctlval_action);
3412 				goto out;
3413 			}
3414 			if (!zonecfg_valid_rctl(name, rctlblk)) {
3415 				zerror(zlogp, B_FALSE,
3416 				    "(priv=%s,limit=%s,action=%s) is not a "
3417 				    "valid value for rctl '%s'",
3418 				    rctlval->zone_rctlval_priv,
3419 				    rctlval->zone_rctlval_limit,
3420 				    rctlval->zone_rctlval_action,
3421 				    name);
3422 				goto out;
3423 			}
3424 			if (nvlist_add_uint64(nvlv[i], "privilege",
3425 			    rctlblk_get_privilege(rctlblk)) != 0) {
3426 				zerror(zlogp, B_FALSE, "%s failed",
3427 				    "nvlist_add_uint64");
3428 				goto out;
3429 			}
3430 			if (nvlist_add_uint64(nvlv[i], "limit",
3431 			    rctlblk_get_value(rctlblk)) != 0) {
3432 				zerror(zlogp, B_FALSE, "%s failed",
3433 				    "nvlist_add_uint64");
3434 				goto out;
3435 			}
3436 			if (nvlist_add_uint64(nvlv[i], "action",
3437 			    (uint_t)rctlblk_get_local_action(rctlblk, NULL))
3438 			    != 0) {
3439 				zerror(zlogp, B_FALSE, "%s failed",
3440 				    "nvlist_add_uint64");
3441 				goto out;
3442 			}
3443 		}
3444 		zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr);
3445 		rctltab.zone_rctl_valptr = NULL;
3446 		if (nvlist_add_nvlist_array(nvl, (char *)name, nvlv, count)
3447 		    != 0) {
3448 			zerror(zlogp, B_FALSE, "%s failed",
3449 			    "nvlist_add_nvlist_array");
3450 			goto out;
3451 		}
3452 		for (i = 0; i < count; i++)
3453 			nvlist_free(nvlv[i]);
3454 		free(nvlv);
3455 		nvlv = NULL;
3456 		rctlcount++;
3457 	}
3458 	(void) zonecfg_endrctlent(handle);
3459 
3460 	if (rctlcount == 0) {
3461 		error = 0;
3462 		goto out;
3463 	}
3464 	if (nvlist_pack(nvl, &nvl_packed, &nvl_size, NV_ENCODE_NATIVE, 0)
3465 	    != 0) {
3466 		zerror(zlogp, B_FALSE, "%s failed", "nvlist_pack");
3467 		goto out;
3468 	}
3469 
3470 	error = 0;
3471 	*bufp = nvl_packed;
3472 	*bufsizep = nvl_size;
3473 
3474 out:
3475 	free(rctlblk);
3476 	zonecfg_free_rctl_value_list(rctltab.zone_rctl_valptr);
3477 	if (error && nvl_packed != NULL)
3478 		free(nvl_packed);
3479 	if (nvl != NULL)
3480 		nvlist_free(nvl);
3481 	if (nvlv != NULL)
3482 		free(nvlv);
3483 	if (handle != NULL)
3484 		zonecfg_fini_handle(handle);
3485 	return (error);
3486 }
3487 
3488 static int
3489 get_implicit_datasets(zlog_t *zlogp, char **retstr)
3490 {
3491 	char cmdbuf[2 * MAXPATHLEN];
3492 
3493 	if (query_hook[0] == '\0')
3494 		return (0);
3495 
3496 	if (snprintf(cmdbuf, sizeof (cmdbuf), "%s datasets", query_hook)
3497 	    > sizeof (cmdbuf))
3498 		return (-1);
3499 
3500 	if (do_subproc(zlogp, cmdbuf, retstr) != 0)
3501 		return (-1);
3502 
3503 	return (0);
3504 }
3505 
3506 static int
3507 get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
3508 {
3509 	zone_dochandle_t handle;
3510 	struct zone_dstab dstab;
3511 	size_t total, offset, len;
3512 	int error = -1;
3513 	char *str = NULL;
3514 	char *implicit_datasets = NULL;
3515 	int implicit_len = 0;
3516 
3517 	*bufp = NULL;
3518 	*bufsizep = 0;
3519 
3520 	if ((handle = zonecfg_init_handle()) == NULL) {
3521 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
3522 		return (-1);
3523 	}
3524 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
3525 		zerror(zlogp, B_FALSE, "invalid configuration");
3526 		zonecfg_fini_handle(handle);
3527 		return (-1);
3528 	}
3529 
3530 	if (get_implicit_datasets(zlogp, &implicit_datasets) != 0) {
3531 		zerror(zlogp, B_FALSE, "getting implicit datasets failed");
3532 		goto out;
3533 	}
3534 
3535 	if (zonecfg_setdsent(handle) != Z_OK) {
3536 		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent");
3537 		goto out;
3538 	}
3539 
3540 	total = 0;
3541 	while (zonecfg_getdsent(handle, &dstab) == Z_OK)
3542 		total += strlen(dstab.zone_dataset_name) + 1;
3543 	(void) zonecfg_enddsent(handle);
3544 
3545 	if (implicit_datasets != NULL)
3546 		implicit_len = strlen(implicit_datasets);
3547 	if (implicit_len > 0)
3548 		total += implicit_len + 1;
3549 
3550 	if (total == 0) {
3551 		error = 0;
3552 		goto out;
3553 	}
3554 
3555 	if ((str = malloc(total)) == NULL) {
3556 		zerror(zlogp, B_TRUE, "memory allocation failed");
3557 		goto out;
3558 	}
3559 
3560 	if (zonecfg_setdsent(handle) != Z_OK) {
3561 		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent");
3562 		goto out;
3563 	}
3564 	offset = 0;
3565 	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
3566 		len = strlen(dstab.zone_dataset_name);
3567 		(void) strlcpy(str + offset, dstab.zone_dataset_name,
3568 		    total - offset);
3569 		offset += len;
3570 		if (offset < total - 1)
3571 			str[offset++] = ',';
3572 	}
3573 	(void) zonecfg_enddsent(handle);
3574 
3575 	if (implicit_len > 0)
3576 		(void) strlcpy(str + offset, implicit_datasets, total - offset);
3577 
3578 	error = 0;
3579 	*bufp = str;
3580 	*bufsizep = total;
3581 
3582 out:
3583 	if (error != 0 && str != NULL)
3584 		free(str);
3585 	if (handle != NULL)
3586 		zonecfg_fini_handle(handle);
3587 	if (implicit_datasets != NULL)
3588 		free(implicit_datasets);
3589 
3590 	return (error);
3591 }
3592 
3593 static int
3594 validate_datasets(zlog_t *zlogp)
3595 {
3596 	zone_dochandle_t handle;
3597 	struct zone_dstab dstab;
3598 	zfs_handle_t *zhp;
3599 	libzfs_handle_t *hdl;
3600 
3601 	if ((handle = zonecfg_init_handle()) == NULL) {
3602 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
3603 		return (-1);
3604 	}
3605 	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
3606 		zerror(zlogp, B_FALSE, "invalid configuration");
3607 		zonecfg_fini_handle(handle);
3608 		return (-1);
3609 	}
3610 
3611 	if (zonecfg_setdsent(handle) != Z_OK) {
3612 		zerror(zlogp, B_FALSE, "invalid configuration");
3613 		zonecfg_fini_handle(handle);
3614 		return (-1);
3615 	}
3616 
3617 	if ((hdl = libzfs_init()) == NULL) {
3618 		zerror(zlogp, B_FALSE, "opening ZFS library");
3619 		zonecfg_fini_handle(handle);
3620 		return (-1);
3621 	}
3622 
3623 	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
3624 
3625 		if ((zhp = zfs_open(hdl, dstab.zone_dataset_name,
3626 		    ZFS_TYPE_FILESYSTEM)) == NULL) {
3627 			zerror(zlogp, B_FALSE, "cannot open ZFS dataset '%s'",
3628 			    dstab.zone_dataset_name);
3629 			zonecfg_fini_handle(handle);
3630 			libzfs_fini(hdl);
3631 			return (-1);
3632 		}
3633 
3634 		/*
3635 		 * Automatically set the 'zoned' property.  We check the value
3636 		 * first because we'll get EPERM if it is already set.
3637 		 */
3638 		if (!zfs_prop_get_int(zhp, ZFS_PROP_ZONED) &&
3639 		    zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_ZONED),
3640 		    "on") != 0) {
3641 			zerror(zlogp, B_FALSE, "cannot set 'zoned' "
3642 			    "property for ZFS dataset '%s'\n",
3643 			    dstab.zone_dataset_name);
3644 			zonecfg_fini_handle(handle);
3645 			zfs_close(zhp);
3646 			libzfs_fini(hdl);
3647 			return (-1);
3648 		}
3649 
3650 		zfs_close(zhp);
3651 	}
3652 	(void) zonecfg_enddsent(handle);
3653 
3654 	zonecfg_fini_handle(handle);
3655 	libzfs_fini(hdl);
3656 
3657 	return (0);
3658 }
3659 
3660 /*
3661  * Return true if the path is its own zfs file system.  We determine this
3662  * by stat-ing the path to see if it is zfs and stat-ing the parent to see
3663  * if it is a different fs.
3664  */
3665 boolean_t
3666 is_zonepath_zfs(char *zonepath)
3667 {
3668 	int res;
3669 	char *path;
3670 	char *parent;
3671 	struct statvfs64 buf1, buf2;
3672 
3673 	if (statvfs64(zonepath, &buf1) != 0)
3674 		return (B_FALSE);
3675 
3676 	if (strcmp(buf1.f_basetype, "zfs") != 0)
3677 		return (B_FALSE);
3678 
3679 	if ((path = strdup(zonepath)) == NULL)
3680 		return (B_FALSE);
3681 
3682 	parent = dirname(path);
3683 	res = statvfs64(parent, &buf2);
3684 	free(path);
3685 
3686 	if (res != 0)
3687 		return (B_FALSE);
3688 
3689 	if (buf1.f_fsid == buf2.f_fsid)
3690 		return (B_FALSE);
3691 
3692 	return (B_TRUE);
3693 }
3694 
3695 /*
3696  * Verify the MAC label in the root dataset for the zone.
3697  * If the label exists, it must match the label configured for the zone.
3698  * Otherwise if there's no label on the dataset, create one here.
3699  */
3700 
3701 static int
3702 validate_rootds_label(zlog_t *zlogp, char *rootpath, m_label_t *zone_sl)
3703 {
3704 	int		error = -1;
3705 	zfs_handle_t	*zhp;
3706 	libzfs_handle_t	*hdl;
3707 	m_label_t	ds_sl;
3708 	char		zonepath[MAXPATHLEN];
3709 	char		ds_hexsl[MAXNAMELEN];
3710 
3711 	if (!is_system_labeled())
3712 		return (0);
3713 
3714 	if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
3715 		zerror(zlogp, B_TRUE, "unable to determine zone path");
3716 		return (-1);
3717 	}
3718 
3719 	if (!is_zonepath_zfs(zonepath))
3720 		return (0);
3721 
3722 	if ((hdl = libzfs_init()) == NULL) {
3723 		zerror(zlogp, B_FALSE, "opening ZFS library");
3724 		return (-1);
3725 	}
3726 
3727 	if ((zhp = zfs_path_to_zhandle(hdl, rootpath,
3728 	    ZFS_TYPE_FILESYSTEM)) == NULL) {
3729 		zerror(zlogp, B_FALSE, "cannot open ZFS dataset for path '%s'",
3730 		    rootpath);
3731 		libzfs_fini(hdl);
3732 		return (-1);
3733 	}
3734 
3735 	/* Get the mlslabel property if it exists. */
3736 	if ((zfs_prop_get(zhp, ZFS_PROP_MLSLABEL, ds_hexsl, MAXNAMELEN,
3737 	    NULL, NULL, 0, B_TRUE) != 0) ||
3738 	    (strcmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)) {
3739 		char		*str2 = NULL;
3740 
3741 		/*
3742 		 * No label on the dataset (or default only); create one.
3743 		 * (Only do this automatic labeling for the labeled brand.)
3744 		 */
3745 		if (strcmp(brand_name, LABELED_BRAND_NAME) != 0) {
3746 			error = 0;
3747 			goto out;
3748 		}
3749 
3750 		error = l_to_str_internal(zone_sl, &str2);
3751 		if (error)
3752 			goto out;
3753 		if (str2 == NULL) {
3754 			error = -1;
3755 			goto out;
3756 		}
3757 		if ((error = zfs_prop_set(zhp,
3758 		    zfs_prop_to_name(ZFS_PROP_MLSLABEL), str2)) != 0) {
3759 			zerror(zlogp, B_FALSE, "cannot set 'mlslabel' "
3760 			    "property for root dataset at '%s'\n", rootpath);
3761 		}
3762 		free(str2);
3763 		goto out;
3764 	}
3765 
3766 	/* Convert the retrieved dataset label to binary form. */
3767 	error = hexstr_to_label(ds_hexsl, &ds_sl);
3768 	if (error) {
3769 		zerror(zlogp, B_FALSE, "invalid 'mlslabel' "
3770 		    "property on root dataset at '%s'\n", rootpath);
3771 		goto out;			/* exit with error */
3772 	}
3773 
3774 	/*
3775 	 * Perform a MAC check by comparing the zone label with the
3776 	 * dataset label.
3777 	 */
3778 	error = (!blequal(zone_sl, &ds_sl));
3779 	if (error)
3780 		zerror(zlogp, B_FALSE, "Rootpath dataset has mismatched label");
3781 out:
3782 	zfs_close(zhp);
3783 	libzfs_fini(hdl);
3784 
3785 	return (error);
3786 }
3787 
3788 /*
3789  * Mount lower level home directories into/from current zone
3790  * Share exported directories specified in dfstab for zone
3791  */
3792 static int
3793 tsol_mounts(zlog_t *zlogp, char *zone_name, char *rootpath)
3794 {
3795 	zoneid_t *zids = NULL;
3796 	priv_set_t *zid_privs;
3797 	const priv_impl_info_t *ip = NULL;
3798 	uint_t nzents_saved;
3799 	uint_t nzents;
3800 	int i;
3801 	char readonly[] = "ro";
3802 	struct zone_fstab lower_fstab;
3803 	char *argv[4];
3804 
3805 	if (!is_system_labeled())
3806 		return (0);
3807 
3808 	if (zid_label == NULL) {
3809 		zid_label = m_label_alloc(MAC_LABEL);
3810 		if (zid_label == NULL)
3811 			return (-1);
3812 	}
3813 
3814 	/* Make sure our zone has an /export/home dir */
3815 	(void) make_one_dir(zlogp, rootpath, "/export/home",
3816 	    DEFAULT_DIR_MODE, DEFAULT_DIR_USER, DEFAULT_DIR_GROUP);
3817 
3818 	lower_fstab.zone_fs_raw[0] = '\0';
3819 	(void) strlcpy(lower_fstab.zone_fs_type, MNTTYPE_LOFS,
3820 	    sizeof (lower_fstab.zone_fs_type));
3821 	lower_fstab.zone_fs_options = NULL;
3822 	(void) zonecfg_add_fs_option(&lower_fstab, readonly);
3823 
3824 	/*
3825 	 * Get the list of zones from the kernel
3826 	 */
3827 	if (zone_list(NULL, &nzents) != 0) {
3828 		zerror(zlogp, B_TRUE, "unable to list zones");
3829 		zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
3830 		return (-1);
3831 	}
3832 again:
3833 	if (nzents == 0) {
3834 		zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
3835 		return (-1);
3836 	}
3837 
3838 	zids = malloc(nzents * sizeof (zoneid_t));
3839 	if (zids == NULL) {
3840 		zerror(zlogp, B_TRUE, "memory allocation failed");
3841 		return (-1);
3842 	}
3843 	nzents_saved = nzents;
3844 
3845 	if (zone_list(zids, &nzents) != 0) {
3846 		zerror(zlogp, B_TRUE, "unable to list zones");
3847 		zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
3848 		free(zids);
3849 		return (-1);
3850 	}
3851 	if (nzents != nzents_saved) {
3852 		/* list changed, try again */
3853 		free(zids);
3854 		goto again;
3855 	}
3856 
3857 	ip = getprivimplinfo();
3858 	if ((zid_privs = priv_allocset()) == NULL) {
3859 		zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
3860 		zonecfg_free_fs_option_list(
3861 		    lower_fstab.zone_fs_options);
3862 		free(zids);
3863 		return (-1);
3864 	}
3865 
3866 	for (i = 0; i < nzents; i++) {
3867 		char zid_name[ZONENAME_MAX];
3868 		zone_state_t zid_state;
3869 		char zid_rpath[MAXPATHLEN];
3870 		struct stat stat_buf;
3871 
3872 		if (zids[i] == GLOBAL_ZONEID)
3873 			continue;
3874 
3875 		if (getzonenamebyid(zids[i], zid_name, ZONENAME_MAX) == -1)
3876 			continue;
3877 
3878 		/*
3879 		 * Do special setup for the zone we are booting
3880 		 */
3881 		if (strcmp(zid_name, zone_name) == 0) {
3882 			struct zone_fstab autofs_fstab;
3883 			char map_path[MAXPATHLEN];
3884 			int fd;
3885 
3886 			/*
3887 			 * Create auto_home_<zone> map for this zone
3888 			 * in the global zone. The non-global zone entry
3889 			 * will be created by automount when the zone
3890 			 * is booted.
3891 			 */
3892 
3893 			(void) snprintf(autofs_fstab.zone_fs_special,
3894 			    MAXPATHLEN, "auto_home_%s", zid_name);
3895 
3896 			(void) snprintf(autofs_fstab.zone_fs_dir, MAXPATHLEN,
3897 			    "/zone/%s/home", zid_name);
3898 
3899 			(void) snprintf(map_path, sizeof (map_path),
3900 			    "/etc/%s", autofs_fstab.zone_fs_special);
3901 			/*
3902 			 * If the map file doesn't exist create a template
3903 			 */
3904 			if ((fd = open(map_path, O_RDWR | O_CREAT | O_EXCL,
3905 			    S_IRUSR | S_IWUSR | S_IRGRP| S_IROTH)) != -1) {
3906 				int len;
3907 				char map_rec[MAXPATHLEN];
3908 
3909 				len = snprintf(map_rec, sizeof (map_rec),
3910 				    "+%s\n*\t-fstype=lofs\t:%s/export/home/&\n",
3911 				    autofs_fstab.zone_fs_special, rootpath);
3912 				(void) write(fd, map_rec, len);
3913 				(void) close(fd);
3914 			}
3915 
3916 			/*
3917 			 * Mount auto_home_<zone> in the global zone if absent.
3918 			 * If it's already of type autofs, then
3919 			 * don't mount it again.
3920 			 */
3921 			if ((stat(autofs_fstab.zone_fs_dir, &stat_buf) == -1) ||
3922 			    strcmp(stat_buf.st_fstype, MNTTYPE_AUTOFS) != 0) {
3923 				char optstr[] = "indirect,ignore,nobrowse";
3924 
3925 				(void) make_one_dir(zlogp, "",
3926 				    autofs_fstab.zone_fs_dir, DEFAULT_DIR_MODE,
3927 				    DEFAULT_DIR_USER, DEFAULT_DIR_GROUP);
3928 
3929 				/*
3930 				 * Mount will fail if automounter has already
3931 				 * processed the auto_home_<zonename> map
3932 				 */
3933 				(void) domount(zlogp, MNTTYPE_AUTOFS, optstr,
3934 				    autofs_fstab.zone_fs_special,
3935 				    autofs_fstab.zone_fs_dir);
3936 			}
3937 			continue;
3938 		}
3939 
3940 
3941 		if (zone_get_state(zid_name, &zid_state) != Z_OK ||
3942 		    (zid_state != ZONE_STATE_READY &&
3943 		    zid_state != ZONE_STATE_RUNNING))
3944 			/* Skip over zones without mounted filesystems */
3945 			continue;
3946 
3947 		if (zone_getattr(zids[i], ZONE_ATTR_SLBL, zid_label,
3948 		    sizeof (m_label_t)) < 0)
3949 			/* Skip over zones with unspecified label */
3950 			continue;
3951 
3952 		if (zone_getattr(zids[i], ZONE_ATTR_ROOT, zid_rpath,
3953 		    sizeof (zid_rpath)) == -1)
3954 			/* Skip over zones with bad path */
3955 			continue;
3956 
3957 		if (zone_getattr(zids[i], ZONE_ATTR_PRIVSET, zid_privs,
3958 		    sizeof (priv_chunk_t) * ip->priv_setsize) == -1)
3959 			/* Skip over zones with bad privs */
3960 			continue;
3961 
3962 		/*
3963 		 * Reading down is valid according to our label model
3964 		 * but some customers want to disable it because it
3965 		 * allows execute down and other possible attacks.
3966 		 * Therefore, we restrict this feature to zones that
3967 		 * have the NET_MAC_AWARE privilege which is required
3968 		 * for NFS read-down semantics.
3969 		 */
3970 		if ((bldominates(zlabel, zid_label)) &&
3971 		    (priv_ismember(zprivs, PRIV_NET_MAC_AWARE))) {
3972 			/*
3973 			 * Our zone dominates this one.
3974 			 * Create a lofs mount from lower zone's /export/home
3975 			 */
3976 			(void) snprintf(lower_fstab.zone_fs_dir, MAXPATHLEN,
3977 			    "%s/zone/%s/export/home", rootpath, zid_name);
3978 
3979 			/*
3980 			 * If the target is already an LOFS mount
3981 			 * then don't do it again.
3982 			 */
3983 			if ((stat(lower_fstab.zone_fs_dir, &stat_buf) == -1) ||
3984 			    strcmp(stat_buf.st_fstype, MNTTYPE_LOFS) != 0) {
3985 
3986 				if (snprintf(lower_fstab.zone_fs_special,
3987 				    MAXPATHLEN, "%s/export",
3988 				    zid_rpath) > MAXPATHLEN)
3989 					continue;
3990 
3991 				/*
3992 				 * Make sure the lower-level home exists
3993 				 */
3994 				if (make_one_dir(zlogp,
3995 				    lower_fstab.zone_fs_special, "/home",
3996 				    DEFAULT_DIR_MODE, DEFAULT_DIR_USER,
3997 				    DEFAULT_DIR_GROUP) != 0)
3998 					continue;
3999 
4000 				(void) strlcat(lower_fstab.zone_fs_special,
4001 				    "/home", MAXPATHLEN);
4002 
4003 				/*
4004 				 * Mount can fail because the lower-level
4005 				 * zone may have already done a mount up.
4006 				 */
4007 				(void) mount_one(zlogp, &lower_fstab, "",
4008 				    Z_MNT_BOOT);
4009 			}
4010 		} else if ((bldominates(zid_label, zlabel)) &&
4011 		    (priv_ismember(zid_privs, PRIV_NET_MAC_AWARE))) {
4012 			/*
4013 			 * This zone dominates our zone.
4014 			 * Create a lofs mount from our zone's /export/home
4015 			 */
4016 			if (snprintf(lower_fstab.zone_fs_dir, MAXPATHLEN,
4017 			    "%s/zone/%s/export/home", zid_rpath,
4018 			    zone_name) > MAXPATHLEN)
4019 				continue;
4020 
4021 			/*
4022 			 * If the target is already an LOFS mount
4023 			 * then don't do it again.
4024 			 */
4025 			if ((stat(lower_fstab.zone_fs_dir, &stat_buf) == -1) ||
4026 			    strcmp(stat_buf.st_fstype, MNTTYPE_LOFS) != 0) {
4027 
4028 				(void) snprintf(lower_fstab.zone_fs_special,
4029 				    MAXPATHLEN, "%s/export/home", rootpath);
4030 
4031 				/*
4032 				 * Mount can fail because the higher-level
4033 				 * zone may have already done a mount down.
4034 				 */
4035 				(void) mount_one(zlogp, &lower_fstab, "",
4036 				    Z_MNT_BOOT);
4037 			}
4038 		}
4039 	}
4040 	zonecfg_free_fs_option_list(lower_fstab.zone_fs_options);
4041 	priv_freeset(zid_privs);
4042 	free(zids);
4043 
4044 	/*
4045 	 * Now share any exported directories from this zone.
4046 	 * Each zone can have its own dfstab.
4047 	 */
4048 
4049 	argv[0] = "zoneshare";
4050 	argv[1] = "-z";
4051 	argv[2] = zone_name;
4052 	argv[3] = NULL;
4053 
4054 	(void) forkexec(zlogp, "/usr/lib/zones/zoneshare", argv);
4055 	/* Don't check for errors since they don't affect the zone */
4056 
4057 	return (0);
4058 }
4059 
4060 /*
4061  * Unmount lofs mounts from higher level zones
4062  * Unshare nfs exported directories
4063  */
4064 static void
4065 tsol_unmounts(zlog_t *zlogp, char *zone_name)
4066 {
4067 	zoneid_t *zids = NULL;
4068 	uint_t nzents_saved;
4069 	uint_t nzents;
4070 	int i;
4071 	char *argv[4];
4072 	char path[MAXPATHLEN];
4073 
4074 	if (!is_system_labeled())
4075 		return;
4076 
4077 	/*
4078 	 * Get the list of zones from the kernel
4079 	 */
4080 	if (zone_list(NULL, &nzents) != 0) {
4081 		return;
4082 	}
4083 
4084 	if (zid_label == NULL) {
4085 		zid_label = m_label_alloc(MAC_LABEL);
4086 		if (zid_label == NULL)
4087 			return;
4088 	}
4089 
4090 again:
4091 	if (nzents == 0)
4092 		return;
4093 
4094 	zids = malloc(nzents * sizeof (zoneid_t));
4095 	if (zids == NULL) {
4096 		zerror(zlogp, B_TRUE, "memory allocation failed");
4097 		return;
4098 	}
4099 	nzents_saved = nzents;
4100 
4101 	if (zone_list(zids, &nzents) != 0) {
4102 		free(zids);
4103 		return;
4104 	}
4105 	if (nzents != nzents_saved) {
4106 		/* list changed, try again */
4107 		free(zids);
4108 		goto again;
4109 	}
4110 
4111 	for (i = 0; i < nzents; i++) {
4112 		char zid_name[ZONENAME_MAX];
4113 		zone_state_t zid_state;
4114 		char zid_rpath[MAXPATHLEN];
4115 
4116 		if (zids[i] == GLOBAL_ZONEID)
4117 			continue;
4118 
4119 		if (getzonenamebyid(zids[i], zid_name, ZONENAME_MAX) == -1)
4120 			continue;
4121 
4122 		/*
4123 		 * Skip the zone we are halting
4124 		 */
4125 		if (strcmp(zid_name, zone_name) == 0)
4126 			continue;
4127 
4128 		if ((zone_getattr(zids[i], ZONE_ATTR_STATUS, &zid_state,
4129 		    sizeof (zid_state)) < 0) ||
4130 		    (zid_state < ZONE_IS_READY))
4131 			/* Skip over zones without mounted filesystems */
4132 			continue;
4133 
4134 		if (zone_getattr(zids[i], ZONE_ATTR_SLBL, zid_label,
4135 		    sizeof (m_label_t)) < 0)
4136 			/* Skip over zones with unspecified label */
4137 			continue;
4138 
4139 		if (zone_getattr(zids[i], ZONE_ATTR_ROOT, zid_rpath,
4140 		    sizeof (zid_rpath)) == -1)
4141 			/* Skip over zones with bad path */
4142 			continue;
4143 
4144 		if (zlabel != NULL && bldominates(zid_label, zlabel)) {
4145 			/*
4146 			 * This zone dominates our zone.
4147 			 * Unmount the lofs mount of our zone's /export/home
4148 			 */
4149 
4150 			if (snprintf(path, MAXPATHLEN,
4151 			    "%s/zone/%s/export/home", zid_rpath,
4152 			    zone_name) > MAXPATHLEN)
4153 				continue;
4154 
4155 			/* Skip over mount failures */
4156 			(void) umount(path);
4157 		}
4158 	}
4159 	free(zids);
4160 
4161 	/*
4162 	 * Unmount global zone autofs trigger for this zone
4163 	 */
4164 	(void) snprintf(path, MAXPATHLEN, "/zone/%s/home", zone_name);
4165 	/* Skip over mount failures */
4166 	(void) umount(path);
4167 
4168 	/*
4169 	 * Next unshare any exported directories from this zone.
4170 	 */
4171 
4172 	argv[0] = "zoneunshare";
4173 	argv[1] = "-z";
4174 	argv[2] = zone_name;
4175 	argv[3] = NULL;
4176 
4177 	(void) forkexec(zlogp, "/usr/lib/zones/zoneunshare", argv);
4178 	/* Don't check for errors since they don't affect the zone */
4179 
4180 	/*
4181 	 * Finally, deallocate any devices in the zone.
4182 	 */
4183 
4184 	argv[0] = "deallocate";
4185 	argv[1] = "-Isz";
4186 	argv[2] = zone_name;
4187 	argv[3] = NULL;
4188 
4189 	(void) forkexec(zlogp, "/usr/sbin/deallocate", argv);
4190 	/* Don't check for errors since they don't affect the zone */
4191 }
4192 
4193 /*
4194  * Fetch the Trusted Extensions label and multi-level ports (MLPs) for
4195  * this zone.
4196  */
4197 static tsol_zcent_t *
4198 get_zone_label(zlog_t *zlogp, priv_set_t *privs)
4199 {
4200 	FILE *fp;
4201 	tsol_zcent_t *zcent = NULL;
4202 	char line[MAXTNZLEN];
4203 
4204 	if ((fp = fopen(TNZONECFG_PATH, "r")) == NULL) {
4205 		zerror(zlogp, B_TRUE, "%s", TNZONECFG_PATH);
4206 		return (NULL);
4207 	}
4208 
4209 	while (fgets(line, sizeof (line), fp) != NULL) {
4210 		/*
4211 		 * Check for malformed database
4212 		 */
4213 		if (strlen(line) == MAXTNZLEN - 1)
4214 			break;
4215 		if ((zcent = tsol_sgetzcent(line, NULL, NULL)) == NULL)
4216 			continue;
4217 		if (strcmp(zcent->zc_name, zone_name) == 0)
4218 			break;
4219 		tsol_freezcent(zcent);
4220 		zcent = NULL;
4221 	}
4222 	(void) fclose(fp);
4223 
4224 	if (zcent == NULL) {
4225 		zerror(zlogp, B_FALSE, "zone requires a label assignment. "
4226 		    "See tnzonecfg(4)");
4227 	} else {
4228 		if (zlabel == NULL)
4229 			zlabel = m_label_alloc(MAC_LABEL);
4230 		/*
4231 		 * Save this zone's privileges for later read-down processing
4232 		 */
4233 		if ((zprivs = priv_allocset()) == NULL) {
4234 			zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
4235 			return (NULL);
4236 		} else {
4237 			priv_copyset(privs, zprivs);
4238 		}
4239 	}
4240 	return (zcent);
4241 }
4242 
4243 /*
4244  * Add the Trusted Extensions multi-level ports for this zone.
4245  */
4246 static void
4247 set_mlps(zlog_t *zlogp, zoneid_t zoneid, tsol_zcent_t *zcent)
4248 {
4249 	tsol_mlp_t *mlp;
4250 	tsol_mlpent_t tsme;
4251 
4252 	if (!is_system_labeled())
4253 		return;
4254 
4255 	tsme.tsme_zoneid = zoneid;
4256 	tsme.tsme_flags = 0;
4257 	for (mlp = zcent->zc_private_mlp; !TSOL_MLP_END(mlp); mlp++) {
4258 		tsme.tsme_mlp = *mlp;
4259 		if (tnmlp(TNDB_LOAD, &tsme) != 0) {
4260 			zerror(zlogp, B_TRUE, "cannot set zone-specific MLP "
4261 			    "on %d-%d/%d", mlp->mlp_port,
4262 			    mlp->mlp_port_upper, mlp->mlp_ipp);
4263 		}
4264 	}
4265 
4266 	tsme.tsme_flags = TSOL_MEF_SHARED;
4267 	for (mlp = zcent->zc_shared_mlp; !TSOL_MLP_END(mlp); mlp++) {
4268 		tsme.tsme_mlp = *mlp;
4269 		if (tnmlp(TNDB_LOAD, &tsme) != 0) {
4270 			zerror(zlogp, B_TRUE, "cannot set shared MLP "
4271 			    "on %d-%d/%d", mlp->mlp_port,
4272 			    mlp->mlp_port_upper, mlp->mlp_ipp);
4273 		}
4274 	}
4275 }
4276 
4277 static void
4278 remove_mlps(zlog_t *zlogp, zoneid_t zoneid)
4279 {
4280 	tsol_mlpent_t tsme;
4281 
4282 	if (!is_system_labeled())
4283 		return;
4284 
4285 	(void) memset(&tsme, 0, sizeof (tsme));
4286 	tsme.tsme_zoneid = zoneid;
4287 	if (tnmlp(TNDB_FLUSH, &tsme) != 0)
4288 		zerror(zlogp, B_TRUE, "cannot flush MLPs");
4289 }
4290 
4291 int
4292 prtmount(const struct mnttab *fs, void *x) {
4293 	zerror((zlog_t *)x, B_FALSE, "  %s", fs->mnt_mountp);
4294 	return (0);
4295 }
4296 
4297 /*
4298  * Look for zones running on the main system that are using this root (or any
4299  * subdirectory of it).  Return B_TRUE and print an error if a conflicting zone
4300  * is found or if we can't tell.
4301  */
4302 static boolean_t
4303 duplicate_zone_root(zlog_t *zlogp, const char *rootpath)
4304 {
4305 	zoneid_t *zids = NULL;
4306 	uint_t nzids = 0;
4307 	boolean_t retv;
4308 	int rlen, zlen;
4309 	char zroot[MAXPATHLEN];
4310 	char zonename[ZONENAME_MAX];
4311 
4312 	for (;;) {
4313 		nzids += 10;
4314 		zids = malloc(nzids * sizeof (*zids));
4315 		if (zids == NULL) {
4316 			zerror(zlogp, B_TRUE, "memory allocation failed");
4317 			return (B_TRUE);
4318 		}
4319 		if (zone_list(zids, &nzids) == 0)
4320 			break;
4321 		free(zids);
4322 	}
4323 	retv = B_FALSE;
4324 	rlen = strlen(rootpath);
4325 	while (nzids > 0) {
4326 		/*
4327 		 * Ignore errors; they just mean that the zone has disappeared
4328 		 * while we were busy.
4329 		 */
4330 		if (zone_getattr(zids[--nzids], ZONE_ATTR_ROOT, zroot,
4331 		    sizeof (zroot)) == -1)
4332 			continue;
4333 		zlen = strlen(zroot);
4334 		if (zlen > rlen)
4335 			zlen = rlen;
4336 		if (strncmp(rootpath, zroot, zlen) == 0 &&
4337 		    (zroot[zlen] == '\0' || zroot[zlen] == '/') &&
4338 		    (rootpath[zlen] == '\0' || rootpath[zlen] == '/')) {
4339 			if (getzonenamebyid(zids[nzids], zonename,
4340 			    sizeof (zonename)) == -1)
4341 				(void) snprintf(zonename, sizeof (zonename),
4342 				    "id %d", (int)zids[nzids]);
4343 			zerror(zlogp, B_FALSE,
4344 			    "zone root %s already in use by zone %s",
4345 			    rootpath, zonename);
4346 			retv = B_TRUE;
4347 			break;
4348 		}
4349 	}
4350 	free(zids);
4351 	return (retv);
4352 }
4353 
4354 /*
4355  * Search for loopback mounts that use this same source node (same device and
4356  * inode).  Return B_TRUE if there is one or if we can't tell.
4357  */
4358 static boolean_t
4359 duplicate_reachable_path(zlog_t *zlogp, const char *rootpath)
4360 {
4361 	struct stat64 rst, zst;
4362 	struct mnttab *mnp;
4363 
4364 	if (stat64(rootpath, &rst) == -1) {
4365 		zerror(zlogp, B_TRUE, "can't stat %s", rootpath);
4366 		return (B_TRUE);
4367 	}
4368 	if (resolve_lofs_mnts == NULL && lofs_read_mnttab(zlogp) == -1)
4369 		return (B_TRUE);
4370 	for (mnp = resolve_lofs_mnts; mnp < resolve_lofs_mnt_max; mnp++) {
4371 		if (mnp->mnt_fstype == NULL ||
4372 		    strcmp(MNTTYPE_LOFS, mnp->mnt_fstype) != 0)
4373 			continue;
4374 		/* We're looking at a loopback mount.  Stat it. */
4375 		if (mnp->mnt_special != NULL &&
4376 		    stat64(mnp->mnt_special, &zst) != -1 &&
4377 		    rst.st_dev == zst.st_dev && rst.st_ino == zst.st_ino) {
4378 			zerror(zlogp, B_FALSE,
4379 			    "zone root %s is reachable through %s",
4380 			    rootpath, mnp->mnt_mountp);
4381 			return (B_TRUE);
4382 		}
4383 	}
4384 	return (B_FALSE);
4385 }
4386 
4387 /*
4388  * Set memory cap and pool info for the zone's resource management
4389  * configuration.
4390  */
4391 static int
4392 setup_zone_rm(zlog_t *zlogp, char *zone_name, zoneid_t zoneid)
4393 {
4394 	int res;
4395 	uint64_t tmp;
4396 	struct zone_mcaptab mcap;
4397 	char sched[MAXNAMELEN];
4398 	zone_dochandle_t handle = NULL;
4399 	char pool_err[128];
4400 
4401 	if ((handle = zonecfg_init_handle()) == NULL) {
4402 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
4403 		return (Z_BAD_HANDLE);
4404 	}
4405 
4406 	if ((res = zonecfg_get_snapshot_handle(zone_name, handle)) != Z_OK) {
4407 		zerror(zlogp, B_FALSE, "invalid configuration");
4408 		zonecfg_fini_handle(handle);
4409 		return (res);
4410 	}
4411 
4412 	/*
4413 	 * If a memory cap is configured, set the cap in the kernel using
4414 	 * zone_setattr() and make sure the rcapd SMF service is enabled.
4415 	 */
4416 	if (zonecfg_getmcapent(handle, &mcap) == Z_OK) {
4417 		uint64_t num;
4418 		char smf_err[128];
4419 
4420 		num = (uint64_t)strtoull(mcap.zone_physmem_cap, NULL, 10);
4421 		if (zone_setattr(zoneid, ZONE_ATTR_PHYS_MCAP, &num, 0) == -1) {
4422 			zerror(zlogp, B_TRUE, "could not set zone memory cap");
4423 			zonecfg_fini_handle(handle);
4424 			return (Z_INVAL);
4425 		}
4426 
4427 		if (zonecfg_enable_rcapd(smf_err, sizeof (smf_err)) != Z_OK) {
4428 			zerror(zlogp, B_FALSE, "enabling system/rcap service "
4429 			    "failed: %s", smf_err);
4430 			zonecfg_fini_handle(handle);
4431 			return (Z_INVAL);
4432 		}
4433 	}
4434 
4435 	/* Get the scheduling class set in the zone configuration. */
4436 	if (zonecfg_get_sched_class(handle, sched, sizeof (sched)) == Z_OK &&
4437 	    strlen(sched) > 0) {
4438 		if (zone_setattr(zoneid, ZONE_ATTR_SCHED_CLASS, sched,
4439 		    strlen(sched)) == -1)
4440 			zerror(zlogp, B_TRUE, "WARNING: unable to set the "
4441 			    "default scheduling class");
4442 
4443 	} else if (zonecfg_get_aliased_rctl(handle, ALIAS_SHARES, &tmp)
4444 	    == Z_OK) {
4445 		/*
4446 		 * If the zone has the zone.cpu-shares rctl set then we want to
4447 		 * use the Fair Share Scheduler (FSS) for processes in the
4448 		 * zone.  Check what scheduling class the zone would be running
4449 		 * in by default so we can print a warning and modify the class
4450 		 * if we wouldn't be using FSS.
4451 		 */
4452 		char class_name[PC_CLNMSZ];
4453 
4454 		if (zonecfg_get_dflt_sched_class(handle, class_name,
4455 		    sizeof (class_name)) != Z_OK) {
4456 			zerror(zlogp, B_FALSE, "WARNING: unable to determine "
4457 			    "the zone's scheduling class");
4458 
4459 		} else if (strcmp("FSS", class_name) != 0) {
4460 			zerror(zlogp, B_FALSE, "WARNING: The zone.cpu-shares "
4461 			    "rctl is set but\nFSS is not the default "
4462 			    "scheduling class for\nthis zone.  FSS will be "
4463 			    "used for processes\nin the zone but to get the "
4464 			    "full benefit of FSS,\nit should be the default "
4465 			    "scheduling class.\nSee dispadmin(1M) for more "
4466 			    "details.");
4467 
4468 			if (zone_setattr(zoneid, ZONE_ATTR_SCHED_CLASS, "FSS",
4469 			    strlen("FSS")) == -1)
4470 				zerror(zlogp, B_TRUE, "WARNING: unable to set "
4471 				    "zone scheduling class to FSS");
4472 		}
4473 	}
4474 
4475 	/*
4476 	 * The next few blocks of code attempt to set up temporary pools as
4477 	 * well as persistent pools.  In all cases we call the functions
4478 	 * unconditionally.  Within each funtion the code will check if the
4479 	 * zone is actually configured for a temporary pool or persistent pool
4480 	 * and just return if there is nothing to do.
4481 	 *
4482 	 * If we are rebooting we want to attempt to reuse any temporary pool
4483 	 * that was previously set up.  zonecfg_bind_tmp_pool() will do the
4484 	 * right thing in all cases (reuse or create) based on the current
4485 	 * zonecfg.
4486 	 */
4487 	if ((res = zonecfg_bind_tmp_pool(handle, zoneid, pool_err,
4488 	    sizeof (pool_err))) != Z_OK) {
4489 		if (res == Z_POOL || res == Z_POOL_CREATE || res == Z_POOL_BIND)
4490 			zerror(zlogp, B_FALSE, "%s: %s\ndedicated-cpu setting "
4491 			    "cannot be instantiated", zonecfg_strerror(res),
4492 			    pool_err);
4493 		else
4494 			zerror(zlogp, B_FALSE, "could not bind zone to "
4495 			    "temporary pool: %s", zonecfg_strerror(res));
4496 		zonecfg_fini_handle(handle);
4497 		return (Z_POOL_BIND);
4498 	}
4499 
4500 	/*
4501 	 * Check if we need to warn about poold not being enabled.
4502 	 */
4503 	if (zonecfg_warn_poold(handle)) {
4504 		zerror(zlogp, B_FALSE, "WARNING: A range of dedicated-cpus has "
4505 		    "been specified\nbut the dynamic pool service is not "
4506 		    "enabled.\nThe system will not dynamically adjust the\n"
4507 		    "processor allocation within the specified range\n"
4508 		    "until svc:/system/pools/dynamic is enabled.\n"
4509 		    "See poold(1M).");
4510 	}
4511 
4512 	/* The following is a warning, not an error. */
4513 	if ((res = zonecfg_bind_pool(handle, zoneid, pool_err,
4514 	    sizeof (pool_err))) != Z_OK) {
4515 		if (res == Z_POOL_BIND)
4516 			zerror(zlogp, B_FALSE, "WARNING: unable to bind to "
4517 			    "pool '%s'; using default pool.", pool_err);
4518 		else if (res == Z_POOL)
4519 			zerror(zlogp, B_FALSE, "WARNING: %s: %s",
4520 			    zonecfg_strerror(res), pool_err);
4521 		else
4522 			zerror(zlogp, B_FALSE, "WARNING: %s",
4523 			    zonecfg_strerror(res));
4524 	}
4525 
4526 	/* Update saved pool name in case it has changed */
4527 	(void) zonecfg_get_poolname(handle, zone_name, pool_name,
4528 	    sizeof (pool_name));
4529 
4530 	zonecfg_fini_handle(handle);
4531 	return (Z_OK);
4532 }
4533 
4534 static void
4535 report_prop_err(zlog_t *zlogp, const char *name, const char *value, int res)
4536 {
4537 	switch (res) {
4538 	case Z_TOO_BIG:
4539 		zerror(zlogp, B_FALSE, "%s property value is too large.", name);
4540 		break;
4541 
4542 	case Z_INVALID_PROPERTY:
4543 		zerror(zlogp, B_FALSE, "%s property value \"%s\" is not valid",
4544 		    name, value);
4545 		break;
4546 
4547 	default:
4548 		zerror(zlogp, B_TRUE, "fetching property %s: %d", name, res);
4549 		break;
4550 	}
4551 }
4552 
4553 /*
4554  * Sets the hostid of the new zone based on its configured value.  The zone's
4555  * zone_t structure must already exist in kernel memory.  'zlogp' refers to the
4556  * log used to report errors and warnings and must be non-NULL.  'zone_namep'
4557  * is the name of the new zone and must be non-NULL.  'zoneid' is the numeric
4558  * ID of the new zone.
4559  *
4560  * This function returns zero on success and a nonzero error code on failure.
4561  */
4562 static int
4563 setup_zone_hostid(zone_dochandle_t handle, zlog_t *zlogp, zoneid_t zoneid)
4564 {
4565 	int res;
4566 	char hostidp[HW_HOSTID_LEN];
4567 	unsigned int hostid;
4568 
4569 	res = zonecfg_get_hostid(handle, hostidp, sizeof (hostidp));
4570 
4571 	if (res == Z_BAD_PROPERTY) {
4572 		return (Z_OK);
4573 	} else if (res != Z_OK) {
4574 		report_prop_err(zlogp, "hostid", hostidp, res);
4575 		return (res);
4576 	}
4577 
4578 	hostid = (unsigned int)strtoul(hostidp, NULL, 16);
4579 	if ((res = zone_setattr(zoneid, ZONE_ATTR_HOSTID, &hostid,
4580 	    sizeof (hostid))) != 0) {
4581 		zerror(zlogp, B_TRUE,
4582 		    "zone hostid is not valid: %s: %d", hostidp, res);
4583 		return (Z_SYSTEM);
4584 	}
4585 
4586 	return (res);
4587 }
4588 
4589 static int
4590 setup_zone_fs_allowed(zone_dochandle_t handle, zlog_t *zlogp, zoneid_t zoneid)
4591 {
4592 	char fsallowedp[ZONE_FS_ALLOWED_MAX];
4593 	int res;
4594 
4595 	res = zonecfg_get_fs_allowed(handle, fsallowedp, sizeof (fsallowedp));
4596 
4597 	if (res == Z_BAD_PROPERTY) {
4598 		return (Z_OK);
4599 	} else if (res != Z_OK) {
4600 		report_prop_err(zlogp, "fs-allowed", fsallowedp, res);
4601 		return (res);
4602 	}
4603 
4604 	if (zone_setattr(zoneid, ZONE_ATTR_FS_ALLOWED, &fsallowedp,
4605 	    sizeof (fsallowedp)) != 0) {
4606 		zerror(zlogp, B_TRUE,
4607 		    "fs-allowed couldn't be set: %s: %d", fsallowedp, res);
4608 		return (Z_SYSTEM);
4609 	}
4610 
4611 	return (res);
4612 }
4613 
4614 static int
4615 setup_zone_attrs(zlog_t *zlogp, char *zone_namep, zoneid_t zoneid)
4616 {
4617 	zone_dochandle_t handle;
4618 	int res = Z_OK;
4619 
4620 	if ((handle = zonecfg_init_handle()) == NULL) {
4621 		zerror(zlogp, B_TRUE, "getting zone configuration handle");
4622 		return (Z_BAD_HANDLE);
4623 	}
4624 	if ((res = zonecfg_get_snapshot_handle(zone_namep, handle)) != Z_OK) {
4625 		zerror(zlogp, B_FALSE, "invalid configuration");
4626 		goto out;
4627 	}
4628 
4629 	if ((res = setup_zone_hostid(handle, zlogp, zoneid)) != Z_OK)
4630 		goto out;
4631 
4632 	if ((res = setup_zone_fs_allowed(handle, zlogp, zoneid)) != Z_OK)
4633 		goto out;
4634 
4635 out:
4636 	zonecfg_fini_handle(handle);
4637 	return (res);
4638 }
4639 
4640 zoneid_t
4641 vplat_create(zlog_t *zlogp, zone_mnt_t mount_cmd)
4642 {
4643 	zoneid_t rval = -1;
4644 	priv_set_t *privs;
4645 	char rootpath[MAXPATHLEN];
4646 	char *rctlbuf = NULL;
4647 	size_t rctlbufsz = 0;
4648 	char *zfsbuf = NULL;
4649 	size_t zfsbufsz = 0;
4650 	zoneid_t zoneid = -1;
4651 	int xerr;
4652 	char *kzone;
4653 	FILE *fp = NULL;
4654 	tsol_zcent_t *zcent = NULL;
4655 	int match = 0;
4656 	int doi = 0;
4657 	int flags;
4658 	zone_iptype_t iptype;
4659 
4660 	if (zone_get_rootpath(zone_name, rootpath, sizeof (rootpath)) != Z_OK) {
4661 		zerror(zlogp, B_TRUE, "unable to determine zone root");
4662 		return (-1);
4663 	}
4664 	if (zonecfg_in_alt_root())
4665 		resolve_lofs(zlogp, rootpath, sizeof (rootpath));
4666 
4667 	if (vplat_get_iptype(zlogp, &iptype) < 0) {
4668 		zerror(zlogp, B_TRUE, "unable to determine ip-type");
4669 		return (-1);
4670 	}
4671 	switch (iptype) {
4672 	case ZS_SHARED:
4673 		flags = 0;
4674 		break;
4675 	case ZS_EXCLUSIVE:
4676 		flags = ZCF_NET_EXCL;
4677 		break;
4678 	}
4679 
4680 	if ((privs = priv_allocset()) == NULL) {
4681 		zerror(zlogp, B_TRUE, "%s failed", "priv_allocset");
4682 		return (-1);
4683 	}
4684 	priv_emptyset(privs);
4685 	if (get_privset(zlogp, privs, mount_cmd) != 0)
4686 		goto error;
4687 
4688 	if (mount_cmd == Z_MNT_BOOT &&
4689 	    get_rctls(zlogp, &rctlbuf, &rctlbufsz) != 0) {
4690 		zerror(zlogp, B_FALSE, "Unable to get list of rctls");
4691 		goto error;
4692 	}
4693 
4694 	if (get_datasets(zlogp, &zfsbuf, &zfsbufsz) != 0) {
4695 		zerror(zlogp, B_FALSE, "Unable to get list of ZFS datasets");
4696 		goto error;
4697 	}
4698 
4699 	if (mount_cmd == Z_MNT_BOOT && is_system_labeled()) {
4700 		zcent = get_zone_label(zlogp, privs);
4701 		if (zcent != NULL) {
4702 			match = zcent->zc_match;
4703 			doi = zcent->zc_doi;
4704 			*zlabel = zcent->zc_label;
4705 		} else {
4706 			goto error;
4707 		}
4708 		if (validate_rootds_label(zlogp, rootpath, zlabel) != 0)
4709 			goto error;
4710 	}
4711 
4712 	kzone = zone_name;
4713 
4714 	/*
4715 	 * We must do this scan twice.  First, we look for zones running on the
4716 	 * main system that are using this root (or any subdirectory of it).
4717 	 * Next, we reduce to the shortest path and search for loopback mounts
4718 	 * that use this same source node (same device and inode).
4719 	 */
4720 	if (duplicate_zone_root(zlogp, rootpath))
4721 		goto error;
4722 	if (duplicate_reachable_path(zlogp, rootpath))
4723 		goto error;
4724 
4725 	if (ALT_MOUNT(mount_cmd)) {
4726 		root_to_lu(zlogp, rootpath, sizeof (rootpath), B_TRUE);
4727 
4728 		/*
4729 		 * Forge up a special root for this zone.  When a zone is
4730 		 * mounted, we can't let the zone have its own root because the
4731 		 * tools that will be used in this "scratch zone" need access
4732 		 * to both the zone's resources and the running machine's
4733 		 * executables.
4734 		 *
4735 		 * Note that the mkdir here also catches read-only filesystems.
4736 		 */
4737 		if (mkdir(rootpath, 0755) != 0 && errno != EEXIST) {
4738 			zerror(zlogp, B_TRUE, "cannot create %s", rootpath);
4739 			goto error;
4740 		}
4741 		if (domount(zlogp, "tmpfs", "", "swap", rootpath) != 0)
4742 			goto error;
4743 	}
4744 
4745 	if (zonecfg_in_alt_root()) {
4746 		/*
4747 		 * If we are mounting up a zone in an alternate root partition,
4748 		 * then we have some additional work to do before starting the
4749 		 * zone.  First, resolve the root path down so that we're not
4750 		 * fooled by duplicates.  Then forge up an internal name for
4751 		 * the zone.
4752 		 */
4753 		if ((fp = zonecfg_open_scratch("", B_TRUE)) == NULL) {
4754 			zerror(zlogp, B_TRUE, "cannot open mapfile");
4755 			goto error;
4756 		}
4757 		if (zonecfg_lock_scratch(fp) != 0) {
4758 			zerror(zlogp, B_TRUE, "cannot lock mapfile");
4759 			goto error;
4760 		}
4761 		if (zonecfg_find_scratch(fp, zone_name, zonecfg_get_root(),
4762 		    NULL, 0) == 0) {
4763 			zerror(zlogp, B_FALSE, "scratch zone already running");
4764 			goto error;
4765 		}
4766 		/* This is the preferred name */
4767 		(void) snprintf(kernzone, sizeof (kernzone), "SUNWlu-%s",
4768 		    zone_name);
4769 		srandom(getpid());
4770 		while (zonecfg_reverse_scratch(fp, kernzone, NULL, 0, NULL,
4771 		    0) == 0) {
4772 			/* This is just an arbitrary name; note "." usage */
4773 			(void) snprintf(kernzone, sizeof (kernzone),
4774 			    "SUNWlu.%08lX%08lX", random(), random());
4775 		}
4776 		kzone = kernzone;
4777 	}
4778 
4779 	xerr = 0;
4780 	if ((zoneid = zone_create(kzone, rootpath, privs, rctlbuf,
4781 	    rctlbufsz, zfsbuf, zfsbufsz, &xerr, match, doi, zlabel,
4782 	    flags)) == -1) {
4783 		if (xerr == ZE_AREMOUNTS) {
4784 			if (zonecfg_find_mounts(rootpath, NULL, NULL) < 1) {
4785 				zerror(zlogp, B_FALSE,
4786 				    "An unknown file-system is mounted on "
4787 				    "a subdirectory of %s", rootpath);
4788 			} else {
4789 
4790 				zerror(zlogp, B_FALSE,
4791 				    "These file-systems are mounted on "
4792 				    "subdirectories of %s:", rootpath);
4793 				(void) zonecfg_find_mounts(rootpath,
4794 				    prtmount, zlogp);
4795 			}
4796 		} else if (xerr == ZE_CHROOTED) {
4797 			zerror(zlogp, B_FALSE, "%s: "
4798 			    "cannot create a zone from a chrooted "
4799 			    "environment", "zone_create");
4800 		} else if (xerr == ZE_LABELINUSE) {
4801 			char zonename[ZONENAME_MAX];
4802 			(void) getzonenamebyid(getzoneidbylabel(zlabel),
4803 			    zonename, ZONENAME_MAX);
4804 			zerror(zlogp, B_FALSE, "The zone label is already "
4805 			    "used by the zone '%s'.", zonename);
4806 		} else {
4807 			zerror(zlogp, B_TRUE, "%s failed", "zone_create");
4808 		}
4809 		goto error;
4810 	}
4811 
4812 	if (zonecfg_in_alt_root() &&
4813 	    zonecfg_add_scratch(fp, zone_name, kernzone,
4814 	    zonecfg_get_root()) == -1) {
4815 		zerror(zlogp, B_TRUE, "cannot add mapfile entry");
4816 		goto error;
4817 	}
4818 
4819 	/*
4820 	 * The following actions are not performed when merely mounting a zone
4821 	 * for administrative use.
4822 	 */
4823 	if (mount_cmd == Z_MNT_BOOT) {
4824 		brand_handle_t bh;
4825 		struct brand_attr attr;
4826 		char modname[MAXPATHLEN];
4827 
4828 		if (setup_zone_attrs(zlogp, zone_name, zoneid) != Z_OK)
4829 			goto error;
4830 
4831 		if ((bh = brand_open(brand_name)) == NULL) {
4832 			zerror(zlogp, B_FALSE,
4833 			    "unable to determine brand name");
4834 			goto error;
4835 		}
4836 
4837 		if (!is_system_labeled() &&
4838 		    (strcmp(brand_name, LABELED_BRAND_NAME) == 0)) {
4839 			brand_close(bh);
4840 			zerror(zlogp, B_FALSE,
4841 			    "cannot boot labeled zone on unlabeled system");
4842 			goto error;
4843 		}
4844 
4845 		/*
4846 		 * If this brand requires any kernel support, now is the time to
4847 		 * get it loaded and initialized.
4848 		 */
4849 		if (brand_get_modname(bh, modname, MAXPATHLEN) < 0) {
4850 			brand_close(bh);
4851 			zerror(zlogp, B_FALSE,
4852 			    "unable to determine brand kernel module");
4853 			goto error;
4854 		}
4855 		brand_close(bh);
4856 
4857 		if (strlen(modname) > 0) {
4858 			(void) strlcpy(attr.ba_brandname, brand_name,
4859 			    sizeof (attr.ba_brandname));
4860 			(void) strlcpy(attr.ba_modname, modname,
4861 			    sizeof (attr.ba_modname));
4862 			if (zone_setattr(zoneid, ZONE_ATTR_BRAND, &attr,
4863 			    sizeof (attr) != 0)) {
4864 				zerror(zlogp, B_TRUE,
4865 				    "could not set zone brand attribute.");
4866 				goto error;
4867 			}
4868 		}
4869 
4870 		if (setup_zone_rm(zlogp, zone_name, zoneid) != Z_OK)
4871 			goto error;
4872 
4873 		set_mlps(zlogp, zoneid, zcent);
4874 	}
4875 
4876 	rval = zoneid;
4877 	zoneid = -1;
4878 
4879 error:
4880 	if (zoneid != -1) {
4881 		(void) zone_shutdown(zoneid);
4882 		(void) zone_destroy(zoneid);
4883 	}
4884 	if (rctlbuf != NULL)
4885 		free(rctlbuf);
4886 	priv_freeset(privs);
4887 	if (fp != NULL)
4888 		zonecfg_close_scratch(fp);
4889 	lofs_discard_mnttab();
4890 	if (zcent != NULL)
4891 		tsol_freezcent(zcent);
4892 	return (rval);
4893 }
4894 
4895 /*
4896  * Enter the zone and write a /etc/zones/index file there.  This allows
4897  * libzonecfg (and thus zoneadm) to report the UUID and potentially other zone
4898  * details from inside the zone.
4899  */
4900 static void
4901 write_index_file(zoneid_t zoneid)
4902 {
4903 	FILE *zef;
4904 	FILE *zet;
4905 	struct zoneent *zep;
4906 	pid_t child;
4907 	int tmpl_fd;
4908 	ctid_t ct;
4909 	int fd;
4910 	char uuidstr[UUID_PRINTABLE_STRING_LENGTH];
4911 
4912 	/* Locate the zone entry in the global zone's index file */
4913 	if ((zef = setzoneent()) == NULL)
4914 		return;
4915 	while ((zep = getzoneent_private(zef)) != NULL) {
4916 		if (strcmp(zep->zone_name, zone_name) == 0)
4917 			break;
4918 		free(zep);
4919 	}
4920 	endzoneent(zef);
4921 	if (zep == NULL)
4922 		return;
4923 
4924 	if ((tmpl_fd = init_template()) == -1) {
4925 		free(zep);
4926 		return;
4927 	}
4928 
4929 	if ((child = fork()) == -1) {
4930 		(void) ct_tmpl_clear(tmpl_fd);
4931 		(void) close(tmpl_fd);
4932 		free(zep);
4933 		return;
4934 	}
4935 
4936 	/* parent waits for child to finish */
4937 	if (child != 0) {
4938 		free(zep);
4939 		if (contract_latest(&ct) == -1)
4940 			ct = -1;
4941 		(void) ct_tmpl_clear(tmpl_fd);
4942 		(void) close(tmpl_fd);
4943 		(void) waitpid(child, NULL, 0);
4944 		(void) contract_abandon_id(ct);
4945 		return;
4946 	}
4947 
4948 	/* child enters zone and sets up index file */
4949 	(void) ct_tmpl_clear(tmpl_fd);
4950 	if (zone_enter(zoneid) != -1) {
4951 		(void) mkdir(ZONE_CONFIG_ROOT, ZONE_CONFIG_MODE);
4952 		(void) chown(ZONE_CONFIG_ROOT, ZONE_CONFIG_UID,
4953 		    ZONE_CONFIG_GID);
4954 		fd = open(ZONE_INDEX_FILE, O_WRONLY|O_CREAT|O_TRUNC,
4955 		    ZONE_INDEX_MODE);
4956 		if (fd != -1 && (zet = fdopen(fd, "w")) != NULL) {
4957 			(void) fchown(fd, ZONE_INDEX_UID, ZONE_INDEX_GID);
4958 			if (uuid_is_null(zep->zone_uuid))
4959 				uuidstr[0] = '\0';
4960 			else
4961 				uuid_unparse(zep->zone_uuid, uuidstr);
4962 			(void) fprintf(zet, "%s:%s:/:%s\n", zep->zone_name,
4963 			    zone_state_str(zep->zone_state),
4964 			    uuidstr);
4965 			(void) fclose(zet);
4966 		}
4967 	}
4968 	_exit(0);
4969 }
4970 
4971 int
4972 vplat_bringup(zlog_t *zlogp, zone_mnt_t mount_cmd, zoneid_t zoneid)
4973 {
4974 	char zonepath[MAXPATHLEN];
4975 
4976 	if (mount_cmd == Z_MNT_BOOT && validate_datasets(zlogp) != 0) {
4977 		lofs_discard_mnttab();
4978 		return (-1);
4979 	}
4980 
4981 	/*
4982 	 * Before we try to mount filesystems we need to create the
4983 	 * attribute backing store for /dev
4984 	 */
4985 	if (zone_get_zonepath(zone_name, zonepath, sizeof (zonepath)) != Z_OK) {
4986 		lofs_discard_mnttab();
4987 		return (-1);
4988 	}
4989 	resolve_lofs(zlogp, zonepath, sizeof (zonepath));
4990 
4991 	/* Make /dev directory owned by root, grouped sys */
4992 	if (make_one_dir(zlogp, zonepath, "/dev", DEFAULT_DIR_MODE,
4993 	    0, 3) != 0) {
4994 		lofs_discard_mnttab();
4995 		return (-1);
4996 	}
4997 
4998 	if (mount_filesystems(zlogp, mount_cmd) != 0) {
4999 		lofs_discard_mnttab();
5000 		return (-1);
5001 	}
5002 
5003 	if (mount_cmd == Z_MNT_BOOT) {
5004 		zone_iptype_t iptype;
5005 
5006 		if (vplat_get_iptype(zlogp, &iptype) < 0) {
5007 			zerror(zlogp, B_TRUE, "unable to determine ip-type");
5008 			lofs_discard_mnttab();
5009 			return (-1);
5010 		}
5011 
5012 		switch (iptype) {
5013 		case ZS_SHARED:
5014 			/* Always do this to make lo0 get configured */
5015 			if (configure_shared_network_interfaces(zlogp) != 0) {
5016 				lofs_discard_mnttab();
5017 				return (-1);
5018 			}
5019 			break;
5020 		case ZS_EXCLUSIVE:
5021 			if (configure_exclusive_network_interfaces(zlogp,
5022 			    zoneid) !=
5023 			    0) {
5024 				lofs_discard_mnttab();
5025 				return (-1);
5026 			}
5027 			break;
5028 		}
5029 	}
5030 
5031 	write_index_file(zoneid);
5032 
5033 	lofs_discard_mnttab();
5034 	return (0);
5035 }
5036 
5037 static int
5038 lu_root_teardown(zlog_t *zlogp)
5039 {
5040 	char zroot[MAXPATHLEN];
5041 
5042 	if (zone_get_rootpath(zone_name, zroot, sizeof (zroot)) != Z_OK) {
5043 		zerror(zlogp, B_FALSE, "unable to determine zone root");
5044 		return (-1);
5045 	}
5046 	root_to_lu(zlogp, zroot, sizeof (zroot), B_FALSE);
5047 
5048 	/*
5049 	 * At this point, the processes are gone, the filesystems (save the
5050 	 * root) are unmounted, and the zone is on death row.  But there may
5051 	 * still be creds floating about in the system that reference the
5052 	 * zone_t, and which pin down zone_rootvp causing this call to fail
5053 	 * with EBUSY.  Thus, we try for a little while before just giving up.
5054 	 * (How I wish this were not true, and umount2 just did the right
5055 	 * thing, or tmpfs supported MS_FORCE This is a gross hack.)
5056 	 */
5057 	if (umount2(zroot, MS_FORCE) != 0) {
5058 		if (errno == ENOTSUP && umount2(zroot, 0) == 0)
5059 			goto unmounted;
5060 		if (errno == EBUSY) {
5061 			int tries = 10;
5062 
5063 			while (--tries >= 0) {
5064 				(void) sleep(1);
5065 				if (umount2(zroot, 0) == 0)
5066 					goto unmounted;
5067 				if (errno != EBUSY)
5068 					break;
5069 			}
5070 		}
5071 		zerror(zlogp, B_TRUE, "unable to unmount '%s'", zroot);
5072 		return (-1);
5073 	}
5074 unmounted:
5075 
5076 	/*
5077 	 * Only zones in an alternate root environment have scratch zone
5078 	 * entries.
5079 	 */
5080 	if (zonecfg_in_alt_root()) {
5081 		FILE *fp;
5082 		int retv;
5083 
5084 		if ((fp = zonecfg_open_scratch("", B_FALSE)) == NULL) {
5085 			zerror(zlogp, B_TRUE, "cannot open mapfile");
5086 			return (-1);
5087 		}
5088 		retv = -1;
5089 		if (zonecfg_lock_scratch(fp) != 0)
5090 			zerror(zlogp, B_TRUE, "cannot lock mapfile");
5091 		else if (zonecfg_delete_scratch(fp, kernzone) != 0)
5092 			zerror(zlogp, B_TRUE, "cannot delete map entry");
5093 		else
5094 			retv = 0;
5095 		zonecfg_close_scratch(fp);
5096 		return (retv);
5097 	} else {
5098 		return (0);
5099 	}
5100 }
5101 
5102 int
5103 vplat_teardown(zlog_t *zlogp, boolean_t unmount_cmd, boolean_t rebooting)
5104 {
5105 	char *kzone;
5106 	zoneid_t zoneid;
5107 	int res;
5108 	char pool_err[128];
5109 	char zpath[MAXPATHLEN];
5110 	char cmdbuf[MAXPATHLEN];
5111 	brand_handle_t bh = NULL;
5112 	dladm_status_t status;
5113 	char errmsg[DLADM_STRSIZE];
5114 	ushort_t flags;
5115 
5116 	kzone = zone_name;
5117 	if (zonecfg_in_alt_root()) {
5118 		FILE *fp;
5119 
5120 		if ((fp = zonecfg_open_scratch("", B_FALSE)) == NULL) {
5121 			zerror(zlogp, B_TRUE, "unable to open map file");
5122 			goto error;
5123 		}
5124 		if (zonecfg_find_scratch(fp, zone_name, zonecfg_get_root(),
5125 		    kernzone, sizeof (kernzone)) != 0) {
5126 			zerror(zlogp, B_FALSE, "unable to find scratch zone");
5127 			zonecfg_close_scratch(fp);
5128 			goto error;
5129 		}
5130 		zonecfg_close_scratch(fp);
5131 		kzone = kernzone;
5132 	}
5133 
5134 	if ((zoneid = getzoneidbyname(kzone)) == ZONE_ID_UNDEFINED) {
5135 		if (!bringup_failure_recovery)
5136 			zerror(zlogp, B_TRUE, "unable to get zoneid");
5137 		if (unmount_cmd)
5138 			(void) lu_root_teardown(zlogp);
5139 		goto error;
5140 	}
5141 
5142 	if (remove_datalink_pool(zlogp, zoneid) != 0) {
5143 		zerror(zlogp, B_FALSE, "unable clear datalink pool property");
5144 		goto error;
5145 	}
5146 
5147 	if (remove_datalink_protect(zlogp, zoneid) != 0) {
5148 		zerror(zlogp, B_FALSE,
5149 		    "unable clear datalink protect property");
5150 		goto error;
5151 	}
5152 
5153 	/*
5154 	 * The datalinks assigned to the zone will be removed from the NGZ as
5155 	 * part of zone_shutdown() so that we need to remove protect/pool etc.
5156 	 * before zone_shutdown(). Even if the shutdown itself fails, the zone
5157 	 * will not be able to violate any constraints applied because the
5158 	 * datalinks are no longer available to the zone.
5159 	 */
5160 	if (zone_shutdown(zoneid) != 0) {
5161 		zerror(zlogp, B_TRUE, "unable to shutdown zone");
5162 		goto error;
5163 	}
5164 
5165 	/* Get the zonepath of this zone */
5166 	if (zone_get_zonepath(zone_name, zpath, sizeof (zpath)) != Z_OK) {
5167 		zerror(zlogp, B_FALSE, "unable to determine zone path");
5168 		goto error;
5169 	}
5170 
5171 	/* Get a handle to the brand info for this zone */
5172 	if ((bh = brand_open(brand_name)) == NULL) {
5173 		zerror(zlogp, B_FALSE, "unable to determine zone brand");
5174 		return (-1);
5175 	}
5176 	/*
5177 	 * If there is a brand 'halt' callback, execute it now to give the
5178 	 * brand a chance to cleanup any custom configuration.
5179 	 */
5180 	(void) strcpy(cmdbuf, EXEC_PREFIX);
5181 	if (brand_get_halt(bh, zone_name, zpath, cmdbuf + EXEC_LEN,
5182 	    sizeof (cmdbuf) - EXEC_LEN) < 0) {
5183 		brand_close(bh);
5184 		zerror(zlogp, B_FALSE, "unable to determine branded zone's "
5185 		    "halt callback.");
5186 		goto error;
5187 	}
5188 	brand_close(bh);
5189 
5190 	if ((strlen(cmdbuf) > EXEC_LEN) &&
5191 	    (do_subproc(zlogp, cmdbuf, NULL) != Z_OK)) {
5192 		zerror(zlogp, B_FALSE, "%s failed", cmdbuf);
5193 		goto error;
5194 	}
5195 
5196 	if (!unmount_cmd) {
5197 		zone_iptype_t iptype;
5198 
5199 		if (zone_getattr(zoneid, ZONE_ATTR_FLAGS, &flags,
5200 		    sizeof (flags)) < 0) {
5201 			if (vplat_get_iptype(zlogp, &iptype) < 0) {
5202 				zerror(zlogp, B_TRUE, "unable to determine "
5203 				    "ip-type");
5204 				goto error;
5205 			}
5206 		} else {
5207 			if (flags & ZF_NET_EXCL)
5208 				iptype = ZS_EXCLUSIVE;
5209 			else
5210 				iptype = ZS_SHARED;
5211 		}
5212 
5213 		switch (iptype) {
5214 		case ZS_SHARED:
5215 			if (unconfigure_shared_network_interfaces(zlogp,
5216 			    zoneid) != 0) {
5217 				zerror(zlogp, B_FALSE, "unable to unconfigure "
5218 				    "network interfaces in zone");
5219 				goto error;
5220 			}
5221 			break;
5222 		case ZS_EXCLUSIVE:
5223 			if (unconfigure_exclusive_network_interfaces(zlogp,
5224 			    zoneid) != 0) {
5225 				zerror(zlogp, B_FALSE, "unable to unconfigure "
5226 				    "network interfaces in zone");
5227 				goto error;
5228 			}
5229 			status = dladm_zone_halt(dld_handle, zoneid);
5230 			if (status != DLADM_STATUS_OK) {
5231 				zerror(zlogp, B_FALSE, "unable to notify "
5232 				    "dlmgmtd of zone halt: %s",
5233 				    dladm_status2str(status, errmsg));
5234 			}
5235 			break;
5236 		}
5237 	}
5238 
5239 	if (!unmount_cmd && tcp_abort_connections(zlogp, zoneid) != 0) {
5240 		zerror(zlogp, B_TRUE, "unable to abort TCP connections");
5241 		goto error;
5242 	}
5243 
5244 	if (unmount_filesystems(zlogp, zoneid, unmount_cmd) != 0) {
5245 		zerror(zlogp, B_FALSE,
5246 		    "unable to unmount file systems in zone");
5247 		goto error;
5248 	}
5249 
5250 	/*
5251 	 * If we are rebooting then we normally don't want to destroy an
5252 	 * existing temporary pool at this point so that we can just reuse it
5253 	 * when the zone boots back up.  However, it is also possible we were
5254 	 * running with a temporary pool and the zone configuration has been
5255 	 * modified to no longer use a temporary pool.  In that case we need
5256 	 * to destroy the temporary pool now.  This case looks like the case
5257 	 * where we never had a temporary pool configured but
5258 	 * zonecfg_destroy_tmp_pool will do the right thing either way.
5259 	 */
5260 	if (!unmount_cmd) {
5261 		boolean_t destroy_tmp_pool = B_TRUE;
5262 
5263 		if (rebooting) {
5264 			struct zone_psettab pset_tab;
5265 			zone_dochandle_t handle;
5266 
5267 			if ((handle = zonecfg_init_handle()) != NULL &&
5268 			    zonecfg_get_handle(zone_name, handle) == Z_OK &&
5269 			    zonecfg_lookup_pset(handle, &pset_tab) == Z_OK)
5270 				destroy_tmp_pool = B_FALSE;
5271 
5272 			zonecfg_fini_handle(handle);
5273 		}
5274 
5275 		if (destroy_tmp_pool) {
5276 			if ((res = zonecfg_destroy_tmp_pool(zone_name, pool_err,
5277 			    sizeof (pool_err))) != Z_OK) {
5278 				if (res == Z_POOL)
5279 					zerror(zlogp, B_FALSE, pool_err);
5280 			}
5281 		}
5282 	}
5283 
5284 	remove_mlps(zlogp, zoneid);
5285 
5286 	if (zone_destroy(zoneid) != 0) {
5287 		zerror(zlogp, B_TRUE, "unable to destroy zone");
5288 		goto error;
5289 	}
5290 
5291 	/*
5292 	 * Special teardown for alternate boot environments: remove the tmpfs
5293 	 * root for the zone and then remove it from the map file.
5294 	 */
5295 	if (unmount_cmd && lu_root_teardown(zlogp) != 0)
5296 		goto error;
5297 
5298 	lofs_discard_mnttab();
5299 	return (0);
5300 
5301 error:
5302 	lofs_discard_mnttab();
5303 	return (-1);
5304 }
5305