xref: /illumos-gate/usr/src/cmd/zlogin/zlogin.c (revision 4aac33d31b41cc7e3ac6fb66747ff2cae63d08cf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * zlogin provides three types of login which allow users in the global
30  * zone to access non-global zones.
31  *
32  * - "interactive login" is similar to rlogin(1); for example, the user could
33  *   issue 'zlogin my-zone' or 'zlogin -e ^ -l me my-zone'.   The user is
34  *   granted a new pty (which is then shoved into the zone), and an I/O
35  *   loop between parent and child processes takes care of the interactive
36  *   session.  In this mode, login(1) (and its -c option, which means
37  *   "already authenticated") is employed to take care of the initialization
38  *   of the user's session.
39  *
40  * - "non-interactive login" is similar to su(1M); the user could issue
41  *   'zlogin my-zone ls -l' and the command would be run as specified.
42  *   In this mode, zlogin sets up pipes as the communication channel, and
43  *   'su' is used to do the login setup work.
44  *
45  * - "console login" is the equivalent to accessing the tip line for a
46  *   zone.  For example, the user can issue 'zlogin -C my-zone'.
47  *   In this mode, zlogin contacts the zoneadmd process via unix domain
48  *   socket.  If zoneadmd is not running, it starts it.  This allows the
49  *   console to be available anytime the zone is installed, regardless of
50  *   whether it is running.
51  */
52 
53 #include <sys/socket.h>
54 #include <sys/termios.h>
55 #include <sys/utsname.h>
56 #include <sys/stat.h>
57 #include <sys/types.h>
58 #include <sys/contract/process.h>
59 #include <sys/ctfs.h>
60 #include <sys/brand.h>
61 
62 #include <alloca.h>
63 #include <assert.h>
64 #include <ctype.h>
65 #include <door.h>
66 #include <errno.h>
67 #include <poll.h>
68 #include <priv.h>
69 #include <pwd.h>
70 #include <unistd.h>
71 #include <utmpx.h>
72 #include <sac.h>
73 #include <signal.h>
74 #include <stdarg.h>
75 #include <stdio.h>
76 #include <stdlib.h>
77 #include <string.h>
78 #include <strings.h>
79 #include <stropts.h>
80 #include <wait.h>
81 #include <zone.h>
82 #include <fcntl.h>
83 #include <libdevinfo.h>
84 #include <libintl.h>
85 #include <locale.h>
86 #include <libzonecfg.h>
87 #include <libcontract.h>
88 #include <libbrand.h>
89 
/*
 * Master side of the session: the console socket once get_console_master()
 * connects, or the /dev/ptmx descriptor opened by get_master_pty().
 */
90 static int masterfd;
/* Terminal settings captured before raw mode; restored by reset_tty(). */
91 static struct termios save_termios;
/* Raw-mode settings with the saved VEOF/VEOL put back; see set_tty_rawmode */
92 static struct termios effective_termios;
/* Descriptor whose settings save_termios was taken from. */
93 static int save_fd;
/* Window size read from stdin, later applied to the slave pty. */
94 static struct winsize winsize;
/* Set to 1 by sigcld() once child_pid has exited or been killed. */
95 static volatile int dead;
/* Child watched by sigcld() and signalled by sig_forward(); -1 if none. */
96 static volatile pid_t child_pid = -1;
/* Read by prefork_dropprivs(); zero keeps the proc_* privileges. */
97 static int interactive = 0;
/* Privilege set built by prefork_dropprivs(), applied after fork(). */
98 static priv_set_t *dropprivs;
99 
/* Non-zero disables command-character handling in process_user_input(). */
100 static int nocmdchar = 0;
/* Non-zero selects failsafe mode (FAILSAFESHELL, tolerant pty setup). */
101 static int failsafe = 0;
/* Escape character recognized at the beginning of a line. */
102 static char cmdchar = '~';
103 
/* Unexpected poll revents recorded by doio() when it gives up. */
104 static int pollerr = 0;
105 
/* Program name for messages; set by getpname(). */
106 static const char *pname;
107 
108 #if !defined(TEXT_DOMAIN)		/* should be defined by cc -D */
109 #define	TEXT_DOMAIN	"SYS_TEST"	/* Use this only if it wasn't */
110 #endif
111 
112 #define	SUPATH	"/usr/bin/su"
113 #define	FAILSAFESHELL	"/sbin/sh"
114 #define	DEFAULTSHELL	"/sbin/sh"
115 #define	DEF_PATH	"/usr/sbin:/usr/bin"
116 
117 /*
118  * The ZLOGIN_BUFSIZ is larger than PIPE_BUF so we can be sure we're clearing
119  * out the pipe when the child is exiting.  The ZLOGIN_RDBUFSIZ must be less
120  * than ZLOGIN_BUFSIZ (because we share the buffer in doio).  This value is
121  * also chosen in conjunction with the HI_WATER setting to make sure we
122  * don't fill up the pipe.  We can write FIFOHIWAT (16k) into the pipe before
123  * blocking.  By having ZLOGIN_RDBUFSIZ set to 1k and HI_WATER set to 8k, we
124  * know we can always write a ZLOGIN_RDBUFSIZ chunk into the pipe when there
125  * is less than HI_WATER data already in the pipe.
126  */
127 #define	ZLOGIN_BUFSIZ	8192
128 #define	ZLOGIN_RDBUFSIZ	1024
129 #define	HI_WATER	8192
130 
131 /*
132  * See canonify() below.  CANONIFY_LEN is the maximum length that a
133  * "canonical" sequence will expand to (backslash, three octal digits, NUL).
134  */
135 #define	CANONIFY_LEN 5
136 
137 static void
138 usage(void)
139 {
140 	(void) fprintf(stderr, gettext("usage: %s [ -CES ] [ -e cmdchar ] "
141 	    "[-l user] zonename [command [args ...] ]\n"), pname);
142 	exit(2);
143 }
144 
145 static const char *
146 getpname(const char *arg0)
147 {
148 	const char *p = strrchr(arg0, '/');
149 
150 	if (p == NULL)
151 		p = arg0;
152 	else
153 		p++;
154 
155 	pname = p;
156 	return (p);
157 }
158 
159 static void
160 zerror(const char *fmt, ...)
161 {
162 	va_list alist;
163 
164 	(void) fprintf(stderr, "%s: ", pname);
165 	va_start(alist, fmt);
166 	(void) vfprintf(stderr, fmt, alist);
167 	va_end(alist);
168 	(void) fprintf(stderr, "\n");
169 }
170 
171 static void
172 zperror(const char *str)
173 {
174 	const char *estr;
175 
176 	if ((estr = strerror(errno)) != NULL)
177 		(void) fprintf(stderr, "%s: %s: %s\n", pname, str, estr);
178 	else
179 		(void) fprintf(stderr, "%s: %s: errno %d\n", pname, str, errno);
180 }
181 
182 /*
183  * The first part of our privilege dropping scheme needs to be called before
184  * fork(), since we must have it for security; we don't want to be surprised
185  * later that we couldn't allocate the privset.
186  */
187 static int
188 prefork_dropprivs()
189 {
190 	if ((dropprivs = priv_allocset()) == NULL)
191 		return (1);
192 	priv_emptyset(dropprivs);
193 
194 	/*
195 	 * We need these privileges in order to query session information and
196 	 * send signals.
197 	 */
198 	if (interactive == 0) {
199 		if (priv_addset(dropprivs, "proc_session") == -1)
200 			return (1);
201 		if (priv_addset(dropprivs, "proc_zone") == -1)
202 			return (1);
203 		if (priv_addset(dropprivs, "proc_owner") == -1)
204 			return (1);
205 	}
206 
207 	return (0);
208 }
209 
210 /*
211  * The second part of the privilege drop.  We are paranoid about being attacked
212  * by the zone, so we drop all privileges.  This should prevent a compromise
213  * which gets us to fork(), exec(), symlink(), etc.
214  */
215 static void
216 postfork_dropprivs()
217 {
218 	if ((setppriv(PRIV_SET, PRIV_PERMITTED, dropprivs)) == -1) {
219 		zperror(gettext("Warning: could not set permitted privileges"));
220 	}
221 	if ((setppriv(PRIV_SET, PRIV_LIMIT, dropprivs)) == -1) {
222 		zperror(gettext("Warning: could not set limit privileges"));
223 	}
224 	if ((setppriv(PRIV_SET, PRIV_INHERITABLE, dropprivs)) == -1) {
225 		zperror(gettext("Warning: could not set inheritable "
226 		    "privileges"));
227 	}
228 }
229 
230 /*
231  * Create the unix domain socket and call the zoneadmd server; handshake
232  * with it to determine whether it will allow us to connect.
233  */
234 static int
235 get_console_master(const char *zname)
236 {
237 	int sockfd = -1;
238 	struct sockaddr_un servaddr;
239 	char clientid[MAXPATHLEN];
240 	char handshake[MAXPATHLEN], c;
241 	int msglen;
242 	int i = 0, err = 0;
243 
244 	if ((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
245 		zperror(gettext("could not create socket"));
246 		return (-1);
247 	}
248 
249 	bzero(&servaddr, sizeof (servaddr));
250 	servaddr.sun_family = AF_UNIX;
251 	(void) snprintf(servaddr.sun_path, sizeof (servaddr.sun_path),
252 	    "%s/%s.console_sock", ZONES_TMPDIR, zname);
253 
254 	if (connect(sockfd, (struct sockaddr *)&servaddr,
255 	    sizeof (servaddr)) == -1) {
256 		zperror(gettext("Could not connect to zone console"));
257 		goto bad;
258 	}
259 	masterfd = sockfd;
260 
261 	msglen = snprintf(clientid, sizeof (clientid), "IDENT %lu %s\n",
262 	    getpid(), setlocale(LC_MESSAGES, NULL));
263 
264 	if (msglen >= sizeof (clientid) || msglen < 0) {
265 		zerror("protocol error");
266 		goto bad;
267 	}
268 
269 	if (write(masterfd, clientid, msglen) != msglen) {
270 		zerror("protocol error");
271 		goto bad;
272 	}
273 
274 	bzero(handshake, sizeof (handshake));
275 
276 	/*
277 	 * Take care not to accumulate more than our fill, and leave room for
278 	 * the NUL at the end.
279 	 */
280 	while ((err = read(masterfd, &c, 1)) == 1) {
281 		if (i >= (sizeof (handshake) - 1))
282 			break;
283 		if (c == '\n')
284 			break;
285 		handshake[i] = c;
286 		i++;
287 	}
288 
289 	/*
290 	 * If something went wrong during the handshake we bail; perhaps
291 	 * the server died off.
292 	 */
293 	if (err == -1) {
294 		zperror(gettext("Could not connect to zone console"));
295 		goto bad;
296 	}
297 
298 	if (strncmp(handshake, "OK", sizeof (handshake)) == 0)
299 		return (0);
300 
301 	zerror(gettext("Console is already in use by process ID %s."),
302 	    handshake);
303 bad:
304 	(void) close(sockfd);
305 	masterfd = -1;
306 	return (-1);
307 }
308 
309 
310 /*
311  * Routines to handle pty creation upon zone entry and to shuttle I/O back
312  * and forth between the two terminals.  We also compute and store the
313  * name of the slave terminal associated with the master side.
314  */
315 static int
316 get_master_pty()
317 {
318 	if ((masterfd = open("/dev/ptmx", O_RDWR|O_NONBLOCK)) < 0) {
319 		zperror(gettext("failed to obtain a pseudo-tty"));
320 		return (-1);
321 	}
322 	if (tcgetattr(STDIN_FILENO, &save_termios) == -1) {
323 		zperror(gettext("failed to get terminal settings from stdin"));
324 		return (-1);
325 	}
326 	(void) ioctl(STDIN_FILENO, TIOCGWINSZ, (char *)&winsize);
327 
328 	return (0);
329 }
330 
331 /*
332  * This is a bit tricky; normally a pts device will belong to the zone it
333  * is granted to.  But in the case of "entering" a zone, we need to establish
334  * the pty before entering the zone so that we can vector I/O to and from it
335  * from the global zone.
336  *
337  * We use the zonept() call to let the ptm driver know what we are up to;
338  * the only other hairy bit is the setting of zoneslavename (which happens
339  * above, in get_master_pty()).
340  */
341 static int
342 init_slave_pty(zoneid_t zoneid, char *devroot)
343 {
344 	int slavefd = -1;
345 	char *slavename, zoneslavename[MAXPATHLEN];
346 
347 	/*
348 	 * Set slave permissions, zone the pts, then unlock it.
349 	 */
350 	if (grantpt(masterfd) != 0) {
351 		zperror(gettext("grantpt failed"));
352 		return (-1);
353 	}
354 
355 	if (unlockpt(masterfd) != 0) {
356 		zperror(gettext("unlockpt failed"));
357 		return (-1);
358 	}
359 
360 	/*
361 	 * We must open the slave side before zoning this pty; otherwise
362 	 * the kernel would refuse us the open-- zoning a pty makes it
363 	 * inaccessible to the global zone.  Note we are trying to open
364 	 * the device node via the $ZONEROOT/dev path for this pty.
365 	 *
366 	 * Later we'll close the slave out when once we've opened it again
367 	 * from within the target zone.  Blarg.
368 	 */
369 	if ((slavename = ptsname(masterfd)) == NULL) {
370 		zperror(gettext("failed to get name for pseudo-tty"));
371 		return (-1);
372 	}
373 
374 	(void) snprintf(zoneslavename, sizeof (zoneslavename), "%s%s",
375 	    devroot, slavename);
376 
377 	if ((slavefd = open(zoneslavename, O_RDWR)) < 0) {
378 		zerror(gettext("failed to open %s: %s"), zoneslavename,
379 		    strerror(errno));
380 		return (-1);
381 	}
382 
383 	/*
384 	 * Push hardware emulation (ptem), line discipline (ldterm),
385 	 * and V7/4BSD/Xenix compatibility (ttcompat) modules.
386 	 */
387 	if (ioctl(slavefd, I_PUSH, "ptem") == -1) {
388 		zperror(gettext("failed to push ptem module"));
389 		if (!failsafe)
390 			goto bad;
391 	}
392 
393 	/*
394 	 * Anchor the stream to prevent malicious I_POPs; we prefer to do
395 	 * this prior to entering the zone so that we can detect any errors
396 	 * early, and so that we can set the anchor from the global zone.
397 	 */
398 	if (ioctl(slavefd, I_ANCHOR) == -1) {
399 		zperror(gettext("failed to set stream anchor"));
400 		if (!failsafe)
401 			goto bad;
402 	}
403 
404 	if (ioctl(slavefd, I_PUSH, "ldterm") == -1) {
405 		zperror(gettext("failed to push ldterm module"));
406 		if (!failsafe)
407 			goto bad;
408 	}
409 	if (ioctl(slavefd, I_PUSH, "ttcompat") == -1) {
410 		zperror(gettext("failed to push ttcompat module"));
411 		if (!failsafe)
412 			goto bad;
413 	}
414 
415 	/*
416 	 * Propagate terminal settings from the external term to the new one.
417 	 */
418 	if (tcsetattr(slavefd, TCSAFLUSH, &save_termios) == -1) {
419 		zperror(gettext("failed to set terminal settings"));
420 		if (!failsafe)
421 			goto bad;
422 	}
423 	(void) ioctl(slavefd, TIOCSWINSZ, (char *)&winsize);
424 
425 	if (zonept(masterfd, zoneid) != 0) {
426 		zperror(gettext("could not set zoneid of pty"));
427 		goto bad;
428 	}
429 
430 	return (slavefd);
431 
432 bad:
433 	(void) close(slavefd);
434 	return (-1);
435 }
436 
437 /*
438  * Place terminal into raw mode.
439  */
440 static int
441 set_tty_rawmode(int fd)
442 {
443 	struct termios term;
444 	if (tcgetattr(fd, &term) < 0) {
445 		zperror(gettext("failed to get user terminal settings"));
446 		return (-1);
447 	}
448 
449 	/* Stash for later, so we can revert back to previous mode */
450 	save_termios = term;
451 	save_fd = fd;
452 
453 	/* disable 8->7 bit strip, start/stop, enable any char to restart */
454 	term.c_iflag &= ~(ISTRIP|IXON|IXANY);
455 	/* disable NL->CR, CR->NL, ignore CR, UPPER->lower */
456 	term.c_iflag &= ~(INLCR|ICRNL|IGNCR|IUCLC);
457 	/* disable output post-processing */
458 	term.c_oflag &= ~OPOST;
459 	/* disable canonical mode, signal chars, echo & extended functions */
460 	term.c_lflag &= ~(ICANON|ISIG|ECHO|IEXTEN);
461 
462 	term.c_cc[VMIN] = 1;    /* byte-at-a-time */
463 	term.c_cc[VTIME] = 0;
464 
465 	if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &term)) {
466 		zperror(gettext("failed to set user terminal to raw mode"));
467 		return (-1);
468 	}
469 
470 	/*
471 	 * We need to know the value of VEOF so that we can properly process for
472 	 * client-side ~<EOF>.  But we have obliterated VEOF in term,
473 	 * because VMIN overloads the same array slot in non-canonical mode.
474 	 * Stupid @&^%!
475 	 *
476 	 * So here we construct the "effective" termios from the current
477 	 * terminal settings, and the corrected VEOF and VEOL settings.
478 	 */
479 	if (tcgetattr(STDIN_FILENO, &effective_termios) < 0) {
480 		zperror(gettext("failed to get user terminal settings"));
481 		return (-1);
482 	}
483 	effective_termios.c_cc[VEOF] = save_termios.c_cc[VEOF];
484 	effective_termios.c_cc[VEOL] = save_termios.c_cc[VEOL];
485 
486 	return (0);
487 }
488 
489 /*
490  * Copy terminal window size from our terminal to the pts.
491  */
492 /*ARGSUSED*/
493 static void
494 sigwinch(int s)
495 {
496 	struct winsize ws;
497 
498 	if (ioctl(0, TIOCGWINSZ, &ws) == 0)
499 		(void) ioctl(masterfd, TIOCSWINSZ, &ws);
500 }
501 
502 static void
503 /*ARGSUSED*/
504 sigcld(int s)
505 {
506 	int status;
507 	pid_t pid;
508 
509 	/*
510 	 * Peek at the exit status.  If this isn't the process we cared
511 	 * about, then just reap it.
512 	 */
513 	if ((pid = waitpid(child_pid, &status, WNOHANG|WNOWAIT)) != -1) {
514 		if (pid == child_pid &&
515 		    (WIFEXITED(status) || WIFSIGNALED(status)))
516 			dead = 1;
517 		else
518 			(void) waitpid(pid, &status, WNOHANG);
519 	}
520 }
521 
522 /*
523  * Some signals (currently, SIGINT) must be forwarded on to the process
524  * group of the child process.
525  */
526 static void
527 sig_forward(int s)
528 {
529 	if (child_pid != -1) {
530 		pid_t pgid = getpgid(child_pid);
531 		if (pgid != -1)
532 			(void) sigsend(P_PGID, pgid, s);
533 	}
534 }
535 
536 /*
537  * reset terminal settings for global environment
538  */
539 static void
540 reset_tty()
541 {
542 	(void) tcsetattr(save_fd, TCSADRAIN, &save_termios);
543 }
544 
/*
 * Convert character to printable representation, for display with locally
 * echoed command characters (like when we need to display ~^D).
 *
 *   - printable characters are copied as-is;
 *   - control characters 0 through 31 become caret notation (^@ .. ^_);
 *   - anything else becomes a backslash followed by three octal digits.
 *
 * cc must have room for at least CANONIFY_LEN bytes; the result is always
 * NUL-terminated.
 */
static void
canonify(char c, char *cc)
{
	/*
	 * Work on the unsigned value of the byte.  Plain char may be signed,
	 * in which case passing a byte >= 0x80 to isprint() is undefined and
	 * shifting it drags sign bits into the first octal digit (0xff would
	 * come out as \777 rather than \377).
	 */
	unsigned char uc = (unsigned char)c;

	if (isprint(uc)) {
		cc[0] = c;
		cc[1] = '\0';
	} else if (uc <= 31) {	/* ^@ through ^_ */
		cc[0] = '^';
		cc[1] = uc + '@';
		cc[2] = '\0';
	} else {
		cc[0] = '\\';
		cc[1] = ((uc >> 6) & 7) + '0';
		cc[2] = ((uc >> 3) & 7) + '0';
		cc[3] = (uc & 7) + '0';
		cc[4] = '\0';
	}
}
567 
568 /*
569  * process_user_input watches the input stream for the escape sequence for
570  * 'quit' (by default, tilde-period).  Because we might be fed just one
571  * keystroke at a time, state associated with the user input (are we at the
572  * beginning of the line?  are we locally echoing the next character?) is
573  * maintained by beginning_of_line and local_echo across calls to the routine.
574  * If the write to outfd fails, we'll try to read from infd in an attempt
575  * to prevent deadlock between the two processes.
576  *
577  * This routine returns -1 when the 'quit' escape sequence has been issued,
578  * and 0 otherwise.
579  */
580 static int
581 process_user_input(int outfd, int infd, char *buf, size_t nbytes)
582 {
583 	static boolean_t beginning_of_line = B_TRUE;
584 	static boolean_t local_echo = B_FALSE;
585 
586 	char c = *buf;
587 	for (c = *buf; nbytes > 0; c = *buf, --nbytes) {
588 		buf++;
589 		if (beginning_of_line && !nocmdchar) {
590 			beginning_of_line = B_FALSE;
591 			if (c == cmdchar) {
592 				local_echo = B_TRUE;
593 				continue;
594 			}
595 		} else if (local_echo) {
596 			local_echo = B_FALSE;
597 			if (c == '.' || c == effective_termios.c_cc[VEOF]) {
598 				char cc[CANONIFY_LEN];
599 
600 				canonify(c, cc);
601 				(void) write(STDOUT_FILENO, &cmdchar, 1);
602 				(void) write(STDOUT_FILENO, cc, strlen(cc));
603 				return (-1);
604 			}
605 		}
606 retry:
607 		if (write(outfd, &c, 1) <= 0) {
608 			/*
609 			 * Since the fd we are writing to is opened with
610 			 * O_NONBLOCK it is possible to get EAGAIN if the
611 			 * pipe is full.  One way this could happen is if we
612 			 * are writing a lot of data into the pipe in this loop
613 			 * and the application on the other end is echoing that
614 			 * data back out to its stdout.  The output pipe can
615 			 * fill up since we are stuck here in this loop and not
616 			 * draining the other pipe.  We can try to read some of
617 			 * the data to see if we can drain the pipe so that the
618 			 * application can continue to make progress.  The read
619 			 * is non-blocking so we won't hang here.  We also wait
620 			 * a bit before retrying since there could be other
621 			 * reasons why the pipe is full and we don't want to
622 			 * continuously retry.
623 			 */
624 			if (errno == EAGAIN) {
625 				struct timespec rqtp;
626 				int ln;
627 				char ibuf[ZLOGIN_BUFSIZ];
628 
629 				if ((ln = read(infd, ibuf, ZLOGIN_BUFSIZ)) > 0)
630 					(void) write(STDOUT_FILENO, ibuf, ln);
631 
632 				/* sleep for 10 milliseconds */
633 				rqtp.tv_sec = 0;
634 				rqtp.tv_nsec = 10 * (NANOSEC / MILLISEC);
635 				(void) nanosleep(&rqtp, NULL);
636 				if (!dead)
637 					goto retry;
638 			}
639 
640 			return (-1);
641 		}
642 		beginning_of_line = (c == '\r' || c == '\n' ||
643 		    c == effective_termios.c_cc[VKILL] ||
644 		    c == effective_termios.c_cc[VEOL] ||
645 		    c == effective_termios.c_cc[VSUSP] ||
646 		    c == effective_termios.c_cc[VINTR]);
647 	}
648 	return (0);
649 }
650 
651 /*
652  * This function prevents deadlock between zlogin and the application in the
653  * zone that it is talking to.  This can happen when we read from zlogin's
654  * stdin and write the data down the pipe to the application.  If the pipe
655  * is full, we'll block in the write.  Because zlogin could be blocked in
656  * the write, it would never read the application's stdout/stderr so the
657  * application can then block on those writes (when the pipe fills up).  If the
658  * the application gets blocked this way, it can never get around to reading
659  * its stdin so that zlogin can unblock from its write.  Once in this state,
660  * the two processes are deadlocked.
661  *
662  * To prevent this, we want to verify that we can write into the pipe before we
663  * read from our stdin.  If the pipe already is pretty full, we bypass the read
664  * for now.  We'll circle back here again after the poll() so that we can
665  * try again.  When this function is called, we already know there is data
666  * ready to read on STDIN_FILENO.  We return -1 if there is a problem, 0
667  * if everything is ok (even though we might not have read/written any data
668  * into the pipe on this iteration).
669  */
670 static int
671 process_raw_input(int stdin_fd, int appin_fd)
672 {
673 	int cc;
674 	struct stat sb;
675 	char ibuf[ZLOGIN_RDBUFSIZ];
676 
677 	/* Check how much data is already in the pipe */
678 	if (fstat(appin_fd, &sb) == -1) {
679 		perror("stat failed");
680 		return (-1);
681 	}
682 
683 	if (dead)
684 		return (-1);
685 
686 	/*
687 	 * The pipe already has a lot of data in it,  don't write any more
688 	 * right now.
689 	 */
690 	if (sb.st_size >= HI_WATER)
691 		return (0);
692 
693 	cc = read(STDIN_FILENO, ibuf, ZLOGIN_RDBUFSIZ);
694 	if (cc == -1 && (errno != EINTR || dead))
695 		return (-1);
696 
697 	if (cc == -1)	/* The read was interrupted. */
698 		return (0);
699 
700 	/*
701 	 * stdin_fd is stdin of the target; so, the thing we'll write the user
702 	 * data *to*.  Also, unlike on the output side, we propagate
703 	 * zero-length messages to the other side.
704 	 */
705 	if (write(stdin_fd, ibuf, cc) == -1)
706 		return (-1);
707 
708 	return (0);
709 }
710 
711 /*
712  * Write the output from the application running in the zone.  We can get
713  * a signal during the write (usually it would be SIGCHLD when the application
714  * has exited) so we loop to make sure we have written all of the data we read.
715  */
716 static int
717 process_output(int in_fd, int out_fd)
718 {
719 	int wrote = 0;
720 	int cc;
721 	char ibuf[ZLOGIN_BUFSIZ];
722 
723 	cc = read(in_fd, ibuf, ZLOGIN_BUFSIZ);
724 	if (cc == -1 && (errno != EINTR || dead))
725 		return (-1);
726 	if (cc == 0)	/* EOF */
727 		return (-1);
728 	if (cc == -1)	/* The read was interrupted. */
729 		return (0);
730 
731 	do {
732 		int len;
733 
734 		len = write(out_fd, ibuf + wrote, cc - wrote);
735 		if (len == -1 && errno != EINTR)
736 			return (-1);
737 		if (len != -1)
738 			wrote += len;
739 	} while (wrote < cc);
740 
741 	return (0);
742 }
743 
744 /*
745  * This is the main I/O loop, and is shared across all zlogin modes.
746  * Parameters:
747  * 	stdin_fd:  The fd representing 'stdin' for the slave side; input to
748  *	           the zone will be written here.
749  *
750  * 	appin_fd:  The fd representing the other end of the 'stdin' pipe (when
751  *		   we're running non-interactive); used in process_raw_input
752  *		   to ensure we don't fill up the application's stdin pipe.
753  *
754  *	stdout_fd: The fd representing 'stdout' for the slave side; output
755  *	           from the zone will arrive here.
756  *
757  *	stderr_fd: The fd representing 'stderr' for the slave side; output
758  *	           from the zone will arrive here.
759  *
760  *	raw_mode:  If TRUE, then no processing (for example, for '~.') will
761  *	           be performed on the input coming from STDIN.
762  *
763  * stderr_fd may be specified as -1 if there is no stderr (only non-interactive
764  * mode supplies a stderr).
765  *
766  */
767 static void
768 doio(int stdin_fd, int appin_fd, int stdout_fd, int stderr_fd,
769     boolean_t raw_mode)
770 {
771 	struct pollfd pollfds[3];
772 	char ibuf[ZLOGIN_BUFSIZ];
773 	int cc, ret;
774 
775 	/* read from stdout of zone and write to stdout of global zone */
776 	pollfds[0].fd = stdout_fd;
777 	pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI;
778 
779 	/* read from stderr of zone and write to stderr of global zone */
780 	pollfds[1].fd = stderr_fd;
781 	pollfds[1].events = pollfds[0].events;
782 
783 	/* read from stdin of global zone and write to stdin of zone */
784 	pollfds[2].fd = STDIN_FILENO;
785 	pollfds[2].events = pollfds[0].events;
786 
787 	for (;;) {
788 		pollfds[0].revents = pollfds[1].revents =
789 		    pollfds[2].revents = 0;
790 
791 		if (dead)
792 			break;
793 
794 		ret = poll(pollfds,
795 		    sizeof (pollfds) / sizeof (struct pollfd), -1);
796 		if (ret == -1 && errno != EINTR) {
797 			perror("poll failed");
798 			break;
799 		}
800 
801 		if (errno == EINTR && dead) {
802 			break;
803 		}
804 
805 		/* event from master side stdout */
806 		if (pollfds[0].revents) {
807 			if (pollfds[0].revents &
808 			    (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) {
809 				if (process_output(stdout_fd, STDOUT_FILENO)
810 				    != 0)
811 					break;
812 			} else {
813 				pollerr = pollfds[0].revents;
814 				break;
815 			}
816 		}
817 
818 		/* event from master side stderr */
819 		if (pollfds[1].revents) {
820 			if (pollfds[1].revents &
821 			    (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) {
822 				if (process_output(stderr_fd, STDERR_FILENO)
823 				    != 0)
824 					break;
825 			} else {
826 				pollerr = pollfds[1].revents;
827 				break;
828 			}
829 		}
830 
831 		/* event from user STDIN side */
832 		if (pollfds[2].revents) {
833 			if (pollfds[2].revents &
834 			    (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) {
835 				/*
836 				 * stdin fd is stdin of the target; so,
837 				 * the thing we'll write the user data *to*.
838 				 *
839 				 * Also, unlike on the output side, we
840 				 * propagate zero-length messages to the
841 				 * other side.
842 				 */
843 				if (raw_mode == B_TRUE) {
844 					if (process_raw_input(stdin_fd,
845 					    appin_fd) == -1)
846 						break;
847 				} else {
848 					cc = read(STDIN_FILENO, ibuf,
849 					    ZLOGIN_RDBUFSIZ);
850 					if (cc == -1 &&
851 					    (errno != EINTR || dead))
852 						break;
853 
854 					if (cc != -1 &&
855 					    process_user_input(stdin_fd,
856 					    stdout_fd, ibuf, cc) == -1)
857 						break;
858 				}
859 			} else if (raw_mode == B_TRUE &&
860 			    pollfds[2].revents & POLLHUP) {
861 				/*
862 				 * It's OK to get a POLLHUP on STDIN-- it
863 				 * always happens if you do:
864 				 *
865 				 * echo foo | zlogin <zone> <command>
866 				 *
867 				 * We reset fd to -1 in this case to clear
868 				 * the condition and write an EOF to the
869 				 * other side in order to wrap things up.
870 				 */
871 				pollfds[2].fd = -1;
872 				(void) write(stdin_fd, ibuf, 0);
873 			} else {
874 				pollerr = pollfds[2].revents;
875 				break;
876 			}
877 		}
878 	}
879 
880 	/*
881 	 * We are in the midst of dying, but try to poll with a short
882 	 * timeout to see if we can catch the last bit of I/O from the
883 	 * children.
884 	 */
885 retry:
886 	pollfds[0].revents = pollfds[1].revents = 0;
887 	(void) poll(pollfds, 2, 100);
888 	if (pollfds[0].revents &
889 	    (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) {
890 		if ((cc = read(stdout_fd, ibuf, ZLOGIN_BUFSIZ)) > 0) {
891 			(void) write(STDOUT_FILENO, ibuf, cc);
892 			goto retry;
893 		}
894 	}
895 	if (pollfds[1].revents &
896 	    (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) {
897 		if ((cc = read(stderr_fd, ibuf, ZLOGIN_BUFSIZ)) > 0) {
898 			(void) write(STDERR_FILENO, ibuf, cc);
899 			goto retry;
900 		}
901 	}
902 }
903 
904 static char **
905 zone_login_cmd(brand_handle_t bh, const char *login)
906 {
907 	static char result_buf[ARG_MAX];
908 	char **new_argv, *ptr, *lasts;
909 	int n, a;
910 
911 	/* Get the login command for the target zone. */
912 	bzero(result_buf, sizeof (result_buf));
913 	if (brand_get_login_cmd(bh, login,
914 	    result_buf, sizeof (result_buf)) != 0)
915 		return (NULL);
916 
917 	/*
918 	 * We got back a string that we'd like to execute.  But since
919 	 * we're not doing the execution via a shell we'll need to convert
920 	 * the exec string to an array of strings.  We'll do that here
921 	 * but we're going to be very simplistic about it and break stuff
922 	 * up based on spaces.  We're not even going to support any kind
923 	 * of quoting or escape characters.  It's truly amazing that
924 	 * there is no library function in OpenSolaris to do this for us.
925 	 */
926 
927 	/*
928 	 * Be paranoid.  Since we're deliniating based on spaces make
929 	 * sure there are no adjacent spaces.
930 	 */
931 	if (strstr(result_buf, "  ") != NULL)
932 		return (NULL);
933 
934 	/* Remove any trailing whitespace.  */
935 	n = strlen(result_buf);
936 	if (result_buf[n - 1] == ' ')
937 		result_buf[n - 1] = '\0';
938 
939 	/* Count how many elements there are in the exec string. */
940 	ptr = result_buf;
941 	for (n = 2; ((ptr = strchr(ptr + 1, (int)' ')) != NULL); n++)
942 		;
943 
944 	/* Allocate the argv array that we're going to return. */
945 	if ((new_argv = malloc(sizeof (char *) * n)) == NULL)
946 		return (NULL);
947 
948 	/* Tokenize the exec string and return. */
949 	a = 0;
950 	new_argv[a++] = result_buf;
951 	if (n > 2) {
952 		(void) strtok_r(result_buf, " ", &lasts);
953 		while ((new_argv[a++] = strtok_r(NULL, " ", &lasts)) != NULL)
954 			;
955 	} else {
956 		new_argv[a++] = NULL;
957 	}
958 	assert(n == a);
959 	return (new_argv);
960 }
961 
962 /*
963  * Prepare argv array for exec'd process; if we're passing commands to the
964  * new process, then use su(1M) to do the invocation.  Otherwise, use
965  * 'login -z <from_zonename> -f' (-z is an undocumented option which tells
966  * login that we're coming from another zone, and to disregard its CONSOLE
967  * checks).
968  */
969 static char **
970 prep_args(brand_handle_t bh, const char *login, char **argv)
971 {
972 	int argc = 0, a = 0, i, n = -1;
973 	char **new_argv;
974 
975 	if (argv != NULL) {
976 		size_t subshell_len = 1;
977 		char *subshell;
978 
979 		while (argv[argc] != NULL)
980 			argc++;
981 
982 		for (i = 0; i < argc; i++) {
983 			subshell_len += strlen(argv[i]) + 1;
984 		}
985 		if ((subshell = calloc(1, subshell_len)) == NULL)
986 			return (NULL);
987 
988 		for (i = 0; i < argc; i++) {
989 			(void) strcat(subshell, argv[i]);
990 			(void) strcat(subshell, " ");
991 		}
992 
993 		if (failsafe) {
994 			n = 4;
995 			if ((new_argv = malloc(sizeof (char *) * n)) == NULL)
996 				return (NULL);
997 
998 			new_argv[a++] = FAILSAFESHELL;
999 		} else {
1000 			n = 5;
1001 			if ((new_argv = malloc(sizeof (char *) * n)) == NULL)
1002 				return (NULL);
1003 
1004 			new_argv[a++] = SUPATH;
1005 			new_argv[a++] = (char *)login;
1006 		}
1007 		new_argv[a++] = "-c";
1008 		new_argv[a++] = subshell;
1009 		new_argv[a++] = NULL;
1010 		assert(a == n);
1011 	} else {
1012 		if (failsafe) {
1013 			n = 2;
1014 			if ((new_argv = malloc(sizeof (char *) * n)) == NULL)
1015 				return (NULL);
1016 			new_argv[a++] = FAILSAFESHELL;
1017 			new_argv[a++] = NULL;
1018 			assert(n == a);
1019 		} else {
1020 			new_argv = zone_login_cmd(bh, login);
1021 		}
1022 	}
1023 
1024 	return (new_argv);
1025 }
1026 
/*
 * Helper routine for prep_env below.  Returns a freshly malloc()ed
 * "name=value" string, or NULL if memory could not be allocated.
 */
static char *
add_env(char *name, char *value)
{
	size_t nlen = strlen(name);
	size_t vlen = strlen(value);
	char *entry;

	/* name, '=', value, NUL */
	entry = malloc(nlen + vlen + 2);
	if (entry == NULL)
		return (NULL);

	(void) memcpy(entry, name, nlen);
	entry[nlen] = '=';
	/* vlen + 1 so that value's terminating NUL comes along */
	(void) memcpy(entry + nlen + 1, value, vlen + 1);

	return (entry);
}
1042 
1043 /*
1044  * Prepare envp array for exec'd process.
1045  */
1046 static char **
1047 prep_env()
1048 {
1049 	int e = 0, size = 1;
1050 	char **new_env, *estr;
1051 	char *term = getenv("TERM");
1052 
1053 	size++;	/* for $PATH */
1054 	if (term != NULL)
1055 		size++;
1056 
1057 	/*
1058 	 * In failsafe mode we set $HOME, since '-l' isn't valid in this mode.
1059 	 * We also set $SHELL, since neither login nor su will be around to do
1060 	 * it.
1061 	 */
1062 	if (failsafe)
1063 		size += 2;
1064 
1065 	if ((new_env = malloc(sizeof (char *) * size)) == NULL)
1066 		return (NULL);
1067 
1068 	if ((estr = add_env("PATH", DEF_PATH)) == NULL)
1069 		return (NULL);
1070 	new_env[e++] = estr;
1071 
1072 	if (term != NULL) {
1073 		if ((estr = add_env("TERM", term)) == NULL)
1074 			return (NULL);
1075 		new_env[e++] = estr;
1076 	}
1077 
1078 	if (failsafe) {
1079 		if ((estr = add_env("HOME", "/")) == NULL)
1080 			return (NULL);
1081 		new_env[e++] = estr;
1082 
1083 		if ((estr = add_env("SHELL", FAILSAFESHELL)) == NULL)
1084 			return (NULL);
1085 		new_env[e++] = estr;
1086 	}
1087 
1088 	new_env[e++] = NULL;
1089 
1090 	assert(e == size);
1091 
1092 	return (new_env);
1093 }
1094 
1095 /*
1096  * Finish the preparation of the envp array for exec'd non-interactive
1097  * zlogins.  This is called in the child process *after* we zone_enter(), since
1098  * it derives things we can only know within the zone, such as $HOME, $SHELL,
1099  * etc.  We need only do this in the non-interactive, mode, since otherwise
1100  * login(1) will do it.  We don't do this in failsafe mode, since it presents
1101  * additional ways in which the command could fail, and we'd prefer to avoid
1102  * that.
1103  */
1104 static char **
1105 prep_env_noninteractive(char *login, char **env)
1106 {
1107 	size_t size;
1108 	struct passwd *pw;
1109 	char **new_env;
1110 	int e, i;
1111 	char *estr;
1112 	char varmail[LOGNAME_MAX + 11]; /* strlen(/var/mail/) = 10, NUL */
1113 
1114 	assert(env != NULL);
1115 	assert(failsafe == 0);
1116 
1117 	/*
1118 	 * Get existing envp size.
1119 	 */
1120 	for (size = 0; env[size] != NULL; size++)
1121 		;
1122 	e = size;
1123 
1124 	/*
1125 	 * Finish filling out the environment; we duplicate the environment
1126 	 * setup described in login(1), for lack of a better precedent.
1127 	 */
1128 	if ((pw = getpwnam(login)) != NULL) {
1129 		size += 3;	/* LOGNAME, HOME, MAIL */
1130 	}
1131 	size++;	/* always fill in SHELL */
1132 	size++; /* terminating NULL */
1133 
1134 	if ((new_env = malloc(sizeof (char *) * size)) == NULL)
1135 		goto malloc_fail;
1136 
1137 	/*
1138 	 * Copy existing elements of env into new_env.
1139 	 */
1140 	for (i = 0; env[i] != NULL; i++) {
1141 		if ((new_env[i] = strdup(env[i])) == NULL)
1142 			goto malloc_fail;
1143 	}
1144 	assert(e == i);
1145 
1146 	if (pw != NULL) {
1147 		if ((estr = add_env("LOGNAME", pw->pw_name)) == NULL)
1148 			goto malloc_fail;
1149 		new_env[e++] = estr;
1150 
1151 		if ((estr = add_env("HOME", pw->pw_dir)) == NULL)
1152 			goto malloc_fail;
1153 		new_env[e++] = estr;
1154 
1155 		if (chdir(pw->pw_dir) != 0)
1156 			zerror(gettext("Could not chdir to home directory "
1157 			    "%s: %s"), pw->pw_dir, strerror(errno));
1158 
1159 		(void) snprintf(varmail, sizeof (varmail), "/var/mail/%s",
1160 		    pw->pw_name);
1161 		if ((estr = add_env("MAIL", varmail)) == NULL)
1162 			goto malloc_fail;
1163 		new_env[e++] = estr;
1164 	}
1165 
1166 	if (pw != NULL && strlen(pw->pw_shell) > 0) {
1167 		if ((estr = add_env("SHELL", pw->pw_shell)) == NULL)
1168 			goto malloc_fail;
1169 		new_env[e++] = estr;
1170 	} else {
1171 		if ((estr = add_env("SHELL", DEFAULTSHELL)) == NULL)
1172 			goto malloc_fail;
1173 		new_env[e++] = estr;
1174 	}
1175 
1176 	new_env[e++] = NULL;	/* add terminating NULL */
1177 
1178 	assert(e == size);
1179 	return (new_env);
1180 
1181 malloc_fail:
1182 	zperror(gettext("failed to allocate memory for process environment"));
1183 	return (NULL);
1184 }
1185 
/*
 * Callback handed to fdwalk() by main(): close 'fd' unless it is the
 * descriptor whose number is stored in the int that 'slavefd' points to.
 * Always returns 0.
 */
static int
close_func(void *slavefd, int fd)
{
	int keep_fd = *(int *)slavefd;

	if (fd == keep_fd)
		return (0);

	(void) close(fd);
	return (0);
}
1193 
1194 static void
1195 set_cmdchar(char *cmdcharstr)
1196 {
1197 	char c;
1198 	long lc;
1199 
1200 	if ((c = *cmdcharstr) != '\\') {
1201 		cmdchar = c;
1202 		return;
1203 	}
1204 
1205 	c = cmdcharstr[1];
1206 	if (c == '\0' || c == '\\') {
1207 		cmdchar = '\\';
1208 		return;
1209 	}
1210 
1211 	if (c < '0' || c > '7') {
1212 		zerror(gettext("Unrecognized escape character option %s"),
1213 		    cmdcharstr);
1214 		usage();
1215 	}
1216 
1217 	lc = strtol(cmdcharstr + 1, NULL, 8);
1218 	if (lc < 0 || lc > 255) {
1219 		zerror(gettext("Octal escape character '%s' too large"),
1220 		    cmdcharstr);
1221 		usage();
1222 	}
1223 	cmdchar = (char)lc;
1224 }
1225 
1226 static int
1227 setup_utmpx(char *slavename)
1228 {
1229 	struct utmpx ut;
1230 
1231 	bzero(&ut, sizeof (ut));
1232 	(void) strncpy(ut.ut_user, ".zlogin", sizeof (ut.ut_user));
1233 	(void) strncpy(ut.ut_line, slavename, sizeof (ut.ut_line));
1234 	ut.ut_pid = getpid();
1235 	ut.ut_id[0] = 'z';
1236 	ut.ut_id[1] = ut.ut_id[2] = ut.ut_id[3] = (char)SC_WILDC;
1237 	ut.ut_type = LOGIN_PROCESS;
1238 	(void) time(&ut.ut_tv.tv_sec);
1239 
1240 	if (makeutx(&ut) == NULL) {
1241 		zerror(gettext("makeutx failed"));
1242 		return (-1);
1243 	}
1244 	return (0);
1245 }
1246 
/*
 * Give up the lock taken by grab_lock_file() by closing its descriptor.
 * The result of close() is deliberately ignored.
 */
static void
release_lock_file(int lockfd)
{
	(void) close(lockfd);
}
1252 
1253 static int
1254 grab_lock_file(const char *zone_name, int *lockfd)
1255 {
1256 	char pathbuf[PATH_MAX];
1257 	struct flock flock;
1258 
1259 	if (mkdir(ZONES_TMPDIR, S_IRWXU) < 0 && errno != EEXIST) {
1260 		zerror(gettext("could not mkdir %s: %s"), ZONES_TMPDIR,
1261 		    strerror(errno));
1262 		return (-1);
1263 	}
1264 	(void) chmod(ZONES_TMPDIR, S_IRWXU);
1265 	(void) snprintf(pathbuf, sizeof (pathbuf), "%s/%s.zoneadm.lock",
1266 	    ZONES_TMPDIR, zone_name);
1267 
1268 	if ((*lockfd = open(pathbuf, O_RDWR|O_CREAT, S_IRUSR|S_IWUSR)) < 0) {
1269 		zerror(gettext("could not open %s: %s"), pathbuf,
1270 		    strerror(errno));
1271 		return (-1);
1272 	}
1273 	/*
1274 	 * Lock the file to synchronize with other zoneadmds
1275 	 */
1276 	flock.l_type = F_WRLCK;
1277 	flock.l_whence = SEEK_SET;
1278 	flock.l_start = (off_t)0;
1279 	flock.l_len = (off_t)0;
1280 	if (fcntl(*lockfd, F_SETLKW, &flock) < 0) {
1281 		zerror(gettext("unable to lock %s: %s"), pathbuf,
1282 		    strerror(errno));
1283 		release_lock_file(*lockfd);
1284 		return (-1);
1285 	}
1286 	return (Z_OK);
1287 }
1288 
1289 static int
1290 start_zoneadmd(const char *zone_name)
1291 {
1292 	pid_t retval;
1293 	int pstatus = 0, error = -1, lockfd, doorfd;
1294 	struct door_info info;
1295 	char doorpath[MAXPATHLEN];
1296 
1297 	(void) snprintf(doorpath, sizeof (doorpath), ZONE_DOOR_PATH, zone_name);
1298 
1299 	if (grab_lock_file(zone_name, &lockfd) != Z_OK)
1300 		return (-1);
1301 	/*
1302 	 * We must do the door check with the lock held.  Otherwise, we
1303 	 * might race against another zoneadm/zlogin process and wind
1304 	 * up with two processes trying to start zoneadmd at the same
1305 	 * time.  zoneadmd will detect this, and fail, but we prefer this
1306 	 * to be as seamless as is practical, from a user perspective.
1307 	 */
1308 	if ((doorfd = open(doorpath, O_RDONLY)) < 0) {
1309 		if (errno != ENOENT) {
1310 			zerror("failed to open %s: %s", doorpath,
1311 			    strerror(errno));
1312 			goto out;
1313 		}
1314 	} else {
1315 		/*
1316 		 * Seems to be working ok.
1317 		 */
1318 		if (door_info(doorfd, &info) == 0 &&
1319 		    ((info.di_attributes & DOOR_REVOKED) == 0)) {
1320 			error = 0;
1321 			goto out;
1322 		}
1323 	}
1324 
1325 	if ((child_pid = fork()) == -1) {
1326 		zperror(gettext("could not fork"));
1327 		goto out;
1328 	} else if (child_pid == 0) {
1329 		/* child process */
1330 		(void) execl("/usr/lib/zones/zoneadmd", "zoneadmd", "-z",
1331 		    zone_name, NULL);
1332 		zperror(gettext("could not exec zoneadmd"));
1333 		_exit(1);
1334 	}
1335 
1336 	/* parent process */
1337 	do {
1338 		retval = waitpid(child_pid, &pstatus, 0);
1339 	} while (retval != child_pid);
1340 	if (WIFSIGNALED(pstatus) ||
1341 	    (WIFEXITED(pstatus) && WEXITSTATUS(pstatus) != 0)) {
1342 		zerror(gettext("could not start %s"), "zoneadmd");
1343 		goto out;
1344 	}
1345 	error = 0;
1346 out:
1347 	release_lock_file(lockfd);
1348 	(void) close(doorfd);
1349 	return (error);
1350 }
1351 
1352 static int
1353 init_template(void)
1354 {
1355 	int fd;
1356 	int err = 0;
1357 
1358 	fd = open64(CTFS_ROOT "/process/template", O_RDWR);
1359 	if (fd == -1)
1360 		return (-1);
1361 
1362 	/*
1363 	 * zlogin doesn't do anything with the contract.
1364 	 * Deliver no events, don't inherit, and allow it to be orphaned.
1365 	 */
1366 	err |= ct_tmpl_set_critical(fd, 0);
1367 	err |= ct_tmpl_set_informative(fd, 0);
1368 	err |= ct_pr_tmpl_set_fatal(fd, CT_PR_EV_HWERR);
1369 	err |= ct_pr_tmpl_set_param(fd, CT_PR_PGRPONLY | CT_PR_REGENT);
1370 	if (err || ct_tmpl_activate(fd)) {
1371 		(void) close(fd);
1372 		return (-1);
1373 	}
1374 
1375 	return (fd);
1376 }
1377 
/*
 * Perform a non-interactive login: run new_args[0] inside the zone with
 * its stdin, stdout and stderr connected to three pipes instead of a pty.
 *
 * The child hooks the pipes up as fds 0-2, enters the zone, completes the
 * environment (unless in failsafe mode) and execs the command.  The parent
 * hands its ends of the pipes to doio() and then waits for the child.
 *
 * Returns 1 if any of the setup steps fails; otherwise returns
 * WEXITSTATUS() of the child's status (a failed waitpid() leaves the
 * status at 0).
 */
static int
noninteractive_login(char *zonename, zoneid_t zoneid, char *login,
    char **new_args, char **new_env)
{
	pid_t retval;
	int stdin_pipe[2], stdout_pipe[2], stderr_pipe[2];
	int child_status;
	int tmpl_fd;
	sigset_t block_cld;

	if ((tmpl_fd = init_template()) == -1) {
		reset_tty();
		zperror(gettext("could not create contract"));
		return (1);
	}

	/*
	 * NOTE(review): the early returns below leave tmpl_fd (and any
	 * pipes already created) open; the only caller visible here, main(),
	 * returns this function's result directly.
	 */
	if (pipe(stdin_pipe) != 0) {
		zperror(gettext("could not create STDIN pipe"));
		return (1);
	}
	/*
	 * When the user types ^D, we get a zero length message on STDIN.
	 * We need to echo that down the pipe to send it to the other side;
	 * but by default, pipes don't propagate zero-length messages.  We
	 * change that by setting the SNDZERO write option with I_SWROPT.
	 * See streamio(7i).
	 */
	if (ioctl(stdin_pipe[0], I_SWROPT, SNDZERO) != 0) {
		zperror(gettext("could not configure STDIN pipe"));
		return (1);

	}
	if (pipe(stdout_pipe) != 0) {
		zperror(gettext("could not create STDOUT pipe"));
		return (1);
	}
	if (pipe(stderr_pipe) != 0) {
		zperror(gettext("could not create STDERR pipe"));
		return (1);
	}

	/*
	 * If any of the pipe FD's winds up being less than STDERR, then we
	 * have a mess on our hands-- and we are lacking some of the I/O
	 * streams we would expect anyway.  So we bail.
	 */
	if (stdin_pipe[0] <= STDERR_FILENO ||
	    stdin_pipe[1] <= STDERR_FILENO ||
	    stdout_pipe[0] <= STDERR_FILENO ||
	    stdout_pipe[1] <= STDERR_FILENO ||
	    stderr_pipe[0] <= STDERR_FILENO ||
	    stderr_pipe[1] <= STDERR_FILENO) {
		zperror(gettext("process lacks valid STDIN, STDOUT, STDERR"));
		return (1);
	}

	if (prefork_dropprivs() != 0) {
		zperror(gettext("could not allocate privilege set"));
		return (1);
	}

	/*
	 * Install the SIGCLD handler but keep the signal blocked until the
	 * parent is past the fork and has child_pid set.
	 */
	(void) sigset(SIGCLD, sigcld);
	(void) sigemptyset(&block_cld);
	(void) sigaddset(&block_cld, SIGCLD);
	(void) sigprocmask(SIG_BLOCK, &block_cld, NULL);

	if ((child_pid = fork()) == -1) {
		(void) ct_tmpl_clear(tmpl_fd);
		(void) close(tmpl_fd);
		zperror(gettext("could not fork"));
		return (1);
	} else if (child_pid == 0) { /* child process */
		(void) ct_tmpl_clear(tmpl_fd);

		/*
		 * Do a dance to get the pipes hooked up as FD's 0, 1 and 2.
		 * The child takes the [1] end of each pipe; everything
		 * above STDERR (including tmpl_fd) is then closed.
		 */
		(void) close(STDIN_FILENO);
		(void) close(STDOUT_FILENO);
		(void) close(STDERR_FILENO);
		(void) dup2(stdin_pipe[1], STDIN_FILENO);
		(void) dup2(stdout_pipe[1], STDOUT_FILENO);
		(void) dup2(stderr_pipe[1], STDERR_FILENO);
		(void) closefrom(STDERR_FILENO + 1);

		/* Undo the parent's SIGCLD handling for the command. */
		(void) sigset(SIGCLD, SIG_DFL);
		(void) sigprocmask(SIG_UNBLOCK, &block_cld, NULL);
		/*
		 * In case any of stdin, stdout or stderr are streams,
		 * anchor them to prevent malicious I_POPs.
		 */
		(void) ioctl(STDIN_FILENO, I_ANCHOR);
		(void) ioctl(STDOUT_FILENO, I_ANCHOR);
		(void) ioctl(STDERR_FILENO, I_ANCHOR);

		if (zone_enter(zoneid) == -1) {
			zerror(gettext("could not enter zone %s: %s"),
			    zonename, strerror(errno));
			_exit(1);
		}

		/*
		 * Now that we are inside the zone, fill in the parts of the
		 * environment that depend on it.  On failure
		 * prep_env_noninteractive() returns NULL, which is caught
		 * just below.
		 */
		if (!failsafe)
			new_env = prep_env_noninteractive(login, new_env);

		if (new_env == NULL) {
			_exit(1);
		}

		/*
		 * Move into a new process group; the zone_enter will have
		 * placed us into zsched's session, and we want to be in
		 * a unique process group.
		 */
		(void) setpgid(getpid(), getpid());

		(void) execve(new_args[0], new_args, new_env);
		/* only reached if the exec failed */
		zperror(gettext("exec failure"));
		_exit(1);
	}
	/* parent */
	(void) sigset(SIGINT, sig_forward);

	postfork_dropprivs();

	/* The template has served its purpose once the child exists. */
	(void) ct_tmpl_clear(tmpl_fd);
	(void) close(tmpl_fd);

	(void) sigprocmask(SIG_UNBLOCK, &block_cld, NULL);
	/* The parent drives the [0] ends (plus stdin_pipe[1]) through doio(). */
	doio(stdin_pipe[0], stdin_pipe[1], stdout_pipe[0], stderr_pipe[0],
	    B_TRUE);
	/*
	 * Collect the child's status.  The loop repeats until waitpid()
	 * returns the child's pid or errno is ECHILD; whenever waitpid()
	 * fails the status is zeroed, so the result below is then 0.
	 */
	do {
		retval = waitpid(child_pid, &child_status, 0);
		if (retval == -1) {
			child_status = 0;
		}
	} while (retval != child_pid && errno != ECHILD);

	return (WEXITSTATUS(child_status));
}
1516 
/*
 * zlogin entry point.
 *
 * Parses the options (-C console, -E no escape character, -R alternate
 * root, -S failsafe, -e escape character, -l login name), validates the
 * option combination, the target zone and the caller's privileges, and
 * then performs one of:
 *   - a console login (-C), run through doio() on masterfd;
 *   - a non-interactive login, delegated to noninteractive_login();
 *   - an interactive login over a newly set up pty.
 *
 * Returns 0 on success and 1 on failure; some option errors call usage()
 * or exit(2) instead.  For a non-interactive login the return value is
 * whatever noninteractive_login() returns.
 */
int
main(int argc, char **argv)
{
	int arg, console = 0;
	zoneid_t zoneid;
	zone_state_t st;
	char *login = "root";
	int lflag = 0;
	char *zonename = NULL;
	char **proc_args = NULL;
	char **new_args, **new_env;
	sigset_t block_cld;
	char devroot[MAXPATHLEN];
	char *slavename, slaveshortname[MAXPATHLEN];
	priv_set_t *privset;
	int tmpl_fd;
	char zonebrand[MAXNAMELEN];
	struct stat sb;
	char kernzone[ZONENAME_MAX];
	brand_handle_t bh;

	(void) setlocale(LC_ALL, "");
	(void) textdomain(TEXT_DOMAIN);

	(void) getpname(argv[0]);

	while ((arg = getopt(argc, argv, "ECR:Se:l:")) != EOF) {
		switch (arg) {
		case 'C':
			console = 1;
			break;
		case 'E':
			nocmdchar = 1;
			break;
		case 'R':	/* undocumented */
			if (*optarg != '/') {
				zerror(gettext("root path must be absolute."));
				exit(2);
			}
			if (stat(optarg, &sb) == -1 || !S_ISDIR(sb.st_mode)) {
				zerror(
				    gettext("root path must be a directory."));
				exit(2);
			}
			zonecfg_set_root(optarg);
			break;
		case 'S':
			failsafe = 1;
			break;
		case 'e':
			set_cmdchar(optarg);
			break;
		case 'l':
			login = optarg;
			lflag = 1;
			break;
		default:
			usage();
		}
	}

	/*
	 * Reject option combinations that make no sense together.
	 */
	if (console != 0 && lflag != 0) {
		zerror(gettext("-l may not be specified for console login"));
		usage();
	}

	if (console != 0 && failsafe != 0) {
		zerror(gettext("-S may not be specified for console login"));
		usage();
	}

	if (console != 0 && zonecfg_in_alt_root()) {
		zerror(gettext("-R may not be specified for console login"));
		exit(2);
	}

	if (failsafe != 0 && lflag != 0) {
		zerror(gettext("-l may not be specified for failsafe login"));
		usage();
	}

	if (optind == (argc - 1)) {
		/*
		 * zone name, no process name; this should be an interactive
		 * login as long as STDIN is really a tty.
		 */
		if (isatty(STDIN_FILENO))
			interactive = 1;
		zonename = argv[optind];
	} else if (optind < (argc - 1)) {
		if (console) {
			zerror(gettext("Commands may not be specified for "
			    "console login."));
			usage();
		}
		/* zone name and process name, and possibly some args */
		zonename = argv[optind];
		proc_args = &argv[optind + 1];
		interactive = 0;
	} else {
		/* no zone name was given */
		usage();
	}

	if (getzoneid() != GLOBAL_ZONEID) {
		zerror(gettext("'%s' may only be used from the global zone"),
		    pname);
		return (1);
	}

	if (strcmp(zonename, GLOBAL_ZONENAME) == 0) {
		zerror(gettext("'%s' not applicable to the global zone"),
		    pname);
		return (1);
	}

	if (zone_get_state(zonename, &st) != Z_OK) {
		zerror(gettext("zone '%s' unknown"), zonename);
		return (1);
	}

	if (st < ZONE_STATE_INSTALLED) {
		zerror(gettext("cannot login to a zone which is '%s'"),
		    zone_state_str(st));
		return (1);
	}

	/*
	 * In both console and non-console cases, we require all privs.
	 * In the console case, because we may need to startup zoneadmd.
	 * In the non-console case in order to do zone_enter(2), zonept()
	 * and other tasks.
	 *
	 * Future work: this solution is temporary.  Ultimately, we need to
	 * move to a flexible system which allows the global admin to
	 * designate that a particular user can zlogin (and probably zlogin
	 * -C) to a particular zone.  This all-root business we have now is
	 * quite sketchy.
	 */
	if ((privset = priv_allocset()) == NULL) {
		zperror(gettext("priv_allocset failed"));
		return (1);
	}

	if (getppriv(PRIV_EFFECTIVE, privset) != 0) {
		zperror(gettext("getppriv failed"));
		priv_freeset(privset);
		return (1);
	}

	if (priv_isfullset(privset) == B_FALSE) {
		zerror(gettext("You lack sufficient privilege to run "
		    "this command (all privs required)"));
		priv_freeset(privset);
		return (1);
	}
	priv_freeset(privset);

	/*
	 * The console is a separate case from the rest of the code; handle
	 * it first.
	 */
	if (console) {

		/*
		 * Ensure that zoneadmd for this zone is running.
		 */
		if (start_zoneadmd(zonename) == -1)
			return (1);

		/*
		 * Make contact with zoneadmd.
		 */
		if (get_console_master(zonename) == -1)
			return (1);

		(void) printf(gettext("[Connected to zone '%s' console]\n"),
		    zonename);

		if (set_tty_rawmode(STDIN_FILENO) == -1) {
			reset_tty();
			zperror(gettext("failed to set stdin pty to raw mode"));
			return (1);
		}

		/*
		 * Handle later window size changes, and invoke the handler
		 * once by hand right away.
		 */
		(void) sigset(SIGWINCH, sigwinch);
		(void) sigwinch(0);

		/*
		 * Run the I/O loop until we get disconnected.
		 */
		doio(masterfd, -1, masterfd, -1, B_FALSE);
		reset_tty();
		(void) printf(gettext("\n[Connection to zone '%s' console "
		    "closed]\n"), zonename);

		return (0);
	}

	if (st != ZONE_STATE_RUNNING && st != ZONE_STATE_MOUNTED) {
		zerror(gettext("login allowed only to running zones "
		    "(%s is '%s')."), zonename, zone_state_str(st));
		return (1);
	}

	/*
	 * The name used for the zoneid lookup (kernzone) defaults to the
	 * name given by the user; with an alternate root it is replaced by
	 * the result of the scratch zone lookup.
	 */
	(void) strlcpy(kernzone, zonename, sizeof (kernzone));
	if (zonecfg_in_alt_root()) {
		FILE *fp = zonecfg_open_scratch("", B_FALSE);

		if (fp == NULL || zonecfg_find_scratch(fp, zonename,
		    zonecfg_get_root(), kernzone, sizeof (kernzone)) == -1) {
			zerror(gettext("cannot find scratch zone %s"),
			    zonename);
			if (fp != NULL)
				zonecfg_close_scratch(fp);
			return (1);
		}
		zonecfg_close_scratch(fp);
	}

	if ((zoneid = getzoneidbyname(kernzone)) == -1) {
		zerror(gettext("failed to get zoneid for zone '%s'"),
		    zonename);
		return (1);
	}

	/*
	 * We need the zone root path only if we are setting up a pty.
	 */
	if (zone_get_devroot(zonename, devroot, sizeof (devroot)) == -1) {
		zerror(gettext("could not get dev path for zone %s"),
		    zonename);
		return (1);
	}

	/* Get a handle to the brand info for this zone */
	if ((zone_get_brand(zonename, zonebrand, sizeof (zonebrand)) != Z_OK) ||
	    ((bh = brand_open(zonebrand)) == NULL)) {
		zerror(gettext("could not get brand for zone %s"), zonename);
		return (1);
	}
	/* The brand handle is only needed while the arguments are built. */
	if ((new_args = prep_args(bh, login, proc_args)) == NULL) {
		zperror(gettext("could not assemble new arguments"));
		brand_close(bh);
		return (1);
	}
	brand_close(bh);

	if ((new_env = prep_env()) == NULL) {
		zperror(gettext("could not assemble new environment"));
		return (1);
	}

	if (!interactive)
		return (noninteractive_login(zonename, zoneid, login, new_args,
		    new_env));

	if (zonecfg_in_alt_root()) {
		zerror(gettext("cannot use interactive login with scratch "
		    "zone"));
		return (1);
	}

	/*
	 * Things are more complex in interactive mode; we get the
	 * master side of the pty, then place the user's terminal into
	 * raw mode.
	 */
	if (get_master_pty() == -1) {
		zerror(gettext("could not setup master pty device"));
		return (1);
	}

	/*
	 * Compute the "short name" of the pts.  /dev/pts/2 --> pts/2
	 */
	if ((slavename = ptsname(masterfd)) == NULL) {
		zperror(gettext("failed to get name for pseudo-tty"));
		return (1);
	}
	if (strncmp(slavename, "/dev/", strlen("/dev/")) == 0)
		(void) strlcpy(slaveshortname, slavename + strlen("/dev/"),
		    sizeof (slaveshortname));
	else
		(void) strlcpy(slaveshortname, slavename,
		    sizeof (slaveshortname));

	(void) printf(gettext("[Connected to zone '%s' %s]\n"), zonename,
	    slaveshortname);

	if (set_tty_rawmode(STDIN_FILENO) == -1) {
		reset_tty();
		zperror(gettext("failed to set stdin pty to raw mode"));
		return (1);
	}

	if (prefork_dropprivs() != 0) {
		reset_tty();
		zperror(gettext("could not allocate privilege set"));
		return (1);
	}

	/*
	 * We must mask SIGCLD until after we have coped with the fork
	 * sufficiently to deal with it; otherwise we can race and receive the
	 * signal before child_pid has been initialized (yes, this really
	 * happens).
	 */
	(void) sigset(SIGCLD, sigcld);
	(void) sigemptyset(&block_cld);
	(void) sigaddset(&block_cld, SIGCLD);
	(void) sigprocmask(SIG_BLOCK, &block_cld, NULL);

	/*
	 * We activate the contract template at the last minute to
	 * avoid intermediate functions that could be using fork(2)
	 * internally.
	 */
	if ((tmpl_fd = init_template()) == -1) {
		reset_tty();
		zperror(gettext("could not create contract"));
		return (1);
	}

	if ((child_pid = fork()) == -1) {
		/*
		 * NOTE(review): tmpl_fd is cleared but not closed on this
		 * path; main() returns immediately afterwards.
		 */
		(void) ct_tmpl_clear(tmpl_fd);
		reset_tty();
		zperror(gettext("could not fork"));
		return (1);
	} else if (child_pid == 0) { /* child process */
		int slavefd, newslave;

		/*
		 * NOTE(review): the failure paths in this child branch
		 * leave via 'return (1)' from main() rather than _exit().
		 */
		(void) ct_tmpl_clear(tmpl_fd);
		(void) close(tmpl_fd);

		(void) sigprocmask(SIG_UNBLOCK, &block_cld, NULL);

		if ((slavefd = init_slave_pty(zoneid, devroot)) == -1)
			return (1);

		/*
		 * Close all fds except for the slave pty.
		 */
		(void) fdwalk(close_func, &slavefd);

		/*
		 * Temporarily dup slavefd to stderr; that way if we have
		 * to print out that zone_enter failed, the output will
		 * have somewhere to go.
		 */
		if (slavefd != STDERR_FILENO)
			(void) dup2(slavefd, STDERR_FILENO);

		if (zone_enter(zoneid) == -1) {
			zerror(gettext("could not enter zone %s: %s"),
			    zonename, strerror(errno));
			return (1);
		}

		/* The temporary stderr is no longer needed. */
		if (slavefd != STDERR_FILENO)
			(void) close(STDERR_FILENO);

		/*
		 * We take pains to get this process into a new process
		 * group, and subsequently a new session.  In this way,
		 * we'll have a session which doesn't yet have a controlling
		 * terminal.  When we open the slave, it will become the
		 * controlling terminal; no PIDs concerning pgrps or sids
		 * will leak inappropriately into the zone.
		 */
		(void) setpgrp();

		/*
		 * We need the slave pty to be referenced from the zone's
		 * /dev in order to ensure that the devt's, etc are all
		 * correct.  Otherwise we break ttyname and the like.
		 */
		if ((newslave = open(slavename, O_RDWR)) == -1) {
			(void) close(slavefd);
			return (1);
		}
		(void) close(slavefd);
		slavefd = newslave;

		/*
		 * dup the slave to the various FDs, so that when the
		 * spawned process does a write/read it maps to the slave
		 * pty.
		 */
		(void) dup2(slavefd, STDIN_FILENO);
		(void) dup2(slavefd, STDOUT_FILENO);
		(void) dup2(slavefd, STDERR_FILENO);
		if (slavefd != STDIN_FILENO && slavefd != STDOUT_FILENO &&
		    slavefd != STDERR_FILENO) {
			(void) close(slavefd);
		}

		/*
		 * In failsafe mode, we don't use login(1), so don't try
		 * setting up a utmpx entry.
		 *
		 * A branded zone may have very different utmpx semantics.
		 * At the moment, we only have two brand types:
		 * Solaris-like (native, sn1) and Linux.  In the Solaris
		 * case, we know exactly how to do the necessary utmpx
		 * setup.  Fortunately for us, the Linux /bin/login is
		 * prepared to deal with a non-initialized utmpx entry, so
		 * we can simply skip it.  If future brands don't fall into
		 * either category, we'll have to add a per-brand utmpx
		 * setup hook.
		 */
		if (!failsafe && (strcmp(zonebrand, "lx") != 0))
			if (setup_utmpx(slaveshortname) == -1)
				return (1);

		(void) execve(new_args[0], new_args, new_env);
		/* only reached if the exec failed */
		zperror(gettext("exec failure"));
		return (1);
	}
	(void) ct_tmpl_clear(tmpl_fd);
	(void) close(tmpl_fd);

	/*
	 * The rest is only for the parent process.
	 */
	(void) sigset(SIGWINCH, sigwinch);

	postfork_dropprivs();

	/* child_pid is set now, so SIGCLD may be delivered. */
	(void) sigprocmask(SIG_UNBLOCK, &block_cld, NULL);
	doio(masterfd, -1, masterfd, -1, B_FALSE);

	reset_tty();
	(void) fprintf(stderr,
	    gettext("\n[Connection to zone '%s' %s closed]\n"), zonename,
	    slaveshortname);

	/* A non-zero pollerr turns the session's end into a failure. */
	if (pollerr != 0) {
		(void) fprintf(stderr, gettext("Error: connection closed due "
		    "to unexpected pollevents=0x%x.\n"), pollerr);
		return (1);
	}

	return (0);
}
1961