xref: /titanic_51/usr/src/uts/common/os/exec.c (revision 57ef7aa924e4bfdf3118d9b5b4285dfc94b632f3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1988 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/systm.h>
34 #include <sys/signal.h>
35 #include <sys/cred_impl.h>
36 #include <sys/policy.h>
37 #include <sys/user.h>
38 #include <sys/errno.h>
39 #include <sys/file.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/mman.h>
43 #include <sys/acct.h>
44 #include <sys/cpuvar.h>
45 #include <sys/proc.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/pathname.h>
49 #include <sys/vm.h>
50 #include <sys/lgrp.h>
51 #include <sys/vtrace.h>
52 #include <sys/exec.h>
53 #include <sys/exechdr.h>
54 #include <sys/kmem.h>
55 #include <sys/prsystm.h>
56 #include <sys/modctl.h>
57 #include <sys/vmparam.h>
58 #include <sys/door.h>
59 #include <sys/schedctl.h>
60 #include <sys/utrap.h>
61 #include <sys/systeminfo.h>
62 #include <sys/stack.h>
63 #include <sys/rctl.h>
64 #include <sys/dtrace.h>
65 #include <sys/lwpchan_impl.h>
66 #include <sys/pool.h>
67 #include <sys/sdt.h>
68 #include <sys/brand.h>
69 
70 #include <c2/audit.h>
71 
72 #include <vm/hat.h>
73 #include <vm/anon.h>
74 #include <vm/as.h>
75 #include <vm/seg.h>
76 #include <vm/seg_vn.h>
77 
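/*
 * Bits collected in the 'privflags' word during exec (see execsetid()
 * below); they tell gexec() how the credentials of the exec()ing
 * process must be adjusted.
 */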
78 #define	PRIV_RESET		0x01	/* needs to reset privs */
79 #define	PRIV_SETID		0x02	/* needs to change uids */
80 #define	PRIV_SETUGID		0x04	/* is setuid/setgid/forced privs */
81 #define	PRIV_INCREASE		0x08	/* child runs with more privs */
82 #define	MAC_FLAGS		0x10	/* need to adjust MAC flags */
83 
84 static int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *);
85 static int hold_execsw(struct execsw *);
86 
87 uint_t auxv_hwcap = 0;	/* auxv AT_SUN_HWCAP value; determined on the fly */
88 #if defined(_SYSCALL32_IMPL)
89 uint_t auxv_hwcap32 = 0;	/* 32-bit version of auxv_hwcap */
90 #endif
91 
92 #define	PSUIDFLAGS		(SNOCD|SUGID)
93 
94 /*
95  * exec() - wrapper around exece providing NULL environment pointer
96  */
97 int
98 exec(const char *fname, const char **argp)
99 {
100 	return (exece(fname, argp, NULL));
101 }
102 
103 /*
104  * exece() - system call wrapper around exec_common()
105  */
106 int
107 exece(const char *fname, const char **argp, const char **envp)
108 {
109 	int error;
110 
111 	error = exec_common(fname, argp, envp, EBA_NONE);
112 	return (error ? (set_errno(error)) : 0);
113 }
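/*
 * A rough sketch of the overall flow (illustrative; the ELF handler is
 * just the common case): a user-level execve(2) call enters the kernel
 * through exece(), which calls exec_common().  exec_common() resolves
 * the pathname and calls gexec(), which picks an execsw[] handler by
 * magic number (e.g. elfexec) that in turn builds the new address
 * space and user stack via exec_args().
 */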
114 
115 int
116 exec_common(const char *fname, const char **argp, const char **envp,
117     int brand_action)
118 {
119 	vnode_t *vp = NULL, *dir = NULL, *tmpvp = NULL;
120 	proc_t *p = ttoproc(curthread);
121 	klwp_t *lwp = ttolwp(curthread);
122 	struct user *up = PTOU(p);
123 	long execsz;		/* temporary count of exec size */
124 	int i;
125 	int error;
126 	char exec_file[MAXCOMLEN+1];
127 	struct pathname pn;
128 	struct pathname resolvepn;
129 	struct uarg args;
130 	struct execa ua;
131 	k_sigset_t savedmask;
132 	lwpdir_t *lwpdir = NULL;
133 	lwpdir_t **tidhash;
134 	lwpdir_t *old_lwpdir = NULL;
135 	uint_t old_lwpdir_sz;
136 	lwpdir_t **old_tidhash;
137 	uint_t old_tidhash_sz;
138 	lwpent_t *lep;
139 	boolean_t brandme = B_FALSE;
140 
141 	/*
142 	 * exec() is not supported for the /proc agent lwp.
143 	 */
144 	if (curthread == p->p_agenttp)
145 		return (ENOTSUP);
146 
147 	if (brand_action != EBA_NONE) {
148 		/*
149 		 * Brand actions are not supported for processes that are not
150 		 * running in a branded zone.
151 		 */
152 		if (!ZONE_IS_BRANDED(p->p_zone))
153 			return (ENOTSUP);
154 
155 		if (brand_action == EBA_NATIVE) {
156 			/* Only branded processes can be unbranded */
157 			if (!PROC_IS_BRANDED(p))
158 				return (ENOTSUP);
159 		} else {
160 			/* Only unbranded processes can be branded */
161 			if (PROC_IS_BRANDED(p))
162 				return (ENOTSUP);
163 			brandme = B_TRUE;
164 		}
165 	} else {
166 		/*
167 		 * If this is a native zone, or if the process is already
168 		 * branded, then we don't need to do anything.  If this is
169 		 * a native process in a branded zone, we need to brand the
170 		 * process as it exec()s the new binary.
171 		 */
172 		if (ZONE_IS_BRANDED(p->p_zone) && !PROC_IS_BRANDED(p))
173 			brandme = B_TRUE;
174 	}
175 
176 	/*
177 	 * Inform /proc that an exec() has started.
178 	 * Hold signals that are ignored by default so that we will
179 	 * not be interrupted by a signal that will be ignored after
180 	 * successful completion of gexec().
181 	 */
182 	mutex_enter(&p->p_lock);
183 	prexecstart();
184 	schedctl_finish_sigblock(curthread);
185 	savedmask = curthread->t_hold;
186 	sigorset(&curthread->t_hold, &ignoredefault);
187 	mutex_exit(&p->p_lock);
188 
189 	/*
190 	 * Look up path name and remember last component for later.
191 	 * To help coreadm expand its %d token, we attempt to save
192 	 * the directory containing the executable in p_execdir. The
193 	 * first call to lookuppn() may fail and return EINVAL because
194 	 * dirvpp is non-NULL. In that case, we make a second call to
195 	 * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
196 	 * but coreadm is allowed to expand %d to the empty string and
197 	 * there are other cases in which that failure may occur.
198 	 */
199 	if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
200 		goto out;
201 	pn_alloc(&resolvepn);
202 	if ((error = lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp)) != 0) {
203 		pn_free(&resolvepn);
204 		pn_free(&pn);
205 		if (error != EINVAL)
206 			goto out;
207 
208 		dir = NULL;
209 		if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
210 			goto out;
211 		pn_alloc(&resolvepn);
212 		if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
213 		    &vp)) != 0) {
214 			pn_free(&resolvepn);
215 			pn_free(&pn);
216 			goto out;
217 		}
218 	}
219 	if (vp == NULL) {
220 		if (dir != NULL)
221 			VN_RELE(dir);
222 		error = ENOENT;
223 		pn_free(&resolvepn);
224 		pn_free(&pn);
225 		goto out;
226 	}
227 
228 	if ((error = secpolicy_basic_exec(CRED(), vp)) != 0) {
229 		if (dir != NULL)
230 			VN_RELE(dir);
231 		pn_free(&resolvepn);
232 		pn_free(&pn);
233 		VN_RELE(vp);
234 		goto out;
235 	}
236 
237 	/*
238 	 * We do not allow executing files in attribute directories.
239 	 * We test this by determining whether the resolved path
240 	 * contains a "/" when we're in an attribute directory;
 241 	 * only if the pathname does not contain a "/" does the resolved path
 242 	 * point to a file in the current working (attribute) directory.
243 	 */
244 	if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
245 	    strchr(resolvepn.pn_path, '/') == NULL) {
246 		if (dir != NULL)
247 			VN_RELE(dir);
248 		error = EACCES;
249 		pn_free(&resolvepn);
250 		pn_free(&pn);
251 		VN_RELE(vp);
252 		goto out;
253 	}
254 
255 	bzero(exec_file, MAXCOMLEN+1);
256 	(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
257 	bzero(&args, sizeof (args));
258 	args.pathname = resolvepn.pn_path;
259 	/* don't free resolvepn until we are done with args */
260 	pn_free(&pn);
261 
262 	/*
 263 	 * Specific exec handlers, or policies determined via
 264 	 * /etc/system, may override the historical default.
265 	 */
266 	args.stk_prot = PROT_ZFOD;
267 	args.dat_prot = PROT_ZFOD;
268 
269 	CPU_STATS_ADD_K(sys, sysexec, 1);
270 	DTRACE_PROC1(exec, char *, args.pathname);
271 
272 	ua.fname = fname;
273 	ua.argp = argp;
274 	ua.envp = envp;
275 
276 	/* If necessary, brand this process before we start the exec. */
277 	if (brandme)
278 		brand_setbrand(p);
279 
280 	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
281 	    exec_file, p->p_cred, brand_action)) != 0) {
282 		if (brandme)
283 			brand_clearbrand(p);
284 		VN_RELE(vp);
285 		if (dir != NULL)
286 			VN_RELE(dir);
287 		pn_free(&resolvepn);
288 		goto fail;
289 	}
290 
291 	/*
292 	 * Free floating point registers (sun4u only)
293 	 */
294 	ASSERT(lwp != NULL);
295 	lwp_freeregs(lwp, 1);
296 
297 	/*
298 	 * Free thread and process context ops.
299 	 */
300 	if (curthread->t_ctx)
301 		freectx(curthread, 1);
302 	if (p->p_pctx)
303 		freepctx(p, 1);
304 
305 	/*
306 	 * Remember file name for accounting; clear any cached DTrace predicate.
307 	 */
308 	up->u_acflag &= ~AFORK;
309 	bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
310 	curthread->t_predcache = NULL;
311 
312 	/*
313 	 * Clear contract template state
314 	 */
315 	lwp_ctmpl_clear(lwp);
316 
317 	/*
318 	 * Save the directory in which we found the executable for expanding
319 	 * the %d token used in core file patterns.
320 	 */
321 	mutex_enter(&p->p_lock);
322 	tmpvp = p->p_execdir;
323 	p->p_execdir = dir;
324 	if (p->p_execdir != NULL)
325 		VN_HOLD(p->p_execdir);
326 	mutex_exit(&p->p_lock);
327 
328 	if (tmpvp != NULL)
329 		VN_RELE(tmpvp);
330 
331 	/*
332 	 * Reset stack state to the user stack, clear set of signals
333 	 * caught on the signal stack, and reset list of signals that
334 	 * restart system calls; the new program's environment should
335 	 * not be affected by detritus from the old program.  Any
336 	 * pending held signals remain held, so don't clear t_hold.
337 	 */
338 	mutex_enter(&p->p_lock);
339 	lwp->lwp_oldcontext = 0;
340 	lwp->lwp_ustack = 0;
341 	lwp->lwp_old_stk_ctl = 0;
342 	sigemptyset(&up->u_signodefer);
343 	sigemptyset(&up->u_sigonstack);
344 	sigemptyset(&up->u_sigresethand);
345 	lwp->lwp_sigaltstack.ss_sp = 0;
346 	lwp->lwp_sigaltstack.ss_size = 0;
347 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
348 
349 	/*
350 	 * Make saved resource limit == current resource limit.
351 	 */
352 	for (i = 0; i < RLIM_NLIMITS; i++) {
353 		/*CONSTCOND*/
354 		if (RLIM_SAVED(i)) {
355 			(void) rctl_rlimit_get(rctlproc_legacy[i], p,
356 			    &up->u_saved_rlimit[i]);
357 		}
358 	}
359 
360 	/*
361 	 * If the action was to catch the signal, then the action
362 	 * must be reset to SIG_DFL.
363 	 */
364 	sigdefault(p);
365 	p->p_flag &= ~(SNOWAIT|SJCTL);
366 	p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
367 	up->u_signal[SIGCLD - 1] = SIG_DFL;
368 
369 	/*
370 	 * Delete the dot4 sigqueues/signotifies.
371 	 */
372 	sigqfree(p);
373 
374 	mutex_exit(&p->p_lock);
375 
376 	mutex_enter(&p->p_pflock);
377 	p->p_prof.pr_base = NULL;
378 	p->p_prof.pr_size = 0;
379 	p->p_prof.pr_off = 0;
380 	p->p_prof.pr_scale = 0;
381 	p->p_prof.pr_samples = 0;
382 	mutex_exit(&p->p_pflock);
383 
384 	ASSERT(curthread->t_schedctl == NULL);
385 
386 #if defined(__sparc)
387 	if (p->p_utraps != NULL)
388 		utrap_free(p);
389 #endif	/* __sparc */
390 
391 	/*
392 	 * Close all close-on-exec files.
393 	 */
394 	close_exec(P_FINFO(p));
395 	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
396 
397 	/* Unbrand ourself if necessary. */
398 	if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE))
399 		brand_clearbrand(p);
400 
401 	setregs(&args);
402 
403 	/* Mark this as an executable vnode */
404 	mutex_enter(&vp->v_lock);
405 	vp->v_flag |= VVMEXEC;
406 	mutex_exit(&vp->v_lock);
407 
408 	VN_RELE(vp);
409 	if (dir != NULL)
410 		VN_RELE(dir);
411 	pn_free(&resolvepn);
412 
413 	/*
414 	 * Allocate a new lwp directory and lwpid hash table if necessary.
415 	 */
416 	if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
417 		lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
418 		lwpdir->ld_next = lwpdir + 1;
419 		tidhash = kmem_zalloc(2 * sizeof (lwpdir_t *), KM_SLEEP);
420 		if (p->p_lwpdir != NULL)
421 			lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
422 		else
423 			lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
424 	}
425 
426 	if (PROC_IS_BRANDED(p))
427 		BROP(p)->b_exec();
428 
429 	mutex_enter(&p->p_lock);
430 	prbarrier(p);
431 
432 	/*
433 	 * Reset lwp id to the default value of 1.
434 	 * This is a single-threaded process now
435 	 * and lwp #1 is lwp_wait()able by default.
436 	 * The t_unpark flag should not be inherited.
437 	 */
438 	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
439 	curthread->t_tid = 1;
440 	kpreempt_disable();
441 	ASSERT(curthread->t_lpl != NULL);
442 	p->p_t1_lgrpid = curthread->t_lpl->lpl_lgrpid;
443 	kpreempt_enable();
444 	if (p->p_tr_lgrpid != LGRP_NONE && p->p_tr_lgrpid != p->p_t1_lgrpid) {
445 		lgrp_update_trthr_migrations(1);
446 	}
447 	curthread->t_unpark = 0;
448 	curthread->t_proc_flag |= TP_TWAIT;
449 	curthread->t_proc_flag &= ~TP_DAEMON;	/* daemons shouldn't exec */
450 	p->p_lwpdaemon = 0;			/* but oh well ... */
451 	p->p_lwpid = 1;
452 
453 	/*
454 	 * Install the newly-allocated lwp directory and lwpid hash table
455 	 * and insert the current thread into the new hash table.
456 	 */
457 	if (lwpdir != NULL) {
458 		old_lwpdir = p->p_lwpdir;
459 		old_lwpdir_sz = p->p_lwpdir_sz;
460 		old_tidhash = p->p_tidhash;
461 		old_tidhash_sz = p->p_tidhash_sz;
462 		p->p_lwpdir = p->p_lwpfree = lwpdir;
463 		p->p_lwpdir_sz = 2;
464 		p->p_tidhash = tidhash;
465 		p->p_tidhash_sz = 2;
466 		lep->le_thread = curthread;
467 		lep->le_lwpid = curthread->t_tid;
468 		lep->le_start = curthread->t_start;
469 		lwp_hash_in(p, lep);
470 	}
471 
472 	/*
473 	 * Restore the saved signal mask and
474 	 * inform /proc that the exec() has finished.
475 	 */
476 	curthread->t_hold = savedmask;
477 	prexecend();
478 	mutex_exit(&p->p_lock);
479 	if (old_lwpdir) {
480 		kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
481 		kmem_free(old_tidhash, old_tidhash_sz * sizeof (lwpdir_t *));
482 	}
483 
484 	ASSERT(error == 0);
485 	DTRACE_PROC(exec__success);
486 	return (0);
487 
488 fail:
489 	DTRACE_PROC1(exec__failure, int, error);
490 out:		/* error return */
491 	mutex_enter(&p->p_lock);
492 	curthread->t_hold = savedmask;
493 	prexecend();
494 	mutex_exit(&p->p_lock);
495 	ASSERT(error != 0);
496 	return (error);
497 }
498 
499 
500 /*
501  * Perform generic exec duties and switchout to object-file specific
502  * handler.
503  */
504 int
505 gexec(
506 	struct vnode **vpp,
507 	struct execa *uap,
508 	struct uarg *args,
509 	struct intpdata *idatap,
510 	int level,
511 	long *execsz,
512 	caddr_t exec_file,
513 	struct cred *cred,
514 	int brand_action)
515 {
516 	struct vnode *vp;
517 	proc_t *pp = ttoproc(curthread);
518 	struct execsw *eswp;
519 	int error = 0;
520 	int suidflags = 0;
521 	ssize_t resid;
522 	uid_t uid, gid;
523 	struct vattr vattr;
524 	char magbuf[MAGIC_BYTES];
525 	int setid;
526 	cred_t *oldcred, *newcred = NULL;
527 	int privflags = 0;
528 	int setidfl;
529 
530 	/*
531 	 * If the SNOCD or SUGID flag is set, turn it off and remember the
532 	 * previous setting so we can restore it if we encounter an error.
533 	 */
534 	if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
535 		mutex_enter(&pp->p_lock);
536 		suidflags = pp->p_flag & PSUIDFLAGS;
537 		pp->p_flag &= ~PSUIDFLAGS;
538 		mutex_exit(&pp->p_lock);
539 	}
540 
541 	if ((error = execpermissions(*vpp, &vattr, args)) != 0)
542 		goto bad;
543 
544 	/* need to open vnode for stateful file systems like rfs */
545 	if ((error = VOP_OPEN(vpp, FREAD, CRED(), NULL)) != 0)
546 		goto bad;
547 	vp = *vpp;
548 
549 	/*
550 	 * Note: to support binary compatibility with SunOS a.out
551 	 * executables, we read in the first four bytes, as the
552 	 * magic number is in bytes 2-3.
553 	 */
554 	if (error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
555 	    (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid))
556 		goto bad;
557 	if (resid != 0)
558 		goto bad;
559 
560 	if ((eswp = findexec_by_hdr(magbuf)) == NULL)
561 		goto bad;
562 
563 	if (level == 0 &&
564 	    (privflags = execsetid(vp, &vattr, &uid, &gid)) != 0) {
565 
566 		newcred = cred = crdup(cred);
567 
568 		/* If we can, drop the PA bit */
569 		if ((privflags & PRIV_RESET) != 0)
570 			priv_adjust_PA(cred);
571 
572 		if (privflags & PRIV_SETID) {
573 			cred->cr_uid = uid;
574 			cred->cr_gid = gid;
575 			cred->cr_suid = uid;
576 			cred->cr_sgid = gid;
577 		}
578 
579 		if (privflags & MAC_FLAGS) {
580 			if (!(CR_FLAGS(cred) & NET_MAC_AWARE_INHERIT))
581 				CR_FLAGS(cred) &= ~NET_MAC_AWARE;
582 			CR_FLAGS(cred) &= ~NET_MAC_AWARE_INHERIT;
583 		}
584 
585 		/*
586 		 * Implement the privilege updates:
587 		 *
588 		 * Restrict with L:
589 		 *
590 		 *	I' = I & L
591 		 *
592 		 *	E' = P' = (I' + F) & A
593 		 *
594 		 * But if running under ptrace, we cap I with P.
595 		 */
596 		if ((privflags & PRIV_RESET) != 0) {
597 			if ((privflags & PRIV_INCREASE) != 0 &&
598 			    (pp->p_proc_flag & P_PR_PTRACE) != 0)
599 				priv_intersect(&CR_OPPRIV(cred),
600 				    &CR_IPRIV(cred));
601 			priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
602 			CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
603 			priv_adjust_PA(cred);
604 		}
605 	}
606 
607 	/* SunOS 4.x buy-back */
608 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
609 	    (vattr.va_mode & (VSUID|VSGID))) {
610 		char path[MAXNAMELEN];
611 		refstr_t *mntpt = NULL;
612 		int ret = -1;
613 
614 		bzero(path, sizeof (path));
615 		zone_hold(pp->p_zone);
616 
617 		ret = vnodetopath(pp->p_zone->zone_rootvp, vp, path,
618 		    sizeof (path), cred);
619 
620 		/* fallback to mountpoint if a path can't be found */
621 		if ((ret != 0) || (ret == 0 && path[0] == '\0'))
622 			mntpt = vfs_getmntpoint(vp->v_vfsp);
623 
624 		if (mntpt == NULL)
625 			zcmn_err(pp->p_zone->zone_id, CE_NOTE,
626 			    "!uid %d: setuid execution not allowed, "
627 			    "file=%s", cred->cr_uid, path);
628 		else
629 			zcmn_err(pp->p_zone->zone_id, CE_NOTE,
630 			    "!uid %d: setuid execution not allowed, "
631 			    "fs=%s, file=%s", cred->cr_uid,
632 			    ZONE_PATH_TRANSLATE(refstr_value(mntpt),
633 			    pp->p_zone), exec_file);
634 
635 		if (!INGLOBALZONE(pp)) {
636 			/* zone_rootpath always has trailing / */
637 			if (mntpt == NULL)
638 				cmn_err(CE_NOTE, "!zone: %s, uid: %d "
639 				    "setuid execution not allowed, file=%s%s",
640 				    pp->p_zone->zone_name, cred->cr_uid,
641 				    pp->p_zone->zone_rootpath, path + 1);
642 			else
643 				cmn_err(CE_NOTE, "!zone: %s, uid: %d "
644 				    "setuid execution not allowed, fs=%s, "
645 				    "file=%s", pp->p_zone->zone_name,
646 				    cred->cr_uid, refstr_value(mntpt),
647 				    exec_file);
648 		}
649 
650 		if (mntpt != NULL)
651 			refstr_rele(mntpt);
652 
653 		zone_rele(pp->p_zone);
654 	}
655 
656 	/*
657 	 * execsetid() told us whether or not we had to change the
658 	 * credentials of the process.  In privflags, it told us
659 	 * whether we gained any privileges or executed a set-uid executable.
660 	 */
661 	setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE));
662 
663 	/*
664 	 * Use /etc/system variable to determine if the stack
665 	 * should be marked as executable by default.
666 	 */
667 	if (noexec_user_stack)
668 		args->stk_prot &= ~PROT_EXEC;
669 
670 	args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */
671 	args->ex_vp = vp;
672 
673 	/*
 674 	 * Traditionally, the setid flags told the subprocesses whether
 675 	 * the file just executed was set-uid or set-gid; this caused
 676 	 * some confusion as the 'setid' flag did not match the SUGID
 677 	 * process flag, which is only set when the uids/gids do not match.
 678 	 * A script set-gid/set-uid to the real uid/gid would start with
 679 	 * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
 680 	 * Now we flag those cases where the calling process cannot
 681 	 * be trusted to influence the newly exec'ed process, either
 682 	 * because it runs with more privileges or because the uids/gids
 683 	 * in fact do not match.
 684 	 * This also makes the runtime linker agree with the on-exec
 685 	 * values of SNOCD and SUGID.
686 	 */
687 	setidfl = 0;
688 	if (cred->cr_uid != cred->cr_ruid || (cred->cr_rgid != cred->cr_gid &&
689 	    !supgroupmember(cred->cr_gid, cred))) {
690 		setidfl |= EXECSETID_UGIDS;
691 	}
692 	if (setid & PRIV_SETUGID)
693 		setidfl |= EXECSETID_SETID;
694 	if (setid & PRIV_INCREASE)
695 		setidfl |= EXECSETID_PRIVS;
696 
697 	error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
698 	    setidfl, exec_file, cred, brand_action);
699 	rw_exit(eswp->exec_lock);
700 	if (error != 0) {
701 		if (newcred != NULL)
702 			crfree(newcred);
703 		goto bad;
704 	}
705 
706 	if (level == 0) {
707 		mutex_enter(&pp->p_crlock);
708 		if (newcred != NULL) {
709 			/*
710 			 * Free the old credentials, and set the new ones.
711 			 * Do this for both the process and the (single) thread.
712 			 */
713 			crfree(pp->p_cred);
714 			pp->p_cred = cred;	/* cred already held for proc */
715 			crhold(cred);		/* hold new cred for thread */
716 			/*
717 			 * DTrace accesses t_cred in probe context.  t_cred
718 			 * must always be either NULL, or point to a valid,
719 			 * allocated cred structure.
720 			 */
721 			oldcred = curthread->t_cred;
722 			curthread->t_cred = cred;
723 			crfree(oldcred);
724 		}
725 		/*
726 		 * On emerging from a successful exec(), the saved
727 		 * uid and gid equal the effective uid and gid.
728 		 */
729 		cred->cr_suid = cred->cr_uid;
730 		cred->cr_sgid = cred->cr_gid;
731 
732 		/*
733 		 * If the real and effective ids do not match, this
734 		 * is a setuid process that should not dump core.
735 		 * The group comparison is tricky; we prevent the code
736 		 * from flagging SNOCD when executing with an effective gid
737 		 * which is a supplementary group.
738 		 */
739 		if (cred->cr_ruid != cred->cr_uid ||
740 		    (cred->cr_rgid != cred->cr_gid &&
741 		    !supgroupmember(cred->cr_gid, cred)) ||
742 		    (privflags & PRIV_INCREASE) != 0)
743 			suidflags = PSUIDFLAGS;
744 		else
745 			suidflags = 0;
746 
747 		mutex_exit(&pp->p_crlock);
748 		if (suidflags) {
749 			mutex_enter(&pp->p_lock);
750 			pp->p_flag |= suidflags;
751 			mutex_exit(&pp->p_lock);
752 		}
753 		if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
754 			/*
755 			 * If process is traced via /proc, arrange to
756 			 * invalidate the associated /proc vnode.
757 			 */
758 			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
759 				args->traceinval = 1;
760 		}
761 		if (pp->p_proc_flag & P_PR_PTRACE)
762 			psignal(pp, SIGTRAP);
763 		if (args->traceinval)
764 			prinvalidate(&pp->p_user);
765 	}
766 
767 	return (0);
768 bad:
769 	if (error == 0)
770 		error = ENOEXEC;
771 
772 	if (suidflags) {
773 		mutex_enter(&pp->p_lock);
774 		pp->p_flag |= suidflags;
775 		mutex_exit(&pp->p_lock);
776 	}
777 	return (error);
778 }
779 
780 extern char *execswnames[];
781 
782 struct execsw *
783 allocate_execsw(char *name, char *magic, size_t magic_size)
784 {
785 	int i, j;
786 	char *ename;
787 	char *magicp;
788 
789 	mutex_enter(&execsw_lock);
790 	for (i = 0; i < nexectype; i++) {
791 		if (execswnames[i] == NULL) {
792 			ename = kmem_alloc(strlen(name) + 1, KM_SLEEP);
793 			(void) strcpy(ename, name);
794 			execswnames[i] = ename;
795 			/*
796 			 * Set the magic number last so that we
797 			 * don't need to hold the execsw_lock in
798 			 * findexectype().
799 			 */
800 			magicp = kmem_alloc(magic_size, KM_SLEEP);
801 			for (j = 0; j < magic_size; j++)
802 				magicp[j] = magic[j];
803 			execsw[i].exec_magic = magicp;
804 			mutex_exit(&execsw_lock);
805 			return (&execsw[i]);
806 		}
807 	}
808 	mutex_exit(&execsw_lock);
809 	return (NULL);
810 }
811 
812 /*
813  * Find the exec switch table entry with the corresponding magic string.
814  */
815 struct execsw *
816 findexecsw(char *magic)
817 {
818 	struct execsw *eswp;
819 
820 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
821 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
822 		if (magic && eswp->exec_maglen != 0 &&
823 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0)
824 			return (eswp);
825 	}
826 	return (NULL);
827 }
828 
829 /*
830  * Find the execsw[] index for the given exec header string by looking for the
831  * magic string at a specified offset and length for each kind of executable
832  * file format until one matches.  If no execsw[] entry is found, try to
833  * autoload a module for this magic string.
834  */
835 struct execsw *
836 findexec_by_hdr(char *header)
837 {
838 	struct execsw *eswp;
839 
840 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
841 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
842 		if (header && eswp->exec_maglen != 0 &&
843 		    bcmp(&header[eswp->exec_magoff], eswp->exec_magic,
844 		    eswp->exec_maglen) == 0) {
845 			if (hold_execsw(eswp) != 0)
846 				return (NULL);
847 			return (eswp);
848 		}
849 	}
850 	return (NULL);	/* couldn't find the type */
851 }
852 
853 /*
854  * Find the execsw[] index for the given magic string.  If no execsw[] entry
855  * is found, try to autoload a module for this magic string.
856  */
857 struct execsw *
858 findexec_by_magic(char *magic)
859 {
860 	struct execsw *eswp;
861 
862 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
863 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
864 		if (magic && eswp->exec_maglen != 0 &&
865 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
866 			if (hold_execsw(eswp) != 0)
867 				return (NULL);
868 			return (eswp);
869 		}
870 	}
871 	return (NULL);	/* couldn't find the type */
872 }
873 
874 static int
875 hold_execsw(struct execsw *eswp)
876 {
877 	char *name;
878 
879 	rw_enter(eswp->exec_lock, RW_READER);
880 	while (!LOADED_EXEC(eswp)) {
881 		rw_exit(eswp->exec_lock);
882 		name = execswnames[eswp-execsw];
883 		ASSERT(name);
884 		if (modload("exec", name) == -1)
885 			return (-1);
886 		rw_enter(eswp->exec_lock, RW_READER);
887 	}
888 	return (0);
889 }
890 
891 static int
892 execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp)
893 {
894 	proc_t *pp = ttoproc(curthread);
895 	uid_t uid, gid;
896 	cred_t *cr = pp->p_cred;
897 	int privflags = 0;
898 
899 	/*
900 	 * Remember credentials.
901 	 */
902 	uid = cr->cr_uid;
903 	gid = cr->cr_gid;
904 
905 	/* Will try to reset the PRIV_AWARE bit later. */
906 	if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
907 		privflags |= PRIV_RESET;
908 
909 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
910 		/*
911 		 * Set-uid root execution only allowed if the limit set
912 		 * holds all unsafe privileges.
913 		 */
914 		if ((vattrp->va_mode & VSUID) && (vattrp->va_uid != 0 ||
915 		    priv_issubset(&priv_unsafe, &CR_LPRIV(cr)))) {
916 			uid = vattrp->va_uid;
917 			privflags |= PRIV_SETUGID;
918 		}
919 		if (vattrp->va_mode & VSGID) {
920 			gid = vattrp->va_gid;
921 			privflags |= PRIV_SETUGID;
922 		}
923 	}
924 
925 	/*
926 	 * Do we need to change our credential anyway?
927 	 * This is the case when E != I or P != I, as
928 	 * we need to do the assignments (with F empty and A full)
929 	 * Or when I is not a subset of L; in that case we need to
930 	 * enforce L.
931 	 *
932 	 *		I' = L & I
933 	 *
934 	 *		E' = P' = (I' + F) & A
935 	 * or
936 	 *		E' = P' = I'
937 	 */
938 	if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
939 	    !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
940 	    !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
941 		privflags |= PRIV_RESET;
942 
 943 	/* If MAC-aware flag(s) are on, update the cred to remove them. */
944 	if ((CR_FLAGS(cr) & NET_MAC_AWARE) ||
945 	    (CR_FLAGS(cr) & NET_MAC_AWARE_INHERIT))
946 		privflags |= MAC_FLAGS;
947 
948 	/*
949 	 * When we introduce the "forced" set then we will need
950 	 * to set PRIV_INCREASE here if I not a subset of P.
951 	 * If the "allowed" set is introduced we will need to do
952 	 * a similar thing; however, it seems more reasonable to
953 	 * have the allowed set reduce "L": script language interpreters
954 	 * would typically have an allowed set of "all".
955 	 */
956 
957 	/*
958 	 * Set setuid/setgid protections if no ptrace() compatibility.
959 	 * For privileged processes, honor setuid/setgid even in
960 	 * the presence of ptrace() compatibility.
961 	 */
962 	if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
963 	    PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
964 	    (cr->cr_uid != uid ||
965 	    cr->cr_gid != gid ||
966 	    cr->cr_suid != uid ||
967 	    cr->cr_sgid != gid)) {
968 		*uidp = uid;
969 		*gidp = gid;
970 		privflags |= PRIV_SETID;
971 	}
972 	return (privflags);
973 }
974 
975 int
976 execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
977 {
978 	int error;
979 	proc_t *p = ttoproc(curthread);
980 
981 	vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
982 	if (error = VOP_GETATTR(vp, vattrp, ATTR_EXEC, p->p_cred, NULL))
983 		return (error);
984 	/*
985 	 * Check the access mode.
986 	 * If VPROC, ask /proc if the file is an object file.
987 	 */
988 	if ((error = VOP_ACCESS(vp, VEXEC, 0, p->p_cred, NULL)) != 0 ||
989 	    !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
990 	    (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
991 	    (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
992 		if (error == 0)
993 			error = EACCES;
994 		return (error);
995 	}
996 
997 	if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
998 	    (error = VOP_ACCESS(vp, VREAD, 0, p->p_cred, NULL))) {
999 		/*
1000 		 * If process is under ptrace(2) compatibility,
1001 		 * fail the exec(2).
1002 		 */
1003 		if (p->p_proc_flag & P_PR_PTRACE)
1004 			goto bad;
1005 		/*
1006 		 * Process is traced via /proc.
1007 		 * Arrange to invalidate the /proc vnode.
1008 		 */
1009 		args->traceinval = 1;
1010 	}
1011 	return (0);
1012 bad:
1013 	if (error == 0)
1014 		error = ENOEXEC;
1015 	return (error);
1016 }
1017 
1018 /*
1019  * Map a section of an executable file into the user's
1020  * address space.
1021  */
1022 int
1023 execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
1024     off_t offset, int prot, int page, uint_t szc)
1025 {
1026 	int error = 0;
1027 	off_t oldoffset;
1028 	caddr_t zfodbase, oldaddr;
1029 	size_t end, oldlen;
1030 	size_t zfoddiff;
1031 	label_t ljb;
1032 	proc_t *p = ttoproc(curthread);
1033 
1034 	oldaddr = addr;
1035 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1036 	if (len) {
1037 		oldlen = len;
1038 		len += ((size_t)oldaddr - (size_t)addr);
1039 		oldoffset = offset;
1040 		offset = (off_t)((uintptr_t)offset & PAGEMASK);
1041 		if (page) {
1042 			spgcnt_t  prefltmem, availm, npages;
1043 			int preread;
1044 			uint_t mflag = MAP_PRIVATE | MAP_FIXED;
1045 
1046 			if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
1047 				mflag |= MAP_TEXT;
1048 			} else {
1049 				mflag |= MAP_INITDATA;
1050 			}
1051 
1052 			if (valid_usr_range(addr, len, prot, p->p_as,
1053 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1054 				error = ENOMEM;
1055 				goto bad;
1056 			}
1057 			if (error = VOP_MAP(vp, (offset_t)offset,
1058 			    p->p_as, &addr, len, prot, PROT_ALL,
1059 			    mflag, CRED(), NULL))
1060 				goto bad;
1061 
1062 			/*
1063 			 * If the segment can fit, then we prefault
1064 			 * the entire segment in.  This is based on the
1065 			 * model that says the best working set of a
1066 			 * small program is all of its pages.
1067 			 */
1068 			npages = (spgcnt_t)btopr(len);
1069 			prefltmem = freemem - desfree;
1070 			preread =
1071 			    (npages < prefltmem && len < PGTHRESH) ? 1 : 0;
1072 
1073 			/*
1074 			 * If we aren't prefaulting the segment,
 1075 			 * increment "deficit", if necessary, to ensure
1076 			 * that pages will become available when this
1077 			 * process starts executing.
1078 			 */
1079 			availm = freemem - lotsfree;
1080 			if (preread == 0 && npages > availm &&
1081 			    deficit < lotsfree) {
1082 				deficit += MIN((pgcnt_t)(npages - availm),
1083 				    lotsfree - deficit);
1084 			}
1085 
1086 			if (preread) {
1087 				TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
1088 				    "execmap preread:freemem %d size %lu",
1089 				    freemem, len);
1090 				(void) as_fault(p->p_as->a_hat, p->p_as,
1091 				    (caddr_t)addr, len, F_INVAL, S_READ);
1092 			}
1093 		} else {
1094 			if (valid_usr_range(addr, len, prot, p->p_as,
1095 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1096 				error = ENOMEM;
1097 				goto bad;
1098 			}
1099 
1100 			if (error = as_map(p->p_as, addr, len,
1101 			    segvn_create, zfod_argsp))
1102 				goto bad;
1103 			/*
1104 			 * Read in the segment in one big chunk.
1105 			 */
1106 			if (error = vn_rdwr(UIO_READ, vp, (caddr_t)oldaddr,
1107 			    oldlen, (offset_t)oldoffset, UIO_USERSPACE, 0,
1108 			    (rlim64_t)0, CRED(), (ssize_t *)0))
1109 				goto bad;
1110 			/*
1111 			 * Now set protections.
1112 			 */
1113 			if (prot != PROT_ZFOD) {
1114 				(void) as_setprot(p->p_as, (caddr_t)addr,
1115 				    len, prot);
1116 			}
1117 		}
1118 	}
1119 
1120 	if (zfodlen) {
1121 		struct as *as = curproc->p_as;
1122 		struct seg *seg;
1123 		uint_t zprot = 0;
1124 
1125 		end = (size_t)addr + len;
1126 		zfodbase = (caddr_t)roundup(end, PAGESIZE);
1127 		zfoddiff = (uintptr_t)zfodbase - end;
1128 		if (zfoddiff) {
1129 			/*
1130 			 * Before we go to zero the remaining space on the last
1131 			 * page, make sure we have write permission.
1132 			 */
1133 
1134 			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1135 			seg = as_segat(curproc->p_as, (caddr_t)end);
1136 			if (seg != NULL)
1137 				SEGOP_GETPROT(seg, (caddr_t)end, zfoddiff - 1,
1138 				    &zprot);
1139 			AS_LOCK_EXIT(as, &as->a_lock);
1140 
1141 			if (seg != NULL && (zprot & PROT_WRITE) == 0) {
1142 				(void) as_setprot(as, (caddr_t)end,
1143 				    zfoddiff - 1, zprot | PROT_WRITE);
1144 			}
1145 
1146 			if (on_fault(&ljb)) {
1147 				no_fault();
1148 				if (seg != NULL && (zprot & PROT_WRITE) == 0)
1149 					(void) as_setprot(as, (caddr_t)end,
1150 					    zfoddiff - 1, zprot);
1151 				error = EFAULT;
1152 				goto bad;
1153 			}
1154 			uzero((void *)end, zfoddiff);
1155 			no_fault();
1156 			if (seg != NULL && (zprot & PROT_WRITE) == 0)
1157 				(void) as_setprot(as, (caddr_t)end,
1158 				    zfoddiff - 1, zprot);
1159 		}
1160 		if (zfodlen > zfoddiff) {
1161 			struct segvn_crargs crargs =
1162 			    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
1163 
1164 			zfodlen -= zfoddiff;
1165 			if (valid_usr_range(zfodbase, zfodlen, prot, p->p_as,
1166 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1167 				error = ENOMEM;
1168 				goto bad;
1169 			}
1170 			if (szc > 0) {
1171 				/*
1172 				 * ASSERT alignment because the mapelfexec()
1173 				 * caller for the szc > 0 case extended zfod
 1174 				 * so its end is pgsz aligned.
1175 				 */
1176 				size_t pgsz = page_get_pagesize(szc);
1177 				ASSERT(IS_P2ALIGNED(zfodbase + zfodlen, pgsz));
1178 
1179 				if (IS_P2ALIGNED(zfodbase, pgsz)) {
1180 					crargs.szc = szc;
1181 				} else {
1182 					crargs.szc = AS_MAP_HEAP;
1183 				}
1184 			} else {
1185 				crargs.szc = AS_MAP_NO_LPOOB;
1186 			}
1187 			if (error = as_map(p->p_as, (caddr_t)zfodbase,
1188 			    zfodlen, segvn_create, &crargs))
1189 				goto bad;
1190 			if (prot != PROT_ZFOD) {
1191 				(void) as_setprot(p->p_as, (caddr_t)zfodbase,
1192 				    zfodlen, prot);
1193 			}
1194 		}
1195 	}
1196 	return (0);
1197 bad:
1198 	return (error);
1199 }
1200 
1201 void
1202 setexecenv(struct execenv *ep)
1203 {
1204 	proc_t *p = ttoproc(curthread);
1205 	klwp_t *lwp = ttolwp(curthread);
1206 	struct vnode *vp;
1207 
1208 	p->p_bssbase = ep->ex_bssbase;
1209 	p->p_brkbase = ep->ex_brkbase;
1210 	p->p_brksize = ep->ex_brksize;
1211 	if (p->p_exec)
1212 		VN_RELE(p->p_exec);	/* out with the old */
1213 	vp = p->p_exec = ep->ex_vp;
1214 	if (vp != NULL)
1215 		VN_HOLD(vp);		/* in with the new */
1216 
1217 	lwp->lwp_sigaltstack.ss_sp = 0;
1218 	lwp->lwp_sigaltstack.ss_size = 0;
1219 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
1220 }
1221 
1222 int
1223 execopen(struct vnode **vpp, int *fdp)
1224 {
1225 	struct vnode *vp = *vpp;
1226 	file_t *fp;
1227 	int error = 0;
1228 	int filemode = FREAD;
1229 
1230 	VN_HOLD(vp);		/* open reference */
1231 	if (error = falloc(NULL, filemode, &fp, fdp)) {
1232 		VN_RELE(vp);
1233 		*fdp = -1;	/* just in case falloc changed value */
1234 		return (error);
1235 	}
1236 	if (error = VOP_OPEN(&vp, filemode, CRED(), NULL)) {
1237 		VN_RELE(vp);
1238 		setf(*fdp, NULL);
1239 		unfalloc(fp);
1240 		*fdp = -1;
1241 		return (error);
1242 	}
1243 	*vpp = vp;		/* vnode should not have changed */
1244 	fp->f_vnode = vp;
1245 	mutex_exit(&fp->f_tlock);
1246 	setf(*fdp, fp);
1247 	return (0);
1248 }
1249 
1250 int
1251 execclose(int fd)
1252 {
1253 	return (closeandsetf(fd, NULL));
1254 }
1255 
1256 
1257 /*
1258  * noexec stub function.
1259  */
1260 /*ARGSUSED*/
1261 int
1262 noexec(
1263     struct vnode *vp,
1264     struct execa *uap,
1265     struct uarg *args,
1266     struct intpdata *idatap,
1267     int level,
1268     long *execsz,
1269     int setid,
1270     caddr_t exec_file,
1271     struct cred *cred)
1272 {
1273 	cmn_err(CE_WARN, "missing exec capability for %s", uap->fname);
1274 	return (ENOEXEC);
1275 }
1276 
1277 /*
1278  * Support routines for building a user stack.
1279  *
1280  * execve(path, argv, envp) must construct a new stack with the specified
1281  * arguments and environment variables (see exec_args() for a description
1282  * of the user stack layout).  To do this, we copy the arguments and
1283  * environment variables from the old user address space into the kernel,
1284  * free the old as, create the new as, and copy our buffered information
1285  * to the new stack.  Our kernel buffer has the following structure:
1286  *
1287  *	+-----------------------+ <--- stk_base + stk_size
1288  *	| string offsets	|
1289  *	+-----------------------+ <--- stk_offp
1290  *	|			|
1291  *	| STK_AVAIL() space	|
1292  *	|			|
1293  *	+-----------------------+ <--- stk_strp
1294  *	| strings		|
1295  *	+-----------------------+ <--- stk_base
1296  *
1297  * When we add a string, we store the string's contents (including the null
1298  * terminator) at stk_strp, and we store the offset of the string relative to
 1299  * stk_base at --stk_offp.  As strings are added, stk_strp increases and
1300  * stk_offp decreases.  The amount of space remaining, STK_AVAIL(), is just
1301  * the difference between these pointers.  If we run out of space, we return
1302  * an error and exec_args() starts all over again with a buffer twice as large.
1303  * When we're all done, the kernel buffer looks like this:
1304  *
1305  *	+-----------------------+ <--- stk_base + stk_size
1306  *	| argv[0] offset	|
1307  *	+-----------------------+
1308  *	| ...			|
1309  *	+-----------------------+
1310  *	| argv[argc-1] offset	|
1311  *	+-----------------------+
1312  *	| envp[0] offset	|
1313  *	+-----------------------+
1314  *	| ...			|
1315  *	+-----------------------+
1316  *	| envp[envc-1] offset	|
1317  *	+-----------------------+
1318  *	| AT_SUN_PLATFORM offset|
1319  *	+-----------------------+
1320  *	| AT_SUN_EXECNAME offset|
1321  *	+-----------------------+ <--- stk_offp
1322  *	|			|
1323  *	| STK_AVAIL() space	|
1324  *	|			|
1325  *	+-----------------------+ <--- stk_strp
1326  *	| AT_SUN_EXECNAME offset|
1327  *	+-----------------------+
1328  *	| AT_SUN_PLATFORM offset|
1329  *	+-----------------------+
1330  *	| envp[envc-1] string	|
1331  *	+-----------------------+
1332  *	| ...			|
1333  *	+-----------------------+
1334  *	| envp[0] string	|
1335  *	+-----------------------+
1336  *	| argv[argc-1] string	|
1337  *	+-----------------------+
1338  *	| ...			|
1339  *	+-----------------------+
1340  *	| argv[0] string	|
1341  *	+-----------------------+ <--- stk_base
1342  */
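/*
 * For example (illustrative values only): after stk_add()ing the two
 * argv strings "ls" and "-l", the bottom of the buffer holds the bytes
 * "ls\0-l\0" (so stk_strp == stk_base + 6); the topmost int slot holds
 * offset 0 (for "ls") and the slot below it, where stk_offp now points,
 * holds offset 3 (for "-l").
 */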
1343 
1344 #define	STK_AVAIL(args)		((char *)(args)->stk_offp - (args)->stk_strp)
1345 
1346 /*
1347  * Add a string to the stack.
1348  */
1349 static int
1350 stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
1351 {
1352 	int error;
1353 	size_t len;
1354 
1355 	if (STK_AVAIL(args) < sizeof (int))
1356 		return (E2BIG);
1357 	*--args->stk_offp = args->stk_strp - args->stk_base;
1358 
1359 	if (segflg == UIO_USERSPACE) {
1360 		error = copyinstr(sp, args->stk_strp, STK_AVAIL(args), &len);
1361 		if (error != 0)
1362 			return (error);
1363 	} else {
1364 		len = strlen(sp) + 1;
1365 		if (len > STK_AVAIL(args))
1366 			return (E2BIG);
1367 		bcopy(sp, args->stk_strp, len);
1368 	}
1369 
1370 	args->stk_strp += len;
1371 
1372 	return (0);
1373 }
1374 
1375 static int
1376 stk_getptr(uarg_t *args, char *src, char **dst)
1377 {
1378 	int error;
1379 
1380 	if (args->from_model == DATAMODEL_NATIVE) {
1381 		ulong_t ptr;
1382 		error = fulword(src, &ptr);
1383 		*dst = (caddr_t)ptr;
1384 	} else {
1385 		uint32_t ptr;
1386 		error = fuword32(src, &ptr);
1387 		*dst = (caddr_t)(uintptr_t)ptr;
1388 	}
1389 	return (error);
1390 }
1391 
1392 static int
1393 stk_putptr(uarg_t *args, char *addr, char *value)
1394 {
1395 	if (args->to_model == DATAMODEL_NATIVE)
1396 		return (sulword(addr, (ulong_t)value));
1397 	else
1398 		return (suword32(addr, (uint32_t)(uintptr_t)value));
1399 }
1400 
1401 static int
1402 stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1403 {
1404 	char *sp;
1405 	int argc, error;
1406 	int argv_empty = 0;
1407 	size_t ptrsize = args->from_ptrsize;
1408 	size_t size, pad;
1409 	char *argv = (char *)uap->argp;
1410 	char *envp = (char *)uap->envp;
1411 
1412 	/*
1413 	 * Copy interpreter's name and argument to argv[0] and argv[1].
1414 	 */
1415 	if (intp != NULL && intp->intp_name != NULL) {
1416 		if ((error = stk_add(args, intp->intp_name, UIO_SYSSPACE)) != 0)
1417 			return (error);
1418 		if (intp->intp_arg != NULL &&
1419 		    (error = stk_add(args, intp->intp_arg, UIO_SYSSPACE)) != 0)
1420 			return (error);
1421 		if (args->fname != NULL)
1422 			error = stk_add(args, args->fname, UIO_SYSSPACE);
1423 		else
1424 			error = stk_add(args, uap->fname, UIO_USERSPACE);
1425 		if (error)
1426 			return (error);
1427 
1428 		/*
1429 		 * Check for an empty argv[].
1430 		 */
1431 		if (stk_getptr(args, argv, &sp))
1432 			return (EFAULT);
1433 		if (sp == NULL)
1434 			argv_empty = 1;
1435 
1436 		argv += ptrsize;		/* ignore original argv[0] */
1437 	}
1438 
1439 	if (argv_empty == 0) {
1440 		/*
1441 		 * Add argv[] strings to the stack.
1442 		 */
1443 		for (;;) {
1444 			if (stk_getptr(args, argv, &sp))
1445 				return (EFAULT);
1446 			if (sp == NULL)
1447 				break;
1448 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1449 				return (error);
1450 			argv += ptrsize;
1451 		}
1452 	}
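	/*
	 * Every offset pushed so far belongs to an argv[] string, so the
	 * number of int slots consumed at the top of the buffer is argc.
	 */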
1453 	argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1454 	args->arglen = args->stk_strp - args->stk_base;
1455 
1456 	/*
1457 	 * Add environ[] strings to the stack.
1458 	 */
1459 	if (envp != NULL) {
1460 		for (;;) {
1461 			if (stk_getptr(args, envp, &sp))
1462 				return (EFAULT);
1463 			if (sp == NULL)
1464 				break;
1465 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1466 				return (error);
1467 			envp += ptrsize;
1468 		}
1469 	}
1470 	args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1471 	args->ne = args->na - argc;
1472 
1473 	/*
1474 	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
1475 	 * AT_SUN_EMULATOR strings to the stack.
1476 	 */
1477 	if (auxvpp != NULL && *auxvpp != NULL) {
1478 		if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
1479 			return (error);
1480 		if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0)
1481 			return (error);
1482 		if (args->brandname != NULL &&
1483 		    (error = stk_add(args, args->brandname, UIO_SYSSPACE)) != 0)
1484 			return (error);
1485 		if (args->emulator != NULL &&
1486 		    (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)
1487 			return (error);
1488 	}
1489 
1490 	/*
1491 	 * Compute the size of the stack.  This includes all the pointers,
1492 	 * the space reserved for the aux vector, and all the strings.
1493 	 * The total number of pointers is args->na (which is argc + envc)
1494 	 * plus 4 more: (1) a pointer's worth of space for argc; (2) the NULL
1495 	 * after the last argument (i.e. argv[argc]); (3) the NULL after the
1496 	 * last environment variable (i.e. envp[envc]); and (4) the NULL after
1497 	 * all the strings, at the very top of the stack.
1498 	 */
1499 	size = (args->na + 4) * args->to_ptrsize + args->auxsize +
1500 	    (args->stk_strp - args->stk_base);
1501 
1502 	/*
1503 	 * Pad the string section with zeroes to align the stack size.
1504 	 */
1505 	pad = P2NPHASE(size, args->stk_align);
1506 
1507 	if (STK_AVAIL(args) < pad)
1508 		return (E2BIG);
1509 
1510 	args->usrstack_size = size + pad;
1511 
1512 	while (pad-- != 0)
1513 		*args->stk_strp++ = 0;
1514 
1515 	args->nc = args->stk_strp - args->stk_base;
1516 
1517 	return (0);
1518 }
1519 
1520 static int
1521 stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
1522 {
1523 	size_t ptrsize = args->to_ptrsize;
1524 	ssize_t pslen;
1525 	char *kstrp = args->stk_base;
1526 	char *ustrp = usrstack - args->nc - ptrsize;
1527 	char *usp = usrstack - args->usrstack_size;
1528 	int *offp = (int *)(args->stk_base + args->stk_size);
1529 	int envc = args->ne;
1530 	int argc = args->na - envc;
1531 	int i;
1532 
1533 	/*
1534 	 * Record argc for /proc.
1535 	 */
1536 	up->u_argc = argc;
1537 
1538 	/*
1539 	 * Put argc on the stack.  Note that even though it's an int,
1540 	 * it always consumes ptrsize bytes (for alignment).
1541 	 */
1542 	if (stk_putptr(args, usp, (char *)(uintptr_t)argc))
1543 		return (-1);
1544 
1545 	/*
1546 	 * Add argc space (ptrsize) to usp and record argv for /proc.
1547 	 */
1548 	up->u_argv = (uintptr_t)(usp += ptrsize);
1549 
1550 	/*
1551 	 * Put the argv[] pointers on the stack.
1552 	 */
1553 	for (i = 0; i < argc; i++, usp += ptrsize)
1554 		if (stk_putptr(args, usp, &ustrp[*--offp]))
1555 			return (-1);
1556 
1557 	/*
1558 	 * Copy arguments to u_psargs.
1559 	 */
1560 	pslen = MIN(args->arglen, PSARGSZ) - 1;
1561 	for (i = 0; i < pslen; i++)
1562 		up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]);
1563 	while (i < PSARGSZ)
1564 		up->u_psargs[i++] = '\0';
1565 
1566 	/*
1567 	 * Add space for argv[]'s NULL terminator (ptrsize) to usp and
1568 	 * record envp for /proc.
1569 	 */
1570 	up->u_envp = (uintptr_t)(usp += ptrsize);
1571 
1572 	/*
1573 	 * Put the envp[] pointers on the stack.
1574 	 */
1575 	for (i = 0; i < envc; i++, usp += ptrsize)
1576 		if (stk_putptr(args, usp, &ustrp[*--offp]))
1577 			return (-1);
1578 
1579 	/*
1580 	 * Add space for envp[]'s NULL terminator (ptrsize) to usp and
1581 	 * remember where the stack ends, which is also where auxv begins.
1582 	 */
1583 	args->stackend = usp += ptrsize;
1584 
1585 	/*
1586 	 * Put all the argv[], envp[], and auxv strings on the stack.
1587 	 */
1588 	if (copyout(args->stk_base, ustrp, args->nc))
1589 		return (-1);
1590 
1591 	/*
1592 	 * Fill in the aux vector now that we know the user stack addresses
1593 	 * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
1594 	 * AT_SUN_EMULATOR strings.
1595 	 */
1596 	if (auxvpp != NULL && *auxvpp != NULL) {
1597 		if (args->to_model == DATAMODEL_NATIVE) {
1598 			auxv_t **a = (auxv_t **)auxvpp;
1599 			ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp])
1600 			ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp])
1601 			if (args->brandname != NULL)
1602 				ADDAUX(*a,
1603 				    AT_SUN_BRANDNAME, (long)&ustrp[*--offp])
1604 			if (args->emulator != NULL)
1605 				ADDAUX(*a,
1606 				    AT_SUN_EMULATOR, (long)&ustrp[*--offp])
1607 		} else {
1608 			auxv32_t **a = (auxv32_t **)auxvpp;
1609 			ADDAUX(*a,
1610 			    AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp])
1611 			ADDAUX(*a,
1612 			    AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp])
1613 			if (args->brandname != NULL)
1614 				ADDAUX(*a, AT_SUN_BRANDNAME,
1615 				    (int)(uintptr_t)&ustrp[*--offp])
1616 			if (args->emulator != NULL)
1617 				ADDAUX(*a, AT_SUN_EMULATOR,
1618 				    (int)(uintptr_t)&ustrp[*--offp])
1619 		}
1620 	}
1621 
1622 	return (0);
1623 }
1624 
1625 /*
1626  * Initialize a new user stack with the specified arguments and environment.
1627  * The initial user stack layout is as follows:
1628  *
1629  *	User Stack
1630  *	+---------------+ <--- curproc->p_usrstack
1631  *	|		|
1632  *	| slew		|
1633  *	|		|
1634  *	+---------------+
1635  *	| NULL		|
1636  *	+---------------+
1637  *	|		|
1638  *	| auxv strings	|
1639  *	|		|
1640  *	+---------------+
1641  *	|		|
1642  *	| envp strings	|
1643  *	|		|
1644  *	+---------------+
1645  *	|		|
1646  *	| argv strings	|
1647  *	|		|
1648  *	+---------------+ <--- ustrp
1649  *	|		|
1650  *	| aux vector	|
1651  *	|		|
1652  *	+---------------+ <--- auxv
1653  *	| NULL		|
1654  *	+---------------+
1655  *	| envp[envc-1]	|
1656  *	+---------------+
1657  *	| ...		|
1658  *	+---------------+
1659  *	| envp[0]	|
1660  *	+---------------+ <--- envp[]
1661  *	| NULL		|
1662  *	+---------------+
1663  *	| argv[argc-1]	|
1664  *	+---------------+
1665  *	| ...		|
1666  *	+---------------+
1667  *	| argv[0]	|
1668  *	+---------------+ <--- argv[]
1669  *	| argc		|
1670  *	+---------------+ <--- stack base
1671  */
1672 int
1673 exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1674 {
1675 	size_t size;
1676 	int error;
1677 	proc_t *p = ttoproc(curthread);
1678 	user_t *up = PTOU(p);
1679 	char *usrstack;
1680 	rctl_entity_p_t e;
1681 	struct as *as;
1682 	extern int use_stk_lpg;
1683 	size_t sp_slew;
1684 
1685 	args->from_model = p->p_model;
1686 	if (p->p_model == DATAMODEL_NATIVE) {
1687 		args->from_ptrsize = sizeof (long);
1688 	} else {
1689 		args->from_ptrsize = sizeof (int32_t);
1690 	}
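	/*
	 * from_model/from_ptrsize describe the data model of the process
	 * doing the exec and are used to read its argv[]/envp[] pointers;
	 * to_model/to_ptrsize, set earlier by the object-file handler,
	 * describe the new executable image.  The two may differ, e.g. a
	 * 32-bit process exec()ing a 64-bit binary.
	 */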
1691 
1692 	if (args->to_model == DATAMODEL_NATIVE) {
1693 		args->to_ptrsize = sizeof (long);
1694 		args->ncargs = NCARGS;
1695 		args->stk_align = STACK_ALIGN;
1696 		if (args->addr32)
1697 			usrstack = (char *)USRSTACK64_32;
1698 		else
1699 			usrstack = (char *)USRSTACK;
1700 	} else {
1701 		args->to_ptrsize = sizeof (int32_t);
1702 		args->ncargs = NCARGS32;
1703 		args->stk_align = STACK_ALIGN32;
1704 		usrstack = (char *)USRSTACK32;
1705 	}
1706 
1707 	ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);
1708 
1709 #if defined(__sparc)
1710 	/*
1711 	 * Make sure user register windows are empty before
1712 	 * attempting to make a new stack.
1713 	 */
1714 	(void) flush_user_windows_to_stack(NULL);
1715 #endif
1716 
1717 	for (size = PAGESIZE; ; size *= 2) {
1718 		args->stk_size = size;
1719 		args->stk_base = kmem_alloc(size, KM_SLEEP);
1720 		args->stk_strp = args->stk_base;
1721 		args->stk_offp = (int *)(args->stk_base + size);
1722 		error = stk_copyin(uap, args, intp, auxvpp);
1723 		if (error == 0)
1724 			break;
1725 		kmem_free(args->stk_base, size);
1726 		if (error != E2BIG && error != ENAMETOOLONG)
1727 			return (error);
1728 		if (size >= args->ncargs)
1729 			return (E2BIG);
1730 	}
1731 
1732 	size = args->usrstack_size;
1733 
1734 	ASSERT(error == 0);
1735 	ASSERT(P2PHASE(size, args->stk_align) == 0);
1736 	ASSERT((ssize_t)STK_AVAIL(args) >= 0);
1737 
1738 	if (size > args->ncargs) {
1739 		kmem_free(args->stk_base, args->stk_size);
1740 		return (E2BIG);
1741 	}
1742 
1743 	/*
1744 	 * Leave only the current lwp and force the other lwps to exit.
1745 	 * If another lwp beat us to the punch by calling exit(), bail out.
1746 	 */
1747 	if ((error = exitlwps(0)) != 0) {
1748 		kmem_free(args->stk_base, args->stk_size);
1749 		return (error);
1750 	}
1751 
1752 	/*
1753 	 * Revoke any doors created by the process.
1754 	 */
1755 	if (p->p_door_list)
1756 		door_exit();
1757 
1758 	/*
1759 	 * Release schedctl data structures.
1760 	 */
1761 	if (p->p_pagep)
1762 		schedctl_proc_cleanup();
1763 
1764 	/*
1765 	 * Clean up any DTrace helpers for the process.
1766 	 */
1767 	if (p->p_dtrace_helpers != NULL) {
1768 		ASSERT(dtrace_helpers_cleanup != NULL);
1769 		(*dtrace_helpers_cleanup)();
1770 	}
1771 
1772 	mutex_enter(&p->p_lock);
1773 	/*
1774 	 * Cleanup the DTrace provider associated with this process.
1775 	 */
1776 	if (p->p_dtrace_probes) {
1777 		ASSERT(dtrace_fasttrap_exec_ptr != NULL);
1778 		dtrace_fasttrap_exec_ptr(p);
1779 	}
1780 	mutex_exit(&p->p_lock);
1781 
1782 	/*
1783 	 * discard the lwpchan cache.
1784 	 */
1785 	if (p->p_lcp != NULL)
1786 		lwpchan_destroy_cache(1);
1787 
1788 	/*
1789 	 * Delete the POSIX timers.
1790 	 */
1791 	if (p->p_itimer != NULL)
1792 		timer_exit();
1793 
1794 	if (audit_active)
1795 		audit_exec(args->stk_base, args->stk_base + args->arglen,
1796 		    args->na - args->ne, args->ne);
1797 
1798 	/*
1799 	 * Ensure that we don't change resource associations while we
1800 	 * change address spaces.
1801 	 */
1802 	mutex_enter(&p->p_lock);
1803 	pool_barrier_enter();
1804 	mutex_exit(&p->p_lock);
1805 
1806 	/*
1807 	 * Destroy the old address space and create a new one.
1808 	 * From here on, any errors are fatal to the exec()ing process.
1809 	 * On error we return -1, which means the caller must SIGKILL
1810 	 * the process.
1811 	 */
1812 	relvm();
1813 
1814 	mutex_enter(&p->p_lock);
1815 	pool_barrier_exit();
1816 	mutex_exit(&p->p_lock);
1817 
1818 	up->u_execsw = args->execswp;
1819 
1820 	p->p_brkbase = NULL;
1821 	p->p_brksize = 0;
1822 	p->p_brkpageszc = 0;
1823 	p->p_stksize = 0;
1824 	p->p_stkpageszc = 0;
1825 	p->p_model = args->to_model;
1826 	p->p_usrstack = usrstack;
1827 	p->p_stkprot = args->stk_prot;
1828 	p->p_datprot = args->dat_prot;
1829 
1830 	/*
 1831 	 * Reset resource controls so that all controls are again active and
 1832 	 * appropriate to the potentially new address model for the
 1833 	 * process.
1834 	 */
1835 	e.rcep_p.proc = p;
1836 	e.rcep_t = RCENTITY_PROCESS;
1837 	rctl_set_reset(p->p_rctls, p, &e);
1838 
1839 	/* Too early to call map_pgsz for the heap */
1840 	if (use_stk_lpg) {
1841 		p->p_stkpageszc = page_szc(map_pgsz(MAPPGSZ_STK, p, 0, 0, 0));
1842 	}
1843 
1844 	mutex_enter(&p->p_lock);
1845 	p->p_flag |= SAUTOLPG;	/* kernel controls page sizes */
1846 	mutex_exit(&p->p_lock);
1847 
1848 	/*
1849 	 * Some platforms may choose to randomize real stack start by adding a
1850 	 * small slew (not more than a few hundred bytes) to the top of the
1851 	 * stack. This helps avoid cache thrashing when identical processes
1852 	 * simultaneously share caches that don't provide enough associativity
 1853 	 * (e.g. sun4v systems). In this case stack slewing causes the same hot
 1854 	 * stack variables in different processes to live in different cache
 1855 	 * sets, increasing effective associativity.
1856 	 */
1857 	sp_slew = exec_get_spslew();
1858 	ASSERT(P2PHASE(sp_slew, args->stk_align) == 0);
1859 	exec_set_sp(size + sp_slew);
1860 
1861 	as = as_alloc();
1862 	p->p_as = as;
1863 	as->a_proc = p;
1864 	if (p->p_model == DATAMODEL_ILP32 || args->addr32)
1865 		as->a_userlimit = (caddr_t)USERLIMIT32;
1866 	(void) hat_setup(as->a_hat, HAT_ALLOC);
1867 	hat_join_srd(as->a_hat, args->ex_vp);
1868 
1869 	/*
1870 	 * Finally, write out the contents of the new stack.
1871 	 */
1872 	error = stk_copyout(args, usrstack - sp_slew, auxvpp, up);
1873 	kmem_free(args->stk_base, args->stk_size);
1874 	return (error);
1875 }
1876