xref: /titanic_50/usr/src/uts/common/os/exec.c (revision 841a5ea8d961db66071fae9753ca607abfe94491)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*	Copyright (c) 1988 AT&T	*/
29 /*	  All Rights Reserved  	*/
30 
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/systm.h>
36 #include <sys/signal.h>
37 #include <sys/cred_impl.h>
38 #include <sys/policy.h>
39 #include <sys/user.h>
40 #include <sys/errno.h>
41 #include <sys/file.h>
42 #include <sys/vfs.h>
43 #include <sys/vnode.h>
44 #include <sys/mman.h>
45 #include <sys/acct.h>
46 #include <sys/cpuvar.h>
47 #include <sys/proc.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/pathname.h>
51 #include <sys/vm.h>
52 #include <sys/vtrace.h>
53 #include <sys/exec.h>
54 #include <sys/exechdr.h>
55 #include <sys/kmem.h>
56 #include <sys/prsystm.h>
57 #include <sys/modctl.h>
58 #include <sys/vmparam.h>
59 #include <sys/schedctl.h>
60 #include <sys/utrap.h>
61 #include <sys/systeminfo.h>
62 #include <sys/stack.h>
63 #include <sys/rctl.h>
64 #include <sys/dtrace.h>
65 #include <sys/lwpchan_impl.h>
66 #include <sys/pool.h>
67 #include <sys/sdt.h>
68 
69 #include <c2/audit.h>
70 
71 #include <vm/hat.h>
72 #include <vm/anon.h>
73 #include <vm/as.h>
74 #include <vm/seg.h>
75 #include <vm/seg_vn.h>
76 
/*
 * Bits of the "privflags" value that execsetid() computes and gexec()
 * acts on.  Nothing in the visible part of this file sets PRIV_INCREASE;
 * gexec() only tests it.
 */
77 #define	PRIV_RESET		0x01	/* needs to reset privs */
78 #define	PRIV_SETID		0x02	/* needs to change uids */
79 #define	PRIV_SETUGID		0x04	/* is setuid/setgid/forced privs */
80 #define	PRIV_INCREASE		0x08	/* child runs with more privs */
81 
82 static int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *);
83 static int hold_execsw(struct execsw *);
84 
85 uint_t auxv_hwcap = 0;	/* auxv AT_SUN_HWCAP value; determined on the fly */
86 #if defined(_SYSCALL32_IMPL)
87 uint_t auxv_hwcap32 = 0;	/* 32-bit version of auxv_hwcap */
88 #endif
89 
90 #if defined(__i386) || defined(__amd64)
/* Called from exec_common() to drop a private LDT and reload the LDTR. */
91 extern void ldt_free(proc_t *p);
92 extern void ldt_load(void);
93 #endif
94 
/*
 * NOTE(review): not referenced in the visible part of this file;
 * presumably a tunable that disables large pages at exec -- confirm.
 */
95 int exec_lpg_disable = 0;
96 
/*
 * Process flags that gexec() clears before running the exec handler at
 * level 0 and puts back if the exec fails.
 */
97 #define	PSUIDFLAGS		(SNOCD|SUGID)
98 
99 /*
100  * exec() - wrapper around exece providing NULL environment pointer
101  */
102 int
103 exec(const char *fname, const char **argp)
104 {
105 	return (exece(fname, argp, NULL));
106 }
107 
/*
 * exece() - system call entry point layered on exec_common().
 *
 * Returns 0 when exec_common() reports success.  Any non-zero error code
 * is handed to set_errno() and that call's result is returned instead.
 */
int
exece(const char *fname, const char **argp, const char **envp)
{
	int rv = exec_common(fname, argp, envp);

	if (rv == 0)
		return (0);
	return (set_errno(rv));
}
119 
/*
 * exec_common() - body of the exec system call for the current lwp.
 *
 * fname is read with UIO_USERSPACE, i.e. it is a user-space path pointer;
 * fname, argp and envp are stored unmodified in the execa structure that
 * is handed to gexec().
 *
 * Returns 0 on success, otherwise an errno value:
 *	ENOTSUP	the caller is the /proc agent lwp
 *	ENOENT	lookuppn() succeeded but produced no vnode
 *	EACCES	the file lives in an attribute directory (see below)
 *	other	whatever secpolicy_basic_exec(), pn_get(), lookuppn() or
 *		gexec() returned
 *
 * Once prexecstart() has been called, every exit path restores the saved
 * signal mask and calls prexecend() with p_lock held.
 */
120 int
121 exec_common(const char *fname, const char **argp, const char **envp)
122 {
123 	vnode_t *vp = NULL, *dir = NULL, *tmpvp = NULL;
124 	proc_t *p = ttoproc(curthread);
125 	klwp_t *lwp = ttolwp(curthread);
126 	struct user *up = PTOU(p);
127 	long execsz;		/* temporary count of exec size */
128 	int i;
129 	int error;
130 	char exec_file[MAXCOMLEN+1];
131 	struct pathname pn;
132 	struct pathname resolvepn;
133 	struct uarg args;
134 	struct execa ua;
135 	k_sigset_t savedmask;
136 	lwpdir_t *lwpdir = NULL;
137 	lwpdir_t **tidhash;
138 	lwpdir_t *old_lwpdir = NULL;
139 	uint_t old_lwpdir_sz;
140 	lwpdir_t **old_tidhash;
141 	uint_t old_tidhash_sz;
	/* lep is assigned, and later used, only when lwpdir is allocated */
142 	lwpent_t *lep;
143 
144 	/*
145 	 * exec() is not supported for the /proc agent lwp.
146 	 */
147 	if (curthread == p->p_agenttp)
148 		return (ENOTSUP);
149 
150 	if ((error = secpolicy_basic_exec(CRED())) != 0)
151 		return (error);
152 
153 	/*
154 	 * Inform /proc that an exec() has started.
155 	 * Hold signals that are ignored by default so that we will
156 	 * not be interrupted by a signal that will be ignored after
157 	 * successful completion of gexec().
158 	 */
159 	mutex_enter(&p->p_lock);
160 	prexecstart();
161 	schedctl_finish_sigblock(curthread);
162 	savedmask = curthread->t_hold;
163 	sigorset(&curthread->t_hold, &ignoredefault);
164 	mutex_exit(&p->p_lock);
165 
166 	/*
167 	 * Look up path name and remember last component for later.
168 	 * To help coreadm expand its %d token, we attempt to save
169 	 * the directory containing the executable in p_execdir. The
170 	 * first call to lookuppn() may fail and return EINVAL because
171 	 * dirvpp is non-NULL. In that case, we make a second call to
172 	 * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
173 	 * but coreadm is allowed to expand %d to the empty string and
174 	 * there are other cases in which that failure may occur.
175 	 */
176 	if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
177 		goto out;
178 	pn_alloc(&resolvepn);
179 	if ((error = lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp)) != 0) {
180 		pn_free(&resolvepn);
181 		pn_free(&pn);
182 		if (error != EINVAL)
183 			goto out;
184 
		/* Retry from scratch without asking for the directory. */
185 		dir = NULL;
186 		if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
187 			goto out;
188 		pn_alloc(&resolvepn);
189 		if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
190 		    &vp)) != 0) {
191 			pn_free(&resolvepn);
192 			pn_free(&pn);
193 			goto out;
194 		}
195 	}
196 	if (vp == NULL) {
197 		if (dir != NULL)
198 			VN_RELE(dir);
199 		error = ENOENT;
200 		pn_free(&resolvepn);
201 		pn_free(&pn);
202 		goto out;
203 	}
204 
205 	/*
206 	 * We do not allow executing files in attribute directories.
207 	 * We test this by determining whether the resolved path
208 	 * contains a "/" when we're in an attribute directory;
209 	 * only if the pathname does not contain a "/" the resolved path
210 	 * points to a file in the current working (attribute) directory.
211 	 */
212 	if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
213 	    strchr(resolvepn.pn_path, '/') == NULL) {
214 		if (dir != NULL)
215 			VN_RELE(dir);
216 		error = EACCES;
217 		pn_free(&resolvepn);
218 		pn_free(&pn);
219 		VN_RELE(vp);
220 		goto out;
221 	}
222 
	/*
	 * exec_file is zero-filled first and at most MAXCOMLEN bytes are
	 * copied, so it is always NUL-terminated.
	 */
223 	bzero(exec_file, MAXCOMLEN+1);
224 	(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
225 	bzero(&args, sizeof (args));
226 	args.pathname = resolvepn.pn_path;
227 	/* don't free resolvepn until we are done with args */
228 	pn_free(&pn);
229 
230 	/*
231 	 * Specific exec handlers, or policies determined via
232 	 * /etc/system may override the historical default.
233 	 */
234 	args.stk_prot = PROT_ZFOD;
235 	args.dat_prot = PROT_ZFOD;
236 
237 	CPU_STATS_ADD_K(sys, sysexec, 1);
238 	DTRACE_PROC1(exec, char *, args.pathname);
239 
240 	ua.fname = fname;
241 	ua.argp = argp;
242 	ua.envp = envp;
243 
	/*
	 * On failure drop our vnode holds and the resolved path, then
	 * leave through "fail" so that the exec__failure probe fires.
	 */
244 	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
245 	    exec_file, p->p_cred)) != 0) {
246 		VN_RELE(vp);
247 		if (dir != NULL)
248 			VN_RELE(dir);
249 		pn_free(&resolvepn);
250 		goto fail;
251 	}
252 
253 	/*
254 	 * Free floating point registers (sun4u only)
255 	 */
256 	ASSERT(lwp != NULL);
257 	lwp_freeregs(lwp, 1);
258 
259 	/*
260 	 * Free device context
261 	 */
262 	if (curthread->t_ctx)
263 		freectx(curthread, 1);
264 
265 	/*
266 	 * Remember file name for accounting; clear any cached DTrace predicate.
267 	 */
268 	up->u_acflag &= ~AFORK;
269 	bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
270 	curthread->t_predcache = NULL;
271 
272 	/*
273 	 * Clear contract template state
274 	 */
275 	lwp_ctmpl_clear(lwp);
276 
277 	/*
278 	 * Save the directory in which we found the executable for expanding
279 	 * the %d token used in core file patterns.
280 	 */
	/*
	 * The pointer is swapped under p_lock; the previous directory is
	 * released only after the lock has been dropped.
	 */
281 	mutex_enter(&p->p_lock);
282 	tmpvp = p->p_execdir;
283 	p->p_execdir = dir;
284 	if (p->p_execdir != NULL)
285 		VN_HOLD(p->p_execdir);
286 	mutex_exit(&p->p_lock);
287 
288 	if (tmpvp != NULL)
289 		VN_RELE(tmpvp);
290 
291 	/*
292 	 * Reset stack state to the user stack, clear set of signals
293 	 * caught on the signal stack, and reset list of signals that
294 	 * restart system calls; the new program's environment should
295 	 * not be affected by detritus from the old program.  Any
296 	 * pending held signals remain held, so don't clear t_hold.
297 	 */
298 	mutex_enter(&p->p_lock);
299 	lwp->lwp_oldcontext = 0;
300 	lwp->lwp_ustack = 0;
301 	lwp->lwp_old_stk_ctl = 0;
302 	sigemptyset(&up->u_signodefer);
303 	sigemptyset(&up->u_sigonstack);
304 	sigemptyset(&up->u_sigresethand);
305 	lwp->lwp_sigaltstack.ss_sp = 0;
306 	lwp->lwp_sigaltstack.ss_size = 0;
307 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
308 
309 	/*
310 	 * Make saved resource limit == current resource limit.
311 	 */
312 	for (i = 0; i < RLIM_NLIMITS; i++) {
313 		/*CONSTCOND*/
314 		if (RLIM_SAVED(i)) {
315 			(void) rctl_rlimit_get(rctlproc_legacy[i], p,
316 			    &up->u_saved_rlimit[i]);
317 		}
318 	}
319 
320 	/*
321 	 * If the action was to catch the signal, then the action
322 	 * must be reset to SIG_DFL.
323 	 */
324 	sigdefault(p);
325 	p->p_flag &= ~(SNOWAIT|SJCTL);
326 	p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
327 	up->u_signal[SIGCLD - 1] = SIG_DFL;
328 
329 	/*
330 	 * Delete the dot4 sigqueues/signotifies.
331 	 */
332 	sigqfree(p);
333 
334 	mutex_exit(&p->p_lock);
335 
	/* Clear the profiling state under its own lock, p_pflock. */
336 	mutex_enter(&p->p_pflock);
337 	p->p_prof.pr_base = NULL;
338 	p->p_prof.pr_size = 0;
339 	p->p_prof.pr_off = 0;
340 	p->p_prof.pr_scale = 0;
341 	p->p_prof.pr_samples = 0;
342 	mutex_exit(&p->p_pflock);
343 
344 	ASSERT(curthread->t_schedctl == NULL);
345 
346 #if defined(__i386) || defined(__amd64)
347 	/* If the process uses a private LDT then change it to default */
348 	if (p->p_ldt)
349 		ldt_free(p);
350 #endif	/* __i386 || __amd64 */
351 
352 #if defined(__amd64)
353 	/*
354 	 * Make sure the process has the correct LDT descriptor for its data
355 	 * model.
356 	 */
357 	if (p->p_model == DATAMODEL_LP64)
358 		p->p_ldt_desc = ldt0_default64_desc;
359 	else
360 		p->p_ldt_desc = ldt0_default_desc;
361 
362 	/*
363 	 * Ensure the change of LDT is propagated into the LDTR.
364 	 */
365 	kpreempt_disable();
366 	ldt_load();
367 	kpreempt_enable();
368 #endif /* __amd64 */
369 
370 #if defined(__sparc)
371 	if (p->p_utraps != NULL)
372 		utrap_free(p);
373 #endif	/* __sparc */
374 
375 	/*
376 	 * Close all close-on-exec files.
377 	 */
378 	close_exec(P_FINFO(p));
379 	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
380 	setregs(&args);
381 
382 	/* Mark this as an executable vnode */
383 	mutex_enter(&vp->v_lock);
384 	vp->v_flag |= VVMEXEC;
385 	mutex_exit(&vp->v_lock);
386 
	/* args is no longer needed: drop the lookup holds and the path. */
387 	VN_RELE(vp);
388 	if (dir != NULL)
389 		VN_RELE(dir);
390 	pn_free(&resolvepn);
391 
392 	/*
393 	 * Allocate a new lwp directory and lwpid hash table if necessary.
394 	 */
	/*
	 * Nothing is allocated when this thread already has tid 1 and the
	 * directory already has exactly 2 slots.  The KM_SLEEP allocations
	 * are made here, before p_lock is taken below.
	 */
395 	if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
396 		lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
397 		lwpdir->ld_next = lwpdir + 1;
398 		tidhash = kmem_zalloc(2 * sizeof (lwpdir_t *), KM_SLEEP);
399 		if (p->p_lwpdir != NULL)
400 			lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
401 		else
402 			lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
403 	}
404 
405 	mutex_enter(&p->p_lock);
406 	prbarrier(p);
407 
408 	/*
409 	 * Reset lwp id to the default value of 1.
410 	 * This is a single-threaded process now
411 	 * and lwp #1 is lwp_wait()able by default.
412 	 * The t_unpark flag should not be inherited.
413 	 */
414 	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
415 	curthread->t_tid = 1;
416 	curthread->t_unpark = 0;
417 	curthread->t_proc_flag |= TP_TWAIT;
418 	curthread->t_proc_flag &= ~TP_DAEMON;	/* daemons shouldn't exec */
419 	p->p_lwpdaemon = 0;			/* but oh well ... */
420 	p->p_lwpid = 1;
421 
422 	/*
423 	 * Install the newly-allocated lwp directory and lwpid hash table
424 	 * and insert the current thread into the new hash table.
425 	 */
426 	if (lwpdir != NULL) {
427 		old_lwpdir = p->p_lwpdir;
428 		old_lwpdir_sz = p->p_lwpdir_sz;
429 		old_tidhash = p->p_tidhash;
430 		old_tidhash_sz = p->p_tidhash_sz;
431 		p->p_lwpdir = p->p_lwpfree = lwpdir;
432 		p->p_lwpdir_sz = 2;
433 		p->p_tidhash = tidhash;
434 		p->p_tidhash_sz = 2;
435 		lep->le_thread = curthread;
436 		lep->le_lwpid = curthread->t_tid;
437 		lep->le_start = curthread->t_start;
438 		lwp_hash_in(p, lep);
439 	}
440 	/*
441 	 * Restore the saved signal mask and
442 	 * inform /proc that the exec() has finished.
443 	 */
444 	curthread->t_hold = savedmask;
445 	prexecend();
446 	mutex_exit(&p->p_lock);
	/* The replaced tables are freed after p_lock has been dropped. */
447 	if (old_lwpdir) {
448 		kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
449 		kmem_free(old_tidhash, old_tidhash_sz * sizeof (lwpdir_t *));
450 	}
451 	ASSERT(error == 0);
452 	DTRACE_PROC(exec__success);
453 	return (0);
454 
/*
 * "fail" is reached only after the exec DTrace probe has fired (gexec()
 * failure); lookup failures jump straight to "out".
 */
455 fail:
456 	DTRACE_PROC1(exec__failure, int, error);
457 out:		/* error return */
458 	mutex_enter(&p->p_lock);
459 	curthread->t_hold = savedmask;
460 	prexecend();
461 	mutex_exit(&p->p_lock);
462 	ASSERT(error != 0);
463 	return (error);
464 }
465 
466 
467 /*
468  * Perform generic exec duties and switchout to object-file specific
469  * handler.
470  */
/*
 * vpp		in/out vnode of the file; it is passed to VOP_OPEN() and
 *		re-read afterwards.
 * uap, args, idatap, execsz, exec_file
 *		passed through to the handler's exec_func; exec_file is
 *		also used in the "setuid execution not allowed" message.
 * level	set-id processing, credential installation and the
 *		SNOCD/SUGID handling happen only when level is 0.
 *		NOTE(review): presumably level > 0 means a nested call made
 *		on behalf of an interpreter -- confirm.
 * cred		credential to run with; when execsetid() reports any flag
 *		at level 0 a private copy is made with crdup() and modified.
 *
 * Returns 0 on success or an errno value.  ENOEXEC is returned when a
 * step fails without setting an error: a short read of the header or no
 * execsw entry for it.
 */
471 int
472 gexec(
473 	struct vnode **vpp,
474 	struct execa *uap,
475 	struct uarg *args,
476 	struct intpdata *idatap,
477 	int level,
478 	long *execsz,
479 	caddr_t exec_file,
480 	struct cred *cred)
481 {
482 	struct vnode *vp;
483 	proc_t *pp = ttoproc(curthread);
484 	struct execsw *eswp;
485 	int error = 0;
486 	int suidflags = 0;
487 	ssize_t resid;
488 	uid_t uid, gid;
489 	struct vattr vattr;
490 	char magbuf[MAGIC_BYTES];
491 	int setid;
492 	cred_t *oldcred, *newcred = NULL;
493 	int privflags = 0;
494 
495 	/*
496 	 * If the SNOCD or SUGID flag is set, turn it off and remember the
497 	 * previous setting so we can restore it if we encounter an error.
498 	 */
499 	if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
500 		mutex_enter(&pp->p_lock);
501 		suidflags = pp->p_flag & PSUIDFLAGS;
502 		pp->p_flag &= ~PSUIDFLAGS;
503 		mutex_exit(&pp->p_lock);
504 	}
505 
506 	if ((error = execpermissions(*vpp, &vattr, args)) != 0)
507 		goto bad;
508 
509 	/* need to open vnode for stateful file systems like rfs */
510 	if ((error = VOP_OPEN(vpp, FREAD, CRED())) != 0)
511 		goto bad;
512 	vp = *vpp;
513 
514 	/*
515 	 * Note: to support binary compatibility with SunOS a.out
516 	 * executables, we read in the first four bytes, as the
517 	 * magic number is in bytes 2-3.
518 	 */
519 	if (error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
520 	    (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid))
521 		goto bad;
	/* short read: error is still 0 here, so "bad" reports ENOEXEC */
522 	if (resid != 0)
523 		goto bad;
524 
	/*
	 * On success findexec_by_hdr() returns with eswp->exec_lock held
	 * as reader (taken in hold_execsw()); it is dropped right after
	 * the exec_func call below.  No path between here and that call
	 * jumps to "bad".
	 */
525 	if ((eswp = findexec_by_hdr(magbuf)) == NULL)
526 		goto bad;
527 
528 	if (level == 0 &&
529 	    (privflags = execsetid(vp, &vattr, &uid, &gid)) != 0) {
530 
		/* from here on "cred" is our private copy */
531 		newcred = cred = crdup(cred);
532 
533 		/* If we can, drop the PA bit */
534 		if ((privflags & PRIV_RESET) != 0)
535 			priv_adjust_PA(cred);
536 
537 		if (privflags & PRIV_SETID) {
538 			cred->cr_uid = uid;
539 			cred->cr_gid = gid;
540 			cred->cr_suid = uid;
541 			cred->cr_sgid = gid;
542 		}
543 
544 		/*
545 		 * Implement the privilege updates:
546 		 *
547 		 * Restrict with L:
548 		 *
549 		 *	I' = I & L
550 		 *
551 		 *	E' = P' = (I' + F) & A
552 		 *
553 		 * But if running under ptrace, we cap I with P.
554 		 */
555 		if ((privflags & PRIV_RESET) != 0) {
556 			if ((privflags & PRIV_INCREASE) != 0 &&
557 			    (pp->p_proc_flag & P_PR_PTRACE) != 0)
558 				priv_intersect(&CR_OPPRIV(cred),
559 						    &CR_IPRIV(cred));
560 			priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
561 			CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
562 			priv_adjust_PA(cred);
563 		}
564 	}
565 
566 	/* SunOS 4.x buy-back */
	/* Only a note is logged here; the exec itself continues. */
567 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
568 	    (vattr.va_mode & (VSUID|VSGID))) {
569 		cmn_err(CE_NOTE,
570 		    "!%s, uid %d: setuid execution not allowed, dev=%lx",
571 		    exec_file, cred->cr_uid, vp->v_vfsp->vfs_dev);
572 	}
573 
574 	/*
575 	 * execsetid() told us whether or not we had to change the
576 	 * credentials of the process.  In privflags, it told us
577 	 * whether we gained any privileges or executed a set-uid executable.
578 	 */
579 	setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE));
580 
581 	/*
582 	 * Use /etc/system variable to determine if the stack
583 	 * should be marked as executable by default.
584 	 */
585 	if (noexec_user_stack)
586 		args->stk_prot &= ~PROT_EXEC;
587 
588 	args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */
589 
590 	/*
591 	 * Traditionally, the setid flags told the sub processes whether
592 	 * the file just executed was set-uid or set-gid; this caused
593 	 * some confusion as the 'setid' flag did not match the SUGID
594 	 * process flag which is only set when the uids/gids do not match.
595 	 * A script set-gid/set-uid to the real uid/gid would start with
596 	 * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
597 	 * Now we flag those cases where the calling process cannot
598 	 * be trusted to influence the newly exec'ed process, either
599 	 * because it runs with more privileges or when the uids/gids
600 	 * do in fact not match.
601 	 * This also makes the runtime linker agree with the on exec
602 	 * values of SNOCD and SUGID.
603 	 */
604 	error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
605 		(setid & PRIV_INCREASE) != 0 ||
606 		cred->cr_uid != cred->cr_ruid ||
607 		(cred->cr_rgid != cred->cr_gid &&
608 		!supgroupmember(cred->cr_gid, cred)), exec_file, cred);
609 	rw_exit(eswp->exec_lock);
610 	if (error != 0) {
		/* the private credential copy was never installed */
611 		if (newcred != NULL)
612 			crfree(newcred);
613 		goto bad;
614 	}
615 
616 	if (level == 0) {
617 		mutex_enter(&pp->p_crlock);
618 		if (newcred != NULL) {
619 			/*
620 			 * Free the old credentials, and set the new ones.
621 			 * Do this for both the process and the (single) thread.
622 			 */
623 			crfree(pp->p_cred);
624 			pp->p_cred = cred;	/* cred already held for proc */
625 			crhold(cred);		/* hold new cred for thread */
626 			/*
627 			 * DTrace accesses t_cred in probe context.  t_cred
628 			 * must always be either NULL, or point to a valid,
629 			 * allocated cred structure.
630 			 */
631 			oldcred = curthread->t_cred;
632 			curthread->t_cred = cred;
633 			crfree(oldcred);
634 		}
635 		/*
636 		 * On emerging from a successful exec(), the saved
637 		 * uid and gid equal the effective uid and gid.
638 		 */
639 		cred->cr_suid = cred->cr_uid;
640 		cred->cr_sgid = cred->cr_gid;
641 
642 		/*
643 		 * If the real and effective ids do not match, this
644 		 * is a setuid process that should not dump core.
645 		 * The group comparison is tricky; we prevent the code
646 		 * from flagging SNOCD when executing with an effective gid
647 		 * which is a supplementary group.
648 		 */
		/*
		 * The value saved at entry is discarded here: after a
		 * successful exec the flags depend only on the new ids.
		 */
649 		if (cred->cr_ruid != cred->cr_uid ||
650 		    (cred->cr_rgid != cred->cr_gid &&
651 		    !supgroupmember(cred->cr_gid, cred)) ||
652 		    (privflags & PRIV_INCREASE) != 0)
653 			suidflags = PSUIDFLAGS;
654 		else
655 			suidflags = 0;
656 
657 		mutex_exit(&pp->p_crlock);
658 		if (suidflags) {
659 			mutex_enter(&pp->p_lock);
660 			pp->p_flag |= suidflags;
661 			mutex_exit(&pp->p_lock);
662 		}
663 		if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
664 			/*
665 			 * If process is traced via /proc, arrange to
666 			 * invalidate the associated /proc vnode.
667 			 */
668 			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
669 				args->traceinval = 1;
670 		}
671 		if (pp->p_proc_flag & P_PR_PTRACE)
672 			psignal(pp, SIGTRAP);
673 		if (args->traceinval)
674 			prinvalidate(&pp->p_user);
675 	}
676 
677 	return (0);
678 bad:
679 	if (error == 0)
680 		error = ENOEXEC;
681 
	/* put back the SNOCD/SUGID bits that were cleared at entry */
682 	if (suidflags) {
683 		mutex_enter(&pp->p_lock);
684 		pp->p_flag |= suidflags;
685 		mutex_exit(&pp->p_lock);
686 	}
687 	return (error);
688 }
689 
690 extern char *execswnames[];
691 
692 struct execsw *
693 allocate_execsw(char *name, char *magic, size_t magic_size)
694 {
695 	int i, j;
696 	char *ename;
697 	char *magicp;
698 
699 	mutex_enter(&execsw_lock);
700 	for (i = 0; i < nexectype; i++) {
701 		if (execswnames[i] == NULL) {
702 			ename = kmem_alloc(strlen(name) + 1, KM_SLEEP);
703 			(void) strcpy(ename, name);
704 			execswnames[i] = ename;
705 			/*
706 			 * Set the magic number last so that we
707 			 * don't need to hold the execsw_lock in
708 			 * findexectype().
709 			 */
710 			magicp = kmem_alloc(magic_size, KM_SLEEP);
711 			for (j = 0; j < magic_size; j++)
712 				magicp[j] = magic[j];
713 			execsw[i].exec_magic = magicp;
714 			mutex_exit(&execsw_lock);
715 			return (&execsw[i]);
716 		}
717 	}
718 	mutex_exit(&execsw_lock);
719 	return (NULL);
720 }
721 
722 /*
723  * Find the exec switch table entry with the corresponding magic string.
724  */
725 struct execsw *
726 findexecsw(char *magic)
727 {
728 	struct execsw *eswp;
729 
730 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
731 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
732 		if (magic && eswp->exec_maglen != 0 &&
733 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0)
734 			return (eswp);
735 	}
736 	return (NULL);
737 }
738 
739 /*
740  * Find the execsw[] index for the given exec header string by looking for the
741  * magic string at a specified offset and length for each kind of executable
742  * file format until one matches.  If no execsw[] entry is found, try to
743  * autoload a module for this magic string.
744  */
745 struct execsw *
746 findexec_by_hdr(char *header)
747 {
748 	struct execsw *eswp;
749 
750 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
751 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
752 		if (header && eswp->exec_maglen != 0 &&
753 		    bcmp(&header[eswp->exec_magoff], eswp->exec_magic,
754 			    eswp->exec_maglen) == 0) {
755 			if (hold_execsw(eswp) != 0)
756 				return (NULL);
757 			return (eswp);
758 		}
759 	}
760 	return (NULL);	/* couldn't find the type */
761 }
762 
763 /*
764  * Find the execsw[] index for the given magic string.  If no execsw[] entry
765  * is found, try to autoload a module for this magic string.
766  */
767 struct execsw *
768 findexec_by_magic(char *magic)
769 {
770 	struct execsw *eswp;
771 
772 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
773 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
774 		if (magic && eswp->exec_maglen != 0 &&
775 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
776 			if (hold_execsw(eswp) != 0)
777 				return (NULL);
778 			return (eswp);
779 		}
780 	}
781 	return (NULL);	/* couldn't find the type */
782 }
783 
784 static int
785 hold_execsw(struct execsw *eswp)
786 {
787 	char *name;
788 
789 	rw_enter(eswp->exec_lock, RW_READER);
790 	while (!LOADED_EXEC(eswp)) {
791 		rw_exit(eswp->exec_lock);
792 		name = execswnames[eswp-execsw];
793 		ASSERT(name);
794 		if (modload("exec", name) == -1)
795 			return (-1);
796 		rw_enter(eswp->exec_lock, RW_READER);
797 	}
798 	return (0);
799 }
800 
801 static int
802 execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp)
803 {
804 	proc_t *pp = ttoproc(curthread);
805 	uid_t uid, gid;
806 	cred_t *cr = pp->p_cred;
807 	int privflags = 0;
808 
809 	/*
810 	 * Remember credentials.
811 	 */
812 	uid = cr->cr_uid;
813 	gid = cr->cr_gid;
814 
815 	/* Will try to reset the PRIV_AWARE bit later. */
816 	if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
817 		privflags |= PRIV_RESET;
818 
819 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
820 		/*
821 		 * Set-uid root execution only allowed if the limit set
822 		 * holds all unsafe privileges.
823 		 */
824 		if ((vattrp->va_mode & VSUID) && (vattrp->va_uid != 0 ||
825 		    priv_issubset(&priv_unsafe, &CR_LPRIV(cr)))) {
826 			uid = vattrp->va_uid;
827 			privflags |= PRIV_SETUGID;
828 		}
829 		if (vattrp->va_mode & VSGID) {
830 			gid = vattrp->va_gid;
831 			privflags |= PRIV_SETUGID;
832 		}
833 	}
834 
835 	/*
836 	 * Do we need to change our credential anyway?
837 	 * This is the case when E != I or P != I, as
838 	 * we need to do the assignments (with F empty and A full)
839 	 * Or when I is not a subset of L; in that case we need to
840 	 * enforce L.
841 	 *
842 	 *		I' = L & I
843 	 *
844 	 *		E' = P' = (I' + F) & A
845 	 * or
846 	 *		E' = P' = I'
847 	 */
848 	if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
849 	    !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
850 	    !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
851 		privflags |= PRIV_RESET;
852 
853 	/*
854 	 * When we introduce the "forced" set then we will need
855 	 * to set PRIV_INCREASE here if I not a subset of P.
856 	 * If the "allowed" set is introduced we will need to do
857 	 * a similar thing; however, it seems more reasonable to
858 	 * have the allowed set reduce "L": script language interpreters
859 	 * would typically have an allowed set of "all".
860 	 */
861 
862 	/*
863 	 * Set setuid/setgid protections if no ptrace() compatibility.
864 	 * For privileged processes, honor setuid/setgid even in
865 	 * the presence of ptrace() compatibility.
866 	 */
867 	if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
868 	    PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
869 	    (cr->cr_uid != uid ||
870 	    cr->cr_gid != gid ||
871 	    cr->cr_suid != uid ||
872 	    cr->cr_sgid != gid)) {
873 		*uidp = uid;
874 		*gidp = gid;
875 		privflags |= PRIV_SETID;
876 	}
877 	return (privflags);
878 }
879 
880 int
881 execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
882 {
883 	int error;
884 	proc_t *p = ttoproc(curthread);
885 
886 	vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
887 	if (error = VOP_GETATTR(vp, vattrp, ATTR_EXEC, p->p_cred))
888 		return (error);
889 	/*
890 	 * Check the access mode.
891 	 * If VPROC, ask /proc if the file is an object file.
892 	 */
893 	if ((error = VOP_ACCESS(vp, VEXEC, 0, p->p_cred)) != 0 ||
894 	    !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
895 	    (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
896 	    (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
897 		if (error == 0)
898 			error = EACCES;
899 		return (error);
900 	}
901 
902 	if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
903 	    (error = VOP_ACCESS(vp, VREAD, 0, p->p_cred))) {
904 		/*
905 		 * If process is under ptrace(2) compatibility,
906 		 * fail the exec(2).
907 		 */
908 		if (p->p_proc_flag & P_PR_PTRACE)
909 			goto bad;
910 		/*
911 		 * Process is traced via /proc.
912 		 * Arrange to invalidate the /proc vnode.
913 		 */
914 		args->traceinval = 1;
915 	}
916 	return (0);
917 bad:
918 	if (error == 0)
919 		error = ENOEXEC;
920 	return (error);
921 }
922 
923 /*
924  * Map a section of an executable file into the user's
925  * address space.
926  */
/*
 * vp, offset	file and file offset of the section
 * addr, len	user address and length of the file-backed part; addr and
 *		offset are rounded down to a page boundary and len is
 *		grown by the amount addr was moved
 * zfodlen	number of zero-filled bytes wanted after the file-backed part
 * prot		protections for the section
 * page		non-zero: map the file with VOP_MAP();
 *		zero: create zero-fill memory with as_map() and read the
 *		file contents into it with vn_rdwr()
 * szc		stored in the segvn creation arguments of the zero-fill
 *		mapping only
 *
 * Returns 0 on success, ENOMEM when valid_usr_range() rejects a range,
 * EFAULT when zeroing the tail of the last file page faults, or the error
 * from VOP_MAP(), as_map() or vn_rdwr().
 */
927 int
928 execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
929     off_t offset, int prot, int page, uint_t szc)
930 {
931 	int error = 0;
932 	off_t oldoffset;
933 	caddr_t zfodbase, oldaddr;
934 	size_t end, oldlen;
935 	size_t zfoddiff;
936 	label_t ljb;
937 	proc_t *p = ttoproc(curthread);
938 
	/* keep the caller's values; the non-paged path reads to oldaddr */
939 	oldaddr = addr;
940 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
941 	if (len) {
942 		oldlen = len;
943 		len += ((size_t)oldaddr - (size_t)addr);
944 		oldoffset = offset;
945 		offset = (off_t)((uintptr_t)offset & PAGEMASK);
946 		if (page) {
947 			spgcnt_t  prefltmem, availm, npages;
948 			int preread;
949 			uint_t mflag = MAP_PRIVATE | MAP_FIXED;
950 
			/* executable but not writable counts as text */
951 			if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
952 				mflag |= MAP_TEXT;
953 			} else {
954 				mflag |= MAP_INITDATA;
955 			}
956 
957 			if (valid_usr_range(addr, len, prot, p->p_as,
958 			    p->p_as->a_userlimit) != RANGE_OKAY) {
959 				error = ENOMEM;
960 				goto bad;
961 			}
962 			if (error = VOP_MAP(vp, (offset_t)offset,
963 			    p->p_as, &addr, len, prot, PROT_ALL,
964 			    mflag, CRED()))
965 				goto bad;
966 
967 			/*
968 			 * If the segment can fit, then we prefault
969 			 * the entire segment in.  This is based on the
970 			 * model that says the best working set of a
971 			 * small program is all of its pages.
972 			 */
973 			npages = (spgcnt_t)btopr(len);
974 			prefltmem = freemem - desfree;
975 			preread =
976 			    (npages < prefltmem && len < PGTHRESH) ? 1 : 0;
977 
978 			/*
979 			 * If we aren't prefaulting the segment,
980 			 * increment "deficit", if necessary to ensure
981 			 * that pages will become available when this
982 			 * process starts executing.
983 			 */
984 			availm = freemem - lotsfree;
985 			if (preread == 0 && npages > availm &&
986 			    deficit < lotsfree) {
987 				deficit += MIN((pgcnt_t)(npages - availm),
988 				    lotsfree - deficit);
989 			}
990 
991 			if (preread) {
992 				TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
993 				    "execmap preread:freemem %d size %lu",
994 				    freemem, len);
				/* best effort: the result is ignored */
995 				(void) as_fault(p->p_as->a_hat, p->p_as,
996 				    (caddr_t)addr, len, F_INVAL, S_READ);
997 			}
998 		} else {
999 			if (valid_usr_range(addr, len, prot, p->p_as,
1000 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1001 				error = ENOMEM;
1002 				goto bad;
1003 			}
1004 
1005 			if (error = as_map(p->p_as, addr, len,
1006 			    segvn_create, zfod_argsp))
1007 				goto bad;
1008 			/*
1009 			 * Read in the segment in one big chunk.
1010 			 */
			/* uses the caller's unrounded address/length/offset */
1011 			if (error = vn_rdwr(UIO_READ, vp, (caddr_t)oldaddr,
1012 			    oldlen, (offset_t)oldoffset, UIO_USERSPACE, 0,
1013 			    (rlim64_t)0, CRED(), (ssize_t *)0))
1014 				goto bad;
1015 			/*
1016 			 * Now set protections.
1017 			 */
1018 			if (prot != PROT_ZFOD) {
1019 				(void) as_setprot(p->p_as, (caddr_t)addr,
1020 				    len, prot);
1021 			}
1022 		}
1023 	}
1024 
1025 	if (zfodlen) {
		/*
		 * end is the first byte past the file-backed part;
		 * zfoddiff is the distance from there to the next page
		 * boundary, which is cleared in place with uzero().
		 */
1026 		end = (size_t)addr + len;
1027 		zfodbase = (caddr_t)roundup(end, PAGESIZE);
1028 		zfoddiff = (uintptr_t)zfodbase - end;
1029 		if (zfoddiff) {
1030 			if (on_fault(&ljb)) {
1031 				no_fault();
1032 				error = EFAULT;
1033 				goto bad;
1034 			}
1035 			uzero((void *)end, zfoddiff);
1036 			no_fault();
1037 		}
		/* whatever is left needs whole zero-fill pages */
1038 		if (zfodlen > zfoddiff) {
1039 			struct segvn_crargs crargs =
1040 			    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
1041 
1042 			zfodlen -= zfoddiff;
1043 			if (valid_usr_range(zfodbase, zfodlen, prot, p->p_as,
1044 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1045 				error = ENOMEM;
1046 				goto bad;
1047 			}
1048 			crargs.szc = szc;
1049 			if (error = as_map(p->p_as, (caddr_t)zfodbase,
1050 			    zfodlen, segvn_create, &crargs))
1051 				goto bad;
1052 			if (prot != PROT_ZFOD) {
1053 				(void) as_setprot(p->p_as, (caddr_t)zfodbase,
1054 				    zfodlen, prot);
1055 			}
1056 		}
1057 	}
1058 	return (0);
1059 bad:
1060 	return (error);
1061 }
1062 
1063 void
1064 setexecenv(struct execenv *ep)
1065 {
1066 	proc_t *p = ttoproc(curthread);
1067 	klwp_t *lwp = ttolwp(curthread);
1068 	struct vnode *vp;
1069 
1070 	p->p_bssbase = ep->ex_bssbase;
1071 	p->p_brkbase = ep->ex_brkbase;
1072 	p->p_brksize = ep->ex_brksize;
1073 	if (p->p_exec)
1074 		VN_RELE(p->p_exec);	/* out with the old */
1075 	vp = p->p_exec = ep->ex_vp;
1076 	if (vp != NULL)
1077 		VN_HOLD(vp);		/* in with the new */
1078 
1079 	lwp->lwp_sigaltstack.ss_sp = 0;
1080 	lwp->lwp_sigaltstack.ss_size = 0;
1081 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
1082 }
1083 
1084 int
1085 execopen(struct vnode **vpp, int *fdp)
1086 {
1087 	struct vnode *vp = *vpp;
1088 	file_t *fp;
1089 	int error = 0;
1090 	int filemode = FREAD;
1091 
1092 	VN_HOLD(vp);		/* open reference */
1093 	if (error = falloc(NULL, filemode, &fp, fdp)) {
1094 		VN_RELE(vp);
1095 		*fdp = -1;	/* just in case falloc changed value */
1096 		return (error);
1097 	}
1098 	if (error = VOP_OPEN(&vp, filemode, CRED())) {
1099 		VN_RELE(vp);
1100 		setf(*fdp, NULL);
1101 		unfalloc(fp);
1102 		*fdp = -1;
1103 		return (error);
1104 	}
1105 	*vpp = vp;		/* vnode should not have changed */
1106 	fp->f_vnode = vp;
1107 	mutex_exit(&fp->f_tlock);
1108 	setf(*fdp, fp);
1109 	return (0);
1110 }
1111 
1112 int
1113 execclose(int fd)
1114 {
1115 	return (closeandsetf(fd, NULL));
1116 }
1117 
1118 
1119 /*
1120  * noexec stub function.
1121  */
1122 /*ARGSUSED*/
1123 int
1124 noexec(
1125     struct vnode *vp,
1126     struct execa *uap,
1127     struct uarg *args,
1128     struct intpdata *idatap,
1129     int level,
1130     long *execsz,
1131     int setid,
1132     caddr_t exec_file,
1133     struct cred *cred)
1134 {
1135 	cmn_err(CE_WARN, "missing exec capability for %s", uap->fname);
1136 	return (ENOEXEC);
1137 }
1138 
1139 /*
1140  * Support routines for building a user stack.
1141  *
1142  * execve(path, argv, envp) must construct a new stack with the specified
1143  * arguments and environment variables (see exec_args() for a description
1144  * of the user stack layout).  To do this, we copy the arguments and
1145  * environment variables from the old user address space into the kernel,
1146  * free the old as, create the new as, and copy our buffered information
1147  * to the new stack.  Our kernel buffer has the following structure:
1148  *
1149  *	+-----------------------+ <--- stk_base + stk_size
1150  *	| string offsets	|
1151  *	+-----------------------+ <--- stk_offp
1152  *	|			|
1153  *	| STK_AVAIL() space	|
1154  *	|			|
1155  *	+-----------------------+ <--- stk_strp
1156  *	| strings		|
1157  *	+-----------------------+ <--- stk_base
1158  *
1159  * When we add a string, we store the string's contents (including the null
1160  * terminator) at stk_strp, and we store the offset of the string relative to
1161  * stk_base at --stk_offp.  As strings are added, stk_strp increases and
1162  * stk_offp decreases.  The amount of space remaining, STK_AVAIL(), is just
1163  * the difference between these pointers.  If we run out of space, we return
1164  * an error and exec_args() starts all over again with a buffer twice as large.
1165  * When we're all done, the kernel buffer looks like this:
1166  *
1167  *	+-----------------------+ <--- stk_base + stk_size
1168  *	| argv[0] offset	|
1169  *	+-----------------------+
1170  *	| ...			|
1171  *	+-----------------------+
1172  *	| argv[argc-1] offset	|
1173  *	+-----------------------+
1174  *	| envp[0] offset	|
1175  *	+-----------------------+
1176  *	| ...			|
1177  *	+-----------------------+
1178  *	| envp[envc-1] offset	|
1179  *	+-----------------------+
1180  *	| AT_SUN_PLATFORM offset|
1181  *	+-----------------------+
1182  *	| AT_SUN_EXECNAME offset|
1183  *	+-----------------------+ <--- stk_offp
1184  *	|			|
1185  *	| STK_AVAIL() space	|
1186  *	|			|
1187  *	+-----------------------+ <--- stk_strp
1188  *	| AT_SUN_EXECNAME string|
1189  *	+-----------------------+
1190  *	| AT_SUN_PLATFORM string|
1191  *	+-----------------------+
1192  *	| envp[envc-1] string	|
1193  *	+-----------------------+
1194  *	| ...			|
1195  *	+-----------------------+
1196  *	| envp[0] string	|
1197  *	+-----------------------+
1198  *	| argv[argc-1] string	|
1199  *	+-----------------------+
1200  *	| ...			|
1201  *	+-----------------------+
1202  *	| argv[0] string	|
1203  *	+-----------------------+ <--- stk_base
1204  */
1205 
1206 #define	STK_AVAIL(args)		((char *)(args)->stk_offp - (args)->stk_strp)
1207 
1208 /*
1209  * Add a string to the stack.
1210  */
1211 static int
1212 stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
1213 {
1214 	int error;
1215 	size_t len;
1216 
1217 	if (STK_AVAIL(args) < sizeof (int))
1218 		return (E2BIG);
1219 	*--args->stk_offp = args->stk_strp - args->stk_base;
1220 
1221 	if (segflg == UIO_USERSPACE) {
1222 		error = copyinstr(sp, args->stk_strp, STK_AVAIL(args), &len);
1223 		if (error != 0)
1224 			return (error);
1225 	} else {
1226 		len = strlen(sp) + 1;
1227 		if (len > STK_AVAIL(args))
1228 			return (E2BIG);
1229 		bcopy(sp, args->stk_strp, len);
1230 	}
1231 
1232 	args->stk_strp += len;
1233 
1234 	return (0);
1235 }
1236 
1237 static int
1238 stk_getptr(uarg_t *args, char *src, char **dst)
1239 {
1240 	int error;
1241 
1242 	if (args->from_model == DATAMODEL_NATIVE) {
1243 		ulong_t ptr;
1244 		error = fulword(src, &ptr);
1245 		*dst = (caddr_t)ptr;
1246 	} else {
1247 		uint32_t ptr;
1248 		error = fuword32(src, &ptr);
1249 		*dst = (caddr_t)(uintptr_t)ptr;
1250 	}
1251 	return (error);
1252 }
1253 
1254 static int
1255 stk_putptr(uarg_t *args, char *addr, char *value)
1256 {
1257 	if (args->to_model == DATAMODEL_NATIVE)
1258 		return (sulword(addr, (ulong_t)value));
1259 	else
1260 		return (suword32(addr, (uint32_t)(uintptr_t)value));
1261 }
1262 
1263 static int
1264 stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1265 {
1266 	char *sp;
1267 	int argc, error;
1268 	int argv_empty = 0;
1269 	size_t ptrsize = args->from_ptrsize;
1270 	size_t size, pad;
1271 	char *argv = (char *)uap->argp;
1272 	char *envp = (char *)uap->envp;
1273 
1274 	/*
1275 	 * Copy interpreter's name and argument to argv[0] and argv[1].
1276 	 */
1277 	if (intp != NULL && intp->intp_name != NULL) {
1278 		if ((error = stk_add(args, intp->intp_name, UIO_SYSSPACE)) != 0)
1279 			return (error);
1280 		if (intp->intp_arg != NULL &&
1281 		    (error = stk_add(args, intp->intp_arg, UIO_SYSSPACE)) != 0)
1282 			return (error);
1283 		if (args->fname != NULL)
1284 			error = stk_add(args, args->fname, UIO_SYSSPACE);
1285 		else
1286 			error = stk_add(args, uap->fname, UIO_USERSPACE);
1287 		if (error)
1288 			return (error);
1289 
1290 		/*
1291 		 * Check for an empty argv[].
1292 		 */
1293 		if (stk_getptr(args, argv, &sp))
1294 			return (EFAULT);
1295 		if (sp == NULL)
1296 			argv_empty = 1;
1297 
1298 		argv += ptrsize;		/* ignore original argv[0] */
1299 	}
1300 
1301 	if (argv_empty == 0) {
1302 		/*
1303 		 * Add argv[] strings to the stack.
1304 		 */
1305 		for (;;) {
1306 			if (stk_getptr(args, argv, &sp))
1307 				return (EFAULT);
1308 			if (sp == NULL)
1309 				break;
1310 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1311 				return (error);
1312 			argv += ptrsize;
1313 		}
1314 	}
1315 	argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1316 	args->arglen = args->stk_strp - args->stk_base;
1317 
1318 	/*
1319 	 * Add environ[] strings to the stack.
1320 	 */
1321 	if (envp != NULL) {
1322 		for (;;) {
1323 			if (stk_getptr(args, envp, &sp))
1324 				return (EFAULT);
1325 			if (sp == NULL)
1326 				break;
1327 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1328 				return (error);
1329 			envp += ptrsize;
1330 		}
1331 	}
1332 	args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1333 	args->ne = args->na - argc;
1334 
1335 	/*
1336 	 * Add AT_SUN_PLATFORM and AT_SUN_EXECNAME strings to the stack.
1337 	 */
1338 	if (auxvpp != NULL && *auxvpp != NULL) {
1339 		if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
1340 			return (error);
1341 		if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0)
1342 			return (error);
1343 	}
1344 
1345 	/*
1346 	 * Compute the size of the stack.  This includes all the pointers,
1347 	 * the space reserved for the aux vector, and all the strings.
1348 	 * The total number of pointers is args->na (which is argc + envc)
1349 	 * plus 4 more: (1) a pointer's worth of space for argc; (2) the NULL
1350 	 * after the last argument (i.e. argv[argc]); (3) the NULL after the
1351 	 * last environment variable (i.e. envp[envc]); and (4) the NULL after
1352 	 * all the strings, at the very top of the stack.
1353 	 */
1354 	size = (args->na + 4) * args->to_ptrsize + args->auxsize +
1355 	    (args->stk_strp - args->stk_base);
1356 
1357 	/*
1358 	 * Pad the string section with zeroes to align the stack size.
1359 	 */
1360 	pad = P2NPHASE(size, args->stk_align);
1361 
1362 	if (STK_AVAIL(args) < pad)
1363 		return (E2BIG);
1364 
1365 	args->usrstack_size = size + pad;
1366 
1367 	while (pad-- != 0)
1368 		*args->stk_strp++ = 0;
1369 
1370 	args->nc = args->stk_strp - args->stk_base;
1371 
1372 	return (0);
1373 }
1374 
1375 static int
1376 stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
1377 {
1378 	size_t ptrsize = args->to_ptrsize;
1379 	ssize_t pslen;
1380 	char *kstrp = args->stk_base;
1381 	char *ustrp = usrstack - args->nc - ptrsize;
1382 	char *usp = usrstack - args->usrstack_size;
1383 	int *offp = (int *)(args->stk_base + args->stk_size);
1384 	int envc = args->ne;
1385 	int argc = args->na - envc;
1386 	int i;
1387 
1388 	/*
1389 	 * Record argc for /proc.
1390 	 */
1391 	up->u_argc = argc;
1392 
1393 	/*
1394 	 * Put argc on the stack.  Note that even though it's an int,
1395 	 * it always consumes ptrsize bytes (for alignment).
1396 	 */
1397 	if (stk_putptr(args, usp, (char *)(uintptr_t)argc))
1398 		return (-1);
1399 
1400 	/*
1401 	 * Add argc space (ptrsize) to usp and record argv for /proc.
1402 	 */
1403 	up->u_argv = (uintptr_t)(usp += ptrsize);
1404 
1405 	/*
1406 	 * Put the argv[] pointers on the stack.
1407 	 */
1408 	for (i = 0; i < argc; i++, usp += ptrsize)
1409 		if (stk_putptr(args, usp, &ustrp[*--offp]))
1410 			return (-1);
1411 
1412 	/*
1413 	 * Copy arguments to u_psargs.
1414 	 */
1415 	pslen = MIN(args->arglen, PSARGSZ) - 1;
1416 	for (i = 0; i < pslen; i++)
1417 		up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]);
1418 	while (i < PSARGSZ)
1419 		up->u_psargs[i++] = '\0';
1420 
1421 	/*
1422 	 * Add space for argv[]'s NULL terminator (ptrsize) to usp and
1423 	 * record envp for /proc.
1424 	 */
1425 	up->u_envp = (uintptr_t)(usp += ptrsize);
1426 
1427 	/*
1428 	 * Put the envp[] pointers on the stack.
1429 	 */
1430 	for (i = 0; i < envc; i++, usp += ptrsize)
1431 		if (stk_putptr(args, usp, &ustrp[*--offp]))
1432 			return (-1);
1433 
1434 	/*
1435 	 * Add space for envp[]'s NULL terminator (ptrsize) to usp and
1436 	 * remember where the stack ends, which is also where auxv begins.
1437 	 */
1438 	args->stackend = usp += ptrsize;
1439 
1440 	/*
1441 	 * Put all the argv[], envp[], and auxv strings on the stack.
1442 	 */
1443 	if (copyout(args->stk_base, ustrp, args->nc))
1444 		return (-1);
1445 
1446 	/*
1447 	 * Fill in the aux vector now that we know the user stack addresses
1448 	 * for the AT_SUN_PLATFORM and AT_SUN_EXECNAME strings.
1449 	 */
1450 	if (auxvpp != NULL && *auxvpp != NULL) {
1451 		if (args->to_model == DATAMODEL_NATIVE) {
1452 			auxv_t **a = (auxv_t **)auxvpp;
1453 			ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp])
1454 			ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp])
1455 		} else {
1456 			auxv32_t **a = (auxv32_t **)auxvpp;
1457 			ADDAUX(*a,
1458 			    AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp])
1459 			ADDAUX(*a,
1460 			    AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp]);
1461 		}
1462 	}
1463 
1464 	return (0);
1465 }
1466 
#ifdef DEBUG
/*
 * DEBUG-only overrides consulted by exec_args() when picking the heap
 * (brk) and stack page size codes.  0 leaves the normal choice alone,
 * -1 derives a value from gethrtime(), and any other value is used
 * modulo page_num_pagesizes().
 */
int mpss_brkpgszsel = 0;
int mpss_stkpgszsel = 0;
#endif
1471 
1472 /*
1473  * Initialize a new user stack with the specified arguments and environment.
1474  * The initial user stack layout is as follows:
1475  *
1476  *	User Stack
1477  *	+---------------+ <--- curproc->p_usrstack
1478  *	| NULL		|
1479  *	+---------------+
1480  *	|		|
1481  *	| auxv strings	|
1482  *	|		|
1483  *	+---------------+
1484  *	|		|
1485  *	| envp strings	|
1486  *	|		|
1487  *	+---------------+
1488  *	|		|
1489  *	| argv strings	|
1490  *	|		|
1491  *	+---------------+ <--- ustrp
1492  *	|		|
1493  *	| aux vector	|
1494  *	|		|
1495  *	+---------------+ <--- auxv
1496  *	| NULL		|
1497  *	+---------------+
1498  *	| envp[envc-1]	|
1499  *	+---------------+
1500  *	| ...		|
1501  *	+---------------+
1502  *	| envp[0]	|
1503  *	+---------------+ <--- envp[]
1504  *	| NULL		|
1505  *	+---------------+
1506  *	| argv[argc-1]	|
1507  *	+---------------+
1508  *	| ...		|
1509  *	+---------------+
1510  *	| argv[0]	|
1511  *	+---------------+ <--- argv[]
1512  *	| argc		|
1513  *	+---------------+ <--- stack base
1514  */
1515 int
1516 exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1517 {
1518 	size_t size;
1519 	int error;
1520 	proc_t *p = ttoproc(curthread);
1521 	user_t *up = PTOU(p);
1522 	char *usrstack;
1523 	rctl_entity_p_t e;
1524 
1525 	struct as *as;
1526 
1527 	args->from_model = p->p_model;
1528 	if (p->p_model == DATAMODEL_NATIVE) {
1529 		args->from_ptrsize = sizeof (long);
1530 	} else {
1531 		args->from_ptrsize = sizeof (int32_t);
1532 	}
1533 
1534 	if (args->to_model == DATAMODEL_NATIVE) {
1535 		args->to_ptrsize = sizeof (long);
1536 		args->ncargs = NCARGS;
1537 		args->stk_align = STACK_ALIGN;
1538 		usrstack = (char *)USRSTACK;
1539 	} else {
1540 		args->to_ptrsize = sizeof (int32_t);
1541 		args->ncargs = NCARGS32;
1542 		args->stk_align = STACK_ALIGN32;
1543 		usrstack = (char *)USRSTACK32;
1544 	}
1545 
1546 	ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);
1547 
1548 #if defined(__sparc)
1549 	/*
1550 	 * Make sure user register windows are empty before
1551 	 * attempting to make a new stack.
1552 	 */
1553 	(void) flush_user_windows_to_stack(NULL);
1554 #endif
1555 
1556 	for (size = PAGESIZE; ; size *= 2) {
1557 		args->stk_size = size;
1558 		args->stk_base = kmem_alloc(size, KM_SLEEP);
1559 		args->stk_strp = args->stk_base;
1560 		args->stk_offp = (int *)(args->stk_base + size);
1561 		error = stk_copyin(uap, args, intp, auxvpp);
1562 		if (error == 0)
1563 			break;
1564 		kmem_free(args->stk_base, size);
1565 		if (error != E2BIG && error != ENAMETOOLONG)
1566 			return (error);
1567 		if (size >= args->ncargs)
1568 			return (E2BIG);
1569 	}
1570 
1571 	size = args->usrstack_size;
1572 
1573 	ASSERT(error == 0);
1574 	ASSERT(P2PHASE(size, args->stk_align) == 0);
1575 	ASSERT((ssize_t)STK_AVAIL(args) >= 0);
1576 
1577 	if (size > args->ncargs) {
1578 		kmem_free(args->stk_base, args->stk_size);
1579 		return (E2BIG);
1580 	}
1581 
1582 	/*
1583 	 * Leave only the current lwp and force the other lwps to exit.
1584 	 * If another lwp beat us to the punch by calling exit(), bail out.
1585 	 */
1586 	if ((error = exitlwps(0)) != 0) {
1587 		kmem_free(args->stk_base, args->stk_size);
1588 		return (error);
1589 	}
1590 
1591 	/*
1592 	 * Revoke any doors created by the process.
1593 	 */
1594 	if (p->p_door_list)
1595 		door_exit();
1596 
1597 	/*
1598 	 * Release schedctl data structures.
1599 	 */
1600 	if (p->p_pagep)
1601 		schedctl_proc_cleanup();
1602 
1603 	/*
1604 	 * Clean up any DTrace helpers for the process.
1605 	 */
1606 	if (p->p_dtrace_helpers != NULL) {
1607 		ASSERT(dtrace_helpers_cleanup != NULL);
1608 		(*dtrace_helpers_cleanup)();
1609 	}
1610 
1611 	mutex_enter(&p->p_lock);
1612 	/*
1613 	 * Cleanup the DTrace provider associated with this process.
1614 	 */
1615 	if (p->p_dtrace_probes) {
1616 		ASSERT(dtrace_fasttrap_exec_ptr != NULL);
1617 		dtrace_fasttrap_exec_ptr(p);
1618 	}
1619 	mutex_exit(&p->p_lock);
1620 
1621 	/*
1622 	 * discard the lwpchan cache.
1623 	 */
1624 	if (p->p_lcp != NULL)
1625 		lwpchan_destroy_cache(1);
1626 
1627 	/*
1628 	 * Delete the POSIX timers.
1629 	 */
1630 	if (p->p_itimer != NULL)
1631 		timer_exit();
1632 
1633 #ifdef C2_AUDIT
1634 	if (audit_active)
1635 		audit_exec(args->stk_base, args->stk_base + args->arglen,
1636 		    args->na - args->ne, args->ne);
1637 #endif
1638 
1639 	/*
1640 	 * Ensure that we don't change resource associations while we
1641 	 * change address spaces.
1642 	 */
1643 	mutex_enter(&p->p_lock);
1644 	pool_barrier_enter();
1645 	mutex_exit(&p->p_lock);
1646 
1647 	/*
1648 	 * Destroy the old address space and create a new one.
1649 	 * From here on, any errors are fatal to the exec()ing process.
1650 	 * On error we return -1, which means the caller must SIGKILL
1651 	 * the process.
1652 	 */
1653 	relvm();
1654 
1655 	mutex_enter(&p->p_lock);
1656 	pool_barrier_exit();
1657 	mutex_exit(&p->p_lock);
1658 
1659 	up->u_execsw = args->execswp;
1660 
1661 	p->p_brkbase = NULL;
1662 	p->p_brksize = 0;
1663 	p->p_stksize = 0;
1664 	p->p_model = args->to_model;
1665 	p->p_usrstack = usrstack;
1666 	p->p_stkprot = args->stk_prot;
1667 	p->p_datprot = args->dat_prot;
1668 
1669 	/*
1670 	 * Reset resource controls such that all controls are again active as
1671 	 * well as appropriate to the potentially new address model for the
1672 	 * process.
1673 	 */
1674 	e.rcep_p.proc = p;
1675 	e.rcep_t = RCENTITY_PROCESS;
1676 	rctl_set_reset(p->p_rctls, p, &e);
1677 
1678 	if (exec_lpg_disable == 0) {
1679 #ifdef DEBUG
1680 		uint_t pgsizes = page_num_pagesizes();
1681 		uint_t szc;
1682 #endif
1683 		p->p_brkpageszc = args->brkpageszc;
1684 		p->p_stkpageszc = args->stkpageszc;
1685 
1686 		if (p->p_brkpageszc == 0) {
1687 			p->p_brkpageszc = page_szc(map_pgsz(MAPPGSZ_HEAP,
1688 			    p, 0, 0, NULL));
1689 		}
1690 		if (p->p_stkpageszc == 0) {
1691 			p->p_stkpageszc = page_szc(map_pgsz(MAPPGSZ_STK,
1692 			    p, 0, 0, NULL));
1693 		}
1694 
1695 #ifdef DEBUG
1696 		if (mpss_brkpgszsel != 0) {
1697 			if (mpss_brkpgszsel == -1) {
1698 				szc = ((uint_t)gethrtime() >> 8) % pgsizes;
1699 			} else {
1700 				szc = mpss_brkpgszsel % pgsizes;
1701 			}
1702 			p->p_brkpageszc = szc;
1703 		}
1704 
1705 		if (mpss_stkpgszsel != 0) {
1706 			if (mpss_stkpgszsel == -1) {
1707 				szc = ((uint_t)gethrtime() >> 7) % pgsizes;
1708 			} else {
1709 				szc = mpss_stkpgszsel % pgsizes;
1710 			}
1711 			p->p_stkpageszc = szc;
1712 		}
1713 
1714 #endif
1715 		mutex_enter(&p->p_lock);
1716 		p->p_flag |= SAUTOLPG;	/* kernel controls page sizes */
1717 		mutex_exit(&p->p_lock);
1718 
1719 	} else {
1720 		p->p_brkpageszc = 0;
1721 		p->p_stkpageszc = 0;
1722 	}
1723 
1724 	exec_set_sp(size);
1725 
1726 	as = as_alloc();
1727 	p->p_as = as;
1728 	if (p->p_model == DATAMODEL_ILP32)
1729 		as->a_userlimit = (caddr_t)USERLIMIT32;
1730 	(void) hat_setup(as->a_hat, HAT_ALLOC);
1731 
1732 	/*
1733 	 * Finally, write out the contents of the new stack.
1734 	 */
1735 	error = stk_copyout(args, usrstack, auxvpp, up);
1736 	kmem_free(args->stk_base, args->stk_size);
1737 	return (error);
1738 }
1739