xref: /titanic_52/usr/src/uts/common/os/exec.c (revision 47911a7d5f24c2fc37e7b5bcc696fe32e750c16c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*	Copyright (c) 1988 AT&T	*/
29 /*	  All Rights Reserved  	*/
30 
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/systm.h>
36 #include <sys/signal.h>
37 #include <sys/cred_impl.h>
38 #include <sys/policy.h>
39 #include <sys/user.h>
40 #include <sys/errno.h>
41 #include <sys/file.h>
42 #include <sys/vfs.h>
43 #include <sys/vnode.h>
44 #include <sys/mman.h>
45 #include <sys/acct.h>
46 #include <sys/cpuvar.h>
47 #include <sys/proc.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/pathname.h>
51 #include <sys/vm.h>
52 #include <sys/lgrp.h>
53 #include <sys/vtrace.h>
54 #include <sys/exec.h>
55 #include <sys/exechdr.h>
56 #include <sys/kmem.h>
57 #include <sys/prsystm.h>
58 #include <sys/modctl.h>
59 #include <sys/vmparam.h>
60 #include <sys/schedctl.h>
61 #include <sys/utrap.h>
62 #include <sys/systeminfo.h>
63 #include <sys/stack.h>
64 #include <sys/rctl.h>
65 #include <sys/dtrace.h>
66 #include <sys/lwpchan_impl.h>
67 #include <sys/pool.h>
68 #include <sys/sdt.h>
69 #include <sys/brand.h>
70 
71 #include <c2/audit.h>
72 
73 #include <vm/hat.h>
74 #include <vm/anon.h>
75 #include <vm/as.h>
76 #include <vm/seg.h>
77 #include <vm/seg_vn.h>
78 
/*
 * Flag bits returned by execsetid() and interpreted by gexec() when it
 * decides how the credentials of the exec'ing process must be changed.
 */
79 #define	PRIV_RESET		0x01	/* needs to reset privs */
80 #define	PRIV_SETID		0x02	/* needs to change uids */
81 #define	PRIV_SETUGID		0x04	/* is setuid/setgid/forced privs */
82 #define	PRIV_INCREASE		0x08	/* child runs with more privs */
83 #define	MAC_FLAGS		0x10	/* need to adjust MAC flags */
84 
/* Helpers private to this file; both are defined further down. */
85 static int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *);
86 static int hold_execsw(struct execsw *);
87 
88 uint_t auxv_hwcap = 0;	/* auxv AT_SUN_HWCAP value; determined on the fly */
89 #if defined(_SYSCALL32_IMPL)
90 uint_t auxv_hwcap32 = 0;	/* 32-bit version of auxv_hwcap */
91 #endif
92 
/*
 * Process flags that gexec() clears at the start of a top-level exec,
 * restores if the exec fails, and recomputes if it succeeds.
 */
93 #define	PSUIDFLAGS		(SNOCD|SUGID)
94 
95 /*
96  * exec() - wrapper around exece providing NULL environment pointer
97  */
98 int
99 exec(const char *fname, const char **argp)
100 {
101 	return (exece(fname, argp, NULL));
102 }
103 
104 /*
105  * exece() - system call wrapper around exec_common()
106  */
107 int
108 exece(const char *fname, const char **argp, const char **envp)
109 {
110 	int error;
111 
112 	error = exec_common(fname, argp, envp, EBA_NONE);
113 	return (error ? (set_errno(error)) : 0);
114 }
115 
/*
 * exec_common() - shared implementation behind the exec system calls.
 *
 *	fname		user-space address of the path to execute (it is
 *			copied in with pn_get(..., UIO_USERSPACE, ...))
 *	argp, envp	argument and environment vectors; they are only
 *			stored in a struct execa here and handed to gexec()
 *	brand_action	EBA_NONE, EBA_NATIVE (unbrand a branded process) or
 *			another value (brand an unbranded process)
 *
 * Returns 0 on success and a non-zero errno value on failure.
 *
 * Outline, in the order the code performs it:
 *   1. refuse the /proc agent lwp and check secpolicy_basic_exec();
 *   2. decide whether the process must be branded (brandme);
 *   3. tell /proc the exec started and hold the ignoredefault signals;
 *   4. look up the executable (and, if possible, its directory);
 *   5. call gexec() to run the object-file specific handler;
 *   6. on success reset per-lwp/per-process state left over from the old
 *      image, close close-on-exec files, call setregs(), and rebuild the
 *      lwp directory if needed;
 *   7. restore the saved signal mask and tell /proc the exec ended.
 */
116 int
117 exec_common(const char *fname, const char **argp, const char **envp,
118     int brand_action)
119 {
120 	vnode_t *vp = NULL, *dir = NULL, *tmpvp = NULL;
121 	proc_t *p = ttoproc(curthread);
122 	klwp_t *lwp = ttolwp(curthread);
123 	struct user *up = PTOU(p);
124 	long execsz;		/* temporary count of exec size */
125 	int i;
126 	int error;
127 	char exec_file[MAXCOMLEN+1];
128 	struct pathname pn;
129 	struct pathname resolvepn;
130 	struct uarg args;
131 	struct execa ua;
132 	k_sigset_t savedmask;
133 	lwpdir_t *lwpdir = NULL;
134 	lwpdir_t **tidhash;
135 	lwpdir_t *old_lwpdir = NULL;
136 	uint_t old_lwpdir_sz;
137 	lwpdir_t **old_tidhash;
138 	uint_t old_tidhash_sz;
139 	lwpent_t *lep;
140 	int brandme = 0;
141 
142 	/*
143 	 * exec() is not supported for the /proc agent lwp.
144 	 */
145 	if (curthread == p->p_agenttp)
146 		return (ENOTSUP);
147 
148 	if ((error = secpolicy_basic_exec(CRED())) != 0)
149 		return (error);
150 
151 	if (brand_action != EBA_NONE) {
152 		/*
153 		 * Brand actions are not supported for processes that are not
154 		 * running in a branded zone.
155 		 */
156 		if (!ZONE_IS_BRANDED(p->p_zone))
157 			return (ENOTSUP);
158 
159 		if (brand_action == EBA_NATIVE) {
160 			/* Only branded processes can be unbranded */
161 			if (!PROC_IS_BRANDED(p))
162 				return (ENOTSUP);
163 		} else {
164 			/* Only unbranded processes can be branded */
165 			if (PROC_IS_BRANDED(p))
166 				return (ENOTSUP);
167 			brandme = 1;
168 		}
169 	} else {
170 		/*
171 		 * If this is a native zone, or if the process is already
172 		 * branded, then we don't need to do anything.  If this is
173 		 * a native process in a branded zone, we need to brand the
174 		 * process as it exec()s the new binary.
175 		 */
176 		if (ZONE_IS_BRANDED(p->p_zone) && !PROC_IS_BRANDED(p))
177 			brandme = 1;
178 	}
179 
180 	/*
181 	 * Inform /proc that an exec() has started.
182 	 * Hold signals that are ignored by default so that we will
183 	 * not be interrupted by a signal that will be ignored after
184 	 * successful completion of gexec().
185 	 */
	/*
	 * From here on every exit must go through out: (or the success
	 * path), both of which restore savedmask and call prexecend().
	 */
186 	mutex_enter(&p->p_lock);
187 	prexecstart();
188 	schedctl_finish_sigblock(curthread);
189 	savedmask = curthread->t_hold;
190 	sigorset(&curthread->t_hold, &ignoredefault);
191 	mutex_exit(&p->p_lock);
192 
193 	/*
194 	 * Look up path name and remember last component for later.
195 	 * To help coreadm expand its %d token, we attempt to save
196 	 * the directory containing the executable in p_execdir. The
197 	 * first call to lookuppn() may fail and return EINVAL because
198 	 * dirvpp is non-NULL. In that case, we make a second call to
199 	 * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
200 	 * but coreadm is allowed to expand %d to the empty string and
201 	 * there are other cases in which that failure may occur.
202 	 */
203 	if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
204 		goto out;
205 	pn_alloc(&resolvepn);
206 	if ((error = lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp)) != 0) {
207 		pn_free(&resolvepn);
208 		pn_free(&pn);
209 		if (error != EINVAL)
210 			goto out;
211 
212 		dir = NULL;
213 		if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
214 			goto out;
215 		pn_alloc(&resolvepn);
216 		if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
217 		    &vp)) != 0) {
218 			pn_free(&resolvepn);
219 			pn_free(&pn);
220 			goto out;
221 		}
222 	}
223 	if (vp == NULL) {
224 		if (dir != NULL)
225 			VN_RELE(dir);
226 		error = ENOENT;
227 		pn_free(&resolvepn);
228 		pn_free(&pn);
229 		goto out;
230 	}
231 
232 	/*
233 	 * We do not allow executing files in attribute directories.
234 	 * We test this by determining whether the resolved path
235 	 * contains a "/" when we're in an attribute directory;
236 	 * only if the pathname does not contain a "/" the resolved path
237 	 * points to a file in the current working (attribute) directory.
238 	 */
239 	if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
240 	    strchr(resolvepn.pn_path, '/') == NULL) {
241 		if (dir != NULL)
242 			VN_RELE(dir);
243 		error = EACCES;
244 		pn_free(&resolvepn);
245 		pn_free(&pn);
246 		VN_RELE(vp);
247 		goto out;
248 	}
249 
	/*
	 * exec_file is zeroed first and at most MAXCOMLEN bytes are copied
	 * into its MAXCOMLEN+1 bytes, so it is always NUL-terminated.
	 */
250 	bzero(exec_file, MAXCOMLEN+1);
251 	(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
252 	bzero(&args, sizeof (args));
253 	args.pathname = resolvepn.pn_path;
254 	/* don't free resolvepn until we are done with args */
255 	pn_free(&pn);
256 
257 	/*
258 	 * Specific exec handlers, or policies determined via
259 	 * /etc/system may override the historical default.
260 	 */
261 	args.stk_prot = PROT_ZFOD;
262 	args.dat_prot = PROT_ZFOD;
263 
264 	CPU_STATS_ADD_K(sys, sysexec, 1);
265 	DTRACE_PROC1(exec, char *, args.pathname);
266 
267 	ua.fname = fname;
268 	ua.argp = argp;
269 	ua.envp = envp;
270 
271 	/* If necessary, brand this process before we start the exec. */
272 	if (brandme != 0)
273 		brand_setbrand(p);
274 
275 	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
276 	    exec_file, p->p_cred, brand_action)) != 0) {
		/* NOTE(review): presumably undoes brand_setbrand() above. */
277 		if (brandme != 0)
278 			BROP(p)->b_proc_exit(p, lwp);
279 		VN_RELE(vp);
280 		if (dir != NULL)
281 			VN_RELE(dir);
282 		pn_free(&resolvepn);
283 		goto fail;
284 	}
285 
286 	/*
287 	 * Free floating point registers (sun4u only)
288 	 */
289 	ASSERT(lwp != NULL);
290 	lwp_freeregs(lwp, 1);
291 
292 	/*
293 	 * Free thread and process context ops.
294 	 */
295 	if (curthread->t_ctx)
296 		freectx(curthread, 1);
297 	if (p->p_pctx)
298 		freepctx(p, 1);
299 
300 	/*
301 	 * Remember file name for accounting; clear any cached DTrace predicate.
302 	 */
303 	up->u_acflag &= ~AFORK;
304 	bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
305 	curthread->t_predcache = NULL;
306 
307 	/*
308 	 * Clear contract template state
309 	 */
310 	lwp_ctmpl_clear(lwp);
311 
312 	/*
313 	 * Save the directory in which we found the executable for expanding
314 	 * the %d token used in core file patterns.
315 	 */
	/*
	 * The previous p_execdir is released only after p_lock is dropped;
	 * the new one gets its own hold because our lookup hold on dir is
	 * released further down.
	 */
316 	mutex_enter(&p->p_lock);
317 	tmpvp = p->p_execdir;
318 	p->p_execdir = dir;
319 	if (p->p_execdir != NULL)
320 		VN_HOLD(p->p_execdir);
321 	mutex_exit(&p->p_lock);
322 
323 	if (tmpvp != NULL)
324 		VN_RELE(tmpvp);
325 
326 	/*
327 	 * Reset stack state to the user stack, clear set of signals
328 	 * caught on the signal stack, and reset list of signals that
329 	 * restart system calls; the new program's environment should
330 	 * not be affected by detritus from the old program.  Any
331 	 * pending held signals remain held, so don't clear t_hold.
332 	 */
333 	mutex_enter(&p->p_lock);
334 	lwp->lwp_oldcontext = 0;
335 	lwp->lwp_ustack = 0;
336 	lwp->lwp_old_stk_ctl = 0;
337 	sigemptyset(&up->u_signodefer);
338 	sigemptyset(&up->u_sigonstack);
339 	sigemptyset(&up->u_sigresethand);
340 	lwp->lwp_sigaltstack.ss_sp = 0;
341 	lwp->lwp_sigaltstack.ss_size = 0;
342 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
343 
344 	/*
345 	 * Make saved resource limit == current resource limit.
346 	 */
347 	for (i = 0; i < RLIM_NLIMITS; i++) {
348 		/*CONSTCOND*/
349 		if (RLIM_SAVED(i)) {
350 			(void) rctl_rlimit_get(rctlproc_legacy[i], p,
351 			    &up->u_saved_rlimit[i]);
352 		}
353 	}
354 
355 	/*
356 	 * If the action was to catch the signal, then the action
357 	 * must be reset to SIG_DFL.
358 	 */
359 	sigdefault(p);
360 	p->p_flag &= ~(SNOWAIT|SJCTL);
361 	p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
362 	up->u_signal[SIGCLD - 1] = SIG_DFL;
363 
364 	/*
365 	 * Delete the dot4 sigqueues/signotifies.
366 	 */
367 	sigqfree(p);
368 
369 	mutex_exit(&p->p_lock);
370 
	/* Reset the profiling state kept in p_prof, under p_pflock. */
371 	mutex_enter(&p->p_pflock);
372 	p->p_prof.pr_base = NULL;
373 	p->p_prof.pr_size = 0;
374 	p->p_prof.pr_off = 0;
375 	p->p_prof.pr_scale = 0;
376 	p->p_prof.pr_samples = 0;
377 	mutex_exit(&p->p_pflock);
378 
379 	ASSERT(curthread->t_schedctl == NULL);
380 
381 #if defined(__sparc)
382 	if (p->p_utraps != NULL)
383 		utrap_free(p);
384 #endif	/* __sparc */
385 
386 	/*
387 	 * Close all close-on-exec files.
388 	 */
389 	close_exec(P_FINFO(p));
390 	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
391 
392 	/* Unbrand ourself if requested. */
393 	if (brand_action == EBA_NATIVE)
394 		BROP(p)->b_proc_exit(p, lwp);
395 	ASSERT((brand_action != EBA_NATIVE) || !PROC_IS_BRANDED(p));
396 
397 	setregs(&args);
398 
399 	/* Mark this as an executable vnode */
400 	mutex_enter(&vp->v_lock);
401 	vp->v_flag |= VVMEXEC;
402 	mutex_exit(&vp->v_lock);
403 
	/*
	 * Drop the lookup holds; args.pathname pointed into resolvepn, so
	 * resolvepn is freed only now that setregs(&args) has run.
	 */
404 	VN_RELE(vp);
405 	if (dir != NULL)
406 		VN_RELE(dir);
407 	pn_free(&resolvepn);
408 
409 	/*
410 	 * Allocate a new lwp directory and lwpid hash table if necessary.
411 	 */
	/*
	 * lwpdir stays NULL when the existing directory is kept.  The
	 * install step under p_lock below keys off lwpdir != NULL, which is
	 * also the only case in which tidhash and lep are assigned.  The
	 * KM_SLEEP allocations are made here, before p_lock is taken.
	 */
412 	if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
413 		lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
414 		lwpdir->ld_next = lwpdir + 1;
415 		tidhash = kmem_zalloc(2 * sizeof (lwpdir_t *), KM_SLEEP);
416 		if (p->p_lwpdir != NULL)
417 			lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
418 		else
419 			lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
420 	}
421 
422 	if (PROC_IS_BRANDED(p))
423 		BROP(p)->b_exec();
424 
425 	mutex_enter(&p->p_lock);
426 	prbarrier(p);
427 
428 	/*
429 	 * Reset lwp id to the default value of 1.
430 	 * This is a single-threaded process now
431 	 * and lwp #1 is lwp_wait()able by default.
432 	 * The t_unpark flag should not be inherited.
433 	 */
434 	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
435 	curthread->t_tid = 1;
436 	kpreempt_disable();
437 	ASSERT(curthread->t_lpl != NULL);
438 	p->p_t1_lgrpid = curthread->t_lpl->lpl_lgrpid;
439 	kpreempt_enable();
440 	if (p->p_tr_lgrpid != LGRP_NONE && p->p_tr_lgrpid != p->p_t1_lgrpid) {
441 		lgrp_update_trthr_migrations(1);
442 	}
443 	curthread->t_unpark = 0;
444 	curthread->t_proc_flag |= TP_TWAIT;
445 	curthread->t_proc_flag &= ~TP_DAEMON;	/* daemons shouldn't exec */
446 	p->p_lwpdaemon = 0;			/* but oh well ... */
447 	p->p_lwpid = 1;
448 
449 	/*
450 	 * Install the newly-allocated lwp directory and lwpid hash table
451 	 * and insert the current thread into the new hash table.
452 	 */
	/*
	 * The old tables are only remembered here; they are freed after
	 * p_lock has been dropped.
	 */
453 	if (lwpdir != NULL) {
454 		old_lwpdir = p->p_lwpdir;
455 		old_lwpdir_sz = p->p_lwpdir_sz;
456 		old_tidhash = p->p_tidhash;
457 		old_tidhash_sz = p->p_tidhash_sz;
458 		p->p_lwpdir = p->p_lwpfree = lwpdir;
459 		p->p_lwpdir_sz = 2;
460 		p->p_tidhash = tidhash;
461 		p->p_tidhash_sz = 2;
462 		lep->le_thread = curthread;
463 		lep->le_lwpid = curthread->t_tid;
464 		lep->le_start = curthread->t_start;
465 		lwp_hash_in(p, lep);
466 	}
467 
468 	/*
469 	 * Restore the saved signal mask and
470 	 * inform /proc that the exec() has finished.
471 	 */
472 	curthread->t_hold = savedmask;
473 	prexecend();
474 	mutex_exit(&p->p_lock);
475 	if (old_lwpdir) {
476 		kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
477 		kmem_free(old_tidhash, old_tidhash_sz * sizeof (lwpdir_t *));
478 	}
479 
480 	ASSERT(error == 0);
481 	DTRACE_PROC(exec__success);
482 	return (0);
483 
	/*
	 * fail: is reached only from the gexec() failure path, i.e. after
	 * the exec probe above has fired; earlier failures jump straight to
	 * out: and therefore do not fire exec__failure.
	 */
484 fail:
485 	DTRACE_PROC1(exec__failure, int, error);
486 out:		/* error return */
487 	mutex_enter(&p->p_lock);
488 	curthread->t_hold = savedmask;
489 	prexecend();
490 	mutex_exit(&p->p_lock);
491 	ASSERT(error != 0);
492 	return (error);
493 }
494 
495 
496 /*
497  * Perform generic exec duties and switchout to object-file specific
498  * handler.
 *
 *	vpp		in/out pointer to the vnode to execute; it is passed
 *			by reference to VOP_OPEN() and re-read afterwards
 *	uap		exec arguments, handed unchanged to the handler
 *	args		per-exec state; execswp, ex_vp, stk_prot and
 *			traceinval are updated here
 *	idatap		passed unchanged to the handler
 *	level		0 for the outermost call; set-id evaluation, the
 *			PSUIDFLAGS handling and the installation of new
 *			credentials only happen when level == 0
 *	execsz		passed unchanged to the handler
 *	exec_file	file name used in the VFS_NOSETUID notice and passed
 *			to the handler
 *	cred		credentials to run with; duplicated (crdup) before
 *			being modified when execsetid() asks for changes
 *	brand_action	passed unchanged to the handler
 *
 * Returns 0 on success or an errno value.  A failure that left error at
 * 0 (short read of the magic bytes, or no matching execsw entry) is
 * reported as ENOEXEC.
499  */
500 int
501 gexec(
502 	struct vnode **vpp,
503 	struct execa *uap,
504 	struct uarg *args,
505 	struct intpdata *idatap,
506 	int level,
507 	long *execsz,
508 	caddr_t exec_file,
509 	struct cred *cred,
510 	int brand_action)
511 {
512 	struct vnode *vp;
513 	proc_t *pp = ttoproc(curthread);
514 	struct execsw *eswp;
515 	int error = 0;
516 	int suidflags = 0;
517 	ssize_t resid;
518 	uid_t uid, gid;
519 	struct vattr vattr;
520 	char magbuf[MAGIC_BYTES];
521 	int setid;
522 	cred_t *oldcred, *newcred = NULL;
523 	int privflags = 0;
524 	int setidfl;
525 
526 	/*
527 	 * If the SNOCD or SUGID flag is set, turn it off and remember the
528 	 * previous setting so we can restore it if we encounter an error.
529 	 */
530 	if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
531 		mutex_enter(&pp->p_lock);
532 		suidflags = pp->p_flag & PSUIDFLAGS;
533 		pp->p_flag &= ~PSUIDFLAGS;
534 		mutex_exit(&pp->p_lock);
535 	}
536 
537 	if ((error = execpermissions(*vpp, &vattr, args)) != 0)
538 		goto bad;
539 
540 	/* need to open vnode for stateful file systems like rfs */
541 	if ((error = VOP_OPEN(vpp, FREAD, CRED(), NULL)) != 0)
542 		goto bad;
543 	vp = *vpp;
544 
545 	/*
546 	 * Note: to support binary compatibility with SunOS a.out
547 	 * executables, we read in the first four bytes, as the
548 	 * magic number is in bytes 2-3.
549 	 */
550 	if (error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
551 	    (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid))
552 		goto bad;
	/* Short read: error is still 0 here, so bad: reports ENOEXEC. */
553 	if (resid != 0)
554 		goto bad;
555 
	/*
	 * On success findexec_by_hdr() returns with eswp->exec_lock held
	 * as reader (taken in hold_execsw()); it is released right after
	 * the handler has run.  No matching handler also ends in ENOEXEC.
	 */
556 	if ((eswp = findexec_by_hdr(magbuf)) == NULL)
557 		goto bad;
558 
559 	if (level == 0 &&
560 	    (privflags = execsetid(vp, &vattr, &uid, &gid)) != 0) {
561 
		/*
		 * Work on a private copy; newcred remembers that a copy was
		 * made so it can be freed on failure or installed on success.
		 */
562 		newcred = cred = crdup(cred);
563 
564 		/* If we can, drop the PA bit */
565 		if ((privflags & PRIV_RESET) != 0)
566 			priv_adjust_PA(cred);
567 
568 		if (privflags & PRIV_SETID) {
569 			cred->cr_uid = uid;
570 			cred->cr_gid = gid;
571 			cred->cr_suid = uid;
572 			cred->cr_sgid = gid;
573 		}
574 
575 		if (privflags & MAC_FLAGS) {
576 			if (!(CR_FLAGS(cred) & NET_MAC_AWARE_INHERIT))
577 				CR_FLAGS(cred) &= ~NET_MAC_AWARE;
578 			CR_FLAGS(cred) &= ~NET_MAC_AWARE_INHERIT;
579 		}
580 
581 		/*
582 		 * Implement the privilege updates:
583 		 *
584 		 * Restrict with L:
585 		 *
586 		 *	I' = I & L
587 		 *
588 		 *	E' = P' = (I' + F) & A
589 		 *
590 		 * But if running under ptrace, we cap I with P.
591 		 */
592 		if ((privflags & PRIV_RESET) != 0) {
593 			if ((privflags & PRIV_INCREASE) != 0 &&
594 			    (pp->p_proc_flag & P_PR_PTRACE) != 0)
595 				priv_intersect(&CR_OPPRIV(cred),
596 						    &CR_IPRIV(cred));
597 			priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
598 			CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
599 			priv_adjust_PA(cred);
600 		}
601 	}
602 
603 	/* SunOS 4.x buy-back */
	/*
	 * Only a notice is logged here; the set-id bits themselves are
	 * ignored for VFS_NOSETUID file systems inside execsetid().
	 */
604 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
605 	    (vattr.va_mode & (VSUID|VSGID))) {
606 		cmn_err(CE_NOTE,
607 		    "!%s, uid %d: setuid execution not allowed, dev=%lx",
608 		    exec_file, cred->cr_uid, vp->v_vfsp->vfs_dev);
609 	}
610 
611 	/*
612 	 * execsetid() told us whether or not we had to change the
613 	 * credentials of the process.  In privflags, it told us
614 	 * whether we gained any privileges or executed a set-uid executable.
615 	 */
616 	setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE));
617 
618 	/*
619 	 * Use /etc/system variable to determine if the stack
620 	 * should be marked as executable by default.
621 	 */
622 	if (noexec_user_stack)
623 		args->stk_prot &= ~PROT_EXEC;
624 
625 	args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */
626 	args->ex_vp = vp;
627 
628 	/*
629 	 * Traditionally, the setid flags told the sub processes whether
630 	 * the file just executed was set-uid or set-gid; this caused
631 	 * some confusion as the 'setid' flag did not match the SUGID
632 	 * process flag which is only set when the uids/gids do not match.
633 	 * A script set-gid/set-uid to the real uid/gid would start with
634 	 * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
635 	 * Now we flag those cases where the calling process cannot
636 	 * be trusted to influence the newly exec'ed process, either
637 	 * because it runs with more privileges or when the uids/gids
638 	 * do in fact not match.
639 	 * This also makes the runtime linker agree with the on exec
640 	 * values of SNOCD and SUGID.
641 	 */
642 	setidfl = 0;
643 	if (cred->cr_uid != cred->cr_ruid || (cred->cr_rgid != cred->cr_gid &&
644 	    !supgroupmember(cred->cr_gid, cred))) {
645 		setidfl |= EXECSETID_UGIDS;
646 	}
647 	if (setid & PRIV_SETUGID)
648 		setidfl |= EXECSETID_SETID;
649 	if (setid & PRIV_INCREASE)
650 		setidfl |= EXECSETID_PRIVS;
651 
652 	error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
653 		setidfl, exec_file, cred, brand_action);
	/* Release the reader lock acquired through findexec_by_hdr(). */
654 	rw_exit(eswp->exec_lock);
655 	if (error != 0) {
656 		if (newcred != NULL)
657 			crfree(newcred);
658 		goto bad;
659 	}
660 
661 	if (level == 0) {
662 		mutex_enter(&pp->p_crlock);
663 		if (newcred != NULL) {
664 			/*
665 			 * Free the old credentials, and set the new ones.
666 			 * Do this for both the process and the (single) thread.
667 			 */
668 			crfree(pp->p_cred);
669 			pp->p_cred = cred;	/* cred already held for proc */
670 			crhold(cred);		/* hold new cred for thread */
671 			/*
672 			 * DTrace accesses t_cred in probe context.  t_cred
673 			 * must always be either NULL, or point to a valid,
674 			 * allocated cred structure.
675 			 */
676 			oldcred = curthread->t_cred;
677 			curthread->t_cred = cred;
678 			crfree(oldcred);
679 		}
680 		/*
681 		 * On emerging from a successful exec(), the saved
682 		 * uid and gid equal the effective uid and gid.
683 		 */
684 		cred->cr_suid = cred->cr_uid;
685 		cred->cr_sgid = cred->cr_gid;
686 
687 		/*
688 		 * If the real and effective ids do not match, this
689 		 * is a setuid process that should not dump core.
690 		 * The group comparison is tricky; we prevent the code
691 		 * from flagging SNOCD when executing with an effective gid
692 		 * which is a supplementary group.
693 		 */
694 		if (cred->cr_ruid != cred->cr_uid ||
695 		    (cred->cr_rgid != cred->cr_gid &&
696 		    !supgroupmember(cred->cr_gid, cred)) ||
697 		    (privflags & PRIV_INCREASE) != 0)
698 			suidflags = PSUIDFLAGS;
699 		else
700 			suidflags = 0;
701 
702 		mutex_exit(&pp->p_crlock);
703 		if (suidflags) {
704 			mutex_enter(&pp->p_lock);
705 			pp->p_flag |= suidflags;
706 			mutex_exit(&pp->p_lock);
707 		}
708 		if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
709 			/*
710 			 * If process is traced via /proc, arrange to
711 			 * invalidate the associated /proc vnode.
712 			 */
713 			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
714 				args->traceinval = 1;
715 		}
716 		if (pp->p_proc_flag & P_PR_PTRACE)
717 			psignal(pp, SIGTRAP);
		/* traceinval may also have been set by execpermissions(). */
718 		if (args->traceinval)
719 			prinvalidate(&pp->p_user);
720 	}
721 
722 	return (0);
723 bad:
724 	if (error == 0)
725 		error = ENOEXEC;
726 
	/* Put back the SNOCD/SUGID bits that were cleared on entry. */
727 	if (suidflags) {
728 		mutex_enter(&pp->p_lock);
729 		pp->p_flag |= suidflags;
730 		mutex_exit(&pp->p_lock);
731 	}
732 	return (error);
733 }
734 
735 extern char *execswnames[];
736 
737 struct execsw *
738 allocate_execsw(char *name, char *magic, size_t magic_size)
739 {
740 	int i, j;
741 	char *ename;
742 	char *magicp;
743 
744 	mutex_enter(&execsw_lock);
745 	for (i = 0; i < nexectype; i++) {
746 		if (execswnames[i] == NULL) {
747 			ename = kmem_alloc(strlen(name) + 1, KM_SLEEP);
748 			(void) strcpy(ename, name);
749 			execswnames[i] = ename;
750 			/*
751 			 * Set the magic number last so that we
752 			 * don't need to hold the execsw_lock in
753 			 * findexectype().
754 			 */
755 			magicp = kmem_alloc(magic_size, KM_SLEEP);
756 			for (j = 0; j < magic_size; j++)
757 				magicp[j] = magic[j];
758 			execsw[i].exec_magic = magicp;
759 			mutex_exit(&execsw_lock);
760 			return (&execsw[i]);
761 		}
762 	}
763 	mutex_exit(&execsw_lock);
764 	return (NULL);
765 }
766 
767 /*
768  * Find the exec switch table entry with the corresponding magic string.
769  */
770 struct execsw *
771 findexecsw(char *magic)
772 {
773 	struct execsw *eswp;
774 
775 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
776 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
777 		if (magic && eswp->exec_maglen != 0 &&
778 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0)
779 			return (eswp);
780 	}
781 	return (NULL);
782 }
783 
784 /*
785  * Find the execsw[] index for the given exec header string by looking for the
786  * magic string at a specified offset and length for each kind of executable
787  * file format until one matches.  If no execsw[] entry is found, try to
788  * autoload a module for this magic string.
789  */
790 struct execsw *
791 findexec_by_hdr(char *header)
792 {
793 	struct execsw *eswp;
794 
795 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
796 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
797 		if (header && eswp->exec_maglen != 0 &&
798 		    bcmp(&header[eswp->exec_magoff], eswp->exec_magic,
799 			    eswp->exec_maglen) == 0) {
800 			if (hold_execsw(eswp) != 0)
801 				return (NULL);
802 			return (eswp);
803 		}
804 	}
805 	return (NULL);	/* couldn't find the type */
806 }
807 
808 /*
809  * Find the execsw[] index for the given magic string.  If no execsw[] entry
810  * is found, try to autoload a module for this magic string.
811  */
812 struct execsw *
813 findexec_by_magic(char *magic)
814 {
815 	struct execsw *eswp;
816 
817 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
818 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
819 		if (magic && eswp->exec_maglen != 0 &&
820 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
821 			if (hold_execsw(eswp) != 0)
822 				return (NULL);
823 			return (eswp);
824 		}
825 	}
826 	return (NULL);	/* couldn't find the type */
827 }
828 
829 static int
830 hold_execsw(struct execsw *eswp)
831 {
832 	char *name;
833 
834 	rw_enter(eswp->exec_lock, RW_READER);
835 	while (!LOADED_EXEC(eswp)) {
836 		rw_exit(eswp->exec_lock);
837 		name = execswnames[eswp-execsw];
838 		ASSERT(name);
839 		if (modload("exec", name) == -1)
840 			return (-1);
841 		rw_enter(eswp->exec_lock, RW_READER);
842 	}
843 	return (0);
844 }
845 
846 static int
847 execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp)
848 {
849 	proc_t *pp = ttoproc(curthread);
850 	uid_t uid, gid;
851 	cred_t *cr = pp->p_cred;
852 	int privflags = 0;
853 
854 	/*
855 	 * Remember credentials.
856 	 */
857 	uid = cr->cr_uid;
858 	gid = cr->cr_gid;
859 
860 	/* Will try to reset the PRIV_AWARE bit later. */
861 	if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
862 		privflags |= PRIV_RESET;
863 
864 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
865 		/*
866 		 * Set-uid root execution only allowed if the limit set
867 		 * holds all unsafe privileges.
868 		 */
869 		if ((vattrp->va_mode & VSUID) && (vattrp->va_uid != 0 ||
870 		    priv_issubset(&priv_unsafe, &CR_LPRIV(cr)))) {
871 			uid = vattrp->va_uid;
872 			privflags |= PRIV_SETUGID;
873 		}
874 		if (vattrp->va_mode & VSGID) {
875 			gid = vattrp->va_gid;
876 			privflags |= PRIV_SETUGID;
877 		}
878 	}
879 
880 	/*
881 	 * Do we need to change our credential anyway?
882 	 * This is the case when E != I or P != I, as
883 	 * we need to do the assignments (with F empty and A full)
884 	 * Or when I is not a subset of L; in that case we need to
885 	 * enforce L.
886 	 *
887 	 *		I' = L & I
888 	 *
889 	 *		E' = P' = (I' + F) & A
890 	 * or
891 	 *		E' = P' = I'
892 	 */
893 	if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
894 	    !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
895 	    !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
896 		privflags |= PRIV_RESET;
897 
898 	/* If MAC-aware flag(s) are on, need to update cred to remove. */
899 	if ((CR_FLAGS(cr) & NET_MAC_AWARE) ||
900 	    (CR_FLAGS(cr) & NET_MAC_AWARE_INHERIT))
901 		privflags |= MAC_FLAGS;
902 
903 	/*
904 	 * When we introduce the "forced" set then we will need
905 	 * to set PRIV_INCREASE here if I not a subset of P.
906 	 * If the "allowed" set is introduced we will need to do
907 	 * a similar thing; however, it seems more reasonable to
908 	 * have the allowed set reduce "L": script language interpreters
909 	 * would typically have an allowed set of "all".
910 	 */
911 
912 	/*
913 	 * Set setuid/setgid protections if no ptrace() compatibility.
914 	 * For privileged processes, honor setuid/setgid even in
915 	 * the presence of ptrace() compatibility.
916 	 */
917 	if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
918 	    PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
919 	    (cr->cr_uid != uid ||
920 	    cr->cr_gid != gid ||
921 	    cr->cr_suid != uid ||
922 	    cr->cr_sgid != gid)) {
923 		*uidp = uid;
924 		*gidp = gid;
925 		privflags |= PRIV_SETID;
926 	}
927 	return (privflags);
928 }
929 
930 int
931 execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
932 {
933 	int error;
934 	proc_t *p = ttoproc(curthread);
935 
936 	vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
937 	if (error = VOP_GETATTR(vp, vattrp, ATTR_EXEC, p->p_cred, NULL))
938 		return (error);
939 	/*
940 	 * Check the access mode.
941 	 * If VPROC, ask /proc if the file is an object file.
942 	 */
943 	if ((error = VOP_ACCESS(vp, VEXEC, 0, p->p_cred, NULL)) != 0 ||
944 	    !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
945 	    (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
946 	    (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
947 		if (error == 0)
948 			error = EACCES;
949 		return (error);
950 	}
951 
952 	if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
953 	    (error = VOP_ACCESS(vp, VREAD, 0, p->p_cred, NULL))) {
954 		/*
955 		 * If process is under ptrace(2) compatibility,
956 		 * fail the exec(2).
957 		 */
958 		if (p->p_proc_flag & P_PR_PTRACE)
959 			goto bad;
960 		/*
961 		 * Process is traced via /proc.
962 		 * Arrange to invalidate the /proc vnode.
963 		 */
964 		args->traceinval = 1;
965 	}
966 	return (0);
967 bad:
968 	if (error == 0)
969 		error = ENOEXEC;
970 	return (error);
971 }
972 
973 /*
974  * Map a section of an executable file into the user's
975  * address space.
976  */
977 int
978 execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
979     off_t offset, int prot, int page, uint_t szc)
980 {
981 	int error = 0;
982 	off_t oldoffset;
983 	caddr_t zfodbase, oldaddr;
984 	size_t end, oldlen;
985 	size_t zfoddiff;
986 	label_t ljb;
987 	proc_t *p = ttoproc(curthread);
988 
989 	oldaddr = addr;
990 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
991 	if (len) {
992 		oldlen = len;
993 		len += ((size_t)oldaddr - (size_t)addr);
994 		oldoffset = offset;
995 		offset = (off_t)((uintptr_t)offset & PAGEMASK);
996 		if (page) {
997 			spgcnt_t  prefltmem, availm, npages;
998 			int preread;
999 			uint_t mflag = MAP_PRIVATE | MAP_FIXED;
1000 
1001 			if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
1002 				mflag |= MAP_TEXT;
1003 			} else {
1004 				mflag |= MAP_INITDATA;
1005 			}
1006 
1007 			if (valid_usr_range(addr, len, prot, p->p_as,
1008 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1009 				error = ENOMEM;
1010 				goto bad;
1011 			}
1012 			if (error = VOP_MAP(vp, (offset_t)offset,
1013 			    p->p_as, &addr, len, prot, PROT_ALL,
1014 			    mflag, CRED(), NULL))
1015 				goto bad;
1016 
1017 			/*
1018 			 * If the segment can fit, then we prefault
1019 			 * the entire segment in.  This is based on the
1020 			 * model that says the best working set of a
1021 			 * small program is all of its pages.
1022 			 */
1023 			npages = (spgcnt_t)btopr(len);
1024 			prefltmem = freemem - desfree;
1025 			preread =
1026 			    (npages < prefltmem && len < PGTHRESH) ? 1 : 0;
1027 
1028 			/*
1029 			 * If we aren't prefaulting the segment,
1030 			 * increment "deficit", if necessary to ensure
1031 			 * that pages will become available when this
1032 			 * process starts executing.
1033 			 */
1034 			availm = freemem - lotsfree;
1035 			if (preread == 0 && npages > availm &&
1036 			    deficit < lotsfree) {
1037 				deficit += MIN((pgcnt_t)(npages - availm),
1038 				    lotsfree - deficit);
1039 			}
1040 
1041 			if (preread) {
1042 				TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
1043 				    "execmap preread:freemem %d size %lu",
1044 				    freemem, len);
1045 				(void) as_fault(p->p_as->a_hat, p->p_as,
1046 				    (caddr_t)addr, len, F_INVAL, S_READ);
1047 			}
1048 		} else {
1049 			if (valid_usr_range(addr, len, prot, p->p_as,
1050 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1051 				error = ENOMEM;
1052 				goto bad;
1053 			}
1054 
1055 			if (error = as_map(p->p_as, addr, len,
1056 			    segvn_create, zfod_argsp))
1057 				goto bad;
1058 			/*
1059 			 * Read in the segment in one big chunk.
1060 			 */
1061 			if (error = vn_rdwr(UIO_READ, vp, (caddr_t)oldaddr,
1062 			    oldlen, (offset_t)oldoffset, UIO_USERSPACE, 0,
1063 			    (rlim64_t)0, CRED(), (ssize_t *)0))
1064 				goto bad;
1065 			/*
1066 			 * Now set protections.
1067 			 */
1068 			if (prot != PROT_ZFOD) {
1069 				(void) as_setprot(p->p_as, (caddr_t)addr,
1070 				    len, prot);
1071 			}
1072 		}
1073 	}
1074 
1075 	if (zfodlen) {
1076 		struct as *as = curproc->p_as;
1077 		struct seg *seg;
1078 		uint_t zprot = 0;
1079 
1080 		end = (size_t)addr + len;
1081 		zfodbase = (caddr_t)roundup(end, PAGESIZE);
1082 		zfoddiff = (uintptr_t)zfodbase - end;
1083 		if (zfoddiff) {
1084 			/*
1085 			 * Before we go to zero the remaining space on the last
1086 			 * page, make sure we have write permission.
1087 			 */
1088 
1089 			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1090 			seg = as_segat(curproc->p_as, (caddr_t)end);
1091 			if (seg != NULL)
1092 				SEGOP_GETPROT(seg, (caddr_t)end, zfoddiff - 1,
1093 				    &zprot);
1094 			AS_LOCK_EXIT(as, &as->a_lock);
1095 
1096 			if (seg != NULL && (zprot & PROT_WRITE) == 0) {
1097 				(void) as_setprot(as, (caddr_t)end,
1098 				    zfoddiff - 1, zprot | PROT_WRITE);
1099 			}
1100 
1101 			if (on_fault(&ljb)) {
1102 				no_fault();
1103 				if (seg != NULL && (zprot & PROT_WRITE) == 0)
1104 					(void) as_setprot(as, (caddr_t)end,
1105 					zfoddiff - 1, zprot);
1106 				error = EFAULT;
1107 				goto bad;
1108 			}
1109 			uzero((void *)end, zfoddiff);
1110 			no_fault();
1111 			if (seg != NULL && (zprot & PROT_WRITE) == 0)
1112 				(void) as_setprot(as, (caddr_t)end,
1113 				    zfoddiff - 1, zprot);
1114 		}
1115 		if (zfodlen > zfoddiff) {
1116 			struct segvn_crargs crargs =
1117 			    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
1118 
1119 			zfodlen -= zfoddiff;
1120 			if (valid_usr_range(zfodbase, zfodlen, prot, p->p_as,
1121 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1122 				error = ENOMEM;
1123 				goto bad;
1124 			}
1125 			if (szc > 0) {
1126 				/*
1127 				 * ASSERT alignment because the mapelfexec()
1128 				 * caller for the szc > 0 case extended zfod
1129 				 * so it's end is pgsz aligned.
1130 				 */
1131 				size_t pgsz = page_get_pagesize(szc);
1132 				ASSERT(IS_P2ALIGNED(zfodbase + zfodlen, pgsz));
1133 
1134 				if (IS_P2ALIGNED(zfodbase, pgsz)) {
1135 					crargs.szc = szc;
1136 				} else {
1137 					crargs.szc = AS_MAP_HEAP;
1138 				}
1139 			} else {
1140 				crargs.szc = AS_MAP_NO_LPOOB;
1141 			}
1142 			if (error = as_map(p->p_as, (caddr_t)zfodbase,
1143 			    zfodlen, segvn_create, &crargs))
1144 				goto bad;
1145 			if (prot != PROT_ZFOD) {
1146 				(void) as_setprot(p->p_as, (caddr_t)zfodbase,
1147 				    zfodlen, prot);
1148 			}
1149 		}
1150 	}
1151 	return (0);
1152 bad:
1153 	return (error);
1154 }
1155 
1156 void
1157 setexecenv(struct execenv *ep)
1158 {
1159 	proc_t *p = ttoproc(curthread);
1160 	klwp_t *lwp = ttolwp(curthread);
1161 	struct vnode *vp;
1162 
1163 	p->p_bssbase = ep->ex_bssbase;
1164 	p->p_brkbase = ep->ex_brkbase;
1165 	p->p_brksize = ep->ex_brksize;
1166 	if (p->p_exec)
1167 		VN_RELE(p->p_exec);	/* out with the old */
1168 	vp = p->p_exec = ep->ex_vp;
1169 	if (vp != NULL)
1170 		VN_HOLD(vp);		/* in with the new */
1171 
1172 	lwp->lwp_sigaltstack.ss_sp = 0;
1173 	lwp->lwp_sigaltstack.ss_size = 0;
1174 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
1175 }
1176 
1177 int
1178 execopen(struct vnode **vpp, int *fdp)
1179 {
1180 	struct vnode *vp = *vpp;
1181 	file_t *fp;
1182 	int error = 0;
1183 	int filemode = FREAD;
1184 
1185 	VN_HOLD(vp);		/* open reference */
1186 	if (error = falloc(NULL, filemode, &fp, fdp)) {
1187 		VN_RELE(vp);
1188 		*fdp = -1;	/* just in case falloc changed value */
1189 		return (error);
1190 	}
1191 	if (error = VOP_OPEN(&vp, filemode, CRED(), NULL)) {
1192 		VN_RELE(vp);
1193 		setf(*fdp, NULL);
1194 		unfalloc(fp);
1195 		*fdp = -1;
1196 		return (error);
1197 	}
1198 	*vpp = vp;		/* vnode should not have changed */
1199 	fp->f_vnode = vp;
1200 	mutex_exit(&fp->f_tlock);
1201 	setf(*fdp, fp);
1202 	return (0);
1203 }
1204 
1205 int
1206 execclose(int fd)
1207 {
1208 	return (closeandsetf(fd, NULL));
1209 }
1210 
1211 
1212 /*
1213  * noexec stub function.
1214  */
1215 /*ARGSUSED*/
1216 int
1217 noexec(
1218     struct vnode *vp,
1219     struct execa *uap,
1220     struct uarg *args,
1221     struct intpdata *idatap,
1222     int level,
1223     long *execsz,
1224     int setid,
1225     caddr_t exec_file,
1226     struct cred *cred)
1227 {
1228 	cmn_err(CE_WARN, "missing exec capability for %s", uap->fname);
1229 	return (ENOEXEC);
1230 }
1231 
1232 /*
1233  * Support routines for building a user stack.
1234  *
1235  * execve(path, argv, envp) must construct a new stack with the specified
1236  * arguments and environment variables (see exec_args() for a description
1237  * of the user stack layout).  To do this, we copy the arguments and
1238  * environment variables from the old user address space into the kernel,
1239  * free the old as, create the new as, and copy our buffered information
1240  * to the new stack.  Our kernel buffer has the following structure:
1241  *
1242  *	+-----------------------+ <--- stk_base + stk_size
1243  *	| string offsets	|
1244  *	+-----------------------+ <--- stk_offp
1245  *	|			|
1246  *	| STK_AVAIL() space	|
1247  *	|			|
1248  *	+-----------------------+ <--- stk_strp
1249  *	| strings		|
1250  *	+-----------------------+ <--- stk_base
1251  *
1252  * When we add a string, we store the string's contents (including the null
1253  * terminator) at stk_strp, and we store the offset of the string relative to
1254  * stk_base at --stk_offp.  As strings are added, stk_strp increases and
1255  * stk_offp decreases.  The amount of space remaining, STK_AVAIL(), is just
1256  * the difference between these pointers.  If we run out of space, we return
1257  * an error and exec_args() starts all over again with a buffer twice as large.
1258  * When we're all done, the kernel buffer looks like this:
1259  *
1260  *	+-----------------------+ <--- stk_base + stk_size
1261  *	| argv[0] offset	|
1262  *	+-----------------------+
1263  *	| ...			|
1264  *	+-----------------------+
1265  *	| argv[argc-1] offset	|
1266  *	+-----------------------+
1267  *	| envp[0] offset	|
1268  *	+-----------------------+
1269  *	| ...			|
1270  *	+-----------------------+
1271  *	| envp[envc-1] offset	|
1272  *	+-----------------------+
1273  *	| AT_SUN_PLATFORM offset|
1274  *	+-----------------------+
1275  *	| AT_SUN_EXECNAME offset|
1276  *	+-----------------------+ <--- stk_offp
1277  *	|			|
1278  *	| STK_AVAIL() space	|
1279  *	|			|
1280  *	+-----------------------+ <--- stk_strp
1281  *	| AT_SUN_EXECNAME string|
1282  *	+-----------------------+
1283  *	| AT_SUN_PLATFORM string|
1284  *	+-----------------------+
1285  *	| envp[envc-1] string	|
1286  *	+-----------------------+
1287  *	| ...			|
1288  *	+-----------------------+
1289  *	| envp[0] string	|
1290  *	+-----------------------+
1291  *	| argv[argc-1] string	|
1292  *	+-----------------------+
1293  *	| ...			|
1294  *	+-----------------------+
1295  *	| argv[0] string	|
1296  *	+-----------------------+ <--- stk_base
1297  */
1298 
/*
 * Free space, in bytes, left in the kernel staging buffer described by
 * args: the distance from the next string byte (stk_strp) up to the
 * lowest offset slot currently in use (stk_offp).  The value is a signed
 * pointer difference.
 */
#define	STK_AVAIL(args)		((char *)(args)->stk_offp - (args)->stk_strp)
1300 
1301 /*
1302  * Add a string to the stack.
1303  */
1304 static int
1305 stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
1306 {
1307 	int error;
1308 	size_t len;
1309 
1310 	if (STK_AVAIL(args) < sizeof (int))
1311 		return (E2BIG);
1312 	*--args->stk_offp = args->stk_strp - args->stk_base;
1313 
1314 	if (segflg == UIO_USERSPACE) {
1315 		error = copyinstr(sp, args->stk_strp, STK_AVAIL(args), &len);
1316 		if (error != 0)
1317 			return (error);
1318 	} else {
1319 		len = strlen(sp) + 1;
1320 		if (len > STK_AVAIL(args))
1321 			return (E2BIG);
1322 		bcopy(sp, args->stk_strp, len);
1323 	}
1324 
1325 	args->stk_strp += len;
1326 
1327 	return (0);
1328 }
1329 
1330 static int
1331 stk_getptr(uarg_t *args, char *src, char **dst)
1332 {
1333 	int error;
1334 
1335 	if (args->from_model == DATAMODEL_NATIVE) {
1336 		ulong_t ptr;
1337 		error = fulword(src, &ptr);
1338 		*dst = (caddr_t)ptr;
1339 	} else {
1340 		uint32_t ptr;
1341 		error = fuword32(src, &ptr);
1342 		*dst = (caddr_t)(uintptr_t)ptr;
1343 	}
1344 	return (error);
1345 }
1346 
1347 static int
1348 stk_putptr(uarg_t *args, char *addr, char *value)
1349 {
1350 	if (args->to_model == DATAMODEL_NATIVE)
1351 		return (sulword(addr, (ulong_t)value));
1352 	else
1353 		return (suword32(addr, (uint32_t)(uintptr_t)value));
1354 }
1355 
1356 static int
1357 stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1358 {
1359 	char *sp;
1360 	int argc, error;
1361 	int argv_empty = 0;
1362 	size_t ptrsize = args->from_ptrsize;
1363 	size_t size, pad;
1364 	char *argv = (char *)uap->argp;
1365 	char *envp = (char *)uap->envp;
1366 
1367 	/*
1368 	 * Copy interpreter's name and argument to argv[0] and argv[1].
1369 	 */
1370 	if (intp != NULL && intp->intp_name != NULL) {
1371 		if ((error = stk_add(args, intp->intp_name, UIO_SYSSPACE)) != 0)
1372 			return (error);
1373 		if (intp->intp_arg != NULL &&
1374 		    (error = stk_add(args, intp->intp_arg, UIO_SYSSPACE)) != 0)
1375 			return (error);
1376 		if (args->fname != NULL)
1377 			error = stk_add(args, args->fname, UIO_SYSSPACE);
1378 		else
1379 			error = stk_add(args, uap->fname, UIO_USERSPACE);
1380 		if (error)
1381 			return (error);
1382 
1383 		/*
1384 		 * Check for an empty argv[].
1385 		 */
1386 		if (stk_getptr(args, argv, &sp))
1387 			return (EFAULT);
1388 		if (sp == NULL)
1389 			argv_empty = 1;
1390 
1391 		argv += ptrsize;		/* ignore original argv[0] */
1392 	}
1393 
1394 	if (argv_empty == 0) {
1395 		/*
1396 		 * Add argv[] strings to the stack.
1397 		 */
1398 		for (;;) {
1399 			if (stk_getptr(args, argv, &sp))
1400 				return (EFAULT);
1401 			if (sp == NULL)
1402 				break;
1403 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1404 				return (error);
1405 			argv += ptrsize;
1406 		}
1407 	}
1408 	argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1409 	args->arglen = args->stk_strp - args->stk_base;
1410 
1411 	/*
1412 	 * Add environ[] strings to the stack.
1413 	 */
1414 	if (envp != NULL) {
1415 		for (;;) {
1416 			if (stk_getptr(args, envp, &sp))
1417 				return (EFAULT);
1418 			if (sp == NULL)
1419 				break;
1420 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1421 				return (error);
1422 			envp += ptrsize;
1423 		}
1424 	}
1425 	args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1426 	args->ne = args->na - argc;
1427 
1428 	/*
1429 	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
1430 	 * AT_SUN_EMULATOR strings to the stack.
1431 	 */
1432 	if (auxvpp != NULL && *auxvpp != NULL) {
1433 		if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
1434 			return (error);
1435 		if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0)
1436 			return (error);
1437 		if (args->brandname != NULL &&
1438 		    (error = stk_add(args, args->brandname,
1439 			UIO_SYSSPACE)) != 0)
1440 			return (error);
1441 		if (args->emulator != NULL &&
1442 		    (error = stk_add(args, args->emulator,
1443 			UIO_SYSSPACE)) != 0)
1444 			return (error);
1445 	}
1446 
1447 	/*
1448 	 * Compute the size of the stack.  This includes all the pointers,
1449 	 * the space reserved for the aux vector, and all the strings.
1450 	 * The total number of pointers is args->na (which is argc + envc)
1451 	 * plus 4 more: (1) a pointer's worth of space for argc; (2) the NULL
1452 	 * after the last argument (i.e. argv[argc]); (3) the NULL after the
1453 	 * last environment variable (i.e. envp[envc]); and (4) the NULL after
1454 	 * all the strings, at the very top of the stack.
1455 	 */
1456 	size = (args->na + 4) * args->to_ptrsize + args->auxsize +
1457 	    (args->stk_strp - args->stk_base);
1458 
1459 	/*
1460 	 * Pad the string section with zeroes to align the stack size.
1461 	 */
1462 	pad = P2NPHASE(size, args->stk_align);
1463 
1464 	if (STK_AVAIL(args) < pad)
1465 		return (E2BIG);
1466 
1467 	args->usrstack_size = size + pad;
1468 
1469 	while (pad-- != 0)
1470 		*args->stk_strp++ = 0;
1471 
1472 	args->nc = args->stk_strp - args->stk_base;
1473 
1474 	return (0);
1475 }
1476 
1477 static int
1478 stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
1479 {
1480 	size_t ptrsize = args->to_ptrsize;
1481 	ssize_t pslen;
1482 	char *kstrp = args->stk_base;
1483 	char *ustrp = usrstack - args->nc - ptrsize;
1484 	char *usp = usrstack - args->usrstack_size;
1485 	int *offp = (int *)(args->stk_base + args->stk_size);
1486 	int envc = args->ne;
1487 	int argc = args->na - envc;
1488 	int i;
1489 
1490 	/*
1491 	 * Record argc for /proc.
1492 	 */
1493 	up->u_argc = argc;
1494 
1495 	/*
1496 	 * Put argc on the stack.  Note that even though it's an int,
1497 	 * it always consumes ptrsize bytes (for alignment).
1498 	 */
1499 	if (stk_putptr(args, usp, (char *)(uintptr_t)argc))
1500 		return (-1);
1501 
1502 	/*
1503 	 * Add argc space (ptrsize) to usp and record argv for /proc.
1504 	 */
1505 	up->u_argv = (uintptr_t)(usp += ptrsize);
1506 
1507 	/*
1508 	 * Put the argv[] pointers on the stack.
1509 	 */
1510 	for (i = 0; i < argc; i++, usp += ptrsize)
1511 		if (stk_putptr(args, usp, &ustrp[*--offp]))
1512 			return (-1);
1513 
1514 	/*
1515 	 * Copy arguments to u_psargs.
1516 	 */
1517 	pslen = MIN(args->arglen, PSARGSZ) - 1;
1518 	for (i = 0; i < pslen; i++)
1519 		up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]);
1520 	while (i < PSARGSZ)
1521 		up->u_psargs[i++] = '\0';
1522 
1523 	/*
1524 	 * Add space for argv[]'s NULL terminator (ptrsize) to usp and
1525 	 * record envp for /proc.
1526 	 */
1527 	up->u_envp = (uintptr_t)(usp += ptrsize);
1528 
1529 	/*
1530 	 * Put the envp[] pointers on the stack.
1531 	 */
1532 	for (i = 0; i < envc; i++, usp += ptrsize)
1533 		if (stk_putptr(args, usp, &ustrp[*--offp]))
1534 			return (-1);
1535 
1536 	/*
1537 	 * Add space for envp[]'s NULL terminator (ptrsize) to usp and
1538 	 * remember where the stack ends, which is also where auxv begins.
1539 	 */
1540 	args->stackend = usp += ptrsize;
1541 
1542 	/*
1543 	 * Put all the argv[], envp[], and auxv strings on the stack.
1544 	 */
1545 	if (copyout(args->stk_base, ustrp, args->nc))
1546 		return (-1);
1547 
1548 	/*
1549 	 * Fill in the aux vector now that we know the user stack addresses
1550 	 * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
1551 	 * AT_SUN_EMULATOR strings.
1552 	 */
1553 	if (auxvpp != NULL && *auxvpp != NULL) {
1554 		if (args->to_model == DATAMODEL_NATIVE) {
1555 			auxv_t **a = (auxv_t **)auxvpp;
1556 			ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp])
1557 			ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp])
1558 			if (args->brandname != NULL)
1559 				ADDAUX(*a,
1560 				    AT_SUN_BRANDNAME, (long)&ustrp[*--offp])
1561 			if (args->emulator != NULL)
1562 				ADDAUX(*a,
1563 				    AT_SUN_EMULATOR, (long)&ustrp[*--offp])
1564 		} else {
1565 			auxv32_t **a = (auxv32_t **)auxvpp;
1566 			ADDAUX(*a,
1567 			    AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp])
1568 			ADDAUX(*a,
1569 			    AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp])
1570 			if (args->brandname != NULL)
1571 				ADDAUX(*a, AT_SUN_BRANDNAME,
1572 				    (int)(uintptr_t)&ustrp[*--offp])
1573 			if (args->emulator != NULL)
1574 				ADDAUX(*a, AT_SUN_EMULATOR,
1575 				    (int)(uintptr_t)&ustrp[*--offp])
1576 		}
1577 	}
1578 
1579 	return (0);
1580 }
1581 
1582 /*
1583  * Initialize a new user stack with the specified arguments and environment.
1584  * The initial user stack layout is as follows:
1585  *
1586  *	User Stack
1587  *	+---------------+ <--- curproc->p_usrstack
1588  *	|		|
1589  *	| slew		|
1590  *	|		|
1591  *	+---------------+
1592  *	| NULL		|
1593  *	+---------------+
1594  *	|		|
1595  *	| auxv strings	|
1596  *	|		|
1597  *	+---------------+
1598  *	|		|
1599  *	| envp strings	|
1600  *	|		|
1601  *	+---------------+
1602  *	|		|
1603  *	| argv strings	|
1604  *	|		|
1605  *	+---------------+ <--- ustrp
1606  *	|		|
1607  *	| aux vector	|
1608  *	|		|
1609  *	+---------------+ <--- auxv
1610  *	| NULL		|
1611  *	+---------------+
1612  *	| envp[envc-1]	|
1613  *	+---------------+
1614  *	| ...		|
1615  *	+---------------+
1616  *	| envp[0]	|
1617  *	+---------------+ <--- envp[]
1618  *	| NULL		|
1619  *	+---------------+
1620  *	| argv[argc-1]	|
1621  *	+---------------+
1622  *	| ...		|
1623  *	+---------------+
1624  *	| argv[0]	|
1625  *	+---------------+ <--- argv[]
1626  *	| argc		|
1627  *	+---------------+ <--- stack base
1628  */
1629 int
1630 exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1631 {
1632 	size_t size;
1633 	int error;
1634 	proc_t *p = ttoproc(curthread);
1635 	user_t *up = PTOU(p);
1636 	char *usrstack;
1637 	rctl_entity_p_t e;
1638 	struct as *as;
1639 	extern int use_stk_lpg;
1640 	size_t sp_slew;
1641 
1642 	args->from_model = p->p_model;
1643 	if (p->p_model == DATAMODEL_NATIVE) {
1644 		args->from_ptrsize = sizeof (long);
1645 	} else {
1646 		args->from_ptrsize = sizeof (int32_t);
1647 	}
1648 
1649 	if (args->to_model == DATAMODEL_NATIVE) {
1650 		args->to_ptrsize = sizeof (long);
1651 		args->ncargs = NCARGS;
1652 		args->stk_align = STACK_ALIGN;
1653 		usrstack = (char *)USRSTACK;
1654 	} else {
1655 		args->to_ptrsize = sizeof (int32_t);
1656 		args->ncargs = NCARGS32;
1657 		args->stk_align = STACK_ALIGN32;
1658 		usrstack = (char *)USRSTACK32;
1659 	}
1660 
1661 	ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);
1662 
1663 #if defined(__sparc)
1664 	/*
1665 	 * Make sure user register windows are empty before
1666 	 * attempting to make a new stack.
1667 	 */
1668 	(void) flush_user_windows_to_stack(NULL);
1669 #endif
1670 
1671 	for (size = PAGESIZE; ; size *= 2) {
1672 		args->stk_size = size;
1673 		args->stk_base = kmem_alloc(size, KM_SLEEP);
1674 		args->stk_strp = args->stk_base;
1675 		args->stk_offp = (int *)(args->stk_base + size);
1676 		error = stk_copyin(uap, args, intp, auxvpp);
1677 		if (error == 0)
1678 			break;
1679 		kmem_free(args->stk_base, size);
1680 		if (error != E2BIG && error != ENAMETOOLONG)
1681 			return (error);
1682 		if (size >= args->ncargs)
1683 			return (E2BIG);
1684 	}
1685 
1686 	size = args->usrstack_size;
1687 
1688 	ASSERT(error == 0);
1689 	ASSERT(P2PHASE(size, args->stk_align) == 0);
1690 	ASSERT((ssize_t)STK_AVAIL(args) >= 0);
1691 
1692 	if (size > args->ncargs) {
1693 		kmem_free(args->stk_base, args->stk_size);
1694 		return (E2BIG);
1695 	}
1696 
1697 	/*
1698 	 * Leave only the current lwp and force the other lwps to exit.
1699 	 * If another lwp beat us to the punch by calling exit(), bail out.
1700 	 */
1701 	if ((error = exitlwps(0)) != 0) {
1702 		kmem_free(args->stk_base, args->stk_size);
1703 		return (error);
1704 	}
1705 
1706 	/*
1707 	 * Revoke any doors created by the process.
1708 	 */
1709 	if (p->p_door_list)
1710 		door_exit();
1711 
1712 	/*
1713 	 * Release schedctl data structures.
1714 	 */
1715 	if (p->p_pagep)
1716 		schedctl_proc_cleanup();
1717 
1718 	/*
1719 	 * Clean up any DTrace helpers for the process.
1720 	 */
1721 	if (p->p_dtrace_helpers != NULL) {
1722 		ASSERT(dtrace_helpers_cleanup != NULL);
1723 		(*dtrace_helpers_cleanup)();
1724 	}
1725 
1726 	mutex_enter(&p->p_lock);
1727 	/*
1728 	 * Cleanup the DTrace provider associated with this process.
1729 	 */
1730 	if (p->p_dtrace_probes) {
1731 		ASSERT(dtrace_fasttrap_exec_ptr != NULL);
1732 		dtrace_fasttrap_exec_ptr(p);
1733 	}
1734 	mutex_exit(&p->p_lock);
1735 
1736 	/*
1737 	 * discard the lwpchan cache.
1738 	 */
1739 	if (p->p_lcp != NULL)
1740 		lwpchan_destroy_cache(1);
1741 
1742 	/*
1743 	 * Delete the POSIX timers.
1744 	 */
1745 	if (p->p_itimer != NULL)
1746 		timer_exit();
1747 
1748 #ifdef C2_AUDIT
1749 	if (audit_active)
1750 		audit_exec(args->stk_base, args->stk_base + args->arglen,
1751 		    args->na - args->ne, args->ne);
1752 #endif
1753 
1754 	/*
1755 	 * Ensure that we don't change resource associations while we
1756 	 * change address spaces.
1757 	 */
1758 	mutex_enter(&p->p_lock);
1759 	pool_barrier_enter();
1760 	mutex_exit(&p->p_lock);
1761 
1762 	/*
1763 	 * Destroy the old address space and create a new one.
1764 	 * From here on, any errors are fatal to the exec()ing process.
1765 	 * On error we return -1, which means the caller must SIGKILL
1766 	 * the process.
1767 	 */
1768 	relvm();
1769 
1770 	mutex_enter(&p->p_lock);
1771 	pool_barrier_exit();
1772 	mutex_exit(&p->p_lock);
1773 
1774 	up->u_execsw = args->execswp;
1775 
1776 	p->p_brkbase = NULL;
1777 	p->p_brksize = 0;
1778 	p->p_brkpageszc = 0;
1779 	p->p_stksize = 0;
1780 	p->p_stkpageszc = 0;
1781 	p->p_model = args->to_model;
1782 	p->p_usrstack = usrstack;
1783 	p->p_stkprot = args->stk_prot;
1784 	p->p_datprot = args->dat_prot;
1785 
1786 	/*
1787 	 * Reset resource controls such that all controls are again active as
1788 	 * well as appropriate to the potentially new address model for the
1789 	 * process.
1790 	 */
1791 	e.rcep_p.proc = p;
1792 	e.rcep_t = RCENTITY_PROCESS;
1793 	rctl_set_reset(p->p_rctls, p, &e);
1794 
1795 	/* Too early to call map_pgsz for the heap */
1796 	if (use_stk_lpg) {
1797 		p->p_stkpageszc = page_szc(map_pgsz(MAPPGSZ_STK, p, 0, 0, 0));
1798 	}
1799 
1800 	mutex_enter(&p->p_lock);
1801 	p->p_flag |= SAUTOLPG;	/* kernel controls page sizes */
1802 	mutex_exit(&p->p_lock);
1803 
1804 	/*
1805 	 * Some platforms may choose to randomize real stack start by adding a
1806 	 * small slew (not more than a few hundred bytes) to the top of the
1807 	 * stack. This helps avoid cache thrashing when identical processes
1808 	 * simultaneously share caches that don't provide enough associativity
1809 	 * (e.g. sun4v systems). In this case stack slewing makes the same hot
1810 	 * stack variables in different processes to live in different cache
1811 	 * sets increasing effective associativity.
1812 	 */
1813 	sp_slew = exec_get_spslew();
1814 	ASSERT(P2PHASE(sp_slew, args->stk_align) == 0);
1815 	exec_set_sp(size + sp_slew);
1816 
1817 	as = as_alloc();
1818 	p->p_as = as;
1819 	as->a_proc = p;
1820 	if (p->p_model == DATAMODEL_ILP32)
1821 		as->a_userlimit = (caddr_t)USERLIMIT32;
1822 	(void) hat_setup(as->a_hat, HAT_ALLOC);
1823 	hat_join_srd(as->a_hat, args->ex_vp);
1824 
1825 	/*
1826 	 * Finally, write out the contents of the new stack.
1827 	 */
1828 	error = stk_copyout(args, usrstack - sp_slew, auxvpp, up);
1829 	kmem_free(args->stk_base, args->stk_size);
1830 	return (error);
1831 }
1832