xref: /illumos-gate/usr/src/uts/common/os/exec.c (revision d1855c8182d5cf1cd290336767a7c8e7537c13a2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*	Copyright (c) 1988 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 /*
29  * Copyright 2017 Joyent, Inc.
30  */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/systm.h>
36 #include <sys/signal.h>
37 #include <sys/cred_impl.h>
38 #include <sys/policy.h>
39 #include <sys/user.h>
40 #include <sys/errno.h>
41 #include <sys/file.h>
42 #include <sys/vfs.h>
43 #include <sys/vnode.h>
44 #include <sys/mman.h>
45 #include <sys/acct.h>
46 #include <sys/cpuvar.h>
47 #include <sys/proc.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/pathname.h>
51 #include <sys/vm.h>
52 #include <sys/lgrp.h>
53 #include <sys/vtrace.h>
54 #include <sys/exec.h>
55 #include <sys/exechdr.h>
56 #include <sys/kmem.h>
57 #include <sys/prsystm.h>
58 #include <sys/modctl.h>
59 #include <sys/vmparam.h>
60 #include <sys/door.h>
61 #include <sys/schedctl.h>
62 #include <sys/utrap.h>
63 #include <sys/systeminfo.h>
64 #include <sys/stack.h>
65 #include <sys/rctl.h>
66 #include <sys/dtrace.h>
67 #include <sys/lwpchan_impl.h>
68 #include <sys/pool.h>
69 #include <sys/sdt.h>
70 #include <sys/brand.h>
71 #include <sys/klpd.h>
72 #include <sys/random.h>
73 
74 #include <c2/audit.h>
75 
76 #include <vm/hat.h>
77 #include <vm/anon.h>
78 #include <vm/as.h>
79 #include <vm/seg.h>
80 #include <vm/seg_vn.h>
81 #include <vm/seg_hole.h>
82 
/*
 * Bits of the "privflags" value computed by execsetid() and acted upon by
 * gexec() when it builds the credential for the new image.
 */
83 #define	PRIV_RESET		0x01	/* needs to reset privs */
84 #define	PRIV_SETID		0x02	/* needs to change uids */
85 #define	PRIV_SETUGID		0x04	/* is setuid/setgid/forced privs */
86 #define	PRIV_INCREASE		0x08	/* child runs with more privs */
87 #define	MAC_FLAGS		0x10	/* need to adjust MAC flags */
88 #define	PRIV_FORCED		0x20	/* has forced privileges */
89 
/* File-local helpers; both are defined further down in this file. */
90 static int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *,
91     priv_set_t *, cred_t *, const char *);
92 static int hold_execsw(struct execsw *);
93 
94 uint_t auxv_hwcap = 0;	/* auxv AT_SUN_HWCAP value; determined on the fly */
95 uint_t auxv_hwcap_2 = 0;	/* AT_SUN_HWCAP2 */
96 #if defined(_SYSCALL32_IMPL)
97 uint_t auxv_hwcap32 = 0;	/* 32-bit version of auxv_hwcap */
98 uint_t auxv_hwcap32_2 = 0;	/* 32-bit version of auxv_hwcap2 */
99 #endif
100 
/*
 * Process flags that gexec() clears at the start of a level-0 exec,
 * restores if the exec fails, and recomputes once the exec has succeeded.
 */
101 #define	PSUIDFLAGS		(SNOCD|SUGID)
102 
103 /*
104  * These are consumed within the specific exec modules, but are defined here
105  * because:
106  *
107  * 1) The exec modules are unloadable, which would make this near useless.
108  *
109  * 2) We want them to be common across all of them, should more than ELF come
110  *    to support them.
111  *
112  * All must be powers of 2.
113  */
114 size_t aslr_max_brk_skew = 16 * 1024 * 1024; /* 16MB */
/* exec_stackgap is a weak alias that resolves to aslr_max_stack_skew. */
115 #pragma weak exec_stackgap = aslr_max_stack_skew /* Old, compatible name */
116 size_t aslr_max_stack_skew = 64 * 1024; /* 64KB */
117 
118 /*
119  * Size of guard segment for 64-bit processes and minimum size it can be shrunk
120  * to in the case of grow() operations.  These are kept as variables in case
121  * they need to be tuned in an emergency.
122  */
123 size_t stack_guard_seg_sz = 256 * 1024 * 1024;	/* 256MB */
124 size_t stack_guard_min_sz = 64 * 1024 * 1024;	/* 64MB */
125 
126 /*
127  * exece() - system call wrapper around exec_common()
128  */
129 int
130 exece(const char *fname, const char **argp, const char **envp)
131 {
132 	int error;
133 
134 	error = exec_common(fname, argp, envp, EBA_NONE);
135 	return (error ? (set_errno(error)) : 0);
136 }
137 
/*
 * exec_common() - common implementation of exec for the calling lwp.
 *
 * fname	user-space address of the path to execute (read with
 *		UIO_USERSPACE)
 * argp, envp	user argument/environment vectors; passed to gexec() unchanged
 *		inside a struct execa
 * brand_action	EBA_NONE for an ordinary exec, EBA_NATIVE to unbrand a branded
 *		process, any other value to brand an unbranded process
 *
 * Returns 0 on success, otherwise an errno value (not yet passed through
 * set_errno(); see exece()).  ENOTSUP is returned for the /proc agent lwp and
 * for brand actions that do not apply to the calling process.
 *
 * On every path after prexecstart() the function restores the saved signal
 * hold mask and calls prexecend() under p_lock before returning.
 */
138 int
139 exec_common(const char *fname, const char **argp, const char **envp,
140     int brand_action)
141 {
142 	vnode_t *vp = NULL, *dir = NULL, *tmpvp = NULL;
143 	proc_t *p = ttoproc(curthread);
144 	klwp_t *lwp = ttolwp(curthread);
145 	struct user *up = PTOU(p);
146 	long execsz;		/* temporary count of exec size */
147 	int i;
148 	int error;
149 	char exec_file[MAXCOMLEN+1];
150 	struct pathname pn;
151 	struct pathname resolvepn;
152 	struct uarg args;
153 	struct execa ua;
154 	k_sigset_t savedmask;
155 	lwpdir_t *lwpdir = NULL;
156 	tidhash_t *tidhash;
157 	lwpdir_t *old_lwpdir = NULL;
158 	uint_t old_lwpdir_sz;
159 	tidhash_t *old_tidhash;
160 	uint_t old_tidhash_sz;
161 	ret_tidhash_t *ret_tidhash;
162 	lwpent_t *lep;
163 	boolean_t brandme = B_FALSE;
164 
165 	/*
166 	 * exec() is not supported for the /proc agent lwp.
167 	 */
168 	if (curthread == p->p_agenttp)
169 		return (ENOTSUP);
170 
171 	if (brand_action != EBA_NONE) {
172 		/*
173 		 * Brand actions are not supported for processes that are not
174 		 * running in a branded zone.
175 		 */
176 		if (!ZONE_IS_BRANDED(p->p_zone))
177 			return (ENOTSUP);
178 
179 		if (brand_action == EBA_NATIVE) {
180 			/* Only branded processes can be unbranded */
181 			if (!PROC_IS_BRANDED(p))
182 				return (ENOTSUP);
183 		} else {
184 			/* Only unbranded processes can be branded */
185 			if (PROC_IS_BRANDED(p))
186 				return (ENOTSUP);
187 			brandme = B_TRUE;
188 		}
189 	} else {
190 		/*
191 		 * If this is a native zone, or if the process is already
192 		 * branded, then we don't need to do anything.  If this is
193 		 * a native process in a branded zone, we need to brand the
194 		 * process as it exec()s the new binary.
195 		 */
196 		if (ZONE_IS_BRANDED(p->p_zone) && !PROC_IS_BRANDED(p))
197 			brandme = B_TRUE;
198 	}
199 
200 	/*
201 	 * Inform /proc that an exec() has started.
202 	 * Hold signals that are ignored by default so that we will
203 	 * not be interrupted by a signal that will be ignored after
204 	 * successful completion of gexec().
205 	 */
206 	mutex_enter(&p->p_lock);
207 	prexecstart();
208 	schedctl_finish_sigblock(curthread);
209 	savedmask = curthread->t_hold;
210 	sigorset(&curthread->t_hold, &ignoredefault);
211 	mutex_exit(&p->p_lock);
212 
213 	/*
214 	 * Look up path name and remember last component for later.
215 	 * To help coreadm expand its %d token, we attempt to save
216 	 * the directory containing the executable in p_execdir. The
217 	 * first call to lookuppn() may fail and return EINVAL because
218 	 * dirvpp is non-NULL. In that case, we make a second call to
219 	 * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
220 	 * but coreadm is allowed to expand %d to the empty string and
221 	 * there are other cases in which that failure may occur.
222 	 */
223 	if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
224 		goto out;
225 	pn_alloc(&resolvepn);
226 	if ((error = lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp)) != 0) {
227 		pn_free(&resolvepn);
228 		pn_free(&pn);
229 		if (error != EINVAL)
230 			goto out;
231 
		/* Retry from scratch without asking for the directory vnode. */
232 		dir = NULL;
233 		if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
234 			goto out;
235 		pn_alloc(&resolvepn);
236 		if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
237 		    &vp)) != 0) {
238 			pn_free(&resolvepn);
239 			pn_free(&pn);
240 			goto out;
241 		}
242 	}
	/* The lookup succeeded but produced no vnode: report ENOENT. */
243 	if (vp == NULL) {
244 		if (dir != NULL)
245 			VN_RELE(dir);
246 		error = ENOENT;
247 		pn_free(&resolvepn);
248 		pn_free(&pn);
249 		goto out;
250 	}
251 
	/*
	 * Policy check of the caller's credential against this vnode.
	 * NOTE(review): the exact policy is defined by secpolicy_basic_exec(),
	 * which is not visible here.
	 */
252 	if ((error = secpolicy_basic_exec(CRED(), vp)) != 0) {
253 		if (dir != NULL)
254 			VN_RELE(dir);
255 		pn_free(&resolvepn);
256 		pn_free(&pn);
257 		VN_RELE(vp);
258 		goto out;
259 	}
260 
261 	/*
262 	 * We do not allow executing files in attribute directories.
263 	 * We test this by determining whether the resolved path
264 	 * contains a "/" when we're in an attribute directory;
265 	 * only if the pathname does not contain a "/" the resolved path
266 	 * points to a file in the current working (attribute) directory.
267 	 */
268 	if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
269 	    strchr(resolvepn.pn_path, '/') == NULL) {
270 		if (dir != NULL)
271 			VN_RELE(dir);
272 		error = EACCES;
273 		pn_free(&resolvepn);
274 		pn_free(&pn);
275 		VN_RELE(vp);
276 		goto out;
277 	}
278 
	/*
	 * exec_file is zero-filled first and at most MAXCOMLEN bytes are
	 * copied into its MAXCOMLEN+1 bytes, so it is always NUL-terminated.
	 */
279 	bzero(exec_file, MAXCOMLEN+1);
280 	(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
281 	bzero(&args, sizeof (args));
282 	args.pathname = resolvepn.pn_path;
283 	/* don't free resolvepn until we are done with args */
284 	pn_free(&pn);
285 
286 	/*
287 	 * If we're running in a profile shell, then call pfexecd.
288 	 */
289 	if ((CR_FLAGS(p->p_cred) & PRIV_PFEXEC) != 0) {
290 		error = pfexec_call(p->p_cred, &resolvepn, &args.pfcred,
291 		    &args.scrubenv);
292 
293 		/* Returning errno in case we're not allowed to execute. */
294 		if (error > 0) {
295 			if (dir != NULL)
296 				VN_RELE(dir);
297 			pn_free(&resolvepn);
298 			VN_RELE(vp);
299 			goto out;
300 		}
301 
302 		/* Don't change the credentials when using old ptrace. */
303 		if (args.pfcred != NULL &&
304 		    (p->p_proc_flag & P_PR_PTRACE) != 0) {
305 			crfree(args.pfcred);
306 			args.pfcred = NULL;
307 			args.scrubenv = B_FALSE;
308 		}
309 	}
310 
311 	/*
312 	 * Specific exec handlers, or policies determined via
313 	 * /etc/system may override the historical default.
314 	 */
315 	args.stk_prot = PROT_ZFOD;
316 	args.dat_prot = PROT_ZFOD;
317 
318 	CPU_STATS_ADD_K(sys, sysexec, 1);
319 	DTRACE_PROC1(exec, char *, args.pathname);
320 
321 	ua.fname = fname;
322 	ua.argp = argp;
323 	ua.envp = envp;
324 
325 	/* If necessary, brand this process before we start the exec. */
326 	if (brandme)
327 		brand_setbrand(p);
328 
329 	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
330 	    exec_file, p->p_cred, brand_action)) != 0) {
		/*
		 * gexec() failed: undo the branding done just above and drop
		 * the vnode holds and the resolved pathname before bailing out.
		 */
331 		if (brandme)
332 			brand_clearbrand(p, B_FALSE);
333 		VN_RELE(vp);
334 		if (dir != NULL)
335 			VN_RELE(dir);
336 		pn_free(&resolvepn);
337 		goto fail;
338 	}
339 
340 	/*
341 	 * Free floating point registers (sun4u only)
342 	 */
343 	ASSERT(lwp != NULL);
344 	lwp_freeregs(lwp, 1);
345 
346 	/*
347 	 * Free thread and process context ops.
348 	 */
349 	if (curthread->t_ctx)
350 		freectx(curthread, 1);
351 	if (p->p_pctx)
352 		freepctx(p, 1);
353 
354 	/*
355 	 * Remember file name for accounting; clear any cached DTrace predicate.
356 	 */
357 	up->u_acflag &= ~AFORK;
358 	bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
359 	curthread->t_predcache = NULL;
360 
361 	/*
362 	 * Clear contract template state
363 	 */
364 	lwp_ctmpl_clear(lwp);
365 
366 	/*
367 	 * Save the directory in which we found the executable for expanding
368 	 * the %d token used in core file patterns.
369 	 */
370 	mutex_enter(&p->p_lock);
371 	tmpvp = p->p_execdir;
372 	p->p_execdir = dir;
373 	if (p->p_execdir != NULL)
374 		VN_HOLD(p->p_execdir);
375 	mutex_exit(&p->p_lock);
376 
	/* Drop the hold on the previous p_execdir outside of p_lock. */
377 	if (tmpvp != NULL)
378 		VN_RELE(tmpvp);
379 
380 	/*
381 	 * Reset stack state to the user stack, clear set of signals
382 	 * caught on the signal stack, and reset list of signals that
383 	 * restart system calls; the new program's environment should
384 	 * not be affected by detritus from the old program.  Any
385 	 * pending held signals remain held, so don't clear t_hold.
386 	 */
387 	mutex_enter(&p->p_lock);
388 	lwp->lwp_oldcontext = 0;
389 	lwp->lwp_ustack = 0;
390 	lwp->lwp_old_stk_ctl = 0;
391 	sigemptyset(&up->u_signodefer);
392 	sigemptyset(&up->u_sigonstack);
393 	sigemptyset(&up->u_sigresethand);
394 	lwp->lwp_sigaltstack.ss_sp = 0;
395 	lwp->lwp_sigaltstack.ss_size = 0;
396 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
397 
398 	/*
399 	 * Make saved resource limit == current resource limit.
400 	 */
401 	for (i = 0; i < RLIM_NLIMITS; i++) {
402 		/*CONSTCOND*/
403 		if (RLIM_SAVED(i)) {
404 			(void) rctl_rlimit_get(rctlproc_legacy[i], p,
405 			    &up->u_saved_rlimit[i]);
406 		}
407 	}
408 
409 	/*
410 	 * If the action was to catch the signal, then the action
411 	 * must be reset to SIG_DFL.
412 	 */
413 	sigdefault(p);
414 	p->p_flag &= ~(SNOWAIT|SJCTL);
415 	p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
416 	up->u_signal[SIGCLD - 1] = SIG_DFL;
417 
418 	/*
419 	 * Delete the dot4 sigqueues/signotifies.
420 	 */
421 	sigqfree(p);
422 
423 	mutex_exit(&p->p_lock);
424 
	/* Reset the profiling state under p_pflock. */
425 	mutex_enter(&p->p_pflock);
426 	p->p_prof.pr_base = NULL;
427 	p->p_prof.pr_size = 0;
428 	p->p_prof.pr_off = 0;
429 	p->p_prof.pr_scale = 0;
430 	p->p_prof.pr_samples = 0;
431 	mutex_exit(&p->p_pflock);
432 
433 	ASSERT(curthread->t_schedctl == NULL);
434 
435 #if defined(__sparc)
436 	if (p->p_utraps != NULL)
437 		utrap_free(p);
438 #endif	/* __sparc */
439 
440 	/*
441 	 * Close all close-on-exec files.
442 	 */
443 	close_exec(P_FINFO(p));
444 	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
445 
446 	/* Unbrand ourself if necessary. */
447 	if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE))
448 		brand_clearbrand(p, B_FALSE);
449 
450 	setregs(&args);
451 
452 	/* Mark this as an executable vnode */
453 	mutex_enter(&vp->v_lock);
454 	vp->v_flag |= VVMEXEC;
455 	mutex_exit(&vp->v_lock);
456 
	/* Done with the lookup results; args.pathname is not used past here. */
457 	VN_RELE(vp);
458 	if (dir != NULL)
459 		VN_RELE(dir);
460 	pn_free(&resolvepn);
461 
462 	/*
463 	 * Allocate a new lwp directory and lwpid hash table if necessary.
	 * The KM_SLEEP allocations are made here, before p_lock is taken
	 * below.  The current lwp's directory entry is reused when a
	 * directory already exists; otherwise a fresh entry is allocated.
464 	 */
465 	if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
466 		lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
467 		lwpdir->ld_next = lwpdir + 1;
468 		tidhash = kmem_zalloc(2 * sizeof (tidhash_t), KM_SLEEP);
469 		if (p->p_lwpdir != NULL)
470 			lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
471 		else
472 			lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
473 	}
474 
	/* Give the brand its exec hook. */
475 	if (PROC_IS_BRANDED(p))
476 		BROP(p)->b_exec();
477 
478 	mutex_enter(&p->p_lock);
479 	prbarrier(p);
480 
481 	/*
482 	 * Reset lwp id to the default value of 1.
483 	 * This is a single-threaded process now
484 	 * and lwp #1 is lwp_wait()able by default.
485 	 * The t_unpark flag should not be inherited.
486 	 */
487 	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
488 	curthread->t_tid = 1;
489 	kpreempt_disable();
490 	ASSERT(curthread->t_lpl != NULL);
491 	p->p_t1_lgrpid = curthread->t_lpl->lpl_lgrpid;
492 	kpreempt_enable();
493 	if (p->p_tr_lgrpid != LGRP_NONE && p->p_tr_lgrpid != p->p_t1_lgrpid) {
494 		lgrp_update_trthr_migrations(1);
495 	}
496 	curthread->t_unpark = 0;
497 	curthread->t_proc_flag |= TP_TWAIT;
498 	curthread->t_proc_flag &= ~TP_DAEMON;	/* daemons shouldn't exec */
499 	p->p_lwpdaemon = 0;			/* but oh well ... */
500 	p->p_lwpid = 1;
501 
502 	/*
503 	 * Install the newly-allocated lwp directory and lwpid hash table
504 	 * and insert the current thread into the new hash table.
	 * The old tables are remembered so they can be freed after p_lock
	 * has been dropped.
505 	 */
506 	if (lwpdir != NULL) {
507 		old_lwpdir = p->p_lwpdir;
508 		old_lwpdir_sz = p->p_lwpdir_sz;
509 		old_tidhash = p->p_tidhash;
510 		old_tidhash_sz = p->p_tidhash_sz;
511 		p->p_lwpdir = p->p_lwpfree = lwpdir;
512 		p->p_lwpdir_sz = 2;
513 		lep->le_thread = curthread;
514 		lep->le_lwpid = curthread->t_tid;
515 		lep->le_start = curthread->t_start;
516 		lwp_hash_in(p, lep, tidhash, 2, 0);
517 		p->p_tidhash = tidhash;
518 		p->p_tidhash_sz = 2;
519 	}
	/* Detach the list of retired tid hash tables; it is freed below. */
520 	ret_tidhash = p->p_ret_tidhash;
521 	p->p_ret_tidhash = NULL;
522 
523 	/*
524 	 * Restore the saved signal mask and
525 	 * inform /proc that the exec() has finished.
526 	 */
527 	curthread->t_hold = savedmask;
528 	prexecend();
529 	mutex_exit(&p->p_lock);
530 	if (old_lwpdir) {
531 		kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
532 		kmem_free(old_tidhash, old_tidhash_sz * sizeof (tidhash_t));
533 	}
534 	while (ret_tidhash != NULL) {
535 		ret_tidhash_t *next = ret_tidhash->rth_next;
536 		kmem_free(ret_tidhash->rth_tidhash,
537 		    ret_tidhash->rth_tidhash_sz * sizeof (tidhash_t));
538 		kmem_free(ret_tidhash, sizeof (*ret_tidhash));
539 		ret_tidhash = next;
540 	}
541 
542 	ASSERT(error == 0);
543 	DTRACE_PROC(exec__success);
544 	return (0);
545 
	/*
	 * Error exits.  Code jumping here has already released its vnode
	 * holds and pathname buffers; "fail" additionally fires the
	 * exec-failure probe and is used once gexec() has been attempted.
	 */
546 fail:
547 	DTRACE_PROC1(exec__failure, int, error);
548 out:		/* error return */
549 	mutex_enter(&p->p_lock);
550 	curthread->t_hold = savedmask;
551 	prexecend();
552 	mutex_exit(&p->p_lock);
553 	ASSERT(error != 0);
554 	return (error);
555 }
556 
557 
558 /*
559  * Perform generic exec duties and switchout to object-file specific
560  * handler.
561  */
562 int
563 gexec(
564 	struct vnode **vpp,
565 	struct execa *uap,
566 	struct uarg *args,
567 	struct intpdata *idatap,
568 	int level,
569 	long *execsz,
570 	caddr_t exec_file,
571 	struct cred *cred,
572 	int brand_action)
573 {
574 	struct vnode *vp, *execvp = NULL;
575 	proc_t *pp = ttoproc(curthread);
576 	struct execsw *eswp;
577 	int error = 0;
578 	int suidflags = 0;
579 	ssize_t resid;
580 	uid_t uid, gid;
581 	struct vattr vattr;
582 	char magbuf[MAGIC_BYTES];
583 	int setid;
584 	cred_t *oldcred, *newcred = NULL;
585 	int privflags = 0;
586 	int setidfl;
587 	priv_set_t fset;
588 	secflagset_t old_secflags;
589 
590 	secflags_copy(&old_secflags, &pp->p_secflags.psf_effective);
591 
592 	/*
593 	 * If the SNOCD or SUGID flag is set, turn it off and remember the
594 	 * previous setting so we can restore it if we encounter an error.
595 	 */
596 	if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
597 		mutex_enter(&pp->p_lock);
598 		suidflags = pp->p_flag & PSUIDFLAGS;
599 		pp->p_flag &= ~PSUIDFLAGS;
600 		mutex_exit(&pp->p_lock);
601 	}
602 
603 	if ((error = execpermissions(*vpp, &vattr, args)) != 0)
604 		goto bad_noclose;
605 
606 	/* need to open vnode for stateful file systems */
607 	if ((error = VOP_OPEN(vpp, FREAD, CRED(), NULL)) != 0)
608 		goto bad_noclose;
609 	vp = *vpp;
610 
611 	/*
612 	 * Note: to support binary compatibility with SunOS a.out
613 	 * executables, we read in the first four bytes, as the
614 	 * magic number is in bytes 2-3.
615 	 */
616 	if (error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
617 	    (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid))
618 		goto bad;
619 	if (resid != 0)
620 		goto bad;
621 
622 	if ((eswp = findexec_by_hdr(magbuf)) == NULL)
623 		goto bad;
624 
625 	if (level == 0 &&
626 	    (privflags = execsetid(vp, &vattr, &uid, &gid, &fset,
627 	    args->pfcred == NULL ? cred : args->pfcred, args->pathname)) != 0) {
628 
629 		/* Pfcred is a credential with a ref count of 1 */
630 
631 		if (args->pfcred != NULL) {
632 			privflags |= PRIV_INCREASE|PRIV_RESET;
633 			newcred = cred = args->pfcred;
634 		} else {
635 			newcred = cred = crdup(cred);
636 		}
637 
638 		/* If we can, drop the PA bit */
639 		if ((privflags & PRIV_RESET) != 0)
640 			priv_adjust_PA(cred);
641 
642 		if (privflags & PRIV_SETID) {
643 			cred->cr_uid = uid;
644 			cred->cr_gid = gid;
645 			cred->cr_suid = uid;
646 			cred->cr_sgid = gid;
647 		}
648 
649 		if (privflags & MAC_FLAGS) {
650 			if (!(CR_FLAGS(cred) & NET_MAC_AWARE_INHERIT))
651 				CR_FLAGS(cred) &= ~NET_MAC_AWARE;
652 			CR_FLAGS(cred) &= ~NET_MAC_AWARE_INHERIT;
653 		}
654 
655 		/*
656 		 * Implement the privilege updates:
657 		 *
658 		 * Restrict with L:
659 		 *
660 		 *	I' = I & L
661 		 *
662 		 *	E' = P' = (I' + F) & A
663 		 *
664 		 * But if running under ptrace, we cap I and F with P.
665 		 */
666 		if ((privflags & (PRIV_RESET|PRIV_FORCED)) != 0) {
667 			if ((privflags & PRIV_INCREASE) != 0 &&
668 			    (pp->p_proc_flag & P_PR_PTRACE) != 0) {
669 				priv_intersect(&CR_OPPRIV(cred),
670 				    &CR_IPRIV(cred));
671 				priv_intersect(&CR_OPPRIV(cred), &fset);
672 			}
673 			priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
674 			CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
675 			if (privflags & PRIV_FORCED) {
676 				priv_set_PA(cred);
677 				priv_union(&fset, &CR_EPRIV(cred));
678 				priv_union(&fset, &CR_PPRIV(cred));
679 			}
680 			priv_adjust_PA(cred);
681 		}
682 	} else if (level == 0 && args->pfcred != NULL) {
683 		newcred = cred = args->pfcred;
684 		privflags |= PRIV_INCREASE;
685 		/* pfcred is not forced to adhere to these settings */
686 		priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
687 		CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
688 		priv_adjust_PA(cred);
689 	}
690 
691 	/* The new image gets the inheritable secflags as its secflags */
692 	secflags_promote(pp);
693 
694 	/* SunOS 4.x buy-back */
695 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
696 	    (vattr.va_mode & (VSUID|VSGID))) {
697 		char path[MAXNAMELEN];
698 		refstr_t *mntpt = NULL;
699 		int ret = -1;
700 
701 		bzero(path, sizeof (path));
702 		zone_hold(pp->p_zone);
703 
704 		ret = vnodetopath(pp->p_zone->zone_rootvp, vp, path,
705 		    sizeof (path), cred);
706 
707 		/* fallback to mountpoint if a path can't be found */
708 		if ((ret != 0) || (ret == 0 && path[0] == '\0'))
709 			mntpt = vfs_getmntpoint(vp->v_vfsp);
710 
711 		if (mntpt == NULL)
712 			zcmn_err(pp->p_zone->zone_id, CE_NOTE,
713 			    "!uid %d: setuid execution not allowed, "
714 			    "file=%s", cred->cr_uid, path);
715 		else
716 			zcmn_err(pp->p_zone->zone_id, CE_NOTE,
717 			    "!uid %d: setuid execution not allowed, "
718 			    "fs=%s, file=%s", cred->cr_uid,
719 			    ZONE_PATH_TRANSLATE(refstr_value(mntpt),
720 			    pp->p_zone), exec_file);
721 
722 		if (!INGLOBALZONE(pp)) {
723 			/* zone_rootpath always has trailing / */
724 			if (mntpt == NULL)
725 				cmn_err(CE_NOTE, "!zone: %s, uid: %d "
726 				    "setuid execution not allowed, file=%s%s",
727 				    pp->p_zone->zone_name, cred->cr_uid,
728 				    pp->p_zone->zone_rootpath, path + 1);
729 			else
730 				cmn_err(CE_NOTE, "!zone: %s, uid: %d "
731 				    "setuid execution not allowed, fs=%s, "
732 				    "file=%s", pp->p_zone->zone_name,
733 				    cred->cr_uid, refstr_value(mntpt),
734 				    exec_file);
735 		}
736 
737 		if (mntpt != NULL)
738 			refstr_rele(mntpt);
739 
740 		zone_rele(pp->p_zone);
741 	}
742 
743 	/*
744 	 * execsetid() told us whether or not we had to change the
745 	 * credentials of the process.  In privflags, it told us
746 	 * whether we gained any privileges or executed a set-uid executable.
747 	 */
748 	setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE|PRIV_FORCED));
749 
750 	/*
751 	 * Use /etc/system variable to determine if the stack
752 	 * should be marked as executable by default.
753 	 */
754 	if ((noexec_user_stack != 0) ||
755 	    secflag_enabled(pp, PROC_SEC_NOEXECSTACK))
756 		args->stk_prot &= ~PROT_EXEC;
757 
758 	args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */
759 	args->ex_vp = vp;
760 
761 	/*
762 	 * Traditionally, the setid flags told the sub processes whether
763 	 * the file just executed was set-uid or set-gid; this caused
764 	 * some confusion as the 'setid' flag did not match the SUGID
765 	 * process flag which is only set when the uids/gids do not match.
766 	 * A script set-gid/set-uid to the real uid/gid would start with
767 	 * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
768 	 * Now we flag those cases where the calling process cannot
769 	 * be trusted to influence the newly exec'ed process, either
770 	 * because it runs with more privileges or when the uids/gids
771 	 * do in fact not match.
772 	 * This also makes the runtime linker agree with the on exec
773 	 * values of SNOCD and SUGID.
774 	 */
775 	setidfl = 0;
776 	if (cred->cr_uid != cred->cr_ruid || (cred->cr_rgid != cred->cr_gid &&
777 	    !supgroupmember(cred->cr_gid, cred))) {
778 		setidfl |= EXECSETID_UGIDS;
779 	}
780 	if (setid & PRIV_SETUGID)
781 		setidfl |= EXECSETID_SETID;
782 	if (setid & PRIV_FORCED)
783 		setidfl |= EXECSETID_PRIVS;
784 
785 	execvp = pp->p_exec;
786 	if (execvp)
787 		VN_HOLD(execvp);
788 
789 	error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
790 	    setidfl, exec_file, cred, brand_action);
791 	rw_exit(eswp->exec_lock);
792 	if (error != 0) {
793 		if (execvp)
794 			VN_RELE(execvp);
795 		/*
796 		 * If this process's p_exec has been set to the vp of
797 		 * the executable by exec_func, we will return without
798 		 * calling VOP_CLOSE because proc_exit will close it
799 		 * on exit.
800 		 */
801 		if (pp->p_exec == vp)
802 			goto bad_noclose;
803 		else
804 			goto bad;
805 	}
806 
807 	if (level == 0) {
808 		uid_t oruid;
809 
810 		if (execvp != NULL) {
811 			/*
812 			 * Close the previous executable only if we are
813 			 * at level 0.
814 			 */
815 			(void) VOP_CLOSE(execvp, FREAD, 1, (offset_t)0,
816 			    cred, NULL);
817 		}
818 
819 		mutex_enter(&pp->p_crlock);
820 
821 		oruid = pp->p_cred->cr_ruid;
822 
823 		if (newcred != NULL) {
824 			/*
825 			 * Free the old credentials, and set the new ones.
826 			 * Do this for both the process and the (single) thread.
827 			 */
828 			crfree(pp->p_cred);
829 			pp->p_cred = cred;	/* cred already held for proc */
830 			crhold(cred);		/* hold new cred for thread */
831 			/*
832 			 * DTrace accesses t_cred in probe context.  t_cred
833 			 * must always be either NULL, or point to a valid,
834 			 * allocated cred structure.
835 			 */
836 			oldcred = curthread->t_cred;
837 			curthread->t_cred = cred;
838 			crfree(oldcred);
839 
840 			if (priv_basic_test >= 0 &&
841 			    !PRIV_ISASSERT(&CR_IPRIV(newcred),
842 			    priv_basic_test)) {
843 				pid_t pid = pp->p_pid;
844 				char *fn = PTOU(pp)->u_comm;
845 
846 				cmn_err(CE_WARN, "%s[%d]: exec: basic_test "
847 				    "privilege removed from E/I", fn, pid);
848 			}
849 		}
850 		/*
851 		 * On emerging from a successful exec(), the saved
852 		 * uid and gid equal the effective uid and gid.
853 		 */
854 		cred->cr_suid = cred->cr_uid;
855 		cred->cr_sgid = cred->cr_gid;
856 
857 		/*
858 		 * If the real and effective ids do not match, this
859 		 * is a setuid process that should not dump core.
860 		 * The group comparison is tricky; we prevent the code
861 		 * from flagging SNOCD when executing with an effective gid
862 		 * which is a supplementary group.
863 		 */
864 		if (cred->cr_ruid != cred->cr_uid ||
865 		    (cred->cr_rgid != cred->cr_gid &&
866 		    !supgroupmember(cred->cr_gid, cred)) ||
867 		    (privflags & PRIV_INCREASE) != 0)
868 			suidflags = PSUIDFLAGS;
869 		else
870 			suidflags = 0;
871 
872 		mutex_exit(&pp->p_crlock);
873 		if (newcred != NULL && oruid != newcred->cr_ruid) {
874 			/* Note that the process remains in the same zone. */
875 			mutex_enter(&pidlock);
876 			upcount_dec(oruid, crgetzoneid(newcred));
877 			upcount_inc(newcred->cr_ruid, crgetzoneid(newcred));
878 			mutex_exit(&pidlock);
879 		}
880 		if (suidflags) {
881 			mutex_enter(&pp->p_lock);
882 			pp->p_flag |= suidflags;
883 			mutex_exit(&pp->p_lock);
884 		}
885 		if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
886 			/*
887 			 * If process is traced via /proc, arrange to
888 			 * invalidate the associated /proc vnode.
889 			 */
890 			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
891 				args->traceinval = 1;
892 		}
893 		if (pp->p_proc_flag & P_PR_PTRACE)
894 			psignal(pp, SIGTRAP);
895 		if (args->traceinval)
896 			prinvalidate(&pp->p_user);
897 	}
898 	if (execvp)
899 		VN_RELE(execvp);
900 	return (0);
901 
902 bad:
903 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, cred, NULL);
904 
905 bad_noclose:
906 	if (newcred != NULL)
907 		crfree(newcred);
908 	if (error == 0)
909 		error = ENOEXEC;
910 
911 	mutex_enter(&pp->p_lock);
912 	if (suidflags) {
913 		pp->p_flag |= suidflags;
914 	}
915 	/*
916 	 * Restore the effective secflags, to maintain the invariant they
917 	 * never change for a given process
918 	 */
919 	secflags_copy(&pp->p_secflags.psf_effective, &old_secflags);
920 	mutex_exit(&pp->p_lock);
921 
922 	return (error);
923 }
924 
925 extern char *execswnames[];
926 
927 struct execsw *
928 allocate_execsw(char *name, char *magic, size_t magic_size)
929 {
930 	int i, j;
931 	char *ename;
932 	char *magicp;
933 
934 	mutex_enter(&execsw_lock);
935 	for (i = 0; i < nexectype; i++) {
936 		if (execswnames[i] == NULL) {
937 			ename = kmem_alloc(strlen(name) + 1, KM_SLEEP);
938 			(void) strcpy(ename, name);
939 			execswnames[i] = ename;
940 			/*
941 			 * Set the magic number last so that we
942 			 * don't need to hold the execsw_lock in
943 			 * findexectype().
944 			 */
945 			magicp = kmem_alloc(magic_size, KM_SLEEP);
946 			for (j = 0; j < magic_size; j++)
947 				magicp[j] = magic[j];
948 			execsw[i].exec_magic = magicp;
949 			mutex_exit(&execsw_lock);
950 			return (&execsw[i]);
951 		}
952 	}
953 	mutex_exit(&execsw_lock);
954 	return (NULL);
955 }
956 
957 /*
958  * Find the exec switch table entry with the corresponding magic string.
959  */
960 struct execsw *
961 findexecsw(char *magic)
962 {
963 	struct execsw *eswp;
964 
965 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
966 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
967 		if (magic && eswp->exec_maglen != 0 &&
968 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0)
969 			return (eswp);
970 	}
971 	return (NULL);
972 }
973 
974 /*
975  * Find the execsw[] index for the given exec header string by looking for the
976  * magic string at a specified offset and length for each kind of executable
977  * file format until one matches.  If no execsw[] entry is found, try to
978  * autoload a module for this magic string.
979  */
980 struct execsw *
981 findexec_by_hdr(char *header)
982 {
983 	struct execsw *eswp;
984 
985 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
986 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
987 		if (header && eswp->exec_maglen != 0 &&
988 		    bcmp(&header[eswp->exec_magoff], eswp->exec_magic,
989 		    eswp->exec_maglen) == 0) {
990 			if (hold_execsw(eswp) != 0)
991 				return (NULL);
992 			return (eswp);
993 		}
994 	}
995 	return (NULL);	/* couldn't find the type */
996 }
997 
998 /*
999  * Find the execsw[] index for the given magic string.  If no execsw[] entry
1000  * is found, try to autoload a module for this magic string.
1001  */
1002 struct execsw *
1003 findexec_by_magic(char *magic)
1004 {
1005 	struct execsw *eswp;
1006 
1007 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
1008 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
1009 		if (magic && eswp->exec_maglen != 0 &&
1010 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
1011 			if (hold_execsw(eswp) != 0)
1012 				return (NULL);
1013 			return (eswp);
1014 		}
1015 	}
1016 	return (NULL);	/* couldn't find the type */
1017 }
1018 
1019 static int
1020 hold_execsw(struct execsw *eswp)
1021 {
1022 	char *name;
1023 
1024 	rw_enter(eswp->exec_lock, RW_READER);
1025 	while (!LOADED_EXEC(eswp)) {
1026 		rw_exit(eswp->exec_lock);
1027 		name = execswnames[eswp-execsw];
1028 		ASSERT(name);
1029 		if (modload("exec", name) == -1)
1030 			return (-1);
1031 		rw_enter(eswp->exec_lock, RW_READER);
1032 	}
1033 	return (0);
1034 }
1035 
1036 static int
1037 execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp,
1038     priv_set_t *fset, cred_t *cr, const char *pathname)
1039 {
1040 	proc_t *pp = ttoproc(curthread);
1041 	uid_t uid, gid;
1042 	int privflags = 0;
1043 
1044 	/*
1045 	 * Remember credentials.
1046 	 */
1047 	uid = cr->cr_uid;
1048 	gid = cr->cr_gid;
1049 
1050 	/* Will try to reset the PRIV_AWARE bit later. */
1051 	if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
1052 		privflags |= PRIV_RESET;
1053 
1054 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
1055 		/*
1056 		 * If it's a set-uid root program we perform the
1057 		 * forced privilege look-aside. This has three possible
1058 		 * outcomes:
1059 		 *	no look aside information -> treat as before
1060 		 *	look aside in Limit set -> apply forced privs
1061 		 *	look aside not in Limit set -> ignore set-uid root
1062 		 *
1063 		 * Ordinary set-uid root execution only allowed if the limit
1064 		 * set holds all unsafe privileges.
1065 		 */
1066 		if (vattrp->va_mode & VSUID) {
1067 			if (vattrp->va_uid == 0) {
1068 				int res = get_forced_privs(cr, pathname, fset);
1069 
1070 				switch (res) {
1071 				case -1:
1072 					if (priv_issubset(&priv_unsafe,
1073 					    &CR_LPRIV(cr))) {
1074 						uid = vattrp->va_uid;
1075 						privflags |= PRIV_SETUGID;
1076 					}
1077 					break;
1078 				case 0:
1079 					privflags |= PRIV_FORCED|PRIV_INCREASE;
1080 					break;
1081 				default:
1082 					break;
1083 				}
1084 			} else {
1085 				uid = vattrp->va_uid;
1086 				privflags |= PRIV_SETUGID;
1087 			}
1088 		}
1089 		if (vattrp->va_mode & VSGID) {
1090 			gid = vattrp->va_gid;
1091 			privflags |= PRIV_SETUGID;
1092 		}
1093 	}
1094 
1095 	/*
1096 	 * Do we need to change our credential anyway?
1097 	 * This is the case when E != I or P != I, as
1098 	 * we need to do the assignments (with F empty and A full)
1099 	 * Or when I is not a subset of L; in that case we need to
1100 	 * enforce L.
1101 	 *
1102 	 *		I' = L & I
1103 	 *
1104 	 *		E' = P' = (I' + F) & A
1105 	 * or
1106 	 *		E' = P' = I'
1107 	 */
1108 	if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
1109 	    !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
1110 	    !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
1111 		privflags |= PRIV_RESET;
1112 
1113 	/* Child has more privileges than parent */
1114 	if (!priv_issubset(&CR_IPRIV(cr), &CR_PPRIV(cr)))
1115 		privflags |= PRIV_INCREASE;
1116 
1117 	/* If MAC-aware flag(s) are on, need to update cred to remove. */
1118 	if ((CR_FLAGS(cr) & NET_MAC_AWARE) ||
1119 	    (CR_FLAGS(cr) & NET_MAC_AWARE_INHERIT))
1120 		privflags |= MAC_FLAGS;
1121 	/*
1122 	 * Set setuid/setgid protections if no ptrace() compatibility.
1123 	 * For privileged processes, honor setuid/setgid even in
1124 	 * the presence of ptrace() compatibility.
1125 	 */
1126 	if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
1127 	    PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
1128 	    (cr->cr_uid != uid ||
1129 	    cr->cr_gid != gid ||
1130 	    cr->cr_suid != uid ||
1131 	    cr->cr_sgid != gid)) {
1132 		*uidp = uid;
1133 		*gidp = gid;
1134 		privflags |= PRIV_SETID;
1135 	}
1136 	return (privflags);
1137 }
1138 
1139 int
1140 execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
1141 {
1142 	int error;
1143 	proc_t *p = ttoproc(curthread);
1144 
1145 	vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
1146 	if (error = VOP_GETATTR(vp, vattrp, ATTR_EXEC, p->p_cred, NULL))
1147 		return (error);
1148 	/*
1149 	 * Check the access mode.
1150 	 * If VPROC, ask /proc if the file is an object file.
1151 	 */
1152 	if ((error = VOP_ACCESS(vp, VEXEC, 0, p->p_cred, NULL)) != 0 ||
1153 	    !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
1154 	    (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
1155 	    (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
1156 		if (error == 0)
1157 			error = EACCES;
1158 		return (error);
1159 	}
1160 
1161 	if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
1162 	    (error = VOP_ACCESS(vp, VREAD, 0, p->p_cred, NULL))) {
1163 		/*
1164 		 * If process is under ptrace(2) compatibility,
1165 		 * fail the exec(2).
1166 		 */
1167 		if (p->p_proc_flag & P_PR_PTRACE)
1168 			goto bad;
1169 		/*
1170 		 * Process is traced via /proc.
1171 		 * Arrange to invalidate the /proc vnode.
1172 		 */
1173 		args->traceinval = 1;
1174 	}
1175 	return (0);
1176 bad:
1177 	if (error == 0)
1178 		error = ENOEXEC;
1179 	return (error);
1180 }
1181 
1182 /*
1183  * Map a section of an executable file into the user's
1184  * address space.
1185  */
/*
 * execmap() arguments, as used below:
 *	vp	vnode of the executable; source for VOP_MAP() / vn_rdwr()
 *	addr	user address of the section; rounded down to a page
 *		boundary for the mapping itself
 *	len	number of file-backed bytes (0 means there are none)
 *	zfodlen	number of zero-filled bytes following the file-backed part
 *	offset	file offset of the section data
 *	prot	protections for the resulting mappings
 *	page	non-zero: map the file with VOP_MAP(); zero: create an
 *		anonymous segment and read the data in with vn_rdwr()
 *	szc	page size code considered for the zero-fill segment
 *
 * Returns 0 on success, otherwise an errno value (ENOMEM, EFAULT, or
 * whatever the mapping / read routine reported).
 */
1186 int
1187 execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
1188     off_t offset, int prot, int page, uint_t szc)
1189 {
1190 	int error = 0;
1191 	off_t oldoffset;
1192 	caddr_t zfodbase, oldaddr;
1193 	size_t end, oldlen;
1194 	size_t zfoddiff;
1195 	label_t ljb;
1196 	proc_t *p = ttoproc(curthread);
1197 
	/*
	 * Work with a page-aligned addr from here on; oldaddr keeps the
	 * caller's exact address for the vn_rdwr() path.
	 */
1198 	oldaddr = addr;
1199 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1200 	if (len) {
1201 		oldlen = len;
		/* Grow len by the bytes between the aligned and exact address. */
1202 		len += ((size_t)oldaddr - (size_t)addr);
1203 		oldoffset = offset;
1204 		offset = (off_t)((uintptr_t)offset & PAGEMASK);
1205 		if (page) {
1206 			spgcnt_t  prefltmem, availm, npages;
1207 			int preread;
1208 			uint_t mflag = MAP_PRIVATE | MAP_FIXED;
1209 
			/* Executable but not writable means text; else data. */
1210 			if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
1211 				mflag |= MAP_TEXT;
1212 			} else {
1213 				mflag |= MAP_INITDATA;
1214 			}
1215 
1216 			if (valid_usr_range(addr, len, prot, p->p_as,
1217 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1218 				error = ENOMEM;
1219 				goto bad;
1220 			}
1221 			if (error = VOP_MAP(vp, (offset_t)offset,
1222 			    p->p_as, &addr, len, prot, PROT_ALL,
1223 			    mflag, CRED(), NULL))
1224 				goto bad;
1225 
1226 			/*
1227 			 * If the segment can fit, then we prefault
1228 			 * the entire segment in.  This is based on the
1229 			 * model that says the best working set of a
1230 			 * small program is all of its pages.
1231 			 */
1232 			npages = (spgcnt_t)btopr(len);
1233 			prefltmem = freemem - desfree;
1234 			preread =
1235 			    (npages < prefltmem && len < PGTHRESH) ? 1 : 0;
1236 
1237 			/*
1238 			 * If we aren't prefaulting the segment,
1239 			 * increment "deficit", if necessary to ensure
1240 			 * that pages will become available when this
1241 			 * process starts executing.
1242 			 */
1243 			availm = freemem - lotsfree;
1244 			if (preread == 0 && npages > availm &&
1245 			    deficit < lotsfree) {
1246 				deficit += MIN((pgcnt_t)(npages - availm),
1247 				    lotsfree - deficit);
1248 			}
1249 
1250 			if (preread) {
1251 				TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
1252 				    "execmap preread:freemem %d size %lu",
1253 				    freemem, len);
				/* Best effort: the as_fault() result is ignored. */
1254 				(void) as_fault(p->p_as->a_hat, p->p_as,
1255 				    (caddr_t)addr, len, F_INVAL, S_READ);
1256 			}
1257 		} else {
1258 			if (valid_usr_range(addr, len, prot, p->p_as,
1259 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1260 				error = ENOMEM;
1261 				goto bad;
1262 			}
1263 
1264 			if (error = as_map(p->p_as, addr, len,
1265 			    segvn_create, zfod_argsp))
1266 				goto bad;
1267 			/*
1268 			 * Read in the segment in one big chunk.
1269 			 */
1270 			if (error = vn_rdwr(UIO_READ, vp, (caddr_t)oldaddr,
1271 			    oldlen, (offset_t)oldoffset, UIO_USERSPACE, 0,
1272 			    (rlim64_t)0, CRED(), (ssize_t *)0))
1273 				goto bad;
1274 			/*
1275 			 * Now set protections.
1276 			 */
1277 			if (prot != PROT_ZFOD) {
1278 				(void) as_setprot(p->p_as, (caddr_t)addr,
1279 				    len, prot);
1280 			}
1281 		}
1282 	}
1283 
	/*
	 * Zero-fill part: first clear the remainder of the last file-backed
	 * page (zfoddiff bytes), then map whatever is left as a fresh
	 * zero-fill segment starting at zfodbase.
	 */
1284 	if (zfodlen) {
1285 		struct as *as = curproc->p_as;
1286 		struct seg *seg;
1287 		uint_t zprot = 0;
1288 
1289 		end = (size_t)addr + len;
1290 		zfodbase = (caddr_t)roundup(end, PAGESIZE);
1291 		zfoddiff = (uintptr_t)zfodbase - end;
1292 		if (zfoddiff) {
1293 			/*
1294 			 * Before we go to zero the remaining space on the last
1295 			 * page, make sure we have write permission.
1296 			 *
1297 			 * Normal illumos binaries don't even hit the case
1298 			 * where we have to change permission on the last page
1299 			 * since their protection is typically either
1300 			 *    PROT_USER | PROT_WRITE | PROT_READ
1301 			 * or
1302 			 *    PROT_ZFOD (same as PROT_ALL).
1303 			 *
1304 			 * We need to be careful how we zero-fill the last page
1305 			 * if the segment protection does not include
1306 			 * PROT_WRITE. Using as_setprot() can cause the VM
1307 			 * segment code to call segvn_vpage(), which must
1308 			 * allocate a page struct for each page in the segment.
1309 			 * If we have a very large segment, this may fail, so
1310 			 * we have to check for that, even though we ignore
1311 			 * other return values from as_setprot.
1312 			 */
1313 
1314 			AS_LOCK_ENTER(as, RW_READER);
1315 			seg = as_segat(curproc->p_as, (caddr_t)end);
1316 			if (seg != NULL)
1317 				SEGOP_GETPROT(seg, (caddr_t)end, zfoddiff - 1,
1318 				    &zprot);
1319 			AS_LOCK_EXIT(as);
1320 
1321 			if (seg != NULL && (zprot & PROT_WRITE) == 0) {
1322 				if (as_setprot(as, (caddr_t)end, zfoddiff - 1,
1323 				    zprot | PROT_WRITE) == ENOMEM) {
1324 					error = ENOMEM;
1325 					goto bad;
1326 				}
1327 			}
1328 
			/*
			 * NOTE(review): on_fault() presumably returns non-zero
			 * when the uzero() below faults; that branch restores
			 * the original protection and fails with EFAULT.
			 */
1329 			if (on_fault(&ljb)) {
1330 				no_fault();
1331 				if (seg != NULL && (zprot & PROT_WRITE) == 0)
1332 					(void) as_setprot(as, (caddr_t)end,
1333 					    zfoddiff - 1, zprot);
1334 				error = EFAULT;
1335 				goto bad;
1336 			}
1337 			uzero((void *)end, zfoddiff);
1338 			no_fault();
			/* Drop the temporary PROT_WRITE again. */
1339 			if (seg != NULL && (zprot & PROT_WRITE) == 0)
1340 				(void) as_setprot(as, (caddr_t)end,
1341 				    zfoddiff - 1, zprot);
1342 		}
1343 		if (zfodlen > zfoddiff) {
1344 			struct segvn_crargs crargs =
1345 			    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
1346 
			/* Only the bytes past the partial page need a segment. */
1347 			zfodlen -= zfoddiff;
1348 			if (valid_usr_range(zfodbase, zfodlen, prot, p->p_as,
1349 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1350 				error = ENOMEM;
1351 				goto bad;
1352 			}
1353 			if (szc > 0) {
1354 				/*
1355 				 * ASSERT alignment because the mapelfexec()
1356 				 * caller for the szc > 0 case extended zfod
1357 				 * so its end is pgsz aligned.
1358 				 */
1359 				size_t pgsz = page_get_pagesize(szc);
1360 				ASSERT(IS_P2ALIGNED(zfodbase + zfodlen, pgsz));
1361 
1362 				if (IS_P2ALIGNED(zfodbase, pgsz)) {
1363 					crargs.szc = szc;
1364 				} else {
1365 					crargs.szc = AS_MAP_HEAP;
1366 				}
1367 			} else {
1368 				crargs.szc = AS_MAP_NO_LPOOB;
1369 			}
1370 			if (error = as_map(p->p_as, (caddr_t)zfodbase,
1371 			    zfodlen, segvn_create, &crargs))
1372 				goto bad;
1373 			if (prot != PROT_ZFOD) {
1374 				(void) as_setprot(p->p_as, (caddr_t)zfodbase,
1375 				    zfodlen, prot);
1376 			}
1377 		}
1378 	}
1379 	return (0);
1380 bad:
1381 	return (error);
1382 }
1383 
1384 void
1385 setexecenv(struct execenv *ep)
1386 {
1387 	proc_t *p = ttoproc(curthread);
1388 	klwp_t *lwp = ttolwp(curthread);
1389 	struct vnode *vp;
1390 
1391 	p->p_bssbase = ep->ex_bssbase;
1392 	p->p_brkbase = ep->ex_brkbase;
1393 	p->p_brksize = ep->ex_brksize;
1394 	if (p->p_exec)
1395 		VN_RELE(p->p_exec);	/* out with the old */
1396 	vp = p->p_exec = ep->ex_vp;
1397 	if (vp != NULL)
1398 		VN_HOLD(vp);		/* in with the new */
1399 
1400 	lwp->lwp_sigaltstack.ss_sp = 0;
1401 	lwp->lwp_sigaltstack.ss_size = 0;
1402 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
1403 }
1404 
1405 int
1406 execopen(struct vnode **vpp, int *fdp)
1407 {
1408 	struct vnode *vp = *vpp;
1409 	file_t *fp;
1410 	int error = 0;
1411 	int filemode = FREAD;
1412 
1413 	VN_HOLD(vp);		/* open reference */
1414 	if (error = falloc(NULL, filemode, &fp, fdp)) {
1415 		VN_RELE(vp);
1416 		*fdp = -1;	/* just in case falloc changed value */
1417 		return (error);
1418 	}
1419 	if (error = VOP_OPEN(&vp, filemode, CRED(), NULL)) {
1420 		VN_RELE(vp);
1421 		setf(*fdp, NULL);
1422 		unfalloc(fp);
1423 		*fdp = -1;
1424 		return (error);
1425 	}
1426 	*vpp = vp;		/* vnode should not have changed */
1427 	fp->f_vnode = vp;
1428 	mutex_exit(&fp->f_tlock);
1429 	setf(*fdp, fp);
1430 	return (0);
1431 }
1432 
1433 int
1434 execclose(int fd)
1435 {
1436 	return (closeandsetf(fd, NULL));
1437 }
1438 
1439 
1440 /*
1441  * noexec stub function.
1442  */
1443 /*ARGSUSED*/
1444 int
1445 noexec(
1446     struct vnode *vp,
1447     struct execa *uap,
1448     struct uarg *args,
1449     struct intpdata *idatap,
1450     int level,
1451     long *execsz,
1452     int setid,
1453     caddr_t exec_file,
1454     struct cred *cred)
1455 {
1456 	cmn_err(CE_WARN, "missing exec capability for %s", uap->fname);
1457 	return (ENOEXEC);
1458 }
1459 
1460 /*
1461  * Support routines for building a user stack.
1462  *
1463  * execve(path, argv, envp) must construct a new stack with the specified
1464  * arguments and environment variables (see exec_args() for a description
1465  * of the user stack layout).  To do this, we copy the arguments and
1466  * environment variables from the old user address space into the kernel,
1467  * free the old as, create the new as, and copy our buffered information
1468  * to the new stack.  Our kernel buffer has the following structure:
1469  *
1470  *	+-----------------------+ <--- stk_base + stk_size
1471  *	| string offsets	|
1472  *	+-----------------------+ <--- stk_offp
1473  *	|			|
1474  *	| STK_AVAIL() space	|
1475  *	|			|
1476  *	+-----------------------+ <--- stk_strp
1477  *	| strings		|
1478  *	+-----------------------+ <--- stk_base
1479  *
1480  * When we add a string, we store the string's contents (including the null
1481  * terminator) at stk_strp, and we store the offset of the string relative to
1482  * stk_base at --stk_offp.  As strings are added, stk_strp increases and
1483  * stk_offp decreases.  The amount of space remaining, STK_AVAIL(), is just
1484  * the difference between these pointers.  If we run out of space, we return
1485  * an error and exec_args() starts all over again with a buffer twice as large.
1486  * When we're all done, the kernel buffer looks like this:
1487  *
1488  *	+-----------------------+ <--- stk_base + stk_size
1489  *	| argv[0] offset	|
1490  *	+-----------------------+
1491  *	| ...			|
1492  *	+-----------------------+
1493  *	| argv[argc-1] offset	|
1494  *	+-----------------------+
1495  *	| envp[0] offset	|
1496  *	+-----------------------+
1497  *	| ...			|
1498  *	+-----------------------+
1499  *	| envp[envc-1] offset	|
1500  *	+-----------------------+
1501  *	| AT_SUN_PLATFORM offset|
1502  *	+-----------------------+
1503  *	| AT_SUN_EXECNAME offset|
1504  *	+-----------------------+ <--- stk_offp
1505  *	|			|
1506  *	| STK_AVAIL() space	|
1507  *	|			|
1508  *	+-----------------------+ <--- stk_strp
1509  *	| AT_SUN_EXECNAME string|
1510  *	+-----------------------+
1511  *	| AT_SUN_PLATFORM string|
1512  *	+-----------------------+
1513  *	| envp[envc-1] string	|
1514  *	+-----------------------+
1515  *	| ...			|
1516  *	+-----------------------+
1517  *	| envp[0] string	|
1518  *	+-----------------------+
1519  *	| argv[argc-1] string	|
1520  *	+-----------------------+
1521  *	| ...			|
1522  *	+-----------------------+
1523  *	| argv[0] string	|
1524  *	+-----------------------+ <--- stk_base
1525  */
1526 
/*
 * Number of bytes between the end of the stored strings (stk_strp) and the
 * lowest offset slot handed out so far (stk_offp).
 */
1527 #define	STK_AVAIL(args)		((char *)(args)->stk_offp - (args)->stk_strp)
1528 
1529 /*
1530  * Add a string to the stack.
1531  */
1532 static int
1533 stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
1534 {
1535 	int error;
1536 	size_t len;
1537 
1538 	if (STK_AVAIL(args) < sizeof (int))
1539 		return (E2BIG);
1540 	*--args->stk_offp = args->stk_strp - args->stk_base;
1541 
1542 	if (segflg == UIO_USERSPACE) {
1543 		error = copyinstr(sp, args->stk_strp, STK_AVAIL(args), &len);
1544 		if (error != 0)
1545 			return (error);
1546 	} else {
1547 		len = strlen(sp) + 1;
1548 		if (len > STK_AVAIL(args))
1549 			return (E2BIG);
1550 		bcopy(sp, args->stk_strp, len);
1551 	}
1552 
1553 	args->stk_strp += len;
1554 
1555 	return (0);
1556 }
1557 
1558 static int
1559 stk_getptr(uarg_t *args, char *src, char **dst)
1560 {
1561 	int error;
1562 
1563 	if (args->from_model == DATAMODEL_NATIVE) {
1564 		ulong_t ptr;
1565 		error = fulword(src, &ptr);
1566 		*dst = (caddr_t)ptr;
1567 	} else {
1568 		uint32_t ptr;
1569 		error = fuword32(src, &ptr);
1570 		*dst = (caddr_t)(uintptr_t)ptr;
1571 	}
1572 	return (error);
1573 }
1574 
1575 static int
1576 stk_putptr(uarg_t *args, char *addr, char *value)
1577 {
1578 	if (args->to_model == DATAMODEL_NATIVE)
1579 		return (sulword(addr, (ulong_t)value));
1580 	else
1581 		return (suword32(addr, (uint32_t)(uintptr_t)value));
1582 }
1583 
1584 static int
1585 stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1586 {
1587 	char *sp;
1588 	int argc, error;
1589 	int argv_empty = 0;
1590 	size_t ptrsize = args->from_ptrsize;
1591 	size_t size, pad;
1592 	char *argv = (char *)uap->argp;
1593 	char *envp = (char *)uap->envp;
1594 
1595 	/*
1596 	 * Copy interpreter's name and argument to argv[0] and argv[1].
1597 	 * In the rare case that we have nested interpreters then those names
1598 	 * and arguments are also copied to the subsequent slots in argv.
1599 	 */
1600 	if (intp != NULL && intp->intp_name[0] != NULL) {
1601 		int i;
1602 
1603 		for (i = 0; i < INTP_MAXDEPTH; i++) {
1604 			if (intp->intp_name[i] == NULL)
1605 				break;
1606 			error = stk_add(args, intp->intp_name[i], UIO_SYSSPACE);
1607 			if (error != 0)
1608 				return (error);
1609 			if (intp->intp_arg[i] != NULL) {
1610 				error = stk_add(args, intp->intp_arg[i],
1611 				    UIO_SYSSPACE);
1612 				if (error != 0)
1613 					return (error);
1614 			}
1615 		}
1616 
1617 		if (args->fname != NULL)
1618 			error = stk_add(args, args->fname, UIO_SYSSPACE);
1619 		else
1620 			error = stk_add(args, uap->fname, UIO_USERSPACE);
1621 		if (error)
1622 			return (error);
1623 
1624 		/*
1625 		 * Check for an empty argv[].
1626 		 */
1627 		if (stk_getptr(args, argv, &sp))
1628 			return (EFAULT);
1629 		if (sp == NULL)
1630 			argv_empty = 1;
1631 
1632 		argv += ptrsize;		/* ignore original argv[0] */
1633 	}
1634 
1635 	if (argv_empty == 0) {
1636 		/*
1637 		 * Add argv[] strings to the stack.
1638 		 */
1639 		for (;;) {
1640 			if (stk_getptr(args, argv, &sp))
1641 				return (EFAULT);
1642 			if (sp == NULL)
1643 				break;
1644 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1645 				return (error);
1646 			argv += ptrsize;
1647 		}
1648 	}
1649 	argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1650 	args->arglen = args->stk_strp - args->stk_base;
1651 
1652 	/*
1653 	 * Add environ[] strings to the stack.
1654 	 */
1655 	if (envp != NULL) {
1656 		for (;;) {
1657 			char *tmp = args->stk_strp;
1658 			if (stk_getptr(args, envp, &sp))
1659 				return (EFAULT);
1660 			if (sp == NULL)
1661 				break;
1662 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1663 				return (error);
1664 			if (args->scrubenv && strncmp(tmp, "LD_", 3) == 0) {
1665 				/* Undo the copied string */
1666 				args->stk_strp = tmp;
1667 				*(args->stk_offp++) = NULL;
1668 			}
1669 			envp += ptrsize;
1670 		}
1671 	}
1672 	args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1673 	args->ne = args->na - argc;
1674 
1675 	/*
1676 	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
1677 	 * AT_SUN_EMULATOR strings to the stack.
1678 	 */
1679 	if (auxvpp != NULL && *auxvpp != NULL) {
1680 		if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
1681 			return (error);
1682 		if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0)
1683 			return (error);
1684 		if (args->brandname != NULL &&
1685 		    (error = stk_add(args, args->brandname, UIO_SYSSPACE)) != 0)
1686 			return (error);
1687 		if (args->emulator != NULL &&
1688 		    (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)
1689 			return (error);
1690 	}
1691 
1692 	/*
1693 	 * Compute the size of the stack.  This includes all the pointers,
1694 	 * the space reserved for the aux vector, and all the strings.
1695 	 * The total number of pointers is args->na (which is argc + envc)
1696 	 * plus 4 more: (1) a pointer's worth of space for argc; (2) the NULL
1697 	 * after the last argument (i.e. argv[argc]); (3) the NULL after the
1698 	 * last environment variable (i.e. envp[envc]); and (4) the NULL after
1699 	 * all the strings, at the very top of the stack.
1700 	 */
1701 	size = (args->na + 4) * args->to_ptrsize + args->auxsize +
1702 	    (args->stk_strp - args->stk_base);
1703 
1704 	/*
1705 	 * Pad the string section with zeroes to align the stack size.
1706 	 */
1707 	pad = P2NPHASE(size, args->stk_align);
1708 
1709 	if (STK_AVAIL(args) < pad)
1710 		return (E2BIG);
1711 
1712 	args->usrstack_size = size + pad;
1713 
1714 	while (pad-- != 0)
1715 		*args->stk_strp++ = 0;
1716 
1717 	args->nc = args->stk_strp - args->stk_base;
1718 
1719 	return (0);
1720 }
1721 
1722 static int
1723 stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
1724 {
1725 	size_t ptrsize = args->to_ptrsize;
1726 	ssize_t pslen;
1727 	char *kstrp = args->stk_base;
1728 	char *ustrp = usrstack - args->nc - ptrsize;
1729 	char *usp = usrstack - args->usrstack_size;
1730 	int *offp = (int *)(args->stk_base + args->stk_size);
1731 	int envc = args->ne;
1732 	int argc = args->na - envc;
1733 	int i;
1734 
1735 	/*
1736 	 * Record argc for /proc.
1737 	 */
1738 	up->u_argc = argc;
1739 
1740 	/*
1741 	 * Put argc on the stack.  Note that even though it's an int,
1742 	 * it always consumes ptrsize bytes (for alignment).
1743 	 */
1744 	if (stk_putptr(args, usp, (char *)(uintptr_t)argc))
1745 		return (-1);
1746 
1747 	/*
1748 	 * Add argc space (ptrsize) to usp and record argv for /proc.
1749 	 */
1750 	up->u_argv = (uintptr_t)(usp += ptrsize);
1751 
1752 	/*
1753 	 * Put the argv[] pointers on the stack.
1754 	 */
1755 	for (i = 0; i < argc; i++, usp += ptrsize)
1756 		if (stk_putptr(args, usp, &ustrp[*--offp]))
1757 			return (-1);
1758 
1759 	/*
1760 	 * Copy arguments to u_psargs.
1761 	 */
1762 	pslen = MIN(args->arglen, PSARGSZ) - 1;
1763 	for (i = 0; i < pslen; i++)
1764 		up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]);
1765 	while (i < PSARGSZ)
1766 		up->u_psargs[i++] = '\0';
1767 
1768 	/*
1769 	 * Add space for argv[]'s NULL terminator (ptrsize) to usp and
1770 	 * record envp for /proc.
1771 	 */
1772 	up->u_envp = (uintptr_t)(usp += ptrsize);
1773 
1774 	/*
1775 	 * Put the envp[] pointers on the stack.
1776 	 */
1777 	for (i = 0; i < envc; i++, usp += ptrsize)
1778 		if (stk_putptr(args, usp, &ustrp[*--offp]))
1779 			return (-1);
1780 
1781 	/*
1782 	 * Add space for envp[]'s NULL terminator (ptrsize) to usp and
1783 	 * remember where the stack ends, which is also where auxv begins.
1784 	 */
1785 	args->stackend = usp += ptrsize;
1786 
1787 	/*
1788 	 * Put all the argv[], envp[], and auxv strings on the stack.
1789 	 */
1790 	if (copyout(args->stk_base, ustrp, args->nc))
1791 		return (-1);
1792 
1793 	/*
1794 	 * Fill in the aux vector now that we know the user stack addresses
1795 	 * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
1796 	 * AT_SUN_EMULATOR strings.
1797 	 */
1798 	if (auxvpp != NULL && *auxvpp != NULL) {
1799 		if (args->to_model == DATAMODEL_NATIVE) {
1800 			auxv_t **a = (auxv_t **)auxvpp;
1801 			ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp])
1802 			ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp])
1803 			if (args->brandname != NULL)
1804 				ADDAUX(*a,
1805 				    AT_SUN_BRANDNAME, (long)&ustrp[*--offp])
1806 			if (args->emulator != NULL)
1807 				ADDAUX(*a,
1808 				    AT_SUN_EMULATOR, (long)&ustrp[*--offp])
1809 		} else {
1810 			auxv32_t **a = (auxv32_t **)auxvpp;
1811 			ADDAUX(*a,
1812 			    AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp])
1813 			ADDAUX(*a,
1814 			    AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp])
1815 			if (args->brandname != NULL)
1816 				ADDAUX(*a, AT_SUN_BRANDNAME,
1817 				    (int)(uintptr_t)&ustrp[*--offp])
1818 			if (args->emulator != NULL)
1819 				ADDAUX(*a, AT_SUN_EMULATOR,
1820 				    (int)(uintptr_t)&ustrp[*--offp])
1821 		}
1822 	}
1823 
1824 	return (0);
1825 }
1826 
1827 /*
1828  * Though the actual stack base is constant, slew the %sp by a random aligned
1829  * amount in [0,aslr_max_stack_skew).  Mostly, this makes life slightly more
1830  * complicated for buffer overflows hoping to overwrite the return address.
1831  *
1832  * On some platforms this helps avoid cache thrashing when identical processes
1833  * simultaneously share caches that don't provide enough associativity
1834  * (e.g. sun4v systems). In this case stack slewing makes the same hot stack
1835  * variables in different processes live in different cache sets increasing
1836  * effective associativity.
1837  */
1838 size_t
1839 exec_get_spslew(void)
1840 {
1841 #ifdef sun4v
1842 	static uint_t sp_color_stride = 16;
1843 	static uint_t sp_color_mask = 0x1f;
1844 	static uint_t sp_current_color = (uint_t)-1;
1845 #endif
1846 	size_t off;
1847 
1848 	ASSERT(ISP2(aslr_max_stack_skew));
1849 
1850 	if ((aslr_max_stack_skew == 0) ||
1851 	    !secflag_enabled(curproc, PROC_SEC_ASLR)) {
1852 #ifdef sun4v
1853 		uint_t spcolor = atomic_inc_32_nv(&sp_current_color);
1854 		return ((size_t)((spcolor & sp_color_mask) *
1855 		    SA(sp_color_stride)));
1856 #else
1857 		return (0);
1858 #endif
1859 	}
1860 
1861 	(void) random_get_pseudo_bytes((uint8_t *)&off, sizeof (off));
1862 	return (SA(P2PHASE(off, aslr_max_stack_skew)));
1863 }
1864 
1865 /*
1866  * Initialize a new user stack with the specified arguments and environment.
1867  * The initial user stack layout is as follows:
1868  *
1869  *	User Stack
1870  *	+---------------+
1871  *	|		|
1872  *	| stack guard	|
1873  *	| (64-bit only)	|
1874  *	|		|
1875  *	+...............+ <--- stack limit (base - curproc->p_stk_ctl)
1876  *	.		.
1877  *	.		.
1878  *	.		.
1879  *	+---------------+ <--- curproc->p_usrstack
1880  *	|		|
1881  *	| slew		|
1882  *	|		|
1883  *	+---------------+
1884  *	| NULL		|
1885  *	+---------------+
1886  *	|		|
1887  *	| auxv strings	|
1888  *	|		|
1889  *	+---------------+
1890  *	|		|
1891  *	| envp strings	|
1892  *	|		|
1893  *	+---------------+
1894  *	|		|
1895  *	| argv strings	|
1896  *	|		|
1897  *	+---------------+ <--- ustrp
1898  *	|		|
1899  *	| aux vector	|
1900  *	|		|
1901  *	+---------------+ <--- auxv
1902  *	| NULL		|
1903  *	+---------------+
1904  *	| envp[envc-1]	|
1905  *	+---------------+
1906  *	| ...		|
1907  *	+---------------+
1908  *	| envp[0]	|
1909  *	+---------------+ <--- envp[]
1910  *	| NULL		|
1911  *	+---------------+
1912  *	| argv[argc-1]	|
1913  *	+---------------+
1914  *	| ...		|
1915  *	+---------------+
1916  *	| argv[0]	|
1917  *	+---------------+ <--- argv[]
1918  *	| argc		|
1919  *	+---------------+ <--- stack base
1920  *
1921  * In 64-bit processes, a stack guard segment is allocated at the address
1922  * immediately below where the stack limit ends.  This protects new library
1923  * mappings (such as the linker) from being placed in relatively dangerous
1924  * proximity to the stack.
1925  */
/*
 * exec_args() arguments and result, as used below:
 *	uap	exec system call arguments, consumed by stk_copyin()
 *	args	exec state; to_model, addr32, stk_prot, dat_prot, execswp,
 *		ex_vp and pfcred are read here, while from_model,
 *		from_ptrsize, to_ptrsize, ncargs, stk_align and the stk_*
 *		staging fields are filled in
 *	intp	interpreter data passed through to stk_copyin()
 *	auxvpp	aux vector cursor passed through to stk_copyin() and
 *		stk_copyout()
 *
 * Returns 0 on success.  Errors returned before relvm() leave the old
 * address space in place; per the comment ahead of relvm(), failures after
 * that point are fatal to the exec()ing process.
 */
1926 int
1927 exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1928 {
1929 	size_t size;
1930 	int error;
1931 	proc_t *p = ttoproc(curthread);
1932 	user_t *up = PTOU(p);
1933 	char *usrstack;
1934 	rctl_entity_p_t e;
1935 	struct as *as;
1936 	extern int use_stk_lpg;
1937 	size_t sp_slew;
1938 #if defined(_LP64)
1939 	const size_t sg_sz = (stack_guard_seg_sz & PAGEMASK);
1940 #endif /* defined(_LP64) */
1941 
	/* Pointer size of the image we are coming from. */
1942 	args->from_model = p->p_model;
1943 	if (p->p_model == DATAMODEL_NATIVE) {
1944 		args->from_ptrsize = sizeof (long);
1945 	} else {
1946 		args->from_ptrsize = sizeof (int32_t);
1947 	}
1948 
	/* Pointer size, argument limit, alignment and stack top of the new image. */
1949 	if (args->to_model == DATAMODEL_NATIVE) {
1950 		args->to_ptrsize = sizeof (long);
1951 		args->ncargs = NCARGS;
1952 		args->stk_align = STACK_ALIGN;
1953 		if (args->addr32)
1954 			usrstack = (char *)USRSTACK64_32;
1955 		else
1956 			usrstack = (char *)USRSTACK;
1957 	} else {
1958 		args->to_ptrsize = sizeof (int32_t);
1959 		args->ncargs = NCARGS32;
1960 		args->stk_align = STACK_ALIGN32;
1961 		usrstack = (char *)USRSTACK32;
1962 	}
1963 
1964 	ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);
1965 
1966 #if defined(__sparc)
1967 	/*
1968 	 * Make sure user register windows are empty before
1969 	 * attempting to make a new stack.
1970 	 */
1971 	(void) flush_user_windows_to_stack(NULL);
1972 #endif
1973 
	/*
	 * Stage the argv/envp/auxv strings in a kernel buffer.  Start with
	 * one page and double the buffer each time stk_copyin() reports
	 * E2BIG or ENAMETOOLONG, giving up once the buffer has reached
	 * ncargs.
	 */
1974 	for (size = PAGESIZE; ; size *= 2) {
1975 		args->stk_size = size;
1976 		args->stk_base = kmem_alloc(size, KM_SLEEP);
1977 		args->stk_strp = args->stk_base;
1978 		args->stk_offp = (int *)(args->stk_base + size);
1979 		error = stk_copyin(uap, args, intp, auxvpp);
1980 		if (error == 0)
1981 			break;
1982 		kmem_free(args->stk_base, size);
1983 		if (error != E2BIG && error != ENAMETOOLONG)
1984 			return (error);
1985 		if (size >= args->ncargs)
1986 			return (E2BIG);
1987 	}
1988 
	/* From here on `size' is the size of the new user stack contents. */
1989 	size = args->usrstack_size;
1990 
1991 	ASSERT(error == 0);
1992 	ASSERT(P2PHASE(size, args->stk_align) == 0);
1993 	ASSERT((ssize_t)STK_AVAIL(args) >= 0);
1994 
1995 	if (size > args->ncargs) {
1996 		kmem_free(args->stk_base, args->stk_size);
1997 		return (E2BIG);
1998 	}
1999 
2000 	/*
2001 	 * Leave only the current lwp and force the other lwps to exit.
2002 	 * If another lwp beat us to the punch by calling exit(), bail out.
2003 	 */
2004 	if ((error = exitlwps(0)) != 0) {
2005 		kmem_free(args->stk_base, args->stk_size);
2006 		return (error);
2007 	}
2008 
2009 	/*
2010 	 * Revoke any doors created by the process.
2011 	 */
2012 	if (p->p_door_list)
2013 		door_exit();
2014 
2015 	/*
2016 	 * Release schedctl data structures.
2017 	 */
2018 	if (p->p_pagep)
2019 		schedctl_proc_cleanup();
2020 
2021 	/*
2022 	 * Clean up any DTrace helpers for the process.
2023 	 */
2024 	if (p->p_dtrace_helpers != NULL) {
2025 		ASSERT(dtrace_helpers_cleanup != NULL);
2026 		(*dtrace_helpers_cleanup)(p);
2027 	}
2028 
2029 	mutex_enter(&p->p_lock);
2030 	/*
2031 	 * Cleanup the DTrace provider associated with this process.
2032 	 */
2033 	if (p->p_dtrace_probes) {
2034 		ASSERT(dtrace_fasttrap_exec_ptr != NULL);
2035 		dtrace_fasttrap_exec_ptr(p);
2036 	}
2037 	mutex_exit(&p->p_lock);
2038 
2039 	/*
2040 	 * discard the lwpchan cache.
2041 	 */
2042 	if (p->p_lcp != NULL)
2043 		lwpchan_destroy_cache(1);
2044 
2045 	/*
2046 	 * Delete the POSIX timers.
2047 	 */
2048 	if (p->p_itimer != NULL)
2049 		timer_exit();
2050 
2051 	/*
2052 	 * Delete the ITIMER_REALPROF interval timer.
2053 	 * The other ITIMER_* interval timers are specified
2054 	 * to be inherited across exec().
2055 	 */
2056 	delete_itimer_realprof();
2057 
	/* Hand the staged argument and environment strings to auditing. */
2058 	if (AU_AUDITING())
2059 		audit_exec(args->stk_base, args->stk_base + args->arglen,
2060 		    args->na - args->ne, args->ne, args->pfcred);
2061 
2062 	/*
2063 	 * Ensure that we don't change resource associations while we
2064 	 * change address spaces.
2065 	 */
2066 	mutex_enter(&p->p_lock);
2067 	pool_barrier_enter();
2068 	mutex_exit(&p->p_lock);
2069 
2070 	/*
2071 	 * Destroy the old address space and create a new one.
2072 	 * From here on, any errors are fatal to the exec()ing process.
2073 	 * On error we return -1, which means the caller must SIGKILL
2074 	 * the process.
2075 	 */
2076 	relvm();
2077 
2078 	mutex_enter(&p->p_lock);
2079 	pool_barrier_exit();
2080 	mutex_exit(&p->p_lock);
2081 
2082 	up->u_execsw = args->execswp;
2083 
	/* Reset the per-process heap/stack bookkeeping for the new image. */
2084 	p->p_brkbase = NULL;
2085 	p->p_brksize = 0;
2086 	p->p_brkpageszc = 0;
2087 	p->p_stksize = 0;
2088 	p->p_stkpageszc = 0;
2089 	p->p_stkg_start = 0;
2090 	p->p_stkg_end = 0;
2091 	p->p_model = args->to_model;
2092 	p->p_usrstack = usrstack;
2093 	p->p_stkprot = args->stk_prot;
2094 	p->p_datprot = args->dat_prot;
2095 
2096 	/*
2097 	 * Reset resource controls such that all controls are again active as
2098 	 * well as appropriate to the potentially new address model for the
2099 	 * process.
2100 	 */
2101 	e.rcep_p.proc = p;
2102 	e.rcep_t = RCENTITY_PROCESS;
2103 	rctl_set_reset(p->p_rctls, p, &e);
2104 
2105 	/* Too early to call map_pgsz for the heap */
2106 	if (use_stk_lpg) {
2107 		p->p_stkpageszc = page_szc(map_pgsz(MAPPGSZ_STK, p, 0, 0, 0));
2108 	}
2109 
2110 	mutex_enter(&p->p_lock);
2111 	p->p_flag |= SAUTOLPG;	/* kernel controls page sizes */
2112 	mutex_exit(&p->p_lock);
2113 
2114 	sp_slew = exec_get_spslew();
2115 	ASSERT(P2PHASE(sp_slew, args->stk_align) == 0);
2116 	/* Be certain we don't underflow */
2117 	VERIFY((curproc->p_usrstack - (size + sp_slew)) < curproc->p_usrstack);
2118 	exec_set_sp(size + sp_slew);
2119 
	/* Allocate the replacement address space and attach it to the process. */
2120 	as = as_alloc();
2121 	p->p_as = as;
2122 	as->a_proc = p;
2123 	if (p->p_model == DATAMODEL_ILP32 || args->addr32)
2124 		as->a_userlimit = (caddr_t)USERLIMIT32;
2125 	(void) hat_setup(as->a_hat, HAT_ALLOC);
2126 	hat_join_srd(as->a_hat, args->ex_vp);
2127 
2128 	/* Write out the contents of the new stack. */
2129 	error = stk_copyout(args, usrstack - sp_slew, auxvpp, up);
	/* The staging buffer is freed whether or not the copyout worked. */
2130 	kmem_free(args->stk_base, args->stk_size);
2131 
2132 #if defined(_LP64)
2133 	/* Add stack guard segment (if needed) after successful copyout */
2134 	if (error == 0 && p->p_model == DATAMODEL_LP64 && sg_sz != 0) {
2135 		seghole_crargs_t sca;
2136 		caddr_t addr_end = (caddr_t)(((uintptr_t)usrstack -
2137 		    p->p_stk_ctl) & PAGEMASK);
2138 		caddr_t addr_start = addr_end - sg_sz;
2139 
2140 		DTRACE_PROBE4(stack__guard__chk, proc_t *, p,
2141 		    caddr_t, addr_start, caddr_t, addr_end, size_t, sg_sz);
2142 
		/* Reject a guard range that wrapped or is not valid user space. */
2143 		if (addr_end >= usrstack || addr_start >= addr_end ||
2144 		    valid_usr_range(addr_start, sg_sz, PROT_NONE, as,
2145 		    as->a_userlimit) != RANGE_OKAY) {
2146 			return (E2BIG);
2147 		}
2148 
2149 		/* Create un-mappable area in AS with seg_hole */
2150 		sca.name = "stack_guard";
2151 		error = as_map(as, addr_start, sg_sz, seghole_create, &sca);
2152 		if (error == 0) {
2153 			p->p_stkg_start = (uintptr_t)addr_start;
2154 			p->p_stkg_end = (uintptr_t)addr_start + sg_sz;
2155 		}
2156 	}
2157 #endif /* defined(_LP64) */
2158 
2159 	return (error);
2160 }
2161