xref: /illumos-gate/usr/src/uts/common/os/exec.c (revision 9164a50bf932130cbb5097a16f6986873ce0e6e5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*	Copyright (c) 1988 AT&T	*/
27 /*	  All Rights Reserved	*/
28 /*
29  * Copyright 2015 Garrett D'Amore <garrett@damore.org>
30  * Copyright 2019 Joyent, Inc.
31  * Copyright 2024 Oxide Computer Company
32  */
33 
34 #include <sys/types.h>
35 #include <sys/param.h>
36 #include <sys/sysmacros.h>
37 #include <sys/systm.h>
38 #include <sys/signal.h>
39 #include <sys/cred_impl.h>
40 #include <sys/policy.h>
41 #include <sys/user.h>
42 #include <sys/errno.h>
43 #include <sys/file.h>
44 #include <sys/vfs.h>
45 #include <sys/vnode.h>
46 #include <sys/mman.h>
47 #include <sys/acct.h>
48 #include <sys/cpuvar.h>
49 #include <sys/proc.h>
50 #include <sys/cmn_err.h>
51 #include <sys/debug.h>
52 #include <sys/pathname.h>
53 #include <sys/vm.h>
54 #include <sys/lgrp.h>
55 #include <sys/vtrace.h>
56 #include <sys/exec.h>
57 #include <sys/execx.h>
58 #include <sys/exechdr.h>
59 #include <sys/kmem.h>
60 #include <sys/prsystm.h>
61 #include <sys/modctl.h>
62 #include <sys/vmparam.h>
63 #include <sys/door.h>
64 #include <sys/schedctl.h>
65 #include <sys/utrap.h>
66 #include <sys/systeminfo.h>
67 #include <sys/stack.h>
68 #include <sys/rctl.h>
69 #include <sys/dtrace.h>
70 #include <sys/lwpchan_impl.h>
71 #include <sys/pool.h>
72 #include <sys/sdt.h>
73 #include <sys/brand.h>
74 #include <sys/klpd.h>
75 #include <sys/random.h>
76 
77 #include <c2/audit.h>
78 
79 #include <vm/hat.h>
80 #include <vm/anon.h>
81 #include <vm/as.h>
82 #include <vm/seg.h>
83 #include <vm/seg_vn.h>
84 #include <vm/seg_hole.h>
85 
86 #define	PRIV_RESET		0x01	/* needs to reset privs */
87 #define	PRIV_SETID		0x02	/* needs to change uids */
88 #define	PRIV_SETUGID		0x04	/* is setuid/setgid/forced privs */
89 #define	PRIV_INCREASE		0x08	/* child runs with more privs */
90 #define	MAC_FLAGS		0x10	/* need to adjust MAC flags */
91 #define	PRIV_FORCED		0x20	/* has forced privileges */
92 
93 static int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *,
94     priv_set_t *, cred_t *, const char *);
95 static int hold_execsw(struct execsw *);
96 
97 uint_t auxv_hwcap = 0;	/* auxv AT_SUN_HWCAP value; determined on the fly */
98 uint_t auxv_hwcap_2 = 0;	/* AT_SUN_HWCAP2 */
99 uint_t auxv_hwcap_3 = 0;	/* AT_SUN_HWCAP3 */
100 #if defined(_SYSCALL32_IMPL)
101 uint_t auxv_hwcap32 = 0;	/* 32-bit version of auxv_hwcap */
102 uint_t auxv_hwcap32_2 = 0;	/* 32-bit version of auxv_hwcap2 */
103 uint_t auxv_hwcap32_3 = 0;	/* 32-bit version of auxv_hwcap3 */
104 #endif
105 
106 #define	PSUIDFLAGS		(SNOCD|SUGID)
107 
108 /*
109  * These are consumed within the specific exec modules, but are defined here
110  * because
111  *
112  * 1) The exec modules are unloadable, which would make this near useless.
113  *
114  * 2) We want them to be common across all of them, should more than ELF come
115  *    to support them.
116  *
117  * All must be powers of 2.
118  */
119 size_t aslr_max_brk_skew = 16 * 1024 * 1024; /* 16MB */
120 #pragma weak exec_stackgap = aslr_max_stack_skew /* Old, compatible name */
121 size_t aslr_max_stack_skew = 64 * 1024; /* 64KB */
122 
123 /*
124  * Size of guard segment for 64-bit processes and minimum size it can be shrunk
125  * to in the case of grow() operations.  These are kept as variables in case
126  * they need to be tuned in an emergency.
127  */
128 size_t stack_guard_seg_sz = 256 * 1024 * 1024;
129 size_t stack_guard_min_sz = 64 * 1024 * 1024;
130 
131 /*
132  * exece() - system call wrapper around exec_common()
133  */
134 int
135 exece(uintptr_t file, const char **argp, const char **envp, int flags)
136 {
137 	int error;
138 
139 	if ((flags & ~EXEC_DESCRIPTOR) != 0)
140 		return (set_errno(EINVAL));
141 
142 	if ((flags & EXEC_DESCRIPTOR) != 0) {
143 		/*
144 		 * If EXEC_DESCRIPTOR is specified, then the `file`
145 		 * parameter is the number of a file descriptor in the current
146 		 * process.
147 		 */
148 		char *path = NULL;
149 		size_t allocsize;
150 		int fd = (int)file;
151 		vnode_t *vp = NULL;
152 
153 		if ((error = fgetstartvp(fd, NULL, &vp)) != 0)
154 			return (set_errno(error));
155 
156 		mutex_enter(&vp->v_lock);
157 		if (vp->v_path != NULL && vp->v_path != vn_vpath_empty) {
158 			allocsize = strlen(vp->v_path) + 1;
159 			path = kmem_alloc(allocsize, KM_NOSLEEP);
160 			if (path == NULL) {
161 				mutex_exit(&vp->v_lock);
162 				VN_RELE(vp);
163 				return (set_errno(ENOMEM));
164 			}
165 			bcopy(vp->v_path, path, allocsize);
166 		}
167 		mutex_exit(&vp->v_lock);
168 
169 		/*
170 		 * In the unlikely event that the descriptor's path is not
171 		 * cached, we fall back to using a constructed one.
172 		 */
173 		if (path == NULL) {
174 			/* 8 for "/dev/fd/", 10 for %d, + \0 == 19 */
175 			allocsize = 20;
176 			path = kmem_alloc(allocsize, KM_NOSLEEP);
177 			if (path == NULL) {
178 				VN_RELE(vp);
179 				return (set_errno(ENOMEM));
180 			}
181 			(void) snprintf(path, allocsize, "/dev/fd/%d", fd);
182 		}
183 
184 		error = exec_common(path, argp, envp, vp, EBA_NONE);
185 		VN_RELE(vp);
186 		kmem_free(path, allocsize);
187 	} else {
188 		const char *fname = (const char *)file;
189 
190 		error = exec_common(fname, argp, envp, NULL, EBA_NONE);
191 	}
192 
193 	return (error ? (set_errno(error)) : 0);
194 }
195 
/*
 * Common guts of exec(2): resolve the executable's pathname (or use the
 * caller-supplied vnode), enforce exec policy, hand the image off to
 * gexec() to load it, and then reset per-process and per-lwp state
 * (signals, saved rlimits, lwp directory, profiling, /proc visibility)
 * for the new program.
 *
 * Returns 0 on success or an errno value; callers are responsible for
 * set_errno().  When `vp` is non-NULL its reference count is unchanged
 * across the call (an extra hold is taken and released internally).
 */
int
exec_common(const char *fname, const char **argp, const char **envp,
    vnode_t *vp, int brand_action)
{
	vnode_t *dir = NULL, *tmpvp = NULL;
	proc_t *p = ttoproc(curthread);
	klwp_t *lwp = ttolwp(curthread);
	struct user *up = PTOU(p);
	size_t execsz;		/* temporary count of exec size */
	int i;
	int error;
	char exec_file[MAXCOMLEN+1];
	struct pathname pn;
	struct pathname resolvepn;
	struct uarg args;
	struct execa ua;
	k_sigset_t savedmask;
	lwpdir_t *lwpdir = NULL;
	tidhash_t *tidhash;
	lwpdir_t *old_lwpdir = NULL;
	uint_t old_lwpdir_sz;
	tidhash_t *old_tidhash;
	uint_t old_tidhash_sz;
	ret_tidhash_t *ret_tidhash;
	lwpent_t *lep;
	boolean_t brandme = B_FALSE;

	/*
	 * exec() is not supported for the /proc agent lwp.
	 */
	if (curthread == p->p_agenttp)
		return (ENOTSUP);

	if (brand_action != EBA_NONE) {
		/*
		 * Brand actions are not supported for processes that are not
		 * running in a branded zone.
		 */
		if (!ZONE_IS_BRANDED(p->p_zone))
			return (ENOTSUP);

		if (brand_action == EBA_NATIVE) {
			/* Only branded processes can be unbranded */
			if (!PROC_IS_BRANDED(p))
				return (ENOTSUP);
		} else {
			/* Only unbranded processes can be branded */
			if (PROC_IS_BRANDED(p))
				return (ENOTSUP);
			brandme = B_TRUE;
		}
	} else {
		/*
		 * If this is a native zone, or if the process is already
		 * branded, then we don't need to do anything.  If this is
		 * a native process in a branded zone, we need to brand the
		 * process as it exec()s the new binary.
		 */
		if (ZONE_IS_BRANDED(p->p_zone) && !PROC_IS_BRANDED(p))
			brandme = B_TRUE;
	}

	/*
	 * Inform /proc that an exec() has started.
	 * Hold signals that are ignored by default so that we will
	 * not be interrupted by a signal that will be ignored after
	 * successful completion of gexec().
	 */
	mutex_enter(&p->p_lock);
	prexecstart();
	schedctl_finish_sigblock(curthread);
	savedmask = curthread->t_hold;
	sigorset(&curthread->t_hold, &ignoredefault);
	mutex_exit(&p->p_lock);

	if (vp != NULL) {
		/*
		 * When a vnode is passed in we take an extra hold here and
		 * release it before returning. This means that callers don't
		 * need to account for the reference changing over the call.
		 */
		VN_HOLD(vp);
		pn_alloc(&pn);
		pn_alloc(&resolvepn);
		VERIFY0(pn_set(&pn, fname));
		VERIFY0(pn_set(&resolvepn, fname));
	} else {
		/*
		 * Look up path name and remember last component for later.
		 * To help coreadm expand its %d token, we attempt to save
		 * the directory containing the executable in p_execdir. The
		 * first call to lookuppn() may fail and return EINVAL because
		 * dirvpp is non-NULL. In that case, we make a second call to
		 * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
		 * but coreadm is allowed to expand %d to the empty string and
		 * there are other cases in which that failure may occur.
		 */
		if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
			goto out;
		pn_alloc(&resolvepn);
		error = lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp);
		if (error != 0) {
			pn_free(&resolvepn);
			pn_free(&pn);
			if (error != EINVAL)
				goto out;

			dir = NULL;
			if ((error = pn_get((char *)fname, UIO_USERSPACE,
			    &pn)) != 0) {
				goto out;
			}
			pn_alloc(&resolvepn);
			if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
			    &vp)) != 0) {
				pn_free(&resolvepn);
				pn_free(&pn);
				goto out;
			}
		}
	}

	if (vp == NULL) {
		if (dir != NULL)
			VN_RELE(dir);
		error = ENOENT;
		pn_free(&resolvepn);
		pn_free(&pn);
		goto out;
	}

	if ((error = secpolicy_basic_exec(CRED(), vp)) != 0) {
		if (dir != NULL)
			VN_RELE(dir);
		pn_free(&resolvepn);
		pn_free(&pn);
		VN_RELE(vp);
		goto out;
	}

	/*
	 * We do not allow executing files in attribute directories.
	 * We test this by determining whether the resolved path
	 * contains a "/" when we're in an attribute directory;
	 * only if the pathname does not contain a "/" the resolved path
	 * points to a file in the current working (attribute) directory.
	 */
	if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
	    strchr(resolvepn.pn_path, '/') == NULL) {
		if (dir != NULL)
			VN_RELE(dir);
		error = EACCES;
		pn_free(&resolvepn);
		pn_free(&pn);
		VN_RELE(vp);
		goto out;
	}

	bzero(exec_file, MAXCOMLEN+1);
	(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
	bzero(&args, sizeof (args));
	args.pathname = resolvepn.pn_path;
	/* don't free resolvepn until we are done with args */
	pn_free(&pn);

	/*
	 * If we're running in a profile shell, then call pfexecd.
	 */
	if ((CR_FLAGS(p->p_cred) & PRIV_PFEXEC) != 0) {
		error = pfexec_call(p->p_cred, &resolvepn, &args.pfcred,
		    &args.scrubenv);

		/* Returning errno in case we're not allowed to execute. */
		if (error > 0) {
			if (dir != NULL)
				VN_RELE(dir);
			pn_free(&resolvepn);
			VN_RELE(vp);
			goto out;
		}

		/* Don't change the credentials when using old ptrace. */
		if (args.pfcred != NULL &&
		    (p->p_proc_flag & P_PR_PTRACE) != 0) {
			crfree(args.pfcred);
			args.pfcred = NULL;
			args.scrubenv = B_FALSE;
		}
	}

	/*
	 * Specific exec handlers, or policies determined via
	 * /etc/system may override the historical default.
	 */
	args.stk_prot = PROT_ZFOD;
	args.dat_prot = PROT_ZFOD;

	CPU_STATS_ADD_K(sys, sysexec, 1);
	DTRACE_PROC1(exec, char *, args.pathname);

	/* Package the user's argument/environment pointers for the loader. */
	ua.fname = fname;
	ua.argp = argp;
	ua.envp = envp;

	/* If necessary, brand this process before we start the exec. */
	if (brandme)
		brand_setbrand(p);

	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
	    exec_file, p->p_cred, brand_action)) != 0) {
		if (brandme)
			brand_clearbrand(p, B_FALSE);
		VN_RELE(vp);
		if (dir != NULL)
			VN_RELE(dir);
		pn_free(&resolvepn);
		goto fail;
	}

	/*
	 * gexec() succeeded: the new image is in place.  Everything below
	 * re-initializes process and lwp state for the new program; note
	 * that there are no failure paths from here on.
	 */

	/*
	 * Free floating point registers (sun4u only)
	 */
	ASSERT(lwp != NULL);
	lwp_freeregs(lwp, 1);

	/*
	 * Free thread and process context ops.
	 */
	if (curthread->t_ctx)
		freectx(curthread, 1);
	if (p->p_pctx)
		freepctx(p, 1);

	/*
	 * Remember file name for accounting; clear any cached DTrace predicate.
	 */
	up->u_acflag &= ~AFORK;
	bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
	curthread->t_predcache = 0;

	/*
	 * Clear contract template state
	 */
	lwp_ctmpl_clear(lwp);

	/*
	 * Save the directory in which we found the executable for expanding
	 * the %d token used in core file patterns.
	 */
	mutex_enter(&p->p_lock);
	tmpvp = p->p_execdir;
	p->p_execdir = dir;
	if (p->p_execdir != NULL)
		VN_HOLD(p->p_execdir);
	mutex_exit(&p->p_lock);

	if (tmpvp != NULL)
		VN_RELE(tmpvp);

	/*
	 * Reset stack state to the user stack, clear set of signals
	 * caught on the signal stack, and reset list of signals that
	 * restart system calls; the new program's environment should
	 * not be affected by detritus from the old program.  Any
	 * pending held signals remain held, so don't clear t_hold.
	 */
	mutex_enter(&p->p_lock);
	lwp->lwp_oldcontext = 0;
	lwp->lwp_ustack = 0;
	lwp->lwp_old_stk_ctl = 0;
	sigemptyset(&up->u_signodefer);
	sigemptyset(&up->u_sigonstack);
	sigemptyset(&up->u_sigresethand);
	lwp->lwp_sigaltstack.ss_sp = 0;
	lwp->lwp_sigaltstack.ss_size = 0;
	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;

	/*
	 * Make saved resource limit == current resource limit.
	 */
	for (i = 0; i < RLIM_NLIMITS; i++) {
		/*CONSTCOND*/
		if (RLIM_SAVED(i)) {
			(void) rctl_rlimit_get(rctlproc_legacy[i], p,
			    &up->u_saved_rlimit[i]);
		}
	}

	/*
	 * If the action was to catch the signal, then the action
	 * must be reset to SIG_DFL.
	 */
	sigdefault(p);
	p->p_flag &= ~(SNOWAIT|SJCTL);
	p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
	up->u_signal[SIGCLD - 1] = SIG_DFL;

	/*
	 * Delete the dot4 sigqueues/signotifies.
	 */
	sigqfree(p);

	mutex_exit(&p->p_lock);

	/* Discard any profil(2) profiling state from the old image. */
	mutex_enter(&p->p_pflock);
	p->p_prof.pr_base = NULL;
	p->p_prof.pr_size = 0;
	p->p_prof.pr_off = 0;
	p->p_prof.pr_scale = 0;
	p->p_prof.pr_samples = 0;
	mutex_exit(&p->p_pflock);

	ASSERT(curthread->t_schedctl == NULL);

#if defined(__sparc)
	if (p->p_utraps != NULL)
		utrap_free(p);
#endif	/* __sparc */

	/*
	 * Close all close-on-exec files.
	 */
	close_exec(P_FINFO(p));
	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);

	/* Unbrand ourself if necessary. */
	if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE))
		brand_clearbrand(p, B_FALSE);

	setregs(&args);

	/* Mark this as an executable vnode */
	mutex_enter(&vp->v_lock);
	vp->v_flag |= VVMEXEC;
	mutex_exit(&vp->v_lock);

	VN_RELE(vp);
	if (dir != NULL)
		VN_RELE(dir);
	pn_free(&resolvepn);

	/*
	 * Allocate a new lwp directory and lwpid hash table if necessary.
	 */
	if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
		lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
		lwpdir->ld_next = lwpdir + 1;
		tidhash = kmem_zalloc(2 * sizeof (tidhash_t), KM_SLEEP);
		if (p->p_lwpdir != NULL)
			lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
		else
			lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
	}

	if (PROC_IS_BRANDED(p))
		BROP(p)->b_exec();

	mutex_enter(&p->p_lock);
	prbarrier(p);

	/*
	 * Reset lwp id to the default value of 1.
	 * This is a single-threaded process now
	 * and lwp #1 is lwp_wait()able by default.
	 * The t_unpark flag should not be inherited.
	 */
	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
	curthread->t_tid = 1;
	kpreempt_disable();
	ASSERT(curthread->t_lpl != NULL);
	p->p_t1_lgrpid = curthread->t_lpl->lpl_lgrpid;
	kpreempt_enable();
	if (p->p_tr_lgrpid != LGRP_NONE && p->p_tr_lgrpid != p->p_t1_lgrpid) {
		lgrp_update_trthr_migrations(1);
	}
	curthread->t_unpark = 0;
	curthread->t_proc_flag |= TP_TWAIT;
	curthread->t_proc_flag &= ~TP_DAEMON;	/* daemons shouldn't exec */
	p->p_lwpdaemon = 0;			/* but oh well ... */
	p->p_lwpid = 1;

	/*
	 * Install the newly-allocated lwp directory and lwpid hash table
	 * and insert the current thread into the new hash table.
	 */
	if (lwpdir != NULL) {
		old_lwpdir = p->p_lwpdir;
		old_lwpdir_sz = p->p_lwpdir_sz;
		old_tidhash = p->p_tidhash;
		old_tidhash_sz = p->p_tidhash_sz;
		p->p_lwpdir = p->p_lwpfree = lwpdir;
		p->p_lwpdir_sz = 2;
		lep->le_thread = curthread;
		lep->le_lwpid = curthread->t_tid;
		lep->le_start = curthread->t_start;
		lwp_hash_in(p, lep, tidhash, 2, 0);
		p->p_tidhash = tidhash;
		p->p_tidhash_sz = 2;
	}
	ret_tidhash = p->p_ret_tidhash;
	p->p_ret_tidhash = NULL;

	/*
	 * Restore the saved signal mask and
	 * inform /proc that the exec() has finished.
	 */
	curthread->t_hold = savedmask;
	prexecend();
	mutex_exit(&p->p_lock);

	/* Free the replaced lwp directory/hash outside of p_lock. */
	if (old_lwpdir) {
		kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
		kmem_free(old_tidhash, old_tidhash_sz * sizeof (tidhash_t));
	}
	while (ret_tidhash != NULL) {
		ret_tidhash_t *next = ret_tidhash->rth_next;
		kmem_free(ret_tidhash->rth_tidhash,
		    ret_tidhash->rth_tidhash_sz * sizeof (tidhash_t));
		kmem_free(ret_tidhash, sizeof (*ret_tidhash));
		ret_tidhash = next;
	}

	ASSERT(error == 0);
	DTRACE_PROC(exec__success);
	return (0);

fail:
	DTRACE_PROC1(exec__failure, int, error);
out:		/* error return */
	mutex_enter(&p->p_lock);
	curthread->t_hold = savedmask;
	prexecend();
	mutex_exit(&p->p_lock);
	ASSERT(error != 0);
	return (error);
}
631 
632 
633 /*
634  * Perform generic exec duties and switchout to object-file specific
635  * handler.
636  */
/*
 * Load the image at *vpp via the matching execsw handler, adjusting
 * credentials (setuid/setgid, forced and pfexec privileges) as required.
 * Credential manipulation, setid handling, and the /proc notifications
 * happen only at level 0 (the outermost invocation); nonzero levels
 * appear to be nested invocations from interpreter handlers — confirm
 * against the exec modules.  Returns 0 on success or an errno value.
 */
int
gexec(
	struct vnode **vpp,
	struct execa *uap,
	struct uarg *args,
	struct intpdata *idatap,
	int level,
	size_t *execsz,
	caddr_t exec_file,
	struct cred *cred,
	int brand_action)
{
	struct vnode *vp, *execvp = NULL;
	proc_t *pp = ttoproc(curthread);
	struct execsw *eswp;
	int error = 0;
	int suidflags = 0;
	ssize_t resid;
	uid_t uid, gid;
	struct vattr vattr;
	char magbuf[MAGIC_BYTES];
	int setid;
	cred_t *oldcred, *newcred = NULL;
	int privflags = 0;
	int setidfl;
	priv_set_t fset;
	secflagset_t old_secflags;

	secflags_copy(&old_secflags, &pp->p_secflags.psf_effective);

	/*
	 * If the SNOCD or SUGID flag is set, turn it off and remember the
	 * previous setting so we can restore it if we encounter an error.
	 */
	if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
		mutex_enter(&pp->p_lock);
		suidflags = pp->p_flag & PSUIDFLAGS;
		pp->p_flag &= ~PSUIDFLAGS;
		mutex_exit(&pp->p_lock);
	}

	if ((error = execpermissions(*vpp, &vattr, args)) != 0)
		goto bad_noclose;

	/* need to open vnode for stateful file systems */
	if ((error = VOP_OPEN(vpp, FREAD, CRED(), NULL)) != 0)
		goto bad_noclose;
	vp = *vpp;

	/*
	 * Note: to support binary compatibility with SunOS a.out
	 * executables, we read in the first four bytes, as the
	 * magic number is in bytes 2-3.
	 */
	if (error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
	    (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid))
		goto bad;
	if (resid != 0)
		goto bad;

	/*
	 * On success findexec_by_hdr() returns with eswp->exec_lock held
	 * as reader (taken in hold_execsw()); it is dropped after the
	 * exec_func call below.
	 */
	if ((eswp = findexec_by_hdr(magbuf)) == NULL)
		goto bad;

	if (level == 0 &&
	    (privflags = execsetid(vp, &vattr, &uid, &gid, &fset,
	    args->pfcred == NULL ? cred : args->pfcred, args->pathname)) != 0) {

		/* Pfcred is a credential with a ref count of 1 */

		if (args->pfcred != NULL) {
			privflags |= PRIV_INCREASE|PRIV_RESET;
			newcred = cred = args->pfcred;
		} else {
			newcred = cred = crdup(cred);
		}

		/* If we can, drop the PA bit */
		if ((privflags & PRIV_RESET) != 0)
			priv_adjust_PA(cred);

		if (privflags & PRIV_SETID) {
			cred->cr_uid = uid;
			cred->cr_gid = gid;
			cred->cr_suid = uid;
			cred->cr_sgid = gid;
		}

		if (privflags & MAC_FLAGS) {
			if (!(CR_FLAGS(cred) & NET_MAC_AWARE_INHERIT))
				CR_FLAGS(cred) &= ~NET_MAC_AWARE;
			CR_FLAGS(cred) &= ~NET_MAC_AWARE_INHERIT;
		}

		/*
		 * Implement the privilege updates:
		 *
		 * Restrict with L:
		 *
		 *	I' = I & L
		 *
		 *	E' = P' = (I' + F) & A
		 *
		 * But if running under ptrace, we cap I and F with P.
		 */
		if ((privflags & (PRIV_RESET|PRIV_FORCED)) != 0) {
			if ((privflags & PRIV_INCREASE) != 0 &&
			    (pp->p_proc_flag & P_PR_PTRACE) != 0) {
				priv_intersect(&CR_OPPRIV(cred),
				    &CR_IPRIV(cred));
				priv_intersect(&CR_OPPRIV(cred), &fset);
			}
			priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
			CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
			if (privflags & PRIV_FORCED) {
				priv_set_PA(cred);
				priv_union(&fset, &CR_EPRIV(cred));
				priv_union(&fset, &CR_PPRIV(cred));
			}
			priv_adjust_PA(cred);
		}
	} else if (level == 0 && args->pfcred != NULL) {
		newcred = cred = args->pfcred;
		privflags |= PRIV_INCREASE;
		/* pfcred is not forced to adhere to these settings */
		priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
		CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
		priv_adjust_PA(cred);
	}

	/* The new image gets the inheritable secflags as its secflags */
	secflags_promote(pp);

	/* SunOS 4.x buy-back */
	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
	    (vattr.va_mode & (VSUID|VSGID))) {
		/*
		 * The executable is setuid/setgid on a filesystem mounted
		 * nosuid: log a diagnostic (the bits are simply not
		 * honored; execution continues).
		 */
		char path[MAXNAMELEN];
		refstr_t *mntpt = NULL;
		int ret = -1;

		bzero(path, sizeof (path));
		zone_hold(pp->p_zone);

		ret = vnodetopath(pp->p_zone->zone_rootvp, vp, path,
		    sizeof (path), cred);

		/* fallback to mountpoint if a path can't be found */
		if ((ret != 0) || (ret == 0 && path[0] == '\0'))
			mntpt = vfs_getmntpoint(vp->v_vfsp);

		if (mntpt == NULL)
			zcmn_err(pp->p_zone->zone_id, CE_NOTE,
			    "!uid %d: setuid execution not allowed, "
			    "file=%s", cred->cr_uid, path);
		else
			zcmn_err(pp->p_zone->zone_id, CE_NOTE,
			    "!uid %d: setuid execution not allowed, "
			    "fs=%s, file=%s", cred->cr_uid,
			    ZONE_PATH_TRANSLATE(refstr_value(mntpt),
			    pp->p_zone), exec_file);

		if (!INGLOBALZONE(pp)) {
			/* zone_rootpath always has trailing / */
			if (mntpt == NULL)
				cmn_err(CE_NOTE, "!zone: %s, uid: %d "
				    "setuid execution not allowed, file=%s%s",
				    pp->p_zone->zone_name, cred->cr_uid,
				    pp->p_zone->zone_rootpath, path + 1);
			else
				cmn_err(CE_NOTE, "!zone: %s, uid: %d "
				    "setuid execution not allowed, fs=%s, "
				    "file=%s", pp->p_zone->zone_name,
				    cred->cr_uid, refstr_value(mntpt),
				    exec_file);
		}

		if (mntpt != NULL)
			refstr_rele(mntpt);

		zone_rele(pp->p_zone);
	}

	/*
	 * execsetid() told us whether or not we had to change the
	 * credentials of the process.  In privflags, it told us
	 * whether we gained any privileges or executed a set-uid executable.
	 */
	setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE|PRIV_FORCED));

	/*
	 * Use /etc/system variable to determine if the stack
	 * should be marked as executable by default.
	 */
	if ((noexec_user_stack != 0) ||
	    secflag_enabled(pp, PROC_SEC_NOEXECSTACK))
		args->stk_prot &= ~PROT_EXEC;

	args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */
	args->ex_vp = vp;

	/*
	 * Traditionally, the setid flags told the sub processes whether
	 * the file just executed was set-uid or set-gid; this caused
	 * some confusion as the 'setid' flag did not match the SUGID
	 * process flag which is only set when the uids/gids do not match.
	 * A script set-gid/set-uid to the real uid/gid would start with
	 * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
	 * Now we flag those cases where the calling process cannot
	 * be trusted to influence the newly exec'ed process, either
	 * because it runs with more privileges or when the uids/gids
	 * do in fact not match.
	 * This also makes the runtime linker agree with the on exec
	 * values of SNOCD and SUGID.
	 */
	setidfl = 0;
	if (cred->cr_uid != cred->cr_ruid || (cred->cr_rgid != cred->cr_gid &&
	    !supgroupmember(cred->cr_gid, cred))) {
		setidfl |= EXECSETID_UGIDS;
	}
	if (setid & PRIV_SETUGID)
		setidfl |= EXECSETID_SETID;
	if (setid & PRIV_FORCED)
		setidfl |= EXECSETID_PRIVS;

	execvp = pp->p_exec;
	if (execvp)
		VN_HOLD(execvp);

	error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
	    setidfl, exec_file, cred, brand_action);
	rw_exit(eswp->exec_lock);
	if (error != 0) {
		if (execvp)
			VN_RELE(execvp);
		/*
		 * If this process's p_exec has been set to the vp of
		 * the executable by exec_func, we will return without
		 * calling VOP_CLOSE because proc_exit will close it
		 * on exit.
		 */
		if (pp->p_exec == vp)
			goto bad_noclose;
		else
			goto bad;
	}

	if (level == 0) {
		uid_t oruid;

		if (execvp != NULL) {
			/*
			 * Close the previous executable only if we are
			 * at level 0.
			 */
			(void) VOP_CLOSE(execvp, FREAD, 1, (offset_t)0,
			    cred, NULL);
		}

		mutex_enter(&pp->p_crlock);

		/* Remember the old real uid for the upcount fixup below. */
		oruid = pp->p_cred->cr_ruid;

		if (newcred != NULL) {
			/*
			 * Free the old credentials, and set the new ones.
			 * Do this for both the process and the (single) thread.
			 */
			crfree(pp->p_cred);
			pp->p_cred = cred;	/* cred already held for proc */
			crhold(cred);		/* hold new cred for thread */
			/*
			 * DTrace accesses t_cred in probe context.  t_cred
			 * must always be either NULL, or point to a valid,
			 * allocated cred structure.
			 */
			oldcred = curthread->t_cred;
			curthread->t_cred = cred;
			crfree(oldcred);

			if (priv_basic_test >= 0 &&
			    !PRIV_ISASSERT(&CR_IPRIV(newcred),
			    priv_basic_test)) {
				pid_t pid = pp->p_pid;
				char *fn = PTOU(pp)->u_comm;

				cmn_err(CE_WARN, "%s[%d]: exec: basic_test "
				    "privilege removed from E/I", fn, pid);
			}
		}
		/*
		 * On emerging from a successful exec(), the saved
		 * uid and gid equal the effective uid and gid.
		 */
		cred->cr_suid = cred->cr_uid;
		cred->cr_sgid = cred->cr_gid;

		/*
		 * If the real and effective ids do not match, this
		 * is a setuid process that should not dump core.
		 * The group comparison is tricky; we prevent the code
		 * from flagging SNOCD when executing with an effective gid
		 * which is a supplementary group.
		 */
		if (cred->cr_ruid != cred->cr_uid ||
		    (cred->cr_rgid != cred->cr_gid &&
		    !supgroupmember(cred->cr_gid, cred)) ||
		    (privflags & PRIV_INCREASE) != 0)
			suidflags = PSUIDFLAGS;
		else
			suidflags = 0;

		mutex_exit(&pp->p_crlock);
		if (newcred != NULL && oruid != newcred->cr_ruid) {
			/* Note that the process remains in the same zone. */
			mutex_enter(&pidlock);
			upcount_dec(oruid, crgetzoneid(newcred));
			upcount_inc(newcred->cr_ruid, crgetzoneid(newcred));
			mutex_exit(&pidlock);
		}
		if (suidflags) {
			mutex_enter(&pp->p_lock);
			pp->p_flag |= suidflags;
			mutex_exit(&pp->p_lock);
		}
		if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
			/*
			 * If process is traced via /proc, arrange to
			 * invalidate the associated /proc vnode.
			 */
			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
				args->traceinval = 1;
		}
		if (pp->p_proc_flag & P_PR_PTRACE)
			psignal(pp, SIGTRAP);
		if (args->traceinval)
			prinvalidate(&pp->p_user);
	}
	if (execvp)
		VN_RELE(execvp);
	return (0);

bad:
	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, cred, NULL);

bad_noclose:
	if (newcred != NULL)
		crfree(newcred);
	if (error == 0)
		error = ENOEXEC;

	mutex_enter(&pp->p_lock);
	if (suidflags) {
		pp->p_flag |= suidflags;
	}
	/*
	 * Restore the effective secflags, to maintain the invariant they
	 * never change for a given process
	 */
	secflags_copy(&pp->p_secflags.psf_effective, &old_secflags);
	mutex_exit(&pp->p_lock);

	return (error);
}
999 
1000 extern char *execswnames[];
1001 
1002 struct execsw *
1003 allocate_execsw(char *name, char *magic, size_t magic_size)
1004 {
1005 	int i, j;
1006 	char *ename;
1007 	char *magicp;
1008 
1009 	mutex_enter(&execsw_lock);
1010 	for (i = 0; i < nexectype; i++) {
1011 		if (execswnames[i] == NULL) {
1012 			ename = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1013 			(void) strcpy(ename, name);
1014 			execswnames[i] = ename;
1015 			/*
1016 			 * Set the magic number last so that we
1017 			 * don't need to hold the execsw_lock in
1018 			 * findexectype().
1019 			 */
1020 			magicp = kmem_alloc(magic_size, KM_SLEEP);
1021 			for (j = 0; j < magic_size; j++)
1022 				magicp[j] = magic[j];
1023 			execsw[i].exec_magic = magicp;
1024 			mutex_exit(&execsw_lock);
1025 			return (&execsw[i]);
1026 		}
1027 	}
1028 	mutex_exit(&execsw_lock);
1029 	return (NULL);
1030 }
1031 
1032 /*
1033  * Find the exec switch table entry with the corresponding magic string.
1034  */
1035 struct execsw *
1036 findexecsw(char *magic)
1037 {
1038 	struct execsw *eswp;
1039 
1040 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
1041 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
1042 		if (magic && eswp->exec_maglen != 0 &&
1043 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0)
1044 			return (eswp);
1045 	}
1046 	return (NULL);
1047 }
1048 
1049 /*
1050  * Find the execsw[] index for the given exec header string by looking for the
1051  * magic string at a specified offset and length for each kind of executable
1052  * file format until one matches.  If no execsw[] entry is found, try to
1053  * autoload a module for this magic string.
1054  */
1055 struct execsw *
1056 findexec_by_hdr(char *header)
1057 {
1058 	struct execsw *eswp;
1059 
1060 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
1061 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
1062 		if (header && eswp->exec_maglen != 0 &&
1063 		    bcmp(&header[eswp->exec_magoff], eswp->exec_magic,
1064 		    eswp->exec_maglen) == 0) {
1065 			if (hold_execsw(eswp) != 0)
1066 				return (NULL);
1067 			return (eswp);
1068 		}
1069 	}
1070 	return (NULL);	/* couldn't find the type */
1071 }
1072 
1073 /*
1074  * Find the execsw[] index for the given magic string.  If no execsw[] entry
1075  * is found, try to autoload a module for this magic string.
1076  */
1077 struct execsw *
1078 findexec_by_magic(char *magic)
1079 {
1080 	struct execsw *eswp;
1081 
1082 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
1083 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
1084 		if (magic && eswp->exec_maglen != 0 &&
1085 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
1086 			if (hold_execsw(eswp) != 0)
1087 				return (NULL);
1088 			return (eswp);
1089 		}
1090 	}
1091 	return (NULL);	/* couldn't find the type */
1092 }
1093 
/*
 * Ensure the exec module backing eswp is loaded, loading it on demand via
 * modload().  On success, returns 0 with eswp->exec_lock held as reader
 * (keeping the module from unloading; the caller must drop the lock).
 * Returns -1, with the lock NOT held, if the module cannot be loaded.
 */
static int
hold_execsw(struct execsw *eswp)
{
	char *name;

	rw_enter(eswp->exec_lock, RW_READER);
	while (!LOADED_EXEC(eswp)) {
		/* Drop the lock across the (potentially sleeping) load. */
		rw_exit(eswp->exec_lock);
		name = execswnames[eswp-execsw];
		ASSERT(name);
		if (modload("exec", name) == -1)
			return (-1);
		/* Reacquire and re-check: the module may have gone away. */
		rw_enter(eswp->exec_lock, RW_READER);
	}
	return (0);
}
1110 
/*
 * Determine the set-id and privilege consequences of exec-ing the file
 * whose attributes are in vattrp, under credential cr.  Returns a bitmask
 * of flags (PRIV_RESET, PRIV_SETUGID, PRIV_FORCED, PRIV_INCREASE,
 * PRIV_SETID, MAC_FLAGS) describing the credential updates the caller
 * must perform.  When PRIV_SETID is set in the result, *uidp and *gidp
 * hold the uid/gid the new credential should assume; otherwise they are
 * left untouched.  For set-uid root files, fset is passed to
 * get_forced_privs() to receive any forced privilege set.
 */
static int
execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp,
    priv_set_t *fset, cred_t *cr, const char *pathname)
{
	proc_t *pp = ttoproc(curthread);
	uid_t uid, gid;
	int privflags = 0;

	/*
	 * Remember credentials.
	 */
	uid = cr->cr_uid;
	gid = cr->cr_gid;

	/* Will try to reset the PRIV_AWARE bit later. */
	if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
		privflags |= PRIV_RESET;

	/* set-uid/set-gid bits are honored only if the filesystem allows. */
	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
		/*
		 * If it's a set-uid root program we perform the
		 * forced privilege look-aside. This has three possible
		 * outcomes:
		 *	no look aside information -> treat as before
		 *	look aside in Limit set -> apply forced privs
		 *	look aside not in Limit set -> ignore set-uid root
		 *
		 * Ordinary set-uid root execution only allowed if the limit
		 * set holds all unsafe privileges.
		 */
		if (vattrp->va_mode & VSUID) {
			if (vattrp->va_uid == 0) {
				int res = get_forced_privs(cr, pathname, fset);

				switch (res) {
				case -1:
					if (priv_issubset(&priv_unsafe,
					    &CR_LPRIV(cr))) {
						uid = vattrp->va_uid;
						privflags |= PRIV_SETUGID;
					}
					break;
				case 0:
					privflags |= PRIV_FORCED|PRIV_INCREASE;
					break;
				default:
					break;
				}
			} else {
				uid = vattrp->va_uid;
				privflags |= PRIV_SETUGID;
			}
		}
		if (vattrp->va_mode & VSGID) {
			gid = vattrp->va_gid;
			privflags |= PRIV_SETUGID;
		}
	}

	/*
	 * Do we need to change our credential anyway?
	 * This is the case when E != I or P != I, as
	 * we need to do the assignments (with F empty and A full)
	 * Or when I is not a subset of L; in that case we need to
	 * enforce L.
	 *
	 *		I' = L & I
	 *
	 *		E' = P' = (I' + F) & A
	 * or
	 *		E' = P' = I'
	 */
	if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
	    !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
	    !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
		privflags |= PRIV_RESET;

	/* Child has more privileges than parent */
	if (!priv_issubset(&CR_IPRIV(cr), &CR_PPRIV(cr)))
		privflags |= PRIV_INCREASE;

	/* If MAC-aware flag(s) are on, need to update cred to remove. */
	if ((CR_FLAGS(cr) & NET_MAC_AWARE) ||
	    (CR_FLAGS(cr) & NET_MAC_AWARE_INHERIT))
		privflags |= MAC_FLAGS;
	/*
	 * Set setuid/setgid protections if no ptrace() compatibility.
	 * For privileged processes, honor setuid/setgid even in
	 * the presence of ptrace() compatibility.
	 */
	if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
	    PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
	    (cr->cr_uid != uid ||
	    cr->cr_gid != gid ||
	    cr->cr_suid != uid ||
	    cr->cr_sgid != gid)) {
		*uidp = uid;
		*gidp = gid;
		privflags |= PRIV_SETID;
	}
	return (privflags);
}
1213 
/*
 * Fetch the attributes of vp (AT_MODE|AT_UID|AT_GID|AT_SIZE into vattrp)
 * and verify that the current process is allowed to execute it.  On
 * success, returns 0; otherwise returns an errno (EACCES when the file
 * is not a regular/executable object or lives on a noexec filesystem).
 * If the process is traced via /proc and the file is not readable, sets
 * args->traceinval so the caller can invalidate the /proc vnode; if it
 * is under ptrace(2) compatibility instead, the exec fails.
 */
int
execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
{
	int error;
	proc_t *p = ttoproc(curthread);

	vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
	if (error = VOP_GETATTR(vp, vattrp, ATTR_EXEC, p->p_cred, NULL))
		return (error);
	/*
	 * Check the access mode.
	 * If VPROC, ask /proc if the file is an object file.
	 */
	if ((error = VOP_ACCESS(vp, VEXEC, 0, p->p_cred, NULL)) != 0 ||
	    !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
	    (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
	    (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
		if (error == 0)
			error = EACCES;
		return (error);
	}

	if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
	    (error = VOP_ACCESS(vp, VREAD, 0, p->p_cred, NULL))) {
		/*
		 * If process is under ptrace(2) compatibility,
		 * fail the exec(2).
		 */
		if (p->p_proc_flag & P_PR_PTRACE)
			goto bad;
		/*
		 * Process is traced via /proc.
		 * Arrange to invalidate the /proc vnode.
		 */
		args->traceinval = 1;
	}
	return (0);
bad:
	if (error == 0)
		error = ENOEXEC;
	return (error);
}
1256 
/*
 * Map a section of an executable file into the user's
 * address space.
 *
 *	vp	- vnode of the executable file
 *	addr	- user virtual address where the section belongs
 *	len	- number of file bytes to map (0 for a zero-fill-only area)
 *	zfodlen	- bytes of zero-fill-on-demand space beyond the file data
 *	offset	- file offset of the section data
 *	prot	- final protections for the mapping
 *	page	- nonzero: map via VOP_MAP (demand paged); zero: create a
 *		  zfod segment and read the data in with vn_rdwr()
 *	szc	- preferred page size code for the zfod portion
 *
 * Returns 0 on success, or an errno on failure (the caller handles
 * cleanup of any partial mappings).
 */
int
execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
    off_t offset, int prot, int page, uint_t szc)
{
	int error = 0;
	off_t oldoffset;
	caddr_t zfodbase, oldaddr;
	size_t end, oldlen;
	size_t zfoddiff;
	label_t ljb;
	proc_t *p = ttoproc(curthread);

	/* Page-align the address and offset, remembering the originals. */
	oldaddr = addr;
	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	if (len) {
		oldlen = len;
		len += ((size_t)oldaddr - (size_t)addr);
		oldoffset = offset;
		offset = (off_t)((uintptr_t)offset & PAGEMASK);
		if (page) {
			spgcnt_t  prefltmem, availm, npages;
			int preread;
			uint_t mflag = MAP_PRIVATE | MAP_FIXED;

			if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
				mflag |= MAP_TEXT;
			} else {
				mflag |= MAP_INITDATA;
			}

			if (valid_usr_range(addr, len, prot, p->p_as,
			    p->p_as->a_userlimit) != RANGE_OKAY) {
				error = ENOMEM;
				goto bad;
			}
			if (error = VOP_MAP(vp, (offset_t)offset,
			    p->p_as, &addr, len, prot, PROT_ALL,
			    mflag, CRED(), NULL))
				goto bad;

			/*
			 * If the segment can fit, then we prefault
			 * the entire segment in.  This is based on the
			 * model that says the best working set of a
			 * small program is all of its pages.
			 */
			npages = (spgcnt_t)btopr(len);
			prefltmem = freemem - desfree;
			preread =
			    (npages < prefltmem && len < PGTHRESH) ? 1 : 0;

			/*
			 * If we aren't prefaulting the segment,
			 * increment "deficit", if necessary to ensure
			 * that pages will become available when this
			 * process starts executing.
			 */
			availm = freemem - lotsfree;
			if (preread == 0 && npages > availm &&
			    deficit < lotsfree) {
				deficit += MIN((pgcnt_t)(npages - availm),
				    lotsfree - deficit);
			}

			if (preread) {
				TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
				    "execmap preread:freemem %d size %lu",
				    freemem, len);
				(void) as_fault(p->p_as->a_hat, p->p_as,
				    (caddr_t)addr, len, F_INVAL, S_READ);
			}
		} else {
			if (valid_usr_range(addr, len, prot, p->p_as,
			    p->p_as->a_userlimit) != RANGE_OKAY) {
				error = ENOMEM;
				goto bad;
			}

			if (error = as_map(p->p_as, addr, len,
			    segvn_create, zfod_argsp))
				goto bad;
			/*
			 * Read in the segment in one big chunk.
			 */
			if (error = vn_rdwr(UIO_READ, vp, (caddr_t)oldaddr,
			    oldlen, (offset_t)oldoffset, UIO_USERSPACE, 0,
			    (rlim64_t)0, CRED(), (ssize_t *)0))
				goto bad;
			/*
			 * Now set protections.
			 */
			if (prot != PROT_ZFOD) {
				(void) as_setprot(p->p_as, (caddr_t)addr,
				    len, prot);
			}
		}
	}

	if (zfodlen) {
		struct as *as = curproc->p_as;
		struct seg *seg;
		uint_t zprot = 0;

		/* Zero the tail of the last data page by hand ... */
		end = (size_t)addr + len;
		zfodbase = (caddr_t)roundup(end, PAGESIZE);
		zfoddiff = (uintptr_t)zfodbase - end;
		if (zfoddiff) {
			/*
			 * Before we go to zero the remaining space on the last
			 * page, make sure we have write permission.
			 *
			 * Normal illumos binaries don't even hit the case
			 * where we have to change permission on the last page
			 * since their protection is typically either
			 *    PROT_USER | PROT_WRITE | PROT_READ
			 * or
			 *    PROT_ZFOD (same as PROT_ALL).
			 *
			 * We need to be careful how we zero-fill the last page
			 * if the segment protection does not include
			 * PROT_WRITE. Using as_setprot() can cause the VM
			 * segment code to call segvn_vpage(), which must
			 * allocate a page struct for each page in the segment.
			 * If we have a very large segment, this may fail, so
			 * we have to check for that, even though we ignore
			 * other return values from as_setprot.
			 */

			AS_LOCK_ENTER(as, RW_READER);
			seg = as_segat(curproc->p_as, (caddr_t)end);
			if (seg != NULL)
				SEGOP_GETPROT(seg, (caddr_t)end, zfoddiff - 1,
				    &zprot);
			AS_LOCK_EXIT(as);

			if (seg != NULL && (zprot & PROT_WRITE) == 0) {
				if (as_setprot(as, (caddr_t)end, zfoddiff - 1,
				    zprot | PROT_WRITE) == ENOMEM) {
					error = ENOMEM;
					goto bad;
				}
			}

			if (on_fault(&ljb)) {
				no_fault();
				/* Restore the original protections on error. */
				if (seg != NULL && (zprot & PROT_WRITE) == 0)
					(void) as_setprot(as, (caddr_t)end,
					    zfoddiff - 1, zprot);
				error = EFAULT;
				goto bad;
			}
			uzero((void *)end, zfoddiff);
			no_fault();
			if (seg != NULL && (zprot & PROT_WRITE) == 0)
				(void) as_setprot(as, (caddr_t)end,
				    zfoddiff - 1, zprot);
		}
		/* ... and map whole zfod pages for the rest (e.g. bss). */
		if (zfodlen > zfoddiff) {
			struct segvn_crargs crargs =
			    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

			zfodlen -= zfoddiff;
			if (valid_usr_range(zfodbase, zfodlen, prot, p->p_as,
			    p->p_as->a_userlimit) != RANGE_OKAY) {
				error = ENOMEM;
				goto bad;
			}
			if (szc > 0) {
				/*
				 * ASSERT alignment because the mapelfexec()
				 * caller for the szc > 0 case extended zfod
				 * so it's end is pgsz aligned.
				 */
				size_t pgsz = page_get_pagesize(szc);
				ASSERT(IS_P2ALIGNED(zfodbase + zfodlen, pgsz));

				if (IS_P2ALIGNED(zfodbase, pgsz)) {
					crargs.szc = szc;
				} else {
					crargs.szc = AS_MAP_HEAP;
				}
			} else {
				crargs.szc = AS_MAP_NO_LPOOB;
			}
			if (error = as_map(p->p_as, (caddr_t)zfodbase,
			    zfodlen, segvn_create, &crargs))
				goto bad;
			if (prot != PROT_ZFOD) {
				(void) as_setprot(p->p_as, (caddr_t)zfodbase,
				    zfodlen, prot);
			}
		}
	}
	return (0);
bad:
	return (error);
}
1458 
1459 void
1460 setexecenv(struct execenv *ep)
1461 {
1462 	proc_t *p = ttoproc(curthread);
1463 	klwp_t *lwp = ttolwp(curthread);
1464 	struct vnode *vp;
1465 
1466 	p->p_bssbase = ep->ex_bssbase;
1467 	p->p_brkbase = ep->ex_brkbase;
1468 	p->p_brksize = ep->ex_brksize;
1469 	if (p->p_exec)
1470 		VN_RELE(p->p_exec);	/* out with the old */
1471 	vp = p->p_exec = ep->ex_vp;
1472 	if (vp != NULL)
1473 		VN_HOLD(vp);		/* in with the new */
1474 
1475 	lwp->lwp_sigaltstack.ss_sp = 0;
1476 	lwp->lwp_sigaltstack.ss_size = 0;
1477 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
1478 }
1479 
/*
 * Allocate a file descriptor for the executable's vnode and open it
 * read-only.  On success, *fdp holds the new descriptor, *vpp is
 * unchanged, and 0 is returned.  On failure, the open reference is
 * released, *fdp is set to -1, and an errno is returned.
 */
int
execopen(struct vnode **vpp, int *fdp)
{
	struct vnode *vp = *vpp;
	file_t *fp;
	int error = 0;
	int filemode = FREAD;

	VN_HOLD(vp);		/* open reference */
	if (error = falloc(NULL, filemode, &fp, fdp)) {
		VN_RELE(vp);
		*fdp = -1;	/* just in case falloc changed value */
		return (error);
	}
	if (error = VOP_OPEN(&vp, filemode, CRED(), NULL)) {
		VN_RELE(vp);
		/* Undo the descriptor allocation done by falloc(). */
		setf(*fdp, NULL);
		unfalloc(fp);
		*fdp = -1;
		return (error);
	}
	*vpp = vp;		/* vnode should not have changed */
	fp->f_vnode = vp;
	/* falloc() returned fp with f_tlock held; release and install. */
	mutex_exit(&fp->f_tlock);
	setf(*fdp, fp);
	return (0);
}
1507 
/*
 * Release the file descriptor allocated by execopen(), closing the
 * underlying file.
 */
int
execclose(int fd)
{
	return (closeandsetf(fd, NULL));
}
1513 
1514 
/*
 * noexec stub function: warns that the exec capability (module) for
 * this file is missing and fails the exec with ENOEXEC.
 */
/*ARGSUSED*/
int
noexec(
    struct vnode *vp,
    struct execa *uap,
    struct uarg *args,
    struct intpdata *idatap,
    int level,
    size_t *execsz,
    int setid,
    caddr_t exec_file,
    struct cred *cred)
{
	cmn_err(CE_WARN, "missing exec capability for %s", uap->fname);
	return (ENOEXEC);
}
1534 
1535 /*
1536  * Support routines for building a user stack.
1537  *
1538  * execve(path, argv, envp) must construct a new stack with the specified
1539  * arguments and environment variables (see exec_args() for a description
1540  * of the user stack layout).  To do this, we copy the arguments and
1541  * environment variables from the old user address space into the kernel,
1542  * free the old as, create the new as, and copy our buffered information
1543  * to the new stack.  Our kernel buffer has the following structure:
1544  *
1545  *	+-----------------------+ <--- stk_base + stk_size
1546  *	| string offsets	|
1547  *	+-----------------------+ <--- stk_offp
1548  *	|			|
1549  *	| STK_AVAIL() space	|
1550  *	|			|
1551  *	+-----------------------+ <--- stk_strp
1552  *	| strings		|
1553  *	+-----------------------+ <--- stk_base
1554  *
1555  * When we add a string, we store the string's contents (including the null
1556  * terminator) at stk_strp, and we store the offset of the string relative to
1557  * stk_base at --stk_offp.  At strings are added, stk_strp increases and
1558  * stk_offp decreases.  The amount of space remaining, STK_AVAIL(), is just
1559  * the difference between these pointers.  If we run out of space, we return
1560  * an error and exec_args() starts all over again with a buffer twice as large.
1561  * When we're all done, the kernel buffer looks like this:
1562  *
1563  *	+-----------------------+ <--- stk_base + stk_size
1564  *	| argv[0] offset	|
1565  *	+-----------------------+
1566  *	| ...			|
1567  *	+-----------------------+
1568  *	| argv[argc-1] offset	|
1569  *	+-----------------------+
1570  *	| envp[0] offset	|
1571  *	+-----------------------+
1572  *	| ...			|
1573  *	+-----------------------+
1574  *	| envp[envc-1] offset	|
1575  *	+-----------------------+
1576  *	| AT_SUN_PLATFORM offset|
1577  *	+-----------------------+
1578  *	| AT_SUN_EXECNAME offset|
1579  *	+-----------------------+ <--- stk_offp
1580  *	|			|
1581  *	| STK_AVAIL() space	|
1582  *	|			|
1583  *	+-----------------------+ <--- stk_strp
1584  *	| AT_SUN_EXECNAME offset|
1585  *	+-----------------------+
1586  *	| AT_SUN_PLATFORM offset|
1587  *	+-----------------------+
1588  *	| envp[envc-1] string	|
1589  *	+-----------------------+
1590  *	| ...			|
1591  *	+-----------------------+
1592  *	| envp[0] string	|
1593  *	+-----------------------+
1594  *	| argv[argc-1] string	|
1595  *	+-----------------------+
1596  *	| ...			|
1597  *	+-----------------------+
1598  *	| argv[0] string	|
1599  *	+-----------------------+ <--- stk_base
1600  */
1601 
1602 #define	STK_AVAIL(args)		((char *)(args)->stk_offp - (args)->stk_strp)
1603 
1604 /*
1605  * Add a string to the stack.
1606  */
1607 static int
1608 stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
1609 {
1610 	int error;
1611 	size_t len;
1612 
1613 	if (STK_AVAIL(args) < sizeof (int))
1614 		return (E2BIG);
1615 	*--args->stk_offp = args->stk_strp - args->stk_base;
1616 
1617 	if (segflg == UIO_USERSPACE) {
1618 		error = copyinstr(sp, args->stk_strp, STK_AVAIL(args), &len);
1619 		if (error != 0)
1620 			return (error);
1621 	} else {
1622 		len = strlen(sp) + 1;
1623 		if (len > STK_AVAIL(args))
1624 			return (E2BIG);
1625 		bcopy(sp, args->stk_strp, len);
1626 	}
1627 
1628 	args->stk_strp += len;
1629 
1630 	return (0);
1631 }
1632 
1633 static int
1634 stk_getptr(uarg_t *args, char *src, char **dst)
1635 {
1636 	int error;
1637 
1638 	if (args->from_model == DATAMODEL_NATIVE) {
1639 		ulong_t ptr;
1640 		error = fulword(src, &ptr);
1641 		*dst = (caddr_t)ptr;
1642 	} else {
1643 		uint32_t ptr;
1644 		error = fuword32(src, &ptr);
1645 		*dst = (caddr_t)(uintptr_t)ptr;
1646 	}
1647 	return (error);
1648 }
1649 
1650 static int
1651 stk_putptr(uarg_t *args, char *addr, char *value)
1652 {
1653 	if (args->to_model == DATAMODEL_NATIVE)
1654 		return (sulword(addr, (ulong_t)value));
1655 	else
1656 		return (suword32(addr, (uint32_t)(uintptr_t)value));
1657 }
1658 
/*
 * Copy the exec arguments -- interpreter name(s)/argument(s), argv[],
 * envp[], and the auxv strings -- from the old address space into the
 * kernel buffer described by args (see the layout diagram above), and
 * compute the final user stack size (args->usrstack_size), padding the
 * string area so the stack stays aligned.  Returns 0 on success, E2BIG
 * when the buffer is too small (the caller retries with a larger one),
 * or another errno on copyin failure.
 */
static int
stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
{
	char *sp;
	int argc, error;
	int argv_empty = 0;
	size_t ptrsize = args->from_ptrsize;
	size_t size, pad;
	char *argv = (char *)uap->argp;
	char *envp = (char *)uap->envp;

	/*
	 * Copy interpreter's name and argument to argv[0] and argv[1].
	 * In the rare case that we have nested interpreters then those names
	 * and arguments are also copied to the subsequent slots in argv.
	 */
	if (intp != NULL && intp->intp_name[0] != NULL) {
		int i;

		for (i = 0; i < INTP_MAXDEPTH; i++) {
			if (intp->intp_name[i] == NULL)
				break;
			error = stk_add(args, intp->intp_name[i], UIO_SYSSPACE);
			if (error != 0)
				return (error);
			if (intp->intp_arg[i] != NULL) {
				error = stk_add(args, intp->intp_arg[i],
				    UIO_SYSSPACE);
				if (error != 0)
					return (error);
			}
		}

		/* The script path itself becomes the next argument. */
		if (args->fname != NULL)
			error = stk_add(args, args->fname, UIO_SYSSPACE);
		else
			error = stk_add(args, uap->fname, UIO_USERSPACE);
		if (error)
			return (error);

		/*
		 * Check for an empty argv[].
		 */
		if (stk_getptr(args, argv, &sp))
			return (EFAULT);
		if (sp == NULL)
			argv_empty = 1;

		argv += ptrsize;		/* ignore original argv[0] */
	}

	if (argv_empty == 0) {
		/*
		 * Add argv[] strings to the stack.
		 */
		for (;;) {
			if (stk_getptr(args, argv, &sp))
				return (EFAULT);
			if (sp == NULL)
				break;
			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
				return (error);
			argv += ptrsize;
		}
	}
	/* argc = number of offsets pushed so far (all are arguments). */
	argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
	args->arglen = args->stk_strp - args->stk_base;

	/*
	 * Add environ[] strings to the stack.
	 */
	if (envp != NULL) {
		for (;;) {
			char *tmp = args->stk_strp;
			if (stk_getptr(args, envp, &sp))
				return (EFAULT);
			if (sp == NULL)
				break;
			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
				return (error);
			/* Drop LD_* variables when scrubbing (setid exec). */
			if (args->scrubenv && strncmp(tmp, "LD_", 3) == 0) {
				/* Undo the copied string */
				args->stk_strp = tmp;
				*(args->stk_offp++) = 0;
			}
			envp += ptrsize;
		}
	}
	args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
	args->ne = args->na - argc;

	/*
	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
	 * AT_SUN_EMULATOR strings to the stack.
	 */
	if (auxvpp != NULL && *auxvpp != NULL) {
		if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
			return (error);
		if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0)
			return (error);
		if (args->brandname != NULL &&
		    (error = stk_add(args, args->brandname, UIO_SYSSPACE)) != 0)
			return (error);
		if (args->emulator != NULL &&
		    (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)
			return (error);
	}

	/*
	 * Compute the size of the stack.  This includes all the pointers,
	 * the space reserved for the aux vector, and all the strings.
	 * The total number of pointers is args->na (which is argc + envc)
	 * plus 4 more: (1) a pointer's worth of space for argc; (2) the NULL
	 * after the last argument (i.e. argv[argc]); (3) the NULL after the
	 * last environment variable (i.e. envp[envc]); and (4) the NULL after
	 * all the strings, at the very top of the stack.
	 */
	size = (args->na + 4) * args->to_ptrsize + args->auxsize +
	    (args->stk_strp - args->stk_base);

	/*
	 * Pad the string section with zeroes to align the stack size.
	 */
	pad = P2NPHASE(size, args->stk_align);

	if (STK_AVAIL(args) < pad)
		return (E2BIG);

	args->usrstack_size = size + pad;

	while (pad-- != 0)
		*args->stk_strp++ = 0;

	args->nc = args->stk_strp - args->stk_base;

	return (0);
}
1796 
/*
 * Copy the buffered stack image built by stk_copyin() out to the new
 * user stack at usrstack: argc, the argv[] and envp[] pointer arrays,
 * and all the strings.  Records argc/argv/envp in the user area for
 * /proc, copies the leading argument bytes to u_psargs, and fills in
 * the auxv entries whose values are user stack string addresses
 * (AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, AT_SUN_EMULATOR).
 * Returns 0 on success, -1 on copyout failure.
 */
static int
stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
{
	size_t ptrsize = args->to_ptrsize;
	ssize_t pslen;
	char *kstrp = args->stk_base;
	char *ustrp = usrstack - args->nc - ptrsize;
	char *usp = usrstack - args->usrstack_size;
	int *offp = (int *)(args->stk_base + args->stk_size);
	int envc = args->ne;
	int argc = args->na - envc;
	int i;

	/*
	 * Record argc for /proc.
	 */
	up->u_argc = argc;

	/*
	 * Put argc on the stack.  Note that even though it's an int,
	 * it always consumes ptrsize bytes (for alignment).
	 */
	if (stk_putptr(args, usp, (char *)(uintptr_t)argc))
		return (-1);

	/*
	 * Add argc space (ptrsize) to usp and record argv for /proc.
	 */
	up->u_argv = (uintptr_t)(usp += ptrsize);

	/*
	 * Put the argv[] pointers on the stack.
	 */
	for (i = 0; i < argc; i++, usp += ptrsize)
		if (stk_putptr(args, usp, &ustrp[*--offp]))
			return (-1);

	/*
	 * Copy arguments to u_psargs.
	 */
	pslen = MIN(args->arglen, PSARGSZ) - 1;
	/* NUL separators between arguments become spaces for ps(1). */
	for (i = 0; i < pslen; i++)
		up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]);
	while (i < PSARGSZ)
		up->u_psargs[i++] = '\0';

	/*
	 * Add space for argv[]'s NULL terminator (ptrsize) to usp and
	 * record envp for /proc.
	 */
	up->u_envp = (uintptr_t)(usp += ptrsize);

	/*
	 * Put the envp[] pointers on the stack.
	 */
	for (i = 0; i < envc; i++, usp += ptrsize)
		if (stk_putptr(args, usp, &ustrp[*--offp]))
			return (-1);

	/*
	 * Add space for envp[]'s NULL terminator (ptrsize) to usp and
	 * remember where the stack ends, which is also where auxv begins.
	 */
	args->stackend = usp += ptrsize;

	/*
	 * Put all the argv[], envp[], and auxv strings on the stack.
	 */
	if (copyout(args->stk_base, ustrp, args->nc))
		return (-1);

	/*
	 * Fill in the aux vector now that we know the user stack addresses
	 * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
	 * AT_SUN_EMULATOR strings.
	 */
	if (auxvpp != NULL && *auxvpp != NULL) {
		if (args->to_model == DATAMODEL_NATIVE) {
			auxv_t **a = (auxv_t **)auxvpp;
			ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp])
			ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp])
			if (args->brandname != NULL)
				ADDAUX(*a,
				    AT_SUN_BRANDNAME, (long)&ustrp[*--offp])
			if (args->emulator != NULL)
				ADDAUX(*a,
				    AT_SUN_EMULATOR, (long)&ustrp[*--offp])
		} else {
			auxv32_t **a = (auxv32_t **)auxvpp;
			ADDAUX(*a,
			    AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp])
			ADDAUX(*a,
			    AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp])
			if (args->brandname != NULL)
				ADDAUX(*a, AT_SUN_BRANDNAME,
				    (int)(uintptr_t)&ustrp[*--offp])
			if (args->emulator != NULL)
				ADDAUX(*a, AT_SUN_EMULATOR,
				    (int)(uintptr_t)&ustrp[*--offp])
		}
	}

	return (0);
}
1901 
1902 /*
1903  * Though the actual stack base is constant, slew the %sp by a random aligned
1904  * amount in [0,aslr_max_stack_skew).  Mostly, this makes life slightly more
1905  * complicated for buffer overflows hoping to overwrite the return address.
1906  *
1907  * On some platforms this helps avoid cache thrashing when identical processes
1908  * simultaneously share caches that don't provide enough associativity
1909  * (e.g. sun4v systems). In this case stack slewing makes the same hot stack
1910  * variables in different processes live in different cache sets increasing
1911  * effective associativity.
1912  */
/*
 * Returns the stack-pointer slew (in bytes, stack-aligned via SA()) to
 * apply to the new image.  With ASLR enabled (and a nonzero
 * aslr_max_stack_skew), the slew is random in [0, aslr_max_stack_skew).
 * With ASLR disabled, sun4v systems still get a deterministic
 * round-robin "stack color" to reduce cache conflicts between identical
 * processes; other platforms get 0.
 */
size_t
exec_get_spslew(void)
{
#ifdef sun4v
	static uint_t sp_color_stride = 16;
	static uint_t sp_color_mask = 0x1f;
	static uint_t sp_current_color = (uint_t)-1;
#endif
	size_t off;

	ASSERT(ISP2(aslr_max_stack_skew));

	if ((aslr_max_stack_skew == 0) ||
	    !secflag_enabled(curproc, PROC_SEC_ASLR)) {
#ifdef sun4v
		/* No ASLR: fall back to cache-coloring the stack. */
		uint_t spcolor = atomic_inc_32_nv(&sp_current_color);
		return ((size_t)((spcolor & sp_color_mask) *
		    SA(sp_color_stride)));
#else
		return (0);
#endif
	}

	(void) random_get_pseudo_bytes((uint8_t *)&off, sizeof (off));
	return (SA(P2PHASE(off, aslr_max_stack_skew)));
}
1939 
1940 /*
1941  * Initialize a new user stack with the specified arguments and environment.
1942  * The initial user stack layout is as follows:
1943  *
1944  *	User Stack
1945  *	+---------------+
1946  *	|		|
1947  *	| stack guard	|
1948  *	| (64-bit only)	|
1949  *	|		|
1950  *	+...............+ <--- stack limit (base - curproc->p_stk_ctl)
1951  *	.		.
1952  *	.		.
1953  *	.		.
1954  *	+---------------+ <--- curproc->p_usrstack
1955  *	|		|
1956  *	| slew		|
1957  *	|		|
1958  *	+---------------+
1959  *	| NULL		|
1960  *	+---------------+
1961  *	|		|
1962  *	| auxv strings	|
1963  *	|		|
1964  *	+---------------+
1965  *	|		|
1966  *	| envp strings	|
1967  *	|		|
1968  *	+---------------+
1969  *	|		|
1970  *	| argv strings	|
1971  *	|		|
1972  *	+---------------+ <--- ustrp
1973  *	|		|
1974  *	| aux vector	|
1975  *	|		|
1976  *	+---------------+ <--- auxv
1977  *	| NULL		|
1978  *	+---------------+
1979  *	| envp[envc-1]	|
1980  *	+---------------+
1981  *	| ...		|
1982  *	+---------------+
1983  *	| envp[0]	|
1984  *	+---------------+ <--- envp[]
1985  *	| NULL		|
1986  *	+---------------+
1987  *	| argv[argc-1]	|
1988  *	+---------------+
1989  *	| ...		|
1990  *	+---------------+
1991  *	| argv[0]	|
1992  *	+---------------+ <--- argv[]
1993  *	| argc		|
1994  *	+---------------+ <--- stack base
1995  *
1996  * In 64-bit processes, a stack guard segment is allocated at the address
1997  * immediately below where the stack limit ends.  This protects new library
1998  * mappings (such as the linker) from being placed in relatively dangerous
1999  * proximity to the stack.
2000  */
2001 int
2002 exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
2003 {
2004 	size_t size;
2005 	int error;
2006 	proc_t *p = ttoproc(curthread);
2007 	user_t *up = PTOU(p);
2008 	char *usrstack;
2009 	rctl_entity_p_t e;
2010 	struct as *as;
2011 	extern int use_stk_lpg;
2012 	size_t sp_slew;
2013 #if defined(_LP64)
2014 	const size_t sg_sz = (stack_guard_seg_sz & PAGEMASK);
2015 #endif /* defined(_LP64) */
2016 
2017 	args->from_model = p->p_model;
2018 	if (p->p_model == DATAMODEL_NATIVE) {
2019 		args->from_ptrsize = sizeof (long);
2020 	} else {
2021 		args->from_ptrsize = sizeof (int32_t);
2022 	}
2023 
2024 	if (args->to_model == DATAMODEL_NATIVE) {
2025 		args->to_ptrsize = sizeof (long);
2026 		args->ncargs = NCARGS;
2027 		args->stk_align = STACK_ALIGN;
2028 		if (args->addr32)
			usrstack = (char *)USRSTACK64_32;
		else
			usrstack = (char *)USRSTACK;
	} else {
		/* ILP32 target: 32-bit pointers, limits and alignment. */
		args->to_ptrsize = sizeof (int32_t);
		args->ncargs = NCARGS32;
		args->stk_align = STACK_ALIGN32;
		usrstack = (char *)USRSTACK32;
	}

	/* The stack top must be aligned for the target data model. */
	ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);

#if defined(__sparc)
	/*
	 * Make sure user register windows are empty before
	 * attempting to make a new stack.
	 */
	(void) flush_user_windows_to_stack(NULL);
#endif

	/*
	 * Copy the argument and environment strings into a kernel
	 * buffer, doubling the buffer size on E2BIG/ENAMETOOLONG until
	 * everything fits or the per-model limit (args->ncargs) is
	 * exceeded.  stk_strp starts at the bottom of the buffer and
	 * the offset array (stk_offp) at the top; stk_copyin() fills
	 * both in.  On any other error, or on overflow of ncargs, the
	 * buffer is freed and we bail out.
	 */
	for (size = PAGESIZE; ; size *= 2) {
		args->stk_size = size;
		args->stk_base = kmem_alloc(size, KM_SLEEP);
		args->stk_strp = args->stk_base;
		args->stk_offp = (int *)(args->stk_base + size);
		error = stk_copyin(uap, args, intp, auxvpp);
		if (error == 0)
			break;
		kmem_free(args->stk_base, size);
		if (error != E2BIG && error != ENAMETOOLONG)
			return (error);
		if (size >= args->ncargs)
			return (E2BIG);
	}

	/*
	 * Space the initial stack image will occupy in the user address
	 * space, as determined during stk_copyin() above.
	 */
	size = args->usrstack_size;

	ASSERT(error == 0);
	ASSERT(P2PHASE(size, args->stk_align) == 0);
	ASSERT((ssize_t)STK_AVAIL(args) >= 0);

	/* Even after growing the buffer, the image must fit in ncargs. */
	if (size > args->ncargs) {
		kmem_free(args->stk_base, args->stk_size);
		return (E2BIG);
	}

	/*
	 * Leave only the current lwp and force the other lwps to exit.
	 * If another lwp beat us to the punch by calling exit(), bail out.
	 */
	if ((error = exitlwps(0)) != 0) {
		kmem_free(args->stk_base, args->stk_size);
		return (error);
	}

	/*
	 * Revoke any doors created by the process.
	 */
	if (p->p_door_list)
		door_exit();

	/*
	 * Release schedctl data structures.
	 */
	if (p->p_pagep)
		schedctl_proc_cleanup();

	/*
	 * Clean up any DTrace helpers for the process.
	 */
	if (p->p_dtrace_helpers != NULL) {
		ASSERT(dtrace_helpers_cleanup != NULL);
		(*dtrace_helpers_cleanup)(p);
	}

	/* p_lock protects the fasttrap provider check/teardown. */
	mutex_enter(&p->p_lock);
	/*
	 * Cleanup the DTrace provider associated with this process.
	 */
	if (p->p_dtrace_probes) {
		ASSERT(dtrace_fasttrap_exec_ptr != NULL);
		dtrace_fasttrap_exec_ptr(p);
	}
	mutex_exit(&p->p_lock);

	/*
	 * discard the lwpchan cache.
	 */
	if (p->p_lcp != NULL)
		lwpchan_destroy_cache(1);

	/*
	 * Delete the POSIX timers.
	 */
	if (p->p_itimer != NULL)
		timer_exit();

	/*
	 * Delete the ITIMER_REALPROF interval timer.
	 * The other ITIMER_* interval timers are specified
	 * to be inherited across exec().
	 */
	delete_itimer_realprof();

	/*
	 * Record the exec arguments for auditing while the kernel copy
	 * of the strings (stk_base) is still around.
	 */
	if (AU_AUDITING())
		audit_exec(args->stk_base, args->stk_base + args->arglen,
		    args->na - args->ne, args->ne, args->pfcred);

	/*
	 * Ensure that we don't change resource associations while we
	 * change address spaces.
	 */
	mutex_enter(&p->p_lock);
	pool_barrier_enter();
	mutex_exit(&p->p_lock);

	/*
	 * Destroy the old address space and create a new one.
	 * From here on, any errors are fatal to the exec()ing process.
	 * On error we return -1, which means the caller must SIGKILL
	 * the process.
	 */
	relvm();

	mutex_enter(&p->p_lock);
	pool_barrier_exit();
	mutex_exit(&p->p_lock);

	up->u_execsw = args->execswp;

	/* Reset the per-process VM layout state for the new image. */
	p->p_brkbase = NULL;
	p->p_brksize = 0;
	p->p_brkpageszc = 0;
	p->p_stksize = 0;
	p->p_stkpageszc = 0;
	p->p_stkg_start = 0;
	p->p_stkg_end = 0;
	p->p_model = args->to_model;
	p->p_usrstack = usrstack;
	p->p_stkprot = args->stk_prot;
	p->p_datprot = args->dat_prot;

	/*
	 * Reset resource controls such that all controls are again active as
	 * well as appropriate to the potentially new address model for the
	 * process.
	 */
	e.rcep_p.proc = p;
	e.rcep_t = RCENTITY_PROCESS;
	rctl_set_reset(p->p_rctls, p, &e);

	/* Too early to call map_pgsz for the heap */
	if (use_stk_lpg) {
		p->p_stkpageszc = page_szc(map_pgsz(MAPPGSZ_STK, p, 0, 0, 0));
	}

	mutex_enter(&p->p_lock);
	p->p_flag |= SAUTOLPG;	/* kernel controls page sizes */
	mutex_exit(&p->p_lock);

	/*
	 * Apply the stack pointer slew (presumably a randomizing offset
	 * from exec_get_spslew() -- see its definition) before setting
	 * the initial stack pointer; it must preserve stack alignment.
	 */
	sp_slew = exec_get_spslew();
	ASSERT(P2PHASE(sp_slew, args->stk_align) == 0);
	/* Be certain we don't underflow */
	VERIFY((curproc->p_usrstack - (size + sp_slew)) < curproc->p_usrstack);
	exec_set_sp(size + sp_slew);

	/* Allocate the new address space and attach it to the process. */
	as = as_alloc();
	p->p_as = as;
	as->a_proc = p;
	if (p->p_model == DATAMODEL_ILP32 || args->addr32)
		as->a_userlimit = (caddr_t)USERLIMIT32;
	(void) hat_setup(as->a_hat, HAT_ALLOC);
	hat_join_srd(as->a_hat, args->ex_vp);

	/* Write out the contents of the new stack. */
	error = stk_copyout(args, usrstack - sp_slew, auxvpp, up);
	kmem_free(args->stk_base, args->stk_size);

#if defined(_LP64)
	/* Add stack guard segment (if needed) after successful copyout */
	if (error == 0 && p->p_model == DATAMODEL_LP64 && sg_sz != 0) {
		seghole_crargs_t sca;
		/* Guard sits just below the stack limit (p_stk_ctl). */
		caddr_t addr_end = (caddr_t)(((uintptr_t)usrstack -
		    p->p_stk_ctl) & PAGEMASK);
		caddr_t addr_start = addr_end - sg_sz;

		DTRACE_PROBE4(stack__guard__chk, proc_t *, p,
		    caddr_t, addr_start, caddr_t, addr_end, size_t, sg_sz);

		/*
		 * Reject guard placements that would overlap the stack,
		 * wrap around, or fall outside the valid user range.
		 * (stk_base was already freed above, so a bare return
		 * here leaks nothing.)
		 */
		if (addr_end >= usrstack || addr_start >= addr_end ||
		    valid_usr_range(addr_start, sg_sz, PROT_NONE, as,
		    as->a_userlimit) != RANGE_OKAY) {
			return (E2BIG);
		}

		/* Create un-mappable area in AS with seg_hole */
		sca.name = "stack_guard";
		error = as_map(as, addr_start, sg_sz, seghole_create, &sca);
		if (error == 0) {
			p->p_stkg_start = (uintptr_t)addr_start;
			p->p_stkg_end = (uintptr_t)addr_start + sg_sz;
		}
	}
#endif /* defined(_LP64) */

	return (error);
}
2236