xref: /titanic_50/usr/src/uts/common/os/exec.c (revision d8c870b0de5416eb7c3dbc6e97d93a26a5a5f299)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*	Copyright (c) 1988 AT&T	*/
29 /*	  All Rights Reserved  	*/
30 
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/systm.h>
36 #include <sys/signal.h>
37 #include <sys/cred_impl.h>
38 #include <sys/policy.h>
39 #include <sys/user.h>
40 #include <sys/errno.h>
41 #include <sys/file.h>
42 #include <sys/vfs.h>
43 #include <sys/vnode.h>
44 #include <sys/mman.h>
45 #include <sys/acct.h>
46 #include <sys/cpuvar.h>
47 #include <sys/proc.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/pathname.h>
51 #include <sys/vm.h>
52 #include <sys/vtrace.h>
53 #include <sys/exec.h>
54 #include <sys/exechdr.h>
55 #include <sys/kmem.h>
56 #include <sys/prsystm.h>
57 #include <sys/modctl.h>
58 #include <sys/vmparam.h>
59 #include <sys/schedctl.h>
60 #include <sys/utrap.h>
61 #include <sys/systeminfo.h>
62 #include <sys/stack.h>
63 #include <sys/rctl.h>
64 #include <sys/dtrace.h>
65 #include <sys/lwpchan_impl.h>
66 #include <sys/pool.h>
67 #include <sys/sdt.h>
68 #include <sys/brand.h>
69 
70 #include <c2/audit.h>
71 
72 #include <vm/hat.h>
73 #include <vm/anon.h>
74 #include <vm/as.h>
75 #include <vm/seg.h>
76 #include <vm/seg_vn.h>
77 
/*
 * Bits of the value returned by execsetid().  gexec() examines them to
 * decide which credential updates the exec requires.
 */
78 #define	PRIV_RESET		0x01	/* needs to reset privs */
79 #define	PRIV_SETID		0x02	/* needs to change uids */
80 #define	PRIV_SETUGID		0x04	/* is setuid/setgid/forced privs */
81 #define	PRIV_INCREASE		0x08	/* child runs with more privs */
82 #define	MAC_FLAGS		0x10	/* need to adjust MAC flags */
83 
/* Forward declarations of helpers that are private to this file. */
84 static int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *);
85 static int hold_execsw(struct execsw *);
86 
87 uint_t auxv_hwcap = 0;	/* auxv AT_SUN_HWCAP value; determined on the fly */
88 #if defined(_SYSCALL32_IMPL)
89 uint_t auxv_hwcap32 = 0;	/* 32-bit version of auxv_hwcap */
90 #endif
91 
/*
 * Process flags that gexec() clears at the start of a top-level exec.
 * They are set again from the saved value if the exec fails, or on
 * success when the resulting credentials call for them.
 */
92 #define	PSUIDFLAGS		(SNOCD|SUGID)
93 
94 /*
95  * exec() - wrapper around exece providing NULL environment pointer
96  */
97 int
98 exec(const char *fname, const char **argp)
99 {
100 	return (exece(fname, argp, NULL));
101 }
102 
103 /*
104  * exece() - system call wrapper around exec_common()
105  */
106 int
107 exece(const char *fname, const char **argp, const char **envp)
108 {
109 	int error;
110 
111 	error = exec_common(fname, argp, envp, EBA_NONE);
112 	return (error ? (set_errno(error)) : 0);
113 }
114 
/*
 * exec_common() - common body of the exec system calls.
 *
 * fname is a user-space path name (it is fetched with UIO_USERSPACE);
 * argp and envp are not interpreted here and are handed to gexec()
 * through a struct execa.  brand_action is one of the EBA_* values:
 * EBA_NONE requests no explicit brand change, EBA_NATIVE unbrands the
 * process once the exec has succeeded, and any other value brands the
 * process before gexec() runs.
 *
 * Returns 0 on success or an errno value on failure.  The value is
 * returned directly; this function does not call set_errno() itself
 * (exece() does that for the system call path).
 *
 * Between prexecstart() and prexecend() the signals in ignoredefault
 * are added to the calling thread's hold mask.  The mask saved on entry
 * is put back on both the success path and the error path.
 */
115 int
116 exec_common(const char *fname, const char **argp, const char **envp,
117     int brand_action)
118 {
119 	vnode_t *vp = NULL, *dir = NULL, *tmpvp = NULL;
120 	proc_t *p = ttoproc(curthread);
121 	klwp_t *lwp = ttolwp(curthread);
122 	struct user *up = PTOU(p);
123 	long execsz;		/* temporary count of exec size */
124 	int i;
125 	int error;
126 	char exec_file[MAXCOMLEN+1];
127 	struct pathname pn;
128 	struct pathname resolvepn;
129 	struct uarg args;
130 	struct execa ua;
131 	k_sigset_t savedmask;	/* t_hold as it was before this exec */
132 	lwpdir_t *lwpdir = NULL;	/* non-NULL only if new tables are needed */
133 	lwpdir_t **tidhash;
134 	lwpdir_t *old_lwpdir = NULL;
135 	uint_t old_lwpdir_sz;
136 	lwpdir_t **old_tidhash;
137 	uint_t old_tidhash_sz;
138 	lwpent_t *lep;
139 	int brandme = 0;	/* non-zero: brand the process before gexec() */
140 
141 	/*
142 	 * exec() is not supported for the /proc agent lwp.
143 	 */
144 	if (curthread == p->p_agenttp)
145 		return (ENOTSUP);
146 
	/* Any error reported by secpolicy_basic_exec() ends the exec here. */
147 	if ((error = secpolicy_basic_exec(CRED())) != 0)
148 		return (error);
149 
150 	if (brand_action != EBA_NONE) {
151 		/*
152 		 * Brand actions are not supported for processes that are not
153 		 * running in a branded zone.
154 		 */
155 		if (!ZONE_IS_BRANDED(p->p_zone))
156 			return (ENOTSUP);
157 
158 		if (brand_action == EBA_NATIVE) {
159 			/* Only branded processes can be unbranded */
160 			if (!PROC_IS_BRANDED(p))
161 				return (ENOTSUP);
162 		} else {
163 			/* Only unbranded processes can be branded */
164 			if (PROC_IS_BRANDED(p))
165 				return (ENOTSUP);
166 			brandme = 1;
167 		}
168 	} else {
169 		/*
170 		 * If this is a native zone, or if the process is already
171 		 * branded, then we don't need to do anything.  If this is
172 		 * a native process in a branded zone, we need to brand the
173 		 * process as it exec()s the new binary.
174 		 */
175 		if (ZONE_IS_BRANDED(p->p_zone) && !PROC_IS_BRANDED(p))
176 			brandme = 1;
177 	}
178 
179 	/*
180 	 * Inform /proc that an exec() has started.
181 	 * Hold signals that are ignored by default so that we will
182 	 * not be interrupted by a signal that will be ignored after
183 	 * successful completion of gexec().
184 	 */
185 	mutex_enter(&p->p_lock);
186 	prexecstart();
187 	schedctl_finish_sigblock(curthread);
188 	savedmask = curthread->t_hold;
189 	sigorset(&curthread->t_hold, &ignoredefault);
190 	mutex_exit(&p->p_lock);
191 
192 	/*
193 	 * Look up path name and remember last component for later.
194 	 * To help coreadm expand its %d token, we attempt to save
195 	 * the directory containing the executable in p_execdir. The
196 	 * first call to lookuppn() may fail and return EINVAL because
197 	 * dirvpp is non-NULL. In that case, we make a second call to
198 	 * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
199 	 * but coreadm is allowed to expand %d to the empty string and
200 	 * there are other cases in which that failure may occur.
201 	 */
202 	if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
203 		goto out;
204 	pn_alloc(&resolvepn);
205 	if ((error = lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp)) != 0) {
206 		pn_free(&resolvepn);
207 		pn_free(&pn);
208 		if (error != EINVAL)
209 			goto out;
210 
		/*
		 * Second attempt: both pathname buffers were freed above, so
		 * the path is fetched again and looked up without a dirvpp.
		 */
211 		dir = NULL;
212 		if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
213 			goto out;
214 		pn_alloc(&resolvepn);
215 		if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
216 		    &vp)) != 0) {
217 			pn_free(&resolvepn);
218 			pn_free(&pn);
219 			goto out;
220 		}
221 	}
	/* A lookup that reports success but yields no vnode is ENOENT. */
222 	if (vp == NULL) {
223 		if (dir != NULL)
224 			VN_RELE(dir);
225 		error = ENOENT;
226 		pn_free(&resolvepn);
227 		pn_free(&pn);
228 		goto out;
229 	}
230 
231 	/*
232 	 * We do not allow executing files in attribute directories.
233 	 * We test this by determining whether the resolved path
234 	 * contains a "/" when we're in an attribute directory;
235 	 * only if the pathname does not contain a "/" the resolved path
236 	 * points to a file in the current working (attribute) directory.
237 	 */
238 	if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
239 	    strchr(resolvepn.pn_path, '/') == NULL) {
240 		if (dir != NULL)
241 			VN_RELE(dir);
242 		error = EACCES;
243 		pn_free(&resolvepn);
244 		pn_free(&pn);
245 		VN_RELE(vp);
246 		goto out;
247 	}
248 
	/*
	 * exec_file is zeroed over MAXCOMLEN+1 bytes and at most MAXCOMLEN
	 * bytes are copied into it, so it is always NUL-terminated.
	 */
249 	bzero(exec_file, MAXCOMLEN+1);
250 	(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
251 	bzero(&args, sizeof (args));
252 	args.pathname = resolvepn.pn_path;
253 	/* don't free resolvepn until we are done with args */
254 	pn_free(&pn);
255 
256 	/*
257 	 * Specific exec handlers, or policies determined via
258 	 * /etc/system may override the historical default.
259 	 */
260 	args.stk_prot = PROT_ZFOD;
261 	args.dat_prot = PROT_ZFOD;
262 
263 	CPU_STATS_ADD_K(sys, sysexec, 1);
264 	DTRACE_PROC1(exec, char *, args.pathname);
265 
266 	ua.fname = fname;
267 	ua.argp = argp;
268 	ua.envp = envp;
269 
270 	/* If necessary, brand this process before we start the exec. */
271 	if (brandme != 0)
272 		brand_setbrand(p);
273 
	/*
	 * On failure, run the brand's b_proc_exit hook if the process was
	 * branded just above, drop the vnode holds and the resolved path,
	 * and leave through "fail" so the exec-failure probe fires.
	 */
274 	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
275 	    exec_file, p->p_cred, brand_action)) != 0) {
276 		if (brandme != 0)
277 			BROP(p)->b_proc_exit(p, lwp);
278 		VN_RELE(vp);
279 		if (dir != NULL)
280 			VN_RELE(dir);
281 		pn_free(&resolvepn);
282 		goto fail;
283 	}
284 
285 	/*
286 	 * Free floating point registers (sun4u only)
287 	 */
288 	ASSERT(lwp != NULL);
289 	lwp_freeregs(lwp, 1);
290 
291 	/*
292 	 * Free thread and process context ops.
293 	 */
294 	if (curthread->t_ctx)
295 		freectx(curthread, 1);
296 	if (p->p_pctx)
297 		freepctx(p, 1);
298 
299 	/*
300 	 * Remember file name for accounting; clear any cached DTrace predicate.
301 	 */
302 	up->u_acflag &= ~AFORK;
303 	bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
304 	curthread->t_predcache = NULL;
305 
306 	/*
307 	 * Clear contract template state
308 	 */
309 	lwp_ctmpl_clear(lwp);
310 
311 	/*
312 	 * Save the directory in which we found the executable for expanding
313 	 * the %d token used in core file patterns.
314 	 */
315 	mutex_enter(&p->p_lock);
316 	tmpvp = p->p_execdir;
317 	p->p_execdir = dir;
318 	if (p->p_execdir != NULL)
319 		VN_HOLD(p->p_execdir);
320 	mutex_exit(&p->p_lock);
321 
	/* The previous p_execdir is released after p_lock has been dropped. */
322 	if (tmpvp != NULL)
323 		VN_RELE(tmpvp);
324 
325 	/*
326 	 * Reset stack state to the user stack, clear set of signals
327 	 * caught on the signal stack, and reset list of signals that
328 	 * restart system calls; the new program's environment should
329 	 * not be affected by detritus from the old program.  Any
330 	 * pending held signals remain held, so don't clear t_hold.
331 	 */
332 	mutex_enter(&p->p_lock);
333 	lwp->lwp_oldcontext = 0;
334 	lwp->lwp_ustack = 0;
335 	lwp->lwp_old_stk_ctl = 0;
336 	sigemptyset(&up->u_signodefer);
337 	sigemptyset(&up->u_sigonstack);
338 	sigemptyset(&up->u_sigresethand);
339 	lwp->lwp_sigaltstack.ss_sp = 0;
340 	lwp->lwp_sigaltstack.ss_size = 0;
341 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
342 
343 	/*
344 	 * Make saved resource limit == current resource limit.
345 	 */
346 	for (i = 0; i < RLIM_NLIMITS; i++) {
347 		/*CONSTCOND*/
348 		if (RLIM_SAVED(i)) {
349 			(void) rctl_rlimit_get(rctlproc_legacy[i], p,
350 			    &up->u_saved_rlimit[i]);
351 		}
352 	}
353 
354 	/*
355 	 * If the action was to catch the signal, then the action
356 	 * must be reset to SIG_DFL.
357 	 */
358 	sigdefault(p);
359 	p->p_flag &= ~(SNOWAIT|SJCTL);
360 	p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
361 	up->u_signal[SIGCLD - 1] = SIG_DFL;
362 
363 	/*
364 	 * Delete the dot4 sigqueues/signotifies.
365 	 */
366 	sigqfree(p);
367 
368 	mutex_exit(&p->p_lock);
369 
	/* Clear the profiling state; it is protected by p_pflock. */
370 	mutex_enter(&p->p_pflock);
371 	p->p_prof.pr_base = NULL;
372 	p->p_prof.pr_size = 0;
373 	p->p_prof.pr_off = 0;
374 	p->p_prof.pr_scale = 0;
375 	p->p_prof.pr_samples = 0;
376 	mutex_exit(&p->p_pflock);
377 
378 	ASSERT(curthread->t_schedctl == NULL);
379 
380 #if defined(__sparc)
381 	if (p->p_utraps != NULL)
382 		utrap_free(p);
383 #endif	/* __sparc */
384 
385 	/*
386 	 * Close all close-on-exec files.
387 	 */
388 	close_exec(P_FINFO(p));
389 	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
390 
391 	/* Unbrand ourself if requested. */
392 	if (brand_action == EBA_NATIVE)
393 		BROP(p)->b_proc_exit(p, lwp);
394 	ASSERT((brand_action != EBA_NATIVE) || !PROC_IS_BRANDED(p));
395 
396 	setregs(&args);
397 
398 	/* Mark this as an executable vnode */
399 	mutex_enter(&vp->v_lock);
400 	vp->v_flag |= VVMEXEC;
401 	mutex_exit(&vp->v_lock);
402 
	/* args is no longer used past this point, so resolvepn can go. */
403 	VN_RELE(vp);
404 	if (dir != NULL)
405 		VN_RELE(dir);
406 	pn_free(&resolvepn);
407 
408 	/*
409 	 * Allocate a new lwp directory and lwpid hash table if necessary.
410 	 */
	/*
	 * New two-entry tables are built unless the caller already has
	 * tid 1 and the existing directory has exactly two slots.  The
	 * KM_SLEEP allocations are made here, before p_lock is taken.
	 */
411 	if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
412 		lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
413 		lwpdir->ld_next = lwpdir + 1;
414 		tidhash = kmem_zalloc(2 * sizeof (lwpdir_t *), KM_SLEEP);
415 		if (p->p_lwpdir != NULL)
416 			lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
417 		else
418 			lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
419 	}
420 
421 	if (PROC_IS_BRANDED(p))
422 		BROP(p)->b_exec();
423 
424 	mutex_enter(&p->p_lock);
425 	prbarrier(p);
426 
427 	/*
428 	 * Reset lwp id to the default value of 1.
429 	 * This is a single-threaded process now
430 	 * and lwp #1 is lwp_wait()able by default.
431 	 * The t_unpark flag should not be inherited.
432 	 */
433 	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
434 	curthread->t_tid = 1;
435 	curthread->t_unpark = 0;
436 	curthread->t_proc_flag |= TP_TWAIT;
437 	curthread->t_proc_flag &= ~TP_DAEMON;	/* daemons shouldn't exec */
438 	p->p_lwpdaemon = 0;			/* but oh well ... */
439 	p->p_lwpid = 1;
440 
441 	/*
442 	 * Install the newly-allocated lwp directory and lwpid hash table
443 	 * and insert the current thread into the new hash table.
444 	 */
445 	if (lwpdir != NULL) {
446 		old_lwpdir = p->p_lwpdir;
447 		old_lwpdir_sz = p->p_lwpdir_sz;
448 		old_tidhash = p->p_tidhash;
449 		old_tidhash_sz = p->p_tidhash_sz;
450 		p->p_lwpdir = p->p_lwpfree = lwpdir;
451 		p->p_lwpdir_sz = 2;
452 		p->p_tidhash = tidhash;
453 		p->p_tidhash_sz = 2;
454 		lep->le_thread = curthread;
455 		lep->le_lwpid = curthread->t_tid;
456 		lep->le_start = curthread->t_start;
457 		lwp_hash_in(p, lep);
458 	}
459 
460 	/*
461 	 * Restore the saved signal mask and
462 	 * inform /proc that the exec() has finished.
463 	 */
464 	curthread->t_hold = savedmask;
465 	prexecend();
466 	mutex_exit(&p->p_lock);
	/* The replaced tables are freed only after p_lock has been dropped. */
467 	if (old_lwpdir) {
468 		kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
469 		kmem_free(old_tidhash, old_tidhash_sz * sizeof (lwpdir_t *));
470 	}
471 
472 	ASSERT(error == 0);
473 	DTRACE_PROC(exec__success);
474 	return (0);
475 
	/*
	 * "fail" is used once the exec probe has fired (gexec() failure) and
	 * reports exec-failure before falling into "out".  Earlier errors
	 * jump straight to "out".  Both paths restore the signal mask and
	 * call prexecend() under p_lock.
	 */
476 fail:
477 	DTRACE_PROC1(exec__failure, int, error);
478 out:		/* error return */
479 	mutex_enter(&p->p_lock);
480 	curthread->t_hold = savedmask;
481 	prexecend();
482 	mutex_exit(&p->p_lock);
483 	ASSERT(error != 0);
484 	return (error);
485 }
486 
487 
488 /*
489  * Perform generic exec duties and switchout to object-file specific
490  * handler.
491  */
492 int
493 gexec(
494 	struct vnode **vpp,
495 	struct execa *uap,
496 	struct uarg *args,
497 	struct intpdata *idatap,
498 	int level,
499 	long *execsz,
500 	caddr_t exec_file,
501 	struct cred *cred,
502 	int brand_action)
503 {
504 	struct vnode *vp;
505 	proc_t *pp = ttoproc(curthread);
506 	struct execsw *eswp;
507 	int error = 0;
508 	int suidflags = 0;
509 	ssize_t resid;
510 	uid_t uid, gid;
511 	struct vattr vattr;
512 	char magbuf[MAGIC_BYTES];
513 	int setid;
514 	cred_t *oldcred, *newcred = NULL;
515 	int privflags = 0;
516 	int setidfl;
517 
518 	/*
519 	 * If the SNOCD or SUGID flag is set, turn it off and remember the
520 	 * previous setting so we can restore it if we encounter an error.
521 	 */
522 	if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
523 		mutex_enter(&pp->p_lock);
524 		suidflags = pp->p_flag & PSUIDFLAGS;
525 		pp->p_flag &= ~PSUIDFLAGS;
526 		mutex_exit(&pp->p_lock);
527 	}
528 
529 	if ((error = execpermissions(*vpp, &vattr, args)) != 0)
530 		goto bad;
531 
532 	/* need to open vnode for stateful file systems like rfs */
533 	if ((error = VOP_OPEN(vpp, FREAD, CRED())) != 0)
534 		goto bad;
535 	vp = *vpp;
536 
537 	/*
538 	 * Note: to support binary compatibility with SunOS a.out
539 	 * executables, we read in the first four bytes, as the
540 	 * magic number is in bytes 2-3.
541 	 */
542 	if (error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
543 	    (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid))
544 		goto bad;
545 	if (resid != 0)
546 		goto bad;
547 
548 	if ((eswp = findexec_by_hdr(magbuf)) == NULL)
549 		goto bad;
550 
551 	if (level == 0 &&
552 	    (privflags = execsetid(vp, &vattr, &uid, &gid)) != 0) {
553 
554 		newcred = cred = crdup(cred);
555 
556 		/* If we can, drop the PA bit */
557 		if ((privflags & PRIV_RESET) != 0)
558 			priv_adjust_PA(cred);
559 
560 		if (privflags & PRIV_SETID) {
561 			cred->cr_uid = uid;
562 			cred->cr_gid = gid;
563 			cred->cr_suid = uid;
564 			cred->cr_sgid = gid;
565 		}
566 
567 		if (privflags & MAC_FLAGS) {
568 			if (!(CR_FLAGS(cred) & NET_MAC_AWARE_INHERIT))
569 				CR_FLAGS(cred) &= ~NET_MAC_AWARE;
570 			CR_FLAGS(cred) &= ~NET_MAC_AWARE_INHERIT;
571 		}
572 
573 		/*
574 		 * Implement the privilege updates:
575 		 *
576 		 * Restrict with L:
577 		 *
578 		 *	I' = I & L
579 		 *
580 		 *	E' = P' = (I' + F) & A
581 		 *
582 		 * But if running under ptrace, we cap I with P.
583 		 */
584 		if ((privflags & PRIV_RESET) != 0) {
585 			if ((privflags & PRIV_INCREASE) != 0 &&
586 			    (pp->p_proc_flag & P_PR_PTRACE) != 0)
587 				priv_intersect(&CR_OPPRIV(cred),
588 						    &CR_IPRIV(cred));
589 			priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
590 			CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
591 			priv_adjust_PA(cred);
592 		}
593 	}
594 
595 	/* SunOS 4.x buy-back */
596 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
597 	    (vattr.va_mode & (VSUID|VSGID))) {
598 		cmn_err(CE_NOTE,
599 		    "!%s, uid %d: setuid execution not allowed, dev=%lx",
600 		    exec_file, cred->cr_uid, vp->v_vfsp->vfs_dev);
601 	}
602 
603 	/*
604 	 * execsetid() told us whether or not we had to change the
605 	 * credentials of the process.  In privflags, it told us
606 	 * whether we gained any privileges or executed a set-uid executable.
607 	 */
608 	setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE));
609 
610 	/*
611 	 * Use /etc/system variable to determine if the stack
612 	 * should be marked as executable by default.
613 	 */
614 	if (noexec_user_stack)
615 		args->stk_prot &= ~PROT_EXEC;
616 
617 	args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */
618 
619 	/*
620 	 * Traditionally, the setid flags told the sub processes whether
621 	 * the file just executed was set-uid or set-gid; this caused
622 	 * some confusion as the 'setid' flag did not match the SUGID
623 	 * process flag which is only set when the uids/gids do not match.
624 	 * A script set-gid/set-uid to the real uid/gid would start with
625 	 * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
626 	 * Now we flag those cases where the calling process cannot
627 	 * be trusted to influence the newly exec'ed process, either
628 	 * because it runs with more privileges or when the uids/gids
629 	 * do in fact not match.
630 	 * This also makes the runtime linker agree with the on exec
631 	 * values of SNOCD and SUGID.
632 	 */
633 	setidfl = 0;
634 	if (cred->cr_uid != cred->cr_ruid || (cred->cr_rgid != cred->cr_gid &&
635 	    !supgroupmember(cred->cr_gid, cred))) {
636 		setidfl |= EXECSETID_UGIDS;
637 	}
638 	if (setid & PRIV_SETUGID)
639 		setidfl |= EXECSETID_SETID;
640 	if (setid & PRIV_INCREASE)
641 		setidfl |= EXECSETID_PRIVS;
642 
643 	error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
644 		setidfl, exec_file, cred, brand_action);
645 	rw_exit(eswp->exec_lock);
646 	if (error != 0) {
647 		if (newcred != NULL)
648 			crfree(newcred);
649 		goto bad;
650 	}
651 
652 	if (level == 0) {
653 		mutex_enter(&pp->p_crlock);
654 		if (newcred != NULL) {
655 			/*
656 			 * Free the old credentials, and set the new ones.
657 			 * Do this for both the process and the (single) thread.
658 			 */
659 			crfree(pp->p_cred);
660 			pp->p_cred = cred;	/* cred already held for proc */
661 			crhold(cred);		/* hold new cred for thread */
662 			/*
663 			 * DTrace accesses t_cred in probe context.  t_cred
664 			 * must always be either NULL, or point to a valid,
665 			 * allocated cred structure.
666 			 */
667 			oldcred = curthread->t_cred;
668 			curthread->t_cred = cred;
669 			crfree(oldcred);
670 		}
671 		/*
672 		 * On emerging from a successful exec(), the saved
673 		 * uid and gid equal the effective uid and gid.
674 		 */
675 		cred->cr_suid = cred->cr_uid;
676 		cred->cr_sgid = cred->cr_gid;
677 
678 		/*
679 		 * If the real and effective ids do not match, this
680 		 * is a setuid process that should not dump core.
681 		 * The group comparison is tricky; we prevent the code
682 		 * from flagging SNOCD when executing with an effective gid
683 		 * which is a supplementary group.
684 		 */
685 		if (cred->cr_ruid != cred->cr_uid ||
686 		    (cred->cr_rgid != cred->cr_gid &&
687 		    !supgroupmember(cred->cr_gid, cred)) ||
688 		    (privflags & PRIV_INCREASE) != 0)
689 			suidflags = PSUIDFLAGS;
690 		else
691 			suidflags = 0;
692 
693 		mutex_exit(&pp->p_crlock);
694 		if (suidflags) {
695 			mutex_enter(&pp->p_lock);
696 			pp->p_flag |= suidflags;
697 			mutex_exit(&pp->p_lock);
698 		}
699 		if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
700 			/*
701 			 * If process is traced via /proc, arrange to
702 			 * invalidate the associated /proc vnode.
703 			 */
704 			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
705 				args->traceinval = 1;
706 		}
707 		if (pp->p_proc_flag & P_PR_PTRACE)
708 			psignal(pp, SIGTRAP);
709 		if (args->traceinval)
710 			prinvalidate(&pp->p_user);
711 	}
712 
713 	return (0);
714 bad:
715 	if (error == 0)
716 		error = ENOEXEC;
717 
718 	if (suidflags) {
719 		mutex_enter(&pp->p_lock);
720 		pp->p_flag |= suidflags;
721 		mutex_exit(&pp->p_lock);
722 	}
723 	return (error);
724 }
725 
726 extern char *execswnames[];
727 
728 struct execsw *
729 allocate_execsw(char *name, char *magic, size_t magic_size)
730 {
731 	int i, j;
732 	char *ename;
733 	char *magicp;
734 
735 	mutex_enter(&execsw_lock);
736 	for (i = 0; i < nexectype; i++) {
737 		if (execswnames[i] == NULL) {
738 			ename = kmem_alloc(strlen(name) + 1, KM_SLEEP);
739 			(void) strcpy(ename, name);
740 			execswnames[i] = ename;
741 			/*
742 			 * Set the magic number last so that we
743 			 * don't need to hold the execsw_lock in
744 			 * findexectype().
745 			 */
746 			magicp = kmem_alloc(magic_size, KM_SLEEP);
747 			for (j = 0; j < magic_size; j++)
748 				magicp[j] = magic[j];
749 			execsw[i].exec_magic = magicp;
750 			mutex_exit(&execsw_lock);
751 			return (&execsw[i]);
752 		}
753 	}
754 	mutex_exit(&execsw_lock);
755 	return (NULL);
756 }
757 
758 /*
759  * Find the exec switch table entry with the corresponding magic string.
760  */
761 struct execsw *
762 findexecsw(char *magic)
763 {
764 	struct execsw *eswp;
765 
766 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
767 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
768 		if (magic && eswp->exec_maglen != 0 &&
769 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0)
770 			return (eswp);
771 	}
772 	return (NULL);
773 }
774 
775 /*
776  * Find the execsw[] index for the given exec header string by looking for the
777  * magic string at a specified offset and length for each kind of executable
778  * file format until one matches.  If no execsw[] entry is found, try to
779  * autoload a module for this magic string.
780  */
781 struct execsw *
782 findexec_by_hdr(char *header)
783 {
784 	struct execsw *eswp;
785 
786 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
787 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
788 		if (header && eswp->exec_maglen != 0 &&
789 		    bcmp(&header[eswp->exec_magoff], eswp->exec_magic,
790 			    eswp->exec_maglen) == 0) {
791 			if (hold_execsw(eswp) != 0)
792 				return (NULL);
793 			return (eswp);
794 		}
795 	}
796 	return (NULL);	/* couldn't find the type */
797 }
798 
799 /*
800  * Find the execsw[] index for the given magic string.  If no execsw[] entry
801  * is found, try to autoload a module for this magic string.
802  */
803 struct execsw *
804 findexec_by_magic(char *magic)
805 {
806 	struct execsw *eswp;
807 
808 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
809 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
810 		if (magic && eswp->exec_maglen != 0 &&
811 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
812 			if (hold_execsw(eswp) != 0)
813 				return (NULL);
814 			return (eswp);
815 		}
816 	}
817 	return (NULL);	/* couldn't find the type */
818 }
819 
820 static int
821 hold_execsw(struct execsw *eswp)
822 {
823 	char *name;
824 
825 	rw_enter(eswp->exec_lock, RW_READER);
826 	while (!LOADED_EXEC(eswp)) {
827 		rw_exit(eswp->exec_lock);
828 		name = execswnames[eswp-execsw];
829 		ASSERT(name);
830 		if (modload("exec", name) == -1)
831 			return (-1);
832 		rw_enter(eswp->exec_lock, RW_READER);
833 	}
834 	return (0);
835 }
836 
837 static int
838 execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp)
839 {
840 	proc_t *pp = ttoproc(curthread);
841 	uid_t uid, gid;
842 	cred_t *cr = pp->p_cred;
843 	int privflags = 0;
844 
845 	/*
846 	 * Remember credentials.
847 	 */
848 	uid = cr->cr_uid;
849 	gid = cr->cr_gid;
850 
851 	/* Will try to reset the PRIV_AWARE bit later. */
852 	if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
853 		privflags |= PRIV_RESET;
854 
855 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
856 		/*
857 		 * Set-uid root execution only allowed if the limit set
858 		 * holds all unsafe privileges.
859 		 */
860 		if ((vattrp->va_mode & VSUID) && (vattrp->va_uid != 0 ||
861 		    priv_issubset(&priv_unsafe, &CR_LPRIV(cr)))) {
862 			uid = vattrp->va_uid;
863 			privflags |= PRIV_SETUGID;
864 		}
865 		if (vattrp->va_mode & VSGID) {
866 			gid = vattrp->va_gid;
867 			privflags |= PRIV_SETUGID;
868 		}
869 	}
870 
871 	/*
872 	 * Do we need to change our credential anyway?
873 	 * This is the case when E != I or P != I, as
874 	 * we need to do the assignments (with F empty and A full)
875 	 * Or when I is not a subset of L; in that case we need to
876 	 * enforce L.
877 	 *
878 	 *		I' = L & I
879 	 *
880 	 *		E' = P' = (I' + F) & A
881 	 * or
882 	 *		E' = P' = I'
883 	 */
884 	if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
885 	    !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
886 	    !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
887 		privflags |= PRIV_RESET;
888 
889 	/* If MAC-aware flag(s) are on, need to update cred to remove. */
890 	if ((CR_FLAGS(cr) & NET_MAC_AWARE) ||
891 	    (CR_FLAGS(cr) & NET_MAC_AWARE_INHERIT))
892 		privflags |= MAC_FLAGS;
893 
894 	/*
895 	 * When we introduce the "forced" set then we will need
896 	 * to set PRIV_INCREASE here if I not a subset of P.
897 	 * If the "allowed" set is introduced we will need to do
898 	 * a similar thing; however, it seems more reasonable to
899 	 * have the allowed set reduce "L": script language interpreters
900 	 * would typically have an allowed set of "all".
901 	 */
902 
903 	/*
904 	 * Set setuid/setgid protections if no ptrace() compatibility.
905 	 * For privileged processes, honor setuid/setgid even in
906 	 * the presence of ptrace() compatibility.
907 	 */
908 	if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
909 	    PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
910 	    (cr->cr_uid != uid ||
911 	    cr->cr_gid != gid ||
912 	    cr->cr_suid != uid ||
913 	    cr->cr_sgid != gid)) {
914 		*uidp = uid;
915 		*gidp = gid;
916 		privflags |= PRIV_SETID;
917 	}
918 	return (privflags);
919 }
920 
921 int
922 execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
923 {
924 	int error;
925 	proc_t *p = ttoproc(curthread);
926 
927 	vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
928 	if (error = VOP_GETATTR(vp, vattrp, ATTR_EXEC, p->p_cred))
929 		return (error);
930 	/*
931 	 * Check the access mode.
932 	 * If VPROC, ask /proc if the file is an object file.
933 	 */
934 	if ((error = VOP_ACCESS(vp, VEXEC, 0, p->p_cred)) != 0 ||
935 	    !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
936 	    (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
937 	    (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
938 		if (error == 0)
939 			error = EACCES;
940 		return (error);
941 	}
942 
943 	if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
944 	    (error = VOP_ACCESS(vp, VREAD, 0, p->p_cred))) {
945 		/*
946 		 * If process is under ptrace(2) compatibility,
947 		 * fail the exec(2).
948 		 */
949 		if (p->p_proc_flag & P_PR_PTRACE)
950 			goto bad;
951 		/*
952 		 * Process is traced via /proc.
953 		 * Arrange to invalidate the /proc vnode.
954 		 */
955 		args->traceinval = 1;
956 	}
957 	return (0);
958 bad:
959 	if (error == 0)
960 		error = ENOEXEC;
961 	return (error);
962 }
963 
964 /*
965  * Map a section of an executable file into the user's
966  * address space.
967  */
968 int
969 execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
970     off_t offset, int prot, int page, uint_t szc)
971 {
972 	int error = 0;
973 	off_t oldoffset;
974 	caddr_t zfodbase, oldaddr;
975 	size_t end, oldlen;
976 	size_t zfoddiff;
977 	label_t ljb;
978 	proc_t *p = ttoproc(curthread);
979 
980 	oldaddr = addr;
981 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
982 	if (len) {
983 		oldlen = len;
984 		len += ((size_t)oldaddr - (size_t)addr);
985 		oldoffset = offset;
986 		offset = (off_t)((uintptr_t)offset & PAGEMASK);
987 		if (page) {
988 			spgcnt_t  prefltmem, availm, npages;
989 			int preread;
990 			uint_t mflag = MAP_PRIVATE | MAP_FIXED;
991 
992 			if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
993 				mflag |= MAP_TEXT;
994 			} else {
995 				mflag |= MAP_INITDATA;
996 			}
997 
998 			if (valid_usr_range(addr, len, prot, p->p_as,
999 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1000 				error = ENOMEM;
1001 				goto bad;
1002 			}
1003 			if (error = VOP_MAP(vp, (offset_t)offset,
1004 			    p->p_as, &addr, len, prot, PROT_ALL,
1005 			    mflag, CRED()))
1006 				goto bad;
1007 
1008 			/*
1009 			 * If the segment can fit, then we prefault
1010 			 * the entire segment in.  This is based on the
1011 			 * model that says the best working set of a
1012 			 * small program is all of its pages.
1013 			 */
1014 			npages = (spgcnt_t)btopr(len);
1015 			prefltmem = freemem - desfree;
1016 			preread =
1017 			    (npages < prefltmem && len < PGTHRESH) ? 1 : 0;
1018 
1019 			/*
1020 			 * If we aren't prefaulting the segment,
1021 			 * increment "deficit", if necessary to ensure
1022 			 * that pages will become available when this
1023 			 * process starts executing.
1024 			 */
1025 			availm = freemem - lotsfree;
1026 			if (preread == 0 && npages > availm &&
1027 			    deficit < lotsfree) {
1028 				deficit += MIN((pgcnt_t)(npages - availm),
1029 				    lotsfree - deficit);
1030 			}
1031 
1032 			if (preread) {
1033 				TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
1034 				    "execmap preread:freemem %d size %lu",
1035 				    freemem, len);
1036 				(void) as_fault(p->p_as->a_hat, p->p_as,
1037 				    (caddr_t)addr, len, F_INVAL, S_READ);
1038 			}
1039 		} else {
1040 			if (valid_usr_range(addr, len, prot, p->p_as,
1041 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1042 				error = ENOMEM;
1043 				goto bad;
1044 			}
1045 
1046 			if (error = as_map(p->p_as, addr, len,
1047 			    segvn_create, zfod_argsp))
1048 				goto bad;
1049 			/*
1050 			 * Read in the segment in one big chunk.
1051 			 */
1052 			if (error = vn_rdwr(UIO_READ, vp, (caddr_t)oldaddr,
1053 			    oldlen, (offset_t)oldoffset, UIO_USERSPACE, 0,
1054 			    (rlim64_t)0, CRED(), (ssize_t *)0))
1055 				goto bad;
1056 			/*
1057 			 * Now set protections.
1058 			 */
1059 			if (prot != PROT_ZFOD) {
1060 				(void) as_setprot(p->p_as, (caddr_t)addr,
1061 				    len, prot);
1062 			}
1063 		}
1064 	}
1065 
1066 	if (zfodlen) {
1067 		struct as *as = curproc->p_as;
1068 		struct seg *seg;
1069 		uint_t zprot = 0;
1070 
1071 		end = (size_t)addr + len;
1072 		zfodbase = (caddr_t)roundup(end, PAGESIZE);
1073 		zfoddiff = (uintptr_t)zfodbase - end;
1074 		if (zfoddiff) {
1075 			/*
1076 			 * Before we go to zero the remaining space on the last
1077 			 * page, make sure we have write permission.
1078 			 */
1079 
1080 			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1081 			seg = as_segat(curproc->p_as, (caddr_t)end);
1082 			if (seg != NULL)
1083 				SEGOP_GETPROT(seg, (caddr_t)end, zfoddiff - 1,
1084 				    &zprot);
1085 			AS_LOCK_EXIT(as, &as->a_lock);
1086 
1087 			if (seg != NULL && (zprot & PROT_WRITE) == 0) {
1088 				(void) as_setprot(as, (caddr_t)end,
1089 				    zfoddiff - 1, zprot | PROT_WRITE);
1090 			}
1091 
1092 			if (on_fault(&ljb)) {
1093 				no_fault();
1094 				if (seg != NULL && (zprot & PROT_WRITE) == 0)
1095 					(void) as_setprot(as, (caddr_t)end,
1096 					zfoddiff - 1, zprot);
1097 				error = EFAULT;
1098 				goto bad;
1099 			}
1100 			uzero((void *)end, zfoddiff);
1101 			no_fault();
1102 			if (seg != NULL && (zprot & PROT_WRITE) == 0)
1103 				(void) as_setprot(as, (caddr_t)end,
1104 				    zfoddiff - 1, zprot);
1105 		}
1106 		if (zfodlen > zfoddiff) {
1107 			struct segvn_crargs crargs =
1108 			    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
1109 
1110 			zfodlen -= zfoddiff;
1111 			if (valid_usr_range(zfodbase, zfodlen, prot, p->p_as,
1112 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1113 				error = ENOMEM;
1114 				goto bad;
1115 			}
1116 			if (szc > 0) {
1117 				/*
1118 				 * ASSERT alignment because the mapelfexec()
1119 				 * caller for the szc > 0 case extended zfod
1120 				 * so it's end is pgsz aligned.
1121 				 */
1122 				size_t pgsz = page_get_pagesize(szc);
1123 				ASSERT(IS_P2ALIGNED(zfodbase + zfodlen, pgsz));
1124 
1125 				if (IS_P2ALIGNED(zfodbase, pgsz)) {
1126 					crargs.szc = szc;
1127 				} else {
1128 					crargs.szc = AS_MAP_HEAP;
1129 				}
1130 			} else {
1131 				crargs.szc = AS_MAP_NO_LPOOB;
1132 			}
1133 			if (error = as_map(p->p_as, (caddr_t)zfodbase,
1134 			    zfodlen, segvn_create, &crargs))
1135 				goto bad;
1136 			if (prot != PROT_ZFOD) {
1137 				(void) as_setprot(p->p_as, (caddr_t)zfodbase,
1138 				    zfodlen, prot);
1139 			}
1140 		}
1141 	}
1142 	return (0);
1143 bad:
1144 	return (error);
1145 }
1146 
1147 void
1148 setexecenv(struct execenv *ep)
1149 {
1150 	proc_t *p = ttoproc(curthread);
1151 	klwp_t *lwp = ttolwp(curthread);
1152 	struct vnode *vp;
1153 
1154 	p->p_bssbase = ep->ex_bssbase;
1155 	p->p_brkbase = ep->ex_brkbase;
1156 	p->p_brksize = ep->ex_brksize;
1157 	if (p->p_exec)
1158 		VN_RELE(p->p_exec);	/* out with the old */
1159 	vp = p->p_exec = ep->ex_vp;
1160 	if (vp != NULL)
1161 		VN_HOLD(vp);		/* in with the new */
1162 
1163 	lwp->lwp_sigaltstack.ss_sp = 0;
1164 	lwp->lwp_sigaltstack.ss_size = 0;
1165 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
1166 }
1167 
1168 int
1169 execopen(struct vnode **vpp, int *fdp)
1170 {
1171 	struct vnode *vp = *vpp;
1172 	file_t *fp;
1173 	int error = 0;
1174 	int filemode = FREAD;
1175 
1176 	VN_HOLD(vp);		/* open reference */
1177 	if (error = falloc(NULL, filemode, &fp, fdp)) {
1178 		VN_RELE(vp);
1179 		*fdp = -1;	/* just in case falloc changed value */
1180 		return (error);
1181 	}
1182 	if (error = VOP_OPEN(&vp, filemode, CRED())) {
1183 		VN_RELE(vp);
1184 		setf(*fdp, NULL);
1185 		unfalloc(fp);
1186 		*fdp = -1;
1187 		return (error);
1188 	}
1189 	*vpp = vp;		/* vnode should not have changed */
1190 	fp->f_vnode = vp;
1191 	mutex_exit(&fp->f_tlock);
1192 	setf(*fdp, fp);
1193 	return (0);
1194 }
1195 
/*
 * Close descriptor fd by handing it to closeandsetf() with a NULL
 * replacement file; the result of that call is returned unchanged.
 */
int
execclose(int fd)
{
	int rv;

	rv = closeandsetf(fd, NULL);
	return (rv);
}
1201 
1202 
1203 /*
1204  * noexec stub function.
1205  */
1206 /*ARGSUSED*/
1207 int
1208 noexec(
1209     struct vnode *vp,
1210     struct execa *uap,
1211     struct uarg *args,
1212     struct intpdata *idatap,
1213     int level,
1214     long *execsz,
1215     int setid,
1216     caddr_t exec_file,
1217     struct cred *cred)
1218 {
1219 	cmn_err(CE_WARN, "missing exec capability for %s", uap->fname);
1220 	return (ENOEXEC);
1221 }
1222 
1223 /*
1224  * Support routines for building a user stack.
1225  *
1226  * execve(path, argv, envp) must construct a new stack with the specified
1227  * arguments and environment variables (see exec_args() for a description
1228  * of the user stack layout).  To do this, we copy the arguments and
1229  * environment variables from the old user address space into the kernel,
1230  * free the old as, create the new as, and copy our buffered information
1231  * to the new stack.  Our kernel buffer has the following structure:
1232  *
1233  *	+-----------------------+ <--- stk_base + stk_size
1234  *	| string offsets	|
1235  *	+-----------------------+ <--- stk_offp
1236  *	|			|
1237  *	| STK_AVAIL() space	|
1238  *	|			|
1239  *	+-----------------------+ <--- stk_strp
1240  *	| strings		|
1241  *	+-----------------------+ <--- stk_base
1242  *
1243  * When we add a string, we store the string's contents (including the null
1244  * terminator) at stk_strp, and we store the offset of the string relative to
1245  * stk_base at --stk_offp.  As strings are added, stk_strp increases and
1246  * stk_offp decreases.  The amount of space remaining, STK_AVAIL(), is just
1247  * the difference between these pointers.  If we run out of space, we return
1248  * an error and exec_args() starts all over again with a buffer twice as large.
1249  * When we're all done, the kernel buffer looks like this:
1250  *
1251  *	+-----------------------+ <--- stk_base + stk_size
1252  *	| argv[0] offset	|
1253  *	+-----------------------+
1254  *	| ...			|
1255  *	+-----------------------+
1256  *	| argv[argc-1] offset	|
1257  *	+-----------------------+
1258  *	| envp[0] offset	|
1259  *	+-----------------------+
1260  *	| ...			|
1261  *	+-----------------------+
1262  *	| envp[envc-1] offset	|
1263  *	+-----------------------+
1264  *	| AT_SUN_PLATFORM offset|
1265  *	+-----------------------+
1266  *	| AT_SUN_EXECNAME offset|
1267  *	+-----------------------+ <--- stk_offp
1268  *	|			|
1269  *	| STK_AVAIL() space	|
1270  *	|			|
1271  *	+-----------------------+ <--- stk_strp
1272  *	| AT_SUN_EXECNAME string|
1273  *	+-----------------------+
1274  *	| AT_SUN_PLATFORM string|
1275  *	+-----------------------+
1276  *	| envp[envc-1] string	|
1277  *	+-----------------------+
1278  *	| ...			|
1279  *	+-----------------------+
1280  *	| envp[0] string	|
1281  *	+-----------------------+
1282  *	| argv[argc-1] string	|
1283  *	+-----------------------+
1284  *	| ...			|
1285  *	+-----------------------+
1286  *	| argv[0] string	|
1287  *	+-----------------------+ <--- stk_base
1288  */
1289 
/*
 * Bytes still unused in the kernel staging buffer, i.e. the gap between the
 * string cursor (stk_strp) and the offset cursor (stk_offp).  The result is
 * a pointer difference and therefore a signed quantity.
 */
1290 #define	STK_AVAIL(args)		((char *)(args)->stk_offp - (args)->stk_strp)
1291 
1292 /*
1293  * Add a string to the stack.
1294  */
1295 static int
1296 stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
1297 {
1298 	int error;
1299 	size_t len;
1300 
1301 	if (STK_AVAIL(args) < sizeof (int))
1302 		return (E2BIG);
1303 	*--args->stk_offp = args->stk_strp - args->stk_base;
1304 
1305 	if (segflg == UIO_USERSPACE) {
1306 		error = copyinstr(sp, args->stk_strp, STK_AVAIL(args), &len);
1307 		if (error != 0)
1308 			return (error);
1309 	} else {
1310 		len = strlen(sp) + 1;
1311 		if (len > STK_AVAIL(args))
1312 			return (E2BIG);
1313 		bcopy(sp, args->stk_strp, len);
1314 	}
1315 
1316 	args->stk_strp += len;
1317 
1318 	return (0);
1319 }
1320 
1321 static int
1322 stk_getptr(uarg_t *args, char *src, char **dst)
1323 {
1324 	int error;
1325 
1326 	if (args->from_model == DATAMODEL_NATIVE) {
1327 		ulong_t ptr;
1328 		error = fulword(src, &ptr);
1329 		*dst = (caddr_t)ptr;
1330 	} else {
1331 		uint32_t ptr;
1332 		error = fuword32(src, &ptr);
1333 		*dst = (caddr_t)(uintptr_t)ptr;
1334 	}
1335 	return (error);
1336 }
1337 
1338 static int
1339 stk_putptr(uarg_t *args, char *addr, char *value)
1340 {
1341 	if (args->to_model == DATAMODEL_NATIVE)
1342 		return (sulword(addr, (ulong_t)value));
1343 	else
1344 		return (suword32(addr, (uint32_t)(uintptr_t)value));
1345 }
1346 
1347 static int
1348 stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1349 {
1350 	char *sp;
1351 	int argc, error;
1352 	int argv_empty = 0;
1353 	size_t ptrsize = args->from_ptrsize;
1354 	size_t size, pad;
1355 	char *argv = (char *)uap->argp;
1356 	char *envp = (char *)uap->envp;
1357 
1358 	/*
1359 	 * Copy interpreter's name and argument to argv[0] and argv[1].
1360 	 */
1361 	if (intp != NULL && intp->intp_name != NULL) {
1362 		if ((error = stk_add(args, intp->intp_name, UIO_SYSSPACE)) != 0)
1363 			return (error);
1364 		if (intp->intp_arg != NULL &&
1365 		    (error = stk_add(args, intp->intp_arg, UIO_SYSSPACE)) != 0)
1366 			return (error);
1367 		if (args->fname != NULL)
1368 			error = stk_add(args, args->fname, UIO_SYSSPACE);
1369 		else
1370 			error = stk_add(args, uap->fname, UIO_USERSPACE);
1371 		if (error)
1372 			return (error);
1373 
1374 		/*
1375 		 * Check for an empty argv[].
1376 		 */
1377 		if (stk_getptr(args, argv, &sp))
1378 			return (EFAULT);
1379 		if (sp == NULL)
1380 			argv_empty = 1;
1381 
1382 		argv += ptrsize;		/* ignore original argv[0] */
1383 	}
1384 
1385 	if (argv_empty == 0) {
1386 		/*
1387 		 * Add argv[] strings to the stack.
1388 		 */
1389 		for (;;) {
1390 			if (stk_getptr(args, argv, &sp))
1391 				return (EFAULT);
1392 			if (sp == NULL)
1393 				break;
1394 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1395 				return (error);
1396 			argv += ptrsize;
1397 		}
1398 	}
1399 	argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1400 	args->arglen = args->stk_strp - args->stk_base;
1401 
1402 	/*
1403 	 * Add environ[] strings to the stack.
1404 	 */
1405 	if (envp != NULL) {
1406 		for (;;) {
1407 			if (stk_getptr(args, envp, &sp))
1408 				return (EFAULT);
1409 			if (sp == NULL)
1410 				break;
1411 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1412 				return (error);
1413 			envp += ptrsize;
1414 		}
1415 	}
1416 	args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1417 	args->ne = args->na - argc;
1418 
1419 	/*
1420 	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
1421 	 * AT_SUN_EMULATOR strings to the stack.
1422 	 */
1423 	if (auxvpp != NULL && *auxvpp != NULL) {
1424 		if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
1425 			return (error);
1426 		if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0)
1427 			return (error);
1428 		if (args->brandname != NULL &&
1429 		    (error = stk_add(args, args->brandname,
1430 			UIO_SYSSPACE)) != 0)
1431 			return (error);
1432 		if (args->emulator != NULL &&
1433 		    (error = stk_add(args, args->emulator,
1434 			UIO_SYSSPACE)) != 0)
1435 			return (error);
1436 	}
1437 
1438 	/*
1439 	 * Compute the size of the stack.  This includes all the pointers,
1440 	 * the space reserved for the aux vector, and all the strings.
1441 	 * The total number of pointers is args->na (which is argc + envc)
1442 	 * plus 4 more: (1) a pointer's worth of space for argc; (2) the NULL
1443 	 * after the last argument (i.e. argv[argc]); (3) the NULL after the
1444 	 * last environment variable (i.e. envp[envc]); and (4) the NULL after
1445 	 * all the strings, at the very top of the stack.
1446 	 */
1447 	size = (args->na + 4) * args->to_ptrsize + args->auxsize +
1448 	    (args->stk_strp - args->stk_base);
1449 
1450 	/*
1451 	 * Pad the string section with zeroes to align the stack size.
1452 	 */
1453 	pad = P2NPHASE(size, args->stk_align);
1454 
1455 	if (STK_AVAIL(args) < pad)
1456 		return (E2BIG);
1457 
1458 	args->usrstack_size = size + pad;
1459 
1460 	while (pad-- != 0)
1461 		*args->stk_strp++ = 0;
1462 
1463 	args->nc = args->stk_strp - args->stk_base;
1464 
1465 	return (0);
1466 }
1467 
1468 static int
1469 stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
1470 {
1471 	size_t ptrsize = args->to_ptrsize;
1472 	ssize_t pslen;
1473 	char *kstrp = args->stk_base;
1474 	char *ustrp = usrstack - args->nc - ptrsize;
1475 	char *usp = usrstack - args->usrstack_size;
1476 	int *offp = (int *)(args->stk_base + args->stk_size);
1477 	int envc = args->ne;
1478 	int argc = args->na - envc;
1479 	int i;
1480 
1481 	/*
1482 	 * Record argc for /proc.
1483 	 */
1484 	up->u_argc = argc;
1485 
1486 	/*
1487 	 * Put argc on the stack.  Note that even though it's an int,
1488 	 * it always consumes ptrsize bytes (for alignment).
1489 	 */
1490 	if (stk_putptr(args, usp, (char *)(uintptr_t)argc))
1491 		return (-1);
1492 
1493 	/*
1494 	 * Add argc space (ptrsize) to usp and record argv for /proc.
1495 	 */
1496 	up->u_argv = (uintptr_t)(usp += ptrsize);
1497 
1498 	/*
1499 	 * Put the argv[] pointers on the stack.
1500 	 */
1501 	for (i = 0; i < argc; i++, usp += ptrsize)
1502 		if (stk_putptr(args, usp, &ustrp[*--offp]))
1503 			return (-1);
1504 
1505 	/*
1506 	 * Copy arguments to u_psargs.
1507 	 */
1508 	pslen = MIN(args->arglen, PSARGSZ) - 1;
1509 	for (i = 0; i < pslen; i++)
1510 		up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]);
1511 	while (i < PSARGSZ)
1512 		up->u_psargs[i++] = '\0';
1513 
1514 	/*
1515 	 * Add space for argv[]'s NULL terminator (ptrsize) to usp and
1516 	 * record envp for /proc.
1517 	 */
1518 	up->u_envp = (uintptr_t)(usp += ptrsize);
1519 
1520 	/*
1521 	 * Put the envp[] pointers on the stack.
1522 	 */
1523 	for (i = 0; i < envc; i++, usp += ptrsize)
1524 		if (stk_putptr(args, usp, &ustrp[*--offp]))
1525 			return (-1);
1526 
1527 	/*
1528 	 * Add space for envp[]'s NULL terminator (ptrsize) to usp and
1529 	 * remember where the stack ends, which is also where auxv begins.
1530 	 */
1531 	args->stackend = usp += ptrsize;
1532 
1533 	/*
1534 	 * Put all the argv[], envp[], and auxv strings on the stack.
1535 	 */
1536 	if (copyout(args->stk_base, ustrp, args->nc))
1537 		return (-1);
1538 
1539 	/*
1540 	 * Fill in the aux vector now that we know the user stack addresses
1541 	 * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
1542 	 * AT_SUN_EMULATOR strings.
1543 	 */
1544 	if (auxvpp != NULL && *auxvpp != NULL) {
1545 		if (args->to_model == DATAMODEL_NATIVE) {
1546 			auxv_t **a = (auxv_t **)auxvpp;
1547 			ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp])
1548 			ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp])
1549 			if (args->brandname != NULL)
1550 				ADDAUX(*a,
1551 				    AT_SUN_BRANDNAME, (long)&ustrp[*--offp])
1552 			if (args->emulator != NULL)
1553 				ADDAUX(*a,
1554 				    AT_SUN_EMULATOR, (long)&ustrp[*--offp])
1555 		} else {
1556 			auxv32_t **a = (auxv32_t **)auxvpp;
1557 			ADDAUX(*a,
1558 			    AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp])
1559 			ADDAUX(*a,
1560 			    AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp])
1561 			if (args->brandname != NULL)
1562 				ADDAUX(*a, AT_SUN_BRANDNAME,
1563 				    (int)(uintptr_t)&ustrp[*--offp])
1564 			if (args->emulator != NULL)
1565 				ADDAUX(*a, AT_SUN_EMULATOR,
1566 				    (int)(uintptr_t)&ustrp[*--offp])
1567 		}
1568 	}
1569 
1570 	return (0);
1571 }
1572 
1573 /*
1574  * Initialize a new user stack with the specified arguments and environment.
1575  * The initial user stack layout is as follows:
1576  *
1577  *	User Stack
1578  *	+---------------+ <--- curproc->p_usrstack
1579  *	|		|
1580  *	| slew		|
1581  *	|		|
1582  *	+---------------+
1583  *	| NULL		|
1584  *	+---------------+
1585  *	|		|
1586  *	| auxv strings	|
1587  *	|		|
1588  *	+---------------+
1589  *	|		|
1590  *	| envp strings	|
1591  *	|		|
1592  *	+---------------+
1593  *	|		|
1594  *	| argv strings	|
1595  *	|		|
1596  *	+---------------+ <--- ustrp
1597  *	|		|
1598  *	| aux vector	|
1599  *	|		|
1600  *	+---------------+ <--- auxv
1601  *	| NULL		|
1602  *	+---------------+
1603  *	| envp[envc-1]	|
1604  *	+---------------+
1605  *	| ...		|
1606  *	+---------------+
1607  *	| envp[0]	|
1608  *	+---------------+ <--- envp[]
1609  *	| NULL		|
1610  *	+---------------+
1611  *	| argv[argc-1]	|
1612  *	+---------------+
1613  *	| ...		|
1614  *	+---------------+
1615  *	| argv[0]	|
1616  *	+---------------+ <--- argv[]
1617  *	| argc		|
1618  *	+---------------+ <--- stack base
 *
 * uap supplies the user-space fname/argv/envp, args carries the exec state
 * shared with stk_copyin() and stk_copyout(), intp (may be NULL) describes
 * an interpreter to prepend to argv, and auxvpp (may be NULL) points at the
 * aux vector cursor that stk_copyout() advances.
 *
 * Returns 0 on success.  Before relvm() is called, failures are reported as
 * errno values: an error from stk_copyin(), E2BIG when the arguments do not
 * fit within args->ncargs, or a non-zero exitlwps() result.  After relvm()
 * the only failure is the -1 returned by stk_copyout().
1619  */
1620 int
1621 exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1622 {
1623 	size_t size;
1624 	int error;
1625 	proc_t *p = ttoproc(curthread);
1626 	user_t *up = PTOU(p);
1627 	char *usrstack;
1628 	rctl_entity_p_t e;
1629 	struct as *as;
1630 	extern int use_stk_lpg;
1631 	size_t sp_slew;
1632 
	/*
	 * Pointer width used when reading argv[]/envp[] out of the image
	 * that is calling exec.
	 */
1633 	args->from_model = p->p_model;
1634 	if (p->p_model == DATAMODEL_NATIVE) {
1635 		args->from_ptrsize = sizeof (long);
1636 	} else {
1637 		args->from_ptrsize = sizeof (int32_t);
1638 	}
1639 
	/*
	 * Pointer width, argument size limit, stack alignment and stack
	 * top for the image being exec'ed.
	 */
1640 	if (args->to_model == DATAMODEL_NATIVE) {
1641 		args->to_ptrsize = sizeof (long);
1642 		args->ncargs = NCARGS;
1643 		args->stk_align = STACK_ALIGN;
1644 		usrstack = (char *)USRSTACK;
1645 	} else {
1646 		args->to_ptrsize = sizeof (int32_t);
1647 		args->ncargs = NCARGS32;
1648 		args->stk_align = STACK_ALIGN32;
1649 		usrstack = (char *)USRSTACK32;
1650 	}
1651 
1652 	ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);
1653 
1654 #if defined(__sparc)
1655 	/*
1656 	 * Make sure user register windows are empty before
1657 	 * attempting to make a new stack.
1658 	 */
1659 	(void) flush_user_windows_to_stack(NULL);
1660 #endif
1661 
	/*
	 * Stage the strings in a kernel buffer.  Start with one page and
	 * double the buffer each time stk_copyin() runs out of room (E2BIG
	 * or ENAMETOOLONG); give up with E2BIG once a buffer of at least
	 * ncargs bytes has been tried.  Any other error is returned as is.
	 */
1662 	for (size = PAGESIZE; ; size *= 2) {
1663 		args->stk_size = size;
1664 		args->stk_base = kmem_alloc(size, KM_SLEEP);
1665 		args->stk_strp = args->stk_base;
1666 		args->stk_offp = (int *)(args->stk_base + size);
1667 		error = stk_copyin(uap, args, intp, auxvpp);
1668 		if (error == 0)
1669 			break;
1670 		kmem_free(args->stk_base, size);
1671 		if (error != E2BIG && error != ENAMETOOLONG)
1672 			return (error);
1673 		if (size >= args->ncargs)
1674 			return (E2BIG);
1675 	}
1676 
	/*
	 * From here on "size" is the size of the user stack image computed
	 * by stk_copyin(), not the size of the kernel buffer (which stays
	 * in args->stk_size).
	 */
1677 	size = args->usrstack_size;
1678 
1679 	ASSERT(error == 0);
1680 	ASSERT(P2PHASE(size, args->stk_align) == 0);
1681 	ASSERT((ssize_t)STK_AVAIL(args) >= 0);
1682 
	/* The finished stack image must itself fit within the limit. */
1683 	if (size > args->ncargs) {
1684 		kmem_free(args->stk_base, args->stk_size);
1685 		return (E2BIG);
1686 	}
1687 
1688 	/*
1689 	 * Leave only the current lwp and force the other lwps to exit.
1690 	 * If another lwp beat us to the punch by calling exit(), bail out.
1691 	 */
1692 	if ((error = exitlwps(0)) != 0) {
1693 		kmem_free(args->stk_base, args->stk_size);
1694 		return (error);
1695 	}
1696 
1697 	/*
1698 	 * Revoke any doors created by the process.
1699 	 */
1700 	if (p->p_door_list)
1701 		door_exit();
1702 
1703 	/*
1704 	 * Release schedctl data structures.
1705 	 */
1706 	if (p->p_pagep)
1707 		schedctl_proc_cleanup();
1708 
1709 	/*
1710 	 * Clean up any DTrace helpers for the process.
1711 	 */
1712 	if (p->p_dtrace_helpers != NULL) {
1713 		ASSERT(dtrace_helpers_cleanup != NULL);
1714 		(*dtrace_helpers_cleanup)();
1715 	}
1716 
1717 	mutex_enter(&p->p_lock);
1718 	/*
1719 	 * Cleanup the DTrace provider associated with this process.
1720 	 */
1721 	if (p->p_dtrace_probes) {
1722 		ASSERT(dtrace_fasttrap_exec_ptr != NULL);
1723 		dtrace_fasttrap_exec_ptr(p);
1724 	}
1725 	mutex_exit(&p->p_lock);
1726 
1727 	/*
1728 	 * discard the lwpchan cache.
1729 	 */
1730 	if (p->p_lcp != NULL)
1731 		lwpchan_destroy_cache(1);
1732 
1733 	/*
1734 	 * Delete the POSIX timers.
1735 	 */
1736 	if (p->p_itimer != NULL)
1737 		timer_exit();
1738 
1739 #ifdef C2_AUDIT
	/*
	 * Hand the staged argument and environment strings to the audit
	 * code: argv strings start at stk_base, envp strings at
	 * stk_base + arglen.
	 */
1740 	if (audit_active)
1741 		audit_exec(args->stk_base, args->stk_base + args->arglen,
1742 		    args->na - args->ne, args->ne);
1743 #endif
1744 
1745 	/*
1746 	 * Ensure that we don't change resource associations while we
1747 	 * change address spaces.
1748 	 */
1749 	mutex_enter(&p->p_lock);
1750 	pool_barrier_enter();
1751 	mutex_exit(&p->p_lock);
1752 
1753 	/*
1754 	 * Destroy the old address space and create a new one.
1755 	 * From here on, any errors are fatal to the exec()ing process.
1756 	 * On error we return -1, which means the caller must SIGKILL
1757 	 * the process.
1758 	 */
1759 	relvm();
1760 
1761 	mutex_enter(&p->p_lock);
1762 	pool_barrier_exit();
1763 	mutex_exit(&p->p_lock);
1764 
1765 	up->u_execsw = args->execswp;
1766 
	/*
	 * Reset the per-process heap and stack bookkeeping and switch the
	 * process over to the new data model, stack top and protections.
	 */
1767 	p->p_brkbase = NULL;
1768 	p->p_brksize = 0;
1769 	p->p_brkpageszc = 0;
1770 	p->p_stksize = 0;
1771 	p->p_stkpageszc = 0;
1772 	p->p_model = args->to_model;
1773 	p->p_usrstack = usrstack;
1774 	p->p_stkprot = args->stk_prot;
1775 	p->p_datprot = args->dat_prot;
1776 
1777 	/*
1778 	 * Reset resource controls such that all controls are again active as
1779 	 * well as appropriate to the potentially new address model for the
1780 	 * process.
1781 	 */
1782 	e.rcep_p.proc = p;
1783 	e.rcep_t = RCENTITY_PROCESS;
1784 	rctl_set_reset(p->p_rctls, p, &e);
1785 
1786 	/* Too early to call map_pgsz for the heap */
1787 	if (use_stk_lpg) {
1788 		p->p_stkpageszc = page_szc(map_pgsz(MAPPGSZ_STK, p, 0, 0, 0));
1789 	}
1790 
1791 	mutex_enter(&p->p_lock);
1792 	p->p_flag |= SAUTOLPG;	/* kernel controls page sizes */
1793 	mutex_exit(&p->p_lock);
1794 
1795 	/*
1796 	 * Some platforms may choose to randomize real stack start by adding a
1797 	 * small slew (not more than a few hundred bytes) to the top of the
1798 	 * stack. This helps avoid cache thrashing when identical processes
1799 	 * simultaneously share caches that don't provide enough associativity
1800 	 * (e.g. sun4v systems). In this case stack slewing makes the same hot
1801 	 * stack variables in different processes to live in different cache
1802 	 * sets increasing effective associativity.
1803 	 */
1804 	sp_slew = exec_get_spslew();
1805 	ASSERT(P2PHASE(sp_slew, args->stk_align) == 0);
1806 	exec_set_sp(size + sp_slew);
1807 
	/*
	 * Attach a fresh address space to the process.  a_userlimit is
	 * overridden here only for ILP32 images.
	 */
1808 	as = as_alloc();
1809 	p->p_as = as;
1810 	as->a_proc = p;
1811 	if (p->p_model == DATAMODEL_ILP32)
1812 		as->a_userlimit = (caddr_t)USERLIMIT32;
1813 	(void) hat_setup(as->a_hat, HAT_ALLOC);
1814 
1815 	/*
1816 	 * Finally, write out the contents of the new stack.
	 * The image is placed below the slew, hence usrstack - sp_slew.
	 * The kernel staging buffer is freed whether or not the copyout
	 * succeeded.
1817 	 */
1818 	error = stk_copyout(args, usrstack - sp_slew, auxvpp, up);
1819 	kmem_free(args->stk_base, args->stk_size);
1820 	return (error);
1821 }
1822