xref: /illumos-gate/usr/src/uts/common/os/exec.c (revision 19bd46b5d133ec3b843cb24e87bd5c84b7b062d5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*	Copyright (c) 1988 AT&T	*/
30 /*	  All Rights Reserved  	*/
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/systm.h>
36 #include <sys/signal.h>
37 #include <sys/cred_impl.h>
38 #include <sys/policy.h>
39 #include <sys/user.h>
40 #include <sys/errno.h>
41 #include <sys/file.h>
42 #include <sys/vfs.h>
43 #include <sys/vnode.h>
44 #include <sys/mman.h>
45 #include <sys/acct.h>
46 #include <sys/cpuvar.h>
47 #include <sys/proc.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/pathname.h>
51 #include <sys/vm.h>
52 #include <sys/lgrp.h>
53 #include <sys/vtrace.h>
54 #include <sys/exec.h>
55 #include <sys/exechdr.h>
56 #include <sys/kmem.h>
57 #include <sys/prsystm.h>
58 #include <sys/modctl.h>
59 #include <sys/vmparam.h>
60 #include <sys/door.h>
61 #include <sys/schedctl.h>
62 #include <sys/utrap.h>
63 #include <sys/systeminfo.h>
64 #include <sys/stack.h>
65 #include <sys/rctl.h>
66 #include <sys/dtrace.h>
67 #include <sys/lwpchan_impl.h>
68 #include <sys/pool.h>
69 #include <sys/sdt.h>
70 #include <sys/brand.h>
71 
72 #include <c2/audit.h>
73 
74 #include <vm/hat.h>
75 #include <vm/anon.h>
76 #include <vm/as.h>
77 #include <vm/seg.h>
78 #include <vm/seg_vn.h>
79 
/*
 * Flag bits assembled by execsetid() and returned to gexec(), which keeps
 * them in its privflags local and tests them to decide how the credential
 * of the exec'ing process has to be adjusted.
 */
80 #define	PRIV_RESET		0x01	/* needs to reset privs */
81 #define	PRIV_SETID		0x02	/* needs to change uids */
82 #define	PRIV_SETUGID		0x04	/* is setuid/setgid/forced privs */
83 #define	PRIV_INCREASE		0x08	/* child runs with more privs */
84 #define	MAC_FLAGS		0x10	/* need to adjust MAC flags */
85 
/* File-local helpers defined later in this file. */
86 static int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *);
87 static int hold_execsw(struct execsw *);
88 
89 uint_t auxv_hwcap = 0;	/* auxv AT_SUN_HWCAP value; determined on the fly */
90 #if defined(_SYSCALL32_IMPL)
91 uint_t auxv_hwcap32 = 0;	/* 32-bit version of auxv_hwcap */
92 #endif
93 
/*
 * Process flags that gexec() clears from p_flag at level 0 before running
 * the exec handler; they are put back if the exec fails and recomputed from
 * the resulting credential if it succeeds.
 */
94 #define	PSUIDFLAGS		(SNOCD|SUGID)
95 
96 /*
97  * exec() - wrapper around exece providing NULL environment pointer
98  */
99 int
100 exec(const char *fname, const char **argp)
101 {
102 	return (exece(fname, argp, NULL));
103 }
104 
105 /*
106  * exece() - system call wrapper around exec_common()
107  */
108 int
109 exece(const char *fname, const char **argp, const char **envp)
110 {
111 	int error;
112 
113 	error = exec_common(fname, argp, envp, EBA_NONE);
114 	return (error ? (set_errno(error)) : 0);
115 }
116 
/*
 * exec_common() - common body of the exec system calls.
 *
 * fname is the path of the file to execute and is read from user space
 * with pn_get().  argp and envp are not interpreted here; they are stored
 * in a struct execa and handed to gexec().  brand_action is EBA_NONE for an
 * ordinary exec, EBA_NATIVE to unbrand a branded process, and any other
 * value to brand an unbranded process; values other than EBA_NONE are only
 * accepted when the process's zone is branded.
 *
 * The function looks up the executable, applies the basic exec policy
 * check, lets gexec() run the format-specific handler, and then resets the
 * per-process and per-lwp state that must not survive into the new image
 * (signal state, profiling, close-on-exec files, lwp directory).
 *
 * Returns 0 on success or an errno value on failure.  The value is
 * returned as-is; set_errno() is left to the caller (see exece()).
 */
117 int
118 exec_common(const char *fname, const char **argp, const char **envp,
119     int brand_action)
120 {
121 	vnode_t *vp = NULL, *dir = NULL, *tmpvp = NULL;
122 	proc_t *p = ttoproc(curthread);
123 	klwp_t *lwp = ttolwp(curthread);
124 	struct user *up = PTOU(p);
125 	long execsz;		/* temporary count of exec size */
126 	int i;
127 	int error;
128 	char exec_file[MAXCOMLEN+1];
129 	struct pathname pn;
130 	struct pathname resolvepn;
131 	struct uarg args;
132 	struct execa ua;
133 	k_sigset_t savedmask;
134 	lwpdir_t *lwpdir = NULL;
135 	lwpdir_t **tidhash;
136 	lwpdir_t *old_lwpdir = NULL;
137 	uint_t old_lwpdir_sz;
138 	lwpdir_t **old_tidhash;
139 	uint_t old_tidhash_sz;
140 	lwpent_t *lep;
141 	int brandme = 0;
142 
143 	/*
144 	 * exec() is not supported for the /proc agent lwp.
145 	 */
146 	if (curthread == p->p_agenttp)
147 		return (ENOTSUP);
148 
149 	if (brand_action != EBA_NONE) {
150 		/*
151 		 * Brand actions are not supported for processes that are not
152 		 * running in a branded zone.
153 		 */
154 		if (!ZONE_IS_BRANDED(p->p_zone))
155 			return (ENOTSUP);
156 
157 		if (brand_action == EBA_NATIVE) {
158 			/* Only branded processes can be unbranded */
159 			if (!PROC_IS_BRANDED(p))
160 				return (ENOTSUP);
161 		} else {
162 			/* Only unbranded processes can be branded */
163 			if (PROC_IS_BRANDED(p))
164 				return (ENOTSUP);
165 			brandme = 1;
166 		}
167 	} else {
168 		/*
169 		 * If this is a native zone, or if the process is already
170 		 * branded, then we don't need to do anything.  If this is
171 		 * a native process in a branded zone, we need to brand the
172 		 * process as it exec()s the new binary.
173 		 */
174 		if (ZONE_IS_BRANDED(p->p_zone) && !PROC_IS_BRANDED(p))
175 			brandme = 1;
176 	}
177 
178 	/*
179 	 * Inform /proc that an exec() has started.
180 	 * Hold signals that are ignored by default so that we will
181 	 * not be interrupted by a signal that will be ignored after
182 	 * successful completion of gexec().
183 	 */
184 	mutex_enter(&p->p_lock);
185 	prexecstart();
186 	schedctl_finish_sigblock(curthread);
187 	savedmask = curthread->t_hold;
188 	sigorset(&curthread->t_hold, &ignoredefault);
189 	mutex_exit(&p->p_lock);
190 
191 	/*
192 	 * Look up path name and remember last component for later.
193 	 * To help coreadm expand its %d token, we attempt to save
194 	 * the directory containing the executable in p_execdir. The
195 	 * first call to lookuppn() may fail and return EINVAL because
196 	 * dirvpp is non-NULL. In that case, we make a second call to
197 	 * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
198 	 * but coreadm is allowed to expand %d to the empty string and
199 	 * there are other cases in which that failure may occur.
200 	 */
201 	if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
202 		goto out;
203 	pn_alloc(&resolvepn);
204 	if ((error = lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp)) != 0) {
205 		pn_free(&resolvepn);
206 		pn_free(&pn);
207 		if (error != EINVAL)
208 			goto out;
209 
		/*
		 * EINVAL: fetch the path again and retry the lookup without
		 * asking for the containing directory.
		 */
210 		dir = NULL;
211 		if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
212 			goto out;
213 		pn_alloc(&resolvepn);
214 		if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
215 		    &vp)) != 0) {
216 			pn_free(&resolvepn);
217 			pn_free(&pn);
218 			goto out;
219 		}
220 	}
221 	if (vp == NULL) {
222 		if (dir != NULL)
223 			VN_RELE(dir);
224 		error = ENOENT;
225 		pn_free(&resolvepn);
226 		pn_free(&pn);
227 		goto out;
228 	}
229 
230 	if ((error = secpolicy_basic_exec(CRED(), vp)) != 0) {
231 		if (dir != NULL)
232 			VN_RELE(dir);
233 		pn_free(&resolvepn);
234 		pn_free(&pn);
235 		VN_RELE(vp);
236 		goto out;
237 	}
238 
239 	/*
240 	 * We do not allow executing files in attribute directories.
241 	 * We test this by determining whether the resolved path
242 	 * contains a "/" when we're in an attribute directory;
243 	 * only if the pathname does not contain a "/" the resolved path
244 	 * points to a file in the current working (attribute) directory.
245 	 */
246 	if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
247 	    strchr(resolvepn.pn_path, '/') == NULL) {
248 		if (dir != NULL)
249 			VN_RELE(dir);
250 		error = EACCES;
251 		pn_free(&resolvepn);
252 		pn_free(&pn);
253 		VN_RELE(vp);
254 		goto out;
255 	}
256 
	/*
	 * exec_file is zeroed over MAXCOMLEN+1 bytes and at most MAXCOMLEN
	 * bytes are copied in, so it is always NUL-terminated.
	 */
257 	bzero(exec_file, MAXCOMLEN+1);
258 	(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
259 	bzero(&args, sizeof (args));
260 	args.pathname = resolvepn.pn_path;
261 	/* don't free resolvepn until we are done with args */
262 	pn_free(&pn);
263 
264 	/*
265 	 * Specific exec handlers, or policies determined via
266 	 * /etc/system may override the historical default.
267 	 */
268 	args.stk_prot = PROT_ZFOD;
269 	args.dat_prot = PROT_ZFOD;
270 
271 	CPU_STATS_ADD_K(sys, sysexec, 1);
272 	DTRACE_PROC1(exec, char *, args.pathname);
273 
274 	ua.fname = fname;
275 	ua.argp = argp;
276 	ua.envp = envp;
277 
278 	/* If necessary, brand this process before we start the exec. */
279 	if (brandme != 0)
280 		brand_setbrand(p);
281 
282 	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
283 	    exec_file, p->p_cred, brand_action)) != 0) {
		/*
		 * NOTE(review): b_proc_exit() presumably undoes the
		 * brand_setbrand() done just above - confirm against the
		 * brand ops implementation.
		 */
284 		if (brandme != 0)
285 			BROP(p)->b_proc_exit(p, lwp);
286 		VN_RELE(vp);
287 		if (dir != NULL)
288 			VN_RELE(dir);
289 		pn_free(&resolvepn);
290 		goto fail;
291 	}
292 
293 	/*
294 	 * Free floating point registers (sun4u only)
295 	 */
296 	ASSERT(lwp != NULL);
297 	lwp_freeregs(lwp, 1);
298 
299 	/*
300 	 * Free thread and process context ops.
301 	 */
302 	if (curthread->t_ctx)
303 		freectx(curthread, 1);
304 	if (p->p_pctx)
305 		freepctx(p, 1);
306 
307 	/*
308 	 * Remember file name for accounting; clear any cached DTrace predicate.
309 	 */
310 	up->u_acflag &= ~AFORK;
311 	bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
312 	curthread->t_predcache = NULL;
313 
314 	/*
315 	 * Clear contract template state
316 	 */
317 	lwp_ctmpl_clear(lwp);
318 
319 	/*
320 	 * Save the directory in which we found the executable for expanding
321 	 * the %d token used in core file patterns.
322 	 */
323 	mutex_enter(&p->p_lock);
324 	tmpvp = p->p_execdir;
325 	p->p_execdir = dir;
326 	if (p->p_execdir != NULL)
327 		VN_HOLD(p->p_execdir);
328 	mutex_exit(&p->p_lock);
329 
	/* Drop the hold on the previous p_execdir outside of p_lock. */
330 	if (tmpvp != NULL)
331 		VN_RELE(tmpvp);
332 
333 	/*
334 	 * Reset stack state to the user stack, clear set of signals
335 	 * caught on the signal stack, and reset list of signals that
336 	 * restart system calls; the new program's environment should
337 	 * not be affected by detritus from the old program.  Any
338 	 * pending held signals remain held, so don't clear t_hold.
339 	 */
340 	mutex_enter(&p->p_lock);
341 	lwp->lwp_oldcontext = 0;
342 	lwp->lwp_ustack = 0;
343 	lwp->lwp_old_stk_ctl = 0;
344 	sigemptyset(&up->u_signodefer);
345 	sigemptyset(&up->u_sigonstack);
346 	sigemptyset(&up->u_sigresethand);
347 	lwp->lwp_sigaltstack.ss_sp = 0;
348 	lwp->lwp_sigaltstack.ss_size = 0;
349 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
350 
351 	/*
352 	 * Make saved resource limit == current resource limit.
353 	 */
354 	for (i = 0; i < RLIM_NLIMITS; i++) {
355 		/*CONSTCOND*/
356 		if (RLIM_SAVED(i)) {
357 			(void) rctl_rlimit_get(rctlproc_legacy[i], p,
358 			    &up->u_saved_rlimit[i]);
359 		}
360 	}
361 
362 	/*
363 	 * If the action was to catch the signal, then the action
364 	 * must be reset to SIG_DFL.
365 	 */
366 	sigdefault(p);
367 	p->p_flag &= ~(SNOWAIT|SJCTL);
368 	p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
369 	up->u_signal[SIGCLD - 1] = SIG_DFL;
370 
371 	/*
372 	 * Delete the dot4 sigqueues/signotifies.
373 	 */
374 	sigqfree(p);
375 
376 	mutex_exit(&p->p_lock);
377 
	/* Reset the profiling state; it is protected by p_pflock. */
378 	mutex_enter(&p->p_pflock);
379 	p->p_prof.pr_base = NULL;
380 	p->p_prof.pr_size = 0;
381 	p->p_prof.pr_off = 0;
382 	p->p_prof.pr_scale = 0;
383 	p->p_prof.pr_samples = 0;
384 	mutex_exit(&p->p_pflock);
385 
386 	ASSERT(curthread->t_schedctl == NULL);
387 
388 #if defined(__sparc)
389 	if (p->p_utraps != NULL)
390 		utrap_free(p);
391 #endif	/* __sparc */
392 
393 	/*
394 	 * Close all close-on-exec files.
395 	 */
396 	close_exec(P_FINFO(p));
397 	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
398 
399 	/* Unbrand ourself if requested. */
400 	if (brand_action == EBA_NATIVE)
401 		BROP(p)->b_proc_exit(p, lwp);
402 	ASSERT((brand_action != EBA_NATIVE) || !PROC_IS_BRANDED(p));
403 
404 	setregs(&args);
405 
406 	/* Mark this as an executable vnode */
407 	mutex_enter(&vp->v_lock);
408 	vp->v_flag |= VVMEXEC;
409 	mutex_exit(&vp->v_lock);
410 
	/* args.pathname is no longer needed, so resolvepn can go too. */
411 	VN_RELE(vp);
412 	if (dir != NULL)
413 		VN_RELE(dir);
414 	pn_free(&resolvepn);
415 
416 	/*
417 	 * Allocate a new lwp directory and lwpid hash table if necessary.
418 	 */
419 	if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
420 		lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
421 		lwpdir->ld_next = lwpdir + 1;
422 		tidhash = kmem_zalloc(2 * sizeof (lwpdir_t *), KM_SLEEP);
		/*
		 * Reuse the current thread's existing directory entry when
		 * the process already has an lwp directory; otherwise
		 * allocate a fresh, zeroed entry.  lep is only consumed
		 * below when lwpdir != NULL.
		 */
423 		if (p->p_lwpdir != NULL)
424 			lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
425 		else
426 			lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
427 	}
428 
429 	if (PROC_IS_BRANDED(p))
430 		BROP(p)->b_exec();
431 
432 	mutex_enter(&p->p_lock);
433 	prbarrier(p);
434 
435 	/*
436 	 * Reset lwp id to the default value of 1.
437 	 * This is a single-threaded process now
438 	 * and lwp #1 is lwp_wait()able by default.
439 	 * The t_unpark flag should not be inherited.
440 	 */
441 	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
442 	curthread->t_tid = 1;
443 	kpreempt_disable();
444 	ASSERT(curthread->t_lpl != NULL);
445 	p->p_t1_lgrpid = curthread->t_lpl->lpl_lgrpid;
446 	kpreempt_enable();
447 	if (p->p_tr_lgrpid != LGRP_NONE && p->p_tr_lgrpid != p->p_t1_lgrpid) {
448 		lgrp_update_trthr_migrations(1);
449 	}
450 	curthread->t_unpark = 0;
451 	curthread->t_proc_flag |= TP_TWAIT;
452 	curthread->t_proc_flag &= ~TP_DAEMON;	/* daemons shouldn't exec */
453 	p->p_lwpdaemon = 0;			/* but oh well ... */
454 	p->p_lwpid = 1;
455 
456 	/*
457 	 * Install the newly-allocated lwp directory and lwpid hash table
458 	 * and insert the current thread into the new hash table.
459 	 */
460 	if (lwpdir != NULL) {
461 		old_lwpdir = p->p_lwpdir;
462 		old_lwpdir_sz = p->p_lwpdir_sz;
463 		old_tidhash = p->p_tidhash;
464 		old_tidhash_sz = p->p_tidhash_sz;
465 		p->p_lwpdir = p->p_lwpfree = lwpdir;
466 		p->p_lwpdir_sz = 2;
467 		p->p_tidhash = tidhash;
468 		p->p_tidhash_sz = 2;
469 		lep->le_thread = curthread;
470 		lep->le_lwpid = curthread->t_tid;
471 		lep->le_start = curthread->t_start;
472 		lwp_hash_in(p, lep);
473 	}
474 
475 	/*
476 	 * Restore the saved signal mask and
477 	 * inform /proc that the exec() has finished.
478 	 */
479 	curthread->t_hold = savedmask;
480 	prexecend();
481 	mutex_exit(&p->p_lock);
	/* Free the replaced tables only after p_lock has been dropped. */
482 	if (old_lwpdir) {
483 		kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
484 		kmem_free(old_tidhash, old_tidhash_sz * sizeof (lwpdir_t *));
485 	}
486 
487 	ASSERT(error == 0);
488 	DTRACE_PROC(exec__success);
489 	return (0);
490 
	/*
	 * "fail" is reached only from the gexec() failure above, i.e. after
	 * the exec DTrace probe has fired, and fires the matching failure
	 * probe.  Errors detected before that probe jump straight to "out".
	 * Both paths restore the signal mask saved at entry and tell /proc
	 * that the exec is over.
	 */
491 fail:
492 	DTRACE_PROC1(exec__failure, int, error);
493 out:		/* error return */
494 	mutex_enter(&p->p_lock);
495 	curthread->t_hold = savedmask;
496 	prexecend();
497 	mutex_exit(&p->p_lock);
498 	ASSERT(error != 0);
499 	return (error);
500 }
501 
502 
503 /*
504  * Perform generic exec duties and switchout to object-file specific
505  * handler.
506  */
507 int
508 gexec(
509 	struct vnode **vpp,
510 	struct execa *uap,
511 	struct uarg *args,
512 	struct intpdata *idatap,
513 	int level,
514 	long *execsz,
515 	caddr_t exec_file,
516 	struct cred *cred,
517 	int brand_action)
518 {
519 	struct vnode *vp;
520 	proc_t *pp = ttoproc(curthread);
521 	struct execsw *eswp;
522 	int error = 0;
523 	int suidflags = 0;
524 	ssize_t resid;
525 	uid_t uid, gid;
526 	struct vattr vattr;
527 	char magbuf[MAGIC_BYTES];
528 	int setid;
529 	cred_t *oldcred, *newcred = NULL;
530 	int privflags = 0;
531 	int setidfl;
532 
533 	/*
534 	 * If the SNOCD or SUGID flag is set, turn it off and remember the
535 	 * previous setting so we can restore it if we encounter an error.
536 	 */
537 	if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
538 		mutex_enter(&pp->p_lock);
539 		suidflags = pp->p_flag & PSUIDFLAGS;
540 		pp->p_flag &= ~PSUIDFLAGS;
541 		mutex_exit(&pp->p_lock);
542 	}
543 
544 	if ((error = execpermissions(*vpp, &vattr, args)) != 0)
545 		goto bad;
546 
547 	/* need to open vnode for stateful file systems like rfs */
548 	if ((error = VOP_OPEN(vpp, FREAD, CRED(), NULL)) != 0)
549 		goto bad;
550 	vp = *vpp;
551 
552 	/*
553 	 * Note: to support binary compatibility with SunOS a.out
554 	 * executables, we read in the first four bytes, as the
555 	 * magic number is in bytes 2-3.
556 	 */
557 	if (error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
558 	    (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid))
559 		goto bad;
560 	if (resid != 0)
561 		goto bad;
562 
563 	if ((eswp = findexec_by_hdr(magbuf)) == NULL)
564 		goto bad;
565 
566 	if (level == 0 &&
567 	    (privflags = execsetid(vp, &vattr, &uid, &gid)) != 0) {
568 
569 		newcred = cred = crdup(cred);
570 
571 		/* If we can, drop the PA bit */
572 		if ((privflags & PRIV_RESET) != 0)
573 			priv_adjust_PA(cred);
574 
575 		if (privflags & PRIV_SETID) {
576 			cred->cr_uid = uid;
577 			cred->cr_gid = gid;
578 			cred->cr_suid = uid;
579 			cred->cr_sgid = gid;
580 		}
581 
582 		if (privflags & MAC_FLAGS) {
583 			if (!(CR_FLAGS(cred) & NET_MAC_AWARE_INHERIT))
584 				CR_FLAGS(cred) &= ~NET_MAC_AWARE;
585 			CR_FLAGS(cred) &= ~NET_MAC_AWARE_INHERIT;
586 		}
587 
588 		/*
589 		 * Implement the privilege updates:
590 		 *
591 		 * Restrict with L:
592 		 *
593 		 *	I' = I & L
594 		 *
595 		 *	E' = P' = (I' + F) & A
596 		 *
597 		 * But if running under ptrace, we cap I with P.
598 		 */
599 		if ((privflags & PRIV_RESET) != 0) {
600 			if ((privflags & PRIV_INCREASE) != 0 &&
601 			    (pp->p_proc_flag & P_PR_PTRACE) != 0)
602 				priv_intersect(&CR_OPPRIV(cred),
603 				    &CR_IPRIV(cred));
604 			priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
605 			CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
606 			priv_adjust_PA(cred);
607 		}
608 	}
609 
610 	/* SunOS 4.x buy-back */
611 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
612 	    (vattr.va_mode & (VSUID|VSGID))) {
613 		cmn_err(CE_NOTE,
614 		    "!%s, uid %d: setuid execution not allowed, dev=%lx",
615 		    exec_file, cred->cr_uid, vp->v_vfsp->vfs_dev);
616 	}
617 
618 	/*
619 	 * execsetid() told us whether or not we had to change the
620 	 * credentials of the process.  In privflags, it told us
621 	 * whether we gained any privileges or executed a set-uid executable.
622 	 */
623 	setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE));
624 
625 	/*
626 	 * Use /etc/system variable to determine if the stack
627 	 * should be marked as executable by default.
628 	 */
629 	if (noexec_user_stack)
630 		args->stk_prot &= ~PROT_EXEC;
631 
632 	args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */
633 	args->ex_vp = vp;
634 
635 	/*
636 	 * Traditionally, the setid flags told the sub processes whether
637 	 * the file just executed was set-uid or set-gid; this caused
638 	 * some confusion as the 'setid' flag did not match the SUGID
639 	 * process flag which is only set when the uids/gids do not match.
640 	 * A script set-gid/set-uid to the real uid/gid would start with
641 	 * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
642 	 * Now we flag those cases where the calling process cannot
643 	 * be trusted to influence the newly exec'ed process, either
644 	 * because it runs with more privileges or when the uids/gids
645 	 * do in fact not match.
646 	 * This also makes the runtime linker agree with the on exec
647 	 * values of SNOCD and SUGID.
648 	 */
649 	setidfl = 0;
650 	if (cred->cr_uid != cred->cr_ruid || (cred->cr_rgid != cred->cr_gid &&
651 	    !supgroupmember(cred->cr_gid, cred))) {
652 		setidfl |= EXECSETID_UGIDS;
653 	}
654 	if (setid & PRIV_SETUGID)
655 		setidfl |= EXECSETID_SETID;
656 	if (setid & PRIV_INCREASE)
657 		setidfl |= EXECSETID_PRIVS;
658 
659 	error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
660 	    setidfl, exec_file, cred, brand_action);
661 	rw_exit(eswp->exec_lock);
662 	if (error != 0) {
663 		if (newcred != NULL)
664 			crfree(newcred);
665 		goto bad;
666 	}
667 
668 	if (level == 0) {
669 		mutex_enter(&pp->p_crlock);
670 		if (newcred != NULL) {
671 			/*
672 			 * Free the old credentials, and set the new ones.
673 			 * Do this for both the process and the (single) thread.
674 			 */
675 			crfree(pp->p_cred);
676 			pp->p_cred = cred;	/* cred already held for proc */
677 			crhold(cred);		/* hold new cred for thread */
678 			/*
679 			 * DTrace accesses t_cred in probe context.  t_cred
680 			 * must always be either NULL, or point to a valid,
681 			 * allocated cred structure.
682 			 */
683 			oldcred = curthread->t_cred;
684 			curthread->t_cred = cred;
685 			crfree(oldcred);
686 		}
687 		/*
688 		 * On emerging from a successful exec(), the saved
689 		 * uid and gid equal the effective uid and gid.
690 		 */
691 		cred->cr_suid = cred->cr_uid;
692 		cred->cr_sgid = cred->cr_gid;
693 
694 		/*
695 		 * If the real and effective ids do not match, this
696 		 * is a setuid process that should not dump core.
697 		 * The group comparison is tricky; we prevent the code
698 		 * from flagging SNOCD when executing with an effective gid
699 		 * which is a supplementary group.
700 		 */
701 		if (cred->cr_ruid != cred->cr_uid ||
702 		    (cred->cr_rgid != cred->cr_gid &&
703 		    !supgroupmember(cred->cr_gid, cred)) ||
704 		    (privflags & PRIV_INCREASE) != 0)
705 			suidflags = PSUIDFLAGS;
706 		else
707 			suidflags = 0;
708 
709 		mutex_exit(&pp->p_crlock);
710 		if (suidflags) {
711 			mutex_enter(&pp->p_lock);
712 			pp->p_flag |= suidflags;
713 			mutex_exit(&pp->p_lock);
714 		}
715 		if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
716 			/*
717 			 * If process is traced via /proc, arrange to
718 			 * invalidate the associated /proc vnode.
719 			 */
720 			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
721 				args->traceinval = 1;
722 		}
723 		if (pp->p_proc_flag & P_PR_PTRACE)
724 			psignal(pp, SIGTRAP);
725 		if (args->traceinval)
726 			prinvalidate(&pp->p_user);
727 	}
728 
729 	return (0);
730 bad:
731 	if (error == 0)
732 		error = ENOEXEC;
733 
734 	if (suidflags) {
735 		mutex_enter(&pp->p_lock);
736 		pp->p_flag |= suidflags;
737 		mutex_exit(&pp->p_lock);
738 	}
739 	return (error);
740 }
741 
742 extern char *execswnames[];
743 
744 struct execsw *
745 allocate_execsw(char *name, char *magic, size_t magic_size)
746 {
747 	int i, j;
748 	char *ename;
749 	char *magicp;
750 
751 	mutex_enter(&execsw_lock);
752 	for (i = 0; i < nexectype; i++) {
753 		if (execswnames[i] == NULL) {
754 			ename = kmem_alloc(strlen(name) + 1, KM_SLEEP);
755 			(void) strcpy(ename, name);
756 			execswnames[i] = ename;
757 			/*
758 			 * Set the magic number last so that we
759 			 * don't need to hold the execsw_lock in
760 			 * findexectype().
761 			 */
762 			magicp = kmem_alloc(magic_size, KM_SLEEP);
763 			for (j = 0; j < magic_size; j++)
764 				magicp[j] = magic[j];
765 			execsw[i].exec_magic = magicp;
766 			mutex_exit(&execsw_lock);
767 			return (&execsw[i]);
768 		}
769 	}
770 	mutex_exit(&execsw_lock);
771 	return (NULL);
772 }
773 
774 /*
775  * Find the exec switch table entry with the corresponding magic string.
776  */
777 struct execsw *
778 findexecsw(char *magic)
779 {
780 	struct execsw *eswp;
781 
782 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
783 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
784 		if (magic && eswp->exec_maglen != 0 &&
785 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0)
786 			return (eswp);
787 	}
788 	return (NULL);
789 }
790 
791 /*
792  * Find the execsw[] index for the given exec header string by looking for the
793  * magic string at a specified offset and length for each kind of executable
794  * file format until one matches.  If no execsw[] entry is found, try to
795  * autoload a module for this magic string.
796  */
797 struct execsw *
798 findexec_by_hdr(char *header)
799 {
800 	struct execsw *eswp;
801 
802 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
803 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
804 		if (header && eswp->exec_maglen != 0 &&
805 		    bcmp(&header[eswp->exec_magoff], eswp->exec_magic,
806 		    eswp->exec_maglen) == 0) {
807 			if (hold_execsw(eswp) != 0)
808 				return (NULL);
809 			return (eswp);
810 		}
811 	}
812 	return (NULL);	/* couldn't find the type */
813 }
814 
815 /*
816  * Find the execsw[] index for the given magic string.  If no execsw[] entry
817  * is found, try to autoload a module for this magic string.
818  */
819 struct execsw *
820 findexec_by_magic(char *magic)
821 {
822 	struct execsw *eswp;
823 
824 	for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
825 		ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
826 		if (magic && eswp->exec_maglen != 0 &&
827 		    bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
828 			if (hold_execsw(eswp) != 0)
829 				return (NULL);
830 			return (eswp);
831 		}
832 	}
833 	return (NULL);	/* couldn't find the type */
834 }
835 
836 static int
837 hold_execsw(struct execsw *eswp)
838 {
839 	char *name;
840 
841 	rw_enter(eswp->exec_lock, RW_READER);
842 	while (!LOADED_EXEC(eswp)) {
843 		rw_exit(eswp->exec_lock);
844 		name = execswnames[eswp-execsw];
845 		ASSERT(name);
846 		if (modload("exec", name) == -1)
847 			return (-1);
848 		rw_enter(eswp->exec_lock, RW_READER);
849 	}
850 	return (0);
851 }
852 
853 static int
854 execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp)
855 {
856 	proc_t *pp = ttoproc(curthread);
857 	uid_t uid, gid;
858 	cred_t *cr = pp->p_cred;
859 	int privflags = 0;
860 
861 	/*
862 	 * Remember credentials.
863 	 */
864 	uid = cr->cr_uid;
865 	gid = cr->cr_gid;
866 
867 	/* Will try to reset the PRIV_AWARE bit later. */
868 	if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
869 		privflags |= PRIV_RESET;
870 
871 	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
872 		/*
873 		 * Set-uid root execution only allowed if the limit set
874 		 * holds all unsafe privileges.
875 		 */
876 		if ((vattrp->va_mode & VSUID) && (vattrp->va_uid != 0 ||
877 		    priv_issubset(&priv_unsafe, &CR_LPRIV(cr)))) {
878 			uid = vattrp->va_uid;
879 			privflags |= PRIV_SETUGID;
880 		}
881 		if (vattrp->va_mode & VSGID) {
882 			gid = vattrp->va_gid;
883 			privflags |= PRIV_SETUGID;
884 		}
885 	}
886 
887 	/*
888 	 * Do we need to change our credential anyway?
889 	 * This is the case when E != I or P != I, as
890 	 * we need to do the assignments (with F empty and A full)
891 	 * Or when I is not a subset of L; in that case we need to
892 	 * enforce L.
893 	 *
894 	 *		I' = L & I
895 	 *
896 	 *		E' = P' = (I' + F) & A
897 	 * or
898 	 *		E' = P' = I'
899 	 */
900 	if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
901 	    !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
902 	    !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
903 		privflags |= PRIV_RESET;
904 
905 	/* If MAC-aware flag(s) are on, need to update cred to remove. */
906 	if ((CR_FLAGS(cr) & NET_MAC_AWARE) ||
907 	    (CR_FLAGS(cr) & NET_MAC_AWARE_INHERIT))
908 		privflags |= MAC_FLAGS;
909 
910 	/*
911 	 * When we introduce the "forced" set then we will need
912 	 * to set PRIV_INCREASE here if I not a subset of P.
913 	 * If the "allowed" set is introduced we will need to do
914 	 * a similar thing; however, it seems more reasonable to
915 	 * have the allowed set reduce "L": script language interpreters
916 	 * would typically have an allowed set of "all".
917 	 */
918 
919 	/*
920 	 * Set setuid/setgid protections if no ptrace() compatibility.
921 	 * For privileged processes, honor setuid/setgid even in
922 	 * the presence of ptrace() compatibility.
923 	 */
924 	if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
925 	    PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
926 	    (cr->cr_uid != uid ||
927 	    cr->cr_gid != gid ||
928 	    cr->cr_suid != uid ||
929 	    cr->cr_sgid != gid)) {
930 		*uidp = uid;
931 		*gidp = gid;
932 		privflags |= PRIV_SETID;
933 	}
934 	return (privflags);
935 }
936 
937 int
938 execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
939 {
940 	int error;
941 	proc_t *p = ttoproc(curthread);
942 
943 	vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
944 	if (error = VOP_GETATTR(vp, vattrp, ATTR_EXEC, p->p_cred, NULL))
945 		return (error);
946 	/*
947 	 * Check the access mode.
948 	 * If VPROC, ask /proc if the file is an object file.
949 	 */
950 	if ((error = VOP_ACCESS(vp, VEXEC, 0, p->p_cred, NULL)) != 0 ||
951 	    !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
952 	    (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
953 	    (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
954 		if (error == 0)
955 			error = EACCES;
956 		return (error);
957 	}
958 
959 	if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
960 	    (error = VOP_ACCESS(vp, VREAD, 0, p->p_cred, NULL))) {
961 		/*
962 		 * If process is under ptrace(2) compatibility,
963 		 * fail the exec(2).
964 		 */
965 		if (p->p_proc_flag & P_PR_PTRACE)
966 			goto bad;
967 		/*
968 		 * Process is traced via /proc.
969 		 * Arrange to invalidate the /proc vnode.
970 		 */
971 		args->traceinval = 1;
972 	}
973 	return (0);
974 bad:
975 	if (error == 0)
976 		error = ENOEXEC;
977 	return (error);
978 }
979 
980 /*
981  * Map a section of an executable file into the user's
982  * address space.
983  */
984 int
985 execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
986     off_t offset, int prot, int page, uint_t szc)
987 {
988 	int error = 0;
989 	off_t oldoffset;
990 	caddr_t zfodbase, oldaddr;
991 	size_t end, oldlen;
992 	size_t zfoddiff;
993 	label_t ljb;
994 	proc_t *p = ttoproc(curthread);
995 
996 	oldaddr = addr;
997 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
998 	if (len) {
999 		oldlen = len;
1000 		len += ((size_t)oldaddr - (size_t)addr);
1001 		oldoffset = offset;
1002 		offset = (off_t)((uintptr_t)offset & PAGEMASK);
1003 		if (page) {
1004 			spgcnt_t  prefltmem, availm, npages;
1005 			int preread;
1006 			uint_t mflag = MAP_PRIVATE | MAP_FIXED;
1007 
1008 			if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
1009 				mflag |= MAP_TEXT;
1010 			} else {
1011 				mflag |= MAP_INITDATA;
1012 			}
1013 
1014 			if (valid_usr_range(addr, len, prot, p->p_as,
1015 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1016 				error = ENOMEM;
1017 				goto bad;
1018 			}
1019 			if (error = VOP_MAP(vp, (offset_t)offset,
1020 			    p->p_as, &addr, len, prot, PROT_ALL,
1021 			    mflag, CRED(), NULL))
1022 				goto bad;
1023 
1024 			/*
1025 			 * If the segment can fit, then we prefault
1026 			 * the entire segment in.  This is based on the
1027 			 * model that says the best working set of a
1028 			 * small program is all of its pages.
1029 			 */
1030 			npages = (spgcnt_t)btopr(len);
1031 			prefltmem = freemem - desfree;
1032 			preread =
1033 			    (npages < prefltmem && len < PGTHRESH) ? 1 : 0;
1034 
1035 			/*
1036 			 * If we aren't prefaulting the segment,
1037 			 * increment "deficit", if necessary to ensure
1038 			 * that pages will become available when this
1039 			 * process starts executing.
1040 			 */
1041 			availm = freemem - lotsfree;
1042 			if (preread == 0 && npages > availm &&
1043 			    deficit < lotsfree) {
1044 				deficit += MIN((pgcnt_t)(npages - availm),
1045 				    lotsfree - deficit);
1046 			}
1047 
1048 			if (preread) {
1049 				TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
1050 				    "execmap preread:freemem %d size %lu",
1051 				    freemem, len);
1052 				(void) as_fault(p->p_as->a_hat, p->p_as,
1053 				    (caddr_t)addr, len, F_INVAL, S_READ);
1054 			}
1055 		} else {
1056 			if (valid_usr_range(addr, len, prot, p->p_as,
1057 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1058 				error = ENOMEM;
1059 				goto bad;
1060 			}
1061 
1062 			if (error = as_map(p->p_as, addr, len,
1063 			    segvn_create, zfod_argsp))
1064 				goto bad;
1065 			/*
1066 			 * Read in the segment in one big chunk.
1067 			 */
1068 			if (error = vn_rdwr(UIO_READ, vp, (caddr_t)oldaddr,
1069 			    oldlen, (offset_t)oldoffset, UIO_USERSPACE, 0,
1070 			    (rlim64_t)0, CRED(), (ssize_t *)0))
1071 				goto bad;
1072 			/*
1073 			 * Now set protections.
1074 			 */
1075 			if (prot != PROT_ZFOD) {
1076 				(void) as_setprot(p->p_as, (caddr_t)addr,
1077 				    len, prot);
1078 			}
1079 		}
1080 	}
1081 
1082 	if (zfodlen) {
1083 		struct as *as = curproc->p_as;
1084 		struct seg *seg;
1085 		uint_t zprot = 0;
1086 
1087 		end = (size_t)addr + len;
1088 		zfodbase = (caddr_t)roundup(end, PAGESIZE);
1089 		zfoddiff = (uintptr_t)zfodbase - end;
1090 		if (zfoddiff) {
1091 			/*
1092 			 * Before we go to zero the remaining space on the last
1093 			 * page, make sure we have write permission.
1094 			 */
1095 
1096 			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1097 			seg = as_segat(curproc->p_as, (caddr_t)end);
1098 			if (seg != NULL)
1099 				SEGOP_GETPROT(seg, (caddr_t)end, zfoddiff - 1,
1100 				    &zprot);
1101 			AS_LOCK_EXIT(as, &as->a_lock);
1102 
1103 			if (seg != NULL && (zprot & PROT_WRITE) == 0) {
1104 				(void) as_setprot(as, (caddr_t)end,
1105 				    zfoddiff - 1, zprot | PROT_WRITE);
1106 			}
1107 
1108 			if (on_fault(&ljb)) {
1109 				no_fault();
1110 				if (seg != NULL && (zprot & PROT_WRITE) == 0)
1111 					(void) as_setprot(as, (caddr_t)end,
1112 					    zfoddiff - 1, zprot);
1113 				error = EFAULT;
1114 				goto bad;
1115 			}
1116 			uzero((void *)end, zfoddiff);
1117 			no_fault();
1118 			if (seg != NULL && (zprot & PROT_WRITE) == 0)
1119 				(void) as_setprot(as, (caddr_t)end,
1120 				    zfoddiff - 1, zprot);
1121 		}
1122 		if (zfodlen > zfoddiff) {
1123 			struct segvn_crargs crargs =
1124 			    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
1125 
1126 			zfodlen -= zfoddiff;
1127 			if (valid_usr_range(zfodbase, zfodlen, prot, p->p_as,
1128 			    p->p_as->a_userlimit) != RANGE_OKAY) {
1129 				error = ENOMEM;
1130 				goto bad;
1131 			}
1132 			if (szc > 0) {
1133 				/*
1134 				 * ASSERT alignment because the mapelfexec()
1135 				 * caller for the szc > 0 case extended zfod
1136 				 * so it's end is pgsz aligned.
1137 				 */
1138 				size_t pgsz = page_get_pagesize(szc);
1139 				ASSERT(IS_P2ALIGNED(zfodbase + zfodlen, pgsz));
1140 
1141 				if (IS_P2ALIGNED(zfodbase, pgsz)) {
1142 					crargs.szc = szc;
1143 				} else {
1144 					crargs.szc = AS_MAP_HEAP;
1145 				}
1146 			} else {
1147 				crargs.szc = AS_MAP_NO_LPOOB;
1148 			}
1149 			if (error = as_map(p->p_as, (caddr_t)zfodbase,
1150 			    zfodlen, segvn_create, &crargs))
1151 				goto bad;
1152 			if (prot != PROT_ZFOD) {
1153 				(void) as_setprot(p->p_as, (caddr_t)zfodbase,
1154 				    zfodlen, prot);
1155 			}
1156 		}
1157 	}
1158 	return (0);
1159 bad:
1160 	return (error);
1161 }
1162 
1163 void
1164 setexecenv(struct execenv *ep)
1165 {
1166 	proc_t *p = ttoproc(curthread);
1167 	klwp_t *lwp = ttolwp(curthread);
1168 	struct vnode *vp;
1169 
1170 	p->p_bssbase = ep->ex_bssbase;
1171 	p->p_brkbase = ep->ex_brkbase;
1172 	p->p_brksize = ep->ex_brksize;
1173 	if (p->p_exec)
1174 		VN_RELE(p->p_exec);	/* out with the old */
1175 	vp = p->p_exec = ep->ex_vp;
1176 	if (vp != NULL)
1177 		VN_HOLD(vp);		/* in with the new */
1178 
1179 	lwp->lwp_sigaltstack.ss_sp = 0;
1180 	lwp->lwp_sigaltstack.ss_size = 0;
1181 	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
1182 }
1183 
1184 int
1185 execopen(struct vnode **vpp, int *fdp)
1186 {
1187 	struct vnode *vp = *vpp;
1188 	file_t *fp;
1189 	int error = 0;
1190 	int filemode = FREAD;
1191 
1192 	VN_HOLD(vp);		/* open reference */
1193 	if (error = falloc(NULL, filemode, &fp, fdp)) {
1194 		VN_RELE(vp);
1195 		*fdp = -1;	/* just in case falloc changed value */
1196 		return (error);
1197 	}
1198 	if (error = VOP_OPEN(&vp, filemode, CRED(), NULL)) {
1199 		VN_RELE(vp);
1200 		setf(*fdp, NULL);
1201 		unfalloc(fp);
1202 		*fdp = -1;
1203 		return (error);
1204 	}
1205 	*vpp = vp;		/* vnode should not have changed */
1206 	fp->f_vnode = vp;
1207 	mutex_exit(&fp->f_tlock);
1208 	setf(*fdp, fp);
1209 	return (0);
1210 }
1211 
1212 int
1213 execclose(int fd)
1214 {
1215 	return (closeandsetf(fd, NULL));
1216 }
1217 
1218 
1219 /*
1220  * noexec stub function.
1221  */
1222 /*ARGSUSED*/
1223 int
1224 noexec(
1225     struct vnode *vp,
1226     struct execa *uap,
1227     struct uarg *args,
1228     struct intpdata *idatap,
1229     int level,
1230     long *execsz,
1231     int setid,
1232     caddr_t exec_file,
1233     struct cred *cred)
1234 {
1235 	cmn_err(CE_WARN, "missing exec capability for %s", uap->fname);
1236 	return (ENOEXEC);
1237 }
1238 
1239 /*
1240  * Support routines for building a user stack.
1241  *
1242  * execve(path, argv, envp) must construct a new stack with the specified
1243  * arguments and environment variables (see exec_args() for a description
1244  * of the user stack layout).  To do this, we copy the arguments and
1245  * environment variables from the old user address space into the kernel,
1246  * free the old as, create the new as, and copy our buffered information
1247  * to the new stack.  Our kernel buffer has the following structure:
1248  *
1249  *	+-----------------------+ <--- stk_base + stk_size
1250  *	| string offsets	|
1251  *	+-----------------------+ <--- stk_offp
1252  *	|			|
1253  *	| STK_AVAIL() space	|
1254  *	|			|
1255  *	+-----------------------+ <--- stk_strp
1256  *	| strings		|
1257  *	+-----------------------+ <--- stk_base
1258  *
1259  * When we add a string, we store the string's contents (including the null
1260  * terminator) at stk_strp, and we store the offset of the string relative to
1261  * stk_base at --stk_offp.  As strings are added, stk_strp increases and
1262  * stk_offp decreases.  The amount of space remaining, STK_AVAIL(), is just
1263  * the difference between these pointers.  If we run out of space, we return
1264  * an error and exec_args() starts all over again with a buffer twice as large.
1265  * When we're all done, the kernel buffer looks like this:
1266  *
1267  *	+-----------------------+ <--- stk_base + stk_size
1268  *	| argv[0] offset	|
1269  *	+-----------------------+
1270  *	| ...			|
1271  *	+-----------------------+
1272  *	| argv[argc-1] offset	|
1273  *	+-----------------------+
1274  *	| envp[0] offset	|
1275  *	+-----------------------+
1276  *	| ...			|
1277  *	+-----------------------+
1278  *	| envp[envc-1] offset	|
1279  *	+-----------------------+
1280  *	| AT_SUN_PLATFORM offset|
1281  *	+-----------------------+
1282  *	| AT_SUN_EXECNAME offset|
1283  *	+-----------------------+ <--- stk_offp
1284  *	|			|
1285  *	| STK_AVAIL() space	|
1286  *	|			|
1287  *	+-----------------------+ <--- stk_strp
1288  *	| AT_SUN_EXECNAME string|
1289  *	+-----------------------+
1290  *	| AT_SUN_PLATFORM string|
1291  *	+-----------------------+
1292  *	| envp[envc-1] string	|
1293  *	+-----------------------+
1294  *	| ...			|
1295  *	+-----------------------+
1296  *	| envp[0] string	|
1297  *	+-----------------------+
1298  *	| argv[argc-1] string	|
1299  *	+-----------------------+
1300  *	| ...			|
1301  *	+-----------------------+
1302  *	| argv[0] string	|
1303  *	+-----------------------+ <--- stk_base
1304  */
1305 
/*
 * Number of bytes between the next free string byte (stk_strp) and the
 * lowest offset slot in use (stk_offp) in the kernel staging buffer.
 */
#define	STK_AVAIL(args)		((char *)(args)->stk_offp - (args)->stk_strp)
1307 
1308 /*
1309  * Add a string to the stack.
1310  */
1311 static int
1312 stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
1313 {
1314 	int error;
1315 	size_t len;
1316 
1317 	if (STK_AVAIL(args) < sizeof (int))
1318 		return (E2BIG);
1319 	*--args->stk_offp = args->stk_strp - args->stk_base;
1320 
1321 	if (segflg == UIO_USERSPACE) {
1322 		error = copyinstr(sp, args->stk_strp, STK_AVAIL(args), &len);
1323 		if (error != 0)
1324 			return (error);
1325 	} else {
1326 		len = strlen(sp) + 1;
1327 		if (len > STK_AVAIL(args))
1328 			return (E2BIG);
1329 		bcopy(sp, args->stk_strp, len);
1330 	}
1331 
1332 	args->stk_strp += len;
1333 
1334 	return (0);
1335 }
1336 
1337 static int
1338 stk_getptr(uarg_t *args, char *src, char **dst)
1339 {
1340 	int error;
1341 
1342 	if (args->from_model == DATAMODEL_NATIVE) {
1343 		ulong_t ptr;
1344 		error = fulword(src, &ptr);
1345 		*dst = (caddr_t)ptr;
1346 	} else {
1347 		uint32_t ptr;
1348 		error = fuword32(src, &ptr);
1349 		*dst = (caddr_t)(uintptr_t)ptr;
1350 	}
1351 	return (error);
1352 }
1353 
1354 static int
1355 stk_putptr(uarg_t *args, char *addr, char *value)
1356 {
1357 	if (args->to_model == DATAMODEL_NATIVE)
1358 		return (sulword(addr, (ulong_t)value));
1359 	else
1360 		return (suword32(addr, (uint32_t)(uintptr_t)value));
1361 }
1362 
1363 static int
1364 stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1365 {
1366 	char *sp;
1367 	int argc, error;
1368 	int argv_empty = 0;
1369 	size_t ptrsize = args->from_ptrsize;
1370 	size_t size, pad;
1371 	char *argv = (char *)uap->argp;
1372 	char *envp = (char *)uap->envp;
1373 
1374 	/*
1375 	 * Copy interpreter's name and argument to argv[0] and argv[1].
1376 	 */
1377 	if (intp != NULL && intp->intp_name != NULL) {
1378 		if ((error = stk_add(args, intp->intp_name, UIO_SYSSPACE)) != 0)
1379 			return (error);
1380 		if (intp->intp_arg != NULL &&
1381 		    (error = stk_add(args, intp->intp_arg, UIO_SYSSPACE)) != 0)
1382 			return (error);
1383 		if (args->fname != NULL)
1384 			error = stk_add(args, args->fname, UIO_SYSSPACE);
1385 		else
1386 			error = stk_add(args, uap->fname, UIO_USERSPACE);
1387 		if (error)
1388 			return (error);
1389 
1390 		/*
1391 		 * Check for an empty argv[].
1392 		 */
1393 		if (stk_getptr(args, argv, &sp))
1394 			return (EFAULT);
1395 		if (sp == NULL)
1396 			argv_empty = 1;
1397 
1398 		argv += ptrsize;		/* ignore original argv[0] */
1399 	}
1400 
1401 	if (argv_empty == 0) {
1402 		/*
1403 		 * Add argv[] strings to the stack.
1404 		 */
1405 		for (;;) {
1406 			if (stk_getptr(args, argv, &sp))
1407 				return (EFAULT);
1408 			if (sp == NULL)
1409 				break;
1410 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1411 				return (error);
1412 			argv += ptrsize;
1413 		}
1414 	}
1415 	argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1416 	args->arglen = args->stk_strp - args->stk_base;
1417 
1418 	/*
1419 	 * Add environ[] strings to the stack.
1420 	 */
1421 	if (envp != NULL) {
1422 		for (;;) {
1423 			if (stk_getptr(args, envp, &sp))
1424 				return (EFAULT);
1425 			if (sp == NULL)
1426 				break;
1427 			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1428 				return (error);
1429 			envp += ptrsize;
1430 		}
1431 	}
1432 	args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1433 	args->ne = args->na - argc;
1434 
1435 	/*
1436 	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
1437 	 * AT_SUN_EMULATOR strings to the stack.
1438 	 */
1439 	if (auxvpp != NULL && *auxvpp != NULL) {
1440 		if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
1441 			return (error);
1442 		if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0)
1443 			return (error);
1444 		if (args->brandname != NULL &&
1445 		    (error = stk_add(args, args->brandname, UIO_SYSSPACE)) != 0)
1446 			return (error);
1447 		if (args->emulator != NULL &&
1448 		    (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)
1449 			return (error);
1450 	}
1451 
1452 	/*
1453 	 * Compute the size of the stack.  This includes all the pointers,
1454 	 * the space reserved for the aux vector, and all the strings.
1455 	 * The total number of pointers is args->na (which is argc + envc)
1456 	 * plus 4 more: (1) a pointer's worth of space for argc; (2) the NULL
1457 	 * after the last argument (i.e. argv[argc]); (3) the NULL after the
1458 	 * last environment variable (i.e. envp[envc]); and (4) the NULL after
1459 	 * all the strings, at the very top of the stack.
1460 	 */
1461 	size = (args->na + 4) * args->to_ptrsize + args->auxsize +
1462 	    (args->stk_strp - args->stk_base);
1463 
1464 	/*
1465 	 * Pad the string section with zeroes to align the stack size.
1466 	 */
1467 	pad = P2NPHASE(size, args->stk_align);
1468 
1469 	if (STK_AVAIL(args) < pad)
1470 		return (E2BIG);
1471 
1472 	args->usrstack_size = size + pad;
1473 
1474 	while (pad-- != 0)
1475 		*args->stk_strp++ = 0;
1476 
1477 	args->nc = args->stk_strp - args->stk_base;
1478 
1479 	return (0);
1480 }
1481 
/*
 * Write the staged arguments out to the new user stack whose top is at
 * usrstack.
 *
 * Stores argc, the argv[] pointers and the envp[] pointers starting at
 * usrstack - args->usrstack_size, copies the whole string area out in
 * one copyout(), and, when *auxvpp is non-NULL, appends the AT_SUN_*
 * string entries to the aux vector being built there.  Along the way it
 * records u_argc, u_argv, u_envp and u_psargs in up, and the address
 * just past envp[]'s NULL in args->stackend.
 *
 * Returns 0 on success, or -1 if any store to the user stack fails.
 */
static int
stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
{
	size_t ptrsize = args->to_ptrsize;
	ssize_t pslen;
	char *kstrp = args->stk_base;
	/* user address of the string area: nc bytes plus one pointer below top */
	char *ustrp = usrstack - args->nc - ptrsize;
	/* lowest address of the new stack image, where argc is stored */
	char *usp = usrstack - args->usrstack_size;
	/*
	 * Offsets were stored downward from the top of the staging buffer
	 * (see stk_add()), so *--offp yields them in the order the strings
	 * were added.
	 */
	int *offp = (int *)(args->stk_base + args->stk_size);
	int envc = args->ne;
	int argc = args->na - envc;
	int i;

	/*
	 * Record argc for /proc.
	 */
	up->u_argc = argc;

	/*
	 * Put argc on the stack.  Note that even though it's an int,
	 * it always consumes ptrsize bytes (for alignment).
	 */
	if (stk_putptr(args, usp, (char *)(uintptr_t)argc))
		return (-1);

	/*
	 * Add argc space (ptrsize) to usp and record argv for /proc.
	 */
	up->u_argv = (uintptr_t)(usp += ptrsize);

	/*
	 * Put the argv[] pointers on the stack.
	 */
	for (i = 0; i < argc; i++, usp += ptrsize)
		if (stk_putptr(args, usp, &ustrp[*--offp]))
			return (-1);

	/*
	 * Copy arguments to u_psargs.  The NUL separators between the
	 * staged argument strings become spaces, and the rest of the
	 * PSARGSZ-byte array is zero-filled.
	 */
	pslen = MIN(args->arglen, PSARGSZ) - 1;
	for (i = 0; i < pslen; i++)
		up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]);
	while (i < PSARGSZ)
		up->u_psargs[i++] = '\0';

	/*
	 * Add space for argv[]'s NULL terminator (ptrsize) to usp and
	 * record envp for /proc.
	 */
	up->u_envp = (uintptr_t)(usp += ptrsize);

	/*
	 * Put the envp[] pointers on the stack.
	 */
	for (i = 0; i < envc; i++, usp += ptrsize)
		if (stk_putptr(args, usp, &ustrp[*--offp]))
			return (-1);

	/*
	 * Add space for envp[]'s NULL terminator (ptrsize) to usp and
	 * remember where the stack ends, which is also where auxv begins.
	 */
	args->stackend = usp += ptrsize;

	/*
	 * Put all the argv[], envp[], and auxv strings on the stack.
	 */
	if (copyout(args->stk_base, ustrp, args->nc))
		return (-1);

	/*
	 * Fill in the aux vector now that we know the user stack addresses
	 * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
	 * AT_SUN_EMULATOR strings.  The offsets are consumed in the same
	 * order in which stk_copyin() added the corresponding strings.
	 */
	if (auxvpp != NULL && *auxvpp != NULL) {
		if (args->to_model == DATAMODEL_NATIVE) {
			auxv_t **a = (auxv_t **)auxvpp;
			ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp])
			ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp])
			if (args->brandname != NULL)
				ADDAUX(*a,
				    AT_SUN_BRANDNAME, (long)&ustrp[*--offp])
			if (args->emulator != NULL)
				ADDAUX(*a,
				    AT_SUN_EMULATOR, (long)&ustrp[*--offp])
		} else {
			auxv32_t **a = (auxv32_t **)auxvpp;
			ADDAUX(*a,
			    AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp])
			ADDAUX(*a,
			    AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp])
			if (args->brandname != NULL)
				ADDAUX(*a, AT_SUN_BRANDNAME,
				    (int)(uintptr_t)&ustrp[*--offp])
			if (args->emulator != NULL)
				ADDAUX(*a, AT_SUN_EMULATOR,
				    (int)(uintptr_t)&ustrp[*--offp])
		}
	}

	return (0);
}
1586 
/*
 * Initialize a new user stack with the specified arguments and environment.
 * The initial user stack layout is as follows:
 *
 *	User Stack
 *	+---------------+ <--- curproc->p_usrstack
 *	|		|
 *	| slew		|
 *	|		|
 *	+---------------+
 *	| NULL		|
 *	+---------------+
 *	|		|
 *	| auxv strings	|
 *	|		|
 *	+---------------+
 *	|		|
 *	| envp strings	|
 *	|		|
 *	+---------------+
 *	|		|
 *	| argv strings	|
 *	|		|
 *	+---------------+ <--- ustrp
 *	|		|
 *	| aux vector	|
 *	|		|
 *	+---------------+ <--- auxv
 *	| NULL		|
 *	+---------------+
 *	| envp[envc-1]	|
 *	+---------------+
 *	| ...		|
 *	+---------------+
 *	| envp[0]	|
 *	+---------------+ <--- envp[]
 *	| NULL		|
 *	+---------------+
 *	| argv[argc-1]	|
 *	+---------------+
 *	| ...		|
 *	+---------------+
 *	| argv[0]	|
 *	+---------------+ <--- argv[]
 *	| argc		|
 *	+---------------+ <--- stack base
 *
 * The work happens in two phases.  First the strings are staged in a
 * kernel buffer by stk_copyin(); failures in this phase (the stk_copyin()
 * error, E2BIG, or the exitlwps() error) are returned as errno values and
 * the staging buffer is freed.  Then the old address space is released
 * with relvm(), a new one is allocated, and stk_copyout() writes the
 * stack; from that point the return value is stk_copyout()'s, i.e. 0 on
 * success or -1, which the caller must treat as fatal to the process.
 */
int
exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
{
	size_t size;
	int error;
	proc_t *p = ttoproc(curthread);
	user_t *up = PTOU(p);
	char *usrstack;
	rctl_entity_p_t e;
	struct as *as;
	extern int use_stk_lpg;
	size_t sp_slew;

	/*
	 * Pointers are read from the old image using the current process
	 * model and written to the new image using args->to_model.
	 */
	args->from_model = p->p_model;
	if (p->p_model == DATAMODEL_NATIVE) {
		args->from_ptrsize = sizeof (long);
	} else {
		args->from_ptrsize = sizeof (int32_t);
	}

	if (args->to_model == DATAMODEL_NATIVE) {
		args->to_ptrsize = sizeof (long);
		args->ncargs = NCARGS;
		args->stk_align = STACK_ALIGN;
		usrstack = (char *)USRSTACK;
	} else {
		args->to_ptrsize = sizeof (int32_t);
		args->ncargs = NCARGS32;
		args->stk_align = STACK_ALIGN32;
		usrstack = (char *)USRSTACK32;
	}

	ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);

#if defined(__sparc)
	/*
	 * Make sure user register windows are empty before
	 * attempting to make a new stack.
	 */
	(void) flush_user_windows_to_stack(NULL);
#endif

	/*
	 * Stage the strings in a kernel buffer, starting at one page and
	 * doubling each time stk_copyin() runs out of room (E2BIG or
	 * ENAMETOOLONG).  Give up with E2BIG once a buffer of at least
	 * ncargs bytes has proven too small; any other error is returned
	 * as is.
	 */
	for (size = PAGESIZE; ; size *= 2) {
		args->stk_size = size;
		args->stk_base = kmem_alloc(size, KM_SLEEP);
		args->stk_strp = args->stk_base;
		args->stk_offp = (int *)(args->stk_base + size);
		error = stk_copyin(uap, args, intp, auxvpp);
		if (error == 0)
			break;
		kmem_free(args->stk_base, size);
		if (error != E2BIG && error != ENAMETOOLONG)
			return (error);
		if (size >= args->ncargs)
			return (E2BIG);
	}

	/* from here on, size is the size of the user stack image */
	size = args->usrstack_size;

	ASSERT(error == 0);
	ASSERT(P2PHASE(size, args->stk_align) == 0);
	ASSERT((ssize_t)STK_AVAIL(args) >= 0);

	if (size > args->ncargs) {
		kmem_free(args->stk_base, args->stk_size);
		return (E2BIG);
	}

	/*
	 * Leave only the current lwp and force the other lwps to exit.
	 * If another lwp beat us to the punch by calling exit(), bail out.
	 */
	if ((error = exitlwps(0)) != 0) {
		kmem_free(args->stk_base, args->stk_size);
		return (error);
	}

	/*
	 * Revoke any doors created by the process.
	 */
	if (p->p_door_list)
		door_exit();

	/*
	 * Release schedctl data structures.
	 */
	if (p->p_pagep)
		schedctl_proc_cleanup();

	/*
	 * Clean up any DTrace helpers for the process.
	 */
	if (p->p_dtrace_helpers != NULL) {
		ASSERT(dtrace_helpers_cleanup != NULL);
		(*dtrace_helpers_cleanup)();
	}

	mutex_enter(&p->p_lock);
	/*
	 * Cleanup the DTrace provider associated with this process.
	 */
	if (p->p_dtrace_probes) {
		ASSERT(dtrace_fasttrap_exec_ptr != NULL);
		dtrace_fasttrap_exec_ptr(p);
	}
	mutex_exit(&p->p_lock);

	/*
	 * discard the lwpchan cache.
	 */
	if (p->p_lcp != NULL)
		lwpchan_destroy_cache(1);

	/*
	 * Delete the POSIX timers.
	 */
	if (p->p_itimer != NULL)
		timer_exit();

	/*
	 * Hand the staged argument strings (the first arglen bytes of the
	 * buffer), the strings that follow them, and the argument and
	 * environment counts to the audit subsystem.
	 */
	if (audit_active)
		audit_exec(args->stk_base, args->stk_base + args->arglen,
		    args->na - args->ne, args->ne);

	/*
	 * Ensure that we don't change resource associations while we
	 * change address spaces.
	 */
	mutex_enter(&p->p_lock);
	pool_barrier_enter();
	mutex_exit(&p->p_lock);

	/*
	 * Destroy the old address space and create a new one.
	 * From here on, any errors are fatal to the exec()ing process.
	 * On error we return -1, which means the caller must SIGKILL
	 * the process.
	 */
	relvm();

	mutex_enter(&p->p_lock);
	pool_barrier_exit();
	mutex_exit(&p->p_lock);

	up->u_execsw = args->execswp;

	/*
	 * Reset the per-process heap and stack bookkeeping for the new
	 * image and switch the process to the target data model.
	 */
	p->p_brkbase = NULL;
	p->p_brksize = 0;
	p->p_brkpageszc = 0;
	p->p_stksize = 0;
	p->p_stkpageszc = 0;
	p->p_model = args->to_model;
	p->p_usrstack = usrstack;
	p->p_stkprot = args->stk_prot;
	p->p_datprot = args->dat_prot;

	/*
	 * Reset resource controls such that all controls are again active as
	 * well as appropriate to the potentially new address model for the
	 * process.
	 */
	e.rcep_p.proc = p;
	e.rcep_t = RCENTITY_PROCESS;
	rctl_set_reset(p->p_rctls, p, &e);

	/* Too early to call map_pgsz for the heap */
	if (use_stk_lpg) {
		p->p_stkpageszc = page_szc(map_pgsz(MAPPGSZ_STK, p, 0, 0, 0));
	}

	mutex_enter(&p->p_lock);
	p->p_flag |= SAUTOLPG;	/* kernel controls page sizes */
	mutex_exit(&p->p_lock);

	/*
	 * Some platforms may choose to randomize real stack start by adding a
	 * small slew (not more than a few hundred bytes) to the top of the
	 * stack. This helps avoid cache thrashing when identical processes
	 * simultaneously share caches that don't provide enough associativity
	 * (e.g. sun4v systems). In this case stack slewing makes the same hot
	 * stack variables in different processes to live in different cache
	 * sets increasing effective associativity.
	 */
	sp_slew = exec_get_spslew();
	ASSERT(P2PHASE(sp_slew, args->stk_align) == 0);
	exec_set_sp(size + sp_slew);

	/*
	 * Allocate the new address space and attach it to the process;
	 * a 32-bit process gets the 32-bit user limit.
	 */
	as = as_alloc();
	p->p_as = as;
	as->a_proc = p;
	if (p->p_model == DATAMODEL_ILP32)
		as->a_userlimit = (caddr_t)USERLIMIT32;
	(void) hat_setup(as->a_hat, HAT_ALLOC);
	hat_join_srd(as->a_hat, args->ex_vp);

	/*
	 * Finally, write out the contents of the new stack.  The staging
	 * buffer is freed whether or not the copyout succeeds.
	 */
	error = stk_copyout(args, usrstack - sp_slew, auxvpp, up);
	kmem_free(args->stk_base, args->stk_size);
	return (error);
}
1835