1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /* Copyright (c) 1988 AT&T */
27 /* All Rights Reserved */
28 /*
29 * Copyright 2015 Garrett D'Amore <garrett@damore.org>
30 * Copyright 2019 Joyent, Inc.
31 * Copyright 2024 Oxide Computer Company
32 */
33
34 #include <sys/types.h>
35 #include <sys/param.h>
36 #include <sys/sysmacros.h>
37 #include <sys/systm.h>
38 #include <sys/signal.h>
39 #include <sys/cred_impl.h>
40 #include <sys/policy.h>
41 #include <sys/user.h>
42 #include <sys/errno.h>
43 #include <sys/file.h>
44 #include <sys/vfs.h>
45 #include <sys/vnode.h>
46 #include <sys/mman.h>
47 #include <sys/acct.h>
48 #include <sys/cpuvar.h>
49 #include <sys/proc.h>
50 #include <sys/cmn_err.h>
51 #include <sys/debug.h>
52 #include <sys/pathname.h>
53 #include <sys/vm.h>
54 #include <sys/lgrp.h>
55 #include <sys/vtrace.h>
56 #include <sys/exec.h>
57 #include <sys/execx.h>
58 #include <sys/exechdr.h>
59 #include <sys/kmem.h>
60 #include <sys/prsystm.h>
61 #include <sys/modctl.h>
62 #include <sys/vmparam.h>
63 #include <sys/door.h>
64 #include <sys/schedctl.h>
65 #include <sys/utrap.h>
66 #include <sys/systeminfo.h>
67 #include <sys/stack.h>
68 #include <sys/rctl.h>
69 #include <sys/dtrace.h>
70 #include <sys/lwpchan_impl.h>
71 #include <sys/pool.h>
72 #include <sys/sdt.h>
73 #include <sys/brand.h>
74 #include <sys/klpd.h>
75 #include <sys/random.h>
76
77 #include <c2/audit.h>
78
79 #include <vm/hat.h>
80 #include <vm/anon.h>
81 #include <vm/as.h>
82 #include <vm/seg.h>
83 #include <vm/seg_vn.h>
84 #include <vm/seg_hole.h>
85
86 #define PRIV_RESET 0x01 /* needs to reset privs */
87 #define PRIV_SETID 0x02 /* needs to change uids */
88 #define PRIV_SETUGID 0x04 /* is setuid/setgid/forced privs */
89 #define PRIV_INCREASE 0x08 /* child runs with more privs */
90 #define MAC_FLAGS 0x10 /* need to adjust MAC flags */
91 #define PRIV_FORCED 0x20 /* has forced privileges */
92
93 static int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *,
94 priv_set_t *, cred_t *, const char *);
95 static int hold_execsw(struct execsw *);
96
97 uint_t auxv_hwcap = 0; /* auxv AT_SUN_HWCAP value; determined on the fly */
98 uint_t auxv_hwcap_2 = 0; /* AT_SUN_HWCAP2 */
99 uint_t auxv_hwcap_3 = 0; /* AT_SUN_HWCAP3 */
100 #if defined(_SYSCALL32_IMPL)
101 uint_t auxv_hwcap32 = 0; /* 32-bit version of auxv_hwcap */
102 uint_t auxv_hwcap32_2 = 0; /* 32-bit version of auxv_hwcap2 */
103 uint_t auxv_hwcap32_3 = 0; /* 32-bit version of auxv_hwcap3 */
104 #endif
105
106 #define PSUIDFLAGS (SNOCD|SUGID)
107
108 /*
109 * These are consumed within the specific exec modules, but are defined here
110 * because
111 *
112 * 1) The exec modules are unloadable, which would make this near useless.
113 *
114 * 2) We want them to be common across all of them, should more than ELF come
115 * to support them.
116 *
117 * All must be powers of 2.
118 */
119 size_t aslr_max_brk_skew = 16 * 1024 * 1024; /* 16MB */
120 #pragma weak exec_stackgap = aslr_max_stack_skew /* Old, compatible name */
121 size_t aslr_max_stack_skew = 64 * 1024; /* 64KB */
122
123 /*
124 * Size of guard segment for 64-bit processes and minimum size it can be shrunk
125 * to in the case of grow() operations. These are kept as variables in case
126 * they need to be tuned in an emergency.
127 */
128 size_t stack_guard_seg_sz = 256 * 1024 * 1024;
129 size_t stack_guard_min_sz = 64 * 1024 * 1024;
130
131 /*
132 * exece() - system call wrapper around exec_common()
133 */
int
exece(uintptr_t file, const char **argp, const char **envp, int flags)
{
	int error;

	/* EXEC_DESCRIPTOR is currently the only valid flag. */
	if ((flags & ~EXEC_DESCRIPTOR) != 0)
		return (set_errno(EINVAL));

	if ((flags & EXEC_DESCRIPTOR) != 0) {
		/*
		 * If EXEC_DESCRIPTOR is specified, then the `file`
		 * parameter is the number of a file descriptor in the current
		 * process.
		 */
		char *path = NULL;
		size_t allocsize;
		int fd = (int)file;
		vnode_t *vp = NULL;

		/* Translate the descriptor into a held vnode. */
		if ((error = fgetstartvp(fd, NULL, &vp)) != 0)
			return (set_errno(error));

		/*
		 * Copy the cached vnode path (if any) while holding v_lock;
		 * KM_NOSLEEP is used because we must not block here.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_path != NULL && vp->v_path != vn_vpath_empty) {
			allocsize = strlen(vp->v_path) + 1;
			path = kmem_alloc(allocsize, KM_NOSLEEP);
			if (path == NULL) {
				mutex_exit(&vp->v_lock);
				VN_RELE(vp);
				return (set_errno(ENOMEM));
			}
			bcopy(vp->v_path, path, allocsize);
		}
		mutex_exit(&vp->v_lock);

		/*
		 * In the unlikely event that the descriptor's path is not
		 * cached, we fall back to using a constructed one.
		 */
		if (path == NULL) {
			/* 8 for "/dev/fd/", up to 11 for %d, + \0 == 20 */
			allocsize = 20;
			path = kmem_alloc(allocsize, KM_NOSLEEP);
			if (path == NULL) {
				VN_RELE(vp);
				return (set_errno(ENOMEM));
			}
			(void) snprintf(path, allocsize, "/dev/fd/%d", fd);
		}

		/*
		 * exec_common() does not consume our vnode hold; release it
		 * (and the path buffer) once the exec attempt completes.
		 */
		error = exec_common(path, argp, envp, vp, EBA_NONE);
		VN_RELE(vp);
		kmem_free(path, allocsize);
	} else {
		/* Traditional exec: `file` is a user-space pathname. */
		const char *fname = (const char *)file;

		error = exec_common(fname, argp, envp, NULL, EBA_NONE);
	}

	return (error ? (set_errno(error)) : 0);
}
195
/*
 * exec_common() - common processing for all variants of exec().
 *
 * When vp is NULL, fname is a user-space path that is looked up here.
 * When vp is non-NULL (exec by file descriptor), the caller supplies the
 * vnode and fname is a kernel-space path string; the vnode's reference
 * count is unchanged across the call.  brand_action is EBA_NONE,
 * EBA_NATIVE, or a brand-specific action directing (un)branding of the
 * process across the exec.
 *
 * Returns 0 on success or an errno value; callers are responsible for
 * set_errno().
 */
int
exec_common(const char *fname, const char **argp, const char **envp,
    vnode_t *vp, int brand_action)
{
	vnode_t *dir = NULL, *tmpvp = NULL;
	proc_t *p = ttoproc(curthread);
	klwp_t *lwp = ttolwp(curthread);
	struct user *up = PTOU(p);
	size_t execsz;		/* temporary count of exec size */
	int i;
	int error;
	char exec_file[MAXCOMLEN+1];
	struct pathname pn;
	struct pathname resolvepn;
	struct uarg args;
	struct execa ua;
	k_sigset_t savedmask;
	lwpdir_t *lwpdir = NULL;
	tidhash_t *tidhash;
	lwpdir_t *old_lwpdir = NULL;
	uint_t old_lwpdir_sz;
	tidhash_t *old_tidhash;
	uint_t old_tidhash_sz;
	ret_tidhash_t *ret_tidhash;
	lwpent_t *lep;
	boolean_t brandme = B_FALSE;

	/*
	 * exec() is not supported for the /proc agent lwp.
	 */
	if (curthread == p->p_agenttp)
		return (ENOTSUP);

	if (brand_action != EBA_NONE) {
		/*
		 * Brand actions are not supported for processes that are not
		 * running in a branded zone.
		 */
		if (!ZONE_IS_BRANDED(p->p_zone))
			return (ENOTSUP);

		if (brand_action == EBA_NATIVE) {
			/* Only branded processes can be unbranded */
			if (!PROC_IS_BRANDED(p))
				return (ENOTSUP);
		} else {
			/* Only unbranded processes can be branded */
			if (PROC_IS_BRANDED(p))
				return (ENOTSUP);
			brandme = B_TRUE;
		}
	} else {
		/*
		 * If this is a native zone, or if the process is already
		 * branded, then we don't need to do anything.  If this is
		 * a native process in a branded zone, we need to brand the
		 * process as it exec()s the new binary.
		 */
		if (ZONE_IS_BRANDED(p->p_zone) && !PROC_IS_BRANDED(p))
			brandme = B_TRUE;
	}

	/*
	 * Inform /proc that an exec() has started.
	 * Hold signals that are ignored by default so that we will
	 * not be interrupted by a signal that will be ignored after
	 * successful completion of gexec().
	 */
	mutex_enter(&p->p_lock);
	prexecstart();
	schedctl_finish_sigblock(curthread);
	savedmask = curthread->t_hold;
	sigorset(&curthread->t_hold, &ignoredefault);
	mutex_exit(&p->p_lock);

	if (vp != NULL) {
		/*
		 * When a vnode is passed in we take an extra hold here and
		 * release it before returning. This means that callers don't
		 * need to account for the reference changing over the call.
		 */
		VN_HOLD(vp);
		pn_alloc(&pn);
		pn_alloc(&resolvepn);
		VERIFY0(pn_set(&pn, fname));
		VERIFY0(pn_set(&resolvepn, fname));
	} else {
		/*
		 * Look up path name and remember last component for later.
		 * To help coreadm expand its %d token, we attempt to save
		 * the directory containing the executable in p_execdir. The
		 * first call to lookuppn() may fail and return EINVAL because
		 * dirvpp is non-NULL. In that case, we make a second call to
		 * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
		 * but coreadm is allowed to expand %d to the empty string and
		 * there are other cases in which that failure may occur.
		 */
		if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
			goto out;
		pn_alloc(&resolvepn);
		error = lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp);
		if (error != 0) {
			pn_free(&resolvepn);
			pn_free(&pn);
			if (error != EINVAL)
				goto out;

			dir = NULL;
			if ((error = pn_get((char *)fname, UIO_USERSPACE,
			    &pn)) != 0) {
				goto out;
			}
			pn_alloc(&resolvepn);
			if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
			    &vp)) != 0) {
				pn_free(&resolvepn);
				pn_free(&pn);
				goto out;
			}
		}
	}

	if (vp == NULL) {
		if (dir != NULL)
			VN_RELE(dir);
		error = ENOENT;
		pn_free(&resolvepn);
		pn_free(&pn);
		goto out;
	}

	if ((error = secpolicy_basic_exec(CRED(), vp)) != 0) {
		if (dir != NULL)
			VN_RELE(dir);
		pn_free(&resolvepn);
		pn_free(&pn);
		VN_RELE(vp);
		goto out;
	}

	/*
	 * We do not allow executing files in attribute directories.
	 * We test this by determining whether the resolved path
	 * contains a "/" when we're in an attribute directory;
	 * only if the pathname does not contain a "/" the resolved path
	 * points to a file in the current working (attribute) directory.
	 */
	if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
	    strchr(resolvepn.pn_path, '/') == NULL) {
		if (dir != NULL)
			VN_RELE(dir);
		error = EACCES;
		pn_free(&resolvepn);
		pn_free(&pn);
		VN_RELE(vp);
		goto out;
	}

	/* Remember the last path component for u_comm / accounting. */
	bzero(exec_file, MAXCOMLEN+1);
	(void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
	bzero(&args, sizeof (args));
	args.pathname = resolvepn.pn_path;
	/* don't free resolvepn until we are done with args */
	pn_free(&pn);

	/*
	 * If we're running in a profile shell, then call pfexecd.
	 */
	if ((CR_FLAGS(p->p_cred) & PRIV_PFEXEC) != 0) {
		error = pfexec_call(p->p_cred, &resolvepn, &args.pfcred,
		    &args.scrubenv);

		/* Returning errno in case we're not allowed to execute. */
		if (error > 0) {
			if (dir != NULL)
				VN_RELE(dir);
			pn_free(&resolvepn);
			VN_RELE(vp);
			goto out;
		}

		/* Don't change the credentials when using old ptrace. */
		if (args.pfcred != NULL &&
		    (p->p_proc_flag & P_PR_PTRACE) != 0) {
			crfree(args.pfcred);
			args.pfcred = NULL;
			args.scrubenv = B_FALSE;
		}
	}

	/*
	 * Specific exec handlers, or policies determined via
	 * /etc/system may override the historical default.
	 */
	args.stk_prot = PROT_ZFOD;
	args.dat_prot = PROT_ZFOD;

	CPU_STATS_ADD_K(sys, sysexec, 1);
	DTRACE_PROC1(exec, char *, args.pathname);

	ua.fname = fname;
	ua.argp = argp;
	ua.envp = envp;

	/* If necessary, brand this process before we start the exec. */
	if (brandme)
		brand_setbrand(p);

	if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
	    exec_file, p->p_cred, brand_action)) != 0) {
		if (brandme)
			brand_clearbrand(p, B_FALSE);
		VN_RELE(vp);
		if (dir != NULL)
			VN_RELE(dir);
		pn_free(&resolvepn);
		goto fail;
	}

	/*
	 * Free floating point registers (sun4u only)
	 */
	ASSERT(lwp != NULL);
	lwp_freeregs(lwp, 1);

	/*
	 * Free thread and process context ops.
	 */
	if (curthread->t_ctx)
		freectx(curthread, 1);
	if (p->p_pctx)
		freepctx(p, 1);

	/*
	 * Remember file name for accounting; clear any cached DTrace predicate.
	 */
	up->u_acflag &= ~AFORK;
	bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
	curthread->t_predcache = 0;

	/*
	 * Clear contract template state
	 */
	lwp_ctmpl_clear(lwp);

	/*
	 * Save the directory in which we found the executable for expanding
	 * the %d token used in core file patterns.
	 */
	mutex_enter(&p->p_lock);
	tmpvp = p->p_execdir;
	p->p_execdir = dir;
	if (p->p_execdir != NULL)
		VN_HOLD(p->p_execdir);
	mutex_exit(&p->p_lock);

	if (tmpvp != NULL)
		VN_RELE(tmpvp);

	/*
	 * Reset stack state to the user stack, clear set of signals
	 * caught on the signal stack, and reset list of signals that
	 * restart system calls; the new program's environment should
	 * not be affected by detritus from the old program.  Any
	 * pending held signals remain held, so don't clear t_hold.
	 */
	mutex_enter(&p->p_lock);
	lwp->lwp_oldcontext = 0;
	lwp->lwp_ustack = 0;
	lwp->lwp_old_stk_ctl = 0;
	sigemptyset(&up->u_signodefer);
	sigemptyset(&up->u_sigonstack);
	sigemptyset(&up->u_sigresethand);
	lwp->lwp_sigaltstack.ss_sp = 0;
	lwp->lwp_sigaltstack.ss_size = 0;
	lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;

	/*
	 * Make saved resource limit == current resource limit.
	 */
	for (i = 0; i < RLIM_NLIMITS; i++) {
		/*CONSTCOND*/
		if (RLIM_SAVED(i)) {
			(void) rctl_rlimit_get(rctlproc_legacy[i], p,
			    &up->u_saved_rlimit[i]);
		}
	}

	/*
	 * If the action was to catch the signal, then the action
	 * must be reset to SIG_DFL.
	 */
	sigdefault(p);
	p->p_flag &= ~(SNOWAIT|SJCTL);
	p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
	up->u_signal[SIGCLD - 1] = SIG_DFL;

	/*
	 * Delete the dot4 sigqueues/signotifies.
	 */
	sigqfree(p);

	mutex_exit(&p->p_lock);

	/* Discard any profil(2) profiling state. */
	mutex_enter(&p->p_pflock);
	p->p_prof.pr_base = NULL;
	p->p_prof.pr_size = 0;
	p->p_prof.pr_off = 0;
	p->p_prof.pr_scale = 0;
	p->p_prof.pr_samples = 0;
	mutex_exit(&p->p_pflock);

	ASSERT(curthread->t_schedctl == NULL);

#if defined(__sparc)
	if (p->p_utraps != NULL)
		utrap_free(p);
#endif	/* __sparc */

	/*
	 * Close all close-on-exec files.
	 */
	close_exec(P_FINFO(p));
	TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);

	/* Unbrand ourself if necessary. */
	if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE))
		brand_clearbrand(p, B_FALSE);

	setregs(&args);

	/* Mark this as an executable vnode */
	mutex_enter(&vp->v_lock);
	vp->v_flag |= VVMEXEC;
	mutex_exit(&vp->v_lock);

	VN_RELE(vp);
	if (dir != NULL)
		VN_RELE(dir);
	pn_free(&resolvepn);

	/*
	 * Allocate a new lwp directory and lwpid hash table if necessary.
	 */
	if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
		lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
		lwpdir->ld_next = lwpdir + 1;
		tidhash = kmem_zalloc(2 * sizeof (tidhash_t), KM_SLEEP);
		if (p->p_lwpdir != NULL)
			lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
		else
			lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
	}

	if (PROC_IS_BRANDED(p))
		BROP(p)->b_exec();

	mutex_enter(&p->p_lock);
	prbarrier(p);

	/*
	 * Reset lwp id to the default value of 1.
	 * This is a single-threaded process now
	 * and lwp #1 is lwp_wait()able by default.
	 * The t_unpark flag should not be inherited.
	 */
	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
	curthread->t_tid = 1;
	kpreempt_disable();
	ASSERT(curthread->t_lpl != NULL);
	p->p_t1_lgrpid = curthread->t_lpl->lpl_lgrpid;
	kpreempt_enable();
	if (p->p_tr_lgrpid != LGRP_NONE && p->p_tr_lgrpid != p->p_t1_lgrpid) {
		lgrp_update_trthr_migrations(1);
	}
	curthread->t_unpark = 0;
	curthread->t_proc_flag |= TP_TWAIT;
	curthread->t_proc_flag &= ~TP_DAEMON;	/* daemons shouldn't exec */
	p->p_lwpdaemon = 0;			/* but oh well ... */
	p->p_lwpid = 1;

	/*
	 * Install the newly-allocated lwp directory and lwpid hash table
	 * and insert the current thread into the new hash table.
	 */
	if (lwpdir != NULL) {
		old_lwpdir = p->p_lwpdir;
		old_lwpdir_sz = p->p_lwpdir_sz;
		old_tidhash = p->p_tidhash;
		old_tidhash_sz = p->p_tidhash_sz;
		p->p_lwpdir = p->p_lwpfree = lwpdir;
		p->p_lwpdir_sz = 2;
		lep->le_thread = curthread;
		lep->le_lwpid = curthread->t_tid;
		lep->le_start = curthread->t_start;
		lwp_hash_in(p, lep, tidhash, 2, 0);
		p->p_tidhash = tidhash;
		p->p_tidhash_sz = 2;
	}
	ret_tidhash = p->p_ret_tidhash;
	p->p_ret_tidhash = NULL;

	/*
	 * Restore the saved signal mask and
	 * inform /proc that the exec() has finished.
	 */
	curthread->t_hold = savedmask;
	prexecend();
	mutex_exit(&p->p_lock);

	/* Free the replaced lwp directory/hash tables outside of p_lock. */
	if (old_lwpdir) {
		kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
		kmem_free(old_tidhash, old_tidhash_sz * sizeof (tidhash_t));
	}
	while (ret_tidhash != NULL) {
		ret_tidhash_t *next = ret_tidhash->rth_next;
		kmem_free(ret_tidhash->rth_tidhash,
		    ret_tidhash->rth_tidhash_sz * sizeof (tidhash_t));
		kmem_free(ret_tidhash, sizeof (*ret_tidhash));
		ret_tidhash = next;
	}

	ASSERT(error == 0);
	DTRACE_PROC(exec__success);
	return (0);

fail:
	DTRACE_PROC1(exec__failure, int, error);
out:		/* error return */
	mutex_enter(&p->p_lock);
	curthread->t_hold = savedmask;
	prexecend();
	mutex_exit(&p->p_lock);
	ASSERT(error != 0);
	return (error);
}
631
632
633 /*
634 * Perform generic exec duties and switchout to object-file specific
635 * handler.
636 */
/*
 * Arguments:
 *   vpp		vnode of the executable; VOP_OPEN() may replace it
 *   uap		user argument block (fname/argp/envp)
 *   args		exec arguments shared with the format handlers
 *   idatap		interpreter data (NULL at level 0; presumably set
 *			on recursive calls from script handlers — see the
 *			exec modules)
 *   level		recursion depth; 0 for the top-level executable
 *   execsz		running count of the exec'ed image size
 *   exec_file		file name, for diagnostics
 *   cred		credentials to exec with
 *   brand_action	EBA_* branding directive, passed through
 *
 * Returns 0 on success or an errno value (ENOEXEC if no handler matched).
 */
int
gexec(
	struct vnode **vpp,
	struct execa *uap,
	struct uarg *args,
	struct intpdata *idatap,
	int level,
	size_t *execsz,
	caddr_t exec_file,
	struct cred *cred,
	int brand_action)
{
	struct vnode *vp, *execvp = NULL;
	proc_t *pp = ttoproc(curthread);
	struct execsw *eswp;
	int error = 0;
	int suidflags = 0;
	ssize_t resid;
	uid_t uid, gid;
	struct vattr vattr;
	char magbuf[MAGIC_BYTES];
	int setid;
	cred_t *oldcred, *newcred = NULL;
	int privflags = 0;
	int setidfl;
	priv_set_t fset;
	secflagset_t old_secflags;

	secflags_copy(&old_secflags, &pp->p_secflags.psf_effective);

	/*
	 * If the SNOCD or SUGID flag is set, turn it off and remember the
	 * previous setting so we can restore it if we encounter an error.
	 */
	if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
		mutex_enter(&pp->p_lock);
		suidflags = pp->p_flag & PSUIDFLAGS;
		pp->p_flag &= ~PSUIDFLAGS;
		mutex_exit(&pp->p_lock);
	}

	if ((error = execpermissions(*vpp, &vattr, args)) != 0)
		goto bad_noclose;

	/* need to open vnode for stateful file systems */
	if ((error = VOP_OPEN(vpp, FREAD, CRED(), NULL)) != 0)
		goto bad_noclose;
	vp = *vpp;

	/*
	 * Note: to support binary compatibility with SunOS a.out
	 * executables, we read in the first four bytes, as the
	 * magic number is in bytes 2-3.
	 */
	if (error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
	    (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid))
		goto bad;
	if (resid != 0)
		goto bad;

	if ((eswp = findexec_by_hdr(magbuf)) == NULL)
		goto bad;

	/*
	 * Compute any set-id/privilege changes; only at the top level, and
	 * only if execsetid() says the credentials must change.
	 */
	if (level == 0 &&
	    (privflags = execsetid(vp, &vattr, &uid, &gid, &fset,
	    args->pfcred == NULL ? cred : args->pfcred, args->pathname)) != 0) {

		/* Pfcred is a credential with a ref count of 1 */

		if (args->pfcred != NULL) {
			privflags |= PRIV_INCREASE|PRIV_RESET;
			newcred = cred = args->pfcred;
		} else {
			newcred = cred = crdup(cred);
		}

		/* If we can, drop the PA bit */
		if ((privflags & PRIV_RESET) != 0)
			priv_adjust_PA(cred);

		if (privflags & PRIV_SETID) {
			cred->cr_uid = uid;
			cred->cr_gid = gid;
			cred->cr_suid = uid;
			cred->cr_sgid = gid;
		}

		if (privflags & MAC_FLAGS) {
			if (!(CR_FLAGS(cred) & NET_MAC_AWARE_INHERIT))
				CR_FLAGS(cred) &= ~NET_MAC_AWARE;
			CR_FLAGS(cred) &= ~NET_MAC_AWARE_INHERIT;
		}

		/*
		 * Implement the privilege updates:
		 *
		 * Restrict with L:
		 *
		 *	I' = I & L
		 *
		 *	E' = P' = (I' + F) & A
		 *
		 * But if running under ptrace, we cap I and F with P.
		 */
		if ((privflags & (PRIV_RESET|PRIV_FORCED)) != 0) {
			if ((privflags & PRIV_INCREASE) != 0 &&
			    (pp->p_proc_flag & P_PR_PTRACE) != 0) {
				priv_intersect(&CR_OPPRIV(cred),
				    &CR_IPRIV(cred));
				priv_intersect(&CR_OPPRIV(cred), &fset);
			}
			priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
			CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
			if (privflags & PRIV_FORCED) {
				priv_set_PA(cred);
				priv_union(&fset, &CR_EPRIV(cred));
				priv_union(&fset, &CR_PPRIV(cred));
			}
			priv_adjust_PA(cred);
		}
	} else if (level == 0 && args->pfcred != NULL) {
		newcred = cred = args->pfcred;
		privflags |= PRIV_INCREASE;
		/* pfcred is not forced to adhere to these settings */
		priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
		CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
		priv_adjust_PA(cred);
	}

	/* The new image gets the inheritable secflags as its secflags */
	secflags_promote(pp);

	/* SunOS 4.x buy-back: warn about ignored set-id on nosuid mounts */
	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
	    (vattr.va_mode & (VSUID|VSGID))) {
		char path[MAXNAMELEN];
		refstr_t *mntpt = NULL;
		int ret = -1;

		bzero(path, sizeof (path));
		zone_hold(pp->p_zone);

		ret = vnodetopath(pp->p_zone->zone_rootvp, vp, path,
		    sizeof (path), cred);

		/* fallback to mountpoint if a path can't be found */
		if ((ret != 0) || (ret == 0 && path[0] == '\0'))
			mntpt = vfs_getmntpoint(vp->v_vfsp);

		if (mntpt == NULL)
			zcmn_err(pp->p_zone->zone_id, CE_NOTE,
			    "!uid %d: setuid execution not allowed, "
			    "file=%s", cred->cr_uid, path);
		else
			zcmn_err(pp->p_zone->zone_id, CE_NOTE,
			    "!uid %d: setuid execution not allowed, "
			    "fs=%s, file=%s", cred->cr_uid,
			    ZONE_PATH_TRANSLATE(refstr_value(mntpt),
			    pp->p_zone), exec_file);

		if (!INGLOBALZONE(pp)) {
			/* zone_rootpath always has trailing / */
			if (mntpt == NULL)
				cmn_err(CE_NOTE, "!zone: %s, uid: %d "
				    "setuid execution not allowed, file=%s%s",
				    pp->p_zone->zone_name, cred->cr_uid,
				    pp->p_zone->zone_rootpath, path + 1);
			else
				cmn_err(CE_NOTE, "!zone: %s, uid: %d "
				    "setuid execution not allowed, fs=%s, "
				    "file=%s", pp->p_zone->zone_name,
				    cred->cr_uid, refstr_value(mntpt),
				    exec_file);
		}

		if (mntpt != NULL)
			refstr_rele(mntpt);

		zone_rele(pp->p_zone);
	}

	/*
	 * execsetid() told us whether or not we had to change the
	 * credentials of the process.  In privflags, it told us
	 * whether we gained any privileges or executed a set-uid executable.
	 */
	setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE|PRIV_FORCED));

	/*
	 * Use /etc/system variable to determine if the stack
	 * should be marked as executable by default.
	 */
	if ((noexec_user_stack != 0) ||
	    secflag_enabled(pp, PROC_SEC_NOEXECSTACK))
		args->stk_prot &= ~PROT_EXEC;

	args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */
	args->ex_vp = vp;

	/*
	 * Traditionally, the setid flags told the sub processes whether
	 * the file just executed was set-uid or set-gid; this caused
	 * some confusion as the 'setid' flag did not match the SUGID
	 * process flag which is only set when the uids/gids do not match.
	 * A script set-gid/set-uid to the real uid/gid would start with
	 * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
	 * Now we flag those cases where the calling process cannot
	 * be trusted to influence the newly exec'ed process, either
	 * because it runs with more privileges or when the uids/gids
	 * do in fact not match.
	 * This also makes the runtime linker agree with the on exec
	 * values of SNOCD and SUGID.
	 */
	setidfl = 0;
	if (cred->cr_uid != cred->cr_ruid || (cred->cr_rgid != cred->cr_gid &&
	    !supgroupmember(cred->cr_gid, cred))) {
		setidfl |= EXECSETID_UGIDS;
	}
	if (setid & PRIV_SETUGID)
		setidfl |= EXECSETID_SETID;
	if (setid & PRIV_FORCED)
		setidfl |= EXECSETID_PRIVS;

	execvp = pp->p_exec;
	if (execvp)
		VN_HOLD(execvp);

	/*
	 * Hand off to the format-specific handler.  findexec_by_hdr()
	 * returned with exec_lock held as reader; drop it afterwards.
	 */
	error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
	    setidfl, exec_file, cred, brand_action);
	rw_exit(eswp->exec_lock);
	if (error != 0) {
		if (execvp)
			VN_RELE(execvp);
		/*
		 * If this process's p_exec has been set to the vp of
		 * the executable by exec_func, we will return without
		 * calling VOP_CLOSE because proc_exit will close it
		 * on exit.
		 */
		if (pp->p_exec == vp)
			goto bad_noclose;
		else
			goto bad;
	}

	if (level == 0) {
		uid_t oruid;

		if (execvp != NULL) {
			/*
			 * Close the previous executable only if we are
			 * at level 0.
			 */
			(void) VOP_CLOSE(execvp, FREAD, 1, (offset_t)0,
			    cred, NULL);
		}

		mutex_enter(&pp->p_crlock);

		oruid = pp->p_cred->cr_ruid;

		if (newcred != NULL) {
			/*
			 * Free the old credentials, and set the new ones.
			 * Do this for both the process and the (single) thread.
			 */
			crfree(pp->p_cred);
			pp->p_cred = cred;	/* cred already held for proc */
			crhold(cred);		/* hold new cred for thread */
			/*
			 * DTrace accesses t_cred in probe context.  t_cred
			 * must always be either NULL, or point to a valid,
			 * allocated cred structure.
			 */
			oldcred = curthread->t_cred;
			curthread->t_cred = cred;
			crfree(oldcred);

			if (priv_basic_test >= 0 &&
			    !PRIV_ISASSERT(&CR_IPRIV(newcred),
			    priv_basic_test)) {
				pid_t pid = pp->p_pid;
				char *fn = PTOU(pp)->u_comm;

				cmn_err(CE_WARN, "%s[%d]: exec: basic_test "
				    "privilege removed from E/I", fn, pid);
			}
		}
		/*
		 * On emerging from a successful exec(), the saved
		 * uid and gid equal the effective uid and gid.
		 */
		cred->cr_suid = cred->cr_uid;
		cred->cr_sgid = cred->cr_gid;

		/*
		 * If the real and effective ids do not match, this
		 * is a setuid process that should not dump core.
		 * The group comparison is tricky; we prevent the code
		 * from flagging SNOCD when executing with an effective gid
		 * which is a supplementary group.
		 */
		if (cred->cr_ruid != cred->cr_uid ||
		    (cred->cr_rgid != cred->cr_gid &&
		    !supgroupmember(cred->cr_gid, cred)) ||
		    (privflags & PRIV_INCREASE) != 0)
			suidflags = PSUIDFLAGS;
		else
			suidflags = 0;

		mutex_exit(&pp->p_crlock);
		if (newcred != NULL && oruid != newcred->cr_ruid) {
			/* Note that the process remains in the same zone. */
			mutex_enter(&pidlock);
			upcount_dec(oruid, crgetzoneid(newcred));
			upcount_inc(newcred->cr_ruid, crgetzoneid(newcred));
			mutex_exit(&pidlock);
		}
		if (suidflags) {
			mutex_enter(&pp->p_lock);
			pp->p_flag |= suidflags;
			mutex_exit(&pp->p_lock);
		}
		if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
			/*
			 * If process is traced via /proc, arrange to
			 * invalidate the associated /proc vnode.
			 */
			if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
				args->traceinval = 1;
		}
		if (pp->p_proc_flag & P_PR_PTRACE)
			psignal(pp, SIGTRAP);
		if (args->traceinval)
			prinvalidate(&pp->p_user);
	}
	if (execvp)
		VN_RELE(execvp);
	return (0);

bad:
	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, cred, NULL);

bad_noclose:
	if (newcred != NULL)
		crfree(newcred);
	if (error == 0)
		error = ENOEXEC;

	mutex_enter(&pp->p_lock);
	if (suidflags) {
		pp->p_flag |= suidflags;
	}
	/*
	 * Restore the effective secflags, to maintain the invariant they
	 * never change for a given process
	 */
	secflags_copy(&pp->p_secflags.psf_effective, &old_secflags);
	mutex_exit(&pp->p_lock);

	return (error);
}
999
1000 extern char *execswnames[];
1001
1002 struct execsw *
allocate_execsw(char * name,char * magic,size_t magic_size)1003 allocate_execsw(char *name, char *magic, size_t magic_size)
1004 {
1005 int i, j;
1006 char *ename;
1007 char *magicp;
1008
1009 mutex_enter(&execsw_lock);
1010 for (i = 0; i < nexectype; i++) {
1011 if (execswnames[i] == NULL) {
1012 ename = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1013 (void) strcpy(ename, name);
1014 execswnames[i] = ename;
1015 /*
1016 * Set the magic number last so that we
1017 * don't need to hold the execsw_lock in
1018 * findexectype().
1019 */
1020 magicp = kmem_alloc(magic_size, KM_SLEEP);
1021 for (j = 0; j < magic_size; j++)
1022 magicp[j] = magic[j];
1023 execsw[i].exec_magic = magicp;
1024 mutex_exit(&execsw_lock);
1025 return (&execsw[i]);
1026 }
1027 }
1028 mutex_exit(&execsw_lock);
1029 return (NULL);
1030 }
1031
1032 /*
1033 * Find the exec switch table entry with the corresponding magic string.
1034 */
1035 struct execsw *
findexecsw(char * magic)1036 findexecsw(char *magic)
1037 {
1038 struct execsw *eswp;
1039
1040 for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
1041 ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
1042 if (magic && eswp->exec_maglen != 0 &&
1043 bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0)
1044 return (eswp);
1045 }
1046 return (NULL);
1047 }
1048
1049 /*
1050 * Find the execsw[] index for the given exec header string by looking for the
1051 * magic string at a specified offset and length for each kind of executable
1052 * file format until one matches. If no execsw[] entry is found, try to
1053 * autoload a module for this magic string.
1054 */
1055 struct execsw *
findexec_by_hdr(char * header)1056 findexec_by_hdr(char *header)
1057 {
1058 struct execsw *eswp;
1059
1060 for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
1061 ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
1062 if (header && eswp->exec_maglen != 0 &&
1063 bcmp(&header[eswp->exec_magoff], eswp->exec_magic,
1064 eswp->exec_maglen) == 0) {
1065 if (hold_execsw(eswp) != 0)
1066 return (NULL);
1067 return (eswp);
1068 }
1069 }
1070 return (NULL); /* couldn't find the type */
1071 }
1072
1073 /*
1074 * Find the execsw[] index for the given magic string. If no execsw[] entry
1075 * is found, try to autoload a module for this magic string.
1076 */
1077 struct execsw *
findexec_by_magic(char * magic)1078 findexec_by_magic(char *magic)
1079 {
1080 struct execsw *eswp;
1081
1082 for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
1083 ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
1084 if (magic && eswp->exec_maglen != 0 &&
1085 bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
1086 if (hold_execsw(eswp) != 0)
1087 return (NULL);
1088 return (eswp);
1089 }
1090 }
1091 return (NULL); /* couldn't find the type */
1092 }
1093
/*
 * Take a reader hold on the exec_lock of the given execsw[] entry,
 * autoloading the corresponding exec module if it is not yet loaded.
 * Returns 0 with eswp->exec_lock held as RW_READER on success, or -1
 * (lock not held) if the module could not be loaded.
 */
static int
hold_execsw(struct execsw *eswp)
{
	char *name;

	rw_enter(eswp->exec_lock, RW_READER);
	while (!LOADED_EXEC(eswp)) {
		/*
		 * Drop the lock across the potentially-blocking module
		 * load; loop to re-check LOADED_EXEC() after reacquiring,
		 * since the module could be unloaded again in the window.
		 */
		rw_exit(eswp->exec_lock);
		name = execswnames[eswp-execsw];
		ASSERT(name);
		if (modload("exec", name) == -1)
			return (-1);
		rw_enter(eswp->exec_lock, RW_READER);
	}
	return (0);
}
1110
/*
 * Compute the credential changes implied by exec'ing the (possibly set-id)
 * file described by vp/vattrp.  fset receives the forced privilege set for
 * set-uid root executables (when present); pathname is used for that
 * look-aside.  When PRIV_SETID is included in the result, *uidp and *gidp
 * are set to the uid/gid the process should assume.
 *
 * Returns a bitmask of actions (PRIV_RESET, PRIV_SETID, PRIV_SETUGID,
 * PRIV_FORCED, PRIV_INCREASE, MAC_FLAGS) for the caller to apply.
 */
static int
execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp,
    priv_set_t *fset, cred_t *cr, const char *pathname)
{
	proc_t *pp = ttoproc(curthread);
	uid_t uid, gid;
	int privflags = 0;

	/*
	 * Remember credentials.
	 */
	uid = cr->cr_uid;
	gid = cr->cr_gid;

	/* Will try to reset the PRIV_AWARE bit later. */
	if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
		privflags |= PRIV_RESET;

	if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
		/*
		 * If it's a set-uid root program we perform the
		 * forced privilege look-aside. This has three possible
		 * outcomes:
		 *	no look aside information -> treat as before
		 *	look aside in Limit set -> apply forced privs
		 *	look aside not in Limit set -> ignore set-uid root
		 *
		 * Ordinary set-uid root execution only allowed if the limit
		 * set holds all unsafe privileges.
		 */
		if (vattrp->va_mode & VSUID) {
			if (vattrp->va_uid == 0) {
				int res = get_forced_privs(cr, pathname, fset);

				switch (res) {
				case -1:
					/* No look-aside info: old behavior. */
					if (priv_issubset(&priv_unsafe,
					    &CR_LPRIV(cr))) {
						uid = vattrp->va_uid;
						privflags |= PRIV_SETUGID;
					}
					break;
				case 0:
					/* Forced privileges apply. */
					privflags |= PRIV_FORCED|PRIV_INCREASE;
					break;
				default:
					break;
				}
			} else {
				uid = vattrp->va_uid;
				privflags |= PRIV_SETUGID;
			}
		}
		if (vattrp->va_mode & VSGID) {
			gid = vattrp->va_gid;
			privflags |= PRIV_SETUGID;
		}
	}

	/*
	 * Do we need to change our credential anyway?
	 * This is the case when E != I or P != I, as
	 * we need to do the assignments (with F empty and A full)
	 * Or when I is not a subset of L; in that case we need to
	 * enforce L.
	 *
	 *	I' = L & I
	 *
	 *	E' = P' = (I' + F) & A
	 * or
	 *	E' = P' = I'
	 */
	if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
	    !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
	    !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
		privflags |= PRIV_RESET;

	/* Child has more privileges than parent */
	if (!priv_issubset(&CR_IPRIV(cr), &CR_PPRIV(cr)))
		privflags |= PRIV_INCREASE;

	/* If MAC-aware flag(s) are on, need to update cred to remove. */
	if ((CR_FLAGS(cr) & NET_MAC_AWARE) ||
	    (CR_FLAGS(cr) & NET_MAC_AWARE_INHERIT))
		privflags |= MAC_FLAGS;
	/*
	 * Set setuid/setgid protections if no ptrace() compatibility.
	 * For privileged processes, honor setuid/setgid even in
	 * the presence of ptrace() compatibility.
	 */
	if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
	    PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
	    (cr->cr_uid != uid ||
	    cr->cr_gid != gid ||
	    cr->cr_suid != uid ||
	    cr->cr_sgid != gid)) {
		*uidp = uid;
		*gidp = gid;
		privflags |= PRIV_SETID;
	}
	return (privflags);
}
1213
/*
 * Verify that the current process may execute the file at vp, filling in
 * vattrp (mode/uid/gid/size) for later use by the caller.  Checks execute
 * access, file type (VREG, or VPROC if /proc says it is an object file),
 * the filesystem's noexec flag, and that at least one execute bit is set.
 * If the process is traced and the tracer cannot read the new executable,
 * either fail the exec (ptrace(2) compatibility) or arrange to invalidate
 * the /proc vnode.  Returns 0 or an errno.
 */
int
execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
{
	int error;
	proc_t *p = ttoproc(curthread);

	vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
	if (error = VOP_GETATTR(vp, vattrp, ATTR_EXEC, p->p_cred, NULL))
		return (error);
	/*
	 * Check the access mode.
	 * If VPROC, ask /proc if the file is an object file.
	 */
	if ((error = VOP_ACCESS(vp, VEXEC, 0, p->p_cred, NULL)) != 0 ||
	    !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
	    (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
	    (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
		if (error == 0)
			error = EACCES;
		return (error);
	}

	if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
	    (error = VOP_ACCESS(vp, VREAD, 0, p->p_cred, NULL))) {
		/*
		 * If process is under ptrace(2) compatibility,
		 * fail the exec(2).
		 */
		if (p->p_proc_flag & P_PR_PTRACE)
			goto bad;
		/*
		 * Process is traced via /proc.
		 * Arrange to invalidate the /proc vnode.
		 */
		args->traceinval = 1;
	}
	return (0);
bad:
	if (error == 0)
		error = ENOEXEC;
	return (error);
}
1256
/*
 * Map a section of an executable file into the user's
 * address space.
 *
 *	vp	- executable's vnode
 *	addr	- requested user address (need not be page-aligned)
 *	len	- bytes of file data to map
 *	zfodlen	- bytes of zero-fill-on-demand space following the data
 *	offset	- file offset of the section
 *	prot	- final protections for the mapping
 *	page	- nonzero: VOP_MAP() the file; zero: as_map() + vn_rdwr()
 *	szc	- preferred page size code for the zfod segment
 *
 * Returns 0 on success or an errno.
 */
int
execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
    off_t offset, int prot, int page, uint_t szc)
{
	int error = 0;
	off_t oldoffset;
	caddr_t zfodbase, oldaddr;
	size_t end, oldlen;
	size_t zfoddiff;
	label_t ljb;
	proc_t *p = ttoproc(curthread);

	/* Page-align addr/offset, extending len by the same phase. */
	oldaddr = addr;
	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	if (len) {
		oldlen = len;
		len += ((size_t)oldaddr - (size_t)addr);
		oldoffset = offset;
		offset = (off_t)((uintptr_t)offset & PAGEMASK);
		if (page) {
			spgcnt_t prefltmem, availm, npages;
			int preread;
			uint_t mflag = MAP_PRIVATE | MAP_FIXED;

			/* Executable-only mappings are text; others data. */
			if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
				mflag |= MAP_TEXT;
			} else {
				mflag |= MAP_INITDATA;
			}

			if (valid_usr_range(addr, len, prot, p->p_as,
			    p->p_as->a_userlimit) != RANGE_OKAY) {
				error = ENOMEM;
				goto bad;
			}
			if (error = VOP_MAP(vp, (offset_t)offset,
			    p->p_as, &addr, len, prot, PROT_ALL,
			    mflag, CRED(), NULL))
				goto bad;

			/*
			 * If the segment can fit, then we prefault
			 * the entire segment in.  This is based on the
			 * model that says the best working set of a
			 * small program is all of its pages.
			 */
			npages = (spgcnt_t)btopr(len);
			prefltmem = freemem - desfree;
			preread =
			    (npages < prefltmem && len < PGTHRESH) ? 1 : 0;

			/*
			 * If we aren't prefaulting the segment,
			 * increment "deficit", if necessary to ensure
			 * that pages will become available when this
			 * process starts executing.
			 */
			availm = freemem - lotsfree;
			if (preread == 0 && npages > availm &&
			    deficit < lotsfree) {
				deficit += MIN((pgcnt_t)(npages - availm),
				    lotsfree - deficit);
			}

			if (preread) {
				TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
				    "execmap preread:freemem %d size %lu",
				    freemem, len);
				(void) as_fault(p->p_as->a_hat, p->p_as,
				    (caddr_t)addr, len, F_INVAL, S_READ);
			}
		} else {
			if (valid_usr_range(addr, len, prot, p->p_as,
			    p->p_as->a_userlimit) != RANGE_OKAY) {
				error = ENOMEM;
				goto bad;
			}

			/* Create a zfod segment, then fill it from the file. */
			if (error = as_map(p->p_as, addr, len,
			    segvn_create, zfod_argsp))
				goto bad;
			/*
			 * Read in the segment in one big chunk.
			 */
			if (error = vn_rdwr(UIO_READ, vp, (caddr_t)oldaddr,
			    oldlen, (offset_t)oldoffset, UIO_USERSPACE, 0,
			    (rlim64_t)0, CRED(), (ssize_t *)0))
				goto bad;
			/*
			 * Now set protections.
			 */
			if (prot != PROT_ZFOD) {
				(void) as_setprot(p->p_as, (caddr_t)addr,
				    len, prot);
			}
		}
	}

	if (zfodlen) {
		struct as *as = curproc->p_as;
		struct seg *seg;
		uint_t zprot = 0;

		end = (size_t)addr + len;
		zfodbase = (caddr_t)roundup(end, PAGESIZE);
		zfoddiff = (uintptr_t)zfodbase - end;
		if (zfoddiff) {
			/*
			 * Before we go to zero the remaining space on the last
			 * page, make sure we have write permission.
			 *
			 * Normal illumos binaries don't even hit the case
			 * where we have to change permission on the last page
			 * since their protection is typically either
			 *    PROT_USER | PROT_WRITE | PROT_READ
			 * or
			 *    PROT_ZFOD (same as PROT_ALL).
			 *
			 * We need to be careful how we zero-fill the last page
			 * if the segment protection does not include
			 * PROT_WRITE. Using as_setprot() can cause the VM
			 * segment code to call segvn_vpage(), which must
			 * allocate a page struct for each page in the segment.
			 * If we have a very large segment, this may fail, so
			 * we have to check for that, even though we ignore
			 * other return values from as_setprot.
			 */

			AS_LOCK_ENTER(as, RW_READER);
			seg = as_segat(curproc->p_as, (caddr_t)end);
			if (seg != NULL)
				SEGOP_GETPROT(seg, (caddr_t)end, zfoddiff - 1,
				    &zprot);
			AS_LOCK_EXIT(as);

			if (seg != NULL && (zprot & PROT_WRITE) == 0) {
				if (as_setprot(as, (caddr_t)end, zfoddiff - 1,
				    zprot | PROT_WRITE) == ENOMEM) {
					error = ENOMEM;
					goto bad;
				}
			}

			if (on_fault(&ljb)) {
				/* Fault during uzero: restore and fail. */
				no_fault();
				if (seg != NULL && (zprot & PROT_WRITE) == 0)
					(void) as_setprot(as, (caddr_t)end,
					    zfoddiff - 1, zprot);
				error = EFAULT;
				goto bad;
			}
			uzero((void *)end, zfoddiff);
			no_fault();
			/* Restore the original (non-writable) protections. */
			if (seg != NULL && (zprot & PROT_WRITE) == 0)
				(void) as_setprot(as, (caddr_t)end,
				    zfoddiff - 1, zprot);
		}
		if (zfodlen > zfoddiff) {
			struct segvn_crargs crargs =
			    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

			zfodlen -= zfoddiff;
			if (valid_usr_range(zfodbase, zfodlen, prot, p->p_as,
			    p->p_as->a_userlimit) != RANGE_OKAY) {
				error = ENOMEM;
				goto bad;
			}
			if (szc > 0) {
				/*
				 * ASSERT alignment because the mapelfexec()
				 * caller for the szc > 0 case extended zfod
				 * so its end is pgsz aligned.
				 */
				size_t pgsz = page_get_pagesize(szc);
				ASSERT(IS_P2ALIGNED(zfodbase + zfodlen, pgsz));

				if (IS_P2ALIGNED(zfodbase, pgsz)) {
					crargs.szc = szc;
				} else {
					crargs.szc = AS_MAP_HEAP;
				}
			} else {
				crargs.szc = AS_MAP_NO_LPOOB;
			}
			if (error = as_map(p->p_as, (caddr_t)zfodbase,
			    zfodlen, segvn_create, &crargs))
				goto bad;
			if (prot != PROT_ZFOD) {
				(void) as_setprot(p->p_as, (caddr_t)zfodbase,
				    zfodlen, prot);
			}
		}
	}
	return (0);
bad:
	return (error);
}
1458
1459 void
setexecenv(struct execenv * ep)1460 setexecenv(struct execenv *ep)
1461 {
1462 proc_t *p = ttoproc(curthread);
1463 klwp_t *lwp = ttolwp(curthread);
1464 struct vnode *vp;
1465
1466 p->p_bssbase = ep->ex_bssbase;
1467 p->p_brkbase = ep->ex_brkbase;
1468 p->p_brksize = ep->ex_brksize;
1469 if (p->p_exec)
1470 VN_RELE(p->p_exec); /* out with the old */
1471 vp = p->p_exec = ep->ex_vp;
1472 if (vp != NULL)
1473 VN_HOLD(vp); /* in with the new */
1474
1475 lwp->lwp_sigaltstack.ss_sp = 0;
1476 lwp->lwp_sigaltstack.ss_size = 0;
1477 lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
1478 }
1479
/*
 * Open the executable vnode on a newly-allocated file descriptor for
 * reading.  On success *fdp is the new fd (with the file installed) and
 * *vpp is unchanged; on failure *fdp is set to -1, all partially-created
 * state is unwound, and an errno is returned.
 */
int
execopen(struct vnode **vpp, int *fdp)
{
	struct vnode *vp = *vpp;
	file_t *fp;
	int error = 0;
	int filemode = FREAD;

	VN_HOLD(vp);		/* open reference */
	if (error = falloc(NULL, filemode, &fp, fdp)) {
		VN_RELE(vp);
		*fdp = -1;	/* just in case falloc changed value */
		return (error);
	}
	if (error = VOP_OPEN(&vp, filemode, CRED(), NULL)) {
		/* Undo the hold, the fd reservation, and the file_t. */
		VN_RELE(vp);
		setf(*fdp, NULL);
		unfalloc(fp);
		*fdp = -1;
		return (error);
	}
	*vpp = vp;		/* vnode should not have changed */
	fp->f_vnode = vp;
	mutex_exit(&fp->f_tlock);	/* falloc returned f_tlock held */
	setf(*fdp, fp);
	return (0);
}
1507
1508 int
execclose(int fd)1509 execclose(int fd)
1510 {
1511 return (closeandsetf(fd, NULL));
1512 }
1513
1514
1515 /*
1516 * noexec stub function.
1517 */
1518 /*ARGSUSED*/
1519 int
noexec(struct vnode * vp,struct execa * uap,struct uarg * args,struct intpdata * idatap,int level,size_t * execsz,int setid,caddr_t exec_file,struct cred * cred)1520 noexec(
1521 struct vnode *vp,
1522 struct execa *uap,
1523 struct uarg *args,
1524 struct intpdata *idatap,
1525 int level,
1526 size_t *execsz,
1527 int setid,
1528 caddr_t exec_file,
1529 struct cred *cred)
1530 {
1531 cmn_err(CE_WARN, "missing exec capability for %s", uap->fname);
1532 return (ENOEXEC);
1533 }
1534
1535 /*
1536 * Support routines for building a user stack.
1537 *
1538 * execve(path, argv, envp) must construct a new stack with the specified
1539 * arguments and environment variables (see exec_args() for a description
1540 * of the user stack layout). To do this, we copy the arguments and
1541 * environment variables from the old user address space into the kernel,
1542 * free the old as, create the new as, and copy our buffered information
1543 * to the new stack. Our kernel buffer has the following structure:
1544 *
1545 * +-----------------------+ <--- stk_base + stk_size
1546 * | string offsets |
1547 * +-----------------------+ <--- stk_offp
1548 * | |
1549 * | STK_AVAIL() space |
1550 * | |
1551 * +-----------------------+ <--- stk_strp
1552 * | strings |
1553 * +-----------------------+ <--- stk_base
1554 *
1555 * When we add a string, we store the string's contents (including the null
1556 * terminator) at stk_strp, and we store the offset of the string relative to
 * stk_base at --stk_offp.  As strings are added, stk_strp increases and
1558 * stk_offp decreases. The amount of space remaining, STK_AVAIL(), is just
1559 * the difference between these pointers. If we run out of space, we return
1560 * an error and exec_args() starts all over again with a buffer twice as large.
1561 * When we're all done, the kernel buffer looks like this:
1562 *
1563 * +-----------------------+ <--- stk_base + stk_size
1564 * | argv[0] offset |
1565 * +-----------------------+
1566 * | ... |
1567 * +-----------------------+
1568 * | argv[argc-1] offset |
1569 * +-----------------------+
1570 * | envp[0] offset |
1571 * +-----------------------+
1572 * | ... |
1573 * +-----------------------+
1574 * | envp[envc-1] offset |
1575 * +-----------------------+
1576 * | AT_SUN_PLATFORM offset|
1577 * +-----------------------+
1578 * | AT_SUN_EXECNAME offset|
1579 * +-----------------------+ <--- stk_offp
1580 * | |
1581 * | STK_AVAIL() space |
1582 * | |
1583 * +-----------------------+ <--- stk_strp
1584 * | AT_SUN_EXECNAME offset|
1585 * +-----------------------+
1586 * | AT_SUN_PLATFORM offset|
1587 * +-----------------------+
1588 * | envp[envc-1] string |
1589 * +-----------------------+
1590 * | ... |
1591 * +-----------------------+
1592 * | envp[0] string |
1593 * +-----------------------+
1594 * | argv[argc-1] string |
1595 * +-----------------------+
1596 * | ... |
1597 * +-----------------------+
1598 * | argv[0] string |
1599 * +-----------------------+ <--- stk_base
1600 */
1601
/* Bytes still free between the strings region and the offset array. */
#define	STK_AVAIL(args)		((char *)(args)->stk_offp - (args)->stk_strp)
1603
/*
 * Add a string to the stack.
 *
 * Records the string's offset (relative to stk_base) in the offset array
 * growing down from the top of the buffer, then copies the string
 * (including its NUL) into the string region growing up from the bottom.
 * segflg says whether sp is a user or kernel address.  Returns 0, E2BIG
 * when the buffer is full, or a copyinstr() error.
 */
static int
stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
{
	int error;
	size_t len;

	/* Must have room for one more offset word. */
	if (STK_AVAIL(args) < sizeof (int))
		return (E2BIG);
	*--args->stk_offp = args->stk_strp - args->stk_base;

	if (segflg == UIO_USERSPACE) {
		/* copyinstr bounds the copy by the remaining free space. */
		error = copyinstr(sp, args->stk_strp, STK_AVAIL(args), &len);
		if (error != 0)
			return (error);
	} else {
		len = strlen(sp) + 1;
		if (len > STK_AVAIL(args))
			return (E2BIG);
		bcopy(sp, args->stk_strp, len);
	}

	args->stk_strp += len;

	return (0);
}
1632
1633 static int
stk_getptr(uarg_t * args,char * src,char ** dst)1634 stk_getptr(uarg_t *args, char *src, char **dst)
1635 {
1636 int error;
1637
1638 if (args->from_model == DATAMODEL_NATIVE) {
1639 ulong_t ptr;
1640 error = fulword(src, &ptr);
1641 *dst = (caddr_t)ptr;
1642 } else {
1643 uint32_t ptr;
1644 error = fuword32(src, &ptr);
1645 *dst = (caddr_t)(uintptr_t)ptr;
1646 }
1647 return (error);
1648 }
1649
1650 static int
stk_putptr(uarg_t * args,char * addr,char * value)1651 stk_putptr(uarg_t *args, char *addr, char *value)
1652 {
1653 if (args->to_model == DATAMODEL_NATIVE)
1654 return (sulword(addr, (ulong_t)value));
1655 else
1656 return (suword32(addr, (uint32_t)(uintptr_t)value));
1657 }
1658
/*
 * Copy the exec arguments into the kernel stack buffer (see the layout
 * comment above): interpreter name(s)/arg(s) for #! scripts, the argv[]
 * strings, the environ[] strings (optionally scrubbing LD_* variables),
 * and the platform/execname/brand/emulator strings referenced by the aux
 * vector.  Computes the final aligned user stack size in
 * args->usrstack_size.  Returns 0, EFAULT on a bad user pointer, an
 * stk_add() error, or E2BIG when the buffer is too small (the caller
 * retries with a buffer twice as large).
 */
static int
stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
{
	char *sp;
	int argc, error;
	int argv_empty = 0;
	size_t ptrsize = args->from_ptrsize;
	size_t size, pad;
	char *argv = (char *)uap->argp;
	char *envp = (char *)uap->envp;

	/*
	 * Copy interpreter's name and argument to argv[0] and argv[1].
	 * In the rare case that we have nested interpreters then those names
	 * and arguments are also copied to the subsequent slots in argv.
	 */
	if (intp != NULL && intp->intp_name[0] != NULL) {
		int i;

		for (i = 0; i < INTP_MAXDEPTH; i++) {
			if (intp->intp_name[i] == NULL)
				break;
			error = stk_add(args, intp->intp_name[i], UIO_SYSSPACE);
			if (error != 0)
				return (error);
			if (intp->intp_arg[i] != NULL) {
				error = stk_add(args, intp->intp_arg[i],
				    UIO_SYSSPACE);
				if (error != 0)
					return (error);
			}
		}

		/* The script path itself becomes the next argument. */
		if (args->fname != NULL)
			error = stk_add(args, args->fname, UIO_SYSSPACE);
		else
			error = stk_add(args, uap->fname, UIO_USERSPACE);
		if (error)
			return (error);

		/*
		 * Check for an empty argv[].
		 */
		if (stk_getptr(args, argv, &sp))
			return (EFAULT);
		if (sp == NULL)
			argv_empty = 1;

		argv += ptrsize;		/* ignore original argv[0] */
	}

	if (argv_empty == 0) {
		/*
		 * Add argv[] strings to the stack.
		 */
		for (;;) {
			if (stk_getptr(args, argv, &sp))
				return (EFAULT);
			if (sp == NULL)
				break;
			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
				return (error);
			argv += ptrsize;
		}
	}
	/* Offsets stored so far are all arguments. */
	argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
	args->arglen = args->stk_strp - args->stk_base;

	/*
	 * Add environ[] strings to the stack.
	 */
	if (envp != NULL) {
		for (;;) {
			char *tmp = args->stk_strp;
			if (stk_getptr(args, envp, &sp))
				return (EFAULT);
			if (sp == NULL)
				break;
			if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
				return (error);
			if (args->scrubenv && strncmp(tmp, "LD_", 3) == 0) {
				/* Undo the copied string */
				args->stk_strp = tmp;
				*(args->stk_offp++) = 0;
			}
			envp += ptrsize;
		}
	}
	args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
	args->ne = args->na - argc;

	/*
	 * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
	 * AT_SUN_EMULATOR strings to the stack.
	 */
	if (auxvpp != NULL && *auxvpp != NULL) {
		if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
			return (error);
		if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0)
			return (error);
		if (args->brandname != NULL &&
		    (error = stk_add(args, args->brandname, UIO_SYSSPACE)) != 0)
			return (error);
		if (args->emulator != NULL &&
		    (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)
			return (error);
	}

	/*
	 * Compute the size of the stack.  This includes all the pointers,
	 * the space reserved for the aux vector, and all the strings.
	 * The total number of pointers is args->na (which is argc + envc)
	 * plus four more: (1) a pointer's worth of space for argc;
	 * (2) the NULL after the last argument (i.e. argv[argc]); (3) the
	 * NULL after the last environment variable (i.e. envp[envc]); and
	 * (4) the NULL after all the strings, at the very top of the stack.
	 */
	size = (args->na + 4) * args->to_ptrsize + args->auxsize +
	    (args->stk_strp - args->stk_base);

	/*
	 * Pad the string section with zeroes to align the stack size.
	 */
	pad = P2NPHASE(size, args->stk_align);

	if (STK_AVAIL(args) < pad)
		return (E2BIG);

	args->usrstack_size = size + pad;

	while (pad-- != 0)
		*args->stk_strp++ = 0;

	args->nc = args->stk_strp - args->stk_base;

	return (0);
}
1796
/*
 * Copy the kernel stack buffer built by stk_copyin() out to the new user
 * stack at usrstack: lay down argc, the argv[] and envp[] pointer arrays
 * (translating stored offsets into user string addresses), and all the
 * strings; record argc/argv/envp and u_psargs in the user area for /proc;
 * and fill in the aux vector entries whose values are user string
 * addresses.  Returns 0 on success, -1 on a copyout/suword fault.
 */
static int
stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
{
	size_t ptrsize = args->to_ptrsize;
	ssize_t pslen;
	char *kstrp = args->stk_base;
	char *ustrp = usrstack - args->nc - ptrsize;
	char *usp = usrstack - args->usrstack_size;
	int *offp = (int *)(args->stk_base + args->stk_size);
	int envc = args->ne;
	int argc = args->na - envc;
	int i;

	/*
	 * Record argc for /proc.
	 */
	up->u_argc = argc;

	/*
	 * Put argc on the stack.  Note that even though it's an int,
	 * it always consumes ptrsize bytes (for alignment).
	 */
	if (stk_putptr(args, usp, (char *)(uintptr_t)argc))
		return (-1);

	/*
	 * Add argc space (ptrsize) to usp and record argv for /proc.
	 */
	up->u_argv = (uintptr_t)(usp += ptrsize);

	/*
	 * Put the argv[] pointers on the stack.
	 */
	for (i = 0; i < argc; i++, usp += ptrsize)
		if (stk_putptr(args, usp, &ustrp[*--offp]))
			return (-1);

	/*
	 * Copy arguments to u_psargs.
	 */
	pslen = MIN(args->arglen, PSARGSZ) - 1;
	for (i = 0; i < pslen; i++)
		up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]);
	while (i < PSARGSZ)
		up->u_psargs[i++] = '\0';

	/*
	 * Add space for argv[]'s NULL terminator (ptrsize) to usp and
	 * record envp for /proc.
	 */
	up->u_envp = (uintptr_t)(usp += ptrsize);

	/*
	 * Put the envp[] pointers on the stack.
	 */
	for (i = 0; i < envc; i++, usp += ptrsize)
		if (stk_putptr(args, usp, &ustrp[*--offp]))
			return (-1);

	/*
	 * Add space for envp[]'s NULL terminator (ptrsize) to usp and
	 * remember where the stack ends, which is also where auxv begins.
	 */
	args->stackend = usp += ptrsize;

	/*
	 * Put all the argv[], envp[], and auxv strings on the stack.
	 */
	if (copyout(args->stk_base, ustrp, args->nc))
		return (-1);

	/*
	 * Fill in the aux vector now that we know the user stack addresses
	 * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
	 * AT_SUN_EMULATOR strings.
	 */
	if (auxvpp != NULL && *auxvpp != NULL) {
		if (args->to_model == DATAMODEL_NATIVE) {
			auxv_t **a = (auxv_t **)auxvpp;
			ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp])
			ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp])
			if (args->brandname != NULL)
				ADDAUX(*a,
				    AT_SUN_BRANDNAME, (long)&ustrp[*--offp])
			if (args->emulator != NULL)
				ADDAUX(*a,
				    AT_SUN_EMULATOR, (long)&ustrp[*--offp])
		} else {
			auxv32_t **a = (auxv32_t **)auxvpp;
			ADDAUX(*a,
			    AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp])
			ADDAUX(*a,
			    AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp])
			if (args->brandname != NULL)
				ADDAUX(*a, AT_SUN_BRANDNAME,
				    (int)(uintptr_t)&ustrp[*--offp])
			if (args->emulator != NULL)
				ADDAUX(*a, AT_SUN_EMULATOR,
				    (int)(uintptr_t)&ustrp[*--offp])
		}
	}

	return (0);
}
1901
/*
 * Though the actual stack base is constant, slew the %sp by a random aligned
 * amount in [0,aslr_max_stack_skew).  Mostly, this makes life slightly more
 * complicated for buffer overflows hoping to overwrite the return address.
 *
 * On some platforms this helps avoid cache thrashing when identical processes
 * simultaneously share caches that don't provide enough associativity
 * (e.g. sun4v systems).  In this case stack slewing makes the same hot stack
 * variables in different processes live in different cache sets increasing
 * effective associativity.
 */
size_t
exec_get_spslew(void)
{
#ifdef sun4v
	static uint_t sp_color_stride = 16;
	static uint_t sp_color_mask = 0x1f;
	static uint_t sp_current_color = (uint_t)-1;
#endif
	size_t off;

	ASSERT(ISP2(aslr_max_stack_skew));

	if ((aslr_max_stack_skew == 0) ||
	    !secflag_enabled(curproc, PROC_SEC_ASLR)) {
#ifdef sun4v
		/*
		 * ASLR is off for this process: fall back to round-robin
		 * stack coloring to spread hot stack variables across
		 * cache sets.
		 */
		uint_t spcolor = atomic_inc_32_nv(&sp_current_color);
		return ((size_t)((spcolor & sp_color_mask) *
		    SA(sp_color_stride)));
#else
		return (0);
#endif
	}

	/* ASLR: random slew below the configured max, stack-aligned. */
	(void) random_get_pseudo_bytes((uint8_t *)&off, sizeof (off));
	return (SA(P2PHASE(off, aslr_max_stack_skew)));
}
1939
1940 /*
1941 * Initialize a new user stack with the specified arguments and environment.
1942 * The initial user stack layout is as follows:
1943 *
1944 * User Stack
1945 * +---------------+
1946 * | |
1947 * | stack guard |
1948 * | (64-bit only) |
1949 * | |
1950 * +...............+ <--- stack limit (base - curproc->p_stk_ctl)
1951 * . .
1952 * . .
1953 * . .
1954 * +---------------+ <--- curproc->p_usrstack
1955 * | |
1956 * | slew |
1957 * | |
1958 * +---------------+
1959 * | NULL |
1960 * +---------------+
1961 * | |
1962 * | auxv strings |
1963 * | |
1964 * +---------------+
1965 * | |
1966 * | envp strings |
1967 * | |
1968 * +---------------+
1969 * | |
1970 * | argv strings |
1971 * | |
1972 * +---------------+ <--- ustrp
1973 * | |
1974 * | aux vector |
1975 * | |
1976 * +---------------+ <--- auxv
1977 * | NULL |
1978 * +---------------+
1979 * | envp[envc-1] |
1980 * +---------------+
1981 * | ... |
1982 * +---------------+
1983 * | envp[0] |
1984 * +---------------+ <--- envp[]
1985 * | NULL |
1986 * +---------------+
1987 * | argv[argc-1] |
1988 * +---------------+
1989 * | ... |
1990 * +---------------+
1991 * | argv[0] |
1992 * +---------------+ <--- argv[]
1993 * | argc |
1994 * +---------------+ <--- stack base
1995 *
1996 * In 64-bit processes, a stack guard segment is allocated at the address
1997 * immediately below where the stack limit ends. This protects new library
1998 * mappings (such as the linker) from being placed in relatively dangerous
1999 * proximity to the stack.
2000 */
/*
 * Build the new user stack described above from uap (user argv/envp),
 * tearing down the old address space and creating the new one along the
 * way.  intp carries #! interpreter information; auxvpp, if non-NULL,
 * points at the in-kernel aux vector being assembled.  Returns 0 on
 * success, an errno before the point of no return, or -1 after the old
 * address space is gone (caller must SIGKILL the process).
 */
int
exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
{
	size_t size;
	int error;
	proc_t *p = ttoproc(curthread);
	user_t *up = PTOU(p);
	char *usrstack;
	rctl_entity_p_t e;
	struct as *as;
	extern int use_stk_lpg;
	size_t sp_slew;
#if defined(_LP64)
	const size_t sg_sz = (stack_guard_seg_sz & PAGEMASK);
#endif /* defined(_LP64) */

	args->from_model = p->p_model;
	if (p->p_model == DATAMODEL_NATIVE) {
		args->from_ptrsize = sizeof (long);
	} else {
		args->from_ptrsize = sizeof (int32_t);
	}

	if (args->to_model == DATAMODEL_NATIVE) {
		args->to_ptrsize = sizeof (long);
		args->ncargs = NCARGS;
		args->stk_align = STACK_ALIGN;
		if (args->addr32)
			usrstack = (char *)USRSTACK64_32;
		else
			usrstack = (char *)USRSTACK;
	} else {
		args->to_ptrsize = sizeof (int32_t);
		args->ncargs = NCARGS32;
		args->stk_align = STACK_ALIGN32;
		usrstack = (char *)USRSTACK32;
	}

	ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);

#if defined(__sparc)
	/*
	 * Make sure user register windows are empty before
	 * attempting to make a new stack.
	 */
	(void) flush_user_windows_to_stack(NULL);
#endif

	/*
	 * Copy the args into a kernel buffer, doubling its size (up to
	 * the ncargs limit) each time stk_copyin() reports E2BIG.
	 */
	for (size = PAGESIZE; ; size *= 2) {
		args->stk_size = size;
		args->stk_base = kmem_alloc(size, KM_SLEEP);
		args->stk_strp = args->stk_base;
		args->stk_offp = (int *)(args->stk_base + size);
		error = stk_copyin(uap, args, intp, auxvpp);
		if (error == 0)
			break;
		kmem_free(args->stk_base, size);
		if (error != E2BIG && error != ENAMETOOLONG)
			return (error);
		if (size >= args->ncargs)
			return (E2BIG);
	}

	size = args->usrstack_size;

	ASSERT(error == 0);
	ASSERT(P2PHASE(size, args->stk_align) == 0);
	ASSERT((ssize_t)STK_AVAIL(args) >= 0);

	if (size > args->ncargs) {
		kmem_free(args->stk_base, args->stk_size);
		return (E2BIG);
	}

	/*
	 * Leave only the current lwp and force the other lwps to exit.
	 * If another lwp beat us to the punch by calling exit(), bail out.
	 */
	if ((error = exitlwps(0)) != 0) {
		kmem_free(args->stk_base, args->stk_size);
		return (error);
	}

	/*
	 * Revoke any doors created by the process.
	 */
	if (p->p_door_list)
		door_exit();

	/*
	 * Release schedctl data structures.
	 */
	if (p->p_pagep)
		schedctl_proc_cleanup();

	/*
	 * Clean up any DTrace helpers for the process.
	 */
	if (p->p_dtrace_helpers != NULL) {
		ASSERT(dtrace_helpers_cleanup != NULL);
		(*dtrace_helpers_cleanup)(p);
	}

	mutex_enter(&p->p_lock);
	/*
	 * Cleanup the DTrace provider associated with this process.
	 */
	if (p->p_dtrace_probes) {
		ASSERT(dtrace_fasttrap_exec_ptr != NULL);
		dtrace_fasttrap_exec_ptr(p);
	}
	mutex_exit(&p->p_lock);

	/*
	 * discard the lwpchan cache.
	 */
	if (p->p_lcp != NULL)
		lwpchan_destroy_cache(1);

	/*
	 * Delete the POSIX timers.
	 */
	if (p->p_itimer != NULL)
		timer_exit();

	/*
	 * Delete the ITIMER_REALPROF interval timer.
	 * The other ITIMER_* interval timers are specified
	 * to be inherited across exec().
	 */
	delete_itimer_realprof();

	if (AU_AUDITING())
		audit_exec(args->stk_base, args->stk_base + args->arglen,
		    args->na - args->ne, args->ne, args->pfcred);

	/*
	 * Ensure that we don't change resource associations while we
	 * change address spaces.
	 */
	mutex_enter(&p->p_lock);
	pool_barrier_enter();
	mutex_exit(&p->p_lock);

	/*
	 * Destroy the old address space and create a new one.
	 * From here on, any errors are fatal to the exec()ing process.
	 * On error we return -1, which means the caller must SIGKILL
	 * the process.
	 */
	relvm();

	mutex_enter(&p->p_lock);
	pool_barrier_exit();
	mutex_exit(&p->p_lock);

	up->u_execsw = args->execswp;

	p->p_brkbase = NULL;
	p->p_brksize = 0;
	p->p_brkpageszc = 0;
	p->p_stksize = 0;
	p->p_stkpageszc = 0;
	p->p_stkg_start = 0;
	p->p_stkg_end = 0;
	p->p_model = args->to_model;
	p->p_usrstack = usrstack;
	p->p_stkprot = args->stk_prot;
	p->p_datprot = args->dat_prot;

	/*
	 * Reset resource controls such that all controls are again active as
	 * well as appropriate to the potentially new address model for the
	 * process.
	 */
	e.rcep_p.proc = p;
	e.rcep_t = RCENTITY_PROCESS;
	rctl_set_reset(p->p_rctls, p, &e);

	/* Too early to call map_pgsz for the heap */
	if (use_stk_lpg) {
		p->p_stkpageszc = page_szc(map_pgsz(MAPPGSZ_STK, p, 0, 0, 0));
	}

	mutex_enter(&p->p_lock);
	p->p_flag |= SAUTOLPG;	/* kernel controls page sizes */
	mutex_exit(&p->p_lock);

	sp_slew = exec_get_spslew();
	ASSERT(P2PHASE(sp_slew, args->stk_align) == 0);
	/* Be certain we don't underflow */
	VERIFY((curproc->p_usrstack - (size + sp_slew)) < curproc->p_usrstack);
	exec_set_sp(size + sp_slew);

	as = as_alloc();
	p->p_as = as;
	as->a_proc = p;
	if (p->p_model == DATAMODEL_ILP32 || args->addr32)
		as->a_userlimit = (caddr_t)USERLIMIT32;
	(void) hat_setup(as->a_hat, HAT_ALLOC);
	hat_join_srd(as->a_hat, args->ex_vp);

	/* Write out the contents of the new stack. */
	error = stk_copyout(args, usrstack - sp_slew, auxvpp, up);
	kmem_free(args->stk_base, args->stk_size);

#if defined(_LP64)
	/* Add stack guard segment (if needed) after successful copyout */
	if (error == 0 && p->p_model == DATAMODEL_LP64 && sg_sz != 0) {
		seghole_crargs_t sca;
		caddr_t addr_end = (caddr_t)(((uintptr_t)usrstack -
		    p->p_stk_ctl) & PAGEMASK);
		caddr_t addr_start = addr_end - sg_sz;

		DTRACE_PROBE4(stack__guard__chk, proc_t *, p,
		    caddr_t, addr_start, caddr_t, addr_end, size_t, sg_sz);

		if (addr_end >= usrstack || addr_start >= addr_end ||
		    valid_usr_range(addr_start, sg_sz, PROT_NONE, as,
		    as->a_userlimit) != RANGE_OKAY) {
			return (E2BIG);
		}

		/* Create un-mappable area in AS with seg_hole */
		sca.name = "stack_guard";
		error = as_map(as, addr_start, sg_sz, seghole_create, &sca);
		if (error == 0) {
			p->p_stkg_start = (uintptr_t)addr_start;
			p->p_stkg_end = (uintptr_t)addr_start + sg_sz;
		}
	}
#endif /* defined(_LP64) */

	return (error);
}
2236