1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 1993, David Greenman 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include "opt_capsicum.h" 33 #include "opt_hwpmc_hooks.h" 34 #include "opt_ktrace.h" 35 #include "opt_vm.h" 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/acct.h> 40 #include <sys/asan.h> 41 #include <sys/capsicum.h> 42 #include <sys/eventhandler.h> 43 #include <sys/exec.h> 44 #include <sys/fcntl.h> 45 #include <sys/filedesc.h> 46 #include <sys/imgact.h> 47 #include <sys/imgact_elf.h> 48 #include <sys/kernel.h> 49 #include <sys/lock.h> 50 #include <sys/malloc.h> 51 #include <sys/mman.h> 52 #include <sys/mount.h> 53 #include <sys/mutex.h> 54 #include <sys/namei.h> 55 #include <sys/priv.h> 56 #include <sys/proc.h> 57 #include <sys/ptrace.h> 58 #include <sys/resourcevar.h> 59 #include <sys/rwlock.h> 60 #include <sys/sched.h> 61 #include <sys/sdt.h> 62 #include <sys/sf_buf.h> 63 #include <sys/shm.h> 64 #include <sys/signalvar.h> 65 #include <sys/smp.h> 66 #include <sys/stat.h> 67 #include <sys/syscallsubr.h> 68 #include <sys/sysctl.h> 69 #include <sys/sysent.h> 70 #include <sys/sysproto.h> 71 #include <sys/timers.h> 72 #include <sys/umtx.h> 73 #include <sys/vnode.h> 74 #include <sys/wait.h> 75 #ifdef KTRACE 76 #include <sys/ktrace.h> 77 #endif 78 79 #include <vm/vm.h> 80 #include <vm/vm_param.h> 81 #include <vm/pmap.h> 82 #include <vm/vm_page.h> 83 #include <vm/vm_map.h> 84 #include <vm/vm_kern.h> 85 #include <vm/vm_extern.h> 86 #include <vm/vm_object.h> 87 #include <vm/vm_pager.h> 88 89 #ifdef HWPMC_HOOKS 90 #include <sys/pmckern.h> 91 #endif 92 93 #include <machine/reg.h> 94 95 #include <security/audit/audit.h> 96 #include <security/mac/mac_framework.h> 97 98 #ifdef KDTRACE_HOOKS 99 #include <sys/dtrace_bsd.h> 100 dtrace_execexit_func_t dtrace_fasttrap_exec; 101 #endif 102 103 SDT_PROVIDER_DECLARE(proc); 104 SDT_PROBE_DEFINE1(proc, , , exec, "char *"); 105 SDT_PROBE_DEFINE1(proc, , , exec__failure, "int"); 106 SDT_PROBE_DEFINE1(proc, , , exec__success, "char *"); 107 108 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); 109 110 int coredump_pack_fileinfo = 1; 111 SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN, 112 &coredump_pack_fileinfo, 0, 113 "Enable file path packing in 'procstat -f' coredump notes"); 114 115 int coredump_pack_vmmapinfo = 1; 116 SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN, 117 &coredump_pack_vmmapinfo, 0, 118 "Enable file path packing in 'procstat -v' coredump notes"); 119 120 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); 121 static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); 122 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); 123 static int do_execve(struct thread *td, struct image_args *args, 124 struct mac *mac_p, struct vmspace *oldvmspace); 125 126 /* XXX This should be vm_size_t. */ 127 SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD| 128 CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_ps_strings, "LU", 129 "Location of process' ps_strings structure"); 130 131 /* XXX This should be vm_size_t. */ 132 SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD| 133 CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_usrstack, "LU", 134 "Top of process stack"); 135 136 SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_MPSAFE, 137 NULL, 0, sysctl_kern_stackprot, "I", 138 "Stack memory permissions"); 139 140 u_long ps_arg_cache_limit = PAGE_SIZE / 16; 141 SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, 142 &ps_arg_cache_limit, 0, 143 "Process' command line characters cache limit"); 144 145 static int disallow_high_osrel; 146 SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW, 147 &disallow_high_osrel, 0, 148 "Disallow execution of binaries built for higher version of the world"); 149 150 static int map_at_zero = 0; 151 SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0, 152 "Permit processes to map an object at virtual address 0."); 153 154 static int 155 sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) 156 { 157 struct proc *p; 158 int error; 159 160 p = curproc; 161 #ifdef SCTL_MASK32 162 if (req->flags & SCTL_MASK32) { 163 unsigned int val; 164 val = (unsigned int)p->p_sysent->sv_psstrings; 165 error = SYSCTL_OUT(req, &val, sizeof(val)); 166 } else 167 #endif 168 error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, 169 sizeof(p->p_sysent->sv_psstrings)); 170 return error; 171 } 172 173 static int 174 sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) 175 { 176 struct proc *p; 177 int error; 178 179 p = curproc; 180 #ifdef SCTL_MASK32 181 if (req->flags & SCTL_MASK32) { 182 unsigned int val; 183 val = (unsigned int)p->p_sysent->sv_usrstack; 184 error = SYSCTL_OUT(req, &val, sizeof(val)); 185 } else 186 #endif 187 error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, 188 sizeof(p->p_sysent->sv_usrstack)); 189 return error; 190 } 191 192 static int 193 sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) 194 { 195 struct proc *p; 196 197 p = curproc; 198 return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, 199 sizeof(p->p_sysent->sv_stackprot))); 200 } 201 202 /* 203 * Each of the items is a pointer to a `const struct execsw', hence the 204 * double pointer here. 205 */ 206 static const struct execsw **execsw; 207 208 #ifndef _SYS_SYSPROTO_H_ 209 struct execve_args { 210 char *fname; 211 char **argv; 212 char **envv; 213 }; 214 #endif 215 216 int 217 sys_execve(struct thread *td, struct execve_args *uap) 218 { 219 struct image_args args; 220 struct vmspace *oldvmspace; 221 int error; 222 223 error = pre_execve(td, &oldvmspace); 224 if (error != 0) 225 return (error); 226 error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, 227 uap->argv, uap->envv); 228 if (error == 0) 229 error = kern_execve(td, &args, NULL, oldvmspace); 230 post_execve(td, error, oldvmspace); 231 AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); 232 return (error); 233 } 234 235 #ifndef _SYS_SYSPROTO_H_ 236 struct fexecve_args { 237 int fd; 238 char **argv; 239 char **envv; 240 }; 241 #endif 242 int 243 sys_fexecve(struct thread *td, struct fexecve_args *uap) 244 { 245 struct image_args args; 246 struct vmspace *oldvmspace; 247 int error; 248 249 error = pre_execve(td, &oldvmspace); 250 if (error != 0) 251 return (error); 252 error = exec_copyin_args(&args, NULL, UIO_SYSSPACE, 253 uap->argv, uap->envv); 254 if (error == 0) { 255 args.fd = uap->fd; 256 error = kern_execve(td, &args, NULL, oldvmspace); 257 } 258 post_execve(td, error, oldvmspace); 259 AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); 260 return (error); 261 } 262 263 #ifndef _SYS_SYSPROTO_H_ 264 struct __mac_execve_args { 265 char *fname; 266 char **argv; 267 char **envv; 268 struct mac *mac_p; 269 }; 270 #endif 271 272 int 273 sys___mac_execve(struct thread *td, struct __mac_execve_args *uap) 274 { 275 #ifdef MAC 276 struct image_args args; 277 struct vmspace *oldvmspace; 278 int error; 279 280 error = pre_execve(td, &oldvmspace); 281 if (error != 0) 282 return (error); 283 error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, 284 uap->argv, uap->envv); 285 if (error == 0) 286 error = kern_execve(td, &args, uap->mac_p, oldvmspace); 287 post_execve(td, error, oldvmspace); 288 AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); 289 return (error); 290 #else 291 return (ENOSYS); 292 #endif 293 } 294 295 int 296 pre_execve(struct thread *td, struct vmspace **oldvmspace) 297 { 298 struct proc *p; 299 int error; 300 301 KASSERT(td == curthread, ("non-current thread %p", td)); 302 error = 0; 303 p = td->td_proc; 304 if ((p->p_flag & P_HADTHREADS) != 0) { 305 PROC_LOCK(p); 306 if (thread_single(p, SINGLE_BOUNDARY) != 0) 307 error = ERESTART; 308 PROC_UNLOCK(p); 309 } 310 KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0, 311 ("nested execve")); 312 *oldvmspace = p->p_vmspace; 313 return (error); 314 } 315 316 void 317 post_execve(struct thread *td, int error, struct vmspace *oldvmspace) 318 { 319 struct proc *p; 320 321 KASSERT(td == curthread, ("non-current thread %p", td)); 322 p = td->td_proc; 323 if ((p->p_flag & P_HADTHREADS) != 0) { 324 PROC_LOCK(p); 325 /* 326 * If success, we upgrade to SINGLE_EXIT state to 327 * force other threads to suicide. 328 */ 329 if (error == EJUSTRETURN) 330 thread_single(p, SINGLE_EXIT); 331 else 332 thread_single_end(p, SINGLE_BOUNDARY); 333 PROC_UNLOCK(p); 334 } 335 exec_cleanup(td, oldvmspace); 336 } 337 338 /* 339 * kern_execve() has the astonishing property of not always returning to 340 * the caller. If sufficiently bad things happen during the call to 341 * do_execve(), it can end up calling exit1(); as a result, callers must 342 * avoid doing anything which they might need to undo (e.g., allocating 343 * memory). 344 */ 345 int 346 kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p, 347 struct vmspace *oldvmspace) 348 { 349 350 AUDIT_ARG_ARGV(args->begin_argv, args->argc, 351 exec_args_get_begin_envv(args) - args->begin_argv); 352 AUDIT_ARG_ENVV(exec_args_get_begin_envv(args), args->envc, 353 args->endp - exec_args_get_begin_envv(args)); 354 return (do_execve(td, args, mac_p, oldvmspace)); 355 } 356 357 /* 358 * In-kernel implementation of execve(). All arguments are assumed to be 359 * userspace pointers from the passed thread. 360 */ 361 static int 362 do_execve(struct thread *td, struct image_args *args, struct mac *mac_p, 363 struct vmspace *oldvmspace) 364 { 365 struct proc *p = td->td_proc; 366 struct nameidata nd; 367 struct ucred *oldcred; 368 struct uidinfo *euip = NULL; 369 uintptr_t stack_base; 370 struct image_params image_params, *imgp; 371 struct vattr attr; 372 int (*img_first)(struct image_params *); 373 struct pargs *oldargs = NULL, *newargs = NULL; 374 struct sigacts *oldsigacts = NULL, *newsigacts = NULL; 375 #ifdef KTRACE 376 struct vnode *tracevp = NULL; 377 struct ucred *tracecred = NULL; 378 #endif 379 struct vnode *oldtextvp = NULL, *newtextvp; 380 int credential_changing; 381 #ifdef MAC 382 struct label *interpvplabel = NULL; 383 int will_transition; 384 #endif 385 #ifdef HWPMC_HOOKS 386 struct pmckern_procexec pe; 387 #endif 388 int error, i, orig_osrel; 389 uint32_t orig_fctl0; 390 static const char fexecv_proc_title[] = "(fexecv)"; 391 392 imgp = &image_params; 393 394 /* 395 * Lock the process and set the P_INEXEC flag to indicate that 396 * it should be left alone until we're done here. This is 397 * necessary to avoid race conditions - e.g. in ptrace() - 398 * that might allow a local user to illicitly obtain elevated 399 * privileges. 400 */ 401 PROC_LOCK(p); 402 KASSERT((p->p_flag & P_INEXEC) == 0, 403 ("%s(): process already has P_INEXEC flag", __func__)); 404 p->p_flag |= P_INEXEC; 405 PROC_UNLOCK(p); 406 407 /* 408 * Initialize part of the common data 409 */ 410 bzero(imgp, sizeof(*imgp)); 411 imgp->proc = p; 412 imgp->attr = &attr; 413 imgp->args = args; 414 oldcred = p->p_ucred; 415 orig_osrel = p->p_osrel; 416 orig_fctl0 = p->p_fctl0; 417 418 #ifdef MAC 419 error = mac_execve_enter(imgp, mac_p); 420 if (error) 421 goto exec_fail; 422 #endif 423 424 /* 425 * Translate the file name. namei() returns a vnode pointer 426 * in ni_vp among other things. 427 * 428 * XXXAUDIT: It would be desirable to also audit the name of the 429 * interpreter if this is an interpreted binary. 430 */ 431 if (args->fname != NULL) { 432 NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | LOCKSHARED | FOLLOW | 433 SAVENAME | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); 434 } 435 436 SDT_PROBE1(proc, , , exec, args->fname); 437 438 interpret: 439 if (args->fname != NULL) { 440 #ifdef CAPABILITY_MODE 441 /* 442 * While capability mode can't reach this point via direct 443 * path arguments to execve(), we also don't allow 444 * interpreters to be used in capability mode (for now). 445 * Catch indirect lookups and return a permissions error. 446 */ 447 if (IN_CAPABILITY_MODE(td)) { 448 error = ECAPMODE; 449 goto exec_fail; 450 } 451 #endif 452 error = namei(&nd); 453 if (error) 454 goto exec_fail; 455 456 newtextvp = nd.ni_vp; 457 imgp->vp = newtextvp; 458 } else { 459 AUDIT_ARG_FD(args->fd); 460 /* 461 * Descriptors opened only with O_EXEC or O_RDONLY are allowed. 462 */ 463 error = fgetvp_exec(td, args->fd, &cap_fexecve_rights, &newtextvp); 464 if (error) 465 goto exec_fail; 466 vn_lock(newtextvp, LK_SHARED | LK_RETRY); 467 AUDIT_ARG_VNODE1(newtextvp); 468 imgp->vp = newtextvp; 469 } 470 471 /* 472 * Check file permissions. Also 'opens' file and sets its vnode to 473 * text mode. 474 */ 475 error = exec_check_permissions(imgp); 476 if (error) 477 goto exec_fail_dealloc; 478 479 imgp->object = imgp->vp->v_object; 480 if (imgp->object != NULL) 481 vm_object_reference(imgp->object); 482 483 error = exec_map_first_page(imgp); 484 if (error) 485 goto exec_fail_dealloc; 486 487 imgp->proc->p_osrel = 0; 488 imgp->proc->p_fctl0 = 0; 489 490 /* 491 * Implement image setuid/setgid. 492 * 493 * Determine new credentials before attempting image activators 494 * so that it can be used by process_exec handlers to determine 495 * credential/setid changes. 496 * 497 * Don't honor setuid/setgid if the filesystem prohibits it or if 498 * the process is being traced. 499 * 500 * We disable setuid/setgid/etc in capability mode on the basis 501 * that most setugid applications are not written with that 502 * environment in mind, and will therefore almost certainly operate 503 * incorrectly. In principle there's no reason that setugid 504 * applications might not be useful in capability mode, so we may want 505 * to reconsider this conservative design choice in the future. 506 * 507 * XXXMAC: For the time being, use NOSUID to also prohibit 508 * transitions on the file system. 509 */ 510 credential_changing = 0; 511 credential_changing |= (attr.va_mode & S_ISUID) && 512 oldcred->cr_uid != attr.va_uid; 513 credential_changing |= (attr.va_mode & S_ISGID) && 514 oldcred->cr_gid != attr.va_gid; 515 #ifdef MAC 516 will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, 517 interpvplabel, imgp); 518 credential_changing |= will_transition; 519 #endif 520 521 /* Don't inherit PROC_PDEATHSIG_CTL value if setuid/setgid. */ 522 if (credential_changing) 523 imgp->proc->p_pdeathsig = 0; 524 525 if (credential_changing && 526 #ifdef CAPABILITY_MODE 527 ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && 528 #endif 529 (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && 530 (p->p_flag & P_TRACED) == 0) { 531 imgp->credential_setid = true; 532 VOP_UNLOCK(imgp->vp); 533 imgp->newcred = crdup(oldcred); 534 if (attr.va_mode & S_ISUID) { 535 euip = uifind(attr.va_uid); 536 change_euid(imgp->newcred, euip); 537 } 538 vn_lock(imgp->vp, LK_SHARED | LK_RETRY); 539 if (attr.va_mode & S_ISGID) 540 change_egid(imgp->newcred, attr.va_gid); 541 /* 542 * Implement correct POSIX saved-id behavior. 543 * 544 * XXXMAC: Note that the current logic will save the 545 * uid and gid if a MAC domain transition occurs, even 546 * though maybe it shouldn't. 547 */ 548 change_svuid(imgp->newcred, imgp->newcred->cr_uid); 549 change_svgid(imgp->newcred, imgp->newcred->cr_gid); 550 } else { 551 /* 552 * Implement correct POSIX saved-id behavior. 553 * 554 * XXX: It's not clear that the existing behavior is 555 * POSIX-compliant. A number of sources indicate that the 556 * saved uid/gid should only be updated if the new ruid is 557 * not equal to the old ruid, or the new euid is not equal 558 * to the old euid and the new euid is not equal to the old 559 * ruid. The FreeBSD code always updates the saved uid/gid. 560 * Also, this code uses the new (replaced) euid and egid as 561 * the source, which may or may not be the right ones to use. 562 */ 563 if (oldcred->cr_svuid != oldcred->cr_uid || 564 oldcred->cr_svgid != oldcred->cr_gid) { 565 VOP_UNLOCK(imgp->vp); 566 imgp->newcred = crdup(oldcred); 567 vn_lock(imgp->vp, LK_SHARED | LK_RETRY); 568 change_svuid(imgp->newcred, imgp->newcred->cr_uid); 569 change_svgid(imgp->newcred, imgp->newcred->cr_gid); 570 } 571 } 572 /* The new credentials are installed into the process later. */ 573 574 /* 575 * Do the best to calculate the full path to the image file. 576 */ 577 if (args->fname != NULL && args->fname[0] == '/') 578 imgp->execpath = args->fname; 579 else { 580 VOP_UNLOCK(imgp->vp); 581 if (vn_fullpath(imgp->vp, &imgp->execpath, &imgp->freepath) != 0) 582 imgp->execpath = args->fname; 583 vn_lock(imgp->vp, LK_SHARED | LK_RETRY); 584 } 585 586 /* 587 * If the current process has a special image activator it 588 * wants to try first, call it. For example, emulating shell 589 * scripts differently. 590 */ 591 error = -1; 592 if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) 593 error = img_first(imgp); 594 595 /* 596 * Loop through the list of image activators, calling each one. 597 * An activator returns -1 if there is no match, 0 on success, 598 * and an error otherwise. 599 */ 600 for (i = 0; error == -1 && execsw[i]; ++i) { 601 if (execsw[i]->ex_imgact == NULL || 602 execsw[i]->ex_imgact == img_first) { 603 continue; 604 } 605 error = (*execsw[i]->ex_imgact)(imgp); 606 } 607 608 if (error) { 609 if (error == -1) 610 error = ENOEXEC; 611 goto exec_fail_dealloc; 612 } 613 614 /* 615 * Special interpreter operation, cleanup and loop up to try to 616 * activate the interpreter. 617 */ 618 if (imgp->interpreted) { 619 exec_unmap_first_page(imgp); 620 /* 621 * The text reference needs to be removed for scripts. 622 * There is a short period before we determine that 623 * something is a script where text reference is active. 624 * The vnode lock is held over this entire period 625 * so nothing should illegitimately be blocked. 626 */ 627 MPASS(imgp->textset); 628 VOP_UNSET_TEXT_CHECKED(newtextvp); 629 imgp->textset = false; 630 /* free name buffer and old vnode */ 631 if (args->fname != NULL) 632 NDFREE(&nd, NDF_ONLY_PNBUF); 633 #ifdef MAC 634 mac_execve_interpreter_enter(newtextvp, &interpvplabel); 635 #endif 636 if (imgp->opened) { 637 VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td); 638 imgp->opened = 0; 639 } 640 vput(newtextvp); 641 vm_object_deallocate(imgp->object); 642 imgp->object = NULL; 643 imgp->credential_setid = false; 644 if (imgp->newcred != NULL) { 645 crfree(imgp->newcred); 646 imgp->newcred = NULL; 647 } 648 imgp->execpath = NULL; 649 free(imgp->freepath, M_TEMP); 650 imgp->freepath = NULL; 651 /* set new name to that of the interpreter */ 652 NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | LOCKSHARED | FOLLOW | 653 SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); 654 args->fname = imgp->interpreter_name; 655 goto interpret; 656 } 657 658 /* 659 * NB: We unlock the vnode here because it is believed that none 660 * of the sv_copyout_strings/sv_fixup operations require the vnode. 661 */ 662 VOP_UNLOCK(imgp->vp); 663 664 if (disallow_high_osrel && 665 P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) { 666 error = ENOEXEC; 667 uprintf("Osrel %d for image %s too high\n", p->p_osrel, 668 imgp->execpath != NULL ? imgp->execpath : "<unresolved>"); 669 vn_lock(imgp->vp, LK_SHARED | LK_RETRY); 670 goto exec_fail_dealloc; 671 } 672 673 /* ABI enforces the use of Capsicum. Switch into capabilities mode. */ 674 if (SV_PROC_FLAG(p, SV_CAPSICUM)) 675 sys_cap_enter(td, NULL); 676 677 /* 678 * Copy out strings (args and env) and initialize stack base. 679 */ 680 error = (*p->p_sysent->sv_copyout_strings)(imgp, &stack_base); 681 if (error != 0) { 682 vn_lock(imgp->vp, LK_SHARED | LK_RETRY); 683 goto exec_fail_dealloc; 684 } 685 686 /* 687 * Stack setup. 688 */ 689 error = (*p->p_sysent->sv_fixup)(&stack_base, imgp); 690 if (error != 0) { 691 vn_lock(imgp->vp, LK_SHARED | LK_RETRY); 692 goto exec_fail_dealloc; 693 } 694 695 if (args->fdp != NULL) { 696 /* Install a brand new file descriptor table. */ 697 fdinstall_remapped(td, args->fdp); 698 args->fdp = NULL; 699 } else { 700 /* 701 * Keep on using the existing file descriptor table. For 702 * security and other reasons, the file descriptor table 703 * cannot be shared after an exec. 704 */ 705 fdunshare(td); 706 pdunshare(td); 707 /* close files on exec */ 708 fdcloseexec(td); 709 } 710 711 /* 712 * Malloc things before we need locks. 713 */ 714 i = exec_args_get_begin_envv(imgp->args) - imgp->args->begin_argv; 715 /* Cache arguments if they fit inside our allowance */ 716 if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { 717 newargs = pargs_alloc(i); 718 bcopy(imgp->args->begin_argv, newargs->ar_args, i); 719 } 720 721 /* 722 * For security and other reasons, signal handlers cannot 723 * be shared after an exec. The new process gets a copy of the old 724 * handlers. In execsigs(), the new process will have its signals 725 * reset. 726 */ 727 if (sigacts_shared(p->p_sigacts)) { 728 oldsigacts = p->p_sigacts; 729 newsigacts = sigacts_alloc(); 730 sigacts_copy(newsigacts, oldsigacts); 731 } 732 733 vn_lock(imgp->vp, LK_SHARED | LK_RETRY); 734 735 PROC_LOCK(p); 736 if (oldsigacts) 737 p->p_sigacts = newsigacts; 738 /* Stop profiling */ 739 stopprofclock(p); 740 741 /* reset caught signals */ 742 execsigs(p); 743 744 /* name this process - nameiexec(p, ndp) */ 745 bzero(p->p_comm, sizeof(p->p_comm)); 746 if (args->fname) 747 bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, 748 min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); 749 else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0) 750 bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); 751 bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); 752 #ifdef KTR 753 sched_clear_tdname(td); 754 #endif 755 756 /* 757 * mark as execed, wakeup the process that vforked (if any) and tell 758 * it that it now has its own resources back 759 */ 760 p->p_flag |= P_EXEC; 761 if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0) 762 p->p_flag2 &= ~P2_NOTRACE; 763 if ((p->p_flag2 & P2_STKGAP_DISABLE_EXEC) == 0) 764 p->p_flag2 &= ~P2_STKGAP_DISABLE; 765 if (p->p_flag & P_PPWAIT) { 766 p->p_flag &= ~(P_PPWAIT | P_PPTRACE); 767 cv_broadcast(&p->p_pwait); 768 /* STOPs are no longer ignored, arrange for AST */ 769 signotify(td); 770 } 771 772 /* 773 * Implement image setuid/setgid installation. 774 */ 775 if (imgp->credential_setid) { 776 /* 777 * Turn off syscall tracing for set-id programs, except for 778 * root. Record any set-id flags first to make sure that 779 * we do not regain any tracing during a possible block. 780 */ 781 setsugid(p); 782 783 #ifdef KTRACE 784 if (p->p_tracecred != NULL && 785 priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED)) 786 ktrprocexec(p, &tracecred, &tracevp); 787 #endif 788 /* 789 * Close any file descriptors 0..2 that reference procfs, 790 * then make sure file descriptors 0..2 are in use. 791 * 792 * Both fdsetugidsafety() and fdcheckstd() may call functions 793 * taking sleepable locks, so temporarily drop our locks. 794 */ 795 PROC_UNLOCK(p); 796 VOP_UNLOCK(imgp->vp); 797 fdsetugidsafety(td); 798 error = fdcheckstd(td); 799 vn_lock(imgp->vp, LK_SHARED | LK_RETRY); 800 if (error != 0) 801 goto exec_fail_dealloc; 802 PROC_LOCK(p); 803 #ifdef MAC 804 if (will_transition) { 805 mac_vnode_execve_transition(oldcred, imgp->newcred, 806 imgp->vp, interpvplabel, imgp); 807 } 808 #endif 809 } else { 810 if (oldcred->cr_uid == oldcred->cr_ruid && 811 oldcred->cr_gid == oldcred->cr_rgid) 812 p->p_flag &= ~P_SUGID; 813 } 814 /* 815 * Set the new credentials. 816 */ 817 if (imgp->newcred != NULL) { 818 proc_set_cred(p, imgp->newcred); 819 crfree(oldcred); 820 oldcred = NULL; 821 } 822 823 /* 824 * Store the vp for use in procfs. This vnode was referenced by namei 825 * or fgetvp_exec. 826 */ 827 oldtextvp = p->p_textvp; 828 p->p_textvp = newtextvp; 829 830 #ifdef KDTRACE_HOOKS 831 /* 832 * Tell the DTrace fasttrap provider about the exec if it 833 * has declared an interest. 834 */ 835 if (dtrace_fasttrap_exec) 836 dtrace_fasttrap_exec(p); 837 #endif 838 839 /* 840 * Notify others that we exec'd, and clear the P_INEXEC flag 841 * as we're now a bona fide freshly-execed process. 842 */ 843 KNOTE_LOCKED(p->p_klist, NOTE_EXEC); 844 p->p_flag &= ~P_INEXEC; 845 846 /* clear "fork but no exec" flag, as we _are_ execing */ 847 p->p_acflag &= ~AFORK; 848 849 /* 850 * Free any previous argument cache and replace it with 851 * the new argument cache, if any. 852 */ 853 oldargs = p->p_args; 854 p->p_args = newargs; 855 newargs = NULL; 856 857 PROC_UNLOCK(p); 858 859 #ifdef HWPMC_HOOKS 860 /* 861 * Check if system-wide sampling is in effect or if the 862 * current process is using PMCs. If so, do exec() time 863 * processing. This processing needs to happen AFTER the 864 * P_INEXEC flag is cleared. 865 */ 866 if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { 867 VOP_UNLOCK(imgp->vp); 868 pe.pm_credentialschanged = credential_changing; 869 pe.pm_entryaddr = imgp->entry_addr; 870 871 PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); 872 vn_lock(imgp->vp, LK_SHARED | LK_RETRY); 873 } 874 #endif 875 876 /* Set values passed into the program in registers. */ 877 (*p->p_sysent->sv_setregs)(td, imgp, stack_base); 878 879 VOP_MMAPPED(imgp->vp); 880 881 SDT_PROBE1(proc, , , exec__success, args->fname); 882 883 exec_fail_dealloc: 884 if (error != 0) { 885 p->p_osrel = orig_osrel; 886 p->p_fctl0 = orig_fctl0; 887 } 888 889 if (imgp->firstpage != NULL) 890 exec_unmap_first_page(imgp); 891 892 if (imgp->vp != NULL) { 893 if (args->fname) 894 NDFREE(&nd, NDF_ONLY_PNBUF); 895 if (imgp->opened) 896 VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); 897 if (imgp->textset) 898 VOP_UNSET_TEXT_CHECKED(imgp->vp); 899 if (error != 0) 900 vput(imgp->vp); 901 else 902 VOP_UNLOCK(imgp->vp); 903 } 904 905 if (imgp->object != NULL) 906 vm_object_deallocate(imgp->object); 907 908 free(imgp->freepath, M_TEMP); 909 910 if (error == 0) { 911 if (p->p_ptevents & PTRACE_EXEC) { 912 PROC_LOCK(p); 913 if (p->p_ptevents & PTRACE_EXEC) 914 td->td_dbgflags |= TDB_EXEC; 915 PROC_UNLOCK(p); 916 } 917 } else { 918 exec_fail: 919 /* we're done here, clear P_INEXEC */ 920 PROC_LOCK(p); 921 p->p_flag &= ~P_INEXEC; 922 PROC_UNLOCK(p); 923 924 SDT_PROBE1(proc, , , exec__failure, error); 925 } 926 927 if (imgp->newcred != NULL && oldcred != NULL) 928 crfree(imgp->newcred); 929 930 #ifdef MAC 931 mac_execve_exit(imgp); 932 mac_execve_interpreter_exit(interpvplabel); 933 #endif 934 exec_free_args(args); 935 936 /* 937 * Handle deferred decrement of ref counts. 938 */ 939 if (oldtextvp != NULL) 940 vrele(oldtextvp); 941 #ifdef KTRACE 942 if (tracevp != NULL) 943 vrele(tracevp); 944 if (tracecred != NULL) 945 crfree(tracecred); 946 #endif 947 pargs_drop(oldargs); 948 pargs_drop(newargs); 949 if (oldsigacts != NULL) 950 sigacts_free(oldsigacts); 951 if (euip != NULL) 952 uifree(euip); 953 954 if (error && imgp->vmspace_destroyed) { 955 /* sorry, no more process anymore. exit gracefully */ 956 exec_cleanup(td, oldvmspace); 957 exit1(td, 0, SIGABRT); 958 /* NOT REACHED */ 959 } 960 961 #ifdef KTRACE 962 if (error == 0) 963 ktrprocctor(p); 964 #endif 965 966 /* 967 * We don't want cpu_set_syscall_retval() to overwrite any of 968 * the register values put in place by exec_setregs(). 969 * Implementations of cpu_set_syscall_retval() will leave 970 * registers unmodified when returning EJUSTRETURN. 971 */ 972 return (error == 0 ? EJUSTRETURN : error); 973 } 974 975 void 976 exec_cleanup(struct thread *td, struct vmspace *oldvmspace) 977 { 978 if ((td->td_pflags & TDP_EXECVMSPC) != 0) { 979 KASSERT(td->td_proc->p_vmspace != oldvmspace, 980 ("oldvmspace still used")); 981 vmspace_free(oldvmspace); 982 td->td_pflags &= ~TDP_EXECVMSPC; 983 } 984 } 985 986 int 987 exec_map_first_page(struct image_params *imgp) 988 { 989 vm_object_t object; 990 vm_page_t m; 991 int error; 992 993 if (imgp->firstpage != NULL) 994 exec_unmap_first_page(imgp); 995 996 object = imgp->vp->v_object; 997 if (object == NULL) 998 return (EACCES); 999 #if VM_NRESERVLEVEL > 0 1000 if ((object->flags & OBJ_COLORED) == 0) { 1001 VM_OBJECT_WLOCK(object); 1002 vm_object_color(object, 0); 1003 VM_OBJECT_WUNLOCK(object); 1004 } 1005 #endif 1006 error = vm_page_grab_valid_unlocked(&m, object, 0, 1007 VM_ALLOC_COUNT(VM_INITIAL_PAGEIN) | 1008 VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED); 1009 1010 if (error != VM_PAGER_OK) 1011 return (EIO); 1012 imgp->firstpage = sf_buf_alloc(m, 0); 1013 imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); 1014 1015 return (0); 1016 } 1017 1018 void 1019 exec_unmap_first_page(struct image_params *imgp) 1020 { 1021 vm_page_t m; 1022 1023 if (imgp->firstpage != NULL) { 1024 m = sf_buf_page(imgp->firstpage); 1025 sf_buf_free(imgp->firstpage); 1026 imgp->firstpage = NULL; 1027 vm_page_unwire(m, PQ_ACTIVE); 1028 } 1029 } 1030 1031 /* 1032 * Destroy old address space, and allocate a new stack. 1033 * The new stack is only sgrowsiz large because it is grown 1034 * automatically on a page fault. 1035 */ 1036 int 1037 exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv) 1038 { 1039 int error; 1040 struct proc *p = imgp->proc; 1041 struct vmspace *vmspace = p->p_vmspace; 1042 struct thread *td = curthread; 1043 vm_object_t obj; 1044 struct rlimit rlim_stack; 1045 vm_offset_t sv_minuser, stack_addr; 1046 vm_map_t map; 1047 vm_prot_t stack_prot; 1048 u_long ssiz; 1049 1050 imgp->vmspace_destroyed = 1; 1051 imgp->sysent = sv; 1052 1053 sigfastblock_clear(td); 1054 umtx_exec(p); 1055 itimers_exec(p); 1056 if (sv->sv_onexec != NULL) 1057 sv->sv_onexec(p, imgp); 1058 1059 EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp); 1060 1061 /* 1062 * Blow away entire process VM, if address space not shared, 1063 * otherwise, create a new VM space so that other threads are 1064 * not disrupted 1065 */ 1066 map = &vmspace->vm_map; 1067 if (map_at_zero) 1068 sv_minuser = sv->sv_minuser; 1069 else 1070 sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); 1071 if (refcount_load(&vmspace->vm_refcnt) == 1 && 1072 vm_map_min(map) == sv_minuser && 1073 vm_map_max(map) == sv->sv_maxuser && 1074 cpu_exec_vmspace_reuse(p, map)) { 1075 shmexit(vmspace); 1076 pmap_remove_pages(vmspace_pmap(vmspace)); 1077 vm_map_remove(map, vm_map_min(map), vm_map_max(map)); 1078 /* 1079 * An exec terminates mlockall(MCL_FUTURE). 1080 * ASLR and W^X states must be re-evaluated. 1081 */ 1082 vm_map_lock(map); 1083 vm_map_modflags(map, 0, MAP_WIREFUTURE | MAP_ASLR | 1084 MAP_ASLR_IGNSTART | MAP_WXORX); 1085 vm_map_unlock(map); 1086 } else { 1087 error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); 1088 if (error) 1089 return (error); 1090 vmspace = p->p_vmspace; 1091 map = &vmspace->vm_map; 1092 } 1093 map->flags |= imgp->map_flags; 1094 1095 /* Map a shared page */ 1096 obj = sv->sv_shared_page_obj; 1097 if (obj != NULL) { 1098 vm_object_reference(obj); 1099 error = vm_map_fixed(map, obj, 0, 1100 sv->sv_shared_page_base, sv->sv_shared_page_len, 1101 VM_PROT_READ | VM_PROT_EXECUTE, 1102 VM_PROT_READ | VM_PROT_EXECUTE, 1103 MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); 1104 if (error != KERN_SUCCESS) { 1105 vm_object_deallocate(obj); 1106 return (vm_mmap_to_errno(error)); 1107 } 1108 } 1109 1110 /* Allocate a new stack */ 1111 if (imgp->stack_sz != 0) { 1112 ssiz = trunc_page(imgp->stack_sz); 1113 PROC_LOCK(p); 1114 lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack); 1115 PROC_UNLOCK(p); 1116 if (ssiz > rlim_stack.rlim_max) 1117 ssiz = rlim_stack.rlim_max; 1118 if (ssiz > rlim_stack.rlim_cur) { 1119 rlim_stack.rlim_cur = ssiz; 1120 kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); 1121 } 1122 } else if (sv->sv_maxssiz != NULL) { 1123 ssiz = *sv->sv_maxssiz; 1124 } else { 1125 ssiz = maxssiz; 1126 } 1127 imgp->eff_stack_sz = lim_cur(curthread, RLIMIT_STACK); 1128 if (ssiz < imgp->eff_stack_sz) 1129 imgp->eff_stack_sz = ssiz; 1130 stack_addr = sv->sv_usrstack - ssiz; 1131 stack_prot = obj != NULL && imgp->stack_prot != 0 ? 1132 imgp->stack_prot : sv->sv_stackprot; 1133 error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, stack_prot, 1134 VM_PROT_ALL, MAP_STACK_GROWS_DOWN); 1135 if (error != KERN_SUCCESS) { 1136 uprintf("exec_new_vmspace: mapping stack size %#jx prot %#x " 1137 "failed mach error %d errno %d\n", (uintmax_t)ssiz, 1138 stack_prot, error, vm_mmap_to_errno(error)); 1139 return (vm_mmap_to_errno(error)); 1140 } 1141 1142 /* 1143 * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they 1144 * are still used to enforce the stack rlimit on the process stack. 1145 */ 1146 vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; 1147 vmspace->vm_maxsaddr = (char *)stack_addr; 1148 1149 return (0); 1150 } 1151 1152 /* 1153 * Copy out argument and environment strings from the old process address 1154 * space into the temporary string buffer. 1155 */ 1156 int 1157 exec_copyin_args(struct image_args *args, const char *fname, 1158 enum uio_seg segflg, char **argv, char **envv) 1159 { 1160 u_long arg, env; 1161 int error; 1162 1163 bzero(args, sizeof(*args)); 1164 if (argv == NULL) 1165 return (EFAULT); 1166 1167 /* 1168 * Allocate demand-paged memory for the file name, argument, and 1169 * environment strings. 1170 */ 1171 error = exec_alloc_args(args); 1172 if (error != 0) 1173 return (error); 1174 1175 /* 1176 * Copy the file name. 1177 */ 1178 error = exec_args_add_fname(args, fname, segflg); 1179 if (error != 0) 1180 goto err_exit; 1181 1182 /* 1183 * extract arguments first 1184 */ 1185 for (;;) { 1186 error = fueword(argv++, &arg); 1187 if (error == -1) { 1188 error = EFAULT; 1189 goto err_exit; 1190 } 1191 if (arg == 0) 1192 break; 1193 error = exec_args_add_arg(args, (char *)(uintptr_t)arg, 1194 UIO_USERSPACE); 1195 if (error != 0) 1196 goto err_exit; 1197 } 1198 1199 /* 1200 * extract environment strings 1201 */ 1202 if (envv) { 1203 for (;;) { 1204 error = fueword(envv++, &env); 1205 if (error == -1) { 1206 error = EFAULT; 1207 goto err_exit; 1208 } 1209 if (env == 0) 1210 break; 1211 error = exec_args_add_env(args, 1212 (char *)(uintptr_t)env, UIO_USERSPACE); 1213 if (error != 0) 1214 goto err_exit; 1215 } 1216 } 1217 1218 return (0); 1219 1220 err_exit: 1221 exec_free_args(args); 1222 return (error); 1223 } 1224 1225 int 1226 exec_copyin_data_fds(struct thread *td, struct image_args *args, 1227 const void *data, size_t datalen, const int *fds, size_t fdslen) 1228 { 1229 struct filedesc *ofdp; 1230 const char *p; 1231 int *kfds; 1232 int error; 1233 1234 memset(args, '\0', sizeof(*args)); 1235 ofdp = td->td_proc->p_fd; 1236 if (datalen >= ARG_MAX || fdslen >= ofdp->fd_nfiles) 1237 return (E2BIG); 1238 error = exec_alloc_args(args); 1239 if (error != 0) 1240 return (error); 1241 1242 args->begin_argv = args->buf; 1243 args->stringspace = ARG_MAX; 1244 1245 if (datalen > 0) { 1246 /* 1247 * Argument buffer has been provided. Copy it into the 1248 * kernel as a single string and add a terminating null 1249 * byte. 1250 */ 1251 error = copyin(data, args->begin_argv, datalen); 1252 if (error != 0) 1253 goto err_exit; 1254 args->begin_argv[datalen] = '\0'; 1255 args->endp = args->begin_argv + datalen + 1; 1256 args->stringspace -= datalen + 1; 1257 1258 /* 1259 * Traditional argument counting. Count the number of 1260 * null bytes. 1261 */ 1262 for (p = args->begin_argv; p < args->endp; ++p) 1263 if (*p == '\0') 1264 ++args->argc; 1265 } else { 1266 /* No argument buffer provided. */ 1267 args->endp = args->begin_argv; 1268 } 1269 1270 /* Create new file descriptor table. */ 1271 kfds = malloc(fdslen * sizeof(int), M_TEMP, M_WAITOK); 1272 error = copyin(fds, kfds, fdslen * sizeof(int)); 1273 if (error != 0) { 1274 free(kfds, M_TEMP); 1275 goto err_exit; 1276 } 1277 error = fdcopy_remapped(ofdp, kfds, fdslen, &args->fdp); 1278 free(kfds, M_TEMP); 1279 if (error != 0) 1280 goto err_exit; 1281 1282 return (0); 1283 err_exit: 1284 exec_free_args(args); 1285 return (error); 1286 } 1287 1288 struct exec_args_kva { 1289 vm_offset_t addr; 1290 u_int gen; 1291 SLIST_ENTRY(exec_args_kva) next; 1292 }; 1293 1294 DPCPU_DEFINE_STATIC(struct exec_args_kva *, exec_args_kva); 1295 1296 static SLIST_HEAD(, exec_args_kva) exec_args_kva_freelist; 1297 static struct mtx exec_args_kva_mtx; 1298 static u_int exec_args_gen; 1299 1300 static void 1301 exec_prealloc_args_kva(void *arg __unused) 1302 { 1303 struct exec_args_kva *argkva; 1304 u_int i; 1305 1306 SLIST_INIT(&exec_args_kva_freelist); 1307 mtx_init(&exec_args_kva_mtx, "exec args kva", NULL, MTX_DEF); 1308 for (i = 0; i < exec_map_entries; i++) { 1309 argkva = malloc(sizeof(*argkva), M_PARGS, M_WAITOK); 1310 argkva->addr = kmap_alloc_wait(exec_map, exec_map_entry_size); 1311 argkva->gen = exec_args_gen; 1312 SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); 1313 } 1314 } 1315 SYSINIT(exec_args_kva, SI_SUB_EXEC, SI_ORDER_ANY, exec_prealloc_args_kva, NULL); 1316 1317 static vm_offset_t 1318 exec_alloc_args_kva(void **cookie) 1319 { 1320 struct exec_args_kva *argkva; 1321 1322 argkva = (void *)atomic_readandclear_ptr( 1323 (uintptr_t *)DPCPU_PTR(exec_args_kva)); 1324 if (argkva == NULL) { 1325 mtx_lock(&exec_args_kva_mtx); 1326 while ((argkva = SLIST_FIRST(&exec_args_kva_freelist)) == NULL) 1327 (void)mtx_sleep(&exec_args_kva_freelist, 1328 &exec_args_kva_mtx, 0, "execkva", 0); 1329 SLIST_REMOVE_HEAD(&exec_args_kva_freelist, next); 1330 mtx_unlock(&exec_args_kva_mtx); 1331 } 1332 kasan_mark((void *)argkva->addr, exec_map_entry_size, 1333 exec_map_entry_size, 0); 1334 *(struct exec_args_kva **)cookie = argkva; 1335 return (argkva->addr); 1336 } 1337 1338 static void 1339 exec_release_args_kva(struct exec_args_kva *argkva, u_int gen) 1340 { 1341 vm_offset_t base; 1342 1343 base = argkva->addr; 1344 kasan_mark((void *)argkva->addr, 0, exec_map_entry_size, 1345 KASAN_EXEC_ARGS_FREED); 1346 if (argkva->gen != gen) { 1347 (void)vm_map_madvise(exec_map, base, base + exec_map_entry_size, 1348 MADV_FREE); 1349 argkva->gen = gen; 1350 } 1351 if (!atomic_cmpset_ptr((uintptr_t *)DPCPU_PTR(exec_args_kva), 1352 (uintptr_t)NULL, (uintptr_t)argkva)) { 1353 mtx_lock(&exec_args_kva_mtx); 1354 SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); 1355 wakeup_one(&exec_args_kva_freelist); 1356 mtx_unlock(&exec_args_kva_mtx); 1357 } 1358 } 1359 1360 static void 1361 exec_free_args_kva(void *cookie) 1362 { 1363 1364 exec_release_args_kva(cookie, exec_args_gen); 1365 } 1366 1367 static void 1368 exec_args_kva_lowmem(void *arg __unused) 1369 { 1370 SLIST_HEAD(, exec_args_kva) head; 1371 struct exec_args_kva *argkva; 1372 u_int gen; 1373 int i; 1374 1375 gen = atomic_fetchadd_int(&exec_args_gen, 1) + 1; 1376 1377 /* 1378 * Force an madvise of each KVA range. Any currently allocated ranges 1379 * will have MADV_FREE applied once they are freed. 1380 */ 1381 SLIST_INIT(&head); 1382 mtx_lock(&exec_args_kva_mtx); 1383 SLIST_SWAP(&head, &exec_args_kva_freelist, exec_args_kva); 1384 mtx_unlock(&exec_args_kva_mtx); 1385 while ((argkva = SLIST_FIRST(&head)) != NULL) { 1386 SLIST_REMOVE_HEAD(&head, next); 1387 exec_release_args_kva(argkva, gen); 1388 } 1389 1390 CPU_FOREACH(i) { 1391 argkva = (void *)atomic_readandclear_ptr( 1392 (uintptr_t *)DPCPU_ID_PTR(i, exec_args_kva)); 1393 if (argkva != NULL) 1394 exec_release_args_kva(argkva, gen); 1395 } 1396 } 1397 EVENTHANDLER_DEFINE(vm_lowmem, exec_args_kva_lowmem, NULL, 1398 EVENTHANDLER_PRI_ANY); 1399 1400 /* 1401 * Allocate temporary demand-paged, zero-filled memory for the file name, 1402 * argument, and environment strings. 1403 */ 1404 int 1405 exec_alloc_args(struct image_args *args) 1406 { 1407 1408 args->buf = (char *)exec_alloc_args_kva(&args->bufkva); 1409 return (0); 1410 } 1411 1412 void 1413 exec_free_args(struct image_args *args) 1414 { 1415 1416 if (args->buf != NULL) { 1417 exec_free_args_kva(args->bufkva); 1418 args->buf = NULL; 1419 } 1420 if (args->fname_buf != NULL) { 1421 free(args->fname_buf, M_TEMP); 1422 args->fname_buf = NULL; 1423 } 1424 if (args->fdp != NULL) 1425 fdescfree_remapped(args->fdp); 1426 } 1427 1428 /* 1429 * A set to functions to fill struct image args. 1430 * 1431 * NOTE: exec_args_add_fname() must be called (possibly with a NULL 1432 * fname) before the other functions. All exec_args_add_arg() calls must 1433 * be made before any exec_args_add_env() calls. exec_args_adjust_args() 1434 * may be called any time after exec_args_add_fname(). 1435 * 1436 * exec_args_add_fname() - install path to be executed 1437 * exec_args_add_arg() - append an argument string 1438 * exec_args_add_env() - append an env string 1439 * exec_args_adjust_args() - adjust location of the argument list to 1440 * allow new arguments to be prepended 1441 */ 1442 int 1443 exec_args_add_fname(struct image_args *args, const char *fname, 1444 enum uio_seg segflg) 1445 { 1446 int error; 1447 size_t length; 1448 1449 KASSERT(args->fname == NULL, ("fname already appended")); 1450 KASSERT(args->endp == NULL, ("already appending to args")); 1451 1452 if (fname != NULL) { 1453 args->fname = args->buf; 1454 error = segflg == UIO_SYSSPACE ? 1455 copystr(fname, args->fname, PATH_MAX, &length) : 1456 copyinstr(fname, args->fname, PATH_MAX, &length); 1457 if (error != 0) 1458 return (error == ENAMETOOLONG ? E2BIG : error); 1459 } else 1460 length = 0; 1461 1462 /* Set up for _arg_*()/_env_*() */ 1463 args->endp = args->buf + length; 1464 /* begin_argv must be set and kept updated */ 1465 args->begin_argv = args->endp; 1466 KASSERT(exec_map_entry_size - length >= ARG_MAX, 1467 ("too little space remaining for arguments %zu < %zu", 1468 exec_map_entry_size - length, (size_t)ARG_MAX)); 1469 args->stringspace = ARG_MAX; 1470 1471 return (0); 1472 } 1473 1474 static int 1475 exec_args_add_str(struct image_args *args, const char *str, 1476 enum uio_seg segflg, int *countp) 1477 { 1478 int error; 1479 size_t length; 1480 1481 KASSERT(args->endp != NULL, ("endp not initialized")); 1482 KASSERT(args->begin_argv != NULL, ("begin_argp not initialized")); 1483 1484 error = (segflg == UIO_SYSSPACE) ? 1485 copystr(str, args->endp, args->stringspace, &length) : 1486 copyinstr(str, args->endp, args->stringspace, &length); 1487 if (error != 0) 1488 return (error == ENAMETOOLONG ? E2BIG : error); 1489 args->stringspace -= length; 1490 args->endp += length; 1491 (*countp)++; 1492 1493 return (0); 1494 } 1495 1496 int 1497 exec_args_add_arg(struct image_args *args, const char *argp, 1498 enum uio_seg segflg) 1499 { 1500 1501 KASSERT(args->envc == 0, ("appending args after env")); 1502 1503 return (exec_args_add_str(args, argp, segflg, &args->argc)); 1504 } 1505 1506 int 1507 exec_args_add_env(struct image_args *args, const char *envp, 1508 enum uio_seg segflg) 1509 { 1510 1511 if (args->envc == 0) 1512 args->begin_envv = args->endp; 1513 1514 return (exec_args_add_str(args, envp, segflg, &args->envc)); 1515 } 1516 1517 int 1518 exec_args_adjust_args(struct image_args *args, size_t consume, ssize_t extend) 1519 { 1520 ssize_t offset; 1521 1522 KASSERT(args->endp != NULL, ("endp not initialized")); 1523 KASSERT(args->begin_argv != NULL, ("begin_argp not initialized")); 1524 1525 offset = extend - consume; 1526 if (args->stringspace < offset) 1527 return (E2BIG); 1528 memmove(args->begin_argv + extend, args->begin_argv + consume, 1529 args->endp - args->begin_argv + consume); 1530 if (args->envc > 0) 1531 args->begin_envv += offset; 1532 args->endp += offset; 1533 args->stringspace -= offset; 1534 return (0); 1535 } 1536 1537 char * 1538 exec_args_get_begin_envv(struct image_args *args) 1539 { 1540 1541 KASSERT(args->endp != NULL, ("endp not initialized")); 1542 1543 if (args->envc > 0) 1544 return (args->begin_envv); 1545 return (args->endp); 1546 } 1547 1548 void 1549 exec_stackgap(struct image_params *imgp, uintptr_t *dp) 1550 { 1551 if (imgp->sysent->sv_stackgap == NULL || 1552 (imgp->proc->p_fctl0 & (NT_FREEBSD_FCTL_ASLR_DISABLE | 1553 NT_FREEBSD_FCTL_ASG_DISABLE)) != 0 || 1554 (imgp->map_flags & MAP_ASLR) == 0) 1555 return; 1556 imgp->sysent->sv_stackgap(imgp, dp); 1557 } 1558 1559 /* 1560 * Copy strings out to the new process address space, constructing new arg 1561 * and env vector tables. Return a pointer to the base so that it can be used 1562 * as the initial stack pointer. 1563 */ 1564 int 1565 exec_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) 1566 { 1567 int argc, envc; 1568 char **vectp; 1569 char *stringp; 1570 uintptr_t destp, ustringp; 1571 struct ps_strings *arginfo; 1572 struct proc *p; 1573 size_t execpath_len; 1574 int error, szsigcode, szps; 1575 char canary[sizeof(long) * 8]; 1576 1577 szps = sizeof(pagesizes[0]) * MAXPAGESIZES; 1578 /* 1579 * Calculate string base and vector table pointers. 1580 * Also deal with signal trampoline code for this exec type. 1581 */ 1582 if (imgp->execpath != NULL && imgp->auxargs != NULL) 1583 execpath_len = strlen(imgp->execpath) + 1; 1584 else 1585 execpath_len = 0; 1586 p = imgp->proc; 1587 szsigcode = 0; 1588 arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; 1589 imgp->ps_strings = arginfo; 1590 if (p->p_sysent->sv_sigcode_base == 0) { 1591 if (p->p_sysent->sv_szsigcode != NULL) 1592 szsigcode = *(p->p_sysent->sv_szsigcode); 1593 } 1594 destp = (uintptr_t)arginfo; 1595 1596 /* 1597 * install sigcode 1598 */ 1599 if (szsigcode != 0) { 1600 destp -= szsigcode; 1601 destp = rounddown2(destp, sizeof(void *)); 1602 error = copyout(p->p_sysent->sv_sigcode, (void *)destp, 1603 szsigcode); 1604 if (error != 0) 1605 return (error); 1606 } 1607 1608 /* 1609 * Copy the image path for the rtld. 1610 */ 1611 if (execpath_len != 0) { 1612 destp -= execpath_len; 1613 destp = rounddown2(destp, sizeof(void *)); 1614 imgp->execpathp = (void *)destp; 1615 error = copyout(imgp->execpath, imgp->execpathp, execpath_len); 1616 if (error != 0) 1617 return (error); 1618 } 1619 1620 /* 1621 * Prepare the canary for SSP. 1622 */ 1623 arc4rand(canary, sizeof(canary), 0); 1624 destp -= sizeof(canary); 1625 imgp->canary = (void *)destp; 1626 error = copyout(canary, imgp->canary, sizeof(canary)); 1627 if (error != 0) 1628 return (error); 1629 imgp->canarylen = sizeof(canary); 1630 1631 /* 1632 * Prepare the pagesizes array. 1633 */ 1634 destp -= szps; 1635 destp = rounddown2(destp, sizeof(void *)); 1636 imgp->pagesizes = (void *)destp; 1637 error = copyout(pagesizes, imgp->pagesizes, szps); 1638 if (error != 0) 1639 return (error); 1640 imgp->pagesizeslen = szps; 1641 1642 /* 1643 * Allocate room for the argument and environment strings. 1644 */ 1645 destp -= ARG_MAX - imgp->args->stringspace; 1646 destp = rounddown2(destp, sizeof(void *)); 1647 ustringp = destp; 1648 1649 exec_stackgap(imgp, &destp); 1650 1651 if (imgp->auxargs) { 1652 /* 1653 * Allocate room on the stack for the ELF auxargs 1654 * array. It has up to AT_COUNT entries. 1655 */ 1656 destp -= AT_COUNT * sizeof(Elf_Auxinfo); 1657 destp = rounddown2(destp, sizeof(void *)); 1658 } 1659 1660 vectp = (char **)destp; 1661 1662 /* 1663 * Allocate room for the argv[] and env vectors including the 1664 * terminating NULL pointers. 1665 */ 1666 vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; 1667 1668 /* 1669 * vectp also becomes our initial stack base 1670 */ 1671 *stack_base = (uintptr_t)vectp; 1672 1673 stringp = imgp->args->begin_argv; 1674 argc = imgp->args->argc; 1675 envc = imgp->args->envc; 1676 1677 /* 1678 * Copy out strings - arguments and environment. 1679 */ 1680 error = copyout(stringp, (void *)ustringp, 1681 ARG_MAX - imgp->args->stringspace); 1682 if (error != 0) 1683 return (error); 1684 1685 /* 1686 * Fill in "ps_strings" struct for ps, w, etc. 1687 */ 1688 imgp->argv = vectp; 1689 if (suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp) != 0 || 1690 suword32(&arginfo->ps_nargvstr, argc) != 0) 1691 return (EFAULT); 1692 1693 /* 1694 * Fill in argument portion of vector table. 1695 */ 1696 for (; argc > 0; --argc) { 1697 if (suword(vectp++, ustringp) != 0) 1698 return (EFAULT); 1699 while (*stringp++ != 0) 1700 ustringp++; 1701 ustringp++; 1702 } 1703 1704 /* a null vector table pointer separates the argp's from the envp's */ 1705 if (suword(vectp++, 0) != 0) 1706 return (EFAULT); 1707 1708 imgp->envv = vectp; 1709 if (suword(&arginfo->ps_envstr, (long)(intptr_t)vectp) != 0 || 1710 suword32(&arginfo->ps_nenvstr, envc) != 0) 1711 return (EFAULT); 1712 1713 /* 1714 * Fill in environment portion of vector table. 1715 */ 1716 for (; envc > 0; --envc) { 1717 if (suword(vectp++, ustringp) != 0) 1718 return (EFAULT); 1719 while (*stringp++ != 0) 1720 ustringp++; 1721 ustringp++; 1722 } 1723 1724 /* end of vector table is a null pointer */ 1725 if (suword(vectp, 0) != 0) 1726 return (EFAULT); 1727 1728 if (imgp->auxargs) { 1729 vectp++; 1730 error = imgp->sysent->sv_copyout_auxargs(imgp, 1731 (uintptr_t)vectp); 1732 if (error != 0) 1733 return (error); 1734 } 1735 1736 return (0); 1737 } 1738 1739 /* 1740 * Check permissions of file to execute. 1741 * Called with imgp->vp locked. 1742 * Return 0 for success or error code on failure. 1743 */ 1744 int 1745 exec_check_permissions(struct image_params *imgp) 1746 { 1747 struct vnode *vp = imgp->vp; 1748 struct vattr *attr = imgp->attr; 1749 struct thread *td; 1750 int error; 1751 1752 td = curthread; 1753 1754 /* Get file attributes */ 1755 error = VOP_GETATTR(vp, attr, td->td_ucred); 1756 if (error) 1757 return (error); 1758 1759 #ifdef MAC 1760 error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); 1761 if (error) 1762 return (error); 1763 #endif 1764 1765 /* 1766 * 1) Check if file execution is disabled for the filesystem that 1767 * this file resides on. 1768 * 2) Ensure that at least one execute bit is on. Otherwise, a 1769 * privileged user will always succeed, and we don't want this 1770 * to happen unless the file really is executable. 1771 * 3) Ensure that the file is a regular file. 1772 */ 1773 if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || 1774 (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 || 1775 (attr->va_type != VREG)) 1776 return (EACCES); 1777 1778 /* 1779 * Zero length files can't be exec'd 1780 */ 1781 if (attr->va_size == 0) 1782 return (ENOEXEC); 1783 1784 /* 1785 * Check for execute permission to file based on current credentials. 1786 */ 1787 error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); 1788 if (error) 1789 return (error); 1790 1791 /* 1792 * Check number of open-for-writes on the file and deny execution 1793 * if there are any. 1794 * 1795 * Add a text reference now so no one can write to the 1796 * executable while we're activating it. 1797 * 1798 * Remember if this was set before and unset it in case this is not 1799 * actually an executable image. 1800 */ 1801 error = VOP_SET_TEXT(vp); 1802 if (error != 0) 1803 return (error); 1804 imgp->textset = true; 1805 1806 /* 1807 * Call filesystem specific open routine (which does nothing in the 1808 * general case). 1809 */ 1810 error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); 1811 if (error == 0) 1812 imgp->opened = 1; 1813 return (error); 1814 } 1815 1816 /* 1817 * Exec handler registration 1818 */ 1819 int 1820 exec_register(const struct execsw *execsw_arg) 1821 { 1822 const struct execsw **es, **xs, **newexecsw; 1823 u_int count = 2; /* New slot and trailing NULL */ 1824 1825 if (execsw) 1826 for (es = execsw; *es; es++) 1827 count++; 1828 newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); 1829 xs = newexecsw; 1830 if (execsw) 1831 for (es = execsw; *es; es++) 1832 *xs++ = *es; 1833 *xs++ = execsw_arg; 1834 *xs = NULL; 1835 if (execsw) 1836 free(execsw, M_TEMP); 1837 execsw = newexecsw; 1838 return (0); 1839 } 1840 1841 int 1842 exec_unregister(const struct execsw *execsw_arg) 1843 { 1844 const struct execsw **es, **xs, **newexecsw; 1845 int count = 1; 1846 1847 if (execsw == NULL) 1848 panic("unregister with no handlers left?\n"); 1849 1850 for (es = execsw; *es; es++) { 1851 if (*es == execsw_arg) 1852 break; 1853 } 1854 if (*es == NULL) 1855 return (ENOENT); 1856 for (es = execsw; *es; es++) 1857 if (*es != execsw_arg) 1858 count++; 1859 newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); 1860 xs = newexecsw; 1861 for (es = execsw; *es; es++) 1862 if (*es != execsw_arg) 1863 *xs++ = *es; 1864 *xs = NULL; 1865 if (execsw) 1866 free(execsw, M_TEMP); 1867 execsw = newexecsw; 1868 return (0); 1869 } 1870