1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1988 University of Utah. 5 * Copyright (c) 1991, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * the Systems Programming Group of the University of Utah Computer 10 * Science Department. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ 37 */ 38 39 /* 40 * Mapped file (mmap) interface to VM 41 */ 42 43 #include "opt_hwpmc_hooks.h" 44 #include "opt_hwt_hooks.h" 45 #include "opt_vm.h" 46 47 #define EXTERR_CATEGORY EXTERR_CAT_MMAP 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/capsicum.h> 51 #include <sys/exterrvar.h> 52 #include <sys/kernel.h> 53 #include <sys/lock.h> 54 #include <sys/mutex.h> 55 #include <sys/sysproto.h> 56 #include <sys/elf.h> 57 #include <sys/filedesc.h> 58 #include <sys/priv.h> 59 #include <sys/proc.h> 60 #include <sys/procctl.h> 61 #include <sys/racct.h> 62 #include <sys/resource.h> 63 #include <sys/resourcevar.h> 64 #include <sys/rwlock.h> 65 #include <sys/sysctl.h> 66 #include <sys/vnode.h> 67 #include <sys/fcntl.h> 68 #include <sys/file.h> 69 #include <sys/mman.h> 70 #include <sys/mount.h> 71 #include <sys/conf.h> 72 #include <sys/stat.h> 73 #include <sys/syscallsubr.h> 74 #include <sys/sysent.h> 75 #include <sys/vmmeter.h> 76 #if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */ 77 #include <machine/md_var.h> 78 #endif 79 80 #include <security/audit/audit.h> 81 #include <security/mac/mac_framework.h> 82 83 #include <vm/vm.h> 84 #include <vm/vm_param.h> 85 #include <vm/pmap.h> 86 #include <vm/vm_map.h> 87 #include <vm/vm_object.h> 88 #include <vm/vm_page.h> 89 #include <vm/vm_pager.h> 90 #include <vm/vm_pageout.h> 91 #include <vm/vm_extern.h> 92 #include <vm/vm_page.h> 93 #include <vm/vnode_pager.h> 94 95 #ifdef HWPMC_HOOKS 96 #include <sys/pmckern.h> 97 #endif 98 99 #ifdef HWT_HOOKS 100 #include <dev/hwt/hwt_hook.h> 101 #endif 102 103 int old_mlock = 0; 104 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0, 105 "Do not apply RLIMIT_MEMLOCK on mlockall"); 106 static int mincore_mapped = 1; 107 SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0, 108 "mincore reports mappings, not residency"); 109 static int imply_prot_max = 0; 110 SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0, 111 "Imply maximum page protections in mmap() when none are specified"); 112 113 _Static_assert(MAXPAGESIZES <= 4, "MINCORE_SUPER too narrow"); 114 115 #if defined(COMPAT_43) 116 int 117 ogetpagesize(struct thread *td, struct ogetpagesize_args *uap) 118 { 119 120 td->td_retval[0] = PAGE_SIZE; 121 return (0); 122 } 123 #endif /* COMPAT_43 */ 124 125 /* 126 * Memory Map (mmap) system call. Note that the file offset 127 * and address are allowed to be NOT page aligned, though if 128 * the MAP_FIXED flag it set, both must have the same remainder 129 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not 130 * page-aligned, the actual mapping starts at trunc_page(addr) 131 * and the return value is adjusted up by the page offset. 132 * 133 * Generally speaking, only character devices which are themselves 134 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise 135 * there would be no cache coherency between a descriptor and a VM mapping 136 * both to the same character device. 137 */ 138 #ifndef _SYS_SYSPROTO_H_ 139 struct mmap_args { 140 void *addr; 141 size_t len; 142 int prot; 143 int flags; 144 int fd; 145 long pad; 146 off_t pos; 147 }; 148 #endif 149 150 int 151 sys_mmap(struct thread *td, struct mmap_args *uap) 152 { 153 154 return (kern_mmap(td, &(struct mmap_req){ 155 .mr_hint = (uintptr_t)uap->addr, 156 .mr_len = uap->len, 157 .mr_prot = uap->prot, 158 .mr_flags = uap->flags, 159 .mr_fd = uap->fd, 160 .mr_pos = uap->pos, 161 })); 162 } 163 164 int 165 kern_mmap_maxprot(struct proc *p, int prot) 166 { 167 168 if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 || 169 (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0) 170 return (_PROT_ALL); 171 if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) && 172 prot != PROT_NONE) 173 return (prot); 174 return (_PROT_ALL); 175 } 176 177 int 178 kern_mmap(struct thread *td, const struct mmap_req *mrp) 179 { 180 struct vmspace *vms; 181 struct file *fp; 182 struct proc *p; 183 off_t pos; 184 vm_offset_t addr, orig_addr; 185 vm_size_t len, pageoff, size; 186 vm_prot_t cap_maxprot; 187 int align, error, fd, flags, max_prot, prot; 188 cap_rights_t rights; 189 mmap_check_fp_fn check_fp_fn; 190 191 orig_addr = addr = mrp->mr_hint; 192 len = mrp->mr_len; 193 prot = mrp->mr_prot; 194 flags = mrp->mr_flags; 195 fd = mrp->mr_fd; 196 pos = mrp->mr_pos; 197 check_fp_fn = mrp->mr_check_fp_fn; 198 199 if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0) { 200 return (EXTERROR(EINVAL, "unknown PROT bits %#jx", prot)); 201 } 202 max_prot = PROT_MAX_EXTRACT(prot); 203 prot = PROT_EXTRACT(prot); 204 if (max_prot != 0 && (max_prot & prot) != prot) { 205 return (EXTERROR(ENOTSUP, 206 "prot %#jx is not subset of max_prot %#jx", 207 prot, max_prot)); 208 } 209 210 p = td->td_proc; 211 212 /* 213 * Always honor PROT_MAX if set. If not, default to all 214 * permissions unless we're implying maximum permissions. 215 */ 216 if (max_prot == 0) 217 max_prot = kern_mmap_maxprot(p, prot); 218 219 vms = p->p_vmspace; 220 fp = NULL; 221 AUDIT_ARG_FD(fd); 222 223 /* 224 * Ignore old flags that used to be defined but did not do anything. 225 */ 226 flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040); 227 228 /* 229 * Enforce the constraints. 230 * Mapping of length 0 is only allowed for old binaries. 231 * Anonymous mapping shall specify -1 as filedescriptor and 232 * zero position for new code. Be nice to ancient a.out 233 * binaries and correct pos for anonymous mapping, since old 234 * ld.so sometimes issues anonymous map requests with non-zero 235 * pos. 236 */ 237 if (!SV_CURPROC_FLAG(SV_AOUT)) { 238 if (len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) 239 return (EXTERROR(EINVAL, "mapping with zero length")); 240 if ((flags & MAP_ANON) != 0) { 241 if (fd != -1) 242 return (EXTERROR(EINVAL, 243 "fd %#jd not -1 for MAP_ANON", fd)); 244 if (pos != 0) 245 return (EXTERROR(EINVAL, 246 "offset %#jd not zero for MAP_ANON", pos)); 247 } 248 } else { 249 if ((flags & MAP_ANON) != 0) 250 pos = 0; 251 } 252 253 if (flags & MAP_STACK) { 254 if ((fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) != 255 (PROT_READ | PROT_WRITE))) { 256 return (EXTERROR(EINVAL, 257 "MAP_STACK with prot %#jx < rw", prot)); 258 } 259 flags |= MAP_ANON; 260 pos = 0; 261 } 262 if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE | 263 MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE | 264 MAP_PREFAULT_READ | MAP_GUARD | MAP_32BIT | 265 MAP_ALIGNMENT_MASK)) != 0) { 266 return (EXTERROR(EINVAL, "reserved flag set (flags %#jx)", 267 flags)); 268 } 269 if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) { 270 return (EXTERROR(EINVAL, "EXCL without FIXED (flags %#jx)", 271 flags)); 272 } 273 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | 274 MAP_PRIVATE)) { 275 return (EXTERROR(EINVAL, 276 "both SHARED and PRIVATE set (flags %#jx)", flags)); 277 } 278 if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 || 279 pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL | 280 MAP_32BIT | MAP_ALIGNMENT_MASK)) != 0)) { 281 return (EXTERROR(EINVAL, "GUARD with wrong parameters")); 282 } 283 284 /* 285 * Align the file position to a page boundary, 286 * and save its page offset component. 287 */ 288 pageoff = (pos & PAGE_MASK); 289 pos -= pageoff; 290 291 /* Compute size from len by rounding (on both ends). */ 292 size = len + pageoff; /* low end... */ 293 size = round_page(size); /* hi end */ 294 /* Check for rounding up to zero. */ 295 if (len > size) 296 return (ENOMEM); 297 298 /* Ensure alignment is at least a page and fits in a pointer. */ 299 align = flags & MAP_ALIGNMENT_MASK; 300 if (align != 0 && align != MAP_ALIGNED_SUPER) { 301 if (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY) 302 return (EXTERROR(EINVAL, "bad alignment %#jx >= %#jx", 303 align >> MAP_ALIGNMENT_SHIFT, 304 sizeof(void *) * NBBY)); 305 else if (align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT) 306 return (EXTERROR(EINVAL, "bad alignment %#jx < %#jx", 307 align >> MAP_ALIGNMENT_SHIFT, PAGE_SHIFT)); 308 } 309 310 /* 311 * Check for illegal addresses. Watch out for address wrap... Note 312 * that VM_*_ADDRESS are not constants due to casts (argh). 313 */ 314 if (flags & MAP_FIXED) { 315 /* 316 * The specified address must have the same remainder 317 * as the file offset taken modulo PAGE_SIZE, so it 318 * should be aligned after adjustment by pageoff. 319 */ 320 addr -= pageoff; 321 if ((addr & PAGE_MASK) != 0) { 322 return (EXTERROR(EINVAL, 323 "fixed mapping at %#jx not page aligned %#jx", addr, 324 PAGE_SIZE)); 325 } 326 327 /* Address range must be all in user VM space. */ 328 if (!vm_map_range_valid(&vms->vm_map, addr, addr + size)) { 329 return (EXTERROR(EINVAL, 330 "mapping %#jx-%#jx outside vm_map", addr, 331 addr + size)); 332 } 333 if ((flags & MAP_32BIT) && addr + size > MAP_32BIT_MAX_ADDR) { 334 return (EXTERROR(EINVAL, 335 "fixed 32bit mapping of [%#jx %#jx] does not fit into 4G", 336 addr, addr + size)); 337 } 338 } else if (flags & MAP_32BIT) { 339 /* 340 * For MAP_32BIT, override the hint if it is too high and 341 * do not bother moving the mapping past the heap (since 342 * the heap is usually above 2GB). 343 */ 344 if (addr + size > MAP_32BIT_MAX_ADDR) 345 addr = 0; 346 } else { 347 /* 348 * XXX for non-fixed mappings where no hint is provided or 349 * the hint would fall in the potential heap space, 350 * place it after the end of the largest possible heap. 351 * 352 * For anonymous mappings within the address space of the 353 * calling process, the absence of a hint is handled at a 354 * lower level in order to implement different clustering 355 * strategies for ASLR. 356 */ 357 if (((flags & MAP_ANON) == 0 && addr == 0) || 358 (addr >= round_page((vm_offset_t)vms->vm_taddr) && 359 addr < round_page((vm_offset_t)vms->vm_daddr + 360 lim_max(td, RLIMIT_DATA)))) 361 addr = round_page((vm_offset_t)vms->vm_daddr + 362 lim_max(td, RLIMIT_DATA)); 363 } 364 if (len == 0) { 365 /* 366 * Return success without mapping anything for old 367 * binaries that request a page-aligned mapping of 368 * length 0. For modern binaries, this function 369 * returns an error earlier. 370 */ 371 error = 0; 372 } else if ((flags & MAP_GUARD) != 0) { 373 error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE, 374 VM_PROT_NONE, flags, NULL, pos, FALSE, td); 375 } else if ((flags & MAP_ANON) != 0) { 376 /* 377 * Mapping blank space is trivial. 378 * 379 * This relies on VM_PROT_* matching PROT_*. 380 */ 381 error = vm_mmap_object(&vms->vm_map, &addr, size, prot, 382 max_prot, flags, NULL, pos, FALSE, td); 383 } else { 384 /* 385 * Mapping file, get fp for validation and don't let the 386 * descriptor disappear on us if we block. Check capability 387 * rights, but also return the maximum rights to be combined 388 * with maxprot later. 389 */ 390 cap_rights_init_one(&rights, CAP_MMAP); 391 if (prot & PROT_READ) 392 cap_rights_set_one(&rights, CAP_MMAP_R); 393 if ((flags & MAP_SHARED) != 0) { 394 if (prot & PROT_WRITE) 395 cap_rights_set_one(&rights, CAP_MMAP_W); 396 } 397 if (prot & PROT_EXEC) 398 cap_rights_set_one(&rights, CAP_MMAP_X); 399 error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp); 400 if (error != 0) 401 goto done; 402 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 && 403 p->p_osrel >= P_OSREL_MAP_FSTRICT) { 404 EXTERROR(EINVAL, "neither SHARED nor PRIVATE req"); 405 error = EINVAL; 406 goto done; 407 } 408 if (check_fp_fn != NULL) { 409 error = check_fp_fn(fp, prot, max_prot & cap_maxprot, 410 flags); 411 if (error != 0) 412 goto done; 413 } 414 if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data)) 415 addr = orig_addr; 416 /* This relies on VM_PROT_* matching PROT_*. */ 417 error = fo_mmap(fp, &vms->vm_map, &addr, size, prot, 418 max_prot & cap_maxprot, flags, pos, td); 419 } 420 421 if (error == 0) 422 td->td_retval[0] = addr + pageoff; 423 done: 424 if (fp) 425 fdrop(fp, td); 426 427 return (error); 428 } 429 430 #if defined(COMPAT_FREEBSD6) 431 int 432 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) 433 { 434 return (kern_mmap(td, &(struct mmap_req){ 435 .mr_hint = (uintptr_t)uap->addr, 436 .mr_len = uap->len, 437 .mr_prot = uap->prot, 438 .mr_flags = uap->flags, 439 .mr_fd = uap->fd, 440 .mr_pos = uap->pos, 441 })); 442 } 443 #endif 444 445 #ifdef COMPAT_43 446 #ifndef _SYS_SYSPROTO_H_ 447 struct ommap_args { 448 caddr_t addr; 449 int len; 450 int prot; 451 int flags; 452 int fd; 453 long pos; 454 }; 455 #endif 456 int 457 ommap(struct thread *td, struct ommap_args *uap) 458 { 459 return (kern_ommap(td, (uintptr_t)uap->addr, uap->len, uap->prot, 460 uap->flags, uap->fd, uap->pos)); 461 } 462 463 int 464 kern_ommap(struct thread *td, uintptr_t hint, int len, int oprot, 465 int oflags, int fd, long pos) 466 { 467 static const char cvtbsdprot[8] = { 468 0, 469 PROT_EXEC, 470 PROT_WRITE, 471 PROT_EXEC | PROT_WRITE, 472 PROT_READ, 473 PROT_EXEC | PROT_READ, 474 PROT_WRITE | PROT_READ, 475 PROT_EXEC | PROT_WRITE | PROT_READ, 476 }; 477 int flags, prot; 478 479 if (len < 0) 480 return (EINVAL); 481 482 #define OMAP_ANON 0x0002 483 #define OMAP_COPY 0x0020 484 #define OMAP_SHARED 0x0010 485 #define OMAP_FIXED 0x0100 486 487 prot = cvtbsdprot[oprot & 0x7]; 488 #if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__) 489 if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) && 490 prot != 0) 491 prot |= PROT_EXEC; 492 #endif 493 flags = 0; 494 if (oflags & OMAP_ANON) 495 flags |= MAP_ANON; 496 if (oflags & OMAP_COPY) 497 flags |= MAP_COPY; 498 if (oflags & OMAP_SHARED) 499 flags |= MAP_SHARED; 500 else 501 flags |= MAP_PRIVATE; 502 if (oflags & OMAP_FIXED) 503 flags |= MAP_FIXED; 504 return (kern_mmap(td, &(struct mmap_req){ 505 .mr_hint = hint, 506 .mr_len = len, 507 .mr_prot = prot, 508 .mr_flags = flags, 509 .mr_fd = fd, 510 .mr_pos = pos, 511 })); 512 } 513 #endif /* COMPAT_43 */ 514 515 #ifndef _SYS_SYSPROTO_H_ 516 struct msync_args { 517 void *addr; 518 size_t len; 519 int flags; 520 }; 521 #endif 522 int 523 sys_msync(struct thread *td, struct msync_args *uap) 524 { 525 526 return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags)); 527 } 528 529 int 530 kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags) 531 { 532 vm_offset_t addr; 533 vm_size_t pageoff; 534 vm_map_t map; 535 int rv; 536 537 addr = addr0; 538 pageoff = (addr & PAGE_MASK); 539 addr -= pageoff; 540 size += pageoff; 541 size = (vm_size_t) round_page(size); 542 if (addr + size < addr) 543 return (EINVAL); 544 545 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) 546 return (EINVAL); 547 548 map = &td->td_proc->p_vmspace->vm_map; 549 550 /* 551 * Clean the pages and interpret the return value. 552 */ 553 rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, 554 (flags & MS_INVALIDATE) != 0); 555 switch (rv) { 556 case KERN_SUCCESS: 557 return (0); 558 case KERN_INVALID_ADDRESS: 559 return (ENOMEM); 560 case KERN_INVALID_ARGUMENT: 561 return (EBUSY); 562 case KERN_FAILURE: 563 return (EIO); 564 default: 565 return (EINVAL); 566 } 567 } 568 569 #ifndef _SYS_SYSPROTO_H_ 570 struct munmap_args { 571 void *addr; 572 size_t len; 573 }; 574 #endif 575 int 576 sys_munmap(struct thread *td, struct munmap_args *uap) 577 { 578 579 return (kern_munmap(td, (uintptr_t)uap->addr, uap->len)); 580 } 581 582 int 583 kern_munmap(struct thread *td, uintptr_t addr0, size_t size) 584 { 585 #ifdef HWPMC_HOOKS 586 struct pmckern_map_out pkm; 587 vm_map_entry_t entry; 588 bool pmc_handled; 589 #endif 590 vm_offset_t addr, end; 591 vm_size_t pageoff; 592 vm_map_t map; 593 int rv; 594 595 if (size == 0) 596 return (EINVAL); 597 598 addr = addr0; 599 pageoff = (addr & PAGE_MASK); 600 addr -= pageoff; 601 size += pageoff; 602 size = (vm_size_t) round_page(size); 603 end = addr + size; 604 map = &td->td_proc->p_vmspace->vm_map; 605 if (!vm_map_range_valid(map, addr, end)) 606 return (EINVAL); 607 608 vm_map_lock(map); 609 #ifdef HWPMC_HOOKS 610 pmc_handled = false; 611 if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) { 612 pmc_handled = true; 613 /* 614 * Inform hwpmc if the address range being unmapped contains 615 * an executable region. 616 */ 617 pkm.pm_address = (uintptr_t) NULL; 618 if (vm_map_lookup_entry(map, addr, &entry)) { 619 for (; entry->start < end; 620 entry = vm_map_entry_succ(entry)) { 621 if (vm_map_check_protection(map, entry->start, 622 entry->end, VM_PROT_EXECUTE) == TRUE) { 623 pkm.pm_address = (uintptr_t) addr; 624 pkm.pm_size = (size_t) size; 625 break; 626 } 627 } 628 } 629 } 630 #endif 631 rv = vm_map_delete(map, addr, end); 632 633 #ifdef HWT_HOOKS 634 if (HWT_HOOK_INSTALLED && rv == KERN_SUCCESS) { 635 struct hwt_record_entry ent; 636 637 ent.addr = (uintptr_t) addr; 638 ent.fullpath = NULL; 639 ent.record_type = HWT_RECORD_MUNMAP; 640 HWT_CALL_HOOK(td, HWT_RECORD, &ent); 641 } 642 #endif 643 644 #ifdef HWPMC_HOOKS 645 if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) { 646 /* downgrade the lock to prevent a LOR with the pmc-sx lock */ 647 vm_map_lock_downgrade(map); 648 if (pkm.pm_address != (uintptr_t) NULL) 649 PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm); 650 vm_map_unlock_read(map); 651 } else 652 #endif 653 vm_map_unlock(map); 654 655 return (vm_mmap_to_errno(rv)); 656 } 657 658 #ifndef _SYS_SYSPROTO_H_ 659 struct mprotect_args { 660 const void *addr; 661 size_t len; 662 int prot; 663 }; 664 #endif 665 int 666 sys_mprotect(struct thread *td, struct mprotect_args *uap) 667 { 668 669 return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, 670 uap->prot, 0)); 671 } 672 673 int 674 kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot, 675 int flags) 676 { 677 vm_offset_t addr; 678 vm_size_t pageoff; 679 int vm_error, max_prot; 680 681 addr = addr0; 682 if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0) 683 return (EINVAL); 684 max_prot = PROT_MAX_EXTRACT(prot); 685 prot = PROT_EXTRACT(prot); 686 pageoff = (addr & PAGE_MASK); 687 addr -= pageoff; 688 size += pageoff; 689 size = (vm_size_t) round_page(size); 690 #ifdef COMPAT_FREEBSD32 691 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { 692 if (((addr + size) & 0xffffffff) < addr) 693 return (EINVAL); 694 } else 695 #endif 696 if (addr + size < addr) 697 return (EINVAL); 698 699 flags |= VM_MAP_PROTECT_SET_PROT; 700 if (max_prot != 0) 701 flags |= VM_MAP_PROTECT_SET_MAXPROT; 702 vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map, 703 addr, addr + size, prot, max_prot, flags); 704 705 switch (vm_error) { 706 case KERN_SUCCESS: 707 return (0); 708 case KERN_PROTECTION_FAILURE: 709 return (EACCES); 710 case KERN_RESOURCE_SHORTAGE: 711 return (ENOMEM); 712 case KERN_OUT_OF_BOUNDS: 713 return (ENOTSUP); 714 } 715 return (EINVAL); 716 } 717 718 #ifndef _SYS_SYSPROTO_H_ 719 struct minherit_args { 720 void *addr; 721 size_t len; 722 int inherit; 723 }; 724 #endif 725 int 726 sys_minherit(struct thread *td, struct minherit_args *uap) 727 { 728 729 return (kern_minherit(td, (uintptr_t)uap->addr, uap->len, 730 uap->inherit)); 731 } 732 733 int 734 kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0) 735 { 736 vm_offset_t addr; 737 vm_size_t size, pageoff; 738 vm_inherit_t inherit; 739 740 addr = (vm_offset_t)addr0; 741 size = len; 742 inherit = inherit0; 743 744 pageoff = (addr & PAGE_MASK); 745 addr -= pageoff; 746 size += pageoff; 747 size = (vm_size_t) round_page(size); 748 if (addr + size < addr) 749 return (EINVAL); 750 751 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, 752 addr + size, inherit)) { 753 case KERN_SUCCESS: 754 return (0); 755 case KERN_PROTECTION_FAILURE: 756 return (EACCES); 757 } 758 return (EINVAL); 759 } 760 761 #ifndef _SYS_SYSPROTO_H_ 762 struct madvise_args { 763 void *addr; 764 size_t len; 765 int behav; 766 }; 767 #endif 768 769 int 770 sys_madvise(struct thread *td, struct madvise_args *uap) 771 { 772 773 return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav)); 774 } 775 776 int 777 kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav) 778 { 779 vm_map_t map; 780 vm_offset_t addr, end, start; 781 int flags; 782 783 /* 784 * Check for our special case, advising the swap pager we are 785 * "immortal." 786 */ 787 if (behav == MADV_PROTECT) { 788 flags = PPROT_SET; 789 return (kern_procctl(td, P_PID, td->td_proc->p_pid, 790 PROC_SPROTECT, &flags)); 791 } 792 793 /* 794 * Check for illegal addresses. Watch out for address wrap... Note 795 * that VM_*_ADDRESS are not constants due to casts (argh). 796 */ 797 map = &td->td_proc->p_vmspace->vm_map; 798 addr = addr0; 799 if (!vm_map_range_valid(map, addr, addr + len)) 800 return (EINVAL); 801 802 /* 803 * Since this routine is only advisory, we default to conservative 804 * behavior. 805 */ 806 start = trunc_page(addr); 807 end = round_page(addr + len); 808 809 /* 810 * vm_map_madvise() checks for illegal values of behav. 811 */ 812 return (vm_map_madvise(map, start, end, behav)); 813 } 814 815 #ifndef _SYS_SYSPROTO_H_ 816 struct mincore_args { 817 const void *addr; 818 size_t len; 819 char *vec; 820 }; 821 #endif 822 823 int 824 sys_mincore(struct thread *td, struct mincore_args *uap) 825 { 826 827 return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec)); 828 } 829 830 int 831 kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec) 832 { 833 pmap_t pmap; 834 vm_map_t map; 835 vm_map_entry_t current, entry; 836 vm_object_t object; 837 vm_offset_t addr, cend, end, first_addr; 838 vm_paddr_t pa; 839 vm_page_t m; 840 vm_pindex_t pindex; 841 int error, lastvecindex, mincoreinfo, vecindex; 842 unsigned int timestamp; 843 844 /* 845 * Make sure that the addresses presented are valid for user 846 * mode. 847 */ 848 first_addr = addr = trunc_page(addr0); 849 end = round_page(addr0 + len); 850 map = &td->td_proc->p_vmspace->vm_map; 851 if (end > vm_map_max(map) || end < addr) 852 return (ENOMEM); 853 854 pmap = vmspace_pmap(td->td_proc->p_vmspace); 855 856 vm_map_lock_read(map); 857 RestartScan: 858 timestamp = map->timestamp; 859 860 if (!vm_map_lookup_entry(map, addr, &entry)) { 861 vm_map_unlock_read(map); 862 return (ENOMEM); 863 } 864 865 /* 866 * Do this on a map entry basis so that if the pages are not 867 * in the current processes address space, we can easily look 868 * up the pages elsewhere. 869 */ 870 lastvecindex = -1; 871 while (entry->start < end) { 872 /* 873 * check for contiguity 874 */ 875 current = entry; 876 entry = vm_map_entry_succ(current); 877 if (current->end < end && 878 entry->start > current->end) { 879 vm_map_unlock_read(map); 880 return (ENOMEM); 881 } 882 883 /* 884 * ignore submaps (for now) or null objects 885 */ 886 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || 887 current->object.vm_object == NULL) 888 continue; 889 890 /* 891 * limit this scan to the current map entry and the 892 * limits for the mincore call 893 */ 894 if (addr < current->start) 895 addr = current->start; 896 cend = current->end; 897 if (cend > end) 898 cend = end; 899 900 for (; addr < cend; addr += PAGE_SIZE) { 901 /* 902 * Check pmap first, it is likely faster, also 903 * it can provide info as to whether we are the 904 * one referencing or modifying the page. 905 */ 906 m = NULL; 907 object = NULL; 908 retry: 909 pa = 0; 910 mincoreinfo = pmap_mincore(pmap, addr, &pa); 911 if (mincore_mapped) { 912 /* 913 * We only care about this pmap's 914 * mapping of the page, if any. 915 */ 916 ; 917 } else if (pa != 0) { 918 /* 919 * The page is mapped by this process but not 920 * both accessed and modified. It is also 921 * managed. Acquire the object lock so that 922 * other mappings might be examined. The page's 923 * identity may change at any point before its 924 * object lock is acquired, so re-validate if 925 * necessary. 926 */ 927 m = PHYS_TO_VM_PAGE(pa); 928 while (object == NULL || m->object != object) { 929 if (object != NULL) 930 VM_OBJECT_WUNLOCK(object); 931 object = atomic_load_ptr(&m->object); 932 if (object == NULL) 933 goto retry; 934 VM_OBJECT_WLOCK(object); 935 } 936 if (pa != pmap_extract(pmap, addr)) 937 goto retry; 938 KASSERT(vm_page_all_valid(m), 939 ("mincore: page %p is mapped but invalid", 940 m)); 941 } else if (mincoreinfo == 0) { 942 /* 943 * The page is not mapped by this process. If 944 * the object implements managed pages, then 945 * determine if the page is resident so that 946 * the mappings might be examined. 947 */ 948 if (current->object.vm_object != object) { 949 if (object != NULL) 950 VM_OBJECT_WUNLOCK(object); 951 object = current->object.vm_object; 952 VM_OBJECT_WLOCK(object); 953 } 954 if ((object->flags & OBJ_SWAP) != 0 || 955 object->type == OBJT_VNODE) { 956 pindex = OFF_TO_IDX(current->offset + 957 (addr - current->start)); 958 m = vm_page_lookup(object, pindex); 959 if (m != NULL && vm_page_none_valid(m)) 960 m = NULL; 961 if (m != NULL) 962 mincoreinfo = MINCORE_INCORE; 963 } 964 } 965 if (m != NULL) { 966 VM_OBJECT_ASSERT_WLOCKED(m->object); 967 968 /* Examine other mappings of the page. */ 969 if (m->dirty == 0 && pmap_is_modified(m)) 970 vm_page_dirty(m); 971 if (m->dirty != 0) 972 mincoreinfo |= MINCORE_MODIFIED_OTHER; 973 974 /* 975 * The first test for PGA_REFERENCED is an 976 * optimization. The second test is 977 * required because a concurrent pmap 978 * operation could clear the last reference 979 * and set PGA_REFERENCED before the call to 980 * pmap_is_referenced(). 981 */ 982 if ((m->a.flags & PGA_REFERENCED) != 0 || 983 pmap_is_referenced(m) || 984 (m->a.flags & PGA_REFERENCED) != 0) 985 mincoreinfo |= MINCORE_REFERENCED_OTHER; 986 } 987 if (object != NULL) 988 VM_OBJECT_WUNLOCK(object); 989 990 /* 991 * subyte may page fault. In case it needs to modify 992 * the map, we release the lock. 993 */ 994 vm_map_unlock_read(map); 995 996 /* 997 * calculate index into user supplied byte vector 998 */ 999 vecindex = atop(addr - first_addr); 1000 1001 /* 1002 * If we have skipped map entries, we need to make sure that 1003 * the byte vector is zeroed for those skipped entries. 1004 */ 1005 while ((lastvecindex + 1) < vecindex) { 1006 ++lastvecindex; 1007 error = subyte(vec + lastvecindex, 0); 1008 if (error) { 1009 error = EFAULT; 1010 goto done2; 1011 } 1012 } 1013 1014 /* 1015 * Pass the page information to the user 1016 */ 1017 error = subyte(vec + vecindex, mincoreinfo); 1018 if (error) { 1019 error = EFAULT; 1020 goto done2; 1021 } 1022 1023 /* 1024 * If the map has changed, due to the subyte, the previous 1025 * output may be invalid. 1026 */ 1027 vm_map_lock_read(map); 1028 if (timestamp != map->timestamp) 1029 goto RestartScan; 1030 1031 lastvecindex = vecindex; 1032 } 1033 } 1034 1035 /* 1036 * subyte may page fault. In case it needs to modify 1037 * the map, we release the lock. 1038 */ 1039 vm_map_unlock_read(map); 1040 1041 /* 1042 * Zero the last entries in the byte vector. 1043 */ 1044 vecindex = atop(end - first_addr); 1045 while ((lastvecindex + 1) < vecindex) { 1046 ++lastvecindex; 1047 error = subyte(vec + lastvecindex, 0); 1048 if (error) { 1049 error = EFAULT; 1050 goto done2; 1051 } 1052 } 1053 1054 /* 1055 * If the map has changed, due to the subyte, the previous 1056 * output may be invalid. 1057 */ 1058 vm_map_lock_read(map); 1059 if (timestamp != map->timestamp) 1060 goto RestartScan; 1061 vm_map_unlock_read(map); 1062 done2: 1063 return (error); 1064 } 1065 1066 #ifndef _SYS_SYSPROTO_H_ 1067 struct mlock_args { 1068 const void *addr; 1069 size_t len; 1070 }; 1071 #endif 1072 int 1073 sys_mlock(struct thread *td, struct mlock_args *uap) 1074 { 1075 1076 return (kern_mlock(td->td_proc, td->td_ucred, 1077 __DECONST(uintptr_t, uap->addr), uap->len)); 1078 } 1079 1080 int 1081 kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len) 1082 { 1083 vm_offset_t addr, end, last, start; 1084 vm_size_t npages, size; 1085 vm_map_t map; 1086 unsigned long nsize; 1087 int error; 1088 1089 error = priv_check_cred(cred, PRIV_VM_MLOCK); 1090 if (error) 1091 return (error); 1092 addr = addr0; 1093 size = len; 1094 last = addr + size; 1095 start = trunc_page(addr); 1096 end = round_page(last); 1097 if (last < addr || end < addr) 1098 return (EINVAL); 1099 npages = atop(end - start); 1100 if (npages > vm_page_max_user_wired) 1101 return (ENOMEM); 1102 map = &proc->p_vmspace->vm_map; 1103 PROC_LOCK(proc); 1104 nsize = ptoa(npages + pmap_wired_count(map->pmap)); 1105 if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) { 1106 PROC_UNLOCK(proc); 1107 return (ENOMEM); 1108 } 1109 PROC_UNLOCK(proc); 1110 #ifdef RACCT 1111 if (racct_enable) { 1112 PROC_LOCK(proc); 1113 error = racct_set(proc, RACCT_MEMLOCK, nsize); 1114 PROC_UNLOCK(proc); 1115 if (error != 0) 1116 return (ENOMEM); 1117 } 1118 #endif 1119 error = vm_map_wire(map, start, end, 1120 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1121 #ifdef RACCT 1122 if (racct_enable && error != KERN_SUCCESS) { 1123 PROC_LOCK(proc); 1124 racct_set(proc, RACCT_MEMLOCK, 1125 ptoa(pmap_wired_count(map->pmap))); 1126 PROC_UNLOCK(proc); 1127 } 1128 #endif 1129 switch (error) { 1130 case KERN_SUCCESS: 1131 return (0); 1132 case KERN_INVALID_ARGUMENT: 1133 return (EINVAL); 1134 default: 1135 return (ENOMEM); 1136 } 1137 } 1138 1139 #ifndef _SYS_SYSPROTO_H_ 1140 struct mlockall_args { 1141 int how; 1142 }; 1143 #endif 1144 1145 int 1146 sys_mlockall(struct thread *td, struct mlockall_args *uap) 1147 { 1148 vm_map_t map; 1149 int error; 1150 1151 map = &td->td_proc->p_vmspace->vm_map; 1152 error = priv_check(td, PRIV_VM_MLOCK); 1153 if (error) 1154 return (error); 1155 1156 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) 1157 return (EINVAL); 1158 1159 /* 1160 * If wiring all pages in the process would cause it to exceed 1161 * a hard resource limit, return ENOMEM. 1162 */ 1163 if (!old_mlock && uap->how & MCL_CURRENT) { 1164 if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) 1165 return (ENOMEM); 1166 } 1167 #ifdef RACCT 1168 if (racct_enable) { 1169 PROC_LOCK(td->td_proc); 1170 error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size); 1171 PROC_UNLOCK(td->td_proc); 1172 if (error != 0) 1173 return (ENOMEM); 1174 } 1175 #endif 1176 1177 if (uap->how & MCL_FUTURE) { 1178 vm_map_lock(map); 1179 vm_map_modflags(map, MAP_WIREFUTURE, 0); 1180 vm_map_unlock(map); 1181 error = 0; 1182 } 1183 1184 if (uap->how & MCL_CURRENT) { 1185 /* 1186 * P1003.1-2001 mandates that all currently mapped pages 1187 * will be memory resident and locked (wired) upon return 1188 * from mlockall(). vm_map_wire() will wire pages, by 1189 * calling vm_fault_wire() for each page in the region. 1190 */ 1191 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), 1192 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1193 if (error == KERN_SUCCESS) 1194 error = 0; 1195 else if (error == KERN_RESOURCE_SHORTAGE) 1196 error = ENOMEM; 1197 else 1198 error = EAGAIN; 1199 } 1200 #ifdef RACCT 1201 if (racct_enable && error != KERN_SUCCESS) { 1202 PROC_LOCK(td->td_proc); 1203 racct_set(td->td_proc, RACCT_MEMLOCK, 1204 ptoa(pmap_wired_count(map->pmap))); 1205 PROC_UNLOCK(td->td_proc); 1206 } 1207 #endif 1208 1209 return (error); 1210 } 1211 1212 #ifndef _SYS_SYSPROTO_H_ 1213 struct munlockall_args { 1214 register_t dummy; 1215 }; 1216 #endif 1217 1218 int 1219 sys_munlockall(struct thread *td, struct munlockall_args *uap) 1220 { 1221 vm_map_t map; 1222 int error; 1223 1224 map = &td->td_proc->p_vmspace->vm_map; 1225 error = priv_check(td, PRIV_VM_MUNLOCK); 1226 if (error) 1227 return (error); 1228 1229 /* Clear the MAP_WIREFUTURE flag from this vm_map. */ 1230 vm_map_lock(map); 1231 vm_map_modflags(map, 0, MAP_WIREFUTURE); 1232 vm_map_unlock(map); 1233 1234 /* Forcibly unwire all pages. */ 1235 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), 1236 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1237 #ifdef RACCT 1238 if (racct_enable && error == KERN_SUCCESS) { 1239 PROC_LOCK(td->td_proc); 1240 racct_set(td->td_proc, RACCT_MEMLOCK, 0); 1241 PROC_UNLOCK(td->td_proc); 1242 } 1243 #endif 1244 1245 return (error); 1246 } 1247 1248 #ifndef _SYS_SYSPROTO_H_ 1249 struct munlock_args { 1250 const void *addr; 1251 size_t len; 1252 }; 1253 #endif 1254 int 1255 sys_munlock(struct thread *td, struct munlock_args *uap) 1256 { 1257 1258 return (kern_munlock(td, (uintptr_t)uap->addr, uap->len)); 1259 } 1260 1261 int 1262 kern_munlock(struct thread *td, uintptr_t addr0, size_t size) 1263 { 1264 vm_offset_t addr, end, last, start; 1265 #ifdef RACCT 1266 vm_map_t map; 1267 #endif 1268 int error; 1269 1270 error = priv_check(td, PRIV_VM_MUNLOCK); 1271 if (error) 1272 return (error); 1273 addr = addr0; 1274 last = addr + size; 1275 start = trunc_page(addr); 1276 end = round_page(last); 1277 if (last < addr || end < addr) 1278 return (EINVAL); 1279 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, 1280 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1281 #ifdef RACCT 1282 if (racct_enable && error == KERN_SUCCESS) { 1283 PROC_LOCK(td->td_proc); 1284 map = &td->td_proc->p_vmspace->vm_map; 1285 racct_set(td->td_proc, RACCT_MEMLOCK, 1286 ptoa(pmap_wired_count(map->pmap))); 1287 PROC_UNLOCK(td->td_proc); 1288 } 1289 #endif 1290 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1291 } 1292 1293 /* 1294 * vm_mmap_vnode() 1295 * 1296 * Helper function for vm_mmap. Perform sanity check specific for mmap 1297 * operations on vnodes. 1298 */ 1299 int 1300 vm_mmap_vnode(struct thread *td, vm_size_t objsize, 1301 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1302 struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp, 1303 boolean_t *writecounted) 1304 { 1305 struct vattr va; 1306 vm_object_t obj; 1307 vm_ooffset_t foff; 1308 struct ucred *cred; 1309 int error, flags; 1310 bool writex; 1311 1312 cred = td->td_ucred; 1313 writex = (*maxprotp & VM_PROT_WRITE) != 0 && 1314 (*flagsp & MAP_SHARED) != 0; 1315 if ((error = vget(vp, LK_SHARED)) != 0) 1316 return (error); 1317 AUDIT_ARG_VNODE1(vp); 1318 foff = *foffp; 1319 flags = *flagsp; 1320 obj = vp->v_object; 1321 if (vp->v_type == VREG) { 1322 /* 1323 * Get the proper underlying object 1324 */ 1325 if (obj == NULL) { 1326 error = EINVAL; 1327 goto done; 1328 } 1329 if (obj->type == OBJT_VNODE && obj->handle != vp) { 1330 vput(vp); 1331 vp = (struct vnode *)obj->handle; 1332 /* 1333 * Bypass filesystems obey the mpsafety of the 1334 * underlying fs. Tmpfs never bypasses. 1335 */ 1336 error = vget(vp, LK_SHARED); 1337 if (error != 0) 1338 return (error); 1339 } 1340 if (writex) { 1341 *writecounted = TRUE; 1342 vm_pager_update_writecount(obj, 0, objsize); 1343 } 1344 } else { 1345 error = EXTERROR(EINVAL, "non-reg file"); 1346 goto done; 1347 } 1348 if ((error = VOP_GETATTR(vp, &va, cred))) 1349 goto done; 1350 #ifdef MAC 1351 /* This relies on VM_PROT_* matching PROT_*. */ 1352 error = mac_vnode_check_mmap(cred, vp, (int)prot, flags); 1353 if (error != 0) 1354 goto done; 1355 #endif 1356 if ((flags & MAP_SHARED) != 0) { 1357 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { 1358 if (prot & VM_PROT_WRITE) { 1359 error = EPERM; 1360 goto done; 1361 } 1362 *maxprotp &= ~VM_PROT_WRITE; 1363 } 1364 } 1365 /* 1366 * If it is a regular file without any references 1367 * we do not need to sync it. 1368 * Adjust object size to be the size of actual file. 1369 */ 1370 objsize = round_page(va.va_size); 1371 if (va.va_nlink == 0) 1372 flags |= MAP_NOSYNC; 1373 if (obj->type == OBJT_VNODE) { 1374 obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, 1375 cred); 1376 if (obj == NULL) { 1377 error = ENOMEM; 1378 goto done; 1379 } 1380 } else { 1381 KASSERT((obj->flags & OBJ_SWAP) != 0, ("wrong object type")); 1382 vm_object_reference(obj); 1383 #if VM_NRESERVLEVEL > 0 1384 if ((obj->flags & OBJ_COLORED) == 0) { 1385 VM_OBJECT_WLOCK(obj); 1386 vm_object_color(obj, 0); 1387 VM_OBJECT_WUNLOCK(obj); 1388 } 1389 #endif 1390 } 1391 *objp = obj; 1392 *flagsp = flags; 1393 1394 VOP_MMAPPED(vp); 1395 1396 done: 1397 if (error != 0 && *writecounted) { 1398 *writecounted = FALSE; 1399 vm_pager_update_writecount(obj, objsize, 0); 1400 } 1401 vput(vp); 1402 return (error); 1403 } 1404 1405 /* 1406 * vm_mmap_cdev() 1407 * 1408 * Helper function for vm_mmap. Perform sanity check specific for mmap 1409 * operations on cdevs. 1410 */ 1411 int 1412 vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot, 1413 vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw, 1414 vm_ooffset_t *foff, vm_object_t *objp) 1415 { 1416 vm_object_t obj; 1417 int error, flags; 1418 1419 flags = *flagsp; 1420 1421 if (dsw->d_flags & D_MMAP_ANON) { 1422 *objp = NULL; 1423 *foff = 0; 1424 *maxprotp = VM_PROT_ALL; 1425 *flagsp |= MAP_ANON; 1426 return (0); 1427 } 1428 1429 /* 1430 * cdevs do not provide private mappings of any kind. 1431 */ 1432 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1433 (prot & VM_PROT_WRITE) != 0) 1434 return (EACCES); 1435 if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0) { 1436 return (EXTERROR(EINVAL, "cdev mapping must be shared")); 1437 } 1438 1439 /* 1440 * Force device mappings to be shared. 1441 */ 1442 flags |= MAP_SHARED; 1443 #ifdef MAC_XXX 1444 error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot); 1445 if (error != 0) 1446 return (error); 1447 #endif 1448 /* 1449 * First, try d_mmap_single(). If that is not implemented 1450 * (returns ENODEV), fall back to using the device pager. 1451 * Note that d_mmap_single() must return a reference to the 1452 * object (it needs to bump the reference count of the object 1453 * it returns somehow). 1454 * 1455 * XXX assumes VM_PROT_* == PROT_* 1456 */ 1457 error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); 1458 if (error != ENODEV) 1459 return (error); 1460 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, 1461 td->td_ucred); 1462 if (obj == NULL) { 1463 return (EXTERROR(EINVAL, 1464 "cdev driver does not support mmap")); 1465 } 1466 *objp = obj; 1467 *flagsp = flags; 1468 return (0); 1469 } 1470 1471 int 1472 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1473 vm_prot_t maxprot, int flags, 1474 objtype_t handle_type, void *handle, 1475 vm_ooffset_t foff) 1476 { 1477 vm_object_t object; 1478 struct thread *td = curthread; 1479 int error; 1480 boolean_t writecounted; 1481 1482 if (size == 0) { 1483 return (EXTERROR(EINVAL, "zero-sized req")); 1484 } 1485 1486 size = round_page(size); 1487 object = NULL; 1488 writecounted = FALSE; 1489 1490 switch (handle_type) { 1491 case OBJT_DEVICE: { 1492 struct cdevsw *dsw; 1493 struct cdev *cdev; 1494 int ref; 1495 1496 cdev = handle; 1497 dsw = dev_refthread(cdev, &ref); 1498 if (dsw == NULL) 1499 return (ENXIO); 1500 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev, 1501 dsw, &foff, &object); 1502 dev_relthread(cdev, ref); 1503 break; 1504 } 1505 case OBJT_VNODE: 1506 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, 1507 handle, &foff, &object, &writecounted); 1508 break; 1509 default: 1510 error = EXTERROR(EINVAL, "unsupported backing obj type %jd", 1511 handle_type); 1512 break; 1513 } 1514 if (error) 1515 return (error); 1516 1517 error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, 1518 foff, writecounted, td); 1519 if (error != 0 && object != NULL) { 1520 /* 1521 * If this mapping was accounted for in the vnode's 1522 * writecount, then undo that now. 1523 */ 1524 if (writecounted) 1525 vm_pager_release_writecount(object, 0, size); 1526 vm_object_deallocate(object); 1527 } 1528 return (error); 1529 } 1530 1531 int 1532 kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size) 1533 { 1534 int error; 1535 1536 RACCT_PROC_LOCK(td->td_proc); 1537 if (map->size + size > lim_cur(td, RLIMIT_VMEM)) { 1538 RACCT_PROC_UNLOCK(td->td_proc); 1539 return (ENOMEM); 1540 } 1541 if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { 1542 RACCT_PROC_UNLOCK(td->td_proc); 1543 return (ENOMEM); 1544 } 1545 if (!old_mlock && map->flags & MAP_WIREFUTURE) { 1546 if (ptoa(pmap_wired_count(map->pmap)) + size > 1547 lim_cur(td, RLIMIT_MEMLOCK)) { 1548 racct_set_force(td->td_proc, RACCT_VMEM, map->size); 1549 RACCT_PROC_UNLOCK(td->td_proc); 1550 return (ENOMEM); 1551 } 1552 error = racct_set(td->td_proc, RACCT_MEMLOCK, 1553 ptoa(pmap_wired_count(map->pmap)) + size); 1554 if (error != 0) { 1555 racct_set_force(td->td_proc, RACCT_VMEM, map->size); 1556 RACCT_PROC_UNLOCK(td->td_proc); 1557 return (error); 1558 } 1559 } 1560 RACCT_PROC_UNLOCK(td->td_proc); 1561 return (0); 1562 } 1563 1564 /* 1565 * Internal version of mmap that maps a specific VM object into an 1566 * map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap. 1567 */ 1568 int 1569 vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1570 vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff, 1571 boolean_t writecounted, struct thread *td) 1572 { 1573 vm_offset_t default_addr, max_addr; 1574 int docow, error, findspace, rv; 1575 bool curmap, fitit; 1576 1577 curmap = map == &td->td_proc->p_vmspace->vm_map; 1578 if (curmap) { 1579 error = kern_mmap_racct_check(td, map, size); 1580 if (error != 0) 1581 return (error); 1582 } 1583 1584 /* 1585 * We currently can only deal with page aligned file offsets. 1586 * The mmap() system call already enforces this by subtracting 1587 * the page offset from the file offset, but checking here 1588 * catches errors in device drivers (e.g. d_single_mmap() 1589 * callbacks) and other internal mapping requests (such as in 1590 * exec). 1591 */ 1592 if ((foff & PAGE_MASK) != 0) { 1593 return (EXTERROR(EINVAL, "offset %#jx not page-aligned", foff)); 1594 } 1595 1596 if ((flags & MAP_FIXED) == 0) { 1597 fitit = true; 1598 *addr = round_page(*addr); 1599 } else { 1600 if (*addr != trunc_page(*addr)) { 1601 return (EXTERROR(EINVAL, 1602 "non-fixed mapping address %#jx not aligned", 1603 *addr)); 1604 } 1605 fitit = false; 1606 } 1607 1608 if (flags & MAP_ANON) { 1609 if (object != NULL) { 1610 return (EXTERROR(EINVAL, 1611 "anon mapping backed by an object")); 1612 } 1613 if (foff != 0) { 1614 return (EXTERROR(EINVAL, 1615 "anon mapping with non-zero offset %#jx", foff)); 1616 } 1617 docow = 0; 1618 } else if (flags & MAP_PREFAULT_READ) 1619 docow = MAP_PREFAULT; 1620 else 1621 docow = MAP_PREFAULT_PARTIAL; 1622 1623 if ((flags & (MAP_ANON|MAP_SHARED)) == 0) 1624 docow |= MAP_COPY_ON_WRITE; 1625 if (flags & MAP_NOSYNC) 1626 docow |= MAP_DISABLE_SYNCER; 1627 if (flags & MAP_NOCORE) 1628 docow |= MAP_DISABLE_COREDUMP; 1629 /* Shared memory is also shared with children. */ 1630 if (flags & MAP_SHARED) 1631 docow |= MAP_INHERIT_SHARE; 1632 if (writecounted) 1633 docow |= MAP_WRITECOUNT; 1634 if (flags & MAP_STACK) { 1635 if (object != NULL) { 1636 return (EXTERROR(EINVAL, 1637 "stack mapping backed by an object")); 1638 } 1639 docow |= MAP_STACK_AREA; 1640 } 1641 if ((flags & MAP_EXCL) != 0) 1642 docow |= MAP_CHECK_EXCL; 1643 if ((flags & MAP_GUARD) != 0) 1644 docow |= MAP_CREATE_GUARD; 1645 1646 if (fitit) { 1647 if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER) 1648 findspace = VMFS_SUPER_SPACE; 1649 else if ((flags & MAP_ALIGNMENT_MASK) != 0) 1650 findspace = VMFS_ALIGNED_SPACE(flags >> 1651 MAP_ALIGNMENT_SHIFT); 1652 else 1653 findspace = VMFS_OPTIMAL_SPACE; 1654 max_addr = 0; 1655 if ((flags & MAP_32BIT) != 0) 1656 max_addr = MAP_32BIT_MAX_ADDR; 1657 if (curmap) { 1658 default_addr = 1659 round_page((vm_offset_t)td->td_proc->p_vmspace-> 1660 vm_daddr + lim_max(td, RLIMIT_DATA)); 1661 if ((flags & MAP_32BIT) != 0) 1662 default_addr = 0; 1663 rv = vm_map_find_min(map, object, foff, addr, size, 1664 default_addr, max_addr, findspace, prot, maxprot, 1665 docow); 1666 } else { 1667 rv = vm_map_find(map, object, foff, addr, size, 1668 max_addr, findspace, prot, maxprot, docow); 1669 } 1670 } else { 1671 rv = vm_map_fixed(map, object, foff, *addr, size, 1672 prot, maxprot, docow); 1673 } 1674 1675 if (rv == KERN_SUCCESS) { 1676 /* 1677 * If the process has requested that all future mappings 1678 * be wired, then heed this. 1679 */ 1680 if ((map->flags & MAP_WIREFUTURE) != 0) { 1681 vm_map_lock(map); 1682 if ((map->flags & MAP_WIREFUTURE) != 0) 1683 (void)vm_map_wire_locked(map, *addr, 1684 *addr + size, VM_MAP_WIRE_USER | 1685 ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK : 1686 VM_MAP_WIRE_NOHOLES)); 1687 vm_map_unlock(map); 1688 } 1689 } 1690 return (vm_mmap_to_errno(rv)); 1691 } 1692 1693 /* 1694 * Translate a Mach VM return code to zero on success or the appropriate errno 1695 * on failure. 1696 */ 1697 int 1698 vm_mmap_to_errno(int rv) 1699 { 1700 int error; 1701 1702 switch (rv) { 1703 case KERN_SUCCESS: 1704 return (0); 1705 case KERN_INVALID_ADDRESS: 1706 case KERN_NO_SPACE: 1707 error = ENOMEM; 1708 break; 1709 case KERN_PROTECTION_FAILURE: 1710 error = EACCES; 1711 break; 1712 default: 1713 error = EINVAL; 1714 break; 1715 } 1716 if ((curthread->td_pflags2 & (TDP2_UEXTERR | TDP2_EXTERR)) == 1717 TDP2_UEXTERR) 1718 EXTERROR(error, "mach error %jd", rv); 1719 return (error); 1720 } 1721