/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hwpmc_hooks.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/elf.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
#include <machine/md_var.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
static int mincore_mapped = 1;
SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
    "mincore reports mappings, not residency");
static int imply_prot_max = 0;
SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
    "Imply maximum page protections in mmap() when none are specified");

#ifdef MAP_32BIT
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif

_Static_assert(MAXPAGESIZES <= 4, "MINCORE_SUPER too narrow");

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

int
sys_sbrk(struct thread *td, struct sbrk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

int
sys_sstk(struct thread *td, struct sstk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
int
ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */

/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

int
sys_mmap(struct thread *td, struct mmap_args *uap)
{

	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = (uintptr_t)uap->addr,
		.mr_len = uap->len,
		.mr_prot = uap->prot,
		.mr_flags = uap->flags,
		.mr_fd = uap->fd,
		.mr_pos = uap->pos,
	    }));
}

/*
 * Determine the maximum protection to apply when the caller did not
 * specify PROT_MAX(): all permissions, unless implied maximum
 * protections are enabled for the process, in which case the
 * requested protection itself is used.
 */
int
kern_mmap_maxprot(struct proc *p, int prot)
{

	if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 ||
	    (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0)
		return (_PROT_ALL);
	if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) &&
	    prot != PROT_NONE)
		return (prot);
	return (_PROT_ALL);
}

int
kern_mmap(struct thread *td, const struct mmap_req *mrp)
{
	struct vmspace *vms;
	struct file *fp;
	struct proc *p;
	off_t pos;
	vm_offset_t addr, orig_addr;
	vm_size_t len, pageoff, size;
	vm_prot_t cap_maxprot;
	int align, error, fd, flags, max_prot, prot;
	cap_rights_t rights;
	mmap_check_fp_fn check_fp_fn;

	orig_addr = addr = mrp->mr_hint;
	len = mrp->mr_len;
	prot = mrp->mr_prot;
	flags = mrp->mr_flags;
	fd = mrp->mr_fd;
	pos = mrp->mr_pos;
	check_fp_fn = mrp->mr_check_fp_fn;

	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
		return (EINVAL);
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	if (max_prot != 0 && (max_prot & prot) != prot)
		return (ENOTSUP);

	p = td->td_proc;

	/*
	 * Always honor PROT_MAX if set.  If not, default to all
	 * permissions unless we're implying maximum permissions.
	 */
	if (max_prot == 0)
		max_prot = kern_mmap_maxprot(p, prot);

	vms = p->p_vmspace;
	fp = NULL;
	AUDIT_ARG_FD(fd);

	/*
	 * Ignore old flags that used to be defined but did not do anything.
	 */
	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}
	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
	    MAP_PREFAULT_READ | MAP_GUARD |
#ifdef MAP_32BIT
	    MAP_32BIT |
#endif
	    MAP_ALIGNMENT_MASK)) != 0)
		return (EINVAL);
	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
		return (EINVAL);
	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
		return (EINVAL);
	if (prot != PROT_NONE &&
	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
		return (EINVAL);
	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
	    pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
#ifdef MAP_32BIT
	    MAP_32BIT |
#endif
	    MAP_ALIGNMENT_MASK)) != 0))
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Compute size from len by rounding (on both ends). */
	size = len + pageoff;			/* low end... */
	size = round_page(size);		/* hi end */
	/* Check for rounding up to zero. */
	if (len > size)
		return (ENOMEM);

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (!vm_map_range_valid(&vms->vm_map, addr, addr + size))
			return (EINVAL);
#ifdef MAP_32BIT
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
#endif
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * For anonymous mappings within the address space of the
		 * calling process, the absence of a hint is handled at a
		 * lower level in order to implement different clustering
		 * strategies for ASLR.
		 */
		if (((flags & MAP_ANON) == 0 && addr == 0) ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td, RLIMIT_DATA));
	}
	if (len == 0) {
		/*
		 * Return success without mapping anything for old
		 * binaries that request a page-aligned mapping of
		 * length 0.  For modern binaries, this function
		 * returns an error earlier.
		 */
		error = 0;
	} else if ((flags & MAP_GUARD) != 0) {
		error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
		    VM_PROT_NONE, flags, NULL, pos, FALSE, td);
	} else if ((flags & MAP_ANON) != 0) {
		/*
		 * Mapping blank space is trivial.
		 *
		 * This relies on VM_PROT_* matching PROT_*.
		 */
		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
		    max_prot, flags, NULL, pos, FALSE, td);
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init_one(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set_one(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set_one(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set_one(&rights, CAP_MMAP_X);
		error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
		    p->p_osrel >= P_OSREL_MAP_FSTRICT) {
			error = EINVAL;
			goto done;
		}
		if (check_fp_fn != NULL) {
			error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
			    flags);
			if (error != 0)
				goto done;
		}
		if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data))
			addr = orig_addr;
		/* This relies on VM_PROT_* matching PROT_*. */
		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
		    max_prot & cap_maxprot, flags, pos, td);
	}

	if (error == 0)
		td->td_retval[0] = addr + pageoff;
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = (uintptr_t)uap->addr,
		.mr_len = uap->len,
		.mr_prot = uap->prot,
		.mr_flags = uap->flags,
		.mr_fd = uap->fd,
		.mr_pos = uap->pos,
	    }));
}
#endif

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(struct thread *td, struct ommap_args *uap)
{
	return (kern_ommap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
	    uap->flags, uap->fd, uap->pos));
}

int
kern_ommap(struct thread *td, uintptr_t hint, int len, int oprot,
    int oflags, int fd, long pos)
{
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};
	int flags, prot;

	if (len < 0)
		return (EINVAL);

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	prot = cvtbsdprot[oprot & 0x7];
#if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    prot != 0)
		prot |= PROT_EXEC;
#endif
	flags = 0;
	if (oflags & OMAP_ANON)
		flags |= MAP_ANON;
	if (oflags & OMAP_COPY)
		flags |= MAP_COPY;
	if (oflags & OMAP_SHARED)
		flags |= MAP_SHARED;
	else
		flags |= MAP_PRIVATE;
	if (oflags & OMAP_FIXED)
		flags |= MAP_FIXED;
	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = hint,
		.mr_len = len,
		.mr_prot = prot,
		.mr_flags = flags,
		.mr_fd = fd,
		.mr_pos = pos,
	    }));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
int
sys_msync(struct thread *td, struct msync_args *uap)
{

	return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
}

int
kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (ENOMEM);
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
int
sys_munmap(struct thread *td, struct munmap_args *uap)
{

	return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
	bool pmc_handled;
#endif
	vm_offset_t addr, end;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	if (size == 0)
		return (EINVAL);

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	end = addr + size;
	map = &td->td_proc->p_vmspace->vm_map;
	if (!vm_map_range_valid(map, addr, end))
		return (EINVAL);

	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	pmc_handled = false;
	if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
		pmc_handled = true;
		/*
		 * Inform hwpmc if the address range being unmapped contains
		 * an executable region.
		 */
		pkm.pm_address = (uintptr_t) NULL;
		if (vm_map_lookup_entry(map, addr, &entry)) {
			for (; entry->start < end;
			    entry = vm_map_entry_succ(entry)) {
				if (vm_map_check_protection(map, entry->start,
				    entry->end, VM_PROT_EXECUTE) == TRUE) {
					pkm.pm_address = (uintptr_t) addr;
					pkm.pm_size = (size_t) size;
					break;
				}
			}
		}
	}
#endif
	rv = vm_map_delete(map, addr, end);

#ifdef HWPMC_HOOKS
	if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) {
		/* downgrade the lock to prevent a LOR with the pmc-sx lock */
		vm_map_lock_downgrade(map);
		if (pkm.pm_address != (uintptr_t) NULL)
			PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
		vm_map_unlock_read(map);
	} else
#endif
		vm_map_unlock(map);

	return (vm_mmap_to_errno(rv));
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
int
sys_mprotect(struct thread *td, struct mprotect_args *uap)
{

	return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot));
}

int
kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	int vm_error, max_prot;
	int flags;

	addr = addr0;
	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
		return (EINVAL);
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
#ifdef COMPAT_FREEBSD32
	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
		if (((addr + size) & 0xffffffff) < addr)
			return (EINVAL);
	} else
#endif
	if (addr + size < addr)
		return (EINVAL);

	flags = VM_MAP_PROTECT_SET_PROT;
	if (max_prot != 0)
		flags |= VM_MAP_PROTECT_SET_MAXPROT;
	vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
	    addr, addr + size, prot, max_prot, flags);

	switch (vm_error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	case KERN_OUT_OF_BOUNDS:
		return (ENOTSUP);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
int
sys_minherit(struct thread *td, struct minherit_args *uap)
{

	return (kern_minherit(td, (uintptr_t)uap->addr, uap->len,
	    uap->inherit));
}

int
kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0)
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)addr0;
	size = len;
	inherit = inherit0;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

int
sys_madvise(struct thread *td, struct madvise_args *uap)
{

	return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
}

int
kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
{
	vm_map_t map;
	vm_offset_t addr, end, start;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	addr = addr0;
	if (!vm_map_range_valid(map, addr, addr + len))
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page(addr);
	end = round_page(addr + len);

	/*
	 * vm_map_madvise() checks for illegal values of behav.
	 */
	return (vm_map_madvise(map, start, end, behav));
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

int
sys_mincore(struct thread *td, struct mincore_args *uap)
{

	return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
}

int
kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
{
	pmap_t pmap;
	vm_map_t map;
	vm_map_entry_t current, entry;
	vm_object_t object;
	vm_offset_t addr, cend, end, first_addr;
	vm_paddr_t pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int error, lastvecindex, mincoreinfo, vecindex;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page(addr0);
	end = round_page(addr0 + len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current processes address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	while (entry->start < end) {
		/*
		 * check for contiguity
		 */
		current = entry;
		entry = vm_map_entry_succ(current);
		if (current->end < end &&
		    entry->start > current->end) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		for (; addr < cend; addr += PAGE_SIZE) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			m = NULL;
			object = NULL;
retry:
			pa = 0;
			mincoreinfo = pmap_mincore(pmap, addr, &pa);
			if (mincore_mapped) {
				/*
				 * We only care about this pmap's
				 * mapping of the page, if any.
				 */
				;
			} else if (pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.  The page's
				 * identity may change at any point before its
				 * object lock is acquired, so re-validate if
				 * necessary.
				 */
				m = PHYS_TO_VM_PAGE(pa);
				while (object == NULL || m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = atomic_load_ptr(&m->object);
					if (object == NULL)
						goto retry;
					VM_OBJECT_WLOCK(object);
				}
				if (pa != pmap_extract(pmap, addr))
					goto retry;
				KASSERT(vm_page_all_valid(m),
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if ((object->flags & OBJ_SWAP) != 0 ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m != NULL && vm_page_none_valid(m))
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				VM_OBJECT_ASSERT_WLOCKED(m->object);

				/* Examine other mappings of the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;

				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->a.flags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->a.flags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = atop(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = atop(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_mlock(struct thread *td, struct mlock_args *uap)
{

	return (kern_mlock(td->td_proc, td->td_ucred,
	    __DECONST(uintptr_t, uap->addr), uap->len));
}

int
kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_user_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(proc);
		error = racct_set(proc, RACCT_MEMLOCK, nsize);
		PROC_UNLOCK(proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	switch (error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ARGUMENT:
		return (EINVAL);
	default:
		return (ENOMEM);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

int
sys_mlockall(struct thread *td, struct mlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
			return (ENOMEM);
	}
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(td->td_proc);
		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
		PROC_UNLOCK(td->td_proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		if (error == KERN_SUCCESS)
			error = 0;
		else if (error == KERN_RESOURCE_SHORTAGE)
			error = ENOMEM;
		else
			error = EAGAIN;
	}
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

int
sys_munlockall(struct thread *td, struct munlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_munlock(struct thread *td, struct munlock_args *uap)
{

	return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
{
	vm_offset_t addr, end, last, start;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = addr0;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on vnodes.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_ooffset_t foff;
	struct ucred *cred;
	int error, flags;
	bool writex;

	cred = td->td_ucred;
	writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
	    (*flagsp & MAP_SHARED) != 0;
	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);
	AUDIT_ARG_VNODE1(vp);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, LK_SHARED);
			if (error != 0)
				return (error);
		}
		if (writex) {
			*writecounted = TRUE;
			vm_pager_update_writecount(obj, 0, objsize);
		}
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	/* This relies on VM_PROT_* matching PROT_*. */
	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & VM_PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE) {
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
		if (obj == NULL) {
			error = ENOMEM;
			goto done;
		}
	} else {
		KASSERT((obj->flags & OBJ_SWAP) != 0, ("wrong object type"));
		vm_object_reference(obj);
#if VM_NRESERVLEVEL > 0
		if ((obj->flags & OBJ_COLORED) == 0) {
			VM_OBJECT_WLOCK(obj);
			vm_object_color(obj, 0);
			VM_OBJECT_WUNLOCK(obj);
		}
#endif
	}
	*objp = obj;
	*flagsp = flags;

	VOP_MMAPPED(vp);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vm_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
    vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	int error, flags;

	flags = *flagsp;

	if (dsw->d_flags & D_MMAP_ANON) {
		*objp = NULL;
		*foff = 0;
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	if (flags & (MAP_PRIVATE|MAP_COPY))
		return (EINVAL);
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
	if (error != 0)
		return (error);
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags,
    objtype_t handle_type, void *handle,
    vm_ooffset_t foff)
{
	vm_object_t object;
	struct thread *td = curthread;
	int error;
	boolean_t writecounted;

	if (size == 0)
		return (EINVAL);

	size = round_page(size);
	object = NULL;
	writecounted = FALSE;

	switch (handle_type) {
	case OBJT_DEVICE: {
		struct cdevsw *dsw;
		struct cdev *cdev;
		int ref;

		cdev = handle;
		dsw = dev_refthread(cdev, &ref);
		if (dsw == NULL)
			return (ENXIO);
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
		    dsw, &foff, &object);
		dev_relthread(cdev, ref);
		break;
	}
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);

	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
	    foff, writecounted, td);
	if (error != 0 && object != NULL) {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vm_pager_release_writecount(object, 0, size);
		vm_object_deallocate(object);
	}
	return (error);
}

/*
 * Check that growing the map by the given size does not exceed the
 * RLIMIT_VMEM and RACCT virtual memory limits, and, if future
 * mappings are to be wired, the memory locking limits as well.
 */
int
kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size)
{
	int error;

	RACCT_PROC_LOCK(td->td_proc);
	if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
		RACCT_PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
		RACCT_PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	if (!old_mlock && map->flags & MAP_WIREFUTURE) {
		if (ptoa(pmap_wired_count(map->pmap)) + size >
		    lim_cur(td, RLIMIT_MEMLOCK)) {
			racct_set_force(td->td_proc, RACCT_VMEM, map->size);
			RACCT_PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		error = racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)) + size);
		if (error != 0) {
			racct_set_force(td->td_proc, RACCT_VMEM, map->size);
			RACCT_PROC_UNLOCK(td->td_proc);
			return (error);
		}
	}
	RACCT_PROC_UNLOCK(td->td_proc);
	return (0);
}

/*
 * Internal version of mmap that maps a specific VM object into a
 * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
 */
int
vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
    boolean_t writecounted, struct thread *td)
{
	vm_offset_t max_addr;
	int docow, error, findspace, rv;
	bool curmap, fitit;

	curmap = map == &td->td_proc->p_vmspace->vm_map;
	if (curmap) {
		error = kern_mmap_racct_check(td, map, size);
		if (error != 0)
			return (error);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The mmap() system call already enforces this by subtracting
	 * the page offset from the file offset, but checking here
	 * catches errors in device drivers (e.g. d_mmap_single()
	 * callbacks) and other internal mapping requests (such as in
	 * exec).
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = true;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = false;
	}

	if (flags & MAP_ANON) {
		if (object != NULL || foff != 0)
			return (EINVAL);
		docow = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_GROWS_DOWN;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;
	if ((flags & MAP_GUARD) != 0)
		docow |= MAP_CREATE_GUARD;

	if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		max_addr = 0;
#ifdef MAP_32BIT
		if ((flags & MAP_32BIT) != 0)
			max_addr = MAP_32BIT_MAX_ADDR;
#endif
		if (curmap) {
			rv = vm_map_find_min(map, object, foff, addr, size,
			    round_page((vm_offset_t)td->td_proc->p_vmspace->
			    vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr,
			    findspace, prot, maxprot, docow);
		} else {
			rv = vm_map_find(map, object, foff, addr, size,
			    max_addr, findspace, prot, maxprot, docow);
		}
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if ((map->flags & MAP_WIREFUTURE) != 0) {
			vm_map_lock(map);
			if ((map->flags & MAP_WIREFUTURE) != 0)
				(void)vm_map_wire_locked(map, *addr,
				    *addr + size, VM_MAP_WIRE_USER |
				    ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
				    VM_MAP_WIRE_NOHOLES));
			vm_map_unlock(map);
		}
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}