/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hwpmc_hooks.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
#include <machine/md_var.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
static int mincore_mapped = 1;
SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
    "mincore reports mappings, not residency");

#ifdef MAP_32BIT
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

int
sys_sbrk(struct thread *td, struct sbrk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

int
sys_sstk(struct thread *td, struct sstk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
int
ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */

/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
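 *
 * For example (illustrative only, assuming a 4 KB page size), a call
 * such as
 *
 *	mmap(NULL, 100, PROT_READ, MAP_PRIVATE, fd, 0x1234)
 *
 * is backed by the file starting at offset trunc_page(0x1234) = 0x1000,
 * and the pointer returned to the caller is the start of the mapping
 * plus the 0x234 byte page offset, matching the offset handling in
 * kern_mmap() below.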
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

int
sys_mmap(struct thread *td, struct mmap_args *uap)
{

	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
	    uap->flags, uap->fd, uap->pos));
}

int
kern_mmap(struct thread *td, uintptr_t addr0, size_t len, int prot, int flags,
    int fd, off_t pos)
{
	struct vmspace *vms;
	struct file *fp;
	vm_offset_t addr;
	vm_size_t pageoff, size;
	vm_prot_t cap_maxprot;
	int align, error;
	cap_rights_t rights;

	vms = td->td_proc->p_vmspace;
	fp = NULL;
	AUDIT_ARG_FD(fd);
	addr = addr0;

	/*
	 * Ignore old flags that used to be defined but did not do anything.
	 */
	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as the file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}
	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
	    MAP_PREFAULT_READ | MAP_GUARD |
#ifdef MAP_32BIT
	    MAP_32BIT |
#endif
	    MAP_ALIGNMENT_MASK)) != 0)
		return (EINVAL);
	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
		return (EINVAL);
	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
		return (EINVAL);
	if (prot != PROT_NONE &&
	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
		return (EINVAL);
	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
	    pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
#ifdef MAP_32BIT
	    MAP_32BIT |
#endif
	    MAP_ALIGNMENT_MASK)) != 0))
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Compute size from len by rounding (on both ends). */
	size = len + pageoff;			/* low end... */
	size = round_page(size);		/* hi end */
	/* Check for rounding up to zero. */
	if (len > size)
		return (ENOMEM);

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
#ifdef MAP_32BIT
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
#endif
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td, RLIMIT_DATA));
	}
	if (len == 0) {
		/*
		 * Return success without mapping anything for old
		 * binaries that request a page-aligned mapping of
		 * length 0.  For modern binaries, this function
		 * returns an error earlier.
		 */
		error = 0;
	} else if ((flags & MAP_GUARD) != 0) {
		error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
		    VM_PROT_NONE, flags, NULL, pos, FALSE, td);
	} else if ((flags & MAP_ANON) != 0) {
		/*
		 * Mapping blank space is trivial.
		 *
		 * This relies on VM_PROT_* matching PROT_*.
		 */
		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
		    VM_PROT_ALL, flags, NULL, pos, FALSE, td);
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set(&rights, CAP_MMAP_X);
		error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
		    td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) {
			error = EINVAL;
			goto done;
		}

		/* This relies on VM_PROT_* matching PROT_*. */
		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
		    cap_maxprot, flags, pos, td);
	}

	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{

	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
	    uap->flags, uap->fd, uap->pos));
}
#endif

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(struct thread *td, struct ommap_args *uap)
{
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};
	int flags, prot;

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	prot = cvtbsdprot[uap->prot & 0x7];
#if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    prot != 0)
		prot |= PROT_EXEC;
#endif
	flags = 0;
	if (uap->flags & OMAP_ANON)
		flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		flags |= MAP_SHARED;
	else
		flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		flags |= MAP_FIXED;
	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot, flags,
	    uap->fd, uap->pos));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
int
sys_msync(struct thread *td, struct msync_args *uap)
{

	return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
}

int
kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (ENOMEM);
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
int
sys_munmap(struct thread *td, struct munmap_args *uap)
{

	return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
	bool pmc_handled;
#endif
	vm_offset_t addr;
	vm_size_t pageoff;
	vm_map_t map;

	if (size == 0)
		return (EINVAL);

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	pmc_handled = false;
	if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
		pmc_handled = true;
		/*
		 * Inform hwpmc if the address range being unmapped contains
		 * an executable region.
		 */
		pkm.pm_address = (uintptr_t) NULL;
		if (vm_map_lookup_entry(map, addr, &entry)) {
			for (; entry->start < addr + size;
			    entry = entry->next) {
				if (vm_map_check_protection(map, entry->start,
				    entry->end, VM_PROT_EXECUTE) == TRUE) {
					pkm.pm_address = (uintptr_t) addr;
					pkm.pm_size = (size_t) size;
					break;
				}
			}
		}
	}
#endif
	vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
	if (__predict_false(pmc_handled)) {
		/* downgrade the lock to prevent a LOR with the pmc-sx lock */
		vm_map_lock_downgrade(map);
		if (pkm.pm_address != (uintptr_t) NULL)
			PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
		vm_map_unlock_read(map);
	} else
#endif
		vm_map_unlock(map);

	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
int
sys_mprotect(struct thread *td, struct mprotect_args *uap)
{

	return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot));
}

int
kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot)
{
	vm_offset_t addr;
	vm_size_t pageoff;

	addr = addr0;
	prot = (prot & VM_PROT_ALL);
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
#ifdef COMPAT_FREEBSD32
	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
		if (((addr + size) & 0xffffffff) < addr)
			return (EINVAL);
	} else
#endif
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
int
sys_minherit(struct thread *td, struct minherit_args *uap)
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

int
sys_madvise(struct thread *td, struct madvise_args *uap)
{

	return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
}

int
kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
{
	vm_map_t map;
	vm_offset_t addr, end, start;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	addr = addr0;
	if (addr < vm_map_min(map) || addr + len > vm_map_max(map))
		return (EINVAL);
	if ((addr + len) < addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page(addr);
	end = round_page(addr + len);

	/*
	 * vm_map_madvise() checks for illegal values of behav.
	 */
	return (vm_map_madvise(map, start, end, behav));
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

int
sys_mincore(struct thread *td, struct mincore_args *uap)
{

	return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
}

int
kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	vm_object_t object;
	vm_paddr_t locked_pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int mincoreinfo;
	unsigned int timestamp;
	boolean_t locked;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page(addr0);
	end = addr + (vm_size_t)round_page(len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry; current->start < end; current = current->next) {

		/*
		 * check for contiguity
		 */
		if (current->end < end && current->next->start > current->end) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			object = NULL;
			locked_pa = 0;
		retry:
			m = NULL;
			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
			if (mincore_mapped) {
				/*
				 * We only care about this pmap's
				 * mapping of the page, if any.
				 */
				if (locked_pa != 0) {
					vm_page_unlock(PHYS_TO_VM_PAGE(
					    locked_pa));
				}
			} else if (locked_pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.
				 */
				m = PHYS_TO_VM_PAGE(locked_pa);
				if (m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = m->object;
					locked = VM_OBJECT_TRYWLOCK(object);
					vm_page_unlock(m);
					if (!locked) {
						VM_OBJECT_WLOCK(object);
						vm_page_lock(m);
						goto retry;
					}
				} else
					vm_page_unlock(m);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if (object->type == OBJT_DEFAULT ||
				    object->type == OBJT_SWAP ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m != NULL && m->valid == 0)
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				/* Examine other mappings to the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;
				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->aflags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->aflags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
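			 * After the read lock is reacquired below, the
			 * saved map timestamp is compared against the
			 * current one; if the map changed while unlocked,
			 * the scan is restarted from RestartScan.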
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = atop(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = atop(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_mlock(struct thread *td, struct mlock_args *uap)
{

	return (kern_mlock(td->td_proc, td->td_ucred,
	    __DECONST(uintptr_t, uap->addr), uap->len));
}

int
kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_user_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(proc);
		error = racct_set(proc, RACCT_MEMLOCK, nsize);
		PROC_UNLOCK(proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

int
sys_mlockall(struct thread *td, struct mlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
			return (ENOMEM);
	}
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(td->td_proc);
		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
		PROC_UNLOCK(td->td_proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		if (error == KERN_SUCCESS)
			error = 0;
		else if (error == KERN_RESOURCE_SHORTAGE)
			error = ENOMEM;
		else
			error = EAGAIN;
	}
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

int
sys_munlockall(struct thread *td, struct munlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_munlock(struct thread *td, struct munlock_args *uap)
{

	return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
{
	vm_offset_t addr, end, last, start;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = addr0;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity checks specific to mmap
 * operations on vnodes.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_ooffset_t foff;
	struct ucred *cred;
	int error, flags;
	bool writex;

	cred = td->td_ucred;
	writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
	    (*flagsp & MAP_SHARED) != 0;
	if ((error = vget(vp, LK_SHARED, td)) != 0)
		return (error);
	AUDIT_ARG_VNODE1(vp);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, LK_SHARED, td);
			if (error != 0)
				return (error);
		}
		if (writex) {
			*writecounted = TRUE;
			vnode_pager_update_writecount(obj, 0, objsize);
		}
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	/* This relies on VM_PROT_* matching PROT_*. */
	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & VM_PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of the actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE) {
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
		if (obj == NULL) {
			error = ENOMEM;
			goto done;
		}
	} else {
		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
		    ("wrong object type"));
		VM_OBJECT_WLOCK(obj);
		vm_object_reference_locked(obj);
#if VM_NRESERVLEVEL > 0
		vm_object_color(obj, 0);
#endif
		VM_OBJECT_WUNLOCK(obj);
	}
	*objp = obj;
	*flagsp = flags;

	vfs_mark_atime(vp, cred);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vnode_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * Helper function for vm_mmap.  Perform sanity checks specific to mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
    vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	int error, flags;

	flags = *flagsp;

	if (dsw->d_flags & D_MMAP_ANON) {
		*objp = NULL;
		*foff = 0;
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	if (flags & (MAP_PRIVATE|MAP_COPY))
		return (EINVAL);
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
	if (error != 0)
		return (error);
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap()
 *
 * Internal version of mmap used by exec, sys5 shared memory, and
 * various device drivers.  Handle is either a vnode pointer, a
 * character device, or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags,
    objtype_t handle_type, void *handle,
    vm_ooffset_t foff)
{
	vm_object_t object;
	struct thread *td = curthread;
	int error;
	boolean_t writecounted;

	if (size == 0)
		return (EINVAL);

	size = round_page(size);
	object = NULL;
	writecounted = FALSE;

	/*
	 * Lookup/allocate object.
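	 * The handle is interpreted according to handle_type: a struct cdev
	 * for OBJT_DEVICE, a struct vnode for OBJT_VNODE, or NULL together
	 * with OBJT_DEFAULT for anonymous memory; anything else is rejected
	 * with EINVAL.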
	 */
	switch (handle_type) {
	case OBJT_DEVICE: {
		struct cdevsw *dsw;
		struct cdev *cdev;
		int ref;

		cdev = handle;
		dsw = dev_refthread(cdev, &ref);
		if (dsw == NULL)
			return (ENXIO);
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
		    dsw, &foff, &object);
		dev_relthread(cdev, ref);
		break;
	}
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);

	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
	    foff, writecounted, td);
	if (error != 0 && object != NULL) {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vnode_pager_release_writecount(object, 0, size);
		vm_object_deallocate(object);
	}
	return (error);
}

/*
 * Internal version of mmap that maps a specific VM object into a
 * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
 */
int
vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
    boolean_t writecounted, struct thread *td)
{
	boolean_t curmap, fitit;
	vm_offset_t max_addr;
	int docow, error, findspace, rv;

	curmap = map == &td->td_proc->p_vmspace->vm_map;
	if (curmap) {
		RACCT_PROC_LOCK(td->td_proc);
		if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
			RACCT_PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
			RACCT_PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(pmap_wired_count(map->pmap)) + size >
			    lim_cur(td, RLIMIT_MEMLOCK)) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				RACCT_PROC_UNLOCK(td->td_proc);
				return (ENOMEM);
			}
			error = racct_set(td->td_proc, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + size);
			if (error != 0) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				RACCT_PROC_UNLOCK(td->td_proc);
				return (error);
			}
		}
		RACCT_PROC_UNLOCK(td->td_proc);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The mmap() system call already enforces this by subtracting
	 * the page offset from the file offset, but checking here
	 * catches errors in device drivers (e.g. d_mmap_single()
	 * callbacks) and other internal mapping requests (such as in
	 * exec).
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}

	if (flags & MAP_ANON) {
		if (object != NULL || foff != 0)
			return (EINVAL);
		docow = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_VN_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_GROWS_DOWN;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;
	if ((flags & MAP_GUARD) != 0)
		docow |= MAP_CREATE_GUARD;

	if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		max_addr = 0;
#ifdef MAP_32BIT
		if ((flags & MAP_32BIT) != 0)
			max_addr = MAP_32BIT_MAX_ADDR;
#endif
		if (curmap) {
			rv = vm_map_find_min(map, object, foff, addr, size,
			    round_page((vm_offset_t)td->td_proc->p_vmspace->
			    vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr,
			    findspace, prot, maxprot, docow);
		} else {
			rv = vm_map_find(map, object, foff, addr, size,
			    max_addr, findspace, prot, maxprot, docow);
		}
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if ((map->flags & MAP_WIREFUTURE) != 0) {
			vm_map_lock(map);
			if ((map->flags & MAP_WIREFUTURE) != 0)
				(void)vm_map_wire_locked(map, *addr,
				    *addr + size, VM_MAP_WIRE_USER |
				    ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
				    VM_MAP_WIRE_NOHOLES));
			vm_map_unlock(map);
		}
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}