1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1988 University of Utah. 5 * Copyright (c) 1991, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * the Systems Programming Group of the University of Utah Computer 10 * Science Department. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ 37 * 38 * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 39 */ 40 41 /* 42 * Mapped file (mmap) interface to VM 43 */ 44 45 #include <sys/cdefs.h> 46 __FBSDID("$FreeBSD$"); 47 48 #include "opt_compat.h" 49 #include "opt_hwpmc_hooks.h" 50 #include "opt_vm.h" 51 52 #include <sys/param.h> 53 #include <sys/systm.h> 54 #include <sys/capsicum.h> 55 #include <sys/kernel.h> 56 #include <sys/lock.h> 57 #include <sys/mutex.h> 58 #include <sys/sysproto.h> 59 #include <sys/filedesc.h> 60 #include <sys/priv.h> 61 #include <sys/proc.h> 62 #include <sys/procctl.h> 63 #include <sys/racct.h> 64 #include <sys/resource.h> 65 #include <sys/resourcevar.h> 66 #include <sys/rwlock.h> 67 #include <sys/sysctl.h> 68 #include <sys/vnode.h> 69 #include <sys/fcntl.h> 70 #include <sys/file.h> 71 #include <sys/mman.h> 72 #include <sys/mount.h> 73 #include <sys/conf.h> 74 #include <sys/stat.h> 75 #include <sys/syscallsubr.h> 76 #include <sys/sysent.h> 77 #include <sys/vmmeter.h> 78 79 #include <security/audit/audit.h> 80 #include <security/mac/mac_framework.h> 81 82 #include <vm/vm.h> 83 #include <vm/vm_param.h> 84 #include <vm/pmap.h> 85 #include <vm/vm_map.h> 86 #include <vm/vm_object.h> 87 #include <vm/vm_page.h> 88 #include <vm/vm_pager.h> 89 #include <vm/vm_pageout.h> 90 #include <vm/vm_extern.h> 91 #include <vm/vm_page.h> 92 #include <vm/vnode_pager.h> 93 94 #ifdef HWPMC_HOOKS 95 #include <sys/pmckern.h> 96 #endif 97 98 int old_mlock = 0; 99 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0, 100 "Do not apply RLIMIT_MEMLOCK on mlockall"); 101 102 #ifdef MAP_32BIT 103 #define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) 104 #endif 105 106 #ifndef _SYS_SYSPROTO_H_ 107 struct sbrk_args { 108 int incr; 109 }; 110 #endif 111 112 int 113 sys_sbrk(struct thread *td, struct sbrk_args *uap) 114 { 115 /* Not yet implemented */ 116 return (EOPNOTSUPP); 117 } 118 119 #ifndef _SYS_SYSPROTO_H_ 120 struct sstk_args { 121 int incr; 122 }; 123 #endif 124 125 int 126 sys_sstk(struct thread *td, struct sstk_args *uap) 127 { 128 /* Not yet implemented */ 129 return (EOPNOTSUPP); 130 } 131 132 #if defined(COMPAT_43) 133 #ifndef _SYS_SYSPROTO_H_ 134 struct getpagesize_args { 135 int dummy; 136 }; 137 #endif 138 139 int 140 ogetpagesize(struct thread *td, struct getpagesize_args *uap) 141 { 142 143 td->td_retval[0] = PAGE_SIZE; 144 return (0); 145 } 146 #endif /* COMPAT_43 */ 147 148 149 /* 150 * Memory Map (mmap) system call. Note that the file offset 151 * and address are allowed to be NOT page aligned, though if 152 * the MAP_FIXED flag it set, both must have the same remainder 153 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not 154 * page-aligned, the actual mapping starts at trunc_page(addr) 155 * and the return value is adjusted up by the page offset. 156 * 157 * Generally speaking, only character devices which are themselves 158 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise 159 * there would be no cache coherency between a descriptor and a VM mapping 160 * both to the same character device. 161 */ 162 #ifndef _SYS_SYSPROTO_H_ 163 struct mmap_args { 164 void *addr; 165 size_t len; 166 int prot; 167 int flags; 168 int fd; 169 long pad; 170 off_t pos; 171 }; 172 #endif 173 174 int 175 sys_mmap(struct thread *td, struct mmap_args *uap) 176 { 177 178 return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot, 179 uap->flags, uap->fd, uap->pos)); 180 } 181 182 int 183 kern_mmap(struct thread *td, uintptr_t addr0, size_t size, int prot, int flags, 184 int fd, off_t pos) 185 { 186 struct vmspace *vms; 187 struct file *fp; 188 vm_offset_t addr; 189 vm_size_t pageoff; 190 vm_prot_t cap_maxprot; 191 int align, error; 192 cap_rights_t rights; 193 194 vms = td->td_proc->p_vmspace; 195 fp = NULL; 196 AUDIT_ARG_FD(fd); 197 addr = addr0; 198 199 /* 200 * Ignore old flags that used to be defined but did not do anything. 201 */ 202 flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040); 203 204 /* 205 * Enforce the constraints. 206 * Mapping of length 0 is only allowed for old binaries. 207 * Anonymous mapping shall specify -1 as filedescriptor and 208 * zero position for new code. Be nice to ancient a.out 209 * binaries and correct pos for anonymous mapping, since old 210 * ld.so sometimes issues anonymous map requests with non-zero 211 * pos. 212 */ 213 if (!SV_CURPROC_FLAG(SV_AOUT)) { 214 if ((size == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) || 215 ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0))) 216 return (EINVAL); 217 } else { 218 if ((flags & MAP_ANON) != 0) 219 pos = 0; 220 } 221 222 if (flags & MAP_STACK) { 223 if ((fd != -1) || 224 ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) 225 return (EINVAL); 226 flags |= MAP_ANON; 227 pos = 0; 228 } 229 if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE | 230 MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE | 231 MAP_PREFAULT_READ | MAP_GUARD | 232 #ifdef MAP_32BIT 233 MAP_32BIT | 234 #endif 235 MAP_ALIGNMENT_MASK)) != 0) 236 return (EINVAL); 237 if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) 238 return (EINVAL); 239 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE)) 240 return (EINVAL); 241 if (prot != PROT_NONE && 242 (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0) 243 return (EINVAL); 244 if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 || 245 pos != 0 || (flags & (MAP_SHARED | MAP_PRIVATE | MAP_PREFAULT | 246 MAP_PREFAULT_READ | MAP_ANON | MAP_STACK)) != 0)) 247 return (EINVAL); 248 249 /* 250 * Align the file position to a page boundary, 251 * and save its page offset component. 252 */ 253 pageoff = (pos & PAGE_MASK); 254 pos -= pageoff; 255 256 /* Adjust size for rounding (on both ends). */ 257 size += pageoff; /* low end... */ 258 size = (vm_size_t) round_page(size); /* hi end */ 259 260 /* Ensure alignment is at least a page and fits in a pointer. */ 261 align = flags & MAP_ALIGNMENT_MASK; 262 if (align != 0 && align != MAP_ALIGNED_SUPER && 263 (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY || 264 align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT)) 265 return (EINVAL); 266 267 /* 268 * Check for illegal addresses. Watch out for address wrap... Note 269 * that VM_*_ADDRESS are not constants due to casts (argh). 270 */ 271 if (flags & MAP_FIXED) { 272 /* 273 * The specified address must have the same remainder 274 * as the file offset taken modulo PAGE_SIZE, so it 275 * should be aligned after adjustment by pageoff. 276 */ 277 addr -= pageoff; 278 if (addr & PAGE_MASK) 279 return (EINVAL); 280 281 /* Address range must be all in user VM space. */ 282 if (addr < vm_map_min(&vms->vm_map) || 283 addr + size > vm_map_max(&vms->vm_map)) 284 return (EINVAL); 285 if (addr + size < addr) 286 return (EINVAL); 287 #ifdef MAP_32BIT 288 if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR) 289 return (EINVAL); 290 } else if (flags & MAP_32BIT) { 291 /* 292 * For MAP_32BIT, override the hint if it is too high and 293 * do not bother moving the mapping past the heap (since 294 * the heap is usually above 2GB). 295 */ 296 if (addr + size > MAP_32BIT_MAX_ADDR) 297 addr = 0; 298 #endif 299 } else { 300 /* 301 * XXX for non-fixed mappings where no hint is provided or 302 * the hint would fall in the potential heap space, 303 * place it after the end of the largest possible heap. 304 * 305 * There should really be a pmap call to determine a reasonable 306 * location. 307 */ 308 if (addr == 0 || 309 (addr >= round_page((vm_offset_t)vms->vm_taddr) && 310 addr < round_page((vm_offset_t)vms->vm_daddr + 311 lim_max(td, RLIMIT_DATA)))) 312 addr = round_page((vm_offset_t)vms->vm_daddr + 313 lim_max(td, RLIMIT_DATA)); 314 } 315 if (size == 0) { 316 /* 317 * Return success without mapping anything for old 318 * binaries that request a page-aligned mapping of 319 * length 0. For modern binaries, this function 320 * returns an error earlier. 321 */ 322 error = 0; 323 } else if ((flags & MAP_GUARD) != 0) { 324 error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE, 325 VM_PROT_NONE, flags, NULL, pos, FALSE, td); 326 } else if ((flags & MAP_ANON) != 0) { 327 /* 328 * Mapping blank space is trivial. 329 * 330 * This relies on VM_PROT_* matching PROT_*. 331 */ 332 error = vm_mmap_object(&vms->vm_map, &addr, size, prot, 333 VM_PROT_ALL, flags, NULL, pos, FALSE, td); 334 } else { 335 /* 336 * Mapping file, get fp for validation and don't let the 337 * descriptor disappear on us if we block. Check capability 338 * rights, but also return the maximum rights to be combined 339 * with maxprot later. 340 */ 341 cap_rights_init(&rights, CAP_MMAP); 342 if (prot & PROT_READ) 343 cap_rights_set(&rights, CAP_MMAP_R); 344 if ((flags & MAP_SHARED) != 0) { 345 if (prot & PROT_WRITE) 346 cap_rights_set(&rights, CAP_MMAP_W); 347 } 348 if (prot & PROT_EXEC) 349 cap_rights_set(&rights, CAP_MMAP_X); 350 error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp); 351 if (error != 0) 352 goto done; 353 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 && 354 td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) { 355 error = EINVAL; 356 goto done; 357 } 358 359 /* This relies on VM_PROT_* matching PROT_*. */ 360 error = fo_mmap(fp, &vms->vm_map, &addr, size, prot, 361 cap_maxprot, flags, pos, td); 362 } 363 364 if (error == 0) 365 td->td_retval[0] = (register_t) (addr + pageoff); 366 done: 367 if (fp) 368 fdrop(fp, td); 369 370 return (error); 371 } 372 373 #if defined(COMPAT_FREEBSD6) 374 int 375 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) 376 { 377 378 return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot, 379 uap->flags, uap->fd, uap->pos)); 380 } 381 #endif 382 383 #ifdef COMPAT_43 384 #ifndef _SYS_SYSPROTO_H_ 385 struct ommap_args { 386 caddr_t addr; 387 int len; 388 int prot; 389 int flags; 390 int fd; 391 long pos; 392 }; 393 #endif 394 int 395 ommap(struct thread *td, struct ommap_args *uap) 396 { 397 static const char cvtbsdprot[8] = { 398 0, 399 PROT_EXEC, 400 PROT_WRITE, 401 PROT_EXEC | PROT_WRITE, 402 PROT_READ, 403 PROT_EXEC | PROT_READ, 404 PROT_WRITE | PROT_READ, 405 PROT_EXEC | PROT_WRITE | PROT_READ, 406 }; 407 int flags, prot; 408 409 #define OMAP_ANON 0x0002 410 #define OMAP_COPY 0x0020 411 #define OMAP_SHARED 0x0010 412 #define OMAP_FIXED 0x0100 413 414 prot = cvtbsdprot[uap->prot & 0x7]; 415 #ifdef COMPAT_FREEBSD32 416 #if defined(__amd64__) 417 if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) && 418 prot != 0) 419 prot |= PROT_EXEC; 420 #endif 421 #endif 422 flags = 0; 423 if (uap->flags & OMAP_ANON) 424 flags |= MAP_ANON; 425 if (uap->flags & OMAP_COPY) 426 flags |= MAP_COPY; 427 if (uap->flags & OMAP_SHARED) 428 flags |= MAP_SHARED; 429 else 430 flags |= MAP_PRIVATE; 431 if (uap->flags & OMAP_FIXED) 432 flags |= MAP_FIXED; 433 return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot, flags, 434 uap->fd, uap->pos)); 435 } 436 #endif /* COMPAT_43 */ 437 438 439 #ifndef _SYS_SYSPROTO_H_ 440 struct msync_args { 441 void *addr; 442 size_t len; 443 int flags; 444 }; 445 #endif 446 int 447 sys_msync(struct thread *td, struct msync_args *uap) 448 { 449 450 return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags)); 451 } 452 453 int 454 kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags) 455 { 456 vm_offset_t addr; 457 vm_size_t pageoff; 458 vm_map_t map; 459 int rv; 460 461 addr = addr0; 462 pageoff = (addr & PAGE_MASK); 463 addr -= pageoff; 464 size += pageoff; 465 size = (vm_size_t) round_page(size); 466 if (addr + size < addr) 467 return (EINVAL); 468 469 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) 470 return (EINVAL); 471 472 map = &td->td_proc->p_vmspace->vm_map; 473 474 /* 475 * Clean the pages and interpret the return value. 476 */ 477 rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, 478 (flags & MS_INVALIDATE) != 0); 479 switch (rv) { 480 case KERN_SUCCESS: 481 return (0); 482 case KERN_INVALID_ADDRESS: 483 return (ENOMEM); 484 case KERN_INVALID_ARGUMENT: 485 return (EBUSY); 486 case KERN_FAILURE: 487 return (EIO); 488 default: 489 return (EINVAL); 490 } 491 } 492 493 #ifndef _SYS_SYSPROTO_H_ 494 struct munmap_args { 495 void *addr; 496 size_t len; 497 }; 498 #endif 499 int 500 sys_munmap(struct thread *td, struct munmap_args *uap) 501 { 502 503 return (kern_munmap(td, (uintptr_t)uap->addr, uap->len)); 504 } 505 506 int 507 kern_munmap(struct thread *td, uintptr_t addr0, size_t size) 508 { 509 #ifdef HWPMC_HOOKS 510 struct pmckern_map_out pkm; 511 vm_map_entry_t entry; 512 bool pmc_handled; 513 #endif 514 vm_offset_t addr; 515 vm_size_t pageoff; 516 vm_map_t map; 517 518 if (size == 0) 519 return (EINVAL); 520 521 addr = addr0; 522 pageoff = (addr & PAGE_MASK); 523 addr -= pageoff; 524 size += pageoff; 525 size = (vm_size_t) round_page(size); 526 if (addr + size < addr) 527 return (EINVAL); 528 529 /* 530 * Check for illegal addresses. Watch out for address wrap... 531 */ 532 map = &td->td_proc->p_vmspace->vm_map; 533 if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) 534 return (EINVAL); 535 vm_map_lock(map); 536 #ifdef HWPMC_HOOKS 537 pmc_handled = false; 538 if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) { 539 pmc_handled = true; 540 /* 541 * Inform hwpmc if the address range being unmapped contains 542 * an executable region. 543 */ 544 pkm.pm_address = (uintptr_t) NULL; 545 if (vm_map_lookup_entry(map, addr, &entry)) { 546 for (; entry->start < addr + size; 547 entry = entry->next) { 548 if (vm_map_check_protection(map, entry->start, 549 entry->end, VM_PROT_EXECUTE) == TRUE) { 550 pkm.pm_address = (uintptr_t) addr; 551 pkm.pm_size = (size_t) size; 552 break; 553 } 554 } 555 } 556 } 557 #endif 558 vm_map_delete(map, addr, addr + size); 559 560 #ifdef HWPMC_HOOKS 561 if (__predict_false(pmc_handled)) { 562 /* downgrade the lock to prevent a LOR with the pmc-sx lock */ 563 vm_map_lock_downgrade(map); 564 if (pkm.pm_address != (uintptr_t) NULL) 565 PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm); 566 vm_map_unlock_read(map); 567 } else 568 #endif 569 vm_map_unlock(map); 570 571 /* vm_map_delete returns nothing but KERN_SUCCESS anyway */ 572 return (0); 573 } 574 575 #ifndef _SYS_SYSPROTO_H_ 576 struct mprotect_args { 577 const void *addr; 578 size_t len; 579 int prot; 580 }; 581 #endif 582 int 583 sys_mprotect(struct thread *td, struct mprotect_args *uap) 584 { 585 586 return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot)); 587 } 588 589 int 590 kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot) 591 { 592 vm_offset_t addr; 593 vm_size_t pageoff; 594 595 addr = addr0; 596 prot = (prot & VM_PROT_ALL); 597 pageoff = (addr & PAGE_MASK); 598 addr -= pageoff; 599 size += pageoff; 600 size = (vm_size_t) round_page(size); 601 if (addr + size < addr) 602 return (EINVAL); 603 604 switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, 605 addr + size, prot, FALSE)) { 606 case KERN_SUCCESS: 607 return (0); 608 case KERN_PROTECTION_FAILURE: 609 return (EACCES); 610 case KERN_RESOURCE_SHORTAGE: 611 return (ENOMEM); 612 } 613 return (EINVAL); 614 } 615 616 #ifndef _SYS_SYSPROTO_H_ 617 struct minherit_args { 618 void *addr; 619 size_t len; 620 int inherit; 621 }; 622 #endif 623 int 624 sys_minherit(struct thread *td, struct minherit_args *uap) 625 { 626 vm_offset_t addr; 627 vm_size_t size, pageoff; 628 vm_inherit_t inherit; 629 630 addr = (vm_offset_t)uap->addr; 631 size = uap->len; 632 inherit = uap->inherit; 633 634 pageoff = (addr & PAGE_MASK); 635 addr -= pageoff; 636 size += pageoff; 637 size = (vm_size_t) round_page(size); 638 if (addr + size < addr) 639 return (EINVAL); 640 641 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, 642 addr + size, inherit)) { 643 case KERN_SUCCESS: 644 return (0); 645 case KERN_PROTECTION_FAILURE: 646 return (EACCES); 647 } 648 return (EINVAL); 649 } 650 651 #ifndef _SYS_SYSPROTO_H_ 652 struct madvise_args { 653 void *addr; 654 size_t len; 655 int behav; 656 }; 657 #endif 658 659 int 660 sys_madvise(struct thread *td, struct madvise_args *uap) 661 { 662 663 return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav)); 664 } 665 666 int 667 kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav) 668 { 669 vm_map_t map; 670 vm_offset_t addr, end, start; 671 int flags; 672 673 /* 674 * Check for our special case, advising the swap pager we are 675 * "immortal." 676 */ 677 if (behav == MADV_PROTECT) { 678 flags = PPROT_SET; 679 return (kern_procctl(td, P_PID, td->td_proc->p_pid, 680 PROC_SPROTECT, &flags)); 681 } 682 683 /* 684 * Check for illegal behavior 685 */ 686 if (behav < 0 || behav > MADV_CORE) 687 return (EINVAL); 688 /* 689 * Check for illegal addresses. Watch out for address wrap... Note 690 * that VM_*_ADDRESS are not constants due to casts (argh). 691 */ 692 map = &td->td_proc->p_vmspace->vm_map; 693 addr = addr0; 694 if (addr < vm_map_min(map) || addr + len > vm_map_max(map)) 695 return (EINVAL); 696 if ((addr + len) < addr) 697 return (EINVAL); 698 699 /* 700 * Since this routine is only advisory, we default to conservative 701 * behavior. 702 */ 703 start = trunc_page(addr); 704 end = round_page(addr + len); 705 706 if (vm_map_madvise(map, start, end, behav)) 707 return (EINVAL); 708 return (0); 709 } 710 711 #ifndef _SYS_SYSPROTO_H_ 712 struct mincore_args { 713 const void *addr; 714 size_t len; 715 char *vec; 716 }; 717 #endif 718 719 int 720 sys_mincore(struct thread *td, struct mincore_args *uap) 721 { 722 723 return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec)); 724 } 725 726 int 727 kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec) 728 { 729 vm_offset_t addr, first_addr; 730 vm_offset_t end, cend; 731 pmap_t pmap; 732 vm_map_t map; 733 int error = 0; 734 int vecindex, lastvecindex; 735 vm_map_entry_t current; 736 vm_map_entry_t entry; 737 vm_object_t object; 738 vm_paddr_t locked_pa; 739 vm_page_t m; 740 vm_pindex_t pindex; 741 int mincoreinfo; 742 unsigned int timestamp; 743 boolean_t locked; 744 745 /* 746 * Make sure that the addresses presented are valid for user 747 * mode. 748 */ 749 first_addr = addr = trunc_page(addr0); 750 end = addr + (vm_size_t)round_page(len); 751 map = &td->td_proc->p_vmspace->vm_map; 752 if (end > vm_map_max(map) || end < addr) 753 return (ENOMEM); 754 755 pmap = vmspace_pmap(td->td_proc->p_vmspace); 756 757 vm_map_lock_read(map); 758 RestartScan: 759 timestamp = map->timestamp; 760 761 if (!vm_map_lookup_entry(map, addr, &entry)) { 762 vm_map_unlock_read(map); 763 return (ENOMEM); 764 } 765 766 /* 767 * Do this on a map entry basis so that if the pages are not 768 * in the current processes address space, we can easily look 769 * up the pages elsewhere. 770 */ 771 lastvecindex = -1; 772 for (current = entry; current->start < end; current = current->next) { 773 774 /* 775 * check for contiguity 776 */ 777 if (current->end < end && current->next->start > current->end) { 778 vm_map_unlock_read(map); 779 return (ENOMEM); 780 } 781 782 /* 783 * ignore submaps (for now) or null objects 784 */ 785 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || 786 current->object.vm_object == NULL) 787 continue; 788 789 /* 790 * limit this scan to the current map entry and the 791 * limits for the mincore call 792 */ 793 if (addr < current->start) 794 addr = current->start; 795 cend = current->end; 796 if (cend > end) 797 cend = end; 798 799 /* 800 * scan this entry one page at a time 801 */ 802 while (addr < cend) { 803 /* 804 * Check pmap first, it is likely faster, also 805 * it can provide info as to whether we are the 806 * one referencing or modifying the page. 807 */ 808 object = NULL; 809 locked_pa = 0; 810 retry: 811 m = NULL; 812 mincoreinfo = pmap_mincore(pmap, addr, &locked_pa); 813 if (locked_pa != 0) { 814 /* 815 * The page is mapped by this process but not 816 * both accessed and modified. It is also 817 * managed. Acquire the object lock so that 818 * other mappings might be examined. 819 */ 820 m = PHYS_TO_VM_PAGE(locked_pa); 821 if (m->object != object) { 822 if (object != NULL) 823 VM_OBJECT_WUNLOCK(object); 824 object = m->object; 825 locked = VM_OBJECT_TRYWLOCK(object); 826 vm_page_unlock(m); 827 if (!locked) { 828 VM_OBJECT_WLOCK(object); 829 vm_page_lock(m); 830 goto retry; 831 } 832 } else 833 vm_page_unlock(m); 834 KASSERT(m->valid == VM_PAGE_BITS_ALL, 835 ("mincore: page %p is mapped but invalid", 836 m)); 837 } else if (mincoreinfo == 0) { 838 /* 839 * The page is not mapped by this process. If 840 * the object implements managed pages, then 841 * determine if the page is resident so that 842 * the mappings might be examined. 843 */ 844 if (current->object.vm_object != object) { 845 if (object != NULL) 846 VM_OBJECT_WUNLOCK(object); 847 object = current->object.vm_object; 848 VM_OBJECT_WLOCK(object); 849 } 850 if (object->type == OBJT_DEFAULT || 851 object->type == OBJT_SWAP || 852 object->type == OBJT_VNODE) { 853 pindex = OFF_TO_IDX(current->offset + 854 (addr - current->start)); 855 m = vm_page_lookup(object, pindex); 856 if (m != NULL && m->valid == 0) 857 m = NULL; 858 if (m != NULL) 859 mincoreinfo = MINCORE_INCORE; 860 } 861 } 862 if (m != NULL) { 863 /* Examine other mappings to the page. */ 864 if (m->dirty == 0 && pmap_is_modified(m)) 865 vm_page_dirty(m); 866 if (m->dirty != 0) 867 mincoreinfo |= MINCORE_MODIFIED_OTHER; 868 /* 869 * The first test for PGA_REFERENCED is an 870 * optimization. The second test is 871 * required because a concurrent pmap 872 * operation could clear the last reference 873 * and set PGA_REFERENCED before the call to 874 * pmap_is_referenced(). 875 */ 876 if ((m->aflags & PGA_REFERENCED) != 0 || 877 pmap_is_referenced(m) || 878 (m->aflags & PGA_REFERENCED) != 0) 879 mincoreinfo |= MINCORE_REFERENCED_OTHER; 880 } 881 if (object != NULL) 882 VM_OBJECT_WUNLOCK(object); 883 884 /* 885 * subyte may page fault. In case it needs to modify 886 * the map, we release the lock. 887 */ 888 vm_map_unlock_read(map); 889 890 /* 891 * calculate index into user supplied byte vector 892 */ 893 vecindex = atop(addr - first_addr); 894 895 /* 896 * If we have skipped map entries, we need to make sure that 897 * the byte vector is zeroed for those skipped entries. 898 */ 899 while ((lastvecindex + 1) < vecindex) { 900 ++lastvecindex; 901 error = subyte(vec + lastvecindex, 0); 902 if (error) { 903 error = EFAULT; 904 goto done2; 905 } 906 } 907 908 /* 909 * Pass the page information to the user 910 */ 911 error = subyte(vec + vecindex, mincoreinfo); 912 if (error) { 913 error = EFAULT; 914 goto done2; 915 } 916 917 /* 918 * If the map has changed, due to the subyte, the previous 919 * output may be invalid. 920 */ 921 vm_map_lock_read(map); 922 if (timestamp != map->timestamp) 923 goto RestartScan; 924 925 lastvecindex = vecindex; 926 addr += PAGE_SIZE; 927 } 928 } 929 930 /* 931 * subyte may page fault. In case it needs to modify 932 * the map, we release the lock. 933 */ 934 vm_map_unlock_read(map); 935 936 /* 937 * Zero the last entries in the byte vector. 938 */ 939 vecindex = atop(end - first_addr); 940 while ((lastvecindex + 1) < vecindex) { 941 ++lastvecindex; 942 error = subyte(vec + lastvecindex, 0); 943 if (error) { 944 error = EFAULT; 945 goto done2; 946 } 947 } 948 949 /* 950 * If the map has changed, due to the subyte, the previous 951 * output may be invalid. 952 */ 953 vm_map_lock_read(map); 954 if (timestamp != map->timestamp) 955 goto RestartScan; 956 vm_map_unlock_read(map); 957 done2: 958 return (error); 959 } 960 961 #ifndef _SYS_SYSPROTO_H_ 962 struct mlock_args { 963 const void *addr; 964 size_t len; 965 }; 966 #endif 967 int 968 sys_mlock(struct thread *td, struct mlock_args *uap) 969 { 970 971 return (kern_mlock(td->td_proc, td->td_ucred, 972 __DECONST(uintptr_t, uap->addr), uap->len)); 973 } 974 975 int 976 kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len) 977 { 978 vm_offset_t addr, end, last, start; 979 vm_size_t npages, size; 980 vm_map_t map; 981 unsigned long nsize; 982 int error; 983 984 error = priv_check_cred(cred, PRIV_VM_MLOCK, 0); 985 if (error) 986 return (error); 987 addr = addr0; 988 size = len; 989 last = addr + size; 990 start = trunc_page(addr); 991 end = round_page(last); 992 if (last < addr || end < addr) 993 return (EINVAL); 994 npages = atop(end - start); 995 if (npages > vm_page_max_wired) 996 return (ENOMEM); 997 map = &proc->p_vmspace->vm_map; 998 PROC_LOCK(proc); 999 nsize = ptoa(npages + pmap_wired_count(map->pmap)); 1000 if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) { 1001 PROC_UNLOCK(proc); 1002 return (ENOMEM); 1003 } 1004 PROC_UNLOCK(proc); 1005 if (npages + vm_cnt.v_wire_count > vm_page_max_wired) 1006 return (EAGAIN); 1007 #ifdef RACCT 1008 if (racct_enable) { 1009 PROC_LOCK(proc); 1010 error = racct_set(proc, RACCT_MEMLOCK, nsize); 1011 PROC_UNLOCK(proc); 1012 if (error != 0) 1013 return (ENOMEM); 1014 } 1015 #endif 1016 error = vm_map_wire(map, start, end, 1017 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1018 #ifdef RACCT 1019 if (racct_enable && error != KERN_SUCCESS) { 1020 PROC_LOCK(proc); 1021 racct_set(proc, RACCT_MEMLOCK, 1022 ptoa(pmap_wired_count(map->pmap))); 1023 PROC_UNLOCK(proc); 1024 } 1025 #endif 1026 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1027 } 1028 1029 #ifndef _SYS_SYSPROTO_H_ 1030 struct mlockall_args { 1031 int how; 1032 }; 1033 #endif 1034 1035 int 1036 sys_mlockall(struct thread *td, struct mlockall_args *uap) 1037 { 1038 vm_map_t map; 1039 int error; 1040 1041 map = &td->td_proc->p_vmspace->vm_map; 1042 error = priv_check(td, PRIV_VM_MLOCK); 1043 if (error) 1044 return (error); 1045 1046 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) 1047 return (EINVAL); 1048 1049 /* 1050 * If wiring all pages in the process would cause it to exceed 1051 * a hard resource limit, return ENOMEM. 1052 */ 1053 if (!old_mlock && uap->how & MCL_CURRENT) { 1054 PROC_LOCK(td->td_proc); 1055 if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) { 1056 PROC_UNLOCK(td->td_proc); 1057 return (ENOMEM); 1058 } 1059 PROC_UNLOCK(td->td_proc); 1060 } 1061 #ifdef RACCT 1062 if (racct_enable) { 1063 PROC_LOCK(td->td_proc); 1064 error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size); 1065 PROC_UNLOCK(td->td_proc); 1066 if (error != 0) 1067 return (ENOMEM); 1068 } 1069 #endif 1070 1071 if (uap->how & MCL_FUTURE) { 1072 vm_map_lock(map); 1073 vm_map_modflags(map, MAP_WIREFUTURE, 0); 1074 vm_map_unlock(map); 1075 error = 0; 1076 } 1077 1078 if (uap->how & MCL_CURRENT) { 1079 /* 1080 * P1003.1-2001 mandates that all currently mapped pages 1081 * will be memory resident and locked (wired) upon return 1082 * from mlockall(). vm_map_wire() will wire pages, by 1083 * calling vm_fault_wire() for each page in the region. 1084 */ 1085 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), 1086 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1087 error = (error == KERN_SUCCESS ? 0 : EAGAIN); 1088 } 1089 #ifdef RACCT 1090 if (racct_enable && error != KERN_SUCCESS) { 1091 PROC_LOCK(td->td_proc); 1092 racct_set(td->td_proc, RACCT_MEMLOCK, 1093 ptoa(pmap_wired_count(map->pmap))); 1094 PROC_UNLOCK(td->td_proc); 1095 } 1096 #endif 1097 1098 return (error); 1099 } 1100 1101 #ifndef _SYS_SYSPROTO_H_ 1102 struct munlockall_args { 1103 register_t dummy; 1104 }; 1105 #endif 1106 1107 int 1108 sys_munlockall(struct thread *td, struct munlockall_args *uap) 1109 { 1110 vm_map_t map; 1111 int error; 1112 1113 map = &td->td_proc->p_vmspace->vm_map; 1114 error = priv_check(td, PRIV_VM_MUNLOCK); 1115 if (error) 1116 return (error); 1117 1118 /* Clear the MAP_WIREFUTURE flag from this vm_map. */ 1119 vm_map_lock(map); 1120 vm_map_modflags(map, 0, MAP_WIREFUTURE); 1121 vm_map_unlock(map); 1122 1123 /* Forcibly unwire all pages. */ 1124 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), 1125 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1126 #ifdef RACCT 1127 if (racct_enable && error == KERN_SUCCESS) { 1128 PROC_LOCK(td->td_proc); 1129 racct_set(td->td_proc, RACCT_MEMLOCK, 0); 1130 PROC_UNLOCK(td->td_proc); 1131 } 1132 #endif 1133 1134 return (error); 1135 } 1136 1137 #ifndef _SYS_SYSPROTO_H_ 1138 struct munlock_args { 1139 const void *addr; 1140 size_t len; 1141 }; 1142 #endif 1143 int 1144 sys_munlock(struct thread *td, struct munlock_args *uap) 1145 { 1146 1147 return (kern_munlock(td, (uintptr_t)uap->addr, uap->len)); 1148 } 1149 1150 int 1151 kern_munlock(struct thread *td, uintptr_t addr0, size_t size) 1152 { 1153 vm_offset_t addr, end, last, start; 1154 #ifdef RACCT 1155 vm_map_t map; 1156 #endif 1157 int error; 1158 1159 error = priv_check(td, PRIV_VM_MUNLOCK); 1160 if (error) 1161 return (error); 1162 addr = addr0; 1163 last = addr + size; 1164 start = trunc_page(addr); 1165 end = round_page(last); 1166 if (last < addr || end < addr) 1167 return (EINVAL); 1168 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, 1169 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1170 #ifdef RACCT 1171 if (racct_enable && error == KERN_SUCCESS) { 1172 PROC_LOCK(td->td_proc); 1173 map = &td->td_proc->p_vmspace->vm_map; 1174 racct_set(td->td_proc, RACCT_MEMLOCK, 1175 ptoa(pmap_wired_count(map->pmap))); 1176 PROC_UNLOCK(td->td_proc); 1177 } 1178 #endif 1179 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1180 } 1181 1182 /* 1183 * vm_mmap_vnode() 1184 * 1185 * Helper function for vm_mmap. Perform sanity check specific for mmap 1186 * operations on vnodes. 1187 */ 1188 int 1189 vm_mmap_vnode(struct thread *td, vm_size_t objsize, 1190 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1191 struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp, 1192 boolean_t *writecounted) 1193 { 1194 struct vattr va; 1195 vm_object_t obj; 1196 vm_ooffset_t foff; 1197 struct ucred *cred; 1198 int error, flags, locktype; 1199 1200 cred = td->td_ucred; 1201 if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED)) 1202 locktype = LK_EXCLUSIVE; 1203 else 1204 locktype = LK_SHARED; 1205 if ((error = vget(vp, locktype, td)) != 0) 1206 return (error); 1207 AUDIT_ARG_VNODE1(vp); 1208 foff = *foffp; 1209 flags = *flagsp; 1210 obj = vp->v_object; 1211 if (vp->v_type == VREG) { 1212 /* 1213 * Get the proper underlying object 1214 */ 1215 if (obj == NULL) { 1216 error = EINVAL; 1217 goto done; 1218 } 1219 if (obj->type == OBJT_VNODE && obj->handle != vp) { 1220 vput(vp); 1221 vp = (struct vnode *)obj->handle; 1222 /* 1223 * Bypass filesystems obey the mpsafety of the 1224 * underlying fs. Tmpfs never bypasses. 1225 */ 1226 error = vget(vp, locktype, td); 1227 if (error != 0) 1228 return (error); 1229 } 1230 if (locktype == LK_EXCLUSIVE) { 1231 *writecounted = TRUE; 1232 vnode_pager_update_writecount(obj, 0, objsize); 1233 } 1234 } else { 1235 error = EINVAL; 1236 goto done; 1237 } 1238 if ((error = VOP_GETATTR(vp, &va, cred))) 1239 goto done; 1240 #ifdef MAC 1241 /* This relies on VM_PROT_* matching PROT_*. */ 1242 error = mac_vnode_check_mmap(cred, vp, (int)prot, flags); 1243 if (error != 0) 1244 goto done; 1245 #endif 1246 if ((flags & MAP_SHARED) != 0) { 1247 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { 1248 if (prot & VM_PROT_WRITE) { 1249 error = EPERM; 1250 goto done; 1251 } 1252 *maxprotp &= ~VM_PROT_WRITE; 1253 } 1254 } 1255 /* 1256 * If it is a regular file without any references 1257 * we do not need to sync it. 1258 * Adjust object size to be the size of actual file. 1259 */ 1260 objsize = round_page(va.va_size); 1261 if (va.va_nlink == 0) 1262 flags |= MAP_NOSYNC; 1263 if (obj->type == OBJT_VNODE) { 1264 obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, 1265 cred); 1266 if (obj == NULL) { 1267 error = ENOMEM; 1268 goto done; 1269 } 1270 } else { 1271 KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP, 1272 ("wrong object type")); 1273 VM_OBJECT_WLOCK(obj); 1274 vm_object_reference_locked(obj); 1275 #if VM_NRESERVLEVEL > 0 1276 vm_object_color(obj, 0); 1277 #endif 1278 VM_OBJECT_WUNLOCK(obj); 1279 } 1280 *objp = obj; 1281 *flagsp = flags; 1282 1283 vfs_mark_atime(vp, cred); 1284 1285 done: 1286 if (error != 0 && *writecounted) { 1287 *writecounted = FALSE; 1288 vnode_pager_update_writecount(obj, objsize, 0); 1289 } 1290 vput(vp); 1291 return (error); 1292 } 1293 1294 /* 1295 * vm_mmap_cdev() 1296 * 1297 * Helper function for vm_mmap. Perform sanity check specific for mmap 1298 * operations on cdevs. 1299 */ 1300 int 1301 vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot, 1302 vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw, 1303 vm_ooffset_t *foff, vm_object_t *objp) 1304 { 1305 vm_object_t obj; 1306 int error, flags; 1307 1308 flags = *flagsp; 1309 1310 if (dsw->d_flags & D_MMAP_ANON) { 1311 *objp = NULL; 1312 *foff = 0; 1313 *maxprotp = VM_PROT_ALL; 1314 *flagsp |= MAP_ANON; 1315 return (0); 1316 } 1317 /* 1318 * cdevs do not provide private mappings of any kind. 1319 */ 1320 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1321 (prot & VM_PROT_WRITE) != 0) 1322 return (EACCES); 1323 if (flags & (MAP_PRIVATE|MAP_COPY)) 1324 return (EINVAL); 1325 /* 1326 * Force device mappings to be shared. 1327 */ 1328 flags |= MAP_SHARED; 1329 #ifdef MAC_XXX 1330 error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot); 1331 if (error != 0) 1332 return (error); 1333 #endif 1334 /* 1335 * First, try d_mmap_single(). If that is not implemented 1336 * (returns ENODEV), fall back to using the device pager. 1337 * Note that d_mmap_single() must return a reference to the 1338 * object (it needs to bump the reference count of the object 1339 * it returns somehow). 1340 * 1341 * XXX assumes VM_PROT_* == PROT_* 1342 */ 1343 error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); 1344 if (error != ENODEV) 1345 return (error); 1346 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, 1347 td->td_ucred); 1348 if (obj == NULL) 1349 return (EINVAL); 1350 *objp = obj; 1351 *flagsp = flags; 1352 return (0); 1353 } 1354 1355 /* 1356 * vm_mmap() 1357 * 1358 * Internal version of mmap used by exec, sys5 shared memory, and 1359 * various device drivers. Handle is either a vnode pointer, a 1360 * character device, or NULL for MAP_ANON. 1361 */ 1362 int 1363 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1364 vm_prot_t maxprot, int flags, 1365 objtype_t handle_type, void *handle, 1366 vm_ooffset_t foff) 1367 { 1368 vm_object_t object; 1369 struct thread *td = curthread; 1370 int error; 1371 boolean_t writecounted; 1372 1373 if (size == 0) 1374 return (EINVAL); 1375 1376 size = round_page(size); 1377 object = NULL; 1378 writecounted = FALSE; 1379 1380 /* 1381 * Lookup/allocate object. 1382 */ 1383 switch (handle_type) { 1384 case OBJT_DEVICE: { 1385 struct cdevsw *dsw; 1386 struct cdev *cdev; 1387 int ref; 1388 1389 cdev = handle; 1390 dsw = dev_refthread(cdev, &ref); 1391 if (dsw == NULL) 1392 return (ENXIO); 1393 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev, 1394 dsw, &foff, &object); 1395 dev_relthread(cdev, ref); 1396 break; 1397 } 1398 case OBJT_VNODE: 1399 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, 1400 handle, &foff, &object, &writecounted); 1401 break; 1402 case OBJT_DEFAULT: 1403 if (handle == NULL) { 1404 error = 0; 1405 break; 1406 } 1407 /* FALLTHROUGH */ 1408 default: 1409 error = EINVAL; 1410 break; 1411 } 1412 if (error) 1413 return (error); 1414 1415 error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, 1416 foff, writecounted, td); 1417 if (error != 0 && object != NULL) { 1418 /* 1419 * If this mapping was accounted for in the vnode's 1420 * writecount, then undo that now. 1421 */ 1422 if (writecounted) 1423 vnode_pager_release_writecount(object, 0, size); 1424 vm_object_deallocate(object); 1425 } 1426 return (error); 1427 } 1428 1429 /* 1430 * Internal version of mmap that maps a specific VM object into an 1431 * map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap. 1432 */ 1433 int 1434 vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1435 vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff, 1436 boolean_t writecounted, struct thread *td) 1437 { 1438 boolean_t curmap, fitit; 1439 vm_offset_t max_addr; 1440 int docow, error, findspace, rv; 1441 1442 curmap = map == &td->td_proc->p_vmspace->vm_map; 1443 if (curmap) { 1444 PROC_LOCK(td->td_proc); 1445 if (map->size + size > lim_cur_proc(td->td_proc, RLIMIT_VMEM)) { 1446 PROC_UNLOCK(td->td_proc); 1447 return (ENOMEM); 1448 } 1449 if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { 1450 PROC_UNLOCK(td->td_proc); 1451 return (ENOMEM); 1452 } 1453 if (!old_mlock && map->flags & MAP_WIREFUTURE) { 1454 if (ptoa(pmap_wired_count(map->pmap)) + size > 1455 lim_cur_proc(td->td_proc, RLIMIT_MEMLOCK)) { 1456 racct_set_force(td->td_proc, RACCT_VMEM, 1457 map->size); 1458 PROC_UNLOCK(td->td_proc); 1459 return (ENOMEM); 1460 } 1461 error = racct_set(td->td_proc, RACCT_MEMLOCK, 1462 ptoa(pmap_wired_count(map->pmap)) + size); 1463 if (error != 0) { 1464 racct_set_force(td->td_proc, RACCT_VMEM, 1465 map->size); 1466 PROC_UNLOCK(td->td_proc); 1467 return (error); 1468 } 1469 } 1470 PROC_UNLOCK(td->td_proc); 1471 } 1472 1473 /* 1474 * We currently can only deal with page aligned file offsets. 1475 * The mmap() system call already enforces this by subtracting 1476 * the page offset from the file offset, but checking here 1477 * catches errors in device drivers (e.g. d_single_mmap() 1478 * callbacks) and other internal mapping requests (such as in 1479 * exec). 1480 */ 1481 if (foff & PAGE_MASK) 1482 return (EINVAL); 1483 1484 if ((flags & MAP_FIXED) == 0) { 1485 fitit = TRUE; 1486 *addr = round_page(*addr); 1487 } else { 1488 if (*addr != trunc_page(*addr)) 1489 return (EINVAL); 1490 fitit = FALSE; 1491 } 1492 1493 if (flags & MAP_ANON) { 1494 if (object != NULL || foff != 0) 1495 return (EINVAL); 1496 docow = 0; 1497 } else if (flags & MAP_PREFAULT_READ) 1498 docow = MAP_PREFAULT; 1499 else 1500 docow = MAP_PREFAULT_PARTIAL; 1501 1502 if ((flags & (MAP_ANON|MAP_SHARED)) == 0) 1503 docow |= MAP_COPY_ON_WRITE; 1504 if (flags & MAP_NOSYNC) 1505 docow |= MAP_DISABLE_SYNCER; 1506 if (flags & MAP_NOCORE) 1507 docow |= MAP_DISABLE_COREDUMP; 1508 /* Shared memory is also shared with children. */ 1509 if (flags & MAP_SHARED) 1510 docow |= MAP_INHERIT_SHARE; 1511 if (writecounted) 1512 docow |= MAP_VN_WRITECOUNT; 1513 if (flags & MAP_STACK) { 1514 if (object != NULL) 1515 return (EINVAL); 1516 docow |= MAP_STACK_GROWS_DOWN; 1517 } 1518 if ((flags & MAP_EXCL) != 0) 1519 docow |= MAP_CHECK_EXCL; 1520 if ((flags & MAP_GUARD) != 0) 1521 docow |= MAP_CREATE_GUARD; 1522 1523 if (fitit) { 1524 if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER) 1525 findspace = VMFS_SUPER_SPACE; 1526 else if ((flags & MAP_ALIGNMENT_MASK) != 0) 1527 findspace = VMFS_ALIGNED_SPACE(flags >> 1528 MAP_ALIGNMENT_SHIFT); 1529 else 1530 findspace = VMFS_OPTIMAL_SPACE; 1531 max_addr = 0; 1532 #ifdef MAP_32BIT 1533 if ((flags & MAP_32BIT) != 0) 1534 max_addr = MAP_32BIT_MAX_ADDR; 1535 #endif 1536 if (curmap) { 1537 rv = vm_map_find_min(map, object, foff, addr, size, 1538 round_page((vm_offset_t)td->td_proc->p_vmspace-> 1539 vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr, 1540 findspace, prot, maxprot, docow); 1541 } else { 1542 rv = vm_map_find(map, object, foff, addr, size, 1543 max_addr, findspace, prot, maxprot, docow); 1544 } 1545 } else { 1546 rv = vm_map_fixed(map, object, foff, *addr, size, 1547 prot, maxprot, docow); 1548 } 1549 1550 if (rv == KERN_SUCCESS) { 1551 /* 1552 * If the process has requested that all future mappings 1553 * be wired, then heed this. 1554 */ 1555 if (map->flags & MAP_WIREFUTURE) { 1556 vm_map_wire(map, *addr, *addr + size, 1557 VM_MAP_WIRE_USER | ((flags & MAP_STACK) ? 1558 VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES)); 1559 } 1560 } 1561 return (vm_mmap_to_errno(rv)); 1562 } 1563 1564 /* 1565 * Translate a Mach VM return code to zero on success or the appropriate errno 1566 * on failure. 1567 */ 1568 int 1569 vm_mmap_to_errno(int rv) 1570 { 1571 1572 switch (rv) { 1573 case KERN_SUCCESS: 1574 return (0); 1575 case KERN_INVALID_ADDRESS: 1576 case KERN_NO_SPACE: 1577 return (ENOMEM); 1578 case KERN_PROTECTION_FAILURE: 1579 return (EACCES); 1580 default: 1581 return (EINVAL); 1582 } 1583 } 1584