1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1988 University of Utah. 5 * Copyright (c) 1991, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * the Systems Programming Group of the University of Utah Computer 10 * Science Department. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hwpmc_hooks.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/elf.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
#include <machine/md_var.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

/*
 * vm.old_mlock: when non-zero, sys_mlockall() skips its up-front
 * RLIMIT_MEMLOCK comparison for MCL_CURRENT requests.
 */
int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");

/*
 * vm.mincore_mapped: when non-zero, kern_mincore() reports only what
 * pmap_mincore() says about the calling process's own mapping and does
 * not go on to examine the backing object or other mappings.
 */
static int mincore_mapped = 1;
SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
    "mincore reports mappings, not residency");

/*
 * vm.imply_prot_max: when non-zero, kern_mmap_maxprot() uses the requested
 * protection as the maximum protection if the caller gave no PROT_MAX().
 */
static int imply_prot_max = 0;
SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
    "Imply maximum page permissions in mmap() when none are specified");

#ifdef MAP_32BIT
/* Exclusive upper bound (2GB) for the end of a MAP_32BIT mapping. */
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

/*
 * sbrk system call stub.  Always fails with EOPNOTSUPP; the argument
 * is ignored.
 */
int
sys_sbrk(struct thread *td, struct sbrk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * sstk system call stub.  Always fails with EOPNOTSUPP; the argument
 * is ignored.
 */
int
sys_sstk(struct thread *td, struct sstk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
/*
 * Old getpagesize system call (COMPAT_43 only): returns PAGE_SIZE as the
 * system call's return value.
 */
int
ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
164 */ 165 #ifndef _SYS_SYSPROTO_H_ 166 struct mmap_args { 167 void *addr; 168 size_t len; 169 int prot; 170 int flags; 171 int fd; 172 long pad; 173 off_t pos; 174 }; 175 #endif 176 177 int 178 sys_mmap(struct thread *td, struct mmap_args *uap) 179 { 180 181 return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot, 182 uap->flags, uap->fd, uap->pos)); 183 } 184 185 int 186 kern_mmap_maxprot(struct proc *p, int prot) 187 { 188 189 if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 || 190 (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0) 191 return (_PROT_ALL); 192 if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) && 193 prot != PROT_NONE) 194 return (prot); 195 return (_PROT_ALL); 196 } 197 198 int 199 kern_mmap(struct thread *td, uintptr_t addr0, size_t len, int prot, int flags, 200 int fd, off_t pos) 201 { 202 struct vmspace *vms; 203 struct file *fp; 204 struct proc *p; 205 vm_offset_t addr; 206 vm_size_t pageoff, size; 207 vm_prot_t cap_maxprot; 208 int align, error, max_prot; 209 cap_rights_t rights; 210 211 if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0) 212 return (EINVAL); 213 max_prot = PROT_MAX_EXTRACT(prot); 214 prot = PROT_EXTRACT(prot); 215 if (max_prot != 0 && (max_prot & prot) != prot) 216 return (EINVAL); 217 218 p = td->td_proc; 219 220 /* 221 * Always honor PROT_MAX if set. If not, default to all 222 * permissions unless we're implying maximum permissions. 223 */ 224 if (max_prot == 0) 225 max_prot = kern_mmap_maxprot(p, prot); 226 227 vms = p->p_vmspace; 228 fp = NULL; 229 AUDIT_ARG_FD(fd); 230 addr = addr0; 231 232 /* 233 * Ignore old flags that used to be defined but did not do anything. 234 */ 235 flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040); 236 237 /* 238 * Enforce the constraints. 239 * Mapping of length 0 is only allowed for old binaries. 240 * Anonymous mapping shall specify -1 as filedescriptor and 241 * zero position for new code. 
Be nice to ancient a.out 242 * binaries and correct pos for anonymous mapping, since old 243 * ld.so sometimes issues anonymous map requests with non-zero 244 * pos. 245 */ 246 if (!SV_CURPROC_FLAG(SV_AOUT)) { 247 if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) || 248 ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0))) 249 return (EINVAL); 250 } else { 251 if ((flags & MAP_ANON) != 0) 252 pos = 0; 253 } 254 255 if (flags & MAP_STACK) { 256 if ((fd != -1) || 257 ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) 258 return (EINVAL); 259 flags |= MAP_ANON; 260 pos = 0; 261 } 262 if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE | 263 MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE | 264 MAP_PREFAULT_READ | MAP_GUARD | 265 #ifdef MAP_32BIT 266 MAP_32BIT | 267 #endif 268 MAP_ALIGNMENT_MASK)) != 0) 269 return (EINVAL); 270 if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) 271 return (EINVAL); 272 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE)) 273 return (EINVAL); 274 if (prot != PROT_NONE && 275 (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0) 276 return (EINVAL); 277 if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 || 278 pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL | 279 #ifdef MAP_32BIT 280 MAP_32BIT | 281 #endif 282 MAP_ALIGNMENT_MASK)) != 0)) 283 return (EINVAL); 284 285 /* 286 * Align the file position to a page boundary, 287 * and save its page offset component. 288 */ 289 pageoff = (pos & PAGE_MASK); 290 pos -= pageoff; 291 292 /* Compute size from len by rounding (on both ends). */ 293 size = len + pageoff; /* low end... */ 294 size = round_page(size); /* hi end */ 295 /* Check for rounding up to zero. */ 296 if (len > size) 297 return (ENOMEM); 298 299 /* Ensure alignment is at least a page and fits in a pointer. 
*/ 300 align = flags & MAP_ALIGNMENT_MASK; 301 if (align != 0 && align != MAP_ALIGNED_SUPER && 302 (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY || 303 align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT)) 304 return (EINVAL); 305 306 /* 307 * Check for illegal addresses. Watch out for address wrap... Note 308 * that VM_*_ADDRESS are not constants due to casts (argh). 309 */ 310 if (flags & MAP_FIXED) { 311 /* 312 * The specified address must have the same remainder 313 * as the file offset taken modulo PAGE_SIZE, so it 314 * should be aligned after adjustment by pageoff. 315 */ 316 addr -= pageoff; 317 if (addr & PAGE_MASK) 318 return (EINVAL); 319 320 /* Address range must be all in user VM space. */ 321 if (addr < vm_map_min(&vms->vm_map) || 322 addr + size > vm_map_max(&vms->vm_map)) 323 return (EINVAL); 324 if (addr + size < addr) 325 return (EINVAL); 326 #ifdef MAP_32BIT 327 if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR) 328 return (EINVAL); 329 } else if (flags & MAP_32BIT) { 330 /* 331 * For MAP_32BIT, override the hint if it is too high and 332 * do not bother moving the mapping past the heap (since 333 * the heap is usually above 2GB). 334 */ 335 if (addr + size > MAP_32BIT_MAX_ADDR) 336 addr = 0; 337 #endif 338 } else { 339 /* 340 * XXX for non-fixed mappings where no hint is provided or 341 * the hint would fall in the potential heap space, 342 * place it after the end of the largest possible heap. 343 * 344 * There should really be a pmap call to determine a reasonable 345 * location. 346 */ 347 if (addr == 0 || 348 (addr >= round_page((vm_offset_t)vms->vm_taddr) && 349 addr < round_page((vm_offset_t)vms->vm_daddr + 350 lim_max(td, RLIMIT_DATA)))) 351 addr = round_page((vm_offset_t)vms->vm_daddr + 352 lim_max(td, RLIMIT_DATA)); 353 } 354 if (len == 0) { 355 /* 356 * Return success without mapping anything for old 357 * binaries that request a page-aligned mapping of 358 * length 0. 
For modern binaries, this function 359 * returns an error earlier. 360 */ 361 error = 0; 362 } else if ((flags & MAP_GUARD) != 0) { 363 error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE, 364 VM_PROT_NONE, flags, NULL, pos, FALSE, td); 365 } else if ((flags & MAP_ANON) != 0) { 366 /* 367 * Mapping blank space is trivial. 368 * 369 * This relies on VM_PROT_* matching PROT_*. 370 */ 371 error = vm_mmap_object(&vms->vm_map, &addr, size, prot, 372 max_prot, flags, NULL, pos, FALSE, td); 373 } else { 374 /* 375 * Mapping file, get fp for validation and don't let the 376 * descriptor disappear on us if we block. Check capability 377 * rights, but also return the maximum rights to be combined 378 * with maxprot later. 379 */ 380 cap_rights_init(&rights, CAP_MMAP); 381 if (prot & PROT_READ) 382 cap_rights_set(&rights, CAP_MMAP_R); 383 if ((flags & MAP_SHARED) != 0) { 384 if (prot & PROT_WRITE) 385 cap_rights_set(&rights, CAP_MMAP_W); 386 } 387 if (prot & PROT_EXEC) 388 cap_rights_set(&rights, CAP_MMAP_X); 389 error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp); 390 if (error != 0) 391 goto done; 392 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 && 393 p->p_osrel >= P_OSREL_MAP_FSTRICT) { 394 error = EINVAL; 395 goto done; 396 } 397 398 /* This relies on VM_PROT_* matching PROT_*. 
*/ 399 error = fo_mmap(fp, &vms->vm_map, &addr, size, prot, 400 max_prot & cap_maxprot, flags, pos, td); 401 } 402 403 if (error == 0) 404 td->td_retval[0] = (register_t) (addr + pageoff); 405 done: 406 if (fp) 407 fdrop(fp, td); 408 409 return (error); 410 } 411 412 #if defined(COMPAT_FREEBSD6) 413 int 414 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) 415 { 416 417 return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot, 418 uap->flags, uap->fd, uap->pos)); 419 } 420 #endif 421 422 #ifdef COMPAT_43 423 #ifndef _SYS_SYSPROTO_H_ 424 struct ommap_args { 425 caddr_t addr; 426 int len; 427 int prot; 428 int flags; 429 int fd; 430 long pos; 431 }; 432 #endif 433 int 434 ommap(struct thread *td, struct ommap_args *uap) 435 { 436 static const char cvtbsdprot[8] = { 437 0, 438 PROT_EXEC, 439 PROT_WRITE, 440 PROT_EXEC | PROT_WRITE, 441 PROT_READ, 442 PROT_EXEC | PROT_READ, 443 PROT_WRITE | PROT_READ, 444 PROT_EXEC | PROT_WRITE | PROT_READ, 445 }; 446 int flags, prot; 447 448 #define OMAP_ANON 0x0002 449 #define OMAP_COPY 0x0020 450 #define OMAP_SHARED 0x0010 451 #define OMAP_FIXED 0x0100 452 453 prot = cvtbsdprot[uap->prot & 0x7]; 454 #if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__) 455 if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) && 456 prot != 0) 457 prot |= PROT_EXEC; 458 #endif 459 flags = 0; 460 if (uap->flags & OMAP_ANON) 461 flags |= MAP_ANON; 462 if (uap->flags & OMAP_COPY) 463 flags |= MAP_COPY; 464 if (uap->flags & OMAP_SHARED) 465 flags |= MAP_SHARED; 466 else 467 flags |= MAP_PRIVATE; 468 if (uap->flags & OMAP_FIXED) 469 flags |= MAP_FIXED; 470 return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot, flags, 471 uap->fd, uap->pos)); 472 } 473 #endif /* COMPAT_43 */ 474 475 476 #ifndef _SYS_SYSPROTO_H_ 477 struct msync_args { 478 void *addr; 479 size_t len; 480 int flags; 481 }; 482 #endif 483 int 484 sys_msync(struct thread *td, struct msync_args *uap) 485 { 486 487 return (kern_msync(td, 
(uintptr_t)uap->addr, uap->len, uap->flags)); 488 } 489 490 int 491 kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags) 492 { 493 vm_offset_t addr; 494 vm_size_t pageoff; 495 vm_map_t map; 496 int rv; 497 498 addr = addr0; 499 pageoff = (addr & PAGE_MASK); 500 addr -= pageoff; 501 size += pageoff; 502 size = (vm_size_t) round_page(size); 503 if (addr + size < addr) 504 return (EINVAL); 505 506 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) 507 return (EINVAL); 508 509 map = &td->td_proc->p_vmspace->vm_map; 510 511 /* 512 * Clean the pages and interpret the return value. 513 */ 514 rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, 515 (flags & MS_INVALIDATE) != 0); 516 switch (rv) { 517 case KERN_SUCCESS: 518 return (0); 519 case KERN_INVALID_ADDRESS: 520 return (ENOMEM); 521 case KERN_INVALID_ARGUMENT: 522 return (EBUSY); 523 case KERN_FAILURE: 524 return (EIO); 525 default: 526 return (EINVAL); 527 } 528 } 529 530 #ifndef _SYS_SYSPROTO_H_ 531 struct munmap_args { 532 void *addr; 533 size_t len; 534 }; 535 #endif 536 int 537 sys_munmap(struct thread *td, struct munmap_args *uap) 538 { 539 540 return (kern_munmap(td, (uintptr_t)uap->addr, uap->len)); 541 } 542 543 int 544 kern_munmap(struct thread *td, uintptr_t addr0, size_t size) 545 { 546 #ifdef HWPMC_HOOKS 547 struct pmckern_map_out pkm; 548 vm_map_entry_t entry; 549 bool pmc_handled; 550 #endif 551 vm_offset_t addr; 552 vm_size_t pageoff; 553 vm_map_t map; 554 555 if (size == 0) 556 return (EINVAL); 557 558 addr = addr0; 559 pageoff = (addr & PAGE_MASK); 560 addr -= pageoff; 561 size += pageoff; 562 size = (vm_size_t) round_page(size); 563 if (addr + size < addr) 564 return (EINVAL); 565 566 /* 567 * Check for illegal addresses. Watch out for address wrap... 
568 */ 569 map = &td->td_proc->p_vmspace->vm_map; 570 if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) 571 return (EINVAL); 572 vm_map_lock(map); 573 #ifdef HWPMC_HOOKS 574 pmc_handled = false; 575 if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) { 576 pmc_handled = true; 577 /* 578 * Inform hwpmc if the address range being unmapped contains 579 * an executable region. 580 */ 581 pkm.pm_address = (uintptr_t) NULL; 582 if (vm_map_lookup_entry(map, addr, &entry)) { 583 for (; entry->start < addr + size; 584 entry = entry->next) { 585 if (vm_map_check_protection(map, entry->start, 586 entry->end, VM_PROT_EXECUTE) == TRUE) { 587 pkm.pm_address = (uintptr_t) addr; 588 pkm.pm_size = (size_t) size; 589 break; 590 } 591 } 592 } 593 } 594 #endif 595 vm_map_delete(map, addr, addr + size); 596 597 #ifdef HWPMC_HOOKS 598 if (__predict_false(pmc_handled)) { 599 /* downgrade the lock to prevent a LOR with the pmc-sx lock */ 600 vm_map_lock_downgrade(map); 601 if (pkm.pm_address != (uintptr_t) NULL) 602 PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm); 603 vm_map_unlock_read(map); 604 } else 605 #endif 606 vm_map_unlock(map); 607 608 /* vm_map_delete returns nothing but KERN_SUCCESS anyway */ 609 return (0); 610 } 611 612 #ifndef _SYS_SYSPROTO_H_ 613 struct mprotect_args { 614 const void *addr; 615 size_t len; 616 int prot; 617 }; 618 #endif 619 int 620 sys_mprotect(struct thread *td, struct mprotect_args *uap) 621 { 622 623 return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot)); 624 } 625 626 int 627 kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot) 628 { 629 vm_offset_t addr; 630 vm_size_t pageoff; 631 int vm_error, max_prot; 632 633 addr = addr0; 634 if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0) 635 return (EINVAL); 636 max_prot = PROT_MAX_EXTRACT(prot); 637 prot = PROT_EXTRACT(prot); 638 pageoff = (addr & PAGE_MASK); 639 addr -= pageoff; 640 size += pageoff; 641 size = (vm_size_t) round_page(size); 642 #ifdef COMPAT_FREEBSD32 643 
if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { 644 if (((addr + size) & 0xffffffff) < addr) 645 return (EINVAL); 646 } else 647 #endif 648 if (addr + size < addr) 649 return (EINVAL); 650 651 vm_error = KERN_SUCCESS; 652 if (max_prot != 0) { 653 if ((max_prot & prot) != prot) 654 return (EINVAL); 655 vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map, 656 addr, addr + size, max_prot, TRUE); 657 } 658 if (vm_error == KERN_SUCCESS) 659 vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map, 660 addr, addr + size, prot, FALSE); 661 662 switch (vm_error) { 663 case KERN_SUCCESS: 664 return (0); 665 case KERN_PROTECTION_FAILURE: 666 return (EACCES); 667 case KERN_RESOURCE_SHORTAGE: 668 return (ENOMEM); 669 } 670 return (EINVAL); 671 } 672 673 #ifndef _SYS_SYSPROTO_H_ 674 struct minherit_args { 675 void *addr; 676 size_t len; 677 int inherit; 678 }; 679 #endif 680 int 681 sys_minherit(struct thread *td, struct minherit_args *uap) 682 { 683 vm_offset_t addr; 684 vm_size_t size, pageoff; 685 vm_inherit_t inherit; 686 687 addr = (vm_offset_t)uap->addr; 688 size = uap->len; 689 inherit = uap->inherit; 690 691 pageoff = (addr & PAGE_MASK); 692 addr -= pageoff; 693 size += pageoff; 694 size = (vm_size_t) round_page(size); 695 if (addr + size < addr) 696 return (EINVAL); 697 698 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, 699 addr + size, inherit)) { 700 case KERN_SUCCESS: 701 return (0); 702 case KERN_PROTECTION_FAILURE: 703 return (EACCES); 704 } 705 return (EINVAL); 706 } 707 708 #ifndef _SYS_SYSPROTO_H_ 709 struct madvise_args { 710 void *addr; 711 size_t len; 712 int behav; 713 }; 714 #endif 715 716 int 717 sys_madvise(struct thread *td, struct madvise_args *uap) 718 { 719 720 return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav)); 721 } 722 723 int 724 kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav) 725 { 726 vm_map_t map; 727 vm_offset_t addr, end, start; 728 int flags; 729 730 /* 731 * Check for our special 
case, advising the swap pager we are 732 * "immortal." 733 */ 734 if (behav == MADV_PROTECT) { 735 flags = PPROT_SET; 736 return (kern_procctl(td, P_PID, td->td_proc->p_pid, 737 PROC_SPROTECT, &flags)); 738 } 739 740 /* 741 * Check for illegal addresses. Watch out for address wrap... Note 742 * that VM_*_ADDRESS are not constants due to casts (argh). 743 */ 744 map = &td->td_proc->p_vmspace->vm_map; 745 addr = addr0; 746 if (addr < vm_map_min(map) || addr + len > vm_map_max(map)) 747 return (EINVAL); 748 if ((addr + len) < addr) 749 return (EINVAL); 750 751 /* 752 * Since this routine is only advisory, we default to conservative 753 * behavior. 754 */ 755 start = trunc_page(addr); 756 end = round_page(addr + len); 757 758 /* 759 * vm_map_madvise() checks for illegal values of behav. 760 */ 761 return (vm_map_madvise(map, start, end, behav)); 762 } 763 764 #ifndef _SYS_SYSPROTO_H_ 765 struct mincore_args { 766 const void *addr; 767 size_t len; 768 char *vec; 769 }; 770 #endif 771 772 int 773 sys_mincore(struct thread *td, struct mincore_args *uap) 774 { 775 776 return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec)); 777 } 778 779 int 780 kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec) 781 { 782 vm_offset_t addr, first_addr; 783 vm_offset_t end, cend; 784 pmap_t pmap; 785 vm_map_t map; 786 int error = 0; 787 int vecindex, lastvecindex; 788 vm_map_entry_t current; 789 vm_map_entry_t entry; 790 vm_object_t object; 791 vm_paddr_t locked_pa; 792 vm_page_t m; 793 vm_pindex_t pindex; 794 int mincoreinfo; 795 unsigned int timestamp; 796 boolean_t locked; 797 798 /* 799 * Make sure that the addresses presented are valid for user 800 * mode. 
801 */ 802 first_addr = addr = trunc_page(addr0); 803 end = addr + (vm_size_t)round_page(len); 804 map = &td->td_proc->p_vmspace->vm_map; 805 if (end > vm_map_max(map) || end < addr) 806 return (ENOMEM); 807 808 pmap = vmspace_pmap(td->td_proc->p_vmspace); 809 810 vm_map_lock_read(map); 811 RestartScan: 812 timestamp = map->timestamp; 813 814 if (!vm_map_lookup_entry(map, addr, &entry)) { 815 vm_map_unlock_read(map); 816 return (ENOMEM); 817 } 818 819 /* 820 * Do this on a map entry basis so that if the pages are not 821 * in the current processes address space, we can easily look 822 * up the pages elsewhere. 823 */ 824 lastvecindex = -1; 825 for (current = entry; current->start < end; current = current->next) { 826 827 /* 828 * check for contiguity 829 */ 830 if (current->end < end && current->next->start > current->end) { 831 vm_map_unlock_read(map); 832 return (ENOMEM); 833 } 834 835 /* 836 * ignore submaps (for now) or null objects 837 */ 838 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || 839 current->object.vm_object == NULL) 840 continue; 841 842 /* 843 * limit this scan to the current map entry and the 844 * limits for the mincore call 845 */ 846 if (addr < current->start) 847 addr = current->start; 848 cend = current->end; 849 if (cend > end) 850 cend = end; 851 852 /* 853 * scan this entry one page at a time 854 */ 855 while (addr < cend) { 856 /* 857 * Check pmap first, it is likely faster, also 858 * it can provide info as to whether we are the 859 * one referencing or modifying the page. 860 */ 861 object = NULL; 862 locked_pa = 0; 863 retry: 864 m = NULL; 865 mincoreinfo = pmap_mincore(pmap, addr, &locked_pa); 866 if (mincore_mapped) { 867 /* 868 * We only care about this pmap's 869 * mapping of the page, if any. 870 */ 871 if (locked_pa != 0) { 872 vm_page_unlock(PHYS_TO_VM_PAGE( 873 locked_pa)); 874 } 875 } else if (locked_pa != 0) { 876 /* 877 * The page is mapped by this process but not 878 * both accessed and modified. It is also 879 * managed. 
Acquire the object lock so that 880 * other mappings might be examined. 881 */ 882 m = PHYS_TO_VM_PAGE(locked_pa); 883 if (m->object != object) { 884 if (object != NULL) 885 VM_OBJECT_WUNLOCK(object); 886 object = m->object; 887 locked = VM_OBJECT_TRYWLOCK(object); 888 vm_page_unlock(m); 889 if (!locked) { 890 VM_OBJECT_WLOCK(object); 891 vm_page_lock(m); 892 goto retry; 893 } 894 } else 895 vm_page_unlock(m); 896 KASSERT(m->valid == VM_PAGE_BITS_ALL, 897 ("mincore: page %p is mapped but invalid", 898 m)); 899 } else if (mincoreinfo == 0) { 900 /* 901 * The page is not mapped by this process. If 902 * the object implements managed pages, then 903 * determine if the page is resident so that 904 * the mappings might be examined. 905 */ 906 if (current->object.vm_object != object) { 907 if (object != NULL) 908 VM_OBJECT_WUNLOCK(object); 909 object = current->object.vm_object; 910 VM_OBJECT_WLOCK(object); 911 } 912 if (object->type == OBJT_DEFAULT || 913 object->type == OBJT_SWAP || 914 object->type == OBJT_VNODE) { 915 pindex = OFF_TO_IDX(current->offset + 916 (addr - current->start)); 917 m = vm_page_lookup(object, pindex); 918 if (m != NULL && m->valid == 0) 919 m = NULL; 920 if (m != NULL) 921 mincoreinfo = MINCORE_INCORE; 922 } 923 } 924 if (m != NULL) { 925 /* Examine other mappings to the page. */ 926 if (m->dirty == 0 && pmap_is_modified(m)) 927 vm_page_dirty(m); 928 if (m->dirty != 0) 929 mincoreinfo |= MINCORE_MODIFIED_OTHER; 930 /* 931 * The first test for PGA_REFERENCED is an 932 * optimization. The second test is 933 * required because a concurrent pmap 934 * operation could clear the last reference 935 * and set PGA_REFERENCED before the call to 936 * pmap_is_referenced(). 937 */ 938 if ((m->aflags & PGA_REFERENCED) != 0 || 939 pmap_is_referenced(m) || 940 (m->aflags & PGA_REFERENCED) != 0) 941 mincoreinfo |= MINCORE_REFERENCED_OTHER; 942 } 943 if (object != NULL) 944 VM_OBJECT_WUNLOCK(object); 945 946 /* 947 * subyte may page fault. 
In case it needs to modify 948 * the map, we release the lock. 949 */ 950 vm_map_unlock_read(map); 951 952 /* 953 * calculate index into user supplied byte vector 954 */ 955 vecindex = atop(addr - first_addr); 956 957 /* 958 * If we have skipped map entries, we need to make sure that 959 * the byte vector is zeroed for those skipped entries. 960 */ 961 while ((lastvecindex + 1) < vecindex) { 962 ++lastvecindex; 963 error = subyte(vec + lastvecindex, 0); 964 if (error) { 965 error = EFAULT; 966 goto done2; 967 } 968 } 969 970 /* 971 * Pass the page information to the user 972 */ 973 error = subyte(vec + vecindex, mincoreinfo); 974 if (error) { 975 error = EFAULT; 976 goto done2; 977 } 978 979 /* 980 * If the map has changed, due to the subyte, the previous 981 * output may be invalid. 982 */ 983 vm_map_lock_read(map); 984 if (timestamp != map->timestamp) 985 goto RestartScan; 986 987 lastvecindex = vecindex; 988 addr += PAGE_SIZE; 989 } 990 } 991 992 /* 993 * subyte may page fault. In case it needs to modify 994 * the map, we release the lock. 995 */ 996 vm_map_unlock_read(map); 997 998 /* 999 * Zero the last entries in the byte vector. 1000 */ 1001 vecindex = atop(end - first_addr); 1002 while ((lastvecindex + 1) < vecindex) { 1003 ++lastvecindex; 1004 error = subyte(vec + lastvecindex, 0); 1005 if (error) { 1006 error = EFAULT; 1007 goto done2; 1008 } 1009 } 1010 1011 /* 1012 * If the map has changed, due to the subyte, the previous 1013 * output may be invalid. 
1014 */ 1015 vm_map_lock_read(map); 1016 if (timestamp != map->timestamp) 1017 goto RestartScan; 1018 vm_map_unlock_read(map); 1019 done2: 1020 return (error); 1021 } 1022 1023 #ifndef _SYS_SYSPROTO_H_ 1024 struct mlock_args { 1025 const void *addr; 1026 size_t len; 1027 }; 1028 #endif 1029 int 1030 sys_mlock(struct thread *td, struct mlock_args *uap) 1031 { 1032 1033 return (kern_mlock(td->td_proc, td->td_ucred, 1034 __DECONST(uintptr_t, uap->addr), uap->len)); 1035 } 1036 1037 int 1038 kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len) 1039 { 1040 vm_offset_t addr, end, last, start; 1041 vm_size_t npages, size; 1042 vm_map_t map; 1043 unsigned long nsize; 1044 int error; 1045 1046 error = priv_check_cred(cred, PRIV_VM_MLOCK); 1047 if (error) 1048 return (error); 1049 addr = addr0; 1050 size = len; 1051 last = addr + size; 1052 start = trunc_page(addr); 1053 end = round_page(last); 1054 if (last < addr || end < addr) 1055 return (EINVAL); 1056 npages = atop(end - start); 1057 if (npages > vm_page_max_user_wired) 1058 return (ENOMEM); 1059 map = &proc->p_vmspace->vm_map; 1060 PROC_LOCK(proc); 1061 nsize = ptoa(npages + pmap_wired_count(map->pmap)); 1062 if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) { 1063 PROC_UNLOCK(proc); 1064 return (ENOMEM); 1065 } 1066 PROC_UNLOCK(proc); 1067 #ifdef RACCT 1068 if (racct_enable) { 1069 PROC_LOCK(proc); 1070 error = racct_set(proc, RACCT_MEMLOCK, nsize); 1071 PROC_UNLOCK(proc); 1072 if (error != 0) 1073 return (ENOMEM); 1074 } 1075 #endif 1076 error = vm_map_wire(map, start, end, 1077 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1078 #ifdef RACCT 1079 if (racct_enable && error != KERN_SUCCESS) { 1080 PROC_LOCK(proc); 1081 racct_set(proc, RACCT_MEMLOCK, 1082 ptoa(pmap_wired_count(map->pmap))); 1083 PROC_UNLOCK(proc); 1084 } 1085 #endif 1086 return (error == KERN_SUCCESS ? 
0 : ENOMEM); 1087 } 1088 1089 #ifndef _SYS_SYSPROTO_H_ 1090 struct mlockall_args { 1091 int how; 1092 }; 1093 #endif 1094 1095 int 1096 sys_mlockall(struct thread *td, struct mlockall_args *uap) 1097 { 1098 vm_map_t map; 1099 int error; 1100 1101 map = &td->td_proc->p_vmspace->vm_map; 1102 error = priv_check(td, PRIV_VM_MLOCK); 1103 if (error) 1104 return (error); 1105 1106 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) 1107 return (EINVAL); 1108 1109 /* 1110 * If wiring all pages in the process would cause it to exceed 1111 * a hard resource limit, return ENOMEM. 1112 */ 1113 if (!old_mlock && uap->how & MCL_CURRENT) { 1114 if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) 1115 return (ENOMEM); 1116 } 1117 #ifdef RACCT 1118 if (racct_enable) { 1119 PROC_LOCK(td->td_proc); 1120 error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size); 1121 PROC_UNLOCK(td->td_proc); 1122 if (error != 0) 1123 return (ENOMEM); 1124 } 1125 #endif 1126 1127 if (uap->how & MCL_FUTURE) { 1128 vm_map_lock(map); 1129 vm_map_modflags(map, MAP_WIREFUTURE, 0); 1130 vm_map_unlock(map); 1131 error = 0; 1132 } 1133 1134 if (uap->how & MCL_CURRENT) { 1135 /* 1136 * P1003.1-2001 mandates that all currently mapped pages 1137 * will be memory resident and locked (wired) upon return 1138 * from mlockall(). vm_map_wire() will wire pages, by 1139 * calling vm_fault_wire() for each page in the region. 
1140 */ 1141 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), 1142 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1143 if (error == KERN_SUCCESS) 1144 error = 0; 1145 else if (error == KERN_RESOURCE_SHORTAGE) 1146 error = ENOMEM; 1147 else 1148 error = EAGAIN; 1149 } 1150 #ifdef RACCT 1151 if (racct_enable && error != KERN_SUCCESS) { 1152 PROC_LOCK(td->td_proc); 1153 racct_set(td->td_proc, RACCT_MEMLOCK, 1154 ptoa(pmap_wired_count(map->pmap))); 1155 PROC_UNLOCK(td->td_proc); 1156 } 1157 #endif 1158 1159 return (error); 1160 } 1161 1162 #ifndef _SYS_SYSPROTO_H_ 1163 struct munlockall_args { 1164 register_t dummy; 1165 }; 1166 #endif 1167 1168 int 1169 sys_munlockall(struct thread *td, struct munlockall_args *uap) 1170 { 1171 vm_map_t map; 1172 int error; 1173 1174 map = &td->td_proc->p_vmspace->vm_map; 1175 error = priv_check(td, PRIV_VM_MUNLOCK); 1176 if (error) 1177 return (error); 1178 1179 /* Clear the MAP_WIREFUTURE flag from this vm_map. */ 1180 vm_map_lock(map); 1181 vm_map_modflags(map, 0, MAP_WIREFUTURE); 1182 vm_map_unlock(map); 1183 1184 /* Forcibly unwire all pages. 
*/ 1185 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), 1186 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1187 #ifdef RACCT 1188 if (racct_enable && error == KERN_SUCCESS) { 1189 PROC_LOCK(td->td_proc); 1190 racct_set(td->td_proc, RACCT_MEMLOCK, 0); 1191 PROC_UNLOCK(td->td_proc); 1192 } 1193 #endif 1194 1195 return (error); 1196 } 1197 1198 #ifndef _SYS_SYSPROTO_H_ 1199 struct munlock_args { 1200 const void *addr; 1201 size_t len; 1202 }; 1203 #endif 1204 int 1205 sys_munlock(struct thread *td, struct munlock_args *uap) 1206 { 1207 1208 return (kern_munlock(td, (uintptr_t)uap->addr, uap->len)); 1209 } 1210 1211 int 1212 kern_munlock(struct thread *td, uintptr_t addr0, size_t size) 1213 { 1214 vm_offset_t addr, end, last, start; 1215 #ifdef RACCT 1216 vm_map_t map; 1217 #endif 1218 int error; 1219 1220 error = priv_check(td, PRIV_VM_MUNLOCK); 1221 if (error) 1222 return (error); 1223 addr = addr0; 1224 last = addr + size; 1225 start = trunc_page(addr); 1226 end = round_page(last); 1227 if (last < addr || end < addr) 1228 return (EINVAL); 1229 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, 1230 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1231 #ifdef RACCT 1232 if (racct_enable && error == KERN_SUCCESS) { 1233 PROC_LOCK(td->td_proc); 1234 map = &td->td_proc->p_vmspace->vm_map; 1235 racct_set(td->td_proc, RACCT_MEMLOCK, 1236 ptoa(pmap_wired_count(map->pmap))); 1237 PROC_UNLOCK(td->td_proc); 1238 } 1239 #endif 1240 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1241 } 1242 1243 /* 1244 * vm_mmap_vnode() 1245 * 1246 * Helper function for vm_mmap. Perform sanity check specific for mmap 1247 * operations on vnodes. 
1248 */ 1249 int 1250 vm_mmap_vnode(struct thread *td, vm_size_t objsize, 1251 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, 1252 struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp, 1253 boolean_t *writecounted) 1254 { 1255 struct vattr va; 1256 vm_object_t obj; 1257 vm_ooffset_t foff; 1258 struct ucred *cred; 1259 int error, flags; 1260 bool writex; 1261 1262 cred = td->td_ucred; 1263 writex = (*maxprotp & VM_PROT_WRITE) != 0 && 1264 (*flagsp & MAP_SHARED) != 0; 1265 if ((error = vget(vp, LK_SHARED, td)) != 0) 1266 return (error); 1267 AUDIT_ARG_VNODE1(vp); 1268 foff = *foffp; 1269 flags = *flagsp; 1270 obj = vp->v_object; 1271 if (vp->v_type == VREG) { 1272 /* 1273 * Get the proper underlying object 1274 */ 1275 if (obj == NULL) { 1276 error = EINVAL; 1277 goto done; 1278 } 1279 if (obj->type == OBJT_VNODE && obj->handle != vp) { 1280 vput(vp); 1281 vp = (struct vnode *)obj->handle; 1282 /* 1283 * Bypass filesystems obey the mpsafety of the 1284 * underlying fs. Tmpfs never bypasses. 1285 */ 1286 error = vget(vp, LK_SHARED, td); 1287 if (error != 0) 1288 return (error); 1289 } 1290 if (writex) { 1291 *writecounted = TRUE; 1292 vm_pager_update_writecount(obj, 0, objsize); 1293 } 1294 } else { 1295 error = EINVAL; 1296 goto done; 1297 } 1298 if ((error = VOP_GETATTR(vp, &va, cred))) 1299 goto done; 1300 #ifdef MAC 1301 /* This relies on VM_PROT_* matching PROT_*. */ 1302 error = mac_vnode_check_mmap(cred, vp, (int)prot, flags); 1303 if (error != 0) 1304 goto done; 1305 #endif 1306 if ((flags & MAP_SHARED) != 0) { 1307 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { 1308 if (prot & VM_PROT_WRITE) { 1309 error = EPERM; 1310 goto done; 1311 } 1312 *maxprotp &= ~VM_PROT_WRITE; 1313 } 1314 } 1315 /* 1316 * If it is a regular file without any references 1317 * we do not need to sync it. 1318 * Adjust object size to be the size of actual file. 
1319 */ 1320 objsize = round_page(va.va_size); 1321 if (va.va_nlink == 0) 1322 flags |= MAP_NOSYNC; 1323 if (obj->type == OBJT_VNODE) { 1324 obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, 1325 cred); 1326 if (obj == NULL) { 1327 error = ENOMEM; 1328 goto done; 1329 } 1330 } else { 1331 KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP, 1332 ("wrong object type")); 1333 VM_OBJECT_WLOCK(obj); 1334 vm_object_reference_locked(obj); 1335 #if VM_NRESERVLEVEL > 0 1336 vm_object_color(obj, 0); 1337 #endif 1338 VM_OBJECT_WUNLOCK(obj); 1339 } 1340 *objp = obj; 1341 *flagsp = flags; 1342 1343 vfs_mark_atime(vp, cred); 1344 1345 done: 1346 if (error != 0 && *writecounted) { 1347 *writecounted = FALSE; 1348 vm_pager_update_writecount(obj, objsize, 0); 1349 } 1350 vput(vp); 1351 return (error); 1352 } 1353 1354 /* 1355 * vm_mmap_cdev() 1356 * 1357 * Helper function for vm_mmap. Perform sanity check specific for mmap 1358 * operations on cdevs. 1359 */ 1360 int 1361 vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot, 1362 vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw, 1363 vm_ooffset_t *foff, vm_object_t *objp) 1364 { 1365 vm_object_t obj; 1366 int error, flags; 1367 1368 flags = *flagsp; 1369 1370 if (dsw->d_flags & D_MMAP_ANON) { 1371 *objp = NULL; 1372 *foff = 0; 1373 *maxprotp = VM_PROT_ALL; 1374 *flagsp |= MAP_ANON; 1375 return (0); 1376 } 1377 /* 1378 * cdevs do not provide private mappings of any kind. 1379 */ 1380 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1381 (prot & VM_PROT_WRITE) != 0) 1382 return (EACCES); 1383 if (flags & (MAP_PRIVATE|MAP_COPY)) 1384 return (EINVAL); 1385 /* 1386 * Force device mappings to be shared. 1387 */ 1388 flags |= MAP_SHARED; 1389 #ifdef MAC_XXX 1390 error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot); 1391 if (error != 0) 1392 return (error); 1393 #endif 1394 /* 1395 * First, try d_mmap_single(). 
If that is not implemented 1396 * (returns ENODEV), fall back to using the device pager. 1397 * Note that d_mmap_single() must return a reference to the 1398 * object (it needs to bump the reference count of the object 1399 * it returns somehow). 1400 * 1401 * XXX assumes VM_PROT_* == PROT_* 1402 */ 1403 error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); 1404 if (error != ENODEV) 1405 return (error); 1406 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, 1407 td->td_ucred); 1408 if (obj == NULL) 1409 return (EINVAL); 1410 *objp = obj; 1411 *flagsp = flags; 1412 return (0); 1413 } 1414 1415 /* 1416 * vm_mmap() 1417 * 1418 * Internal version of mmap used by exec, sys5 shared memory, and 1419 * various device drivers. Handle is either a vnode pointer, a 1420 * character device, or NULL for MAP_ANON. 1421 */ 1422 int 1423 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1424 vm_prot_t maxprot, int flags, 1425 objtype_t handle_type, void *handle, 1426 vm_ooffset_t foff) 1427 { 1428 vm_object_t object; 1429 struct thread *td = curthread; 1430 int error; 1431 boolean_t writecounted; 1432 1433 if (size == 0) 1434 return (EINVAL); 1435 1436 size = round_page(size); 1437 object = NULL; 1438 writecounted = FALSE; 1439 1440 /* 1441 * Lookup/allocate object. 
1442 */ 1443 switch (handle_type) { 1444 case OBJT_DEVICE: { 1445 struct cdevsw *dsw; 1446 struct cdev *cdev; 1447 int ref; 1448 1449 cdev = handle; 1450 dsw = dev_refthread(cdev, &ref); 1451 if (dsw == NULL) 1452 return (ENXIO); 1453 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev, 1454 dsw, &foff, &object); 1455 dev_relthread(cdev, ref); 1456 break; 1457 } 1458 case OBJT_VNODE: 1459 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, 1460 handle, &foff, &object, &writecounted); 1461 break; 1462 case OBJT_DEFAULT: 1463 if (handle == NULL) { 1464 error = 0; 1465 break; 1466 } 1467 /* FALLTHROUGH */ 1468 default: 1469 error = EINVAL; 1470 break; 1471 } 1472 if (error) 1473 return (error); 1474 1475 error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, 1476 foff, writecounted, td); 1477 if (error != 0 && object != NULL) { 1478 /* 1479 * If this mapping was accounted for in the vnode's 1480 * writecount, then undo that now. 1481 */ 1482 if (writecounted) 1483 vm_pager_release_writecount(object, 0, size); 1484 vm_object_deallocate(object); 1485 } 1486 return (error); 1487 } 1488 1489 /* 1490 * Internal version of mmap that maps a specific VM object into an 1491 * map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap. 
1492 */ 1493 int 1494 vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1495 vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff, 1496 boolean_t writecounted, struct thread *td) 1497 { 1498 boolean_t curmap, fitit; 1499 vm_offset_t max_addr; 1500 int docow, error, findspace, rv; 1501 1502 curmap = map == &td->td_proc->p_vmspace->vm_map; 1503 if (curmap) { 1504 RACCT_PROC_LOCK(td->td_proc); 1505 if (map->size + size > lim_cur(td, RLIMIT_VMEM)) { 1506 RACCT_PROC_UNLOCK(td->td_proc); 1507 return (ENOMEM); 1508 } 1509 if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { 1510 RACCT_PROC_UNLOCK(td->td_proc); 1511 return (ENOMEM); 1512 } 1513 if (!old_mlock && map->flags & MAP_WIREFUTURE) { 1514 if (ptoa(pmap_wired_count(map->pmap)) + size > 1515 lim_cur(td, RLIMIT_MEMLOCK)) { 1516 racct_set_force(td->td_proc, RACCT_VMEM, 1517 map->size); 1518 RACCT_PROC_UNLOCK(td->td_proc); 1519 return (ENOMEM); 1520 } 1521 error = racct_set(td->td_proc, RACCT_MEMLOCK, 1522 ptoa(pmap_wired_count(map->pmap)) + size); 1523 if (error != 0) { 1524 racct_set_force(td->td_proc, RACCT_VMEM, 1525 map->size); 1526 RACCT_PROC_UNLOCK(td->td_proc); 1527 return (error); 1528 } 1529 } 1530 RACCT_PROC_UNLOCK(td->td_proc); 1531 } 1532 1533 /* 1534 * We currently can only deal with page aligned file offsets. 1535 * The mmap() system call already enforces this by subtracting 1536 * the page offset from the file offset, but checking here 1537 * catches errors in device drivers (e.g. d_single_mmap() 1538 * callbacks) and other internal mapping requests (such as in 1539 * exec). 
1540 */ 1541 if (foff & PAGE_MASK) 1542 return (EINVAL); 1543 1544 if ((flags & MAP_FIXED) == 0) { 1545 fitit = TRUE; 1546 *addr = round_page(*addr); 1547 } else { 1548 if (*addr != trunc_page(*addr)) 1549 return (EINVAL); 1550 fitit = FALSE; 1551 } 1552 1553 if (flags & MAP_ANON) { 1554 if (object != NULL || foff != 0) 1555 return (EINVAL); 1556 docow = 0; 1557 } else if (flags & MAP_PREFAULT_READ) 1558 docow = MAP_PREFAULT; 1559 else 1560 docow = MAP_PREFAULT_PARTIAL; 1561 1562 if ((flags & (MAP_ANON|MAP_SHARED)) == 0) 1563 docow |= MAP_COPY_ON_WRITE; 1564 if (flags & MAP_NOSYNC) 1565 docow |= MAP_DISABLE_SYNCER; 1566 if (flags & MAP_NOCORE) 1567 docow |= MAP_DISABLE_COREDUMP; 1568 /* Shared memory is also shared with children. */ 1569 if (flags & MAP_SHARED) 1570 docow |= MAP_INHERIT_SHARE; 1571 if (writecounted) 1572 docow |= MAP_WRITECOUNT; 1573 if (flags & MAP_STACK) { 1574 if (object != NULL) 1575 return (EINVAL); 1576 docow |= MAP_STACK_GROWS_DOWN; 1577 } 1578 if ((flags & MAP_EXCL) != 0) 1579 docow |= MAP_CHECK_EXCL; 1580 if ((flags & MAP_GUARD) != 0) 1581 docow |= MAP_CREATE_GUARD; 1582 1583 if (fitit) { 1584 if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER) 1585 findspace = VMFS_SUPER_SPACE; 1586 else if ((flags & MAP_ALIGNMENT_MASK) != 0) 1587 findspace = VMFS_ALIGNED_SPACE(flags >> 1588 MAP_ALIGNMENT_SHIFT); 1589 else 1590 findspace = VMFS_OPTIMAL_SPACE; 1591 max_addr = 0; 1592 #ifdef MAP_32BIT 1593 if ((flags & MAP_32BIT) != 0) 1594 max_addr = MAP_32BIT_MAX_ADDR; 1595 #endif 1596 if (curmap) { 1597 rv = vm_map_find_min(map, object, foff, addr, size, 1598 round_page((vm_offset_t)td->td_proc->p_vmspace-> 1599 vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr, 1600 findspace, prot, maxprot, docow); 1601 } else { 1602 rv = vm_map_find(map, object, foff, addr, size, 1603 max_addr, findspace, prot, maxprot, docow); 1604 } 1605 } else { 1606 rv = vm_map_fixed(map, object, foff, *addr, size, 1607 prot, maxprot, docow); 1608 } 1609 1610 if (rv == 
KERN_SUCCESS) { 1611 /* 1612 * If the process has requested that all future mappings 1613 * be wired, then heed this. 1614 */ 1615 if ((map->flags & MAP_WIREFUTURE) != 0) { 1616 vm_map_lock(map); 1617 if ((map->flags & MAP_WIREFUTURE) != 0) 1618 (void)vm_map_wire_locked(map, *addr, 1619 *addr + size, VM_MAP_WIRE_USER | 1620 ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK : 1621 VM_MAP_WIRE_NOHOLES)); 1622 vm_map_unlock(map); 1623 } 1624 } 1625 return (vm_mmap_to_errno(rv)); 1626 } 1627 1628 /* 1629 * Translate a Mach VM return code to zero on success or the appropriate errno 1630 * on failure. 1631 */ 1632 int 1633 vm_mmap_to_errno(int rv) 1634 { 1635 1636 switch (rv) { 1637 case KERN_SUCCESS: 1638 return (0); 1639 case KERN_INVALID_ADDRESS: 1640 case KERN_NO_SPACE: 1641 return (ENOMEM); 1642 case KERN_PROTECTION_FAILURE: 1643 return (EACCES); 1644 default: 1645 return (EINVAL); 1646 } 1647 } 1648