/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hwpmc_hooks.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/elf.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
#include <machine/md_var.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
static int mincore_mapped = 1;
SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
    "mincore reports mappings, not residency");
static int imply_prot_max = 0;
SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
    "Imply maximum page permissions in mmap() when none are specified");

#ifdef MAP_32BIT
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

int
sys_sbrk(struct thread *td, struct sbrk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

int
sys_sstk(struct thread *td, struct sstk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
int
ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif				/* COMPAT_43 */

/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

int
sys_mmap(struct thread *td, struct mmap_args *uap)
{

	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
	    uap->flags, uap->fd, uap->pos));
}

int
kern_mmap_maxprot(struct proc *p, int prot)
{

	if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 ||
	    (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0)
		return (_PROT_ALL);
	if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) &&
	    prot != PROT_NONE)
		return (prot);
	return (_PROT_ALL);
}

int
kern_mmap(struct thread *td, uintptr_t addr0, size_t len, int prot, int flags,
    int fd, off_t pos)
{
	struct mmap_req mr = {
		.mr_hint = addr0,
		.mr_len = len,
		.mr_prot = prot,
		.mr_flags = flags,
		.mr_fd = fd,
		.mr_pos = pos
	};

	return (kern_mmap_req(td, &mr));
}

int
kern_mmap_req(struct thread *td, const struct mmap_req *mrp)
{
	struct vmspace *vms;
	struct file *fp;
	struct proc *p;
	off_t pos;
	vm_offset_t addr;
	vm_size_t len, pageoff, size;
	vm_prot_t cap_maxprot;
	int align, error, fd, flags, max_prot, prot;
	cap_rights_t rights;
	mmap_check_fp_fn check_fp_fn;

	addr = mrp->mr_hint;
	len = mrp->mr_len;
	prot = mrp->mr_prot;
	flags = mrp->mr_flags;
	fd = mrp->mr_fd;
	pos = mrp->mr_pos;
	check_fp_fn = mrp->mr_check_fp_fn;

	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
		return (EINVAL);
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	if (max_prot != 0 && (max_prot & prot) != prot)
		return (ENOTSUP);

	p = td->td_proc;

	/*
	 * Always honor PROT_MAX if set.  If not, default to all
	 * permissions unless we're implying maximum permissions.
	 */
	if (max_prot == 0)
		max_prot = kern_mmap_maxprot(p, prot);

	vms = p->p_vmspace;
	fp = NULL;
	AUDIT_ARG_FD(fd);

	/*
	 * Ignore old flags that used to be defined but did not do anything.
	 */
	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}
	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
	    MAP_PREFAULT_READ | MAP_GUARD |
#ifdef MAP_32BIT
	    MAP_32BIT |
#endif
	    MAP_ALIGNMENT_MASK)) != 0)
		return (EINVAL);
	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
		return (EINVAL);
	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
		return (EINVAL);
	if (prot != PROT_NONE &&
	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
		return (EINVAL);
	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
	    pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
#ifdef MAP_32BIT
	    MAP_32BIT |
#endif
	    MAP_ALIGNMENT_MASK)) != 0))
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Compute size from len by rounding (on both ends). */
	size = len + pageoff;			/* low end... */
	size = round_page(size);		/* hi end */
	/* Check for rounding up to zero. */
	if (len > size)
		return (ENOMEM);

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
#ifdef MAP_32BIT
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
#endif
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td, RLIMIT_DATA));
	}
	if (len == 0) {
		/*
		 * Return success without mapping anything for old
		 * binaries that request a page-aligned mapping of
		 * length 0.  For modern binaries, this function
		 * returns an error earlier.
		 */
		error = 0;
	} else if ((flags & MAP_GUARD) != 0) {
		error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
		    VM_PROT_NONE, flags, NULL, pos, FALSE, td);
	} else if ((flags & MAP_ANON) != 0) {
		/*
		 * Mapping blank space is trivial.
		 *
		 * This relies on VM_PROT_* matching PROT_*.
		 */
		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
		    max_prot, flags, NULL, pos, FALSE, td);
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init_one(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set_one(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set_one(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set_one(&rights, CAP_MMAP_X);
		error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
		    p->p_osrel >= P_OSREL_MAP_FSTRICT) {
			error = EINVAL;
			goto done;
		}
		if (check_fp_fn != NULL) {
			error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
			    flags);
			if (error != 0)
				goto done;
		}
		/* This relies on VM_PROT_* matching PROT_*. */
		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
		    max_prot & cap_maxprot, flags, pos, td);
	}

	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{

	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
	    uap->flags, uap->fd, uap->pos));
}
#endif

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(struct thread *td, struct ommap_args *uap)
{
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};
	int flags, prot;

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	prot = cvtbsdprot[uap->prot & 0x7];
#if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    prot != 0)
		prot |= PROT_EXEC;
#endif
	flags = 0;
	if (uap->flags & OMAP_ANON)
		flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		flags |= MAP_SHARED;
	else
		flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		flags |= MAP_FIXED;
	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot, flags,
	    uap->fd, uap->pos));
}
#endif				/* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
int
sys_msync(struct thread *td, struct msync_args *uap)
{

	return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
}

int
kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (ENOMEM);
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
int
sys_munmap(struct thread *td, struct munmap_args *uap)
{

	return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
	bool pmc_handled;
#endif
	vm_offset_t addr;
	vm_size_t pageoff;
	vm_map_t map;

	if (size == 0)
		return (EINVAL);

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	pmc_handled = false;
	if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
		pmc_handled = true;
		/*
		 * Inform hwpmc if the address range being unmapped contains
		 * an executable region.
		 */
		pkm.pm_address = (uintptr_t) NULL;
		if (vm_map_lookup_entry(map, addr, &entry)) {
			for (; entry->start < addr + size;
			    entry = vm_map_entry_succ(entry)) {
				if (vm_map_check_protection(map, entry->start,
				    entry->end, VM_PROT_EXECUTE) == TRUE) {
					pkm.pm_address = (uintptr_t) addr;
					pkm.pm_size = (size_t) size;
					break;
				}
			}
		}
	}
#endif
	vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
	if (__predict_false(pmc_handled)) {
		/* downgrade the lock to prevent a LOR with the pmc-sx lock */
		vm_map_lock_downgrade(map);
		if (pkm.pm_address != (uintptr_t) NULL)
			PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
		vm_map_unlock_read(map);
	} else
#endif
		vm_map_unlock(map);

	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
int
sys_mprotect(struct thread *td, struct mprotect_args *uap)
{

	return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot));
}

int
kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	int vm_error, max_prot;

	addr = addr0;
	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
		return (EINVAL);
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
#ifdef COMPAT_FREEBSD32
	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
		if (((addr + size) & 0xffffffff) < addr)
			return (EINVAL);
	} else
#endif
	if (addr + size < addr)
		return (EINVAL);

	vm_error = KERN_SUCCESS;
	if (max_prot != 0) {
		if ((max_prot & prot) != prot)
			return (ENOTSUP);
		vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
		    addr, addr + size, max_prot, TRUE);
	}
	if (vm_error == KERN_SUCCESS)
		vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
		    addr, addr + size, prot, FALSE);

	switch (vm_error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
int
sys_minherit(struct thread *td, struct minherit_args *uap)
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

int
sys_madvise(struct thread *td, struct madvise_args *uap)
{

	return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
}

int
kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
{
	vm_map_t map;
	vm_offset_t addr, end, start;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	addr = addr0;
	if (addr < vm_map_min(map) || addr + len > vm_map_max(map))
		return (EINVAL);
	if ((addr + len) < addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page(addr);
	end = round_page(addr + len);

	/*
	 * vm_map_madvise() checks for illegal values of behav.
	 */
	return (vm_map_madvise(map, start, end, behav));
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

int
sys_mincore(struct thread *td, struct mincore_args *uap)
{

	return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
}

int
kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
{
	pmap_t pmap;
	vm_map_t map;
	vm_map_entry_t current, entry;
	vm_object_t object;
	vm_offset_t addr, cend, end, first_addr;
	vm_paddr_t pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int error, lastvecindex, mincoreinfo, vecindex;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page(addr0);
	end = round_page(addr0 + len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	while (entry->start < end) {

		/*
		 * check for contiguity
		 */
		current = entry;
		entry = vm_map_entry_succ(current);
		if (current->end < end &&
		    entry->start > current->end) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		for (; addr < cend; addr += PAGE_SIZE) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			m = NULL;
			object = NULL;
retry:
			pa = 0;
			mincoreinfo = pmap_mincore(pmap, addr, &pa);
			if (mincore_mapped) {
				/*
				 * We only care about this pmap's
				 * mapping of the page, if any.
				 */
				;
			} else if (pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.  The page's
				 * identity may change at any point before its
				 * object lock is acquired, so re-validate if
				 * necessary.
				 */
				m = PHYS_TO_VM_PAGE(pa);
				while (object == NULL || m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = atomic_load_ptr(&m->object);
					if (object == NULL)
						goto retry;
					VM_OBJECT_WLOCK(object);
				}
				if (pa != pmap_extract(pmap, addr))
					goto retry;
				KASSERT(vm_page_all_valid(m),
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if (object->type == OBJT_DEFAULT ||
				    object->type == OBJT_SWAP ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m != NULL && vm_page_none_valid(m))
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				VM_OBJECT_ASSERT_WLOCKED(m->object);

				/* Examine other mappings of the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;

				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->a.flags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->a.flags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = atop(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = atop(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_mlock(struct thread *td, struct mlock_args *uap)
{

	return (kern_mlock(td->td_proc, td->td_ucred,
	    __DECONST(uintptr_t, uap->addr), uap->len));
}

int
kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_user_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(proc);
		error = racct_set(proc, RACCT_MEMLOCK, nsize);
		PROC_UNLOCK(proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

int
sys_mlockall(struct thread *td, struct mlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
			return (ENOMEM);
	}
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(td->td_proc);
		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
		PROC_UNLOCK(td->td_proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		if (error == KERN_SUCCESS)
			error = 0;
		else if (error == KERN_RESOURCE_SHORTAGE)
			error = ENOMEM;
		else
			error = EAGAIN;
	}
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

int
sys_munlockall(struct thread *td, struct munlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_munlock(struct thread *td, struct munlock_args *uap)
{

	return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
{
	vm_offset_t addr, end, last, start;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = addr0;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Performs the sanity checks specific to
 * mmap operations on vnodes.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_ooffset_t foff;
	struct ucred *cred;
	int error, flags;
	bool writex;

	cred = td->td_ucred;
	writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
	    (*flagsp & MAP_SHARED) != 0;
	if ((error = vget(vp, LK_SHARED, td)) != 0)
		return (error);
	AUDIT_ARG_VNODE1(vp);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, LK_SHARED, td);
			if (error != 0)
				return (error);
		}
		if (writex) {
			*writecounted = TRUE;
			vm_pager_update_writecount(obj, 0, objsize);
		}
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	/* This relies on VM_PROT_* matching PROT_*. */
	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & VM_PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of the actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE) {
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
		if (obj == NULL) {
			error = ENOMEM;
			goto done;
		}
	} else {
		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
		    ("wrong object type"));
		vm_object_reference(obj);
#if VM_NRESERVLEVEL > 0
		if ((obj->flags & OBJ_COLORED) == 0) {
			VM_OBJECT_WLOCK(obj);
			vm_object_color(obj, 0);
			VM_OBJECT_WUNLOCK(obj);
		}
#endif
	}
	*objp = obj;
	*flagsp = flags;

	VOP_MMAPPED(vp);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vm_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * Helper function for vm_mmap.  Performs the sanity checks specific to
 * mmap operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
    vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	int error, flags;

	flags = *flagsp;

	if (dsw->d_flags & D_MMAP_ANON) {
		*objp = NULL;
		*foff = 0;
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	if (flags & (MAP_PRIVATE|MAP_COPY))
		return (EINVAL);
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
	if (error != 0)
		return (error);
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap()
 *
 * Internal version of mmap used by exec, sys5 shared memory, and
 * various device drivers.  Handle is either a vnode pointer, a
 * character device, or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags,
    objtype_t handle_type, void *handle,
    vm_ooffset_t foff)
{
	vm_object_t object;
	struct thread *td = curthread;
	int error;
	boolean_t writecounted;

	if (size == 0)
		return (EINVAL);

	size = round_page(size);
	object = NULL;
	writecounted = FALSE;

	/*
	 * Lookup/allocate object.
	 */
	switch (handle_type) {
	case OBJT_DEVICE: {
		struct cdevsw *dsw;
		struct cdev *cdev;
		int ref;

		cdev = handle;
		dsw = dev_refthread(cdev, &ref);
		if (dsw == NULL)
			return (ENXIO);
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
		    dsw, &foff, &object);
		dev_relthread(cdev, ref);
		break;
	}
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);

	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
	    foff, writecounted, td);
	if (error != 0 && object != NULL) {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vm_pager_release_writecount(object, 0, size);
		vm_object_deallocate(object);
	}
	return (error);
}

/*
 * Internal version of mmap that maps a specific VM object into a
 * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
 */
int
vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
    boolean_t writecounted, struct thread *td)
{
	boolean_t curmap, fitit;
	vm_offset_t max_addr;
	int docow, error, findspace, rv;

	curmap = map == &td->td_proc->p_vmspace->vm_map;
	if (curmap) {
		RACCT_PROC_LOCK(td->td_proc);
		if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
			RACCT_PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
			RACCT_PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(pmap_wired_count(map->pmap)) + size >
			    lim_cur(td, RLIMIT_MEMLOCK)) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				RACCT_PROC_UNLOCK(td->td_proc);
				return (ENOMEM);
			}
			error = racct_set(td->td_proc, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + size);
			if (error != 0) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				RACCT_PROC_UNLOCK(td->td_proc);
				return (error);
			}
		}
		RACCT_PROC_UNLOCK(td->td_proc);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The mmap() system call already enforces this by subtracting
	 * the page offset from the file offset, but checking here
	 * catches errors in device drivers (e.g. d_mmap_single()
	 * callbacks) and other internal mapping requests (such as in
	 * exec).
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}

	if (flags & MAP_ANON) {
		if (object != NULL || foff != 0)
			return (EINVAL);
		docow = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_GROWS_DOWN;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;
	if ((flags & MAP_GUARD) != 0)
		docow |= MAP_CREATE_GUARD;

	if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		max_addr = 0;
#ifdef MAP_32BIT
		if ((flags & MAP_32BIT) != 0)
			max_addr = MAP_32BIT_MAX_ADDR;
#endif
		if (curmap) {
			rv = vm_map_find_min(map, object, foff, addr, size,
			    round_page((vm_offset_t)td->td_proc->p_vmspace->
			    vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr,
			    findspace, prot, maxprot, docow);
		} else {
			rv = vm_map_find(map, object, foff, addr, size,
			    max_addr, findspace, prot, maxprot, docow);
		}
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if ((map->flags & MAP_WIREFUTURE) != 0) {
			vm_map_lock(map);
			if ((map->flags & MAP_WIREFUTURE) != 0)
				(void)vm_map_wire_locked(map, *addr,
				    *addr + size, VM_MAP_WIRE_USER |
				    ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
				    VM_MAP_WIRE_NOHOLES));
			vm_map_unlock(map);
		}
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}