/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
#include "opt_hwpmc_hooks.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/elf.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
#include <machine/md_var.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
static int mincore_mapped = 1;
SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
    "mincore reports mappings, not residency");
static int imply_prot_max = 0;
SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
    "Imply maximum page protections in mmap() when none are specified");

_Static_assert(MAXPAGESIZES <= 4, "MINCORE_SUPER too narrow");

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

int
sys_sbrk(struct thread *td, struct sbrk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

int
sys_sstk(struct thread *td, struct sstk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
int
ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */

/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
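/*
 * Illustrative userland sketch (an assumption, not part of this file) of the
 * offset handling described above: a request whose file offset is not page
 * aligned is still accepted; the kernel maps from the containing page
 * boundary and returns a pointer advanced by the page offset, e.g.
 *
 *	char *p = mmap(NULL, 10, PROT_READ, MAP_SHARED, fd, 3);
 *
 * yields p pointing at byte 3 of the file, while the underlying mapping
 * covers whole pages starting at file offset 0.
 */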
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

int
sys_mmap(struct thread *td, struct mmap_args *uap)
{

	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = (uintptr_t)uap->addr,
		.mr_len = uap->len,
		.mr_prot = uap->prot,
		.mr_flags = uap->flags,
		.mr_fd = uap->fd,
		.mr_pos = uap->pos,
	    }));
}

int
kern_mmap_maxprot(struct proc *p, int prot)
{

	if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 ||
	    (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0)
		return (_PROT_ALL);
	if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) &&
	    prot != PROT_NONE)
		return (prot);
	return (_PROT_ALL);
}
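/*
 * Illustrative userland sketch (an assumption, not part of this file): a
 * caller may put an explicit upper bound on later protection upgrades by
 * encoding it with PROT_MAX(), e.g.
 *
 *	p = mmap(NULL, len, PROT_READ | PROT_MAX(PROT_READ | PROT_WRITE),
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *
 * allows a later mprotect(p, len, PROT_READ | PROT_WRITE) but not an upgrade
 * to PROT_EXEC.  When no PROT_MAX() bits are supplied, kern_mmap_maxprot()
 * above chooses the default: all protections, unless the process opted into
 * implied maximums (P2_PROTMAX_ENABLE) or vm.imply_prot_max is set, in which
 * case the requested protection also becomes the maximum.
 */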

int
kern_mmap(struct thread *td, const struct mmap_req *mrp)
{
	struct vmspace *vms;
	struct file *fp;
	struct proc *p;
	off_t pos;
	vm_offset_t addr, orig_addr;
	vm_size_t len, pageoff, size;
	vm_prot_t cap_maxprot;
	int align, error, fd, flags, max_prot, prot;
	cap_rights_t rights;
	mmap_check_fp_fn check_fp_fn;

	orig_addr = addr = mrp->mr_hint;
	len = mrp->mr_len;
	prot = mrp->mr_prot;
	flags = mrp->mr_flags;
	fd = mrp->mr_fd;
	pos = mrp->mr_pos;
	check_fp_fn = mrp->mr_check_fp_fn;

	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
		return (EINVAL);
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	if (max_prot != 0 && (max_prot & prot) != prot)
		return (ENOTSUP);

	p = td->td_proc;

	/*
	 * Always honor PROT_MAX if set.  If not, default to all
	 * permissions unless we're implying maximum permissions.
	 */
	if (max_prot == 0)
		max_prot = kern_mmap_maxprot(p, prot);

	vms = p->p_vmspace;
	fp = NULL;
	AUDIT_ARG_FD(fd);

	/*
	 * Ignore old flags that used to be defined but did not do anything.
	 */
	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}
	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
	    MAP_PREFAULT_READ | MAP_GUARD | MAP_32BIT | MAP_ALIGNMENT_MASK)) != 0)
		return (EINVAL);
	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
		return (EINVAL);
	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
		return (EINVAL);
	if (prot != PROT_NONE &&
	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
		return (EINVAL);
	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
	    pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
	    MAP_32BIT | MAP_ALIGNMENT_MASK)) != 0))
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Compute size from len by rounding (on both ends). */
	size = len + pageoff;		/* low end... */
	size = round_page(size);	/* hi end */
	/* Check for rounding up to zero. */
	if (len > size)
		return (ENOMEM);

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (!vm_map_range_valid(&vms->vm_map, addr, addr + size))
			return (EINVAL);
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * For anonymous mappings within the address space of the
		 * calling process, the absence of a hint is handled at a
		 * lower level in order to implement different clustering
		 * strategies for ASLR.
		 */
		if (((flags & MAP_ANON) == 0 && addr == 0) ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td, RLIMIT_DATA));
	}
	if (len == 0) {
		/*
		 * Return success without mapping anything for old
		 * binaries that request a page-aligned mapping of
		 * length 0.  For modern binaries, this function
		 * returns an error earlier.
		 */
		error = 0;
	} else if ((flags & MAP_GUARD) != 0) {
		error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
		    VM_PROT_NONE, flags, NULL, pos, FALSE, td);
	} else if ((flags & MAP_ANON) != 0) {
		/*
		 * Mapping blank space is trivial.
		 *
		 * This relies on VM_PROT_* matching PROT_*.
		 */
		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
		    max_prot, flags, NULL, pos, FALSE, td);
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init_one(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set_one(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set_one(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set_one(&rights, CAP_MMAP_X);
		error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
		    p->p_osrel >= P_OSREL_MAP_FSTRICT) {
			error = EINVAL;
			goto done;
		}
		if (check_fp_fn != NULL) {
			error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
			    flags);
			if (error != 0)
				goto done;
		}
		if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data))
			addr = orig_addr;
		/* This relies on VM_PROT_* matching PROT_*. */
		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
		    max_prot & cap_maxprot, flags, pos, td);
	}

	if (error == 0)
		td->td_retval[0] = addr + pageoff;
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = (uintptr_t)uap->addr,
		.mr_len = uap->len,
		.mr_prot = uap->prot,
		.mr_flags = uap->flags,
		.mr_fd = uap->fd,
		.mr_pos = uap->pos,
	    }));
}
#endif

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(struct thread *td, struct ommap_args *uap)
{
	return (kern_ommap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
	    uap->flags, uap->fd, uap->pos));
}

int
kern_ommap(struct thread *td, uintptr_t hint, int len, int oprot,
    int oflags, int fd, long pos)
{
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};
	int flags, prot;

	if (len < 0)
		return (EINVAL);

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	prot = cvtbsdprot[oprot & 0x7];
#if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    prot != 0)
		prot |= PROT_EXEC;
#endif
	flags = 0;
	if (oflags & OMAP_ANON)
		flags |= MAP_ANON;
	if (oflags & OMAP_COPY)
		flags |= MAP_COPY;
	if (oflags & OMAP_SHARED)
		flags |= MAP_SHARED;
	else
		flags |= MAP_PRIVATE;
	if (oflags & OMAP_FIXED)
		flags |= MAP_FIXED;
	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = hint,
		.mr_len = len,
		.mr_prot = prot,
		.mr_flags = flags,
		.mr_fd = fd,
		.mr_pos = pos,
	    }));
}
#endif /* COMPAT_43 */
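/*
 * Editorial note: as the cvtbsdprot[] table above implies, the historical
 * 4.3BSD mmap() packed protections into three bits (0x1 exec, 0x2 write,
 * 0x4 read), so e.g. an old-style oprot of 6 (write | read) converts to
 * PROT_WRITE | PROT_READ before being handed to kern_mmap().
 */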

#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
int
sys_msync(struct thread *td, struct msync_args *uap)
{

	return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
}

int
kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (ENOMEM);
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
int
sys_munmap(struct thread *td, struct munmap_args *uap)
{

	return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
	bool pmc_handled;
#endif
	vm_offset_t addr, end;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	if (size == 0)
		return (EINVAL);

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	end = addr + size;
	map = &td->td_proc->p_vmspace->vm_map;
	if (!vm_map_range_valid(map, addr, end))
		return (EINVAL);

	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	pmc_handled = false;
	if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
		pmc_handled = true;
		/*
		 * Inform hwpmc if the address range being unmapped contains
		 * an executable region.
		 */
		pkm.pm_address = (uintptr_t) NULL;
		if (vm_map_lookup_entry(map, addr, &entry)) {
			for (; entry->start < end;
			    entry = vm_map_entry_succ(entry)) {
				if (vm_map_check_protection(map, entry->start,
				    entry->end, VM_PROT_EXECUTE) == TRUE) {
					pkm.pm_address = (uintptr_t) addr;
					pkm.pm_size = (size_t) size;
					break;
				}
			}
		}
	}
#endif
	rv = vm_map_delete(map, addr, end);

#ifdef HWPMC_HOOKS
	if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) {
		/* downgrade the lock to prevent a LOR with the pmc-sx lock */
		vm_map_lock_downgrade(map);
		if (pkm.pm_address != (uintptr_t) NULL)
			PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
		vm_map_unlock_read(map);
	} else
#endif
		vm_map_unlock(map);

	return (vm_mmap_to_errno(rv));
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
int
sys_mprotect(struct thread *td, struct mprotect_args *uap)
{

	return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len,
	    uap->prot, 0));
}

int
kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot,
    int flags)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	int vm_error, max_prot;

	addr = addr0;
	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
		return (EINVAL);
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
#ifdef COMPAT_FREEBSD32
	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
		if (((addr + size) & 0xffffffff) < addr)
			return (EINVAL);
	} else
#endif
	if (addr + size < addr)
		return (EINVAL);

	flags |= VM_MAP_PROTECT_SET_PROT;
	if (max_prot != 0)
		flags |= VM_MAP_PROTECT_SET_MAXPROT;
	vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
	    addr, addr + size, prot, max_prot, flags);

	switch (vm_error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	case KERN_OUT_OF_BOUNDS:
		return (ENOTSUP);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
int
sys_minherit(struct thread *td, struct minherit_args *uap)
{

	return (kern_minherit(td, (uintptr_t)uap->addr, uap->len,
	    uap->inherit));
}

int
kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0)
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)addr0;
	size = len;
	inherit = inherit0;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

int
sys_madvise(struct thread *td, struct madvise_args *uap)
{

	return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
}

int
kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
{
	vm_map_t map;
	vm_offset_t addr, end, start;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	addr = addr0;
	if (!vm_map_range_valid(map, addr, addr + len))
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page(addr);
	end = round_page(addr + len);

	/*
	 * vm_map_madvise() checks for illegal values of behav.
	 */
	return (vm_map_madvise(map, start, end, behav));
}
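/*
 * Illustrative userland sketch (an assumption, not part of this file): the
 * mincore() interface below fills one status byte per page of the range,
 *
 *	char vec[howmany(len, PAGE_SIZE)];
 *	mincore(addr, len, vec);
 *
 * where each byte carries MINCORE_* bits (e.g. MINCORE_INCORE,
 * MINCORE_MODIFIED_OTHER, MINCORE_REFERENCED_OTHER) computed by
 * kern_mincore().  With the default vm.mincore_mapped=1, a page is reported
 * according to this pmap's mapping of it rather than residency alone.
 */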

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

int
sys_mincore(struct thread *td, struct mincore_args *uap)
{

	return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
}

int
kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
{
	pmap_t pmap;
	vm_map_t map;
	vm_map_entry_t current, entry;
	vm_object_t object;
	vm_offset_t addr, cend, end, first_addr;
	vm_paddr_t pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int error, lastvecindex, mincoreinfo, vecindex;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page(addr0);
	end = round_page(addr0 + len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	while (entry->start < end) {
		/*
		 * check for contiguity
		 */
		current = entry;
		entry = vm_map_entry_succ(current);
		if (current->end < end &&
		    entry->start > current->end) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		for (; addr < cend; addr += PAGE_SIZE) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			m = NULL;
			object = NULL;
retry:
			pa = 0;
			mincoreinfo = pmap_mincore(pmap, addr, &pa);
			if (mincore_mapped) {
				/*
				 * We only care about this pmap's
				 * mapping of the page, if any.
				 */
				;
			} else if (pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.  The page's
				 * identity may change at any point before its
				 * object lock is acquired, so re-validate if
				 * necessary.
				 */
				m = PHYS_TO_VM_PAGE(pa);
				while (object == NULL || m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = atomic_load_ptr(&m->object);
					if (object == NULL)
						goto retry;
					VM_OBJECT_WLOCK(object);
				}
				if (pa != pmap_extract(pmap, addr))
					goto retry;
				KASSERT(vm_page_all_valid(m),
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if ((object->flags & OBJ_SWAP) != 0 ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m != NULL && vm_page_none_valid(m))
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				VM_OBJECT_ASSERT_WLOCKED(m->object);

				/* Examine other mappings of the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;

				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->a.flags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->a.flags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = atop(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = atop(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_mlock(struct thread *td, struct mlock_args *uap)
{

	return (kern_mlock(td->td_proc, td->td_ucred,
	    __DECONST(uintptr_t, uap->addr), uap->len));
}

int
kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_user_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(proc);
		error = racct_set(proc, RACCT_MEMLOCK, nsize);
		PROC_UNLOCK(proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	switch (error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ARGUMENT:
		return (EINVAL);
	default:
		return (ENOMEM);
	}
}
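/*
 * Editorial sketch (an assumption about typical use): mlock() and munlock()
 * operate on whole pages, so a request for [addr, addr + len) is widened to
 * [trunc_page(addr), round_page(addr + len)).  The wired total is checked
 * against vm.max_user_wired and RLIMIT_MEMLOCK in kern_mlock() above, so e.g.
 *
 *	mlock(buf, 1);
 *
 * wires, and is charged for, one full page even though a single byte was
 * named.
 */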

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

int
sys_mlockall(struct thread *td, struct mlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
			return (ENOMEM);
	}
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(td->td_proc);
		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
		PROC_UNLOCK(td->td_proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		if (error == KERN_SUCCESS)
			error = 0;
		else if (error == KERN_RESOURCE_SHORTAGE)
			error = ENOMEM;
		else
			error = EAGAIN;
	}
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

int
sys_munlockall(struct thread *td, struct munlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_munlock(struct thread *td, struct munlock_args *uap)
{

	return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
{
	vm_offset_t addr, end, last, start;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = addr0;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity checks specific to mmap
 * operations on vnodes.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_ooffset_t foff;
	struct ucred *cred;
	int error, flags;
	bool writex;

	cred = td->td_ucred;
	writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
	    (*flagsp & MAP_SHARED) != 0;
	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);
	AUDIT_ARG_VNODE1(vp);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, LK_SHARED);
			if (error != 0)
				return (error);
		}
		if (writex) {
			*writecounted = TRUE;
			vm_pager_update_writecount(obj, 0, objsize);
		}
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	/* This relies on VM_PROT_* matching PROT_*. */
	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & VM_PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE) {
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
		if (obj == NULL) {
			error = ENOMEM;
			goto done;
		}
	} else {
		KASSERT((obj->flags & OBJ_SWAP) != 0, ("wrong object type"));
		vm_object_reference(obj);
#if VM_NRESERVLEVEL > 0
		if ((obj->flags & OBJ_COLORED) == 0) {
			VM_OBJECT_WLOCK(obj);
			vm_object_color(obj, 0);
			VM_OBJECT_WUNLOCK(obj);
		}
#endif
	}
	*objp = obj;
	*flagsp = flags;

	VOP_MMAPPED(vp);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vm_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * Helper function for vm_mmap.  Perform sanity checks specific to mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
    vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	int error, flags;

	flags = *flagsp;

	if (dsw->d_flags & D_MMAP_ANON) {
		*objp = NULL;
		*foff = 0;
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	if (flags & (MAP_PRIVATE|MAP_COPY))
		return (EINVAL);
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
	if (error != 0)
		return (error);
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags,
    objtype_t handle_type, void *handle,
    vm_ooffset_t foff)
{
	vm_object_t object;
	struct thread *td = curthread;
	int error;
	boolean_t writecounted;

	if (size == 0)
		return (EINVAL);

	size = round_page(size);
	object = NULL;
	writecounted = FALSE;

	switch (handle_type) {
	case OBJT_DEVICE: {
		struct cdevsw *dsw;
		struct cdev *cdev;
		int ref;

		cdev = handle;
		dsw = dev_refthread(cdev, &ref);
		if (dsw == NULL)
			return (ENXIO);
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
		    dsw, &foff, &object);
		dev_relthread(cdev, ref);
		break;
	}
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);

	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
	    foff, writecounted, td);
	if (error != 0 && object != NULL) {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vm_pager_release_writecount(object, 0, size);
		vm_object_deallocate(object);
	}
	return (error);
}

int
kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size)
{
	int error;

	RACCT_PROC_LOCK(td->td_proc);
	if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
		RACCT_PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
		RACCT_PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	if (!old_mlock && map->flags & MAP_WIREFUTURE) {
		if (ptoa(pmap_wired_count(map->pmap)) + size >
		    lim_cur(td, RLIMIT_MEMLOCK)) {
			racct_set_force(td->td_proc, RACCT_VMEM, map->size);
			RACCT_PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		error = racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)) + size);
		if (error != 0) {
			racct_set_force(td->td_proc, RACCT_VMEM, map->size);
			RACCT_PROC_UNLOCK(td->td_proc);
			return (error);
		}
	}
	RACCT_PROC_UNLOCK(td->td_proc);
	return (0);
}

/*
 * Internal version of mmap that maps a specific VM object into a map.
 * Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
 */
int
vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
    boolean_t writecounted, struct thread *td)
{
	vm_offset_t default_addr, max_addr;
	int docow, error, findspace, rv;
	bool curmap, fitit;

	curmap = map == &td->td_proc->p_vmspace->vm_map;
	if (curmap) {
		error = kern_mmap_racct_check(td, map, size);
		if (error != 0)
			return (error);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The mmap() system call already enforces this by subtracting
	 * the page offset from the file offset, but checking here
	 * catches errors in device drivers (e.g. d_mmap_single()
	 * callbacks) and other internal mapping requests (such as in
	 * exec).
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = true;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = false;
	}

	if (flags & MAP_ANON) {
		if (object != NULL || foff != 0)
			return (EINVAL);
		docow = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_GROWS_DOWN;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;
	if ((flags & MAP_GUARD) != 0)
		docow |= MAP_CREATE_GUARD;

	if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		max_addr = 0;
		if ((flags & MAP_32BIT) != 0)
			max_addr = MAP_32BIT_MAX_ADDR;
		if (curmap) {
			default_addr =
			    round_page((vm_offset_t)td->td_proc->p_vmspace->
			    vm_daddr + lim_max(td, RLIMIT_DATA));
			if ((flags & MAP_32BIT) != 0)
				default_addr = 0;
			rv = vm_map_find_min(map, object, foff, addr, size,
			    default_addr, max_addr, findspace, prot, maxprot,
			    docow);
		} else {
			rv = vm_map_find(map, object, foff, addr, size,
			    max_addr, findspace, prot, maxprot, docow);
		}
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if ((map->flags & MAP_WIREFUTURE) != 0) {
			vm_map_lock(map);
			if ((map->flags & MAP_WIREFUTURE) != 0)
				(void)vm_map_wire_locked(map, *addr,
				    *addr + size, VM_MAP_WIRE_USER |
				    ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
				    VM_MAP_WIRE_NOHOLES));
			vm_map_unlock(map);
		}
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}
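/*
 * Illustrative userland sketches (assumptions, not part of this file) tying
 * the flag handling above together:
 *
 *	// Reserve an inaccessible guard region; kern_mmap() requires
 *	// PROT_NONE, fd == -1 and offset 0 for MAP_GUARD requests.
 *	void *guard = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_GUARD, -1, 0);
 *
 *	// Request a 2MB-aligned anonymous region; the alignment is encoded
 *	// in the flags via MAP_ALIGNED(log2) and handled by the
 *	// VMFS_ALIGNED_SPACE() path in vm_mmap_object().
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE | MAP_ALIGNED(21), -1, 0);
 */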