/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
#include "opt_hwpmc_hooks.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/elf.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
#include <machine/md_var.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
static int mincore_mapped = 1;
SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
    "mincore reports mappings, not residency");
static int imply_prot_max = 0;
SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
    "Imply maximum page protections in mmap() when none are specified");

_Static_assert(MAXPAGESIZES <= 4, "MINCORE_SUPER too narrow");

#if defined(COMPAT_43)
int
ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */

/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
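
/*
 * Illustrative userspace request (a sketch, not compiled as part of this
 * file; "p" and "len" are hypothetical): a private anonymous mapping whose
 * active protection is read/write and whose maximum protection is capped
 * with PROT_MAX, as handled by kern_mmap_maxprot() below:
 *
 *	void *p = mmap(NULL, len,
 *	    PROT_READ | PROT_WRITE | PROT_MAX(PROT_READ | PROT_WRITE),
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *
 * A later mprotect(p, len, PROT_EXEC) on such a mapping fails, since
 * PROT_EXEC exceeds the maximum protection recorded at map time.
 */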
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

int
sys_mmap(struct thread *td, struct mmap_args *uap)
{

	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = (uintptr_t)uap->addr,
		.mr_len = uap->len,
		.mr_prot = uap->prot,
		.mr_flags = uap->flags,
		.mr_fd = uap->fd,
		.mr_pos = uap->pos,
	    }));
}

int
kern_mmap_maxprot(struct proc *p, int prot)
{

	if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 ||
	    (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0)
		return (_PROT_ALL);
	if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) &&
	    prot != PROT_NONE)
		return (prot);
	return (_PROT_ALL);
}

int
kern_mmap(struct thread *td, const struct mmap_req *mrp)
{
	struct vmspace *vms;
	struct file *fp;
	struct proc *p;
	off_t pos;
	vm_offset_t addr, orig_addr;
	vm_size_t len, pageoff, size;
	vm_prot_t cap_maxprot;
	int align, error, fd, flags, max_prot, prot;
	cap_rights_t rights;
	mmap_check_fp_fn check_fp_fn;

	orig_addr = addr = mrp->mr_hint;
	len = mrp->mr_len;
	prot = mrp->mr_prot;
	flags = mrp->mr_flags;
	fd = mrp->mr_fd;
	pos = mrp->mr_pos;
	check_fp_fn = mrp->mr_check_fp_fn;

	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
		return (EINVAL);
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	if (max_prot != 0 && (max_prot & prot) != prot)
		return (ENOTSUP);

	p = td->td_proc;

	/*
	 * Always honor PROT_MAX if set.  If not, default to all
	 * permissions unless we're implying maximum permissions.
	 */
	if (max_prot == 0)
		max_prot = kern_mmap_maxprot(p, prot);

	vms = p->p_vmspace;
	fp = NULL;
	AUDIT_ARG_FD(fd);

	/*
	 * Ignore old flags that used to be defined but did not do anything.
	 */
	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
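	/*
	 * For instance (illustrative userspace sketch only): a modern
	 * binary's anonymous request must look like
	 *
	 *	mmap(NULL, len, PROT_READ | PROT_WRITE,
	 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
	 *
	 * passing any other fd or a non-zero offset together with MAP_ANON
	 * is rejected below with EINVAL.
	 */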
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}
	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
	    MAP_PREFAULT_READ | MAP_GUARD | MAP_32BIT | MAP_ALIGNMENT_MASK)) != 0)
		return (EINVAL);
	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
		return (EINVAL);
	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
		return (EINVAL);
	if (prot != PROT_NONE &&
	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
		return (EINVAL);
	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
	    pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
	    MAP_32BIT | MAP_ALIGNMENT_MASK)) != 0))
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Compute size from len by rounding (on both ends). */
	size = len + pageoff;			/* low end... */
	size = round_page(size);		/* hi end */
	/* Check for rounding up to zero. */
	if (len > size)
		return (ENOMEM);

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (!vm_map_range_valid(&vms->vm_map, addr, addr + size))
			return (EINVAL);
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * For anonymous mappings within the address space of the
		 * calling process, the absence of a hint is handled at a
		 * lower level in order to implement different clustering
		 * strategies for ASLR.
		 */
		if (((flags & MAP_ANON) == 0 && addr == 0) ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td, RLIMIT_DATA));
	}
	if (len == 0) {
		/*
		 * Return success without mapping anything for old
		 * binaries that request a page-aligned mapping of
		 * length 0.  For modern binaries, this function
		 * returns an error earlier.
		 */
		error = 0;
	} else if ((flags & MAP_GUARD) != 0) {
		error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
		    VM_PROT_NONE, flags, NULL, pos, FALSE, td);
	} else if ((flags & MAP_ANON) != 0) {
		/*
		 * Mapping blank space is trivial.
		 *
		 * This relies on VM_PROT_* matching PROT_*.
		 */
		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
		    max_prot, flags, NULL, pos, FALSE, td);
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init_one(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set_one(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set_one(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set_one(&rights, CAP_MMAP_X);
		error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
		    p->p_osrel >= P_OSREL_MAP_FSTRICT) {
			error = EINVAL;
			goto done;
		}
		if (check_fp_fn != NULL) {
			error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
			    flags);
			if (error != 0)
				goto done;
		}
		if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data))
			addr = orig_addr;
		/* This relies on VM_PROT_* matching PROT_*. */
		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
		    max_prot & cap_maxprot, flags, pos, td);
	}

	if (error == 0)
		td->td_retval[0] = addr + pageoff;
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{

	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = (uintptr_t)uap->addr,
		.mr_len = uap->len,
		.mr_prot = uap->prot,
		.mr_flags = uap->flags,
		.mr_fd = uap->fd,
		.mr_pos = uap->pos,
	    }));
}
#endif

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(struct thread *td, struct ommap_args *uap)
{

	return (kern_ommap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
	    uap->flags, uap->fd, uap->pos));
}

int
kern_ommap(struct thread *td, uintptr_t hint, int len, int oprot,
    int oflags, int fd, long pos)
{
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};
	int flags, prot;

	if (len < 0)
		return (EINVAL);

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	prot = cvtbsdprot[oprot & 0x7];
#if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    prot != 0)
		prot |= PROT_EXEC;
#endif
	flags = 0;
	if (oflags & OMAP_ANON)
		flags |= MAP_ANON;
	if (oflags & OMAP_COPY)
		flags |= MAP_COPY;
	if (oflags & OMAP_SHARED)
		flags |= MAP_SHARED;
	else
		flags |= MAP_PRIVATE;
	if (oflags & OMAP_FIXED)
		flags |= MAP_FIXED;
	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = hint,
		.mr_len = len,
		.mr_prot = prot,
		.mr_flags = flags,
		.mr_fd = fd,
		.mr_pos = pos,
	    }));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
int
sys_msync(struct thread *td, struct msync_args *uap)
{

	return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
}

int
kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
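	/*
	 * E.g. (userspace sketch only, "base" and "length" are hypothetical):
	 * msync(base, length, MS_SYNC) flushes dirty pages of a shared file
	 * mapping and waits for the write-back, while MS_ASYNC starts it
	 * without waiting.
	 */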
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (ENOMEM);
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
int
sys_munmap(struct thread *td, struct munmap_args *uap)
{

	return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
	bool pmc_handled;
#endif
	vm_offset_t addr, end;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	if (size == 0)
		return (EINVAL);

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	end = addr + size;
	map = &td->td_proc->p_vmspace->vm_map;
	if (!vm_map_range_valid(map, addr, end))
		return (EINVAL);

	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	pmc_handled = false;
	if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
		pmc_handled = true;
		/*
		 * Inform hwpmc if the address range being unmapped contains
		 * an executable region.
		 */
		pkm.pm_address = (uintptr_t) NULL;
		if (vm_map_lookup_entry(map, addr, &entry)) {
			for (; entry->start < end;
			    entry = vm_map_entry_succ(entry)) {
				if (vm_map_check_protection(map, entry->start,
				    entry->end, VM_PROT_EXECUTE) == TRUE) {
					pkm.pm_address = (uintptr_t) addr;
					pkm.pm_size = (size_t) size;
					break;
				}
			}
		}
	}
#endif
	rv = vm_map_delete(map, addr, end);

#ifdef HWPMC_HOOKS
	if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) {
		/* downgrade the lock to prevent a LOR with the pmc-sx lock */
		vm_map_lock_downgrade(map);
		if (pkm.pm_address != (uintptr_t) NULL)
			PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
		vm_map_unlock_read(map);
	} else
#endif
		vm_map_unlock(map);

	return (vm_mmap_to_errno(rv));
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
int
sys_mprotect(struct thread *td, struct mprotect_args *uap)
{

	return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len,
	    uap->prot, 0));
}

int
kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot,
    int flags)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	int vm_error, max_prot;

	addr = addr0;
	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
		return (EINVAL);
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
#ifdef COMPAT_FREEBSD32
	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
		if (((addr + size) & 0xffffffff) < addr)
			return (EINVAL);
	} else
#endif
	if (addr + size < addr)
		return (EINVAL);

	flags |= VM_MAP_PROTECT_SET_PROT;
	if (max_prot != 0)
		flags |= VM_MAP_PROTECT_SET_MAXPROT;
	vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
	    addr, addr + size, prot, max_prot, flags);

	switch (vm_error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
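
/*
 * Illustrative use of the PROT_MAX handling above (userspace sketch, not
 * part of this file; "p" and "len" are hypothetical): shrink both the
 * active and the maximum protection of an existing mapping in one call,
 * so the pages can never be made writable again:
 *
 *	mprotect(p, len, PROT_READ | PROT_MAX(PROT_READ));
 */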
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	case KERN_OUT_OF_BOUNDS:
		return (ENOTSUP);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
int
sys_minherit(struct thread *td, struct minherit_args *uap)
{

	return (kern_minherit(td, (uintptr_t)uap->addr, uap->len,
	    uap->inherit));
}

int
kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0)
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)addr0;
	size = len;
	inherit = inherit0;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

int
sys_madvise(struct thread *td, struct madvise_args *uap)
{

	return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
}

int
kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
{
	vm_map_t map;
	vm_offset_t addr, end, start;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	addr = addr0;
	if (!vm_map_range_valid(map, addr, addr + len))
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page(addr);
	end = round_page(addr + len);

	/*
	 * vm_map_madvise() checks for illegal values of behav.
	 */
	return (vm_map_madvise(map, start, end, behav));
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

int
sys_mincore(struct thread *td, struct mincore_args *uap)
{

	return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
}

int
kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
{
	pmap_t pmap;
	vm_map_t map;
	vm_map_entry_t current, entry;
	vm_object_t object;
	vm_offset_t addr, cend, end, first_addr;
	vm_paddr_t pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int error, lastvecindex, mincoreinfo, vecindex;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
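	/*
	 * Userspace view of this interface (illustrative sketch only): the
	 * caller supplies one status byte per page of the request, e.g.
	 *
	 *	char vec[howmany(len, PAGE_SIZE)];
	 *	mincore(addr, len, vec);
	 *
	 * and each byte is filled below with MINCORE_* flags.
	 */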
	first_addr = addr = trunc_page(addr0);
	end = round_page(addr0 + len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	while (entry->start < end) {
		/*
		 * check for contiguity
		 */
		current = entry;
		entry = vm_map_entry_succ(current);
		if (current->end < end &&
		    entry->start > current->end) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		for (; addr < cend; addr += PAGE_SIZE) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			m = NULL;
			object = NULL;
retry:
			pa = 0;
			mincoreinfo = pmap_mincore(pmap, addr, &pa);
			if (mincore_mapped) {
				/*
				 * We only care about this pmap's
				 * mapping of the page, if any.
				 */
				;
			} else if (pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.  The page's
				 * identity may change at any point before its
				 * object lock is acquired, so re-validate if
				 * necessary.
				 */
				m = PHYS_TO_VM_PAGE(pa);
				while (object == NULL || m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = atomic_load_ptr(&m->object);
					if (object == NULL)
						goto retry;
					VM_OBJECT_WLOCK(object);
				}
				if (pa != pmap_extract(pmap, addr))
					goto retry;
				KASSERT(vm_page_all_valid(m),
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if ((object->flags & OBJ_SWAP) != 0 ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m != NULL && vm_page_none_valid(m))
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				VM_OBJECT_ASSERT_WLOCKED(m->object);

				/* Examine other mappings of the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;

				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->a.flags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->a.flags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = atop(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = atop(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_mlock(struct thread *td, struct mlock_args *uap)
{

	return (kern_mlock(td->td_proc, td->td_ucred,
	    __DECONST(uintptr_t, uap->addr), uap->len));
}

int
kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_user_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(proc);
		error = racct_set(proc, RACCT_MEMLOCK, nsize);
		PROC_UNLOCK(proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	switch (error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ARGUMENT:
		return (EINVAL);
	default:
		return (ENOMEM);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

int
sys_mlockall(struct thread *td, struct mlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
			return (ENOMEM);
	}
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(td->td_proc);
		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
		PROC_UNLOCK(td->td_proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall(). vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
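		/*
		 * E.g. (userspace sketch only): a latency-sensitive process
		 * typically calls mlockall(MCL_CURRENT | MCL_FUTURE) so that
		 * both its current mappings and all future ones are wired,
		 * subject to the RLIMIT_MEMLOCK check above.
		 */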
1139 */ 1140 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), 1141 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1142 if (error == KERN_SUCCESS) 1143 error = 0; 1144 else if (error == KERN_RESOURCE_SHORTAGE) 1145 error = ENOMEM; 1146 else 1147 error = EAGAIN; 1148 } 1149 #ifdef RACCT 1150 if (racct_enable && error != KERN_SUCCESS) { 1151 PROC_LOCK(td->td_proc); 1152 racct_set(td->td_proc, RACCT_MEMLOCK, 1153 ptoa(pmap_wired_count(map->pmap))); 1154 PROC_UNLOCK(td->td_proc); 1155 } 1156 #endif 1157 1158 return (error); 1159 } 1160 1161 #ifndef _SYS_SYSPROTO_H_ 1162 struct munlockall_args { 1163 register_t dummy; 1164 }; 1165 #endif 1166 1167 int 1168 sys_munlockall(struct thread *td, struct munlockall_args *uap) 1169 { 1170 vm_map_t map; 1171 int error; 1172 1173 map = &td->td_proc->p_vmspace->vm_map; 1174 error = priv_check(td, PRIV_VM_MUNLOCK); 1175 if (error) 1176 return (error); 1177 1178 /* Clear the MAP_WIREFUTURE flag from this vm_map. */ 1179 vm_map_lock(map); 1180 vm_map_modflags(map, 0, MAP_WIREFUTURE); 1181 vm_map_unlock(map); 1182 1183 /* Forcibly unwire all pages. */ 1184 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), 1185 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); 1186 #ifdef RACCT 1187 if (racct_enable && error == KERN_SUCCESS) { 1188 PROC_LOCK(td->td_proc); 1189 racct_set(td->td_proc, RACCT_MEMLOCK, 0); 1190 PROC_UNLOCK(td->td_proc); 1191 } 1192 #endif 1193 1194 return (error); 1195 } 1196 1197 #ifndef _SYS_SYSPROTO_H_ 1198 struct munlock_args { 1199 const void *addr; 1200 size_t len; 1201 }; 1202 #endif 1203 int 1204 sys_munlock(struct thread *td, struct munlock_args *uap) 1205 { 1206 1207 return (kern_munlock(td, (uintptr_t)uap->addr, uap->len)); 1208 } 1209 1210 int 1211 kern_munlock(struct thread *td, uintptr_t addr0, size_t size) 1212 { 1213 vm_offset_t addr, end, last, start; 1214 #ifdef RACCT 1215 vm_map_t map; 1216 #endif 1217 int error; 1218 1219 error = priv_check(td, PRIV_VM_MUNLOCK); 1220 if (error) 1221 return (error); 1222 addr = addr0; 1223 last = addr + size; 1224 start = trunc_page(addr); 1225 end = round_page(last); 1226 if (last < addr || end < addr) 1227 return (EINVAL); 1228 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, 1229 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); 1230 #ifdef RACCT 1231 if (racct_enable && error == KERN_SUCCESS) { 1232 PROC_LOCK(td->td_proc); 1233 map = &td->td_proc->p_vmspace->vm_map; 1234 racct_set(td->td_proc, RACCT_MEMLOCK, 1235 ptoa(pmap_wired_count(map->pmap))); 1236 PROC_UNLOCK(td->td_proc); 1237 } 1238 #endif 1239 return (error == KERN_SUCCESS ? 0 : ENOMEM); 1240 } 1241 1242 /* 1243 * vm_mmap_vnode() 1244 * 1245 * Helper function for vm_mmap. Perform sanity check specific for mmap 1246 * operations on vnodes. 
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_ooffset_t foff;
	struct ucred *cred;
	int error, flags;
	bool writex;

	cred = td->td_ucred;
	writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
	    (*flagsp & MAP_SHARED) != 0;
	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);
	AUDIT_ARG_VNODE1(vp);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, LK_SHARED);
			if (error != 0)
				return (error);
		}
		if (writex) {
			*writecounted = TRUE;
			vm_pager_update_writecount(obj, 0, objsize);
		}
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	/* This relies on VM_PROT_* matching PROT_*. */
	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & VM_PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE) {
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
		if (obj == NULL) {
			error = ENOMEM;
			goto done;
		}
	} else {
		KASSERT((obj->flags & OBJ_SWAP) != 0, ("wrong object type"));
		vm_object_reference(obj);
#if VM_NRESERVLEVEL > 0
		if ((obj->flags & OBJ_COLORED) == 0) {
			VM_OBJECT_WLOCK(obj);
			vm_object_color(obj, 0);
			VM_OBJECT_WUNLOCK(obj);
		}
#endif
	}
	*objp = obj;
	*flagsp = flags;

	VOP_MMAPPED(vp);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vm_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * Helper function for vm_mmap.  Performs the sanity checks specific to
 * mmap operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
    vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	int error, flags;

	flags = *flagsp;

	if (dsw->d_flags & D_MMAP_ANON) {
		*objp = NULL;
		*foff = 0;
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
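	/*
	 * For example (illustrative only, "devfd" is hypothetical):
	 * mmap(NULL, len, PROT_READ, MAP_PRIVATE, devfd, 0) on such a
	 * device is rejected below with EINVAL, while the equivalent
	 * MAP_SHARED request is accepted.
	 */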
1379 */ 1380 if ((*maxprotp & VM_PROT_WRITE) == 0 && 1381 (prot & VM_PROT_WRITE) != 0) 1382 return (EACCES); 1383 if (flags & (MAP_PRIVATE|MAP_COPY)) 1384 return (EINVAL); 1385 /* 1386 * Force device mappings to be shared. 1387 */ 1388 flags |= MAP_SHARED; 1389 #ifdef MAC_XXX 1390 error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot); 1391 if (error != 0) 1392 return (error); 1393 #endif 1394 /* 1395 * First, try d_mmap_single(). If that is not implemented 1396 * (returns ENODEV), fall back to using the device pager. 1397 * Note that d_mmap_single() must return a reference to the 1398 * object (it needs to bump the reference count of the object 1399 * it returns somehow). 1400 * 1401 * XXX assumes VM_PROT_* == PROT_* 1402 */ 1403 error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); 1404 if (error != ENODEV) 1405 return (error); 1406 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, 1407 td->td_ucred); 1408 if (obj == NULL) 1409 return (EINVAL); 1410 VM_OBJECT_WLOCK(obj); 1411 vm_object_set_flag(obj, OBJ_CDEVH); 1412 VM_OBJECT_WUNLOCK(obj); 1413 *objp = obj; 1414 *flagsp = flags; 1415 return (0); 1416 } 1417 1418 int 1419 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1420 vm_prot_t maxprot, int flags, 1421 objtype_t handle_type, void *handle, 1422 vm_ooffset_t foff) 1423 { 1424 vm_object_t object; 1425 struct thread *td = curthread; 1426 int error; 1427 boolean_t writecounted; 1428 1429 if (size == 0) 1430 return (EINVAL); 1431 1432 size = round_page(size); 1433 object = NULL; 1434 writecounted = FALSE; 1435 1436 switch (handle_type) { 1437 case OBJT_DEVICE: { 1438 struct cdevsw *dsw; 1439 struct cdev *cdev; 1440 int ref; 1441 1442 cdev = handle; 1443 dsw = dev_refthread(cdev, &ref); 1444 if (dsw == NULL) 1445 return (ENXIO); 1446 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev, 1447 dsw, &foff, &object); 1448 dev_relthread(cdev, ref); 1449 break; 1450 } 1451 case OBJT_VNODE: 1452 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, 1453 handle, &foff, &object, &writecounted); 1454 break; 1455 default: 1456 error = EINVAL; 1457 break; 1458 } 1459 if (error) 1460 return (error); 1461 1462 error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, 1463 foff, writecounted, td); 1464 if (error != 0 && object != NULL) { 1465 /* 1466 * If this mapping was accounted for in the vnode's 1467 * writecount, then undo that now. 
1468 */ 1469 if (writecounted) 1470 vm_pager_release_writecount(object, 0, size); 1471 vm_object_deallocate(object); 1472 } 1473 return (error); 1474 } 1475 1476 int 1477 kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size) 1478 { 1479 int error; 1480 1481 RACCT_PROC_LOCK(td->td_proc); 1482 if (map->size + size > lim_cur(td, RLIMIT_VMEM)) { 1483 RACCT_PROC_UNLOCK(td->td_proc); 1484 return (ENOMEM); 1485 } 1486 if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { 1487 RACCT_PROC_UNLOCK(td->td_proc); 1488 return (ENOMEM); 1489 } 1490 if (!old_mlock && map->flags & MAP_WIREFUTURE) { 1491 if (ptoa(pmap_wired_count(map->pmap)) + size > 1492 lim_cur(td, RLIMIT_MEMLOCK)) { 1493 racct_set_force(td->td_proc, RACCT_VMEM, map->size); 1494 RACCT_PROC_UNLOCK(td->td_proc); 1495 return (ENOMEM); 1496 } 1497 error = racct_set(td->td_proc, RACCT_MEMLOCK, 1498 ptoa(pmap_wired_count(map->pmap)) + size); 1499 if (error != 0) { 1500 racct_set_force(td->td_proc, RACCT_VMEM, map->size); 1501 RACCT_PROC_UNLOCK(td->td_proc); 1502 return (error); 1503 } 1504 } 1505 RACCT_PROC_UNLOCK(td->td_proc); 1506 return (0); 1507 } 1508 1509 /* 1510 * Internal version of mmap that maps a specific VM object into an 1511 * map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap. 1512 */ 1513 int 1514 vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, 1515 vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff, 1516 boolean_t writecounted, struct thread *td) 1517 { 1518 vm_offset_t default_addr, max_addr; 1519 int docow, error, findspace, rv; 1520 bool curmap, fitit; 1521 1522 curmap = map == &td->td_proc->p_vmspace->vm_map; 1523 if (curmap) { 1524 error = kern_mmap_racct_check(td, map, size); 1525 if (error != 0) 1526 return (error); 1527 } 1528 1529 /* 1530 * We currently can only deal with page aligned file offsets. 1531 * The mmap() system call already enforces this by subtracting 1532 * the page offset from the file offset, but checking here 1533 * catches errors in device drivers (e.g. d_single_mmap() 1534 * callbacks) and other internal mapping requests (such as in 1535 * exec). 1536 */ 1537 if (foff & PAGE_MASK) 1538 return (EINVAL); 1539 1540 if ((flags & MAP_FIXED) == 0) { 1541 fitit = true; 1542 *addr = round_page(*addr); 1543 } else { 1544 if (*addr != trunc_page(*addr)) 1545 return (EINVAL); 1546 fitit = false; 1547 } 1548 1549 if (flags & MAP_ANON) { 1550 if (object != NULL || foff != 0) 1551 return (EINVAL); 1552 docow = 0; 1553 } else if (flags & MAP_PREFAULT_READ) 1554 docow = MAP_PREFAULT; 1555 else 1556 docow = MAP_PREFAULT_PARTIAL; 1557 1558 if ((flags & (MAP_ANON|MAP_SHARED)) == 0) 1559 docow |= MAP_COPY_ON_WRITE; 1560 if (flags & MAP_NOSYNC) 1561 docow |= MAP_DISABLE_SYNCER; 1562 if (flags & MAP_NOCORE) 1563 docow |= MAP_DISABLE_COREDUMP; 1564 /* Shared memory is also shared with children. 
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_GROWS_DOWN;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;
	if ((flags & MAP_GUARD) != 0)
		docow |= MAP_CREATE_GUARD;

	if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		max_addr = 0;
		if ((flags & MAP_32BIT) != 0)
			max_addr = MAP_32BIT_MAX_ADDR;
		if (curmap) {
			default_addr =
			    round_page((vm_offset_t)td->td_proc->p_vmspace->
			    vm_daddr + lim_max(td, RLIMIT_DATA));
			if ((flags & MAP_32BIT) != 0)
				default_addr = 0;
			rv = vm_map_find_min(map, object, foff, addr, size,
			    default_addr, max_addr, findspace, prot, maxprot,
			    docow);
		} else {
			rv = vm_map_find(map, object, foff, addr, size,
			    max_addr, findspace, prot, maxprot, docow);
		}
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if ((map->flags & MAP_WIREFUTURE) != 0) {
			vm_map_lock(map);
			if ((map->flags & MAP_WIREFUTURE) != 0)
				(void)vm_map_wire_locked(map, *addr,
				    *addr + size, VM_MAP_WIRE_USER |
				    ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
				    VM_MAP_WIRE_NOHOLES));
			vm_map_unlock(map);
		}
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}