/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include "opt_hwpmc_hooks.h"
#include "opt_vm.h"

#define EXTERR_CATEGORY EXTERR_CAT_MMAP
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/exterrvar.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/elf.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
#include <machine/md_var.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
static int mincore_mapped = 1;
SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
    "mincore reports mappings, not residency");
static int imply_prot_max = 0;
SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
    "Imply maximum page protections in mmap() when none are specified");

_Static_assert(MAXPAGESIZES <= 4, "MINCORE_SUPER too narrow");

#if defined(COMPAT_43)
int
ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */

/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
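
/*
 * Illustrative example, not part of the original source, assuming a
 * 4 KB PAGE_SIZE: a userspace call such as
 *
 *	p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0x1234);
 *
 * is handled by mapping from the page-truncated offset
 * trunc_page(0x1234) = 0x1000, and the returned address has low bits
 * equal to the page offset 0x234, matching the behavior described in
 * the comment above and the "addr + pageoff" return value computed in
 * kern_mmap() below.
 */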
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

int
sys_mmap(struct thread *td, struct mmap_args *uap)
{

	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = (uintptr_t)uap->addr,
		.mr_len = uap->len,
		.mr_prot = uap->prot,
		.mr_flags = uap->flags,
		.mr_fd = uap->fd,
		.mr_pos = uap->pos,
	}));
}

int
kern_mmap_maxprot(struct proc *p, int prot)
{

	if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 ||
	    (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0)
		return (_PROT_ALL);
	if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) &&
	    prot != PROT_NONE)
		return (prot);
	return (_PROT_ALL);
}

int
kern_mmap(struct thread *td, const struct mmap_req *mrp)
{
	struct vmspace *vms;
	struct file *fp;
	struct proc *p;
	off_t pos;
	vm_offset_t addr, orig_addr;
	vm_size_t len, pageoff, size;
	vm_prot_t cap_maxprot;
	int align, error, fd, flags, max_prot, prot;
	cap_rights_t rights;
	mmap_check_fp_fn check_fp_fn;

	orig_addr = addr = mrp->mr_hint;
	len = mrp->mr_len;
	prot = mrp->mr_prot;
	flags = mrp->mr_flags;
	fd = mrp->mr_fd;
	pos = mrp->mr_pos;
	check_fp_fn = mrp->mr_check_fp_fn;

	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0) {
		SET_ERROR0(EINVAL, "unknown PROT bits");
		return (EINVAL);
	}
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	if (max_prot != 0 && (max_prot & prot) != prot) {
		SET_ERROR0(ENOTSUP, "prot is not subset of max_prot");
		return (ENOTSUP);
	}

	p = td->td_proc;

	/*
	 * Always honor PROT_MAX if set.  If not, default to all
	 * permissions unless we're implying maximum permissions.
	 */
	if (max_prot == 0)
		max_prot = kern_mmap_maxprot(p, prot);

	vms = p->p_vmspace;
	fp = NULL;
	AUDIT_ARG_FD(fd);

	/*
	 * Ignore old flags that used to be defined but did not do anything.
	 */
	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}
	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
	    MAP_PREFAULT_READ | MAP_GUARD | MAP_32BIT | MAP_ALIGNMENT_MASK)) != 0)
		return (EINVAL);
	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
		return (EINVAL);
	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
		return (EINVAL);
	if (prot != PROT_NONE &&
	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
		return (EINVAL);
	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
	    pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
	    MAP_32BIT | MAP_ALIGNMENT_MASK)) != 0))
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Compute size from len by rounding (on both ends). */
	size = len + pageoff;			/* low end... */
	size = round_page(size);		/* hi end */
	/* Check for rounding up to zero. */
	if (len > size)
		return (ENOMEM);

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (!vm_map_range_valid(&vms->vm_map, addr, addr + size))
			return (EINVAL);
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * For anonymous mappings within the address space of the
		 * calling process, the absence of a hint is handled at a
		 * lower level in order to implement different clustering
		 * strategies for ASLR.
		 */
		if (((flags & MAP_ANON) == 0 && addr == 0) ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td, RLIMIT_DATA));
	}
	if (len == 0) {
		/*
		 * Return success without mapping anything for old
		 * binaries that request a page-aligned mapping of
		 * length 0.  For modern binaries, this function
		 * returns an error earlier.
		 */
		error = 0;
	} else if ((flags & MAP_GUARD) != 0) {
		error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
		    VM_PROT_NONE, flags, NULL, pos, FALSE, td);
	} else if ((flags & MAP_ANON) != 0) {
		/*
		 * Mapping blank space is trivial.
		 *
		 * This relies on VM_PROT_* matching PROT_*.
		 */
		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
		    max_prot, flags, NULL, pos, FALSE, td);
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init_one(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set_one(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set_one(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set_one(&rights, CAP_MMAP_X);
		error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
		    p->p_osrel >= P_OSREL_MAP_FSTRICT) {
			error = EINVAL;
			goto done;
		}
		if (check_fp_fn != NULL) {
			error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
			    flags);
			if (error != 0)
				goto done;
		}
		if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data))
			addr = orig_addr;
		/* This relies on VM_PROT_* matching PROT_*. */
		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
		    max_prot & cap_maxprot, flags, pos, td);
	}

	if (error == 0)
		td->td_retval[0] = addr + pageoff;
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = (uintptr_t)uap->addr,
		.mr_len = uap->len,
		.mr_prot = uap->prot,
		.mr_flags = uap->flags,
		.mr_fd = uap->fd,
		.mr_pos = uap->pos,
	}));
}
#endif

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(struct thread *td, struct ommap_args *uap)
{
	return (kern_ommap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
	    uap->flags, uap->fd, uap->pos));
}

int
kern_ommap(struct thread *td, uintptr_t hint, int len, int oprot,
    int oflags, int fd, long pos)
{
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};
	int flags, prot;

	if (len < 0)
		return (EINVAL);

#define OMAP_ANON	0x0002
#define OMAP_COPY	0x0020
#define OMAP_SHARED	0x0010
#define OMAP_FIXED	0x0100

	prot = cvtbsdprot[oprot & 0x7];
#if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    prot != 0)
		prot |= PROT_EXEC;
#endif
	flags = 0;
	if (oflags & OMAP_ANON)
		flags |= MAP_ANON;
	if (oflags & OMAP_COPY)
		flags |= MAP_COPY;
	if (oflags & OMAP_SHARED)
		flags |= MAP_SHARED;
	else
		flags |= MAP_PRIVATE;
	if (oflags & OMAP_FIXED)
		flags |= MAP_FIXED;
	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = hint,
		.mr_len = len,
		.mr_prot = prot,
		.mr_flags = flags,
		.mr_fd = fd,
		.mr_pos = pos,
	}));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
int
sys_msync(struct thread *td, struct msync_args *uap)
{

	return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
}

int
kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (ENOMEM);
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
int
sys_munmap(struct thread *td, struct munmap_args *uap)
{

	return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
	bool pmc_handled;
#endif
	vm_offset_t addr, end;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	if (size == 0)
		return (EINVAL);

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	end = addr + size;
	map = &td->td_proc->p_vmspace->vm_map;
	if (!vm_map_range_valid(map, addr, end))
		return (EINVAL);

	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	pmc_handled = false;
	if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
		pmc_handled = true;
		/*
		 * Inform hwpmc if the address range being unmapped contains
		 * an executable region.
		 */
		pkm.pm_address = (uintptr_t) NULL;
		if (vm_map_lookup_entry(map, addr, &entry)) {
			for (; entry->start < end;
			    entry = vm_map_entry_succ(entry)) {
				if (vm_map_check_protection(map, entry->start,
				    entry->end, VM_PROT_EXECUTE) == TRUE) {
					pkm.pm_address = (uintptr_t) addr;
					pkm.pm_size = (size_t) size;
					break;
				}
			}
		}
	}
#endif
	rv = vm_map_delete(map, addr, end);

#ifdef HWPMC_HOOKS
	if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) {
		/* downgrade the lock to prevent a LOR with the pmc-sx lock */
		vm_map_lock_downgrade(map);
		if (pkm.pm_address != (uintptr_t) NULL)
			PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
		vm_map_unlock_read(map);
	} else
#endif
		vm_map_unlock(map);

	return (vm_mmap_to_errno(rv));
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
int
sys_mprotect(struct thread *td, struct mprotect_args *uap)
{

	return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len,
	    uap->prot, 0));
}

int
kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot,
    int flags)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	int vm_error, max_prot;

	addr = addr0;
	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
		return (EINVAL);
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
#ifdef COMPAT_FREEBSD32
	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
		if (((addr + size) & 0xffffffff) < addr)
			return (EINVAL);
	} else
#endif
	if (addr + size < addr)
		return (EINVAL);

	flags |= VM_MAP_PROTECT_SET_PROT;
	if (max_prot != 0)
		flags |= VM_MAP_PROTECT_SET_MAXPROT;
	vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
	    addr, addr + size, prot, max_prot, flags);

	switch (vm_error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	case KERN_OUT_OF_BOUNDS:
		return (ENOTSUP);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
int
sys_minherit(struct thread *td, struct minherit_args *uap)
{

	return (kern_minherit(td, (uintptr_t)uap->addr, uap->len,
	    uap->inherit));
}

int
kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0)
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)addr0;
	size = len;
	inherit = inherit0;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

int
sys_madvise(struct thread *td, struct madvise_args *uap)
{

	return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
}

int
kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
{
	vm_map_t map;
	vm_offset_t addr, end, start;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	addr = addr0;
	if (!vm_map_range_valid(map, addr, addr + len))
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page(addr);
	end = round_page(addr + len);

	/*
	 * vm_map_madvise() checks for illegal values of behav.
	 */
	return (vm_map_madvise(map, start, end, behav));
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

int
sys_mincore(struct thread *td, struct mincore_args *uap)
{

	return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
}

int
kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
{
	pmap_t pmap;
	vm_map_t map;
	vm_map_entry_t current, entry;
	vm_object_t object;
	vm_offset_t addr, cend, end, first_addr;
	vm_paddr_t pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int error, lastvecindex, mincoreinfo, vecindex;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page(addr0);
	end = round_page(addr0 + len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	while (entry->start < end) {
		/*
		 * check for contiguity
		 */
		current = entry;
		entry = vm_map_entry_succ(current);
		if (current->end < end &&
		    entry->start > current->end) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		for (; addr < cend; addr += PAGE_SIZE) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			m = NULL;
			object = NULL;
retry:
			pa = 0;
			mincoreinfo = pmap_mincore(pmap, addr, &pa);
			if (mincore_mapped) {
				/*
				 * We only care about this pmap's
				 * mapping of the page, if any.
				 */
				;
			} else if (pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.  The page's
				 * identity may change at any point before its
				 * object lock is acquired, so re-validate if
				 * necessary.
				 */
				m = PHYS_TO_VM_PAGE(pa);
				while (object == NULL || m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = atomic_load_ptr(&m->object);
					if (object == NULL)
						goto retry;
					VM_OBJECT_WLOCK(object);
				}
				if (pa != pmap_extract(pmap, addr))
					goto retry;
				KASSERT(vm_page_all_valid(m),
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if ((object->flags & OBJ_SWAP) != 0 ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m != NULL && vm_page_none_valid(m))
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				VM_OBJECT_ASSERT_WLOCKED(m->object);

				/* Examine other mappings of the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;

				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->a.flags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->a.flags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = atop(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = atop(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_mlock(struct thread *td, struct mlock_args *uap)
{

	return (kern_mlock(td->td_proc, td->td_ucred,
	    __DECONST(uintptr_t, uap->addr), uap->len));
}

int
kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_user_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(proc);
		error = racct_set(proc, RACCT_MEMLOCK, nsize);
		PROC_UNLOCK(proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	switch (error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ARGUMENT:
		return (EINVAL);
	default:
		return (ENOMEM);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

int
sys_mlockall(struct thread *td, struct mlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
			return (ENOMEM);
	}
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(td->td_proc);
		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
		PROC_UNLOCK(td->td_proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		if (error == KERN_SUCCESS)
			error = 0;
		else if (error == KERN_RESOURCE_SHORTAGE)
			error = ENOMEM;
		else
			error = EAGAIN;
	}
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

int
sys_munlockall(struct thread *td, struct munlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_munlock(struct thread *td, struct munlock_args *uap)
{

	return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
{
	vm_offset_t addr, end, last, start;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = addr0;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform the sanity checks specific to
 * mmap operations on vnodes.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_ooffset_t foff;
	struct ucred *cred;
	int error, flags;
	bool writex;

	cred = td->td_ucred;
	writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
	    (*flagsp & MAP_SHARED) != 0;
	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);
	AUDIT_ARG_VNODE1(vp);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, LK_SHARED);
			if (error != 0)
				return (error);
		}
		if (writex) {
			*writecounted = TRUE;
			vm_pager_update_writecount(obj, 0, objsize);
		}
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	/* This relies on VM_PROT_* matching PROT_*. */
	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & VM_PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE) {
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
		if (obj == NULL) {
			error = ENOMEM;
			goto done;
		}
	} else {
		KASSERT((obj->flags & OBJ_SWAP) != 0, ("wrong object type"));
		vm_object_reference(obj);
#if VM_NRESERVLEVEL > 0
		if ((obj->flags & OBJ_COLORED) == 0) {
			VM_OBJECT_WLOCK(obj);
			vm_object_color(obj, 0);
			VM_OBJECT_WUNLOCK(obj);
		}
#endif
	}
	*objp = obj;
	*flagsp = flags;

	VOP_MMAPPED(vp);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vm_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * Helper function for vm_mmap.  Perform the sanity checks specific to
 * mmap operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
    vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	int error, flags;

	flags = *flagsp;

	if (dsw->d_flags & D_MMAP_ANON) {
		*objp = NULL;
		*foff = 0;
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	if (flags & (MAP_PRIVATE|MAP_COPY))
		return (EINVAL);
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
	if (error != 0)
		return (error);
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags,
    objtype_t handle_type, void *handle,
    vm_ooffset_t foff)
{
	vm_object_t object;
	struct thread *td = curthread;
	int error;
	boolean_t writecounted;

	if (size == 0)
		return (EINVAL);

	size = round_page(size);
	object = NULL;
	writecounted = FALSE;

	switch (handle_type) {
	case OBJT_DEVICE: {
		struct cdevsw *dsw;
		struct cdev *cdev;
		int ref;

		cdev = handle;
		dsw = dev_refthread(cdev, &ref);
		if (dsw == NULL)
			return (ENXIO);
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
		    dsw, &foff, &object);
		dev_relthread(cdev, ref);
		break;
	}
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);

	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
	    foff, writecounted, td);
	if (error != 0 && object != NULL) {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vm_pager_release_writecount(object, 0, size);
		vm_object_deallocate(object);
	}
	return (error);
}

int
kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size)
{
	int error;

	RACCT_PROC_LOCK(td->td_proc);
	if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
		RACCT_PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
		RACCT_PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	if (!old_mlock && map->flags & MAP_WIREFUTURE) {
		if (ptoa(pmap_wired_count(map->pmap)) + size >
		    lim_cur(td, RLIMIT_MEMLOCK)) {
			racct_set_force(td->td_proc, RACCT_VMEM, map->size);
			RACCT_PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		error = racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)) + size);
		if (error != 0) {
			racct_set_force(td->td_proc, RACCT_VMEM, map->size);
			RACCT_PROC_UNLOCK(td->td_proc);
			return (error);
		}
	}
	RACCT_PROC_UNLOCK(td->td_proc);
	return (0);
}

/*
 * Internal version of mmap that maps a specific VM object into a
 * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
 */
int
vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
    boolean_t writecounted, struct thread *td)
{
	vm_offset_t default_addr, max_addr;
	int docow, error, findspace, rv;
	bool curmap, fitit;

	curmap = map == &td->td_proc->p_vmspace->vm_map;
	if (curmap) {
		error = kern_mmap_racct_check(td, map, size);
		if (error != 0)
			return (error);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The mmap() system call already enforces this by subtracting
	 * the page offset from the file offset, but checking here
	 * catches errors in device drivers (e.g. d_mmap_single()
	 * callbacks) and other internal mapping requests (such as in
	 * exec).
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = true;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = false;
	}

	if (flags & MAP_ANON) {
		if (object != NULL || foff != 0)
			return (EINVAL);
		docow = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_AREA;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;
	if ((flags & MAP_GUARD) != 0)
		docow |= MAP_CREATE_GUARD;

	if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		max_addr = 0;
		if ((flags & MAP_32BIT) != 0)
			max_addr = MAP_32BIT_MAX_ADDR;
		if (curmap) {
			default_addr =
			    round_page((vm_offset_t)td->td_proc->p_vmspace->
			    vm_daddr + lim_max(td, RLIMIT_DATA));
			if ((flags & MAP_32BIT) != 0)
				default_addr = 0;
			rv = vm_map_find_min(map, object, foff, addr, size,
			    default_addr, max_addr, findspace, prot, maxprot,
			    docow);
		} else {
			rv = vm_map_find(map, object, foff, addr, size,
			    max_addr, findspace, prot, maxprot, docow);
		}
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if ((map->flags & MAP_WIREFUTURE) != 0) {
			vm_map_lock(map);
			if ((map->flags & MAP_WIREFUTURE) != 0)
				(void)vm_map_wire_locked(map, *addr,
				    *addr + size, VM_MAP_WIRE_USER |
				    ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
				    VM_MAP_WIRE_NOHOLES));
			vm_map_unlock(map);
		}
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}
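
/*
 * Illustrative note, not part of the original source: a caller can
 * request a maximum protection separate from the initial protection by
 * OR-ing PROT_MAX() into the prot argument, for example
 *
 *	mmap(NULL, len, PROT_READ | PROT_MAX(PROT_READ | PROT_WRITE),
 *	    MAP_SHARED, fd, 0);
 *
 * kern_mmap() and kern_mprotect() above split such a value with
 * PROT_MAX_EXTRACT() and PROT_EXTRACT(), and kern_mmap() rejects a
 * request whose current protection is not a subset of the stated
 * maximum (returning ENOTSUP).
 */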