/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hwpmc_hooks.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/elf.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
#include <machine/md_var.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
static int mincore_mapped = 1;
SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
    "mincore reports mappings, not residency");
static int imply_prot_max = 0;
SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
    "Imply maximum page protections in mmap() when none are specified");

#ifdef MAP_32BIT
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

int
sys_sbrk(struct thread *td, struct sbrk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

int
sys_sstk(struct thread *td, struct sstk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
int
ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */

/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
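 *
 * For example, with 4K pages a request to map len bytes at file
 * offset 0x11234 is backed by pages starting at file offset 0x11000,
 * and the value returned to the caller is the chosen page-aligned
 * address plus the 0x234 page offset.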
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

int
sys_mmap(struct thread *td, struct mmap_args *uap)
{

	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
	    uap->flags, uap->fd, uap->pos));
}

int
kern_mmap_maxprot(struct proc *p, int prot)
{

	if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 ||
	    (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0)
		return (_PROT_ALL);
	if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) &&
	    prot != PROT_NONE)
		return (prot);
	return (_PROT_ALL);
}

int
kern_mmap(struct thread *td, uintptr_t addr0, size_t len, int prot, int flags,
    int fd, off_t pos)
{
	struct mmap_req mr = {
		.mr_hint = addr0,
		.mr_len = len,
		.mr_prot = prot,
		.mr_flags = flags,
		.mr_fd = fd,
		.mr_pos = pos
	};

	return (kern_mmap_req(td, &mr));
}

int
kern_mmap_req(struct thread *td, const struct mmap_req *mrp)
{
	struct vmspace *vms;
	struct file *fp;
	struct proc *p;
	off_t pos;
	vm_offset_t addr;
	vm_size_t len, pageoff, size;
	vm_prot_t cap_maxprot;
	int align, error, fd, flags, max_prot, prot;
	cap_rights_t rights;
	mmap_check_fp_fn check_fp_fn;

	addr = mrp->mr_hint;
	len = mrp->mr_len;
	prot = mrp->mr_prot;
	flags = mrp->mr_flags;
	fd = mrp->mr_fd;
	pos = mrp->mr_pos;
	check_fp_fn = mrp->mr_check_fp_fn;

	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
		return (EINVAL);
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	if (max_prot != 0 && (max_prot & prot) != prot)
		return (ENOTSUP);

	p = td->td_proc;

	/*
	 * Always honor PROT_MAX if set.  If not, default to all
	 * permissions unless we're implying maximum permissions.
	 */
	if (max_prot == 0)
		max_prot = kern_mmap_maxprot(p, prot);

	vms = p->p_vmspace;
	fp = NULL;
	AUDIT_ARG_FD(fd);

	/*
	 * Ignore old flags that used to be defined but did not do anything.
	 */
	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
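	 *
	 * For example, a modern binary must pass fd == -1 and pos == 0
	 * together with MAP_ANON, while an a.out-era binary that passes
	 * a non-zero pos with MAP_ANON simply has the position reset to
	 * zero below.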
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}
	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
	    MAP_PREFAULT_READ | MAP_GUARD |
#ifdef MAP_32BIT
	    MAP_32BIT |
#endif
	    MAP_ALIGNMENT_MASK)) != 0)
		return (EINVAL);
	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
		return (EINVAL);
	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
		return (EINVAL);
	if (prot != PROT_NONE &&
	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
		return (EINVAL);
	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
	    pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
#ifdef MAP_32BIT
	    MAP_32BIT |
#endif
	    MAP_ALIGNMENT_MASK)) != 0))
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Compute size from len by rounding (on both ends). */
	size = len + pageoff;			/* low end... */
	size = round_page(size);		/* hi end */
	/* Check for rounding up to zero. */
	if (len > size)
		return (ENOMEM);

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (!vm_map_range_valid(&vms->vm_map, addr, addr + size))
			return (EINVAL);
#ifdef MAP_32BIT
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
#endif
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td, RLIMIT_DATA));
	}
	if (len == 0) {
		/*
		 * Return success without mapping anything for old
		 * binaries that request a page-aligned mapping of
		 * length 0.
		 * For modern binaries, this function
		 * returns an error earlier.
		 */
		error = 0;
	} else if ((flags & MAP_GUARD) != 0) {
		error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
		    VM_PROT_NONE, flags, NULL, pos, FALSE, td);
	} else if ((flags & MAP_ANON) != 0) {
		/*
		 * Mapping blank space is trivial.
		 *
		 * This relies on VM_PROT_* matching PROT_*.
		 */
		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
		    max_prot, flags, NULL, pos, FALSE, td);
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init_one(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set_one(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set_one(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set_one(&rights, CAP_MMAP_X);
		error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
		    p->p_osrel >= P_OSREL_MAP_FSTRICT) {
			error = EINVAL;
			goto done;
		}
		if (check_fp_fn != NULL) {
			error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
			    flags);
			if (error != 0)
				goto done;
		}
		/* This relies on VM_PROT_* matching PROT_*. */
		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
		    max_prot & cap_maxprot, flags, pos, td);
	}

	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

#if defined(COMPAT_FREEBSD6)
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{

	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
	    uap->flags, uap->fd, uap->pos));
}
#endif

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(struct thread *td, struct ommap_args *uap)
{
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};
	int flags, prot;

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	prot = cvtbsdprot[uap->prot & 0x7];
#if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    prot != 0)
		prot |= PROT_EXEC;
#endif
	flags = 0;
	if (uap->flags & OMAP_ANON)
		flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		flags |= MAP_SHARED;
	else
		flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		flags |= MAP_FIXED;
	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot, flags,
	    uap->fd, uap->pos));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
int
sys_msync(struct thread *td, struct msync_args *uap)
{

	return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
}

int
kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (ENOMEM);
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
int
sys_munmap(struct thread *td, struct munmap_args *uap)
{

	return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
	bool pmc_handled;
#endif
	vm_offset_t addr, end;
	vm_size_t pageoff;
	vm_map_t map;

	if (size == 0)
		return (EINVAL);

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	end = addr + size;
	map = &td->td_proc->p_vmspace->vm_map;
	if (!vm_map_range_valid(map, addr, end))
		return (EINVAL);

	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	pmc_handled = false;
	if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
		pmc_handled = true;
		/*
		 * Inform hwpmc if the address range being unmapped contains
		 * an executable region.
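		 *
		 * pkm.pm_address stays zero when no executable entry is
		 * found in the range, in which case the hook is not
		 * invoked after the unmap below.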
		 */
		pkm.pm_address = (uintptr_t) NULL;
		if (vm_map_lookup_entry(map, addr, &entry)) {
			for (; entry->start < end;
			    entry = vm_map_entry_succ(entry)) {
				if (vm_map_check_protection(map, entry->start,
				    entry->end, VM_PROT_EXECUTE) == TRUE) {
					pkm.pm_address = (uintptr_t) addr;
					pkm.pm_size = (size_t) size;
					break;
				}
			}
		}
	}
#endif
	vm_map_delete(map, addr, end);

#ifdef HWPMC_HOOKS
	if (__predict_false(pmc_handled)) {
		/* downgrade the lock to prevent a LOR with the pmc-sx lock */
		vm_map_lock_downgrade(map);
		if (pkm.pm_address != (uintptr_t) NULL)
			PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
		vm_map_unlock_read(map);
	} else
#endif
		vm_map_unlock(map);

	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
int
sys_mprotect(struct thread *td, struct mprotect_args *uap)
{

	return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot));
}

int
kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	int vm_error, max_prot;

	addr = addr0;
	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
		return (EINVAL);
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
#ifdef COMPAT_FREEBSD32
	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
		if (((addr + size) & 0xffffffff) < addr)
			return (EINVAL);
	} else
#endif
	if (addr + size < addr)
		return (EINVAL);

	vm_error = KERN_SUCCESS;
	if (max_prot != 0) {
		if ((max_prot & prot) != prot)
			return (ENOTSUP);
		vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
		    addr, addr + size, max_prot, TRUE);
	}
	if (vm_error == KERN_SUCCESS)
		vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
		    addr, addr + size, prot, FALSE);

	switch (vm_error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
int
sys_minherit(struct thread *td, struct minherit_args *uap)
{

	return (kern_minherit(td, (uintptr_t)uap->addr, uap->len,
	    uap->inherit));
}

int
kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0)
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)addr0;
	size = len;
	inherit = inherit0;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

int
sys_madvise(struct thread *td, struct madvise_args *uap)
{

	return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
}

int
kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
{
	vm_map_t map;
	vm_offset_t addr, end, start;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	addr = addr0;
	if (!vm_map_range_valid(map, addr, addr + len))
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page(addr);
	end = round_page(addr + len);

	/*
	 * vm_map_madvise() checks for illegal values of behav.
	 */
	return (vm_map_madvise(map, start, end, behav));
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

int
sys_mincore(struct thread *td, struct mincore_args *uap)
{

	return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
}

int
kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
{
	pmap_t pmap;
	vm_map_t map;
	vm_map_entry_t current, entry;
	vm_object_t object;
	vm_offset_t addr, cend, end, first_addr;
	vm_paddr_t pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int error, lastvecindex, mincoreinfo, vecindex;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page(addr0);
	end = round_page(addr0 + len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	while (entry->start < end) {
		/*
		 * check for contiguity
		 */
		current = entry;
		entry = vm_map_entry_succ(current);
		if (current->end < end &&
		    entry->start > current->end) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		for (; addr < cend; addr += PAGE_SIZE) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			m = NULL;
			object = NULL;
retry:
			pa = 0;
			mincoreinfo = pmap_mincore(pmap, addr, &pa);
			if (mincore_mapped) {
				/*
				 * We only care about this pmap's
				 * mapping of the page, if any.
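				 * The result of pmap_mincore() is
				 * reported as is and the backing
				 * object is not consulted.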
				 */
				;
			} else if (pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.  The page's
				 * identity may change at any point before its
				 * object lock is acquired, so re-validate if
				 * necessary.
				 */
				m = PHYS_TO_VM_PAGE(pa);
				while (object == NULL || m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = atomic_load_ptr(&m->object);
					if (object == NULL)
						goto retry;
					VM_OBJECT_WLOCK(object);
				}
				if (pa != pmap_extract(pmap, addr))
					goto retry;
				KASSERT(vm_page_all_valid(m),
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if (object->type == OBJT_DEFAULT ||
				    object->type == OBJT_SWAP ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m != NULL && vm_page_none_valid(m))
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				VM_OBJECT_ASSERT_WLOCKED(m->object);

				/* Examine other mappings of the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;

				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->a.flags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->a.flags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = atop(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
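	 * These cover any addresses between the last map entry
	 * examined and the end of the requested range.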
	 */
	vecindex = atop(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_mlock(struct thread *td, struct mlock_args *uap)
{

	return (kern_mlock(td->td_proc, td->td_ucred,
	    __DECONST(uintptr_t, uap->addr), uap->len));
}

int
kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_user_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(proc);
		error = racct_set(proc, RACCT_MEMLOCK, nsize);
		PROC_UNLOCK(proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

int
sys_mlockall(struct thread *td, struct mlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
			return (ENOMEM);
	}
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(td->td_proc);
		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
		PROC_UNLOCK(td->td_proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().
		 * vm_map_wire() will wire pages by calling
		 * vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		if (error == KERN_SUCCESS)
			error = 0;
		else if (error == KERN_RESOURCE_SHORTAGE)
			error = ENOMEM;
		else
			error = EAGAIN;
	}
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

int
sys_munlockall(struct thread *td, struct munlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_munlock(struct thread *td, struct munlock_args *uap)
{

	return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
{
	vm_offset_t addr, end, last, start;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = addr0;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Performs sanity checks specific to mmap
 * operations on vnodes.
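 *
 * On success a referenced VM object backing the vnode is returned in
 * *objp; *flagsp and *maxprotp may be adjusted along the way, e.g.
 * MAP_NOSYNC is forced for unlinked files and write permission is
 * dropped from shared mappings of append-only or immutable files.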
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_ooffset_t foff;
	struct ucred *cred;
	int error, flags;
	bool writex;

	cred = td->td_ucred;
	writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
	    (*flagsp & MAP_SHARED) != 0;
	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);
	AUDIT_ARG_VNODE1(vp);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, LK_SHARED);
			if (error != 0)
				return (error);
		}
		if (writex) {
			*writecounted = TRUE;
			vm_pager_update_writecount(obj, 0, objsize);
		}
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	/* This relies on VM_PROT_* matching PROT_*. */
	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & VM_PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of the actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE) {
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
		if (obj == NULL) {
			error = ENOMEM;
			goto done;
		}
	} else {
		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
		    ("wrong object type"));
		vm_object_reference(obj);
#if VM_NRESERVLEVEL > 0
		if ((obj->flags & OBJ_COLORED) == 0) {
			VM_OBJECT_WLOCK(obj);
			vm_object_color(obj, 0);
			VM_OBJECT_WUNLOCK(obj);
		}
#endif
	}
	*objp = obj;
	*flagsp = flags;

	VOP_MMAPPED(vp);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vm_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * Helper function for vm_mmap.  Performs sanity checks specific to mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
    vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	int error, flags;

	flags = *flagsp;

	if (dsw->d_flags & D_MMAP_ANON) {
		*objp = NULL;
		*foff = 0;
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
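	 * A request for MAP_PRIVATE or MAP_COPY is therefore rejected
	 * below, and the mapping is forced to MAP_SHARED instead.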
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	if (flags & (MAP_PRIVATE|MAP_COPY))
		return (EINVAL);
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
	if (error != 0)
		return (error);
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap()
 *
 * Internal version of mmap used by exec, sys5 shared memory, and
 * various device drivers.  Handle is either a vnode pointer, a
 * character device, or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags,
    objtype_t handle_type, void *handle,
    vm_ooffset_t foff)
{
	vm_object_t object;
	struct thread *td = curthread;
	int error;
	boolean_t writecounted;

	if (size == 0)
		return (EINVAL);

	size = round_page(size);
	object = NULL;
	writecounted = FALSE;

	/*
	 * Lookup/allocate object.
	 */
	switch (handle_type) {
	case OBJT_DEVICE: {
		struct cdevsw *dsw;
		struct cdev *cdev;
		int ref;

		cdev = handle;
		dsw = dev_refthread(cdev, &ref);
		if (dsw == NULL)
			return (ENXIO);
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
		    dsw, &foff, &object);
		dev_relthread(cdev, ref);
		break;
	}
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);

	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
	    foff, writecounted, td);
	if (error != 0 && object != NULL) {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vm_pager_release_writecount(object, 0, size);
		vm_object_deallocate(object);
	}
	return (error);
}

/*
 * Internal version of mmap that maps a specific VM object into a
 * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
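 *
 * When the target map is the calling process's own map, RLIMIT_VMEM
 * and the racct VMEM/MEMLOCK accounting are checked before the
 * mapping is established.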
 */
int
vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
    boolean_t writecounted, struct thread *td)
{
	boolean_t curmap, fitit;
	vm_offset_t max_addr;
	int docow, error, findspace, rv;

	curmap = map == &td->td_proc->p_vmspace->vm_map;
	if (curmap) {
		RACCT_PROC_LOCK(td->td_proc);
		if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
			RACCT_PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
			RACCT_PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(pmap_wired_count(map->pmap)) + size >
			    lim_cur(td, RLIMIT_MEMLOCK)) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				RACCT_PROC_UNLOCK(td->td_proc);
				return (ENOMEM);
			}
			error = racct_set(td->td_proc, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + size);
			if (error != 0) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				RACCT_PROC_UNLOCK(td->td_proc);
				return (error);
			}
		}
		RACCT_PROC_UNLOCK(td->td_proc);
	}

	/*
	 * We currently can only deal with page-aligned file offsets.
	 * The mmap() system call already enforces this by subtracting
	 * the page offset from the file offset, but checking here
	 * catches errors in device drivers (e.g. d_mmap_single()
	 * callbacks) and other internal mapping requests (such as in
	 * exec).
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}

	if (flags & MAP_ANON) {
		if (object != NULL || foff != 0)
			return (EINVAL);
		docow = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_GROWS_DOWN;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;
	if ((flags & MAP_GUARD) != 0)
		docow |= MAP_CREATE_GUARD;

	if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		max_addr = 0;
#ifdef MAP_32BIT
		if ((flags & MAP_32BIT) != 0)
			max_addr = MAP_32BIT_MAX_ADDR;
#endif
		if (curmap) {
			rv = vm_map_find_min(map, object, foff, addr, size,
			    round_page((vm_offset_t)td->td_proc->p_vmspace->
			    vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr,
			    findspace, prot, maxprot, docow);
		} else {
			rv = vm_map_find(map, object, foff, addr, size,
			    max_addr, findspace, prot, maxprot, docow);
		}
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if ((map->flags & MAP_WIREFUTURE) != 0) {
			vm_map_lock(map);
			if ((map->flags & MAP_WIREFUTURE) != 0)
				(void)vm_map_wire_locked(map, *addr,
				    *addr + size, VM_MAP_WIRE_USER |
				    ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
				    VM_MAP_WIRE_NOHOLES));
			vm_map_unlock(map);
		}
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}