/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hwpmc_hooks.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/elf.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
#include <machine/md_var.h>
#endif

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
static int mincore_mapped = 1;
SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
    "mincore reports mappings, not residency");
static int imply_prot_max = 0;
SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
    "Imply maximum page protections in mmap() when none are specified");

#ifdef MAP_32BIT
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif

_Static_assert(MAXPAGESIZES <= 4, "MINCORE_SUPER too narrow");

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

int
sys_sbrk(struct thread *td, struct sbrk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

int
sys_sstk(struct thread *td, struct sstk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
int
ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */

/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
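
/*
 * Illustrative userland usage (a hypothetical example for documentation,
 * not part of the kernel interface): a caller may pass a file offset that
 * is not page aligned.  The mapping then starts at the containing page
 * boundary and the returned pointer is biased by the in-page offset:
 *
 *	off_t off = 100;			// not page aligned
 *	char *p = mmap(NULL, getpagesize(), PROT_READ, MAP_SHARED, fd, off);
 *	// The mapping is backed starting at trunc_page(off); p points
 *	// (off & PAGE_MASK) bytes into it, so *p is byte 100 of the file,
 *	// matching the pageoff adjustment done in kern_mmap() below.
 */
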
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

int
sys_mmap(struct thread *td, struct mmap_args *uap)
{

	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = (uintptr_t)uap->addr,
		.mr_len = uap->len,
		.mr_prot = uap->prot,
		.mr_flags = uap->flags,
		.mr_fd = uap->fd,
		.mr_pos = uap->pos,
	    }));
}

int
kern_mmap_maxprot(struct proc *p, int prot)
{

	if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 ||
	    (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0)
		return (_PROT_ALL);
	if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) &&
	    prot != PROT_NONE)
		return (prot);
	return (_PROT_ALL);
}
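
/*
 * Illustrative userland usage (hypothetical example, not part of this
 * file): PROT_MAX() caps the maximum protections of a mapping, so a later
 * mprotect() cannot raise them above the cap:
 *
 *	void *p = mmap(NULL, len,
 *	    PROT_READ | PROT_MAX(PROT_READ | PROT_WRITE),
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	// mprotect(p, len, PROT_READ | PROT_WRITE) can succeed;
 *	// any request including PROT_EXEC now fails.
 *
 * When no PROT_MAX() bits are supplied, kern_mmap_maxprot() above decides
 * whether the mapping gets _PROT_ALL or is implicitly capped at the
 * requested protections (see the vm.imply_prot_max sysctl and the
 * P2_PROTMAX_* process flags).
 */
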
int
kern_mmap(struct thread *td, const struct mmap_req *mrp)
{
	struct vmspace *vms;
	struct file *fp;
	struct proc *p;
	off_t pos;
	vm_offset_t addr, orig_addr;
	vm_size_t len, pageoff, size;
	vm_prot_t cap_maxprot;
	int align, error, fd, flags, max_prot, prot;
	cap_rights_t rights;
	mmap_check_fp_fn check_fp_fn;

	orig_addr = addr = mrp->mr_hint;
	len = mrp->mr_len;
	prot = mrp->mr_prot;
	flags = mrp->mr_flags;
	fd = mrp->mr_fd;
	pos = mrp->mr_pos;
	check_fp_fn = mrp->mr_check_fp_fn;

	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
		return (EINVAL);
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	if (max_prot != 0 && (max_prot & prot) != prot)
		return (ENOTSUP);

	p = td->td_proc;

	/*
	 * Always honor PROT_MAX if set.  If not, default to all
	 * permissions unless we're implying maximum permissions.
	 */
	if (max_prot == 0)
		max_prot = kern_mmap_maxprot(p, prot);

	vms = p->p_vmspace;
	fp = NULL;
	AUDIT_ARG_FD(fd);

	/*
	 * Ignore old flags that used to be defined but did not do anything.
	 */
	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}
	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
	    MAP_PREFAULT_READ | MAP_GUARD |
#ifdef MAP_32BIT
	    MAP_32BIT |
#endif
	    MAP_ALIGNMENT_MASK)) != 0)
		return (EINVAL);
	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
		return (EINVAL);
	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
		return (EINVAL);
	if (prot != PROT_NONE &&
	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
		return (EINVAL);
	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
	    pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
#ifdef MAP_32BIT
	    MAP_32BIT |
#endif
	    MAP_ALIGNMENT_MASK)) != 0))
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Compute size from len by rounding (on both ends). */
	size = len + pageoff;			/* low end... */
	size = round_page(size);		/* hi end */
	/* Check for rounding up to zero. */
	if (len > size)
		return (ENOMEM);

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (!vm_map_range_valid(&vms->vm_map, addr, addr + size))
			return (EINVAL);
#ifdef MAP_32BIT
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
#endif
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td, RLIMIT_DATA));
	}
	if (len == 0) {
		/*
		 * Return success without mapping anything for old
		 * binaries that request a page-aligned mapping of
		 * length 0.  For modern binaries, this function
		 * returns an error earlier.
		 */
		error = 0;
	} else if ((flags & MAP_GUARD) != 0) {
		error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
		    VM_PROT_NONE, flags, NULL, pos, FALSE, td);
	} else if ((flags & MAP_ANON) != 0) {
		/*
		 * Mapping blank space is trivial.
		 *
		 * This relies on VM_PROT_* matching PROT_*.
		 */
		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
		    max_prot, flags, NULL, pos, FALSE, td);
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init_one(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set_one(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set_one(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set_one(&rights, CAP_MMAP_X);
		error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
		    p->p_osrel >= P_OSREL_MAP_FSTRICT) {
			error = EINVAL;
			goto done;
		}
		if (check_fp_fn != NULL) {
			error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
			    flags);
			if (error != 0)
				goto done;
		}
		if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data))
			addr = orig_addr;
		/* This relies on VM_PROT_* matching PROT_*. */
		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
		    max_prot & cap_maxprot, flags, pos, td);
	}

	if (error == 0)
		td->td_retval[0] = addr + pageoff;
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}
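
/*
 * Illustrative userland usage (hypothetical example, not part of this
 * file) of two of the less common flag combinations validated above:
 *
 *	// A guard region must use PROT_NONE, fd == -1 and offset 0.
 *	void *guard = mmap(NULL, getpagesize(), PROT_NONE, MAP_GUARD, -1, 0);
 *
 *	// Request a 2MB-aligned anonymous mapping via MAP_ALIGNED(21);
 *	// the shift must be at least PAGE_SHIFT and fit in a pointer.
 *	void *big = mmap(NULL, 2 * 1024 * 1024, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE | MAP_ALIGNED(21), -1, 0);
 */
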
#if defined(COMPAT_FREEBSD6)
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = (uintptr_t)uap->addr,
		.mr_len = uap->len,
		.mr_prot = uap->prot,
		.mr_flags = uap->flags,
		.mr_fd = uap->fd,
		.mr_pos = uap->pos,
	    }));
}
#endif

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(struct thread *td, struct ommap_args *uap)
{
	return (kern_ommap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
	    uap->flags, uap->fd, uap->pos));
}

int
kern_ommap(struct thread *td, uintptr_t hint, int len, int oprot,
    int oflags, int fd, long pos)
{
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};
	int flags, prot;

	if (len < 0)
		return (EINVAL);

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	prot = cvtbsdprot[oprot & 0x7];
#if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    prot != 0)
		prot |= PROT_EXEC;
#endif
	flags = 0;
	if (oflags & OMAP_ANON)
		flags |= MAP_ANON;
	if (oflags & OMAP_COPY)
		flags |= MAP_COPY;
	if (oflags & OMAP_SHARED)
		flags |= MAP_SHARED;
	else
		flags |= MAP_PRIVATE;
	if (oflags & OMAP_FIXED)
		flags |= MAP_FIXED;
	return (kern_mmap(td, &(struct mmap_req){
		.mr_hint = hint,
		.mr_len = len,
		.mr_prot = prot,
		.mr_flags = flags,
		.mr_fd = fd,
		.mr_pos = pos,
	    }));
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
int
sys_msync(struct thread *td, struct msync_args *uap)
{

	return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
}

int
kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (ENOMEM);
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}
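
/*
 * Illustrative userland usage (hypothetical example, not part of this
 * file): MS_ASYNC and MS_INVALIDATE are mutually exclusive, as checked
 * above.
 *
 *	msync(p, len, MS_SYNC);		// write back dirty pages and wait
 *	msync(p, len, MS_ASYNC);	// schedule write-back, do not wait
 *	msync(p, len, MS_ASYNC | MS_INVALIDATE);	// EINVAL
 */
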
#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
int
sys_munmap(struct thread *td, struct munmap_args *uap)
{

	return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
	bool pmc_handled;
#endif
	vm_offset_t addr, end;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	if (size == 0)
		return (EINVAL);

	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	end = addr + size;
	map = &td->td_proc->p_vmspace->vm_map;
	if (!vm_map_range_valid(map, addr, end))
		return (EINVAL);

	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	pmc_handled = false;
	if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
		pmc_handled = true;
		/*
		 * Inform hwpmc if the address range being unmapped contains
		 * an executable region.
		 */
		pkm.pm_address = (uintptr_t) NULL;
		if (vm_map_lookup_entry(map, addr, &entry)) {
			for (; entry->start < end;
			    entry = vm_map_entry_succ(entry)) {
				if (vm_map_check_protection(map, entry->start,
				    entry->end, VM_PROT_EXECUTE) == TRUE) {
					pkm.pm_address = (uintptr_t) addr;
					pkm.pm_size = (size_t) size;
					break;
				}
			}
		}
	}
#endif
	rv = vm_map_delete(map, addr, end);

#ifdef HWPMC_HOOKS
	if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) {
		/* downgrade the lock to prevent a LOR with the pmc-sx lock */
		vm_map_lock_downgrade(map);
		if (pkm.pm_address != (uintptr_t) NULL)
			PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
		vm_map_unlock_read(map);
	} else
#endif
		vm_map_unlock(map);

	return (vm_mmap_to_errno(rv));
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
int
sys_mprotect(struct thread *td, struct mprotect_args *uap)
{

	return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot));
}

int
kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	int vm_error, max_prot;
	int flags;

	addr = addr0;
	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
		return (EINVAL);
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
#ifdef COMPAT_FREEBSD32
	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
		if (((addr + size) & 0xffffffff) < addr)
			return (EINVAL);
	} else
#endif
	if (addr + size < addr)
		return (EINVAL);

	flags = VM_MAP_PROTECT_SET_PROT;
	if (max_prot != 0)
		flags |= VM_MAP_PROTECT_SET_MAXPROT;
	vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
	    addr, addr + size, prot, max_prot, flags);

	switch (vm_error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	case KERN_OUT_OF_BOUNDS:
		return (ENOTSUP);
	}
	return (EINVAL);
}
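
/*
 * Illustrative userland usage (hypothetical example, not part of this
 * file): like mmap(), mprotect() accepts PROT_MAX() bits, so the maximum
 * protections of an existing mapping can be tightened after the fact:
 *
 *	mprotect(p, len, PROT_READ | PROT_MAX(PROT_READ));
 *	// Later attempts to add PROT_WRITE or PROT_EXEC fail with EACCES
 *	// (KERN_PROTECTION_FAILURE above).
 */
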
#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
int
sys_minherit(struct thread *td, struct minherit_args *uap)
{

	return (kern_minherit(td, (uintptr_t)uap->addr, uap->len,
	    uap->inherit));
}

int
kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0)
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)addr0;
	size = len;
	inherit = inherit0;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

int
sys_madvise(struct thread *td, struct madvise_args *uap)
{

	return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
}

int
kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
{
	vm_map_t map;
	vm_offset_t addr, end, start;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	addr = addr0;
	if (!vm_map_range_valid(map, addr, addr + len))
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page(addr);
	end = round_page(addr + len);

	/*
	 * vm_map_madvise() checks for illegal values of behav.
	 */
	return (vm_map_madvise(map, start, end, behav));
}
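
/*
 * Illustrative userland usage (hypothetical example, not part of this
 * file): MADV_PROTECT does not operate on an address range at all; it
 * asks the kernel to spare the (suitably privileged) process when the
 * system runs out of memory, and is handled above as the equivalent of:
 *
 *	madvise(NULL, 0, MADV_PROTECT);
 *	// roughly: procctl(P_PID, getpid(), PROC_SPROTECT,
 *	//     &(int){ PPROT_SET });
 */
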
#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

int
sys_mincore(struct thread *td, struct mincore_args *uap)
{

	return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
}

int
kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
{
	pmap_t pmap;
	vm_map_t map;
	vm_map_entry_t current, entry;
	vm_object_t object;
	vm_offset_t addr, cend, end, first_addr;
	vm_paddr_t pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int error, lastvecindex, mincoreinfo, vecindex;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page(addr0);
	end = round_page(addr0 + len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	while (entry->start < end) {
		/*
		 * check for contiguity
		 */
		current = entry;
		entry = vm_map_entry_succ(current);
		if (current->end < end &&
		    entry->start > current->end) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		for (; addr < cend; addr += PAGE_SIZE) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			m = NULL;
			object = NULL;
retry:
			pa = 0;
			mincoreinfo = pmap_mincore(pmap, addr, &pa);
			if (mincore_mapped) {
				/*
				 * We only care about this pmap's
				 * mapping of the page, if any.
				 */
				;
			} else if (pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.  The page's
				 * identity may change at any point before its
				 * object lock is acquired, so re-validate if
				 * necessary.
				 */
				m = PHYS_TO_VM_PAGE(pa);
				while (object == NULL || m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = atomic_load_ptr(&m->object);
					if (object == NULL)
						goto retry;
					VM_OBJECT_WLOCK(object);
				}
				if (pa != pmap_extract(pmap, addr))
					goto retry;
				KASSERT(vm_page_all_valid(m),
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if ((object->flags & OBJ_SWAP) != 0 ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m != NULL && vm_page_none_valid(m))
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				VM_OBJECT_ASSERT_WLOCKED(m->object);

				/* Examine other mappings of the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;

				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->a.flags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->a.flags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = atop(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = atop(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}
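
/*
 * Illustrative userland usage (hypothetical example, not part of this
 * file): each byte written to the caller's vector describes one page of
 * the request:
 *
 *	char vec[npages];
 *	if (mincore(p, npages * getpagesize(), vec) == 0 &&
 *	    (vec[0] & MINCORE_INCORE) != 0)
 *		;	// first page reported in core; with the default
 *			// vm.mincore_mapped=1 this reflects this process's
 *			// mappings rather than residency (see above)
 */
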
#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_mlock(struct thread *td, struct mlock_args *uap)
{

	return (kern_mlock(td->td_proc, td->td_ucred,
	    __DECONST(uintptr_t, uap->addr), uap->len));
}

int
kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_user_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(proc);
		error = racct_set(proc, RACCT_MEMLOCK, nsize);
		PROC_UNLOCK(proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	switch (error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ARGUMENT:
		return (EINVAL);
	default:
		return (ENOMEM);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

int
sys_mlockall(struct thread *td, struct mlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
			return (ENOMEM);
	}
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(td->td_proc);
		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
		PROC_UNLOCK(td->td_proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		if (error == KERN_SUCCESS)
			error = 0;
		else if (error == KERN_RESOURCE_SHORTAGE)
			error = ENOMEM;
		else
			error = EAGAIN;
	}
#ifdef RACCT
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}
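
/*
 * Illustrative userland usage (hypothetical example, not part of this
 * file):
 *
 *	mlockall(MCL_CURRENT | MCL_FUTURE);
 *	// MCL_CURRENT wires everything currently mapped; MCL_FUTURE sets
 *	// MAP_WIREFUTURE so later mappings are wired as they are created.
 *	// Unless the vm.old_mlock sysctl is set, the request fails with
 *	// ENOMEM if the process would exceed RLIMIT_MEMLOCK.
 */
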
#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

int
sys_munlockall(struct thread *td, struct munlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
int
sys_munlock(struct thread *td, struct munlock_args *uap)
{

	return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
}

int
kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
{
	vm_offset_t addr, end, last, start;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = addr0;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Performs sanity checks specific to mmap
 * operations on vnodes.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_ooffset_t foff;
	struct ucred *cred;
	int error, flags;
	bool writex;

	cred = td->td_ucred;
	writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
	    (*flagsp & MAP_SHARED) != 0;
	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);
	AUDIT_ARG_VNODE1(vp);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, LK_SHARED);
			if (error != 0)
				return (error);
		}
		if (writex) {
			*writecounted = TRUE;
			vm_pager_update_writecount(obj, 0, objsize);
		}
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	/* This relies on VM_PROT_* matching PROT_*. */
	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & VM_PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE) {
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
		if (obj == NULL) {
			error = ENOMEM;
			goto done;
		}
	} else {
		KASSERT((obj->flags & OBJ_SWAP) != 0, ("wrong object type"));
		vm_object_reference(obj);
#if VM_NRESERVLEVEL > 0
		if ((obj->flags & OBJ_COLORED) == 0) {
			VM_OBJECT_WLOCK(obj);
			vm_object_color(obj, 0);
			VM_OBJECT_WUNLOCK(obj);
		}
#endif
	}
	*objp = obj;
	*flagsp = flags;

	VOP_MMAPPED(vp);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vm_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * Helper function for vm_mmap.  Performs sanity checks specific to mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
    vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	int error, flags;

	flags = *flagsp;

	if (dsw->d_flags & D_MMAP_ANON) {
		*objp = NULL;
		*foff = 0;
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	if (flags & (MAP_PRIVATE|MAP_COPY))
		return (EINVAL);
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
	if (error != 0)
		return (error);
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags,
    objtype_t handle_type, void *handle,
    vm_ooffset_t foff)
{
	vm_object_t object;
	struct thread *td = curthread;
	int error;
	boolean_t writecounted;

	if (size == 0)
		return (EINVAL);

	size = round_page(size);
	object = NULL;
	writecounted = FALSE;

	switch (handle_type) {
	case OBJT_DEVICE: {
		struct cdevsw *dsw;
		struct cdev *cdev;
		int ref;

		cdev = handle;
		dsw = dev_refthread(cdev, &ref);
		if (dsw == NULL)
			return (ENXIO);
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
		    dsw, &foff, &object);
		dev_relthread(cdev, ref);
		break;
	}
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);

	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
	    foff, writecounted, td);
	if (error != 0 && object != NULL) {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vm_pager_release_writecount(object, 0, size);
		vm_object_deallocate(object);
	}
	return (error);
}

int
kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size)
{
	int error;

	RACCT_PROC_LOCK(td->td_proc);
	if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
		RACCT_PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
		RACCT_PROC_UNLOCK(td->td_proc);
		return (ENOMEM);
	}
	if (!old_mlock && map->flags & MAP_WIREFUTURE) {
		if (ptoa(pmap_wired_count(map->pmap)) + size >
		    lim_cur(td, RLIMIT_MEMLOCK)) {
			racct_set_force(td->td_proc, RACCT_VMEM, map->size);
			RACCT_PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		error = racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)) + size);
		if (error != 0) {
			racct_set_force(td->td_proc, RACCT_VMEM, map->size);
			RACCT_PROC_UNLOCK(td->td_proc);
			return (error);
		}
	}
	RACCT_PROC_UNLOCK(td->td_proc);
	return (0);
}

/*
 * Internal version of mmap that maps a specific VM object into a
 * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
 */
int
vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
    boolean_t writecounted, struct thread *td)
{
	vm_offset_t max_addr;
	int docow, error, findspace, rv;
	bool curmap, fitit;

	curmap = map == &td->td_proc->p_vmspace->vm_map;
	if (curmap) {
		error = kern_mmap_racct_check(td, map, size);
		if (error != 0)
			return (error);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The mmap() system call already enforces this by subtracting
	 * the page offset from the file offset, but checking here
	 * catches errors in device drivers (e.g. d_mmap_single()
	 * callbacks) and other internal mapping requests (such as in
	 * exec).
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}

	if (flags & MAP_ANON) {
		if (object != NULL || foff != 0)
			return (EINVAL);
		docow = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_GROWS_DOWN;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;
	if ((flags & MAP_GUARD) != 0)
		docow |= MAP_CREATE_GUARD;

	if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		max_addr = 0;
#ifdef MAP_32BIT
		if ((flags & MAP_32BIT) != 0)
			max_addr = MAP_32BIT_MAX_ADDR;
#endif
		if (curmap) {
			rv = vm_map_find_min(map, object, foff, addr, size,
			    round_page((vm_offset_t)td->td_proc->p_vmspace->
			    vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr,
			    findspace, prot, maxprot, docow);
		} else {
			rv = vm_map_find(map, object, foff, addr, size,
			    max_addr, findspace, prot, maxprot, docow);
		}
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if ((map->flags & MAP_WIREFUTURE) != 0) {
			vm_map_lock(map);
			if ((map->flags & MAP_WIREFUTURE) != 0)
				(void)vm_map_wire_locked(map, *addr,
				    *addr + size, VM_MAP_WIRE_USER |
				    ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
				    VM_MAP_WIRE_NOHOLES));
			vm_map_unlock(map);
		}
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}