/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procctl.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");

#ifdef MAP_32BIT
#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
#endif

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
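
/*
 * Illustrative example (editorial addition, not taken from this file; "fd",
 * "len" and "p" are placeholders): because the offset does not have to be
 * page aligned, a userspace call such as
 *
 *	p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 100);
 *
 * maps the file starting at trunc_page(100), and the value returned to the
 * caller is adjusted up by the page offset, so p refers to file offset 100.
 */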
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
sys_mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t cap_maxprot, maxprot;
	void *handle;
	objtype_t handle_type;
	int align, error, flags, prot;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;
	cap_rights_t rights;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;

	/*
	 * Ignore old flags that used to be defined but did not do anything.
	 */
	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}
	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
	    MAP_PREFAULT_READ |
#ifdef MAP_32BIT
	    MAP_32BIT |
#endif
	    MAP_ALIGNMENT_MASK)) != 0)
		return (EINVAL);
	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
		return (EINVAL);
	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
		return (EINVAL);
	if (prot != PROT_NONE &&
	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/* Ensure alignment is at least a page and fits in a pointer. */
	align = flags & MAP_ALIGNMENT_MASK;
	if (align != 0 && align != MAP_ALIGNED_SUPER &&
	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
#ifdef MAP_32BIT
		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
			return (EINVAL);
	} else if (flags & MAP_32BIT) {
		/*
		 * For MAP_32BIT, override the hint if it is too high and
		 * do not bother moving the mapping past the heap (since
		 * the heap is usually above 2GB).
		 */
		if (addr + size > MAP_32BIT_MAX_ADDR)
			addr = 0;
#endif
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		cap_maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block. Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		cap_rights_init(&rights, CAP_MMAP);
		if (prot & PROT_READ)
			cap_rights_set(&rights, CAP_MMAP_R);
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				cap_rights_set(&rights, CAP_MMAP_W);
		}
		if (prot & PROT_EXEC)
			cap_rights_set(&rights, CAP_MMAP_X);
		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
		if (error != 0)
			goto done;
		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
		    td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) {
			error = EINVAL;
			goto done;
		}
		if (fp->f_type == DTYPE_SHM) {
			handle = fp->f_data;
			handle_type = OBJT_SWAP;
			maxprot = VM_PROT_NONE;

			/* FREAD should always be set. */
			if (fp->f_flag & FREAD)
				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
			if (fp->f_flag & FWRITE)
				maxprot |= VM_PROT_WRITE;
			goto map;
		}
		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;
			goto done;
		}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
#endif
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination? What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
			cap_maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}
map:
	td->td_fpop = fp;
	maxprot &= cap_maxprot;

	/* This relies on VM_PROT_* matching PROT_*. */
	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
	td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
	/* inform hwpmc(4) if an executable is being mapped */
	if (error == 0 && handle_type == OBJT_VNODE &&
	    (prot & PROT_EXEC)) {
		pkm.pm_file = handle;
		pkm.pm_address = (uintptr_t) addr;
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
	}
#endif
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}

int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args oargs;

	oargs.addr = uap->addr;
	oargs.len = uap->len;
	oargs.prot = uap->prot;
	oargs.flags = uap->flags;
	oargs.fd = uap->fd;
	oargs.pos = uap->pos;
	return (sys_mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	/*
	 * Translate the historic BSD protection encoding (exec == 1,
	 * write == 2, read == 4) into the modern PROT_* values, indexed
	 * by the old three-bit protection field.
	 */
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    nargs.prot != 0)
		nargs.prot |= PROT_EXEC;
#endif
#endif
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (sys_mmap(td, &nargs));
}
#endif /* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
sys_msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (ENOMEM);
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
#endif
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	/*
	 * Inform hwpmc if the address range being unmapped contains
	 * an executable region.
	 */
	pkm.pm_address = (uintptr_t) NULL;
	if (vm_map_lookup_entry(map, addr, &entry)) {
		for (;
		    entry != &map->header && entry->start < addr + size;
		    entry = entry->next) {
			if (vm_map_check_protection(map, entry->start,
				entry->end, VM_PROT_EXECUTE) == TRUE) {
				pkm.pm_address = (uintptr_t) addr;
				pkm.pm_size = (size_t) size;
				break;
			}
		}
	}
#endif
	vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
	vm_map_lock_downgrade(map);
	if (pkm.pm_address != (uintptr_t) NULL)
		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
	vm_map_unlock_read(map);
#else
	vm_map_unlock(map);
#endif
	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
sys_mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
sys_minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
int
sys_madvise(td, uap)
	struct thread *td;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	int flags;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		flags = PPROT_SET;
		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
		    PROC_SPROTECT, &flags));
	}

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
int
sys_mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	vm_object_t object;
	vm_paddr_t locked_pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int mincoreinfo;
	unsigned int timestamp;
	boolean_t locked;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * check for contiguity
		 */
		if (current->end < end &&
		    (entry->next == &map->header ||
		    current->next->start > current->end)) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
		    current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			object = NULL;
			locked_pa = 0;
		retry:
			m = NULL;
			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
			if (locked_pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.
				 */
				m = PHYS_TO_VM_PAGE(locked_pa);
				if (m->object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = m->object;
					locked = VM_OBJECT_TRYWLOCK(object);
					vm_page_unlock(m);
					if (!locked) {
						VM_OBJECT_WLOCK(object);
						vm_page_lock(m);
						goto retry;
					}
				} else
					vm_page_unlock(m);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_WUNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_WLOCK(object);
				}
				if (object->type == OBJT_DEFAULT ||
				    object->type == OBJT_SWAP ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m == NULL &&
					    vm_page_is_cached(object, pindex))
						mincoreinfo = MINCORE_INCORE;
					if (m != NULL && m->valid == 0)
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				/* Examine other mappings to the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;
				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->aflags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->aflags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_WUNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_mlock(td, uap)
	struct thread *td;
	struct mlock_args *uap;
{

	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
}

int
vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
	if (error)
		return (error);
	addr = (vm_offset_t)addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
#ifdef RACCT
	PROC_LOCK(proc);
	error = racct_set(proc, RACCT_MEMLOCK, nsize);
	PROC_UNLOCK(proc);
	if (error != 0)
		return (ENOMEM);
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
sys_mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		PROC_LOCK(td->td_proc);
		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		PROC_UNLOCK(td->td_proc);
	}
#ifdef RACCT
	PROC_LOCK(td->td_proc);
	error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
	PROC_UNLOCK(td->td_proc);
	if (error != 0)
		return (ENOMEM);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall(). vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
sys_munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform the sanity checks specific to
 * mmap operations on vnodes.
 *
 * For VCHR vnodes, the vnode lock is held over the call to
 * vm_mmap_cdev() to keep vp->v_rdev valid.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_offset_t foff;
	struct mount *mp;
	struct ucred *cred;
	int error, flags, locktype;

	mp = vp->v_mount;
	cred = td->td_ucred;
	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
		locktype = LK_EXCLUSIVE;
	else
		locktype = LK_SHARED;
	if ((error = vget(vp, locktype, td)) != 0)
		return (error);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, locktype, td);
			if (error != 0)
				return (error);
		}
		if (locktype == LK_EXCLUSIVE) {
			*writecounted = TRUE;
			vnode_pager_update_writecount(obj, 0, objsize);
		}
	} else if (vp->v_type == VCHR) {
		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
		    vp->v_rdev, foffp, objp);
		if (error == 0)
			goto mark_atime;
		goto done;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	error = mac_vnode_check_mmap(cred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE)
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
	else {
		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
		    ("wrong object type"));
		vm_object_reference(obj);
	}
	if (obj == NULL) {
		error = ENOMEM;
		goto done;
	}
	*objp = obj;
	*flagsp = flags;

mark_atime:
	vfs_mark_atime(vp, cred);

done:
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vnode_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform the sanity checks specific to
 * mmap operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	struct cdevsw *dsw;
	int error, flags, ref;

	flags = *flagsp;

	dsw = dev_refthread(cdev, &ref);
	if (dsw == NULL)
		return (ENXIO);
	if (dsw->d_flags & D_MMAP_ANON) {
		dev_relthread(cdev, ref);
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0) {
		dev_relthread(cdev, ref);
		return (EACCES);
	}
	if (flags & (MAP_PRIVATE|MAP_COPY)) {
		dev_relthread(cdev, ref);
		return (EINVAL);
	}
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
	if (error != 0) {
		dev_relthread(cdev, ref);
		return (error);
	}
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	dev_relthread(cdev, ref);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap_shm()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform the sanity checks specific to
 * mmap operations on shm file descriptors.
 */
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
	int error;

	if ((*flagsp & MAP_SHARED) != 0 &&
	    (*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0)
		return (EACCES);
#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
	if (error != 0)
		return (error);
#endif
	error = shm_mmap(shmfd, objsize, foff, objp);
	if (error)
		return (error);
	return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	objtype_t handle_type, void *handle,
	vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object = NULL;
	struct thread *td = curthread;
	int docow, error, findspace, rv;
	boolean_t writecounted;

	if (size == 0)
		return (0);

	size = round_page(size);

	if (map == &td->td_proc->p_vmspace->vm_map) {
		PROC_LOCK(td->td_proc);
		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(pmap_wired_count(map->pmap)) + size >
			    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (ENOMEM);
			}
			error = racct_set(td->td_proc, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + size);
			if (error != 0) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (error);
			}
		}
		PROC_UNLOCK(td->td_proc);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmapping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}
	writecounted = FALSE;

	/*
	 * Lookup/allocate object.
	 */
	switch (handle_type) {
	case OBJT_DEVICE:
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object);
		break;
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	case OBJT_SWAP:
		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_VN_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_GROWS_DOWN;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;

	if (fitit) {
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		rv = vm_map_find(map, object, foff, addr, size,
#ifdef MAP_32BIT
		    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
#endif
		    0, findspace, prot, maxprot, docow);
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if (map->flags & MAP_WIREFUTURE) {
			vm_map_wire(map, *addr, *addr + size,
			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
		}
	} else {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vnode_pager_release_writecount(object, 0, size);
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}
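
/*
 * Illustrative sketch (editorial addition, not code from this file; "vp",
 * "len" and "vmspace" are placeholders): a kernel-internal caller that
 * already holds a vnode reference could request a private, copy-on-write
 * mapping roughly as follows.  The request is routed through
 * vm_mmap_vnode() by the handle_type switch in vm_mmap() above, and the
 * Mach status is converted to an errno by vm_mmap_to_errno():
 *
 *	vm_offset_t addr = 0;
 *	int error = vm_mmap(&vmspace->vm_map, &addr, round_page(len),
 *	    VM_PROT_READ, VM_PROT_ALL, MAP_PRIVATE, OBJT_VNODE, vp, 0);
 */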