/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
TUNABLE_INT("vm.old_mlock", &old_mlock);

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
	int incr;
};
#endif

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sbrk(td, uap)
	struct thread *td;
	struct sbrk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
	int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sys_sstk(td, uap)
	struct thread *td;
	struct sstk_args *uap;
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

int
ogetpagesize(td, uap)
	struct thread *td;
	struct getpagesize_args *uap;
{
	/* MP SAFE */
	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif				/* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 */
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;
	off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
sys_mmap(td, uap)
	struct thread *td;
	struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct file *fp;
	struct vnode *vp;
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t cap_maxprot, prot, maxprot;
	void *handle;
	objtype_t handle_type;
	int flags, error;
	off_t pos;
	struct vmspace *vms = td->td_proc->p_vmspace;
	cap_rights_t rights;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
	flags = uap->flags;
	pos = uap->pos;

	fp = NULL;

	/*
	 * Enforce the constraints.
	 * Mapping of length 0 is only allowed for old binaries.
	 * Anonymous mapping shall specify -1 as file descriptor and
	 * zero position for new code.  Be nice to ancient a.out
	 * binaries and correct pos for anonymous mapping, since old
	 * ld.so sometimes issues anonymous map requests with non-zero
	 * pos.
	 */
	if (!SV_CURPROC_FLAG(SV_AOUT)) {
		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
			return (EINVAL);
	} else {
		if ((flags & MAP_ANON) != 0)
			pos = 0;
	}

	if (flags & MAP_STACK) {
		if ((uap->fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/* Address range must be all in user VM space. */
		if (addr < vm_map_min(&vms->vm_map) ||
		    addr + size > vm_map_max(&vms->vm_map))
			return (EINVAL);
		if (addr + size < addr)
			return (EINVAL);
	} else {
		/*
		 * XXX for non-fixed mappings where no hint is provided or
		 * the hint would fall in the potential heap space,
		 * place it after the end of the largest possible heap.
		 *
		 * There should really be a pmap call to determine a reasonable
		 * location.
		 */
		PROC_LOCK(td->td_proc);
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		    addr < round_page((vm_offset_t)vms->vm_daddr +
		    lim_max(td->td_proc, RLIMIT_DATA))))
			addr = round_page((vm_offset_t)vms->vm_daddr +
			    lim_max(td->td_proc, RLIMIT_DATA));
		PROC_UNLOCK(td->td_proc);
	}
	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		handle_type = OBJT_DEFAULT;
		maxprot = VM_PROT_ALL;
		cap_maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping file, get fp for validation and don't let the
		 * descriptor disappear on us if we block.  Check capability
		 * rights, but also return the maximum rights to be combined
		 * with maxprot later.
		 */
		rights = CAP_MMAP;
		if (prot & PROT_READ)
			rights |= CAP_READ;
		if ((flags & MAP_SHARED) != 0) {
			if (prot & PROT_WRITE)
				rights |= CAP_WRITE;
		}
		if (prot & PROT_EXEC)
			rights |= CAP_MAPEXEC;
		if ((error = fget_mmap(td, uap->fd, rights, &cap_maxprot,
		    &fp)) != 0)
			goto done;
		if (fp->f_type == DTYPE_SHM) {
			handle = fp->f_data;
			handle_type = OBJT_SWAP;
			maxprot = VM_PROT_NONE;

			/* FREAD should always be set. */
			if (fp->f_flag & FREAD)
				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
			if (fp->f_flag & FWRITE)
				maxprot |= VM_PROT_WRITE;
			goto map;
		}
		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;
			goto done;
		}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
#endif
		vp = fp->f_vnode;
		/*
		 * Ensure that file and memory protections are
		 * compatible.  Note that we only worry about
		 * writability if mapping is shared; in this case,
		 * current and max prot are dictated by the open file.
		 * XXX use the vnode instead?  Problem is: what
		 * credentials do we use for determination?  What if
		 * proc does a setuid?
		 */
		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
			maxprot = VM_PROT_NONE;
		else
			maxprot = VM_PROT_EXECUTE;
		if (fp->f_flag & FREAD) {
			maxprot |= VM_PROT_READ;
		} else if (prot & PROT_READ) {
			error = EACCES;
			goto done;
		}
		/*
		 * If we are sharing potential changes (either via
		 * MAP_SHARED or via the implicit sharing of character
		 * device mappings), and we are trying to get write
		 * permission although we opened it without asking
		 * for it, bail out.
		 */
		if ((flags & MAP_SHARED) != 0) {
			if ((fp->f_flag & FWRITE) != 0) {
				maxprot |= VM_PROT_WRITE;
			} else if ((prot & PROT_WRITE) != 0) {
				error = EACCES;
				goto done;
			}
		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
			maxprot |= VM_PROT_WRITE;
			cap_maxprot |= VM_PROT_WRITE;
		}
		handle = (void *)vp;
		handle_type = OBJT_VNODE;
	}
map:
	td->td_fpop = fp;
	maxprot &= cap_maxprot;
	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
	    flags, handle_type, handle, pos);
	td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
	/* inform hwpmc(4) if an executable is being mapped */
	if (error == 0 && handle_type == OBJT_VNODE &&
	    (prot & PROT_EXEC)) {
		pkm.pm_file = handle;
		pkm.pm_address = (uintptr_t) addr;
		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
	}
#endif
	if (error == 0)
		td->td_retval[0] = (register_t) (addr + pageoff);
done:
	if (fp)
		fdrop(fp, td);

	return (error);
}
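
/*
 * Editor's note, illustrative only (not part of the original source): a
 * userspace sketch of the offset handling documented in the comment above
 * sys_mmap().  Mapping a file at a non-page-aligned offset maps from
 * trunc_page(pos) and returns a pointer adjusted up by the page offset, so
 * the caller still sees bytes starting at "pos" (fd, pos are placeholders):
 *
 *	off_t pos = 100;
 *	char *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, pos);
 *	if (p != MAP_FAILED)
 *		p[0] corresponds to byte 100 of the file.
 */
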
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args oargs;

	oargs.addr = uap->addr;
	oargs.len = uap->len;
	oargs.prot = uap->prot;
	oargs.flags = uap->flags;
	oargs.fd = uap->fd;
	oargs.pos = uap->pos;
	return (sys_mmap(td, &oargs));
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
int
ommap(td, uap)
	struct thread *td;
	struct ommap_args *uap;
{
	struct mmap_args nargs;
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__) || defined(__ia64__)
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    nargs.prot != 0)
		nargs.prot |= PROT_EXEC;
#endif
#endif
	nargs.flags = 0;
	if (uap->flags & OMAP_ANON)
		nargs.flags |= MAP_ANON;
	if (uap->flags & OMAP_COPY)
		nargs.flags |= MAP_COPY;
	if (uap->flags & OMAP_SHARED)
		nargs.flags |= MAP_SHARED;
	else
		nargs.flags |= MAP_PRIVATE;
	if (uap->flags & OMAP_FIXED)
		nargs.flags |= MAP_FIXED;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;
	return (sys_mmap(td, &nargs));
}
#endif				/* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
/*
 * MPSAFE
 */
int
sys_msync(td, uap)
	struct thread *td;
	struct msync_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &td->td_proc->p_vmspace->vm_map;

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
	    (flags & MS_INVALIDATE) != 0);
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_INVALID_ARGUMENT:
		return (EBUSY);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
}
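
/*
 * Editor's note, illustrative only (not part of the original source): the
 * flag check above rejects MS_ASYNC combined with MS_INVALIDATE, so a
 * caller that wants invalidation must request the synchronous form:
 *
 *	msync(addr, len, MS_SYNC | MS_INVALIDATE);	accepted here
 *	msync(addr, len, MS_ASYNC | MS_INVALIDATE);	returns EINVAL here
 */
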
#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munmap(td, uap)
	struct thread *td;
	struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
#endif
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	if (size == 0)
		return (EINVAL);

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
		return (EINVAL);
	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	/*
	 * Inform hwpmc if the address range being unmapped contains
	 * an executable region.
	 */
	pkm.pm_address = (uintptr_t) NULL;
	if (vm_map_lookup_entry(map, addr, &entry)) {
		for (;
		     entry != &map->header && entry->start < addr + size;
		     entry = entry->next) {
			if (vm_map_check_protection(map, entry->start,
				entry->end, VM_PROT_EXECUTE) == TRUE) {
				pkm.pm_address = (uintptr_t) addr;
				pkm.pm_size = (size_t) size;
				break;
			}
		}
	}
#endif
	vm_map_delete(map, addr, addr + size);

#ifdef HWPMC_HOOKS
	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
	vm_map_lock_downgrade(map);
	if (pkm.pm_address != (uintptr_t) NULL)
		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
	vm_map_unlock_read(map);
#else
	vm_map_unlock(map);
#endif
	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/*
 * MPSAFE
 */
int
sys_mprotect(td, uap)
	struct thread *td;
	struct mprotect_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_prot_t prot;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, prot, FALSE)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/*
 * MPSAFE
 */
int
sys_minherit(td, uap)
	struct thread *td;
	struct minherit_args *uap;
{
	vm_offset_t addr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (addr + size < addr)
		return (EINVAL);

	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
	    addr + size, inherit)) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	}
	return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/*
 * MPSAFE
 */
int
sys_madvise(td, uap)
	struct thread *td;
	struct madvise_args *uap;
{
	vm_offset_t start, end;
	vm_map_t map;
	struct proc *p;
	int error;

	/*
	 * Check for our special case, advising the swap pager we are
	 * "immortal."
	 */
	if (uap->behav == MADV_PROTECT) {
		error = priv_check(td, PRIV_VM_MADV_PROTECT);
		if (error == 0) {
			p = td->td_proc;
			PROC_LOCK(p);
			p->p_flag |= P_PROTECTED;
			PROC_UNLOCK(p);
		}
		return (error);
	}
	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CORE)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	map = &td->td_proc->p_vmspace->vm_map;
	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
		return (EINVAL);
	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page((vm_offset_t) uap->addr + uap->len);

	if (vm_map_madvise(map, start, end, uap->behav))
		return (EINVAL);
	return (0);
}
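
/*
 * Editor's note, illustrative only (not part of the original source):
 * because the requested range is widened to page boundaries above, advice
 * on a partial page covers the whole page.  For example, with 4 KB pages
 * and a page-aligned "base":
 *
 *	madvise((char *)base + 100, 200, MADV_DONTNEED);
 *
 * applies MADV_DONTNEED to the entire page containing [base+100, base+300).
 */
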
#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/*
 * MPSAFE
 */
int
sys_mincore(td, uap)
	struct thread *td;
	struct mincore_args *uap;
{
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error = 0;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	vm_object_t object;
	vm_paddr_t locked_pa;
	vm_page_t m;
	vm_pindex_t pindex;
	int mincoreinfo;
	unsigned int timestamp;
	boolean_t locked;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	map = &td->td_proc->p_vmspace->vm_map;
	if (end > vm_map_max(map) || end < addr)
		return (ENOMEM);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	pmap = vmspace_pmap(td->td_proc->p_vmspace);

	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		vm_map_unlock_read(map);
		return (ENOMEM);
	}

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	    (current != &map->header) && (current->start < end);
	    current = current->next) {

		/*
		 * check for contiguity
		 */
		if (current->end < end &&
		    (entry->next == &map->header ||
		     current->next->start > current->end)) {
			vm_map_unlock_read(map);
			return (ENOMEM);
		}

		/*
		 * ignore submaps (for now) or null objects
		 */
		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
			current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 */
			object = NULL;
			locked_pa = 0;
		retry:
			m = NULL;
			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
			if (locked_pa != 0) {
				/*
				 * The page is mapped by this process but not
				 * both accessed and modified.  It is also
				 * managed.  Acquire the object lock so that
				 * other mappings might be examined.
				 */
				m = PHYS_TO_VM_PAGE(locked_pa);
				if (m->object != object) {
					if (object != NULL)
						VM_OBJECT_UNLOCK(object);
					object = m->object;
					locked = VM_OBJECT_TRYLOCK(object);
					vm_page_unlock(m);
					if (!locked) {
						VM_OBJECT_LOCK(object);
						vm_page_lock(m);
						goto retry;
					}
				} else
					vm_page_unlock(m);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("mincore: page %p is mapped but invalid",
				    m));
			} else if (mincoreinfo == 0) {
				/*
				 * The page is not mapped by this process.  If
				 * the object implements managed pages, then
				 * determine if the page is resident so that
				 * the mappings might be examined.
				 */
				if (current->object.vm_object != object) {
					if (object != NULL)
						VM_OBJECT_UNLOCK(object);
					object = current->object.vm_object;
					VM_OBJECT_LOCK(object);
				}
				if (object->type == OBJT_DEFAULT ||
				    object->type == OBJT_SWAP ||
				    object->type == OBJT_VNODE) {
					pindex = OFF_TO_IDX(current->offset +
					    (addr - current->start));
					m = vm_page_lookup(object, pindex);
					if (m == NULL &&
					    vm_page_is_cached(object, pindex))
						mincoreinfo = MINCORE_INCORE;
					if (m != NULL && m->valid == 0)
						m = NULL;
					if (m != NULL)
						mincoreinfo = MINCORE_INCORE;
				}
			}
			if (m != NULL) {
				/* Examine other mappings to the page. */
				if (m->dirty == 0 && pmap_is_modified(m))
					vm_page_dirty(m);
				if (m->dirty != 0)
					mincoreinfo |= MINCORE_MODIFIED_OTHER;
				/*
				 * The first test for PGA_REFERENCED is an
				 * optimization.  The second test is
				 * required because a concurrent pmap
				 * operation could clear the last reference
				 * and set PGA_REFERENCED before the call to
				 * pmap_is_referenced().
				 */
				if ((m->aflags & PGA_REFERENCED) != 0 ||
				    pmap_is_referenced(m) ||
				    (m->aflags & PGA_REFERENCED) != 0)
					mincoreinfo |= MINCORE_REFERENCED_OTHER;
			}
			if (object != NULL)
				VM_OBJECT_UNLOCK(object);

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done2;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done2;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done2;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);
done2:
	return (error);
}
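
/*
 * Editor's note, illustrative only (not part of the original source):
 * userspace receives one status byte per page of the requested range,
 * e.g. (npages is a placeholder):
 *
 *	char vec[npages];
 *	mincore(addr, npages * getpagesize(), vec);
 *
 * where each vec[i] is 0 for a page that is not resident, or a combination
 * of the MINCORE_* bits filled in by the loop above.
 */
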
#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_mlock(td, uap)
	struct thread *td;
	struct mlock_args *uap;
{
	struct proc *proc;
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	unsigned long nsize;
	int error;

	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_wired)
		return (ENOMEM);
	proc = td->td_proc;
	PROC_LOCK(proc);
	nsize = ptoa(npages + vmspace_wired_count(proc->p_vmspace));
	if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
	if (npages + cnt.v_wire_count > vm_page_max_wired)
		return (EAGAIN);
#ifdef RACCT
	PROC_LOCK(proc);
	error = racct_set(proc, RACCT_MEMLOCK, nsize);
	PROC_UNLOCK(proc);
	if (error != 0)
		return (ENOMEM);
#endif
	error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(vmspace_wired_count(proc->p_vmspace)));
		PROC_UNLOCK(proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
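
/*
 * Editor's note, illustrative only (not part of the original source): the
 * trunc_page()/round_page() adjustment above means the wired region always
 * covers whole pages, so a call such as (base is a placeholder)
 *
 *	mlock((char *)base + 10, 10);
 *
 * wires, and charges against RLIMIT_MEMLOCK, the entire page containing
 * those ten bytes.
 */
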
#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
	int how;
};
#endif

/*
 * MPSAFE
 */
int
sys_mlockall(td, uap)
	struct thread *td;
	struct mlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		PROC_LOCK(td->td_proc);
		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		PROC_UNLOCK(td->td_proc);
	}
#ifdef RACCT
	PROC_LOCK(td->td_proc);
	error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
	PROC_UNLOCK(td->td_proc);
	if (error != 0)
		return (ENOMEM);
#endif

	if (uap->how & MCL_FUTURE) {
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall().  vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
	}
#ifdef RACCT
	if (error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(vmspace_wired_count(td->td_proc->p_vmspace)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
	register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
sys_munlockall(td, uap)
	struct thread *td;
	struct munlockall_args *uap;
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);

	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
	vm_map_lock(map);
	vm_map_modflags(map, 0, MAP_WIREFUTURE);
	vm_map_unlock(map);

	/* Forcibly unwire all pages. */
	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
	const void *addr;
	size_t len;
};
#endif
/*
 * MPSAFE
 */
int
sys_munlock(td, uap)
	struct thread *td;
	struct munlock_args *uap;
{
	vm_offset_t addr, end, last, start;
	vm_size_t size;
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	if (error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_sub(td->td_proc, RACCT_MEMLOCK, ptoa(end - start));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on vnodes.
 *
 * For VCHR vnodes, the vnode lock is held over the call to
 * vm_mmap_cdev() to keep vp->v_rdev valid.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_offset_t foff;
	struct mount *mp;
	struct ucred *cred;
	int error, flags, locktype;

	mp = vp->v_mount;
	cred = td->td_ucred;
	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
		locktype = LK_EXCLUSIVE;
	else
		locktype = LK_SHARED;
	if ((error = vget(vp, locktype, td)) != 0)
		return (error);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->handle != vp) {
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.
			 */
			error = vget(vp, locktype, td);
			if (error != 0)
				return (error);
		}
		if (locktype == LK_EXCLUSIVE) {
			*writecounted = TRUE;
			vnode_pager_update_writecount(obj, 0, objsize);
		}
	} else if (vp->v_type == VCHR) {
		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
		    vp->v_rdev, foffp, objp);
		if (error == 0)
			goto mark_atime;
		goto done;
	} else {
		error = EINVAL;
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	error = mac_vnode_check_mmap(cred, vp, prot, flags);
	if (error != 0)
		goto done;
#endif
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, cred);
	if (obj == NULL) {
		error = ENOMEM;
		goto done;
	}
	*objp = obj;
	*flagsp = flags;

mark_atime:
	vfs_mark_atime(vp, cred);

done:
	vput(vp);
	return (error);
}

/*
 * vm_mmap_cdev()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on cdevs.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	struct cdevsw *dsw;
	int error, flags, ref;

	flags = *flagsp;

	dsw = dev_refthread(cdev, &ref);
	if (dsw == NULL)
		return (ENXIO);
	if (dsw->d_flags & D_MMAP_ANON) {
		dev_relthread(cdev, ref);
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}
	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0) {
		dev_relthread(cdev, ref);
		return (EACCES);
	}
	if (flags & (MAP_PRIVATE|MAP_COPY)) {
		dev_relthread(cdev, ref);
		return (EINVAL);
	}
	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
	if (error != 0) {
		dev_relthread(cdev, ref);
		return (error);
	}
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	dev_relthread(cdev, ref);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	*flagsp = flags;
	return (0);
}

/*
 * vm_mmap_shm()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on shm file descriptors.
 */
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
	int error;

	if ((*flagsp & MAP_SHARED) != 0 &&
	    (*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & PROT_WRITE) != 0)
		return (EACCES);
#ifdef MAC
	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
	if (error != 0)
		return (error);
#endif
	error = shm_mmap(shmfd, objsize, foff, objp);
	if (error)
		return (error);
	return (0);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags,
	objtype_t handle_type, void *handle,
	vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object = NULL;
	struct thread *td = curthread;
	int docow, error, rv;
	boolean_t writecounted;

	if (size == 0)
		return (0);

	size = round_page(size);

	if (map == &td->td_proc->p_vmspace->vm_map) {
		PROC_LOCK(td->td_proc);
		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(vmspace_wired_count(td->td_proc->p_vmspace)) +
			    size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (ENOMEM);
			}
			error = racct_set(td->td_proc, RACCT_MEMLOCK,
			    ptoa(vmspace_wired_count(td->td_proc->p_vmspace)) +
			    size);
			if (error != 0) {
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (error);
			}
		}
		PROC_UNLOCK(td->td_proc);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmaping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}
	writecounted = FALSE;

	/*
	 * Lookup/allocate object.
	 */
	switch (handle_type) {
	case OBJT_DEVICE:
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object);
		break;
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	case OBJT_SWAP:
		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
		    handle, foff, &object);
		break;
	case OBJT_DEFAULT:
		if (handle == NULL) {
			error = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error)
		return (error);
	if (flags & MAP_ANON) {
		object = NULL;
		docow = 0;
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle == 0)
			foff = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_VN_WRITECOUNT;

	if (flags & MAP_STACK)
		rv = vm_map_stack(map, *addr, size, prot, maxprot,
		    docow | MAP_STACK_GROWS_DOWN);
	else if (fitit)
		rv = vm_map_find(map, object, foff, addr, size,
		    object != NULL && object->type == OBJT_DEVICE ?
		    VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, prot, maxprot, docow);
	else
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if (map->flags & MAP_WIREFUTURE) {
			vm_map_wire(map, *addr, *addr + size,
			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
		}
	} else {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vnode_pager_release_writecount(object, 0, size);
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
	}
	return (vm_mmap_to_errno(rv));
}

/*
 * Translate a Mach VM return code to zero on success or the appropriate errno
 * on failure.
 */
int
vm_mmap_to_errno(int rv)
{

	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}
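
/*
 * Editor's note, illustrative only (not part of the original source): a
 * minimal sketch of how an in-kernel caller might drive vm_mmap() for an
 * anonymous mapping, per the vm_mmap() header comment above; "p", "hint",
 * and "size" are placeholders, and the exact arguments used by exec and
 * the SysV shared memory code differ.
 *
 *	vm_offset_t attach_va = round_page(hint);
 *	int error = vm_mmap(&p->p_vmspace->vm_map, &attach_va, size,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, MAP_ANON,
 *	    OBJT_DEFAULT, NULL, 0);
 *
 * The return value is already an errno, courtesy of vm_mmap_to_errno().
 */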